def update_comment_status(self, comment, post):
    """Create or refresh the stored FBComment for one raw Facebook comment.

    Args:
        comment: Raw comment mapping; ``comment["id"]`` is the ``fid`` used
            to look the FBComment up.
        post: Post the comment belongs to; passed to the FBComment
            constructor and to ``update_from_facebook``.

    Returns:
        The FBComment that was fetched or created, or ``None`` if no object
        could be obtained. Errors raised while saving or updating are
        logged instead of being propagated.
    """
    if debugging:
        dLogger.log("<ThreadComment#%s>::update_comment_status()" % self.ident)
        #dLogger.log(" comment: %s"%comment)
        #dLogger.log(" message: %s"%unicode(comment['message']))
    fbcomment = None
    try:
        try:
            fbcomment = FBComment.objects.get(fid=comment["id"])
        except ObjectDoesNotExist:
            # Unknown comment: save an empty row first so the update below
            # has an object to fill in.
            fbcomment = FBComment(post=post)
            fbcomment.save()
        fbcomment.update_from_facebook(comment, post)
    except IntegrityError:
        # The row could not be written as new; retry as a plain update of
        # the row that should already exist.
        try:
            fbcomment = FBComment.objects.get(fid=comment["id"])
            fbcomment.update_from_facebook(comment, post)
        except ObjectDoesNotExist:
            msg = u"ERROR! Comments already exist but not found %s for %s" % (
                unicode(comment), post.fid if post.fid else "0")
            logger.exception(msg)
            if debugging:
                dLogger.exception(msg)
    except Exception:
        # Catch-all boundary for one comment: log it and return whatever
        # object was obtained so far.
        msg = u"<p style='red'>Cannot update comment %s for %s</p>" % (
            unicode(comment), post.fid if post.fid else "0")
        logger.exception(msg)
        if debugging:
            dLogger.exception(msg)
    return fbcomment
def get_search_status_chart(request, harvester_id, search_term):
    """Return a gviz JSON table with the number of statuses per day.

    Args:
        request: Incoming HTTP request (not read here).
        harvester_id: Unused by this view.
        search_term: Term of the TWSearch whose statuses are charted.

    Returns:
        An HttpResponse whose body is the JSON produced by
        ``gviz_api.DataTable.ToJSon()``.

    Raises:
        Any exception raised while building the chart is logged through
        ``dLogger`` and re-raised, so the view never returns ``None``.
    """
    try:
        search = get_list_or_404(TWSearch, term=search_term)[0]
        count = search.status_list.count()
        fromto = search.status_list.order_by(u"created_at")
        now = dt.datetime.now()

        # First status with a creation date; fall back to "now" when the
        # search has no status at all.
        base = fromto[0].created_at if count != 0 else now
        order = 1
        while base is None and order < count:
            base = fromto[order].created_at
            order += 1
        to = fromto[count - 1].created_at if count != 0 else now
        logger.debug("to: %s"%to)
        logger.debug("base: %s"%base)

        days = (to - base).days + 1
        dateList = [base + dt.timedelta(days=x) for x in range(0, days)]
        description = {
            "date_val": ("date", "Date"),
            "status_count": ("number", "Status count"),
        }
        data = []
        for date in dateList:
            c = search.status_list.filter(
                created_at__year=date.year,
                created_at__month=date.month,
                created_at__day=date.day).count()
            data.append({"date_val": date, "status_count": c})

        data_table = gviz_api.DataTable(description)
        data_table.LoadData(data)
        # Serialize once and reuse for both the log line and the response.
        json_payload = data_table.ToJSon()
        logger.debug(json_payload)
        return HttpResponse(json_payload, mimetype='application/javascript')
    except Exception:
        dLogger.exception("AN ERROR HAS OCCURED WHILE RENDERING STATUS CHART: SEARCH_TERM: %s"%search_term)
        # Propagate (e.g. the 404 from get_list_or_404) rather than fall
        # through and hand ``None`` back to the framework.
        raise
def get_fb_harvester_comment_list(request, call_type, harvester_id):
    """Return the datatables records for Facebook comments.

    When ``harvester_id`` is the string ``'0'`` every FBComment is listed;
    otherwise only the distinct comments on posts whose user is handled by
    the FacebookHarvester with that primary key.
    """
    # columnIndexNameMap is required for correct sorting behavior
    sortable_fields = (
        u'created_time',
        u'ffrom__username',
        u'post__ffrom__name',
        u'post__fid',
        u'message',
        u'likes',
        u'user_likes',
        u'ftype',
        u'ffrom__name',
        u'ffrom__fid',
        u'post__ffrom__fid',
    )
    columnIndexNameMap = dict(enumerate(sortable_fields))

    comments = None
    try:
        if harvester_id != '0':
            harvester = get_list_or_404(FacebookHarvester,
                                        pk=harvester_id)[0]
            comments = FBComment.objects.filter(
                post__user__harvester_in_charge=harvester).distinct()
        else:
            comments = FBComment.objects.all()
    except ObjectDoesNotExist:
        dLogger.exception('ERROR OCCURED IN get_fb_harvester_comment_list:')

    # call to generic function from utils
    return get_datatables_records(request, comments, columnIndexNameMap,
                                  call_type)
def update_statuses(harvester, snh_search, status_id_list):
    """Look up a batch of tweets by id and store them for a search.

    Args:
        harvester: Harvester whose credentials are used for the lookup and
            which owns any newly created user.
        snh_search: Search object the statuses are attached to; its
            ``latest_status_harvested`` is updated at the end.
        status_id_list: Sequence of status ids to look up. An empty
            sequence is a no-op.
    """
    if debugging:
        dLogger.log('update_statuses()')
        dLogger.log(' snh_search: %s' % snh_search)
        dLogger.log(' status_id_list: %s' % status_id_list)
    if not status_id_list:
        # Nothing to look up; avoids an IndexError on the first element.
        return

    statuses_ids = ','.join('%s' % status_id for status_id in status_id_list)
    #dLogger.log(' statuses_ids: %s'%statuses_ids)
    statuses = api_statuses_lookup(harvester, statuses_ids,
                                   include_entities=True)

    snh_status = None
    for tw_status in sorted(statuses, key=lambda x: x['created_at'],
                            reverse=True):
        #dLogger.log(tw_status['created_at'])
        tw_user = tw_status['user']
        snh_user, new = TWUser.objects.get_or_create(fid=tw_user['id'])
        if new:
            snh_user.screen_name = tw_user['screen_name']
            if debugging:
                dLogger.log(' new user created: %s' % snh_user)
            logger.info('New user created from search: %s' % snh_user)
            snh_user.harvester = harvester
            try:
                snh_user.save()
            except Exception:
                # Saving failed; an older TWUser with the same screen name
                # is looked up, its statuses are moved to the new user and
                # the older row is deleted before saving again.
                dLogger.log(' GODDAMNIT')
                user = TWUser.objects.get(screen_name=tw_user['screen_name'])
                for post in user.postedStatuses.all():
                    post.user = snh_user
                    post.save()
                user.delete()
                snh_user.save()

        try:
            snh_status = TWStatus.objects.get(fid=tw_status['id_str'])
        except Exception:
            snh_status = TWStatus.objects.create(fid=tw_status['id_str'],
                                                 user=snh_user)
            if debugging:
                dLogger.log(' new status created: %s' % snh_status)
            logger.debug('New status created from search: %s' % snh_status)

        try:
            snh_status.update_from_rawtwitter(tw_status, snh_user,
                                              harvester.keep_raw_statuses)
            update_search(snh_search, snh_status)
        except Exception as e:
            if debugging:
                dLogger.exception(e)
            logger.exception('AN ERROR HAS OCCURED WHILE SAVING TWEET TO DB:')

    if snh_status is not None:
        # Only record a status when the lookup actually returned one.
        snh_search.latest_status_harvested = snh_status
        snh_search.save()
def manage_exception(retry_count, harvester, user):
    """Log the exception being handled and count one more retry.

    Returns:
        A ``(retry_count, give_up)`` tuple: the incremented counter and
        whether it now exceeds ``harvester.max_retry_on_fail``.
    """
    if debugging:
        call_trace = (
            "manage_exception(retry_count: %s, harvester: %s, user: %s)"
            % (retry_count, harvester, user))
        dLogger.log(call_trace)

    msg = u"Exception for the harvester %s for %s. Retry:%d" % (
        harvester, unicode(user), retry_count)
    logger.exception(msg)
    if debugging:
        dLogger.exception(msg)

    attempts = retry_count + 1
    give_up = attempts > harvester.max_retry_on_fail
    return (attempts, give_up)
def update_from_youtube(self, snh_video, snh_user, yt_comment):
    """Copy the fields of a raw YouTube comment onto this comment.

    The instance is saved only when at least one field changed; a failing
    save is logged through ``dLogger`` and not re-raised.

    Args:
        snh_video: Video object the comment belongs to.
        snh_user: User object that wrote the comment.
        yt_comment: Raw comment mapping with an ``'id'`` and a
            ``'snippet'`` holding ``publishedAt``, ``updatedAt``,
            ``textDisplay`` and ``likeCount``.
    """
    #Comment
    if debugging:
        dLogger.log("<YTComment: '%s'>::update_from_youtube()" % self)
        #dLogger.pretty(yt_comment)
    model_changed = False

    fid = yt_comment['id']
    if self.fid != fid:
        self.fid = fid
        model_changed = True

    snippet = yt_comment['snippet']
    if self.video != snh_video:
        self.video = snh_video
        model_changed = True
    if self.user != snh_user:
        self.user = snh_user
        model_changed = True

    # The last five characters of each timestamp are dropped before
    # parsing (presumably a fractional-seconds/zone suffix - confirm).
    for attr, key in (('published', 'publishedAt'), ('updated', 'updatedAt')):
        date_val = datetime.strptime(snippet[key][:-5], '%Y-%m-%dT%H:%M:%S')
        if getattr(self, attr) != date_val:
            setattr(self, attr, date_val)
            model_changed = True

    content = snippet['textDisplay'].encode('unicode_escape')
    # NOTE(review): this pattern matches two literal backslashes followed
    # by "x" and two characters - confirm that is the intended escape form.
    content = re.sub(r'\\\\x..', '', content)
    if self.message != content:
        self.message = content
        model_changed = True

    like_count = snippet['likeCount']
    if self.like_count != like_count:
        self.like_count = like_count
        model_changed = True

    if model_changed:
        self.model_update_date = datetime.utcnow()
        try:
            self.save()
        except Exception as e:
            dLogger.log(' Error while saving:')
            dLogger.exception(e)
            dLogger.pretty(str(yt_comment).encode('unicode_escape'))
def run_harvester_search(harvester):
    """Run every Twitter search configured on ``harvester``.

    A ``twitter.TwitterError`` raised while searching is logged and
    swallowed; any other exception propagates to the caller.
    """
    if debugging:
        dLogger.log("run_harvester_search(harvester: %s)" % (harvester))
    logger.info(u"START SEARCH: %s Stats:%s" %
                (harvester, unicode(harvester.get_stats())))
    try:
        all_twsearch = harvester.twsearch_to_harvest.all()
        search_all_terms(harvester, all_twsearch)
    except twitter.TwitterError:
        msg = u"ERROR for %s" % harvester
        logger.exception(msg)
        if debugging:
            dLogger.exception(msg)
def handle(self, *args, **options):
    """Command entry point: run the Twitter harvesters once.

    Any failure is reported on stdout and logged; it is never re-raised.
    """
    # The SingleInstance object is kept referenced for the whole run
    # (presumably it guards against concurrent "crontw" runs - confirm).
    instance_guard = singleton.SingleInstance(flavor_id="crontw")
    try:
        logger.info("Will run the Twitter harvesters.")
        twitterch.run_twitter_harvester()
    except:
        print("Global failure. exception logged in 'twitter.log'")
        logger.exception(u"Highest exception for the twitter cron. Not good.")
        dLogger.exception('TOP LEVEL ERROR:')
    padding = " " * 200
    logger.info("The harvest has end for the Twitter harvesters." + padding)
def update_user_batch(harvester, user_batch):
    """Refresh a batch of TWUser objects with one ``UsersLookup`` API call.

    Users returned by the API are matched to the batch by ``fid``; when no
    batch entry matches, the stored user is looked up by ``fid`` and then by
    screen name. All errors are swallowed and only logged when ``debugging``
    is set.
    """
    if debugging:
        batch_size = len(user_batch) if user_batch else 0
        dLogger.log("update_user_batch(%d items)" % batch_size)

    screen_names = []
    users_by_fid = {}
    try:
        for batch_user in user_batch:
            screen_names.append(batch_user.screen_name)
            if batch_user.fid:
                users_by_fid[batch_user.fid] = batch_user
        #dLogger.log(' userObjects: %s'%userObjects)

        lookup_args = {
            'screen_name': screen_names,
            'include_entities': True
        }
        twModels = harvester.api_call('UsersLookup', lookup_args)
        harvester.remaining_user_lookup_hits -= 1
        harvester.save()

        for twModel in twModels:
            #dLogger.pretty(twModel.AsDict())
            try:
                users_by_fid[twModel.id].update_from_twitter(twModel)
            except KeyError:
                # No batch entry for this id: fall back to the database.
                try:
                    snh_user = TWUser.objects.get(fid=twModel.id)
                except:
                    dLogger.log(' GETTING USER BY SCREEN_NAME')
                    snh_user = TWUser.objects.get(
                        screen_name=twModel.screen_name)
                try:
                    snh_user.update_from_twitter(twModel)
                except:
                    # The update failed: move the statuses of the row that
                    # holds this screen name onto snh_user, delete that
                    # row, then retry the update.
                    dLogger.log(' NEED TO TRANSFER USER!')
                    duplicate = TWUser.objects.get(
                        screen_name=twModel.screen_name)
                    for status in duplicate.postedStatuses.all():
                        status.user = snh_user
                        status.save()
                    duplicate.delete()
                    snh_user.update_from_twitter(twModel)
            except:
                if debugging:
                    dLogger.exception("ERROR UPDATING FROM TWITTER: %s" %
                                      twModel.screen_name)
    except:
        if debugging:
            dLogger.exception("ERROR WHILE UPDATING USER BATCH:")
def update_user_status_from_batch(harvester, snhuser, status):
    """Store one raw Facebook status as an ``FBResult`` of type "FBPost".

    Args:
        harvester: Harvester the result is attributed to.
        snhuser: User the status belongs to; its ``fid`` becomes the
            result's ``parent``.
        status: Raw status mapping; ``status["id"]`` becomes the result's
            ``fid`` and the whole mapping is stored as ``result``.

    Errors are logged and swallowed.
    """
    #if debugging:
    #dLogger.log("update_user_status_from_batch()")
    try:
        res = FBResult()
        res.harvester = harvester
        res.result = status
        res.ftype = "FBPost"
        res.fid = status["id"]
        res.parent = snhuser.fid
        res.save()
    except Exception:
        if debugging:
            dLogger.exception('ERROR WHILE CREATING A NEW FBRESULT<FBPOST>:')
            dLogger.log(' snhuser: %s' % snhuser)
            dLogger.log(' status: %s' % status)
        # The placeholder used to be logged unfilled; supply the user.
        logger.debug("Error while adding %s's status", snhuser)
def api_one_zero(request, command):
    """Dispatch one API command.

    ``'authent'`` is served without a token. Every other command requires
    the ``oauth`` GET parameter to equal ``DEFAULT_OAUTH_KEY``; otherwise an
    ``UnauthentifiedError`` response is returned. Any exception is turned
    into an ``UnknownServerError`` response.
    """
    try:
        dLogger.log(command)
        if command == 'authent':
            return authent(request)

        token_ok = ('oauth' in request.GET
                    and request.GET['oauth'] == DEFAULT_OAUTH_KEY)
        if token_ok:
            return command_management(command, request)

        return error(
            'UnauthentifiedError',
            'You must use an authentification token to use AspirAPI',
            command)
    except Exception:
        dLogger.exception('Error occured in an API view')
        dLogger.log(request)
        return error('UnknownServerError',
                     'An error has occured while proceeding the request',
                     command)
def run_harvester_v3(harvester):
    """Run one full harvest cycle for ``harvester``.

    Results are computed before and after the user update (and the status
    update, skipped for the harvester named 'FBUser Updater'). Failures are
    logged; the harvest is always closed and its stats logged.
    """
    if debugging:
        dLogger.log("run_harvester_v3()")
    harvester.start_new_harvest()
    try:
        compute_results(harvester)
        update_user_batch(harvester)
        wants_statuses = harvester.harvester_name != 'FBUser Updater'
        if wants_statuses:
            update_user_statuses_batch(harvester)
        compute_results(harvester)
    except:
        failure = u"EXCEPTION: %s" % harvester
        logger.exception(failure)
        if debugging:
            dLogger.exception(failure)
    finally:
        #usage = psutil.virtual_memory()
        harvester.end_current_harvest()
        final_stats = unicode(harvester.get_stats())
        logger.info(u"End: %s Stats:%s" % (harvester, final_stats))
def update_user_status(self, fbstatus, user):
    """Create or refresh the FBPost for one raw Facebook status.

    After the post itself is updated, every pending ``FBResult`` of type
    "FBPost.likes" whose parent is this status is merged into the post's
    likes and then deleted.

    Args:
        fbstatus: Raw status mapping; ``fbstatus["id"]`` is the post ``fid``.
        user: User the post belongs to.

    Returns:
        The FBPost that was fetched or created, or ``None`` if no object
        could be obtained. Errors raised while saving or updating are
        logged instead of being propagated.
    """
    if debugging:
        dLogger.log("<ThreadStatus#%s>::update_user_status()" % self.ident)
        dLogger.log(" id: %s" % fbstatus["id"])
    snh_status = None
    try:
        try:
            snh_status = FBPost.objects.get(fid=fbstatus["id"])
        except ObjectDoesNotExist:
            snh_status = FBPost(user=user)
            snh_status.save()
            if debugging:
                dLogger.log(" New empty status created, to be processed.")
        snh_status.update_from_facebook(fbstatus, user)

        likes_list = FBResult.objects.filter(ftype="FBPost.likes").filter(
            parent=fbstatus["id"])
        all_likes = []
        if debugging:
            dLogger.log(' likes_list: %s, parent: %s' %
                        (likes_list, fbstatus["id"]))
        for likes in likes_list:
            # NOTE(review): eval() executes whatever text is stored in the
            # result row; the payload originates from an external API, so
            # a safe parser should be considered - confirm stored format.
            all_likes += eval(likes.result)
        snh_status.update_likes_from_facebook(all_likes)
        likes_list.delete()
        #if debugging: dLogger.log(" deleted likes_List %s"%likes_list)
    except IntegrityError:
        # The row could not be written as new; retry as a plain update of
        # the row that should already exist.
        try:
            snh_status = FBPost.objects.get(fid=fbstatus["id"])
            snh_status.update_from_facebook(fbstatus, user)
        except ObjectDoesNotExist:
            msg = u"<p style='red'>ERROR! Post already exist but not found %s for %s</p>" % (
                unicode(fbstatus), user.fid if user.fid else "0")
            logger.exception(msg)
            if debugging:
                dLogger.exception(msg)
    except Exception:
        msg = u"<p style='red'>Cannot update status %s for %s</p>" % (
            unicode(fbstatus)[:100], user.fid if user.fid else "0")
        logger.exception(msg)
        if debugging:
            dLogger.exception(msg)
    return snh_status
def fill_db(): try: dLogger.log('fill_db()') all_queries = FBResult.objects.all() count = FBResult.objects.count() BAR_LENGTH = 50 print 'db_count: %i\n' % count, print 'Progress:[%s]0%%' % (' ' * BAR_LENGTH), for i in range(0, count - 1): if i % 100 == 0: print '\r', print 'Progress:[%s%s]%s/%s' % ( '#' * (i * BAR_LENGTH // count), ' ' * (BAR_LENGTH - i * BAR_LENGTH // count), i, count), if i % 1000 == 0: print '\r\r', print 'db_count: %i\n' % FBResult.objects.count(), create_post(all_queries[i]) except: dLogger.exception('EXCEPTION:')
def api_statuses_lookup(harvester, ids, include_entities=False):
    """Call Twitter's ``statuses/lookup`` endpoint for ``ids``.

    Args:
        harvester: Supplies the OAuth1 credentials; its
            ``remaining_search_hits`` is decremented and saved after a
            successful call.
        ids: Value placed in the ``id`` query parameter.
        include_entities: When true, ``include_entities=1`` is appended.

    Returns:
        The decoded JSON response, or an empty list if the request or the
        decoding failed (the error is logged).
    """
    if debugging:
        dLogger.log('api_statuses_lookup()')
        #dLogger.log(' ids: %s'%ids)

    endpoint = 'https://api.twitter.com/1.1/statuses/lookup.json?id=%s' % ids
    url = (endpoint + '&include_entities=1') if include_entities else endpoint
    credentials = OAuth1(harvester.consumer_key,
                         harvester.consumer_secret,
                         harvester.access_token_key,
                         harvester.access_token_secret)

    payload = []
    try:
        payload = requests.get(url, auth=credentials).json()
        harvester.remaining_search_hits -= 1
        harvester.save()
    except Exception as e:
        dLogger.exception(e)
    return payload
def run(self):
    """Worker loop: turn queued FBPost results into stored posts.

    Each id taken from ``self.queue`` is resolved to its "FBPost"
    ``FBResult`` row and owning ``FBUser``, passed to
    ``update_user_status`` and the result row is then deleted.
    ``task_done()`` is called for every iteration.
    """
    if debugging:
        dLogger.log("<ThreadStatus#%s>::run()" % self.ident)
    logger.info(u"ThreadStatus %s. Start." % self)
    while True:
        try:
            # NOTE(review): a blocking get() without timeout does not raise
            # Queue.Empty on a standard queue, so the Queue.Empty branch
            # below looks unreachable - confirm the queue type.
            fid = self.queue.get()
            fbpost = FBResult.objects.filter(fid=fid).filter(
                ftype="FBPost")[0]
            user = FBUser.objects.get(fid=fbpost.parent)
            # NOTE(review): eval() executes whatever text is stored in the
            # result row; the payload originates from an external API, so
            # a safe parser should be considered - confirm stored format.
            rez = eval(fbpost.result)
            self.update_user_status(rez, user)
            fbpost.delete()
            #if debugging: dLogger.log(" deleted FBStatus result %s"%fbpost)
            qsize = self.queue.qsize()
            if debugging:
                dLogger.log(" %s Posts left in queue" % qsize)
            if qsize % 100 == 0:
                logger.info(" less than %s posts left in queue" %
                            self.queue.qsize())
        except ObjectDoesNotExist:
            # ``msg`` used to be referenced here without being defined.
            msg = "DEVED %s %s" % (fbpost.parent, fbpost.ftype)
            logger.exception(msg)
            if debugging:
                dLogger.exception(msg)
        except Queue.Empty:
            logger.info(u"ThreadStatus %s. Queue is empty." % self)
            break
        except Exception:
            msg = u"ThreadStatus %s. Error" % self
            logger.exception(msg)
            if debugging:
                dLogger.exception(msg)
            self._Thread__stop()
        finally:
            #signals to queue job is done
            self.queue.task_done()
    logger.info(u"ThreadStatus %s. End." % self)
    if debugging:
        dLogger.log(" <ThreadStatus#%s> ended" % self.ident)
def run(self):
    """Worker loop: turn queued comment results into stored comments.

    Each truthy id taken from ``self.queue`` is resolved to its ``FBResult``
    row and parent ``FBPost``, passed to ``update_comment_status`` and the
    result row is then deleted. ``task_done()`` is called for every
    iteration.
    """
    if debugging:
        dLogger.log("<ThreadComment#%s>::run()" % self.ident)
    logger.info(u"ThreadComment %s. Start." % self)
    while True:
        try:
            fid = self.queue.get()
            if not fid:
                logger.error(u"ThreadComment %s. fid is none! %s." %
                             (self, fid))
            else:
                fbcomment = FBResult.objects.filter(fid=fid)[0]
                post = FBPost.objects.get(fid=fbcomment.parent)
                # NOTE(review): eval() runs the stored result text as code;
                # confirm the stored format before trusting it.
                raw_comment = eval(fbcomment.result)
                self.update_comment_status(raw_comment, post)
                fbcomment.delete()
                #if debugging: dLogger.log(" deleted fbcomment result %s"%fbcomment)
                remaining = self.queue.qsize()
                if debugging:
                    dLogger.log(" %s Comments left in queue" % remaining)
                if remaining % 10000 == 0:
                    logger.info(" less than %s comments left in queue" %
                                remaining)
        except Queue.Empty:
            logger.info(u"ThreadComment %s. Queue is empty." % self)
            break
        except:
            msg = u"<p style='red'>ThreadComment %s. Error.</p>" % self
            logger.exception(msg)
            if debugging:
                dLogger.exception(msg)
        finally:
            #signals to queue job is done
            self.queue.task_done()
    logger.info(u"ThreadComment %s. End." % self)
    if debugging:
        dLogger.log(" <ThreadComment#%s> ended" % self.ident)
def get_fb_harvester_post_list(request, call_type, harvester_id):
    """Return the datatables records for Facebook posts.

    When ``harvester_id`` is the string ``'0'`` every FBPost is listed;
    otherwise only the posts whose user is handled by the FacebookHarvester
    with that primary key.
    """
    #dLogger.log('harvester_id: %s'%harvester_id)
    # columnIndexNameMap is required for correct sorting behavior
    sortable_fields = (
        u'created_time',
        u'fid',
        u'ffrom__username',
        u'name',
        u'description',
        u'caption',
        u'message',
        u'link__original_url',
        u'ftype',
        u'likes_count',
        u'shares_count',
        u'comments_count',
        u'application_raw',
        u'updated_time',
        u'story',
        u'ffrom__name',
        u'ffrom__fid',
    )
    columnIndexNameMap = dict(enumerate(sortable_fields))

    posts = None
    try:
        if harvester_id != '0':
            harvester = get_list_or_404(FacebookHarvester,
                                        pk=harvester_id)[0]
            posts = FBPost.objects.filter(user__harvester_in_charge=harvester)
        else:
            posts = FBPost.objects.all()
    except:
        # NOTE(review): this also swallows the 404 raised by
        # get_list_or_404, leaving ``posts`` as None for the call below.
        dLogger.exception("EXCEPTION OCCURED IN get_fb_harvester_post_list")

    # call to generic function from utils
    return get_datatables_records(request, posts, columnIndexNameMap,
                                  call_type)
def manage_twitter_exception(retry_count, harvester, user, tex):
    """Classify a Twitter API error and decide whether to stop retrying.

    Must be called while an exception is being handled: it logs with
    ``logger.exception`` and re-raises the active exception on a rate-limit
    error.

    Args:
        retry_count: Number of attempts made so far.
        harvester: Harvester in use; supplies ``max_retry_on_fail``.
        user: User being harvested; flagged with ``error_triggered`` when
            the page does not exist or the error is unrecognised.
        tex: The Twitter error; its text form selects the branch.

    Returns:
        A ``(retry_count, need_a_break)`` tuple with the incremented count
        and whether the caller should stop retrying this user.

    Raises:
        The active exception, when the error text starts with
        "Rate limit exceeded".
    """
    if debugging:
        dLogger.log("manage_twitter_exception()")
    retry_count += 1
    need_a_break = retry_count > harvester.max_retry_on_fail
    error_text = unicode(tex)

    # Substring test: str.find() returns -1 (truthy) when the text is
    # absent, so using its result as a boolean matched almost every error.
    if u"Sorry, that page does not exist." in error_text:
        user.error_triggered = True
        user.save()
        need_a_break = True
        msg = u"Exception for the harvester %s for %s. Retry:%d. The user does not exists!" % (
            harvester, unicode(user), retry_count)
        logger.exception(msg)
        if debugging:
            dLogger.exception(msg)
    elif error_text == u"Capacity Error":
        logger.debug(u"%s:%s. Capacity Error. Retrying." %
                     (harvester, unicode(user)))
    elif error_text.startswith(u"Rate limit exceeded"):
        harvester.update_client_stats()
        msg = u"Exception for the harvester %s for %s. Retry:%d." % (
            harvester, unicode(user), retry_count)
        logger.exception(msg)
        if debugging:
            dLogger.exception(msg)
        raise
    elif error_text in (u"{u'error': u'Invalid query'}", u"Invalid query"):
        logger.debug(u"%s:%s. Invalid query. Breaking." %
                     (harvester, unicode(user)))
        need_a_break = True
    elif error_text == u"Not authorized":
        logger.debug(
            u"Error occured in %s:%s, the user has disabled scrapping." %
            (harvester, unicode(user)))
        need_a_break = True
    else:
        msg = u"Exception for the harvester %s for %s. Retry:%d. %s" % (
            harvester, unicode(user), retry_count, tex)
        logger.exception(msg)
        if debugging:
            dLogger.exception(msg)
        user.error_triggered = True
        user.save()
    return (retry_count, need_a_break)
def update_from_rawtwitter(self, twitter_model, user, keepRaw, twython=False):
    """Synchronise this status with a raw Twitter status mapping.

    Scalar fields, the creation date, hashtags, URLs and user mentions are
    copied from ``twitter_model``; the instance is saved only when
    something changed.

    Args:
        twitter_model: Mapping describing the tweet (keys such as ``id``,
            ``text``, ``created_at`` and ``entities`` are read when present).
        user: Author of the status; always assigned to ``self.user``. Its
            ``harvester`` is given to users created from mentions.
        keepRaw: When true and the model changed, ``twitter_model`` is also
            stored in the related raw-response row.
        twython: Forwarded to ``update_from_rawtwitter`` of mentioned
            users; here it selects a date format that is identical to the
            default one.
    """
    #if debugging:
    #dLogger.log("%s::update_from_rawtwitter()"%self)
    #dLogger.pretty(twitter_model)
    model_changed = False
    # Maps attribute names on this model to keys of ``twitter_model``.
    props_to_check = {
        u"fid": u"id",
        u"favorited": u"favorited",
        u"retweet_count": u"retweet_count",
        u"retweeted": u"retweeted",
        u"source": u"source",
        u"text": u"text",
        u"truncated": u"truncated",
    }
    date_to_check = ["created_at"]
    self.user = user
    for prop in props_to_check:
        prop_name = props_to_check[prop]
        if prop_name in twitter_model:
            tw_prop_val = twitter_model[prop_name]
            # Values are read from and written to the instance __dict__
            # directly, bypassing normal attribute access.
            if self.__dict__[prop] != tw_prop_val:
                self.__dict__[prop] = tw_prop_val
                model_changed = True
                #if debugging: dLogger.log(' %s has changed: %s'%(prop, self.__dict__[prop]))
    for prop in date_to_check:
        if prop in twitter_model:
            tw_prop_val = twitter_model[prop]
            format = '%a %b %d %H:%M:%S +0000 %Y'
            # NOTE(review): both branches use the same format string.
            if twython:
                format = '%a %b %d %H:%M:%S +0000 %Y'
            date_val = datetime.strptime(tw_prop_val, format)
            if self.__dict__[prop] != date_val:
                self.__dict__[prop] = date_val
                model_changed = True
    if "entities" in twitter_model:
        entities = twitter_model["entities"]
        if "hashtags" in entities:
            tw_prop_val = entities["hashtags"]
            for twtag in tw_prop_val:
                tag = None
                # Any lookup failure is treated as "tag not stored yet".
                try:
                    tag = Tag.objects.get(text__exact=twtag["text"])
                except:
                    pass
                if tag is None:
                    tag = Tag(text=twtag["text"])
                    try:
                        tag.save()
                    except:
                        # Retry the save with an escaped copy of the text.
                        tag = Tag(
                            text=twtag["text"].encode('unicode-escape'))
                        tag.save()
                    self.hash_tags.add(tag)
                    model_changed = True
                else:
                    if tag not in self.hash_tags.all():
                        self.hash_tags.add(tag)
                        model_changed = True
        if "urls" in entities:
            tw_prop_val = entities["urls"]
            for twurl in tw_prop_val:
                url = None
                # Any lookup failure is treated as "URL not stored yet".
                try:
                    url = URL.objects.get(original_url__exact=twurl['url'])
                except:
                    pass
                if url is None:
                    url = URL(original_url=twurl['url'])
                    url.save()
                    self.text_urls.add(url)
                    model_changed = True
                elif url not in self.text_urls.all():
                    self.text_urls.add(url)
                    model_changed = True
        if "user_mentions" in entities:
            tw_prop_val = entities["user_mentions"]
            for tw_mention in tw_prop_val:
                usermention = None
                try:
                    # Look the mentioned user up by id, then by screen
                    # name, and create it when neither lookup finds it.
                    usermention = self.get_existing_user(
                        {"fid": tw_mention['id']})
                    if not usermention:
                        usermention = self.get_existing_user(
                            {'screen_name': tw_mention['screen_name']})
                    #if debugging: dLogger.log(" usermention: %s"%usermention)
                    if not usermention:
                        usermention = TWUser(
                            fid=tw_mention['id'],
                            screen_name=tw_mention['screen_name'],
                            harvester=user.harvester)
                        usermention.update_from_rawtwitter(tw_mention, twython)
                        usermention.save()
                        #if debugging: dLogger.log(" user created from user mention: %s"%usermention)
                except:
                    if debugging:
                        dLogger.exception(
                            "Exception occured while saving user:")
                if usermention is None:
                    # Nothing could be found or created above: create a
                    # user from the id alone.
                    usermention = TWUser(fid=tw_mention['id'],
                                         harvester=user.harvester)
                    usermention.update_from_rawtwitter(tw_mention, twython)
                    usermention.save()
                    self.user_mentions.add(usermention)
                    model_changed = True
                else:
                    if usermention not in self.user_mentions.all():
                        self.user_mentions.add(usermention)
                        model_changed = True
    if model_changed:
        self.model_update_date = datetime.utcnow()
        self.error_on_update = False
        if keepRaw:
            raw_data = self.raw_twitter_response.all()
            if len(raw_data) > 0:
                raw_data[0].data = twitter_model
                raw_data[0].save()
            else:
                raw_data = TWStatusRaw.objects.create(snh_status=self,
                                                      data=twitter_model)
        try:
            self.save()
        except:
            # Retry the save with escaped copies of the text fields.
            self.text = self.text.encode('unicode-escape')
            self.source = self.source.encode('unicode-escape')
            self.save()
def update_from_twitter(self, twitter_model, user, keepRaw): #if debugging: #dLogger.log("update_from_twitter()") #dLogger.log(" twitter_model: %s"%twitter_model) model_changed = False props_to_check = { u"fid": u"id", u"favorited": u"favorited", u"retweet_count": u"retweet_count", u"retweeted": u"retweeted", u"source": u"source", u"text": u"text", u"truncated": u"truncated", } date_to_check = ["created_at"] self.user = user for prop in props_to_check: prop_name = "_" + props_to_check[prop] if prop_name in twitter_model.__dict__: tw_prop_val = twitter_model.__dict__[prop_name] if self.__dict__[prop] != tw_prop_val: self.__dict__[prop] = tw_prop_val model_changed = True #if debugging: dLogger.log(' %s has changed: %s'%(prop, self.__dict__[prop])) for prop in date_to_check: prop_name = "_" + prop if prop_name in twitter_model.__dict__: tw_prop_val = twitter_model.__dict__[prop_name] date_val = datetime.strptime(tw_prop_val, '%a %b %d %H:%M:%S +0000 %Y') if self.__dict__[prop] != date_val: self.__dict__[prop] = date_val model_changed = True if "hashtags" in twitter_model.__dict__: tw_prop_val = twitter_model.__dict__["hashtags"] for twtag in tw_prop_val: tag = None try: tag = Tag.objects.filter(text=twtag.text)[0] except: pass if tag is None: try: tag = Tag(text=twtag.text) tag.save() except: tag = Tag(text=twtag.text.encode('unicode-escape')) tag.save() self.hash_tags.add(tag) model_changed = True else: if tag not in self.hash_tags.all(): self.hash_tags.add(tag) model_changed = True if "urls" in twitter_model.__dict__: tw_prop_val = twitter_model.__dict__["urls"] for twurl in tw_prop_val: url = None try: url = URL.objects.filter(original_url=twurl.url)[0] except: pass if url is None: url = URL(original_url=twurl.url) url.save() self.text_urls.add(url) model_changed = True else: if url not in self.text_urls.all(): self.text_urls.add(url) model_changed = True if "user_mentions" in twitter_model.__dict__: tw_prop_val = twitter_model.__dict__["user_mentions"] for 
tw_mention in tw_prop_val: usermention = None try: usermention = self.get_existing_user( {"fid": tw_mention.id}) #if debugging: dLogger.log(" usermention: %s"%usermention) if not usermention: usermention = self.get_existing_user( {"screen_name": tw_mention.screen_name}) if not usermention: usermention = TWUser( fid=tw_mention.id, screen_name=tw_mention.screen_name, harvester=user.harvester) usermention.update_from_twitter(tw_mention) usermention.save() if debugging: dLogger.log( " user created from user mention: %s" % usermention) except: if debugging: dLogger.exception( "AN EXCEPTION OCCURED WHILE CREATING NEW USER:"******" Status %s has changed. Updated" % self)