def generate_tasks(self):
    """Refresh the shared ``clients_queue`` with currently valid Twitter clients.

    Despite the name, this generator does not produce real work items: it
    rebuilds the client queue from profiles whose Twitter-app credentials
    are not flagged as erroneous, then yields a single ``None`` so the
    scheduler treats it as a (no-op) task source.

    Raises:
        MailReportableException: when no valid client exists; every
            profile's error flag is reset first so they are all
            re-validated on the next run.
    """
    all_profiles = UserProfile.objects.filter(
        twitterApp_parameters_error=False)
    clients_list = get_client_list(all_profiles)
    # NOTE(review): the same filter is applied again below; presumably
    # get_client_list() can flip the error flag, so re-filtering the lazy
    # queryset drops clients invalidated during that call — confirm.
    all_profiles = all_profiles.filter(
        twitterApp_parameters_error=False
    )  # 2 times insures the Twitter app is valid
    if len(all_profiles) == 0:
        log('No valid Twitter client exists!')
        # Reset every profile's flag so the next run re-checks them all.
        for profile in UserProfile.objects.all():
            profile.twitterApp_parameters_error = False
            profile.save()
        raise MailReportableException(
            'Twitter harvest has not launched',
            'No valid Twitter client exists! (reseting them all)')
    clients_queue.maxsize = len(clients_list)
    clear_twitter_client_queue()
    log('Valid Twitter clients: %s' %
        [str(client) for client in clients_list])
    for client in clients_list:
        clients_queue.put(client)
    yield None
def harvest_twitter_user(twitter_user_harvester):
    """Harvest a user's timeline tweets within the harvester's date window.

    Pulls statuses page by page through a ``CustomCursor``, queues every
    tweet dated on/before ``harvest_until`` for updating, and stops once a
    tweet older than ``harvest_since`` is reached or after more than 10
    consecutive empty reads.  Marks the harvester completed and persists
    it when done.
    """
    user = twitter_user_harvester.twitter_user
    timeline = CustomCursor('user_timeline', id=user._ident, count=200)
    since = twitter_user_harvester.harvest_since
    until = twitter_user_harvester.harvest_until
    log('harvesting {} tweets from {} to {}'.format(
        user, since.strftime("%Y-%m-%d"), until.strftime("%Y-%m-%d")))
    empty_reads = 0
    while True:
        status = timeline.next()
        if not status:
            empty_reads += 1
            if empty_reads > 10:
                break
            continue
        empty_reads = 0
        # Normalize the timestamp to midnight UTC so the window
        # comparisons below are date-granular.
        day = status.created_at.replace(
            hour=0, minute=0, second=0, microsecond=0, tzinfo=utc)
        if day <= until:
            global_task_queue.add(update_tweet_from_response, [status])
        if day < since:
            break
    log('Tweet-harvest completed for {}'.format(twitter_user_harvester))
    twitter_user_harvester.harvest_completed = True
    twitter_user_harvester.save()
def _fetch_tweets_from_html(term, since, until):
    """Scrape tweet ids for *term* from the public Twitter HTML search page.

    Args:
        term: search term / hashtag text (without the leading '#').
        since: start date; formatted as YYYY-MM-DD into the query.
        until: end date; formatted as YYYY-MM-DD into the query.

    Returns:
        list[int]: the ``data-item-id`` of every tweet <li> on the page.

    Retries indefinitely (with a 1 s pause) on socket timeouts.  The retry
    is now an iterative loop instead of the previous self-recursion, so a
    long network outage can no longer exhaust the call stack.
    """
    while True:
        monitor_stop_flag()
        url = 'https://twitter.com/search?q={} since%3A{} until%3A{}'.format(
            term, since.strftime("%Y-%m-%d"), until.strftime("%Y-%m-%d"))
        log(url)
        # A fresh Request each attempt so the User-Agent is re-randomized,
        # as the recursive version did.
        request = Request(url, headers={
            'User-Agent': random.choice(BROWSER_USER_AGENTS),
            'Host': 'twitter.com',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,fr-CA;q=0.8,en;q=0.5,fr;q=0.3',
            'Accept-Encoding': 'gzip, deflate, br',
            'Referer': 'https://twitter.com/',
            'DNT': '1',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'Cache-Control': 'max-age=0, no-cache',
            'TE': 'Trailers',
            'Pragma': 'no-cache',
        })
        try:
            # NOTE(review): certificate verification is deliberately(?)
            # disabled via a private ssl API — confirm this is intended.
            data = urlopen(request, timeout=5,
                           context=ssl._create_unverified_context())
            page = bs(data, "html.parser")
        except socket.timeout:
            log('Socket timeout while fetching tweets from hashtag: #{}'.format(term))
            safe_sleep(1)
            continue
        tweets = page.find_all('li', {"data-item-type": "tweet"})
        return [int(tweet['data-item-id']) for tweet in tweets
                if tweet.has_attr('data-item-id')]
def monitor_progress():
    """Periodically surface worker errors, enforce the RAM cap, and log status.

    Loops forever: each cycle drains at most one queued (thread, error)
    pair and dispatches it to ``manage_exception``, raises
    ``MaxRAMUsageLimitReachedException`` once resident memory exceeds the
    configured megabyte limit, and displays the job statuses.
    """
    while True:
        time.sleep(MONITORING_DELAY_IN_SECONDS)
        if not global_errors.empty():
            failed_thread, exc = global_errors.get()
            log('ERROR OCCURED IN THREAD: {}'.format(failed_thread))
            manage_exception(exc)
        # memory_info()[0] is resident set size in bytes; compare in MB.
        rss_in_mb = global_process.memory_info()[0] // 1000000
        if rss_in_mb > MAX_RAM_USAGE_LIMIT_IN_MEGABYTE:
            raise MaxRAMUsageLimitReachedException
        display_jobs_statuses()
def harvest_twitter_hashtag(twitter_hashtag_harvester):
    # Fetch the tweet ids matching the harvester's hashtag over its date
    # window and pretty-print them.
    # NOTE(review): this loop has no break and never advances since/until,
    # so it re-fetches the same search page forever; it appears to rely on
    # monitor_stop_flag() raising to exit, or to be unfinished — confirm.
    twitter_hashtag = twitter_hashtag_harvester.twitter_hashtag
    log(twitter_hashtag)
    while True:
        monitor_stop_flag()
        tweet_ids = _fetch_tweets_from_html(
            twitter_hashtag.term,
            twitter_hashtag_harvester.harvest_since,
            twitter_hashtag_harvester.harvest_until
        )
        pretty(tweet_ids)
def generate_tasks(self):
    """Yield one ``(harvest_twitter_user, [harvester])`` task per pending user.

    When at least one harvester is pending, logs how many of the known
    Twitter-user ItemHarvesters are still due before yielding.
    """
    pending = _get_twitter_user_list()
    pending_count = pending.count()
    if pending_count:
        total = ItemHarvester.objects.filter(
            twitter_user__isnull=False).count()
        log("{}/{} Twitter users to tweet-harvest".format(
            pending_count, total))
    for harvester in pending:
        yield harvest_twitter_user, [harvester]
def update_tweet_from_response(tweet_response):
    """Create or refresh the local Tweet row from a tweepy status response.

    Args:
        tweet_response: tweepy Status object; ``_json`` carries the raw
            API payload consumed by ``UpdateFromResponse``.

    If the tweet's author is unknown locally (``TWUser.DoesNotExist``) the
    tweet is kept with ``user = None``.  Tweets whose author is actively
    harvested get a higher update frequency.
    """
    tweet, new = Tweet.objects.get_or_create(_ident=tweet_response.id)
    try:
        tweet.UpdateFromResponse(tweet_response._json)
    except TWUser.DoesNotExist:
        log("tweet #{}'s user does not exists!".format(tweet._ident))
        tweet.user = None
    tweet.save()
    # Bug fix: tweet.user may be None (set in the except branch above);
    # previously it was dereferenced unconditionally and raised
    # AttributeError for tweets with an unknown author.
    if tweet.user is not None and tweet.user.harvested_by:
        tweet._update_frequency = 1
    else:
        tweet._update_frequency = 5
    # NOTE(review): _update_frequency is assigned after save(); if it is a
    # persisted field, the new value is never stored — confirm intent.
def setInReplyToUser(self, **kwargs):
    """Resolve (or create) the TWUser this object replies to and link it.

    On a lookup/creation failure — typically multiple TWUser rows matching
    *kwargs* — the first duplicate is flagged and saved, the situation is
    logged, and the original exception is re-raised after a short pause.
    """
    try:
        twuser, new = get_from_any_or_create(TWUser, **kwargs)
    except Exception:  # narrowed from a bare except; still re-raised below
        log('kwargs: %s' % kwargs)
        doubles = TWUser.objects.filter(**kwargs)
        # Bug fix: indexing a queryset re-runs the query each time, so the
        # flag used to be set on one instance while save() was called on a
        # different, freshly fetched instance — the flag never persisted.
        first = doubles[0]
        first._has_duplicate = True
        first.save()
        log('TWUSER %s HAS %s DUPLICATES!' % (first, doubles.count() - 1))
        time.sleep(3)
        raise
    self.in_reply_to_user = twuser
def execute(self):
    """Run one harvest job: spin up consumer/producer threads and monitor them.

    ``monitor_progress()`` blocks until something goes wrong.  On
    ``MaxRAMUsageLimitReachedException`` all threads are torn down and the
    task queues cleared; on any other exception the error is re-raised in
    DEBUG, otherwise mailed.  Either way control falls through to the
    final 'harvest ended' log.
    """
    try:
        log('New job started.\n\n')
        log('Running job: "{}"'.format(self.name))
        generate_consumers()
        generate_producers()
        monitor_progress()
        log('Job "{}" has completed.'.format(self.name))
    except MaxRAMUsageLimitReachedException:
        logError("Max RAM usage limit reached {} Mb. Restarting".format(
            MAX_RAM_USAGE_LIMIT_IN_MEGABYTE
        ))
        end_threads()
        global_task_queue.clear()
        # Re-arm the stop flag so workers can run again in a future job.
        global_thread_stop_flag[0] = False
        time.sleep(5)
        # NOTE(review): the message says "Restarting" but the actual
        # restart call below is disabled — confirm this is intentional.
        # return self.execute()
    except Exception:
        end_threads()
        msg = "An unknown exception occured while harvesting data."
        logError(msg)
        if DEBUG:
            raise
        else:
            mail_log('Aspira - Harvest Unknown Error', msg)
    log('harvest ended')
def updateStatistics(self, jObject):
    """Record time-series statistic values for this comment from *jObject*.

    For each tracked attribute name in ``self.statistics`` (each a related
    manager of count rows), walks the configured key path into the raw
    response dict and appends a new count row when the value changed and
    no row was recorded today.

    Args:
        jObject: dict — raw API response to read the values from.
    """
    for attrName in self.statistics:
        # Newest record first, so countObjs[0] is the latest value.
        countObjs = getattr(self, attrName).order_by('-recorded_time')
        objType = countObjs.model
        val = jObject
        # Walk the key path; a missing key is logged but the remaining
        # keys are still attempted on the partial value.
        # NOTE(review): continuing after a miss can leave ``val`` as a
        # dict/partial value instead of a number — confirm this
        # best-effort behavior is intended.
        for key in self.statistics[attrName]:
            if key in val:
                val = val[key]
            else:
                log('Invalid dict sequence: %s' % self.statistics[attrName])
        if not countObjs.exists():
            objType.objects.create(comment=self, value=val)
        else:
            # Only append when the value changed and no record exists yet
            # for today.
            if countObjs[0].value != int(
                    val) and countObjs[0].recorded_time != today():
                objType.objects.create(comment=self, value=val)
def update_twitter_users(twitter_user_batch):
    """Bulk-refresh a batch of TWUser rows via the ``lookup_users`` endpoint.

    Users present in the API response are queued for a field update; users
    the API did not return (e.g. suspended/deleted accounts) are stamped
    as updated today with a lowered refresh frequency.

    Args:
        twitter_user_batch: queryset of TWUser rows to refresh.
    """
    if not twitter_user_batch.count():
        return
    twitter_user_batch = list(twitter_user_batch)
    client = get_client('lookup_users')
    try:
        responses = client.call(
            'lookup_users',
            user_ids=[user._ident for user in twitter_user_batch])
    except tweepy.error.TweepError:
        log('got tweepy.error.TweepError!')
        log('user_ids = %s' % [user._ident for user in twitter_user_batch])
        raise
    finally:
        # Bug fix: always hand the client back to the pool, even when an
        # unexpected (non-TweepError) exception escapes client.call();
        # previously such errors leaked the client.
        return_client(client)
    for response in responses:
        monitor_stop_flag()
        tw_user = next((user for user in twitter_user_batch
                        if user._ident == response._json['id']), None)
        if tw_user:
            global_task_queue.add(update_twitter_user_from_response,
                                  args=[tw_user, response._json])
            twitter_user_batch.remove(tw_user)
    # Anything left in the batch was not returned by the API.
    for tw_user in twitter_user_batch:
        log('Twitter user (%s) has returned no result.' % tw_user)
        # twUser._error_on_update = True
        tw_user._last_updated = today()
        tw_user._update_frequency = 5
        tw_user.save()
def update_tweets(tweet_batch):
    """Bulk-refresh a batch of Tweet rows via the ``statuses_lookup`` endpoint.

    Tweets present in the API response are queued for a field update;
    tweets the API did not return are considered deleted and stamped with
    today's date.

    Args:
        tweet_batch: queryset of Tweet rows to refresh.
    """
    if not tweet_batch.count():
        return
    tweet_batch = list(tweet_batch)
    client = get_client('statuses_lookup')
    try:
        responses = client.call('statuses_lookup',
                                id_=[tweet._ident for tweet in tweet_batch],
                                trim_user=True)
    except tweepy.error.TweepError:
        log('got tweepy.error.TweepError!')
        log('tweet ids = %s' % [tweet._ident for tweet in tweet_batch])
        raise
    finally:
        # Bug fix: always hand the client back to the pool, even when an
        # unexpected (non-TweepError) exception escapes client.call();
        # previously such errors leaked the client.
        return_client(client)
    for response in responses:
        monitor_stop_flag()
        tweet = next((tweet for tweet in tweet_batch
                      if tweet._ident == response._json['id']), None)
        if tweet:
            global_task_queue.add(update_tweet_from_response, [response])
            tweet_batch.remove(tweet)
    # Anything left in the batch was not returned: treat it as deleted.
    deleted_count = 0
    for tweet in tweet_batch:
        deleted_count += 1
        tweet.deleted_at = today()
        tweet.save()
    if deleted_count > 0:
        log("{} tweets have been deleted".format(deleted_count))
def run(self):
    """Thread main loop: run ``execute()`` until stopped or a fatal error.

    Non-fatal errors are logged and retried after
    ``relaunch_delay_in_seconds``; the retry is now an iterative loop
    (previously ``return self.run()``, which grew the call stack on every
    relaunch of a long-lived thread).  ``GlobalStopFlag`` ends the thread
    gracefully; any other exception is reported through ``global_errors``.
    """
    while True:
        # Logged on every (re)launch, matching the old recursive behavior.
        log('%s has started' % self.name)
        try:
            while True:
                monitor_stop_flag()
                self.execute()
        except NonFatalExeption:
            logError(
                "({}) has encountered a non-fatal error. Relaunching in {} "
                "seconds".format(
                    self.name, self.relaunch_delay_in_seconds
                )
            )
            safe_sleep(self.relaunch_delay_in_seconds)
            continue
        except GlobalStopFlag:
            log("Thread ended gracefully.")
            return
        except Exception as e:
            global_errors.put((self, e))
            return
def end_threads():
    """Signal all workers to stop and join each live thread (3 s timeout each)."""
    log('Ending all threads.')
    # Raising the shared flag makes monitor_stop_flag() trip in workers.
    global_thread_stop_flag[0] = True
    for worker in threads_list[0]:
        if not worker.is_alive():
            continue
        log('Joining thread %s' % worker.name)
        worker.join(timeout=3)
    log('Successfully joined all threads')
def joinTWUsers(user1, user2):
    """Merge *user2* into *user1* and delete *user2*.

    Copies user2's screen_name/_ident onto user1 when set, re-parents all
    of user2's history rows (screen names, counters, etc.) to user1, then
    saves user1 and removes user2.

    Returns:
        The surviving *user1*.
    """
    if user2.screen_name:
        user1.screen_name = user2.screen_name
    if user2._ident:
        user1._ident = user2._ident
    history_labels = (
        'screen_names', 'names', 'time_zones', 'urls', 'descriptions',
        'statuses_counts', 'favourites_counts', 'followers_counts',
        'friends_counts', 'listed_counts',
    )
    for label in history_labels:
        log('transfering all %s from %s to %s' % (label, user2, user1))
        for record in getattr(user2, label).all():
            record.twuser = user1
            record.save()
    user1.save()
    user2.delete()
    return user1
def clear_twitter_client_queue():
    """Drain every pending client from the shared ``clients_queue``."""
    log('Clearing Twitter Clients queue')
    while True:
        if clients_queue.empty():
            break
        clients_queue.get()
def clear(self):
    """Discard all pending tasks under the queue mutex.

    Bug fix: the previous body did ``for q in self._tasks_queues.values():
    del q``, which only unbinds the loop variable — a no-op that left
    every queue intact.  The per-type queues are now actually dropped.
    """
    with self._mutex:
        # NOTE(review): a consumer that cached a reference to one of the
        # individual queue objects would still hold the old (unemptied)
        # queue — confirm consumers re-look queues up from this dict.
        self._tasks_queues.clear()
        log("Cleared the tasks queues.")
def truncate_text(self):
    """Normalize ``self.text``: coerce falsy text to "" and clamp its length.

    Texts longer than ``self._text_max_length`` are cut and suffixed with
    '...' so the result is exactly ``_text_max_length`` characters, and
    the truncation is logged.

    Bug fix: the comparison was ``>=``, which needlessly mangled text that
    already fit exactly at the limit; only strictly longer text is now
    truncated.
    """
    if not self.text:
        self.text = ""
    if len(self.text) > self._text_max_length:
        self.text = self.text[0:self._text_max_length - 3] + '...'
        log('%s\'s text has been truncated!' % self)