def doUserScrape(self, credentials=False):
    """Retrieve timelines, friends and followers for the next accounts in the user scrape."""
    keep_going = cache.get('user_scrape_' + self.request.root_id)
    if (not keep_going) or keep_going.decode('utf-8') != 'true':
        logger.info('*** STOPPED USER SCRAPE ***')
        # Mark the crawl as stopped on the crawl node.
        db = get_neo_driver()
        update_crawl(db, crawl_task=self.request.root_id, status='done')
        db.close()
        return False

    user = cache.get('scrape_user_' + self.request.root_id).decode('utf-8')
    logger.info('*** SCRAPING USER: %s... ***' % (user,))

    # cache.get returns bytes or None, so guard each decode.
    this_friend = cache.get('scrape_friends_' + self.request.root_id)
    this_friend = this_friend.decode('utf-8') if this_friend else ''
    if (not this_friend) or this_friend == 'done':
        db = get_neo_driver()
        next_friends = nextNearest(db, user, 'friends', self.request.root_id)
        db.close()
        if next_friends:
            cache.set('scrape_friends_' + self.request.root_id, 'running')
            getTwitterConnections.delay(next_friends,
                                        cacheKey='scrape_friends_' + self.request.root_id)
    else:
        logger.info('*** FRIENDS BUSY ***')

    this_follower = cache.get('scrape_followers_' + self.request.root_id)
    this_follower = this_follower.decode('utf-8') if this_follower else ''
    if (not this_follower) or this_follower == 'done':
        db = get_neo_driver()
        next_followers = nextNearest(db, user, 'followers', self.request.root_id)
        db.close()
        if next_followers:
            cache.set('scrape_followers_' + self.request.root_id, 'running')
            getTwitterConnections.delay(next_followers, friends=False,
                                        cacheKey='scrape_followers_' + self.request.root_id)
    else:
        logger.info('*** FOLLOWERS BUSY ***')

    this_tweet = cache.get('scrape_tweets_' + self.request.root_id)
    this_tweet = this_tweet.decode('utf-8') if this_tweet else ''
    if (not this_tweet) or this_tweet == 'done':
        db = get_neo_driver()
        next_tweets = nextNearest(db, user, 'tweets', self.request.root_id)
        db.close()
        if next_tweets:
            cache.set('scrape_tweets_' + self.request.root_id, 'running')
            getTweets.delay(next_tweets, maxTweets=1000, credentials=credentials,
                            cacheKey='scrape_tweets_' + self.request.root_id)
    else:
        logger.info('*** TWEETS BUSY ***')

    statuses = [cache.get(k) for k in ('scrape_friends_' + self.request.root_id,
                                       'scrape_followers_' + self.request.root_id,
                                       'scrape_tweets_' + self.request.root_id)]
    if b'running' in statuses:
        # Task arguments must go in kwargs; apply_async treats unknown keyword
        # arguments as execution options, so credentials=... would be dropped.
        doUserScrape.apply_async(kwargs={'credentials': credentials}, countdown=30)
    else:
        cache.set('user_scrape_' + self.request.root_id, 'false')
        cache.set('scrape_mode_' + self.request.root_id, '')
        logger.info('*** FINISHED SCRAPING USER: %s ***' % (user,))
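# Illustrative sketch (not part of the original module): the guarded-decode
# pattern used above could be factored into a helper, since cache.get returns
# bytes or None. The name cache_get_str is a hypothetical choice.
def cache_get_str(key):
    """Return the cached value for `key` decoded to str, or '' if unset."""
    value = cache.get(key)
    return value.decode('utf-8') if value is not None else ''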
def cluster(seed, seed_type, query_name):
    logger.info('*** START CLUSTERING: seed %s, seed_type %s, query_name %s ***'
                % (seed, seed_type, query_name))
    if seed_type == 'twitter_user':
        seed_id_name = 'screen_name'
        if query_name == 'TransFoF':
            query = twitterTransFofQuery(seed)
        elif query_name == 'FoF':
            # NOTE: the original called twitterTransFofQuery here as well, which
            # looks like a copy-paste slip; twitterFofQuery is the assumed
            # intended call for the plain FoF query.
            query = twitterFofQuery(seed)
        else:
            logger.warning('*** CLUSTERING: not yet implemented for query name %s ***' % query_name)
            return
    else:
        logger.warning('*** CLUSTERING: not yet implemented for seed type %s ***' % seed_type)
        return

    db = get_neo_driver()

    logger.info('*** CLUSTERING: get matrix for seed %s ***' % seed)
    matrix_labels_and_results = twitterMatrix(db, query)

    logger.info('*** CLUSTERING: find clusters for seed %s ***' % seed)
    cluster_results = clusterize(matrix_labels_and_results[1])

    logger.info('*** CLUSTERING: label clusters for seed %s ***' % seed)
    labelled_clusters = labelClusters(cluster_results[0], matrix_labels_and_results[0])

    if seed_type == 'twitter_user':
        logger.info('*** CLUSTERING: push seed %s ***' % seed)
        user_clusters_to_neo(db, labelled_clusters, [seed], query)
    else:
        logger.warning('*** CLUSTERING: not yet implemented for seed type %s ***' % seed_type)

    db.close()
    logger.info('*** CLUSTERING FINISHED: seed %s, seed_type %s, query_name %s ***'
                % (seed, seed_type, query_name))
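# Example usage (hypothetical screen name; assumes cluster is registered as a
# Celery task like the other functions in this module):
#     cluster.delay('example_user', 'twitter_user', 'TransFoF')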
def getTweets(self, user, maxTweets=3000, count=0, tweetId=0, cacheKey=False, credentials=False):
    """Get tweets from the timeline of the given user, push them to Neo4J.

    Positional arguments:
    user -- The screen_name of the user

    Keyword arguments:
    maxTweets -- The maximum number of tweets to retrieve
    cacheKey -- a Redis key that identifies an on-going task to grab a user's timeline
    count -- The number of tweets already retrieved, set when the task calls itself
    tweetId -- The maximum tweet ID to retrieve, set when the task calls itself
    """
    logger.info('Executing getTweets task id {0.id}, args: {0.args!r} kwargs: {0.kwargs!r}'.format(self.request))
    logger.info('task parent id {0.parent_id}, root id {0.root_id}'.format(self.request))

    api = RatedTwitter(credentials=credentials)
    limit = api.get_user_timeline_wait()
    if limit:
        logger.info('*** TWITTER RATE-LIMITED: statuses.user_timeline: %s:%s ***' % (user, str(count)))
        raise self.retry(countdown=limit)

    args = {'screen_name': user, 'exclude_replies': False, 'include_rts': True,
            'trim_user': False, 'count': 200}
    if tweetId:
        args['max_id'] = tweetId

    okay, result = api.get_user_timeline(**args)

    if okay:
        logger.info('*** TWITTER USER_TIMELINE: %s:%s ***' % (user, str(tweetId)))
        if result:
            newCount = count + len(result)
            if maxTweets and newCount > maxTweets:
                # No need for the task to call itself again;
                # give pushTweets the cache key to end the job.
                pushTweets.delay(result, user, cacheKey=cacheKey)
                return
            pushTweets.delay(result, user)
            # Request everything older than the oldest tweet in this batch.
            newTweetId = min([t['id'] for t in result]) - 1
            # Not done yet: the task calls itself with an updated count and tweetId.
            getTweets.delay(user, maxTweets=maxTweets, count=newCount, tweetId=newTweetId,
                            cacheKey=cacheKey, credentials=credentials)
        else:
            # Nothing more found, so tell pushTweets the job is done.
            pushTweets.delay([], user, cacheKey=cacheKey)
    else:
        if result == '404':
            db = get_neo_driver()
            setUserDefunct(db, user)
            db.close()
            cache.set('scrape_tweets_' + self.request.root_id, 'done')
        if result == 'limited':
            raise self.retry(countdown=api.get_user_timeline_wait())
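# Paging note (standard Twitter max_id convention, shown for clarity): each
# batch's oldest id minus one becomes the next request's max_id, so successive
# batches never overlap.
#     ids = [t['id'] for t in result]   # e.g. [1005, 1003, 1001]
#     next_max_id = min(ids) - 1        # 1000 -> next call returns ids <= 1000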
def getTwitterConnections(self, user, friends=True, cursor=-1, credentials=False, cacheKey=False):
    """Get the connections of the given user, push them to Neo4J.

    Positional arguments:
    user -- The screen_name of the user

    Keyword arguments:
    friends -- "twits" are the user's friends if True (default), else they're followers
    cacheKey -- a Redis key that identifies an on-going task to grab a user's friends or followers
    cursor -- Id of the next block of connections to retrieve, set when the task calls itself
    """
    api = RatedTwitter(credentials=credentials)
    if friends:
        method = api.get_friends_list
        limit = api.get_friends_list_wait()
        method_name = 'get_friends_list'
    else:
        method = api.get_followers_list
        limit = api.get_followers_list_wait()
        method_name = 'get_followers_list'

    if limit:
        logger.info('*** TWITTER RATE-LIMITED: %s:%s ***' % (method_name, str(cursor)))
        raise self.retry(countdown=limit)

    # We can get a maximum of 200 connections at once.
    okay, result = method(screen_name=user, cursor=cursor, count=200)

    if okay:
        logger.info('*** TWITTER CURSOR: %s:%s:%s ***' % (method_name, user, str(cursor)))
        twits = result['users']
        next_cursor = result.get('next_cursor', False)
        if next_cursor:
            # Unless the next cursor is 0, we're not done yet.
            getTwitterConnections.delay(user, friends=friends, cursor=next_cursor,
                                        cacheKey=cacheKey, credentials=credentials)
            pushTwitterConnections.delay(twits, user, friends=friends)
        else:
            # All done, send the cacheKey so the job can be marked finished.
            pushTwitterConnections.delay(twits, user, friends=friends, cacheKey=cacheKey)
    else:
        if result == 'limited':
            raise self.retry(exc=Exception('Twitter rate-limited', method_name),
                             countdown=API_TIMEOUT)
        if result == '404':
            db = get_neo_driver()
            setUserDefunct(db, user)
            db.close()
            if friends:
                cache.set('scrape_friends_' + self.request.root_id, 'done')
            else:
                cache.set('scrape_followers_' + self.request.root_id, 'done')
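# Cursoring sketch (hypothetical data, following Twitter's convention): the API
# returns next_cursor == 0 on the last page, which is falsy, so the recursion
# stops and only the final batch carries the cacheKey.
#     page1 = {'users': [...], 'next_cursor': 1591234}   # more to fetch
#     page2 = {'users': [...], 'next_cursor': 0}         # final page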
def startUserScrape(self, user, credentials=False):
    """Start scraping around the given user."""
    logger.info('*** STARTED SCRAPING: USER: %s ***' % (user,))
    cache.set('user_scrape_' + self.request.root_id, 'true')
    cache.set('scrape_mode_' + self.request.root_id, 'user')
    cache.set('scrape_user_' + self.request.root_id, user)

    # Add a crawl node for this user as the centre of the scrape.
    db = get_neo_driver()
    start_user_crawl(db, user, crawl_task=self.request.root_id, status='initiated')
    db.close()

    for key in ['scrape_friends', 'scrape_followers', 'scrape_tweets']:
        cache.set(key + '_' + self.request.root_id, '')

    for job in ['friends', 'followers', 'tweets']:
        cache_key = '_'.join(['nextnearest', job, user, self.request.root_id])
        cache.set(cache_key, '')

    doUserScrape.delay(credentials=credentials)
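# Example usage (hypothetical screen name and credentials object): kick off a
# scrape centred on one user; doUserScrape then re-schedules itself every 30
# seconds until the friends, followers and tweets sub-jobs all report 'done'.
#     startUserScrape.delay('example_user', credentials=my_credentials)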
def pushRenderedTwits2Neo(self, twits):
    """Push a list of rendered Twitter users to Neo4J."""
    db = get_neo_driver()
    users2Neo(db, twits)
    db.close()
def pushRenderedConnections2Neo(self, user, renderedTwits, friends=True):
    """Push the rendered friends or followers of the given user to Neo4J."""
    db = get_neo_driver()
    connections2Neo(db, user, renderedTwits, friends=friends)
    db.close()
def pushRenderedTweets2Neo(self, user, tweetDump):
    """Push a dump of rendered tweets for the given user to Neo4J."""
    db = get_neo_driver()
    tweetDump2Neo(db, user, tweetDump)
    db.close()
def pushRenderedMultiUserTweets2Neo(self, all_tweets_dump):
    """Push a multi-user dump of rendered tweets to Neo4J."""
    db = get_neo_driver()
    multiUserTweetDump2Neo(db, all_tweets_dump)
    db.close()