def farm_user_timelines(apikeys, seeds, output_folder):
    """Fetch and persist the timeline of every user resolved from *seeds*.

    Args:
        apikeys: Twitter API credentials passed through to UserFarm.
        seeds: identifiers resolvable to numeric user ids via
            ``UserFarm.get_user_ids``.
        output_folder: directory where per-user timeline output is written
            (one entry per user id).

    The farm is always closed in the ``finally`` block, even on error.
    """
    user_farm = UserFarm(apikeys=apikeys, verbose=False,
                         output_folder=os.path.abspath(output_folder))
    try:
        # Resolve seeds to numeric user ids first.
        user_ids = user_farm.get_user_ids(seeds)
        for user_id in user_ids:
            # Skip users whose result file already exists. NOTE(review):
            # this is a coarse progress check — if a previous run died
            # halfway through one user, that user's remaining tweets are
            # lost. Acceptable for the current bulk use case (millions of
            # users), but a proper progress tracker would be more reliable.
            if not os.path.exists(
                    os.path.abspath('%s/%s' % (output_folder, user_id))):
                user_farm.user_timeline(user_id)
    except KeyboardInterrupt:
        # logging.Logger.warn is a deprecated alias — use warning().
        logger.warning('You pressed Ctrl+C!')
        raise
    finally:
        # Always flush/close the farm, whatever happened above.
        # (The original also had a bare `except: raise`, which is a no-op
        # and was removed.)
        user_farm.close()
def farm_user_network(apikeys, config=None, output_folder='./farm/',
                      network_type="followers"):
    """Crawl a user network breadth-first, farming user data in parallel.

    Spawns four worker processes (timelines, favorites, retweets, mentions),
    each fed by its own bounded queue, then expands the network layer by
    layer starting from ``config['seeds']``.

    Args:
        apikeys: Twitter API credentials passed through to UserFarm.
        config: optional dict with ``'seeds'`` (list, default ``[]``) and
            ``'depth'`` (int, default 3 layers).
        output_folder: base output directory; network data is written under
            ``<output_folder>/<network_type>/`` keyed by user id.
        network_type: ``'friends'`` or ``'followers'`` (default).
    """
    # Avoid the mutable-default-argument pitfall (`config={}` is shared
    # across calls).
    config = {} if config is None else config

    network_output_folder = os.path.abspath(
        '%s/%s/' % (output_folder, network_type))  # by user id
    # Wipe any previous run's network output (ignore_errors=True).
    shutil.rmtree(network_output_folder, True)
    user_network_farmer = UserFarm(apikeys=apikeys, verbose=False,
                                   output_folder=network_output_folder)

    seeds = config['seeds'] if 'seeds' in config else []
    depth = int(config.get('depth', 3))  # by default only fetch 3 layers

    # One bounded queue + one worker process per farming task.
    user_timeline_queue = multiprocessing.Queue(maxsize=MAX_QUEUE_SIZE)
    p = multiprocessing.Process(
        target=farm_user_timelines,
        args=(apikeys, user_timeline_queue, output_folder))
    p.start()

    user_favorites_queue = multiprocessing.Queue(maxsize=MAX_QUEUE_SIZE)
    q = multiprocessing.Process(
        target=farm_user_favorites,
        args=(apikeys, user_favorites_queue, output_folder))
    q.start()

    user_retweets_queue = multiprocessing.Queue(maxsize=MAX_QUEUE_SIZE)
    r = multiprocessing.Process(
        target=farm_user_retweets,
        args=(apikeys, user_retweets_queue, output_folder))
    r.start()

    user_mentions_queue = multiprocessing.Queue(maxsize=MAX_QUEUE_SIZE)
    s = multiprocessing.Process(
        target=farm_user_mentions,
        args=(apikeys, user_mentions_queue, output_folder))
    s.start()

    # Resolve the seeds to user ids — the initial BFS frontier.
    user_network_queue = user_network_farmer.get_user_ids(seeds)
    try:
        while depth > 0 and len(user_network_queue) > 0:
            temp_user_network_queue = set()
            for user_id in user_network_queue:
                time.sleep(5)  # crude rate limiting between users
                if network_type == 'friends':
                    f_ids = user_network_farmer.find_all_friends(user_id)
                else:
                    f_ids = user_network_farmer.find_all_followers(user_id)
                logger.info('user_id: %d has %d friends'
                            % (user_id, len(f_ids)))
                for f_id in f_ids:
                    # NOTE(review): pushing f_id into the worker queues here
                    # would also farm timelines/favorites/retweets/mentions
                    # of every *discovered* user, not just the seeds — that
                    # was intentionally disabled in the original.
                    temp_user_network_queue.add(f_id)
            user_network_farmer.close()  # force flush once per layer
            logger.info('finish depth: %d' % (depth))
            depth -= 1
            user_network_queue = temp_user_network_queue
    except KeyboardInterrupt:
        print()
        logger.error('You pressed Ctrl+C!')
        raise
    finally:
        user_network_farmer.close()
        # Sentinel -1 tells each worker to drain its queue and exit.
        user_timeline_queue.put_nowait(-1)
        user_favorites_queue.put_nowait(-1)
        user_retweets_queue.put_nowait(-1)
        user_mentions_queue.put_nowait(-1)
        p.join()
        q.join()
        r.join()
        s.join()
        logger.info('all done')
def farm_user_network(apikeys, config=None, output_folder='./farm/',
                      network_type="followers"):
    """Crawl a user network breadth-first while farming timelines in parallel.

    A single worker process consumes discovered user ids from a bounded
    queue and farms their timelines while this process keeps expanding the
    network layer by layer from ``config['seeds']``.

    Args:
        apikeys: Twitter API credentials passed through to UserFarm.
        config: optional dict with ``'seeds'`` (list, default ``[]``) and
            ``'depth'`` (int, default 3 layers).
        output_folder: base output directory; network data is written under
            ``<output_folder>/<network_type>/`` keyed by user id.
        network_type: ``'friends'`` or ``'followers'`` (default).
    """
    # Avoid the mutable-default-argument pitfall (`config={}` is shared
    # across calls).
    config = {} if config is None else config

    network_output_folder = os.path.abspath(
        '%s/%s/' % (output_folder, network_type))  # by user id
    # Wipe any previous run's network output (ignore_errors=True).
    shutil.rmtree(network_output_folder, True)
    user_network_farmer = UserFarm(apikeys=apikeys, verbose=False,
                                   output_folder=network_output_folder)

    seeds = config['seeds'] if 'seeds' in config else []
    depth = int(config.get('depth', 3))  # by default only fetch 3 layers

    user_timeline_queue = multiprocessing.Queue(maxsize=MAX_QUEUE_SIZE)
    p = multiprocessing.Process(
        target=farm_user_timelines,
        args=(apikeys, user_timeline_queue, output_folder))
    p.start()

    # Resolve the seeds to user ids — the initial BFS frontier.
    user_network_queue = user_network_farmer.get_user_ids(seeds)
    try:
        while depth > 0 and len(user_network_queue) > 0:
            temp_user_network_queue = set()
            for user_id in user_network_queue:
                time.sleep(5)  # crude rate limiting between users
                if network_type == 'friends':
                    f_ids = user_network_farmer.find_all_friends(user_id)
                else:
                    f_ids = user_network_farmer.find_all_followers(user_id)
                logger.info('user_id: %d has %d friends'
                            % (user_id, len(f_ids)))
                for f_id in f_ids:
                    # Blocks when the timeline worker falls behind (the
                    # queue is bounded by MAX_QUEUE_SIZE).
                    user_timeline_queue.put(f_id, block=True)
                    temp_user_network_queue.add(f_id)
            user_network_farmer.close()  # force flush once per layer
            logger.info('finish depth: %d' % (depth))
            depth -= 1
            user_network_queue = temp_user_network_queue
    except KeyboardInterrupt:
        print()
        logger.error('You pressed Ctrl+C!')
        raise
    finally:
        user_network_farmer.close()
        # Sentinel -1 tells the worker to drain its queue and exit.
        user_timeline_queue.put_nowait(-1)
        p.join()
        logger.info('all done')
def farm_user_network(apikeys=None, seeds=None, depth=3,
                      output_folder='./user_network',
                      network_type='followers'):
    """Crawl a user network breadth-first with retries and pickled progress.

    Args:
        apikeys: Twitter API credentials passed through to UserFarm.
        seeds: identifiers resolvable to numeric user ids (default: none).
        depth: number of network layers to expand (default 3).
        output_folder: base output directory; data is written under
            ``<output_folder>/<network_type>/`` keyed by user id.
        network_type: ``'friends'`` or ``'followers'`` (default).

    Progress is persisted to ``progress.pickle`` on exit (including on
    error/Ctrl+C) so a later run can resume at the deepest recorded layer.
    """
    # Avoid the mutable-default-argument pitfall (`seeds=[]` is shared
    # across calls).
    seeds = [] if seeds is None else seeds

    output_folder = os.path.abspath('%s/%s' % (output_folder, network_type))
    user_farm = UserFarm(apikeys=apikeys, verbose=False,
                         output_folder=output_folder)

    # Best-effort resume: a missing/corrupt pickle just means a fresh start.
    progress = {}
    try:
        with open('progress.pickle', 'rb') as pf:
            progress = pickle.load(pf)
    except Exception:
        pass
    try:
        depth = max(progress.keys())  # ValueError when progress is empty
        logger.info('resume from depth: %d' % (depth))
    except ValueError:
        pass

    try:
        # Resolve seeds to numeric user ids — the initial BFS frontier.
        # NOTE(review): on resume this overwrites progress[depth] with the
        # fresh seed ids, discarding the saved frontier at the top layer —
        # confirm that is intended.
        user_ids = user_farm.get_user_ids(seeds)
        progress[depth] = user_ids
        logger.info("number of seeds: %d" % len(user_ids))
        # BUG FIX: the original tested `len(user_ids)`, which aliases only
        # the first layer's set; once that layer is drained by pop() the
        # loop exited, so the crawl never descended past one depth. Test
        # the current frontier instead.
        while depth > 0 and len(progress.get(depth, ())) > 0:
            time.sleep(5)
            progress[depth - 1] = set()  # next layer's frontier
            while len(progress[depth]) > 0:
                user_id = progress[depth].pop()
                logger.info("fetching %s of %d" % (network_type, user_id))
                # Skip users already fetched by a previous run.
                if os.path.exists(
                        os.path.abspath('%s/%s' % (output_folder, user_id))):
                    logger.info("%d already fetched... pass" % user_id)
                    continue
                retry = False
                retry_cnt = MAX_RETRY_CNT
                while True:
                    try:
                        if network_type == 'friends':
                            f_ids = user_farm.find_all_friends(user_id)
                        else:
                            f_ids = user_farm.find_all_followers(user_id)
                        retry = False
                        retry_cnt = MAX_RETRY_CNT
                        if depth - 1 > 0:
                            progress[depth - 1].update(f_ids)
                    # `except Exception` (not bare `except:`) so Ctrl+C is
                    # no longer swallowed into a 60s retry sleep.
                    except Exception:
                        retry = True
                        retry_cnt -= 1
                        time.sleep(60)  # back off before the next attempt
                        logger.info("retries remaining if failed %d"
                                    % (retry_cnt))
                    if not retry or retry_cnt == 0:
                        break
                if retry and retry_cnt == 0:
                    # All retries failed: push the user back for a later run.
                    progress[depth].add(user_id)
            logger.info('finish depth: %d' % (depth))
            depth -= 1
    except KeyboardInterrupt:
        print()
        logger.error('You pressed Ctrl+C!')
        raise
    finally:
        user_farm.close()
        # Persist progress so the next run can resume where this one left off.
        with open('progress.pickle', 'wb') as pf:
            pickle.dump(progress, pf)
def farm_user_network(apikeys=None, seeds=None, depth=3,
                      output_folder='./user_network',
                      network_type='followers'):
    """Breadth-first network crawl with per-user retries and resumability.

    Args:
        apikeys: Twitter API credentials passed through to UserFarm.
        seeds: identifiers resolvable to numeric user ids (default: none).
        depth: number of network layers to expand (default 3).
        output_folder: base output directory; data is written under
            ``<output_folder>/<network_type>/`` keyed by user id.
        network_type: ``'friends'`` or ``'followers'`` (default).

    ``progress.pickle`` is loaded at start (best effort) and written on
    every exit path so an interrupted crawl can be resumed.
    """
    # `seeds=[]` was a mutable default shared across calls — use None.
    seeds = [] if seeds is None else seeds

    output_folder = os.path.abspath('%s/%s' % (output_folder, network_type))
    user_farm = UserFarm(apikeys=apikeys, verbose=False,
                         output_folder=output_folder)

    # Best-effort resume: missing/corrupt pickle means a fresh start.
    progress = {}
    try:
        with open('progress.pickle', 'rb') as pf:
            progress = pickle.load(pf)
    except Exception:
        pass
    try:
        depth = max(progress.keys())  # ValueError when progress is empty
        logger.info('resume from depth: %d' % (depth))
    except ValueError:
        pass

    try:
        # The initial BFS frontier comes from the seeds.
        # NOTE(review): on resume this replaces the saved top-layer frontier
        # with the fresh seed ids — confirm that is intended.
        user_ids = user_farm.get_user_ids(seeds)
        progress[depth] = user_ids
        logger.info("number of seeds: %d" % len(user_ids))
        # BUG FIX: the original condition `len(user_ids) > 0` referenced the
        # first layer's (drained) set, terminating after a single depth.
        # Check the frontier of the *current* depth instead.
        while depth > 0 and len(progress.get(depth, ())) > 0:
            time.sleep(5)
            progress[depth - 1] = set()  # frontier for the next layer
            while len(progress[depth]) > 0:
                user_id = progress[depth].pop()
                logger.info("fetching %s of %d" % (network_type, user_id))
                # Skip users already fetched by a previous run.
                if os.path.exists(
                        os.path.abspath('%s/%s' % (output_folder, user_id))):
                    logger.info("%d already fetched... pass" % user_id)
                    continue
                retry = False
                retry_cnt = MAX_RETRY_CNT
                while True:
                    try:
                        if network_type == 'friends':
                            f_ids = user_farm.find_all_friends(user_id)
                        else:
                            f_ids = user_farm.find_all_followers(user_id)
                        retry = False
                        retry_cnt = MAX_RETRY_CNT
                        if depth - 1 > 0:
                            progress[depth - 1].update(f_ids)
                    # Narrowed from a bare `except:` so KeyboardInterrupt is
                    # not absorbed into the retry/back-off loop.
                    except Exception:
                        retry = True
                        retry_cnt -= 1
                        time.sleep(60)  # back off before retrying
                        logger.info("retries remaining if failed %d"
                                    % (retry_cnt))
                    if not retry or retry_cnt == 0:
                        break
                if retry and retry_cnt == 0:
                    # Exhausted retries: re-queue the user for a later run.
                    progress[depth].add(user_id)
            logger.info('finish depth: %d' % (depth))
            depth -= 1
    except KeyboardInterrupt:
        print()
        logger.error('You pressed Ctrl+C!')
        raise
    finally:
        user_farm.close()
        # Persist progress on every exit path (success, error, Ctrl+C).
        with open('progress.pickle', 'wb') as pf:
            pickle.dump(progress, pf)