import random
import socket
import sys
import time
import traceback

import py2neo

# Names such as rh (the Redis helper), logger, finished, terminate,
# SessionHelper, Account, FollowingsCrawler, and the custom exception
# classes are defined elsewhere in this project.


def get_tasks(self):
    """Merge each following into the graph and collect uncrawled users as new tasks."""
    tasks = []
    # relationships = []
    for f in self.followings:
        if f['hashid']:
            # Find-or-create the followed user's node, then link it.
            dst_node = self.merge_node(f)
            follows = py2neo.Relationship(self.src_node, 'FOLLOWS', dst_node)
            # relationships.append(follows)
            self.g.create_unique(follows)
            # Queue this user as a new crawl task unless already done.
            if not rh.is_user_crawled(f['domain']):
                tasks.append(f['domain'].encode('utf-8'))
    # if len(relationships) > 0:
    #     self.g.create_unique(*relationships)
    return tasks
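
# For context, a hedged sketch of the merge_node() helper referenced above.
# It is defined elsewhere in this class; under the py2neo 2.x API implied by
# Graph.create_unique(), it plausibly wraps Graph.merge_one(), which finds or
# creates a single node by label and property. The 'User' label and 'hashid'
# key below are illustrative assumptions, not taken from this module.
#
#     def merge_node(self, f):
#         # Find-or-create the followed user's node, keyed on its hashid.
#         return self.g.merge_one('User', 'hashid', f['hashid'])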
def start(instance_id):
    """Worker loop: pull user tasks from Redis, crawl followings, push results."""
    global task
    global status
    task = ''
    try:
        logger.warning('Instance id: %s', instance_id)
        hostname = socket.gethostname()
        ip = socket.gethostbyname(hostname)
        start_time = int(time.time())
        session = SessionHelper()
        # Initial status record, published to Redis for monitoring.
        status = {
            'id': hostname + '-' + str(instance_id),
            'hostname': hostname,
            'ip': ip,
            'finished': finished,
            'task': '',
            'status': 'init',
            'message': '',
            'account': Account.get_using(),
            'start_time': start_time,
            'update_time': int(time.time())
        }
        rh.publish_status(status)
        while True:
            # Reset before fetching so a stale task is never pushed back
            # if getting the next one fails.
            task = ''
            task = rh.get_task_user()
            logger.warning('Get task: %s', task)
            if rh.is_user_crawled(task):
                logger.warning('User %s crawled, skip', task)
                continue
            status.update({
                'finished': finished,
                'task': task,
                'status': 'crawling',
                'account': Account.get_using(),
                'update_time': int(time.time())
            })
            rh.publish_status(status)
            try:
                fc = FollowingsCrawler(session, task)
                user = fc.get()
                logger.warning('Push result: %s', task)
                rh.push_result_user(user)
                finished['user'] += 1
                finished['followings'] += len(user['followings'])
                # Random pause between requests to avoid hammering the site.
                time.sleep(random.uniform(1, 5))
            except NotFoundException:
                logger.error('User %s not found, continue', task)
                continue
            except ResponseException:
                terminate('Crawling response error, push back task, quit')
            except RedisException:
                logger.error('Redis connection error, quit')
                sys.exit('Redis Error!')
            except NetworkException:
                terminate('Network connection error, quit')
    except Exception as e:
        # Any unexpected error: dump the traceback and shut down.
        print(traceback.format_exc())
        terminate(e)
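
# For context, a hedged sketch of the terminate() helper used above; it is
# defined elsewhere in this module. Judging from its call sites, it plausibly
# re-queues the in-flight task, publishes a final status, and exits.
# rh.push_task_user() is an assumed helper name for the push-back step, not
# confirmed by this section.
#
#     def terminate(message):
#         if task:
#             rh.push_task_user(task)  # assumed: re-queue the unfinished task
#         status.update({
#             'status': 'terminated',
#             'message': str(message),
#             'update_time': int(time.time())
#         })
#         rh.publish_status(status)
#         sys.exit(str(message))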