def run(self):
    """Crawl followers for ``self.job`` until its user limit is reached or a stop is requested.

    Registers the job's API credentials on the shared ``Connection``,
    initializes the job's collection if the job does not exist yet, then
    repeatedly asks ``self.job.crawling_strategy`` for the next user and
    processes one page of that user's followers.

    The loop ends when the number of documents in the job's collection reaches
    ``self.job.user_limit`` or when ``self.exit`` is set.
    """
    Connection.Instance().set_access_token_secret(self.job.access_token,
                                                  self.job.access_secret)
    if not Connection.Instance().job_exists(self.job):
        initialize_job(self.job)

    print("{} started !".format(self.job.name))
    collection_job = Connection.Instance().jobs_db[self.job.name]

    # NOTE(review): assuming collection_job is a PyMongo Collection, count()
    # is deprecated since PyMongo 3.7 and removed in 4.0; switch to
    # count_documents({}) once the minimum driver version is confirmed.
    current_number_of_users = collection_job.count()
    while (current_number_of_users < self.job.user_limit
           and not self.exit.is_set()):
        # Credentials are deliberately NOT included in the heartbeat: printing
        # the access token/secret would leak them into stdout and log files.
        print("Heartbeat from job {}".format(self.job.name))
        next_user = self.job.crawling_strategy(collection_job)
        print("Fetching followers of {}...".format(next_user["screen_name"]))
        process_user(next_user, self.job, collection_job)
        current_number_of_users = collection_job.count()

    # The loop has no ``break``, so this always runs once the loop ends,
    # including when the job was stopped through ``self.exit``.
    print(
        "!!!JOB FINISHED!!!\nuser limit : {}, number of collected users : {}"
        .format(self.job.user_limit, current_number_of_users))
def execute_job(job):
    """Run a crawl for ``job`` until its collection holds ``job.user_limit`` users.

    Sets the job's API credentials on the shared ``Connection``, initializes
    the job if it does not exist yet, and then keeps processing the user
    picked by ``job.crawling_strategy`` until the document count of the job's
    collection is no longer below ``job.user_limit``.
    """
    Connection.Instance().set_access_token_secret(job.access_token,
                                                  job.access_secret)

    job_is_new = not Connection.Instance().job_exists(job)
    if job_is_new:
        initialize_job(job)

    print("{} started !".format(job.name))
    users = Connection.Instance().jobs_db[job.name]

    # NOTE(review): count() is deprecated in newer PyMongo releases, assuming
    # this is a PyMongo collection; confirm the driver version before changing.
    collected = users.count()
    while collected < job.user_limit:
        target = job.crawling_strategy(users)
        print("Fetching followers of {}...".format(target["screen_name"]))
        process_user(target, job, users)
        collected = users.count()

    # Reached once the loop condition turns false (the loop never breaks).
    summary = "!!!JOB FINISHED!!!\nuser limit : {}, number of collected users : {}"
    print(summary.format(job.user_limit, collected))
def process_user(user, job, collection_job):
    """Fetch one page of followers for ``user`` and record it in ``collection_job``.

    Args:
        user: Mapping with at least ``"screen_name"``, ``"last_cursor"`` and
            ``"id"`` keys.
        job: Job object; only ``job.name`` is read here.
        collection_job: Collection holding the job's user documents, keyed by
            ``"id"``.

    If the follower lookup returns a falsy result, the user is marked with
    ``authorized: False`` and nothing else is stored. Otherwise the page of
    follower ids is handed to the ``"default"`` queue for ``save_new_users``
    and merged into the user's ``follower_ids``, together with the new cursor
    and a ``finished`` flag (true when the next cursor is ``0``).
    """
    result = get_followers_page_and_next_cursor(user["screen_name"],
                                                user["last_cursor"])
    if not result:
        print("...Account unauthorized, skipping")
        collection_job.update_one({"id": user["id"]},
                                  {"$set": {"authorized": False}})
        return
    page, next_cursor = result

    # find user id's that are not currently in the database and fetch their
    # profiles (delegated to save_new_users on the background queue)
    try:
        q = Queue("default", connection=Connection.Instance().redis_server)
        q.enqueue(save_new_users, args=(page, job.name))
    except ModuleNotFoundError as e:
        # Best effort: a missing module only prevents the enqueue; the
        # follower ids are still recorded below.
        print(e)

    # update_one instead of the legacy Collection.update(): same
    # single-document semantics, consistent with the call above, and still
    # available in current PyMongo releases.
    collection_job.update_one(
        {"id": user["id"]},
        {
            "$addToSet": {"follower_ids": {"$each": page}},
            "$set": {
                "finished": next_cursor == 0,
                "last_cursor": next_cursor,
            },
        },
    )
def initialize_job(job):
    """Seed the collection of ``job`` with the profiles from ``job.seed_list``.

    Each fetched profile gets a ``"features"`` dict mapping every classifier's
    ``__name__`` to the value it returns for that profile. The job's
    collection then gets a unique index on ``"id"`` and the profiles are
    inserted in a single batch.
    """
    print("Initializing job : {}".format(job.name))
    seed_profiles = get_user_profiles_single_request(job.seed_list)

    # Attach the classifier outputs to every seed profile.
    for seed_profile in seed_profiles:
        feature_map = {}
        for classifier in job.classifiers:
            feature_name = classifier.__name__
            feature_map[feature_name] = classifier(seed_profile)
        seed_profile["features"] = feature_map

    target_collection = Connection.Instance().jobs_db[job.name]
    target_collection.create_index("id", unique=True)
    target_collection.insert_many(seed_profiles)