def log_time_things(request_time_metric_handler, start_request_time, site):
    """
        Record how long a site request took, both in the metric handler
        and in InfluxDB
    """
    time_difference = time.time() - start_request_time
    request_time_metric_handler.add_to_list("list", time_difference)
    utilities.push_influx_data("crawling_response_times",
                               dict(kind="general_for_now",
                                    site=site,
                                    value=time_difference))
def retrieve_submissions(record, custom, all_sites=current.SITES.keys(), codechef_retrieval=False):
    """
        Retrieve submissions that are not already in the database
    """
    global INVALID_HANDLES
    global failed_user_retrievals
    global todays_date
    global metric_handlers

    if concurrent_submission_retrieval_handler("GET", record.id, custom) == "ONGOING":
        print "Already ongoing retrieval for", record.id, custom
        return
    else:
        concurrent_submission_retrieval_handler("SET", record.id, custom)

    stopstalk_retrieval_start_time = time.time()
    sites_retrieval_timings = 0
    list_of_submissions = []
    retrieval_failures = []
    should_clear_cache = False
    nrtable = db.next_retrieval
    user_column_name = "custom_user_id" if custom else "user_id"
    nrtable_record = db(nrtable[user_column_name] == record.id).select().first()
    skipped_retrieval = set([])
    is_daily_retrieval = (retrieval_type == "daily_retrieve")
    logger = Logger(record.stopstalk_handle, custom)

    if nrtable_record is None:
        print "Record not found", user_column_name, record.id
        nrtable.insert(**{user_column_name: record.id})
        nrtable_record = db(nrtable[user_column_name] == record.id).select().first()

    # Iterate over a copy so that removing a down site does not skip the next one
    for site in list(all_sites):
        Site = getattr(sites, site.lower())
        if Site.Profile.is_website_down():
            all_sites.remove(site)

    common_influx_params = dict(stopstalk_handle=record.stopstalk_handle,
                                retrieval_type=retrieval_type,
                                value=1)

    for site in all_sites:
        common_influx_params["site"] = site
        lower_site = site.lower()
        site_handle = record[lower_site + "_handle"]
        site_lr = lower_site + "_lr"
        site_delay = lower_site + "_delay"
        last_retrieved = record[site_lr]

        # For daily retrieval, skip this site if the adaptive delay
        # (site_delay / 3 + 1 days) since the last retrieval has not elapsed yet
        if is_daily_retrieval and \
           datetime.timedelta(days=nrtable_record[site_delay] / 3 + 1) + \
           last_retrieved.date() > todays_date:
            utilities.push_influx_data("retrieval_stats",
                                       dict(kind="skipped",
                                            **common_influx_params))
            logger.log(site, "skipped")
            metric_handlers[lower_site]["skipped_retrievals"].increment_count("total", 1)
            skipped_retrieval.add(site)
            continue

        last_retrieved = time.strptime(str(last_retrieved), TIME_CONVERSION)

        if (site_handle, site) in INVALID_HANDLES:
            logger.log(site, "not found:" + site_handle)
            utilities.push_influx_data("retrieval_stats",
                                       dict(kind="not_found",
                                            **common_influx_params))
            metric_handlers[lower_site]["handle_not_found"].increment_count("total", 1)
            record.update({site_lr: datetime.datetime.now()})
            should_clear_cache = True
            continue

        if site_handle:
            Site = getattr(sites, site.lower())
            P = Site.Profile(site_handle)

            # Retrieve submissions from the profile site
            site_method = P.get_submissions
            start_retrieval_time = time.time()
            if site == "UVa":
                submissions = site_method(last_retrieved, uva_problem_dict, is_daily_retrieval)
            else:
                submissions = site_method(last_retrieved, is_daily_retrieval)
            total_retrieval_time = time.time() - start_retrieval_time
            sites_retrieval_timings += total_retrieval_time
            metric_handlers[lower_site]["retrieval_times"].add_to_list("list", total_retrieval_time)

            if submissions in (SERVER_FAILURE, OTHER_FAILURE):
                utilities.push_influx_data("retrieval_stats",
                                           dict(kind=submissions.lower(),
                                                **common_influx_params))
                logger.log(site, submissions)
                metric_handlers[lower_site]["retrieval_count"].increment_count("failure", 1)
                # Add the failure sites for inserting data into failed_retrieval
                retrieval_failures.append(site)
                should_clear_cache = True
                current.REDIS_CLIENT.sadd("website_down_" + site.lower(),
                                          record.stopstalk_handle)
            elif submissions == NOT_FOUND:
                utilities.push_influx_data("retrieval_stats",
                                           dict(kind="new_invalid_handle",
                                                **common_influx_params))
                logger.log(site, "new invalid handle:" + site_handle)
                new_handle_not_found(site, site_handle)
                # Update the last retrieved of an invalid handle as we don't
                # want the new_user script to pick this user again and again
                record.update({site_lr: datetime.datetime.now()})
                should_clear_cache = True
            else:
                utilities.push_influx_data("retrieval_stats",
                                           dict(kind="success",
                                                **common_influx_params))
                submission_len = len(submissions)
                metric_handlers[lower_site]["retrieval_count"].increment_count("success", 1)
                metric_handlers[lower_site]["submission_count"].increment_count("total", submission_len)
                logger.log(site, submission_len)
                list_of_submissions.append((site, submissions))
                # Immediately update the last_retrieved of the record
                # Note: Only the record object is updated & not reflected in DB
                record.update({site_lr: datetime.datetime.now()})
                should_clear_cache = True
        else:
            # Update this time so that this user is not picked
            # up again and again by the new_user cron
            record.update({site_lr: datetime.datetime.now()})
            should_clear_cache = True
            if retrieval_type == "daily_retrieve":
                nrtable_record.update({site_delay: 100000})

    total_submissions_retrieved = 0
    for submissions in list_of_submissions:
        site = submissions[0]
        lower_site = site.lower()
        site_delay = lower_site + "_delay"
        submissions_count = get_submissions(record.id,
                                            record[lower_site + "_handle"],
                                            record.stopstalk_handle,
                                            submissions[1],
                                            site,
                                            custom)
        total_submissions_retrieved += submissions_count
        if retrieval_type == "daily_retrieve" and \
           site not in skipped_retrieval and \
           site not in retrieval_failures:
            if submissions_count == 0:
                nrtable_record.update({site_delay: nrtable_record[site_delay] + 1})
            else:
                nrtable_record.update({site_delay: 0})
        elif retrieval_type == "daily_retrieve" and site in retrieval_failures:
            # If retrieval failed for the user, then reset the delay so that
            # the details can be retrieved the next day
            nrtable_record.update({site_delay: 0})

    # Clear the profile page cache in case there is at least one submission retrieved
    if should_clear_cache:
        utilities.clear_profile_page_cache(record.stopstalk_handle)

    # To reflect all the updates to record into the DB
    record.update_record()

    if retrieval_type == "daily_retrieve":
        nrtable_record.update_record()

    if retrieval_type == "refreshed_users" and len(retrieval_failures):
        current.REDIS_CLIENT.rpush(
            "next_retrieve_custom_user" if custom else "next_retrieve_user",
            record.id)
    else:
        # @ToDo: Too much main memory usage as strings are stored in a list
        #        Aim to store only the ints and let typecasting and
        #        "NULL" insertions happen just when required
        for site in retrieval_failures:
            if custom:
                failed_user_retrievals.append("(%s,%s,'%s')" % ("NULL", str(record.id), site))
            else:
                failed_user_retrievals.append("(%s,%s,'%s')" % (str(record.id), "NULL", site))

    # Keep committing the updates to the db to avoid lock wait timeouts
    db.commit()

    if total_submissions_retrieved > 0 and not custom:
        log_message = "Rating updated from %f to " % record.stopstalk_rating
        new_rating = update_stopstalk_rating(record.id,
                                             record.stopstalk_handle,
                                             custom)
        log_message += str(new_rating)
        logger.generic_log(log_message)

    concurrent_submission_retrieval_handler("DEL", record.id, custom)
    total_retrieval_time = time.time() - stopstalk_retrieval_start_time
    metric_handlers["overall"]["just_stopstalk_code_time"].add_to_list(
        "list",
        total_retrieval_time - sites_retrieval_timings)