Example #1
def log_time_things(request_time_metric_handler, start_request_time, site):
    """
        Log the time taken by a site request, both in the local metric
        handler and in InfluxDB
    """
    time_difference = time.time() - start_request_time
    request_time_metric_handler.add_to_list("list", time_difference)
    utilities.push_influx_data("crawling_response_times",
                               dict(kind="general_for_now",
                                    site=site,
                                    value=time_difference))

def retrieve_submissions(record,
                         custom,
                         all_sites=None,
                         codechef_retrieval=False):
    """
        Retrieve submissions that are not already in the database
    """

    global INVALID_HANDLES
    global failed_user_retrievals
    global todays_date
    global metric_handlers

    if concurrent_submission_retrieval_handler("GET", record.id,
                                               custom) == "ONGOING":
        print "Already ongoing retrieval for", record.id, custom
        return
    else:
        concurrent_submission_retrieval_handler("SET", record.id, custom)

    stopstalk_retrieval_start_time = time.time()
    sites_retrieval_timings = 0
    list_of_submissions = []
    retrieval_failures = []
    should_clear_cache = False
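    # next_retrieval holds the per-site adaptive delay columns for each user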
    nrtable = db.next_retrieval
    user_column_name = "custom_user_id" if custom else "user_id"
    nrtable_record = db(
        nrtable[user_column_name] == record.id).select().first()
    skipped_retrieval = set()

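    # retrieval_type is a module-level global, not a parameter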
    is_daily_retrieval = (retrieval_type == "daily_retrieve")
    logger = Logger(record.stopstalk_handle, custom)

    if nrtable_record is None:
        print "Record not found", user_column_name, record.id
        nrtable.insert(**{user_column_name: record.id})
        nrtable_record = db(
            nrtable[user_column_name] == record.id).select().first()

    # Drop sites whose website is currently down. Iterate over a copy, since
    # removing items from the list being iterated over skips elements.
    for site in list(all_sites):
        Site = getattr(sites, site.lower())
        if Site.Profile.is_website_down():
            all_sites.remove(site)

    common_influx_params = dict(stopstalk_handle=record.stopstalk_handle,
                                retrieval_type=retrieval_type,
                                value=1)

    for site in all_sites:

        common_influx_params["site"] = site
        lower_site = site.lower()
        site_handle = record[lower_site + "_handle"]
        site_lr = lower_site + "_lr"
        site_delay = lower_site + "_delay"
        last_retrieved = record[site_lr]

        # Daily retrieval: skip this site if its adaptive delay says the
        # next retrieval is not due yet
        if is_daily_retrieval and \
           datetime.timedelta(days=nrtable_record[site_delay] / 3 + 1) + \
           last_retrieved.date() > todays_date:
            utilities.push_influx_data(
                "retrieval_stats", dict(kind="skipped",
                                        **common_influx_params))
            logger.log(site, "skipped")
            metric_handlers[lower_site]["skipped_retrievals"].increment_count(
                "total", 1)
            skipped_retrieval.add(site)
            continue

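        # Convert the stored datetime into a time.struct_time, which the
        # site profiles' get_submissions methods consume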
        last_retrieved = time.strptime(str(last_retrieved), TIME_CONVERSION)

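        # Handles already known to be invalid are skipped without hitting
        # the site again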
        if (site_handle, site) in INVALID_HANDLES:
            logger.log(site, "not found:" + site_handle)
            utilities.push_influx_data(
                "retrieval_stats",
                dict(kind="not_found", **common_influx_params))
            metric_handlers[lower_site]["handle_not_found"].increment_count(
                "total", 1)
            record.update({site_lr: datetime.datetime.now()})
            should_clear_cache = True
            continue

        if site_handle:
            Site = getattr(sites, lower_site)
            P = Site.Profile(site_handle)

            # Retrieve submissions from the profile site
            site_method = P.get_submissions
            start_retrieval_time = time.time()
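            # UVa additionally needs the module-level uva_problem_dict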
            if site == "UVa":
                submissions = site_method(last_retrieved, uva_problem_dict,
                                          is_daily_retrieval)
            else:
                submissions = site_method(last_retrieved, is_daily_retrieval)
            total_retrieval_time = time.time() - start_retrieval_time
            sites_retrieval_timings += total_retrieval_time
            metric_handlers[lower_site]["retrieval_times"].add_to_list(
                "list", total_retrieval_time)
            if submissions in (SERVER_FAILURE, OTHER_FAILURE):
                utilities.push_influx_data(
                    "retrieval_stats",
                    dict(kind=submissions.lower(), **common_influx_params))
                logger.log(site, submissions)

                metric_handlers[lower_site]["retrieval_count"].increment_count(
                    "failure", 1)
                # Remember the failing site so it can be inserted into
                # failed_retrieval later
                retrieval_failures.append(site)
                should_clear_cache = True
                current.REDIS_CLIENT.sadd("website_down_" + site.lower(),
                                          record.stopstalk_handle)
            elif submissions == NOT_FOUND:
                utilities.push_influx_data(
                    "retrieval_stats",
                    dict(kind="new_invalid_handle", **common_influx_params))
                logger.log(site, "new invalid handle:" + site_handle)
                new_handle_not_found(site, site_handle)
                # Update the last retrieved of an invalid handle as we don't
                # want new_user script to pick this user again and again
                record.update({site_lr: datetime.datetime.now()})
                should_clear_cache = True
            else:
                utilities.push_influx_data(
                    "retrieval_stats",
                    dict(kind="success", **common_influx_params))
                submission_len = len(submissions)
                metric_handlers[lower_site]["retrieval_count"].increment_count(
                    "success", 1)
                metric_handlers[lower_site][
                    "submission_count"].increment_count(
                        "total", submission_len)

                logger.log(site, submission_len)
                list_of_submissions.append((site, submissions))
                # Immediately update the last_retrieved of the record
                # Note: Only the record object is updated & not reflected in DB
                record.update({site_lr: datetime.datetime.now()})
                should_clear_cache = True
        else:
            # Update this time so that this user is not picked
            # up again and again by new_user cron
            record.update({site_lr: datetime.datetime.now()})
            should_clear_cache = True
            if retrieval_type == "daily_retrieve":
                nrtable_record.update({site_delay: 100000})

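    # Persist the retrieved submissions and adapt each site's delay:
    # consecutive empty retrievals increase the delay, while any activity
    # or a retrieval failure resets it to zero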
    total_submissions_retrieved = 0
    for site, site_submissions in list_of_submissions:
        lower_site = site.lower()
        site_delay = lower_site + "_delay"
        submissions_count = get_submissions(record.id,
                                            record[lower_site + "_handle"],
                                            record.stopstalk_handle,
                                            site_submissions, site, custom)
        total_submissions_retrieved += submissions_count
        if retrieval_type == "daily_retrieve" and \
           site not in skipped_retrieval and \
           site not in retrieval_failures:
            if submissions_count == 0:
                nrtable_record.update(
                    {site_delay: nrtable_record[site_delay] + 1})
            else:
                nrtable_record.update({site_delay: 0})
        elif retrieval_type == "daily_retrieve" and site in retrieval_failures:
            # If retrieval failed for the user, then reset the delay so that
            # the details can be retrieved the next day
            nrtable_record.update({site_delay: 0})

    # Clear the profile page cache if at least one site's last-retrieved
    # time was updated above
    if should_clear_cache:
        utilities.clear_profile_page_cache(record.stopstalk_handle)

    # To reflect all the updates to record into DB
    record.update_record()
    if retrieval_type == "daily_retrieve":
        nrtable_record.update_record()

    if retrieval_type == "refreshed_users" and len(retrieval_failures):
        current.REDIS_CLIENT.rpush(
            "next_retrieve_custom_user" if custom else "next_retrieve_user",
            record.id)
    else:
        # @ToDo: Too much main memory usage as strings are stored in a list
        #        Aim to store only the ints and let typecasting and
        #        "NULL" insertions happen just when required
        for site in retrieval_failures:
            if custom:
                failed_user_retrievals.append("(%s,%s,'%s')" %
                                              ("NULL", str(record.id), site))
            else:
                failed_user_retrievals.append("(%s,%s,'%s')" %
                                              (str(record.id), "NULL", site))

    # Keep committing the updates to the db to avoid lock wait timeouts
    db.commit()
    if total_submissions_retrieved > 0 and not custom:
        log_message = "Rating updated from %f to " % record.stopstalk_rating
        new_rating = update_stopstalk_rating(record.id,
                                             record.stopstalk_handle, custom)
        log_message += str(new_rating)
        logger.generic_log(log_message)

    concurrent_submission_retrieval_handler("DEL", record.id, custom)
    total_retrieval_time = time.time() - stopstalk_retrieval_start_time
    metric_handlers["overall"]["just_stopstalk_code_time"].add_to_list(
        "list", total_retrieval_time - sites_retrieval_timings)