Example #1
def get_page(page_url, max_retries=10, **query_dict):
    """
    Sends a GET request, while also printing the page to console.
    :param max_retries: the maximum number of retries
    :param page_url: the url of the GET request
    :param query_dict: the GET query parameters
    :return: the page received
    """
    if len(query_dict) > 0:
        query_string = urllib.parse.urlencode(query_dict)
        page_url += "?" + query_string

    page = None
    for tries in range(max_retries):
        log.debug(f"GET: {page_url}")
        page = requests.get(page_url)
        if page.status_code in (200, 400, 404):
            break
        else:
            log.warning(
                f'Request failed (status code: {page.status_code}). Sleeping for 2 seconds...'
            )
            time.sleep(2)
            log.info('Retrying...')

    if page is None:
        log.error("Request failed. Page not found.")
    elif page.status_code != 200:
        log.error(f"Request failed. Status code: {page.status_code}")

    return page
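
A minimal usage sketch for get_page, assuming the module's imports (requests, time, urllib.parse and a loguru-style log object) are in scope; the URL and the user parameter are only illustrative:

# Hypothetical call: fetch a monitor page filtered by user, with fewer retries.
page = get_page("https://www.infoarena.ro/monitor", max_retries=3, user="example_user")
if page is not None and page.status_code == 200:
    print(page.text[:200])  # first 200 characters of the fetched page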
Example #2
def write_submissions(submissions):
    submissions = list(submissions)
    # Get all handles, keyed by (judge_id, lowercased handle).
    author_ids = {sub['author_id'].lower() for sub in submissions}
    handles = {
        (handle.judge.judge_id, handle.handle.lower()): handle
        for handle in UserHandle.objects.annotate(
            handle_lower=Lower('handle')).filter(
            handle_lower__in=author_ids).select_related('judge')
    }
    # Get all required tasks, keyed by (judge_id, task_id).
    task_ids = {sub['task_id'] for sub in submissions}
    tasks = {
        (task.judge.judge_id, task.task_id): task
        for task in Task.objects.filter(
            task_id__in=task_ids).select_related('judge')
    }

    log.info(f"Writing {len(submissions)} submissions to database...")
    log.debug(f"TASKS: {tasks}")
    log.debug(f"HANDLES: {handles}")

    submission_models = []
    for sub in submissions:
        author = handles.get((sub['judge_id'], sub['author_id']))
        task = tasks.get((sub['judge_id'], sub['task_id']))
        if not author or not task:
            continue

        fields = dict(
            submission_id=sub['submission_id'],
            author=author,
            submitted_on=timezone.make_aware(sub['submitted_on']),
            task=task,
            verdict=sub['verdict'],
            language=sub.get('language'),
            source_size=sub.get('source_size'),
            score=sub.get('score'),
            exec_time=sub.get('exec_time'),
            memory_used=sub.get('memory_used'),
        )
        if fields['score'] and math.isnan(fields['score']):
            fields['score'] = None
        fields = {k: v for k, v in fields.items() if v is not None}
        submission_models.append(Submission(**fields))

    if submission_models:
        result = Submission.objects.bulk_create(submission_models,
                                                ignore_conflicts=True)
        to_update = [x for x in result if x.pk is None]
        log.warning("TODO: Implement update!")
        log.success(
            f"Successfully upserted {len(submission_models)} submissions! "
            f"({len(result) - len(to_update)} created, 0 updated)")
    else:
        log.info("No submissions to upsert.")
Example #3
def scrape_submissions(from_page=1,
                       to_page=SCRAPER_LIMIT,
                       results_per_page=200,
                       **query_dict):
    """
    Scrapes all submissions from the eval monitor.
    :param from_page: first page of the pagination
    :param to_page: last page of the pagination
    :param results_per_page: number of results to get for each request
    :param query_dict: optional GET query params to give to the monitor (e.g. user='******')
    """
    page_url = "https://www.infoarena.ro/monitor"
    rows = __scrape_paginated_table_rows(page_url,
                                         from_page,
                                         to_page,
                                         results_per_page,
                                         table_css_selector="#monitor-table",
                                         **query_dict)
    for row in rows:
        if len(row) != 7:
            raise Exception("Unexpected number of columns.")
        # Parse required information.
        submission_id = None
        try:
            verdict_text = row[6].find("span").text
            submission_id = row[0].find("a", href=True)['href'].split('/')[-1]
            if not row[4].find("a"):
                log.debug(f"Skipped submission #{submission_id}: private.")
                continue
            if verdict_text.startswith("Eroare"):
                log.debug(
                    f"Skipped submission #{submission_id}: system error.")
                continue
            submission = dict(
                judge_id=INFOARENA_JUDGE_ID,
                submission_id=submission_id,
                author_id=row[1].find(
                    "a", href=True)['href'].split('/')[-1].lower(),
                task_id=row[2].find("a",
                                    href=True)['href'].split('/')[-1].lower(),
                source_size=parsers.parse_source_size(row[4].find("a").text),
                submitted_on=parsers.parse_date(row[5].text),
                verdict=parsers.parse_verdict(verdict_text),
                score=parsers.parse_score(verdict_text),
            )
            yield submission
        except (TypeError, AttributeError) as e:
            # Probably task name was hidden.
            log.warning(f"Error scraping submission #{submission_id}: {e}")
Example #4
def scrape_task_info(task):
    log.info(f"Scraping task info for task '{task}'...")
    judge_id, task_id = task.split('/', 1)
    task_ids = __expand_task(judge_id, task_id)

    scraper = scrapers.create_scraper(judge_id)
    task_infos = []
    log.info(f"Task ids: {task_ids}")

    for task_id in task_ids:
        try:
            task_info = scraper.scrape_task_info(task_id)
            if task_info is None:
                log.warning(
                    f"Did not find task info for '{task_id}'. Skipping...")
                continue

            log.debug(task_info)
            log.info(
                f"Successfully scraped '{task_id}' [{task_info['title']}]...")

            try:
                statement_info = scraper.scrape_task_statement(task_id)
                task_info.update(statement_info)
            except NotImplementedError:
                log.warning(
                    f"Could not get statement of task {task_id}: not implemented."
                )
            except Exception as ex:
                log.warning(f"Could not get statement of task {task_id}: {ex}")

            task_infos.append(task_info)
        except NotImplementedError:
            log.warning(
                f'Scraping tasks not implemented for {scraper.__class__.__name__}.'
            )
            return
        except Exception as ex:
            log.exception(ex)

    queries.write_tasks(task_infos)
Example #5
def parse_submission(submission_data):
    try:
        submission_id = submission_data['id']
        task_id = '/'.join([
            str(submission_data['problem']['contestId']),
            submission_data['problem']['index']
        ])

        if 'verdict' not in submission_data:
            log.warning(f'Skipped submission {submission_id}: no verdict.')
            return

        if submission_data['verdict'] == 'TESTING':
            log.debug(f'Skipped submission {submission_id}: still testing.')
            return

        for author in submission_data['author']['members']:
            author_id = author['handle']
            submission = dict(
                judge_id=CODEFORCES_JUDGE_ID,
                submission_id=str(submission_id),
                task_id=task_id.lower(),
                submitted_on=datetime.datetime.utcfromtimestamp(
                    submission_data['creationTimeSeconds']),
                language=submission_data['programmingLanguage'],
                verdict=parse_verdict(submission_data['verdict']),
                author_id=author_id.lower(),
                exec_time=submission_data['timeConsumedMillis'],
                memory_used=round(submission_data['memoryConsumedBytes'] /
                                  1024),
            )
            yield submission
    except Exception as ex:
        log.error(
            f"Failed to parse submission.\nSubmission data:{submission_data}\nError: {ex}"
        )
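
An illustrative Codeforces-style payload containing just the keys parse_submission reads; all values are made up, and the module-level parse_verdict and CODEFORCES_JUDGE_ID are assumed to be available:

sample = {
    'id': 98765432,
    'problem': {'contestId': 1400, 'index': 'A'},
    'verdict': 'OK',
    'author': {'members': [{'handle': 'ExampleUser'}]},
    'creationTimeSeconds': 1600000000,
    'programmingLanguage': 'GNU C++17',
    'timeConsumedMillis': 46,
    'memoryConsumedBytes': 2048000,
}
for parsed in parse_submission(sample):
    print(parsed['task_id'], parsed['verdict'])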
Example #6
def scrape_handle_info(handle):
    log.info(f"Scraping info for handle '{handle}'...")
    judge_id, handle_id = handle.split('/', 1)
    handles = __expand_handle(judge_id, handle_id)
    log.info(f"Handles: {handles}")

    scraper = scrapers.create_scraper(judge_id)

    user_infos = []
    for handle in handles:
        try:
            user_info = scraper.scrape_user_info(handle)
            log.info(f"Successfully scraped user info for '{handle}'")
            log.debug(user_info)
            if user_info:
                user_infos.append(user_info)
        except NotImplementedError:
            log.warning(
                f'Scraping handles not implemented for {scraper.__class__.__name__}.')
            return
        except Exception as ex:
            log.exception(ex)

    queries.write_handles(user_infos)
Example #7
def get_csrf_token():
    response = requests.get('https://csacademy.com/')
    csrf_token = response.cookies['csrftoken']
    log.debug(f'Got csrf token: {csrf_token}')
    return csrf_token
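
The token is typically echoed back on state-changing requests; a hedged sketch (the endpoint is hypothetical, and the X-CSRFToken header name is only the Django default, not something taken from the snippet above):

csrf_token = get_csrf_token()
response = requests.post(
    'https://csacademy.com/some-endpoint/',  # hypothetical endpoint
    headers={'X-CSRFToken': csrf_token},
    cookies={'csrftoken': csrf_token},
)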
Example #8
def scrape_task_statement(self, task_id):
    statement = utils.scrape_task_statement(task_id)
    statement['statement'] = translators.translate_ro_en(
        statement['statement'])
    log.debug(statement['statement'])
    return statement
Example #9
def create_scraper(judge_id: str) -> Scraper:
    for scraper_kls in __SUBMISSION_SCRAPERS__:
        if scraper_kls.JUDGE_ID == judge_id:
            log.debug(f"Found scraper for judge '{judge_id}': {scraper_kls.__name__}")
            return scraper_kls()
    raise Exception(f"No scraper configured for {judge_id}.")
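
Illustrative usage, assuming 'infoarena' is one of the configured JUDGE_ID values and that the task id is hypothetical:

scraper = create_scraper('infoarena')             # assumed judge id
info = scraper.scrape_task_info('example-task')   # hypothetical task id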