def get_page(page_url, max_retries=10, **query_dict):
    """
    Sends a GET request, logging each attempt, and retries on transient failures.

    :param page_url: the url of the GET request
    :param max_retries: the maximum number of retries
    :param query_dict: the GET query parameters
    :return: the response received, or None if no request could be made
    """
    if len(query_dict) > 0:
        query_string = urllib.parse.urlencode(query_dict)
        page_url += "?" + query_string

    page = None
    for _ in range(max_retries):
        log.debug(f"GET: {page_url}")
        page = requests.get(page_url)
        # 200, 400 and 404 are treated as final answers; anything else is retried.
        if page.status_code in (200, 400, 404):
            break
        log.warning(f"Request failed (status code: {page.status_code}). Sleeping for 2 seconds...")
        time.sleep(2)
        log.info("Retrying...")

    if page is None:
        log.error("Request failed. No response received.")
        return None
    if page.status_code != 200:
        log.error(f"Request failed. Status code: {page.status_code}")
    return page
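# Usage sketch (not part of the original module): a hypothetical call to get_page with
# extra query parameters. The URL is the infoarena monitor used below, and 'user' is the
# monitor parameter mentioned in scrape_submissions; the handle value is a placeholder.
def _get_page_example():
    page = get_page("https://www.infoarena.ro/monitor", max_retries=3, user="example_user")
    if page is not None and page.status_code == 200:
        print(len(page.text))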
def write_submissions(submissions):
    submissions = list(submissions)

    # Get all handles.
    handles = {
        (handle.judge.judge_id, handle.handle.lower()): handle
        for handle in UserHandle.objects.annotate(handle_lower=Lower('handle')).filter(
            handle_lower__in={sub['author_id'].lower() for sub in submissions}
        ).select_related('judge')
    }
    # Get all required tasks.
    tasks = {
        (task.judge.judge_id, task.task_id): task
        for task in Task.objects.filter(
            task_id__in={sub['task_id'] for sub in submissions}
        ).select_related('judge')
    }

    log.info(f"Writing {len(submissions)} submissions to database...")
    log.debug(f"TASKS: {tasks}")
    log.debug(f"HANDLES: {handles}")

    submission_models = []
    for sub in submissions:
        author = handles.get((sub['judge_id'], sub['author_id']))
        task = tasks.get((sub['judge_id'], sub['task_id']))
        if not author or not task:
            continue

        fields = dict(
            submission_id=sub['submission_id'],
            author=author,
            submitted_on=timezone.make_aware(sub['submitted_on']),
            task=task,
            verdict=sub['verdict'],
            language=sub.get('language'),
            source_size=sub.get('source_size'),
            score=sub.get('score'),
            exec_time=sub.get('exec_time'),
            memory_used=sub.get('memory_used'),
        )
        if fields['score'] and math.isnan(fields['score']):
            fields['score'] = None
        fields = {k: v for k, v in fields.items() if v is not None}
        submission_models.append(Submission(**fields))

    if submission_models:
        result = Submission.objects.bulk_create(submission_models, ignore_conflicts=True)
        to_update = [x for x in result if x.pk is None]
        log.warning("TODO: Implement update!")
        log.success(
            f"Successfully upserted {len(submission_models)} submissions! "
            f"({len(result) - len(to_update)} created, 0 updated)")
    else:
        log.info("No submissions to upsert.")
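# Illustrative only: a dict in the shape write_submissions expects, mirroring the keys read
# above. All values are made up; 'submitted_on' must be a naive datetime (timezone.make_aware
# is applied), and 'author_id'/'task_id' must match existing UserHandle/Task rows (lowercased).
def _write_submissions_example():
    import datetime
    example = dict(
        judge_id='infoarena',          # placeholder judge id
        submission_id='123456',
        author_id='example_user',      # already lowercased by the scrapers
        task_id='some-task-id',        # placeholder task id
        submitted_on=datetime.datetime(2020, 1, 1, 12, 0),
        verdict='AC',                  # placeholder verdict value
        language='C++',
        score=100,
    )
    # write_submissions([example])  # requires a configured Django database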
def scrape_submissions(from_page=1, to_page=SCRAPER_LIMIT, results_per_page=200, **query_dict):
    """
    Scrapes all submissions from the eval monitor.

    :param from_page: first page of the pagination
    :param to_page: last page of the pagination
    :param results_per_page: number of results to get for each request
    :param query_dict: optional GET query params to give to the monitor (e.g. user='******')
    """
    page_url = "https://www.infoarena.ro/monitor"
    rows = __scrape_paginated_table_rows(page_url, from_page, to_page, results_per_page,
                                         table_css_selector="#monitor-table", **query_dict)

    for row in rows:
        if len(row) != 7:
            raise Exception("Unexpected number of columns.")

        # Parse required information.
        submission_id = None
        try:
            verdict_text = row[6].find("span").text
            submission_id = row[0].find("a", href=True)['href'].split('/')[-1]

            if not row[4].find("a"):
                log.debug(f"Skipped submission #{submission_id}: private.")
                continue
            if verdict_text.startswith("Eroare"):
                log.debug(f"Skipped submission #{submission_id}: system error.")
                continue

            submission = dict(
                judge_id=INFOARENA_JUDGE_ID,
                submission_id=submission_id,
                author_id=row[1].find("a", href=True)['href'].split('/')[-1].lower(),
                task_id=row[2].find("a", href=True)['href'].split('/')[-1].lower(),
                source_size=parsers.parse_source_size(row[4].find("a").text),
                submitted_on=parsers.parse_date(row[5].text),
                verdict=parsers.parse_verdict(verdict_text),
                score=parsers.parse_score(verdict_text),
            )
            yield submission
        except (TypeError, AttributeError) as e:
            # Probably the task name was hidden.
            log.warning(f"Error scraping submission #{submission_id}: {e}")
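# Usage sketch (illustrative only): consuming the generator for a single user. 'user' is
# the monitor query parameter mentioned in the docstring; the handle is a placeholder.
def _scrape_submissions_example():
    for submission in scrape_submissions(from_page=1, to_page=2, user="example_user"):
        print(submission['submission_id'], submission['verdict'])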
def scrape_task_info(task):
    log.info(f"Scraping task info for task '{task}'...")
    judge_id, task_id = task.split('/', 1)
    task_ids = __expand_task(judge_id, task_id)
    scraper = scrapers.create_scraper(judge_id)

    task_infos = []
    log.info(f"Task ids: {task_ids}")
    for task_id in task_ids:
        try:
            task_info = scraper.scrape_task_info(task_id)
            if task_info is None:
                log.warning(f"Did not find task info for '{task_id}'. Skipping...")
                continue

            log.debug(task_info)
            log.info(f"Successfully scraped '{task_id}' [{task_info['title']}]...")

            try:
                statement_info = scraper.scrape_task_statement(task_id)
                task_info.update(statement_info)
            except NotImplementedError:
                log.warning(f"Could not get statement of task {task_id}: not implemented.")
            except Exception as ex:
                log.warning(f"Could not get statement of task {task_id}: {ex}")

            task_infos.append(task_info)
        except NotImplementedError:
            log.warning(f"Scraping tasks not implemented for {scraper.__class__.__name__}.")
            return
        except Exception as ex:
            log.exception(ex)

    queries.write_tasks(task_infos)
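# Usage sketch (illustrative only): the 'task' argument is a "judge_id/task_id" string,
# split once on '/'. Both parts below are placeholders for real ids.
def _scrape_task_info_example():
    scrape_task_info("infoarena/some-task-id")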
def parse_submission(submission_data):
    try:
        submission_id = submission_data['id']
        task_id = '/'.join([
            str(submission_data['problem']['contestId']),
            submission_data['problem']['index']
        ])

        # Check that a verdict exists before reading it, to avoid a KeyError.
        if 'verdict' not in submission_data:
            log.warning(f'Skipped submission {submission_id}: no verdict.')
            return
        if submission_data['verdict'] == 'TESTING':
            log.debug(f'Skipped submission {submission_id}: still testing.')
            return

        for author in submission_data['author']['members']:
            author_id = author['handle']
            submission = dict(
                judge_id=CODEFORCES_JUDGE_ID,
                submission_id=str(submission_id),
                task_id=task_id.lower(),
                submitted_on=datetime.datetime.utcfromtimestamp(
                    submission_data['creationTimeSeconds']),
                language=submission_data['programmingLanguage'],
                verdict=parse_verdict(submission_data['verdict']),
                author_id=author_id.lower(),
                # Renamed from 'time_exec' so the key matches what write_submissions reads.
                exec_time=submission_data['timeConsumedMillis'],
                memory_used=round(submission_data['memoryConsumedBytes'] / 1024),
            )
            yield submission
    except Exception as ex:
        log.error(
            f"Failed to parse submission.\nSubmission data: {submission_data}\nError: {ex}")
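# Illustrative only: a trimmed-down dict in the shape of a Codeforces API submission object,
# with made-up values, fed through the parser above.
def _parse_submission_example():
    data = {
        'id': 42,
        'problem': {'contestId': 1000, 'index': 'A'},
        'author': {'members': [{'handle': 'ExampleUser'}]},
        'creationTimeSeconds': 1577880000,
        'programmingLanguage': 'GNU C++17',
        'verdict': 'OK',
        'timeConsumedMillis': 31,
        'memoryConsumedBytes': 102400,
    }
    for submission in parse_submission(data):
        print(submission['task_id'], submission['verdict'])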
def scrape_handle_info(handle):
    log.info(f"Scraping info for handle '{handle}'...")
    judge_id, handle_id = handle.split('/', 1)
    handles = __expand_handle(judge_id, handle_id)
    log.info(f"Handles: {handles}")
    scraper = scrapers.create_scraper(judge_id)

    user_infos = []
    for handle in handles:
        try:
            user_info = scraper.scrape_user_info(handle)
            log.info(f"Successfully scraped user info for '{handle}'")
            log.debug(user_info)
            if user_info:
                user_infos.append(user_info)
        except NotImplementedError:
            log.warning(f"Scraping handles not implemented for {scraper.__class__.__name__}.")
            return
        except Exception as ex:
            log.exception(ex)

    queries.write_handles(user_infos)
def get_csrf_token():
    response = requests.get('https://csacademy.com/')
    csrf_token = response.cookies['csrftoken']
    log.debug(f'Got csrf token: {csrf_token}')
    return csrf_token
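# Usage sketch (illustrative only): attaching the token to a follow-up request. Sending it
# as an 'X-CSRFToken' header together with a Referer is an assumption about what
# csacademy.com expects, not something confirmed by this code.
def _csrf_example():
    token = get_csrf_token()
    headers = {'X-CSRFToken': token, 'Referer': 'https://csacademy.com/'}
    return headers  # pass these to a subsequent requests.post(...) call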
def scrape_task_statement(self, task_id):
    statement = utils.scrape_task_statement(task_id)
    statement['statement'] = translators.translate_ro_en(statement['statement'])
    log.debug(statement['statement'])
    return statement
def create_scraper(judge_id: str) -> Scraper:
    for scraper_kls in __SUBMISSION_SCRAPERS__:
        if scraper_kls.JUDGE_ID == judge_id:
            log.debug(f"Found scraper for judge '{judge_id}': {scraper_kls.__name__}")
            return scraper_kls()
    raise Exception(f"No scraper configured for {judge_id}.")
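# Usage sketch (illustrative only): looking up a scraper by judge id. The judge id and the
# task id below are placeholders; the judge id must equal the JUDGE_ID of a registered
# scraper class, otherwise create_scraper raises.
def _create_scraper_example():
    scraper = create_scraper('infoarena')            # placeholder judge id
    info = scraper.scrape_task_info('some-task-id')  # method used by scrape_task_info above
    return info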