def get_standings(self, users=None, statistics=None):
    """Build the standings table for a CodeChef contest.

    Fetches the contest metadata from the JSON API, then pages through the
    ranking API (optionally once per searched user), accumulating one row
    per handle.  Contests with child contests (divisions) are merged into a
    single result keyed by handle, with per-division problem lists.

    :param users: optional list of handles; if given, ranking pages are
        queried with ``&search=<user>`` per user instead of full paging.
    :param statistics: previously stored rows, used to carry over
        ``rating_change`` / ``new_rating``.
    :return: dict with ``result``, ``url``, ``problems``, ``hidden_fields``.
    """
    # Commented-out legacy login flow kept for reference:
    # REQ.get('https://www.codechef.com/')
    # try:
    #     form = REQ.form()
    #     form['post'].update({'name': self._username, 'pass': self._password})
    #     page = REQ.get(form['url'], post=form['post'])
    #     form = REQ.form()
    #     if form['url'] == '/session/limit':
    #         for field in form['unchecked'][:-1]:
    #             form['post'][field['name']] = field['value'].encode('utf8')
    #         page = REQ.get(form['url'], post=form['post'])
    # except Exception:
    #     pass
    url = self.API_CONTEST_URL_FORMAT_.format(**self.__dict__)
    page = REQ.get(url)
    data = json.loads(page)
    if data['status'] != 'success':
        raise ExceptionParseStandings(json.dumps(data))
    if 'child_contests' in data:
        # Divisioned contest: one child contest per division key.
        contest_infos = {
            d['contest_code']: {'division': k}
            for k, d in data['child_contests'].items()
        }
    else:
        contest_infos = {self.key: {}}

    result = {}

    # Problems are grouped by division only when there are several contests.
    problems_info = dict() if len(contest_infos) > 1 else list()
    hidden_fields = set()
    for key, contest_info in contest_infos.items():
        # The standings page embeds a CSRF token required by the ranking API.
        url = self.STANDINGS_URL_FORMAT_.format(key=key)
        page = REQ.get(url)
        match = re.search(
            '<input[^>]*name="csrfToken"[^>]*id="edit-csrfToken"[^>]*value="([^"]*)"',
            page)
        csrf_token = match.group(1)

        n_page = 0
        per_page = 150
        n_total_page = None  # unknown until the first API response
        pbar = None
        contest_type = None
        while n_total_page is None or n_page < n_total_page:
            n_page += 1
            time.sleep(2)  # be polite to the API between pages
            url = self.API_RANKING_URL_FORMAT_.format(key=key, page=n_page, per_page=per_page)
            if users:
                urls = [f'{url}&search={user}' for user in users]
            else:
                urls = [url]
            for url in urls:
                # Retry with exponential backoff (capped at 300s) on any
                # failure, including the API's explicit rate-limit status.
                delay = 5
                for _ in range(10):
                    try:
                        headers = {
                            'x-csrf-token': csrf_token,
                            'x-requested-with': 'XMLHttpRequest',
                        }
                        page = REQ.get(url, headers=headers)
                        data = json.loads(page)
                        assert data.get('status') != 'rate_limit_exceeded'
                        break
                    except Exception:
                        traceback.print_exc()
                        delay = min(300, delay * 2)
                        sys.stdout.write(f'url = {url}\n')
                        sys.stdout.write(f'Sleep {delay}... ')
                        sys.stdout.flush()
                        time.sleep(delay)
                        sys.stdout.write('Done\n')
                else:
                    raise ExceptionParseStandings(
                        f'Failed getting {n_page} by url {url}')
                if 'status' in data and data['status'] != 'success':
                    raise ExceptionParseStandings(json.dumps(data))

                unscored_problems = data['contest_info']['unscored_problems']

                if n_total_page is None:
                    # First response: collect the problem list and paging info.
                    for p in data['problems']:
                        if p['code'] in unscored_problems:
                            continue
                        d = problems_info
                        if 'division' in contest_info:
                            d = d.setdefault('division', OrderedDict())
                            d = d.setdefault(contest_info['division'], [])
                        d.append({
                            'short': p['code'],
                            'name': p['name'],
                            'url': f"https://www.codechef.com/problems/{p['code']}",
                        })
                    n_total_page = data['availablePages']
                    pbar = tqdm.tqdm(total=n_total_page * len(urls))
                    contest_type = data['contest_info'].get('type')

                for d in data['list']:
                    handle = d.pop('user_handle')
                    d.pop('html_handle', None)
                    problems_status = d.pop('problems_status')
                    # Skip rows with no score and no attempts.
                    if d['score'] < 1e-9 and not problems_status:
                        LOG.warning(f'Skip handle = {handle}: {d}')
                        continue
                    row = result.setdefault(handle, OrderedDict())
                    row['member'] = handle
                    row['place'] = d.pop('rank')
                    row['solving'] = d.pop('score')
                    for k in 'time', 'total_time':
                        if k in d:
                            row['time'] = d.pop(k)
                            break

                    problems = row.setdefault('problems', {})
                    solved, upsolved = 0, 0
                    if problems_status:
                        for k, v in problems_status.items():
                            # Unscored problems count as upsolving.
                            t = 'upsolving' if k in unscored_problems else 'result'
                            v[t] = v.pop('score')
                            solved += 1 if v.get('result', 0) > 0 else 0
                            upsolved += 1 if v.get('upsolving', 0) > 0 else 0

                            # Contest type '1' appears to be ICPC-style:
                            # convert score+penalty to '+k'/'-k' notation.
                            if contest_type == '1' and 'penalty' in v:
                                penalty = v.pop('penalty')
                                if v[t] > 0:
                                    v[t] = f'+{"" if penalty == 0 else penalty}'
                                else:
                                    v[t] = f'-{penalty}'
                            problems[k] = v
                        row['solved'] = {'solving': solved, 'upsolving': upsolved}
                    country = d.pop('country_code')
                    if country:
                        d['country'] = country
                    rating = d.pop('rating', None)
                    if rating and rating != '0':
                        hidden_fields.add('rating')
                        row['rating'] = rating

                    row.update(d)
                    row.update(contest_info)

                    if statistics and handle in statistics:
                        stat = statistics[handle]
                        for k in ('rating_change', 'new_rating'):
                            if k in stat:
                                row[k] = stat[k]

                    # Remaining unknown keys become hidden fields.
                    hidden_fields |= set(list(d.keys()))
                pbar.set_description(f'key={key} url={url}')
                pbar.update()

        # Drop the penalty column entirely if no row has a non-zero penalty.
        has_penalty = False
        for row in result.values():
            p = row.get('penalty')
            has_penalty = has_penalty or p and str(p) != "0"
        if not has_penalty:
            for row in result.values():
                row.pop('penalty', None)

        if pbar is not None:
            pbar.close()

    standings = {
        'result': result,
        'url': self.url,
        'problems': problems_info,
        'hidden_fields': list(hidden_fields),
    }
    return standings
def get_standings(self, users=None, statistics=None):
    """Parse standings for a Russian olympiad site.

    Two page formats are supported: a JavaScript flat array ``M`` embedded
    in the rating page (parsed numerically), or a plain HTML table with
    class ``olimp``.  Members are keyed as ``"<name>, <season>"`` where the
    season is the academic year (starting in September).

    :return: dict with ``result``, ``url``, ``problems``; or
        ``{'action': 'delete'}`` when the olympiad no longer exists.
    """
    # Academic season: September..August belongs to year..year+1.
    year = self.start_time.year
    year = year if self.start_time.month >= 9 else year - 1
    season = '%d-%d' % (year, year + 1)

    page = REQ.get(self.url)
    match = re.search(
        r'''<a[^>]*href=["']?(?P<href>[^"' ]*rating[^"' ]*)["']?[^>]*>\[Рейтинг\]''',
        page)
    if not match and re.search(
            r'''<b>Олимпиада №[0-9]+ не существует!</b>''', page):
        # "Olympiad #N does not exist" -- tell the caller to remove it.
        return {'action': 'delete'}
    page = REQ.get(match.group('href'))
    standings_url = REQ.last_url

    # The rating page may embed data as: var tn=..,nk=..,  M=new Array(...)
    match = re.search(
        r'''var(?P<vars>(?:\s*[a-z]+=[0-9]+,)+)\s*M=(?:new Array)?[\[\(]?(?P<data>.*?)[\]\)]\s*(?:function|var)''',
        page)  # noqa

    result = {}
    problems_info = OrderedDict()

    def canonize_name(name):
        # Normalize whitespace and strip HTML, turning <br> into commas.
        name = name.replace('\r', ' ')
        name = name.replace('\n', ' ')
        name = re.sub(r'\s+', ' ', name)
        name = re.sub(r'<br/?>', ',', name)
        name = re.sub(r'<[^>]*>', '', name)
        name = re.sub(r'\s*,\s*', ', ', name)
        name = name.strip()
        return name

    if match:
        # --- JavaScript array format ---
        data = match.group('data')
        # Convert the JS literal into valid JSON before loading.
        data = data.replace('\\', '\\\\')
        data = data.replace('"', r'\"')
        data = data.replace("'", '"')
        data = re.sub(r'\s+', ' ', data)
        data = json.loads(f'[{data}]')
        variables = {}
        for var in re.split(r',\s*', match.group('vars').strip()):
            if not var:
                continue
            k, v = var.split('=')
            variables[k] = v
        # The indexing expression M[(<offset>+...)] reveals the per-row
        # offset before the per-problem triples start.
        match = re.search(r'''M\[\((?P<val>[0-9]+)\+''', page)
        offset = int(match.group('val'))
        n_problems = int(variables['tn'])  # 'tn' = number of tasks
        n_teams = int(variables['nk'])     # 'nk' = number of teams
        n_fields = offset + 3 * n_problems
        place = 0
        last = None
        for rank, st in enumerate(range(0, n_teams * n_fields, n_fields), start=1):
            row = data[st:st + n_fields]
            name = canonize_name(row[0])
            member = name + ', ' + season
            r = result.setdefault(member, {})
            r['name'] = name
            r['member'] = member
            r['solving'] = int(row[1])
            r['penalty'] = int(row[2])
            # Ties (same solving & penalty) share a place.
            score = r['solving'], r['penalty']
            if score != last:
                place = rank
                last = score
            r['place'] = place

            n_problems_fields = 3
            problems = r.setdefault('problems', {})
            for idx in range(0, n_problems):
                p_info = row[offset + idx * n_problems_fields:offset + (idx + 1) * n_problems_fields]
                stat, errors, seconds = map(int, p_info)
                # Single letters up to 26 problems, else zero-padded numbers.
                key = chr(ord('A') + idx) if n_problems < 27 else f'{idx + 1:02d}'
                if key not in problems_info:
                    info = {'short': key}
                    # |errors| >= 1000 encodes a partial score (score+1000),
                    # which implies a 100-point full score.
                    if abs(errors) >= 1000:
                        info['full_score'] = 100
                    problems_info[key] = info
                if not stat:
                    continue
                p = problems.setdefault(key, {})
                p['time'] = self.to_time(seconds, num=2)
                if abs(errors) < 1000:
                    # ICPC notation: '+'/-k attempts.
                    p['result'] = f'+{errors if errors else ""}' if stat == 1 else f'-{errors}'
                else:
                    solved = r.setdefault('solved', {'solving': 0})
                    score = errors - 1000
                    p['result'] = score
                    if score > 0:
                        p['partial'] = score < problems_info[key]['full_score']
                        if not p['partial']:
                            solved['solving'] += 1
            if not problems:
                result.pop(member)
    else:
        # --- HTML table format ---
        regex = '''<table[^>]*class=["']?olimp["']?[^>]*>.*?</table>'''
        match = re.search(regex, page, re.DOTALL)
        if not match and 'Рейтинг олимпиады' not in page:
            return {'action': 'delete'}
        table = parsed_table.ParsedTable(match.group(0))
        for row in table:
            r = OrderedDict()
            problems = r.setdefault('problems', {})
            for k, v in list(row.items()):
                if k == '=':
                    r['solving'] = int(v.value)
                elif k == 'Место':  # "Place"
                    r['place'] = int(v.value)
                elif k == 'Время':  # "Time"
                    r['penalty'] = int(v.value)
                elif k == 'Участник':  # "Participant"
                    name = canonize_name(v.value)
                    r['name'] = name
                    r['member'] = name + ', ' + season
                elif len(k) == 1 and k not in ['№']:
                    # Single-letter columns are problems.
                    if k not in problems_info:
                        info = {'short': k}
                        problems_info[k] = info
                    if v.value != DOT:
                        p = problems.setdefault(k, {})
                        p['result'], *values = v.value.split()
                        if values:
                            p['time'] = values[0]
            if not problems:
                continue
            result[r['member']] = r

    standings = {
        'result': result,
        'url': standings_url,
        'problems': list(problems_info.values()),
    }
    return standings
def get_standings(self, users=None, statistics=None):
    """Build standings from the LeetCode contest ranking API.

    The first API page gives the total user count and the problem list;
    remaining pages are fetched concurrently.  Submission status code 10
    is treated as accepted.

    :param users: optional collection of handles to keep; others skipped.
    :param statistics: unused here (kept for the common interface).
    :return: dict with ``result``, ``url``, ``problems``.
    """
    standings_url = self.standings_url or self.RANKING_URL_FORMAT_.format(**self.__dict__)

    api_ranking_url_format = self.API_RANKING_URL_FORMAT_.format(**self.__dict__)
    url = api_ranking_url_format.format(1)
    content = REQ.get(url)
    data = json.loads(content)
    if not data:
        return {'result': {}, 'url': standings_url}

    # Page count derived from total users / page size of the first page.
    n_page = (data['user_num'] - 1) // len(data['total_rank']) + 1

    problems_info = [{
        'short': f'Q{i + 1}',
        'name': p['title'],
    } for i, p in enumerate(data['questions'])]

    def fetch_page(page):
        # API pages are 1-based.
        url = api_ranking_url_format.format(page + 1)
        content = REQ.get(url)
        return json.loads(content)

    # Submission timestamps are naive; strip tzinfo for arithmetic.
    start_time = self.start_time.replace(tzinfo=None)

    result = {}
    with PoolExecutor(max_workers=8) as executor:
        for data in executor.map(fetch_page, range(n_page)):
            for row, submissions in zip(data['total_rank'], data['submissions']):
                if not submissions:
                    continue
                handle = row.pop('username')
                if users and handle not in users:
                    continue
                row.pop('contest_id')
                row.pop('user_slug')
                row.pop('global_ranking')
                r = result.setdefault(handle, {})
                r['member'] = handle
                r['place'] = row.pop('rank')
                r['solving'] = row.pop('score')

                # Region suffix selects the regional site (e.g. leetcode.cn).
                data_region = row.pop('data_region').lower()
                r['info'] = {
                    'profile_url': {
                        '_data_region': '' if data_region == 'us' else f'-{data_region}'
                    }
                }

                country = None
                for field in 'country_code', 'country_name':
                    country = country or row.pop(field, None)
                if country:
                    r['country'] = country

                solved = 0
                problems = r.setdefault('problems', {})
                for i, (k, s) in enumerate(submissions.items()):
                    p = problems.setdefault(f'Q{i + 1}', {})
                    p['time'] = self.to_time(datetime.fromtimestamp(s['date']) - start_time)
                    if s['status'] == 10:  # accepted
                        solved += 1
                        p['result'] = '+' + str(s['fail_count'] or '')
                    else:
                        p['result'] = f'-{s["fail_count"]}'
                r['solved'] = {'solving': solved}
                finish_time = datetime.fromtimestamp(row.pop('finish_time')) - start_time
                r['penalty'] = self.to_time(finish_time)

                # Whatever keys remain are carried through verbatim.
                r.update(row)

    standings = {
        'result': result,
        'url': standings_url,
        'problems': problems_info,
    }
    return standings
def get_users_infos(users, resource, accounts, pbar=None):
    """Yield profile info (and rating history) for each BestCoder user.

    Logs in once (if the settings page presents a login form), scrapes the
    country <select> to map country ids to names, then fetches each user's
    profile and rating API concurrently.

    Yields dicts with ``info`` and ``contest_addition_update_params``, or
    ``{'info': None}`` / ``{'skip': True}`` for missing/empty profiles.
    """
    page = REQ.get(urljoin(resource.profile_url, Statistic.SETTINGS_URL_))
    form = REQ.form(action=r'login.php\?action=login')
    if form:
        data = {
            'username': conf.BESTCODER_AUTHORID,
            'password': conf.BESTCODER_PASSWORD,
            'remember': 'on',
        }
        page = REQ.submit_form(data=data, form=form)
    # Country id -> name mapping from the settings page's <select>.
    match = re.search('<select[^>]*id="country"[^>]*>.*?</select>', page, re.DOTALL)
    countries = dict(
        re.findall('<option[^>]*value="([0-9]+)"[^>]*>([^<]*)</option>',
                   match.group(0)))

    @RateLimiter(max_calls=5, period=1)
    def fetch_user(user):
        url = resource.profile_url.format(account=user)
        page = REQ.get(url)

        info = {}
        # Pairs like <span>RATING</span><span>1234</span>.
        matches = re.findall(
            r'<span[^>]*>([A-Z]+)</span>\s*<span[^>]*>([0-9]+)</span>',
            page)
        for k, v in matches:
            info[k.lower()] = int(v)
        match = re.search(
            '<img[^>]*src="[^"]*country[^"]*([0-9]+)[^"]*"[^>]*alt="country"[^>]*>',
            page)
        if match:
            info['country'] = countries.get(match.group(1))
        match = re.search(
            '<img[^>]*class="img-circle"[^>]*src="([^"]*getAvatar.php[^"]*)"[^>]*>',
            page)
        if match:
            info['avatar_url'] = urljoin(url, match.group(1))

        # Rating history: derive per-contest old/new/change values.
        page = REQ.get(Statistic.USER_RATING_API_URL_.format(user))
        data = json.loads(page)
        ratings = {}
        old_rating = None
        for stat in data:
            rating = ratings.setdefault(stat['contestid'], collections.OrderedDict())
            new_rating = int(stat['rating'])
            if old_rating is not None:
                rating['old_rating'] = old_rating
                rating['rating_change'] = new_rating - old_rating
            rating['new_rating'] = new_rating
            old_rating = new_rating
            info['rating'] = new_rating
        if not ratings:
            info.pop('rating', None)

        return user, info, ratings

    with PoolExecutor(max_workers=8) as executor:
        for user, info, ratings in executor.map(fetch_user, users):
            if pbar:
                pbar.update()
            if not info:
                # None means the profile is gone; empty dict means skip.
                if info is None:
                    yield {'info': None}
                else:
                    yield {'skip': True}
                continue
            info = {
                'info': info,
                'contest_addition_update_params': {
                    'update': ratings,
                    'by': 'key',
                },
            }
            yield info
def get_standings(self, users=None, statistics=None):
    """Parse standings for a contest with a paged ranklist and hack column.

    Scrapes the contest problems table, then fetches all ranklist pages
    concurrently and parses each row into the common standings format.

    :param statistics: previously stored rows, used to carry over the
        ``old_rating`` / ``rating_change`` / ``new_rating`` fields.
    :return: dict with ``url``, ``problems``, ``result``, ``options``.
    :raises ExceptionParseStandings: when the contest is private.
    """
    page = REQ.get(self.url)
    if 'login.php' in REQ.last_url:
        raise ExceptionParseStandings('private contest')

    table = parsed_table.ParsedTable(
        html=page, xpath='.//table[@id="contest-problems"]//tr')
    problems_infos = collections.OrderedDict()
    for r in table:
        p_info = {
            'short': r['Pro.ID'].value,
            'name': r['Title'].value,
        }
        href = r['Title'].column.node.xpath('.//a/@href')
        if href:
            p_info['url'] = urljoin(self.url, href[0])
        problems_infos[p_info['short']] = p_info

    standings_url = urljoin(
        self.url, self.STANDINGS_URL_FORMAT_.format(key=self.key))
    page = REQ.get(standings_url)

    # The pager links reveal the total number of pages.
    matches = re.findall('"[^"]*contest_ranklist[^"]*page=([0-9]+)', page)
    n_pages = max(map(int, matches)) if matches else 1

    def fetch_page(page):
        url = f'{standings_url}&page={page + 1}'
        return REQ.get(url)

    results = {}
    # FIX: 'User' must map to 'member' -- the row is later read via
    # row['member'] (and stored by that handle), so any other value
    # would raise KeyError.
    header_mapping = {
        'Rank': 'place',
        'User': 'member',
        'Score': 'solving',
        'Hack': 'hack',
    }
    with PoolExecutor(max_workers=4) as executor, tqdm.tqdm(
            total=n_pages, desc='paging') as pbar:
        for page in executor.map(fetch_page, range(n_pages)):
            table = parsed_table.ParsedTable(
                html=page,
                xpath='.//table[@id="contest-ranklist"]//tr',
                header_mapping=header_mapping)
            for r in table:
                row = collections.OrderedDict()
                problems = row.setdefault('problems', {})
                for k, v in r.items():
                    p = k.split()
                    if p[0] not in problems_infos:
                        # Non-problem column: copy through as-is.
                        row[k] = v.value
                        continue
                    # Problem headers look like "<short> <full_score>".
                    short, full_score = p
                    problems_infos[short].setdefault('full_score', full_score)
                    if not v.value:
                        continue
                    p = problems.setdefault(short, {})
                    score, *info = v.value.split()
                    p['result'] = score
                    if score.startswith('-'):
                        continue
                    if 'ondblclick' in v.column.attrs:
                        # The cell's dblclick handler carries the solution ids.
                        ondblclick = v.column.attrs['ondblclick']
                        ids = re.findall('[0-9]+', ondblclick)
                        if len(ids) == 2:
                            url = urljoin(
                                self.url,
                                self.SOLUTION_URL_FORMAT_.format(*ids))
                            p['url'] = url
                            p['external_solution'] = True
                    *info, p['time'] = info
                    if info and info[0] == '(':
                        # "( -N )" marks a penalty score.
                        m = re.search('-([0-9]+)', info[1])
                        if m:
                            p['penalty_score'] = m.group(1)
                        info = info[3:]
                if not problems:
                    continue
                hack = row.pop('hack')
                if hack:
                    # Hack column like "+3 -1": successful/unsuccessful counts.
                    row['hack'] = {'title': 'hacks'}
                    m = re.search(r'\+[0-9]+', hack)
                    row['hack']['successful'] = int(m.group(0)) if m else 0
                    m = re.search(r'\-[0-9]+', hack)
                    row['hack']['unsuccessful'] = -int(m.group(0)) if m else 0
                handle = row['member']
                if statistics and handle in statistics:
                    stat = statistics[handle]
                    for k in ('old_rating', 'rating_change', 'new_rating'):
                        if k in stat:
                            row[k] = stat[k]
                results[handle] = row
                pbar.update()

    ret = {
        'url': standings_url,
        'problems': list(problems_infos.values()),
        'result': results,
        'options': {
            'fixed_fields': [('hack', 'Hack')],
        },
    }
    return ret
def get_users_infos(users, resource=None, accounts=None, pbar=None):
    """Fetch Codeforces ``user.info`` for *users*, following renames.

    The handle list is batched so the joined ``handles`` parameter stays
    under the API's URL-length limit.  Handles the API rejects are probed
    via their profile URL: a redirect to the site root means the account
    was removed (``info: None``); a redirect to another profile means a
    rename, which is reported via the ``rename`` key.

    NOTE: mutates *users* in place while resolving renames/removals, but
    restores removed handles before returning.

    :return: list of ``{'info': ...}`` dicts, aligned with *users*.
    :raises NameError: on an unexpected API failure.
    :raises ValueError: when a returned handle does not match the query.
    """
    handles = ';'.join(users)

    len_limit = 1000
    if len(handles) > len_limit:
        # Split the batch at the handle that crosses the limit.
        s = 0
        for i in range(len(users)):
            s += len(users[i])
            if s > len_limit:
                # FIX: pbar must be passed by keyword; positionally it
                # landed in the ``resource`` parameter, so the progress
                # bar was silently dropped on recursive calls.
                return Statistic.get_users_infos(users[:i], pbar=pbar) \
                    + Statistic.get_users_infos(users[i:], pbar=pbar)

    removed = []
    last_index = 0
    orig_users = list(users)
    while True:
        handles = ';'.join(users)
        data = _query(method='user.info', params={'handles': handles})
        if data['status'] == 'OK':
            break
        if data['status'] == 'FAILED' and data['comment'].startswith(
                'handles: User with handle'):
            # Comment: "handles: User with handle <h> not found".
            handle = data['comment'].split()[-3]
            location = REQ.geturl(f'https://codeforces.com/profile/{handle}')
            index = users.index(handle)
            if location.endswith('//codeforces.com/'):
                # Profile redirects to the root: account removed.
                removed.append((index, users[index]))
                users.pop(index)
            else:
                # Redirected to another profile: account renamed.
                target = location.rstrip('/').split('/')[-1]
                users[index] = target
            if pbar is not None:
                pbar.update(index - last_index)
            last_index = index
        else:
            raise NameError(f'data = {data}')
    if pbar is not None:
        pbar.update(len(users) - last_index)

    infos = data['result']
    # Reinsert placeholders for removed accounts to realign with users.
    for index, user in removed:
        infos.insert(index, None)
        users.insert(index, user)

    ret = []
    assert len(infos) == len(users)
    for data, user, orig in zip(infos, users, orig_users):
        if data:
            if data['handle'].lower() != user.lower():
                raise ValueError(
                    f'Do not match handle name for user = {user} and data = {data}'
                )
            # Drop placeholder images.
            if data.get('avatar', '').endswith('/no-avatar.jpg'):
                data.pop('avatar')
            if data.get('titlePhoto', '').endswith('/no-title.jpg'):
                data.pop('titlePhoto')
        ret.append({'info': data})
        if data and data['handle'] != orig:
            ret[-1]['rename'] = data['handle']
    return ret
def _hashcode(self, users=None, statistics=None):
    """Build standings for a Google Hash Code round from the archive JSON.

    Picks the matching round from the yearly archive (falling back to a
    configured ``hashcode_scoreboard`` URL), then converts each scoreboard
    row into a result entry keyed by ``"<team name>, <season>"``.

    :return: dict with ``result``, ``problems`` and optionally ``url``.
    :raises ExceptionParseStandings: when no scoreboard data is found.
    """
    standings_url = None
    is_final_round = self.name.endswith('Final Round')
    page = REQ.get(
        self.ARCHIVE_DATA_URL_FORMAT_.format(year=self.start_time.year))
    data = json.loads(page)
    names = set()
    for data_round in data['rounds']:
        name = data_round['name']
        # A duplicated round name is treated as the Qualification Round.
        if name in names:
            name = 'Qualification Round'
        # NOTE precedence: this reads as
        #   endswith(name) or (name in [...] and is_final_round)
        # i.e. 'Full ranking' / 'Main round' only match final rounds.
        if self.name.endswith(name) or name in [
                'Full ranking', 'Main round'
        ] and is_final_round:
            data = data_round['data']
            standings_url = self.ARCHIVE_URL_FORMAT_.format(
                year=self.start_time.year)
            break
        names.add(name)
    else:
        data = None  # no round matched

    if not data:
        if 'hashcode_scoreboard' in self.info:
            page = REQ.get(self.info['hashcode_scoreboard'])
            data = json.loads(page)
        else:
            raise ExceptionParseStandings('Not found data')

    # Data may be either a list of dicts or columns + rows.
    if 'columns' in data:
        columns = data['columns']
        data = data['rows']
    else:
        columns = None

    result = {}
    season = self.get_season()
    for rank, row in enumerate(data, start=1):
        if columns is not None:
            row = dict(zip(columns, row))
        # Normalize header keys: lowercase, no spaces.
        row = {k.lower().replace(' ', ''): v for k, v in row.items()}
        name = row.pop('teamname')
        name = unescape(name)
        member = f'{name}, {season}'
        if users is not None and name not in users:
            continue
        r = result.setdefault(member, {})
        r['name'] = name
        r['member'] = member
        score = row.pop('score', '0')
        # Strip thousands separators; fall back to '0' if not numeric.
        score = re.sub(r'[\s,]', '', str(score))
        try:
            float(score)
        except Exception:
            score = '0'
        r['solving'] = score
        if 'rank' in row:
            r['place'] = row.pop('rank')
        else:
            r['place'] = rank  # implicit place by row order
        if 'country' in row:
            r['_countries'] = re.sub(r',\s+', ',', row.pop('country')).split(',')
        elif 'countries' in row:
            r['_countries'] = row.pop('countries')
        if 'finalround' in row:
            r['advanced'] = row['finalround']

    standings = {
        'result': result,
        'problems': [],
    }
    if standings_url:
        standings['url'] = standings_url
    return standings
def fetch_data(handle):
    """Fetch the rating-history JSON for *handle*; return (handle, data)."""
    history_url = self.HISTORY_URL_.format(urlparse(self.url), handle)
    raw = REQ.get(f'{history_url}/json')
    return handle, json.loads(raw)
def get_standings(self, users=None, statistics=None):
    """Parse an Advent of Code daily leaderboard.

    Scores each leaderboard entry as ``101 - rank``; the rank sequence
    restarting (or repeating) marks the boundary between the two stars of
    a day, producing up to two pseudo-problems.  Members without a profile
    link are keyed as ``"<name>, <season>"``.

    :return: dict with ``result``, ``url``, ``problems``, optionally
        ``title`` and ``timing_statistic_delta``.
    """
    # Season = AoC event year span (event runs in December).
    year = self.start_time.year
    year = year if self.start_time.month >= 9 else year - 1
    season = '%d-%d' % (year, year + 1)

    ret = {}

    page = REQ.get(self.url)
    match = re.search(
        rf'<h2>[^<]*Day\s*[0-9]+:\s*(?P<problem_name>[^<]*)</h2>', page)
    problem_name = match.group('problem_name').strip('-').strip()
    if self.name.count('.') == 1 and problem_name:
        ret['title'] = f'{self.name}. {problem_name}'

    standings_url = self.standings_url or self.url.replace(
        '/day/', '/leaderboard/day/')
    page = REQ.get(standings_url)

    matches = re.finditer(
        r'''
        <div[^>]*class="leaderboard-entry"[^>]*>\s*
        <span[^>]*class="leaderboard-position"[^>]*>\s*(?P<rank>[0-9]+)[^<]*</span>\s*
        <span[^>]*class="leaderboard-time"[^>]*>(?P<time>[^<]*)</span>\s*
        (?:<a[^>]*href="(?P<href>[^"]*)"[^>]*>\s*)?
        <span[^>]*class="leaderboard-userphoto"[^>]*>(\s*<img[^>]*src="(?P<avatar>[^"]*)"[^>]*>)?[^<]*</span>\s*
        (?:<span[^>]*class="leaderboard-anon"[^>]*>)?(?P<name>[^<]*)
        ''',
        page,
        re.VERBOSE)

    problems_info = OrderedDict()
    result = {}

    last = None
    n_problems = 0
    n_results = 0
    for match in matches:
        n_results += 1
        href = match.group('href')
        name = html.unescape(match.group('name')).strip()
        # Prefer a profile-link handle; anonymous users keep their label;
        # everyone else is keyed by name + season.
        if href:
            handle = href.split('//')[-1].strip('/')
        elif re.match(r'^\(anonymous user #[0-9]+\)$', name):
            handle = name
        else:
            handle = f'{name}, {season}'
        handle = handle.replace('/', '-')
        rank = int(match.group('rank'))
        # Rank resetting/non-increasing means a new star's list started.
        if last is None or last >= rank:
            n_problems += 1
        last = rank

        row = result.setdefault(handle, {
            'solving': 0,
            '_skip_for_problem_stat': True,
        })
        score = 100 - rank + 1  # 1st place = 100 points
        row['solving'] += score
        row['name'] = name
        row['member'] = handle

        avatar = match.group('avatar')
        if avatar:
            row['info'] = {'avatar': avatar}

        k = str(n_problems)
        if k not in problems_info:
            problems_info[k] = {
                'name': problem_name,
                'code': k,
                'url': self.url,
                'group': 0,
                'full_score': 100,
            }

        problem = row.setdefault('problems', {}).setdefault(k, {})
        problem['result'] = score
        # Leaderboard times are US/Eastern (-05:00) without a year.
        time = f'''{self.start_time.year} {match.group('time')} -05:00'''
        problem['time'] = self.to_time(
            arrow.get(time, 'YYYY MMM D HH:mm:ss ZZ') - self.start_time)
        if rank == 1:
            problem['first_ac'] = True

    # The page lists the second star first; reverse so 'first star' is [0].
    problems = list(reversed(problems_info.values()))
    problems[0]['subname'] = 'first star'
    if len(problems) > 1:
        problems[1]['subname'] = 'both stars'

    # Recompute places from total score; ties share a place.
    place = None
    last = None
    for rank, row in enumerate(
            sorted(result.values(), key=lambda r: -r['solving']), start=1):
        score = row['solving']
        if last != score:
            place = rank
            last = score
        row['place'] = place

    ret.update({
        'result': result,
        'url': standings_url,
        'problems': problems,
    })
    # Leaderboard still filling up: ask for a quick re-check.
    if n_results < 200:
        ret['timing_statistic_delta'] = timedelta(minutes=5)
    return ret
def fetch_page(page_index):
    """Return (page_html, url) for standings page *page_index* (0 = first)."""
    url = self.standings_url
    if page_index:
        url = f'{url}?page={page_index}'
    return REQ.get(url), url
def get_standings(self, users=None, statistics=None):
    """Parse an ICPC-style HTML standings table (with optional XML results).

    Falls back from a ``class="standings"`` table to the first table on the
    page.  Optionally enriches rows with regions scraped from
    icpc.kimden.online when enabled via ``use_icpc.kimden.online``.

    :return: dict with ``result``, ``url``, ``problems``,
        ``problems_time_format`` and ``hidden_fields``.
    """
    # Academic season for member keys.
    year = self.start_time.year
    year = year if self.start_time.month >= 9 else year - 1
    season = '%d-%d' % (year, year + 1)

    result = {}
    problems_info = OrderedDict()

    page = REQ.get(self.standings_url)

    # A sibling .xml file, when present, carries detailed per-problem data.
    try:
        standings_xml = REQ.get(self.standings_url.replace('.html', '.xml'),
                                detect_charsets=False)
        xml_result = parse_xml(standings_xml)
    except FailOnGetResponse:
        xml_result = {}

    regex = '<table[^>]*class="standings"[^>]*>.*?</table>'
    match = re.search(regex, page, re.DOTALL)
    if not match:
        # Fallback: strip wrapper tables, take the first remaining table.
        page = re.sub('<table[^>]*wrapper[^>]*>', '', page)
        regex = '<table[^>]*>.*?</table>'
        match = re.search(regex, page, re.DOTALL)
    html_table = match.group(0)
    table = parsed_table.ParsedTable(html_table, as_list=True)

    # Optional per-resource regex extracting the university from the name.
    university_regex = self.info.get('standings', {}).get('1st_u', {}).get('regex')

    for r in table:
        row = {}
        problems = row.setdefault('problems', {})
        for k, v in r:
            k = k.split()[0]
            if k == 'Total' or k == '=':
                row['solving'] = int(v.value)
            elif len(k) <= 3:
                # Short headers are problem columns.
                problems_info[k] = {'short': k}
                if 'title' in v.attrs:
                    problems_info[k]['name'] = v.attrs['title']
                if '-' in v.value or '+' in v.value or '?' in v.value:
                    p = problems.setdefault(k, {})
                    if ' ' in v.value:
                        point, time = v.value.split()
                    else:
                        point = v.value
                        time = None
                    # Conflicting duplicate cell: reset before overwriting.
                    if 'result' in p and point != p.get('result'):
                        p.clear()
                    p['result'] = point
                    if time is not None:
                        p['time'] = time
                    first_ac = v.column.node.xpath(
                        './/*[@class="first-to-solve"]')
                    if len(first_ac):
                        p['first_ac'] = True
            elif k == 'Time':
                row['penalty'] = int(v.value)
            elif k.lower() in ['place', 'rank']:
                row['place'] = v.value.strip('.')
            elif 'team' in k.lower() or 'name' in k.lower():
                if xml_result:
                    problems.update(xml_result[v.value])
                row['member'] = v.value + ' ' + season
                row['name'] = v.value
            else:
                row[k] = v.value
        # Medal column may be Cyrillic (З/С/Б) or Latin (G/S/B).
        for f in 'diploma', 'medal':
            medal = row.pop(f, None) or row.pop(f.title(), None)
            if medal:
                if medal in ['З', 'G']:
                    row['medal'] = 'gold'
                elif medal in ['С', 'S']:
                    row['medal'] = 'silver'
                elif medal in ['Б', 'B']:
                    row['medal'] = 'bronze'
                break
        if university_regex:
            match = re.search(university_regex, row['name'])
            if match:
                u = match.group('key').strip()
                row['university'] = u

        result[row['member']] = row

    if statistics and self.info.get('use_icpc.kimden.online'):
        # Lazily-built mapping of canonical team name -> region.
        team_regions = {}

        def canonize_name(name):
            name = re.sub(':', '', name)
            name = re.sub(r'\s+', ' ', name)
            return name

        def get_region(team_name):
            nonlocal team_regions
            if not team_regions:
                page = REQ.get('https://icpc.kimden.online/')
                # Region selector labels give selector-class -> region name.
                matches = re.finditer(
                    '<label[^>]*for="(?P<selector>[^"]*)"[^"]*onclick="setRegion[^"]*"[^>]*>(?P<name>[^>]*)</',
                    page,
                )
                regions = {}
                for match in matches:
                    selector = match.group('selector').replace(
                        'selector', '').replace('--', '-')
                    regions[selector] = match.group('name')
                pprint(regions)  # NOTE(review): debug output left in place
                matches = re.finditer(
                    r'''
                    <tr[^>]*class="(?P<class>[^"]*)"[^>]*>\s*<td[^>]*>[^<]*</td>\s*<td[^>]*title="(?P<name>[^"]*)">[^<]*</td>
                    ''',
                    page,
                    re.VERBOSE,
                )
                for match in matches:
                    classes = match.group('class').split()
                    name = match.group('name')
                    name = canonize_name(name)
                    for c in classes:
                        if c in regions:
                            team_regions[name] = regions[c]
                            break
            team_name = canonize_name(team_name)
            return team_regions[team_name]

        for row in result.values():
            stat = statistics.get(row['member'])
            if not stat:
                continue
            if stat.get('region'):
                row['region'] = stat['region']
            else:
                row['region'] = get_region(row['name'])

    standings = {
        'result': result,
        'url': self.standings_url,
        'problems': list(problems_info.values()),
        'problems_time_format': '{M}:{s:02d}',
        'hidden_fields': ['university', 'region', 'medal'],
    }
    return standings
def get_standings(self, users=None, statistics=None):
    """Parse a paginated HTML standings table.

    The last pager link determines the page count; pages are fetched
    concurrently but processed in order, so places can be assigned by a
    running index with ties sharing a place.

    :return: dict with ``result``, ``url``, ``problems``.
    """
    result = {}
    problems_info = OrderedDict()

    page = REQ.get(self.standings_url)
    # The highest page number in the pager links.
    match = re.findall(
        '<a[^>]href="[^"]*page=[0-9]+"[^>]*>(?P<n_page>[0-9]+)</a>', page)
    n_page = 1 if not match else int(match[-1])

    def fetch_page(page_index):
        url = self.standings_url
        if page_index:
            url += f'?page={page_index}'
        return REQ.get(url), url

    place = 0
    idx = 0
    prev = None
    with PoolExecutor(max_workers=8) as executor, tqdm.tqdm(
            total=n_page, desc='fetch pages') as pbar:
        # executor.map preserves page order, so idx/place stay consistent.
        for page, url in executor.map(fetch_page, range(n_page)):
            pbar.set_postfix(url=url)
            pbar.update(1)

            regex = '<table[^>]*>.*?</table>'
            match = re.search(regex, page, re.DOTALL)
            html_table = match.group(0)
            table = parsed_table.ParsedTable(html_table)
            for r in table:
                idx += 1
                row = {}
                problems = row.setdefault('problems', {})
                for k, v in list(r.items()):
                    k = k.split()[0]
                    if k.lower() == 'score':
                        # "score (penalty)" or just "score".
                        solving, *a = v.value.split()
                        row['solving'] = int(solving)
                        if a:
                            row['penalty'] = int(re.sub(r'[\(\)]', '', a[0]))
                    elif len(k) == 1:
                        # Single-letter columns are problems.
                        if k not in problems_info:
                            problems_info[k] = {'short': k}
                            title = first(v.header.node.xpath('a[@title]/@title'))
                            url = first(v.header.node.xpath('a[@href]/@href'))
                            if title:
                                problems_info[k]['name'] = title
                            if url:
                                problems_info[k]['url'] = urllib.parse.urljoin(
                                    self.standings_url, url)
                        if '-' in v.value or '+' in v.value:
                            p = problems.setdefault(k, {})
                            if ' ' in v.value:
                                point, time = v.value.split()
                                p['time'] = time
                            else:
                                point = v.value
                            if point == '+0':
                                point = '+'  # normalize first-try AC
                            p['result'] = point
                        elif v.value.isdigit():
                            p = problems.setdefault(k, {})
                            p['result'] = v.value
                    elif k.lower() == 'user':
                        row['member'] = v.value
                    else:
                        row[k] = v.value
                if 'penalty' not in row:
                    # Score-based contest: count fully solved (100) problems.
                    solved = [p for p in list(problems.values())
                              if p['result'] == '100']
                    row['solved'] = {'solving': len(solved)}
                # Ties (same solving & penalty) share a place.
                curr = (row['solving'], row.get('penalty'))
                if prev is None or prev != curr:
                    place = idx
                    prev = curr
                row['place'] = place
                result[row['member']] = row

    standings = {
        'result': result,
        'url': self.standings_url,
        'problems': list(problems_info.values()),
    }
    return standings
def get_standings(self, users=None, statistics=None):
    """Parse an olympiad results table (100-point tasks, per-task columns).

    Task columns are identified by the ``taskscore`` header class; rows
    without a contestant link get a synthetic ``<year>-<index>`` member id.

    :return: dict with ``result``, ``url``, ``problems``.
    """
    result = {}
    problems_info = OrderedDict()

    year = self.start_time.year
    if not self.standings_url:
        # Results page lives alongside the olympiad page.
        self.standings_url = self.url.replace('/olympiads/', '/results/')

    page = REQ.get(self.standings_url)
    regex = '<table[^>]*>.*?</table>'
    html_table = re.search(regex, page, re.DOTALL).group(0)
    table = parsed_table.ParsedTable(html_table, as_list=True)

    idx = 0  # counter for anonymous contestants
    for r in table:
        row = OrderedDict()
        problems = row.setdefault('problems', {})
        problem_idx = 0
        for k, v in r:
            if 'taskscore' in v.header.attrs.get('class', '').split():
                # A task column; the header text is the task name.
                problem_idx += 1
                d = problems_info.setdefault(problem_idx, {})
                d['short'] = str(problem_idx)
                d['full_score'] = 100
                d['name'] = k
                try:
                    score = float(v.value)
                    p = problems.setdefault(str(problem_idx), {})
                    p['result'] = v.value
                    p['partial'] = score < 100
                except Exception:
                    # Non-numeric cell (empty / dash): no attempt recorded.
                    pass
            elif k == 'Abs.':
                row['solving'] = float(v.value)
            elif k == 'Rank':
                row['place'] = v.value.strip('*').strip('.')
            elif k == 'Contestant':
                if not v.value:
                    # Anonymous contestant: synthesize a stable member id.
                    idx += 1
                    member = f'{year}-{idx:06d}'
                    row['member'] = member
                else:
                    # Member id taken from the profile link slug.
                    url = first(v.column.node.xpath('a[@href]/@href'))
                    member = url.strip('/').split('/')[-1]
                    row['member'] = member
                    row['name'] = v.value
            elif k == 'Country':
                # Strip a trailing participant count, e.g. "Poland 12".
                country = re.sub(r'\s*[0-9]+$', '', v.value)
                if country:
                    row['country'] = country
            else:
                val = v.value.strip()
                if val:
                    row[k] = val
        result[row['member']] = row

    standings = {
        'result': result,
        'url': self.standings_url,
        'problems': list(problems_info.values()),
    }
    return standings
def get_standings(self, users=None, statistics=None):
    """Build standings from the HackerRank leaderboard APIs.

    Supports both URL shapes: ``/contests/`` (REST contests API) and
    ``/competitions/`` (hrw resources API); the two return differently
    shaped rows, which ``process_data`` normalizes.  School ids are
    resolved to names in batches via the community schools API.

    :return: dict with ``result``, ``hidden_fields``, ``url``; or
        ``{'action': 'delete'}`` on a 404 (contest removed).
    :raises ExceptionParseStandings: on an unrecognized URL shape.
    """
    standings_url = self.url.rstrip('/') + '/leaderboard'

    per_page = 100
    if '/contests/' in self.url:
        api_standings_url_format = standings_url.replace(
            '/contests/', '/rest/contests/')
        api_standings_url_format += '?offset={offset}&limit={limit}&include_practice=true'
    elif '/competitions/' in self.url:
        # hrw API: first resolve the leaderboard entry id.
        url = self.host + f'api/hrw/resources/{self.key}?include=leaderboard'
        page = REQ.get(url)
        data = json.loads(page)
        entry_id = data['included'][0]['id']
        api_standings_url_format = self.host + f'api/hrw/resources/leaderboards/{entry_id}/leaderboard_entries'
        api_standings_url_format += '?page[limit]={limit}&page[offset]={offset}'
    else:
        raise ExceptionParseStandings(f'Unusual url = {self.url}')

    @RateLimiter(max_calls=1, period=2)
    def fetch_page(page):
        offset = (page - 1) * per_page
        url = api_standings_url_format.format(offset=offset, limit=per_page)
        page = Statistic.get(url)
        data = json.loads(page)
        return data

    result = {}
    hidden_fields = set()
    schools = dict()  # school id -> name cache

    def process_data(data):
        # Two row containers: 'models' (contests) or 'data' (hrw).
        rows = data['models'] if 'models' in data else data['data']
        school_ids = set()
        for r in rows:
            if isinstance(r.get('attributes'), dict):
                r = r['attributes']

            def get(*fields):
                # Pop the first present field from the row, if any.
                for f in fields:
                    if f in r:
                        return r.pop(f)

            handle = get('hacker', 'name')
            if handle is None:
                continue
            row = result.setdefault(handle, collections.OrderedDict())
            row['member'] = handle
            score = get('score', 'solved_challenges')
            if score is None:
                score = get('percentage_score') * 100
            row['solving'] = score
            row['place'] = get('rank', 'leaderboard_rank')

            time = get('time_taken', 'time_taken_seconds')
            if time:
                row['time'] = self.to_time(time, 3)

            country = get('country')
            if country:
                row['country'] = country

            avatar_url = get('avatar')
            if avatar_url:
                row['info'] = {'avatar_url': avatar_url}

            # Unknown leftover keys are kept but hidden.
            for k, v in r.items():
                if k not in row and v is not None:
                    row[k] = v
                    hidden_fields.add(k)

            if statistics and handle in statistics:
                stat = statistics[handle]
                for k in ('old_rating', 'rating_change', 'new_rating'):
                    if k in stat:
                        row[k] = stat[k]

            if 'school_id' in row and row['school_id'] not in schools:
                school_ids.add(row['school_id'])

        if school_ids:
            # Resolve new school ids to names in one batched request.
            query = ','.join(school_ids)
            url = self.host + f'community/v1/schools?page[limit]={len(school_ids)}&filter[unique_id]={query}'
            page = REQ.get(url)
            data = json.loads(page)
            for s in data['data']:
                schools[s['id']] = s['attributes']['name']
        for row in result.values():
            if 'school_id' in row and 'school' not in row:
                row['school'] = schools[row['school_id']]

    try:
        data = fetch_page(1)
    except FailOnGetResponse as e:
        if e.code == 404:
            return {'action': 'delete'}
        raise e

    process_data(data)
    total = data['meta']['record_count'] if 'meta' in data else data['total']
    n_pages = (total - 1) // (per_page) + 1
    with ExitStack() as stack:
        executor = stack.enter_context(
            PoolExecutor(max_workers=Statistic.MAX_WORKERS))
        pbar = stack.enter_context(
            tqdm(total=n_pages - 1, desc='getting pages'))
        # NOTE(review): starts at 1, so the first page is fetched twice.
        for data in executor.map(fetch_page, range(1, n_pages + 1)):
            process_data(data)
            pbar.set_postfix(delay=f'{Statistic.DELAY:.5f}', refresh=False)
            pbar.update()

    hidden_fields.discard('school')
    standings = {
        'result': result,
        'hidden_fields': list(hidden_fields),
        'url': standings_url,
    }
    return standings
def get_standings_from_html(self):
    """Parse standings from the HTML standings table (English locale).

    Handles both individual rows (member keyed by name + season) and team
    rows, where each profile link becomes a separate result entry sharing
    the team's data.

    :return: dict with ``result``, ``url``, ``problems``.
    """
    url = urljoin(self.standings_url, '?lang=en')
    page = REQ.get(url)
    regex = '''<table[^>]*standings[^>]*>.*?</table>'''
    match = re.search(regex, page, re.DOTALL)
    html_table = match.group(0)
    mapping = {
        '#': 'place',
        'Who': 'name',
        '=': 'solving',
        'Penalty': 'penalty',
    }
    table = parsed_table.ParsedTable(html_table, header_mapping=mapping)
    season = self.get_season()

    problems_info = OrderedDict()
    result = {}
    for r in table:
        row = {}
        problems = row.setdefault('problems', {})
        for k, v in r.items():
            if len(k) == 1:
                # Single-letter columns are problems: "result [time]".
                problems_info.setdefault(k, {'short': k})
                if v.value:
                    p = problems.setdefault(k, {})
                    v = v.value
                    if ' ' in v:
                        v, p['time'] = v.split()
                    p['result'] = v
            elif k == 'name':
                f = v.column.node.xpath('.//img[@class="standings-flag"]/@title')
                if f:
                    row['country'] = f[0]
                a = v.column.node.xpath('.//a')
                if not a:
                    # Plain text: individual participant without a profile.
                    row[k] = v.value
                    row['member'] = row['name'] + ' ' + season
                else:
                    # Links: /team/<id> and/or /profile/<handle>.
                    for el in a:
                        href = el.attrib.get('href')
                        if not href:
                            continue
                        key, val = href.strip('/').split('/')
                        if key == 'team':
                            row['name'] = el.text
                            row['team_id'] = val
                            row['_account_url'] = urljoin(url, href)
                        elif key == 'profile':
                            row.setdefault('members', []).append(val)
            elif v.value:
                if k == 'penalty':
                    row[k] = int(v.value)
                elif v.value:
                    row[k] = v.value
        if 'solving' not in row:
            continue
        if 'members' in row:
            # Team row: duplicate the row for every member handle.
            if 'team_id' in row:
                row['_members'] = [{'account': m} for m in row['members']]
            for member in row.pop('members'):
                result[member] = deepcopy(row)
                result[member]['member'] = member
        else:
            result[row['member']] = row

    standings = {
        'result': result,
        'url': self.standings_url,
        'problems': list(problems_info.values()),
    }
    return standings
def get(*args, **kwargs):
    """Fetch a page via REQ, then strip whitespace from the values of
    `__`-prefixed cookies belonging to this domain (they otherwise break
    cookie round-tripping)."""
    page = REQ.get(*args, **kwargs)
    for cookie in REQ.get_raw_cookies():
        if cookie.domain == domain and cookie.name.startswith('__'):
            cookie.value = re.sub(r'\s*', '', cookie.value)
    return page
def _query(method, params, api_key=DEFAULT_API_KEY, prev_time_queries={}, api_url_format='https://codeforces.com/api/%s'):
    """Perform a signed Codeforces API request with rate limiting and retries.

    NOTE: the mutable default `prev_time_queries` is intentional — it serves
    as a cross-call cache of request timestamps per (key, secret) pair, used
    to throttle to at most 5 requests per ~4 seconds.
    """
    url = api_url_format % method
    key, secret = api_key
    params = dict(params)
    params.update({
        'time': int(time()),
        'apiKey': key,
        'lang': 'en',
    })
    url_encode = '&'.join(('%s=%s' % (k, v) for k, v in sorted(params.items())))
    # Signature scheme per Codeforces API docs: random 6-char prefix + sha512 hash.
    api_sig_prefix = ''.join(choice(ascii_lowercase) for x in range(6))
    api_sig = '%s/%s?%s#%s' % (
        api_sig_prefix,
        method,
        url_encode,
        secret,
    )
    params['apiSig'] = api_sig_prefix + sha512(api_sig.encode('utf8')).hexdigest()
    url += '?' + urlencode(params)

    times = prev_time_queries.setdefault((key, secret), [])
    if len(times) == 5:
        delta = max(4 - (time() - times[0]), 0)
        sleep(delta)
        times.clear()

    # Cache key must ignore the volatile signature/time query params.
    md5_file_cache = url
    for k in ('apiSig', 'time'):
        md5_file_cache = re.sub('%s=[0-9a-z]+' % k, '', md5_file_cache)

    times.append(time())
    for attempt in reversed(range(5)):
        try:
            page = REQ.get(url, md5_file_cache=md5_file_cache)
            times[-1] = time()
            ret = json.loads(page)
        except FailOnGetResponse as e:
            if e.code == 503 and attempt:
                # Service unavailable: back off briefly and retry.
                sleep(1)
                continue
            err = e.args[0]
            if hasattr(err, 'fp'):
                try:
                    ret = json.load(err.fp)
                except json.decoder.JSONDecodeError:
                    ret = {'status': str(e)}
            else:
                ret = {'status': str(e)}
            ret['code'] = getattr(err, 'code', None)
        break
    return ret
def get_standings(self, users=None, statistics=None):
    """Collect CodinGame leaderboard standings.

    The global leaderboard caps at 1000 users, so when it is full the
    leaderboard is re-fetched filtered by every language and country to
    recover the long tail.
    """
    urlinfo = urllib.parse.urlparse(self.url)
    host = f'{urlinfo.scheme}://{urlinfo.netloc}/'
    page = REQ.get(
        host + 'services/Challenge/findWorldCupByPublicId',
        post=f'["{self.key}", null]',
        content_type='application/json',
    )
    data = json.loads(page)
    challenge = data.get('challenge', {})
    clash_hubs = challenge.get('clashHubs')

    def get_leaderboard(url, column="", value=""):
        # Clash hubs and regular challenges use different POST payload shapes.
        active = 'true' if column else 'false'
        filt = f'{{"active":{active},"column":"{column}","filter":"{value}"}}'
        if clash_hubs:
            post = f'[1,{filt},null,true,"global",{clash_hubs[0]["clashHubId"]}]'
        else:
            post = f'["{self.key}",null,"global",{filt}]'
        page = REQ.get(url, post=post, content_type='application/json')
        data = json.loads(page)
        return data

    if clash_hubs:
        url = host + 'services/Leaderboards/getClashLeaderboard'
    else:
        url = host + 'services/Leaderboards/getFilteredChallengeLeaderboard'
    data = get_leaderboard(url)

    standings_url = os.path.join(self.url, 'leaderboard')
    page = REQ.get(standings_url)
    # The country list is embedded in the site's JS bundle as non-strict JSON.
    match = re.search(r'<script[^>]*src="(?P<js>[^"]*static.codingame.com/app\.[^"]*\.js)"[^>]*>', page)
    page = REQ.get(match.group('js'), detect_charsets=None)
    match = re.search(r'const t={EN:(?P<countries>\[{id:"[^"]*",name:"[^"]*"},.*?}]),[A-Z]{2}:', page)
    countries = match.group('countries')
    countries = countries.replace('id:', '"id":')
    countries = countries.replace('name:', '"name":')
    countries = json.loads(countries)
    countries = [c['id'] for c in countries]

    languages = list(data.get('programmingLanguages', {}).keys())

    with PoolExecutor(max_workers=8) as executor:
        hidden_fields = set()
        result = {}

        def process_data(data):
            nonlocal hidden_fields
            nonlocal result
            for row in data['users']:
                if 'codingamer' not in row:
                    continue
                info = row.pop('codingamer')
                row.update(info)
                info['profile_url'] = {'public_handle': info.pop('publicHandle')}
                handle = str(info.pop('userId'))
                if handle in result:
                    continue
                r = result.setdefault(handle, OrderedDict())
                r['member'] = handle
                r['place'] = row.pop('rank')
                r['info'] = info
                if 'league' in row:
                    league = row.pop('league')
                    r['league'] = league['divisionIndex']
                    r['league_rank'] = row.pop('localRank')
                for field, out in (
                    ('score', 'solving'),
                    ('programmingLanguage', 'language'),
                    ('clashes_count', 'clashes_count'),
                    ('pseudo', 'name'),
                    ('countryId', 'country'),
                    ('company', 'company'),
                    ('school', 'school'),
                ):
                    if field in row:
                        r[out] = row.pop(field)
                if 'updateTime' in row:
                    row['updated'] = row.pop('updateTime') / 1000
                if 'creationTime' in row:
                    row['created'] = row.pop('creationTime') / 1000
                row.pop('public_handle', None)
                row.pop('test_session_handle', None)
                row.pop('avatar', None)
                # Anything left over is kept but hidden from the main table.
                for k, v in row.items():
                    if k not in r:
                        r[k] = v
                        hidden_fields.add(k)

        process_data(data)

        if len(data['users']) >= 1000:
            # Leaderboard is truncated: sweep by language, then by country.
            fetch_data = partial(get_leaderboard, url, "LANGUAGE")
            for data in tqdm.tqdm(executor.map(fetch_data, languages), total=len(languages), desc='languages'):
                process_data(data)
            fetch_data = partial(get_leaderboard, url, "COUNTRY")
            for data in tqdm.tqdm(executor.map(fetch_data, countries), total=len(countries), desc='countries'):
                process_data(data)

    standings = {
        'url': standings_url,
        'result': result,
        'fields_types': {'updated': ['timestamp'], 'created': ['timestamp']},
        'hidden_fields': hidden_fields,
        'options': {
            'fixed_fields': [
                ('league', 'league'),
                ('league_rank', 'league_rank'),
                ('language', 'Language'),
                ('clashes_count', 'clashes_count'),
                ('created', 'Submit Time'),
            ],
            'medals': [
                {'name': 'gold', 'count': 1},
                {'name': 'silver', 'count': 1},
                {'name': 'bronze', 'count': 1},
            ],
        },
    }
    return standings
def _old_get_standings(self, users=None):
    """Parse the legacy Google Code Jam scoreboard (GCJ.* JS globals)."""
    if not self.standings_url:
        self.standings_url = self.url.replace('/dashboard', '/scoreboard')
    result = {}
    page = REQ.get(self.standings_url)

    # Scoreboard parameters are exposed as `GCJ.<key> = <value>` assignments.
    matches = re.finditer(r'GCJ.(?P<key>[^\s]*)\s*=\s*"?(?P<value>[^";]*)', page)
    vs = {m.group('key'): m.group('value') for m in matches}
    vs['rowsPerPage'] = int(vs['rowsPerPage'])

    matches = re.finditer(r'GCJ.problems.push\((?P<problem>{[^}]*})', page)
    problems_info = OrderedDict([])
    problems = [json.loads(m.group('problem')) for m in matches]

    # Subtasks are pushed via io.push(...); a preceding `();` starts a new task.
    matches = re.finditer(r'(?P<new>\(\);)?\s*io.push\((?P<subtask>{[^}]*})', page)
    tid = -1
    for idx, m in enumerate(matches):
        subtask = json.loads(m.group('subtask'))
        if m.group('new'):
            tid += 1
        idx = str(idx)
        task = problems[tid].copy()
        task.update(subtask)
        task['name'] = task.pop('title')
        task['code'] = idx
        task['full_score'] = task.pop('points')
        problems_info[idx] = task

    def fetch_page(page_idx):
        nonlocal vs
        params = {
            'cmd': 'GetScoreboard',
            'contest_id': vs['contestId'],
            'show_type': 'all',
            'start_pos': page_idx * vs['rowsPerPage'] + 1,
            'csrfmiddlewaretoken': vs['csrfMiddlewareToken'],
        }
        url = os.path.join(self.standings_url, 'do') + '?' + urllib.parse.urlencode(params)
        page = REQ.get(url)
        data = json.loads(page)
        return data

    data = fetch_page(0)
    n_page = (data['stat']['nrp'] - 1) // vs['rowsPerPage'] + 1

    def time2str(t):
        # Seconds -> "H:MM:SS" or "M:SS".
        h = t // 3600
        if h:
            return f'{h}:{t // 60 % 60:02d}:{t % 60:02d}'
        return f'{t // 60}:{t % 60:02d}'

    result = {}
    with PoolExecutor(max_workers=8) as executor:
        for data in tqdm.tqdm(executor.map(fetch_page, range(n_page)), total=n_page):
            for row in data['rows']:
                handle = row.pop('n')
                r = result.setdefault(handle, {})
                r['member'] = handle
                r['country'] = row.pop('c')
                r['penalty'] = time2str(row.pop('pen'))
                r['solving'] = row.pop('pts')
                r['place'] = row.pop('r')
                problems = r.setdefault('problems', {})
                solved = 0
                for idx, (attempt, time) in enumerate(zip(row.pop('att'), row.pop('ss'))):
                    if attempt:
                        p = problems.setdefault(str(idx), {})
                        if time == -1:
                            # Attempted but never solved: negative attempt count.
                            p['result'] = -attempt
                        else:
                            solved += 1
                            p['result'] = '+' if attempt == 1 else f'+{attempt - 1}'
                            p['time'] = time2str(time)
                r['solved'] = {'solving': solved}
    standings = {
        'result': result,
        'url': self.standings_url,
        'problems': list(problems_info.values()),
    }
    return standings
def get_standings(self, users=None, statistics=None):
    """Parse an olympiad standings HTML table, geocoding cities to countries.

    The geocode cache is persisted to LOCATION_CACHE_FILE in a ``finally``
    block so partial progress survives a parse failure.

    Fix: ``re.sub(..., extra, re.IGNORECASE)`` passed the flag as the
    positional ``count`` argument (re.IGNORECASE == 2 meant "at most two
    substitutions" and no case-insensitive matching); it is now passed as
    ``flags=re.IGNORECASE``.
    """
    geolocator = Nominatim(user_agent="clist.by")
    geocode_func = partial(geolocator.geocode, timeout=10)
    geocode = RateLimiter(geocode_func, min_delay_seconds=1, max_retries=3)
    season = self.key.split('.')[0]
    if not self.standings_url:
        return {}
    page = REQ.get(self.standings_url)
    # Some pages use bogus <tl> tags instead of <tr>.
    page = re.sub('<(/?)tl([^>]*)>', r'<\1tr\2>', page)
    regex = '<table[^>]*class="standings"[^>]*>.*?</table>'
    match = re.search(regex, page, re.DOTALL)
    if not match:
        # Fallback: take the LAST bordered table on the page.
        regex = r'<table\s*(?:align="center"\s*)?border="1"\s*(?:align="center"\s*)?>.*?</table>'
        matches = re.finditer(regex, page, re.DOTALL)
        for match in matches:
            pass
    if not match:
        raise ExceptionParseStandings('not found standings table')
    html_table = match.group(0)
    # Header (or cell-class) -> canonical field name.
    c_mapping = {
        'place': 'place',
        'место': 'place',
        'user': '******',
        'team': 'name',
        'участник': 'name',
        'solved': 'solved',
        'total': 'solved',
        'имя': 'first_name',
        'фамилия': 'last_name',
        'отчество': 'middle_name',
        'логин': 'login',
        'login': '******',
        'класс': 'class',
        'город': 'city',
        'субъект российской федерации (для иностранных участников - государство)': 'city',
        'балл': 'solving',
        'сумма': 'solving',
        'баллы': 'solving',
        'score': 'solving',
        'sum': 'solving',
        'диплом': 'diploma',
        'степень диплома': 'diploma',
        'номер диплома': 'diploma_number',
        'страна': 'country',
        'школа (сокр.)': 'school',
        'школа': 'school',
        'учебное зачедение, класс': 'school',
        'регион/статус': 'region',
        'регион': 'region',
        'имя в таблице': 'handle',
        'uid': 'uid',
    }
    table = parsed_table.ParsedTable(html_table)

    locations = None
    if os.path.exists(self.LOCATION_CACHE_FILE):
        with open(self.LOCATION_CACHE_FILE, 'r') as fo:
            locations = yaml.safe_load(fo)
    if locations is None:
        locations = {}

    def get_location(loc_info):
        # Normalize punctuation/whitespace before using as a cache key.
        loc_info = re.sub(r'[.,\s]+', ' ', loc_info).strip().lower()
        if loc_info not in locations:
            try:
                ru = geocode(loc_info, language='ru')
                en = geocode(loc_info, language='en')
                if ru is None and en is None:
                    locations[loc_info] = None
                else:
                    locations[loc_info] = {'ru': ru.address, 'en': en.address}
            except Exception:
                pass
        return locations.get(loc_info)

    def get_country(address):
        # Country is the last comma-separated component of the English address.
        *_, country = map(str.strip, address['en'].split(','))
        if country.startswith('The '):
            country = country[4:]
        return country

    try:
        result = {}
        problems_info = OrderedDict()
        has_bold = False
        last, place, placing = None, None, {}
        for idx, r in enumerate(tqdm.tqdm(table, total=len(table)), start=1):
            row = OrderedDict()
            problems = row.setdefault('problems', {})
            letter = chr(ord('A') - 1)
            solved = 0
            for k, v in list(r.items()):
                is_russian = bool(re.search('[а-яА-Я]', k))
                c = v.attrs.get('class')
                c = c.split()[0] if c else k.lower()
                if c and c.startswith('st_'):
                    c = c[3:].lower()
                if c in ['prob'] or c not in c_mapping and not is_russian:
                    # Unrecognized non-Russian column == next problem column.
                    letter = chr(ord(letter) + 1)
                    problem_info = problems_info.setdefault(
                        letter,
                        {
                            'short': letter,
                            'full_score': 100,
                        })
                    if letter.lower() != k.lower():
                        problem_info['name'] = k
                    if 'title' in v.attrs:
                        problem_info['name'] = v.attrs['title']
                    if v.value != DOT and v.value:
                        p = problems.setdefault(letter, {})
                        if v.column.node.xpath('b'):
                            # Bold cell marks a full solve on this page style.
                            p['partial'] = False
                            has_bold = True
                        v = v.value
                        if SPACE in v:
                            v, t = v.split(SPACE, 1)
                            p['time'] = t
                        try:
                            score = float(v)
                            p['result'] = v
                            p['partial'] = score < problem_info['full_score']
                        except ValueError:
                            pass
                        if 'partial' in p and not p['partial']:
                            solved += 1
                else:
                    v = v.value.strip()
                    if not v or v == '-':
                        continue
                    c = c_mapping.get(c, c).lower()
                    row[c] = v
                    if c == 'diploma':
                        row['_medal_title_field'] = 'diploma'
                        v = v.lower().split()[0]
                        if re.search('(^в.к|^вне)', v):
                            # Out-of-competition participant: no medal.
                            continue
                        if v in ['gold', 'i', '1'] or v.startswith('перв'):
                            row['medal'] = 'gold'
                        elif v in ['silver', 'ii', '2'] or v.startswith('втор'):
                            row['medal'] = 'silver'
                        elif v in ['bronze', 'iii', '3'] or v.startswith('трет'):
                            row['medal'] = 'bronze'
                        else:
                            row['medal'] = 'honorable'
            if 'solving' not in row:
                if 'solved' in row:
                    row['solving'] = row.pop('solved')
                else:
                    continue
            row['solved'] = {'solving': solved}
            if 'place' not in row:
                # Derive shared places from equal scores.
                if place is None and idx != 1:
                    continue
                if row['solving'] != last:
                    place = idx
                    last = row['solving']
                placing[place] = idx
                row['place'] = place
            if 'name' not in row:
                if 'first_name' in row and 'last_name' in row:
                    row['name'] = row['last_name'] + ' ' + row['first_name']
                elif 'first_name' in row and 'last_name' not in row:
                    row['name'] = row.pop('first_name')
            if 'login' in row:
                row['member'] = row['login']
                if 'name' in row:
                    row['_name_instead_key'] = True
            elif 'name' in row:
                name = row['name']
                if ' ' in name:
                    row['member'] = name + ' ' + season
                else:
                    row.pop('name')
                    row['member'] = name
            else:
                row['member'] = f'{self.pk}-{idx}'
            addition = (statistics or {}).get(row['member'], {})
            if addition:
                country = addition.get('country')
                if country:
                    row.setdefault('country', country)
            if 'country' not in row:
                locs = []
                if 'city' in row:
                    locs.append(row['city'])
                if 'extra' in row:
                    extra = row['extra']
                    # BUGFIX: flag must be keyword `flags`, not positional `count`.
                    extra = re.sub(r'\s*(Не РФ|Not RF):\s*', ' ', extra, flags=re.IGNORECASE)
                    locs.extend(extra.split(','))
                for loc in locs:
                    loc = re.sub(r'\s*[0-9]+\s*', ' ', loc)
                    loc = loc.strip()
                    address = get_location(loc)
                    if address:
                        country = get_country(address)
                        row['country'] = country
                        break
            result[row['member']] = row
        if placing:
            # Expand shared places to "first-last" ranges.
            for row in result.values():
                place = row['place']
                last = placing[place]
                row['place'] = str(place) if place == last else f'{place}-{last}'
        if has_bold:
            # With bold-marking pages, any unbolded scored cell is partial.
            for row in result.values():
                for p in row.get('problems').values():
                    if 'partial' not in p and 'result' in p:
                        p['partial'] = True
    finally:
        # Persist the geocode cache even on failure.
        with open(self.LOCATION_CACHE_FILE, 'wb') as fo:
            yaml.dump(locations, fo, encoding='utf8', allow_unicode=True)
    standings = {
        'result': result,
        'problems': list(problems_info.values()),
        'hidden_fields': [
            'extra',
            'first_name',
            'last_name',
            'middle_name',
            'class',
            'city',
            'country',
            'diploma',
            'school',
            'login',
            'region',
            'uid',
            'handle',
            'diploma_number',
        ],
    }
    return standings
def get(offset, num):
    """Fetch the ranking slice covering ranks [offset, offset + num)."""
    query = f'{{"min_rank":{offset},"num_consecutive_users":{num}}}'
    raw = REQ.get(api_ranking_url_format + encode(query))
    return decode(raw)
def get_standings(self, users=None, statistics=None):
    """Locate and parse a training-olympiad standings table.

    When no standings URL is known, it is discovered by walking the site
    menu, matching rows dated on the contest day (or the day before), and
    de-duplicating candidate result pages by their shared parent link.
    """
    standings_data = None
    if not self.standings_url:
        page = REQ.get(urljoin(self.url, '/'))
        for name in (
            'Соревнования',
            'Тренировочные олимпиады',
        ):
            match = re.search('<a[^>]*href="(?P<url>[^"]*)"[^>]*>{}<'.format(name), page)
            url = match.group('url')
            page = REQ.get(url)
        regex = '''
        <a[^>]*href=["']?[^<"']*cid=(?P<cid>[0-9]+)[^>]*>[^>]*{}[^>]*</a>.*?
        <a[^>]*href="(?P<url>[^"]*)"[^>]*>{}<
        '''.format(
            re.escape(self.name),
            re.escape('Результаты прошедших тренировок'),
        )
        match = re.search(regex, page, re.DOTALL | re.IGNORECASE | re.VERBOSE)
        if not match:
            raise ExceptionParseStandings('Not found standings urls list')
        url = match.group('url')
        cid = match.group('cid')
        last_standings_data = self.resource.info['parse']['last_standings_data'].get(cid, {})
        page = REQ.get(url)

        # Accept rows dated on the start day or the day before.
        dates = [self.start_time, self.start_time - timedelta(days=1)]
        dates = [d.strftime('%Y-%m-%d') for d in dates]
        re_dates = '|'.join(dates)

        regex = r'''
        <tr[^>]*>[^<]*<td[^>]*>\s*(?P<date>{})\s*</td>[^<]*
        <td[^>]*>(?P<title>[^<]*)</td>[^<]*
        <td[^>]*>[^<]*<a[^>]*href\s*=["\s]*(?P<url>[^">]*)["\s]*[^>]*>
        '''.format(re_dates)
        matches = re.findall(regex, page, re.MULTILINE | re.VERBOSE)

        datas = [
            {'date': date.strip(), 'title': title.strip(), 'url': urljoin(url, u)}
            for date, title, u in matches
        ]
        if len(datas) > 1:
            # Drop school/grade-restricted variants.
            regex = r'[0-9]\s*-\s*[0-9].*(?:[0-9]\s*-\s*[0-9].*\bкл\b|школа)'
            datas = [d for d in datas if not re.search(regex, d['title'], re.I)]
        if last_standings_data:
            datas = [d for d in datas if d['date'] > last_standings_data['date']]
        if not datas:
            raise ExceptionParseStandings('Not found standings url')

        if len(datas) > 1:
            _datas = [d for d in datas if d['date'] == dates[0]]
            if _datas:
                datas = _datas

        if len(datas) > 1:
            # Several candidates: prefer ones sharing the same parent page.
            ok = True
            urls_map = {}
            for d in datas:
                url = d['url']
                page = REQ.get(url)
                path = re.findall('<td[^>]*nowrap><a[^>]*href="(?P<href>[^"]*)"', page)
                if len(path) < 2:
                    ok = False
                parent = urljoin(url, path[-2])
                urls_map.setdefault(parent, d)
            if len(urls_map) > 1:
                standings_data = datas[0]
            elif not ok:
                raise ExceptionParseStandings('Too much standing url')
            else:
                standings_data = list(urls_map.values())[0]
        else:
            standings_data = datas[0]

        page = REQ.get(standings_data['url'])
        self.standings_url = REQ.last_url

    try:
        page = REQ.get(self.standings_url)
    except FailOnGetResponse as e:
        if e.code == 404:
            raise ExceptionParseStandings('Not found response from standings url')
        raise e

    def get_table(page):
        html_table = re.search('<table[^>]*bgcolor="silver"[^>]*>.*?</table>',
                               page, re.MULTILINE | re.DOTALL).group(0)
        table = parsed_table.ParsedTable(html_table)
        return table

    table = get_table(page)
    problems_info = OrderedDict()
    max_score = defaultdict(float)
    scoring = False
    result = {}
    for r in table:
        row = OrderedDict()
        problems = row.setdefault('problems', {})
        for k, v in list(r.items()):
            if k == 'Имя':
                href = v.column.node.xpath('a/@href')
                if not href:
                    continue
                uid = re.search('[0-9]+$', href[0]).group(0)
                row['member'] = uid
                row['name'] = v.value
            elif k == 'Место':
                row['place'] = v.value
            elif k == 'Время':
                row['penalty'] = int(v.value)
            elif k in ['Сумма', 'Задачи']:
                row['solving'] = float(v.value)
            elif re.match('^[a-zA-Z0-9]+$', k):
                # Alphanumeric header == problem column.
                problems_info[k] = {'short': k}
                if v.value:
                    p = problems.setdefault(k, {})
                    p['result'] = v.value
                    if v.value and v.value[0] not in ['-', '+']:
                        # Numeric result => IOI-style scoring.
                        scoring = True
                        try:
                            max_score[k] = max(max_score[k], float(v.value))
                        except ValueError:
                            pass
            elif k:
                row[k.strip()] = v.value.strip()
            elif v.value.strip().lower() == 'log':
                href = v.column.node.xpath('.//a/@href')
                if href:
                    row['url'] = urljoin(self.standings_url, href[0])
        result[row['member']] = row

    if scoring:
        # The ACM view of the same contest reveals which scores are partial.
        match = re.search(r'<b[^>]*>\s*<a[^>]*href="(?P<url>[^"]*)"[^>]*>ACM</a>\s*</b>', page)
        if match:
            page = REQ.get(match.group('url'))
            table = get_table(page)
            for r in table:
                uid = None
                for k, v in list(r.items()):
                    if k == 'Имя':
                        href = v.column.node.xpath('a/@href')
                        if not href:
                            continue
                        uid = re.search('[0-9]+$', href[0]).group(0)
                    elif re.match('^[a-zA-Z0-9]+$', k) and uid and v.value:
                        if v.value[0] == '-':
                            result[uid]['problems'][k]['partial'] = True
                        elif v.value[0] == '+':
                            result[uid]['problems'][k]['partial'] = False
                            problems_info[k]['full_score'] = result[uid]['problems'][k]['result']

    for r in result.values():
        solved = 0
        for k, p in r['problems'].items():
            if p.get('partial'):
                continue
            score = p['result']
            if score.startswith('+') or 'partial' in p and not p['partial']:
                solved += 1
            else:
                try:
                    score = float(score)
                except ValueError:
                    continue
                # Treat the best achieved score as a full solve.
                if abs(max_score[k] - score) < 1e-9 and score > 0:
                    solved += 1
        r['solved'] = {'solving': solved}

    standings = {
        'result': result,
        'url': self.standings_url,
        'problems': list(problems_info.values()),
        'info_fields': ['_standings_data'],
    }

    if result and standings_data:
        standings['_standings_data'] = standings_data
        self.resource.info['parse']['last_standings_data'][cid] = standings_data
        self.resource.save()

    return standings
def get_source_code(contest, problem):
    """Download the solution source for *problem*; raise when it lacks a url."""
    if 'url' not in problem:
        raise ExceptionParseStandings('Not found url')
    return {'solution': REQ.get(problem['url'])}
def get_standings(self, users=None, statistics=None):
    """Parse training/upsolving standings from a set of related HTML pages.

    The contest page and its "-upsolving"/"-practice" variants are merged:
    upsolving totals are stored relative to the in-contest results.
    """
    result = {}
    standings_url = None
    problems_info = OrderedDict()
    for url, upsolve in (
        (self.url, False),
        (self.url.replace(".html", "-upsolving.html"), True),
        (self.url.replace("-training-", "-practice-"), True),
        (self.key, False),
        (self.key.replace(".html", "-upsolving.html"), True),
        (self.key.replace("-training-", "-practice-"), True),
    ):
        if upsolve and url in [self.url, self.key]:
            # replace() was a no-op: this is not a real upsolving page.
            continue
        try:
            page = REQ.get(url)
        except Exception:
            continue
        if standings_url is None:
            standings_url = url
        header = None
        for match in re.findall(r'<tr[^>]*>.*?<\/tr>', page):
            match = match.replace(' ', ' ')
            fields = [
                re.sub('<[^>]*>', ' ', m).strip()
                for m in re.findall(r'<t[hd][^>]*>.*?\/t[hd]>', match)
            ]
            if re.search(r'<\/th>', match):
                header = fields
                continue
            if not header:
                continue
            fields = dict(list(zip(header, fields)))
            get_value = partial(self.get_value_by_keys_, fields)
            place = get_value('Место', 'Place')
            if not place:
                continue
            member = get_value('Логин', 'Login', 'User', 'Участник')
            row = result.setdefault(member, {'member': member})
            type_ = ('up' if upsolve else '') + 'solving'
            row[type_] = int(get_value('Всего', 'Решённые задачи', 'Total', 'Score'))
            problems = row.setdefault('problems', {})
            for k in sorted(fields.keys()):
                # Problem columns are single letters or up-to-2-digit numbers.
                if re.match('^(?:[A-Z]|[0-9]{,2})$', k):
                    problems_info[k] = {'short': k}
                    v = fields[k].split()
                    if len(v) > 0:
                        p = {'result': v[0]}
                        if len(v) > 1:
                            p['time'] = re.sub('[^0-9:]', '', v[1])
                        if upsolve:
                            a = problems.setdefault(k, {})
                            if a.get('result', None) != p['result']:
                                a['upsolving'] = p
                        else:
                            problems[k] = p
            try:
                solved = int(get_value('Решённые задачи', 'Solved problems'))
                row.setdefault('solved', {})[type_] = solved
            except ExceptionParseStandings:
                pass
            if upsolve:
                # Keep upsolving counts as deltas over the contest results.
                row['upsolving'] -= row.get('solving', 0)
                if 'solved' in row:
                    row['solved']['upsolving'] -= row['solved'].get('solving', 0)
            else:
                row['place'] = place
    if not header:
        raise ExceptionParseStandings('Not detect header')
    standings = {
        'result': result,
        'problems': list(problems_info.values()),
    }
    if standings_url is not None:
        standings['url'] = standings_url
    return standings
def fetch_page(page):
    """Return the raw HTML of standings page number ``page + 1`` (1-based)."""
    return REQ.get(f'{standings_url}&page={page + 1}')
def get_standings(self, users=None, statistics=None):
    """Discover (if needed) and parse a training-olympiad standings table.

    Older variant: matches only rows dated exactly on the contest start day.
    """
    if not self.standings_url:
        page = REQ.get(urljoin(self.url, '/'))
        for name in (
            'Соревнования',
            'Тренировочные олимпиады',
        ):
            match = re.search('<a[^>]*href="(?P<url>[^"]*)"[^>]*>{}<'.format(name), page)
            page = REQ.get(match.group('url'))
        match = re.search(
            '{}.*?<a[^>]*href="(?P<url>[^"]*)"[^>]*>{}<'.format(
                re.escape(self.name), 'Результаты прошедших тренировок'),
            page,
            re.DOTALL,
        )
        if not match:
            raise ExceptionParseStandings('Not found standing url')
        url = match.group('url')
        page = REQ.get(url)
        date = self.start_time.strftime('%Y-%m-%d')
        matches = re.findall(
            r'''
            <tr[^>]*>[^<]*<td[^>]*>{}</td>[^<]*
            <td[^>]*>(?P<title>[^<]*)</td>[^<]*
            <td[^>]*>[^<]*<a[^>]*href\s*=["\s]*(?P<url>[^">]*)["\s]*[^>]*>
            '''.format(date), page, re.MULTILINE | re.VERBOSE)
        urls = [(title, urljoin(url, u)) for title, u in matches]
        if len(urls) > 1:
            # Drop school/grade-restricted variants.
            urls = [(title, urljoin(url, u))
                    for title, u in matches
                    if not re.search(
                        r'[0-9]\s*-\s*[0-9].*(?:[0-9]\s*-\s*[0-9].*\bкл\b|школа)',
                        title, re.I)]
        if not urls:
            raise ExceptionParseStandings('Not found standing url')
        if len(urls) > 1:
            # De-duplicate candidates by their shared parent page.
            ok = True
            urls_set = set()
            for _, u in urls:
                page = REQ.get(u)
                path = re.findall('<td[^>]*nowrap><a[^>]*href="(?P<href>[^"]*)"', page)
                if len(path) < 2:
                    ok = False
                parent = urljoin(u, path[-2])
                urls_set.add(parent)
            if len(urls_set) > 1:
                _, url = urls[0]
            elif not ok:
                raise ExceptionParseStandings('Too much standing url')
            else:
                url = urls_set.pop()
        else:
            _, url = urls[0]
        page = REQ.get(url)
        self.standings_url = REQ.last_url
    else:
        page = REQ.get(self.standings_url)

    def get_table(page):
        html_table = re.search('<table[^>]*bgcolor="silver"[^>]*>.*?</table>',
                               page, re.MULTILINE | re.DOTALL).group(0)
        table = parsed_table.ParsedTable(html_table)
        return table

    table = get_table(page)
    problems_info = OrderedDict()
    max_score = defaultdict(float)
    scoring = False
    result = {}
    for r in table:
        row = OrderedDict()
        problems = row.setdefault('problems', {})
        for k, v in list(r.items()):
            if k == 'Имя':
                href = v.column.node.xpath('a/@href')
                if not href:
                    continue
                uid = re.search('[0-9]+$', href[0]).group(0)
                row['member'] = uid
                row['name'] = v.value
            elif k == 'Место':
                row['place'] = v.value
            elif k == 'Время':
                row['penalty'] = int(v.value)
            elif k in ['Сумма', 'Задачи']:
                row['solving'] = float(v.value)
            elif re.match('^[a-zA-Z0-9]+$', k):
                # Alphanumeric header == problem column.
                problems_info[k] = {'short': k}
                if v.value:
                    p = problems.setdefault(k, {})
                    p['result'] = v.value
                    if v.value and v.value[0] not in ['-', '+']:
                        # Numeric result => IOI-style scoring.
                        scoring = True
                        try:
                            max_score[k] = max(max_score[k], float(v.value))
                        except ValueError:
                            pass
            elif k:
                row[k.strip()] = v.value.strip()
            elif v.value.strip().lower() == 'log':
                href = v.column.node.xpath('.//a/@href')
                if href:
                    row['url'] = urljoin(self.standings_url, href[0])
        result[row['member']] = row

    if scoring:
        # The ACM view of the same contest reveals which scores are partial.
        match = re.search(r'<b[^>]*>\s*<a[^>]*href="(?P<url>[^"]*)"[^>]*>ACM</a>\s*</b>', page)
        if match:
            page = REQ.get(match.group('url'))
            table = get_table(page)
            for r in table:
                uid = None
                for k, v in list(r.items()):
                    if k == 'Имя':
                        href = v.column.node.xpath('a/@href')
                        if not href:
                            continue
                        uid = re.search('[0-9]+$', href[0]).group(0)
                    elif re.match('^[a-zA-Z0-9]+$', k) and uid and v.value:
                        if v.value[0] == '-':
                            result[uid]['problems'][k]['partial'] = True
                        elif v.value[0] == '+':
                            result[uid]['problems'][k]['partial'] = False
                            problems_info[k]['full_score'] = result[uid]['problems'][k]['result']

    for r in result.values():
        solved = 0
        for k, p in r['problems'].items():
            if p.get('partial'):
                continue
            score = p['result']
            if score.startswith('+') or 'partial' in p and not p['partial']:
                solved += 1
            else:
                try:
                    score = float(score)
                except ValueError:
                    continue
                # Treat the best achieved score as a full solve.
                if abs(max_score[k] - score) < 1e-9 and score > 0:
                    solved += 1
        r['solved'] = {'solving': solved}

    standings = {
        'result': result,
        'url': self.standings_url,
        'problems': list(problems_info.values()),
    }
    return standings
def get_standings(self, users=None, statistics=None):
    """Parse per-division USACO-style result pages.

    Problem shorts get the division initial as a prefix (e.g. "G1"); a
    member key is synthesized from name + country since no handles exist.
    """

    def parse_problems(page, full=False):
        # History panels list problems in index order; an index that does not
        # increase marks the start of the next problemset.
        matches = re.finditer(
            r'''
            <div[^>]*class=['"]panel\s*historypanel['"][^>]*>\s*
            <div[^>]*>\s*<h[^>]*>(?P<index>[^<]*)</h[^>]*>\s*</div>\s*
            <div[^>]*>(\s*<[^>]*>)*(?P<name>[^<]+)
            (\s*<[^>]*>)*\s*<a[^>]*href=["'](?P<url>[^"']*)["'][^>]*>
            ''', page, re.VERBOSE)
        problems = []
        problemsets = []
        prev_index = None
        for match in matches:
            index = match.group('index')
            if prev_index and index <= prev_index:
                if full:
                    problemsets.append(problems)
                    problems = []
                else:
                    break
            prev_index = index
            url = urllib.parse.urljoin(self.standings_url, match.group('url'))
            cpid = re.search('cpid=([0-9]+)', url).group(1)
            problems.append({
                'short': str(len(problems) + 1),
                'code': cpid,
                'name': match.group('name'),
                'url': url,
            })
        if problems:
            problemsets.append(problems)
        return problemsets if full else problems

    page = REQ.get(self.standings_url)
    divisions = list(
        re.finditer('<a[^>]*href="(?P<url>[^"]*data[^"]*_(?P<name>[^_]*)_results.html)"[^>]*>', page))

    # Text between consecutive division links describes that division.
    descriptions = []
    prev_span = None
    for division_match in divisions:
        curr_span = division_match.span()
        if prev_span is not None:
            descriptions.append(page[prev_span[1]:curr_span[0]])
        prev_span = curr_span
    if prev_span is not None:
        descriptions.append(page[prev_span[1]:])

    problems_info = OrderedDict()
    match = re.search('''<a[^>]*href=["'](?P<href>[^"']*page=[a-z0-9]+problems)["'][^>]*>''', page)
    if match:
        url = urllib.parse.urljoin(self.standings_url, match.group('href'))
        page = REQ.get(url)
        problemsets = parse_problems(page, full=True)
        assert len(divisions) == len(problemsets)
    else:
        problemsets = None

    result = {}
    d0_set = set()
    for division_idx, (division_match, description) in enumerate(zip(divisions, descriptions)):
        division = division_match.group('name')
        d_problems = parse_problems(description) if problemsets is None else problemsets[division_idx]
        division_info = problems_info.setdefault('division', OrderedDict())
        division_info[division] = d_problems
        d0 = division[0].upper()
        # Division initials must be unique since they prefix problem shorts.
        assert d0 not in d0_set
        d0_set.add(d0)
        for p in d_problems:
            p['short'] = d0 + p['short']
        url = urllib.parse.urljoin(self.standings_url, division_match.group('url'))
        page = REQ.get(url)
        tables = re.finditer(
            r'>(?P<title>[^<]*)</[^>]*>\s*(?P<html><table[^>]*>.*?</table>)',
            page, re.DOTALL)
        for table_match in tables:
            title = table_match.group('title')
            table = parsed_table.ParsedTable(table_match.group('html'))
            for r in table:
                row = OrderedDict()
                problems = row.setdefault('problems', {})
                solved = 0
                idx = 0
                for key, value in r.items():
                    key = key.replace(' ', ' ').strip()
                    if not key:
                        continue
                    if isinstance(value, list):
                        # Test-case verdict string, e.g. "**x*": '*' == passed.
                        status = ''.join(v.value for v in value)
                        idx += 1
                        if not status:
                            continue
                        partial = not bool(re.match(r'^[\*]+$', status))
                        solved += not partial
                        problems[d0 + str(idx)] = {
                            'partial': partial,
                            'result': 1000 / len(d_problems) * status.count('*') / len(status),
                            'status': status,
                        }
                    elif key == 'Score':
                        row['solving'] = int(value.value)
                    else:
                        row[key.lower()] = value.value.replace(' ', ' ').strip()
                row['member'] = f'{row["name"]}, {row["country"]}'
                row['division'] = division
                row['list'] = title.strip().strip(':')
                row['solved'] = {'solving': solved}
                result[row['member']] = row

    standings = {
        'result': result,
        'url': self.standings_url,
        'problems': problems_info,
        'hidden_fields': ['list'],
    }
    return standings
def get_standings(self, users=None, statistics=None):
    """Parse paginated Yandex-style standings tables (BEM `table__*` classes)."""
    if not hasattr(self, 'season'):
        # Season runs September..August.
        year = self.start_time.year - (0 if self.start_time.month > 8 else 1)
        season = f'{year}-{year + 1}'
    else:
        season = self.season
    result = {}
    problems_info = OrderedDict()
    if not re.search('/[0-9]+/', self.standings_url):
        return {}
    url = self.standings_url
    n_page = 1
    while True:
        page = REQ.get(url)
        match = re.search('<table[^>]*class="[^"]*standings[^>]*>.*?</table>',
                          page, re.MULTILINE | re.DOTALL)
        if not match:
            raise ExceptionParseStandings('Not found table standings')
        html_table = match.group(0)
        unnamed_fields = self.info.get('standings', {}).get('unnamed_fields', [])
        table = parsed_table.ParsedTable(html_table, unnamed_fields=unnamed_fields)
        for r in table:
            row = {}
            problems = row.setdefault('problems', {})
            solved = 0
            has_solved = False
            for k, v in list(r.items()):
                if 'table__cell_role_result' in v.attrs['class']:
                    letter = k.split(' ', 1)[0]
                    if letter == 'X':
                        continue
                    p = problems_info.setdefault(letter, {'short': letter})
                    names = v.header.node.xpath('.//span/@title')
                    if len(names) == 1:
                        p['name'] = names[0]
                    p = problems.setdefault(letter, {})
                    n = v.column.node
                    if n.xpath('img[contains(@class,"image_type_success")]'):
                        res = '+'
                        p['binary'] = True
                    elif n.xpath('img[contains(@class,"image_type_fail")]'):
                        res = '-'
                        p['binary'] = False
                    else:
                        if ' ' not in v.value:
                            # No icon and no "<result> <time>" pair: skip cell.
                            problems.pop(letter)
                            continue
                        res = v.value.split(' ', 1)[0]
                        res = res.replace(',', '')
                    p['result'] = res
                    p['time'] = v.value.split(' ', 1)[-1]
                    if 'table__cell_firstSolved_true' in v.attrs['class']:
                        p['first_ac'] = True
                    if '+' in res or res.startswith('100'):
                        solved += 1
                    try:
                        has_solved = has_solved or '+' not in res and float(res) > 0
                    except ValueError:
                        pass
                elif 'table__cell_role_participant' in v.attrs['class']:
                    title = v.column.node.xpath('.//@title')
                    if title:
                        name = str(title[0])
                    else:
                        name = v.value.replace(' ', '', 1)
                    row['name'] = name
                    row['member'] = name if ' ' not in name else f'{name} {season}'
                    country = v.column.node.xpath(
                        ".//div[contains(@class,'country-flag')]/@title")
                    if country:
                        row['country'] = str(country[0])
                elif 'table__cell_role_place' in v.attrs['class']:
                    row['place'] = v.value
                elif 'table__header_type_penalty' in v.attrs['class']:
                    row['penalty'] = int(v.value) if re.match('^-?[0-9]+$', v.value) else v.value
                elif 'table__header_type_score' in v.attrs['class']:
                    row['solving'] = float(v.value.replace(',', ''))
            if has_solved:
                row['solved'] = {'solving': solved}
            if not problems:
                continue
            result[row['member']] = row
        n_page += 1
        match = re.search(
            f'<a[^>]*href="(?P<href>[^"]*standings[^"]*p[^"]*={n_page})"[^>]*>', page)
        if not match:
            break
        url = urljoin(url, match.group('href'))
    standings = {
        'result': result,
        'url': self.standings_url,
        'problems': list(problems_info.values()),
    }
    return standings
def fetch_page(page):
    """Fetch ranking page ``page + 1`` (1-based) and return the parsed JSON."""
    return json.loads(REQ.get(api_ranking_url_format.format(page + 1)))
def get_standings(self, users=None, statistics=None):
    """Parse an `ir-contest-standings` HTML table.

    Handles both ICPC-style ('+'/'-' results) and score-style tables; for the
    latter a `solved` counter is derived from positive numeric scores.
    """
    # Season runs September..August.
    year = self.start_time.year - (0 if self.start_time.month > 8 else 1)
    season = f'{year}-{year + 1}'
    result = {}
    page = REQ.get(self.standings_url)
    table = parsed_table.ParsedTable(
        html=page, xpath="//table[@class='ir-contest-standings']//tr")
    problems_info = collections.OrderedDict()
    has_plus = False
    for r in table:
        row = collections.OrderedDict()
        problems = row.setdefault('problems', {})
        ioi_total_fields = ['Sum', 'Сумма']
        # ioi_style = any((f in r for f in ioi_total_fields))
        for k, v in list(r.items()):
            classes = v.attrs['class'].split()
            if 'ir-column-contestant' in classes:
                row['member'] = v.value + ' ' + season
                row['name'] = v.value
            elif 'ir-column-place' in classes:
                row['place'] = v.value
            elif 'ir-column-penalty' in classes:
                row['penalty'] = int(v.value)
            elif 'ir-problem-count' in classes or k in ioi_total_fields:
                row['solving'] = int(v.value)
            elif len(k.split()[0]) == 1:
                # Single-letter header == problem column.
                letter = k.split()[0]
                problems_info[letter] = {'short': letter}
                if v.value == DOT:
                    continue
                p = problems.setdefault(letter, {})
                values = v.value.replace('−', '-').split(' ')
                p['result'] = values[0]
                if p['result'].startswith('+'):
                    has_plus = True
                elif v.column.node.xpath('.//*[@class="ir-rejected"]'):
                    p['partial'] = True
                if len(values) > 1:
                    p['time'] = values[1]
            else:
                row[k.lower()] = v.value
        if not problems or users and row['member'] not in users:
            continue
        member = row['member']
        if member in result:
            # Disambiguate duplicate names with a numeric suffix.
            idx = 0
            while member + f'-{idx}' in result:
                idx += 1
            member += f'-{idx}'
            row['member'] = member
        result[member] = row
    if not has_plus:
        # Score-style table: count positive non-partial scores as solves.
        for row in result.values():
            solved = 0
            for p in row['problems'].values():
                if p.get('partial'):
                    continue
                try:
                    score = float(p['result'])
                    if score > 0:
                        solved += 1
                except Exception:
                    pass
            row['solved'] = {'solving': solved}
    standings = {
        'result': result,
        'url': self.standings_url,
        'problems': list(problems_info.values()),
        'problems_time_format': '{H}:{m:02d}',
    }
    return standings