Пример #1
0
        def get_from_icpc(year):
            """Count medals per kind from the icpc.global results page of ``year``.

            Downloads the community results JSON, extracts the ``medalTable``
            HTML table from its ``content`` field and counts the rows whose
            first cell carries a ``gold``/``silver``/``bronze`` CSS class.

            Returns:
                An ``OrderedDict`` keyed ``gold``, ``silver``, ``bronze`` with
                the number of rows of each kind, or ``None`` when the response
                is not JSON, contains no medal table, or no medal row was
                recognised.
            """
            medal_result_url = f'https://icpc.global/api/help/cms/virtpublic/community/results-{year}'
            page = REQ.get(medal_result_url)
            try:
                json_data = json.loads(page)
            except json.decoder.JSONDecodeError:
                return None
            regex = '''<table[^>]*id=["']medalTable[^>]*>.*?</table>'''
            match = re.search(regex, json_data['content'], re.DOTALL)
            if not match:
                return None

            table = parsed_table.ParsedTable(match.group(0))
            fields = ('gold', 'silver', 'bronze')
            medals = OrderedDict((f, 0) for f in fields)
            for r in table:
                # Only the first cell of the row is inspected for a medal class.
                _, v = next(iter(r.items()))
                for attr in v.attrs.get('class', '').split():
                    if attr in fields:
                        medals[attr] += 1
                        break
            # The counters are pre-seeded, so the dict itself is never empty;
            # "nothing found" means every counter is still zero.
            if not any(medals.values()):
                return None
            return medals
Пример #2
0
        def fetch_ratings(user, account):
            """Scrape the rating history and profile stats of ``user`` from toph.co.

            Returns a ``(user, info, ratings)`` triple.  ``info`` is ``False``
            for virtual accounts and for failed requests, ``None`` when the
            ratings page answers 404; in those cases ``ratings`` is ``None``.
            Otherwise ``info`` maps stat names to values (plus ``rating`` taken
            from the first table row) and ``ratings`` maps the last path segment
            of each contest link to ``{'new_rating': ...}``.
            """
            if account.info.get('is_virtual'):
                return user, False, None

            try:
                page = REQ.get(f'https://toph.co/u/{user}/ratings')
            except FailOnGetResponse as e:
                status = None if e.code == 404 else False
                return user, status, None

            # The rating history is expected in the last table of the page.
            html_tables = re.findall('<table[^>]*>.*?</table>', page, re.DOTALL)
            rating_table = parsed_table.ParsedTable(html=html_tables[-1])

            info = {}
            ratings = {}
            for row in rating_table:
                contest_href = row['Contest'].column.node.xpath('.//a/@href')[0]
                contest_key = contest_href.rstrip('/').split('/')[-1]
                new_rating = int(row['Rating'].value)
                ratings[contest_key] = {'new_rating': new_rating}
                # Keep the rating of the first row only.
                if 'rating' not in info:
                    info['rating'] = new_rating

            # Profile stats are rendered as a "value" div followed by a "title" div.
            stat_pattern = '''
                 <div[^>]*class="?value"?[^>]*>(?P<value>[^<]*)</div>[^<]*
                 <div[^>]*class="?title"?>(?P<key>[^<]*)</div>
            '''
            for found in re.finditer(stat_pattern, page, re.DOTALL | re.VERBOSE):
                info[found.group('key').lower()] = found.group('value')
            return user, info, ratings
Пример #3
0
    def fetch_submissions(self, fuser=None, c_page=1):
        """Fetch one page of the submissions list and parse its table.

        Args:
            fuser: optional user filter, sent as the ``f.User`` query parameter.
            c_page: number of the page to request.

        Returns:
            ``(url, page, table, c_page, n_page)`` where ``n_page`` is the
            largest page number linked from the pagination (``c_page`` when the
            page has no pagination links), or ``None`` when ``self._stop`` is
            set or every fetch attempt failed.
        """
        url = self.SUBMISSIONS_URL_.format(self) + f'?page={c_page}'
        if fuser:
            url += f'&f.User={fuser}'

        # Up to four attempts with a linearly growing pause between them.
        for attempt in range(4):
            if self._stop:
                return

            try:
                page = self._get(url)
                break
            except FailOnGetResponse:
                time.sleep(attempt)
        else:
            return

        regex = '<table[^>]*>.*?</table>'
        html_table = re.search(regex, page, re.DOTALL).group(0)
        table = parsed_table.ParsedTable(html_table,
                                         with_duplicate_colspan=True)

        pages = re.findall(
            r'''<a[^>]*href=["'][^"']*/submissions\?[^"']*page=([0-9]+)[^"']*["'][^>]*>[0-9]+</a>''',
            page)  # noqa
        # A listing that fits on a single page has no pagination links at all;
        # max() over an empty sequence would raise ValueError.
        n_page = max(map(int, pages), default=c_page)

        return url, page, table, c_page, n_page
Пример #4
0
    def get_standings(self, users=None, statistics=None):
        """Parse the ``ir-contest-standings`` table at ``self.standings_url``.

        Args:
            users: optional collection of member keys; when truthy, rows whose
                member key is not in it are skipped.
            statistics: not used by this implementation.

        Returns:
            A dict with ``result`` (member key -> row), ``url``, ``problems``
            (list of ``{'short': letter}``) and ``problems_time_format``.
        """
        # Season label "Y-(Y+1)": contests in September or later belong to the
        # season that starts in their own year, earlier ones to the previous.
        year = self.start_time.year - (0 if self.start_time.month > 8 else 1)
        season = f'{year}-{year + 1}'

        result = {}

        page = REQ.get(self.standings_url)
        table = parsed_table.ParsedTable(
            html=page, xpath="//table[@class='ir-contest-standings']//tr")
        problems_info = collections.OrderedDict()
        for r in table:
            row = collections.OrderedDict()
            problems = row.setdefault('problems', {})
            # A "Sum"/"Сумма" column marks a table with numeric (IOI-style) scores.
            ioi_total_fields = ['Sum', 'Сумма']
            ioi_style = any((f in r for f in ioi_total_fields))
            for k, v in list(r.items()):
                # Cells are classified by CSS class first, then by header text.
                classes = v.attrs['class'].split()
                if 'ir-column-contestant' in classes:
                    # The member key is the displayed name plus the season.
                    row['member'] = v.value + ' ' + season
                    row['name'] = v.value
                elif 'ir-column-place' in classes:
                    row['place'] = v.value
                elif 'ir-column-penalty' in classes:
                    row['penalty'] = int(v.value)
                elif 'ir-problem-count' in classes or k in ioi_total_fields:
                    row['solving'] = int(v.value)
                elif len(k.split()[0]) == 1:
                    # A header whose first token is a single character is
                    # treated as a problem column.
                    letter = k.split()[0]
                    problems_info[letter] = {'short': letter}
                    # DOT is defined outside this block; presumably the
                    # placeholder for "no attempt" - confirm.
                    if v.value == DOT:
                        continue
                    p = problems.setdefault(letter, {})
                    # Normalise the Unicode minus sign before splitting into
                    # "<result> <time>".
                    values = v.value.replace('−', '-').split(' ')
                    p['result'] = values[0]
                    if len(values) > 1:
                        p['time'] = values[1]
                    if ioi_style and p['result'].isdigit():
                        val = int(p['result'])
                        if val:
                            # Non-zero scores below 100 are flagged as partial.
                            p['partial'] = val < 100
                else:
                    row[k.lower()] = v.value
            # Parsed as: (not problems) or (users and member not in users).
            if not problems or users and row['member'] not in users:
                continue
            member = row['member']
            if member in result:
                # Disambiguate duplicate member keys with a numeric suffix.
                idx = 0
                while member + f'-{idx}' in result:
                    idx += 1
                member += f'-{idx}'
                row['member'] = member
            result[member] = row
        standings = {
            'result': result,
            'url': self.standings_url,
            'problems': list(problems_info.values()),
            'problems_time_format': '{H}:{m:02d}',
        }
        return standings
Пример #5
0
    def get_standings(self, users=None, statistics=None):
        """Parse the ``past_event_rating`` table found at ``self.url``.

        Returns ``{'action': 'delete'}`` when the page answers 404 and ``{}``
        on any other failed request.  Otherwise returns a dict with
        ``standings_url``, ``result`` (member id -> row) and ``options`` holding
        the medal configuration.

        Raises:
            ExceptionParseStandings: when the page contains no such table.
        """
        try:
            page = REQ.get(self.url)
        except FailOnGetResponse as e:
            if e.code == 404:
                return {'action': 'delete'}
            return {}

        table_match = re.search('<table[^>]*past_event_rating[^>]*>.*?</table>',
                                page, re.DOTALL)
        if not table_match:
            raise ExceptionParseStandings('not found table')

        table = parsed_table.ParsedTable(
            html=table_match.group(0),
            header_mapping={
                'Team': 'name',
                'Place': 'place',
                'CTF points': 'solving',
            },
        )

        results = {}
        max_score = 0
        for table_row in table:
            row = OrderedDict()
            for header, cell in table_row.items():
                field = header.strip('*').strip(' ')
                if isinstance(cell, list):
                    value = ' '.join([c.value for c in cell]).strip()
                else:
                    value = cell.value
                if field == 'name':
                    # The member id is the trailing number of the team link.
                    href = cell.column.node.xpath('.//a/@href')[0]
                    row['member'] = re.search('/([0-9]+)/?$', href).group(1)
                    row['name'] = value
                else:
                    value = as_number(value)
                row[field] = value
            max_score = max(max_score, row.get('solving', 0))
            results[row['member']] = row

        # Express every score as a percentage of the best one.
        if max_score > 0:
            for row in results.values():
                if 'solving' in row:
                    row['percent'] = f'{row["solving"] * 100 / max_score:.2f}'

        # Only events named like a final (and not a qualifier) award a medal.
        is_qualifier = re.search(r'\bqual', self.name, flags=re.I)
        has_medals = not is_qualifier and re.search(r'\bfinal', self.name, flags=re.I)
        medals = [{'name': 'gold', 'count': 1}] if has_medals else []

        return {
            'standings_url': self.url,
            'result': results,
            'options': {'medals': medals},
        }
Пример #6
0
 def fetch_members(r):
     """Append member handles from ``r['members_url']`` to ``r['members']``.

     The ``members_url`` key is removed from ``r``.  When it holds a URL, the
     JSON response's ``data`` field is parsed as a table and the last
     space-separated token of each ``Developer`` cell is appended.
     Returns ``r``.
     """
     url = r.pop('members_url', None)
     if not url:
         return r

     payload = json.loads(self._get(url))
     members_table = parsed_table.ParsedTable(payload['data'])
     for member_row in members_table:
         developer = member_row['Developer'].value.strip()
         _, member = developer.rsplit(' ', 1)
         r['members'].append(member)
     return r
Пример #7
0
    def get_standings(self, users=None, statistics=None):
        """Parse the first HTML table of the results page into standings.

        When ``self.standings_url`` is empty it is derived from ``self.url`` by
        replacing ``/olympiads/`` with ``/results/``.  Columns whose header has
        the ``taskscore`` class are numbered in order and treated as problems
        with a full score of 100.

        Returns a dict with ``result`` (member -> row), ``url`` and ``problems``.
        """
        if not self.standings_url:
            self.standings_url = self.url.replace('/olympiads/', '/results/')

        page = REQ.get(self.standings_url)
        html_table = re.search('<table[^>]*>.*?</table>', page, re.DOTALL).group(0)
        table = parsed_table.ParsedTable(html_table, as_list=True)

        result = {}
        problems_info = OrderedDict()
        for cells in table:
            row = OrderedDict()
            problems = row.setdefault('problems', {})
            problem_idx = 0
            for header, cell in cells:
                header_classes = cell.header.attrs.get('class', '').split()
                if 'taskscore' in header_classes:
                    problem_idx += 1
                    short = str(problem_idx)
                    info = problems_info.setdefault(problem_idx, {})
                    info['short'] = short
                    info['full_score'] = 100
                    info['name'] = header
                    # Cells that do not hold a number are left out of the row.
                    try:
                        score = float(cell.value)
                        problem = problems.setdefault(short, {})
                        problem['result'] = cell.value
                        problem['partial'] = score < 100
                    except Exception:
                        pass
                elif header == 'Abs.':
                    row['solving'] = float(cell.value)
                elif header == 'Rank':
                    row['place'] = cell.value.strip('*').strip('.')
                elif header == 'Contestant':
                    # The member key is the last path segment of the profile link.
                    url = first(cell.column.node.xpath('a[@href]/@href'))
                    row['member'] = url.strip('/').split('/')[-1]
                    row['name'] = cell.value
                elif header == 'Country':
                    row['country'] = re.sub(r'\s*[0-9]+$', '', cell.value)
                else:
                    row[header] = cell.value
            result[row['member']] = row

        return {
            'result': result,
            'url': self.standings_url,
            'problems': list(problems_info.values()),
        }
Пример #8
0
                    def fetch_problem(p):
                        """Enrich problem dict ``p`` with data scraped from its detail page.

                        Follows the ``module=ProblemDetail`` link found on ``p['url']`` and
                        fills ``p['tags']``, ``p['writers']``/``p['testers']``, ``p['info']``
                        and ``p['full_score']`` where the page provides them.  ``writers``
                        and ``division_str`` are taken from the enclosing scope.

                        Makes up to three attempts and stops at the first one that
                        succeeds; errors of failed attempts are logged.  Always returns
                        ``p``.
                        """
                        errors = set()
                        for attempt in range(3):
                            try:
                                page = REQ.get(p['url'], time_out=30)
                                match = re.search('<a[^>]*href="(?P<href>[^"]*module=ProblemDetail[^"]*)"[^>]*>', page)
                                page = REQ.get(urljoin(p['url'], match.group('href')), time_out=30)
                                matches = re.findall(r'<td[^>]*class="statTextBig"[^>]*>(?P<key>[^<]*)</td>\s*<td[^>]*>(?P<value>.*?)</td>', page, re.DOTALL)  # noqa
                                for key, value in matches:
                                    key = key.strip().rstrip(':').lower()
                                    if key == 'categories':
                                        tags = [t.strip().lower() for t in value.split(',')]
                                        tags = [t for t in tags if t]
                                        if tags:
                                            p['tags'] = tags
                                    elif key.startswith('writer') or key.startswith('tester'):
                                        # Normalise singular/plural labels to the plural key.
                                        key = key.rstrip('s') + 's'
                                        p[key] = re.findall('(?<=>)[^<>,]+(?=<)', value)

                                info = p.setdefault('info', {})
                                matches = re.finditer('<table[^>]*paddingTable2[^>]*>.*?</table>', page, re.DOTALL)
                                for match in matches:
                                    html_table = match.group(0)
                                    rows = parsed_table.ParsedTable(html_table)
                                    for row in rows:
                                        # The unnamed column holds the stat name; the
                                        # column whose header mentions the division
                                        # holds its value.
                                        key, value = None, None
                                        for k, v in row.items():
                                            if k == "":
                                                key = v.value
                                            elif k and division_str in k.split():
                                                value = v.value
                                        if key and value:
                                            key = re.sub(' +', '_', key.lower())
                                            info[key] = value
                                            if key == 'point_value':
                                                value = toint(value) or asfloat(value)
                                                if value is not None:
                                                    p['full_score'] = value

                                # Count authors only once the whole page has been
                                # parsed, so a retried attempt cannot count them twice.
                                for w in p.get('writers', []):
                                    writers[w] += 1
                                # Success: without this break every problem was fetched
                                # three times and its writers were counted three times.
                                break
                            except Exception as e:
                                errors.add(f'error parse problem info {p}: {e}')
                                sleep(5 + attempt)
                        if errors:
                            LOG.error(errors)

                        return p
Пример #9
0
    def get_standings(self, users=None, statistics=None):
        """Parse the "fastest solvers" table of a Project Euler problem.

        ``self.standings_url`` defaults to the ``fastest=<key>`` page.  Each
        table row becomes a result row with ``solving`` fixed to 1; the
        "Time To Solve" text is converted to minutes and stored as ``penalty``.
        Rows without a ``User`` column are dropped.

        Returns a dict with ``result`` (member -> row), ``url`` and an empty
        ``problems`` list.
        """
        if not self.standings_url:
            self.standings_url = f'https://projecteuler.net/fastest={self.key}'

        page = REQ.get(self.standings_url, headers=conf.PROJECTEULER_COOKIE_HEADER)
        html_table = re.search('<table[^>]*>.*?</table>', page, re.DOTALL).group(0)

        result = {}
        for table_row in parsed_table.ParsedTable(html_table):
            row = OrderedDict()
            row['solving'] = 1
            for header, cell in table_row.items():
                if isinstance(cell, list):
                    # A list value is unpacked as the (place, country) pair.
                    place, country = cell
                    row['place'] = re.match('[0-9]+', place.value).group(0)
                    country = first(country.column.node.xpath('.//@title'))
                    if country:
                        row['country'] = country
                elif header == 'Time To Solve':
                    # "1 hour, 5 minutes" -> {'hours': 1, 'minutes': 5}
                    params = {}
                    for part in cell.value.split(', '):
                        amount, unit = part.split()
                        if not unit.endswith('s'):
                            unit += 's'
                        params[unit] = int(amount)
                    delta = timedelta(**params)
                    row['penalty'] = f'{delta.total_seconds() / 60:.2f}'
                elif header == 'User':
                    row['member'] = first(cell.column.node.xpath('.//@title')) or cell.value
                else:
                    row[header.lower()] = cell.value
            if 'member' in row:
                result[row['member']] = row

        return {
            'result': result,
            'url': self.standings_url,
            'problems': [],
        }
Пример #10
0
        def parse_problems_infos():
            """Parse the problems page belonging to the current standings.

            The problems URL is ``self.standings_url`` with ``/ranking`` replaced
            by ``/p``.  Rows yielded as ``ParsedTableRow`` are read as round
            headings: problems under a heading whose name (without a trailing
            parenthesised part) does not occur in ``self.name`` are skipped.

            Returns:
                An ``OrderedDict`` mapping the problem short name to a dict with
                ``short``, ``name`` and, when available, ``_letter`` and ``url``.

            Raises:
                ExceptionParseStandings: when no table follows the page heading.
            """
            problem_url = self.standings_url.replace('/ranking', '/p')
            page = REQ.get(problem_url)

            match = re.search(
                r'<h1[^>]*>[^<]*</h1>(\s*<[^/][^>]*>)*\s*(?P<table><table[^>]*>.*?</table>)',
                page, re.DOTALL)
            if not match:
                raise ExceptionParseStandings('Not found problems table')
            table = parsed_table.ParsedTable(html=match.group('table'),
                                             ignore_wrong_header_number=False)
            skip = False
            problems_infos = collections.OrderedDict()
            for r in table:
                if isinstance(r, parsed_table.ParsedTableRow):
                    runda = re.sub(r'\s*\(.*\)\s*$', '',
                                   r.columns[0].value).strip()
                    skip = runda.lower() not in self.name.lower()
                    continue

                if skip:
                    continue

                problem_info = {}
                for k, vs in list(r.items()):
                    # A header may map to one cell or to a list of cells.
                    cells = vs if isinstance(vs, list) else [vs]
                    if isinstance(vs, list):
                        v = ' '.join([c.value for c in cells]).strip()
                    else:
                        v = vs.value
                    if not k:
                        problem_info['short'] = v
                    elif k in ('Nazwa', 'Name'):
                        letter_match = re.search(r'\[(?P<letter>[^\]]+)\]$', v)
                        if letter_match:
                            problem_info['_letter'] = letter_match.group('letter')
                        problem_info['name'] = v
                        # The XPath must be relative ('.//a'): an absolute '//a'
                        # is evaluated against the whole document, not this cell.
                        href = [h for c in cells
                                for h in c.column.node.xpath('.//a/@href')]
                        if href:
                            problem_info['url'] = urljoin(problem_url, href[0])
                if problem_info:
                    problems_infos[problem_info['short']] = problem_info
            return problems_infos
Пример #11
0
    def _get_medals(year):
        """Return the number of gold/silver/bronze medals awarded in ``year``.

        Discovers the XWIKI base URL from the main JavaScript bundle of
        icpc.baylor.edu, downloads ``community/results-<year>`` from it and
        counts the ``medalTable`` rows whose first cell carries a medal CSS
        class.

        Returns:
            An ``OrderedDict`` keyed ``gold``, ``silver``, ``bronze``.  Falls
            back to four medals of each kind when the bundle, the XWIKI URL or
            the medal table cannot be found, or when no medal row was
            recognised.
        """
        default = OrderedDict([(k, 4) for k in ('gold', 'silver', 'bronze')])

        main_url = 'https://icpc.baylor.edu/'
        page = REQ.get(main_url)
        match = re.search('src="(?P<js>/static/js/main.[^"]*.js)"', page)
        if not match:
            return default
        js_url = match.group('js')

        page = REQ.get(js_url)
        match = re.search('XWIKI:"(?P<xwiki>[^"]*)"', page)
        if not match:
            return default
        xwiki_url = match.group('xwiki')
        # Ensure exactly one trailing slash so the next urljoin appends to it.
        xwiki_url = urljoin(main_url, xwiki_url).rstrip('/') + '/'

        medal_result_url = urljoin(xwiki_url, f'community/results-{year}')

        page = REQ.get(medal_result_url)
        json_data = json.loads(page)
        regex = '''<table[^>]*id=["']medalTable[^>]*>.*?</table>'''
        match = re.search(regex, json_data['content'], re.DOTALL)
        if not match:
            return default

        table = parsed_table.ParsedTable(match.group(0))
        fields = ('gold', 'silver', 'bronze')
        medals = OrderedDict((f, 0) for f in fields)
        for r in table:
            # Only the first cell of the row is inspected for a medal class.
            _, v = next(iter(r.items()))
            for attr in v.attrs.get('class', '').split():
                if attr in fields:
                    medals[attr] += 1
                    break
        # The counters are pre-seeded, so the dict itself is never empty;
        # "nothing found" means every counter is still zero.
        if not any(medals.values()):
            return default
        return medals
Пример #12
0
        def fetch_table(page):
            nonlocal web_archive_url
            nonlocal total_num_pages
            nonlocal standings_url
            url = standings_url
            if n_page > 1:
                url += f'/page/{page}'
            if not web_archive_url:
                url += '?locale=en'

            page = Statistic.get(url)

            match = re.search('<title>[^<]*-(?P<name>[^<]*)</title>', page)
            if codename not in match.group('name'):
                return

            if total_num_pages is None:
                matches = re.findall(
                    '<span[^>]*class="[^"]*page-index[^"]*"[^>]*pageindex="([0-9]+)"[^>]*>',
                    page,
                    re.I,
                )
                if matches:
                    total_num_pages = int(matches[-1])

            regex = '''<table[^>]*class="[^>]*table[^>]*"[^>]*>.*?</table>'''
            match = re.search(regex, page, re.DOTALL)
            table = parsed_table.ParsedTable(
                match.group(0),
                header_mapping={
                    '№': '#',
                    'Участник': 'Participant',
                    'Бои': 'Games',
                    'Игры': 'Games',
                    'Побед': 'Won',
                    'Рейтинг': 'Rating',
                    'Язык': 'Language',
                },
            )
            return table
Пример #13
0
    def get_standings(self, users=None, statistics=None):
        """Parse the ``ir-contest-standings`` table at ``self.standings_url``.

        The season is the first whitespace-separated token of ``self.key`` and
        is appended to each contestant name to form the member key.  Every
        column without a recognised CSS class is treated as a problem column.

        Returns a dict with ``result`` (member -> row), ``url`` and ``problems``.
        """
        season = self.key.split()[0]

        page = REQ.get(self.standings_url)
        table = parsed_table.ParsedTable(
            html=page, xpath="//table[@class='ir-contest-standings']//tr")

        result = {}
        problems_info = collections.OrderedDict()
        for table_row in table:
            problems = {}
            row = {'problems': problems}
            for header, cell in list(table_row.items()):
                classes = cell.attrs['class'].split()
                if 'ir-column-contestant' in classes:
                    row['member'] = cell.value + ' ' + season
                    row['name'] = cell.value
                elif 'ir-column-place' in classes:
                    row['place'] = cell.value
                elif 'ir-column-penalty' in classes:
                    row['penalty'] = int(cell.value)
                elif 'ir-problem-count' in classes:
                    row['solving'] = int(cell.value)
                else:
                    letter = header.split()[0]
                    problems_info[letter] = {'short': letter}
                    # Cells equal to DOT record nothing for this contestant.
                    if cell.value != DOT:
                        problem = problems.setdefault(letter, {})
                        # Normalise the Unicode minus, then split "<result> <time>".
                        parts = cell.value.replace('−', '-').split(' ')
                        problem['result'] = parts[0]
                        if len(parts) > 1:
                            problem['time'] = parts[1]
            result[row['member']] = row

        return {
            'result': result,
            'url': self.standings_url,
            'problems': list(problems_info.values()),
        }
Пример #14
0
    def get_standings(self, users=None, statistics=None):
        """Parse the ``monitor`` table of the standings page (English locale).

        Recognises both Russian and English column headers.  One-character
        headers that carry a titled link are treated as problem columns; all
        other unrecognised columns are ignored.

        Returns a dict with ``result`` (member id -> row), ``url`` and ``problems``.
        """
        separator = '&' if '?' in self.standings_url else '?'
        page = REQ.get(self.standings_url + separator + 'locale=en')
        table = parsed_table.ParsedTable(html=page, xpath="//table[@class='monitor']//tr")

        result = {}
        problems_info = collections.OrderedDict()
        for table_row in table:
            problems = {}
            row = {'problems': problems}
            for header, cell in list(table_row.items()):
                title = first(cell.header.node.xpath('a[@title]/@title'))
                if header in ('Участник', 'Participant'):
                    # The member id is the trailing number of the profile link.
                    profile_url = first(cell.column.node.xpath('a[@href]/@href'))
                    row['member'] = re.search('([0-9]+)/?$', profile_url).group(1)
                    row['name'] = cell.value
                elif header in ('Место', 'Rank'):
                    row['place'] = cell.value
                elif header in ('Время', 'Time'):
                    row['penalty'] = int(cell.value)
                elif header in ('Решено', 'Solved'):
                    row['solving'] = int(cell.value)
                elif len(header) == 1 and title is not None:
                    problem_info = {'short': header, 'name': title}
                    problems_info[header] = problem_info
                    problem_url = first(cell.header.node.xpath('a[@href]/@href'))
                    if problem_url is not None:
                        problem_info['url'] = urllib.parse.urljoin(self.standings_url, problem_url)
                    if cell.value:
                        problem = problems.setdefault(header, {})
                        # Normalise the en dash, then split "<result> <time>".
                        parts = cell.value.replace('–', '-').split(' ')
                        problem['result'] = parts[0]
                        if len(parts) > 1:
                            problem['time'] = parts[1]
            result[row['member']] = row

        return {
            'result': result,
            'url': self.standings_url,
            'problems': list(problems_info.values()),
        }
Пример #15
0
    def get_standings(self, users=None, statistics=None):
        """Fetch the standings page through a proxy and parse its first table.

        Prints ``self.standings_url`` before fetching.  Each table row becomes
        a dict of its non-empty cells; rows without a ``member`` field get
        ``"<name> <season>"`` as member key.

        Returns:
            ``{'result': {member: row}}``.

        Raises:
            ExceptionParseStandings: when the page contains no HTML table.
        """
        season = self.get_season()

        def standings_page(req):
            return req.get(self.standings_url)

        print(self.standings_url)
        proxy_options = dict(
            time_limit=3,
            n_limit=30,
            connect=standings_page,
        )
        with REQ(with_proxy=True, args_proxy=proxy_options) as req:
            page = req.proxer.get_connect_ret()

        found = re.search('<table[^>]*>.*?</table>', page, re.MULTILINE | re.DOTALL)
        if not found:
            raise ExceptionParseStandings('Not found html table')
        table = parsed_table.ParsedTable(
            found.group(0),
            header_mapping={
                'Rank': 'place',
                'Name': 'name',
                'Language': 'language',
            },
        )

        result = {}
        for table_row in table:
            # Keep only the cells that actually hold a value.
            row = {header: cell.value for header, cell in table_row.items() if cell.value}
            if 'member' not in row:
                row['member'] = f'{row["name"]} {season}'
            result[row['member']] = row

        return {'result': result}
Пример #16
0
 def get_table(page):
     """Return the first table with ``bgcolor="silver"`` in ``page`` as a ParsedTable."""
     flags = re.MULTILINE | re.DOTALL
     found = re.search('<table[^>]*bgcolor="silver"[^>]*>.*?</table>', page, flags)
     return parsed_table.ParsedTable(found.group(0))
Пример #17
0
    def get_standings(self, users=None, statistics=None):
        """Parse the "fastest solvers" table of a Project Euler problem.

        Signs in first (solving the captcha with OCR) when the fetched page
        shows no sign-out form.  Every table row becomes a result row with
        ``solving`` fixed to 1 and a single solved problem named after the
        contest.

        Returns:
            A dict with ``result`` (member -> row), ``url`` and ``problems``;
            ``timing_statistic_delta`` is added when fewer than 100 rows were
            found and the contest started less than 30 days ago.

        Raises:
            ExceptionParseStandings: when sign-in keeps failing after 20 attempts.
        """
        if not self.standings_url:
            self.standings_url = f'https://projecteuler.net/fastest={self.key}'

        user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'  # noqa
        page = REQ.get(self.standings_url, headers={'User-Agent': user_agent})

        # A sign-out form is only present for an authenticated session.
        sign_out = re.search('<form[^>]*action="sign_out"[^>]*>', page)
        if not sign_out:
            for attempt in range(20):
                # Request new captcha images until the OCR yields exactly five digits.
                while True:
                    value = f'{random.random():.16f}'
                    image_bytes = REQ.get(f'https://projecteuler.net/captcha/show_captcha.php?{value}')
                    image_stream = io.BytesIO(image_bytes)
                    image_rgb = Image.open(image_stream)
                    text = pytesseract.image_to_string(image_rgb, config='--oem 0 --psm 13 digits')
                    text = text.strip()
                    if re.match('^[0-9]{5}$', text):
                        break

                REQ.get('https://projecteuler.net/sign_in')
                page = REQ.submit_form(
                    name='sign_in_form',
                    action=None,
                    data={
                        'username': conf.PROJECTEULER_USERNAME,
                        'password': conf.PROJECTEULER_PASSWORD,
                        'captcha': text,
                        'remember_me': '1',
                    },
                )
                # A warning paragraph means the sign-in was rejected; print it
                # and try again with a new captcha.
                match = re.search('<p[^>]*class="warning"[^>]*>(?P<message>[^<]*)</p>', page)
                if match:
                    REQ.print(match.group('message'))
                else:
                    break
            else:
                raise ExceptionParseStandings('Did not recognize captcha for sign in')
            # Re-fetch the standings now that the session is signed in.
            page = REQ.get(self.standings_url)

        result = {}

        # The problem name is the part of the contest name after the first dot.
        problem_name = self.name.split('.', 1)[1].strip()
        problems_info = [{'name': problem_name, 'url': self.url}]

        regex = '<table[^>]*>.*?</table>'
        html_table = re.search(regex, page, re.DOTALL)

        if html_table:
            table = parsed_table.ParsedTable(html_table.group(0))
            for r in table:
                row = OrderedDict()
                row['solving'] = 1
                for k, v in r.items():
                    if isinstance(v, list):
                        # A list value is unpacked as the (place, country) pair.
                        place, country = v
                        row['place'] = re.match('[0-9]+', place.value).group(0)
                        country = first(country.column.node.xpath('.//@title'))
                        if country:
                            row['country'] = country
                    elif k == 'Time To Solve':
                        # "1 hour, 5 minutes" -> {'hours': 1, 'minutes': 5}
                        params = {}
                        for x in v.value.split(', '):
                            value, field = x.split()
                            if field[-1] != 's':
                                field += 's'
                            params[field] = int(value)
                        rel_delta = relativedelta(**params)
                        now = timezone.now()
                        # Turn the relative delta into an absolute duration by
                        # applying it to the current time.
                        delta = now - (now - rel_delta)
                        row['penalty'] = f'{delta.total_seconds() / 60:.2f}'
                    elif k == 'User':
                        member = first(v.column.node.xpath('.//@title')) or v.value
                        row['member'] = member
                    else:
                        row[k.lower()] = v.value
                # Every listed user solved the single problem of this contest.
                problems = row.setdefault('problems', {})
                problem = problems.setdefault(problem_name, {})
                problem['result'] = '+'
                problem['binary'] = True
                row['_skip_for_problem_stat'] = True
                if 'member' not in row:
                    continue
                result[row['member']] = row

        standings = {
            'result': result,
            'url': self.standings_url,
            'problems': problems_info,
        }

        # While the table holds fewer than 100 rows, set a statistic delta that
        # depends on how long ago the contest started.
        if len(result) < 100:
            delta = timezone.now() - self.start_time
            if delta < timedelta(days=1):
                standings['timing_statistic_delta'] = timedelta(minutes=60)
            elif delta < timedelta(days=30):
                standings['timing_statistic_delta'] = timedelta(days=1)

        return standings
Пример #18
0
    def get_standings(self, users=None, statistics=None):
        """Parse a paginated ``standings`` table starting at ``self.standings_url``.

        Follows the pagination links page by page and classifies every cell by
        its CSS class.  Returns ``{}`` when the standings URL contains no
        numeric path segment.

        Returns:
            A dict with ``result`` (member -> row), ``url`` and ``problems``.

        Raises:
            ExceptionParseStandings: when a page contains no standings table.
        """
        # Season label "Y-(Y+1)" unless the instance already provides one;
        # contests before September belong to the season of the previous year.
        if not hasattr(self, 'season'):
            year = self.start_time.year - (0
                                           if self.start_time.month > 8 else 1)
            season = f'{year}-{year + 1}'
        else:
            season = self.season

        result = {}
        problems_info = OrderedDict()

        if not re.search('/[0-9]+/', self.standings_url):
            return {}

        url = self.standings_url
        n_page = 1
        while True:
            page = REQ.get(url)

            match = re.search(
                '<table[^>]*class="[^"]*standings[^>]*>.*?</table>', page,
                re.MULTILINE | re.DOTALL)
            if not match:
                raise ExceptionParseStandings('Not found table standings')

            html_table = match.group(0)
            table = parsed_table.ParsedTable(html_table)

            for r in table:
                row = {}
                problems = row.setdefault('problems', {})
                solved = 0
                has_solved = False
                for k, v in list(r.items()):
                    if 'table__cell_role_result' in v.attrs['class']:
                        # Problem cell: the header starts with the problem letter.
                        letter = k.split(' ', 1)[0]
                        if letter == 'X':
                            continue

                        p = problems_info.setdefault(letter, {'short': letter})
                        names = v.header.node.xpath('.//span/@title')
                        if len(names) == 1:
                            p['name'] = names[0]

                        # From here on ``p`` is this row's entry for the problem.
                        p = problems.setdefault(letter, {})
                        n = v.column.node
                        if n.xpath(
                                'img[contains(@class,"image_type_success")]'):
                            res = '+'
                            p['binary'] = True
                        elif n.xpath(
                                'img[contains(@class,"image_type_fail")]'):
                            res = '-'
                            p['binary'] = False
                        else:
                            # Without an icon the cell must read "<result> <time>";
                            # anything else is dropped from the row.
                            if ' ' not in v.value:
                                problems.pop(letter)
                                continue
                            res = v.value.split(' ', 1)[0]
                        p['result'] = res
                        p['time'] = v.value.split(' ', 1)[-1]
                        if 'table__cell_firstSolved_true' in v.attrs['class']:
                            p['first_ac'] = True

                        if '+' in res or res.startswith('100'):
                            solved += 1

                        # Any positive numeric result switches on the separate
                        # "solved" counter for this row.
                        try:
                            has_solved = has_solved or '+' not in res and float(
                                res) > 0
                        except ValueError:
                            pass
                    elif 'table__cell_role_participant' in v.attrs['class']:
                        title = v.column.node.xpath('.//@title')
                        if title:
                            name = title[0]
                        else:
                            # Removes the first space of the cell text only.
                            name = v.value.replace(' ', '', 1)
                        row['name'] = name
                        # Names containing a space get the season appended.
                        row['member'] = name if ' ' not in name else f'{name} {season}'
                    elif 'table__cell_role_place' in v.attrs['class']:
                        row['place'] = v.value
                    elif 'table__header_type_penalty' in v.attrs['class']:
                        row['penalty'] = int(
                            v.value) if v.value.isdigit() else v.value
                    elif 'table__header_type_score' in v.attrs['class']:
                        row['solving'] = int(round(float(v.value)))
                if has_solved:
                    row['solved'] = {'solving': solved}
                result[row['member']] = row

            # Continue with the link to the next page number, if the page has one.
            n_page += 1
            match = re.search(
                f'<a[^>]*href="(?P<href>[^"]*standings[^"]*p[^"]*={n_page})"[^>]*>',
                page)
            if not match:
                break
            url = urljoin(url, match.group('href'))

        standings = {
            'result': result,
            'url': self.standings_url,
            'problems': list(problems_info.values()),
        }
        return standings
Пример #19
0
    def get_standings(self, users=None, statistics=None):
        """Scrape the standings page and resolve every listed profile link.

        Args:
            users: Not used by this parser.
            statistics: Optional mapping of previously parsed rows. A row that
                carries ``_profile_url`` together with ``_member`` and
                ``_info`` is reused, so its profile page is not fetched again.

        Returns:
            dict with ``url``, ``options`` (``options['parse']`` holds the
            key/value info table and, when found, a ``_related`` contest pk)
            and ``result`` (rows keyed by member handle). ``invisible`` is set
            unless a stored ``_related`` contest short-circuits the lookup.
        """
        standings_url = self.standings_url or self.url
        page = REQ.get(standings_url)

        standings = {'url': standings_url}
        options = standings.setdefault('options', {'parse': {}})

        # The first attribute-less <table> is read as two-column
        # "Key: value" rows and stored under options['parse'].
        regex = '<table>.*?</table>'
        match = re.search(regex, page, re.DOTALL)
        if match:
            html_table = match.group(0)
            table = parsed_table.ParsedTable(html_table,
                                             without_header=True,
                                             ignore_wrong_header_number=False)
            infos = {}
            for r in table:
                k, v = [col.value for col in r.columns]
                k = k.strip(':').lower().replace(' ', '_')
                infos[k] = v
            options['parse'] = infos

        def find_related(statistics):
            """Search for a single matching Contest on the host named by the info table."""
            infos = deepcopy(self.info.get('standings', {}).get('parse', {}))

            # filter().exists() rather than get(): a stale pk should fall
            # through to a fresh search instead of raising DoesNotExist.
            if '_related' in infos and Contest.objects.filter(
                    pk=infos['_related']).exists():
                options['parse']['_related'] = infos['_related']
                return

            related = None

            infos.update(options.get('parse', {}))

            host_mapping = self.resource.info['_host_mapping']
            official_page = infos.get('official_page')
            match = re.search('.*https?://(?P<host>[^/]*)/',
                              official_page) if official_page else None
            # Without a parsable official page URL fall back to the series.
            host = match.group('host') if match else infos.get('series')

            ignore_n_statistics = False
            ignore_title = None
            if host:
                # Only consult the mapping when there is something to match;
                # re.search() raises TypeError on a None host.
                for mapping in host_mapping:
                    if re.search(mapping['regex'], host):
                        host = mapping['host']
                        ignore_title = mapping.get('ignore_title')
                        ignore_n_statistics = mapping.get(
                            'ignore_n_statistics', ignore_n_statistics)
                        break
            if host:
                # Candidate contests start or end within 3 days of our start.
                delta_start = timedelta(days=3)
                qs = Contest.objects.filter(resource__host=host)
                qs = qs.filter(
                    Q(start_time__gte=self.start_time - delta_start,
                      start_time__lte=self.start_time + delta_start)
                    | Q(end_time__gte=self.start_time - delta_start,
                        end_time__lte=self.start_time + delta_start))

                if not ignore_n_statistics:
                    # Participant count must agree within 15%; teams are
                    # counted once when rows carry a team_id.
                    teams = set()
                    for r in statistics.values():
                        if 'team_id' in r:
                            teams.add(r['team_id'])
                    n_statistics = len(teams) if teams else len(statistics)
                    delta_n = round(n_statistics * 0.15)
                    qs = qs.filter(n_statistics__gte=n_statistics - delta_n,
                                   n_statistics__lte=n_statistics + delta_n)

                if ignore_title:
                    qs = qs.exclude(title__iregex=ignore_title)

                if len(qs) > 1:
                    # Still ambiguous: require the same first-place account.
                    first = None
                    for stat in statistics.values():
                        if stat.get('place') == '1':
                            first = stat['member'].split(':', 1)[-1]
                    qs = qs.filter(statistics__place_as_int=1,
                                   statistics__account__key=first)

                if len(qs) == 1:
                    related = qs.first().pk

            if related is not None:
                options['parse']['_related'] = related
                standings['invisible'] = True
            else:
                standings['invisible'] = False

        # NOTE(review): a page without a matching results table raises
        # AttributeError on match.group below — confirm that is acceptable.
        regex = '<table[^>]*class="[^"]*table[^"]*"[^>]*>.*?</table>'
        match = re.search(regex, page, re.DOTALL)
        html_table = match.group(0)
        table = parsed_table.ParsedTable(html_table)

        # One entry per profile link; a team row with several member links
        # yields one copy of the row per member.
        profile_urls = {}
        for r in table:
            row = OrderedDict()
            rank = r.pop('Rank')
            row['place'] = rank.value
            medal = rank.column.node.xpath(
                './/img[contains(@alt,"medal")]/@title')
            if medal:
                row['medal'] = medal[0].lower()

            name_key = 'Name' if 'Name' in r else 'Team'
            name = r.pop(name_key)
            members = name.column.node.xpath('.//a')
            val = name.value
            if name_key == 'Team':
                if ':' in val:
                    val = val.rsplit(': ', 1)[0]
                row['team_id'] = val
            row['name'] = val

            val = r.pop('Score').value.strip()
            row['solving'] = as_number(val) if val and val != '?' else 0

            row['_no_update_name'] = True

            # Remaining columns are copied as lower-cased extra fields.
            for k, v in r.items():
                k = k.lower()
                if k in row:
                    continue
                v = v.value.strip()
                if not v or v == '?':
                    continue
                row[k] = as_number(v)

            for member in members:
                url = urljoin(standings_url, member.attrib['href'])
                row['_profile_url'] = url
                profile_urls[url] = deepcopy(row)

        statistics_profiles_urls = {}
        if statistics:
            for s in statistics.values():
                if '_profile_url' in s:
                    statistics_profiles_urls[s['_profile_url']] = s

        def get_handle(row):
            """Fill ``member`` and ``info`` for a row, fetching its profile page if needed."""
            url = row['_profile_url']
            if 'university' in url:
                row['_skip'] = True

            if url in statistics_profiles_urls:
                # Reuse fields from the previous parse when available.
                stat = statistics_profiles_urls[url]
                for k, v in stat.items():
                    if k not in row:
                        row[k] = v
                if '_member' in row and '_info' in row:
                    row['member'] = row['_member']
                    row['info'] = row['_info']
                    return row

            page = REQ.get(url)
            info = row.setdefault('info', {})

            if 'university' in url:
                # The handle is the URL path with '/' replaced by ':'.
                handle = unquote(urlparse(url).path)
                handle = handle.strip('/')
                handle = handle.replace('/', ':')
                row['member'] = handle
            else:
                match = re.search(
                    '<link[^>]*rel="canonical"[^>]*href="[^"]*/profile/(?P<handle>[^"]*)"[^>]*>',
                    page)
                handle = match.group('handle')
                row['member'] = handle

                match = re.search(
                    r'>[^<]*prize[^<]*money[^<]*(?:<[^>]*>)*[^<]*\$(?P<val>[.0-9]+)',
                    page, re.IGNORECASE)
                if match:
                    info['prize_money'] = as_number(match.group('val'))

                match = re.search(
                    r'>country:</[^>]*>(?:\s*<[^>]*>)*\s*<a[^>]*href="[^"]*/country/(?P<country>[^"]*)"',
                    page, re.IGNORECASE)
                if match:
                    info['country'] = match.group('country')

            match = re.search('<h3[^>]*>(?P<name>[^>]*)<', page)
            info['name'] = match.group('name').strip()

            # Cached copies consulted by the reuse branch on a later run.
            row['_member'] = row['member']
            row['_info'] = dict(info)

            return row

        result = {}
        members = defaultdict(list)
        # The bar counts profiles to resolve; `result` is still empty here,
        # so its length cannot be used as the total.
        with PoolExecutor(max_workers=4) as executor, tqdm(
                total=len(profile_urls), desc='urls') as pbar:
            for row in executor.map(get_handle, profile_urls.values()):
                pbar.update()
                result[row['member']] = row

                skip = row.pop('_skip', False)
                if not skip and 'team_id' in row:
                    members[row['team_id']].append({
                        'account': row['member'],
                        'name': row['info']['name']
                    })

        if members:
            for row in result.values():
                if 'team_id' in row:
                    row['_members'] = members[row['team_id']]

        find_related(result)

        standings['result'] = result
        return standings
Пример #20
0
    def get_standings(self, users=None, statistics=None):
        """Locate (if needed) and parse the results table of a training contest.

        When ``self.standings_url`` is empty the site is walked from its root
        through the menu links to the list of past results, and the entry
        dated ``self.start_time`` is selected; the resolved address is stored
        back into ``self.standings_url``.

        Args:
            users: Not used by this parser.
            statistics: Not used by this parser.

        Returns:
            dict with ``result`` (rows keyed by the numeric user id taken from
            the profile link), ``url`` and ``problems``.

        Raises:
            ExceptionParseStandings: when a navigation link, the results
                entry, or the standings table cannot be found, or when several
                candidate result pages do not share one parent page.
        """
        if not self.standings_url:
            page = REQ.get(urljoin(self.url, '/'))

            # Follow the menu one level at a time by link caption.
            for name in (
                    'Соревнования',
                    'Тренировочные олимпиады',
            ):
                match = re.search(
                    '<a[^>]*href="(?P<url>[^"]*)"[^>]*>{}<'.format(name), page)
                if not match:
                    raise ExceptionParseStandings('Not found standing url')
                page = REQ.get(match.group('url'))

            match = re.search(
                '{}.*?<a[^>]*href="(?P<url>[^"]*)"[^>]*>{}<'.format(
                    re.escape(self.name), 'Результаты прошедших тренировок'),
                page,
                re.DOTALL,
            )
            if not match:
                raise ExceptionParseStandings('Not found standing url')

            url = match.group('url')
            page = REQ.get(url)

            # Rows of the archive: date cell, title cell, link cell.
            date = self.start_time.strftime('%Y-%m-%d')
            matches = re.findall(
                r'''
                <tr[^>]*>[^<]*<td[^>]*>{}</td>[^<]*
                <td[^>]*>(?P<title>[^<]*)</td>[^<]*
                <td[^>]*>[^<]*<a[^>]*href\s*=["\s]*(?P<url>[^">]*)["\s]*[^>]*>
            '''.format(date), page, re.MULTILINE | re.VERBOSE)

            urls = [(title, urljoin(url, u)) for title, u in matches]
            if len(urls) > 1:
                # Several entries on the same date: drop the ones whose title
                # matches the grade-range / school pattern.
                urls = [(
                    title, urljoin(url, u)
                ) for title, u in matches if not re.search(
                    r'[0-9]\s*-\s*[0-9].*(?:[0-9]\s*-\s*[0-9].*\bкл\b|школа)',
                    title, re.I)]

            if not urls:
                raise ExceptionParseStandings('Not found standing url')

            if len(urls) > 1:
                # Remaining candidates are accepted only if they all point
                # to the same parent page (second-to-last breadcrumb link).
                ok = True
                urls_set = set()
                for _, u in urls:
                    page = REQ.get(u)
                    path = re.findall(
                        '<td[^>]*nowrap><a[^>]*href="(?P<href>[^"]*)"', page)
                    if len(path) < 2:
                        # No parent link to index; report via the raise below
                        # instead of failing with IndexError on path[-2].
                        ok = False
                        break
                    parent = urljoin(u, path[-2])
                    urls_set.add(parent)
                if len(urls_set) > 1 or not ok:
                    raise ExceptionParseStandings('Too much standing url')
                url = urls_set.pop()
            else:
                _, url = urls[0]

            page = REQ.get(url)
            self.standings_url = REQ.last_url
        else:
            page = REQ.get(self.standings_url)

        match = re.search('<table[^>]*bgcolor="silver"[^>]*>.*?</table>',
                          page, re.MULTILINE | re.DOTALL)
        if not match:
            raise ExceptionParseStandings('Not found table standings')
        table = parsed_table.ParsedTable(match.group(0))

        problems_info = OrderedDict()
        # Best numeric score seen per problem, used to decide "solved" below.
        max_score = defaultdict(float)

        result = {}
        for r in table:
            row = OrderedDict()
            problems = row.setdefault('problems', {})
            for k, v in list(r.items()):
                if k == 'Имя':
                    href = v.column.node.xpath('a/@href')
                    if not href:
                        continue
                    # The member key is the trailing number of the link.
                    uid = re.search('[0-9]+$', href[0]).group(0)
                    row['member'] = uid
                    row['name'] = v.value
                elif k == 'Место':
                    row['place'] = v.value
                elif k == 'Время':
                    row['penalty'] = int(v.value)
                elif k in ['Сумма', 'Задачи']:
                    row['solving'] = float(v.value)
                elif re.match('^[a-zA-Z0-9]+$', k):
                    # Alphanumeric headers are problem columns.
                    problems_info[k] = {'short': k}
                    if v.value:
                        p = problems.setdefault(k, {})
                        p['result'] = v.value
                        try:
                            max_score[k] = max(max_score[k], float(v.value))
                        except ValueError:
                            pass
                elif k:
                    row[k.strip()] = v.value.strip()
                elif v.value.strip().lower() == 'log':
                    href = v.column.node.xpath('.//a/@href')
                    if href:
                        row['url'] = urljoin(self.standings_url, href[0])
            if 'member' not in row:
                # Rows without a profile link have no key to store under.
                continue
            result[row['member']] = row

        for r in result.values():
            solved = 0
            for k, p in r['problems'].items():
                score = p['result']
                if score.startswith('+'):
                    solved += 1
                    continue
                try:
                    score = float(score)
                except ValueError:
                    # Non-numeric, non-'+' results never count as solved.
                    continue
                # A positive score equal to the best seen counts as solved.
                if abs(max_score[k] - score) < 1e-9 and score > 0:
                    solved += 1
            r['solved'] = {'solving': solved}

        standings = {
            'result': result,
            'url': self.standings_url,
            'problems': list(problems_info.values()),
        }

        return standings
Пример #21
0
    def get_standings(self, users=None, statistics=None):
        """Build standings from the first candidate scoreboard that has submissions.

        When ``self.standings_url`` is empty, three year-specific candidate
        URLs are probed and every one that can be fetched is tried in order.
        A URL containing ``zibada`` is parsed from embedded JS data; anything
        else is parsed from an HTML scoreboard table.

        Args:
            users: Not used by this parser.
            statistics: Not used by this parser.

        Returns:
            dict with ``result`` (rows keyed by ``"<team name> <season>"``),
            ``url``, ``problems`` and ``options``.

        Raises:
            ExceptionParseStandings: when no candidate URL is reachable, or
                none of the candidates contains any submission.
        """
        # A contest starting in September or later is attributed to the
        # following year; season is "<year-1>-<year>".
        year = self.start_time.year
        year = year + 1 if self.start_time.month >= 9 else year
        season = '%d-%d' % (year - 1, year)

        standings_urls = []
        if not self.standings_url:
            # Probe the known scoreboard locations and keep those that respond.
            for url in (
                    f'http://static.kattis.com/icpc/wf{year}/',
                    f'https://zibada.guru/finals/{year}/',
                    f'http://web.archive.org/web/{year}/https://icpc.baylor.edu/scoreboard/',
            ):
                try:
                    page = REQ.get(url)
                except FailOnGetResponse:
                    continue

                # Skip archive responses whose final URL lacks "/<year>".
                if 'web.archive.org' in REQ.last_url and f'/{year}' not in REQ.last_url:
                    continue

                standings_urls.append(url)
        else:
            standings_urls.append(self.standings_url)

        if not standings_urls:
            raise ExceptionParseStandings(
                f'Not found standings url year = {year}')

        for standings_url in standings_urls:
            page = REQ.get(standings_url)

            result = {}
            problems_info = OrderedDict()
            has_submission = False

            if 'zibada' in standings_url:
                # The page embeds a JS assignment "... = {...}" / "... = [...]".
                match = re.search(r' = (?P<data>[\{\[].*?);?\s*$', page,
                                  re.MULTILINE)
                if match:
                    names = self._json_load(match.group('data'))
                else:
                    names = None

                # Prefer standings.js; on any failure fall back to the data
                # embedded in the page itself.
                # NOTE(review): 'standings.js' is passed as a relative URL —
                # presumably REQ resolves it against the last request; confirm.
                try:
                    page = REQ.get('standings.js')
                    match = re.search(r' = (?P<data>\{.*?);?\s*$', page,
                                      re.MULTILINE)
                    data = self._json_load(match.group('data'))
                except Exception:
                    assert names
                    data = names

                for p_name in data['problems']:
                    problems_info[p_name] = {'short': p_name}

                events = data.pop('events', None)
                if events:
                    # Event log format: each event is a space-separated string
                    # "tid problem status attempt time", replayed in time order.
                    teams = {}
                    # presumably event times are in seconds — TODO confirm
                    time_divider = 60
                    events.sort(key=lambda e: int(e.split()[-1]))
                    for e in events:
                        tid, p_name, status, attempt, time = e.split()
                        time = int(time)

                        team = teams.setdefault(tid, {})
                        problems = team.setdefault('problems', {})
                        # `result` here is the stored result string for this
                        # problem (shadows the dict above; rebuilt later).
                        result = problems.get(p_name, {}).get('result', '')
                        # Ignore a '?' status unless the stored result for
                        # this problem already starts with '?'.
                        if not result.startswith('?') and status.startswith(
                                '?'):
                            continue
                        has_submission = True
                        if status == '+':
                            # Accepted: attempt becomes the count of earlier tries.
                            attempt = int(attempt) - 1
                            # NOTE(review): p_info is not used in this loop;
                            # the lookup only raises KeyError for an unknown
                            # problem.
                            p_info = problems_info[p_name]
                        problems[p_name] = {
                            'time':
                            time,
                            'result':
                            '+' if status == '+' and attempt == 0 else
                            f'{status}{attempt}',
                        }
                    for tid, team in teams.items():
                        name = names[int(tid)][0]
                        name = html.unescape(name)
                        team['member'] = f'{name} {season}'
                        team['name'] = name
                        # Penalty: solve time plus 20 * time_divider for each
                        # earlier attempt on a solved problem, then divided
                        # by time_divider.
                        penalty = 0
                        solving = 0
                        for p_name, problem in team.get('problems',
                                                        {}).items():
                            if problem['result'].startswith('+'):
                                solving += 1
                                attempt_penalty = (int(
                                    problem['result'].lstrip('+')
                                    or 0)) * 20 * time_divider
                                penalty += problem['time'] + attempt_penalty
                        team['penalty'] = int(round(penalty / time_divider))
                        team['solving'] = solving
                else:
                    # Snapshot format: data['teams'] holds per-team totals and
                    # per-problem verdicts, either as dicts or as lists.
                    teams = {}
                    time_divider = 1
                    data_teams = data['teams']
                    if isinstance(data_teams, dict):
                        data_teams = data_teams.values()
                    for team in data_teams:
                        row = {}

                        # Read a field by key from a dict team or by position
                        # from a list team.
                        def get(key, index):
                            return team[key] if isinstance(
                                team, dict) else team[index]

                        name = get('name', 0)
                        name = html.unescape(name)
                        row['member'] = f'{name} {season}'
                        row['name'] = name
                        row['solving'] = int(get('score', 2))
                        row['penalty'] = int(get('time', 3))

                        if isinstance(team, dict):
                            # Dict teams keep verdicts under "0", "1", ...;
                            # collect them in problem order.
                            team['problems'] = [
                                team[str(index)]
                                for index in range(len(data['problems']))
                            ]

                        problems = row.setdefault('problems', {})
                        for p_name, verdict in zip(data['problems'],
                                                   get('problems', 4)):
                            if not verdict:
                                continue
                            if isinstance(verdict, dict):
                                # Keys are reduced to their first letter:
                                # 'a', 'p', 's', 't' are the ones read below.
                                verdict = {k[0]: v for k, v in verdict.items()}
                                verdict['a'] = int(verdict['a'])
                                if isinstance(verdict.get('p'), int):
                                    verdict['a'] += verdict['p']
                                if isinstance(verdict['s'], str):
                                    verdict['s'] = int(verdict['s'])
                                status = '+' if verdict['s'] else (
                                    '?' if verdict.get('p', False) else '-')
                                time = verdict['t']
                                result = verdict['a']
                                # presumably times are in milliseconds here —
                                # TODO confirm
                                time_divider = 1000 * 60
                                if not result:
                                    continue
                            else:
                                # String verdict: "<status> <count>[ <time>]".
                                status, result = verdict.split(' ', 1)
                                if ' ' in result:
                                    result, time = result.split()
                                    time = int(time)
                                else:
                                    time = None
                                result = int(result)
                            has_submission = True
                            problem = problems.setdefault(p_name, {})
                            if status == '+':
                                # NOTE(review): time can be None here (string
                                # verdict without a time); the first_ac
                                # comparisons and the time rescaling further
                                # down would then raise TypeError.
                                problem['time'] = time
                                problem[
                                    'result'] = '+' if result == 1 else f'+{result - 1}'
                            else:
                                problem['result'] = f'{status}{result}'
                        teams[row['member']] = row

                # Rank by solved count descending, then penalty ascending;
                # teams with equal (solving, penalty) share a place.
                teams = list(teams.values())
                teams.sort(key=lambda t: (t['solving'], -t['penalty']),
                           reverse=True)
                rank = 0
                prev = None
                for i, t in enumerate(teams):
                    curr = (t['solving'], t['penalty'])
                    if prev != curr:
                        rank = i + 1
                        prev = curr
                    t['place'] = rank
                result = {t['member']: t for t in teams}

                problems_info = OrderedDict(sorted(problems_info.items()))
            else:
                # HTML scoreboard: table with id "standings" or class
                # "scoreboard".
                # NOTE(review): a page without such a table raises
                # AttributeError on match.group below.
                regex = '''<table[^>]*(?:id=["']standings|class=["']scoreboard)[^>]*>.*?</table>'''
                match = re.search(regex, page, re.DOTALL)
                html_table = match.group(0)

                table = parsed_table.ParsedTable(html_table)
                time_divider = 1
                for r in table:
                    row = {}
                    problems = row.setdefault('problems', {})
                    for k, vs in r.items():
                        # A header may map to several cells; join their
                        # non-empty values.
                        if isinstance(vs, list):
                            v = ' '.join(i.value for i in vs if i.value)
                        else:
                            v = vs.value
                        k = k.lower().strip('.')
                        v = v.strip()
                        if k in ('rank', 'rk'):
                            row['place'] = v
                        elif k == 'team':
                            row['member'] = f'{v} {season}'
                            row['name'] = v
                        elif k == 'time':
                            row['penalty'] = int(v)
                        elif k == 'slv':
                            row['solving'] = int(v)
                        elif k == 'score':
                            # Either "solved penalty" or just "solved".
                            if ' ' in v:
                                row['solving'], row['penalty'] = map(
                                    int, v.split())
                            else:
                                row['solving'] = int(v)
                        elif len(k) == 1:
                            # Single-character headers are problem columns.
                            k = k.title()
                            if k not in problems_info:
                                problems_info[k] = {'short': k}
                                if 'title' in vs.header.attrs:
                                    problems_info[k]['name'] = vs.header.attrs[
                                        'title']

                            # Normalise the cell: "<a> <b> tr..." is swapped to
                            # "<b> <a>", then "tr..." words and dashes are
                            # removed.
                            v = re.sub(r'([0-9]+)\s+([0-9]+)\s+tr.*', r'\2 \1',
                                       v)
                            v = re.sub('tr[a-z]*', '', v)
                            v = re.sub('-*', '', v)
                            v = v.strip()
                            if not v:
                                continue

                            has_submission = True

                            p = problems.setdefault(k, {})
                            if ' ' in v:
                                # Two numbers: attempts and time of an accepted
                                # problem.
                                pnt, time = map(int, v.split())
                                p['result'] = '+' if pnt == 1 else f'+{pnt - 1}'
                                p['time'] = time

                                if ('solvedfirst' in vs.column.attrs.get(
                                        'class', ''
                                ) or vs.column.node.xpath(
                                        './/*[contains(@class, "score_first")]'
                                )):
                                    p['first_ac'] = True
                            else:
                                # One number: rejected attempts only.
                                p['result'] = f'-{v}'
                    result[row['member']] = row

            # A scoreboard without any submission is skipped; try the next URL.
            if not has_submission:
                continue

            # First pass: earliest accepted time per problem and overall, and
            # whether the source already flagged a first solver per problem.
            first_ac_of_all = None
            for team in result.values():
                for p_name, problem in team['problems'].items():
                    p_info = problems_info[p_name]
                    if not problem['result'].startswith('+'):
                        continue
                    time = problem['time']
                    if 'first_ac' not in p_info or time < p_info['first_ac']:
                        p_info['first_ac'] = time
                    if first_ac_of_all is None or time < first_ac_of_all:
                        first_ac_of_all = time
                    if problem.get('first_ac'):
                        p_info['has_first_ac'] = True

            # Second pass: mark first solvers and rescale times.
            for team in result.values():
                for p_name, problem in team['problems'].items():
                    p_info = problems_info[p_name]
                    if problem['result'].startswith('+'):
                        # When the source flagged first solvers itself, only
                        # those cells are considered.
                        # NOTE(review): this continue also skips the time
                        # rescaling below for the unflagged cells.
                        if p_info.get('has_first_ac'
                                      ) and not problem.get('first_ac'):
                            continue
                        if problem['time'] == p_info['first_ac']:
                            problem['first_ac'] = True
                        if problem['time'] == first_ac_of_all:
                            problem['first_ac_of_all'] = True
                    if 'time' in problem:
                        problem['time'] = int(
                            round(problem['time'] / time_divider))

            # Medals are attached only when no result starts with '?'.
            without_medals = any(p['result'].startswith('?')
                                 for row in result.values()
                                 for p in row.get('problems', {}).values())

            options = {'per_page': None}
            if not without_medals:
                # NOTE(review): assumes self._get_medals returns a mapping;
                # a None return would fail on .items() — confirm.
                medals = self._get_medals(year)
                medals = [{'name': k, 'count': v} for k, v in medals.items()]
                options['medals'] = medals

            standings = {
                'result': result,
                'url': standings_url,
                'problems': list(problems_info.values()),
                'options': options,
            }
            return standings

        raise ExceptionParseStandings(
            f'Not found standings url from {standings_urls}')
Пример #22
0
    def get_standings(self, users=None, statistics=None):
        year = self.start_time.year
        year = year if self.start_time.month >= 9 else year - 1
        season = '%d-%d' % (year, year + 1)

        page = REQ.get(self.url)
        match = re.search(
            r'''<a[^>]*href=["']?(?P<href>[^"' ]*rating[^"' ]*)["']?[^>]*>\[Рейтинг\]''',
            page)

        if not match and re.search(
                r'''<b>Олимпиада №[0-9]+ не существует!</b>''', page):
            return {'action': 'delete'}

        page = REQ.get(match.group('href'))
        standings_url = REQ.last_url

        match = re.search(
            r'''var(?P<vars>(?:\s*[a-z]+=[0-9]+,)+)\s*M=(?:new Array)?[\[\(]?(?P<data>.*?)[\]\)]\s*(?:function|var)''',
            page)  # noqa

        result = {}
        problems_info = OrderedDict()

        def canonize_name(name):
            name = name.replace('\r', ' ')
            name = name.replace('\n', ' ')
            name = re.sub(r'\s+', ' ', name)
            name = re.sub(r'<br/?>', ',', name)
            name = re.sub(r'<[^>]*>', '', name)
            name = re.sub(r'\s*,\s*', ', ', name)
            name = name.strip()
            return name

        if match:
            data = match.group('data')
            data = data.replace('\\', '\\\\')
            data = data.replace('"', r'\"')
            data = data.replace("'", '"')
            data = re.sub(r'\s+', ' ', data)
            data = json.loads(f'[{data}]')

            variables = {}
            for var in re.split(r',\s*', match.group('vars').strip()):
                if not var:
                    continue
                k, v = var.split('=')
                variables[k] = v

            match = re.search(r'''M\[\((?P<val>[0-9]+)\+''', page)
            offset = int(match.group('val'))

            n_problems = int(variables['tn'])
            n_teams = int(variables['nk'])
            n_fields = offset + 3 * n_problems
            place = 0
            last = None
            for rank, st in enumerate(range(0, n_teams * n_fields, n_fields),
                                      start=1):
                row = data[st:st + n_fields]

                name = canonize_name(row[0])

                member = name + ', ' + season

                r = result.setdefault(member, {})

                r['name'] = name
                r['member'] = member
                r['solving'] = int(row[1])
                r['penalty'] = int(row[2])

                score = r['solving'], r['penalty']
                if score != last:
                    place = rank
                    last = score
                r['place'] = place

                n_problems_fields = 3
                problems = r.setdefault('problems', {})
                for idx in range(0, n_problems):
                    p_info = row[offset + idx * n_problems_fields:offset +
                                 (idx + 1) * n_problems_fields]
                    stat, errors, seconds = map(int, p_info)
                    key = chr(ord('A') +
                              idx) if n_problems < 27 else f'{idx + 1:02d}'

                    if key not in problems_info:
                        info = {'short': key}
                        if abs(errors) >= 1000:
                            info['full_score'] = 100
                        problems_info[key] = info

                    if not stat:
                        continue
                    p = problems.setdefault(key, {})
                    p['time'] = self.to_time(seconds, num=2)
                    if abs(errors) < 1000:
                        p['result'] = f'+{errors if errors else ""}' if stat == 1 else f'-{errors}'
                    else:
                        solved = r.setdefault('solved', {'solving': 0})
                        score = errors - 1000
                        p['result'] = score
                        if score > 0:
                            p['partial'] = score < problems_info[key][
                                'full_score']
                            if not p['partial']:
                                solved['solving'] += 1

                if not problems:
                    result.pop(member)
        else:
            regex = '''<table[^>]*class=["']?olimp["']?[^>]*>.*?</table>'''
            match = re.search(regex, page, re.DOTALL)
            if not match and 'Рейтинг олимпиады' not in page:
                return {'action': 'delete'}
            table = parsed_table.ParsedTable(match.group(0))

            for row in table:
                r = OrderedDict()
                problems = r.setdefault('problems', {})
                for k, v in list(row.items()):
                    if k == '=':
                        r['solving'] = int(v.value)
                    elif k == 'Место':
                        r['place'] = int(v.value)
                    elif k == 'Время':
                        r['penalty'] = int(v.value)
                    elif k == 'Участник':
                        name = canonize_name(v.value)
                        r['name'] = name
                        r['member'] = name + ', ' + season
                    elif len(k) == 1 and k not in ['№']:
                        if k not in problems_info:
                            info = {'short': k}
                            problems_info[k] = info
                        if v.value != DOT:
                            p = problems.setdefault(k, {})
                            p['result'], *values = v.value.split()
                            if values:
                                p['time'] = values[0]
                if not problems:
                    continue

                result[r['member']] = r

        standings = {
            'result': result,
            'url': standings_url,
            'problems': list(problems_info.values()),
        }
        return standings
Пример #23
0
    def get_users_infos(users, resource=None, accounts=None, pbar=None):
        """Yield one result dict per entry of ``users``, in input order.

        Pages are downloaded through a small thread pool. For every user the
        profile URL format is tried first and the team URL format second.

        Yields:
            ``{'info': None}`` when no page was obtained for the user;
            ``{'info': ..., 'coders': ...}`` when the page settings carry no
            ``date_versus_rating`` entry (the page is then marked ``is_team``);
            ``{'info': ..., 'contest_addition_update': ...}`` otherwise.

        ``resource`` and ``accounts`` are accepted but not read here.
        ``pbar``, when truthy, gets ``update()`` called once per user.
        """

        @RateLimiter(max_calls=5, period=1)
        def fetch_profile_page(handle):
            # Returns the page text, or None when nothing usable was fetched.
            content = None
            for template in (
                Statistic.PROFILE_URL_FORMAT_,
                Statistic.TEAM_URL_FORMAT_,
            ):
                content = None
                target = template.format(user=handle)
                try:
                    response = REQ.get(target, return_url=True)
                except FailOnGetResponse as e:
                    # Only a 404 is tolerated; it moves on to the next format.
                    if e.args[0].code != 404:
                        raise
                    continue
                if not response:
                    continue
                content, final_url = response
                # A response served from a different URL is discarded.
                if target != final_url:
                    content = None
                break
            return content

        with PoolExecutor(max_workers=4) as pool:
            fetched = pool.map(fetch_profile_page, users)
            for _handle, html_page in zip(users, fetched):
                if pbar:
                    pbar.update()

                if html_page is None:
                    yield {'info': None}
                    continue

                settings = re.search(
                    r'jQuery.extend\(Drupal.settings,(?P<data>[^;]*)\);$',
                    str(html_page),
                    re.MULTILINE,
                )
                payload = json.loads(settings.group('data'))

                if 'date_versus_rating' in payload:
                    history = payload['date_versus_rating']['all']

                    # "label: value" items of the profile become info fields.
                    pairs = re.finditer(
                        r'''
                            <li[^>]*>\s*<label[^>]*>(?P<key>[^<]*):\s*</label>\s*
                            <span[^>]*>(?P<value>[^<]*)</span>\s*</li>
                        ''',
                        html_page,
                        re.VERBOSE,
                    )
                    profile = {
                        m.group('key').strip().replace(' ', '_').lower(): m.group('value').strip()
                        for m in pairs
                    }

                    updates = {}
                    previous = None
                    for entry in history:
                        raw_rating = entry.get('rating')
                        if not raw_rating:
                            continue
                        current = int(raw_rating)
                        # The last rated entry wins as the current rating.
                        profile['rating'] = current

                        contest_code = entry.get('code')
                        if contest_code:
                            # Division-specific codes lose their trailing A/B.
                            divisional = re.search(r'\bdiv(ision)?[-_\s]+[AB12]', entry['name'], re.I)
                            if divisional and re.search('[AB]$', contest_code):
                                contest_code = contest_code[:-1]

                            slot = updates.setdefault(contest_code, OrderedDict())
                            if previous is None:
                                slot['rating_change'] = None
                            else:
                                slot['rating_change'] = current - previous
                            slot['new_rating'] = current

                        previous = current

                    yield {'info': profile, 'contest_addition_update': updates}
                    continue

                # No rating history in the settings: handle the page as a team.
                team_info = {'is_team': True}
                table_found = re.search('<table[^>]*cellpadding=""[^>]*>.*?</table>', html_page, re.DOTALL)
                if table_found:
                    for table_row in parsed_table.ParsedTable(table_found.group(0)):
                        for header, cell in list(table_row.items()):
                            field = header.lower().replace(' ', '_')
                            team_info[field] = cell.value

                member_links = re.finditer(r'''
                                          <td[^>]*>\s*<b[^>]*>Member[^<]*</b>\s*</td>\s*
                                          <td[^>]*><a[^>]*href\s*=\s*"[^"]*/users/(?P<member>[^"/]*)"[^>]*>
                                          ''', html_page, re.VERBOSE)
                members = {link.group('member') for link in member_links}
                if members:
                    team_info['members'] = list(members)

                yield {'info': team_info, 'coders': members}
Пример #24
0
    def get_standings(self, users=None, statistics=None):
        """Build World Finals standings for the year found in ``self.key``.

        Candidate pages are either the configured ``self.standings_url`` or,
        when none is configured, a fixed list of known sources that respond and
        mention "world finals <year>". The candidates are parsed in order and
        the first one that yields at least one row is returned.

        Args:
            users: not read by this method.
            statistics: optional mapping keyed by member name; fields missing
                from a parsed row are copied from it and reported as hidden.

        Returns:
            dict with keys ``result`` (member -> row), ``url``, ``problems``,
            ``options`` and ``hidden_fields``.

        Raises:
            ExceptionParseStandings: when no candidate URL is found, or when no
                candidate produces any rows.
        """
        # The first standalone four-digit token of the key is used as the year.
        year = int(re.search(r'\b[0-9]{4}\b', self.key).group(0))
        season = '%d-%d' % (year - 1, year)

        icpc_standings_url = f'https://icpc.global/community/results-{year}'
        icpc_api_standings_url = f'https://icpc.global/api/help/cms/virtpublic/community/results-{year}'

        standings_urls = []
        if not self.standings_url:
            # Nothing configured: probe the known sources and keep each one
            # that can be fetched and mentions this year's World Finals.
            for url in (
                    f'http://static.kattis.com/icpc/wf{year}/',
                    f'https://zibada.guru/finals/{year}/',
                    f'http://web.archive.org/web/{year}/https://icpc.baylor.edu/scoreboard/',
                    f'http://web.archive.org/web/{year}/https://icpc.global/scoreboard/',
                    f'https://cphof.org/standings/icpc/{year}',
                    icpc_api_standings_url,
            ):
                try:
                    page = REQ.get(url)
                except FailOnGetResponse:
                    continue

                # An archive.org response whose final URL lacks "/<year>" is
                # skipped (presumably a redirect to another year's snapshot).
                if 'web.archive.org' in REQ.last_url and f'/{year}' not in REQ.last_url:
                    continue

                if not re.search(
                        rf'\b(world\s*finals\s*{year}|{year}\s*world\s*finals)\b',
                        page, re.IGNORECASE):
                    continue

                standings_urls.append(url)
        else:
            # The public results page is replaced by its API counterpart.
            if self.standings_url == icpc_standings_url:
                standings_urls.append(icpc_api_standings_url)
            else:
                standings_urls.append(self.standings_url)

        if not standings_urls:
            raise ExceptionParseStandings(
                f'Not found standings url year = {year}')

        # Try the candidates in order; the first with any rows is returned.
        for standings_url in standings_urls:
            is_icpc_api_standings_url = standings_url == icpc_api_standings_url
            page = REQ.get(standings_url)

            result = {}
            # 'region' is always reported among the hidden fields.
            hidden_fields = set(self.info.get('hidden_fields',
                                              [])) | {'region'}
            problems_info = OrderedDict()

            if 'zibada' in standings_url:
                # The page embeds its data as a " = {...}" / " = [...]"
                # assignment, decoded with self._json_load.
                match = re.search(r' = (?P<data>[\{\[].*?);?\s*$', page,
                                  re.MULTILINE)
                if match:
                    names = self._json_load(match.group('data'))
                else:
                    names = None

                # Prefer the data from 'standings.js'; on any failure fall
                # back to the data embedded in the page (which must exist).
                try:
                    page = REQ.get('standings.js')
                    match = re.search(r' = (?P<data>\{.*?);?\s*$', page,
                                      re.MULTILINE)
                    data = self._json_load(match.group('data'))
                except Exception:
                    assert names
                    data = names

                for p_name in data['problems']:
                    problems_info[p_name] = {'short': p_name}

                events = data.pop('events', None)
                if events:
                    # An event log is present: replay it in time order to
                    # rebuild every team's per-problem state.
                    teams = {}
                    # Stored times are divided by this value before being
                    # reported (see the conversion near the end).
                    time_divider = 60
                    # The last whitespace-separated token is the event time.
                    events.sort(key=lambda e: int(e.split()[-1]))
                    for e in events:
                        tid, p_name, status, attempt, time = e.split()
                        time = int(time)

                        team = teams.setdefault(tid, {})
                        problems = team.setdefault('problems', {})
                        # NOTE: 'result' is rebound here to the previously
                        # stored result string; it is rebuilt as the standings
                        # dict once ranking is done below.
                        result = problems.get(p_name, {}).get('result', '')
                        # A '?' status is ignored unless the stored result
                        # itself starts with '?'.
                        if not result.startswith('?') and status.startswith(
                                '?'):
                            continue
                        if status == '+':
                            # Accepted runs are stored as '+' or
                            # '+<attempt - 1>'.
                            attempt = int(attempt) - 1
                            # The looked-up value is not used afterwards; the
                            # lookup raises KeyError for an unknown problem.
                            p_info = problems_info[p_name]
                        problems[p_name] = {
                            'time':
                            time,
                            'result':
                            '+' if status == '+' and attempt == 0 else
                            f'{status}{attempt}',
                        }
                    for tid, team in teams.items():
                        name = names[int(tid)][0]
                        name = html.unescape(name)
                        team['member'] = f'{name} {season}'
                        team['name'] = name
                        penalty = 0
                        solving = 0
                        # Each solved problem adds its time plus
                        # 20 * time_divider per attempt recorded after '+'.
                        for p_name, problem in team.get('problems',
                                                        {}).items():
                            if problem['result'].startswith('+'):
                                solving += 1
                                attempt_penalty = (int(
                                    problem['result'].lstrip('+')
                                    or 0)) * 20 * time_divider
                                penalty += problem['time'] + attempt_penalty
                        team['penalty'] = int(round(penalty / time_divider))
                        team['solving'] = solving
                else:
                    # No event log: every team entry already carries its
                    # score, time and per-problem verdicts.
                    teams = {}
                    time_divider = 1
                    data_teams = data['teams']
                    if isinstance(data_teams, dict):
                        data_teams = data_teams.values()
                    for team in data_teams:
                        row = {}

                        # Read a field by key from a dict-style team entry or
                        # by position from a list-style one.
                        def get(key, index):
                            return team[key] if isinstance(
                                team, dict) else team[index]

                        name = get('name', 0)
                        name = html.unescape(name)
                        row['member'] = f'{name} {season}'
                        row['name'] = name
                        row['solving'] = int(get('score', 2))
                        row['penalty'] = int(get('time', 3))

                        # Dict-style entries keep verdicts under stringified
                        # problem indices; gather them into one list.
                        if isinstance(team, dict):
                            team['problems'] = [
                                team[str(index)]
                                for index in range(len(data['problems']))
                            ]

                        problems = row.setdefault('problems', {})
                        for p_name, verdict in zip(data['problems'],
                                                   get('problems', 4)):
                            if not verdict:
                                continue
                            if isinstance(verdict, dict):
                                # Keys are shortened to their first letter;
                                # 'a', 'p', 's' and 't' are read below
                                # (presumably attempts, pending, solved and
                                # time - TODO confirm against the feed).
                                verdict = {k[0]: v for k, v in verdict.items()}
                                verdict['a'] = int(verdict['a'])
                                if isinstance(verdict.get('p'), int):
                                    verdict['a'] += verdict['p']
                                if isinstance(verdict['s'], str):
                                    verdict['s'] = int(verdict['s'])
                                status = '+' if verdict['s'] else (
                                    '?' if verdict.get('p', False) else '-')
                                time = verdict['t']
                                result = verdict['a']
                                # Presumably times are in milliseconds in this
                                # format - TODO confirm.
                                time_divider = 1000 * 60
                                if not result:
                                    continue
                            else:
                                # String verdict: "<status> <count>" with an
                                # optional trailing "<time>".
                                status, result = verdict.split(' ', 1)
                                if ' ' in result:
                                    result, time = result.split()
                                    time = int(time)
                                else:
                                    time = None
                                result = int(result)
                            problem = problems.setdefault(p_name, {})
                            if status == '+':
                                problem['time'] = time
                                problem[
                                    'result'] = '+' if result == 1 else f'+{result - 1}'
                            else:
                                problem['result'] = f'{status}{result}'
                        teams[row['member']] = row

                # Rank: more solved first, then lower penalty; teams with the
                # same (solving, penalty) pair share a place.
                teams = list(teams.values())
                teams.sort(key=lambda t: (t['solving'], -t['penalty']),
                           reverse=True)
                rank = 0
                prev = None
                for i, t in enumerate(teams):
                    curr = (t['solving'], t['penalty'])
                    if prev != curr:
                        rank = i + 1
                        prev = curr
                    t['place'] = rank
                result = {t['member']: t for t in teams}

                problems_info = OrderedDict(sorted(problems_info.items()))
            else:
                # Every other source is parsed as an HTML scoreboard table.
                if is_icpc_api_standings_url:
                    # Join a table that is closed and immediately reopened
                    # before a row, so the rows parse as a single table.
                    page = re.sub(
                        r'</table>\s*<table>\s*(<tr[^>]*>\s*<t[^>]*>)',
                        r'\1',
                        page,
                        flags=re.I)

                regex = '''(?:<table[^>]*(?:id=["']standings|class=["']scoreboard)[^>]*>|"content":"[^"]*<table[^>]*>|<table[^>]*class="[^"]*(?:table[^"]*){3}"[^>]*>).*?</table>'''  # noqa
                match = re.search(regex, page, re.DOTALL)
                if match:
                    html_table = match.group(0)
                    table = parsed_table.ParsedTable(
                        html_table,
                        with_not_full_row=is_icpc_api_standings_url)
                else:
                    table = []
                time_divider = 1
                last_place = None
                # Once this list is non-empty, every later non-empty cell value
                # is appended to it instead of being parsed as a column.
                honorables = []
                for r in table:
                    row = {}
                    problems = row.setdefault('problems', {})
                    for k, vs in r.items():
                        # A header may map to several cells; their values are
                        # joined with spaces.
                        if isinstance(vs, list):
                            v = ' '.join(i.value for i in vs if i.value)
                        else:
                            v = vs.value
                        k = k.lower().strip('.')
                        v = v.strip()
                        if honorables:
                            if v:
                                honorables.append(v)
                            continue
                        if k in ('rank', 'rk', 'place'):
                            if not isinstance(vs, list):
                                # The medal is read from an image alt text that
                                # ends with 'medal'; its first word is kept.
                                medal = vs.column.node.xpath('.//img/@alt')
                                if medal and medal[0].endswith('medal'):
                                    row['medal'] = medal[0].split()[0]
                            # A place that does not start with a digit begins
                            # the honorables list.
                            if v and not v[0].isdigit():
                                honorables.append(v)
                            row['place'] = v
                        elif k in ('team', 'name', 'university'):
                            if isinstance(vs, list):
                                # First image found among the cells is the logo.
                                for el in vs:
                                    logo = el.column.node.xpath('.//img/@src')
                                    if logo:
                                        logo = urllib.parse.urljoin(
                                            standings_url, logo[0])
                                        row.setdefault('info',
                                                       {})['logo'] = logo
                                        break
                                # Region text comes from a "badge badge-warning"
                                # element; a later cell overrides an earlier one.
                                for el in vs:
                                    region = el.column.node.xpath(
                                        './/*[@class="badge badge-warning"]')
                                    if region:
                                        region = ''.join([
                                            s.strip()
                                            for s in region[0].xpath('text()')
                                        ])
                                        if region:
                                            row['region'] = region
                            if 'cphof' in standings_url:
                                # On cphof the member name is the link text
                                # rather than the whole cell value.
                                member = vs.column.node.xpath(
                                    './/a/text()')[0].strip()
                                row['member'] = f'{member} {season}'
                            else:
                                row['member'] = f'{v} {season}'
                            row['name'] = v
                        elif k in ('time', 'penalty', 'total time (min)',
                                   'minutes'):
                            if v:
                                row['penalty'] = int(v)
                        elif k in ('slv', 'solved', '# solved'):
                            row['solving'] = int(v)
                        elif k == 'score':
                            # "score" holds either "<solved> <penalty>" or
                            # just "<solved>".
                            if ' ' in v:
                                row['solving'], row['penalty'] = map(
                                    int, v.split())
                            else:
                                row['solving'] = int(v)
                        elif len(k) == 1:
                            # Any other single-character header is a problem.
                            k = k.title()
                            if k not in problems_info:
                                problems_info[k] = {'short': k}
                                if 'title' in vs.header.attrs:
                                    problems_info[k]['name'] = vs.header.attrs[
                                        'title']

                            # Normalise the cell: "<time> <tries> tr..." becomes
                            # "<tries> <time>", then remaining "tr*" words and
                            # dashes are removed.
                            v = re.sub(r'([0-9]+)\s+([0-9]+)\s+tr.*', r'\2 \1',
                                       v)
                            v = re.sub('tr[a-z]*', '', v)
                            v = re.sub('-*', '', v)
                            v = v.strip()
                            if not v:
                                continue

                            p = problems.setdefault(k, {})
                            if '+' in v:
                                # A '+' in the cell is stored as a '?' result.
                                v = v.replace(' ', '')
                                p['result'] = f'?{v}'
                            elif ' ' in v:
                                # "<tries> <time>": an accepted problem.
                                pnt, time = map(int, v.split())
                                p['result'] = '+' if pnt == 1 else f'+{pnt - 1}'
                                p['time'] = time

                                if ('solvedfirst' in vs.column.attrs.get(
                                        'class', ''
                                ) or vs.column.node.xpath(
                                        './/*[contains(@class, "score_first")]'
                                )):
                                    p['first_ac'] = True
                            else:
                                # A lone value is stored as a '-' result.
                                p['result'] = f'-{v}'
                    # Rows without a place inherit the last place seen.
                    if row.get('place'):
                        last_place = row['place']
                    elif last_place:
                        row['place'] = last_place
                    # Skip rows without a member or with an empty name (the
                    # member string then starts with the separating space).
                    if 'member' not in row or row['member'].startswith(' '):
                        continue
                    result[row['member']] = row

                # Teams listed as <li> items inside "card-body" blocks are
                # added (or completed) with a logo and a region.
                elements = etree.HTML(page).xpath(
                    '//div[@class="card-header"]/following-sibling::div[@class="card-body"]//li'
                )  # noqa
                for el in elements:
                    name = ''.join([s.strip() for s in el.xpath('text()')])
                    member = f'{name} {season}'
                    row = result.setdefault(member, {
                        'member': member,
                        'name': name
                    })

                    logo = el.xpath('./img/@src')
                    if logo:
                        row.setdefault('info',
                                       {})['logo'] = urllib.parse.urljoin(
                                           standings_url, logo[0])

                    # Climb the ancestors until one is directly preceded by a
                    # div.card-header; that header's text is the region.
                    while el is not None:
                        prv = el.getprevious()
                        if prv is not None and prv.tag == 'div' and prv.get(
                                'class') == 'card-header':
                            break
                        el = el.getparent()
                    if el is not None:
                        region = ''.join(
                            [s.strip() for s in prv.xpath('text()')])
                        row['region'] = region

                # Captured honorables become bare rows (name and member only);
                # entries containing the word 'honorable' are left out.
                if result and honorables:
                    for name in honorables:
                        if 'honorable' in name.lower():
                            continue
                        row = dict(name=name, member=f'{name} {season}')
                        result[row['member']] = row

            # Nothing parsed from this source: try the next candidate.
            if not result:
                continue

            # Copy fields from previously stored statistics into rows that
            # lack them, and mark those fields as hidden.
            if statistics:
                for team, row in result.items():
                    stat = statistics.get(team)
                    if not stat:
                        continue
                    for k, v in stat.items():
                        if k not in row:
                            hidden_fields.add(k)
                            row[k] = v

            # Some rows still lack a region: look it up in the published team
            # list. A failure here is printed and otherwise ignored.
            if any(['region' not in r for r in result.values()]):
                try:
                    url = f'https://icpc.global/api/team/wf/{year}/published'
                    page = REQ.get(url, time_out=60)
                    data = self._json_load(page)
                except Exception:
                    traceback.print_exc()
                    data = None

                if data:

                    # Order-insensitive name key: lower-case, '&' -> ' and ',
                    # split on dash/comma separators, sorted parts as a tuple.
                    def canonize_name(name):
                        name = name.lower()
                        name = name.replace('&', ' and ')
                        name = re.sub(r'\s{2,}', ' ', name)
                        name = re.split(r'(?:\s-\s|\s-|-\s|,\s)', name)
                        name = tuple(sorted([n.strip() for n in name]))
                        return name

                    # Both the raw and the canonical name map to the result
                    # key; the first row seen for a name wins.
                    matching = {}
                    for key, row in result.items():
                        name = row['name']
                        matching.setdefault(name, key)
                        name = canonize_name(name)
                        matching.setdefault(name, key)

                    for site in data:
                        region = site['siteName']
                        for team in site['teams']:
                            # Match by university, then by its canonical form,
                            # then by university and team name combined.
                            name = team['university']
                            if name not in matching:
                                name = canonize_name(name)
                            if name not in matching:
                                name = tuple(
                                    sorted(name + canonize_name(team['name'])))
                            if name not in matching:
                                logger.warning(f'Not found team = {name}')
                            else:
                                row = result[matching[name]]
                                row['region'] = region
                                for k, v in team.items():
                                    k = k.lower()
                                    if k not in row:
                                        hidden_fields.add(k)
                                        row[k] = v

            # First pass: earliest accepted time per problem and overall, and
            # whether the source itself flagged first solves for a problem.
            first_ac_of_all = None
            for team in result.values():
                for p_name, problem in team.get('problems', {}).items():
                    p_info = problems_info[p_name]
                    if not problem['result'].startswith('+'):
                        continue
                    time = problem['time']
                    if 'first_ac' not in p_info or time < p_info['first_ac']:
                        p_info['first_ac'] = time
                    if first_ac_of_all is None or time < first_ac_of_all:
                        first_ac_of_all = time
                    if problem.get('first_ac'):
                        p_info['has_first_ac'] = True

            # Second pass: set the first-solve flags and convert stored times
            # using time_divider.
            for team in result.values():
                for p_name, problem in team.get('problems', {}).items():
                    p_info = problems_info[p_name]
                    if problem['result'].startswith('+'):
                        # NOTE(review): when the source flagged first solves for
                        # this problem, unflagged solves are skipped here - and
                        # this 'continue' also bypasses the time conversion.
                        if p_info.get('has_first_ac'
                                      ) and not problem.get('first_ac'):
                            continue
                        if problem['time'] == p_info['first_ac']:
                            problem['first_ac'] = True
                        if problem['time'] == first_ac_of_all:
                            problem['first_ac_of_all'] = True
                    if 'time' in problem:
                        problem['time'] = int(
                            round(problem['time'] / time_divider))

            # Medals are requested only when no result starts with '?'.
            without_medals = any(p['result'].startswith('?')
                                 for row in result.values()
                                 for p in row.get('problems', {}).values())

            options = {'per_page': None}
            if not without_medals:
                medals = self._get_medals(year)
                if medals:
                    medals = [{
                        'name': k,
                        'count': v
                    } for k, v in medals.items()]
                    options['medals'] = medals

            # The API URL is reported as the public results page URL.
            standings = {
                'result': result,
                'url': icpc_standings_url
                if is_icpc_api_standings_url else standings_url,
                'problems': list(problems_info.values()),
                'options': options,
                'hidden_fields': list(hidden_fields),
            }
            return standings

        raise ExceptionParseStandings(
            f'Not found standings url from {standings_urls}')
Пример #25
0
    def get_standings(self, users=None, statistics=None):
        """Scrape per-division result tables linked from ``self.standings_url``.

        The standings page is scanned for links to ``*_results.html`` pages,
        one per division. Problem lists come from a separate "problems" page
        when one is linked, otherwise from the page text that follows each
        division link. Every row of every table on a division page becomes
        one result entry keyed by ``"<name>, <country>"``.

        ``users`` and ``statistics`` are accepted but not read here.

        Returns:
            dict with ``result``, ``url``, ``problems`` (problem lists grouped
            by division under the ``'division'`` key) and ``hidden_fields``.
        """

        def parse_problems(page, full=False):
            # Collect problem panels from ``page``. An index that does not
            # increase marks the start of the next problem set: with ``full``
            # all sets are returned, otherwise only the first one.
            panels = re.finditer(
                r'''
                <div[^>]*class=['"]panel\s*historypanel['"][^>]*>\s*
                <div[^>]*>\s*<h[^>]*>(?P<index>[^<]*)</h[^>]*>\s*</div>\s*
                <div[^>]*>(\s*<[^>]*>)*(?P<name>[^<]+)
                (\s*<[^>]*>)*\s*<a[^>]*href=["'](?P<url>[^"']*)["'][^>]*>
            ''', page, re.VERBOSE)

            current = []
            finished = []
            last_index = None
            for panel in panels:
                index = panel.group('index')
                if last_index and index <= last_index:
                    if not full:
                        break
                    finished.append(current)
                    current = []
                last_index = index

                link = urllib.parse.urljoin(self.standings_url,
                                            panel.group('url'))
                cpid = re.search('cpid=([0-9]+)', link).group(1)
                entry = {
                    'short': str(len(current) + 1),
                    'code': cpid,
                    'name': panel.group('name'),
                    'url': link,
                }
                current.append(entry)

            if current:
                finished.append(current)

            if full:
                return finished
            return current

        index_page = REQ.get(self.standings_url)
        divisions = list(
            re.finditer(
                '<a[^>]*href="(?P<url>[^"]*data[^"]*_(?P<name>[^_]*)_results.html)"[^>]*>',
                index_page))

        # The text between one division link and the next (or the end of the
        # page for the last link) serves as that division's description.
        link_ends = [link.end() for link in divisions]
        next_starts = [link.start() for link in divisions[1:]] + [None]
        descriptions = [
            index_page[begin:stop]
            for begin, stop in zip(link_ends, next_starts)
        ]

        problems_info = OrderedDict()
        problems_link = re.search(
            '''<a[^>]*href=["'](?P<href>[^"']*page=[a-z0-9]+problems)["'][^>]*>''',
            index_page)
        if problems_link:
            problems_url = urllib.parse.urljoin(self.standings_url,
                                                problems_link.group('href'))
            problemsets = parse_problems(REQ.get(problems_url), full=True)
            assert len(divisions) == len(problemsets)
        else:
            problemsets = None

        result = {}
        seen_prefixes = set()
        for division_idx, (division_match, description) in enumerate(
                zip(divisions, descriptions)):
            division = division_match.group('name')

            if problemsets is None:
                d_problems = parse_problems(description)
            else:
                d_problems = problemsets[division_idx]
            problems_info.setdefault('division',
                                     OrderedDict())[division] = d_problems

            # Problem short names are prefixed with the division's initial,
            # which has to be unique across divisions.
            prefix = division[0].upper()
            assert prefix not in seen_prefixes
            seen_prefixes.add(prefix)
            for problem in d_problems:
                problem['short'] = prefix + problem['short']

            division_url = urllib.parse.urljoin(self.standings_url,
                                                division_match.group('url'))
            division_page = REQ.get(division_url)

            # Each table is paired with the text of the element closing
            # right before it, which is used as the list title.
            titled_tables = re.finditer(
                r'>(?P<title>[^<]*)</[^>]*>\s*(?P<html><table[^>]*>.*?</table>)',
                division_page, re.DOTALL)
            for titled in titled_tables:
                title = titled.group('title')
                table = parsed_table.ParsedTable(titled.group('html'))

                for cells in table:
                    row = OrderedDict()
                    problems = row.setdefault('problems', {})
                    solved = 0
                    problem_no = 0
                    for header, cell in cells.items():
                        header = header.replace('&nbsp', ' ').strip()
                        if not header:
                            continue

                        if not isinstance(cell, list):
                            if header == 'Score':
                                row['solving'] = int(cell.value)
                            else:
                                row[header.lower()] = cell.value.replace(
                                    '&nbsp', ' ').strip()
                            continue

                        # A list of cells holds the per-test marks of one
                        # problem; problems are numbered in column order.
                        status = ''.join(part.value for part in cell)
                        problem_no += 1
                        if not status:
                            continue
                        all_stars = re.match(r'^[\*]+$', status) is not None
                        partial = not all_stars
                        if all_stars:
                            solved += 1
                        problems[prefix + str(problem_no)] = {
                            'partial':
                            partial,
                            'result':
                            1000 / len(d_problems) * status.count('*') /
                            len(status),
                            'status':
                            status,
                        }

                    member = f'{row["name"]}, {row["country"]}'
                    row['member'] = member
                    row['division'] = division
                    row['list'] = title.strip().strip(':')
                    row['solved'] = {'solving': solved}
                    result[member] = row

        return {
            'result': result,
            'url': self.standings_url,
            'problems': problems_info,
            'hidden_fields': ['list'],
        }
Пример #26
0
    def get_standings(self, users=None, statistics=None):
        """Parse the standings page at ``self.standings_url`` into a standings dict.

        The page must contain either a ``<table class="standings">`` or a plain
        ``border="1"`` table (when several of the latter exist, the last one is
        used). Header names / cell classes are normalised through ``c_mapping``.
        A column whose class is ``prob``, or which is not in ``c_mapping`` and
        whose header has no Cyrillic letters, is treated as a problem column
        and receives the next letter starting from ``A``.

        Args:
            users: Not used by this parser.
            statistics: Optional mapping from member key to previously stored
                data. Only its ``country`` entry is read; the geocoding
                fallback runs only for members that have an entry here.

        Returns:
            ``{}`` when ``self.standings_url`` is not set. Otherwise a dict
            with ``result``, ``problems`` and ``hidden_fields``, plus
            ``timing_statistic_delta`` when ``statistics`` is empty and at
            least one row was parsed.

        Raises:
            ExceptionParseStandings: If no standings table is found.
        """
        geolocator = Nominatim(user_agent="clist.by")
        geocode_func = partial(geolocator.geocode, timeout=10)
        geocode = RateLimiter(geocode_func, min_delay_seconds=1, max_retries=3)

        season = self.key.split('.')[0]

        if not self.standings_url:
            return {}

        page = REQ.get(self.standings_url)
        # Rewrite <tl ...> / </tl> tags as <tr ...> / </tr>.
        page = re.sub('<(/?)tl([^>]*)>', r'<\1tr\2>', page)

        regex = '<table[^>]*class="standings"[^>]*>.*?</table>'
        match = re.search(regex, page, re.DOTALL)
        if not match:
            regex = r'<table\s*(?:align="center"\s*)?border="1"\s*(?:align="center"\s*)?>.*?</table>'
            matches = re.finditer(regex, page, re.DOTALL)
            # Exhaust the iterator so that ``match`` ends up as the LAST table
            # (or stays None when there is none).
            for match in matches:
                pass
        if not match:
            raise ExceptionParseStandings('not found standings table')
        html_table = match.group(0)

        # Header text / cell class (lower-cased) -> row field name.
        # NOTE(review): the '******' targets look like masked text rather than
        # real field names - confirm the intended values.
        c_mapping = {
            'place': 'place',
            'место': 'place',
            'user': '******',
            'team': 'name',
            'участник': 'name',
            'solved': 'solved',
            'total': 'solved',
            'имя': 'first_name',
            'фамилия': 'last_name',
            'отчество': 'middle_name',
            'логин': 'login',
            'login': '******',
            'класс': 'class',
            'город': 'city',
            'субъект российской федерации (для иностранных участников - государство)': 'city',
            'балл': 'solving',
            'сумма': 'solving',
            'баллы': 'solving',
            'score': 'solving',
            'sum': 'solving',
            'диплом': 'diploma',
            'степень диплома': 'diploma',
            'номер диплома': 'diploma_number',
            'страна': 'country',
            'школа (сокр.)': 'school',
            'школа': 'school',
            'учебное зачедение, класс': 'school',
            'регион/статус': 'region',
            'регион': 'region',
            'имя в таблице': 'handle',
            'uid': 'uid',
        }

        table = parsed_table.ParsedTable(html_table, strip_empty_columns=True)

        # Geocoding results are cached on disk between runs.
        locations = None
        if os.path.exists(self.LOCATION_CACHE_FILE):
            with open(self.LOCATION_CACHE_FILE, 'r') as fo:
                locations = yaml.safe_load(fo)
        if locations is None:
            locations = {}

        def get_location(loc_info):
            """Return the cached/geocoded ``{'ru', 'en'}`` address or None."""
            loc_info = re.sub(r'[.,\s]+', ' ', loc_info).strip().lower()
            if loc_info not in locations:
                try:
                    ru = geocode(loc_info, language='ru')
                    en = geocode(loc_info, language='en')
                    if ru is None and en is None:
                        locations[loc_info] = None
                    else:
                        locations[loc_info] = {
                            'ru': ru.address,
                            'en': en.address
                        }
                except Exception:
                    # Best effort: a failed lookup (including the case where
                    # only one language resolved) is left uncached.
                    pass

            return locations.get(loc_info)

        def get_country(address):
            """Return the last comma-separated part of the English address."""
            *_, country = map(str.strip, address['en'].split(','))
            if country.startswith('The '):
                country = country[4:]
            return country

        try:
            result = {}
            problems_info = OrderedDict()
            has_bold = False
            # State for computing places when the table has no place column:
            # ``last`` is the previous score, ``place`` the current place and
            # ``placing[place]`` the index of the last row sharing that place.
            last, place, placing = None, None, {}
            for idx, r in enumerate(tqdm.tqdm(table, total=len(table)),
                                    start=1):
                row = OrderedDict()
                problems = row.setdefault('problems', {})
                letter = chr(ord('A') - 1)
                solved = 0
                for k, v in list(r.items()):
                    is_russian = bool(re.search('[а-яА-Я]', k))
                    # Prefer the first CSS class of the cell (without its
                    # 'st_' prefix); fall back to the lower-cased header.
                    c = v.attrs.get('class')
                    c = c.split()[0] if c else k.lower()
                    if c and c.startswith('st_'):
                        c = c[3:].lower()
                    if c in ['prob'] or c not in c_mapping and not is_russian:
                        # Problem column.
                        letter = chr(ord(letter) + 1)
                        problem_info = problems_info.setdefault(
                            letter, {
                                'short': letter,
                                'full_score': 100,
                            })
                        if letter.lower() != k.lower():
                            problem_info['name'] = k
                        if 'title' in v.attrs:
                            problem_info['name'] = v.attrs['title']

                        if v.value != DOT and v.value:
                            p = problems.setdefault(letter, {})

                            # A <b> child marks the result as a full solution.
                            if v.column.node.xpath('b'):
                                p['partial'] = False
                                has_bold = True

                            v = v.value
                            if SPACE in v:
                                # "<score> (<n>)" -> attempts, otherwise the
                                # second part is stored as the time.
                                v, t = v.split(SPACE, 1)
                                t = t.strip()
                                m = re.match(r'^\((?P<val>[0-9]+)\)$', t)
                                if m:
                                    t = int(m.group('val'))
                                    if t > 1:
                                        p['attempts'] = t - 1
                                else:
                                    p['time'] = t

                            try:
                                score = float(v)
                                p['result'] = v
                                p['partial'] = score < problem_info[
                                    'full_score']
                            except ValueError:
                                pass
                            if 'partial' in p and not p['partial']:
                                solved += 1
                    else:
                        # Informational column.
                        v = v.value.strip()
                        if not v or v == '-':
                            continue
                        c = c_mapping.get(c, c).lower()
                        row[c] = v

                        if c == 'diploma':
                            row['_medal_title_field'] = 'diploma'
                            v = v.lower().split()[0]
                            if re.search('(^в.к|^вне)', v):
                                continue
                            if v in ['gold', 'i', '1'] or v.startswith('перв'):
                                row['medal'] = 'gold'
                            elif v in ['silver', 'ii', '2'
                                       ] or v.startswith('втор'):
                                row['medal'] = 'silver'
                            elif v in ['bronze', 'iii', '3'
                                       ] or v.startswith('трет'):
                                row['medal'] = 'bronze'
                            else:
                                row['medal'] = 'honorable'

                if 'solving' not in row:
                    if 'solved' in row:
                        row['solving'] = row.pop('solved')
                    else:
                        # Rows without any score are dropped.
                        continue
                row['solved'] = {'solving': solved}

                if 'place' not in row:
                    # Places are only computed when the computation started at
                    # the very first row; otherwise the row is dropped.
                    if place is None and idx != 1:
                        continue
                    if row['solving'] != last:
                        place = idx
                        last = row['solving']
                    placing[place] = idx
                    row['place'] = place

                if 'name' not in row:
                    if 'first_name' in row and 'last_name' in row:
                        row['name'] = row['last_name'] + ' ' + row['first_name']
                    elif 'first_name' in row and 'last_name' not in row:
                        row['name'] = row.pop('first_name')

                # Member key: login, else the name (suffixed with the season
                # when it contains a space), else a synthetic "<pk>-<idx>".
                if 'login' in row:
                    row['member'] = row['login']
                    if 'name' in row:
                        row['_name_instead_key'] = True
                elif 'name' in row:
                    name = row['name']
                    if ' ' in name:
                        row['member'] = name + ' ' + season
                    else:
                        row.pop('name')
                        row['member'] = name
                else:
                    row['member'] = f'{self.pk}-{idx}'

                addition = (statistics or {}).get(row['member'], {})
                if addition:
                    country = addition.get('country')
                    if country:
                        row.setdefault('country', country)
                    if 'country' not in row:
                        locs = []
                        if 'city' in row:
                            locs.append(row['city'])
                        if 'extra' in row:
                            extra = row['extra']
                            # ``flags`` must be passed by keyword: the fourth
                            # positional argument of re.sub is ``count``.
                            extra = re.sub(
                                r'\s*(Не\s*РФ|Not\s*RF|Участник\s*вне\s*конкурса):\s*',
                                ' ', extra, flags=re.IGNORECASE)
                            extra = re.sub('<[^>]*>', '', extra)
                            locs.extend(re.split('[,:]', extra))
                        for loc in locs:
                            loc = re.sub(r'\s*[0-9]+\s*', ' ', loc)
                            loc = loc.strip()

                            address = get_location(loc)
                            if address:
                                country = get_country(address)
                                row['country'] = country
                                break

                result[row['member']] = row
            if placing:
                # Turn computed places into "N" or "N-M" for shared places.
                for row in result.values():
                    place = row['place']
                    last = placing[place]
                    row['place'] = str(
                        place) if place == last else f'{place}-{last}'

            if has_bold:
                # When bold marks full solutions, every other scored result
                # without a verdict is partial.
                for row in result.values():
                    for p in row.get('problems').values():
                        if 'partial' not in p and 'result' in p:
                            p['partial'] = True
        finally:
            # Always persist the geocoding cache, even if parsing failed.
            with open(self.LOCATION_CACHE_FILE, 'wb') as fo:
                yaml.dump(locations, fo, encoding='utf8', allow_unicode=True)

        standings = {
            'result':
            result,
            'problems':
            list(problems_info.values()),
            'hidden_fields': [
                'extra',
                'first_name',
                'last_name',
                'middle_name',
                'class',
                'city',
                'country',
                'diploma',
                'school',
                'login',
                'region',
                'uid',
                'handle',
                'diploma_number',
            ],
        }
        if not statistics and result:
            standings['timing_statistic_delta'] = timedelta(minutes=5)

        return standings
Пример #27
0
        def get_results(standings_url, division_data):
            """Fetch one division's standings and return rows keyed by member.

            ``division_data['format'] == 'json'`` selects the JSON layout
            (``teams`` plus optional ``problems`` / ``tournaments`` score
            lists); anything else is parsed as an HTML table selected by
            ``division_data['xpath']`` (default ``//table//tr``). When no
            ``place`` column was seen, places are assigned by descending
            ``solving`` with ties sharing a place.
            """
            page = REQ.get(standings_url)

            if division_data.get('format') == 'json':
                data = json.loads(page)
                if 'problems' in data:
                    scores_field = 'problem'
                elif 'tournaments' in data:
                    scores_field = 'tournament'
                else:
                    scores_field = None

                if scores_field:
                    # Score id -> short column name; ids not listed here use
                    # the part after the last ':'.
                    short_names = {'submission': 'T', 'request': 'R'}
                    scores_mapping = OrderedDict()
                    for score in data[f'{scores_field}s']:
                        score_id = str(score[f'{scores_field}Id'])
                        if score_id in short_names:
                            scores_mapping[score_id] = short_names[score_id]
                        else:
                            scores_mapping[score_id] = score_id.split(':')[-1]

                table = []
                for team in data['teams']:
                    entry = OrderedDict()
                    entry['name'] = team['team']['teamName']
                    entry['solving'] = team['score']
                    entry['country'] = team['team']['customData']['country']
                    if scores_field:
                        per_score = OrderedDict()
                        entry['_scores'] = per_score
                        team_scores = team[f'{scores_field}s']
                        for field, out in scores_mapping.items():
                            if field not in team_scores:
                                continue
                            score_info = team_scores.get(field, {})
                            per_score[out] = as_number(score_info.get('score'))
                    table.append(entry)
            else:
                header_mapping = {
                    'Rank': 'place',
                    '': 'place',
                    'Score': 'solving',
                    'score': 'solving',
                    'Total Score': 'solving',
                    'Team': 'name',
                    'name': 'name',
                    'score + unspent LAM': 'unspent_lam',
                }
                rows_xpath = division_data.get('xpath', '//table//tr')
                table = parsed_table.ParsedTable(html=page, header_mapping=header_mapping, xpath=rows_xpath)

            season = self.get_season()
            ret = {}
            was_place = False
            numeric_fields = {'place', 'solving'}
            for r in table:
                row = OrderedDict()
                for field, cell in r.items():
                    if field == 'place':
                        was_place = True
                    if isinstance(cell, parsed_table.ParsedTableValue):
                        cell = cell.value
                    if field == 'name':
                        row['name'] = cell
                        row['member'] = f'{cell} {season}'
                    elif field in numeric_fields:
                        row[field] = as_number(cell)
                    else:
                        row[field] = cell
                ret[row['member']] = row

            if not was_place:
                # No explicit ranking: rank by score, equal scores share the
                # place of the first row in the tie.
                ranked = sorted(ret.values(), key=lambda r: r['solving'], reverse=True)
                current_place = None
                previous_score = None
                for position, row in enumerate(ranked, start=1):
                    if row['solving'] != previous_score:
                        previous_score = row['solving']
                        current_place = position
                    row['place'] = current_place
            return ret
Пример #28
0
    def get_standings(self, users=None, statistics=None):
        """Build the standings dict from the HTML table at ``self.standings_url``.

        The ``.xml`` twin of the standings URL is fetched first and passed to
        ``parse_xml``; when that request fails with ``FailOnGetResponse`` no
        XML data is used. Columns are classified by the first word of their
        header: ``Total``/``=`` is the score, headers of at most three
        characters are problems, ``Time`` is the penalty, ``place``/``rank``
        the place, and headers containing ``team``/``name`` the participant.

        Returns:
            A dict with ``result``, ``url``, ``problems``,
            ``problems_time_format`` and ``hidden_fields``.
        """
        start = self.start_time
        first_year = start.year if start.month >= 9 else start.year - 1
        season = f'{first_year}-{first_year + 1}'

        result = {}
        problems_info = OrderedDict()

        try:
            xml_url = self.standings_url.replace('.html', '.xml')
            xml_result = parse_xml(REQ.get(xml_url, detect_charsets=False))
        except FailOnGetResponse:
            xml_result = {}

        page = REQ.get(self.standings_url)

        table_match = re.search(
            '<table[^>]*class="standings"[^>]*>.*?</table>', page, re.DOTALL)
        if not table_match:
            # Fall back to the first table once "wrapper" tables' opening
            # tags have been removed.
            page = re.sub('<table[^>]*wrapper[^>]*>', '', page)
            table_match = re.search('<table[^>]*>.*?</table>', page, re.DOTALL)
        table = parsed_table.ParsedTable(table_match.group(0))

        standings_info = self.info.get('standings', {})
        university_regex = standings_info.get('1st_u', {}).get('regex')

        # One-letter marks (Cyrillic or Latin) -> medal name.
        medal_symbols = (
            (('З', 'G'), 'gold'),
            (('С', 'S'), 'silver'),
            (('Б', 'B'), 'bronze'),
        )

        for r in table:
            row = {}
            problems = {}
            row['problems'] = problems
            for header, cell in list(r.items()):
                key = header.split()[0]
                lowered = key.lower()
                if key in ('Total', '='):
                    row['solving'] = int(cell.value)
                elif len(key) <= 3:
                    info = {'short': key}
                    if 'title' in cell.attrs:
                        info['name'] = cell.attrs['title']
                    problems_info[key] = info

                    # Only cells carrying a verdict sign count as attempts.
                    if any(sign in cell.value for sign in ('-', '+', '?')):
                        attempt = problems.setdefault(key, {})
                        if ' ' in cell.value:
                            verdict, solved_at = cell.value.split()
                            attempt['time'] = solved_at
                        else:
                            verdict = cell.value
                        attempt['result'] = verdict

                        if cell.column.node.xpath(
                                './/*[@class="first-to-solve"]'):
                            attempt['first_ac'] = True
                elif key == 'Time':
                    row['penalty'] = int(cell.value)
                elif lowered in ['place', 'rank']:
                    row['place'] = cell.value.strip('.')
                elif 'team' in lowered or 'name' in lowered:
                    if xml_result:
                        problems.update(xml_result[cell.value])
                    row['member'] = cell.value + ' ' + season
                    row['name'] = cell.value
                else:
                    row[key] = cell.value

            # The first non-empty diploma/medal field decides the medal; it is
            # removed from the row and replaced by 'medal' when recognised.
            for field in ('diploma', 'medal'):
                mark = row.pop(field, None) or row.pop(field.title(), None)
                if not mark:
                    continue
                for symbols, medal_name in medal_symbols:
                    if mark in symbols:
                        row['medal'] = medal_name
                        break
                break

            if university_regex:
                found = re.search(university_regex, row['name'])
                if found:
                    row['university'] = found.group('key').strip()
            result[row['member']] = row

        return {
            'result': result,
            'url': self.standings_url,
            'problems': list(problems_info.values()),
            'problems_time_format': '{M}:{s:02d}',
            'hidden_fields': ['university'],
        }
Пример #29
0
    def get_standings(self, users=None, statistics=None):
        """Parse the first HTML table of the standings page into a standings dict.

        Columns whose header carries a ``_top_column`` attribute are treated
        as problem (sub)columns and numbered ``0, 1, ...`` in table order.
        ``#`` is the place, ``Name`` the participant and ``Total...`` the
        score; every other column is stored under ``row['_info']``.
        Rows without any non-zero problem result are skipped, as are repeated
        names.

        Args:
            users: Not used by this parser.
            statistics: Not used by this parser.

        Returns:
            A dict with ``result`` and ``problems``; for contests whose name
            starts with "final round" an ``options`` entry with one gold, one
            silver and one bronze medal is added.

        Raises:
            ExceptionParseStandings: If the standings URL is not set, if a
                non-final contest ended more than three days ago, or if the
                page contains no table.
        """
        if not self.standings_url:
            raise ExceptionParseStandings('Not set standings url')
        is_final = self.name.lower().startswith('final round')
        now = datetime.now(pytz.utc)
        if not is_final and self.end_time + timedelta(days=3) < now:
            raise ExceptionParseStandings('Too late')

        page = REQ.get(self.standings_url)

        table_match = re.search('<table[^>]*>.*?</table>', page,
                                re.MULTILINE | re.DOTALL)
        if not table_match:
            # Report a missing table as a parse error instead of crashing
            # with AttributeError on ``None.group``.
            raise ExceptionParseStandings('Not found standings table')
        html_table = table_match.group(0)
        table = parsed_table.ParsedTable(html_table,
                                         as_list=True,
                                         ignore_wrong_header_number=False,
                                         ignore_display_none=True)

        problems_info = OrderedDict()

        result = {}
        season = self.get_season()
        advanced = False
        for r in table:
            if isinstance(r, parsed_table.ParsedTableRow):
                # ParsedTableRow items are never participants. The
                # "qualification threshold" one splits the table: everyone
                # parsed so far advanced, everyone after it did not.
                if re.search(r'qualification\s*threshold', r.columns[0].value,
                             re.I):
                    advanced = True
                    for row in result.values():
                        row['advanced'] = True
                continue
            row = OrderedDict()
            problems = row.setdefault('problems', {})
            if advanced:
                row['advanced'] = False
            pid = 0
            for k, v in r:
                if k == '#':
                    row['place'] = v.value
                elif k == 'Name':
                    row['name'] = v.value
                elif k.startswith('Total'):
                    row['solving'] = v.value
                elif '_top_column' in v.header.attrs:
                    problem_key = str(pid)
                    if problem_key not in problems_info:
                        name = v.header.attrs['_top_column'].value
                        p_info = {'code': problem_key}
                        # "<name> (<score>)" or "<name> <score>", where the
                        # score has at least two digits.
                        p_info_regex = r'^(?P<name>.*)\s+\(?(?P<score>[0-9]{2,})\)?$'
                        match = re.search(p_info_regex, name)
                        if match:
                            name = match.group('name').strip()
                        match = re.search(p_info_regex, k)
                        if match:
                            p_info['subname'] = match.group('name').strip()
                            p_info['full_score'] = int(match.group('score'))
                        p_info['name'] = name
                        href = v.header.node.xpath('a/@href')
                        if href:
                            p_info['suburl'] = href[0]
                            p_info['url'] = href[0]
                        problems_info[problem_key] = p_info

                    if v.value:
                        try:
                            val = float(v.value)
                            if val:
                                p = problems.setdefault(problem_key, {})
                                p['result'] = v.value

                                full_score = problems_info[problem_key].get(
                                    'full_score')
                                if full_score is not None:
                                    p['partial'] = val < full_score
                                else:
                                    # Full score unknown: use the cell colour.
                                    # A "lightgreen" cell is a full solution,
                                    # so its value becomes the full score.
                                    style = v.attrs.get('style')
                                    if style:
                                        if 'yellow' in style:
                                            p['partial'] = True
                                        elif 'lightgreen' in style:
                                            p['partial'] = False
                                            problems_info[problem_key][
                                                'full_score'] = int(
                                                    round(val, 0))
                        except ValueError:
                            # Non-numeric cell: no result is recorded.
                            pass
                    pid += 1
                else:
                    row.setdefault('_info', {})[k] = v.value

            if not problems:
                continue

            handle = row['name'] + ' ' + season
            row['member'] = handle
            if handle in result:
                # Keep the first occurrence of a repeated name.
                continue
            result[handle] = row

        standings = {
            'result': result,
            'problems': list(problems_info.values()),
        }

        if is_final:
            standings['options'] = {
                'medals': [{
                    'name': k,
                    'count': 1
                } for k in ('gold', 'silver', 'bronze')]
            }

        return standings
Пример #30
0
    def get_standings(self, users=None, statistics=None):
        """Parse the ``<table class="standings">`` page into a standings dict.

        Cells are classified by their first CSS class: ``problem`` and
        ``ioiprob`` cells are problem results, every other class becomes a row
        field (renamed through ``mapping_key``). For rows without a
        ``penalty`` field, a trailing "(...)" in the name is cut off and
        geocoded to fill ``country`` / ``city``; geocoding results are cached
        in ``self.LOCATION_CACHE_FILE``, which is rewritten even when parsing
        fails.

        Returns:
            ``{}`` when ``self.standings_url`` is not set, otherwise a dict
            with ``result`` and ``problems``.
        """
        geolocator = Nominatim(user_agent="clist.by")
        geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1, max_retries=3)

        start = self.start_time
        first_year = start.year if start.month >= 9 else start.year - 1
        season = f'{first_year}-{first_year + 1}'

        if not self.standings_url:
            return {}

        try:
            xml_url = self.standings_url.replace('.html', '.xml')
            xml_result = parse_xml(REQ.get(xml_url, detect_charsets=False))
        except FailOnGetResponse:
            xml_result = {}

        page = REQ.get(self.standings_url)

        table_match = re.search('<table[^>]*class="standings"[^>]*>.*?</table>', page, re.DOTALL)
        table = parsed_table.ParsedTable(table_match.group(0))
        mapping_key = {
            'rank': 'place',
            'rankl': 'place',
            'party': 'name',
            'solved': 'solving',
        }
        # One-letter marks (Cyrillic or Latin) -> medal name.
        medal_symbols = (
            (('З', 'G'), 'gold'),
            (('С', 'S'), 'silver'),
            (('Б', 'B'), 'bronze'),
        )

        locations = None
        if os.path.exists(self.LOCATION_CACHE_FILE):
            with open(self.LOCATION_CACHE_FILE, 'r') as fo:
                locations = yaml.safe_load(fo)
        if locations is None:
            locations = {}

        try:
            result = {}
            problems_info = OrderedDict()
            for r in tqdm.tqdm(table):
                row = OrderedDict()
                problems = {}
                row['problems'] = problems
                for header, cell in list(r.items()):
                    css = cell.attrs['class'].split()[0]
                    if css in ('problem', 'ioiprob'):
                        problems_info[header] = {'short': header, 'name': cell.attrs['title']}
                        if cell.value == DOT:
                            continue
                        attempt = problems.setdefault(header, {})

                        if cell.column.node.xpath('.//*[@class="first-to-solve"]'):
                            attempt['first_ac'] = True

                        # An underlined value in an "ioiprob" cell is partial.
                        if cell.column.node.xpath('self::td[@class="ioiprob"]/u'):
                            attempt['partial'] = True

                        score = cell.value
                        if SPACE in score:
                            score, solved_at = score.split(SPACE, 1)
                            attempt['time'] = solved_at
                        attempt['result'] = score
                    else:
                        field = mapping_key.get(css, css)
                        row[field] = cell.value
                        if xml_result and field == 'name':
                            problems.update(xml_result[cell.value])

                if 'penalty' not in row:
                    suffix = re.search(r'\s*\((?P<info>[^\)]*)\)\s*$', row['name'])
                    if suffix:
                        row['name'] = row['name'][:suffix.start()]
                        group_info = suffix.group('info')
                        if u'класс' in group_info:
                            degree, loc_info = map(str.strip, group_info.split(',', 1))
                            row['degree'] = degree
                        else:
                            loc_info = group_info

                        loc_info = re.sub(r'[.,\s]+', ' ', loc_info).strip().lower()
                        if loc_info not in locations:
                            try:
                                ru_address = geocode(loc_info, language='ru').address
                                en_address = geocode(loc_info, language='en').address
                                looked_up = {'ru': ru_address, 'en': en_address}
                            except Exception:
                                # Any failure is cached as "unknown location".
                                looked_up = None
                            locations[loc_info] = looked_up
                        address = locations[loc_info]
                        if address:
                            # The country is the last comma-separated part.
                            country = address['en'].split(',')[-1].strip()
                            if country.startswith('The '):
                                country = country[4:]
                            row['country'] = country
                            if ', ' in address['ru']:
                                row['city'] = address['ru'].split(',')[0].strip()

                    full_solutions = sum(1 for p in problems.values() if p['result'] == '100')
                    row['solved'] = {'solving': full_solutions}
                elif re.match('^[0-9]+$', row['penalty']):
                    row['penalty'] = int(row['penalty'])

                # The first non-empty diploma/medal field decides the medal;
                # it is removed and replaced by 'medal' when recognised.
                for field in ('diploma', 'medal'):
                    mark = row.pop(field, None) or row.pop(field.title(), None)
                    if not mark:
                        continue
                    for symbols, medal_name in medal_symbols:
                        if mark in symbols:
                            row['medal'] = medal_name
                            break
                    break
                row['member'] = row['name'] + ' ' + season
                result[row['member']] = row
        finally:
            with open(self.LOCATION_CACHE_FILE, 'wb') as fo:
                yaml.dump(locations, fo, encoding='utf8', allow_unicode=True)

        return {
            'result': result,
            'problems': list(problems_info.values()),
        }