Exemplo n.º 1
0
def get_regions():
    """Download the country dropdown and upsert its entries into ``regions``.

    Every ``<option>`` of the fetched fragment is stored under its integer
    value with ``type`` False. Five hard-coded entries (FIFA and four
    continents) are then stored with ``type`` True. Only ``$setOnInsert`` is
    used, so documents that already exist are not modified.

    Returns:
        False when the response status is not 200, otherwise None.
    """
    response = requests.get('{0}/site/dropDownLaender'.format(SITE),
                            headers=HEADERS)
    print(response.url)

    if response.status_code != 200:
        wait()
        return False

    # The body is evaluated as a Python literal before being parsed as HTML
    # (presumably a quoted string holding the <option> markup - TODO confirm).
    fragment = html.fromstring(literal_eval(response.text))
    option_values = fragment.xpath('option/@value')
    option_labels = fragment.xpath('option/text()')

    for code, country in zip(option_values, option_labels):
        regions.update_one(
            {'region': int(code)},
            {'$setOnInsert': {'name': country, 'type': False}},
            upsert=True)

    # Fixed entries that do not come from the dropdown; keyed by a string
    # instead of an integer.
    fixed_entries = (
        ('fifa', 'FIFA'),
        ('europa', 'Europe'),
        ('asien', 'Asia'),
        ('amerika', 'America'),
        ('afrika', 'Africa'),
    )
    for slug, title in fixed_entries:
        regions.update_one(
            {'region': slug},
            {'$setOnInsert': {'name': title, 'type': True}},
            upsert=True)

    wait()
Exemplo n.º 2
0
def get_seasons(tournament_id, overwrite=False):
    """Scrape a tournament page and upsert its seasons and sibling tournaments.

    Args:
        tournament_id: Value of ``tournamentId`` identifying the tournament.
        overwrite: When False, nothing is fetched if a season for this
            tournament is already stored.

    Returns:
        True when seasons already exist and ``overwrite`` is False; False when
        the tournament is unknown or the page does not answer with HTTP 200;
        otherwise None.
    """
    if seasons.find_one({'tournamentId': tournament_id}) and not overwrite:
        print('Seasons already exist')
        return True

    tournament = tournaments.find_one({'tournamentId': tournament_id})
    if tournament is None:
        # Without the stored tournament the page URL cannot be built.
        return False

    page = SITE+'/Regions/{regionId}/Tournaments/{tournamentId}'.format(**tournament)
    r = requests.get(page, headers=HEADERS)
    print(r.url)

    if r.status_code != 200:
        # A request was made, so call wait() here just as the success path does.
        wait()
        return False

    content = html.fromstring(r.text)
    season_links = content.xpath('//select[@id="seasons"]/option/@value')
    season_names = content.xpath('//select[@id="seasons"]/option/text()')

    for season_link, season_name in zip(season_links, season_names):
        # The season id is the last path segment of the option's link.
        seasons.update_one({'seasonId': int(season_link.split('/')[-1])},
                           {'$setOnInsert': {
                               'name': season_name,
                               'regionId': tournament['regionId'],
                               'tournamentId': tournament['tournamentId']}},
                           upsert=True)

    # Sometimes the tournament doesn't have a name in the main menu - use the title on the page
    if tournament['name'] == '':
        titles = content.xpath('//h1[@class="tournament-header"]/text()')
        if titles:
            # '$set' (not '$se') is the operator that actually writes the field.
            tournaments.update_one({'tournamentId': tournament['tournamentId']},
                                   {'$set': {'name': titles[0].strip()}})

    # Some tournaments don't show up in the main menu - take a fuller list from the dropdown menu
    tournament_links = content.xpath('//select[@id="tournaments"]/option/@value')
    tournament_names = content.xpath('//select[@id="tournaments"]/option/text()')

    for tournament_link, tournament_name in zip(tournament_links, tournament_names):
        tournaments.update_one({'tournamentId': int(tournament_link.split('/')[-1])},
                               {'$setOnInsert': {
                                   'name': tournament_name,
                                   'regionId': tournament['regionId']}},
                               upsert=True)

    wait()
Exemplo n.º 3
0
def get_tournaments(region_id):
    """Scrape the competition list of one region into ``tournaments``/``teams``.

    The region's ``type`` flag selects both the URL and which part of the page
    holds the competitions. For ``type`` False regions the side box is stored
    as national teams instead.

    Args:
        region_id: Value of the ``region`` field identifying the region.

    Returns:
        False when the region is unknown or the response status is not 200,
        otherwise None.
    """
    region = regions.find_one({'region': region_id})
    if region is None:
        return False

    template = {
        True: '{0}/wettbewerbe/{region}',
        False: '{0}/wettbewerbe/national/wettbewerbe/{region}',
    }[region['type']]

    response = requests.get(template.format(SITE, **region), headers=HEADERS)
    print(response.url, region['name'])

    if response.status_code != 200:
        wait()
        return False

    page = html.fromstring(response.text)
    main = page.xpath(
        '//div/table/tbody/tr/td[@class="hauptlink"]/table/tr/td[2]')
    side = page.xpath('//div[@class="box"]/div/li')

    # type True: competitions are in the side box and no teams are stored.
    # type False: competitions are in the main table, the side box holds teams.
    if region['type']:
        competition_rows, team_rows = side, []
    else:
        competition_rows, team_rows = main, side

    # Competitions
    for row in competition_rows:
        slug = unquote(row.xpath('a/@href')[0].split('/')[-1])
        title = row.xpath('a/@title')[0]
        tournaments.update_one(
            {'tournament': slug},
            {'$setOnInsert': {'name': title, 'region': region['region']}},
            upsert=True)

    # National teams
    for row in team_rows:
        team_id = int(unquote(row.xpath('a/@href')[0].split('/')[-1]))
        title = row.xpath('a/@title')[0]
        teams.update_one(
            {'team': team_id},
            {'$setOnInsert': {
                'name': title,
                'region': region['region'],
                'national': True,
            }},
            upsert=True)

    wait()
Exemplo n.º 4
0
def get_stages(season_id, overwrite=False):
    """Scrape a season page and upsert its stages into ``stages``.

    Args:
        season_id: Value of ``seasonId`` identifying the season.
        overwrite: When False, nothing is fetched if a stage for this season
            is already stored.

    Returns:
        True when stages already exist and ``overwrite`` is False; False when
        the season is unknown, the page does not answer with HTTP 200, or no
        stage can be found on the page; otherwise None.
    """
    if stages.find_one({'seasonId': season_id}) and not overwrite:
        print('Stages already exist')
        return True

    season = seasons.find_one({'seasonId': season_id})
    if season is None:
        # Without the stored season the page URL cannot be built.
        return False

    page = SITE+'/Regions/{regionId}/Tournaments/{tournamentId}/Seasons/{seasonId}'.format(**season)
    r = requests.get(page, headers=HEADERS)
    print(r.url)

    if r.status_code != 200:
        # A request was made, so call wait() here just as the success path does.
        wait()
        return False

    content = html.fromstring(r.text)
    stage_links = content.xpath("//select[@id='stages']/option/@value")
    stage_names = content.xpath("//select[@id='stages']/option/text()")

    for stage_link, stage_name in zip(stage_links, stage_names):
        # The stage id is the last path segment of the option's link.
        stages.update_one({'stageId': int(stage_link.split('/')[-1])},
                          {'$setOnInsert': {
                              'name': stage_name,
                              'regionId': season['regionId'],
                              'tournamentId': season['tournamentId'],
                              'seasonId': season['seasonId']}},
                          upsert=True)

    if not stage_links:
        # No stage dropdown: take the single stage id from the "Fixtures"
        # navigation link (third path segment from the end) and name it after
        # the page heading.
        fixture_links = content.xpath("//div[@id='sub-navigation']/ul/li/a[text()='Fixtures']/@href")
        headings = content.xpath('//h1/text()')
        if not fixture_links or not headings:
            print('No stages found')
            wait()
            return False

        stages.update_one({'stageId': int(fixture_links[0].split("/")[-3])},
                          {'$setOnInsert': {
                              'name': headings[0].strip(),
                              'regionId': season['regionId'],
                              'tournamentId': season['tournamentId'],
                              'seasonId': season['seasonId']}},
                          upsert=True)

    wait()
Exemplo n.º 5
0
def get_seasons(tournament_id):
    """Scrape a competition start page and upsert its seasons into ``seasons``.

    On the first visit the ``cup`` flag of the tournament is also recorded,
    derived from the URL the request ended up at.

    Args:
        tournament_id: Value of the ``tournament`` field identifying the
            competition.

    Returns:
        False when the tournament is unknown or the response status is not
        200, otherwise None.
    """
    tournament = tournaments.find_one({'tournament': tournament_id})
    if tournament is None:
        return False

    url = '{0}/wettbewerb/startseite/wettbewerb/{tournament}'.format(
        SITE, **tournament)

    r = requests.get(url, headers=HEADERS)
    print(r.url, tournament['name'])

    if r.status_code != 200:
        wait()
        return False

    if 'cup' not in tournament:
        # The second-to-last segment of the final URL is 'pokalwettbewerb'
        # for cup competitions.
        tournament['cup'] = r.url.split('/')[-2] == 'pokalwettbewerb'
        # Collection.save() no longer exists in PyMongo 4; write only the new
        # field to the document that was just loaded.
        tournaments.update_one({'_id': tournament['_id']},
                               {'$set': {'cup': tournament['cup']}})

    content = html.fromstring(r.text)
    for row in content.xpath(
            '//div[@class="inline-select"]/select[@name="saison_id"]/option'):
        seasons.update_one(
            {
                'tournament': tournament['tournament'],
                'season': int(unquote(row.xpath('@value')[0]))
            }, {
                '$setOnInsert': {
                    'name': row.xpath('text()')[0],
                    'region': tournament['region']
                }
            },
            upsert=True)

    wait()
Exemplo n.º 6
0
def get_player(player_id, overwrite=False):
    """Scrape a player's profile page and store it in ``players``.

    Args:
        player_id: Value of ``playerId``; also used to build the page URL.
        overwrite: When False, an already stored player is not fetched again.

    Returns:
        True when the player already exists and ``overwrite`` is False; False
        when the response status is not 200 or the request ended at a
        different URL than the one requested; otherwise None.
    """
    # Maps each label shown on the page to where its value lives (xpath
    # relative to the <dl>), the document key to store it under, and the
    # converter applied to the extracted text.
    keys = {
        'Name:': {'xpath': 'dd/text()', 'key': 'name', 'parse': str},
        'Full Name:': {'xpath': 'dd/text()', 'key': 'fullName', 'parse': str},
        'Current Team:': {'xpath': 'dd/a/@href', 'key': 'teamId', 'parse': parseTeam},
        'Shirt Number:': {'xpath': 'dd/text()', 'key': 'number', 'parse': int},
        'Positions:': {'xpath': 'dd/ul/li/text()', 'key': 'position', 'parse': str},
        'Age:': {'xpath': 'dd/i/text()', 'key': 'birthDate', 'parse': parseDate},
        'Height:': {'xpath': 'dd/text()', 'key': 'height', 'parse': parseHeight},
        'Weight:': {'xpath': 'dd/text()', 'key': 'weight', 'parse': parseWeight},
        'Nationality:': {'xpath': 'dd/span/text()', 'key': 'nationality', 'parse': str},
    }

    player = players.find_one({'playerId': player_id})
    if not player:
        player = {'playerId': player_id}
    elif not overwrite:
        print('Player already exists')
        return True

    page = SITE+'/Players/{0}'.format(player_id)
    r = requests.get(page, headers=HEADERS)
    print(r.url)

    if r.status_code != 200:
        wait()
        return False

    # A differing final URL means the request was redirected away from the
    # player page.
    if page != r.url:
        wait()
        return False

    content = html.fromstring(r.text)
    blocks = content.xpath("//div[@class='player-info']/div/div/dl")

    for block in blocks:
        titles = block.xpath('dt/text()')
        if not titles:
            # A <dl> without a label cannot be mapped to a field.
            continue
        title = titles[0]
        if title not in keys:
            print('Unexpected info: "{}"'.format(title))
            continue
        k = keys[title]
        # Join all non-blank text fragments into one comma-separated string.
        value = ', '.join([p.strip() for p in block.xpath(k['xpath']) if p.strip() != ''])
        player[k['key']] = k['parse'](value)

    # Collection.save() no longer exists in PyMongo 4; an upserting replace
    # covers both the new and the already stored player.
    players.replace_one({'playerId': player_id}, player, upsert=True)
    wait()
Exemplo n.º 7
0
def get_fixtures_for_date(d=None, overwrite=False):
    """Fetch the match feed for one day.

    The live-scores page is requested first to obtain the ``Model-Last-Mode``
    token, which is then sent as a header with the feed request.

    Args:
        d: Day to fetch. None means today (UTC); a ``datetime`` is formatted
            as ``YYYYMMDD``; a ``str`` or ``int`` is passed through unchanged.
        overwrite: Not used by the code visible in this function.

    Returns:
        False when ``d`` has an unsupported type, a request does not answer
        with HTTP 200, or the token cannot be found; otherwise None.
    """
    if d is None:
        params = {'d': datetime.strftime(datetime.utcnow(), '%Y%m%d')}
    elif isinstance(d, datetime):
        params = {'d': datetime.strftime(d, '%Y%m%d')}
    elif isinstance(d, (str, int)):
        params = {'d': d}
    else:
        print('Unknown date type')
        return False

    page = SITE+'/LiveScores/'
    r = requests.get(page, headers=HEADERS)
    print(r.url)

    if r.status_code != 200:
        wait()
        return False

    tokens = re.findall("'Model-Last-Mode': '([^']+)'", r.text)
    if not tokens:
        print('Model-Last-Mode not found')
        wait()
        return False

    model_last_mode = tokens[0]
    headers = HEADERS.copy()
    headers['Model-Last-Mode'] = model_last_mode
    headers['Referer'] = r.url
    headers['X-Requested-With'] = 'XMLHttpRequest'
    print(model_last_mode)
    wait()

    page = SITE+'/matchesfeed/'
    # Send the enriched headers built above; the original passed HEADERS here,
    # which left the token and Referer unused.
    r = requests.get(page, params=params, headers=headers, allow_redirects=False)
    print(r.url, r.status_code)
    print(r.text)

    if r.status_code != 200:
        wait()
        return False

    # Put "null" into empty array slots (",," / "[," / ",]") and turn single
    # quotes into double quotes so the feed parses as JSON.
    matchData = re.sub(r'([,[])(?=[,\]])', r'\1null', r.text)
    data = json.loads(matchData.replace("'", '"'))
    print(data)

    # NOTE(review): nothing visible here consumes these two values - the
    # function appears to be unfinished; confirm against the full file.
    stageData = data[1]
    matchData = data[2]
Exemplo n.º 8
0
def get_match(match_id, overwrite=False):
    """Scrape the live page of one match and store it in ``matches``.

    Three embedded data sources are tried in order: ``matchCentreData``,
    ``initialMatchDataForScrappers`` and finally the ``matchHeader.load(...)``
    call, which only yields header fields.

    Args:
        match_id: Value of ``matchId``; also used to build the page URL.
        overwrite: When False, an already stored match is not fetched again.

    Returns:
        True when the match was stored or already existed; False when the
        response status is not 200, the request ended at a different URL, or
        no match data could be found on the page.
    """
    if matches.find_one({'matchId': match_id}) and not overwrite:
        print('Match already exists')
        return True

    page = SITE+'/Matches/{0}/Live'.format(match_id)
    r = requests.get(page, headers=HEADERS)

    print(r.url)

    if r.status_code != 200:
        wait()
        return False

    if r.url != page:
        # The request did not end at the match page; record that on the
        # stored document.
        error = 'No page found'
        print(error)
        matches.update_one({'matchId': match_id}, {'$set': {'error': error}}, upsert=True)
        wait()
        return False

    content = unescape(r.text)

    matchId = re.findall("matchId = ([^;]+);", content)
    matchData = re.findall("matchCentreData = ([^;]+});", content)

    if matchData and matchData != ['null']:
        match = json.loads(matchData[0], strict=False)
        match['matchId'] = int(matchId[0])

    else:
        matchData = re.findall("initialMatchDataForScrappers = (.+]);", content, re.DOTALL)

        if matchData:
            # Put "null" into empty array slots and use double quotes so the
            # embedded literal parses as JSON.
            matchData = re.sub(r'([,[])(?=[,\]])', r'\1null', matchData[0].replace("'", '"'))
            matchData = json.loads(matchData, strict=False)

            matchHeader = matchData[0][0]
            matchEvents = matchData[0][1]
            matchLineup = matchData[0][2]

        else:
            # Raw string: the pattern is unchanged, but "\(" and "\)" are no
            # longer invalid escape sequences of the string literal.
            matchData = re.findall(r"matchHeader.load\(([^;]+)\r\n\);", content)
            if not matchData:
                print('No match data found')
                wait()
                return False

            matchData = re.sub(r'([,[])(?=[,\]])', r'\1null', matchData[0].replace("'", '"'))

            matchHeader = json.loads(matchData, strict=False)
            matchEvents = []
            matchLineup = []

        # Names for the header values, by position. A two-item list means the
        # value is stored under match[<side>][<field>].
        fieldHeader = [['home', 'teamId'], ['away', 'teamId'], ['home', 'name'], ['away', 'name'],
                       'startTime', 'startDate', 'statusCode', 'elapsed',
                       'htScore', 'ftScore', 'etScore', 'pkScore', 'score'
                       ]

        match = {'matchId': match_id, 'home': {'field': 'home'}, 'away': {'field': 'away'}}

        for k, v in zip(fieldHeader, matchHeader):
            if v:
                if isinstance(k, list):
                    match[k[0]][k[1]] = v
                else:
                    match[k] = v

        parseLineup(matchLineup, match)
        if matchEvents[:1]:
            parseEvents(matchEvents, match)

    tree = html.fromstring(r.text)
    link = tree.xpath("//div[@id='breadcrumb-nav']/a/@href")

    if link:
        # Each "/<Name>s/<number>" pair in the first breadcrumb link becomes a
        # "<name>Id" field: the last character is dropped and the rest
        # lower-cased.
        for key, val in re.findall(r'/(?P<key>\w+)/(?P<val>\d+)', link[0]):
            key = key[:-1].lower() + 'Id'
            match[key] = int(val)

    match['startDate'] = datetime.strptime(match['startDate'], '%m/%d/%Y %I:%M:%S %p')
    match['startTime'] = datetime.strptime(match['startTime'], '%m/%d/%Y %I:%M:%S %p')
    if 'timeStamp' in match:
        # Two timestamp layouts are accepted.
        try:
            match['timeStamp'] = datetime.strptime(match['timeStamp'], '%d/%m/%Y %H:%M:%S')
        except ValueError:
            match['timeStamp'] = datetime.strptime(match['timeStamp'], '%Y-%m-%d %H:%M:%S')
    matches.replace_one({'matchId': match_id}, match, upsert=True)

    wait()
    return True
Exemplo n.º 9
0
def _save_fixture_rows(rows, stage, overwrite):
    """Store each raw fixture row in ``matchheaders`` with the stage's ids."""
    for row in rows:
        # Feed rows are positional arrays; name the positions that are kept.
        match = {'matchId': row[0], 'statusCode': row[1], 'startDate': row[2], 'startTime': row[3],
                 'home': {'teamId': row[4], 'name': row[5], 'field': 'home'},
                 'away': {'teamId': row[7], 'name': row[8], 'field': 'away'},
                 'score': row[10], 'elapsed': row[14], 'result': row[15], 'international': row[16],
                 'hasKeyEvents': row[12], 'hasPreview': row[13], 'isOpta': row[17], 'isOtherOpta': row[19],
                 }

        if matchheaders.find_one({'matchId': match['matchId']}) and not overwrite:
            print('Match already exists')
            continue

        match['startDate'] = datetime.strptime(match['startDate'], '%A, %b %d %Y')
        kickoff = datetime.strptime(match['startTime'], '%H:%M')
        # startTime becomes a full datetime: the match day plus kick-off time.
        match['startTime'] = datetime.combine(match['startDate'].date(), kickoff.time())

        # Copy every key of the stage containing "Id" onto the match.
        for k, v in stage.items():
            if 'Id' in k:
                match[k] = v

        matchheaders.replace_one({'matchId': match['matchId']}, match, upsert=True)


def get_fixtures(stage_id, overwrite=False):
    """Scrape all fixtures of one stage into ``matchheaders``.

    When the page lists months, the fixtures feed is requested once per
    listed month. Otherwise the fixtures embedded in the page itself are
    used.

    Args:
        stage_id: Value of ``stageId`` identifying the stage.
        overwrite: When False, nothing is fetched if a match of this stage is
            already stored, and individual stored matches are not replaced.

    Returns:
        True when matches already exist and ``overwrite`` is False; False when
        the stage is unknown, a request does not answer with HTTP 200, the
        ``Model-Last-Mode`` token is missing, or the month list is empty;
        otherwise None.
    """
    if matchheaders.find_one({'stageId': stage_id}) and not overwrite:
        print('Matches already exist')
        return True

    stage = stages.find_one({'stageId': stage_id})
    if stage is None:
        # Without the stored stage the page URL cannot be built.
        return False

    page = SITE+'/Regions/{regionId}/Tournaments/{tournamentId}/Seasons/{seasonId}/Stages/{stageId}/Fixtures'.format(**stage)
    r = requests.get(page, headers=HEADERS)
    print(r.url)

    if r.status_code != 200:
        wait()
        return False

    tokens = re.findall("'Model-Last-Mode': '([^']+)'", r.text)
    if not tokens:
        print('Model-Last-Mode not found')
        wait()
        return False

    headers = HEADERS.copy()
    headers['Model-Last-Mode'] = tokens[0]
    headers['Referer'] = r.url
    headers['X-Requested-With'] = 'XMLHttpRequest'

    dates = re.findall("'Month', ([^ ]+), min, max", r.text)
    if dates:
        # Quote the bare numeric keys so the object literal parses as JSON.
        dates = re.sub(r'(\d+)(?=:)', r'"\1"', dates[0])
        d = json.loads(dates)

        if not d:
            print('No matches')
            wait()
            return False

        params = {'isAggregate': 'false'}
        feed = SITE+'/tournamentsfeed/{0}/Fixtures/'.format(stage_id)

        for y in d:
            for m in d[y]:
                # Months in the page are zero-based; the feed wants YYYYMM
                # with a one-based, zero-padded month.
                params['d'] = '{0}{1:02d}'.format(y, int(m) + 1)
                wait()

                r = requests.get(feed, params=params, headers=headers, allow_redirects=False)
                print(r.url, r.status_code)

                if r.status_code != 200:
                    wait()
                    return False

                # Fill empty array slots with null and use double quotes so
                # the feed parses as JSON.
                matchData = re.sub(r',(?=,)', r',null', r.text)
                data = json.loads(matchData.replace("'", '"'))
                _save_fixture_rows(data, stage, overwrite)
    else:
        found = re.findall(r"calendarParameter\), ([^;]*)\);", r.text)
        if found:
            matchData = re.sub(r',(?=,)', r',null', found[0])
            data = json.loads(matchData.replace("'", '"'))
        else:
            # No embedded fixtures: nothing to store. The original indexed
            # the empty result before checking it.
            data = []
        _save_fixture_rows(data, stage, overwrite)

    wait()
Exemplo n.º 10
0
def get_lineups(match_id):
    """Scrape the line-up page of a match and store the details on the match.

    Managers, players, venues and referees found on the page are upserted
    into their own collections. The match document receives the per-side
    line-up sections plus ``venue``, ``attendance`` and ``referee``.

    Args:
        match_id: Value of the ``match`` field identifying the match.

    Returns:
        False when the match is unknown or the response status is not 200,
        otherwise None.
    """
    match = matches.find_one({'match': match_id})
    if match is None:
        # Without the stored match the page URL cannot be built.
        return False

    url = '{0}/spielbericht/aufstellung/spielbericht/{match}'.format(
        SITE, **match)
    r = requests.get(url, headers=HEADERS)
    print(r.url)

    if r.status_code != 200:
        # Same failure handling as the other fetchers in this file.
        wait()
        return False

    content = html.fromstring(r.text)
    tables = content.xpath('//div[@class="box"]')

    for table in tables:
        # The section name is the last word of the box heading, lower-cased.
        section = table.xpath('div/text()')[0].split(' ')[-1].lower()
        # The heading link's id is compared with the stored home team id.
        side = 'home' if int(
            table.xpath('div/a/@id')[0]) == match['home']['team'] else 'away'

        match[side][section] = []
        for row in table.xpath('div/table[@class="items"]/tr'):
            if section == 'manager':
                manager_name = row.xpath('td[1]/table/tr/td[2]/a/@title')[0]
                manager_id = int(row.xpath(
                    'td[1]/table/tr/td[2]/a/@href')[0].split('/')[-1])
                manager_nationality = row.xpath('td[2]/img/@title')[0]

                managers.update_one({'manager': manager_id}, {
                    '$setOnInsert': {
                        'name': manager_name,
                        'nationality': manager_nationality
                    }
                }, upsert=True)
                # The manager section holds a single id instead of a list.
                match[side][section] = manager_id

            else:
                player_position = row.xpath('td[1]/@title')[0]
                player_number = row.xpath('td[1]/div/text()')[0]
                player_name = row.xpath('td[2]/table/tr/td[2]/a/@title')[0]
                player_id = int(row.xpath('td[2]/table/tr/td[2]/a/@id')[0])
                player_nationality = row.xpath('td[3]/img/@title')[0]

                players.update_one({'player': player_id}, {
                    '$setOnInsert': {
                        'name': player_name,
                        'nationality': player_nationality
                    }
                }, upsert=True)

                match[side][section].append({
                    'player': player_id,
                    'position': player_position,
                    'number': player_number
                })

    for venue in content.xpath('//div[@class="sb-spieldaten"]/p[3]/span/a'):
        # The venue id is the third path segment from the end of the link.
        venue_id = int(venue.xpath('@href')[0].split('/')[-3])
        venues.update_one(
            {'venue': venue_id},
            {'$setOnInsert': {
                'name': venue.xpath('text()')[0]
            }},
            upsert=True)
        match['venue'] = venue_id

    for attendance in content.xpath(
            '//div[@class="sb-spieldaten"]/p[3]/span/strong/text()'):
        # Strip the label and the "." separators before converting.
        match['attendance'] = int(
            attendance.replace(' Spectators', '').replace('.', ''))

    for referee in content.xpath('//div[@class="sb-spieldaten"]/p[3]/a'):
        referee_id = int(referee.xpath('@href')[0].split('/')[-1])
        referees.update_one(
            {'referee': referee_id},
            {'$setOnInsert': {
                'name': referee.xpath('@title')[0]
            }},
            upsert=True)
        match['referee'] = referee_id

    # Collection.save() no longer exists in PyMongo 4; replace the loaded
    # document by its _id, which is what save() did for stored documents.
    matches.replace_one({'_id': match['_id']}, match, upsert=True)

    wait()
Exemplo n.º 11
0
def get_fixtures(tournament_id, season_id):
    """Scrape the full schedule of one season into ``matches`` and ``teams``.

    Args:
        tournament_id: Value of the ``tournament`` field of the competition.
        season_id: Value of the ``season`` field of the season.

    Returns:
        False when the season or tournament is unknown or the response status
        is not 200, otherwise None.
    """
    season = seasons.find_one({
        'tournament': tournament_id,
        'season': season_id
    })
    tournament = tournaments.find_one({'tournament': tournament_id})
    if season is None or tournament is None:
        # Both documents are needed to build the URL and label the matches.
        return False

    if tournament.get('cup') == 1:
        url = '{0}/spielplan/gesamtspielplan/pokalwettbewerb/{tournament}/saison_id/{season}'.format(
            SITE, **season)
    else:
        url = '{0}/spielplan/gesamtspielplan/wettbewerb/{tournament}/saison_id/{season}'.format(
            SITE, **season)
    r = requests.get(url, headers=HEADERS)
    print(r.url, tournament['name'], season['name'])

    if r.status_code != 200:
        wait()
        return False

    content = html.fromstring(r.text)
    # datestamp must always be a datetime: .date() is called on it below and
    # it is written to the database. The original started from date.min,
    # which has no .date() method.
    datestamp, timestamp = datetime.min, time.min
    for row in content.xpath(
            '//div[@class="box"]/table/tbody/tr[not(td/@colspan)]'):
        home_id = int(row.xpath('td[3]/a/@id')[0])
        teams.update_one({'team': home_id}, {
            '$setOnInsert': {
                'name': row.xpath('td[3]/a/text()')[0],
                'region': tournament['region'],
                'national': False
            }
        }, upsert=True)
        away_id = int(row.xpath('td[7]/a/@id')[0])
        teams.update_one({'team': away_id}, {
            '$setOnInsert': {
                'name': row.xpath('td[7]/a/text()')[0],
                'region': tournament['region'],
                'national': False
            }
        }, upsert=True)

        # A blank time cell keeps the kick-off time of the previous row.
        time_text = row.xpath('td[2]/text()')[0].strip()
        if time_text:
            timestamp = datetime.strptime(time_text, '%I:%M %p').time()

        date_links = row.xpath('td[1]/a/@href')
        if date_links:
            datestring = date_links[0].split('/')[-1]
            if datestring == '0000-00-00':
                # Placeholder date on the site: store the minimum datetime.
                datestamp = datetime.min
            else:
                datestamp = datetime.strptime(datestring, '%Y-%m-%d')
        else:
            # No link: the date is the last word of the cell text.
            datestamp = datetime.strptime(
                row.xpath('td[1]/text()')[0].strip().split(' ')[-1],
                '%m/%d/%y')

        matches.update_one(
            {'match': int(row.xpath('td[5]/a/@href')[0].split('/')[-1])}, {
                '$setOnInsert': {
                    'season': season['season'],
                    'tournament': tournament['tournament'],
                    'region': tournament['region']
                },
                '$set': {
                    'date': datestamp,
                    'time': datetime.combine(datestamp.date(), timestamp),
                    'home': {
                        'team': home_id
                    },
                    'away': {
                        'team': away_id
                    },
                    'score': row.xpath('td[5]/a/text()')[0]
                }
            },
            upsert=True)

    wait()