def get_regions():
    """Download the country drop-down and upsert its entries into ``regions``.

    Every ``<option>`` of the returned fragment is stored under its integer
    value with ``type`` False.  Five fixed string-keyed entries (FIFA and
    four continents) are stored with ``type`` True.  Only ``$setOnInsert``
    is used, so documents that already exist are left untouched.

    Returns:
        False when the request does not answer with HTTP 200, otherwise
        None.
    """
    response = requests.get('{0}/site/dropDownLaender'.format(SITE),
                            headers=HEADERS)
    print(response.url)
    if response.status_code != 200:
        wait()
        return False

    # The body is unwrapped with literal_eval before being parsed as HTML
    # (presumably the endpoint returns a quoted string -- confirm).
    fragment = html.fromstring(literal_eval(response.text))
    option_values = fragment.xpath('option/@value')
    option_labels = fragment.xpath('option/text()')
    for name, raw_id in zip(option_labels, option_values):
        regions.update_one(
            {'region': int(raw_id)},
            {'$setOnInsert': {'name': name, 'type': False}},
            upsert=True)

    # Fixed entries that are keyed by a string instead of a numeric id.
    fixed_entries = (
        ('fifa', 'FIFA'),
        ('europa', 'Europe'),
        ('asien', 'Asia'),
        ('amerika', 'America'),
        ('afrika', 'Africa'),
    )
    for key, name in fixed_entries:
        regions.update_one(
            {'region': key},
            {'$setOnInsert': {'name': name, 'type': True}},
            upsert=True)
    wait()
def get_seasons(tournament_id, overwrite=False):
    """Scrape the season list of one tournament and upsert it into ``seasons``.

    The tournament page is also used to back-fill an empty tournament name
    from the page heading and to register every tournament listed in the
    page's tournament drop-down.

    Args:
        tournament_id: value of ``tournamentId`` of a stored tournament.
        overwrite: when False and at least one season is already stored for
            the tournament, nothing is fetched.

    Returns:
        True when the seasons already exist, False when the tournament is
        not stored or the page does not answer with HTTP 200, otherwise
        None.
    """
    if seasons.find_one({'tournamentId': tournament_id}) and not overwrite:
        print('Seasons already exist')
        return True

    tournament = tournaments.find_one({'tournamentId': tournament_id})
    if tournament is None:
        # Without the stored document there is no regionId to build the URL.
        return False

    page = SITE + '/Regions/{regionId}/Tournaments/{tournamentId}'.format(
        **tournament)
    r = requests.get(page, headers=HEADERS)
    print(r.url)
    if r.status_code != 200:
        return False

    content = html.fromstring(r.text)
    season_links = content.xpath('//select[@id="seasons"]/option/@value')
    season_names = content.xpath('//select[@id="seasons"]/option/text()')
    for season_link, season_name in zip(season_links, season_names):
        # The season id is the last path segment of the option's link.
        seasons.update_one(
            {'seasonId': int(season_link.split('/')[-1])},
            {'$setOnInsert': {
                'name': season_name,
                'regionId': tournament['regionId'],
                'tournamentId': tournament['tournamentId']}},
            upsert=True)

    # Sometimes the tournament doesn't have a name in the main menu - use
    # the title on the page
    if tournament['name'] == '':
        headings = content.xpath('//h1[@class="tournament-header"]/text()')
        if headings:
            tournaments.update_one(
                {'tournamentId': tournament['tournamentId']},
                {'$set': {'name': headings[0].strip()}})

    # Some tournaments don't show up in the main menu - take a fuller list
    # from the dropdown menu
    tournament_links = content.xpath(
        '//select[@id="tournaments"]/option/@value')
    tournament_names = content.xpath(
        '//select[@id="tournaments"]/option/text()')
    for tournament_link, tournament_name in zip(tournament_links,
                                                tournament_names):
        tournaments.update_one(
            {'tournamentId': int(tournament_link.split('/')[-1])},
            {'$setOnInsert': {
                'name': tournament_name,
                'regionId': tournament['regionId']}},
            upsert=True)
    wait()
def get_tournaments(region_id):
    """Scrape the competitions (and national teams) listed for one region.

    Regions whose ``type`` is True take their competitions from the side
    box list.  All other regions take competitions from the main table and
    treat the side box entries as national teams.  Everything is written
    with ``$setOnInsert`` upserts, so existing documents are not modified.

    Args:
        region_id: value of the ``region`` field of a stored region.

    Returns:
        False when the region is unknown or the page does not answer with
        HTTP 200, otherwise None.
    """
    region = regions.find_one({'region': region_id})
    if region is None:
        return False

    url_templates = {
        True: '{0}/wettbewerbe/{region}',
        False: '{0}/wettbewerbe/national/wettbewerbe/{region}',
    }
    response = requests.get(
        url_templates[region['type']].format(SITE, **region),
        headers=HEADERS)
    print(response.url, region['name'])
    if response.status_code != 200:
        wait()
        return False

    page = html.fromstring(response.text)
    main_cells = page.xpath(
        '//div/table/tbody/tr/td[@class="hauptlink"]/table/tr/td[2]')
    side_items = page.xpath('//div[@class="box"]/div/li')

    if region['type']:
        competition_rows, national_team_rows = side_items, []
    else:
        competition_rows, national_team_rows = main_cells, side_items

    # Competitions
    for cell in competition_rows:
        # The identifier is the last path segment of the link.
        slug = unquote(cell.xpath('a/@href')[0].split('/')[-1])
        tournaments.update_one(
            {'tournament': slug},
            {'$setOnInsert': {'name': cell.xpath('a/@title')[0],
                              'region': region['region']}},
            upsert=True)

    # National teams
    for cell in national_team_rows:
        team_id = int(unquote(cell.xpath('a/@href')[0].split('/')[-1]))
        teams.update_one(
            {'team': team_id},
            {'$setOnInsert': {'name': cell.xpath('a/@title')[0],
                              'region': region['region'],
                              'national': True}},
            upsert=True)
    wait()
def get_stages(season_id, overwrite=False):
    """Scrape the stages of one season and upsert them into ``stages``.

    When the page has no stage drop-down, a single stage is derived from
    the "Fixtures" navigation link and named after the page's first
    ``<h1>``.

    Args:
        season_id: value of ``seasonId`` of a stored season.
        overwrite: when False and a stage of this season is already stored,
            nothing is fetched.

    Returns:
        True when stages already exist, False when the page does not answer
        with HTTP 200, otherwise None.
    """
    already_stored = stages.find_one({'seasonId': season_id})
    if already_stored and not overwrite:
        print('Stages already exist')
        return True

    season = seasons.find_one({'seasonId': season_id})
    url_path = '/Regions/{regionId}/Tournaments/{tournamentId}/Seasons/{seasonId}'
    response = requests.get(SITE + url_path.format(**season), headers=HEADERS)
    print(response.url)
    if response.status_code != 200:
        return False

    def insert_fields(stage_name):
        # Fields written only when the stage document is first created.
        return {'$setOnInsert': {
            'name': stage_name,
            'regionId': season['regionId'],
            'tournamentId': season['tournamentId'],
            'seasonId': season['seasonId']}}

    tree = html.fromstring(response.text)
    option_links = tree.xpath("//select[@id='stages']/option/@value")
    option_names = tree.xpath("//select[@id='stages']/option/text()")
    for link, label in zip(option_links, option_names):
        stage_id = int(link.split('/')[-1])
        stages.update_one({'stageId': stage_id}, insert_fields(label),
                          upsert=True)

    if not option_links:
        # No drop-down: the stage id is the third-from-last segment of the
        # "Fixtures" link.
        fixtures_href = tree.xpath(
            "//div[@id='sub-navigation']/ul/li/a[text()='Fixtures']/@href")[0]
        stage_id = int(fixtures_href.split("/")[-3])
        heading = tree.xpath('//h1/text()')[0].strip()
        stages.update_one({'stageId': stage_id}, insert_fields(heading),
                          upsert=True)
    wait()
def get_seasons(tournament_id):
    """Scrape the season selector of one competition into ``seasons``.

    On the first visit the tournament is also flagged as a cup or not,
    based on the second-to-last path segment of the final (possibly
    redirected) URL.

    Args:
        tournament_id: value of the ``tournament`` field of a stored
            tournament.

    Returns:
        False when the tournament is unknown or the page does not answer
        with HTTP 200, otherwise None.
    """
    tournament = tournaments.find_one({'tournament': tournament_id})
    if tournament is None:
        return False

    url = '{0}/wettbewerb/startseite/wettbewerb/{tournament}'.format(
        SITE, **tournament)
    r = requests.get(url, headers=HEADERS)
    print(r.url, tournament['name'])
    if r.status_code != 200:
        wait()
        return False

    if 'cup' not in tournament:
        tournament['cup'] = r.url.split('/')[-2] == 'pokalwettbewerb'
        # Collection.save() is deprecated in PyMongo 3 and removed in 4;
        # only the new flag has to be persisted.
        tournaments.update_one({'_id': tournament['_id']},
                               {'$set': {'cup': tournament['cup']}})

    content = html.fromstring(r.text)
    options = content.xpath(
        '//div[@class="inline-select"]/select[@name="saison_id"]/option')
    for row in options:
        seasons.update_one(
            {
                'tournament': tournament['tournament'],
                'season': int(unquote(row.xpath('@value')[0])),
            },
            {
                '$setOnInsert': {
                    'name': row.xpath('text()')[0],
                    'region': tournament['region'],
                }
            },
            upsert=True)
    wait()
def get_player(player_id, overwrite=False):
    """Scrape a player's profile box and store it in ``players``.

    Each ``<dl>`` of the player-info box is matched by its ``<dt>`` title
    against a table that says where the value lives, which key to store it
    under and which parser to apply.  Unknown titles are reported and
    skipped.

    Args:
        player_id: id used in the ``/Players/<id>`` URL and stored as
            ``playerId``.
        overwrite: when False and the player is already stored, nothing is
            fetched.

    Returns:
        True when the player already exists, False when the page does not
        answer with HTTP 200 or the request was redirected to another URL,
        otherwise None.
    """
    # parseTeam / parseDate / parseHeight / parseWeight are helpers defined
    # elsewhere in the project; each receives the joined text of the field.
    keys = {
        'Name:': {'xpath': 'dd/text()', 'key': 'name', 'parse': str},
        'Full Name:': {'xpath': 'dd/text()', 'key': 'fullName', 'parse': str},
        'Current Team:': {'xpath': 'dd/a/@href', 'key': 'teamId', 'parse': parseTeam},
        'Shirt Number:': {'xpath': 'dd/text()', 'key': 'number', 'parse': int},
        'Positions:': {'xpath': 'dd/ul/li/text()', 'key': 'position', 'parse': str},
        'Age:': {'xpath': 'dd/i/text()', 'key': 'birthDate', 'parse': parseDate},
        'Height:': {'xpath': 'dd/text()', 'key': 'height', 'parse': parseHeight},
        'Weight:': {'xpath': 'dd/text()', 'key': 'weight', 'parse': parseWeight},
        'Nationality:': {'xpath': 'dd/span/text()', 'key': 'nationality', 'parse': str},
    }

    player = players.find_one({'playerId': player_id})
    if not player:
        player = {'playerId': player_id}
    elif not overwrite:
        print('Player already exists')
        return True

    page = SITE + '/Players/{0}'.format(player_id)
    r = requests.get(page, headers=HEADERS)
    print(r.url)
    if r.status_code != 200:
        wait()
        return False
    if page != r.url:
        # A redirect means the requested profile page was not served.
        wait()
        return False

    content = html.fromstring(r.text)
    blocks = content.xpath("//div[@class='player-info']/div/div/dl")
    for block in blocks:
        title = block.xpath('dt/text()')[0]
        k = keys.get(title)
        if k is None:
            print('Unexpected info: "{}"'.format(title))
            continue
        # Join all non-blank text fragments of the field into one string.
        parts = (p.strip() for p in block.xpath(k['xpath']))
        value = ', '.join(p for p in parts if p != '')
        player[k['key']] = k['parse'](value)

    # Collection.save() is deprecated in PyMongo 3 and removed in 4; an
    # upserting replace covers both the new and the existing player.
    players.replace_one({'playerId': player_id}, player, upsert=True)
    wait()
def get_fixtures_for_date(d=None, overwrite=False):
    """Fetch the raw match feed for one day.

    The live-scores page is requested first to obtain the
    ``Model-Last-Mode`` token, which is then sent together with the
    ``Referer`` and ``X-Requested-With`` headers to the matches feed.

    Args:
        d: day to query: None for the current UTC date, a ``datetime``
            (formatted as ``%Y%m%d``), or a ready-made ``str``/``int``
            passed through unchanged.
        overwrite: accepted for signature compatibility; not used here.

    Returns:
        False when ``d`` has an unsupported type or either request does not
        answer with HTTP 200, otherwise None.
    """
    if d is None:
        params = {'d': datetime.strftime(datetime.utcnow(), '%Y%m%d')}
    elif isinstance(d, datetime):
        params = {'d': datetime.strftime(d, '%Y%m%d')}
    elif isinstance(d, (str, int)):
        params = {'d': d}
    else:
        print('Unknown date type')
        return False

    page = SITE + '/LiveScores/'
    r = requests.get(page, headers=HEADERS)
    print(r.url)
    if r.status_code != 200:
        wait()
        return False

    model_last_mode = re.findall("'Model-Last-Mode': '([^']+)'", r.text)[0]
    headers = HEADERS.copy()
    headers['Model-Last-Mode'] = model_last_mode
    headers['Referer'] = r.url
    headers['X-Requested-With'] = 'XMLHttpRequest'
    print(model_last_mode)
    wait()

    page = SITE + '/matchesfeed/'
    # Send the XHR headers built above; the plain HEADERS lack the token.
    r = requests.get(page, params=params, headers=headers,
                     allow_redirects=False)
    print(r.url, r.status_code)
    print(r.text)
    if r.status_code != 200:
        wait()
        return False

    # The feed is a JS array literal: fill empty slots with null and switch
    # to double quotes so it parses as JSON.
    matchData = re.sub(r'([,[])(?=[,\]])', r'\1null', r.text)
    data = json.loads(matchData.replace("'", '"'))
    print(data)
    # NOTE(review): the two sections below are extracted but not used yet.
    stageData = data[1]
    matchData = data[2]
def get_match(match_id, overwrite=False):
    """Scrape one match page and store the result in ``matches``.

    The match data is taken from the first source found in the page:

    1. the ``matchCentreData`` JSON object,
    2. the ``initialMatchDataForScrappers`` array (header, events, lineup),
    3. the ``matchHeader.load(...)`` call (header only).

    For sources 2 and 3 the positional header is mapped onto named fields
    and the project helpers ``parseLineup`` / ``parseEvents`` fill in the
    rest.  Ids found in the breadcrumb link are added, and the date fields
    are converted to ``datetime`` before the document is written.

    Args:
        match_id: id used in the ``/Matches/<id>/Live`` URL.
        overwrite: when False and the match is already stored, nothing is
            fetched.

    Returns:
        True when the match already exists or was stored; False when the
        page does not answer with HTTP 200 or the request was redirected
        (in which case an ``error`` field is recorded for the match).
    """
    if matches.find_one({'matchId': match_id}) and not overwrite:
        print('Match already exists')
        return True

    page = SITE + '/Matches/{0}/Live'.format(match_id)
    r = requests.get(page, headers=HEADERS)
    print(r.url)
    content = unescape(r.text)
    if r.status_code != 200:
        wait()
        return False
    if r.url != page:
        match = {'matchId': match_id, 'error': 'No page found'}
        print(match['error'])
        matches.update_one({'matchId': match['matchId']},
                           {'$set': {'error': match['error']}},
                           upsert=True)
        wait()
        return False

    matchId = re.findall("matchId = ([^;]+);", content)
    matchData = re.findall("matchCentreData = ([^;]+});", content)
    if matchData and matchData != ['null']:
        match = json.loads(matchData[0], strict=False)
        match['matchId'] = int(matchId[0])
    else:
        matchData = re.findall("initialMatchDataForScrappers = (.+]);",
                               content, re.DOTALL)
        if matchData:
            # JS array literal -> JSON: double quotes, null in empty slots.
            matchData = re.sub(r'([,[])(?=[,\]])', r'\1null',
                               matchData[0].replace("'", '"'))
            matchData = json.loads(matchData, strict=False)
            matchHeader = matchData[0][0]
            matchEvents = matchData[0][1]
            matchLineup = matchData[0][2]
        else:
            # Raw string: the former non-raw literal relied on the invalid
            # escapes "\(" and "\)"; the compiled pattern is unchanged.
            matchData = re.findall(r"matchHeader.load\(([^;]+)\r\n\);",
                                   content)
            matchData = re.sub(r'([,[])(?=[,\]])', r'\1null',
                               matchData[0].replace("'", '"'))
            matchHeader = json.loads(matchData, strict=False)
            matchEvents = []
            matchLineup = []

        # Names for the positional header values; a pair addresses a
        # nested field such as match['home']['teamId'].
        fieldHeader = [
            ('home', 'teamId'), ('away', 'teamId'),
            ('home', 'name'), ('away', 'name'),
            'startTime', 'startDate', 'statusCode', 'elapsed',
            'htScore', 'ftScore', 'etScore', 'pkScore', 'score',
        ]
        match = {'matchId': match_id,
                 'home': {'field': 'home'},
                 'away': {'field': 'away'}}
        for k, v in zip(fieldHeader, matchHeader):
            if not v:
                continue
            if isinstance(k, tuple):
                match[k[0]][k[1]] = v
            else:
                match[k] = v
        parseLineup(matchLineup, match)
        if matchEvents:
            parseEvents(matchEvents, match)

    content = html.fromstring(r.text)
    link = content.xpath("//div[@id='breadcrumb-nav']/a/@href")
    if link:
        # Every "/<Word>/<digits>" pair of the link becomes "<word>Id" with
        # the last letter of the word dropped, e.g. /Regions/1 -> regionId.
        for key, val in re.findall(r'/(?P<key>\w+)/(?P<val>\d+)', link[0]):
            key = key[:-1].lower() + 'Id'
            match[key] = int(val)

    match['startDate'] = datetime.strptime(match['startDate'],
                                           '%m/%d/%Y %I:%M:%S %p')
    match['startTime'] = datetime.strptime(match['startTime'],
                                           '%m/%d/%Y %I:%M:%S %p')
    if 'timeStamp' in match:
        # Two timestamp layouts are accepted.
        try:
            match['timeStamp'] = datetime.strptime(match['timeStamp'],
                                                   '%d/%m/%Y %H:%M:%S')
        except ValueError:
            match['timeStamp'] = datetime.strptime(match['timeStamp'],
                                                   '%Y-%m-%d %H:%M:%S')

    matches.replace_one({'matchId': match_id}, match, upsert=True)
    wait()
    return True
def _store_fixture_rows(rows, stage, overwrite):
    """Turn raw fixture rows into match headers and upsert them."""
    for row in rows:
        match = {
            'matchId': row[0],
            'statusCode': row[1],
            'startDate': row[2],
            'startTime': row[3],
            'home': {'teamId': row[4], 'name': row[5], 'field': 'home'},
            'away': {'teamId': row[7], 'name': row[8], 'field': 'away'},
            'score': row[10],
            'elapsed': row[14],
            'result': row[15],
            'international': row[16],
            'hasKeyEvents': row[12],
            'hasPreview': row[13],
            'isOpta': row[17],
            'isOtherOpta': row[19],
        }
        if matchheaders.find_one({'matchId': match['matchId']}) and not overwrite:
            print('Match already exists')
            continue

        match['startDate'] = datetime.strptime(match['startDate'],
                                               '%A, %b %d %Y')
        kickoff = datetime.strptime(match['startTime'], '%H:%M')
        # startTime carries the full kick-off moment: the day plus the time.
        match['startTime'] = datetime.combine(match['startDate'].date(),
                                              kickoff.time())
        # Copy the stage's "...Id" fields onto the match.
        for k, v in stage.items():
            if 'Id' in k:
                match[k] = v
        matchheaders.replace_one({'matchId': match['matchId']}, match,
                                 upsert=True)


def get_fixtures(stage_id, overwrite=False):
    """Scrape all fixtures of one stage into ``matchheaders``.

    When the page contains a month calendar, the fixtures feed is requested
    once per listed year/month.  Otherwise the fixtures embedded in the
    page itself are used.

    Args:
        stage_id: value of ``stageId`` of a stored stage.
        overwrite: when False, a stage that already has a stored match is
            skipped entirely and individual stored matches are not
            replaced.

    Returns:
        True when matches already exist; False when a request does not
        answer with HTTP 200 or the calendar is empty; otherwise None.
    """
    if matchheaders.find_one({'stageId': stage_id}) and not overwrite:
        print('Matches already exist')
        return True

    stage = stages.find_one({'stageId': stage_id})
    page = SITE + '/Regions/{regionId}/Tournaments/{tournamentId}/Seasons/{seasonId}/Stages/{stageId}/Fixtures'.format(**stage)
    r = requests.get(page, headers=HEADERS)
    print(r.url)
    if r.status_code != 200:
        wait()
        return False

    model_last_mode = re.findall("'Model-Last-Mode': '([^']+)'", r.text)[0]
    headers = HEADERS.copy()
    headers['Model-Last-Mode'] = model_last_mode
    headers['Referer'] = r.url
    headers['X-Requested-With'] = 'XMLHttpRequest'

    dates = re.findall("'Month', ([^ ]+), min, max", r.text)
    if dates:
        # Quote the numeric keys so the calendar literal parses as JSON.
        calendar = json.loads(re.sub(r'(\d+)(?=:)', r'"\1"', dates[0]))
        if len(calendar) == 0:
            print('No matches')
            wait()
            return False

        # Calendar months are keyed '0'..'11'; the feed expects '01'..'12'.
        months = {format(i): format(i + 1, '02') for i in range(0, 12)}
        params = {'isAggregate': 'false'}
        feed = SITE + '/tournamentsfeed/{0}/Fixtures/'.format(stage_id)
        for y in calendar:
            for m in calendar[y]:
                params['d'] = '{0}{1}'.format(y, months[m])
                wait()
                r = requests.get(feed, params=params, headers=headers,
                                 allow_redirects=False)
                print(r.url, r.status_code)
                if r.status_code != 200:
                    wait()
                    return False
                # JS array literal -> JSON.
                matchData = re.sub(r',(?=,)', r',null', r.text)
                data = json.loads(matchData.replace("'", '"'))
                _store_fixture_rows(data, stage, overwrite)
    else:
        # Raw string: the former non-raw literal relied on the invalid
        # escape "\)"; the compiled pattern is unchanged.
        found = re.findall(r"calendarParameter\), ([^;]*)\);", r.text)
        # Indexing is guarded so a page without inline data yields no rows
        # instead of raising IndexError.
        matchData = re.sub(r',(?=,)', r',null', found[0]) if found else ''
        data = json.loads(matchData.replace("'", '"')) if matchData else []
        _store_fixture_rows(data, stage, overwrite)
    wait()
def get_lineups(match_id):
    """Scrape the line-up page of a match and store it on the match document.

    Each box of the page is one section of one side.  The section name is
    the lower-cased last word of the box title, and the side is ``home``
    when the id in the box header equals the stored home team.  A
    ``manager`` section stores the manager id; every other section stores a
    list of players.  Venue, attendance and referee are added when present.
    Managers, players, venues and referees are also registered in their own
    collections.

    Args:
        match_id: value of the ``match`` field of a stored match.

    Returns:
        False when the match is unknown or the page does not answer with
        HTTP 200, otherwise None.
    """
    match = matches.find_one({'match': match_id})
    if match is None:
        return False

    url = '{0}/spielbericht/aufstellung/spielbericht/{match}'.format(
        SITE, **match)
    r = requests.get(url, headers=HEADERS)
    print(r.url)
    if r.status_code != 200:
        # Same failure handling as the other scrapers in this file.
        wait()
        return False

    content = html.fromstring(r.text)
    for table in content.xpath('//div[@class="box"]'):
        section = table.xpath('div/text()')[0].split(' ')[-1].lower()
        box_team = int(table.xpath('div/a/@id')[0])
        side = 'home' if box_team == match['home']['team'] else 'away'
        match[side][section] = []
        for row in table.xpath('div/table[@class="items"]/tr'):
            if section == 'manager':
                manager_name = row.xpath('td[1]/table/tr/td[2]/a/@title')[0]
                manager_id = row.xpath(
                    'td[1]/table/tr/td[2]/a/@href')[0].split('/')[-1]
                manager_nationality = row.xpath('td[2]/img/@title')[0]
                managers.update_one(
                    {'manager': int(manager_id)},
                    {'$setOnInsert': {'name': manager_name,
                                      'nationality': manager_nationality}},
                    upsert=True)
                # The manager section holds a single id, not a list.
                match[side][section] = int(manager_id)
            else:
                player_position = row.xpath('td[1]/@title')[0]
                player_number = row.xpath('td[1]/div/text()')[0]
                player_name = row.xpath('td[2]/table/tr/td[2]/a/@title')[0]
                player_id = row.xpath('td[2]/table/tr/td[2]/a/@id')[0]
                player_nationality = row.xpath('td[3]/img/@title')[0]
                players.update_one(
                    {'player': int(player_id)},
                    {'$setOnInsert': {'name': player_name,
                                      'nationality': player_nationality}},
                    upsert=True)
                match[side][section].append({
                    'player': int(player_id),
                    'position': player_position,
                    'number': player_number,
                })

    for venue in content.xpath('//div[@class="sb-spieldaten"]/p[3]/span/a'):
        venue_id = int(venue.xpath('@href')[0].split('/')[-3])
        venues.update_one(
            {'venue': venue_id},
            {'$setOnInsert': {'name': venue.xpath('text()')[0]}},
            upsert=True)
        match['venue'] = venue_id

    for attendance in content.xpath(
            '//div[@class="sb-spieldaten"]/p[3]/span/strong/text()'):
        # Drop the label and the '.' thousands separators.
        match['attendance'] = int(
            attendance.replace(' Spectators', '').replace('.', ''))

    for referee in content.xpath('//div[@class="sb-spieldaten"]/p[3]/a'):
        referee_id = int(referee.xpath('@href')[0].split('/')[-1])
        referees.update_one(
            {'referee': referee_id},
            {'$setOnInsert': {'name': referee.xpath('@title')[0]}},
            upsert=True)
        match['referee'] = referee_id

    # Collection.save() is deprecated in PyMongo 3 and removed in 4; replace
    # the loaded document by its _id instead.
    matches.replace_one({'_id': match['_id']}, match, upsert=True)
    wait()
def get_fixtures(tournament_id, season_id):
    """Scrape the full fixture list of one season into ``matches``.

    Both teams of every fixture row are registered in ``teams`` (insert
    only).  The match itself is upserted with its date, kick-off time,
    team ids and score.  A row without a kick-off time reuses the time of
    the previous row.

    Args:
        tournament_id: value of the ``tournament`` field of a stored
            tournament.
        season_id: value of the ``season`` field of a stored season of that
            tournament.

    Returns:
        False when the season or tournament is unknown or the page does not
        answer with HTTP 200, otherwise None.
    """
    season = seasons.find_one({
        'tournament': tournament_id,
        'season': season_id,
    })
    tournament = tournaments.find_one({'tournament': tournament_id})
    if season is None or tournament is None:
        return False

    if tournament.get('cup') == 1:
        url = '{0}/spielplan/gesamtspielplan/pokalwettbewerb/{tournament}/saison_id/{season}'.format(
            SITE, **season)
    else:
        url = '{0}/spielplan/gesamtspielplan/wettbewerb/{tournament}/saison_id/{season}'.format(
            SITE, **season)
    r = requests.get(url, headers=HEADERS)
    print(r.url, tournament['name'], season['name'])
    if r.status_code != 200:
        wait()
        return False

    content = html.fromstring(r.text)
    # Start from datetime.min, not date.min: the value is later asked for
    # .date() and stored, both of which need a datetime instance.
    datestamp, timestamp = datetime.min, time.min
    for row in content.xpath(
            '//div[@class="box"]/table/tbody/tr[not(td/@colspan)]'):
        home_id = int(row.xpath('td[3]/a/@id')[0])
        teams.update_one({'team': home_id}, {
            '$setOnInsert': {
                'name': row.xpath('td[3]/a/text()')[0],
                'region': tournament['region'],
                'national': False,
            }
        }, upsert=True)
        away_id = int(row.xpath('td[7]/a/@id')[0])
        teams.update_one({'team': away_id}, {
            '$setOnInsert': {
                'name': row.xpath('td[7]/a/text()')[0],
                'region': tournament['region'],
                'national': False,
            }
        }, upsert=True)

        time_text = row.xpath('td[2]/text()')[0].strip()
        if time_text:
            timestamp = datetime.strptime(time_text, '%I:%M %p').time()

        date_links = row.xpath('td[1]/a/@href')
        if date_links:
            # Linked date: the ISO date is the last path segment.
            datestring = date_links[0].split('/')[-1]
            if datestring == '0000-00-00':
                # Placeholder for an unknown date.
                datestamp = datetime.min
            else:
                datestamp = datetime.strptime(datestring, '%Y-%m-%d')
        else:
            # Plain-text date: the last word is the date itself.
            datestamp = datetime.strptime(
                row.xpath('td[1]/text()')[0].strip().split(' ')[-1],
                '%m/%d/%y')

        matches.update_one(
            {'match': int(row.xpath('td[5]/a/@href')[0].split('/')[-1])},
            {
                '$setOnInsert': {
                    'season': season['season'],
                    'tournament': tournament['tournament'],
                    'region': tournament['region'],
                },
                '$set': {
                    'date': datestamp,
                    'time': datetime.combine(datestamp.date(), timestamp),
                    'home': {'team': home_id},
                    'away': {'team': away_id},
                    'score': row.xpath('td[5]/a/text()')[0],
                },
            },
            upsert=True)
    wait()