def get_seasons(tournament_id, overwrite=False):
    """Scrape a tournament page and upsert its seasons and sibling tournaments.

    The tournament page is fetched once and used for three things:

    * every entry of the ``seasons`` dropdown is upserted into ``seasons``;
    * if the stored tournament name is empty, it is replaced by the page
      heading;
    * every entry of the ``tournaments`` dropdown is upserted into
      ``tournaments`` under the same region.

    Args:
        tournament_id: Value matched against the ``tournamentId`` field.
        overwrite: When false, the scrape is skipped if any season for this
            tournament is already stored.

    Returns:
        ``True`` if the scrape was skipped because seasons already exist,
        ``False`` if the tournament is unknown or the page did not return
        HTTP 200, and ``None`` after a completed scrape.
    """
    if seasons.find_one({'tournamentId': tournament_id}) and not overwrite:
        print('Seasons already exist')
        return True

    tournament = tournaments.find_one({'tournamentId': tournament_id})
    if tournament is None:
        # Without a stored tournament there is no regionId to build the URL.
        return False

    page = SITE + '/Regions/{regionId}/Tournaments/{tournamentId}'.format(**tournament)
    r = requests.get(page, headers=HEADERS)
    print(r.url)
    if r.status_code != 200:
        return False

    content = html.fromstring(r.text)

    season_links = content.xpath('//select[@id="seasons"]/option/@value')
    season_names = content.xpath('//select[@id="seasons"]/option/text()')
    for season_link, season_name in zip(season_links, season_names):
        # The season id is the last path segment of the option's link.
        seasons.update_one(
            {'seasonId': int(season_link.split('/')[-1])},
            {'$setOnInsert': {
                'name': season_name,
                'regionId': tournament['regionId'],
                'tournamentId': tournament['tournamentId']}},
            upsert=True)

    # Sometimes the tournament doesn't have a name in the main menu - use the
    # title on the page.
    if tournament['name'] == '':
        headings = content.xpath('//h1[@class="tournament-header"]/text()')
        if headings:
            tournaments.update_one(
                {'tournamentId': tournament['tournamentId']},
                {'$set': {'name': headings[0].strip()}})

    # Some tournaments don't show up in the main menu - take a fuller list
    # from the dropdown menu.
    tournament_links = content.xpath('//select[@id="tournaments"]/option/@value')
    tournament_names = content.xpath('//select[@id="tournaments"]/option/text()')
    for tournament_link, tournament_name in zip(tournament_links, tournament_names):
        tournaments.update_one(
            {'tournamentId': int(tournament_link.split('/')[-1])},
            {'$setOnInsert': {
                'name': tournament_name,
                'regionId': tournament['regionId']}},
            upsert=True)

    # NOTE(review): wait() is defined elsewhere; presumably a pause between
    # requests - confirm.
    wait()
def get_tournaments(region_id):
    """Scrape one region's competition page into ``tournaments`` and ``teams``.

    The region document is looked up by its ``region`` field. Its ``type``
    value selects both the URL template and which part of the page holds the
    competitions:

    * truthy ``type``: competitions come from the side-box list items and no
      national teams are stored;
    * falsy ``type``: competitions come from the main table cells and the
      side-box list items are stored as national teams.

    Rows are upserted with ``$setOnInsert``, so existing documents are left
    untouched.

    Returns:
        ``False`` when the region is unknown or the page did not return
        HTTP 200; ``None`` after a completed scrape.
    """
    region = regions.find_one({'region': region_id})
    if region is None:
        return False

    url_templates = {
        False: '{0}/wettbewerbe/national/wettbewerbe/{region}',
        True: '{0}/wettbewerbe/{region}',
    }
    template = url_templates[region['type']]
    r = requests.get(template.format(SITE, **region), headers=HEADERS)
    print(r.url, region['name'])
    if r.status_code != 200:
        wait()
        return False

    content = html.fromstring(r.text)
    main = content.xpath(
        '//div/table/tbody/tr/td[@class="hauptlink"]/table/tr/td[2]')
    side = content.xpath('//div[@class="box"]/div/li')

    if region['type']:
        competition_rows, national_team_rows = side, []
    else:
        competition_rows, national_team_rows = main, side

    # Competitions: keyed by the URL-decoded last path segment of the link.
    for row in competition_rows:
        slug = unquote(row.xpath('a/@href')[0].split('/')[-1])
        fields = {
            'name': row.xpath('a/@title')[0],
            'region': region['region'],
        }
        tournaments.update_one(
            {'tournament': slug}, {'$setOnInsert': fields}, upsert=True)

    # National teams: keyed by the integer last path segment of the link.
    for row in national_team_rows:
        team_id = int(unquote(row.xpath('a/@href')[0].split('/')[-1]))
        fields = {
            'name': row.xpath('a/@title')[0],
            'region': region['region'],
            'national': True,
        }
        teams.update_one(
            {'team': team_id}, {'$setOnInsert': fields}, upsert=True)

    wait()
def get_all_tournaments():
    """Load every region and tournament from the site's landing page.

    The landing page embeds an ``allRegions = ...;`` JavaScript literal. It
    is rewritten into JSON (single quotes become double quotes, bare keys are
    quoted) and parsed. Each region is upserted into ``regions`` and each of
    its tournaments into ``tournaments``. Both use ``$setOnInsert``, so
    existing documents are left untouched.

    Returns:
        ``False`` when the page did not return HTTP 200 or does not contain
        the ``allRegions`` literal; ``None`` after a completed load.
    """
    r = requests.get(SITE, headers=HEADERS)
    print(r.url)
    if r.status_code != 200:
        return False

    found = re.search("allRegions = ([^;]+);", r.text)
    if found is None:
        # A 200 response without the embedded data is reported as a failure
        # rather than raising IndexError.
        return False

    all_regions = found.group(1).replace("'", '"')
    # Quote bare object keys so the literal becomes valid JSON.
    all_regions = re.sub(r'(\w+):', r'"\1":', all_regions)

    for region in json.loads(all_regions):
        regions.update_one(
            {'regionId': region['id']},
            {'$setOnInsert': {
                'name': region['name'],
                'type': region['type']}},
            upsert=True)
        for tournament in region['tournaments']:
            tournaments.update_one(
                {'tournamentId': tournament['id']},
                {'$setOnInsert': {
                    'name': tournament['name'],
                    'regionId': region['id']}},
                upsert=True)