def get_half_stat(url, half, command_id=None):
    """Parse one half's statistics table from a flashscore stats page.

    Args:
        url: Match statistics page URL.
        half: '1st_half' or '2nd_half' — selects the statistics tab and
            prefixes every key of the returned dict.
        command_id: Optional team index (0 = home, 1 = away).  When given,
            values are keyed relative to that team ('..._OWN'/'..._ENEMY');
            when None, they are keyed absolutely ('..._home'/'..._away').

    Returns:
        Dict mapping f'{half}_{stat_name}_{suffix}' to int values.
    """
    html = get_page_source(url)
    soup = BeautifulSoup(html, 'lxml')
    app_logger.info(f'Start parsing HALF {half} stat for {url}\n')
    half_table = ('div#tab-statistics-1-statistic'
                  if half == '1st_half' else 'div#tab-statistics-2-statistic')
    stat_rows = soup.select(f'{half_table} div.statRow')
    half_stats = {}
    for stat_row in stat_rows:
        # Extraction was duplicated verbatim in both branches; hoisted here.
        title_value = normalize_value(
            stat_row.select('div.statText.statText--titleValue')[0].text)
        home_value = normalize_value(
            stat_row.select('div.statText.statText--homeValue')[0].text)
        away_value = normalize_value(
            stat_row.select('div.statText.statText--awayValue')[0].text)
        if command_id is not None:
            # [h, a][command_id] is the team's own value; [h, a][command_id - 1]
            # wraps around (index -1) to the opponent's value.
            half_stats[f'{half}_{title_value}_OWN'] = int(
                [home_value, away_value][command_id])
            half_stats[f'{half}_{title_value}_ENEMY'] = int(
                [home_value, away_value][command_id - 1])
        else:
            half_stats[f'{half}_{title_value}_home'] = int(home_value)
            half_stats[f'{half}_{title_value}_away'] = int(away_value)
    app_logger.debug(f'Received HALF stat:\n{half_stats}\n')
    return half_stats
def get_past_stat(url):
    """Collect aggregated historical statistics for both teams of a match.

    Resolves each team's results page from the match page, finds the events
    that preceded the match date, and merges the summarized home/away stats
    into a single dict.
    """
    main_stat = get_main_stat(url)
    html = get_page_source(url)
    soup = BeautifulSoup(html, 'lxml')
    app_logger.info(f'Start collect PAST stat data from {url}\n')
    template = 'https://www.flashscore.com{}/results/'

    def _results_url(selector):
        # The team's results path is embedded inside the link's onclick handler.
        onclick = soup.select(selector)[0].attrs['onclick']
        return template.format(onclick.split("'")[1])

    home_command_url = _results_url(
        'div.team-text.tname-home a.participant-imglink')
    away_command_url = _results_url(
        'div.team-text.tname-away a.participant-imglink')
    home_prev_events = find_previous_events(home_command_url, main_stat['date'])
    away_prev_events = find_previous_events(away_command_url, main_stat['date'])
    past_stat = {}
    for prev_events, command_key, position, label in (
            (home_prev_events, 'home_command', 'home', 'HOME'),
            (away_prev_events, 'away_command', 'away', 'AWAY')):
        past_stat.update(add_type_command(
            get_summary_stat(prev_events, main_stat[command_key],
                             main_stat['championate'], position), label))
    app_logger.debug('Formed PAST STAT')
    return past_stat
def get_live_stat(url):
    """Build the live-statistics dict for a match: the main stats merged
    with the parsed stats of both halves.
    """
    live_stat = {}
    live_stat.update(get_main_stat(url))
    # Each half lives behind its own fragment on the statistics page.
    for fragment, half in (('#match-statistics;1', '1st_half'),
                           ('#match-statistics;2', '2nd_half')):
        live_stat.update(get_half_stat(url + fragment, half))
    app_logger.debug(f'Formed data dict with live stat:\n {live_stat}\n')
    return live_stat
def get_html(url):
    """Fetch a page with a randomized Chrome User-Agent.

    Returns:
        The response body text on HTTP success, otherwise None (the failure
        is logged).
    """
    user_agent = UserAgent().chrome
    r = requests.get(url, headers={'User-Agent': user_agent})
    if r.ok:
        app_logger.debug(f'Received html page {url} code = {r.status_code}')
        return r.text
    # No exception is in flight here, so log at error level (the original
    # called app_logger.exception, which records a meaningless traceback)
    # and drop the stray debugging `print(r.ok)`.
    app_logger.error(f'Error getting html page {url} {r.status_code}')
    return None
def get_summary_stat(stat_rows, command, championate, position, select_type='position'):
    """Collect per-event stats for `command` from past event rows and
    return them aggregated by calculate_stat().

    stat_rows:   iterable of past-event row elements.
    command:     team name the stats are collected for.
    championate: competition name, passed to rows_filter().
    position:    'home' or 'away'; used when select_type == 'position'.
    select_type: 'position' narrows rows via find_position_events();
                 any other value leaves stat_rows unchanged.
    """
    app_logger.info(f'Start received SUMMARY stats for {command}\n')
    stat_rows = (find_position_events(stat_rows, command, position)
                 if select_type == 'position' else stat_rows)
    stat_rows = rows_filter(stat_rows, championate)
    app_logger.info(f'LEFT AFTER FILTER {len(stat_rows)} rows ')
    summary_stats = []
    for stat_row in stat_rows:
        event_stat = {}
        try:
            home_command = stat_row.select(
                'div.event__participant--home')[0].text.strip()
            away_command = stat_row.select(
                'div.event__participant--away')[0].text.strip()
            event_scores = stat_row.select('div.event__scores span')
            first_half_scores = stat_row.select(
                'div.event__part')[0].text.strip('(').strip(')').split('-')
            # 0 if `command` played at home in this past event, else 1.
            command_id = 0 if command in home_command else 1
            # [x, y][command_id] is the own value; [x, y][command_id - 1]
            # wraps around (index -1) to the opponent's value.
            event_stat['goals_scored'] = event_scores[command_id].text
            event_stat['goals_missed'] = event_scores[command_id - 1].text
            event_stat['1half_goals_scored'] = first_half_scores[command_id]
            event_stat['1half_goals_missed'] = first_half_scores[command_id - 1]
            # NOTE(review): assumes the row id carries a fixed 4-char prefix
            # before the event id — confirm against the page markup.
            event_id = stat_row['id'][4:]
            first_half_url = f'https://www.flashscore.com/match/{event_id}/#match-statistics;1'
            second_half_url = f'https://www.flashscore.com/match/{event_id}/#match-statistics;2'
            app_logger.info(
                f'DETAIL STAT {home_command} {event_scores} {away_command}')
            event_stat.update(
                get_half_stat(first_half_url, '1st_half', command_id))
            event_stat.update(
                get_half_stat(second_half_url, '2nd_half', command_id))
            summary_stats.append(event_stat)
        except Exception:
            # Best-effort: a row that fails to parse is logged and skipped.
            app_logger.exception(
                f'\nError received data from stat row {command}')
        app_logger.debug(f'Formed event stats: \n{event_stat}\n')
    return (calculate_stat(summary_stats))
def calculate_stat(stats):
    """Aggregate a chronological list of per-event stat dicts into running
    sums over the last 3, 5, 10, 15 and 20 events.

    Args:
        stats: list of dicts mapping stat keys to int-convertible values.

    Returns:
        Dict keyed f'{window}_last_{stat_key}' with the summed int values.
    """
    app_logger.info('Start CALCULATING collected past stats')
    slices = {3, 5, 10, 15, 20}
    sums = {}
    slice_stats = {}
    # enumerate from 1 so `count` is the number of events consumed so far
    # (the original tracked i and compared i + 1).
    for count, stat in enumerate(stats, start=1):
        for key, value in stat.items():
            sums[key] = sums.get(key, 0) + int(value)
        if count in slices:
            # Snapshot the running totals for this window size; a plain
            # loop replaces the original side-effect list comprehension.
            for key, value in sums.items():
                slice_stats[f'{count}_last_{key}'] = value
    app_logger.debug(f'Formed slice stats: \n{slice_stats}\n')
    return slice_stats
def insert_stat(html):
    """Parse minute-by-minute odds tables (1x2 / Asian handicap / over-under)
    from `html`, merge them with match info and persist the resulting row
    via insert_into_ng_odds().

    A parse failure is logged and nothing is written.
    """
    try:
        soup = BeautifulSoup(html, 'lxml')
        # NOTE(review): 'div#oddsDetai div' looks like a typo for
        # 'oddsDetail' — confirm against the actual page markup before
        # changing it.
        HDA_odds, AH_odds, OU_odds = soup.select('div#oddsDetai div')[0:3]
        HDA_stat = get_data_from_table(HDA_odds.select('tr'), '1x2')
        AH_stat = get_data_from_table(AH_odds.select('tr'))
        OU_stat = get_data_from_table(OU_odds.select('tr'))
        app_logger.debug('Received HDA, AH, OU statistics by minutes')
        summary_stats = {}
        summary_stats.update(select_pre_match_line(HDA_stat, '1x2'))
        summary_stats.update(select_pre_match_line(AH_stat, 'AH'))
        summary_stats.update(select_pre_match_line(OU_stat, 'OU'))
        app_logger.debug('Added prematch line move')
        # Plain loops replace the original side-effect list comprehensions.
        for market_stat, market in ((HDA_stat, '1x2'), (AH_stat, 'AH'),
                                    (OU_stat, 'OU')):
            for stat in select_stat_in_HT(market_stat, market):
                summary_stats.update(stat)
        summary_stats.update(get_match_info(soup))
        app_logger.info(
            f'Formed objects with stats cnt keys={len(summary_stats.keys())}')
    except Exception:
        app_logger.exception('\nError received stats from elements page')
    else:
        # Only persist on success: the original referenced summary_stats
        # unconditionally after the try/except, raising NameError when
        # parsing failed before the dict was created.
        insert_into_ng_odds(summary_stats)
        app_logger.debug('Record values in table\n')
def make_file_champ_urls(country_urls, amount_seasons=4):
    """For every country archive page, write the URLs of the most recent
    `amount_seasons` + 1 championship seasons to the URL file.

    Args:
        country_urls: iterable of country page URLs (trailing slash expected).
        amount_seasons: number of past seasons to collect beyond the current.
    """
    for url in tqdm(country_urls):
        archive_url = url + 'archive/'
        # NOTE(review): `get_driver` is referenced without parentheses — if
        # it is a factory function this should be `get_driver()`; confirm
        # whether it is actually a ready driver object.
        driver = get_driver
        driver.get(archive_url)
        time.sleep(1)
        champs_by_years = driver.find_elements_by_css_selector(
            'div.leagueTable__season div.leagueTable__seasonName')
        for champ in champs_by_years[:amount_seasons + 1]:
            # Look the anchor up once; the original queried the same <a>
            # twice and kept an unused enumerate index.
            anchor = champ.find_element_by_css_selector('a')
            season = anchor.text.split(' ')[1]
            country = driver.find_element_by_css_selector(
                'h2.tournament').text.split('\n')[1]
            try:
                champ_url = anchor.get_attribute('href')
                app_logger.debug(
                    f'received url - {champ_url} by {country} {season}')
                write_url_in_file(champ_url)
            except Exception:
                app_logger.exception(
                    '\nError getting or writing in file element')
def find_previous_events(url, event_date):
    """Return the event rows older than `event_date` on a team's results
    page, or [] when no row matches that date.

    url:        team results page URL, passed to get_more_events().
    event_date: dotted date string (e.g. 'DD.MM.YYYY') to anchor on.
    """
    html = get_more_events(url)
    soup = BeautifulSoup(html, 'lxml')
    app_logger.info(f'Start finding PREV events for {event_date}\n')
    stat_rows = soup.select('div.event__match')
    last_date = stat_rows[-1].select('div.event__time')[0].text.split(' ')[0]
    app_logger.info(f'LAST date in received stat rows = {last_date}\n')
    for stat_row in stat_rows:
        # Row time text split on spaces; the date component comes first —
        # presumably 'DD.MM.' optionally followed by a kickoff time (TODO
        # confirm against the markup).
        date = stat_row.select('div.event__time')[0].text.split(' ')
        date_parts = date[0].split('.')
        # 'DDMM' when a time part follows the date, dotted form otherwise.
        date_format = date_parts[0] + \
            date_parts[1] if len(date) > 1 else '.'.join(date_parts)
        event_date_parts = event_date.split('.')
        # NOTE(review): this branch keys on len(date) — a property of the
        # *row* being inspected, not of event_date. It keeps the two formats
        # normalized in lockstep, but looks fragile; verify the intent.
        event_date_format = event_date_parts[0] + \
            event_date_parts[1] if len(date) > 1 else event_date
        if date_format == event_date_format:
            # Rows are newest-first, so every following sibling predates
            # the matched date.
            prev_events = stat_row.find_next_siblings(
                name='div', attrs={'class': 'event__match'})
            app_logger.debug(
                f'Found previous {len(prev_events)} events earlier {event_date}\n'
            )
            return prev_events
    return []
def get_events_urls(champoinate_url):
    """Open a championship page, expand the full event list by clicking
    'show more' repeatedly, and return match URLs built from the row ids.
    """
    # NOTE(review): `get_driver` is referenced without parentheses — verify
    # it is a ready driver object rather than a factory function.
    driver = get_driver
    driver.get(champoinate_url)
    app_logger.debug(f'Open page - {champoinate_url}')
    time.sleep(1)
    more_button = driver.find_element_by_css_selector('a.event__more')
    more_button.send_keys(Keys.END)
    try:
        page_no = 1
        while page_no <= 10:
            app_logger.debug(f'get events page #{page_no}')
            time.sleep(1)
            more_button.click()
            if page_no > 8:
                app_logger.debug('too many pages open')
            page_no += 1
    except Exception:
        # Clicking past the last page raises once everything is expanded.
        app_logger.debug('All events open\n')
    time.sleep(1)
    event_rows = driver.find_elements_by_css_selector(
        'div.sportName div.event__match')
    event_ids = [row.get_attribute('id') for row in event_rows]
    return make_url_event(event_ids)
def main(urls):
    """Fetch and store parsed odds stats for every URL; a failure on one
    URL is logged and does not stop the batch.
    """
    for url in tqdm(urls):
        try:
            insert_stat(get_html(url))
        except Exception:
            # Log with the traceback at error level; the original used
            # app_logger.debug, which hid the stack trace entirely.
            app_logger.exception('\nError records values in database')