def get_main_stat(url):
    """Collect the headline facts of a single match page.

    The page is loaded through ``get_page_source`` and parsed with
    BeautifulSoup/lxml. Values are written into the result one after another;
    when any lookup fails the error is logged and the dict is returned with
    whatever had been stored up to that point.

    Args:
        url: Address of the match page.

    Returns:
        dict: On full success holds ``country``, ``championate``,
        ``round_num``, ``date``, ``home_command``, ``away_command``,
        ``result_score`` and ``goal_minutes``.
    """
    page = BeautifulSoup(get_page_source(url), 'lxml')
    app_logger.info(f'Start parsing MAIN stat for {url}\n')
    main_stat = {}

    def first(css):
        # First element matching the selector; IndexError when none match.
        return page.select(css)[0]

    try:
        # The header is split as "<country>:<championate>-<round>".
        header_parts = first('span.description__country').text.split(':')
        main_stat['country'] = header_parts[0]
        league_and_round = header_parts[1].split('-')
        main_stat['championate'] = league_and_round[0].strip()
        main_stat['round_num'] = league_and_round[1].strip()
        # Only the part before the first space of the time stamp is kept.
        main_stat['date'] = first('div#utime').text.split(' ')[0]
        main_stat['home_command'] = first(
            'div.team-text.tname-home a.participant-imglink').text
        main_stat['away_command'] = first(
            'div.team-text.tname-away a.participant-imglink').text
        main_stat['result_score'] = first(
            'div#event_detail_current_result').text.strip()
        details_markup = first('div.detailMS').encode_contents()
        main_stat['goal_minutes'] = get_goal_minutes(details_markup)
    except Exception:
        app_logger.exception(f'Error receiving main stat elements {url}')
    return main_stat
# Example #2
# 0
def insert_stat(html):
    """Parse an odds page and store the combined statistics.

    The first three ``div`` elements under the odds container are read as the
    1x2, AH and OU tables. For each table the pre-match line and the
    half-time statistics are merged into one dict, followed by the match
    info, and the result is passed to ``insert_into_ng_odds``.

    Parsing errors are logged, not raised. Whatever was collected before the
    error is still recorded; if nothing was collected at all, the insert is
    skipped.

    Args:
        html: Markup of the odds page.
    """
    # Bound before the ``try`` so a failure in the very first parsing steps
    # cannot leave the name undefined for the code after the handler.
    summary_stats = {}
    try:
        soup = BeautifulSoup(html, 'lxml')
        # NOTE(review): the id 'oddsDetai' is kept verbatim from the original
        # selector; confirm against the target page that it is not a typo.
        hda_odds, ah_odds, ou_odds = soup.select('div#oddsDetai div')[0:3]
        stats_by_market = (
            ('1x2', get_data_from_table(hda_odds.select('tr'), '1x2')),
            ('AH', get_data_from_table(ah_odds.select('tr'))),
            ('OU', get_data_from_table(ou_odds.select('tr'))),
        )
        app_logger.debug('Received HDA, AH, OU statistics by minutes')
        for market, market_stat in stats_by_market:
            summary_stats.update(select_pre_match_line(market_stat, market))
        app_logger.debug('Added prematch line move')
        for market, market_stat in stats_by_market:
            for half_time_stat in select_stat_in_HT(market_stat, market):
                summary_stats.update(half_time_stat)
        summary_stats.update(get_match_info(soup))
        app_logger.info(
            f'Formed objects with stats cnt keys={len(summary_stats)}')
    except Exception:
        app_logger.exception('\nError received stats from elements page')
    if not summary_stats:
        app_logger.warning('No stats collected, nothing to record')
        return
    insert_into_ng_odds(summary_stats)
    app_logger.debug('Record values in table\n')
def get_html(url):
    """Download a page and return its text.

    Args:
        url: Address to request; sent with the module-level ``USER_AGENT``.

    Returns:
        The response body as text when the response is OK, otherwise
        ``None`` (also when the request itself fails).
    """
    try:
        r = requests.get(url, headers={'User-Agent': USER_AGENT})
    except Exception:
        app_logger.exception(f'Error receive html {url}\n')
        # Without a response object there is nothing to inspect; returning
        # here avoids touching the unbound ``r`` below.
        return None
    app_logger.info(f'Received html {url} STATUS {r.status_code}\n')
    return r.text if r.ok else None
# Example #4
# 0
def get_html(url):
    """Download a page using a Chrome user-agent string.

    Args:
        url: Address to request.

    Returns:
        The response body as text when the response is OK, otherwise
        ``None``. Errors raised by the request itself propagate to the
        caller.
    """
    user_agent = UserAgent().chrome
    r = requests.get(url, headers={'User-Agent': user_agent})
    if r.ok:
        app_logger.debug(f'Received html page {url} code = {r.status_code}')
        return r.text
    # A bad status is not an exception, so there is no traceback to attach:
    # log it as a plain error instead of using ``exception``.
    app_logger.error(f'Error getting html page {url} {r.status_code}')
    return None
def run_parse(filename, url):
    """Gather live and past statistics for one URL and write them as CSV.

    When gathering fails, the URL is appended to the failed-URLs file and the
    error is logged; whatever was gathered up to that point is still
    normalised and written.

    Args:
        filename: Destination passed to ``write_csv``.
        url: Page to collect statistics from.
    """
    collected = {}
    try:
        live_part = get_live_stat(url)
        collected.update(live_part)
        past_part = get_past_stat(url)
        collected.update(past_part)
    except Exception:
        write_text_file('stat_scraper/urls/failed_urls.txt', url)
        app_logger.exception(f'ERROR RUN PARSE ON URL {url}')
    record = normalize_data(collected)
    write_csv(filename, record, record.keys())
def get_page_source(url):
    """Load a page in a browser driver and return the rendered markup.

    Args:
        url: Address to open.

    Returns:
        The driver's page source, or ``None`` when the page could not be
        loaded. Errors are logged, not raised.
    """
    # Bound up front so the final ``return`` is valid on every path.
    html = None
    try:
        driver = get_driver()
        try:
            driver.get(url)
            # Short pause before the page source is read.
            time.sleep(0.5)
            html = driver.page_source
        finally:
            # Close the browser even when loading fails, otherwise every
            # failed URL leaves a driver process behind.
            driver.quit()
        app_logger.info(f'Received html {url}\n')
    except Exception:
        app_logger.exception(f'Error receive html {url}\n')
    return html
def run_parse(filename, url):
    """Time how long gathering live and past statistics takes for one URL.

    The elapsed time, rounded to four decimals, is appended to the time-track
    file. When gathering fails the error is logged and no timing is written,
    so the file only holds durations of completed runs.

    Args:
        filename: Not used by this function; kept for the existing call
            signature.
        url: Page to collect statistics from.
    """
    summary_stat = {}
    # Taken before the ``try`` so the timestamp exists on every path.
    started_at = time.time()
    try:
        summary_stat.update(get_live_stat(url))
        summary_stat.update(get_past_stat(url))
    except Exception:
        app_logger.exception(f'ERROR RUN PARSE ON URL {url}')
        return
    processed_time = round(time.time() - started_at, 4)
    write_text_file('stat_scraper/logs/time_tracks/processed_1_url.txt',
                    f'{processed_time}\n')
def main(champ_urls):
    """Collect event URLs for every championship and store them.

    For each championship the ``results/`` page is queried, the URLs found
    there are normalised and each one is passed to
    ``insert_into_events_urls``. A failure for one championship is logged and
    the loop continues with the next one.

    Args:
        champ_urls: Iterable of championship base URLs.
    """
    total_recorded = 0
    for champ_url in tqdm(champ_urls):
        # One-second pause before each championship is requested.
        time.sleep(1)
        try:
            results_page = champ_url + 'results/'
            events_urls = normalize_list_urls(get_events_urls(results_page))
            found = len(events_urls)
            app_logger.info(f'Received {found} events urls')
            for event_url in events_urls:
                insert_into_events_urls(event_url)
            app_logger.info(f'Record in db {found} urls ')
            total_recorded += found
            app_logger.info(f'Total number of records = {total_recorded}\n')
        except Exception:
            app_logger.exception('\nreceive or record error')
def rows_filter(stat_rows, championate, limit=15):
    """Keep the rows whose match page belongs to the given championship.

    For every row the match page is downloaded and the championship name is
    read from its header. Rows are collected in input order until ``limit``
    rows have been accepted.

    Args:
        stat_rows: Rows exposing an ``'id'`` item; the first four characters
            of the id are dropped to obtain the event id.
        championate: Championship name a row must match exactly.
        limit: Maximum number of rows to return.

    Returns:
        list: Accepted rows, at most ``limit`` of them.
    """
    filtered_rows = []
    for stat_row in stat_rows:
        if len(filtered_rows) == limit:
            break
        try:
            event_id = stat_row['id'][4:]
            url = 'https://www.flashscore.com/match/' + event_id
            soup = BeautifulSoup(get_html(url), 'lxml')
            elem_champ = soup.select('span.description__country')[
                0].text.split(':')[1].split('-')[0].strip()
        except Exception:
            app_logger.exception('Error RECEIVNING INFO FOR ROWS FILTER!!!')
            # Skip the row: otherwise the comparison below would use an
            # unbound name or the championship of the previous row.
            continue
        if elem_champ == championate:
            filtered_rows.append(stat_row)
    return filtered_rows
def get_more_events(url, clicks=12):
    """Open a page, press its "show more" link repeatedly, return the markup.

    Clicking stops early the first time a click fails; the page source at
    that moment is returned.

    Args:
        url: Page to open.
        clicks: Maximum number of clicks on the ``a.event__more`` element.

    Returns:
        The driver's page source after clicking.

    Raises:
        Whatever the driver raises while loading the page or locating the
        link; the driver is closed before the error propagates.
    """
    driver = get_driver()
    try:
        driver.get(url)
        time.sleep(1)
        more_event_btn = driver.find_element_by_css_selector('a.event__more')
        more_event_btn.send_keys(Keys.END)
        app_logger.info(f'Start CLICKING to show more btn on {url} page')
        for _ in range(clicks):
            try:
                # One-second pause before each click.
                time.sleep(1)
                more_event_btn.click()
            except Exception:
                app_logger.exception(
                    f'Button show more events not found url {url}')
                break
        return driver.page_source
    finally:
        # Single cleanup point for every exit path.
        driver.quit()
def get_summary_stat(stat_rows,
                     command,
                     championate,
                     position,
                     select_type='position'):
    """Build aggregated statistics for one team from its event rows.

    Rows are optionally narrowed with ``find_position_events`` (when
    ``select_type`` is ``'position'``), then filtered by championship with
    ``rows_filter``. For each remaining row the full-time and first-half
    scores are read and the per-half statistics are added through
    ``get_half_stat``. A row that fails is logged and left out.

    Args:
        stat_rows: Event rows supporting ``select`` and ``['id']``.
        command: Team name; the team counts as the home side when this text
            occurs in the home participant name.
        championate: Championship name passed to ``rows_filter``.
        position: Passed to ``find_position_events``.
        select_type: ``'position'`` to narrow rows by position first.

    Returns:
        The result of ``calculate_stat`` over the per-event dicts.
    """
    app_logger.info(f'Start received SUMMARY stats for {command}\n')
    if select_type == 'position':
        stat_rows = find_position_events(stat_rows, command, position)
    stat_rows = rows_filter(stat_rows, championate)
    app_logger.info(f'LEFT AFTER FILTER {len(stat_rows)} rows ')

    per_event = []
    for row in stat_rows:
        event_stat = {}
        try:
            home_command = row.select(
                'div.event__participant--home')[0].text.strip()
            away_command = row.select(
                'div.event__participant--away')[0].text.strip()
            event_scores = row.select('div.event__scores span')
            half_text = row.select('div.event__part')[0].text
            first_half_scores = half_text.strip('(').strip(')').split('-')

            # 0 = team plays at home, 1 = away. The opponent index is one
            # less, so for the home side it is -1 (the last element).
            if command in home_command:
                command_id = 0
            else:
                command_id = 1
            rival_id = command_id - 1

            event_stat['goals_scored'] = event_scores[command_id].text
            event_stat['goals_missed'] = event_scores[rival_id].text
            event_stat['1half_goals_scored'] = first_half_scores[command_id]
            event_stat['1half_goals_missed'] = first_half_scores[rival_id]

            event_id = row['id'][4:]
            app_logger.info(
                f'DETAIL STAT {home_command} {event_scores} {away_command}')
            for half_no, half_label in ((1, '1st_half'), (2, '2nd_half')):
                half_url = (f'https://www.flashscore.com/match/{event_id}'
                            f'/#match-statistics;{half_no}')
                event_stat.update(
                    get_half_stat(half_url, half_label, command_id))
            per_event.append(event_stat)
        except Exception:
            app_logger.exception(
                f'\nError received data from stat row {command}')
        # Logged for failed rows too, showing whatever was filled in.
        app_logger.debug(f'Formed event stats: \n{event_stat}\n')
    return calculate_stat(per_event)
def make_file_champ_urls(country_urls, amount_seasons=4):
    """Write the season URLs found on each country's archive page to a file.

    For every country URL the ``archive/`` page is opened and the first
    ``amount_seasons + 1`` season entries are read. Each entry's link is
    passed to ``write_url_in_file``; a failure while reading or writing a
    link is logged and the next entry is processed.

    Args:
        country_urls: Iterable of country base URLs.
        amount_seasons: The slice keeps this many entries plus one.
    """
    for url in tqdm(country_urls):
        archive_url = url + 'archive/'
        # ``get_driver`` must be called: the bare function object has no
        # ``get`` method.
        driver = get_driver()
        try:
            driver.get(archive_url)
            time.sleep(1)
            champs_by_years = driver.find_elements_by_css_selector(
                'div.leagueTable__season div.leagueTable__seasonName')
            for champ in champs_by_years[:amount_seasons + 1]:
                champ_text = champ.find_element_by_css_selector('a').text
                season = champ_text.split(' ')[1]
                country = driver.find_element_by_css_selector(
                    'h2.tournament').text.split('\n')[1]
                try:
                    champ_url = champ.find_element_by_css_selector(
                        'a').get_attribute('href')
                    app_logger.debug(
                        f'received url - {champ_url} by {country} {season}')
                    write_url_in_file(champ_url)
                except Exception:
                    app_logger.exception(
                        '\nError getting or writing in file element')
        finally:
            # One browser is opened per country; close it before the next.
            driver.quit()
# Example #13
# 0
def get_data_from_table(trs, type_odds=None):
    """Convert odds-table rows into a list of dicts.

    The first two rows are skipped. From every other row the cells at
    positions 0-4 and 6 are read; a row that cannot be read is logged and
    left out of the result.

    Args:
        trs: Table rows, each supporting ``select('td')``.
        type_odds: ``'1x2'`` stores the fourth cell under ``draw_odds``; any
            other value stores it under ``value``.

    Returns:
        list[dict]: One dict per readable row with keys ``min_match``,
        ``score``, ``home_odds``, ``draw_odds`` or ``value``, ``away_odds``
        and ``status``.
    """
    # Same for every row, so decided once.
    variable_name = 'draw_odds' if type_odds == '1x2' else 'value'
    result = []
    for tr in trs[2:]:
        try:
            tds = tr.select('td')
            row = {
                'min_match': tds[0].text,
                'score': tds[1].text,
                'home_odds': tds[2].text,
                variable_name: tds[3].text,
                'away_odds': tds[4].text,
                'status': tds[6].text,
            }
        except Exception:
            app_logger.exception('Error received html element')
            # Skip the row instead of appending unbound names or the values
            # left over from the previous row.
            continue
        result.append(row)
    return result