def get_seasons_for_league(self, main_league_results_url):
    """Collect a Season object for every season listed on a league's results page.

    Params:
        main_league_results_url (str): e.g.
            https://www.oddsportal.com/hockey/usa/nhl/results/
    Returns:
        (list) Season objects, each seeded with its root results URL;
        empty list if the page could not be loaded.
    """
    print('Getting all seasons for league via %s' % (main_league_results_url))
    if not self.go_to_link(main_league_results_url):
        print('League results URL loaded unsuccessfully %s' % (main_league_results_url))
        # Empty list tells the caller there is nothing to process further.
        return []
    document = pyquery(self.get_html_source())
    season_anchors = document.find(
        'div.main-menu2.main-menu-gray > ul.main-filter > li > span > strong > a'
    )
    print('Extracted links to %d seasons' % (len(season_anchors)))
    collected = []
    for anchor in season_anchors:
        season = Season(anchor.text)
        # Seed the Season's URL list with just the root results page.
        collected.append(season)
        season.urls.append(self.base_url + anchor.attrib['href'])
    return collected
def extract_urls(html):
    """Parse absolute http(s) links out of *html*; performs no I/O.

    Side effect: every newly seen URL is also appended to the
    module-level ``waiting_urls`` queue.

    Params:
        html (str): raw HTML document.
    Returns:
        (list) unique URLs found, in document order.
    """
    urls = []
    pq = pyquery(html)
    for link in pq.items('a'):
        url = link.attr('href')
        # BUG FIX: original called the non-existent str.startwith(), which
        # raised AttributeError for every candidate URL.
        if url and url.startswith('http') and url not in urls:
            urls.append(url)
            waiting_urls.append(url)
    return urls
async def handle_article(url, session, pool):
    """Process one article: fetch it, record it as seen, harvest its
    links, and persist the page title to the database.

    Params:
        url (str): article URL to fetch.
        session: HTTP client session passed through to ``fetch``.
        pool: async DB connection pool (aiomysql-style interface).
    """
    html = await fetch(url, session)
    seen_urls.append(url)
    extract_urls(html)
    pq = pyquery(html)
    title = pq('title').text()
    async with pool.acquire() as conn:
        async with conn.cursor() as cur:
            # SECURITY FIX: the original interpolated the page title straight
            # into the SQL string with .format(), allowing SQL injection and
            # breaking on titles containing quotes. Use a parameterized query.
            # Assumes a %s-paramstyle driver (e.g. aiomysql) — TODO confirm.
            insert_sql = "insert into Test(title) values(%s)"
            await cur.execute(insert_sql, (title,))
def fill_in_season_pagination_links(self, season):
    """Discover every paginated results page of a season and append the URLs.

    Params:
        season (Season): object with exactly one entry in its ``urls``
            field; modified in place.
    Raises:
        RuntimeError: if pagination exists but the final page link cannot
            be located (page format must have changed).
    """
    first_url_in_season = season.urls[0]
    self.go_to_link(first_url_in_season)
    html_source = self.get_html_source()
    html_querying = pyquery(html_source)
    # Check if the page says "No data available"
    no_data_div = html_querying.find('div.message-info > ul > li > div.cms')
    if no_data_div is not None and no_data_div.text() == 'No data available':
        # Yes, found "No data available"
        print('Found "No data available", skipping %s' % (first_url_in_season))
        return
    # Just need to locate the final pagination tag
    pagination_links = html_querying.find('div#pagination > a')
    # It's possible, however, there is no pagination...
    if len(pagination_links) <= 1:
        return
    last_page_number = -1
    last_page_url = None
    for link in reversed(pagination_links):
        span = link.find('span')
        # The last-page link is the one containing the "»|" glyphs.
        if span is not None and span.text is not None and '»|' in span.text:
            last_page_number = int(link.attrib['x-page'])
            last_page_url = first_url_in_season + link.attrib['href']
            break
    # If the last page number was never set, the page format must've changed.
    if last_page_number == -1:
        print('Could not locate final page URL from %s' % (first_url_in_season))
        # BUG FIX: the original passed the URL as a second positional
        # argument to RuntimeError, which does not interpolate %s — the URL
        # never appeared in the exception message.
        raise RuntimeError('Could not locate final page URL from %s' % (first_url_in_season))
    # Derive the intermediate page URLs (2 .. last-1) from the last page's URL.
    for i in range(2, last_page_number):
        this_url = last_page_url.replace('page/' + str(last_page_number), 'page/' + str(i))
        season.urls.append(this_url)
    season.urls.append(last_page_url)
def populate_games_into_season(self, season):
    """Visit every URL of *season* and populate its games.

    Params:
        season (Season): with ``urls`` populated but not games; modified
            in place via ``season.add_game``.
    """
    for url in season.urls:
        self.go_to_link(url)
        html_source = self.get_html_source()
        html_querying = pyquery(html_source)
        # Check if the page says "No data available"
        no_data_div = html_querying.find('div.message-info > ul > li > div.cms')
        if no_data_div is not None and no_data_div.text() == 'No data available':
            # Yes, found "No data available"
            logger.warning('Found "No data available", skipping %s', url)
            continue
        retrieval_time_for_reference = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        tournament_table = html_querying.find('div#tournamentTable > table#tournamentTable')
        table_rows = tournament_table.find('tbody > tr')
        for i in range(len(table_rows)):
            try:
                row = table_rows.eq(i)
                # A blank time cell tells us this row holds no game/match data.
                time_cell = row.find('td.table-time')
                if not str(time_cell).strip():
                    continue
                game = Game()
                # Need the raw lxml element behind the PyQuery wrapper.
                time_cell = time_cell[0]
                for key, value in time_cell.attrib.items():
                    if key == 'class':
                        for time_cell_class in value.split(' '):
                            # Timestamp classes look like "t<unixtime>-...".
                            if not time_cell_class or time_cell_class[0] != 't':
                                continue
                            # BUG FIX: original tested time_cell_class[2] == '2'
                            # (second digit) where the sibling checks on index 1
                            # show the first digit of the unix time was meant.
                            if time_cell_class[1] in ('0', '1', '2'):
                                unix_time = int(time_cell_class.split('-')[0].replace('t', ''))
                                game.game_datetime = time.strftime(
                                    "%Y-%m-%d %H:%M:%S", time.localtime(unix_time))
                                break
                        break
                # If time still isn't set, assume corrupt data and skip the row.
                if not game.game_datetime:
                    continue
                # Set some of the other Game fields that are easy to fill in.
                game.retrieval_datetime = retrieval_time_for_reference
                game.retrieval_url = url
                game.num_possible_outcomes = season.possible_outcomes
                number_of_outcomes = season.possible_outcomes
                # Participants cell — the link within it, actually.
                participants_link = row.find('td.table-participant > a')
                participants = participants_link.text().split(' - ')
                game.team_home = participants[0]
                game.team_away = participants[1]
                game.game_url = self.base_url + participants_link[0].attrib['href']
                # Overall score cell; crude sanitization against suffixes like " OT".
                overall_score_cell = row.find('td.table-score')
                overall_score_string = overall_score_cell.text().split()[0]
                # Home team/participant is always listed first in Odds Portal's scores.
                if ':' in overall_score_string:
                    game.score_home = int(overall_score_string.split(':')[0])
                    game.score_away = int(overall_score_string.split(':')[1])
                elif '-' in overall_score_string:
                    game.score_home = int(overall_score_string.split('-')[0])
                    game.score_away = int(overall_score_string.split('-')[1])
                else:
                    logger.warning('Could not split score string - delimiter unknown')
                    # NOTE: raised inside this try block, so the blanket except
                    # below skips the row rather than aborting the season.
                    raise RuntimeError('Could not split score string - delimiter unknown')
                # Based on the score we can infer the outcome, as follows...
                if game.score_home > game.score_away:
                    game.outcome = 'HOME'
                elif game.score_home < game.score_away:
                    game.outcome = 'AWAY'
                else:
                    game.outcome = 'DRAW'
                # Finally, the odds cells — either 2 or 3 depending on outcomes.
                individual_odds_links = row.find('td.odds-nowrp > a')
                if len(individual_odds_links) < 2:
                    # Assume data corruption and skip to next row of the table.
                    continue
                elif number_of_outcomes != 2 and number_of_outcomes != 3:
                    raise RuntimeError(
                        'Unsupported number of outcomes specified - ' + str(number_of_outcomes))
                for x, individual_odds_link in enumerate(individual_odds_links):
                    if number_of_outcomes == 2:
                        if x == 0:
                            # home team odds
                            game.odds_home = individual_odds_link.text
                        else:
                            # away team odds - x must be 1
                            game.odds_away = individual_odds_link.text
                    elif number_of_outcomes == 3:
                        if x == 0:
                            # home team odds
                            game.odds_home = individual_odds_link.text
                        elif x == 1:
                            # draw/tie odds
                            game.odds_draw = individual_odds_link.text
                        else:
                            # away team odds - x must be 2
                            game.odds_away = individual_odds_link.text
                # Mark draw odds as None/null if only 2 outcomes are possible.
                if number_of_outcomes == 2:
                    game.odds_draw = None
                season.add_game(game)
            except Exception:
                # Keep the deliberate best-effort skip (scraped pages are messy)
                # but record the actual exception instead of discarding it.
                logger.warning(
                    'Skipping row, encountered exception - data format not as expected',
                    exc_info=True)
                continue
def populate_games_into_season(self, season):
    """Visit every URL of *season*, scrape each game's basic data plus the
    per-bookmaker odds from the game's detail page, and append each game
    (as a list) to the season.

    Params:
        season (Season): with ``urls`` populated but not games; modified
            in place via ``season.add_game``.
    """
    page_counter = 0
    for url in season.urls:
        page_counter = page_counter + 1
        self.go_to_link(url)
        html_source = self.get_html_source()
        html_querying = pyquery(html_source)
        # Check if the page says "No data available"
        no_data_div = html_querying.find('div.message-info > ul > li > div.cms')
        if no_data_div is not None and no_data_div.text() == 'No data available':
            # Yes, found "No data available"
            print('Found "No data available", skipping %s' % (url))
            continue
        # The "current" season page nests the table differently.
        if season.name == 'current':
            tournament_table = html_querying.find('table#tournamentTable')
        else:
            tournament_table = html_querying.find('div#tournamentTable > table#tournamentTable')
        table_rows = tournament_table.find('tbody > tr')
        num_table_rows = len(table_rows)
        for i in range(num_table_rows):
            row = table_rows.eq(i)
            # A blank time cell tells us this row holds no game/match data.
            time_cell = row.find('td.table-time')
            if len(str(time_cell).strip()) == 0:
                continue
            time_cell = time_cell[0]
            # BUG FIX: game_datetime was previously left undefined (NameError)
            # when no timestamp class was found; start it empty instead.
            game_datetime = ''
            for key, value in time_cell.attrib.items():
                if key == 'class':
                    for time_cell_class in value.split(' '):
                        # Timestamp classes look like "t<unixtime>-...".
                        if len(time_cell_class) == 0 or time_cell_class[0] != 't':
                            continue
                        # BUG FIX: original tested time_cell_class[2] == '2'
                        # (second digit) where index 1 — the first digit of the
                        # unix time — was clearly meant.
                        if time_cell_class[1] in ('0', '1', '2'):
                            unix_time = int(time_cell_class.split('-')[0].replace('t', ''))
                            game_datetime = time.strftime(
                                "%Y-%m-%d %H:%M:%S", time.localtime(unix_time))
                            break
                    break
            # If time still isn't set, assume corrupt data and skip the row.
            if len(game_datetime) == 0:
                continue
            # Find the real (non-javascript) link to the game's detail page.
            participants_link = row.find('td.table-participant > a')
            plink = None
            for anchor in participants_link:
                if "javascript:void(0)" not in anchor.attrib['href']:
                    plink = anchor.attrib['href']
                    break
            # BUG FIX: the original left the loop variable (an lxml element)
            # bound when every href was javascript:void(0), crashing on the
            # substring test below. Skip such rows instead.
            if plink is None:
                continue
            if "inplay-odds" in plink:
                # trim the "/inplay-odds/" suffix
                plink = plink[:-12]
            game_url = self.base_url + plink
            participants = participants_link.text().split(' - ')
            team_home = participants[0].strip()
            team_away = participants[1].strip()
            if season.name != 'current':
                # Get score
                overall_score_cell = row.find('td.table-score')
                overall_score_string = overall_score_cell.text()
                # Crude sanitization against suffixes like " OT".
                overall_score_string = overall_score_string.split()[0]
                if ':' in overall_score_string:
                    score_home = int(overall_score_string.split(':')[0])
                    score_away = int(overall_score_string.split(':')[1])
                elif '-' in overall_score_string:
                    score_home = int(overall_score_string.split('-')[0])
                    score_away = int(overall_score_string.split('-')[1])
                else:
                    print('Could not split score string - delimiter unknown. Skipping game')
                    continue
                # Based on the score we can infer the outcome, as follows...
                # NOTE(review): draws fall into the AWAY branch here —
                # presumably intentional for two-outcome sports; confirm.
                if score_home > score_away:
                    outcome = 2  # HOME
                else:
                    outcome = 1  # AWAY
            else:
                score_home = 0
                score_away = 0
                outcome = 0
            # Visit the odds-details URL for this game.
            self.go_to_link(game_url)
            html_source = self.get_html_source()
            html_querying = pyquery(html_source)
            # Check if the page says "No data available"
            no_data_div = html_querying.find('div.message-info > ul > li > div.cms')
            if no_data_div is not None and no_data_div.text() == 'No data available':
                # Yes, found "No data available"
                print('Found "No data available", skipping %s' % (game_url))
                continue
            # Get experts + odds
            experts_dict = {}
            odds_table = html_querying.find(
                "div#odds-data-table > div.table-container > table.table-main")
            odds_table_rows = odds_table.find('tbody > tr')
            num_odds_table_rows = len(odds_table_rows)
            print("\tProcessing games in page %i of %i: \t%.2f%%\t" %
                  (page_counter, len(season.urls), (i + 1) / num_table_rows * 100),
                  end="\r", flush=True)
            for j in range(num_odds_table_rows):
                expert_row = odds_table_rows.eq(j)
                expert_name = expert_row.find('td> div.l > a.name').text()
                if len(expert_name) == 0:
                    continue
                odds = expert_row.find('td.right ').text().split()
                # BUG FIX: the original removed items from `odds` while
                # iterating over it, which skips the element after each
                # removal; filter into a new list instead.
                valid_odds = []
                for odd in odds:
                    try:
                        float(odd)
                    except ValueError:
                        # Drop any odds that cannot be cast to float.
                        continue
                    valid_odds.append(odd)
                odds = valid_odds
                if len(odds) == 2:
                    # reverse odds to AWAY, HOME order
                    experts_dict[expert_name] = [odds[1], odds[0]]
            # Append game info (as a flat list) to the season.
            game = [game_datetime, team_away, team_home, score_away, score_home,
                    outcome, experts_dict]
            season.add_game(game)
def populate_odds_detailed(self, season, game, game_row):
    """Open a game's detail page and fill in ``game.odds`` for the
    season's bet type (and optional bet options).

    Params:
        season (Season): supplies bet_type / sub_bet_type / bet_options.
        game (Game): object whose ``odds`` attribute is populated.
        game_row: pyquery row for the game in the tournament table
            (used for its participant link; otherwise unused, kept to
            match the default signature).
    """
    # Save the listing URL so we can navigate back afterwards.
    season_url = self.driver.current_url
    game_href = game_row.find('td.table-participant > a').attr.href
    game_link = f"{self.base_url}{game_href}#{season.bet_type}"
    if season.sub_bet_type is not None:
        game_link += f";{season.sub_bet_type}"
    try:
        self.go_to_link(game_link)
    except WebDriverException:
        logger.warning(f'Link is malformed. Could not get odds for game. Link: {game_link}')
        return
    # For some games, the desired bet type is not available. If so, the
    # browser lands on a URL different from the expected one.
    if self.driver.current_url != game_link:
        logger.warning(f'Odds for bet type combination not available for this game. Link: {game_link}')
        self.driver.get(season_url)
        return
    if season.bet_options is not None:
        self.open_rows({o.lower() for o in season.bet_options})
    # Freeze html after opening rows.
    pq = pyquery(self.get_html_source())
    odds_table = pq.find('div#odds-data-table')
    tcs = odds_table.find('div.table-container')
    # If bet_options is None then we should be in the only table container.
    if season.bet_options is None:
        try:
            game.odds = self.scrape_odds_detailed(season, game_link, tcs[0])
        except IndexError:
            # BUG FIX: the original f-string referenced the undefined name
            # `s`, raising NameError inside this handler and masking the
            # real failure.
            logger.warning(f'Some problem occurred parsing odds. Could not get odds for game. Link: {game_link}')
            return
    # Otherwise, we need to go into each table row individually.
    else:
        # Create set of options for constant-time membership tests.
        option_set = {o.lower() for o in season.bet_options}
        # Use this to preserve case in output JSON.
        bet_options_lower = {opt.lower(): opt for opt in season.bet_options}
        # Each tc is an lxml object.
        for tc in tcs:
            try:
                style = tc.get('style')
                # Don't want any elements that aren't visible.
                try:
                    if 'display:none' in style.replace(' ', ''):
                        continue
                except AttributeError:
                    # Displayed elements do not have a style attribute.
                    pass
                try:
                    a = tc.cssselect('div > strong > a[onclick]')[0]
                except IndexError:
                    # If there is no anchor tag for some reason, the table
                    # container is not formed as expected; go to the next one.
                    continue
                option = a.text.lower().strip()
                if len(option_set) == 0 or option in option_set:
                    game_odds = self.scrape_odds_detailed(season, game_link, tc)
                    try:
                        game.odds[bet_options_lower[option]] = game_odds
                    except KeyError:
                        game.odds[option] = game_odds
            except NoSuchElementException:
                pass
    # restore URL
    self.driver.get(season_url)
def get_intro(self):
    """Return the first 250 characters of the content's plain text."""
    plain_text = pyquery(self.content).text()
    return plain_text[:250]
def get_pic(self):
    """Return the src URL of the first image in the content, or None.

    Returns None both when there is no <img> at all and when the first
    <img> carries no src attribute (e.g. lazy-loaded images), where the
    original raised KeyError.
    """
    items = pyquery(self.content).find('img')
    if not items:
        return None
    # .get avoids a KeyError on images without a src attribute.
    return items[0].attrib.get('src')
def getJobContent(html):
    """Extract the job-description text from *html*.

    Params:
        html (str): page HTML.
    Returns:
        (str) ASCII-only text of the ``.templatetext`` element.
    """
    py = pyquery(html)
    # encode/ignore-decode drops any non-ASCII characters; the result is
    # already a str, so the original's extra str() round-trip was redundant.
    return py('.templatetext').text().encode('ascii', 'ignore').decode('ascii')
def getTitle(html):
    """Return the text of the ``.jobtitle`` element in *html*."""
    document = pyquery(html)
    return document('.jobtitle').text()