def get_seasons_for_league(self, main_league_results_url):
    """Collect a Season object for every season listed on a league's results page.

    Params:
        main_league_results_url (str): e.g.
            https://www.oddsportal.com/hockey/usa/nhl/results/
    Returns:
        (list) Season objects, each seeded with its root results URL;
        empty list if the page could not be loaded.
    """
    print('Getting all seasons for league via %s' % (main_league_results_url))
    if not self.go_to_link(main_league_results_url):
        print('League results URL loaded unsuccessfully %s' % (main_league_results_url))
        # Empty list tells the caller there is nothing to process further.
        return []
    document = pyquery(self.get_html_source())
    season_anchors = document.find(
        'div.main-menu2.main-menu-gray > ul.main-filter > li > span > strong > a'
    )
    print('Extracted links to %d seasons' % (len(season_anchors)))
    collected = []
    for anchor in season_anchors:
        season = Season(anchor.text)
        # Seed the Season's URL list with just the root results page.
        collected.append(season)
        season.urls.append(self.base_url + anchor.attrib['href'])
    return collected
def extract_urls(html):
    """Parse absolute http(s) links out of *html*; performs no I/O.

    Side effect: every newly seen URL is also appended to the
    module-level ``waiting_urls`` queue.

    Params:
        html (str): raw HTML document.
    Returns:
        (list) unique URLs found, in document order.
    """
    urls = []
    pq = pyquery(html)
    for link in pq.items('a'):
        url = link.attr('href')
        # BUG FIX: original called the non-existent str.startwith(), which
        # raised AttributeError for every candidate URL.
        if url and url.startswith('http') and url not in urls:
            urls.append(url)
            waiting_urls.append(url)
    return urls
async def handle_article(url, session, pool):
    """Process one article: fetch it, record it as seen, harvest its
    links, and persist the page title to the database.

    Params:
        url (str): article URL to fetch.
        session: HTTP client session passed through to ``fetch``.
        pool: async DB connection pool (aiomysql-style interface).
    """
    html = await fetch(url, session)
    seen_urls.append(url)
    extract_urls(html)
    pq = pyquery(html)
    title = pq('title').text()
    async with pool.acquire() as conn:
        async with conn.cursor() as cur:
            # SECURITY FIX: the original interpolated the page title straight
            # into the SQL string with .format(), allowing SQL injection and
            # breaking on titles containing quotes. Use a parameterized query.
            # Assumes a %s-paramstyle driver (e.g. aiomysql) — TODO confirm.
            insert_sql = "insert into Test(title) values(%s)"
            await cur.execute(insert_sql, (title,))
def fill_in_season_pagination_links(self, season):
    """Discover every paginated results page of a season and append the URLs.

    Params:
        season (Season): object with exactly one entry in its ``urls``
            field; modified in place.
    Raises:
        RuntimeError: if pagination exists but the final page link cannot
            be located (page format must have changed).
    """
    first_url_in_season = season.urls[0]
    self.go_to_link(first_url_in_season)
    html_source = self.get_html_source()
    html_querying = pyquery(html_source)
    # Check if the page says "No data available"
    no_data_div = html_querying.find('div.message-info > ul > li > div.cms')
    if no_data_div is not None and no_data_div.text() == 'No data available':
        # Yes, found "No data available"
        print('Found "No data available", skipping %s' % (first_url_in_season))
        return
    # Just need to locate the final pagination tag
    pagination_links = html_querying.find('div#pagination > a')
    # It's possible, however, there is no pagination...
    if len(pagination_links) <= 1:
        return
    last_page_number = -1
    last_page_url = None
    for link in reversed(pagination_links):
        span = link.find('span')
        # The last-page link is the one containing the "»|" glyphs.
        if span is not None and span.text is not None and '»|' in span.text:
            last_page_number = int(link.attrib['x-page'])
            last_page_url = first_url_in_season + link.attrib['href']
            break
    # If the last page number was never set, the page format must've changed.
    if last_page_number == -1:
        print('Could not locate final page URL from %s' % (first_url_in_season))
        # BUG FIX: the original passed the URL as a second positional
        # argument to RuntimeError, which does not interpolate %s — the URL
        # never appeared in the exception message.
        raise RuntimeError('Could not locate final page URL from %s' % (first_url_in_season))
    # Derive the intermediate page URLs (2 .. last-1) from the last page's URL.
    for i in range(2, last_page_number):
        this_url = last_page_url.replace('page/' + str(last_page_number), 'page/' + str(i))
        season.urls.append(this_url)
    season.urls.append(last_page_url)
def populate_games_into_season(self, season):
    """Visit every URL of *season* and populate its games.

    Params:
        season (Season): with ``urls`` populated but not games; modified
            in place via ``season.add_game``.
    """
    for url in season.urls:
        self.go_to_link(url)
        html_source = self.get_html_source()
        html_querying = pyquery(html_source)
        # Check if the page says "No data available"
        no_data_div = html_querying.find('div.message-info > ul > li > div.cms')
        if no_data_div is not None and no_data_div.text() == 'No data available':
            # Yes, found "No data available"
            logger.warning('Found "No data available", skipping %s', url)
            continue
        retrieval_time_for_reference = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        tournament_table = html_querying.find('div#tournamentTable > table#tournamentTable')
        table_rows = tournament_table.find('tbody > tr')
        for i in range(len(table_rows)):
            try:
                row = table_rows.eq(i)
                # A blank time cell tells us this row holds no game/match data.
                time_cell = row.find('td.table-time')
                if not str(time_cell).strip():
                    continue
                game = Game()
                # Need the raw lxml element behind the PyQuery wrapper.
                time_cell = time_cell[0]
                for key, value in time_cell.attrib.items():
                    if key == 'class':
                        for time_cell_class in value.split(' '):
                            # Timestamp classes look like "t<unixtime>-...".
                            if not time_cell_class or time_cell_class[0] != 't':
                                continue
                            # BUG FIX: original tested time_cell_class[2] == '2'
                            # (second digit) where the sibling checks on index 1
                            # show the first digit of the unix time was meant.
                            if time_cell_class[1] in ('0', '1', '2'):
                                unix_time = int(time_cell_class.split('-')[0].replace('t', ''))
                                game.game_datetime = time.strftime(
                                    "%Y-%m-%d %H:%M:%S", time.localtime(unix_time))
                                break
                        break
                # If time still isn't set, assume corrupt data and skip the row.
                if not game.game_datetime:
                    continue
                # Set some of the other Game fields that are easy to fill in.
                game.retrieval_datetime = retrieval_time_for_reference
                game.retrieval_url = url
                game.num_possible_outcomes = season.possible_outcomes
                number_of_outcomes = season.possible_outcomes
                # Participants cell — the link within it, actually.
                participants_link = row.find('td.table-participant > a')
                participants = participants_link.text().split(' - ')
                game.team_home = participants[0]
                game.team_away = participants[1]
                game.game_url = self.base_url + participants_link[0].attrib['href']
                # Overall score cell; crude sanitization against suffixes like " OT".
                overall_score_cell = row.find('td.table-score')
                overall_score_string = overall_score_cell.text().split()[0]
                # Home team/participant is always listed first in Odds Portal's scores.
                if ':' in overall_score_string:
                    game.score_home = int(overall_score_string.split(':')[0])
                    game.score_away = int(overall_score_string.split(':')[1])
                elif '-' in overall_score_string:
                    game.score_home = int(overall_score_string.split('-')[0])
                    game.score_away = int(overall_score_string.split('-')[1])
                else:
                    logger.warning('Could not split score string - delimiter unknown')
                    # NOTE: raised inside this try block, so the blanket except
                    # below skips the row rather than aborting the season.
                    raise RuntimeError('Could not split score string - delimiter unknown')
                # Based on the score we can infer the outcome, as follows...
                if game.score_home > game.score_away:
                    game.outcome = 'HOME'
                elif game.score_home < game.score_away:
                    game.outcome = 'AWAY'
                else:
                    game.outcome = 'DRAW'
                # Finally, the odds cells — either 2 or 3 depending on outcomes.
                individual_odds_links = row.find('td.odds-nowrp > a')
                if len(individual_odds_links) < 2:
                    # Assume data corruption and skip to next row of the table.
                    continue
                elif number_of_outcomes != 2 and number_of_outcomes != 3:
                    raise RuntimeError(
                        'Unsupported number of outcomes specified - ' + str(number_of_outcomes))
                for x, individual_odds_link in enumerate(individual_odds_links):
                    if number_of_outcomes == 2:
                        if x == 0:
                            # home team odds
                            game.odds_home = individual_odds_link.text
                        else:
                            # away team odds - x must be 1
                            game.odds_away = individual_odds_link.text
                    elif number_of_outcomes == 3:
                        if x == 0:
                            # home team odds
                            game.odds_home = individual_odds_link.text
                        elif x == 1:
                            # draw/tie odds
                            game.odds_draw = individual_odds_link.text
                        else:
                            # away team odds - x must be 2
                            game.odds_away = individual_odds_link.text
                # Mark draw odds as None/null if only 2 outcomes are possible.
                if number_of_outcomes == 2:
                    game.odds_draw = None
                season.add_game(game)
            except Exception:
                # Keep the deliberate best-effort skip (scraped pages are messy)
                # but record the actual exception instead of discarding it.
                logger.warning(
                    'Skipping row, encountered exception - data format not as expected',
                    exc_info=True)
                continue
def populate_games_into_season(self, season):
    """Visit every URL of *season*, scrape each game's basic data plus the
    per-bookmaker odds from the game's detail page, and append each game
    (as a list) to the season.

    Params:
        season (Season): with ``urls`` populated but not games; modified
            in place via ``season.add_game``.
    """
    page_counter = 0
    for url in season.urls:
        page_counter = page_counter + 1
        self.go_to_link(url)
        html_source = self.get_html_source()
        html_querying = pyquery(html_source)
        # Check if the page says "No data available"
        no_data_div = html_querying.find('div.message-info > ul > li > div.cms')
        if no_data_div is not None and no_data_div.text() == 'No data available':
            # Yes, found "No data available"
            print('Found "No data available", skipping %s' % (url))
            continue
        # The "current" season page nests the table differently.
        if season.name == 'current':
            tournament_table = html_querying.find('table#tournamentTable')
        else:
            tournament_table = html_querying.find('div#tournamentTable > table#tournamentTable')
        table_rows = tournament_table.find('tbody > tr')
        num_table_rows = len(table_rows)
        for i in range(num_table_rows):
            row = table_rows.eq(i)
            # A blank time cell tells us this row holds no game/match data.
            time_cell = row.find('td.table-time')
            if len(str(time_cell).strip()) == 0:
                continue
            time_cell = time_cell[0]
            # BUG FIX: game_datetime was previously left undefined (NameError)
            # when no timestamp class was found; start it empty instead.
            game_datetime = ''
            for key, value in time_cell.attrib.items():
                if key == 'class':
                    for time_cell_class in value.split(' '):
                        # Timestamp classes look like "t<unixtime>-...".
                        if len(time_cell_class) == 0 or time_cell_class[0] != 't':
                            continue
                        # BUG FIX: original tested time_cell_class[2] == '2'
                        # (second digit) where index 1 — the first digit of the
                        # unix time — was clearly meant.
                        if time_cell_class[1] in ('0', '1', '2'):
                            unix_time = int(time_cell_class.split('-')[0].replace('t', ''))
                            game_datetime = time.strftime(
                                "%Y-%m-%d %H:%M:%S", time.localtime(unix_time))
                            break
                    break
            # If time still isn't set, assume corrupt data and skip the row.
            if len(game_datetime) == 0:
                continue
            # Find the real (non-javascript) link to the game's detail page.
            participants_link = row.find('td.table-participant > a')
            plink = None
            for anchor in participants_link:
                if "javascript:void(0)" not in anchor.attrib['href']:
                    plink = anchor.attrib['href']
                    break
            # BUG FIX: the original left the loop variable (an lxml element)
            # bound when every href was javascript:void(0), crashing on the
            # substring test below. Skip such rows instead.
            if plink is None:
                continue
            if "inplay-odds" in plink:
                # trim the "/inplay-odds/" suffix
                plink = plink[:-12]
            game_url = self.base_url + plink
            participants = participants_link.text().split(' - ')
            team_home = participants[0].strip()
            team_away = participants[1].strip()
            if season.name != 'current':
                # Get score
                overall_score_cell = row.find('td.table-score')
                overall_score_string = overall_score_cell.text()
                # Crude sanitization against suffixes like " OT".
                overall_score_string = overall_score_string.split()[0]
                if ':' in overall_score_string:
                    score_home = int(overall_score_string.split(':')[0])
                    score_away = int(overall_score_string.split(':')[1])
                elif '-' in overall_score_string:
                    score_home = int(overall_score_string.split('-')[0])
                    score_away = int(overall_score_string.split('-')[1])
                else:
                    print('Could not split score string - delimiter unknown. Skipping game')
                    continue
                # Based on the score we can infer the outcome, as follows...
                # NOTE(review): draws fall into the AWAY branch here —
                # presumably intentional for two-outcome sports; confirm.
                if score_home > score_away:
                    outcome = 2  # HOME
                else:
                    outcome = 1  # AWAY
            else:
                score_home = 0
                score_away = 0
                outcome = 0
            # Visit the odds-details URL for this game.
            self.go_to_link(game_url)
            html_source = self.get_html_source()
            html_querying = pyquery(html_source)
            # Check if the page says "No data available"
            no_data_div = html_querying.find('div.message-info > ul > li > div.cms')
            if no_data_div is not None and no_data_div.text() == 'No data available':
                # Yes, found "No data available"
                print('Found "No data available", skipping %s' % (game_url))
                continue
            # Get experts + odds
            experts_dict = {}
            odds_table = html_querying.find(
                "div#odds-data-table > div.table-container > table.table-main")
            odds_table_rows = odds_table.find('tbody > tr')
            num_odds_table_rows = len(odds_table_rows)
            print("\tProcessing games in page %i of %i: \t%.2f%%\t" %
                  (page_counter, len(season.urls), (i + 1) / num_table_rows * 100),
                  end="\r", flush=True)
            for j in range(num_odds_table_rows):
                expert_row = odds_table_rows.eq(j)
                expert_name = expert_row.find('td> div.l > a.name').text()
                if len(expert_name) == 0:
                    continue
                odds = expert_row.find('td.right ').text().split()
                # BUG FIX: the original removed items from `odds` while
                # iterating over it, which skips the element after each
                # removal; filter into a new list instead.
                valid_odds = []
                for odd in odds:
                    try:
                        float(odd)
                    except ValueError:
                        # Drop any odds that cannot be cast to float.
                        continue
                    valid_odds.append(odd)
                odds = valid_odds
                if len(odds) == 2:
                    # reverse odds to AWAY, HOME order
                    experts_dict[expert_name] = [odds[1], odds[0]]
            # Append game info (as a flat list) to the season.
            game = [game_datetime, team_away, team_home, score_away, score_home,
                    outcome, experts_dict]
            season.add_game(game)
def populate_odds_detailed(self, season, game, game_row):
    """Open a game's detail page and fill in ``game.odds`` for the
    season's bet type (and optional bet options).

    Params:
        season (Season): supplies bet_type / sub_bet_type / bet_options.
        game (Game): object whose ``odds`` attribute is populated.
        game_row: pyquery row for the game in the tournament table
            (used for its participant link; otherwise unused, kept to
            match the default signature).
    """
    # Save the listing URL so we can navigate back afterwards.
    season_url = self.driver.current_url
    game_href = game_row.find('td.table-participant > a').attr.href
    game_link = f"{self.base_url}{game_href}#{season.bet_type}"
    if season.sub_bet_type is not None:
        game_link += f";{season.sub_bet_type}"
    try:
        self.go_to_link(game_link)
    except WebDriverException:
        logger.warning(f'Link is malformed. Could not get odds for game. Link: {game_link}')
        return
    # For some games, the desired bet type is not available. If so, the
    # browser lands on a URL different from the expected one.
    if self.driver.current_url != game_link:
        logger.warning(f'Odds for bet type combination not available for this game. Link: {game_link}')
        self.driver.get(season_url)
        return
    if season.bet_options is not None:
        self.open_rows({o.lower() for o in season.bet_options})
    # Freeze html after opening rows.
    pq = pyquery(self.get_html_source())
    odds_table = pq.find('div#odds-data-table')
    tcs = odds_table.find('div.table-container')
    # If bet_options is None then we should be in the only table container.
    if season.bet_options is None:
        try:
            game.odds = self.scrape_odds_detailed(season, game_link, tcs[0])
        except IndexError:
            # BUG FIX: the original f-string referenced the undefined name
            # `s`, raising NameError inside this handler and masking the
            # real failure.
            logger.warning(f'Some problem occurred parsing odds. Could not get odds for game. Link: {game_link}')
            return
    # Otherwise, we need to go into each table row individually.
    else:
        # Create set of options for constant-time membership tests.
        option_set = {o.lower() for o in season.bet_options}
        # Use this to preserve case in output JSON.
        bet_options_lower = {opt.lower(): opt for opt in season.bet_options}
        # Each tc is an lxml object.
        for tc in tcs:
            try:
                style = tc.get('style')
                # Don't want any elements that aren't visible.
                try:
                    if 'display:none' in style.replace(' ', ''):
                        continue
                except AttributeError:
                    # Displayed elements do not have a style attribute.
                    pass
                try:
                    a = tc.cssselect('div > strong > a[onclick]')[0]
                except IndexError:
                    # If there is no anchor tag for some reason, the table
                    # container is not formed as expected; go to the next one.
                    continue
                option = a.text.lower().strip()
                if len(option_set) == 0 or option in option_set:
                    game_odds = self.scrape_odds_detailed(season, game_link, tc)
                    try:
                        game.odds[bet_options_lower[option]] = game_odds
                    except KeyError:
                        game.odds[option] = game_odds
            except NoSuchElementException:
                pass
    # restore URL
    self.driver.get(season_url)
def get_intro(self):
    """Return the first 250 characters of the content's plain text."""
    plain_text = pyquery(self.content).text()
    return plain_text[:250]
def get_pic(self):
    """Return the src URL of the first image in the content, or None.

    Returns None both when there is no <img> at all and when the first
    <img> carries no src attribute (e.g. lazy-loaded images), where the
    original raised KeyError.
    """
    items = pyquery(self.content).find('img')
    if not items:
        return None
    # .get avoids a KeyError on images without a src attribute.
    return items[0].attrib.get('src')
def getJobContent(html):
    """Extract the job-description text from *html*.

    Params:
        html (str): page HTML.
    Returns:
        (str) ASCII-only text of the ``.templatetext`` element.
    """
    py = pyquery(html)
    # encode/ignore-decode drops any non-ASCII characters; the result is
    # already a str, so the original's extra str() round-trip was redundant.
    return py('.templatetext').text().encode('ascii', 'ignore').decode('ascii')
def getTitle(html):
    """Return the text of the ``.jobtitle`` element in *html*."""
    document = pyquery(html)
    return document('.jobtitle').text()