Example #1
0
    def __parse_2020_search_soup(self, soup):
        """Parse a 2020-layout search results page into auctions.

        Args:
            soup: BeautifulSoup document of the search results page.

        Returns:
            dict mapping int auction ID -> SearchResult(name, uri), with
            sponsored/promoted listings and non-item entries skipped.
        """
        auctions_list = soup.find('ul', id='ListViewInner')
        if auctions_list is None:
            # Fall back to the newer results-container markup.
            auctions_list = soup.find('ul', {'class': 'srp-results'})
        results = auctions_list.find_all('li', recursive=False)

        auctions = {}
        for result in results:
            # Filter out sponsored results
            if result.find('div', attrs={'class': 'promoted-lv'}) or \
                    result.find('div', attrs={'class': 's-item__title--tagblock'}) or \
                    result.find('a', href=re.compile('.*pulsar.*')) or \
                    result.find('span', attrs={'class': re.compile('.*SPONSORED.*')}):
                # BUG FIX: the last filter passed a set {'class', pattern} as
                # attrs instead of a dict, so it never matched by class value.
                continue

            try:
                auction_id = int(result.attrs['listingid'])
            except KeyError:
                print("Found a non-item. Skipping...")
                print(result.prettify())
                continue
            except ValueError:
                # BUG FIX: auction_id is unbound when int() raises, so the old
                # message hit a NameError; report the raw attribute instead,
                # and skip the entry rather than falling through without an ID.
                print(f"Could not convert auction ID "
                      f"{result.attrs['listingid']} to int")
                continue

            name = ' '.join(result.find('h3').find('a').find(
                    text=True, recursive=False).split())
            # Strip tracking query parameters from the uri
            tracking_uri = result.find('h3').find('a').attrs['href']
            uri = urljoin(tracking_uri, urlparse(tracking_uri).path)

            auctions[auction_id] = SearchResult(name, uri)
        return auctions
Example #2
0
    def _scrape_search_page(self, uri):
        """Fetch a search page as JSON and collect the lots it lists.

        Returns a tuple of (results, raw page dump): results maps the
        stringified lot ID to a SearchResult of its title and URL, and
        the dump is the page data serialized via json_dumps_unicode.
        """
        data = self._get_json(uri)

        results = {
            str(lot['id']): SearchResult(lot['title'], lot['url'])
            for lot in data['lots']
        }
        return results, json_dumps_unicode(data)
Example #3
0
    def _scrape_search_page(self, uri):
        """Scrape a search page and collect the auctions it lists.

        Returns a tuple of (results, prettified page HTML): results maps
        each auction ID from the page's embedded data JSON to a
        SearchResult built from that item's title and canonical URI.
        """
        soup = self._get_page(uri)
        data = self.__extract_data_json(soup)

        # itemIds may be None when the search came back empty.
        item_ids = data['search']['itemIds'] or []

        results = {}
        for item_id in item_ids:
            details = data['item']['byId'][str(item_id)]
            results[item_id] = SearchResult(
                name=details['title'],
                uri=self.base_auction_uri.format(item_id))

        return results, soup.prettify()
Example #4
0
    def _scrape_search_page(self, uri):
        """Scrape an HTML search results grid into auctions.

        Returns a tuple of (results, prettified page HTML): results maps
        each auction ID (extracted from the listing link) to a
        SearchResult of the listing's text and canonical auction URI.
        """
        soup = self._get_page(uri)
        title_cells = soup.find_all(
            'div', attrs={'class': re.compile('item-title-container')})

        results = {}
        for cell in title_cells:
            anchor = cell.find('a')
            if anchor is None:
                # Grid cell without a link — nothing to index.
                continue
            # Assumes the href's third path segment starts with the
            # auction ID, terminated by '_' — TODO confirm against site.
            auction_id = anchor['href'].split('/')[2].split('_')[0]
            results[auction_id] = SearchResult(
                name=cell.text,
                uri=self.base_auction_uri.format(auction_id))

        return results, soup.prettify()