def __parse_2020_search_soup(self, soup):
    """Extract the non-sponsored listings from a search results page.

    The results container is looked up first as ``<ul id="ListViewInner">``
    and, failing that, as ``<ul class="srp-results">``. Only the direct
    ``<li>`` children of that container are considered.

    Args:
        soup: Parsed search page exposing ``find`` / ``find_all``
            (presumably a BeautifulSoup document -- confirm with caller).

    Returns:
        dict mapping each integer listing ID to a ``SearchResult`` built
        from the listing's whitespace-normalised title and its link with
        the tracking query string removed.

    Raises:
        AttributeError: If neither results container is found, or if a
            listing lacks the ``<h3><a>`` title link.
        KeyError: If the title link has no ``href`` attribute.
    """
    auctions_list = soup.find('ul', id='ListViewInner')
    if auctions_list is None:
        auctions_list = soup.find('ul', {'class': 'srp-results'})

    # Compiled once; these patterns do not change between listings.
    pulsar_href = re.compile('.*pulsar.*')
    sponsored_class = re.compile('.*SPONSORED.*')

    auctions = {}
    for result in auctions_list.find_all('li', recursive=False):
        # Filter out sponsored results
        if (result.find('div', attrs={'class': 'promoted-lv'})
                or result.find('div',
                               attrs={'class': 's-item__title--tagblock'})
                or result.find('a', href=pulsar_href)
                # attrs has to be a dict of attribute name -> matcher.
                or result.find('span', attrs={'class': sponsored_class})):
            continue

        try:
            raw_id = result.attrs['listingid']
        except KeyError:
            print("Found a non-item. Skipping...")
            print(result.prettify())
            continue

        try:
            auction_id = int(raw_id)
        except ValueError:
            # Report the raw attribute value: no usable ID exists for this
            # listing, so it cannot be stored and is skipped.
            print(f"Could not convert auction ID {raw_id} to int")
            continue

        title_link = result.find('h3').find('a')

        # Collapse runs of whitespace in the link's own text node.
        name = ' '.join(
            title_link.find(text=True, recursive=False).split())

        # Strip tracking query parameters from the uri
        tracking_uri = title_link.attrs['href']
        uri = urljoin(tracking_uri, urlparse(tracking_uri).path)

        auctions[auction_id] = SearchResult(name, uri)

    return auctions
def _scrape_search_page(self, uri):
    """Fetch a JSON search page and index its lots by ID.

    Args:
        uri: Address handed to ``self._get_json``.

    Returns:
        A 2-tuple ``(results, raw)``: ``results`` maps the string form of
        each lot's ``id`` to a ``SearchResult`` built from the lot's
        ``title`` and ``url``; ``raw`` is the fetched payload passed
        through ``json_dumps_unicode``.
    """
    payload = self._get_json(uri)

    results = {}
    for lot in payload['lots']:
        entry = SearchResult(lot['title'], lot['url'])
        results[str(lot['id'])] = entry

    return results, json_dumps_unicode(payload)
def _scrape_search_page(self, uri):
    """Scrape a search page whose results are embedded as JSON data.

    Args:
        uri: Address handed to ``self._get_page``.

    Returns:
        A 2-tuple ``(results, html)``: ``results`` maps each ID listed
        under ``search.itemIds`` to a ``SearchResult`` whose name is the
        matching ``item.byId`` entry's ``title`` and whose uri is
        ``self.base_auction_uri`` formatted with that ID; ``html`` is the
        prettified page.
    """
    soup = self._get_page(uri)
    page_data = self.__extract_data_json(soup)

    # A falsy ``itemIds`` value is treated as "no results".
    listed_ids = page_data['search']['itemIds'] or []

    results = {}
    for auction_id in listed_ids:
        item = page_data['item']['byId'][str(auction_id)]
        title = item['title']
        auction_uri = self.base_auction_uri.format(auction_id)
        results[auction_id] = SearchResult(name=title, uri=auction_uri)

    return results, soup.prettify()
def _scrape_search_page(self, uri):
    """Scrape a search page whose results are rendered as title blocks.

    Every ``<div>`` whose class matches ``item-title-container`` is
    inspected; blocks without an ``<a>`` element are ignored.

    Args:
        uri: Address handed to ``self._get_page``.

    Returns:
        A 2-tuple ``(results, html)``: ``results`` maps each auction ID
        (a string taken from the link's href) to a ``SearchResult`` whose
        name is the block's text and whose uri is
        ``self.base_auction_uri`` formatted with that ID; ``html`` is the
        prettified page.
    """
    soup = self._get_page(uri)
    title_class = re.compile('item-title-container')
    title_blocks = soup.find_all('div', attrs={'class': title_class})

    results = {}
    for block in title_blocks:
        anchor = block.find('a')
        if anchor is not None:
            # The ID is the part before the first underscore in the
            # third '/'-separated piece of the href.
            segment = anchor['href'].split('/')[2]
            auction_id = segment.split('_', 1)[0]
            results[auction_id] = SearchResult(
                name=block.text,
                uri=self.base_auction_uri.format(auction_id))

    return results, soup.prettify()