class WebScraper: def __init__(self): self.logger = Logger() def get_all_departments(self): url = 'https://www.emag.ro/all-departments' request = requests.get(url) root_url = 'https://www.emag.ro' prods = {} n_prods = 0 soup = BeautifulSoup(request.text, 'html.parser') departments_tiles = soup.findAll( 'div', {'id': 'departments-page'})[0].findAll( 'div', {'id': 'department-expanded'}) for dept in departments_tiles: n_prods += int( dept.findAll('h3', recursive=False)[0].findAll('span') [0].contents[0].strip().replace('produse', '').replace( 'de', '').replace(' ', '')[1:-1]) tile_name = dept.findAll('h3', recursive=False)[0].contents[1].strip() prods[tile_name] = {} cols = dept.findAll('ul')[0].findAll('li', recursive=False) for col in cols: col_name = None elems = col.findAll('ul')[0].findAll('li') col_name = col.findAll( 'a', recursive=False)[0].findAll('h4')[0].contents[0].strip() prods[tile_name][col_name] = [] for elem in elems: anchor = elem.findAll('a')[0] name = anchor.findAll('h5')[0].contents[0].strip() url = root_url + '/'.join(anchor['href'].strip().split('/') [:-1]) + '/sort-pricedesc' prods[tile_name][col_name].append((name, url)) return prods def get_total_products(self, url): while True: try: request = requests.get(url + '/c') soup = BeautifulSoup(request.text, 'html.parser') total = int(soup.findAll('div', {'class' : 'page-container'})[0]\ .findAll('div', {'class' : 'listing-panel'})[1]\ .findAll('div', {'class' : 'listing-panel-footer'})[0]\ .findAll('div', {'class' : 'row'})[0]\ .findAll('div', recursive = False)[1]\ .findAll('p')[0]\ .findAll('strong')[1].contents[0]) self.logger.total_products(url, total) return total except IndexError: self.logger.failed_total_number_products(url.split('/')[-2]) #f***s up here #remove while, raise exception, catch it on the other side, uncount iterator for 200 def __find_items(self, response): soup = BeautifulSoup(response, 'html.parser') card_items = soup.findAll('div', {'class' : 'page-container'})[0]\ .findAll('div', {'id' : 'card_grid'})[0]\ .findAll('div', {'class' : 'card-item'}) items = list( map( lambda item: item.findAll('div', {'class': 'card'})[0].findAll( 'div', {'class': 'card-section-wrapper'})[0], card_items)) return items def __extract_image(self, item): imagesrc = item.findAll('div', {'class': 'card-section-top'})[0]\ .findAll('div', {'class': 'card-heading'})[0]\ .findAll('a', {'class' : 'thumbnail-wrapper'})[0]\ .findAll('div', {'class': 'thumbnail'})[0]\ .findAll('img', {'class': 'lozad'})[0]['src'] return imagesrc def __extract_title(self, item): title = item.findAll('div', {'class': 'card-section-mid'})[0]\ .findAll('h2', {'class': 'card-body'})[0]\ .findAll('a', {'class' : 'product-title'})[0].contents[0].strip() return title def __extract_link(self, item): link = item.findAll('div', {'class': 'card-section-mid'})[0]\ .findAll('h2', {'class': 'card-body'})[0]\ .findAll('a', {'class' : 'product-title'})[0]['href'] return link def __extract_prices(self, item): price_card = item.findAll('div', {'class': 'card-section-btm'})[0] try: price_card = price_card.findAll('div', {'class': 'card-body'})[1] except: price_card = price_card.findAll('div', {'class': 'card-body'})[0] oprice = None try: old_price = price_card.findAll('p', {'class': 'product-old-price'})[0] oprice = float( old_price.findAll('s')[0].contents[0].strip().replace('.', '') ) + float(old_price.findAll('sup')[0].contents[0].strip()) / 100 except: pass new_price = price_card.findAll('p', {'class': 'product-new-price'})[0] nprice = float( new_price.find(text=re.compile('[0-9]$')).strip().replace('.', '') ) + float(new_price.findAll('sup')[0].contents[0].strip()) / 100 return oprice, nprice def all_prods_in_url(self, base_url, start_page, start_no_products): page = start_page + 1 max_retries_200 = 1 max_retries_511 = 120 finished_items = False products_to_be_ignored = start_no_products - start_page * 60 while True: products = [] url = base_url + '/p{}/c'.format(page) try: request = requests.get(url) if (request.url != url and page != 1) or finished_items: self.logger.finished_products(url) break items = self.__find_items(request.text) for item in items: if products_to_be_ignored > 0: products_to_be_ignored -= 1 continue prices = self.__extract_prices(item) if (prices[0] is not None and prices[0] < 150.0) or (prices[0] is None and prices[1] < 150): finished_items = True print('finished items: {}'.format(prices)) break products.append( Product(self.__extract_link(item), self.__extract_title(item), *self.__extract_prices(item), self.__extract_prices(item))) _ = (yield page, products) page += 1 except IndexError as e: if max_retries_200 == 0 or max_retries_511 == 0: raise StopIteration() exit(1) break if request.status_code == 200: max_retries_200 -= 1 if request.status_code == 511: while max_retries_511 > 0: print( 'Waiting {} more seconds'.format(max_retries_511)) time.sleep(1) max_retries_511 -= 1 self.logger.failed_to_load_products(url, request.status_code, e)