def parse_pages(start_page, pages, debug, sql_table, creds, proxy_pass):
    import url
    parse_page = url.Url(start_page)
    first_page = html_page.HtmlPage(parse_page.get_url())
    html = first_page.get_html(creds, proxy_pass)
    if html:
        soup = BeautifulSoup(html, 'html.parser')
        # 1st page: collect article numbers and product links
        arts_dict = {}
        for i in soup.findAll('div', class_="j-card-item"):
            art_num = re.search(r'\d+', i.get('data-popup-nm-id'))
            arts_dict[art_num[0]] = i.find('a')['href']
        for art, link in arts_dict.items():
            if not sql_table.table_check_presence(art, creds[6]):
                handbag = bag.Bag()
                if not link.startswith('https'):
                    link = "https://www.wildberries.ru" + link
                handbag.get_bag_page(art, link, debug, creds, proxy_pass)
                sql_table.table_append(handbag)
        sql_table.cnx.commit()

        # after 1st page: if the URL already carries a page key, stop here
        if parse_page.check_key('page'):
            return 0
        parse_page.add_key('page', '1')

        # 2nd page and further, with up to three attempts per page
        for i in range(2, pages + 1):
            parse_page.change_key('page', str(i))
            print(parse_page.get_url())
            have_a_try = 3
            while have_a_try:
                further_page = html_page.HtmlPage(parse_page.get_url())
                arts_dict = further_page.get_wb_page(creds, proxy_pass)
                if arts_dict:
                    for art, link in arts_dict.items():
                        if not sql_table.table_check_presence(art, creds[6]):
                            handbag = bag.Bag()
                            handbag.get_bag_page(art, link, debug, creds, proxy_pass)
                            sql_table.table_append(handbag)
                    sql_table.cnx.commit()
                    break
                else:
                    sql_table.cnx.commit()
                    print(f"Page {i} parse error. Trying again.")
                    have_a_try -= 1
            else:
                # all attempts exhausted for this page
                sql_table.cnx.commit()
                print("No luck. Next page.")
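parse_pages leans on a small url.Url wrapper (get_url, check_key, add_key, change_key) whose definition is not shown in this section. The sketch below is a minimal guess at that helper built on urllib.parse; the method names come from the calls above, everything else is an assumption rather than the project's actual code.

# Minimal sketch of the assumed url.Url helper (not the project's real implementation).
from urllib.parse import urlparse, urlunparse, parse_qs, urlencode

class Url:
    def __init__(self, link):
        self.parts = urlparse(link)
        self.query = parse_qs(self.parts.query)  # query params as {key: [values]}

    def get_url(self):
        # Rebuild the full URL from the (possibly modified) query parameters
        return urlunparse(self.parts._replace(query=urlencode(self.query, doseq=True)))

    def check_key(self, key):
        # True if the query string already contains this parameter
        return key in self.query

    def add_key(self, key, value):
        self.query[key] = [value]

    def change_key(self, key, value):
        # In this sketch, changing a key simply overwrites its value
        self.query[key] = [value]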
def set_bag_fields(self, article, url, debug, creds, proxy_pass):
    h = html_page.HtmlPage(url)
    page = h.get_html(creds, proxy_pass)
    if page:
        soup = BeautifulSoup(page, 'html.parser')
        self.set_article(article)
        self.set_name(soup)
        self.set_image(soup)
        self.set_url(url)
        self.set_material(soup, debug)
        self.set_price(soup, page)
        self.set_price_sale(soup, page)
        self.set_rating(soup)
        self.set_reviews(soup)
        self.set_sold(page)
    return False
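Both set_bag_fields and parse_pages above depend on html_page.HtmlPage.get_html, which is not listed in this section. Below is a rough, assumed sketch of that method using the requests library; the proxy lookup the project presumably performs when proxy_pass is falsy is omitted, and the layout of the creds tuple is not spelled out here.

# Rough sketch of html_page.HtmlPage, assuming requests; not the project's actual code.
import requests

class HtmlPage:
    def __init__(self, url):
        self.url = url

    def get_html(self, creds, proxy_pass):
        # proxy_pass mirrors args.noproxy in the main script: when it is truthy
        # the request goes out directly; otherwise a proxy from the database
        # would be plugged into the `proxies` argument (not shown here).
        # creds is accepted for signature compatibility but unused in this sketch.
        headers = {'User-Agent': 'Mozilla/5.0'}
        try:
            response = requests.get(self.url, headers=headers, timeout=30)
        except requests.RequestException:
            return None
        return response.text if response.status_code == 200 else None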
mysql_table.table_make()
if args.update or args.https:
    clear_table = True
    h = proxy.Proxy('http', http_url)
    s = proxy.Proxy('https', https_url)
    len_table = h.form_table(clear_table)
    if args.https:
        print(f"{len_table} proxies in the database.")
        clear_table = False
        time.sleep(60)
        len_table += s.form_table(clear_table)
    print(f"{len_table} proxies in the database.")

link = url.Url(args.source)
main_page = html_page.HtmlPage(link.get_url())
main_html = main_page.get_html(cred_tuple, args.noproxy)
if main_html and not args.material:
    if link.check_key('page'):
        parse_pages(link.get_url(), 1, args.debug, mysql_table, cred_tuple, args.noproxy)
    else:
        main_soup = BeautifulSoup(main_html, 'html.parser')
        try:
            items = main_soup.find('span', class_="total many").find('span').text
        except AttributeError:
            print("Bad first page. Try to run again.")
            sys.exit(0)
        print(f"{items} items")
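The fragment above reads several command-line flags (args.update, args.https, args.source, args.noproxy, args.debug, args.material) whose definitions live outside this section. A minimal argparse setup consistent with those reads might look like the sketch below; the help texts, and any flag semantics beyond what the code above implies, are assumptions.

# Assumed argparse setup matching the flags used above (help texts are guesses).
import argparse

parser = argparse.ArgumentParser(description='Wildberries catalogue parser')
parser.add_argument('source', help='catalogue URL to start parsing from')
parser.add_argument('--update', action='store_true', help='rebuild the HTTP proxy table')
parser.add_argument('--https', action='store_true', help='also load HTTPS proxies')
parser.add_argument('--noproxy', action='store_true', help='fetch pages without going through a proxy')
parser.add_argument('--debug', action='store_true', help='verbose debug output')
parser.add_argument('--material', action='store_true', help='materials-only mode, skips page parsing')
args = parser.parse_args()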