Example #1
import re

from bs4 import BeautifulSoup

# bag, html_page and url are project-local modules used below.
import bag
import html_page
import url


def parse_pages(start_page, pages, debug, sql_table, creds, proxy_pass):
    parse_page = url.Url(start_page)
    first_page = html_page.HtmlPage(parse_page.get_url())
    html = first_page.get_html(creds, proxy_pass)

    if html:
        soup = BeautifulSoup(html, 'html.parser')

        # 1st page
        arts_dict = {}
        for i in soup.findAll('div', class_="j-card-item"):
            art_num = re.search(r'\d+', i.get('data-popup-nm-id', ''))
            if art_num:  # skip cards without a numeric article id
                arts_dict[art_num[0]] = i.find('a')['href']
        for art, link in arts_dict.items():
            if not sql_table.table_check_presence(art, creds[6]):
                handbag = bag.Bag()
                if not link.startswith('https'):
                    link = "https://www.wildberries.ru" + link
                handbag.get_bag_page(art, link, debug, creds, proxy_pass)
                sql_table.table_append(handbag)
        sql_table.cnx.commit()

        # If the start URL already carries a 'page' key, only that page is parsed.
        if parse_page.check_key('page'):
            return 0
        parse_page.add_key('page', '1')

        # 2nd page and further: retry each page up to 3 times.
        # The while/else runs the else branch only when all attempts fail.
        for i in range(2, pages + 1):
            parse_page.change_key('page', str(i))
            print(parse_page.get_url())
            have_a_try = 3
            while have_a_try:
                further_page = html_page.HtmlPage(parse_page.get_url())
                arts_dict = further_page.get_wb_page(creds, proxy_pass)
                if arts_dict:
                    for art, link in arts_dict.items():
                        if not sql_table.table_check_presence(art, creds[6]):
                            handbag = bag.Bag()
                            handbag.get_bag_page(art, link, debug, creds,
                                                 proxy_pass)
                            sql_table.table_append(handbag)
                    sql_table.cnx.commit()
                    break
                sql_table.cnx.commit()
                print(f"Page {i} parse error. Trying again.")
                have_a_try -= 1
            else:
                print("No luck. Next page.")
Example #2
def set_bag_fields(self, article, url, debug, creds, proxy_pass):
    h = html_page.HtmlPage(url)
    page = h.get_html(creds, proxy_pass)
    if not page:
        return False
    soup = BeautifulSoup(page, 'html.parser')
    self.set_article(article)
    self.set_name(soup)
    self.set_image(soup)
    self.set_url(url)
    self.set_material(soup, debug)
    self.set_price(soup, page)
    self.set_price_sale(soup, page)
    self.set_rating(soup)
    self.set_reviews(soup)
    self.set_sold(page)
    return True  # was an unconditional False, so callers could never detect success
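
With the return value fixed, a caller can persist a bag only when the page actually parsed. A minimal sketch, reusing the (assumed) names from Example #1:

handbag = bag.Bag()
if handbag.set_bag_fields(art, link, debug, creds, proxy_pass):
    sql_table.table_append(handbag)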
Example #3
        mysql_table.table_make()

    if args.update or args.https:
        clear_table = True
        h = proxy.Proxy('http', http_url)
        s = proxy.Proxy('https', https_url)
        len_table = h.form_table(clear_table)
        if args.https:
            print(f"В базе {len_table} прокси.")
            clear_table = False
            time.sleep(60)
            len_table += s.form_table(clear_table)
        print(f"В базе {len_table} прокси.")

    link = url.Url(args.source)
    main_page = html_page.HtmlPage(link.get_url())
    main_html = main_page.get_html(cred_tuple, args.noproxy)

    if main_html and not args.material:
        if link.check_key('page'):
            parse_pages(link.get_url(), 1, args.debug, mysql_table, cred_tuple,
                        args.noproxy)
        else:
            main_soup = BeautifulSoup(main_html, 'html.parser')
            try:
                items = main_soup.find('span',
                                       class_="total many").find('span').text
            except AttributeError:
                print("Bad first page. Try running again.")
                sys.exit(1)  # nonzero exit: the page could not be parsed
            print(f"{items} товаров")