Example 1
    def extract_products(self):

        path = Global().path_chromedriver  # insert the path to chromedriver here

        path_sfb = os.path.join(Global().base_dir, 'description', 'urls.csv')
        sfb_df = pd.read_csv(path_sfb, sep=';', index_col='id')

        #keywords = df_desc.loc[['7', '18', '21']]['Ключевы слова, которые должны присутствовать'].values
        urls = sfb_df.fillna('')[sfb_df['URL'].fillna('').str.contains(
            'piluli')]['URL'].values
        ids = sfb_df.fillna('')[sfb_df['URL'].fillna('').str.contains(
            'piluli')].index.astype(int)

        category_titles = sfb_df.fillna('')[sfb_df['URL'].fillna(
            '').str.contains('piluli')]['cat_title'].values

        # start parsing
        res = pd.DataFrame(columns=[
            'date', 'type', 'category_id', 'category_title', 'site_title',
            'price_new', 'price_old', 'site_unit', 'site_link', 'site_code'
        ])

        # options = webdriver.ChromeOptions()
        #options.add_argument('--headless')
        #options.add_argument('--disable-gpu')

        driver = webdriver.Chrome(executable_path=path,
                                  chrome_options=Global().chrome_options)

        for index, link in enumerate(urls):
            price_dict = dict()
            print(link)
            driver.get(link)
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            price_dict['category_id'] = ids[index]
            price_dict['date'] = Global().date
            price_dict['site_code'] = 'piluli'
            price_dict['site_unit'] = 'шт.'
            price_dict['type'] = 'non-food'
            price_dict['category_title'] = category_titles[index]
            price_dict['site_link'] = link
            price_dict['site_title'] = soup.find('h1', {
                'id': 'offer-title'
            }).text
            price_dict['price_new'] = int(
                soup.find('span', {
                    'id': 'products_price'
                }).text)
            old_price_span = soup.find('span', {'class': 'old-price'})
            price_dict['price_old'] = int(
                old_price_span.text) if old_price_span is not None and old_price_span.text != '\n' else ''

            res = res.append(price_dict, ignore_index=True)

        driver.quit()

        return res
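Note: several of these snippets pass Global().chrome_options to webdriver.Chrome without showing how it is built. A minimal sketch of what such an options object could contain, assuming the headless flags hinted at in the commented-out lines above (the real Global().chrome_options may differ):

# Hypothetical helper; not part of the original Global() class.
from selenium import webdriver

def build_chrome_options():
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')        # run Chrome without a visible window
    options.add_argument('--disable-gpu')     # recommended for headless mode on some platforms
    options.add_argument('--window-size=1920,1080')
    return options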
Example 2
    def get_proxy(self):  # optional, in case a proxy is needed
        success = False
        while True:
            driver = webdriver.Chrome(
                executable_path=Global().path_chromedriver)
            driver.get(
                "https://hidemyna.me/ru/proxy-list/?maxtime=300&ports=3128..")
            while True:
                time.sleep(1)  # avoid busy-polling the page source
                if "maxtime" in driver.page_source:
                    ip_list = re.findall(
                        r'\d{2,3}[.]\d{2,3}[.]\d{2,3}[.]\d{2,3}',
                        driver.page_source)
                    break
            driver.quit()

            for it in range(5):
                print('it =', it)
                proxy = random.choice(ip_list[1:]) + ":3128"
                success = False

                driver = webdriver.Chrome(
                    executable_path=Global().path_chromedriver,
                    chrome_options=Global().chrome_options)
                driver.get("https://ozon.ru")
                try:
                    WebDriverWait(driver, 10).until(
                        EC.presence_of_element_located((By.ID, "__nuxt")))
                    success = True
                except Exception:
                    # the proxy did not let the page load in time
                    success = False
                finally:
                    driver.quit()
                if success:
                    break

            if success:
                break
        print('good proxy: {}'.format(proxy))
        driver.quit()
        return proxy
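Note: get_proxy() returns a bare "ip:port" string, while the requests-based snippets below pass a proxies mapping. A small assumed adapter (the helper name is hypothetical; the project may build this dict elsewhere):

# Hypothetical adapter for requests-style proxy dicts.
def to_requests_proxies(proxy):
    return {
        'http': 'http://{}'.format(proxy),
        'https': 'http://{}'.format(proxy),
    }

# usage sketch:
# proxies = to_requests_proxies(self.get_proxy())
# r = requests.get(url, proxies=proxies, timeout=10)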
Example 3
    def extract_product_page(self):
        site_code = 'mvideo'
        ua = UserAgent()
        header = {'User-Agent': str(ua.chrome)}
        desc_df = Global().desc_df
        links_df = Global().links.replace(np.nan, '')
        links_df = links_df[links_df['site_link'].str.contains(site_code)]
        # print(links_df.head())
        category_ids = links_df.category_id.unique()
        res = pd.DataFrame(columns=['date', 'type', 'category_id', 'category_title',
                                    'site_title', 'price_new', 'price_old', 'site_unit',
                                    'site_link', 'site_code'])

        # proxies = get_proxy('https://www.utkonos.ru/')
        for cat_id in tqdm(category_ids):  # fixed
            url_list = links_df[links_df.category_id == cat_id].site_link.values

            category_title = desc_df.loc[cat_id, 'cat_title']

            print("{}... ".format(category_title))

            # print(' id_n =', id_n)
            i = 0

            while i + 1 <= len(url_list):

                href_i = url_list[i]
                i += 1
                page = 0
                print(href_i)

                r = requests.get(href_i, headers=header)
                html = r.content

                soup = BeautifulSoup(html, 'html.parser')
                # print('soup:\n', soup)
                price_dict = dict()

                price_dict['date'] = Global().date
                price_dict['site_code'] = site_code
                price_dict['category_id'] = int(cat_id)
                price_dict['category_title'] = category_title

                price_dict['site_title'] = wspex_space(
                    soup.find('h1', {'class': 'e-h1 sel-product-title'}).text)
                price_dict['site_link'] = href_i
                # print(price_dict['site_link'])

                # if filter_flag(id_n, price_dict['site_title']) == False:
                # print("   skipped position: {}".format(price_dict['site_title']))
                # continue

                div_sale = soup.find('div', {'class': 'c-pdp-price__old'})
                # print('div_sale:', div_sale)
                if div_sale is not None and div_sale.text != '':
                    # print('div_sale: ',div_sale)
                    price_dict['price_old'] = float(re.match('\d+', wspex(div_sale.text))[0])
                else:
                    price_dict['price_old'] = ''

                div_new = soup.find('div', {'class': 'c-pdp-price__current sel-product-tile-price'})
                price_dict['price_new'] = float(re.match('\d+', wspex(div_new.text))[0])
                price_dict['site_unit'] = 'шт.'
                print('site_title: {}\nprice_new: {}\nprice_old: {}\nunit: {}\n'.format(price_dict['site_title'],
                                                                                        price_dict['price_new'],
                                                                                        price_dict['price_old'],
                                                                                        price_dict['site_unit']))
                # print(price_dict)
                price_dict['type'] = 'non-food'
                res = res.append(price_dict, ignore_index=True)

        print('Mvideo has been parsed successfully')
        return res
Example 4
    def extract_products(self):
        start_time = datetime.now()

        path_sfb = os.path.join(Global().base_dir, 'description', 'urls.csv')
        sfb_df = pd.read_csv(path_sfb, sep=';', index_col='id')
        hrefs = sfb_df[sfb_df.fillna('')['URL'].str.contains(
            'globus')]['URL'].values
        id_n = 0
        res = pd.DataFrame(columns=[
            'date', 'type', 'category_id', 'category_title', 'site_title',
            'price_new', 'price_old', 'site_unit', 'site_link', 'site_code'
        ])
        proxies = get_proxy(hrefs[0])

        header = UserAgent().chrome
        for href in tqdm(hrefs):  # fixed
            id_n += 1
            category_title = sfb_df[sfb_df.fillna('')['URL'].str.contains(
                'globus')]['cat_title'].iloc[id_n - 1]

            print("{}... ".format(category_title))

            # print(' id_n =', id_n)
            url_list = list_html(href)
            i = 0

            while i + 1 <= len(url_list):
                url = url_list[i]

                i += 1
                it_error = 0

                page = 0
                while True:
                    page += 1

                    href_i = self.construct_html(url, page)
                    # print('loading {} ...'.format(href_i))
                    try:
                        clever_sleep()
                        if proxies is not None:
                            r = requests.get(href_i,
                                             proxies=proxies,
                                             headers=header,
                                             timeout=10)
                        else:
                            r = requests.get(href_i,
                                             headers=header,
                                             timeout=10)
                    except requests.RequestException:
                        r = None
                        while r is None or r.status_code != 200:
                            proxies = get_proxy(href_i)
                            time.sleep(3)
                            r = requests.get(href_i,
                                             proxies=proxies,
                                             headers=header,
                                             timeout=10)
                    html = r.content
                    soup = BeautifulSoup(html, 'lxml')
                    products_div = soup.find('div',
                                             {'class': 'catalog-section'})
                    if not products_div:
                        print('WARNING! {} has not product_div'.format(href_i))
                        it_error += 1
                        if it_error > 5:
                            break
                        else:
                            continue
                    amount_div = soup.find('div', {'class': 'catalog-content'})
                    total_amount = int(
                        '0' +
                        amount_div.find('h1').find('sub').text.split(' ')[0])
                    price_list = products_div.find_all(
                        'div', {'class': 'catalog-section__item__body trans'})

                    if page * 64 >= total_amount:
                        flag_nextpage = False
                    else:
                        flag_nextpage = True

                    for price_elem in price_list:
                        price_dict = dict()
                        price_dict['date'] = Global().date
                        price_dict['site_code'] = 'globus'
                        price_dict['category_id'] = id_n
                        price_dict['category_title'] = category_title  # same value as computed above
                        price_dict['type'] = 'food'
                        price_dict['site_title'] = price_elem.find(
                            'span', {
                                'class': 'catalog-section__item__title'
                            }).text

                        # print('category_title: {}\nsite_title: {}'.format(price_dict['category_title'],price_dict['site_title']))
                        if not filter_flag(id_n, price_dict['site_title']):
                            # print("skipped position: {}".format(price_dict['site_title']))
                            continue

                        price_text_rub_div = price_elem.find(
                            'span', {'class': 'item-price__rub'})
                        price_text_kop_div = price_elem.find(
                            'span', {'class': 'item-price__kop'})
                        price_text_old_div = price_elem.find(
                            'span', {'class': 'item-price__old'})

                        if not price_text_rub_div or not price_text_kop_div:
                            continue

                        try:
                            price_dict['price_new'] = int(price_text_rub_div.text.replace(" ", "")) + \
                                                      0.01 * int(price_text_kop_div.text)
                        except:
                            price_dict['price_new'] = int(price_text_rub_div.text.replace("\xa0", "")) + \
                                                      0.01 * int(price_text_kop_div.text)

                        if price_text_old_div:
                            list_ = re.findall(
                                r'\s+', wspex_space(price_text_old_div.text))

                            if len(list_) == 2:
                                price_text = re.sub(
                                    r'\s+',
                                    '',
                                    wspex_space(price_text_old_div.text),
                                    count=1)
                                price_text = re.sub(r'\s+', '.', price_text)

                            else:
                                price_text = re.sub(
                                    r'\s+', '.',
                                    wspex_space(price_text_old_div.text))

                            price_dict['price_old'] = float(price_text)
                        else:
                            price_dict['price_old'] = ''

                        price_dict['site_unit'] = price_elem.find(
                            'span', {
                                'class':
                                'item-price__additional item-price__additional--solo'
                            }).text.strip()
                        price_dict['site_link'] = price_elem.find(
                            'a', {
                                'class':
                                'catalog-section__item__link catalog-section__item__link--one-line notrans'
                            }).get('href')
                        res = res.append(price_dict, ignore_index=True)

                    if not flag_nextpage:
                        break

        end_time = datetime.now()
        time_execution = str(end_time - start_time)
        print('GLOBUS has been parsed successfully\ntotal time of execution: {}'.
              format(time_execution))
        return res
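Note: clever_sleep() is an external helper whose implementation is not shown in these snippets. A plausible minimal stand-in, assuming it only adds a randomized delay between requests:

import random
import time

# Hypothetical stand-in for the project's clever_sleep(); the real helper may differ.
def clever_sleep(lo=1.0, hi=3.0):
    time.sleep(random.uniform(lo, hi))  # randomized pause to reduce the chance of rate limiting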
Example 5
    def extract_product_page(self):
        site_code = 'globus'
        desc_df = Global().desc_df
        links_df = Global().links
        links_df = links_df[links_df['site_link'].str.contains(site_code)]
        ua = UserAgent()
        header = {'User-Agent': str(ua.chrome)}
        if Global().max_links is not None:
            links_df = links_df.iloc[:Global().max_links]
        category_ids = links_df.category_id.unique()
        res = pd.DataFrame(columns=[
            'date', 'type', 'category_id', 'category_title', 'site_title',
            'price_new', 'price_old', 'site_unit', 'site_link', 'site_code'
        ])
        proxies = get_proxy('https://online.globus.ru/')

        for cat_id in tqdm(category_ids):  # fixed
            url_list = links_df[links_df.category_id ==
                                cat_id].site_link.values

            category_title = desc_df.loc[cat_id, 'cat_title']

            print("{}... ".format(category_title))

            # print(' id_n =', id_n)
            i = 0

            while i + 1 <= len(url_list):
                url = url_list[i]

                i += 1

                print('{} ...'.format(url))
                try:
                    # time.sleep(3)
                    if proxies is not None:
                        r = requests.get(url,
                                         proxies=proxies,
                                         headers=header,
                                         timeout=10)  # CRITICAL
                    else:
                        r = requests.get(url, headers=header, timeout=10)
                except:
                    while True:
                        try:
                            proxies = get_proxy(url)
                            time.sleep(3)
                            r = requests.get(url,
                                             proxies=proxies,
                                             headers=header)
                            if r.status_code == 200:
                                break
                        except:
                            continue
                html = r.content
                soup = BeautifulSoup(html, 'lxml')
                products_div = soup.find(
                    'div', {'class': 'item-card__content--right'})

                price_dict = dict()
                price_dict['date'] = Global().date
                price_dict['site_code'] = site_code
                price_dict['category_id'] = cat_id
                price_dict['category_title'] = category_title
                price_dict['type'] = 'food'
                try:
                    price_dict['site_title'] = wspex_space(
                        products_div.find('h1', {
                            'class': 'js-with-nbsp-after-digit'
                        }).text)
                except:
                    # print('OOPS! {} has not been parsed'.format(url))
                    continue

                # if filter_flag(id_n, price_dict['site_title']) == False:
                # print("skipped position: {}".format(price_dict['site_title']))
                # continue
                price_div = products_div.find('span', {'class': 'item-price'})

                price_text_rub_div = price_div.find(
                    'span', {'class': 'item-price__rub'})
                price_text_kop_div = price_div.find(
                    'span', {'class': 'item-price__kop'})
                price_text_old_div = price_div.find(
                    'span', {'class': 'item-price__old'})

                if not price_text_rub_div or not price_text_kop_div:
                    continue

                try:
                    price_dict['price_new'] = int(price_text_rub_div.text.replace(" ", "")) + \
                                              0.01 * int(price_text_kop_div.text)
                except:
                    price_dict['price_new'] = int(price_text_rub_div.text.replace("\xa0", "")) + \
                                              0.01 * int(price_text_kop_div.text)

                if price_text_old_div:
                    list_ = re.findall('\s+',
                                       wspex_space(price_text_old_div.text))

                    if len(list_) == 2:
                        price_text = re.sub('\s+',
                                            '',
                                            wspex_space(
                                                price_text_old_div.text),
                                            count=1)
                        price_text = re.sub('\s+', '.', price_text)

                    else:
                        price_text = re.sub(
                            '\s+', '.', wspex_space(price_text_old_div.text))

                    price_dict['price_old'] = float(price_text)

                else:
                    price_dict['price_old'] = ''

                price_dict['site_unit'] = products_div.find(
                    'span', {
                        'class': 'item-price__unit'
                    }).text.strip()
                price_dict['site_link'] = url
                print(
                    'site_title: {}\nprice_new: {}\nprice_old: {}\nunit: {}\n'.
                    format(price_dict['site_title'], price_dict['price_new'],
                           price_dict['price_old'], price_dict['site_unit']))

                res = res.append(price_dict, ignore_index=True)

        print('GLOBUS has been parsed successfully')
        return res
Example 6
    def extract_products(self):
        start_time = datetime.now()
        res = pd.DataFrame(columns=[
            'date', 'type', 'category_id', 'category_title', 'site_title',
            'price_new', 'price_old', 'site_unit', 'site_link', 'site_code'
        ])

        path_sfb = os.path.join(Global().base_dir, 'description', 'urls.csv')
        sfb_df = pd.read_csv(path_sfb, sep=';', index_col='id')
        hrefs = sfb_df[sfb_df.fillna('')['URL'].str.contains(
            'okeydostavka')]['URL'].values
        hrefs = [href for href in hrefs if type(href) is not float]
        id_n = 0
        # proxies = get_proxy('https://www.okeydostavka.ru/')

        for href in tqdm(hrefs):

            page = 0
            max_page_index = 1
            i = 0
            id_n += 1
            category_title = sfb_df[sfb_df.fillna('')['URL'].str.contains(
                'okey')]['cat_title'].iloc[id_n - 1]
            print("{}...".format(category_title))
            while True:
                url_full = self.construct_html(href, i)
                cookie = \
                    r"_ga=GA1.2.1743913103.1529597174; _ym_uid=1529597174997115265; _gac_UA-58508147-1=1.1529607077.EAIaIQobChMItoj" + \
                    r"f2rLl2wIVjIeyCh2stAAuEAAYASAAEgLCdvD_BwE; _gid=GA1.2.654182099.1529924428; _ym_d=1529924428; _ym_isad=1; _ym_" + \
                    r"visorc_27891822=w; storeGroup=msk1; ffcId=13151; WC_SESSION" + \
                    r"_ESTABLISHED=true; WC_PERSISTENT=3EJGXVtLqH2nPYh%2FBwXZCgqDdro%3D%0A%3B2018-06-26+21%3A22%3A20.903_1530037336" + \
                    r"387-297473_10151; WC_AUTHENTICATION_-1002=-1002%2CshqcDFo2KYvSQjMlws143PZaUdk%3D; WC_ACTIVEPOINTER=-20%2C10151;" + \
                    r"WC_GENERIC_ACTIVITYDATA=[876474606%3Atrue%3Afalse%3A0%3ACLFoHnycXg06Qmg4qmgtx7v6u%2Bc%3D][com.ibm.commerce" + \
                    r".context.audit.AuditContext|1530037336387-297473][com.ibm.commerce.store.facade.server.context.StoreGeoCodeContext" + \
                    r"|null%26null%26null%26null%26null%26null][CTXSETNAME|Store][com.ibm.commerce.context.globalization.Globalization" + \
                    r"Context|-20%26RUB%26-20%26RUB][com.ibm.commerce.catalog.businesscontext.CatalogContext|12051%26null%26false%26false" + \
                    r"%26false][com.ibm.commerce.context.ExternalCartContext|null][com.ibm.commerce.context.base.BaseContext|10151%26-" + \
                    r"1002%26-1002%26-1][com.ibm.commerce.context.experiment.ExperimentContext|null][com.ibm.commerce.context.entitlement" + \
                    r".EntitlementContext|4000000000000000003%264000000000000000003%26null%26-2000%26null%26null%26null][com.ibm." + \
                    r"commerce.giftcenter.context.GiftCenterContext|null%26null%26null]; isNative=1; searchTermHistory=%7C%D1%81%D0%" + \
                    r"BC%D0%B5%D1%82%D0%B0%D0%BD%D0%B0; gtmListKey=GTM_LIST_SEARCH; tmr_detect=1%7C1530037350771"

                headers = {
                    'User-Agent':
                    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36',
                    'Cookie': cookie,
                    'Accept':
                    'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
                    'Accept-Encoding': 'gzip, deflate, br',
                    'Accept-Language': 'en-US,en;q=0.9,ru;q=0.8',
                    'Cache-Control': 'max-age=0'
                }

                # print('loading url', url_full)

                try:
                    r = requests.get(url_full, headers=headers)  # CRITICAL
                    clever_sleep()
                except requests.RequestException:
                    r = None
                    while r is None or r.status_code != 200:
                        proxies = get_proxy(url_full)
                        time.sleep(3)
                        r = requests.get(url_full,
                                         proxies=proxies,
                                         headers=headers)
                html = r.content

                soup = BeautifulSoup(html, 'lxml')

                products_div = soup.find(
                    'div', {'class': 'product_listing_container'})
                if not products_div:
                    continue
                pages_controller_div = soup.find(
                    'div', {'class': 'pages pageControlMenu'})
                if not pages_controller_div:
                    flag_nextpage = False
                else:
                    pages_refs = pages_controller_div.find_all(
                        'a', {'class': 'hoverover'})
                    page += 1
                    for ref in pages_refs:
                        page_index = int(ref.text.strip())
                        if page_index > max_page_index:
                            max_page_index = page_index
                    if max_page_index > page:
                        flag_nextpage = True
                    else:
                        flag_nextpage = False
                price_list = products_div.find_all(
                    'div', {'class': 'product ok-theme'})

                i += len(price_list)

                for price_elem in price_list:
                    price_dict = dict()
                    price_dict['date'] = Global().date
                    price_dict['site_code'] = 'okey'
                    price_dict['category_id'] = id_n
                    price_dict['category_title'] = category_title
                    product_unavailable_div = price_elem.find(
                        'div', {'class': 'product-unavailable-text'})
                    if product_unavailable_div:
                        continue

                    aref = price_elem.find('a')

                    price_dict['site_title'] = aref.get('title')

                    if not filter_flag(id_n, price_dict['site_title']):
                        # print("skipped position: {}".format(price_dict['site_title']))
                        continue

                    product_price_script = price_elem.find(
                        'script', {'id': 'productData_'})
                    script_text = product_price_script.text
                    sr = re.search('var\s+product\s*=\s*(?P<dct>.+\});\s*$\s*',
                                   script_text, re.MULTILINE)
                    dct_str = sr.group('dct')
                    dct = demjson.decode(dct_str)  # yaml and json fails here
                    price_dict['price_new'] = dct[
                        'price']  # shows the price, the product title and a link to it
                    sale_div = price_elem.find(
                        'span', {'class': 'label small crossed'})
                    if sale_div:
                        list_price = re.search('\d+\,\d+', sale_div.text)
                        price_dict['price_old'] = tofloat(list_price[0])
                    else:
                        price_dict['price_old'] = ''
                    weight_div = price_elem.find('div',
                                                 {'class': 'product_weight'})
                    if weight_div:
                        price_dict['site_unit'] = wspex_space(weight_div.text)
                    else:
                        quantity_div = price_elem.find(
                            'div', {'class': 'quantity_section'})
                        if quantity_div:
                            price_dict['site_unit'] = '1 уп.'
                        else:
                            print('[okey] For product',
                                  price_dict['site_title'],
                                  ' weight not found!')
                    price_dict['site_link'] = aref.get(
                        'href')  # shows the product title and a link to it
                    price_dict['type'] = 'food'
                    res = res.append(price_dict, ignore_index=True)
                if not flag_nextpage:
                    break

        end_time = datetime.now()
        time_execution = str(end_time - start_time)
        print(
            'OKEY has been parsed successfully\ntotal time of execution: {}'.format(
                time_execution))
        return res
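Note: the snippet above pulls the product data out of an inline productData_ script with demjson, because the embedded value is a JavaScript object literal rather than strict JSON. A tiny self-contained illustration of that step (the sample script text is made up):

import re
import demjson  # third-party parser that accepts relaxed, JS-style object literals

# made-up inline script text in the same shape as the real productData_ block
script_text = "var product = {id: 12345, price: 249.9, name: 'Сметана 20%'};"
sr = re.search(r'var\s+product\s*=\s*(?P<dct>.+\});\s*$\s*', script_text, re.MULTILINE)
dct = demjson.decode(sr.group('dct'))  # json.loads would reject the unquoted keys
print(dct['price'])  # -> 249.9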
Example 7
    def extract_product_page(self):
        site_code = 'okey'
        desc_df = Global().desc_df
        links_df = Global().links
        links_df = links_df[links_df['site_link'].str.contains(site_code)]
        if Global().max_links is not None:
            links_df = links_df.iloc[:Global().max_links]

        if Global().is_selenium_okey:
            path = Global().path_chromedriver
            # options = webdriver.ChromeOptions()
            driver = webdriver.Chrome(executable_path=path,
                                      chrome_options=Global().chrome_options)

        category_ids = links_df.category_id.unique()
        res = pd.DataFrame(columns=[
            'date', 'type', 'category_id', 'category_title', 'site_title',
            'price_new', 'price_old', 'site_unit', 'site_link', 'site_code'
        ])
        # proxies = get_proxy('https://okeydostavka.ru/')
        proxies = None
        ua = UserAgent()
        for cat_id in tqdm(category_ids):  # fixed
            url_list = links_df[links_df.category_id ==
                                cat_id].site_link.values

            category_title = desc_df.loc[cat_id, 'cat_title']

            print("{}... ".format(category_title))
            n_err = 0
            # print(' id_n =', id_n)
            i = 0

            while i + 1 <= len(url_list):

                href_i = url_list[i]
                print(href_i)
                i += 1
                # if i % 10 == 0 and i != 0:
                # proxies = get_proxy(href_i)
                cookie = r'_ga=GA1.2.1325218443.1577886613; gtmListKey=GTM_LIST_RECOMENDATIONS; _ym_' + \
                         r'uid=15778866221036907447; _ym_d=1577886622; isNative=1; selectedCity=%D0%9C%D0%' + \
                         r'BE%D1%81%D0%BA%D0%B2%D0%B0; selectedStore=10151_13151; acceptCookie=1; storeGroup=msk1;' + \
                         r'ffcId=13151; WC_SESSION_ESTABLISHED=true; WC_AUTHENTICATION_-1002=-1002%2CzZHlyRjQcgW' + \
                         r'KqNcfDjyX4iZ02zjcQoyDurbFiQxFNVk%3D; WC_ACTIVEPOINTER=-20%2C10151; WC_USERACTIVITY_-1' + \
                         r'002=-1002%2C10151%2Cnull%2Cnull%2Cnull%2Cnull%2Cnull%2Cnull%2Cnull%2Cnull%2C1877362032%2C' + \
                         r'ver_null%2CDg2tDaIGqtvlUd7GeVDIZu1DtkcjFvj1SdTgnMiPwCmRMdhqBYKQ9oMgiku72VhoL3OKnTP2aV5k8V' + \
                         r'zF6ztiaJ508J0SZkHyBJdFQodkOMqqwSEr%2Bg%2B0C1rETa4auryIDSq4FP7c1urrNfoJqDzAkdVBlG8NuO0KAfb' + \
                         r'PocosaJL1o7xK78QvuQz25bWv8w%2BzRoaWagOu7%2BQUD%2B%2FGPrl94xaDOHhYYdgsXrofcc04xzx0c%2BlK6F' + \
                         r'FHANLAGseWFGCm; WC_GENERIC_ACTIVITYDATA=[1996034293%3Atrue%3Afalse%3A0%3AaSne5YGZoxA4Mpz2' + \
                         r'j8qE86%2FndHXVreuwTKmYZIVqRY4%3D][com.ibm.commerce.context.entitlement.EntitlementContext' + \
                         r'|4000000000000000003%264000000000000000003%26null%26-2000%26null%26null%26null][com.ibm' + \
                         r'.commerce.context.audit.AuditContext|null][com.ibm.commerce.context.globalization.Global' + \
                         r'izationContext|-20%26RUB%26-20%26RUB][com.ibm.commerce.store.facade.server.context.StoreG' + \
                         r'eoCodeContext|null%26null%26null%26null%26null%26null][com.ibm.commerce.catalog.businessc' + \
                         r'ontext.CatalogContext|12051%26null%26false%26false%26false][com.ibm.commerce.context.exp' + \
                         r'eriment.ExperimentContext|null][com.ibm.commerce.context.ExternalCartContext|null][com.ib' + \
                         r'm.commerce.context.bcsversion.BusinessContextVersionContext|null][CTXSETNAME|Store][com.ib' + \
                         r'm.commerce.context.base.BaseContext|10151%26-1002%26-1002%26-1][com.ibm.commerce.giftcenter.context.GiftCenterContext|null%26null%26null]; solarfri=6a3c99192124a2fe; _gid=GA1.2.311834681.1579169412; _ym_isad=1; JSESSIONID=0000LPiEiWXPfA6ejMPrOUxMf90:-1; _gat_UA-58508147-1=1; _ym_visorc_27891822=w'

                headers = {
                    'Accept':
                    'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
                    'Accept-Encoding': 'gzip, deflate, br',
                    'Accept-Language': 'ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7',
                    'Cache-Control': 'max-age=0',
                    'Connection': 'keep-alive',
                    'Cookie': cookie,
                    'Host': 'www.okeydostavka.ru',
                    'Sec-Fetch-Mode': 'navigate',
                    'Sec-Fetch-Site': 'none',
                    'Sec-Fetch-User': '?1',
                    'Upgrade-Insecure-Requests': '1',
                    'User-Agent': str(ua.chrome),
                }
                if Global().is_selenium_okey:
                    driver.get(href_i)
                    soup = BeautifulSoup(driver.page_source, 'html.parser')
                    # driver.close()
                else:
                    try:
                        # clever_sleep()
                        if proxies is not None:
                            r = requests.get(href_i,
                                             proxies=proxies,
                                             headers=headers,
                                             timeout=60)  # CRITICAL
                        else:
                            r = requests.get(href_i,
                                             headers=headers,
                                             timeout=60)
                    except Exception as e:
                        print(str(e) + '!')
                        while True:
                            try:
                                proxies = get_proxy(href_i)
                                time.sleep(3)
                                r = requests.get(href_i,
                                                 headers=headers,
                                                 proxies=proxies,
                                                 timeout=60)
                                if r.status_code == 200:
                                    break
                            except Exception as e:
                                print(str(e) + '!')
                                continue

                    html = r.content
                    soup = BeautifulSoup(html, 'lxml')
                # print('url: ', href_i)
                # print(soup)
                products_div = soup.find(
                    'div', {
                        'class':
                        re.compile('col-8\s+col-lg-7\s+col-md-6\s+'
                                   'col-sm-12\s+product-information')
                    })  #col4 product-information
                # if soup.find('ul', {'class': 'categoryList catalog-menu__category-list'}) is not None:
                # print('yes, catalog is here!')
                # else:
                # print('no')
                # print(products_div)
                if products_div is None:
                    print('no products_div!')

                    # proxies = get_proxy('https://okeydostavka.ru/')
                    if soup.find(
                            'ul',
                        {'class': 'categoryList catalog-menu__category-list'
                         }) is None:

                        print('OOPS, it seems that we have been blocked!')
                        print(soup.text)
                        i -= 1
                        proxies = get_proxy('https://okeydostavka.ru/')

                    continue

                price_dict = dict()
                price_dict['date'] = Global().date
                price_dict['site_code'] = 'okey'
                price_dict['category_id'] = cat_id
                price_dict['category_title'] = category_title

                price_dict['site_title'] = wspex_space(
                    products_div.find('h1', {
                        'class': 'main_header'
                    }).text)
                # print('site_title:{}\nurl:{}\n\n'.format(price_dict['site_title'],href_i))

                # if filter_flag(id_n, price_dict['site_title']) == False:
                # print("skipped position: {}".format(price_dict['site_title']))
                # continue

                if re.search('price\s+label\s+label-red\s*',
                             products_div.text) is not None:
                    print(href_i, 'has sale!')
                try:
                    if products_div.find(
                            'span',
                        {'class': re.compile('price\s+label\s+label-red\s*')
                         }) is not None:
                        price_new_div = wspex(
                            products_div.find(
                                'span', {
                                    'class':
                                    re.compile('price\s+label\s+label-red\s*')
                                }).text)
                        sale_div = products_div.find(
                            'span', {'class': 'label small crossed'})
                        price_dict['price_new'] = float(
                            re.search('\d+\,\d+',
                                      price_new_div)[0].replace(',', '.'))
                        price_dict['price_old'] = float(
                            re.search('\d+\,\d+',
                                      sale_div.text)[0].replace(',', '.'))
                    else:
                        price_dict['price_new'] = products_div.find(
                            'span', {'class': re.compile('price\s+label\s*')}
                        )  # shows the price, the product title and a link to it
                        price_dict['price_new'] = float(
                            re.search('\d+\,\d+',
                                      price_dict['price_new'].text)[0].replace(
                                          ',', '.'))
                        price_dict['price_old'] = ''
                except:
                    continue
                piece_units = [
                    'шт', 'штук', 'штуки', 'штука', 'пак', 'пакетиков', 'пак'
                ]
                kg_units = ['кг', 'kg', 'килограмм']  # keep in grams
                gram_units = ['г', 'g', 'грамм', 'граммов', 'гр']  # to kg
                litre_units = ['л', 'l', 'литр', 'литров', 'литра']
                ml_units = ['мл', 'ml', 'миллилитров', 'миллилитра']
                tenpiece_units = [
                    '10 шт', '10 шт.', '10шт', '10шт.', 'десяток', 'дес.'
                ]

                kg_pattern = r'\s+(?:\d{1,4}[×,.]\d{1,4}|\d{0,4})\s*(?:' + r'|'.join(
                    kg_units) + r')' + '(?:\s+|$)'
                g_pattern = r'\s+(?:\d{1,4}[×,.]\d{1,4}|\d{0,4})\s*(?:' + r'|'.join(
                    gram_units) + r')' + '(?:\s+|$)'
                l_pattern = r'\s+(?:\d{1,4}[×,.]\d{1,4}|\d{0,4})\s*(?:' + r'|'.join(
                    litre_units) + r')' + '(?:\s+|$)'
                ml_pattern = r'\s+(?:\d{1,4}[×,.]\d{1,4}|\d{0,4})\s*(?:' + r'|'.join(
                    ml_units) + r')' + '(?:\s+|$)'
                piece_pattern = r'\s+(?:\d{1,4}[×,.]\d{1,4}|\d{0,4})\s*(?:' + r'|'.join(
                    piece_units) + r')' + '(?:\s+|$)'
                tenpiece_pattern = r'\s*(?:\d{1,4}[×,.]\d{1,4}|\d{0,4})\s*(?:' + r'|'.join(
                    tenpiece_units) + r')' + '(?:\s+|$)'

                patterns = [
                    piece_pattern, tenpiece_pattern, kg_pattern, g_pattern,
                    l_pattern, ml_pattern
                ]
                price_dict['site_unit'] = None
                for pattern in patterns:
                    match = re.search(pattern,
                                      price_dict['site_title'].lower())
                    if match:
                        price_dict['site_unit'] = wspex_space(match[0])
                        # print(price_dict['site_unit'])

                if price_dict['site_unit'] is None:
                    price_dict['site_unit'] = 'шт.'

                price_dict[
                    'site_link'] = href_i  # shows the product title and a link to it
                price_dict['type'] = 'food'
                print(
                    'site_title: {}\nprice_new: {}\nprice_old: {}\nunit: {}\n'.
                    format(price_dict['site_title'], price_dict['price_new'],
                           price_dict['price_old'], price_dict['site_unit']))
                res = res.append(price_dict, ignore_index=True)

        print('OKEY has been parsed successfully')
        return res
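Note: the unit-extraction patterns in the snippet above rely on helpers such as wspex_space. A small self-contained check of the same idea, using only the gram pattern and plain whitespace normalization as a stand-in for wspex_space:

import re

gram_units = ['г', 'g', 'грамм', 'граммов', 'гр']
g_pattern = (r'\s+(?:\d{1,4}[×,.]\d{1,4}|\d{0,4})\s*(?:'
             + r'|'.join(gram_units) + r')(?:\s+|$)')

title = 'печенье овсяное 300 г'
match = re.search(g_pattern, title.lower())
if match:
    print(' '.join(match[0].split()))  # -> "300 г"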
Example 8
    def extract_products(self, max_prod=200):

        path_sfb = os.path.join(Global().base_dir, 'description', 'urls.csv')
        sfb_df = pd.read_csv(path_sfb, sep=';', index_col='id')

        list_urls = sfb_df.fillna('')[sfb_df.fillna('')['URL'].str.contains(
            'ozon')]['URL'].values

        res = pd.DataFrame(columns=[
            'date', 'type', 'category_id', 'category_title', 'site_title',
            'price_new', 'price_old', 'site_unit', 'site_link', 'site_code'
        ])

        # proxy = self.get_proxy()
        # options = webdriver.ChromeOptions()
        # proxy = get_proxy('http://ozon.ru') # in case a proxy is needed
        # options.add_argument('--headless')
        # options.add_argument('--disable-gpu')
        # options.add_argument('--proxy-server=%s' % proxy)

        driver = webdriver.Chrome(
            executable_path=Global().path_chromedriver,
            chrome_options=Global(
            ).chrome_options)  # , chrome_options=self.option_chrome(proxy))

        store = 'ozon'
        driver.implicitly_wait(30)

        id_n = -1

        for url in tqdm(list_urls[id_n + 1:]):
            flag = 0

            id_n += 1

            driver.get(url)
            driver.execute_script(
                "window.scrollTo(0, document.body.scrollHeight*0.01);")

            category_title = sfb_df[sfb_df.fillna('')['URL'].str.contains(
                'ozon')]['cat_title'].iloc[id_n]
            print('\n{} ...'.format(category_title))
            offset = 0

            soup = BeautifulSoup(driver.page_source, 'lxml')
            problem_array = []

            i = 0

            page_n = 0
            # print(url)

            while True:

                tiles_list = soup.findAll(
                    'div', {'class': 'tile'
                            })[offset:]  # container for a single product
                try:
                    n = int(
                        re.search('\d+',
                                  re.search('\d+ товар[а-я]*',
                                            soup.text)[0])[0])
                except:
                    try:
                        n = int(
                            re.search(
                                '\d+',
                                soup.find('div', {
                                    'class': 'search-title'
                                }).text)[0])
                    except:
                        print(
                            "ACHTUNG! category {} has not been parsed".format(
                                category_title))
                        continue
                # print('amount of items: ', n)

                for tile in tiles_list:
                    i += 1
                    price_dict = dict()
                    # print(tile)

                    try:
                        price_dict['price_old'] = tile.find(
                            'div', {
                                'data-test-id': 'tile-discount'
                            }).text
                        # print('price old:', price_dict['price_old'])
                        price_dict['price_old'] = int(
                            re.search('\d+',
                                      wspex(price_dict['price_old']))[0])
                    except:
                        price_dict['price_old'] = ''

                    price_dict['site_unit'] = 'шт.'
                    price_dict['site_code'] = store
                    price_dict['category_id'] = int(
                        sfb_df.fillna('')[sfb_df.fillna(
                            '')['URL'].str.contains('ozon')].index[id_n])
                    # print('category_id: ',price_dict['category_id'])
                    price_dict['date'] = Global().date
                    price_dict['type'] = 'non-food'

                    try:
                        price_dict['site_title'] = self.tnout(
                            tile.find('a', {
                                'data-test-id': "tile-name"
                            }).text)
                    except:
                        problem_array.append(url)
                        print('OOPS! url {} has not parsed site title'.format(
                            url))
                        break
                    price_dict['category_title'] = category_title
                    price_dict['price_new'] = tile.find(
                        'span', {
                            'class': 'total-price'
                        }).text
                    price_dict['price_new'] = int(
                        re.match('\d+',
                                 self.tnout(wspex(
                                     price_dict['price_new'])))[0])
                    if tile.find('a', {'class': 'full-cover-link'}) is None:
                        price_dict['site_link'] = ''
                        print(
                            "ACHTUNG! link has not parsed for site_title: {}".
                            format(price_dict['site_title']))
                    else:
                        price_dict[
                            'site_link'] = 'https://www.ozon.ru' + tile.find(
                                'a', {
                                    'class': 'full-cover-link'
                                }).get('href')
                    '''print('site_title[{}]: {}\nprice_new: {}\nprice_old: {}\n\n'.format(i,price_dict['site_title'],
                                                                                        price_dict['price_new'],
                                                                                        price_dict['price_old']))'''
                    res = res.append(price_dict, ignore_index=True)

                if i >= n or i >= max_prod or flag == 1:
                    print('   parsing has ended!')
                    break

                offset = offset + len(tiles_list)

                if offset % 280 == 0 and offset != 0:
                    page_n += 11
                    url = url + '&page={}'.format(str(page_n))
                    driver.get(url)
                    print('\n   loading url:{}'.format(url))
                    offset = 0
                    while True:
                        time.sleep(1)

                        soup = BeautifulSoup(driver.page_source, 'lxml')

                        if soup.findAll('div', {'class': 'tile'}):
                            break
                else:

                    scheight = 0.9

                    while True:

                        driver.execute_script(
                            "window.scrollTo(0, document.body.scrollHeight*{});"
                            .format(scheight))

                        soup = BeautifulSoup(driver.page_source, 'lxml')

                        if soup.findAll('div', {'class': 'tile'})[offset:]:
                            print("  offset: {}".format(offset))
                            break
                        if scheight < 1:
                            scheight += 0.01
                        else:
                            print(
                                'WARNING! Scrolling has not been executed (we are here)'
                            )
                            flag = 1
                            break

                        print(scheight)
                        time.sleep(1)

        return res
Example 9
    def extract_product_page(self):
        site_code = 'perekrestok'
        desc_df = Global().desc_df
        links_df = Global().links
        links_df = links_df[links_df['site_link'].str.contains(site_code)]
        ua = UserAgent()
        header = {'User-Agent': str(ua.chrome)}

        if Global().max_links is not None:
            links_df = links_df.iloc[:Global().max_links]
        category_ids = links_df.category_id.unique()
        res = pd.DataFrame(columns=[
            'date', 'type', 'category_id', 'category_title', 'site_title',
            'price_new', 'price_old', 'site_unit', 'site_link', 'site_code'
        ])

        proxies = None  # get_proxy('https://www.perekrestok.ru/') #

        cookie = r'noHouse=0; _gcl_au=1.1.444475933.1574074757; _ga=GA1.2.762214331.1574074757; _ym_d=1574074757; _ym_uid=1574074757539893444; flocktory-uuid=3da0c784-c6e6-48a1-b5ad-006da3a9393d-1; tracker_ai_user=BWv32|2019-11-18T10:59:21.089Z; cto_lwid=a238aaa4-fac9-42fb-8702-20f8fa785b79; _dy_c_exps=; _dycnst=dg; _dyid=-3805541292711961998; _dy_c_att_exps=; fcf=2; splitVar=test01-B; regionChange=1; luuid=2a83671e-e74e-43bf-9453-1475f62aefda; ins-product-id=484225; insdrSV=18; suuid=96bfa68c-e76a-4623-9bf0-4109601bdb57; _dy_csc_ses=t; _gid=GA1.2.710391697.1575716218; _dyjsession=f58bf955e8baea66ef52b8df2f36e6db; _dy_geo=RU.EU.RU_TUL.RU_TUL_Kireyevsk; _dy_df_geo=Russia..Kireyevsk; _ym_visorc_43992189=w; _ym_isad=1; _dycst=dk.w.c.ss.; _dy_toffset=-3; _dy_ses_load_seq=22331%3A1575717228721; _dy_soct=401501.688467.1575716213*404726.695596.1575716217*405772.698298.1575717228*405837.698434.1575717228*446004.795652.1575717228*366287.608896.1575717228; tmr_detect=1%7C1575717234838; mindboxDeviceUUID=dc46eafc-5856-4f9a-8f46-c7194b0dc0a5; directCrm-session=%7B%22deviceGuid%22%3A%22dc46eafc-5856-4f9a-8f46-c7194b0dc0a5%22%7D; XSRF-TOKEN=eyJpdiI6ImdJYzV2R2xjWHhOSTFKZTFsOFhRcXc9PSIsInZhbHVlIjoiZHhyajVkTTMrQUNXajducW5NeTk2b2JDVHlkVGhYcU9xdkFmU2pEMlBGQ0RIY1NrWlBQaFc2Y2R5MmZsRFFoUE1KS25KcGZjWDJscmRhV2ZrckNJa3c9PSIsIm1hYyI6IjQzODMyMDU5OTI4YzIwOWFkZDA5ODY2YTA1M2QyNjY1MGM5YWVjYzk0NGQ5MmE4MDY3NDE4M2M1ODAyMGZlZTgifQ%3D%3D; aid=eyJpdiI6IndQU3hKYmtDTHdcL1ZHczZtajc4K2JnPT0iLCJ2YWx1ZSI6ImlJQ1ZcL3NHQjE3emg5cDZKdzRJeUllTXBDNmRPcm9aM1JiWmx2OStGK0J5TnJEWWdxZ1FsbDFCUE5FMnlucEk2RFJNN015R0MrWXFNNUhNaXAxeitBQT09IiwibWFjIjoiZDkxYThiOGI0ZjRmNDYyYzU5M2UwYWVlMjJiNjRjYTcwNDFlZDg0ZDg2YTRjOGY0ODkzMWRmNDc5MTM1MmY3YiJ9; appservername=app1; region=1'

        headers = {
            'Accept':
            r'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
            'Accept-Encoding':
            r'gzip, deflate, br',
            'Accept-Language':
            r'ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7',
            'Cache-Control':
            r'max-age=0',
            'Connection':
            r'keep-alive',
            'Cookie':
            cookie,
            'Host':
            r'www.perekrestok.ru',
            'Referer':
            r'https://www.perekrestok.ru/',
            'Sec-Fetch-Mode':
            r'navigate',
            'Sec-Fetch-Site':
            r'same-origin',
            'Sec-Fetch-User':
            r'?1',
            'Upgrade-Insecure-Requests':
            r'1',
            'User-Agent':
            r'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
        }

        for cat_id in tqdm(category_ids):  # fixed
            url_list = links_df[links_df.category_id ==
                                cat_id].site_link.values

            category_title = desc_df.loc[cat_id, 'cat_title']

            print("{}... ".format(category_title))

            # print(' id_n =', id_n)
            i = 0

            while i + 1 <= len(url_list):
                # time.sleep(3)
                href_i = url_list[i]
                print(href_i)
                i += 1

                try:
                    if proxies is not None:
                        r = requests.get(href_i,
                                         proxies=proxies,
                                         headers=headers,
                                         timeout=60)  # CRITICAL
                    else:
                        r = requests.get(href_i, headers=headers, timeout=60)
                except Exception as e:
                    print(e)
                    while True:
                        try:
                            proxies = get_proxy(href_i)
                            r = requests.get(href_i,
                                             proxies=proxies,
                                             headers=headers,
                                             timeout=60)
                            time.sleep(3)
                            if r.status_code == 200:
                                break
                        except:
                            continue

                html = r.content

                soup = BeautifulSoup(html, 'lxml')
                price_dict = dict()

                try:
                    price_dict['site_title'] = wspex_space(
                        soup.find(
                            'h1', {
                                'class':
                                re.compile(
                                    'js-product__title\s+xf-product-card__title'
                                )
                            }).text)
                except:
                    # the title element was not found; skip this product page
                    print(soup)
                    continue

                print('site_title:', price_dict['site_title'])
                products_div = soup.find(
                    'div', {'class': 'xf-product__cost xf-product-cost'})
                if not products_div:
                    print('no products_div!')
                    # print(soup)
                    continue

                price_dict['date'] = Global().date
                price_dict['site_code'] = site_code
                price_dict['category_id'] = cat_id
                price_dict['category_title'] = category_title
                div_sale = products_div.find('div', {
                    'class':
                    'xf-price xf-product-cost__prev js-product__old-cost'
                })
                if div_sale is not None:
                    # print('div-sale:', div_sale)
                    price_dict['price_old'] = float(div_sale.get('data-cost'))
                else:
                    price_dict['price_old'] = ''

                div_new = products_div.find(
                    'div', {
                        'class':
                        'xf-price xf-product-cost__current js-product__cost _highlight'
                    })
                if div_new is None:
                    div_new = products_div.find(
                        'div', {
                            'class':
                            re.compile(
                                'xf-price\s+xf-product-cost__current\s+js-product__cost\s*'
                            )
                        })

                if div_new is None:
                    print('\tdiv_new is None!')
                    # print('products_div:', products_div)
                    continue
                price_dict['price_new'] = float(div_new.get('data-cost'))
                price_dict['site_unit'] = wspex_space(div_new.get('data-type'))
                price_dict[
                    'site_link'] = href_i  # shows the product title and a link to it
                price_dict['type'] = 'food'
                print('price_new: {}\nprice_old: {}\nunit: {}\n'.format(
                    price_dict['price_new'], price_dict['price_old'],
                    price_dict['site_unit']))
                res = res.append(price_dict, ignore_index=True)

        print('PEREKRESTOK has been parsed successfully')
        return res
Example 10
    def extract_product_page(self):
        site_code = 'ozon'
        desc_df = Global().desc_df
        links_df = Global().links
        links_df = links_df[links_df['site_link'].str.contains(site_code)]

        if Global().max_links is not None:
            links_df = links_df.iloc[:Global().max_links]
        category_ids = links_df.category_id.unique()
        res = pd.DataFrame(columns=[
            'date', 'type', 'category_id', 'category_title', 'site_title',
            'price_new', 'price_old', 'site_unit', 'site_link', 'site_code'
        ])
        # options = webdriver.ChromeOptions()
        # proxies = get_proxy('https://www.ozon.ru/')
        # options.add_argument('--headless')
        # options.add_argument('--proxy-server=%s' % proxy)
        if Global().is_selenium_ozon is True:
            driver = webdriver.Chrome(
                executable_path=Global().path_chromedriver,
                chrome_options=Global(
                ).chrome_options)  #, chrome_options=self.option_chrome(proxy))
        ua = UserAgent()
        header = {'User-Agent': str(ua.chrome)}
        proxies = None

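        # NOTE: the class names below look like Ozon's generated (minified) CSS
        # classes; they tend to change between site releases, so they are kept
        # in one place here to simplify updates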
        h1_class = 'b4j'
        price_new_class_sale = 'b4u8 b4w0'
        price_new_class = 'b4u8'
        price_old_class = 'b4v2'
        for cat_id in tqdm(category_ids):  # fixed
            url_list = links_df[links_df.category_id ==
                                cat_id].site_link.values
            category_title = desc_df.loc[cat_id, 'cat_title']
            print("{}... ".format(category_title))

            i = 0

            while i + 1 <= len(url_list):

                href_i = url_list[i]
                print(href_i)
                if Global().is_selenium_ozon is True:
                    driver.get(href_i)
                    soup = BeautifulSoup(driver.page_source, 'lxml')
                else:
                    try:
                        # time.sleep(3)
                        if proxies is not None:
                            r = requests.get(href_i,
                                             proxies=proxies,
                                             headers=header)  # CRITICAL
                        else:
                            r = requests.get(href_i, headers=header)
                    except:
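                        # the direct request failed (network error or block):
                        # keep rotating proxies until one returns HTTP 200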
                        while True:
                            print('retrying with a new proxy...')
                            try:
                                proxies = get_proxy(href_i)
                                time.sleep(3)
                                r = requests.get(href_i,
                                                 proxies=proxies,
                                                 headers=header)
                                if r.status_code == 200:
                                    break
                            except:
                                continue
                    html = r.content
                    soup = BeautifulSoup(html, 'lxml')

                i += 1

                # print(soup)
                price_dict = dict()
                price_dict['date'] = Global().date
                price_dict['site_code'] = site_code
                price_dict['category_id'] = cat_id
                price_dict['category_title'] = category_title

                try:
                    if soup.find('h1', {'class': h1_class}) is not None:
                        price_dict['site_title'] = wspex_space(
                            soup.find('h1', {
                                'class': h1_class
                            }).text)

                    print('site_title:', price_dict['site_title'])
                except:
                    print('site_title not found')
                    if 'Такой страницы не существует' in soup.text:
                        print('Такой страницы не существует!')
                        continue
                    # i -= 1
                    if soup.find('li', {'class': 'links-item'}) is None:
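                        # neither the title nor the 'links-item' block is present,
                        # which usually means the request was blocked: rotate
                        # proxies until a request succeeds; the fresh proxy is then
                        # reused for the following links (this one is skipped)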
                        while True:
                            proxies = get_proxy(href_i)
                            time.sleep(3)
                            r = requests.get(href_i,
                                             proxies=proxies,
                                             headers=header)
                            if r.status_code == 200:
                                break
                            else:
                                print('r.status_code:', r.status_code)
                    continue

                # div_new = soup.find('span', {'data-test-id': 'saleblock-first-price'})
                # print('soup:\n', soup)
                # if 'Товар закончился' in soup.text:
                # print('Товар закончился!')
                # continue

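                # the discounted price uses its own class; fall back to the regular
                # price class, and treat a missing or digit-free price block as an
                # out-of-stock item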
                div_new = soup.find('span', {'class': price_new_class_sale})

                if div_new is None:
                    div_new = soup.find('span', {'class': price_new_class})

                if div_new is None:
                    print('Товар закончился!\n')
                    continue

                if re.search('\d+', wspex(div_new.text)) is None:
                    print('Товар закончился!\n')
                    continue
                # print('din_new:\n', div_new)
                '''
                soup.find('span', {
                'class': 'price-number'})
                '''

                div_old = soup.find('span', {'class': price_old_class})

                if div_old is not None:
                    price_dict['price_old'] = int(
                        re.search('\d+', wspex(div_old.text))[0])
                else:
                    price_dict['price_old'] = ''

                price_dict['price_new'] = int(
                    re.search('\d+', wspex(div_new.text))[0])

                price_dict['site_unit'] = 'шт.'
                price_dict[
                    'site_link'] = href_i  # link to the product page
                price_dict['type'] = 'non-food'
                print('price_new: {}\nprice_old: {}\nunit: {}\n'.format(
                    price_dict['price_new'], price_dict['price_old'],
                    price_dict['site_unit']))
                res = res.append(price_dict, ignore_index=True)

        print('OZON has been parsed successfully')
        return res
Exemplo n.º 11
0
    def extract_products(self, max_prod=200):

        # proxies = get_proxy('https://www.lamoda.ru/')
        ua = UserAgent()
        header = {'User-Agent': str(ua.chrome)}
        # number of pages
        path_sfb = os.path.join(Global().base_dir, 'description',
                                'urls.csv')
        sfb_df = pd.read_csv(path_sfb, sep=';', index_col='id')

        list_urls = sfb_df[sfb_df.fillna('')['URL'].str.contains(
            'lamoda')]['URL'].values  # lamoda category URLs

        res = pd.DataFrame(columns=[
            'date', 'type', 'category_id', 'category_title', 'site_title',
            'price_new', 'price_old', 'site_unit', 'site_link', 'site_code'
        ])

        start_html = 'https://www.lamoda.ru'
        id_n = -1
        fail_list = []
        proxies = None  # set lazily the first time a direct request fails

        store = 'lamoda'

        for url in tqdm(list_urls):

            id_n += 1
            category_title = sfb_df[sfb_df.fillna('')['URL'].str.contains(
                'lamoda')]['cat_title'].iloc[id_n]

            print('\n{} ...'.format(category_title))
            page = 0

            cat_row = pd.DataFrame(columns=[
                'date', 'type', 'category_id', 'category_title', 'site_title',
                'price_new', 'price_old', 'site_unit', 'site_link', 'site_code'
            ])

            try:
                # time.sleep(3)
                r = requests.get(url, headers=header)
            except:
                print('need proxy!')
                proxies = get_proxy(url)
                r = requests.get(url, proxies=proxies, headers=header)
            html = r.content

            soup = BeautifulSoup(html, 'lxml')
            if soup.find(
                    'span',
                {'class': 'products-catalog__head-counter'}) is not None:
                total_amount = int(
                    re.search(
                        r'\d+',
                        wspex(
                            soup.find('span', {
                                'class': 'products-catalog__head-counter'
                            }).text)).group())
                # print('total_amount: ', total_amount)
            else:
                print('total_amount HAS NOT BEEN FOUND!')
                fail_list.append(id_n)
                continue

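            # pages are requested one at a time via the ?page=N parameter until
            # either max_prod items are collected or the counter shown on the
            # category page (total_amount) is reached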
            while True:
                # time.sleep(2)
                page += 1
                url_i = url + '?page={}'.format(page)
                print('   loading url:{}'.format(url_i))
                try:
                    r = requests.get(url_i, proxies=proxies)
                except:
                    proxies = get_proxy(url)
                    r = requests.get(url_i, proxies=proxies)
                html = r.content

                soup = BeautifulSoup(html, 'lxml')

                product_div = soup.findAll(
                    'a', {'class': 'products-list-item__link link'})
                for product in product_div:
                    product_dict = dict()
                    product_dict['category_id'] = int(
                        sfb_df.fillna('')[sfb_df.fillna(
                            '')['URL'].str.contains('lamoda')].index[id_n])
                    # print('category_id: ', product_dict['category_id'])
                    product_dict['date'] = Global().date
                    product_dict['site_code'] = store
                    product_dict['category_title'] = category_title
                    product_dict['site_title'] = wspex_space(
                        product.find('img').attrs['alt']
                    )  # find('div', {'class': 'products-list-item__brand'}).text)
                    product_dict[
                        'site_link'] = start_html + product.attrs['href']
                    product_dict['site_unit'] = 'шт.'
                    cost_text = product.find('span', {'class': 'price'})
                    #print(cost_text)
                    try:
                        product_dict['price_new'] = tofloat(
                            wspex(
                                cost_text.find('span', {
                                    'class': 'price__new'
                                }).text))

                        product_dict['price_old'] = tofloat(
                            wspex(
                                cost_text.find('span', {
                                    'class': 'price__old'
                                }).text))
                        product_dict['price_new'] = int(
                            product_dict['price_new'])
                        product_dict['price_old'] = int(
                            product_dict['price_old'])

                    except:
                        product_dict['price_new'] = tofloat(
                            wspex(
                                cost_text.find('span', {
                                    'class': 'price__actual'
                                }).text))
                        product_dict['price_old'] = ''
                        product_dict['price_new'] = int(
                            product_dict['price_new'])

                    product_dict['type'] = 'non-food'

                    if product_dict['price_new'] == '' or product_dict[
                            'price_new'] == None:
                        print('{} has no price!!!'.format(
                            product_dict['site_title']))

                        #print('title: {}\nprice_new: {}\nprice_old: {}\n\n'.format(product_dict['site_title'],product_dict['price_new'],product_dict['price_old']))
                    cat_row = cat_row.append(product_dict, ignore_index=True)
                    #print(cat_row[['site_title','price_new','price_old']])

                if len(cat_row) >= max_prod or len(cat_row) == total_amount:
                    res = res.append(cat_row, ignore_index=True)
                    break
                else:
                    continue

        if fail_list != []:
            for elem in fail_list:
                print('CATEGORY {} HAS NOT BEEN PARSED'.format(elem))

        return res
Exemplo n.º 12
0
    def extract_product_page(self):
        site_code = 'lamoda'
        desc_df = Global().desc_df
        links_df = Global().links
        links_df = links_df[links_df['site_link'].str.contains(site_code)]
        if Global().max_links != None:
            links_df = links_df.iloc[:Global().max_links]
        category_ids = links_df.category_id.unique()
        res = pd.DataFrame(columns=[
            'date', 'type', 'category_id', 'category_title', 'site_title',
            'price_new', 'price_old', 'site_unit', 'site_link', 'site_code'
        ])

        # proxies = get_proxy('https://www.lamoda.ru/')
        proxies = None

        for cat_id in tqdm(category_ids):  # fixed
            url_list = links_df[links_df.category_id ==
                                cat_id].site_link.values

            category_title = desc_df.loc[cat_id, 'cat_title']

            print("{}... ".format(category_title))

            # print(' id_n =', id_n)
            i = 0

            ua = UserAgent()
            header = {'User-Agent': str(ua.chrome)}
            while i + 1 <= len(url_list):

                href_i = url_list[i]
                print(href_i)
                i += 1

                try:
                    # time.sleep(3)
                    if proxies is not None:
                        r = requests.get(href_i,
                                         proxies=proxies,
                                         headers=header,
                                         timeout=60)  # CRITICAL
                    else:
                        r = requests.get(href_i, headers=header, timeout=60)
                except:
                    while True:
                        proxies = get_proxy(href_i)
                        time.sleep(3)
                        try:
                            r = requests.get(href_i,
                                             proxies=proxies,
                                             headers=header)
                            if r.status_code == 200:
                                break
                        except:
                            continue

                html = r.content

                soup = BeautifulSoup(html, 'lxml')

                products_div = soup.find('div', {'class': 'ii-product-buy'})
                if not products_div:
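                    # a missing buy block usually means an anti-bot stub page was
                    # served: take a fresh proxy and retry the same link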
                    proxies = get_proxy('https://www.lamoda.ru/')
                    i -= 1
                    print('no products_div!')
                    continue

                price_dict = dict()
                price_dict['date'] = Global().date
                price_dict['site_code'] = site_code
                price_dict['category_id'] = cat_id
                price_dict['category_title'] = category_title
                # print(soup)
                div_sale = soup.find('div',
                                     {'class': 'ii-product__price-discount'})

                if div_sale is not None:
                    # print('div-sale: ',div_sale)
                    price_dict['price_old'] = float(
                        re.match('\d+', wspex(div_sale.text))[0])
                else:
                    price_dict['price_old'] = ''

                type_good = wspex_space(
                    products_div.find('a', {
                        'class': 'hidden'
                    }).text)
                if type_good == '':
                    # print(' imhere!')
                    type_good = wspex_space(
                        text_diff(
                            soup.find('span', {
                                'class': 'heading_m ii-product__title'
                            }).text,
                            soup.find('span', {
                                'class': 'ii-product__brand'
                            }).text))

                try:
                    # if products_div.find('a', {'class': 'hidden'}).text is '':
                    # print(soup)
                    price_dict[
                        'site_title'] = type_good + ' Артикул: ' + wspex_space(
                            products_div.find('div', {
                                'class': 'ii-select__option'
                            }).get('data-value'))

                except:
                    continue
                # print(products_div)
                div_new = products_div.find(
                    'div',
                    {'class': 'ii-product__price ii-product__price_several'})
                if div_new is None:
                    div_new = products_div.find(
                        'div', {
                            'class':
                            'ii-product__price ii-product__price_several DT1717'
                        })
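                # the data-several-prices attribute holds a small JSON blob; judging
                # by the code below its shape is roughly (assumed, not verified):
                #   {"details": [{"value": <old price>}, {"value": <current price>}]}
                # with the old price first whenever a discount is active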
                dct = demjson.decode(div_new.get('data-several-prices'))

                if len(dct['details']) > 1:
                    price_dict['price_old'] = int(dct['details'][0]['value'])
                    price_dict['price_new'] = int(dct['details'][1]['value'])
                else:
                    price_dict['price_new'] = int(dct['details'][0]['value'])
                '''
                else:
                    div_old = 
                    price_dict['price_old'] = int(wspex())
                    price_dict['price_new'] = int(dct['details'][1]['value'])
                '''

                price_dict['site_unit'] = 'шт.'
                price_dict[
                    'site_link'] = href_i  # link to the product page
                price_dict['type'] = 'non-food'
                print(
                    'site_title: {}\nprice_new: {}\nprice_old: {}\nunit: {}\n\n'
                    .format(price_dict['site_title'], price_dict['price_new'],
                            price_dict['price_old'], price_dict['site_unit']))
                res = res.append(price_dict, ignore_index=True)

        print('LAMODA has been parsed successfully')
        return res
Exemplo n.º 13
0
    def extract_products(self):

        start_time = datetime.now()

        res = pd.DataFrame(columns=[
            'date', 'type', 'category_id', 'category_title', 'site_title',
            'price_new', 'price_old', 'site_unit', 'site_link', 'site_code'
        ])

        path_sfb = os.path.join('description', 'urls.csv')

        sfb_df = pd.read_csv(path_sfb, sep=';', index_col='id')

        hrefs = sfb_df[sfb_df.fillna('')['URL'].str.contains('utkonos')]['URL'].values  # utkonos category URLs
        hrefs = [href for href in hrefs if type(href) is not float]  # drop empty cells
        id_n = 0
        #proxies = get_proxy('https://www.utkonos.ru/')

        for href in tqdm(hrefs):
            #
            id_n += 1
            category_title = sfb_df[sfb_df.fillna('')['URL'].str.contains(
                'utkonos')]['cat_title'].iloc[id_n - 1]
            print("{}...".format(category_title))

            url_list = list_html(href)
            i = 0

            while i + 1 <= len(url_list):

                href_i = url_list[i]
                i += 1
                page = 0
                while True:
                    page += 1

                    url_full = self.construct_html(href_i, page)

                    print('loading url', url_full)

                    try:
                        r = requests.get(url_full)  # CRITICAL
                    except:
                        raise ValueError('could not load {}'.format(url_full))
                        #proxies = get_proxy('https://www.utkonos.ru/')
                        #r = requests.get(url_full, proxies=proxies)
                    html = r.content

                    soup = BeautifulSoup(html, 'lxml')

                    products_div = soup.find('div',
                                             {'class': 'goods_view_box'})
                    if products_div is None:
                        # soup.find() returns None rather than raising, so an
                        # explicit check is needed here
                        print("OOPS! {} has no products_div".format(url_full))
                        continue

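                    # figure out whether a next page exists: scan the pager links for
                    # the largest numeric label and compare it with the current page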
                    pages_controller_div = soup.find('div',
                                                     {'class': 'el_paginate'})
                    if pages_controller_div is None:
                        # print('no_pages_controller')
                        flag_nextpage = False
                    else:
                        pages_refs = pages_controller_div.find_all('a')
                        max_page_index = 1

                        for ref in pages_refs:
                            page_index = self.representsInt(ref.text.strip())
                            if page_index is not None:
                                if page_index > max_page_index:
                                    max_page_index = page_index
                        if max_page_index > page:
                            # print('max_page_index: ', max_page_index)
                            flag_nextpage = True
                            # print('nextpage!')
                        else:
                            flag_nextpage = False
                            # print('nonextpage!')

                    try:
                        price_list = products_div.find_all(
                            'div', {
                                'class':
                                'goods_view_box-view goods_view goods_view-item'
                            })
                    except:
                        print("OOPS! {} has no price_list".format(url_full))
                        continue

                    for price_elem in price_list:

                        price_dict = dict()

                        price_dict['date'] = Global().date
                        price_dict['site_code'] = 'utkonos'
                        price_dict['category_id'] = id_n
                        price_dict['category_title'] = category_title

                        # product_unavailable_div = price_elem.find('div', {'class': 'product-unavailable-text'})
                        #     if product_unavailable_div is not None:
                        #         continue # just skip
                        #

                        product_name_div = price_elem.find(
                            'div', {'class': 'goods_view_box-caption'})
                        if product_name_div is not None:
                            aref = product_name_div.find('a')
                            if aref is not None:
                                price_dict['site_title'] = wspex_space(
                                    aref.text)
                                price_dict['site_link'] = aref.get('href')
                            else:
                                continue
                        else:
                            continue
                        if filter_flag(id_n,
                                       price_dict['site_title']) == False:
                            # print("   skipped position: {}".format(price_dict['site_title']))
                            continue

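                        # discounted items use the 'current big' price block together
                        # with an 'old_price' block; items without a discount fall
                        # through to the plain 'current' block in the except branch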
                        try:
                            product_price_div = price_elem.find(
                                'div',
                                {'class': 'goods_price-item current big'})
                            div_sale = price_elem.find(
                                'div', {'class': 'goods_price-item old_price'})
                            if div_sale:
                                price_dict['price_old'] = find_float_number(
                                    div_sale.text)
                            else:
                                price_dict['price_old'] = ''

                            # if product_price_div is not None:
                            price_dict['price_new'] = find_float_number(
                                product_price_div.text)

                            if price_dict['price_old'] == price_dict[
                                    'price_new']:
                                price_dict['price_old'] = ''

                            price_dict['site_unit'] = str(
                                product_price_div.get('data-weight'))[1:]
                        except:
                            product_price_div = price_elem.find(
                                'div', {'class': 'goods_price-item current'})

                            # if product_price_div is not None:
                            price_dict['price_new'] = find_float_number(
                                product_price_div.text)
                            price_dict['price_old'] = ''
                            price_dict['site_unit'] = str(
                                product_price_div.get('data-weight'))[1:]
                        """print('site_title: {}\nprice_new: {}\nprice_old: {}\nunit: {}\n'.format(price_dict['site_title'],
                                                                           price_dict['price_new'],
                                                                           price_dict['price_old'],
                                                                           price_dict['site_unit']))"""
                        # print(price_dict)
                        price_dict['type'] = 'food'
                        res = res.append(price_dict, ignore_index=True)

                    if flag_nextpage == False:
                        break
        end_time = datetime.now()
        time_execution = str(end_time - start_time)
        print('UTKONOS has been parsed successfully\ntotal time of execution: {}'.
              format(time_execution))
        return res
Exemplo n.º 14
0
    def extract_product_page(self):
        site_code = 'utkonos'
        # ua = UserAgent()
        # header = {'User-Agent': str(ua.chrome)}
        header = {
            'User-Agent':
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/51.0.2704.103 Safari/537.36'
        }
        # print(header)
        desc_df = Global().desc_df
        links_df = Global().links
        links_df = links_df[links_df['site_link'].str.contains(
            site_code)].iloc[:Global().max_links]

        category_ids = links_df.category_id.unique()
        res = pd.DataFrame(columns=[
            'date', 'type', 'category_id', 'category_title', 'site_title',
            'price_new', 'price_old', 'site_unit', 'site_link', 'site_code'
        ])
        check_url = links_df[links_df.category_id == 1].site_link.values[0]
        # proxies = get_proxy(check_url) #
        proxies = None

        time.sleep(5)
        # selenium
        if Global().is_selenium_utkonos:
            path = Global().path_chromedriver
            # options = webdriver.ChromeOptions()
            # options.add_argument('--headless')
            driver = webdriver.Chrome(executable_path=path,
                                      chrome_options=Global().chrome_options)

        #
        for cat_id in tqdm(category_ids):  # fixed
            url_list = links_df[links_df.category_id ==
                                cat_id].site_link.values

            category_title = desc_df.loc[cat_id, 'cat_title']

            print("{}... ".format(category_title))

            # print(' id_n =', id_n)
            i = 0

            while i + 1 <= len(url_list):

                href_i = url_list[i]
                i += 1

                print(href_i)
                if Global().is_selenium_utkonos:
                    driver.get(href_i)
                    soup = BeautifulSoup(driver.page_source, 'html.parser')
                    # driver.close()
                else:
                    # time.sleep(3)
                    try:
                        if proxies is not None:
                            r = requests.get(href_i,
                                             proxies=proxies,
                                             headers=header)
                        else:
                            r = requests.get(href_i, headers=header)
                    except Exception as e:
                        print('Exception:', e)
                        while True:
                            try:
                                proxies = get_proxy(href_i)
                                time.sleep(3)
                                r = requests.get(href_i,
                                                 proxies=proxies,
                                                 headers=header)
                                break
                            except:
                                continue

                    html = r.content

                    soup = BeautifulSoup(html, 'html.parser')
                    # print('soup:\n', soup)

                products_div = soup.find('div',
                                         {'class': 'goods_view_item-action'})
                if products_div is None:
                    print('products_div not found; dumping page:\n', soup)
                    continue
                # print(products_div)
                # products_div = soup.find('div', {'class': 'b-section--bg i-pb30 js-product-item js-product-main'})
                # print('\n\nproducts_div:\n', products_div)
                price_dict = dict()

                price_dict['date'] = Global().date
                price_dict['site_code'] = 'utkonos'
                price_dict['category_id'] = cat_id
                price_dict['category_title'] = category_title

                # try:
                price_dict['site_title'] = wspex_space(
                    products_div.find('div', {
                        'class': 'goods_view_item-action_header'
                    }).text)
                # except:
                #     print('url %s is broken' % href_i)
                #     continue
                price_dict['site_link'] = href_i
                # print(price_dict['site_link'])

                # if filter_flag(id_n, price_dict['site_title']) == False:
                # print("   skipped position: {}".format(price_dict['site_title']))
                # continue
                price_div = products_div.find(
                    'div', {'class': 'goods_price has_old_price'})

                # print('div_sale:', div_sale)
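                # 'goods_price has_old_price' wraps both prices when the item is
                # discounted; otherwise only the plain 'current' block is present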
                if price_div is not None:

                    div_sale = price_div.find(
                        'div', {'class': 'goods_price-item old_price'})
                    # print('div_sale: ', div_sale)
                    price_dict['price_old'] = float(
                        re.search('\d+\.\d+',
                                  wspex(div_sale.text).replace(',', '.'))[0])

                    div_new = price_div.find(
                        'div', {'class': 'goods_price-item current'})
                    if div_new is None:
                        div_new = price_div.find(
                            'div', {'class': 'goods_price-item current big'})
                    price_dict['price_new'] = float(
                        re.search('\d+\.\d+',
                                  wspex(div_new.text).replace(',', '.'))[0])

                    price_dict['site_unit'] = str(
                        div_new.get('data-weight'))[1:]

                else:
                    div_new = products_div.find(
                        'div', {'class': 'goods_price-item current'})
                    if div_new is None:
                        div_new = products_div.find(
                            'div', {'class': 'goods_price-item current big'})
                    price_dict['price_new'] = float(
                        re.search('\d+\.\d+',
                                  wspex(div_new.text).replace(',', '.'))[0])
                    price_dict['price_old'] = ''
                    price_dict['site_unit'] = str(
                        div_new.get('data-weight'))[1:]
                print(
                    'site_title: {}\nprice_new: {}\nprice_old: {}\nunit: {}\n'.
                    format(price_dict['site_title'], price_dict['price_new'],
                           price_dict['price_old'], price_dict['site_unit']))
                # print(price_dict)
                price_dict['type'] = 'food'
                res = res.append(price_dict, ignore_index=True)

        if Global().is_selenium_utkonos:
            driver.quit()

        print('UTKONOS has been parsed successfully')
        return res
Exemplo n.º 15
0
    def get_df(self):
        print('get data from services...')
        sfb_df = pd.read_csv(self.path_sfb, sep=';', index_col='id')
        serv_df = sfb_df[sfb_df['type'] == 'services']

        list_url = serv_df['URL'].values
        final_df = pd.DataFrame(columns=[
            'date', 'type', 'category_id', 'category_title', 'site_title',
            'price_new', 'price_old', 'site_unit', 'site_link', 'site_code'
        ])
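
        # each of the numbered blocks below handles one service URL from
        # urls.csv, addressed by its position n in list_url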

        # MGTS
        n = 0
        url = list_url[n]
        print(url)
        html = requests.get(url, headers={
            'User-Agent': UserAgent().chrome
        }).content  #, headers={'User-Agent': UserAgent().chrome}
        soup = BeautifulSoup(html, 'lxml')
        price_list = soup.findAll('div',
                                  {'class': 'slider_slide'})  # replace the 0 if needed
        for price_elem in price_list:
            if price_elem.findAll('div',
                                  {'class': 'texts'})[0].text == 'Безлимитный':
                price_dict = dict()
                price_dict['date'] = Global().date
                price_dict['site_code'] = 'services'
                id_n = int(serv_df[serv_df['URL'].str.contains(url)].index[0])
                price_dict['category_id'] = id_n
                price_dict['category_title'] = serv_df['cat_title'].loc[
                    price_dict['category_id']]
                price_dict['type'] = 'services'
                price_dict['site_title'] = price_elem.findAll(
                    'div', {'class': 'texts'})[0].text
                price_dict['price_new'] = int(
                    price_elem.findAll('div',
                                       {'class': 'slider_price_val'})[0].text)
                price_dict['price_old'] = ''
                price_dict['site_unit'] = price_elem.findAll(
                    'div', {'class': 'slider_price_rub1'
                            })[0].text + '/' + price_elem.findAll(
                                'div', {'class': 'slider_price_rub2'})[0].text
                price_dict['site_link'] = url
                final_df = final_df.append(price_dict, ignore_index=True)
                break

        # Bathhouse wash, common section, ticket  http://legkiipar.ru/menraz.html
        try:
            n = 1
            url = list_url[n]
            print(url)
            html = requests.get(
                url).content  #, headers={'User-Agent': UserAgent().chrome}
            soup = BeautifulSoup(html, 'lxml')  # Weekdays from 08:00 to 22:00
            pattern = re.compile(r'Будние дни')
            price_dict = dict()
            price_dict['date'] = Global().date
            price_dict['site_code'] = 'services'
            price_dict['type'] = 'services'
            price_dict['site_title'] = soup(text=pattern)[0]
            price_1 = soup.findAll('span', {'class': 'стиль6'})
            price_dict['price_new'] = re.findall('\d+', price_1[1].text)[0]
            price_dict['price_old'] = ''
            price_dict['site_unit'] = re.findall('\d+ часа',
                                                 price_1[4].text[:-1])[0]
            price_dict['category_id'] = int(
                serv_df[serv_df['URL'].str.contains(url)].index[0])
            price_dict['category_title'] = serv_df['cat_title'].loc[
                price_dict['category_id']].values[0]
            price_dict['site_link'] = url
            final_df = final_df.append(price_dict, ignore_index=True)
        except:
            print('DAMN! {} can not be parsed'.format(url))

        # Bathhouse wash, common section, ticket  http://banya-lefortovo.ru/price.html
        n = 2
        price_dict = dict()
        price_dict['date'] = Global().date
        price_dict['site_code'] = 'services'
        url = list_url[n]
        print(url)
        html = requests.get(
            url).content  #, headers={'User-Agent': UserAgent().chrome}
        soup = BeautifulSoup(html, 'lxml')
        pattern = re.compile(r'Русская общая баня')
        price_dict['site_title'] = soup(text=pattern)[0]
        price_dict['category_id'] = int(
            serv_df[serv_df['URL'].str.contains(url)].index[0])
        price_dict['category_title'] = serv_df.loc[
            price_dict['category_id']]['cat_title'].values[0]
        price_dict['type'] = 'services'
        price_dict['price_new'] = int(
            re.findall('\d+',
                       re.findall('\d+ рублей',
                                  soup(text=pattern)[0])[0])[0])
        price_dict['price_old'] = ''
        price_dict['site_unit'] = re.findall('\d+ часа',
                                             soup(text=pattern)[0])[0]
        price_dict['site_link'] = url
        final_df = final_df.append(price_dict, ignore_index=True)

        # Bathhouse wash, common section, ticket  https://rzhevskie-bani.ru/rb/bani.html
        n = 3
        price_dict = dict()
        url = list_url[n]
        print(url)
        html = requests.get(
            url).content  #, headers={'User-Agent': UserAgent().chrome}
        soup = BeautifulSoup(html, 'lxml')
        price_dict['price_new'] = int(
            re.findall('\d+',
                       soup.findAll('td', {'class': 'price'})[0].text)[0])
        pattern = re.compile(r'Стоимость')
        soup.findAll('td')
        price_dict['date'] = Global().date
        price_dict['site_code'] = 'services'
        price_dict['category_id'] = int(
            serv_df[serv_df['URL'].str.contains(url)].index[0])
        price_dict['category_title'] = serv_df.loc[
            price_dict['category_id']]['cat_title'].values[0]
        price_dict['site_title'] = soup(text=pattern)[0]
        price_dict['type'] = 'services'
        price_dict['site_unit'] = re.findall('(\d+.*\d часа)',
                                             soup(text=pattern)[0][-9:])[0]
        price_dict['site_link'] = url
        final_df = final_df.append(price_dict, ignore_index=True)

        # Bathhouse wash, common section, ticket  http://vorontsovskie-bani.ru/obshchestvennye-bani/muzhskoj-zal-pervyj-razryad
        n = 4
        price_dict = dict()
        price_dict['date'] = Global().date
        price_dict['site_code'] = 'services'
        url = list_url[n]
        print(url)
        price_dict['category_id'] = int(
            serv_df[serv_df['URL'].str.contains(url)].index[0])
        try:
            html = requests.get(url,
                                headers={
                                    'User-Agent': UserAgent().chrome
                                },
                                timeout=10).content
        except:
            proxy = get_proxy(url)
            html = requests.get(url,
                                headers={
                                    'User-Agent': UserAgent().chrome
                                },
                                proxies=proxy).content
        soup = BeautifulSoup(html, 'lxml')
        price_div = soup.findAll('div', {'class': 'price-head'})[0]
        price_dict['price_new'] = int(
            re.findall('\d+',
                       price_div.findAll('span',
                                         {'class': 'price'})[0].text)[0])
        price_dict['price_old'] = ''
        price_dict['site_title'] = price_div.find('p').text.replace(
            '\xa0', ' ')
        price_dict['site_unit'] = re.findall('\d+ часа',
                                             price_dict['site_title'])[0]
        price_dict['type'] = 'services'
        price_dict['site_link'] = url
        price_dict['category_title'] = serv_df.loc[
            price_dict['category_id']]['cat_title'].values[0]
        final_df = final_df.append(price_dict, ignore_index=True)

        # Fitting heel taps, pair  https://masterskaya-obuvi.ru/tseny
        '''
        n=5
        price_dict=dict()
        price_dict['date']=Global().date
        price_dict['site_code']='services'
        url=list_url[n]
        print(url)
        html=requests.get(url).content#, headers={'User-Agent': UserAgent().chrome}
        soup=BeautifulSoup(html, 'lxml')
        price_dict['category_id']=int(serv_df[serv_df['URL'].str.contains(url)].index[0])
        price_dict['category_title'] = serv_df.loc[price_dict['category_id']]['cat_title'].values[0]
        for elem in soup.findAll('tr'):
            if re.findall('износоустойчивой резины',elem.text)!=[]:
                price_div=elem
                price_dict['site_title']=re.findall('[А-Яа-яёз(). ]+',elem.text)[0]
                price_dict['site_unit']=re.findall('[А-Яа-яёз(). ]+',elem.text)[1]
                price_dict['price_new']=int(price_div.findAll('td',{'width':"356"})[0].text)
                price_dict['price_old'] = ''
                price_dict['type'] = 'services'
                price_dict['site_link']=url
                break

        final_df=final_df.append(price_dict,ignore_index=True)
        '''

        # Fitting heel taps, pair  https://masterskaya-obuvi.ru/tseny
        n = 6
        price_dict = dict()
        price_dict['date'] = Global().date
        price_dict['site_code'] = 'services'
        url = list_url[n]
        print(url)
        html = requests.get(
            url).content  #, headers={'User-Agent': UserAgent().chrome}
        soup = BeautifulSoup(html, 'lxml')
        price_dict['category_id'] = int(
            serv_df[serv_df['URL'].str.contains(url)].index[0])
        price_dict['category_title'] = serv_df.loc[
            price_dict['category_id']]['cat_title'].values[0]
        for elem in soup.findAll('tr'):
            if re.findall('эконом', elem.text) != []:
                price_div = elem
                price_dict['site_title'] = self.wspex_space(
                    re.findall(
                        '[А-Яа-яёз(). ]+',
                        price_div.findAll('td', {'align': 'left'})[0].text)[0])
                price_text = price_div.findAll('strong')[0].text
                price_dict['price_new'] = int(re.findall('\d+', price_text)[0])
                price_dict['price_old'] = ''
                price_dict['type'] = 'services'
                price_dict['site_unit'] = re.findall(
                    '\([А-Яа-я]*\)', price_dict['site_title'])[0][1:-1]
                price_dict['site_link'] = url
                break
        final_df = final_df.append(price_dict, ignore_index=True)

        # Single-trip ticket - Mosgortrans
        n = 7
        price_dict = dict()
        price_dict['site_code'] = 'services'
        price_dict['date'] = Global().date
        url = list_url[n]
        print(url)
        html = requests.get(
            url).content  #, headers={'User-Agent': UserAgent().chrome}
        soup = BeautifulSoup(html, 'lxml')
        #soup.findAll('td')#,{'class':'text-center'})[0]
        price_dict['category_id'] = int(
            serv_df[serv_df['URL'].str.contains(url)].index[0])
        price_dict['category_title'] = serv_df.loc[
            price_dict['category_id']]['cat_title']
        for elem in soup.findAll('td'):
            if re.findall('не более', elem.text) != []:
                price_div = elem
                site_title = price_div.text
                break

        for elem in soup.findAll('tr'):
            if re.findall('не более', elem.text) != []:
                price_div = elem
                price_dict['site_title'] = price_div.find('td').text
                price_dict['price_new'] = int(
                    re.findall('\d{2,3}', price_div.text)[0])
                price_dict['price_old'] = ''
                price_dict['type'] = 'services'
                price_dict['site_unit'] = 'поездка'
                price_dict['site_link'] = url
                break
        final_df = final_df.append(price_dict, ignore_index=True)

        # haircut
        try:
            n = 8
            price_dict = dict()
            price_dict['site_code'] = 'services'
            price_dict['date'] = Global().date
            url = list_url[n]
            print(url)
            html = requests.get(
                url).content  # , headers={'User-Agent': UserAgent().chrome}
            soup = BeautifulSoup(html, 'lxml')

            # soup.findAll('td')#,{'class':'text-center'})[0]
            for elem in soup.findAll('tr'):
                if re.findall('(любой длины)', elem.text) != []:
                    price_dict['category_id'] = int(
                        serv_df[serv_df['URL'].str.contains(url)].index[-1])
                    price_dict['category_title'] = serv_df.loc[
                        price_dict['category_id']]['cat_title'].values[0]
                    price_text = elem.text
                    price_dict['site_title'] = re.findall(
                        '[А-Яа-я ()]+', price_text)[0]
                    price_dict['price_new'] = re.findall('\d+', price_text)[0]
                    price_dict['price_old'] = ''
                    price_dict['type'] = 'services'
                    price_dict['site_unit'] = 'стрижка'
                    price_dict['site_link'] = url
                    break
            final_df = final_df.append(price_dict, ignore_index=True)
        except:
            print('DAMN! {} can not be parsed'.format(url))

        # haircut
        try:
            n = 9
            price_dict = dict()
            price_dict['site_code'] = 'services'
            price_dict['date'] = Global().date
            url = list_url[n]
            print(url)
            html = requests.get(
                url).content  #, headers={'User-Agent': UserAgent().chrome}
            soup = BeautifulSoup(html, 'lxml')

            for elem in soup.findAll('tr'):
                if re.findall('Женская', elem.text) != []:
                    price_div = elem
                    price_dict['category_id'] = int(
                        serv_df[serv_df['URL'].str.contains(url)].index[0])
                    price_dict['category_title'] = serv_df.loc[
                        price_dict['category_id']]['cat_title'].values[0]
                    price_dict['site_title'] = price_div.find(
                        'td', {
                            'class': 'services-table__name'
                        }).text
                    price_dict['price_new'] = int(
                        self.wspex(
                            price_div.find(
                                'td', {
                                    'class':
                                    'services-table__price services-table__price-small'
                                }).text))
                    price_dict['price_old'] = ''
                    price_dict['type'] = 'services'
                    price_dict['site_unit'] = 'стрижка'
                    price_dict['site_link'] = url
                    break
            final_df = final_df.append(price_dict, ignore_index=True)
        except:
            print('DAMN! {} can not be parsed'.format(url))

        # haircut
        n = 10
        price_dict = dict()
        price_dict['site_code'] = 'services'
        price_dict['date'] = Global().date
        url = list_url[n]
        print(url)
        html = requests.get(
            url).content  #, headers={'User-Agent': UserAgent().chrome}
        soup = BeautifulSoup(html, 'lxml')
        for elem in soup.findAll('tr'):
            if re.findall('лопаток', elem.text) != []:
                price_div = elem
                price_dict['category_id'] = int(
                    serv_df[serv_df['URL'].str.contains(list_url[n -
                                                                 1])].index[0])
                price_dict['category_title'] = serv_df.loc[
                    price_dict['category_id']]['cat_title'].values[0]
                price_dict['site_title'] = price_div.find(
                    'td', {
                        'height': '17'
                    }).text
                price_dict['price_new'] = int(
                    self.wspex(price_div.find('td', {
                        'width': '157'
                    }).text))
                price_dict['price_old'] = ''
                price_dict['type'] = 'services'
                price_dict['site_unit'] = 'стрижка'
                price_dict['site_link'] = url
                break
        final_df = final_df.append(price_dict, ignore_index=True)

        # Single-trip ticket - Mosgortrans
        n = 11
        price_dict = dict()
        price_dict['site_code'] = 'services'
        price_dict['date'] = Global().date
        url = list_url[n]
        print(url)
        html = requests.get(
            url).content  #, headers={'User-Agent': UserAgent().chrome}
        soup = BeautifulSoup(html, 'lxml')

        for elem in soup.findAll('tr'):
            if re.findall('не более', elem.text) != []:
                price_div = elem
                price_dict['category_id'] = int(
                    serv_df[serv_df['URL'].str.contains(url)].index[-1])
                price_dict['category_title'] = serv_df.loc[
                    price_dict['category_id']]['cat_title']
                price_dict['site_title'] = price_div.find('td').text
                price_dict['price_new'] = int(
                    re.findall('\d{2,3}', price_div.text)[0])
                price_dict['price_old'] = ''
                price_dict['type'] = 'services'
                price_dict['site_unit'] = 'поездка'
                price_dict['site_link'] = url
                break
        final_df = final_df.append(price_dict, ignore_index=True)
        final_df = final_df[final_df.site_title.notna()]
        print('ALL SERVICES HAVE BEEN SUCCESSFULLY PARSED!')
        return final_df
Exemplo n.º 16
0
    def __init__(self):
        self.path_sfb = os.path.join(Global().base_dir, 'description',
                                     'urls.csv')
Exemplo n.º 17
0
    def printer_test(self):
        function_start_time = datetime.now()

        Global().getproxies()
        print('Timer call : start making snapshots')

        date_now = datetime.now().strftime("%Y-%m-%d")

        df = get_empty_handler_DF()

        # use Display from the pyvirtualdisplay package so that Selenium runs without opening a real window
        with Display():
            # tor_webdriver = create_tor_webdriver()
            # df = df.append(IkeaHandlerMSK(tor_driver=tor_webdriver).extract_products())
            # df = df.append(RiglaHandlerSPB(tor_driver=tor_webdriver).extract_products())
            # df = df.append(PerekrestokHandlerSPB(tor_driver=tor_webdriver).extract_products())
            # df = df.append(OkeyHandlerSPB(tor_driver=tor_webdriver).extract_products())
            # tor_webdriver.quit()

            df = df.append(
                IkeaHandlerMSK(proxy_method='tor-service').extract_products())
            df = df.append(
                RiglaHandlerSPB(proxy_method='tor-service').extract_products())
            df = df.append(
                PerekrestokHandlerSPB(
                    proxy_method='tor-service').extract_products())
            df = df.append(
                OkeyHandlerSPB(proxy_method='tor-service',
                               use_request=True).extract_products())

            df = df.append(
                SvaznoyHandlerMSK(proxy_method='no-proxy').extract_products())
            df = df.append(
                EldoradoHandlerMSK(
                    proxy_method='tor-service').extract_products())

            df = df.append(
                LentaHandlerMSK(proxy_method='no-proxy').extract_products())
            df = df.append(
                LentaHandlerSPB(proxy_method='no-proxy').extract_products())

        with Display():
            try:
                df = df.append(TotalGrocery().get_df_page())
            except:
                print('ERROR while handling TotalGrocery')

            try:
                df = df.append(TotalNongrocery().get_df_page())
            except:
                print('ERROR while handling TotalNongrocery')

            try:
                df = df.append(Services().get_df())
            except:
                print('ERROR while handling Services')

        # uncomment for tests
        # df = pd.read_csv(os.path.join('parser_app', 'logic', 'description', 'df_after_handlers_FOR_TESTS.csv'))

        df['date'] = date_now

        df = df.sort_values(['category_id', 'site_link'])

        df['miss'] = 0
        df.reset_index(drop=True, inplace=True)

        path_to_parsed_content_folder = 'parsed_content'
        if not os.path.exists(path_to_parsed_content_folder):
            os.makedirs(path_to_parsed_content_folder)

        df_path = os.path.join('parsed_content',
                               'data_test_{}.csv'.format(date_now))
        pivot_path = os.path.join('parsed_content',
                                  'pivot_test_{}.csv'.format(date_now))

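        # the pivot counts unique product links per category_id, broken down by
        # (type, site_code): a quick completeness check of the parsing run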
        pivot = df.pivot_table(index='category_id',
                               columns=['type', 'site_code'],
                               values='site_link',
                               aggfunc='nunique')

        if sys.platform.startswith('linux'):
            df.to_csv(df_path)
            pivot.to_csv(pivot_path)
        elif sys.platform.startswith('win'):
            df.to_csv(os.path.join(r'D:\ANE_2', df_path))
            pivot.to_csv(os.path.join(r'D:\ANE_2', pivot_path))
        else:
            raise ValueError("operating system not supported")

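        # empty or missing old prices are encoded as -1.0, presumably so the
        # numeric price_old column in the database accepts them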
        df['price_old'] = df['price_old'].replace('', -1.0)
        df['price_old'] = df['price_old'].fillna(-1.0)

        cached_list = []
        print('Storing raw prices to db...')
        for _, row in df.iterrows():
            prod = PricesRaw(
                date=row['date'],
                type=row['type'],
                category_id=row['category_id'],
                category_title=row['category_title'],
                site_title=row['site_title'],
                price_new=row['price_new'],
                price_old=row['price_old'],
                site_unit=row['site_unit'],
                site_link=row['site_link'],
                site_code=row['site_code'],
                miss=row['miss'],
            )
            cached_list.append(prod)
        PricesRaw.objects.bulk_create(cached_list)
        print('Storing complete!')

        print('Filling df...')
        filled_df = fill_df(
            pd.DataFrame(list(PricesRaw.objects.all().values())))
        filled_df.to_csv(os.path.join('parsed_content', 'filled_df.csv'))
        print('Filling complete!')

        df_gks = GKS_weekly_handler().get_df()
        cached_list = []

        Gks.objects.all().delete()
        print('Storing gks prices to db...')
        for _, row in df_gks.iterrows():
            prod = Gks(
                date=row['date'],
                type=row['type'],
                category_id=row['category_id'],
                category_title=row['category_title'],
                site_title=row['site_title'],
                price_new=row['price_new'],
                price_old=row['price_old'],
                site_unit=row['site_unit'],
                site_link=row['site_link'],
                site_code=row['site_code'],
                miss=row['miss'],
            )
            cached_list.append(prod)
        Gks.objects.bulk_create(cached_list)
        print('Storing complete!')

        print('Getting basket df...')
        basket_df = get_basket_df(
            df_gks[df_gks['type'] == 'food'],
            filled_df[filled_df['type'] == 'food'],
        )
        print('Getting complete!')

        print('Storing basket to db...')
        cached_list = []
        Basket.objects.all().delete()
        for _, row in basket_df.iterrows():
            prod = Basket(
                date=row['date'],
                gks_price=row['gks_price'],
                online_price=row['online_price'],
            )
            cached_list.append(prod)
        Basket.objects.bulk_create(cached_list)
        print('Storing completed!')

        function_end_time = datetime.now()
        time_execution = str(function_end_time - function_start_time)

        print('PARSING ENDED!\ntotal time of all execution: {}'.format(
            time_execution))
Exemplo n.º 18
0
    def extract_products(self, is_proxy=True):
        if is_proxy == True:
            proxies = get_proxy('https://www.perekrestok.ru/')
        else:
            proxies = None
        start_time = datetime.now().minute
        res = pd.DataFrame(columns=[
            'date', 'type', 'category_id', 'category_title', 'site_title',
            'price_new', 'price_old', 'site_unit', 'site_link', 'site_code'
        ])
        fail_array = []
        path_sfb = os.path.join(Global().base_dir, 'description', 'urls.csv')
        sfb_df = pd.read_csv(path_sfb, sep=';', index_col='id')
        hrefs = sfb_df[sfb_df.fillna('')['URL'].str.contains(
            'perekrestok')]['URL'].values
        hrefs = [href for href in hrefs if not isinstance(href, float)]
        # print(hrefs)
        id_n = 0

        for href in tqdm(hrefs):

            n_items_before = len(res)

            category_titles = sfb_df[sfb_df.fillna('')['URL'].str.contains(
                'perekrestok')]['cat_title']

            try:
                html = requests.get(href, proxies=proxies).content
            except requests.exceptions.RequestException:
                proxies = get_proxy(href)
                html = requests.get(href, proxies=proxies).content

            soup = BeautifulSoup(html, 'html.parser')
            # soup.find() returns None when the element is missing (it does not
            # raise), so check the result explicitly instead of using try/except
            helper_div = soup.find(
                'div', {'class': 'xf-sort__total js-list-total'})
            if helper_div is None:
                print('WARNING!!! helper_div was not found in {}'.format(href))
                fail_array.append(href)
                continue
            total_amount = int(
                helper_div.find('span', {
                    'class': 'js-list-total__total-count'
                }).text)
            print('\n' + category_titles.iloc[id_n] +
                  '... items in the category: ' + str(total_amount))
            page = 0

            id_n += 1

            # print('  total_amount: {}'.format(total_amount))
            n_elem = 0
            n_elem_out = 0
            while n_elem < total_amount - n_elem_out:
                # print('n_elem: {} total_amount: {}'.format(n_elem, total_amount))

                total_amount = int(
                    helper_div.find('span', {
                        'class': 'js-list-total__total-count'
                    }).text)

                page += 1
                if href[-1] == '?':
                    href_i = '{}page={}'.format(href, page)
                else:
                    href_i = '{}&page={}'.format(href, page)
                # print('\tgetting page: {}'.format(href_i,page))
                try:
                    html_i = requests.get(href_i, proxies=proxies).content
                    #print('im here')
                except requests.exceptions.RequestException:
                    proxies = get_proxy(href_i)
                    html_i = requests.get(href_i, proxies=proxies).content
                soup = BeautifulSoup(html_i, 'html.parser')
                products_div = soup.find('div', {'class': 'js-catalog-wrap'})
                price_list = products_div.find_all(
                    'div', {'class': 'xf-product js-product'})
                # print('price_list:{}\n\n'.format(products_div,price_list))
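                # count tiles whose class matches *ot-activ* (apparently the
                # "not active" / out-of-stock products) so the paging loop can
                # stop once all available items have been seen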
                n_elem_out += len(
                    products_div.find_all(
                        'div', {'class': re.compile(r'\w*ot-activ\w+')}))
                # print(n_elem_out)
                for price_elem in price_list:
                    n_elem += 1
                    price_dict = dict()
                    price_dict['date'] = Global().date
                    price_dict['site_code'] = 'perekrestok'
                    price_dict['category_id'] = id_n
                    price_dict['category_title'] = \
                        category_titles.iloc[id_n - 1]
                    aref = price_elem.find('div', {'class': 'xf-product__title xf-product-title'}).\
                        find('a', {'class': 'xf-product-title__link js-product__title'})

                    price_dict['site_title'] = aref.text.strip()

                    if not filter_flag(id_n, price_dict['site_title']):
                        # print("skipped position: {}".format(price_dict['site_title']))
                        continue
                    cost_div = price_elem.find(
                        'div', {'class': 'xf-product__cost xf-product-cost'})
                    if cost_div is None:
                        continue
                    sale_div = cost_div.find(
                        'div', {'class': 'xf-price xf-product-cost__prev'})

                    if sale_div:

                        posted_price_div = cost_div.find(
                            'div', {
                                'class':
                                'xf-price xf-product-cost__current js-product__cost _highlight'
                            })
                        price_dict['price_new'] = int(
                            posted_price_div.find('span', {
                                'class': 'xf-price__rouble'
                            }).text)
                        pennies_cost_div = posted_price_div.find(
                            'span', {'class': 'xf-price__penny'})
                        if pennies_cost_div is not None:
                            pennies_cost = float(
                                pennies_cost_div.text.strip().replace(
                                    ',', '.', 1))
                        else:
                            pennies_cost = 0.0

                        price_dict['price_old'] = tofloat(sale_div.text)
                    else:
                        price_dict['price_new'] = int(
                            cost_div.find('span', {
                                'class': 'xf-price__rouble'
                            }).text)
                        pennies_cost_div = cost_div.find(
                            'span', {'class': 'xf-price__penny'})
                        if pennies_cost_div is not None:
                            pennies_cost = float(
                                pennies_cost_div.text.strip().replace(
                                    ',', '.', 1))
                        else:
                            pennies_cost = 0.0
                        price_dict['price_old'] = ''

                    site_unit_div = cost_div.find('span',
                                                  {'class': 'xf-price__unit'})

                    if site_unit_div is not None:
                        site_unit = site_unit_div.text.split(
                            r'/')[-1].split()[0]
                    else:
                        site_unit = 'шт'
                    price_dict['price_new'] += pennies_cost
                    price_dict['site_unit'] = site_unit
                    price_dict['site_link'] = aref.get('href')
                    price_dict['type'] = 'food'
                    '''
                    print('site_title: {}\nprice_new: {}\nprice_old: {}\n\n'.format(price_dict['site_title'],
                                                                                        price_dict['price_new'],
                                                                                        price_dict['price_old']))
                    '''

                    res = res.append(price_dict, ignore_index=True)

                    # print('   length of res:{}'.format(len(res)))

            # print('\t\tparsed {} items'.format(len(res)- n_items_before))

        end_time = datetime.now()
        time_execution = str(end_time - start_time)
        print(
            'PEREKRESTOK has been parsed successfully\ntotal execution time: {}'
            .format(time_execution))
        if fail_array:
            print('FAIL URLS:')
            for elem in fail_array:
                print(elem)
        return res
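The parser above also relies on a tofloat helper (used for the crossed-out old price) and a filter_flag helper, neither of which appears in this excerpt. Below is a minimal sketch of what tofloat might look like; the regex-based extraction is an assumption about the project's actual implementation.

import re


def tofloat(text):
    # Sketch of the assumed helper: extract the first number from a price
    # string such as ' 129,90 ' and return it as a float, or None if the
    # string contains no number.
    match = re.search(r'\d+(?:[.,]\d+)?', text)
    if match is None:
        return None
    return float(match.group(0).replace(',', '.'))

For example, tofloat(' 129,90 ') evaluates to 129.9 under this sketch.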