def extract_products(self):
    path = Global().path_chromedriver  # put the chromedriver path here
    path_sfb = os.path.join(Global().base_dir, 'description', 'urls.csv')
    sfb_df = pd.read_csv(path_sfb, sep=';', index_col='id')
    # keywords = df_desc.loc[['7', '18', '21']]['Ключевы слова, которые должны присутствовать'].values
    urls = sfb_df.fillna('')[sfb_df['URL'].fillna('').str.contains('piluli')]['URL'].values
    ids = sfb_df.fillna('')[sfb_df['URL'].fillna('').str.contains('piluli')].index.astype(int)
    category_titles = sfb_df.fillna('')[sfb_df['URL'].fillna('').str.contains('piluli')]['cat_title'].values
    # start parsing
    res = pd.DataFrame(columns=[
        'date', 'type', 'category_id', 'category_title', 'site_title',
        'price_new', 'price_old', 'site_unit', 'site_link', 'site_code'
    ])
    # options = webdriver.ChromeOptions()
    # options.add_argument('--headless')
    # options.add_argument('--disable-gpu')
    driver = webdriver.Chrome(executable_path=path,
                              chrome_options=Global().chrome_options)
    for index, link in enumerate(urls):
        price_dict = dict()
        print(link)
        driver.get(link)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        price_dict['category_id'] = ids[index]
        price_dict['date'] = Global().date
        price_dict['site_code'] = 'piluli'
        price_dict['site_unit'] = 'шт.'
        price_dict['type'] = 'non-food'
        price_dict['category_title'] = category_titles[index]
        price_dict['site_link'] = link
        price_dict['site_title'] = soup.find('h1', {'id': 'offer-title'}).text
        price_dict['price_new'] = int(soup.find('span', {'id': 'products_price'}).text)
        old_price_tag = soup.find('span', {'class': 'old-price'})
        price_dict['price_old'] = int(old_price_tag.text) if old_price_tag.text != '\n' else ''
        res = res.append(price_dict, ignore_index=True)
    driver.quit()
    return res
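
# The bare int(...) casts above assume the price span holds only digits;
# markup like "1 234 ₽" or a missing tag raises. A defensive sketch
# (parse_int_price is an illustrative helper, not part of the module):
import re

def parse_int_price(tag):
    """Return the first integer in a tag's text, or None if absent."""
    if tag is None:
        return None
    digits = re.search(r'\d+', tag.text.replace('\xa0', '').replace(' ', ''))
    return int(digits.group()) if digits else None

# parse_int_price(soup.find('span', {'id': 'products_price'})) -> 1234 for "1 234 ₽"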
def get_proxy(self):  # optional, in case a proxy is needed
    proxy = None
    success = False
    while not success:
        driver = webdriver.Chrome(executable_path=Global().path_chromedriver)
        driver.get("https://hidemyna.me/ru/proxy-list/?maxtime=300&ports=3128..")
        while True:
            # time.sleep(1)
            if "maxtime" in driver.page_source:
                ip_list = re.findall(r'\d{2,3}[.]\d{2,3}[.]\d{2,3}[.]\d{2,3}',
                                     driver.page_source)
                break
        driver.quit()
        for it in range(5):
            print('it =', it)
            proxy = random.choice(ip_list[1:]) + ":3128"
            driver = webdriver.Chrome(executable_path=Global().path_chromedriver,
                                      chrome_options=Global().chrome_options)
            driver.get("https://ozon.ru")
            try:
                WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.ID, "__nuxt")))
                success = True
            except Exception:  # WebDriverWait timed out; try the next proxy
                pass
            finally:
                driver.quit()
            if success:
                break
    print('good proxy: {}'.format(proxy))
    return proxy
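
# The "ip:port" string returned by get_proxy plugs into requests via a
# proxies mapping. A minimal usage sketch (proxies_dict is illustrative):
import requests

def proxies_dict(proxy):
    """Map "ip:port" to the dict requests expects for both schemes."""
    return {'http': 'http://' + proxy, 'https': 'http://' + proxy}

# r = requests.get('https://www.ozon.ru', proxies=proxies_dict('1.2.3.4:3128'), timeout=10)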
def extract_product_page(self):
    site_code = 'mvideo'
    ua = UserAgent()
    header = {'User-Agent': str(ua.chrome)}
    desc_df = Global().desc_df
    links_df = Global().links.replace(np.nan, '')
    links_df = links_df[links_df['site_link'].str.contains(site_code)]
    # print(links_df.head())
    category_ids = links_df.category_id.unique()
    res = pd.DataFrame(columns=[
        'date', 'type', 'category_id', 'category_title', 'site_title',
        'price_new', 'price_old', 'site_unit', 'site_link', 'site_code'
    ])
    # proxies = get_proxy('https://www.utkonos.ru/')
    for cat_id in tqdm(category_ids):  # fixed
        url_list = links_df[links_df.category_id == cat_id].site_link.values
        category_title = desc_df.loc[cat_id, 'cat_title']
        print("{}... ".format(category_title))
        # print(' id_n =', id_n)
        i = 0
        while i + 1 <= len(url_list):
            href_i = url_list[i]
            i += 1
            page = 0
            print(href_i)
            r = requests.get(href_i, headers=header)
            html = r.content
            soup = BeautifulSoup(html, 'html.parser')
            # print('soup:\n', soup)
            price_dict = dict()
            price_dict['date'] = Global().date
            price_dict['site_code'] = site_code
            price_dict['category_id'] = int(cat_id)
            price_dict['category_title'] = category_title
            price_dict['site_title'] = wspex_space(
                soup.find('h1', {'class': 'e-h1 sel-product-title'}).text)
            price_dict['site_link'] = href_i
            # if filter_flag(id_n, price_dict['site_title']) == False:
            #     print(" skipped position: {}".format(price_dict['site_title']))
            #     continue
            div_sale = soup.find('div', {'class': 'c-pdp-price__old'})
            # print('div_sale:', div_sale)
            if div_sale is not None and div_sale.text != '':
                price_dict['price_old'] = float(re.match(r'\d+', wspex(div_sale.text))[0])
            else:
                price_dict['price_old'] = ''
            div_new = soup.find('div', {'class': 'c-pdp-price__current sel-product-tile-price'})
            price_dict['price_new'] = float(re.match(r'\d+', wspex(div_new.text))[0])
            price_dict['site_unit'] = 'шт.'
            print('site_title: {}\nprice_new: {}\nprice_old: {}\nunit: {}\n'.format(
                price_dict['site_title'], price_dict['price_new'],
                price_dict['price_old'], price_dict['site_unit']))
            price_dict['type'] = 'non-food'
            res = res.append(price_dict, ignore_index=True)
    print('MVIDEO has been parsed successfully')
    return res
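
# Note: DataFrame.append, used throughout this module, was deprecated in
# pandas 1.4 and removed in 2.0. A minimal sketch of the replacement
# pattern (collect dicts, build the frame once); names are illustrative:
import pandas as pd

def rows_to_frame(rows, columns):
    """Build the result frame in one shot from collected row dicts."""
    return pd.DataFrame(rows, columns=columns)

# Inside a parser: rows.append(price_dict) per product, then
# res = rows_to_frame(rows, res_columns) instead of res.append(...) per row.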
def extract_products(self):
    start_time = datetime.now()
    path_sfb = os.path.join(Global().base_dir, 'description', 'urls.csv')
    sfb_df = pd.read_csv(path_sfb, sep=';', index_col='id')
    hrefs = sfb_df[sfb_df.fillna('')['URL'].str.contains('globus')]['URL'].values
    id_n = 0
    res = pd.DataFrame(columns=[
        'date', 'type', 'category_id', 'category_title', 'site_title',
        'price_new', 'price_old', 'site_unit', 'site_link', 'site_code'
    ])
    proxies = get_proxy(hrefs[0])
    header = {'User-Agent': UserAgent().chrome}  # fixed: requests expects a dict
    for href in tqdm(hrefs):  # fixed
        id_n += 1
        category_title = sfb_df[sfb_df.fillna('')['URL'].str.contains('globus')]['cat_title'].iloc[id_n - 1]
        print("{}... ".format(category_title))
        # print(' id_n =', id_n)
        url_list = list_html(href)
        i = 0
        while i + 1 <= len(url_list):
            url = url_list[i]
            i += 1
            it_error = 0
            page = 0
            while True:
                page += 1
                href_i = self.construct_html(url, page)
                # print('loading {} ...'.format(href_i))
                try:
                    clever_sleep()
                    if proxies is not None:
                        r = requests.get(href_i, proxies=proxies, headers=header, timeout=10)
                    else:
                        r = requests.get(href_i, headers=header, timeout=10)
                except:
                    # fixed: rotate proxies until the page responds (r was unbound here)
                    while True:
                        proxies = get_proxy(href_i)
                        time.sleep(3)
                        try:
                            r = requests.get(href_i, proxies=proxies, headers=header, timeout=10)
                        except:
                            continue
                        if r.status_code == 200:
                            break
                html = r.content
                soup = BeautifulSoup(html, 'lxml')
                products_div = soup.find('div', {'class': 'catalog-section'})
                if not products_div:
                    print('WARNING! {} has no products_div'.format(href_i))
                    it_error += 1
                    if it_error > 5:
                        break
                    else:
                        continue
                amount_div = soup.find('div', {'class': 'catalog-content'})
                total_amount = int('0' + amount_div.find('h1').find('sub').text.split(' ')[0])
                price_list = products_div.find_all('div', {'class': 'catalog-section__item__body trans'})
                flag_nextpage = page * 64 < total_amount
                for price_elem in price_list:
                    price_dict = dict()
                    price_dict['date'] = Global().date
                    price_dict['site_code'] = 'globus'
                    price_dict['category_id'] = id_n
                    price_dict['category_title'] = category_title
                    price_dict['type'] = 'food'
                    price_dict['site_title'] = price_elem.find(
                        'span', {'class': 'catalog-section__item__title'}).text
                    # print('category_title: {}\nsite_title: {}'.format(price_dict['category_title'], price_dict['site_title']))
                    if filter_flag(id_n, price_dict['site_title']) == False:
                        # print("skipped position: {}".format(price_dict['site_title']))
                        continue
                    price_text_rub_div = price_elem.find('span', {'class': 'item-price__rub'})
                    price_text_kop_div = price_elem.find('span', {'class': 'item-price__kop'})
                    price_text_old_div = price_elem.find('span', {'class': 'item-price__old'})
                    if not price_text_rub_div or not price_text_kop_div:
                        continue
                    try:
                        price_dict['price_new'] = (int(price_text_rub_div.text.replace(" ", ""))
                                                   + 0.01 * int(price_text_kop_div.text))
                    except:
                        price_dict['price_new'] = (int(price_text_rub_div.text.replace("\xa0", ""))
                                                   + 0.01 * int(price_text_kop_div.text))
                    if price_text_old_div:
                        list_ = re.findall(r'\s+', wspex_space(price_text_old_div.text))
                        if len(list_) == 2:
                            price_text = re.sub(r'\s+', '', wspex_space(price_text_old_div.text), count=1)
                            price_text = re.sub(r'\s+', '.', price_text)
                        else:
                            price_text = re.sub(r'\s+', '.', wspex_space(price_text_old_div.text))
                        price_dict['price_old'] = float(price_text)
                    else:
                        price_dict['price_old'] = ''
                    price_dict['site_unit'] = price_elem.find(
                        'span', {'class': 'item-price__additional item-price__additional--solo'}).text.strip()
                    price_dict['site_link'] = price_elem.find(
                        'a', {'class': 'catalog-section__item__link catalog-section__item__link--one-line notrans'}).get('href')
                    res = res.append(price_dict, ignore_index=True)
                if not flag_nextpage:
                    break
    time_execution = str(datetime.now() - start_time)  # fixed: minute arithmetic could go negative
    print('GLOBUS has been parsed successfully\ntotal time of execution: {}'.format(time_execution))
    return res
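
# The old-price branch above turns "1 234 56" (thousands gap plus kopecks)
# into 1234.56 and "234 56" into 234.56. The same logic as a standalone
# sketch; re.sub(r'\s+', ' ', ...) stands in for wspex_space:
import re

def parse_old_price(text):
    """Collapse whitespace, then treat the last gap as the decimal point."""
    text = re.sub(r'\s+', ' ', text).strip()
    if len(re.findall(r'\s+', text)) == 2:  # thousands gap + kopeck gap
        text = re.sub(r'\s+', '', text, count=1)
    return float(re.sub(r'\s+', '.', text))

# parse_old_price('1 234 56') -> 1234.56 ; parse_old_price('234 56') -> 234.56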
def extract_product_page(self):
    site_code = 'globus'
    desc_df = Global().desc_df
    links_df = Global().links
    links_df = links_df[links_df['site_link'].str.contains(site_code)]
    ua = UserAgent()
    header = {'User-Agent': str(ua.chrome)}
    if Global().max_links is not None:
        links_df = links_df.iloc[:Global().max_links]
    category_ids = links_df.category_id.unique()
    res = pd.DataFrame(columns=[
        'date', 'type', 'category_id', 'category_title', 'site_title',
        'price_new', 'price_old', 'site_unit', 'site_link', 'site_code'
    ])
    proxies = get_proxy('https://online.globus.ru/')
    for cat_id in tqdm(category_ids):  # fixed
        url_list = links_df[links_df.category_id == cat_id].site_link.values
        category_title = desc_df.loc[cat_id, 'cat_title']
        print("{}... ".format(category_title))
        # print(' id_n =', id_n)
        i = 0
        while i + 1 <= len(url_list):
            url = url_list[i]
            i += 1
            print('{} ...'.format(url))
            try:
                # time.sleep(3)
                if proxies is not None:
                    r = requests.get(url, proxies=proxies, headers=header, timeout=10)  # CRITICAL
                else:
                    r = requests.get(url, headers=header, timeout=10)
            except:
                while True:
                    try:
                        proxies = get_proxy(url)
                        time.sleep(3)
                        r = requests.get(url, proxies=proxies, headers=header)
                        if r.status_code == 200:
                            break
                    except:
                        continue
            html = r.content
            soup = BeautifulSoup(html, 'lxml')
            products_div = soup.find('div', {'class': 'item-card__content--right'})
            price_dict = dict()
            price_dict['date'] = Global().date
            price_dict['site_code'] = site_code
            price_dict['category_id'] = cat_id
            price_dict['category_title'] = category_title
            price_dict['type'] = 'food'
            try:
                price_dict['site_title'] = wspex_space(
                    products_div.find('h1', {'class': 'js-with-nbsp-after-digit'}).text)
            except:
                # print('OOPS! {} has not been parsed'.format(url))
                continue
            # if filter_flag(id_n, price_dict['site_title']) == False:
            #     print("skipped position: {}".format(price_dict['site_title']))
            #     continue
            price_div = products_div.find('span', {'class': 'item-price'})
            price_text_rub_div = price_div.find('span', {'class': 'item-price__rub'})
            price_text_kop_div = price_div.find('span', {'class': 'item-price__kop'})
            price_text_old_div = price_div.find('span', {'class': 'item-price__old'})
            if not price_text_rub_div or not price_text_kop_div:
                continue
            try:
                price_dict['price_new'] = (int(price_text_rub_div.text.replace(" ", ""))
                                           + 0.01 * int(price_text_kop_div.text))
            except:
                price_dict['price_new'] = (int(price_text_rub_div.text.replace("\xa0", ""))
                                           + 0.01 * int(price_text_kop_div.text))
            if price_text_old_div:
                list_ = re.findall(r'\s+', wspex_space(price_text_old_div.text))
                if len(list_) == 2:
                    price_text = re.sub(r'\s+', '', wspex_space(price_text_old_div.text), count=1)
                    price_text = re.sub(r'\s+', '.', price_text)
                else:
                    price_text = re.sub(r'\s+', '.', wspex_space(price_text_old_div.text))
                price_dict['price_old'] = float(price_text)
            else:
                price_dict['price_old'] = ''
            price_dict['site_unit'] = products_div.find('span', {'class': 'item-price__unit'}).text.strip()
            price_dict['site_link'] = url
            print('site_title: {}\nprice_new: {}\nprice_old: {}\nunit: {}\n'.format(
                price_dict['site_title'], price_dict['price_new'],
                price_dict['price_old'], price_dict['site_unit']))
            res = res.append(price_dict, ignore_index=True)
    print('GLOBUS has been parsed successfully')
    return res
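
# The try/except blocks in these methods repeat one idea: fetch, and on
# failure rotate the proxy and retry. A hedged consolidation sketch;
# get_proxy is the module's own helper, max_retries is an added knob:
import time
import requests

def fetch_with_retry(url, headers, proxies=None, max_retries=5):
    """GET url, rotating proxies on errors; returns (response, proxies)."""
    for _ in range(max_retries):
        try:
            r = requests.get(url, headers=headers, proxies=proxies, timeout=10)
            if r.status_code == 200:
                return r, proxies
        except requests.RequestException:
            pass
        proxies = get_proxy(url)  # rotate and retry
        time.sleep(3)
    raise RuntimeError('could not fetch {}'.format(url))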
def extract_products(self):
    start_time = datetime.now()
    res = pd.DataFrame(columns=[
        'date', 'type', 'category_id', 'category_title', 'site_title',
        'price_new', 'price_old', 'site_unit', 'site_link', 'site_code'
    ])
    path_sfb = os.path.join(Global().base_dir, 'description', 'urls.csv')
    sfb_df = pd.read_csv(path_sfb, sep=';', index_col='id')
    hrefs = sfb_df[sfb_df.fillna('')['URL'].str.contains('okeydostavka')]['URL'].values
    hrefs = [href for href in hrefs if type(href) is not float]
    id_n = 0
    # proxies = get_proxy('https://www.okeydostavka.ru/')
    cookie = (
        r"_ga=GA1.2.1743913103.1529597174; _ym_uid=1529597174997115265; _gac_UA-58508147-1=1.1529607077.EAIaIQobChMItoj"
        r"f2rLl2wIVjIeyCh2stAAuEAAYASAAEgLCdvD_BwE; _gid=GA1.2.654182099.1529924428; _ym_d=1529924428; _ym_isad=1; _ym_"
        r"visorc_27891822=w; storeGroup=msk1; ffcId=13151; WC_SESSION"
        r"_ESTABLISHED=true; WC_PERSISTENT=3EJGXVtLqH2nPYh%2FBwXZCgqDdro%3D%0A%3B2018-06-26+21%3A22%3A20.903_1530037336"
        r"387-297473_10151; WC_AUTHENTICATION_-1002=-1002%2CshqcDFo2KYvSQjMlws143PZaUdk%3D; WC_ACTIVEPOINTER=-20%2C10151;"
        r"WC_GENERIC_ACTIVITYDATA=[876474606%3Atrue%3Afalse%3A0%3ACLFoHnycXg06Qmg4qmgtx7v6u%2Bc%3D][com.ibm.commerce"
        r".context.audit.AuditContext|1530037336387-297473][com.ibm.commerce.store.facade.server.context.StoreGeoCodeContext"
        r"|null%26null%26null%26null%26null%26null][CTXSETNAME|Store][com.ibm.commerce.context.globalization.Globalization"
        r"Context|-20%26RUB%26-20%26RUB][com.ibm.commerce.catalog.businesscontext.CatalogContext|12051%26null%26false%26false"
        r"%26false][com.ibm.commerce.context.ExternalCartContext|null][com.ibm.commerce.context.base.BaseContext|10151%26-"
        r"1002%26-1002%26-1][com.ibm.commerce.context.experiment.ExperimentContext|null][com.ibm.commerce.context.entitlement"
        r".EntitlementContext|4000000000000000003%264000000000000000003%26null%26-2000%26null%26null%26null][com.ibm."
        r"commerce.giftcenter.context.GiftCenterContext|null%26null%26null]; isNative=1; searchTermHistory=%7C%D1%81%D0%"
        r"BC%D0%B5%D1%82%D0%B0%D0%BD%D0%B0; gtmListKey=GTM_LIST_SEARCH; tmr_detect=1%7C1530037350771"
    )
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/67.0.3396.87 Safari/537.36',
        'Cookie': cookie,
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'en-US,en;q=0.9,ru;q=0.8',
        'Cache-Control': 'max-age=0'
    }
    for href in tqdm(hrefs):
        page = 0
        max_page_index = 1
        i = 0
        id_n += 1
        category_title = sfb_df[sfb_df.fillna('')['URL'].str.contains('okey')]['cat_title'].iloc[id_n - 1]
        print("{}...".format(category_title))
        while True:
            url_full = self.construct_html(href, i)
            # print('loading url', url_full)
            try:
                r = requests.get(url_full, headers=headers)  # CRITICAL
                clever_sleep()
            except:
                r = None  # fixed: was r = 404, and an int has no status_code
            while r is None or r.status_code != 200:
                proxies = get_proxy(url_full)
                time.sleep(3)
                r = requests.get(url_full, proxies=proxies, headers=headers)
            html = r.content
            soup = BeautifulSoup(html, 'lxml')
            products_div = soup.find('div', {'class': 'product_listing_container'})
            if not products_div:
                continue
            pages_controller_div = soup.find('div', {'class': 'pages pageControlMenu'})
            if not pages_controller_div:
                flag_nextpage = False
            else:
                pages_refs = pages_controller_div.find_all('a', {'class': 'hoverover'})
                page += 1
                for ref in pages_refs:
                    page_index = int(ref.text.strip())
                    if page_index > max_page_index:
                        max_page_index = page_index
                flag_nextpage = max_page_index > page
            price_list = products_div.find_all('div', {'class': 'product ok-theme'})
            i += len(price_list)
            for price_elem in price_list:
                price_dict = dict()
                price_dict['date'] = Global().date
                price_dict['site_code'] = 'okey'
                price_dict['category_id'] = id_n
                price_dict['category_title'] = category_title
                product_unavailable_div = price_elem.find('div', {'class': 'product-unavailable-text'})
                if product_unavailable_div:
                    continue
                aref = price_elem.find('a')
                price_dict['site_title'] = aref.get('title')
                if filter_flag(id_n, price_dict['site_title']) == False:
                    # print("skipped position: {}".format(price_dict['site_title']))
                    continue
                product_price_script = price_elem.find('script', {'id': 'productData_'})
                script_text = product_price_script.text
                sr = re.search(r'var\s+product\s*=\s*(?P<dct>.+\});\s*$\s*', script_text, re.MULTILINE)
                dct_str = sr.group('dct')
                dct = demjson.decode(dct_str)  # yaml and json fail here
                price_dict['price_new'] = dct['price']  # price, product title, and link to it
                sale_div = price_elem.find('span', {'class': 'label small crossed'})
                if sale_div:
                    list_price = re.search(r'\d+\,\d+', sale_div.text)
                    price_dict['price_old'] = tofloat(list_price[0])
                else:
                    price_dict['price_old'] = ''
                weight_div = price_elem.find('div', {'class': 'product_weight'})
                if weight_div:
                    price_dict['site_unit'] = wspex_space(weight_div.text)
                else:
                    quantity_div = price_elem.find('div', {'class': 'quantity_section'})
                    if quantity_div:
                        price_dict['site_unit'] = '1 уп.'
                    else:
                        print('[okey] For product', price_dict['site_title'], 'weight not found!')
                price_dict['site_link'] = aref.get('href')  # product title and link to it
                price_dict['type'] = 'food'
                res = res.append(price_dict, ignore_index=True)
            if flag_nextpage == False:
                break
    time_execution = str(datetime.now() - start_time)  # fixed: minute arithmetic could go negative
    print('OKEY has been parsed successfully\ntotal time of execution: {}'.format(time_execution))
    return res
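
# The productData_ <script> above holds "var product = {...};" with
# unquoted JS keys, which is why demjson (not json) decodes it. The
# extraction step as a standalone sketch:
import re
import demjson

def parse_product_script(script_text):
    """Pull the JS object literal out of "var product = {...};"."""
    sr = re.search(r'var\s+product\s*=\s*(?P<dct>.+\});', script_text)
    return demjson.decode(sr.group('dct'))

# parse_product_script('var product = {id: 1, price: 99.9};')
#   -> {'id': 1, 'price': 99.9}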
def extract_product_page(self):
    site_code = 'okey'
    # ua = UserAgent()
    # header = {'User-Agent': str(ua.chrome)}
    desc_df = Global().desc_df
    links_df = Global().links
    links_df = links_df[links_df['site_link'].str.contains(site_code)]
    if Global().max_links is not None:
        links_df = links_df.iloc[:Global().max_links]
    if Global().is_selenium_okey:
        path = Global().path_chromedriver
        # options = webdriver.ChromeOptions()
        driver = webdriver.Chrome(executable_path=path,
                                  chrome_options=Global().chrome_options)
    category_ids = links_df.category_id.unique()
    res = pd.DataFrame(columns=[
        'date', 'type', 'category_id', 'category_title', 'site_title',
        'price_new', 'price_old', 'site_unit', 'site_link', 'site_code'
    ])
    # proxies = get_proxy('https://okeydostavka.ru/')
    proxies = None
    ua = UserAgent()
    cookie = (
        r'_ga=GA1.2.1325218443.1577886613; gtmListKey=GTM_LIST_RECOMENDATIONS; _ym_'
        r'uid=15778866221036907447; _ym_d=1577886622; isNative=1; selectedCity=%D0%9C%D0%'
        r'BE%D1%81%D0%BA%D0%B2%D0%B0; selectedStore=10151_13151; acceptCookie=1; storeGroup=msk1;'
        r'ffcId=13151; WC_SESSION_ESTABLISHED=true; WC_AUTHENTICATION_-1002=-1002%2CzZHlyRjQcgW'
        r'KqNcfDjyX4iZ02zjcQoyDurbFiQxFNVk%3D; WC_ACTIVEPOINTER=-20%2C10151; WC_USERACTIVITY_-1'
        r'002=-1002%2C10151%2Cnull%2Cnull%2Cnull%2Cnull%2Cnull%2Cnull%2Cnull%2Cnull%2C1877362032%2C'
        r'ver_null%2CDg2tDaIGqtvlUd7GeVDIZu1DtkcjFvj1SdTgnMiPwCmRMdhqBYKQ9oMgiku72VhoL3OKnTP2aV5k8V'
        r'zF6ztiaJ508J0SZkHyBJdFQodkOMqqwSEr%2Bg%2B0C1rETa4auryIDSq4FP7c1urrNfoJqDzAkdVBlG8NuO0KAfb'
        r'PocosaJL1o7xK78QvuQz25bWv8w%2BzRoaWagOu7%2BQUD%2B%2FGPrl94xaDOHhYYdgsXrofcc04xzx0c%2BlK6F'
        r'FHANLAGseWFGCm; WC_GENERIC_ACTIVITYDATA=[1996034293%3Atrue%3Afalse%3A0%3AaSne5YGZoxA4Mpz2'
        r'j8qE86%2FndHXVreuwTKmYZIVqRY4%3D][com.ibm.commerce.context.entitlement.EntitlementContext'
        r'|4000000000000000003%264000000000000000003%26null%26-2000%26null%26null%26null][com.ibm'
        r'.commerce.context.audit.AuditContext|null][com.ibm.commerce.context.globalization.Global'
        r'izationContext|-20%26RUB%26-20%26RUB][com.ibm.commerce.store.facade.server.context.StoreG'
        r'eoCodeContext|null%26null%26null%26null%26null%26null][com.ibm.commerce.catalog.businessc'
        r'ontext.CatalogContext|12051%26null%26false%26false%26false][com.ibm.commerce.context.exp'
        r'eriment.ExperimentContext|null][com.ibm.commerce.context.ExternalCartContext|null][com.ib'
        r'm.commerce.context.bcsversion.BusinessContextVersionContext|null][CTXSETNAME|Store][com.ib'
        r'm.commerce.context.base.BaseContext|10151%26-1002%26-1002%26-1][com.ibm.commerce.giftcenter.context.GiftCenterContext|null%26null%26null]; '
        r'solarfri=6a3c99192124a2fe; _gid=GA1.2.311834681.1579169412; _ym_isad=1; '
        r'JSESSIONID=0000LPiEiWXPfA6ejMPrOUxMf90:-1; _gat_UA-58508147-1=1; _ym_visorc_27891822=w'
    )
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Cookie': cookie,
        'Host': 'www.okeydostavka.ru',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-User': '******',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': str(ua.chrome),
    }
    for cat_id in tqdm(category_ids):  # fixed
        url_list = links_df[links_df.category_id == cat_id].site_link.values
        category_title = desc_df.loc[cat_id, 'cat_title']
        print("{}... ".format(category_title))
        n_err = 0
        # print(' id_n =', id_n)
        i = 0
        while i + 1 <= len(url_list):
            href_i = url_list[i]
            print(href_i)
            i += 1
            # if i % 10 == 0 and i != 0:
            #     proxies = get_proxy(href_i)
            if Global().is_selenium_okey:
                driver.get(href_i)
                soup = BeautifulSoup(driver.page_source, 'html.parser')
                # driver.close()
            else:
                try:
                    # clever_sleep()
                    if proxies is not None:
                        r = requests.get(href_i, proxies=proxies, headers=headers, timeout=60)  # CRITICAL
                    else:
                        r = requests.get(href_i, headers=headers, timeout=60)
                except Exception as e:
                    print(str(e) + '!')
                    while True:
                        try:
                            proxies = get_proxy(href_i)
                            time.sleep(3)
                            r = requests.get(href_i, headers=headers, proxies=proxies, timeout=60)
                            if r.status_code == 200:
                                break
                        except Exception as e:
                            print(str(e) + '!')
                            continue
                html = r.content
                soup = BeautifulSoup(html, 'lxml')
            # print('url: ', href_i)
            products_div = soup.find('div', {'class': re.compile(
                r'col-8\s+col-lg-7\s+col-md-6\s+col-sm-12\s+product-information')})  # col4 product-information
            # if soup.find('ul', {'class': 'categoryList catalog-menu__category-list'}) is not None:
            #     print('yes, catalog is here!')
            # else:
            #     print('no')
            if products_div is None:
                print('no products_div!')
                # proxies = get_proxy('https://okeydostavka.ru/')
                if soup.find('ul', {'class': 'categoryList catalog-menu__category-list'}) is None:
                    print('OOPS, it seems that we have been blocked!')
                    print(soup.text)
                    i -= 1
                    proxies = get_proxy('https://okeydostavka.ru/')
                continue
            price_dict = dict()
            price_dict['date'] = Global().date
            price_dict['site_code'] = 'okey'
            price_dict['category_id'] = cat_id
            price_dict['category_title'] = category_title
            price_dict['site_title'] = wspex_space(
                products_div.find('h1', {'class': 'main_header'}).text)
            # print('site_title:{}\nurl:{}\n\n'.format(price_dict['site_title'], href_i))
            # if filter_flag(id_n, price_dict['site_title']) == False:
            #     print("skipped position: {}".format(price_dict['site_title']))
            #     continue
            if re.search(r'price\s+label\s+label-red\s*', products_div.text) is not None:
                print(href_i, 'has sale!')
            try:
                if products_div.find('span', {'class': re.compile(r'price\s+label\s+label-red\s*')}) is not None:
                    price_new_div = wspex(products_div.find(
                        'span', {'class': re.compile(r'price\s+label\s+label-red\s*')}).text)
                    sale_div = products_div.find('span', {'class': 'label small crossed'})
                    price_dict['price_new'] = float(
                        re.search(r'\d+\,\d+', price_new_div)[0].replace(',', '.'))
                    price_dict['price_old'] = float(
                        re.search(r'\d+\,\d+', sale_div.text)[0].replace(',', '.'))
                else:
                    price_dict['price_new'] = products_div.find(
                        'span', {'class': re.compile(r'price\s+label\s*')})  # price, product title, and link to it
                    price_dict['price_new'] = float(
                        re.search(r'\d+\,\d+', price_dict['price_new'].text)[0].replace(',', '.'))
                    price_dict['price_old'] = ''
            except:
                continue
            piece_units = ['шт', 'штук', 'штуки', 'штука', 'пак', 'пакетиков', 'пак']
            kg_units = ['кг', 'kg', 'килограмм']  # keep in grams
            gram_units = ['г', 'g', 'грамм', 'граммов', 'гр']  # in kg
            litre_units = ['л', 'l', 'литр', 'литров', 'литра']
            ml_units = ['мл', 'ml', 'миллилитров', 'миллилитра']
            tenpiece_units = ['10 шт', '10 шт.', '10шт', '10шт.', 'десяток', 'дес.']
            kg_pattern = (r'\s+(?:\d{1,4}[×,.]\d{1,4}|\d{0,4})\s*(?:'
                          + r'|'.join(kg_units) + r')' + r'(?:\s+|$)')
            g_pattern = (r'\s+(?:\d{1,4}[×,.]\d{1,4}|\d{0,4})\s*(?:'
                         + r'|'.join(gram_units) + r')' + r'(?:\s+|$)')
            l_pattern = (r'\s+(?:\d{1,4}[×,.]\d{1,4}|\d{0,4})\s*(?:'
                         + r'|'.join(litre_units) + r')' + r'(?:\s+|$)')
            ml_pattern = (r'\s+(?:\d{1,4}[×,.]\d{1,4}|\d{0,4})\s*(?:'
                          + r'|'.join(ml_units) + r')' + r'(?:\s+|$)')
            piece_pattern = (r'\s+(?:\d{1,4}[×,.]\d{1,4}|\d{0,4})\s*(?:'
                             + r'|'.join(piece_units) + r')' + r'(?:\s+|$)')
            tenpiece_pattern = (r'\s*(?:\d{1,4}[×,.]\d{1,4}|\d{0,4})\s*(?:'
                                + r'|'.join(tenpiece_units) + r')' + r'(?:\s+|$)')
            patterns = [piece_pattern, tenpiece_pattern, kg_pattern,
                        g_pattern, l_pattern, ml_pattern]
            price_dict['site_unit'] = None
            for pattern in patterns:
                match = re.search(pattern, price_dict['site_title'].lower())
                if match:
                    price_dict['site_unit'] = wspex_space(match[0])
                    # print(price_dict['site_unit'])
            if price_dict['site_unit'] is None:
                price_dict['site_unit'] = 'шт.'
            price_dict['site_link'] = href_i  # product title and link to it
            price_dict['type'] = 'food'
            print('site_title: {}\nprice_new: {}\nprice_old: {}\nunit: {}\n'.format(
                price_dict['site_title'], price_dict['price_new'],
                price_dict['price_old'], price_dict['site_unit']))
            res = res.append(price_dict, ignore_index=True)
    print('OKEY has been parsed successfully')
    return res
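
# A quick standalone check of how the unit patterns above behave on a
# realistic title (pattern rebuilt from the same gram_units list):
import re

gram_units = ['г', 'g', 'грамм', 'граммов', 'гр']
g_pattern = (r'\s+(?:\d{1,4}[×,.]\d{1,4}|\d{0,4})\s*(?:'
             + r'|'.join(gram_units) + r')(?:\s+|$)')

def unit_from_title(title):
    """Return the matched unit fragment of a product title, or None."""
    m = re.search(g_pattern, title.lower())
    return m[0].strip() if m else None

# unit_from_title('Сыр российский 250 г') -> '250 г'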
def extract_products(self, max_prod=200):
    path_sfb = os.path.join(Global().base_dir, 'description', 'urls.csv')
    sfb_df = pd.read_csv(path_sfb, sep=';', index_col='id')
    list_urls = sfb_df.fillna('')[sfb_df.fillna('')['URL'].str.contains('ozon')]['URL'].values
    res = pd.DataFrame(columns=[
        'date', 'type', 'category_id', 'category_title', 'site_title',
        'price_new', 'price_old', 'site_unit', 'site_link', 'site_code'
    ])
    # proxy = self.get_proxy()
    # options = webdriver.ChromeOptions()
    # proxy = get_proxy('http://ozon.ru')  # in case a proxy is needed
    # options.add_argument('--headless')
    # options.add_argument('--disable-gpu')
    # options.add_argument('--proxy-server=%s' % proxy)
    driver = webdriver.Chrome(executable_path=Global().path_chromedriver,
                              chrome_options=Global().chrome_options)  # , chrome_options=self.option_chrome(proxy)
    store = 'ozon'
    driver.implicitly_wait(30)
    id_n = -1
    for url in tqdm(list_urls[id_n + 1:]):
        flag = 0
        id_n += 1
        driver.get(url)
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight*0.01);")
        category_title = sfb_df[sfb_df.fillna('')['URL'].str.contains('ozon')]['cat_title'].iloc[id_n]
        print('\n{} ...'.format(category_title))
        offset = 0
        soup = BeautifulSoup(driver.page_source, 'lxml')
        problem_array = []
        i = 0
        page_n = 0
        # print(url)
        while True:
            tiles_list = soup.findAll('div', {'class': 'tile'})[offset:]  # one container per product
            try:
                n = int(re.search(r'\d+', re.search(r'\d+ товар[а-я]*', soup.text)[0])[0])
            except:
                try:
                    n = int(re.search(r'\d+', soup.find('div', {'class': 'search-title'}).text)[0])
                except:
                    print("ACHTUNG! category {} has not been parsed".format(category_title))
                    break  # fixed: was continue, which spun forever on the same soup
            # print('amount of items: ', n)
            for tile in tiles_list:
                i += 1
                price_dict = dict()
                # print(tile)
                try:
                    price_dict['price_old'] = tile.find('div', {'data-test-id': 'tile-discount'}).text
                    # print('price old:', price_dict['price_old'])
                    price_dict['price_old'] = int(re.search(r'\d+', wspex(price_dict['price_old']))[0])
                except:
                    price_dict['price_old'] = ''
                price_dict['site_unit'] = 'шт.'
                price_dict['site_code'] = store
                price_dict['category_id'] = int(
                    sfb_df.fillna('')[sfb_df.fillna('')['URL'].str.contains('ozon')].index[id_n])
                # print('category_id: ', price_dict['category_id'])
                price_dict['date'] = Global().date
                price_dict['type'] = 'non-food'
                try:
                    price_dict['site_title'] = self.tnout(
                        tile.find('a', {'data-test-id': "tile-name"}).text)
                except:
                    problem_array.append(url)
                    print('OOPS! url {} has not parsed site title'.format(url))
                    break
                price_dict['category_title'] = category_title
                price_dict['price_new'] = tile.find('span', {'class': 'total-price'}).text
                price_dict['price_new'] = int(
                    re.match(r'\d+', self.tnout(wspex(price_dict['price_new'])))[0])
                if tile.find('a', {'class': 'full-cover-link'}) is None:
                    price_dict['site_link'] = ''
                    print("ACHTUNG! link has not parsed for site_title: {}".format(
                        price_dict['site_title']))
                else:
                    price_dict['site_link'] = 'https://www.ozon.ru' + tile.find(
                        'a', {'class': 'full-cover-link'}).get('href')
                # print('site_title[{}]: {}\nprice_new: {}\nprice_old: {}\n\n'.format(
                #     i, price_dict['site_title'], price_dict['price_new'], price_dict['price_old']))
                res = res.append(price_dict, ignore_index=True)
            if i >= n or i >= max_prod or flag == 1:
                print(' parsing has ended!')
                break
            offset = offset + len(tiles_list)
            if offset % 280 == 0 and offset != 0:
                page_n += 11
                url = url + '&page={}'.format(str(page_n))
                driver.get(url)
                print('\n loading url:{}'.format(url))
                offset = 0
                while True:
                    time.sleep(1)
                    soup = BeautifulSoup(driver.page_source, 'lxml')
                    if soup.findAll('div', {'class': 'tile'}) != []:
                        break
            else:
                scheight = 0.9
                while True:
                    driver.execute_script(
                        "window.scrollTo(0, document.body.scrollHeight*{});".format(scheight))
                    soup = BeautifulSoup(driver.page_source, 'lxml')
                    if soup.findAll('div', {'class': 'tile'})[offset:] != []:
                        print(" offset: {}".format(offset))
                        break
                    if scheight < 1:
                        scheight += 0.01
                    else:
                        print('WARNING! Scrolling has not been executed (we are here)')
                        flag = 1
                        break
                    print(scheight)
                    time.sleep(1)
    return res
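
# The nested re.search calls above recover the total item count from
# phrases like "Найдено 128 товаров". The same step as one helper:
import re

def total_items(page_text):
    """Return the count from "... N товаров ..." or None if absent."""
    m = re.search(r'(\d+)\s*товар[а-я]*', page_text)
    return int(m.group(1)) if m else None

# total_items('Найдено 128 товаров') -> 128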
def extract_product_page(self):
    site_code = 'perekrestok'
    desc_df = Global().desc_df
    links_df = Global().links
    links_df = links_df[links_df['site_link'].str.contains(site_code)]
    ua = UserAgent()
    header = {'User-Agent': str(ua.chrome)}
    if Global().max_links is not None:
        links_df = links_df.iloc[:Global().max_links]
    category_ids = links_df.category_id.unique()
    res = pd.DataFrame(columns=[
        'date', 'type', 'category_id', 'category_title', 'site_title',
        'price_new', 'price_old', 'site_unit', 'site_link', 'site_code'
    ])
    proxies = None  # get_proxy('https://www.perekrestok.ru/')
    # cookie = r'noHouse=0; _gcl_au=1.1.444475933.1574074757; _ga=GA1.2.762214331.1574074757; _ym_d=1574074757; _ym_uid=1574074757539893444; flocktory-uuid=3da0c784-c6e6-48a1-b5ad-006da3a9393d-1; tracker_ai_user=BWv32|2019-11-18T10:59:21.089Z; cto_lwid=a238aaa4-fac9-42fb-8702-20f8fa785b79; _dy_c_exps=; _dycnst=dg; _dyid=-3805541292711961998; _dy_c_att_exps=; fcf=2; splitVar=test01-B; regionChange=1; luuid=2a83671e-e74e-43bf-9453-1475f62aefda; ins-product-id=484225; insdrSV=18; suuid=96bfa68c-e76a-4623-9bf0-4109601bdb57; _dy_csc_ses=t; _gid=GA1.2.710391697.1575716218; _dyjsession=f58bf955e8baea66ef52b8df2f36e6db; _dy_geo=RU.EU.RU_TUL.RU_TUL_Kireyevsk; _dy_df_geo=Russia..Kireyevsk; _ym_visorc_43992189=w; _ym_isad=1; _dycst=dk.w.c.ss.; _dy_toffset=-3; _dy_ses_load_seq=22331%3A1575717228721; _dy_soct=401501.688467.1575716213*404726.695596.1575716217*405772.698298.1575717228*405837.698434.1575717228*446004.795652.1575717228*366287.608896.1575717228; tmr_detect=1%7C1575717234838; mindboxDeviceUUID=dc46eafc-5856-4f9a-8f46-c7194b0dc0a5; directCrm-session=%7B%22deviceGuid%22%3A%22dc46eafc-5856-4f9a-8f46-c7194b0dc0a5%22%7D; XSRF-TOKEN=eyJpdiI6ImdJYzV2R2xjWHhOSTFKZTFsOFhRcXc9PSIsInZhbHVlIjoiZHhyajVkTTMrQUNXajducW5NeTk2b2JDVHlkVGhYcU9xdkFmU2pEMlBGQ0RIY1NrWlBQaFc2Y2R5MmZsRFFoUE1KS25KcGZjWDJscmRhV2ZrckNJa3c9PSIsIm1hYyI6IjQzODMyMDU5OTI4YzIwOWFkZDA5ODY2YTA1M2QyNjY1MGM5YWVjYzk0NGQ5MmE4MDY3NDE4M2M1ODAyMGZlZTgifQ%3D%3D; aid=eyJpdiI6IndQU3hKYmtDTHdcL1ZHczZtajc4K2JnPT0iLCJ2YWx1ZSI6ImlJQ1ZcL3NHQjE3emg5cDZKdzRJeUllTXBDNmRPcm9aM1JiWmx2OStGK0J5TnJEWWdxZ1FsbDFCUE5FMnlucEk2RFJNN015R0MrWXFNNUhNaXAxeitBQT09IiwibWFjIjoiZDkxYThiOGI0ZjRmNDYyYzU5M2UwYWVlMjJiNjRjYTcwNDFlZDg0ZDg2YTRjOGY0ODkzMWRmNDc5MTM1MmY3YiJ9; appservername=app1; region=1'
    headers = {
        'Accept': r'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
        'Accept-Encoding': r'gzip, deflate, br',
        'Accept-Language': r'ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7',
        'Cache-Control': r'max-age=0',
        'Connection': r'keep-alive',
        # 'Cookie': cookie,  # fixed: cookie is commented out above, so don't reference it
        'Host': r'www.perekrestok.ru',
        'Referer': r'https://www.perekrestok.ru/',
        'Sec-Fetch-Mode': r'navigate',
        'Sec-Fetch-Site': r'same-origin',
        'Sec-Fetch-User': r'?1',
        'Upgrade-Insecure-Requests': r'1',
        'User-Agent': r'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
    }
    for cat_id in tqdm(category_ids):  # fixed
        url_list = links_df[links_df.category_id == cat_id].site_link.values
        category_title = desc_df.loc[cat_id, 'cat_title']
        print("{}... ".format(category_title))
        # print(' id_n =', id_n)
        i = 0
        while i + 1 <= len(url_list):
            # time.sleep(3)
            href_i = url_list[i]
            print(href_i)
            i += 1
            try:
                if proxies is not None:
                    r = requests.get(href_i, proxies=proxies, headers=headers, timeout=60)  # CRITICAL
                else:
                    r = requests.get(href_i, headers=headers, timeout=60)
            except Exception as e:
                print(e)
                while True:
                    try:
                        proxies = get_proxy(href_i)
                        r = requests.get(href_i, proxies=proxies, headers=headers, timeout=60)
                        time.sleep(3)
                        if r.status_code == 200:
                            break
                    except:
                        continue
            html = r.content
            soup = BeautifulSoup(html, 'lxml')
            price_dict = dict()
            try:
                price_dict['site_title'] = wspex_space(soup.find(
                    'h1', {'class': re.compile(r'js-product__title\s+xf-product-card__title')}).text)
            except:
                print(soup)
                continue  # fixed: the title was never set here, so skip this link
            print('site_title:', price_dict['site_title'])
            products_div = soup.find('div', {'class': 'xf-product__cost xf-product-cost'})
            if not products_div:
                print('no products_div!')
                # print(soup)
                continue
            price_dict['date'] = Global().date
            price_dict['site_code'] = site_code
            price_dict['category_id'] = cat_id
            price_dict['category_title'] = category_title
            div_sale = products_div.find(
                'div', {'class': 'xf-price xf-product-cost__prev js-product__old-cost'})
            if div_sale is not None:
                # print('div-sale:', div_sale)
                price_dict['price_old'] = float(div_sale.get('data-cost'))
            else:
                price_dict['price_old'] = ''
            div_new = products_div.find(
                'div', {'class': 'xf-price xf-product-cost__current js-product__cost _highlight'})
            if div_new is None:
                div_new = products_div.find('div', {'class': re.compile(
                    r'xf-price\s+xf-product-cost__current\s+js-product__cost\s*')})
            if div_new is None:
                print('\tdiv_new is None!')
                # print('products_div:', products_div)
                continue
            price_dict['price_new'] = float(div_new.get('data-cost'))
            price_dict['site_unit'] = wspex_space(div_new.get('data-type'))
            price_dict['site_link'] = href_i  # product title and link to it
            price_dict['type'] = 'food'
            print('price_new: {}\nprice_old: {}\nunit: {}\n'.format(
                price_dict['price_new'], price_dict['price_old'], price_dict['site_unit']))
            res = res.append(price_dict, ignore_index=True)
    print('PEREKRESTOK has been parsed successfully')
    return res
def extract_product_page(self):
    site_code = 'ozon'
    desc_df = Global().desc_df
    links_df = Global().links
    links_df = links_df[links_df['site_link'].str.contains(site_code)]
    if Global().max_links is not None:
        links_df = links_df.iloc[:Global().max_links]
    category_ids = links_df.category_id.unique()
    res = pd.DataFrame(columns=[
        'date', 'type', 'category_id', 'category_title', 'site_title',
        'price_new', 'price_old', 'site_unit', 'site_link', 'site_code'
    ])
    # options = webdriver.ChromeOptions()
    # proxies = get_proxy('https://www.ozon.ru/')
    # options.add_argument('--headless')
    # options.add_argument('--proxy-server=%s' % proxy)
    if Global().is_selenium_ozon is True:
        driver = webdriver.Chrome(executable_path=Global().path_chromedriver,
                                  chrome_options=Global().chrome_options)  # , chrome_options=self.option_chrome(proxy)
    ua = UserAgent()
    header = {'User-Agent': str(ua.chrome)}
    proxies = None
    # Ozon's obfuscated CSS class names; these rotate between front-end releases
    h1_class = 'b4j'
    price_new_class_sale = 'b4u8 b4w0'
    price_new_class = 'b4u8'
    price_old_class = 'b4v2'
    for cat_id in tqdm(category_ids):  # fixed
        url_list = links_df[links_df.category_id == cat_id].site_link.values
        category_title = desc_df.loc[cat_id, 'cat_title']
        print("{}... ".format(category_title))
        i = 0
        while i + 1 <= len(url_list):
            href_i = url_list[i]
            print(href_i)
            if Global().is_selenium_ozon is True:
                driver.get(href_i)
                soup = BeautifulSoup(driver.page_source, 'lxml')
            else:
                try:
                    # time.sleep(3)
                    if proxies is not None:
                        r = requests.get(href_i, proxies=proxies, headers=header)  # CRITICAL
                    else:
                        r = requests.get(href_i, headers=header)
                except:
                    while True:
                        print('im here!')
                        try:
                            proxies = get_proxy(href_i)
                            time.sleep(3)
                            r = requests.get(href_i, proxies=proxies, headers=header)
                            if r.status_code == 200:
                                break
                        except:
                            continue
                html = r.content
                soup = BeautifulSoup(html, 'lxml')
            i += 1
            # print(soup)
            price_dict = dict()
            price_dict['date'] = Global().date
            price_dict['site_code'] = site_code
            price_dict['category_id'] = cat_id
            price_dict['category_title'] = category_title
            try:
                if soup.find('h1', {'class': h1_class}) is not None:
                    price_dict['site_title'] = wspex_space(soup.find('h1', {'class': h1_class}).text)
                    print('site_title:', price_dict['site_title'])
            except:
                print('except: site_title not found')
            if 'Такой страницы не существует' in soup.text:
                print('page does not exist, skipping!')
                continue
                # i -= 1
            if soup.find('li', {'class': 'links-item'}) is None:
                while True:
                    proxies = get_proxy(href_i)
                    time.sleep(3)
                    r = requests.get(href_i, proxies=proxies, headers=header)
                    if r.status_code == 200:
                        break
                    else:
                        print('r.status_code:', r.status_code)
                        continue
                html = r.content  # fixed: re-parse the refetched page
                soup = BeautifulSoup(html, 'lxml')
            # div_new = soup.find('span', {'data-test-id': 'saleblock-first-price'})
            # print('soup:\n', soup)
            # if 'Товар закончился' in soup.text:
            #     print('Товар закончился!')
            #     continue
            div_new = soup.find('span', {'class': price_new_class_sale})
            if div_new is None:
                div_new = soup.find('span', {'class': price_new_class})
            if div_new is None:
                print('item is out of stock!\n')
                continue
            if re.search(r'\d+', wspex(div_new.text)) is None:
                print('item is out of stock!\n')
                continue
            # print('div_new:\n', div_new)
            # old selector: soup.find('span', {'class': 'price-number'})
            div_old = soup.find('span', {'class': price_old_class})
            if div_old is not None:
                price_dict['price_old'] = int(re.search(r'\d+', wspex(div_old.text))[0])
            else:
                price_dict['price_old'] = ''
            price_dict['price_new'] = int(re.search(r'\d+', wspex(div_new.text))[0])
            price_dict['site_unit'] = 'шт.'
            price_dict['site_link'] = href_i  # product title and link to it
            price_dict['type'] = 'non-food'
            print('price_new: {}\nprice_old: {}\nunit: {}\n'.format(
                price_dict['price_new'], price_dict['price_old'], price_dict['site_unit']))
            res = res.append(price_dict, ignore_index=True)
    print('OZON has been parsed successfully')
    return res
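
# Ozon's obfuscated class names (b4j, b4u8, ...) rotate between front-end
# releases, hence the fallback chain above. That pattern generalizes to a
# small helper (first_match is illustrative, not part of the module):
def first_match(soup, selectors):
    """Return the first tag matched by any (name, attrs) pair, else None."""
    for name, attrs in selectors:
        tag = soup.find(name, attrs)
        if tag is not None:
            return tag
    return None

# div_new = first_match(soup, [('span', {'class': 'b4u8 b4w0'}),
#                              ('span', {'class': 'b4u8'})])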
def extract_products(self, max_prod=200):
    # proxies = get_proxy('https://www.lamoda.ru/')
    ua = UserAgent()
    header = {'User-Agent': str(ua.chrome)}
    # number of pages
    path_sfb = os.path.join(Global().base_dir, 'description', 'urls.csv')  # fixed: duplicated assignment
    sfb_df = pd.read_csv(path_sfb, sep=';', index_col='id')
    list_urls = sfb_df[sfb_df.fillna('')['URL'].str.contains('lamoda')]['URL'].values  # lamoda URLs
    res = pd.DataFrame(columns=[
        'date', 'type', 'category_id', 'category_title', 'site_title',
        'price_new', 'price_old', 'site_unit', 'site_link', 'site_code'
    ])
    start_html = 'https://www.lamoda.ru'
    id_n = -1
    fail_list = []
    store = 'lamoda'
    proxies = None  # fixed: was referenced below before assignment
    for url in tqdm(list_urls):
        id_n += 1
        category_title = sfb_df[sfb_df.fillna('')['URL'].str.contains('lamoda')]['cat_title'].iloc[id_n]
        print('\n{} ...'.format(category_title))
        page = 0
        cat_row = pd.DataFrame(columns=[
            'date', 'type', 'category_id', 'category_title', 'site_title',
            'price_new', 'price_old', 'site_unit', 'site_link', 'site_code'
        ])
        try:
            # time.sleep(3)
            r = requests.get(url, headers=header)
        except:
            print('need proxy!')
            proxies = get_proxy(url)
            r = requests.get(url, proxies=proxies, headers=header)
        html = r.content
        soup = BeautifulSoup(html, 'lxml')
        counter = soup.find('span', {'class': 'products-catalog__head-counter'})
        if counter is not None:
            total_amount = int(re.search(r'\d+', wspex(counter.text)).group())
            # print('total_amount: ', total_amount)
        else:
            print('total_amount HAS NOT BEEN FOUND!')
            fail_list.append(id_n)
            continue
        while True:
            # time.sleep(2)
            page += 1
            url_i = url + '?page={}'.format(page)
            print(' loading url:{}'.format(url_i))
            try:
                r = requests.get(url_i, proxies=proxies, headers=header)
            except:
                proxies = get_proxy(url)
                r = requests.get(url_i, proxies=proxies, headers=header)
            html = r.content
            soup = BeautifulSoup(html, 'lxml')
            product_div = soup.findAll('a', {'class': 'products-list-item__link link'})
            for product in product_div:
                product_dict = dict()
                product_dict['category_id'] = int(
                    sfb_df.fillna('')[sfb_df.fillna('')['URL'].str.contains('lamoda')].index[id_n])
                # print('category_id: ', product_dict['category_id'])
                product_dict['date'] = Global().date
                product_dict['site_code'] = store
                product_dict['category_title'] = category_title
                product_dict['site_title'] = wspex_space(
                    product.find('img').attrs['alt'])  # find('div', {'class': 'products-list-item__brand'}).text
                product_dict['site_link'] = start_html + product.attrs['href']
                product_dict['site_unit'] = 'шт.'
                cost_text = product.find('span', {'class': 'price'})
                # print(cost_text)
                try:
                    product_dict['price_new'] = tofloat(
                        wspex(cost_text.find('span', {'class': 'price__new'}).text))
                    product_dict['price_old'] = tofloat(
                        wspex(cost_text.find('span', {'class': 'price__old'}).text))
                    product_dict['price_new'] = int(product_dict['price_new'])
                    product_dict['price_old'] = int(product_dict['price_old'])
                except:
                    product_dict['price_new'] = tofloat(
                        wspex(cost_text.find('span', {'class': 'price__actual'}).text))
                    product_dict['price_old'] = ''
                    product_dict['price_new'] = int(product_dict['price_new'])
                product_dict['type'] = 'non-food'
                if product_dict['price_new'] == '' or product_dict['price_new'] is None:
                    print('{} has no price!!!'.format(product_dict['site_title']))
                # print('title: {}\nprice_new: {}\nprice_old: {}\n\n'.format(
                #     product_dict['site_title'], product_dict['price_new'], product_dict['price_old']))
                cat_row = cat_row.append(product_dict, ignore_index=True)
                # print(cat_row[['site_title', 'price_new', 'price_old']])
            if len(cat_row) >= max_prod or len(cat_row) == total_amount:
                res = res.append(cat_row, ignore_index=True)
                break
    if fail_list != []:
        for elem in fail_list:
            print('CATEGORY {} HAS NOT BEEN PARSED'.format(elem))
    return res
def extract_product_page(self):
    site_code = 'lamoda'
    desc_df = Global().desc_df
    links_df = Global().links
    links_df = links_df[links_df['site_link'].str.contains(site_code)]
    if Global().max_links is not None:
        links_df = links_df.iloc[:Global().max_links]
    category_ids = links_df.category_id.unique()
    res = pd.DataFrame(columns=[
        'date', 'type', 'category_id', 'category_title', 'site_title',
        'price_new', 'price_old', 'site_unit', 'site_link', 'site_code'
    ])
    # proxies = get_proxy('https://www.lamoda.ru/')
    proxies = None
    for cat_id in tqdm(category_ids):  # fixed
        url_list = links_df[links_df.category_id == cat_id].site_link.values
        category_title = desc_df.loc[cat_id, 'cat_title']
        print("{}... ".format(category_title))
        # print(' id_n =', id_n)
        i = 0
        ua = UserAgent()
        header = {'User-Agent': str(ua.chrome)}
        while i + 1 <= len(url_list):
            href_i = url_list[i]
            print(href_i)
            i += 1
            try:
                # time.sleep(3)
                if proxies is not None:
                    r = requests.get(href_i, proxies=proxies, headers=header, timeout=60)  # CRITICAL
                else:
                    r = requests.get(href_i, headers=header, timeout=60)
            except:
                while True:
                    proxies = get_proxy(href_i)
                    time.sleep(3)
                    try:
                        r = requests.get(href_i, proxies=proxies, headers=header)
                        if r.status_code == 200:
                            break
                    except:
                        continue
            html = r.content
            soup = BeautifulSoup(html, 'lxml')
            products_div = soup.find('div', {'class': 'ii-product-buy'})
            if not products_div:
                proxies = get_proxy('https://www.lamoda.ru/')
                i -= 1
                print('no products_div!')
                continue
            price_dict = dict()
            price_dict['date'] = Global().date
            price_dict['site_code'] = site_code
            price_dict['category_id'] = cat_id
            price_dict['category_title'] = category_title
            # print(soup)
            div_sale = soup.find('div', {'class': 'ii-product__price-discount'})
            if div_sale is not None:
                # print('div-sale: ', div_sale)
                price_dict['price_old'] = float(re.match(r'\d+', wspex(div_sale.text))[0])
            else:
                price_dict['price_old'] = ''
            type_good = wspex_space(products_div.find('a', {'class': 'hidden'}).text)
            if type_good == '':
                # print(' imhere!')
                type_good = wspex_space(text_diff(
                    soup.find('span', {'class': 'heading_m ii-product__title'}).text,
                    soup.find('span', {'class': 'ii-product__brand'}).text))
            try:
                # if products_div.find('a', {'class': 'hidden'}).text is '':
                #     print(soup)
                price_dict['site_title'] = type_good + ' Артикул: ' + wspex_space(
                    products_div.find('div', {'class': 'ii-select__option'}).get('data-value'))
            except:
                continue
            # print(products_div)
            div_new = products_div.find(
                'div', {'class': 'ii-product__price ii-product__price_several'})
            if div_new is None:
                div_new = products_div.find(
                    'div', {'class': 'ii-product__price ii-product__price_several DT1717'})
            dct = demjson.decode(div_new.get('data-several-prices'))
            if len(dct['details']) > 1:
                price_dict['price_old'] = int(dct['details'][0]['value'])
                price_dict['price_new'] = int(dct['details'][1]['value'])
            else:
                price_dict['price_new'] = int(dct['details'][0]['value'])
            # else:
            #     price_dict['price_old'] = int(wspex(...))
            #     price_dict['price_new'] = int(dct['details'][1]['value'])
            price_dict['site_unit'] = 'шт.'
            price_dict['site_link'] = href_i  # product title and link to it
            price_dict['type'] = 'non-food'
            print('site_title: {}\nprice_new: {}\nprice_old: {}\nunit: {}\n\n'.format(
                price_dict['site_title'], price_dict['price_new'],
                price_dict['price_old'], price_dict['site_unit']))
            res = res.append(price_dict, ignore_index=True)
    print('LAMODA has been parsed successfully')
    return res
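
# data-several-prices above is decoded with demjson; if the attribute is
# valid JSON (typical for data-* payloads), the stdlib json module works
# as well. A hedged sketch with an illustrative payload:
import json

def parse_several_prices(attr_value):
    """Decode the data-several-prices attribute into a dict."""
    return json.loads(attr_value)

# dct = parse_several_prices('{"details": [{"value": 3999}, {"value": 2999}]}')
# len(dct['details']) > 1 -> details[0] is the old price, details[1] the new one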
def extract_products(self):
    start_time = datetime.now()
    res = pd.DataFrame(columns=[
        'date', 'type', 'category_id', 'category_title', 'site_title',
        'price_new', 'price_old', 'site_unit', 'site_link', 'site_code'
    ])
    path_sfb = os.path.join('description', 'urls.csv')
    sfb_df = pd.read_csv(path_sfb, sep=';', index_col='id')
    hrefs = sfb_df[sfb_df.fillna('')['URL'].str.contains('utkonos')]['URL'].values  # fixed: was missing
    hrefs = [href for href in hrefs if type(href) is not float]  # fixed
    id_n = 0
    # proxies = get_proxy('https://www.utkonos.ru/')
    for href in tqdm(hrefs):
        id_n += 1
        category_title = sfb_df[sfb_df.fillna('')['URL'].str.contains('utkonos')]['cat_title'].iloc[id_n - 1]
        print("{}...".format(category_title))
        url_list = list_html(href)
        i = 0
        while i + 1 <= len(url_list):
            href_i = url_list[i]
            i += 1
            page = 0
            while True:
                page += 1
                url_full = self.construct_html(href_i, page)
                print('loading url', url_full)
                try:
                    r = requests.get(url_full)  # CRITICAL
                except:
                    raise ValueError
                    # proxies = get_proxy('https://www.utkonos.ru/')
                    # r = requests.get(url_full, proxies=proxies)
                html = r.content
                soup = BeautifulSoup(html, 'lxml')
                products_div = soup.find('div', {'class': 'goods_view_box'})
                if products_div is None:  # fixed: find never raises, check for None
                    print("OOPS! {} has no products_div".format(url_full))
                    break
                pages_controller_div = soup.find('div', {'class': 'el_paginate'})
                if pages_controller_div is None:
                    # print('no_pages_controller')
                    flag_nextpage = False
                else:
                    pages_refs = pages_controller_div.find_all('a')
                    max_page_index = 1
                    for ref in pages_refs:
                        page_index = self.representsInt(ref.text.strip())
                        if page_index is not None:
                            if page_index > max_page_index:
                                max_page_index = page_index
                    flag_nextpage = max_page_index > page
                price_list = products_div.find_all(
                    'div', {'class': 'goods_view_box-view goods_view goods_view-item'})
                for price_elem in price_list:
                    price_dict = dict()
                    price_dict['date'] = Global().date
                    price_dict['site_code'] = 'utkonos'
                    price_dict['category_id'] = id_n
                    price_dict['category_title'] = category_title
                    # product_unavailable_div = price_elem.find('div', {'class': 'product-unavailable-text'})
                    # if product_unavailable_div is not None:
                    #     continue  # just skip
                    product_name_div = price_elem.find('div', {'class': 'goods_view_box-caption'})
                    if product_name_div is not None:
                        aref = product_name_div.find('a')
                        if aref is not None:
                            price_dict['site_title'] = wspex_space(aref.text)
                            price_dict['site_link'] = aref.get('href')
                        else:
                            continue
                    else:
                        continue
                    if filter_flag(id_n, price_dict['site_title']) == False:
                        # print(" skipped position: {}".format(price_dict['site_title']))
                        continue
                    try:
                        product_price_div = price_elem.find(
                            'div', {'class': 'goods_price-item current big'})
                        div_sale = price_elem.find(
                            'div', {'class': 'goods_price-item old_price'})
                        if div_sale:
                            price_dict['price_old'] = find_float_number(div_sale.text)
                        else:
                            price_dict['price_old'] = ''
                        price_dict['price_new'] = find_float_number(product_price_div.text)
                        if price_dict['price_old'] == price_dict['price_new']:
                            price_dict['price_old'] = ''
                        price_dict['site_unit'] = str(product_price_div.get('data-weight'))[1:]
                    except:
                        product_price_div = price_elem.find(
                            'div', {'class': 'goods_price-item current'})
                        price_dict['price_new'] = find_float_number(product_price_div.text)
                        price_dict['price_old'] = ''
                        price_dict['site_unit'] = str(product_price_div.get('data-weight'))[1:]
                    # print('site_title: {}\nprice_new: {}\nprice_old: {}\nunit: {}\n'.format(
                    #     price_dict['site_title'], price_dict['price_new'],
                    #     price_dict['price_old'], price_dict['site_unit']))
                    # print(price_dict)
                    price_dict['type'] = 'food'
                    res = res.append(price_dict, ignore_index=True)
                if flag_nextpage == False:
                    break
    time_execution = str(datetime.now() - start_time)
    print('UTKONOS has been parsed successfully\ntotal time of execution: {}'.format(time_execution))
    return res
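
# self.representsInt above is used to pick page numbers out of paginator
# links but is not defined in this section; a minimal sketch of what it
# must do, judging from its usage (name kept, behaviour assumed):
def representsInt(s):
    """Return int(s) when s is an integer literal, else None."""
    try:
        return int(s)
    except (TypeError, ValueError):
        return None

# representsInt('7') -> 7 ; representsInt('→') -> None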
def extract_product_page(self):
    site_code = 'utkonos'
    # ua = UserAgent()
    # header = {'User-Agent': str(ua.chrome)}
    header = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/51.0.2704.103 Safari/537.36'
    }
    # print(header)
    desc_df = Global().desc_df
    links_df = Global().links
    links_df = links_df[links_df['site_link'].str.contains(site_code)].iloc[:Global().max_links]
    category_ids = links_df.category_id.unique()
    res = pd.DataFrame(columns=[
        'date', 'type', 'category_id', 'category_title', 'site_title',
        'price_new', 'price_old', 'site_unit', 'site_link', 'site_code'
    ])
    check_url = links_df[links_df.category_id == 1].site_link.values[0]
    # proxies = get_proxy(check_url)
    proxies = None  # fixed: referenced below but was only assigned in a comment
    time.sleep(5)
    # selenium
    if Global().is_selenium_utkonos:
        path = Global().path_chromedriver
        # options = webdriver.ChromeOptions()
        # options.add_argument('--headless')
        driver = webdriver.Chrome(executable_path=path,
                                  chrome_options=Global().chrome_options)
    for cat_id in tqdm(category_ids):  # fixed
        url_list = links_df[links_df.category_id == cat_id].site_link.values
        category_title = desc_df.loc[cat_id, 'cat_title']
        print("{}... ".format(category_title))
        # print(' id_n =', id_n)
        i = 0
        while i + 1 <= len(url_list):
            href_i = url_list[i]
            i += 1
            print(href_i)
            if Global().is_selenium_utkonos:
                driver.get(href_i)
                soup = BeautifulSoup(driver.page_source, 'html.parser')
                # driver.close()
            else:
                # time.sleep(3)
                try:
                    if proxies is not None:
                        r = requests.get(href_i, proxies=proxies, headers=header)
                    else:
                        r = requests.get(href_i, headers=header)
                except Exception as e:
                    print('Exception:', e)
                    while True:
                        try:
                            proxies = get_proxy(href_i)
                            time.sleep(3)
                            r = requests.get(href_i, proxies=proxies, headers=header)
                            break
                        except:
                            continue
                html = r.content
                soup = BeautifulSoup(html, 'html.parser')
            # print('soup:\n', soup)
            products_div = soup.find('div', {'class': 'goods_view_item-action'})
            if products_div is None:
                print('soup:\n', soup)
                continue  # fixed: skip instead of crashing on .find below
            # products_div = soup.find('div', {'class': 'b-section--bg i-pb30 js-product-item js-product-main'})
            # print('\n\nproducts_div:\n', products_div)
            price_dict = dict()
            price_dict['date'] = Global().date
            price_dict['site_code'] = 'utkonos'
            price_dict['category_id'] = cat_id
            price_dict['category_title'] = category_title
            # try:
            price_dict['site_title'] = wspex_space(
                products_div.find('div', {'class': 'goods_view_item-action_header'}).text)
            # except:
            #     print('url %s is broken' % href_i)
            #     continue
            price_dict['site_link'] = href_i
            # print(price_dict['site_link'])
            # if filter_flag(id_n, price_dict['site_title']) == False:
            #     print(" skipped position: {}".format(price_dict['site_title']))
            #     continue
            price_div = products_div.find('div', {'class': 'goods_price has_old_price'})
            # print('div_sale:', div_sale)
            if price_div is not None:
                div_sale = price_div.find('div', {'class': 'goods_price-item old_price'})
                # print('div_sale: ', div_sale)
                price_dict['price_old'] = float(
                    re.search(r'\d+\.\d+', wspex(div_sale.text).replace(',', '.'))[0])
                div_new = price_div.find('div', {'class': 'goods_price-item current'})
                if div_new is None:
                    div_new = price_div.find('div', {'class': 'goods_price-item current big'})
                price_dict['price_new'] = float(
                    re.search(r'\d+\.\d+', wspex(div_new.text).replace(',', '.'))[0])
                price_dict['site_unit'] = str(div_new.get('data-weight'))[1:]
            else:
                div_new = products_div.find('div', {'class': 'goods_price-item current'})
                if div_new is None:
                    div_new = products_div.find('div', {'class': 'goods_price-item current big'})
                price_dict['price_new'] = float(
                    re.search(r'\d+\.\d+', wspex(div_new.text).replace(',', '.'))[0])
                price_dict['price_old'] = ''
                price_dict['site_unit'] = str(div_new.get('data-weight'))[1:]
            print('site_title: {}\nprice_new: {}\nprice_old: {}\nunit: {}\n'.format(
                price_dict['site_title'], price_dict['price_new'],
                price_dict['price_old'], price_dict['site_unit']))
            # print(price_dict)
            price_dict['type'] = 'food'
            res = res.append(price_dict, ignore_index=True)
    if Global().is_selenium_utkonos:
        driver.quit()
    print('UTKONOS has been parsed successfully')
    return res
def get_df(self):
    print('get data from services...')
    sfb_df = pd.read_csv(self.path_sfb, sep=';', index_col='id')
    serv_df = sfb_df[sfb_df['type'] == 'services']
    list_url = serv_df['URL'].values
    final_df = pd.DataFrame(columns=[
        'date', 'type', 'category_id', 'category_title', 'site_title',
        'price_new', 'price_old', 'site_unit', 'site_link', 'site_code'
    ])

    # MGTS, unlimited tariff
    n = 0
    url = list_url[n]
    print(url)
    html = requests.get(url, headers={'User-Agent': UserAgent().chrome}).content
    soup = BeautifulSoup(html, 'lxml')
    price_list = soup.findAll('div', {'class': 'slider_slide'})  # TODO: replace the hard-coded 0-index lookups
    for price_elem in price_list:
        if price_elem.findAll('div', {'class': 'texts'})[0].text == 'Безлимитный':
            price_dict = dict()
            price_dict['date'] = Global().date
            price_dict['site_code'] = 'services'
            id_n = int(serv_df[serv_df['URL'].str.contains(url)].index[0])
            price_dict['category_id'] = id_n
            price_dict['category_title'] = serv_df['cat_title'].loc[price_dict['category_id']]
            price_dict['type'] = 'services'
            price_dict['site_title'] = price_elem.findAll('div', {'class': 'texts'})[0].text
            price_dict['price_new'] = int(
                price_elem.findAll('div', {'class': 'slider_price_val'})[0].text)
            price_dict['price_old'] = ''
            price_dict['site_unit'] = (
                price_elem.findAll('div', {'class': 'slider_price_rub1'})[0].text
                + '/'
                + price_elem.findAll('div', {'class': 'slider_price_rub2'})[0].text)
            price_dict['site_link'] = url
            final_df = final_df.append(price_dict, ignore_index=True)
            break

    # Public bath, shared section, single ticket: http://legkiipar.ru/menraz.html
    try:
        n = 1
        url = list_url[n]
        print(url)
        html = requests.get(url).content
        soup = BeautifulSoup(html, 'lxml')
        # weekdays, 08:00 to 22:00
        pattern = re.compile(r'Будние дни')
        price_dict = dict()
        price_dict['date'] = Global().date
        price_dict['site_code'] = 'services'
        price_dict['type'] = 'services'
        price_dict['site_title'] = soup(text=pattern)[0]
        price_1 = soup.findAll('span', {'class': 'стиль6'})
        price_dict['price_new'] = re.findall(r'\d+', price_1[1].text)[0]
        price_dict['price_old'] = ''
        price_dict['site_unit'] = re.findall(r'\d+ часа', price_1[4].text[:-1])[0]
        price_dict['category_id'] = int(serv_df[serv_df['URL'].str.contains(url)].index[0])
        price_dict['category_title'] = serv_df['cat_title'].loc[price_dict['category_id']].values[0]
        price_dict['site_link'] = url
        final_df = final_df.append(price_dict, ignore_index=True)
    except Exception:
        print('DAMN! {} cannot be parsed'.format(url))

    # Public bath, shared section, single ticket: http://banya-lefortovo.ru/price.html
    n = 2
    price_dict = dict()
    price_dict['date'] = Global().date
    price_dict['site_code'] = 'services'
    url = list_url[n]
    print(url)
    html = requests.get(url).content
    soup = BeautifulSoup(html, 'lxml')
    pattern = re.compile(r'Русская общая баня')
    price_dict['site_title'] = soup(text=pattern)[0]
    price_dict['category_id'] = int(serv_df[serv_df['URL'].str.contains(url)].index[0])
    price_dict['category_title'] = serv_df.loc[price_dict['category_id']]['cat_title'].values[0]
    price_dict['type'] = 'services'
    price_dict['price_new'] = int(
        re.findall(r'\d+', re.findall(r'\d+ рублей', soup(text=pattern)[0])[0])[0])
    price_dict['price_old'] = ''
    price_dict['site_unit'] = re.findall(r'\d+ часа', soup(text=pattern)[0])[0]
    price_dict['site_link'] = url
    final_df = final_df.append(price_dict, ignore_index=True)

    # Public bath, shared section, single ticket: https://rzhevskie-bani.ru/rb/bani.html
    n = 3
    price_dict = dict()
    url = list_url[n]
    print(url)
    html = requests.get(url).content
    soup = BeautifulSoup(html, 'lxml')
    price_dict['price_new'] = int(
        re.findall(r'\d+', soup.findAll('td', {'class': 'price'})[0].text)[0])
    pattern = re.compile(r'Стоимость')
    price_dict['date'] = Global().date
    price_dict['site_code'] = 'services'
    price_dict['category_id'] = int(serv_df[serv_df['URL'].str.contains(url)].index[0])
    price_dict['category_title'] = serv_df.loc[price_dict['category_id']]['cat_title'].values[0]
    price_dict['site_title'] = soup(text=pattern)[0]
    price_dict['type'] = 'services'
    price_dict['site_unit'] = re.findall(r'(\d+.*\d часа)', soup(text=pattern)[0][-9:])[0]
    price_dict['site_link'] = url
    final_df = final_df.append(price_dict, ignore_index=True)

    # Public bath, shared section, single ticket:
    # http://vorontsovskie-bani.ru/obshchestvennye-bani/muzhskoj-zal-pervyj-razryad
    n = 4
    price_dict = dict()
    price_dict['date'] = Global().date
    price_dict['site_code'] = 'services'
    url = list_url[n]
    print(url)
    price_dict['category_id'] = int(serv_df[serv_df['URL'].str.contains(url)].index[0])
    try:
        html = requests.get(url,
                            headers={'User-Agent': UserAgent().chrome},
                            timeout=10).content
    except Exception:
        # fall back to a proxy if the direct request times out
        proxy = get_proxy(url)
        html = requests.get(url,
                            headers={'User-Agent': UserAgent().chrome},
                            proxies=proxy).content
    soup = BeautifulSoup(html, 'lxml')
    price_div = soup.findAll('div', {'class': 'price-head'})[0]
    price_dict['price_new'] = int(
        re.findall(r'\d+', price_div.findAll('span', {'class': 'price'})[0].text)[0])
    price_dict['price_old'] = ''
    price_dict['site_title'] = price_div.find('p').text.replace('\xa0', ' ')
    price_dict['site_unit'] = re.findall(r'\d+ часа', price_dict['site_title'])[0]
    price_dict['type'] = 'services'
    price_dict['site_link'] = url
    price_dict['category_title'] = serv_df.loc[price_dict['category_id']]['cat_title'].values[0]
    final_df = final_df.append(price_dict, ignore_index=True)

    # Shoe heel taps, per pair: https://masterskaya-obuvi.ru/tseny (disabled variant)
    '''
    n = 5
    price_dict = dict()
    price_dict['date'] = Global().date
    price_dict['site_code'] = 'services'
    url = list_url[n]
    print(url)
    html = requests.get(url).content
    soup = BeautifulSoup(html, 'lxml')
    price_dict['category_id'] = int(serv_df[serv_df['URL'].str.contains(url)].index[0])
    price_dict['category_title'] = serv_df.loc[price_dict['category_id']]['cat_title'].values[0]
    for elem in soup.findAll('tr'):
        if re.findall('износоустойчивой резины', elem.text) != []:
            price_div = elem
            price_dict['site_title'] = re.findall('[А-Яа-яёз(). ]+', elem.text)[0]
            price_dict['site_unit'] = re.findall('[А-Яа-яёз(). ]+', elem.text)[1]
            price_dict['price_new'] = int(price_div.findAll('td', {'width': "356"})[0].text)
            price_dict['price_old'] = ''
            price_dict['type'] = 'services'
            price_dict['site_link'] = url
            break
    final_df = final_df.append(price_dict, ignore_index=True)
    '''

    # Shoe heel taps, per pair: https://masterskaya-obuvi.ru/tseny
    n = 6
    price_dict = dict()
    price_dict['date'] = Global().date
    price_dict['site_code'] = 'services'
    url = list_url[n]
    print(url)
    html = requests.get(url).content
    soup = BeautifulSoup(html, 'lxml')
    price_dict['category_id'] = int(serv_df[serv_df['URL'].str.contains(url)].index[0])
    price_dict['category_title'] = serv_df.loc[price_dict['category_id']]['cat_title'].values[0]
    for elem in soup.findAll('tr'):
        if re.findall('эконом', elem.text) != []:
            price_div = elem
            price_dict['site_title'] = self.wspex_space(
                re.findall(r'[А-Яа-яёз(). ]+',
                           price_div.findAll('td', {'align': 'left'})[0].text)[0])
            price_text = price_div.findAll('strong')[0].text
            price_dict['price_new'] = int(re.findall(r'\d+', price_text)[0])
            price_dict['price_old'] = ''
            price_dict['type'] = 'services'
            price_dict['site_unit'] = re.findall(r'\([А-Яа-я]*\)',
                                                 price_dict['site_title'])[0][1:-1]
            price_dict['site_link'] = url
            break
    final_df = final_df.append(price_dict, ignore_index=True)

    # Single-ride ticket - Mosgortrans
    n = 7
    price_dict = dict()
    price_dict['site_code'] = 'services'
    price_dict['date'] = Global().date
    url = list_url[n]
    print(url)
    html = requests.get(url).content
    soup = BeautifulSoup(html, 'lxml')
    price_dict['category_id'] = int(serv_df[serv_df['URL'].str.contains(url)].index[0])
    price_dict['category_title'] = serv_df.loc[price_dict['category_id']]['cat_title']
    for elem in soup.findAll('tr'):
        if re.findall('не более', elem.text) != []:
            price_div = elem
            price_dict['site_title'] = price_div.find('td').text
            price_dict['price_new'] = int(re.findall(r'\d{2,3}', price_div.text)[0])
            price_dict['price_old'] = ''
            price_dict['type'] = 'services'
            price_dict['site_unit'] = 'поездка'
            price_dict['site_link'] = url
            break
    final_df = final_df.append(price_dict, ignore_index=True)

    # Haircut
    try:
        n = 8
        price_dict = dict()
        price_dict['site_code'] = 'services'
        price_dict['date'] = Global().date
        url = list_url[n]
        print(url)
        html = requests.get(url).content
        soup = BeautifulSoup(html, 'lxml')
        for elem in soup.findAll('tr'):
            if re.findall('(любой длины)', elem.text) != []:
                price_dict['category_id'] = int(
                    serv_df[serv_df['URL'].str.contains(url)].index[-1])
                price_dict['category_title'] = serv_df.loc[
                    price_dict['category_id']]['cat_title'].values[0]
                price_text = elem.text
                price_dict['site_title'] = re.findall(r'[А-Яа-я ()]+', price_text)[0]
                price_dict['price_new'] = re.findall(r'\d+', price_text)[0]
                price_dict['price_old'] = ''
                price_dict['type'] = 'services'
                price_dict['site_unit'] = 'стрижка'
                price_dict['site_link'] = url
                break
        final_df = final_df.append(price_dict, ignore_index=True)
    except Exception:
        print('DAMN! {} cannot be parsed'.format(url))

    # Haircut
    try:
        n = 9
        price_dict = dict()
        price_dict['site_code'] = 'services'
        price_dict['date'] = Global().date
        url = list_url[n]
        print(url)
        html = requests.get(url).content
        soup = BeautifulSoup(html, 'lxml')
        for elem in soup.findAll('tr'):
            if re.findall('Женская', elem.text) != []:
                price_div = elem
                price_dict['category_id'] = int(
                    serv_df[serv_df['URL'].str.contains(url)].index[0])
                price_dict['category_title'] = serv_df.loc[
                    price_dict['category_id']]['cat_title'].values[0]
                price_dict['site_title'] = price_div.find(
                    'td', {'class': 'services-table__name'}).text
                price_dict['price_new'] = int(self.wspex(price_div.find(
                    'td',
                    {'class': 'services-table__price services-table__price-small'}).text))
                price_dict['price_old'] = ''
                price_dict['type'] = 'services'
                price_dict['site_unit'] = 'стрижка'
                price_dict['site_link'] = url
                break
        final_df = final_df.append(price_dict, ignore_index=True)
    except Exception:
        print('DAMN! {} cannot be parsed'.format(url))

    # Haircut
    n = 10
    price_dict = dict()
    price_dict['site_code'] = 'services'
    price_dict['date'] = Global().date
    url = list_url[n]
    print(url)
    html = requests.get(url).content
    soup = BeautifulSoup(html, 'lxml')
    for elem in soup.findAll('tr'):
        if re.findall('лопаток', elem.text) != []:
            price_div = elem
            price_dict['category_id'] = int(
                serv_df[serv_df['URL'].str.contains(list_url[n - 1])].index[0])
            price_dict['category_title'] = serv_df.loc[
                price_dict['category_id']]['cat_title'].values[0]
            price_dict['site_title'] = price_div.find('td', {'height': '17'}).text
            price_dict['price_new'] = int(
                self.wspex(price_div.find('td', {'width': '157'}).text))
            price_dict['price_old'] = ''
            price_dict['type'] = 'services'
            price_dict['site_unit'] = 'стрижка'
            price_dict['site_link'] = url
            break
    final_df = final_df.append(price_dict, ignore_index=True)

    # Single-ride ticket - Mosgortrans
    n = 11
    price_dict = dict()
    price_dict['site_code'] = 'services'
    price_dict['date'] = Global().date
    url = list_url[n]
    print(url)
    html = requests.get(url).content
    soup = BeautifulSoup(html, 'lxml')
    for elem in soup.findAll('tr'):
        if re.findall('не более', elem.text) != []:
            price_div = elem
            price_dict['category_id'] = int(
                serv_df[serv_df['URL'].str.contains(url)].index[-1])
            price_dict['category_title'] = serv_df.loc[
                price_dict['category_id']]['cat_title']
            price_dict['site_title'] = price_div.find('td').text
            price_dict['price_new'] = int(re.findall(r'\d{2,3}', price_div.text)[0])
            price_dict['price_old'] = ''
            price_dict['type'] = 'services'
            price_dict['site_unit'] = 'поездка'
            price_dict['site_link'] = url
            break
    final_df = final_df.append(price_dict, ignore_index=True)

    final_df = final_df[final_df.site_title.notna()]
    print('ALL SERVICES HAVE BEEN SUCCESSFULLY PARSED!')
    return final_df
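# NOTE: `wspex` and `wspex_space` are called above (and in other handlers) but are not
# defined in this listing. Judging by their call sites -- `wspex` feeds digit-only
# regexes and int(), while `wspex_space` cleans up visible titles -- a minimal sketch
# could look like the following. This is an assumption, not the project's actual helpers:

import re

def wspex(s):
    """Remove all whitespace, so prices like '1 200' become '1200'."""
    return re.sub(r'\s+', '', s)

def wspex_space(s):
    """Collapse runs of whitespace into single spaces and strip the ends."""
    return re.sub(r'\s+', ' ', s).strip()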
def __init__(self):
    self.path_sfb = os.path.join(Global().base_dir, 'description', 'urls.csv')
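# Usage: `printer_test` below invokes this handler as `Services().get_df()`. A minimal
# standalone run (assuming the surrounding imports and Global() configuration are in
# place) would look like:
#
#     services_df = Services().get_df()
#     print(services_df[['category_id', 'site_title', 'price_new']].head())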
def printer_test(self):
    function_start_time = datetime.now()
    Global().getproxies()
    print('Timer call : start making snapshots')
    date_now = datetime.now().strftime("%Y-%m-%d")
    df = get_empty_handler_DF()
    # use Display from the pyvirtualdisplay package so selenium runs without a real window
    with Display():
        # tor_webdriver = create_tor_webdriver()
        # df = df.append(IkeaHandlerMSK(tor_driver=tor_webdriver).extract_products())
        # df = df.append(RiglaHandlerSPB(tor_driver=tor_webdriver).extract_products())
        # df = df.append(PerekrestokHandlerSPB(tor_driver=tor_webdriver).extract_products())
        # df = df.append(OkeyHandlerSPB(tor_driver=tor_webdriver).extract_products())
        # tor_webdriver.quit()
        df = df.append(IkeaHandlerMSK(proxy_method='tor-service').extract_products())
        df = df.append(RiglaHandlerSPB(proxy_method='tor-service').extract_products())
        df = df.append(PerekrestokHandlerSPB(proxy_method='tor-service').extract_products())
        df = df.append(OkeyHandlerSPB(proxy_method='tor-service',
                                      use_request=True).extract_products())
        df = df.append(SvaznoyHandlerMSK(proxy_method='no-proxy').extract_products())
        df = df.append(EldoradoHandlerMSK(proxy_method='tor-service').extract_products())
        df = df.append(LentaHandlerMSK(proxy_method='no-proxy').extract_products())
        df = df.append(LentaHandlerSPB(proxy_method='no-proxy').extract_products())
    with Display():
        try:
            df = df.append(TotalGrocery().get_df_page())
        except Exception:
            print('ERROR while handling TotalGrocery')
        try:
            df = df.append(TotalNongrocery().get_df_page())
        except Exception:
            print('ERROR while handling TotalNongrocery')
        try:
            df = df.append(Services().get_df())
        except Exception:
            print('ERROR while handling Services')

    # uncomment for tests
    # df = pd.read_csv(os.path.join('parser_app', 'logic', 'description',
    #                               'df_after_handlers_FOR_TESTS.csv'))

    df['date'] = date_now
    df = df.sort_values(['category_id', 'site_link'])
    df['miss'] = 0
    df.reset_index(drop=True, inplace=True)

    path_to_parsed_content_folder = 'parsed_content'
    if not os.path.exists(path_to_parsed_content_folder):
        os.makedirs(path_to_parsed_content_folder)
    df_path = os.path.join('parsed_content', 'data_test_{}.csv'.format(date_now))
    pivot_path = os.path.join('parsed_content', 'pivot_test_{}.csv'.format(date_now))
    pivot = df.pivot_table(index='category_id',
                           columns=['type', 'site_code'],
                           values='site_link',
                           aggfunc='nunique')
    if sys.platform.startswith('linux'):
        df.to_csv(df_path)
        pivot.to_csv(pivot_path)
    elif sys.platform.startswith('win'):
        df.to_csv(os.path.join(r'D:\ANE_2', df_path))
        pivot.to_csv(os.path.join(r'D:\ANE_2', pivot_path))
    else:
        raise ValueError('unsupported operating system')

    df['price_old'] = df['price_old'].replace('', -1.0)
    df['price_old'] = df['price_old'].fillna(-1.0)

    cached_list = []
    print('Storing raw prices to db...')
    for _, row in df.iterrows():
        prod = PricesRaw(
            date=row['date'],
            type=row['type'],
            category_id=row['category_id'],
            category_title=row['category_title'],
            site_title=row['site_title'],
            price_new=row['price_new'],
            price_old=row['price_old'],
            site_unit=row['site_unit'],
            site_link=row['site_link'],
            site_code=row['site_code'],
            miss=row['miss'],
        )
        cached_list.append(prod)
    PricesRaw.objects.bulk_create(cached_list)
    print('Storing complete!')

    print('Filling df...')
    filled_df = fill_df(pd.DataFrame(list(PricesRaw.objects.all().values())))
    filled_df.to_csv(os.path.join('parsed_content', 'filled_df.csv'))
    print('Filling complete!')

    df_gks = GKS_weekly_handler().get_df()
    cached_list = []
    Gks.objects.all().delete()
    print('Storing gks prices to db...')
    for _, row in df_gks.iterrows():
        prod = Gks(
            date=row['date'],
            type=row['type'],
            category_id=row['category_id'],
            category_title=row['category_title'],
            site_title=row['site_title'],
            price_new=row['price_new'],
            price_old=row['price_old'],
            site_unit=row['site_unit'],
            site_link=row['site_link'],
            site_code=row['site_code'],
            miss=row['miss'],
        )
        cached_list.append(prod)
    Gks.objects.bulk_create(cached_list)
    print('Storing complete!')

    print('Getting basket df...')
    basket_df = get_basket_df(
        df_gks[df_gks['type'] == 'food'],
        filled_df[filled_df['type'] == 'food'],
    )
    print('Getting complete!')

    print('Storing basket to db...')
    cached_list = []
    Basket.objects.all().delete()
    for _, row in basket_df.iterrows():
        prod = Basket(
            date=row['date'],
            gks_price=row['gks_price'],
            online_price=row['online_price'],
        )
        cached_list.append(prod)
    Basket.objects.bulk_create(cached_list)
    print('Storing completed!')

    function_end_time = datetime.now()
    time_execution = str(function_end_time - function_start_time)
    print('PARSING ENDED!\ntotal time of all execution: {}'.format(time_execution))
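# NOTE: PricesRaw, Gks and Basket are Django ORM models (see the bulk_create calls
# above); their definitions are not part of this listing. A minimal sketch of
# PricesRaw consistent with the fields used above -- the field types are assumptions:

from django.db import models

class PricesRaw(models.Model):
    date = models.DateField()
    type = models.CharField(max_length=32)   # 'food' / 'non-food' / 'services'
    category_id = models.IntegerField()
    category_title = models.TextField()
    site_title = models.TextField()
    price_new = models.FloatField()
    price_old = models.FloatField()          # -1.0 when the site shows no old price
    site_unit = models.CharField(max_length=64)
    site_link = models.TextField()
    site_code = models.CharField(max_length=64)
    miss = models.IntegerField(default=0)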
def extract_products(self, is_proxy=True):
    if is_proxy:
        proxies = get_proxy('https://www.perekrestok.ru/')
    else:
        proxies = None
    # use full datetimes so the elapsed time survives hour boundaries
    start_time = datetime.now()
    res = pd.DataFrame(columns=[
        'date', 'type', 'category_id', 'category_title', 'site_title',
        'price_new', 'price_old', 'site_unit', 'site_link', 'site_code'
    ])
    fail_array = []
    path_sfb = os.path.join(Global().base_dir, 'description', 'urls.csv')
    sfb_df = pd.read_csv(path_sfb, sep=';', index_col='id')
    hrefs = sfb_df[sfb_df.fillna('')['URL'].str.contains('perekrestok')]['URL'].values
    hrefs = [href for href in hrefs if type(href) is not float]  # drop NaN entries
    id_n = 0
    for href in tqdm(hrefs):
        n_items_before = len(res)
        category_titles = sfb_df[sfb_df.fillna('')['URL'].str.contains(
            'perekrestok')]['cat_title']
        try:
            html = requests.get(href, proxies=proxies).content
        except Exception:
            proxies = get_proxy(href)
            html = requests.get(href, proxies=proxies).content
        soup = BeautifulSoup(html, 'html.parser')
        helper_div = soup.find('div', {'class': 'xf-sort__total js-list-total'})
        if helper_div is None:  # soup.find returns None rather than raising
            print('WARNING!!! helper_div in {} has not been found'.format(href))
            fail_array.append(href)
            continue
        total_amount = int(
            helper_div.find('span', {'class': 'js-list-total__total-count'}).text)
        print('\n' + category_titles.iloc[id_n]
              + '... items in category: ' + str(total_amount))
        page = 0
        id_n += 1
        n_elem = 0
        n_elem_out = 0
        while n_elem < total_amount - n_elem_out:
            total_amount = int(
                helper_div.find('span', {'class': 'js-list-total__total-count'}).text)
            page += 1
            if href[-1] == '?':
                href_i = '{}page={}'.format(href, page)
            else:
                href_i = '{}&page={}'.format(href, page)
            try:
                html_i = requests.get(href_i, proxies=proxies).content
            except Exception:
                proxies = get_proxy(href_i)
                html_i = requests.get(href_i, proxies=proxies).content
            soup = BeautifulSoup(html_i, 'html.parser')
            products_div = soup.find('div', {'class': 'js-catalog-wrap'})
            price_list = products_div.find_all('div', {'class': 'xf-product js-product'})
            # count the out-of-stock ("not active") items on this page
            n_elem_out += len(products_div.find_all(
                'div', {'class': re.compile(r'\w*ot-activ\w+')}))
            for price_elem in price_list:
                n_elem += 1
                price_dict = dict()
                price_dict['date'] = Global().date
                price_dict['site_code'] = 'perekrestok'
                price_dict['category_id'] = id_n
                price_dict['category_title'] = category_titles.iloc[id_n - 1]
                aref = price_elem.find(
                    'div', {'class': 'xf-product__title xf-product-title'}).find(
                    'a', {'class': 'xf-product-title__link js-product__title'})
                price_dict['site_title'] = aref.text.strip()
                if not filter_flag(id_n, price_dict['site_title']):
                    continue
                cost_div = price_elem.find(
                    'div', {'class': 'xf-product__cost xf-product-cost'})
                if cost_div is None:
                    continue
                sale_div = cost_div.find(
                    'div', {'class': 'xf-price xf-product-cost__prev'})
                if sale_div:
                    posted_price_div = cost_div.find(
                        'div',
                        {'class': 'xf-price xf-product-cost__current js-product__cost _highlight'})
                    price_dict['price_new'] = int(
                        posted_price_div.find('span', {'class': 'xf-price__rouble'}).text)
                    pennies_cost_div = posted_price_div.find(
                        'span', {'class': 'xf-price__penny'})
                    if pennies_cost_div is not None:
                        pennies_cost = float(
                            pennies_cost_div.text.strip().replace(',', '.', 1))
                    else:
                        pennies_cost = 0.0
                    price_dict['price_old'] = tofloat(sale_div.text)
                else:
                    price_dict['price_new'] = int(
                        cost_div.find('span', {'class': 'xf-price__rouble'}).text)
                    pennies_cost_div = cost_div.find(
                        'span', {'class': 'xf-price__penny'})
                    if pennies_cost_div is not None:
                        pennies_cost = float(
                            pennies_cost_div.text.strip().replace(',', '.', 1))
                    else:
                        pennies_cost = 0.0
                    price_dict['price_old'] = ''
                site_unit_div = cost_div.find('span', {'class': 'xf-price__unit'})
                if site_unit_div is not None:
                    site_unit = site_unit_div.text.split(r'/')[-1].split()[0]
                else:
                    site_unit = 'шт'
                price_dict['price_new'] += pennies_cost
                price_dict['site_unit'] = site_unit
                price_dict['site_link'] = aref.get('href')
                price_dict['type'] = 'food'
                res = res.append(price_dict, ignore_index=True)
    time_execution = str(datetime.now() - start_time)
    print('PEREKRESTOK has been successfully parsed\ntotal time of execution: {}'.format(
        time_execution))
    if fail_array != []:
        print('FAIL URLS:')
        for elem in fail_array:
            print(elem)
    return res
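# NOTE: `tofloat` (used for the crossed-out price above) is not defined in this
# listing. A plausible minimal version, assuming it extracts the first decimal
# number from strings like '129,90 ₽' -- an illustrative sketch, not the project's code:

import re

def tofloat(text):
    """Return the first number in `text` as a float; a comma counts as a decimal point."""
    match = re.search(r'\d+(?:[.,]\d+)?', text.replace('\xa0', ' '))
    return float(match.group(0).replace(',', '.')) if match else None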