def rebuild_mf_index():
    global driver
    url = credentials['url_module_mf'].strip() + '&token=' + token
    driver.get(url)
    refresh = WebDriverWait(driver, 300).until(
        EC.presence_of_element_located(
            (By.CSS_SELECTOR, '#msg-refresh_ocmod_cache > span')))
    sleep(6)
    crossparser_tools.write_to_log(refresh.text)

def checking_all_diggers():
    global current_site
    global digger_id
    is_done = True
    for attr, value in websites.items():
        digger_id = value
        current_site = attr
        session_info = check_status(digger_id, False)
        if session_info['state'] == 'running':
            is_done = False
        else:
            parse_session(session_info)
            link_to_parse = get_nextlink_forsite(current_site)
            if link_to_parse != '':
                crossparser_tools.write_to_log('start digger on link: ' + link_to_parse + ', digger_id: ' + digger_id)
                start_digger(digger_id, link_to_parse)
                is_done = False
    sleep(10)
    # Recurse until every digger is idle and no links remain
    if is_done == False:
        checking_all_diggers()

def start_digger(digger_id, link_to_parse):
    headers = {
        'Authorization': 'Token ' + token,
        'Content-type': 'application/json'
    }
    URL = 'https://www.diggernaut.com/api/diggers/' + digger_id + '/start'
    # NB: hand-built JSON; assumes link_to_parse contains no '"' characters
    PARAMS = '{"variables":{"target_page":"' + link_to_parse + '"}}'
    r = requests.post(url=URL, headers=headers, data=PARAMS)
    data = r.json()
    crossparser_tools.write_to_log(data)

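# NOTE: check_status() (called from checking_all_diggers) is defined outside
# this section. The hypothetical sketch below shows roughly what it might do,
# assuming the same Diggernaut REST pattern as start_digger(); the
# '/sessions/last' endpoint and the returned 'state'/'id' fields are
# assumptions, not confirmed API.
def check_status_sketch(digger_id, verbose=False):
    headers = {'Authorization': 'Token ' + token}
    URL = 'https://www.diggernaut.com/api/diggers/' + digger_id + '/sessions/last'  # assumed endpoint
    r = requests.get(url=URL, headers=headers)
    if r.status_code != 200:
        return {}  # callers treat an empty dict as "digger doesn't respond properly"
    session_info = r.json()
    if verbose:
        crossparser_tools.write_to_log(session_info)
    return session_info
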
def start_all_diggers():
    global current_site
    global digger_id
    for attr, value in websites.items():
        digger_id = value
        current_site = attr
        link_to_parse = get_nextlink_forsite(current_site)
        crossparser_tools.write_to_log('start parsing: ' + current_site + ', digger_id: ' + digger_id)
        start_digger(digger_id, link_to_parse)

def make_csv(file):
    global items_counter_parsed
    global csv_out_data_counter
    global table_titles_list
    csv_out_data_counter = 0
    file_lines = crossparser_tools.read_csv(file, True)
    table_titles_list = crossparser_tools.table_titles_list
    filename = file + '-import.csv'
    with open(filename, 'w+', newline='', encoding="utf8") as csvexportfile:
        # Make header
        row_out = []
        for attr, value in export_fields.items():
            row_out.append(value)
        csvexportfile.write(';'.join(row_out) + '\n')
        # Parse lines
        for row in file_lines:
            if row == '':
                continue
            items_counter_parsed += 1
            row = row.strip().replace('\n', '').replace('\r', '')
            if credentials['is_server'] == 'no':
                parse_row(row, csvexportfile)
            if credentials['is_server'] == 'yes':
                try:
                    parse_row(row, csvexportfile)
                except Exception as e:
                    crossparser_tools.write_to_log('failed to parse row of file: ' + file + '. row: ' + row)
                    crossparser_tools.write_to_log(str(e))
    if csv_out_data_counter == 0:
        crossparser_tools.write_to_log('Failed to make csv file of ' + current_site + '. No entries')
        os.remove(filename)
        return
    with open(temp_folder + 'files_prod_import.txt', 'a', newline='', encoding="utf8") as files_toimport:
        files_toimport.write(filename + '\n')
    crossparser_tools.write_to_log('Made csv file with ' + str(csv_out_data_counter) + ' items of ' + current_site + '. Saved to file: ' + filename)

def check_parse_new_link(site, is_done):
    # Start parse new link:
    global websites_parsed
    global tabs_delay
    link = get_nextlink_forsite(site)
    if link != '':
        tabs_delay[site] = datetime.datetime.now()
        parse_link(site, link)
        is_done = False
    else:
        crossparser_tools.write_to_log('Done parse all links for site: ' + site)
        #window_handles.pop(site)
        websites_parsed[site] = True
    return is_done

def import_files():
    global driver
    if os.path.isfile(files_prod_import):
        if os.path.isfile(files_categ_import):
            parse_websites()
    if len(categs_files) == 0 and len(products_files) == 0:
        crossparser_tools.write_to_log('Nothing to import')
        return
    options = webdriver.ChromeOptions()
    if credentials['is_server'] == 'no':
        chromedriver_path = config_folder + 'chromedriver.exe'
        options.add_argument('--window-size=1200,700')
    if credentials['is_server'] == 'yes':
        chromedriver_path = config_folder + 'chromedriver'
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-dev-shm-usage')
        options.add_argument('--headless')
        options.add_argument('--disable-gpu')
        import purge_db  # side-effect import: runs the DB purge (server only)
    driver = webdriver.Chrome(chromedriver_path, chrome_options=options)
    open_web_site()
    for file in categs_files:
        import_categ_file(file)
    for file in products_files:
        import_products_file(file)
    rebuild_mf_index()
    driver.quit()

def parse_new():
    file_of_raw_catalogs = crossparser_tools.file_of_raw_catalogs
    files_to_parse = {}
    with open(file_of_raw_catalogs, 'r', newline='', encoding="utf8") as files_toimport:
        for line in files_toimport:
            line = line.split('$$')
            files_to_parse[line[1].strip()] = line[0].strip().replace('\n', '')
    if len(files_to_parse) == 0:
        return
    for file, site in files_to_parse.items():
        if os.path.isfile(file):
            crossparser_tools.write_to_log('Start processing file: ' + file)
            global current_site
            current_site = site
            make_csv(file)

def parse_link(site, link):
    try:
        global driver
        global current_links
        current_links[site] = link
        url = websites[site]
        driver.get(url.strip())
        url_input = driver.find_element_by_css_selector(
            'div.inputfields input.textbox.urlinput')
        url_input.send_keys(link)
        strt_btn = driver.find_element_by_css_selector('#startBtn')
        if credentials['is_demo'] == 'yes':
            strt_btn = driver.find_element_by_css_selector('#startDemoBtn')
        driver.execute_script(
            "arguments[0].scrollIntoView();$('#header').remove();", strt_btn)
        strt_btn.click()
        WebDriverWait(driver, 33).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '#progressBar')))
        crossparser_tools.write_to_log('Initiated parsing of ' + link)
    except Exception as e:
        crossparser_tools.write_to_log('Failed to initiate parsing of ' + link)
        crossparser_tools.write_to_log(str(e))

def import_products_file(file):
    global driver
    crossparser_tools.write_to_log('Start import ' + file)
    url = credentials['url_module'].strip() + 'app_product&token=' + token
    driver.get(url)
    import_tab = driver.find_element_by_id('link_tab_import')
    import_tab.click()
    file_input = driver.find_element_by_css_selector('#form_product_import > div:nth-child(1) > div:nth-child(1) > div:nth-child(19) > div > input[type="file"]')
    file_input.send_keys(file)
    submit_btn = driver.find_element_by_css_selector('#form_product_import > div:nth-child(2) > div > div > button')
    actions = ActionChains(driver)
    actions.move_to_element(submit_btn)
    actions.click(submit_btn)
    actions.perform()
    WebDriverWait(driver, 3600).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, '#content > div.container-fluid.csvprice_pro_container > div.alert.alert-success.alert-dismissible > i')))
    succ_text = driver.find_element_by_css_selector('#content > div.container-fluid.csvprice_pro_container > div.alert.alert-success.alert-dismissible')
    crossparser_tools.write_to_log('Successfully imported ' + file)
    crossparser_tools.write_to_log(succ_text.text.replace('μ', '').replace('\n\n', '\n').replace('×', ''))
    sleep(1)

def download_catalogs(site, is_save):
    global driver
    # Download primary prices catalog
    download_btn = WebDriverWait(driver, 3600).until(
        EC.element_to_be_clickable(
            (By.CSS_SELECTOR, "#content .products-menu .export-button")))
    driver.execute_script(
        "arguments[0].scrollIntoView();$('#header').remove();", download_btn)
    download_btn.click()
    before = dict([(f, None) for f in os.listdir(temp_folder)])
    final_download_btn = WebDriverWait(driver, 100).until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, "#exportBtn")))
    driver.execute_script(
        "arguments[0].scrollIntoView();$('#header').remove();", final_download_btn)
    final_download_btn.click()
    sleep(5)
    after = dict([(f, None) for f in os.listdir(temp_folder)])
    added = [f for f in after if not f in before]
    if len(added) == 0:
        crossparser_tools.write_to_log('Failed to download file. Website: ' + site)
        current_link = current_links[site]
        crossparser_tools.write_to_log('Failed to parse link: ' + current_link)
        driver.save_screenshot(
            "/var/www/html/boots-market/crossparser/temp/screenshot" + site + ".png")
        return ''
    filename = temp_folder + ''.join(added)
    wait_for_download(filename)
    if is_save == True:
        global counter_links_parsed
        counter_links_parsed += 1
        crossparser_tools.write_to_log('Downloaded primary file of ' + site + '. Saved to ' + filename)
        with open(file_of_raw_catalogs, 'a', newline='', encoding="utf8") as files_toimport:
            files_toimport.write(site + '$$' + filename + '\n')
    else:
        crossparser_tools.write_to_log('Downloaded secondary file of ' + site + '. Saved to ' + filename)
    return filename

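# NOTE: wait_for_download() (called above) is defined outside this section. A
# minimal hypothetical version is sketched below; the browser temp-file
# suffixes ('.crdownload' for Chrome, '.part' for Firefox) and the 300 s
# timeout are assumptions.
def wait_for_download_sketch(filename, timeout=300):
    waited = 0
    while waited < timeout:
        # Done once the target file exists and no partial download remains
        partials = [f for f in os.listdir(temp_folder)
                    if f.endswith('.crdownload') or f.endswith('.part')]
        if os.path.isfile(filename) and len(partials) == 0:
            return True
        sleep(1)
        waited += 1
    return False
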
def clear_db():
    global sql_query
    parse_sql_query()
    # Delete temp files
    if os.path.isfile(data_folder + 'partner_links'):
        os.remove(data_folder + 'partner_links')
    cat_menu = data_folder + 'category_menu.txt'
    if os.path.isfile(cat_menu):
        os.remove(cat_menu)
    open(cat_menu, 'a').close()
    mydb = mysql.connector.connect(host='localhost',
                                   user=credentials['php_login'],
                                   passwd=credentials['php_password'],
                                   database=credentials['php_db'])
    mycursor = mydb.cursor()
    sql_queries = sql_query.split('\n')
    for query in sql_queries:
        if query.strip() == '':
            continue  # splitting on '\n' can yield empty lines
        mycursor.execute(query)
    sql_query = 'SELECT * FROM bootsmarketdb.`oc_product_option_value`;'
    mycursor.execute(sql_query)
    myresult = mycursor.fetchall()
    if len(myresult) == 0:
        crossparser_tools.write_to_log('db successfully purged')
    else:
        crossparser_tools.write_to_log('db purge failed')

def parse_session(session_info):
    if len(session_info) == 0:
        crossparser_tools.write_to_log("Digger " + str(digger_id) + " doesn't respond properly")
        return
    session_id = session_info['id']
    print('session_id:', session_id)
    session_data = get_session_data(digger_id, session_id)
    crossparser_tools.write_to_log('Successfully retrieved session ' + str(session_id) + ' data of digger ' + str(digger_id) + '. Items in session: ' + str(len(session_data)))
    global items_counter_parsed
    items_counter_parsed += len(session_data)
    make_csv(session_data, current_site)

def make_csv(data, current_site):
    filename = temp_folder + current_site + '-' + digger_id + '-import.csv'
    with open(filename, 'w+', newline='', encoding="utf8") as csvexportfile:
        csv_out_data = json_to_csv(data, csvexportfile)
        if csv_out_data == '':
            crossparser_tools.write_to_log('No csv_out_data fetched')
            return
        global csv_out_data_counter
        csv_out_data_counter = 0
        for row in csv_out_data:
            parse_row(row, csvexportfile)
    if csv_out_data_counter == 0:
        crossparser_tools.write_to_log('Failed to make csv file of ' + current_site + '. No entries. Digger id: ' + digger_id)
        return
    with open(temp_folder + 'files_prod_import.txt', 'a', newline='', encoding="utf8") as files_toimport:
        files_toimport.write(filename + '\n')
    crossparser_tools.write_to_log('Made csv file with ' + str(csv_out_data_counter) + ' items of ' + current_site + '. Digger id: ' + digger_id + '. Saved to file: ' + filename)

def parse_row(row, csvwriter):
    if row == '':
        return
    global table_titles_list
    n_import = len(table_titles_list)
    n_export = len(export_fields)
    row_out = [''] * n_export
    row = row.split(';')
    # First 4 cells are usually categories
    row[0] = row[0] + '|' + row[1] + '|' + row[2] + '|' + row[3]
    # Format export row
    i = 0
    for cell in row:
        if i >= n_import:
            print("col out of range of header: ", cell)
            break
        for attr, value in export_fields.items():
            if attr == table_titles_list[i]:
                row_out[export_fields_nums[attr]] = cell.replace('"', '').replace('\n', '')
        i = i + 1
    if print_rows_formating == True:
        print("row formatted for export: ", row_out)
    # Small customizations for special stores
    if current_site == 'tervolina.ru':
        index = export_fields_array.index('_MANUFACTURER_')
        row_out[index] = 'Tervolina'
    # Drop the product if any of these fields is empty:
    index = export_fields_array.index('_MANUFACTURER_')
    prod_brand = row_out[index].replace("'", '').replace('"', '')
    prod_categs = row[0].replace('|', '').replace("'", '').replace('"', '')
    index = export_fields_array.index('_PRICE_')
    prod_price = row_out[index].replace("'", '').replace('"', '')
    index = export_fields_array.index('_NAME_')
    prod_name = row_out[index].replace("'", '').replace('"', '')
    if prod_name == '' or prod_name == ' ' or prod_name is None:
        print('no prod name')
        return ''
    if prod_categs == '' or prod_categs == ' ' or prod_categs is None:
        print('no prod_categs for ' + prod_name)
        return ''
    if prod_brand == '' or prod_brand == ' ' or prod_brand is None:
        print('no prod_brand for ' + prod_name)
        return ''
    if prod_price == '' or prod_price == ' ' or prod_price is None:
        print('no prod_price for ' + prod_name)
        return ''
    # Customize special fields (such as Size, etc)
    # <Optional loop>
    # From 'cell' to 'row_out[i]'
    i = -1
    for cell in row_out:
        i += 1
        current_row_title = export_fields_array[i]
        if current_row_title == '_DESCRIPTION_' or current_row_title == '_NAME_' or current_row_title == '_MANUFACTURER_':
            row_out[i] = '"' + cell.strip().replace("'", '') + '"'
        # Format Size:
        if current_row_title == '_OPTIONS_':
            new_size_cell = '"'
            cell = cell.replace('\n', '').replace('\r', '')
            sizes_arr = cell.split('|')
            for size in sizes_arr:
                new_size_cell += 'select|Размер|'
                new_size_cell += size
                new_size_cell += '|1|1000|1|+|0.0000|+|0|+|0.00\n'
            row_out[i] = new_size_cell + '"'
        # Format Price:
        if current_row_title == '_PRICE_':
            row_out[i] = crossparser_tools.get_only_nums(cell)
        if current_row_title == '_SPECIAL_':
            special_price = crossparser_tools.get_only_nums(cell)
            if special_price != '':
                price_index = export_fields_array.index('_PRICE_')
                curr_price = row_out[price_index]
                # NB: compares digit strings lexicographically
                if special_price > curr_price:
                    row_out[price_index] = special_price
                    special_price = crossparser_tools.get_only_nums(curr_price)
                row_out[i] = '1,0,' + str(special_price) + '.00,0000-00-00,0000-00-00'
        # Form unique SKU
        if current_row_title == '_SKU_':
            # Save SKU to _UPC_
            upc_index = export_fields_array.index('_UPC_')
            row_out[upc_index] = cell
            # Change sku
            url_index = export_fields_array.index('_LOCATION_')
            url = row_out[url_index]
            #id_from_url = crossparser_tools.get_uniqid_from_url(url, current_site)
            id_from_url = crossparser_tools.get_rand_uniqid(10)
            if str(cell) not in id_from_url:
                id_from_url += str(cell)
            row_out[i] = id_from_url
            global global_prod_id
            global_prod_id = id_from_url
        # Copy SKU to Model:
        if current_row_title == '_MODEL_':
            sku_index = export_fields_array.index('_SKU_')
            curr_sku = row_out[sku_index]
            row_out[i] = curr_sku
        # SEO URL:
        if current_row_title == '_SEO_KEYWORD_':
            index = export_fields_array.index('_SKU_')
            prod_sku = row_out[index]
            index = export_fields_array.index('_NAME_')
            prod_name = row_out[index]
            seo_url = crossparser_tools.to_seo_url(prod_name)
            row_out[i] = seo_url + '-' + prod_sku.lower()
        # Parse and download all images:
        if current_row_title == '_IMAGES_':
            # Decline product without imgs:
            if cell == '':
                return
            imgs = cell.split('|')
            checked_imgs = []
            for img in imgs:
                check = image_check(img)
                if check != '':
                    checked_imgs.append(check)
            if len(checked_imgs) == 0:
                row_out[i] = ''
                # Decline product without imgs:
                return
            else:
                row_out[i] = ','.join(checked_imgs)
        # Set primary (first) image:
        if current_row_title == '_IMAGE_':
            imgs_index = export_fields_array.index('_IMAGES_')
            imgs = row_out[imgs_index]
            if imgs == '':
                sku_index = export_fields_array.index('_LOCATION_')
                imgs = row_out[sku_index]
                crossparser_tools.write_to_log('No images collected for product: ' + curr_sku + ' (failed to download)')
                # Decline product without imgs:
                return
            imgs = imgs.split(',')
            row_out[i] = imgs[0]
            imgs.pop(0)
            row_out[imgs_index] = ','.join(imgs)
        if current_row_title == '_QUANTITY_':
            row_out[i] = str(99999)
        # Create category:
        if current_row_title == '_CATEGORY_ID_':
            index = export_fields_array.index('_MANUFACTURER_')
            prod_brand = row_out[index]
            index = export_fields_array.index('_NAME_')
            prod_name = row_out[index]
            prod_categs = cell
            index = export_fields_array.index('_PRICE_')
            prod_price = 999999
            if row_out[index] != '':
                prod_price = int(row_out[index])
            index = export_fields_array.index('_IMAGE_')
            prod_image = row_out[index]
            categs_ids = create_category(prod_categs, prod_brand, prod_name, prod_price, prod_image)
            if categs_ids is None:
                categs_ids = ''
            if categs_ids == '':
                # Decline product without categories:
                url_index = export_fields_array.index('_LOCATION_')
                url = row_out[url_index]
                crossparser_tools.write_to_log('No categories created for product: ' + url)
                return
            row_out[i] = ','.join(categs_ids)
        # Set up attributes: Brand, Gender, Season:
        if current_row_title == '_ATTRIBUTES_':
            global global_prod_gender
            row = '"Обувь|Пол|' + global_prod_gender.replace('"', '') + '\n'
            index = export_fields_array.index('_MANUFACTURER_')
            prod_brand = row_out[index].replace('"', '')
            row += 'Обувь|Бренд|' + prod_brand + '\n'
            prod_season = 'Лето'
            row += 'Обувь|Сезон|' + prod_season + '"'
            row_out[i] = row
        # Save this store link
        if current_row_title == '_EAN_':
            row_out[i] = current_site
        if current_row_title == '_LOCATION_':
            link = cell.replace(current_site, '').replace('https', '').replace('http', '').replace('www', '')
            link_id = crossparser_tools.get_only_letters(link)
            with open(data_folder + 'partner_links', 'a+') as partner_links_file:
                link_no_http = cell.replace('https://', '').replace('http://', '').replace('www.', '')
                if current_site not in partner_links:
                    crossparser_tools.write_to_log('No partner link for site: ' + current_site)
                    return
                part_link = partner_links[current_site]
                deeplink = part_link + '?ulp=http%3A%2F%2F' + link_no_http
                link_row = link_id + '$$' + deeplink + '\n'
                partner_links_file.write(link_row)
            row_out[i] = link_id
    # </>
    for i in range(len(row_out)):
        row_out[i] = str(row_out[i])
    csvwriter.write(';'.join(row_out) + '\n')
    global items_counter_converted
    items_counter_converted += 1
    global csv_out_data_counter
    csv_out_data_counter += 1

crossparser_tools.write_to_log(' ******* Start import files *******')
import_files()

def start_checking():
    is_done = True
    global tabs_delay
    global websites_parsed
    for site, handle in window_handles.items():
        try:
            if websites_parsed[site] == True:
                continue
            driver.switch_to.window(handle)
            parse_time = datetime.datetime.now() - tabs_delay[site]
            # Check if parsing hasn't started
            if '/start/' in driver.current_url:
                if parse_time > datetime.timedelta(minutes=1.0):
                    current_link = current_links[site]
                    crossparser_tools.write_to_log("Parsing hasn't started. Failed to parse link: " + current_link)
                    # Start parse new link:
                    is_done = check_parse_new_link(site, is_done)
            # Check if parsing of the link has lasted too long
            if parse_time > max_parse_time:
                print('parsing of website ' + site + ' took more than an hour')
                if check_exists_by_css_selector('#cancelBtn'):
                    btn = WebDriverWait(driver, 30).until(
                        EC.element_to_be_clickable((By.CSS_SELECTOR, "#cancelBtn")))
                    driver.execute_script(
                        "arguments[0].scrollIntoView();$('#header').remove();", btn)
                    btn.click()
                    sleep(2)
                # Start parse new link:
                is_done = check_parse_new_link(site, is_done)
            else:
                pass
                #print('still parsing ' + str(parse_time))
            if check_exists_by_css_selector('#content .products-menu .export-button') == False:
                # Tab still parsing, skip
                is_done = False
            else:
                # Parsing complete. Download catalog
                filename1 = download_catalogs(site, True)
                # Download secondary prices catalog
                if filename1 != '':
                    btn = WebDriverWait(driver, 30).until(
                        EC.element_to_be_clickable((By.CSS_SELECTOR, "div.ui-dialog.ui-widget button.ui-dialog-titlebar-close")))
                    driver.execute_script(
                        "arguments[0].scrollIntoView();$('#header').remove();", btn)
                    btn.click()
                    btn = WebDriverWait(driver, 30).until(
                        EC.element_to_be_clickable((By.CSS_SELECTOR, "#changePrice")))
                    driver.execute_script(
                        "arguments[0].scrollIntoView();$('#header').remove();", btn)
                    btn.click()
                    btn = WebDriverWait(driver, 30).until(
                        EC.element_to_be_clickable((By.CSS_SELECTOR, "#price > option:nth-child(2)")))
                    driver.execute_script(
                        "arguments[0].scrollIntoView();$('#header').remove();", btn)
                    btn.click()
                    btn = WebDriverWait(driver, 30).until(
                        EC.element_to_be_clickable((By.CSS_SELECTOR, "#applyChangePriceBtn")))
                    driver.execute_script(
                        "arguments[0].scrollIntoView();$('#header').remove();", btn)
                    btn.click()
                    sleep(2)
                    filename2 = download_catalogs(site, False)
                    # Unite prices:
                    if filename2 != '':
                        unite_prices(filename1, filename2)
                # Start parse new link:
                is_done = check_parse_new_link(site, is_done)
        except Exception as e:
            crossparser_tools.write_to_log('failed to check or initiate link of site: ' + site)
            current_link = current_links[site]
            crossparser_tools.write_to_log('Failed to parse link: ' + current_link)
            crossparser_tools.write_to_log(str(e))
    sleep(5)
    if is_done == False:
        start_checking()

def json_to_csv(data, csvexportfile):
    csv_out_data = []
    csv_header = []
    # A single-element response is an API error payload, not product data
    if len(data) == 1:
        crossparser_tools.write_to_log(data['detail'])
        return ''
    for product in data:
        product = product['post']
        row_out = [''] * max([len(product), len(csv_header)])
        for attr, value in product.items():
            if attr not in csv_header:
                csv_header.append(attr)
            if attr in csv_header:
                # Optional PRE-processing of input fields
                if attr == 'products_imgs' or attr == 'prod_categories':
                    value = ','.join(value)
                if attr == 'all_native_sizes':
                    # Clear disabled sizes from all natives:
                    if 'disabled_native_sizes' in product:
                        disabled_native_sizes = product['disabled_native_sizes']
                        if disabled_native_sizes != '':
                            for size in disabled_native_sizes:
                                if size in value:
                                    value.pop(value.index(size))
                    value = '|'.join(value)
                if current_site == 'sportmaster.ru':
                    if attr == 'products_imgs':
                        value = value.split("'")
                        imgs = []
                        for chunk in value:
                            if 'https://cdn.sptmr.ru' in chunk:
                                chunk = chunk.replace('resize_cache/', '').replace('/${width}_${height}_1', '')
                                imgs.append(chunk)
                        value = ','.join(imgs)
                if attr == 'products_imgs':
                    imgs = value.split(',')
                    checked_imgs = []
                    for img in imgs:
                        check = image_check(img)
                        if check != '':
                            checked_imgs.append(check)
                    if len(checked_imgs) == 0:
                        value = ''
                    else:
                        value = ','.join(checked_imgs)
                row_out[csv_header.index(attr)] = value
        csv_out_data.append(row_out)
    parse_header(csv_header, csvexportfile)
    return csv_out_data

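# NOTE: parse_header() (called above) is defined outside this section. Judging
# from make_csv()/parse_row(), it records the input column order and writes the
# export header row; the version below is a hypothetical minimal sketch, not
# the actual implementation.
def parse_header_sketch(csv_header, csvexportfile):
    global table_titles_list
    table_titles_list = csv_header  # parse_row() maps input columns via this list
    header_out = []
    for attr, value in export_fields.items():
        header_out.append(value)
    csvexportfile.write(';'.join(header_out) + '\n')
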
def parse_row(row, csvwriter):
    global csv_out_data_counter
    n_import = len(table_titles_list)
    n_export = len(export_fields)
    i = 0
    row_out = [''] * n_export
    # Format export row
    for cell in row:
        if i >= n_import:
            print("col out of range of header: ", cell)
            break
        for attr, value in export_fields.items():
            if attr == table_titles_list[i]:
                row_out[export_fields_nums[attr]] = cell.replace('"', '').replace('\n', '')
        i = i + 1
    if print_rows_formating == True:
        print("row formatted for export: ", row_out)
    # Customize special fields (such as Size, etc)
    # <Optional loop>
    i = -1
    for cell in row_out:
        i += 1
        current_row_title = export_fields_array[i]
        if current_row_title == '_DESCRIPTION_' or current_row_title == '_NAME_' or current_row_title == '_MANUFACTURER_':
            row_out[i] = '"' + cell.strip() + '"'
        # Format Size:
        if current_row_title == '_OPTIONS_':
            new_size_cell = '"'
            cell = cell.replace('\n', '').replace('\r', '')
            sizes_arr = cell.split('|')
            for size in sizes_arr:
                new_size_cell += 'select|Размер|'
                new_size_cell += size
                new_size_cell += '|1|1000|1|+|0.0000|+|0|+|0.00\n'
            row_out[i] = new_size_cell + '"'
        # Format Price:
        if current_row_title == '_PRICE_':
            row_out[i] = ''.join(re.findall(r'\d+', cell.replace(' ', '')))
        if current_row_title == '_SPECIAL_':
            special_price = ''.join(re.findall(r'\d+', cell.replace(' ', '')))
            if special_price != '':
                price_index = export_fields_array.index('_PRICE_')
                curr_price = row_out[price_index]
                row_out[price_index] = special_price
                special_price = '.'.join(
                    re.findall(r'\d+', curr_price.replace(' ', '').replace('\n', '').replace('\r', '')))
                row_out[i] = '1,0,' + str(special_price) + '.00,0000-00-00,0000-00-00'
        # Copy SKU to Model:
        if current_row_title == '_MODEL_':
            sku_index = export_fields_array.index('_SKU_')
            curr_sku = row_out[sku_index]
            row_out[i] = curr_sku
        # Set primary (first) image:
        if current_row_title == '_IMAGE_':
            sku_index = export_fields_array.index('_IMAGES_')
            curr_sku = row_out[sku_index]
            if curr_sku == '':
                sku_index = export_fields_array.index('_LOCATION_')
                curr_sku = row_out[sku_index]
                crossparser_tools.write_to_log('No images collected for product: ' + curr_sku)
                # Decline product without imgs:
                return
            curr_sku = curr_sku.split(',')
            row_out[i] = curr_sku[0]
            curr_sku.pop(0)
            row_out[sku_index] = ','.join(curr_sku)
        if current_row_title == '_QUANTITY_':
            row_out[i] = str(99999)
        # Create category:
        if current_row_title == '_CATEGORY_ID_':
            index = export_fields_array.index('_MANUFACTURER_')
            prod_brand = row_out[index]
            index = export_fields_array.index('_NAME_')
            prod_name = row_out[index]
            prod_categs = cell
            index = export_fields_array.index('_PRICE_')
            prod_price = 999999
            if row_out[index] != '':
                prod_price = int(row_out[index])
            index = export_fields_array.index('_IMAGE_')
            prod_image = row_out[index]
            categs_ids = create_category(prod_categs, prod_brand, prod_name, prod_price, prod_image)
            if categs_ids is None:
                categs_ids = ''
            row_out[i] = ','.join(categs_ids)
        # SEO URL:
        if current_row_title == '_SEO_KEYWORD_':
            index = export_fields_array.index('_SKU_')
            prod_sku = row_out[index]
            index = export_fields_array.index('_NAME_')
            prod_name = row_out[index]
            seo_url = to_seo_url(prod_name)
            row_out[i] = seo_url + '-' + prod_sku.lower()
        # Set up attributes: Brand, Gender, Season:
        if current_row_title == '_ATTRIBUTES_':
            global global_prod_gender
            row = '"Обувь|Пол|' + global_prod_gender.replace('"', '') + '\n'
            index = export_fields_array.index('_MANUFACTURER_')
            prod_brand = row_out[index].replace('"', '')
            row += 'Обувь|Бренд|' + prod_brand + '\n'
            prod_season = 'Лето'
            row += 'Обувь|Сезон|' + prod_season + '"'
            row_out[i] = row
        if current_row_title == '_LOCATION_':
            link_id = ''.join(re.findall(r'\w+', cell)).replace('http', '').replace('www', '')
            print(link_id)
    # </>
    if print_rows_formating == True:
        print("out row: ", ';'.join(row_out) + '\n')
    for i in range(len(row_out)):
        row_out[i] = str(row_out[i])
    csvwriter.write(';'.join(row_out) + '\n')
    global items_counter_converted
    items_counter_converted += 1
    csv_out_data_counter += 1

def make_categories_csv():
    global categories_names
    global categories_ids
    global categories_max_id
    global categories_parent_ids
    if len(categories_names) == 0:
        crossparser_tools.write_to_log('Failed to make categories csv. No entries')
        return
    # Baseline "no Cyrillic found" result to compare matches against
    res_eng = regex.search(r'\p{IsCyrillic}', 'lala')
    categs_for_menu = []
    categs_for_menu_urls = []
    categs_filename = temp_folder + 'category_export.csv'
    with open(temp_folder + 'files_categ_import.txt', 'w+', newline='', encoding="utf8") as files_toimport:
        files_toimport.write(categs_filename)
    with open(categs_filename, 'w+', newline='', encoding="utf8") as cat_file:
        row_out = '_ID_;_PARENT_ID_;_NAME_;_META_H1_;_META_TITLE_;_META_KEYWORDS_;_META_DESCRIPTION_;_DESCRIPTION_;_IMAGE_;_SEO_KEYWORD_;_SORT_ORDER_\n'
        cat_file.write(row_out)
        for i_cat in range(len(categories_names)):
            # Form up H1 header:
            categs = categories_names[i_cat].split('|')
            main_subcat = ''
            if len(categs) == 2:
                main_subcat = ';'.join(categs)
            for i in range(len(categs)):
                res = regex.search(r'\p{IsCyrillic}', categs[i])
                if res != res_eng:
                    categs[i] = categs[i].lower()
            # Drop a category name that is contained in a later one
            for i in range(len(categs)):
                for j in range(i + 1, len(categs)):
                    if i > (len(categs) - 1) or j > (len(categs) - 1):
                        break
                    if categs[i] in categs[j]:
                        categs.pop(i)
            seo_title = ''
            is_brand_cat = False
            if categs[0] == 'бренды':
                categs.pop(0)
                is_brand_cat = True
                # Form up SEO Title for brands:
                if len(categs) > 0:
                    seo_title = 'Купить ' + ' '.join(categs) + '. Каталог ' + categs[0] + ' оригинал, цены'
            if len(categs) > 0:
                # Form up SEO Title for gender categs:
                categs[0] = categs[0].capitalize()
                if is_brand_cat == False:
                    seo_title = ' '.join(categs) + ' купить по цене от ' + str(categories_lower_price[categories_ids[i_cat]]) + ' руб. Фото, каталог'
            h1header = ' '.join(categs)
            # SEO URL:
            seo_url = to_seo_url(h1header)
            # Categs for menu:
            if main_subcat != '':
                if main_subcat.capitalize() not in [c[0] for c in categs_for_menu]:
                    num = str(categories_qnt[categories_ids[i_cat]])
                    categs_for_menu.append([main_subcat.capitalize(), seo_url, num])
            row_out = [''] * 11
            row_out[0] = str(categories_ids[i_cat])
            row_out[1] = str(categories_parent_ids[i_cat])
            row_out[2] = categories_names[i_cat]
            row_out[3] = h1header
            row_out[4] = seo_title
            row_out[8] = str(categories_images[categories_ids[i_cat]])
            row_out[9] = seo_url
            row_out[10] = str(categories_qnt[categories_ids[i_cat]])
            cat_file.write(';'.join(row_out) + '\n')
    crossparser_tools.write_to_log('Made csv of categories with ' + str(len(categories_names)) + ' items. Saved to ' + categs_filename)
    categs_for_menu = sorted(categs_for_menu, key=lambda x: x[0], reverse=False)
    # Make menu file for web-site
    with open(data_folder + 'category_menu.txt', 'w+', newline='', encoding="utf8") as cat_file:
        for cat in categs_for_menu:
            cat_file.write(';'.join(cat) + '$$')

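# NOTE: to_seo_url() lives in crossparser_tools. A hypothetical slugifier along
# these lines (the transliteration table below is an assumption, not the
# actual implementation):
def to_seo_url_sketch(text):
    translit = {'а': 'a', 'б': 'b', 'в': 'v', 'г': 'g', 'д': 'd', 'е': 'e',
                'ж': 'zh', 'з': 'z', 'и': 'i', 'й': 'i', 'к': 'k', 'л': 'l',
                'м': 'm', 'н': 'n', 'о': 'o', 'п': 'p', 'р': 'r', 'с': 's',
                'т': 't', 'у': 'u', 'ф': 'f', 'х': 'h', 'ц': 'c', 'ч': 'ch',
                'ш': 'sh', 'щ': 'sch', 'ы': 'y', 'э': 'e', 'ю': 'yu', 'я': 'ya'}
    out = []
    for ch in text.lower().strip():
        if ch in translit:
            out.append(translit[ch])
        elif ch.isascii() and ch.isalnum():
            out.append(ch)
        else:
            out.append('-')
    # Collapse runs of '-' left by spaces and punctuation
    return re.sub(r'-+', '-', ''.join(out)).strip('-')
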
def parsnew():
    # Clear import catalog files (files of files)
    with open(file_of_raw_catalogs, 'w+', newline='', encoding="utf8") as files_toimport:
        files_toimport.close()
    global driver
    parse_websites()
    global websites_parsed
    for attr, value in websites.items():
        websites_parsed[attr] = False
    if credentials['is_server'] == 'no':
        options = webdriver.ChromeOptions()
        temp_folder_downl = 'C:\\Work\\Crossparser\\temp'
        print(temp_folder_downl)
        prefs = {
            "download.default_directory": temp_folder_downl,
            "download.prompt_for_download": False,
            "download.directory_upgrade": True,
            "safebrowsing.enabled": True
        }
        options.add_experimental_option("prefs", prefs)
        chromedriver_path = config_folder + 'chromedriver.exe'
        options.add_argument('--window-size=1200,700')
        driver = webdriver.Chrome(chromedriver_path, chrome_options=options)
    # Unfortunately headless Chrome doesn't save files to the custom folder; Firefox is used for this step only
    if credentials['is_server'] == 'yes':
        profile = webdriver.FirefoxProfile()
        profile.set_preference('browser.download.folderList', 2)  # custom location
        profile.set_preference('browser.download.manager.showWhenStarting', False)
        profile.set_preference('browser.download.dir', temp_folder)
        profile.set_preference('browser.helperApps.neverAsk.saveToDisk', 'text/csv')
        from selenium.webdriver.firefox.options import Options
        firefox_options = Options()
        firefox_options.add_argument('-headless')
        driver = webdriver.Firefox(firefox_profile=profile, options=firefox_options)
    cloud_login()
    start_parse_all()
    start_checking()
    crossparser_tools.write_to_log('Done with cloudparser')
    crossparser_tools.write_to_log('Totally links found: ' + str(counter_links_total))
    crossparser_tools.write_to_log('Successfully parsed: ' + str(counter_links_parsed))
    driver.quit()

if not os.path.exists(website_root + img_folder):
    os.makedirs(website_root + img_folder)
# Clear import catalog files (files of files)
with open(temp_folder + 'files_prod_import.txt', 'w+', newline='', encoding="utf8") as files_toimport:
    files_toimport.close()
with open(temp_folder + 'files_categ_import.txt', 'w+', newline='', encoding="utf8") as files_toimport:
    files_toimport.close()
crossparser_tools.write_to_log('\n\n ********** Script started *********** \n\n')
#start_all_diggers()
#start_digger(digger_id)
#sleep(10)
#parse_new()
checking_all_diggers()
make_categories_csv()
crossparser_tools.write_to_log('parsing process completed')
crossparser_tools.write_to_log('totally parsed: ' + str(items_counter_parsed) + ' items')
crossparser_tools.write_to_log('successfully converted: ' + str(items_counter_converted) + ' items')

def image_check(img):
    global img_counter_existed
    global img_counter_dowloaded
    global img_counter_failed_to_dowload
    global img_counter_dowloaded_size
    global img_counter_dowloaded_size_compressed
    if img in img_db.keys():
        img_counter_existed += 1
        return img_db[img]
    try:
        #img_name = crossparser_tools.get_uniqid_from_url(img, current_site)
        img_name = crossparser_tools.get_rand_uniqid(15)
        img_name = img_name + '.jpg'
        #file_path = website_root + img_folder + img_name
        file_path = img_folder + img_name
        print('downloading img: ' + img)
        print('save to: ' + file_path)
        urllib.request.urlretrieve(img, file_path)
        size = os.stat(file_path).st_size
        img_counter_dowloaded_size += size
        # Pick a JPEG quality by downloaded size; recompress anything over 200 KB
        compression = 100
        if size > 2000000:
            compression = 10
        if size > 1000000 and size < 2000000:
            compression = 20
        if size > 500000 and size < 1000000:
            compression = 50
        if size > 200000 and size < 500000:
            compression = 70
        if size > 200000:
            image = Image.open(file_path)
            image.save(file_path, quality=compression)
        else:
            compression = 0
        size = os.stat(file_path).st_size
        img_counter_dowloaded_size_compressed += size
        img_db[img] = img_module_folder + img_name
        global global_prod_id
        img_db_products[img] = global_prod_id
        # Write to DB
        with open(data_folder + 'img_db', 'a+', newline='', encoding="utf8") as img_dbfile:
            img_dbfile.write(img + '$$' + img_module_folder + img_name + '\n')
        with open(data_folder + 'img_db_prods', 'a+', newline='', encoding="utf8") as img_dbfile:
            img_dbfile.write(img_name + '$$' + global_prod_id + '\n')
        # Add to ElasticSearch
        if credentials['is_server'] == 'yes':
            try:
                image_match_add.add_img(file_path, global_prod_id)
            except Exception as e:
                crossparser_tools.write_to_log('unable to add img to ElasticSearch: ' + img)
                crossparser_tools.write_to_log(str(e))
        img_counter_dowloaded += 1
        return img_module_folder + img_name
    except Exception as e:
        print('unable to download img:', img)
        print(e)
        img_counter_failed_to_dowload += 1
        return ''