async def main():
    category_urls = [
        'https://apolo.com.ua/AppleCo/macbook-air',
        'https://apolo.com.ua/AppleCo/macbook-pro',
        'https://apolo.com.ua/AppleCo/imac',
    ]
    all_category_urls = []
    all_product_urls = []

    # Collect the paginated category pages for every root category URL.
    for urls in await gather(*[get_all_page_category(url) for url in category_urls]):
        all_category_urls += urls

    # Walk the category pages in batches of NUMS_THREADS and collect product URLs.
    for i in tqdm(range(0, len(all_category_urls), NUMS_THREADS)):
        urls = all_category_urls[i:i + NUMS_THREADS]
        answers = await gather(*[get_product_urls_from_category(url) for url in urls])
        for answer in answers:
            all_product_urls += answer

    # Parse every product page in batches and stream the results into the dump file.
    file = open_df(DUMP_FILE)
    for i in tqdm(range(0, len(all_product_urls), NUMS_THREADS)):
        urls = all_product_urls[i:i + NUMS_THREADS]
        answers = await gather(*[parse_product(url) for url in urls])
        for answer in answers:
            file.write(answer)
    file.close()

    # Convert the accumulated JSON dump into a CSV file.
    with open(DUMP_FILE, 'r') as file:
        write_to_csv(load(file))
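Each of these async mains repeats the same chunking loop to keep at most NUMS_THREADS requests in flight at once. A minimal sketch of that pattern factored into a reusable helper (an illustration only; gather_in_chunks is not part of the original code):

from asyncio import gather

async def gather_in_chunks(make_coro, items, chunk_size):
    # Run make_coro(item) for every item, awaiting at most chunk_size
    # coroutines concurrently per batch.
    results = []
    for i in range(0, len(items), chunk_size):
        chunk = items[i:i + chunk_size]
        results += await gather(*[make_coro(item) for item in chunk])
    return results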
def main():
    if input('Use the dump file? [y|n]> ').lower() == 'n':
        # Scrape the product URLs from the site
        catalogs = [
            'https://al-teh.ru/category/bytovye-resheniya-elektroobogreva/',
            'https://al-teh.ru/category/kabelnyj-elektroobogrev/',
            'https://al-teh.ru/category/elektro/',
            'https://al-teh.ru/category/molniezashita/',
            'https://al-teh.ru/category/vzryvozashishennoe-elektrooborudovanie/',
        ]
        product_urls = get_all_product_urls(catalogs)
        with open(DUMP_FILE, 'w') as file:
            dump(product_urls, file)
    else:
        # Load the product URLs from the dump
        with open(DUMP_FILE, 'r') as file:
            product_urls = load(file)

    print('Found {} product URLs'.format(len(product_urls)))

    dump_file = open_df('data_' + DUMP_FILE)
    for url in tqdm(product_urls):
        # Parse the product data and write it to the JSON file right away
        # (for optimisation: already-scraped items are not lost)
        new_data = get_product_data(url)
        if new_data:
            dump_file.write(new_data)
    dump_file.close()

    # Convert the JSON file into CSV
    with open('data_' + DUMP_FILE) as file:
        write_to_csv(load(file))
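The excerpts omit their module-level imports; a plausible header for the synchronous scraper above, assuming dump/load are the standard json functions and the progress bar comes from tqdm, would be:

# Assumed imports (not shown in the excerpt).
from json import dump, load
from tqdm import tqdm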
async def main():
    product_urls = await get_all_product_urls()

    dump_file = open_df(DUMP_FILE)
    for i in range(0, len(product_urls), NUMS_THREAD):
        urls = product_urls[i:i + NUMS_THREAD]
        answers = await gather(*[get_product_data(url) for url in urls])
        for answer in answers:
            if answer:
                dump_file.write(answer)
    dump_file.close()
async def main():
    product_urls = await get_all_product_urls()

    file_dump = open_df(DUMP_FILE)
    # Fetch product data in batches of NUMS_THREADS concurrent requests.
    for i in tqdm(range(0, len(product_urls), NUMS_THREADS)):
        urls = product_urls[i:i + NUMS_THREADS]
        answers = await gather(*[get_product_data(url) for url in urls])
        for answer in answers:
            if answer:
                file_dump.write(answer)
    file_dump.close()

    # Convert the accumulated JSON dump into a CSV file.
    with open(DUMP_FILE, 'r') as file:
        write_to_csv(load(file))
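The async mains in this section need an event loop to run; a hypothetical entry point (the excerpts do not show one) could look like this:

import asyncio

if __name__ == '__main__':
    # Drive the coroutine main() defined above on a fresh event loop.
    asyncio.run(main())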