Example #1
# Helpers get_all_page_category, get_product_urls_from_category, parse_product, open_df and
# write_to_csv, plus the constants DUMP_FILE and NUMS_THREADS, are assumed to be defined
# elsewhere in the project.
from asyncio import gather
from json import load
from tqdm import tqdm


async def main():
    category_urls = ['https://apolo.com.ua/AppleCo/macbook-air',
                     'https://apolo.com.ua/AppleCo/macbook-pro',
                     'https://apolo.com.ua/AppleCo/imac']
    all_category_urls = []
    all_product_urls = []

    # Collect every paginated page of each category concurrently
    for urls in await gather(*[get_all_page_category(url) for url in category_urls]):
        all_category_urls += urls

    # Extract product URLs from the category pages, NUMS_THREADS pages at a time
    for i in tqdm(range(0, len(all_category_urls), NUMS_THREADS)):
        urls = all_category_urls[i:i + NUMS_THREADS]  # slicing clamps at the end of the list
        answers = await gather(*[get_product_urls_from_category(url) for url in urls])
        for answer in answers:
            all_product_urls += answer

    # Parse the product pages in batches and write each result to the JSON dump right away
    file = open_df(DUMP_FILE)
    for i in tqdm(range(0, len(all_product_urls), NUMS_THREADS)):
        urls = all_product_urls[i:i + NUMS_THREADS]
        answers = await gather(*[parse_product(url) for url in urls])
        for answer in answers:
            file.write(answer)
    file.close()

    # Convert the JSON dump to CSV
    with open(DUMP_FILE, 'r') as file:
        write_to_csv(load(file))
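All of these examples write scraped records through an object returned by open_df() and later read the same file back with json.load(), so open_df() presumably builds a valid JSON array incrementally. The helper itself is never shown; the sketch below is one hypothetical way it could work, not the authors' actual code.

from json import dumps


class _DumpWriter:
    # Hypothetical reconstruction of open_df(): writes each record into a JSON
    # array on disk so that json.load() can read the whole file back later.
    def __init__(self, path):
        self._file = open(path, 'w', encoding='utf-8')
        self._file.write('[')
        self._first = True

    def write(self, record):
        if not self._first:
            self._file.write(',\n')
        self._file.write(dumps(record, ensure_ascii=False))
        self._first = False

    def close(self):
        self._file.write(']')
        self._file.close()


def open_df(path):
    return _DumpWriter(path)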
Example #2
# Helpers get_all_product_urls, get_product_data, open_df and write_to_csv, plus the
# constant DUMP_FILE, are assumed to be defined elsewhere in the project.
from json import dump, load
from tqdm import tqdm


def main():
    if input('Use the dump? [y|n]> ').lower() == 'n':  # Scrape product URLs from the site
        catalogs = ['https://al-teh.ru/category/bytovye-resheniya-elektroobogreva/',
                    'https://al-teh.ru/category/kabelnyj-elektroobogrev/',
                    'https://al-teh.ru/category/elektro/',
                    'https://al-teh.ru/category/molniezashita/',
                    'https://al-teh.ru/category/vzryvozashishennoe-elektrooborudovanie/']
        product_urls = get_all_product_urls(catalogs)
        with open(DUMP_FILE, 'w') as file:
            dump(product_urls, file)
    else:  # Load the URLs from the dump
        with open(DUMP_FILE, 'r') as file:
            product_urls = load(file)

    print('Found {} product URLs'.format(len(product_urls)))

    # Parse the data and write each record to the JSON file immediately (for optimization)
    dump_file = open_df('data_' + DUMP_FILE)
    for url in tqdm(product_urls):
        new_data = get_product_data(url)
        if new_data:
            dump_file.write(new_data)
    dump_file.close()

    # Convert the JSON file to CSV
    with open('data_' + DUMP_FILE) as file:
        write_to_csv(load(file))
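write_to_csv() is also left undefined. Assuming the dump is a list of flat dictionaries sharing the same keys, a minimal version could use csv.DictWriter; the output path data.csv below is an illustrative choice, not taken from the examples.

from csv import DictWriter


def write_to_csv(products, path='data.csv'):
    # Assumes `products` is a list of flat dicts with identical keys
    if not products:
        return
    with open(path, 'w', newline='', encoding='utf-8') as file:
        writer = DictWriter(file, fieldnames=list(products[0].keys()))
        writer.writeheader()
        writer.writerows(products)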
Example #3
# Helpers get_all_product_urls, get_product_data and open_df, plus the constants
# DUMP_FILE and NUMS_THREAD, are assumed to be defined elsewhere in the project.
from asyncio import gather


async def main():
    product_urls = await get_all_product_urls()

    # Fetch product data in batches of NUMS_THREAD concurrent requests and
    # write each non-empty result straight to the JSON dump
    dump_file = open_df(DUMP_FILE)
    for i in range(0, len(product_urls), NUMS_THREAD):
        urls = product_urls[i:i + NUMS_THREAD]  # slicing clamps at the end of the list

        answers = await gather(*[get_product_data(url) for url in urls])

        for answer in answers:
            if answer:
                dump_file.write(answer)
    dump_file.close()
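get_product_data() is used here but never defined. Judging by the `if answer:` check, it returns a dict on success and something falsy on failure. A sketch of such a coroutine built on aiohttp and BeautifulSoup is given below; the CSS selectors are purely illustrative, since the real markup depends on the target shop.

import aiohttp
from bs4 import BeautifulSoup


async def get_product_data(url):
    # Hypothetical sketch: fetch the page and return a dict, or None on any failure
    try:
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as response:
                if response.status != 200:
                    return None
                html = await response.text()
    except aiohttp.ClientError:
        return None

    soup = BeautifulSoup(html, 'html.parser')
    title = soup.select_one('h1')        # illustrative selectors; adjust to the real markup
    price = soup.select_one('.price')
    return {
        'url': url,
        'title': title.get_text(strip=True) if title else '',
        'price': price.get_text(strip=True) if price else '',
    }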
Example #4
# Helpers get_all_product_urls, get_product_data, open_df and write_to_csv, plus the
# constants DUMP_FILE and NUMS_THREADS, are assumed to be defined elsewhere in the project.
from asyncio import gather
from json import load
from tqdm import tqdm


async def main():
    product_urls = await get_all_product_urls()

    # Fetch product data in batches of NUMS_THREADS concurrent requests
    file_dump = open_df(DUMP_FILE)
    for i in tqdm(range(0, len(product_urls), NUMS_THREADS)):
        urls = product_urls[i:i + NUMS_THREADS]  # slicing clamps at the end of the list
        answers = await gather(*[get_product_data(url) for url in urls])

        for answer in answers:
            if answer:
                file_dump.write(answer)
    file_dump.close()

    # Convert the JSON dump to CSV
    with open(DUMP_FILE, 'r') as file:
        write_to_csv(load(file))
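The batch loop used in Examples #1, #3 and #4 always waits for the slowest request in a batch before starting the next one. A common alternative for capping concurrency is asyncio.Semaphore; the sketch below assumes the same get_product_data() coroutine and introduces a hypothetical fetch_all() wrapper.

from asyncio import Semaphore, gather


async def fetch_all(product_urls, limit=10):
    # Hypothetical alternative to fixed-size batches: at most `limit`
    # get_product_data() calls are in flight at any moment.
    semaphore = Semaphore(limit)

    async def fetch_one(url):
        async with semaphore:
            return await get_product_data(url)

    return await gather(*[fetch_one(url) for url in product_urls])

gather() preserves order, so the results line up with product_urls and falsy entries can still be filtered out before writing them to the dump.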