예제 #1
0
def cache_html(url, name, attempts=1):
    """Download *url* and store the raw response body in the cache folder.

    The body is written to ``Path(CACHED_FOLDER, name)``. When ``is_captcha``
    flags the response, the function sleeps ``TIMEOUT_SEC * attempts`` seconds
    and tries again with the attempt counter incremented.

    Args:
        url: Address passed to ``requests.get``.
        name: File name (relative to ``CACHED_FOLDER``) for the cached body.
        attempts: 1-based number of the attempt about to be made; callers
            normally leave the default.

    Returns:
        The response body (``response.content``) that was written to disk.

    Raises:
        TimeoutError: If the attempt counter exceeds ``MAX_GET_ATTEMPTS``.
    """
    while True:
        if attempts > MAX_GET_ATTEMPTS:
            logger.critical(f'Tried {MAX_GET_ATTEMPTS} times to get URL {url}')
            raise TimeoutError(f'Tried {MAX_GET_ATTEMPTS} times to get URL {url}')

        logger.info(f'GET: {url}')
        if attempts > 1:
            logger.info(f'attempt: {attempts}')

        # NOTE(review): no ``timeout=`` is passed, so this call can block
        # indefinitely on an unresponsive server -- consider adding one once
        # an appropriate value is agreed on.
        response = requests.get(url, headers=HEADERS())

        if not is_captcha(response.content):
            break

        # Back off for longer on every consecutive captcha before retrying.
        logger.warning(f'Captcha received for url: {url}')
        logger.warning(f'sleeping for {TIMEOUT_SEC * attempts}s...')
        sleep(TIMEOUT_SEC * attempts)
        attempts += 1

    # Only the raw bytes are cached, so no text decoding is involved here.
    target = Path(CACHED_FOLDER, name)
    # Create every missing directory up front; unlike a bare ``os.mkdir`` this
    # copes with nested paths and with a cache folder that already exists.
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_bytes(response.content)

    logger.info(f'Cache name: {name}')
    return response.content
예제 #2
0
async def fetch(session, url):
    """Issue a GET for *url* through *session* and return the body as text.

    The request is sent with the headers produced by ``HEADERS()``; the
    response context is closed before the decoded text is returned.
    """
    request_headers = HEADERS()
    async with session.get(url, headers=request_headers) as reply:
        body = await reply.text()
    return body