Example No. 1
def commit(self):
    """Sync scraped attribute values onto the mapped DB instance and commit if anything changed."""
    something_changed = False
    if not self.image.exists:
        self.image.commit()
    # MAPPER pairs a DB column name (k) with the corresponding scraped attribute (v).
    for k, v in self.MAPPER:
        k_value, v_value = getattr(self.db_instance, k), getattr(self, v)
        if str(k_value) != str(v_value):
            setattr(self.db_instance, k, v_value)
            something_changed = True
            logger.info(f'{k} changed from {k_value} to {v_value}')
    if something_changed:
        self.session.add(self.db_instance)
        self.session.commit()
    else:
        print(f'Nothing was changed for {self.db_instance.id}')
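The surrounding class, its MAPPER table and the image/db_instance attributes are not shown in this example. A purely hypothetical sketch of what the mapping might look like (the field names below are placeholders, not taken from the original project):

# Hypothetical sketch of the mapping commit() iterates over: each entry pairs a
# column name on db_instance (left) with the attribute on the scraper object
# holding the freshly scraped value (right).
MAPPER = (
    ('title', 'scraped_title'),
    ('price', 'scraped_price'),
    ('description', 'scraped_description'),
)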
Example No. 2
async def get_data(url):
    if is_redirect(url):
        logger.warning(f'URL is a redirect: {url}')
        raise IsRedirectError
    # The cache dict maps a URL to the file name of its cached HTML.
    with open(CACHE_DICT_PATH, 'r') as f:
        cache_dict = json.load(f)
    file_name = cache_dict.get(url, None)
    if not file_name:
        # Not cached yet: fetch the page, store it, and register it in the cache dict.
        file_name = '{}.html'.format(str(uuid4())[:8])
        contents = await cache_html(url, file_name)
        cache_dict[url] = file_name
        with open(CACHE_DICT_PATH, 'w') as f:
            json.dump(cache_dict, f)
    else:
        try:
            with open(f'{CACHED_FOLDER}/{file_name}', 'r') as f:
                contents = f.read()
            logger.info(f'Using cached: {file_name} for url: {url}')
        except FileNotFoundError:
            # Stale cache entry: forget it and retry, which re-downloads the page.
            del cache_dict[url]
            with open(CACHE_DICT_PATH, 'w') as f:
                json.dump(cache_dict, f)
            return await get_data(url)
    return contents
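The is_redirect helper and the IsRedirectError exception are not part of this snippet. One possible implementation, assuming the project's requests dependency is available (a sketch, not the original code):

# Assumed helpers, not taken from the original project: a redirect check via a
# HEAD request and a custom exception type.
import requests

class IsRedirectError(Exception):
    """Raised when the requested URL answers with a redirect."""

def is_redirect(url):
    # allow_redirects=False so the original response status is inspected,
    # not the status of the redirect target.
    response = requests.head(url, allow_redirects=False)
    return response.is_redirect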
Example No. 3
def cache_html(url, name, attempts=1):
    # Optionally route requests through a local SOCKS proxy (e.g. Tor):
    # proxies = {
    #     'http': 'socks5://127.0.0.1:9050',
    # }

    if attempts > MAX_GET_ATTEMPTS:
        logger.critical(f'Tried {MAX_GET_ATTEMPTS} times to get URL {url}')
        raise TimeoutError(f'Tried {MAX_GET_ATTEMPTS} times to get URL {url}')
    logger.info(f'GET: {url}')
    if attempts > 1:
        logger.info(f'attempt: {attempts}')

    site = requests.get(url, headers=HEADERS())
    site.encoding = 'utf-8'

    # On a captcha page, back off with a linearly growing delay and retry.
    if is_captcha(site.content):
        logger.warning(f'Captcha received for url: {url}')
        logger.warning(f'sleeping for {TIMEOUT_SEC * attempts}s...')
        sleep(TIMEOUT_SEC * attempts)
        return cache_html(url, name, attempts=attempts + 1)

    # Make sure the cache folder exists, then write the raw page bytes to disk.
    Path(CACHED_FOLDER).mkdir(parents=True, exist_ok=True)
    with open(Path(CACHED_FOLDER, name), 'wb') as out:
        out.write(site.content)
    logger.info(f'Cache name: {name}')
    return site.content
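The module-level constants and helpers used above (MAX_GET_ATTEMPTS, TIMEOUT_SEC, CACHED_FOLDER, HEADERS, is_captcha) are not shown in the snippet. A minimal sketch of what they might look like; the concrete values and the captcha marker are assumptions, not taken from the original project:

# Assumed module-level setup for the snippet above.
import logging
import random
from pathlib import Path
from time import sleep

import requests

logger = logging.getLogger(__name__)

MAX_GET_ATTEMPTS = 5
TIMEOUT_SEC = 30
CACHED_FOLDER = 'cached_pages'

USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
    'Mozilla/5.0 (X11; Linux x86_64)',
]

def HEADERS():
    # Rotate the User-Agent on every request.
    return {'User-Agent': random.choice(USER_AGENTS)}

def is_captcha(content):
    # Placeholder check; a real implementation would match the site's actual captcha markers.
    if isinstance(content, bytes):
        content = content.decode('utf-8', errors='ignore')
    return 'captcha' in content.lower()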
Example No. 4
def get_data(url, recreate_cache_forced=False, **kwargs):
    # The cache dict maps a URL to the file name of its cached HTML.
    cache_dict = get_cache_dict()
    file_name = cache_dict.get(url, None)
    if not file_name:
        # Not cached yet: fetch the page, store it, and register it in the cache dict.
        file_name = get_unique_file_name(CACHED_FOLDER, 'html')
        contents = cache_html(url, file_name)
        cache_dict[url] = file_name
        with open(CACHE_DICT_PATH, 'w') as f:
            json.dump(cache_dict, f)
    else:
        file_path = Path(CACHED_FOLDER, file_name)
        try:
            if recreate_cache_forced:
                # Drop the cached file and fall through to the FileNotFoundError branch.
                file_path.unlink()
                logger.info(f'File: {file_path} removed')
                raise FileNotFoundError
            with open(file_path, 'r') as f:
                contents = f.read()
            logger.info(f'Using cached: {file_name} for url: {url}')
        except FileNotFoundError:
            # Stale or removed cache entry: forget it and retry, which re-downloads the page.
            del cache_dict[url]
            with open(CACHE_DICT_PATH, 'w') as f:
                json.dump(cache_dict, f)
            return get_data(url)
    return contents
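The get_cache_dict and get_unique_file_name helpers are not shown here. A sketch reconstructed from how they are called above and from the uuid-based naming in Example No. 2 (assumed, not the original implementations):

# Assumed helpers for the snippet above.
import json
import os
from uuid import uuid4

def get_cache_dict():
    # Load the URL -> file name mapping, starting empty if the file does not exist yet.
    try:
        with open(CACHE_DICT_PATH, 'r') as f:
            return json.load(f)
    except FileNotFoundError:
        return {}

def get_unique_file_name(folder, extension):
    # Short uuid-based name, regenerated until it does not clash with an existing file.
    while True:
        name = f'{str(uuid4())[:8]}.{extension}'
        if not os.path.exists(os.path.join(folder, name)):
            return name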
Example No. 5
async def cache_html(url, name, attempts=1):
    if attempts > MAX_GET_ATTEMPTS:
        logger.critical(f'Tried {MAX_GET_ATTEMPTS} times to get URL {url}')
        raise TimeoutError(f'Tried {MAX_GET_ATTEMPTS} times to get URL {url}')
    logger.info(f'GET: {url}')
    if attempts > 1:
        logger.info(f'attempt: {attempts}')
    # Fetch the page body with a fresh aiohttp session.
    async with aiohttp.ClientSession() as session:
        site = await fetch(session, url)
    # On a captcha page, back off with a linearly growing delay and retry.
    if is_captcha(site):
        logger.warning(f'Captcha received for url: {url}')
        logger.warning(f'Sleeping for {TIMEOUT_SEC * attempts}s...')
        await asyncio.sleep(TIMEOUT_SEC * attempts)
        return await cache_html(url, name, attempts=attempts + 1)
    # Write the page text to the cache folder under the given name.
    with open(f'{CACHED_FOLDER}/{name}', 'w') as out:
        out.write(site)
    logger.info(f'Cache name: {name}')
    return site
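The fetch helper is not included in this example; it most likely wraps the standard aiohttp request pattern, as in this sketch (an assumption, not the project's actual code):

# Assumed implementation of the fetch helper used above: issue a GET inside the
# given aiohttp session and return the response body as text.
import aiohttp

async def fetch(session, url):
    async with session.get(url) as response:
        return await response.text()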
Example No. 6
        download_item(i, category_name)


def download_list_all(url_object):
    category_name, url = url_object.get('category_name'), url_object['url']
    soup = get_soup(url)
    pages_count = get_pages_count(soup)

    logger.debug(f'Found {pages_count} pages for list url {url}')
    logger.debug('download_list::getting data from page 1')
    download_page(soup, category_name)

    # Page 1 was already fetched above; walk the remaining pages by appending the page parameter.
    for page_num in range(2, pages_count + 1):
        logger.debug(f'download_list::getting data from page {page_num}')
        new_url = '{}&{}'.format(url, PAGE_PARAM.format(page_num))
        soup = get_soup(new_url)
        download_page(soup, category_name)


if __name__ == '__main__':
    link_set = get_link_set()
    # link_set = json.load(open(LINK_SET_PATH, 'r', encoding='utf8'))
    logger.info('START\n')
    # random.shuffle(link_set)
    DOWNLOAD_IMAGES = input('DOWNLOAD_IMAGES? (Y/N)\n').lower() == 'y'
    RESOLVE_OTHER_SHOP_URL = input('RESOLVE_OTHER_SHOP_URL? (Y/N)\n').lower() == 'y'
    # Process every list URL until the work queue is exhausted.
    while link_set:
        url_object_from_json = link_set.pop()
        logger.debug(f'url_object_from_json {str(url_object_from_json)}')
        download_list_all(url_object_from_json)
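get_link_set is not shown in this example. Judging from the commented-out fallback in the main block, it probably just loads the list of URL objects from LINK_SET_PATH; a sketch under that assumption:

# Assumed helper, inferred from the commented-out json.load fallback above:
# load the list of {'category_name': ..., 'url': ...} objects from LINK_SET_PATH.
import json

def get_link_set():
    with open(LINK_SET_PATH, 'r', encoding='utf8') as f:
        return json.load(f)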