import argparse
import asyncio
import logging
import os
import re
import sys

import aiohttp
from bs4 import BeautifulSoup as soup

# project-internal helpers (dbManager, get_page, get_guessed_file_address,
# load_compressed_object, save_compressed_object, str_to_compressed,
# compressed_to_str) are assumed to be imported from the project's own modules.

logger = logging.getLogger(__name__)  # assumed module-level logger


def collect_data_id_from_resource(pages, base, patterns):
    """
    general collection of ids from list pages

    :param pages: list of list-page urls to scan
    :param base: base url of the resource, prepended to relative links
    :param patterns: regex patterns, each with one capture group for the id
    :return: list of collected ids
    """
    logger.info(f'start collecting ids from {base}')
    new_ids = []
    pages_compressed_html = download_pages(pages)
    for page in pages:
        logger.debug(f'collecting ids from {page}')
        souped_page = soup(compressed_to_str(pages_compressed_html.pop(page)),
                           features='lxml')
        for pattern in patterns:
            # collect links that match the pattern (with or without the base url)
            links = [
                tag['href'] for tag in souped_page.find_all(
                    'a', {'href': re.compile(f'({base})?{pattern}')})
            ]
            # make relative links absolute
            links = [
                base + link if link.find('http') == -1 else link
                for link in links
            ]
            # drop links that embed another http url (redirects to other hosts)
            links = [link for link in links if link[5:].find('http') == -1]
            # strip query strings
            links = [re.sub(r'/?\?.*', '', link) for link in links]
            # extract the id from each link via the pattern's capture group
            new_ids += [
                re.search(f'{base}{pattern}', link).group(1)
                for link in links
            ]
    return new_ids
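# A minimal usage sketch for collect_data_id_from_resource. The resource url
# and the id pattern below are hypothetical; the point illustrated is that each
# pattern must contain exactly one capture group, because ids are pulled out
# with .group(1).
def _example_collect_ids():
    pages = ['https://example.com/list?page=1', 'https://example.com/list?page=2']
    base = 'https://example.com'
    patterns = [r'/title/(tt\d+)/']  # one capture group -> the id
    ids = collect_data_id_from_resource(pages, base, patterns)
    logger.info(f'collected {len(ids)} ids, e.g. {ids[:5]}')
    return ids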
def make_soup(url):
    """
    get the BeautifulSoup object of this page

    for new urls: download the html and save it as a compressed file in downloaded pages
    for old urls: load the html from the saved file into memory

    :param url (str): the url of the page that we want
    :returns BeautifulSoup object: content of the page at the given url
    """
    if isinstance(url, list):
        raise MemoryError(
            'to avoid memory overflow please use the download_pages function '
            'to download a list of pages')
    file_address = get_guessed_file_address(url)
    if os.path.isfile(file_address):
        logger.debug(f'already downloaded {url}')
        page_html = compressed_to_str(load_compressed_object(file_address))
    else:
        logger.debug(f'start downloading {url}')
        page_html = get_page(url)
        save_compressed_object(file_address, str_to_compressed(page_html))
    return soup(page_html, features='lxml')
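# A short sketch of make_soup in use; the url and the tag looked up below are
# hypothetical. The first call downloads and caches the page; later calls for
# the same url are served from the compressed cache on disk.
def _example_make_soup():
    page = make_soup('https://example.com/title/tt0000001/')
    heading = page.find('h1')
    logger.info(f'page heading: {heading.text.strip() if heading else "not found"}')
    return page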
def arg_parse():
    '''
    command line interface
    '''
    parser = argparse.ArgumentParser(
        description='command line interface for the dataset manager.')
    parser.add_argument(
        '-r',
        '--run',
        dest='function',
        default=None,
        type=str,
        help='name of function to run',
    )
    parser.add_argument(
        '-db',
        '--db_name',
        dest='db',
        default=None,
        type=str,
        help='name of dataset',
    )
    parser.add_argument(
        '-log',
        '--log_level',
        dest='log_level',
        default=None,
        type=str,
        help='level of log',
    )
    args = parser.parse_args()
    if args.function:
        dataset = dbManager.dataset(args.db)
        logger.debug(f'running arg with args.function = {args.function}')
        if args.log_level:
            log_level = args.log_level
            if log_level == 'debug':
                logger.setLevel(logging.DEBUG)
            elif log_level == 'info':
                logger.setLevel(logging.INFO)
            elif log_level == 'error':
                logger.setLevel(logging.ERROR)
            elif log_level == 'critical':
                logger.setLevel(logging.CRITICAL)
            else:
                logger.setLevel(logging.WARNING)
        if args.function in ['st', 'start']:
            dataset.start()
        elif args.function in ['dr', 'download_resource']:
            dataset.download_resources()
        elif args.function in ['ip', 'init_project']:
            dbManager.init_project()
        elif args.function in ['fd', 'find_db']:
            dataset.find_ids()
        elif args.function in ['ud', 'update_db']:
            dataset.update()
        elif args.function in ['sct', 'schema_test']:
            dataset.schema_test()
    # return True if any argument other than a bare log flag was passed
    if (len(sys.argv) == 1) or (len(sys.argv) == 2 and sys.argv[1] == '-log'):
        return False
    else:
        return True
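# Example command line invocations accepted by arg_parse; the script name and
# the dataset name are hypothetical, the flags and function aliases come from
# the parser above:
#
#   python run.py -r start -db example_db -log debug
#   python run.py -r update_db -db example_db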
def download_pages(url_list, workers=50, try_count=10, delay=1, return_bool=True):
    """
    download a list of urls and cache them on disk

    :param url_list: list of urls that we want to download
    :param workers: number of urls downloaded concurrently in each batch
    :param try_count: number of attempts per url before giving up
    :param delay: seconds to wait between attempts
    :param return_bool: if True, return the downloaded pages
    :return: dict of {url: compressed_html} if return_bool, otherwise None
    """

    def split_list(input_list, step):
        # split the input list into chunks of at most `step` items
        return [
            input_list[i - step:i]
            for i in range(step, len(input_list) + step, step)
        ]

    async def single_page_downloader(url, try_count, delay):
        """
        download one page by sending a get request to the url
        save the page and return it as a compressed string
        """
        file_address = get_guessed_file_address(url)
        try:
            output = {url: load_compressed_object(file_address)}
            logger.debug(f'already downloaded {url}')
            return output if return_bool else None
        except FileNotFoundError:
            logger.debug(f'start downloading {url}')
        for i in range(try_count):
            try:
                async with aiohttp.ClientSession(
                        connector=aiohttp.TCPConnector()) as session:
                    async with session.get(url) as resp:
                        site_html = await resp.text()
                        compressed_html = str_to_compressed(site_html)
                        save_compressed_object(file_address, compressed_html)
                        output = {url: compressed_html}
                        return output if return_bool else None
            except Exception as error:
                logger.error(
                    f'try {i + 1}/{try_count} failed when downloading {url}: {error}')
                await asyncio.sleep(delay)
        # urls that could not be downloaded end up here
        logger.error(
            f'download FAILED! could not get {url} after {try_count} tries')

    async def async_handler(url_list, workers, try_count, delay, return_bool):
        """
        make tasks and run them batch by batch
        :return: dict of {url: compressed_html}
        """
        logger.debug(f'number of input urls for download: {len(url_list)}')
        url_list = list(set(url_list))
        logger.debug(
            f'len of url_list after removing duplicates = {len(url_list)}')
        url_batches = split_list(url_list, workers)
        responses = {}
        for urls in url_batches:
            tasks = [
                asyncio.ensure_future(
                    single_page_downloader(url, try_count, delay))
                for url in urls
            ]
            res_list = await asyncio.gather(*tasks)
            if return_bool:
                # failed downloads return None, so skip them
                responses.update({
                    url: html
                    for res in res_list if res is not None
                    for url, html in res.items()
                })
        return responses if return_bool else None

    # main function
    logger.debug(
        f'start download_pages for url_list = '
        f'{url_list if len(url_list) < 2 else str(url_list[:2]).replace("]", ", ...]")} '
        f'len={len(url_list)}')
    loop = asyncio.new_event_loop()
    task = loop.create_task(
        async_handler(url_list, workers, try_count, delay, return_bool))
    response = loop.run_until_complete(task)
    loop.close()
    return response if return_bool else None
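# A minimal usage sketch for download_pages; the urls are hypothetical. Pages
# come back compressed, so compressed_to_str is needed before parsing them.
def _example_download_pages():
    urls = [f'https://example.com/title/tt{i:07d}/' for i in range(1, 4)]
    pages = download_pages(urls, workers=2, try_count=3, delay=1)
    for url, compressed_html in pages.items():
        page = soup(compressed_to_str(compressed_html), features='lxml')
        logger.info(f'{url} -> title: {page.title.text.strip() if page.title else "?"}')
    return pages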