def main_validate_and_list(args):
    """ __file__ validate [source warcfiles] # WarcReader(..., strict_mode=True, check_digest=True) """
    level = 'INFO'
    url_index = validate_warc_file(args.source_warcfile, Logger(console_level=level, logfile_level=level))
    if args.command == 'listurls':
        for url in url_index:
            print(url)
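# A minimal usage sketch (not part of the library): the Namespace attributes mirror the args accesses in
#  main_validate_and_list() above, and the WARC file name is a made-up placeholder. With command='validate' the
#  WARC is only checked, with command='listurls' the indexed URLs are printed as well.
def _demo_validate_and_list():
    from argparse import Namespace
    main_validate_and_list(Namespace(command='listurls', source_warcfile='example_archive.warc.gz'))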
def main_cat_and_sample(args):
    """ __file__ sample [source warcfiles or None] [urls list file or stdin] [target warcfile] [Online or Offline] """
    main_logger = Logger()
    out_dir = getattr(args, 'out_dir', None)
    target_warcfile = getattr(args, 'target_warcfile', None)
    target = out_dir if out_dir is not None else target_warcfile
    main_logger.log('INFO', 'Adding URLs to', target, ':')
    offline = getattr(args, 'offline', True)  # Sample can be online or offline, but we write WARC only when sampling!
    sample_warc_by_urls(args.source_warcfile, args.url_input_stream, main_logger, target_warcfile=target_warcfile,
                        offline=offline, out_dir=out_dir, just_cache=(args.command == 'cat'))
    main_logger.log('INFO', 'Done!')
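# A minimal usage sketch (assumed file names): the attributes mirror the args accesses in main_cat_and_sample()
#  above. The 'cat' and 'sample' subcommands share this code path; only the just_cache flag passed on to
#  sample_warc_by_urls() differs. url_input_stream is assumed to be an open text stream with one URL per line.
def _demo_cat():
    from argparse import Namespace
    with open('urls_to_keep.txt', encoding='UTF-8') as url_fh:  # Placeholder URL list file
        main_cat_and_sample(Namespace(command='cat', source_warcfile='example_archive.warc.gz',
                                      url_input_stream=url_fh, target_warcfile='example_subset.warc.gz',
                                      out_dir=None, offline=True))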
def main_download(args):
    """ __file__ download [URL] [target warcfile] """
    main_logger = Logger()
    main_logger.log('INFO', 'Adding URL to', args.target_warcfile, ':')
    online_test(args.source_url, args.target_warcfile, main_logger)
    main_logger.log('INFO', 'Done!')
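# A minimal usage sketch (assumed URL and file name): downloads a single page into a new WARC via online_test().
def _demo_download():
    from argparse import Namespace
    main_download(Namespace(source_url='https://example.com/article.html', target_warcfile='single_page.warc.gz'))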
def read_portalspec_config(configs_dir, portal_name, warc_dir, warc_name, log_dir, run_params=None,
                           logfile_level='INFO', console_level='INFO'):
    """Read and validate all input files and runtime parameters"""
    if run_params is None:
        run_params = {}

    # Init logfile
    log_filename = os_path_join(log_dir, f'tei_writing_{portal_name}.log')
    check_exists(log_dir, check_fun=isdir, message='Directory not found')
    tei_logger = Logger(log_filename=log_filename, logfile_mode='w', logfile_level=logfile_level,
                        console_level=console_level)

    task_name = run_params.get('task_name')
    if task_name is None:
        tei_logger.log('CRITICAL', 'task_name is not set in run_params!')
        exit(1)
    tei_logger.log('INFO', f'Running {task_name}')
    tei_logger.log('INFO', f'Processing {portal_name}')

    # Portal specific config (python) stuff
    portal_spec_module_fn = os_path_join(configs_dir, portal_name, f'{portal_name}_specific.py')
    check_exists(portal_spec_module_fn, tei_logger)
    blacklist_spec, multipage_compile, next_page_of_article_fun, get_meta_fun_spec, article_root_params, \
        decompose_spec, excluded_tags_spec, portal_url_prefix, link_filter_spec, links, block_rules_spec, \
        bigram_rules_spec = get_portal_spec_fun_and_dict_names(portal_spec_module_fn, tei_logger)

    # WARC reading stuff
    warc_name = os_path_join(warc_dir, warc_name)
    check_exists(warc_name, tei_logger)
    warc_date_interval = {}  # Actually the maximal date interval for HTTP responses in the WARC file
    warc_level_params = (warc_name, blacklist_spec, multipage_compile, tei_logger, warc_date_interval,
                         next_page_of_article_fun)

    # Portal specific TSV dictionaries stuff
    if run_params.get('w_specific_dicts', False):
        tei_logger.log('INFO', 'Loading portal specific dicts')
        text_tags_normal_fn = os_path_join(configs_dir, portal_name, f'{portal_name}_text_tags_normal.tsv')
        check_exists(text_tags_normal_fn, tei_logger)
        notext_tags_normal_fn = os_path_join(configs_dir, portal_name, f'{portal_name}_notext_tags_normal.tsv')
        check_exists(notext_tags_normal_fn, tei_logger)
        tag_normal_dict, portal_specific_block_rules = \
            load_portal_specific_dicts(text_tags_normal_fn, notext_tags_normal_fn, block_rules_spec, tei_logger)
    else:
        tei_logger.log('INFO', 'Not loading portal specific dicts')
        tag_normal_dict, portal_specific_block_rules = None, None

    # Base TEI XML file reading stuff
    if run_params.get('w_specific_tei_base_file', False):
        tei_logger.log('INFO', 'Loading portal specific TEI base file')
        tei_base_dir_and_name = os_path_join(configs_dir, portal_name, f'{portal_name}_BASE.xml')
        check_exists(tei_base_dir_and_name, tei_logger)
        portal_xml_string = read_portal_tei_base_file(tei_base_dir_and_name, tei_logger)
    else:
        tei_logger.log('INFO', 'Not loading portal specific TEI base file')
        portal_xml_string = None

    write_out_mode = run_params.get('write_out_mode')
    write_out_mode_fun = None
    write_out_mode_file = WRITE_OUT_MODES.get(write_out_mode)
    if write_out_mode is not None and write_out_mode_file is None:
        tei_logger.log('CRITICAL',
                       f'{write_out_mode} is not in the allowed value set ({set(WRITE_OUT_MODES.keys())})!')
        exit(1)
    elif write_out_mode is not None:
        # Here we import optional libraries only if they are needed later
        write_out_mode_fun = getattr(import_python_file(write_out_mode_file), 'process_article')
        tei_logger.log('INFO', f'Using {write_out_mode} write mode')

    return tei_logger, warc_level_params, get_meta_fun_spec, article_root_params, decompose_spec, excluded_tags_spec, \
        portal_url_prefix, link_filter_spec, links, block_rules_spec, bigram_rules_spec, tag_normal_dict, \
        portal_specific_block_rules, portal_xml_string, write_out_mode_fun
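# A minimal usage sketch (assumed directory layout and portal name): only 'task_name' is mandatory in run_params,
#  the other keys mirror the run_params.get() lookups in read_portalspec_config() above. The function returns a
#  15-element tuple; here only the first two elements are picked out for brevity.
def _demo_read_portalspec_config():
    config = read_portalspec_config(configs_dir='configs', portal_name='example_portal', warc_dir='warcs',
                                    warc_name='example_portal_articles.warc.gz', log_dir='logs',
                                    run_params={'task_name': 'tei_writing', 'w_specific_dicts': True,
                                                'w_specific_tei_base_file': True, 'write_out_mode': None})
    tei_logger, warc_level_params = config[0], config[1]  # The rest are portal-specific extraction settings
    return tei_logger, warc_level_params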
class NewsArticleCrawler:
    """
        1) Get the list of articles (e.g. NewsArchiveCrawler)
        2) Download article pages
        3) Extract the text of articles from raw HTML
        4) Filter articles by date (depends on corpus converter)
        5) Save them in corpus format (depends on corpus converter)
        6) Follow the links on page (depends on corpus converter)
    """
    def __init__(self, settings, articles_existing_warc_filenames, articles_new_warc_filename,
                 archive_existing_warc_filenames, archive_new_warc_filename, articles_just_cache=False,
                 archive_just_cache=False, known_article_urls=None, debug_params=None, download_params=None):
        # Initialise the logger
        self._logger = Logger(settings['log_file_articles'])

        # Open files for writing gathered URLs if needed
        self._new_urls = set()
        self._new_good_urls_fh, self._new_urls_add = \
            add_and_write_factory(self._new_urls, settings.get('new_good_urls'))
        self.problematic_article_urls = set()
        self._new_problematic_urls_fh, self._problematic_article_urls_add = \
            add_and_write_factory(self.problematic_article_urls, settings.get('new_problematic_urls'))

        # Store values at init-time
        self._filter_by_date = settings['FILTER_ARTICLES_BY_DATE']
        if self._filter_by_date:
            # Global date intervals, only if they are explicitly set!
            self._date_from = settings['date_from']
            self._date_until = settings['date_until']

        # Get the initialised corpus converter (can be dummy) and set the appropriate logger
        self._converter = settings['CORPUS_CONVERTER']
        self._converter.logger = self._logger

        # Create new archive while downloading, or simulate download and read the archive
        self._downloader = WarcCachingDownloader(articles_existing_warc_filenames, articles_new_warc_filename,
                                                 self._logger, articles_just_cache, download_params)

        if known_article_urls is None:  # If None is supplied, copy the ones from the article archive
            known_article_urls = self._downloader.url_index  # All URLs in the archive are known good!

        if archive_just_cache and articles_just_cache:
            # Full offline mode for processing articles only, without the archive
            self._archive_downloader = NewsArchiveDummyCrawler(self._downloader.url_index)
        else:
            # known_bad_urls are common between the NewsArchiveCrawler and the NewsArticleCrawler
            # For downloading the articles from a (possibly read-only) archive
            self._archive_downloader = NewsArchiveCrawler(settings, archive_existing_warc_filenames,
                                                          archive_new_warc_filename, archive_just_cache,
                                                          known_article_urls, debug_params, download_params)

    def __del__(self):
        if hasattr(self, '_archive_downloader'):  # Make sure that the previous files are closed...
            del self._archive_downloader
        if hasattr(self, '_new_good_urls_fh') and hasattr(self._new_good_urls_fh, 'close'):
            self._new_good_urls_fh.close()
        if hasattr(self, '_new_problematic_urls_fh') and hasattr(self._new_problematic_urls_fh, 'close'):
            self._new_problematic_urls_fh.close()

    def _is_problematic_url(self, url):
        # Explicitly marked as bad URL (either Article or Archive) OR
        # Download failed in this session and requires manual check (either Article or Archive)
        return url in self._downloader.bad_urls or url in self._archive_downloader.bad_urls or \
            url in self.problematic_article_urls or url in self._archive_downloader.problematic_urls

    def _is_processed_good_url(self, url):
        # New good URL newly downloaded (either Article or Archive)
        # We do not count old good URLs (url_index) taken from the cache WARC (either Article or Archive),
        #  as they need to be copied to the target WARC!
        return url in self._downloader.good_urls or url in self._archive_downloader.good_urls

    def download_and_extract_all_articles(self):
        self.process_urls(self._archive_downloader.url_iterator())

    def process_urls(self, it):
        urls = set()
        for url in it:
            urls.add(url)
            # This loop runs only one iteration if no URLs are extracted in step (6), else it consumes them first
            while len(urls) > 0:
                url = urls.pop()
                # 1) Check if the URL is
                # 1a) Explicitly marked as bad URL (either Article or Archive) -> Skip it, only INFO log!
                if url in self._downloader.bad_urls or url in self._archive_downloader.bad_urls:
                    self._logger.log('DEBUG', url, 'Skipping URLs explicitly marked as bad!', sep='\t')
                    continue
                # 1b) Download succeeded in this session either Article or Archive (duplicate)
                # 1c) Download failed in this session and requires manual check either Article or Archive (duplicate)
                elif self._is_processed_good_url(url) or \
                        url in self.problematic_article_urls or url in self._archive_downloader.problematic_urls:
                    self._logger.log('WARNING', url, 'Not processing URL, because it is an URL already'
                                                     ' encountered in this session (including the caches)'
                                                     ' or it is known to point to the portal\'s archive!', sep='\t')
                    continue

                # 2) "Download" article
                article_raw_html = self._downloader.download_url(url)
                if article_raw_html is None:  # Download failed, must be investigated!
                    self._logger.log('ERROR', url, 'Article was not processed because download failed!', sep='\t')
                    self._problematic_article_urls_add(url)  # New problematic URL for manual checking
                    continue
                self._new_urls_add(url)  # New article URLs

                # 3) Identify the site scheme of the article to be able to look up the appropriate extracting method
                scheme = self._converter.identify_site_scheme(url, article_raw_html)

                # 4) Filter: time filtering when archive page URLs are not generated by date, if needed
                if self._filter_by_date:
                    # a) Retrieve the date
                    article_date = self._converter.extract_article_date(url, article_raw_html, scheme)
                    if article_date is None:
                        self._logger.log('ERROR', url, 'DATE COULD NOT BE PARSED!', sep='\t')
                        continue
                    # b) Check date interval
                    elif not self._date_from <= article_date <= self._date_until:
                        self._logger.log('WARNING', url,
                                         'Date ({0}) is not in the specified interval: {1}-{2},'
                                         ' didn\'t use it in the corpus'.format(article_date, self._date_from,
                                                                                self._date_until), sep='\t')
                        continue

                # 5) Extract text to corpus
                self._converter.article_to_corpus(url, article_raw_html, scheme)

                # 6) Extract links to other articles and check for already extracted URLs (also in the archive)?
                urls_to_follow = self._converter.follow_links_on_page(url, article_raw_html, scheme)
                # Only add those which have not been already handled to avoid loops!
                urls |= {url for url in urls_to_follow
                         if not self._is_processed_good_url(url) and not self._is_problematic_url(url)}
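# A minimal usage sketch (assumed WARC file names, settings prepared elsewhere): the settings dictionary must
#  contain at least the keys read in NewsArticleCrawler.__init__() above and NewsArchiveCrawler.__init__() below
#  (log files, CORPUS_CONVERTER, columns, URL formats, extractor functions, etc.).
def _demo_article_crawl(settings):
    crawler = NewsArticleCrawler(settings,
                                 articles_existing_warc_filenames='articles_cache.warc.gz',  # Placeholder names
                                 articles_new_warc_filename='articles_new.warc.gz',
                                 archive_existing_warc_filenames='archive_cache.warc.gz',
                                 archive_new_warc_filename='archive_new.warc.gz',
                                 articles_just_cache=False, archive_just_cache=False)
    crawler.download_and_extract_all_articles()  # Runs steps (1)-(6) from the class docstring
    return crawler.problematic_article_urls  # URLs that need manual checking afterwards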
class NewsArchiveCrawler:
    """
        Using the provided regexes
        1) Generates URLs of lists of articles (archives)
        2) Extracts URLs of articles from these lists (with helper functions and config)
    """
    def __init__(self, settings, existing_archive_filenames, new_archive_filename, archive_just_cache=False,
                 known_article_urls=None, debug_params=None, downloader_params=None):
        # List the used properties
        self._archive_page_urls_by_date = None
        self._archive_url_format = None
        self._date_from = None
        self._date_until = None
        self._go_reverse_in_archive = None
        self._min_pagenum = None
        self._initial_page_num = None
        self._ignore_archive_cache = None
        self._infinite_scrolling = None
        self._extract_article_urls_from_page_fun = None
        self._find_next_page_url = None

        # Save the original settings for using it with all columns
        self._settings = settings
        # Get columns, if there is only one we use None to keep default settings
        self._columns = settings['columns']

        # Initialise the logger
        if debug_params is None:
            debug_params = {}
        self._logger = Logger(settings['log_file_archive'], **debug_params)

        # Open files for writing gathered URLs if needed
        self.good_urls = set()
        self._new_good_archive_urls_fh, self._good_urls_add = \
            add_and_write_factory(self.good_urls, settings.get('new_good_archive_urls'))
        self.problematic_urls = set()
        self._new_problematic_archive_urls_fh, self._problematic_urls_add = \
            add_and_write_factory(self.problematic_urls, settings.get('new_problematic_archive_urls'))

        # Setup the list of cached article URLs to stop archive crawling in time
        self.known_article_urls = set()
        if known_article_urls is not None:
            if isinstance(known_article_urls, str):
                with open(known_article_urls, encoding='UTF-8') as fh:
                    self.known_article_urls = {line.strip() for line in fh}
            elif isinstance(known_article_urls, set):
                self.known_article_urls = known_article_urls

        # Create new archive while downloading, or simulate download and read the archive
        self._downloader = WarcCachingDownloader(existing_archive_filenames, new_archive_filename, self._logger,
                                                 archive_just_cache, downloader_params)
        # Known bad URLs (read-only, available at __init__ time)
        self.bad_urls = self._downloader.bad_urls
        # Known good URLs (read-only, available at __init__ time from cache)
        self.url_index = self._downloader.url_index

    def _store_settings(self, column_spec_settings):
        # Settings for URL iterator
        self._archive_page_urls_by_date = self._settings['archive_page_urls_by_date']
        self._archive_url_format = column_spec_settings['archive_url_format']
        if self._archive_page_urls_by_date:
            self._date_from = column_spec_settings['DATE_FROM']
            self._date_until = column_spec_settings['DATE_UNTIL']
            self._go_reverse_in_archive = self._settings['go_reverse_in_archive']

        # Settings for gen_article_urls_including_subpages()
        self._min_pagenum = column_spec_settings['min_pagenum']
        self._initial_page_num = column_spec_settings['INITIAL_PAGENUM']
        self._ignore_archive_cache = self._settings['ignore_archive_cache']
        self._infinite_scrolling = self._settings['infinite_scrolling']
        self._extract_article_urls_from_page_fun = self._settings['EXTRACT_ARTICLE_URLS_FROM_PAGE_FUN']

        # Store the constant parameters for the actual function used later
        self._find_next_page_url = \
            self._find_next_page_url_factory(self._settings['EXTRACT_NEXT_PAGE_URL_FUN'],
                                             self._settings['next_url_by_pagenum'],
                                             self._settings['infinite_scrolling'],
                                             column_spec_settings['max_pagenum'],
                                             self._settings['new_article_url_threshold'],
                                             self.known_article_urls)

    def __del__(self):  # Write newly found URLs to files when output files are supplied...
        # Save the good URLs...
        if hasattr(self, '_new_good_archive_urls_fh') and hasattr(self._new_good_archive_urls_fh, 'close'):
            self._new_good_archive_urls_fh.close()
        if hasattr(self, '_new_problematic_archive_urls_fh') and hasattr(self._new_problematic_archive_urls_fh,
                                                                         'close'):
            self._new_problematic_archive_urls_fh.close()

    def url_iterator(self):
        """
            The URL generation logic. We have one or more base URLs to the archive (or to the column archives
             if there are more), completed with the portal-specific ending.
            The archive can be stored in groups (mostly by creation date) or ordered as a flat, paginated list.
            These two main methods can also be mixed.
            Pagination can be implemented in various ways, see the appropriate function for details.
            :return: Every page of the archive contains multiple URLs to the actual articles,
             which are extracted and then returned as an iterator of URLs.
        """
        for column_name, params in self._columns.items():
            self._logger.log('INFO', 'Starting column:', column_name)
            # 1) Set params for the actual column
            self._store_settings(params)
            # 2) By date with optional pagination (that is handled separately)
            if self._archive_page_urls_by_date:
                # a) Unique the generated archive page URLs using every day from date_from to the end of date_until
                archive_page_urls = list(set(self._gen_url_from_date(self._date_from + timedelta(days=curr_day),
                                                                     self._archive_url_format)
                                             for curr_day in range((self._date_until - self._date_from).days + 1)))
                # b) Sort the generated archive page URLs
                archive_page_urls.sort(reverse=self._go_reverse_in_archive)
            # 3) Stored in groups represented by pagination only which will be handled separately
            else:
                archive_page_urls = [self._archive_url_format]  # Only the base URL is added

            # 4) Iterate the archive URLs and process them, while generating the required page URLs on demand
            for archive_page_url in archive_page_urls:
                yield from self._gen_article_urls_including_subpages(archive_page_url)

    @staticmethod
    def _gen_url_from_date(curr_date, url_format):
        """
            Generates the URL of a page that contains the URLs of articles published on that day.
            This function allows URLs to be grouped by year or month as there is no guarantee that all fields exist.
            We also enable using a one-day open-ended interval of dates,
             e.g. from 2018-04-04 to 2018-04-05 (not included).
            One must place #year, #month, #day and #next-year, #next-month, #next-day labels
             into the url_format variable.
        """
        next_date = curr_date + timedelta(days=1)  # Plus one day (open ended interval): vs.hu, hvg.hu
        art_list_url = url_format. \
            replace('#year', '{0:04d}'.format(curr_date.year)). \
            replace('#month', '{0:02d}'.format(curr_date.month)). \
            replace('#day', '{0:02d}'.format(curr_date.day)). \
            replace('#next-year', '{0:04d}'.format(next_date.year)). \
            replace('#next-month', '{0:02d}'.format(next_date.month)). \
            replace('#next-day', '{0:02d}'.format(next_date.day))

        return art_list_url

    def _gen_article_urls_including_subpages(self, archive_page_url_base):
        """
            Generates article URLs from a supplied URL including the on-demand sub-pages that contain article URLs
        """
        page_num = self._min_pagenum
        first_page = True

        next_page_url = archive_page_url_base.replace('#pagenum', self._initial_page_num)
        while next_page_url is not None:
            archive_page_raw_html = self._downloader.download_url(next_page_url, self._ignore_archive_cache)
            curr_page_url = next_page_url
            if archive_page_raw_html is not None:  # Download succeeded
                self._good_urls_add(next_page_url)
                # 1) We need article URLs here to reliably determine the end of pages in some cases
                article_urls = self._extract_article_urls_from_page_fun(archive_page_raw_html)
                if len(article_urls) == 0 and (not self._infinite_scrolling or first_page):
                    self._logger.log('WARNING', next_page_url, 'Could not extract URLs from the archive!', sep='\t')
                # 2) Generate next-page URL or None if there should not be any
                next_page_url = self._find_next_page_url(archive_page_url_base, page_num, archive_page_raw_html,
                                                         article_urls)
            else:  # Download failed
                if next_page_url not in self.bad_urls and next_page_url not in self._downloader.good_urls and \
                        next_page_url not in self._downloader.url_index:
                    # URLs in url_index should not be a problem
                    self._problematic_urls_add(next_page_url)  # New possibly bad URL
                next_page_url = None
                article_urls = []
            page_num += 1
            self._logger.log('DEBUG', 'URLs/ARCHIVE PAGE', curr_page_url, len(article_urls), sep='\t')
            yield from article_urls
            first_page = False

    @staticmethod
    def _find_next_page_url_factory(extract_next_page_url_fun, next_url_by_pagenum, infinite_scrolling,
                                    max_pagenum, art_url_threshold, known_article_urls):

        def find_next_page_url_spec(archive_page_url_base, page_num, raw_html, article_urls):
            """
                The next URL can be determined by various conditions (no matter how the pages are grouped):
                 1) If there is no pagination, we return None
                 2) If there is a "next page" link, we find it and use that
                 3) If there is "infinite scrolling", we use pagenum from base to infinity (=No article URLs detected)
                 4) If there is only page numbering, we use pagenum from base to a config-specified maximum
                 5) If there is only page numbering, we expect the archive to move during (partial) crawling
            """
            # Method #1: No pagination (default) or no page left
            next_page_url = None
            # Method #2: Use special function to follow the link to the next page
            if extract_next_page_url_fun is not None:
                next_page_url = extract_next_page_url_fun(raw_html)
            elif (next_url_by_pagenum and  # There is page numbering
                  # Method #3: No link, but infinite scrolling! (also good for inactive archive, without other clues)
                  ((infinite_scrolling and len(article_urls) > 0) or
                   # Method #4: Has predefined max_pagenum! (also good for inactive archive, with known max_pagenum)
                   (max_pagenum is not None and page_num <= max_pagenum) or
                   # Method #5: Active archive, just pages -> We allow intersecting elements
                   #  as the archive may have been moved
                   (art_url_threshold is not None and
                    (len(known_article_urls) == 0 or
                     len(article_urls - known_article_urls) > art_url_threshold)))):
                next_page_url = archive_page_url_base.replace('#pagenum', str(page_num))  # Must generate URL

            return next_page_url

        return find_next_page_url_spec
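# A worked example (made-up portal URL) of the #... label substitution performed by _gen_url_from_date():
#  the daily labels are filled in here, while #pagenum is left for _gen_article_urls_including_subpages()
#  and the generated _find_next_page_url() function.
def _demo_gen_url_from_date():
    from datetime import date
    url_format = 'https://example-portal.hu/archive/#year/#month/#day/?page=#pagenum'
    # Returns 'https://example-portal.hu/archive/2018/04/04/?page=#pagenum'
    return NewsArchiveCrawler._gen_url_from_date(date(2018, 4, 4), url_format)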