def test_all_valid_and_unique_urls_are_returned(self):
    num_urls = 10
    url_list = [self.VALID_URL_FORMAT.format(index)
                for index in range(0, num_urls)]

    processed_list = UrlArgProcessing.process_url_list(url_list)
    assert_equals(num_urls, len(processed_list))
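# Illustration only: a minimal sketch of the behavior the test above assumes
# from UrlArgProcessing.process_url_list (validate each entry, then drop
# duplicates while preserving order). The helper name and the simple
# scheme-prefix check are assumptions for illustration, not the real code.
def _sketch_process_url_list(urls):
    processed = []
    for url in urls:
        # Keep only well-formed, not-yet-seen URLs.
        if url.startswith(("http://", "https://")) and url not in processed:
            processed.append(url)
    return processed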
def test_invalid_urls_are_removed(self):
    num_urls = 5

    # Build a list of valid URLs with a single invalid URL in the middle.
    url_list = [self.VALID_URL_FORMAT.format(index)
                for index in range(0, num_urls)]
    url_list.append(self.INVALID_URL)
    url_list.extend([self.VALID_URL_FORMAT.format(index)
                     for index in range(10, 10 + num_urls)])

    # Only the (num_urls * 2) valid URLs should survive processing.
    processed_list = UrlArgProcessing.process_url_list(url_list)
    assert_equals(num_urls * 2, len(processed_list))
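# Illustrative expectation (an assumption about process_url_list semantics,
# consistent with the test above): invalid entries are dropped while every
# valid URL is preserved, e.g.
#   process_url_list(["https://a.com/1", "not a url", "https://a.com/2"])
#   -> ["https://a.com/1", "https://a.com/2"]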
def test_concat_urls_only_valid_and_unique_urls_are_returned(self):
    num_urls = 10
    concat_urls = 3

    # Build a list of valid URLs, plus one entry made of several URLs
    # concatenated together without separators.
    url_list = [self.VALID_URL_FORMAT.format(index)
                for index in range(0, num_urls)]
    concat = [self.VALID_URL_FORMAT.format(index)
              for index in range(100, 100 + concat_urls)]
    url_list.append(''.join(concat))

    # The concatenated entry should be split back into its component URLs.
    processed_list = UrlArgProcessing.process_url_list(url_list)
    assert_equals(num_urls + concat_urls, len(processed_list))
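# Sketch of the concatenated-URL handling the test above relies on: an entry
# containing multiple scheme markers is split back into separate URLs before
# validation. This helper is hypothetical; only the observable behavior is
# taken from the test.
import re

def _sketch_split_concatenated(url):
    # Re-split on each "http(s)://" marker, keeping the marker attached to
    # the fragment that follows it, and drop the leading empty string.
    return [part for part in re.split(r"(?=https?://)", url) if part]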
def process_and_record_urls(cfg_obj: PdlConfig) -> list:
    """
    Take the generated list of URLs and verify that all URLs are correct and
    free of duplicates (both within the list and against previously
    downloaded URLs). The resulting list is written to a file for archival
    purposes.

    :param cfg_obj: (PdlConfig): Contains the inventory structure.

    :return: List of valid URLs
    """
    url_file = UrlFile()

    # Check for URLs on the CLI
    raw_url_list = getattr(cfg_obj.cli_args, args.ArgOptions.URLS)

    # If no URLs were provided, check whether a URL file was specified
    if not raw_url_list:
        LOG.debug("URL list from CLI is empty.")

        # Check for a URL file specified on the CLI
        url_file_name = getattr(cfg_obj.cli_args, args.ArgOptions.FILE, None)

        # URL file found, so read and store the file contents
        if url_file_name is not None:
            url_file_name = os.path.abspath(url_file_name)
            raw_url_list = url_file.read_file(url_file_name)

        # Otherwise, check whether the --buffer option was specified on the CLI
        elif getattr(cfg_obj.cli_args, args.ArgOptions.BUFFER, False):
            raw_url_list = read_from_buffer()

        # Otherwise, there is no way to proceed, so raise an exception
        else:
            LOG.info(cfg_obj.cli_args)
            LOG.debug(
                "No URL file was specified on the CLI, nor reading from buffer."
            )
            raise NoURLsProvided()

    # Determine the supported URL domains (to remove junk/unexpected URLs)
    url_domains = cfg_obj.app_cfg.get_list(AppCfgFileSections.PROJECT,
                                           AppCfgFileSectionKeys.URL_DOMAINS)

    # Sanitize the URL list (missing spaces, duplicates, valid and accepted URLs)
    cfg_obj.urls = ArgProcessing.process_url_list(raw_url_list, domains=url_domains)

    # Remove duplicates already in the inventory (can be disabled via the CLI)
    if not cfg_obj.cli_args.ignore_dups:
        cfg_obj.urls = remove_duplicate_urls_from_inv(cfg_obj)

    # Determine where to write the file of accepted/sanitized URLs
    url_file_dir = cfg_obj.app_cfg.get(AppCfgFileSections.LOGGING,
                                       AppCfgFileSectionKeys.URL_FILE_DIR)
    url_file_drive = cfg_obj.app_cfg.get(AppCfgFileSections.LOGGING,
                                         AppCfgFileSectionKeys.LOG_DRIVE_LETTER)
    if url_file_drive is not None:
        url_file_dir = f"{url_file_drive}:{url_file_dir}"
        LOG.debug(f"Updated URL File directory for drive letter: {url_file_dir}")

    # If there are URLs available to download after validation, create the URL file.
    # TODO: Write a test verifying the URL file is written when there are URLs,
    #       and that no file is written when there are none.
    if cfg_obj.urls:
        url_file.write_file(urls=cfg_obj.urls, create_dir=True,
                            location=url_file_dir)
    else:
        LOG.info("No URLs for DL, no URL FILE created.")

    return cfg_obj.urls
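# Sketch (assumption) of how remove_duplicate_urls_from_inv, called above,
# could filter out URLs already recorded in the inventory. The real helper is
# defined elsewhere in this module; the "inventory" attribute name and its
# dict-of-URLs shape are illustrative only.
def _sketch_remove_duplicate_urls_from_inv(cfg_obj):
    # Collect URLs already known to the inventory, then keep only new ones.
    downloaded = set(getattr(cfg_obj, "inventory", {}) or {})
    return [url for url in cfg_obj.urls if url not in downloaded]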