Пример #1
0
 def test_all_valid_and_unique_urls_are_returned(self):
     """Check that a list of distinct, valid URLs is returned in full.

     Ten URLs are built from ``VALID_URL_FORMAT`` with indices 0-9, and the
     processed list is expected to have the same length.
     """
     expected_count = 10

     candidates = []
     for idx in range(expected_count):
         candidates.append(self.VALID_URL_FORMAT.format(idx))

     result = UrlArgProcessing.process_url_list(candidates)
     assert_equals(expected_count, len(result))
Пример #2
0
 def test_invalid_urls_are_removed(self):
     """Check that one invalid URL among valid ones is dropped.

     Two runs of five valid URLs (indices 0-4 and 10-14) surround a single
     ``INVALID_URL`` entry; only the ten valid URLs are expected back.
     """
     batch_size = 5
     build = self.VALID_URL_FORMAT.format

     leading = [build(n) for n in range(batch_size)]
     trailing = [build(n) for n in range(10, 10 + batch_size)]
     # The invalid entry sits between the two valid runs.
     candidates = leading + [self.INVALID_URL] + trailing

     result = UrlArgProcessing.process_url_list(candidates)
     assert_equals(2 * batch_size, len(result))
Пример #3
0
    def test_concat_urls_only_valid_and_unique_urls_are_returned(self):
        """Check that URLs run together in a single entry are each counted.

        Ten separate valid URLs are followed by one entry made of three more
        valid URLs joined with no separator; all thirteen are expected back
        from ``process_url_list``.
        """
        separate_count, joined_count = 10, 3
        build = self.VALID_URL_FORMAT.format

        candidates = [build(n) for n in range(separate_count)]
        # Glue the extra URLs together without any delimiter.
        candidates.append(
            ''.join(build(n) for n in range(100, 100 + joined_count))
        )

        result = UrlArgProcessing.process_url_list(candidates)
        assert_equals(separate_count + joined_count, len(result))
Пример #4
0
def _gather_raw_urls(cfg_obj: PdlConfig, url_file) -> list:
    """
    Pick the raw URL list from the first available source.

    Sources are tried in order: URLs given on the CLI, a URL file named on
    the CLI, then the buffer (when the buffer option is set).

    :param cfg_obj: (PdlConfig): Supplies the parsed CLI arguments.
    :param url_file: Object whose ``read_file`` loads URLs from a file path.

    :return: The raw, unsanitized list of URLs.

    :raises NoURLsProvided: None of the three sources was specified.

    """
    cli_args = cfg_obj.cli_args

    # Source 1: URLs passed directly on the CLI.
    from_cli = getattr(cli_args, args.ArgOptions.URLS)
    if from_cli:
        return from_cli
    LOG.debug("URL list from CLI is empty.")

    # Source 2: a URL file named on the CLI (read via its absolute path).
    source_path = getattr(cli_args, args.ArgOptions.FILE, None)
    if source_path is not None:
        return url_file.read_file(os.path.abspath(source_path))

    # Source 3: the buffer, when the buffer option was given.
    if getattr(cli_args, args.ArgOptions.BUFFER, False):
        return read_from_buffer()

    # Nothing to work with: log the CLI arguments for diagnosis and bail out.
    LOG.info(cli_args)
    LOG.debug(
        "No URL file was specified on the CLI, nor reading from buffer."
    )
    raise NoURLsProvided()


def process_and_record_urls(cfg_obj: PdlConfig) -> list:
    """
    Build the final list of URLs to download and archive it to a URL file.

    The raw URLs (from the CLI, a URL file, or the buffer) are sanitized
    against the configured URL domains, optionally stripped of URLs already
    present in the inventory, stored on ``cfg_obj.urls``, and written to the
    URL file directory when the resulting list is not empty.

    :param cfg_obj: (PdlConfig): Contains the CLI arguments, the application
        configuration and the inventory structure; its ``urls`` attribute is
        overwritten with the result.

    :return: List of valid URLs

    :raises NoURLsProvided: No URLs, URL file or buffer option was given.

    """
    archive = UrlFile()
    candidates = _gather_raw_urls(cfg_obj, archive)

    # Only URLs from the supported domains are kept (drops junk/unexpected URLs).
    allowed_domains = cfg_obj.app_cfg.get_list(
        AppCfgFileSections.PROJECT, AppCfgFileSectionKeys.URL_DOMAINS)

    # Sanitize the list (missing spaces, duplicates, valid and accepted URLs).
    cfg_obj.urls = ArgProcessing.process_url_list(
        candidates, domains=allowed_domains)

    # Drop URLs already in the inventory unless disabled via the CLI.
    if not cfg_obj.cli_args.ignore_dups:
        cfg_obj.urls = remove_duplicate_urls_from_inv(cfg_obj)

    # Work out where the file of accepted/sanitized URLs should be written.
    url_file_dir = cfg_obj.app_cfg.get(
        AppCfgFileSections.LOGGING, AppCfgFileSectionKeys.URL_FILE_DIR)
    url_file_drive = cfg_obj.app_cfg.get(
        AppCfgFileSections.LOGGING, AppCfgFileSectionKeys.LOG_DRIVE_LETTER)
    if url_file_drive is not None:
        # Prefix the directory with the configured drive letter.
        url_file_dir = f"{url_file_drive}:{url_file_dir}"
        LOG.debug(
            f"Updated URL File directory for drive letter: {url_file_dir}")

    # TODO: Add a test verifying the URL file is written when there are URLs
    # and that no file is written when there are none.
    if not cfg_obj.urls:
        LOG.info("No URLs for DL, no URL FILE created.")
        return cfg_obj.urls

    # URLs remain after validation, so create the URL file.
    archive.write_file(urls=cfg_obj.urls,
                       create_dir=True,
                       location=url_file_dir)
    return cfg_obj.urls