Example #1
 def test_validate_url_with_protocol_mixed_case(self):
     url = self.VALID_URL
     protocol = self.VALID_PROTOCOL.capitalize()
     valid = UrlArgProcessing.validate_url(url=url, protocol=protocol)
     assert_true(
         valid,
         f"Valid URL ({url}) was marked as invalid with valid protocol ({protocol})."
     )
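
These snippets reference class-level fixture constants (VALID_URL, VALID_PROTOCOL, VALID_URL_FORMAT, INVALID_URL, VALID_DOMAIN, INVALID_DOMAIN, UNIQUE_URL_LIST) and nose-style assertion helpers whose definitions are not shown. A minimal sketch of what such a fixture might look like is below; the constant values, the helper import, and the import path of UrlArgProcessing are assumptions for illustration, not the PDL project's actual layout.

# Hypothetical test fixture sketch -- values and import paths are illustrative only.
from nose.tools import assert_true, assert_false, assert_equals, assert_not_equals

from PDL.processing.url_processing import UrlArgProcessing  # assumed import path


class TestUrlArgProcessing(object):
    VALID_PROTOCOL = 'https'
    VALID_DOMAIN = 'example.com'
    INVALID_DOMAIN = 'bogus.example'
    VALID_URL = 'https://example.com/images/photo_1.jpg'
    INVALID_URL = 'this_is_not_a_url'
    VALID_URL_FORMAT = 'https://example.com/images/photo_{0}.jpg'
    UNIQUE_URL_LIST = [
        'https://example.com/images/photo_{0}.jpg'.format(index)
        for index in range(5)
    ]
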
Example #2
 def test_validate_url_with_protocol_as_url(self):
     url = self.VALID_PROTOCOL
     protocol = self.VALID_PROTOCOL
     valid = UrlArgProcessing.validate_url(url=url, protocol=protocol)
     assert_false(
         valid,
         f"Invalid URL ({url}) was marked as valid with valid protocol ({protocol})."
     )
Example #3
 def test_validate_url_with_invalid_protocol(self):
     url = self.VALID_URL
     protocol = "\r\ninvalid"
     valid = UrlArgProcessing.validate_url(url=url, protocol=protocol)
     assert_false(
         valid,
         f"Valid URL ({url}) was marked as valid with invalid protocol ({protocol})."
     )
Example #4
 def test_all_valid_and_unique_urls_are_returned(self):
     num_urls = 10
     url_list = [
         self.VALID_URL_FORMAT.format(index)
         for index in range(0, num_urls)
     ]
     processed_list = UrlArgProcessing.process_url_list(url_list)
     assert_equals(num_urls, len(processed_list))
Example #5
 def test_validate_invalid_url_with_valid_protocol_and_valid_domain(self):
     url = self.INVALID_URL
     domain = self.VALID_DOMAIN
     protocol = self.VALID_PROTOCOL
     valid = UrlArgProcessing.validate_url(
         url=url, protocol=protocol, domains=[domain, self.INVALID_DOMAIN])
     assert_false(valid,
                  (f"Invalid URL ({url}) was marked as valid with valid "
                   f"protocol ({protocol}) and valid domain ({domain})."))
Example #6
    def test_duplicate_counts_per_url(self):
        dup_list = self.UNIQUE_URL_LIST[:]
        dup_list.extend(self.UNIQUE_URL_LIST)
        dup_list.extend(self.UNIQUE_URL_LIST)

        expected_count_per_url = 3
        results = UrlArgProcessing.counts_of_each_dup(dup_list)
        for url, count in results.items():
            print(f"URL: {url} --> Count {count}")
            assert_equals(count, expected_count_per_url)
Example #7
 def test_validate_url_with_valid_url_and_none_domain(self):
     url = self.VALID_URL
     domain = None
     protocol = self.VALID_PROTOCOL
     valid = UrlArgProcessing.validate_url(url=url,
                                           protocol=protocol,
                                           domains=[domain])
     assert_true(valid,
                 (f"Valid URL ({url}) was marked as invalid with valid "
                  f"protocol ({protocol}) and domain ({domain})."))
Example #8
    def test_list_urls_combines_urls_correctly(self):
        list_size = 100
        url_list = [
            self.VALID_URL_FORMAT.format(id_) for id_ in range(0, list_size)
        ]

        url_str = UrlArgProcessing.list_urls(url_list=url_list)
        assert isinstance(url_str, str)

        url_count = len([line for line in url_str.split('\n') if line != ''])
        assert_equals(list_size, url_count)
Example #9
 def test_split_urls_list_concatenated_into_single_element(self):
     list_size = 10
     valid_split_url_list = [
         ''.join([
             self.VALID_URL_FORMAT.format(id_)
             for id_ in range(0, list_size)
         ])
     ]
     updated_list = UrlArgProcessing.split_urls(valid_split_url_list)
     assert_not_equals(len(valid_split_url_list), len(updated_list))
     assert_equals(len(updated_list), list_size)
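
Taken together, Example #8 and Example #9 pin down the observable behavior of list_urls() and split_urls(): list_urls() joins a URL list into a single string with one URL per line, and split_urls() breaks elements containing several concatenated URLs back into one URL per element. A hypothetical round trip is sketched below; the URLs are made up, and UrlArgProcessing is assumed importable as in the fixture sketch near Example #1.

urls = ['https://example.com/images/photo_1.jpg',
        'https://example.com/images/photo_2.jpg']

# One URL per line in a single string (behavior asserted in Example #8)
as_text = UrlArgProcessing.list_urls(url_list=urls)

# Splitting a single concatenated element yields one element per URL (Example #9)
rejoined = UrlArgProcessing.split_urls([''.join(urls)])
assert len(rejoined) == len(urls)
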
Example #10
 def test_invalid_urls_are_removed(self):
     num_urls = 5
     url_list = [
         self.VALID_URL_FORMAT.format(index)
         for index in range(0, num_urls)
     ]
     url_list.append(self.INVALID_URL)
     url_list.extend([
         self.VALID_URL_FORMAT.format(index)
         for index in range(10, 10 + num_urls)
     ])
     processed_list = UrlArgProcessing.process_url_list(url_list)
     assert_equals(num_urls * 2, len(processed_list))
Example #11
    def test_split_urls_with_concatenated_element(self):
        list_size = 10
        valid_split_url_list = [
            self.VALID_URL_FORMAT.format(id_) for id_ in range(1, 10)
        ]

        # The appended element concatenates 3 URLs, so splitting it adds 2 entries
        # beyond the 10 elements now in the list (9 unique URLs + 1 concatenated element).
        valid_split_url_list.append(valid_split_url_list[-1] +
                                    valid_split_url_list[-2] +
                                    valid_split_url_list[-3])
        list_size += 2

        updated_list = UrlArgProcessing.split_urls(valid_split_url_list)
        assert_not_equals(len(valid_split_url_list), len(updated_list))
        assert_equals(len(updated_list), list_size)
Example #12
    def test_concat_urls_only_valid_and_unique_urls_are_returned(self):
        num_urls = 10
        concat_urls = 3
        url_list = [
            self.VALID_URL_FORMAT.format(index)
            for index in range(0, num_urls)
        ]
        concat = [
            self.VALID_URL_FORMAT.format(index)
            for index in range(100, 100 + concat_urls)
        ]
        url_list.append(''.join(concat))

        processed_list = UrlArgProcessing.process_url_list(url_list)
        assert_equals(num_urls + concat_urls, len(processed_list))
Example #13
    def test_unique_list_with_no_dups(self):
        expected_unique_dups = 0
        results = UrlArgProcessing.reduce_url_list(
            url_list=self.UNIQUE_URL_LIST)

        # Unique List
        assert_equals(len(self.UNIQUE_URL_LIST),
                      len(results[UrlArgProcessing.REDUCED_LIST]))

        # Total Duplicates
        assert_equals(len(results[UrlArgProcessing.TOTAL_DUP_LIST]),
                      expected_unique_dups)

        # Unique Duplicates
        assert_equals(len(results[UrlArgProcessing.UNIQUE_DUP_LIST]),
                      expected_unique_dups)
Example #14
    def test_unique_list_with_single_dups(self):
        # Create a list of duplicate elements (unique * 2)
        dup_list = self.UNIQUE_URL_LIST[:]
        dup_list.extend(self.UNIQUE_URL_LIST)

        # Expected number of duplicates
        expected_unique_dups = len(self.UNIQUE_URL_LIST)
        results = UrlArgProcessing.reduce_url_list(url_list=dup_list)

        # Unique List
        assert_equals(len(results[UrlArgProcessing.REDUCED_LIST]),
                      len(self.UNIQUE_URL_LIST))

        # Total Duplicates
        assert_equals(len(results[UrlArgProcessing.TOTAL_DUP_LIST]),
                      expected_unique_dups)

        # Unique Duplicates
        assert_equals(len(results[UrlArgProcessing.UNIQUE_DUP_LIST]),
                      expected_unique_dups)
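
Examples #13 and #14 also imply the shape of the dictionary returned by reduce_url_list(): it is keyed by the class constants REDUCED_LIST, TOTAL_DUP_LIST, and UNIQUE_DUP_LIST. A hypothetical illustration follows; the example URLs are made up, and the precise contents of the two duplicate lists are an assumption, since the tests above only check their lengths.

# Assumes UrlArgProcessing is importable as in the fixture sketch near Example #1.
url_list = ['https://example.com/a.jpg',
            'https://example.com/a.jpg',
            'https://example.com/b.jpg']

results = UrlArgProcessing.reduce_url_list(url_list=url_list)

unique_urls = results[UrlArgProcessing.REDUCED_LIST]       # de-duplicated URL list
all_dups = results[UrlArgProcessing.TOTAL_DUP_LIST]        # duplicate entries that were found
distinct_dups = results[UrlArgProcessing.UNIQUE_DUP_LIST]  # distinct URLs that had duplicates
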
Example #15
File: app.py Project: rcmhunt71/PDL
def process_and_record_urls(cfg_obj: PdlConfig) -> list:
    """
    Take the generated list of URLs, verify each URL is valid, and remove duplicates
    (both within the list and against previously downloaded URLs). The resulting list
    is written to file for archival purposes.

    :param cfg_obj: (PdlConfig): Contains the inventory structure.

    :return: List of valid URLs

    """
    url_file = UrlFile()

    # Check for URLs on the CLI
    raw_url_list = getattr(cfg_obj.cli_args, args.ArgOptions.URLS)

    # If no URLs are provided, check if URL file was specified
    if not raw_url_list:
        LOG.debug("URL list from CLI is empty.")

        # Check for URL file specified on the CLI
        url_file_name = getattr(cfg_obj.cli_args, args.ArgOptions.FILE, None)

        # URL file found, so read and store file contents
        if url_file_name is not None:
            url_file_name = os.path.abspath(url_file_name)
            raw_url_list = url_file.read_file(url_file_name)

    # Otherwise, check whether the --buffer option was specified on the CLI
        elif getattr(cfg_obj.cli_args, args.ArgOptions.BUFFER, False):
            raw_url_list = read_from_buffer()

    # Otherwise, not sure how to proceed, so raise an exception
        else:
            LOG.info(cfg_obj.cli_args)
            LOG.debug(
                "No URL file was specified on the CLI, nor reading from buffer."
            )
            raise NoURLsProvided()

    # Determine the supported URL domains (to remove junk/unexpected URLs)
    url_domains = cfg_obj.app_cfg.get_list(AppCfgFileSections.PROJECT,
                                           AppCfgFileSectionKeys.URL_DOMAINS)

    # Sanitize the URL list (split concatenated URLs, remove duplicates, keep only valid/accepted URLs)
    cfg_obj.urls = ArgProcessing.process_url_list(raw_url_list,
                                                  domains=url_domains)

    # Remove duplicates from the inventory (can be disabled via CLI)
    if not cfg_obj.cli_args.ignore_dups:
        cfg_obj.urls = remove_duplicate_urls_from_inv(cfg_obj)

    # Write the file of accepted/sanitized URLs to be processed
    url_file_dir = cfg_obj.app_cfg.get(AppCfgFileSections.LOGGING,
                                       AppCfgFileSectionKeys.URL_FILE_DIR)
    url_file_drive = cfg_obj.app_cfg.get(
        AppCfgFileSections.LOGGING, AppCfgFileSectionKeys.LOG_DRIVE_LETTER)
    if url_file_drive is not None:
        url_file_dir = f"{url_file_drive}:{url_file_dir}"
        LOG.debug(
            f"Updated URL File directory for drive letter: {url_file_dir}")

    # If there were URLs available to DL after validation, create the URL file.
    # TODO: Write a test to verify the URL file is written when there are URLs, and that no file is created when there are none.
    if cfg_obj.urls:
        url_file.write_file(urls=cfg_obj.urls,
                            create_dir=True,
                            location=url_file_dir)
    else:
        LOG.info("No URLs for DL, no URL FILE created.")

    return cfg_obj.urls
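
The control flow above falls back through three URL sources in order: the CLI argument list, a URL file named on the CLI, and finally the input buffer, raising NoURLsProvided when none is available. A condensed, standalone sketch of that fallback pattern follows; it is an illustration only, with a hypothetical helper name and stdin standing in for PDL's read_from_buffer(), UrlFile, and PdlConfig machinery.

import os
import sys


def gather_raw_urls(cli_urls, url_file_name=None, use_buffer=False):
    """Return raw URLs from the first available source: CLI list, then URL file, then buffer."""
    if cli_urls:
        return cli_urls

    if url_file_name is not None:
        # One URL per line, ignoring blank lines
        with open(os.path.abspath(url_file_name)) as url_file:
            return [line.strip() for line in url_file if line.strip()]

    if use_buffer:
        # Stand-in for read_from_buffer(): read URLs from stdin
        return [line.strip() for line in sys.stdin if line.strip()]

    # Mirrors the NoURLsProvided exception raised above
    raise ValueError("No URLs were provided via the CLI, a URL file, or the buffer.")
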
Example #16
 def test_validate_url_with_valid_url_mixed_case(self):
     url = self.VALID_URL.capitalize()
     valid = UrlArgProcessing.validate_url(url=url)
     assert_true(valid, f"Valid URL ({url}) was marked as invalid.")
Example #17
 def test_split_urls_all_elems_valid_urls(self):
     valid_split_url_list = [
         self.VALID_URL_FORMAT.format(id_) for id_ in range(1, 10)
     ]
     updated_list = UrlArgProcessing.split_urls(valid_split_url_list)
     assert_equals(len(valid_split_url_list), len(updated_list))
Example #18
 def test_validate_url_with_valid_url_lowercase(self):
     url = self.VALID_URL
     valid = UrlArgProcessing.validate_url(url=url)
     assert_true(valid, f"Valid URL ({url}) was marked as invalid.")
Example #19
 def test_validate_url_with_invalid_url(self):
     url = self.INVALID_URL
     valid = UrlArgProcessing.validate_url(url=url)
     assert_false(valid, f"Invalid URL ({url}) was marked as valid.")