예제 #1
0
 def test_norm_with_domain(self):
     """Check ``normalize_with_domain`` against a table of raw URLs.

     A ``URLNormalizer`` constructed with ``("acme.com", 80)`` is fed each
     raw URL below and must return the paired expected string. The table
     includes inputs with no scheme, an explicit ``:80`` port, surrounding
     whitespace, a mixed-case scheme, path-only URLs, and a URL carrying
     both a query string and a fragment.
     """
     normalizer = URLNormalizer("acme.com", 80)
     # Raw input -> expected normalized URL.
     test_expectation = {
         'www.acme.com': "http://acme.com",
         'www.acme.com:80/b': "http://acme.com/b",
         'acme.com:80/': "http://acme.com",
         ' hTTp://www.acme.com:80/kjbk ': 'http://www.acme.com/kjbk',
         ' /a/b ': 'http://acme.com/a/b',
         ' /page.htm ': 'http://acme.com/page.htm',
         'acme.com/a/b ': 'http://acme.com/a/b',
         'http://bar.org:80/?q=sds+asa#frag': "http://bar.org",
     }
     # ``raw_url`` rather than ``input`` so the builtin is not shadowed.
     for raw_url, expectation in test_expectation.items():
         # Pass the value as a logging argument so the message is only
         # formatted when DEBUG output is actually enabled.
         logging.debug("Testing normalization of url: %s", raw_url)
         # %r keeps leading/trailing whitespace of the failing input visible.
         self.assertEqual(
             expectation,
             normalizer.normalize_with_domain(raw_url),
             "unexpected normalization of %r" % raw_url,
         )
예제 #2
0
    def test_link_dedup(self):
        """Verify ``LinkAggregator.filter_update_links`` over repeated calls.

        The aggregator is wired with ``normalize_with_domain`` as its only
        link mapper and with the domain filter plus ``is_acceptable_url_scheme``
        as link filters. The test then checks, in order:

        1. Four valid links (two of which share one expected entry) given
           with a referrer produce three expected links.
        2. Submitting the same links again returns nothing new.
        3. A batch of invalid links returns nothing.
        4. One previously unseen valid link is returned and recorded.
        5. The site graph's node and edge counts, and the edges from the
           normalized base URL to each link of step 1, match expectations.
        """
        base_url = "acme.com:8999"
        base_domain, port = extract_domain_port(base_url)
        logger = logging.getLogger()
        url_norm = URLNormalizer(base_domain, port)
        normalized_url = url_norm.normalize_with_domain(base_url)
        # Lazy logging argument: formatted only if DEBUG is enabled.
        logger.debug("Constructed normalized base url : %s", normalized_url)

        domain_filter = DomainFilter(base_domain, logger)
        site_graph = SiteGraph(logger)
        link_aggregator = LinkAggregator(
            logger,
            site_graph,
            link_mappers=[url_norm.normalize_with_domain],
            link_filters=[domain_filter.passes, is_acceptable_url_scheme],
        )
        valid_links = ["/a/b","/a/b/./","http://acme.com:8002/a","https://acme.com:8002/b?q=asd#frag"]
        expected_links = ["http://acme.com:8999/a/b","http://acme.com:8002/a","https://acme.com:8002/b"]

        # This time, we also specify a referrer page
        filtered_links = link_aggregator.filter_update_links(valid_links, normalized_url)
        self.assertListEqual(expected_links, filtered_links)
        self.assertSetEqual(set(expected_links), link_aggregator._links)

        # Second invocation should result in deduplication
        filtered_links = link_aggregator.filter_update_links(valid_links, None)
        # Compare the length with assertEqual and show the contents, so a
        # failure reports which links leaked through instead of
        # "False is not true".
        self.assertEqual(
            0, len(filtered_links),
            "expected no new links, got %r" % (filtered_links,))
        self.assertSetEqual(set(expected_links), link_aggregator._links)

        # None of the invalid links should pass
        # NOTE(review): "[email protected]" looks like an address redacted
        # by a scraper -- confirm the intended literal.
        invalid_links = ["mailto://[email protected]","code.acme.com","code.acme.com/b","https://127.122.9.1"]
        filtered_links = link_aggregator.filter_update_links(invalid_links, None)
        self.assertEqual(
            0, len(filtered_links),
            "expected invalid links to be rejected, got %r" % (filtered_links,))
        self.assertSetEqual(set(expected_links), link_aggregator._links)

        # A new valid link should pass
        new_valid_links = ["http://acme.com:8999/"]
        filtered_links = link_aggregator.filter_update_links(new_valid_links, None)
        expected_result = ["http://acme.com:8999"]
        self.assertListEqual(expected_result, filtered_links)
        # Everything the aggregator should have recorded so far.
        expected_result_set = set(expected_links) | set(expected_result)
        self.assertSetEqual(expected_result_set, link_aggregator._links)

        # The graph should hold one node per recorded link.
        self.assertEqual(len(expected_result_set), site_graph.num_nodes())
        for link in expected_result_set:
            self.assertTrue(
                site_graph.has_vertex(link),
                "site graph has no vertex for %r" % link)

        # Only the first call supplied a referrer, so only its links are
        # expected to have an edge from the normalized base URL.
        self.assertEqual(len(expected_links), site_graph.num_edges())
        for link in expected_links:
            self.assertTrue(
                site_graph.has_edge(normalized_url, link),
                "site graph has no edge %r -> %r" % (normalized_url, link))