def test_normunique(self):
    """Exercise UrlComparator.isNormalizeUnique with three candidate lists.

    The probe URL is compared against a list holding the very same URL,
    a list holding the same page with a different fragment, and a list
    holding an unrelated address. Only the unrelated list is expected to
    leave the probe unique.
    """
    probe = 'http://en.wikipedia.org/wiki/Unit_testing#Unit_testing_limitations'

    # Exactly the same URL as the probe.
    exact_copy = ['http://en.wikipedia.org/wiki/Unit_testing#Unit_testing_limitations']
    # Same page, different fragment; the original comment calls this
    # "norm same url" (presumably the fragment is ignored once normalized).
    fragment_variant = ['http://en.wikipedia.org/wiki/Unit_testing#Language-']
    # A different URL altogether.
    unrelated = ['wikipedia.org']

    # Both look-alike lists must make the probe non-unique.
    for known_urls in (exact_copy, fragment_variant):
        self.assertFalse(UrlComparator.isNormalizeUnique(probe, known_urls))

    # Nothing in the unrelated list matches, so the probe is unique.
    self.assertTrue(UrlComparator.isNormalizeUnique(probe, unrelated))
def test_normunique(self):
    """Check isNormalizeUnique for a same, a near-same and a different URL.

    Each scenario pairs a list of already-known URLs with the uniqueness
    verdict the test expects for the subject URL against that list.
    """
    subject = 'http://en.wikipedia.org/wiki/Unit_testing#Unit_testing_limitations'

    # (known URLs, expected verdict) -- evaluated in this order.
    scenarios = (
        # same url
        (['http://en.wikipedia.org/wiki/Unit_testing#Unit_testing_limitations'],
         False),
        # norm same url: only the fragment after '#' differs
        (['http://en.wikipedia.org/wiki/Unit_testing#Language-'], False),
        # different url
        (['wikipedia.org'], True),
    )

    for known_urls, expect_unique in scenarios:
        verdict = UrlComparator.isNormalizeUnique(subject, known_urls)
        if expect_unique:
            self.assertTrue(verdict)
        else:
            self.assertFalse(verdict)
# url valid uv = UrlValidator() isValid = uv.validate(url) # remove url in urls wo_url_in_urls = urls[:] wo_url_in_urls.remove(url) # initialize param normURL = None isSrcUnique = UrlComparator.isSourceUnique(url, wo_url_in_urls) isNormUnique = None if isValid: uc = UrlCanonicalizer() normURL = uc.canonicalizerValidator(uv) isNormUnique = UrlComparator.isNormalizeUnique(url, wo_url_in_urls, False) print 'Source: ' + url print 'Valid: ' + str(isValid) print 'Canonical: ' + ('None' if normURL == None else normURL) print 'Source unique: ' + str(isSrcUnique) print 'Canonicalized URL unique: ' + ('N/A' if isNormUnique == None else str(isNormUnique)) print '' # clean up infile.close()