def test_sourceUnique(self):
    """A URL is source-unique iff it does not appear verbatim in the list."""
    target = 'www.google.com'
    # similar but not byte-identical entries -> unique
    non_matching = ['google.com', 'http://google.com']
    # exact match present -> not unique
    matching = ['www.google.com', 'something.net']
    self.assertTrue(UrlComparator.isSourceUnique(target, non_matching))
    self.assertFalse(UrlComparator.isSourceUnique(target, matching))
def test_normunique(self):
    """Normalized uniqueness: false for identical or normalize-equal lists,
    true for a genuinely different list."""
    target = 'http://en.wikipedia.org/wiki/Unit_testing#Unit_testing_limitations'
    # byte-identical URL
    identical = ['http://en.wikipedia.org/wiki/Unit_testing#Unit_testing_limitations']
    # differs only in the fragment, so it normalizes to the same URL
    norm_equal = ['http://en.wikipedia.org/wiki/Unit_testing#Language-']
    # unrelated URL
    different = ['wikipedia.org']
    self.assertFalse(UrlComparator.isNormalizeUnique(target, identical))
    self.assertFalse(UrlComparator.isNormalizeUnique(target, norm_equal))
    self.assertTrue(UrlComparator.isNormalizeUnique(target, different))
# BUG FIX: this method was a byte-for-byte duplicate of test_normunique above,
# and a duplicate `def` in the same class silently shadows the earlier one, so
# only one of the two ever ran. Renamed so both are discovered by unittest.
def test_normuniqueDuplicate(self):
    """Normalized uniqueness (duplicate of test_normunique, kept under a
    distinct name so unittest discovers and runs it): false for identical
    or normalize-equal lists, true for a genuinely different list."""
    url = 'http://en.wikipedia.org/wiki/Unit_testing#Unit_testing_limitations'
    # same url
    list1 = ['http://en.wikipedia.org/wiki/Unit_testing#Unit_testing_limitations']
    # norm same url (differs only in fragment)
    list2 = ['http://en.wikipedia.org/wiki/Unit_testing#Language-']
    # different url
    list3 = ['wikipedia.org']
    self.assertFalse(UrlComparator.isNormalizeUnique(url, list1))
    self.assertFalse(UrlComparator.isNormalizeUnique(url, list2))
    self.assertTrue(UrlComparator.isNormalizeUnique(url, list3))
def test_wikiExample(self):
    """Two Wikipedia URLs differing only in fragment compare equal (0)
    after normalization."""
    urlA = 'http://en.wikipedia.org/wiki/Unit_testing#Unit_testing_limitations'
    urlB = 'http://en.wikipedia.org/wiki/Unit_testing#Language-'
    expected = 0
    actual = UrlComparator.compareNormalizeUrl(urlA, urlB)
    failure_msg = 'expected: ' + str(expected) + ', actual: ' + str(actual)
    self.assertEqual(expected, actual, failure_msg)
# filter out empty strings urls = filter(lambda s: s.strip(), urls) # process each url for url in urls: # url valid uv = UrlValidator() isValid = uv.validate(url) # remove url in urls wo_url_in_urls = urls[:] wo_url_in_urls.remove(url) # initialize param normURL = None isSrcUnique = UrlComparator.isSourceUnique(url, wo_url_in_urls) isNormUnique = None if isValid: uc = UrlCanonicalizer() normURL = uc.canonicalizerValidator(uv) isNormUnique = UrlComparator.isNormalizeUnique(url, wo_url_in_urls, False) print 'Source: ' + url print 'Valid: ' + str(isValid) print 'Canonical: ' + ('None' if normURL == None else normURL) print 'Source unique: ' + str(isSrcUnique) print 'Canonicalized URL unique: ' + ('N/A' if isNormUnique == None else str(isNormUnique)) print ''
def test_normalizedWWWDotDifferentUrl(self):
    """Different hosts stay different after normalization: google sorts
    strictly before nba."""
    lesser = 'www.google.com'
    greater = 'nba.com'
    comparison = UrlComparator.compareNormalizeUrl(lesser, greater)
    self.assertTrue(comparison < 0)
def test_sourcecomparison(self):
    """Raw source comparison orders 'www.google.com' strictly after 'nba.com'."""
    left = 'www.google.com'
    right = 'nba.com'
    result = UrlComparator.compareSourceUrl(left, right)
    self.assertTrue(result > 0)
def test_caseSensitiveCases(self):
    """Paths differing only in case compare unequal: '/Images' sorts
    strictly before '/images'."""
    upper_path = 'www.google.com/Images'
    lower_path = 'www.google.com/images'
    ordering = UrlComparator.compareNormalizeUrl(upper_path, lower_path)
    self.assertTrue(ordering < 0)
def test_normalizedEqualDifferentQueryUrl(self):
    """Same query parameters in a different order normalize to equal URLs."""
    query_order_a = 'www.google.com/?q=cse403;id=1'
    query_order_b = 'www.google.com/?id=1&q=cse403'
    result = UrlComparator.compareNormalizeUrl(query_order_a, query_order_b)
    self.assertTrue(result == 0)
def test_normalGreaterLesser(self):
    """Normalized comparison is antisymmetric: swapping the arguments
    flips the sign of the result."""
    smaller = 'www.google.com'
    larger = 'www.nba.com'
    self.assertTrue(UrlComparator.compareNormalizeUrl(smaller, larger) < 0)
    self.assertTrue(UrlComparator.compareNormalizeUrl(larger, smaller) > 0)