def compareNormalizeUrl(urlA, urlB, raiseException=True):
    """Compare two URLs by their canonical (normalized) forms.

    Returns a cmp-style result: -1, 0, or 1 for canonical(urlA) being
    less than, equal to, or greater than canonical(urlB).

    If a URL fails validation: raises ValueError when raiseException is
    True (ValueError subclasses Exception, so pre-existing broad
    handlers still catch it); otherwise returns -1 for an invalid urlA
    and 1 for an invalid urlB, without comparing further.
    """
    validatorA = UrlValidator()
    if not validatorA.validate(urlA):
        if raiseException:
            raise ValueError('Invalid urlA')
        return -1
    validatorB = UrlValidator()
    if not validatorB.validate(urlB):
        if raiseException:
            raise ValueError('Invalid urlB')
        return 1
    # Canonicalize from the already-populated validators, then compare
    # the normalized strings three-ways.
    canonicalA = UrlCanonicalizer().canonicalizerValidator(validatorA)
    canonicalB = UrlCanonicalizer().canonicalizerValidator(validatorB)
    if canonicalA < canonicalB:
        return -1
    if canonicalA > canonicalB:
        return 1
    return 0
def __getNormalizedUrl(self):
    """Return a list parallel to self.urls: the canonical form of each
    URL that validates, or None in the slot of each URL that does not.
    """
    normalized = []
    for candidate in self.urls[:]:
        validator = UrlValidator()
        if not validator.validate(candidate):
            normalized.append(None)
            continue
        normalized.append(UrlCanonicalizer().canonicalizerValidator(validator))
    return normalized
def test_removeWWWdot(self):
    """Canonicalization drops a leading 'www.' and collapses duplicate
    and '..' path segments, leaving a trailing slash.
    """
    urls = ['http://www.google.com//path//..///path////////////']
    expected = ['http://google.com/path/']
    # Pair each input with its expected canonical form; a fresh
    # canonicalizer per URL avoids any cross-case state.
    for url, want in zip(urls, expected):
        uc = UrlCanonicalizer()
        actual = uc.canonicalizeUrl(url)
        self.assertEqual(want, actual,
                         'fail on url: ' + url + '\n' +
                         'expected: ' + want + '\n' +
                         'actual : ' + actual)
def test_addtrailingslash(self):
    """Canonicalization appends a trailing slash to paths that lack one
    (also exercising credential/port stripping and percent-decoding).
    """
    urls = ['http://google.com/path',
            'http://*****:*****@en.wIkipediA.org:0/wiki/Unit_testing/%4f%4F#Language-']
    expected = ['http://google.com/path/',
                'http://en.wikipedia.org/wiki/Unit_testing/OO/']
    # Pair each input with its expected canonical form.
    for url, want in zip(urls, expected):
        uc = UrlCanonicalizer()
        actual = uc.canonicalizeUrl(url)
        self.assertEqual(want, actual,
                         'fail on url: ' + url + '\n' +
                         'expected: ' + want + '\n' +
                         'actual : ' + actual)
def test_removeUserPassword(self):
    """Canonicalization strips user:password credentials (and default
    ports) from the authority component.
    """
    urls = ['hunlan:[email protected]:80/hunlan%40gmail%2ecom',
            'http://*****:*****@en.wIkipediA.org:0/wiki/Unit_testing/%4f%4F#Language-']
    expected = ['google.com/[email protected]/',
                'http://en.wikipedia.org/wiki/Unit_testing/OO/']
    # Pair each input with its expected canonical form.
    for url, want in zip(urls, expected):
        uc = UrlCanonicalizer()
        actual = uc.canonicalizeUrl(url)
        self.assertEqual(want, actual,
                         'fail on url: ' + url + '\n' +
                         'expected: ' + want + '\n' +
                         'actual : ' + actual)
def test_lowercaseHostName(self):
    """Canonicalization lower-cases the host name (and drops 'www.' and
    the fragment).
    """
    urls = ['www.GoOgLE.com',
            'http://en.wIkipediA.org/wiki/Unit_testing#Language-']
    expected = ['google.com/',
                'http://en.wikipedia.org/wiki/Unit_testing/']
    # Pair each input with its expected canonical form.
    for url, want in zip(urls, expected):
        uc = UrlCanonicalizer()
        actual = uc.canonicalizeUrl(url)
        self.assertEqual(want, actual,
                         'fail on url: ' + url + '\n' +
                         'expected: ' + want + '\n' +
                         'actual : ' + actual)
def test_removeDupSlashes(self):
    """Canonicalization collapses runs of slashes and resolves '..'
    segments, including a percent-encoded '%2e%2e'.
    """
    urls = ['http://google.com//path//..///path////////////',
            'http://*****:*****@en.wIkipediA.org:0//wiki/Unit_testing/%2e%2e#Language-']
    expected = ['http://google.com/path/',
                'http://en.wikipedia.org/wiki/']
    # Pair each input with its expected canonical form.
    for url, want in zip(urls, expected):
        uc = UrlCanonicalizer()
        actual = uc.canonicalizeUrl(url)
        self.assertEqual(want, actual,
                         'fail on url: ' + url + '\n' +
                         'expected: ' + want + '\n' +
                         'actual : ' + actual)
def test_wikiexample(self):
    """Two URLs differing only by fragment canonicalize to the same
    fragment-free form with a trailing slash.
    """
    urls = ['http://en.wikipedia.org/wiki/Unit_testing#Unit_testing_limitations',
            'http://en.wikipedia.org/wiki/Unit_testing#Language-']
    expected = ['http://en.wikipedia.org/wiki/Unit_testing/',
                'http://en.wikipedia.org/wiki/Unit_testing/']
    # Pair each input with its expected canonical form.
    for url, want in zip(urls, expected):
        uc = UrlCanonicalizer()
        actual = uc.canonicalizeUrl(url)
        self.assertEqual(want, actual,
                         'fail on url: ' + url + '\n' +
                         'expected: ' + want + '\n' +
                         'actual : ' + actual)
def test_decodePercentEncoding(self):
    """Canonicalization decodes unnecessary percent-escapes (%40 -> @,
    %2e -> ., %43 -> C, etc.) into their literal characters.
    """
    urls = ['www.GoOgLE.com/hunlan%40gmail%2ecom',
            'cs.washington.edu/%43%53%45%34%30%33',
            'http://en.wIkipediA.org/wiki/Unit_testing/%4f%4F#Language-']
    expected = ['google.com/[email protected]/',
                'cs.washington.edu/CSE403/',
                'http://en.wikipedia.org/wiki/Unit_testing/OO/']
    # Pair each input with its expected canonical form.
    for url, want in zip(urls, expected):
        uc = UrlCanonicalizer()
        actual = uc.canonicalizeUrl(url)
        self.assertEqual(want, actual,
                         'fail on url: ' + url + '\n' +
                         'expected: ' + want + '\n' +
                         'actual : ' + actual)
def test_sortAndUseAndSignForQuery(self):
    """Query parameters are lower-cased, sorted by key, and rejoined
    with '&' (replacing ';' separators).
    """
    urls = ['www.nba.com?a=0;A=1;a=d',
            'http://google.com//path//..///path////////////?b=2;a=1',
            'http://*****:*****@en.wIkipediA.org:0//wiki/Unit_testing/%2e%2e?a=0;c=1&B=2#Language-']
    expected = ['nba.com/?a=0&a=1&a=d',
                'http://google.com/path/?a=1&b=2',
                'http://en.wikipedia.org/wiki/?a=0&b=2&c=1']
    # Pair each input with its expected canonical form.
    for url, want in zip(urls, expected):
        uc = UrlCanonicalizer()
        actual = uc.canonicalizeUrl(url)
        self.assertEqual(want, actual,
                         'fail on url: ' + url + '\n' +
                         'expected: ' + want + '\n' +
                         'actual : ' + actual)
# Report loop: for every URL read earlier into `urls` (presumably from
# `infile` — defined above this chunk; confirm against the full file),
# print its validity, canonical form, and uniqueness among the others.
for url in urls:
    # url valid
    uv = UrlValidator()
    isValid = uv.validate(url)
    # remove url in urls — compare this URL only against the *other* URLs
    wo_url_in_urls = urls[:]
    wo_url_in_urls.remove(url)
    # initialize param
    normURL = None
    isSrcUnique = UrlComparator.isSourceUnique(url, wo_url_in_urls)
    isNormUnique = None
    if isValid:
        # Canonical form and normalized-uniqueness only make sense for
        # URLs that passed validation.
        uc = UrlCanonicalizer()
        normURL = uc.canonicalizerValidator(uv)
        isNormUnique = UrlComparator.isNormalizeUnique(url, wo_url_in_urls, False)
    print 'Source: ' + url
    print 'Valid: ' + str(isValid)
    print 'Canonical: ' + ('None' if normURL == None else normURL)
    print 'Source unique: ' + str(isSrcUnique)
    print 'Canonicalized URL unique: ' + ('N/A' if isNormUnique == None else str(isNormUnique))
    print ''
# clean up
infile.close()