def compareNormalizeUrl(urlA, urlB, raiseException=True):
    """Three-way compare two URLs by their canonical forms.

    Each URL is validated with its own UrlValidator, urlA first. An invalid
    urlA raises Exception('Invalid urlA') when raiseException is true and
    otherwise yields -1; an invalid urlB raises Exception('Invalid urlB') or
    yields 1. When both validate, each validator is handed to
    UrlCanonicalizer.canonicalizerValidator and the two results are compared
    with ``<`` and ``>``.

    Returns -1, 0 or 1.
    """
    validatorA, validatorB = UrlValidator(), UrlValidator()

    if not validatorA.validate(urlA):
        if not raiseException:
            return -1
        raise Exception('Invalid urlA')

    if not validatorB.validate(urlB):
        if not raiseException:
            return 1
        raise Exception('Invalid urlB')

    canonicalizerA, canonicalizerB = UrlCanonicalizer(), UrlCanonicalizer()
    canonicalA = canonicalizerA.canonicalizerValidator(validatorA)
    canonicalB = canonicalizerB.canonicalizerValidator(validatorB)

    if canonicalA < canonicalB:
        return -1
    return 1 if canonicalA > canonicalB else 0
def compareNormalizeUrl(urlA, urlB, raiseException=True):
    """Order two URLs by comparing their canonicalized forms.

    urlA is validated before urlB, each with a separate UrlValidator. If urlA
    fails validation the call raises Exception('Invalid urlA') when
    raiseException is true, else returns -1. If urlB fails it raises
    Exception('Invalid urlB') when raiseException is true, else returns 1.
    Otherwise the values returned by UrlCanonicalizer.canonicalizerValidator
    for the two validators are compared.

    Returns -1 if urlA's canonical form is smaller, 1 if larger, 0 otherwise.
    """
    first_validator = UrlValidator()
    second_validator = UrlValidator()

    first_ok = first_validator.validate(urlA)
    if not first_ok and raiseException:
        raise Exception('Invalid urlA')
    if not first_ok:
        return -1

    second_ok = second_validator.validate(urlB)
    if not second_ok and raiseException:
        raise Exception('Invalid urlB')
    if not second_ok:
        return 1

    first_canonicalizer = UrlCanonicalizer()
    second_canonicalizer = UrlCanonicalizer()
    left = first_canonicalizer.canonicalizerValidator(first_validator)
    right = second_canonicalizer.canonicalizerValidator(second_validator)

    if left < right:
        outcome = -1
    elif left > right:
        outcome = 1
    else:
        outcome = 0
    return outcome
def __getNormalizedUrl(self):
    """Return the canonical form of every entry in self.urls.

    The result has one item per entry of self.urls, in the same order: the
    value returned by UrlCanonicalizer.canonicalizerValidator for entries
    that pass UrlValidator.validate, and None for entries that do not.
    """
    normalized = []
    # Iterate over a slice copy of self.urls, as the original code does.
    for candidate in self.urls[:]:
        validator = UrlValidator()
        if not validator.validate(candidate):
            normalized.append(None)
            continue
        canonicalizer = UrlCanonicalizer()
        normalized.append(canonicalizer.canonicalizerValidator(validator))
    return normalized
def __getNormalizedUrl(self):
    """Canonicalize each URL held in self.urls.

    Returns a new list aligned with self.urls. Entries that fail
    UrlValidator.validate map to None; the others map to the result of
    UrlCanonicalizer.canonicalizerValidator called with the validator that
    accepted them.
    """
    snapshot = self.urls[:]
    results = []
    for item in snapshot:
        checker = UrlValidator()
        # The canonicalizer is only constructed once validation has passed.
        results.append(
            UrlCanonicalizer().canonicalizerValidator(checker)
            if checker.validate(item)
            else None
        )
    return results
def canonicalizeUrl(self, url):
    """Validate url and return its canonical form.

    The URL is checked with a fresh UrlValidator; on success that validator
    is passed to self.canonicalizerValidator and its result is returned.

    Raises:
        Exception: with message 'invalid url' when validation fails.
    """
    validator = UrlValidator()
    if validator.validate(url):
        return self.canonicalizerValidator(validator)
    raise Exception('invalid url')
class UrlValidatorTest(TestCase):
    """Accept/reject tests for UrlValidator.validate.

    Each test feeds a list of URLs to a fresh validator and asserts that
    validate() is truthy for the "correct" list and falsy for the
    "incorrect" list. Every assertion carries the offending URL in its
    failure message so a failing entry can be identified from the report.
    """

    # A new validator is created before every test.
    def setUp(self):
        self.urlValidator = UrlValidator()

    # wiki example validation, expect true
    def test_wikiexample(self):
        urls = ['http://en.wikipedia.org/wiki/Unit_testing#Unit_testing_limitations',
                'http://en.wikipedia.org/wiki/Unit_testing#Language-']
        for url in urls:
            self.assertTrue(self.urlValidator.validate(url), 'fail at url = ' + url)

    # check param input: non-string arguments are expected to raise
    # AssertionError rather than return False
    def test_illegalinput(self):
        with self.assertRaises(AssertionError):
            self.urlValidator.validate(None)
        with self.assertRaises(AssertionError):
            self.urlValidator.validate(123)

    # check empty string
    def test_emptystring(self):
        self.assertFalse(self.urlValidator.validate(''))

    # scheme
    def test_correct_incorrect_scheme(self):
        correct_list = ['http://www.google.com',
                        'ftp://www.google.com',
                        'www.google.com']
        for url in correct_list:
            self.assertTrue(self.urlValidator.validate(url), 'fail at url = ' + url)
        incorrect_list = ['htp://www.google.com',
                          '://www.google.com',
                          'http:/www.google.com',
                          'http//www.google.com']
        for url in incorrect_list:
            self.assertFalse(self.urlValidator.validate(url), 'fail at url = ' + url)

    # uname pword
    # NOTE(review): 'http://*****:*****@www.google.com' appears in both the
    # correct and the incorrect list, so both loops cannot pass as written.
    # The credentials look masked ('*****', '[email protected]'); confirm the
    # intended literals.
    def test_correct_incorrect_usernamepassword(self):
        correct_list = ['http://*****:*****@www.google.com',
                        'http://www.google.com']
        for url in correct_list:
            self.assertTrue(self.urlValidator.validate(url), 'fail at url = ' + url)
        incorrect_list = ['http://[email protected]',
                          'http://*****:*****@www.google.com',
                          'http://@www.google.com',
                          'http://@@www.google.com']
        for url in incorrect_list:
            self.assertFalse(self.urlValidator.validate(url), 'fail at url = ' + url)

    # dname
    # NOTE(review): 'http://*****:*****@nba.com' also looks masked; confirm.
    def test_correct_incorrect_domainname(self):
        correct_list = ['http://google.com',
                        'http://cs.washington.edu/path',
                        'http://555.com']
        for url in correct_list:
            self.assertTrue(self.urlValidator.validate(url), 'fail at url = ' + url)
        incorrect_list = ['http://.com',
                          'http://*****:*****@nba.com',
                          'http://www.google.com/images%',
                          'http://www.google.com/%2x/']
        for url in incorrect_list:
            self.assertFalse(self.urlValidator.validate(url), 'fail at url = ' + url)

    # query
    def test_correct_incorrect_query(self):
        correct_list = ['http://127.0.0.1:8000/url//nba/videos///?nba=cool',
                        'http://127.0.0.1:8000/%2b?key=val1&key2=val2;key3=3#frag',
                        'http://www.google.com:80/path/',
                        'http://www.google.com:80/path/?',
                        'http://www.google.com:80/path?']
        for url in correct_list:
            self.assertTrue(self.urlValidator.validate(url), 'fail at url = ' + url)
        incorrect_list = ['http://www.google.com?nba',
                          'http://www.google.com/??',
                          'http://www.google.com/?cmm = cmm',
                          'http://www.google.com/?key==value',
                          'http://www.google.com/?1a=1b']
        for url in incorrect_list:
            self.assertFalse(self.urlValidator.validate(url), 'fail at url = ' + url)

    # fragment
    def test_correct_incorrect_fragment(self):
        correct_list = ['http://127.0.0.1:8000/url//nba/videos///?nba=cool#fragment',
                        'http://127.0.0.1:8000/%2b?key=val1&key2=val2;key3=3#_-_',
                        'http://127.0.0.1:8000/%2b?key=val1&key2=val2;key3=3#',
                        'http://www.google.com:80/path/']
        for url in correct_list:
            self.assertTrue(self.urlValidator.validate(url), 'fail at url = ' + url)
        incorrect_list = ['http://www.google.com?nba#wrong fragment',
                          'http://www.google.com/##']
        for url in incorrect_list:
            self.assertFalse(self.urlValidator.validate(url), 'fail at url = ' + url)
class UrlValidatorTest(TestCase):
    """Accept/reject tests for UrlValidator.validate.

    The per-URL assertions live in two private helpers so that every check,
    positive or negative, reports the URL that failed.
    """

    # setup urlvalidator
    def setUp(self):
        self.urlValidator = UrlValidator()

    # Assert validate() is truthy for every URL in urls.
    def _assertAllValid(self, urls):
        for url in urls:
            self.assertTrue(self.urlValidator.validate(url), 'fail at url = ' + url)

    # Assert validate() is falsy for every URL in urls.
    def _assertAllInvalid(self, urls):
        for url in urls:
            self.assertFalse(self.urlValidator.validate(url), 'fail at url = ' + url)

    # wiki example validation, expect true
    def test_wikiexample(self):
        self._assertAllValid([
            'http://en.wikipedia.org/wiki/Unit_testing#Unit_testing_limitations',
            'http://en.wikipedia.org/wiki/Unit_testing#Language-',
        ])

    # check param input: non-string arguments are expected to raise
    # AssertionError rather than return False
    def test_illegalinput(self):
        for illegal in (None, 123):
            with self.assertRaises(AssertionError):
                self.urlValidator.validate(illegal)

    # check empty string
    def test_emptystring(self):
        self.assertFalse(self.urlValidator.validate(''))

    # scheme
    def test_correct_incorrect_scheme(self):
        self._assertAllValid([
            'http://www.google.com',
            'ftp://www.google.com',
            'www.google.com',
        ])
        self._assertAllInvalid([
            'htp://www.google.com',
            '://www.google.com',
            'http:/www.google.com',
            'http//www.google.com',
        ])

    # uname pword
    # NOTE(review): 'http://*****:*****@www.google.com' is listed as both
    # valid and invalid below, so this test cannot pass as written. The
    # credentials look masked ('*****', '[email protected]'); confirm the
    # intended literals.
    def test_correct_incorrect_usernamepassword(self):
        self._assertAllValid([
            'http://*****:*****@www.google.com',
            'http://www.google.com',
        ])
        self._assertAllInvalid([
            'http://[email protected]',
            'http://*****:*****@www.google.com',
            'http://@www.google.com',
            'http://@@www.google.com',
        ])

    # dname
    # NOTE(review): 'http://*****:*****@nba.com' also looks masked; confirm.
    def test_correct_incorrect_domainname(self):
        self._assertAllValid([
            'http://google.com',
            'http://cs.washington.edu/path',
            'http://555.com',
        ])
        self._assertAllInvalid([
            'http://.com',
            'http://*****:*****@nba.com',
            'http://www.google.com/images%',
            'http://www.google.com/%2x/',
        ])

    # query
    def test_correct_incorrect_query(self):
        self._assertAllValid([
            'http://127.0.0.1:8000/url//nba/videos///?nba=cool',
            'http://127.0.0.1:8000/%2b?key=val1&key2=val2;key3=3#frag',
            'http://www.google.com:80/path/',
            'http://www.google.com:80/path/?',
            'http://www.google.com:80/path?',
        ])
        self._assertAllInvalid([
            'http://www.google.com?nba',
            'http://www.google.com/??',
            'http://www.google.com/?cmm = cmm',
            'http://www.google.com/?key==value',
            'http://www.google.com/?1a=1b',
        ])

    # fragment
    def test_correct_incorrect_fragment(self):
        self._assertAllValid([
            'http://127.0.0.1:8000/url//nba/videos///?nba=cool#fragment',
            'http://127.0.0.1:8000/%2b?key=val1&key2=val2;key3=3#_-_',
            'http://127.0.0.1:8000/%2b?key=val1&key2=val2;key3=3#',
            'http://www.google.com:80/path/',
        ])
        self._assertAllInvalid([
            'http://www.google.com?nba#wrong fragment',
            'http://www.google.com/##',
        ])
# Read lines from infile one at a time and collect them in urls. readline()
# returns an empty string at end of file, which ends the loop.
line = infile.readline()
while line:
    # take out next line characters (only a single trailing '\n' is removed)
    if line.endswith('\n'):
        line = line[:-1]
    urls.append(line)
    line = infile.readline()

# filter out empty strings (and whitespace-only ones). This is built as a
# real list because the loop below slices it and calls remove() on the copy;
# a lazy filter object would not support either operation.
urls = [entry for entry in urls if entry.strip()]

# process each url
for url in urls:
    # url valid
    uv = UrlValidator()
    isValid = uv.validate(url)

    # remove url in urls: copy the list and drop one occurrence of this url,
    # so any duplicate of it elsewhere in the input is still present
    wo_url_in_urls = urls[:]
    wo_url_in_urls.remove(url)

    # initialize param; the normalized values stay None for invalid urls
    normURL = None
    isSrcUnique = UrlComparator.isSourceUnique(url, wo_url_in_urls)
    isNormUnique = None
    if isValid:
        uc = UrlCanonicalizer()
        normURL = uc.canonicalizerValidator(uv)
        isNormUnique = UrlComparator.isNormalizeUnique(url, wo_url_in_urls, False)