def compareNormalizeUrl(urlA, urlB, raiseException=True):
    """Three-way comparison of two URLs based on their canonical forms.

    Each URL is validated first (urlA, then urlB). What happens on an
    invalid URL depends on ``raiseException``.

    Args:
        urlA: The first URL to compare.
        urlB: The second URL to compare.
        raiseException: When true (the default), an invalid URL raises.
            When false, an invalid ``urlA`` yields -1 and an invalid
            ``urlB`` yields 1; ``urlA`` is checked first, so -1 is
            returned when both are invalid.

    Returns:
        -1 if the canonical form of ``urlA`` sorts before that of ``urlB``,
        1 if it sorts after, and 0 otherwise (or the fallback value
        described above for an invalid URL).

    Raises:
        Exception: With message 'Invalid urlA' or 'Invalid urlB' when the
            corresponding URL fails validation and ``raiseException`` is
            true.
    """
    validatorA, validatorB = UrlValidator(), UrlValidator()

    # (validator, url, error message, result when not raising), in check order.
    checks = (
        (validatorA, urlA, 'Invalid urlA', -1),
        (validatorB, urlB, 'Invalid urlB', 1),
    )
    for validator, candidate, message, fallback in checks:
        if validator.validate(candidate):
            continue
        if raiseException:
            raise Exception(message)
        return fallback

    # The canonicalizer is fed the validator that already processed the URL.
    canonicalizerA, canonicalizerB = UrlCanonicalizer(), UrlCanonicalizer()
    canonA = canonicalizerA.canonicalizerValidator(validatorA)
    canonB = canonicalizerB.canonicalizerValidator(validatorB)

    if canonA < canonB:
        return -1
    return 1 if canonA > canonB else 0
def __getNormalizedUrl(self):
    """Build the list of canonical forms for every entry in ``self.urls``.

    Returns:
        A new list with one element per URL, in the same order as
        ``self.urls``: the canonicalized URL when the entry validates,
        or ``None`` when it does not.
    """
    normalized = []
    # Iterate over a shallow copy, as a snapshot of the list at call time.
    for candidate in self.urls[:]:
        validator = UrlValidator()
        canonical = None
        if validator.validate(candidate):
            # The canonicalizer consumes the validator that accepted the URL.
            canonicalizer = UrlCanonicalizer()
            canonical = canonicalizer.canonicalizerValidator(validator)
        normalized.append(canonical)
    return normalized
# Report validity, canonical form and uniqueness for a single URL.
# NOTE(review): `url`, `urls` and `infile` are bound by surrounding code that
# is not visible in this fragment -- confirm the enclosing structure.

# Validate the source URL; the validator is reused for canonicalization below.
uv = UrlValidator()
isValid = uv.validate(url)

# Compare against the other entries: copy the list and drop one occurrence
# of the current URL so it is not compared with itself.
wo_url_in_urls = urls[:]
wo_url_in_urls.remove(url)

# Canonical form and canonical uniqueness are only computed for valid URLs;
# they stay None otherwise.
normURL = None
isSrcUnique = UrlComparator.isSourceUnique(url, wo_url_in_urls)
isNormUnique = None
if isValid:
    uc = UrlCanonicalizer()
    normURL = uc.canonicalizerValidator(uv)
    isNormUnique = UrlComparator.isNormalizeUnique(url, wo_url_in_urls, False)

# Single-argument print(...) emits the same output under Python 2's print
# statement and is also valid Python 3.
print('Source: ' + url)
print('Valid: ' + str(isValid))
print('Canonical: ' + ('None' if normURL is None else normURL))
print('Source unique: ' + str(isSrcUnique))
print('Canonicalized URL unique: ' + ('N/A' if isNormUnique is None else str(isNormUnique)))
print('')

# clean up
# NOTE(review): presumably runs once after all URLs are processed -- confirm
# against the enclosing structure.
infile.close()