def getContentNormalisedURLList(self):
    """
    Return the normalised content URLs that belong to the same domain.

    Each URL returned by getContentURLList() is passed through
    normaliseUrl() with getContentBaseURL() as base_url. A URL is kept
    when its network location is empty (relative link) or when the last
    two dot-separated labels of its network location match those of this
    object's own normalised URL. URLs for which normaliseUrl() raises
    UnicodeDecodeError, or returns an empty value, are skipped.
    """
    def _domain_key(netloc):
        # In www.example.com or www.3.example.com keep only the
        # example.com part. Labels are re-joined with '.' (not '') so
        # that distinct domains such as 'ab.c' and 'a.bc' can never
        # collapse to the same key.
        key = '.'.join(netloc.split('.')[-2:])
        if isinstance(key, unicode):
            # Compare byte strings on both sides to avoid implicit
            # unicode/str coercion during the comparison.
            key = key.encode('utf-8')
        return key

    reference_domain = _domain_key(
        urlsplit(normaliseUrl(self.asURL() or ''))[1])
    url_list = []
    base_url = self.getContentBaseURL()
    for url in self.getContentURLList():
        try:
            url = normaliseUrl(url, base_url=base_url)
        except UnicodeDecodeError:
            # Ignore wrong encoding errors
            # Web is not a kind world
            continue
        if not url:
            continue
        url_domain = urlsplit(url)[1]
        if url_domain and _domain_key(url_domain) != reference_domain:
            continue
        # if domain is empty (relative link) or domain is same, then OK
        url_list.append(url)
    return url_list
def getContentNormalisedURLList(self):
    """
    Return the normalised content URLs that belong to the same domain.

    Each URL returned by getContentURLList() is passed through
    normaliseUrl() with getContentBaseURL() as base_url. A URL is kept
    when its network location is empty (relative link) or when the last
    two dot-separated labels of its network location match those of this
    object's own normalised URL. URLs for which normaliseUrl() raises
    UnicodeDecodeError, or returns an empty value, are skipped.
    """
    def _domain_key(netloc):
        # In www.example.com or www.3.example.com keep only the
        # example.com part. Labels are re-joined with '.' (not '') so
        # that distinct domains such as 'ab.c' and 'a.bc' can never
        # collapse to the same key.
        key = '.'.join(netloc.split('.')[-2:])
        if isinstance(key, unicode):
            # Compare byte strings on both sides to avoid implicit
            # unicode/str coercion during the comparison.
            key = key.encode('utf-8')
        return key

    reference_domain = _domain_key(
        urlsplit(normaliseUrl(self.asURL() or ''))[1])
    url_list = []
    base_url = self.getContentBaseURL()
    for url in self.getContentURLList():
        try:
            url = normaliseUrl(url, base_url=base_url)
        except UnicodeDecodeError:
            # Ignore wrong encoding errors
            # Web is not a kind world
            continue
        if not url:
            continue
        url_domain = urlsplit(url)[1]
        if url_domain and _domain_key(url_domain) != reference_domain:
            continue
        # if domain is empty (relative link) or domain is same, then OK
        url_list.append(url)
    return url_list
def asNormalisedURL(self, base_url=None):
    """
    Return this object's raw URL passed through normaliseUrl.

    base_url is forwarded unchanged to normaliseUrl. When hasUrlString()
    is false, nothing is normalised and None is returned.
    """
    if not self.hasUrlString():
        return None
    raw_url = self.asURL()
    return normaliseUrl(raw_url, base_url=base_url)