def _process_links(self, links):
    """Filter extracted links and hand the survivors to the base extractor.

    Applies, in order: URL validity, allow/deny regex lists, allow/deny
    domain lists, and optional URL canonicalization.

    :param links: iterable of link objects, each exposing a mutable ``url``.
    :returns: the filtered (and possibly canonicalized) list of links, as
        post-processed by ``BaseSgmlLinkExtractor._process_links``.
    """
    # Drop links whose URL fails basic validity checks.
    links = [link for link in links if _is_valid_url(link.url)]
    if self.allow_res:
        # Keep only URLs matching at least one allow pattern.
        links = [link for link in links if _matches(link.url, self.allow_res)]
    if self.deny_res:
        # Discard URLs matching any deny pattern.
        links = [link for link in links if not _matches(link.url, self.deny_res)]
    if self.allow_domains:
        # NOTE: this variant calls the instance helper _url_is_from_any_domain
        # (sibling extractors use the module-level url_is_from_any_domain).
        links = [link for link in links
                 if self._url_is_from_any_domain(link.url, self.allow_domains)]
    if self.deny_domains:
        links = [link for link in links
                 if not self._url_is_from_any_domain(link.url, self.deny_domains)]
    if self.canonicalize:
        # Normalize each surviving URL in place.
        for link in links:
            link.url = canonicalize_url(link.url)
    # Delegate remaining processing to the base extractor.
    links = BaseSgmlLinkExtractor._process_links(self, links)
    return links
def _process_links(self, links):
    """Filter extracted links, then rewrite each Amazon product link to its
    product-reviews URL keyed by the ASIN found in the path.

    The ASIN is taken from the 6th slash-separated segment of the URL
    (index 5). Links whose ASIN is rejected by ``self._ignore_identifier``
    are dropped.

    :param links: iterable of link objects, each exposing a mutable ``url``.
    :returns: the filtered, rewritten list of links, as post-processed by
        ``BaseSgmlLinkExtractor._process_links``.
    """
    links = [link for link in links if _is_valid_url(link.url)]
    if self.allow_res:
        links = [link for link in links if _matches(link.url, self.allow_res)]
    if self.deny_res:
        links = [link for link in links if not _matches(link.url, self.deny_res)]
    if self.allow_domains:
        links = [link for link in links
                 if url_is_from_any_domain(link.url, self.allow_domains)]
    if self.deny_domains:
        links = [link for link in links
                 if not url_is_from_any_domain(link.url, self.deny_domains)]
    new_links = []
    for link in links:
        parts = link.url.split('/')
        # Guard: a URL with fewer than six segments previously raised
        # IndexError here and aborted extraction; skip it instead.
        if len(parts) <= 5:
            continue
        asin = parts[5]
        if not self._ignore_identifier(asin):
            log.msg("Found ASIN: " + asin, level=log.DEBUG)
            link.url = ("http://www.amazon.com/product-reviews/" + asin +
                        "/ref%3Ddp_top_cm_cr_acr_txt?ie=UTF8&showViewpoints=0")
            new_links.append(link)
    links = new_links
    if self.canonicalize:
        for link in links:
            link.url = canonicalize_url(link.url)
    links = BaseSgmlLinkExtractor._process_links(self, links)
    return links
def _process_links(self, links):
    """Filter extracted links, keeping only those whose CustomerId (the 7th
    slash-separated URL segment, index 6) is not ignored.

    :param links: iterable of link objects, each exposing a mutable ``url``.
    :returns: the filtered list of links, as post-processed by
        ``BaseSgmlLinkExtractor._process_links``.
    """
    links = [link for link in links if _is_valid_url(link.url)]
    if self.allow_res:
        links = [link for link in links if _matches(link.url, self.allow_res)]
    if self.deny_res:
        links = [link for link in links if not _matches(link.url, self.deny_res)]
    if self.allow_domains:
        links = [link for link in links
                 if url_is_from_any_domain(link.url, self.allow_domains)]
    if self.deny_domains:
        links = [link for link in links
                 if not url_is_from_any_domain(link.url, self.deny_domains)]
    new_links = []
    for link in links:
        parts = link.url.split('/')
        # Guard: a URL with fewer than seven segments previously raised
        # IndexError here and aborted extraction; skip it instead.
        if len(parts) <= 6:
            continue
        customer_id = parts[6]
        if not self._ignore_identifier(customer_id):
            log.msg("Found CustomerId: " + customer_id, level=log.DEBUG)
            new_links.append(link)
    links = new_links
    if self.canonicalize:
        for link in links:
            link.url = canonicalize_url(link.url)
    links = BaseSgmlLinkExtractor._process_links(self, links)
    return links
def _process_links(self, links):
    """Run the configured filters over *links* and delegate the remainder
    to the base extractor.

    URL validity checking is applied only when ``self.check_url`` is set;
    then allow/deny regexes, allow/deny domains, and optional
    canonicalization, in that order.

    :param links: iterable of link objects, each exposing a mutable ``url``.
    :returns: the filtered list, post-processed by the base class.
    """
    if self.check_url:
        # Validity filter is opt-in for this extractor.
        links = [lnk for lnk in links if _is_valid_url(lnk.url)]
    if self.allow_res:
        links = [lnk for lnk in links if _matches(lnk.url, self.allow_res)]
    if self.deny_res:
        links = [lnk for lnk in links if not _matches(lnk.url, self.deny_res)]
    if self.allow_domains:
        links = [lnk for lnk in links
                 if url_is_from_any_domain(lnk.url, self.allow_domains)]
    if self.deny_domains:
        links = [lnk for lnk in links
                 if not url_is_from_any_domain(lnk.url, self.deny_domains)]
    if self.canonicalize:
        # Rewrite each URL in place with its canonical form.
        for lnk in links:
            lnk.url = canonicalize_url(lnk.url)
    return BaseSgmlLinkExtractor._process_links(self, links)
def _process_links(self, links):
    """Apply this extractor's filter chain to *links*.

    Stages: optional URL validity check (gated by ``self.check_url``),
    allow regexes, deny regexes, allow domains, deny domains, then
    optional canonicalization before base-class post-processing.

    :param links: iterable of link objects, each exposing a mutable ``url``.
    :returns: the filtered list, post-processed by the base class.
    """
    filtered = [
        candidate for candidate in links
        if not self.check_url or _is_valid_url(candidate.url)
    ]
    if self.allow_res:
        filtered = [
            candidate for candidate in filtered
            if _matches(candidate.url, self.allow_res)
        ]
    if self.deny_res:
        filtered = [
            candidate for candidate in filtered
            if not _matches(candidate.url, self.deny_res)
        ]
    if self.allow_domains:
        filtered = [
            candidate for candidate in filtered
            if url_is_from_any_domain(candidate.url, self.allow_domains)
        ]
    if self.deny_domains:
        filtered = [
            candidate for candidate in filtered
            if not url_is_from_any_domain(candidate.url, self.deny_domains)
        ]
    if self.canonicalize:
        # Normalize surviving URLs in place.
        for candidate in filtered:
            candidate.url = canonicalize_url(candidate.url)
    return BaseSgmlLinkExtractor._process_links(self, filtered)
def _process_links(self, links):
    """Filter extracted links and rewrite each surviving Amazon link to its
    product-reviews URL, keyed by the ASIN in the URL path.

    The ASIN is taken from the 6th slash-separated segment of the URL
    (index 5); links whose ASIN is rejected by ``self._ignore_identifier``
    are dropped.

    :param links: iterable of link objects, each exposing a mutable ``url``.
    :returns: the filtered, rewritten list of links, as post-processed by
        ``BaseSgmlLinkExtractor._process_links``.
    """
    links = [link for link in links if _is_valid_url(link.url)]
    if self.allow_res:
        links = [link for link in links if _matches(link.url, self.allow_res)]
    if self.deny_res:
        links = [link for link in links if not _matches(link.url, self.deny_res)]
    if self.allow_domains:
        links = [link for link in links
                 if url_is_from_any_domain(link.url, self.allow_domains)]
    if self.deny_domains:
        links = [link for link in links
                 if not url_is_from_any_domain(link.url, self.deny_domains)]
    new_links = []
    for link in links:
        segments = link.url.split('/')
        # Guard: URLs with fewer than six segments previously raised
        # IndexError here and aborted extraction; skip them instead.
        if len(segments) <= 5:
            continue
        asin = segments[5]
        if not self._ignore_identifier(asin):
            log.msg("Found ASIN: " + asin, level=log.DEBUG)
            link.url = ("http://www.amazon.com/product-reviews/" + asin +
                        "/ref%3Ddp_top_cm_cr_acr_txt?ie=UTF8&showViewpoints=0")
            new_links.append(link)
    links = new_links
    if self.canonicalize:
        for link in links:
            link.url = canonicalize_url(link.url)
    links = BaseSgmlLinkExtractor._process_links(self, links)
    return links
def _process_links(self, links):
    """Filter extracted links, keeping only those whose CustomerId (the 7th
    slash-separated URL segment, index 6) is not ignored.

    :param links: iterable of link objects, each exposing a mutable ``url``.
    :returns: the filtered list of links, as post-processed by
        ``BaseSgmlLinkExtractor._process_links``.
    """
    links = [link for link in links if _is_valid_url(link.url)]
    if self.allow_res:
        links = [link for link in links if _matches(link.url, self.allow_res)]
    if self.deny_res:
        links = [link for link in links if not _matches(link.url, self.deny_res)]
    if self.allow_domains:
        links = [link for link in links
                 if url_is_from_any_domain(link.url, self.allow_domains)]
    if self.deny_domains:
        links = [link for link in links
                 if not url_is_from_any_domain(link.url, self.deny_domains)]
    new_links = []
    for link in links:
        segments = link.url.split('/')
        # Guard: URLs with fewer than seven segments previously raised
        # IndexError here and aborted extraction; skip them instead.
        if len(segments) <= 6:
            continue
        customer_id = segments[6]
        if not self._ignore_identifier(customer_id):
            log.msg("Found CustomerId: " + customer_id, level=log.DEBUG)
            new_links.append(link)
    links = new_links
    if self.canonicalize:
        for link in links:
            link.url = canonicalize_url(link.url)
    links = BaseSgmlLinkExtractor._process_links(self, links)
    return links