Exemplo n.º 1
0
    def _process_links(self, links):
        links = [link for link in links if _is_valid_url(link.url)]

        if self.allow_res:
            links = [
                link for link in links if _matches(link.url, self.allow_res)
            ]
        if self.deny_res:
            links = [
                link for link in links if not _matches(link.url, self.deny_res)
            ]
        if self.allow_domains:
            #links = [link for link in links if self._url_is_from_any_host(link.url, self.allow_domains)]
            links = [
                link for link in links
                if self._url_is_from_any_domain(link.url, self.allow_domains)
            ]
        if self.deny_domains:
            #links = [link for link in links if not self._url_is_from_any_host(link.url, self.deny_domains)]
            links = [
                link for link in links if
                not self._url_is_from_any_domain(link.url, self.deny_domains)
            ]

        if self.canonicalize:
            for link in links:
                #log.msg("extract link before normalize: [%s]" % link.url, level=log.INFO)
                link.url = canonicalize_url(link.url)

        links = BaseSgmlLinkExtractor._process_links(self, links)
        return links
Exemplo n.º 2
0
    def _process_links(self, links):
        links = [link for link in links if _is_valid_url(link.url)]

        if self.allow_res:
            links = [link for link in links if _matches(link.url, self.allow_res)]
        if self.deny_res:
            links = [link for link in links if not _matches(link.url, self.deny_res)]
        if self.allow_domains:
            links = [link for link in links if url_is_from_any_domain(link.url, self.allow_domains)]
        if self.deny_domains:
            links = [link for link in links if not url_is_from_any_domain(link.url, self.deny_domains)]

        new_links = []
        for link in links:
            ASIN = link.url.split('/')[5]
            if not self._ignore_identifier(ASIN):
                log.msg("Found ASIN: "+ASIN,level=log.DEBUG)
                link.url = "http://www.amazon.com/product-reviews/"+ASIN+"/ref%3Ddp_top_cm_cr_acr_txt?ie=UTF8&showViewpoints=0"
                new_links.append(link)

        links = new_links

        if self.canonicalize:
            for link in links:
                link.url = canonicalize_url(link.url)

        links = BaseSgmlLinkExtractor._process_links(self, links)
        return links
Exemplo n.º 3
0
    def _process_links(self, links):
        links = [link for link in links if _is_valid_url(link.url)]

        if self.allow_res:
            links = [link for link in links if _matches(link.url, self.allow_res)]
        if self.deny_res:
            links = [link for link in links if not _matches(link.url, self.deny_res)]
        if self.allow_domains:
            links = [link for link in links if url_is_from_any_domain(link.url, self.allow_domains)]
        if self.deny_domains:
            links = [link for link in links if not url_is_from_any_domain(link.url, self.deny_domains)]

        new_links = []
        for link in links:
            CustomerId = link.url.split('/')[6]
            if not self._ignore_identifier(CustomerId):
                log.msg("Found CustomerId: "+CustomerId,level=log.DEBUG)
                new_links.append(link)

        links = new_links

        if self.canonicalize:
            for link in links:
                link.url = canonicalize_url(link.url)

        links = BaseSgmlLinkExtractor._process_links(self, links)
        return links
Exemplo n.º 4
0
    def _process_links(self, links):
        links = [
            link for link in links
            if not self.check_url or _is_valid_url(link.url)
        ]

        if self.allow_res:
            links = [
                link for link in links if _matches(link.url, self.allow_res)
            ]
        if self.deny_res:
            links = [
                link for link in links if not _matches(link.url, self.deny_res)
            ]
        if self.allow_domains:
            links = [
                link for link in links
                if url_is_from_any_domain(link.url, self.allow_domains)
            ]
        if self.deny_domains:
            links = [
                link for link in links
                if not url_is_from_any_domain(link.url, self.deny_domains)
            ]

        if self.canonicalize:
            for link in links:
                link.url = canonicalize_url(link.url)

        links = BaseSgmlLinkExtractor._process_links(self, links)
        return links
Exemplo n.º 5
0
    def _process_links(self, links):
        links = [link for link in links if not self.check_url or _is_valid_url(link.url)]

        if self.allow_res:
            links = [link for link in links if _matches(link.url, self.allow_res)]
        if self.deny_res:
            links = [link for link in links if not _matches(link.url, self.deny_res)]
        if self.allow_domains:
            links = [link for link in links if url_is_from_any_domain(link.url, self.allow_domains)]
        if self.deny_domains:
            links = [link for link in links if not url_is_from_any_domain(link.url, self.deny_domains)]

        if self.canonicalize:
            for link in links:
                link.url = canonicalize_url(link.url)

        links = BaseSgmlLinkExtractor._process_links(self, links)
        return links
Exemplo n.º 6
0
    def _process_links(self, links):
        links = [link for link in links if _is_valid_url(link.url)]

        if self.allow_res:
            links = [
                link for link in links if _matches(link.url, self.allow_res)
            ]
        if self.deny_res:
            links = [
                link for link in links if not _matches(link.url, self.deny_res)
            ]
        if self.allow_domains:
            links = [
                link for link in links
                if url_is_from_any_domain(link.url, self.allow_domains)
            ]
        if self.deny_domains:
            links = [
                link for link in links
                if not url_is_from_any_domain(link.url, self.deny_domains)
            ]

        new_links = []
        for link in links:
            ASIN = link.url.split('/')[5]
            if not self._ignore_identifier(ASIN):
                log.msg("Found ASIN: " + ASIN, level=log.DEBUG)
                link.url = "http://www.amazon.com/product-reviews/" + ASIN + "/ref%3Ddp_top_cm_cr_acr_txt?ie=UTF8&showViewpoints=0"
                new_links.append(link)

        links = new_links

        if self.canonicalize:
            for link in links:
                link.url = canonicalize_url(link.url)

        links = BaseSgmlLinkExtractor._process_links(self, links)
        return links
Exemplo n.º 7
0
    def _process_links(self, links):
        links = [link for link in links if _is_valid_url(link.url)]

        if self.allow_res:
            links = [
                link for link in links if _matches(link.url, self.allow_res)
            ]
        if self.deny_res:
            links = [
                link for link in links if not _matches(link.url, self.deny_res)
            ]
        if self.allow_domains:
            links = [
                link for link in links
                if url_is_from_any_domain(link.url, self.allow_domains)
            ]
        if self.deny_domains:
            links = [
                link for link in links
                if not url_is_from_any_domain(link.url, self.deny_domains)
            ]

        new_links = []
        for link in links:
            CustomerId = link.url.split('/')[6]
            if not self._ignore_identifier(CustomerId):
                log.msg("Found CustomerId: " + CustomerId, level=log.DEBUG)
                new_links.append(link)

        links = new_links

        if self.canonicalize:
            for link in links:
                link.url = canonicalize_url(link.url)

        links = BaseSgmlLinkExtractor._process_links(self, links)
        return links