Example #1
0
    def _start_reviewer(self, job):
        """Run a full review for the page described by *job*.

        Bails out early when no job is given or when the job's URL is
        nested deeper than the configured ``MAX_URL_LEVELS`` limit.
        """
        if not job:
            return

        if count_url_levels(job['url']) > self.config.MAX_URL_LEVELS:
            self.info('Max URL levels! Details: %s' % job['url'])
            return

        self.debug('Starting Review for [%s]' % job['url'])

        # Gather every dependency the Reviewer needs up front, then
        # construct it and run the review in one shot.
        reviewer_kwargs = dict(
            api_url=self.config.HOLMES_API_URL,
            page_uuid=job['page'],
            page_url=job['url'],
            page_score=0,
            config=self.config,
            validators=self.validators,
            facters=self.facters,
            search_provider=self.search_provider,
            async_get=self.async_get,
            wait=self.otto.wait,
            wait_timeout=0,  # max time to wait for all requests to finish
            db=self.db,
            cache=self.cache,
            publish=self.publish,
            girl=self.girl,
            fact_definitions=self.fact_definitions,
            violation_definitions=self.violation_definitions,
        )
        Reviewer(**reviewer_kwargs).review()
Example #2
0
    def _start_reviewer(self, job):
        """Run a full review for the page described by *job*.

        Does nothing when no job is given; skips jobs whose URL is
        nested deeper than the configured ``MAX_URL_LEVELS`` limit.
        """
        if not job:
            return

        if count_url_levels(job['url']) > self.config.MAX_URL_LEVELS:
            self.info('Max URL levels! Details: %s' % job['url'])
            return

        self.debug('Starting Review for [%s]' % job['url'])

        # Gather every dependency the Reviewer needs up front, then
        # construct it and run the review in one shot.
        reviewer_kwargs = dict(
            api_url=self.config.HOLMES_API_URL,
            page_uuid=job['page'],
            page_url=job['url'],
            page_score=job['score'],
            increase_lambda_tax_method=self._increase_lambda_tax,
            config=self.config,
            validators=self.validators,
            facters=self.facters,
            async_get=self.async_get,
            wait=self.otto.wait,
            wait_timeout=0,  # max time to wait for all requests to finish
            db=self.db,
            cache=self.cache,
            publish=self.publish,
            fact_definitions=self.fact_definitions,
            violation_definitions=self.violation_definitions,
        )
        Reviewer(**reviewer_kwargs).review()
Example #3
0
    def get_facts(self):
        """Collect link-related facts for the current page.

        Scans every anchor returned by ``get_links``, discards empty,
        invalid, image-like, ``nofollow`` and too-deeply-nested URLs,
        schedules an async GET for each surviving same-domain link, and
        records the link counts as review facts.
        """
        links = self.get_links()

        self.review.data["page.links"] = set()
        self.review.data["page.all_links"] = links

        self.add_fact(key="page.links", value=set())

        invalid_links = set()

        num_links = 0

        links_to_get = set()

        for link in links:
            # ``get`` returns None when the anchor has no href attribute;
            # coalesce to '' so ``strip`` cannot raise AttributeError.
            url = (link.get("href") or "").strip()
            url = REMOVE_HASH.sub("", url)

            if not url:
                continue

            normalized = self.normalize_url(url)
            if not normalized:
                invalid_links.add(url)
                continue

            url = normalized

            if self.looks_like_image(url):
                continue

            if link.get("rel") == "nofollow":
                continue

            if count_url_levels(url) > self.config.MAX_URL_LEVELS:
                logging.info("Max URL levels! Details: %s" % url)
                continue

            # Only follow links that stay on the page's own domain.
            # NOTE(review): this is a substring test — a domain like
            # 'a.com' also matches 'nota.com'; confirm that is intended.
            domain, domain_url = get_domain_from_url(url)
            if domain in self.page_url and URL_RE.match(url):
                # num_links counts every accepted occurrence, while the
                # set deduplicates the URLs actually fetched.
                num_links += 1
                links_to_get.add(url)

        for url in links_to_get:
            self.async_get(url, self.handle_url_loaded)

        self.add_fact(key="total.number.links", value=num_links)

        self.add_fact(key="total.number.invalid_links", value=len(invalid_links))

        self.add_fact(key="page.invalid_links", value=invalid_links)
Example #4
0
    def get_facts(self):
        """Collect link-related facts for the current page.

        Scans every anchor returned by ``get_links``, discards empty,
        invalid, image-like, ``nofollow`` and too-deeply-nested URLs,
        schedules an async GET for each surviving link, and records the
        link counts as review facts.
        """
        links = self.get_links()

        self.review.data['page.links'] = set()
        self.review.data['page.all_links'] = links

        self.add_fact(
            key='page.links',
            value=set(),
        )

        invalid_links = set()

        num_links = 0

        links_to_get = set()

        for link in links:
            # ``get`` returns None when the anchor has no href attribute;
            # coalesce to '' so ``strip`` cannot raise AttributeError.
            url = (link.get('href') or '').strip()
            url = REMOVE_HASH.sub('', url)

            if not url:
                continue

            normalized = self.normalize_url(url)
            if not normalized:
                invalid_links.add(url)
                continue

            url = normalized

            if self.looks_like_image(url):
                continue

            if link.get('rel') == 'nofollow':
                continue

            if count_url_levels(url) > self.config.MAX_URL_LEVELS:
                logging.info('Max URL levels! Details: %s' % url)
                continue

            if URL_RE.match(url):
                # num_links counts every accepted occurrence, while the
                # set deduplicates the URLs actually fetched.
                num_links += 1
                links_to_get.add(url)

        for url in links_to_get:
            self.async_get(url, self.handle_url_loaded)

        self.add_fact(key='total.number.links', value=num_links)

        self.add_fact(key='total.number.invalid_links',
                      value=len(invalid_links))

        self.add_fact(key='page.invalid_links', value=invalid_links)
Example #5
0
    def get_facts(self):
        """Collect link-related facts for the current page.

        Scans every anchor returned by ``get_links``, discards empty,
        invalid, image-like, ``nofollow`` and too-deeply-nested URLs,
        schedules an async GET for each surviving link, and records the
        link counts as review facts.
        """
        links = self.get_links()

        self.review.data['page.links'] = set()
        self.review.data['page.all_links'] = links

        self.add_fact(
            key='page.links',
            value=set(),
        )

        invalid_links = set()

        num_links = 0

        links_to_get = set()

        for link in links:
            # ``get`` returns None when the anchor has no href attribute;
            # coalesce to '' so ``strip`` cannot raise AttributeError.
            url = (link.get('href') or '').strip()
            url = REMOVE_HASH.sub('', url)

            if not url:
                continue

            normalized = self.normalize_url(url)
            if not normalized:
                invalid_links.add(url)
                continue

            url = normalized

            if self.looks_like_image(url):
                continue

            if link.get('rel') == 'nofollow':
                continue

            if count_url_levels(url) > self.config.MAX_URL_LEVELS:
                logging.info('Max URL levels! Details: %s' % url)
                continue

            if URL_RE.match(url):
                # num_links counts every accepted occurrence, while the
                # set deduplicates the URLs actually fetched.
                num_links += 1
                links_to_get.add(url)

        for url in links_to_get:
            self.async_get(url, self.handle_url_loaded)

        self.add_fact(
            key='total.number.links',
            value=num_links
        )

        self.add_fact(
            key='total.number.invalid_links',
            value=len(invalid_links)
        )

        self.add_fact(
            key='page.invalid_links',
            value=invalid_links
        )