def _start_reviewer(self, job):
    if job:
        if count_url_levels(job['url']) > self.config.MAX_URL_LEVELS:
            self.info('Max URL levels! Details: %s' % job['url'])
            return

        self.debug('Starting Review for [%s]' % job['url'])
        reviewer = Reviewer(
            api_url=self.config.HOLMES_API_URL,
            page_uuid=job['page'],
            page_url=job['url'],
            page_score=0,
            config=self.config,
            validators=self.validators,
            facters=self.facters,
            search_provider=self.search_provider,
            async_get=self.async_get,
            wait=self.otto.wait,
            wait_timeout=0,  # max time to wait for all requests to finish
            db=self.db,
            cache=self.cache,
            publish=self.publish,
            girl=self.girl,
            fact_definitions=self.fact_definitions,
            violation_definitions=self.violation_definitions)

        reviewer.review()
def _start_reviewer(self, job):
    if job:
        if count_url_levels(job['url']) > self.config.MAX_URL_LEVELS:
            self.info('Max URL levels! Details: %s' % job['url'])
            return

        self.debug('Starting Review for [%s]' % job['url'])
        reviewer = Reviewer(
            api_url=self.config.HOLMES_API_URL,
            page_uuid=job['page'],
            page_url=job['url'],
            page_score=job['score'],
            increase_lambda_tax_method=self._increase_lambda_tax,
            config=self.config,
            validators=self.validators,
            facters=self.facters,
            async_get=self.async_get,
            wait=self.otto.wait,
            wait_timeout=0,  # max time to wait for all requests to finish
            db=self.db,
            cache=self.cache,
            publish=self.publish,
            fact_definitions=self.fact_definitions,
            violation_definitions=self.violation_definitions
        )

        reviewer.review()
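# A minimal usage sketch (assumption, not from the source): handing a job dict
# to _start_reviewer. Only the 'page', 'url' and 'score' keys are read by the
# version above; the worker instance and the values shown are hypothetical.
def _example_enqueue(worker):
    job = {
        'page': '5a301089-ceff-4c30-9d3e-185027ade50d',  # hypothetical page UUID
        'url': 'http://www.example.com/index.html',
        'score': 0.0,
    }
    worker._start_reviewer(job)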
def get_facts(self):
    links = self.get_links()

    self.review.data["page.links"] = set()
    self.review.data["page.all_links"] = links

    self.add_fact(key="page.links", value=set())

    invalid_links = set()
    num_links = 0
    links_to_get = set()

    for link in links:
        url = link.get("href").strip()
        url = REMOVE_HASH.sub("", url)  # drop the '#fragment' part of the href

        if not url:
            continue

        aux = self.normalize_url(url)
        if not aux:
            invalid_links.add(url)
            continue
        url = aux

        if self.looks_like_image(url):
            continue

        if link.get("rel") == "nofollow":
            continue

        if count_url_levels(url) > self.config.MAX_URL_LEVELS:
            logging.info("Max URL levels! Details: %s" % url)
            continue

        # Only fetch links that point back into the page's own domain.
        should_get = False
        domain, domain_url = get_domain_from_url(url)
        if domain in self.page_url:
            should_get = True

        if should_get and URL_RE.match(url):
            num_links += 1
            links_to_get.add(url)

    for url in links_to_get:
        self.async_get(url, self.handle_url_loaded)

    self.add_fact(key="total.number.links", value=num_links)
    self.add_fact(key="total.number.invalid_links", value=len(invalid_links))
    self.add_fact(key="page.invalid_links", value=invalid_links)
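# Hedged sketch of the count_url_levels helper used throughout these snippets.
# This is an assumed implementation, inferred only from how it is called (it
# must return the number of path segments so it can be compared against
# MAX_URL_LEVELS); the real project helper may differ.
try:
    from urlparse import urlparse  # Python 2, matching the % formatting above
except ImportError:
    from urllib.parse import urlparse  # Python 3

def count_url_levels(url):
    # 'http://example.com/a/b/c.html' -> ['a', 'b', 'c.html'] -> 3 levels
    path = urlparse(url).path
    return len([part for part in path.split('/') if part])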
def get_facts(self):
    links = self.get_links()

    self.review.data['page.links'] = set()
    self.review.data['page.all_links'] = links

    self.add_fact(
        key='page.links',
        value=set(),
    )

    invalid_links = set()
    num_links = 0
    links_to_get = set()

    for link in links:
        url = link.get('href').strip()
        url = REMOVE_HASH.sub('', url)  # drop the '#fragment' part of the href

        if not url:
            continue

        aux = self.normalize_url(url)
        if not aux:
            invalid_links.add(url)
            continue
        url = aux

        if self.looks_like_image(url):
            continue

        if link.get('rel') == 'nofollow':
            continue

        if count_url_levels(url) > self.config.MAX_URL_LEVELS:
            logging.info('Max URL levels! Details: %s' % url)
            continue

        if URL_RE.match(url):
            num_links += 1
            links_to_get.add(url)

    for url in links_to_get:
        self.async_get(url, self.handle_url_loaded)

    self.add_fact(key='total.number.links', value=num_links)
    self.add_fact(key='total.number.invalid_links', value=len(invalid_links))
    self.add_fact(key='page.invalid_links', value=invalid_links)
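# Hedged sketch of the module-level regexes used above. The exact patterns are
# assumptions inferred from usage: REMOVE_HASH strips the fragment identifier
# from an href, and URL_RE accepts only absolute http(s) URLs.
import re

REMOVE_HASH = re.compile(r'#.*$')
URL_RE = re.compile(r'^https?://', re.IGNORECASE)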
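# Hedged sketch (assumption, not the project's actual code) of the
# handle_url_loaded callback that async_get invokes for each fetched link.
# The (url, response) signature and the idea of accumulating pairs on the
# 'page.links' set initialized in get_facts are inferred from the call site.
import logging

def handle_url_loaded(self, url, response):
    logging.debug('Got response (%s) from %s' % (response.status_code, url))
    # Assumed shape: link validators later inspect these (url, response) pairs.
    self.review.data['page.links'].add((url, response))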