def _verify_workers_limits(self, url, avg_links_per_page=10):
    """Ask the limiter whether work on ``url`` is currently allowed.

    The active domains are looked up through ``Domain.get_active_domains``
    using ``self.db`` and handed, together with ``url`` and
    ``avg_links_per_page``, to ``LimiterModel.has_limit_to_work``.

    :param url: URL to check against the configured limits.
    :param avg_links_per_page: Forwarded unchanged to the limiter
        (defaults to 10).
    :returns: Whatever ``LimiterModel.has_limit_to_work`` returns.
    """
    domains_in_use = Domain.get_active_domains(self.db)

    return LimiterModel.has_limit_to_work(
        self.db,
        domains_in_use,
        url,
        avg_links_per_page,
    )
def get_next_job(cls, db, expiration, cache, lock_expiration, avg_links_per_page=10):
    """Select the next page to review and obtain its job lock.

    For every active domain the highest-scored pages are fetched (at most
    as many as there are ``Worker`` rows), the per-domain lists are
    interleaved into a single candidate list, and the first candidate that
    both passes ``Limiter.has_limit_to_work`` and for which
    ``cache.has_next_job_lock`` returns something other than ``None`` is
    chosen.

    :param db: Database session exposing ``query``; used for every lookup.
    :param expiration: Not used by this function; kept so existing callers
        keep working.
    :param cache: Object exposing ``has_next_job_lock(url, lock_expiration)``.
        A ``None`` result is treated as "lock not obtained".
    :param lock_expiration: Forwarded to ``cache.has_next_job_lock``.
    :param avg_links_per_page: Forwarded to ``Limiter.has_limit_to_work``
        (defaults to 10).
    :returns: ``None`` when there is no candidate page or no candidate could
        be locked; otherwise a dict with the keys ``'page'`` (page uuid as
        ``str``), ``'url'``, ``'score'`` and ``'lock'``.
    """
    from holmes.models import Settings, Worker, Domain, Limiter  # Avoid circular dependency

    settings = Settings.instance(db)
    number_of_workers = len(db.query(Worker).all())

    active_domains = Domain.get_active_domains(db)
    active_domains_ids = [item.id for item in active_domains]

    # Top-scored pages per domain, capped at one page per worker.
    pages_by_domain = {}
    for domain_id in active_domains_ids:
        pages = db \
            .query(
                Page.uuid,
                Page.url,
                Page.score,
                Page.last_review_date
            ) \
            .filter(Page.domain_id == domain_id) \
            .order_by(Page.score.desc())[:number_of_workers]

        if pages:
            pages_by_domain[domain_id] = pages

    # Interleave the per-domain lists so that no single domain fills the
    # front of the candidate list.
    candidates = []
    current_domain = 0
    while pages_by_domain:
        # list() is required: on Python 3 ``dict.keys()`` is a view and
        # cannot be indexed. The key order is the same as ``keys()``.
        domains = list(pages_by_domain)
        if current_domain >= len(domains):
            current_domain = 0

        domain_id = domains[current_domain]
        domain_pages = pages_by_domain[domain_id]
        candidates.append(domain_pages.pop(0))

        if not domain_pages:
            del pages_by_domain[domain_id]

        # NOTE(review): the index also advances right after a domain was
        # removed, so the domain that shifts into the freed slot is skipped
        # for this pass. Kept as-is to preserve the candidate ordering.
        current_domain += 1

    if not candidates:
        return None

    lambda_score = settings.lambda_score
    if lambda_score > 0 and lambda_score > candidates[0].score:
        cls.update_pages_score_by(settings, lambda_score, db)

    for candidate in candidates:
        if not Limiter.has_limit_to_work(db, active_domains, candidate.url, avg_links_per_page):
            continue

        lock = cache.has_next_job_lock(candidate.url, lock_expiration)
        if lock is not None:
            return {
                'page': str(candidate.uuid),
                'url': candidate.url,
                'score': candidate.score,
                'lock': lock
            }

    # Every candidate was either over its limit or could not be locked.
    return None