def test_can_get_active_domains(self):
    self.db.query(Domain).delete()

    domain = DomainFactory(is_active=True)
    DomainFactory(is_active=False)

    domains = Domain.get_active_domains(self.db)

    expect(domains).to_length(1)
    expect(domains[0].id).to_equal(domain.id)

def get_next_jobs_count(cls, db, config):
    # Imported locally to avoid a circular dependency between models.
    from holmes.models import Domain

    active_domains = Domain.get_active_domains(db)
    active_domains_ids = [item.id for item in active_domains]

    # Count every page that belongs to an active domain.
    # `sa` is assumed to be `import sqlalchemy as sa` at module level.
    return db \
        .query(
            sa.func.count(Page.id)
        ) \
        .filter(Page.domain_id.in_(active_domains_ids)) \
        .scalar()

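# Usage sketch (illustrative only, not from the original source): assuming the
# classmethod above lives on the `Page` model and `db` is an open SQLAlchemy
# session, the pending-job count could be logged like this. `config` is
# whatever configuration object the application passes around.
def log_pending_jobs_count(db, config):
    pending = Page.get_next_jobs_count(db, config)
    logging.info('Pages in active domains awaiting review: %d', pending)
    return pending
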
def get_next_job_list(cls, db, expiration, current_page=1, page_size=200):
    from holmes.models import Domain

    # Slice boundaries for the requested page of results.
    lower_bound = (current_page - 1) * page_size
    upper_bound = lower_bound + page_size

    active_domains = Domain.get_active_domains(db)
    active_domains_ids = [item.id for item in active_domains]

    # Pages from active domains, highest score first.
    pages_query = db \
        .query(
            Page.uuid,
            Page.url,
            Page.score,
            Page.last_review_date
        ) \
        .filter(Page.domain_id.in_(active_domains_ids)) \
        .order_by(Page.score.desc())

    return pages_query[lower_bound:upper_bound]

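# Usage sketch (illustrative, not part of the original source): assuming the
# classmethod above is defined on the `Page` model, walking every pending page
# in score order could look like the generator below. `db` and `expiration`
# are assumed to come from the caller's configuration.
def iter_all_next_jobs(db, expiration, page_size=200):
    current_page = 1
    while True:
        batch = Page.get_next_job_list(
            db, expiration, current_page=current_page, page_size=page_size)
        if not batch:
            break
        for job in batch:
            yield job  # each row exposes uuid, url, score, last_review_date
        current_page += 1
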
def fill_job_bucket(self, expiration, look_ahead_pages=1000, avg_links_per_page=10.0):
    try:
        # Lock/LockTimeout are assumed to come from a Redis-backed lock
        # library (e.g. retools.lock), so only one process refills at a time.
        with Lock('next-job-fill-bucket-lock', redis=self.redis):
            logging.info('Refilling job bucket. Lock acquired...')

            expired_time = datetime.utcnow() - timedelta(seconds=expiration)

            active_domains = Domain.get_active_domains(self.db)
            if not active_domains:
                return

            active_domains_ids = [item.id for item in active_domains]

            limiter_buckets = self.get_limiter_buckets(active_domains, avg_links_per_page)

            all_domains_pages_in_need_of_review = []

            for domain_id in active_domains_ids:
                # Pages never reviewed or whose last review has expired.
                pages = self.db \
                    .query(
                        Page.uuid,
                        Page.url,
                        Page.score,
                        Page.last_review_date
                    ) \
                    .filter(Page.domain_id == domain_id) \
                    .filter(or_(
                        Page.last_review_date == None,
                        Page.last_review_date <= expired_time
                    ))[:look_ahead_pages]

                if pages:
                    all_domains_pages_in_need_of_review.append(pages)

            logging.debug(
                'Total of %d pages found to add to redis.' % (
                    sum([len(item) for item in all_domains_pages_in_need_of_review])
                ))

            item_count = int(self.redis.zcard('next-job-bucket'))
            current_domain = 0

            # Round-robin across domains until the bucket has enough items
            # or every domain's candidate list is exhausted.
            while item_count < look_ahead_pages and len(all_domains_pages_in_need_of_review) > 0:
                if current_domain >= len(all_domains_pages_in_need_of_review):
                    current_domain = 0

                item = all_domains_pages_in_need_of_review[current_domain].pop(0)

                has_limit = True
                logging.debug('Available Limit Buckets: %s' % limiter_buckets)
                for index, (limit, available) in enumerate(limiter_buckets):
                    if limit.matches(item.url):
                        if available <= 0:
                            has_limit = False
                            break
                        limiter_buckets[index] = (limit, available - 1)

                if has_limit:
                    self.add_next_job_bucket(item.uuid, item.url)
                    item_count += 1

                # If there are no more pages in this domain, remove it from the list.
                if not all_domains_pages_in_need_of_review[current_domain]:
                    del all_domains_pages_in_need_of_review[current_domain]

                current_domain += 1

            logging.debug('ADDED A TOTAL of %d ITEMS TO REDIS...' % item_count)
    except LockTimeout:
        logging.info("Can't acquire lock. Moving on...")

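# Usage sketch (illustrative, not from the original source): fill_job_bucket is
# meant to be called periodically by the process that owns `self.db` and
# `self.redis`; the Redis lock keeps concurrent refills from racing, and losers
# of the race simply log and move on. A hypothetical scheduling loop, with
# `worker` standing in for that object and an arbitrarily chosen interval:
def refill_forever(worker, expiration, interval=10):
    import time
    while True:
        worker.fill_job_bucket(expiration=expiration)
        time.sleep(interval)
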
def _verify_workers_limits(self, url, avg_links_per_page=10):
    active_domains = Domain.get_active_domains(self.db)
    return LimiterModel.has_limit_to_work(
        self.db, active_domains, url, avg_links_per_page)

def get_next_job(cls, db, expiration, cache, lock_expiration, avg_links_per_page=10):
    from holmes.models import Settings, Worker, Domain, Limiter  # Avoid circular dependency

    page = None
    lock = None

    settings = Settings.instance(db)

    workers = db.query(Worker).all()
    number_of_workers = len(workers)

    active_domains = Domain.get_active_domains(db)
    active_domains_ids = [item.id for item in active_domains]

    all_domains_pages_in_need_of_review = {}

    for domain_id in active_domains_ids:
        # Take the top-scored pages per domain, at most one per worker.
        pages = db \
            .query(
                Page.uuid,
                Page.url,
                Page.score,
                Page.last_review_date
            ) \
            .filter(Page.domain_id == domain_id) \
            .order_by(Page.score.desc())[:number_of_workers]

        if pages:
            all_domains_pages_in_need_of_review[domain_id] = pages

    pages_in_need_of_review = []
    current_domain = 0

    # Interleave pages from each domain (round-robin) so that a single domain
    # does not monopolize the review queue.
    while all_domains_pages_in_need_of_review:
        domains = list(all_domains_pages_in_need_of_review.keys())

        if current_domain >= len(domains):
            current_domain = 0

        domain_id = domains[current_domain]

        item = all_domains_pages_in_need_of_review[domain_id].pop(0)
        pages_in_need_of_review.append(item)

        # If there are no more pages in this domain, remove it from the dictionary.
        if not all_domains_pages_in_need_of_review[domain_id]:
            del all_domains_pages_in_need_of_review[domain_id]

        current_domain += 1

    if not pages_in_need_of_review:
        return None

    if settings.lambda_score > 0 and settings.lambda_score > pages_in_need_of_review[0].score:
        cls.update_pages_score_by(settings, settings.lambda_score, db)

    # Pick the first candidate that both passes the limiter and can be locked.
    for i in range(len(pages_in_need_of_review)):
        if not Limiter.has_limit_to_work(db, active_domains, pages_in_need_of_review[i].url, avg_links_per_page):
            continue

        lock = cache.has_next_job_lock(
            pages_in_need_of_review[i].url,
            lock_expiration
        )

        if lock is not None:
            page = pages_in_need_of_review[i]
            break

    if page is None:
        return None

    return {
        'page': str(page.uuid),
        'url': page.url,
        'score': page.score,
        'lock': lock
    }

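# Usage sketch (illustrative only): a worker loop consuming the dictionary
# returned by get_next_job above. `db`, `cache`, and the expiration values are
# assumed to come from the worker's configuration; `process_page` is a
# hypothetical callback, not part of the original code.
def work_once(db, cache, expiration, lock_expiration, process_page):
    job = Page.get_next_job(db, expiration, cache, lock_expiration)
    if job is None:
        return False  # nothing to review right now

    # `job` carries the page uuid (as a string), its url and score, and the
    # lock handle obtained through cache.has_next_job_lock.
    process_page(job['page'], job['url'], job['score'])
    return True
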