Пример #1
0
    def get_next_job(cls, db, expiration, cache, lock_expiration):
        from holmes.models import Settings

        settings = Settings.instance(db)

        pages_in_need_of_review = cls.get_next_job_list(db, expiration)

        if len(pages_in_need_of_review) == 0:
            if settings.lambda_score > 0:
                cls.update_pages_score_by(settings, settings.lambda_score, db)
            return None

        if settings.lambda_score > pages_in_need_of_review[0].score:
            cls.update_pages_score_by(settings, settings.lambda_score, db)
            return None

        page = choice(pages_in_need_of_review)

        for i in range(10):
            lock = cache.has_next_job_lock(page.url, lock_expiration)
            if lock is None:
                page = choice(pages_in_need_of_review)
            else:
                break

        if lock is None:
            return None

        return {
            'page': str(page.uuid),
            'url': page.url,
            'score': page.score,
            'lock': lock
        }
Пример #2
0
    def test_increases_page_score_when_lambda_is_top_page(self):
        WorkerFactory.create()
        page = PageFactory.create()
        page2 = PageFactory.create()

        settings = Settings.instance(self.db)
        settings.lambda_score = 10000

        Page.get_next_job(
            self.db,
            expiration=100,
            cache=self.sync_cache,
            lock_expiration=1
        )

        self.db.refresh(page)
        self.db.refresh(page2)

        expect(page.score).to_equal(5000)
        expect(page2.score).to_equal(5000)
Пример #3
0
    def test_increases_page_score_when_all_pages_have_been_reviewed(self):
        page = PageFactory.create(last_review_date=datetime(2014, 10, 10, 10, 10, 10))
        page2 = PageFactory.create(last_review_date=datetime(2014, 10, 10, 10, 10, 10))

        settings = Settings.instance(self.db)
        settings.lambda_score = 10000

        next_job = Page.get_next_job(
            self.db,
            expiration=100,
            cache=self.sync_cache,
            lock_expiration=1
        )

        expect(next_job).to_be_null()

        self.db.refresh(page)
        self.db.refresh(page2)

        expect(page.score).to_equal(5000)
        expect(page2.score).to_equal(5000)
Пример #4
0
    def get_next_job(cls, db, expiration, cache, lock_expiration, avg_links_per_page=10):
        from holmes.models import Settings, Worker, Domain, Limiter  # Avoid circular dependency

        page = None
        lock = None
        settings = Settings.instance(db)
        workers = db.query(Worker).all()
        number_of_workers = len(workers)

        active_domains = Domain.get_active_domains(db)
        active_domains_ids = [item.id for item in active_domains]

        all_domains_pages_in_need_of_review = {}

        for domain_id in active_domains_ids:
            pages = db \
                .query(
                    Page.uuid,
                    Page.url,
                    Page.score,
                    Page.last_review_date
                ) \
                .filter(Page.domain_id == domain_id) \
                .order_by(Page.score.desc())[:number_of_workers]
            if pages:
                all_domains_pages_in_need_of_review[domain_id] = pages

        pages_in_need_of_review = []
        current_domain = 0
        while all_domains_pages_in_need_of_review:
            domains = all_domains_pages_in_need_of_review.keys()
            if current_domain >= len(domains):
                current_domain = 0

            domain_id = domains[current_domain]

            item = all_domains_pages_in_need_of_review[domain_id].pop(0)
            pages_in_need_of_review.append(item)

            if not all_domains_pages_in_need_of_review[domain_id]:
                del all_domains_pages_in_need_of_review[domain_id]

            current_domain += 1

        if not pages_in_need_of_review:
            return None

        if settings.lambda_score > 0 and settings.lambda_score > pages_in_need_of_review[0].score:
            cls.update_pages_score_by(settings, settings.lambda_score, db)

        for i in range(len(pages_in_need_of_review)):
            if not Limiter.has_limit_to_work(db, active_domains, pages_in_need_of_review[i].url, avg_links_per_page):
                continue

            lock = cache.has_next_job_lock(
                pages_in_need_of_review[i].url,
                lock_expiration
            )

            if lock is not None:
                page = pages_in_need_of_review[i]
                break

        if page is None:
            return None

        return {
            'page': str(page.uuid),
            'url': page.url,
            'score': page.score,
            'lock': lock
        }
Пример #5
0
    def test_can_get_instance(self):
        settings = Settings.instance(self.db)

        expect(settings.lambda_score).to_equal(0)