예제 #1
0
    def test_update_pages_score(self):
        """Check that cached score increments reach the stored page scores.

        The page table and the 'pages-score' cache entry are cleared first.
        Increments are then registered through the cache; the persisted
        scores are expected to stay untouched until
        ``Page.update_pages_score`` runs and to grow by the number of
        increments afterwards.
        """
        settings = Config()
        settings.MAX_PAGE_SCORE = 15000000

        self.db.query(Page).delete()
        self.sync_cache.redis.delete('pages-score')

        first = PageFactory.create(score=3)
        second = PageFactory.create(score=0)

        # Three increments for the first page, a single one for the second.
        for _ in range(3):
            self.sync_cache.increment_page_score(first.id)
        self.sync_cache.increment_page_score(second.id)

        # Nothing has been persisted yet.
        for page, untouched in ((first, 3), (second, 0)):
            expect(page.score).to_equal(untouched)

        Page.update_pages_score(self.db, self.sync_cache, settings)
        self.db.flush()

        for page in (first, second):
            self.db.refresh(page)

        for page, updated in ((first, 6), (second, 1)):
            expect(page.score).to_equal(updated)
예제 #2
0
    def test_can_get_page_by_uuid(self):
        """Look a page up by UUID; an unknown UUID must yield nothing."""
        target = PageFactory.create()
        PageFactory.create()  # a second page the lookup must not return

        found = Page.by_uuid(target.uuid, self.db)
        expect(found.id).to_equal(target.id)

        # A freshly generated UUID is expected to match no stored page.
        missing = Page.by_uuid(uuid4(), self.db)
        expect(missing).to_be_null()
예제 #3
0
    def test_can_get_page_by_url_hash(self):
        """Look a page up by URL hash; an unknown hash must yield nothing."""
        page = PageFactory.create()
        PageFactory.create()  # a second page the lookup must not return

        loaded_page = Page.by_url_hash(page.url_hash, self.db)
        expect(loaded_page.id).to_equal(page.id)

        # The miss goes through by_url_hash as well (not by_uuid), so this
        # test covers the not-found path of the lookup it is named after.
        invalid_page = Page.by_url_hash('123', self.db)
        expect(invalid_page).to_be_null()
예제 #4
0
    def test_can_get_page_by_uuid(self):
        """Fetch a stored page by its UUID and verify a random UUID misses."""
        expected = PageFactory.create()
        PageFactory.create()  # extra page so the table holds more than one row

        expect(Page.by_uuid(expected.uuid, self.db).id).to_equal(expected.id)

        # uuid4() produces a new identifier that no created page carries.
        unknown_uuid = uuid4()
        expect(Page.by_uuid(unknown_uuid, self.db)).to_be_null()
예제 #5
0
    def test_can_get_page_by_url_hash(self):
        """Fetch a stored page by URL hash and verify an unknown hash misses."""
        page = PageFactory.create()
        PageFactory.create()  # extra page so the table holds more than one row

        loaded_page = Page.by_url_hash(page.url_hash, self.db)
        expect(loaded_page.id).to_equal(page.id)

        # Use by_url_hash for the negative case too; calling by_uuid here
        # would leave the hash lookup's not-found behaviour unchecked.
        invalid_page = Page.by_url_hash('123', self.db)
        expect(invalid_page).to_be_null()
예제 #6
0
    def _update_pages_score(self):
        """Apply pending page-score updates while holding the update lock.

        Asks the cache for the update-pages lock using the configured
        ``UPDATE_PAGES_SCORE_EXPIRATION``. When no lock is obtained
        (``None``) nothing happens. Otherwise ``Page.update_pages_score``
        runs, the lock is released, and ``last_update_pages_score`` records
        the UTC time of the successful update.
        """
        expiration = self.config.UPDATE_PAGES_SCORE_EXPIRATION
        lock = self.cache.has_update_pages_lock(expiration)

        if lock is None:
            return

        self.debug('Updating pages score...')
        try:
            Page.update_pages_score(self.db, self.cache, self.config)
        finally:
            # Release even when the update raises, so a failed run does not
            # leave the lock held.
            self.cache.release_update_pages_lock(lock)

        # Only reached when the update finished without raising.
        self.last_update_pages_score = datetime.utcnow()
예제 #7
0
    def enqueue(self, urls):
        """Register every ``(url, score)`` pair as a page, then wait.

        Nothing happens for an empty or missing ``urls``. Otherwise each
        pair is handed to ``Page.add_page`` together with this object's
        collaborators, and ``wait_for_async_requests`` is called once after
        the last pair.
        """
        if urls:
            for page_url, page_score in urls:
                Page.add_page(
                    self.db,
                    self.cache,
                    page_url,
                    page_score,
                    self.async_get_func,
                    self.publish,
                    self.config,
                    self.girl,
                    self.violation_definitions,
                    self.handle_page_added,
                )

            self.wait_for_async_requests()
예제 #8
0
    def test_can_get_next_jobs_count(self):
        """Check that the next-jobs count follows the number of pages created.

        Three pages are created and a count of 3 is expected; two more are
        added and the count is expected to reach 5.
        """
        config = Config()
        config.REVIEW_EXPIRATION_IN_SECONDS = 100

        # (pages to add in this round, expected running total afterwards)
        for batch_size, expected_total in ((3, 3), (2, 5)):
            for _ in range(batch_size):
                PageFactory.create()

            jobs_count = Page.get_next_jobs_count(self.db, config)
            expect(jobs_count).to_equal(expected_total)
예제 #9
0
    def post(self):
        """Create a page from the JSON request body.

        The body must carry ``url`` and may carry ``score``; a missing score
        falls back to the configured ``DEFAULT_PAGE_SCORE``. The yielded
        ``Page.add_page`` result unpacks into ``(created, url, outcome)``.
        A non-created page with reason ``invalid_url`` or ``redirect`` is
        answered with status 400 and a JSON description; any other outcome
        is written back as ``str(outcome)`` and the response is finished.
        """
        app = self.application
        payload = loads(self.request.body)
        url = payload["url"]
        score = float(payload.get("score", app.config.DEFAULT_PAGE_SCORE))

        created, url, outcome = yield Page.add_page(
            self.db,
            app.cache,
            url,
            score,
            app.http_client.fetch,
            app.event_bus.publish,
            app.config,
        )

        if not created:
            reason = outcome["reason"]

            if reason == "invalid_url":
                self.set_status(400, "Invalid url [%s]" % url)
                self.write_json({
                    "reason": "invalid_url",
                    "url": url,
                    "status": outcome["status"],
                    "details": outcome["details"],
                })
                return

            if reason == "redirect":
                self.set_status(400, "Redirect URL [%s]" % url)
                self.write_json({
                    "reason": "redirect",
                    "url": url,
                    "effectiveUrl": outcome["effectiveUrl"],
                })
                return

        self.write(str(outcome))
        self.finish()
예제 #10
0
 def get_next_jobs_count(self, callback=None):
     """Request the 'next-jobs' value through ``get_data``.

     ``get_data`` receives the key, the configured
     ``NEXT_JOBS_COUNT_EXPIRATION_IN_SECONDS`` as an int, a callable that
     computes the count via ``Page.get_next_jobs_count``, and ``callback``.
     """
     expiration = int(self.config.NEXT_JOBS_COUNT_EXPIRATION_IN_SECONDS)

     def count_next_jobs():
         return Page.get_next_jobs_count(self.db, self.config)

     self.get_data('next-jobs', expiration, count_next_jobs, callback=callback)
예제 #11
0
    def get(self):
        """Write one page of the next-job list as JSON.

        Reads ``current_page`` (default 1) and ``page_size`` (default 10)
        from the request arguments and responds with
        ``{'reviewCount': ..., 'pages': [{'uuid': ..., 'url': ...}, ...]}``.
        """
        page_number = int(self.get_argument('current_page', 1))
        per_page = int(self.get_argument('page_size', 10))

        expiration = self.application.config.REVIEW_EXPIRATION_IN_SECONDS
        jobs = Page.get_next_job_list(
            self.db,
            expiration,
            current_page=page_number,
            page_size=per_page,
        )

        # NOTE: the review count is fixed at 0 here; a lookup through
        # self.girl.get('next_jobs_count') was left disabled in this handler.
        self.write_json({
            'reviewCount': 0,
            'pages': [{'uuid': job.uuid, 'url': job.url} for job in jobs],
        })
예제 #12
0
    def get(self, uuid="", limit=10):
        """Write the most recently completed reviews of a page as JSON.

        ``uuid`` is parsed with ``UUID``; an unknown page sets status 404.
        Otherwise up to ``limit`` completed reviews, newest completion
        first, are written as a list of
        ``{"uuid", "completedAt", "violationCount"}`` objects.
        """
        page_uuid = UUID(uuid)
        page = Page.by_uuid(page_uuid, self.db)

        if not page:
            self.set_status(404, self._("Page UUID [%s] not found") % page_uuid)
            return

        # "== True" is kept on purpose: it builds the SQL comparison.
        completed = (
            self.db.query(Review)
            .filter(Review.page == page)
            .filter(Review.is_complete == True)
        )
        newest_first = completed.order_by(Review.completed_date.desc())

        self.write_json([
            {
                "uuid": str(review.uuid),
                "completedAt": review.completed_date,
                "violationCount": review.violation_count,
            }
            for review in newest_first[:limit]
        ])
예제 #13
0
    def test_can_save(self):
        """POST a URL to '/page' and check the page and cache expirations.

        The response body is expected to be the UUID of the stored page, and
        the application's ``girl`` (replaced by a ``Mock``) is expected to
        receive exactly four ``expire`` calls.

        NOTE(review): this is a generator; presumably it is driven by a
        coroutine test decorator that is outside this view - confirm.
        """
        target_url = 'http://www.globo.com'
        self.mock_request(status_code=200, effective_url=target_url)

        self.server.application.girl = Mock()

        request_body = dumps({'url': target_url})
        response = yield self.authenticated_fetch(
            '/page', method='POST', body=request_body
        )

        expect(response.code).to_equal(200)

        page_uuid = UUID(response.body)
        stored = Page.by_uuid(page_uuid, self.db)

        expect(stored).not_to_be_null()
        expect(str(page_uuid)).to_equal(stored.uuid)

        girl = self.server.application.girl
        expect(girl.expire.call_count).to_equal(4)

        expired_keys = (
            'domains_details',
            'failed_responses_count',
            'violation_count_for_domains',
            'top_violations_in_category_for_domains',
        )
        girl.assert_has_calls([call.expire(name) for name in expired_keys])
예제 #14
0
    def get(self, page_uuid, review_uuid):
        """Write a review as JSON, redirecting or failing when it is missing.

        Each identifier is looked up only when ``_parse_uuid`` accepts it.
        An unknown page sets status 404. A known page without a matching
        review redirects to the page's ``last_review_uuid``. Otherwise the
        review's ``to_dict`` output, extended with ``violationPoints`` and
        ``violationCount``, is written.
        """
        review = (
            Review.by_uuid(review_uuid, self.db)
            if self._parse_uuid(review_uuid) else None
        )
        page = (
            Page.by_uuid(page_uuid, self.db)
            if self._parse_uuid(page_uuid) else None
        )

        if not page:
            message = self._('Page UUID [%s] not found') % page_uuid
            self.set_status(404, message)
            return

        if not review:
            target = '/page/%s/review/%s/' % (page_uuid, page.last_review_uuid)
            self.redirect(target)
            return

        payload = review.to_dict(
            self.application.fact_definitions,
            self.application.violation_definitions,
            self._,
        )
        payload.update({
            'violationPoints': review.get_violation_points(),
            'violationCount': review.violation_count,
        })

        self.write_json(payload)
예제 #15
0
    def get(self, page_uuid, review_uuid):
        """Serve one review of a page as JSON.

        Identifiers rejected by ``_parse_uuid`` are treated as not found.
        With a page but no review the request is redirected to the page's
        last review; without a page the status is set to 404; with both the
        review dictionary plus ``violationPoints`` and ``violationCount``
        is written.
        """
        found_review = None
        if self._parse_uuid(review_uuid):
            found_review = Review.by_uuid(review_uuid, self.db)

        found_page = None
        if self._parse_uuid(page_uuid):
            found_page = Page.by_uuid(page_uuid, self.db)

        if not found_review and found_page:
            last_review = found_page.last_review_uuid
            self.redirect('/page/%s/review/%s/' % (page_uuid, last_review))
        elif not found_page:
            not_found = self._('Page UUID [%s] not found') % page_uuid
            self.set_status(404, not_found)
        else:
            app = self.application
            body = found_review.to_dict(
                app.fact_definitions, app.violation_definitions, self._
            )
            body.update({
                'violationPoints': found_review.get_violation_points(),
                'violationCount': found_review.violation_count,
            })

            self.write_json(body)
예제 #16
0
 def increment_next_jobs_count(self, increment=1, callback=None):
     """Increment the 'next-jobs' value through ``increment_data``.

     ``increment_data`` receives the key, a callable that computes the
     count via ``Page.get_next_jobs_count``, ``increment`` and ``callback``.
     """
     def count_next_jobs():
         return Page.get_next_jobs_count(self.db, self.config)

     self.increment_data('next-jobs', count_next_jobs, increment, callback)
예제 #17
0
    def get_count(self, key, domain_name, expiration, get_count_method):
        """Return a count from the cache, computing and storing it on a miss.

        The cache key is ``'<domain name>-<key>'``. On a hit the cached
        value is returned as an int. On a miss ``domain_name`` is resolved
        to a ``Domain`` (unless it already is one, or is falsy); the count
        comes from ``Page.get_page_count`` when the domain is ``None`` and
        from ``get_count_method(domain)`` otherwise, and is stored with
        ``setex`` for ``expiration`` before being returned.
        """
        lookup_key = '%s-%s' % (self.get_domain_name(domain_name), key)
        cached = self.redis.get(lookup_key)
        if cached is not None:
            return int(cached)

        needs_lookup = domain_name and not isinstance(domain_name, Domain)
        if needs_lookup:
            domain = Domain.get_domain_by_name(domain_name, self.db)
        else:
            domain = domain_name

        fresh = (
            Page.get_page_count(self.db)
            if domain is None
            else get_count_method(domain)
        )

        # The key is rebuilt from the resolved domain, which may differ from
        # the key used for the lookup above.
        store_key = '%s-%s' % (self.get_domain_name(domain), key)
        total = int(fresh)
        self.redis.setex(store_key, expiration, value=total)

        return total
예제 #18
0
    def enqueue(self, urls):
        """Add a page for each ``(url, score)`` pair and wait for completion.

        An empty or missing ``urls`` is a no-op. Otherwise every pair goes
        to ``Page.add_page`` with this object's db, cache, fetch function,
        publisher, config and page-added handler, followed by a single
        ``wait_for_async_requests`` call.
        """
        if urls:
            for address, weight in urls:
                Page.add_page(
                    self.db, self.cache, address, weight,
                    self.async_get_func, self.publish, self.config,
                    self.handle_page_added,
                )

            self.wait_for_async_requests()
예제 #19
0
    def test_can_get_next_job_when_domain_limited(self):
        """Check that a limiter on one domain diverts the next job elsewhere.

        Two domains get ten pages each, and twenty workers are created. The
        first domain has a limiter with value 2 and the higher page scores.
        The first job is expected to be its highest-scored page; once a
        worker is marked as handling that URL, the following job is expected
        to be the other domain's highest-scored page.
        """
        self.db.query(Domain).delete()
        self.db.query(Page).delete()

        limited = DomainFactory.create()
        unlimited = DomainFactory.create()

        LimiterFactory.create(url=limited.url, value=2)

        limited_pages, unlimited_pages, workers = [], [], []
        for index in range(10):
            # Two workers per iteration, created before that iteration's pages.
            workers.extend(WorkerFactory.create() for _ in range(2))

            limited_pages.append(PageFactory.create(
                domain=limited,
                url="%s/%d.html" % (limited.url, index),
                score=index * 10,
            ))
            unlimited_pages.append(PageFactory.create(
                domain=unlimited,
                url="%s/%d.html" % (unlimited.url, index),
                score=index,
            ))

        # first one should not be limited
        job = Page.get_next_job(
            self.db,
            expiration=100,
            cache=self.sync_cache,
            lock_expiration=1,
            avg_links_per_page=10,
        )

        expect(job).not_to_be_null()
        expect(job['page']).to_equal(str(limited_pages[-1].uuid))
        workers[0].current_url = job['url']
        self.db.flush()

        # second one should be limited (2 / 10 = 0.2, rounded up = 1 job at a time)
        job = Page.get_next_job(
            self.db,
            expiration=100,
            cache=self.sync_cache,
            lock_expiration=1,
        )

        expect(job).not_to_be_null()
        expect(job['page']).to_equal(str(unlimited_pages[-1].uuid))
예제 #20
0
    def test_increases_page_score_when_lambda_is_top_page(self):
        """Check both page scores after a job request with lambda_score set.

        With one worker, two pages and ``lambda_score`` set to 10000, each
        page is expected to hold a score of 5000 after ``get_next_job``.
        """
        WorkerFactory.create()
        pages = [PageFactory.create(), PageFactory.create()]

        Settings.instance(self.db).lambda_score = 10000

        Page.get_next_job(
            self.db, expiration=100, cache=self.sync_cache, lock_expiration=1
        )

        for page in pages:
            self.db.refresh(page)

        for page in pages:
            expect(page.score).to_equal(5000)
예제 #21
0
    def get(self, uuid=''):
        """Write the page's uuid and url, or set status 404 if it is unknown.

        ``uuid`` is parsed with ``UUID`` before the lookup.
        """
        page_uuid = UUID(uuid)
        page = Page.by_uuid(page_uuid, self.db)

        if page:
            self.write({"uuid": str(page.uuid), "url": page.url})
        else:
            self.set_status(404, self._('Page UUID [%s] not found') % page_uuid)
예제 #22
0
    def test_can_get_next_job_when_expired(self):
        """A page last reviewed in 2010 must be offered as the next job."""
        long_ago = datetime(2010, 10, 10, 10, 10, 10)
        stale_page = PageFactory.create(last_review_date=long_ago)

        job = Page.get_next_job(
            self.db, expiration=100, cache=self.sync_cache, lock_expiration=1
        )

        expect(job).not_to_be_null()
        expect(job['page']).to_equal(str(stale_page.uuid))
예제 #23
0
    def test_get_next_job_does_not_get_from_inactive_domains(self):
        """A page whose domain has ``is_active=False`` must not be offered."""
        inactive_domain = DomainFactory.create(is_active=False)
        PageFactory.create(domain=inactive_domain)

        job = Page.get_next_job(
            self.db, expiration=100, cache=self.sync_cache, lock_expiration=1
        )

        expect(job).to_be_null()
예제 #24
0
    def test_can_get_next_job(self):
        """With a single page stored, that page must be the next job."""
        only_page = PageFactory.create()

        job = Page.get_next_job(
            self.db, expiration=100, cache=self.sync_cache, lock_expiration=1
        )

        expect(job).not_to_be_null()
        expect(job['page']).to_equal(str(only_page.uuid))
예제 #25
0
    def get(self, uuid):
        """Write the page's violations per day, or set 404 if it is unknown.

        The response has the shape ``{"violations": <violations per day>}``.
        """
        page = Page.by_uuid(uuid, self.db)

        if page:
            self.write_json(
                {"violations": page.get_violations_per_day(self.db)}
            )
        else:
            self.set_status(404, self._('Page UUID [%s] not found') % uuid)
예제 #26
0
    def get(self, uuid=""):
        """Respond with ``{"uuid", "url"}`` for the page identified by ``uuid``.

        The identifier is converted with ``UUID`` first. When no page
        matches, only the 404 status is set.
        """
        parsed = UUID(uuid)
        page = Page.by_uuid(parsed, self.db)

        if not page:
            not_found = self._("Page UUID [%s] not found") % parsed
            self.set_status(404, not_found)
            return

        summary = dict(uuid=str(page.uuid), url=page.url)
        self.write(summary)
예제 #27
0
    def get(self, uuid):
        """Respond with the per-day violations of the page named by ``uuid``.

        When no page matches, only the 404 status is set.
        """
        page = Page.by_uuid(uuid, self.db)

        if not page:
            not_found = self._("Page UUID [%s] not found") % uuid
            self.set_status(404, not_found)
            return

        per_day = page.get_violations_per_day(self.db)
        self.write_json(dict(violations=per_day))
예제 #28
0
    def test_get_next_job_list(self):
        """The next-job list must hold both created pages, one checked by value."""
        wanted = PageFactory.create()
        PageFactory.create()

        jobs = Page.get_next_job_list(self.db, expiration=100)

        expect(jobs).to_length(2)

        summaries = []
        for job in jobs:
            summaries.append({'url': job.url, 'uuid': str(job.uuid)})

        expected_entry = {'url': wanted.url, 'uuid': str(wanted.uuid)}
        expect(summaries).to_include(expected_entry)
예제 #29
0
        def handle(has_key):
            """Increment the cached counter, or seed it when it is missing.

            An existing key is incremented with ``incrby``. A missing key is
            set to the current count plus ``increment`` minus one, the count
            coming from ``Page.get_page_count`` (no domain) or
            ``get_default_method(domain)``. ``domain_name``, ``key``,
            ``increment``, ``get_default_method`` and ``callback`` are taken
            from the enclosing scope.
            """
            # NOTE(review): the domain is resolved even when the key exists,
            # although only the missing-key path below reads it.
            needs_lookup = domain_name and not isinstance(domain_name, Domain)
            if needs_lookup:
                domain = Domain.get_domain_by_name(domain_name, self.db)
            else:
                domain = domain_name

            if has_key:
                self.redis.incrby(key, increment, callback=callback)
                return

            if domain is None:
                current = Page.get_page_count(self.db)
            else:
                current = get_default_method(domain)

            self.redis.set(key, current + increment - 1, callback=callback)
예제 #30
0
        def handle(has_key):
            """Bump the counter stored under ``key`` or initialise it.

            ``has_key`` tells whether the counter already exists. If so it is
            increased by ``increment``; if not it is written as the computed
            base count plus ``increment`` minus one. Names other than
            ``has_key`` are read from the enclosing scope.
            """
            domain = domain_name
            if domain and not isinstance(domain, Domain):
                # Resolve a name into a Domain before any branch runs.
                domain = Domain.get_domain_by_name(domain_name, self.db)

            if not has_key:
                base = (
                    Page.get_page_count(self.db)
                    if domain is None
                    else get_default_method(domain)
                )
                seeded = base + increment - 1
                self.redis.set(key, seeded, callback=callback)
            else:
                self.redis.incrby(key, increment, callback=callback)
예제 #31
0
    def increment_count(self, key, domain_name, get_default_method, increment=1):
        """Increment the cached counter for a domain, seeding it if absent.

        The cache key is ``'<domain name>-<key>'``. An existing key is
        increased by ``increment``. A missing key is set to the current
        count plus ``increment`` minus one, where the count comes from
        ``Page.get_page_count`` when no domain applies and from
        ``get_default_method(domain)`` otherwise.
        """
        key = '%s-%s' % (self.get_domain_name(domain_name), key)
        exists = self.has_key(key)

        # NOTE(review): the domain is resolved even when the key exists,
        # although only the missing-key path below reads it.
        needs_lookup = domain_name and not isinstance(domain_name, Domain)
        if needs_lookup:
            domain = Domain.get_domain_by_name(domain_name, self.db)
        else:
            domain = domain_name

        if exists:
            self.redis.incrby(key, increment)
            return

        if domain is None:
            current = Page.get_page_count(self.db)
        else:
            current = get_default_method(domain)

        self.redis.set(key, current + increment - 1)
예제 #32
0
        def handle(count):
            """Deliver a cached count, or compute, store and report a new one.

            A non-``None`` ``count`` is passed to ``callback`` as an int.
            Otherwise the count is taken from ``Page.get_page_count`` (no
            domain) or ``get_count_method(domain)`` and written with
            ``setex``; the result of ``handle_set_count(count, callback)``
            is given to ``setex`` as its callback. ``domain_name``, ``key``,
            ``expiration``, ``get_count_method`` and ``callback`` are read
            from the enclosing scope.
            """
            if count is None:
                domain = self.get_domain(domain_name)

                count = (
                    Page.get_page_count(self.db)
                    if domain is None
                    else get_count_method(domain)
                )

                store_key = '%s-%s' % (self.get_domain_name(domain), key)

                self.redis.setex(
                    key=store_key,
                    value=int(count),
                    seconds=expiration,
                    callback=self.handle_set_count(count, callback),
                )
            else:
                callback(int(count))
예제 #33
0
    def test_can_save_known_domain(self):
        """POST a URL whose domain already exists and check the stored page.

        The response is expected to be 200 with a body holding the UUID of
        the page that can then be loaded from the database.
        """
        target_url = 'http://www.globo.com'
        DomainFactory.create(url=target_url, name='globo.com')

        self.mock_request(status_code=200, effective_url=target_url)

        request_body = dumps({'url': target_url})
        response = self.fetch('/page', method='POST', body=request_body)

        expect(response.code).to_equal(200)

        page_uuid = UUID(response.body)
        stored = Page.by_uuid(page_uuid, self.db)

        expect(stored).not_to_be_null()
        expect(str(page_uuid)).to_equal(stored.uuid)
예제 #34
0
    def test_can_save_known_domain(self):
        """POST (authenticated) a URL of a known domain and check the page.

        The response is expected to be 200 with a body holding the UUID of
        the page that can then be loaded from the database.

        NOTE(review): this is a generator; presumably it is driven by a
        coroutine test decorator that is outside this view - confirm.
        """
        site = 'http://www.globo.com'
        DomainFactory.create(url=site, name='globo.com')

        self.mock_request(status_code=200, effective_url=site)

        payload = dumps({'url': site})
        response = yield self.authenticated_fetch(
            '/page', method='POST', body=payload
        )

        expect(response.code).to_equal(200)

        returned_uuid = UUID(response.body)
        saved_page = Page.by_uuid(returned_uuid, self.db)

        expect(saved_page).not_to_be_null()
        expect(str(returned_uuid)).to_equal(saved_page.uuid)
예제 #35
0
    def test_can_get_next_job(self):
        """Jobs must come back in descending score order.

        Twenty workers and twenty pages (scores 0.0 to 19.0, one domain) are
        created; twenty consecutive ``get_next_job`` calls are expected to
        return the pages from the highest score down to the lowest.
        """
        domain = DomainFactory.create()

        pages = []
        for score in range(20):
            WorkerFactory.create()
            pages.append(PageFactory.create(domain=domain, score=float(score)))

        # The last page created has the highest score, so walk backwards.
        for expected_page in reversed(pages):
            job = Page.get_next_job(
                self.db,
                expiration=100,
                cache=self.sync_cache,
                lock_expiration=100,
            )

            expect(job).not_to_be_null()
            expect(job['page']).to_equal(str(expected_page.uuid))
예제 #36
0
        def handle(count):
            """Report ``count`` to ``callback``, computing it when uncached.

            If ``count`` is not ``None`` it is converted to int and passed
            on. If it is ``None`` the count is computed (page count when
            ``get_domain`` yields ``None``, ``get_count_method(domain)``
            otherwise) and stored via ``setex`` with ``expiration``. Names
            other than ``count`` are read from the enclosing scope.
            """
            if count is not None:
                callback(int(count))
                return

            domain = self.get_domain(domain_name)

            if domain is not None:
                count = get_count_method(domain)
            else:
                count = Page.get_page_count(self.db)

            domain_label = self.get_domain_name(domain)
            cache_key = '%s-%s' % (domain_label, key)

            # handle_set_count builds the callback invoked by setex.
            self.redis.setex(
                key=cache_key,
                value=int(count),
                seconds=expiration,
                callback=self.handle_set_count(count, callback),
            )
예제 #37
0
    def test_increases_page_score_when_all_pages_have_been_reviewed(self):
        """No job is offered, yet both page scores must end up at 5000.

        Both pages carry a ``last_review_date`` of 2014-10-10 and
        ``lambda_score`` is set to 10000 before the job request.
        """
        reviewed_at = datetime(2014, 10, 10, 10, 10, 10)
        pages = [
            PageFactory.create(last_review_date=reviewed_at),
            PageFactory.create(last_review_date=reviewed_at),
        ]

        Settings.instance(self.db).lambda_score = 10000

        job = Page.get_next_job(
            self.db, expiration=100, cache=self.sync_cache, lock_expiration=1
        )

        expect(job).to_be_null()

        for page in pages:
            self.db.refresh(page)

        for page in pages:
            expect(page.score).to_equal(5000)
예제 #38
0
    def get(self):
        """Write one page of the next-job list plus the review count as JSON.

        Reads ``current_page`` (default 1) and ``page_size`` (default 10)
        from the request arguments. The review count is read from
        ``self.girl`` under ``"next_jobs_count"``. The response has the
        shape ``{"reviewCount": ..., "pages": [{"uuid", "url"}, ...]}``.
        """
        page_number = int(self.get_argument("current_page", 1))
        per_page = int(self.get_argument("page_size", 10))

        expiration = self.application.config.REVIEW_EXPIRATION_IN_SECONDS
        jobs = Page.get_next_job_list(
            self.db, expiration, current_page=page_number, page_size=per_page
        )

        self.write_json({
            "reviewCount": self.girl.get("next_jobs_count"),
            "pages": [{"uuid": job.uuid, "url": job.url} for job in jobs],
        })
예제 #39
0
    def post(self):
        """Create a page from the JSON request body and queue it.

        The body must carry ``url`` and may carry ``score``; a missing score
        falls back to the configured ``DEFAULT_PAGE_SCORE``. The yielded
        ``Page.add_page`` result unpacks into ``(created, url, outcome)``.
        A non-created page with reason ``invalid_url`` or ``redirect`` is
        answered with status 400 and a JSON description. Otherwise the
        outcome is passed to the cache's ``add_next_job_bucket``, written
        back as ``str(outcome)``, and the response is finished.
        """
        app = self.application
        payload = loads(self.request.body)
        url = payload['url']
        score = float(payload.get('score', app.config.DEFAULT_PAGE_SCORE))

        created, url, outcome = yield Page.add_page(
            self.db,
            app.cache,
            url,
            score,
            app.http_client.fetch,
            app.event_bus.publish,
            app.config,
            app.girl,
            app.default_violations_values,
            app.violation_definitions,
        )

        # Only a page that was not created carries a failure reason.
        failure = None if created else outcome['reason']

        if failure == 'invalid_url':
            self.set_status(400, self._('Invalid url [%s]') % url)
            self.write_json({
                'reason': 'invalid_url',
                'url': url,
                'status': outcome['status'],
                'details': outcome['details'],
            })
            return

        if failure == 'redirect':
            message = self._('Supplied URL is a redirect [%s]') % url
            self.set_status(400, message)
            self.write_json({
                'reason': 'redirect',
                'url': url,
                'effectiveUrl': outcome['effectiveUrl'],
            })
            return

        yield self.application.cache.add_next_job_bucket(outcome, url)

        self.write(str(outcome))
        self.finish()
예제 #40
0
    def test_can_save(self):
        """POST a URL to '/page' and check that the page was stored.

        The response is expected to be 200 with a body holding the UUID of
        the page that can then be loaded from the database.

        NOTE(review): this is a generator; presumably it is driven by a
        coroutine test decorator that is outside this view - confirm.
        """
        # The outgoing request is stubbed through mock_request; the unused
        # local ``side_effect`` helper that used to sit here was dead code.
        self.mock_request(status_code=200, effective_url="http://www.globo.com")

        response = yield self.http_client.fetch(
            self.get_url('/page'),
            method='POST',
            body=dumps({
                'url': 'http://www.globo.com'
            })
        )

        expect(response.code).to_equal(200)

        page_uuid = UUID(response.body)
        page = Page.by_uuid(page_uuid, self.db)

        expect(page).not_to_be_null()
        expect(str(page_uuid)).to_equal(page.uuid)
예제 #41
0
    def get_count(self, key, domain_name, expiration, get_count_method):
        """Fetch a per-domain count, filling the cache when it is empty.

        A cached value under ``'<domain name>-<key>'`` is returned as an
        int. Otherwise the domain is resolved (names are looked up, ``Domain``
        instances and falsy values are used as given), the count is taken
        from ``Page.get_page_count`` for a ``None`` domain or from
        ``get_count_method(domain)``, stored with ``setex`` for
        ``expiration``, and returned as an int.
        """
        hit = self.redis.get(
            '%s-%s' % (self.get_domain_name(domain_name), key)
        )
        if hit is not None:
            return int(hit)

        domain = domain_name
        if domain and not isinstance(domain, Domain):
            domain = Domain.get_domain_by_name(domain_name, self.db)

        if domain is not None:
            computed = get_count_method(domain)
        else:
            computed = Page.get_page_count(self.db)

        # Stored under a key derived from the resolved domain.
        resolved_key = '%s-%s' % (self.get_domain_name(domain), key)
        result = int(computed)
        self.redis.setex(resolved_key, expiration, value=result)

        return result
예제 #42
0
    def increment_count(self,
                        key,
                        domain_name,
                        get_default_method,
                        increment=1):
        """Raise the cached counter of a domain, creating it when missing.

        The counter lives under ``'<domain name>-<key>'``. When it exists it
        is increased by ``increment``; when it does not it is written as the
        base count plus ``increment`` minus one, the base count being
        ``Page.get_page_count`` for a ``None`` domain and
        ``get_default_method(domain)`` otherwise.
        """
        key = '%s-%s' % (self.get_domain_name(domain_name), key)
        already_cached = self.has_key(key)

        domain = domain_name
        if domain and not isinstance(domain, Domain):
            # Resolved before branching, as in the cached path below.
            domain = Domain.get_domain_by_name(domain_name, self.db)

        if not already_cached:
            base = (
                Page.get_page_count(self.db)
                if domain is None
                else get_default_method(domain)
            )
            self.redis.set(key, base + increment - 1)
        else:
            self.redis.incrby(key, increment)
예제 #43
0
    def get(self, uuid='', limit=10):
        """Respond with the latest completed reviews of a page.

        ``uuid`` is converted with ``UUID``. When no page matches, only the
        404 status is set. Otherwise at most ``limit`` completed reviews,
        ordered by completion date descending, are written as a JSON list of
        ``{'uuid', 'completedAt', 'violationCount'}`` objects.
        """
        parsed = UUID(uuid)
        page = Page.by_uuid(parsed, self.db)

        if not page:
            self.set_status(404, self._('Page UUID [%s] not found') % parsed)
            return

        # "== True" is kept on purpose: it builds the SQL comparison.
        query = (
            self.db.query(Review)
            .filter(Review.page == page)
            .filter(Review.is_complete == True)
            .order_by(Review.completed_date.desc())
        )

        summaries = [
            {
                'uuid': str(review.uuid),
                'completedAt': review.completed_date,
                'violationCount': review.violation_count,
            }
            for review in query[:limit]
        ]

        self.write_json(summaries)