Пример #1
0
    def get_next_requests(self, max_n_requests, partition_id, **kwargs):
        """
        Dequeues new batch of requests for crawling.

        Rows of ``self.queue_model`` belonging to the partition are read in the order applied by
        ``self._order_by``, turned into requests and deleted through the same session, which is
        committed once the whole batch has been processed. If anything raises, the exception is
        logged, the session is rolled back and the requests built so far are still returned.

        :param max_n_requests: maximum number of requests to return
        :param partition_id: partition id
        :return: list of :class:`Request <frontera.core.models.Request>` objects.
        """
        results = []
        try:
            query = self.session.query(self.queue_model).filter_by(partition_id=partition_id)
            for item in self._order_by(query).limit(max_n_requests):
                # Rows stored without an HTTP method fall back to GET.
                method = item.method or 'GET'
                r = Request(
                    item.url,
                    method=method,
                    meta=item.meta,
                    headers=item.headers,
                    cookies=item.cookies)
                # The queue row columns are authoritative for fingerprint and score.
                r.meta['fingerprint'] = item.fingerprint
                r.meta['score'] = item.score
                results.append(r)
                self.session.delete(item)
            self.session.commit()
        except Exception as exc:
            self.logger.exception(exc)
            self.session.rollback()
        return results
Пример #2
0
    def get_next_requests(self, max_n_requests, partition_id, **kwargs):
        """
        Pops the next batch of queued requests for one partition.

        Each selected queue row is converted to a request and deleted via the session; the session
        is committed after the loop. Any exception is logged and followed by a rollback, and the
        requests collected up to that point are returned.

        :param max_n_requests: maximum number of requests to return
        :param partition_id: partition id
        :return: list of :class:`Request <frontera.core.models.Request>` objects.
        """
        batch = []
        try:
            partition_rows = self.session.query(self.queue_model).filter_by(partition_id=partition_id)
            ordered_rows = self._order_by(partition_rows)
            for row in ordered_rows.limit(max_n_requests):
                # Rows without a stored method are treated as GET.
                verb = row.method or b'GET'
                request = Request(row.url, method=verb, meta=row.meta, headers=row.headers, cookies=row.cookies)
                request.meta[b'fingerprint'] = to_bytes(row.fingerprint)
                request.meta[b'score'] = row.score
                batch.append(request)
                self.session.delete(row)
            self.session.commit()
        except Exception as err:
            self.logger.exception(err)
            self.session.rollback()
        return batch
Пример #3
0
def test_states():
    """
    Round-trips 128 randomly generated request states through ``HCFStates``.

    States are pushed into the cache and flushed, then verified twice with ``check_states``:
    once right after the flush and once after a flush with ``force_clear=True``.
    """
    logging.basicConfig(level=logging.DEBUG)
    states = HCFStates(config.API_KEY, config.PROJECT_ID, config.FRONTIER_NAME,
                       256, True)
    states.frontier_start()

    requests = []
    fingerprints = []
    for _ in range(128):
        request = Request('http://website.com/%d' % randint(0, maxsize))
        request.meta[b'fingerprint'] = generate_fprint()
        request.meta[b'state'] = choice([
            HCFStates.NOT_CRAWLED,
            HCFStates.QUEUED,
            HCFStates.CRAWLED,
            HCFStates.ERROR,
        ])
        requests.append(request)
        fingerprints.append(request.meta[b'fingerprint'])

    states.update_cache(requests)
    states.flush()

    # First check runs against the cache as left by the plain flush.
    check_states(states, fingerprints, requests)

    # Second check runs after the cache has been force-cleared.
    states.flush(force_clear=True)
    check_states(states, fingerprints, requests)
    def test_should_parse_domain_info(self):
        """
        Checks that ``DomainMiddleware.add_seeds`` attaches a ``b'domain'`` meta entry to every seed.

        This test does not set ``TLDEXTRACT_DOMAIN_INFO``; the expected ``sld``, ``subdomain`` and
        ``tld`` values are empty and ``name`` equals ``netloc``.
        """
        seeds = [
            Request('http://example.com'),
            Request('https://www.google.com'),
        ]

        mware = DomainMiddleware(self.fake_manager)
        result = mware.add_seeds(seeds)

        # assertEqual replaces the deprecated assertEquals alias (removed in Python 3.12).
        self.assertEqual(len(result), len(seeds))

        for r in result:
            self.assertIn(b'domain', r.meta, 'Missing domain info for %r' % r)

        expected = [
            {
                b'name': b'example.com',
                b'netloc': b'example.com',
                b'scheme': b'http',
                b'sld': b'',
                b'subdomain': b'',
                b'tld': b''
            },
            {
                b'name': b'www.google.com',
                b'netloc': b'www.google.com',
                b'scheme': b'https',
                b'sld': b'',
                b'subdomain': b'',
                b'tld': b''
            },
        ]
        self.assertEqual(expected, [r.meta[b'domain'] for r in result])
Пример #5
0
    def test_should_parse_tldextract_extra_domain_info(self):
        """
        With ``TLDEXTRACT_DOMAIN_INFO`` set on the manager settings, every seed returned by
        ``add_seeds`` must carry a ``b'domain'`` meta entry whose ``sld``, ``subdomain`` and
        ``tld`` fields are filled in.
        """
        seed_requests = [Request('http://example.com'), Request('https://www.google.com')]

        self.fake_manager.settings = {'TLDEXTRACT_DOMAIN_INFO': True}

        middleware = DomainMiddleware(self.fake_manager)
        processed = middleware.add_seeds(seed_requests)

        self.assertEqual(len(processed), len(seed_requests))

        for request in processed:
            self.assertIn(b'domain', request.meta, 'Missing domain info for %r' % request)

        example_info = {
            b'name': b'example.com',
            b'netloc': b'example.com',
            b'scheme': b'http',
            b'sld': b'example',
            b'subdomain': b'',
            b'tld': b'com',
        }
        google_info = {
            b'name': b'google.com',
            b'netloc': b'www.google.com',
            b'scheme': b'https',
            b'sld': b'google',
            b'subdomain': b'www',
            b'tld': b'com',
        }
        actual = [request.meta[b'domain'] for request in processed]
        self.assertEqual([example_info, google_info], actual)
Пример #6
0
    def get_next_requests(self, max_n_requests, partition_id, **kwargs):
        """
        Dequeues new batch of requests for crawling.

        Queue rows of the current crawl and partition are read in order, converted to requests and
        then removed with one prepared DELETE executed concurrently for all dequeued rows. The number
        of dequeued URLs is reported through ``self.counter_cls.cass_count``. Any exception is
        logged; the requests built before the failure are still returned.

        :param max_n_requests: maximum number of requests to return
        :param partition_id: partition id
        :return: list of :class:`Request <frontera.core.models.Request>` objects.
        """
        # Meta attributes copied into a plain dict for the request.
        meta_fields = ("fingerprint", "domain", "origin_is_frontier", "scrapy_callback",
                       "scrapy_errback", "scrapy_meta", "score", "jid")
        results = []
        try:
            cql_ditems = []
            d_query = self.session.prepare("DELETE FROM queue WHERE crawl = ? AND fingerprint = ? AND partition_id = ? "
                                           "AND score = ? AND created_at = ?")
            rows = self.queue_model.objects.filter(crawl=self.crawl_id, partition_id=partition_id).\
                order_by("partition_id", "score", self._order_by()).limit(max_n_requests)
            for item in rows:
                method = item.method or 'GET'

                # TODO (from the original author): passing the meta object itself to Request fails
                # while encoding for the message bus, so its fields are copied into a plain dict.
                meta_dict = {name: getattr(item.meta, name) for name in meta_fields}

                r = Request(item.url, method=method, meta=meta_dict, headers=item.headers, cookies=item.cookies)
                # The queue row columns are authoritative for fingerprint and score.
                r.meta['fingerprint'] = item.fingerprint
                r.meta['score'] = item.score
                results.append(r)

                # Bind values for the prepared DELETE, in placeholder order.
                cql_ditems.append((item.crawl, item.fingerprint, item.partition_id, item.score, item.created_at))

            dequeued_urls = len(cql_ditems)
            if dequeued_urls > 0:
                execute_concurrent_with_args(self.session, d_query, cql_ditems, concurrency=200)

            self.counter_cls.cass_count({"dequeued_urls": dequeued_urls})

        except Exception as exc:
            self.logger.exception(exc)
        return results
Пример #7
0
    def create_request(self,
                       url,
                       method='GET',
                       headers=None,
                       cookies=None,
                       meta=None,
                       body=''):
        """
        Builds a request from the given fields, fingerprints it with ``self.url_mw`` and hands it
        to ``self._states_context.refresh_and_keep`` before returning it.

        :param url: str
        :param method: str
        :param headers: dict
        :param cookies: dict
        :param meta: dict
        :param body: str
        :return: :class:`Request <frontera.core.models.Request>`
        """
        request_fields = dict(method=method, headers=headers, cookies=cookies, meta=meta, body=body)
        request = Request(url, **request_fields)
        self.url_mw._add_fingerprint(request)
        self._states_context.refresh_and_keep(request)
        return request
Пример #8
0
    def create_request(self,
                       url,
                       method=b'GET',
                       headers=None,
                       cookies=None,
                       meta=None,
                       body=b''):
        """
        Builds a request from the given fields and fingerprints it with ``self.url_mw``.

        The request state is not loaded from storage here; use self.refresh_states on a batch of
        requests to get their states from storage.

        :param url: str
        :param method: str
        :param headers: dict
        :param cookies: dict
        :param meta: dict
        :param body: str
        :return: :class:`Request <frontera.core.models.Request>`
        """
        request_fields = dict(method=method, headers=headers, cookies=cookies, meta=meta, body=body)
        request = Request(url, **request_fields)
        self.url_mw._add_fingerprint(request)
        return request
Пример #9
0
def single_node_chain(url1, url2):
    """
    Builds a response for ``url2`` whose originating request targeted ``url1``.

    The response meta records ``url1`` as the only redirect URL, together with the ``sha1``
    fingerprints of both URLs.

    :param url1: URL of the original request
    :param url2: URL of the response
    :return: the prepared ``Response``
    """
    origin = Request(url=url1)
    response = Response(url=url2, request=origin)
    response_fingerprint = sha1(url2)
    response.meta[b'fingerprint'] = response_fingerprint
    response.meta[b'redirect_urls'] = [url1]
    origin_fingerprint = sha1(url1)
    response.meta[b'redirect_fingerprints'] = [origin_fingerprint]
    return response
Пример #10
0
 def schedule(self, batch):
     """
     Appends a new request to ``self.requests`` for every batch entry whose fourth element is truthy.

     :param batch: iterable of indexable entries read as (fingerprint, score, request, schedule flag)
     """
     for entry in batch:
         if not entry[3]:
             continue
         fingerprint, score, source = entry[0], entry[1], entry[2]
         meta = {b'fingerprint': fingerprint, b'score': score}
         self.requests.append(Request(source.url, meta=meta))
Пример #11
0
    def get_next_requests(self, max_n_requests, partition_id, **kwargs):
        """
        Dequeues new batch of requests for crawling.

        Up to ``self.GET_RETRIES`` attempts are made; every attempt after the first reads a 5.5 times
        deeper slice of the queue. An attempt is accepted as soon as the ``min_hosts`` and
        ``min_requests`` constraints (when given) are met; otherwise the result of the last attempt
        is used. Selected rows are deleted and the session is committed.

        Priorities, from highest to lowest:
         - max_requests_per_host
         - max_n_requests
         - min_hosts & min_requests

        :param max_n_requests: maximum number of requests to return
        :param partition_id: partition id
        :param kwargs: min_requests, min_hosts, max_requests_per_host (all optional)
        :return: list of :class:`Request <frontera.core.models.Request>` objects.
        """
        min_requests = kwargs.pop("min_requests", None)
        min_hosts = kwargs.pop("min_hosts", None)
        max_requests_per_host = kwargs.pop("max_requests_per_host", None)
        # Ordering an int against None raises TypeError on Python 3, so only check when provided.
        assert min_requests is None or max_n_requests > min_requests

        queue = {}
        limit = max_n_requests
        tries = 0
        count = 0
        while tries < self.GET_RETRIES:
            tries += 1
            limit *= 5.5 if tries > 1 else 1.0
            self.logger.debug("Try %d, limit %d, last attempt: requests %d, hosts %d",
                              tries, limit, count, len(queue))
            queue.clear()
            count = 0
            partition_rows = self.session.query(self.queue_model).filter_by(partition_id=partition_id)
            # ``limit`` grows as a float; the query is given an integer row count.
            for item in self._order_by(partition_rows).limit(int(limit)):
                host_items = queue.setdefault(item.host_crc32, [])
                # ``>=`` keeps each host at no more than max_requests_per_host items.
                if max_requests_per_host is not None and len(host_items) >= max_requests_per_host:
                    continue
                host_items.append(item)
                count += 1
                # ``>=`` stops once exactly max_n_requests items were collected.
                if count >= max_n_requests:
                    break
            if min_hosts is not None and len(queue) < min_hosts:
                continue
            if min_requests is not None and count < min_requests:
                continue
            break
        self.logger.debug("Finished: tries %d, hosts %d, requests %d", tries, len(queue), count)

        results = []
        for items in queue.values():
            for item in items:
                method = 'GET' if not item.method else str(item.method)
                results.append(Request(item.url, method=method,
                                       meta=item.meta, headers=item.headers, cookies=item.cookies))
                self.session.delete(item)
        self.session.commit()
        return results
Пример #12
0
    def generate_requests(self):
        """
        Populates ``self.hosts`` with random five-letter lowercase host names (21 draws, kept in a
        set) and ``self.requests`` with one request per host.
        """
        def random_hostname():
            return "".join(choice(ascii_lowercase) for _ in range(5))

        self.hosts = {random_hostname() for _ in range(21)}
        self.requests = [Request("http://%s/" % hostname) for hostname in self.hosts]
def test_basic():
    """
    Feeds a redirected response to ``Basic.page_crawled`` and expects the response URL to end up
    as the originally requested URL.
    """
    requested_url = "http://www.scrapinghub.com/"
    solver = Basic()
    request = Request(url=requested_url)

    response = Response(url="http://scrapinghub.com/", request=request)
    response.meta['fingerprint'] = "6d8afb0c246caa28a2c1bdaaac19c70c24a2d22e"
    response.meta['redirect_urls'] = ['http://www.scrapinghub.com/']
    response.meta['redirect_fingerprints'] = ["6cd0a1e069d5a1666a6ec290a4b33f5f325c2e66"]

    solver.page_crawled(response, [])
    assert response.url == "http://www.scrapinghub.com/"
Пример #14
0
def test_states():
    """
    Stores 128 random request states in ``HCFStates`` and verifies them with ``check_states``,
    first after a plain flush and again after a flush with ``force_clear=True``.
    """
    logging.basicConfig(level=logging.DEBUG)
    states = HCFStates(config.API_KEY, config.PROJECT_ID, config.FRONTIER_NAME, 256, True)
    states.frontier_start()

    requests, fingerprints = [], []
    for _ in range(128):
        request = Request('http://website.com/%d' % randint(0, maxsize))
        request.meta[b'fingerprint'] = generate_fprint()
        candidate_states = [HCFStates.NOT_CRAWLED, HCFStates.QUEUED, HCFStates.CRAWLED, HCFStates.ERROR]
        request.meta[b'state'] = choice(candidate_states)
        requests.append(request)
        fingerprints.append(request.meta[b'fingerprint'])

    states.update_cache(requests)
    states.flush()

    # Verify against the cache as left by the plain flush.
    check_states(states, fingerprints, requests)

    # Verify again once the cache has been force-cleared.
    states.flush(force_clear=True)
    check_states(states, fingerprints, requests)
Пример #15
0
 def test_scheduling_past_1part_post(self):
     """
     Schedules a single POST request with a dict body into a one-partition ``MemoryQueue`` and
     checks that every request dequeued from partition 0 keeps its method and body.
     """
     subject = MemoryQueue(1)
     data = {'id': 'xxx', 'name': 'yyy'}
     batch = [
         ("1", 1,
          Request(url='https://www.knuthellan.com/',
                  body=data,
                  method='POST'), True),
     ]
     subject.schedule(batch)
     requests = subject.get_next_requests(5, 0)
     for request in requests:
         # assertEqual reports both compared values on failure, unlike assertTrue(a == b).
         self.assertEqual(request.method, b'POST')
         self.assertEqual(request.body, data)
Пример #16
0
 def get_next_requests(self, max_n_requests, partition_id, score, **kwargs):
     """
     Dequeues up to ``max_n_requests`` requests of one partition whose score is at least ``score``,
     ordered by the queue model's ``created_at`` column.

     Every dequeued row is logged, converted to a request and deleted; the session is committed
     after the loop. On any exception the error is logged and the session rolled back. The
     requests collected so far are returned in either case.

     :param max_n_requests: maximum number of requests to return
     :param partition_id: partition id
     :param score: minimum score a queued request must have
     :return: list of requests
     """
     dequeued = []
     try:
         model = self.queue_model
         rows = self.session.query(model)
         rows = rows.filter(model.partition_id == partition_id, model.score >= score)
         rows = rows.order_by(model.created_at)
         rows = rows.limit(max_n_requests)
         for row in rows:
             # Rows without a stored method are treated as GET.
             verb = row.method or b'GET'
             request = Request(row.url, method=verb, meta=row.meta, headers=row.headers, cookies=row.cookies)
             fp = row.fingerprint
             self.logger.info(f"retrieved request {fp[:6]}...{fp[-6:]}")
             request.meta[b'fingerprint'] = to_bytes(row.fingerprint)
             request.meta[b'score'] = row.score
             dequeued.append(request)
             self.session.delete(row)
         self.session.commit()
     except Exception as err:
         self.logger.exception(err)
         self.session.rollback()
     self.logger.info(f"Got {len(dequeued)} next requests with score {score}")
     return dequeued
Пример #17
0
def check_states(states, fprints, objs):
    """
    Asserts that ``states`` reproduces the fingerprint and state of every object in ``objs``.

    The fingerprints are fetched first, then fresh requests carrying only URL and fingerprint are
    passed to ``states.set_states`` and compared pairwise with the originals.

    :param states: states component under test
    :param fprints: fingerprints passed to ``states.fetch``
    :param objs: requests whose meta holds the expected ``b'fingerprint'`` and ``b'state'``
    """
    states.fetch(fprints)
    fresh = []
    for original in objs:
        fresh.append(Request(original.url, meta={b'fingerprint': original.meta[b'fingerprint']}))
    states.set_states(fresh)

    for expected, actual in zip(objs, fresh):
        assert expected.meta[b'fingerprint'] == actual.meta[b'fingerprint']
        assert expected.meta[b'state'] == actual.meta[b'state']
Пример #18
0
 def consume_scoring(self, *args, **kwargs):
     consumed = 0
     seen = set()
     batch = []
     for m in self.scoring_log_consumer.get_messages(count=self.consumer_batch_size):
         try:
             msg = self._decoder.decode(m)
         except (KeyError, TypeError), e:
             logger.error("Decoding error: %s", e)
             continue
         else:
             if msg[0] == 'update_score':
                 _, fprint, score, url, schedule = msg
                 if fprint not in seen:
                     batch.append((fprint, score, Request(url), schedule))
                 seen.add(fprint)
             if msg[0] == 'new_job_id':
                 self.job_id = msg[1]
         finally:
Пример #19
0
def test_queue():
    logging.basicConfig(level=logging.DEBUG)
    queue = HCFQueue(config.API_KEY, config.PROJECT_ID, config.FRONTIER_NAME,
                     10000, 1, 1, "", True)

    queue.frontier_start()

    r = Request(url="http://scrapinghub.com",
                meta={
                    b"fingerprint": b"abcdef01234567890",
                    "native": "string test"
                })
    queue.schedule([("", 0.9, r, True)])
    sleep(4)
    result = queue.get_next_requests(256, 0)
    assert result[0].url == r.url
    assert result[0].meta[b'fingerprint'] == r.meta[b'fingerprint']
    assert result[0].meta["native"] == r.meta["native"]

    queue.frontier_stop()
Пример #20
0
import pytest
from frontera.core.components import States
from frontera.core.models import Request
from happybase import Connection
from frontera.contrib.backends.hbase import HBaseState, HBaseQueue
from frontera.contrib.backends.sqlalchemy import States as SQLAlchemyStates, Queue as SQLAlchemyQueue
from frontera.contrib.backends.sqlalchemy.models import StateModel, QueueModel
from frontera.contrib.backends.memory import MemoryStates, MemoryQueue
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

# Module-level request fixtures; each meta carries a request fingerprint plus a nested
# b'domain' dict with the domain name and its own fingerprint.
r1 = Request('https://www.example.com',
             meta={
                 b'fingerprint': b'10',
                 b'domain': {
                     b'name': b'www.example.com',
                     b'fingerprint': b'81'
                 }
             })
r2 = Request('http://example.com/some/page/',
             meta={
                 b'fingerprint': b'11',
                 b'domain': {
                     b'name': b'example.com',
                     b'fingerprint': b'82'
                 }
             })
r3 = Request('http://www.scrapy.org',
             meta={
                 b'fingerprint': b'12',
                 b'domain': {
Пример #21
0
from frontera.core.models import Request, Response
from frontera.worker.db import DBWorker, ScoringConsumer, IncomingConsumer, BatchGenerator
from frontera.settings import Settings
from frontera.core.components import States
import unittest

# Module-level request fixtures: each has a distinct fingerprint, the default state
# (States.DEFAULT) and job id 0 in its meta.
r1 = Request('http://www.example.com/',
             meta={
                 b'fingerprint': b'1',
                 b'state': States.DEFAULT,
                 b'jid': 0
             })
r2 = Request('http://www.scrapy.org/',
             meta={
                 b'fingerprint': b'2',
                 b'state': States.DEFAULT,
                 b'jid': 0
             })
r3 = Request('https://www.dmoz.org',
             meta={
                 b'fingerprint': b'3',
                 b'state': States.DEFAULT,
                 b'jid': 0
             })


class TestDBWorker(unittest.TestCase):
    def dbw_setup(self, distributed=False):
        settings = Settings()
        settings.MAX_NEXT_REQUESTS = 64
        settings.MESSAGE_BUS = 'tests.mocks.message_bus.FakeMessageBus'
Пример #22
0
from __future__ import absolute_import
from frontera.core.manager import FrontierManager
from frontera.settings import Settings
from frontera.core.models import Request, Response
from six.moves import range

# Module-level request fixtures with hard-coded 40-hex-digit fingerprints in meta.
# NOTE(review): presumably these are SHA-1 fingerprints of the URLs - confirm against the fingerprint function in use.
r1 = Request(
    'http://www.example.com',
    meta={b'fingerprint': b'8ece61d2d42e578e86d9f95ad063cf36eb8e774d'})
r2 = Request(
    'https://www.example.com/some/page',
    meta={b'fingerprint': b'61aec35fac3a032b3be3a5d07eb9e0024bd89de1'})
r3 = Request(
    'http://example1.com',
    meta={b'fingerprint': b'0ac55362d7391707e121dace4d203a0dc4393afc'})


class TestFrontierManager(object):
    def setup_frontier_manager(self, settings=None):
        """
        Creates a ``FrontierManager`` wired to the fake backend, middlewares and canonical solver
        from ``tests.mocks.components``.

        :param settings: optional settings object to configure; a fresh ``Settings()`` is used when falsy
        :return: manager built by ``FrontierManager.from_settings``
        """
        if not settings:
            settings = Settings()
        settings.BACKEND = 'tests.mocks.components.FakeBackend'
        settings.CANONICAL_SOLVER = 'tests.mocks.components.FakeCanonicalSolver'
        middlewares = []
        middlewares.append('tests.mocks.components.FakeMiddleware')
        middlewares.append('tests.mocks.components.FakeMiddlewareModifySeeds')
        middlewares.append('tests.mocks.components.FakeMiddlewareModifyResponse')
        middlewares.append('tests.mocks.components.FakeMiddlewareModifyLinks')
        settings.MIDDLEWARES = middlewares
        return FrontierManager.from_settings(settings)

    def test_start(self):
Пример #23
0
from frontera.core.manager import LocalFrontierManager
from frontera.settings import Settings
from frontera.core.models import Request, Response
from frontera.core.components import States
from six.moves import range
from unittest import TestCase

# Module-level request fixtures with hard-coded fingerprints in meta.
r1 = Request(
    'http://www.example.com',
    meta={b'fingerprint': b'89e6a0649e06d83370cdf2cbfb05f363934a8d0c'})
r2 = Request(
    'https://www.example.com/some/page',
    meta={b'fingerprint': b'61aec35fac3a032b3be3a5d07eb9e0024bd89de1'})
r3 = Request(
    'http://example1.com',
    meta={b'fingerprint': b'758293d800fc9672ae2c68bd083359b74ab9b6c2'})

# Newline-separated seed URLs (the same three URLs as r1-r3), exposed below as an
# in-memory binary file object.
seeds_blob = b"""http://www.example.com
https://www.example.com/some/page
http://example1.com
"""
from io import BytesIO

SEEDS_FILE = BytesIO(seeds_blob)


class TestFrontierManager(TestCase):
    def setup_frontier_manager(self, settings=None):
        settings = settings or Settings()
        settings.BACKEND = 'tests.mocks.components.FakeBackend'
        settings.MIDDLEWARES = [
Пример #24
0
from __future__ import absolute_import
from frontera.core import OverusedBuffer
from frontera.core.models import Request
from six.moves import range
from itertools import cycle
from random import choice, sample
from string import ascii_lowercase

# Module-level request fixtures spread over three hosts
# (www.example.com, example.com, example1.com).
# NOTE(review): r3 uses the scheme 'htttp' (three t's) - confirm whether this is intentional.
r1 = Request('http://www.example.com')
r2 = Request('http://www.example.com/some/')
r3 = Request('htttp://www.example.com/some/page/')
r4 = Request('http://example.com')
r5 = Request('http://example.com/some/page')
r6 = Request('http://example1.com')


class TestOverusedBuffer(object):

    requests = [r1, r2, r3, r4, r5, r6]

    def get_once(self, max_n_requests, **kwargs):
        """
        Takes up to ``max_n_requests`` items from the iterator ``self.req_it``.

        :param max_n_requests: maximum number of items to take
        :param kwargs: accepted and ignored
        :return: list of the items taken; shorter than requested when the iterator runs dry
        """
        exhausted = object()
        taken = []
        for _ in range(max_n_requests):
            candidate = next(self.req_it, exhausted)
            if candidate is exhausted:
                break
            taken.append(candidate)
        return taken

    def test_base(self):
        self.req_it = iter(self.requests)
Пример #25
0
from frontera.worker.strategy import StrategyWorker
from frontera.settings import Settings
from frontera.core.models import Request, Response
from frontera.core.components import States
from tests.mocks.components import CrawlingStrategy
from unittest import TestCase
from os import remove
from os.path import exists

# Module-level request fixtures: distinct fingerprints, all with job id 0 in meta.
r1 = Request('http://www.example.com/', meta={b'fingerprint': b'1', b'jid': 0})
r2 = Request('http://www.scrapy.org/', meta={b'fingerprint': b'2', b'jid': 0})
r3 = Request('https://www.dmoz.org', meta={b'fingerprint': b'3', b'jid': 0})
r4 = Request('http://www.test.com/some/page',
             meta={
                 b'fingerprint': b'4',
                 b'jid': 0
             })


class FilteredLinksCrawlingStrategy(CrawlingStrategy):
    """Crawling strategy whose link filter rejects every extracted link."""

    def filter_extracted_links(self, request, links):
        """Return an empty list regardless of ``request`` and ``links``."""
        return []


class TestStrategyWorker(TestCase):
    def setUp(self):
        settings = Settings()
        settings.BACKEND = 'frontera.contrib.backends.sqlalchemy.Distributed'
        settings.MESSAGE_BUS = 'tests.mocks.message_bus.FakeMessageBus'
        settings.STRATEGY = 'tests.mocks.components.CrawlingStrategy'
        settings.SPIDER_LOG_CONSUMER_BATCH_SIZE = 100
Пример #26
0
    def get_next_requests(self, max_n_requests, partition_id, **kwargs):
        """
        Tries to get new batch from priority queue. It makes self.GET_RETRIES tries and stops, trying to fit all
        parameters. Every new iteration evaluates a deeper batch. After batch is requested it is removed from the queue.

        Rows are deleted as a whole, so every fingerprint recorded for a selected row is returned with it;
        each row is processed once, so no request is emitted twice.

        ``min_requests``, ``min_hosts`` and ``max_requests_per_host`` are required keyword arguments
        (a missing one raises ``KeyError``).

        :param max_n_requests: maximum number of requests
        :param partition_id: partition id to get batch from
        :param min_requests: minimum number of requests
        :param min_hosts: minimum number of hosts, or None to skip the check
        :param max_requests_per_host: maximum number of requests per host, or None for no cap
        :return: list of :class:`Request <frontera.core.models.Request>` objects.
        """
        min_requests = kwargs.pop('min_requests')
        min_hosts = kwargs.pop('min_hosts')
        max_requests_per_host = kwargs.pop('max_requests_per_host')
        assert (max_n_requests > min_requests)
        table = self.connection.table(self.table_name)

        meta_map = {}  # fingerprint -> [(row key, unpacked item), ...]
        queue = {}  # host crc32 -> [fingerprint, ...]
        limit = min_requests
        tries = 0
        count = 0
        while tries < self.GET_RETRIES:
            tries += 1
            # Every retry scans a 5.5 times deeper slice of the partition.
            limit *= 5.5 if tries > 1 else 1.0
            self.logger.debug(
                "Try %d, limit %d, last attempt: requests %d, hosts %d" %
                (tries, limit, count, len(queue)))
            meta_map.clear()
            queue.clear()
            count = 0
            for rk, data in table.scan(row_prefix='%d_' % partition_id,
                                       limit=int(limit),
                                       batch_size=256):
                # Each cell holds a stream of packed (fingerprint, host_crc32, url, score) items.
                for buf in data.values():
                    for item in Unpacker(BytesIO(buf)):
                        fingerprint, host_crc32, url, score = item
                        host_fprints = queue.setdefault(host_crc32, [])
                        # ``>=`` keeps each host at no more than max_requests_per_host items.
                        if max_requests_per_host is not None and len(host_fprints) >= max_requests_per_host:
                            continue
                        host_fprints.append(fingerprint)
                        count += 1
                        meta_map.setdefault(fingerprint, []).append((rk, item))
                # Checked per row: stop scanning once the batch is full.
                if count >= max_n_requests:
                    break

            if min_hosts is not None and len(queue) < min_hosts:
                continue

            if count < min_requests:
                continue
            break

        self.logger.debug("Finished: tries %d, hosts %d, requests %d" %
                          (tries, len(queue), count))

        # For every fingerprint collect it's row keys and return all fingerprints from them
        fprint_map = {}
        for fprint, meta_list in meta_map.items():
            for rk, _ in meta_list:
                fprint_map.setdefault(rk, []).append(fprint)

        results = []
        trash_can = set()
        for fprints in queue.values():
            for fprint in fprints:
                for rk, _ in meta_map[fprint]:
                    # A row shared by several queued fingerprints must be emitted only once.
                    if rk in trash_can:
                        continue
                    trash_can.add(rk)
                    for rk_fprint in fprint_map[rk]:
                        _, item = meta_map[rk_fprint][0]
                        _, _, url, score = item
                        results.append(
                            Request(url,
                                    meta={
                                        'fingerprint': hexlify(rk_fprint),
                                        'score': score,
                                    }))

        with table.batch(transaction=True) as b:
            for rk in trash_can:
                b.delete(rk)
        self.logger.debug("%d row keys removed" % (len(trash_can)))
        return results
Пример #27
0
from __future__ import absolute_import
import unittest

from frontera.contrib.backends.remote.messagebus import MessageBusBackend
from frontera.settings import Settings
from frontera.core.models import Request, Response

# Body payload attached to r1 below.
data = {'id': 'xxx', 'name': 'yyy'}

# Module-level request fixtures; meta only carries a domain fingerprint.
# r1 is additionally created with method 'post' and the dict body above.
r1 = Request('http://www.example.com/',
             method='post',
             body=data,
             meta={b'domain': {
                 b'fingerprint': b'1'
             }})
r2 = Request('http://www.scrapy.org/',
             meta={b'domain': {
                 b'fingerprint': b'2'
             }})
r3 = Request('http://www.test.com/some/page',
             meta={b'domain': {
                 b'fingerprint': b'3'
             }})


class TestMessageBusBackend(unittest.TestCase):
    def mbb_setup(self, settings=None):
        manager = type('manager', (object, ), {})
        settings = settings or Settings()
        settings.MESSAGE_BUS = 'tests.mocks.message_bus.FakeMessageBus'
        settings.STORE_CONTENT = True
Пример #28
0
from __future__ import absolute_import
from frontera.core.manager import FrontierManager
from frontera.settings import Settings
from frontera.core.models import Request, Response
from six.moves import range

# Module-level request fixtures created from bare URLs (no meta supplied).
r1 = Request('http://www.example.com')
r2 = Request('https://www.example.com/some/page')
r3 = Request('http://example1.com')


class TestFrontierManager(object):
    """Tests for ``FrontierManager`` built on the fake components from ``tests.mocks.components``."""

    def setup_frontier_manager(self, settings=None):
        """
        Creates a ``FrontierManager`` wired to the fake backend, four fake middlewares and the
        fake canonical solver.

        :param settings: optional settings object to configure; a fresh ``Settings()`` is used when falsy
        :return: manager built by ``FrontierManager.from_settings``
        """
        if not settings:
            settings = Settings()
        settings.BACKEND = 'tests.mocks.components.FakeBackend'
        settings.CANONICAL_SOLVER = 'tests.mocks.components.FakeCanonicalSolver'
        middlewares = []
        middlewares.append('tests.mocks.components.FakeMiddleware')
        middlewares.append('tests.mocks.components.FakeMiddlewareModifySeeds')
        middlewares.append('tests.mocks.components.FakeMiddlewareModifyResponse')
        middlewares.append('tests.mocks.components.FakeMiddlewareModifyLinks')
        settings.MIDDLEWARES = middlewares
        return FrontierManager.from_settings(settings)

    def test_start(self):
        """The manager, its backend, all four middlewares and the canonical solver report ``_started``."""
        manager = self.setup_frontier_manager()
        assert manager._started is True
        assert manager.backend._started is True
        middleware_flags = [middleware._started for middleware in manager.middlewares]
        assert middleware_flags == [True] * 4
        assert manager.canonicalsolver._started is True
Пример #29
0
from frontera.core import OverusedBuffer
from frontera.core.models import Request
from six.moves import range
from itertools import cycle
from random import choice
from string import ascii_lowercase

# Module-level request fixtures spread over three hosts, each with a hard-coded fingerprint.
# NOTE(review): r3 uses the scheme 'htttp' (three t's) - confirm whether this is intentional.
r1 = Request(
    'http://www.example.com',
    meta={b'fingerprint': b'8ece61d2d42e578e86d9f95ad063cf36eb8e774d'})
r2 = Request(
    'http://www.example.com/some/',
    meta={b'fingerprint': b'9773afd9cb0f4ec3fd09d6d1fe2c742abf0621ec'})
r3 = Request(
    'htttp://www.example.com/some/page/',
    meta={b'fingerprint': b'7278fb7612670523a7e3e37d7c38871c73bcb0ea'})
r4 = Request(
    'http://example.com',
    meta={b'fingerprint': b'89dce6a446a69d6b9bdc01ac75251e4c322bcdff'})
r5 = Request(
    'http://example.com/some/page',
    meta={b'fingerprint': b'9dbd730bdce21e322a12c757753f26bbc95c3779'})
r6 = Request(
    'http://example1.com',
    meta={b'fingerprint': b'0ac55362d7391707e121dace4d203a0dc4393afc'})


class TestOverusedBuffer(object):

    requests = [r1, r2, r3, r4, r5, r6]
Пример #30
0
def test_codec(encoder, decoder):
    """Round-trip every message type through an encoder/decoder pair.

    :param encoder: encoder class; instantiated as ``encoder(Request, send_body=True)``.
    :param decoder: decoder class; instantiated as ``decoder(Request, Response)``.
    """
    def check_request(req1, req2):
        # Two requests match when url, meta, headers and method all agree.
        assert req1.url == req2.url
        assert req1.meta == req2.meta
        assert req1.headers == req2.headers
        assert req1.method == req2.method

    codec_out = encoder(Request, send_body=True)
    codec_in = decoder(Request, Response)

    seed = Request(
        url="http://www.yandex.ru",
        method=b'GET',
        meta={b"test": b"shmest"},
        headers={b'reqhdr': b'value'},
    )
    link = Request(url="http://www.yandex.ru/search")

    # One encoded message per message type, decoded below in the same order.
    encoded = [
        codec_out.encode_add_seeds([seed]),
        codec_out.encode_page_crawled(
            Response(
                url="http://www.yandex.ru",
                body=b'SOME CONTENT',
                headers={b'hdr': b'value'},
                request=seed,
            )
        ),
        codec_out.encode_links_extracted(seed, [link]),
        codec_out.encode_request_error(seed, "Host not found"),
        codec_out.encode_update_score(seed, 0.51, True),
        codec_out.encode_new_job_id(1),
        codec_out.encode_offset(0, 28796),
        codec_out.encode_request(seed),
    ]
    stream = iter(encoded)

    # add_seeds
    decoded = codec_in.decode(next(stream))
    assert decoded[0] == 'add_seeds'
    assert type(decoded[1]) == list
    decoded_seed = decoded[1][0]
    check_request(decoded_seed, seed)
    assert type(decoded_seed) == Request

    # page_crawled
    decoded = codec_in.decode(next(stream))
    assert decoded[0] == 'page_crawled'
    assert type(decoded[1]) == Response
    assert decoded[1].url == seed.url
    assert decoded[1].body == b'SOME CONTENT'
    assert decoded[1].meta == seed.meta

    # links_extracted
    decoded = codec_in.decode(next(stream))
    print(decoded)
    assert decoded[0] == 'links_extracted'
    assert type(decoded[1]) == Request
    assert decoded[1].url == seed.url
    assert decoded[1].meta == seed.meta
    assert type(decoded[2]) == list
    decoded_link = decoded[2][0]
    assert type(decoded_link) == Request
    assert decoded_link.url == link.url

    # request_error
    msg_type, failed_request, error_text = codec_in.decode(next(stream))
    assert msg_type == 'request_error'
    check_request(failed_request, seed)
    assert error_text == "Host not found"

    # update_score
    msg_type, scored_request, score, schedule = codec_in.decode(next(stream))
    assert msg_type == 'update_score'
    assert scored_request.url == seed.url
    assert scored_request.meta == seed.meta
    assert scored_request.headers == seed.headers
    assert score == 0.51
    assert schedule is True

    # new_job_id
    msg_type, job_id = codec_in.decode(next(stream))
    assert msg_type == 'new_job_id'
    assert job_id == 1

    # offset
    msg_type, partition_id, offset = codec_in.decode(next(stream))
    assert msg_type == 'offset'
    assert partition_id == 0
    assert offset == 28796

    # A bare request goes through decode_request rather than decode.
    plain_request = codec_in.decode_request(next(stream))
    check_request(plain_request, seed)
Пример #31
0
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from frontera.contrib.backends.partitioners import FingerprintPartitioner, Crc32NamePartitioner
from frontera.core.models import Request
from six.moves import range

# Shared request fixture passed to get_key() by both partitioner tests below;
# its meta carries a hard-coded b'fingerprint' value.
request = Request(
    'http://www.example.com',
    meta={b'fingerprint': b'1be68ff556fd0bbe5802d1a100850da29f7f15b1'})


def test_fingerprint_partitioner():
    """FingerprintPartitioner keys on the request fingerprint and maps it to partition 1."""
    partitions = list(range(0, 5))
    partitioner = FingerprintPartitioner(partitions)

    expected_key = b'1be68ff556fd0bbe5802d1a100850da29f7f15b1'
    assert partitioner.get_key(request) == expected_key

    # The same partition is expected whether the partition list is passed
    # explicitly or left as None.
    for candidates in (partitions, None):
        assert partitioner.partition(expected_key, candidates) == 1


def test_crc32name_partitioner():
    """Crc32NamePartitioner.get_key returns b'www.example.com' for the shared request."""
    partitioner = Crc32NamePartitioner(list(range(0, 5)))

    assert partitioner.get_key(request) == b'www.example.com'
Пример #32
0
from __future__ import absolute_import
from happybase import Connection
from Hbase_thrift import AlreadyExists  # module loaded at runtime in happybase
from frontera.contrib.backends.hbase import HBaseState, HBaseMetadata, HBaseQueue
from frontera.core.models import Request, Response
from frontera.core.components import States
from binascii import unhexlify
from time import sleep, time
from w3lib.util import to_native_str

# Request fixtures; each meta holds a hard-coded request fingerprint plus a
# b'domain' dict with the domain name and its own fingerprint.
r1 = Request('https://www.example.com', meta={b'fingerprint': b'10',
             b'domain': {b'name': b'www.example.com', b'fingerprint': b'81'}})
r2 = Request('http://example.com/some/page/', meta={b'fingerprint': b'11',
             b'domain': {b'name': b'example.com', b'fingerprint': b'82'}})
r3 = Request('http://www.scrapy.org', meta={b'fingerprint': b'12',
             b'domain': {b'name': b'www.scrapy.org', b'fingerprint': b'83'}})
# r4 is produced by r3.copy().
r4 = r3.copy()


class TestHBaseBackend(object):

    def delete_rows(self, table, row_keys):
        """Delete the given rows from ``table`` using one batch.

        :param table: object exposing ``batch()``
            (presumably a happybase table — confirm against callers).
        :param row_keys: iterable of hex-encoded row keys; each key is passed
            through ``unhexlify`` before being queued for deletion.
        """
        batch = table.batch()
        for key in row_keys:
            batch.delete(unhexlify(key))
        # Deletions are only queued above; send() submits the batch.
        batch.send()

    def test_metadata(self):
        """Seed HBaseMetadata with r1-r3 over a connection to hbase-docker:9090."""
        connection = Connection(host='hbase-docker', port=9090)
        # NOTE(review): the meaning of the positional arguments after the table
        # name is not visible here — check HBaseMetadata's signature.
        metadata = HBaseMetadata(connection, b'metadata', True, False, 300000, True)
        metadata.add_seeds([r1, r2, r3])
Пример #33
0
def test_codec(encoder, decoder, send_body, invalid_value):
    """Round-trip every message type through an encoder/decoder pair.

    :param encoder: encoder class; instantiated as
        ``encoder(Request, send_body=send_body)``.
    :param decoder: decoder class; instantiated as ``decoder(Request, Response)``.
    :param send_body: whether the encoder should include the response body;
        decides which body value is expected after decoding ``page_crawled``.
    :param invalid_value: a message the decoder is expected to reject with
        ``TypeError``.
    """
    def check_request(req1, req2):
        assert req1.url == req2.url and _compare_dicts(req1.meta, req2.meta) == True and \
               _compare_dicts(req1.headers, req2.headers) == True and req1.method == req2.method

    enc = encoder(Request, send_body=send_body)
    dec = decoder(Request, Response)
    req = Request(url="http://www.yandex.ru",
                  method=b'GET',
                  meta={
                      b'test': b'shmest',
                      b'scrapy_meta': {
                          'rule': 0,
                          'key': 'value'
                      }
                  },
                  headers={b'reqhdr': b'value'})
    req2 = Request(url="http://www.yandex.ru/search")
    # One encoded message per message type, decoded below in the same order;
    # the trailing invalid value must be rejected by the decoder.
    msgs = [
        enc.encode_add_seeds([req]),
        enc.encode_page_crawled(
            Response(url="http://www.yandex.ru",
                     body=b'SOME CONTENT',
                     headers={b'hdr': b'value'},
                     request=req)),
        enc.encode_links_extracted(req, [req2]),
        enc.encode_request_error(req, "Host not found"),
        enc.encode_update_score(req, 0.51, True),
        enc.encode_new_job_id(1),
        enc.encode_offset(0, 28796),
        enc.encode_request(req),
        invalid_value,
    ]

    it = iter(msgs)

    o = dec.decode(next(it))
    assert o[0] == 'add_seeds'
    assert type(o[1]) == list
    req_d = o[1][0]
    check_request(req_d, req)
    assert type(req_d) == Request

    o = dec.decode(next(it))
    assert o[0] == 'page_crawled'
    assert type(o[1]) == Response
    assert o[1].url == req.url and o[1].meta == req.meta
    # These comparisons were previously bare expressions (no ``assert``), so
    # the body was never actually verified.
    if send_body:
        assert o[1].body == b'SOME CONTENT'
    else:
        assert o[1].body is None

    o = dec.decode(next(it))
    assert o[0] == 'links_extracted'
    assert type(o[1]) == Request
    assert o[1].url == req.url and o[1].meta == req.meta
    assert type(o[2]) == list
    req_d = o[2][0]
    assert type(req_d) == Request
    assert req_d.url == req2.url

    o_type, o_req, o_error = dec.decode(next(it))
    assert o_type == 'request_error'
    check_request(o_req, req)
    assert o_error == "Host not found"

    o_type, o_req2, score, schedule = dec.decode(next(it))
    assert o_type == 'update_score'
    assert o_req2.url == req.url and o_req2.meta == req.meta and o_req2.headers == req.headers
    assert score == 0.51
    assert schedule is True

    o_type, job_id = dec.decode(next(it))
    assert o_type == 'new_job_id'
    assert job_id == 1

    o_type, partition_id, offset = dec.decode(next(it))
    assert o_type == 'offset'
    assert partition_id == 0
    assert offset == 28796

    # A bare request goes through decode_request rather than decode.
    o = dec.decode_request(next(it))
    check_request(o, req)

    with pytest.raises(TypeError):
        dec.decode(next(it))
from __future__ import absolute_import
import unittest

from frontera.contrib.backends.remote.messagebus import MessageBusBackend
from frontera.settings import Settings
from frontera.core.models import Request, Response

# Request fixtures; each meta holds a hard-coded request fingerprint and a
# b'domain' dict containing only a domain fingerprint.
r1 = Request('http://www.example.com/',
             meta={
                 b'domain': {
                     b'fingerprint': b'1'
                 },
                 b'fingerprint': b'abc'
             })
r2 = Request('http://www.scrapy.org/',
             meta={
                 b'domain': {
                     b'fingerprint': b'2'
                 },
                 b'fingerprint': b'012'
             })
r3 = Request('http://www.test.com/some/page',
             meta={
                 b'domain': {
                     b'fingerprint': b'3'
                 },
                 b'fingerprint': b'345'
             })


class TestMessageBusBackend(unittest.TestCase):