Example No. 1
    def __init__(self):
        self.active = False

        self.prefetch_queue = Queue(settings.prefetch['queue_size'])
        self.prefetch_thread = spawn(self.prefetch_worker)
        self.prefetch_thread.link(reraise_errors, greenthread.getcurrent())

        self.given_items = Cache()

        self.postreport_queue = Queue(settings.postreport['queue_size'])
        self.postreport_thread = spawn(self.postreport_worker)
        self.postreport_thread.link(reraise_errors, greenthread.getcurrent())

        self.storage_connections = eventlet.pools.Pool(max_size=settings.storage['max_connections'])
        self.storage_connections.create = StorageConnection
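
The `reraise_errors` callback linked to each worker greenthread is defined elsewhere in Heroshi and is not shown in these examples. A minimal sketch of what such a link callback might look like, assuming eventlet's `GreenThread.link(func, *curried_args)` convention of invoking `func(finished_greenthread, *curried_args)`:

def reraise_errors(gt, parent_gt):
    """Hypothetical sketch: if a worker greenthread dies with an error, kill the parent."""
    try:
        gt.wait()  # re-raises any exception raised inside the worker
    except Exception:
        parent_gt.kill()  # propagate the failure instead of letting it pass silently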
Example No. 2
def test_set_001(self):
    c = Cache()
    c.set('key-set_001', "value")
    self.assertEqual(c.get('key-set_001'), "value")
Example No. 3
def test_dict_constructor_001(self):
    """`Cache` constructor must behave like `dict`'s."""
    d = {}
    c = Cache(d)
    self.assertEqual(c.items(), [])
Example No. 4
def setUp(self):
    self.cache = Cache()
Example No. 5
class ExpireTestCase(unittest.TestCase):
    def setUp(self):
        self.cache = Cache()

    def test_timer_001(self):
        """Must delete items after timeout."""
        self.cache.set('key-timer_001', "value", DEFAULT_TIMEOUT / 2)
        eventlet.sleep(DEFAULT_TIMEOUT)
        self.assertFalse('key-timer_001' in self.cache)

    def test_race_001(self):
        """Expiry must not conflict with regular `del`."""
        self.cache.set('key-delete_race_001', "value", DEFAULT_TIMEOUT / 2)
        del self.cache['key-delete_race_001']
        eventlet.sleep(DEFAULT_TIMEOUT)
        self.assertFalse('key-delete_race_001' in self.cache)

    def test_race_002(self):
        """Expiry must not conflict with regular `pop`."""
        self.cache.set('key-delete_race_002', "value", DEFAULT_TIMEOUT / 2)
        self.cache.pop('key-delete_race_002')
        eventlet.sleep(DEFAULT_TIMEOUT)
        self.assertFalse('key-delete_race_002' in self.cache)

    def test_race_003(self):
        """Expiry must not conflict with regular `del` in another thread."""
        self.cache.set('key-delete_race_003', "value", DEFAULT_TIMEOUT / 2)
        eventlet.spawn_n(self.cache.__delitem__, 'key-delete_race_003')
        eventlet.sleep(DEFAULT_TIMEOUT)
        self.assertFalse('key-delete_race_003' in self.cache)

    def test_race_004(self):
        """Expiry must not conflict with regular `pop` in another thread."""
        self.cache.set('key-delete_race_004', "value", DEFAULT_TIMEOUT / 2)
        eventlet.spawn_n(self.cache.pop, 'key-delete_race_004')
        eventlet.sleep(DEFAULT_TIMEOUT)
        self.assertFalse('key-delete_race_004' in self.cache)
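
These expiry tests assume that `Cache.set(key, value, timeout)` removes the key after `timeout` seconds, and that an explicit `del` or `pop` beforehand does not make the expiry timer raise. A minimal illustrative sketch of that behaviour on top of eventlet (not the project's actual `Cache`):

import eventlet


class ExpiringCache(dict):
    """Illustrative only: a dict with per-key expiry driven by eventlet timers."""

    def set(self, key, value, timeout=None):
        self[key] = value
        if timeout is not None:
            # `pop` with a default never raises, so a `del`/`pop` racing with
            # the timer (as in the tests above) stays safe.
            eventlet.spawn_after(timeout, self.pop, key, None)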
Example No. 6
class Manager(object):
    """Class encapsulating Heroshi URL server state."""

    def __init__(self):
        self.active = False

        self.prefetch_queue = Queue(settings.prefetch['queue_size'])
        self.prefetch_thread = spawn(self.prefetch_worker)
        self.prefetch_thread.link(reraise_errors, greenthread.getcurrent())

        self.given_items = Cache()

        self.postreport_queue = Queue(settings.postreport['queue_size'])
        self.postreport_thread = spawn(self.postreport_worker)
        self.postreport_thread.link(reraise_errors, greenthread.getcurrent())

        self.storage_connections = eventlet.pools.Pool(max_size=settings.storage['max_connections'])
        self.storage_connections.create = StorageConnection

    def close(self):
        self.active = False
        self.prefetch_thread.kill()
        self.postreport_thread.kill()

    def ping_storage(self):
        # Borrowing a connection from the pool is enough to verify that the
        # storage backend is reachable.
        with self.storage_connections.item() as storage:
            pass

    def get_from_prefetch_queue(self, size):
        result = []
        while len(result) < size:
            sleep()
            try:
                pack = self.prefetch_queue.get(timeout=settings.prefetch['get_timeout'])
            except eventlet.queue.Empty:
                break
            result.extend(pack)
        return result

    def prefetch_worker(self):
        # Until the manager is activated, take a short nap; the unconditional
        # respawn at the end of this method turns this into a polling loop.
        if not self.active:
            sleep(0.01)
        while self.active:
            with self.storage_connections.item() as storage:
                docs = storage.query_new_random(settings.prefetch['single_limit'])
            if len(docs) == 0:
                sleep(10.)
                continue
            else:
                # Note: putting a *list* as a single item into queue
                self.prefetch_queue.put(docs)
        # and respawn again
        self.prefetch_thread = spawn(self.prefetch_worker)

    @log_exceptions
    def _postreport_worker(self):
        docs = []
        while len(docs) < settings.postreport['flush_size']: # inner accumulator loop
            try:
                item = self.postreport_queue.get(timeout=settings.postreport['flush_delay'])
            except eventlet.queue.Empty:
                break

            # Quick and dirty duplicate filtering.
            #     Note that this code only finds duplicates within the current "flush pack".
            # `report_result` uses `is_duplicate_report`, which finds duplicates in the whole
            # `postreport_queue` but cannot find them here. Hence the two duplicate searches.
            #     It is still possible for at most two duplicate reports to survive: one in
            # `postreport_queue` and one in the current "flush pack". This is acceptable,
            # because most of the duplicates are filtered out.
            for doc in docs:
                if item['url'] == doc['url']:
                    item = None
                    break
            if item is None:
                continue

            if 'result' not in item:
                # It's a link, found on some reported page.
                # Just add it to bulk insert, don't try to update any document here.
                docs.append(item)
                continue

            docs.append(item)

        if not docs:
            return

        with self.storage_connections.item() as storage:
            for doc in docs:
                content = doc.pop('content', None)

                storage.save(doc)

                if content is None:
                    continue
                headers = doc.get('headers') or {}
                content_type = headers.get('content-type', "application/octet-stream")

                storage.save_content(doc, content, content_type)

    def postreport_worker(self):
        if not self.active:
            sleep(0.01)

        while self.active:
            self._postreport_worker()

        # and respawn again
        self.postreport_thread = spawn(self.postreport_worker)

    @log_exceptions
    def crawl_queue(self, request):
        # Cap the requested batch size at the configured maximum.
        limit = min(int(request.POST['limit']), settings.api['max_queue_limit'])

        time_now = datetime.datetime.now()

        doc_list = self.get_from_prefetch_queue(limit)
        for doc in doc_list:
            if isinstance(doc['visited'], basestring):
                doc['visited'] = datetime.datetime.strptime(doc['visited'], TIME_FORMAT)
            self.given_items.set(doc['url'], doc, settings.prefetch['cache_timeout'])

        def is_old(doc):
            """Predicate: True if the page was never visited or was visited long enough ago.

            The worker SHOULD NOT visit the URI if this function returns False.
            """
            if doc['visited'] is None:
                return True
            diff = time_now - doc['visited']
            return diff > datetime.timedelta(minutes=settings.api['min_revisit_minutes'])

        doc_list = filter(is_old, doc_list)

        def make_queue_item(doc):
            if isinstance(doc['visited'], datetime.datetime):
                doc['visited'] = doc['visited'].strftime(TIME_FORMAT)
            filter_fields = ('url', 'headers', 'visited',)
            return dict( (k,v) for (k,v) in doc.iteritems() if k in filter_fields )

        queue = map(make_queue_item, doc_list)
        return queue

    def is_duplicate_report(self, url):
        """Quick and dirty duplicate search over the pending post-report queue."""

        for doc in self.postreport_queue.queue:
            if url == doc['url']:
                return True
        return False

    def force_append_links(self, links):
        # 1. remove duplicates
        links = set(links)

        # 2. put links into queue
        for url in links:
            new_doc = {'url': url, 'parent': None, 'visited': None}
            self.postreport_queue.put(new_doc)

    @log_exceptions
    def report_result(self, request):
        report = json.loads(request.body)

        # `report['links']` is currently used only to force insertion of new URLs into
        #   the Heroshi crawling queue via the bin/heroshi-append script.
        # If a more sophisticated way to append new URLs ever appears, remove this code.
        if report['url'] is None:
            self.force_append_links(report['links'])
            return

        if self.is_duplicate_report(report['url']):
            return

        # accept report into postreport_queue for later persistent saving
        try:
            doc = self.given_items[report['url']]
        except KeyError:
            self.postreport_queue.put(report)
        else:
            doc.update(report)
            self.postreport_queue.put(doc)

        return None
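
A rough usage sketch of the `Manager` life cycle; the settings module and the request objects (with `POST` and `body`) come from the surrounding Heroshi code base and are assumed here:

manager = Manager()
manager.active = True     # workers only do real work while `active` is True
manager.ping_storage()    # fails fast if the storage backend is unreachable
# ... the HTTP layer dispatches to manager.crawl_queue(request)
#     and manager.report_result(request) ...
manager.close()           # deactivates the manager and kills the worker greenthreads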