Example #1
    def test_redis_queue_purging(self):
        '''
        Test that Redis queue purging doesn't purge the wrong keys.
        '''
        if config.get('ckan.harvest.mq.type') != 'redis':
            pytest.skip()
        redis = queue.get_connection()
        try:
            redis.set('ckanext-harvest:some-random-key', 'foobar')

            # Create some fake jobs
            gather_publisher = queue.get_gather_publisher()
            gather_publisher.send({'harvest_job_id': str(uuid.uuid4())})
            gather_publisher.send({'harvest_job_id': str(uuid.uuid4())})
            fetch_publisher = queue.get_fetch_publisher()
            fetch_publisher.send({'harvest_object_id': str(uuid.uuid4())})
            fetch_publisher.send({'harvest_object_id': str(uuid.uuid4())})
            num_keys = redis.dbsize()

            # Create some fake objects
            gather_consumer = queue.get_gather_consumer()
            next(gather_consumer.consume(queue.get_gather_queue_name()))
            fetch_consumer = queue.get_fetch_consumer()
            next(fetch_consumer.consume(queue.get_fetch_queue_name()))

            assert redis.dbsize() > num_keys

            queue.purge_queues()

            assert redis.get('ckanext-harvest:some-random-key') == b'foobar'  # redis-py returns bytes on Python 3
            assert redis.dbsize() == num_keys
            assert redis.llen(queue.get_gather_routing_key()) == 0
            assert redis.llen(queue.get_fetch_routing_key()) == 0
        finally:
            redis.delete('ckanext-harvest:some-random-key')
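The assertions above pin down the contract of queue.purge_queues() on the Redis backend: only the two queue list keys may be deleted, while unrelated ckanext-harvest keys survive. A minimal sketch of a Redis purge that satisfies them (the function body is an assumption, not the shipped implementation):

from ckanext.harvest import queue

def purge_redis_queues_sketch():
    # Delete only the gather and fetch list keys; any other
    # 'ckanext-harvest:*' keys must survive, per the test above.
    redis = queue.get_connection()
    redis.delete(queue.get_gather_routing_key())
    redis.delete(queue.get_fetch_routing_key())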
def purge_distributed_queues(gather_queue_name, fetch_queue_name):
    '''
    Purges given persistent queues.
    
    @param gather_queue_name    name of the gather queue
    @param fetch_queue_name     name of the fetch queue
    '''
    backend = config.get('ckan.harvest.mq.type', MQ_TYPE)
    connection = get_connection()
    if backend in ('amqp', 'ampq'):  # 'ampq' accepted as a common misspelling of 'amqp'
        channel = connection.channel()
        channel.queue_purge(queue=gather_queue_name)
        channel.queue_purge(queue=fetch_queue_name)
        return
    raise Exception('not a valid queue type %s' % backend)
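Typical usage of the AMQP purge helper, assuming it lives in the same queue module as the helpers used by the tests (a hedged sketch):

from ckanext.harvest import queue

# Purge both persistent queues; raises Exception for non-AMQP backends.
queue.purge_distributed_queues(queue.get_gather_queue_name(),
                               queue.get_fetch_queue_name())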
def get_publisher(exchange_name, routing_key):
    '''
    Returns a publisher object.
    
    @param exchange_name name of the exchange to send messages to
    @param routing_key   message routing key
    '''
    connection = get_connection()
    backend = config.get('ckan.harvest.mq.type', MQ_TYPE)
    if backend in ('amqp', 'ampq'):
        channel = connection.channel()
        channel.exchange_declare(exchange=exchange_name, durable=True)
        return Publisher(connection,
                         channel,
                         exchange_name,
                         routing_key=routing_key)
    raise Exception('not a valid queue type %s' % backend)
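The get_gather_publisher()/get_fetch_publisher() helpers seen in the tests are presumably thin wrappers over this function. A hedged direct-use sketch; the exchange name 'ckan.harvest' and the routing key are inferred from the queue names appearing in the tests:

import uuid

# Assumed names: 'ckan.harvest' exchange, 'ckan.harvest.gather' routing key.
publisher = get_publisher('ckan.harvest', 'ckan.harvest.gather')
publisher.send({'harvest_job_id': str(uuid.uuid4())})  # same call shape as the tests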
def get_consumer(exchange_name, queue_name, routing_key):
    '''
    Returns a reference to a RabbitMQ server channel.
    
    @param exchange_name name of the exchange to bind the queue to
    @param queue_name    name of the queue to receive messages from
    @param routing_key   message routing key
    '''
    connection = get_connection()
    backend = config.get('ckan.harvest.mq.type', MQ_TYPE)

    if backend in ('amqp', 'ampq'):
        channel = connection.channel()
        channel.exchange_declare(exchange=exchange_name, durable=True)
        channel.queue_declare(queue=queue_name, durable=True)
        channel.queue_bind(queue=queue_name, exchange=exchange_name, routing_key=routing_key)
        return channel
    raise Exception('not a valid queue type %s' % backend)
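The tests drive the returned channel directly with queue_purge and basic_get; a minimal consumption sketch under the same assumed names:

channel = get_consumer('ckan.harvest', 'ckan.harvest.gather', 'ckan.harvest.gather')
method, properties, body = channel.basic_get(queue='ckan.harvest.gather')
if method is not None:  # a None method frame means the queue was empty
    print(body)  # raw message body, JSON in the tests above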
Example #7
    def test_redis_corrupt(self, mock_log_error):
        '''
        Test that a corrupt message in Redis doesn't stop the harvest
        process and that other jobs are still processed.
        '''
        if config.get('ckan.harvest.mq.type') != 'redis':
            pytest.skip()
        redis = queue.get_connection()
        try:
            redis.set('ckanext-harvest:some-random-key-2', 'foobar')

            # make sure queues/exchanges are created first and are empty
            gather_consumer = queue.get_gather_consumer()
            fetch_consumer = queue.get_fetch_consumer()
            gather_consumer.queue_purge(queue=queue.get_gather_queue_name())
            fetch_consumer.queue_purge(queue=queue.get_fetch_queue_name())

            # Create a fake job, plus a corrupt fetch message missing its harvest_object_id
            gather_publisher = queue.get_gather_publisher()
            gather_publisher.send({'harvest_job_id': str(uuid.uuid4())})
            fetch_publisher = queue.get_fetch_publisher()
            fetch_publisher.send({'harvest_object_id': None})
            h_obj_id = str(uuid.uuid4())
            fetch_publisher.send({'harvest_object_id': h_obj_id})

            # Create some fake objects
            next(gather_consumer.consume(queue.get_gather_queue_name()))
            _, _, body = next(
                fetch_consumer.consume(queue.get_fetch_queue_name()))

            json_obj = json.loads(body)
            assert json_obj['harvest_object_id'] == h_obj_id

            assert mock_log_error.call_count == 1
            args, _ = mock_log_error.call_args_list[0]
            if six.PY2:
                assert "cannot concatenate 'str' and 'NoneType' objects" in args[
                    1]
            else:
                assert "must be str, not NoneType" in str(args[1])

        finally:
            redis.delete('ckanext-harvest:some-random-key-2')
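The mock_log_error argument implies the test method is wrapped with mock.patch on the queue module's error logger; the exact patch target below is an assumption:

from unittest import mock

@mock.patch('ckanext.harvest.queue.log.error')  # assumed target path
def test_redis_corrupt(self, mock_log_error):
    ...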
Example #9
    def test_redis_queue_purging(self):
        '''
        Test that Redis queue purging doesn't purge the wrong keys.
        '''
        if config.get('ckan.harvest.mq.type') != 'redis':
            raise SkipTest()
        redis = queue.get_connection()
        try:
            redis.set('ckanext-harvest:some-random-key', 'foobar')

            # Create some fake jobs
            gather_publisher = queue.get_gather_publisher()
            gather_publisher.send({'harvest_job_id': str(uuid.uuid4())})
            gather_publisher.send({'harvest_job_id': str(uuid.uuid4())})
            fetch_publisher = queue.get_fetch_publisher()
            fetch_publisher.send({'harvest_object_id': str(uuid.uuid4())})
            fetch_publisher.send({'harvest_object_id': str(uuid.uuid4())})
            num_keys = redis.dbsize()

            # Create some fake objects
            gather_consumer = queue.get_gather_consumer()
            next(gather_consumer.consume(queue.get_gather_queue_name()))
            fetch_consumer = queue.get_fetch_consumer()
            next(fetch_consumer.consume(queue.get_fetch_queue_name()))

            ok_(redis.dbsize() > num_keys)

            queue.purge_queues()

            assert_equal(redis.get('ckanext-harvest:some-random-key'),
                         'foobar')
            assert_equal(redis.dbsize(), num_keys)
            assert_equal(redis.llen(queue.get_gather_routing_key()), 0)
            assert_equal(redis.llen(queue.get_fetch_routing_key()), 0)
        finally:
            redis.delete('ckanext-harvest:some-random-key')
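This older variant predates the pytest-style tests above and relies on nose helpers; the imports it assumes are roughly:

from nose.plugins.skip import SkipTest
from nose.tools import assert_equal, ok_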
Example #10
    def test_01_basic_harvester(self):

        if config.get('ckan.harvest.mq.type') == 'redis':
            # make sure that there are no old elements in the redis db
            redis = queue.get_connection()
            redis.flushdb()

        # make sure queues/exchanges are created first and are empty
        consumer = queue.get_gather_consumer()
        consumer_fetch = queue.get_fetch_consumer()
        consumer.queue_purge(queue=queue.get_gather_queue_name())
        consumer_fetch.queue_purge(queue=queue.get_fetch_queue_name())

        user = toolkit.get_action('get_site_user')(
            {'model': model, 'ignore_auth': True}, {}
        )['name']

        context = {'model': model, 'session': model.Session,
                   'user': user, 'api_version': 3, 'ignore_auth': True}

        harvest_source, job_id = self._create_harvest_job_and_finish_gather_stage(consumer, context)

        # run the fetch callback three times, once per harvest object
        reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
        queue.fetch_callback(consumer_fetch, *reply)
        reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
        queue.fetch_callback(consumer_fetch, *reply)
        reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
        queue.fetch_callback(consumer_fetch, *reply)

        count = model.Session.query(model.Package) \
            .filter(model.Package.type == 'dataset') \
            .count()
        assert count == 3
        all_objects = model.Session.query(HarvestObject).filter_by(current=True).all()

        assert len(all_objects) == 3
        assert all_objects[0].state == 'COMPLETE'
        assert all_objects[0].report_status == 'added'
        assert all_objects[1].state == 'COMPLETE'
        assert all_objects[1].report_status == 'added'
        assert all_objects[2].state == 'COMPLETE'
        assert all_objects[2].report_status == 'added'

        # fire run again to check if job is set to Finished
        toolkit.get_action('harvest_jobs_run')(
            context,
            {'source_id': harvest_source['id']}
        )

        harvest_job = toolkit.get_action('harvest_job_show')(
            context,
            {'id': job_id}
        )

        assert harvest_job['status'] == u'Finished'
        assert harvest_job['stats'] == {'added': 3, 'updated': 0, 'not modified': 0, 'errored': 0, 'deleted': 0}

        harvest_source_dict = toolkit.get_action('harvest_source_show')(
            context,
            {'id': harvest_source['id']}
        )

        assert harvest_source_dict['status']['last_job']['stats'] == {
            'added': 3, 'updated': 0, 'not modified': 0, 'errored': 0, 'deleted': 0}
        assert harvest_source_dict['status']['total_datasets'] == 3
        assert harvest_source_dict['status']['job_count'] == 1

        # Second run
        harvest_job = toolkit.get_action('harvest_job_create')(
            context,
            {'source_id': harvest_source['id'], 'run': True}
        )

        job_id = harvest_job['id']
        assert toolkit.get_action('harvest_job_show')(
            context,
            {'id': job_id}
        )['status'] == u'Running'

        # pop one item off the queue and run the callback
        reply = consumer.basic_get(queue='ckan.harvest.gather')
        queue.gather_callback(consumer, *reply)

        all_objects = model.Session.query(HarvestObject).all()

        assert len(all_objects) == 6

        reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
        queue.fetch_callback(consumer_fetch, *reply)
        reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
        queue.fetch_callback(consumer_fetch, *reply)
        reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
        queue.fetch_callback(consumer_fetch, *reply)

        count = model.Session.query(model.Package) \
            .filter(model.Package.type == 'dataset') \
            .count()
        assert count == 3

        all_objects = model.Session.query(HarvestObject).filter_by(report_status='added').all()
        assert len(all_objects) == 3

        all_objects = model.Session.query(HarvestObject).filter_by(report_status='updated').all()
        assert len(all_objects) == 2

        all_objects = model.Session.query(HarvestObject).filter_by(report_status='deleted').all()
        assert len(all_objects) == 1

        # run again to make sure the job is marked as Finished
        toolkit.get_action('harvest_jobs_run')(
            context,
            {'source_id': harvest_source['id']}
        )

        harvest_job = toolkit.get_action('harvest_job_show')(
            context,
            {'id': job_id}
        )
        assert harvest_job['stats'] == {'added': 0, 'updated': 2, 'not modified': 0, 'errored': 0, 'deleted': 1}

        harvest_source_dict = toolkit.get_action('harvest_source_show')(
            context,
            {'id': harvest_source['id']}
        )

        assert harvest_source_dict['status']['last_job']['stats'] == {
            'added': 0, 'updated': 2, 'not modified': 0, 'errored': 0, 'deleted': 1}
        assert harvest_source_dict['status']['total_datasets'] == 2
        assert harvest_source_dict['status']['job_count'] == 2
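The three back-to-back basic_get/fetch_callback pairs in this test could be folded into one helper; a hypothetical refactor with identical behaviour:

def drain_fetch_queue(consumer_fetch, n):
    # Hypothetical helper: pop n messages off the fetch queue and run
    # the fetch callback on each, exactly as the test does inline.
    for _ in range(n):
        reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
        queue.fetch_callback(consumer_fetch, *reply)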
Example #11
    def test_resubmit_objects(self):
        '''
        Test that only harvest objects that were no longer present in the
        redis fetch queue are re-submitted.
        '''
        if config.get('ckan.harvest.mq.type') != 'redis':
            pytest.skip()
        # make sure that there are no old elements in the redis db
        redis = queue.get_connection()
        fetch_routing_key = queue.get_fetch_routing_key()
        redis.flushdb()
        try:
            # make sure queues/exchanges are created first and are empty
            consumer = queue.get_gather_consumer()
            consumer_fetch = queue.get_fetch_consumer()
            consumer.queue_purge(queue=queue.get_gather_queue_name())
            consumer_fetch.queue_purge(queue=queue.get_fetch_queue_name())

            user = toolkit.get_action('get_site_user')(
                {'model': model, 'ignore_auth': True}, {}
            )['name']

            context = {'model': model, 'session': model.Session,
                       'user': user, 'api_version': 3, 'ignore_auth': True}

            harvest_source, job_id = self._create_harvest_job_and_finish_gather_stage(consumer, context)

            assert redis.llen(fetch_routing_key) == 3

            # run the fetch callback only once, for the first harvest object
            reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
            queue.fetch_callback(consumer_fetch, *reply)

            count = model.Session.query(model.Package) \
                .filter(model.Package.type == 'dataset') \
                .count()
            assert count == 1

            all_objects = model.Session.query(HarvestObject).order_by(HarvestObject.state.asc()).all()
            assert len(all_objects) == 3
            assert all_objects[0].state == 'COMPLETE'
            assert all_objects[0].report_status == 'added'
            assert all_objects[0].current is True
            assert all_objects[1].state == 'WAITING'
            assert all_objects[1].current is False
            assert all_objects[2].state == 'WAITING'
            assert all_objects[2].current is False

            assert len(redis.keys(fetch_routing_key + ':*')) == 0
            assert redis.llen(fetch_routing_key) == 2

            # Pop one object off the fetch queue; resubmit_objects should re-send it
            reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
            fetch_queue_items = redis.lrange(fetch_routing_key, 0, 10)
            assert len(fetch_queue_items) == 1
            harvest_object_id = reply[2]
            assert fetch_queue_items[0] != harvest_object_id

            queue.resubmit_objects()

            assert redis.llen(fetch_routing_key) == 2
            fetch_queue_items = redis.lrange(fetch_routing_key, 0, 10)
            assert harvest_object_id in fetch_queue_items
            assert redis.dbsize() == 1
        finally:
            redis.flushdb()
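queue.resubmit_objects() itself is not shown. The `fetch_routing_key + ':*'` check earlier in the test suggests the Redis consumer keeps one marker key per message that was popped with basic_get but not yet acknowledged; re-submission then only has to replay the marked messages. A sketch under that assumption (not the shipped implementation):

import json

def resubmit_objects_sketch(redis, fetch_routing_key):
    # Assumed mechanism: each un-acked message leaves a
    # '<fetch_routing_key>:<harvest_object_id>' marker key behind.
    for marker in redis.keys(fetch_routing_key + ':*'):
        object_id = marker.split(b':')[-1].decode()
        redis.rpush(fetch_routing_key,
                    json.dumps({'harvest_object_id': object_id}))
        redis.delete(marker)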