Example #1
def test_no_normed_cross_db(destination, index='test'):

    if scrapi.settings.CANONICAL_PROCESSOR == destination:
        return

    real_es = scrapi.processing.elasticsearch.ElasticsearchProcessor.manager.es
    scrapi.processing.elasticsearch.es = mock.MagicMock()

    # Get the test documents into the canonical processor, but don't process normalized
    canonical_processor = get_processor(scrapi.settings.CANONICAL_PROCESSOR)
    canonical_processor.process_raw(RAW)
    # canonical_processor.process_normalized(RAW, NORMALIZED)

    destination_processor = get_processor(destination)

    # Check that the canonical raw document is there, and the destination document is not
    canonical_doc = canonical_processor.get(docID=RAW['docID'], source=RAW['source'])
    assert canonical_doc.raw
    assert not canonical_doc.normalized

    destination_doc = destination_processor.get(docID=RAW['docID'], source=RAW['source'])
    assert not destination_doc

    # Migrate from the canonical to the destination
    tasks.migrate(cross_db, target_db=destination, dry=False, sources=['test'], index=index)

    # Check that the normalized document did not make it to the destination, and the document is still in the canonical
    destination_doc = destination_processor.get(docID=RAW['docID'], source=RAW['source'])
    assert not destination_doc.normalized

    canonical_doc = canonical_processor.get(docID=RAW['docID'], source=RAW['source'])
    assert canonical_doc

    scrapi.processing.elasticsearch.es = real_es
Example #2
def test_cross_db(canonical, destination, monkeypatch, index='test'):

    if canonical == destination:
        return

    monkeypatch.setattr('scrapi.settings.CANONICAL_PROCESSOR', canonical)

    if destination != 'elasticsearch':
        real_es = scrapi.processing.elasticsearch.ElasticsearchProcessor.manager.es
        scrapi.processing.elasticsearch.es = mock.MagicMock()
    else:
        monkeypatch.setattr('scrapi.settings.ELASTIC_INDEX', 'test')

    # Get the test documents into the canonical processor
    canonical_processor = get_processor(canonical)
    canonical_processor.process_raw(RAW)
    canonical_processor.process_normalized(RAW, NORMALIZED)

    destination_processor = get_processor(destination)

    # Check that the canonical document is there, and the destination document is not
    canonical_doc = canonical_processor.get(docID=RAW['docID'],
                                            source=RAW['source'])
    assert canonical_doc

    if destination != 'elasticsearch':
        destination_doc = destination_processor.get(docID=RAW['docID'],
                                                    source=RAW['source'])
        assert not destination_doc
    else:
        destination_doc = destination_processor.get(docID=RAW['docID'],
                                                    index=index,
                                                    source=RAW['source'])
        assert not destination_doc

    # Migrate from the canonical to the destination
    tasks.migrate(cross_db,
                  target_db=destination,
                  dry=False,
                  sources=['test'],
                  index=index)

    # Check that the document made it to the destination, and is still in the canonical
    if destination != 'elasticsearch':
        destination_doc = destination_processor.get(docID=RAW['docID'],
                                                    source=RAW['source'])
        assert destination_doc
    else:
        destination_doc = destination_processor.get(docID=RAW['docID'],
                                                    index=index,
                                                    source=RAW['source'])
        assert destination_doc.normalized

    canonical_doc = canonical_processor.get(docID=RAW['docID'],
                                            source=RAW['source'])
    assert canonical_doc

    if destination != 'elasticsearch':
        scrapi.processing.elasticsearch.es = real_es
Example #3
def cross_db(docs,
             source_db=None,
             target_db=None,
             index=None,
             versions=False,
             **kwargs):
    """
    Migration between:
        cassandra > postgres
        postgres > cassandra
        cassandra > elasticsearch
        postgres > elasticsearch

    source_db can be passed in to the migrate task, and defaults to the CANONICAL_PROCESSOR specified in settings
    target_db must be specified when the task is called
    """
    assert target_db, 'Please specify a target db for the migration -- either postgres or elasticsearch'
    assert target_db in [
        'postgres', 'cassandra', 'elasticsearch'
    ], 'Invalid target database - please specify either postgres, cassandra or elasticsearch'
    source_processor = get_processor(source_db or settings.CANONICAL_PROCESSOR)
    target_processor = get_processor(target_db)
    for doc in docs:
        try:
            if not doc.raw['doc']:
                # corrupted database item has no doc element
                message = 'No doc element in raw doc -- could not migrate document from {} with id {}'.format(
                    doc.raw.attributes['source'], doc.raw.attributes['docID'])
                log_to_sentry(message)
                logger.info(message)
                continue

            raw, normalized = doc.raw, doc.normalized

            if not kwargs.get('dry'):
                if versions:
                    for raw_version, norm_version in source_processor.get_versions(
                            raw['source'], raw['docID']):
                        target_processor.process_raw(raw_version)
                        if norm_version:
                            target_processor.process_normalized(
                                raw_version, norm_version)
                        else:
                            logger.info(
                                'Not storing migrated normalized version from {} with id {}, document is not in approved set list.'
                                .format(raw.attributes['source'],
                                        raw.attributes['docID']))
                else:
                    target_processor.process_raw(raw)
                    if normalized:
                        target_processor.process_normalized(raw, normalized)
                    else:
                        logger.info(
                            'Not storing migrated normalized from {} with id {}, document is not in approved set list.'
                            .format(raw.attributes['source'],
                                    raw.attributes['docID']))
        except Exception as e:
            logger.exception(e)
            log_to_sentry(e)
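
The cross_db function above is normally driven through the migrate task shown in the later examples. A minimal usage sketch, assuming cross_db and the migrate task are importable as shown (the import paths and the target index name are assumptions, not confirmed by this listing):

from scrapi import tasks
from scrapi.migrations import cross_db  # assumed module path

# Copy every harvested document from the canonical store into Elasticsearch.
# dry defaults to True in the migrate task, so it must be set to False to
# actually write; versions=True would also copy each stored document version.
tasks.migrate(cross_db, target_db='elasticsearch', dry=False, index='share_v2')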
Example #4
def test_cross_db_with_versions(canonical,
                                destination,
                                monkeypatch,
                                index='test'):
    new_title = 'How to be really good at Zoo Tycoon: The Definitive Guide'

    if canonical == destination:
        return

    monkeypatch.setattr('scrapi.settings.CANONICAL_PROCESSOR', canonical)

    # Get the test documents into the canonical processor
    canonical_processor = get_processor(canonical)
    canonical_processor.process_raw(RAW)
    canonical_processor.process_normalized(RAW, NORMALIZED)

    # Get a version in there too
    new_normalized = copy.deepcopy(NORMALIZED.attributes)
    new_normalized['title'] = new_title
    canonical_processor.process_normalized(RAW,
                                           NormalizedDocument(new_normalized))

    destination_processor = get_processor(destination)

    # Check that the canonical versions are there, and the destination versions are not
    canonical_versions = list(
        canonical_processor.get_versions(docID=RAW['docID'],
                                         source=RAW['source']))
    assert len(canonical_versions) == 3
    assert canonical_versions[1].normalized['title'] == NORMALIZED['title']
    assert canonical_versions[2].normalized['title'] == new_title

    destination_doc = destination_processor.get(docID=RAW['docID'],
                                                source=RAW['source'])
    assert not destination_doc

    # Migrate from the canonical to the destination
    tasks.migrate(cross_db,
                  target_db=destination,
                  dry=False,
                  sources=['test'],
                  index=index,
                  versions=True)

    # Check that the document made it to the destination, and is still in the canonical
    destination_versions = list(
        destination_processor.get_versions(docID=RAW['docID'],
                                           source=RAW['source']))
    assert len(destination_versions) == 3
    assert destination_versions[1].normalized['title'] == NORMALIZED['title']
    assert destination_versions[2].normalized['title'] == new_title

    canonical_doc = canonical_processor.get(docID=RAW['docID'],
                                            source=RAW['source'])
    assert canonical_doc
Example #5
def delete(docs, sources=None, **kwargs):
    for doc in docs:
        assert sources, "To run this migration you need a source."

        processor = get_processor(settings.CANONICAL_PROCESSOR)
        processor.delete(source=doc.raw.attributes['source'], docID=doc.raw.attributes['docID'])

        es_processor = get_processor('elasticsearch')
        es_processor.manager.es.delete(index=settings.ELASTIC_INDEX, doc_type=sources, id=doc.raw.attributes['docID'], ignore=[404])
        es_processor.manager.es.delete(index='share_v1', doc_type=sources, id=doc.raw.attributes['docID'], ignore=[404])

        logger.info('Deleted document from {} with id {}'.format(sources, doc.raw.attributes['docID']))
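
As the assert above indicates, the delete migration needs an explicit list of sources. A minimal invocation sketch mirroring the test_delete examples further down (the source name is hypothetical and the import paths are assumptions):

from scrapi import tasks
from scrapi.migrations import delete  # assumed module path

# Remove every document harvested from one source, from both the canonical
# processor and the Elasticsearch indices; dry defaults to True in migrate.
tasks.migrate(delete, sources=['defunct_source'], dry=False)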
Example #6
def test_rename(processor_name, monkeypatch):
    real_es = scrapi.processing.elasticsearch.ElasticsearchProcessor.manager.es
    scrapi.processing.elasticsearch.es = mock.MagicMock()
    monkeypatch.setattr('scrapi.settings.CANONICAL_PROCESSOR', processor_name)

    processor = get_processor(processor_name)
    processor.process_raw(RAW)
    processor.process_normalized(RAW, NORMALIZED)

    queryset = processor.get(source=RAW['source'], docID=RAW['docID'])

    old_source = NORMALIZED['shareProperties']['source']

    assert queryset.normalized.attributes['shareProperties']['source'] == utils.RECORD['shareProperties']['source']
    assert queryset.normalized.attributes['shareProperties']['source'] == old_source

    new_record = copy.deepcopy(utils.RECORD)
    new_record['shareProperties']['source'] = 'wwe_news'
    test_harvester.short_name = 'wwe_news'
    registry['wwe_news'] = test_harvester

    tasks.migrate(rename, sources=[old_source], target='wwe_news', dry=False)

    queryset = processor.get(source='wwe_news', docID=RAW['docID'])

    assert queryset.normalized.attributes['shareProperties']['source'] == 'wwe_news'

    scrapi.processing.elasticsearch.ElasticsearchProcessor.manager.es = real_es
    test_harvester.short_name = RAW['source']
    registry['test'] = test_harvester
    del registry['wwe_news']
Example #7
def test_renormalize(processor_name, monkeypatch):
    # Set up
    # real_es = scrapi.processing.elasticsearch.es
    real_es = scrapi.processing.elasticsearch.ElasticsearchProcessor.manager.es

    scrapi.processing.elasticsearch.es = mock.MagicMock()
    monkeypatch.setattr('scrapi.settings.CANONICAL_PROCESSOR', processor_name)

    # Process raw and normalized with fake docs
    processor = get_processor(processor_name)
    processor.process_raw(RAW)
    processor.process_normalized(RAW, NORMALIZED)

    # Check to see those docs were processed
    queryset = processor.get(docID=RAW['docID'], source=RAW['source'])
    assert queryset

    # Create a new document to be renormalized
    new_raw = copy.deepcopy(RAW)
    new_raw.attributes['docID'] = 'get_the_tables'
    new_raw.attributes['doc'] = new_raw.attributes['doc'].encode('utf-8')

    # This is basically like running the improved harvester
    processor.create(new_raw.attributes)

    tasks.migrate(renormalize, sources=[RAW['source']], dry=False)

    queryset = processor.get(docID='get_the_tables', source=RAW['source'])
    assert queryset
    scrapi.processing.elasticsearch.es = real_es
    processor.delete(docID='get_the_tables', source=RAW['source'])
Example #8
def rename(docs, target=None, **kwargs):
    assert target, "To run this migration you need a target."

    for doc in docs:
        new_doc = copy.deepcopy(doc.raw.attributes)
        new_doc['source'] = target

        raw = RawDocument(new_doc, validate=False)

        assert doc.raw.attributes[
            'source'] != target, "Can't rename {} to {}, names are the same.".format(
                doc.raw['source'], target)

        if not kwargs.get('dry'):
            tasks.process_raw(raw)
            tasks.process_normalized(tasks.normalize(raw, raw['source']), raw)
            logger.info('Processed document from {} with id {}'.format(
                doc.raw.attributes['source'], raw['docID']))

            es_processor = get_processor('elasticsearch')
            es_processor.manager.es.delete(
                index=settings.ELASTIC_INDEX,
                doc_type=doc.raw.attributes['source'],
                id=raw['docID'],
                ignore=[404])
            es_processor.manager.es.delete(
                index='share_v1',
                doc_type=doc.raw.attributes['source'],
                id=raw['docID'],
                ignore=[404])

        logger.info('Renamed document from {} to {} with id {}'.format(
            doc.raw.attributes['source'], target, raw['docID']))
Example #9
def test_rename(processor_name, monkeypatch):
    real_es = scrapi.processing.elasticsearch.ElasticsearchProcessor.manager.es
    scrapi.processing.elasticsearch.es = mock.MagicMock()
    monkeypatch.setattr('scrapi.settings.CANONICAL_PROCESSOR', processor_name)

    processor = get_processor(processor_name)
    processor.process_raw(RAW)
    processor.process_normalized(RAW, NORMALIZED)

    queryset = processor.get(source=RAW['source'], docID=RAW['docID'])

    old_source = NORMALIZED['shareProperties']['source']

    assert queryset.normalized.attributes['shareProperties'][
        'source'] == utils.RECORD['shareProperties']['source']
    assert queryset.normalized.attributes['shareProperties'][
        'source'] == old_source

    new_record = copy.deepcopy(utils.RECORD)
    new_record['shareProperties']['source'] = 'wwe_news'
    test_harvester.short_name = 'wwe_news'
    registry['wwe_news'] = test_harvester

    tasks.migrate(rename, sources=[old_source], target='wwe_news', dry=False)

    queryset = processor.get(source='wwe_news', docID=RAW['docID'])

    assert queryset.normalized.attributes['shareProperties'][
        'source'] == 'wwe_news'

    scrapi.processing.elasticsearch.ElasticsearchProcessor.manager.es = real_es
    test_harvester.short_name = RAW['source']
    registry['test'] = test_harvester
    del registry['wwe_news']
Example #10
def test_renormalize(processor_name, monkeypatch):
    # Set up
    # real_es = scrapi.processing.elasticsearch.es
    real_es = scrapi.processing.elasticsearch.ElasticsearchProcessor.manager.es

    scrapi.processing.elasticsearch.es = mock.MagicMock()
    monkeypatch.setattr('scrapi.settings.CANONICAL_PROCESSOR', processor_name)

    # Process raw and normalized with fake docs
    processor = get_processor(processor_name)
    processor.process_raw(RAW)
    processor.process_normalized(RAW, NORMALIZED)

    # Check to see those docs were processed
    queryset = processor.get(docID=RAW['docID'], source=RAW['source'])
    assert queryset

    # Create a new document to be renormalized
    new_raw = copy.deepcopy(RAW)
    new_raw.attributes['docID'] = 'get_the_tables'
    new_raw.attributes['doc'] = new_raw.attributes['doc'].encode('utf-8')

    # This is basically like running the improved harvester
    processor.create(new_raw.attributes)

    tasks.migrate(renormalize, sources=[RAW['source']], dry=False)

    queryset = processor.get(docID='get_the_tables', source=RAW['source'])
    assert queryset
    scrapi.processing.elasticsearch.es = real_es
    processor.delete(docID='get_the_tables', source=RAW['source'])
Example #11
def test_cross_db(canonical, destination, monkeypatch, index='test'):

    if canonical == destination:
        return

    monkeypatch.setattr('scrapi.settings.CANONICAL_PROCESSOR', canonical)

    if destination != 'elasticsearch':
        real_es = scrapi.processing.elasticsearch.ElasticsearchProcessor.manager.es
        scrapi.processing.elasticsearch.es = mock.MagicMock()
    else:
        monkeypatch.setattr('scrapi.settings.ELASTIC_INDEX', 'test')

    # Get the test documents into the canonical processor
    canonical_processor = get_processor(canonical)
    canonical_processor.process_raw(RAW)
    canonical_processor.process_normalized(RAW, NORMALIZED)

    destination_processor = get_processor(destination)

    # Check that the canonical document is there, and the destination document is not
    canonical_doc = canonical_processor.get(docID=RAW['docID'], source=RAW['source'])
    assert canonical_doc

    if destination != 'elasticsearch':
        destination_doc = destination_processor.get(docID=RAW['docID'], source=RAW['source'])
        assert not destination_doc
    else:
        destination_doc = destination_processor.get(docID=RAW['docID'], index=index, source=RAW['source'])
        assert not destination_doc

    # Migrate from the canonical to the destination
    tasks.migrate(cross_db, target_db=destination, dry=False, sources=['test'], index=index)

    # Check that the document made it to the destination, and is still in the canonical
    if destination != 'elasticsearch':
        destination_doc = destination_processor.get(docID=RAW['docID'], source=RAW['source'])
        assert destination_doc
    else:
        destination_doc = destination_processor.get(docID=RAW['docID'], index=index, source=RAW['source'])
        assert destination_doc.normalized

    canonical_doc = canonical_processor.get(docID=RAW['docID'], source=RAW['source'])
    assert canonical_doc

    if destination != 'elasticsearch':
        scrapi.processing.elasticsearch.es = real_es
Example #12
def cross_db(docs, source_db=None, target_db=None, index=None, versions=False, **kwargs):
    """
    Migration between:
        cassandra > postgres
        postgres > cassandra
        cassandra > elasticsearch
        postgres > elasticsearch

    source_db can be passed in to the migrate task, and defaults to the CANONICAL_PROCESSOR specified in settings
    target_db must be specified when the task is called
    """
    assert target_db, 'Please specify a target db for the migration -- either postgres or elasticsearch'
    assert target_db in ['postgres', 'cassandra', 'elasticsearch'], 'Invalid target database - please specify either postgres, cassandra or elasticsearch'
    source_processor = get_processor(source_db or settings.CANONICAL_PROCESSOR)
    target_processor = get_processor(target_db)
    for doc in docs:
        try:
            if not doc.raw['doc']:
                # corrupted database item has no doc element
                message = 'No doc element in raw doc -- could not migrate document from {} with id {}'.format(doc.raw.attributes['source'], doc.raw.attributes['docID'])
                log_to_sentry(message)
                logger.info(message)
                continue

            raw, normalized = doc.raw, doc.normalized

            if not kwargs.get('dry'):
                if versions:
                    for raw_version, norm_version in source_processor.get_versions(raw['source'], raw['docID']):
                        target_processor.process_raw(raw_version)
                        if norm_version:
                            target_processor.process_normalized(raw_version, norm_version)
                        else:
                            logger.info('Not storing migrated normalized version from {} with id {}, document is not in approved set list.'.format(raw.attributes['source'], raw.attributes['docID']))
                else:
                    target_processor.process_raw(raw)
                    if normalized:
                        target_processor.process_normalized(raw, normalized)
                    else:
                        logger.info('Not storing migrated normalized from {} with id {}, document is not in approved set list.'.format(raw.attributes['source'], raw.attributes['docID']))
        except Exception as e:
            logger.exception(e)
            log_to_sentry(e)
Example #13
def test_no_normed_cross_db(destination, index='test'):

    if scrapi.settings.CANONICAL_PROCESSOR == destination:
        return

    real_es = scrapi.processing.elasticsearch.ElasticsearchProcessor.manager.es
    scrapi.processing.elasticsearch.es = mock.MagicMock()

    # Get the test documents into the canonical processor, but don't process normalized
    canonical_processor = get_processor(scrapi.settings.CANONICAL_PROCESSOR)
    canonical_processor.process_raw(RAW)
    # canonical_processor.process_normalized(RAW, NORMALIZED)

    destination_processor = get_processor(destination)

    # Check that the canonical raw document is there, and the destination document is not
    canonical_doc = canonical_processor.get(docID=RAW['docID'],
                                            source=RAW['source'])
    assert canonical_doc.raw
    assert not canonical_doc.normalized

    destination_doc = destination_processor.get(docID=RAW['docID'],
                                                source=RAW['source'])
    assert not destination_doc

    # Migrate from the canonical to the destination
    tasks.migrate(cross_db,
                  target_db=destination,
                  dry=False,
                  sources=['test'],
                  index=index)

    # Check that the normalized document did not make it to the destination, and the document is still in the canonical
    destination_doc = destination_processor.get(docID=RAW['docID'],
                                                source=RAW['source'])
    assert not destination_doc.normalized

    canonical_doc = canonical_processor.get(docID=RAW['docID'],
                                            source=RAW['source'])
    assert canonical_doc

    scrapi.processing.elasticsearch.es = real_es
Example #14
def delete(docs, sources=None, **kwargs):
    for doc in docs:
        assert sources, "To run this migration you need a source."

        processor = get_processor(settings.CANONICAL_PROCESSOR)
        processor.delete(source=doc.raw.attributes['source'],
                         docID=doc.raw.attributes['docID'])

        es_processor = get_processor('elasticsearch')
        es_processor.manager.es.delete(index=settings.ELASTIC_INDEX,
                                       doc_type=sources,
                                       id=doc.raw.attributes['docID'],
                                       ignore=[404])
        es_processor.manager.es.delete(index='share_v1',
                                       doc_type=sources,
                                       id=doc.raw.attributes['docID'],
                                       ignore=[404])

        logger.info('Deleted document from {} with id {}'.format(
            sources, doc.raw.attributes['docID']))
Example #15
def test_cross_db_with_versions(canonical, destination, monkeypatch, index='test'):
    new_title = 'How to be really good at Zoo Tycoon: The Definitive Guide'

    if canonical == destination:
        return

    monkeypatch.setattr('scrapi.settings.CANONICAL_PROCESSOR', canonical)

    # Get the test documents into the canonical processor
    canonical_processor = get_processor(canonical)
    canonical_processor.process_raw(RAW)
    canonical_processor.process_normalized(RAW, NORMALIZED)

    # Get a version in there too
    new_normalized = copy.deepcopy(NORMALIZED.attributes)
    new_normalized['title'] = new_title
    canonical_processor.process_normalized(RAW, NormalizedDocument(new_normalized))

    destination_processor = get_processor(destination)

    # Check that the canonical versions are there, and the destination versions are not
    canonical_versions = list(canonical_processor.get_versions(docID=RAW['docID'], source=RAW['source']))
    assert len(canonical_versions) == 3
    assert canonical_versions[1].normalized['title'] == NORMALIZED['title']
    assert canonical_versions[2].normalized['title'] == new_title

    destination_doc = destination_processor.get(docID=RAW['docID'], source=RAW['source'])
    assert not destination_doc

    # Migrate from the canonical to the destination
    tasks.migrate(cross_db, target_db=destination, dry=False, sources=['test'], index=index, versions=True)

    # Check that the document made it to the destination, and is still in the canonical
    destination_versions = list(destination_processor.get_versions(docID=RAW['docID'], source=RAW['source']))
    assert len(destination_versions) == 3
    assert destination_versions[1].normalized['title'] == NORMALIZED['title']
    assert destination_versions[2].normalized['title'] == new_title

    canonical_doc = canonical_processor.get(docID=RAW['docID'], source=RAW['source'])
    assert canonical_doc
Example #16
def main():
    for docID in LIST_OF_DOCIDS:
        es_processor = get_processor('elasticsearch')
        es_processor.manager.es.delete(index=settings.ELASTIC_INDEX,
                                       doc_type='scholarsphere',
                                       id=docID,
                                       ignore=[404])
        es_processor.manager.es.delete(index='share_v1',
                                       doc_type='scholarsphere',
                                       id=docID,
                                       ignore=[404])

        logger.info('Deleted document with ID {}'.format(docID))
Example #17
def test_delete(processor_name, monkeypatch):
    real_es = scrapi.processing.elasticsearch.ElasticsearchProcessor.manager.es
    scrapi.processing.elasticsearch.es = mock.MagicMock()

    monkeypatch.setattr('scrapi.settings.CANONICAL_PROCESSOR', processor_name)

    print('Canonical Processor is {}'.format(scrapi.settings.CANONICAL_PROCESSOR))
    processor = get_processor(processor_name)
    processor.process_raw(RAW)
    processor.process_normalized(RAW, NORMALIZED)

    queryset = processor.get(docID=RAW['docID'], source=RAW['source'])
    assert queryset

    tasks.migrate(delete, sources=[RAW['source']], dry=False)
    queryset = processor.get(docID=RAW['docID'], source=RAW['source'])
    assert not queryset
    scrapi.processing.elasticsearch.ElasticsearchProcessor.manager.es = real_es
Example #18
def test_delete(processor_name, monkeypatch):
    real_es = scrapi.processing.elasticsearch.ElasticsearchProcessor.manager.es
    scrapi.processing.elasticsearch.es = mock.MagicMock()

    monkeypatch.setattr('scrapi.settings.CANONICAL_PROCESSOR', processor_name)

    print('Canonical Processor is {}'.format(
        scrapi.settings.CANONICAL_PROCESSOR))
    processor = get_processor(processor_name)
    processor.process_raw(RAW)
    processor.process_normalized(RAW, NORMALIZED)

    queryset = processor.get(docID=RAW['docID'], source=RAW['source'])
    assert queryset

    tasks.migrate(delete, sources=[RAW['source']], dry=False)
    queryset = processor.get(docID=RAW['docID'], source=RAW['source'])
    assert not queryset
    scrapi.processing.elasticsearch.ElasticsearchProcessor.manager.es = real_es
Example #19
def rename(docs, target=None, **kwargs):
    assert target, "To run this migration you need a target."

    for doc in docs:
        new_doc = copy.deepcopy(doc.raw.attributes)
        new_doc['source'] = target

        raw = RawDocument(new_doc, validate=False)

        assert doc.raw.attributes['source'] != target, "Can't rename {} to {}, names are the same.".format(doc.raw['source'], target)

        if not kwargs.get('dry'):
            tasks.process_raw(raw)
            tasks.process_normalized(tasks.normalize(raw, raw['source']), raw)
            logger.info('Processed document from {} with id {}'.format(doc.raw.attributes['source'], raw['docID']))

            es_processor = get_processor('elasticsearch')
            es_processor.manager.es.delete(index=settings.ELASTIC_INDEX, doc_type=doc.raw.attributes['source'], id=raw['docID'], ignore=[404])
            es_processor.manager.es.delete(index='share_v1', doc_type=doc.raw.attributes['source'], id=raw['docID'], ignore=[404])

        logger.info('Renamed document from {} to {} with id {}'.format(doc.raw.attributes['source'], target, raw['docID']))
Example #20
def test_raises_on_bad_processor():
    with pytest.raises(NotImplementedError):
        processing.get_processor("Baby, You're never there.")
Example #21
    return normalized  # returns a single normalized document


@task_autoretry(default_retry_delay=settings.CELERY_RETRY_DELAY, max_retries=settings.CELERY_MAX_RETRIES, throws=events.Skip)
@events.logged(events.PROCESSING, 'normalized')
def process_normalized(normalized_doc, raw_doc, **kwargs):
    if not normalized_doc:
        raise events.Skip('Not processing document with id {}'.format(raw_doc['docID']))
    processing.process_normalized(raw_doc, normalized_doc, kwargs)


@app.task
def migrate(migration, source_db=None, sources=tuple(), async=False, dry=True, group_size=1000, **kwargs):

    source_db = source_db or settings.CANONICAL_PROCESSOR
    documents = processing.get_processor(source_db).documents

    doc_sources = sources or registry.keys()
    docs = documents(*doc_sources)
    if async:
        segment = list(islice(docs, group_size))
        while segment:
            migration.s(segment, sources=sources, dry=dry, **kwargs).apply_async()
            segment = list(islice(docs, group_size))
    else:
        for doc in docs:
            migration((doc,), sources=sources, dry=dry, **kwargs)

    if dry:
        logger.info('Dry run complete')
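
For large collections the migrate task can fan work out by passing async=True, which slices the document stream into group_size chunks and dispatches each chunk as a Celery subtask. A hedged sketch, assuming renormalize (used in the test_renormalize examples) is importable from scrapi.migrations and using a hypothetical source name:

from scrapi import tasks
from scrapi.migrations import renormalize  # assumed module path

# Re-run normalization for one source in asynchronous batches of 500 raw
# documents; with dry=True (the default) the run would only be logged.
tasks.migrate(renormalize, sources=['example_source'], async=True, group_size=500, dry=False)

Note that async is a reserved word from Python 3.7 onward, so this keyword argument assumes the older Python versions this codebase targets.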
Example #22
def test_raises_on_bad_processor():
    with pytest.raises(NotImplementedError):
        processing.get_processor("Baby, You're never there.")
Example #23
        raise events.Skip('Not processing document with id {}'.format(
            raw_doc['docID']))
    processing.process_normalized(raw_doc, normalized_doc, kwargs)


@app.task
def migrate(migration,
            source_db=None,
            sources=tuple(),
            async=False,
            dry=True,
            group_size=1000,
            **kwargs):

    source_db = source_db or settings.CANONICAL_PROCESSOR
    documents = processing.get_processor(source_db).documents

    doc_sources = sources or registry.keys()
    docs = documents(*doc_sources)
    if async:
        segment = list(islice(docs, group_size))
        while segment:
            migration.s(segment,
                        sources=sources,
                        dry=dry,
                        source_db=source_db,
                        **kwargs).apply_async()
            segment = list(islice(docs, group_size))
    else:
        for doc in docs:
            migration((doc, ), sources=sources, dry=dry, **kwargs)