def test_no_normed_cross_db(destination, index='test'):
    if scrapi.settings.CANONICAL_PROCESSOR == destination:
        return

    real_es = scrapi.processing.elasticsearch.ElasticsearchProcessor.manager.es
    scrapi.processing.elasticsearch.es = mock.MagicMock()

    # Get the test documents into the canonical processor, but don't process normalized
    canonical_processor = get_processor(scrapi.settings.CANONICAL_PROCESSOR)
    canonical_processor.process_raw(RAW)
    # canonical_processor.process_normalized(RAW, NORMALIZED)

    destination_processor = get_processor(destination)

    # Check that the canonical raw is there, and the destination document is not
    canonical_doc = canonical_processor.get(docID=RAW['docID'], source=RAW['source'])
    assert canonical_doc.raw
    assert not canonical_doc.normalized

    destination_doc = destination_processor.get(docID=RAW['docID'], source=RAW['source'])
    assert not destination_doc

    # Migrate from the canonical to the destination
    tasks.migrate(cross_db, target_db=destination, dry=False, sources=['test'], index=index)

    # Check that the normalized document didn't make it to the destination,
    # and that the document is still in the canonical
    destination_doc = destination_processor.get(docID=RAW['docID'], source=RAW['source'])
    assert not destination_doc.normalized

    canonical_doc = canonical_processor.get(docID=RAW['docID'], source=RAW['source'])
    assert canonical_doc

    scrapi.processing.elasticsearch.es = real_es
def test_cross_db(canonical, destination, monkeypatch, index='test'):
    if canonical == destination:
        return

    monkeypatch.setattr('scrapi.settings.CANONICAL_PROCESSOR', canonical)

    if destination != 'elasticsearch':
        real_es = scrapi.processing.elasticsearch.ElasticsearchProcessor.manager.es
        scrapi.processing.elasticsearch.es = mock.MagicMock()
    else:
        monkeypatch.setattr('scrapi.settings.ELASTIC_INDEX', 'test')

    # Get the test documents into the canonical processor
    canonical_processor = get_processor(canonical)
    canonical_processor.process_raw(RAW)
    canonical_processor.process_normalized(RAW, NORMALIZED)

    destination_processor = get_processor(destination)

    # Check that the canonical document is there, and the destination document is not
    canonical_doc = canonical_processor.get(docID=RAW['docID'], source=RAW['source'])
    assert canonical_doc

    if destination != 'elasticsearch':
        destination_doc = destination_processor.get(docID=RAW['docID'], source=RAW['source'])
        assert not destination_doc
    else:
        destination_doc = destination_processor.get(docID=RAW['docID'], index=index, source=RAW['source'])
        assert not destination_doc

    # Migrate from the canonical to the destination
    tasks.migrate(cross_db, target_db=destination, dry=False, sources=['test'], index=index)

    # Check that the document made it to the destination, and is still in the canonical
    if destination != 'elasticsearch':
        destination_doc = destination_processor.get(docID=RAW['docID'], source=RAW['source'])
        assert destination_doc
    else:
        destination_doc = destination_processor.get(docID=RAW['docID'], index=index, source=RAW['source'])
        assert destination_doc.normalized

    canonical_doc = canonical_processor.get(docID=RAW['docID'], source=RAW['source'])
    assert canonical_doc

    if destination != 'elasticsearch':
        scrapi.processing.elasticsearch.es = real_es
def cross_db(docs, source_db=None, target_db=None, index=None, versions=False, **kwargs):
    """
    Migration to go between:
        cassandra > postgres
        postgres > cassandra
        cassandra > elasticsearch
        postgres > elasticsearch

    source_db can be passed in to the migrate task, and will default to the
    CANONICAL_PROCESSOR specified in settings.
    target_db will be specified when the task is called.
    """
    assert target_db, 'Please specify a target db for the migration -- postgres, cassandra or elasticsearch'
    assert target_db in ['postgres', 'cassandra', 'elasticsearch'], 'Invalid target database - please specify either postgres, cassandra or elasticsearch'

    source_processor = get_processor(source_db or settings.CANONICAL_PROCESSOR)
    target_processor = get_processor(target_db)

    for doc in docs:
        try:
            if not doc.raw['doc']:
                # A corrupted database item has no doc element
                message = 'No doc element in raw doc -- could not migrate document from {} with id {}'.format(doc.raw.attributes['source'], doc.raw.attributes['docID'])
                log_to_sentry(message)
                logger.info(message)
                continue

            raw, normalized = doc.raw, doc.normalized

            if not kwargs.get('dry'):
                if versions:
                    for raw_version, norm_version in source_processor.get_versions(raw['source'], raw['docID']):
                        target_processor.process_raw(raw_version)
                        if norm_version:
                            target_processor.process_normalized(raw_version, norm_version)
                        else:
                            logger.info('Not storing migrated normalized version from {} with id {}, document is not in approved set list.'.format(raw.attributes['source'], raw.attributes['docID']))
                else:
                    target_processor.process_raw(raw)
                    if normalized:
                        target_processor.process_normalized(raw, normalized)
                    else:
                        logger.info('Not storing migrated normalized from {} with id {}, document is not in approved set list.'.format(raw.attributes['source'], raw.attributes['docID']))
        except Exception as e:
            logger.exception(e)
            log_to_sentry(e)
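A minimal usage sketch (the import paths are assumptions; the keyword arguments mirror the calls made in the tests): cross_db is not invoked directly but handed to the migrate task, which streams documents out of the canonical store and forwards target_db, index and versions through **kwargs.

# Hypothetical driver, assuming the tasks and migrations modules are importable as shown.
from scrapi import tasks
from scrapi.migrations import cross_db

# Copy raw + normalized documents for the 'test' source from the canonical store into postgres.
tasks.migrate(cross_db, target_db='postgres', dry=False, sources=['test'])

# Same migration into elasticsearch, carrying the full version history.
tasks.migrate(cross_db, target_db='elasticsearch', dry=False, sources=['test'], index='test', versions=True)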
def test_cross_db_with_versions(canonical, destination, monkeypatch, index='test'):
    new_title = 'How to be really good at Zoo Tycoon: The Definitive Guide'

    if canonical == destination:
        return

    monkeypatch.setattr('scrapi.settings.CANONICAL_PROCESSOR', canonical)

    # Get the test documents into the canonical processor
    canonical_processor = get_processor(canonical)
    canonical_processor.process_raw(RAW)
    canonical_processor.process_normalized(RAW, NORMALIZED)

    # Get a version in there too
    new_normalized = copy.deepcopy(NORMALIZED.attributes)
    new_normalized['title'] = new_title
    canonical_processor.process_normalized(RAW, NormalizedDocument(new_normalized))

    destination_processor = get_processor(destination)

    # Check that the canonical versions are there, and the destination's are not
    canonical_versions = list(canonical_processor.get_versions(docID=RAW['docID'], source=RAW['source']))
    assert len(canonical_versions) == 3
    assert canonical_versions[1].normalized['title'] == NORMALIZED['title']
    assert canonical_versions[2].normalized['title'] == new_title

    destination_doc = destination_processor.get(docID=RAW['docID'], source=RAW['source'])
    assert not destination_doc

    # Migrate from the canonical to the destination
    tasks.migrate(cross_db, target_db=destination, dry=False, sources=['test'], index=index, versions=True)

    # Check that the versions made it to the destination, and are still in the canonical
    destination_versions = list(destination_processor.get_versions(docID=RAW['docID'], source=RAW['source']))
    assert len(destination_versions) == 3
    assert destination_versions[1].normalized['title'] == NORMALIZED['title']
    assert destination_versions[2].normalized['title'] == new_title

    canonical_doc = canonical_processor.get(docID=RAW['docID'], source=RAW['source'])
    assert canonical_doc
def delete(docs, sources=None, **kwargs):
    for doc in docs:
        assert sources, "To run this migration you need a source."

        processor = get_processor(settings.CANONICAL_PROCESSOR)
        processor.delete(source=doc.raw.attributes['source'], docID=doc.raw.attributes['docID'])

        es_processor = get_processor('elasticsearch')
        es_processor.manager.es.delete(index=settings.ELASTIC_INDEX, doc_type=sources, id=doc.raw.attributes['docID'], ignore=[404])
        es_processor.manager.es.delete(index='share_v1', doc_type=sources, id=doc.raw.attributes['docID'], ignore=[404])

        logger.info('Deleted document from {} with id {}'.format(sources, doc.raw.attributes['docID']))
def test_rename(processor_name, monkeypatch):
    real_es = scrapi.processing.elasticsearch.ElasticsearchProcessor.manager.es
    scrapi.processing.elasticsearch.es = mock.MagicMock()
    monkeypatch.setattr('scrapi.settings.CANONICAL_PROCESSOR', processor_name)

    processor = get_processor(processor_name)
    processor.process_raw(RAW)
    processor.process_normalized(RAW, NORMALIZED)

    queryset = processor.get(source=RAW['source'], docID=RAW['docID'])
    old_source = NORMALIZED['shareProperties']['source']

    assert queryset.normalized.attributes['shareProperties']['source'] == utils.RECORD['shareProperties']['source']
    assert queryset.normalized.attributes['shareProperties']['source'] == old_source

    new_record = copy.deepcopy(utils.RECORD)
    new_record['shareProperties']['source'] = 'wwe_news'
    test_harvester.short_name = 'wwe_news'
    registry['wwe_news'] = test_harvester

    tasks.migrate(rename, sources=[old_source], target='wwe_news', dry=False)

    queryset = processor.get(source='wwe_news', docID=RAW['docID'])
    assert queryset.normalized.attributes['shareProperties']['source'] == 'wwe_news'

    scrapi.processing.elasticsearch.ElasticsearchProcessor.manager.es = real_es
    test_harvester.short_name = RAW['source']
    registry['test'] = test_harvester
    del registry['wwe_news']
def test_renormalize(processor_name, monkeypatch):
    # Set up
    real_es = scrapi.processing.elasticsearch.ElasticsearchProcessor.manager.es
    scrapi.processing.elasticsearch.es = mock.MagicMock()
    monkeypatch.setattr('scrapi.settings.CANONICAL_PROCESSOR', processor_name)

    # Process raw and normalized with fake docs
    processor = get_processor(processor_name)
    processor.process_raw(RAW)
    processor.process_normalized(RAW, NORMALIZED)

    # Check to see those docs were processed
    queryset = processor.get(docID=RAW['docID'], source=RAW['source'])
    assert queryset

    # Create a new document to be renormalized
    new_raw = copy.deepcopy(RAW)
    new_raw.attributes['docID'] = 'get_the_tables'
    new_raw.attributes['doc'] = new_raw.attributes['doc'].encode('utf-8')

    # This is basically like running the improved harvester
    processor.create(new_raw.attributes)

    tasks.migrate(renormalize, sources=[RAW['source']], dry=False)

    queryset = processor.get(docID='get_the_tables', source=RAW['source'])
    assert queryset

    scrapi.processing.elasticsearch.es = real_es
    processor.delete(docID='get_the_tables', source=RAW['source'])
def rename(docs, target=None, **kwargs):
    assert target, "To run this migration you need a target."

    for doc in docs:
        new_doc = copy.deepcopy(doc.raw.attributes)
        new_doc['source'] = target

        raw = RawDocument(new_doc, validate=False)

        assert doc.raw.attributes['source'] != target, "Can't rename {} to {}, names are the same.".format(doc.raw['source'], target)

        if not kwargs.get('dry'):
            tasks.process_raw(raw)
            tasks.process_normalized(tasks.normalize(raw, raw['source']), raw)
            logger.info('Processed document from {} with id {}'.format(doc.raw.attributes['source'], raw['docID']))

            es_processor = get_processor('elasticsearch')
            es_processor.manager.es.delete(index=settings.ELASTIC_INDEX, doc_type=doc.raw.attributes['source'], id=raw['docID'], ignore=[404])
            es_processor.manager.es.delete(index='share_v1', doc_type=doc.raw.attributes['source'], id=raw['docID'], ignore=[404])

        logger.info('Renamed document from {} to {} with id {}'.format(doc.raw.attributes['source'], target, raw['docID']))
def main():
    for docID in LIST_OF_DOCIDS:
        es_processor = get_processor('elasticsearch')
        es_processor.manager.es.delete(index=settings.ELASTIC_INDEX, doc_type='scholarsphere', id=docID, ignore=[404])
        es_processor.manager.es.delete(index='share_v1', doc_type='scholarsphere', id=docID, ignore=[404])
        logger.info('Deleted document with ID {}'.format(docID))
def test_delete(processor_name, monkeypatch):
    real_es = scrapi.processing.elasticsearch.ElasticsearchProcessor.manager.es
    scrapi.processing.elasticsearch.es = mock.MagicMock()
    monkeypatch.setattr('scrapi.settings.CANONICAL_PROCESSOR', processor_name)
    print('Canonical Processor is {}'.format(scrapi.settings.CANONICAL_PROCESSOR))

    processor = get_processor(processor_name)
    processor.process_raw(RAW)
    processor.process_normalized(RAW, NORMALIZED)

    queryset = processor.get(docID=RAW['docID'], source=RAW['source'])
    assert queryset

    tasks.migrate(delete, sources=[RAW['source']], dry=False)

    queryset = processor.get(docID=RAW['docID'], source=RAW['source'])
    assert not queryset

    scrapi.processing.elasticsearch.ElasticsearchProcessor.manager.es = real_es
def test_raises_on_bad_processor():
    with pytest.raises(NotImplementedError):
        processing.get_processor("Baby, You're never there.")
    return normalized  # returns a single normalized document


@task_autoretry(default_retry_delay=settings.CELERY_RETRY_DELAY, max_retries=settings.CELERY_MAX_RETRIES, throws=events.Skip)
@events.logged(events.PROCESSING, 'normalized')
def process_normalized(normalized_doc, raw_doc, **kwargs):
    if not normalized_doc:
        raise events.Skip('Not processing document with id {}'.format(raw_doc['docID']))
    processing.process_normalized(raw_doc, normalized_doc, kwargs)


@app.task
def migrate(migration, source_db=None, sources=tuple(), async=False, dry=True, group_size=1000, **kwargs):
    source_db = source_db or settings.CANONICAL_PROCESSOR
    documents = processing.get_processor(source_db).documents

    doc_sources = sources or registry.keys()
    docs = documents(*doc_sources)

    if async:
        segment = list(islice(docs, group_size))
        while segment:
            migration.s(segment, sources=sources, dry=dry, **kwargs).apply_async()
            segment = list(islice(docs, group_size))
    else:
        for doc in docs:
            migration((doc,), sources=sources, dry=dry, **kwargs)

    if dry:
        logger.info('Dry run complete')
        raise events.Skip('Not processing document with id {}'.format(raw_doc['docID']))

    processing.process_normalized(raw_doc, normalized_doc, kwargs)


@app.task
def migrate(migration, source_db=None, sources=tuple(), async=False, dry=True, group_size=1000, **kwargs):
    source_db = source_db or settings.CANONICAL_PROCESSOR
    documents = processing.get_processor(source_db).documents

    doc_sources = sources or registry.keys()
    docs = documents(*doc_sources)

    if async:
        segment = list(islice(docs, group_size))
        while segment:
            migration.s(segment, sources=sources, dry=dry, source_db=source_db, **kwargs).apply_async()
            segment = list(islice(docs, group_size))
    else:
        for doc in docs:
            migration((doc,), sources=sources, dry=dry, **kwargs)
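A sketch of how the migrate task above is typically driven (import paths are assumptions; the source and target names are taken from the tests earlier in this section): with async=False each document is passed to the migration in-process one at a time, while async=True slices the document stream into group_size chunks and dispatches each chunk as its own celery task.

# Hypothetical driver for the migrate task; assumes the celery app, settings and
# migration functions are importable as shown.
from scrapi import tasks
from scrapi.migrations import rename

# Dry run (the default): documents are streamed through the migration, and
# migrations that check kwargs.get('dry') (e.g. cross_db, rename) skip their writes.
tasks.migrate(rename, sources=['test'], target='wwe_news')

# Real run, dispatched asynchronously in chunks of 1000 documents per celery task.
tasks.migrate(rename, sources=['test'], target='wwe_news', dry=False, async=True, group_size=1000)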