def test_arxiv(
    expected_results,
    config,
    spider,
):
    crawler = get_crawler_instance(config['CRAWLER_HOST_URL'])

    results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=100,
        events_limit=1,
        crawler_instance=crawler,
        project=config['CRAWLER_PROJECT'],
        spider=spider,
        settings={},
        **config['CRAWLER_ARGUMENTS']
    )

    gotten_results = [override_generated_fields(result) for result in results]
    expected_results = [
        override_generated_fields(expected) for expected in expected_results
    ]

    gotten_results = deep_sort(gotten_results)
    expected_results = deep_sort(expected_results)

    assert gotten_results == expected_results
def test_cds(set_up_local_environment, expected_results):
    crawler = get_crawler_instance(
        set_up_local_environment.get('CRAWLER_HOST_URL'))

    crawl_results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=100,
        events_limit=1,
        crawler_instance=crawler,
        project=set_up_local_environment.get('CRAWLER_PROJECT'),
        spider='CDS',
        settings={},
        **set_up_local_environment.get('CRAWLER_ARGUMENTS'))

    crawl_results = deep_sort(
        sorted(
            crawl_results,
            key=lambda result: result['record']['titles'][0]['title'],
        ))
    expected_results = deep_sort(
        sorted(
            expected_results,
            key=lambda result: result['titles'][0]['title'],
        ))

    gotten_results = [
        override_generated_fields(result['record'])
        for result in crawl_results
    ]
    expected_results = [
        override_generated_fields(expected) for expected in expected_results
    ]

    assert gotten_results == expected_results
def test_desy(
    expected_results,
    settings,
    cleanup,
):
    crawler = get_crawler_instance(settings.get('CRAWLER_HOST_URL'))

    crawl_results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=100,
        events_limit=2,
        crawler_instance=crawler,
        project=settings.get('CRAWLER_PROJECT'),
        spider='desy',
        settings={},
        **settings.get('CRAWLER_ARGUMENTS'))

    records = [result['record'] for result in crawl_results]

    gotten_results = override_dynamic_fields_on_records(records)
    expected_results = override_dynamic_fields_on_records(expected_results)

    gotten_results = deep_sort(
        sorted(
            gotten_results,
            key=lambda result: result['titles'][0]['title'],
        ))
    expected_results = deep_sort(
        sorted(
            expected_results,
            key=lambda result: result['titles'][0]['title'],
        ))

    assert gotten_results == expected_results
def test_pipeline(generated_records, expected_records):
    clean_generated_records = [
        override_generated_fields(generated_record)
        for generated_record in generated_records
    ]

    sorted_generated_records = deep_sort(clean_generated_records)
    sorted_expected_records = deep_sort(expected_records)

    assert sorted_generated_records == sorted_expected_records
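# The comparison helpers used throughout these tests ship with the suite's
# testlib. For context, a minimal sketch of what `override_generated_fields`
# might do: pin run-specific values so records from different crawl runs
# compare equal. The field name and placeholder values below are illustrative
# assumptions, not the suite's actual implementation.
def override_generated_fields(record):
    # Copy so the caller's record is left untouched.
    record = dict(record)
    if 'acquisition_source' in record:
        # Timestamps and submission ids differ on every run; pin them.
        record['acquisition_source'] = dict(
            record['acquisition_source'],
            datetime='2017-04-03T10:26:40',
            submission_number='0000000000000000',
        )
    return record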
def test_desy_crawl_twice(expected_results, settings, cleanup):
    crawler = get_crawler_instance(
        settings.get('CRAWLER_HOST_URL')
    )

    crawl_results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=100,
        events_limit=1,
        crawler_instance=crawler,
        project=settings.get('CRAWLER_PROJECT'),
        spider='desy',
        settings={},
        **settings.get('CRAWLER_ARGUMENTS')
    )

    assert len(crawl_results) == 1

    crawl_result = crawl_results[0]

    gotten_records = [
        result['record'] for result in crawl_result['results_data']
    ]
    gotten_records = override_dynamic_fields_on_records(gotten_records)
    expected_results = override_dynamic_fields_on_records(expected_results)

    gotten_records = deep_sort(
        sorted(
            gotten_records,
            key=lambda record: record['titles'][0]['title'],
        )
    )
    expected_results = deep_sort(
        sorted(
            expected_results,
            key=lambda result: result['titles'][0]['title'],
        )
    )

    assert gotten_records == expected_results
    assert not crawl_result['errors']

    # Second crawl: the records are already known, so nothing new
    # should be harvested.
    crawl_results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=100,
        events_limit=1,
        crawler_instance=crawler,
        project=settings.get('CRAWLER_PROJECT'),
        spider='desy',
        settings={},
        **settings.get('CRAWLER_ARGUMENTS')
    )

    assert len(crawl_results) == 0
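# `override_dynamic_fields_on_records` is the list-level counterpart used by
# the DESY tests. A plausible minimal sketch, assuming it simply applies the
# per-record override to every harvested record (the real helper may pin
# additional fields):
def override_dynamic_fields_on_records(records):
    return [override_generated_fields(record) for record in records]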
def test_cds_crawl_twice(set_up_local_environment, expected_results):
    crawler = get_crawler_instance(
        set_up_local_environment.get('CRAWLER_HOST_URL')
    )

    crawl_results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=20,
        events_limit=1,
        crawler_instance=crawler,
        project=set_up_local_environment.get('CRAWLER_PROJECT'),
        spider='CDS',
        settings={},
        **set_up_local_environment.get('CRAWLER_ARGUMENTS')
    )

    assert len(crawl_results) == 1

    crawl_result = crawl_results[0]

    results_records = deep_sort(
        sorted(
            crawl_result['results_data'],
            key=lambda result: result['record']['titles'][0]['title'],
        )
    )
    expected_results = deep_sort(
        sorted(
            expected_results,
            key=lambda result: result['titles'][0]['title'],
        )
    )

    gotten_results = [
        override_generated_fields(result['record'])
        for result in results_records
    ]
    expected_results = [
        override_generated_fields(expected) for expected in expected_results
    ]

    assert gotten_results == expected_results
    assert not crawl_result['errors']

    # Second crawl: the records are already known, so nothing new is returned.
    crawl_results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=20,
        crawler_instance=crawler,
        project=set_up_local_environment.get('CRAWLER_PROJECT'),
        spider='CDS',
        settings={},
        **set_up_local_environment.get('CRAWLER_ARGUMENTS')
    )

    assert len(crawl_results) == 0
def test_deep_sort_with_query_parser_output():
    element = {
        "bool": {
            "filter": {
                "bool": {
                    "should": [
                        {"term": {"authors.name_variations": "j ellis"}},
                        {"term": {"authors.name_variations": "ellis j"}},
                    ]
                }
            },
            "must": {"match": {"authors.full_name": "ellis, j"}},
        }
    }
    expected_element = {
        "bool": {
            "filter": {
                "bool": {
                    "should": [
                        {"term": {"authors.name_variations": "ellis j"}},
                        {"term": {"authors.name_variations": "j ellis"}},
                    ]
                }
            },
            "must": {"match": {"authors.full_name": "ellis, j"}},
        }
    }

    # Sort the unsorted input, not the already-sorted expectation, so the
    # test actually exercises deep_sort.
    result = deep_sort(element)

    assert result == expected_element
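# `deep_sort` is what the tests above and below exercise: it recursively
# sorts every nested list so that structurally equal payloads compare equal
# regardless of list order. A minimal self-contained sketch, assuming
# JSON-serialisable input; not necessarily the suite's implementation:
import json


def deep_sort(element):
    if isinstance(element, dict):
        # Dict key order is irrelevant for ==, so only sort inside values.
        return {key: deep_sort(value) for key, value in element.items()}
    if isinstance(element, list):
        # Sort items by a canonical serialisation so dicts become orderable.
        return sorted(
            (deep_sort(item) for item in element),
            key=lambda item: json.dumps(item, sort_keys=True),
        )
    return element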
def test_arxiv(
    expected_results,
    config,
    spider,
):
    crawler = get_crawler_instance(config['CRAWLER_HOST_URL'])

    crawl_results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=100,
        events_limit=1,
        crawler_instance=crawler,
        project=config['CRAWLER_PROJECT'],
        spider=spider,
        settings={},
        **config['CRAWLER_ARGUMENTS']
    )

    assert len(crawl_results) == 1

    crawl_result = crawl_results[0]

    gotten_results = [
        override_generated_fields(result['record'])
        for result in crawl_result['results_data']
    ]
    expected_results = [
        override_generated_fields(expected) for expected in expected_results
    ]

    gotten_results = deep_sort(gotten_results)
    expected_results = deep_sort(expected_results)

    assert gotten_results == expected_results
    assert not crawl_result['errors']
def test_cds(
    expected_results,
    config,
    spider,
):
    crawler = get_crawler_instance(config['CRAWLER_HOST_URL'])

    crawl_results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=100,
        events_limit=1,
        crawler_instance=crawler,
        project=config['CRAWLER_PROJECT'],
        spider=spider,
        settings={},
        **config['CRAWLER_ARGUMENTS']
    )

    assert len(crawl_results) == 1

    crawl_result = crawl_results[0]

    gotten_results = [
        override_generated_fields(result['record'])
        for result in crawl_result['results_data']
    ]
    expected_results = [
        override_generated_fields(expected) for expected in expected_results
    ]

    gotten_results = deep_sort(gotten_results)
    expected_results = deep_sort(expected_results)

    assert gotten_results == expected_results
    assert not crawl_result['errors']
def test_deep_sort_with_dict_in_list():
    element = [{'b': {'name': 'bb'}}, {'a': {'name': 'aa'}}]
    expected_element = [{'a': {'name': 'aa'}}, {'b': {'name': 'bb'}}]

    # Sort the unsorted input so the assertion is meaningful.
    result = deep_sort(element)

    assert result == expected_element
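# Quick usage check of the sketched deep_sort against the case above: the
# list of single-key dicts comes back in canonical order.
if __name__ == '__main__':
    shuffled = [{'b': {'name': 'bb'}}, {'a': {'name': 'aa'}}]
    assert deep_sort(shuffled) == [{'a': {'name': 'aa'}}, {'b': {'name': 'bb'}}]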