# Example 1
def test_arxiv(
    expected_results,
    config,
    spider,
):
    """Run a live arXiv crawl and compare its records against fixtures."""
    crawler = get_crawler_instance(config['CRAWLER_HOST_URL'])

    crawl_results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=100,
        events_limit=1,
        crawler_instance=crawler,
        project=config['CRAWLER_PROJECT'],
        spider=spider,
        settings={},
        **config['CRAWLER_ARGUMENTS'])

    # Normalize volatile generated fields on both sides, then deep-sort so
    # the comparison is insensitive to nested ordering.
    normalized_gotten = deep_sort(
        [override_generated_fields(record) for record in crawl_results]
    )
    normalized_expected = deep_sort(
        [override_generated_fields(record) for record in expected_results]
    )

    assert normalized_gotten == normalized_expected
# Example 2
def test_cds(set_up_local_environment, expected_results):
    """Run the CDS spider end to end and diff its records against fixtures."""
    env = set_up_local_environment
    crawler = get_crawler_instance(env.get('CRAWLER_HOST_URL'))

    crawl_results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=100,
        events_limit=1,
        crawler_instance=crawler,
        project=env.get('CRAWLER_PROJECT'),
        spider='CDS',
        settings={},
        **env.get('CRAWLER_ARGUMENTS'))

    # Order-insensitive comparison: sort both sides by first title, then
    # deep-sort nested structures.
    def first_title(record):
        return record['titles'][0]['title']

    crawl_results = deep_sort(
        sorted(
            crawl_results,
            key=lambda result: first_title(result['record']),
        ))
    expected_results = deep_sort(sorted(expected_results, key=first_title))

    gotten_results = [
        override_generated_fields(result['record']) for result in crawl_results
    ]
    expected_results = [
        override_generated_fields(expected) for expected in expected_results
    ]

    assert gotten_results == expected_results
# Example 3
def test_desy(
    expected_results,
    settings,
    cleanup,
):
    """Run the desy spider end to end and diff its records against fixtures."""
    crawler = get_crawler_instance(settings.get('CRAWLER_HOST_URL'))

    crawl_results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=100,
        events_limit=2,
        crawler_instance=crawler,
        project=settings.get('CRAWLER_PROJECT'),
        spider='desy',
        settings={},
        **settings.get('CRAWLER_ARGUMENTS'))

    harvested = [result['record'] for result in crawl_results]

    # Override fields that vary run-to-run before comparing.
    gotten_results = override_dynamic_fields_on_records(harvested)
    expected_results = override_dynamic_fields_on_records(expected_results)

    # Sort by first title, then deep-sort, so ordering cannot fail the test.
    def first_title(record):
        return record['titles'][0]['title']

    gotten_results = deep_sort(sorted(gotten_results, key=first_title))
    expected_results = deep_sort(sorted(expected_results, key=first_title))

    assert gotten_results == expected_results
# Example 4
def test_pipeline(generated_records, expected_records):
    """Pipeline output, with generated fields overridden, must match fixtures."""
    normalized_generated = deep_sort([
        override_generated_fields(record) for record in generated_records
    ])
    normalized_expected = deep_sort(expected_records)
    assert normalized_generated == normalized_expected
# Example 5
def test_pipeline(generated_records, expected_records):
    """Pipeline output, with generated fields overridden, must match fixtures."""
    cleaned = map(override_generated_fields, generated_records)
    sorted_generated_records = deep_sort(list(cleaned))
    sorted_expected_records = deep_sort(expected_records)
    assert sorted_generated_records == sorted_expected_records
# Example 6
def test_desy_crawl_twice(expected_results, settings, cleanup):
    """First desy crawl yields the fixture records; an immediate re-crawl yields nothing."""
    crawler = get_crawler_instance(settings.get('CRAWLER_HOST_URL'))

    # Both crawls use an identical configuration.
    def run_crawl():
        return CeleryMonitor.do_crawl(
            app=celery_app,
            monitor_timeout=5,
            monitor_iter_limit=100,
            events_limit=1,
            crawler_instance=crawler,
            project=settings.get('CRAWLER_PROJECT'),
            spider='desy',
            settings={},
            **settings.get('CRAWLER_ARGUMENTS')
        )

    crawl_results = run_crawl()
    assert len(crawl_results) == 1

    crawl_result = crawl_results[0]

    gotten_records = override_dynamic_fields_on_records([
        result['record'] for result in crawl_result['results_data']
    ])
    expected_records = override_dynamic_fields_on_records(expected_results)

    # Order-insensitive comparison: sort by first title, then deep-sort.
    def first_title(record):
        return record['titles'][0]['title']

    gotten_records = deep_sort(sorted(gotten_records, key=first_title))
    expected_records = deep_sort(sorted(expected_records, key=first_title))

    assert gotten_records == expected_records
    assert not crawl_result['errors']

    # Second crawl: everything was already harvested, so nothing comes back.
    assert len(run_crawl()) == 0
# Example 7
def test_cds_crawl_twice(set_up_local_environment, expected_results):
    """First CDS crawl yields the fixture records; an immediate re-crawl yields nothing."""
    env = set_up_local_environment
    crawler = get_crawler_instance(env.get('CRAWLER_HOST_URL'))

    crawl_results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=20,
        events_limit=1,
        crawler_instance=crawler,
        project=env.get('CRAWLER_PROJECT'),
        spider='CDS',
        settings={},
        **env.get('CRAWLER_ARGUMENTS')
    )

    assert len(crawl_results) == 1

    crawl_result = crawl_results[0]

    # Order-insensitive comparison: sort by first title, then deep-sort.
    def first_title(record):
        return record['titles'][0]['title']

    results_records = deep_sort(
        sorted(
            crawl_result['results_data'],
            key=lambda result: first_title(result['record']),
        )
    )
    expected_results = deep_sort(sorted(expected_results, key=first_title))

    gotten_results = [
        override_generated_fields(result['record'])
        for result in results_records
    ]
    expected_results = [
        override_generated_fields(expected) for expected in expected_results
    ]

    assert gotten_results == expected_results
    assert not crawl_result['errors']

    # Second crawl: nothing new to harvest, so no results are expected.
    # NOTE(review): unlike the first call (and test_desy_crawl_twice), this
    # call passes no events_limit — confirm whether that is intentional.
    crawl_results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=20,
        crawler_instance=crawler,
        project=env.get('CRAWLER_PROJECT'),
        spider='CDS',
        settings={},
        **env.get('CRAWLER_ARGUMENTS')
    )

    assert len(crawl_results) == 0
# Example 8
def test_deep_sort_with_query_parser_output():
    """deep_sort must order the nested ``should`` list inside a query tree.

    Bug fix: the original called ``deep_sort(expected_element)`` — sorting the
    already-sorted expectation and comparing it with itself — so ``element``
    was unused and the test could never fail. Sort ``element`` instead.
    """
    element = {
        "bool": {
            "filter": {
                "bool": {
                    "should": [
                        {
                            "term": {
                                "authors.name_variations": "j ellis"
                            }
                        },
                        {
                            "term": {
                                "authors.name_variations": "ellis j"
                            }
                        }
                    ]
                }
            },
            "must": {
                "match": {
                    "authors.full_name": "ellis, j"
                }
            }
        }
    }

    expected_element = {
        "bool": {
            "filter": {
                "bool": {
                    "should": [
                        {
                            "term": {
                                "authors.name_variations": "ellis j"
                            }
                        },
                        {
                            "term": {
                                "authors.name_variations": "j ellis"
                            }
                        }
                    ]
                }
            },
            "must": {
                "match": {
                    "authors.full_name": "ellis, j"
                }
            }
        }
    }

    result = deep_sort(element)
    assert result == expected_element
# Example 9
def test_arxiv(
    expected_results,
    config,
    spider,
):
    """A single arXiv crawl run must produce the fixture records with no errors."""
    crawler = get_crawler_instance(config['CRAWLER_HOST_URL'])

    crawl_results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=100,
        events_limit=1,
        crawler_instance=crawler,
        project=config['CRAWLER_PROJECT'],
        spider=spider,
        settings={},
        **config['CRAWLER_ARGUMENTS']
    )

    assert len(crawl_results) == 1

    crawl_result = crawl_results[0]

    # Normalize volatile generated fields on both sides, then deep-sort so
    # nested ordering differences cannot fail the comparison.
    gotten_results = deep_sort([
        override_generated_fields(result['record'])
        for result in crawl_result['results_data']
    ])
    expected_results = deep_sort([
        override_generated_fields(record) for record in expected_results
    ])

    assert gotten_results == expected_results
    assert not crawl_result['errors']
# Example 10
def test_cds(
    expected_results,
    config,
    spider,
):
    """A single CDS crawl run must produce the fixture records with no errors."""
    crawler = get_crawler_instance(config['CRAWLER_HOST_URL'])

    crawl_results = CeleryMonitor.do_crawl(
        app=celery_app,
        monitor_timeout=5,
        monitor_iter_limit=100,
        events_limit=1,
        crawler_instance=crawler,
        project=config['CRAWLER_PROJECT'],
        spider=spider,
        settings={},
        **config['CRAWLER_ARGUMENTS']
    )

    assert len(crawl_results) == 1

    crawl_result = crawl_results[0]

    # Normalize volatile generated fields on both sides, then deep-sort so
    # nested ordering differences cannot fail the comparison.
    gotten_results = deep_sort([
        override_generated_fields(result['record'])
        for result in crawl_result['results_data']
    ])
    expected_results = deep_sort([
        override_generated_fields(record) for record in expected_results
    ])

    assert gotten_results == expected_results
    assert not crawl_result['errors']
# Example 11
def test_deep_sort_with_dict_in_list():
    """deep_sort must order a list of single-key dicts deterministically.

    Bug fix: the original called ``deep_sort(expected_element)`` — sorting the
    already-sorted expectation and comparing it with itself — so ``element``
    was unused and the test could never fail. Sort ``element`` instead.
    """
    element = [{'b': {'name': 'bb'}}, {'a': {'name': 'aa'}}]
    expected_element = [{'a': {'name': 'aa'}}, {'b': {'name': 'bb'}}]

    result = deep_sort(element)
    assert result == expected_element