Пример #1
0
def test_file_exists_when_file_is_there(inspire_app, s3):
    """Check that ``file_exists`` is truthy-equal to ``True`` for ``KEY``.

    The bucket for ``KEY`` is created and a file is stored under ``KEY``
    through the ``create_s3_file`` helper before the lookup is made.
    """
    create_s3_bucket(KEY)
    target_bucket = current_s3_instance.get_bucket_for_file_key(KEY)
    create_s3_file(target_bucket, KEY, "this is my data")

    exists = current_s3_instance.file_exists(KEY)

    expected = True
    assert exists == expected
Пример #2
0
def test_delete_file(inspire_app, s3):
    """Check that ``head_object`` raises ``ClientError`` after ``delete_file``.

    A file is first stored under ``KEY``; once ``delete_file(KEY)`` has been
    called, asking the raw client for the object's head is expected to fail.
    """
    create_s3_bucket(KEY)
    target_bucket = current_s3_instance.get_bucket_for_file_key(KEY)
    create_s3_file(target_bucket, KEY, "this is my data")

    current_s3_instance.delete_file(KEY)

    with pytest.raises(ClientError):
        # The bucket lookup stays inside the context manager so that the
        # whole lookup-and-head sequence is covered by the expectation.
        head_object = current_s3_instance.client.head_object
        head_object(
            Bucket=current_s3_instance.get_bucket_for_file_key(KEY),
            Key=KEY,
        )
Пример #3
0
def test_get_file_metadata(inspire_app, s3):
    """Check that ``get_file_metadata`` returns the metadata given at creation.

    The ``"Metadata"`` entry of the response must equal the dict that was
    passed to ``create_s3_file``.
    """
    create_s3_bucket(KEY)
    target_bucket = current_s3_instance.get_bucket_for_file_key(KEY)
    stored_metadata = {"foo": "bar"}
    create_s3_file(target_bucket, KEY, "this is my data", stored_metadata)

    file_info = current_s3_instance.get_file_metadata(KEY)

    assert file_info["Metadata"] == stored_metadata
Пример #4
0
def test_index_record_fulltext_manually(inspire_app, clean_celery_session,
                                        override_config, s3, datadir):
    """Check that ``index_fulltext`` puts a record with an attachment into ES.

    The record is committed while ``index_after_commit`` is detached from
    ``models_committed``, and the test verifies it is absent from ES at that
    point. ``rec.index_fulltext()`` is then called explicitly, and the indexed
    document must carry an ``attachment`` field and no ``text`` field.
    """
    pdf_path = os.path.join(datadir, "2206.04407.pdf")
    create_s3_bucket(KEY)
    # NOTE(review): the path string itself is handed over as the file
    # payload; confirm that ``create_s3_file`` opens it rather than storing
    # the literal path.
    create_s3_file(
        current_s3_instance.get_bucket_for_file_key(KEY),
        KEY,
        pdf_path,
        {"foo": "bar"},
        ContentType="application/pdf",
    )

    with override_config(FEATURE_FLAG_ENABLE_FULLTEXT=True,
                         FEATURE_FLAG_ENABLE_FILES=False):
        data = faker.record("lit")
        data.update({
            "documents": [{
                "source": "arxiv",
                "fulltext": True,
                "filename": "new_doc.pdf",
                "key": KEY,
                "url": "http://www.africau.edu/images/default/sample.pdf",
            }]
        })
        rec = LiteratureRecord.create(data)

        # Commit with the signal handler detached. The ``finally`` guarantees
        # the handler is re-attached even if the commit raises, so a failure
        # here cannot leave it disconnected for subsequent tests.
        models_committed.disconnect(index_after_commit)
        try:
            db.session.commit()
        finally:
            models_committed.connect(index_after_commit)

        assert_record_not_in_es(rec["control_number"])

        rec.index_fulltext()

        def assert_record_in_es():
            current_search.flush_and_refresh("*")
            search = LiteratureSearch().get_record(str(rec.id))
            hit = search.execute().hits.hits[0]
            document = hit._source["documents"][0]
            assert "attachment" in document
            assert "text" not in document  # pipeline should remove it

        retry_until_pass(assert_record_in_es, timeout=90, retry_interval=5)
Пример #5
0
def test_replace_file_metadata(inspire_app, s3):
    """Check the object headers after ``replace_file_metadata``.

    A file is created with custom metadata; after the replacement call the
    head of the object must show the new content disposition and content
    type, and an empty ``Metadata`` mapping.
    """
    create_s3_bucket(KEY)
    target_bucket = current_s3_instance.get_bucket_for_file_key(KEY)
    create_s3_file(target_bucket, KEY, "this is my data", {"foo": "bar"})

    filename = "file.txt"
    mimetype = "text/*"
    current_s3_instance.replace_file_metadata(KEY, filename, mimetype,
                                              "public-read")

    head = current_s3_instance.client.head_object(
        Bucket=current_s3_instance.get_bucket_for_file_key(KEY),
        Key=KEY,
    )

    assert head["ContentDisposition"] == f'inline; filename="{filename}"'
    assert head["ContentType"] == mimetype
    assert head["Metadata"] == {}
Пример #6
0
def test_fulltext_indexer_removes_deleted_from_es(inspire_app, override_config,
                                                  clean_celery_session, s3):
    """Check that a deleted literature record disappears from ES.

    With ``FEATURE_FLAG_ENABLE_FULLTEXT`` on, a record with one fulltext
    document is created and committed; once it is retrievable from ES it is
    deleted, and the test asserts it is no longer in ES.
    """
    create_s3_bucket(KEY)
    target_bucket = current_s3_instance.get_bucket_for_file_key(KEY)
    create_s3_file(target_bucket, KEY, "this is my data", {"foo": "bar"})

    fulltext_document = {
        "fulltext": True,
        "hidden": False,
        "key": KEY,
        "filename": "2105.15193.pdf",
        "url": "https://arxiv.org/pdf/2105.15193.pdf",
    }

    with override_config(FEATURE_FLAG_ENABLE_FULLTEXT=True):
        record_data = faker.record("lit",
                                   data={"documents": [fulltext_document]})
        record = LiteratureRecord.create(record_data)
        db.session.commit()

        def record_is_in_es():
            from_es = LiteratureSearch.get_record_data_from_es(record)
            assert from_es

        # Retry until the record is retrievable from ES before deleting it.
        retry_until_pass(record_is_in_es, retry_interval=5)

        record.delete()
        db.session.commit()

        assert_record_not_in_es(record["control_number"])
Пример #7
0
def test_index_records_batch_fulltext_manually(inspire_app,
                                               clean_celery_session,
                                               override_config, s3):
    """Check the result of ``batch_index_literature_fulltext`` for two records.

    Two literature records with one fulltext document each are created and
    awaited in ES. A further document is then appended to the first record
    and committed while ``index_after_commit`` is detached from
    ``models_committed``. Finally the batch task is run for both records and
    its result must report two successes and no failures.
    """
    metadata = {"foo": "bar"}
    key_2 = "9bfe422f251eeaa7ec2a4dd5aebebc8a"
    key_3 = "e5892c4e59898346d307332354c6c7b8"

    # One bucket and one file per key used by the documents below.
    for file_key in (KEY, key_2, key_3):
        create_s3_bucket(file_key)
        create_s3_file(
            current_s3_instance.get_bucket_for_file_key(file_key),
            file_key,
            "this is my data",
            metadata,
        )

    with override_config(FEATURE_FLAG_ENABLE_FULLTEXT=True,
                         FEATURE_FLAG_ENABLE_FILES=False):
        lit_record = LiteratureRecord.create(
            faker.record(
                "lit",
                data={
                    "documents": [{
                        "fulltext": True,
                        "hidden": False,
                        "key": KEY,
                        "filename": "2105.15193.pdf",
                        "url": "https://arxiv.org/pdf/2105.15193.pdf",
                    }]
                },
            ))
        lit_record_2 = LiteratureRecord.create(
            faker.record(
                "lit",
                data={
                    "documents": [{
                        "fulltext": True,
                        "hidden": False,
                        "filename": "new_doc.pdf",
                        "key": key_2,
                        "url":
                        "http://www.africau.edu/images/default/sample.pdf",
                    }]
                },
            ))
        db.session.commit()

        def assert_records_in_es():
            lit_record_from_es = LiteratureSearch.get_record_data_from_es(
                lit_record)
            lit_record_from_es_2 = LiteratureSearch.get_record_data_from_es(
                lit_record_2)
            # Separate assertions so a failure shows which record is missing.
            assert lit_record_from_es
            assert lit_record_from_es_2

        retry_until_pass(assert_records_in_es, retry_interval=5)

        # Update the first record with the signal handler detached. The
        # ``finally`` guarantees the handler is re-attached even if the
        # update or commit raises, so it cannot stay disconnected for the
        # tests that run afterwards.
        models_committed.disconnect(index_after_commit)
        try:
            lit_record["documents"].append({
                "source": "arxiv",
                "fulltext": True,
                "filename": "another_doc.pdf",
                "key": key_3,
                "url": "http://www.africau.edu/images/default/sample.pdf",
            })
            lit_record.update(dict(lit_record))
            db.session.commit()
        finally:
            # Signal is reconnected before batch_index_literature_fulltext
            # is invoked.
            models_committed.connect(index_after_commit)

        task = batch_index_literature_fulltext.delay(
            [lit_record.id, lit_record_2.id])
        task.get(timeout=5)

        assert task.result == {
            "uuids": [str(lit_record.id),
                      str(lit_record_2.id)],
            "success_count": 2,
            "failures_count": 0,
            "failures": [],
        }