예제 #1
0
def test_upload_file(inspire_app, s3):
    create_s3_bucket(KEY)
    filename = "file.txt"
    mimetype = "text/*"
    acl = "public-read"
    expected_content = "This is a demo file\n"
    record_fixture_path = pkg_resources.resource_filename(
        __name__, os.path.join("fixtures", "file.txt")
    )
    with open(record_fixture_path, "rb") as data:
        current_s3_instance.upload_file(data, KEY, filename, mimetype, acl)
    result = current_s3_instance.client.head_object(
        Bucket=current_s3_instance.get_bucket_for_file_key(KEY), Key=KEY
    )
    assert result["ContentDisposition"] == f'inline; filename="{filename}"'
    assert result["ContentType"] == mimetype
    content = (
        current_s3_instance.resource.Object(
            current_s3_instance.get_bucket_for_file_key(KEY), KEY
        )
        .get()["Body"]
        .read()
        .decode("utf-8")
    )
    assert content == expected_content
예제 #2
0
def test_delete_file(inspire_app, s3):
    create_s3_bucket(KEY)
    create_s3_file(current_s3_instance.get_bucket_for_file_key(KEY), KEY,
                   "this is my data")
    current_s3_instance.delete_file(KEY)
    with pytest.raises(ClientError):
        current_s3_instance.client.head_object(
            Bucket=current_s3_instance.get_bucket_for_file_key(KEY), Key=KEY)
예제 #3
0
파일: es.py 프로젝트: inspirehep/inspirehep
 def get_documents_with_fulltext(self, record_data):
     documents = record_data.get("documents", [])
     for document in documents:
         if (not document.get("hidden")
                 and document.get("filename", "").endswith("pdf")
                 and document.get("fulltext")
             ) or document.get("source") == "arxiv":
             try:
                 key = document["key"]
                 bucket = current_s3_instance.get_bucket_for_file_key(key)
                 file_data = current_s3_instance.client.get_object(
                     Bucket=bucket, Key=key)
                 mimetype = file_data.get("ContentType", "")
                 body = file_data["Body"]
                 if mimetype != "application/pdf":
                     continue
                 encoded_file = base64.b64encode(
                     body.read()).decode("ascii")
                 document["text"] = encoded_file
             except current_s3_instance.client.exceptions.NoSuchKey:
                 LOGGER.error(
                     "File was not found for the given url",
                     document_url=document["url"],
                     record_control_number=record_data["control_number"],
                 )
                 continue
     return documents
예제 #4
0
def test_file_exists_when_file_is_there(inspire_app, s3):
    expected_result = True
    create_s3_bucket(KEY)
    create_s3_file(current_s3_instance.get_bucket_for_file_key(KEY), KEY,
                   "this is my data")
    result = current_s3_instance.file_exists(KEY)
    assert result == expected_result
예제 #5
0
def test_replace_file_metadata(inspire_app, s3):
    metadata = {"foo": "bar"}
    create_s3_bucket(KEY)
    create_s3_file(
        current_s3_instance.get_bucket_for_file_key(KEY),
        KEY,
        "this is my data",
        metadata,
    )
    filename = "file.txt"
    mimetype = "text/*"
    acl = "public-read"
    current_s3_instance.replace_file_metadata(KEY, filename, mimetype, acl)
    result = current_s3_instance.client.head_object(
        Bucket=current_s3_instance.get_bucket_for_file_key(KEY), Key=KEY)
    assert result["ContentDisposition"] == f'inline; filename="{filename}"'
    assert result["ContentType"] == mimetype
    assert result["Metadata"] == {}
예제 #6
0
def test_get_file_metadata(inspire_app, s3):
    expected_metadata = {"foo": "bar"}
    create_s3_bucket(KEY)
    create_s3_file(
        current_s3_instance.get_bucket_for_file_key(KEY),
        KEY,
        "this is my data",
        expected_metadata,
    )
    metadata = current_s3_instance.get_file_metadata(KEY)["Metadata"]
    assert metadata == expected_metadata
예제 #7
0
def test_index_record_fulltext_manually(inspire_app, clean_celery_session,
                                        override_config, s3, datadir):
    metadata = {"foo": "bar"}
    pdf_path = os.path.join(datadir, "2206.04407.pdf")
    create_s3_bucket(KEY)
    create_s3_file(
        current_s3_instance.get_bucket_for_file_key(KEY),
        KEY,
        pdf_path,
        metadata,
        **{"ContentType": "application/pdf"},
    )

    with override_config(FEATURE_FLAG_ENABLE_FULLTEXT=True,
                         FEATURE_FLAG_ENABLE_FILES=False):
        data = faker.record("lit")
        data.update({
            "documents": [{
                "source":
                "arxiv",
                "fulltext":
                True,
                "filename":
                "new_doc.pdf",
                "key":
                KEY,
                "url":
                "http://www.africau.edu/images/default/sample.pdf",
            }]
        })
        rec = LiteratureRecord.create(data)
        models_committed.disconnect(index_after_commit)
        db.session.commit()
        models_committed.connect(index_after_commit)

        assert_record_not_in_es(rec["control_number"])

        rec.index_fulltext()

        def assert_record_in_es():
            current_search.flush_and_refresh("*")
            record_lit_es = (LiteratureSearch().get_record(str(
                rec.id)).execute().hits.hits[0])
            document = record_lit_es._source["documents"][0]
            assert "attachment" in document
            assert "text" not in document  # pipeline should remove it

        retry_until_pass(assert_record_in_es, timeout=90, retry_interval=5)
예제 #8
0
def test_fulltext_indexer_removes_deleted_from_es(inspire_app, override_config,
                                                  clean_celery_session, s3):
    metadata = {"foo": "bar"}
    create_s3_bucket(KEY)
    create_s3_file(
        current_s3_instance.get_bucket_for_file_key(KEY),
        KEY,
        "this is my data",
        metadata,
    )
    with override_config(FEATURE_FLAG_ENABLE_FULLTEXT=True):
        lit_record = LiteratureRecord.create(
            faker.record(
                "lit",
                data={
                    "documents": [{
                        "fulltext":
                        True,
                        "hidden":
                        False,
                        "key":
                        KEY,
                        "filename":
                        "2105.15193.pdf",
                        "url":
                        "https://arxiv.org/pdf/2105.15193.pdf",
                    }]
                },
            ))
        db.session.commit()

        def assert_records_in_es():
            lit_record_from_es = LiteratureSearch.get_record_data_from_es(
                lit_record)

            assert lit_record_from_es

        retry_until_pass(assert_records_in_es, retry_interval=5)

        lit_record.delete()
        db.session.commit()

        assert_record_not_in_es(lit_record["control_number"])
예제 #9
0
def test_get_bucket(inspire_app, s3):
    key = "e50c2ea2d26571e0c5a3411e320586289fd715c2"
    expected_result = "inspire-files-e"
    result = current_s3_instance.get_bucket_for_file_key(key)
    assert result == expected_result
예제 #10
0
def test_index_records_batch_fulltext_manually(inspire_app,
                                               clean_celery_session,
                                               override_config, s3):
    metadata = {"foo": "bar"}
    key_2 = "9bfe422f251eeaa7ec2a4dd5aebebc8a"
    key_3 = "e5892c4e59898346d307332354c6c7b8"
    create_s3_bucket(KEY)
    create_s3_file(
        current_s3_instance.get_bucket_for_file_key(KEY),
        KEY,
        "this is my data",
        metadata,
    )

    create_s3_bucket(key_2)
    create_s3_file(
        current_s3_instance.get_bucket_for_file_key(key_2),
        key_2,
        "this is my data",
        metadata,
    )

    create_s3_bucket(key_3)
    create_s3_file(
        current_s3_instance.get_bucket_for_file_key(key_3),
        key_3,
        "this is my data",
        metadata,
    )

    with override_config(FEATURE_FLAG_ENABLE_FULLTEXT=True,
                         FEATURE_FLAG_ENABLE_FILES=False):
        lit_record = LiteratureRecord.create(
            faker.record(
                "lit",
                data={
                    "documents": [{
                        "fulltext":
                        True,
                        "hidden":
                        False,
                        "key":
                        KEY,
                        "filename":
                        "2105.15193.pdf",
                        "url":
                        "https://arxiv.org/pdf/2105.15193.pdf",
                    }]
                },
            ))
        lit_record_2 = LiteratureRecord.create(
            faker.record(
                "lit",
                data={
                    "documents": [{
                        "fulltext":
                        True,
                        "hidden":
                        False,
                        "filename":
                        "new_doc.pdf",
                        "key":
                        key_2,
                        "url":
                        "http://www.africau.edu/images/default/sample.pdf",
                    }]
                },
            ))
        db.session.commit()

        def assert_records_in_es():
            lit_record_from_es = LiteratureSearch.get_record_data_from_es(
                lit_record)
            lit_record_from_es_2 = LiteratureSearch.get_record_data_from_es(
                lit_record_2)
            assert lit_record_from_es and lit_record_from_es_2

        retry_until_pass(assert_records_in_es, retry_interval=5)

        models_committed.disconnect(index_after_commit)
        lit_record["documents"].append(
            {
                "source": "arxiv",
                "fulltext": True,
                "filename": "another_doc.pdf",
                "key": key_3,
                "url": "http://www.africau.edu/images/default/sample.pdf",
            }, )
        lit_record.update(dict(lit_record))
        db.session.commit()
        # reconnect signal before we call process_references_in_records
        models_committed.connect(index_after_commit)
        task = batch_index_literature_fulltext.delay(
            [lit_record.id, lit_record_2.id])
        task.get(timeout=5)

        assert task.result == {
            "uuids": [str(lit_record.id),
                      str(lit_record_2.id)],
            "success_count": 2,
            "failures_count": 0,
            "failures": [],
        }