def test_file_exists_when_file_is_there(inspire_app, s3):
    expected_result = True
    create_s3_bucket(KEY)
    create_s3_file(
        current_s3_instance.get_bucket_for_file_key(KEY), KEY, "this is my data"
    )

    result = current_s3_instance.file_exists(KEY)

    assert result == expected_result

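# Hedged sketch, not the shared helpers used above: create_s3_bucket and
# create_s3_file come from the test utilities, and something roughly equivalent
# to the following boto3 calls is assumed (names, signatures and defaults here
# are illustrative only).
def _sketch_create_s3_bucket(key):
    current_s3_instance.client.create_bucket(
        Bucket=current_s3_instance.get_bucket_for_file_key(key)
    )


def _sketch_create_s3_file(bucket, key, data, metadata=None, **kwargs):
    current_s3_instance.client.put_object(
        Bucket=bucket, Key=key, Body=data, Metadata=metadata or {}, **kwargs
    )
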
def test_delete_file(inspire_app, s3):
    create_s3_bucket(KEY)
    create_s3_file(
        current_s3_instance.get_bucket_for_file_key(KEY), KEY, "this is my data"
    )

    current_s3_instance.delete_file(KEY)

    with pytest.raises(ClientError):
        current_s3_instance.client.head_object(
            Bucket=current_s3_instance.get_bucket_for_file_key(KEY), Key=KEY
        )

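# Hedged sketch of the proxy methods exercised above: file_exists is assumed to
# probe the object with head_object and treat ClientError as "missing", and
# delete_file to call delete_object. Illustrative only; the real methods live
# on current_s3_instance.
def _sketch_file_exists(key):
    try:
        current_s3_instance.client.head_object(
            Bucket=current_s3_instance.get_bucket_for_file_key(key), Key=key
        )
        return True
    except ClientError:
        return False


def _sketch_delete_file(key):
    current_s3_instance.client.delete_object(
        Bucket=current_s3_instance.get_bucket_for_file_key(key), Key=key
    )
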
def test_get_file_metadata(inspire_app, s3):
    expected_metadata = {"foo": "bar"}
    create_s3_bucket(KEY)
    create_s3_file(
        current_s3_instance.get_bucket_for_file_key(KEY),
        KEY,
        "this is my data",
        expected_metadata,
    )

    metadata = current_s3_instance.get_file_metadata(KEY)["Metadata"]

    assert metadata == expected_metadata

def test_index_record_fulltext_manually(
    inspire_app, clean_celery_session, override_config, s3, datadir
):
    metadata = {"foo": "bar"}
    pdf_path = os.path.join(datadir, "2206.04407.pdf")
    create_s3_bucket(KEY)
    create_s3_file(
        current_s3_instance.get_bucket_for_file_key(KEY),
        KEY,
        pdf_path,
        metadata,
        **{"ContentType": "application/pdf"},
    )

    with override_config(
        FEATURE_FLAG_ENABLE_FULLTEXT=True, FEATURE_FLAG_ENABLE_FILES=False
    ):
        data = faker.record("lit")
        data.update(
            {
                "documents": [
                    {
                        "source": "arxiv",
                        "fulltext": True,
                        "filename": "new_doc.pdf",
                        "key": KEY,
                        "url": "http://www.africau.edu/images/default/sample.pdf",
                    }
                ]
            }
        )
        rec = LiteratureRecord.create(data)
        models_committed.disconnect(index_after_commit)
        db.session.commit()
        models_committed.connect(index_after_commit)
        assert_record_not_in_es(rec["control_number"])

        rec.index_fulltext()

        def assert_record_in_es():
            current_search.flush_and_refresh("*")
            record_lit_es = (
                LiteratureSearch().get_record(str(rec.id)).execute().hits.hits[0]
            )
            document = record_lit_es._source["documents"][0]
            assert "attachment" in document
            assert "text" not in document  # pipeline should remove it

        retry_until_pass(assert_record_in_es, timeout=90, retry_interval=5)

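# Hedged sketch of retry_until_pass, the polling helper used by the fulltext
# tests above and below: it is assumed to re-run the assertion function until
# it stops raising AssertionError or the timeout elapses. Name and defaults
# here are illustrative only.
def _sketch_retry_until_pass(assert_function, timeout=30, retry_interval=1):
    import time

    deadline = time.time() + timeout
    while True:
        try:
            return assert_function()
        except AssertionError:
            if time.time() >= deadline:
                raise
            time.sleep(retry_interval)
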
def test_replace_file_metadata(inspire_app, s3):
    metadata = {"foo": "bar"}
    create_s3_bucket(KEY)
    create_s3_file(
        current_s3_instance.get_bucket_for_file_key(KEY),
        KEY,
        "this is my data",
        metadata,
    )

    filename = "file.txt"
    mimetype = "text/*"
    acl = "public-read"
    current_s3_instance.replace_file_metadata(KEY, filename, mimetype, acl)

    result = current_s3_instance.client.head_object(
        Bucket=current_s3_instance.get_bucket_for_file_key(KEY), Key=KEY
    )

    assert result["ContentDisposition"] == f'inline; filename="{filename}"'
    assert result["ContentType"] == mimetype
    assert result["Metadata"] == {}

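# Why Metadata comes back empty above: S3 cannot edit object metadata in place,
# so replace_file_metadata is assumed to copy the object onto itself with
# MetadataDirective="REPLACE", which drops any user metadata that is not
# re-supplied. Hedged sketch of such a call, illustrative only:
def _sketch_replace_file_metadata(key, filename, mimetype, acl):
    bucket = current_s3_instance.get_bucket_for_file_key(key)
    current_s3_instance.client.copy_object(
        Bucket=bucket,
        Key=key,
        CopySource={"Bucket": bucket, "Key": key},
        ContentDisposition=f'inline; filename="{filename}"',
        ContentType=mimetype,
        ACL=acl,
        MetadataDirective="REPLACE",
    )
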
def test_fulltext_indexer_removes_deleted_from_es(
    inspire_app, override_config, clean_celery_session, s3
):
    metadata = {"foo": "bar"}
    create_s3_bucket(KEY)
    create_s3_file(
        current_s3_instance.get_bucket_for_file_key(KEY),
        KEY,
        "this is my data",
        metadata,
    )

    with override_config(FEATURE_FLAG_ENABLE_FULLTEXT=True):
        lit_record = LiteratureRecord.create(
            faker.record(
                "lit",
                data={
                    "documents": [
                        {
                            "fulltext": True,
                            "hidden": False,
                            "key": KEY,
                            "filename": "2105.15193.pdf",
                            "url": "https://arxiv.org/pdf/2105.15193.pdf",
                        }
                    ]
                },
            )
        )
        db.session.commit()

        def assert_records_in_es():
            lit_record_from_es = LiteratureSearch.get_record_data_from_es(lit_record)
            assert lit_record_from_es

        retry_until_pass(assert_records_in_es, retry_interval=5)

        lit_record.delete()
        db.session.commit()

        assert_record_not_in_es(lit_record["control_number"])

def test_index_records_batch_fulltext_manually(
    inspire_app, clean_celery_session, override_config, s3
):
    metadata = {"foo": "bar"}
    key_2 = "9bfe422f251eeaa7ec2a4dd5aebebc8a"
    key_3 = "e5892c4e59898346d307332354c6c7b8"
    create_s3_bucket(KEY)
    create_s3_file(
        current_s3_instance.get_bucket_for_file_key(KEY),
        KEY,
        "this is my data",
        metadata,
    )
    create_s3_bucket(key_2)
    create_s3_file(
        current_s3_instance.get_bucket_for_file_key(key_2),
        key_2,
        "this is my data",
        metadata,
    )
    create_s3_bucket(key_3)
    create_s3_file(
        current_s3_instance.get_bucket_for_file_key(key_3),
        key_3,
        "this is my data",
        metadata,
    )

    with override_config(
        FEATURE_FLAG_ENABLE_FULLTEXT=True, FEATURE_FLAG_ENABLE_FILES=False
    ):
        lit_record = LiteratureRecord.create(
            faker.record(
                "lit",
                data={
                    "documents": [
                        {
                            "fulltext": True,
                            "hidden": False,
                            "key": KEY,
                            "filename": "2105.15193.pdf",
                            "url": "https://arxiv.org/pdf/2105.15193.pdf",
                        }
                    ]
                },
            )
        )
        lit_record_2 = LiteratureRecord.create(
            faker.record(
                "lit",
                data={
                    "documents": [
                        {
                            "fulltext": True,
                            "hidden": False,
                            "filename": "new_doc.pdf",
                            "key": key_2,
                            "url": "http://www.africau.edu/images/default/sample.pdf",
                        }
                    ]
                },
            )
        )
        db.session.commit()

        def assert_records_in_es():
            lit_record_from_es = LiteratureSearch.get_record_data_from_es(lit_record)
            lit_record_from_es_2 = LiteratureSearch.get_record_data_from_es(
                lit_record_2
            )
            assert lit_record_from_es and lit_record_from_es_2

        retry_until_pass(assert_records_in_es, retry_interval=5)

        models_committed.disconnect(index_after_commit)
        lit_record["documents"].append(
            {
                "source": "arxiv",
                "fulltext": True,
                "filename": "another_doc.pdf",
                "key": key_3,
                "url": "http://www.africau.edu/images/default/sample.pdf",
            },
        )
        lit_record.update(dict(lit_record))
        db.session.commit()
        # reconnect signal before running the batch fulltext indexing task
        models_committed.connect(index_after_commit)

        task = batch_index_literature_fulltext.delay([lit_record.id, lit_record_2.id])
        task.get(timeout=5)

        assert task.result == {
            "uuids": [str(lit_record.id), str(lit_record_2.id)],
            "success_count": 2,
            "failures_count": 0,
            "failures": [],
        }