def test_basic_search(app, db, es):
    """Test basic search functionality."""
    # The index should be empty
    assert len(ItemSearch().execute()) == 0

    # Create item1, search for everything
    item1 = Item.create({})
    item1.commit()
    record_indexer = RecordIndexer()
    record_indexer.index(item1)
    current_search.flush_and_refresh('_all')
    assert len(ItemSearch().execute()) == 1

    # Create item2, search for everything again
    item2 = Item.create({'foo': 'bar'})
    item2.commit()
    record_indexer.index(item2)
    current_search.flush_and_refresh('_all')
    assert len(ItemSearch().execute()) == 2

    # Search for item2
    assert len(ItemSearch().query('match', foo='bar').execute()) == 1

    # Search for nonsense
    assert len(ItemSearch().query('match', foo='banana').execute()) == 0

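# A hedged sketch of what the ``ItemSearch`` helper used above usually
# looks like, assuming invenio-search's ``RecordsSearch`` base class;
# the index name is an illustrative assumption.
from invenio_search import RecordsSearch


class ItemSearch(RecordsSearch):
    """Search for items in the ``items`` index."""

    class Meta:
        index = 'items'
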
def index_after_commit(sender, changes):
    """Index a record in ES after it was committed to the DB.

    This cannot happen in an ``after_record_commit`` receiver from
    Invenio-Records because, despite the name, at that point we are not
    yet sure whether the record has been really committed to the DB.
    """
    indexer = RecordIndexer()
    for model_instance, change in changes:
        if isinstance(model_instance, RecordMetadata):
            if change in ('insert', 'update') and \
                    not model_instance.json.get("deleted"):
                if hasattr(model_instance, '_enhanced_record'):
                    record = model_instance._enhanced_record
                else:
                    record = model_instance.json
                indexer.index(InspireRecord(record, model_instance))
            else:
                try:
                    indexer.delete(InspireRecord(
                        model_instance.json, model_instance))
                except NotFoundError:
                    # Record not found in ES
                    LOGGER.debug('Record %s not found in ES',
                                 model_instance.json.get("id"))

            pid_type = get_pid_type_from_schema(model_instance.json['$schema'])
            pid_value = model_instance.json['control_number']
            db_version = model_instance.version_id
            index_modified_citations_from_record.delay(
                pid_type, pid_value, db_version)

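# A minimal sketch of how a receiver like ``index_after_commit`` is
# typically wired up, assuming Flask-SQLAlchemy is in use: the
# ``models_committed`` signal fires once per session commit with a list
# of ``(model_instance, operation)`` pairs, which is exactly the
# ``changes`` argument the receiver expects.
from flask_sqlalchemy import models_committed

models_committed.connect(index_after_commit)
# To detach the receiver again (e.g. in tests):
# models_committed.disconnect(index_after_commit)
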
def load_records(app, filename, schema, tries=5):
    """Try to index records."""
    indexer = RecordIndexer()
    records = []
    with app.app_context():
        with mock.patch('invenio_records.api.Record.validate',
                        return_value=None):
            data_filename = pkg_resources.resource_filename(
                'invenio_records', filename)
            records_data = load(data_filename)
            with db.session.begin_nested():
                for item in records_data:
                    record_id = uuid.uuid4()
                    item_dict = dict(marc21.do(item))
                    item_dict['$schema'] = schema
                    recid_minter(record_id, item_dict)
                    oaiid_minter(record_id, item_dict)
                    record = Record.create(item_dict, id_=record_id)
                    indexer.index(record)
                    records.append(record.id)
            db.session.commit()

        # Wait for indexer to finish
        for i in range(tries):
            response = current_search_client.search()
            if response['hits']['total'] >= len(records):
                break
            current_search.flush_and_refresh('_all')

    return records

def records():
    """Load records."""
    import pkg_resources
    import uuid
    from flask_login import login_user, logout_user
    from dojson.contrib.marc21 import marc21
    from dojson.contrib.marc21.utils import create_record, split_blob
    from invenio_accounts.models import User
    from invenio_deposit.api import Deposit

    users = User.query.all()

    # pkg resources the demodata
    data_path = pkg_resources.resource_filename(
        'invenio_records', 'data/marc21/bibliographic.xml'
    )
    with open(data_path) as source:
        with current_app.test_request_context():
            indexer = RecordIndexer()
            with db.session.begin_nested():
                for index, data in enumerate(split_blob(source.read()),
                                             start=1):
                    login_user(users[index % len(users)])
                    # do translate
                    record = marc21.do(create_record(data))
                    # create record
                    indexer.index(Deposit.create(record))
                    logout_user()
            db.session.commit()

def record_not_yet_deleted(app):
    snippet = (
        '<record>'
        ' <controlfield tag="001">333</controlfield>'
        ' <controlfield tag="005">20160913214552.0</controlfield>'
        ' <datafield tag="980" ind1=" " ind2=" ">'
        '  <subfield code="a">HEP</subfield>'
        ' </datafield>'
        '</record>'
    )

    with app.app_context():
        json_record = hep.do(create_record(snippet))
        json_record['$schema'] = 'http://localhost:5000/schemas/records/hep.json'

        with db.session.begin_nested():
            record = record_upsert(json_record)
            if record:
                ri = RecordIndexer()
                ri.index(record)
        db.session.commit()

    yield

    with app.app_context():
        _delete_record_from_everywhere('literature', 333)

def glossary_terms():
    """Load demo terms records."""
    from invenio_db import db
    from invenio_records import Record
    from invenio_indexer.api import RecordIndexer
    from cernopendata.modules.records.minters.termid import \
        cernopendata_termid_minter

    indexer = RecordIndexer()
    schema = current_app.extensions['invenio-jsonschemas'].path_to_url(
        'records/glossary-term-v1.0.0.json'
    )
    data = pkg_resources.resource_filename('cernopendata',
                                           'modules/fixtures/data')
    glossary_terms_json = glob.glob(os.path.join(data, 'terms', '*.json'))

    for filename in glossary_terms_json:
        click.echo('Loading glossary-terms from {0} ...'.format(filename))
        with open(filename, 'rb') as source:
            for data in json.load(source):
                if "collections" not in data and \
                        not isinstance(
                            data.get("collections", None), basestring):
                    data["collections"] = []
                data["collections"].append({"primary": "Terms"})
                id = uuid.uuid4()
                cernopendata_termid_minter(id, data)
                data['$schema'] = schema
                record = Record.create(data, id_=id)
                db.session.commit()
                indexer.index(record)
                db.session.expunge_all()

def records():
    """Load records."""
    import pkg_resources
    import uuid
    from dojson.contrib.marc21 import marc21
    from dojson.contrib.marc21.utils import create_record, split_blob
    from invenio_pidstore import current_pidstore
    from invenio_records.api import Record

    # pkg resources the demodata
    data_path = pkg_resources.resource_filename(
        'invenio_records', 'data/marc21/bibliographic.xml'
    )
    with open(data_path) as source:
        indexer = RecordIndexer()
        with db.session.begin_nested():
            for index, data in enumerate(split_blob(source.read()),
                                         start=1):
                # create uuid
                rec_uuid = uuid.uuid4()
                # do translate
                record = marc21.do(create_record(data))
                # create PID
                current_pidstore.minters['recid_minter'](
                    rec_uuid, record
                )
                # create record
                indexer.index(Record.create(record, id_=rec_uuid))
        db.session.commit()

def data_policies(skip_files):
    """Load demo Data Policy records."""
    from invenio_db import db
    from invenio_indexer.api import RecordIndexer
    from cernopendata.modules.records.minters.recid import \
        cernopendata_recid_minter
    from invenio_files_rest.models import \
        Bucket, FileInstance, ObjectVersion
    from invenio_records_files.models import RecordsBuckets
    from invenio_records_files.api import Record
    from invenio_records.models import RecordMetadata

    indexer = RecordIndexer()
    schema = current_app.extensions['invenio-jsonschemas'].path_to_url(
        'records/data-policies-v1.0.0.json'
    )
    data = pkg_resources.resource_filename('cernopendata',
                                           'modules/fixtures/data')
    data_policies_json = glob.glob(os.path.join(data, '*.json'))

    for filename in data_policies_json:
        click.echo('Loading data-policies from {0} ...'.format(filename))
        with open(filename, 'rb') as source:
            for data in json.load(source):
                files = data.pop('files', [])

                id = uuid.uuid4()
                cernopendata_recid_minter(id, data)
                data['$schema'] = schema
                record = Record.create(data, id_=id)

                bucket = Bucket.create()
                RecordsBuckets.create(
                    record=record.model, bucket=bucket)

                for file in files:
                    if skip_files:
                        break
                    assert 'uri' in file
                    assert 'size' in file
                    assert 'checksum' in file

                    f = FileInstance.create()
                    filename = file.get("uri").split('/')[-1:][0]
                    f.set_uri(file.get("uri"), file.get("size"),
                              file.get("checksum"))
                    ObjectVersion.create(bucket, filename, _file_id=f.id)
                db.session.commit()
                indexer.index(record)
                db.session.expunge_all()

def receive_after_model_commit(sender, changes):
    """Perform actions after models committed to database."""
    indexer = RecordIndexer()
    for model_instance, change in changes:
        if isinstance(model_instance, RecordMetadata):
            if change in ('insert', 'update'):
                indexer.index(InspireRecord(model_instance.json,
                                            model_instance))
            else:
                indexer.delete(InspireRecord(model_instance.json,
                                             model_instance))

def closed_access_record(db, es, record_with_files_creation):
    """Creation of a full record with closed access right."""
    _, record, record_url = record_with_files_creation
    record['access_right'] = AccessRight.CLOSED
    record.commit()
    db.session.commit()
    indexer = RecordIndexer()
    indexer.index(record)
    current_search.flush_and_refresh(index='records')
    return record

def oaiserver(sets, records):
    """Initialize OAI-PMH server."""
    from invenio_db import db
    from invenio_oaiserver.models import OAISet
    from invenio_records.api import Record

    # create a OAI Set
    with db.session.begin_nested():
        for i in range(sets):
            db.session.add(OAISet(
                spec='test{0}'.format(i),
                name='Test{0}'.format(i),
                description='test desc {0}'.format(i),
                search_pattern='title_statement.title:Test{0}'.format(i),
            ))

    # create a record
    schema = {
        'type': 'object',
        'properties': {
            'title_statement': {
                'type': 'object',
                'properties': {
                    'title': {
                        'type': 'string',
                    },
                },
            },
            'field': {'type': 'boolean'},
        },
    }

    search.client.indices.delete_alias('_all', '_all', ignore=[400, 404])
    search.client.indices.delete('*')

    with app.app_context():
        indexer = RecordIndexer()
        with db.session.begin_nested():
            for i in range(records):
                record_id = uuid.uuid4()
                data = {
                    'title_statement': {'title': 'Test{0}'.format(i)},
                    '$schema': schema,
                }
                recid_minter(record_id, data)
                oaiid_minter(record_id, data)
                record = Record.create(data, id_=record_id)
                indexer.index(record)
        db.session.commit()

def datasets(skip_files):
    """Load demo datasets records."""
    from invenio_db import db
    from invenio_records_files.api import Record
    from invenio_indexer.api import RecordIndexer
    from cernopendata.modules.records.minters.recid import \
        cernopendata_recid_minter
    from cernopendata.modules.records.minters.datasetid import \
        cernopendata_datasetid_minter
    from invenio_files_rest.models import \
        Bucket, FileInstance, ObjectVersion
    from invenio_records_files.models import RecordsBuckets

    indexer = RecordIndexer()
    schema = current_app.extensions['invenio-jsonschemas'].path_to_url(
        'records/datasets-v1.0.0.json')
    data = pkg_resources.resource_filename('cernopendata',
                                           'modules/fixtures/data/datasets')
    datasets_json = glob.glob(os.path.join(data, '*.json'))

    # FIXME: change the treatment of `files` according to `records` fixtures.
    for filename in datasets_json:
        with open(filename, 'rb') as source:
            for data in json.load(source):
                files = data.pop('files', [])

                id = uuid.uuid4()
                # (TOFIX) Remove if statement in production
                # as every dataset record should have a doi
                if data.get('doi', None):
                    cernopendata_datasetid_minter(id, data)
                else:
                    cernopendata_recid_minter(id, data)
                record = Record.create(data, id_=id)
                record['$schema'] = schema

                bucket = Bucket.create()
                RecordsBuckets.create(record=record.model, bucket=bucket)

                for file in files:
                    if skip_files:
                        break
                    assert 'uri' in file
                    assert 'size' in file
                    assert 'checksum' in file

                    f = FileInstance.create()
                    filename = file.get("uri").split('/')[-1:][0]
                    f.set_uri(file.get("uri"), file.get("size"),
                              file.get("checksum"))
                    ObjectVersion.create(bucket, filename, _file_id=f.id)
                db.session.commit()
                indexer.index(record)
                db.session.expunge_all()

def create_record(app, item_dict, mint_oaiid=True):
    """Create test record."""
    indexer = RecordIndexer()
    with app.test_request_context():
        record_id = uuid.uuid4()
        recid_minter(record_id, item_dict)
        if mint_oaiid:
            oaiid_minter(record_id, item_dict)
        record = Record.create(item_dict, id_=record_id)
        indexer.index(record)
    return record

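# A hedged usage sketch for the ``create_record`` helper above; the
# fixtures and the ``title_statement`` payload are illustrative
# assumptions, not part of the original helper.
def test_create_record_is_searchable(app, db, es):
    record = create_record(app, {'title_statement': {'title': 'Test'}})
    db.session.commit()
    current_search.flush_and_refresh('_all')
    # The freshly indexed record should be the only hit.
    response = current_search_client.search()
    assert response['hits']['total'] == 1
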
def _records_create_and_index(db, objs, cls, pid_type):
    """Create records and index."""
    indexer = RecordIndexer()
    recs = []
    for obj in objs:
        record = cls.create(obj)
        mint_record_pid(pid_type, "pid", record)
        record.commit()
        recs.append(record)
    db.session.commit()

    for rec in recs:
        indexer.index(rec)

def indexed_loans(es, test_loans):
    """Get a function to wait for records to be flushed to index."""
    indexer = RecordIndexer()
    for pid, loan in test_loans:
        indexer.index(loan)
    current_search.flush_and_refresh(index="loans")

    yield test_loans

    for pid, loan in test_loans:
        indexer.delete_by_id(loan.id)
    current_search.flush_and_refresh(index="loans")

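# A hedged sketch of a test consuming the ``indexed_loans`` generator
# above, assuming it is registered as a pytest fixture; the query is an
# illustrative assumption.
def test_loans_are_searchable(indexed_loans):
    result = current_search_client.search(index="loans")
    # Every loan indexed by the fixture should be in the "loans" index.
    assert result["hits"]["total"] == len(indexed_loans)
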
def testdata(app, db, es_clear, system_user):
    """Create, index and return test data."""
    indexer = RecordIndexer()

    locations = load_json_from_datadir("locations.json")
    for location in locations:
        record = Location.create(location)
        mint_record_pid(LOCATION_PID_TYPE, "pid", record)
        record.commit()
        db.session.commit()
        indexer.index(record)

    internal_locations = load_json_from_datadir("internal_locations.json")
    for internal_location in internal_locations:
        record = InternalLocation.create(internal_location)
        mint_record_pid(INTERNAL_LOCATION_PID_TYPE, "pid", record)
        record.commit()
        db.session.commit()
        indexer.index(record)

    documents = load_json_from_datadir("documents.json")
    for doc in documents:
        record = Document.create(doc)
        mint_record_pid(DOCUMENT_PID_TYPE, "pid", record)
        record.commit()
        db.session.commit()
        indexer.index(record)

    items = load_json_from_datadir("items.json")
    for item in items:
        record = Item.create(item)
        mint_record_pid(ITEM_PID_TYPE, "pid", record)
        record.commit()
        db.session.commit()
        indexer.index(record)

    loans = load_json_from_datadir("loans.json")
    for loan in loans:
        record = Loan.create(loan)
        mint_record_pid(CIRCULATION_LOAN_PID_TYPE, "pid", record)
        record.commit()
        db.session.commit()
        indexer.index(record)

    # flush all indices after indexing, otherwise ES won't be ready for tests
    current_search.flush_and_refresh(index='*')

    return {
        "locations": locations,
        "documents": documents,
        "items": items,
        "loans": loans,
    }

def create_deposits(app, test_records_data, creator):
    """Create test deposits."""
    DepositInfo = namedtuple('DepositInfo', ['id', 'data', 'deposit'])
    indexer = RecordIndexer()
    with authenticated_user(creator):
        deposits = [Deposit.create(data=data)
                    for data in deepcopy(test_records_data)]
        for deposit in deposits:
            indexer.index(deposit)
            deposit.commit()
    return [DepositInfo(dep.id, dep.dumps(), dep) for dep in deposits]

def _create_and_index_record(record):
    record = Record.create(record)
    inspire_recid_minter(record.id, record)
    # invenio-collections will populate _collections field in record upon
    # commit
    db.session.commit()

    # Record needs to be indexed since views fetch records from ES
    r = RecordIndexer()
    r.index(record)
    es.indices.refresh('records-hep')

    return record

def test_record_can_be_deleted(app, record_not_yet_deleted):
    with app.test_client() as client:
        assert client.get('/api/literature/333').status_code == 200

    record = get_db_record('literature', 333)
    record['deleted'] = True
    record.commit()
    if record:
        ri = RecordIndexer()
        ri.index(record)
    db.session.commit()

    with app.test_client() as client:
        assert client.get('/api/literature/333').status_code == 410

def index_after_commit(sender, changes):
    """Index a record in ES after it was committed to the DB.

    This cannot happen in an ``after_record_commit`` receiver from
    Invenio-Records because, despite the name, at that point we are not
    yet sure whether the record has been really committed to the DB.
    """
    indexer = RecordIndexer()
    for model_instance, change in changes:
        if isinstance(model_instance, RecordMetadata):
            if change in ('insert', 'update'):
                indexer.index(Record(model_instance.json, model_instance))
            else:
                indexer.delete(Record(model_instance.json, model_instance))

def data(datafile):
    """Insert demo data."""
    click.secho("Importing demo data from {}".format(datafile), fg="yellow")

    indexer = RecordIndexer()
    holder = Holder()

    loader = DataLoader(holder)
    loader.load(datafile)
    rec_items = loader.persist()
    for rec in rec_items:
        # TODO: bulk index when we have the queue in k8s deployment
        indexer.index(rec)
    current_search.flush_and_refresh(index="*")

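# A hedged sketch of the "bulk index" TODO above, using invenio-indexer's
# queue-based API: ``bulk_index`` only enqueues record UUIDs, and
# ``process_bulk_queue`` drains the queue through the Elasticsearch bulk
# API. It assumes the indexer message queue is configured (e.g. RabbitMQ
# via Celery), which is the k8s dependency the TODO refers to.
def bulk_reindex(rec_items):
    indexer = RecordIndexer()
    # Enqueue record UUIDs instead of indexing one HTTP request at a time.
    indexer.bulk_index(rec.id for rec in rec_items)
    indexer.process_bulk_queue()
    current_search.flush_and_refresh(index="*")
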
def make_sample_record(db, title, community_id, state='filling',
                       secondary=None):
    rec = {
        'title': title,
        'oarepo:primaryCommunity': community_id,
        'oarepo:recordStatus': state,
        'oarepo:secondaryCommunities': secondary,
        'oarepo:ownedBy': 1
    }
    record_uuid = uuid.uuid4()
    pid = recid_minter(record_uuid, rec)
    rec = TestRecord.create(rec, id_=record_uuid)
    db.session.commit()
    indexer = RecordIndexer()
    indexer.index(rec)
    return PIDRecord(pid, rec)

def index_after_commit(sender, changes):
    """Index records automatically after each modification."""
    indexer = RecordIndexer()
    for model_instance, change in changes:
        if isinstance(model_instance, RecordMetadata):
            if change in ('insert', 'update') and model_instance.json:
                indexer.index(Record(model_instance.json, model_instance))
            else:
                try:
                    indexer.delete(Record(model_instance.json,
                                          model_instance))
                except NotFoundError:
                    # Record not found in ES
                    current_app.logger.warning(
                        'Record with id "%s" not found in ElasticSearch'
                        % model_instance.json.get('control_number'))

def test_publication_date_mapping(db, es, minimal_record):
    """Tests publication_date related fields are indexed properly.

    - Tests jsonschema validates correctly
    - Tests that retrieved record document is fine.

    NOTE:
    - es fixture depends on appctx fixture, so we are in app context
    - this test requires a running ES instance
    """
    # Interval
    minimal_record['publication_date'] = '1939/1945'
    minimal_record['_publication_date_search'] = '1939-01-01'
    record_id = uuid.uuid4()
    current_pidstore.minters['recid_v2'](record_id, minimal_record)
    record = Record.create(minimal_record, id_=record_id)
    db.session.commit()
    indexer = RecordIndexer()

    index_result = indexer.index(record)

    _index = index_result['_index']
    _doc = index_result['_type']
    _id = index_result['_id']
    es_doc = es.get(index=_index, doc_type=_doc, id=_id)
    source = es_doc['_source']
    assert source['publication_date'] == '1939/1945'
    assert source['_publication_date_search'] == '1939-01-01'

def testdata(app, db, es_clear, patrons):
    """Create, index and return test data."""
    data = load_json_from_datadir("locations.json")
    locations = _create_records(db, data, Location, LOCATION_PID_TYPE)

    data = load_json_from_datadir("internal_locations.json")
    int_locs = _create_records(db, data, InternalLocation,
                               INTERNAL_LOCATION_PID_TYPE)

    data = load_json_from_datadir("documents.json")
    documents = _create_records(db, data, Document, DOCUMENT_PID_TYPE)

    data = load_json_from_datadir("series.json")
    series = _create_records(db, data, Series, SERIES_PID_TYPE)

    data = load_json_from_datadir("items.json")
    items = _create_records(db, data, Item, ITEM_PID_TYPE)

    data = load_json_from_datadir("eitems.json")
    eitems = _create_records(db, data, EItem, EITEM_PID_TYPE)

    data = load_json_from_datadir("ill_libraries.json")
    ill_libraries = _create_records(db, data, Provider, PROVIDER_PID_TYPE)

    data = load_json_from_datadir("ill_borrowing_requests.json")
    ill_brw_reqs = _create_records(db, data, BorrowingRequest,
                                   BORROWING_REQUEST_PID_TYPE)

    data = load_json_from_datadir("loans.json")
    loans = _create_records(db, data, Loan, CIRCULATION_LOAN_PID_TYPE)

    # index
    ri = RecordIndexer()
    for rec in (locations + int_locs + series + documents + items +
                eitems + loans + ill_libraries + ill_brw_reqs):
        ri.index(rec)

    current_search.flush_and_refresh(index="*")

    return {
        "documents": documents,
        "eitems": eitems,
        "internal_locations": int_locs,
        "items": items,
        "loans": loans,
        "locations": locations,
        "series": series,
    }

def test_records_can_be_merged(app, records_not_merged_in_marcxml):
    with app.test_client() as client:
        assert client.get('/api/literature/111').status_code == 200
        assert client.get('/api/literature/222').status_code == 200

    record = get_db_record('literature', 222)
    record['deleted'] = True
    record['new_record'] = {'$ref': 'http://localhost:5000/api/record/111'}
    record.commit()
    if record:
        ri = RecordIndexer()
        ri.index(record)
    db.session.commit()

    with app.test_client() as client:
        assert client.get('/api/literature/111').status_code == 200
        assert client.get('/api/literature/222').status_code == 301

def test_before_deposit_index_hook_doesnt_create_new_buckets(
        create_record, db, es):
    deposit = create_record(published=False)
    bucket = Bucket.get(deposit['_buckets']['deposit'])
    obj = ObjectVersion.create(bucket, 'foo.txt')
    stream = BytesIO(b'Hello world!')
    obj.set_contents(stream, size=len(stream.getvalue()),
                     size_limit=bucket.size_limit)
    db.session.commit()

    number_buckets_preindex = len(Bucket.query.all())
    indexer = RecordIndexer()
    indexer.index(deposit)

    assert len(Bucket.query.all()) == number_buckets_preindex

def docs():
    """Load demo article records."""
    from invenio_db import db
    from invenio_records import Record
    from invenio_indexer.api import RecordIndexer
    from cernopendata.modules.records.minters.docid import \
        cernopendata_docid_minter

    indexer = RecordIndexer()
    schema = current_app.extensions['invenio-jsonschemas'].path_to_url(
        'records/docs-v1.0.0.json')
    data = pkg_resources.resource_filename('cernopendata',
                                           'modules/fixtures/data/docs')
    articles_json = get_jsons_from_dir(data)

    for filename in articles_json:
        name = filename.split('/')[-1]
        if name.startswith('opera'):
            click.echo('Skipping opera records ...')
            continue
        with open(filename, 'rb') as source:
            for data in json.load(source):
                # Replace body with responding content
                assert data["body"]["content"]
                content_filename = os.path.join(*([
                    "/",
                ] + filename.split('/')[:-1] + [
                    data["body"]["content"],
                ]))
                with open(content_filename) as body_field:
                    data["body"]["content"] = body_field.read()
                if "collections" not in data and \
                        not isinstance(
                            data.get("collections", None), basestring):
                    data["collections"] = []
                id = uuid.uuid4()
                cernopendata_docid_minter(id, data)
                record = Record.create(data, id_=id)
                record['$schema'] = schema
                db.session.commit()
                indexer.index(record)
                db.session.expunge_all()

def software():
    """Load demo software records."""
    from invenio_db import db
    from invenio_records_files.api import Record
    from invenio_indexer.api import RecordIndexer
    from cernopendata.modules.records.minters.recid import \
        cernopendata_recid_minter
    from invenio_files_rest.models import \
        Bucket, FileInstance, ObjectVersion
    from invenio_records_files.models import RecordsBuckets

    indexer = RecordIndexer()
    schema = current_app.extensions['invenio-jsonschemas'].path_to_url(
        'records/software-v1.0.0.json')
    data = pkg_resources.resource_filename('cernopendata',
                                           'modules/fixtures/data/software')
    software_json = glob.glob(os.path.join(data, '*.json'))

    for filename in software_json:
        with open(filename, 'rb') as source:
            for data in json.load(source):
                # Default to an empty list so records without files
                # do not break the loop below.
                files = data.pop('files', [])

                id = uuid.uuid4()
                cernopendata_recid_minter(id, data)
                record = Record.create(data, id_=id)
                record['$schema'] = schema

                bucket = Bucket.create()
                RecordsBuckets.create(record=record.model, bucket=bucket)

                for file in files:
                    assert 'uri' in file
                    assert 'size' in file
                    assert 'checksum' in file

                    f = FileInstance.create()
                    filename = file.get("uri").split('/')[-1:][0]
                    f.set_uri(file.get("uri"), file.get("size"),
                              file.get("checksum"))
                    ObjectVersion.create(bucket, filename, _file_id=f.id)
                db.session.commit()
                indexer.index(record)
                db.session.expunge_all()

def testdata_most_loaned(db, testdata):
    """Create, index and return test data for most loans tests."""
    most_loaned = load_json_from_datadir("loans_most_loaned.json")
    recs = _create_records(db, most_loaned, Loan, CIRCULATION_LOAN_PID_TYPE)

    ri = RecordIndexer()
    for rec in recs:
        ri.index(rec)

    current_search.flush_and_refresh(index="loans")

    return {
        "locations": testdata["locations"],
        "internal_locations": testdata["internal_locations"],
        "documents": testdata["documents"],
        "items": testdata["items"],
        "loans": most_loaned,
        "series": testdata["series"],
    }

def importer_test_data(app, db, es_clear):
    """Provide test data for importer test suite."""
    data = load_json_from_datadir(
        "existing_documents.json", relpath="importer"
    )
    Document = current_app_ils.document_record_cls
    documents = _create_records(db, data, Document, DOCUMENT_PID_TYPE)

    data = load_json_from_datadir("existing_eitems.json", relpath="importer")
    eitems = _create_records(db, data, EItem, EITEM_PID_TYPE)

    # index
    ri = RecordIndexer()
    for rec in documents + eitems:
        ri.index(rec)

    current_search.flush_and_refresh(index="*")

    return {"documents": documents, "eitems": eitems}

def test_before_deposit_index_hook_sets_files(create_record, db, es):
    deposit = create_record(published=False)

    # Reproduce file upload: add file to bucket associated with deposit
    bucket = Bucket.get(deposit['_buckets']['deposit'])
    obj = ObjectVersion.create(bucket, 'foo.txt')
    stream = BytesIO(b'Hello world!')
    obj.set_contents(stream, size=len(stream.getvalue()),
                     size_limit=bucket.size_limit)
    db.session.commit()

    indexer = RecordIndexer()
    indexer.index(deposit)

    # Get the raw indexed document
    index, doc_type = indexer.record_to_index(deposit)
    es_deposit = es.get(index=index, doc_type=doc_type, id=deposit.id)

    assert '_files' in es_deposit['_source']
    assert es_deposit['_source']['_files'][0]['type'] == 'txt'

def test_access_permissions(
    client, json_headers, testdata, users, with_access
):
    """Test GET documents with `_access` ignoring `restricted`."""
    # set the documents to have read access only by patron2. `_access` should
    # be taken into account and take precedence over `restricted`.
    indexer = RecordIndexer()
    doc1 = Document.get_record_by_pid("docid-open-access")
    doc2 = Document.get_record_by_pid("docid-closed-access")
    for doc in [doc1, doc2]:
        doc.update(dict(_access=dict(read=[users["patron2"].id])))
        doc.commit()
        db.session.commit()
        indexer.index(doc)
    current_search.flush_and_refresh(index="documents")

    test_data = [
        ("anonymous", "docid-open-access", 401, 0),
        ("patron1", "docid-open-access", 403, 0),
        ("patron2", "docid-open-access", 200, 1),  # should have access
        ("librarian", "docid-open-access", 200, 1),
        ("admin", "docid-open-access", 200, 1),
        ("anonymous", "docid-closed-access", 401, 0),
        ("patron1", "docid-closed-access", 403, 0),
        ("patron2", "docid-closed-access", 200, 1),  # should have access
        ("librarian", "docid-closed-access", 200, 1),
        ("admin", "docid-closed-access", 200, 1),
    ]

    for user, pid, status_code, n_hits in test_data:
        # item endpoint
        user_login(client, user, users)
        url = url_for("invenio_records_rest.docid_item", pid_value=pid)
        res = client.get(url, headers=json_headers)
        assert res.status_code == status_code

        # list endpoint
        user_login(client, user, users)
        url = url_for(
            "invenio_records_rest.docid_list", q="pid:{}".format(pid)
        )
        res = client.get(url, headers=json_headers)
        hits = json.loads(res.data.decode("utf-8"))
        assert hits["hits"]["total"] == n_hits

def demo_records(app):
    """Create demo records."""
    data_path = pkg_resources.resource_filename('cds.modules.fixtures',
                                                'data/records.xml')
    with open(data_path) as source:
        indexer = RecordIndexer()
        with _db.session.begin_nested():
            for index, data in enumerate(split_blob(source.read()),
                                         start=1):
                # create uuid
                rec_uuid = uuid.uuid4()
                # do translate
                record = marc21.do(create_record(data))
                # create PID
                current_pidstore.minters['recid'](rec_uuid, record)
                # create record
                indexer.index(Record.create(record, id_=rec_uuid))
        _db.session.commit()

    return data_path

def items():
    """Create circulation items."""
    from invenio_db import db
    from invenio_indexer.api import RecordIndexer
    from invenio_circulation.api import Item
    from invenio_circulation.minters import circulation_item_minter

    for x in range(10):
        item = Item.create({
            'foo': 'bar{0}'.format(x),
            'title_statement': {'title': 'title{0}'.format(x)},
            'record': {'id': 1}
        })
        circulation_item_minter(item.id, item)
        item.commit()
        record_indexer = RecordIndexer()
        record_indexer.index(item)
    db.session.commit()

def test_crud_read(app, db, es):
    """Test REST API get functionality."""
    item = Item.create({'foo': 'bar'})
    circulation_item_minter(item.id, item)
    item.commit()
    db.session.commit()
    record_indexer = RecordIndexer()
    record_indexer.index(item)
    current_search.flush_and_refresh('_all')

    with app.test_request_context():
        with app.test_client() as client:
            url = url_for('circulation_rest.crcitm_item',
                          pid_value=item['control_number'])
            res = client.get(url)
            fetched_item = json.loads(res.data.decode('utf-8'))['metadata']
            assert fetched_item['control_number'] == item['control_number']

def test_rest_search(app, db, es, url_addition, count):
    """Test REST API search functionality."""
    item = Item.create({'foo': 'bar'})
    circulation_item_minter(item.id, item)
    item.commit()
    db.session.commit()
    record_indexer = RecordIndexer()
    record_indexer.index(item)
    current_search.flush_and_refresh('_all')

    with app.test_request_context():
        with app.test_client() as client:
            base_url = url_for('circulation_rest.crcitm_list')
            url = base_url + url_addition
            res = client.get(url)
            hits = json.loads(res.data.decode('utf-8'))['hits']['hits']
            assert len(hits) == count

def testdata(in_cluster_app):
    """Create, index and return test data."""
    indexer = RecordIndexer()
    with mock.patch('invenio_records.api.Record.validate',
                    return_value=None):
        records = load_json_from_datadir('records.json')
        for record in records:
            record = Record.create(record)
            record_minter(record.id, record)
            record.commit()
            db.session.commit()
            indexer.index(record)

        authors = load_json_from_datadir('authors.json')
        for record in authors:
            record = Record.create(record)
            author_minter(record.id, record)
            record.commit()
            db.session.commit()
            indexer.index(record)

def _create_records(path, verbose):
    """Create demo records."""
    indexer = RecordIndexer(
        record_to_index=lambda record: ('records', 'record')
    )
    if verbose > 0:
        click.secho('Creating records', fg='yellow', bold=True)
    with db.session.begin_nested():
        records_dir = os.path.join(path, 'records')
        nb_records = 0
        for root, dirs, files in os.walk(records_dir):
            for filename in files:
                split_filename = os.path.splitext(filename)
                if split_filename[1] == '.json':
                    rec_uuid = UUID(split_filename[0])
                    # ``root`` already contains ``records_dir`` as a
                    # prefix, so join it directly with the filename.
                    with open(os.path.join(root, filename)) as record_file:
                        record_str = record_file.read()
                    record_str = resolve_community_id(record_str)
                    record_str = resolve_block_schema_id(record_str)
                    deposit = Deposit.create(json.loads(record_str),
                                             id_=rec_uuid)
                    ObjectVersion.create(deposit.files.bucket, 'myfile',
                                         stream=BytesIO(b'mycontent'))
                    deposit.publish()
                    pid, record = deposit.fetch_published()
                    # index the record
                    indexer.index(record)
                    if verbose > 1:
                        click.secho('CREATED RECORD {0}:\n {1}'.format(
                            str(rec_uuid), json.dumps(record, indent=4)
                        ))
                        click.secho('CREATED DEPOSIT {0}:\n {1}'.format(
                            str(rec_uuid), json.dumps(deposit, indent=4)
                        ))
                    nb_records += 1
    if verbose > 0:
        click.secho('Created {} records!'.format(nb_records), fg='green')

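# A hedged sketch of the ``record_to_index`` hook used above: the
# callable receives the record and returns an ``(index, doc_type)``
# pair, so routing can depend on record content instead of being
# hard-coded. The ``$schema``-based routing below is an illustrative
# assumption, not the original project's mapping.
def schema_based_record_to_index(record):
    schema = record.get('$schema', '')
    if 'deposits' in schema:
        return 'deposits', 'deposit'
    return 'records', 'record'


indexer = RecordIndexer(record_to_index=schema_based_record_to_index)
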
def prepare_data():
    """Prepare data."""
    days = current_app.config[
        "ILS_CIRCULATION_MAIL_OVERDUE_REMINDER_INTERVAL"
    ]
    loans = testdata["loans"]
    recs = []
    now = arrow.utcnow()

    def new_end_date(loan, date):
        loan["end_date"] = date.date().isoformat()
        loan["state"] = "ITEM_ON_LOAN"
        loan.commit()
        recs.append(loan)

    # overdue loans
    date = now - timedelta(days=days)
    new_end_date(loans[0], date)
    date = now - timedelta(days=days * 2)
    new_end_date(loans[1], date)

    # not overdue
    date = now - timedelta(days=-1)
    new_end_date(loans[2], date)

    # not overdue or overdue but not to be notified
    remaining_not_overdue = loans[3:]
    for loan in remaining_not_overdue:
        days = random.choice([-1, 0, 1])
        date = now - timedelta(days=days)
        new_end_date(loan, date)

    db.session.commit()

    indexer = RecordIndexer()
    for rec in recs:
        indexer.index(rec)
    current_search.flush_and_refresh(index="*")

def create_record(data):
    """Create a record.

    :param dict data: The record data.
    """
    indexer = RecordIndexer()
    with db.session.begin_nested():
        # create uuid
        rec_uuid = uuid.uuid4()
        # add the schema
        host = current_app.config.get('JSONSCHEMAS_HOST')
        data["$schema"] = \
            current_app.extensions['invenio-jsonschemas'].path_to_url(
                'custom_record/custom-record-v1.0.0.json')
        # create PID
        current_pidstore.minters['custid'](
            rec_uuid, data, pid_value='custom_pid_{}'.format(rec_uuid))
        # create record
        created_record = Record.create(data, id_=rec_uuid)
        # index the record
        indexer.index(created_record)
    db.session.commit()

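# A hedged usage sketch for ``create_record`` above, assuming it runs
# inside a Flask application context (e.g. from a CLI command); the
# payload key is illustrative, not taken from the actual schema.
create_record({'title': 'My custom record'})
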
def test_oai_set_result_count(mocker, audit_records, db, es, communities,
                              oai_sources, issues):
    db_records, es_records, oai2d_records = oai_sources
    for recid in db_records:
        _, record = record_resolver.resolve(recid)
        record['_oai']['sets'] = ['user-c1']
        record.commit()
    db.session.commit()

    indexer = RecordIndexer()
    for recid in es_records:
        _, record = record_resolver.resolve(recid)
        record['_oai']['sets'] = ['user-c1']
        indexer.index(record)
    current_search.flush_and_refresh(index='records')

    # '/oai2d' needs straight-forward cheating... There's no way to be sure
    # why the endpoint sometimes fails to report the correct results. It
    # could be a Resumption Token issue, or even an indexing issue on
    # Elasticsearch. Either way, we have to be able to replicate this
    # behavior when running on production and report it as an issue.
    oai2d_ids_mock = MagicMock()
    oai2d_ids_mock.return_value = set(oai2d_records)
    oai2d_ids_mock = mocker.patch(
        'zenodo.modules.auditor.oai.OAISetResultCheck'
        '._oai2d_endpoint_identifiers',
        new=oai2d_ids_mock)

    audit = OAIAudit('testAudit', logging.getLogger('auditorTesting'), [])
    check = OAISetResultCheck(audit, Community.get('c1'))
    check.perform()
    audit.clear_db_oai_set_cache()

    result_issues = check.issues.get('missing_ids', {})
    db_issues, es_issues, api_issues = issues
    assert set(result_issues.get('db', [])) == set(db_issues)
    assert set(result_issues.get('es', [])) == set(es_issues)
    assert set(result_issues.get('oai2d', [])) == set(api_issues)

def load_records(es_app, filename, schema):
    """Try to index records."""
    indexer = RecordIndexer()
    with es_app.test_request_context():
        data_filename = pkg_resources.resource_filename("invenio_records",
                                                        filename)
        records_data = load(data_filename)
        records = []
        for item in records_data:
            item_dict = dict(marc21.do(item))
            item_dict["$schema"] = schema
            record = Record.create(item_dict)
            records.append(record)
        db.session.commit()

        es_records = []
        for record in records:
            es_records.append(indexer.index(record))

        for record in es_records:
            search.client.get(index=record["_index"],
                              doc_type=record["_type"],
                              id=record["_id"])

def test_bibliographic_data(es_app):
    """Test indexation using bibliographic data."""
    search = InvenioSearch(es_app)
    search.create()

    indexer = RecordIndexer()
    with es_app.test_request_context():
        data_filename = pkg_resources.resource_filename(
            'invenio_records', 'data/marc21/bibliographic.xml')
        records_data = load(data_filename)

        records = []
        for item in records_data:
            record = Record.create(item)
            record['$schema'] = "mappings/marc21_holdings.json"
            es_record = indexer.index(record)
            records.append(es_record)

        for record in records:
            search.client.get(index=record['_index'],
                              doc_type=record['_type'],
                              id=record['_id'])

    search.delete()

def load_records(es_app, filename, schema):
    """Try to index records."""
    indexer = RecordIndexer()
    with es_app.test_request_context():
        data_filename = pkg_resources.resource_filename(
            'invenio_records', filename)
        records_data = load(data_filename)
        records = []
        for item in records_data:
            item_dict = dict(marc21.do(item))
            item_dict['$schema'] = schema
            record = Record.create(item_dict)
            records.append(record)
        db.session.commit()

        es_records = []
        for record in records:
            es_records.append(indexer.index(record))

        from invenio_search import current_search
        for record in es_records:
            current_search.client.get(index=record['_index'],
                                      doc_type=record['_type'],
                                      id=record['_id'])

def records_not_merged_in_marcxml(app):
    snippet_merged = (
        '<record>'
        ' <controlfield tag="001">111</controlfield>'
        ' <controlfield tag="005">20160922232729.0</controlfield>'
        ' <datafield tag="024" ind1="7" ind2=" ">'
        '  <subfield code="2">DOI</subfield>'
        '  <subfield code="a">10.11588/heidok.00021652</subfield>'
        ' </datafield>'
        ' <datafield tag="100" ind1=" " ind2=" ">'
        '  <subfield code="a">Humbert, Pascal</subfield>'
        '  <subfield code="u">Inst. Appl. Math., Heidelberg</subfield>'
        ' </datafield>'
        ' <datafield tag="980" ind1=" " ind2=" ">'
        '  <subfield code="a">HEP</subfield>'
        ' </datafield>'
        ' <datafield tag="980" ind1=" " ind2=" ">'
        '  <subfield code="a">THESIS</subfield>'
        ' </datafield>'
        ' <datafield tag="980" ind1=" " ind2=" ">'
        '  <subfield code="a">CORE</subfield>'
        ' </datafield>'
        ' <datafield tag="981" ind1=" " ind2=" ">'
        '  <subfield code="a">222</subfield>'
        ' </datafield>'
        '</record>'
    )

    snippet_deleted = (
        '<record>'
        ' <controlfield tag="001">222</controlfield>'
        ' <controlfield tag="005">20160922232729.0</controlfield>'
        ' <datafield tag="024" ind1="7" ind2=" ">'
        '  <subfield code="2">DOI</subfield>'
        '  <subfield code="a">10.11588/heidok.00021652</subfield>'
        ' </datafield>'
        ' <datafield tag="100" ind1=" " ind2=" ">'
        '  <subfield code="a">Humbert, Pascal</subfield>'
        '  <subfield code="u">Inst. Appl. Math., Heidelberg</subfield>'
        ' </datafield>'
        ' <datafield tag="701" ind1=" " ind2=" ">'
        '  <subfield code="a">Lindner, Manfred</subfield>'
        ' </datafield>'
        ' <datafield tag="856" ind1="4" ind2=" ">'
        '  <subfield code="u">http://www.ub.uni-heidelberg.de/archiv/21652</subfield>'
        '  <subfield code="y">U. Heidelberg</subfield>'
        ' </datafield>'
        ' <datafield tag="909" ind1="C" ind2="O">'
        '  <subfield code="o">oai:inspirehep.net:222</subfield>'
        '  <subfield code="p">INSPIRE:HEP</subfield>'
        ' </datafield>'
        ' <datafield tag="980" ind1=" " ind2=" ">'
        '  <subfield code="a">HEP</subfield>'
        ' </datafield>'
        ' <datafield tag="981" ind1=" " ind2=" ">'
        '  <subfield code="a">222</subfield>'
        ' </datafield>'
        '</record>'
    )

    with app.app_context():
        json_record_merged = hep.do(create_record(snippet_merged))
        json_record_merged['$schema'] = 'http://localhost:5000/schemas/records/hep.json'
        json_record_deleted = hep.do(create_record(snippet_deleted))
        json_record_deleted['$schema'] = 'http://localhost:5000/schemas/records/hep.json'

        with db.session.begin_nested():
            record_merged = record_upsert(json_record_merged)
            record_deleted = record_upsert(json_record_deleted)
            if record_merged and record_deleted:
                r = RecordIndexer()
                r.index(record_merged)
                r.index(record_deleted)
                es.indices.refresh('records-hep')
        db.session.commit()

    yield

    with app.app_context():
        _delete_merged_records_from_everywhere('literature', 111, 222)

def records_already_merged_in_marcxml(app):
    snippet_merged = (
        '<record>'
        ' <controlfield tag="001">111</controlfield>'
        ' <controlfield tag="005">20160922232729.0</controlfield>'
        ' <datafield tag="024" ind1="7" ind2=" ">'
        '  <subfield code="2">DOI</subfield>'
        '  <subfield code="a">10.11588/heidok.00021652</subfield>'
        ' </datafield>'
        ' <datafield tag="100" ind1=" " ind2=" ">'
        '  <subfield code="a">Humbert, Pascal</subfield>'
        '  <subfield code="u">Inst. Appl. Math., Heidelberg</subfield>'
        ' </datafield>'
        ' <datafield tag="980" ind1=" " ind2=" ">'
        '  <subfield code="a">HEP</subfield>'
        ' </datafield>'
        ' <datafield tag="980" ind1=" " ind2=" ">'
        '  <subfield code="a">THESIS</subfield>'
        ' </datafield>'
        ' <datafield tag="980" ind1=" " ind2=" ">'
        '  <subfield code="a">CORE</subfield>'
        ' </datafield>'
        ' <datafield tag="981" ind1=" " ind2=" ">'
        '  <subfield code="a">222</subfield>'
        ' </datafield>'
        '</record>'
    )

    snippet_deleted = (
        '<record>'
        ' <controlfield tag="001">222</controlfield>'
        ' <controlfield tag="005">20160914115512.0</controlfield>'
        ' <datafield tag="100" ind1=" " ind2=" ">'
        '  <subfield code="a">Humbert, Pascal</subfield>'
        ' </datafield>'
        ' <datafield tag="970" ind1=" " ind2=" ">'
        '  <subfield code="d">111</subfield>'
        ' </datafield>'
        ' <datafield tag="980" ind1=" " ind2=" ">'
        '  <subfield code="a">HEP</subfield>'
        ' </datafield>'
        ' <datafield tag="980" ind1=" " ind2=" ">'
        '  <subfield code="a">THESIS</subfield>'
        ' </datafield>'
        ' <datafield tag="980" ind1=" " ind2=" ">'
        '  <subfield code="a">CORE</subfield>'
        ' </datafield>'
        ' <datafield tag="980" ind1=" " ind2=" ">'
        '  <subfield code="c">DELETED</subfield>'
        ' </datafield>'
        '</record>'
    )

    with app.app_context():
        json_record_merged = hep.do(create_record(snippet_merged))
        json_record_merged['$schema'] = 'http://localhost:5000/schemas/records/hep.json'
        json_record_deleted = hep.do(create_record(snippet_deleted))
        json_record_deleted['$schema'] = 'http://localhost:5000/schemas/records/hep.json'

        with db.session.begin_nested():
            record_merged = record_upsert(json_record_merged)
            record_deleted = record_upsert(json_record_deleted)
            if record_merged and record_deleted:
                r = RecordIndexer()
                r.index(record_merged)
                r.index(record_deleted)
                es.indices.refresh('records-hep')
        db.session.commit()

    yield

    with app.app_context():
        _delete_merged_records_from_everywhere('literature', 111, 222)

def records(skip_files, files, profile, mode):
    """Load all records."""
    if profile:
        import cProfile
        import pstats
        import StringIO
        pr = cProfile.Profile()
        pr.enable()

    indexer = RecordIndexer()
    schema = current_app.extensions['invenio-jsonschemas'].path_to_url(
        'records/record-v1.0.0.json'
    )
    data = pkg_resources.resource_filename('cernopendata',
                                           'modules/fixtures/data/records')
    action = None

    if files:
        record_json = files
    else:
        record_json = glob.glob(os.path.join(data, '*.json'))

    for filename in record_json:
        # name = filename.split('/')[-1]
        # if name.startswith('opera'):
        #     click.echo('Skipping opera records ...')
        #     continue
        click.echo('Loading records from {0} ...'.format(filename))
        with open(filename, 'rb') as source:
            for data in json.load(source):
                if not data:
                    click.echo('IGNORING a possibly broken or corrupted '
                               'record entry in file {0} ...'
                               .format(filename))
                    continue

                files = data.get('files', [])

                if mode == 'insert-or-replace':
                    try:
                        pid = PersistentIdentifier.get('recid',
                                                       data['recid'])
                        if pid:
                            record = update_record(
                                pid, schema, data, files, skip_files)
                            action = 'updated'
                    except PIDDoesNotExistError:
                        record = create_record(schema, data, files,
                                               skip_files)
                        action = 'inserted'
                elif mode == 'insert':
                    try:
                        pid = PersistentIdentifier.get('recid',
                                                       data['recid'])
                        if pid:
                            click.echo(
                                'Record recid {} exists already;'
                                ' cannot insert it. '.format(
                                    data.get('recid')), err=True)
                            return
                    except PIDDoesNotExistError:
                        record = create_record(schema, data, files,
                                               skip_files)
                        action = 'inserted'
                else:
                    try:
                        pid = PersistentIdentifier.get('recid',
                                                       data['recid'])
                    except PIDDoesNotExistError:
                        click.echo(
                            'Record recid {} does not exist; '
                            'cannot replace it.'.format(
                                data.get('recid')), err=True)
                        return
                    record = update_record(
                        pid, schema, data, files, skip_files)
                    action = 'updated'

                if not skip_files:
                    record.files.flush()
                record.commit()
                db.session.commit()
                click.echo('Record recid {0} {1}.'.format(
                    data.get('recid'), action))
                indexer.index(record)
                db.session.expunge_all()

    if profile:
        pr.disable()
        s = StringIO.StringIO()
        sortby = 'cumulative'
        ps = pstats.Stats(pr, stream=s).sort_stats(sortby)
        ps.print_stats()
        print(s.getvalue())

def docs(files, mode):
    """Load demo article records."""
    from slugify import slugify

    indexer = RecordIndexer()
    schema = current_app.extensions['invenio-jsonschemas'].path_to_url(
        'records/docs-v1.0.0.json'
    )
    data = pkg_resources.resource_filename('cernopendata',
                                           'modules/fixtures/data/docs')

    if files:
        articles_json = files
    else:
        articles_json = get_jsons_from_dir(data)

    for filename in articles_json:
        # name = filename.split('/')[-1]
        # if name.startswith('opera'):
        #     click.echo('Skipping opera records ...')
        #     continue
        click.echo('Loading docs from {0} ...'.format(filename))
        with open(filename, 'rb') as source:
            for data in json.load(source):
                # Replace body with responding content
                assert data["body"]["content"]
                content_filename = os.path.join(
                    *(
                        ["/", ]
                        + filename.split('/')[:-1]
                        + [data["body"]["content"], ]
                    )
                )
                with open(content_filename) as body_field:
                    data["body"]["content"] = body_field.read()

                if "collections" not in data and \
                        not isinstance(
                            data.get("collections", None), basestring):
                    data["collections"] = []

                # The doc PID is derived from the slug (or the title).
                slug = str(slugify(data.get('slug', data['title'])))

                if mode == 'insert-or-replace':
                    try:
                        pid = PersistentIdentifier.get('docid', slug)
                        if pid:
                            record = update_doc(pid, data)
                            action = 'updated'
                    except PIDDoesNotExistError:
                        record = create_doc(data, schema)
                        action = 'inserted'
                elif mode == 'insert':
                    try:
                        pid = PersistentIdentifier.get('docid', slug)
                        if pid:
                            click.echo(
                                'Record docid {} exists already;'
                                ' cannot insert it. '.format(slug),
                                err=True)
                            return
                    except PIDDoesNotExistError:
                        record = create_doc(data, schema)
                        action = 'inserted'
                else:
                    try:
                        pid = PersistentIdentifier.get('docid', slug)
                    except PIDDoesNotExistError:
                        click.echo(
                            'Record docid {} does not exist; '
                            'cannot replace it.'.format(slug),
                            err=True)
                        return
                    record = update_doc(pid, data)
                    action = 'updated'

                record.commit()
                db.session.commit()
                click.echo(' Record docid {0} {1}.'.format(slug, action))
                indexer.index(record)
                db.session.expunge_all()

def datasets(skip_files):
    """Load demo datasets records."""
    from invenio_db import db
    from invenio_records_files.api import Record
    from invenio_indexer.api import RecordIndexer
    from cernopendata.modules.records.minters.recid import \
        cernopendata_recid_minter
    from cernopendata.modules.records.minters.datasetid import \
        cernopendata_datasetid_minter
    from invenio_files_rest.models import \
        Bucket, FileInstance, ObjectVersion
    from invenio_records_files.models import RecordsBuckets

    indexer = RecordIndexer()
    schema = current_app.extensions['invenio-jsonschemas'].path_to_url(
        'records/datasets-v1.0.0.json'
    )
    data = pkg_resources.resource_filename('cernopendata',
                                           'modules/fixtures/data/datasets')
    datasets_json = glob.glob(os.path.join(data, '*.json'))

    # FIXME: change the treatment of `files` according to `records` fixtures.
    for filename in datasets_json:
        click.echo('Loading datasets from {0} ...'.format(filename))
        with open(filename, 'rb') as source:
            for data in json.load(source):
                files = data.pop('files', [])

                id = uuid.uuid4()
                # (TOFIX) Remove if statement in production
                # as every dataset record should have a doi
                if data.get('doi', None):
                    cernopendata_datasetid_minter(id, data)
                else:
                    cernopendata_recid_minter(id, data)
                data['$schema'] = schema
                record = Record.create(data, id_=id)

                bucket = Bucket.create()
                RecordsBuckets.create(
                    record=record.model, bucket=bucket)

                for file in files:
                    if skip_files:
                        break
                    assert 'uri' in file
                    assert 'size' in file
                    assert 'checksum' in file

                    f = FileInstance.create()
                    filename = file.get("uri").split('/')[-1:][0]
                    f.set_uri(file.get("uri"), file.get("size"),
                              file.get("checksum"))
                    ObjectVersion.create(
                        bucket, filename, _file_id=f.id
                    )
                db.session.commit()
                indexer.index(record)
                db.session.expunge_all()