def migrate_chunk(chunk, broken_output=None, dry_run=False):
    from invenio_indexer.api import RecordIndexer
    from ..pidstore.minters import inspire_recid_minter

    indexer = RecordIndexer()

    index_queue = []
    for raw_record in chunk:
        record = marc_create_record(raw_record, keep_singletons=False)
        json_record = create_record(record)
        if '$schema' in json_record:
            json_record['$schema'] = url_for(
                'invenio_jsonschemas.get_schema',
                schema_path="records/{0}".format(json_record['$schema'])
            )
        rec_uuid = str(Record.create(json_record, id_=None).id)

        # Create persistent identifier.
        pid = inspire_recid_minter(rec_uuid, json_record)

        index_queue.append(pid.object_uuid)
    db.session.commit()

    # Request record indexing
    for i in index_queue:
        indexer.index_by_id(i)

    # Send task to migrate files.
    return rec_uuid
def records(): """Load records.""" import pkg_resources import uuid from dojson.contrib.marc21 import marc21 from dojson.contrib.marc21.utils import create_record, split_blob from invenio_pidstore import current_pidstore from invenio_records.api import Record # pkg resources the demodata data_path = pkg_resources.resource_filename( 'invenio_records', 'data/marc21/bibliographic.xml' ) with open(data_path) as source: indexer = RecordIndexer() with db.session.begin_nested(): for index, data in enumerate(split_blob(source.read()), start=1): # create uuid rec_uuid = uuid.uuid4() # do translate record = marc21.do(create_record(data)) # create PID current_pidstore.minters['recid_minter']( rec_uuid, record ) # create record indexer.index(Record.create(record, id_=rec_uuid)) db.session.commit()
def test_basic_search(app, db, es):
    """Test basic search functionality."""
    # The index should be empty
    assert len(ItemSearch().execute()) == 0

    # Create item1, search for everything
    item1 = Item.create({})
    item1.commit()
    record_indexer = RecordIndexer()
    record_indexer.index(item1)
    current_search.flush_and_refresh('_all')
    assert len(ItemSearch().execute()) == 1

    # Create item2, search for everything again
    item2 = Item.create({'foo': 'bar'})
    item2.commit()
    record_indexer.index(item2)
    current_search.flush_and_refresh('_all')
    assert len(ItemSearch().execute()) == 2

    # Search for item2
    assert len(ItemSearch().query('match', foo='bar').execute()) == 1

    # Search for nonsense
    assert len(ItemSearch().query('match', foo='banana').execute()) == 0
def update_expired_embargoes(): """Release expired embargoes every midnight.""" logger = current_app.logger base_url = urlunsplit(( current_app.config.get('PREFERRED_URL_SCHEME', 'http'), current_app.config['JSONSCHEMAS_HOST'], current_app.config.get('APPLICATION_ROOT') or '', '', '' )) # The task needs to run in a request context as JSON Schema validation # will use url_for. with current_app.test_request_context('/', base_url=base_url): s = B2ShareRecordsSearch( using=current_search_client, index='records' ).query( 'query_string', query='open_access:false AND embargo_date:{{* TO {0}}}'.format( datetime.now(timezone.utc).isoformat() ), allow_leading_wildcard=False ).fields([]) record_ids = [hit.meta.id for hit in s.scan()] if record_ids: logger.info('Changing access of {} embargoed publications' ' to public.'.format(len(record_ids))) for record in Record.get_records(record_ids): logger.debug('Making embargoed publication {} public'.format( record.id)) record['open_access'] = True record.commit() db.session.commit() indexer = RecordIndexer() indexer.bulk_index(record_ids) indexer.process_bulk_queue()
def records(): """Load test data fixture.""" import uuid from invenio_records.api import Record from invenio_pidstore.models import PersistentIdentifier, PIDStatus create_test_user() indexer = RecordIndexer() # Record 1 - Live record with db.session.begin_nested(): rec_uuid = uuid.uuid4() pid1 = PersistentIdentifier.create( 'recid', '1', object_type='rec', object_uuid=rec_uuid, status=PIDStatus.REGISTERED) Record.create({ 'title': 'Registered', 'description': 'This is an awesome description', 'control_number': '1', 'access_right': 'restricted', 'access_conditions': 'fuu', 'owners': [1, 2], 'recid': 1 }, id_=rec_uuid) indexer.index_by_id(pid1.object_uuid) db.session.commit() sleep(3)
def store_record(obj, *args, **kwargs): """Create and index new record in main record space.""" assert "$schema" in obj.data, "No $schema attribute found!" # Create record # FIXME: Do some preprocessing of obj.data before creating a record so that # we're sure that the schema will be validated without touching the full # holdingpen stack. record = Record.create(obj.data, id_=None) # Create persistent identifier. pid = inspire_recid_minter(str(record.id), record) # Commit any changes to record record.commit() # Dump any changes to record obj.data = record.dumps() # Commit to DB before indexing db.session.commit() # Index record indexer = RecordIndexer() indexer.index_by_id(pid.object_uuid)
def records(): """Load records.""" import pkg_resources import uuid from flask_login import login_user, logout_user from dojson.contrib.marc21 import marc21 from dojson.contrib.marc21.utils import create_record, split_blob from invenio_accounts.models import User from invenio_deposit.api import Deposit users = User.query.all() # pkg resources the demodata data_path = pkg_resources.resource_filename( 'invenio_records', 'data/marc21/bibliographic.xml' ) with open(data_path) as source: with current_app.test_request_context(): indexer = RecordIndexer() with db.session.begin_nested(): for index, data in enumerate(split_blob(source.read()), start=1): login_user(users[index % len(users)]) # do translate record = marc21.do(create_record(data)) # create record indexer.index(Deposit.create(record)) logout_user() db.session.commit()
def test_reindex(app, script_info):
    """Test reindex."""
    # load records
    with app.test_request_context():
        runner = CliRunner()

        rec_uuid = uuid.uuid4()
        data = {'title': 'Test0'}
        record = Record.create(data, id_=rec_uuid)
        db.session.commit()

        # Initialize queue
        res = runner.invoke(cli.queue, ['init', 'purge'], obj=script_info)
        assert 0 == res.exit_code

        res = runner.invoke(cli.reindex, ['--yes-i-know'], obj=script_info)
        assert 0 == res.exit_code
        res = runner.invoke(cli.run, [], obj=script_info)
        assert 0 == res.exit_code
        sleep(5)

        indexer = RecordIndexer()
        index, doc_type = indexer.record_to_index(record)
        res = current_search_client.get(index=index, doc_type=doc_type,
                                        id=rec_uuid)
        assert res['found']

        # Destroy queue
        res = runner.invoke(cli.queue, ['delete'], obj=script_info)
        assert 0 == res.exit_code
def glossary_terms():
    """Load demo terms records."""
    from invenio_db import db
    from invenio_records import Record
    from invenio_indexer.api import RecordIndexer
    from cernopendata.modules.records.minters.termid import \
        cernopendata_termid_minter

    indexer = RecordIndexer()
    schema = current_app.extensions['invenio-jsonschemas'].path_to_url(
        'records/glossary-term-v1.0.0.json'
    )
    data = pkg_resources.resource_filename('cernopendata',
                                           'modules/fixtures/data')
    glossary_terms_json = glob.glob(os.path.join(data, 'terms', '*.json'))

    for filename in glossary_terms_json:
        click.echo('Loading glossary-terms from {0} ...'.format(filename))
        with open(filename, 'rb') as source:
            for data in json.load(source):
                if "collections" not in data and \
                        not isinstance(
                            data.get("collections", None), basestring):
                    data["collections"] = []
                data["collections"].append({"primary": "Terms"})
                id = uuid.uuid4()
                cernopendata_termid_minter(id, data)
                data['$schema'] = schema
                record = Record.create(data, id_=id)
                db.session.commit()
                indexer.index(record)
                db.session.expunge_all()
def load_records(app, filename, schema, tries=5):
    """Try to index records."""
    indexer = RecordIndexer()
    records = []
    with app.app_context():
        with mock.patch('invenio_records.api.Record.validate',
                        return_value=None):
            data_filename = pkg_resources.resource_filename(
                'invenio_records', filename)
            records_data = load(data_filename)
            with db.session.begin_nested():
                for item in records_data:
                    record_id = uuid.uuid4()
                    item_dict = dict(marc21.do(item))
                    item_dict['$schema'] = schema
                    recid_minter(record_id, item_dict)
                    oaiid_minter(record_id, item_dict)
                    record = Record.create(item_dict, id_=record_id)
                    indexer.index(record)
                    records.append(record.id)
            db.session.commit()

        # Wait for indexer to finish
        for i in range(tries):
            response = current_search_client.search()
            if response['hits']['total'] >= len(records):
                break
            current_search.flush_and_refresh('_all')

    return records
def record_not_yet_deleted(app):
    snippet = (
        '<record>'
        ' <controlfield tag="001">333</controlfield>'
        ' <controlfield tag="005">20160913214552.0</controlfield>'
        ' <datafield tag="980" ind1=" " ind2=" ">'
        ' <subfield code="a">HEP</subfield>'
        ' </datafield>'
        '</record>'
    )

    with app.app_context():
        json_record = hep.do(create_record(snippet))
        json_record['$schema'] = 'http://localhost:5000/schemas/records/hep.json'

        with db.session.begin_nested():
            record = record_upsert(json_record)
            if record:
                ri = RecordIndexer()
                ri.index(record)

        db.session.commit()

    yield

    with app.app_context():
        _delete_record_from_everywhere('literature', 333)
def continuous_migration():
    """Task to continuously migrate what is pushed up by Legacy."""
    indexer = RecordIndexer()
    redis_url = current_app.config.get('CACHE_REDIS_URL')
    r = StrictRedis.from_url(redis_url)
    try:
        while r.llen('legacy_records'):
            raw_record = r.lpop('legacy_records')
            if raw_record:
                # FIXME use migrate_and_insert_record(raw_record)
                # The record might be None, in case a parallel
                # continuous_migration task has already consumed the queue.
                raw_record = zlib.decompress(raw_record)
                record = marc_create_record(raw_record,
                                            keep_singletons=False)
                recid = int(record['001'][0])
                prod_record = InspireProdRecords(recid=recid)
                prod_record.marcxml = raw_record
                json_record = create_record(record)
                with db.session.begin_nested():
                    try:
                        record = record_upsert(json_record)
                    except ValidationError as e:
                        # Invalid record, will not get indexed
                        errors = "ValidationError: Record {0}: {1}".format(
                            recid, e
                        )
                        prod_record.valid = False
                        prod_record.errors = errors
                        db.session.merge(prod_record)
                        continue
                indexer.index_by_id(record.id)
    finally:
        db.session.commit()
        db.session.close()
def test_indexer_bulk_index(app, queue):
    """Test delay indexing."""
    with app.app_context():
        with establish_connection() as c:
            indexer = RecordIndexer()
            id1 = uuid.uuid4()
            id2 = uuid.uuid4()
            indexer.bulk_index([id1, id2])
            indexer.bulk_delete([id1, id2])

            consumer = Consumer(
                connection=c,
                queue=indexer.mq_queue.name,
                exchange=indexer.mq_exchange.name,
                routing_key=indexer.mq_routing_key)

            messages = list(consumer.iterqueue())
            [m.ack() for m in messages]

            assert len(messages) == 4
            data0 = messages[0].decode()
            assert data0['id'] == str(id1)
            assert data0['op'] == 'index'
            data2 = messages[2].decode()
            assert data2['id'] == str(id1)
            assert data2['op'] == 'delete'
def data_policies(skip_files):
    """Load demo Data Policy records."""
    from invenio_db import db
    from invenio_indexer.api import RecordIndexer
    from cernopendata.modules.records.minters.recid import \
        cernopendata_recid_minter
    from invenio_files_rest.models import \
        Bucket, FileInstance, ObjectVersion
    from invenio_records_files.models import RecordsBuckets
    from invenio_records_files.api import Record
    from invenio_records.models import RecordMetadata

    indexer = RecordIndexer()
    schema = current_app.extensions['invenio-jsonschemas'].path_to_url(
        'records/data-policies-v1.0.0.json'
    )
    data = pkg_resources.resource_filename('cernopendata',
                                           'modules/fixtures/data')
    data_policies_json = glob.glob(os.path.join(data, '*.json'))

    for filename in data_policies_json:
        click.echo('Loading data-policies from {0} ...'.format(filename))
        with open(filename, 'rb') as source:
            for data in json.load(source):
                files = data.pop('files', [])
                id = uuid.uuid4()
                cernopendata_recid_minter(id, data)
                data['$schema'] = schema
                record = Record.create(data, id_=id)

                bucket = Bucket.create()
                RecordsBuckets.create(
                    record=record.model, bucket=bucket)

                for file in files:
                    if skip_files:
                        break
                    assert 'uri' in file
                    assert 'size' in file
                    assert 'checksum' in file

                    f = FileInstance.create()
                    filename = file.get("uri").split('/')[-1:][0]
                    f.set_uri(file.get("uri"), file.get(
                        "size"), file.get("checksum"))
                    ObjectVersion.create(
                        bucket,
                        filename,
                        _file_id=f.id
                    )
                db.session.commit()
                indexer.index(record)
                db.session.expunge_all()
def receive_after_model_commit(sender, changes):
    """Perform actions after models committed to database."""
    indexer = RecordIndexer()
    for model_instance, change in changes:
        if isinstance(model_instance, RecordMetadata):
            if change in ('insert', 'update'):
                indexer.index(InspireRecord(model_instance.json,
                                            model_instance))
            else:
                indexer.delete(InspireRecord(model_instance.json,
                                             model_instance))
def update_authors_recid(record_id, uuid, profile_recid):
    """Update author profile for a given signature.

    The method receives UUIDs representing record and signature respectively
    together with an author profile recid. The new recid will be placed in
    the signature with the given UUID.

    :param record_id: A string representing UUID of a given record.
        Example: record_id = "a5afb151-8f75-4e91-8dc1-05e7e8e8c0b8"
    :param uuid: A string representing UUID of a given signature.
        Example: uuid = "c2f432bd-2f52-4c16-ac66-096f168c762f"
    :param profile_recid: A string representing author profile recid that the
        updated signature should point to.
        Example: profile_recid = "1"
    """
    try:
        record = Record.get_record(record_id)
        update_flag = False

        for author in record['authors']:
            if author['uuid'] == uuid:
                author['recid'] = str(profile_recid)
                update_flag = True

        if update_flag:
            # Disconnect the signal on insert of a new record.
            before_record_index.disconnect(append_updated_record_to_queue)

            # Update the record in the database.
            record.commit()
            db.session.commit()

            # Update the record in Elasticsearch.
            indexer = RecordIndexer()
            indexer.index_by_id(record.id)
    except StaleDataError as exc:
        raise update_authors_recid.retry(exc=exc)
    finally:
        # Reconnect the disconnected signal.
        before_record_index.connect(append_updated_record_to_queue)

    # Report.
    logger.info("Updated signature %s with profile %s", uuid, profile_recid)
def closed_access_record(db, es, record_with_files_creation):
    """Creation of a full record with closed access right."""
    _, record, record_url = record_with_files_creation
    record['access_right'] = AccessRight.CLOSED
    record.commit()
    db.session.commit()
    indexer = RecordIndexer()
    indexer.index(record)
    current_search.flush_and_refresh(index='records')
    return record
def index_record(obj, eng):
    """Index the record.

    The record should only be indexed when every other step has finished
    successfully.
    """
    recid = obj.data['control_number']
    pid = PersistentIdentifier.get('recid', recid)

    indexer = RecordIndexer()
    indexer.index_by_id(pid.object_uuid)
def create_record(app, item_dict, mint_oaiid=True):
    """Create test record."""
    indexer = RecordIndexer()
    with app.test_request_context():
        record_id = uuid.uuid4()
        recid_minter(record_id, item_dict)
        if mint_oaiid:
            oaiid_minter(record_id, item_dict)
        record = Record.create(item_dict, id_=record_id)
        indexer.index(record)
    return record
def update_expired_embargos():
    """Release expired embargoes every midnight."""
    record_ids = AccessRight.get_expired_embargos()

    for record in Record.get_records(record_ids):
        record['access_right'] = AccessRight.OPEN
        record.commit()
    db.session.commit()

    indexer = RecordIndexer()
    indexer.bulk_index(record_ids)
    indexer.process_bulk_queue()
def indexed_loans(es, test_loans):
    """Get a function to wait for records to be flushed to index."""
    indexer = RecordIndexer()
    for pid, loan in test_loans:
        indexer.index(loan)
    current_search.flush_and_refresh(index="loans")

    yield test_loans

    for pid, loan in test_loans:
        indexer.delete_by_id(loan.id)
    current_search.flush_and_refresh(index="loans")
def remove_records(app, record_ids):
    """Remove all records."""
    with app.app_context():
        indexer = RecordIndexer()
        for r_id in record_ids:
            record = RecordMetadata.query.get(r_id)
            indexer.delete_by_id(r_id)
            pids = PersistentIdentifier.query.filter_by(
                object_uuid=r_id).all()
            for pid in pids:
                db.session.delete(pid)
            db.session.delete(record)
        db.session.commit()
def _create_and_index_record(record):
    record = Record.create(record)
    inspire_recid_minter(record.id, record)

    # invenio-collections will populate _collections field in record upon
    # commit
    db.session.commit()

    # Record needs to be indexed since views fetch records from ES
    r = RecordIndexer()
    r.index(record)
    es.indices.refresh('records-hep')

    return record
def oaiserver(sets, records):
    """Initialize OAI-PMH server."""
    from invenio_db import db
    from invenio_oaiserver.models import OAISet
    from invenio_records.api import Record

    # create a OAI Set
    with db.session.begin_nested():
        for i in range(sets):
            db.session.add(OAISet(
                spec='test{0}'.format(i),
                name='Test{0}'.format(i),
                description='test desc {0}'.format(i),
                search_pattern='title_statement.title:Test{0}'.format(i),
            ))

    # create a record
    schema = {
        'type': 'object',
        'properties': {
            'title_statement': {
                'type': 'object',
                'properties': {
                    'title': {
                        'type': 'string',
                    },
                },
            },
            'field': {'type': 'boolean'},
        },
    }

    search.client.indices.delete_alias('_all', '_all', ignore=[400, 404])
    search.client.indices.delete('*')

    with app.app_context():
        indexer = RecordIndexer()
        with db.session.begin_nested():
            for i in range(records):
                record_id = uuid.uuid4()
                data = {
                    'title_statement': {'title': 'Test{0}'.format(i)},
                    '$schema': schema,
                }
                recid_minter(record_id, data)
                oaiid_minter(record_id, data)
                record = Record.create(data, id_=record_id)
                indexer.index(record)
        db.session.commit()
def test_record_can_be_deleted(app, record_not_yet_deleted):
    with app.test_client() as client:
        assert client.get('/api/literature/333').status_code == 200

    record = get_db_record('literature', 333)
    record['deleted'] = True
    record.commit()
    if record:
        ri = RecordIndexer()
        ri.index(record)
    db.session.commit()

    with app.test_client() as client:
        assert client.get('/api/literature/333').status_code == 410
def _delete_record_from_everywhere(pid_type, record_control_number):
    record = get_db_record(pid_type, record_control_number)

    ri = RecordIndexer()
    ri.delete(record)
    record.delete(force=True)

    pid = PersistentIdentifier.get(pid_type, record_control_number)
    PersistentIdentifier.delete(pid)

    object_uuid = pid.object_uuid
    PersistentIdentifier.query.filter(
        object_uuid == PersistentIdentifier.object_uuid).delete()

    db.session.commit()
def index_after_commit(sender, changes): """Index a record in ES after it was committed to the DB. This cannot happen in an ``after_record_commit`` receiver from Invenio-Records because, despite the name, at that point we are not yet sure whether the record has been really committed to the DB. """ indexer = RecordIndexer() for model_instance, change in changes: if isinstance(model_instance, RecordMetadata): if change in ('insert', 'update'): indexer.index(Record(model_instance.json, model_instance)) else: indexer.delete(Record(model_instance.json, model_instance))
def store_record(obj, *args, **kwargs): """Create and index new record in main record space.""" if '$schema' in obj.data: obj.data['$schema'] = url_for( 'invenio_jsonschemas.get_schema', schema_path="records/{0}".format(obj.data['$schema']) ) # Create record rec_uuid = str(Record.create(obj.data, id_=None).id) # Create persistent identifier. pid = inspire_recid_minter(rec_uuid, obj.data) db.session.commit() # Index record indexer = RecordIndexer() indexer.index_by_id(pid.object_uuid)
def test_records_can_be_merged(app, records_not_merged_in_marcxml):
    with app.test_client() as client:
        assert client.get('/api/literature/111').status_code == 200
        assert client.get('/api/literature/222').status_code == 200

    record = get_db_record('literature', 222)
    record['deleted'] = True
    record['new_record'] = {'$ref': 'http://localhost:5000/api/record/111'}
    record.commit()
    if record:
        ri = RecordIndexer()
        ri.index(record)
    db.session.commit()

    with app.test_client() as client:
        assert client.get('/api/literature/111').status_code == 200
        assert client.get('/api/literature/222').status_code == 301
def test_before_record_index_dynamic_connect(app):
    """Test before_record_index.dynamic_connect."""
    with app.app_context():
        with patch('invenio_records.api.Record.validate'):
            auth_record = Record.create({
                '$schema': '/records/authorities/authority-v1.0.0.json',
                'title': 'Test'
            })
            bib_record = Record.create({
                '$schema': '/records/bibliographic/bibliographic-v1.0.0.json',
                'title': 'Test'
            })
            db.session.commit()

        def _simple(sender, json=None, **kwargs):
            json['simple'] = 'simple'

        def _custom(sender, json=None, **kwargs):
            json['custom'] = 'custom'

        def _cond(sender, connect_kwargs, index=None, **kwargs):
            return 'bibliographic' in index

        _receiver1 = before_record_index.dynamic_connect(
            _simple, index='records-authorities-authority-v1.0.0')
        _receiver2 = before_record_index.dynamic_connect(
            _custom, condition_func=_cond)

        action = RecordIndexer()._index_action(
            dict(id=str(auth_record.id), op='index'))
        assert 'title' in action['_source']
        assert action['_source']['simple'] == 'simple'

        action = RecordIndexer()._index_action(
            dict(id=str(bib_record.id), index='foo', op='index'))
        assert 'title' in action['_source']
        assert action['_source']['custom'] == 'custom'

        before_record_index.disconnect(_receiver1)
        before_record_index.disconnect(_receiver2)
def index_after_commit(sender, changes): """Index a record in ES after it was committed to the DB. This cannot happen in an ``after_record_commit`` receiver from Invenio-Records because, despite the name, at that point we are not yet sure whether the record has been really committed to the DB. """ indexer = RecordIndexer() for model_instance, change in changes: if isinstance(model_instance, RecordMetadata): if change in ('insert', 'update') and not model_instance.json.get("deleted"): if hasattr(model_instance, '_enhanced_record'): record = model_instance._enhanced_record else: record = model_instance.json indexer.index(InspireRecord(record, model_instance)) else: try: indexer.delete(InspireRecord( model_instance.json, model_instance)) except NotFoundError: # Record not found in ES LOGGER.debug('Record %s not found in ES', model_instance.json.get("id")) pass pid_type = get_pid_type_from_schema(model_instance.json['$schema']) pid_value = model_instance.json['control_number'] db_version = model_instance.version_id index_modified_citations_from_record.delay(pid_type, pid_value, db_version)
def articles():
    """Load demo article records."""
    from invenio_db import db
    from invenio_records import Record
    from invenio_indexer.api import RecordIndexer
    from cernopendata.modules.records.minters.artid import \
        cernopendata_articleid_minter

    indexer = RecordIndexer()
    schema = current_app.extensions['invenio-jsonschemas'].path_to_url(
        'records/article-v1.0.0.json')
    data = pkg_resources.resource_filename('cernopendata',
                                           'modules/fixtures/data/articles')
    articles_json = get_jsons_from_dir(data)

    for filename in articles_json:
        with open(filename, 'rb') as source:
            for data in json.load(source):

                # Replace body with responding content
                assert data["body"]["content"]
                content_filename = os.path.join(*([
                    "/",
                ] + filename.split('/')[:-1] + [
                    data["body"]["content"],
                ]))

                with open(content_filename) as body_field:
                    data["body"]["content"] = body_field.read()
                if "collections" not in data and \
                        not isinstance(data.get("collections", None),
                                       basestring):
                    data["collections"] = []
                id = uuid.uuid4()
                cernopendata_articleid_minter(id, data)
                record = Record.create(data, id_=id)
                record['$schema'] = schema
                db.session.commit()
                indexer.index(record)
                db.session.expunge_all()
def _create_records(path, verbose):
    """Create demo records."""
    indexer = RecordIndexer(
        record_to_index=lambda record: ('records', 'record')
    )
    if verbose > 0:
        click.secho('Creating records', fg='yellow', bold=True)
    with db.session.begin_nested():
        records_dir = os.path.join(path, 'records')
        nb_records = 0
        for root, dirs, files in os.walk(records_dir):
            for filename in files:
                split_filename = os.path.splitext(filename)
                if split_filename[1] == '.json':
                    rec_uuid = UUID(split_filename[0])
                    with open(os.path.join(records_dir, root,
                                           filename)) as record_file:
                        record_str = record_file.read()
                    record_str = resolve_community_id(record_str)
                    record_str = resolve_block_schema_id(record_str)
                    deposit = Deposit.create(json.loads(record_str),
                                             id_=rec_uuid)
                    ObjectVersion.create(deposit.files.bucket, 'myfile',
                                         stream=BytesIO(b'mycontent'))
                    deposit.publish()
                    pid, record = deposit.fetch_published()
                    # index the record
                    indexer.index(record)
                    if verbose > 1:
                        click.secho('CREATED RECORD {0}:\n {1}'.format(
                            str(rec_uuid), json.dumps(record, indent=4)
                        ))
                        click.secho('CREATED DEPOSIT {0}:\n {1}'.format(
                            str(rec_uuid), json.dumps(deposit, indent=4)
                        ))
                    nb_records += 1
    if verbose > 0:
        click.secho('Created {} records!'.format(nb_records), fg='green')
def update_expired_embargoes(): """Release expired embargoes every midnight.""" logger = current_app.logger base_url = urlunsplit( (current_app.config.get('PREFERRED_URL_SCHEME', 'http'), current_app.config['JSONSCHEMAS_HOST'], current_app.config.get('APPLICATION_ROOT') or '', '', '')) # The task needs to run in a request context as JSON Schema validation # will use url_for. with current_app.test_request_context('/', base_url=base_url): s = B2ShareRecordsSearch( using=current_search_client, index='records').query( 'query_string', query='open_access:false AND embargo_date:{{* TO {0}}}'.format( datetime.now(timezone.utc).isoformat()), allow_leading_wildcard=False).fields([]) record_ids = [hit.meta.id for hit in s.scan()] if record_ids: logger.info('Changing access of {} embargoed publications' ' to public.'.format(len(record_ids))) for record in Record.get_records(record_ids): logger.debug('Making embargoed publication {} public'.format( record.id)) record['open_access'] = True record.commit() db.session.commit() indexer = RecordIndexer() indexer.bulk_index(record_ids) indexer.process_bulk_queue()
def demo_init():
    """Initialize demo site."""
    from flask import current_app
    records = []

    # Import bibliographic records
    click.secho('Importing bibliographic records', fg='green')
    records += import_records(
        marc21,
        current_app.extensions['invenio-jsonschemas'].path_to_url(
            'marc21/bibliographic/bd-v1.0.2.json'),
        pkg_resources.resource_filename('invenio_records',
                                        'data/marc21/bibliographic.xml'),
    )

    # FIXME add support for authority records.
    # Import authority records
    # click.secho('Importing authority records', fg='green')
    # records += import_records(
    #     marc21_authority,
    #     current_app.extensions['invenio-jsonschemas'].path_to_url(
    #         'marc21/authority/ad-v1.0.2.json'),
    #     pkg_resources.resource_filename(
    #         'invenio_records', 'data/marc21/authority.xml'),
    # )

    db.session.commit()

    # Index all records
    click.secho('Indexing records', fg='green')
    indexer = RecordIndexer()
    indexer.bulk_index(records)
    indexer.process_bulk_queue()
def test_open_access_permissions(client, json_headers, testdata, users):
    """Test GET open/close access documents."""
    # set the documents to have read access only by patron2. `_access` should
    # be totally ignored.
    indexer = RecordIndexer()
    doc1 = Document.get_record_by_pid("docid-open-access")
    doc2 = Document.get_record_by_pid("docid-closed-access")
    for doc in [doc1, doc2]:
        doc.update(dict(_access=dict(read=["patron2"])))
        doc.commit()
        db.session.commit()
        indexer.index(doc)
    current_search.flush_and_refresh(index="documents")

    test_data = [
        ("anonymous", "docid-open-access", 200, 1),
        ("patron1", "docid-open-access", 200, 1),
        ("patron2", "docid-open-access", 200, 1),
        ("librarian", "docid-open-access", 200, 1),
        ("admin", "docid-open-access", 200, 1),
        ("anonymous", "docid-closed-access", 401, 0),
        ("patron1", "docid-closed-access", 403, 0),
        ("patron2", "docid-closed-access", 403, 0),
        ("librarian", "docid-closed-access", 200, 1),
        ("admin", "docid-closed-access", 200, 1),
    ]

    for user, pid, status_code, n_hits in test_data:
        # item endpoint
        user_login(client, user, users)
        url = url_for("invenio_records_rest.docid_item", pid_value=pid)
        res = client.get(url, headers=json_headers)
        assert res.status_code == status_code

        # list endpoint
        user_login(client, user, users)
        url = url_for("invenio_records_rest.docid_list",
                      q="pid:{}".format(pid))
        res = client.get(url, headers=json_headers)
        hits = json.loads(res.data.decode("utf-8"))
        assert hits["hits"]["total"] == n_hits
def prepare_data(): """Prepare data.""" days = current_app.config[ "ILS_CIRCULATION_MAIL_OVERDUE_REMINDER_INTERVAL" ] loans = testdata["loans"] recs = [] now = arrow.utcnow() def new_end_date(loan, date): loan["end_date"] = date.date().isoformat() loan["state"] = "ITEM_ON_LOAN" loan.commit() recs.append(loan) # overdue loans date = now - timedelta(days=days) new_end_date(loans[0], date) date = now - timedelta(days=days * 2) new_end_date(loans[1], date) # not overdue date = now - timedelta(days=-1) new_end_date(loans[2], date) # not overdue or overdue but not to be notified remaining_not_overdue = loans[3:] for loan in remaining_not_overdue: days = random.choice([-1, 0, 1]) date = now - timedelta(days=days) new_end_date(loan, date) db.session.commit() indexer = RecordIndexer() for rec in recs: indexer.index(rec) current_search.flush_and_refresh(index="*")
def testdata(app, db, es_clear, patron1):
    """Create, index and return test data."""
    data = load_json_from_datadir("locations.json")
    locations = _create_records(db, data, Location, LOCATION_PID_TYPE)

    data = load_json_from_datadir("internal_locations.json")
    int_locs = _create_records(db, data, InternalLocation,
                               INTERNAL_LOCATION_PID_TYPE)

    data = load_json_from_datadir("documents.json")
    documents = _create_records(db, data, Document, DOCUMENT_PID_TYPE)

    data = load_json_from_datadir("series.json")
    series = _create_records(db, data, Series, SERIES_PID_TYPE)

    data = load_json_from_datadir("items.json")
    items = _create_records(db, data, Item, ITEM_PID_TYPE)

    data = load_json_from_datadir("eitems.json")
    eitems = _create_records(db, data, EItem, EITEM_PID_TYPE)

    data = load_json_from_datadir("loans.json")
    loans = _create_records(db, data, Loan, CIRCULATION_LOAN_PID_TYPE)

    # index
    ri = RecordIndexer()
    for rec in (locations + int_locs + series + documents +
                items + eitems + loans):
        ri.index(rec)

    current_search.flush_and_refresh(index="*")

    return {
        "documents": documents,
        "eitems": eitems,
        "internal_locations": int_locs,
        "items": items,
        "loans": loans,
        "locations": locations,
        "series": series,
    }
def curate(community):
    """Index page with uploader and list of existing depositions.

    :param community: The community to curate.
    """
    if request.method == 'POST':
        action = request.json.get('action')
        recid = request.json.get('recid')

        # 'recid' is mandatory
        if not recid:
            abort(400)
        if action not in ['accept', 'reject', 'remove']:
            abort(400)

        # Resolve recid to a Record
        resolver = Resolver(pid_type='recid', object_type='rec',
                            getter=Record.get_record)
        pid, record = resolver.resolve(recid)

        # Perform actions
        if action == "accept":
            community.accept_record(record)
        elif action == "reject":
            community.reject_record(record)
        elif action == "remove":
            community.remove_record(record)

        record.commit()
        db.session.commit()
        RecordIndexer().index_by_id(record.id)
        return jsonify({'status': 'success'})

    ctx = {'community': community}
    community_id = community.id
    community_flg = "0"

    # Get index style
    style = IndexStyle.get(
        current_app.config['WEKO_INDEX_TREE_STYLE_OPTIONS']['id'])
    width = style.width if style else '3'
    height = style.height if style else None

    sort_options, display_number = SearchSetting.get_results_setting()

    return render_template(
        current_app.config['COMMUNITIES_CURATE_TEMPLATE'],
        community_id=community_id,
        sort_option=sort_options,
        width=width,
        height=height,
        **ctx)
def test_get_record_no_acls_authenticated(app, db, es, es_acl_prepare,
                                          test_users):
    pid, record = create_record({}, clz=SchemaEnforcingRecord)
    RecordIndexer().index(record)

    # make sure it is flushed
    current_search_client.indices.flush()

    # try to get it ...
    with app.test_client() as client:
        login(client, test_users.u1)
        res = client.get(record_url(pid))
        assert res.status_code == 403  # Forbidden
def unindex_record_trigger(sender, *args, **kwargs):
    """Unindex the given record if it is a publication."""
    record = kwargs['record']
    if is_publication(record.model):
        # The indexer requires that the record still exists in the database
        # when it is removed from the search index. Thus we have to unindex
        # it synchronously.
        try:
            RecordIndexer().delete(record)
        except NotFoundError:
            pass
def indexed_records(es, records):
    """Fixture for the records, which are already indexed."""
    # es.indices.flush('*')
    # # delete all elasticsearch indices and recreate them
    # for deleted in current_search.delete(ignore=[404]):
    #     pass
    # for created in current_search.create(None):
    #     pass
    # flush the indices so that indexed records are searchable
    for pid_name, record in records.items():
        RecordIndexer().index(record)
    es.indices.flush('*')

    return records
def remove_oaiset_spec(record_uuid, spec):
    """Remove the OAI spec from the record and commit."""
    rec = Record.get_record(record_uuid)
    rec['_oai']['sets'] = sorted(
        [s for s in rec['_oai'].get('sets', []) if s != spec])
    rec['_oai']['updated'] = datetime_to_datestamp(datetime.utcnow())
    if not rec['_oai']['sets']:
        del rec['_oai']['sets']
    rec.commit()
    db.session.commit()
    RecordIndexer().bulk_index([
        str(rec.id),
    ])
def test_custom_search(es, api, json_headers, record_with_bucket,
                       custom_metadata, query, result):
    """Test custom metadata search."""
    pid, record = record_with_bucket
    record['custom'] = custom_metadata
    RecordIndexer().index(record)
    current_search.flush_and_refresh(index='records')
    with api.test_request_context():
        with api.test_client() as client:
            res = client.get(url_for('invenio_records_rest.recid_list',
                                     custom=query),
                             headers=json_headers)
            assert len(res.json) == result
def reindex_pid(pid_type, RecordClass):
    index_name = None
    indexer = RecordIndexer()
    for pid in tqdm.tqdm(PersistentIdentifier.query.filter_by(
            pid_type=pid_type, object_type='rec',
            status=PIDStatus.REGISTERED.value)):
        record = RecordClass.get_record(pid.object_uuid)
        if only and str(record.id) != only:
            continue
        try:
            index_name, doc_type = indexer.record_to_index(record)
            index_name = build_alias_name(index_name)
            # print('Indexing', record.get('id'), 'into', index_name)
            indexer.index(record)
        except:
            with open('/tmp/indexing-error.json', 'a') as f:
                print(json.dumps(record.dumps(), indent=4,
                                 ensure_ascii=False), file=f)
                traceback.print_exc(file=f)
            if raise_on_error:
                raise
    if index_name:
        current_search_client.indices.refresh(index_name)
        current_search_client.indices.flush(index_name)
def test_delete(app):
    """Test record indexing."""
    with app.app_context():
        recid = uuid.uuid4()
        record = Record.create({'title': 'Test'}, id_=recid)
        db.session.commit()

        client_mock = MagicMock()
        RecordIndexer(search_client=client_mock).delete(record)

        doc_type = app.config['INDEXER_DEFAULT_DOC_TYPE'] if lt_es7 \
            else '_doc'
        client_mock.delete.assert_called_with(
            id=str(recid),
            index=app.config['INDEXER_DEFAULT_INDEX'],
            doc_type=doc_type,
            version=record.revision_id,
            version_type='external_gte',
        )

        with patch('invenio_indexer.api.RecordIndexer.delete') as fun:
            RecordIndexer(search_client=client_mock).delete_by_id(recid)
            assert fun.called
def test_process_bulk_queue(app, queue):
    """Test process indexing."""
    with app.app_context():
        # Create a test record
        r = Record.create({'title': 'test'})
        db.session.commit()
        invalid_id2 = uuid.uuid4()

        RecordIndexer().bulk_index([r.id, invalid_id2])
        RecordIndexer().bulk_delete([r.id, invalid_id2])

        ret = {}

        def _mock_bulk(client, actions_iterator, **kwargs):
            ret['actions'] = list(actions_iterator)
            return len(ret['actions'])

        with patch('invenio_indexer.api.bulk', _mock_bulk):
            # Invalid actions are rejected
            assert RecordIndexer().process_bulk_queue() == 2
            assert [x['_op_type'] for x in ret['actions']] == \
                ['index', 'delete']
def request(community_id, record_id, accept):
    """Request a record acceptance to a community."""
    c = Community.get(community_id)
    assert c is not None
    record = Record.get_record(record_id)
    if accept:
        c.add_record(record)
        record.commit()
    else:
        InclusionRequest.create(community=c, record=record, notify=False)
    db.session.commit()
    RecordIndexer().index_by_id(record.id)
def test_oai_set_result_count(mocker, audit_records, db, es, communities,
                              oai_sources, issues):
    db_records, es_records, oai2d_records = oai_sources
    for recid in db_records:
        _, record = record_resolver.resolve(recid)
        record['_oai']['sets'] = ['user-c1']
        record.commit()
    db.session.commit()

    indexer = RecordIndexer()
    for recid in es_records:
        _, record = record_resolver.resolve(recid)
        record['_oai']['sets'] = ['user-c1']
        indexer.index(record)
    current_search.flush_and_refresh(index='records')

    # '/oai2d' needs straight-forward cheating... There's no way to be sure
    # why the endpoint sometimes fails to report the correct results. It
    # could be a Resumption Token issue, or even an indexing issue on
    # Elasticsearch. Either way, we have to be able to replicate when running
    # on production this behavior and report it as an issue.
    oai2d_ids_mock = MagicMock()
    oai2d_ids_mock.return_value = set(oai2d_records)
    oai2d_ids_mock = mocker.patch(
        'zenodo.modules.auditor.oai.OAISetResultCheck'
        '._oai2d_endpoint_identifiers',
        new=oai2d_ids_mock)

    audit = OAIAudit('testAudit', logging.getLogger('auditorTesting'), [])
    check = OAISetResultCheck(audit, Community.get('c1'))
    check.perform()
    audit.clear_db_oai_set_cache()

    result_issues = check.issues.get('missing_ids', {})
    db_issues, es_issues, api_issues = issues
    assert set(result_issues.get('db', [])) == set(db_issues)
    assert set(result_issues.get('es', [])) == set(es_issues)
    assert set(result_issues.get('oai2d', [])) == set(api_issues)
def create_record(data):
    """Create a record.

    :param dict data: The record data.
    """
    indexer = RecordIndexer()
    with db.session.begin_nested():
        # create uuid
        rec_uuid = uuid.uuid4()
        # add the schema
        host = current_app.config.get('JSONSCHEMAS_HOST')
        data["$schema"] = \
            current_app.extensions['invenio-jsonschemas'].path_to_url(
                'custom_record/custom-record-v1.0.0.json')
        # create PID
        current_pidstore.minters['custid'](
            rec_uuid, data, pid_value='custom_pid_{}'.format(rec_uuid))
        # create record
        created_record = Record.create(data, id_=rec_uuid)
        # index the record
        indexer.index(created_record)
    db.session.commit()
def delete(cls, data, vendor=None, delindex=True, force=False):
    """Delete an IrokoRecord record."""
    assert data.get(cls.pid_uuid_field)
    pid = data.get(cls.pid_uuid_field)
    record = cls.get_record_by_pid(pid, with_deleted=False)
    pid.delete()
    result = record.delete(force=force)
    if delindex:
        try:
            RecordIndexer().delete(record)
        except NotFoundError:
            pass
    return result
def test_records_serializers_dc(app, test_records_data):
    with app.app_context():
        pid, record = make_record(test_records_data)
        rec = {
            '_source': RecordIndexer._prepare_record(
                record, 'records', 'record').copy(),
            '_version': record.revision_id
        }
        dcxml = oaipmh_oai_dc(pid=pid, record=rec)
        namespaces = {'dc': 'http://purl.org/dc/elements/1.1/'}
        identifiers = dcxml.xpath('//dc:identifier', namespaces=namespaces)
        titles = dcxml.xpath('//dc:title', namespaces=namespaces)
        creators = dcxml.xpath('//dc:creator', namespaces=namespaces)
        descriptions = dcxml.xpath('//dc:description', namespaces=namespaces)
        subjects = dcxml.xpath('//dc:subject', namespaces=namespaces)
        contributors = dcxml.xpath('//dc:contributor', namespaces=namespaces)
        rights = dcxml.xpath('//dc:rights', namespaces=namespaces)
        publishers = dcxml.xpath('//dc:publisher', namespaces=namespaces)
        languages = dcxml.xpath('//dc:language', namespaces=namespaces)
        types = dcxml.xpath('//dc:type', namespaces=namespaces)

        assert identifiers
        for x in identifiers:
            assert x.text.endswith(pid.pid_value)
        assert [x.text for x in titles] == [r['title']
                                            for r in record['titles']]
        assert [x.text for x in creators
                ] == [r['creator_name'] for r in record['creators']]
        assert [x.text for x in descriptions
                ] == [r['description'] for r in record['descriptions']]
        assert [x.text for x in types] == [
            r['resource_type_general'] for r in record['resource_types']
        ]
        assert [x.text for x in contributors
                ] == [r['contributor_name'] for r in record['contributors']]
        assert [x.text for x in publishers] == [record['publisher']]
        assert [x.text for x in languages] == [record['language']]
        assert [x.text for x in subjects] == record.get('keywords')

        rights = [x.text for x in rights]
        access = 'info:eu-repo/semantics/closedAccess'
        if record['open_access']:
            access = 'info:eu-repo/semantics/openAccess'
        assert access in rights
        license = record.get('license', {}).get('license')
        if license:
            assert license in rights
def delete(self, **kwargs):
    """Delete a record."""
    from b2share.modules.deposit.api import Deposit
    from b2share.modules.deposit.providers import DepositUUIDProvider

    pid = self.pid
    # Fetch deposit id from record and resolve deposit record and pid.
    depid = PersistentIdentifier.get(DepositUUIDProvider.pid_type,
                                     pid.pid_value)
    if depid.status == PIDStatus.REGISTERED:
        depid, deposit = Resolver(
            pid_type=depid.pid_type,
            object_type='rec',
            # Retrieve the deposit with the Record class on purpose
            # as the current Deposit api prevents the deletion of
            # published deposits.
            getter=Deposit.get_record,
        ).resolve(depid.pid_value)
        deposit.delete()

    # Mark all record's PIDs as DELETED
    all_pids = PersistentIdentifier.query.filter(
        PersistentIdentifier.object_type == pid.object_type,
        PersistentIdentifier.object_uuid == pid.object_uuid,
    ).all()
    for rec_pid in all_pids:
        if not rec_pid.is_deleted():
            rec_pid.delete()

    # Mark the bucket as deleted
    # delete all buckets linked to the deposit
    res = Bucket.query.join(RecordsBuckets).\
        filter(RecordsBuckets.bucket_id == Bucket.id,
               RecordsBuckets.record_id == self.id).all()
    for bucket in res:
        bucket.deleted = True

    # Mark the record and deposit as deleted. The record is unindexed
    # via the trigger on record deletion.
    super(B2ShareRecord, self).delete()

    version_master = PIDNodeVersioning(pid=pid)
    # If the parent has no other children and no draft child
    # mark it as deleted
    if not version_master.children.all():
        if not version_master.draft_child:
            version_master.parent.delete()
    else:
        # Reindex the "new" last published version in order to have
        # its "is_last_version" up to date.
        RecordIndexer().index_by_id(
            version_master.last_child.object_uuid)
def import_v1_data(verbose, download, token, download_directory, limit):
    click.secho("Importing data to the current instance")
    logger = logging.getLogger("sqlalchemy.engine")
    logger.setLevel(logging.ERROR)

    logfile = open(current_app.config.get('MIGRATION_LOGFILE'), 'a')
    logfile.write("\n\n\n~~~ Starting import task download={} limit={}"
                  .format(download, limit))
    if os.path.isdir(download_directory):
        os.chdir(download_directory)
    else:
        raise click.ClickException(
            "%s does not exist or is not a directory. If you want to import "
            "records specify an empty, existing directory."
            % download_directory)
    if limit and not download:
        raise click.ClickException("Limit can only be set with download")

    if download:
        filelist = os.listdir('.')
        if len(filelist) > 0:
            click.secho("!!! Downloading data into existing directory, "
                        "overwriting previous data", fg='red')
        click.secho("----------")
        click.secho("Downloading data into directory %s"
                    % download_directory)
        if limit is not None:
            limit = int(limit)
            click.secho("Limiting to %d records for debug purposes" % limit)
        download_v1_data(token, download_directory, logfile, limit)

    indexer = RecordIndexer(record_to_index=record_to_index)
    dirlist = os.listdir('.')

    click.secho("-----------")
    click.secho("Processing %d downloaded records" % (len(dirlist)))

    base_url = urlunsplit((
        current_app.config.get('PREFERRED_URL_SCHEME', 'http'),
        # current_app.config['SERVER_NAME'],
        current_app.config['JSONSCHEMAS_HOST'],
        current_app.config.get('APPLICATION_ROOT') or '', '', ''
    ))
    for d in dirlist:
        try:
            process_v1_record(d, indexer, base_url, logfile)
        except:
            logfile.write("\n********************")
            logfile.write("\nERROR: exception while processing record "
                          "/{}/___record.json___\n".format(d))
            logfile.write(traceback.format_exc())
            logfile.write("\n********************")
    logfile.close()
def bulk_index_records(records):
    """Bulk index a list of records."""
    indexer = RecordIndexer()

    click.echo("Bulk indexing {} records...".format(len(records)))
    indexer.bulk_index([str(r.id) for r in records])
    indexer.process_bulk_queue()
    click.echo("Indexing completed!")
def prepare_data(): """Prepare data.""" loans = testdata["loans"] recs = [] now = arrow.utcnow() def new_expiration_date(loan, date): loan["request_expire_date"] = date.date().isoformat() loan["state"] = "PENDING" loan.commit() recs.append(loan) # expired loans date = now - timedelta(days=1) new_expiration_date(loans[0], date) new_expiration_date(loans[1], date) date = now - timedelta(days=2) new_expiration_date(loans[2], date) expired_pids = [loans[0]["pid"], loans[1]["pid"], loans[2]["pid"]] # not expired loans not_expired_pids = [] remaining_not_expired = loans[3:] n_days = 0 # today for loan in remaining_not_expired: date = now + timedelta(days=n_days) new_expiration_date(loan, date) not_expired_pids.append(loan["pid"]) n_days += 1 db.session.commit() indexer = RecordIndexer() for rec in recs: indexer.index(rec) current_search.flush_and_refresh(index="*") return expired_pids, not_expired_pids
def test_delete_action(app):
    """Test delete action."""
    with app.app_context():
        testid = str(uuid.uuid4())
        action = RecordIndexer()._delete_action(
            dict(id=testid, op='delete', index='idx', doc_type='doc'))
        assert action['_op_type'] == 'delete'
        assert action['_index'] == 'idx'
        assert action['_type'] == 'doc'
        assert action['_id'] == testid

        # Skip JSONSchema validation
        with patch('invenio_records.api.Record.validate'):
            record = Record.create({
                '$schema': {
                    '$ref': '/records/authorities/authority-v1.0.0.json'
                },
                'title': 'Test',
            })
            db.session.commit()

        action = RecordIndexer()._delete_action(
            dict(id=str(record.id), op='delete', index=None, doc_type=None))
        assert action['_op_type'] == 'delete'
        assert action['_index'] == 'records-authorities-authority-v1.0.0'
        assert action['_type'] == 'authority-v1.0.0' if lt_es7 else '_doc'
        assert action['_id'] == str(record.id)

        record.delete()
        db.session.commit()

        action = RecordIndexer()._delete_action(
            dict(id=str(record.id), op='delete', index=None, doc_type=None))
        assert action['_op_type'] == 'delete'
        # Deleted record doesn't have '$schema', so index and doc type
        # cannot be determined, resulting to the defaults from config
        assert action['_index'] == app.config['INDEXER_DEFAULT_INDEX']
        assert action['_type'] == \
            app.config['INDEXER_DEFAULT_DOC_TYPE'] if lt_es7 else '_doc'
        assert action['_id'] == str(record.id)