def update_record_statistics(start_date=None, end_date=None):
    """Update "_stats" field of affected records."""
    start_date = dateutil_parse(start_date) if start_date else None
    # Fixed: originally guarded by `if start_date`, which dropped end_date.
    end_date = dateutil_parse(end_date) if end_date else None
    aggr_configs = {}

    if not start_date and not end_date:
        start_date = datetime.utcnow()
        end_date = datetime.utcnow()
        for aggr_name, aggr_cfg in current_stats.aggregations.items():
            aggr = aggr_cfg.cls(name=aggr_cfg.name, **aggr_cfg.params)
            if not Index(aggr.index, using=aggr.client).exists():
                if not Index(aggr.event_index, using=aggr.client).exists():
                    start_date = min(start_date, datetime.utcnow())
                else:
                    start_date = min(
                        start_date, aggr._get_oldest_event_timestamp())

            # Retrieve the last two bookmarks
            bookmarks = aggr.list_bookmarks(limit=2)
            if len(bookmarks) >= 1:
                end_date = max(
                    end_date,
                    datetime.strptime(bookmarks[0].date, aggr.doc_id_suffix))
            if len(bookmarks) == 2:
                start_date = min(
                    start_date,
                    datetime.strptime(bookmarks[1].date, aggr.doc_id_suffix))
            aggr_configs[aggr.index] = aggr
    elif start_date and end_date:
        for aggr_name, aggr_cfg in current_stats.aggregations.items():
            aggr = aggr_cfg.cls(name=aggr_cfg.name, **aggr_cfg.params)
            aggr_configs[aggr.index] = aggr
    else:
        return

    # Get conceptrecids for all the affected records between the two dates
    conceptrecids = set()
    for aggr_index, aggr in aggr_configs.items():
        query = Search(
            using=aggr.client,
            index=aggr.index,
            doc_type=aggr.doc_type,
        ).filter('range', timestamp={
            'gte': start_date.replace(microsecond=0).isoformat() + '||/d',
            'lte': end_date.replace(microsecond=0).isoformat() + '||/d',
        }).source(include='conceptrecid')
        conceptrecids |= {b.conceptrecid for b in query.scan()}

    indexer = RecordIndexer()
    for conceptrecid_val in conceptrecids:
        conceptrecid = PersistentIdentifier.get('recid', conceptrecid_val)
        pv = PIDVersioning(parent=conceptrecid)
        children_recids = pv.children.all()
        indexer.bulk_index([str(p.object_uuid) for p in children_recids])
def update_expired_embargoes():
    """Release expired embargoes every midnight."""
    logger = current_app.logger
    base_url = urlunsplit((
        current_app.config.get('PREFERRED_URL_SCHEME', 'http'),
        current_app.config['JSONSCHEMAS_HOST'],
        current_app.config.get('APPLICATION_ROOT') or '',
        '', ''
    ))
    # The task needs to run in a request context as JSON Schema validation
    # will use url_for.
    with current_app.test_request_context('/', base_url=base_url):
        s = B2ShareRecordsSearch(
            using=current_search_client,
            index='records'
        ).query(
            'query_string',
            query='open_access:false AND embargo_date:{{* TO {0}}}'.format(
                datetime.now(timezone.utc).isoformat()
            ),
            allow_leading_wildcard=False
        ).fields([])
        record_ids = [hit.meta.id for hit in s.scan()]
        if record_ids:
            logger.info('Changing access of {} embargoed publications'
                        ' to public.'.format(len(record_ids)))
        for record in Record.get_records(record_ids):
            logger.debug('Making embargoed publication {} public'.format(
                record.id))
            record['open_access'] = True
            record.commit()
        db.session.commit()
        indexer = RecordIndexer()
        indexer.bulk_index(record_ids)
        indexer.process_bulk_queue()
def demo_init():
    """Initialize demo site."""
    from flask import current_app

    records = []
    # Import bibliographic records
    click.secho('Importing bibliographic records', fg='green')
    records += import_records(
        marc21,
        current_app.extensions['invenio-jsonschemas'].path_to_url(
            'marc21/bibliographic/bd-v1.0.2.json'),
        pkg_resources.resource_filename('invenio_records',
                                        'data/marc21/bibliographic.xml'),
    )
    # FIXME add support for authority records.
    # Import authority records
    # click.secho('Importing authority records', fg='green')
    # records += import_records(
    #     marc21_authority,
    #     current_app.extensions['invenio-jsonschemas'].path_to_url(
    #         'marc21/authority/ad-v1.0.2.json'),
    #     pkg_resources.resource_filename(
    #         'invenio_records', 'data/marc21/authority.xml'),
    # )
    db.session.commit()

    # Index all records
    click.secho('Indexing records', fg='green')
    indexer = RecordIndexer()
    indexer.bulk_index(records)
    indexer.process_bulk_queue()
def create_fake_record(bulk_size, fake):
    """Create records for demo purposes."""
    records_bulk = []
    start = timeit.default_timer()
    for _ in range(bulk_size):
        # Create fake record metadata
        record_data = {
            "contributors": [{"name": fake.name()}],
            "description": fake.bs(),
            "title": fake.company() + "'s dataset",
        }
        # Create record in DB
        rec_uuid = uuid.uuid4()
        current_pidstore.minters["recid"](rec_uuid, record_data)
        Record.create(record_data, id_=rec_uuid)
        # Add record for bulk indexing
        records_bulk.append(rec_uuid)

    # Flush to index and database
    db.session.commit()
    click.secho(f"Writing {bulk_size} records to the database", fg="green")

    # Bulk index records
    ri = RecordIndexer()
    ri.bulk_index(records_bulk)
    current_search.flush_and_refresh(index="records")
    click.secho(f"Sending {bulk_size} records to be indexed", fg="green")

    stop = timeit.default_timer()
    click.secho(f"Creating {bulk_size} records took {stop - start}.",
                fg="green")
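# Usage sketch (hedged): the command above only queues the UUIDs for bulk
# indexing; a separate call (or a running worker) still has to drain the
# queue. The Faker instance and the trailing process_bulk_queue() call are
# assumptions for illustration, not part of the original function.
from faker import Faker

fake = Faker()
create_fake_record(100, fake)          # queue 100 fake records
RecordIndexer().process_bulk_queue()   # drain the queue if no worker runs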
def test_indexer_bulk_index(app, queue):
    """Test delay indexing."""
    with app.app_context():
        with establish_connection() as c:
            indexer = RecordIndexer()
            id1 = uuid.uuid4()
            id2 = uuid.uuid4()
            indexer.bulk_index([id1, id2])
            indexer.bulk_delete([id1, id2])

            consumer = Consumer(
                connection=c,
                queue=indexer.mq_queue.name,
                exchange=indexer.mq_exchange.name,
                routing_key=indexer.mq_routing_key)

            messages = list(consumer.iterqueue())
            [m.ack() for m in messages]

            assert len(messages) == 4

            data0 = messages[0].decode()
            assert data0['id'] == str(id1)
            assert data0['op'] == 'index'
            data2 = messages[2].decode()
            assert data2['id'] == str(id1)
            assert data2['op'] == 'delete'
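# Payload sketch (hedged): as the assertions above show, bulk_index() and
# bulk_delete() enqueue one small dict per record with the UUID and the
# operation name. A consumer draining the queue by hand would see payloads
# like the one below; any fields beyond 'id' and 'op' are an assumption.
for message in consumer.iterqueue():
    payload = message.decode()   # e.g. {'id': '<uuid>', 'op': 'index'}
    print(payload['id'], payload['op'])
    message.ack()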
def update_expired_embargos():
    """Release expired embargoes every midnight."""
    logger = current_app.logger
    base_url = urlunsplit((
        current_app.config.get('PREFERRED_URL_SCHEME', 'http'),
        current_app.config['JSONSCHEMAS_HOST'],
        current_app.config.get('APPLICATION_ROOT') or '',
        '', ''))
    # The task needs to run in a request context as JSON Schema validation
    # will use url_for.
    with current_app.test_request_context('/', base_url=base_url):
        s = B2ShareRecordsSearch(
            using=current_search_client,
            index='records').query(
                'query_string',
                query='open_access:false AND embargo_date:{{* TO {0}}}'.format(
                    datetime.now(timezone.utc).isoformat()),
                allow_leading_wildcard=False).fields([])
        record_ids = [hit.meta.id for hit in s.scan()]
        if record_ids:
            logger.info('Changing access of {} embargoed publications'
                        ' to public.'.format(len(record_ids)))
        for record in Record.get_records(record_ids):
            logger.debug('Making embargoed publication {} public'.format(
                record.id))
            record['open_access'] = True
            record.commit()
        db.session.commit()
        indexer = RecordIndexer()
        indexer.bulk_index(record_ids)
        indexer.process_bulk_queue()
def bulk_index_records(records):
    """Bulk index a list of records."""
    indexer = RecordIndexer()
    click.echo("Bulk indexing {} records...".format(len(records)))
    indexer.bulk_index([str(r.id) for r in records])
    indexer.process_bulk_queue()
    click.echo("Indexing completed!")
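# Usage sketch (hedged): the helper expects objects exposing an `.id`
# attribute, e.g. Invenio `Record` instances. Fetching the records by UUID
# first is an assumption about the caller; `uuid_list` is a placeholder.
records = Record.get_records(uuid_list)  # `uuid_list` assumed to exist
bulk_index_records(records)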
def update_expired_embargos():
    """Release expired embargoes every midnight."""
    record_ids = AccessRight.get_expired_embargos()

    for record in Record.get_records(record_ids):
        record['access_right'] = AccessRight.OPEN
        record.commit()
    db.session.commit()

    indexer = RecordIndexer()
    indexer.bulk_index(record_ids)
    indexer.process_bulk_queue()
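# Scheduling sketch (hedged): the docstring says this task runs "every
# midnight", which with Celery is typically wired up through a beat
# schedule like the one below. The task path and config key placement are
# placeholders, not taken from the original source.
from celery.schedules import crontab

CELERY_BEAT_SCHEDULE = {
    'release-expired-embargoes': {
        'task': 'myapp.tasks.update_expired_embargos',  # hypothetical path
        'schedule': crontab(minute=0, hour=0),          # daily at midnight
    },
}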
def create_records(records):
    """Async records creation and indexing."""
    record_indexer = RecordIndexer()
    record_uuids = []
    for record in records:
        uid = uuid.uuid4()
        bibid_minter(uid, record)
        record = Record.create(record, id_=uid)
        record_uuids.append(uid)
    record_indexer.bulk_index(record_uuids)
    record_indexer.process_bulk_queue()
    db.session.commit()
def load_custom_records():
    """Initialize demo site."""
    from flask import current_app
    current_app.config['RECORDS_REST_DEFAULT_READ_PERMISSION_FACTORY'] = \
        allow_all

    # Import custom records
    click.secho('Importing custom records', fg='green')
    records = import_records()
    db.session.commit()

    # Index all records
    click.secho('Indexing records', fg='green')
    indexer = RecordIndexer()
    indexer.bulk_index(records)
    indexer.process_bulk_queue()
def marc21_import(dojson_model, input):
    """Import MARCXML records."""
    from flask import current_app
    if dojson_model == marc21:
        schema = current_app.extensions['invenio-jsonschemas'].path_to_url(
            'marc21/bibliographic/bd-v1.0.0.json')
    elif dojson_model == marc21_authority:
        schema = current_app.extensions['invenio-jsonschemas'].path_to_url(
            'marc21/authority/ad-v1.0.0.json')

    # Create records
    click.secho('Importing records', fg='green')
    record_ids = import_records(dojson_model, schema, input)
    db.session.commit()

    # Index records
    click.secho('Indexing records', fg='green')
    indexer = RecordIndexer()
    indexer.bulk_index(record_ids)
    indexer.process_bulk_queue()
def bulk_index(uuids, process=False, verbose=False):
    """Bulk index records."""
    if verbose:
        click.echo(' add to index: {count}'.format(count=len(uuids)))
    indexer = RecordIndexer()
    retry = True
    minutes = 1
    # Retry with exponential backoff until the bulk queue accepts the UUIDs.
    while retry:
        try:
            indexer.bulk_index(uuids)
            retry = False
        except Exception as exc:
            msg = 'Bulk Index Error: retry in {minutes} min {exc}'.format(
                exc=exc, minutes=minutes)
            current_app.logger.error(msg)
            if verbose:
                click.secho(msg, fg='red')
            sleep(minutes * 60)
            retry = True
            minutes *= 2
    if process:
        indexer.process_bulk_queue()
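# Usage sketch (hedged): with `process=True` the helper both queues the
# UUIDs (retrying with exponential backoff on failure) and drains the bulk
# queue in the same call. The `records` iterable below is an assumption
# about the caller.
uuids = [str(record.id) for record in records]  # `records` assumed to exist
bulk_index(uuids, process=True, verbose=True)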
def data(
    n_docs,
    n_items,
    n_eitems,
    n_loans,
    n_tags,
    n_intlocs,
    n_series,
    n_document_requests,
):
    """Insert demo data."""
    click.secho("Generating demo data", fg="yellow")

    indexer = RecordIndexer()

    holder = Holder(
        patrons_pids=["1", "2", "5", "6"],
        librarian_pid="4",
        total_intloc=n_intlocs,
        total_tags=n_tags,
        total_items=n_items,
        total_eitems=n_eitems,
        total_documents=n_docs,
        total_loans=n_loans,
        total_series=n_series,
        total_document_requests=n_document_requests,
    )

    click.echo("Creating locations...")
    loc_generator = LocationGenerator(holder, minter)
    loc_generator.generate()
    rec = loc_generator.persist()
    indexer.index(rec)

    # InternalLocations
    intlocs_generator = InternalLocationGenerator(holder, minter)
    intlocs_generator.generate()
    rec_intlocs = intlocs_generator.persist()

    # Tags
    click.echo("Creating tags...")
    tags_generator = TagGenerator(holder, minter)
    tags_generator.generate()
    rec_tags = tags_generator.persist()

    # Series
    click.echo("Creating series...")
    series_generator = SeriesGenerator(holder, minter)
    series_generator.generate()
    rec_series = series_generator.persist()

    # Documents
    click.echo("Creating documents...")
    documents_generator = DocumentGenerator(holder, minter)
    documents_generator.generate()
    rec_docs = documents_generator.persist()

    # Items
    click.echo("Creating items...")
    items_generator = ItemGenerator(holder, minter)
    items_generator.generate()
    rec_items = items_generator.persist()

    # EItems
    click.echo("Creating eitems...")
    eitems_generator = EItemGenerator(holder, minter)
    eitems_generator.generate()
    rec_eitems = eitems_generator.persist()

    # Loans
    click.echo("Creating loans...")
    loans_generator = LoanGenerator(holder, minter)
    loans_generator.generate()
    rec_loans = loans_generator.persist()

    # Related records
    click.echo("Creating related records...")
    related_generator = RecordRelationsGenerator(holder, minter)
    related_generator.generate(rec_docs, rec_series)
    related_generator.persist()

    # Document requests
    click.echo("Creating document requests...")
    document_requests_generator = DocumentRequestGenerator(holder, minter)
    document_requests_generator.generate()
    rec_requests = document_requests_generator.persist()

    # index locations
    indexer.bulk_index([str(r.id) for r in rec_intlocs])
    click.echo("Sent to the indexing queue {0} locations".format(
        len(rec_intlocs)))

    # index tags
    indexer.bulk_index([str(r.id) for r in rec_tags])
    click.echo("Sent to the indexing queue {0} tags".format(len(rec_tags)))

    # process queue so series can resolve tags correctly
    indexer.process_bulk_queue()

    # index series
    indexer.bulk_index([str(r.id) for r in rec_series])
    click.echo("Sent to the indexing queue {0} series".format(
        len(rec_series)))

    # index loans
    indexer.bulk_index([str(r.id) for r in rec_loans])
    click.echo("Sent to the indexing queue {0} loans".format(len(rec_loans)))

    click.secho("Now indexing...", fg="green")
    # process queue so items can resolve circulation status correctly
    indexer.process_bulk_queue()

    # index eitems
    indexer.bulk_index([str(r.id) for r in rec_eitems])
    click.echo("Sent to the indexing queue {0} eitems".format(
        len(rec_eitems)))

    # index items
    indexer.bulk_index([str(r.id) for r in rec_items])
    click.echo("Sent to the indexing queue {0} items".format(len(rec_items)))

    click.secho("Now indexing...", fg="green")
    # process queue so documents can resolve circulation correctly
    indexer.process_bulk_queue()

    # index document requests
    indexer.bulk_index([str(r.id) for r in rec_requests])
    click.echo("Sent to the indexing queue {0} document requests".format(
        len(rec_requests)))

    click.secho("Now indexing...", fg="green")
    indexer.process_bulk_queue()

    # flush all indices after indexing, otherwise ES won't be ready for tests
    current_search.flush_and_refresh(index="*")

    # index documents
    indexer.bulk_index([str(r.id) for r in rec_docs])
    click.echo("Sent to the indexing queue {0} documents".format(
        len(rec_docs)))

    # index loans again
    indexer.bulk_index([str(r.id) for r in rec_loans])
    click.echo("Sent to the indexing queue {0} loans".format(len(rec_loans)))

    click.secho("Now indexing...", fg="green")
    indexer.process_bulk_queue()
def update_record_statistics(start_date=None, end_date=None):
    """Update "_stats" field of affected records."""
    start_date = dateutil_parse(start_date) if start_date else None
    # Fixed: originally guarded by `if start_date`, which dropped end_date.
    end_date = dateutil_parse(end_date) if end_date else None
    aggr_configs = {}

    if not start_date and not end_date:
        start_date = datetime.utcnow()
        end_date = datetime.utcnow()
        for aggr_name in current_stats.enabled_aggregations:
            aggr_cfg = current_stats.aggregations[aggr_name]
            aggr = aggr_cfg.aggregator_class(
                name=aggr_cfg.name, **aggr_cfg.aggregator_config)
            if not Index(aggr.aggregation_alias, using=aggr.client).exists():
                if not Index(aggr.event_index, using=aggr.client).exists():
                    start_date = min(start_date, datetime.utcnow())
                else:
                    start_date = min(
                        start_date, aggr._get_oldest_event_timestamp())

            # Retrieve the last two bookmarks
            bookmarks = Search(
                using=aggr.client,
                index=aggr.aggregation_alias,
                doc_type=aggr.bookmark_doc_type
            )[0:2].sort({'date': {'order': 'desc'}}).execute()
            if len(bookmarks) >= 1:
                end_date = max(
                    end_date,
                    datetime.strptime(bookmarks[0].date, aggr.doc_id_suffix))
            if len(bookmarks) == 2:
                start_date = min(
                    start_date,
                    datetime.strptime(bookmarks[1].date, aggr.doc_id_suffix))
            aggr_configs[aggr.aggregation_alias] = aggr
    elif start_date and end_date:
        for aggr_name in current_stats.enabled_aggregations:
            aggr_cfg = current_stats.aggregations[aggr_name]
            aggr = aggr_cfg.aggregator_class(
                name=aggr_cfg.name, **aggr_cfg.aggregator_config)
            aggr_configs[aggr.aggregation_alias] = aggr
    else:
        return

    # Get conceptrecids for all the affected records between the two dates
    conceptrecids = set()
    for aggr_alias, aggr in aggr_configs.items():
        query = Search(
            using=aggr.client,
            index=aggr.aggregation_alias,
            doc_type=aggr.aggregation_doc_type,
        ).filter('range', timestamp={
            'gte': start_date.replace(microsecond=0).isoformat() + '||/d',
            'lte': end_date.replace(microsecond=0).isoformat() + '||/d',
        }).extra(_source=False)
        query.aggs.bucket('ids', 'terms', field='conceptrecid', size=0)
        conceptrecids |= {
            b.key for b in query.execute().aggregations.ids.buckets}

    indexer = RecordIndexer()
    for conceptrecid_val in conceptrecids:
        conceptrecid = PersistentIdentifier.get('recid', conceptrecid_val)
        pv = PIDVersioning(parent=conceptrecid)
        children_recids = pv.children.all()
        indexer.bulk_index([str(p.object_uuid) for p in children_recids])
def data(n_docs, n_items, n_loans):
    """Insert demo data."""
    indexer = RecordIndexer()

    rec_location = create_loc_record()
    db.session.commit()
    indexer.index(rec_location)

    rec_int_locs = []
    with click.progressbar(get_internal_locations(rec_location),
                           label="Internal Locations") as ilocs:
        for iloc in ilocs:
            rec = create_iloc_record(iloc, rec_location[Location.pid_field])
            rec_int_locs.append(rec)

    documents, items = get_documents_items(rec_int_locs,
                                           n_docs=n_docs,
                                           n_items=n_items)

    rec_docs = []
    with click.progressbar(documents, label="Documents") as docs:
        for doc in docs:
            rec = create_doc_record(doc)
            rec_docs.append(rec)

    rec_items = []
    with click.progressbar(items, label="Items") as _items:
        for item in _items:
            iloc = rec_int_locs[randint(0, len(rec_int_locs) - 1)]
            rec = create_item_record(item, iloc[InternalLocation.pid_field])
            rec_items.append(rec)

    db.session.commit()

    loans = get_loans_for_items(
        rec_items,
        rec_location,
        patron_ids=["1", "2"],
        librarian_id="4",
        n_loans=n_loans,
    )
    rec_loans = []
    with click.progressbar(loans, label="Loans") as _loans:
        for _loan in _loans:
            rec = create_loan_record(_loan)
            rec_loans.append(rec)

    db.session.commit()

    # index locations
    indexer.bulk_index([str(r.id) for r in rec_int_locs])
    click.echo('Sent to the indexing queue {0} locations'.format(
        len(rec_int_locs)))

    # index loans
    indexer.bulk_index([str(r.id) for r in rec_loans])
    click.echo('Sent to the indexing queue {0} loans'.format(len(rec_loans)))

    click.secho('Now indexing...', fg='green')
    # process queue so items can resolve circulation status correctly
    indexer.process_bulk_queue()

    # index items
    indexer.bulk_index([str(r.id) for r in rec_items])
    click.echo('Sent to the indexing queue {0} items'.format(len(rec_items)))

    click.secho('Now indexing...', fg='green')
    # process queue so documents can resolve circulation correctly
    indexer.process_bulk_queue()

    # sleep to give time for items to be indexed
    time.sleep(1)

    # index documents
    indexer.bulk_index([str(r.id) for r in rec_docs])
    click.echo('Sent to the indexing queue {0} documents'.format(
        len(rec_docs)))

    click.secho('Now indexing...', fg='green')
    indexer.process_bulk_queue()
def data(n_docs, n_items, n_eitems, n_loans, n_keywords, n_intlocs, n_series):
    """Insert demo data."""
    click.secho('Generating demo data', fg='yellow')

    indexer = RecordIndexer()

    holder = Holder(
        patrons_pids=["1", "2", "5", "6"],
        librarian_pid="4",
        total_intloc=n_intlocs,
        total_keywords=n_keywords,
        total_items=n_items,
        total_eitems=n_eitems,
        total_documents=n_docs,
        total_loans=n_loans,
        total_series=n_series,
    )

    click.echo('Creating locations...')
    loc_generator = LocationGenerator(holder, minter)
    loc_generator.generate()
    rec = loc_generator.persist()
    indexer.index(rec)

    # InternalLocations
    intlocs_generator = InternalLocationGenerator(holder, minter)
    intlocs_generator.generate()
    rec_intlocs = intlocs_generator.persist()

    # Keywords
    click.echo('Creating keywords...')
    keywords_generator = KeywordGenerator(holder, minter)
    keywords_generator.generate()
    rec_keywords = keywords_generator.persist()

    # Series
    click.echo('Creating series...')
    series_generator = SeriesGenerator(holder, minter)
    series_generator.generate()
    rec_series = series_generator.persist()

    # Documents
    click.echo('Creating documents...')
    documents_generator = DocumentGenerator(holder, minter)
    documents_generator.generate()
    rec_docs = documents_generator.persist()

    # Items
    click.echo('Creating items...')
    items_generator = ItemGenerator(holder, minter)
    items_generator.generate()
    rec_items = items_generator.persist()

    # EItems
    click.echo('Creating eitems...')
    eitems_generator = EItemGenerator(holder, minter)
    eitems_generator.generate()
    rec_eitems = eitems_generator.persist()

    # Loans
    click.echo('Creating loans...')
    loans_generator = LoanGenerator(holder, minter)
    loans_generator.generate()
    rec_loans = loans_generator.persist()

    # Related records
    click.echo('Creating related records...')
    related_generator = RelatedRecordsGenerator(holder, minter)
    related_generator.generate(rec_docs, rec_series)
    related_generator.persist()

    # index locations
    indexer.bulk_index([str(r.id) for r in rec_intlocs])
    click.echo('Sent to the indexing queue {0} locations'.format(
        len(rec_intlocs)))

    # index keywords
    indexer.bulk_index([str(r.id) for r in rec_keywords])
    click.echo('Sent to the indexing queue {0} keywords'.format(
        len(rec_keywords)))

    # process queue so series can resolve keywords correctly
    indexer.process_bulk_queue()

    # index series
    indexer.bulk_index([str(r.id) for r in rec_series])
    click.echo('Sent to the indexing queue {0} series'.format(
        len(rec_series)))

    # index loans
    indexer.bulk_index([str(r.id) for r in rec_loans])
    click.echo('Sent to the indexing queue {0} loans'.format(len(rec_loans)))

    click.secho('Now indexing...', fg='green')
    # process queue so items can resolve circulation status correctly
    indexer.process_bulk_queue()

    # index eitems
    indexer.bulk_index([str(r.id) for r in rec_eitems])
    click.echo('Sent to the indexing queue {0} eitems'.format(
        len(rec_eitems)))

    # index items
    indexer.bulk_index([str(r.id) for r in rec_items])
    click.echo('Sent to the indexing queue {0} items'.format(len(rec_items)))

    click.secho('Now indexing...', fg='green')
    # process queue so documents can resolve circulation correctly
    indexer.process_bulk_queue()

    # flush all indices after indexing, otherwise ES won't be ready for tests
    current_search.flush_and_refresh(index='*')

    # index documents
    indexer.bulk_index([str(r.id) for r in rec_docs])
    click.echo('Sent to the indexing queue {0} documents'.format(
        len(rec_docs)))

    click.secho('Now indexing...', fg='green')
    indexer.process_bulk_queue()
def data(n_docs, n_items, n_eitems, n_loans, n_intlocs, n_series,
         n_document_requests, n_vendors, n_orders, n_libraries,
         n_borrowing_requests):
    """Insert demo data."""
    click.secho("Generating demo data", fg="yellow")

    indexer = RecordIndexer()

    vocabulary_dir = os.path.join(os.path.realpath("."), "invenio_app_ils",
                                  "vocabularies", "data")
    with open(os.path.join(vocabulary_dir, "tags.json")) as f:
        tags = json.loads(f.read())
    with open(os.path.join(vocabulary_dir, "languages.json")) as f:
        languages = json.loads(f.read())

    holder = Holder(
        patrons_pids=["1", "2", "5", "6"],
        languages=languages,
        librarian_pid="4",
        tags=tags,
        total_intloc=n_intlocs,
        total_items=n_items,
        total_eitems=n_eitems,
        total_documents=n_docs,
        total_loans=n_loans,
        total_series=n_series,
        total_document_requests=n_document_requests,
        total_vendors=n_vendors,
        total_orders=n_orders,
        total_borrowing_requests=n_borrowing_requests,
        total_libraries=n_libraries,
    )

    click.echo("Creating locations...")
    loc_generator = LocationGenerator(holder, minter)
    loc_generator.generate()
    rec = loc_generator.persist()
    indexer.index(rec)

    # InternalLocations
    intlocs_generator = InternalLocationGenerator(holder, minter)
    intlocs_generator.generate()
    rec_intlocs = intlocs_generator.persist()

    # Series
    click.echo("Creating series...")
    series_generator = SeriesGenerator(holder, minter)
    series_generator.generate()
    rec_series = series_generator.persist()

    # Documents
    click.echo("Creating documents...")
    documents_generator = DocumentGenerator(holder, minter)
    documents_generator.generate()
    rec_docs = documents_generator.persist()

    # Items
    click.echo("Creating items...")
    items_generator = ItemGenerator(holder, minter)
    items_generator.generate()
    rec_items = items_generator.persist()

    # EItems
    click.echo("Creating eitems...")
    eitems_generator = EItemGenerator(holder, minter)
    eitems_generator.generate()
    rec_eitems = eitems_generator.persist()

    # Loans
    click.echo("Creating loans...")
    loans_generator = LoanGenerator(holder, minter)
    loans_generator.generate()
    rec_loans = loans_generator.persist()

    # Related records
    click.echo("Creating related records...")
    related_generator = RecordRelationsGenerator(holder, minter)
    related_generator.generate(rec_docs, rec_series)
    related_generator.persist()

    # Document requests
    click.echo("Creating document requests...")
    document_requests_generator = DocumentRequestGenerator(holder, minter)
    document_requests_generator.generate()
    rec_requests = document_requests_generator.persist()

    # Vendors
    click.echo("Creating vendors...")
    vendor_generator = VendorGenerator(holder, minter)
    vendor_generator.generate()
    rec_vendors = vendor_generator.persist()

    # Orders
    click.echo("Creating orders...")
    order_generator = OrderGenerator(holder, minter)
    order_generator.generate()
    rec_orders = order_generator.persist()

    # Libraries
    click.echo("Creating libraries...")
    library_generator = LibraryGenerator(holder, minter)
    library_generator.generate()
    rec_libraries = library_generator.persist()

    # Borrowing requests
    click.echo("Creating borrowing requests...")
    borrowing_requests_generator = BorrowingRequestGenerator(holder, minter)
    borrowing_requests_generator.generate()
    rec_borrowing_requests = borrowing_requests_generator.persist()

    # index locations
    indexer.bulk_index([str(r.id) for r in rec_intlocs])
    click.echo("Sent to the indexing queue {0} locations".format(
        len(rec_intlocs)))

    # index series
    indexer.bulk_index([str(r.id) for r in rec_series])
    click.echo("Sent to the indexing queue {0} series".format(
        len(rec_series)))

    # index loans
    indexer.bulk_index([str(r.id) for r in rec_loans])
    click.echo("Sent to the indexing queue {0} loans".format(len(rec_loans)))

    click.secho("Now indexing...", fg="green")
    # process queue so items can resolve circulation status correctly
    indexer.process_bulk_queue()

    # index eitems
    indexer.bulk_index([str(r.id) for r in rec_eitems])
    click.echo("Sent to the indexing queue {0} eitems".format(
        len(rec_eitems)))

    # index items
    indexer.bulk_index([str(r.id) for r in rec_items])
    click.echo("Sent to the indexing queue {0} items".format(len(rec_items)))

    click.secho("Now indexing...", fg="green")
    # process queue so documents can resolve circulation correctly
    indexer.process_bulk_queue()

    # index libraries
    indexer.bulk_index([str(r.id) for r in rec_libraries])
    click.echo("Sent to the indexing queue {0} libraries".format(
        len(rec_libraries)))

    # index borrowing requests
    indexer.bulk_index([str(r.id) for r in rec_borrowing_requests])
    click.echo("Sent to the indexing queue {0} borrowing requests".format(
        len(rec_borrowing_requests)))

    click.secho("Now indexing...", fg="green")
    indexer.process_bulk_queue()

    # flush all indices after indexing, otherwise ES won't be ready for tests
    current_search.flush_and_refresh(index="*")

    # index documents
    indexer.bulk_index([str(r.id) for r in rec_docs])
    click.echo("Sent to the indexing queue {0} documents".format(
        len(rec_docs)))

    # index document requests
    indexer.bulk_index([str(r.id) for r in rec_requests])
    click.echo("Sent to the indexing queue {0} document requests".format(
        len(rec_requests)))

    # index loans again
    indexer.bulk_index([str(r.id) for r in rec_loans])
    click.echo("Sent to the indexing queue {0} loans".format(len(rec_loans)))

    # index items again
    indexer.bulk_index([str(r.id) for r in rec_items])
    click.echo("Sent to the indexing queue {0} items".format(len(rec_items)))

    # index vendors
    indexer.bulk_index([str(r.id) for r in rec_vendors])
    click.echo("Sent to the indexing queue {0} vendors".format(
        len(rec_vendors)))

    # index orders
    indexer.bulk_index([str(r.id) for r in rec_orders])
    click.echo("Sent to the indexing queue {0} orders".format(
        len(rec_orders)))

    click.secho("Now indexing...", fg="green")
    indexer.process_bulk_queue()
def import_documents(institution, pages):
    """Import documents from RERO doc.

    institution: String institution filter for retrieving documents
    pages: Number of pages to import
    """
    url = current_app.config.get('SONAR_DOCUMENTS_RERO_DOC_URL')

    click.secho('Importing {pages} pages of records for "{institution}" '
                'from {url}'.format(pages=pages,
                                    institution=institution,
                                    url=url))

    # Get institution record from database
    institution_record = InstitutionRecord.get_record_by_pid(institution)

    if not institution_record:
        raise ClickException('Institution record not found in database')

    institution_ref_link = InstitutionRecord.get_ref_link(
        'institutions', institution_record['pid'])

    # mapping between institution key and RERO doc filter
    institution_map = current_app.config.get(
        'SONAR_DOCUMENTS_INSTITUTIONS_MAP')

    if not institution_map:
        raise ClickException('Institution map not found in configuration')

    if institution not in institution_map:
        raise ClickException(
            'Institution map for "{institution}" not found in configuration, '
            'keys available {keys}'.format(institution=institution,
                                           keys=institution_map.keys()))

    key = institution_map[institution]
    current_page = 1

    indexer = RecordIndexer()

    while current_page <= pages:
        click.echo('Importing records {start} to {end}... '.format(
            start=(current_page * 10 - 9),
            end=(current_page * 10)), nl=False)

        # Read Marc21 data for current page
        response = requests.get(
            '{url}?of=xm&jrec={first_record}&c=NAVSITE.{institution}'.format(
                url=url,
                first_record=(current_page * 10 - 9),
                institution=key.upper()),
            stream=True)

        if response.status_code != 200:
            raise ClickException('Request to "{url}" failed'.format(url=url))

        response.raw.decode_content = True

        ids = []

        for data in split_stream(response.raw):
            # Convert from Marc XML to JSON
            record = create_record(data)

            # Transform JSON
            record = marc21tojson.do(record)

            # Add institution
            record['institution'] = {'$ref': institution_ref_link}

            # Register record to DB
            db_record = DocumentRecord.create(record)
            db.session.commit()

            # Add ID for bulk index in elasticsearch
            ids.append(str(db_record.id))

        # index and process queue in elasticsearch
        indexer.bulk_index(ids)
        indexer.process_bulk_queue()

        current_page += 1

        click.secho('Done', fg='green', nl=True)

    click.secho('Finished', fg='green')
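# Conversion sketch (hedged): the loop above feeds raw MARCXML through
# `split_stream` and `create_record`, which in Invenio projects usually come
# from dojson's MARC21 utilities; the same pair works on a local file. The
# import path and the file name below are assumptions for illustration.
from dojson.contrib.marc21.utils import create_record, split_stream

with open('records.xml', 'rb') as f:            # hypothetical input file
    for data in split_stream(f):                # one MARCXML record at a time
        record = marc21tojson.do(create_record(data))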
def import_records(records_to_import):
    """Import records in database and index them.

    Used as celery task. "ignore_result" flag means that we don't want to
    get the status and/or the result of the task, execution is faster.

    :param list records_to_import: List of records to import.
    :returns: List of IDs.
    """
    indexer = RecordIndexer()
    ids = []

    for data in records_to_import:
        try:
            files_data = data.pop('files', [])

            record = DocumentRecord.get_record_by_identifier(
                data.get('identifiedBy', []))

            if not record:
                record = DocumentRecord.create(data,
                                               dbcommit=False,
                                               with_bucket=True)
            else:
                record.update(data)

            for file_data in files_data:
                # Pop url and key so the remaining entries can be passed
                # as kwargs to the add_file_from_url method.
                url = file_data.pop('url')
                key = file_data.pop('key')

                try:
                    record.add_file_from_url(url, key, **file_data)
                except Exception as exception:
                    current_app.logger.warning(
                        'Error during import of file {file} of record '
                        '{record}: {error}'.format(
                            file=key,
                            error=exception,
                            record=record['identifiedBy']))

            # Commit the record; at this point it is not yet persisted
            # into the database.
            record.commit()

            # Push the record to the database session, still not persisted.
            db.session.flush()

            # Add ID for bulk index in elasticsearch
            ids.append(str(record.id))

            current_app.logger.info(
                'Record with reference "{reference}" imported successfully'
                .format(reference=record['identifiedBy']))
        except Exception as exception:
            current_app.logger.error(
                'Error during importation of record {record}: {exception}'
                .format(record=data, exception=exception))

    # Commit and index records
    db.session.commit()
    indexer.bulk_index(ids)
    indexer.process_bulk_queue()

    return ids
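# Dispatch sketch (hedged): the docstring's "ignore_result" note suggests
# the function is registered as a Celery task, e.g. decorated with
# `@shared_task(ignore_result=True)`, and invoked asynchronously in chunks.
# The chunk size and the `records` list below are assumptions.
for chunk in (records[i:i + 100] for i in range(0, len(records), 100)):
    import_records.delay(chunk)   # assumes the Celery task decorator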
def update_record_statistics(start_date=None, end_date=None):
    """Update "_stats" field of affected records."""
    start_date = dateutil_parse(start_date) if start_date else None
    # Fixed: originally guarded by `if start_date`, which dropped end_date.
    end_date = dateutil_parse(end_date) if end_date else None
    aggr_configs = {}

    if not start_date and not end_date:
        start_date = datetime.utcnow()
        end_date = datetime.utcnow()
        for aggr_name in current_stats.enabled_aggregations:
            aggr_cfg = current_stats.aggregations[aggr_name]
            aggr = aggr_cfg.aggregator_class(
                name=aggr_cfg.name, **aggr_cfg.aggregator_config)
            if not Index(aggr.aggregation_alias, using=aggr.client).exists():
                if not Index(aggr.event_index, using=aggr.client).exists():
                    start_date = min(start_date, datetime.utcnow())
                else:
                    start_date = min(
                        start_date, aggr._get_oldest_event_timestamp())

            # Retrieve the last two bookmarks
            bookmarks = Search(
                using=aggr.client,
                index=aggr.aggregation_alias,
                doc_type=aggr.bookmark_doc_type
            )[0:2].sort({'date': {'order': 'desc'}}).execute()
            if len(bookmarks) >= 1:
                end_date = max(
                    end_date,
                    datetime.strptime(bookmarks[0].date, aggr.doc_id_suffix))
            if len(bookmarks) == 2:
                start_date = min(
                    start_date,
                    datetime.strptime(bookmarks[1].date, aggr.doc_id_suffix))
            aggr_configs[aggr.aggregation_alias] = aggr
    elif start_date and end_date:
        for aggr_name in current_stats.enabled_aggregations:
            aggr_cfg = current_stats.aggregations[aggr_name]
            aggr = aggr_cfg.aggregator_class(
                name=aggr_cfg.name, **aggr_cfg.aggregator_config)
            aggr_configs[aggr.aggregation_alias] = aggr
    else:
        return

    # Get conceptrecids for all the affected records between the two dates
    conceptrecids = set()
    for aggr_alias, aggr in aggr_configs.items():
        query = Search(
            using=aggr.client,
            index=aggr.aggregation_alias,
            doc_type=aggr.aggregation_doc_type,
        ).filter('range', timestamp={
            'gte': start_date.replace(microsecond=0).isoformat() + '||/d',
            'lte': end_date.replace(microsecond=0).isoformat() + '||/d',
        }).extra(_source=False)
        query.aggs.bucket('ids', 'terms', field='conceptrecid', size=0)
        conceptrecids |= {
            b.key for b in query.execute().aggregations.ids.buckets}

    indexer = RecordIndexer()
    for conceptrecid_val in conceptrecids:
        conceptrecid = PersistentIdentifier.get('recid', conceptrecid_val)
        pv = PIDVersioning(parent=conceptrecid)
        children_recids = pv.children.all()
        indexer.bulk_index([str(p.object_uuid) for p in children_recids])
def _index(iterator):
    """Bulk index the iterator."""
    indexer = RecordIndexer()
    indexer.bulk_index(iterator)
    indexer.process_bulk_queue()
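# Usage sketch (hedged): `bulk_index` accepts any iterable of record UUIDs,
# so large ID sets can be fed in chunks to keep each queue pass small. The
# chunking helper and `all_record_uuids` are assumptions, not part of
# `_index`.
from itertools import islice

def _chunks(iterable, size=500):
    """Yield lists of at most `size` items from `iterable`."""
    it = iter(iterable)
    while True:
        chunk = list(islice(it, size))
        if not chunk:
            return
        yield chunk

for chunk in _chunks(all_record_uuids):  # `all_record_uuids` assumed
    _index(chunk)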
def bulk_records(records):
    """Records creation."""
    n_updated = 0
    n_rejected = 0
    n_created = 0
    record_schema = current_jsonschemas.path_to_url(
        'documents/document-v0.0.1.json')
    item_schema = current_jsonschemas.path_to_url('items/item-v0.0.1.json')
    holding_schema = current_jsonschemas.path_to_url(
        'holdings/holding-v0.0.1.json')
    host_url = current_app.config.get('RERO_ILS_APP_BASE_URL')
    url_api = '{host}/api/{doc_type}/{pid}'
    record_id_iterator = []
    item_id_iterator = []
    holding_id_iterator = []
    indexer = RecordIndexer()
    start_time = datetime.now()
    for record in records:
        try:
            if record.get('frbr', False):
                document = record.get('document', {})

                # check if already in Rero-ILS
                # pid = None
                # for identifier in document.get('identifiedBy'):
                #     if identifier.get('source') == 'VIRTUA':
                #         bibid = identifier.get('value')
                #         query = DocumentsSearch().filter(
                #             'term', identifiedBy__value=bibid
                #         ).source(includes=['pid'])
                #         try:
                #             pid = [r.pid for r in query.scan()].pop()
                #         except IndexError:
                #             pid = None
                # if pid:
                #     # update the record
                #     # Do nothing for the moment
                #     continue

                document['$schema'] = record_schema
                created_time = datetime.now()
                document = Document.create(
                    document, dbcommit=False, reindex=False)
                record_id_iterator.append(document.id)

                uri_documents = url_api.format(host=host_url,
                                               doc_type='documents',
                                               pid=document.pid)

                map_holdings = {}
                for holding in record.get('holdings'):
                    holding['$schema'] = holding_schema
                    holding['document'] = {'$ref': uri_documents}
                    holding['circulation_category'] = {'$ref': map_item_type(
                        str(holding.get('circulation_category')))}
                    holding['location'] = {'$ref': map_locations(
                        str(holding.get('location')))}

                    created_time = datetime.now()
                    result = Holding.create(
                        holding, dbcommit=False, reindex=False)
                    map_holdings.update({
                        '{location}#{cica}'.format(
                            location=holding.get('location'),
                            cica=holding.get('circulation_category')):
                        result.get('pid')
                    })
                    holding_id_iterator.append(result.id)

                for item in record.get('items'):
                    item['$schema'] = item_schema
                    item['document'] = {'$ref': uri_documents}
                    item['item_type'] = {'$ref': map_item_type(
                        str(item.get('item_type')))}
                    item['location'] = {'$ref': map_locations(
                        str(item.get('location')))}

                    holding_pid = map_holdings.get('{location}#{cica}'.format(
                        location=item.get('location'),
                        cica=item.get('item_type')))
                    item['holding'] = {'$ref': url_api.format(
                        host=host_url, doc_type='holdings', pid=holding_pid)}

                    result = Item.create(item, dbcommit=False, reindex=False)
                    item_id_iterator.append(result.id)

                n_created += 1

            if n_created % 1000 == 0:
                execution_time = datetime.now() - start_time
                click.secho('{nb} created records in {execution_time}.'
                            .format(nb=len(record_id_iterator),
                                    execution_time=execution_time),
                            fg='white')
                start_time = datetime.now()

                db.session.commit()
                execution_time = datetime.now() - start_time
                click.secho('{nb} committed records in {execution_time}.'
                            .format(nb=len(record_id_iterator),
                                    execution_time=execution_time),
                            fg='white')

                start_time = datetime.now()
                click.secho('sending {n} holdings to indexer queue.'
                            .format(n=len(holding_id_iterator)), fg='white')
                indexer.bulk_index(holding_id_iterator)
                click.secho('process queue...', fg='yellow')
                indexer.process_bulk_queue()

                click.secho('sending {n} items to indexer queue.'
                            .format(n=len(item_id_iterator)), fg='white')
                indexer.bulk_index(item_id_iterator)
                click.secho('process queue...', fg='yellow')
                indexer.process_bulk_queue()

                click.secho('sending {n} documents to indexer queue.'
                            .format(n=len(record_id_iterator)), fg='white')
                indexer.bulk_index(record_id_iterator)
                click.secho('process queue...', fg='yellow')
                indexer.process_bulk_queue()

                execution_time = datetime.now() - start_time
                click.secho('indexing records process in {execution_time}.'
                            .format(execution_time=execution_time),
                            fg='white')
                click.secho('processing next batch records.', fg='green')

                record_id_iterator.clear()
                holding_id_iterator.clear()
                item_id_iterator.clear()
                start_time = datetime.now()
        except Exception as e:
            n_rejected += 1
            click.secho('Error processing record [{id}] : {e}'
                        .format(id=record.get('_id'), e=e), fg='red')

    # Commit and index whatever remains after the last full batch.
    db.session.commit()
    indexer.bulk_index(holding_id_iterator)
    indexer.process_bulk_queue()
    indexer.bulk_index(item_id_iterator)
    indexer.process_bulk_queue()
    indexer.bulk_index(record_id_iterator)
    indexer.process_bulk_queue()

    return n_created