def bulk_indexer(pid_type, object_uuids, req_timeout):
    exceptions = []
    try:
        # lazily create the api application and cache it in the
        # module-level ``bulk_app`` list
        if not bulk_app:
            bulk_app.append(create_api())
        with bulk_app[0].app_context():
            endpoint: RecordEndpointConfiguration = (
                current_drafts.endpoint_for_pid_type(pid_type))
            record_class = endpoint.record_class
            indexer_class = endpoint.indexer_class
            indexer = indexer_class()
            # force record class
            indexer.record_cls = record_class

            def get_indexing_data(record_uuid):
                try:
                    return indexer._index_action({"id": record_uuid})
                except Exception as e:
                    exceptions.append({
                        'record_uuid': str(record_uuid),
                        'message': str(e),
                        'traceback': traceback.format_exc(),
                    })
                    return {}

            recs = (get_indexing_data(record_uuid)
                    for record_uuid in object_uuids)
            success, errors = bulk(indexer.client, recs,
                                   stats_only=False,
                                   request_timeout=req_timeout,
                                   expand_action_callback=_es7_expand_action,
                                   raise_on_error=False)
            return success, [*errors, *exceptions]
    except Exception as e:
        if len(object_uuids) == 1:
            return 0, [{
                'message': str(e),
                'traceback': traceback.format_exc()
            }, *exceptions]
        else:
            # index what could be indexed
            ok = 0
            errors = []
            if len(object_uuids) > 4:
                # split into two halves and try for each half
                # (integer division: a float slice index raises TypeError)
                mid = len(object_uuids) // 2
                object_uuids = [object_uuids[:mid], object_uuids[mid:]]
            else:
                # try for each element
                object_uuids = [[x] for x in object_uuids]
            for uuids in object_uuids:
                p_ok, p_errors = bulk_indexer(pid_type, uuids, req_timeout)
                ok += p_ok
                errors.extend(p_errors)
            return ok, errors
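# Hypothetical driver for bulk_indexer, shown for illustration only: feeding
# the indexer in fixed-size batches keeps the recursive split-and-retry path
# local to the batch that failed. ``chunked``, ``reindex_all``, the 'recid'
# pid type and the batch size are assumptions, not part of the original module.
from itertools import islice


def chunked(iterable, size):
    it = iter(iterable)
    while batch := list(islice(it, size)):
        yield batch


def reindex_all(uuids, pid_type='recid', req_timeout=60):
    total_ok, all_errors = 0, []
    for batch in chunked(uuids, 500):
        ok, errors = bulk_indexer(pid_type, batch, req_timeout)
        total_ok += ok
        all_errors.extend(errors)
    return total_ok, all_errors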
def article_reindex(ctx, raise_on_error=True, only=None):
    version_type = None  # elasticsearch version to use
    api = create_api()
    with api.app_context():
        def reindex_pid(pid_type, RecordClass):
            index_name = None
            indexer = RecordIndexer()
            for pid in tqdm.tqdm(PersistentIdentifier.query.filter_by(
                    pid_type=pid_type, object_type='rec',
                    status=PIDStatus.REGISTERED.value)):
                record = RecordClass.get_record(pid.object_uuid)
                if only and str(record.id) != only:
                    continue
                try:
                    index_name, doc_type = indexer.record_to_index(record)
                    index_name = build_alias_name(index_name)
                    # print('Indexing', record.get('id'), 'into', index_name)
                    indexer.index(record)
                except Exception:
                    # dump the failing record for later inspection
                    with open('/tmp/indexing-error.json', 'a') as f:
                        print(json.dumps(record.dumps(), indent=4,
                                         ensure_ascii=False), file=f)
                        traceback.print_exc(file=f)
                    if raise_on_error:
                        raise
            if index_name:
                current_search_client.indices.refresh(index_name)
                current_search_client.indices.flush(index_name)

        # reindex all objects
        reindex_pid(ARTICLE_PID_TYPE, ArticleRecord)
        reindex_pid(ARTICLE_DRAFT_PID_TYPE, ArticleDraftRecord)
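# Hypothetical click wiring for the command above; the ``ctx`` parameter
# suggests the original carried click decorators that are not shown here.
# The command name, option flags and wrapper function are assumptions
# inferred from the signature, not taken from the original source.
import click


@click.command('article-reindex')
@click.option('--raise-on-error/--no-raise-on-error', default=True)
@click.option('--only', default=None,
              help='Reindex only the record with this UUID.')
@click.pass_context
def article_reindex_cmd(ctx, raise_on_error, only):
    article_reindex(ctx, raise_on_error=raise_on_error, only=only)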
def run(provider, synchronizer, break_on_error, start_oai, start_id, oai,
        overwrite, bulk, only_fetch, index: str = None):
    """
    Starts harvesting the resources configured in invenio.cfg via the
    OAREPO_OAI_PROVIDERS configuration variable.
    """
    api = create_api()
    with api.app_context():
        _run_internal(provider=provider, synchronizer=synchronizer,
                      break_on_error=break_on_error, start_oai=start_oai,
                      start_id=start_id, oai=oai, overwrite=overwrite,
                      bulk=bulk, only_fetch=only_fetch, index=index)
def demo_reindex(only=None):
    with create_api().app_context():
        def reindex_pid(pid_type):
            for pid in PersistentIdentifier.query.filter_by(
                    pid_type=pid_type, object_type='rec'):
                record = Record.get_record(pid.object_uuid)
                if only and str(record.id) != only:
                    continue
                try:
                    RecordIndexer().index(record)
                except Exception:
                    # dump the failing record, then abort the reindex
                    with open('/tmp/indexing-error.json', 'w') as f:
                        print(json.dumps(record.dumps(), indent=4,
                                         ensure_ascii=False), file=f)
                    raise
            current_search_client.indices.flush()

        reindex_pid('recid')
def nr_recommit(ctx):
    api = create_api()
    with api.app_context():
        endpoints = current_app.config.get("RECORDS_REST_ENDPOINTS").endpoints
        for config in endpoints.values():
            try:
                pid_type: str = config["pid_type"]
                print(f'PID type: {pid_type}')
                record_class = obj_or_import_string(config["record_class"])
                pids = PersistentIdentifier.query.filter_by(
                    pid_type=pid_type).all()
                for i, pid in enumerate(tqdm(pids)):
                    try:
                        record = record_class.get_record(pid.object_uuid)
                    except NoResultFound:
                        continue
                    t0 = datetime.now()
                    record.commit()
                    print(f"Committing time: {datetime.now() - t0}")
                    # flush accumulated changes every 100 records
                    if i % 100 == 0:
                        db.session.commit()
            finally:
                db.session.commit()
        # Tail of PrefixMiddleware.__call__ (the start of the class is not
        # shown here): move the matched prefix from PATH_INFO to SCRIPT_NAME
        # and delegate to the wrapped app.
        original_script_name = environ.get("SCRIPT_NAME", "")
        environ["SCRIPT_NAME"] = original_script_name + script
        environ["PATH_INFO"] = path_info
        return self.app(environ, start_response)


class HeartbeatMiddleware:
    """HeartBeat endpoints WSGI middleware."""

    def __init__(self, app):
        """Initialize heartbeat middleware."""
        self.app = app

    def __call__(self, environ, start_response):
        """Handle .well-known endpoints outside of /api prefix."""
        rsp = None
        with application.app_context():
            pi = environ.get('PATH_INFO', '')
            if pi == '/.well-known/heartbeat/readiness':
                rsp = readiness()
            elif pi == '/.well-known/heartbeat/liveliness':
                rsp = liveliness()
            if rsp:
                return rsp(environ, start_response)
            else:
                return self.app(environ, start_response)


application = create_api()
application.wsgi_app = HeartbeatMiddleware(
    PrefixMiddleware(application.wsgi_app))
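# Minimal smoke test for the middleware chain, shown as an illustration and
# not part of the original file. It uses Werkzeug's test client (werkzeug
# >= 2.0, where requests return a TestResponse); the heartbeat path is taken
# from the middleware above.
from werkzeug.test import Client

client = Client(application)
response = client.get('/.well-known/heartbeat/readiness')
print(response.status)  # expect "200 OK" when the repository is ready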
import itertools
import time
from multiprocessing.pool import Pool
from random import random

from invenio_app.factory import create_api
from invenio_db import db
from invenio_records.models import RecordMetadata
from sqlalchemy.orm.attributes import flag_modified
from tqdm import tqdm

app = create_api()


def set_providers(*mds):
    with app.app_context():
        with db.session.begin_nested():
            for md in RecordMetadata.query.filter(RecordMetadata.id.in_(mds)):
                control_number = md.json.get('control_number')
                print(f"Control number: {control_number}")
                providers = md.json.get('provider')
                primary_community = None
                if providers:
                    # pick the provider with the highest level
                    provider = None
                    for p in providers:
                        if not provider or p['level'] > provider['level']:
                            provider = p
                    self_link = provider.get('links', {}).get('self')
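# Equivalent selection of the highest-level provider using max(); a sketch
# illustrating the loop above, not part of the original module.
def pick_provider(providers):
    return max(providers, key=lambda p: p['level']) if providers else None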
def setup(admin_password, recreate_db, skip_demo_data, skip_file_location,
          drop_taxonomies, skip_taxonomy_import, verbose,
          taxonomies='./assets/taxonomy'):
    """OARepo setup command."""
    from flask import current_app
    from invenio_base.app import create_cli

    click.secho("oarepo setup started...", fg="blue")

    # Clean redis
    redis.StrictRedis.from_url(
        current_app.config["CACHE_REDIS_URL"]).flushall()
    click.secho("redis cache cleared...", fg="red")

    cli = create_cli()
    # Important: force API app on CLI context for proper URL generation
    cli.create_app = create_api
    runner = create_api().test_cli_runner()

    def run_command(command, catch_exceptions=False):
        click.secho("oarepo {}...".format(command), fg="green")
        res = runner.invoke(cli, command, catch_exceptions=catch_exceptions)
        if verbose:
            click.secho(res.output)

    # Print all routes considered for URL generation
    run_command('routes')

    # Remove and create db and indexes
    if recreate_db:
        run_command("db destroy --yes-i-know", catch_exceptions=True)
        run_command("db init")
    else:
        run_command("db drop --yes-i-know")
    run_command("db create")
    run_command("index destroy --force --yes-i-know")
    run_command("index init --force")
    run_command("index queue init purge")

    # Create roles to restrict access
    run_command("roles create admin")

    # Create users
    run_command("users create [email protected] -a --password={}".format(
        admin_password))  # ID 1
    create_userprofile_for("*****@*****.**", "admin", "OArepo Administrator")

    # Assign roles
    run_command("roles add [email protected] admin")

    # Assign actions
    run_command("access allow superuser-access role admin")

    # Create files location
    if not skip_file_location:
        run_command("files location --default oarepo /tmp/oarepo")

    # Create ACLs index for preferred SCHEMA
    run_command("invenio invenio_explicit_acls prepare {}".format(
        ACL_PREFERRED_SCHEMA))

    # Drop taxonomy data
    if drop_taxonomies:
        taxo_list = runner.invoke(cli, 'taxonomies list',
                                  catch_exceptions=False)
        click.secho("oarepo dropping existing taxonomies {}".format(
            taxo_list.output), fg="yellow")
        for tax in [t for t in taxo_list.output.splitlines()
                    if t[0] not in [' ', '*']]:
            click.secho("oarepo deleting taxonomy {}".format(tax),
                        fg="yellow")
            run_command('taxonomies delete {}'.format(tax))

    # Import taxonomies
    if not skip_taxonomy_import:
        import os
        click.secho("oarepo importing taxonomies from {}".format(taxonomies),
                    fg="green")
        for tax_file in os.listdir(taxonomies):
            if tax_file.endswith('xlsx'):
                tax_path = os.path.join(taxonomies, tax_file)
                click.secho("oarepo importing taxonomy {}".format(tax_path),
                            fg="green")
                if tax_file.startswith('event'):
                    run_command(
                        'taxonomies import {} --str web --str organizer '
                        '--str startDate --str endDate --bool selectable '
                        '--drop'.format(tax_path))
                elif tax_file.startswith('format'):
                    run_command(
                        'taxonomies import {} --str resolution --str spec '
                        '--bool selectable --drop'.format(tax_path))
        click.secho("oarepo setting all-read permission on taxonomies",
                    fg="green")
        run_command('taxonomies all-read')
        # TODO: what about taxonomy modify?

    if not skip_demo_data:
        run_command('demo data')
    click.secho("oarepo setup finished successfully", fg="blue")
def nr_update_access_rights(ctx):
    api = create_api()
    with api.app_context():
        update_access_rights(deep=True)
        _reindex(None, raise_on_error=True, only=None)
def nr_reindex(ctx, pids, raise_on_error=True, only=None):
    version_type = None  # elasticsearch version to use
    api = create_api()
    with api.app_context():
        _reindex(pids, raise_on_error=raise_on_error, only=only)