def preprocess_config(self, config):
    if isinstance(config, str):
        config = {'published_schema': config}
    else:
        config = {**config}

    if 'draft_schema' not in config:
        if not config['published_schema'].startswith('/'):
            config['draft_schema'] = 'draft/' + config['published_schema']

    config['draft_schema_file'] = os.path.join(
        self.app.config['INVENIO_RECORD_DRAFT_SCHEMAS_DIR'],
        config['draft_schema'])

    config['published_index'] = schema_to_index(
        config['published_schema'])[0]
    draft_index = schema_to_index(config['draft_schema'])[0]
    config['draft_index'] = draft_index

    config['published_mapping_file'] = current_search.mappings[
        config['published_index']]
    config['draft_mapping_file'] = os.path.join(
        self.app.config['INVENIO_RECORD_DRAFT_MAPPINGS_DIR'],
        f'{draft_index}.json')

    return config
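# A hedged usage sketch for preprocess_config above: a bare schema string is
# normalized into a full config dict. The schema name is illustrative, and the
# INVENIO_RECORD_DRAFT_* directories are assumed to be set on the app config.
config = self.preprocess_config('records/record-v1.0.0.json')
# config['draft_schema']    == 'draft/records/record-v1.0.0.json'
# config['published_index'] == 'records-record-v1.0.0' under the default
#                              schema_to_index naming
# config['draft_mapping_file'] ends in 'draft-records-record-v1.0.0.json'
# inside INVENIO_RECORD_DRAFT_MAPPINGS_DIR.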
def schema_prefix(schema):
    """Get index prefix for a given schema."""
    if not schema:
        return None
    index, doctype = schema_to_index(
        schema, index_names=current_search.mappings.keys())
    return index.split('-')[0]
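# A minimal sketch of schema_prefix, assuming the default schema_to_index
# naming exercised by the tests below and a matching mapping registered with
# current_search; the schema path is illustrative.
prefix = schema_prefix('records/hep-v1.0.0.json')
# schema_to_index maps the path to the index 'records-hep-v1.0.0', so the
# prefix (everything before the first dash) is 'records'.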
def add_citation_counts(chunk_size=500, request_timeout=120):
    def _get_records_to_update_generator(citations_lookup):
        with click.progressbar(citations_lookup.iteritems()) as bar:
            for uuid, citation_count in bar:
                yield {
                    '_op_type': 'update',
                    '_index': index,
                    '_type': doc_type,
                    '_id': str(uuid),
                    'doc': {'citation_count': citation_count},
                }

    index, doc_type = schema_to_index('records/hep.json')
    citations_lookup = Counter()

    click.echo('Extracting all citations...')
    with click.progressbar(
            es_scan(
                es,
                query={
                    '_source': 'references.recid',
                    'filter': {
                        'exists': {
                            'field': 'references.recid'
                        }
                    },
                    'size': LARGE_CHUNK_SIZE
                },
                scroll=u'2m',
                index=index,
                doc_type=doc_type)) as records:
        for record in records:
            unique_refs_ids = dedupe_list(
                list(
                    chain.from_iterable(
                        map(force_list,
                            get_value(record, '_source.references.recid')))))
            for unique_refs_id in unique_refs_ids:
                citations_lookup[unique_refs_id] += 1
    click.echo('... DONE.')

    click.echo('Mapping recids to UUIDs...')
    citations_lookup = _build_recid_to_uuid_map(citations_lookup)
    click.echo('... DONE.')

    click.echo('Adding citation numbers...')
    success, failed = es_bulk(
        es,
        _get_records_to_update_generator(citations_lookup),
        chunk_size=chunk_size,
        raise_on_exception=False,
        raise_on_error=False,
        request_timeout=request_timeout,
        stats_only=True,
    )
    click.echo(
        '... DONE: {} records updated with success. {} failures.'.format(
            success, failed))
def test_schema_to_index_prefixes_indices(app):
    """Test that prefix is added to the index when creating it."""
    new_conf = {'SEARCH_INDEX_PREFIX': 'prefix-'}
    with patch.dict(app.config, new_conf):
        result = schema_to_index('default-v1.0.0.json')
        assert result == ('prefix-default-v1.0.0', 'default-v1.0.0')
def _record_to_index(record):
    """Get index/doctype given a record."""
    index, doctype = schema_to_index(record.get('$schema', ''))
    if index and doctype:
        return index, doctype
    else:
        return current_app.config['INDEXER_DEFAULT_INDEX'], \
            current_app.config['INDEXER_DEFAULT_DOCTYPE']
class Meta:
    """Configuration for CERN search."""

    index = schema_to_index(Keyword._schema)[0]
    doc_types = None
    fields = ('*', )
    default_filter = DefaultFilter(
        Q('bool', filter=[Q('match', deleted=False)]))
def get_search_index(json_schemas, url_prefix):
    indices = [schema_to_index(x)[0] for x in json_schemas]
    indices = [x for x in indices if x]
    if len(indices) == 1:
        return indices[0]
    else:
        raise Exception('Add "published_search_index" or "json_schemas" to '
                        'DRAFT_ENABLED_RECORDS_REST_ENDPOINTS["%s"]'
                        % url_prefix)
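# A hedged usage sketch; the schema name and url_prefix are illustrative.
# Exactly one of the given schemas must resolve to an index:
index = get_search_index(['records/record-v1.0.0.json'], 'records')
# index == 'records-record-v1.0.0' under the default schema_to_index naming;
# zero or multiple matches raise the exception asking for an explicit
# "published_search_index".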
def _get_percolator_doc_type(index):
    es_ver = ES_VERSION[0]
    if es_ver == 2:
        return '.percolator'
    elif es_ver == 5:
        return 'percolators'
    elif es_ver == 6:
        mapping_path = current_search.mappings[index]
        _, doc_type = schema_to_index(mapping_path)
        return doc_type
def _get_percolator_doc_type(index):
    es_ver = ES_VERSION[0]
    if es_ver == 2:
        return '.percolator'
    elif es_ver == 5:
        return 'percolators'
    elif es_ver in (6, 7):
        mapping_path = current_search.mappings[index]
        _, doc_type = schema_to_index(mapping_path)
        return doc_type
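# A hedged usage sketch; the index name is illustrative and assumes a mapping
# registered with current_search.
doc_type = _get_percolator_doc_type('records-record-v1.0.0')
# ES 2 -> '.percolator'; ES 5 -> 'percolators'; ES 6/7 -> derived from the
# registered mapping filename via schema_to_index, which yields '_doc' on
# ES 7 (see the ES_VERSION checks in the tests below).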
def _record_to_index(record):
    """Get index/doc_type given a record."""
    index_names = current_search.mappings.keys()
    schema = record.get('$schema', '')
    if isinstance(schema, dict):
        schema = schema.get('$ref', '')

    index, doc_type = schema_to_index(schema, index_names=index_names)

    if index and doc_type:
        return index, doc_type
    else:
        return (current_app.config['INDEXER_DEFAULT_INDEX'],
                current_app.config['INDEXER_DEFAULT_DOC_TYPE'])
def preprocess_record(self, pid, record, links_factory=None, **kwargs):
    """Adds cached ACLs to the serialized record."""
    ret = super().preprocess_record(pid, record, links_factory, **kwargs)

    index_names = current_search.mappings.keys()
    index, doc_type = schema_to_index(record['$schema'],
                                      index_names=index_names)

    search_class = None
    if self.acl_rest_endpoint is None:
        if has_request_context():
            search_class = getattr(request._methodview, 'search_class')
        if not search_class:  # pragma no cover
            raise AttributeError(
                'Please set acl_rest_endpoint property with the key to '
                'RECORDS_REST_ENDPOINTS')
    else:
        rest_configuration = current_app.config['RECORDS_REST_ENDPOINTS'][
            self.acl_rest_endpoint]
        search_class = obj_or_import_string(
            rest_configuration.get('search_class', None),
            default=ACLRecordsSearch)

    sc = search_class(index=index, doc_type=doc_type)
    rec = sc.acl_return_all(operation=self.acl_operations).get_record(
        str(record.id))
    rec = rec.execute()
    if rec.hits:
        matched_acls = getattr(rec.hits[0].meta, 'matched_queries', [])
        matched_acls = [
            x.replace(f'{ACL_MATCHED_QUERY}_', '') for x in matched_acls
        ]
    else:  # pragma no cover
        logger.error(
            'Should not happen, record %s not found in elasticsearch',
            record.id)
        matched_acls = []
    ret['invenio_explicit_acls'] = matched_acls
    return ret
def default_record_to_index(record):
    """Default function to get index/doc_type given a record.

    It tries to extract from `record['$schema']` the index and doc_type.
    If it fails, return the default values.

    :param record: The record object.
    :returns: Tuple (index, doc_type).
    """
    index_names = current_search.mappings.keys()
    schema = record.get('$schema', '')
    if isinstance(schema, dict):
        schema = schema.get('$ref', '')

    index, doc_type = schema_to_index(schema, index_names=index_names)

    if index and doc_type:
        return index, doc_type
    else:
        return (current_app.config['INDEXER_DEFAULT_INDEX'],
                current_app.config['INDEXER_DEFAULT_DOC_TYPE'])
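# A minimal usage sketch for default_record_to_index, assuming an application
# context and a record whose $schema resolves to a registered mapping; the
# schema URL is illustrative.
record = {'$schema': 'https://example.org/schemas/records/record-v1.0.0.json'}
index, doc_type = default_record_to_index(record)
# With a matching mapping this yields the index 'records-record-v1.0.0'; on
# Elasticsearch 7 the doc_type collapses to '_doc', as the tests below show.
# Records without a resolvable schema fall back to INDEXER_DEFAULT_INDEX and
# INDEXER_DEFAULT_DOC_TYPE.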
def record_to_index(record):
    """Get index/doc_type given a record.

    It tries to extract from `record['$schema']` the index and doc_type.
    If it fails, return the default values.

    :param record: The record object.
    :returns: Tuple (index, doc_type).
    """
    index_names = current_search.mappings.keys()
    schema = record.get('$schema', '')
    if isinstance(schema, dict):
        schema = schema.get('$ref', '')

    # put all documents in the same index
    if re.search(r'/documents/', schema):
        schema = re.sub(r'-.*\.json', '.json', schema)

    index, doc_type = schema_to_index(schema, index_names=index_names)

    if index and doc_type:
        return index, doc_type
    else:
        return (current_app.config['INDEXER_DEFAULT_INDEX'],
                current_app.config['INDEXER_DEFAULT_DOC_TYPE'])
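# A sketch of the version-collapsing branch in record_to_index above; the
# schema URL is an illustrative assumption. Every versioned schema under
# /documents/ is rewritten so that all documents share one index:
import re

schema = 'https://example.org/schemas/documents/document-v1.0.0.json'
print(re.sub(r'-.*\.json', '.json', schema))
# -> https://example.org/schemas/documents/document.json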
def test_schema_to_index(schema_url, result):
    """Test conversion of schema to index name and document type."""
    assert result == schema_to_index(schema_url)
class Meta:
    """Configuration for CERN search."""

    index = schema_to_index(Keyword._schema)[0]
    doc_types = None
    fields = ('*', )
def test_schema_to_index_with_names(app):
    """Test schema_to_index with an explicit list of known index names."""
    result = schema_to_index('default-v1.0.0.json',
                             index_names=['default-v1.0.0'])
    doc_type = '_doc' if ES_VERSION[0] >= 7 else 'default-v1.0.0'
    assert result == ('default-v1.0.0', doc_type)
def test_schema_to_index(schema, expected, index_names, app):
    """Test the expected value of schema to index."""
    result = schema_to_index(schema, index_names=index_names)
    if ES_VERSION[0] >= 7 and expected[0]:
        expected = (expected[0], "_doc")
    assert result == expected
def add_citation_counts(chunk_size=500, request_timeout=10):
    index, doc_type = schema_to_index('records/hep.json')

    def get_records_to_update_generator(citation_lookup):
        for recid, citation_count in citation_lookup.iteritems():
            try:
                uuid = PersistentIdentifier.query.filter(
                    PersistentIdentifier.object_type == "rec",
                    PersistentIdentifier.pid_value == str(recid)
                ).one().object_uuid
                yield {'_op_type': 'update',
                       '_index': index,
                       '_type': doc_type,
                       '_id': str(uuid),
                       'doc': {'citation_count': citation_count}}
            except NoResultFound:
                continue

    click.echo("Extracting all citations...")

    # lookup dictionary where key: recid of the record
    # and value: number of records that cite that record
    citations_lookup = Counter()
    with click.progressbar(es_scan(
            current_search_client,
            query={
                "_source": "references.recid",
                "filter": {
                    "exists": {
                        "field": "references.recid"
                    }
                },
                "size": LARGE_CHUNK_SIZE
            },
            scroll=u'2m',
            index=index,
            doc_type=doc_type)) as records:
        for record in records:
            # update lookup dictionary based on references of the record
            if 'references' in record['_source']:
                unique_refs_ids = set()
                references = record['_source']['references']
                for reference in references:
                    recid = reference.get('recid')
                    if recid:
                        if isinstance(recid, list):
                            # Sometimes there is more than one recid in the
                            # reference.
                            recid = recid.pop()
                        unique_refs_ids.add(recid)

                for unique_refs_id in unique_refs_ids:
                    citations_lookup[unique_refs_id] += 1
    click.echo("... DONE.")

    click.echo("Adding citation numbers...")

    success, failed = es_bulk(
        current_search_client,
        get_records_to_update_generator(citations_lookup),
        chunk_size=chunk_size,
        raise_on_exception=True,
        raise_on_error=True,
        request_timeout=request_timeout,
        stats_only=True)

    click.echo("... DONE: {} records updated with success. {} failures."
               .format(success, failed))
def doc_type_should_be_sent_to_orcid(record):
    index, doc_type = schema_to_index(record['$schema'])
    return doc_type == 'hep'
def doc_type_should_be_sent_to_orcid(record):
    """Return ``True`` if a document type should be sent to ORCID."""
    index, doc_type = schema_to_index(record['$schema'])
    pushable_doc_types = current_app.config['ORCID_RECORDS_DOC_TYPES']
    return doc_type in pushable_doc_types
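# A hedged configuration sketch for the function above: the set of pushable
# document types comes from application config rather than being hard-coded
# (compare the variant below, which pins 'hep'). The value is illustrative.
app.config['ORCID_RECORDS_DOC_TYPES'] = ['hep']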
def test_schema_to_index(schema, expected, index_names, app):
    """Test the expected value of schema to index."""
    result = schema_to_index(schema, index_names=index_names)
    assert result == expected
def doc_type_should_be_sent_to_orcid(record):
    """Return ``True`` if a document type should be sent to ORCID."""
    index, doc_type = schema_to_index(record['$schema'])
    return doc_type == 'hep'
def add_citation_counts(chunk_size=500, request_timeout=40):
    index, doc_type = schema_to_index('records/hep.json')

    def get_records_to_update_generator(citation_lookup):
        for recid, citation_count in citation_lookup.iteritems():
            try:
                uuid = PersistentIdentifier.query.filter(
                    PersistentIdentifier.object_type == "rec",
                    PersistentIdentifier.pid_value == str(recid)
                ).one().object_uuid
                yield {'_op_type': 'update',
                       '_index': index,
                       '_type': doc_type,
                       '_id': str(uuid),
                       'doc': {'citation_count': citation_count}}
            except NoResultFound:
                continue

    click.echo("Extracting all citations...")

    # lookup dictionary where key: recid of the record
    # and value: number of records that cite that record
    citations_lookup = Counter()
    with click.progressbar(es_scan(
            current_search_client,
            query={
                "_source": "references.recid",
                "filter": {
                    "exists": {
                        "field": "references.recid"
                    }
                },
                "size": LARGE_CHUNK_SIZE
            },
            scroll=u'2m',
            index=index,
            doc_type=doc_type)) as records:
        for record in records:
            # update lookup dictionary based on references of the record
            if 'references' in record['_source']:
                unique_refs_ids = set()
                references = record['_source']['references']
                for reference in references:
                    recid = reference.get('recid')
                    if recid:
                        if isinstance(recid, list):
                            # Sometimes there is more than one recid in the
                            # reference.
                            recid = recid.pop()
                        unique_refs_ids.add(recid)

                for unique_refs_id in unique_refs_ids:
                    citations_lookup[unique_refs_id] += 1
    click.echo("... DONE.")

    click.echo("Adding citation numbers...")

    success, failed = es_bulk(
        current_search_client,
        get_records_to_update_generator(citations_lookup),
        chunk_size=chunk_size,
        raise_on_exception=True,
        raise_on_error=True,
        request_timeout=request_timeout,
        stats_only=True)

    click.echo("... DONE: {} records updated with success. {} failures."
               .format(success, failed))