示例#1
0
    def preprocess_config(self, config):
        """Expand a draft-enabled record configuration into a full dict.

        :param config: either the published schema path as a string, or a
            mapping containing at least ``published_schema``.  A mapping
            is shallow-copied and never modified in place.
        :returns: a new dict with ``published_schema``, ``draft_schema``,
            ``draft_schema_file``, ``published_index``, ``draft_index``,
            ``published_mapping_file`` and ``draft_mapping_file`` set.

        NOTE(review): when ``published_schema`` starts with ``/`` and no
        ``draft_schema`` is supplied, no draft schema is derived and the
        ``draft_schema`` lookup below raises ``KeyError`` -- confirm that
        this is the intended behaviour.
        """
        if isinstance(config, str):
            settings = {'published_schema': config}
        else:
            settings = {**config}

        # Only a relative published schema gets a derived draft schema.
        missing_draft = 'draft_schema' not in settings
        if missing_draft and not settings['published_schema'].startswith('/'):
            settings['draft_schema'] = '/'.join(
                ('draft', settings['published_schema']))

        schemas_dir = self.app.config['INVENIO_RECORD_DRAFT_SCHEMAS_DIR']
        settings['draft_schema_file'] = os.path.join(
            schemas_dir, settings['draft_schema'])

        # Only the index name (first element of the pair) is kept.
        published_index = schema_to_index(settings['published_schema'])[0]
        settings['published_index'] = published_index
        draft_index = schema_to_index(settings['draft_schema'])[0]
        settings['draft_index'] = draft_index

        settings['published_mapping_file'] = \
            current_search.mappings[published_index]

        mappings_dir = self.app.config['INVENIO_RECORD_DRAFT_MAPPINGS_DIR']
        settings['draft_mapping_file'] = os.path.join(
            mappings_dir, f'{draft_index}.json')

        return settings
示例#2
0
def schema_prefix(schema):
    """Get index prefix for a given schema.

    :param schema: schema path or URL; may be empty or ``None``.
    :returns: the part of the resolved index name before the first ``-``,
        or ``None`` when ``schema`` is empty or resolves to no index.
    """
    if not schema:
        return None
    index, _ = schema_to_index(
        schema, index_names=current_search.mappings.keys())
    # A schema that matches none of the known index names yields a falsy
    # index; report that as "no prefix" instead of failing on ``.split``.
    if not index:
        return None
    return index.split('-')[0]
示例#3
0
def add_citation_counts(chunk_size=500, request_timeout=120):
    """Recount citations and store them as ``citation_count`` on records.

    Scans the index resolved from ``records/hep.json`` for documents that
    have ``references.recid``, counts how often every referenced recid is
    cited (the recids of each document go through ``dedupe_list`` first),
    maps the recids through ``_build_recid_to_uuid_map`` and sends one
    partial update per entry with the bulk helper.  Bulk failures are not
    raised; the success/failure totals are echoed at the end.

    :param chunk_size: forwarded to the bulk helper as ``chunk_size``.
    :param request_timeout: forwarded to the bulk helper.
    """
    index, doc_type = schema_to_index('records/hep.json')

    def _get_records_to_update_generator(counts_by_uuid):
        """Yield one partial-update bulk action per (uuid, count) pair."""
        with click.progressbar(counts_by_uuid.iteritems()) as progress:
            for uuid, citation_count in progress:
                action = {
                    '_op_type': 'update',
                    '_index': index,
                    '_type': doc_type,
                    '_id': str(uuid),
                    'doc': {'citation_count': citation_count},
                }
                yield action

    citations_lookup = Counter()

    click.echo('Extracting all citations...')
    scan_query = {
        '_source': 'references.recid',
        'filter': {'exists': {'field': 'references.recid'}},
        'size': LARGE_CHUNK_SIZE,
    }
    hits = es_scan(
        es, query=scan_query, scroll=u'2m', index=index, doc_type=doc_type)
    with click.progressbar(hits) as records:
        for record in records:
            recids = get_value(record, '_source.references.recid')
            flattened = list(chain.from_iterable(map(force_list, recids)))
            # Counter.update adds one per element of the deduped list.
            citations_lookup.update(dedupe_list(flattened))
    click.echo('... DONE.')

    click.echo('Mapping recids to UUIDs...')
    citations_lookup = _build_recid_to_uuid_map(citations_lookup)
    click.echo('... DONE.')

    click.echo('Adding citation numbers...')
    success, failed = es_bulk(
        es,
        _get_records_to_update_generator(citations_lookup),
        chunk_size=chunk_size,
        raise_on_exception=False,
        raise_on_error=False,
        request_timeout=request_timeout,
        stats_only=True,
    )
    summary = '... DONE: {} records updated with success. {} failures.'
    click.echo(summary.format(success, failed))
示例#4
0
文件: utils.py 项目: hachreak/zenodo
def schema_prefix(schema):
    """Get index prefix for a given schema.

    :param schema: schema path or URL; may be empty or ``None``.
    :returns: the part of the resolved index name before the first ``-``,
        or ``None`` when ``schema`` is empty or resolves to no index.
    """
    if not schema:
        return None
    index, _ = schema_to_index(
        schema, index_names=current_search.mappings.keys())
    # A schema that matches none of the known index names yields a falsy
    # index; report that as "no prefix" instead of failing on ``.split``.
    if not index:
        return None
    return index.split('-')[0]
示例#5
0
def test_schema_to_index_prefixes_indices(app):
    """Check that ``SEARCH_INDEX_PREFIX`` is prepended to the index name."""
    expected = ('prefix-default-v1.0.0', 'default-v1.0.0')
    # The prefix is only in effect while the config is patched.
    with patch.dict(app.config, {'SEARCH_INDEX_PREFIX': 'prefix-'}):
        actual = schema_to_index('default-v1.0.0.json')

        assert actual == expected
示例#6
0
def _record_to_index(record):
    """Resolve the (index, doctype) pair for a record.

    The pair is derived from the record's ``$schema`` value (an empty
    string when the key is absent).  When either element comes back
    falsy, the ``INDEXER_DEFAULT_INDEX`` / ``INDEXER_DEFAULT_DOCTYPE``
    application settings are returned instead.
    """
    schema = record.get('$schema', '')
    index, doctype = schema_to_index(schema)
    if not (index and doctype):
        app_config = current_app.config
        return (app_config['INDEXER_DEFAULT_INDEX'],
                app_config['INDEXER_DEFAULT_DOCTYPE'])
    return index, doctype
示例#7
0
    class Meta:
        """Configuration for CERN search."""

        # Index name: first element of the pair resolved from the
        # ``Keyword`` schema.
        index = schema_to_index(Keyword._schema)[0]
        # Left as None here.
        doc_types = None
        # Single wildcard entry -- presumably selects all fields; confirm
        # against the search base class.
        fields = ('*', )
        # Bool query whose ``filter`` clause matches ``deleted=False``,
        # wrapped in ``DefaultFilter``.
        default_filter = DefaultFilter(
            Q('bool', filter=[Q('match', deleted=False)]))
示例#8
0
def _record_to_index(record):
    """Resolve the (index, doctype) pair for a record.

    The pair is derived from the record's ``$schema`` value (an empty
    string when the key is absent).  When either element comes back
    falsy, the ``INDEXER_DEFAULT_INDEX`` / ``INDEXER_DEFAULT_DOCTYPE``
    application settings are returned instead.
    """
    schema = record.get('$schema', '')
    index, doctype = schema_to_index(schema)
    if not (index and doctype):
        app_config = current_app.config
        return (app_config['INDEXER_DEFAULT_INDEX'],
                app_config['INDEXER_DEFAULT_DOCTYPE'])
    return index, doctype
示例#9
0
def get_search_index(json_schemas, url_prefix):
    """Return the single search index the given JSON schemas resolve to.

    :param json_schemas: iterable of schema paths passed one by one to
        ``schema_to_index``; falsy index names are discarded.
    :param url_prefix: endpoint key, only used in the error message.
    :returns: the one remaining index name.
    :raises Exception: when the number of remaining index names is not
        exactly one (equal names are not merged).
    """
    resolved = (schema_to_index(schema)[0] for schema in json_schemas)
    indices = [name for name in resolved if name]
    if len(indices) != 1:
        raise Exception('Add "published_search_index" or "json_schemas" to '
                        'DRAFT_ENABLED_RECORDS_REST_ENDPOINTS["%s"]' %
                        url_prefix)
    return indices[0]
示例#10
0
def _get_percolator_doc_type(index):
    """Return the percolator document type for the running ES major version.

    ES 2 and ES 5 use fixed names; for ES 6 the doc type is resolved from
    the mapping path registered for ``index``.  Any other major version
    yields ``None``.
    """
    major = ES_VERSION[0]
    fixed_names = {2: '.percolator', 5: 'percolators'}
    if major in fixed_names:
        return fixed_names[major]
    if major == 6:
        _, doc_type = schema_to_index(current_search.mappings[index])
        return doc_type
    return None
示例#11
0
def _get_percolator_doc_type(index):
    """Return the percolator document type for the running ES major version.

    ES 2 and ES 5 use fixed names; for ES 6 and 7 the doc type is resolved
    from the mapping path registered for ``index``.  Any other major
    version yields ``None``.
    """
    major = ES_VERSION[0]
    fixed_names = {2: '.percolator', 5: 'percolators'}
    if major in fixed_names:
        return fixed_names[major]
    if major in (6, 7):
        _, doc_type = schema_to_index(current_search.mappings[index])
        return doc_type
    return None
示例#12
0
def add_citation_counts(chunk_size=500, request_timeout=120):
    """Recount citations and store them as ``citation_count`` on records.

    Scans the index resolved from ``records/hep.json`` for documents that
    have ``references.recid``, counts how often every referenced recid is
    cited (the recids of each document go through ``dedupe_list`` first),
    maps the recids through ``_build_recid_to_uuid_map`` and sends one
    partial update per entry with the bulk helper.  Bulk failures are not
    raised; the success/failure totals are echoed at the end.

    :param chunk_size: forwarded to the bulk helper as ``chunk_size``.
    :param request_timeout: forwarded to the bulk helper.
    """
    index, doc_type = schema_to_index('records/hep.json')

    def _get_records_to_update_generator(counts_by_uuid):
        """Yield one partial-update bulk action per (uuid, count) pair."""
        with click.progressbar(counts_by_uuid.iteritems()) as progress:
            for uuid, citation_count in progress:
                action = {
                    '_op_type': 'update',
                    '_index': index,
                    '_type': doc_type,
                    '_id': str(uuid),
                    'doc': {'citation_count': citation_count},
                }
                yield action

    citations_lookup = Counter()

    click.echo('Extracting all citations...')
    scan_query = {
        '_source': 'references.recid',
        'filter': {'exists': {'field': 'references.recid'}},
        'size': LARGE_CHUNK_SIZE,
    }
    hits = es_scan(
        es, query=scan_query, scroll=u'2m', index=index, doc_type=doc_type)
    with click.progressbar(hits) as records:
        for record in records:
            recids = get_value(record, '_source.references.recid')
            flattened = list(chain.from_iterable(map(force_list, recids)))
            # Counter.update adds one per element of the deduped list.
            citations_lookup.update(dedupe_list(flattened))
    click.echo('... DONE.')

    click.echo('Mapping recids to UUIDs...')
    citations_lookup = _build_recid_to_uuid_map(citations_lookup)
    click.echo('... DONE.')

    click.echo('Adding citation numbers...')
    success, failed = es_bulk(
        es,
        _get_records_to_update_generator(citations_lookup),
        chunk_size=chunk_size,
        raise_on_exception=False,
        raise_on_error=False,
        request_timeout=request_timeout,
        stats_only=True,
    )
    summary = '... DONE: {} records updated with success. {} failures.'
    click.echo(summary.format(success, failed))
示例#13
0
def _record_to_index(record):
    """Resolve the (index, doc_type) pair for a record.

    ``$schema`` may be a plain string or a dict carrying the schema under
    ``$ref``.  The schema is matched against the registered mapping names;
    when either element of the result is falsy, the
    ``INDEXER_DEFAULT_INDEX`` / ``INDEXER_DEFAULT_DOC_TYPE`` application
    settings are returned instead.
    """
    known_indices = current_search.mappings.keys()
    raw_schema = record.get('$schema', '')
    schema = (raw_schema.get('$ref', '')
              if isinstance(raw_schema, dict) else raw_schema)

    index, doc_type = schema_to_index(schema, index_names=known_indices)
    if not (index and doc_type):
        app_config = current_app.config
        return (app_config['INDEXER_DEFAULT_INDEX'],
                app_config['INDEXER_DEFAULT_DOC_TYPE'])
    return index, doc_type
示例#14
0
def _record_to_index(record):
    """Resolve the (index, doc_type) pair for a record.

    ``$schema`` may be a plain string or a dict carrying the schema under
    ``$ref``.  The schema is matched against the registered mapping names;
    when either element of the result is falsy, the
    ``INDEXER_DEFAULT_INDEX`` / ``INDEXER_DEFAULT_DOC_TYPE`` application
    settings are returned instead.
    """
    known_indices = current_search.mappings.keys()
    raw_schema = record.get('$schema', '')
    schema = (raw_schema.get('$ref', '')
              if isinstance(raw_schema, dict) else raw_schema)

    index, doc_type = schema_to_index(schema, index_names=known_indices)
    if not (index and doc_type):
        app_config = current_app.config
        return (app_config['INDEXER_DEFAULT_INDEX'],
                app_config['INDEXER_DEFAULT_DOC_TYPE'])
    return index, doc_type
示例#15
0
    def preprocess_record(self, pid, record, links_factory=None, **kwargs):
        """Serialize the record and attach the names of its matching ACLs.

        The parent serialization is extended with an
        ``invenio_explicit_acls`` entry holding the matched-query names
        (with the ``ACL_MATCHED_QUERY`` prefix stripped) of the record's
        own search hit.

        :raises AttributeError: when ``acl_rest_endpoint`` is unset and no
            search class can be taken from the current request's view.
        """
        serialized = super().preprocess_record(
            pid, record, links_factory, **kwargs)

        index, doc_type = schema_to_index(
            record['$schema'], index_names=current_search.mappings.keys())

        endpoint = self.acl_rest_endpoint
        if endpoint is not None:
            # Explicit endpoint: take the search class from its REST config.
            endpoint_config = \
                current_app.config['RECORDS_REST_ENDPOINTS'][endpoint]
            search_class = obj_or_import_string(
                endpoint_config.get('search_class', None),
                default=ACLRecordsSearch)
        else:
            # No endpoint configured: fall back to the view being served.
            search_class = None
            if has_request_context():
                search_class = getattr(request._methodview, 'search_class')
            if not search_class:  # pragma no cover
                raise AttributeError(
                    'Please set acl_rest_endpoint property with the key to RECORDS_REST_ENDPOINTS'
                )

        search = search_class(index=index, doc_type=doc_type)
        query = search.acl_return_all(operation=self.acl_operations)
        response = query.get_record(str(record.id)).execute()

        if response.hits:
            prefix = f'{ACL_MATCHED_QUERY}_'
            matched_queries = getattr(
                response.hits[0].meta, 'matched_queries', [])
            matched_acls = [
                name.replace(prefix, '') for name in matched_queries
            ]
        else:  # pragma no cover
            logger.error(
                'Should not happen, record %s not found in elasticsearch',
                record.id)
            matched_acls = []

        serialized['invenio_explicit_acls'] = matched_acls
        return serialized
示例#16
0
文件: utils.py 项目: N03/invenio
def default_record_to_index(record):
    """Resolve the index and doc_type for a record from its ``$schema``.

    ``$schema`` may be a plain string or a dict carrying the schema under
    ``$ref``.  When the schema does not resolve to both an index and a
    doc_type, the configured defaults are returned.

    :param record: The record object.
    :returns: Tuple (index, doc_type).
    """
    known_indices = current_search.mappings.keys()
    raw_schema = record.get('$schema', '')
    schema = (raw_schema.get('$ref', '')
              if isinstance(raw_schema, dict) else raw_schema)

    index, doc_type = schema_to_index(schema, index_names=known_indices)
    if not (index and doc_type):
        app_config = current_app.config
        return (app_config['INDEXER_DEFAULT_INDEX'],
                app_config['INDEXER_DEFAULT_DOC_TYPE'])
    return index, doc_type
示例#17
0
def default_record_to_index(record):
    """Resolve the index and doc_type for a record from its ``$schema``.

    ``$schema`` may be a plain string or a dict carrying the schema under
    ``$ref``.  When the schema does not resolve to both an index and a
    doc_type, the configured defaults are returned.

    :param record: The record object.
    :returns: Tuple (index, doc_type).
    """
    known_indices = current_search.mappings.keys()
    raw_schema = record.get('$schema', '')
    schema = (raw_schema.get('$ref', '')
              if isinstance(raw_schema, dict) else raw_schema)

    index, doc_type = schema_to_index(schema, index_names=known_indices)
    if not (index and doc_type):
        app_config = current_app.config
        return (app_config['INDEXER_DEFAULT_INDEX'],
                app_config['INDEXER_DEFAULT_DOC_TYPE'])
    return index, doc_type
示例#18
0
def record_to_index(record):
    """Resolve the index and doc_type for a record from its ``$schema``.

    ``$schema`` may be a plain string or a dict carrying the schema under
    ``$ref``.  Schemas whose path contains ``/documents/`` have everything
    from the first ``-`` up to ``.json`` removed, so that all of them
    resolve to the same index.  When the schema does not resolve to both
    an index and a doc_type, the configured defaults are returned.

    :param record: The record object.
    :returns: Tuple (index, doc_type).
    """
    known_indices = current_search.mappings.keys()
    raw_schema = record.get('$schema', '')
    schema = (raw_schema.get('$ref', '')
              if isinstance(raw_schema, dict) else raw_schema)

    # Put all documents in the same index.
    is_document = re.search(r'/documents/', schema)
    if is_document:
        schema = re.sub(r'-.*\.json', '.json', schema)

    index, doc_type = schema_to_index(schema, index_names=known_indices)
    if not (index and doc_type):
        app_config = current_app.config
        return (app_config['INDEXER_DEFAULT_INDEX'],
                app_config['INDEXER_DEFAULT_DOC_TYPE'])
    return index, doc_type
def test_schema_to_index(schema_url, result):
    """Check that a schema URL converts to the expected (index, doc type)."""
    converted = schema_to_index(schema_url)
    assert result == converted
def test_schema_to_index(schema_url, result):
    """Check that a schema URL converts to the expected (index, doc type)."""
    converted = schema_to_index(schema_url)
    assert result == converted
示例#21
0
    class Meta:
        """Configuration for CERN search."""

        # Index name: first element of the pair resolved from the
        # ``Keyword`` schema.
        index = schema_to_index(Keyword._schema)[0]
        # Left as None here.
        doc_types = None
        # Single wildcard entry -- presumably selects all fields; confirm
        # against the search base class.
        fields = ('*', )
示例#22
0
def test_schema_to_index_with_names(app):
    """Check schema resolution against an explicit ``index_names`` list.

    From ES 7 on the expected doc type is ``_doc``; before that it is the
    schema name itself.
    """
    result = schema_to_index('default-v1.0.0.json',
                             index_names=['default-v1.0.0'])
    if ES_VERSION[0] >= 7:
        expected_doc_type = '_doc'
    else:
        expected_doc_type = 'default-v1.0.0'
    assert result == ('default-v1.0.0', expected_doc_type)
示例#23
0
def test_schema_to_index(schema, expected, index_names, app):
    """Check the (index, doc type) pair resolved for a schema.

    From ES 7 on, an expectation with a truthy index has its doc type
    replaced by ``_doc`` before comparing.
    """
    actual = schema_to_index(schema, index_names=index_names)
    on_es7_plus = ES_VERSION[0] >= 7
    wanted = (expected[0], "_doc") if on_es7_plus and expected[0] else expected
    assert actual == wanted
示例#24
0
def add_citation_counts(chunk_size=500, request_timeout=10):
    """Recount citations and store them as ``citation_count`` on records.

    Scans the index resolved from ``records/hep.json`` for documents that
    have ``references.recid``, counts for every referenced recid how many
    documents cite it (once per citing document), and sends a partial
    update for every recid that has a ``rec`` persistent identifier.
    Bulk errors are raised (``raise_on_error`` / ``raise_on_exception``).

    :param chunk_size: forwarded to the bulk helper as ``chunk_size``.
    :param request_timeout: forwarded to the bulk helper.
    """
    index, doc_type = schema_to_index('records/hep.json')

    def get_records_to_update_generator(citation_lookup):
        """Yield a partial-update bulk action per recid that has a UUID."""
        for recid, citation_count in citation_lookup.iteritems():
            # Keep the try body to the lookup that can actually raise.
            try:
                uuid = PersistentIdentifier.query.filter(
                    PersistentIdentifier.object_type == "rec",
                    PersistentIdentifier.pid_value == str(recid),
                ).one().object_uuid
            except NoResultFound:
                # Cited recid without a registered record: nothing to update.
                continue
            yield {
                '_op_type': 'update',
                '_index': index,
                '_type': doc_type,
                '_id': str(uuid),
                'doc': {'citation_count': citation_count},
            }

    click.echo("Extracting all citations...")

    # lookup dictionary where key: recid of the record
    # and value: number of records that cite that record
    citations_lookup = Counter()
    scan_query = {
        "_source": "references.recid",
        "filter": {"exists": {"field": "references.recid"}},
        "size": LARGE_CHUNK_SIZE,
    }
    with click.progressbar(es_scan(
            current_search_client,
            query=scan_query,
            scroll=u'2m',
            index=index,
            doc_type=doc_type)) as records:
        for record in records:
            # Start from an empty set for EVERY record: a record without
            # 'references' must neither reuse the previous record's ids
            # nor fail on an unbound name.
            unique_refs_ids = set()
            for reference in record['_source'].get('references', []):
                recid = reference.get('recid')
                if recid:
                    if isinstance(recid, list):
                        # Sometimes there is more than one recid in the
                        # reference; use the last one without mutating
                        # the scanned hit.
                        recid = recid[-1]
                    unique_refs_ids.add(recid)

            for unique_refs_id in unique_refs_ids:
                citations_lookup[unique_refs_id] += 1

    click.echo("... DONE.")
    click.echo("Adding citation numbers...")

    success, failed = es_bulk(
        current_search_client,
        get_records_to_update_generator(citations_lookup),
        chunk_size=chunk_size,
        raise_on_exception=True,
        raise_on_error=True,
        request_timeout=request_timeout,
        stats_only=True)
    click.echo(
        "... DONE: {} records updated with success. {} failures.".format(
            success, failed))
示例#25
0
def doc_type_should_be_sent_to_orcid(record):
    """Return ``True`` if the record's document type should go to ORCID.

    The document type is resolved from ``record['$schema']``; only the
    ``hep`` type qualifies.
    """
    _, resolved_doc_type = schema_to_index(record['$schema'])
    return resolved_doc_type == 'hep'
示例#26
0
def test_schema_to_index_with_names(app):
    """Check schema resolution against an explicit ``index_names`` list.

    From ES 7 on the expected doc type is ``_doc``; before that it is the
    schema name itself.
    """
    result = schema_to_index(
        "default-v1.0.0.json", index_names=["default-v1.0.0"])
    if ES_VERSION[0] >= 7:
        expected_doc_type = "_doc"
    else:
        expected_doc_type = "default-v1.0.0"
    assert result == ("default-v1.0.0", expected_doc_type)
示例#27
0
def doc_type_should_be_sent_to_orcid(record):
    """Return ``True`` if the record's document type should go to ORCID.

    The document type is resolved from ``record['$schema']`` and checked
    against the ``ORCID_RECORDS_DOC_TYPES`` application setting.
    """
    _, resolved_doc_type = schema_to_index(record['$schema'])
    return resolved_doc_type in current_app.config['ORCID_RECORDS_DOC_TYPES']
示例#28
0
def test_schema_to_index(schema, expected, index_names, app):
    """Check the (index, doc type) pair resolved for a schema."""
    actual = schema_to_index(schema, index_names=index_names)
    assert actual == expected
示例#29
0
def doc_type_should_be_sent_to_orcid(record):
    """Return ``True`` if the record's document type should go to ORCID.

    The document type is resolved from ``record['$schema']``; only the
    ``hep`` type qualifies.
    """
    _, resolved_doc_type = schema_to_index(record['$schema'])
    return resolved_doc_type == 'hep'
示例#30
0
def add_citation_counts(chunk_size=500, request_timeout=40):
    """Recount citations and store them as ``citation_count`` on records.

    Scans the index resolved from ``records/hep.json`` for documents that
    have ``references.recid``, counts for every referenced recid how many
    documents cite it (once per citing document), and sends a partial
    update for every recid that has a ``rec`` persistent identifier.
    Bulk errors are raised (``raise_on_error`` / ``raise_on_exception``).

    :param chunk_size: forwarded to the bulk helper as ``chunk_size``.
    :param request_timeout: forwarded to the bulk helper.
    """
    index, doc_type = schema_to_index('records/hep.json')

    def get_records_to_update_generator(citation_lookup):
        """Yield a partial-update bulk action per recid that has a UUID."""
        for recid, citation_count in citation_lookup.iteritems():
            # Keep the try body to the lookup that can actually raise.
            try:
                uuid = PersistentIdentifier.query.filter(
                    PersistentIdentifier.object_type == "rec",
                    PersistentIdentifier.pid_value == str(recid),
                ).one().object_uuid
            except NoResultFound:
                # Cited recid without a registered record: nothing to update.
                continue
            yield {
                '_op_type': 'update',
                '_index': index,
                '_type': doc_type,
                '_id': str(uuid),
                'doc': {'citation_count': citation_count},
            }

    click.echo("Extracting all citations...")

    # lookup dictionary where key: recid of the record
    # and value: number of records that cite that record
    citations_lookup = Counter()
    scan_query = {
        "_source": "references.recid",
        "filter": {"exists": {"field": "references.recid"}},
        "size": LARGE_CHUNK_SIZE,
    }
    with click.progressbar(es_scan(
            current_search_client,
            query=scan_query,
            scroll=u'2m',
            index=index,
            doc_type=doc_type)) as records:
        for record in records:
            # Start from an empty set for EVERY record: a record without
            # 'references' must neither reuse the previous record's ids
            # nor fail on an unbound name.
            unique_refs_ids = set()
            for reference in record['_source'].get('references', []):
                recid = reference.get('recid')
                if recid:
                    if isinstance(recid, list):
                        # Sometimes there is more than one recid in the
                        # reference; use the last one without mutating
                        # the scanned hit.
                        recid = recid[-1]
                    unique_refs_ids.add(recid)

            for unique_refs_id in unique_refs_ids:
                citations_lookup[unique_refs_id] += 1

    click.echo("... DONE.")
    click.echo("Adding citation numbers...")

    success, failed = es_bulk(
        current_search_client,
        get_records_to_update_generator(citations_lookup),
        chunk_size=chunk_size,
        raise_on_exception=True,
        raise_on_error=True,
        request_timeout=request_timeout,
        stats_only=True)
    click.echo(
        "... DONE: {} records updated with success. {} failures.".format(
            success, failed))
示例#31
0
def doc_type_should_be_sent_to_orcid(record):
    """Return ``True`` if the record's document type should go to ORCID.

    The document type is resolved from ``record['$schema']``; only the
    ``hep`` type qualifies.
    """
    _, resolved_doc_type = schema_to_index(record['$schema'])
    return resolved_doc_type == 'hep'