def run(self):
    """Harvest MARCXML records over OAI-PMH and index them one by one.

    Records are listed from ``http://invenio.nusl.cz/oai2d/`` with the
    ``marcxml`` metadata prefix and indexed into ``self.index``, using the
    value of the MARC ``001`` field as the document id.

    A record that fails to index does not stop the harvest: its dictionary
    and the traceback are appended to ``<run start timestamp>.err`` inside
    ``self.path`` and the loop moves on to the next record.
    """
    timestamp = datetime.utcnow()
    # The timestamp is fixed for the whole run, so every failure of this run
    # lands in the same error file.
    file_name = f'{timestamp.strftime("%Y%m%dT%H%M%S")}.err'
    sickle = Sickle('http://invenio.nusl.cz/oai2d/')
    sickle.class_mapping['ListRecords'] = MarcXMLParser
    sickle.class_mapping['GetRecord'] = MarcXMLParser
    oai_logger.info("Loading records")
    records = sickle.ListRecords(metadataPrefix='marcxml')
    for idx, record in enumerate(records):
        print(f"{idx}. {record.id}")
        oai_logger.info(f"{idx}. {record.id}")
        try:
            current_search_client.index(
                index=self.index,
                id=record.marc_dict["001"],
                body=record.marc_dict
            )
        except Exception:
            # ``Exception`` rather than a bare ``except`` so that Ctrl-C and
            # SystemExit can still interrupt a long-running harvest.
            exc_traceback = traceback.format_exc()
            print(exc_traceback)
            print("\n\n\n")
            file_path = os.path.join(self.path, file_name)
            # Explicit UTF-8: MARC data may contain characters the locale's
            # default encoding cannot represent.
            with open(file_path, "a", encoding="utf-8") as f:
                f.write(
                    f"Dictionary: {record.marc_dict}\n\n"
                    f"{exc_traceback}\n\n\n\n")
            continue
Exemplo n.º 2
0
    def _update_status_in_doc(cls, record, es_item):
        """Copy an item's status onto its entry inside the indexed document.

        :param record: the item whose ``status`` is propagated
        :param es_item: a dict of the elasticsearch item (not read here)
        """
        # fetch the indexed document (with its version) the item points to
        document_pid = extracted_data_from_ref(record.get('document'))
        search = DocumentsSearch().extra(version=True).filter(
            'term', pid=document_pid)
        doc = next(search.scan())
        data = doc.to_dict()

        # lazily walk holdings -> items; only the first item with the
        # record's pid is touched
        matching_items = (
            item
            for hold in data.get('holdings', [])
            for item in hold.get('items', [])
            if item['pid'] == record.pid
        )
        target = next(matching_items, None)
        if target is not None:
            target['status'] = record['status']

        # write the document back, reusing the version that was read
        current_search_client.index(
            index=DocumentsSearch.Meta.index,
            id=doc.meta.id,
            body=data,
            version=doc.meta.version,
            version_type='external_gte',
        )
Exemplo n.º 3
0
def orcid_test(mock_user, request):
    """Orcid test fixture.

    Indexes a test author (id 10) into ``records-authors``, registers a
    finalizer that deletes it again, and returns an ``OrcidApiMock`` together
    with literature record 782466 extended by an author entry that references
    the test author.
    """
    app = mock_user.app

    def remove_author_document():
        # Finalizer: drop the author document indexed below.
        with app.app_context():
            es.delete(index='records-authors', doc_type='authors', id=10)

    author_record = {
        "name": {
            "status": "ACTIVE",
            "preferred_name": "Full Name",
            "value": "Full Name",
        },
        "$schema": "http://localhost:5000/schemas/records/authors.json",
        "control_number": "10",
        "self": {"$ref": "http://localhost:5000/api/authors/10"},
        "ids": [
            {"type": "INSPIRE", "value": "INSPIRE-0000000"},
            {"type": "ORCID", "value": "0000-0001-9412-8627"},
        ],
        "self_recid": 10,
        "earliest_date": "2015-09-23",
    }

    request.addfinalizer(remove_author_document)

    with app.app_context():
        es.index(
            index='records-authors',
            doc_type='authors',
            id=10,
            body=author_record,
        )
        es.indices.refresh('records-authors')

        literature_record = get_db_record('literature', 782466)
        linked_author = {
            u'affiliations': [{u'value': u'St. Petersburg, INP'}],
            u'curated_relation': True,
            u'full_name': u'Full, Name',
            u'profile': {
                u'__url__': u'http://inspirehep.net/record/00000000',
            },
            u'record': {u'$ref': u'http://localhost:5000/api/authors/10'},
        }
        literature_record['authors'].append(linked_author)
        return OrcidApiMock(1), literature_record
Exemplo n.º 4
0
def test_appoint_profile_from_claimed_signature(small_app):
    """Check the module for the case where claimed signature takes
    everything.

    Two signatures of phonetic block ``HAGp`` are reported as one cluster;
    one carries recid ``"2"``, the other is curated with recid
    ``"314159265"``.  After clustering both must carry the curated recid.
    """
    from inspirehep.modules.disambiguation.tasks import (
        disambiguation_clustering,
        update_authors_recid,
    )

    # First signature: not curated, attached to recid "2".
    unclaimed_id = str(
        PersistentIdentifier.get("literature", 11883).object_uuid)
    unclaimed = get_es_record_by_uuid(unclaimed_id)
    unclaimed_uuid = unclaimed["authors"][0]["uuid"]
    unclaimed["authors"][0]["signature_block"] = "HAGp"
    unclaimed["authors"][0]["recid"] = "2"
    es.index(
        index="records-hep", doc_type="hep", id=unclaimed_id, body=unclaimed)
    es.indices.refresh("records-hep")

    # Second signature: curated, attached to recid "314159265".
    claimed_id = str(
        PersistentIdentifier.get("literature", 1358492).object_uuid)
    claimed = get_es_record_by_uuid(claimed_id)
    claimed_uuid = claimed["authors"][0]["uuid"]
    claimed["authors"][0]["signature_block"] = "HAGp"
    claimed["authors"][0]["recid"] = "314159265"
    claimed["authors"][0]["curated_relation"] = True
    es.index(index="records-hep", doc_type="hep", id=claimed_id, body=claimed)
    es.indices.refresh("records-hep")

    with patch(
        "celery.current_app.send_task",
        return_value=_BeardObject(({"2": [unclaimed_uuid, claimed_uuid]}, {})),
    ), patch(
        "inspirehep.modules.disambiguation.tasks.update_authors_recid.delay",
        side_effect=update_authors_recid,
    ):
        disambiguation_clustering("HAGp")

    for checked_id in (unclaimed_id, claimed_id):
        assert Record.get_record(checked_id)["authors"][0]["recid"] == "314159265"
Exemplo n.º 5
0
def index_record_modification(sender, changes):
    """Example handler for indexing record metadata.

    :param sender: the signal sender (not used).
    :param changes: iterable of ``(obj, change)`` pairs.  Only
        ``RecordMetadata`` objects are handled: ``"insert"`` and ``"update"``
        (re)index the record's JSON, ``"delete"`` removes it from the index.
    """
    for obj, change in changes:
        if not isinstance(obj, RecordMetadata):
            continue
        if change in ("insert", "update"):
            current_search_client.index(index="records", doc_type="record", id=obj.id, body=obj.json)
        elif change == "delete":
            # Equality, not ``in ("delete")``: the parenthesised string is not
            # a tuple, so ``in`` was a substring test that also matched values
            # such as "" or "del".
            current_search_client.delete(index="records", doc_type="record", id=obj.id)
Exemplo n.º 6
0
 def index(self, index_name=None, doc_type=None):
     """Index the workflow record into desired index/doc_type.

     The target comes from the ``WORKFLOWS_UI_DATA_TYPES`` entry for this
     object's data type; explicit ``index_name`` / ``doc_type`` arguments
     take precedence.  Nothing is indexed when there is neither a config
     entry nor both explicit arguments.
     """
     data_types = current_app.config['WORKFLOWS_UI_DATA_TYPES']
     config = data_types.get(self["_workflow"]["data_type"])

     # Without a config entry both overrides are required.
     if not config and not (index_name and doc_type):
         return

     document_id = str(self['id'])
     target_index = index_name or config.get('search_index')
     target_type = doc_type or config.get('search_type')
     current_search_client.index(
         id=document_id,
         index=target_index,
         doc_type=target_type,
         body=self.dumps(),
     )
Exemplo n.º 7
0
def fixtures():
    """Example fixtures: index one public and one private sample record."""
    # Elasticsearch 7+ uses the generic '_doc' type.
    doc_type = '_doc' if ES_VERSION[0] >= 7 else 'example'
    sample_records = (
        {'title': 'Public', 'body': 'test 1', 'public': 1},
        {'title': 'Private', 'body': 'test 2', 'public': 0},
    )
    for sample in sample_records:
        current_search_client.index(
            index='demo-default-v1.0.0',
            body=sample,
            doc_type=doc_type,
        )
Exemplo n.º 8
0
def _new_percolator(spec, search_pattern):
    """Create the percolator document for a new set in every search index.

    Does nothing unless both ``spec`` and ``search_pattern`` are truthy.
    """
    if not (spec and search_pattern):
        return

    parsed_pattern = query_string_parser(search_pattern=search_pattern)
    query = parsed_pattern.to_dict()
    for index_name in current_search.mappings.keys():
        # Create the percolator doc_type in the existing index for >= ES5
        # TODO: Consider doing this only once in app initialization
        doc_type = _get_percolator_doc_type(index_name)
        _create_percolator_mapping(index_name, doc_type)
        current_search_client.index(
            index=index_name,
            doc_type=doc_type,
            id='oaiset-{}'.format(spec),
            body={'query': query},
        )
Exemplo n.º 9
0
def _new_percolator(spec, search_pattern):
    """Register the percolator query of a new set in all mapped indices.

    A set without a spec or without a search pattern is ignored.
    """
    if not spec or not search_pattern:
        return

    query = query_string_parser(search_pattern=search_pattern).to_dict()
    for mapped_index in current_search.mappings.keys():
        # Create the percolator doc_type in the existing index for >= ES5
        # TODO: Consider doing this only once in app initialization
        percolator_type = _get_percolator_doc_type(mapped_index)
        _create_percolator_mapping(mapped_index, percolator_type)
        current_search_client.index(index=mapped_index,
                                    doc_type=percolator_type,
                                    id='oaiset-{}'.format(spec),
                                    body={'query': query})
Exemplo n.º 10
0
def index_documents(docs, bulk=False):
    """Index documents into the ``relationships`` index.

    :param docs: iterable of documents to index.
    :param bulk: hand ``docs`` to ``bulk_index`` in one go when true;
        otherwise index them one request at a time.
    """
    if not bulk:
        for document in docs:
            current_search_client.index(
                index='relationships', doc_type='doc', body=document)
        return

    bulk_index(client=current_search_client,
               actions=docs,
               index='relationships',
               doc_type='doc')
Exemplo n.º 11
0
def index_documents(docs: Iterable[dict], bulk: bool = False):
    """Index documents into the ``relationships`` index.

    :param docs: documents to index.
    :param bulk: when true, pass ``docs`` to ``bulk_index`` with
        ``raise_on_error=False``; otherwise index them one at a time.
    """
    if not bulk:
        for document in docs:
            current_search_client.index(
                index='relationships', doc_type='doc', body=document)
        return

    bulk_index(client=current_search_client,
               actions=docs,
               index='relationships',
               doc_type='doc',
               raise_on_error=False)
Exemplo n.º 12
0
def index_record_dict(record_dict, doc_type, recid, index=None, parent=None):
    """ Index a given document

    :param record_dict: [dict] A python dictionary containing
    a JSON-like structure which needs to be indexed
    :param doc_type: [string] type of document. "publication" or "datatable"
    :param recid: id under which the document is indexed
    :param index: [string] name of the index, passed to ``es.index`` as given
    :param parent: [int] record id of the potential parent; only sent
    when truthy

    :return: [dict] Response dictionary
    """
    index_kwargs = dict(
        index=index,
        doc_type=doc_type,
        id=recid,
        body=record_dict,
    )
    if parent:
        index_kwargs['parent'] = parent
    return es.index(**index_kwargs)
Exemplo n.º 13
0
 def index(self, index_name=None, doc_type=None):
     """Index the workflow record into desired index/doc_type.

     The target index and doc type come from the ``WORKFLOWS_UI_DATA_TYPES``
     entry for this object's data type, unless overridden by ``index_name``
     / ``doc_type``.  Nothing is indexed when there is neither a config
     entry nor both explicit arguments.

     A ``TransportError`` from the search client is logged (with traceback)
     and swallowed, so a search outage does not break the caller.
     """
     config = current_app.config['WORKFLOWS_UI_DATA_TYPES'].get(
         self["_workflow"]["data_type"])
     if config or (index_name and doc_type):
         try:
             current_search_client.index(
                 id=str(self['id']),
                 index=index_name or config.get('search_index'),
                 doc_type=doc_type or config.get('search_type'),
                 body=self.dumps(),
             )
         except TransportError:
             # ``Logger.exception`` needs a message; calling it without one
             # raised TypeError here and masked the original error.  It logs
             # at ERROR level and appends the traceback, so one call covers
             # both the message and the exception details.
             current_app.logger.exception(
                 "Problem while indexing workflow object {0}".format(
                     self.model.id))
def test_appoint_profile_from_claimed_signature(small_app):
    """Check the module for the case where claimed signature takes
    everything.

    Both signatures of the ``HAGp`` cluster must end up with the recid of
    the curated one (``"314159265"``).
    """
    from inspirehep.modules.disambiguation.tasks import (
        disambiguation_clustering,
        update_authors_recid,
    )

    # Signature attached to recid "2", not curated.
    plain_pid = PersistentIdentifier.get("literature", 11883)
    plain_id = str(plain_pid.object_uuid)
    plain_record = get_es_record_by_uuid(plain_id)
    plain_uuid = plain_record['authors'][0]['uuid']
    plain_record['authors'][0]['signature_block'] = "HAGp"
    plain_record['authors'][0]['recid'] = "2"
    es.index(index='records-hep', doc_type='hep', id=plain_id,
             body=plain_record)
    es.indices.refresh('records-hep')

    # Curated signature attached to recid "314159265".
    curated_pid = PersistentIdentifier.get("literature", 1358492)
    curated_id = str(curated_pid.object_uuid)
    curated_record = get_es_record_by_uuid(curated_id)
    curated_uuid = curated_record['authors'][0]['uuid']
    curated_record['authors'][0]['signature_block'] = "HAGp"
    curated_record['authors'][0]['recid'] = "314159265"
    curated_record['authors'][0]['curated_relation'] = True
    es.index(index='records-hep', doc_type='hep', id=curated_id,
             body=curated_record)
    es.indices.refresh('records-hep')

    with patch(
        "celery.current_app.send_task",
        return_value=_BeardObject(({"2": [plain_uuid, curated_uuid]}, {})),
    ), patch(
        "inspirehep.modules.disambiguation.tasks.update_authors_recid.delay",
        side_effect=update_authors_recid,
    ):
        disambiguation_clustering("HAGp")

    plain_recid = Record.get_record(plain_id)['authors'][0]['recid']
    assert plain_recid == "314159265"
    curated_recid = Record.get_record(curated_id)['authors'][0]['recid']
    assert curated_recid == "314159265"
Exemplo n.º 15
0
def index_record_modification(sender, changes):
    """Example handler for indexing record metadata.

    :param sender: the signal sender (not used).
    :param changes: iterable of ``(obj, change)`` pairs.  Only
        ``RecordMetadata`` objects are handled: ``'insert'`` and ``'update'``
        (re)index the record's JSON, ``'delete'`` removes it from the index.
    """
    for obj, change in changes:
        if not isinstance(obj, RecordMetadata):
            continue
        if change in ('insert', 'update'):
            current_search_client.index(
                index='records',
                doc_type='record',
                id=obj.id,
                body=obj.json,
            )
        elif change == 'delete':
            # Equality, not ``in ('delete')``: the parenthesised string is not
            # a tuple, so ``in`` was a substring test that also matched values
            # such as '' or 'del'.
            current_search_client.delete(
                index='records',
                doc_type='record',
                id=obj.id,
            )
Exemplo n.º 16
0
def index_documents(docs: Iterable[dict], bulk: bool = False):
    """Index documents into the ``relationships`` index.

    :param docs: documents to index.
    :param bulk: when true, pass ``docs`` to ``bulk_index`` (chunks of 300
        actions, at most 30 MiB per chunk, ``raise_on_error=False``);
        otherwise index them one at a time.
    """
    if not bulk:
        for document in docs:
            current_search_client.index(
                index='relationships', doc_type='doc', body=document)
        return

    bulk_options = dict(
        raise_on_error=False,
        chunk_size=300,  # TODO: Make configurable
        max_chunk_bytes=(30 * 1024 * 1024),  # TODO: Make configurable
    )
    bulk_index(client=current_search_client,
               actions=docs,
               index='relationships',
               doc_type='doc',
               **bulk_options)
Exemplo n.º 17
0
    def set(self, taxonomy_term: TaxonomyTerm, timestamp=None) -> None:
        """
        Serialize a taxonomy term and index it into Elasticsearch.

        The document is indexed into ``self.index`` under the term's id.
        Terms without a parent are skipped.

        :param taxonomy_term: Taxonomy term class from flask-taxonomies
        :type taxonomy_term: TaxonomyTerm
        :param timestamp: Datetime class, forwarded to the serializer
        :type timestamp: Datetime class
        :return: None
        :rtype: None
        """
        if not taxonomy_term.parent:
            return

        serialized_term = get_taxonomy_term(
            code=taxonomy_term.taxonomy.slug,
            slug=taxonomy_term.slug,
            timestamp=timestamp,
        )
        current_search_client.index(
            index=self.index,
            id=taxonomy_term.id,
            body=serialized_term,
        )
Exemplo n.º 18
0
    def create(cls, data, id_=None, index_refresh='false', **kwargs):
        """Build a record, optionally validate it, and index it.

        :param data: Dict with the record metadata.
        :param id_: Value stored as the record's ``pid`` when given.
        :param index_refresh: Forwarded as ``refresh`` to the index call.
            If `true` then refresh the affected shards to make this operation
            visible to search, if `wait_for` then wait for a refresh to make
            this operation visible to search, if `false` (the default) then
            do nothing with refreshes.
            Valid choices: 'true', 'false', 'wait_for'
        :returns: A new :class:`Record` instance.
        """
        if id_:
            data['pid'] = id_

        record = cls(data, model=None, **kwargs)

        # Pre-create hooks run before validation and indexing.
        for extension in cls._extensions:
            extension.pre_create(record)

        validation_enabled = current_app.config.get(
            'RERO_ILS_ENABLE_OPERATION_LOG_VALIDATION')
        if validation_enabled:
            # Validate also encodes the data
            # For backward compatibility we pop them here.
            format_checker = kwargs.pop('format_checker', None)
            validator = kwargs.pop('validator', None)
            if '$schema' not in record:
                schema_url = current_jsonschemas.path_to_url(cls._schema)
                record['$schema'] = schema_url
            record._validate(
                format_checker=format_checker,
                validator=validator,
                use_model=False,
            )

        target_index = cls.get_index(record)
        document = record.dumps()
        current_search_client.index(
            index=target_index,
            body=document,
            id=record['pid'],
            refresh=index_refresh,
        )

        # Post-create hooks run once the record has been indexed.
        for extension in cls._extensions:
            extension.post_create(record)
        return record
Exemplo n.º 19
0
 def index(self, index_name=None, doc_type=None):
     """Index the workflow record into desired index/doc_type.

     Falls back to the ``WORKFLOWS_UI_DATA_TYPES`` entry for this object's
     data type when ``index_name`` / ``doc_type`` are not given.  A
     ``TransportError`` from the search client is logged and swallowed.
     """
     data_types = current_app.config['WORKFLOWS_UI_DATA_TYPES']
     config = data_types.get(self["_workflow"]["data_type"])

     # Without a config entry both overrides are required.
     if not config and not (index_name and doc_type):
         return

     try:
         current_search_client.index(
             id=str(self['id']),
             index=index_name or config.get('search_index'),
             doc_type=doc_type or config.get('search_type'),
             body=self.dumps(),
         )
     except TransportError as err:
         logger = current_app.logger
         logger.exception(err)
         logger.error(
             "Problem while indexing workflow object {0}".format(
                 self.model.id))
Exemplo n.º 20
0
def test_single_signature_with_no_profile(small_app):
    """Check the module for the case with a single, new signature.

    The signature is reported as a new cluster and must end up with
    recid ``"1"``.
    """
    from inspirehep.modules.disambiguation.tasks import (
        disambiguation_clustering,
        update_authors_recid,
    )

    pid = PersistentIdentifier.get("literature", 11883)
    record_id = str(pid.object_uuid)
    es_record = get_es_record_by_uuid(record_id)
    signature_uuid = es_record["authors"][0]["uuid"]

    # Add phonetic block to the record.
    es_record["authors"][0]["signature_block"] = "HAGp"
    es.index(
        index="records-hep", doc_type="hep", id=record_id, body=es_record)
    es.indices.refresh("records-hep")

    with patch(
        "celery.current_app.send_task",
        return_value=_BeardObject(({}, {"0": [signature_uuid]})),
    ), patch(
        "inspirehep.modules.disambiguation.tasks.update_authors_recid.delay",
        side_effect=update_authors_recid,
    ):
        disambiguation_clustering("HAGp")

    stored_author = Record.get_record(record_id)["authors"][0]
    assert stored_author["recid"] == "1"
Exemplo n.º 21
0
def orcid_test(mock_user, request):
    """Orcid test fixture.

    Indexes a test author (id 10) into ``records-authors``, registers a
    finalizer that deletes it again, and returns an ``OrcidApiMock`` together
    with literature record 782466 extended by an author entry that references
    the test author.
    """
    app = mock_user.app

    def drop_indexed_author():
        # Finalizer: remove the author document indexed below.
        with app.app_context():
            es.delete(index='records-authors', doc_type='authors', id=10)

    author_document = {
        "name": {
            "status": "ACTIVE",
            "preferred_name": "Full Name",
            "value": "Full Name",
        },
        "$schema": "http://localhost:5000/schemas/records/authors.json",
        "control_number": "10",
        "self": {
            "$ref": "http://localhost:5000/api/authors/10",
        },
        "ids": [
            {
                "type": "INSPIRE",
                "value": "INSPIRE-0000000",
            },
            {
                "type": "ORCID",
                "value": "0000-0001-9412-8627",
            },
        ],
        "self_recid": 10,
        "earliest_date": "2015-09-23",
    }

    request.addfinalizer(drop_indexed_author)

    with app.app_context():
        es.index(
            index='records-authors',
            doc_type='authors',
            id=10,
            body=author_document,
        )
        es.indices.refresh('records-authors')

        literature = get_db_record('literature', 782466)
        literature['authors'].append({
            u'affiliations': [
                {u'value': u'St. Petersburg, INP'},
            ],
            u'curated_relation': True,
            u'full_name': u'Full, Name',
            u'profile': {
                u'__url__': u'http://inspirehep.net/record/00000000',
            },
            u'record': {
                u'$ref': u'http://localhost:5000/api/authors/10',
            },
        })
        return OrcidApiMock(1), literature
Exemplo n.º 22
0
def index_record_dict(record_dict, doc_type, recid, index=None, parent=None):
    """ Index a given document

    :param record_dict: [dict] A python dictionary containing
    a JSON-like structure which needs to be indexed
    :param doc_type: [string] type of document. "publication" or "datatable"
    :param recid: id under which the document is indexed
    :param index: [string] name of the index, passed to ``es.index`` as given
    :param parent: [int] record id of the potential parent; only sent
    when truthy

    :return: [dict] Response dictionary
    """
    # ``parent`` is only forwarded when it is set.
    parent_kwargs = {'parent': parent} if parent else {}
    return es.index(
        index=index,
        doc_type=doc_type,
        id=recid,
        body=record_dict,
        **parent_kwargs
    )
Exemplo n.º 23
0
def test_match_signature_with_existing_profile(small_app):
    """Check the module for the case with signatures and existing profile.

    Two signatures of the ``HAGp`` block are reported as belonging to
    profile ``"1"``; both must end up with that recid.
    """
    from inspirehep.modules.disambiguation.tasks import (
        disambiguation_clustering,
        update_authors_recid,
    )

    first_id = str(PersistentIdentifier.get('lit', 11883).object_uuid)
    first_record = get_es_record_by_uuid(first_id)
    first_uuid = first_record['authors'][0]['uuid']

    # Add phonetic block to the record.
    first_record['authors'][0]['signature_block'] = "HAGp"
    es.index(index='records-hep', doc_type='hep', id=first_id,
             body=first_record)
    es.indices.refresh('records-hep')

    second_id = str(PersistentIdentifier.get('lit', 1358492).object_uuid)
    second_record = get_es_record_by_uuid(second_id)
    second_uuid = second_record['authors'][0]['uuid']

    # Add phonetic block to the record.
    second_record['authors'][0]['signature_block'] = "HAGp"
    es.index(index='records-hep', doc_type='hep', id=second_id,
             body=second_record)
    es.indices.refresh('records-hep')

    with patch(
        "celery.current_app.send_task",
        return_value=_BeardObject(({"1": [first_uuid, second_uuid]}, {})),
    ), patch(
        "inspirehep.modules.disambiguation.tasks.update_authors_recid.delay",
        side_effect=update_authors_recid,
    ):
        disambiguation_clustering("HAGp")

    first_author = InspireRecord.get_record(first_id)['authors'][0]
    assert first_author['recid'] == "1"
    second_author = InspireRecord.get_record(second_id)['authors'][0]
    assert second_author['recid'] == "1"
Exemplo n.º 24
0
def index_record_modification(sender, changes):
    """Flush the record changes queued on ``flask.g`` to the search index.

    Queued records are indexed unless they are also queued for deletion,
    queued deletions are executed, and both queues are then reset for the
    next session.
    """
    pending_index = flask.g.get('invenio_search_records_to_index', dict())
    pending_delete = flask.g.get('invenio_search_records_to_delete', set())

    for record_id in pending_index:
        if record_id in pending_delete:
            # Deleted in the same session: do not index it first.
            continue
        current_search_client.index(
            index='invenio_records_rest_test_index',
            doc_type='record',
            id=record_id,
            body=pending_index[record_id].body,
            version=pending_index[record_id].version,
            version_type='external_gte',
        )

    for record_id in pending_delete:
        current_search_client.delete(
            index='invenio_records_rest_test_index',
            doc_type='record',
            id=record_id,
        )

    flask.g.invenio_search_records_to_index = dict()
    flask.g.invenio_search_records_to_delete = set()
Exemplo n.º 25
0
def _new_percolator(spec, search_pattern):
    """Create the percolator document of a new set in the OAI record indices.

    Only indices whose name starts with ``OAISERVER_RECORD_INDEX`` are
    considered.  A failure for one index is logged as a warning and does not
    stop the remaining indices.
    """
    if not (spec and search_pattern):
        return

    query = query_string_parser(search_pattern=search_pattern).to_dict()
    oai_records_index = current_app.config["OAISERVER_RECORD_INDEX"]

    # Skip indices/mappings not used by OAI-PMH
    oai_mappings = (
        (name, path)
        for name, path in current_search.mappings.items()
        if name.startswith(oai_records_index)
    )
    for index, mapping_path in oai_mappings:
        # Create the percolator doc_type in the existing index for >= ES5
        # TODO: Consider doing this only once in app initialization
        try:
            doc_type = _get_percolator_doc_type(index)
            _create_percolator_mapping(index, doc_type, mapping_path)
            percolator_index = _build_percolator_index_name(index)
            current_search_client.index(
                index=percolator_index,
                doc_type=doc_type,
                id="oaiset-{}".format(spec),
                body={"query": query},
            )
        except Exception as e:
            current_app.logger.warning(e)
def index_record_modification(sender, changes):
    """Apply the index/delete queues stored on ``flask.g``, then clear them.

    Records queued for indexing are skipped when they are also queued for
    deletion; afterwards every queued deletion is executed and both queues
    are reset for the next session.
    """
    to_index = flask.g.get('invenio_search_records_to_index', dict())
    to_delete = flask.g.get('invenio_search_records_to_delete', set())
    test_index = 'invenio_records_rest_test_index'

    for record_id in to_index:
        if record_id in to_delete:
            continue
        current_search_client.index(
            index=test_index,
            doc_type='record',
            id=record_id,
            body=to_index[record_id].body,
            version=to_index[record_id].version,
            version_type='external_gte',
        )

    for record_id in to_delete:
        current_search_client.delete(
            index=test_index,
            doc_type='record',
            id=record_id,
        )

    flask.g.invenio_search_records_to_index = dict()
    flask.g.invenio_search_records_to_delete = set()
Exemplo n.º 27
0
def test_match_signature_with_existing_profile(small_app):
    """Check the module for the case with signatures and existing profile.

    Both signatures of the ``HAGp`` block are reported under profile
    ``"1"`` and must carry that recid afterwards.
    """
    from inspirehep.modules.disambiguation.tasks import (
        disambiguation_clustering, update_authors_recid)

    older_pid = PersistentIdentifier.get('lit', 11883)
    older_id = str(older_pid.object_uuid)
    older_record = get_es_record_by_uuid(older_id)
    older_uuid = older_record['authors'][0]['uuid']

    # Add phonetic block to the record.
    older_record['authors'][0]['signature_block'] = "HAGp"
    es.index(
        index='records-hep',
        doc_type='hep',
        id=older_id,
        body=older_record,
    )
    es.indices.refresh('records-hep')

    newer_pid = PersistentIdentifier.get('lit', 1358492)
    newer_id = str(newer_pid.object_uuid)
    newer_record = get_es_record_by_uuid(newer_id)
    newer_uuid = newer_record['authors'][0]['uuid']

    # Add phonetic block to the record.
    newer_record['authors'][0]['signature_block'] = "HAGp"
    es.index(
        index='records-hep',
        doc_type='hep',
        id=newer_id,
        body=newer_record,
    )
    es.indices.refresh('records-hep')

    with patch(
        "celery.current_app.send_task",
        return_value=_BeardObject(({"1": [older_uuid, newer_uuid]}, {})),
    ), patch(
        "inspirehep.modules.disambiguation.tasks.update_authors_recid.delay",
        side_effect=update_authors_recid,
    ):
        disambiguation_clustering("HAGp")

    for checked_id in (older_id, newer_id):
        assert InspireRecord.get_record(checked_id)['authors'][0]['recid'] == "1"
Exemplo n.º 28
0
def test_solve_claim_conflicts(small_app):
    """Check the module for the case where at least two claimed
    signatures are assigned to the same cluster.

    Two curated signatures (recids ``"3"`` and ``"4"``) and one non-curated
    signature are reported as one cluster; the patched conflict resolution
    pairs the non-curated signature with the second claim, so it must end
    up with recid ``"4"``.
    """
    from inspirehep.modules.disambiguation.tasks import (
        disambiguation_clustering,
        update_authors_recid,
    )

    # Claimed signature #1.
    first_claim_id = str(
        PersistentIdentifier.get("literature", 4328).object_uuid)
    first_claim = get_es_record_by_uuid(first_claim_id)
    first_claim_uuid = first_claim["authors"][0]["uuid"]

    # Add phonetic block to the record.
    first_claim["authors"][0]["signature_block"] = "HAGp"
    first_claim["authors"][0]["curated_relation"] = True
    first_claim["authors"][0]["recid"] = "3"
    es.index(
        index="records-hep", doc_type="hep", id=first_claim_id,
        body=first_claim)
    es.indices.refresh("records-hep")

    # Claimed signature #2.
    second_claim_id = str(
        PersistentIdentifier.get("literature", 1358492).object_uuid)
    second_claim = get_es_record_by_uuid(second_claim_id)
    second_claim_uuid = second_claim["authors"][0]["uuid"]

    # Add phonetic block to the record.
    second_claim["authors"][0]["signature_block"] = "HAGp"
    second_claim["authors"][0]["curated_relation"] = True
    second_claim["authors"][0]["recid"] = "4"
    es.index(
        index="records-hep", doc_type="hep", id=second_claim_id,
        body=second_claim)
    es.indices.refresh("records-hep")

    # Not claimed signature.
    unclaimed_id = str(
        PersistentIdentifier.get("literature", 11883).object_uuid)
    unclaimed = get_es_record_by_uuid(unclaimed_id)
    unclaimed_uuid = unclaimed["authors"][0]["uuid"]

    # Add phonetic block to the record.
    unclaimed["authors"][0]["signature_block"] = "HAGp"
    es.index(
        index="records-hep", doc_type="hep", id=unclaimed_id, body=unclaimed)
    es.indices.refresh("records-hep")

    with patch(
        "celery.current_app.send_task",
        return_value=_BeardObject(
            ({"3": [first_claim_uuid, second_claim_uuid, unclaimed_uuid]}, {})
        ),
    ), patch(
        "inspirehep.modules.disambiguation.logic._solve_claims_conflict",
        return_value=_ConflictObject({second_claim_uuid: [unclaimed_uuid]}),
    ), patch(
        "inspirehep.modules.disambiguation.tasks.update_authors_recid.delay",
        side_effect=update_authors_recid,
    ):
        disambiguation_clustering("HAGp")

    unclaimed_author = Record.get_record(unclaimed_id)["authors"][0]
    assert unclaimed_author["recid"] == "4"
def index_record_modification(sender, changes):
    """Flush the record changes queued on ``flask.g`` to the search index.

    Every record queued under ``invenio_search_records_to_index`` is indexed
    unless it is also queued for deletion; every record queued under
    ``invenio_search_records_to_delete`` is then deleted. Finally both queues
    are reset to empty containers for the next session.

    :param sender: not used by this handler.
    :param changes: not used by this handler.
    """
    pending_index = flask.g.get('invenio_search_records_to_index', dict())
    pending_delete = flask.g.get('invenio_search_records_to_delete', set())
    target_index = current_app.config["RECORDS_REST_DEFAULT_SEARCH_INDEX"]

    for record_id in pending_index:
        # A record scheduled for deletion must not be (re)indexed first.
        if record_id in pending_delete:
            continue
        entry = pending_index[record_id]
        current_search_client.index(
            index=target_index,
            doc_type='testrecord-v1.0.0',
            id=record_id,
            body=entry.body,
            version=entry.version,
            version_type='external_gte',
        )

    for record_id in pending_delete:
        current_search_client.delete(
            index=target_index,
            doc_type='testrecord-v1.0.0',
            id=record_id,
        )

    # Start the next session with empty queues.
    flask.g.invenio_search_records_to_index = dict()
    flask.g.invenio_search_records_to_delete = set()
Exemplo n.º 30
0
    def create_from_kwargs(cls, index_name='', disable_persistent_identifier=False, **kwargs):
        """Build a test record, optionally indexing it and attaching a PID.

        :param index_name: when non-empty, the record JSON is indexed into
            this index (using the last ``-``-separated part of the name as
            the document type) and the index is refreshed afterwards.
        :param disable_persistent_identifier: when true, no persistent
            identifier is created for the record.
        :param kwargs: ``id`` and ``json`` are consumed here; ``pid_type`` is
            read to decide whether a random title is generated; whatever is
            left is forwarded to ``TestPersistentIdentifier.create_from_kwargs``.
        :return: the populated instance of ``cls``.
        """
        instance = cls()

        # Snapshot the caller's kwargs before ``id`` and ``json`` are popped,
        # so the parent factory still receives the full set.
        factory_kwargs = copy.deepcopy(kwargs)
        supplied_id = kwargs.pop('id', None)
        if not supplied_id:
            factory_kwargs['id'] = uuid.uuid4()

        record_json = copy.deepcopy(cls.JSON_SKELETON)
        json_overrides = kwargs.pop('json', {})
        record_json.update(json_overrides)

        is_literature = kwargs.get('pid_type', 'lit') == 'lit'
        if is_literature and 'titles' not in record_json:
            record_json['titles'] = [{'title': generate_random_string(60)}]

        # Either reserve the caller-provided recid or allocate a fresh one.
        if 'control_number' in record_json:
            reserve_recid(record_json['control_number'])
        else:
            record_json['control_number'] = get_next_free_recid()

        factory_kwargs['json'] = record_json

        parent_factory = super(TestRecordMetadata, cls)
        instance.record_metadata = parent_factory.create_from_kwargs(
            factory_kwargs)

        if index_name:
            doc_type = index_name.split('-')[-1]
            instance.es_index_result = es.index(
                index=index_name,
                doc_type=doc_type,
                body=instance.record_metadata.json,
                params={}
            )
            instance.es_refresh_result = es.indices.refresh(index_name)

        if not disable_persistent_identifier:
            pid_factory = TestPersistentIdentifier.create_from_kwargs(
                object_uuid=instance.record_metadata.id,
                pid_value=instance.record_metadata.json.get('control_number'),
                **kwargs)
            instance.persistent_identifier = pid_factory.persistent_identifier

        instance.inspire_record = InspireRecord(
            instance.record_metadata.json, model=RecordMetadata)

        return instance
Exemplo n.º 31
0
    def create_from_kwargs(cls, index_name='', disable_persistent_identifier=False, **kwargs):
        """Create a test record with optional indexing and persistent identifier.

        :param index_name: name of the index the record JSON is written to;
            an empty value skips indexing. The index is refreshed after the
            write.
        :param disable_persistent_identifier: skip the creation of the
            persistent identifier when true.
        :param kwargs: ``id`` and ``json`` are popped by this method,
            ``pid_type`` controls the random-title fallback, and the
            remaining entries are passed on to
            ``TestPersistentIdentifier.create_from_kwargs``.
        :return: the new instance of ``cls``.
        """
        instance = cls()

        # Copy first: the pops below must not hide ``id``/``json`` from the
        # parent factory.
        metadata_kwargs = copy.deepcopy(kwargs)
        given_id = kwargs.pop('id', None)
        if not given_id:
            metadata_kwargs['id'] = uuid.uuid4()

        payload = copy.deepcopy(cls.JSON_SKELETON)
        payload.update(kwargs.pop('json', {}))

        wants_title = kwargs.get('pid_type', 'lit') == 'lit'
        if wants_title and 'titles' not in payload:
            random_title = {'title': generate_random_string(60)}
            payload['titles'] = [random_title]

        if 'control_number' in payload:
            reserve_recid(payload['control_number'])
        else:
            payload['control_number'] = get_next_free_recid()

        metadata_kwargs['json'] = payload

        instance.record_metadata = super(
            TestRecordMetadata, cls).create_from_kwargs(metadata_kwargs)
        metadata = instance.record_metadata

        if index_name:
            instance.es_index_result = es.index(
                index=index_name, body=metadata.json, params={})
            instance.es_refresh_result = es.indices.refresh(index_name)

        if not disable_persistent_identifier:
            pid_wrapper = TestPersistentIdentifier.create_from_kwargs(
                object_uuid=metadata.id,
                pid_value=metadata.json.get('control_number'),
                **kwargs)
            instance.persistent_identifier = pid_wrapper.persistent_identifier

        instance.inspire_record = InspireRecord(
            instance.record_metadata.json, model=RecordMetadata)

        return instance
Exemplo n.º 32
0
def test_count_phonetic_block_dispatched(small_app):
    """Check that the daemon calls ``send_task`` twice for two phonetic blocks.

    Three literature records get a signature block written to their first
    author ("GLASs" once, "HAGp" twice) before the daemon is run.
    """
    from inspirehep.modules.disambiguation.tasks import (
        disambiguation_daemon, )

    # The queue is expected to hold three records.
    assert DisambiguationRecord.query.count() == 3

    # (literature recid, phonetic block assigned to the first author).
    signatures = (
        (4328, "GLASs"),
        (1358492, "HAGp"),
        (11883, "HAGp"),
    )
    for recid, phonetic_block in signatures:
        record_uuid = str(
            PersistentIdentifier.get("literature", recid).object_uuid)
        es_record = get_es_record_by_uuid(record_uuid)

        # Add the phonetic block and write the record back to the index.
        es_record['authors'][0]['signature_block'] = phonetic_block
        es.index(index='records-hep',
                 doc_type='hep',
                 id=record_uuid,
                 body=es_record)
        es.indices.refresh('records-hep')

    with patch("celery.current_app.send_task") as send_to_clustering:
        disambiguation_daemon()

        assert send_to_clustering.call_count == 2
Exemplo n.º 33
0
    def create_from_kwargs(cls,
                           index_name="",
                           disable_persistent_identifier=False,
                           **kwargs):
        """Assemble a test record and its optional index entry and PID.

        :param index_name: index that receives the record JSON; indexing and
            the following refresh are skipped when the value is empty.
        :param disable_persistent_identifier: when true, the persistent
            identifier is not created.
        :param kwargs: ``id`` and ``json`` are consumed by this method;
            ``pid_type`` decides whether a random title is added; the rest is
            handed to ``TestPersistentIdentifier.create_from_kwargs``.
        :return: the instance of ``cls`` that was built.
        """
        instance = cls()

        # The deep copy is taken before popping so that the parent factory
        # still sees every original keyword.
        parent_kwargs = copy.deepcopy(kwargs)
        explicit_id = kwargs.pop("id", None)
        if not explicit_id:
            parent_kwargs["id"] = uuid.uuid4()

        document = copy.deepcopy(cls.JSON_SKELETON)
        extra_json = kwargs.pop("json", {})
        document.update(extra_json)

        if kwargs.get("pid_type", "lit") == "lit":
            if "titles" not in document:
                document["titles"] = [{"title": generate_random_string(60)}]

        if "control_number" in document:
            reserve_recid(document["control_number"])
        else:
            document["control_number"] = get_next_free_recid()

        parent_kwargs["json"] = document

        base = super(TestRecordMetadata, cls)
        instance.record_metadata = base.create_from_kwargs(parent_kwargs)

        if index_name:
            instance.es_index_result = es.index(
                index=index_name,
                body=instance.record_metadata.json,
                params={},
            )
            instance.es_refresh_result = es.indices.refresh(index_name)

        if not disable_persistent_identifier:
            control_number = instance.record_metadata.json.get(
                "control_number")
            created_pid = TestPersistentIdentifier.create_from_kwargs(
                object_uuid=instance.record_metadata.id,
                pid_value=control_number,
                **kwargs)
            instance.persistent_identifier = created_pid.persistent_identifier

        instance.inspire_record = InspireRecord(
            instance.record_metadata.json, model=RecordMetadata)

        return instance
Exemplo n.º 34
0
    def create_from_kwargs(cls, index=True, has_pid=True, **kwargs):
        """Create a test record, by default indexed and with a PID.

        :param index: when true, the record JSON is indexed into
            ``records-hep`` and that index is refreshed.
        :param has_pid: when true, a persistent identifier is created from
            the record id and its control number.
        :param kwargs: ``id`` and ``json`` are popped here; a deep copy taken
            beforehand is what the parent factory receives.
        :return: the populated instance of ``cls``.
        """
        instance = cls()

        # Copy before popping so the parent factory gets the original keys.
        factory_kwargs = copy.deepcopy(kwargs)
        provided_id = kwargs.pop('id', None)
        if not provided_id:
            factory_kwargs['id'] = uuid.uuid4()

        record_json = copy.deepcopy(cls.JSON_SKELETON)
        record_json.update(kwargs.pop('json', {}))

        if 'titles' not in record_json:
            record_json['titles'] = [{'title': generate_random_string(60)}]
        if 'control_number' not in record_json:
            # Random multiple of 5 between 5 and 45.
            record_json['control_number'] = random.randint(1, 9) * 5

        factory_kwargs['json'] = record_json

        parent_factory = super(TestRecordMetadata, cls)
        instance.record_metadata = parent_factory.create_from_kwargs(
            factory_kwargs)

        if index:
            instance.es_index_result = es.index(
                index='records-hep',
                doc_type='hep',
                body=instance.record_metadata.json,
                params={},
            )
            instance.es_refresh_result = es.indices.refresh('records-hep')

        if has_pid:
            pid_factory = TestPersistentIdentifier.create_from_kwargs(
                object_uuid=instance.record_metadata.id,
                pid_value=instance.record_metadata.json.get('control_number'),
            )
            instance.persistent_identifier = pid_factory.persistent_identifier
        return instance
Exemplo n.º 35
0
 def update(self):
     """Update any internal representation / index for the acl.

     Indexes a document holding this ACL's record selector and type, under
     ``self.id``, into the ACL index of every schema listed in
     ``self.schemas``. Each ACL index is flushed afterwards, whether or not
     the indexing call succeeded.

     :raises AssertionError: if the search client reports a result other
         than ``'created'`` or ``'updated'``.
     """
     body = {
         '__acl_record_selector': self.record_selector,
         '__acl_record_type': self.type
     }
     # isEnabledFor() already returns a bool. The previous comparison
     # ``isEnabledFor(...) <= logging.DEBUG`` was always true, so the body
     # was serialized even when debug logging was disabled.
     if logger.isEnabledFor(logging.DEBUG):
         logger.debug('get_material_acls: query %s', json.dumps(body, indent=4, ensure_ascii=False))
     schema_indices = [schema_to_index(x)[0] for x in self.schemas]
     acl_index_names = [self.get_acl_index_name(x) for x in schema_indices]
     for acl_idx_name in acl_index_names:
         try:
             resp = current_search_client.index(
                 index=acl_idx_name,
                 **add_doc_type(current_app.config['INVENIO_EXPLICIT_ACLS_DOCTYPE_NAME']),
                 id=self.id,
                 body=body,
                 refresh='wait_for'
             )
             assert resp['result'] in ('created', 'updated')
         finally:
             # Flush even on failure so the index is left in a persisted state.
             current_search_client.indices.flush(index=acl_idx_name)
Exemplo n.º 36
0
def test_count_phonetic_block_dispatched(small_app):
    """Verify that two ``send_task`` calls are made for two phonetic blocks.

    The first author of three literature records is tagged with a signature
    block ("GLASs", "HAGp", "HAGp") before running the daemon.
    """
    from inspirehep.modules.disambiguation.tasks import (
        disambiguation_daemon,
    )

    def _tag_first_author(recid, phonetic_block):
        """Write ``phonetic_block`` to the first author and reindex the record."""
        record_uuid = str(PersistentIdentifier.get(
            "literature", recid).object_uuid)
        es_record = get_es_record_by_uuid(record_uuid)
        es_record['authors'][0]['signature_block'] = phonetic_block
        es.index(index='records-hep', doc_type='hep',
                 id=record_uuid, body=es_record)
        es.indices.refresh('records-hep')

    # Check if the queue has three records.
    assert DisambiguationRecord.query.count() == 3

    # Signature #1.
    _tag_first_author(4328, "GLASs")

    # Signature #2.
    _tag_first_author(1358492, "HAGp")

    # Signature #3.
    _tag_first_author(11883, "HAGp")

    with patch("celery.current_app.send_task") as send_to_clustering:
        disambiguation_daemon()

        assert send_to_clustering.call_count == 2
Exemplo n.º 37
0
def index_documents(docs):
    """Index each document of ``docs`` into the ``relationships`` index.

    :param docs: iterable of document bodies; each one is sent to the search
        client with document type ``doc``.
    """
    destination = {'index': 'relationships', 'doc_type': 'doc'}
    for document in docs:
        current_search_client.index(body=document, **destination)
Exemplo n.º 38
0
def test_solve_claim_conflicts(small_app):
    """Check the case where two claimed signatures share a cluster.

    Two claimed signatures (recids "3" and "4") and one unclaimed signature
    are put in the same "HAGp" block. With the clustering and conflict
    solver patched, the unclaimed signature must end up with recid "4".
    """
    from inspirehep.modules.disambiguation.tasks import (
        disambiguation_clustering, update_authors_recid)

    def _prepare_signature(recid, author_fields):
        """Update the first author of a record in ES.

        Returns the record uuid and the uuid of its first author.
        """
        record_uuid = str(PersistentIdentifier.get('lit', recid).object_uuid)
        es_record = get_es_record_by_uuid(record_uuid)
        first_author = es_record['authors'][0]
        signature_uuid = first_author['uuid']

        first_author.update(author_fields)
        es.index(index='records-hep',
                 doc_type='hep',
                 id=record_uuid,
                 body=es_record)
        es.indices.refresh('records-hep')
        return record_uuid, signature_uuid

    # Claimed signature #1.
    _, glashow_signature_claimed = _prepare_signature(4328, {
        'signature_block': "HAGp",
        'curated_relation': True,
        'recid': "3",
    })

    # Claimed signature #2.
    _, higgs_signature_claimed = _prepare_signature(1358492, {
        'signature_block': "HAGp",
        'curated_relation': True,
        'recid': "4",
    })

    # Not claimed signature.
    higgs_record_id_not_claimed, higgs_signature_not_claimed = \
        _prepare_signature(11883, {'signature_block': "HAGp"})

    clusters = {
        "3": [
            glashow_signature_claimed,
            higgs_signature_claimed,
            higgs_signature_not_claimed,
        ]
    }
    with patch("celery.current_app.send_task",
               return_value=_BeardObject((clusters, {}))):
        resolution = {higgs_signature_claimed: [higgs_signature_not_claimed]}
        with patch(
                "inspirehep.modules.disambiguation.logic._solve_claims_conflict",
                return_value=_ConflictObject(resolution)):
            # Run the recid update synchronously instead of via ``delay``.
            with patch(
                    "inspirehep.modules.disambiguation.tasks.update_authors_recid.delay",
                    side_effect=update_authors_recid):
                disambiguation_clustering("HAGp")

    updated_record = InspireRecord.get_record(higgs_record_id_not_claimed)
    assert updated_record['authors'][0]['recid'] == "4"