예제 #1
0
def test_create_author_method(small_app):
    """Create an author profile from a signature and inspect the record.

    The record is looked up through the 'aut' persistent identifier that
    matches the recid returned by ``create_author``.
    """
    author_signature = {
        'uuid': '6a3d43be-e962-4c20-8908-a81bd39447b5',
        'full_name': 'Glashow, S.L.',
        'affiliations': [{'value': 'Copenhagen U.'}],
        'curated_relation': False,
    }
    expected_name = {'value': 'Glashow, S.L.'}
    expected_positions = [{'institution': {'name': 'Copenhagen U.'}}]

    # Resolve the freshly created recid to the stored record.
    object_uuid = PersistentIdentifier.get(
        'aut', create_author(author_signature)).object_uuid
    profile = InspireRecord.get_record(object_uuid)

    assert profile['_collections'] == ['Authors']
    assert profile['name'] == expected_name
    assert profile['positions'] == expected_positions
예제 #2
0
def test_create_author_method(small_app):
    """Verify the record produced by ``create_author`` for one signature.

    The returned recid is resolved through the "authors" persistent
    identifier and the stored record is compared field by field.
    """
    full_name = "Glashow, S.L."
    affiliation = "Copenhagen U."

    new_recid = create_author(
        {
            "affiliations": [{"value": affiliation}],
            "curated_relation": False,
            "full_name": full_name,
            "uuid": "6a3d43be-e962-4c20-8908-a81bd39447b5",
        }
    )

    # Fetch the record behind the persistent identifier of the new profile.
    identifier = PersistentIdentifier.get("authors", new_recid)
    stored = Record.get_record(identifier.object_uuid)

    assert stored["collections"] == [{"primary": "HEPNAMES"}]
    assert stored["name"] == {"value": full_name}
    assert stored["positions"] == [{"institution": {"name": affiliation}}]
예제 #3
0
def test_create_author_method(small_app):
    """Exercise ``create_author`` and check the profile record it stores.

    A single signature is turned into a profile; the record is then read
    back through its "authors" persistent identifier.
    """
    profile_source = {
        'full_name': 'Glashow, S.L.',
        'affiliations': [{'value': 'Copenhagen U.'}],
        'uuid': '6a3d43be-e962-4c20-8908-a81bd39447b5',
        'curated_relation': False,
    }

    created_recid = create_author(profile_source)
    author_pid = PersistentIdentifier.get("authors", created_recid)
    author = Record.get_record(author_pid.object_uuid)

    # Expected content of the stored profile.
    wanted_collections = [{'primary': 'HEPNAMES'}]
    wanted_name = {'value': 'Glashow, S.L.'}
    wanted_positions = [{'institution': {'name': 'Copenhagen U.'}}]

    assert author['collections'] == wanted_collections
    assert author['name'] == wanted_name
    assert author['positions'] == wanted_positions
예제 #4
0
def process_clusters(uuids, signatures, recid_key=None):
    """Assign every signature of a cluster to a single author profile.

    ``uuids`` identifies the signatures which were clustered together as
    one author, ``signatures`` carries the data of the signatures collected
    during clustering and ``recid_key`` is given when the cluster was
    matched with an already existing cluster of signatures (signatures
    pointing to the same author profile).

    Exactly one of the following workflows is run:

    * No claimed signature and no ``recid_key``: the cluster is handled
      as a new author.  A profile is created with ``create_author`` from
      the signature chosen by ``_select_profile_base`` and every signature
      of the cluster is updated to point to the new profile.

    * No claimed signature, but ``recid_key`` is given: every signature of
      the cluster is updated to point to ``recid_key``.

    * The claimed signatures all point to one profile (ground truth):
      every signature of the cluster is updated to point to that profile.

    * The claimed signatures point to two or more different profiles:
      the signatures are split into claimed and not claimed ones and
      handed to ``_solve_claims_conflict``.  Its result is read as a
      mapping from a claimed signature UUID to the UUIDs allocated to it;
      those signatures receive the ``author_recid`` of the claimed one.

    A signature counts as a claim for choosing the workflow only when
    ``_check_if_claimed`` reports it as claimed and its profile id consists
    of digits.

    Updates are dispatched one per signature through
    ``update_authors_recid.delay``.  Nothing is returned.

    :param uuids:
        A list of signature UUIDs, representing the same author.

        Example:
            uuids = ['a4156520-560d-4248-a57f-949c361e0dd0']

    :param signatures:
        A list of signatures collected during clustering.

        Example:
            signatures =
                [{'author_affiliation': u'MIT, Cambridge, CTP',
                  'author_name': u'Wang, Yi-Nan',
                  'publication_id': u'9d3dca5d-3551-4ca4-9b52-63db656e4793',
                  'signature_id': u'a4156520-560d-4248-a57f-949c361e0dd0',
                  'author_recid': u'10123',
                  'author_claimed': False}]

    :param recid_key:
        A record id representing the profile which the 'old' cluster is
        associated with.

        Example:
            recid_key = '10123'
    """
    from inspirehep.modules.disambiguation.tasks import update_authors_recid

    # Create a map where each signature can be accessed by its uuid.
    signatures_map = _create_uuid_key_dictionary(signatures)

    def _assign_profile(signature_uuids, profile_recid):
        """Dispatch one update per signature, pointing it to the profile."""
        for signature_uuid in signature_uuids:
            publication_id = signatures_map[signature_uuid].get(
                'publication_id')
            update_authors_recid.delay(
                publication_id, signature_uuid, profile_recid)

    # Collect claims. A set keeps each distinct claim status only once.
    claims = set()

    for uuid in uuids:
        claim_status = _check_if_claimed(signatures_map[uuid])

        # Keep it only if it is claimed and the profile id is made of digits.
        if claim_status[0] and claim_status[1].isdigit():
            claims.add(claim_status)

    # No claimed signatures and no match with an 'old' cluster.
    if not claims and not recid_key:
        # Select the most rich-in-information signature.
        base_profile = get_signature(
            _select_profile_base(signatures_map, uuids))

        # Create a new profile and point all signatures to it.
        recid = create_author(base_profile)
        _assign_profile(uuids, recid)

        logger.info("A new profile created: %s", recid)

    # No claimed signatures, but there is a match (recid_key is set,
    # otherwise the branch above would have been taken).
    elif not claims:
        _assign_profile(uuids, recid_key)

    # Exactly one distinct claim.
    elif len(claims) == 1:
        # Entries look like (True, u'1234'); index 1 is the profile recid.
        recid = claims.pop()[1]
        _assign_profile(uuids, recid)

    # Two or more distinct claims, i.e. claims for different profiles.
    else:
        claimed_signatures = []
        not_claimed_signatures = []

        # Split the cluster. Only the claimed flag is checked here, the
        # profile id is not required to be made of digits.
        for uuid in uuids:
            if _check_if_claimed(signatures_map[uuid])[0]:
                claimed_signatures.append(uuid)
            else:
                not_claimed_signatures.append(uuid)

        # Dispatch a resolving conflict job.
        # NOTE(review): an AttributeError (presumably raised when
        # _solve_claims_conflict returns an object without ``get``) leaves
        # the cluster untouched -- confirm this best-effort is intended.
        try:
            matched_signatures = _solve_claims_conflict(
                signatures_map,
                not_claimed_signatures,
                claimed_signatures).get()
        except AttributeError:
            matched_signatures = {}

        # For each claimed signature, assign its recid to the matched ones.
        # A dedicated name is used so the ``uuids`` argument is not rebound.
        for claimed_uuid, matched_uuids in matched_signatures.items():
            recid = signatures_map[claimed_uuid].get('author_recid')
            _assign_profile(matched_uuids, recid)
예제 #5
0
def process_clusters(uuids, signatures, recid_key=None):
    """Assign every signature of a cluster to a single author profile.

    ``uuids`` identifies the signatures which were clustered together as
    one author, ``signatures`` carries the data of the signatures collected
    during clustering and ``recid_key`` is given when the cluster was
    matched with an already existing cluster of signatures (signatures
    pointing to the same author profile).

    Exactly one of the following workflows is run:

    * No claimed signature and no ``recid_key``: the cluster is handled
      as a new author.  A profile is created with ``create_author`` from
      the signature chosen by ``_select_profile_base`` and every signature
      of the cluster is updated to point to the new profile.

    * No claimed signature, but ``recid_key`` is given: every signature of
      the cluster is updated to point to ``recid_key``.

    * The claimed signatures all point to one profile (ground truth):
      every signature of the cluster is updated to point to that profile.

    * The claimed signatures point to two or more different profiles:
      the signatures are split into claimed and not claimed ones and
      handed to ``_solve_claims_conflict``.  Its result is read as a
      mapping from a claimed signature UUID to the UUIDs allocated to it;
      those signatures receive the ``author_recid`` of the claimed one.

    A signature counts as a claim for choosing the workflow only when
    ``_check_if_claimed`` reports it as claimed and its profile id consists
    of digits.

    Updates are dispatched one per signature through
    ``update_authors_recid.delay``.  Nothing is returned.

    :param uuids:
        A list of signature UUIDs, representing the same author.

        Example:
            uuids = ['a4156520-560d-4248-a57f-949c361e0dd0']

    :param signatures:
        A list of signatures collected during clustering.

        Example:
            signatures =
                [{'author_affiliation': u'MIT, Cambridge, CTP',
                  'author_name': u'Wang, Yi-Nan',
                  'publication_id': u'9d3dca5d-3551-4ca4-9b52-63db656e4793',
                  'signature_id': u'a4156520-560d-4248-a57f-949c361e0dd0',
                  'author_recid': u'10123',
                  'author_claimed': False}]

    :param recid_key:
        A record id representing the profile which the 'old' cluster is
        associated with.

        Example:
            recid_key = '10123'
    """
    from inspirehep.modules.disambiguation.tasks import update_authors_recid

    # Create a map where each signature can be accessed by its uuid.
    signatures_map = _create_uuid_key_dictionary(signatures)

    def _assign_profile(signature_uuids, profile_recid):
        """Dispatch one update per signature, pointing it to the profile."""
        for signature_uuid in signature_uuids:
            publication_id = signatures_map[signature_uuid].get(
                'publication_id')
            update_authors_recid.delay(
                publication_id, signature_uuid, profile_recid)

    # Collect claims. A set keeps each distinct claim status only once.
    claims = set()

    for uuid in uuids:
        claim_status = _check_if_claimed(signatures_map[uuid])

        # Keep it only if it is claimed and the profile id is made of digits.
        if claim_status[0] and claim_status[1].isdigit():
            claims.add(claim_status)

    # No claimed signatures and no match with an 'old' cluster.
    if not claims and not recid_key:
        # Select the most rich-in-information signature.
        base_profile = get_signature(
            _select_profile_base(signatures_map, uuids))

        # Create a new profile and point all signatures to it.
        recid = create_author(base_profile)
        _assign_profile(uuids, recid)

        logger.info("A new profile created: %s", recid)

    # No claimed signatures, but there is a match (recid_key is set,
    # otherwise the branch above would have been taken).
    elif not claims:
        _assign_profile(uuids, recid_key)

    # Exactly one distinct claim.
    elif len(claims) == 1:
        # Entries look like (True, u'1234'); index 1 is the profile recid.
        recid = claims.pop()[1]
        _assign_profile(uuids, recid)

    # Two or more distinct claims, i.e. claims for different profiles.
    else:
        claimed_signatures = []
        not_claimed_signatures = []

        # Split the cluster. Only the claimed flag is checked here, the
        # profile id is not required to be made of digits.
        for uuid in uuids:
            if _check_if_claimed(signatures_map[uuid])[0]:
                claimed_signatures.append(uuid)
            else:
                not_claimed_signatures.append(uuid)

        # Dispatch a resolving conflict job.
        # NOTE(review): an AttributeError (presumably raised when
        # _solve_claims_conflict returns an object without ``get``) leaves
        # the cluster untouched -- confirm this best-effort is intended.
        try:
            matched_signatures = _solve_claims_conflict(
                signatures_map, not_claimed_signatures,
                claimed_signatures).get()
        except AttributeError:
            matched_signatures = {}

        # For each claimed signature, assign its recid to the matched ones.
        # A dedicated name is used so the ``uuids`` argument is not rebound.
        for claimed_uuid, matched_uuids in matched_signatures.items():
            recid = signatures_map[claimed_uuid].get('author_recid')
            _assign_profile(matched_uuids, recid)