Пример #1
0
class OrcidPutcodeGetter(object):
    """Fetch, for one ORCID, the putcodes of the works that point to Inspire."""

    def __init__(self, orcid, oauth_token):
        """
        Store the credentials, build the ``OrcidClient`` and read the
        consumer key from the application config.

        Args:
            orcid: ORCID identifier handed to ``OrcidClient``.
            oauth_token: token handed to ``OrcidClient``.
        """
        self.orcid = orcid
        self.oauth_token = oauth_token
        self.client = OrcidClient(self.oauth_token, self.orcid)
        app_credentials = current_app.config['ORCID_APP_CREDENTIALS']
        self.source_client_id_path = app_credentials['consumer_key']

    def get_all_inspire_putcodes(self):
        """
        Yield a ``(putcode, url)`` pair for each work of the given ORCID
        whose url matches ``INSPIRE_WORK_URL_REGEX``.
        Nothing is yielded when the works summary has no putcodes.
        """
        source_putcodes = self._get_all_putcodes()
        if source_putcodes:
            pairs = self._get_urls_for_putcodes(source_putcodes)
            for putcode, url in pairs:
                # Works whose url is not an Inspire one are dropped.
                if not INSPIRE_WORK_URL_REGEX.match(url):
                    continue
                yield putcode, url

    def _get_all_putcodes(self):
        """
        Return the list of putcodes that the works summary reports for
        ``self.source_client_id_path``.

        Raises:
            exceptions.InputDataInvalidException: when the summary response
                raises a ``BaseOrcidClientJsonException``.
        """
        summary = self.client.get_all_works_summary()
        utils.log_service_response(
            logger, summary, 'in OrcidPutcodeGetter works summary')
        try:
            summary.raise_for_result()
        except orcid_client_exceptions.BaseOrcidClientJsonException as exc:
            raise exceptions.InputDataInvalidException(from_exc=exc)
        found = summary.get_putcodes_for_source(self.source_client_id_path)
        return list(found)

    def _get_urls_for_putcodes(self, putcodes):
        """
        Return an iterable of the putcode/url items of all the works details
        responses, in response order.

        Every response is requested and validated before this method
        returns; only the items are consumed lazily.

        Raises:
            exceptions.InputDataInvalidException: when a details response
                raises a ``BaseOrcidClientJsonException``.
        """
        # The call get_bulk_works_details_iter() can be very expensive for an
        # author with many works (if each work also has many *contributors*).
        # E.g. for an ATLAS author with ~750 works, 8 calls would be performed
        # with a total data transfer > 0.5 Gb.
        all_pairs = []
        for details in self.client.get_bulk_works_details_iter(putcodes):
            utils.log_service_response(
                logger, details, 'in OrcidPutcodeGetter works details')
            try:
                details.raise_for_result()
            except orcid_client_exceptions.BaseOrcidClientJsonException as exc:
                raise exceptions.InputDataInvalidException(from_exc=exc)

            # Append this response's items after the ones collected so far.
            all_pairs = itertools.chain(
                all_pairs, details.get_putcodes_and_urls())
        return all_pairs
Пример #2
0
def get_putcode_for_work(orcid, token, recid):
    """
    Return the putcode of the first ORCID work whose url ends with
    ``/<recid>``, or ``None`` when no such work is found.

    Only the putcodes returned by ``get_putcodes_for_source_iter()`` for the
    configured ``consumer_key`` are looked up.

    Args:
        orcid: ORCID identifier handed to ``OrcidClient``.
        token: token handed to ``OrcidClient``.
        recid: record id searched at the end of each work url.
    """
    orcid_client = OrcidClient(token, orcid)
    summary = orcid_client.get_all_works_summary()
    summary.raise_for_result()
    consumer_key = config.get('orcid-api', 'consumer_key')
    own_putcodes = list(summary.get_putcodes_for_source_iter(consumer_key))

    if own_putcodes:
        url_suffix = '/{}'.format(recid)
        # TODO: this has to be simplified when we push recids as external
        # identifier (thus just the get_all_works_summary() call is required
        # to match recids with putcodes).
        for details in orcid_client.get_bulk_works_details_iter(own_putcodes):
            details.raise_for_result()
            for putcode, url in details.get_putcodes_and_urls_iter():
                if url.endswith(url_suffix):
                    return putcode
    return None
Пример #3
0
class OrcidPutcodeGetter(object):
    """
    Query the ORCID api (through ``OrcidClient``) to find the putcodes of the
    works of one ORCID together with the matching Inspire recids.
    """

    def __init__(self, orcid, oauth_token):
        """
        Args:
            orcid: ORCID identifier handed to ``OrcidClient``.
            oauth_token: token handed to ``OrcidClient``. It is deleted
                through ``push_access_tokens`` when the works summary call
                reports it as invalid.
        """
        self.orcid = orcid
        self.oauth_token = oauth_token
        self.client = OrcidClient(self.oauth_token, self.orcid)
        self.source_client_id_path = current_app.config[
            "ORCID_APP_CREDENTIALS"]["consumer_key"]

    def get_all_inspire_putcodes_and_recids_iter(self):
        """
        Query ORCID api and get all the Inspire putcodes for the given ORCID.

        Yields:
            ``(putcode, recid)`` pairs: first the ones whose recid is already
            present in the works summary, then the ones whose recid had to be
            parsed from the work url (extra works details requests).
        """
        summary_response = self._get_all_works_summary()
        # `putcodes_recids` is a list like: [('43326850', 20), ('43255490', None)]
        putcodes_recids = list(
            summary_response.get_putcodes_and_recids_for_source_iter(
                self.source_client_id_path))
        putcodes_with_recids = [x for x in putcodes_recids if x[1]]
        putcodes_without_recids = [x[0] for x in putcodes_recids if not x[1]]

        for putcode, recid in putcodes_with_recids:
            yield putcode, recid

        # Avoid the works details requests when every recid is already known.
        if not putcodes_without_recids:
            return

        for putcode, recid in self._get_putcodes_and_recids_iter(
                putcodes_without_recids):
            yield putcode, recid

    def _get_all_works_summary(self):
        """
        Query ORCID api and get all the putcodes with their embedded recids
        for the given ORCID.
        An embedded recid is a recid written as external-identifier.

        Returns:
            the works summary response, already validated.

        Raises:
            exceptions.TokenInvalidDeletedException: when the response raises
                one of the token exceptions; the token is deleted first.
            exceptions.InputDataInvalidException: for any other
                ``BaseOrcidClientJsonException``.
        """
        response = self.client.get_all_works_summary()
        LOGGER.info("Get ORCID work summary",
                    response=response,
                    orcid=self.orcid)
        try:
            response.raise_for_result()
        except (
                orcid_client_exceptions.TokenInvalidException,
                orcid_client_exceptions.TokenMismatchException,
                orcid_client_exceptions.TokenWithWrongPermissionException,
        ):
            # NOTE(review): the OAuth token is written to the log here;
            # confirm that this is acceptable for the log destination.
            LOGGER.info(
                "OrcidPutcodeGetter: deleting Orcid push access",
                token=self.oauth_token,
                orcid=self.orcid,
            )
            push_access_tokens.delete_access_token(self.oauth_token,
                                                   self.orcid)
            raise exceptions.TokenInvalidDeletedException
        except orcid_client_exceptions.BaseOrcidClientJsonException as exc:
            raise exceptions.InputDataInvalidException(from_exc=exc)
        return response

    def _get_putcodes_and_recids_iter(self, putcodes):
        """
        Yield ``(putcode, recid)`` for each of the given putcodes whose work
        url matches ``INSPIRE_WORK_URL_REGEX``. Works whose recid cannot be
        parsed from the url are logged as errors and skipped.
        """
        for putcode, url in self._get_urls_for_putcodes_iter(putcodes):
            # Filter out putcodes that do not belong to Inspire.
            if not INSPIRE_WORK_URL_REGEX.match(url):
                continue
            recid = PidStoreBase.get_pid_from_record_uri(url)[1]
            if not recid:
                LOGGER.error(
                    "OrcidPutcodeGetter: cannot parse recid from url",
                    url=url,
                    orcid=self.orcid,
                )
                continue
            yield putcode, recid

    def _get_urls_for_putcodes_iter(self, putcodes):
        """
        Return an iterator over the putcode/url items of all the works
        details responses, in response order.

        Every response is requested and validated before this method
        returns; only the items are consumed lazily.

        Raises:
            exceptions.InputDataInvalidException: when a details response
                raises a ``BaseOrcidClientJsonException``.
        """
        # The call `get_bulk_works_details_iter()` can be expensive for an
        # author with many works (if each work also has many *contributors*).
        # E.g. for an ATLAS author with ~750 works (each of them with many
        # authors), 8 calls would be performed with a total data transfer > 0.5 Gb.
        per_response = []
        for response in self.client.get_bulk_works_details_iter(putcodes):
            # Note: this log can be large. Consider removing it when this part
            # is considered mature.
            LOGGER.info("ORCID work details",
                        response=response,
                        orcid=self.orcid)
            try:
                response.raise_for_result()
            except orcid_client_exceptions.BaseOrcidClientJsonException as exc:
                raise exceptions.InputDataInvalidException(from_exc=exc)

            per_response.append(response.get_putcodes_and_urls_iter())
        # One flat chain instead of one nested chain object per response.
        return itertools.chain.from_iterable(per_response)

    def get_putcodes_and_recids_by_identifiers_iter(self, identifiers):
        """
        Yield putcode and recid for each work matched by the external
        identifiers.
        Note: external identifiers of type 'other-id' are skipped.

        The recid of a work is resolved only when one of its identifiers
        matches, and at most once per work: resolving it can require an
        extra works details request, which is wasted on works that are
        never yielded.

        Args:
            identifiers (List[inspirehep.orcid.converter.ExternalIdentifier]):
                list of all external identifiers added after the xml conversion.

        Yields:
            ``(putcode, recid)``, once per matching identifier of a work.
        """
        summary_response = self._get_all_works_summary()
        for (
                putcode,
                ids,
        ) in summary_response.get_putcodes_and_external_identifiers_iter():
            # ids is a list like:
            #   [
            #       {'external-id-relationship': 'SELF',
            #        'external-id-type': 'other-id',
            #        'external-id-url': {'value': 'http://inspireheptest.cern.ch/record/20'},
            #        'external-id-value': '20'
            #       },...
            #   ]
            recid = None
            recid_resolved = False

            for identifier in ids:
                id_type = identifier.get("external-id-type")
                # We are interested only in doi, arxiv, isbns.
                if not id_type or id_type.lower() == "other-id":
                    continue
                id_value = identifier.get("external-id-value")
                if not id_value:
                    continue

                if ExternalIdentifier(id_type, id_value) not in identifiers:
                    continue

                # Get the recid (lazily: only for works that match).
                if not recid_resolved:
                    recid = self._get_recid_for_work(ids, str(putcode))
                    recid_resolved = True
                yield putcode, recid

    def _get_recid_for_work(self, external_identifiers, putcode):
        """
        Get the recid for a work given its external identifiers and putcode.
        The recid might be in the external identifiers, otherwise the works
        details are requested to find it.

        Args:
            external_identifiers (List[Dict]): a list like:
               [
                   {'external-id-relationship': 'SELF',
                    'external-id-type': 'other-id',
                    'external-id-url': {'value': 'http://inspireheptest.cern.ch/record/20'},
                    'external-id-value': '20'
                   },...
               ]
            putcode: putcode of the given work.

        Returns: the Inspire recid matching the work (the
            'external-id-value' when taken from the external identifiers),
            or None when it cannot be found.
        """
        for identifier in external_identifiers:
            id_type = identifier.get("external-id-type")
            # The embedded recid is looked for in 'other-id' identifiers only.
            if not id_type or id_type.lower() != "other-id":
                continue

            id_url = inspire_service_orcid_utils.smartget(
                identifier, "external-id-url.value", "")
            if not re.match(r".*inspire.*", id_url, re.I):
                continue

            id_value = identifier.get("external-id-value")
            if not id_value:
                continue

            # recid found.
            return id_value

        # The recid was not found in the external_identifiers.
        # Thus we call get_bulk_works_details_iter().
        putcodes_recid = list(self._get_putcodes_and_recids_iter([putcode]))

        if putcodes_recid:
            return putcodes_recid[0][1]
        return None
Пример #4
0
class OrcidPutcodeGetter(object):
    """
    Query the ORCID api (through ``OrcidClient``) to find the putcodes of the
    works of one ORCID together with the matching Inspire recids.
    """

    def __init__(self, orcid, oauth_token):
        """
        Args:
            orcid: ORCID identifier handed to ``OrcidClient``.
            oauth_token: token handed to ``OrcidClient``. It is deleted
                through ``push_access_tokens`` when the works summary call
                reports it as invalid.
        """
        self.orcid = orcid
        self.oauth_token = oauth_token
        self.client = OrcidClient(self.oauth_token, self.orcid)
        self.source_client_id_path = current_app.config['ORCID_APP_CREDENTIALS'][
            'consumer_key']

    def get_all_inspire_putcodes_and_recids_iter(self):
        """
        Query ORCID api and get all the Inspire putcodes for the given ORCID.

        Yields:
            ``(putcode, recid)`` pairs: first the ones whose recid is already
            present in the works summary, then the ones whose recid had to be
            parsed from the work url (extra works details requests).
        """
        summary_response = self._get_all_works_summary()
        # `putcodes_recids` is a list like: [('43326850', 20), ('43255490', None)]
        putcodes_recids = list(summary_response.get_putcodes_and_recids_for_source_iter(
            self.source_client_id_path))
        putcodes_with_recids = [x for x in putcodes_recids if x[1]]
        putcodes_without_recids = [x[0] for x in putcodes_recids if not x[1]]

        for putcode, recid in putcodes_with_recids:
            yield putcode, recid

        # Avoid the works details requests when every recid is already known.
        if not putcodes_without_recids:
            return

        for putcode, recid in self._get_putcodes_and_recids_iter(putcodes_without_recids):
            yield putcode, recid

    def _get_all_works_summary(self):
        """
        Query ORCID api and get all the putcodes with their embedded recids
        for the given ORCID.
        An embedded recid is a recid written as external-identifier.

        Returns:
            the works summary response, already validated.

        Raises:
            exceptions.TokenInvalidDeletedException: when the response raises
                one of the token exceptions; the token is deleted first.
            exceptions.InputDataInvalidException: for any other
                ``BaseOrcidClientJsonException``.
        """
        response = self.client.get_all_works_summary()
        utils.log_service_response(logger, response, 'in OrcidPutcodeGetter works summary')
        try:
            response.raise_for_result()
        except (orcid_client_exceptions.TokenInvalidException,
                orcid_client_exceptions.TokenMismatchException,
                orcid_client_exceptions.TokenWithWrongPermissionException):
            # NOTE(review): the OAuth token is written to the log here;
            # confirm that this is acceptable for the log destination.
            logger.info('OrcidPutcodeGetter: deleting Orcid push access token={} for orcid={}'.format(
                self.oauth_token, self.orcid))
            push_access_tokens.delete_access_token(self.oauth_token, self.orcid)
            raise exceptions.TokenInvalidDeletedException
        except orcid_client_exceptions.BaseOrcidClientJsonException as exc:
            raise exceptions.InputDataInvalidException(from_exc=exc)
        return response

    def _get_putcodes_and_recids_iter(self, putcodes):
        """
        Yield ``(putcode, recid)`` for each of the given putcodes whose work
        url matches ``INSPIRE_WORK_URL_REGEX``. Works whose recid cannot be
        parsed from the url are logged as errors and skipped.
        """
        for putcode, url in self._get_urls_for_putcodes_iter(putcodes):
            # Filter out putcodes that do not belong to Inspire.
            if not INSPIRE_WORK_URL_REGEX.match(url):
                continue
            recid = get_pid_from_record_uri(url)[1]
            if not recid:
                logger.error('OrcidPutcodeGetter: cannot parse recid from url={} for orcid={}'.format(
                    url, self.orcid))
                continue
            yield putcode, recid

    def _get_urls_for_putcodes_iter(self, putcodes):
        """
        Return an iterator over the putcode/url items of all the works
        details responses, in response order.

        Every response is requested and validated before this method
        returns; only the items are consumed lazily.

        Raises:
            exceptions.InputDataInvalidException: when a details response
                raises a ``BaseOrcidClientJsonException``.
        """
        # The call `get_bulk_works_details_iter()` can be expensive for an
        # author with many works (if each work also has many *contributors*).
        # E.g. for an ATLAS author with ~750 works (each of them with many
        # authors), 8 calls would be performed with a total data transfer > 0.5 Gb.
        per_response = []
        for response in self.client.get_bulk_works_details_iter(putcodes):
            # Note: this log can be large. Consider removing it when this part
            # is considered mature.
            utils.log_service_response(logger, response, 'in OrcidPutcodeGetter works details')
            try:
                response.raise_for_result()
            except orcid_client_exceptions.BaseOrcidClientJsonException as exc:
                raise exceptions.InputDataInvalidException(from_exc=exc)

            per_response.append(response.get_putcodes_and_urls_iter())
        # One flat chain instead of one nested chain object per response.
        return itertools.chain.from_iterable(per_response)

    def get_putcodes_and_recids_by_identifiers_iter(self, identifiers):
        """
        Yield putcode and recid for each work matched by the external
        identifiers.
        Note: external identifiers of type 'other-id' are skipped.

        The recid of a work is resolved only when one of its identifiers
        matches, and at most once per work: resolving it can require an
        extra works details request, which is wasted on works that are
        never yielded.

        Args:
            identifiers (List[inspirehep.modules.orcid.converter.ExternalIdentifier]):
                list of all external identifiers added after the xml conversion.

        Yields:
            ``(putcode, recid)``, once per matching identifier of a work.
        """
        summary_response = self._get_all_works_summary()
        for putcode, ids in summary_response.get_putcodes_and_external_identifiers_iter():
            # ids is a list like:
            #   [
            #       {'external-id-relationship': 'SELF',
            #        'external-id-type': 'other-id',
            #        'external-id-url': {'value': 'http://inspireheptest.cern.ch/record/20'},
            #        'external-id-value': '20'
            #       },...
            #   ]
            recid = None
            recid_resolved = False

            for identifier in ids:
                id_type = identifier.get('external-id-type')
                # We are interested only in doi, arxiv, isbns.
                if not id_type or id_type.lower() == 'other-id':
                    continue
                id_value = identifier.get('external-id-value')
                if not id_value:
                    continue

                if ExternalIdentifier(id_type, id_value) not in identifiers:
                    continue

                # Get the recid (lazily: only for works that match).
                if not recid_resolved:
                    recid = self._get_recid_for_work(ids, str(putcode))
                    recid_resolved = True
                yield putcode, recid

    def _get_recid_for_work(self, external_identifiers, putcode):
        """
        Get the recid for a work given its external identifiers and putcode.
        The recid might be in the external identifiers, otherwise the works
        details are requested to find it.

        Args:
            external_identifiers (List[Dict]): a list like:
               [
                   {'external-id-relationship': 'SELF',
                    'external-id-type': 'other-id',
                    'external-id-url': {'value': 'http://inspireheptest.cern.ch/record/20'},
                    'external-id-value': '20'
                   },...
               ]
            putcode: putcode of the given work.

        Returns: the Inspire recid matching the work (the
            'external-id-value' when taken from the external identifiers),
            or None when it cannot be found.
        """
        for identifier in external_identifiers:
            id_type = identifier.get('external-id-type')
            # The embedded recid is looked for in 'other-id' identifiers only.
            if not id_type or id_type.lower() != 'other-id':
                continue

            id_url = inspire_service_orcid_utils.smartget(identifier, 'external-id-url.value', '')
            if not re.match(r'.*inspire.*', id_url, re.I):
                continue

            id_value = identifier.get('external-id-value')
            if not id_value:
                continue

            # recid found.
            return id_value

        # The recid was not found in the external_identifiers.
        # Thus we call get_bulk_works_details_iter().
        putcodes_recid = list(self._get_putcodes_and_recids_iter([putcode]))

        if putcodes_recid:
            return putcodes_recid[0][1]
        return None
Пример #5
0
class TestGenerateGetBulkWorksDetails(object):
    """Tests for ``OrcidClient.get_bulk_works_details_iter()``."""

    def setup(self):
        """Prepare the putcodes fixture, the ORCID, the token and the client."""
        # Putcodes requested by the tests below (passed as strings).
        self.putcodes = [
            '43326850', '43255490', '43183518', '43857637', '43257979',
            '43938460', '43553536', '43846642', '43869107', '43466717',
            '43880082', '43852910', '44762573', '44762737', '44762744',
            '44762721', '44762617', '43257122', '43861964', '43938538',
            '43606530', '43855125', '44762615', '44762741', '43554289',
            '44762570', '44762735', '44762597', '43859780', '43941962',
            '43856818', '43938515', '43864453', '43875319', '43935537',
            '43467792', '44077351', '43554306', '44472652', '43911727',
            '43922432', '43916436', '43907796', '43924927', '43923874',
            '43938553', '43938542', '43878004', '43935695', '43881622',
            '43935569', '44231173', '43880802', '43938523', '43938458',
            '43935897', '43919253', '43918420', '43938697', '43920855',
            '43933388', '43942717', '43910178', '44515789', '43882441',
            '43935355', '43935418', '43935500', '43929711', '43935348',
            '43938613', '43919864', '43885354', '43935660', '43882622',
            '43935419', '43935519', '43942195', '43935682', '43949957',
            '43941870', '43938614', '43938644', '43941852', '43935478',
            '43937005', '44216033', '43948457', '43942230', '43938670',
            '43935725', '43942117', '43935577', '44227246', '43942042',
            '44219584', '43942229', '43942467', '43935574', '43461438',
            '43939244', '43942225', '43942110', '44218042', '44236863',
            '43942221', '43935690', '43938687', '43942306', '43326714',
            '43935600', '43935671', '43935595', '44229237', '43942579',
            '43935727', '43939389', '43935714', '44232896', '44227649',
            '43935744', '43938719', '43938710', '43942556', '44237648',
            '44226428', '43938991', '44236016', '43935746', '44236622',
            '43938809', '44234262', '43942562', '43939267', '43935804',
            '43935814', '44235446', '44238589', '43476255', '44238117',
            '43942245', '43935831', '44255508', '43935773', '43935525',
            '43349513', '43939364', '43942333', '44259358', '43334280',
            '43935879', '43474664', '43942483', '43868647', '43942582',
            '44269186', '43935857', '43939273', '44265932', '43328661',
            '43939436', '44575020', '44252784', '43473085', '43935955',
            '43329599', '43474084', '43942511', '43935852', '43325385',
            '43935788', '43942608', '43935829', '43942738', '43935875',
            '43939367', '44274797', '43328989', '43474829', '43942339',
            '43330602', '43939455', '43939372', '43943050', '43351389',
            '43328159', '43329373', '43935762', '43939467', '43943007',
            '43476291', '44272682', '43478322', '43343506', '43483181',
            '43347500', '43333264', '43858017', '43473511', '43332255',
            '43476010', '43350059', '44251364', '43475852', '43353967',
            '43849619', '43819343', '43339682', '43348858', '43333748',
            '44217143', '44232508', '43822751', '43939441', '43339402',
            '44284285', '43478099', '43356509', '43942969', '43348252',
            '43483990', '43936102', '43939877', '43935994', '44575015',
            '43939643', '44285709', '43352429', '43942965', '43364988',
            '44265579', '43939719', '43940213', '43368521', '43939725',
            '43361294', '43936167', '43293661', '43362128', '43940188',
            '43358238', '43936143', '44283137', '44284877', '43356836',
            '43939941', '44293857', '43363375', '43361159', '43365921',
            '43939949', '43941280', '43368183', '44291548', '43360300',
            '43366583', '43936275', '43370435', '43939860', '43361521',
            '43936314', '43942905', '43942981', '43292406', '43367691',
            '44317462'
        ]  # noqa: E501
        self.orcid = '0000-0002-6665-4934'  # ATLAS author.
        try:
            # Pick the token from settings_local.py first.
            self.oauth_token = inspire_service_orcid.conf.settings.OAUTH_TOKENS.get(
                self.orcid)
        except AttributeError:
            # Fall back to a placeholder token when the lookup above is not
            # available.
            self.oauth_token = 'mytoken'
        self.client = OrcidClient(self.oauth_token, self.orcid)

    def test_happy_flow(self):
        """Each response is ok and its first/last works are requested putcodes."""
        for response in self.client.get_bulk_works_details_iter(self.putcodes):
            response.raise_for_result()
            assert response.ok
            # The put-code in the response is compared as a string because
            # the fixture holds strings.
            assert str(
                response['bulk'][0]['work']['put-code']) in self.putcodes
            assert str(
                response['bulk'][-1]['work']['put-code']) in self.putcodes

    def test_too_many_putcodes(self):
        """
        With the client's ``MAX_PUTCODES_PER_WORKS_DETAILS_REQUEST`` patched
        to 101, requesting 101 putcodes makes ``raise_for_result()`` raise
        ``ExceedMaxNumberOfPutCodesException``.
        """
        from inspire_service_orcid import client
        # NOTE(review): presumably the limit is raised so that all the 101
        # putcodes end up in a single request - confirm against the client.
        with mock.patch.object(client,
                               'MAX_PUTCODES_PER_WORKS_DETAILS_REQUEST', 101):
            for response in self.client.get_bulk_works_details_iter(
                [str(x) for x in range(101)]):
                with pytest.raises(
                        exceptions.ExceedMaxNumberOfPutCodesException):
                    response.raise_for_result()

    def test_get_putcodes_and_urls(self):
        """``get_putcodes_and_urls_iter()`` returns the expected first/last pairs."""
        for response in self.client.get_bulk_works_details_iter(self.putcodes):
            response.raise_for_result()
            assert response.ok
            putcodes_and_urls = list(response.get_putcodes_and_urls_iter())
            # Note: the recorded cassette returns the same result for each for loop.
            assert putcodes_and_urls[0] == (
                '43183518', 'http://inspirehep.net/record/1665234')
            assert putcodes_and_urls[-1] == (
                '44227246', 'http://inspirehep.net/record/1515025')

    def test_single_work_error(self):
        """
        Two putcodes are requested but only one putcode/url pair is expected.
        """
        # NOTE(review): going by the test name, the work '51540408' is
        # presumably returned as an error item - confirm against the
        # recorded response.
        self.putcodes = ['51540408', '51496313']
        result = []
        for response in self.client.get_bulk_works_details_iter(self.putcodes):
            response.raise_for_result()
            assert response.ok
            result += (list(response.get_putcodes_and_urls_iter()))

        assert result == [('51496313',
                           'http://inspireheptest.cern.ch/record/20')]