Пример #1
0
    for restriction in xml.findall(
            './postprints/postrestrictions/postrestriction'):
        addRestriction(restriction, 'postprint', publisher)

    for restriction in xml.findall(
            './pdfversion/pdfrestrictions/pdfrestriction'):
        addRestriction(restriction, 'pdfversion', publisher)

    for condition in xml.findall('./conditions/condition'):
        if condition.text:
            c = PublisherCondition(publisher=publisher,
                                   text=condition.text.strip())
            c.save()

    # Update the publisher status
    publisher.oa_status = publisher.classify_oa_status()
    publisher.save(update_fields=['oa_status'])

    for link in xml.findall('./copyrightlinks/copyrightlink'):
        text = None
        url = None
        texts = link.findall('./copyrightlinktext')
        if texts:
            text = nstrip(texts[0].text)
        urls = link.findall('./copyrightlinkurl')
        if urls:
            url = nstrip(urls[0].text)
        if url and text:
            cplink = PublisherCopyrightLink(text=text,
                                            url=url,
                                            publisher=publisher)
Пример #2
0
    def get_or_create_publisher(self, romeo_xml_description):
        """
        Retrieves from the model, or creates into the model,
        the publisher corresponding to the <publisher> description
        from RoMEO.

        If the data from RoMEO is more fresh than what we have
        in cache, we update our model: the publisher's fields are
        overwritten and its conditions, restriction details and copyright
        links are deleted and recreated from the XML.

        :param romeo_xml_description: the <publisher> XML element. It is
            accessed through ``attrib`` and ``findall``.
        :returns: the stored publisher. This is the existing record,
            untouched, when its ``last_updated`` is not older than the
            date found in the XML.
        :raises MetadataSourceException: if the XML lacks the publisher id,
            the name, or (when a create/update is needed) any of the
            preprint, postprint or pdf archiving policies.
        """
        xml = romeo_xml_description

        def required_text(path, error_message):
            # Stripped text of the first element at `path`; a missing
            # element or an empty text is reported as a metadata error.
            try:
                return xml.findall(path)[0].text.strip()
            except (KeyError, IndexError, AttributeError):
                raise MetadataSourceException(error_message)

        try:
            romeo_id = xml.attrib['id']
        except KeyError:
            raise MetadataSourceException('RoMEO did not provide a publisher id.')

        # The parent id is optional.
        romeo_parent_id = xml.attrib.get('parentid')

        try:
            raw_name = xml.findall('./name')[0].text.strip()
            name = fromstring(kill_html(sanitize_html(raw_name))).text
        except (KeyError, IndexError, AttributeError):
            raise MetadataSourceException(
                'RoMEO did not provide the publisher\'s name.')

        # The alias is optional too.
        alias = None
        try:
            alias = nstrip(xml.findall('./alias')[0].text)
            if alias:
                alias = fromstring(kill_html(sanitize_html(alias))).text
        except (KeyError, IndexError):
            pass

        last_update = self._get_romeo_date(xml, './dateupdated')

        # Check if we already have it.
        # Sadly the romeo_id is not unique (as publishers imported from doaj
        # all get the same id, so we have to use the name too).
        # The pattern is anchored at the end so that only ids made of digits
        # *exclusively* are trusted as unambiguous; an id that merely starts
        # with a digit goes through the name/alias lookup.
        if re.match(r'\d+\Z', romeo_id):
            matches = Publisher.objects.filter(romeo_id=romeo_id)
        elif alias:
            matches = Publisher.objects.filter(
                romeo_id=romeo_id, name__iexact=name, alias__iexact=alias)
        else:
            matches = Publisher.objects.filter(
                romeo_id=romeo_id, name__iexact=name, alias__isnull=True)

        existing = matches[0] if matches else None
        # Keep the stored record only when both dates are known and ours is
        # at least as recent. NOTE(review): this assumes _get_romeo_date may
        # return None when the XML has no date; comparing a date with None
        # would raise, so an unknown date triggers a refresh instead.
        if (existing is not None
                and existing.last_updated is not None
                and last_update is not None
                and existing.last_updated >= last_update):
            return existing

        # Otherwise, create it (or refresh the stale record)
        url = None
        try:
            url = nstrip(xml.findall('./homeurl')[0].text)
        except (KeyError, IndexError):
            pass

        preprint = required_text(
            './preprints/prearchiving',
            'RoMEO did not provide the preprint policy.')
        postprint = required_text(
            './postprints/postarchiving',
            'RoMEO did not provide the postprint policy.')
        pdfversion = required_text(
            './pdfversion/pdfarchiving',
            'RoMEO did not provide the pdf archiving policy.')

        publisher = existing if existing is not None else Publisher()

        publisher.name = name
        publisher.alias = alias
        publisher.url = url
        publisher.preprint = preprint
        publisher.postprint = postprint
        publisher.pdfversion = pdfversion
        publisher.romeo_id = romeo_id
        publisher.romeo_parent_id = romeo_parent_id
        # Placeholder: the real status is computed below, once the
        # conditions and restrictions have been stored.
        publisher.oa_status = 'UNK'
        publisher.last_updated = last_update
        publisher.save()

        if existing is not None:
            # Drop the outdated related records before re-adding them.
            publisher.publishercopyrightlink_set.all().delete()
            publisher.publisherrestrictiondetail_set.all().delete()
            publisher.publishercondition_set.all().delete()

        # Add the conditions, restrictions, and copyright
        for restriction in xml.findall('./preprints/prerestrictions/prerestriction'):
            self.add_restriction(restriction, 'preprint', publisher)

        for restriction in xml.findall('./postprints/postrestrictions/postrestriction'):
            self.add_restriction(restriction, 'postprint', publisher)

        for restriction in xml.findall('./pdfversion/pdfrestrictions/pdfrestriction'):
            self.add_restriction(restriction, 'pdfversion', publisher)

        for condition in xml.findall('./conditions/condition'):
            if condition.text:
                PublisherCondition(publisher=publisher,
                                   text=condition.text.strip()).save()

        # Update the publisher status
        publisher.oa_status = publisher.classify_oa_status()
        publisher.save(update_fields=['oa_status'])

        # TODO: if the OA status has changed, then we should update the journals and papers accordingly with the
        # adequate task

        for link in xml.findall('./copyrightlinks/copyrightlink'):
            text = None
            url = None
            texts = link.findall('./copyrightlinktext')
            if texts:
                text = nstrip(texts[0].text)
            urls = link.findall('./copyrightlinkurl')
            if urls:
                url = nstrip(urls[0].text)
            # Only links having both a label and a target are stored; the
            # URL is cut to its first 1024 characters.
            if url and text:
                PublisherCopyrightLink(
                    text=text, url=url[:1024], publisher=publisher).save()

        return publisher