Example #1
def get_or_create_publisher(romeo_xml_description):
    """
    Retrieves from the model, or creates into the model,
    the publisher corresponding to the <publisher> description
    from RoMEO
    """
    xml = romeo_xml_description
    romeo_id = None
    try:
        romeo_id = xml.attrib['id']
    except KeyError:
        raise MetadataSourceException('RoMEO did not provide a publisher id.')

    name = None
    try:
        raw_name = xml.findall('./name')[0].text.strip()
        name = fromstring(kill_html(sanitize_html(raw_name))).text
    except (KeyError, IndexError, AttributeError):
        raise MetadataSourceException(
            'RoMEO did not provide the publisher\'s name.')

    alias = None
    try:
        alias = nstrip(xml.findall('./alias')[0].text)
        if alias:
            alias = fromstring(kill_html(sanitize_html(alias))).text
    except (KeyError, IndexError):
        pass

    # Check if we already have it
    matches = None
    if alias:
        matches = Publisher.objects.filter(romeo_id=romeo_id,
                                           name__iexact=name,
                                           alias__iexact=alias)
    else:
        matches = Publisher.objects.filter(romeo_id=romeo_id,
                                           name__iexact=name,
                                           alias__isnull=True)
    if matches:
        return matches[0]

    # Otherwise, create it
    url = None
    try:
        url = nstrip(xml.findall('./homeurl')[0].text)
    except (KeyError, IndexError):
        pass
Example #2
def get_or_create_publisher(romeo_xml_description):
    """
    Retrieves from the model, or creates into the model,
    the publisher corresponding to the <publisher> description
    from RoMEO
    """
    xml = romeo_xml_description
    romeo_id = None
    try:
        romeo_id = xml.attrib['id']
    except KeyError:
        raise MetadataSourceException('RoMEO did not provide a publisher id.\n'+
                'URL was: '+request)
    
    name = None
    try:
        raw_name = xml.findall('./name')[0].text.strip()
        name = fromstring(kill_html(sanitize_html(raw_name))).text
    except (KeyError, IndexError, AttributeError):
        raise MetadataSourceException('RoMEO did not provide the publisher\'s name.\n'+
                'URL was: '+request)

    alias = None
    try:
        alias = nstrip(xml.findall('./alias')[0].text)
        if alias:
            alias = fromstring(kill_html(sanitize_html(alias))).text
    except (KeyError, IndexError):
        pass
Example #3
def fetch_journal(search_terms, matching_mode='exact'):
    """
    Fetch the journal data from RoMEO. Returns a Journal object.
    search_terms should be a dictionary object containing at least one of these fields: 'issn', 'jtitle'.
    """
    allowed_fields = ['issn', 'jtitle']
    # Make the title HTML-safe before searching for it in the database or in the API
    if 'title' in search_terms:
        search_terms['title'] = kill_html(search_terms['title'])
    original_search_terms = search_terms.copy()

    # Check the arguments
    if not all(
            map(lambda x: x in allowed_fields, (key for key in search_terms))):
        raise ValueError('The search terms have to belong to ' +
                         str(allowed_fields) + ' but the dictionary I got is ' +
                         str(search_terms))

    # Remove diacritics (because it has to be sent in ASCII to ROMEO)
    for key in search_terms:
        search_terms[key] = remove_diacritics(search_terms[key])

    # First check we don't have it already
    journal = find_journal_in_model(search_terms)
    if journal:
        return journal

    # Perform the query
    root = perform_romeo_query(search_terms)

    # Find the matching journals (if any)
    journals = list(root.findall('./journals/journal'))
    if not journals:
        # Retry with a less restrictive matching type
        if matching_mode == 'exact':
            return fetch_journal(original_search_terms, 'contains')
        return None
    if len(journals) > 1:
        print("Warning, " + str(len(journals)) +
              " journals match the RoMEO request, " +
              "defaulting to the first one")
        # TODO different behaviour: get the ISSN and try again.
    journal = journals[0]

    names = list(journal.findall('./jtitle'))
    if not names:
        raise MetadataSourceException(
            'RoMEO returned a journal without title.\n' + 'URL was: ' +
            request)
    if len(names) > 1:
        print("Warning, " + str(len(names)) +
              " names provided for one journal, " +
              "defaulting to the first one")
    name = kill_html(names[0].text)

    issn = None
    try:
        issn = nstrip(journal.findall('./issn')[0].text)
    except (KeyError, IndexError):
        pass
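The remove_diacritics helper used above is not defined in these examples; per the comment, its job is to make the query ASCII-safe before it is sent to RoMEO. A minimal sketch of one common way to do this (Unicode NFKD decomposition, then dropping the combining marks) follows; the project's actual helper may differ:

import unicodedata

def remove_diacritics(text):
    # Sketch only: decompose accented characters and drop anything that
    # cannot be represented in ASCII, e.g. 'résumé' -> 'resume'.
    nfkd = unicodedata.normalize('NFKD', text)
    return nfkd.encode('ascii', 'ignore').decode('ascii')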
Example #4
def addRestriction(xml, applies_to, publisher):
    text = nstrip(xml.text)
    if text:
        r = PublisherRestrictionDetail(publisher=publisher,
                                       applies_to=applies_to,
                                       text=text)
        r.save()
Example #5
def get_or_create_publisher(romeo_xml_description):
    """
    Retrieves from the model, or creates into the model,
    the publisher corresponding to the <publisher> description
    from RoMEO
    """
    xml = romeo_xml_description
    romeo_id = None
    try:
        romeo_id = xml.attrib['id']
    except KeyError:
        raise MetadataSourceException(
            'RoMEO did not provide a publisher id.\n' + 'URL was: ' + request)

    name = None
    try:
        raw_name = xml.findall('./name')[0].text.strip()
        name = fromstring(kill_html(sanitize_html(raw_name))).text
    except (KeyError, IndexError, AttributeError):
        raise MetadataSourceException(
            'RoMEO did not provide the publisher\'s name.\n' + 'URL was: ' +
            request)

    alias = None
    try:
        alias = nstrip(xml.findall('./alias')[0].text)
        if alias:
            alias = fromstring(kill_html(sanitize_html(alias))).text
    except (KeyError, IndexError):
        pass
Example #6
    def add_restriction(self, xml, applies_to, publisher):
        """
        Creates a sharing restriction (SAD!) for a publisher
        """
        text = nstrip(xml.text)
        if text:
            r = PublisherRestrictionDetail(
                publisher=publisher, applies_to=applies_to, text=text)
            r.save()
Example #7
def fetch_journal(search_terms, matching_mode='exact'):
    """
    Fetch the journal data from RoMEO. Returns a Journal object.
    search_terms should be a dictionary object containing at least one of these fields: 'issn', 'jtitle'.
    """
    allowed_fields = ['issn', 'jtitle']
    # Make the title HTML-safe before searching for it in the database or in the API
    if 'title' in search_terms:
        search_terms['title'] = kill_html(search_terms['title'])
    original_search_terms = search_terms.copy()

    # Check the arguments
    if not all(map(lambda x: x in allowed_fields, (key for key in search_terms))):
        raise ValueError('The search terms have to belong to '+str(allowed_fields)+
                ' but the dictionary I got is '+str(search_terms))

    # Remove diacritics (because it has to be sent in ASCII to ROMEO)
    for key in search_terms:
        search_terms[key] = remove_diacritics(search_terms[key])

    # First check we don't have it already
    journal = find_journal_in_model(search_terms)
    if journal:
        return journal

    # Perform the query
    root = perform_romeo_query(search_terms)

    # Find the matching journals (if any)
    journals = list(root.findall('./journals/journal'))
    if not journals:
        # Retry with a less restrictive matching type
        if matching_mode == 'exact':
            return fetch_journal(original_search_terms, 'contains')
        return None
    if len(journals) > 1:
        print ("Warning, "+str(len(journals))+" journals match the RoMEO request, "+
                "defaulting to the first one")
        # TODO different behaviour: get the ISSN and try again.
    journal = journals[0]

    names = list(journal.findall('./jtitle'))
    if not names:
        raise MetadataSourceException('RoMEO returned a journal without title.\n'+
                'URL was: '+request)
    if len(names) > 1:
        print("Warning, "+str(len(names))+" names provided for one journal, "+
                "defaulting to the first one")
    name = kill_html(names[0].text)
    
    issn = None
    try:
        issn = nstrip(journal.findall('./issn')[0].text)
    except (KeyError, IndexError):
        pass
Example #8
    def test_nstrip(self):
        self.assertTrue(nstrip(None) is None)
        self.assertEqual(nstrip('aa'), 'aa')
        self.assertEqual(nstrip('  aa \n'), 'aa')
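The test above pins down the expected behaviour of nstrip: None is passed through unchanged, and surrounding whitespace is stripped otherwise. A minimal sketch consistent with these assertions (the project's real definition is not shown in these examples):

def nstrip(value):
    # Sketch only: a None-safe strip(), matching the assertions in test_nstrip.
    if value is None:
        return None
    return value.strip()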
Example #9
def addRestriction(xml, applies_to, publisher):
    text = nstrip(xml.text)
    if text:
        r = PublisherRestrictionDetail(publisher=publisher, applies_to=applies_to, text=text)
        r.save()
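For context, the method variant in Example #14 shows how this helper is driven: once the Publisher record is saved, every prerestriction, postrestriction and pdfrestriction element of the RoMEO description is passed to it. A hedged usage sketch with the free-function form above, assuming xml is the parsed <publisher> element and publisher the saved Publisher instance:

for restriction in xml.findall('./preprints/prerestrictions/prerestriction'):
    addRestriction(restriction, 'preprint', publisher)
for restriction in xml.findall('./postprints/postrestrictions/postrestriction'):
    addRestriction(restriction, 'postprint', publisher)
for restriction in xml.findall('./pdfversion/pdfrestrictions/pdfrestriction'):
    addRestriction(restriction, 'pdfversion', publisher)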
Example #10
    except (KeyError, IndexError):
        pass

    # Check if we already have it
    matches = None
    if alias:
        matches = Publisher.objects.filter(romeo_id=romeo_id, name__iexact=name, alias__iexact=alias)
    else:
        matches = Publisher.objects.filter(romeo_id=romeo_id, name__iexact=name, alias__isnull=True)
    if matches:
        return matches[0]

    # Otherwise, create it
    url = None
    try:
        url = nstrip(xml.findall('./homeurl')[0].text)
    except (KeyError, IndexError):
        pass

    preprint = None
    try:
        preprint = xml.findall('./preprints/prearchiving')[0].text.strip()
    except (KeyError, IndexError, AttributeError):
        raise MetadataSourceException('RoMEO did not provide the preprint policy.\n'+
                'URL was: '+request)

    postprint = None
    try:
        postprint = xml.findall('./postprints/postarchiving')[0].text.strip()
    except (KeyError, IndexError, AttributeError):
        raise MetadataSourceException('RoMEO did not provide the postprint policy.\n'+
Example #11
    matches = None
    if alias:
        matches = Publisher.objects.filter(romeo_id=romeo_id,
                                           name__iexact=name,
                                           alias__iexact=alias)
    else:
        matches = Publisher.objects.filter(romeo_id=romeo_id,
                                           name__iexact=name,
                                           alias__isnull=True)
    if matches:
        return matches[0]

    # Otherwise, create it
    url = None
    try:
        url = nstrip(xml.findall('./homeurl')[0].text)
    except (KeyError, IndexError):
        pass

    preprint = None
    try:
        preprint = xml.findall('./preprints/prearchiving')[0].text.strip()
    except (KeyError, IndexError, AttributeError):
        raise MetadataSourceException(
            'RoMEO did not provide the preprint policy.\n' + 'URL was: ' +
            request)

    postprint = None
    try:
        postprint = xml.findall('./postprints/postarchiving')[0].text.strip()
    except (KeyError, IndexError, AttributeError):
Example #12
def fetch_journal(search_terms, matching_mode='exact'):
    """
    Fetch the journal data from RoMEO. Returns a Journal object.
    search_terms should be a dictionary object containing at least one of these fields: 'issn', 'jtitle'.
    """
    allowed_fields = ['issn', 'jtitle']
    terms = search_terms.copy()
    # Make the title HTML-safe before searching for it in the database or in
    # the API
    if 'title' in terms:
        terms['title'] = kill_html(terms['title'])

    # Check the arguments
    if not all(key in allowed_fields for key in terms):
        raise ValueError('The search terms have to belong to ' +
                         str(allowed_fields) + ' but the dictionary I got is ' +
                         str(terms))

    # Remove diacritics (because it has to be sent in ASCII to ROMEO)
    for key in terms:
        terms[key] = remove_diacritics(terms[key])
        if len(terms[key]) > 256:
            return None

    # First check we don't have it already
    journal = find_journal_in_model(terms)
    if journal:
        return journal

    # Perform the query
    if matching_mode != 'exact':
        terms['qtype'] = matching_mode
    root = perform_romeo_query(terms)

    # Find the matching journals (if any)
    journals = list(root.findall('./journals/journal'))

    if not journals:
        return None
    elif len(journals) > 1:
        print("Warning, " + str(len(journals)) +
              " journals match the RoMEO request, " +
              "defaulting to the first one")
        # TODO different behaviour: get the ISSN and try again.
    journal = journals[0]

    names = list(journal.findall('./jtitle'))
    if not names:
        raise MetadataSourceException(
            'RoMEO returned a journal without title.\n' + 'Terms were: ' +
            unicode(terms))
    if len(names) > 1:
        print("Warning, " + str(len(names)) +
              " names provided for one journal, " +
              "defaulting to the first one")
    name = kill_html(names[0].text)

    issn = None
    try:
        issn = nstrip(journal.findall('./issn')[0].text)
    except (KeyError, IndexError):
        pass

    # Now we may have additional info, so it's worth trying again in the model
    model_journal = find_journal_in_model({'issn': issn, 'jtitle': name})
    if model_journal:
        return model_journal

    # Otherwise we need to find the publisher
    publishers = root.findall('./publishers/publisher')
    if not publishers:
        return None
    # TODO here we shouldn't default to the first one but look it up using the
    # <romeopub>
    publisher_desc = publishers[0]

    publisher = get_or_create_publisher(publisher_desc)

    result = Journal(title=name, issn=issn, publisher=publisher)
    result.save()
    return result
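A short usage sketch for the version above. Per allowed_fields, the search terms may contain 'issn' and/or 'jtitle'; the ISSN and title values below are purely illustrative:

# Look the journal up by ISSN first, then fall back to a fuzzier title match.
journal = fetch_journal({'issn': '0028-0836'})
if journal is None:
    journal = fetch_journal({'jtitle': 'Nature'}, matching_mode='contains')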
Example #13
    for condition in xml.findall('./conditions/condition'):
        if condition.text:
            c = PublisherCondition(publisher=publisher,
                                   text=condition.text.strip())
            c.save()

    # Update the publisher status
    publisher.oa_status = publisher.classify_oa_status()
    publisher.save(update_fields=['oa_status'])

    for link in xml.findall('./copyrightlinks/copyrightlink'):
        text = None
        url = None
        texts = link.findall('./copyrightlinktext')
        if texts:
            text = nstrip(texts[0].text)
        urls = link.findall('./copyrightlinkurl')
        if urls:
            url = nstrip(urls[0].text)
        if url and text:
            cplink = PublisherCopyrightLink(text=text,
                                            url=url,
                                            publisher=publisher)
            cplink.save()

    return publisher


def addRestriction(xml, applies_to, publisher):
    text = nstrip(xml.text)
    if text:
Example #14
    def get_or_create_publisher(self, romeo_xml_description):
        """
        Retrieves from the model, or creates into the model,
        the publisher corresponding to the <publisher> description
        from RoMEO.

        If the data from RoMEO is more fresh than what we have
        in cache, we update our model.
        """
        xml = romeo_xml_description
        romeo_id = None
        try:
            romeo_id = xml.attrib['id']
        except KeyError:
            raise MetadataSourceException('RoMEO did not provide a publisher id.')

        romeo_parent_id = None
        try:
            romeo_parent_id = xml.attrib['parentid']
        except KeyError:
            pass

        name = None
        try:
            raw_name = xml.findall('./name')[0].text.strip()
            name = fromstring(kill_html(sanitize_html(raw_name))).text
        except (KeyError, IndexError, AttributeError):
            raise MetadataSourceException(
                'RoMEO did not provide the publisher\'s name.')

        alias = None
        try:
            alias = nstrip(xml.findall('./alias')[0].text)
            if alias:
                alias = fromstring(kill_html(sanitize_html(alias))).text
        except (KeyError, IndexError):
            pass

        last_update = self._get_romeo_date(xml, './dateupdated')

        # Check if we already have it.
        # Sadly the romeo_id is not unique (as publishers imported from doaj
        # all get the same id, so we have to use the name too).
        matches = None
        if re.match(r'\d+', romeo_id): # numeric ids are unambiguous
            matches = Publisher.objects.filter(romeo_id=romeo_id)
        elif alias:
            matches = Publisher.objects.filter(
                romeo_id=romeo_id, name__iexact=name, alias__iexact=alias)
        else:
            matches = Publisher.objects.filter(
                romeo_id=romeo_id, name__iexact=name, alias__isnull=True)
        if matches:
            first_match = matches[0]
            if first_match.last_updated is not None and first_match.last_updated >= last_update:
                return matches[0]

        # Otherwise, create it
        url = None
        try:
            url = nstrip(xml.findall('./homeurl')[0].text)
        except (KeyError, IndexError):
            pass

        preprint = None
        try:
            preprint = xml.findall('./preprints/prearchiving')[0].text.strip()
        except (KeyError, IndexError, AttributeError):
            raise MetadataSourceException(
                'RoMEO did not provide the preprint policy.')

        postprint = None
        try:
            postprint = xml.findall('./postprints/postarchiving')[0].text.strip()
        except (KeyError, IndexError, AttributeError):
            raise MetadataSourceException(
                'RoMEO did not provide the postprint policy.')

        pdfversion = None
        try:
            pdfversion = xml.findall('./pdfversion/pdfarchiving')[0].text.strip()
        except (KeyError, IndexError, AttributeError):
            raise MetadataSourceException(
                'RoMEO did not provide the pdf archiving policy.')

        # Compute OA status of the publisher
        status = 'UNK'

        if not matches:
            publisher = Publisher()
        else:
            publisher = matches[0]

        publisher.name = name
        publisher.alias = alias
        publisher.url = url
        publisher.preprint = preprint
        publisher.postprint = postprint
        publisher.pdfversion = pdfversion
        publisher.romeo_id = romeo_id
        publisher.romeo_parent_id = romeo_parent_id
        publisher.oa_status = status
        publisher.last_updated = last_update
        publisher.save()

        if matches:
            publisher.publishercopyrightlink_set.all().delete()
            publisher.publisherrestrictiondetail_set.all().delete()
            publisher.publishercondition_set.all().delete()

        # Add the conditions, restrictions, and copyright
        for restriction in xml.findall('./preprints/prerestrictions/prerestriction'):
            self.add_restriction(restriction, 'preprint', publisher)

        for restriction in xml.findall('./postprints/postrestrictions/postrestriction'):
            self.add_restriction(restriction, 'postprint', publisher)

        for restriction in xml.findall('./pdfversion/pdfrestrictions/pdfrestriction'):
            self.add_restriction(restriction, 'pdfversion', publisher)

        for condition in xml.findall('./conditions/condition'):
            if condition.text:
                c = PublisherCondition(publisher=publisher,
                                       text=condition.text.strip())
                c.save()

        # Update the publisher status
        publisher.oa_status = publisher.classify_oa_status()
        publisher.save(update_fields=['oa_status'])

        # TODO: if the OA status has changed, then we should update the journals and papers accordingly with the
        # adequate task

        for link in xml.findall('./copyrightlinks/copyrightlink'):
            text = None
            url = None
            texts = link.findall('./copyrightlinktext')
            if texts:
                text = nstrip(texts[0].text)
            urls = link.findall('./copyrightlinkurl')
            if urls:
                url = nstrip(urls[0].text)
            if url and text:
                cplink = PublisherCopyrightLink(
                    text=text, url=url[:1024], publisher=publisher)
                cplink.save()

        return publisher
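Finally, a hedged sketch of how this method is typically reached, based on Example #12: the <publisher> element is taken from the RoMEO response and handed over for caching. RomeoAPI is only a placeholder name for whichever class defines the method:

api = RomeoAPI()  # placeholder for the class holding get_or_create_publisher
root = perform_romeo_query({'jtitle': 'Nature'})  # illustrative search terms
publisher_descriptions = root.findall('./publishers/publisher')
if publisher_descriptions:
    publisher = api.get_or_create_publisher(publisher_descriptions[0])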