Example 1
    def get_object(self):
        queryset = self.get_queryset()
        pk = self.kwargs.get('pk', None)
        doi = self.kwargs.get('doi', None)
        if doi:
            doi = to_doi(doi)

        paper = None
        try:
            if pk is not None:
                paper = queryset.get(pk=pk)
            elif doi is not None:
                paper = Paper.get_by_doi(doi)
            else:
                raise AttributeError("Paper view expects a DOI or a pk")
        except ObjectDoesNotExist:
            pass

        if not paper:
            paper = Paper.create_by_doi(doi)
            if paper is None or paper.is_orphan():
                raise Http404(
                    _("No %(verbose_name)s found matching the query") %
                    {'verbose_name': Paper._meta.verbose_name})
        return paper
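All of the get_object() variants in this collection read either a pk or a doi keyword argument, which implies that two URL patterns point at the same view. A hypothetical Django URLconf along those lines (the route names and the import path are assumptions, not dissemin's actual configuration):

from django.urls import path, re_path

from papers.views import PaperView  # hypothetical import path

urlpatterns = [
    # Lookup by primary key.
    path('paper/<int:pk>/', PaperView.as_view(), name='paper'),
    # Lookup by DOI; DOIs contain slashes, so capture the rest of the path.
    re_path(r'^paper/(?P<doi>10\..+)$', PaperView.as_view(), name='paper-doi'),
]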
Example 2
    def _post_filter(self, urls):
        if '1' in self.metadata.get('oa', []):
            urls['pdf'] = urls.get('splash')

        # Special case for PMC as their metadata includes other urls
        pmc_match = pmc_id_re.match(self.header.identifier())
        if pmc_match:
            pmc_url = None
            for u in self.metadata.get('identifier', []):
                # Rationale: PMC URLs take priority, but PMID URLs can be
                # used when no PMC URL is provided (because we know they
                # eventually resolve to PMC, from the identifier).
                if pmc_url_re.match(u) or (not pmc_url and pmid_url_re.match(u)):
                    pmc_url = u

            urls['splash'] = pmc_url
            urls['pdf'] = pmc_url

        # Special case for DOIs
        if urls.get('splash'):
            doi = to_doi(urls.get('splash'))
            if doi:
                doi_prefix = doi.split('/')[0]
                if doi_prefix in free_doi_prefixes:
                    urls['pdf'] = urls['splash']

        return urls
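The PMC branch above relies on three regular expressions (pmc_id_re, pmc_url_re and pmid_url_re) defined elsewhere in the project. A minimal standalone sketch of the prioritisation loop, with illustrative stand-ins for those patterns (the real ones may differ):

import re

# Illustrative stand-ins, not the project's actual patterns.
pmc_url_re = re.compile(r'https?://www\.ncbi\.nlm\.nih\.gov/pmc/articles/PMC\d+')
pmid_url_re = re.compile(r'https?://www\.ncbi\.nlm\.nih\.gov/pubmed/\d+')

def pick_pmc_url(identifiers):
    """Prefer a PMC article URL, falling back to a PMID URL."""
    pmc_url = None
    for u in identifiers:
        if pmc_url_re.match(u) or (not pmc_url and pmid_url_re.match(u)):
            pmc_url = u
    return pmc_url

print(pick_pmc_url([
    'https://www.ncbi.nlm.nih.gov/pubmed/12345',
    'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC67890',
]))  # prints the PMC URL: it overrides the PMID URL seen earlier

The point of the loop is that a PMC URL always wins, while a PMID URL is only kept as long as nothing better has been seen.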
Example 3
    def get_object(self, queryset=None):
        if queryset is None:
            queryset = self.get_queryset()
        pk = self.kwargs.get('pk', None)
        doi = self.kwargs.get('doi', None)
        if doi:
            doi = unquote(doi)
            doi = to_doi(doi)

        paper = None
        try:
            if pk is not None:
                paper = queryset.get(pk=pk)
            elif doi is not None:
                paper = Paper.get_by_doi(doi)
            else:
                raise Http404(_("Paper view expects a DOI or a pk"))
        except ObjectDoesNotExist:
            pass

        if paper is None or paper.is_orphan():
            raise Http404(
                _("No %(verbose_name)s found matching the query") %
                {'verbose_name': Paper._meta.verbose_name})

        if not paper.visible:
            raise Http404(_("This paper has been deleted."))

        paper = queryset.prefetch_related('oairecord_set').get(pk=paper.pk)

        return paper
Example 4
    def get_object(self, queryset=None):
        if queryset is None:
            queryset = self.get_queryset()
        pk = self.kwargs.get('pk', None)
        doi = self.kwargs.get('doi', None)
        if doi:
            doi = unquote(doi)
            doi = to_doi(doi)

        paper = None
        try:
            if pk is not None:
                paper = queryset.get(pk=pk)
            elif doi is not None:
                paper = Paper.get_by_doi(doi)
            else:
                raise Http404(_("Paper view expects a DOI or a pk"))
        except ObjectDoesNotExist:
            pass

        if not paper:
            paper = Paper.create_by_doi(doi)
            if paper is None or paper.is_orphan():
                raise Http404(_("No %(verbose_name)s found matching the query") %
                              {'verbose_name': Paper._meta.verbose_name})

        if not paper.visible:
            raise Http404(_("This paper has been deleted."))

        return paper
Example 5
 def _get_doi(data):
     """
     :param data: citeproc metadata
     :returns: the DOI
     :raises CiteprocDOIError: if the DOI is missing or invalid
     """
     doi = to_doi(data.get('DOI', ''))
     if doi is None:
         raise CiteprocDOIError('Invalid DOI in metadata')
     return doi
Example 6
 def dois(self):
     dois = []
     for extid in self.j('work-external-identifiers/work-external-identifier', []):
         if extid.get('work-external-identifier-type') == 'DOI':
             doi = to_doi(jpath('work-external-identifier-id/value', extid))
             if doi:
                 # If a DOI is available, create the paper using metadata from CrossRef.
                 # We don't do it yet, we only store the DOI, so that we can fetch them
                 # by batch later.
                 dois.append(doi)
     return dois
Example 7
 def doi(self):
     """
     Returns the DOI of this publication, if any.
     """
     for external_id in jpath('external-ids/external-id', self.json, []):
         if (external_id.get('external-id-type') == 'doi'
                 and external_id.get('external-id-relationship') == 'SELF'
                 and external_id.get('external-id-value')):
             doi = to_doi(external_id.get('external-id-value'))
             if doi:
                 return doi
     return None
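The method above goes through the project's jpath() helper and to_doi(). A self-contained equivalent using plain dict access, run against a hypothetical ORCID v2-style work fragment (lowercasing stands in for the normalisation that to_doi() performs):

def doi_from_orcid_work(work_json):
    """Return the work's own DOI if the fragment declares one."""
    for external_id in work_json.get('external-ids', {}).get('external-id', []):
        if (external_id.get('external-id-type') == 'doi'
                and external_id.get('external-id-relationship') == 'SELF'
                and external_id.get('external-id-value')):
            return external_id['external-id-value'].lower()
    return None

work_json = {
    'external-ids': {
        'external-id': [{
            'external-id-type': 'doi',
            'external-id-relationship': 'SELF',
            'external-id-value': '10.1145/1721837.1721839',
        }],
    },
}
print(doi_from_orcid_work(work_json))  # 10.1145/1721837.1721839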
Example 8
def redirect_by_doi(request, doi):
    """
    This view is inherited from doai.io, migrated to this code base
    to preserve the existing behaviour. We could instead
    redirect to unpaywall, but that would not include ResearchGate urls.
    """
    doi = unquote(doi)
    doi = to_doi(doi)
    if not doi:
        raise Http404(_("Invalid DOI."))
    paper = Paper.get_by_doi(doi)
    if paper and paper.pdf_url:
        return HttpResponsePermanentRedirect(paper.pdf_url)
    return HttpResponsePermanentRedirect(doi_to_url(doi))
Example 9
    def create_oairecord(self, record):
        """
        Given one line of the dump (represented as a dict),
        add it to the corresponding paper (if it exists)
        """
        doi = to_doi(record['doi'])
        if not doi:
            return
        prefix = doi.split('/')[0]
        if prefix in free_doi_prefixes:
            return

        paper = Paper.get_by_doi(doi)
        if not paper:
            try:
                paper = Paper.create_by_doi(doi)
            except (MetadataSourceException, ValueError):
                return
            if not paper:
                print('no such paper for doi {doi}'.format(doi=doi))
                return

        url = record['url']

        # just to speed things up a bit...
        if paper.pdf_url == url:
            return

        identifier = 'oadoi:' + url
        source = self.oadoi_source

        if record['host_type'] == 'publisher':
            url = doi_to_url(doi)
            identifier = doi_to_crossref_identifier(doi)
            source = self.crossref_source

        record = BareOaiRecord(paper=paper,
                               doi=doi,
                               pubtype=paper.doctype,
                               source=source,
                               identifier=identifier,
                               splash_url=url,
                               pdf_url=record['url'])
        try:
            paper.add_oairecord(record)
            paper.update_availability()
            # TODO re-enable this
            #paper.update_index()
        except (DataError, ValueError):
            print('Record does not fit in the DB')
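The importers above also depend on doi_to_url() and doi_to_crossref_identifier() from the project. Hypothetical stand-ins so the snippets read in isolation; the identifier scheme in particular is a guess, not the project's actual format:

def doi_to_url(doi):
    """Resolver URL for a DOI."""
    return 'https://doi.org/' + doi

def doi_to_crossref_identifier(doi):
    """Identifier for the CrossRef record of a DOI (assumed scheme)."""
    return 'oai:crossref.org:' + doi

print(doi_to_url('10.1145/1721837.1721839'))
print(doi_to_crossref_identifier('10.1145/1721837.1721839'))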
Example 10
    def add_oai_record(self, header, metadata, paper):
        """
        Add a record (from OAI-PMH) to the given paper
        """
        identifier = header.identifier()

        # description in oai_dc means abstract
        curdesc = ""
        for desc in metadata['description']:
            if len(desc) > len(curdesc):
                curdesc = desc
        curdesc = sanitize_html(curdesc)

        # Run extractor to find the URLs
        splash_url, pdf_url = self.extract_urls(
            header, metadata, self.oaisource.identifier)

        keywords = ' | '.join(metadata['subject'])
        contributors = ' '.join(metadata['contributor'])[:4096]

        typenorms = ['typenorm:'+tn for tn in metadata.get('typenorm', [])]
        pubtype_list = metadata.get('type', []) + typenorms
        pubtype = None
        for raw_pubtype in pubtype_list:
            pubtype = OAI_PUBTYPE_TRANSLATIONS.get(raw_pubtype)
            if pubtype is not None:
                break

        if pubtype is None:
            pubtype = self.oaisource.default_pubtype

        # Find the DOI, if any
        doi = None
        for url in metadata['identifier']+metadata['relation']+metadata['source']:
            if not doi:
                doi = to_doi(url)

        record = BareOaiRecord(
                source=self.oaisource,
                identifier=identifier,
                description=curdesc,
                keywords=keywords,
                contributors=contributors,
                pubtype=pubtype,
                pdf_url=pdf_url,
                splash_url=splash_url,
                doi=doi)
        paper.add_oairecord(record)
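The DOI scan above keeps iterating after a DOI has been found and only works because the assignment is guarded by the 'if not doi' test. An early-exit variant of the same logic (the DOI normaliser is passed in so that the sketch stays standalone):

def first_doi(metadata, to_doi):
    """Return the first DOI found in the Dublin Core fields, or None."""
    for field in ('identifier', 'relation', 'source'):
        for value in metadata.get(field, []):
            doi = to_doi(value)
            if doi:
                return doi
    return None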
Example 11
    def add_oai_record(self, header, metadata, paper):
        """
        Add a record (from OAI-PMH) to the given paper
        """
        identifier = header.identifier()

        # description in oai_dc means abstract
        curdesc = ""
        for desc in metadata['description']:
            if len(desc) > len(curdesc):
                curdesc = desc
        curdesc = sanitize_html(curdesc)

        # Run extractor to find the URLs
        splash_url, pdf_url = self.extract_urls(header, metadata,
                                                self.oaisource.identifier)

        keywords = ' | '.join(metadata['subject'])
        contributors = ' '.join(metadata['contributor'])[:4096]

        typenorms = ['typenorm:' + tn for tn in metadata.get('typenorm', [])]
        pubtype_list = metadata.get('type', []) + typenorms
        pubtype = None
        for raw_pubtype in pubtype_list:
            pubtype = OAI_PUBTYPE_TRANSLATIONS.get(raw_pubtype)
            if pubtype is not None:
                break

        if pubtype is None:
            pubtype = self.oaisource.default_pubtype

        # Find the DOI, if any
        doi = None
        for url in metadata['identifier'] + metadata['relation'] + metadata[
                'source']:
            if not doi:
                doi = to_doi(url)

        record = BareOaiRecord(source=self.oaisource,
                               identifier=identifier,
                               description=curdesc,
                               keywords=keywords,
                               contributors=contributors,
                               pubtype=pubtype,
                               pdf_url=pdf_url,
                               splash_url=splash_url,
                               doi=doi)
        paper.add_oairecord(record)
Example 12
 def test_to_doi(self):
      self.assertEqual(to_doi('https://doi.org/10.1145/1721837.1721839'),
                       '10.1145/1721837.1721839')
      self.assertEqual(to_doi('https://doi.org/10.1145/1721837.1721839'),
                       '10.1145/1721837.1721839')
      self.assertEqual(to_doi('10.1145/1721837.1721839'),
                       '10.1145/1721837.1721839')
      self.assertEqual(to_doi('DOI: 10.1145/1721837.1721839'),
                       '10.1145/1721837.1721839')
      self.assertEqual(to_doi('info:eu-repo/semantics/altIdentifier/doi/10.1145/1721837.1721839'),
                       '10.1145/1721837.1721839')
      self.assertEqual(to_doi('10.1093/jhmas/XXXI.4.480'),
                       '10.1093/jhmas/xxxi.4.480')
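The test documents the behaviour expected of to_doi(): extract the DOI from resolver URLs, 'DOI:' prefixes and info:eu-repo identifiers, and lowercase it. An illustrative re-implementation written only to satisfy these cases; dissemin's real to_doi() is more careful than this sketch:

import re

doi_re = re.compile(r'(10\.\d{4,9}/\S+)')

def naive_to_doi(candidate):
    """Extract and lowercase a DOI-looking substring, or return None."""
    if not candidate:
        return None
    match = doi_re.search(candidate)
    if match is None:
        return None
    return match.group(1).lower()

assert naive_to_doi('https://doi.org/10.1145/1721837.1721839') == '10.1145/1721837.1721839'
assert naive_to_doi('DOI: 10.1145/1721837.1721839') == '10.1145/1721837.1721839'
assert naive_to_doi('info:eu-repo/semantics/altIdentifier/doi/10.1145/1721837.1721839') == '10.1145/1721837.1721839'
assert naive_to_doi('10.1093/jhmas/XXXI.4.480') == '10.1093/jhmas/xxxi.4.480'
assert naive_to_doi('not a doi') is None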
Example 13
 def get_object(self):
     queryset = self.get_queryset()
     pk = self.kwargs.get('pk', None)
     doi = self.kwargs.get('doi', None)
     if doi:
         doi = to_doi(doi)
     try:
         if pk is not None:
             paper = queryset.filter(pk=pk).get()
         elif doi is not None:
             publi = Publication.objects.get(doi=doi)
             paper = publi.paper
         else:
             raise AttributeError("Paper view expects a DOI or a pk")
     except ObjectDoesNotExist:
         paper = Paper.create_by_doi(doi)
         if paper is None:
             raise Http404(_("No %(verbose_name)s found matching the query") %
                     {'verbose_name': Publication._meta.verbose_name})
     return paper
Example 14
    def save_doi_metadata(self, metadata, extra_orcids=None):
        """
        Given the metadata as Citeproc+JSON or from CrossRef, create the associated paper and publication

        :param extra_orcids: an optional orcids list, which will be unified
            with the orcids extracted from the metadata. This is useful for the ORCID interface.
        :returns: the paper, created if needed
        """
        # Normalize metadata
        if metadata is None or not isinstance(metadata, dict):
            raise ValueError('Invalid metadata format, expecting a dict')
        if not metadata.get('author'):
            raise ValueError('No author provided')

        if not metadata.get('title'):
            raise ValueError('No title')

        # the upstream function ensures that there is a non-empty title
        if not to_doi(metadata.get('DOI')):
            raise ValueError("No DOI, skipping")

        pubdate = get_publication_date(metadata)

        if pubdate is None:
            raise ValueError('No pubdate')

        title = metadata['title']
        # CrossRef metadata stores titles in lists
        if isinstance(title, list):
            title = title[0]
        subtitle = metadata.get('subtitle')
        if subtitle:
            if isinstance(subtitle, list):
                subtitle = subtitle[0]
            title += ': '+subtitle

        name_pairs = list(map(convert_to_name_pair, metadata['author']))
        if None in name_pairs:
            raise ValueError('Invalid author')
        authors = [BareName.create_bare(first, last) for first, last in
                   name_pairs]

        def get_affiliation(author_elem):
            for dct in author_elem.get('affiliation', []):
                if 'name' in dct:
                    return dct['name']

        def get_orcid(author_elem):
            orcid = validate_orcid(author_elem.get('ORCID'))
            if orcid:
                return orcid

        new_orcids = list(map(get_orcid, metadata['author']))
        if extra_orcids:
            # remove the extra_orcids if they already exist on different authors
            set_of_extra_orcids = set(x for x in extra_orcids if x is not None)
            new_orcids = [(x if x not in set_of_extra_orcids else None)
                          for x in new_orcids]
            # now do the union
            orcids = [new or old for (old, new) in zip(
                extra_orcids, new_orcids)]
        else:
            orcids = new_orcids
        affiliations = list(map(get_affiliation, metadata['author']))

        paper = BarePaper.create(title, authors, pubdate,
                                 visible=True, affiliations=affiliations, orcids=orcids)

        result = create_publication(paper, metadata)

        if result is None:  # Creating the publication failed!
            # Make sure the paper only appears if it is still associated
            # with another source.
            paper.update_visible()
        else:
            paper = result[0]

        return paper
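A small worked example of the ORCID merging step above: extra_orcids comes from the caller (typically the ORCID interface), new_orcids from the CrossRef metadata, and an extra ORCID that already sits on a different author is dropped before taking the union (the identifiers below are just sample values):

extra_orcids = ['0000-0002-1825-0097', None, None]
new_orcids = [None, '0000-0002-1825-0097', '0000-0001-5109-3700']

set_of_extra_orcids = {x for x in extra_orcids if x is not None}
new_orcids = [x if x not in set_of_extra_orcids else None for x in new_orcids]
orcids = [new or old for (old, new) in zip(extra_orcids, new_orcids)]
print(orcids)  # ['0000-0002-1825-0097', None, '0000-0001-5109-3700']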
Example 15
def _create_publication(paper, metadata):
    if not metadata:
        return
    if not metadata.get('container-title'):
        return
    doi = to_doi(metadata.get('DOI', None))

    title = metadata['container-title']
    if isinstance(title, list):
        title = title[0]
    title = title[:512]

    issn = metadata.get('ISSN', None)
    if issn and isinstance(issn, list):
        issn = issn[0]  # TODO pass all the ISSN to the RoMEO interface
    volume = metadata.get('volume', None)
    pages = metadata.get('page', None)
    issue = metadata.get('issue', None)
    date_dict = metadata.get('issued', dict())
    pubdate = None
    if 'date-parts' in date_dict:
        dateparts = date_dict.get('date-parts')[0]
        pubdate = date_from_dateparts(dateparts)
    # for instance it outputs dates like 2014-2-3
    publisher_name = metadata.get('publisher', None)
    if publisher_name:
        publisher_name = publisher_name[:512]

    pubtype = metadata.get('type', 'unknown')
    pubtype = CROSSREF_PUBTYPE_ALIASES.get(pubtype, pubtype)
    splash_url = doi_to_url(doi)

    # PDF availability
    pdf_url = None
    licenses = set([(license or {}).get('URL')
                    for license in metadata.get('license', [])])
    doi_prefix = doi.split('/')[0]
    if doi_prefix in free_doi_prefixes or any(map(is_oa_license, licenses)):
        pdf_url = splash_url

    # Lookup journal
    journal = Journal.find(issn=issn, title=title)

    publisher = None
    if journal:
        publisher = journal.publisher
        AliasPublisher.increment(publisher_name, journal.publisher)
    else:
        publisher = Publisher.find(publisher_name)

    barepub = BareOaiRecord(
            paper=paper,
            journal_title=title,
            issue=issue,
            volume=volume,
            pubdate=pubdate,
            pages=pages,
            doi=doi,
            pubtype=pubtype,
            publisher_name=publisher_name,
            journal=journal,
            publisher=publisher,
            pdf_url=pdf_url,
            splash_url=splash_url,
            source=OaiSource.objects.get(identifier='crossref'),
            identifier=doi_to_crossref_identifier(doi))
    rec = paper.add_oairecord(barepub)
    paper.update_availability()
    return paper, rec
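The publication date above comes from CrossRef's 'date-parts', which may carry a year, a year and month, or a full date; as the comment notes, this can yield dates like 2014-2-3. A hypothetical stand-in for the date_from_dateparts() helper used here (the real implementation may differ):

from datetime import date

def date_from_dateparts(dateparts):
    """Build a date from CrossRef date-parts, defaulting the missing fields."""
    year = dateparts[0] if len(dateparts) > 0 else 1970
    month = dateparts[1] if len(dateparts) > 1 else 1
    day = dateparts[2] if len(dateparts) > 2 else 1
    return date(year, month, day)

print(date_from_dateparts([2014, 2, 3]))  # 2014-02-03
print(date_from_dateparts([2014]))        # 2014-01-01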
Example 16
    def process_records(self, listRecords):
        for record in listRecords:
            metadata = record[1]._map
            authors = get_oai_authors(metadata)

            # Filter the record
            if all(not elem.is_known for elem in authors):
                print "No relevant author, continue"
                continue
            if not 'title' in metadata or metadata['title'] == []:
                continue

            # Find the source
            sets = record[0].setSpec()
            source_identifier = None
            for s in sets:
                if s.startswith(PROXY_SOURCE_PREFIX):
                    source_identifier = s[len(PROXY_SOURCE_PREFIX):]
                    break
            source = None
            if source_identifier:
                try:
                    source = OaiSource.objects.get(
                        identifier=source_identifier)
                except OaiSource.DoesNotExist:
                    pass
            if not source:
                print "Invalid source '" + str(
                    source_identifier) + "' from the proxy, skipping"
                continue

            # Find the DOI, if any
            doi = None
            for identifier in metadata['identifier'] + metadata['relation']:
                if not doi:
                    doi = to_doi(identifier)

            # A publication date is necessary
            pubdate = find_earliest_oai_date(record)
            if not pubdate:
                print "No publication date, skipping"
                continue

            print 'Saving record %s' % record[0].identifier()
            paper = BarePaper.create(metadata['title'][0], authors, pubdate)

            if doi:
                try:
                    metadata = crossref.fetch_metadata_by_DOI(doi)
                    crossref.create_publication(paper, metadata)
                except MetadataSourceException as e:
                    print(
                        "Warning, metadata source exception while fetching DOI "
                        + doi + ":\n" + unicode(e))
                    pass

            if paper is None:
                print "Paper creation failed, skipping"
                continue

            # Save the record
            # TODO: we should check record validity *BEFORE* creating the paper
            try:
                add_oai_record(record, source, paper)
                yield paper
            except ValueError as e:
                print "Warning, OAI record " + record[0].identifier(
                ) + " skipped:\n" + unicode(e)
                paper.update_availability()
Example 17
    def process_records(self, listRecords):
        for record in listRecords:
            metadata = record[1]._map
            authors = get_oai_authors(metadata)

            # Filter the record
            if all(not elem.is_known for elem in authors):
                print "No relevant author, continue"
                continue
            if not 'title' in metadata or metadata['title'] == []:
                continue

            # Find the source
            sets = record[0].setSpec()
            source_identifier = None
            for s in sets:
                if s.startswith(PROXY_SOURCE_PREFIX):
                    source_identifier = s[len(PROXY_SOURCE_PREFIX):]
                    break
            source = None
            if source_identifier:
                try:
                    source = OaiSource.objects.get(identifier=source_identifier)
                except OaiSource.DoesNotExist:
                    pass
            if not source:
                print "Invalid source '"+str(source_identifier)+"' from the proxy, skipping"
                continue

            # Find the DOI, if any
            doi = None
            for identifier in metadata['identifier']+metadata['relation']:
                if not doi:
                    doi = to_doi(identifier)

            # A publication date is necessary
            pubdate = find_earliest_oai_date(record)
            if not pubdate:
                print "No publication date, skipping"
                continue

            print 'Saving record %s' % record[0].identifier()
            paper = BarePaper.create(metadata['title'][0], authors, pubdate)

            if doi:
                try:
                    metadata = crossref.fetch_metadata_by_DOI(doi)
                    crossref.create_publication(paper, metadata)
                except MetadataSourceException as e:
                    print("Warning, metadata source exception while fetching DOI "+doi+":\n"+unicode(e))
                    pass


            if paper is None:
                print "Paper creation failed, skipping"
                continue

            # Save the record
            # TODO: we should check record validity *BEFORE* creating the paper
            try:
                add_oai_record(record, source, paper)
                yield paper
            except ValueError as e:
                print "Warning, OAI record "+record[0].identifier()+" skipped:\n"+unicode(e)
                paper.update_availability()
Example 18
def _create_publication(paper, metadata):
    if not metadata:
        return
    if not 'container-title' in metadata or not metadata['container-title']:
        return
    doi = to_doi(metadata.get('DOI',None))

    title = metadata['container-title']
    if type(title) == type([]):
        title = title[0]
    title = title[:512]

    issn = metadata.get('ISSN',None)
    if issn and type(issn) == type([]):
        issn = issn[0] # TODO pass all the ISSN to the RoMEO interface
    volume = metadata.get('volume',None)
    pages = metadata.get('page',None)
    issue = metadata.get('issue',None)
    date_dict = metadata.get('issued',dict())
    pubdate = None
    if 'date-parts' in date_dict:
        dateparts = date_dict.get('date-parts')[0]
        pubdate = date_from_dateparts(dateparts)
    # for instance it outputs dates like 2014-2-3
    publisher_name = metadata.get('publisher', None)
    if publisher_name:
        publisher_name = publisher_name[:512]

    pubtype = metadata.get('type','unknown')
    pubtype = CROSSREF_PUBTYPE_ALIASES.get(pubtype, pubtype)

    # PDF availability
    pdf_url = None
    licenses = set([(license or {}).get('URL') for license in metadata.get('license', [])])
    if any(map(is_oa_license, licenses)):
        pdf_url = 'http://dx.doi.org/'+doi

    # Lookup journal
    search_terms = {'jtitle':title}
    if issn:
        search_terms['issn'] = issn
    journal = fetch_journal(search_terms)

    publisher = None
    if journal:
        publisher = journal.publisher
        AliasPublisher.increment(publisher_name, journal.publisher)
    else:
        publisher = fetch_publisher(publisher_name)

    barepub = BarePublication(title=title, issue=issue, volume=volume,
            pubdate=pubdate, paper=paper, pages=pages,
            doi=doi, pubtype=pubtype, publisher_name=publisher_name,
            journal=journal, publisher=publisher, pdf_url=pdf_url)
    pub = paper.add_publication(barepub)
    cur_pubdate = paper.pubdate
    if type(cur_pubdate) != type(pubdate):
        cur_pubdate = cur_pubdate.date()
    if pubdate and pubdate > cur_pubdate:
        paper.pubdate = pubdate
    paper.update_availability()
    return paper, pub
Example 19
    def save_doi_metadata(self, metadata, extra_orcids=None):
        """
        Given the metadata as Citeproc+JSON or from CrossRef, create the associated paper and publication

        :param extra_orcids: an optional orcids list, which will be unified
            with the orcids extracted from the metadata. This is useful for the ORCID interface.
        :returns: the paper, created if needed
        """
        # Normalize metadata
        if metadata is None or not isinstance(metadata, dict):
            raise ValueError('Invalid metadata format, expecting a dict')
        if not metadata.get('author'):
            raise ValueError('No author provided')

        if not metadata.get('title'):
            raise ValueError('No title')

        # the upstream function ensures that there is a non-empty title
        if not to_doi(metadata.get('DOI')):
            raise ValueError("No DOI, skipping")

        pubdate = get_publication_date(metadata)

        if pubdate is None:
            raise ValueError('No pubdate')

        title = metadata['title']
        # CrossRef metadata stores titles in lists
        if isinstance(title, list):
            title = title[0]
        subtitle = metadata.get('subtitle')
        if subtitle:
            if isinstance(subtitle, list):
                subtitle = subtitle[0]
            title += ': '+subtitle

        name_pairs = map(convert_to_name_pair, metadata['author'])
        if None in name_pairs:
            raise ValueError('Invalid author')
        authors = [BareName.create_bare(first, last) for first, last in
                   name_pairs]

        def get_affiliation(author_elem):
            for dct in author_elem.get('affiliation', []):
                if 'name' in dct:
                    return dct['name']

        def get_orcid(author_elem):
            orcid = validate_orcid(author_elem.get('ORCID'))
            if orcid:
                return orcid

        new_orcids = map(get_orcid, metadata['author'])
        if extra_orcids:
            orcids = [new or old for (old, new) in zip(
                extra_orcids, new_orcids)]
        else:
            orcids = new_orcids
        affiliations = map(get_affiliation, metadata['author'])

        paper = BarePaper.create(title, authors, pubdate,
                                 visible=True, affiliations=affiliations, orcids=orcids)

        result = create_publication(paper, metadata)

        if result is None:  # Creating the publication failed!
            # Make sure the paper only appears if it is still associated
            # with another source.
            paper.update_visible()
        else:
            paper = result[0]

        return paper
Example 20
    def create_oairecord(self, record, update_index=True, create_missing_dois=True):
        """
        Given one line of the dump (represented as a dict),
        add it to the corresponding paper (if it exists)
        """
        doi = to_doi(record['doi'])
        if not doi:
            return
        prefix = doi.split('/')[0]
        if prefix in free_doi_prefixes:
            return
        if not record.get('oa_locations'):
            return

        paper = Paper.get_by_doi(doi)
        if not paper:
            if not create_missing_dois:
                return
            try:
                paper = Paper.create_by_doi(doi)
            except (MetadataSourceException, ValueError):
                return
            if not paper:
                logger.info('no such paper for doi {doi}'.format(doi=doi))
                return
        logger.info(doi)
        paper.cache_oairecords()

        for oa_location in record.get('oa_locations') or []:
            url = oa_location['url']

            # just to speed things up a bit...
            if paper.pdf_url == url:
                return

            identifier = 'oadoi:' + url
            source = self.oadoi_source

            if oa_location['host_type'] == 'publisher':
                url = doi_to_url(doi)
                identifier = doi_to_crossref_identifier(doi)
                source = self.crossref_source

            record = BareOaiRecord(
                paper=paper,
                doi=doi,
                pubtype=paper.doctype,
                source=source,
                identifier=identifier,
                splash_url=url,
                pdf_url=oa_location['url'])
            try:
                # We disable checks by DOI since we know the paper has been looked up by DOI already.
                old_pdf_url = paper.pdf_url
                paper.add_oairecord(record, check_by_doi=False)
                super(Paper, paper).update_availability()
                if old_pdf_url != paper.pdf_url:
                    paper.save()
                    if update_index:
                        paper.update_index()
            except (DataError, ValueError):
                logger.warning('Record does not fit in the DB')
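For context, this is roughly what one parsed line of the Unpaywall (oaDOI) dump looks like, restricted to the fields the importer above actually reads; the values are made up:

record = {
    'doi': '10.1145/1721837.1721839',
    'oa_locations': [
        {'url': 'https://repository.example.org/1721839.pdf',
         'host_type': 'repository'},
        {'url': 'https://doi.org/10.1145/1721837.1721839',
         'host_type': 'publisher'},
    ],
}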
Example 21
def _create_publication(paper, metadata):
    if not metadata:
        return
    if not 'container-title' in metadata or not metadata['container-title']:
        return
    doi = to_doi(metadata.get('DOI', None))

    title = metadata['container-title']
    if type(title) == type([]):
        title = title[0]
    title = title[:512]

    issn = metadata.get('ISSN', None)
    if issn and type(issn) == type([]):
        issn = issn[0]  # TODO pass all the ISSN to the RoMEO interface
    volume = metadata.get('volume', None)
    pages = metadata.get('page', None)
    issue = metadata.get('issue', None)
    date_dict = metadata.get('issued', dict())
    pubdate = None
    if 'date-parts' in date_dict:
        dateparts = date_dict.get('date-parts')[0]
        pubdate = date_from_dateparts(dateparts)
    # for instance it outputs dates like 2014-2-3
    publisher_name = metadata.get('publisher', None)
    if publisher_name:
        publisher_name = publisher_name[:512]

    pubtype = metadata.get('type', 'unknown')
    pubtype = CROSSREF_PUBTYPE_ALIASES.get(pubtype, pubtype)

    # PDF availability
    pdf_url = None
    licenses = set([(license or {}).get('URL')
                    for license in metadata.get('license', [])])
    if any(map(is_oa_license, licenses)):
        pdf_url = 'http://dx.doi.org/' + doi

    # Lookup journal
    search_terms = {'jtitle': title}
    if issn:
        search_terms['issn'] = issn
    journal = fetch_journal(search_terms)

    publisher = None
    if journal:
        publisher = journal.publisher
        AliasPublisher.increment(publisher_name, journal.publisher)
    else:
        publisher = fetch_publisher(publisher_name)

    barepub = BarePublication(title=title,
                              issue=issue,
                              volume=volume,
                              pubdate=pubdate,
                              paper=paper,
                              pages=pages,
                              doi=doi,
                              pubtype=pubtype,
                              publisher_name=publisher_name,
                              journal=journal,
                              publisher=publisher,
                              pdf_url=pdf_url)
    pub = paper.add_publication(barepub)
    cur_pubdate = paper.pubdate
    if type(cur_pubdate) != type(pubdate):
        cur_pubdate = cur_pubdate.date()
    if pubdate and pubdate > cur_pubdate:
        paper.pubdate = pubdate
    paper.update_availability()
    return paper, pub
Example 22
    def fetch_orcid_records(self, id, profile=None, use_doi=True):
        """
        Queries ORCiD to retrieve the publications associated with a given ORCiD.
        It also fetches such papers from the CrossRef search interface.

        :param profile: The ORCID profile if it has already been fetched before (format: parsed JSON).
        :param use_doi: Fetch the publications by DOI when we find one (recommended, but slow)
        :returns: a generator, where all the papers found are yielded. (some of them could be in
                free form, hence not imported)
        """
        crps = CrossRefPaperSource(self.ccf)

        # Cleanup iD:
        id = validate_orcid(id)
        if id is None:
            raise MetadataSourceException('Invalid ORCiD identifier')

        # Get ORCiD profile
        try:
            if profile is None:
                profile = OrcidProfile(id=id)
            else:
                profile = OrcidProfile(json=profile)
        except MetadataSourceException as e:
            print e
            return

        # Reference name
        ref_name = profile.name
        # curl -H "Accept: application/orcid+json" 'http://pub.orcid.org/v1.2/0000-0002-8612-8827/orcid-works' -L -i
        dois = [] # list of DOIs to fetch
        papers = [] # list of papers created
        records_found = 0 # how many records did we successfully import from the profile?

        # Fetch publications
        pubs = jpath('orcid-profile/orcid-activities/orcid-works/orcid-work', profile, [])
        for pub in pubs:
            def j(path, default=None):
                return jpath(path, pub, default)

            # DOI
            doi = None
            for extid in j('work-external-identifiers/work-external-identifier', []):
                if extid.get('work-external-identifier-type') == 'DOI':
                    doi = to_doi(jpath('work-external-identifier-id/value', extid))
                    if doi:
                        # If a DOI is available, create the paper using metadata from CrossRef.
                        # We don't do it yet, we only store the DOI, so that we can fetch them
                        # by batch later.
                        dois.append(doi)

            if doi and use_doi:
                continue

            # Extract information from ORCiD

            # Title
            title = j('work-title/title/value')
            if title is None:
                print "Warning: Skipping ORCID publication: no title"
            
            # Type
            doctype = orcid_to_doctype(j('work-type', 'other'))

            # Contributors (ignored for now as they are very often not present)
            def get_contrib(js):
                return {
                     'orcid':jpath('contributor-orcid', js),
                     'name': jpath('credit-name/value', js),
                    }
            contributors = map(get_contrib, j('work-contributors/contributor',[]))

            author_names = filter(lambda x: x is not None, map(
                                  lambda x: x['name'], contributors))
            authors = map(parse_comma_name, author_names)
            pubdate = None
            # ORCiD internal id
            identifier = j('put-code')
            affiliations = map(lambda x: x['orcid'], contributors)
            # Pubdate
            year = parse_int(j('publication-date/year/value'), 1970)
            month = parse_int(j('publication-date/month/value'), 01)
            day = parse_int(j('publication-date/day/value'), 01)
            pubdate = None
            try:
                pubdate = date(year=year, month=01, day=01)
                pubdate = date(year=year, month=month, day=01)
                pubdate = date(year=year, month=month, day=day)
            except ValueError:
                if pubdate is None:
                    print "Invalid publication date in ORCID publication, skipping"
                    continue

            # Citation type: metadata format
            citation_format = j('work-citation/work-citation-type')
            print citation_format
            bibtex = j('work-citation/citation')

            if bibtex is not None:
                try:
                    entry = parse_bibtex(bibtex)

                    if entry.get('author', []) == []:
                        print "Warning: Skipping ORCID publication: no authors."
                        print j('work-citation/citation')
                    if not authors:
                        authors = entry['author']
                except ValueError:
                    pass

            affiliations = affiliate_author_with_orcid(ref_name, id, authors, initial_affiliations=affiliations)

            authors = map(name_lookup_cache.lookup, authors)

            if not authors:
                print "No authors found, skipping"
                continue

            # Create paper:
            paper = BarePaper.create(title, authors, pubdate, 'VISIBLE', affiliations)

            record = BareOaiRecord(
                    source=orcid_oai_source,
                    identifier=identifier,
                    splash_url='http://orcid.org/'+id,
                    pubtype=doctype)

            paper.add_oairecord(record)
            yield paper

        if use_doi:
            for metadata in crps.search_for_dois_incrementally('', {'orcid':id}):
                try:
                    paper = crps.save_doi_metadata(metadata)
                    if paper:
                        yield paper
                except ValueError as e:
                    print "Saving CrossRef record from ORCID failed: %s" % unicode(e)

            # Now we add the DOIs found in the ORCID profile.
            doi_metadata = fetch_dois(dois)
            for metadata in doi_metadata:
                try:
                    authors = map(convert_to_name_pair, metadata['author'])
                    affiliations = affiliate_author_with_orcid(ref_name, id, authors)
                    paper = crps.save_doi_metadata(metadata, affiliations)
                    if not paper:
                        continue
                    record = BareOaiRecord(
                            source=orcid_oai_source,
                            identifier='orcid:'+id+':'+metadata['DOI'],
                            splash_url='http://orcid.org/'+id,
                            pubtype=paper.doctype)
                    paper.add_oairecord(record)
                    yield paper
                except (KeyError, ValueError, TypeError):
                    pass
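The three successive date() calls above implement a progressive-precision fallback: each call refines the previous one, and a ValueError leaves the last value that succeeded in place. A standalone sketch of the same idea:

from datetime import date

def build_pubdate(year, month=None, day=None):
    """Keep the most precise valid date the given parts allow."""
    pubdate = None
    try:
        pubdate = date(year=year, month=1, day=1)
        pubdate = date(year=year, month=month or 1, day=1)
        pubdate = date(year=year, month=month or 1, day=day or 1)
    except (TypeError, ValueError):
        pass
    return pubdate

print(build_pubdate(2014, 2, 30))  # 2014-02-01: the invalid day is dropped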
Example 23
def _create_publication(paper, metadata):
    if not metadata:
        return
    if not metadata.get('container-title'):
        return
    doi = to_doi(metadata.get('DOI', None))

    title = metadata['container-title']
    if isinstance(title, list):
        title = title[0]
    title = title[:512]

    issn = metadata.get('ISSN', None)
    if issn and isinstance(issn, list):
        issn = issn[0]  # TODO pass all the ISSN to the RoMEO interface
    volume = metadata.get('volume', None)
    pages = metadata.get('page', None)
    issue = metadata.get('issue', None)
    date_dict = metadata.get('issued', dict())
    pubdate = None
    if 'date-parts' in date_dict:
        dateparts = date_dict.get('date-parts')[0]
        pubdate = date_from_dateparts(dateparts)
    # for instance it outputs dates like 2014-2-3
    publisher_name = metadata.get('publisher', None)
    if publisher_name:
        publisher_name = publisher_name[:512]

    pubtype = metadata.get('type', 'unknown')
    pubtype = CROSSREF_PUBTYPE_ALIASES.get(pubtype, pubtype)
    splash_url = doi_to_url(doi)

    # PDF availability
    pdf_url = None
    licenses = set([(license or {}).get('URL')
                    for license in metadata.get('license', [])])
    doi_prefix = doi.split('/')[0]
    if doi_prefix in free_doi_prefixes or any(map(is_oa_license, licenses)):
        pdf_url = splash_url

    # Lookup journal
    search_terms = {'jtitle': title}
    if issn:
        search_terms['issn'] = issn
    journal = fetch_journal(search_terms)

    publisher = None
    if journal:
        publisher = journal.publisher
        AliasPublisher.increment(publisher_name, journal.publisher)
    else:
        publisher = fetch_publisher(publisher_name)

    barepub = BareOaiRecord(
            paper=paper,
            journal_title=title,
            issue=issue,
            volume=volume,
            pubdate=pubdate,
            pages=pages,
            doi=doi,
            pubtype=pubtype,
            publisher_name=publisher_name,
            journal=journal,
            publisher=publisher,
            pdf_url=pdf_url,
            splash_url=splash_url,
            source=OaiSource.objects.get(identifier='crossref'),
            identifier=doi_to_crossref_identifier(doi))
    rec = paper.add_oairecord(barepub)
    paper.update_availability()
    return paper, rec
Example 24
    def fetch_orcid_records(self, id, profile=None, use_doi=True):
        """
        Queries ORCiD to retrieve the publications associated with a given ORCiD.
        It also fetches such papers from the CrossRef search interface.

        :param profile: The ORCID profile if it has already been fetched before (format: parsed JSON).
        :param use_doi: Fetch the publications by DOI when we find one (recommended, but slow)
        :returns: a generator, where all the papers found are yielded. (some of them could be in
                free form, hence not imported)
        """
        crps = CrossRefPaperSource(self.ccf)

        # Cleanup iD:
        id = validate_orcid(id)
        if id is None:
            raise MetadataSourceException('Invalid ORCiD identifier')

        # Get ORCiD profile
        try:
            if profile is None:
                profile = OrcidProfile(id=id)
            else:
                profile = OrcidProfile(json=profile)
        except MetadataSourceException as e:
            print e
            return

        # Reference name
        ref_name = profile.name
        # curl -H "Accept: application/orcid+json" 'http://pub.orcid.org/v1.2/0000-0002-8612-8827/orcid-works' -L -i
        dois = []  # list of DOIs to fetch
        papers = []  # list of papers created
        records_found = 0  # how many records did we successfully import from the profile?

        # Fetch publications
        pubs = jpath('orcid-profile/orcid-activities/orcid-works/orcid-work',
                     profile, [])
        for pub in pubs:

            def j(path, default=None):
                return jpath(path, pub, default)

            # DOI
            doi = None
            for extid in j(
                    'work-external-identifiers/work-external-identifier', []):
                if extid.get('work-external-identifier-type') == 'DOI':
                    doi = to_doi(
                        jpath('work-external-identifier-id/value', extid))
                    if doi:
                        # If a DOI is available, create the paper using metadata from CrossRef.
                        # We don't do it yet, we only store the DOI, so that we can fetch them
                        # by batch later.
                        dois.append(doi)

            if doi and use_doi:
                continue

            # Extract information from ORCiD

            # Title
            title = j('work-title/title/value')
            if title is None:
                print "Warning: Skipping ORCID publication: no title"

            # Type
            doctype = orcid_to_doctype(j('work-type', 'other'))

            # Contributors (ignored for now as they are very often not present)
            def get_contrib(js):
                return {
                    'orcid': jpath('contributor-orcid', js),
                    'name': jpath('credit-name/value', js),
                }

            contributors = map(get_contrib,
                               j('work-contributors/contributor', []))

            author_names = filter(lambda x: x is not None,
                                  map(lambda x: x['name'], contributors))
            authors = map(parse_comma_name, author_names)
            pubdate = None
            # ORCiD internal id
            identifier = j('put-code')
            affiliations = map(lambda x: x['orcid'], contributors)
            # Pubdate
            year = parse_int(j('publication-date/year/value'), 1970)
            month = parse_int(j('publication-date/month/value'), 01)
            day = parse_int(j('publication-date/day/value'), 01)
            pubdate = None
            try:
                pubdate = date(year=year, month=01, day=01)
                pubdate = date(year=year, month=month, day=01)
                pubdate = date(year=year, month=month, day=day)
            except ValueError:
                if pubdate is None:
                    print "Invalid publication date in ORCID publication, skipping"
                    continue

            # Citation type: metadata format
            citation_format = j('work-citation/work-citation-type')
            print citation_format
            bibtex = j('work-citation/citation')

            if bibtex is not None:
                try:
                    entry = parse_bibtex(bibtex)

                    if entry.get('author', []) == []:
                        print "Warning: Skipping ORCID publication: no authors."
                        print j('work-citation/citation')
                    if not authors:
                        authors = entry['author']
                except ValueError:
                    pass

            affiliations = affiliate_author_with_orcid(
                ref_name, id, authors, initial_affiliations=affiliations)

            authors = map(name_lookup_cache.lookup, authors)

            if not authors:
                print "No authors found, skipping"
                continue

            # Create paper:
            paper = BarePaper.create(title, authors, pubdate, 'VISIBLE',
                                     affiliations)

            record = BareOaiRecord(source=orcid_oai_source,
                                   identifier=identifier,
                                   splash_url='http://orcid.org/' + id,
                                   pubtype=doctype)

            paper.add_oairecord(record)
            yield paper

        if use_doi:
            for metadata in crps.search_for_dois_incrementally(
                    '', {'orcid': id}):
                try:
                    paper = crps.save_doi_metadata(metadata)
                    if paper:
                        yield paper
                except ValueError as e:
                    print "Saving CrossRef record from ORCID failed: %s" % unicode(
                        e)

            # Now we add the DOIs found in the ORCID profile.
            doi_metadata = fetch_dois(dois)
            for metadata in doi_metadata:
                try:
                    authors = map(convert_to_name_pair, metadata['author'])
                    affiliations = affiliate_author_with_orcid(
                        ref_name, id, authors)
                    paper = crps.save_doi_metadata(metadata, affiliations)
                    if not paper:
                        continue
                    record = BareOaiRecord(source=orcid_oai_source,
                                           identifier='orcid:' + id + ':' +
                                           metadata['DOI'],
                                           splash_url='http://orcid.org/' + id,
                                           pubtype=paper.doctype)
                    paper.add_oairecord(record)
                    yield paper
                except (KeyError, ValueError, TypeError):
                    pass
Example 25
    def save_doi_metadata(self,
                          metadata,
                          extra_affiliations=None,
                          allow_unknown_authors=False):
        """
        Given the metadata as Citeproc+JSON or from CrossRef, create the associated paper and publication

        :param extra_affiliations: an optional affiliations list, which will be unified
            with the affiliations extracted from the metadata. This is useful for the ORCID interface.
        :param allow_unknown_authors: create the paper even if no author matches our researchers
        :returns: the paper, created if needed
        """
        # Normalize metadata
        if metadata is None or type(metadata) != dict:
            if metadata is not None:
                print "WARNING: Invalid metadata: type is " + str(
                    type(metadata))
                print "The doi proxy is doing something nasty!"
            raise ValueError('Invalid metadata format, expecting a dict')
        if not 'author' in metadata:
            raise ValueError('No author provided')

        if not 'title' in metadata or not metadata['title']:
            raise ValueError('No title')

        # the upstream function ensures that there is a non-empty title
        if not 'DOI' in metadata or not metadata['DOI']:
            raise ValueError("No DOI, skipping")
        doi = to_doi(metadata['DOI'])

        pubdate = get_publication_date(metadata)

        if pubdate is None:
            raise ValueError('No pubdate')

        title = metadata['title']
        # CrossRef metadata stores titles in lists
        if type(title) == list:
            title = title[0]
        subtitle = metadata.get('subtitle')
        if subtitle:
            if type(subtitle) == list:
                subtitle = subtitle[0]
            title += ': ' + subtitle
        authors = map(name_lookup_cache.lookup,
                      map(convert_to_name_pair, metadata['author']))
        authors = filter(lambda x: x != None, authors)
        if (not allow_unknown_authors
                and all(not elem.is_known
                        for elem in authors)) or authors == []:
            raise ValueError('No known author')

        def get_affiliation(author_elem):
            # First, look for an ORCID id
            orcid = validate_orcid(author_elem.get('ORCID'))
            if orcid:
                return orcid
            # Otherwise return the plain affiliation, if any
            for dct in author_elem.get('affiliation', []):
                if 'name' in dct:
                    return dct['name']

        affiliations = map(get_affiliation, metadata['author'])
        if extra_affiliations and len(affiliations) == len(extra_affiliations):
            for i in range(len(affiliations)):
                if affiliation_is_greater(extra_affiliations[i],
                                          affiliations[i]):
                    affiliations[i] = extra_affiliations[i]

        paper = BarePaper.create(title, authors, pubdate, 'VISIBLE',
                                 affiliations)

        result = create_publication(paper, metadata)

        if result is None:  # Creating the publication failed!
            paper.update_visibility()
            # Make sure the paper only appears if it is still associated
            # with another source.
            # TODO add unit test for this
        else:
            paper = result[0]

        return paper
Example 26
    def save_doi_metadata(self, metadata, extra_affiliations=None, allow_unknown_authors=False):
        """
        Given the metadata as Citeproc+JSON or from CrossRef, create the associated paper and publication

        :param extra_affiliations: an optional affiliations list, which will be unified
            with the affiliations extracted from the metadata. This is useful for the ORCID interface.
        :param allow_unknown_authors: create the paper even if no author matches our researchers
        :returns: the paper, created if needed
        """        
        # Normalize metadata
        if metadata is None or type(metadata) != dict:
            if metadata is not None:
                print "WARNING: Invalid metadata: type is "+str(type(metadata))
                print "The doi proxy is doing something nasty!"
            raise ValueError('Invalid metadata format, expecting a dict')
        if not 'author' in metadata:
            raise ValueError('No author provided')

        if not 'title' in metadata or not metadata['title']:
            raise ValueError('No title')

        # the upstream function ensures that there is a non-empty title
        if not 'DOI' in metadata or not metadata['DOI']:
            raise ValueError("No DOI, skipping")
        doi = to_doi(metadata['DOI'])

        pubdate = get_publication_date(metadata)

        if pubdate is None:
            raise ValueError('No pubdate')
        
        title = metadata['title']
        # CrossRef metadata stores titles in lists
        if type(title) == list:
            title = title[0]
        subtitle = metadata.get('subtitle')
        if subtitle:
            if type(subtitle) == list:
                subtitle = subtitle[0]
            title += ': '+subtitle
        authors = map(name_lookup_cache.lookup, map(convert_to_name_pair, metadata['author']))
        authors = filter(lambda x: x != None, authors)
        if (not allow_unknown_authors and all(not elem.is_known for elem in authors)) or authors == []:
            raise ValueError('No known author')

        def get_affiliation(author_elem):
            # First, look for an ORCID id
            orcid = validate_orcid(author_elem.get('ORCID'))
            if orcid:
                return orcid
            # Otherwise return the plain affiliation, if any
            for dct in author_elem.get('affiliation', []):
                if 'name' in dct:
                    return dct['name']

        affiliations = map(get_affiliation, metadata['author'])
        if extra_affiliations and len(affiliations) == len(extra_affiliations):
            for i in range(len(affiliations)):
                if affiliation_is_greater(extra_affiliations[i],affiliations[i]):
                    affiliations[i] = extra_affiliations[i]

        paper = BarePaper.create(title, authors, pubdate, 
                'VISIBLE', affiliations)

        result = create_publication(paper, metadata)

        if result is None: # Creating the publication failed!
            paper.update_visibility()
            # Make sure the paper only appears if it is still associated
            # with another source.
            # TODO add unit test for this
        else:
            paper = result[0]

        return paper