Пример #1
0
 def get_arxiv_id(self, xml):
     """Return the formatted arXiv identifier found in the custom metadata.

     Every ``custom-meta`` element of ``xml`` is inspected. For each one
     whose ``meta-name`` equals ``arxiv-id``, the ``meta-value`` is UTF-8
     encoded and passed through ``format_arxiv_id``. If several elements
     match, the last one wins; if none matches, ``None`` is returned.
     """
     arxiv_id = None
     for node in xml.getElementsByTagName("custom-meta"):
         if get_value_in_tag(node, "meta-name") != "arxiv-id":
             continue
         raw_value = get_value_in_tag(node, "meta-value")
         arxiv_id = format_arxiv_id(raw_value.encode('utf-8'))
     return arxiv_id
Пример #2
0
 def get_references(self, xml_doc):
     """Yield the bibliographic references contained in ``xml_doc``.

     Walks every ``ce:bib-reference`` element. When ``self.CONSYN`` is
     truthy, each nested ``sb:reference`` (or the ``ce:bib-reference``
     itself when it has none) is delegated to ``self._get_ref`` and its
     result is yielded. Otherwise a 10-tuple ``(label, authors, doi,
     issue, page, title, volume, year, textref, ext_link)`` is built
     inline and yielded.
     """
     for bib_ref in xml_doc.getElementsByTagName("ce:bib-reference"):
         label = get_value_in_tag(bib_ref, "ce:label")

         if self.CONSYN:
             nested_refs = bib_ref.getElementsByTagName("sb:reference")
             if not nested_refs:
                 yield self._get_ref(bib_ref, label)
             for nested in nested_refs:
                 yield self._get_ref(nested, label)
             continue

         # Authors are rendered as "surname, given-name" when a given
         # name is present, otherwise as the bare surname.
         authors = []
         for author_node in bib_ref.getElementsByTagName("sb:author"):
             first = get_value_in_tag(author_node, "ce:given-name")
             last = get_value_in_tag(author_node, "ce:surname")
             authors.append(("%s, %s" % (last, first)) if first else last)

         doi = get_value_in_tag(bib_ref, "ce:doi")
         issue = get_value_in_tag(bib_ref, "sb:issue")
         page = get_value_in_tag(bib_ref, "sb:first-page")
         title = get_value_in_tag(bib_ref, "sb:maintitle")
         volume = get_value_in_tag(bib_ref, "sb:volume-nr")

         # The year is the first four characters of the date of the
         # first sb:issue element, or '' when there is no such element.
         issue_nodes = bib_ref.getElementsByTagName('sb:issue')
         year = (get_value_in_tag(issue_nodes[0], "sb:date")[:4]
                 if issue_nodes else '')

         # When no ce:textref exists, textref stays the (empty) result of
         # getElementsByTagName rather than becoming a string.
         textref = bib_ref.getElementsByTagName("ce:textref")
         if textref:
             textref = xml_to_text(textref[0])

         ext_link = format_arxiv_id(self.get_ref_link(bib_ref, 'arxiv'))
         yield (label, authors, doi, issue, page, title, volume,
                year, textref, ext_link)
Пример #3
0
 def get_references(self, xml):
     """Parse every ``ref`` element of ``xml`` into ``self.references``.

     Each reference is stored as the tuple ``(label, authors, doi, issue,
     page, page_last, title, volume, year, ext_link, plain_text)``.
     ``plain_text`` is only filled (from ``mixed-citation``) when the
     reference's ``publication-type`` is not ``journal``; otherwise it is
     ``None``. Nothing is returned; the list replaces ``self.references``.
     """
     references = []
     for reference in xml.getElementsByTagName("ref"):
         plain_text = None

         # A <ref> may lack a <citation> child (the plain text below is
         # read from <mixed-citation>), so indexing [0] blindly would
         # raise IndexError. Fall back to <mixed-citation>, and to an
         # empty type when neither element exists.
         # The attribute is compared as text; encoding it to bytes would
         # make the comparison below always true on Python 3.
         ref_type = ''
         for citation_tag in ('citation', 'mixed-citation'):
             citations = reference.getElementsByTagName(citation_tag)
             if citations:
                 ref_type = citations[0].getAttribute('publication-type')
                 break

         label = get_value_in_tag(reference, "label").strip('.')

         authors = []
         for author in reference.getElementsByTagName("name"):
             given_name = get_value_in_tag(author, "given-names")
             surname = get_value_in_tag(author, "surname")
             if given_name:
                 name = "%s, %s" % (surname, given_name)
             else:
                 name = surname
             # Nothing usable in the structured parts: use string-name.
             if not name.strip():
                 name = get_value_in_tag(author, "string-name")
             authors.append(name)

         # The last pub-id of type "doi" wins.
         doi = ""
         for tag in reference.getElementsByTagName("pub-id"):
             if tag.getAttribute("pub-id-type") == "doi":
                 doi = xml_to_text(tag)

         issue = get_value_in_tag(reference, "issue")
         page = get_value_in_tag(reference, "fpage")
         page_last = get_value_in_tag(reference, "lpage")
         title = get_value_in_tag(reference, "source")
         volume = get_value_in_tag(reference, "volume")
         year = get_value_in_tag(reference, "year")
         ext_link = format_arxiv_id(super(NLMParser, self).get_ref_link(reference, "arxiv"))
         if ref_type != 'journal':
             plain_text = get_value_in_tag(reference, "mixed-citation")
         references.append((label, authors, doi, issue, page, page_last,
                            title, volume, year, ext_link, plain_text))
     self.references = references
Пример #4
0
 def test_format_arxiv_id(self):
     """Test arXiv formatting."""
     # (input, expected output) pairs, checked in order.
     expectations = (
         ("arXiv:1312.1300", "arXiv:1312.1300"),
         ("1312.1300", "arXiv:1312.1300"),
         ("1312.13005", "arXiv:1312.13005"),
         ("arxiv:hep/1312002", "hep/1312002"),
         ("hep/1312002", "hep/1312002"),
         ("arXiv:1234.12345", "arXiv:1234.12345"),
     )
     for raw_id, formatted in expectations:
         self.assertEqual(format_arxiv_id(raw_id), formatted)
Пример #5
0
 def get_references(self, xml):
     """Parse every ``ref`` element of ``xml`` into ``self.references``.

     Each reference is stored as the tuple ``(label, authors, doi, issue,
     page, page_last, title, volume, year, ext_link, plain_text)``.
     The publication type is read from the first ``mixed-citation``
     element, falling back to ``citation``. ``plain_text`` is only filled
     when that type is not ``journal``; otherwise it is ``None``.
     Nothing is returned; the list replaces ``self.references``.
     """
     references = []
     for reference in xml.getElementsByTagName("ref"):
         plain_text = None

         # Explicit lookup instead of a bare ``except:`` around ``[0]``:
         # prefer <mixed-citation>, fall back to <citation>, and use an
         # empty type when neither exists (previously an IndexError).
         # The attribute is compared as text; encoding it to bytes would
         # make the comparison below always true on Python 3.
         ref_type = ''
         for citation_tag in ('mixed-citation', 'citation'):
             citations = reference.getElementsByTagName(citation_tag)
             if citations:
                 ref_type = citations[0].getAttribute('publication-type')
                 break

         label = get_value_in_tag(reference, "label").strip('.')

         authors = []
         for author in reference.getElementsByTagName("name"):
             given_name = get_value_in_tag(author, "given-names")
             surname = get_value_in_tag(author, "surname")
             if given_name:
                 name = "%s, %s" % (surname, given_name)
             else:
                 name = surname
             # Nothing usable in the structured parts: use string-name.
             if not name.strip():
                 name = get_value_in_tag(author, "string-name")
             authors.append(name)

         # The last pub-id of type "doi" wins.
         doi = ""
         for tag in reference.getElementsByTagName("pub-id"):
             if tag.getAttribute("pub-id-type") == "doi":
                 doi = xml_to_text(tag)

         issue = get_value_in_tag(reference, "issue")
         page = get_value_in_tag(reference, "fpage")
         page_last = get_value_in_tag(reference, "lpage")
         title = get_value_in_tag(reference, "source")
         volume = get_value_in_tag(reference, "volume")
         year = get_value_in_tag(reference, "year")
         ext_link = format_arxiv_id(self.get_ref_link(reference, "arxiv"))

         if ref_type != 'journal':
             # NOTE(review): which exceptions get_value_in_tag raises is
             # not visible here, so the fallback stays broad -- but no
             # longer swallows KeyboardInterrupt/SystemExit as the bare
             # ``except:`` did.
             try:
                 plain_text = get_value_in_tag(reference,
                                               "mixed-citation",
                                               tag_to_remove=self.tag_to_remove)
             except Exception:
                 plain_text = get_value_in_tag(reference,
                                               "citation",
                                               tag_to_remove=self.tag_to_remove)
         references.append((label, authors, doi,
                            issue, page, page_last,
                            title, volume, year,
                            ext_link, plain_text))
     self.references = references
Пример #6
0
 def test_format_arxiv_id(self):
     # Each case is (positional arguments for format_arxiv_id, expected
     # result); the third case passes True as the second argument.
     cases = [
         (("arXiv:1312.1300",), "arXiv:1312.1300"),
         (("1312.1300",), "arXiv:1312.1300"),
         (("arxiv:hep/1312/1300", True), "hep/1312/1300"),
         (("arxiv:hep/1312/1300",), "arxiv:hep/1312/1300"),
     ]
     for call_args, expected in cases:
         self.assertEqual(format_arxiv_id(*call_args), expected)
Пример #7
0
 def _get_reference(self, ref):
     """Retrieve the data for a reference.

     Generator: for every ``mixed-citation`` element inside ``ref`` it
     yields the tuple ``(ref_type, doi, authors, collaboration, journal,
     volume, page, year, label, arxiv, publisher, institution,
     unstructured_text, external_link, report_no, editors)``.

     ``label`` is the reference's ``label`` text reduced to its digits.
     ``unstructured_text`` is a space-joined string of the free-text
     fragments of the citation, or an empty list when there are none.
     """
     label = get_value_in_tag(ref, 'label')
     # Raw string: '\D' in a plain literal is an invalid escape sequence.
     label = re.sub(r'\D', '', label)
     # Free-text fragments that are dropped when they stand alone.
     ignore_text = ('in', 'pp', 'edited by')

     for innerref in ref.getElementsByTagName('mixed-citation'):
         ref_type = innerref.getAttribute('publication-type')
         institution = get_value_in_tag(innerref, 'institution')

         # One pass over the pub-id elements; for each identifier type
         # the last matching element wins.
         report_no = ''
         doi = ''
         arxiv = ''
         for tag in innerref.getElementsByTagName('pub-id'):
             id_type = tag.getAttribute('pub-id-type')
             if id_type == 'doi':
                 doi = xml_to_text(tag)
             elif id_type == 'other':
                 if tag.hasChildNodes():
                     report_no = get_all_text(tag)
             elif id_type == 'arxiv':
                 if tag.hasChildNodes():
                     arxiv = get_all_text(tag)
         arxiv = format_arxiv_id(arxiv)

         collaboration = get_value_in_tag(innerref, 'collab')

         # person-group elements are split by their person-group-type
         # into authors and editors; other types are ignored.
         authors = []
         editors = []
         for group in innerref.getElementsByTagName('person-group'):
             group_type = group.getAttribute('person-group-type')
             if group_type == 'author':
                 people = authors
             elif group_type == 'editor':
                 people = editors
             else:
                 continue
             for person in group.getElementsByTagName('string-name'):
                 if person.hasChildNodes():
                     people.append(get_all_text(person))

         journal = get_value_in_tag(innerref, 'source')
         journal, volume = fix_journal_name(journal, self.journal_mappings)
         volume += get_value_in_tag(innerref, 'volume')
         if journal == 'J.High Energy Phys.' or journal == 'JHEP':
             # JHEP volume: drop the first two characters of the volume
             # and append the issue.
             # NOTE(review): presumably turns a 4-digit year into
             # 2 digits -- confirm.
             issue = get_value_in_tag(innerref, 'issue')
             volume = volume[2:] + issue
             journal = 'JHEP'
         page = get_value_in_tag(innerref, 'page-range')
         year = get_value_in_tag(innerref, 'year')
         external_link = get_value_in_tag(innerref, 'ext-link')

         publisher = get_value_in_tag(innerref, 'publisher-name')
         publisher_location = get_value_in_tag(innerref, 'publisher-loc')
         if publisher_location:
             publisher = publisher_location + ': ' + publisher

         # Collect the bare text nodes sitting between the structured
         # child elements of the citation.
         unstructured_text = []
         for child in innerref.childNodes:
             if child.nodeType == child.TEXT_NODE:
                 text = child.nodeValue.strip()
                 # Strip brackets, parentheses, dots and semicolons.
                 text = re.sub(r'[\[\]\(\.;\)]', '', text).strip()
                 if text.startswith(','):
                     text = text[1:].strip()
                 if text.endswith('Report No'):
                     # The institution is folded into the text and
                     # cleared so it is not reported twice.
                     text = institution + " " + text
                     institution = ''
                     text = text.strip()
                 elif text.endswith(' ed'):
                     text += '.'
                 elif text.endswith('PhD thesis,'):
                     if institution:
                         text += ' ' + institution
                         institution = ''
                     else:
                         # No institution follows: drop the comma.
                         text = text[:-1]
                 elif text.startswith('Seminar,'):
                     article_title = get_value_in_tag(innerref, 'article-title')
                     text = institution + " Seminar, \"" + article_title + "\""
                     institution = ''
                 elif text == u'\u201d':
                     # A lone closing quotation mark carries no content.
                     text = ''
                 if text.startswith('Vol'):
                     # "Vol ..." fragments contribute their digits to the
                     # volume instead of the unstructured text.
                     temp = re.sub(r'\D', '', text)
                     if temp:
                         volume += temp
                 elif len(text) > 1 and text not in ignore_text\
                         and not (text.isdigit() or text[:-1].isdigit()):
                     unstructured_text.append(text)
         if unstructured_text:
             unstructured_text = " ".join(unstructured_text)
         if ref_type == 'book':
             if volume and not volume.lower().startswith('vol'):
                 volume = 'Vol ' + volume
             if volume and page:
                 volume = volume + ', pp ' + page
         yield ref_type, doi, authors, collaboration, journal, volume, page, year,\
             label, arxiv, publisher, institution, unstructured_text, external_link,\
             report_no, editors
Пример #8
0
 def _get_ref(self, ref, label):
     """Extract the fields of a single reference element.

     :param ref: DOM element describing one reference.
     :param label: label of the reference, passed through unchanged.
     :return: the tuple ``(label, authors, doi, issue, page, title,
         volume, year, textref, ext_link, isjournal, comment, journal,
         publisher, editors, book_title)``. ``textref`` and ``isjournal``
         are the empty result of ``getElementsByTagName`` when the
         corresponding element is absent. ``year`` is reduced to its
         digits.
     """
     doi = get_value_in_tag(ref, "ce:doi")
     page = get_value_in_tag(ref, "sb:first-page")
     issue = get_value_in_tag(ref, "sb:issue")
     volume = get_value_in_tag(ref, "sb:volume-nr")
     issue_nodes = ref.getElementsByTagName('sb:issue')
     if issue_nodes:
         year = get_value_in_tag(issue_nodes[0], "sb:date")
     else:
         year = ''
     textref = ref.getElementsByTagName("ce:textref")
     if textref:
         textref = xml_to_text(textref[0])
     ext_link = format_arxiv_id(self.get_ref_link(ref, 'arxiv'))

     # Authors are rendered as "surname, given-name" when a given name
     # is present, otherwise as the bare surname.
     authors = []
     for author in ref.getElementsByTagName("sb:author"):
         given_name = get_value_in_tag(author, "ce:given-name")
         surname = get_value_in_tag(author, "ce:surname")
         if given_name:
             name = "%s, %s" % (surname, given_name)
         else:
             name = surname
         authors.append(name)

     if ext_link and ext_link.lower().startswith('arxiv'):
         # Keep the prefix only when the identifier contains digits
         # separated by a dot (e.g. 1312.1300); otherwise drop the first
         # six characters. The pattern requires at least one digit on
         # each side: the former r'\d*\.\d*' matched any lone dot, so an
         # identifier such as "math.GT/0309136" wrongly kept its prefix.
         if not re.search(r'\d+\.\d+', ext_link):
             ext_link = ext_link[6:]
     comment = get_value_in_tag(ref, "sb:comment")

     # The title is the maintitle of the first sb:contribution, or ''
     # when either element is missing.
     try:
         contribution = ref.getElementsByTagName("sb:contribution")[0]
         title = xml_to_text(
             contribution.getElementsByTagName("sb:maintitle")[0])
     except (IndexError, TypeError):
         title = ''

     # A reference with an sb:issue element is treated as a journal one.
     isjournal = issue_nodes
     journal = ""
     if isjournal:
         isjournal = True
         if not page:
             page = comment
         journal = get_value_in_tag(issue_nodes[0], "sb:maintitle")

     edited_book = ref.getElementsByTagName("sb:edited-book")
     editors = []
     book_title = ""
     publisher = ""
     if edited_book:
         book_series = ref.getElementsByTagName("sb:book-series")
         # treat as a journal
         if book_series:
             journal = get_value_in_tag(book_series[0], "sb:maintitle")
             year = get_value_in_tag(ref, "sb:date")
             isjournal = True
         # conference
         elif ref.getElementsByTagName("sb:conference"):
             container = edited_book[0]
             maintitle = get_value_in_tag(container, "sb:maintitle")
             conference = get_value_in_tag(container, "sb:conference")
             date = get_value_in_tag(container, "sb:date")
             # use this variable in order to get in the 'm' field
             publisher = maintitle + ", " + conference + ", " + date
         else:
             container = edited_book[0]
             if ref.getElementsByTagName("sb:editors"):
                 for editor in ref.getElementsByTagName("sb:editor"):
                     surname = get_value_in_tag(editor, "ce:surname")
                     firstname = get_value_in_tag(editor, "ce:given-name")
                     editors.append("%s,%s" % (surname, firstname))
             # The book's maintitle becomes the title unless a
             # contribution title was already found.
             if title:
                 book_title = get_value_in_tag(container, "sb:maintitle")
             else:
                 title = get_value_in_tag(container, "sb:maintitle")
             year = get_value_in_tag(container, "sb:date")
             if ref.getElementsByTagName("sb:publisher"):
                 container = ref.getElementsByTagName("sb:publisher")[0]
                 location = get_value_in_tag(container, "sb:location")
                 publisher = get_value_in_tag(container, "sb:name")
                 if location:
                     publisher = location + ": " + publisher

     if ref.getElementsByTagName("sb:book"):
         if ref.getElementsByTagName("sb:book-series"):
             book_series = ref.getElementsByTagName("sb:book-series")[0]
             title += ", " + get_value_in_tag(book_series, "sb:maintitle")
             title += ", " + get_value_in_tag(book_series, "sb:volume-nr")
         publisher = get_value_in_tag(ref, "sb:publisher")

     if not year:
         year = get_value_in_tag(ref, "sb:date")
     year = re.sub(r'\D', '', year)
     return (label, authors, doi, issue, page, title, volume,
             year, textref, ext_link, isjournal, comment, journal,
             publisher, editors, book_title)
Пример #9
0
    def get_record(self, f_path, publisher=None, collection=None, logger=None):
        """Build the record for the article stored at ``f_path``.

        The article is loaded through the parent parser and its title,
        publication date, DOI, page count, arXiv id, authors, abstract,
        license, copyright, PACS keywords, publication information and
        references are added as fields of a new record. The main PDF and
        the archival PDF/A (``archival_pdfs/<name>-hires.pdf`` next to
        the XML) are attached when they exist; a missing one is reported
        through ``register_exception``. The XML itself is always attached.

        :param f_path: path of the article XML file.
        :param publisher: optional publisher name, stored with the
            abstract and in the 980 field.
        :param collection: optional collection name for the 980 field.
        :param logger: optional logger; when ``None`` nothing is logged.
        :return: the result of ``record_xml_output`` for the record.
        """
        xml = super(NLMParser, self).get_article(f_path)
        rec = create_record()
        title = super(NLMParser, self).get_title(xml)
        if title:
            record_add_field(rec, '245', subfields=[('a', title)])
        record_add_field(rec, '260', subfields=[('c', super(NLMParser, self).get_publication_date(xml, logger))])
        journal, issn, volume, issue, first_page, last_page, year, doi = super(NLMParser, self).get_publication_information(xml)
        journal = "PTEP"  # Let's override the journal information

        if logger:
            logger.info("Creating record: %s %s" % (join(f_path, pardir), doi))

        if doi:
            record_add_field(rec, '024', ind1='7', subfields=[('a', doi), ('2', 'DOI')])
        page_count = super(NLMParser, self).get_page_count(xml)
        if page_count:
            record_add_field(rec, '300', subfields=[('a', page_count)])
        arxiv = self.get_arxiv_id(xml)
        if arxiv:
            record_add_field(rec, '037', subfields=[('9', 'arXiv'), ('a', format_arxiv_id(arxiv))])

        # The first author goes to field 100, all others to 700.
        authors = super(NLMParser, self).get_authors(xml)
        first_author = True
        for author in authors:
            if author.get('surname'):
                subfields = [('a', '%s, %s' % (author.get('surname'), author.get('given_name') or author.get('initials', '')))]
            else:
                subfields = [('a', '%s' % (author.get('name', '')))]
            if 'orcid' in author:
                subfields.append(('j', author['orcid']))
            if 'affiliation' in author:
                for aff in author["affiliation"]:
                    subfields.append(('v', aff))

                if self.extract_nations:
                    add_nations_field(subfields)

            if author.get('email'):
                subfields.append(('m', author['email']))
            if first_author:
                record_add_field(rec, '100', subfields=subfields)
                first_author = False
            else:
                record_add_field(rec, '700', subfields=subfields)

        abstract = super(NLMParser, self).get_abstract(xml)
        if abstract:
            record_add_field(rec, '520', subfields=[('a', abstract), ('9', publisher)])
        record_add_field(rec, '540', subfields=[('a', 'CC-BY-3.0'), ('u', 'http://creativecommons.org/licenses/by/3.0/')])
        copyright = super(NLMParser, self).get_copyright(xml, logger)
        if copyright:
            record_add_field(rec, '542', subfields=[('f', copyright)])
        keywords = super(NLMParser, self).get_keywords(xml)
        if keywords['pacs']:
            for keyword in keywords['pacs']:
                record_add_field(rec, '084', ind1='1', subfields=[('a', keyword), ('9', 'PACS')])

        # Oxford is giving us bad keywords, so keywords['other'] is
        # deliberately not exported.
        if first_page or last_page:
            pages = '%s-%s' % (first_page, last_page)
        else:
            article_meta = xml.getElementsByTagName('article-meta')[0]
            pages = get_value_in_tag(article_meta, "elocation-id")

        # Keep only the publication subfields that carry a real value.
        # A list comprehension yields a list on every Python version,
        # whereas filter() returns a lazy iterator on Python 3.
        publication_info = [('p', journal),
                            ('v', volume),
                            ('n', issue),
                            ('c', pages),
                            ('y', year)]
        subfields = [field for field in publication_info
                     if field[1] and field[1] != '-']
        record_add_field(rec, '773', subfields=subfields)

        self.get_references(xml)
        for label, authors, doi, issue, page, page_last, title, volume, year, ext_link, plain_text in self.references:
            subfields = []
            if doi:
                subfields.append(('a', doi))
            for author in authors:
                subfields.append(('h', author))
            if issue:
                subfields.append(('n', issue))
            if label:
                subfields.append(('o', label))
            if year:
                subfields.append(('y', year))
            if ext_link:
                subfields.append(('r', ext_link))
            # should we be strict about it?
            if title and volume and year and page:
                subfields.append(('s', '%s %s (%s) %s' % (title, volume, year, page)))
            elif not plain_text:
                subfields.append(('m', ('%s %s %s %s' % (title, volume, year, page))))
            if plain_text:
                subfields.append(('m', plain_text))
            if subfields:
                record_add_field(rec, '999', ind1='C', ind2='5', subfields=subfields)

        f_path_pdf = f_path[:-(len('.xml'))] + '.pdf'
        f_path_pdfa = join(dirname(f_path), 'archival_pdfs', basename(f_path)[:-len('.xml')] + '-hires.pdf')
        if exists(f_path_pdf):
            record_add_field(rec, 'FFT', subfields=[('a', f_path_pdf), ('n', 'main')])
        else:
            # The exception is raised and caught on the spot so that
            # register_exception is called from inside an except block.
            try:
                raise MissingFFTError
            except MissingFFTError:
                register_exception(alert_admin=True, prefix="Oxford paper: %s is missing PDF." % (doi,))
                # logger is optional (defaults to None).
                if logger:
                    logger.warning("Record %s doesn't contain PDF file." % (doi,))
        if exists(f_path_pdfa):
            record_add_field(rec, 'FFT', subfields=[('a', f_path_pdfa), ('n', 'main'), ('f', '.pdf;pdfa')])
        else:
            try:
                raise MissingFFTError
            except MissingFFTError:
                register_exception(alert_admin=True, prefix="Oxford paper: %s is missing PDF/A." % (doi,))
                if logger:
                    logger.warning("Record %s doesn't contain PDF/A file." % (doi,))
        record_add_field(rec, 'FFT', subfields=[('a', f_path), ('n', 'main')])

        extra_subfields = []
        if collection:
            extra_subfields.append(('a', collection))
        if publisher:
            extra_subfields.append(('b', publisher))
        record_add_field(rec, '980', subfields=extra_subfields)
        return record_xml_output(rec)
Пример #10
0
 def _get_ref(self, ref, label):
     """Extract the fields of a single reference element.

     :param ref: DOM element describing one reference.
     :param label: label of the reference, passed through unchanged.
     :return: the tuple ``(label, authors, doi, issue, page, title,
         volume, year, textref, ext_link, isjournal, comment, journal,
         publisher, editors, book_title)``. ``page`` falls back to the
         ``sb:article-number`` when there is no ``sb:first-page``.
         ``textref`` and ``isjournal`` are the empty result of
         ``getElementsByTagName`` when the corresponding element is
         absent. ``year`` is reduced to its digits.
     """
     doi = get_value_in_tag(ref, "ce:doi")
     page = get_value_in_tag(ref, "sb:first-page")
     if not page:
         page = get_value_in_tag(ref, "sb:article-number")
     issue = get_value_in_tag(ref, "sb:issue")
     volume = get_value_in_tag(ref, "sb:volume-nr")
     issue_nodes = ref.getElementsByTagName('sb:issue')
     if issue_nodes:
         year = get_value_in_tag(issue_nodes[0], "sb:date")
     else:
         year = ''
     textref = ref.getElementsByTagName("ce:textref")
     if textref:
         textref = xml_to_text(textref[0])
     ext_link = format_arxiv_id(self.get_ref_link(ref, 'arxiv'))

     # Authors are rendered as "surname, given-name" when a given name
     # is present, otherwise as the bare surname.
     authors = []
     for author in ref.getElementsByTagName("sb:author"):
         given_name = get_value_in_tag(author, "ce:given-name")
         surname = get_value_in_tag(author, "ce:surname")
         if given_name:
             name = "%s, %s" % (surname, given_name)
         else:
             name = surname
         authors.append(name)

     if ext_link and ext_link.lower().startswith('arxiv'):
         # Keep the prefix only when the identifier contains digits
         # separated by a dot (e.g. 1312.1300); otherwise drop the first
         # six characters. The pattern requires at least one digit on
         # each side: the former r'\d*\.\d*' matched any lone dot, so an
         # identifier such as "math.GT/0309136" wrongly kept its prefix.
         if not re.search(r'\d+\.\d+', ext_link):
             ext_link = ext_link[6:]
     comment = get_value_in_tag(ref, "sb:comment")

     # The title is the maintitle of the first sb:contribution, or ''
     # when either element is missing.
     try:
         contribution = ref.getElementsByTagName("sb:contribution")[0]
         title = xml_to_text(
             contribution.getElementsByTagName("sb:maintitle")[0])
     except (IndexError, TypeError):
         title = ''

     # A reference with an sb:issue element is treated as a journal one.
     isjournal = issue_nodes
     journal = ""
     if isjournal:
         isjournal = True
         if not page:
             page = comment
         journal = get_value_in_tag(issue_nodes[0], "sb:maintitle")

     edited_book = ref.getElementsByTagName("sb:edited-book")
     editors = []
     book_title = ""
     publisher = ""
     if edited_book:
         book_series = ref.getElementsByTagName("sb:book-series")
         # treat as a journal
         if book_series:
             journal = get_value_in_tag(book_series[0], "sb:maintitle")
             year = get_value_in_tag(ref, "sb:date")
             isjournal = True
         # conference
         elif ref.getElementsByTagName("sb:conference"):
             container = edited_book[0]
             maintitle = get_value_in_tag(container, "sb:maintitle")
             conference = get_value_in_tag(container, "sb:conference")
             date = get_value_in_tag(container, "sb:date")
             # use this variable in order to get in the 'm' field
             publisher = maintitle + ", " + conference + ", " + date
         else:
             container = edited_book[0]
             if ref.getElementsByTagName("sb:editors"):
                 for editor in ref.getElementsByTagName("sb:editor"):
                     surname = get_value_in_tag(editor, "ce:surname")
                     firstname = get_value_in_tag(editor, "ce:given-name")
                     editors.append("%s,%s" % (surname, firstname))
             # The book's maintitle becomes the title unless a
             # contribution title was already found.
             if title:
                 book_title = get_value_in_tag(container, "sb:maintitle")
             else:
                 title = get_value_in_tag(container, "sb:maintitle")
             year = get_value_in_tag(container, "sb:date")
             if ref.getElementsByTagName("sb:publisher"):
                 container = ref.getElementsByTagName("sb:publisher")[0]
                 location = get_value_in_tag(container, "sb:location")
                 publisher = get_value_in_tag(container, "sb:name")
                 if location:
                     publisher = location + ": " + publisher

     if ref.getElementsByTagName("sb:book"):
         if ref.getElementsByTagName("sb:book-series"):
             book_series = ref.getElementsByTagName("sb:book-series")[0]
             title += ", " + get_value_in_tag(book_series, "sb:maintitle")
             title += ", " + get_value_in_tag(book_series, "sb:volume-nr")
         publisher = get_value_in_tag(ref, "sb:publisher")

     if not year:
         year = get_value_in_tag(ref, "sb:date")
     year = re.sub(r'\D', '', year)
     return (label, authors, doi, issue, page, title, volume, year, textref,
             ext_link, isjournal, comment, journal, publisher, editors,
             book_title)
Пример #11
0
    def get_record(self, f_path, publisher=None, collection=None, logger=None):
        xml = super(NLMParser, self).get_article(f_path)
        rec = create_record()
        title = super(NLMParser, self).get_title(xml)
        if title:
            record_add_field(rec, '245', subfields=[('a', title)])
        record_add_field(rec,
                         '260',
                         subfields=[
                             ('c',
                              super(NLMParser,
                                    self).get_publication_date(xml, logger))
                         ])
        journal, issn, volume, issue, first_page, last_page, year, doi = super(
            NLMParser, self).get_publication_information(xml)
        journal = "PTEP"  # Let's override the journal information

        if logger:
            logger.info("Creating record: %s %s" % (join(f_path, pardir), doi))

        if doi:
            record_add_field(rec,
                             '024',
                             ind1='7',
                             subfields=[('a', doi), ('2', 'DOI')])
        page_count = super(NLMParser, self).get_page_count(xml)
        if page_count:
            record_add_field(rec, '300', subfields=[('a', page_count)])
        arxiv = self.get_arxiv_id(xml)
        if arxiv:
            record_add_field(rec,
                             '037',
                             subfields=[('9', 'arXiv'),
                                        ('a', format_arxiv_id(arxiv))])
        authors = super(NLMParser, self).get_authors(xml)
        first_author = True
        for author in authors:
            if author.get('surname'):
                subfields = [('a', '%s, %s' %
                              (author.get('surname'), author.get('given_name')
                               or author.get('initials', '')))]
            else:
                subfields = [('a', '%s' % (author.get('name', '')))]
            if 'orcid' in author:
                subfields.append(('j', author['orcid']))
            if 'affiliation' in author:
                for aff in author["affiliation"]:
                    subfields.append(('v', aff))

                if self.extract_nations:
                    add_nations_field(subfields)

            if author.get('email'):
                subfields.append(('m', author['email']))
            if first_author:
                record_add_field(rec, '100', subfields=subfields)
                first_author = False
            else:
                record_add_field(rec, '700', subfields=subfields)

        abstract = super(NLMParser, self).get_abstract(xml)
        if abstract:
            record_add_field(rec,
                             '520',
                             subfields=[('a', abstract), ('9', publisher)])
        record_add_field(rec,
                         '540',
                         subfields=[
                             ('a', 'CC-BY-3.0'),
                             ('u',
                              'http://creativecommons.org/licenses/by/3.0/')
                         ])
        copyright = super(NLMParser, self).get_copyright(xml, logger)
        if copyright:
            record_add_field(rec, '542', subfields=[('f', copyright)])
        keywords = super(NLMParser, self).get_keywords(xml)
        if keywords['pacs']:
            for keyword in keywords['pacs']:
                record_add_field(rec,
                                 '084',
                                 ind1='1',
                                 subfields=[('a', keyword), ('9', 'PACS')])

        ## Oxford is giving us bad keywords. Better ignore them.
        #if keywords['other']:
        #for keyword in keywords['other']:
        #record_add_field(rec, '653', ind1='1', subfields=[('a', keyword), ('9', 'author')])
        if first_page or last_page:
            pages = '%s-%s' % (first_page, last_page)
        else:
            article_meta = xml.getElementsByTagName('article-meta')[0]
            pages = get_value_in_tag(article_meta, "elocation-id")

        subfields = filter(lambda x: x[1] and x[1] != '-', [('p', journal),
                                                            ('v', volume),
                                                            ('n', issue),
                                                            ('c', pages),
                                                            ('y', year)])
        record_add_field(rec, '773', subfields=subfields)

        self.get_references(xml)
        for label, authors, doi, issue, page, page_last, title, volume, year, ext_link, plain_text in self.references:
            subfields = []
            if doi:
                subfields.append(('a', doi))
            for author in authors:
                subfields.append(('h', author))
            if issue:
                subfields.append(('n', issue))
            if label:
                subfields.append(('o', label))
            if year:
                subfields.append(('y', year))
            if ext_link:
                subfields.append(('r', ext_link))
            # should we be strict about it?
            if title and volume and year and page:
                subfields.append(
                    ('s', '%s %s (%s) %s' % (title, volume, year, page)))
            elif not plain_text:
                subfields.append(
                    ('m', ('%s %s %s %s' % (title, volume, year, page))))
            if plain_text:
                subfields.append(('m', plain_text))
            if subfields:
                record_add_field(rec,
                                 '999',
                                 ind1='C',
                                 ind2='5',
                                 subfields=subfields)
        f_path_pdf = f_path[:-(len('.xml'))] + '.pdf'
        f_path_pdfa = join(dirname(f_path), 'archival_pdfs',
                           basename(f_path)[:-len('.xml')] + '-hires.pdf')
        if exists(f_path_pdf):
            record_add_field(rec,
                             'FFT',
                             subfields=[('a', f_path_pdf), ('n', 'main')])
        else:
            try:
                raise MissingFFTError
            except:
                register_exception(alert_admin=True,
                                   prefix="Oxford paper: %s is missing PDF." %
                                   (doi, ))
                logger.warning("Record %s doesn't contain PDF file." % (doi, ))
        if exists(f_path_pdfa):
            record_add_field(rec,
                             'FFT',
                             subfields=[('a', f_path_pdfa), ('n', 'main'),
                                        ('f', '.pdf;pdfa')])
        else:
            try:
                raise MissingFFTError
            except:
                register_exception(
                    alert_admin=True,
                    prefix="Oxford paper: %s is missing PDF/A." % (doi, ))
                logger.warning("Record %s doesn't contain PDF/A file." %
                               (doi, ))
        record_add_field(rec, 'FFT', subfields=[('a', f_path), ('n', 'main')])
        extra_subfields = []
        if collection:
            extra_subfields.append(('a', collection))
        if publisher:
            extra_subfields.append(('b', publisher))
        record_add_field(rec, '980', subfields=extra_subfields)
        return record_xml_output(rec)
Пример #12
0
 def _get_reference(self, ref):
     """Yield the parsed fields of every citation found in one reference.

     One tuple is yielded per ``mixed-citation`` element below ``ref``::

         (ref_type, doi, authors, collaboration, journal, volume, page,
          year, label, arxiv, publisher, institution, unstructured_text,
          external_link, report_no, editors)

     ``label`` holds only the digits of the reference's ``label`` tag and
     is shared by all citations of that reference.  ``authors`` and
     ``editors`` are lists of strings.  ``unstructured_text`` is a
     space-joined string when leftover free text was collected and an
     empty list otherwise.

     :param ref: DOM element of the reference; it is searched with
         ``getElementsByTagName``.
     """
     # Free-text fragments that are dropped instead of being kept as
     # unstructured text.
     ignored_fragments = ('in', 'pp', 'edited by')

     def names_in_groups(groups, group_type):
         """Return the text of each non-empty string-name in matching groups."""
         names = []
         for group in groups:
             if group.getAttribute('person-group-type') != group_type:
                 continue
             for name in group.getElementsByTagName('string-name'):
                 if name.hasChildNodes():
                     names.append(get_all_text(name))
         return names

     label = re.sub(r'\D', '', get_value_in_tag(ref, 'label'))
     for innerref in ref.getElementsByTagName('mixed-citation'):
         ref_type = innerref.getAttribute('publication-type')
         institution = get_value_in_tag(innerref, 'institution')

         # Single pass over the identifiers; when a type occurs several
         # times the last occurrence wins.
         report_no = ''
         doi = ''
         arxiv = ''
         for tag in innerref.getElementsByTagName('pub-id'):
             id_type = tag.getAttribute('pub-id-type')
             if id_type == 'doi':
                 doi = xml_to_text(tag)
             elif id_type == 'other':
                 if tag.hasChildNodes():
                     report_no = get_all_text(tag)
             elif id_type == 'arxiv':
                 if tag.hasChildNodes():
                     arxiv = get_all_text(tag)
         arxiv = format_arxiv_id(arxiv)

         collaboration = get_value_in_tag(innerref, 'collab')
         person_groups = innerref.getElementsByTagName('person-group')
         authors = names_in_groups(person_groups, 'author')
         editors = names_in_groups(person_groups, 'editor')

         journal = get_value_in_tag(innerref, 'source')
         journal, volume = fix_journal_name(journal, self.journal_mappings)
         volume += get_value_in_tag(innerref, 'volume')
         if journal in ('J.High Energy Phys.', 'JHEP'):
             # For JHEP the first two characters of the volume are dropped
             # and the issue is appended to what remains.
             issue = get_value_in_tag(innerref, 'issue')
             volume = volume[2:] + issue
             journal = 'JHEP'
         page = get_value_in_tag(innerref, 'page-range')
         year = get_value_in_tag(innerref, 'year')
         external_link = get_value_in_tag(innerref, 'ext-link')

         publisher = get_value_in_tag(innerref, 'publisher-name')
         publisher_location = get_value_in_tag(innerref, 'publisher-loc')
         if publisher_location:
             publisher = publisher_location + ': ' + publisher

         # Only the text nodes that are direct children of the citation are
         # inspected here, i.e. the text sitting between the tagged parts.
         unstructured_text = []
         for child in innerref.childNodes:
             if child.nodeType != child.TEXT_NODE:
                 continue
             text = child.nodeValue.strip()
             # Remove brackets, parentheses, periods and semicolons.
             text = re.sub(r'[\[\]\(\.;\)]', '', text).strip()
             if text.startswith(','):
                 text = text[1:].strip()
             if text.endswith('Report No'):
                 # The institution becomes part of the free text, so it is
                 # cleared to avoid reporting it twice.
                 text = (institution + " " + text).strip()
                 institution = ''
             elif text.endswith(' ed'):
                 # The period was removed by the substitution above.
                 text += '.'
             elif text.endswith('PhD thesis,'):
                 if institution:
                     text += ' ' + institution
                     institution = ''
                 else:
                     # Nothing follows the comma, so drop it.
                     text = text[:-1]
             elif text.startswith('Seminar,'):
                 article_title = get_value_in_tag(
                     innerref, 'article-title')
                 text = institution + " Seminar, \"" + article_title + "\""
                 institution = ''
             elif text == u'\u201d':
                 # A lone closing quotation mark carries no information.
                 text = ''
             if text.startswith('Vol'):
                 # Digits of a "Vol ..." fragment extend the volume.
                 digits = re.sub(r'\D', '', text)
                 if digits:
                     volume += digits
             elif len(text) > 1 and text not in ignored_fragments\
                     and not (text.isdigit() or text[:-1].isdigit()):
                 unstructured_text.append(text)
         if unstructured_text:
             unstructured_text = " ".join(unstructured_text)
         if ref_type == 'book':
             if volume and not volume.lower().startswith('vol'):
                 volume = 'Vol ' + volume
             if volume and page:
                 volume = volume + ', pp ' + page
         yield ref_type, doi, authors, collaboration, journal, volume, page, year,\
             label, arxiv, publisher, institution, unstructured_text, external_link,\
             report_no, editors