def get_ref_link(self, xml, name):
    """Return the stripped text of a link whose target mentions *name*.

    Scans ``ext-link`` elements matched on ``xlink:href``; when none
    match, falls back to ``elocation-id`` elements matched on
    ``content-type``.  The last match wins; ``None`` when nothing
    matches.
    """
    found = None
    for link in xml.getElementsByTagName('ext-link'):
        href = link.getAttribute("xlink:href").encode('utf-8')
        if name in href:
            found = xml_to_text(link).strip()
    if not found:
        for link in xml.getElementsByTagName('elocation-id'):
            ctype = link.getAttribute("content-type").encode('utf-8')
            if name in ctype:
                found = xml_to_text(link).strip()
    return found
def get_authors(self, xml):
    """Extract authors and affiliations from a Springer A++ record.

    Returns a list of dicts with keys ``surname``, ``given_name``,
    ``email``, ``affiliations_ids`` and, when resolvable,
    ``affiliation`` (list of affiliation strings).  When no author
    references an affiliation id explicitly and at least one
    affiliation exists, every affiliation is assigned to every author.

    Fixes: Python-2-only ``print >>`` replaced with a portable
    ``sys.stderr.write``; ``xrange`` loop replaced; dead commented-out
    code removed.
    """
    authors = []
    for author in xml.getElementsByTagName("Author"):
        tmp = {}
        surname = get_value_in_tag(author, "FamilyName")
        if surname:
            tmp["surname"] = surname
        given_name = get_value_in_tag(author, "GivenName")
        if given_name:
            # Newlines sneak in from pretty-printed XML; normalize them.
            tmp["given_name"] = given_name.replace('\n', ' ')
        for email in author.getElementsByTagName("Email"):
            if email.getAttribute("type").encode('utf-8') in ('email', ''):
                tmp["email"] = xml_to_text(email)
                break
        tmp["affiliations_ids"] = [
            aid.encode('utf-8')
            for aid in author.getAttribute("AffiliationIDS").split()]
        authors.append(tmp)
    affiliations = {}
    for affiliation in xml.getElementsByTagName("Affiliation"):
        aff_id = affiliation.getAttribute("ID").encode('utf-8')
        affiliations[aff_id] = xml_to_text(affiliation, delimiter=', ')
    implicit_affilations = True
    for author in authors:
        matching_ref = [ref for ref in author.get("affiliations_ids")
                        if ref in affiliations]
        if matching_ref:
            implicit_affilations = False
            author["affiliation"] = [affiliations[ref]
                                     for ref in matching_ref]
    if implicit_affilations and len(affiliations) > 1:
        # Ambiguous: nobody referenced an affiliation id explicitly.
        sys.stderr.write(
            "Implicit affiliations are used, but there's more than one "
            "affiliation: %s\n" % affiliations)
    if implicit_affilations and len(affiliations) >= 1:
        for author in authors:
            author["affiliation"] = list(affiliations.values())
    return authors
def get_keywords(self, xml):
    """Return ``{'pacs': [...], 'other': [...]}`` keyword lists.

    PACS codes come from the ``kwd-group`` whose ``kwd-group-type`` is
    ``pacs``; all other groups populate ``other``.  On any failure a
    warning goes to stderr and ``None`` is returned (keeps the
    original best-effort contract).

    Fix: Python-2-only ``print >> sys.stderr`` replaced with a
    portable ``sys.stderr.write``.
    """
    try:
        pacs = []
        other = []
        for kwd_group in xml.getElementsByTagName('kwd-group'):
            keywords = [
                xml_to_text(keyword, tag_to_remove=self.tag_to_remove)
                for keyword in kwd_group.getElementsByTagName("kwd")]
            group_type = kwd_group.getAttribute('kwd-group-type')
            if group_type.encode('utf-8') == "pacs":
                pacs = keywords
            else:
                other = keywords
        return {"pacs": pacs, "other": other}
    except Exception:
        sys.stderr.write("Can't find keywords\n")
def _get_authors(self):
    """Return (name, affiliations) pairs built from pex-dc creators.

    The name is rendered as 'Surname, Given names' (initials
    collapsed, title-cased); affiliations accumulate per creator.
    """
    authors = []
    for creator in self.document.getElementsByTagName('pex-dc:creator'):
        affiliations = []
        for name_tag in creator.getElementsByTagName('pex-dc:name'):
            parts = xml_to_text(name_tag).split()
            surname = parts[-1]
            given = collapse_initials(" ".join(parts[:-1]))
            full_name = safe_title("%s, %s" % (surname, given))
            for aff_tag in creator.getElementsByTagName(
                    'pex-dc:affiliation'):
                affiliations.append(xml_to_text(aff_tag))
            authors.append((full_name, affiliations))
    return authors
def get_ref_link(self, xml_doc, name):
    """Return stripped text of the last ``ce:inter-ref`` whose
    ``xlink:href`` contains *name*, or ``None`` when none matches."""
    result = None
    for ref in xml_doc.getElementsByTagName('ce:inter-ref'):
        href = ref.getAttribute("xlink:href").encode('utf-8')
        if name in href:
            result = xml_to_text(ref).strip()
    return result
def _author_dic_from_xml(self, author):
    """Build an author dict from a ``ce:author`` element.

    Keys (set only when non-empty): surname, given_name, initials,
    orcid, email (first ``ce:e-address`` typed 'email' or untyped),
    cross_ref (all refids, set whenever any ``ce:cross-ref`` exists).
    """
    info = {}
    for key, tag in (("surname", "ce:surname"),
                     ("given_name", "ce:given-name"),
                     ("initials", "ce:initials")):
        value = get_value_in_tag(author, tag)
        if value:
            info[key] = value
    orcid = author.getAttribute('orcid').encode('utf-8')
    if orcid:
        info["orcid"] = orcid
    for email in author.getElementsByTagName("ce:e-address"):
        if email.getAttribute("type").encode('utf-8') in ('email', ''):
            info["email"] = xml_to_text(email)
            break
    cross_refs = author.getElementsByTagName("ce:cross-ref")
    if cross_refs:
        info["cross_ref"] = [ref.getAttribute("refid").encode('utf-8')
                             for ref in cross_refs]
    return info
def _get_references(self):
    """Yield one tuple per ``ref`` element:
    (label, ref_type, text_ref, ext_link, authors, year, source,
    volume, page).
    """
    for ref in self.document.getElementsByTagName("ref"):
        label = ref.getAttribute("id")
        # Keep only the digits of the id, e.g. "c12" -> "12".
        label = sub(r"\D", "", label)
        text_ref = ""
        ext_link = ""
        for mixed in ref.getElementsByTagName("mixed-citation"):
            ref_type = mixed.getAttribute("publication-type")
            if ref_type == "thesis":
                text_ref = get_value_in_tag(ref, "mixed-citation")
            elif ref_type == "conf-proc":
                text_ref = get_value_in_tag(ref, "mixed-citation")
            elif ref_type == "other" or ref_type == "web":
                text_ref = get_value_in_tag(ref, "mixed-citation")
                ext_link = get_value_in_tag(mixed, "ext-link")
            elif ref_type == "book":
                text_ref = xml_to_text(mixed)
        authors = []
        for auth in ref.getElementsByTagName("string-name"):
            surname = get_value_in_tag(auth, "surname")
            given_names = get_value_in_tag(auth, "given-names")
            given_names = collapse_initials(given_names)
            authors.append("%s, %s" % (surname, given_names))
        year = get_value_in_tag(ref, "year")
        source = get_value_in_tag(ref, "source")
        volume = get_value_in_tag(ref, "volume")
        page = get_value_in_tag(ref, "fpage")
        if ref_type == "journal":
            # NOTE(review): ref_type here is whatever the *last*
            # mixed-citation set; assumes at least one exists — confirm.
            source, vol = fix_journal_name(source, self.journal_mappings)
            if vol:
                volume = vol + volume
        yield label, ref_type, text_ref, ext_link, authors, year, source, volume, page
def _get_pacscodes(self):
    """Return all PACS codes from ``kwd-group`` elements typed 'pacs'."""
    codes = []
    for group in self.document.getElementsByTagName('kwd-group'):
        if group.getAttribute('kwd-group-type') != 'pacs':
            continue
        codes.extend(xml_to_text(kwd)
                     for kwd in group.getElementsByTagName('kwd'))
    return codes
def _get_orcids(xml_doc):
    """Return one entry per author: ``'ORCID:<id>'`` for valid ORCIDs,
    ``''`` otherwise.

    Elsevier-style ``ce:author`` ``orcid`` attributes are checked
    first; when any authors were found there, JATS ``contrib``
    elements are not consulted.

    Fix: the character class was ``[\\d|X]``, which also matched a
    literal ``|`` — the ORCID checksum character is a digit or ``X``
    only.  Pattern is now a raw string as well.
    """
    orcid_pattern = r'\d{4}-\d{4}-\d{4}-\d{3}[\dX]'
    result = []

    def _append_orcid(orcid):
        # Helper: append a formatted ORCID, or '' when missing/invalid.
        if orcid and is_valid_orcid(orcid):
            result.append('ORCID:{0}'.format(orcid))
        else:
            result.append('')

    xml_authors = xml_doc.getElementsByTagName("ce:author")
    for xml_author in xml_authors:
        try:
            orcid = xml_author.getAttribute('orcid')
            _append_orcid(orcid)
        except IndexError:
            result.append('')
    if result:
        return result
    xml_authors = xml_doc.getElementsByTagName("contrib")
    for xml_author in xml_authors:
        try:
            contrib_id = xml_author.getElementsByTagName('contrib-id')[0]
            if contrib_id.getAttribute('contrib-id-type') == 'orcid':
                orcid_raw = xml_to_text(contrib_id)
                orcid = re.search(orcid_pattern, orcid_raw).group()
                _append_orcid(orcid)
        except (IndexError, AttributeError):
            result.append('')
    return result
def _get_keywords(self):
    """Return every keyword from ``kwd-group`` elements that are NOT
    typed 'pacs'."""
    result = []
    for group in self.document.getElementsByTagName('kwd-group'):
        if group.getAttribute('kwd-group-type') == 'pacs':
            continue
        for kwd in group.getElementsByTagName('kwd'):
            result.append(xml_to_text(kwd))
    return result
def get_references(self, xml):
    """Return reference 8-tuples:
    (label, authors, doi, issue, page, title, volume, year).

    Citations without a ``BibArticle`` child are unstructured: the raw
    ``BibUnstructured`` text fills the first slot, the rest are ''.
    """
    references = []
    for reference in xml.getElementsByTagName("Citation"):
        if not reference.getElementsByTagName("BibArticle"):
            references.append((get_value_in_tag(reference,
                                                "BibUnstructured"),
                               '', '', '', '', '', '', ''))
        else:
            label = get_value_in_tag(reference, "ArticleTitle")
            authors = []
            for author in reference.getElementsByTagName("BibAuthorName"):
                given_name = get_value_in_tag(author, "Initials")
                surname = get_value_in_tag(author, "FamilyName")
                if given_name:
                    name = "%s, %s" % (surname, given_name)
                else:
                    name = surname
                authors.append(name)
            # DOI lives in an Occurrence element of Type="DOI".
            doi_tag = reference.getElementsByTagName("Occurrence")
            doi = ""
            for tag in doi_tag:
                if tag.getAttribute("Type") == "DOI":
                    doi = xml_to_text(tag)
            ## What is it exactly?
            # issue = get_value_in_tag(reference, "sb:issue")
            issue = ""
            page = get_value_in_tag(reference, "FirstPage")
            title = get_value_in_tag(reference, "JournalTitle")
            volume = get_value_in_tag(reference, "VolumeID")
            year = get_value_in_tag(reference, "Year")
            references.append((label, authors, doi, issue, page, title,
                               volume, year))
    return references
def get_references(self, xml):
    """Collect reference 11-tuples into ``self.references``:
    (label, authors, doi, issue, page, page_last, title, volume, year,
    ext_link, plain_text).
    """
    references = []
    for reference in xml.getElementsByTagName("ref"):
        plain_text = None
        # NOTE(review): assumes every <ref> has a <citation> child;
        # raises IndexError otherwise — confirm against input corpus.
        ref_type = reference.getElementsByTagName('citation')[0].getAttribute('publication-type').encode('utf-8')
        label = get_value_in_tag(reference, "label").strip('.')
        authors = []
        for author in reference.getElementsByTagName("name"):
            given_name = get_value_in_tag(author, "given-names")
            surname = get_value_in_tag(author, "surname")
            if given_name:
                name = "%s, %s" % (surname, given_name)
            else:
                name = surname
            if name.strip().split() == []:
                # Fall back to the unstructured single-string form.
                name = get_value_in_tag(author, "string-name")
            authors.append(name)
        doi_tag = reference.getElementsByTagName("pub-id")
        doi = ""
        for tag in doi_tag:
            if tag.getAttribute("pub-id-type") == "doi":
                doi = xml_to_text(tag)
        issue = get_value_in_tag(reference, "issue")
        page = get_value_in_tag(reference, "fpage")
        page_last = get_value_in_tag(reference, "lpage")
        title = get_value_in_tag(reference, "source")
        volume = get_value_in_tag(reference, "volume")
        year = get_value_in_tag(reference, "year")
        ext_link = format_arxiv_id(super(NLMParser, self).get_ref_link(reference, "arxiv"))
        if ref_type != 'journal':
            # Non-journal references keep their full citation text too.
            plain_text = get_value_in_tag(reference, "mixed-citation")
        references.append((label, authors, doi, issue, page, page_last,
                           title, volume, year, ext_link, plain_text))
    self.references = references
def _get_references(self):
    """Yield one tuple per ``ref`` element:
    (label, ref_type, text_ref, ext_link, authors, year, source,
    volume, page).
    """
    for ref in self.document.getElementsByTagName('ref'):
        label = ref.getAttribute('id')
        # Keep only the digits of the id, e.g. "c12" -> "12".
        label = sub(r'\D', '', label)
        text_ref = ''
        ext_link = ''
        for mixed in ref.getElementsByTagName('mixed-citation'):
            ref_type = mixed.getAttribute('publication-type')
            if ref_type == 'thesis':
                text_ref = get_value_in_tag(ref, 'mixed-citation')
            elif ref_type == 'conf-proc':
                text_ref = get_value_in_tag(ref, 'mixed-citation')
            elif ref_type == 'other' or ref_type == 'web':
                text_ref = get_value_in_tag(ref, 'mixed-citation')
                ext_link = get_value_in_tag(mixed, 'ext-link')
            elif ref_type == 'book':
                text_ref = xml_to_text(mixed)
        authors = []
        for auth in ref.getElementsByTagName('string-name'):
            surname = get_value_in_tag(auth, 'surname')
            given_names = get_value_in_tag(auth, 'given-names')
            given_names = collapse_initials(given_names)
            authors.append('%s, %s' % (surname, given_names))
        year = get_value_in_tag(ref, 'year')
        source = get_value_in_tag(ref, 'source')
        volume = get_value_in_tag(ref, 'volume')
        page = get_value_in_tag(ref, 'fpage')
        if ref_type == 'journal':
            # NOTE(review): ref_type is whatever the *last*
            # mixed-citation set; assumes at least one exists — confirm.
            source, vol = fix_journal_name(source, self.journal_mappings)
            if vol:
                volume = vol + volume
        yield (label, ref_type, text_ref, ext_link, authors, year,
               source, volume, page)
def _get_authors(self):
    """Return (name, affiliation) pairs for contribs typed 'author'.

    Affiliations are resolved via ``xref`` elements of type 'aff';
    an unresolvable id yields an empty affiliation string.
    """
    authors = []
    affiliations = {}
    for tag in self.document.getElementsByTagName('aff'):
        aid = tag.getAttribute('id')
        affiliation = xml_to_text(tag)
        # Drop the first word: it is the printed affiliation label.
        affiliation = ' '.join(affiliation.split()[1:])
        affiliations[aid] = affiliation
    for tag in self.document.getElementsByTagName('contrib'):
        if tag.getAttribute('contrib-type') == 'author':
            rid = ''
            for aff in tag.getElementsByTagName('xref'):
                if aff.getAttribute('ref-type') == 'aff':
                    rid = aff.getAttribute('rid')
            # A space-separated multi-valued rid keeps only its first id.
            if len(rid.split()) > 1:
                rid = rid.split()[0]
            given_names = get_value_in_tag(tag, 'given-names')
            given_names = collapse_initials(given_names)
            surname = get_value_in_tag(tag, 'surname')
            name = "%s, %s" % (surname, given_names)
            try:
                authors.append((name, affiliations[rid]))
            except KeyError:
                authors.append((name, ''))
    return authors
def convert_record(record, response_date, request):
    """Convert one harvested OAI-PMH Hindawi record into MARCXML.

    Returns a ``(marcxml_or_None, is_new)`` pair: ``(None, True)`` for
    a deletion of a record we never had; otherwise the serialized
    record and whether it was previously unknown to the system.
    """
    header = record.getElementsByTagName("header")[0]
    oai_identifier = get_value_in_tag(header, "identifier")
    datestamp = get_value_in_tag(header, "datestamp")
    status = header.getAttribute("status").encode('utf8')
    rec = create_record()
    # Provenance field: where and when this record was harvested.
    record_add_field(rec, tag="035", subfields=[('a', oai_identifier),
                                                ('u', request),
                                                ('9', 'Hindawi'),
                                                ('d', datestamp),
                                                ('h', response_date),
                                                ('m', 'marc21'),
                                                ('t', 'false')])
    new = True
    if find_records_from_extoaiid(oai_identifier, 'Hindawi'):
        new = False
    if status == 'deleted':
        if new:
            ## deleting a record we didn't have? Who cares :-)
            return None, True
        else:
            record_add_field(rec, tag="980",
                             subfields=[('a', 'SCOAP3'),
                                        ('b', 'Hindawi'),
                                        ('c', 'DELETED')])
            return record_xml_output(rec), False
    # Copy every incoming datafield verbatim into the new record.
    for datafield in record.getElementsByTagName("datafield"):
        tag = datafield.getAttribute("tag").encode('utf-8')
        ind1 = datafield.getAttribute("ind1").encode('utf-8') or ' '
        ind2 = datafield.getAttribute("ind2").encode('utf-8') or ' '
        subfields = []
        for subfield in datafield.getElementsByTagName("subfield"):
            code = subfield.getAttribute("code").encode('utf-8')
            value = xml_to_text(subfield)
            subfields.append((code, value))
        record_add_field(rec, tag=tag, ind1=ind1, ind2=ind2,
                         subfields=subfields)
    return record_xml_output(rec), new
def get_references(self, xml_doc):
    """Yield reference tuples from ``ce:bib-reference`` elements.

    In CONSYN mode each inner ``sb:reference`` is delegated to
    ``self._get_ref`` (the whole ref is used when there are none).
    Otherwise a 10-tuple is yielded: (label, authors, doi, issue,
    page, title, volume, year, textref, ext_link).
    """
    for ref in xml_doc.getElementsByTagName("ce:bib-reference"):
        label = get_value_in_tag(ref, "ce:label")
        if self.CONSYN:
            innerrefs = ref.getElementsByTagName("sb:reference")
            if not innerrefs:
                yield self._get_ref(ref, label)
            for inner in innerrefs:
                yield self._get_ref(inner, label)
        else:
            authors = []
            for author in ref.getElementsByTagName("sb:author"):
                given_name = get_value_in_tag(author, "ce:given-name")
                surname = get_value_in_tag(author, "ce:surname")
                if given_name:
                    name = "%s, %s" % (surname, given_name)
                else:
                    name = surname
                authors.append(name)
            doi = get_value_in_tag(ref, "ce:doi")
            issue = get_value_in_tag(ref, "sb:issue")
            page = get_value_in_tag(ref, "sb:first-page")
            title = get_value_in_tag(ref, "sb:maintitle")
            volume = get_value_in_tag(ref, "sb:volume-nr")
            tmp_issues = ref.getElementsByTagName('sb:issue')
            if tmp_issues:
                # Year = first four characters of the issue's sb:date.
                year = get_value_in_tag(tmp_issues[0], "sb:date")[:4]
            else:
                year = ''
            textref = ref.getElementsByTagName("ce:textref")
            if textref:
                textref = xml_to_text(textref[0])
            ext_link = format_arxiv_id(self.get_ref_link(ref, 'arxiv'))
            yield (label, authors, doi, issue, page, title, volume,
                   year, textref, ext_link)
def _get_subject(self):
    """Return a comma-joined string of 'toc-minor'/'section' subjects."""
    wanted = ('toc-minor', 'section')
    found = []
    for group in self.document.getElementsByTagName('subj-group'):
        if group.getAttribute('subj-group-type') in wanted:
            for subject in group.getElementsByTagName('subject'):
                found.append(xml_to_text(subject))
    return ', '.join(found)
def _get_authors(self):
    """Return (name, affiliations) pairs from pex-dc creator elements.

    Names are rendered 'Surname, Given names' (initials collapsed,
    title-cased).  Affiliations accumulate per creator.
    """
    authors = []
    for pextag in self.document.getElementsByTagName('pex-dc:creator'):
        affiliations = []
        for auttag in pextag.getElementsByTagName('pex-dc:name'):
            author = xml_to_text(auttag)
            # Last whitespace-separated token is treated as the surname.
            lastname = author.split()[-1]
            givenames = " ".join(author.split()[:-1])
            givenames = collapse_initials(givenames)
            name = "%s, %s" % (lastname, givenames)
            name = safe_title(name)
            for afftag in pextag.getElementsByTagName(
                    'pex-dc:affiliation'):
                if afftag:
                    affiliations.append(xml_to_text(afftag))
            authors.append((name, affiliations))
    return authors
def _get_orcid(self, xml_author):
    """Return the ORCID identifier from a JATS ``contrib`` element.

    Reads the first ``contrib-id`` child.  Returns ``None`` when the
    element is absent, is not of type 'orcid', or contains no id.

    Fix: the character class was ``[\\d|X]``, which also matched a
    literal ``|`` — the ORCID checksum character is a digit or ``X``
    only.  Pattern is now a raw string as well.
    """
    try:
        contrib_id = xml_author.getElementsByTagName('contrib-id')[0]
        if contrib_id.getAttribute('contrib-id-type') == 'orcid':
            orcid_raw = xml_to_text(contrib_id)
            pattern = r'\d{4}-\d{4}-\d{4}-\d{3}[\dX]'
            return re.search(pattern, orcid_raw).group()
    except (IndexError, AttributeError):
        return None
def _affiliation_from_sa_field(self, affiliation):
    """Return the affiliation text, preferring the structured
    ``sa:affiliation`` child; raise IndexError when neither form
    yields any text."""
    structured = affiliation.getElementsByTagName('sa:affiliation')
    if structured:
        return xml_to_text(structured[0], ', ')
    text = get_value_in_tag(affiliation, "ce:textfn")
    text = re.sub(r'^(\d+\ ?)', "", text)
    if not text:
        raise IndexError
    return text
def get_doi(self, xml):
    """Return the article DOI from ``article-id`` elements ('' if none).

    When several DOI-typed ids exist the last one wins (original
    behavior).  A warning is printed when no DOI is found.

    Fix: Python-2-only ``print >> sys.stdout`` replaced with a
    portable ``sys.stdout.write``.
    """
    ret = ""
    for article_id in xml.getElementsByTagName('article-id'):
        id_type = article_id.getAttribute('pub-id-type').encode('utf-8')
        if id_type == 'doi':
            ret = xml_to_text(article_id)
    if not ret:
        sys.stdout.write("Can't find DOI.\n")
    return ret
def _get_authors(self):
    """Return 'Surname, Given names' strings for every dc:creator,
    with capitalization fixed and initials collapsed."""
    result = []
    for creator in self.document.getElementsByTagName('dc:creator'):
        tokens = xml_to_text(creator).split()
        surname, given = fix_name_capitalization(tokens[-1], tokens[:-1])
        result.append("%s, %s" % (surname, collapse_initials(given)))
    return result
def _get_authors(self):
    """Return (name, affiliations, emails) triples for contribs typed
    'author'.  Name is 'Surname, Given names' with capitalization
    fixed and initials collapsed."""
    authors = []
    for contrib in self.document.getElementsByTagName('contrib'):
        if contrib.getAttribute('contrib-type') == 'author':
            surname = get_value_in_tag(contrib, 'surname')
            given_names = get_value_in_tag(contrib, 'given-names')
            given_names = collapse_initials(given_names)
            surname, given_names = fix_name_capitalization(
                surname, given_names.split()
            )
            name = '%s, %s' % (surname, given_names)
            affiliations = []
            for aff in contrib.getElementsByTagName('aff'):
                affiliations.append(xml_to_text(aff))
            emails = []
            for email in contrib.getElementsByTagName('email'):
                emails.append(xml_to_text(email))
            authors.append((name, affiliations, emails))
    return authors
def get_collection(self, journal):
    """Return this articles' collection.

    ConferencePaper when a ``conference`` element is present or the
    journal is the IJMP Conference Series; Review for review-type
    articles; Published otherwise.
    """
    conference = ''
    for tag in self.document.getElementsByTagName('conference'):
        conference = xml_to_text(tag)
    is_conf_series = (journal == "International Journal of Modern "
                                 "Physics: Conference Series")
    if conference or is_conf_series:
        return [('a', 'HEP'), ('a', 'ConferencePaper')]
    if self._get_article_type() == "review-article":
        return [('a', 'HEP'), ('a', 'Review')]
    return [('a', 'HEP'), ('a', 'Published')]
def get_authors(self, xml_doc):
    """Extract authors from Elsevier ``ce:author`` elements.

    Each author dict may carry surname, given_name, initials, orcid,
    email, cross_ref (affiliation reference ids) and affiliation (list
    of resolved affiliation strings).  When no author has an explicit
    cross-reference and at least one affiliation exists, every
    affiliation is assigned to every author.

    Fix: the Python-2-only ``xrange`` loop is replaced with a
    comprehension, consistent with this block's Python-3-style
    ``print()`` call.
    """
    authors = []
    for author in xml_doc.getElementsByTagName("ce:author"):
        tmp = {}
        surname = get_value_in_tag(author, "ce:surname")
        if surname:
            tmp["surname"] = surname
        given_name = get_value_in_tag(author, "ce:given-name")
        if given_name:
            tmp["given_name"] = given_name
        initials = get_value_in_tag(author, "ce:initials")
        if initials:
            tmp["initials"] = initials
        orcid = author.getAttribute('orcid').encode('utf-8')
        if orcid:
            tmp["orcid"] = orcid
        for email in author.getElementsByTagName("ce:e-address"):
            if email.getAttribute("type").encode('utf-8') in ('email', ''):
                tmp["email"] = xml_to_text(email)
                break
        cross_refs = author.getElementsByTagName("ce:cross-ref")
        if cross_refs:
            tmp["cross_ref"] = [
                cross_ref.getAttribute("refid").encode('utf-8')
                for cross_ref in cross_refs]
        authors.append(tmp)
    affiliations = {}
    for affiliation in xml_doc.getElementsByTagName("ce:affiliation"):
        aff_id = affiliation.getAttribute("id").encode('utf-8')
        # Strip any leading printed label such as "1 ".
        text = re.sub(
            r'^(\d+\ ?)', "",
            get_value_in_tag(affiliation, "ce:textfn"))
        affiliations[aff_id] = text
    implicit_affilations = True
    for author in authors:
        matching_ref = [ref for ref in author.get("cross_ref", [])
                        if ref in affiliations]
        if matching_ref:
            implicit_affilations = False
            author["affiliation"] = [affiliations[ref]
                                     for ref in matching_ref]
    if implicit_affilations and len(affiliations) > 1:
        message = "Implicit affiliations are used, "
        message += ("but there's more than one affiliation: "
                    + str(affiliations))
        print(message, file=sys.stderr)
    if implicit_affilations and len(affiliations) >= 1:
        for author in authors:
            author["affiliation"] = list(affiliations.values())
    return authors
def _get_authors(self):
    """Return (name, affiliations, emails, collaborations) tuples for
    every contributor marked as an author."""
    result = []
    for contrib in self.document.getElementsByTagName('contrib'):
        if contrib.getAttribute('contrib-type') != 'author':
            continue
        surname = get_value_in_tag(contrib, 'surname')
        given = collapse_initials(get_value_in_tag(contrib,
                                                   'given-names'))
        name = safe_title('%s, %s' % (surname, given))
        affiliations = [xml_to_text(aff)
                        for aff in contrib.getElementsByTagName('aff')]
        emails = [xml_to_text(email)
                  for email in contrib.getElementsByTagName('email')]
        collaborations = [
            xml_to_text(collab)
            for collab in contrib.getElementsByTagName("collab")]
        result.append((name, affiliations, emails, collaborations))
    return result
def _affiliation_from_sa_field(self, affiliation):
    """Return the affiliation text for an Elsevier affiliation element.

    Prefers the structured ``sa:affiliation`` child; otherwise falls
    back to ``ce:textfn`` with any leading numeric label stripped.

    :raises IndexError: when neither source yields any text.
    """
    sa_affiliation = affiliation.getElementsByTagName('sa:affiliation')
    if sa_affiliation:
        return xml_to_text(sa_affiliation[0], ', ')
    else:
        # Strip a leading printed label like "1 ".
        affiliation = re.sub(r'^(\d+\ ?)', "",
                             get_value_in_tag(affiliation, "ce:textfn"))
        if affiliation:
            return affiliation
        else:
            raise IndexError
def _get_authors(self):
    """Return 'Lastname, Given Names' per dc:creator, each word with
    its first letter kept and the remainder lowered."""
    result = []
    for creator in self.document.getElementsByTagName('dc:creator'):
        words = xml_to_text(creator).split()
        fixed = [w[0] + w[1:].lower() for w in words]
        result.append("%s, %s" % (fixed[-1], " ".join(fixed[:-1])))
    return result
def get_issn(self, xml):
    """Return the electronic ISSN when present, else the first ISSN,
    else ``None``.

    BUG FIX: the original assigned ``issn.getAttribute("pub-type")``
    — i.e. the literal string 'epub' — instead of the ISSN text of
    the matching element.
    """
    issns = xml.getElementsByTagName('issn')
    ret = None
    for issn in issns:
        if issn.getAttribute("pub-type").encode('utf-8') == 'epub':
            ret = xml_to_text(issn)
    if not ret and issns:
        ret = xml_to_text(issns[0])
    return ret
def author_dic_from_xml(author):
    """Build an author dict from a ``ce:author`` element, dropping
    None-valued entries (empty strings/lists are kept).

    Keys: surname, given_name, initials, orcid, email (first
    ``ce:e-address`` typed 'email' or untyped), cross_ref (all
    refids).
    """
    email = None
    for address in author.getElementsByTagName("ce:e-address"):
        if unicode(address.getAttribute("type")) in ('email', ''):
            email = xml_to_text(address)
            break
    candidates = {
        'surname': get_value_in_tag(author, "ce:surname"),
        'given_name': get_value_in_tag(author, "ce:given-name"),
        'initials': get_value_in_tag(author, "ce:initials"),
        'orcid': unicode(author.getAttribute('orcid')),
        'email': email,
        'cross_ref': [
            unicode(ref.getAttribute("refid"))
            for ref in author.getElementsByTagName("ce:cross-ref")],
    }
    return {key: val for key, val in candidates.items()
            if val is not None}
def _get_affiliations(self):
    """Map affiliation id -> text, with a purely-numeric leading
    label removed when present."""
    affiliations = {}
    for aff in self.document.getElementsByTagName('aff'):
        aid = aff.getAttribute('id')
        text = xml_to_text(aff)
        if text:
            words = text.split()
            # EAFP: only drop the first word if it is an integer label.
            try:
                int(words[0])
                text = ' '.join(words[1:])
            except ValueError:
                pass
        affiliations[aid] = text
    return affiliations
def get_references(self, xml):
    """Populate ``self.references`` with 11-tuples:
    (label, authors, doi, issue, page, page_last, title, volume, year,
    ext_link, plain_text).

    Fix: the two bare ``except:`` clauses are narrowed — the first to
    the ``IndexError`` raised when ``mixed-citation`` is absent, the
    second to ``Exception`` (get_value_in_tag's failure mode is not
    visible here, so it stays broad but explicit).
    """
    references = []
    for reference in xml.getElementsByTagName("ref"):
        plain_text = None
        # JATS articles use <mixed-citation>; older ones <citation>.
        try:
            ref_type = reference.getElementsByTagName(
                'mixed-citation')[0]
            ref_type = ref_type.getAttribute(
                'publication-type').encode('utf-8')
        except IndexError:
            ref_type = reference.getElementsByTagName('citation')[0]
            ref_type = ref_type.getAttribute(
                'publication-type').encode('utf-8')
        label = get_value_in_tag(reference, "label").strip('.')
        authors = []
        for author in reference.getElementsByTagName("name"):
            given_name = get_value_in_tag(author, "given-names")
            surname = get_value_in_tag(author, "surname")
            if given_name:
                name = "%s, %s" % (surname, given_name)
            else:
                name = surname
            if name.strip().split() == []:
                # Fall back to the unstructured single-string form.
                name = get_value_in_tag(author, "string-name")
            authors.append(name)
        doi = ""
        for tag in reference.getElementsByTagName("pub-id"):
            if tag.getAttribute("pub-id-type") == "doi":
                doi = xml_to_text(tag)
        issue = get_value_in_tag(reference, "issue")
        page = get_value_in_tag(reference, "fpage")
        page_last = get_value_in_tag(reference, "lpage")
        title = get_value_in_tag(reference, "source")
        volume = get_value_in_tag(reference, "volume")
        year = get_value_in_tag(reference, "year")
        ext_link = format_arxiv_id(self.get_ref_link(reference, "arxiv"))
        if ref_type != 'journal':
            try:
                plain_text = get_value_in_tag(
                    reference, "mixed-citation",
                    tag_to_remove=self.tag_to_remove)
            except Exception:
                plain_text = get_value_in_tag(
                    reference, "citation",
                    tag_to_remove=self.tag_to_remove)
        references.append((label, authors, doi, issue, page, page_last,
                           title, volume, year, ext_link, plain_text))
    self.references = references
def _get_author_emails(self):
    """Map footnote/corresp element id -> list of e-mail addresses.

    BUG FIX: the original tested ``email.split() > 1`` — a
    list-to-int comparison that is always True on Python 2 and a
    TypeError on Python 3.  The intended word-count test
    ``len(email.split()) > 1`` is used instead.
    """
    author_emails = {}
    for tag in self.document.getElementsByTagName('author-notes'):
        email_elements = tag.getElementsByTagName('corresp')
        email_elements += tag.getElementsByTagName('fn')
        for tg in email_elements:
            nid = tg.getAttribute('id')
            email = xml_to_text(tg)
            email = email.replace(';', '')
            if len(email.split()) > 1:
                # First token is the footnote label; any remaining
                # token containing '@' and '.' is kept as an address.
                valid_emails = []
                for word in email.split()[1:]:
                    if '@' in word and '.' in word:
                        valid_emails.append(word)
                author_emails[nid] = valid_emails
    return author_emails
def author_dic_from_xml(author):
    """Build an author dict from a ``ce:author`` DOM element.

    Keys: surname, given_name, initials, orcid, email (the first
    ``ce:e-address`` typed 'email' or untyped), cross_ref (all
    refids).  None-valued entries are dropped; empty strings and
    lists survive.  NOTE(review): ``unicode`` makes this
    Python-2-only — confirm intended runtime.
    """
    return {
        key: val for key, val in {
            'surname': get_value_in_tag(author, "ce:surname"),
            'given_name': get_value_in_tag(author, "ce:given-name"),
            'initials': get_value_in_tag(author, "ce:initials"),
            'orcid': unicode(author.getAttribute('orcid')),
            'email': next((xml_to_text(email)
                           for email in author.getElementsByTagName("ce:e-address")
                           if unicode(email.getAttribute("type")) in ('email', '')),
                          None),
            'cross_ref': [
                unicode(cross_ref.getAttribute("refid"))
                for cross_ref in author.getElementsByTagName("ce:cross-ref")
            ]
        }.items() if val is not None
    }
def _get_orcids(xml_doc):
    """Return one raw ORCID string per author ('' when unavailable).

    Checks Elsevier ``ce:author`` ``orcid`` attributes, then appends
    results from JATS ``contrib``/``contrib-id`` elements (both lists
    are scanned; there is no early return here, unlike the formatted
    variant elsewhere in this module).

    Fix: the character class was ``[\\d|X]``, which also matched a
    literal ``|`` — the ORCID checksum character is a digit or ``X``
    only.  Pattern is now a raw string as well.
    """
    result = []
    xml_authors = xml_doc.getElementsByTagName("ce:author")
    for xml_author in xml_authors:
        try:
            orcid = xml_author.getAttribute('orcid')
            result.append(orcid)
        except IndexError:
            result.append('')
    xml_authors = xml_doc.getElementsByTagName("contrib")
    for xml_author in xml_authors:
        try:
            contrib_id = xml_author.getElementsByTagName('contrib-id')[0]
            if contrib_id.getAttribute('contrib-id-type') == 'orcid':
                orcid_raw = xml_to_text(contrib_id)
                pattern = r'\d{4}-\d{4}-\d{4}-\d{3}[\dX]'
                result.append(re.search(pattern, orcid_raw).group())
        except (IndexError, AttributeError):
            result.append('')
    return result
def _get_note(self, note_id):
    """Return the text of footnote *note_id* with its label elements
    removed, or ``None`` when no matching ``fn`` exists."""
    for fn in self.document.getElementsByTagName('fn'):
        if fn.getAttribute('id') != note_id:
            continue
        for label in fn.getElementsByTagName('label'):
            fn.removeChild(label)
        return xml_to_text(fn)
def get_record_rich(self, filename, ref_extract_callback=None):
    """Get the MARCXML of a file in the xaml_rich directory.

    :param filename: the name of the file to parse.
    :type filename: string
    :param ref_extract_callback: optional callable that turns a raw
        reference string into refextract MARCXML.
    :returns: a string with the MARCXML version of the file, or ''
        when the article type is not 'Article' or on encoding errors.

    Fix: misspelled local ``nuber_of_pages`` renamed to
    ``number_of_pages``; documentation added.
    """
    self.document = parse(filename)
    rec = create_record()
    articles = self.document.getElementsByTagName('ArticleID')
    for article in articles:
        article_type = article.getAttribute('Type')
        if not article_type == 'Article':
            return ''
    doi = get_value_in_tag(self.document, 'DOI')
    # Prefer the acceptance date; fall back to the online date.
    date = ''
    for tag in self.document.getElementsByTagName('Accepted'):
        year = get_value_in_tag(tag, 'Year')
        month = get_value_in_tag(tag, 'Month').zfill(2)
        day = get_value_in_tag(tag, 'Day').zfill(2)
        date = "%s-%s-%s" % (year, month, day)
    if not date:
        for tag in self.document.getElementsByTagName('OnlineDate'):
            year = get_value_in_tag(tag, 'Year')
            month = get_value_in_tag(tag, 'Month').zfill(2)
            day = get_value_in_tag(tag, 'Day').zfill(2)
            date = "%s-%s-%s" % (year, month, day)
    # NOTE(review): ``article`` is the leaked loop variable — page
    # data is read from the last ArticleID element; confirm intended.
    first_page = get_value_in_tag(article, 'FirstPage')
    last_page = get_value_in_tag(article, 'LastPage')
    subjects = article.getElementsByTagName('Keyword')
    subjects = map(xml_to_text, subjects)
    subject = ', '.join(subjects)
    copyright_statement = get_value_in_tag(article, 'Copyright')
    journal = get_value_in_tag(self.document, 'JournalTitle')
    journal, volume = fix_journal_name(journal, self.journal_mappings)
    issues = self.document.getElementsByTagName('IssueID')
    for issue in issues:
        volume += get_value_in_tag(issue, 'Volume')
        year = get_value_in_tag(issue, 'Year')
    title = get_value_in_tag(self.document, 'Title')
    authors = self.document.getElementsByTagName('Author')
    affiliations = self.document.getElementsByTagName('Affiliation')

    def affiliation_pair(a):
        # (affiliation id, unstructured affiliation text)
        return a.getAttribute('ID'), get_value_in_tag(
            a, 'UnstructuredAffiliation')

    affiliations = map(affiliation_pair, affiliations)
    affiliations = dict(affiliations)

    def author_pair(a):
        # ('Surname, First [Middle]', resolved affiliation or '')
        surname = get_value_in_tag(a, 'LastName')
        first_name = get_value_in_tag(a, 'FirstName')
        middle_name = get_value_in_tag(a, 'MiddleName')
        if middle_name:
            name = '%s, %s %s' % (surname, first_name, middle_name)
        else:
            name = '%s, %s' % (surname, first_name)
        try:
            affid = a.getElementsByTagName(
                'AffiliationID')[0].getAttribute('Label')
            affiliation = affiliations[affid]
        except IndexError:
            affiliation = ''
        except KeyError:
            affiliation = ''
        return name, affiliation

    authors = map(author_pair, authors)
    abstract = get_value_in_tag(self.document, 'Abstract')
    references = self.document.getElementsByTagName('Bibliomixed')
    for reference in references:
        subfields = []
        label = reference.getAttribute('N')
        if label:
            subfields.append(('o', label))
        bibliosets = reference.getElementsByTagName('Biblioset')
        for tag in bibliosets:
            ref_year = get_value_in_tag(tag, 'Date')
            ref_journal = get_value_in_tag(tag, 'JournalShortTitle')
            ref_journal, ref_volume = fix_journal_name(
                ref_journal, self.journal_mappings)
            ref_volume += get_value_in_tag(tag, 'Volume')
            ref_page = get_value_in_tag(tag, 'ArtPageNums')
            if ref_year:
                subfields.append(('y', ref_year))
            if ref_journal and ref_volume and ref_page:
                subfields.append(
                    ('s', '%s,%s,%s' % (ref_journal, ref_volume,
                                        ref_page)))
            # Remove the structured part so only the free text is left.
            reference.removeChild(tag)
        text_ref = xml_to_text(reference)
        if ref_extract_callback:
            ref_xml = ref_extract_callback(text_ref)
            dom = parseString(ref_xml)
            fields = dom.getElementsByTagName("datafield")[0]
            fields = fields.getElementsByTagName("subfield")
            if fields:
                subfields.append(('9', 'refextract'))
                for field in fields:
                    data = field.firstChild.data
                    code = field.getAttribute("code")
                    # Skip the raw text when structured data exists.
                    if code == 'm' and bibliosets:
                        continue
                    else:
                        subfields.append((code, data))
        else:
            subfields.append(('m', text_ref))
        if subfields:
            record_add_field(rec, '999', ind1='C', ind2='5',
                             subfields=subfields)
    if title:
        record_add_field(rec, '245', subfields=[('a', title)])
    if date:
        record_add_field(rec, '260', subfields=[('c', date),
                                                ('t', 'published')])
    if doi:
        record_add_field(rec, '024', ind1='7',
                         subfields=[('a', doi), ('2', 'DOI')])
    if abstract:
        record_add_field(rec, '520', subfields=[('a', abstract),
                                                ('9', 'EDPSciences')])
    # First author goes into 100, the rest into 700.
    first_author = True
    for author in authors:
        if first_author:
            subfields = [('a', author[0])]
            if author[1]:
                subfields.append(('v', author[1]))
            record_add_field(rec, '100', subfields=subfields)
            first_author = False
        else:
            subfields = [('a', author[0])]
            if author[1]:
                subfields.append(('v', author[1]))
            record_add_field(rec, '700', subfields=subfields)
    subfields = []
    if journal and volume and first_page:
        subfields.append(('s', "%s,%s,%s" % (journal, volume,
                                             first_page)))
    if first_page and last_page:
        try:
            number_of_pages = int(last_page) - int(first_page)
            record_add_field(rec, '300',
                             subfields=[('a', str(number_of_pages))])
        except ValueError:
            # Non-numeric page labels: skip the page count field.
            pass
        subfields.append(('c', '%s-%s' % (first_page, last_page)))
    if year:
        subfields.append(('y', year))
    record_add_field(rec, '773', subfields=subfields)
    record_add_field(rec, '980', subfields=[('a', 'HEP')])
    if copyright_statement:
        record_add_field(rec, '542',
                         subfields=[('f', copyright_statement)])
    if subject:
        record_add_field(rec, '650', ind1='1', ind2='7',
                         subfields=[('2', 'EDPSciences'),
                                    ('a', subject)])
    try:
        return record_xml_output(rec)
    except UnicodeDecodeError:
        message = "Found a bad char in the file for the article " + doi
        sys.stderr.write(message)
        return ""
def _get_ref(self, ref, label):
    """Parse a single bibliography ``<ref>`` DOM element into citation fields.

    :param ref: DOM element of one reference entry (Elsevier CE/SB schema).
    :param label: reference label to carry through unchanged into the result.
    :returns: tuple ``(label, authors, doi, issue, page, title, volume,
              year, textref, ext_link, isjournal, comment, journal,
              publisher, editors, book_title)`` where ``authors`` and
              ``editors`` are lists of "Surname, Givenname" strings and
              ``isjournal`` is truthy when an ``sb:issue`` element exists.
    """
    doi = get_value_in_tag(ref, "ce:doi")
    # Prefer a first page; fall back to an article number when unpaginated.
    page = get_value_in_tag(ref, "sb:first-page")
    if not page:
        page = get_value_in_tag(ref, "sb:article-number")
    issue = get_value_in_tag(ref, "sb:issue")
    # NOTE(review): this title is overwritten unconditionally below
    # (title = "" before the sb:contribution lookup) — dead assignment.
    title = get_value_in_tag(ref, "sb:maintitle")
    volume = get_value_in_tag(ref, "sb:volume-nr")
    # Year lives inside the first sb:issue element, when present.
    tmp_issues = ref.getElementsByTagName('sb:issue')
    if tmp_issues:
        year = get_value_in_tag(tmp_issues[0], "sb:date")
    else:
        year = ''
    # Raw, unstructured text of the reference (used as a fallback elsewhere).
    textref = ref.getElementsByTagName("ce:textref")
    if textref:
        textref = xml_to_text(textref[0])
    ext_link = format_arxiv_id(self.get_ref_link(ref, 'arxiv'))
    authors = []
    for author in ref.getElementsByTagName("sb:author"):
        given_name = get_value_in_tag(author, "ce:given-name")
        surname = get_value_in_tag(author, "ce:surname")
        if given_name:
            name = "%s, %s" % (surname, given_name)
        else:
            name = surname
        authors.append(name)
    if ext_link and ext_link.lower().startswith('arxiv'):
        # Check if the identifier contains digits separated by a dot
        # (new-style arXiv id); otherwise strip the leading "arxiv:" prefix.
        regex = r'\d*\.\d*'
        if not re.search(regex, ext_link):
            ext_link = ext_link[6:]
    comment = get_value_in_tag(ref, "sb:comment")
    # Collected but not returned — presumably for side inspection only.
    links = []
    for link in ref.getElementsByTagName("ce:inter-ref"):
        links.append(xml_to_text(link))
    # Real title comes from the sb:contribution container when available.
    title = ""
    try:
        container = ref.getElementsByTagName("sb:contribution")[0]
        title = container.getElementsByTagName("sb:maintitle")[0]
        title = xml_to_text(title)
    except IndexError:
        title = ''
    except TypeError:
        title = ''
    # A reference with an sb:issue element is treated as a journal article.
    isjournal = ref.getElementsByTagName("sb:issue")
    journal = ""
    if isjournal:
        isjournal = True
        if not page:
            page = comment
        container = ref.getElementsByTagName("sb:issue")[0]
        journal = get_value_in_tag(container, "sb:maintitle")
    edited_book = ref.getElementsByTagName("sb:edited-book")
    editors = []
    book_title = ""
    publisher = ""
    if edited_book:
        # treat as a journal
        if ref.getElementsByTagName("sb:book-series"):
            container = ref.getElementsByTagName("sb:book-series")[0]
            journal = get_value_in_tag(container, "sb:maintitle")
            year = get_value_in_tag(ref, "sb:date")
            isjournal = True
        # conference
        elif ref.getElementsByTagName("sb:conference"):
            container = ref.getElementsByTagName("sb:edited-book")[0]
            maintitle = get_value_in_tag(container, "sb:maintitle")
            conference = get_value_in_tag(container, "sb:conference")
            date = get_value_in_tag(container, "sb:date")
            # use this variable in order to get in the 'm' field
            publisher = maintitle + ", " + conference + ", " + date
        else:
            # Plain edited book: collect editors, title, year, publisher.
            container = ref.getElementsByTagName("sb:edited-book")[0]
            if ref.getElementsByTagName("sb:editors"):
                for editor in ref.getElementsByTagName("sb:editor"):
                    surname = get_value_in_tag(editor, "ce:surname")
                    firstname = get_value_in_tag(editor, "ce:given-name")
                    editors.append("%s,%s" % (surname, firstname))
            # If a contribution title was found above, the book maintitle is
            # the container title; otherwise it becomes the title itself.
            if title:
                book_title = get_value_in_tag(container, "sb:maintitle")
            else:
                title = get_value_in_tag(container, "sb:maintitle")
            year = get_value_in_tag(container, "sb:date")
            if ref.getElementsByTagName("sb:publisher"):
                container = ref.getElementsByTagName("sb:publisher")[0]
                location = get_value_in_tag(container, "sb:location")
                publisher = get_value_in_tag(container, "sb:name")
                if location:
                    publisher = location + ": " + publisher
    if ref.getElementsByTagName("sb:book"):
        # Monograph: append any book-series title/volume to the title.
        if ref.getElementsByTagName("sb:book-series"):
            book_series = ref.getElementsByTagName("sb:book-series")[0]
            title += ", " + \
                get_value_in_tag(book_series, "sb:maintitle")
            title += ", " + \
                get_value_in_tag(book_series, "sb:volume-nr")
        publisher = get_value_in_tag(ref, "sb:publisher")
    if not year:
        year = get_value_in_tag(ref, "sb:date")
    # Keep only the digits of the date (e.g. "2001" out of "May 2001").
    year = re.sub(r'\D', '', year)
    return (label, authors, doi, issue, page, title, volume, year,
            textref, ext_link, isjournal, comment, journal, publisher,
            editors, book_title)
def get_record(self, path=None, no_pdf=False, test=False,
               refextract_callback=None):
    """Convert a record to MARCXML format.

    :param path: path to a record.
    :type path: string
    :param no_pdf: when True, skip looking up / attaching PDF files
                   (non-CONSYN branch only).
    :type no_pdf: bool
    :param test: flag to determine if it is a test call.
    :type test: bool
    :param refextract_callback: callback to be used to extract
                                unstructured references. It should
                                return a marcxml formated string
                                of the reference.
    :type refextract_callback: callable

    :returns: marcxml formated string.
    """
    xml_doc = self.get_article(path)
    rec = create_record()
    title = self.get_title(xml_doc)
    if title:
        record_add_field(rec, '245', subfields=[('a', title)])
    (journal, dummy, volume, issue, first_page, last_page, year,
     start_date, doi) = self.get_publication_information(xml_doc, path)
    if not journal:
        journal = self.get_article_journal(xml_doc)
    # 260: publication date, falling back to today when none is given.
    if start_date:
        record_add_field(rec, '260', subfields=[('c', start_date),
                                                ('t', 'published')])
    else:
        record_add_field(
            rec, '260', subfields=[('c', time.strftime('%Y-%m-%d'))])
    if doi:
        record_add_field(rec, '024', ind1='7', subfields=[('a', doi),
                                                          ('2', 'DOI')])
    license, license_url = self.get_license(xml_doc)
    if license and license_url:
        record_add_field(rec, '540', subfields=[('a', license),
                                                ('u', license_url)])
    elif license_url:
        record_add_field(rec, '540', subfields=[('u', license_url)])
    self.logger.info("Creating record: %s %s" % (path, doi))
    # Authors: first one goes to 100, the rest to 700.
    authors = self.get_authors(xml_doc)
    first_author = True
    for author in authors:
        author_name = (author['surname'], author.get(
            'given_name') or author.get('initials'))
        subfields = [('a', '%s, %s' % author_name)]
        if 'orcid' in author:
            subfields.append(('j', author['orcid']))
        if 'affiliation' in author:
            for aff in author["affiliation"]:
                subfields.append(('v', aff))
            if self.extract_nations:
                add_nations_field(subfields)
        if author.get('email'):
            subfields.append(('m', author['email']))
        if first_author:
            record_add_field(rec, '100', subfields=subfields)
            first_author = False
        else:
            record_add_field(rec, '700', subfields=subfields)
    abstract = self.get_abstract(xml_doc)
    if abstract:
        record_add_field(rec, '520', subfields=[('a', abstract),
                                                ('9', 'Elsevier')])
    record_copyright = self.get_copyright(xml_doc)
    if record_copyright:
        record_add_field(rec, '542', subfields=[('f', record_copyright)])
    keywords = self.get_keywords(xml_doc)
    if self.CONSYN:
        # CONSYN (INSPIRE-bound) records get collaborations, subjects as
        # keywords, doctype notes and plain FFT attachments.
        for tag in xml_doc.getElementsByTagName('ce:collaboration'):
            collaboration = get_value_in_tag(tag, 'ce:text')
            if collaboration:
                record_add_field(rec, '710',
                                 subfields=[('g', collaboration)])
        # We add subjects also as author keywords
        subjects = xml_doc.getElementsByTagName('dct:subject')
        for subject in subjects:
            for listitem in subject.getElementsByTagName('rdf:li'):
                keyword = xml_to_text(listitem)
                if keyword not in keywords:
                    keywords.append(keyword)
        for keyword in keywords:
            record_add_field(rec, '653', ind1='1',
                             subfields=[('a', keyword), ('9', 'author')])
        journal, dummy = fix_journal_name(journal.strip(),
                                          self.journal_mappings)
        subfields = []
        doctype = self.get_doctype(xml_doc)
        try:
            page_count = int(last_page) - int(first_page) + 1
            record_add_field(rec, '300',
                             subfields=[('a', str(page_count))])
        except ValueError:
            # Non-numeric page numbers: no page count field.
            pass
        if doctype == 'err':
            subfields.append(('m', 'Erratum'))
        elif doctype == 'add':
            subfields.append(('m', 'Addendum'))
        elif doctype == 'pub':
            subfields.append(('m', 'Publisher Note'))
        elif doctype == 'rev':
            record_add_field(rec, '980', subfields=[('a', 'Review')])
        if journal:
            subfields.append(('p', journal))
        if first_page and last_page:
            subfields.append(('c', '%s-%s' % (first_page, last_page)))
        elif first_page:
            subfields.append(('c', first_page))
        if volume:
            subfields.append(('v', volume))
        if year:
            subfields.append(('y', year))
        record_add_field(rec, '773', subfields=subfields)
        if not test:
            if license:
                url = 'http://www.sciencedirect.com/science/article/pii/'\
                    + path.split('/')[-1][:-4]
                record_add_field(rec, '856', ind1='4',
                                 subfields=[('u', url),
                                            ('y', 'Elsevier server')])
                record_add_field(rec, 'FFT',
                                 subfields=[('a', path),
                                            ('t', 'INSPIRE-PUBLIC'),
                                            ('d', 'Fulltext')])
            else:
                record_add_field(rec, 'FFT',
                                 subfields=[('a', path),
                                            ('t', 'Elsevier'),
                                            ('o', 'HIDDEN')])
            record_add_field(rec, '980', subfields=[('a', 'HEP')])
            record_add_field(rec, '980', subfields=[('a', 'Citeable')])
            record_add_field(rec, '980', subfields=[('a', 'Published')])
            self._add_references(xml_doc, rec, refextract_callback)
    else:
        # Non-CONSYN (SCOAP3) records are always CC-BY-3.0.
        licence = 'http://creativecommons.org/licenses/by/3.0/'
        record_add_field(rec, '540', subfields=[('a', 'CC-BY-3.0'),
                                                ('u', licence)])
        if keywords:
            for keyword in keywords:
                record_add_field(rec, '653', ind1='1',
                                 subfields=[('a', keyword),
                                            ('9', 'author')])
        pages = ''
        if first_page and last_page:
            pages = '{0}-{1}'.format(first_page, last_page)
        elif first_page:
            pages = first_page
        # Drop empty or placeholder ('-') subfields.
        subfields = filter(lambda x: x[1] and x[1] != '-',
                           [('p', journal),
                            ('v', volume),
                            ('n', issue),
                            ('c', pages),
                            ('y', year)])
        record_add_field(rec, '773', subfields=subfields)
        if not no_pdf:
            from invenio.search_engine import perform_request_search
            # BUGFIX: the closing quote was misplaced ('980:DELETED"'),
            # yielding a malformed query; quote the whole value instead.
            query = '0247_a:"%s" AND NOT 980:"DELETED"' % (doi, )
            prev_version = perform_request_search(p=query)
            old_pdf = False
            if prev_version:
                from invenio.bibdocfile import BibRecDocs
                prev_rec = BibRecDocs(prev_version[0])
                try:
                    pdf_path = prev_rec.get_bibdoc('main')
                    pdf_path = pdf_path.get_file(".pdf;pdfa",
                                                 exact_docformat=True)
                    pdf_path = pdf_path.fullpath
                    old_pdf = True
                    record_add_field(rec, 'FFT',
                                     subfields=[('a', pdf_path),
                                                ('n', 'main'),
                                                ('f', '.pdf;pdfa')])
                    message = ('Leaving previously delivered PDF/A for: '
                               + doi)
                    self.logger.info(message)
                except Exception:
                    # Best-effort reuse of an earlier PDF/A; narrowed from
                    # a bare except so KeyboardInterrupt etc. still raise.
                    pass
            try:
                if exists(join(path, 'main_a-2b.pdf')):
                    pdf_path = join(path, 'main_a-2b.pdf')
                    record_add_field(rec, 'FFT',
                                     subfields=[('a', pdf_path),
                                                ('n', 'main'),
                                                ('f', '.pdf;pdfa')])
                    self.logger.debug('Adding PDF/A to record: %s'
                                      % (doi, ))
                elif exists(join(path, 'main.pdf')):
                    pdf_path = join(path, 'main.pdf')
                    record_add_field(rec, 'FFT',
                                     subfields=[('a', pdf_path)])
                else:
                    if not old_pdf:
                        message = "Record " + doi
                        message += " doesn't contain PDF file."
                        self.logger.warning(message)
                        raise MissingFFTError(message)
            except MissingFFTError:
                message = "Elsevier paper: %s is missing PDF." % (doi, )
                register_exception(alert_admin=True, prefix=message)
        version = self.get_elsevier_version(find_package_name(path))
        record_add_field(rec, '583', subfields=[('l', version)])
        xml_path = join(path, 'main.xml')
        record_add_field(rec, 'FFT', subfields=[('a', xml_path)])
        record_add_field(rec, '980', subfields=[('a', 'SCOAP3'),
                                                ('b', 'Elsevier')])
    try:
        return record_xml_output(rec)
    except UnicodeDecodeError:
        message = "Found a bad char in the file for the article " + doi
        sys.stderr.write(message)
        return ""
def get_record(self, fileName, ref_extract_callback=None):
    """
    Gets the Marc xml of the files in xaml_jp directory

    :param fileName: the name of the file to parse.
    :type fileName: string
    :param refextract_callback: callback to be used to extract
                                unstructured references. It should
                                return a marcxml formated string
                                of the reference.
    :type refextract_callback: callable

    :returns: a string with the marc xml version of the file.
    """
    self.document = parse(fileName)
    # Only convert the article types we know how to handle.
    article_type = self._get_article_type()
    if article_type not in ['research-article',
                            'introduction',
                            'letter']:
        return ''
    rec = create_record()
    # 245: title (subfield b = subtitle, a = main title).
    title, subtitle, notes = self._get_title()
    subfields = []
    if subtitle:
        subfields.append(('b', subtitle))
    if title:
        subfields.append(('a', title))
    record_add_field(rec, '245', subfields=subfields)
    subjects = self.document.getElementsByTagName('kwd')
    subjects = map(xml_to_text, subjects)
    # 500: one general note per note id attached to the title.
    for note_id in notes:
        note = self._get_note(note_id)
        if note:
            record_add_field(rec, '500', subfields=[('a', note)])
    # 650: <kwd> elements recorded as EDPSciences subjects.
    for subject in subjects:
        record_add_field(rec, '650', ind1='1', ind2='7',
                         subfields=[('2', 'EDPSciences'),
                                    ('a', subject)])
    keywords = self._get_keywords()
    for keyword in keywords:
        record_add_field(rec, '653', ind1='1',
                         subfields=[('a', keyword), ('9', 'author')])
    journal, volume, issue, year, date, doi, page,\
        fpage, lpage = self._get_publication_information()
    # Tag records from astronomy journals with an INSPIRE subject.
    astronomy_journals = ['EAS Publ.Ser.', 'Astron.Astrophys.']
    if journal in astronomy_journals:
        record_add_field(rec, '650', ind1='1', ind2='7',
                         subfields=[('2', 'INSPIRE'),
                                    ('a', 'Astrophysics')])
    if date:
        record_add_field(rec, '260', subfields=[('c', date),
                                                ('t', 'published')])
    if doi:
        record_add_field(rec, '024', ind1='7', subfields=[('a', doi),
                                                          ('2', 'DOI')])
    abstract = self._get_abstract()
    abstract = self._format_abstract(abstract)
    if abstract:
        record_add_field(rec, '520', subfields=[('a', abstract),
                                                ('9', 'EDPSciences')])
    # 540: license name and/or URL, when present.
    license, license_type, license_url = self._get_license()
    subfields = []
    if license:
        subfields.append(('a', license))
    if license_url:
        subfields.append(('u', license_url))
    if subfields:
        record_add_field(rec, '540', subfields=subfields)
    # Fulltext is only attached for open-access articles.
    if license_type == 'open-access':
        self._attach_fulltext(rec, doi)
    number_of_pages = self._get_page_count()
    if number_of_pages:
        record_add_field(rec, '300', subfields=[('a', number_of_pages)])
    # 542: structured copyright (holder + year) preferred over statement.
    c_holder, c_year, c_statement = self._get_copyright()
    if c_holder and c_year:
        record_add_field(rec, '542', subfields=[('d', c_holder),
                                                ('g', c_year),
                                                ('e', 'Article')])
    elif c_statement:
        record_add_field(rec, '542', subfields=[('f', c_statement),
                                                ('e', 'Article')])
    # 773: journal reference; page range preferred over single page.
    subfields = []
    if journal:
        subfields.append(('p', journal))
    if issue:
        subfields.append(('n', issue))
    if volume:
        subfields.append(('v', volume))
    if fpage and lpage:
        subfields.append(('c', '%s-%s' % (fpage,
                                          lpage)))
    elif page:
        subfields.append(('c', page))
    if year:
        subfields.append(('y', year))
    record_add_field(rec, '773', subfields=subfields)
    record_add_field(rec, '980', subfields=[('a', 'HEP')])
    # If any <conference> element exists, mark as a conference paper and
    # keep the last conference name found as a note.
    conference = ''
    for tag in self.document.getElementsByTagName('conference'):
        conference = xml_to_text(tag)
    if conference:
        record_add_field(rec, '980',
                         subfields=[('a', 'ConferencePaper')])
        record_add_field(rec, '500',
                         subfields=[('a', conference)])
    self._add_references(rec, ref_extract_callback)
    self._add_authors(rec)
    try:
        return record_xml_output(rec)
    except UnicodeDecodeError:
        message = "Found a bad char in the file for the article " + doi
        sys.stderr.write(message)
        return ""
def get_keywords(self, xml):
    """Extract the text of all <Keyword> elements from the article DOM.

    :param xml: parsed DOM (document or element) of the article.
    :returns: list of keyword strings; an empty list when extraction
              fails (previously this path implicitly returned None,
              which crashed callers iterating the result).
    """
    try:
        return [xml_to_text(keyword) for keyword
                in xml.getElementsByTagName("Keyword")]
    except Exception as err:
        # Best-effort: report and fall back to an empty list so callers
        # such as ``for keyword in keywords:`` keep working.
        sys.stderr.write("Can't find keywords. %s\n" % (err,))
        return []