def get_publication_date(self, xml_doc):
    """Return the best effort start_date.

    Tries, in order:
    1. ``oa:openAccessEffective`` (ISO timestamp) -> "YYYY-MM-DD";
    2. ``prism:coverDate``, else ``prism:coverDisplayDate`` parsed via
       dateutil -> "YYYY-MM-DD", or "YYYY-MM" when the display date did
       not include a day; '' when unparsable;
    3. a bare "YYYYMMDD"/"YYYYMM" coverDate reformatted with dashes.
    """
    start_date = get_value_in_tag(xml_doc, 'oa:openAccessEffective')
    if start_date:
        start_date = datetime.datetime.strptime(
            start_date, "%Y-%m-%dT%H:%M:%SZ"
        )
        return start_date.strftime("%Y-%m-%d")
    start_date = get_value_in_tag(xml_doc, "prism:coverDate")
    if not start_date:
        start_date = get_value_in_tag(xml_doc, "prism:coverDisplayDate")
        import dateutil.parser
        try:
            date = dateutil.parser.parse(start_date)
        except ValueError:
            return ''
        # Special case where we ignore the deduced day form dateutil
        # in case it was not given in the first place.
        if len(start_date.split(" ")) == 3:
            return date.strftime("%Y-%m-%d")
        else:
            return date.strftime("%Y-%m")
    else:
        # BUGFIX: was "len(start_date) is 8" / "is 6" -- identity
        # comparison against int literals; use == for value equality.
        if len(start_date) == 8:
            start_date = time.strftime(
                '%Y-%m-%d', time.strptime(start_date, '%Y%m%d'))
        elif len(start_date) == 6:
            start_date = time.strftime(
                '%Y-%m', time.strptime(start_date, '%Y%m'))
        return start_date
def _author_dic_from_xml(self, author):
    """Build a dict describing one author element.

    Keys (present only when non-empty): surname, given_name, initials,
    orcid, email (first e-address of type 'email' or untyped), and
    cross_ref (list of refid attributes).
    """
    info = {}
    for key, tag in (("surname", "ce:surname"),
                     ("given_name", "ce:given-name"),
                     ("initials", "ce:initials")):
        value = get_value_in_tag(author, tag)
        if value:
            info[key] = value
    orcid = author.getAttribute('orcid').encode('utf-8')
    if orcid:
        info["orcid"] = orcid
    for email in author.getElementsByTagName("ce:e-address"):
        if email.getAttribute("type").encode('utf-8') in ('email', ''):
            info["email"] = xml_to_text(email)
            break
    refs = author.getElementsByTagName("ce:cross-ref")
    if refs:
        info["cross_ref"] = [ref.getAttribute("refid").encode('utf-8')
                             for ref in refs]
    return info
def _get_authors(self):
    """Collect (name, affiliation_ids, correspondence_ids) per author.

    Name is "Surname, Initials". 'aff' xrefs are split into affiliation
    ids (starting with 'a') and correspondence ids (starting with 'n');
    'corresp'/'author-notes' xrefs always feed the correspondence list.
    """
    authors = []
    for contrib in self.document.getElementsByTagName('contrib'):
        # Springer puts colaborations in additional "contrib" tag so to
        # avoid having fake author with all affiliations we skip "contrib"
        # tag with "contrib" subtags.
        if contrib.getElementsByTagName('contrib'):
            continue
        if contrib.getElementsByTagName('collab'):
            continue
        if contrib.getAttribute('contrib-type') != 'author':
            continue
        family = get_value_in_tag(contrib, 'surname')
        given = collapse_initials(get_value_in_tag(contrib, 'given-names'))
        affiliations = []
        corresp = []
        for xref in contrib.getElementsByTagName('xref'):
            ref_type = xref.getAttribute('ref-type')
            rids = xref.getAttribute('rid').split()
            if ref_type == 'aff':
                for rid in rids:
                    if rid.lower().startswith('a'):
                        affiliations.append(rid)
                    elif rid.lower().startswith('n'):
                        corresp.append(rid)
            elif ref_type in ('corresp', 'author-notes'):
                corresp.extend(rids)
        authors.append(('%s, %s' % (family, given), affiliations, corresp))
    return authors
def _extract_date(date):
    """Return 'YYYY-MM-DD' built from a date element.

    Month and day are zero-padded to two digits; a missing or '00'
    month/day is normalised to '01'.
    """
    year = get_value_in_tag(date, 'year')
    parts = []
    for tag in ('month', 'day'):
        value = get_value_in_tag(date, tag).zfill(2)
        parts.append('01' if value == '00' else value)
    return '%s-%s-%s' % (year, parts[0], parts[1])
def _get_references(self):
    """Yield one tuple per bibliography <ref> element.

    Yields (label, ref_type, text_ref, ext_link, authors, year,
    source, volume, page). label keeps only the digits of the ref id.
    """
    for ref in self.document.getElementsByTagName('ref'):
        label = ref.getAttribute('id')
        label = sub(r'\D', '', label)
        text_ref = ''
        ext_link = ''
        # BUGFIX: ref_type was only bound inside the loop below, so a
        # <ref> without any <mixed-citation> raised NameError at the
        # 'journal' check further down. Default it explicitly.
        ref_type = ''
        for mixed in ref.getElementsByTagName('mixed-citation'):
            ref_type = mixed.getAttribute('publication-type')
            if ref_type == 'thesis':
                text_ref = get_value_in_tag(ref, 'mixed-citation')
            elif ref_type == 'conf-proc':
                text_ref = get_value_in_tag(ref, 'mixed-citation')
            elif ref_type == 'other' or ref_type == 'web':
                text_ref = get_value_in_tag(ref, 'mixed-citation')
                ext_link = get_value_in_tag(mixed, 'ext-link')
            elif ref_type == 'book':
                text_ref = xml_to_text(mixed)
        authors = []
        for auth in ref.getElementsByTagName('string-name'):
            surname = get_value_in_tag(auth, 'surname')
            given_names = get_value_in_tag(auth, 'given-names')
            given_names = collapse_initials(given_names)
            authors.append('%s, %s' % (surname, given_names))
        year = get_value_in_tag(ref, 'year')
        source = get_value_in_tag(ref, 'source')
        volume = get_value_in_tag(ref, 'volume')
        page = get_value_in_tag(ref, 'fpage')
        if ref_type == 'journal':
            source, vol = fix_journal_name(source, self.journal_mappings)
            if vol:
                volume = vol + volume
        yield (label, ref_type, text_ref, ext_link, authors,
               year, source, volume, page)
def _get_authors(self):
    """Return (name, affiliation_text) pairs for author contribs.

    Affiliation texts are indexed by their element id, with the leading
    label token dropped; an unknown or empty rid maps to ''.
    """
    affiliations = {}
    for aff in self.document.getElementsByTagName('aff'):
        text = xml_to_text(aff)
        # Drop the leading token (the affiliation's label/marker).
        affiliations[aff.getAttribute('id')] = ' '.join(text.split()[1:])
    authors = []
    for contrib in self.document.getElementsByTagName('contrib'):
        if contrib.getAttribute('contrib-type') != 'author':
            continue
        rid = ''
        for xref in contrib.getElementsByTagName('xref'):
            if xref.getAttribute('ref-type') == 'aff':
                # Last 'aff' xref wins; keep only the first id token.
                rid = xref.getAttribute('rid')
                if len(rid.split()) > 1:
                    rid = rid.split()[0]
        given = collapse_initials(get_value_in_tag(contrib, 'given-names'))
        surname = get_value_in_tag(contrib, 'surname')
        name = "%s, %s" % (surname, given)
        authors.append((name, affiliations.get(rid, '')))
    return authors
def get_publication_information(self, xml):
    """Return (journal, issn, volume, issue, first_page, last_page,
    year, doi) extracted from the article-meta element.

    Raises the original lookup error when article-meta is missing.
    """
    jid = get_value_in_tag(xml, "journal-title")
    journal = ""
    if "European Physical Journal" in jid:
        journal = "EPJC"
    try:
        art = xml.getElementsByTagName('article-meta')[0]
    except IndexError as err:
        register_exception()
        # BUGFIX: the original logged and then fell through with 'art'
        # unbound, crashing below with a misleading NameError. Re-raise
        # the real error instead. (Also replaces the Python-2-only
        # "print >> sys.stderr" statement.)
        sys.stderr.write("ERROR: XML corrupted: %s\n" % err)
        raise
    except Exception as err:
        register_exception()
        sys.stderr.write("ERROR: Exception captured: %s\n" % err)
        raise
    issn = self.get_issn(art)
    volume = get_value_in_tag(art, "volume")
    issue = get_value_in_tag(art, "issue")
    year = self.get_date(art)
    first_page = get_value_in_tag(art, "fpage")
    last_page = get_value_in_tag(art, "lpage")
    doi = self.get_doi(art)
    return (journal, issn, volume, issue, first_page, last_page, year, doi)
def get_publication_date(self, xml_doc):
    """Return the best effort start_date.

    Prefers prism:coverDate; falls back to prism:coverDisplayDate
    (parsed with dateutil, day dropped when absent) and finally to
    oa:openAccessEffective. '' when the display date is unparsable.
    """
    start_date = get_value_in_tag(xml_doc, "prism:coverDate")
    if not start_date:
        start_date = get_value_in_tag(xml_doc, "prism:coverDisplayDate")
        if not start_date:
            start_date = get_value_in_tag(xml_doc, 'oa:openAccessEffective')
            if start_date:
                start_date = datetime.datetime.strptime(
                    start_date, "%Y-%m-%dT%H:%M:%SZ")
                return start_date.strftime("%Y-%m-%d")
        import dateutil.parser
        try:
            date = dateutil.parser.parse(start_date)
        except ValueError:
            return ''
        # Special case where we ignore the deduced day form dateutil
        # in case it was not given in the first place.
        if len(start_date.split(" ")) == 3:
            return date.strftime("%Y-%m-%d")
        else:
            return date.strftime("%Y-%m")
    else:
        # BUGFIX: was "len(start_date) is 8" / "is 6" -- identity
        # comparison against int literals; use == for value equality.
        if len(start_date) == 8:
            start_date = time.strftime('%Y-%m-%d',
                                       time.strptime(start_date, '%Y%m%d'))
        elif len(start_date) == 6:
            start_date = time.strftime('%Y-%m',
                                       time.strptime(start_date, '%Y%m'))
        return start_date
def convert_record(record, response_date, request):
    """Convert one OAI-PMH <record> from Hindawi into MARCXML.

    Returns a (marcxml_or_None, is_new) pair:
    - (None, True) when the record is a deletion we never held;
    - (xml, False) for a deletion of a known record (marked DELETED);
    - (xml, is_new) otherwise, with every incoming datafield copied
      verbatim.
    """
    header = record.getElementsByTagName("header")[0]
    oai_identifier = get_value_in_tag(header, "identifier")
    datestamp = get_value_in_tag(header, "datestamp")
    status = header.getAttribute("status").encode('utf8')
    rec = create_record()
    # Provenance (035) field: identifier, harvest request URL, source,
    # datestamp, response date, format and a 'false' tombstone flag.
    record_add_field(rec, tag="035", subfields=[('a', oai_identifier), ('u', request), ('9', 'Hindawi'), ('d', datestamp), ('h', response_date), ('m', 'marc21'), ('t', 'false')])
    # A record is "new" unless we already hold it under this OAI id.
    new = True
    if find_records_from_extoaiid(oai_identifier, 'Hindawi'):
        new = False
    if status == 'deleted':
        if new:
            ## deleting a record we didn't have? Who cares :-)
            return None, True
        else:
            record_add_field(rec, tag="980", subfields=[('a', 'SCOAP3'), ('b', 'Hindawi'), ('c', 'DELETED')])
            return record_xml_output(rec), False
    # Non-deleted record: copy every datafield as-is, defaulting blank
    # indicators to a single space.
    for datafield in record.getElementsByTagName("datafield"):
        tag = datafield.getAttribute("tag").encode('utf-8')
        ind1 = datafield.getAttribute("ind1").encode('utf-8') or ' '
        ind2 = datafield.getAttribute("ind2").encode('utf-8') or ' '
        subfields = []
        for subfield in datafield.getElementsByTagName("subfield"):
            code = subfield.getAttribute("code").encode('utf-8')
            value = xml_to_text(subfield)
            subfields.append((code, value))
        record_add_field(rec, tag=tag, ind1=ind1, ind2=ind2, subfields=subfields)
    return record_xml_output(rec), new
def _get_authors(self):
    """Collect (name, affiliation_ids, correspondence_ids) per author.

    Name is "Surname, Initials". 'aff' xrefs are split into affiliation
    ids (starting with 'a') and correspondence ids (starting with 'n');
    'corresp'/'author-notes' xrefs always feed the correspondence list.
    """
    authors = []
    for contrib in self.document.getElementsByTagName('contrib'):
        # Springer puts colaborations in additional "contrib" tag so to
        # avoid having fake author with all affiliations we skip "contrib"
        # tag with "contrib" subtags.
        if contrib.getElementsByTagName('contrib'):
            continue
        if contrib.getElementsByTagName('collab'):
            continue
        if contrib.getAttribute('contrib-type') != 'author':
            continue
        family = get_value_in_tag(contrib, 'surname')
        given = collapse_initials(get_value_in_tag(contrib, 'given-names'))
        affiliations = []
        corresp = []
        for xref in contrib.getElementsByTagName('xref'):
            ref_type = xref.getAttribute('ref-type')
            rids = xref.getAttribute('rid').split()
            if ref_type == 'aff':
                for rid in rids:
                    if rid.lower().startswith('a'):
                        affiliations.append(rid)
                    elif rid.lower().startswith('n'):
                        corresp.append(rid)
            elif ref_type in ('corresp', 'author-notes'):
                corresp.extend(rids)
        authors.append(('%s, %s' % (family, given), affiliations, corresp))
    return authors
def get_arxiv_id(self, xml):
    """Return the formatted arXiv id found in a custom-meta element.

    Scans every custom-meta; when several carry "arxiv-id" the last
    one wins. Returns None when none is present.
    """
    ext_link = None
    for meta in xml.getElementsByTagName("custom-meta"):
        if get_value_in_tag(meta, "meta-name") == "arxiv-id":
            raw = get_value_in_tag(meta, "meta-value").encode('utf-8')
            ext_link = format_arxiv_id(raw)
    return ext_link
def _get_references(self):
    """Yield one tuple per bibliography <ref> element.

    Yields (label, ref_type, text_ref, ext_link, authors, year,
    source, volume, page). label keeps only the digits of the ref id.
    """
    for ref in self.document.getElementsByTagName("ref"):
        label = ref.getAttribute("id")
        label = sub(r"\D", "", label)
        text_ref = ""
        ext_link = ""
        # BUGFIX: ref_type was only bound inside the loop below, so a
        # <ref> without any <mixed-citation> raised NameError at the
        # "journal" check further down. Default it explicitly.
        ref_type = ""
        for mixed in ref.getElementsByTagName("mixed-citation"):
            ref_type = mixed.getAttribute("publication-type")
            if ref_type == "thesis":
                text_ref = get_value_in_tag(ref, "mixed-citation")
            elif ref_type == "conf-proc":
                text_ref = get_value_in_tag(ref, "mixed-citation")
            elif ref_type == "other" or ref_type == "web":
                text_ref = get_value_in_tag(ref, "mixed-citation")
                ext_link = get_value_in_tag(mixed, "ext-link")
            elif ref_type == "book":
                text_ref = xml_to_text(mixed)
        authors = []
        for auth in ref.getElementsByTagName("string-name"):
            surname = get_value_in_tag(auth, "surname")
            given_names = get_value_in_tag(auth, "given-names")
            given_names = collapse_initials(given_names)
            authors.append("%s, %s" % (surname, given_names))
        year = get_value_in_tag(ref, "year")
        source = get_value_in_tag(ref, "source")
        volume = get_value_in_tag(ref, "volume")
        page = get_value_in_tag(ref, "fpage")
        if ref_type == "journal":
            source, vol = fix_journal_name(source, self.journal_mappings)
            if vol:
                volume = vol + volume
        yield label, ref_type, text_ref, ext_link, authors, year, source, volume, page
def get_copyright(self, xml_doc):
    """Return the copyright text, preferring ce:copyright over
    prism:copyright.

    Returns '' on failure (BUGFIX: previously returned an implicit
    None after printing the warning, unlike the sibling getters that
    return '').
    """
    try:
        copyright = get_value_in_tag(xml_doc, "ce:copyright")
        if not copyright:
            copyright = get_value_in_tag(xml_doc, "prism:copyright")
        return copyright
    except Exception:
        print("Can't find copyright", file=sys.stderr)
        return ''
def get_arxiv_id(self, xml):
    """Return the formatted arXiv id found in a custom-meta element.

    Scans every custom-meta; when several carry "arxiv-id" the last
    one wins. Returns None when none is present.
    """
    ext_link = None
    for meta in xml.getElementsByTagName("custom-meta"):
        if get_value_in_tag(meta, "meta-name") == "arxiv-id":
            raw = get_value_in_tag(meta, "meta-value").encode('utf-8')
            ext_link = format_arxiv_id(raw)
    return ext_link
def _get_journal(self):
    """Return the journal title, preferring the abbreviated form.

    Returns the stripped abbrev-journal-title, falling back to
    journal-title; '' (with a warning on stderr) on failure.
    """
    try:
        name = (get_value_in_tag(self.document, 'abbrev-journal-title')
                or get_value_in_tag(self.document, 'journal-title'))
        return name.strip()
    except Exception:
        print("Can't find journal-title", file=sys.stderr)
        return ''
def _get_copyright(self):
    """Return the (holder, year, statement) copyright triple.

    Returns ('', '', '') with a warning on stderr when the lookups fail.
    """
    try:
        return (get_value_in_tag(self.document, 'copyright-holder'),
                get_value_in_tag(self.document, 'copyright-year'),
                get_value_in_tag(self.document, 'copyright-statement'))
    except Exception:
        print("Can't find copyright", file=sys.stderr)
        return '', '', ''
def _get_publition_information(self):
    """Return (journal, volume, issue, year, date, doi, article_id).

    NOTE(review): the "publition" typo is preserved deliberately --
    it is the name callers use. Volume is the mapped journal's volume
    prefix plus the document's own <volume> value; year is taken from
    <copyright-year>.
    """
    raw_journal = self._get_journal()
    date = self._get_date()
    doi = self._get_doi()
    journal, volume = fix_journal_name(raw_journal, self.journal_mappings)
    article_id = get_value_in_tag(self.document, 'elocation-id')
    volume = volume + get_value_in_tag(self.document, 'volume')
    issue = get_value_in_tag(self.document, 'issue')
    year = get_value_in_tag(self.document, 'copyright-year')
    return journal, volume, issue, year, date, doi, article_id
def get_references(self, xml_doc):
    """Yield one parsed reference per ce:bib-reference element.

    In CONSYN mode each inner sb:reference is delegated to
    self._get_ref (or the outer ref itself when there are none).
    Otherwise a flat tuple (label, authors, doi, issue, page, title,
    volume, year, textref, ext_link) is built inline.
    """
    for ref in xml_doc.getElementsByTagName("ce:bib-reference"):
        label = get_value_in_tag(ref, "ce:label")
        if self.CONSYN:
            innerrefs = ref.getElementsByTagName("sb:reference")
            if not innerrefs:
                yield self._get_ref(ref, label)
            for inner in innerrefs:
                yield self._get_ref(inner, label)
        else:
            authors = []
            for author in ref.getElementsByTagName("sb:author"):
                given_name = get_value_in_tag(author, "ce:given-name")
                surname = get_value_in_tag(author, "ce:surname")
                if given_name:
                    name = "%s, %s" % (surname, given_name)
                else:
                    name = surname
                authors.append(name)
            doi = get_value_in_tag(ref, "ce:doi")
            issue = get_value_in_tag(ref, "sb:issue")
            page = get_value_in_tag(ref, "sb:first-page")
            title = get_value_in_tag(ref, "sb:maintitle")
            volume = get_value_in_tag(ref, "sb:volume-nr")
            # Year = first four characters of the sb:date inside the
            # first sb:issue element, when one exists.
            tmp_issues = ref.getElementsByTagName('sb:issue')
            if tmp_issues:
                year = get_value_in_tag(tmp_issues[0], "sb:date")[:4]
            else:
                year = ''
            # textref stays an (empty) node list when no ce:textref
            # element is present -- only converted to text when found.
            textref = ref.getElementsByTagName("ce:textref")
            if textref:
                textref = xml_to_text(textref[0])
            ext_link = format_arxiv_id(self.get_ref_link(ref, 'arxiv'))
            yield (label, authors, doi, issue, page, title, volume,
                   year, textref, ext_link)
def get_keywords(self, xml_doc):
    """Return the non-empty ce:keyword texts of the article.

    Keywords are scoped to the ja:head (or cja:head) element when one
    exists, otherwise taken from the whole document.
    """
    head = (xml_doc.getElementsByTagName("ja:head")
            or xml_doc.getElementsByTagName("cja:head"))
    if head:
        keywords = head[0].getElementsByTagName("ce:keyword")
    else:
        keywords = xml_doc.getElementsByTagName("ce:keyword")
    texts = []
    for keyword in keywords:
        text = get_value_in_tag(keyword, "ce:text")
        if text:
            texts.append(text)
    return texts
def get_authors(self, xml_doc):
    """Return a list of author dicts with resolved affiliations.

    Each dict may carry surname, given_name, initials, orcid, email,
    cross_ref, and affiliation (list of texts). When no author's
    cross_ref matches an affiliation id, affiliations are treated as
    implicit and every affiliation is attached to every author (a
    warning is printed when more than one exists).
    """
    authors = []
    for author in xml_doc.getElementsByTagName("ce:author"):
        tmp = {}
        surname = get_value_in_tag(author, "ce:surname")
        if surname:
            tmp["surname"] = surname
        given_name = get_value_in_tag(author, "ce:given-name")
        if given_name:
            tmp["given_name"] = given_name
        initials = get_value_in_tag(author, "ce:initials")
        if initials:
            tmp["initials"] = initials
        orcid = author.getAttribute('orcid').encode('utf-8')
        if orcid:
            tmp["orcid"] = orcid
        # First e-address of type 'email' (or untyped) wins.
        for email in author.getElementsByTagName("ce:e-address"):
            if email.getAttribute("type").encode('utf-8') in ('email', ''):
                tmp["email"] = xml_to_text(email)
                break
        cross_refs = author.getElementsByTagName("ce:cross-ref")
        if cross_refs:
            tmp["cross_ref"] = [
                cross_ref.getAttribute("refid").encode('utf-8')
                for cross_ref in cross_refs]
        authors.append(tmp)
    affiliations = {}
    for affiliation in xml_doc.getElementsByTagName("ce:affiliation"):
        aff_id = affiliation.getAttribute("id").encode('utf-8')
        # Strip a leading numeric label (e.g. "1 ") from the text.
        text = re.sub(
            r'^(\d+\ ?)', "", get_value_in_tag(affiliation, "ce:textfn"))
        affiliations[aff_id] = text
    implicit_affilations = True
    for author in authors:
        matching_ref = [ref for ref in author.get("cross_ref", [])
                        if ref in affiliations]
        if matching_ref:
            implicit_affilations = False
            # BUGFIX/portability: was "for i in xrange(0, len(...))" --
            # xrange is Python-2-only; iterate the refs directly.
            author["affiliation"] = [affiliations[ref]
                                     for ref in matching_ref]
    if implicit_affilations and len(affiliations) > 1:
        message = "Implicit affiliations are used, "
        message += ("but there's more than one affiliation: "
                    + str(affiliations))
        print(message, file=sys.stderr)
    if implicit_affilations and len(affiliations) >= 1:
        for author in authors:
            author["affiliation"] = list(affiliations.values())
    return authors
def get_authors(self, xml):
    """Return author dicts from Springer <Author> elements.

    Each dict may carry surname, given_name (newlines collapsed to
    spaces), email, affiliations_ids and affiliation (list of texts
    resolved via <Affiliation> ids). When no id matches, affiliations
    are treated as implicit and attached to every author.
    """
    authors = []
    for author in xml.getElementsByTagName("Author"):
        tmp = {}
        surname = get_value_in_tag(author, "FamilyName")
        if surname:
            tmp["surname"] = surname
        given_name = get_value_in_tag(author, "GivenName")
        if given_name:
            tmp["given_name"] = given_name.replace('\n', ' ')
        # NOTE: initials/orcid/cross-ref are not available in this
        # Springer schema (the old commented-out lookups were removed).
        for email in author.getElementsByTagName("Email"):
            if email.getAttribute("type").encode('utf-8') in ('email', ''):
                tmp["email"] = xml_to_text(email)
                break
        tmp["affiliations_ids"] = [
            aid.encode('utf-8')
            for aid in author.getAttribute("AffiliationIDS").split()]
        authors.append(tmp)
    affiliations = {}
    for affiliation in xml.getElementsByTagName("Affiliation"):
        aff_id = affiliation.getAttribute("ID").encode('utf-8')
        affiliations[aff_id] = xml_to_text(affiliation, delimiter=', ')
    implicit_affilations = True
    for author in authors:
        matching_ref = [ref for ref in author.get("affiliations_ids")
                        if ref in affiliations]
        if matching_ref:
            implicit_affilations = False
            # BUGFIX/portability: was "for i in xrange(0, len(...))" --
            # xrange is Python-2-only; iterate the refs directly.
            author["affiliation"] = [affiliations[ref]
                                     for ref in matching_ref]
    if implicit_affilations and len(affiliations) > 1:
        # Portability: was the Python-2-only "print >> sys.stderr".
        sys.stderr.write(
            "Implicit affiliations are used, but there's more than one "
            "affiliation: %s\n" % affiliations)
    if implicit_affilations and len(affiliations) >= 1:
        for author in authors:
            author["affiliation"] = list(affiliations.values())
    return authors
def get_keywords(self, xml_doc):
    """Return the non-empty ce:keyword texts of the article.

    Keywords are scoped to the ja:head (or cja:head) element when one
    exists, otherwise taken from the whole document.
    """
    head = (xml_doc.getElementsByTagName("ja:head")
            or xml_doc.getElementsByTagName("cja:head"))
    if head:
        keywords = head[0].getElementsByTagName("ce:keyword")
    else:
        keywords = xml_doc.getElementsByTagName("ce:keyword")
    texts = []
    for keyword in keywords:
        text = get_value_in_tag(keyword, "ce:text")
        if text:
            texts.append(text)
    return texts
def _get_publication_information(self):
    """Return (journal, volume, issue, year, date, doi, page, fpage,
    lpage).

    Volume is the mapped journal's volume prefix plus the document's
    own <volume> value; year is the first four characters of the date.
    """
    raw_journal = self._get_journal()
    date = self._get_date()
    doi = self._get_doi()
    issue = get_value_in_tag(self.document, 'issue')
    journal, volume = fix_journal_name(raw_journal, self.journal_mappings)
    volume = volume + get_value_in_tag(self.document, 'volume')
    page = get_value_in_tag(self.document, 'elocation-id')
    fpage = get_value_in_tag(self.document, 'fpage')
    lpage = get_value_in_tag(self.document, 'lpage')
    year = date[:4]
    return journal, volume, issue, year, date, doi, page, fpage, lpage
def author_dic_from_xml(author):
    """Build a dict describing one author element, omitting None values.

    Keys: surname, given_name, initials, orcid, email (first e-address
    of type 'email' or untyped, None when absent) and cross_ref (refid
    list). Empty strings are kept -- only None values are dropped.
    """
    email = None
    for candidate in author.getElementsByTagName("ce:e-address"):
        if unicode(candidate.getAttribute("type")) in ('email', ''):
            email = xml_to_text(candidate)
            break
    fields = {
        'surname': get_value_in_tag(author, "ce:surname"),
        'given_name': get_value_in_tag(author, "ce:given-name"),
        'initials': get_value_in_tag(author, "ce:initials"),
        'orcid': unicode(author.getAttribute('orcid')),
        'email': email,
        'cross_ref': [unicode(ref.getAttribute("refid"))
                      for ref in author.getElementsByTagName("ce:cross-ref")],
    }
    result = {}
    for key, value in fields.items():
        if value is not None:
            result[key] = value
    return result
def get_references(self, xml):
    """Return reference tuples from Springer <Citation> elements.

    Structured citations yield (label, authors, doi, issue, page,
    title, volume, year); unstructured ones yield the raw
    BibUnstructured text followed by seven empty strings.
    """
    references = []
    for reference in xml.getElementsByTagName("Citation"):
        if not reference.getElementsByTagName("BibArticle"):
            # No structured data -- keep only the raw citation text.
            references.append(
                (get_value_in_tag(reference, "BibUnstructured"),
                 '', '', '', '', '', '', ''))
            continue
        label = get_value_in_tag(reference, "ArticleTitle")
        authors = []
        for author in reference.getElementsByTagName("BibAuthorName"):
            initials = get_value_in_tag(author, "Initials")
            family = get_value_in_tag(author, "FamilyName")
            if initials:
                authors.append("%s, %s" % (family, initials))
            else:
                authors.append(family)
        doi = ""
        for occurrence in reference.getElementsByTagName("Occurrence"):
            if occurrence.getAttribute("Type") == "DOI":
                doi = xml_to_text(occurrence)
        ## What is it exactly?
        # issue = get_value_in_tag(reference, "sb:issue")
        issue = ""
        page = get_value_in_tag(reference, "FirstPage")
        title = get_value_in_tag(reference, "JournalTitle")
        volume = get_value_in_tag(reference, "VolumeID")
        year = get_value_in_tag(reference, "Year")
        references.append((label, authors, doi, issue, page,
                           title, volume, year))
    return references
def get_publication_date(self, xml):
    """Return 'YYYY-MM-DD' from ArticleInfo/ArticleHistory/OnlineDate.

    Returns '' when the OnlineDate is absent or its parts are not
    integers. Missing ArticleInfo/ArticleHistory still raises
    IndexError, unchanged from the original.
    """
    article_info = xml.getElementsByTagName("ArticleInfo")[0]
    article_history = article_info.getElementsByTagName("ArticleHistory")[0]
    online_date = article_history.getElementsByTagName("OnlineDate")
    if online_date:
        online_date = online_date[0]
        year = get_value_in_tag(online_date, "Year")
        month = get_value_in_tag(online_date, "Month")
        day = get_value_in_tag(online_date, "Day")
        try:
            return "%04d-%02d-%02d" % (int(year), int(month), int(day))
        # Portability fix: "except Exception, err" and "print >>" are
        # Python-2-only syntax; "as" works on Python 2.6+ and 3.
        except Exception as err:
            sys.stderr.write(
                "Can't reliably extract the publication date: %s\n" % err)
    return ""
def get_publication_date(self, xml):
    """Return 'YYYY-MM-DD' from ArticleInfo/ArticleHistory/OnlineDate.

    Unlike the sibling variant, any failure (missing elements or
    non-integer parts) is caught and '' is returned after a warning.
    """
    try:
        article_info = xml.getElementsByTagName("ArticleInfo")[0]
        article_history = article_info.getElementsByTagName(
            "ArticleHistory")[0]
        online_date = article_history.getElementsByTagName("OnlineDate")
        if online_date:
            online_date = online_date[0]
            year = get_value_in_tag(online_date, "Year")
            month = get_value_in_tag(online_date, "Month")
            day = get_value_in_tag(online_date, "Day")
            return "%04d-%02d-%02d" % (int(year), int(month), int(day))
    # Portability fix: "except Exception, err" and "print >>" are
    # Python-2-only syntax; "as" works on Python 2.6+ and 3.
    except Exception as err:
        sys.stderr.write(
            "Can't reliably extract the publication date: %s\n" % err)
    return ""
def _get_journal(self):
    """Return the normalised journal title, '' on failure.

    Prefers abbrev-journal-title, maps through journal_mappings when
    an upper-cased match exists, and collapses '. ' to '.'.
    """
    try:
        title = get_value_in_tag(self.document, 'abbrev-journal-title')
        if not title:
            title = get_value_in_tag(self.document, 'journal-title')
        try:
            title = self.journal_mappings[title.upper()]
        except KeyError:
            pass
        title = title.replace('. ', '.')
        return title
    except Exception:
        # Portability fix: "print >> sys.stderr" is Python-2-only.
        sys.stderr.write("Can't find journal-title\n")
        return ''
def get_keywords(self, xml_doc):
    """Return ce:keyword texts of the document.

    CONSYN mode scopes the search to the first ja:head element. On any
    failure a warning is printed and None is returned (implicitly).
    """
    if self.CONSYN:
        try:
            head = xml_doc.getElementsByTagName("ja:head")[0]
            return [get_value_in_tag(keyword, "ce:text")
                    for keyword in head.getElementsByTagName("ce:keyword")]
        except Exception:
            print("Can't find keywords", file=sys.stderr)
    else:
        try:
            return [get_value_in_tag(keyword, "ce:text")
                    for keyword in
                    xml_doc.getElementsByTagName("ce:keyword")]
        except Exception:
            print("Can't find keywords", file=sys.stderr)
def author_pair(a):
    """Return (name, affiliation) for one author element.

    Name is "Last, First[ Middle]". Relies on an ``affiliations``
    mapping from the enclosing scope; a missing AffiliationID element
    or an unknown label yields an empty affiliation.
    """
    surname = get_value_in_tag(a, "LastName")
    first_name = get_value_in_tag(a, "FirstName")
    middle_name = get_value_in_tag(a, "MiddleName")
    if middle_name:
        name = "%s, %s %s" % (surname, first_name, middle_name)
    else:
        name = "%s, %s" % (surname, first_name)
    affiliation = ""
    try:
        label = a.getElementsByTagName(
            "AffiliationID")[0].getAttribute("Label")
        affiliation = affiliations[label]
    except (IndexError, KeyError):
        pass
    return name, affiliation
def check_records(records):
    """Bibcheck plugin: align each Hindawi record's 980__c subfield
    with the article's <subject> tag (Editorial, Erratum, ...).

    For known article-variant subjects the existing 980__c is amended
    (or added when missing); Review/Research/Retraction articles are
    left alone; any other subject raises so the batch fails loudly.
    """
    for record in records:
        ## Stupid hack because bibcheck filters does not work as expected
        if record_get_field_value(record, '980', code='b') == "Hindawi":
            record.warn("Working on this record")
            recdoc = BibRecDocs(int(record.record_id))
            doc = recdoc.get_bibdoc(recdoc.get_bibdoc_names()[0])
            try:
                xml_file = open(doc.get_file("xml").get_full_path())
            except:
                record.warn("No document can be found")
                continue
            xml2 = xml.dom.minidom.parseString(xml_file.read())
            subject = get_value_in_tag(xml2, "subject")
            if subject in ["Editorial", "Erratum", "Corrigendum",
                           "Addendum", "Letter to the Editor"]:
                field = record_get_field_value(record, '980', code='c')
                if field:
                    if field in ['ERRATUM', 'ADDENDUM', 'EDITORIAL',
                                 'CORRIGENDUM', 'LETTER TO THE EDITOR']:
                        # Existing variant marker: overwrite in place.
                        for position, value in record.iterfield('980__c'):
                            record.amend_field(position, subject.upper())
                            break
                    else:
                        # 980__c exists but is something else: add the
                        # subject as a new 'c' subfield.
                        for position, value in record.iterfield('980__%'):
                            record.add_subfield(position, 'c',
                                                subject.upper())
                            break
                else:
                    # No 980__c at all: attach one to the first 980.
                    for position, value in record.iterfield('980__%'):
                        record.add_subfield(position, 'c', subject.upper())
                        break
            elif subject not in ["Review Article", "Research Article",
                                 "Retraction"]:
                raise Exception(
                    "This subject: %s does not exit in SCOAP3 system"
                    % (subject,))
def _add_group_affiliation(self, author, xml_author):
    """Attach every ce:affiliation text found on the author's parent
    node (the group) to the given author dict."""
    parent = xml_author.parentNode
    texts = []
    for aff in parent.getElementsByTagName('ce:affiliation'):
        texts.append(get_value_in_tag(aff, "ce:textfn"))
    return self._add_affiliations_to_author(author, texts)
def get_references(self, xml_doc):
    """Yield one parsed reference per ce:bib-reference element.

    Each inner sb:reference is delegated to self._get_ref with the
    outer label; when none exist, the outer ref itself is used.
    """
    for ref in xml_doc.getElementsByTagName("ce:bib-reference"):
        label = get_value_in_tag(ref, "ce:label")
        inner_refs = ref.getElementsByTagName("sb:reference")
        if inner_refs:
            for entry in inner_refs:
                yield self._get_ref(entry, label)
        else:
            yield self._get_ref(ref, label)
def get_publication_information(self, xml):
    """Validate that an ArticleDOI is present and return it.

    BUGFIX: the original computed ``doi`` but never returned it
    (callers always got None). Raises ValueError when the DOI is
    empty, after logging; any lookup error is logged and re-raised.
    Also fixes the Python-2-only "except Exception, err" syntax.
    """
    try:
        doi = get_value_in_tag(xml, "ArticleDOI")
        if not doi:
            raise ValueError("DOI not found")
    except Exception as err:
        sys.stderr.write("Can't find doi: %s\n" % err)
        raise
    return doi
def get_doi(self, xml):
    """Return the ArticleDOI value; '' when missing or on error.

    BUGFIX: the original never returned ``doi`` (a getter that always
    produced None). Also fixes the Python-2-only "except Exception,
    err" and "print >>" syntax.
    """
    doi = ""
    try:
        doi = get_value_in_tag(xml, "ArticleDOI")
        if not doi:
            sys.stderr.write("DOI not found\n")
    except Exception as err:
        sys.stderr.write("Can't find doi: %s\n" % err)
    return doi
def author_pair(a):
    """Return (name, affiliation) for one author element.

    Name is "Last, First[ Middle]". Relies on an ``affiliations``
    mapping from the enclosing scope; a missing AffiliationID element
    or an unknown label yields an empty affiliation.
    """
    surname = get_value_in_tag(a, 'LastName')
    first_name = get_value_in_tag(a, 'FirstName')
    middle_name = get_value_in_tag(a, 'MiddleName')
    if middle_name:
        name = '%s, %s %s' % (surname, first_name, middle_name)
    else:
        name = '%s, %s' % (surname, first_name)
    affiliation = ''
    try:
        label = a.getElementsByTagName(
            'AffiliationID')[0].getAttribute('Label')
        affiliation = affiliations[label]
    except (IndexError, KeyError):
        pass
    return name, affiliation
def get_license(self, xml_doc):
    """Return a (license, license_url) pair.

    license_url is the last oa:userLicense value seen; license becomes
    'CC-BY-3.0' as soon as any URL starts with the CC-BY 3.0 prefix
    (and is not reset afterwards).
    """
    license = ''
    license_url = ''
    cc_by_prefix = 'http://creativecommons.org/licenses/by/3.0'
    for info in xml_doc.getElementsByTagName('oa:openAccessInformation'):
        license_url = get_value_in_tag(info, 'oa:userLicense')
        if license_url.startswith(cc_by_prefix):
            license = 'CC-BY-3.0'
    return license, license_url
def get_identifier(self):
    """Return the identifier of the paper corresponding to this record,
    containing the conference in which it was published and the
    proceeding number; '' on failure.

    Portability fix: replaces the Python-2-only "print >> sys.stderr"
    statement with sys.stderr.write.
    """
    try:
        return get_value_in_tag(self.document, 'identifier')
    except Exception:
        sys.stderr.write("Can't find identifier\n")
        return ''
def _get_publisher(self):
    """Return the publisher name, normalising 'Sissa Medialab' to
    'SISSA'; '' on failure.

    Portability fix: replaces the Python-2-only "print >> sys.stderr"
    statement with sys.stderr.write.
    """
    try:
        publisher = get_value_in_tag(self.document, 'pex-dc:publisher')
        if publisher == 'Sissa Medialab':
            publisher = 'SISSA'
        return publisher
    except Exception:
        sys.stderr.write("Can't find publisher\n")
        return ''
def _get_copyright(self):
    """Return the rights statement, normalising the CC BY-NC-SA long
    form to 'CC-BY-NC-SA'; '' on failure.

    Portability fix: replaces the Python-2-only "print >> sys.stderr"
    statement with sys.stderr.write.
    """
    try:
        record_copyright = get_value_in_tag(self.document, 'pex-dc:rights')
        if record_copyright == \
                'Creative Commons Attribution-NonCommercial-ShareAlike':
            record_copyright = 'CC-BY-NC-SA'
        return record_copyright
    except Exception:
        sys.stderr.write("Can't find copyright\n")
        return ''
def get_license(self, xml_doc):
    """Return a (license, license_url) pair.

    license_url is the last oa:userLicense value seen; license becomes
    'CC-BY-3.0' as soon as any URL starts with the CC-BY 3.0 prefix
    (and is not reset afterwards).
    """
    license = ''
    license_url = ''
    cc_by_prefix = 'http://creativecommons.org/licenses/by/3.0'
    for info in xml_doc.getElementsByTagName('oa:openAccessInformation'):
        license_url = get_value_in_tag(info, 'oa:userLicense')
        if license_url.startswith(cc_by_prefix):
            license = 'CC-BY-3.0'
    return license, license_url
def get_publication_information(self, xml_doc, path='', timeout=60):
    """Return a 9-tuple (journal, issn, volume, issue, first_page,
    last_page, year, start_date, doi).

    In CONSYN mode everything is read from the prism:* metadata; when
    the volume is missing it is scraped best-effort from the article's
    ScienceDirect page (``path``'s last segment is the PII). Outside
    CONSYN mode the cached self._dois entry for the DOI is used, with
    empty fields when unknown.
    """
    if not self.CONSYN:
        doi = self._get_doi(xml_doc)
        try:
            return self._dois[doi] + (doi, )
        except KeyError:
            return ('', '', '', '', '', '', '', '', doi)
    publication = get_value_in_tag(xml_doc, "prism:publicationName")
    doi = get_value_in_tag(xml_doc, "prism:doi")
    issn = get_value_in_tag(xml_doc, "prism:issn")
    issue = get_value_in_tag(xml_doc, "prism:number")
    first_page = get_value_in_tag(xml_doc, "prism:startingPage")
    last_page = get_value_in_tag(xml_doc, "prism:endingPage")
    journal = publication.split(",")[0]
    journal, volume = fix_journal_name(journal, self.journal_mappings)
    try:
        # A "Section X" suffix in the publication name may carry the
        # volume when the mapping did not provide one.
        vol = publication.split(",")[1].strip()
        if vol.startswith("Section"):
            vol = vol[7:].strip()
        if vol and not volume:
            volume = vol
    except IndexError:
        pass
    vol = get_value_in_tag(xml_doc, "prism:volume")
    # BUGFIX: was 'vol is "" and path is not ""' -- identity comparison
    # against string literals only works by interning accident; use
    # value comparison instead.
    if vol == "" and path != "":
        # if volume is not present try to harvest it
        try:
            session = requests.session()
            # NOTE(review): no '/' between 'pii' and the identifier --
            # looks suspicious but kept as-is; confirm against a real
            # ScienceDirect URL before changing.
            url = 'http://www.sciencedirect.com/science/article/pii'\
                + path.split('/')[-1]
            headers = {'user-agent': make_user_agent()}
            r = session.get(url, headers=headers, timeout=timeout)
            parsed_html = BeautifulSoup(r.text)
            info = parsed_html.body.find('p', attrs={
                'class': 'volIssue'
            }).text.split()
            for s in info:
                # The en-dash (u'\u2013' mangled as \xe2...\x93) marks
                # the "first-last" page range token.
                if unicode(s).find(u'\xe2') > 0:
                    first_page = s.rsplit(u'\xe2')[0]
                    last_page = s.rsplit(u'\x93')[1]
            if info[1].lower() != 'online':
                vol = info[1][:-1]
        except Exception:
            # Best-effort scrape only. BUGFIX: narrowed from a bare
            # "except:" which also swallowed KeyboardInterrupt.
            pass
    if vol:
        volume += vol
    start_date = self.get_publication_date(xml_doc)
    year = start_date.split("-")[0]
    doi = get_value_in_tag(xml_doc, "ce:doi")
    return (journal, issn, volume, issue, first_page, last_page,
            year, start_date, doi)
def get_arxiv_id(self, xml):
    """Return 'arXiv:<id>' taken from the first ArticleNote's
    RefSource, or '' when absent or not matching RE_ARXIV_ID."""
    notes = xml.getElementsByTagName('ArticleNote')
    if not notes:
        return ""
    candidate = get_value_in_tag(notes[0], "RefSource")
    if RE_ARXIV_ID.match(candidate):
        return "arXiv:%s" % candidate
    return ""
def find_affiliations(xml_doc):
    """Map affiliation id -> affiliation text for every ce:affiliation.

    Prefers the structured _affiliation_from_sa_field form; on any
    failure falls back to ce:textfn with a leading numeric label
    stripped.
    """
    tmp = {}
    for aff in xml_doc.getElementsByTagName("ce:affiliation"):
        aff_id = aff.getAttribute("id").encode('utf-8')
        try:
            tmp[aff_id] = _affiliation_from_sa_field(aff)
        # BUGFIX: narrowed from a bare "except:" which also swallowed
        # KeyboardInterrupt/SystemExit.
        except Exception:
            tmp[aff_id] = re.sub(r'^(\d+\ ?)', "",
                                 get_value_in_tag(aff, "ce:textfn"))
    return tmp
def _affiliation_from_sa_field(self, affiliation):
    """Return the affiliation text, preferring structured
    sa:affiliation content.

    Falls back to ce:textfn with any leading numeric label stripped;
    raises IndexError (used as a sentinel by callers) when neither
    source yields text.
    """
    structured = affiliation.getElementsByTagName('sa:affiliation')
    if structured:
        return xml_to_text(structured[0], ', ')
    fallback = re.sub(r'^(\d+\ ?)', "",
                      get_value_in_tag(affiliation, "ce:textfn"))
    if not fallback:
        raise IndexError
    return fallback
def _get_license(self):
    """Return (license, license_type, license_url) from <license> tags.

    When several <license> tags exist the last one wins. license_url
    is the first xlink:href of the tag's ext-link elements (the raw
    lookup result is kept when it is empty).
    """
    license = ''
    license_type = ''
    license_url = ''
    for tag in self.document.getElementsByTagName('license'):
        license = get_value_in_tag(tag, 'ext-link')
        license_type = tag.getAttribute('license-type')
        license_url = get_attribute_in_tag(tag, 'ext-link', 'xlink:href')
        if license_url:
            license_url = license_url[0]
    return license, license_type, license_url
def _get_authors(self):
    """Return (name, affiliations, emails, collaborations) tuples for
    each author contrib.

    Name is safe_title("Surname, Initials"); the three lists hold the
    texts of the contrib's aff, email and collab child elements.
    """
    authors = []
    for contrib in self.document.getElementsByTagName('contrib'):
        if contrib.getAttribute('contrib-type') != 'author':
            continue
        family = get_value_in_tag(contrib, 'surname')
        given = collapse_initials(get_value_in_tag(contrib, 'given-names'))
        name = safe_title('%s, %s' % (family, given))
        affiliations = [xml_to_text(aff)
                        for aff in contrib.getElementsByTagName('aff')]
        emails = [xml_to_text(email)
                  for email in contrib.getElementsByTagName('email')]
        collaborations = [
            xml_to_text(collab)
            for collab in contrib.getElementsByTagName("collab")]
        authors.append((name, affiliations, emails, collaborations))
    return authors