def get_publication_date(self, xml_doc):
    """Return the best effort start_date.

    Tries, in order:
    1. ``oa:openAccessEffective`` (ISO timestamp) -> "YYYY-MM-DD";
    2. ``prism:coverDate``, else ``prism:coverDisplayDate`` parsed via
       dateutil -> "YYYY-MM-DD", or "YYYY-MM" when the display date did
       not include a day; '' when unparsable;
    3. a bare "YYYYMMDD"/"YYYYMM" coverDate reformatted with dashes.
    """
    start_date = get_value_in_tag(xml_doc, 'oa:openAccessEffective')
    if start_date:
        start_date = datetime.datetime.strptime(
            start_date, "%Y-%m-%dT%H:%M:%SZ"
        )
        return start_date.strftime("%Y-%m-%d")
    start_date = get_value_in_tag(xml_doc, "prism:coverDate")
    if not start_date:
        start_date = get_value_in_tag(xml_doc, "prism:coverDisplayDate")
        import dateutil.parser
        try:
            date = dateutil.parser.parse(start_date)
        except ValueError:
            return ''
        # Special case where we ignore the deduced day form dateutil
        # in case it was not given in the first place.
        if len(start_date.split(" ")) == 3:
            return date.strftime("%Y-%m-%d")
        else:
            return date.strftime("%Y-%m")
    else:
        # BUGFIX: was "len(start_date) is 8" / "is 6" -- identity
        # comparison against int literals; use == for value equality.
        if len(start_date) == 8:
            start_date = time.strftime(
                '%Y-%m-%d', time.strptime(start_date, '%Y%m%d'))
        elif len(start_date) == 6:
            start_date = time.strftime(
                '%Y-%m', time.strptime(start_date, '%Y%m'))
        return start_date
def _author_dic_from_xml(self, author):
    """Build a dict describing one author element.

    Keys (present only when non-empty): surname, given_name, initials,
    orcid, email (first e-address of type 'email' or untyped), and
    cross_ref (list of refid attributes).
    """
    info = {}
    for key, tag in (("surname", "ce:surname"),
                     ("given_name", "ce:given-name"),
                     ("initials", "ce:initials")):
        value = get_value_in_tag(author, tag)
        if value:
            info[key] = value
    orcid = author.getAttribute('orcid').encode('utf-8')
    if orcid:
        info["orcid"] = orcid
    for email in author.getElementsByTagName("ce:e-address"):
        if email.getAttribute("type").encode('utf-8') in ('email', ''):
            info["email"] = xml_to_text(email)
            break
    refs = author.getElementsByTagName("ce:cross-ref")
    if refs:
        info["cross_ref"] = [ref.getAttribute("refid").encode('utf-8')
                             for ref in refs]
    return info
def _get_authors(self):
    """Collect (name, affiliation_ids, correspondence_ids) per author.

    Name is "Surname, Initials". 'aff' xrefs are split into affiliation
    ids (starting with 'a') and correspondence ids (starting with 'n');
    'corresp'/'author-notes' xrefs always feed the correspondence list.
    """
    authors = []
    for contrib in self.document.getElementsByTagName('contrib'):
        # Springer puts colaborations in additional "contrib" tag so to
        # avoid having fake author with all affiliations we skip "contrib"
        # tag with "contrib" subtags.
        if contrib.getElementsByTagName('contrib'):
            continue
        if contrib.getElementsByTagName('collab'):
            continue
        if contrib.getAttribute('contrib-type') != 'author':
            continue
        family = get_value_in_tag(contrib, 'surname')
        given = collapse_initials(get_value_in_tag(contrib, 'given-names'))
        affiliations = []
        corresp = []
        for xref in contrib.getElementsByTagName('xref'):
            ref_type = xref.getAttribute('ref-type')
            rids = xref.getAttribute('rid').split()
            if ref_type == 'aff':
                for rid in rids:
                    if rid.lower().startswith('a'):
                        affiliations.append(rid)
                    elif rid.lower().startswith('n'):
                        corresp.append(rid)
            elif ref_type in ('corresp', 'author-notes'):
                corresp.extend(rids)
        authors.append(('%s, %s' % (family, given), affiliations, corresp))
    return authors
def _extract_date(date):
    """Return 'YYYY-MM-DD' built from a date element.

    Month and day are zero-padded to two digits; a missing or '00'
    month/day is normalised to '01'.
    """
    year = get_value_in_tag(date, 'year')
    parts = []
    for tag in ('month', 'day'):
        value = get_value_in_tag(date, tag).zfill(2)
        parts.append('01' if value == '00' else value)
    return '%s-%s-%s' % (year, parts[0], parts[1])
def _get_references(self):
    """Yield one tuple per bibliography <ref> element.

    Yields (label, ref_type, text_ref, ext_link, authors, year,
    source, volume, page). label keeps only the digits of the ref id.
    """
    for ref in self.document.getElementsByTagName('ref'):
        label = ref.getAttribute('id')
        label = sub(r'\D', '', label)
        text_ref = ''
        ext_link = ''
        # BUGFIX: ref_type was only bound inside the loop below, so a
        # <ref> without any <mixed-citation> raised NameError at the
        # 'journal' check further down. Default it explicitly.
        ref_type = ''
        for mixed in ref.getElementsByTagName('mixed-citation'):
            ref_type = mixed.getAttribute('publication-type')
            if ref_type == 'thesis':
                text_ref = get_value_in_tag(ref, 'mixed-citation')
            elif ref_type == 'conf-proc':
                text_ref = get_value_in_tag(ref, 'mixed-citation')
            elif ref_type == 'other' or ref_type == 'web':
                text_ref = get_value_in_tag(ref, 'mixed-citation')
                ext_link = get_value_in_tag(mixed, 'ext-link')
            elif ref_type == 'book':
                text_ref = xml_to_text(mixed)
        authors = []
        for auth in ref.getElementsByTagName('string-name'):
            surname = get_value_in_tag(auth, 'surname')
            given_names = get_value_in_tag(auth, 'given-names')
            given_names = collapse_initials(given_names)
            authors.append('%s, %s' % (surname, given_names))
        year = get_value_in_tag(ref, 'year')
        source = get_value_in_tag(ref, 'source')
        volume = get_value_in_tag(ref, 'volume')
        page = get_value_in_tag(ref, 'fpage')
        if ref_type == 'journal':
            source, vol = fix_journal_name(source, self.journal_mappings)
            if vol:
                volume = vol + volume
        yield (label, ref_type, text_ref, ext_link, authors,
               year, source, volume, page)
def _get_authors(self):
    """Return (name, affiliation_text) pairs for author contribs.

    Affiliation texts are indexed by their element id, with the leading
    label token dropped; an unknown or empty rid maps to ''.
    """
    affiliations = {}
    for aff in self.document.getElementsByTagName('aff'):
        text = xml_to_text(aff)
        # Drop the leading token (the affiliation's label/marker).
        affiliations[aff.getAttribute('id')] = ' '.join(text.split()[1:])
    authors = []
    for contrib in self.document.getElementsByTagName('contrib'):
        if contrib.getAttribute('contrib-type') != 'author':
            continue
        rid = ''
        for xref in contrib.getElementsByTagName('xref'):
            if xref.getAttribute('ref-type') == 'aff':
                # Last 'aff' xref wins; keep only the first id token.
                rid = xref.getAttribute('rid')
                if len(rid.split()) > 1:
                    rid = rid.split()[0]
        given = collapse_initials(get_value_in_tag(contrib, 'given-names'))
        surname = get_value_in_tag(contrib, 'surname')
        name = "%s, %s" % (surname, given)
        authors.append((name, affiliations.get(rid, '')))
    return authors
def get_publication_information(self, xml):
    """Return (journal, issn, volume, issue, first_page, last_page,
    year, doi) extracted from the article-meta element.

    Raises the original lookup error when article-meta is missing.
    """
    jid = get_value_in_tag(xml, "journal-title")
    journal = ""
    if "European Physical Journal" in jid:
        journal = "EPJC"
    try:
        art = xml.getElementsByTagName('article-meta')[0]
    except IndexError as err:
        register_exception()
        # BUGFIX: the original logged and then fell through with 'art'
        # unbound, crashing below with a misleading NameError. Re-raise
        # the real error instead. (Also replaces the Python-2-only
        # "print >> sys.stderr" statement.)
        sys.stderr.write("ERROR: XML corrupted: %s\n" % err)
        raise
    except Exception as err:
        register_exception()
        sys.stderr.write("ERROR: Exception captured: %s\n" % err)
        raise
    issn = self.get_issn(art)
    volume = get_value_in_tag(art, "volume")
    issue = get_value_in_tag(art, "issue")
    year = self.get_date(art)
    first_page = get_value_in_tag(art, "fpage")
    last_page = get_value_in_tag(art, "lpage")
    doi = self.get_doi(art)
    return (journal, issn, volume, issue, first_page, last_page, year, doi)
def get_publication_date(self, xml_doc):
    """Return the best effort start_date.

    Prefers prism:coverDate; falls back to prism:coverDisplayDate
    (parsed with dateutil, day dropped when absent) and finally to
    oa:openAccessEffective. '' when the display date is unparsable.
    """
    start_date = get_value_in_tag(xml_doc, "prism:coverDate")
    if not start_date:
        start_date = get_value_in_tag(xml_doc, "prism:coverDisplayDate")
        if not start_date:
            start_date = get_value_in_tag(xml_doc, 'oa:openAccessEffective')
            if start_date:
                start_date = datetime.datetime.strptime(
                    start_date, "%Y-%m-%dT%H:%M:%SZ")
                return start_date.strftime("%Y-%m-%d")
        import dateutil.parser
        try:
            date = dateutil.parser.parse(start_date)
        except ValueError:
            return ''
        # Special case where we ignore the deduced day form dateutil
        # in case it was not given in the first place.
        if len(start_date.split(" ")) == 3:
            return date.strftime("%Y-%m-%d")
        else:
            return date.strftime("%Y-%m")
    else:
        # BUGFIX: was "len(start_date) is 8" / "is 6" -- identity
        # comparison against int literals; use == for value equality.
        if len(start_date) == 8:
            start_date = time.strftime('%Y-%m-%d',
                                       time.strptime(start_date, '%Y%m%d'))
        elif len(start_date) == 6:
            start_date = time.strftime('%Y-%m',
                                       time.strptime(start_date, '%Y%m'))
        return start_date
def convert_record(record, response_date, request):
    """Convert one OAI-PMH <record> from Hindawi into MARCXML.

    Returns a (marcxml_or_None, is_new) pair:
    - (None, True) when the record is a deletion we never held;
    - (xml, False) for a deletion of a known record (marked DELETED);
    - (xml, is_new) otherwise, with every incoming datafield copied
      verbatim.
    """
    header = record.getElementsByTagName("header")[0]
    oai_identifier = get_value_in_tag(header, "identifier")
    datestamp = get_value_in_tag(header, "datestamp")
    status = header.getAttribute("status").encode('utf8')
    rec = create_record()
    # Provenance (035) field: identifier, harvest request URL, source,
    # datestamp, response date, format and a 'false' tombstone flag.
    record_add_field(rec, tag="035", subfields=[('a', oai_identifier), ('u', request), ('9', 'Hindawi'), ('d', datestamp), ('h', response_date), ('m', 'marc21'), ('t', 'false')])
    # A record is "new" unless we already hold it under this OAI id.
    new = True
    if find_records_from_extoaiid(oai_identifier, 'Hindawi'):
        new = False
    if status == 'deleted':
        if new:
            ## deleting a record we didn't have? Who cares :-)
            return None, True
        else:
            record_add_field(rec, tag="980", subfields=[('a', 'SCOAP3'), ('b', 'Hindawi'), ('c', 'DELETED')])
            return record_xml_output(rec), False
    # Non-deleted record: copy every datafield as-is, defaulting blank
    # indicators to a single space.
    for datafield in record.getElementsByTagName("datafield"):
        tag = datafield.getAttribute("tag").encode('utf-8')
        ind1 = datafield.getAttribute("ind1").encode('utf-8') or ' '
        ind2 = datafield.getAttribute("ind2").encode('utf-8') or ' '
        subfields = []
        for subfield in datafield.getElementsByTagName("subfield"):
            code = subfield.getAttribute("code").encode('utf-8')
            value = xml_to_text(subfield)
            subfields.append((code, value))
        record_add_field(rec, tag=tag, ind1=ind1, ind2=ind2, subfields=subfields)
    return record_xml_output(rec), new
def _get_authors(self):
    """Collect (name, affiliation_ids, correspondence_ids) per author.

    Name is "Surname, Initials". 'aff' xrefs are split into affiliation
    ids (starting with 'a') and correspondence ids (starting with 'n');
    'corresp'/'author-notes' xrefs always feed the correspondence list.
    """
    authors = []
    for contrib in self.document.getElementsByTagName('contrib'):
        # Springer puts colaborations in additional "contrib" tag so to
        # avoid having fake author with all affiliations we skip "contrib"
        # tag with "contrib" subtags.
        if contrib.getElementsByTagName('contrib'):
            continue
        if contrib.getElementsByTagName('collab'):
            continue
        if contrib.getAttribute('contrib-type') != 'author':
            continue
        family = get_value_in_tag(contrib, 'surname')
        given = collapse_initials(get_value_in_tag(contrib, 'given-names'))
        affiliations = []
        corresp = []
        for xref in contrib.getElementsByTagName('xref'):
            ref_type = xref.getAttribute('ref-type')
            rids = xref.getAttribute('rid').split()
            if ref_type == 'aff':
                for rid in rids:
                    if rid.lower().startswith('a'):
                        affiliations.append(rid)
                    elif rid.lower().startswith('n'):
                        corresp.append(rid)
            elif ref_type in ('corresp', 'author-notes'):
                corresp.extend(rids)
        authors.append(('%s, %s' % (family, given), affiliations, corresp))
    return authors
def get_arxiv_id(self, xml):
    """Return the formatted arXiv id found in a custom-meta element.

    Scans every custom-meta; when several carry "arxiv-id" the last
    one wins. Returns None when none is present.
    """
    ext_link = None
    for meta in xml.getElementsByTagName("custom-meta"):
        if get_value_in_tag(meta, "meta-name") == "arxiv-id":
            raw = get_value_in_tag(meta, "meta-value").encode('utf-8')
            ext_link = format_arxiv_id(raw)
    return ext_link
def _get_references(self):
    """Yield one tuple per bibliography <ref> element.

    Yields (label, ref_type, text_ref, ext_link, authors, year,
    source, volume, page). label keeps only the digits of the ref id.
    """
    for ref in self.document.getElementsByTagName("ref"):
        label = ref.getAttribute("id")
        label = sub(r"\D", "", label)
        text_ref = ""
        ext_link = ""
        # BUGFIX: ref_type was only bound inside the loop below, so a
        # <ref> without any <mixed-citation> raised NameError at the
        # "journal" check further down. Default it explicitly.
        ref_type = ""
        for mixed in ref.getElementsByTagName("mixed-citation"):
            ref_type = mixed.getAttribute("publication-type")
            if ref_type == "thesis":
                text_ref = get_value_in_tag(ref, "mixed-citation")
            elif ref_type == "conf-proc":
                text_ref = get_value_in_tag(ref, "mixed-citation")
            elif ref_type == "other" or ref_type == "web":
                text_ref = get_value_in_tag(ref, "mixed-citation")
                ext_link = get_value_in_tag(mixed, "ext-link")
            elif ref_type == "book":
                text_ref = xml_to_text(mixed)
        authors = []
        for auth in ref.getElementsByTagName("string-name"):
            surname = get_value_in_tag(auth, "surname")
            given_names = get_value_in_tag(auth, "given-names")
            given_names = collapse_initials(given_names)
            authors.append("%s, %s" % (surname, given_names))
        year = get_value_in_tag(ref, "year")
        source = get_value_in_tag(ref, "source")
        volume = get_value_in_tag(ref, "volume")
        page = get_value_in_tag(ref, "fpage")
        if ref_type == "journal":
            source, vol = fix_journal_name(source, self.journal_mappings)
            if vol:
                volume = vol + volume
        yield label, ref_type, text_ref, ext_link, authors, year, source, volume, page
def get_copyright(self, xml_doc):
    """Return the copyright text, preferring ce:copyright over
    prism:copyright.

    Returns '' on failure (BUGFIX: previously returned an implicit
    None after printing the warning, unlike the sibling getters that
    return '').
    """
    try:
        copyright = get_value_in_tag(xml_doc, "ce:copyright")
        if not copyright:
            copyright = get_value_in_tag(xml_doc, "prism:copyright")
        return copyright
    except Exception:
        print("Can't find copyright", file=sys.stderr)
        return ''
def get_arxiv_id(self, xml):
    """Return the formatted arXiv id found in a custom-meta element.

    Scans every custom-meta; when several carry "arxiv-id" the last
    one wins. Returns None when none is present.
    """
    ext_link = None
    for meta in xml.getElementsByTagName("custom-meta"):
        if get_value_in_tag(meta, "meta-name") == "arxiv-id":
            raw = get_value_in_tag(meta, "meta-value").encode('utf-8')
            ext_link = format_arxiv_id(raw)
    return ext_link
def _get_journal(self):
    """Return the journal title, preferring the abbreviated form.

    Returns the stripped abbrev-journal-title, falling back to
    journal-title; '' (with a warning on stderr) on failure.
    """
    try:
        name = (get_value_in_tag(self.document, 'abbrev-journal-title')
                or get_value_in_tag(self.document, 'journal-title'))
        return name.strip()
    except Exception:
        print("Can't find journal-title", file=sys.stderr)
        return ''
def _get_copyright(self):
    """Return the (holder, year, statement) copyright triple.

    Returns ('', '', '') with a warning on stderr when the lookups fail.
    """
    try:
        return (get_value_in_tag(self.document, 'copyright-holder'),
                get_value_in_tag(self.document, 'copyright-year'),
                get_value_in_tag(self.document, 'copyright-statement'))
    except Exception:
        print("Can't find copyright", file=sys.stderr)
        return '', '', ''
def _get_publition_information(self):
    """Return (journal, volume, issue, year, date, doi, article_id).

    NOTE(review): the "publition" typo is preserved deliberately --
    it is the name callers use. Volume is the mapped journal's volume
    prefix plus the document's own <volume> value; year is taken from
    <copyright-year>.
    """
    raw_journal = self._get_journal()
    date = self._get_date()
    doi = self._get_doi()
    journal, volume = fix_journal_name(raw_journal, self.journal_mappings)
    article_id = get_value_in_tag(self.document, 'elocation-id')
    volume = volume + get_value_in_tag(self.document, 'volume')
    issue = get_value_in_tag(self.document, 'issue')
    year = get_value_in_tag(self.document, 'copyright-year')
    return journal, volume, issue, year, date, doi, article_id
def get_references(self, xml_doc):
    """Yield one parsed reference per ce:bib-reference element.

    In CONSYN mode each inner sb:reference is delegated to
    self._get_ref (or the outer ref itself when there are none).
    Otherwise a flat tuple (label, authors, doi, issue, page, title,
    volume, year, textref, ext_link) is built inline.
    """
    for ref in xml_doc.getElementsByTagName("ce:bib-reference"):
        label = get_value_in_tag(ref, "ce:label")
        if self.CONSYN:
            innerrefs = ref.getElementsByTagName("sb:reference")
            if not innerrefs:
                yield self._get_ref(ref, label)
            for inner in innerrefs:
                yield self._get_ref(inner, label)
        else:
            authors = []
            for author in ref.getElementsByTagName("sb:author"):
                given_name = get_value_in_tag(author, "ce:given-name")
                surname = get_value_in_tag(author, "ce:surname")
                if given_name:
                    name = "%s, %s" % (surname, given_name)
                else:
                    name = surname
                authors.append(name)
            doi = get_value_in_tag(ref, "ce:doi")
            issue = get_value_in_tag(ref, "sb:issue")
            page = get_value_in_tag(ref, "sb:first-page")
            title = get_value_in_tag(ref, "sb:maintitle")
            volume = get_value_in_tag(ref, "sb:volume-nr")
            # Year = first four characters of the sb:date inside the
            # first sb:issue element, when one exists.
            tmp_issues = ref.getElementsByTagName('sb:issue')
            if tmp_issues:
                year = get_value_in_tag(tmp_issues[0], "sb:date")[:4]
            else:
                year = ''
            # textref stays an (empty) node list when no ce:textref
            # element is present -- only converted to text when found.
            textref = ref.getElementsByTagName("ce:textref")
            if textref:
                textref = xml_to_text(textref[0])
            ext_link = format_arxiv_id(self.get_ref_link(ref, 'arxiv'))
            yield (label, authors, doi, issue, page, title, volume,
                   year, textref, ext_link)
def get_keywords(self, xml_doc):
    """Return the non-empty ce:keyword texts of the article.

    Keywords are scoped to the ja:head (or cja:head) element when one
    exists, otherwise taken from the whole document.
    """
    head = (xml_doc.getElementsByTagName("ja:head")
            or xml_doc.getElementsByTagName("cja:head"))
    if head:
        keywords = head[0].getElementsByTagName("ce:keyword")
    else:
        keywords = xml_doc.getElementsByTagName("ce:keyword")
    texts = []
    for keyword in keywords:
        text = get_value_in_tag(keyword, "ce:text")
        if text:
            texts.append(text)
    return texts
def get_authors(self, xml_doc):
    """Return a list of author dicts with resolved affiliations.

    Each dict may carry surname, given_name, initials, orcid, email,
    cross_ref, and affiliation (list of texts). When no author's
    cross_ref matches an affiliation id, affiliations are treated as
    implicit and every affiliation is attached to every author (a
    warning is printed when more than one exists).
    """
    authors = []
    for author in xml_doc.getElementsByTagName("ce:author"):
        tmp = {}
        surname = get_value_in_tag(author, "ce:surname")
        if surname:
            tmp["surname"] = surname
        given_name = get_value_in_tag(author, "ce:given-name")
        if given_name:
            tmp["given_name"] = given_name
        initials = get_value_in_tag(author, "ce:initials")
        if initials:
            tmp["initials"] = initials
        orcid = author.getAttribute('orcid').encode('utf-8')
        if orcid:
            tmp["orcid"] = orcid
        # First e-address of type 'email' (or untyped) wins.
        for email in author.getElementsByTagName("ce:e-address"):
            if email.getAttribute("type").encode('utf-8') in ('email', ''):
                tmp["email"] = xml_to_text(email)
                break
        cross_refs = author.getElementsByTagName("ce:cross-ref")
        if cross_refs:
            tmp["cross_ref"] = [
                cross_ref.getAttribute("refid").encode('utf-8')
                for cross_ref in cross_refs]
        authors.append(tmp)
    affiliations = {}
    for affiliation in xml_doc.getElementsByTagName("ce:affiliation"):
        aff_id = affiliation.getAttribute("id").encode('utf-8')
        # Strip a leading numeric label (e.g. "1 ") from the text.
        text = re.sub(
            r'^(\d+\ ?)', "", get_value_in_tag(affiliation, "ce:textfn"))
        affiliations[aff_id] = text
    implicit_affilations = True
    for author in authors:
        matching_ref = [ref for ref in author.get("cross_ref", [])
                        if ref in affiliations]
        if matching_ref:
            implicit_affilations = False
            # BUGFIX/portability: was "for i in xrange(0, len(...))" --
            # xrange is Python-2-only; iterate the refs directly.
            author["affiliation"] = [affiliations[ref]
                                     for ref in matching_ref]
    if implicit_affilations and len(affiliations) > 1:
        message = "Implicit affiliations are used, "
        message += ("but there's more than one affiliation: "
                    + str(affiliations))
        print(message, file=sys.stderr)
    if implicit_affilations and len(affiliations) >= 1:
        for author in authors:
            author["affiliation"] = list(affiliations.values())
    return authors
def get_authors(self, xml):
    """Return author dicts from Springer <Author> elements.

    Each dict may carry surname, given_name (newlines collapsed to
    spaces), email, affiliations_ids and affiliation (list of texts
    resolved via <Affiliation> ids). When no id matches, affiliations
    are treated as implicit and attached to every author.
    """
    authors = []
    for author in xml.getElementsByTagName("Author"):
        tmp = {}
        surname = get_value_in_tag(author, "FamilyName")
        if surname:
            tmp["surname"] = surname
        given_name = get_value_in_tag(author, "GivenName")
        if given_name:
            tmp["given_name"] = given_name.replace('\n', ' ')
        # NOTE: initials/orcid/cross-ref are not available in this
        # Springer schema (the old commented-out lookups were removed).
        for email in author.getElementsByTagName("Email"):
            if email.getAttribute("type").encode('utf-8') in ('email', ''):
                tmp["email"] = xml_to_text(email)
                break
        tmp["affiliations_ids"] = [
            aid.encode('utf-8')
            for aid in author.getAttribute("AffiliationIDS").split()]
        authors.append(tmp)
    affiliations = {}
    for affiliation in xml.getElementsByTagName("Affiliation"):
        aff_id = affiliation.getAttribute("ID").encode('utf-8')
        affiliations[aff_id] = xml_to_text(affiliation, delimiter=', ')
    implicit_affilations = True
    for author in authors:
        matching_ref = [ref for ref in author.get("affiliations_ids")
                        if ref in affiliations]
        if matching_ref:
            implicit_affilations = False
            # BUGFIX/portability: was "for i in xrange(0, len(...))" --
            # xrange is Python-2-only; iterate the refs directly.
            author["affiliation"] = [affiliations[ref]
                                     for ref in matching_ref]
    if implicit_affilations and len(affiliations) > 1:
        # Portability: was the Python-2-only "print >> sys.stderr".
        sys.stderr.write(
            "Implicit affiliations are used, but there's more than one "
            "affiliation: %s\n" % affiliations)
    if implicit_affilations and len(affiliations) >= 1:
        for author in authors:
            author["affiliation"] = list(affiliations.values())
    return authors
def get_keywords(self, xml_doc):
    """Return the non-empty ce:keyword texts of the article.

    Keywords are scoped to the ja:head (or cja:head) element when one
    exists, otherwise taken from the whole document.
    """
    head = (xml_doc.getElementsByTagName("ja:head")
            or xml_doc.getElementsByTagName("cja:head"))
    if head:
        keywords = head[0].getElementsByTagName("ce:keyword")
    else:
        keywords = xml_doc.getElementsByTagName("ce:keyword")
    texts = []
    for keyword in keywords:
        text = get_value_in_tag(keyword, "ce:text")
        if text:
            texts.append(text)
    return texts
def _get_publication_information(self):
    """Return (journal, volume, issue, year, date, doi, page, fpage,
    lpage).

    Volume is the mapped journal's volume prefix plus the document's
    own <volume> value; year is the first four characters of the date.
    """
    raw_journal = self._get_journal()
    date = self._get_date()
    doi = self._get_doi()
    issue = get_value_in_tag(self.document, 'issue')
    journal, volume = fix_journal_name(raw_journal, self.journal_mappings)
    volume = volume + get_value_in_tag(self.document, 'volume')
    page = get_value_in_tag(self.document, 'elocation-id')
    fpage = get_value_in_tag(self.document, 'fpage')
    lpage = get_value_in_tag(self.document, 'lpage')
    year = date[:4]
    return journal, volume, issue, year, date, doi, page, fpage, lpage
def author_dic_from_xml(author):
    """Build a dict describing one author element, omitting None values.

    Keys: surname, given_name, initials, orcid, email (first e-address
    of type 'email' or untyped, None when absent) and cross_ref (refid
    list). Empty strings are kept -- only None values are dropped.
    """
    email = None
    for candidate in author.getElementsByTagName("ce:e-address"):
        if unicode(candidate.getAttribute("type")) in ('email', ''):
            email = xml_to_text(candidate)
            break
    fields = {
        'surname': get_value_in_tag(author, "ce:surname"),
        'given_name': get_value_in_tag(author, "ce:given-name"),
        'initials': get_value_in_tag(author, "ce:initials"),
        'orcid': unicode(author.getAttribute('orcid')),
        'email': email,
        'cross_ref': [unicode(ref.getAttribute("refid"))
                      for ref in author.getElementsByTagName("ce:cross-ref")],
    }
    result = {}
    for key, value in fields.items():
        if value is not None:
            result[key] = value
    return result
def get_references(self, xml):
    """Return reference tuples from Springer <Citation> elements.

    Structured citations yield (label, authors, doi, issue, page,
    title, volume, year); unstructured ones yield the raw
    BibUnstructured text followed by seven empty strings.
    """
    references = []
    for reference in xml.getElementsByTagName("Citation"):
        if not reference.getElementsByTagName("BibArticle"):
            # No structured data -- keep only the raw citation text.
            references.append(
                (get_value_in_tag(reference, "BibUnstructured"),
                 '', '', '', '', '', '', ''))
            continue
        label = get_value_in_tag(reference, "ArticleTitle")
        authors = []
        for author in reference.getElementsByTagName("BibAuthorName"):
            initials = get_value_in_tag(author, "Initials")
            family = get_value_in_tag(author, "FamilyName")
            if initials:
                authors.append("%s, %s" % (family, initials))
            else:
                authors.append(family)
        doi = ""
        for occurrence in reference.getElementsByTagName("Occurrence"):
            if occurrence.getAttribute("Type") == "DOI":
                doi = xml_to_text(occurrence)
        ## What is it exactly?
        # issue = get_value_in_tag(reference, "sb:issue")
        issue = ""
        page = get_value_in_tag(reference, "FirstPage")
        title = get_value_in_tag(reference, "JournalTitle")
        volume = get_value_in_tag(reference, "VolumeID")
        year = get_value_in_tag(reference, "Year")
        references.append((label, authors, doi, issue, page,
                           title, volume, year))
    return references
def get_publication_date(self, xml):
    """Return 'YYYY-MM-DD' from ArticleInfo/ArticleHistory/OnlineDate.

    Returns '' when the OnlineDate is absent or its parts are not
    integers. Missing ArticleInfo/ArticleHistory still raises
    IndexError, unchanged from the original.
    """
    article_info = xml.getElementsByTagName("ArticleInfo")[0]
    article_history = article_info.getElementsByTagName("ArticleHistory")[0]
    online_date = article_history.getElementsByTagName("OnlineDate")
    if online_date:
        online_date = online_date[0]
        year = get_value_in_tag(online_date, "Year")
        month = get_value_in_tag(online_date, "Month")
        day = get_value_in_tag(online_date, "Day")
        try:
            return "%04d-%02d-%02d" % (int(year), int(month), int(day))
        # Portability fix: "except Exception, err" and "print >>" are
        # Python-2-only syntax; "as" works on Python 2.6+ and 3.
        except Exception as err:
            sys.stderr.write(
                "Can't reliably extract the publication date: %s\n" % err)
    return ""
def get_publication_date(self, xml):
    """Return 'YYYY-MM-DD' from ArticleInfo/ArticleHistory/OnlineDate.

    Unlike the sibling variant, any failure (missing elements or
    non-integer parts) is caught and '' is returned after a warning.
    """
    try:
        article_info = xml.getElementsByTagName("ArticleInfo")[0]
        article_history = article_info.getElementsByTagName(
            "ArticleHistory")[0]
        online_date = article_history.getElementsByTagName("OnlineDate")
        if online_date:
            online_date = online_date[0]
            year = get_value_in_tag(online_date, "Year")
            month = get_value_in_tag(online_date, "Month")
            day = get_value_in_tag(online_date, "Day")
            return "%04d-%02d-%02d" % (int(year), int(month), int(day))
    # Portability fix: "except Exception, err" and "print >>" are
    # Python-2-only syntax; "as" works on Python 2.6+ and 3.
    except Exception as err:
        sys.stderr.write(
            "Can't reliably extract the publication date: %s\n" % err)
    return ""
def _get_journal(self):
    """Return the normalised journal title, '' on failure.

    Prefers abbrev-journal-title, maps through journal_mappings when
    an upper-cased match exists, and collapses '. ' to '.'.
    """
    try:
        title = get_value_in_tag(self.document, 'abbrev-journal-title')
        if not title:
            title = get_value_in_tag(self.document, 'journal-title')
        try:
            title = self.journal_mappings[title.upper()]
        except KeyError:
            pass
        title = title.replace('. ', '.')
        return title
    except Exception:
        # Portability fix: "print >> sys.stderr" is Python-2-only.
        sys.stderr.write("Can't find journal-title\n")
        return ''
def get_keywords(self, xml_doc):
    """Return ce:keyword texts of the document.

    CONSYN mode scopes the search to the first ja:head element. On any
    failure a warning is printed and None is returned (implicitly).
    """
    if self.CONSYN:
        try:
            head = xml_doc.getElementsByTagName("ja:head")[0]
            return [get_value_in_tag(keyword, "ce:text")
                    for keyword in head.getElementsByTagName("ce:keyword")]
        except Exception:
            print("Can't find keywords", file=sys.stderr)
    else:
        try:
            return [get_value_in_tag(keyword, "ce:text")
                    for keyword in
                    xml_doc.getElementsByTagName("ce:keyword")]
        except Exception:
            print("Can't find keywords", file=sys.stderr)
def author_pair(a):
    """Return (name, affiliation) for one author element.

    Name is "Last, First[ Middle]". Relies on an ``affiliations``
    mapping from the enclosing scope; a missing AffiliationID element
    or an unknown label yields an empty affiliation.
    """
    surname = get_value_in_tag(a, "LastName")
    first_name = get_value_in_tag(a, "FirstName")
    middle_name = get_value_in_tag(a, "MiddleName")
    if middle_name:
        name = "%s, %s %s" % (surname, first_name, middle_name)
    else:
        name = "%s, %s" % (surname, first_name)
    affiliation = ""
    try:
        label = a.getElementsByTagName(
            "AffiliationID")[0].getAttribute("Label")
        affiliation = affiliations[label]
    except (IndexError, KeyError):
        pass
    return name, affiliation
def check_records(records):
    """Bibcheck plugin: align each Hindawi record's 980__c subfield
    with the article's <subject> tag (Editorial, Erratum, ...).

    For known article-variant subjects the existing 980__c is amended
    (or added when missing); Review/Research/Retraction articles are
    left alone; any other subject raises so the batch fails loudly.
    """
    for record in records:
        ## Stupid hack because bibcheck filters does not work as expected
        if record_get_field_value(record, '980', code='b') == "Hindawi":
            record.warn("Working on this record")
            recdoc = BibRecDocs(int(record.record_id))
            doc = recdoc.get_bibdoc(recdoc.get_bibdoc_names()[0])
            try:
                xml_file = open(doc.get_file("xml").get_full_path())
            except:
                record.warn("No document can be found")
                continue
            xml2 = xml.dom.minidom.parseString(xml_file.read())
            subject = get_value_in_tag(xml2, "subject")
            if subject in ["Editorial", "Erratum", "Corrigendum",
                           "Addendum", "Letter to the Editor"]:
                field = record_get_field_value(record, '980', code='c')
                if field:
                    if field in ['ERRATUM', 'ADDENDUM', 'EDITORIAL',
                                 'CORRIGENDUM', 'LETTER TO THE EDITOR']:
                        # Existing variant marker: overwrite in place.
                        for position, value in record.iterfield('980__c'):
                            record.amend_field(position, subject.upper())
                            break
                    else:
                        # 980__c exists but is something else: add the
                        # subject as a new 'c' subfield.
                        for position, value in record.iterfield('980__%'):
                            record.add_subfield(position, 'c',
                                                subject.upper())
                            break
                else:
                    # No 980__c at all: attach one to the first 980.
                    for position, value in record.iterfield('980__%'):
                        record.add_subfield(position, 'c', subject.upper())
                        break
            elif subject not in ["Review Article", "Research Article",
                                 "Retraction"]:
                raise Exception(
                    "This subject: %s does not exit in SCOAP3 system"
                    % (subject,))
def _add_group_affiliation(self, author, xml_author):
    """Attach every ce:affiliation text found on the author's parent
    node (the group) to the given author dict."""
    parent = xml_author.parentNode
    texts = []
    for aff in parent.getElementsByTagName('ce:affiliation'):
        texts.append(get_value_in_tag(aff, "ce:textfn"))
    return self._add_affiliations_to_author(author, texts)
def get_references(self, xml_doc):
    """Yield one parsed reference per ce:bib-reference element.

    Each inner sb:reference is delegated to self._get_ref with the
    outer label; when none exist, the outer ref itself is used.
    """
    for ref in xml_doc.getElementsByTagName("ce:bib-reference"):
        label = get_value_in_tag(ref, "ce:label")
        inner_refs = ref.getElementsByTagName("sb:reference")
        if inner_refs:
            for entry in inner_refs:
                yield self._get_ref(entry, label)
        else:
            yield self._get_ref(ref, label)
def get_publication_information(self, xml):
    """Validate that an ArticleDOI is present and return it.

    BUGFIX: the original computed ``doi`` but never returned it
    (callers always got None). Raises ValueError when the DOI is
    empty, after logging; any lookup error is logged and re-raised.
    Also fixes the Python-2-only "except Exception, err" syntax.
    """
    try:
        doi = get_value_in_tag(xml, "ArticleDOI")
        if not doi:
            raise ValueError("DOI not found")
    except Exception as err:
        sys.stderr.write("Can't find doi: %s\n" % err)
        raise
    return doi
def get_doi(self, xml):
    """Return the ArticleDOI value; '' when missing or on error.

    BUGFIX: the original never returned ``doi`` (a getter that always
    produced None). Also fixes the Python-2-only "except Exception,
    err" and "print >>" syntax.
    """
    doi = ""
    try:
        doi = get_value_in_tag(xml, "ArticleDOI")
        if not doi:
            sys.stderr.write("DOI not found\n")
    except Exception as err:
        sys.stderr.write("Can't find doi: %s\n" % err)
    return doi
def author_pair(a):
    """Return (name, affiliation) for one author element.

    Name is "Last, First[ Middle]". Relies on an ``affiliations``
    mapping from the enclosing scope; a missing AffiliationID element
    or an unknown label yields an empty affiliation.
    """
    surname = get_value_in_tag(a, 'LastName')
    first_name = get_value_in_tag(a, 'FirstName')
    middle_name = get_value_in_tag(a, 'MiddleName')
    if middle_name:
        name = '%s, %s %s' % (surname, first_name, middle_name)
    else:
        name = '%s, %s' % (surname, first_name)
    affiliation = ''
    try:
        label = a.getElementsByTagName(
            'AffiliationID')[0].getAttribute('Label')
        affiliation = affiliations[label]
    except (IndexError, KeyError):
        pass
    return name, affiliation
def get_license(self, xml_doc):
    """Return a (license, license_url) pair.

    license_url is the last oa:userLicense value seen; license becomes
    'CC-BY-3.0' as soon as any URL starts with the CC-BY 3.0 prefix
    (and is not reset afterwards).
    """
    license = ''
    license_url = ''
    cc_by_prefix = 'http://creativecommons.org/licenses/by/3.0'
    for info in xml_doc.getElementsByTagName('oa:openAccessInformation'):
        license_url = get_value_in_tag(info, 'oa:userLicense')
        if license_url.startswith(cc_by_prefix):
            license = 'CC-BY-3.0'
    return license, license_url
def get_identifier(self):
    """Return the identifier of the paper corresponding to this record,
    containing the conference in which it was published and the
    proceeding number; '' on failure.

    Portability fix: replaces the Python-2-only "print >> sys.stderr"
    statement with sys.stderr.write.
    """
    try:
        return get_value_in_tag(self.document, 'identifier')
    except Exception:
        sys.stderr.write("Can't find identifier\n")
        return ''
def _get_publisher(self):
    """Return the publisher name, normalising 'Sissa Medialab' to
    'SISSA'; '' on failure.

    Portability fix: replaces the Python-2-only "print >> sys.stderr"
    statement with sys.stderr.write.
    """
    try:
        publisher = get_value_in_tag(self.document, 'pex-dc:publisher')
        if publisher == 'Sissa Medialab':
            publisher = 'SISSA'
        return publisher
    except Exception:
        sys.stderr.write("Can't find publisher\n")
        return ''
def _get_copyright(self):
    """Return the rights statement, normalising the CC BY-NC-SA long
    form to 'CC-BY-NC-SA'; '' on failure.

    Portability fix: replaces the Python-2-only "print >> sys.stderr"
    statement with sys.stderr.write.
    """
    try:
        record_copyright = get_value_in_tag(self.document, 'pex-dc:rights')
        if record_copyright == \
                'Creative Commons Attribution-NonCommercial-ShareAlike':
            record_copyright = 'CC-BY-NC-SA'
        return record_copyright
    except Exception:
        sys.stderr.write("Can't find copyright\n")
        return ''
def get_license(self, xml_doc):
    """Return a (license, license_url) pair.

    license_url is the last oa:userLicense value seen; license becomes
    'CC-BY-3.0' as soon as any URL starts with the CC-BY 3.0 prefix
    (and is not reset afterwards).
    """
    license = ''
    license_url = ''
    cc_by_prefix = 'http://creativecommons.org/licenses/by/3.0'
    for info in xml_doc.getElementsByTagName('oa:openAccessInformation'):
        license_url = get_value_in_tag(info, 'oa:userLicense')
        if license_url.startswith(cc_by_prefix):
            license = 'CC-BY-3.0'
    return license, license_url
def get_publication_information(self, xml_doc, path='', timeout=60):
    """Return a 9-tuple (journal, issn, volume, issue, first_page,
    last_page, year, start_date, doi).

    In CONSYN mode everything is read from the prism:* metadata; when
    the volume is missing it is scraped best-effort from the article's
    ScienceDirect page (``path``'s last segment is the PII). Outside
    CONSYN mode the cached self._dois entry for the DOI is used, with
    empty fields when unknown.
    """
    if not self.CONSYN:
        doi = self._get_doi(xml_doc)
        try:
            return self._dois[doi] + (doi, )
        except KeyError:
            return ('', '', '', '', '', '', '', '', doi)
    publication = get_value_in_tag(xml_doc, "prism:publicationName")
    doi = get_value_in_tag(xml_doc, "prism:doi")
    issn = get_value_in_tag(xml_doc, "prism:issn")
    issue = get_value_in_tag(xml_doc, "prism:number")
    first_page = get_value_in_tag(xml_doc, "prism:startingPage")
    last_page = get_value_in_tag(xml_doc, "prism:endingPage")
    journal = publication.split(",")[0]
    journal, volume = fix_journal_name(journal, self.journal_mappings)
    try:
        # A "Section X" suffix in the publication name may carry the
        # volume when the mapping did not provide one.
        vol = publication.split(",")[1].strip()
        if vol.startswith("Section"):
            vol = vol[7:].strip()
        if vol and not volume:
            volume = vol
    except IndexError:
        pass
    vol = get_value_in_tag(xml_doc, "prism:volume")
    # BUGFIX: was 'vol is "" and path is not ""' -- identity comparison
    # against string literals only works by interning accident; use
    # value comparison instead.
    if vol == "" and path != "":
        # if volume is not present try to harvest it
        try:
            session = requests.session()
            # NOTE(review): no '/' between 'pii' and the identifier --
            # looks suspicious but kept as-is; confirm against a real
            # ScienceDirect URL before changing.
            url = 'http://www.sciencedirect.com/science/article/pii'\
                + path.split('/')[-1]
            headers = {'user-agent': make_user_agent()}
            r = session.get(url, headers=headers, timeout=timeout)
            parsed_html = BeautifulSoup(r.text)
            info = parsed_html.body.find('p', attrs={
                'class': 'volIssue'
            }).text.split()
            for s in info:
                # The en-dash (u'\u2013' mangled as \xe2...\x93) marks
                # the "first-last" page range token.
                if unicode(s).find(u'\xe2') > 0:
                    first_page = s.rsplit(u'\xe2')[0]
                    last_page = s.rsplit(u'\x93')[1]
            if info[1].lower() != 'online':
                vol = info[1][:-1]
        except Exception:
            # Best-effort scrape only. BUGFIX: narrowed from a bare
            # "except:" which also swallowed KeyboardInterrupt.
            pass
    if vol:
        volume += vol
    start_date = self.get_publication_date(xml_doc)
    year = start_date.split("-")[0]
    doi = get_value_in_tag(xml_doc, "ce:doi")
    return (journal, issn, volume, issue, first_page, last_page,
            year, start_date, doi)
def get_arxiv_id(self, xml):
    """Return 'arXiv:<id>' taken from the first ArticleNote's
    RefSource, or '' when absent or not matching RE_ARXIV_ID."""
    notes = xml.getElementsByTagName('ArticleNote')
    if not notes:
        return ""
    candidate = get_value_in_tag(notes[0], "RefSource")
    if RE_ARXIV_ID.match(candidate):
        return "arXiv:%s" % candidate
    return ""
def find_affiliations(xml_doc):
    """Map affiliation id -> affiliation text for every ce:affiliation.

    Prefers the structured _affiliation_from_sa_field form; on any
    failure falls back to ce:textfn with a leading numeric label
    stripped.
    """
    tmp = {}
    for aff in xml_doc.getElementsByTagName("ce:affiliation"):
        aff_id = aff.getAttribute("id").encode('utf-8')
        try:
            tmp[aff_id] = _affiliation_from_sa_field(aff)
        # BUGFIX: narrowed from a bare "except:" which also swallowed
        # KeyboardInterrupt/SystemExit.
        except Exception:
            tmp[aff_id] = re.sub(r'^(\d+\ ?)', "",
                                 get_value_in_tag(aff, "ce:textfn"))
    return tmp
def _affiliation_from_sa_field(self, affiliation):
    """Return the affiliation text, preferring structured
    sa:affiliation content.

    Falls back to ce:textfn with any leading numeric label stripped;
    raises IndexError (used as a sentinel by callers) when neither
    source yields text.
    """
    structured = affiliation.getElementsByTagName('sa:affiliation')
    if structured:
        return xml_to_text(structured[0], ', ')
    fallback = re.sub(r'^(\d+\ ?)', "",
                      get_value_in_tag(affiliation, "ce:textfn"))
    if not fallback:
        raise IndexError
    return fallback
def _get_license(self):
    """Return (license, license_type, license_url) from <license> tags.

    When several <license> tags exist the last one wins. license_url
    is the first xlink:href of the tag's ext-link elements (the raw
    lookup result is kept when it is empty).
    """
    license = ''
    license_type = ''
    license_url = ''
    for tag in self.document.getElementsByTagName('license'):
        license = get_value_in_tag(tag, 'ext-link')
        license_type = tag.getAttribute('license-type')
        license_url = get_attribute_in_tag(tag, 'ext-link', 'xlink:href')
        if license_url:
            license_url = license_url[0]
    return license, license_type, license_url
def _get_authors(self):
    """Return (name, affiliations, emails, collaborations) tuples for
    each author contrib.

    Name is safe_title("Surname, Initials"); the three lists hold the
    texts of the contrib's aff, email and collab child elements.
    """
    authors = []
    for contrib in self.document.getElementsByTagName('contrib'):
        if contrib.getAttribute('contrib-type') != 'author':
            continue
        family = get_value_in_tag(contrib, 'surname')
        given = collapse_initials(get_value_in_tag(contrib, 'given-names'))
        name = safe_title('%s, %s' % (family, given))
        affiliations = [xml_to_text(aff)
                        for aff in contrib.getElementsByTagName('aff')]
        emails = [xml_to_text(email)
                  for email in contrib.getElementsByTagName('email')]
        collaborations = [
            xml_to_text(collab)
            for collab in contrib.getElementsByTagName("collab")]
        authors.append((name, affiliations, emails, collaborations))
    return authors