def get_ref_link(self, xml, name):
    """Return the stripped text of a link whose target mentions *name*.

    Scans ``ext-link`` elements matched on ``xlink:href``; when none
    match, falls back to ``elocation-id`` elements matched on
    ``content-type``.  The last match wins; ``None`` when nothing
    matches.
    """
    found = None
    for link in xml.getElementsByTagName('ext-link'):
        href = link.getAttribute("xlink:href").encode('utf-8')
        if name in href:
            found = xml_to_text(link).strip()
    if not found:
        for link in xml.getElementsByTagName('elocation-id'):
            ctype = link.getAttribute("content-type").encode('utf-8')
            if name in ctype:
                found = xml_to_text(link).strip()
    return found
def get_authors(self, xml):
    """Extract authors and affiliations from a Springer A++ record.

    Returns a list of dicts with keys ``surname``, ``given_name``,
    ``email``, ``affiliations_ids`` and, when resolvable,
    ``affiliation`` (list of affiliation strings).  When no author
    references an affiliation id explicitly and at least one
    affiliation exists, every affiliation is assigned to every author.

    Fixes: Python-2-only ``print >>`` replaced with a portable
    ``sys.stderr.write``; ``xrange`` loop replaced; dead commented-out
    code removed.
    """
    authors = []
    for author in xml.getElementsByTagName("Author"):
        tmp = {}
        surname = get_value_in_tag(author, "FamilyName")
        if surname:
            tmp["surname"] = surname
        given_name = get_value_in_tag(author, "GivenName")
        if given_name:
            # Newlines sneak in from pretty-printed XML; normalize them.
            tmp["given_name"] = given_name.replace('\n', ' ')
        for email in author.getElementsByTagName("Email"):
            if email.getAttribute("type").encode('utf-8') in ('email', ''):
                tmp["email"] = xml_to_text(email)
                break
        tmp["affiliations_ids"] = [
            aid.encode('utf-8')
            for aid in author.getAttribute("AffiliationIDS").split()]
        authors.append(tmp)
    affiliations = {}
    for affiliation in xml.getElementsByTagName("Affiliation"):
        aff_id = affiliation.getAttribute("ID").encode('utf-8')
        affiliations[aff_id] = xml_to_text(affiliation, delimiter=', ')
    implicit_affilations = True
    for author in authors:
        matching_ref = [ref for ref in author.get("affiliations_ids")
                        if ref in affiliations]
        if matching_ref:
            implicit_affilations = False
            author["affiliation"] = [affiliations[ref]
                                     for ref in matching_ref]
    if implicit_affilations and len(affiliations) > 1:
        # Ambiguous: nobody referenced an affiliation id explicitly.
        sys.stderr.write(
            "Implicit affiliations are used, but there's more than one "
            "affiliation: %s\n" % affiliations)
    if implicit_affilations and len(affiliations) >= 1:
        for author in authors:
            author["affiliation"] = list(affiliations.values())
    return authors
def get_keywords(self, xml):
    """Return ``{'pacs': [...], 'other': [...]}`` keyword lists.

    PACS codes come from the ``kwd-group`` whose ``kwd-group-type`` is
    ``pacs``; all other groups populate ``other``.  On any failure a
    warning goes to stderr and ``None`` is returned (keeps the
    original best-effort contract).

    Fix: Python-2-only ``print >> sys.stderr`` replaced with a
    portable ``sys.stderr.write``.
    """
    try:
        pacs = []
        other = []
        for kwd_group in xml.getElementsByTagName('kwd-group'):
            keywords = [
                xml_to_text(keyword, tag_to_remove=self.tag_to_remove)
                for keyword in kwd_group.getElementsByTagName("kwd")]
            group_type = kwd_group.getAttribute('kwd-group-type')
            if group_type.encode('utf-8') == "pacs":
                pacs = keywords
            else:
                other = keywords
        return {"pacs": pacs, "other": other}
    except Exception:
        sys.stderr.write("Can't find keywords\n")
def _get_authors(self):
    """Return (name, affiliations) pairs built from pex-dc creators.

    The name is rendered as 'Surname, Given names' (initials
    collapsed, title-cased); affiliations accumulate per creator.
    """
    authors = []
    for creator in self.document.getElementsByTagName('pex-dc:creator'):
        affiliations = []
        for name_tag in creator.getElementsByTagName('pex-dc:name'):
            parts = xml_to_text(name_tag).split()
            surname = parts[-1]
            given = collapse_initials(" ".join(parts[:-1]))
            full_name = safe_title("%s, %s" % (surname, given))
            for aff_tag in creator.getElementsByTagName(
                    'pex-dc:affiliation'):
                affiliations.append(xml_to_text(aff_tag))
            authors.append((full_name, affiliations))
    return authors
def get_ref_link(self, xml_doc, name):
    """Return stripped text of the last ``ce:inter-ref`` whose
    ``xlink:href`` contains *name*, or ``None`` when none matches."""
    result = None
    for ref in xml_doc.getElementsByTagName('ce:inter-ref'):
        href = ref.getAttribute("xlink:href").encode('utf-8')
        if name in href:
            result = xml_to_text(ref).strip()
    return result
def _author_dic_from_xml(self, author):
    """Build an author dict from a ``ce:author`` element.

    Keys (set only when non-empty): surname, given_name, initials,
    orcid, email (first ``ce:e-address`` typed 'email' or untyped),
    cross_ref (all refids, set whenever any ``ce:cross-ref`` exists).
    """
    info = {}
    for key, tag in (("surname", "ce:surname"),
                     ("given_name", "ce:given-name"),
                     ("initials", "ce:initials")):
        value = get_value_in_tag(author, tag)
        if value:
            info[key] = value
    orcid = author.getAttribute('orcid').encode('utf-8')
    if orcid:
        info["orcid"] = orcid
    for email in author.getElementsByTagName("ce:e-address"):
        if email.getAttribute("type").encode('utf-8') in ('email', ''):
            info["email"] = xml_to_text(email)
            break
    cross_refs = author.getElementsByTagName("ce:cross-ref")
    if cross_refs:
        info["cross_ref"] = [ref.getAttribute("refid").encode('utf-8')
                             for ref in cross_refs]
    return info
def _get_references(self):
    """Yield one tuple per ``ref`` element:
    (label, ref_type, text_ref, ext_link, authors, year, source,
    volume, page).
    """
    for ref in self.document.getElementsByTagName("ref"):
        label = ref.getAttribute("id")
        # Keep only the digits of the id, e.g. "c12" -> "12".
        label = sub(r"\D", "", label)
        text_ref = ""
        ext_link = ""
        for mixed in ref.getElementsByTagName("mixed-citation"):
            ref_type = mixed.getAttribute("publication-type")
            if ref_type == "thesis":
                text_ref = get_value_in_tag(ref, "mixed-citation")
            elif ref_type == "conf-proc":
                text_ref = get_value_in_tag(ref, "mixed-citation")
            elif ref_type == "other" or ref_type == "web":
                text_ref = get_value_in_tag(ref, "mixed-citation")
                ext_link = get_value_in_tag(mixed, "ext-link")
            elif ref_type == "book":
                text_ref = xml_to_text(mixed)
        authors = []
        for auth in ref.getElementsByTagName("string-name"):
            surname = get_value_in_tag(auth, "surname")
            given_names = get_value_in_tag(auth, "given-names")
            given_names = collapse_initials(given_names)
            authors.append("%s, %s" % (surname, given_names))
        year = get_value_in_tag(ref, "year")
        source = get_value_in_tag(ref, "source")
        volume = get_value_in_tag(ref, "volume")
        page = get_value_in_tag(ref, "fpage")
        if ref_type == "journal":
            # NOTE(review): ref_type here is whatever the *last*
            # mixed-citation set; assumes at least one exists — confirm.
            source, vol = fix_journal_name(source, self.journal_mappings)
            if vol:
                volume = vol + volume
        yield label, ref_type, text_ref, ext_link, authors, year, source, volume, page
def _get_pacscodes(self):
    """Return all PACS codes from ``kwd-group`` elements typed 'pacs'."""
    codes = []
    for group in self.document.getElementsByTagName('kwd-group'):
        if group.getAttribute('kwd-group-type') != 'pacs':
            continue
        codes.extend(xml_to_text(kwd)
                     for kwd in group.getElementsByTagName('kwd'))
    return codes
def _get_orcids(xml_doc):
    """Return one entry per author: ``'ORCID:<id>'`` for valid ORCIDs,
    ``''`` otherwise.

    Elsevier-style ``ce:author`` ``orcid`` attributes are checked
    first; when any authors were found there, JATS ``contrib``
    elements are not consulted.

    Fix: the character class was ``[\\d|X]``, which also matched a
    literal ``|`` — the ORCID checksum character is a digit or ``X``
    only.  Pattern is now a raw string as well.
    """
    orcid_pattern = r'\d{4}-\d{4}-\d{4}-\d{3}[\dX]'
    result = []

    def _append_orcid(orcid):
        # Helper: append a formatted ORCID, or '' when missing/invalid.
        if orcid and is_valid_orcid(orcid):
            result.append('ORCID:{0}'.format(orcid))
        else:
            result.append('')

    xml_authors = xml_doc.getElementsByTagName("ce:author")
    for xml_author in xml_authors:
        try:
            orcid = xml_author.getAttribute('orcid')
            _append_orcid(orcid)
        except IndexError:
            result.append('')
    if result:
        return result
    xml_authors = xml_doc.getElementsByTagName("contrib")
    for xml_author in xml_authors:
        try:
            contrib_id = xml_author.getElementsByTagName('contrib-id')[0]
            if contrib_id.getAttribute('contrib-id-type') == 'orcid':
                orcid_raw = xml_to_text(contrib_id)
                orcid = re.search(orcid_pattern, orcid_raw).group()
                _append_orcid(orcid)
        except (IndexError, AttributeError):
            result.append('')
    return result
def _get_keywords(self):
    """Return every keyword from ``kwd-group`` elements that are NOT
    typed 'pacs'."""
    result = []
    for group in self.document.getElementsByTagName('kwd-group'):
        if group.getAttribute('kwd-group-type') == 'pacs':
            continue
        for kwd in group.getElementsByTagName('kwd'):
            result.append(xml_to_text(kwd))
    return result
def get_references(self, xml):
    """Return reference 8-tuples:
    (label, authors, doi, issue, page, title, volume, year).

    Citations without a ``BibArticle`` child are unstructured: the raw
    ``BibUnstructured`` text fills the first slot, the rest are ''.
    """
    references = []
    for reference in xml.getElementsByTagName("Citation"):
        if not reference.getElementsByTagName("BibArticle"):
            references.append((get_value_in_tag(reference,
                                                "BibUnstructured"),
                               '', '', '', '', '', '', ''))
        else:
            label = get_value_in_tag(reference, "ArticleTitle")
            authors = []
            for author in reference.getElementsByTagName("BibAuthorName"):
                given_name = get_value_in_tag(author, "Initials")
                surname = get_value_in_tag(author, "FamilyName")
                if given_name:
                    name = "%s, %s" % (surname, given_name)
                else:
                    name = surname
                authors.append(name)
            # DOI lives in an Occurrence element of Type="DOI".
            doi_tag = reference.getElementsByTagName("Occurrence")
            doi = ""
            for tag in doi_tag:
                if tag.getAttribute("Type") == "DOI":
                    doi = xml_to_text(tag)
            ## What is it exactly?
            # issue = get_value_in_tag(reference, "sb:issue")
            issue = ""
            page = get_value_in_tag(reference, "FirstPage")
            title = get_value_in_tag(reference, "JournalTitle")
            volume = get_value_in_tag(reference, "VolumeID")
            year = get_value_in_tag(reference, "Year")
            references.append((label, authors, doi, issue, page, title,
                               volume, year))
    return references
def get_references(self, xml):
    """Collect reference 11-tuples into ``self.references``:
    (label, authors, doi, issue, page, page_last, title, volume, year,
    ext_link, plain_text).
    """
    references = []
    for reference in xml.getElementsByTagName("ref"):
        plain_text = None
        # NOTE(review): assumes every <ref> has a <citation> child;
        # raises IndexError otherwise — confirm against input corpus.
        ref_type = reference.getElementsByTagName('citation')[0].getAttribute('publication-type').encode('utf-8')
        label = get_value_in_tag(reference, "label").strip('.')
        authors = []
        for author in reference.getElementsByTagName("name"):
            given_name = get_value_in_tag(author, "given-names")
            surname = get_value_in_tag(author, "surname")
            if given_name:
                name = "%s, %s" % (surname, given_name)
            else:
                name = surname
            if name.strip().split() == []:
                # Fall back to the unstructured single-string form.
                name = get_value_in_tag(author, "string-name")
            authors.append(name)
        doi_tag = reference.getElementsByTagName("pub-id")
        doi = ""
        for tag in doi_tag:
            if tag.getAttribute("pub-id-type") == "doi":
                doi = xml_to_text(tag)
        issue = get_value_in_tag(reference, "issue")
        page = get_value_in_tag(reference, "fpage")
        page_last = get_value_in_tag(reference, "lpage")
        title = get_value_in_tag(reference, "source")
        volume = get_value_in_tag(reference, "volume")
        year = get_value_in_tag(reference, "year")
        ext_link = format_arxiv_id(super(NLMParser, self).get_ref_link(reference, "arxiv"))
        if ref_type != 'journal':
            # Non-journal references keep their full citation text too.
            plain_text = get_value_in_tag(reference, "mixed-citation")
        references.append((label, authors, doi, issue, page, page_last,
                           title, volume, year, ext_link, plain_text))
    self.references = references
def _get_references(self):
    """Yield one tuple per ``ref`` element:
    (label, ref_type, text_ref, ext_link, authors, year, source,
    volume, page).
    """
    for ref in self.document.getElementsByTagName('ref'):
        label = ref.getAttribute('id')
        # Keep only the digits of the id, e.g. "c12" -> "12".
        label = sub(r'\D', '', label)
        text_ref = ''
        ext_link = ''
        for mixed in ref.getElementsByTagName('mixed-citation'):
            ref_type = mixed.getAttribute('publication-type')
            if ref_type == 'thesis':
                text_ref = get_value_in_tag(ref, 'mixed-citation')
            elif ref_type == 'conf-proc':
                text_ref = get_value_in_tag(ref, 'mixed-citation')
            elif ref_type == 'other' or ref_type == 'web':
                text_ref = get_value_in_tag(ref, 'mixed-citation')
                ext_link = get_value_in_tag(mixed, 'ext-link')
            elif ref_type == 'book':
                text_ref = xml_to_text(mixed)
        authors = []
        for auth in ref.getElementsByTagName('string-name'):
            surname = get_value_in_tag(auth, 'surname')
            given_names = get_value_in_tag(auth, 'given-names')
            given_names = collapse_initials(given_names)
            authors.append('%s, %s' % (surname, given_names))
        year = get_value_in_tag(ref, 'year')
        source = get_value_in_tag(ref, 'source')
        volume = get_value_in_tag(ref, 'volume')
        page = get_value_in_tag(ref, 'fpage')
        if ref_type == 'journal':
            # NOTE(review): ref_type is whatever the *last*
            # mixed-citation set; assumes at least one exists — confirm.
            source, vol = fix_journal_name(source, self.journal_mappings)
            if vol:
                volume = vol + volume
        yield (label, ref_type, text_ref, ext_link, authors, year,
               source, volume, page)
def _get_authors(self):
    """Return (name, affiliation) pairs for contribs typed 'author'.

    Affiliations are resolved via ``xref`` elements of type 'aff';
    an unresolvable id yields an empty affiliation string.
    """
    authors = []
    affiliations = {}
    for tag in self.document.getElementsByTagName('aff'):
        aid = tag.getAttribute('id')
        affiliation = xml_to_text(tag)
        # Drop the first word: it is the printed affiliation label.
        affiliation = ' '.join(affiliation.split()[1:])
        affiliations[aid] = affiliation
    for tag in self.document.getElementsByTagName('contrib'):
        if tag.getAttribute('contrib-type') == 'author':
            rid = ''
            for aff in tag.getElementsByTagName('xref'):
                if aff.getAttribute('ref-type') == 'aff':
                    rid = aff.getAttribute('rid')
            # A space-separated multi-valued rid keeps only its first id.
            if len(rid.split()) > 1:
                rid = rid.split()[0]
            given_names = get_value_in_tag(tag, 'given-names')
            given_names = collapse_initials(given_names)
            surname = get_value_in_tag(tag, 'surname')
            name = "%s, %s" % (surname, given_names)
            try:
                authors.append((name, affiliations[rid]))
            except KeyError:
                authors.append((name, ''))
    return authors
def convert_record(record, response_date, request):
    """Convert one harvested OAI-PMH Hindawi record into MARCXML.

    Returns a ``(marcxml_or_None, is_new)`` pair: ``(None, True)`` for
    a deletion of a record we never had; otherwise the serialized
    record and whether it was previously unknown to the system.
    """
    header = record.getElementsByTagName("header")[0]
    oai_identifier = get_value_in_tag(header, "identifier")
    datestamp = get_value_in_tag(header, "datestamp")
    status = header.getAttribute("status").encode('utf8')
    rec = create_record()
    # Provenance field: where and when this record was harvested.
    record_add_field(rec, tag="035", subfields=[('a', oai_identifier),
                                                ('u', request),
                                                ('9', 'Hindawi'),
                                                ('d', datestamp),
                                                ('h', response_date),
                                                ('m', 'marc21'),
                                                ('t', 'false')])
    new = True
    if find_records_from_extoaiid(oai_identifier, 'Hindawi'):
        new = False
    if status == 'deleted':
        if new:
            ## deleting a record we didn't have? Who cares :-)
            return None, True
        else:
            record_add_field(rec, tag="980",
                             subfields=[('a', 'SCOAP3'),
                                        ('b', 'Hindawi'),
                                        ('c', 'DELETED')])
            return record_xml_output(rec), False
    # Copy every incoming datafield verbatim into the new record.
    for datafield in record.getElementsByTagName("datafield"):
        tag = datafield.getAttribute("tag").encode('utf-8')
        ind1 = datafield.getAttribute("ind1").encode('utf-8') or ' '
        ind2 = datafield.getAttribute("ind2").encode('utf-8') or ' '
        subfields = []
        for subfield in datafield.getElementsByTagName("subfield"):
            code = subfield.getAttribute("code").encode('utf-8')
            value = xml_to_text(subfield)
            subfields.append((code, value))
        record_add_field(rec, tag=tag, ind1=ind1, ind2=ind2,
                         subfields=subfields)
    return record_xml_output(rec), new
def get_references(self, xml_doc):
    """Yield reference tuples from ``ce:bib-reference`` elements.

    In CONSYN mode each inner ``sb:reference`` is delegated to
    ``self._get_ref`` (the whole ref is used when there are none).
    Otherwise a 10-tuple is yielded: (label, authors, doi, issue,
    page, title, volume, year, textref, ext_link).
    """
    for ref in xml_doc.getElementsByTagName("ce:bib-reference"):
        label = get_value_in_tag(ref, "ce:label")
        if self.CONSYN:
            innerrefs = ref.getElementsByTagName("sb:reference")
            if not innerrefs:
                yield self._get_ref(ref, label)
            for inner in innerrefs:
                yield self._get_ref(inner, label)
        else:
            authors = []
            for author in ref.getElementsByTagName("sb:author"):
                given_name = get_value_in_tag(author, "ce:given-name")
                surname = get_value_in_tag(author, "ce:surname")
                if given_name:
                    name = "%s, %s" % (surname, given_name)
                else:
                    name = surname
                authors.append(name)
            doi = get_value_in_tag(ref, "ce:doi")
            issue = get_value_in_tag(ref, "sb:issue")
            page = get_value_in_tag(ref, "sb:first-page")
            title = get_value_in_tag(ref, "sb:maintitle")
            volume = get_value_in_tag(ref, "sb:volume-nr")
            tmp_issues = ref.getElementsByTagName('sb:issue')
            if tmp_issues:
                # Year = first four characters of the issue's sb:date.
                year = get_value_in_tag(tmp_issues[0], "sb:date")[:4]
            else:
                year = ''
            textref = ref.getElementsByTagName("ce:textref")
            if textref:
                textref = xml_to_text(textref[0])
            ext_link = format_arxiv_id(self.get_ref_link(ref, 'arxiv'))
            yield (label, authors, doi, issue, page, title, volume,
                   year, textref, ext_link)
def _get_subject(self):
    """Return a comma-joined string of 'toc-minor'/'section' subjects."""
    wanted = ('toc-minor', 'section')
    found = []
    for group in self.document.getElementsByTagName('subj-group'):
        if group.getAttribute('subj-group-type') in wanted:
            for subject in group.getElementsByTagName('subject'):
                found.append(xml_to_text(subject))
    return ', '.join(found)
def _get_authors(self):
    """Return (name, affiliations) pairs from pex-dc creator elements.

    Names are rendered 'Surname, Given names' (initials collapsed,
    title-cased).  Affiliations accumulate per creator.
    """
    authors = []
    for pextag in self.document.getElementsByTagName('pex-dc:creator'):
        affiliations = []
        for auttag in pextag.getElementsByTagName('pex-dc:name'):
            author = xml_to_text(auttag)
            # Last whitespace-separated token is treated as the surname.
            lastname = author.split()[-1]
            givenames = " ".join(author.split()[:-1])
            givenames = collapse_initials(givenames)
            name = "%s, %s" % (lastname, givenames)
            name = safe_title(name)
            for afftag in pextag.getElementsByTagName(
                    'pex-dc:affiliation'):
                if afftag:
                    affiliations.append(xml_to_text(afftag))
            authors.append((name, affiliations))
    return authors
def _get_orcid(self, xml_author):
    """Return the ORCID identifier from a JATS ``contrib`` element.

    Reads the first ``contrib-id`` child.  Returns ``None`` when the
    element is absent, is not of type 'orcid', or contains no id.

    Fix: the character class was ``[\\d|X]``, which also matched a
    literal ``|`` — the ORCID checksum character is a digit or ``X``
    only.  Pattern is now a raw string as well.
    """
    try:
        contrib_id = xml_author.getElementsByTagName('contrib-id')[0]
        if contrib_id.getAttribute('contrib-id-type') == 'orcid':
            orcid_raw = xml_to_text(contrib_id)
            pattern = r'\d{4}-\d{4}-\d{4}-\d{3}[\dX]'
            return re.search(pattern, orcid_raw).group()
    except (IndexError, AttributeError):
        return None
def _affiliation_from_sa_field(self, affiliation):
    """Return the affiliation text, preferring the structured
    ``sa:affiliation`` child; raise IndexError when neither form
    yields any text."""
    structured = affiliation.getElementsByTagName('sa:affiliation')
    if structured:
        return xml_to_text(structured[0], ', ')
    text = get_value_in_tag(affiliation, "ce:textfn")
    text = re.sub(r'^(\d+\ ?)', "", text)
    if not text:
        raise IndexError
    return text
def get_doi(self, xml):
    """Return the article DOI from ``article-id`` elements ('' if none).

    When several DOI-typed ids exist the last one wins (original
    behavior).  A warning is printed when no DOI is found.

    Fix: Python-2-only ``print >> sys.stdout`` replaced with a
    portable ``sys.stdout.write``.
    """
    ret = ""
    for article_id in xml.getElementsByTagName('article-id'):
        id_type = article_id.getAttribute('pub-id-type').encode('utf-8')
        if id_type == 'doi':
            ret = xml_to_text(article_id)
    if not ret:
        sys.stdout.write("Can't find DOI.\n")
    return ret
def _get_authors(self):
    """Return 'Surname, Given names' strings for every dc:creator,
    with capitalization fixed and initials collapsed."""
    result = []
    for creator in self.document.getElementsByTagName('dc:creator'):
        tokens = xml_to_text(creator).split()
        surname, given = fix_name_capitalization(tokens[-1], tokens[:-1])
        result.append("%s, %s" % (surname, collapse_initials(given)))
    return result
def _get_authors(self):
    """Return (name, affiliations, emails) triples for contribs typed
    'author'.  Name is 'Surname, Given names' with capitalization
    fixed and initials collapsed."""
    authors = []
    for contrib in self.document.getElementsByTagName('contrib'):
        if contrib.getAttribute('contrib-type') == 'author':
            surname = get_value_in_tag(contrib, 'surname')
            given_names = get_value_in_tag(contrib, 'given-names')
            given_names = collapse_initials(given_names)
            surname, given_names = fix_name_capitalization(
                surname, given_names.split()
            )
            name = '%s, %s' % (surname, given_names)
            affiliations = []
            for aff in contrib.getElementsByTagName('aff'):
                affiliations.append(xml_to_text(aff))
            emails = []
            for email in contrib.getElementsByTagName('email'):
                emails.append(xml_to_text(email))
            authors.append((name, affiliations, emails))
    return authors
def get_collection(self, journal):
    """Return this articles' collection.

    ConferencePaper when a ``conference`` element is present or the
    journal is the IJMP Conference Series; Review for review-type
    articles; Published otherwise.
    """
    conference = ''
    for tag in self.document.getElementsByTagName('conference'):
        conference = xml_to_text(tag)
    is_conf_series = (journal == "International Journal of Modern "
                                 "Physics: Conference Series")
    if conference or is_conf_series:
        return [('a', 'HEP'), ('a', 'ConferencePaper')]
    if self._get_article_type() == "review-article":
        return [('a', 'HEP'), ('a', 'Review')]
    return [('a', 'HEP'), ('a', 'Published')]
def get_authors(self, xml_doc):
    """Extract authors from Elsevier ``ce:author`` elements.

    Each author dict may carry surname, given_name, initials, orcid,
    email, cross_ref (affiliation reference ids) and affiliation (list
    of resolved affiliation strings).  When no author has an explicit
    cross-reference and at least one affiliation exists, every
    affiliation is assigned to every author.

    Fix: the Python-2-only ``xrange`` loop is replaced with a
    comprehension, consistent with this block's Python-3-style
    ``print()`` call.
    """
    authors = []
    for author in xml_doc.getElementsByTagName("ce:author"):
        tmp = {}
        surname = get_value_in_tag(author, "ce:surname")
        if surname:
            tmp["surname"] = surname
        given_name = get_value_in_tag(author, "ce:given-name")
        if given_name:
            tmp["given_name"] = given_name
        initials = get_value_in_tag(author, "ce:initials")
        if initials:
            tmp["initials"] = initials
        orcid = author.getAttribute('orcid').encode('utf-8')
        if orcid:
            tmp["orcid"] = orcid
        for email in author.getElementsByTagName("ce:e-address"):
            if email.getAttribute("type").encode('utf-8') in ('email', ''):
                tmp["email"] = xml_to_text(email)
                break
        cross_refs = author.getElementsByTagName("ce:cross-ref")
        if cross_refs:
            tmp["cross_ref"] = [
                cross_ref.getAttribute("refid").encode('utf-8')
                for cross_ref in cross_refs]
        authors.append(tmp)
    affiliations = {}
    for affiliation in xml_doc.getElementsByTagName("ce:affiliation"):
        aff_id = affiliation.getAttribute("id").encode('utf-8')
        # Strip any leading printed label such as "1 ".
        text = re.sub(
            r'^(\d+\ ?)', "",
            get_value_in_tag(affiliation, "ce:textfn"))
        affiliations[aff_id] = text
    implicit_affilations = True
    for author in authors:
        matching_ref = [ref for ref in author.get("cross_ref", [])
                        if ref in affiliations]
        if matching_ref:
            implicit_affilations = False
            author["affiliation"] = [affiliations[ref]
                                     for ref in matching_ref]
    if implicit_affilations and len(affiliations) > 1:
        message = "Implicit affiliations are used, "
        message += ("but there's more than one affiliation: "
                    + str(affiliations))
        print(message, file=sys.stderr)
    if implicit_affilations and len(affiliations) >= 1:
        for author in authors:
            author["affiliation"] = list(affiliations.values())
    return authors
def _get_authors(self):
    """Return (name, affiliations, emails, collaborations) tuples for
    every contributor marked as an author."""
    result = []
    for contrib in self.document.getElementsByTagName('contrib'):
        if contrib.getAttribute('contrib-type') != 'author':
            continue
        surname = get_value_in_tag(contrib, 'surname')
        given = collapse_initials(get_value_in_tag(contrib,
                                                   'given-names'))
        name = safe_title('%s, %s' % (surname, given))
        affiliations = [xml_to_text(aff)
                        for aff in contrib.getElementsByTagName('aff')]
        emails = [xml_to_text(email)
                  for email in contrib.getElementsByTagName('email')]
        collaborations = [
            xml_to_text(collab)
            for collab in contrib.getElementsByTagName("collab")]
        result.append((name, affiliations, emails, collaborations))
    return result
def _affiliation_from_sa_field(self, affiliation):
    """Return the affiliation text for an Elsevier affiliation element.

    Prefers the structured ``sa:affiliation`` child; otherwise falls
    back to ``ce:textfn`` with any leading numeric label stripped.

    :raises IndexError: when neither source yields any text.
    """
    sa_affiliation = affiliation.getElementsByTagName('sa:affiliation')
    if sa_affiliation:
        return xml_to_text(sa_affiliation[0], ', ')
    else:
        # Strip a leading printed label like "1 ".
        affiliation = re.sub(r'^(\d+\ ?)', "",
                             get_value_in_tag(affiliation, "ce:textfn"))
        if affiliation:
            return affiliation
        else:
            raise IndexError
def _get_authors(self):
    """Return 'Lastname, Given Names' per dc:creator, each word with
    its first letter kept and the remainder lowered."""
    result = []
    for creator in self.document.getElementsByTagName('dc:creator'):
        words = xml_to_text(creator).split()
        fixed = [w[0] + w[1:].lower() for w in words]
        result.append("%s, %s" % (fixed[-1], " ".join(fixed[:-1])))
    return result
def get_issn(self, xml):
    """Return the electronic ISSN when present, else the first ISSN,
    else ``None``.

    BUG FIX: the original assigned ``issn.getAttribute("pub-type")``
    — i.e. the literal string 'epub' — instead of the ISSN text of
    the matching element.
    """
    issns = xml.getElementsByTagName('issn')
    ret = None
    for issn in issns:
        if issn.getAttribute("pub-type").encode('utf-8') == 'epub':
            ret = xml_to_text(issn)
    if not ret and issns:
        ret = xml_to_text(issns[0])
    return ret
def author_dic_from_xml(author):
    """Build an author dict from a ``ce:author`` element, dropping
    None-valued entries (empty strings/lists are kept).

    Keys: surname, given_name, initials, orcid, email (first
    ``ce:e-address`` typed 'email' or untyped), cross_ref (all
    refids).
    """
    email = None
    for address in author.getElementsByTagName("ce:e-address"):
        if unicode(address.getAttribute("type")) in ('email', ''):
            email = xml_to_text(address)
            break
    candidates = {
        'surname': get_value_in_tag(author, "ce:surname"),
        'given_name': get_value_in_tag(author, "ce:given-name"),
        'initials': get_value_in_tag(author, "ce:initials"),
        'orcid': unicode(author.getAttribute('orcid')),
        'email': email,
        'cross_ref': [
            unicode(ref.getAttribute("refid"))
            for ref in author.getElementsByTagName("ce:cross-ref")],
    }
    return {key: val for key, val in candidates.items()
            if val is not None}
def _get_affiliations(self):
    """Map affiliation id -> text, with a purely-numeric leading
    label removed when present."""
    affiliations = {}
    for aff in self.document.getElementsByTagName('aff'):
        aid = aff.getAttribute('id')
        text = xml_to_text(aff)
        if text:
            words = text.split()
            # EAFP: only drop the first word if it is an integer label.
            try:
                int(words[0])
                text = ' '.join(words[1:])
            except ValueError:
                pass
        affiliations[aid] = text
    return affiliations
def get_references(self, xml):
    """Populate ``self.references`` with 11-tuples:
    (label, authors, doi, issue, page, page_last, title, volume, year,
    ext_link, plain_text).

    Fix: the two bare ``except:`` clauses are narrowed — the first to
    the ``IndexError`` raised when ``mixed-citation`` is absent, the
    second to ``Exception`` (get_value_in_tag's failure mode is not
    visible here, so it stays broad but explicit).
    """
    references = []
    for reference in xml.getElementsByTagName("ref"):
        plain_text = None
        # JATS articles use <mixed-citation>; older ones <citation>.
        try:
            ref_type = reference.getElementsByTagName(
                'mixed-citation')[0]
            ref_type = ref_type.getAttribute(
                'publication-type').encode('utf-8')
        except IndexError:
            ref_type = reference.getElementsByTagName('citation')[0]
            ref_type = ref_type.getAttribute(
                'publication-type').encode('utf-8')
        label = get_value_in_tag(reference, "label").strip('.')
        authors = []
        for author in reference.getElementsByTagName("name"):
            given_name = get_value_in_tag(author, "given-names")
            surname = get_value_in_tag(author, "surname")
            if given_name:
                name = "%s, %s" % (surname, given_name)
            else:
                name = surname
            if name.strip().split() == []:
                # Fall back to the unstructured single-string form.
                name = get_value_in_tag(author, "string-name")
            authors.append(name)
        doi = ""
        for tag in reference.getElementsByTagName("pub-id"):
            if tag.getAttribute("pub-id-type") == "doi":
                doi = xml_to_text(tag)
        issue = get_value_in_tag(reference, "issue")
        page = get_value_in_tag(reference, "fpage")
        page_last = get_value_in_tag(reference, "lpage")
        title = get_value_in_tag(reference, "source")
        volume = get_value_in_tag(reference, "volume")
        year = get_value_in_tag(reference, "year")
        ext_link = format_arxiv_id(self.get_ref_link(reference, "arxiv"))
        if ref_type != 'journal':
            try:
                plain_text = get_value_in_tag(
                    reference, "mixed-citation",
                    tag_to_remove=self.tag_to_remove)
            except Exception:
                plain_text = get_value_in_tag(
                    reference, "citation",
                    tag_to_remove=self.tag_to_remove)
        references.append((label, authors, doi, issue, page, page_last,
                           title, volume, year, ext_link, plain_text))
    self.references = references
def _get_author_emails(self):
    """Map footnote/corresp element id -> list of e-mail addresses.

    BUG FIX: the original tested ``email.split() > 1`` — a
    list-to-int comparison that is always True on Python 2 and a
    TypeError on Python 3.  The intended word-count test
    ``len(email.split()) > 1`` is used instead.
    """
    author_emails = {}
    for tag in self.document.getElementsByTagName('author-notes'):
        email_elements = tag.getElementsByTagName('corresp')
        email_elements += tag.getElementsByTagName('fn')
        for tg in email_elements:
            nid = tg.getAttribute('id')
            email = xml_to_text(tg)
            email = email.replace(';', '')
            if len(email.split()) > 1:
                # First token is the footnote label; any remaining
                # token containing '@' and '.' is kept as an address.
                valid_emails = []
                for word in email.split()[1:]:
                    if '@' in word and '.' in word:
                        valid_emails.append(word)
                author_emails[nid] = valid_emails
    return author_emails
def author_dic_from_xml(author):
    """Build an author dict from a ``ce:author`` DOM element.

    Keys: surname, given_name, initials, orcid, email (the first
    ``ce:e-address`` typed 'email' or untyped), cross_ref (all
    refids).  None-valued entries are dropped; empty strings and
    lists survive.  NOTE(review): ``unicode`` makes this
    Python-2-only — confirm intended runtime.
    """
    return {
        key: val for key, val in {
            'surname': get_value_in_tag(author, "ce:surname"),
            'given_name': get_value_in_tag(author, "ce:given-name"),
            'initials': get_value_in_tag(author, "ce:initials"),
            'orcid': unicode(author.getAttribute('orcid')),
            'email': next((xml_to_text(email)
                           for email in author.getElementsByTagName("ce:e-address")
                           if unicode(email.getAttribute("type")) in ('email', '')),
                          None),
            'cross_ref': [
                unicode(cross_ref.getAttribute("refid"))
                for cross_ref in author.getElementsByTagName("ce:cross-ref")
            ]
        }.items() if val is not None
    }
def _get_orcids(xml_doc):
    """Return one raw ORCID string per author ('' when unavailable).

    Checks Elsevier ``ce:author`` ``orcid`` attributes, then appends
    results from JATS ``contrib``/``contrib-id`` elements (both lists
    are scanned; there is no early return here, unlike the formatted
    variant elsewhere in this module).

    Fix: the character class was ``[\\d|X]``, which also matched a
    literal ``|`` — the ORCID checksum character is a digit or ``X``
    only.  Pattern is now a raw string as well.
    """
    result = []
    xml_authors = xml_doc.getElementsByTagName("ce:author")
    for xml_author in xml_authors:
        try:
            orcid = xml_author.getAttribute('orcid')
            result.append(orcid)
        except IndexError:
            result.append('')
    xml_authors = xml_doc.getElementsByTagName("contrib")
    for xml_author in xml_authors:
        try:
            contrib_id = xml_author.getElementsByTagName('contrib-id')[0]
            if contrib_id.getAttribute('contrib-id-type') == 'orcid':
                orcid_raw = xml_to_text(contrib_id)
                pattern = r'\d{4}-\d{4}-\d{4}-\d{3}[\dX]'
                result.append(re.search(pattern, orcid_raw).group())
        except (IndexError, AttributeError):
            result.append('')
    return result
def _get_note(self, note_id):
    """Return the text of footnote *note_id* with its label elements
    removed, or ``None`` when no matching ``fn`` exists."""
    for fn in self.document.getElementsByTagName('fn'):
        if fn.getAttribute('id') != note_id:
            continue
        for label in fn.getElementsByTagName('label'):
            fn.removeChild(label)
        return xml_to_text(fn)
def get_record_rich(self, filename, ref_extract_callback=None):
    """Get the MARCXML of a file in the xaml_rich directory.

    :param filename: the name of the file to parse.
    :type filename: string
    :param ref_extract_callback: optional callable that turns a raw
        reference string into refextract MARCXML.
    :returns: a string with the MARCXML version of the file, or ''
        when the article type is not 'Article' or on encoding errors.

    Fix: misspelled local ``nuber_of_pages`` renamed to
    ``number_of_pages``; documentation added.
    """
    self.document = parse(filename)
    rec = create_record()
    articles = self.document.getElementsByTagName('ArticleID')
    for article in articles:
        article_type = article.getAttribute('Type')
        if not article_type == 'Article':
            return ''
    doi = get_value_in_tag(self.document, 'DOI')
    # Prefer the acceptance date; fall back to the online date.
    date = ''
    for tag in self.document.getElementsByTagName('Accepted'):
        year = get_value_in_tag(tag, 'Year')
        month = get_value_in_tag(tag, 'Month').zfill(2)
        day = get_value_in_tag(tag, 'Day').zfill(2)
        date = "%s-%s-%s" % (year, month, day)
    if not date:
        for tag in self.document.getElementsByTagName('OnlineDate'):
            year = get_value_in_tag(tag, 'Year')
            month = get_value_in_tag(tag, 'Month').zfill(2)
            day = get_value_in_tag(tag, 'Day').zfill(2)
            date = "%s-%s-%s" % (year, month, day)
    # NOTE(review): ``article`` is the leaked loop variable — page
    # data is read from the last ArticleID element; confirm intended.
    first_page = get_value_in_tag(article, 'FirstPage')
    last_page = get_value_in_tag(article, 'LastPage')
    subjects = article.getElementsByTagName('Keyword')
    subjects = map(xml_to_text, subjects)
    subject = ', '.join(subjects)
    copyright_statement = get_value_in_tag(article, 'Copyright')
    journal = get_value_in_tag(self.document, 'JournalTitle')
    journal, volume = fix_journal_name(journal, self.journal_mappings)
    issues = self.document.getElementsByTagName('IssueID')
    for issue in issues:
        volume += get_value_in_tag(issue, 'Volume')
        year = get_value_in_tag(issue, 'Year')
    title = get_value_in_tag(self.document, 'Title')
    authors = self.document.getElementsByTagName('Author')
    affiliations = self.document.getElementsByTagName('Affiliation')

    def affiliation_pair(a):
        # (affiliation id, unstructured affiliation text)
        return a.getAttribute('ID'), get_value_in_tag(
            a, 'UnstructuredAffiliation')

    affiliations = map(affiliation_pair, affiliations)
    affiliations = dict(affiliations)

    def author_pair(a):
        # ('Surname, First [Middle]', resolved affiliation or '')
        surname = get_value_in_tag(a, 'LastName')
        first_name = get_value_in_tag(a, 'FirstName')
        middle_name = get_value_in_tag(a, 'MiddleName')
        if middle_name:
            name = '%s, %s %s' % (surname, first_name, middle_name)
        else:
            name = '%s, %s' % (surname, first_name)
        try:
            affid = a.getElementsByTagName(
                'AffiliationID')[0].getAttribute('Label')
            affiliation = affiliations[affid]
        except IndexError:
            affiliation = ''
        except KeyError:
            affiliation = ''
        return name, affiliation

    authors = map(author_pair, authors)
    abstract = get_value_in_tag(self.document, 'Abstract')
    references = self.document.getElementsByTagName('Bibliomixed')
    for reference in references:
        subfields = []
        label = reference.getAttribute('N')
        if label:
            subfields.append(('o', label))
        bibliosets = reference.getElementsByTagName('Biblioset')
        for tag in bibliosets:
            ref_year = get_value_in_tag(tag, 'Date')
            ref_journal = get_value_in_tag(tag, 'JournalShortTitle')
            ref_journal, ref_volume = fix_journal_name(
                ref_journal, self.journal_mappings)
            ref_volume += get_value_in_tag(tag, 'Volume')
            ref_page = get_value_in_tag(tag, 'ArtPageNums')
            if ref_year:
                subfields.append(('y', ref_year))
            if ref_journal and ref_volume and ref_page:
                subfields.append(
                    ('s', '%s,%s,%s' % (ref_journal, ref_volume,
                                        ref_page)))
            # Remove the structured part so only the free text is left.
            reference.removeChild(tag)
        text_ref = xml_to_text(reference)
        if ref_extract_callback:
            ref_xml = ref_extract_callback(text_ref)
            dom = parseString(ref_xml)
            fields = dom.getElementsByTagName("datafield")[0]
            fields = fields.getElementsByTagName("subfield")
            if fields:
                subfields.append(('9', 'refextract'))
                for field in fields:
                    data = field.firstChild.data
                    code = field.getAttribute("code")
                    # Skip the raw text when structured data exists.
                    if code == 'm' and bibliosets:
                        continue
                    else:
                        subfields.append((code, data))
        else:
            subfields.append(('m', text_ref))
        if subfields:
            record_add_field(rec, '999', ind1='C', ind2='5',
                             subfields=subfields)
    if title:
        record_add_field(rec, '245', subfields=[('a', title)])
    if date:
        record_add_field(rec, '260', subfields=[('c', date),
                                                ('t', 'published')])
    if doi:
        record_add_field(rec, '024', ind1='7',
                         subfields=[('a', doi), ('2', 'DOI')])
    if abstract:
        record_add_field(rec, '520', subfields=[('a', abstract),
                                                ('9', 'EDPSciences')])
    # First author goes into 100, the rest into 700.
    first_author = True
    for author in authors:
        if first_author:
            subfields = [('a', author[0])]
            if author[1]:
                subfields.append(('v', author[1]))
            record_add_field(rec, '100', subfields=subfields)
            first_author = False
        else:
            subfields = [('a', author[0])]
            if author[1]:
                subfields.append(('v', author[1]))
            record_add_field(rec, '700', subfields=subfields)
    subfields = []
    if journal and volume and first_page:
        subfields.append(('s', "%s,%s,%s" % (journal, volume,
                                             first_page)))
    if first_page and last_page:
        try:
            number_of_pages = int(last_page) - int(first_page)
            record_add_field(rec, '300',
                             subfields=[('a', str(number_of_pages))])
        except ValueError:
            # Non-numeric page labels: skip the page count field.
            pass
        subfields.append(('c', '%s-%s' % (first_page, last_page)))
    if year:
        subfields.append(('y', year))
    record_add_field(rec, '773', subfields=subfields)
    record_add_field(rec, '980', subfields=[('a', 'HEP')])
    if copyright_statement:
        record_add_field(rec, '542',
                         subfields=[('f', copyright_statement)])
    if subject:
        record_add_field(rec, '650', ind1='1', ind2='7',
                         subfields=[('2', 'EDPSciences'),
                                    ('a', subject)])
    try:
        return record_xml_output(rec)
    except UnicodeDecodeError:
        message = "Found a bad char in the file for the article " + doi
        sys.stderr.write(message)
        return ""
def _get_ref(self, ref, label):
    """Parse a single bibliography ``<ref>`` DOM element into citation fields.

    :param ref: DOM element of one reference entry (Elsevier CE/SB schema).
    :param label: reference label to carry through unchanged into the result.
    :returns: tuple ``(label, authors, doi, issue, page, title, volume,
              year, textref, ext_link, isjournal, comment, journal,
              publisher, editors, book_title)`` where ``authors`` and
              ``editors`` are lists of "Surname, Givenname" strings and
              ``isjournal`` is truthy when an ``sb:issue`` element exists.
    """
    doi = get_value_in_tag(ref, "ce:doi")
    # Prefer a first page; fall back to an article number when unpaginated.
    page = get_value_in_tag(ref, "sb:first-page")
    if not page:
        page = get_value_in_tag(ref, "sb:article-number")
    issue = get_value_in_tag(ref, "sb:issue")
    # NOTE(review): this title is overwritten unconditionally below
    # (title = "" before the sb:contribution lookup) — dead assignment.
    title = get_value_in_tag(ref, "sb:maintitle")
    volume = get_value_in_tag(ref, "sb:volume-nr")
    # Year lives inside the first sb:issue element, when present.
    tmp_issues = ref.getElementsByTagName('sb:issue')
    if tmp_issues:
        year = get_value_in_tag(tmp_issues[0], "sb:date")
    else:
        year = ''
    # Raw, unstructured text of the reference (used as a fallback elsewhere).
    textref = ref.getElementsByTagName("ce:textref")
    if textref:
        textref = xml_to_text(textref[0])
    ext_link = format_arxiv_id(self.get_ref_link(ref, 'arxiv'))
    authors = []
    for author in ref.getElementsByTagName("sb:author"):
        given_name = get_value_in_tag(author, "ce:given-name")
        surname = get_value_in_tag(author, "ce:surname")
        if given_name:
            name = "%s, %s" % (surname, given_name)
        else:
            name = surname
        authors.append(name)
    if ext_link and ext_link.lower().startswith('arxiv'):
        # Check if the identifier contains digits separated by a dot
        # (new-style arXiv id); otherwise strip the leading "arxiv:" prefix.
        regex = r'\d*\.\d*'
        if not re.search(regex, ext_link):
            ext_link = ext_link[6:]
    comment = get_value_in_tag(ref, "sb:comment")
    # Collected but not returned — presumably for side inspection only.
    links = []
    for link in ref.getElementsByTagName("ce:inter-ref"):
        links.append(xml_to_text(link))
    # Real title comes from the sb:contribution container when available.
    title = ""
    try:
        container = ref.getElementsByTagName("sb:contribution")[0]
        title = container.getElementsByTagName("sb:maintitle")[0]
        title = xml_to_text(title)
    except IndexError:
        title = ''
    except TypeError:
        title = ''
    # A reference with an sb:issue element is treated as a journal article.
    isjournal = ref.getElementsByTagName("sb:issue")
    journal = ""
    if isjournal:
        isjournal = True
        if not page:
            page = comment
        container = ref.getElementsByTagName("sb:issue")[0]
        journal = get_value_in_tag(container, "sb:maintitle")
    edited_book = ref.getElementsByTagName("sb:edited-book")
    editors = []
    book_title = ""
    publisher = ""
    if edited_book:
        # treat as a journal
        if ref.getElementsByTagName("sb:book-series"):
            container = ref.getElementsByTagName("sb:book-series")[0]
            journal = get_value_in_tag(container, "sb:maintitle")
            year = get_value_in_tag(ref, "sb:date")
            isjournal = True
        # conference
        elif ref.getElementsByTagName("sb:conference"):
            container = ref.getElementsByTagName("sb:edited-book")[0]
            maintitle = get_value_in_tag(container, "sb:maintitle")
            conference = get_value_in_tag(container, "sb:conference")
            date = get_value_in_tag(container, "sb:date")
            # use this variable in order to get in the 'm' field
            publisher = maintitle + ", " + conference + ", " + date
        else:
            # Plain edited book: collect editors, title, year, publisher.
            container = ref.getElementsByTagName("sb:edited-book")[0]
            if ref.getElementsByTagName("sb:editors"):
                for editor in ref.getElementsByTagName("sb:editor"):
                    surname = get_value_in_tag(editor, "ce:surname")
                    firstname = get_value_in_tag(editor, "ce:given-name")
                    editors.append("%s,%s" % (surname, firstname))
            # If a contribution title was found above, the book maintitle is
            # the container title; otherwise it becomes the title itself.
            if title:
                book_title = get_value_in_tag(container, "sb:maintitle")
            else:
                title = get_value_in_tag(container, "sb:maintitle")
            year = get_value_in_tag(container, "sb:date")
            if ref.getElementsByTagName("sb:publisher"):
                container = ref.getElementsByTagName("sb:publisher")[0]
                location = get_value_in_tag(container, "sb:location")
                publisher = get_value_in_tag(container, "sb:name")
                if location:
                    publisher = location + ": " + publisher
    if ref.getElementsByTagName("sb:book"):
        # Monograph: append any book-series title/volume to the title.
        if ref.getElementsByTagName("sb:book-series"):
            book_series = ref.getElementsByTagName("sb:book-series")[0]
            title += ", " + \
                get_value_in_tag(book_series, "sb:maintitle")
            title += ", " + \
                get_value_in_tag(book_series, "sb:volume-nr")
        publisher = get_value_in_tag(ref, "sb:publisher")
    if not year:
        year = get_value_in_tag(ref, "sb:date")
    # Keep only the digits of the date (e.g. "2001" out of "May 2001").
    year = re.sub(r'\D', '', year)
    return (label, authors, doi, issue, page, title, volume, year,
            textref, ext_link, isjournal, comment, journal, publisher,
            editors, book_title)
def get_record(self, path=None, no_pdf=False, test=False,
               refextract_callback=None):
    """Convert a record to MARCXML format.

    :param path: path to a record.
    :type path: string
    :param no_pdf: when True, skip looking up / attaching PDF files
                   (non-CONSYN branch only).
    :type no_pdf: bool
    :param test: flag to determine if it is a test call.
    :type test: bool
    :param refextract_callback: callback to be used to extract
                                unstructured references. It should
                                return a marcxml formated string
                                of the reference.
    :type refextract_callback: callable

    :returns: marcxml formated string.
    """
    xml_doc = self.get_article(path)
    rec = create_record()
    title = self.get_title(xml_doc)
    if title:
        record_add_field(rec, '245', subfields=[('a', title)])
    (journal, dummy, volume, issue, first_page, last_page, year,
     start_date, doi) = self.get_publication_information(xml_doc, path)
    if not journal:
        journal = self.get_article_journal(xml_doc)
    # 260: publication date, falling back to today when none is given.
    if start_date:
        record_add_field(rec, '260', subfields=[('c', start_date),
                                                ('t', 'published')])
    else:
        record_add_field(
            rec, '260', subfields=[('c', time.strftime('%Y-%m-%d'))])
    if doi:
        record_add_field(rec, '024', ind1='7', subfields=[('a', doi),
                                                          ('2', 'DOI')])
    license, license_url = self.get_license(xml_doc)
    if license and license_url:
        record_add_field(rec, '540', subfields=[('a', license),
                                                ('u', license_url)])
    elif license_url:
        record_add_field(rec, '540', subfields=[('u', license_url)])
    self.logger.info("Creating record: %s %s" % (path, doi))
    # Authors: first one goes to 100, the rest to 700.
    authors = self.get_authors(xml_doc)
    first_author = True
    for author in authors:
        author_name = (author['surname'], author.get(
            'given_name') or author.get('initials'))
        subfields = [('a', '%s, %s' % author_name)]
        if 'orcid' in author:
            subfields.append(('j', author['orcid']))
        if 'affiliation' in author:
            for aff in author["affiliation"]:
                subfields.append(('v', aff))
            if self.extract_nations:
                add_nations_field(subfields)
        if author.get('email'):
            subfields.append(('m', author['email']))
        if first_author:
            record_add_field(rec, '100', subfields=subfields)
            first_author = False
        else:
            record_add_field(rec, '700', subfields=subfields)
    abstract = self.get_abstract(xml_doc)
    if abstract:
        record_add_field(rec, '520', subfields=[('a', abstract),
                                                ('9', 'Elsevier')])
    record_copyright = self.get_copyright(xml_doc)
    if record_copyright:
        record_add_field(rec, '542', subfields=[('f', record_copyright)])
    keywords = self.get_keywords(xml_doc)
    if self.CONSYN:
        # CONSYN (INSPIRE-bound) records get collaborations, subjects as
        # keywords, doctype notes and plain FFT attachments.
        for tag in xml_doc.getElementsByTagName('ce:collaboration'):
            collaboration = get_value_in_tag(tag, 'ce:text')
            if collaboration:
                record_add_field(rec, '710',
                                 subfields=[('g', collaboration)])
        # We add subjects also as author keywords
        subjects = xml_doc.getElementsByTagName('dct:subject')
        for subject in subjects:
            for listitem in subject.getElementsByTagName('rdf:li'):
                keyword = xml_to_text(listitem)
                if keyword not in keywords:
                    keywords.append(keyword)
        for keyword in keywords:
            record_add_field(rec, '653', ind1='1',
                             subfields=[('a', keyword), ('9', 'author')])
        journal, dummy = fix_journal_name(journal.strip(),
                                          self.journal_mappings)
        subfields = []
        doctype = self.get_doctype(xml_doc)
        try:
            page_count = int(last_page) - int(first_page) + 1
            record_add_field(rec, '300',
                             subfields=[('a', str(page_count))])
        except ValueError:
            # Non-numeric page numbers: no page count field.
            pass
        if doctype == 'err':
            subfields.append(('m', 'Erratum'))
        elif doctype == 'add':
            subfields.append(('m', 'Addendum'))
        elif doctype == 'pub':
            subfields.append(('m', 'Publisher Note'))
        elif doctype == 'rev':
            record_add_field(rec, '980', subfields=[('a', 'Review')])
        if journal:
            subfields.append(('p', journal))
        if first_page and last_page:
            subfields.append(('c', '%s-%s' % (first_page, last_page)))
        elif first_page:
            subfields.append(('c', first_page))
        if volume:
            subfields.append(('v', volume))
        if year:
            subfields.append(('y', year))
        record_add_field(rec, '773', subfields=subfields)
        if not test:
            if license:
                url = 'http://www.sciencedirect.com/science/article/pii/'\
                    + path.split('/')[-1][:-4]
                record_add_field(rec, '856', ind1='4',
                                 subfields=[('u', url),
                                            ('y', 'Elsevier server')])
                record_add_field(rec, 'FFT',
                                 subfields=[('a', path),
                                            ('t', 'INSPIRE-PUBLIC'),
                                            ('d', 'Fulltext')])
            else:
                record_add_field(rec, 'FFT',
                                 subfields=[('a', path),
                                            ('t', 'Elsevier'),
                                            ('o', 'HIDDEN')])
            record_add_field(rec, '980', subfields=[('a', 'HEP')])
            record_add_field(rec, '980', subfields=[('a', 'Citeable')])
            record_add_field(rec, '980', subfields=[('a', 'Published')])
            self._add_references(xml_doc, rec, refextract_callback)
    else:
        # Non-CONSYN (SCOAP3) records are always CC-BY-3.0.
        licence = 'http://creativecommons.org/licenses/by/3.0/'
        record_add_field(rec, '540', subfields=[('a', 'CC-BY-3.0'),
                                                ('u', licence)])
        if keywords:
            for keyword in keywords:
                record_add_field(rec, '653', ind1='1',
                                 subfields=[('a', keyword),
                                            ('9', 'author')])
        pages = ''
        if first_page and last_page:
            pages = '{0}-{1}'.format(first_page, last_page)
        elif first_page:
            pages = first_page
        # Drop empty or placeholder ('-') subfields.
        subfields = filter(lambda x: x[1] and x[1] != '-',
                           [('p', journal),
                            ('v', volume),
                            ('n', issue),
                            ('c', pages),
                            ('y', year)])
        record_add_field(rec, '773', subfields=subfields)
        if not no_pdf:
            from invenio.search_engine import perform_request_search
            # BUGFIX: the closing quote was misplaced ('980:DELETED"'),
            # yielding a malformed query; quote the whole value instead.
            query = '0247_a:"%s" AND NOT 980:"DELETED"' % (doi, )
            prev_version = perform_request_search(p=query)
            old_pdf = False
            if prev_version:
                from invenio.bibdocfile import BibRecDocs
                prev_rec = BibRecDocs(prev_version[0])
                try:
                    pdf_path = prev_rec.get_bibdoc('main')
                    pdf_path = pdf_path.get_file(".pdf;pdfa",
                                                 exact_docformat=True)
                    pdf_path = pdf_path.fullpath
                    old_pdf = True
                    record_add_field(rec, 'FFT',
                                     subfields=[('a', pdf_path),
                                                ('n', 'main'),
                                                ('f', '.pdf;pdfa')])
                    message = ('Leaving previously delivered PDF/A for: '
                               + doi)
                    self.logger.info(message)
                except Exception:
                    # Best-effort reuse of an earlier PDF/A; narrowed from
                    # a bare except so KeyboardInterrupt etc. still raise.
                    pass
            try:
                if exists(join(path, 'main_a-2b.pdf')):
                    pdf_path = join(path, 'main_a-2b.pdf')
                    record_add_field(rec, 'FFT',
                                     subfields=[('a', pdf_path),
                                                ('n', 'main'),
                                                ('f', '.pdf;pdfa')])
                    self.logger.debug('Adding PDF/A to record: %s'
                                      % (doi, ))
                elif exists(join(path, 'main.pdf')):
                    pdf_path = join(path, 'main.pdf')
                    record_add_field(rec, 'FFT',
                                     subfields=[('a', pdf_path)])
                else:
                    if not old_pdf:
                        message = "Record " + doi
                        message += " doesn't contain PDF file."
                        self.logger.warning(message)
                        raise MissingFFTError(message)
            except MissingFFTError:
                message = "Elsevier paper: %s is missing PDF." % (doi, )
                register_exception(alert_admin=True, prefix=message)
        version = self.get_elsevier_version(find_package_name(path))
        record_add_field(rec, '583', subfields=[('l', version)])
        xml_path = join(path, 'main.xml')
        record_add_field(rec, 'FFT', subfields=[('a', xml_path)])
        record_add_field(rec, '980', subfields=[('a', 'SCOAP3'),
                                                ('b', 'Elsevier')])
    try:
        return record_xml_output(rec)
    except UnicodeDecodeError:
        message = "Found a bad char in the file for the article " + doi
        sys.stderr.write(message)
        return ""
def get_record(self, fileName, ref_extract_callback=None):
    """
    Gets the Marc xml of the files in xaml_jp directory

    :param fileName: the name of the file to parse.
    :type fileName: string
    :param refextract_callback: callback to be used to extract
                                unstructured references. It should
                                return a marcxml formated string
                                of the reference.
    :type refextract_callback: callable

    :returns: a string with the marc xml version of the file.
    """
    self.document = parse(fileName)
    # Only convert the article types we know how to handle.
    article_type = self._get_article_type()
    if article_type not in ['research-article',
                            'introduction',
                            'letter']:
        return ''
    rec = create_record()
    # 245: title (subfield b = subtitle, a = main title).
    title, subtitle, notes = self._get_title()
    subfields = []
    if subtitle:
        subfields.append(('b', subtitle))
    if title:
        subfields.append(('a', title))
    record_add_field(rec, '245', subfields=subfields)
    subjects = self.document.getElementsByTagName('kwd')
    subjects = map(xml_to_text, subjects)
    # 500: one general note per note id attached to the title.
    for note_id in notes:
        note = self._get_note(note_id)
        if note:
            record_add_field(rec, '500', subfields=[('a', note)])
    # 650: <kwd> elements recorded as EDPSciences subjects.
    for subject in subjects:
        record_add_field(rec, '650', ind1='1', ind2='7',
                         subfields=[('2', 'EDPSciences'),
                                    ('a', subject)])
    keywords = self._get_keywords()
    for keyword in keywords:
        record_add_field(rec, '653', ind1='1',
                         subfields=[('a', keyword), ('9', 'author')])
    journal, volume, issue, year, date, doi, page,\
        fpage, lpage = self._get_publication_information()
    # Tag records from astronomy journals with an INSPIRE subject.
    astronomy_journals = ['EAS Publ.Ser.', 'Astron.Astrophys.']
    if journal in astronomy_journals:
        record_add_field(rec, '650', ind1='1', ind2='7',
                         subfields=[('2', 'INSPIRE'),
                                    ('a', 'Astrophysics')])
    if date:
        record_add_field(rec, '260', subfields=[('c', date),
                                                ('t', 'published')])
    if doi:
        record_add_field(rec, '024', ind1='7', subfields=[('a', doi),
                                                          ('2', 'DOI')])
    abstract = self._get_abstract()
    abstract = self._format_abstract(abstract)
    if abstract:
        record_add_field(rec, '520', subfields=[('a', abstract),
                                                ('9', 'EDPSciences')])
    # 540: license name and/or URL, when present.
    license, license_type, license_url = self._get_license()
    subfields = []
    if license:
        subfields.append(('a', license))
    if license_url:
        subfields.append(('u', license_url))
    if subfields:
        record_add_field(rec, '540', subfields=subfields)
    # Fulltext is only attached for open-access articles.
    if license_type == 'open-access':
        self._attach_fulltext(rec, doi)
    number_of_pages = self._get_page_count()
    if number_of_pages:
        record_add_field(rec, '300', subfields=[('a', number_of_pages)])
    # 542: structured copyright (holder + year) preferred over statement.
    c_holder, c_year, c_statement = self._get_copyright()
    if c_holder and c_year:
        record_add_field(rec, '542', subfields=[('d', c_holder),
                                                ('g', c_year),
                                                ('e', 'Article')])
    elif c_statement:
        record_add_field(rec, '542', subfields=[('f', c_statement),
                                                ('e', 'Article')])
    # 773: journal reference; page range preferred over single page.
    subfields = []
    if journal:
        subfields.append(('p', journal))
    if issue:
        subfields.append(('n', issue))
    if volume:
        subfields.append(('v', volume))
    if fpage and lpage:
        subfields.append(('c', '%s-%s' % (fpage,
                                          lpage)))
    elif page:
        subfields.append(('c', page))
    if year:
        subfields.append(('y', year))
    record_add_field(rec, '773', subfields=subfields)
    record_add_field(rec, '980', subfields=[('a', 'HEP')])
    # If any <conference> element exists, mark as a conference paper and
    # keep the last conference name found as a note.
    conference = ''
    for tag in self.document.getElementsByTagName('conference'):
        conference = xml_to_text(tag)
    if conference:
        record_add_field(rec, '980',
                         subfields=[('a', 'ConferencePaper')])
        record_add_field(rec, '500',
                         subfields=[('a', conference)])
    self._add_references(rec, ref_extract_callback)
    self._add_authors(rec)
    try:
        return record_xml_output(rec)
    except UnicodeDecodeError:
        message = "Found a bad char in the file for the article " + doi
        sys.stderr.write(message)
        return ""
def get_keywords(self, xml):
    """Extract the text of all <Keyword> elements from the article DOM.

    :param xml: parsed DOM (document or element) of the article.
    :returns: list of keyword strings; an empty list when extraction
              fails (previously this path implicitly returned None,
              which crashed callers iterating the result).
    """
    try:
        return [xml_to_text(keyword) for keyword
                in xml.getElementsByTagName("Keyword")]
    except Exception as err:
        # Best-effort: report and fall back to an empty list so callers
        # such as ``for keyword in keywords:`` keep working.
        sys.stderr.write("Can't find keywords. %s\n" % (err,))
        return []