def loadJATSReference(self, ref, doc):
    """
    Load a reference from the bibliography section.

    A new reference is obtained from ``doc.addReference()`` and filled with
    ``authors``, ``surnames``, ``external_links`` and ``title``. When a
    citation element (``element-citation``, ``mixed-citation`` or
    ``citation``) is found, ``publication-name``, ``publication-type`` and,
    where present, ``doi``/``pmid``/``pmcid`` are set as well; otherwise the
    raw markup is stored under ``xml`` and the reference is returned as is.

    :param ref: xml node for reference element
    :param doc: :class `SciDoc <SciDoc>` instance we're loading this for
    :returns: dict with the new loaded reference
    :rtype: dict
    """
    xmltext = repr(ref)
    authorlist = []
    surnames = []

    # A missing id attribute means "no original id" rather than aborting
    # the load of the whole reference.
    try:
        original_id = ref["id"]
    except KeyError:
        original_id = None

    # The attribute holding the citation type is "citation-type" on
    # <citation> elements and "publication-type" on the other two.
    citation_type_key = "publication-type"
    element = ref.find("element-citation")
    if not element:
        element = ref.find("mixed-citation")
    if not element:
        element = ref.find("citation")
        if element:
            citation_type_key = "citation-type"

    author_group = ref.find("person-group", {"person-group-type": "author"})
    collab = None
    if not author_group:
        collab = ref.find("collab")
        if not collab:
            # No author group and no collaboration: search the whole
            # reference for <name> elements instead.
            author_group = ref

    if author_group:
        authors = author_group.findAll("name")
    else:
        authors = None
        if collab:
            # The collaboration is recorded exactly once, with its full
            # name as the family name so it agrees with ``surnames``.
            authorlist.append({"family": collab.text, "given": ""})
            surnames.append(collab.text)

    if authors:
        for name_node in authors:
            surname = name_node.find("surname")
            if surname:
                surnames.append(surname.text)
            given_names = name_node.find("given-names")
            if given_names and surname:
                authorlist.append(
                    {"given": given_names.text, "family": surname.text}
                )
            else:
                # Incomplete markup: fall back to guessing the name parts
                # from the plain text of the node.
                plain_name = cleanxml(repr(name_node))
                authorlist.append(guessNamesOfPlainTextAuthor(plain_name))
    else:
        surnames.extend(node.text for node in ref.findAll("surname"))

    newref = doc.addReference()
    newref["authors"] = authorlist
    newref["surnames"] = surnames
    newref["external_links"] = []
    newref["title"] = "<NO TITLE>"
    if not element:
        # Nothing structured to read from; keep the raw markup.
        newref["xml"] = xmltext
        return newref

    article_title = ref.find("article-title")
    source = element.find("source")
    newref["publication-name"] = source.text if source else ""

    try:
        newref["publication-type"] = element[citation_type_key]
    except KeyError:
        newref["publication-type"] = "unknown"

    publication_type = newref["publication-type"]
    if publication_type == "book":
        # For books the <source> is the title; the article title is only
        # a fallback.
        if source:
            newref["title"] = source.text
        elif article_title:
            newref["title"] = article_title.text
    elif publication_type == "journal":
        if article_title:
            newref["title"] = article_title.text
    elif publication_type == "other":
        if article_title:
            newref["title"] = article_title.text
        elif source:
            newref["title"] = source.text
        # NOTE(review): patent extraction is assumed to apply only to
        # "other"-type references - confirm against the intended layout.
        self.extractInfoFromPatentText(repr(ref), newref)

    self.loadJATSmetadataIfExists(
        element, ["volume", "issue", "fpage", "lpage", "year"], newref
    )

    # pub-id-type attribute value -> key under which the id is stored.
    for pub_id_type, target_key in (
        ("doi", "doi"),
        ("pmid", "pmid"),
        ("pmc", "pmcid"),
    ):
        pub_id = element.find("pub-id", {"pub-id-type": pub_id_type})
        if pub_id:
            newref[target_key] = pub_id.text

    if newref["title"] == "":
        newref["title"] = "<NO TITLE FOUND>"

    for extlink in element.findAll("ext-link", {"ext-link-type": "uri"}):
        try:
            href = extlink["xlink:href"]
        except KeyError:
            # A link without a target carries nothing worth recording.
            continue
        newref["external_links"].append(href)

    if original_id and self.USE_ORIGINAL_REF_ID:
        newref["id"] = original_id
    return newref
def loadParsCitReference(self, reference):
    """
    Collect the metadata of one ParsCit XML <citation> node into a dict.

    Each output field is filled with the text of its XML tag (with leading
    and trailing dots stripped), or "" when the tag is absent. The title is
    then repaired heuristically: it is borrowed from journal/location when
    missing, and trailing "In ..." / thesis descriptions are moved from the
    title into the journal field.

    Args:
        reference: XML node

    Returns:
        metadata of reference
    """
    # (key in the returned dict, XML tag whose text fills it)
    field_tags = (
        ("title", "title"),
        ("year", "date"),
        ("volume", "volume"),
        ("pages", "pages"),
        ("journal", "journal"),
        ("publisher", "publisher"),
        ("location", "location"),
        ("raw_string", "rawstring"),
        ("institution", "institution"),
    )

    def stripped_text(tag):
        found = reference.find(tag)
        return found.text.strip(".") if found else ""

    metadata = {field: stripped_text(tag) for field, tag in field_tags}

    # The parser often files the title under another field; take the first
    # usable one and blank it out at its old place.
    if len(metadata["title"]) < 2:
        for fallback in ("journal", "location"):
            if len(metadata[fallback]) > 2:
                metadata["title"] = metadata[fallback]
                metadata[fallback] = ""
                break

    # Manual fix-ups for text that belongs to the venue but was left in the
    # title: "In Proceedings of ..." first, then thesis/dissertation notes.
    # Each entry is (pattern, index of the group to move to the journal).
    relocations = (
        (re.compile(r"([,.] ?In[\:]? (\w.*))"), 2),
        (
            re.compile(
                r", ((?:(?:Doctoral|MSc)? ?[Tt]hesis|(?:\w+|([A-Z]\. ?)+)[Dd]issertation).*)",
                flags=re.IGNORECASE,
            ),
            1,
        ),
    )
    for pattern, group_index in relocations:
        hit = pattern.search(metadata["title"])
        if hit:
            metadata["journal"] = hit.group(group_index) + metadata["journal"]
            metadata["title"] = pattern.sub("", metadata["title"]).strip(" ,")

    # Drop a dangling ". In" left at the very end of the title.
    metadata["title"] = re.sub(r"[.,;] ?In ?$", "", metadata["title"])

    # TODO: expand this to inproceedings, etc.
    for atype in ("journal", "booktitle"):
        venue = reference.find(atype)
        if venue:
            metadata["publication"] = venue.text.strip(".,: ")
            metadata["type"] = atype

    author_texts = [node.text for node in reference.findAll("author")]
    metadata["authors"] = [
        guessNamesOfPlainTextAuthor(author_text) for author_text in author_texts
    ]
    metadata["surnames"] = [parsed["family"] for parsed in metadata["authors"]]
    return metadata