def senteniceFile(filenames, env):
    """Write publication dates and publication types for PubMed XML files.

    For every input file two tab-separated files are created, their names
    derived from the input's base name by replacing ``.xml.gz`` and prefixed
    with the module-level ``storagePath``:

    * ``<base>.date``    -- PMID followed by the elements of ``entry.pub_date``
    * ``<base>.pubtype`` -- one ``PMID <tab> publication type`` line per type

    Articles that fail to parse are reported on stderr (traceback, output
    file name and, if it can be read, the PMID) and skipped.

    Args:
        filenames: iterable of input file paths.
        env: not used by this function; kept so the signature stays
            compatible with its callers.
    """
    for filename in filenames:
        print(filename)

        basefile = os.path.basename(filename)
        datefile = basefile.replace(".xml.gz", ".date")
        typefile = basefile.replace(".xml.gz", ".pubtype")

        pmid2date = {}
        pmid2types = defaultdict(set)

        with open(storagePath + datefile, 'w') as outdate, \
                open(storagePath + typefile, "w") as outtype:

            pubmedParser = PubmedXMLParser()
            pubmedParser.parseXML(filename)

            for elem in PubmedArticleIterator(pubmedParser):
                try:
                    entry = PubmedEntry.fromXMLNode(elem)
                    if entry is None:
                        continue

                    pmid2date[entry.pmid] = entry.pub_date
                    for pubType in entry.pub_types:
                        pmid2types[entry.pmid].add(pubType)

                except Exception:
                    # A broken article must not abort the whole file, but
                    # Ctrl-C / SystemExit are no longer swallowed here.
                    traceback.print_exc()
                    eprint("Exception", datefile)

                    # Best-effort diagnostics: the PMID may be unavailable
                    # on the very node that just failed to parse.
                    try:
                        eprint(elem.find('MedlineCitation/PMID').text)
                    except Exception:
                        pass

            # Results are buffered per file and written once all articles
            # were seen, so a PMID occurring twice keeps its last date.
            for pmid, pubDate in pmid2date.items():
                print(pmid, "\t".join([str(part) for part in pubDate]),
                      sep="\t", file=outdate)

            for pmid, pubTypes in pmid2types.items():
                for pubType in pubTypes:
                    print(pmid, pubType, sep="\t", file=outtype)
def createNodeKeyConstraint(self, label, properties, nodeName='n'):
    """Placeholder for creating a NODE KEY constraint; currently a no-op.

    The method only emits a warning on stderr and returns ``None``.  The
    statement-building code that used to follow the early return could never
    execute and has been removed.  It was meant to run

        CREATE CONSTRAINT ON (<node><labels>) ASSERT (<node>.<prop>, ...) IS NODE KEY

    through ``runInDatabase``.  Note that the old code hard-coded the node
    variable ``n`` in the pattern while building the property list from
    ``nodeName``; any re-enabled version must use ``nodeName`` in both places.

    Args:
        label: label(s) the constraint would apply to (unused).
        properties: property names forming the node key (unused).
        nodeName: variable name for the node in the statement (unused).

    Returns:
        Always ``None``.
    """
    eprint("WILL ONLY WORK WITH ENTERPRISE EDITION!")
    return None
def parseXML(self, path):
    """Load an XML document and store it in ``self.tree``.

    ``path`` is first handed to ``etree.parse``.  If that fails, it is
    treated as XML content and handed to ``etree.fromstring``.  When both
    attempts fail, the second error is reported on stderr and re-raised.

    If ``path`` is a string path (or path-like object resolving to one)
    containing ``.nxml``, namespaces are stripped from the loaded tree via
    ``self.remove_namespace``.

    Args:
        path: location of an XML file, or the XML content itself.

    Returns:
        The object produced by ``etree.parse`` or ``etree.fromstring``;
        the same object is stored in ``self.tree``.

    Raises:
        Exception: whatever ``etree.fromstring`` raised when neither
            loading strategy worked.
    """
    import os

    self.tree = None

    try:
        self.tree = etree.parse(path)
    except Exception:
        # Not loadable as a file: fall back to interpreting it as XML text.
        try:
            self.tree = etree.fromstring(path)
        except Exception as e:
            eprint("Unable to load graph:", str(e))
            raise

    # Only textual paths can be checked for the extension.  Raw bytes
    # content or file objects used to make the bare `in` test raise
    # TypeError after the document had already been parsed successfully.
    try:
        sourceName = os.fspath(path)
    except TypeError:
        sourceName = None

    if isinstance(sourceName, str) and '.nxml' in sourceName:
        self.remove_namespace(self.tree)  # strip namespace for .nxml input

    return self.tree
def runInDatabase(self, query):
    """Execute *query* on the database session, with optional tracing.

    Every call increments ``self.dbQueries``; each 10000th call prints the
    running total.  With ``self.printQueries`` set, the query text is
    printed.  With ``self.simulateDB`` set, nothing is executed.

    Args:
        query: statement passed unchanged to ``self.session.run``.

    Returns:
        The value returned by ``self.session.run(query)``, or ``None`` in
        simulation mode.  On ``neo4je.ClientError`` the error is written to
        stderr and the process exits with status -1.
    """
    self.dbQueries += 1

    reachedMilestone = self.dbQueries % 10000 == 0
    if reachedMilestone:
        print(self.dbQueries)

    if self.printQueries:
        print(query)

    # Simulation mode: count and (optionally) log, but never hit the DB.
    if self.simulateDB:
        return None

    try:
        return self.session.run(query)
    except neo4je.ClientError as clientError:
        eprint(clientError)
        exit(-1)
def senteniceFile(filenames, env):
    """Split PubMed XML files into sentence, title, author and citation files.

    For every input file four tab-separated files are written; their names
    are derived from the input's base name by replacing ``.xml.gz`` and are
    prefixed with the module-level ``storagePath``:

    * ``.sent``     -- one line per item returned by
      ``entry.to_sentences(tokenizer)``
    * ``.title``    -- ``PMID <tab> title``
    * ``.author``   -- ``PMID <tab> first <tab> initials <tab> last``
    * ``.citation`` -- ``PMID <tab> cited id`` (integer ids only)

    Articles that raise while being processed are reported on stderr and
    skipped; sentences already written for such an article stay in the file.

    Args:
        filenames: iterable of input file paths.
        env: not used by this function; kept so the signature stays
            compatible with its callers.
    """
    for filename in filenames:
        print(filename)

        basefile = os.path.basename(filename)
        sentfile = basefile.replace(".xml.gz", ".sent")
        titlefile = basefile.replace(".xml.gz", ".title")
        authorfile = basefile.replace(".xml.gz", ".author")
        citationfile = basefile.replace(".xml.gz", ".citation")

        pmid2title = {}
        pmid2authors = defaultdict(set)
        pmid2citations = defaultdict(set)

        with open(storagePath + sentfile, 'w') as outfile:

            pubmedParser = PubmedXMLParser()
            pubmedParser.parseXML(filename)

            for elem in PubmedArticleIterator(pubmedParser):
                try:
                    entry = PubmedEntry.fromXMLNode(elem)
                    if entry is None:
                        continue

                    for sentence in entry.to_sentences(tokenizer):
                        outfile.write(sentence + "\n")

                    pmidID = entry.getID()

                    if entry.title is not None:
                        pmid2title[pmidID] = entry.title

                    if entry.authors:
                        for author in entry.authors:
                            # stored as (first, initials, last); the source
                            # tuple is indexed as [1], [2], [0]
                            pmid2authors[pmidID].add(
                                (author[1], author[2], author[0]))

                    if entry.cites:
                        for cite in entry.cites:
                            # only citations with an integer id are kept
                            try:
                                citedID = int(cite)
                            except (TypeError, ValueError):
                                continue
                            pmid2citations[pmidID].add(citedID)

                except Exception:
                    # A broken article must not abort the whole file, but
                    # Ctrl-C / SystemExit are no longer swallowed here.
                    eprint("Exception", sentfile)

                    # Best-effort diagnostics: the PMID may be unavailable
                    # on the very node that just failed.
                    try:
                        eprint(elem.find('MedlineCitation/PMID').text)
                    except Exception:
                        pass

        with open(storagePath + titlefile, 'w') as outfile:
            print(titlefile)

            for pmid, title in pmid2title.items():
                if not title:
                    continue
                outfile.write(str(pmid) + "\t" + str(title) + "\n")

        with open(storagePath + authorfile, 'w') as outfile:
            print(authorfile)

            for pmid, authors in pmid2authors.items():
                for author in authors:
                    # missing name parts become empty columns
                    first = author[0] if author[0] is not None else ''
                    initials = author[1] if author[1] is not None else ''
                    last = author[2] if author[2] is not None else ''

                    outfile.write(
                        str(pmid) + "\t" +
                        "\t".join([first, initials, last]) + "\n")

        with open(storagePath + citationfile, 'w') as outfile:
            print(citationfile)

            for pmid, citations in pmid2citations.items():
                for quote in citations:
                    outfile.write(str(pmid) + "\t" + str(quote) + "\n")
] allfileIDs = sorted(allfileIDs, reverse=True) addUnknownPubmeds = False retVal = db.matchNodes(['PUBMED'], None, nodename='n') relevantPMIDs = set() for x in retVal: nodeData = x['n'] if 'id' in nodeData.properties: pmid = nodeData.properties['id'] relevantPMIDs.add(pmid) else: eprint("No data in: ", str(nodeData)) if len(relevantPMIDs) == 0: eprint("No RELEVANT PUBMED entries found") def analyseFile(splitFileID, relPMIDs): fileID = "{:>4}".format(splitFileID).replace(" ", "0") diseaseHitsFile = resultBase + "/disease/medline17n" + fileID + ".index" hitsFile = SyngrepHitFile(diseaseHitsFile, diseaseMap) if len(hitsFile) == 0: return
citedByFile = dataDir + "/miRExplore/pubmed_citedby.tsv" if createCitationLists: retVal = db.matchNodes(['PUBMED'], None, nodename='n') print("Query finished") pmids = set() for x in retVal: nodeData = x['n'] if 'id' in nodeData.properties: pmid = nodeData.properties['id'] pmids.add(pmid) else: eprint("No data in: ", str(nodeData)) print(len(pmids)) store = CoCitationStore() foundCitations = store.getCites(pmids) foundCitedBy = store.getCitedBy(pmids) with open(citationFile, 'w') as outfile: for pmid in foundCitations: outfile.write( str(pmid) + "\t" + str(",".join([str(x) for x in foundCitations[pmid]])) + "\n") with open(citedByFile, 'w') as outfile: for pmid in foundCitedBy:
# Create one CELLLINE node per entry of id2node and attach it to every
# species listed for it in id2species.
# NOTE: the loop variable `id` shadows the builtin; the name is kept because
# code outside this excerpt may still read it after the loops.
for id in id2node:
    node = id2node[id]
    db.createNodeIfNotExists(['CELLLINE'], node)

    allSpecies = id2species.get(id, set())
    # 'unique' marks cell lines that belong to exactly one species
    cellLineUnique = len(allSpecies) == 1

    for species in allSpecies:
        # Validate the tax id separately from the DB call, so that database
        # failures are no longer misreported as an invalid tax id.
        try:
            taxID = int(species)
        except (TypeError, ValueError):
            eprint(str(species) + " is not a valid tax id in database")
            continue

        try:
            db.createRelationship('tax', ['TAX'], {'id': taxID},
                                  'cell', ['CELLLINE'], node,
                                  ['HAS_CELLLINE'],
                                  {'unique': cellLineUnique})
        except Exception as err:
            # stay best-effort as before, but say what actually went wrong
            eprint("Could not link tax id " + str(taxID) +
                   " to cell line: " + str(err))
            continue

# Link each cell line to the cell lines it was derived from; derivatives
# without a node in id2node are reported and skipped.
for id in id2derived_from:
    allDerivatives = id2derived_from[id]

    for deriv in allDerivatives:
        if deriv not in id2node:
            eprint("Not in id2node: " + str(deriv))
            continue

        # NOTE(review): the relationship type is spelled 'CELLINE_...' (two
        # L's); left unchanged because existing data/queries may rely on it.
        db.createRelationship('id', ['CELLLINE'], {'id': id},
                              'other', ['CELLLINE'], {'id': deriv},
                              ['CELLINE_DERIVED_FROM'], None)
def _siblingOutputName(basefile, suffix):
    """Return *basefile* with ``.xml`` replaced by *suffix*.

    If the name contains no ``.xml`` the suffix is appended instead, so the
    result can never be identical to the input file's own name.
    """
    if ".xml" in basefile:
        return basefile.replace(".xml", suffix)
    return basefile + suffix


def senteniceFile(filenames, env):
    """Split single-article XML files into per-aspect tab-separated files.

    Each input file is parsed as one document (``pubmedParser.tree``) and the
    following files are written into the input file's own directory:

    * ``.sent``     -- one line per item of ``entry.to_sentences(tokenizer)``
    * ``.date``     -- ``ID <tab>`` elements of ``entry.created``
    * ``.pubtype``  -- ``ID <tab> publication type``
    * ``.pmid``     -- ``ID <tab> entry.pmc``
    * ``.title``    -- ``ID <tab> title``
    * ``.author``   -- ``ID <tab> first <tab> initials <tab> last``
    * ``.citation`` -- ``ID <tab> cited id`` (integer ids only)

    Output names are built by replacing ``.xml`` in the base name.  Inputs
    whose name has no ``.xml`` (e.g. ``.nxml``) get the suffix appended;
    previously every output path collapsed onto the input file, which was
    then truncated before it could be parsed.  The directory is joined with
    ``os.path.join``, so a bare file name writes next to the input in the
    working directory instead of into ``/``.

    Any error while processing a document is reported on stderr and
    terminates the process with exit status -1.

    Args:
        filenames: iterable of input file paths.
        env: not used by this function; kept so the signature stays
            compatible with its callers.
    """
    for filename in filenames:
        print(filename)

        storageDir = os.path.dirname(filename)
        basefile = os.path.basename(filename)

        sentfile = _siblingOutputName(basefile, ".sent")
        titlefile = _siblingOutputName(basefile, ".title")
        authorfile = _siblingOutputName(basefile, ".author")
        citationfile = _siblingOutputName(basefile, ".citation")
        datefile = _siblingOutputName(basefile, ".date")
        typefile = _siblingOutputName(basefile, ".pubtype")
        pmidfile = _siblingOutputName(basefile, ".pmid")

        pmid2title = {}
        pmid2authors = defaultdict(set)
        pmid2citations = defaultdict(set)

        with open(os.path.join(storageDir, sentfile), 'w') as outfile, \
                open(os.path.join(storageDir, datefile), 'w') as outdate, \
                open(os.path.join(storageDir, typefile), "w") as outtype, \
                open(os.path.join(storageDir, pmidfile), "w") as outpmid:

            pubmedParser = PubmedXMLParser()
            pubmedParser.parseXML(filename)

            # the whole parsed tree is treated as a single article node
            for elem in [pubmedParser.tree]:
                try:
                    entry = PubmedEntry.fromXMLNode(elem)
                    if entry is None:
                        continue

                    for sentence in entry.to_sentences(tokenizer):
                        outfile.write(sentence + "\n")

                    pmidID = entry.getID()

                    if entry.created is not None:
                        print(pmidID,
                              "\t".join([str(part) for part in entry.created]),
                              sep="\t", file=outdate)

                    if entry.pub_types is not None:
                        for pubType in entry.pub_types:
                            print(pmidID, pubType, sep="\t", file=outtype)

                    if entry.pmc is not None:
                        print(pmidID, entry.pmc, sep="\t", file=outpmid)

                    if entry.title is not None:
                        pmid2title[pmidID] = entry.title

                    if entry.authors:
                        for author in entry.authors:
                            # stored as (first, initials, last); the source
                            # tuple is indexed as [1], [2], [0]
                            pmid2authors[pmidID].add(
                                (author[1], author[2], author[0]))

                    if entry.cites:
                        for cite in entry.cites:
                            # only citations with an integer id are kept
                            try:
                                citedID = int(cite)
                            except (TypeError, ValueError):
                                continue
                            pmid2citations[pmidID].add(citedID)

                except Exception as err:
                    # Fail fast as before, but include the cause.  The
                    # statements that used to follow the exit call were
                    # unreachable and have been dropped.
                    eprint("Exception", sentfile, repr(err))
                    raise SystemExit(-1)

        with open(os.path.join(storageDir, titlefile), 'w') as outfile:
            print(titlefile)

            for pmid, title in pmid2title.items():
                if not title:
                    continue
                outfile.write(str(pmid) + "\t" + str(title) + "\n")

        with open(os.path.join(storageDir, authorfile), 'w') as outfile:
            print(authorfile)

            for pmid, authors in pmid2authors.items():
                for author in authors:
                    # missing name parts become empty columns
                    first = author[0] if author[0] is not None else ''
                    initials = author[1] if author[1] is not None else ''
                    last = author[2] if author[2] is not None else ''

                    outfile.write(
                        str(pmid) + "\t" +
                        "\t".join([first, initials, last]) + "\n")

        with open(os.path.join(storageDir, citationfile), 'w') as outfile:
            print(citationfile)

            for pmid, citations in pmid2citations.items():
                for quote in citations:
                    outfile.write(str(pmid) + "\t" + str(quote) + "\n")