def analyseFile(splitFileIDs, env):
    """Collect co-occurrences of two entity types for a batch of split files.

    For every split file the index files of ``args.folder1`` and
    ``args.folder2`` are loaded; the file is skipped as soon as one of them
    has no hits. For each document present in both hit files
    ``findCooccurrences`` is called and its result is accumulated. The
    accumulated list is handed to ``printStuff`` and two progress lines are
    written to stderr.

    :param splitFileIDs: iterable of split file IDs (concatenated into paths)
    :param env: not used by this function
    :return: always ``None``
    """
    allCoocs = []

    for splitFileID in splitFileIDs:
        firstIndex = resultBase + "/" + args.folder1 + "/" + splitFileID + ".index"
        secondIndex = resultBase + "/" + args.folder2 + "/" + splitFileID + ".index"
        relationIndex = resultBase + "/relations/" + splitFileID + ".index"
        sentencePath = args.sentdir + "/" + splitFileID + ".sent"

        firstHits = SyngrepHitFile(firstIndex, ent1Syns)
        if len(firstHits) == 0:
            continue

        secondHits = SyngrepHitFile(secondIndex, ent2Syns)
        if len(secondHits) == 0:
            continue

        relationHits = SyngrepHitFile(relationIndex, relSyns)

        # The sentence database is only opened once a document with hits for
        # both entity types is actually encountered.
        sentences = None
        sys.stderr.write("Found something in: " + str(splitFileID) + "\n")

        for docID in firstHits:
            if docID not in secondHits:
                continue

            if sentences is None:
                sentences = SentenceDB(sentencePath)

            firstDocHits = firstHits.getHitsForDocument(docID)
            secondDocHits = secondHits.getHitsForDocument(docID)

            allCoocs.extend(
                findCooccurrences(str(docID), firstDocHits, secondDocHits,
                                  sentences, relationHits))

    sys.stderr.write("Found {cnt} elems in files {ids}\n".format(
        cnt=str(len(allCoocs)), ids=str(splitFileIDs)))

    printed = printStuff(None, allCoocs, None)

    sys.stderr.write(
        "{procID}: Found {cnt} (printed: {printed}) elems in files {ids}\n".format(
            cnt=str(len(allCoocs)),
            ids=str(splitFileIDs),
            printed=printed,
            procID=str(os.getpid())))

    return None
def analyseFile(splitFileIDs, env):
    """Collect PubMed IDs per subject for documents that mention neutrophils.

    For every split file the anatomy (FMA), disease (DOID) and neutrophil/GO
    index files are loaded. Each document whose FMA hits pass
    ``testMentioned(..., neutrophilSynIDs)`` is recorded under
    ``'NEUTROPHIL'`` and additionally under ``'TISSUES'``, ``'DOID'`` and
    ``'INFLAMM'`` when the corresponding ``testMentioned`` check succeeds.

    A split file without FMA hits is skipped. The original code used a bare
    ``return`` at that point, which aborted the whole batch and discarded
    everything collected so far.

    :param splitFileIDs: iterable of split file numbers/IDs; each is
        right-aligned to width 4 and padded with zeros
    :param env: not used by this function
    :return: ``defaultdict(set)`` mapping subject name to a set of integer
        document IDs
    """
    subject2pmids = defaultdict(set)

    for splitFileID in splitFileIDs:
        fileID = "{:>4}".format(splitFileID).replace(" ", "0")

        fmaFile = resultBase + "/model_anatomy/pubmed18n" + fileID + ".index"
        doidFile = resultBase + "/disease/pubmed18n" + fileID + ".index"
        goFile = resultBase + "/neutrophils/pubmed18n" + fileID + ".index"

        fmaHits = SyngrepHitFile(fmaFile, fmaSyns)
        if len(fmaHits) == 0:
            # Skip only this file; keep the results of the other files.
            continue

        print("Processing file: ", fileID)

        doidHits = SyngrepHitFile(doidFile, doidSyns)
        goHits = SyngrepHitFile(goFile, goSyns)

        # The sentence database was loaded here before but never used, so the
        # load has been dropped.

        for docID in fmaHits:
            docFMAHits = fmaHits.getHitsForDocument(docID)

            # No tissue hits at all for this document.
            if len(docFMAHits) == 0:
                continue

            # Only documents mentioning neutrophils are of interest.
            if not testMentioned(docFMAHits, neutrophilSynIDs):
                continue

            pmid = int(docID)
            subject2pmids['NEUTROPHIL'].add(pmid)

            if testMentioned(docFMAHits, tissueIDs):
                subject2pmids['TISSUES'].add(pmid)

            # Disease and GO hits are only fetched for documents that passed
            # the neutrophil filter.
            if testMentioned(doidHits.getHitsForDocument(docID), doidSynIDs):
                subject2pmids['DOID'].add(pmid)

            if testMentioned(goHits.getHitsForDocument(docID), goSynIDs):
                subject2pmids['INFLAMM'].add(pmid)

    return subject2pmids
def analyseFile(splitFileIDs, env):
    """Collect gene/miRNA co-occurrences for a batch of split files.

    For every split file the miRNA and HGNC index files are loaded; the file
    is skipped as soon as one of them has no hits. For each document present
    in both hit files ``findCooccurrences`` is called (gene hits first, miRNA
    hits second) and its result is accumulated. The accumulated list is
    handed to ``printStuff`` and two progress lines are written to stderr.

    :param splitFileIDs: iterable of split file IDs (concatenated into paths)
    :param env: not used by this function
    :return: always ``None``
    """
    allCoocs = []

    for splitFileID in splitFileIDs:
        genePath = resultBase + "/hgnc/" + splitFileID + ".index"
        mirnaPath = resultBase + "/mirna/" + splitFileID + ".index"
        relationPath = resultBase + "/relations/" + splitFileID + ".index"
        sentencePath = "/mnt/c/dev/data/pmc/allsent/" + splitFileID + ".sent"

        mirnaHits = SyngrepHitFile(mirnaPath, mirnaSyns)
        if len(mirnaHits) == 0:
            continue

        geneHits = SyngrepHitFile(genePath, hgncSyns)
        if len(geneHits) == 0:
            continue

        relationHits = SyngrepHitFile(relationPath, relSyns)
        sentences = SentenceDB(sentencePath)

        sys.stderr.write("Found something in: " + str(splitFileID) + "\n")

        for docID in mirnaHits:
            if docID not in geneHits:
                continue

            mirnaDocHits = mirnaHits.getHitsForDocument(docID)
            geneDocHits = geneHits.getHitsForDocument(docID)

            allCoocs.extend(
                findCooccurrences(str(docID), geneDocHits, mirnaDocHits,
                                  sentences, relationHits))

    sys.stderr.write("Found {cnt} elems in files {ids}\n".format(
        cnt=str(len(allCoocs)), ids=str(splitFileIDs)))

    printed = printStuff(None, allCoocs, None)

    sys.stderr.write(
        "Found {cnt} (printed: {printed}) elems in files {ids}\n".format(
            cnt=str(len(allCoocs)),
            ids=str(splitFileIDs),
            printed=printed))

    return None
def analyseFile(splitFileID, relPMIDs):
    """Store disease mentions of one split file as relationships in the graph DB.

    Loads the disease index of the given split file. For every document that
    is contained in ``relPMIDs``, the IDs of the accepted synonym hits are
    collected (hits whose matched text is shorter than 5 characters are only
    accepted when ``hit.perfectHit`` is truthy). For each collected ID a
    ``DISEASE_MENTION`` relationship to the PUBMED node is created, provided
    that node exists or ``addUnknownPubmeds`` allows creating it.

    Changes from the previous version:

    * The result of ``createRelationship`` was printed via
      ``[x for x in res if res != None]``, which iterates ``res`` before the
      ``None`` test and therefore raises ``TypeError`` when ``res`` is
      ``None``. A ``None`` result is now printed as an empty list.
    * The DB interface is closed in a ``finally`` block so it is released
      even when processing raises.

    :param splitFileID: split file number/ID; right-aligned to width 4 and
        padded with zeros
    :param relPMIDs: container of document IDs to process (``in`` is used)
    :return: ``None``
    """
    fileID = "{:>4}".format(splitFileID).replace(" ", "0")
    diseaseHitsFile = resultBase + "/disease/medline17n" + fileID + ".index"

    hitsFile = SyngrepHitFile(diseaseHitsFile, diseaseMap)
    if len(hitsFile) == 0:
        return

    print("Document: " + str(fileID))
    print("Start Document: " + str(fileID))

    procDB = neo4jInterface(simulate=False, printQueries=False)

    try:
        for docID in hitsFile:
            if docID not in relPMIDs:
                continue

            synHits = hitsFile.getHitsForDocument(docID)

            foundUniqueHits = set()
            for hit in synHits:
                # Short matches are only trusted when they are perfect hits.
                if len(hit.foundSyn) < 5 and not hit.perfectHit:
                    continue
                foundUniqueHits.add(hit.synonym.id.replace('_', ':'))

            for synonymID in foundUniqueHits:
                if addUnknownPubmeds:
                    procDB.createNodeIfNotExists(['EVIDENCE', 'PUBMED'],
                                                 {'id': docID})
                    pubmedExists = True
                else:
                    pubmedExists = bool(
                        procDB.nodeExists(['PUBMED'], {'id': docID}))

                if not pubmedExists:
                    continue

                res = procDB.createRelationship(
                    'disease', ['DISEASE'], {'id': synonymID},
                    'pubmed', ['PUBMED'], {'id': docID},
                    ['DISEASE_MENTION'], None)

                print("Add: ", fileID, docID, synonymID,
                      [] if res is None else list(res))

        print("End Document: " + str(fileID))
    finally:
        procDB.close()
def analyseFile(splitFileIDs, env):
    """Collect the ontology terms mentioned per document for a batch of files.

    For every split file with disease hits, the synonym IDs found in each
    document are gathered (first ``_`` replaced by ``:``). IDs that appear in
    the ``getAllChildren(maxLevel=2)`` result of any gathered term are
    dropped. For each remaining ID a ``(docID, term.id, term.name)`` tuple is
    recorded. The recorded tuples are handed to ``printStuff`` and two
    progress lines are written to stderr.

    :param splitFileIDs: iterable of split file IDs (concatenated into paths)
    :param env: not used by this function
    :return: always ``None``
    """
    records = []

    for splitFileID in splitFileIDs:
        indexPath = resultBase + "/disease/" + splitFileID + ".index"
        hitFile = SyngrepHitFile(indexPath, diseaseSyns)

        if len(hitFile) == 0:
            continue

        sentencePath = "/mnt/c/dev/data/pmc/allsent/" + splitFileID + ".sent"
        # Loaded as in the original flow; not read below.
        sentDB = SentenceDB(sentencePath)

        sys.stderr.write("Found something in: " + str(splitFileID) + "\n")

        for docID in hitFile:
            mentionedIDs = {
                hit.synonym.id.replace('_', ':', 1)
                for hit in hitFile.getHitsForDocument(docID)
            }

            # Everything returned as a child (up to two levels) of a
            # mentioned term is excluded.
            excluded = set()
            for termID in mentionedIDs:
                excluded.update(
                    celloObo.getID(termID).getAllChildren(maxLevel=2))

            for termID in mentionedIDs:
                if termID in excluded:
                    continue
                term = celloObo.getID(termID)
                records.append((docID, term.id, term.name))

    sys.stderr.write("Found {cnt} elems in files {ids}\n".format(
        cnt=str(len(records)), ids=str(splitFileIDs)))

    printed = printStuff(None, records, None)

    sys.stderr.write(
        "Found {cnt} (printed: {printed}) elems in files {ids}\n".format(
            cnt=str(len(records)),
            ids=str(splitFileIDs),
            printed=printed))

    return None
# Script section: count how often each matched text (``hit.hitSyn``) occurs in
# the index files of one result folder and print the 100 most common ones.

# Base folder of the text-mining results, below ``dataDir``.
resultBase = dataDir + "/miRExplore/textmine/results/"

# Matched text -> number of hits, accumulated over all inspected index files.
indexFoundSyns = Counter()

# Loaded here, but only referenced by the commented-out filter further down.
excludedSyns = loadExludeWords()

# Name of the result sub-folder to inspect.
checkResultsFor = 'disease'

# Files are visited from number ``maxFiles`` downwards.
analyseFiles = 100
maxFiles = 892

checkSynsMap = SynfileMap(resultBase + "/" + checkResultsFor + "/synfile.map")
checkSynsMap.loadSynFiles(('/home/users/joppich/ownCloud/data/', dataDir))

# NOTE(review): this range runs from maxFiles down to maxFiles - analyseFiles
# inclusive, i.e. analyseFiles + 1 files - confirm that is intended.
for splitFileID in range(maxFiles, maxFiles - analyseFiles - 1, -1):

    # Right-align the file number to width 4 and pad with zeros.
    fileID = "{:>4}".format(splitFileID).replace(" ", "0")
    print(fileID)

    indexFile = resultBase + "/" + checkResultsFor + "/medline17n" + fileID + ".index"
    foundHits = SyngrepHitFile(indexFile, checkSynsMap)

    for doc in foundHits:
        docHits = foundHits.getHitsForDocument(doc)

        for hit in docHits:
            indexFoundSyns[hit.hitSyn] += 1

# Report the 100 most frequent matched texts with their counts.
for (syn, cnt) in indexFoundSyns.most_common(100):
    #if syn in excludedSyns:
    print(str(syn) + " -> " + str(cnt))
def analyseFile(splitFileIDs, env):
    """Collect the anatomy terms mentioned per document, with hit locations.

    For every split file with hits in the ``model_anatomy`` index, the
    synonym IDs found in each document are gathered together with their
    ``(documentID, start, end)`` locations; hits whose matched text contains
    ``"and "`` are ignored. IDs for which ``getTerm`` returns ``None`` and IDs
    that appear in the ``getAllChildren(maxLevel=2)`` result of any gathered
    term are dropped. For each remaining ID a
    ``(docID, term.id, term.name, locations)`` tuple is recorded. The
    recorded tuples are handed to ``printStuff`` and two progress lines are
    written to stderr.

    :param splitFileIDs: iterable of split file IDs (concatenated into paths)
    :param env: not used by this function
    :return: always ``None``
    """
    records = []

    for splitFileID in splitFileIDs:
        indexPath = resultBase + "/model_anatomy/" + splitFileID + ".index"
        hitFile = SyngrepHitFile(indexPath, diseaseSyns)

        if len(hitFile) == 0:
            continue

        sentencePath = args.sentdir + "/" + splitFileID + ".sent"
        # Loaded as in the original flow; not read below.
        sentDB = SentenceDB(sentencePath)

        sys.stderr.write("Found something in: " + str(splitFileID) + "\n")

        for docID in hitFile:
            locationsByID = defaultdict(list)
            mentionedIDs = set()

            for hit in hitFile.getHitsForDocument(docID):
                if "and " in hit.foundSyn:
                    continue

                termID = hit.synonym.id
                mentionedIDs.add(termID)
                locationsByID[termID].append(
                    (str(hit.documentID), hit.position[0], hit.position[1]))

            excluded = set()
            for termID in mentionedIDs:
                term = getTerm(termID, fmaObo)

                if term is None:
                    sys.stderr.write("Invalid synID: " + termID)
                    excluded.add(termID)
                else:
                    # Everything returned as a child (up to two levels) of a
                    # mentioned term is excluded.
                    excluded.update(term.getAllChildren(maxLevel=2))

            for termID in mentionedIDs:
                if termID in excluded:
                    continue
                term = getTerm(termID, fmaObo)
                records.append(
                    (docID, term.id, term.name, locationsByID[termID]))

    sys.stderr.write("Found {cnt} elems in files {ids}\n".format(
        cnt=str(len(records)), ids=str(splitFileIDs)))

    printed = printStuff(None, records, None)

    sys.stderr.write(
        "Found {cnt} (printed: {printed}) elems in files {ids}\n".format(
            cnt=str(len(records)),
            ids=str(splitFileIDs),
            printed=printed))

    return None
def analyseFile(splitFileIDs, env):
    """Find gene/miRNA co-occurrences and store them as mention edges.

    For every split file the miRNA and HGNC index files are loaded; the file
    is skipped as soon as one of them has no hits. For each document present
    in both hit files ``findCooccurrences`` is called. Its results are
    grouped by gene and by miRNA id type, and, when the module-level ``db`` is
    truthy, a PUBMED evidence node plus ``ST_MENTION`` relationships for
    genes and miRNAs are written. All co-occurrences are finally handed to
    ``printStuff`` and two progress lines are written to stderr.

    :param splitFileIDs: iterable of split file numbers/IDs; each is
        right-aligned to width 4 and padded with zeros
    :param env: not used by this function
    :return: always ``None``
    """
    fileCoocs = []

    for splitFileID in splitFileIDs:

        # Right-align the file number to width 4 and pad with zeros.
        fileID = "{:>4}".format(splitFileID).replace(" ", "0")

        hgncFile = resultBase + "/hgnc/pubmed18n" + fileID + ".index"
        mirnaFile = resultBase + "/mirna/pubmed18n" + fileID + ".index"
        sentFile = "/mnt/c/dev/data/pubmed/pubmed18n" + fileID + ".sent"

        mirnaHits = SyngrepHitFile(mirnaFile, mirnaSyns)
        if len(mirnaHits) == 0:
            continue

        hgncHits = SyngrepHitFile(hgncFile, hgncSyns)
        if len(hgncHits) == 0:
            continue

        sentDB = SentenceDB(sentFile)

        sys.stderr.write("Found something in: " + str(fileID) + "\n")

        for docID in mirnaHits:

            # Only documents with both miRNA and gene hits are considered.
            if docID in hgncHits:
                mirnaSynHits = mirnaHits.getHitsForDocument(docID)
                hgncSynHits = hgncHits.getHitsForDocument(docID)

                #if docID == 'a27229723':
                #    [print(x.synonyme) for x in hgncSynHits]
                #    [print(x.synonyme) for x in mirnaSynHits]

                foundCoocs = findCooccurrences(str(docID), hgncSynHits, mirnaSynHits, sentDB)

                fileCoocs += foundCoocs

                # gene ID -> set of (geneID, 'GENE', mirnaID, mirna id type)
                assocByGene = defaultdict(set)

                for x in foundCoocs:
                    geneID = x.gene
                    geneLabel = 'GENE'
                    mirnaID = x.mirna
                    mirnaLabel = x.idtype

                    assoc = (geneID, geneLabel, mirnaID, mirnaLabel)
                    assocByGene[assoc[0]].add(assoc)

                # Assigned but not read anywhere in this function.
                addDocAsEvidence = False

                # gene ID -> (mimatSet, miSet, orgmirSet, familySet)
                assocByTypeForGene = {}

                for gene in assocByGene:
                    assocs = assocByGene[gene]

                    mimatSet = set()
                    miSet = set()
                    orgmirSet = set()
                    familySet = set()

                    # Sort the gene's associations by the miRNA id type.
                    for assoc in assocs:
                        if assoc[3] == 'MIRNA':
                            mimatSet.add(assoc)
                        elif assoc[3] == 'MIRNA_PRE':
                            miSet.add(assoc)
                        elif assoc[3] == 'MIRNA_ORGMIR':
                            orgmirSet.add(assoc)
                        elif assoc[3] == 'MIRNA_FAMILY':
                            familySet.add(assoc)
                        else:
                            print("Unknown relation in doc: " + docID)
                            print(assoc)

                    # Genes without any association of a known type are
                    # left out.
                    if len(mimatSet) > 0 or len(miSet) > 0 or len(
                            orgmirSet) > 0 or len(familySet) > 0:
                        assocByTypeForGene[gene] = (mimatSet, miSet, orgmirSet,
                                                    familySet)

                #filter assocs here such that if a taxid specific version was found, not the general version is added
                if len(assocByTypeForGene) > 0:

                    if db:
                        db.createNodeIfNotExists(['EVIDENCE', 'PUBMED'],
                                                 {'id': docID})

                    print("Adding: " + str(docID) + ": " +
                          str(assocByTypeForGene))

                    #
                    # TODO first create all unique edges
                    # TODO add genes to mirna edges and mirnas to gene edges to keep track from where an edge originates
                    # TODO edges should get weights = how many relations have been found
                    #
                    # (mirnaID, mirna id type) -> set of (geneID, 'GENE')
                    mirnaEdges = defaultdict(set)
                    # (geneID, 'GENE') -> set of (mirnaID, mirna id type)
                    geneEdges = defaultdict(set)

                    for gene in assocByTypeForGene:
                        assocsForGene = assocByTypeForGene[
                            gene]  # (mimatSet, miSet, orgmirSet, familySet)

                        for subSet in assocsForGene:
                            for cooc in subSet:
                                geneEdges[(cooc[0], cooc[1])].add(
                                    (cooc[2], cooc[3]))
                                mirnaEdges[(cooc[2], cooc[3])].add(
                                    (cooc[0], cooc[1]))

                    # One gene -> document relationship per gene, carrying
                    # the IDs of the miRNAs it was found with.
                    for edge in geneEdges:
                        edgeMirnas = [x[0] for x in geneEdges[edge]]

                        if db:
                            db.createRelationship('gene', [edge[1]],
                                                  {'id': edge[0]}, 'pmid',
                                                  ['PUBMED'], {'id': docID},
                                                  ['ST_MENTION'], {
                                                      'type': 'GENE_MENTION',
                                                      'mirnas': edgeMirnas
                                                  })

                    # One document -> miRNA relationship per miRNA, carrying
                    # the IDs of the genes it was found with.
                    for edge in mirnaEdges:
                        edgeGenes = [x[0] for x in mirnaEdges[edge]]

                        if db:
                            db.createRelationship('pmid', ['PUBMED'],
                                                  {'id': docID}, 'mi',
                                                  [edge[1]], {'id': edge[0]},
                                                  ['ST_MENTION'], {
                                                      'type': 'MIRNA_MENTION',
                                                      'genes': edgeGenes
                                                  })

    sys.stderr.write("Found {cnt} elems in files {ids}\n".format(
        cnt=str(len(fileCoocs)), ids=str(splitFileIDs)))

    printed = printStuff(None, fileCoocs, None)

    sys.stderr.write(
        "Found {cnt} (printed: {printed}) elems in files {ids}\n".format(
            cnt=str(len(fileCoocs)), ids=str(splitFileIDs), printed=printed))

    return None
def analyseFile(splitFileID, relPMIDs):
    """Store cell line mentions of one split file as relationships in the graph DB.

    Loads the cell line index of the given split file. For every document
    that is contained in ``relPMIDs``, the IDs of the accepted synonym hits
    are collected (hits whose matched text is shorter than 5 characters are
    only accepted when ``hit.perfectHit`` is truthy). For each collected ID a
    ``CELLLINE_MENTION`` relationship to the PUBMED node is created, provided
    that node exists or ``addUnknownPubmeds`` allows creating it.

    Changes from the previous version:

    * The result of ``createRelationship`` was printed via
      ``[x for x in res if res != None]``, which iterates ``res`` before the
      ``None`` test and therefore raises ``TypeError`` when ``res`` is
      ``None``. A ``None`` result is now printed as an empty list.
    * The DB interface is closed in a ``finally`` block so it is released
      even when processing raises.
    * The trailing ``if/elif`` chain consisting only of ``pass`` was removed.

    :param splitFileID: split file number/ID; right-aligned to width 4 and
        padded with zeros
    :param relPMIDs: container of document IDs to process (``in`` is used)
    :return: ``None``
    """
    fileID = "{:>4}".format(splitFileID).replace(" ", "0")
    diseaseHitsFile = resultBase + "/cellline/medline17n" + fileID + ".index"

    hitsFile = SyngrepHitFile(diseaseHitsFile, celllinesMap)
    if len(hitsFile) == 0:
        return

    print("Start Document: " + str(fileID))

    procDB = neo4jInterface(simulate=False, printQueries=False)

    try:
        for docID in hitsFile:
            if docID not in relPMIDs:
                continue

            synHits = hitsFile.getHitsForDocument(docID)

            foundUniqueHits = set()
            foundOrgs = set()

            for hit in synHits:
                # Short matches are only trusted when they are perfect hits.
                if len(hit.foundSyn) < 5 and not hit.perfectHit:
                    continue

                # Remember the organism belonging to the synonym file of
                # this hit.
                foundOrgs.add(synfileID2tax[hit.synonymID.synfile])
                foundUniqueHits.add(hit.synonym.id)

            if len(foundUniqueHits) == 0:
                continue

            for celllineID in foundUniqueHits:
                if addUnknownPubmeds:
                    procDB.createNodeIfNotExists(['EVIDENCE', 'PUBMED'],
                                                 {'id': docID})
                    pubmedExists = True
                else:
                    pubmedExists = bool(
                        procDB.nodeExists(['PUBMED'], {'id': docID}))

                if not pubmedExists:
                    continue

                res = procDB.createRelationship(
                    'cellline', ['CELLLINE'], {'id': celllineID},
                    'pubmed', ['PUBMED'], {'id': docID},
                    ['CELLLINE_MENTION'], None)

                print("Add: ", fileID, docID, celllineID,
                      [] if res is None else list(res))

            # Organisms left after removing those in ``allSet``. Nothing is
            # done with them yet: the original branches for exactly one,
            # none, or several organisms were all placeholders.
            foundOrgs = foundOrgs.difference(allSet)

        print("End Document: " + str(fileID))
    finally:
        procDB.close()
def analyseFile(splitFileIDs, env):
    """Find co-occurrences of two entity types and print the accepted ones.

    For every split file the index files of ``args.folder1`` and
    ``args.folder2`` are loaded; the file is skipped as soon as one of them
    has no hits. For each document present in both hit files (and contained
    in ``accept_pmids`` when that is not ``None``) ``findCooccurrences`` is
    called. Every result whose ``accepted()`` is truthy is printed to stdout
    immediately; all results are accumulated for the summary written to
    stderr.

    Changes from the previous version:

    * The summary reported ``printed=len(fileCoocs)`` although only accepted
      relations are printed; the number of printed relations is now counted.
    * A triple-quoted string statement inside the loop (disabled old output
      code) was removed.
    * ``None`` checks use ``is`` / ``is not``.

    :param splitFileIDs: iterable of split file IDs (concatenated into paths)
    :param env: not used by this function
    :return: always ``None``
    """
    fileCoocs = []
    printedCount = 0

    for splitFileID in splitFileIDs:
        print(splitFileID, file=sys.stderr)

        ent1File = resultBase + "/" + args.folder1 + "/" + splitFileID + ".index"
        ent2File = resultBase + "/" + args.folder2 + "/" + splitFileID + ".index"
        relFile = resultBase + "/relations/" + splitFileID + ".index"
        sentFile = args.sentdir + "/" + splitFileID + ".sent"

        ent1Hits = SyngrepHitFile(ent1File, ent1Syns,
                                  sentIDNoText=args.sentid_no_text)
        if len(ent1Hits) == 0:
            continue

        ent2Hits = SyngrepHitFile(ent2File, ent2Syns,
                                  sentIDNoText=args.sentid_no_text)
        if len(ent2Hits) == 0:
            continue

        relHits = SyngrepHitFile(relFile, relSyns,
                                 sentIDNoText=args.sentid_no_text)

        # The sentence database is only opened once a document with hits for
        # both entity types is actually encountered.
        sentDB = None

        sys.stderr.write("Found something in: " + str(splitFileID) + "\n")

        for docID in ent1Hits:
            if accept_pmids is not None and docID not in accept_pmids:
                continue

            if docID not in ent2Hits:
                continue

            if sentDB is None:
                sentDB = SentenceDB(sentFile, sent_no_byte=args.sent_no_byte)

            ent1SynHits = ent1Hits.getHitsForDocument(docID)
            ent2SynHits = ent2Hits.getHitsForDocument(docID)

            foundCoocs = findCooccurrences(str(docID), ent1SynHits,
                                           ent2SynHits, sentDB, relHits)

            for relEntEnt in foundCoocs:
                if relEntEnt.accepted():
                    # Flushed per relation, as before.
                    print(str(relEntEnt), flush=True)
                    printedCount += 1

            fileCoocs += foundCoocs

    sys.stderr.write("Found {cnt} elems in files {ids}\n".format(
        cnt=str(len(fileCoocs)), ids=str(splitFileIDs)))

    thisProcID = str(os.getpid())

    sys.stderr.write(
        "{procID}: Found {cnt} (printed: {printed}) elems in files {ids}\n".format(
            cnt=str(len(fileCoocs)),
            ids=str(splitFileIDs),
            printed=printedCount,
            procID=thisProcID))

    return None
def analyseFile(splitFileIDs, env):
    """Collect the organisms mentioned per document, with hit locations.

    For every split file with hits in the ``org`` index, the synonym IDs
    found in each document are gathered together with their
    ``(documentID, start, end)`` locations; hits whose matched text contains
    ``"and "`` are ignored. For every document one tuple
    ``(docID, comma-joined orgID2TLC values, locations)`` is recorded,
    covering only the IDs that have a non-``None`` entry in ``orgID2TLC``.
    The recorded tuples are handed to ``printStuff`` and two progress lines
    are written to stderr.

    :param splitFileIDs: iterable of split file IDs (concatenated into paths)
    :param env: not used by this function
    :return: always ``None``
    """
    records = []

    for splitFileID in splitFileIDs:
        indexPath = resultBase + "/org/" + splitFileID + ".index"
        hitFile = SyngrepHitFile(indexPath, diseaseSyns)

        if len(hitFile) == 0:
            continue

        sentencePath = args.sentdir + "/" + splitFileID + ".sent"
        # Loaded as in the original flow; not read below.
        sentDB = SentenceDB(sentencePath)

        sys.stderr.write("Found something in: " + str(splitFileID) + "\n")

        for docID in hitFile:
            locationsByID = defaultdict(list)

            for hit in hitFile.getHitsForDocument(docID):
                if "and " in hit.foundSyn:
                    continue

                locationsByID[hit.synonym.id].append(
                    (str(hit.documentID), hit.position[0], hit.position[1]))

            organismCodes = []
            evidences = []

            for orgID, locations in locationsByID.items():
                code = orgID2TLC.get(orgID, None)
                if code is None:
                    continue

                organismCodes.append(code)
                evidences.extend(locations)

            # One record per document, even when no organism was accepted.
            records.append((docID, ",".join(organismCodes), evidences))

    sys.stderr.write("Found {cnt} elems in files {ids}\n".format(
        cnt=str(len(records)), ids=str(splitFileIDs)))

    printed = printStuff(None, records, None)

    sys.stderr.write(
        "Found {cnt} (printed: {printed}) elems in files {ids}\n".format(
            cnt=str(len(records)),
            ids=str(splitFileIDs),
            printed=printed))

    return None
def analyseFile(splitFileIDs, env):
    """Collect the ontology terms mentioned per document, with hit locations.

    For every split file with hits in its index, the synonym IDs found in
    each document (restricted to ``accept_pmids`` when that is not ``None``)
    are gathered together with their
    ``(documentID, start, end, matched text)`` locations. Hits are ignored
    when the matched text is shorter than ``args.phit`` and the hit is not a
    perfect hit, or when the matched text contains ``"and "``. IDs for which
    ``getTerm`` returns ``None`` and IDs that appear in the
    ``getAllChildren(maxLevel=2)`` result of any gathered term are dropped.
    For each remaining ID a ``(docID, term.id, term.name, locations)`` tuple
    is recorded. The recorded tuples are handed to ``printStuff`` and two
    progress lines are written to stderr.

    Changes from the previous version:

    * A hit without synonym was logged to stderr and then dereferenced
      (``hit.synonym.id``), raising ``AttributeError``. Such hits are now
      logged and skipped.
    * The "Invalid synID" log line is now terminated with a newline.
    * ``None`` checks use ``is`` / ``is not``.

    :param splitFileIDs: iterable of split file IDs (concatenated into paths)
    :param env: not used by this function
    :return: always ``None``
    """
    fileCoocs = []

    for splitFileID in splitFileIDs:
        indexFile = resultBase + "/" + splitFileID + ".index"

        oboHits = SyngrepHitFile(indexFile, oboSyns,
                                 sentIDNoText=args.sentid_no_text)
        if len(oboHits) == 0:
            continue

        sys.stderr.write("Found something in: " + str(splitFileID) + "\n")

        for docID in oboHits:
            if accept_pmids is not None and docID not in accept_pmids:
                continue

            docHits = oboHits.getHitsForDocument(docID)

            synid2loc = defaultdict(list)
            allSynIDs = set()

            for hit in docHits:
                # Short matches are only trusted when they are perfect hits.
                # The comparison against True is kept exactly as before.
                if len(hit.foundSyn) < args.phit and hit.perfectHit != True:
                    continue

                if "and " in hit.foundSyn:
                    continue

                if hit.synonym is None:
                    # Report the broken hit and skip it instead of failing
                    # on the attribute access below.
                    print(hit, file=sys.stderr)
                    sys.stderr.flush()
                    continue

                synID = hit.synonym.id
                allSynIDs.add(synID)
                synid2loc[synID].append(
                    (str(hit.documentID), hit.position[0], hit.position[1],
                     hit.hitSyn))

            removeIDs = set()
            for synID in allSynIDs:
                gterm = getTerm(synID, celloObo)

                if gterm is None:
                    sys.stderr.write("Invalid synID: " + synID + "\n")
                    removeIDs.add(synID)
                    continue

                # Everything returned as a child (up to two levels) of a
                # mentioned term is excluded.
                removeIDs.update(gterm.getAllChildren(maxLevel=2))

            for synID in allSynIDs:
                if synID in removeIDs:
                    continue

                gterm = getTerm(synID, celloObo)
                fileCoocs.append(
                    (docID, gterm.id, gterm.name, synid2loc[synID]))

    sys.stderr.write("Found {cnt} elems in files {ids}\n".format(
        cnt=str(len(fileCoocs)), ids=str(splitFileIDs)))

    printed = printStuff(None, fileCoocs, None)

    sys.stderr.write(
        "Found {cnt} (printed: {printed}) elems in files {ids}\n".format(
            cnt=str(len(fileCoocs)),
            ids=str(splitFileIDs),
            printed=printed))

    return None