def getSideEffectSet(path, seCounter, dValidSes=None):
    """Group the valid side effects of a '$'-separated report file by case id.

    The first line of the file is skipped as a header. Every following line
    is stripped, lower-cased and split on ``$``; field 1 is used as the case
    id and field 2 as the side-effect name. Consecutive rows with the same
    case id form one case; a change of id closes the current case.

    Args:
        path: File to read (decoded as UTF-8, undecodable bytes are ignored).
        seCounter: Dict updated in place through ``utils.add_dict_counter``,
            once per valid side effect of every closed case.
        dValidSes: Container of side-effect names to keep. When omitted,
            nothing is kept and every case maps to an empty set.

    Returns:
        Dict mapping case id to the set of valid side-effect names collected
        for that case. If a case id shows up again after another case, the
        later run replaces the earlier entry.

    Raises:
        AssertionError: if 'medication error' is in ``dValidSes`` or a row
            has an empty side-effect field.
        IndexError: if a row has fewer than three '$'-separated fields.
    """
    if dValidSes is None:
        dValidSes = frozenset()

    dCase2Se = dict()

    def closeCase(caseId, seSet):
        # Record a finished case and count each of its valid side effects.
        dCase2Se[caseId] = seSet
        for se in seSet:
            utils.add_dict_counter(seCounter, se)

    with open(path, encoding="utf8", errors='ignore') as fin:
        fin.readline()  # skip the header row
        print("Loading: ...", path)
        assert 'medication error' not in dValidSes

        currentId = -1  # sentinel: no case opened yet
        currentSESet = set()
        for line in fin:
            parts = line.strip().lower().split("$")
            caseId = parts[1]
            seName = parts[2]
            assert len(seName) > 0
            if caseId != currentId:
                if currentId != -1:
                    closeCase(currentId, currentSESet)
                currentId = caseId
                currentSESet = set()
            if seName in dValidSes:
                currentSESet.add(seName)

        # The final case has no following row to trigger the flush above, so
        # close it explicitly; otherwise it would be silently dropped.
        if currentId != -1:
            closeCase(currentId, currentSESet)
    return dCase2Se
def finalStats():
    """Aggregate drug frequencies under their mapped names and plot cut-offs.

    Reads two '||'-separated maps (``finalMap/FinalMap.txt`` and
    ``finalMap/FinalMapH.txt``) and the tab-separated ``Tmp/DrugFreq2.txt``
    (count, name) below ``params.OUTPUT_DIR``. For each frequency row:

    * if the name is a key of the first map, the count is added under the
      mapped value;
    * otherwise, if the name is a key of the second map, the count is added
      under the original name;
    * otherwise the row is ignored.

    The result, ordered by ``utils.sort_dict``, is written to
    ``FinalDrugFreq.txt`` and passed to ``plotCul`` twice.
    """

    def loadMap(path):
        # Read a two-column '||' file into a dict: column 0 -> column 1.
        mapping = dict()
        with open(path) as fMap:
            for row in fMap:
                parts = row.strip().split("||")
                mapping[parts[0]] = parts[1]
        return mapping

    dMap = loadMap("%s/finalMap/FinalMap.txt" % params.OUTPUT_DIR)
    dMapH = loadMap("%s/finalMap/FinalMapH.txt" % params.OUTPUT_DIR)

    dFreq = dict()
    # The frequency file used to be left open; the context manager closes it.
    with open("%s/Tmp/DrugFreq2.txt" % params.OUTPUT_DIR) as fin:
        for line in fin:
            parts = line.strip().split("\t")
            drugJader = parts[1]
            c = int(parts[0])
            # -1 is the "not found" marker handed to utils.get_dict.
            dDrugBank = utils.get_dict(dMap, drugJader, -1)
            d2 = utils.get_dict(dMapH, drugJader, -1)
            if dDrugBank != -1:
                utils.add_dict_counter(dFreq, dDrugBank, c)
            elif d2 != -1:
                utils.add_dict_counter(dFreq, drugJader, c)

    kvs = utils.sort_dict(dFreq)
    with open("%s/FinalDrugFreq.txt" % params.OUTPUT_DIR, "w") as fout:
        for k, v in kvs:
            # "%.6s" keeps at most the first six characters of the count.
            fout.write("%.6s\t%s\n" % (v, k))

    from plotLib import plotCul

    # NOTE(review): both calls use the same figure name; presumably the second
    # replaces the output of the first -- confirm whether both are needed.
    plotCul(kvs[::-1], 50, 2, "SelectedDrugCutOff", xLabel="ThreshHold: Freq >=", yLabel="Number of Drugs")
    plotCul(kvs[::-1], 20, 1, "SelectedDrugCutOff", xLabel="ThreshHold: Freq >=", yLabel="Number of Drugs")
def stats3():
    """Plot how often each key occurs in the ``FSUBTEST/3`` result files.

    ``FSUBTEST/3/FileMap.txt`` (below ``params.FADER_OUT``) lists one file
    name per row in its first tab-separated column. Each listed file is read
    line by line, and the text before the first ``_`` of every line is counted
    as a key (the axis label calls these 3-drug combinations).

    The figure shows, for every count value 1..max, how many keys occur that
    many times (left axis) and the same numbers as a proportion (right axis),
    limited to x in [1, 5]. It is saved as ``D3_Freq.png`` in
    ``params.FIG_DIR``.

    Raises:
        ValueError: if no key was read (``max`` of an empty collection).
    """
    from matplotlib import pyplot as plt

    baseDir = "%s/FSUBTEST/3" % params.FADER_OUT
    dComCount = dict()
    # Both the index file and every listed file are now closed deterministically.
    with open("%s/FileMap.txt" % baseDir) as fin:
        for line in fin:
            hashFile = line.strip().split("\t")[0]
            with open("%s/%s" % (baseDir, hashFile)) as f:
                for row in f:
                    drug = row.strip().split("_")[0]
                    utils.add_dict_counter(dComCount, drug)

    counts = dComCount.values()
    maxCount = max(counts)
    dCountFreq = dict()
    for c in counts:
        utils.add_dict_counter(dCountFreq, c)

    # arCountFreq[i] = number of keys that occur exactly i + 1 times.
    arCountFreq = np.zeros(maxCount)
    for c, v in dCountFreq.items():
        arCountFreq[c - 1] = v
    x = np.arange(1, maxCount + 1)

    fig = plt.figure(figsize=(8, 4))
    ax = fig.add_subplot(1, 2, 1)
    ax.scatter(x, arCountFreq)
    plt.xlabel("3-Drug Combinations with Frequencies")
    plt.ylabel("Number of Combinations")

    total = np.sum(arCountFreq)  # renamed from `sum` to stop shadowing the builtin
    prop = arCountFreq / total
    ax2 = ax.twinx()
    ax2.scatter(x, prop)
    plt.ylabel("Percentage")
    plt.xlim(1, 5)

    plt.title("%s Combinations (%s)" % (np.sum(arCountFreq[:5]), round(np.sum(prop[:5]), 2)))
    plt.tight_layout()
    plt.savefig("%s/D3_Freq.png" % params.FIG_DIR)
def getFDADrug():
    """Compare the frequent side-effect names of two data files.

    Counts the comma-separated names found in column 0 of the '|'-separated
    ``polyDrugADR.txt`` (``params.DATA_DIR``) and in column 1 of the
    '$'-separated ``CADER.txt`` (``params.CAD_OUT``). Names with a count of at
    least 5 (first file) or 60 (second file) are kept.

    Prints the number of distinct names per file, then the sizes of both kept
    sets, how many names are exclusive to each set, and those exclusive
    counts as a fraction of the respective set (``nan`` for an empty set).
    """

    def countNames(path, sep, col):
        # Count every comma-separated name in column `col` of a `sep`-separated file.
        counts = dict()
        with open(path) as fin:
            for line in fin:
                for se in line.strip().split(sep)[col].split(","):
                    utils.add_dict_counter(counts, se)
        return counts

    def ratio(part, whole):
        # An empty kept set used to raise ZeroDivisionError; report nan instead.
        return part / whole if whole else float("nan")

    se1 = countNames("%s/polyDrugADR.txt" % params.DATA_DIR, "|", 0)
    se2 = countNames("%s/CADER.txt" % params.CAD_OUT, "$", 1)

    kvs1 = utils.sort_dict(se1)
    kvs2 = utils.sort_dict(se2)
    print(len(kvs1), len(kvs2))

    MIN_T = 5    # threshold applied to polyDrugADR.txt counts
    MIN_T2 = 60  # threshold applied to CADER.txt counts
    k1 = {k for k, v in kvs1 if v >= MIN_T}
    k2 = {k for k, v in kvs2 if v >= MIN_T2}

    n1 = len(k1 - k2)
    n2 = len(k2 - k1)
    print(
        len(k1),
        len(k2),
        n1,
        n2,
        ratio(n1, len(k1)),
        ratio(n2, len(k2)),
    )
def exportSubG2():
    """Split ``JADER.txt`` into per-size files and expand pairs into ``G2``.

    Each input row is ``drugs$sideEffects`` with comma-separated drug names.
    The row is rewritten with its drug names sorted into
    ``SUB/G<number of drugs>`` below ``params.JADER_OUT``. Rows with 3 to 20
    drugs are additionally expanded into every unordered pair of their sorted
    drugs, each written to ``SUB/G2`` with the row's side effects.

    For every drug-combination size the side-effect names are counted, and a
    dict mapping size to the names ordered by ``utils.sort_dict`` is saved
    with ``utils.save_obj`` as ``SUB/drugSize2CommonSEs``.
    """
    from itertools import combinations

    foutDict = dict()      # combination size -> open output file
    dlen2SeCount = dict()  # combination size -> {side effect: count}
    nA = 0
    print("Reading...")
    try:
        with open("%s/JADER.txt" % params.JADER_OUT) as fin:
            for line in fin:
                nA += 1
                print("\r%s" % nA, end="")
                parts = line.strip().split("$")
                ses = parts[1]
                drugs = sorted(parts[0].split(","))
                nD = len(drugs)

                fO = utils.get_dict(foutDict, nD, -1)
                if fO == -1:
                    fO = open("%s/SUB/G%s" % (params.JADER_OUT, nD), "w")
                    foutDict[nD] = fO
                fO.write("%s$%s\n" % (",".join(drugs), ses))

                if 2 < nD <= 20:
                    # The pair file is opened lazily, like the per-size files.
                    if 2 not in foutDict:
                        foutDict[2] = open("%s/SUB/G%s" % (params.JADER_OUT, 2), "w")
                    f2 = foutDict[2]
                    # combinations() yields (drugs[i], drugs[j]) for i < j in
                    # the same order as the nested index loops it replaces.
                    for d1, d2 in combinations(drugs, 2):
                        f2.write("%s,%s$%s\n" % (d1, d2, ses))

                len2SeCount = utils.get_insert_key_dict(dlen2SeCount, nD, dict())
                for se in ses.split(","):
                    utils.add_dict_counter(len2SeCount, se)
    finally:
        # Close every output file even when a malformed row aborts the loop.
        for fOut in foutDict.values():
            fOut.close()

    d2 = dict()
    for k, v in dlen2SeCount.items():
        d2[k] = [kk for kk, _ in utils.sort_dict(v)]
    utils.save_obj(d2, "%s/SUB/drugSize2CommonSEs" % params.JADER_OUT)
def plot3X():
    """Draw a 3-D line of combination length vs. combination and report counts.

    * x: lengths 1..max, where max is the largest value stored in the
      ``FDrugCombLength`` object loaded with ``utils.load_obj``.
    * y: how many entries of that object have each length.
    * z: per length, the summed counts read from ``FDrug2AllSeList.txt``.
      Each row's text before the first ``$`` is ``<drugs>:<count>`` and the
      length is the number of comma-separated drugs.

    The figure is saved as ``3DDrugCombLengthReport.png`` in
    ``params.FIG_DIR``.

    Raises:
        IndexError: if the text file contains a combination longer than the
            largest length in the loaded object.
    """
    import matplotlib.pyplot as plt
    import numpy as np

    dLength = utils.load_obj("%s/FDrugCombLength" % params.FADER_OUT)
    dCount = dict()
    for _, v in utils.sort_dict(dLength):
        utils.add_dict_counter(dCount, v)

    maxLength = max(dCount.keys())
    x = list(range(1, maxLength + 1))
    y = np.zeros(maxLength)
    for k, v in dCount.items():
        y[k - 1] = v

    dLength2NReports = dict()
    with open("%s/FDrug2AllSeList.txt" % params.FADER_OUT) as fin:
        for line in fin:
            parts = line.strip().split("$")[0].split(":")
            c = int(parts[1])
            drugCombLength = len(parts[0].split(","))
            utils.add_dict_counter(dLength2NReports, drugCombLength, c)

    z = np.zeros(maxLength)
    for k, v in dLength2NReports.items():
        z[k - 1] = v

    fig = plt.figure()
    ax = fig.add_subplot(projection='3d')
    ax.plot(x, y, z, marker='>')
    ax.set_xlabel('DrugComb Length')
    ax.set_ylabel('DrugComb Count')
    ax.set_zlabel('NReport')
    # The original referenced plt.tight_layout without calling it (a no-op).
    plt.tight_layout()
    plt.savefig("%s/3DDrugCombLengthReport.png" % params.FIG_DIR)
def plotDrugCombCount():
    """Plot report counts per combination length and cumulative cut-off curves.

    Reads ``FDrug2AllSeList.txt`` below ``params.FADER_OUT``. Each row's text
    before the first ``$`` is ``<drugs>:<count>``; the combination length is
    the number of comma-separated drugs.

    Outputs:
        * ``FDAReportsOnDrugLength.png`` in ``params.FIG_DIR``: summed counts
          per combination length as a scatter plot.
        * Printed diagnostics: number of rows, last and first
          ``[drugs, count]`` entry, largest and smallest count.
        * Two plots produced by ``plotCul`` / ``plotCul2`` from the entries in
          reversed file order.

    Raises:
        ValueError: if the input file has no rows.
    """
    dLength2NReports = dict()
    kv = []  # [drugs, count] per row, in file order
    vs = []  # count per row
    with open("%s/FDrug2AllSeList.txt" % params.FADER_OUT) as fin:
        for line in fin:
            parts = line.strip().split("$")[0].split(":")
            c = int(parts[1])
            drugCombLength = len(parts[0].split(","))
            utils.add_dict_counter(dLength2NReports, drugCombLength, c)
            vs.append(c)
            kv.append([parts[0], c])

    import matplotlib.pyplot as plt
    import numpy as np

    maxX = max(dLength2NReports.keys())
    x = list(range(1, maxX + 1))
    y = np.zeros(maxX)
    for k, v in dLength2NReports.items():
        y[k - 1] = v
    plt.scatter(x, y)
    plt.xlabel("DrugComb length")
    plt.ylabel("Num Reports")
    plt.tight_layout()
    plt.savefig("%s/FDAReportsOnDrugLength.png" % params.FIG_DIR)

    from dataProcessing.plotLib import plotCul2, plotCul

    print(len(kv), kv[-1])
    print(kv[0])
    print(max(vs), min(vs))
    plotCul(kv[::-1], 10, 1, "DrugCombFreq", xLabel="Threshold of DrugComb Frequency", yLabel="Num DrugComb")
    plotCul2(kv[::-1], 10, 1, "DrugCombReports", xLabel="Threshold of DrugComb Frequency", yLabel="Num Reports")
def statsCommonSes():
    """Save the side-effect names that occur more than 20 times in CADER.txt.

    The last '$'-separated field of every row of ``CADER.txt``
    (``params.CAD_OUT``) is split on commas and each name is counted. Names
    whose count exceeds 20 are stored, in the order produced by
    ``utils.sort_dict``, with ``utils.save_obj`` under ``SeTopList.txt``.
    """
    dSeCout = dict()
    # The input handle was previously never closed.
    with open("%s/CADER.txt" % (params.CAD_OUT)) as fin:
        for line in fin:
            for se in line.strip().split("$")[-1].split(","):
                utils.add_dict_counter(dSeCout, se)

    ks = [k for k, v in utils.sort_dict(dSeCout) if v > 20]
    utils.save_obj(ks, "%s/SeTopList.txt" % params.CAD_OUT)
def exportSub():
    """Split ``FDrug2SeList_19814.txt`` into one file per combination size.

    Each input row is ``drugs$sideEffects`` with comma-separated drug names.
    The row is rewritten with its drug names sorted into
    ``SUB/<number of drugs>`` below ``params.FADER_OUT``.

    For every combination size the side-effect names are counted, and a dict
    mapping size to the names ordered by ``utils.sort_dict`` is saved with
    ``utils.save_obj`` as ``SUB/drugSize2CommonSEs``.
    """
    foutDict = dict()      # combination size -> open output file
    dlen2SeCount = dict()  # combination size -> {side effect: count}
    nA = 0
    print("Reading...")
    try:
        with open("%s/FDrug2SeList_19814.txt" % params.FADER_OUT) as fin:
            for line in fin:
                nA += 1
                print("\r%s" % nA, end="")
                parts = line.strip().split("$")
                ses = parts[1]
                drugs = parts[0].split(",")
                nD = len(drugs)

                fO = utils.get_dict(foutDict, nD, -1)
                if fO == -1:
                    fO = open("%s/SUB/%s" % (params.FADER_OUT, nD), "w")
                    foutDict[nD] = fO
                fO.write("%s$%s\n" % (",".join(sorted(drugs)), ses))

                len2SeCount = utils.get_insert_key_dict(dlen2SeCount, nD, dict())
                for se in ses.split(","):
                    utils.add_dict_counter(len2SeCount, se)
    finally:
        # Close every output file even when a malformed row aborts the loop.
        for fOut in foutDict.values():
            fOut.close()

    d2 = dict()
    for k, v in dlen2SeCount.items():
        d2[k] = [kk for kk, _ in utils.sort_dict(v)]
    utils.save_obj(d2, "%s/SUB/drugSize2CommonSEs" % params.FADER_OUT)
def exportCanSaltFreq():
    """Write the frequent second-column values of ``MatchingDrug2.txt``.

    Counts the second '||'-separated field of every row of
    ``rawMatching/MatchingDrug2.txt`` (``params.OUTPUT_DIR``) and writes each
    value that occurs more than twice, one per line and in the order given
    by ``utils.sort_dict``, to ``rawMatching/CandSaltFreq.txt``.
    """
    wordFreqs = dict()
    # The input handle was previously never closed.
    with open("%s/rawMatching/MatchingDrug2.txt" % params.OUTPUT_DIR) as fin:
        for line in fin:
            parts = line.strip().split("||")
            utils.add_dict_counter(wordFreqs, parts[1])

    kvs = utils.sort_dict(wordFreqs)
    with open("%s/rawMatching/CandSaltFreq.txt" % params.OUTPUT_DIR, "w") as fout:
        for k, v in kvs:
            if v <= 2:
                continue
            fout.write("%s\n" % (k))
def checkDupR():
    """Count how often each leading id occurs in ``ReportDrug1.txt``.

    The first '$'-separated field of every row of ``ReportDrug1.txt``
    (``params.CAD_OUT``) is counted. The counts, ordered by
    ``utils.sort_dict``, are written as ``id<TAB>count`` rows to ``S1.txt``.
    Prints the number of rows read.
    """
    dCout = dict()
    nError = 0  # never incremented; kept so the printed summary is unchanged
    cc = 0
    # The input handle was previously never closed.
    with open("%s/ReportDrug1.txt" % params.CAD_OUT) as fin:
        for line in fin:
            idx = line.strip().split("$")[0]
            utils.add_dict_counter(dCout, idx)
            cc += 1
    print("Total: ", nError, cc)

    kvs = utils.sort_dict(dCout)
    with open("%s/S1.txt" % params.CAD_OUT, "w") as fout:
        for k, v in kvs:
            fout.write("%s\t%s\n" % (k, v))
def plotDrugCombLength():
    """Scatter-plot how many stored drug combinations have each length.

    Loads the ``DrugCombLength`` object from ``params.JADER_OUT``, counts how
    often every length value occurs and saves the scatter plot as
    ``JADERDrugLength.png`` in ``params.FIG_DIR``.
    """
    comb2Length = utils.load_obj("%s/DrugCombLength" % params.JADER_OUT)

    # length -> number of combinations with that length
    lengthFreq = dict()
    for pair in utils.sort_dict(comb2Length):
        length = pair[1]
        utils.add_dict_counter(lengthFreq, length)

    longest = max(lengthFreq)
    lengths = list(range(1, longest + 1))

    import numpy as np
    freqs = np.zeros(longest)
    for length, freq in lengthFreq.items():
        freqs[length - 1] = freq

    import matplotlib.pyplot as plt
    plt.scatter(lengths, freqs)
    plt.xlabel("DrugComb length")
    plt.ylabel("Num DrugComb")
    plt.tight_layout()
    plt.savefig("%s/%s.png" % (params.FIG_DIR, "JADERDrugLength"))
def exportNoMatching():
    """Copy the drug-frequency rows whose name has no match to a new file.

    The two matching tables (``rawMatching/MatchingDrug1.txt`` and
    ``rawMatching/MatchingDrug2.txt``) are loaded with ``loadMatchingFiles``
    and merged, entries of the second replacing those of the first. Every row
    of ``Tmp/DrugFreq2.txt`` (``count<TAB>name``) whose stripped, lower-cased
    name is not a key of the merged table is written unchanged to
    ``rawMatching/NoMatchingDrugFreq.txt``.
    """
    matched = loadMatchingFiles("%s/rawMatching/MatchingDrug1.txt" % params.OUTPUT_DIR)
    extra = loadMatchingFiles("%s/rawMatching/MatchingDrug2.txt" % params.OUTPUT_DIR)
    print("Matching targets: ", len(matched.values()), len(set(extra.values())))
    for key, target in extra.items():
        matched[key] = target
    knownNames = matched.keys()

    freqFile = open("%s/Tmp/DrugFreq2.txt" % params.OUTPUT_DIR)
    unmatchedFile = open("%s/rawMatching/NoMatchingDrugFreq.txt" % params.OUTPUT_DIR, "w")
    targetCounts = dict()
    unmatchedNames = set()
    for row in freqFile:
        fields = row.strip().split("\t")
        name = fields[1].strip().lower()
        count = int(fields[0])
        if name not in knownNames:
            # Keep the raw row (including its line ending) for later review.
            unmatchedFile.write("%s" % row)
            unmatchedNames.add(name)
            continue
        utils.add_dict_counter(targetCounts, matched[name], count)
    freqFile.close()
    unmatchedFile.close()
def exportDrugCom2Side():
    """Summarise the side effects reported for every drug combination.

    Reads ``JADER.txt`` (``params.JADER_OUT``), whose rows are
    ``drugs$sideEffects`` with comma-separated items, and produces:

    * ``JADER2AllSeList.txt``: one row per distinct combination in the order
      given by ``utils.sort_dict`` over the combination counts, formatted as
      ``combination:count$numberOfSideEffects$se:count,se:count,...``.
    * ``JADERSeCountFX``: overall side-effect counts (``utils.save_obj``).
    * ``DrugCombLength``: number of drugs per combination
      (``utils.save_obj``).

    Prints the number of distinct drugs and distinct side effects.
    """
    dDrugComb2Se = dict()     # combination -> {side effect: count}
    dDrugCombCount = dict()   # combination -> number of rows
    dDrugCom2Lenght = dict()  # combination -> number of drugs
    drugCont = dict()         # drug -> number of rows
    seCount = dict()          # side effect -> number of occurrences

    # The input handle was previously never closed.
    with open("%s/JADER.txt" % params.JADER_OUT) as fin:
        for line in fin:
            parts = line.strip().split("$")
            drugCom = parts[0]
            drugs = drugCom.split(",")
            dDrugCom2Lenght[drugCom] = len(drugs)
            ses = parts[1].split(",")
            utils.add_dict_counter(dDrugCombCount, drugCom, 1)
            for drug in drugs:
                utils.add_dict_counter(drugCont, drug, 1)
            sesComb = utils.get_insert_key_dict(dDrugComb2Se, drugCom, dict())
            for se in ses:
                utils.add_dict_counter(sesComb, se, 1)
                utils.add_dict_counter(seCount, se)

    with open("%s/JADER2AllSeList.txt" % params.JADER_OUT, "w") as fout:
        for k, v in utils.sort_dict(dDrugCombCount):
            sString = ["%s:%s" % (se, count) for se, count in utils.sort_dict(dDrugComb2Se[k])]
            fout.write("%s:%s$%s$%s\n" % (k, v, len(sString), ",".join(sString)))

    utils.save_obj(seCount, "%s/JADERSeCountFX" % params.JADER_OUT)
    utils.save_obj(dDrugCom2Lenght, "%s/DrugCombLength" % params.JADER_OUT)
    print(len(drugCont), len(seCount))
def exportPair():
    """Count drugs, indications, side effects and drug pairs in CADER.txt.

    Rows of ``CADER.txt`` (``OUT_DIR``) are '$'-separated; fields 1, 2 and 3
    hold comma-separated drugs, indications and side effects. Rows with 2 to
    20 drugs also contribute every unordered pair of their sorted drug names
    (``"a,b"``).

    The four counters, ordered by ``utils.sort_dict``, are written with
    ``writeSortedDictC`` to ``<PREF>ADrugs.txt``, ``<PREF>AInd.txt``,
    ``<PREF>ASe.txt`` and ``<PREF>Pairs.txt`` in ``OUT_DIR``. Prints the
    number of distinct pairs.
    """
    from itertools import combinations

    validDrugs = dict()
    validPairs = dict()
    validIndicates = dict()
    validSes = dict()

    # The input handle was previously never closed.
    with open("%s/CADER.txt" % OUT_DIR) as fin:
        for line in fin:
            parts = line.strip().split("$")
            drugs = parts[1].split(",")
            indications = parts[2]
            ses = parts[3]

            for drug in drugs:
                utils.add_dict_counter(validDrugs, drug)
            for ind in indications.split(","):
                utils.add_dict_counter(validIndicates, ind)
            for se in ses.split(","):
                utils.add_dict_counter(validSes, se)

            if 2 <= len(drugs) <= 20:
                # combinations() on the sorted names yields the same pairs, in
                # the same order, as the nested i < j index loops it replaces.
                for d1, d2 in combinations(sorted(drugs), 2):
                    utils.add_dict_counter(validPairs, "%s,%s" % (d1, d2))

    cDrug = utils.sort_dict(validDrugs)
    cInd = utils.sort_dict(validIndicates)
    cSe = utils.sort_dict(validSes)
    cPair = utils.sort_dict(validPairs)
    print(len(cPair))

    writeSortedDictC(cDrug, "%s/%sADrugs.txt" % (OUT_DIR, PREF))
    writeSortedDictC(cInd, "%s/%sAInd.txt" % (OUT_DIR, PREF))
    writeSortedDictC(cSe, "%s/%sASe.txt" % (OUT_DIR, PREF))
    writeSortedDictC(cPair, "%s/%sPairs.txt" % (OUT_DIR, PREF))
def getDrugSet(path, dDrugSet, dDrugCombSet, dMap=dict()):
    """Collect per-case drug sets from a '$'-separated report file.

    The first line is skipped as a header. Every other line is stripped,
    lower-cased and split on ``$``; field 1 is the case id and field 4 the
    drug name, which is cleaned with ``stripDrugNameO``. Consecutive rows with
    the same case id form one case.

    Args:
        path: File to read (UTF-8, undecodable bytes ignored).
        dDrugSet: Counter dict updated in place via ``utils.add_dict_counter``.
        dDrugCombSet: Counter dict updated in place, keyed by a tuple of the
            drug names of a case.
        dMap: Optional name mapping. When empty, raw names are counted
            directly; otherwise names are looked up in it and a case with an
            unmapped name is marked to be skipped.

    Returns:
        None. Results are accumulated in ``dDrugSet`` and ``dDrugCombSet``.

    NOTE(review): the source had lost its indentation; the case-change block
    below was reconstructed at loop level (not inside the ``else``) -- confirm.
    NOTE(review): the case still open at end of file is never recorded.
    """
    fin = open(path, encoding="utf8", errors='ignore')
    fin.readline()  # skip the header row
    currentId = -1  # sentinel: no case opened yet
    currentDrugSet = set()
    print("Loading: ...", path)
    skipCase = False
    while True:
        line = fin.readline()
        if line == "":
            break
        line = line.strip().lower()
        parts = line.split("$")
        caseId = parts[1]
        drugName = parts[4]
        drugName = stripDrugNameO(drugName)
        if len(drugName) == 0:
            # A row with an empty cleaned name marks the case as skipped and
            # discards what has been collected so far (nothing is recorded).
            skipCase = True
            currentId = caseId
            currentDrugSet = set()
            continue
        if len(dMap) == 0:
            # No mapping supplied: count the raw name for every row.
            utils.add_dict_counter(dDrugSet, drugName)
        else:
            # -1 is the "not found" marker passed to utils.get_dict.
            drugName = utils.get_dict(dMap, drugName, -1)
            if drugName == -1:
                skipCase = True
        if caseId != currentId:
            # NOTE(review): skipCase may have just been set by this first row
            # of the NEW case, which then also suppresses recording the
            # previous case -- confirm this is intended.
            if currentId != -1 and not skipCase:
                # NOTE(review): tuple order follows set iteration order, so
                # the key is not a canonical (sorted) form -- confirm.
                utils.add_dict_counter(dDrugCombSet, tuple(currentDrugSet), 1)
                for dName in currentDrugSet:
                    utils.add_dict_counter(dDrugSet, dName)
            currentId = caseId
            currentDrugSet = set()
            if drugName != -1:
                # Only a usable first row clears the skip flag for the new case.
                skipCase = False
        if not skipCase:
            if type(drugName) == int:
                # Sanity check: an unmapped (-1) name must never reach the set.
                print(currentId, caseId)
                print(line)
                exit(-1)
            currentDrugSet.add(drugName)
    fin.close()
def exportBySE(seNames, pathIn, dirOut, pathInfo):
    """Build 2x2 contingency tables per (drug combination, side effect) and
    hand them to producer/consumer processes that write into ``dirOut``.

    Args:
        seNames: Iterable of side-effect names to analyse (converted to a set).
        pathIn: Input file with ``drugCombination$se1,se2,...`` rows.
        dirOut: Directory of the output file, which is named after
            ``hash("__".join(seNames))``.
        pathInfo: File to which a ``<output name>\\t<joined seNames>`` row is
            appended.

    ``producer`` and ``consumer`` are defined elsewhere; what they compute and
    write is not visible from this function.
    """
    fin = open(pathIn)
    dCombCount = dict()  # se -> {drug combination: number of rows}
    dCombSe = dict()     # se -> {drug combination: rows that list se}
    dSe = dict()         # se -> number of rows that list se
    nA = 0               # total number of rows
    print("Reading...")
    if not type(seNames) == set:
        seNames = set(seNames)
    print(seNames)
    while True:
        line = fin.readline()
        if line == "":
            break
        nA += 1
        parts = line.strip().split("$")
        drugCmb = parts[0]
        ses = parts[1]
        ses = set(ses.split(","))
        for se in seNames:
            # The combination is counted for every requested se, whether or
            # not the row lists that se.
            dCombCountx = utils.get_insert_key_dict(dCombCount, se, dict())
            utils.add_dict_counter(dCombCountx, drugCmb)
            if se in ses:
                dComSEx = utils.get_insert_key_dict(dCombSe, se, dict())
                utils.add_dict_counter(dSe, se)
                utils.add_dict_counter(dComSEx, drugCmb)
    fin.close()
    print("Cal Contingency table...")
    dContigenTable = dict()
    for se in seNames:
        dCombCountx = dCombCount[se]
        dComSEx = utils.get_dict(dCombSe, se, dict())
        nSe = utils.get_dict(dSe, se, 0)
        if nSe == 0:
            continue
        for drugComb, nComb in dCombCountx.items():
            ar = np.zeros((2, 2))
            nCombSe = utils.get_dict(dComSEx, drugComb, 0)
            if nCombSe == 0:
                # print("SKIP")
                continue
            ar[0, 0] = nCombSe                          # combination and se
            ar[1, 0] = nComb - nCombSe                  # combination without se
            ar[0, 1] = nSe - nCombSe                    # se without combination
            ar[1, 1] = nA - (nComb + nSe - nCombSe)     # neither
            nName = "%s_%s" % (drugComb, se)
            dContigenTable[nName] = ar
    producers = []
    consumers = []
    queue = Queue(params.K_FOLD)
    counter = Value('i', 0)
    counter2 = Value('i', 0)
    inputList = list(dContigenTable.items())
    nInputList = len(inputList)
    nDPerWorker = int(nInputList / params.N_DATA_WORKER)
    # assert 'g-csf' in allDrugNames
    for i in range(params.N_DATA_WORKER):
        # Equal-sized slices; the last worker also takes the remainder.
        startInd = i * nDPerWorker
        endInd = (i + 1) * nDPerWorker
        endInd = min(endInd, nInputList)
        if i == params.N_DATA_WORKER - 1:
            endInd = nInputList
        data = inputList[startInd:endInd]
        producers.append(Process(target=producer, args=(queue, data)))
    sname = "__".join(list(seNames))
    # NOTE(review): hash() of a str can differ between interpreter runs unless
    # PYTHONHASHSEED is fixed; the row appended to pathInfo is what links the
    # output file name back to the side-effect names.
    seNameString = "%s" % hash(sname)
    fFileNameMap = open(pathInfo, "a")
    fFileNameMap.write("%s\t%s\n" % (seNameString, sname))
    fFileNameMap.close()
    fout = open("%s/%s" % (dirOut, seNameString), "w")
    p = Process(target=consumer, args=(queue, counter, counter2, fout, []))
    p.daemon = True
    consumers.append(p)
    print("Start Producers...")
    for p in producers:
        p.start()
    print("Start Consumers...")
    for p in consumers:
        p.start()
    for p in producers:
        p.join()
    print("Finish Producers")
    # None is queued after all producers have finished; presumably the
    # consumer treats it as an end-of-data sentinel -- confirm in consumer().
    queue.put(None)
    # Poll until the shared counter becomes non-zero; presumably the consumer
    # sets it when it is done -- confirm in consumer().
    while True:
        if counter.value == 0:
            time.sleep(0.01)
            continue
        else:
            break
    fout.flush()
    fout.close()