def getDataFrame():
    """Reshape the wide method-comparison table into a long table for R.

    Reads ``compare.txt``; its ten tab-separated columns are consumed in
    pairs (``our`` then ``their``) for the traits BD, CAD, Height, RA and TC,
    in that order.  Each column is treated as an ordered list of
    tissue/cell-type names.  One output row is written per
    (trait, method, tissue) with ``Rank = column length - position of the
    tissue in that column``, so the first entry of a column gets the largest
    value.

    Raises:
        IndexError: if an input line has fewer than ten columns.
        ValueError: if a tissue of the BD/``our`` column is missing from
            another column.
    """
    in_path = 'F:\\Projects\\TEA\\GWAS_tmp\\analysis\\Rplot\\compare.txt'
    out_path = 'F:\\Projects\\TEA\\GWAS_tmp\\analysis\\Rplot\\compareDataFrame.txt'

    diseases = 'BD,CAD,Height,RA,TC'.split(',')
    methods = ('our', 'their')
    dataframe = {dis: {met: [] for met in methods} for dis in diseases}

    # Column layout: BD/our, BD/their, CAD/our, CAD/their, ... TC/their.
    for line in FF.getLineByPath(in_path):
        arr = line.split('\t')
        col = 0
        for dis in diseases:
            for met in methods:
                dataframe[dis][met].append(arr[col])
                col += 1

    # The set of tissues to report is taken from the BD/"our" column.
    tissueSet = sorted(dataframe['BD']['our'])

    # The writer is opened only after the input was read successfully, and
    # is always closed, even when a lookup below raises.
    wt = FF.getWriter(out_path, False)
    try:
        wt.write('TraitDisease\tMethod\tTissueCellType\tRank\n')
        for dis in diseases:
            for met in methods:
                ordered = dataframe[dis][met]
                for tis in tissueSet:
                    rank = len(ordered) - ordered.index(tis)
                    wt.write(dis + '\t' + met + '\t' + tis + '\t' +
                             str(rank) + '\n')
    finally:
        wt.close()
def getNGData():
    """Convert the wide ``NGresults.txt`` table into a long table.

    ``overlap.txt`` supplies tissue-name pairs (column 0: "our" name,
    column 1: "their" name).  The first line of ``NGresults.txt`` is a header
    whose columns after the first are trait/disease names; each later row
    starts with a "their" tissue name.  Rows whose tissue is not listed in
    ``overlap.txt`` are skipped.  The output has one line per
    (trait, tissue) with the tissue translated to the "our" name.
    """
    overlap_path = 'F:\\Projects\\TEA\\GWAS_tmp\\analysis\\Rplot\\overlap.txt'
    out_path = 'F:\\Projects\\TEA\\GWAS_tmp\\analysis\\Rplot\\NGdata.txt'
    ng_path = 'F:\\Projects\\TEA\\GWAS_tmp\\analysis\\Rplot\\NGresults.txt'

    # "their" name -> "our" name; setdefault keeps the first pairing when a
    # "their" name appears more than once.
    their_to_our = {}
    for row in FF.getLineByPath(overlap_path):
        fields = row.split('\t')
        their_to_our.setdefault(fields[1], fields[0])

    ng_rows = FF.getLineByPath(ng_path)
    diss = next(ng_rows).split('\t')[1:]
    print(diss)

    dis_tis = {dis: [] for dis in diss}
    tiss = []
    for row in ng_rows:
        fields = row.split('\t')
        if fields[0] not in their_to_our:
            continue
        tiss.append(their_to_our[fields[0]])
        for col, dis in enumerate(diss, start=1):
            dis_tis[dis].append(fields[col])

    wt = FF.getWriter(out_path, False)
    wt.write('TraitDisease\tTissueCellType\tValue\n')
    for dis, values in dis_tis.items():
        for i, value in enumerate(values):
            wt.write('\t'.join((dis, tiss[i], str(value))) + '\n')
    wt.close()
def getConditionalGeneList(dir, outPath, outPathGC, pattern):
    """Collect the associated genes of every disease under *dir*.

    *dir* must contain one sub-directory per disease.  Every file in those
    sub-directories whose name contains *pattern* is passed to
    ``getGeneListFromKGGxlsx`` together with the cut-off looked up in
    ``getFDR()`` under the disease name (the part of the file name before
    the first ``-``).

    Args:
        dir: Root directory holding the per-disease sub-directories.
        outPath: Output file receiving ``DiseaseName<TAB>gene1,gene2,...``.
        outPathGC: Output file receiving ``DiseaseName<TAB>gene count``.
        pattern: Substring a file name must contain to be processed.

    Raises:
        KeyError: if a disease name is missing from the ``getFDR()`` result.
    """
    import os
    import common.util.FileFunction as FF

    fdr = getFDR()

    # Create the whole parent chain; skip when outPath has no directory part
    # (os.mkdir('') would raise).
    out_dir = os.path.dirname(outPath)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    wt = FF.getWriter(outPath, False)
    try:
        wtgc = FF.getWriter(outPathGC, False)
        try:
            FF.gzWrite(wt, "DiseaseName\tAssociatedGenes\n", outPath)
            FF.gzWrite(wtgc, "DiseaseName\tAssociatedGenesCounts\n", outPathGC)

            disease_dirs = os.listdir(dir)
            print(len(disease_dirs))
            for disDir in disease_dirs:
                for file in os.listdir(os.path.join(dir, disDir)):
                    if pattern not in file:
                        continue
                    file_path = os.path.join(dir, disDir, file)
                    print(file_path)
                    disease = file.split("-")[0]
                    genelist = getGeneListFromKGGxlsx(file_path, fdr[disease])
                    FF.gzWrite(
                        wt, disease + "\t" + ",".join(genelist) + "\n",
                        outPath)
                    FF.gzWrite(
                        wtgc, disease + "\t" + str(len(genelist)) + "\n",
                        outPathGC)
        finally:
            # The counts writer used to be left open, so its buffered
            # content could be lost.
            wtgc.close()
    finally:
        wt.close()
    print("\n" + str(len(disease_dirs)) + " files finish!")
def getRank(inPath, outDir):
    """Split a wide table into one two-column file per leading data column.

    The first row of *inPath* is a header.  Only the first
    ``int((len(header) - 1) / 2)`` columns after column 0 get an output
    file; each file is named after its header cell with the last character
    removed, plus ``.txt``, and receives ``row[0]<TAB>row[column]`` for every
    data row.

    Args:
        inPath: Input table, read through ``FF.getArrByPath``.
        outDir: Output directory; created when it does not exist.
    """
    if not os.path.exists(outDir):
        os.mkdir(outDir)

    rows = FF.getArrByPath(inPath)
    header = next(rows)

    n_outputs = int((len(header) - 1) / 2)
    writers = [
        FF.getWriter(os.path.join(outDir, name[0:-1] + ".txt"), False)
        for name in header[1:n_outputs + 1]
    ]

    for row in rows:
        for col, writer in enumerate(writers, start=1):
            FF.gzWrite(writer, row[0] + "\t" + row[col] + "\n", ".txt")

    for writer in writers:
        writer.close()
def singleDiseaseDeal():
    """Write the associated-gene list of the single disease "BD".

    Genes are read from the hard-coded ``noCondGenes.txt`` with
    ``getGeneListFromKGGtxt`` using the hard-coded cut-off ``pcut`` and
    written as one ``BD<TAB>gene1,gene2,...`` line below a header.
    """
    import os
    import common.util.FileFunction as FF

    # Backslashes are written out in full: the previous literals relied on
    # invalid escape sequences such as "\P" and "\T", which Python only
    # tolerates with a warning.  The resulting paths are unchanged.
    # An .xlsx input (BD-ECS-Cond-0.01.xlsx) can be used instead together
    # with getGeneListFromKGGxlsx.
    path = "F:\\Projects\\TEA\\GWAS_tmp\\BD\\noCondGenes.txt"
    pcut = 7.427493748198251E-4
    outPath = "F:\\Projects\\TEA\\GWAS_tmp\\BD\\noCondAssociatedGenes.txt"

    if not os.path.exists(os.path.dirname(outPath)):
        os.mkdir(os.path.dirname(outPath))

    wt = FF.getWriter(outPath, False)
    try:
        FF.gzWrite(wt, "DiseaseName\tAssociatedGenes\n", outPath)
        genes = getGeneListFromKGGtxt(path, pcut)
        FF.gzWrite(wt, "BD\t" + ",".join(genes) + "\n", outPath)
    finally:
        # Close the writer even when reading the gene list fails.
        wt.close()
def getKGGparameter(dir, outPath):
    """List the error rate encoded in each ``ECS-Cond`` result file name.

    For every file below the per-disease sub-directories of *dir* whose name
    contains ``ECS-Cond``, the third ``-``-separated part of the name is
    inspected: when it contains no ``_`` the rate is ``0.05``; otherwise it
    is the text after the first ``_`` with ``.xlsx`` removed.

    Args:
        dir: Root directory holding one sub-directory per disease.
        outPath: Output file receiving ``DiseaseName<TAB>ErrorRate`` lines.
    """
    import os
    import common.util.FileFunction as FF

    wt = FF.getWriter(outPath, False)
    FF.gzWrite(wt, "DiseaseName\tErrorRate\n", outPath)

    for disDir in os.listdir(dir):
        for file in os.listdir(os.path.join(dir, disDir)):
            if "ECS-Cond" not in file:
                continue
            name_parts = file.split("-")
            rate_parts = name_parts[2].split("_")
            if len(rate_parts) == 1:
                error = "0.05"
            else:
                error = rate_parts[1].replace(".xlsx", "")
            FF.gzWrite(wt, name_parts[0] + "\t" + error + "\n", outPath)

    wt.close()
def filterOurResult():
    """Keep only the enrichment rows whose tissue is listed in overlap.txt.

    The first column of ``overlap.txt`` holds the "our" tissue names.  The
    header line of the input table is copied unchanged; a data row is copied
    only when its second tab-separated column is one of those names.

    Only the first column of ``overlap.txt`` is read; the second column was
    previously collected as well but never used.
    """
    overlap = 'F:\\Projects\\TEA\\GWAS_tmp\\analysis\\Rplot\\overlap.txt'
    out = 'F:\\Projects\\TEA\\GWAS_tmp\\analysis\\Rplot\\OurData_v3.tpm_0.01.AddCategories2.txt'
    # Backslashes are written out in full: the previous literal relied on
    # invalid escape sequences ("\P", "\T", "\G", "\R").  Same path.
    in_path = 'F:\\Projects\\TEA\\GWAS_tmp\\analysis\\Rplot\\tea.enrich_v3.tpm_0.01.forRplot.txt'

    # A set gives constant-time membership tests in the filter loop.
    our_tissues = {line.split('\t')[0] for line in FF.getLineByPath(overlap)}

    rows = FF.getLineByPath(in_path)
    wt = FF.getWriter(out, False)
    try:
        wt.write(next(rows) + '\n')  # header line
        for line in rows:
            if line.split('\t')[1] in our_tissues:
                wt.write(line + '\n')
    finally:
        wt.close()
def analysis(inPath, outPath):
    """Summarise each cell of a gene table as ``hit/nonhit`` counts.

    The header row of *inPath* is copied to *outPath*.  In every other row
    the first column is copied, and each further cell, a comma-separated
    list of ``gene:pubmed`` entries, is replaced by
    ``<entries with non-empty pubmed part>/<entries with empty pubmed part>``.
    Every data line ends with a tab before the newline.

    Raises:
        IndexError: if an entry contains no ``:``.
    """
    rows = FF.getArrByPath(inPath)
    wt = FF.getWriter(outPath, False)
    FF.gzWrite(wt, "\t".join(next(rows)) + "\n", outPath)

    for row in rows:
        FF.gzWrite(wt, row[0], outPath)
        for cell in row[1:]:
            # True for every entry that carries something after the colon.
            has_pubmed = [bool(entry.split(":")[1])
                          for entry in cell.split(",")]
            hit = sum(has_pubmed)
            nonhit = len(has_pubmed) - hit
            FF.gzWrite(wt, "\t" + str(hit) + "/" + str(nonhit), outPath)
        FF.gzWrite(wt, "\t\n", outPath)

    wt.close()
def writeLog(self, content):
    """Write *content* to the log file at ``self.logPath`` and print it.

    When ``self.timeDate`` is truthy the message is prefixed with the
    current local time formatted as ``YYYY-mm-dd HH:MM:SS ``.  The message
    is written preceded by a newline.

    NOTE(review): the writer is requested with ``True`` as second argument,
    presumably append mode; confirm against ``FF.getWriter``.
    """
    import time
    import common.util.FileFunction as FF

    wt = FF.getWriter(self.logPath, True)
    # strftime without a time tuple formats the current local time.
    prefix = time.strftime("%Y-%m-%d %H:%M:%S ") if self.timeDate else ""
    content = prefix + content
    wt.write("\n" + content)
    print(content)
    wt.close()
def getGeneListFromKGGtxt(path, pcut):
    """Return the genes whose value in column 1 is below *pcut*.

    The first line of *path* is skipped as a header.  Each later line is
    split on tabs; column 0 is the gene name and column 1 is parsed as a
    float.

    Args:
        path: Tab-separated text file read through ``FF.getLineByPath``.
        pcut: Exclusive upper bound for the column-1 value.

    Returns:
        List of gene names in file order.
    """
    import common.util.FileFunction as FF

    rows = FF.getLineByPath(path)
    next(rows)  # discard the header line
    cells = (row.split("\t") for row in rows)
    return [cell[0] for cell in cells if float(cell[1]) < pcut]
def classTissues():
    """Append a ``Categories`` column to the tissue enrichment table.

    Each data row's tissue (second tab-separated column) is looked up in a
    fixed tissue -> category table.  Rows whose tissue is found are written
    with the category appended; rows with an unknown tissue are left out of
    the output.  The header line gets a ``Categories`` column.
    """
    tissueClass = {}
    # The Brain names are whitespace-separated.  They used to be split on
    # "\t", which left the whole string as one bogus tissue name, so no
    # Brain row ever matched.  split() without arguments handles both
    # spaces and tabs.
    tissueClass['Brain'] = (
        'Brain-Amygdala Brain-Anteriorcingulatecortex(BA24) Brain-Caudate(basalganglia) '
        'Brain-CerebellarHemisphere Brain-Cerebellum Brain-Cortex Brain-FrontalCortex(BA9) '
        'Brain-Hippocampus Brain-Hypothalamus Brain-Nucleusaccumbens(basalganglia) '
        'Brain-Putamen(basalganglia) Brain-Spinalcord(cervicalc-1) '
        'Brain-Substantianigra'
    ).split()
    tissueClass['Adipose'] = 'Adipose-Subcutaneous\tAdipose-Visceral(Omentum)'.split('\t')
    tissueClass['Circulatory'] = (
        'Artery-Aorta\tArtery-Coronary\tArtery-Tibial\t'
        'Heart-AtrialAppendage\tHeart-LeftVentricle'
    ).split('\t')
    tissueClass['Reproductive (Female)'] = (
        'FallopianTube\tCervix-Ectocervix\tCervix-Endocervix\tUterus\t'
        'Vagina\tOvary\tBreast-MammaryTissue'
    ).split('\t')
    tissueClass['Digestive'] = (
        'MinorSalivaryGland\tStomach\tLiver\t'
        'Esophagus-GastroesophagealJunction\tEsophagus-Mucosa\t'
        'Esophagus-Muscularis\tSmallIntestine-TerminalIleum\t'
        'Colon-Sigmoid\tColon-Transverse'
    ).split('\t')
    tissueClass['Endocrine'] = 'AdrenalGland,Thyroid'.split(',')
    tissueClass['Urinary'] = 'Bladder,Kidney-Cortex'.split(',')
    tissueClass['Immune'] = 'Cells-EBV-transformedlymphocytes,Spleen'.split(',')
    tissueClass['Connective'] = ['Cells-Transformedfibroblasts']
    tissueClass['Respiratory'] = ['Lung']
    tissueClass['Muscular'] = ['Muscle-Skeletal']
    tissueClass['Nerve'] = ['Nerve-Tibial']
    tissueClass['Reproductive (Male)'] = 'Prostate,Testis'.split(',')
    tissueClass['Skin'] = 'Skin-NotSunExposed(Suprapubic),Skin-SunExposed(Lowerleg)'.split(',')

    # Reverse lookup; setdefault keeps the first category should a tissue
    # ever be listed twice, matching the previous first-match search.
    category_of = {}
    for cata, tissues in tissueClass.items():
        for tis in tissues:
            category_of.setdefault(tis, cata)

    # Backslashes are written out in full: the previous literals relied on
    # invalid escape sequences ("\P", "\T", "\G", "\R").  Same paths.
    # (NGdata.txt -> NGdataAddCategories2.txt was an earlier input/output
    # pair for this function.)
    orgpath = 'F:\\Projects\\TEA\\GWAS_tmp\\analysis\\Rplot\\tea.enrich_v3.tpm_0.01.txt'
    outpath = 'F:\\Projects\\TEA\\GWAS_tmp\\analysis\\Rplot\\tea.enrich_v3.tpm_0.01.forRplot.txt'

    line_iter = FF.getLineByPath(orgpath)
    wt = FF.getWriter(outpath, False)
    try:
        FF.gzWrite(wt, next(line_iter) + '\tCategories\n')
        for line in line_iter:
            cata = category_of.get(line.split("\t")[1])
            if cata is not None:
                FF.gzWrite(wt, line + '\t' + cata + '\n')
    finally:
        wt.close()
def compareMethods():
    """Rewrite a tissue table using the "our" tissue naming.

    ``overlap.txt`` provides name pairs (column 0: "our" name, column 1:
    "their" name); the number of names read for each side is printed.
    Every tab-separated cell of ``orign.txt`` is then written as:

    * itself, when it is an "our" name;
    * the paired "our" name, when it is a "their" name;
    * an empty cell otherwise.

    Each cell is followed by a tab, each line by a newline.
    """
    orign = 'D:\\Users\\xuechao\\Desktop\\orign.txt'
    dict_path = 'D:\\Users\\xuechao\\Desktop\\overlap.txt'
    out = 'D:\\Users\\xuechao\\Desktop\\out.txt'

    our_names = []
    their_names = []
    for row in FF.getLineByPath(dict_path):
        fields = row.split('\t')
        our_names.append(fields[0])
        their_names.append(fields[1])
    print(len(our_names))
    print(len(their_names))

    wt = FF.getWriter(out, False)
    for row in FF.getLineByPath(orign):
        for tis in row.split('\t'):
            if tis in our_names:
                cell = tis
            elif tis in their_names:
                # Same position in both lists = paired names.
                cell = our_names[their_names.index(tis)]
            else:
                cell = ''
            FF.gzWrite(wt, cell + '\t')
        FF.gzWrite(wt, '\n')
    wt.close()
def mergeGenes(noPath, rankPath, outPath):
    """Compare the gene lists of two result tables disease by disease.

    Both inputs have a header row followed by rows of
    ``disease, comma-separated genes``.  For every pair of rows with the
    same disease name two outputs are produced:

    * *outPath*: ``disease``, genes only in the "no" list, genes only in
      the "rank" list;
    * *outPath* + ``.count``: ``disease``, size of the "no" list, size of
      the "rank" list, and the sizes of the two difference lists.

    Order and duplicates of the original lists are preserved in the
    difference lists.

    Args:
        noPath: Table of the "no" gene lists.
        rankPath: Table of the "rank" gene lists.
        outPath: Output path; the count file is written next to it.
    """
    out_wt = FF.getWriter(outPath, False)
    try:
        out_wtc = FF.getWriter(outPath + ".count", False)
        try:
            # NOTE(review): the count writer is passed outPath (not the
            # ".count" path) as third gzWrite argument, as before; confirm
            # what FF.gzWrite uses that argument for.
            FF.gzWrite(out_wt, "Disease\tNoRankGenes\tRankGenes\n", outPath)
            FF.gzWrite(
                out_wtc,
                "Disease\tAllNoGenes\tAllRankGenes\tNoRankGenes\tRankGenes\n",
                outPath)

            no_iter = FF.getArrByPath(noPath)
            rank_iter = FF.getArrByPath(rankPath)
            next(no_iter)    # skip header
            next(rank_iter)  # skip header
            no_arr = list(no_iter)

            for arrr in rank_iter:
                for arrn in no_arr:
                    if arrr[0] != arrn[0]:
                        continue
                    gener = arrr[1].split(",")
                    genen = arrn[1].split(",")
                    rank_set = set(gener)
                    no_set = set(genen)
                    only_no = [g for g in genen if g not in rank_set]
                    only_rank = [g for g in gener if g not in no_set]

                    FF.gzWrite(
                        out_wt,
                        arrr[0] + "\t" + ",".join(only_no) + "\t" +
                        ",".join(only_rank) + "\n",
                        outPath)
                    FF.gzWrite(
                        out_wtc,
                        arrr[0] + "\t" + str(len(genen)) + "\t" +
                        str(len(gener)) + "\t" + str(len(only_no)) + "\t" +
                        str(len(only_rank)) + "\n",
                        outPath)
        finally:
            # The count writer used to be left open, so its buffered
            # content could be lost.
            out_wtc.close()
    finally:
        out_wt.close()
def getFinalResultXlsx(dir, noDir, removeNoPebmed=False):
    """Build an .xls report with one sheet per row of ``compareNCBI.txt``.

    ``compareNCBI.txt`` (inside *dir*) has a header row; every other row
    starts with a name used as sheet title and as lookup key for two KGG
    workbooks, followed by cells holding comma-separated ``gene:pubmed``
    entries.  For each cell a merged title row is written, then one row per
    gene with its p-values from both workbooks (P1 from the ``noDir``
    workbook, P2 from the ``TSBGene/DiseaseBased`` workbook) and its PubMed
    part.  The sheet ends with hit/non-hit counts, two one-sided Fisher's
    exact test results and an explanatory note.

    The statistics use ``fisher[0:4]``, i.e. the first two data cells of the
    row; a row with a single data cell raises IndexError.

    Args:
        dir: Directory containing ``compareNCBI.txt`` and
            ``TSBGene/DiseaseBased``; the report is saved here as
            ``compareNCBI.stat.rmNoNCBI.print.xls``.
        noDir: Directory searched (via ``getKGGXlsPath``) for the
            ``ECS-rmHLA-Cond`` workbook.
        removeNoPebmed: When true, genes with an empty PubMed part are not
            listed (they still count towards the statistics).
    """
    inPath = os.path.join(dir, "compareNCBI.txt")
    outPath = os.path.join(dir, "compareNCBI.stat.rmNoNCBI.print.xls")
    # style1: left/centre aligned, wrapped, gray background (titles, stats).
    # style2: same alignment without background (regular cells).
    # style3 is created but not used below.
    style1 = XFStyle()
    style2 = XFStyle()
    style3 = XFStyle()
    pattern = Pattern()
    pattern.pattern = Pattern.SOLID_PATTERN
    pattern.pattern_fore_colour = Style.colour_map['gray25']
    algn1 = Alignment()
    algn1.wrap = algn1.WRAP_AT_RIGHT
    algn1.horz = algn1.HORZ_LEFT
    algn1.vert = algn1.VERT_CENTER
    style2.alignment = algn1
    style1.alignment = algn1
    style1.pattern = pattern
    import xlwt
    wb = xlwt.Workbook()
    ncbi = FF.getArrByPath(inPath)
    head = ncbi.__next__()
    for arr in ncbi:
        rank_dir = os.path.join(dir, "TSBGene", "DiseaseBased")
        # A hard-coded F:\Projects\TEA\GWAS_10_diseases directory was used
        # here at some point instead.
        wb_rank = xlrd.open_workbook(
            getKGGXlsPath(rank_dir, arr[0], "ECS-rank-Cond"))
        wb_no = xlrd.open_workbook(
            getKGGXlsPath(noDir, arr[0], "ECS-rmHLA-Cond"))
        # fisher collects, per data cell: [entry count, hit count, ...].
        fisher = []
        # NOTE(review): the second argument is presumably xlwt's
        # cell_overwrite_ok flag; confirm.
        ws = wb.add_sheet(arr[0], True)
        ws.col(0).width = 256 * 15
        ws.col(1).width = 256 * 15
        ws.col(2).width = 256 * 15
        ws.col(3).width = 256 * 100
        # rn is the next free row index of the current sheet.
        rn = 0
        ws.write(rn, 0, "Gene", style=style2)
        ws.write(rn, 1, "P1", style=style2)
        ws.write(rn, 2, "P2", style=style2)
        ws.write(rn, 3, "PubMedID", style=style2)
        rn += 1
        for i in range(len(arr) - 1):
            # Merged title row spanning all four columns.
            ws.write_merge(rn, rn, 0, 3, getRightColName(head[i + 1]),
                           style=style1)
            rn += 1
            pubmeds = arr[i + 1].split(",")
            fisher.append(len(pubmeds))
            hit = 0
            for pm in pubmeds:
                gpm = pm.split(":")
                if gpm[1]:
                    # Gene with a non-empty PubMed part: counts as a hit.
                    hit += 1
                    ws.write(rn, 0, gpm[0], style=style2)
                    ws.write(rn, 3, gpm[1], style=style2)
                    ws.write(rn, 1, getGenePvalueFromKGGXls(gpm[0], wb_no),
                             style2)
                    ws.write(rn, 2, getGenePvalueFromKGGXls(gpm[0], wb_rank),
                             style2)
                    rn += 1
                else:
                    # Gene without PubMed part: listed only when requested.
                    if not removeNoPebmed:
                        ws.write(rn, 0, gpm[0], style=style2)
                        ws.write(rn, 1,
                                 getGenePvalueFromKGGXls(gpm[0], wb_no),
                                 style2)
                        ws.write(rn, 2,
                                 getGenePvalueFromKGGXls(gpm[0], wb_rank),
                                 style2)
                        rn += 1
            fisher.append(hit)
        # Turn the entry counts into non-hit counts, giving the 2x2 table
        # [[non-hit 1, hit 1], [non-hit 2, hit 2]].
        fisher[0] = fisher[0] - fisher[1]
        fisher[2] = fisher[2] - fisher[3]
        # Element [1] of the result is reported as the test's P value
        # (stats is presumably scipy.stats; confirm).
        no_bigger_rank = stats.fisher_exact([fisher[0:2], fisher[2:4]],
                                            alternative="less")[1]
        no_smaller_rank = stats.fisher_exact([fisher[0:2], fisher[2:4]],
                                             alternative="greater")[1]
        ws.write_merge(
            rn, rn + 1, 0, 3,
            "STATISTIC (hit counts/non-hit counts): By p-value ranking:" +
            str(fisher[1]) + "/" + str(fisher[0]) +
            "; By selective expression ranking:" + str(fisher[3]) + "/" +
            str(fisher[2]) + "\n" +
            "Fisher's exact test: P(H1=p-value>selective expression)=" +
            str(no_bigger_rank) +
            "; P(H1=selective expression>p-value)=" + str(no_smaller_rank),
            style=style1)
        rn += 2
        ws.write_merge(
            rn, rn + 2, 0, 3,
            "Note: P1: This is a conditional gene-based association p-value according "
            "to statistical significance order. \nP2: This is a conditional gene-based "
            "association p-value according to tissue-specific pathogenic potential. "
            "\nThe papers co-mentioning the gene and diseases/traits in the titles or "
            "abstracts in PubMed database were searched by the API function.",
            style=style2)
    wb.save(outPath)