def GetTopoAlignStat(topo1, topo2):
    """
    Collect, for every TM segment of topo1, statistics on what topo2 holds
    at the same columns.

    The segments are the (begin, end) ranges returned by
    myfunc.GetTMPosition(topo1). Within each range only the columns where
    topo1 is 'M' are considered; the topo2 characters at those columns are
    classified as 'M' (TM), '-' (gap) or anything else (seq).

    Return a list with one dictionary per segment:
        'seg2'    : the collected topo2 characters joined into a string
        'freqTM'  : myfunc.FloatDivision(number of 'M', segment size)
        'freqGap' : myfunc.FloatDivision(number of '-', segment size)
        'freqSeq' : myfunc.FloatDivision(number of others, segment size)
    """
    segmentStatList = []
    for (begin, end) in myfunc.GetTMPosition(topo1):
        # topo2 characters aligned to the 'M' columns of this segment
        alignedChars = [
            topo2[pos] for pos in xrange(begin, end) if topo1[pos] == 'M'
        ]
        numAligned = len(alignedChars)
        numTM = alignedChars.count('M')
        numGap = alignedChars.count('-')
        numSeq = numAligned - numTM - numGap
        segmentStatList.append({
            'seg2': ''.join(alignedChars),
            'freqTM': myfunc.FloatDivision(numTM, numAligned),
            'freqGap': myfunc.FloatDivision(numGap, numAligned),
            'freqSeq': myfunc.FloatDivision(numSeq, numAligned),
        })
    return segmentStatList
def WriteNumTMHeatMap(data, maxNumTM, count, mode, outfile):  #{{{
    """
    Write the upper-left (maxNumTM+1) x (maxNumTM+1) block of data to
    outfile as a table of scaled values, one matrix row per line.

    Each cell is written as
        " %6.3g" % (myfunc.FloatDivision(data[i][j], count) * scale * 100)
    where scale depends on mode:
        "norm_diag": FloatDivision(count, sum of the diagonal cells)
                     (normalized so that the sum of diagonal = 100)
        "norm_row" : FloatDivision(count, sum(data[i])), i.e. the whole row
        "norm_col" : FloatDivision(count, sum of column j within the block)

    Input:
        data      2D indexable table of numbers
        maxNumTM  largest row/column index to output
        count     total count used in the per-cell division
        mode      one of "norm_diag", "norm_row", "norm_col"
        outfile   path of the file to write
    Return 0 on success, 1 if an IOError occurs (a message is printed to
    stderr).
    Raise ValueError for an unknown mode (previously this surfaced as an
    unbound-variable error in the middle of writing).
    """
    if mode not in ("norm_diag", "norm_row", "norm_col"):
        raise ValueError("Unknown normalization mode: %r" % (mode,))

    size = maxNumTM + 1

    # Scales that do not depend on the row are computed once, up front.
    scale_norm_diag = None
    scale_norm_col_list = []
    if mode == "norm_diag":
        diag = [data[i][i] for i in xrange(size)]
        scale_norm_diag = myfunc.FloatDivision(count, sum(diag))
    elif mode == "norm_col":
        for j in xrange(size):
            li = [data[i][j] for i in xrange(size)]
            scale_norm_col_list.append(myfunc.FloatDivision(count, sum(li)))

    try:
        # "with" guarantees the handle is closed even when a write or an
        # index error interrupts the loop.
        with open(outfile, "w") as fpout:
            for i in xrange(size):
                if mode == "norm_diag":
                    scale = scale_norm_diag
                elif mode == "norm_row":
                    scale = myfunc.FloatDivision(count, sum(data[i]))
                cellList = []
                for j in xrange(size):
                    if mode == "norm_col":
                        scale = scale_norm_col_list[j]
                    cellList.append(
                        " %6.3g" % (myfunc.FloatDivision(data[i][j], count) *
                                    scale * 100))
                fpout.write("".join(cellList) + "\n")
    except IOError:
        print >> sys.stderr, "Failed to write to file %s" % outfile
        return 1
    return 0
def WriteTable2D(freq, subsum, classList, seqIDTGroupList, outfile):  #{{{
    """
    Write a 2D frequency table (sequence-identity group x class) as
    percentages to outfile.
    Works with dataCmpClass and dataNCtermInter.

    Input:
        freq             freq[i][j] is the count of class j in group i
        subsum           subsum[i] is the denominator used for group i
        classList        column names, one per class
        seqIDTGroupList  flat list of group boundaries; group i spans
                         seqIDTGroupList[2*i] .. seqIDTGroupList[2*i+1]
        outfile          path of the file to write
    Output layout: a header line, one line per group with the per-class
    percentage, the largest percentage and subsum[i], then a "#sum" line
    with the same columns over all groups.
    Return 0 on success, 1 if the file cannot be opened.
    """
    try:
        fpout = open(outfile, "wb")
    except IOError:
        print >> sys.stderr, "Failed to write to file %s" % outfile
        return 1

    # Floor division: the result is used as a loop bound and must be an int.
    numGroup = len(seqIDTGroupList) // 2
    numClass = len(classList)

    try:
        # header line
        fpout.write("%4s %7s" % ("#Idx", "SeqIDT"))
        for cls in classList:
            fpout.write(" %9s" % (cls))
        fpout.write(" %9s" % "Maximum")
        fpout.write(" %10s" % "Occurrence")
        fpout.write("\n")

        # one line per sequence-identity group
        for i in xrange(numGroup):
            stridtrange = "%g-%g" % (seqIDTGroupList[i * 2],
                                     seqIDTGroupList[i * 2 + 1])
            percentList = [
                myfunc.FloatDivision(freq[i][j], subsum[i]) * 100
                for j in xrange(numClass)
            ]
            fpout.write("%-4d %7s" % (i, stridtrange))
            for percent in percentList:
                fpout.write(" %9.3f" % percent)
            # max() of an empty list raises; report 0 when there is no class
            fpout.write(" %9.3f" % (max(percentList) if percentList else 0.0))
            fpout.write(" %10d" % subsum[i])
            fpout.write("\n")

        # summary line over all groups
        fpout.write("%-12s" % ("#sum"))
        totalSum = sum(subsum)
        totalOccur = [0] * numClass
        for j in xrange(numClass):
            for i in xrange(numGroup):
                totalOccur[j] += freq[i][j]
        totalPercentList = [
            myfunc.FloatDivision(totalOccur[j], totalSum) * 100
            for j in xrange(numClass)
        ]
        for percent in totalPercentList:
            fpout.write(" %9.3f" % percent)
        fpout.write(" %9.3f" %
                    (max(totalPercentList) if totalPercentList else 0.0))
        fpout.write(" %10d" % totalSum)
        fpout.write("\n")
    finally:
        # close the handle even if a bad table raises midway
        fpout.close()
    return 0
def GetAlignmentFactorFromPairAlignment(seq1,seq2, isLocalAlignment):#{{{
    """
    Return alignment factor as a dictionary

    seq1 and seq2 are the two rows of a pairwise alignment ('-' marks a
    gap); the number of columns is taken from len(seq1).

    When isLocalAlignment is exactly True, columns in which either
    character is a lower-case letter are counted as unaligned and excluded
    from all other counters. Otherwise every column is used.

    Keys of the returned dictionary:
        numIDT      columns where both characters are equal
        numGap      non-identical columns where either character is '-'
        alnLength   number of columns taken into account
        seqidt0     FloatDivision(numIDT, alnLength) * 100
        seqidt1     FloatDivision(numIDT, min(seqLength1, seqLength2)) * 100
        seqidt2     FloatDivision(numIDT, alnLength - numGap) * 100
        seqLength1  non-gap characters of seq1 that were taken into account
        seqLength2  non-gap characters of seq2 that were taken into account
        numUnaligned  (local mode only) number of excluded columns
    """
    numColumn = len(seq1)

    if isLocalAlignment is True:
        numAligned = 0
        numIdentical = 0
        numGap = 0
        numRes1 = 0
        numRes2 = 0
        numUnaligned = 0
        for pos in range(numColumn):
            ch1 = seq1[pos]
            if ch1.isalpha() and ch1.islower():
                numUnaligned += 1
                continue
            # seq2 is only looked at when seq1 did not already mark the
            # column as unaligned
            ch2 = seq2[pos]
            if ch2.isalpha() and ch2.islower():
                numUnaligned += 1
                continue
            numAligned += 1
            if ch1 == ch2:
                numIdentical += 1
            elif ch1 == "-" or ch2 == "-":
                numGap += 1
            if ch1 != "-":
                numRes1 += 1
            if ch2 != "-":
                numRes2 += 1
        seqidt0 = myfunc.FloatDivision(numIdentical, numAligned) * 100
        seqidt1 = myfunc.FloatDivision(numIdentical,
                                       min(numRes1, numRes2)) * 100
        seqidt2 = myfunc.FloatDivision(numIdentical,
                                       numAligned - numGap) * 100
        return {
            'numIDT': numIdentical,
            'numGap': numGap,
            'alnLength': numAligned,
            'seqidt0': seqidt0,
            'seqidt1': seqidt1,
            'seqidt2': seqidt2,
            'seqLength1': numRes1,
            'seqLength2': numRes2,
            'numUnaligned': numUnaligned,
        }

    # global alignment: every column counts
    numRes1 = len(seq1.replace("-", ""))
    numRes2 = len(seq2.replace("-", ""))
    numIdentical = 0
    numGap = 0
    for pos in range(numColumn):
        ch1 = seq1[pos]
        ch2 = seq2[pos]
        if ch1 == ch2:
            numIdentical += 1
        elif ch1 == "-" or ch2 == "-":
            numGap += 1
    seqidt0 = myfunc.FloatDivision(numIdentical, numColumn) * 100
    seqidt1 = myfunc.FloatDivision(numIdentical, min(numRes1, numRes2)) * 100
    seqidt2 = myfunc.FloatDivision(numIdentical, numColumn - numGap) * 100
    return {
        'numIDT': numIdentical,
        'numGap': numGap,
        'alnLength': numColumn,
        'seqidt0': seqidt0,
        'seqidt1': seqidt1,
        'seqidt2': seqidt2,
        'seqLength1': numRes1,
        'seqLength2': numRes2,
    }
def WriteSpecialPair(dataTable, all_pairInfoList,  #{{{
                     seqid2pfamidDict, seqid2clanidDict, tm_pfamidSet,
                     tm_clanidSet, pfamidDefDict, clanidDefDict,
                     SPE_PAIR_LIST, outfile):
    """
    For every entry of SPE_PAIR_LIST, write the Pfam and Clan family
    frequencies obtained from AnaFamFrequency(dataTable['pairInfoLists'],
    ...) to outfile (opened through myfunc.myopen with sys.stdout as the
    fallback handle).

    Each section consists of a summary line (pair, "PfamID"/"ClanID", count,
    count scaled by g_params['scale_count'], total number of pairs and the
    percentage of the total) followed by one line per family listing its
    ID, definition, count, scaled count, percentage within the section and
    the "(id1 id2)" members. A "#====" line closes each pair.

    Always returns 0.
    """
    out = myfunc.myopen(outfile, sys.stdout, "w", False)
    totalPairCount = len(all_pairInfoList)
    (pfamFreqLists, clanFreqLists) = AnaFamFrequency(
        dataTable['pairInfoLists'], seqid2pfamidDict, seqid2clanidDict,
        tm_pfamidSet, tm_clanidSet)

    def WriteSection(pair, label, entryList, defDict, sectionCount):
        # Write the summary line and the per-family lines of one section.
        out.write("\n")
        summary = "%5d %5.1f %8d %6.2f" % (
            sectionCount, sectionCount * g_params['scale_count'],
            totalPairCount,
            myfunc.FloatDivision(sectionCount, totalPairCount) * 100)
        out.write(" ".join((str(pair), label, summary)) + "\n")
        out.write("\n")
        for entry in entryList:
            famid = entry[0]
            famCount = entry[1][0]
            try:
                famdef = defDict[famid]
            except KeyError:
                famdef = ""
            out.write("%-8s %20s %5d %5.1f %6.2f " %
                      (famid, famdef, famCount,
                       famCount * g_params['scale_count'],
                       float(famCount) / sectionCount * 100))
            for member in entry[1][1]:
                out.write("(%s %s) " % (member[0], member[1]))
            out.write("\n")

    for idx in xrange(len(SPE_PAIR_LIST)):
        # both totals are needed before anything is written for this pair
        pfamCount = sum(entry[1][0] for entry in pfamFreqLists[idx])
        clanCount = sum(entry[1][0] for entry in clanFreqLists[idx])
        pair = SPE_PAIR_LIST[idx]
        WriteSection(pair, "PfamID", pfamFreqLists[idx], pfamidDefDict,
                     pfamCount)
        WriteSection(pair, "ClanID", clanFreqLists[idx], clanidDefDict,
                     clanCount)
        out.write("#=====================================================\n")

    myfunc.myclose(out)
    return 0
def Benchmark(real_topodict, idSet_single, idSet_multi, TM_type, fpout,
              fpout_wrong, seqDict):  #{{{
    """
    Compare predicted topologies against real_topodict for a series of
    prediction groups and write one summary table to fpout.

    The groups are "40".."44" plus "All" when g_params['mode'] is "tps",
    and "50".."55" plus "All" when it is "tp". For each group the predicted
    topology file is located under g_params['path_predtopo'] (file name
    built from g_params['basename'], the group and, when g_params['isRMSP']
    is set, the ".RMSP" suffix) and read with myfunc.ReadFasta. When
    g_params['isRestrictIDList'] is set, only IDs in
    g_params['restrictIDset'] are kept.

    Per group, CountIdenticalTopology() supplies (numIDTtopo, numINVtopo);
    the row holds these counts, the number of predictions, their
    percentages of the number of predictions, and the number of real
    topologies.

    idSet_single and idSet_multi are only referenced by a commented-out
    filter below. Nothing is returned.
    """
    # NOTE(review): itemlist stays unbound when g_params['mode'] is neither
    # "tps" nor "tp" -- confirm callers never pass another mode.
    if g_params['mode'] == "tps":
        itemlist = ["40", "41", "42", "43", "44", "All"]
    elif g_params['mode'] == "tp":
        itemlist = ["50", "51", "52", "53", "54", "55", "All"]
    isRestrictIDList = g_params['isRestrictIDList']
    addname = ""
    if g_params['isRMSP']:
        addname = ".RMSP"
    numRealTopo = len(real_topodict)
    if isRestrictIDList:
        # count only the real topologies that are in the restricted ID set
        numRealTopo = len(g_params['restrictIDset']
                          & set(real_topodict.keys()))
    pred_topofile_list = []  # never filled or read in this function
    pred_topodict_list = []  # one {seqid: topology} dict per item of itemlist
    # Step 1, read in predicted topology
    for item in itemlist:
        pred_topofile = ""
        if item.upper() == "ALL":
            if g_params['mode'] == "tps":
                pred_topofile = "%s/%s.topcons-single_topcons_single%s.topo" % (
                    g_params['path_predtopo'], g_params['basename'], addname)
            elif g_params['mode'] == "tp":
                pred_topofile = "%s/%s.topcons.result_TOPCONS%s.topo" % (
                    g_params['path_predtopo'], g_params['basename'], addname)
        else:
            if g_params['mode'] == "tps":
                pred_topofile = "%s/%s_topcons_single.m1.agree-%s%s.topo" % (
                    g_params['path_predtopo'], g_params['basename'], item,
                    addname)
            elif g_params['mode'] == "tp":
                pred_topofile = "%s/%s.topcons.result_TOPCONS.m1.agree-%s%s.topo" % (
                    g_params['path_predtopo'], g_params['basename'], item,
                    addname)
        (pred_idlist, pred_annolist,
         pred_topolist) = myfunc.ReadFasta(pred_topofile)
        # A failed/empty read is only reported; the group then gets an
        # empty dictionary and still appears in the table.
        if len(pred_idlist) <= 0:
            print >> sys.stderr, "Failed to read pred_topofile %s" % (
                pred_topofile)
        pred_topodict = {}
        for i in xrange(len(pred_idlist)):
            if ((not isRestrictIDList)
                    or pred_idlist[i] in g_params['restrictIDset']):
                #if (TM_type == "All_Alpha" or (TM_type == "Single" and pred_idlist[i] in idSet_single) or (TM_type == "Multi" and pred_idlist[i] in idSet_multi)):
                pred_topodict[pred_idlist[i]] = pred_topolist[i]
        pred_topodict_list.append(pred_topodict)
    # Step 2, calculate precision of the prediction
    #header line
    fpout.write("#%s\n" % (TM_type))
    fpout.write("#%2s %7s %8s %8s %8s %8s %8s %8s %8s\n" %
                ("No", "Group", "nIDT", "nINV", "nPred", "PPV(%)", "NPV_INV",
                 "NPV_Other", "nAllReal"))
    for i in xrange(len(itemlist)):
        item = itemlist[i]
        pred_topodict = pred_topodict_list[i]
        numPredTopo = len(pred_topodict)
        (numIDTtopo,
         numINVtopo) = CountIdenticalTopology(pred_topodict, real_topodict,
                                              item, TM_type, fpout_wrong,
                                              seqDict, item)
        # The three percentages are all relative to the number of
        # predictions: identical, inverted, and the remainder.
        ss = "%-3d %7s %8d %8d %8d %8.1f %8.1f %8.1f %8d" % (
            i, item, numIDTtopo, numINVtopo, numPredTopo,
            myfunc.FloatDivision(numIDTtopo, numPredTopo) * 100.0,
            myfunc.FloatDivision(numINVtopo, numPredTopo) * 100.0,
            myfunc.FloatDivision(numPredTopo - numIDTtopo - numINVtopo,
                                 numPredTopo) * 100.0, numRealTopo)
        fpout.write("%s\n" % (ss))
    # blank line terminates the table
    fpout.write("\n")
for iw in xrange(winsize): IncrementSumTableWithDiffTopo(i + iw, outList, sumWindowWithDiffTopoList) isFirstWindow = False else: # if not first window, minus the previous one, and plus the next one #iw_previous = i-1 #iw_next = i+winsize-1 DecrementSumTableWithDiffTopo(i - 1, outList, sumWindowWithDiffTopoList) IncrementSumTableWithDiffTopo(i + winsize - 1, outList, sumWindowWithDiffTopoList) fracList = [] for j in xrange(len(sumWindowWithDiffTopoList)): fracList.append( myfunc.FloatDivision(sumWindowWithDiffTopoList[j], winsize)) freqTopNList.append([i + 1, outList[i + winsize / 2][pivIdx]] + fracList) outfile1 = outfile + ".difffam_win%d.sortby_%s.mindiffpair_%d.txt" % ( winsize, itemList[pivIdx], mindiffpair) outfileList.append(outfile1) fpout = myfunc.myopen(outfile1, sys.stdout, "w", False) ss_sort_item = "min_%s" % (itemList[pivIdx]) fpout.write("#%-7s %*s %7s" % ("idxWin", len(ss_sort_item), ss_sort_item, "DIFF")) for ss in cmpclassList[1:]: fpout.write(" %7s" % (ss)) fpout.write("\n") for i in xrange(len(freqTopNList)): d = freqTopNList[
def WriteFamPairCount(freqList, pairInfoList, famDefDict,  #{{{
                      cmpclassList, pairwise_comparison_method, isCmpDup,
                      outfile):
    """
    Write the number of pairs for each protein family as well as the
    frequency of topology variations in different classes for each family

    Input:
        freqList: [(pfamid, [numpair,numseq, numseq_TMpro, (id1,id2,cmpclass), ()...]), ...]
        pairInfoList: a list of tuples, [(id1,id2,cmpclass)]
        famDefDict: family ID -> definition; missing IDs get ""
        cmpclassList: class names, one output column each; index 0 is
            excluded when counting pairs with different topology
        pairwise_comparison_method: not used in this function
        isCmpDup: when true, classes starting with "TM2GAP|DUP" are
            counted as "DUP"; otherwise only the part before the first
            '|' is used
        outfile: path of the main table; three additional files named
            outfile + ".topNdifffam.sortby_<item>.mindiffpair_<n>.txt" are
            written and then passed to plotMaxFracFamilyWithTopoVariation.sh

    Nothing is returned. A failure of the plotting script is reported on
    stderr and does not raise.
    """
    numpair_total = len(pairInfoList)
    fpout = myfunc.myopen(outfile, sys.stdout, "w", False)
    # write header line
    fpout.write("#%-7s %18s %7s %7s %9s %6s %8s" %
                ("PfamID", "PfamDef", "NumPair", "NumSeq", "NumSeq_TM", "%#",
                 "NumTotal"))
    for ss in cmpclassList:
        fpout.write(" %6s" % (ss))
    fpout.write("\n")

    # list of [(numpair, numseq, numseq_TMpro, [#IDT, #INV, #...]), ([])]
    CountListAll = []
    for tup in freqList:
        famid = tup[0]
        #(pfamid, [numpair, numseq, numseq_TMpro, (id1,id2,cmpclass),(id1,id2,cmpclass)])
        pairInfoList_fam = tup[1]
        numpair = tup[1][0]
        numseq = tup[1][1]
        numseq_TMpro = tup[1][2]
        try:
            famdef = famDefDict[famid]
        except KeyError:
            famdef = ""
        cmpclasslist_fam = []  # a list of [cmpclass, cmpclass, ...]
        for tt in pairInfoList_fam[3:]:
            cmpclass = tt[2]
            if isCmpDup and cmpclass.startswith("TM2GAP|DUP"):
                cmpclass = "DUP"
            else:
                cmpclass = cmpclass.split('|')[0]
            cmpclasslist_fam.append(cmpclass)
        cntList = [cmpclasslist_fam.count(cls) for cls in cmpclassList]
        # Guard the percentage: pairInfoList may be empty while freqList
        # is not, which used to raise ZeroDivisionError.
        if numpair_total > 0:
            percent_pair = float(numpair) / numpair_total * 100.0
        else:
            percent_pair = 0.0
        fpout.write("%-8s %18s %7d %7d %9d %6.2f %8d" %
                    (famid, famdef, numpair, numseq, numseq_TMpro,
                     percent_pair, numpair_total))
        for cnt in cntList:
            fpout.write(" %6d" % cnt)
        fpout.write("\n")
        CountListAll.append((numpair, numseq, numseq_TMpro, cntList))
    myfunc.myclose(fpout)
    sys.stdout.write("file %s output\n" % (outfile))

    # output the fraction of topN largest families that have topology variations
    # sorted in descending order by "numpair", "numseq", numseq_TMpro #{{{
    mindiffpair = g_params['mindiffpair']
    itemList = ["numpair", "numseq", "numseq_TMpro"]
    outfileList = []
    for pivIdx in xrange(len(itemList)):  #[0,1,2]
        outList = sorted(CountListAll, key=lambda x: x[pivIdx], reverse=True)
        freqTopNList = []  # [ [topN, min, frac_DIFF, frac_INV, frac_TM2GAP]]
        # Running counters over the top-(i+1) families: slot 0 counts
        # families with at least mindiffpair differing pairs in total,
        # slot j >= 1 counts families with at least mindiffpair pairs of
        # class j.
        sumTopNWithDiffTopoList = [0] * (len(cmpclassList))
        for i in xrange(len(outList)):
            cntDiffPair = 0
            cntList = outList[i][3]
            for j in xrange(1, len(cntList)):
                cntDiffPair += cntList[j]
                if cntList[j] >= mindiffpair:
                    sumTopNWithDiffTopoList[j] += 1
            if cntDiffPair >= mindiffpair:
                sumTopNWithDiffTopoList[0] += 1
            fracList = [
                myfunc.FloatDivision(cntWithDiff, i + 1)
                for cntWithDiff in sumTopNWithDiffTopoList
            ]
            freqTopNList.append([i + 1, outList[i][pivIdx]] + fracList)

        outfile1 = outfile + ".topNdifffam.sortby_%s.mindiffpair_%d.txt" % (
            itemList[pivIdx], mindiffpair)
        outfileList.append(outfile1)
        fpout = myfunc.myopen(outfile1, sys.stdout, "w", False)
        ss_sort_item = "min_%s" % (itemList[pivIdx])
        fpout.write("#%-7s %*s %7s" %
                    ("topN", len(ss_sort_item), ss_sort_item, "DIFF"))
        for ss in cmpclassList[1:]:
            fpout.write(" %7s" % (ss))
        fpout.write("\n")
        for d in freqTopNList:
            # d = [topN, min, frac_DIFF, frac_INV, frac_TM2GAP]
            fpout.write("%-8d %*d" % (d[0], len(ss_sort_item), d[1]))
            for tt in d[2:]:
                fpout.write(" %7.2f" % (tt * 100))
            fpout.write("\n")
        myfunc.myclose(fpout)
        sys.stdout.write("file %s output\n" % (outfile1))

    # make plot
    cmd = ["%s/plotMaxFracFamilyWithTopoVariation.sh" % (binpath)
           ] + outfileList
    try:
        subprocess.check_output(cmd)
    except (subprocess.CalledProcessError, OSError) as e:
        # OSError covers a missing or non-executable script; the tables
        # are already written, so plotting problems are only reported.
        sys.stderr.write("%s\n" % (e))
# check whether it is alternating liStatus = [] th = 0.75 # only when 75% of the nterm status is either i or o are recognized minCount = 10 # i : 1 # o : -1 # non-determined: 0 for i in range(1, maxNumTM+1): try: (n1, n2) = (countDict[i][0], countDict[i][1]) except (KeyError, IndexError): (n1, n2) = (0, 0) if myfunc.FloatDivision(n1, n1+n2) >= th and n1 >= minCount: liStatus.append(1) elif myfunc.FloatDivision(n2, n1+n2) >= th and n2 >= minCount: liStatus.append(-1) else: liStatus.append(0) numAltSerie = 3 isAlternate = IsAlternate(liStatus, numAltSerie) #write the result fpout.write("#isAlternate: %d\n"%(isAlternate)) fpout.write("%6s %8s %8s\n"%("#numTM", "i(Nterm)", "o(Nterm)")) for i in range(1, maxNumTM+1): try:
cmpclassList = ["IDT"] # get topN diff freqTopNList = [] # [ [topN, min, frac_DIFF]] sumTopNWithDiffTopoList = [0] * len(cmpclassList) for i in xrange(len(anaList)): ana = anaList[i] isHaveDiffTopo = False numSeqCls1 = ana['cluster'][0][1] if len(ana['cluster']) >= 2: numSeqCls2 = ana['cluster'][1][1] else: numSeqCls2 = 0 fracCls2 = myfunc.FloatDivision(numSeqCls2, ana['numseq']) if (numSeqCls2 >= threshold_NumSeq_Group_2 and fracCls2 >= threshold_Fraction_Group_2): isHaveDiffTopo = True if isHaveDiffTopo: sumTopNWithDiffTopoList[0] += 1 fracList = [] for j in xrange(len(sumTopNWithDiffTopoList)): fracList.append( myfunc.FloatDivision(sumTopNWithDiffTopoList[j], i + 1)) freqTopNList.append([i + 1, ana['numseq']] + fracList) # output topN statistics outfileList = [] item = "numseq_TMpro"
def main(g_params):  #{{{
    """
    Count GO ancestor terms for the sequence IDs listed in the input file
    and write a frequency table.

    Command line (read from sys.argv): one positional input file plus
        -o/--o/-outfile FILE   output file (default "", passed to
                               myfunc.myopen together with sys.stdout)
        -outpath DIR           parsed but not used below
        -gomap FILE            GO map file
        -goterm FILE           GO term file
        -q                     sets g_params['isQuiet']
        -h/--help              print help

    For every sequence ID, each gomap[gotype] record whose ancestor list
    has more than two levels, with "all" in level 0 and "GO:0003674" in
    level 1, contributes its level-`anclevel` ancestor IDs; only IDs in
    GOLevelOneSet are counted. One line per GO ID is written (sorted by GO
    ID, descending): GO ID, term, count and
    myfunc.FloatDivision(count, total count).

    Return 1 on a usage error or a failed file check; otherwise the
    function falls off the end (returns None).
    """
    argv = sys.argv
    numArgv = len(argv)
    if numArgv < 2:
        PrintHelp()
        return 1

    outpath = "./"
    outfile = ""
    infile = ""
    # NOTE(review): machine-specific default paths; override with
    # -gomap / -goterm.
    gomapfile = "/data3/wk/MPTopo/pfamAna_refpro/pfammap_from_uniprot/Pfam-A-full.seqfrompfamfasta.percentTMpro_scampi.perTM75_nseq20.Family.nr100.filter.fragmented.uniq.pfam.goinfowithancestor.txt"
    gotermfile = "/data3/wk/MPTopo/pfamAna_refpro/GO_analysis/GO_term.txt"
    anclevel = 2
    gotype = "function"

    i = 1
    isNonOptionArg = False
    while i < numArgv:
        if isNonOptionArg == True:
            # the single argument following "--" is taken as the input
            # file, after which normal option parsing resumes
            infile = argv[i]
            isNonOptionArg = False
            i += 1
        elif argv[i] == "--":
            isNonOptionArg = True
            i += 1
        elif argv[i][0] == "-":
            if argv[i] in ["-h", "--help"]:
                PrintHelp()
                return 1
            elif argv[i] in ["-o", "--o", "-outfile"]:
                # my_getopt_str returns the option value and the next index
                (outfile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-outpath", "--outpath"]:
                (outpath, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-gomap", "--gomap"]:
                (gomapfile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-goterm", "--goterm"]:
                (gotermfile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-q", "--q"]:
                g_params['isQuiet'] = True
                i += 1
            else:
                print >> sys.stderr, "Error! Wrong argument:", argv[i]
                return 1
        else:
            infile = argv[i]
            i += 1

    # print len(gomapfile), gomapfile
    # lines = open(gomapfile, "r").readlines()
    # print lines
    if myfunc.checkfile(infile) != 0:
        return 1
    if myfunc.checkfile(gomapfile, "GO map file") != 0:
        return 1
    if myfunc.checkfile(gotermfile, "GO Term file") != 0:
        return 1

    goMapDict = ReadGOMap(gomapfile)
    goTermDict = ReadGOTerm(gotermfile)
    idList = myfunc.ReadIDList(infile)

    # every ID of GOLevelOneSet gets a row, even with a zero count
    freqDict = {}
    for goid in GOLevelOneSet:
        freqDict[goid] = 0

    for seqid in idList:
        try:
            gomap = goMapDict[seqid]
        except KeyError:
            msg = "No GO info for seqid %s"
            print >> sys.stderr, msg % (seqid)
            continue
        for di in gomap[gotype]:
            ancinfo = di['ancestor']
            if (len(ancinfo) > 2 and "all" in ancinfo[0]
                    and "GO:0003674" in ancinfo[1]):
                ancGOList = di['ancestor'][anclevel]
                for idd in ancGOList:
                    #debuging
                    # ancestor IDs outside GOLevelOneSet are reported and
                    # not counted
                    if idd not in GOLevelOneSet:
                        print >> sys.stderr, "seqid=", seqid, "ancID=", idd, gomap[
                            gotype]
                        continue
                    #debuging
                    # NOTE(review): every ID of GOLevelOneSet was already
                    # inserted above, so this branch looks unreachable.
                    if not idd in freqDict:
                        freqDict[idd] = 0
                    freqDict[idd] += 1

    #freqList = sorted(freqDict.items(), key=lambda x:x[1], reverse=True)
    freqList = sorted(freqDict.items(), key=lambda x: x[0], reverse=True)

    fpout = myfunc.myopen(outfile, sys.stdout, "w", False)
    printTupList = []  # (GO ID, term, count)
    for tup in freqList:
        try:
            term = goTermDict[tup[0]]
        except KeyError:
            term = ""
        printTupList.append((tup[0], term, tup[1]))
    # column widths for left-aligned output
    # NOTE(review): max() raises ValueError if GOLevelOneSet is empty.
    maxSizeTerm = max([len(x[1]) for x in printTupList])
    maxSizeGOID = max([len(x[0]) for x in printTupList])
    total = sum([x[2] for x in printTupList])
    for tup in printTupList:
        fpout.write("%-*s\t%-*s\t%4d\t%6.6f\n" %
                    (maxSizeGOID, tup[0], maxSizeTerm, tup[1], tup[2],
                     myfunc.FloatDivision(tup[2], total)))
    myfunc.myclose(fpout)
def HHAlign2Pairaln(infile, evalue_threshold, coverage_threshold,  #{{{
                    hdl_seq, fpout, fpout_tableinfo, fpout_stat):
    """
    Convert a single-hit hhalign result file to a pairwise alignment and
    optional statistics.

    Input:
        infile              hhalign result file, read with ReadHHAlignResult
        evalue_threshold    hits with a larger e-value are ignored
        coverage_threshold  hits whose coverage of the shorter sequence is
                            below this value are ignored; a negative value
                            disables the coverage filter
        hdl_seq             sequence database handle; GetRecord(seqid) is
                            expected to return None for unknown IDs
        fpout               handle for the pairwise alignment, or None
        fpout_tableinfo     handle for the table-info line, or None
        fpout_stat          handle for the statistics line, or None

    The aligned region is written in upper case. The unaligned head/tail
    of both sequences are added around it, gap-filled to equal length, in
    upper case when the alignment starts/ends within 5 residues
    (softmargin) of either sequence end and in lower case otherwise.

    Return 1 when the file is missing, does not contain exactly one hit,
    the hit is filtered out or a sequence record is missing. On success
    the function falls off the end (returns None), as before.
    """
    if not os.path.exists(infile):
        print >> sys.stderr, "infile %s does not exist, Ignore" % (infile)
        return 1
    hhalignHitList = ReadHHAlignResult(infile)
    numHit = len(hhalignHitList)
    if numHit < 1:
        print >> sys.stderr, "No hit found for file %s. Ignore" % infile
        return 1
    elif numHit > 1:
        print >> sys.stderr, "More than 1 (%d) hit found for file %s." % (
            numHit, infile)
        return 1

    hit = hhalignHitList[0]

    # The coverage is always computed because the statistics line below
    # reports it even when the coverage filter is disabled (negative
    # threshold); previously that combination hit an unbound variable.
    try:
        if hit['query_length'] >= hit['template_length']:
            coverage_of_shorter_seq = myfunc.FloatDivision(
                len(hit['template_alignseq'].replace("-", "")),
                hit['template_length'])
        else:
            coverage_of_shorter_seq = myfunc.FloatDivision(
                len(hit['query_alignseq'].replace("-", "")),
                hit['query_length'])
    except KeyError:
        print >> sys.stderr, "bad hit for file %s" % (infile)
        return 1
    if (coverage_threshold >= 0.0
            and coverage_of_shorter_seq < coverage_threshold):
        print >> sys.stderr, "coverage (%.3f) < %g for %s. Ignore" % (
            coverage_of_shorter_seq, coverage_threshold, infile)
        return 1

    if hit['evalue'] > evalue_threshold:
        print >> sys.stderr, "evalue (%g) > %g for %s. Ignore" % (
            hit['evalue'], evalue_threshold, infile)
        return 1

    query_rawseq = hdl_seq.GetRecord(hit['query_seqid'])
    if query_rawseq is None:
        return 1
    hit_rawseq = hdl_seq.GetRecord(hit['hit_seqid'])
    if hit_rawseq is None:
        return 1

    (hit_seqid, hit_annotation,
     hit_seq) = myfunc.ExtractFromSeqWithAnno(hit_rawseq)
    (query_seqid, query_annotation,
     query_seq) = myfunc.ExtractFromSeqWithAnno(query_rawseq)

    (hit_unaligned_head,
     hit_unaligned_tail) = GetUnalignedHeadTail(hit_seq,
                                                hit['pos_template_begin'],
                                                hit['pos_template_end'])
    (query_unaligned_head,
     query_unaligned_tail) = GetUnalignedHeadTail(query_seq,
                                                  hit['pos_query_begin'],
                                                  hit['pos_query_end'])
    (hit_unaligned_head,
     query_unaligned_head) = FillUnalignedGapForHead(hit_unaligned_head,
                                                     query_unaligned_head)
    (hit_unaligned_tail,
     query_unaligned_tail) = FillUnalignedGapForTail(hit_unaligned_tail,
                                                     query_unaligned_tail)

    # output pairaln
    # Upper case marks a flank treated as aligned, lower case as unaligned.
    softmargin = 5
    if (hit['pos_query_begin'] <= softmargin
            or hit['pos_template_begin'] <= softmargin):
        query_unaligned_head = query_unaligned_head.upper()
        hit_unaligned_head = hit_unaligned_head.upper()
    else:
        query_unaligned_head = query_unaligned_head.lower()
        hit_unaligned_head = hit_unaligned_head.lower()

    if (hit['pos_query_end'] >= hit['query_length'] - softmargin or
            hit['pos_template_end'] >= hit['template_length'] - softmargin):
        query_unaligned_tail = query_unaligned_tail.upper()
        hit_unaligned_tail = hit_unaligned_tail.upper()
    else:
        query_unaligned_tail = query_unaligned_tail.lower()
        hit_unaligned_tail = hit_unaligned_tail.lower()

    complete_query_alignseq = "%s%s%s" % (query_unaligned_head,
                                          hit['query_alignseq'].upper(),
                                          query_unaligned_tail)
    complete_template_alignseq = "%s%s%s" % (
        hit_unaligned_head, hit['template_alignseq'].upper(),
        hit_unaligned_tail)

    if fpout is not None:
        fpout.write(">%s\n" % (hit['query_description']))
        fpout.write("%s\n" % complete_query_alignseq)
        fpout.write(">%s\n" % (hit['hit_description']))
        fpout.write("%s\n" % complete_template_alignseq)

    # output stat
    if fpout_stat is not None:
        pos_query = "%d-%d" % (hit['pos_query_begin'], hit['pos_query_end'])
        pos_template = "%d-%d" % (hit['pos_template_begin'],
                                  hit['pos_template_end'])
        fpout_stat.write(
            "%-8s %-8s %7g %8.3f %6.1f %6.1f %6d %9s %4d %9s %4d\n" % (
                hit['query_seqid'],
                hit['hit_seqid'],
                hit['evalue'],
                coverage_of_shorter_seq,
                hit['identity'],
                hit['prob'],
                hit['num_align_col'],
                pos_query,
                hit['query_length'],
                pos_template,
                hit['template_length'],
            ))

    # output tableinfo
    if fpout_tableinfo is not None:
        # The factors are computed on the aligned region only, not on the
        # completed sequences with their flanks.
        isLocalAlignment = True
        rd = lcmp.GetAlignmentFactorFromPairAlignment(
            hit['query_alignseq'], hit['template_alignseq'],
            isLocalAlignment)
        fpout_tableinfo.write(
            "%-16s %-15s %6.1f %6.1f %9d %6d %6d %9.1f %6d %6d %6d %6.1f %6.1f\n"
            % (hit['query_seqid'], hit['hit_seqid'], rd['seqidt0'],
               hit['similarity'] * 100, rd['alnLength'], rd['seqLength1'],
               rd['seqLength2'], hit['score'], rd['numIDT'], -1,
               rd['numGap'], rd['seqidt1'], rd['seqidt2']))
def WriteHTMLTable(tablename, tabletitle, dataTable, htmlname,  #{{{
                   outpath, fpout):
    """
    Write one sortable HTML table to fpout with a row for every Pfam family
    in dataTable that has at least one pair with different topology.

    Input:
        tablename   anchor name of the table
        tabletitle  heading printed above the table
        dataTable   pfamid -> record; the keys 'difftopopair' (class ->
                    list of pairs) and 'set_seqid' are read here
        htmlname    name of the sub directory (below outpath) that receives
                    the copied figures and is used in the links
        outpath     output directory
        fpout       handle of the HTML file being written

    Figures are copied with g_params['CP_EXE'] from g_params['msapath'] and
    g_params['treepath']; a missing MSA figure triggers
    PrepareDataForTopoanaTMPro first. The per-class cells show
    "count, percent%" relative to the number of pairs of the family, or
    "-" for a zero count.

    Nothing is returned.
    """

    def WriteCell(content):
        # Write one table cell holding content.
        print >> fpout, '<td>'
        print >> fpout, content
        print >> fpout, '</td>'

    def CopyIfExists(sourceFile, targetFile):
        # Copy sourceFile to targetFile with the configured copy command.
        if os.path.exists(sourceFile):
            os.system("%s %s %s" %
                      (g_params['CP_EXE'], sourceFile, targetFile))

    def WriteFigureLink(imageTargetFile, thumbImageTargetFile):
        # Write the thumbnail, linked to the full-size figure.
        print >> fpout, (
            "<a href=\"%s\" target=\"_blank\">" %
            (htmlname + os.sep + os.path.basename(imageTargetFile)))
        print >> fpout, (
            "<img src=\"%s\">" %
            (htmlname + os.sep + os.path.basename(thumbImageTargetFile)))
        print >> fpout, "</a>"

    print >> fpout, "<a name=\"%s\"></a><h4>%s</h4>" % (tablename, tabletitle)
    print >> fpout, "<table class=\"sortable\" border=1>"
    targetpath = outpath + os.sep + "data"
    if not os.path.exists(targetpath):
        os.system("mkdir -p %s" % (targetpath))

    cntOutputID = 0
    headerItemList = [
        "No.",
        "PfamID",
        "PfamDef",
        "NumSeq<br>All",
        "NumSeq<br>TPS_44",
        "NumSeq<br>Used",
        "Figure MSA",
        "Figure Tree",
        "Pair",
        "INV(#,%)",
        "Dup",
        "TM2GAP",
        "Mixed",
        "TM2SEQ",
        "TM2SP",
        "Pairwise alignment with different topology",
    ]
    print >> fpout, "<tr>"
    for item in headerItemList:
        print >> fpout, "<th>"
        print >> fpout, item
        print >> fpout, "</th>"
    print >> fpout, "</tr>"

    figurepath = outpath + os.sep + htmlname
    inputIDList = dataTable.keys()
    for pfamid in inputIDList:
        # Count all pairs and the pairs of every class other than "IDT";
        # families without any differing pair get no row.
        numDiffPair = 0
        numPair = 0
        for cmpclass in g_params['cmpClassList_mp3_cmpdup'][0:]:
            try:
                numPair_thisclass = len(
                    dataTable[pfamid]['difftopopair'][cmpclass])
            except KeyError:
                numPair_thisclass = 0
            numPair += numPair_thisclass
            if cmpclass != "IDT":
                numDiffPair += numPair_thisclass
        if numDiffPair <= 0:
            continue

        try:
            numSeqAll = len(g_params['pfamid2seqidDict'][pfamid])
        except KeyError:
            numSeqAll = 0
        try:
            seqwithtopo_idlist = list(
                set(g_params['pfamid2seqidDict'][pfamid])
                & set(g_params['hdl_topodb'].indexedIDList))
        except KeyError:
            seqwithtopo_idlist = []
        numSeqWithTopology = len(seqwithtopo_idlist)

        cntOutputID += 1
        print >> fpout, "<tr>"
        # 1. No ---------------------------
        WriteCell('%d' % (cntOutputID))
        # 2. PfamID ---------------------------
        pfamURL = 'http://pfam.sanger.ac.uk/family/' + pfamid
        WriteCell('<a href=\"%s\" target=\"_blank\">%s</a>' %
                  (pfamURL, pfamid))
        # 3 PfamDef ---------------------------
        try:
            pfamdef = g_params['pfamidDefDict'][pfamid]
        except KeyError:
            pfamdef = ""
        WriteCell('%s' % (pfamdef))
        # 4 NumSeqAll ---------------------------
        WriteCell('%d' % (numSeqAll))
        # 5 NumSeq with topology ---------------------------
        WriteCell('%d' % (numSeqWithTopology))
        # 6 NumSeqUsed ---------------------------
        try:
            numSeqUsed = len(dataTable[pfamid]['set_seqid'])
        except KeyError:
            numSeqUsed = 0
        WriteCell('%d' % (numSeqUsed))

        # 7 Figure MSA---------------------------
        ext = '.reordered.topomsa.png'
        print >> fpout, '<td>'
        imageSourceFile = g_params['msapath'] + os.sep + pfamid + ext
        if not os.path.exists(imageSourceFile):
            PrepareDataForTopoanaTMPro(pfamid, seqwithtopo_idlist,
                                       g_params['msapath'])
        imageTargetFile = figurepath + os.sep + pfamid + ext
        thumbImageSourceFile = (g_params['msapath'] + os.sep + 'thumb.' +
                                pfamid + ext)
        thumbImageTargetFile = figurepath + os.sep + 'thumb.' + pfamid + ext
        CopyIfExists(imageSourceFile, imageTargetFile)
        CopyIfExists(thumbImageSourceFile, thumbImageTargetFile)
        WriteFigureLink(imageTargetFile, thumbImageTargetFile)
        print >> fpout, '</td>'

        # 8 Figure Tree---------------------------
        ext = '-itol.jpg'
        extpdf = '-itol.pdf'
        print >> fpout, '<td>'
        imageSourceFile = g_params['treepath'] + os.sep + pfamid + ext
        imageSourceFilePDF = g_params['treepath'] + os.sep + pfamid + extpdf
        imageTargetFile = figurepath + os.sep + pfamid + ext
        imageTargetFilePDF = figurepath + os.sep + pfamid + extpdf
        thumbImageSourceFile = (g_params['treepath'] + os.sep + 'thumb.' +
                                pfamid + ext)
        thumbImageTargetFile = figurepath + os.sep + 'thumb.' + pfamid + ext
        CopyIfExists(imageSourceFile, imageTargetFile)
        CopyIfExists(imageSourceFilePDF, imageTargetFilePDF)
        CopyIfExists(thumbImageSourceFile, thumbImageTargetFile)
        WriteFigureLink(imageTargetFile, thumbImageTargetFile)
        print >> fpout, '</td>'

        # 9 numPair ---------------------------
        WriteCell('%d' % (numPair))

        # 10-15 INV, DUP, TM2GAP, Mixed, TM2SEQ, TM2SP ---------------------------
        for cmpclass in g_params['cmpClassList_mp3_cmpdup'][1:]:
            try:
                nn = len(dataTable[pfamid]['difftopopair'][cmpclass])
            except KeyError:
                nn = 0
            if nn > 0:
                # FloatDivision yields a fraction; scale it to match the
                # "%" sign in the cell.
                ss1 = "%d, %.1f%%" % (
                    nn, myfunc.FloatDivision(nn, numPair) * 100)
            else:
                ss1 = "-"
            WriteCell('%s' % (ss1))

        # 16 pairwise alignment ---------------------------
        print >> fpout, '<td>'
        WriteSubTable(dataTable, pfamid, outpath, fpout)
        print >> fpout, '</td>'
        # Finish writing the row =====================================
        print >> fpout, "</tr>"
    print >> fpout, "</table>"