示例#1
0
def GetTopoAlignStat(topo1, topo2):
    """
    Summarize what topo2 contains at the membrane positions of topo1.

    For every (begin, end) pair returned by myfunc.GetTMPosition(topo1), the
    characters of topo2 are collected at the positions in [begin, end) where
    topo1 is 'M'.

    Return a list with one dictionary per pair, with the keys
        'seg2':    the collected topo2 characters joined into a string
        'freqTM':  myfunc.FloatDivision(number of 'M', segment size)
        'freqGap': myfunc.FloatDivision(number of '-', segment size)
        'freqSeq': myfunc.FloatDivision(number of other states, segment size)
    """
    stat = []
    for (begin, end) in myfunc.GetTMPosition(topo1):
        # states of topo2 in the columns where topo1 is in the membrane
        alignedStates = [
            topo2[k] for k in xrange(begin, end) if topo1[k] == 'M'
        ]
        sizeSeg = len(alignedStates)
        numTM = alignedStates.count('M')
        numGap = alignedStates.count('-')
        # everything that is neither 'M' nor '-'
        numSeq = sizeSeg - numTM - numGap

        stat.append({
            'seg2': ''.join(alignedStates),
            'freqTM': myfunc.FloatDivision(numTM, sizeSeg),
            'freqGap': myfunc.FloatDivision(numGap, sizeSeg),
            'freqSeq': myfunc.FloatDivision(numSeq, sizeSeg),
        })
    return stat
def WriteNumTMHeatMap(data, maxNumTM, count, mode, outfile):  #{{{
    """
    Write a normalized (maxNumTM+1) x (maxNumTM+1) table to a text file.

    Each cell is written with the format " %6.3g" as
        myfunc.FloatDivision(data[i][j], count) * scale * 100
    with one table row per output line.

    Input:
        data:      2D table indexed as data[i][j] for i, j in 0..maxNumTM
        maxNumTM:  largest index written along both axes
        count:     total used together with the cell value and with the
                   sums below to derive the written number
        mode:      selects the scale factor
                   "norm_diag": myfunc.FloatDivision(count, sum of diagonal)
                   "norm_row":  myfunc.FloatDivision(count, sum of row i)
                   "norm_col":  myfunc.FloatDivision(count, sum of column j)
                   No scale is assigned for any other value, so writing the
                   first cell then fails with an unbound-variable error.
        outfile:   path of the output file
    Return 0 on success, 1 if an IOError was raised.
    """
    try:
        # The with-statement guarantees that the file is closed even when an
        # exception is raised while the table is being written.
        with open(outfile, "w") as fpout:
            scale_norm_col_list = []
            if mode == "norm_diag":
                diag = [data[i][i] for i in xrange(maxNumTM + 1)]
                scale_norm_diag = myfunc.FloatDivision(count, sum(diag))
            elif mode == "norm_col":
                # one scale factor per column, computed once up front
                for j in xrange(0, maxNumTM + 1):
                    li = [data[i][j] for i in xrange(0, maxNumTM + 1)]
                    scale_norm_col_list.append(
                        myfunc.FloatDivision(count, sum(li)))

            for i in xrange(0, maxNumTM + 1):
                if mode == "norm_diag":
                    scale = scale_norm_diag
                elif mode == "norm_row":
                    scale = myfunc.FloatDivision(count, sum(data[i]))
                for j in xrange(0, maxNumTM + 1):
                    if mode == "norm_col":
                        scale = scale_norm_col_list[j]
                    fpout.write(
                        " %6.3g" %
                        (myfunc.FloatDivision(data[i][j], count) * scale *
                         100))
                fpout.write("\n")
        return 0
    except IOError:
        print >> sys.stderr, "Failed to write to file %s" % outfile
        return 1
def WriteTable2D(freq, subsum, classList, seqIDTGroupList, outfile):  #{{{
    """
    Write a table of class percentages per sequence identity group.

    Works with dataCmpClass and dataNCtermInter.

    Input:
        freq:             2D table, freq[i][j] is the count of class j in
                          group i
        subsum:           subsum[i] is the denominator used for group i; it
                          is also written in the "Occurrence" column
        classList:        class names, used as column headers
        seqIDTGroupList:  flat list in which the items 2*i and 2*i+1 are
                          written as the range "%g-%g" of group i
        outfile:          path of the output file
    Output lines:
        a header line, one line per group, and a final "#sum" line holding
        the percentages over all groups.
    Return 0 on success, 1 if the output file cannot be opened.
    """
    try:
        fpout = open(outfile, "wb")
    except IOError:
        print >> sys.stderr, "Failed to write to file %s" % outfile
        return 1

    # two consecutive items of seqIDTGroupList describe one group
    numGroup = len(seqIDTGroupList) // 2
    numClass = len(classList)
    try:
        fpout.write("%4s %7s" % ("#Idx", "SeqIDT"))
        for cls in classList:
            fpout.write(" %9s" % (cls))
        fpout.write(" %9s" % "Maximum")
        fpout.write(" %10s" % "Occurrence")
        fpout.write("\n")

        for i in xrange(numGroup):
            stridtrange = "%g-%g" % (seqIDTGroupList[i * 2],
                                     seqIDTGroupList[i * 2 + 1])
            fpout.write("%-4d %7s" % (i, stridtrange))
            # computed once and reused for the cells and for the maximum
            percentList = [
                myfunc.FloatDivision(freq[i][j], subsum[i]) * 100
                for j in xrange(numClass)
            ]
            for percent in percentList:
                fpout.write(" %9.3f" % percent)
            fpout.write(" %9.3f" % max(percentList))
            fpout.write(" %10d" % subsum[i])
            fpout.write("\n")

        fpout.write("%-12s" % ("#sum"))
        totalSum = sum(subsum)
        totalOccur = [
            sum(freq[i][j] for i in xrange(numGroup))
            for j in xrange(numClass)
        ]
        totalPercentList = [
            myfunc.FloatDivision(occur, totalSum) * 100
            for occur in totalOccur
        ]
        for percent in totalPercentList:
            fpout.write(" %9.3f" % percent)
        fpout.write(" %9.3f" % max(totalPercentList))
        fpout.write(" %10d" % totalSum)
        fpout.write("\n")
    finally:
        # close the file also when writing the table failed half-way
        fpout.close()
    return 0
示例#4
0
def GetAlignmentFactorFromPairAlignment(seq1,seq2, isLocalAlignment):#{{{
    """
    Return the alignment factors of two aligned sequences as a dictionary.

    Input:
        seq1, seq2:        the two aligned sequences; the columns
                           0..len(seq1)-1 are compared, "-" denotes a gap
        isLocalAlignment:  only when this is exactly True are columns in
                           which either sequence has a lowercase letter
                           treated as unaligned and excluded from all the
                           other counts
    Keys of the returned dictionary:
        numIDT:        number of columns with equal characters
        numGap:        number of non-identical columns containing a "-"
        alnLength:     number of counted columns
        seqidt0:       FloatDivision(numIDT, alnLength) * 100
        seqidt1:       FloatDivision(numIDT, length of shorter seq) * 100
        seqidt2:       FloatDivision(numIDT, alnLength - numGap) * 100
        seqLength1/2:  number of non-gap characters of seq1 / seq2
        numUnaligned:  number of unaligned columns (local alignment only)

    NOTE(review): equality is tested before the gap test, so a column in
    which both sequences have "-" is counted as identical - confirm that
    such columns cannot occur in the input.
    """
    alnLength = len(seq1)

    if isLocalAlignment is True:
        numUnaligned = 0
        numAlignedCol = 0
        numIDT = 0
        numGap = 0
        seqLength1 = 0
        seqLength2 = 0
        for i in range(alnLength):
            isLower1 = seq1[i].isalpha() and seq1[i].islower()
            if isLower1 or (seq2[i].isalpha() and seq2[i].islower()):
                # lowercase marks a residue outside of the aligned region
                numUnaligned += 1
                continue
            numAlignedCol += 1
            if seq1[i] == seq2[i]:
                numIDT += 1
            elif "-" in (seq1[i], seq2[i]):
                numGap += 1
            if seq1[i] != "-":
                seqLength1 += 1
            if seq2[i] != "-":
                seqLength2 += 1
        return {
            'numIDT': numIDT,
            'numGap': numGap,
            'alnLength': numAlignedCol,
            'seqidt0': myfunc.FloatDivision(numIDT, numAlignedCol) * 100,
            'seqidt1': myfunc.FloatDivision(
                numIDT, min(seqLength1, seqLength2)) * 100,
            'seqidt2': myfunc.FloatDivision(
                numIDT, numAlignedCol - numGap) * 100,
            'seqLength1': seqLength1,
            'seqLength2': seqLength2,
            'numUnaligned': numUnaligned,
        }

    # global alignment: every column is counted
    seqLength1 = len(seq1.replace("-", ""))
    seqLength2 = len(seq2.replace("-", ""))
    numIDT = sum(1 for i in range(alnLength) if seq1[i] == seq2[i])
    numGap = sum(1 for i in range(alnLength)
                 if seq1[i] != seq2[i] and "-" in (seq1[i], seq2[i]))
    return {
        'numIDT': numIDT,
        'numGap': numGap,
        'alnLength': alnLength,
        'seqidt0': myfunc.FloatDivision(numIDT, alnLength) * 100,
        'seqidt1': myfunc.FloatDivision(
            numIDT, min(seqLength1, seqLength2)) * 100,
        'seqidt2': myfunc.FloatDivision(numIDT, alnLength - numGap) * 100,
        'seqLength1': seqLength1,
        'seqLength2': seqLength2,
    }
示例#5
0
def _WriteFamFreqSection(fpout, pair, idLabel, freqList, famDefDict,
                         numpair_total):
    """
    Write the summary line and the per-family lines of one pair type.

    Input:
        fpout:          output file handle
        pair:           item of SPE_PAIR_LIST printed in the summary line
        idLabel:        label printed after the pair ("PfamID" or "ClanID")
        freqList:       list of tuples, tup[0] is the family ID, tup[1][0]
                        the number of pairs and tup[1][1] a list of pairs
                        of which the items 0 and 1 are printed
        famDefDict:     family ID -> definition; a missing ID is printed
                        with an empty definition
        numpair_total:  total number of pairs, printed in the summary line
    """
    cnt_this_pair = sum(tup[1][0] for tup in freqList)

    print >> fpout
    print >> fpout, pair, idLabel, "%5d %5.1f %8d %6.2f" % (
        cnt_this_pair, cnt_this_pair * g_params['scale_count'],
        numpair_total,
        myfunc.FloatDivision(cnt_this_pair, numpair_total) * 100)
    print >> fpout
    for tup in freqList:
        try:
            famdef = famDefDict[tup[0]]
        except KeyError:
            famdef = ""
        fpout.write("%-8s %20s %5d %5.1f %6.2f  " %
                    (tup[0], famdef,
                     tup[1][0], tup[1][0] * g_params['scale_count'],
                     float(tup[1][0]) / cnt_this_pair * 100))
        for pp in tup[1][1]:
            fpout.write("(%s %s) " % (pp[0], pp[1]))
        fpout.write("\n")


def WriteSpecialPair(
        dataTable,
        all_pairInfoList,  #{{{
        seqid2pfamidDict,
        seqid2clanidDict,
        tm_pfamidSet,
        tm_clanidSet,
        pfamidDefDict,
        clanidDefDict,
        SPE_PAIR_LIST,
        outfile):
    """
    Write the Pfam family and clan frequencies of each special pair type.

    The frequencies are obtained from AnaFamFrequency() applied on
    dataTable['pairInfoLists']. For every item of SPE_PAIR_LIST a "PfamID"
    section and a "ClanID" section are written, followed by a separator
    line. Counts are additionally printed scaled by g_params['scale_count'].

    Input:
        dataTable:         dictionary holding the key 'pairInfoLists'
        all_pairInfoList:  only its length is used, as the total number of
                           pairs
        seqid2pfamidDict, seqid2clanidDict, tm_pfamidSet, tm_clanidSet:
                           passed on to AnaFamFrequency()
        pfamidDefDict:     Pfam ID -> definition
        clanidDefDict:     clan ID -> definition
        SPE_PAIR_LIST:     list of special pair types; item i corresponds
                           to item i of the two lists returned by
                           AnaFamFrequency()
        outfile:           output file, opened with myfunc.myopen() with
                           sys.stdout as the default handle
    Return 0.
    """
    fpout = myfunc.myopen(outfile, sys.stdout, "w", False)
    try:
        numpair_total = len(all_pairInfoList)
        pairInfoLists = dataTable['pairInfoLists']
        (freqListPfam, freqListClan) = AnaFamFrequency(pairInfoLists,
                                                       seqid2pfamidDict,
                                                       seqid2clanidDict,
                                                       tm_pfamidSet,
                                                       tm_clanidSet)
        for i in xrange(len(SPE_PAIR_LIST)):
            pair = SPE_PAIR_LIST[i]
            _WriteFamFreqSection(fpout, pair, "PfamID", freqListPfam[i],
                                 pfamidDefDict, numpair_total)
            _WriteFamFreqSection(fpout, pair, "ClanID", freqListClan[i],
                                 clanidDefDict, numpair_total)
            print >> fpout, "#====================================================="
    finally:
        # release the output handle also when writing failed
        myfunc.myclose(fpout)
    return 0
示例#6
0
def Benchmark(real_topodict, idSet_single, idSet_multi, TM_type, fpout,
              fpout_wrong, seqDict):  #{{{
    """
    Compare predicted topologies against the real ones and write a table.

    One predicted topology file is read for each agreement group plus the
    group "All". The file names are built from g_params['path_predtopo'],
    g_params['basename'], the group and the suffix ".RMSP" when
    g_params['isRMSP'] is set. For each group one line is written to fpout
    with the counts returned by CountIdenticalTopology() and the derived
    percentages.

    Input:
        real_topodict:  dictionary of real topologies keyed by sequence ID
        idSet_single, idSet_multi:
                        not used in this function (filtering of the
                        predictions by TM_type is disabled)
        TM_type:        written in the header line and passed on to
                        CountIdenticalTopology()
        fpout:          file handle for the result table
        fpout_wrong, seqDict:
                        passed on to CountIdenticalTopology()
    Global parameters read from g_params:
        'mode' ("tps" or "tp"), 'isRestrictIDList', 'restrictIDset',
        'isRMSP', 'path_predtopo', 'basename'
    Raises ValueError if g_params['mode'] is neither "tps" nor "tp".
    """
    mode = g_params['mode']
    if mode == "tps":
        itemlist = ["40", "41", "42", "43", "44", "All"]
        fmt_topofile_all = "%s/%s.topcons-single_topcons_single%s.topo"
        fmt_topofile_item = "%s/%s_topcons_single.m1.agree-%s%s.topo"
    elif mode == "tp":
        itemlist = ["50", "51", "52", "53", "54", "55", "All"]
        fmt_topofile_all = "%s/%s.topcons.result_TOPCONS%s.topo"
        fmt_topofile_item = "%s/%s.topcons.result_TOPCONS.m1.agree-%s%s.topo"
    else:
        # fail early with a clear message instead of an unbound variable
        raise ValueError(
            "Unsupported mode %r, it should be 'tps' or 'tp'" % (mode, ))

    isRestrictIDList = g_params['isRestrictIDList']
    addname = ""
    if g_params['isRMSP']:
        addname = ".RMSP"

    numRealTopo = len(real_topodict)

    if isRestrictIDList:
        numRealTopo = len(g_params['restrictIDset']
                          & set(real_topodict.keys()))

    pred_topodict_list = []
    # Step 1, read in predicted topology
    for item in itemlist:
        if item.upper() == "ALL":
            pred_topofile = fmt_topofile_all % (
                g_params['path_predtopo'], g_params['basename'], addname)
        else:
            pred_topofile = fmt_topofile_item % (
                g_params['path_predtopo'], g_params['basename'], item,
                addname)

        (pred_idlist, pred_annolist,
         pred_topolist) = myfunc.ReadFasta(pred_topofile)
        if len(pred_idlist) <= 0:
            # only warn, the group is then reported with zero predictions
            print >> sys.stderr, "Failed to read pred_topofile %s" % (
                pred_topofile)
        pred_topodict = {}
        for i in xrange(len(pred_idlist)):
            if ((not isRestrictIDList)
                    or pred_idlist[i] in g_params['restrictIDset']):
                pred_topodict[pred_idlist[i]] = pred_topolist[i]
        pred_topodict_list.append(pred_topodict)

    # Step 2, calculate precision of the prediction
    # header line
    fpout.write("#%s\n" % (TM_type))
    fpout.write("#%2s %7s %8s %8s %8s %8s %8s %8s %8s\n" %
                ("No", "Group", "nIDT", "nINV", "nPred", "PPV(%)", "NPV_INV",
                 "NPV_Other", "nAllReal"))
    for i in xrange(len(itemlist)):
        item = itemlist[i]
        pred_topodict = pred_topodict_list[i]
        numPredTopo = len(pred_topodict)

        (numIDTtopo,
         numINVtopo) = CountIdenticalTopology(pred_topodict, real_topodict,
                                              item, TM_type, fpout_wrong,
                                              seqDict, item)

        ss = "%-3d %7s %8d %8d %8d %8.1f %8.1f %8.1f %8d" % (
            i, item, numIDTtopo, numINVtopo, numPredTopo,
            myfunc.FloatDivision(numIDTtopo, numPredTopo) * 100.0,
            myfunc.FloatDivision(numINVtopo, numPredTopo) * 100.0,
            myfunc.FloatDivision(numPredTopo - numIDTtopo - numINVtopo,
                                 numPredTopo) * 100.0, numRealTopo)
        fpout.write("%s\n" % (ss))
    fpout.write("\n")
示例#7
0
                for iw in xrange(winsize):
                    IncrementSumTableWithDiffTopo(i + iw, outList,
                                                  sumWindowWithDiffTopoList)
                isFirstWindow = False
            else:  # if not first window, minus the previous one, and plus the next one
                #iw_previous = i-1
                #iw_next = i+winsize-1
                DecrementSumTableWithDiffTopo(i - 1, outList,
                                              sumWindowWithDiffTopoList)
                IncrementSumTableWithDiffTopo(i + winsize - 1, outList,
                                              sumWindowWithDiffTopoList)

            fracList = []
            for j in xrange(len(sumWindowWithDiffTopoList)):
                fracList.append(
                    myfunc.FloatDivision(sumWindowWithDiffTopoList[j],
                                         winsize))
            freqTopNList.append([i + 1, outList[i + winsize / 2][pivIdx]] +
                                fracList)
        outfile1 = outfile + ".difffam_win%d.sortby_%s.mindiffpair_%d.txt" % (
            winsize, itemList[pivIdx], mindiffpair)
        outfileList.append(outfile1)
        fpout = myfunc.myopen(outfile1, sys.stdout, "w", False)

        ss_sort_item = "min_%s" % (itemList[pivIdx])
        fpout.write("#%-7s %*s %7s" %
                    ("idxWin", len(ss_sort_item), ss_sort_item, "DIFF"))
        for ss in cmpclassList[1:]:
            fpout.write(" %7s" % (ss))
        fpout.write("\n")
        for i in xrange(len(freqTopNList)):
            d = freqTopNList[
示例#8
0
def WriteFamPairCount(
        freqList,
        pairInfoList,
        famDefDict,  #{{{
        cmpclassList,
        pairwise_comparison_method,
        isCmpDup,
        outfile):
    """
    Write the number of pairs for each protein family as well as the frequency 
    of topology variations in different classes for each family
    Input:
        freqList:   [(pfamid, [numpair,numseq, numseq_TMpro, (id1,id2,cmpclass), ()...]), ...]
        pairInfoList: a list of tuples, [(id1,id2,cmpclass)]
    """
    numpair_total = len(pairInfoList)

    fpout = myfunc.myopen(outfile, sys.stdout, "w", False)
    # write header line
    fpout.write("#%-7s %18s %7s %7s %9s %6s %8s" %
                ("PfamID", "PfamDef", "NumPair", "NumSeq", "NumSeq_TM", "%#",
                 "NumTotal"))
    for ss in cmpclassList:
        fpout.write(" %6s" % (ss))
    fpout.write("\n")

    CountListAll = [
    ]  # list of [(numpair, numseq, numseq_TMpro, [#IDT, #INV, #...]), ([])]

    for tup in freqList:
        famid = tup[0]
        pairInfoList_fam = tup[
            1]  #(pfamid, [numpair, numseq, numseq_TMpro, (id1,id2,cmpclass),(id1,id2,cmpclass)])
        numpair = tup[1][0]
        numseq = tup[1][1]
        numseq_TMpro = tup[1][2]
        try:
            famdef = famDefDict[famid]
        except KeyError:
            famdef = ""

        cmpclasslist_fam = []  # a list of [cmpclass, cmpclass, ...]
        for tt in pairInfoList_fam[3:]:
            cmpclass = tt[2]
            if isCmpDup:
                if cmpclass.find("TM2GAP|DUP") == 0:
                    cmpclass = "DUP"
                else:
                    cmpclass = cmpclass.split('|')[0]
            else:
                cmpclass = cmpclass.split('|')[0]

            cmpclasslist_fam.append(cmpclass)
        cntList = []
        for cls in cmpclassList:
            cntList.append(cmpclasslist_fam.count(cls))
        fpout.write("%-8s %18s %7d %7d %9d %6.2f %8d" %
                    (famid, famdef, numpair, numseq, numseq_TMpro,
                     float(numpair) / numpair_total * 100.0, numpair_total))

        for cnt in cntList:
            fpout.write(" %6d" % cnt)
        fpout.write("\n")

        CountListAll.append((numpair, numseq, numseq_TMpro, cntList))
    myfunc.myclose(fpout)
    print "file %s output" % (outfile)

    # output the fraction of topN largest families that have topology variations
    # sorted in descending order by "numpair", "numseq", numseq_TMpro #{{{
    mindiffpair = g_params['mindiffpair']
    itemList = ["numpair", "numseq", "numseq_TMpro"]
    outfileList = []
    for pivIdx in xrange(len(itemList)):  #[0,1,2]

        outList = sorted(CountListAll, key=lambda x: x[pivIdx], reverse=True)

        freqTopNList = []  # [ [topN, min, frac_DIFF, frac_INV, frac_TM2GAP]]
        sumTopNWithDiffTopoList = [0] * (len(cmpclassList))
        for i in xrange(len(outList)):
            isHaveDiffTopo = False
            cntDiffPair = 0
            cntList = outList[i][3]
            for j in xrange(1, len(cntList)):
                cntDiffPair += cntList[j]
                if cntList[j] >= mindiffpair:
                    sumTopNWithDiffTopoList[j] += 1

            if cntDiffPair >= mindiffpair:
                isHaveDiffTopo = True

            if isHaveDiffTopo:
                sumTopNWithDiffTopoList[0] += 1
            fracList = []
            for j in xrange(len(sumTopNWithDiffTopoList)):
                fracList.append(
                    myfunc.FloatDivision(sumTopNWithDiffTopoList[j], i + 1))

            freqTopNList.append([i + 1, outList[i][pivIdx]] + fracList)
        outfile1 = outfile + ".topNdifffam.sortby_%s.mindiffpair_%d.txt" % (
            itemList[pivIdx], mindiffpair)
        outfileList.append(outfile1)
        fpout = myfunc.myopen(outfile1, sys.stdout, "w", False)

        ss_sort_item = "min_%s" % (itemList[pivIdx])
        fpout.write("#%-7s %*s %7s" %
                    ("topN", len(ss_sort_item), ss_sort_item, "DIFF"))
        for ss in cmpclassList[1:]:
            fpout.write(" %7s" % (ss))
        fpout.write("\n")
        for i in xrange(len(freqTopNList)):
            d = freqTopNList[
                i]  #  [topN, min, frac_DIFF, frac_INV, frac_TM2GAP]
            fpout.write("%-8d %*d" % (d[0], len(ss_sort_item), d[1]))
            fracList = d[2:]
            for tt in fracList:
                fpout.write(" %7.2f" % (tt * 100))
            fpout.write("\n")
        myfunc.myclose(fpout)
        print "file %s output" % (outfile1)
# make plot
    cmd = ["%s/plotMaxFracFamilyWithTopoVariation.sh" %
           (binpath)] + outfileList
    try:
        subprocess.check_output(cmd)
    except subprocess.CalledProcessError, e:
        print e
示例#9
0

# check whether it is alternating
liStatus = []
th = 0.75 # only when 75% of the nterm status is either i or o are recognized
minCount = 10
# i : 1
# o : -1
# non-determined: 0

for i in range(1, maxNumTM+1):
    try:
        (n1, n2) = (countDict[i][0], countDict[i][1])
    except (KeyError, IndexError):
        (n1, n2) = (0, 0)
    if myfunc.FloatDivision(n1, n1+n2) >= th and n1 >= minCount:
        liStatus.append(1)
    elif myfunc.FloatDivision(n2, n1+n2) >= th and n2 >= minCount:
        liStatus.append(-1)
    else:
        liStatus.append(0)

numAltSerie = 3
isAlternate = IsAlternate(liStatus, numAltSerie)


#write the result
fpout.write("#isAlternate: %d\n"%(isAlternate))
fpout.write("%6s %8s %8s\n"%("#numTM", "i(Nterm)",  "o(Nterm)"))
for i in range(1, maxNumTM+1):
    try:
示例#10
0
    cmpclassList = ["IDT"]

    # get topN diff
    freqTopNList = []  # [ [topN, min, frac_DIFF]]
    sumTopNWithDiffTopoList = [0] * len(cmpclassList)
    for i in xrange(len(anaList)):
        ana = anaList[i]
        isHaveDiffTopo = False
        numSeqCls1 = ana['cluster'][0][1]
        if len(ana['cluster']) >= 2:
            numSeqCls2 = ana['cluster'][1][1]
        else:
            numSeqCls2 = 0

        fracCls2 = myfunc.FloatDivision(numSeqCls2, ana['numseq'])
        if (numSeqCls2 >= threshold_NumSeq_Group_2
                and fracCls2 >= threshold_Fraction_Group_2):
            isHaveDiffTopo = True
        if isHaveDiffTopo:
            sumTopNWithDiffTopoList[0] += 1
        fracList = []
        for j in xrange(len(sumTopNWithDiffTopoList)):
            fracList.append(
                myfunc.FloatDivision(sumTopNWithDiffTopoList[j], i + 1))

        freqTopNList.append([i + 1, ana['numseq']] + fracList)

# output topN statistics
    outfileList = []
    item = "numseq_TMpro"
示例#11
0
def main(g_params):  #{{{
    """
    Count the ancestor GO terms of a list of IDs and write a frequency table.

    The command line (sys.argv) supplies the ID list file as positional
    argument and the options
        -o, --o, -outfile FILE   output file, written through
                                 myfunc.myopen() with sys.stdout as the
                                 default handle
        -outpath, --outpath DIR  parsed, but not used in this function
        -gomap, --gomap FILE     GO map file
        -goterm, --goterm FILE   GO term file
        -q, --q                  set g_params['isQuiet']
        -h, --help               print the help message
        --                       the next argument is taken as the input
                                 file even if it starts with "-"

    For every ID found in the GO map, the GO IDs at ancestor level 2 of
    its "function" annotations are counted, provided that the ancestor
    info has more than two levels, "all" is in level 0 and "GO:0003674" is
    in level 1. GO IDs that are not in GOLevelOneSet are reported on
    stderr and not counted. One line "GOID, term, count, fraction" is
    written for each GO ID, sorted by GO ID in descending order.

    Return 1 when the help message is printed, an argument is wrong or an
    input file fails myfunc.checkfile(); otherwise nothing is returned.
    """
    argv = sys.argv
    numArgv = len(argv)
    if numArgv < 2:
        PrintHelp()
        return 1

    outpath = "./"
    outfile = ""
    infile = ""
    gomapfile = "/data3/wk/MPTopo/pfamAna_refpro/pfammap_from_uniprot/Pfam-A-full.seqfrompfamfasta.percentTMpro_scampi.perTM75_nseq20.Family.nr100.filter.fragmented.uniq.pfam.goinfowithancestor.txt"
    gotermfile = "/data3/wk/MPTopo/pfamAna_refpro/GO_analysis/GO_term.txt"
    anclevel = 2
    gotype = "function"

    i = 1
    isNonOptionArg = False
    while i < numArgv:
        if isNonOptionArg:
            infile = argv[i]
            isNonOptionArg = False
            i += 1
        elif argv[i] == "--":
            isNonOptionArg = True
            i += 1
        elif argv[i].startswith("-"):
            # startswith() is also safe for an empty-string argument, which
            # is then taken as the input file and rejected by checkfile()
            if argv[i] in ["-h", "--help"]:
                PrintHelp()
                return 1
            elif argv[i] in ["-o", "--o", "-outfile"]:
                (outfile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-outpath", "--outpath"]:
                (outpath, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-gomap", "--gomap"]:
                (gomapfile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-goterm", "--goterm"]:
                (gotermfile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-q", "--q"]:
                g_params['isQuiet'] = True
                i += 1
            else:
                print >> sys.stderr, "Error! Wrong argument:", argv[i]
                return 1
        else:
            infile = argv[i]
            i += 1

    if myfunc.checkfile(infile) != 0:
        return 1
    if myfunc.checkfile(gomapfile, "GO map file") != 0:
        return 1
    if myfunc.checkfile(gotermfile, "GO Term file") != 0:
        return 1

    goMapDict = ReadGOMap(gomapfile)
    goTermDict = ReadGOTerm(gotermfile)
    idList = myfunc.ReadIDList(infile)

    # every GO ID of GOLevelOneSet is listed, also with zero occurrence
    freqDict = {}
    for goid in GOLevelOneSet:
        freqDict[goid] = 0

    for seqid in idList:
        try:
            gomap = goMapDict[seqid]
        except KeyError:
            msg = "No GO info for seqid %s"
            print >> sys.stderr, msg % (seqid)
            continue
        for di in gomap[gotype]:
            ancinfo = di['ancestor']
            if (len(ancinfo) > 2 and "all" in ancinfo[0]
                    and "GO:0003674" in ancinfo[1]):
                ancGOList = di['ancestor'][anclevel]
                for idd in ancGOList:
                    # debugging: report and skip unexpected ancestor IDs
                    if idd not in GOLevelOneSet:
                        print >> sys.stderr, "seqid=", seqid, "ancID=", idd, gomap[
                            gotype]
                        continue
                    if idd not in freqDict:
                        freqDict[idd] = 0
                    freqDict[idd] += 1

    # sorted by GO ID in descending order (not by frequency)
    freqList = sorted(freqDict.items(), key=lambda x: x[0], reverse=True)

    printTupList = []
    for tup in freqList:
        try:
            term = goTermDict[tup[0]]
        except KeyError:
            term = ""
        printTupList.append((tup[0], term, tup[1]))

    # The column widths are only needed when there is something to print;
    # max() of an empty list would raise ValueError.
    maxSizeTerm = 0
    maxSizeGOID = 0
    if printTupList:
        maxSizeTerm = max([len(x[1]) for x in printTupList])
        maxSizeGOID = max([len(x[0]) for x in printTupList])
    total = sum([x[2] for x in printTupList])

    fpout = myfunc.myopen(outfile, sys.stdout, "w", False)
    for tup in printTupList:
        fpout.write("%-*s\t%-*s\t%4d\t%6.6f\n" %
                    (maxSizeGOID, tup[0], maxSizeTerm, tup[1], tup[2],
                     myfunc.FloatDivision(tup[2], total)))

    myfunc.myclose(fpout)
示例#12
0
def HHAlign2Pairaln(
        infile,
        evalue_threshold,
        coverage_threshold,
        hdl_seq,  #{{{
        fpout,
        fpout_tableinfo,
        fpout_stat):
    """
    Convert one hhalign result file to a pairwise alignment and write the
    alignment, its statistics and its table info.

    The file is read with ReadHHAlignResult() and must contain exactly one
    hit. The aligned region is extended with the unaligned head and tail
    of both sequences. A head (tail) is written in uppercase when the
    aligned region starts (ends) within 5 residues of the beginning (end)
    of the query or of the template, and in lowercase otherwise.

    Input:
        infile:              hhalign result file
        evalue_threshold:    a hit with a larger evalue is ignored
        coverage_threshold:  a hit with a lower coverage of the shorter
                             sequence is ignored; a negative value
                             disables this filter
        hdl_seq:             handle of which GetRecord(seqid) returns the
                             raw sequence record, or None
        fpout:               file handle for the pairwise alignment, or None
        fpout_tableinfo:     file handle for the table info, or None
        fpout_stat:          file handle for the statistics, or None
    Return 1 when the hit is ignored; nothing is returned otherwise.
    """
    if not os.path.exists(infile):
        print >> sys.stderr, "infile %s does not exist, Ignore" % (infile)
        return 1

    hhalignHitList = ReadHHAlignResult(infile)
    numHit = len(hhalignHitList)
    if numHit < 1:
        print >> sys.stderr, "No hit found for file %s. Ignore" % infile
        return 1
    elif numHit > 1:
        print >> sys.stderr, "More than 1 (%d) hit found for file %s." % (
            numHit, infile)
        return 1

    hit = hhalignHitList[0]

    # The coverage is computed regardless of coverage_threshold, since it is
    # also reported in the stat output when the coverage filter is disabled.
    try:
        if hit['query_length'] >= hit['template_length']:
            coverage_of_shorter_seq = myfunc.FloatDivision(
                len(hit['template_alignseq'].replace("-", "")),
                hit['template_length'])
        else:
            coverage_of_shorter_seq = myfunc.FloatDivision(
                len(hit['query_alignseq'].replace("-", "")),
                hit['query_length'])
    except KeyError:
        print >> sys.stderr, "bad hit for file %s" % (infile)
        return 1
    if (coverage_threshold >= 0.0
            and coverage_of_shorter_seq < coverage_threshold):
        print >> sys.stderr, "coverage (%.3f) < %g for %s. Ignore" % (
            coverage_of_shorter_seq, coverage_threshold, infile)
        return 1

    if hit['evalue'] > evalue_threshold:
        print >> sys.stderr, "evalue (%g) > %g for %s. Ignore" % (
            hit['evalue'], evalue_threshold, infile)
        return 1

    query_rawseq = hdl_seq.GetRecord(hit['query_seqid'])
    if query_rawseq is None:
        return 1
    hit_rawseq = hdl_seq.GetRecord(hit['hit_seqid'])
    if hit_rawseq is None:
        return 1
    (hit_seqid, hit_annotation,
     hit_seq) = myfunc.ExtractFromSeqWithAnno(hit_rawseq)
    (query_seqid, query_annotation,
     query_seq) = myfunc.ExtractFromSeqWithAnno(query_rawseq)
    (hit_unaligned_head,
     hit_unaligned_tail) = GetUnalignedHeadTail(hit_seq,
                                                hit['pos_template_begin'],
                                                hit['pos_template_end'])
    (query_unaligned_head,
     query_unaligned_tail) = GetUnalignedHeadTail(query_seq,
                                                  hit['pos_query_begin'],
                                                  hit['pos_query_end'])
    (hit_unaligned_head,
     query_unaligned_head) = FillUnalignedGapForHead(hit_unaligned_head,
                                                     query_unaligned_head)
    (hit_unaligned_tail,
     query_unaligned_tail) = FillUnalignedGapForTail(hit_unaligned_tail,
                                                     query_unaligned_tail)

    # output pairaln
    softmargin = 5
    if (hit['pos_query_begin'] <= softmargin
            or hit['pos_template_begin'] <= softmargin):
        # the head is short enough to be considered as aligned
        query_unaligned_head = query_unaligned_head.upper()
        hit_unaligned_head = hit_unaligned_head.upper()
    else:
        query_unaligned_head = query_unaligned_head.lower()
        hit_unaligned_head = hit_unaligned_head.lower()

    if (hit['pos_query_end'] >= hit['query_length'] - softmargin
            or hit['pos_template_end'] >= hit['template_length'] - softmargin):
        # the tail is short enough to be considered as aligned
        query_unaligned_tail = query_unaligned_tail.upper()
        hit_unaligned_tail = hit_unaligned_tail.upper()
    else:
        query_unaligned_tail = query_unaligned_tail.lower()
        hit_unaligned_tail = hit_unaligned_tail.lower()

    complete_query_alignseq = "%s%s%s" % (query_unaligned_head,
                                          hit['query_alignseq'].upper(),
                                          query_unaligned_tail)
    complete_template_alignseq = "%s%s%s" % (hit_unaligned_head,
                                             hit['template_alignseq'].upper(),
                                             hit_unaligned_tail)
    if fpout is not None:
        fpout.write(">%s\n" % (hit['query_description']))
        fpout.write("%s\n" % complete_query_alignseq)
        fpout.write(">%s\n" % (hit['hit_description']))
        fpout.write("%s\n" % complete_template_alignseq)

    # output stat
    if fpout_stat is not None:
        pos_query = "%d-%d" % (hit['pos_query_begin'], hit['pos_query_end'])
        pos_template = "%d-%d" % (hit['pos_template_begin'],
                                  hit['pos_template_end'])

        fpout_stat.write(
            "%-8s %-8s %7g %8.3f %6.1f %6.1f %6d %9s %4d %9s %4d\n" % (
                hit['query_seqid'],
                hit['hit_seqid'],
                hit['evalue'],
                coverage_of_shorter_seq,
                hit['identity'],
                hit['prob'],
                hit['num_align_col'],
                pos_query,
                hit['query_length'],
                pos_template,
                hit['template_length'],
            ))

    # output tableinfo
    if fpout_tableinfo is not None:
        isLocalAlignment = True
        # the factors are calculated on the aligned region only, not on the
        # complete sequences extended with the unaligned head and tail
        rd = lcmp.GetAlignmentFactorFromPairAlignment(hit['query_alignseq'],
                                                      hit['template_alignseq'],
                                                      isLocalAlignment)
        fpout_tableinfo.write(
            "%-16s %-15s %6.1f %6.1f %9d %6d %6d %9.1f %6d %6d %6d %6.1f %6.1f\n"
            % (hit['query_seqid'], hit['hit_seqid'], rd['seqidt0'],
               hit['similarity'] * 100, rd['alnLength'], rd['seqLength1'],
               rd['seqLength2'], hit['score'], rd['numIDT'], -1, rd['numGap'],
               rd['seqidt1'], rd['seqidt2']))
示例#13
0
def _WriteHTMLCell(fpout, content):
    """Write one <td> cell to fpout; tags and content go on separate lines."""
    print >> fpout, '<td>'
    print >> fpout, content
    print >> fpout, '</td>'


def _CountDiffTopoPairs(dataTable, pfamid, cmpclass):
    """Return len(dataTable[pfamid]['difftopopair'][cmpclass]), or 0 when a
    key along that path is missing."""
    try:
        return len(dataTable[pfamid]['difftopopair'][cmpclass])
    except KeyError:
        return 0


def _CopyFileIfExists(srcfile, dstfile):
    """Copy srcfile to dstfile with the command in g_params['CP_EXE'], but
    only when srcfile exists."""
    # NOTE(review): the command line is built by plain string formatting, so
    # paths containing spaces or shell metacharacters will break it. It is
    # kept as a shell call because CP_EXE may carry extra flags -- confirm.
    if os.path.exists(srcfile):
        os.system("%s %s %s" % (g_params['CP_EXE'], srcfile, dstfile))


def _WriteFigureLink(fpout, htmlname, imageFile, thumbImageFile):
    """Write an <a> element that shows the thumbnail and links to the full
    image; both are addressed relative to the htmlname directory."""
    print >> fpout, (
        "<a href=\"%s\" target=\"_blank\">" %
        (htmlname + os.sep + os.path.basename(imageFile)))
    print >> fpout, (
        "<img src=\"%s\">" %
        (htmlname + os.sep + os.path.basename(thumbImageFile)))
    print >> fpout, "</a>"


def WriteHTMLTable(
        tablename,
        tabletitle,
        dataTable,
        htmlname,
        outpath,
        fpout):  #{{{
    """Write a sortable HTML table with one row per Pfam family to fpout.

    A row is written only for families that have at least one pair in a
    comparison class other than "IDT". For every written row the MSA and
    tree figures (full size, thumbnail and, for the tree, PDF) are copied
    from g_params['msapath'] / g_params['treepath'] into outpath/htmlname
    when the source files exist.

    Arguments:
        tablename  : value of the <a name=...> anchor placed before the table
        tabletitle : text of the <h4> heading placed before the table
        dataTable  : mapping pfamid -> record; the keys read here are
                     'difftopopair' (mapping comparison class -> sized
                     collection of pairs) and 'set_seqid' (sized collection)
        htmlname   : name of the sub-directory of outpath that receives the
                     figures; also used as the prefix of the relative links
        outpath    : output directory; outpath/data is created if missing
        fpout      : output stream used as the target of "print >>"

    Reads from the module-level g_params: 'cmpClassList_mp3_cmpdup',
    'pfamid2seqidDict', 'hdl_topodb', 'pfamidDefDict', 'msapath',
    'treepath' and 'CP_EXE'.
    """
    import sys

    print >> fpout, "<a name=\"%s\"></a><h4>%s</h4>" % (tablename, tabletitle)
    print >> fpout, "<table class=\"sortable\" border=1>"

    # Create the data directory without going through the shell. Creation
    # stays best-effort, as the former "mkdir -p" call was: a failure is
    # reported on stderr but does not abort writing the table.
    targetpath = outpath + os.sep + "data"
    if not os.path.exists(targetpath):
        try:
            os.makedirs(targetpath)
        except OSError:
            if not os.path.isdir(targetpath):
                sys.stderr.write(
                    "Failed to create directory %s\n" % (targetpath))

    figurepath = outpath + os.sep + htmlname

    headerItemList = [
        "No.",
        "PfamID",
        "PfamDef",
        "NumSeq<br>All",
        "NumSeq<br>TPS_44",
        "NumSeq<br>Used",
        "Figure MSA",
        "Figure Tree",
        "Pair",
        "INV(#,%)",
        "Dup",
        "TM2GAP",
        "Mixed",
        "TM2SEQ",
        "TM2SP",
        "Pairwise alignment with different topology",
    ]

    print >> fpout, "<tr>"
    for item in headerItemList:
        print >> fpout, "<th>"
        print >> fpout, item
        print >> fpout, "</th>"
    print >> fpout, "</tr>"

    cntOutputID = 0
    for pfamid in dataTable.keys():
        # numPair counts the pairs of all comparison classes, numDiffPair
        # only those of classes other than "IDT".
        numPair = 0
        numDiffPair = 0
        for cmpclass in g_params['cmpClassList_mp3_cmpdup']:
            numPair_thisclass = _CountDiffTopoPairs(dataTable, pfamid,
                                                    cmpclass)
            numPair += numPair_thisclass
            if cmpclass != "IDT":
                numDiffPair += numPair_thisclass

        # Families without any non-"IDT" pair get no row.
        if numDiffPair <= 0:
            continue

        try:
            numSeqAll = len(g_params['pfamid2seqidDict'][pfamid])
        except KeyError:
            numSeqAll = 0

        # Sequences of this family that are also indexed in the topology db.
        try:
            seqwithtopo_idlist = list(
                set(g_params['pfamid2seqidDict'][pfamid])
                & set(g_params['hdl_topodb'].indexedIDList))
        except KeyError:
            seqwithtopo_idlist = []
        numSeqWithTopology = len(seqwithtopo_idlist)

        cntOutputID += 1
        print >> fpout, "<tr>"

        # 1. No ---------------------------
        _WriteHTMLCell(fpout, '%d' % (cntOutputID))

        # 2. PfamID ---------------------------
        pfamURL = 'http://pfam.sanger.ac.uk/family/' + pfamid
        _WriteHTMLCell(
            fpout,
            '<a href=\"%s\" target=\"_blank\">%s</a>' % (pfamURL, pfamid))

        # 3. PfamDef ---------------------------
        try:
            pfamdef = g_params['pfamidDefDict'][pfamid]
        except KeyError:
            pfamdef = ""
        _WriteHTMLCell(fpout, '%s' % (pfamdef))

        # 4. NumSeqAll ---------------------------
        _WriteHTMLCell(fpout, '%d' % (numSeqAll))

        # 5. NumSeq with topology ---------------------------
        _WriteHTMLCell(fpout, '%d' % (numSeqWithTopology))

        # 6. NumSeqUsed ---------------------------
        try:
            numSeqUsed = len(dataTable[pfamid]['set_seqid'])
        except KeyError:
            numSeqUsed = 0
        _WriteHTMLCell(fpout, '%d' % (numSeqUsed))

        # 7. Figure MSA ---------------------------
        ext = '.reordered.topomsa.png'
        print >> fpout, '<td>'
        imageSourceFile = g_params['msapath'] + os.sep + pfamid + ext
        # The figure data is prepared on demand when the MSA image is missing.
        if not os.path.exists(imageSourceFile):
            PrepareDataForTopoanaTMPro(pfamid, seqwithtopo_idlist,
                                       g_params['msapath'])
        imageTargetFile = figurepath + os.sep + pfamid + ext
        thumbImageSourceFile = (g_params['msapath'] + os.sep + 'thumb.' +
                                pfamid + ext)
        thumbImageTargetFile = figurepath + os.sep + 'thumb.' + pfamid + ext
        _CopyFileIfExists(imageSourceFile, imageTargetFile)
        _CopyFileIfExists(thumbImageSourceFile, thumbImageTargetFile)
        _WriteFigureLink(fpout, htmlname, imageTargetFile,
                         thumbImageTargetFile)
        print >> fpout, '</td>'

        # 8. Figure Tree ---------------------------
        ext = '-itol.jpg'
        extpdf = '-itol.pdf'
        print >> fpout, '<td>'
        imageSourceFile = g_params['treepath'] + os.sep + pfamid + ext
        imageSourceFilePDF = g_params['treepath'] + os.sep + pfamid + extpdf
        imageTargetFile = figurepath + os.sep + pfamid + ext
        imageTargetFilePDF = figurepath + os.sep + pfamid + extpdf
        thumbImageSourceFile = (g_params['treepath'] + os.sep + 'thumb.' +
                                pfamid + ext)
        thumbImageTargetFile = figurepath + os.sep + 'thumb.' + pfamid + ext
        _CopyFileIfExists(imageSourceFile, imageTargetFile)
        _CopyFileIfExists(imageSourceFilePDF, imageTargetFilePDF)
        _CopyFileIfExists(thumbImageSourceFile, thumbImageTargetFile)
        _WriteFigureLink(fpout, htmlname, imageTargetFile,
                         thumbImageTargetFile)
        print >> fpout, '</td>'

        # 9. numPair ---------------------------
        _WriteHTMLCell(fpout, '%d' % (numPair))

        # 10-15. INV, DUP, TM2GAP, Mixed, TM2SEQ, TM2SP ----------------------
        # One cell per comparison class after the first entry of the list
        # (presumably "IDT" -- verify against where g_params is filled).
        for cmpclass in g_params['cmpClassList_mp3_cmpdup'][1:]:
            nn = _CountDiffTopoPairs(dataTable, pfamid, cmpclass)
            if nn > 0:
                # FloatDivision yields a ratio; scale it so that the value
                # matches the "%" sign printed after it.
                ss1 = "%d, %.1f%%" % (
                    nn, myfunc.FloatDivision(nn, numPair) * 100)
            else:
                ss1 = "-"
            _WriteHTMLCell(fpout, '%s' % (ss1))

        # 16. Pairwise alignment with different topology ----------------------
        print >> fpout, '<td>'
        WriteSubTable(dataTable, pfamid, outpath, fpout)
        print >> fpout, '</td>'

        # Finish writing the row =====================================
        print >> fpout, "</tr>"
    print >> fpout, "</table>"