Exemplo n.º 1
0
def Fasta2SwissProt(fastaFile, ssPath, outPath, ssFormat):  #{{{
    """Write the sequences of a FASTA file as SwissProt-style records.

    The output file is ``<outdir>/<rootname>.swiss`` where ``rootname`` is the
    base name of ``fastaFile`` without its extension.  Records are written
    only when ``ssFormat`` is 0; for any other value the output file is
    created but left empty.

    Each record consists of an ``ID`` line, one ``FT`` line per secondary
    structure element returned by GetSSEList(), an ``SQ`` header, the sequence
    in lines of six 10-residue blocks, and a ``//`` terminator.

    Args:
        fastaFile: path of the input FASTA file.
        ssPath: directory with secondary-structure files, looked up as
            ``<ssPath>/<rootname>/<rootname>_<index>.ss2``.  "" disables the
            lookup, so no FT lines are written.
        outPath: output directory; "" means the directory of ``fastaFile``.
        ssFormat: format code, also forwarded to GetSSEList().

    Returns:
        The number of sequences read from ``fastaFile``.
    """
    rootname = os.path.basename(os.path.splitext(fastaFile)[0])
    inFilePath = os.path.dirname(fastaFile)
    if inFilePath == "":
        inFilePath = "./"
    (annotationList, seqList) = myfunc.ReadFasta_without_annotation(fastaFile)

    localOutPath = inFilePath if outPath == "" else outPath
    outFile = "%s/%s.swiss" % (localOutPath, rootname)

    # The context manager closes the file for every ssFormat value and also
    # when writing fails (previously it was closed only for ssFormat == 0).
    with open(outFile, "w") as fpout:
        if ssFormat == 0:
            for i, aaSeq in enumerate(seqList):
                seqLength = len(aaSeq)

                if DEBUG:
                    sys.stdout.write("%d:%s\n" % (i, annotationList[i]))

                sseList = []
                if ssPath != "":
                    ssFile = "%s/%s/%s_%d.ss2" % (ssPath, rootname, rootname,
                                                  i)
                    sseList = GetSSEList(ssFile, seqLength, ssFormat)

                # write out the result
                fpout.write("%-4s %s\n" % ("ID", annotationList[i]))
                for sse in sseList:
                    if sse['type'] == 'helix':
                        fpout.write("%-4s %s " % ("FT", "HELIX"))
                    elif sse['type'] == 'strand':
                        fpout.write("%-4s %s " % ("FT", "STRAND"))
                    # NOTE(review): for any other element type only the
                    # coordinates are written, without an FT prefix.
                    fpout.write("%d %d\n" % (sse['start'], sse['end']))

                fpout.write("%-4s SEQUENCE %d AA;\n" % ("SQ", seqLength))
                # 60 residues per line: a 5-column indent followed by up to
                # six space-separated blocks of 10.  An empty sequence writes
                # no sequence line at all, so the "//" terminator always
                # starts its own line.
                for lineStart in range(0, seqLength, 60):
                    lineSeq = aaSeq[lineStart:lineStart + 60]
                    blockList = [
                        lineSeq[k:k + 10] for k in range(0, len(lineSeq), 10)
                    ]
                    fpout.write("%5s%s\n" % ("", " ".join(blockList)))
                fpout.write("//\n")

    return len(seqList)
Exemplo n.º 2
0
def GetSeqDict(fastafile):  #{{{
    """Return a dict mapping each sequence ID in ``fastafile`` to its sequence.

    IDs and sequences come from myfunc.ReadFasta_without_annotation(); when an
    ID occurs more than once, the last sequence read for it is kept.
    """
    (seqIDs, sequences) = myfunc.ReadFasta_without_annotation(fastafile)
    return {seqid: sequences[pos] for pos, seqid in enumerate(seqIDs)}
Exemplo n.º 3
0
def main(g_params):  #{{{
    """Run calEvoDist.sh on every consecutive sequence pair of an alignment.

    Command line: ``[-outfile FILE] [-q] [--] pairalnFile``.  The alignment
    file is read with myfunc.ReadFasta_without_annotation(); records 2k and
    2k+1 form pair k.  Each pair is written to its own temporary FASTA file,
    ``<binpath>/calEvoDist.sh -f 1 <tmpfile>`` is run on it, and the temporary
    file is removed.  Pairs whose two sequences differ in length are reported
    on stderr and skipped.

    Args:
        g_params: dict of global parameters; ``-q`` sets ``isQuiet`` to True.

    Returns:
        1 on a usage or input error, otherwise None.
    """
    argv = sys.argv
    numArgv = len(argv)
    if numArgv < 2:
        PrintHelp()
        return 1

    pairalnFile = ""
    outfile = ""

    i = 1
    isNonOptionArg = False
    while i < numArgv:
        if isNonOptionArg:
            pairalnFile = argv[i]
            isNonOptionArg = False
            i += 1
        elif argv[i] == "--":
            isNonOptionArg = True
            i += 1
        elif argv[i].startswith("-"):
            if argv[i] in ["-h", "--help"]:
                PrintHelp()
                return 1
            elif argv[i] in ["-outfile", "--outfile"]:
                if i + 1 >= numArgv:
                    # a trailing option used to raise IndexError
                    sys.stderr.write(
                        "Error! Option %s requires an argument\n" % argv[i])
                    return 1
                outfile = argv[i + 1]
                i += 2
            elif argv[i] in ["-q"]:
                g_params['isQuiet'] = True
                i += 1
            else:
                sys.stderr.write("Error! Wrong argument: %s\n" % argv[i])
                return 1
        else:
            pairalnFile = argv[i]
            i += 1
    if pairalnFile == "":
        sys.stderr.write("pairalnFile not set\n")
        return 1
    elif not os.path.exists(pairalnFile):
        sys.stderr.write("pairalnFile %s does not exist\n" % pairalnFile)
        return 1

    # NOTE(review): nothing below writes to fpout; calEvoDist.sh prints to the
    # stdout it inherits from this process.  The handle is kept so that the
    # output file is still created as before.
    fpout = myfunc.myopen(outfile, sys.stdout, "w", False)
    try:
        (idList, seqList) = myfunc.ReadFasta_without_annotation(pairalnFile)
        numPair = len(idList) // 2
        for k in range(numPair):
            id1 = idList[2 * k]
            id2 = idList[2 * k + 1]
            seq1 = seqList[2 * k]
            seq2 = seqList[2 * k + 1]
            if len(seq1) != len(seq2):
                sys.stderr.write(
                    "Bad alignment, seq length conflicts, %d (%s) = %d (%s)\n"
                    % (len(seq1), id1, len(seq2), id2))
                continue
            # mkstemp creates the file atomically (mktemp only returned a
            # name), and a separate handle keeps fpout intact.
            (fd, tmpfile) = tempfile.mkstemp()
            try:
                with os.fdopen(fd, "w") as fptmp:
                    fptmp.write(">%s\n" % (id1))
                    fptmp.write("%s\n" % (seq1))
                    fptmp.write(">%s\n" % (id2))
                    fptmp.write("%s\n" % (seq2))
                cmd = "%s/calEvoDist.sh -f 1 %s" % (binpath, tmpfile)
                os.system(cmd)
            finally:
                os.remove(tmpfile)
    finally:
        myfunc.myclose(fpout)
Exemplo n.º 4
0
def GetTopoDict(topofile):  #{{{
    """Build a lookup table from sequence ID to topology string.

    The IDs and topologies are read from ``topofile`` with
    myfunc.ReadFasta_without_annotation(); a repeated ID keeps the topology
    that appears last.
    """
    (seqIDs, topologies) = myfunc.ReadFasta_without_annotation(topofile)
    return dict((seqIDs[pos], topologies[pos]) for pos in range(len(seqIDs)))
Exemplo n.º 5
0
def main(g_params):  #{{{
    """Write a Pfam colour definition for the sequences of a topology file.

    Command line: ``[-seqid2pfamid FILE] [-o OUTFILE] [-q] [--] infile``.
    The sequence IDs are read from ``infile``, the ``Consensus`` entry (if
    any) is dropped, and WritePfamColorDef() is called with the IDs, the
    seqid-to-pfamid map and the output handle.

    Args:
        g_params: dict of global parameters; ``-q`` sets ``isQuiet`` to True.

    Returns:
        0 on success, 1 on a usage or input error.
    """
    argv = sys.argv
    numArgv = len(argv)
    if numArgv < 2:
        PrintHelp()
        return 1
    infile = ""
    outfile = ""
    seqid2pfamidFile = datadir3 + os.sep + "wk/MPTopo/pfamAna_refpro/pfammap_from_uniprot/refpro20120604-celluar.selmaxlength-m1.nr100.seqid2pfamid"

    i = 1
    isNonOptionArg = False
    while i < numArgv:
        if isNonOptionArg:
            infile = argv[i]
            isNonOptionArg = False
            i += 1
        elif argv[i] == "--":
            isNonOptionArg = True
            i += 1
        elif argv[i].startswith("-"):
            if argv[i] in ["-h", "--help"]:
                PrintHelp()
                return 1
            elif argv[i] in ["-seqid2pfamid", "--seqid2pfamid", "-o", "--o"]:
                if i + 1 >= numArgv:
                    # a trailing option used to raise IndexError
                    print("Error! Option %s requires an argument" % argv[i],
                          file=sys.stderr)
                    return 1
                if argv[i] in ["-o", "--o"]:
                    outfile = argv[i + 1]
                else:
                    seqid2pfamidFile = argv[i + 1]
                i += 2
            elif argv[i] in ["-q"]:
                g_params['isQuiet'] = True
                i += 1
            else:
                print("Error! Wrong argument:", argv[i], file=sys.stderr)
                return 1
        else:
            infile = argv[i]
            i += 1
    if infile == "" or not os.path.exists(infile):
        print("Error. Infile not set. exit", file=sys.stderr)
        return 1
    if seqid2pfamidFile == "" or not os.path.exists(seqid2pfamidFile):
        print("Error. seqid2pfamidFile does not exist. exit", file=sys.stderr)
        return 1
    seqid2pfamidDict = myfunc.ReadFam2SeqidMap(seqid2pfamidFile)
    # Check the loaded map; the original compared the file *name* with {},
    # which can never be true.
    if not seqid2pfamidDict:
        print("Read seqid2pfamidFile failed.", file=sys.stderr)
        return 1

    (idList, topoList) = myfunc.ReadFasta_without_annotation(infile)
    if len(idList) < 1:
        print("Read infile failed.", file=sys.stderr)
        return 1

    # list.remove() raises ValueError when the entry is missing, so only drop
    # the consensus row if the file actually has one.
    if "Consensus" in idList:
        idList.remove("Consensus")

    fpout = myfunc.myopen(outfile, sys.stdout, "w", False)
    try:
        WritePfamColorDef(idList, seqid2pfamidDict, fpout)
    finally:
        myfunc.myclose(fpout)

    return 0
Exemplo n.º 6
0
def main(g_params):#{{{
    """Parse the command line and run RunHHSearchPairwise().

    Command line: ``[-outpath DIR] [-hhprofile DIR]... [-hhsearch DIR]...
    [-dupfile FILE] [-topofile FILE] [-l FILE] [-q] [-overwrite] [--]
    tableinfoFile``.  At least one ``-hhprofile`` directory is required.  For
    every hhprofile / hhsearch directory the index ``id2pathmap.txt`` inside
    it is loaded with ReadSeqPathMapDict(); a missing index is reported and
    replaced by an empty dict.

    Args:
        g_params: dict of global parameters; updated keys are ``isQuiet``,
            ``isForceOverWrite`` and ``isUsePreBuildHHSearchResult``.

    Returns:
        0 on success, 1 on a usage or input error.
    """
    argv = sys.argv
    numArgv = len(argv)
    if numArgv < 2:
        PrintHelp()
        return 1

    outpath = "./"
    tableinfoFile = ""
    hhprofilepathList = []
    hhsearchpathList = []
    dupfile = ""
    topofile = ""
# /data3/wk/MPTopo/pfamAna_refpro/pred_topcons_single_method4/refpro20120604-celluar.selmaxlength-m1.topcons-single_topcons_single.m1.agree-44.topo

    i = 1
    isNonOptionArg = False
    while i < numArgv:
        if isNonOptionArg:
            tableinfoFile = argv[i]
            isNonOptionArg = False
            i += 1
        elif argv[i] == "--":
            isNonOptionArg = True
            i += 1
        elif argv[i].startswith("-"):
            if argv[i] in ["-h", "--help"]:
                PrintHelp()
                return 1
            elif argv[i] in ["-outpath", "--outpath"]:
                (outpath, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-hhprofile", "--hhprofile", "-hhprofile1", "--hhprofile1"]:
                (ss, i) = myfunc.my_getopt_str(argv, i)
                hhprofilepathList.append(ss)
            elif argv[i] in ["-hhsearch", "--hhsearch"]:
                (ss, i) = myfunc.my_getopt_str(argv, i)
                hhsearchpathList.append(ss)
            elif argv[i] in ["-dupfile", "--dupfile"]:
                (dupfile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-topofile", "--topofile"]:
                (topofile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-l", "--l"]:
                (tableinfoFile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-q", "--q"]:
                g_params['isQuiet'] = True
                i += 1
            elif argv[i] in ["-overwrite", "-forcewrite", "--forcewrite"]:
                g_params['isForceOverWrite'] = True
                i += 1
            else:
                sys.stderr.write("Error! Wrong argument: %s\n" % argv[i])
                return 1
        else:
            tableinfoFile = argv[i]
            i += 1

    if tableinfoFile == "":
        sys.stderr.write("tableinfoFile not set. exit\n")
        return 1
    if len(hhprofilepathList) < 1:
        sys.stderr.write("hhprofilepath not set. exit\n")
        return 1
    if not os.path.exists(outpath):
        # os.makedirs raises on failure, so the error has to be caught here;
        # re-testing os.path.exists afterwards could never report it.
        try:
            os.makedirs(outpath)
        except OSError:
            sys.stderr.write("failed to created outpath %s\n" % (outpath))
            return 1

    topoDict = {}
    if topofile != "":
        (idList, topoList) = myfunc.ReadFasta_without_annotation(topofile)
        topoDict = dict(zip(idList, topoList))

    if dupfile != "" and topoDict == {}:
        sys.stderr.write("Error! topoDict is empty when dupfile "
                         "is set. Exit\n")
        return 1

# read in hhprofile dict
    hhprofilepathMapDictList = []
    for hhprofilepath in hhprofilepathList:
        hhprofilemapfile = hhprofilepath + os.sep + "id2pathmap.txt"
        if not os.path.exists(hhprofilemapfile):
            # NOTE(review): the message says "exit" but processing continues
            # with an empty map for this directory.
            sys.stderr.write("hhprofilemapfile not exist. exit\n")
            hhprofilepathMapDictList.append({})
        else:
            hhprofilepathMapDictList.append(
                ReadSeqPathMapDict(hhprofilemapfile))

# read in index dictionary for hhsearch result file
    hhsearchpathMapDictList = []
    if len(hhsearchpathList) > 0:
        g_params['isUsePreBuildHHSearchResult'] = True
        for hhsearchpath in hhsearchpathList:
            hhsearchmapfile = hhsearchpath + os.sep + "id2pathmap.txt"
            if not os.path.exists(hhsearchmapfile):
                sys.stderr.write("hhsearchmapfile not exist. exit\n")
                hhsearchpathMapDictList.append({})
            else:
                hhsearchpathMapDictList.append(
                    ReadSeqPathMapDict(hhsearchmapfile))

    RunHHSearchPairwise(tableinfoFile,
            hhprofilepathList, hhprofilepathMapDictList,
            hhsearchpathList, hhsearchpathMapDictList, topoDict,
            outpath, dupfile)

    return 0


def GetNtermState(topo):  #{{{
    """Return the first non-gap character of a topology string.

    Leading characters equal to the module-level ``GAP`` are skipped.

    Args:
        topo: topology string, possibly starting with gap characters.

    Returns:
        The first character that is not a gap, or None when ``topo`` is empty
        or consists of gaps only.
    """
    if not topo:
        # indexing topo[0] on an empty string would raise IndexError
        return None
    if topo[0] != GAP:
        return topo[0]
    stripped = topo.lstrip(GAP)
    if stripped != "":
        return stripped[0]
    return None


#}}}

# Load the IDs and the aligned strings of the input file.
(idList, seqList) = myfunc.ReadFasta_without_annotation(infile)

# write out taxdef: one "<id>,<color>" line per entry except the Consensus
# row; the color is blue when the N-terminal state is 'o' and red otherwise.
# A single empty line ends the listing.
numSeq = len(idList)
for i, gid in enumerate(idList):
    if gid == 'Consensus':
        continue
    color = red
    NtermState = GetNtermState(seqList[i])
    if NtermState == 'o':
        color = blue
    sys.stdout.write("%s,%s\n" % (gid, color))
sys.stdout.write("\n")
Exemplo n.º 8
0
def main(g_params):  #{{{
    """Draw pairwise topology comparisons for a pairwise sequence alignment.

    Command line: ``[-outpath DIR] [-topofile FILE] [-cmpclass CLASS]...
    [-q] [--] pairseqAlnFile``.

    Steps:
      1. write the ungapped sequences to ``<outpath>/<rootname>.seqdb.fa`` and
         index the file with ``<binpath>/indexfasta.py``;
      2. run ``<binpath>/seqpairaln_to_topopaircmp.sh`` on the alignment;
      3. read ``<outpath>/<rootname>.paircmp`` with ReadPaircmpCmpclass() and
         call DrawPairwiseTopo() on ``<outpath>/<rootname>.topoaln.fa``.

    Args:
        g_params: dict of global parameters; ``outpath`` and ``cmpclassList``
            are stored in it.

    Returns:
        0 on success, 1 on an input error, -1 on an unknown option.
    """
    argv = sys.argv
    numArgv = len(argv)
    if numArgv < 2:
        PrintHelp()
        return 1

    outpath = "./"
    pairseqAlnFile = ''
    cmpclassList = []
    topofile = ""

    optionsWithValue = [
        "-outpath", "--outpath", "-topofile", "--topofile", "-cmpclass",
        "--cmpclass"
    ]

    i = 1
    isNonOptionArg = False
    while i < numArgv:
        if isNonOptionArg:
            pairseqAlnFile = argv[i]
            isNonOptionArg = False
            i += 1
        elif argv[i] == "--":
            isNonOptionArg = True
            i += 1
        elif argv[i].startswith("-"):
            if argv[i] in ["-h", "--help"]:
                PrintHelp()
                return 1
            elif argv[i] in optionsWithValue:
                if i + 1 >= numArgv:
                    # a trailing option used to raise IndexError
                    sys.stderr.write(
                        "Error! Option %s requires an argument\n" % argv[i])
                    return 1
                if argv[i] in ["-outpath", "--outpath"]:
                    outpath = argv[i + 1]
                elif argv[i] in ["-topofile", "--topofile"]:
                    topofile = argv[i + 1]
                else:
                    cmpclassList.append(argv[i + 1])
                i += 2
            elif argv[i] == "-q":
                # accepted for compatibility; the flag has no effect here
                i += 1
            else:
                sys.stderr.write("Error! Wrong argument: %s\n" % argv[i])
                return -1
        else:
            pairseqAlnFile = argv[i]
            i += 1
    g_params['outpath'] = outpath
    g_params['cmpclassList'] = cmpclassList

    # Validate every input before anything is written to outpath.
    if pairseqAlnFile == "":
        sys.stderr.write("pairseqAlnFile not set. Exit.\n")
        return 1
    if not os.path.exists(pairseqAlnFile):
        sys.stderr.write("pairseqAlnFile %s does not exists. Exit.\n" %
                         pairseqAlnFile)
        return 1
    if not os.path.exists(topofile):
        sys.stderr.write("topofile %s not exist. exit.\n" % topofile)
        return 1

    rootname = os.path.basename(os.path.splitext(pairseqAlnFile)[0])
    # Read in aaSeqDict
    sys.stdout.write("Read in aaSeqDict\n")
    # os.makedirs instead of a shell "mkdir -p", which broke on paths with
    # spaces or shell metacharacters.
    if not os.path.isdir(outpath):
        os.makedirs(outpath)
    (idList, seqList) = myfunc.ReadFasta_without_annotation(pairseqAlnFile)
    # strip the alignment gaps once; used for both the seqdb and aaSeqDict
    ungappedSeqList = [seq.replace('-', '') for seq in seqList]

    # create seqdbfile
    seqdbfile = outpath + os.sep + rootname + ".seqdb.fa"
    with open(seqdbfile, "w") as fo:
        for (seqid, seq) in zip(idList, ungappedSeqList):
            fo.write(">%s\n" % (seqid))
            fo.write("%s\n" % (seq))
    cmd = "%s/indexfasta.py %s"
    os.system(cmd % (binpath, seqdbfile))
    seqdbname = outpath + os.sep + rootname + ".seqdb"

    aaSeqDict = dict(zip(idList, ungappedSeqList))

    # Output pairwise topology comparison
    cmd = "%s/seqpairaln_to_topopaircmp.sh %s -outpath %s -topofile %s -seqdb %s" % (
        binpath, pairseqAlnFile, outpath, topofile, seqdbname)
    sys.stdout.write("Output paircmp file to %s\n" % outpath)
    os.system(cmd)
    paircmpFile = outpath + os.sep + rootname + '.paircmp'
    pairtopoAlnFile = outpath + os.sep + rootname + '.topoaln.fa'
    # Read in paircmp file
    pairCmpclassDict = ReadPaircmpCmpclass(paircmpFile)

    sys.stdout.write("Draw pairwise topology comparison ...\n")
    DrawPairwiseTopo(pairtopoAlnFile, aaSeqDict, pairCmpclassDict, outpath)

    return 0
def main(g_params):#{{{
    """Write a taxonomy colour definition for the sequences of a topology file.

    Command line: ``[-tableinfo FILE] [-o OUTFILE] [-q] [--] infile``.  The
    sequence IDs are read from ``infile``, the ``Consensus`` entry (if any)
    is dropped, and WriteTaxoColor() is called with the IDs, the table read
    by ReadUniprotInfoTable() and the output handle.

    Args:
        g_params: dict of global parameters; ``-q`` sets ``isQuiet`` to True.

    Returns:
        0 on success, 1 on a usage or input error.
    """
    argv = sys.argv
    numArgv = len(argv)
    if numArgv < 2:
        PrintHelp()
        return 1
    infile = ""
    outfile = ""
    tableinfoFile = datadir + os.sep + "uniprot/reference_proteome/refpro20120604-celluar.selmaxlength-m1.nr100.tableinfo"

    i = 1
    isNonOptionArg = False
    while i < numArgv:
        if isNonOptionArg:
            infile = argv[i]
            isNonOptionArg = False
            i += 1
        elif argv[i] == "--":
            isNonOptionArg = True
            i += 1
        elif argv[i].startswith("-"):
            if argv[i] in ["-h", "--help"]:
                PrintHelp()
                return 1
            elif argv[i] in ["-tableinfo", "--tableinfo", "-o", "--o"]:
                if i + 1 >= numArgv:
                    # a trailing option used to raise IndexError
                    sys.stderr.write(
                        "Error! Option %s requires an argument\n" % argv[i])
                    return 1
                if argv[i] in ["-o", "--o"]:
                    outfile = argv[i + 1]
                else:
                    tableinfoFile = argv[i + 1]
                i += 2
            elif argv[i] in ["-q"]:
                g_params['isQuiet'] = True
                i += 1
            else:
                sys.stderr.write("Error! Wrong argument: %s\n" % argv[i])
                return 1
        else:
            infile = argv[i]
            i += 1
    if infile == "" or not os.path.exists(infile):
        sys.stderr.write("Error. Infile not set. exit\n")
        return 1
    if tableinfoFile == "" or not os.path.exists(tableinfoFile):
        sys.stderr.write("Error. tableinfoFile %s does not exist. exit\n" %
                         (tableinfoFile))
        return 1
    seqid2TaxoDict = ReadUniprotInfoTable(tableinfoFile)
    # Check the loaded table; the original compared the file *name* with {},
    # which can never be true.
    if not seqid2TaxoDict:
        sys.stderr.write("Read tableinfoFile failed.\n")
        return 1

    (idList, topoList) = myfunc.ReadFasta_without_annotation(infile)
    if len(idList) < 1:
        sys.stderr.write("Read infile failed.\n")
        return 1

    # list.remove() raises ValueError when the entry is missing, so only drop
    # the consensus row if the file actually has one.
    if "Consensus" in idList:
        idList.remove("Consensus")

    fpout = myfunc.myopen(outfile, sys.stdout, "w", False)
    try:
        WriteTaxoColor(idList, seqid2TaxoDict, fpout)
    finally:
        myfunc.myclose(fpout)

    return 0
Exemplo n.º 10
0
def main():  #{{{
    """Scratch pad of ad-hoc experiments for myfunc and related helpers.

    Every experiment is a section guarded by a literal ``if 0:`` / ``if 1:``
    switch; a section is enabled by editing its guard.  As written, only the
    last section (``if 1:``) runs: it invokes external commands through
    ``subprocess`` with argument lists instead of a shell string.

    Most sections read their input path from ``sys.argv[1]``.

    Returns:
        None when the enabled section finishes; some disabled sections
        return early (``return 1`` / ``return``) when a reader fails to
        initialise.
    """
    # compare two topology strings gaplessly
    if 0:  #{{{
        strTop1 = "---MMMM-----i-i-i---MMM----MMMM-ooo"
        strTop2 = "----MMMM-----i-ii-----MMM---MMM--oo"
        strProtein1 = "id1"
        strProtein2 = "id2"
        fpLog = sys.stdout
        class_gapless, num1_gapless, num2_gapless = ct.CompareToposGaplesslyNew(
            strTop1, strTop2, strProtein1, strProtein2, fpLog)
        # Note: calling the int, float, string will not change their original value
        # calling the dict, list will change their original value
        print "strTop1:", strTop1
        print "strTop2:", strTop2
#}}}
    # print the name of the current function and of this file
    if 0:  #{{{
        PrintFuncName()
        print("this file name is: %s" % __file__)
#}}}
    # read a whole file with readlines()
    if 0:  #{{{
        # filename="/nanjiang/data/blastdb/uniprot_KW181_idt50.fasta"
        filename = sys.argv[1]
        print filename
        fp = open(filename, "r")
        lines = fp.readlines()
        fp.close()
#}}}
    # read a file in fixed-size blocks
    if 0:  #{{{
        # filename="/nanjiang/data/blastdb/uniprot_KW181_idt50.fasta"
        filename = sys.argv[1]
        print filename
        BLOCK_SIZE = 100000
        fp = open(filename, "r")
        buff = fp.read(BLOCK_SIZE)
        while buff:
            buff = fp.read(BLOCK_SIZE)
        fp.close()
#}}}
    # read a file line by line with readline()
    if 0:  #{{{
        # filename="/nanjiang/data/blastdb/uniprot_KW181_idt50.fasta"
        filename = sys.argv[1]
        print filename
        fp = open(filename, "r")
        line = fp.readline()
        while line:
            line = fp.readline()
        fp.close()
        #}}}
    # parse FASTA records block by block with ReadFastaFromBuffer and echo them
    if 0:  #{{{
        try:
            BLOCK_SIZE = 100000
            infile = sys.argv[1]
            fpin = open(infile, 'rb')
            unprocessedBuffer = ""
            isEOFreached = False
            while 1:
                buff = fpin.read(BLOCK_SIZE)
                # a short read means the end of the file has been reached
                if len(buff) < BLOCK_SIZE:
                    isEOFreached = True
                buff = unprocessedBuffer + buff
                recordList = []
                unprocessedBuffer = myfunc.ReadFastaFromBuffer(
                    buff, recordList, isEOFreached)
                if len(recordList) > 0:
                    for record in recordList:
                        sys.stdout.write(">%s\n" % record[1])
                        sys.stdout.write("%s\n" % record[2])
                if isEOFreached == True:
                    break
            fpin.close()
        except IOError:
            raise
            #}}}
    # read a FASTA file with ReadFasta_without_id and echo it
    if 0:  #{{{
        try:
            infile = sys.argv[1]
            (annoList, seqList) = myfunc.ReadFasta_without_id(infile)
            for i in xrange(len(seqList)):
                sys.stdout.write(">%s\n" % annoList[i])
                sys.stdout.write("%s\n" % seqList[i])
        except IOError:
            raise
            #}}}
    # test IsDuplicatedByHHSearch on a fixed result file
    if 0:  #{{{
        hhrfile = "hhsearch/A1RZ92-Q74DY9.hhr"
        if IsDuplicatedByHHSearch(hhrfile):
            print "yes"

#}}}
    # test GetAlignmentFactorFromPairAlignment; the second seq1/seq2
    # assignment overrides the first, so only the second pair is used
    if 0:  #{{{
        import pairlistwithfamid2pairaln_by_msa
        seq1 = "--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------MLSSTATTMLRAGVSRSSGALQPMLLRSAACPCSPFSMNTKLSQPTSV-----RPLSTSPSALVLRFRAQQQAQLAQQQLRRASSSSSSSSSSTRPRSDAELDANAAEAAAAAQSAAHAGEPVLDWNTFFKLRKTRRRVQLAFSVIMTLITSGAGGAVLSTGVADAMVAQVPLEPMFAVGLMTASFGALGWLMGPAMGGMVFNALKSKYRGQMEIKEGQFFARIKKHRVDPSASSMGNPVPDFYGEKISSVAGYRQWLKDQRAFNKKRTTFV"
        seq2 = "MDILLAVLEQGFIFSIVCFGVYITYKILDFPDLSVDGTFPLGAAVAAAFLVKGYSPVLSSLAALVAGAIAGGITGILHVKFKITNLLSGILVMVGLYSINLRIMGKSNIPLFNKIHLFSDTMNPIIIITVFLLICKITLDLFLKTKAGFILKATGDNEQLVLSLGVNKDLVKIMGLMLSNALVALGGALMAQYQGFSDVGMGTGIVVMGLASVIIGESLFGRIKALNATTRVLLGALVYKLSVSI---ALTVGLAP-------TDLKLVTAIIVVIALSLNKNPLKIITKQKTKEGGIL------NASNTKSAQSVQ-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------"
        seq1 = "---------------------------------------------------------------------------------------------------------------------------------------MALSSLFFTASALLLMFLAFLGGARNSNPLDRIYWLEAATGNIPGAPALSRWTYWNLCAVNSEGHNECGKSYPDYPFDPPSHRNFNTHVNIPAAFIGTRHYFLTSRFMFPFHIIALFFATCSLLTGFLAMCTRIGNWVSAFSAYFALTFQTITTCLMTAVYVQGRDKFNNNGQSSHLGVKAFAFMWTSVALLFLSCVIYCMGGAVGRKDGGYSGREQRRRGFFNSHRSGSLRSNKETAP"
        seq2 = "MRKIAAIGGIVFISFILTIVAMFTKLWISWSIGKFSYGIGIVPYHSNSAGWFTAASWMVFISFGLFIPLILVVLFTAYKVHHDGCCHSIRHCFNSICLICSIIAVLEIIAFVLMAVNASRYVKGASISEKKSLLQLGSSAYLDLVSAILIIVATVLSGHASHHDCH----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------"
        alignFactor = pairlistwithfamid2pairaln_by_msa.GetAlignmentFactorFromPairAlignment(
            seq1, seq2)
        print alignFactor
#}}}
    # fetch one record from a MyDB database and split it into its parts
    if 0:  #{{{
        try:
            dbname = sys.argv[1]
            print dbname
            from myfunc import MyDB
            cls = MyDB(dbname)
            #            print cls.idList
            record = cls.GetRecord("A0FGX9")
            if record:
                print record
                #             for rd in  cls.GetAllRecord():
                #                 print rd
                (seqid, anno, seq) = myfunc.ExtractFromSeqWithAnno(record)
                print(seqid, anno, seq)
        except IndexError:
            pass

#}}}
    # fetch the records listed in an ID file (sys.argv[2]) from a MyDB database
    if 0:  #{{{
        import my_extractdb
        # mimicking my_extractdb.py to see which one is faster
        try:
            dbname = sys.argv[1]
            idlistfile = sys.argv[2]
            cls = myfunc.MyDB(dbname)
            if cls.failure:
                print >> sys.stderr, "MyDB init failed"
            else:
                idlist = open(idlistfile, "r").read().split("\n")
                fpout = sys.stdout
                for seqid in idlist:
                    if seqid:
                        record = cls.GetRecord(seqid)
                        fpout.write(record)
            #             for rd in  cls.GetAllRecord():
            #                 print rd
#                (seqid, anno, seq) = myfunc.ExtractFromSeqWithAnno(record)
#                print (seqid, anno, seq)
        except IndexError:
            print "error"
            pass
#}}}
    if 0:  #{{{ #test ReadLineByBlock
        try:
            infile = sys.argv[1]
            from myfunc import ReadLineByBlock
            cls = ReadLineByBlock(infile)
            lines = cls.readlines()
            while lines != None:
                for line in lines:
                    print line
                lines = cls.readlines()

        except IndexError:
            pass
#}}}
    if 0:  #{{{ #test speed of ReadLineByBlock
        # ReadLineByBlock is about 3 times faster than file.readline()
        try:
            from myfunc import ReadLineByBlock
            infile = sys.argv[1]

            start = time.time()
            hdl = ReadLineByBlock(infile)
            lines = hdl.readlines()
            while lines != None:
                lines = hdl.readlines()
            hdl.close()
            end = time.time()
            msg = "Reading %s by ReadLineByBlock costs %.3fs seconds"
            print msg % (infile, (end - start))

            start = time.time()
            hdl = open(infile, "r")
            line = hdl.readline()
            while line:
                line = hdl.readline()
            hdl.close()
            end = time.time()
            msg = "Reading %s by readline() costs %.3fs seconds"
            print msg % (infile, (end - start))

        except IndexError:
            pass
#}}}
    if 0:  #{{{ #test readline
        try:
            infile = sys.argv[1]
            fp = open(infile, "r")
            line = fp.readline()
            while line:
                print line
                line = fp.readline()
            fp.close()
        except IndexError:
            pass
#}}}
    if 0:  #{{{ #test the speed of GetFirstWord
        try:
            nloop = int(sys.argv[1])
            string = "kjdafk jasdfj j"
            #string = "askdf askdf "
            #            string = "kajsdfasdfsdfjakasjdfka"
            #            string = "kajsdfasdf,sdfjakasjdfka"
            delimiter = " \t\r,.\n"
            delimiter = " "
            for i in xrange(nloop):
                #firstword = myfunc.GetFirstWord(string, delimiter)
                #firstword = string.split()[0]
                #firstword = string.partition(" ")[0]
                firstword = myfunc.GetFirstWord(string)
                #pass
                #print firstword
        except (IndexError, ValueError):
            pass
#}}}
    # time four ways of reading the same FASTA file
    if 0:  #{{{ # read seq by SeqIO
        from Bio import SeqIO
        try:
            seqfile = sys.argv[1]
            # 1. SeqIO ####################
            start = time.time()
            handle = open(seqfile, "rU")
            cnt = 0
            for record in SeqIO.parse(handle, "fasta"):
                cnt += 1
            handle.close()
            end = time.time()
            msg = "Reading %d sequences by SeqIO costs %.3fs seconds"
            print msg % (cnt, (end - start))

            # 2. ReadFasta ####################
            start = time.time()
            seqfile = sys.argv[1]
            (idList, annoList, seqList) = myfunc.ReadFasta(seqfile)
            end = time.time()
            msg = "Reading %d sequences by ReadFasta costs %.3fs seconds"
            print msg % (len(idList), (end - start))

            # 3. ReadFasta from buffer
            BLOCK_SIZE = 100000
            start = time.time()
            cnt = 0
            fpin = open(seqfile, 'rb')
            unprocessedBuffer = ""
            isEOFreached = False
            while 1:
                buff = fpin.read(BLOCK_SIZE)
                if len(buff) < BLOCK_SIZE:
                    isEOFreached = True
                buff = unprocessedBuffer + buff
                recordList = []
                unprocessedBuffer = myfunc.ReadFastaFromBuffer(
                    buff, recordList, isEOFreached)
                cnt += len(recordList)
                if isEOFreached == True:
                    break
            fpin.close()
            end = time.time()
            msg = "Reading %d sequences by ReadFastaFromBuffer costs %.3fs seconds"
            print msg % (cnt, (end - start))

            # 4. ReadFastaByBlock ####################
            start = time.time()
            seqfile = sys.argv[1]
            hdl = myfunc.ReadFastaByBlock(seqfile, 0, 0)
            if hdl.failure:
                print >> sys.stderr, "Failed to init ReadFastaByBlock"
                return 1
            recordList = hdl.readseq()
            cnt = 0
            while recordList != None:
                cnt += len(recordList)
                #                 for rd in recordList:
                #                     print ">%s"%rd.description
                #                     print rd.seq
                recordList = hdl.readseq()
            hdl.close()
            end = time.time()
            msg = "Reading %d sequences by ReadFastaByBlock costs %.3fs seconds"
            print msg % (cnt, (end - start))
        except (IndexError, ValueError):
            pass
#}}}
    # run and time RemoveUnnecessaryGap_old and RemoveUnnecessaryGap on the
    # same input; timings go to stderr, the resulting sequences to stdout
    if 0:  #{{{ #test RemoveUnnecessaryGap
        try:
            infile = sys.argv[1]
            start = time.time()
            (idList, seqList) = myfunc.ReadFasta_without_annotation(infile)
            seqList = lcmp.RemoveUnnecessaryGap_old(seqList)
            end = time.time()
            msg = "Run RemoveUnnecessaryGap_old for %s costs %.3fs seconds"
            print >> sys.stderr, msg % (infile, (end - start))
            for seq in seqList:
                print seq

            start = time.time()
            (idList, seqList) = myfunc.ReadFasta_without_annotation(infile)

            seqList = lcmp.RemoveUnnecessaryGap(seqList)
            end = time.time()
            msg = "Run RemoveUnnecessaryGap for %s costs %.3fs seconds"
            print >> sys.stderr, msg % (infile, (end - start))
            for seq in seqList:
                print seq

        except IndexError:
            pass
#}}}
    if 0:  #{{{ #test ReadMPAByBlock
        try:
            infile = sys.argv[1]
            hdl = myfunc.ReadMPAByBlock(infile)
            if hdl.failure:
                return
            recordList = hdl.readseq()
            while recordList != None:
                for rd in recordList:
                    #print rd.seqid
                    print ">%s" % (rd.description)
                    print "%s" % (myfunc.mpa2seq(rd.mpa))
                recordList = hdl.readseq()
            hdl.close()
        except IndexError:
            pass
#}}}
    # same MyDB lookup as the earlier MyDB section above
    if 0:  #{{{
        try:
            dbname = sys.argv[1]
            print dbname
            from myfunc import MyDB
            cls = MyDB(dbname)
            #            print cls.idList
            record = cls.GetRecord("A0FGX9")
            if record:
                print record
                #             for rd in  cls.GetAllRecord():
                #                 print rd
                (seqid, anno, seq) = myfunc.ExtractFromSeqWithAnno(record)
                print(seqid, anno, seq)
        except IndexError:
            pass

#}}}
    if 0:  #{{{ #test subprocess
        import glob
        #invoke shell explicitly, not very good, may have security problems
        subprocess.call("seq 10", shell=True)
        subprocess.call("echo wait for 2 seconds...; sleep 2", shell=True)
        subprocess.call("ls topo*.py", shell=True)
    # the only enabled section
    if 1:  #{{{ #test subprocess
        import glob
        #invoke shell implicitly, recommended way
        subprocess.call(["seq", "10"], shell=False)
        subprocess.call(["echo", "wait for 1 seconds..."])
        subprocess.call(["sleep", "1"])
        try:
            # no shell is involved, so "topo*.py" reaches ls as a literal
            # name instead of being expanded as a pattern
            print subprocess.check_call(["ls",
                                         "topo*.py"])  #This will not work
        except subprocess.CalledProcessError, e:
            print "error message:", e
        # expand the pattern in Python and pass the matches as arguments
        subprocess.call(["ls"] + glob.glob("topo*.py"))
Exemplo n.º 11
0
def main(g_params):  #{{{
    """Analyse topology alignments against local pairwise alignments.

    Command line: ``[-o OUTFILE] [-localali FILE] [-q] [--] topoalnfile``.
    Records 2k and 2k+1 of the local alignment file form pair k.  Pairs for
    which GetUnAlignedString() returns a non-empty string are collected in a
    dict keyed by ``(id1, id2)`` and passed to AnaLocalTopoAln() together
    with the topology alignment and two output handles (``OUTFILE`` and
    ``OUTFILE.1``).

    Args:
        g_params: dict of global parameters; ``-q`` sets ``isQuiet`` to True.

    Returns:
        1 on a usage or input error, otherwise None.
    """
    argv = sys.argv
    numArgv = len(argv)
    if numArgv < 2:
        PrintHelp()
        return 1

    outfile = ""
    topoalnfile = ""
    localalifile = ""

    i = 1
    isNonOptionArg = False
    while i < numArgv:
        if isNonOptionArg:
            topoalnfile = argv[i]
            isNonOptionArg = False
            i += 1
        elif argv[i] == "--":
            isNonOptionArg = True
            i += 1
        elif argv[i].startswith("-"):
            if argv[i] in ["-h", "--help"]:
                PrintHelp()
                return 1
            elif argv[i] in ["-o", "--o", "-outfile"]:
                (outfile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-localali", "--localali"]:
                (localalifile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-q", "--q"]:
                g_params['isQuiet'] = True
                i += 1
            else:
                sys.stderr.write("Error! Wrong argument: %s\n" % argv[i])
                return 1
        else:
            topoalnfile = argv[i]
            i += 1

    if myfunc.checkfile(topoalnfile) != 0:
        return 1
    if myfunc.checkfile(localalifile) != 0:
        return 1

    (idList, topoList) = myfunc.ReadFasta_without_annotation(topoalnfile)
    (idListLocal,
     seqListLocal) = myfunc.ReadFasta_without_annotation(localalifile)
    # floor division: "/" yields a float on Python 3
    numpairLocal = len(idListLocal) // 2
    localseqpairDict = {}
    for k in range(numpairLocal):
        id1 = idListLocal[2 * k]
        id2 = idListLocal[2 * k + 1]
        seq1 = seqListLocal[2 * k]
        seq2 = seqListLocal[2 * k + 1]
        unaligned_str = GetUnAlignedString(seq1, seq2)
        if unaligned_str != "":
            localseqpairDict[(id1, id2)] = [seq1, seq2, unaligned_str]
    del idListLocal, seqListLocal

    # outfile1 was left undefined when no output file was given, which raised
    # NameError below.  An empty name is passed instead, as is already done
    # for outfile (presumably myopen then falls back to the default handle;
    # verify against myfunc.myopen).
    outfile1 = ""
    if outfile != "":
        outfile1 = outfile + ".1"
    fpout = myfunc.myopen(outfile, sys.stdout, "w", False)
    fpout1 = myfunc.myopen(outfile1, sys.stdout, "w", False)

    try:
        AnaLocalTopoAln(idList, topoList, localseqpairDict, fpout, fpout1)
    finally:
        myfunc.myclose(fpout)
        myfunc.myclose(fpout1)