def Fasta2SwissProt(fastaFile, ssPath, outPath, ssFormat):  #{{{
    """Write the sequences of a FASTA file as SwissProt-like records.

    The output file is ``<outPath>/<rootname>.swiss`` where ``rootname`` is
    the base name of *fastaFile* without its extension.  When *outPath* is
    empty, the directory of *fastaFile* (or ``./``) is used instead.

    For every sequence one record is written:

    * an ``ID`` line holding the annotation returned by the FASTA reader,
    * one ``FT HELIX`` / ``FT STRAND`` line per secondary-structure element
      read from ``<ssPath>/<rootname>/<rootname>_<index>.ss2`` (only when
      *ssPath* is non-empty),
    * an ``SQ`` line with the sequence length,
    * the sequence in blocks of 10 residues, 6 blocks per line, each line
      indented by 5 spaces,
    * the ``//`` record terminator.

    Records are only written when *ssFormat* is 0; for any other value the
    output file is still created but left empty.

    Parameters:
        fastaFile: path of the input FASTA file.
        ssPath:    root directory of the ``.ss2`` files, or "" to skip them.
        outPath:   output directory, or "" to write next to *fastaFile*.
        ssFormat:  format code, also forwarded to ``GetSSEList``.

    Returns:
        The number of sequences read from *fastaFile*.
    """
    rootname = os.path.basename(os.path.splitext(fastaFile)[0])
    inFilePath = os.path.dirname(fastaFile) or "./"
    (annotationList, seqList) = myfunc.ReadFasta_without_annotation(fastaFile)
    localOutPath = outPath if outPath != "" else inFilePath
    outFile = "%s/%s.swiss" % (localOutPath, rootname)

    # Only these element types have a feature-table keyword.  Any other type
    # is skipped instead of emitting a coordinate line without an FT tag.
    ftKeyword = {'helix': "HELIX", 'strand': "STRAND"}

    # "with" guarantees the handle is closed even if reading an .ss2 file or
    # formatting a record raises.
    with open(outFile, "w") as fpout:
        if ssFormat == 0:
            for i, aaSeq in enumerate(seqList):
                seqLength = len(aaSeq)
                if DEBUG:
                    sys.stdout.write("%d:%s\n" % (i, annotationList[i]))

                sseList = []
                if ssPath != "":
                    ssFile = "%s/%s/%s_%d.ss2" % (ssPath, rootname, rootname, i)
                    sseList = GetSSEList(ssFile, seqLength, ssFormat)

                # write out the result
                fpout.write("%-4s %s\n" % ("ID", annotationList[i]))
                for sse in sseList:
                    keyword = ftKeyword.get(sse['type'])
                    if keyword is None:
                        continue
                    fpout.write("%-4s %s %d %d\n" % (
                        "FT", keyword, sse['start'], sse['end']))
                fpout.write("%-4s  SEQUENCE %d AA;\n" % ("SQ", seqLength))

                # 10 residues per block, 6 blocks per line.  An empty
                # sequence yields no sequence line, so "//" always starts
                # in column 0.
                blocks = [aaSeq[j:j + 10] for j in range(0, seqLength, 10)]
                for k in range(0, len(blocks), 6):
                    fpout.write("%5s%s\n" % ("", " ".join(blocks[k:k + 6])))
                fpout.write("//\n")
    return len(seqList)
def GetSeqDict(fastafile):  #{{{
    """Return a dict mapping each sequence ID in *fastafile* to its sequence.

    IDs and sequences come from ``myfunc.ReadFasta_without_annotation``.
    When an ID occurs more than once, the last sequence read wins.
    """
    (idList, seqList) = myfunc.ReadFasta_without_annotation(fastafile)
    # dict(zip(...)) works on Python 2 and 3 alike, unlike xrange.
    return dict(zip(idList, seqList))
def main(g_params):  #{{{
    """Run ``calEvoDist.sh`` on every aligned sequence pair of a FASTA file.

    Command line (read from ``sys.argv``)::

        prog [-outfile FILE] [-q] [--] pairalnFile

    The input is read as consecutive pairs (records 0/1, 2/3, ...).  Each
    pair is written to a temporary two-record FASTA file and passed to
    ``<binpath>/calEvoDist.sh -f 1 <tmpfile>``.  The script's standard
    output is sent to the handle opened for ``-outfile``.  Pairs whose two
    sequences differ in length are reported on stderr and skipped.

    Parameters:
        g_params: dict of global settings; ``'isQuiet'`` is set to True
                  when ``-q`` is given.

    Returns:
        0 on success, 1 on a usage error or when the input file is missing.
    """
    # Local import: only this entry point needs subprocess.
    import subprocess

    argv = sys.argv
    numArgv = len(argv)
    if numArgv < 2:
        PrintHelp()
        return 1

    pairalnFile = ""
    outfile = ""

    i = 1
    isNonOptionArg = False
    while i < numArgv:
        arg = argv[i]
        if isNonOptionArg:
            pairalnFile = arg
            isNonOptionArg = False
            i += 1
        elif arg == "--":
            isNonOptionArg = True
            i += 1
        elif arg.startswith("-"):
            if arg in ["-h", "--help"]:
                PrintHelp()
                return 1
            elif arg in ["-outfile", "--outfile"]:
                if i + 1 >= numArgv:
                    # Report a missing value instead of raising IndexError.
                    sys.stderr.write(
                        "Error! Option %s requires a value\n" % arg)
                    return 1
                outfile = argv[i + 1]
                i += 2
            elif arg in ["-q"]:
                g_params['isQuiet'] = True
                i += 1
            else:
                sys.stderr.write("Error! Wrong argument: %s\n" % arg)
                return 1
        else:
            pairalnFile = arg
            i += 1

    if pairalnFile == "":
        sys.stderr.write("pairalnFile not set\n")
        return 1
    elif not os.path.exists(pairalnFile):
        sys.stderr.write("pairalnFile %s does not exist\n" % pairalnFile)
        return 1

    # NOTE(review): myopen presumably returns a real file object and falls
    # back to sys.stdout when outfile is empty -- confirm in myfunc.
    fpout = myfunc.myopen(outfile, sys.stdout, "w", False)
    try:
        (idList, seqList) = myfunc.ReadFasta_without_annotation(pairalnFile)
        numPair = len(idList) // 2  # floor division: a trailing odd record is ignored
        for k in range(numPair):
            id1 = idList[2 * k]
            id2 = idList[2 * k + 1]
            seq1 = seqList[2 * k]
            seq2 = seqList[2 * k + 1]
            if len(seq1) != len(seq2):
                sys.stderr.write(
                    "Bad alignment, seq length conflicts, %d (%s) = %d (%s)\n"
                    % (len(seq1), id1, len(seq2), id2))
                continue

            # mkstemp creates the file atomically (mktemp only returns a
            # name).  A separate handle name keeps fpout pointing at the
            # requested output.
            (fd, tmpfile) = tempfile.mkstemp()
            try:
                with os.fdopen(fd, "w") as fptmp:
                    fptmp.write(">%s\n" % (id1))
                    fptmp.write("%s\n" % (seq1))
                    fptmp.write(">%s\n" % (id2))
                    fptmp.write("%s\n" % (seq2))
                fpout.flush()
                # Argument list, no shell; the child writes to fpout.
                subprocess.call(
                    ["%s/calEvoDist.sh" % binpath, "-f", "1", tmpfile],
                    stdout=fpout)
            finally:
                os.remove(tmpfile)
    finally:
        myfunc.myclose(fpout)
    return 0
def GetTopoDict(topofile):  #{{{
    """Read *topofile* and return a dict mapping sequence ID to topology.

    The file is parsed with ``myfunc.ReadFasta_without_annotation``; a
    repeated ID keeps the topology of its last occurrence.
    """
    (seqids, topologies) = myfunc.ReadFasta_without_annotation(topofile)
    return {seqid: topo for (seqid, topo) in zip(seqids, topologies)}
def main(g_params):  #{{{
    """Write a Pfam colour definition for the sequences of a FASTA file.

    Command line (read from ``sys.argv``)::

        prog [-seqid2pfamid FILE] [-o OUTFILE] [-q] [--] infile

    The IDs of *infile* (minus a "Consensus" entry, if there is one) are
    passed with the seqid-to-pfamid map to ``WritePfamColorDef``, which
    writes to the handle opened for ``-o``.

    Parameters:
        g_params: dict of global settings; ``'isQuiet'`` is set to True
                  when ``-q`` is given.

    Returns:
        0 on success, 1 on a usage error or when an input cannot be read.
    """
    argv = sys.argv
    numArgv = len(argv)
    if numArgv < 2:
        PrintHelp()
        return 1

    infile = ""
    outfile = ""
    seqid2pfamidFile = datadir3 + os.sep + "wk/MPTopo/pfamAna_refpro/pfammap_from_uniprot/refpro20120604-celluar.selmaxlength-m1.nr100.seqid2pfamid"

    i = 1
    isNonOptionArg = False
    while i < numArgv:
        arg = argv[i]
        if isNonOptionArg:
            infile = arg
            isNonOptionArg = False
            i += 1
        elif arg == "--":
            isNonOptionArg = True
            i += 1
        elif arg.startswith("-"):
            if arg in ["-h", "--help"]:
                PrintHelp()
                return 1
            elif arg in ["-seqid2pfamid", "--seqid2pfamid", "-o", "--o"]:
                if i + 1 >= numArgv:
                    # Report a missing value instead of raising IndexError.
                    print("Error! Option %s requires a value" % arg,
                          file=sys.stderr)
                    return 1
                if arg in ["-o", "--o"]:
                    outfile = argv[i + 1]
                else:
                    seqid2pfamidFile = argv[i + 1]
                i += 2
            elif arg in ["-q"]:
                g_params['isQuiet'] = True
                i += 1
            else:
                print("Error! Wrong argument:", arg, file=sys.stderr)
                return 1
        else:
            infile = arg
            i += 1

    if infile == "":
        print("Error. Infile not set. exit", file=sys.stderr)
        return 1
    if not os.path.exists(infile):
        print("Error. Infile %s does not exist. exit" % infile,
              file=sys.stderr)
        return 1
    if seqid2pfamidFile == "" or not os.path.exists(seqid2pfamidFile):
        print("Error. seqid2pfamidFile does not exist. exit", file=sys.stderr)
        return 1

    seqid2pfamidDict = myfunc.ReadFam2SeqidMap(seqid2pfamidFile)
    # Check the parsed map itself.  The old test compared the file NAME to
    # {} and could never be true.
    if not seqid2pfamidDict:
        print("Read seqid2pfamidFile failed.", file=sys.stderr)
        return 1

    (idList, topoList) = myfunc.ReadFasta_without_annotation(infile)
    if len(idList) < 1:
        print("Read infile failed.", file=sys.stderr)
        return 1
    # Drop the consensus entry only when it exists; list.remove raises
    # ValueError otherwise.
    if "Consensus" in idList:
        idList.remove("Consensus")

    fpout = myfunc.myopen(outfile, sys.stdout, "w", False)
    try:
        WritePfamColorDef(idList, seqid2pfamidDict, fpout)
    finally:
        myfunc.myclose(fpout)
    return 0
def _ReadPathMapDicts(pathList, label):
    """Load ``<path>/id2pathmap.txt`` for every directory in *pathList*.

    A directory without a map file is reported on stderr as *label* and
    contributes an empty dict, so the result stays parallel to *pathList*.
    """
    mapDictList = []
    for path in pathList:
        mapfile = path + os.sep + "id2pathmap.txt"
        if os.path.exists(mapfile):
            mapDictList.append(ReadSeqPathMapDict(mapfile))
        else:
            # Despite the wording of the message, processing continues.
            sys.stderr.write("%s not exist. exit\n" % label)
            mapDictList.append({})
    return mapDictList


def main(g_params):#{{{
    """Parse the command line and start the pairwise HHsearch run.

    Command line (read from ``sys.argv``)::

        prog [-outpath DIR] [-hhprofile DIR]... [-hhsearch DIR]...
             [-dupfile FILE] [-topofile FILE] [-q] [-overwrite]
             [-l tableinfoFile | tableinfoFile]

    After validation the work is handed to ``RunHHSearchPairwise``.

    Parameters:
        g_params: dict of global settings.  ``'isQuiet'`` and
                  ``'isForceOverWrite'`` are set by their options, and
                  ``'isUsePreBuildHHSearchResult'`` is set when at least
                  one ``-hhsearch`` directory is given.

    Returns:
        0 on success, 1 on a usage or set-up error.
    """
    argv = sys.argv
    numArgv = len(argv)
    if numArgv < 2:
        PrintHelp()
        return 1

    outpath = "./"
    tableinfoFile = ""
    hhprofilepathList = []
    hhsearchpathList = []
    dupfile = ""
    topofile = ""

    i = 1
    isNonOptionArg = False
    while i < numArgv:
        arg = argv[i]
        if isNonOptionArg:
            tableinfoFile = arg
            isNonOptionArg = False
            i += 1
        elif arg == "--":
            isNonOptionArg = True
            i += 1
        elif arg.startswith("-"):
            if arg in ["-h", "--help"]:
                PrintHelp()
                return 1
            elif arg in ["-outpath", "--outpath"]:
                (outpath, i) = myfunc.my_getopt_str(argv, i)
            elif arg in ["-hhprofile", "--hhprofile",
                         "-hhprofile1", "--hhprofile1"]:
                (ss, i) = myfunc.my_getopt_str(argv, i)
                hhprofilepathList.append(ss)
            elif arg in ["-hhsearch", "--hhsearch"]:
                (ss, i) = myfunc.my_getopt_str(argv, i)
                hhsearchpathList.append(ss)
            elif arg in ["-dupfile", "--dupfile"]:
                (dupfile, i) = myfunc.my_getopt_str(argv, i)
            elif arg in ["-topofile", "--topofile"]:
                (topofile, i) = myfunc.my_getopt_str(argv, i)
            elif arg in ["-l", "--l"]:
                (tableinfoFile, i) = myfunc.my_getopt_str(argv, i)
            elif arg in ["-q", "--q"]:
                g_params['isQuiet'] = True
                i += 1
            elif arg in ["-overwrite", "-forcewrite", "--forcewrite"]:
                g_params['isForceOverWrite'] = True
                i += 1
            else:
                sys.stderr.write("Error! Wrong argument: %s\n" % arg)
                return 1
        else:
            tableinfoFile = arg
            i += 1

    if tableinfoFile == "":
        sys.stderr.write("tableinfoFile not set. exit\n")
        return 1
    if len(hhprofilepathList) < 1:
        sys.stderr.write("hhprofilepath not set. exit\n")
        return 1

    if not os.path.exists(outpath):
        try:
            os.makedirs(outpath)
        except OSError:
            # Fall through to the existence check so the failure is
            # reported as a message rather than a traceback.
            pass
    if not os.path.exists(outpath):
        sys.stderr.write("failed to created outpath %s\n" % (outpath))
        return 1

    topoDict = {}
    if topofile != "":
        (idList, topoList) = myfunc.ReadFasta_without_annotation(topofile)
        topoDict = dict(zip(idList, topoList))
    if dupfile != "" and topoDict == {}:
        sys.stderr.write("Error! topoDict is empty when dupfile "
                         "is set. Exit\n")
        return 1

    # read in hhprofile dict
    hhprofilepathMapDictList = _ReadPathMapDicts(
        hhprofilepathList, "hhprofilemapfile")

    # read in index dictionary for hhsearch result file
    if len(hhsearchpathList) > 0:
        g_params['isUsePreBuildHHSearchResult'] = True
    hhsearchpathMapDictList = _ReadPathMapDicts(
        hhsearchpathList, "hhsearchmapfile")

    RunHHSearchPairwise(tableinfoFile, hhprofilepathList,
                        hhprofilepathMapDictList, hhsearchpathList,
                        hhsearchpathMapDictList, topoDict, outpath, dupfile)
    return 0
print >> sys.stderr, "Error! file infile (%s) does not exist." % infile sys.exit(1) def GetNtermState(topo): #{{{ if topo[0] != GAP: return topo[0] else: topo = topo.lstrip(GAP) if topo != "": return topo[0] else: return None #}}} (idList, seqList) = myfunc.ReadFasta_without_annotation(infile) # write out taxdef numSeq = len(idList) for i in xrange(numSeq): gid = idList[i] if gid != 'Consensus': color = red NtermState = GetNtermState(seqList[i]) if NtermState == 'o': color = blue sys.stdout.write("%s,%s\n" % (gid, color)) sys.stdout.write("\n")
def main(g_params):  #{{{
    """Draw pairwise topology comparisons for a pairwise sequence alignment.

    Command line (read from ``sys.argv``)::

        prog [-outpath DIR] [-topofile FILE] [-cmpclass CLS]... [-q]
             [--] pairseqAlnFile

    Steps:

    1. write the ungapped sequences to ``<outpath>/<rootname>.seqdb.fa``
       and index them with ``<binpath>/indexfasta.py``;
    2. run ``<binpath>/seqpairaln_to_topopaircmp.sh``, which is expected to
       produce ``<rootname>.paircmp`` and ``<rootname>.topoaln.fa`` in
       *outpath*;
    3. read the comparison classes and call ``DrawPairwiseTopo``.

    Parameters:
        g_params: dict of global settings.  ``'outpath'`` and
                  ``'cmpclassList'`` are always stored; ``'isQuiet'`` is
                  set to True when ``-q`` is given.

    Returns:
        0 on success, 1 when a required input is missing, -1 on a bad
        command-line argument.
    """
    # Local import: only this entry point needs subprocess.
    import subprocess

    argv = sys.argv
    numArgv = len(argv)
    if numArgv < 2:
        PrintHelp()
        return 1

    outpath = "./"
    pairseqAlnFile = ''
    cmpclassList = []
    topofile = ""

    i = 1
    isNonOptionArg = False
    while i < numArgv:
        arg = argv[i]
        if isNonOptionArg:
            pairseqAlnFile = arg
            isNonOptionArg = False
            i += 1
        elif arg == "--":
            isNonOptionArg = True
            i += 1
        elif arg.startswith("-"):
            if arg in ["-h", "--help"]:
                PrintHelp()
                return 1
            elif arg in ["-outpath", "--outpath", "-topofile", "--topofile",
                         "-cmpclass", "--cmpclass"]:
                if i + 1 >= numArgv:
                    # Report a missing value instead of raising IndexError.
                    sys.stderr.write(
                        "Error! Option %s requires a value\n" % arg)
                    return -1
                value = argv[i + 1]
                if arg in ["-outpath", "--outpath"]:
                    outpath = value
                elif arg in ["-topofile", "--topofile"]:
                    topofile = value
                else:
                    cmpclassList.append(value)
                i += 2
            elif arg == "-q":
                # Stored in g_params; previously only a dead local was set.
                g_params['isQuiet'] = True
                i += 1
            else:
                sys.stderr.write("Error! Wrong argument: %s\n" % arg)
                return -1
        else:
            pairseqAlnFile = arg
            i += 1

    g_params['outpath'] = outpath
    g_params['cmpclassList'] = cmpclassList

    if pairseqAlnFile == "":
        sys.stderr.write("pairseqAlnFile not set. Exit.\n")
        return 1
    if not os.path.exists(pairseqAlnFile):
        sys.stderr.write(
            "pairseqAlnFile %s does not exists. Exit.\n" % pairseqAlnFile)
        return 1
    # Validate the topology file before doing any work: it is required by
    # the comparison step below.
    if not os.path.exists(topofile):
        sys.stderr.write("topofile %s not exist. exit.\n" % topofile)
        return 1

    rootname = os.path.basename(os.path.splitext(pairseqAlnFile)[0])

    # Read in aaSeqDict
    print("Read in aaSeqDict")
    if not os.path.isdir(outpath):
        os.makedirs(outpath)  # same effect as "mkdir -p", without a shell
    (idList, seqList) = myfunc.ReadFasta_without_annotation(pairseqAlnFile)

    # create seqdbfile holding the ungapped sequences
    seqdbfile = outpath + os.sep + rootname + ".seqdb.fa"
    aaSeqDict = {}
    with open(seqdbfile, "w") as fo:
        for (seqid, alnSeq) in zip(idList, seqList):
            ungapped = alnSeq.replace('-', '')
            fo.write(">%s\n" % (seqid))
            fo.write("%s\n" % (ungapped))
            aaSeqDict[seqid] = ungapped
    # Argument lists keep paths containing spaces or shell
    # metacharacters intact.
    subprocess.call(["%s/indexfasta.py" % binpath, seqdbfile])
    seqdbname = outpath + os.sep + rootname + ".seqdb"

    # Output pairwise topology comparison
    print("Output paircmp file to %s" % outpath)
    subprocess.call(["%s/seqpairaln_to_topopaircmp.sh" % binpath,
                     pairseqAlnFile, "-outpath", outpath,
                     "-topofile", topofile, "-seqdb", seqdbname])

    paircmpFile = outpath + os.sep + rootname + '.paircmp'
    pairtopoAlnFile = outpath + os.sep + rootname + '.topoaln.fa'

    # Read in paircmp file
    pairCmpclassDict = ReadPaircmpCmpclass(paircmpFile)

    print("Draw pairwise topology comparison ...")
    DrawPairwiseTopo(pairtopoAlnFile, aaSeqDict, pairCmpclassDict, outpath)
    return 0
def main(g_params):#{{{
    """Write a taxonomy colour definition for the sequences of a FASTA file.

    Command line (read from ``sys.argv``)::

        prog [-tableinfo FILE] [-o OUTFILE] [-q] [--] infile

    The IDs of *infile* (minus a "Consensus" entry, if there is one) are
    passed with the table read by ``ReadUniprotInfoTable`` to
    ``WriteTaxoColor``, which writes to the handle opened for ``-o``.

    Parameters:
        g_params: dict of global settings; ``'isQuiet'`` is set to True
                  when ``-q`` is given.

    Returns:
        0 on success, 1 on a usage error or when an input cannot be read.
    """
    argv = sys.argv
    numArgv = len(argv)
    if numArgv < 2:
        PrintHelp()
        return 1

    infile = ""
    outfile = ""
    tableinfoFile = datadir + os.sep + "uniprot/reference_proteome/refpro20120604-celluar.selmaxlength-m1.nr100.tableinfo"

    i = 1
    isNonOptionArg = False
    while i < numArgv:
        arg = argv[i]
        if isNonOptionArg:
            infile = arg
            isNonOptionArg = False
            i += 1
        elif arg == "--":
            isNonOptionArg = True
            i += 1
        elif arg.startswith("-"):
            if arg in ["-h", "--help"]:
                PrintHelp()
                return 1
            elif arg in ["-tableinfo", "--tableinfo", "-o", "--o"]:
                if i + 1 >= numArgv:
                    # Report a missing value instead of raising IndexError.
                    sys.stderr.write(
                        "Error! Option %s requires a value\n" % arg)
                    return 1
                if arg in ["-o", "--o"]:
                    outfile = argv[i + 1]
                else:
                    tableinfoFile = argv[i + 1]
                i += 2
            elif arg in ["-q"]:
                g_params['isQuiet'] = True
                i += 1
            else:
                sys.stderr.write("Error! Wrong argument: %s\n" % arg)
                return 1
        else:
            infile = arg
            i += 1

    if infile == "":
        sys.stderr.write("Error. Infile not set. exit\n")
        return 1
    if not os.path.exists(infile):
        sys.stderr.write("Error. Infile %s does not exist. exit\n" % infile)
        return 1
    if tableinfoFile == "" or not os.path.exists(tableinfoFile):
        sys.stderr.write(
            "Error. tableinfoFile %s does not exist. exit\n" % (tableinfoFile))
        return 1

    seqid2TaxoDict = ReadUniprotInfoTable(tableinfoFile)
    # Check the parsed table itself.  The old test compared the file NAME
    # to {} and could never be true.
    if not seqid2TaxoDict:
        sys.stderr.write("Read tableinfoFile failed.\n")
        return 1

    (idList, topoList) = myfunc.ReadFasta_without_annotation(infile)
    if len(idList) < 1:
        sys.stderr.write("Read infile failed.\n")
        return 1
    # Drop the consensus entry only when it exists; list.remove raises
    # ValueError otherwise.
    if "Consensus" in idList:
        idList.remove("Consensus")

    fpout = myfunc.myopen(outfile, sys.stdout, "w", False)
    try:
        WriteTaxoColor(idList, seqid2TaxoDict, fpout)
    finally:
        myfunc.myclose(fpout)
    return 0
def main(): #{{{
    """Scratch pad of hand-toggled experiments.

    Every experiment sits behind its own ``if 0:`` / ``if 1:`` guard; only
    the guards set to 1 run.  Most experiments take their input from
    ``sys.argv`` and swallow ``IndexError`` when an argument is missing.
    """
    # Experiment: gapless comparison of two hard-coded topology strings.
    if 0: #{{{
        strTop1 = "---MMMM-----i-i-i---MMM----MMMM-ooo"
        strTop2 = "----MMMM-----i-ii-----MMM---MMM--oo"
        strProtein1 = "id1"
        strProtein2 = "id2"
        fpLog = sys.stdout
        class_gapless, num1_gapless, num2_gapless = ct.CompareToposGaplesslyNew(
            strTop1, strTop2, strProtein1, strProtein2, fpLog)
        # Note: calling the int, float, string will not change their original value
        # calling the dict, list will change their original value
        print "strTop1:", strTop1
        print "strTop2:", strTop2
    #}}}
    # Experiment: print the current function name and this file's name.
    if 0: #{{{
        PrintFuncName()
        print("this file name is: %s" % __file__)
    #}}}
    # Experiment: read a whole file with readlines().
    if 0: #{{{
        # filename="/nanjiang/data/blastdb/uniprot_KW181_idt50.fasta"
        filename = sys.argv[1]
        print filename
        fp = open(filename, "r")
        lines = fp.readlines()
        fp.close()
    #}}}
    # Experiment: read a file in fixed-size blocks.
    if 0: #{{{
        # filename="/nanjiang/data/blastdb/uniprot_KW181_idt50.fasta"
        filename = sys.argv[1]
        print filename
        BLOCK_SIZE = 100000
        fp = open(filename, "r")
        buff = fp.read(BLOCK_SIZE)
        while buff:
            buff = fp.read(BLOCK_SIZE)
        fp.close()
    #}}}
    # Experiment: read a file line by line with readline().
    if 0: #{{{
        # filename="/nanjiang/data/blastdb/uniprot_KW181_idt50.fasta"
        filename = sys.argv[1]
        print filename
        fp = open(filename, "r")
        line = fp.readline()
        while line:
            line = fp.readline()
        fp.close()
    #}}}
    # Experiment: parse FASTA block-wise with ReadFastaFromBuffer and echo
    # the records to stdout.
    if 0: #{{{
        try:
            BLOCK_SIZE = 100000
            infile = sys.argv[1]
            fpin = open(infile, 'rb')
            unprocessedBuffer = ""
            isEOFreached = False
            while 1:
                buff = fpin.read(BLOCK_SIZE)
                # A short read marks the last block of the file.
                if len(buff) < BLOCK_SIZE:
                    isEOFreached = True
                buff = unprocessedBuffer + buff
                recordList = []
                unprocessedBuffer = myfunc.ReadFastaFromBuffer(
                    buff, recordList, isEOFreached)
                if len(recordList) > 0:
                    for record in recordList:
                        sys.stdout.write(">%s\n" % record[1])
                        sys.stdout.write("%s\n" % record[2])
                if isEOFreached == True:
                    break
            fpin.close()
        except IOError:
            raise
    #}}}
    # Experiment: read FASTA with ReadFasta_without_id and echo it.
    if 0: #{{{
        try:
            infile = sys.argv[1]
            (annoList, seqList) = myfunc.ReadFasta_without_id(infile)
            for i in xrange(len(seqList)):
                sys.stdout.write(">%s\n" % annoList[i])
                sys.stdout.write("%s\n" % seqList[i])
        except IOError:
            raise
    #}}}
    # Experiment: duplication check on one HHsearch result file.
    if 0: #{{{
        hhrfile = "hhsearch/A1RZ92-Q74DY9.hhr"
        if IsDuplicatedByHHSearch(hhrfile):
            print "yes"
    #}}}
    # Experiment: alignment factor of a hard-coded pairwise alignment.
    if 0: #{{{
        import pairlistwithfamid2pairaln_by_msa
        # NOTE: this first seq1/seq2 pair is overwritten by the second
        # pair below before it is used.
        seq1 = "--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------MLSSTATTMLRAGVSRSSGALQPMLLRSAACPCSPFSMNTKLSQPTSV-----RPLSTSPSALVLRFRAQQQAQLAQQQLRRASSSSSSSSSSTRPRSDAELDANAAEAAAAAQSAAHAGEPVLDWNTFFKLRKTRRRVQLAFSVIMTLITSGAGGAVLSTGVADAMVAQVPLEPMFAVGLMTASFGALGWLMGPAMGGMVFNALKSKYRGQMEIKEGQFFARIKKHRVDPSASSMGNPVPDFYGEKISSVAGYRQWLKDQRAFNKKRTTFV"
        seq2 = "MDILLAVLEQGFIFSIVCFGVYITYKILDFPDLSVDGTFPLGAAVAAAFLVKGYSPVLSSLAALVAGAIAGGITGILHVKFKITNLLSGILVMVGLYSINLRIMGKSNIPLFNKIHLFSDTMNPIIIITVFLLICKITLDLFLKTKAGFILKATGDNEQLVLSLGVNKDLVKIMGLMLSNALVALGGALMAQYQGFSDVGMGTGIVVMGLASVIIGESLFGRIKALNATTRVLLGALVYKLSVSI---ALTVGLAP-------TDLKLVTAIIVVIALSLNKNPLKIITKQKTKEGGIL------NASNTKSAQSVQ-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------"
        seq1 = "---------------------------------------------------------------------------------------------------------------------------------------MALSSLFFTASALLLMFLAFLGGARNSNPLDRIYWLEAATGNIPGAPALSRWTYWNLCAVNSEGHNECGKSYPDYPFDPPSHRNFNTHVNIPAAFIGTRHYFLTSRFMFPFHIIALFFATCSLLTGFLAMCTRIGNWVSAFSAYFALTFQTITTCLMTAVYVQGRDKFNNNGQSSHLGVKAFAFMWTSVALLFLSCVIYCMGGAVGRKDGGYSGREQRRRGFFNSHRSGSLRSNKETAP"
        seq2 = "MRKIAAIGGIVFISFILTIVAMFTKLWISWSIGKFSYGIGIVPYHSNSAGWFTAASWMVFISFGLFIPLILVVLFTAYKVHHDGCCHSIRHCFNSICLICSIIAVLEIIAFVLMAVNASRYVKGASISEKKSLLQLGSSAYLDLVSAILIIVATVLSGHASHHDCH----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------"
        alignFactor = pairlistwithfamid2pairaln_by_msa.GetAlignmentFactorFromPairAlignment(
            seq1, seq2)
        print alignFactor
    #}}}
    # Experiment: fetch one record from a MyDB database.
    if 0: #{{{
        try:
            dbname = sys.argv[1]
            print dbname
            from myfunc import MyDB
            cls = MyDB(dbname)
            # print cls.idList
            record = cls.GetRecord("A0FGX9")
            if record:
                print record
                # for rd in cls.GetAllRecord():
                #     print rd
                (seqid, anno, seq) = myfunc.ExtractFromSeqWithAnno(record)
                print(seqid, anno, seq)
        except IndexError:
            pass
    #}}}
    # Experiment: extract the records listed in an ID file from a MyDB
    # database.
    if 0: #{{{
        import my_extractdb
        # mimicking my_extractdb.py to see which one is faster
        try:
            dbname = sys.argv[1]
            idlistfile = sys.argv[2]
            cls = myfunc.MyDB(dbname)
            if cls.failure:
                print >> sys.stderr, "MyDB init failed"
            else:
                idlist = open(idlistfile, "r").read().split("\n")
                fpout = sys.stdout
                for seqid in idlist:
                    if seqid:
                        record = cls.GetRecord(seqid)
                        fpout.write(record)
                # for rd in cls.GetAllRecord():
                #     print rd
                # (seqid, anno, seq) = myfunc.ExtractFromSeqWithAnno(record)
                # print (seqid, anno, seq)
        except IndexError:
            print "error"
            pass
    #}}}
    if 0: #{{{
        #test ReadLineByBlock
        try:
            infile = sys.argv[1]
            from myfunc import ReadLineByBlock
            cls = ReadLineByBlock(infile)
            lines = cls.readlines()
            while lines != None:
                for line in lines:
                    print line
                lines = cls.readlines()
        except IndexError:
            pass
    #}}}
    if 0: #{{{
        #test speed of ReadLineByBlock
        # ReadLineByBlock is about 3 times faster than file.readline()
        try:
            from myfunc import ReadLineByBlock
            infile = sys.argv[1]
            start = time.time()
            hdl = ReadLineByBlock(infile)
            lines = hdl.readlines()
            while lines != None:
                lines = hdl.readlines()
            hdl.close()
            end = time.time()
            msg = "Reading %s by ReadLineByBlock costs %.3fs seconds"
            print msg % (infile, (end - start))

            start = time.time()
            hdl = open(infile, "r")
            line = hdl.readline()
            while line:
                line = hdl.readline()
            hdl.close()
            end = time.time()
            msg = "Reading %s by readline() costs %.3fs seconds"
            print msg % (infile, (end - start))
        except IndexError:
            pass
    #}}}
    if 0: #{{{
        #test readline
        try:
            infile = sys.argv[1]
            fp = open(infile, "r")
            line = fp.readline()
            while line:
                print line
                line = fp.readline()
            fp.close()
        except IndexError:
            pass
    #}}}
    if 0: #{{{
        #test the speed of GetFirstWord
        try:
            nloop = int(sys.argv[1])
            string = "kjdafk jasdfj j"
            #string = "askdf askdf "
            # string = "kajsdfasdfsdfjakasjdfka"
            # string = "kajsdfasdf,sdfjakasjdfka"
            delimiter = " \t\r,.\n"
            delimiter = " "
            for i in xrange(nloop):
                #firstword = myfunc.GetFirstWord(string, delimiter)
                #firstword = string.split()[0]
                #firstword = string.partition(" ")[0]
                firstword = myfunc.GetFirstWord(string)
                #pass
                #print firstword
        except (IndexError, ValueError):
            pass
    #}}}
    # Experiment: time four ways of reading the same FASTA file.
    if 0: #{{{
        # read seq by SeqIO
        from Bio import SeqIO
        try:
            seqfile = sys.argv[1]
            # 1. SeqIO ####################
            start = time.time()
            handle = open(seqfile, "rU")
            cnt = 0
            for record in SeqIO.parse(handle, "fasta"):
                cnt += 1
            handle.close()
            end = time.time()
            msg = "Reading %d sequences by SeqIO costs %.3fs seconds"
            print msg % (cnt, (end - start))

            # 2. ReadFasta ####################
            start = time.time()
            seqfile = sys.argv[1]
            (idList, annoList, seqList) = myfunc.ReadFasta(seqfile)
            end = time.time()
            msg = "Reading %d sequences by ReadFasta costs %.3fs seconds"
            print msg % (len(idList), (end - start))

            # 3. ReadFasta from buffer
            BLOCK_SIZE = 100000
            start = time.time()
            cnt = 0
            fpin = open(seqfile, 'rb')
            unprocessedBuffer = ""
            isEOFreached = False
            while 1:
                buff = fpin.read(BLOCK_SIZE)
                if len(buff) < BLOCK_SIZE:
                    isEOFreached = True
                buff = unprocessedBuffer + buff
                recordList = []
                unprocessedBuffer = myfunc.ReadFastaFromBuffer(
                    buff, recordList, isEOFreached)
                cnt += len(recordList)
                if isEOFreached == True:
                    break
            fpin.close()
            end = time.time()
            msg = "Reading %d sequences by ReadFastaFromBuffer costs %.3fs seconds"
            print msg % (cnt, (end - start))

            # 4. ReadFastaByBlock ####################
            start = time.time()
            seqfile = sys.argv[1]
            hdl = myfunc.ReadFastaByBlock(seqfile, 0, 0)
            if hdl.failure:
                print >> sys.stderr, "Failed to init ReadFastaByBlock"
                return 1
            recordList = hdl.readseq()
            cnt = 0
            while recordList != None:
                cnt += len(recordList)
                # for rd in recordList:
                #     print ">%s"%rd.description
                #     print rd.seq
                recordList = hdl.readseq()
            hdl.close()
            end = time.time()
            msg = "Reading %d sequences by ReadFastaByBlock costs %.3fs seconds"
            print msg % (cnt, (end - start))
        except (IndexError, ValueError):
            pass
    #}}}
    if 0: #{{{
        #test RemoveUnnecessaryGap
        try:
            infile = sys.argv[1]
            start = time.time()
            (idList, seqList) = myfunc.ReadFasta_without_annotation(infile)
            seqList = lcmp.RemoveUnnecessaryGap_old(seqList)
            end = time.time()
            msg = "Run RemoveUnnecessaryGap_old for %s costs %.3fs seconds"
            print >> sys.stderr, msg % (infile, (end - start))
            for seq in seqList:
                print seq

            start = time.time()
            (idList, seqList) = myfunc.ReadFasta_without_annotation(infile)
            seqList = lcmp.RemoveUnnecessaryGap(seqList)
            end = time.time()
            msg = "Run RemoveUnnecessaryGap for %s costs %.3fs seconds"
            print >> sys.stderr, msg % (infile, (end - start))
            for seq in seqList:
                print seq
        except IndexError:
            pass
    #}}}
    if 0: #{{{
        #test ReadMPAByBlock
        try:
            infile = sys.argv[1]
            hdl = myfunc.ReadMPAByBlock(infile)
            if hdl.failure:
                return
            recordList = hdl.readseq()
            while recordList != None:
                for rd in recordList:
                    #print rd.seqid
                    print ">%s" % (rd.description)
                    print "%s" % (myfunc.mpa2seq(rd.mpa))
                recordList = hdl.readseq()
            hdl.close()
        except IndexError:
            pass
    #}}}
    # Experiment: fetch one record from a MyDB database (same as above).
    if 0: #{{{
        try:
            dbname = sys.argv[1]
            print dbname
            from myfunc import MyDB
            cls = MyDB(dbname)
            # print cls.idList
            record = cls.GetRecord("A0FGX9")
            if record:
                print record
                # for rd in cls.GetAllRecord():
                #     print rd
                (seqid, anno, seq) = myfunc.ExtractFromSeqWithAnno(record)
                print(seqid, anno, seq)
        except IndexError:
            pass
    #}}}
    if 0: #{{{
        #test subprocess
        import glob
        #invoke shell explicitly, not very good, may have security problems
        subprocess.call("seq 10", shell=True)
        subprocess.call("echo wait for 2 seconds...; sleep 2", shell=True)
        subprocess.call("ls topo*.py", shell=True)
    # The only experiment that is currently enabled.
    if 1: #{{{
        #test subprocess
        import glob
        #invoke shell implicitly, recommended way
        subprocess.call(["seq", "10"], shell=False)
        subprocess.call(["echo", "wait for 1 seconds..."])
        subprocess.call(["sleep", "1"])
        try:
            # Without a shell the pattern "topo*.py" reaches ls unexpanded.
            print subprocess.check_call(["ls", "topo*.py"]) #This will not work
        except subprocess.CalledProcessError, e:
            print "error message:", e
        # Expanding the pattern in Python with glob is the working form.
        subprocess.call(["ls"] + glob.glob("topo*.py"))
def main(g_params):  #{{{
    """Analyse topology alignments together with local sequence alignments.

    Command line (read from ``sys.argv``)::

        prog [-o OUTFILE] [-localali FILE] [-q] [--] topoalnfile

    The local alignment file is read as consecutive pairs (records 0/1,
    2/3, ...).  Pairs for which ``GetUnAlignedString`` returns a non-empty
    string are collected in a dict keyed by ``(id1, id2)`` and handed,
    with the topology alignment, to ``AnaLocalTopoAln``.  Two output
    handles are passed: one for OUTFILE and one for ``OUTFILE.1``.

    Parameters:
        g_params: dict of global settings; ``'isQuiet'`` is set to True
                  when ``-q`` is given.

    Returns:
        0 on success, 1 on a usage error or when an input file fails
        ``myfunc.checkfile``.
    """
    argv = sys.argv
    numArgv = len(argv)
    if numArgv < 2:
        PrintHelp()
        return 1

    outfile = ""
    topoalnfile = ""
    localalifile = ""

    i = 1
    isNonOptionArg = False
    while i < numArgv:
        arg = argv[i]
        if isNonOptionArg:
            topoalnfile = arg
            isNonOptionArg = False
            i += 1
        elif arg == "--":
            isNonOptionArg = True
            i += 1
        elif arg.startswith("-"):
            if arg in ["-h", "--help"]:
                PrintHelp()
                return 1
            elif arg in ["-o", "--o", "-outfile"]:
                (outfile, i) = myfunc.my_getopt_str(argv, i)
            elif arg in ["-localali", "--localali"]:
                (localalifile, i) = myfunc.my_getopt_str(argv, i)
            elif arg in ["-q", "--q"]:
                g_params['isQuiet'] = True
                i += 1
            else:
                sys.stderr.write("Error! Wrong argument: %s\n" % arg)
                return 1
        else:
            topoalnfile = arg
            i += 1

    if myfunc.checkfile(topoalnfile) != 0:
        return 1
    if myfunc.checkfile(localalifile) != 0:
        return 1

    (idList, topoList) = myfunc.ReadFasta_without_annotation(topoalnfile)
    (idListLocal, seqListLocal) = myfunc.ReadFasta_without_annotation(
        localalifile)

    # Floor division: range() needs an int, and "/" yields a float on
    # Python 3.  A trailing unpaired record is ignored.
    numpairLocal = len(idListLocal) // 2
    localseqpairDict = {}
    for k in range(numpairLocal):
        id1 = idListLocal[2 * k]
        id2 = idListLocal[2 * k + 1]
        seq1 = seqListLocal[2 * k]
        seq2 = seqListLocal[2 * k + 1]
        unaligned_str = GetUnAlignedString(seq1, seq2)
        if unaligned_str != "":
            localseqpairDict[(id1, id2)] = [seq1, seq2, unaligned_str]
    del idListLocal, seqListLocal

    # Always define the secondary output name; it used to be unbound (and
    # raised NameError) when no -o option was given.
    # NOTE(review): myopen presumably falls back to sys.stdout for an
    # empty name -- confirm in myfunc.
    outfile1 = outfile + ".1" if outfile != "" else ""
    fpout = myfunc.myopen(outfile, sys.stdout, "w", False)
    fpout1 = myfunc.myopen(outfile1, sys.stdout, "w", False)
    try:
        AnaLocalTopoAln(idList, topoList, localseqpairDict, fpout, fpout1)
    finally:
        myfunc.myclose(fpout)
        myfunc.myclose(fpout1)
    return 0