def CompareToposGaplesslyNew(strTop1, strTop2, strProtein1, strProtein2, fpLog):  #{{{
    """Compare two aligned topologies after dropping every gapped column.

    Any alignment column in which either topology has a gap ('-') is
    removed before the comparison (see the diagram below).

    Args:
        strTop1: first aligned topology string.
        strTop2: second aligned topology string.
        strProtein1: label for the first topology in log/error output.
        strProtein2: label for the second topology in log/error output.
        fpLog: writable file object receiving progress output, or 0 to
            disable logging.

    Returns:
        ("DIFF", 0, 0) when both gapless topologies are empty, otherwise
        the result of compareTopos() on the gapless, filtered topologies.

    Exits the process with status 1 when exactly one of the two gapless
    topologies is empty.
    """
    # -----iiiMMMooo  ___\  iMMMooo
    # iiiiii--MMMMoo     /  iMMMMoo
    [strTop1, strTop2] = lcmp.RemoveUnnecessaryGap([strTop1, strTop2])
    if fpLog != 0:
        print("Unnecessary gaps removed", file=fpLog)
        print("%-20s:%s" % (strProtein1, strTop1), file=fpLog)
        print("%-20s:%s" % (strProtein2, strTop2), file=fpLog)
        print(file=fpLog)

    # Keep only the columns where neither topology has a gap.  strTop2 is
    # indexed (not zipped) so a too-short second string still raises.
    keptCols = [(ch1, strTop2[i]) for i, ch1 in enumerate(strTop1)
                if not (ch1 == '-' or strTop2[i] == '-')]
    strNewTop1 = filterTopo(''.join(col[0] for col in keptCols))
    strNewTop2 = filterTopo(''.join(col[1] for col in keptCols))

    if fpLog != 0:
        print("Gapless", file=fpLog)
        print("%-20s:%s" % (strProtein1, strNewTop1), file=fpLog)
        print("%-20s:%s" % (strProtein2, strNewTop2), file=fpLog)
        print(file=fpLog)

    if len(strNewTop1) <= 0 and len(strNewTop2) <= 0:
        return ("DIFF", 0, 0)
    elif len(strNewTop1) == 0 or len(strNewTop2) == 0:
        # Exactly one side is empty: the pair cannot be compared.
        print("%s %s gapless length does not match" % (
            strProtein1, strProtein2), file=sys.stderr)
        sys.exit(1)

    (intMems1, N1) = counttopo(strNewTop1)
    (intMems2, N2) = counttopo(strNewTop2)
    return compareTopos(intMems1, intMems2, strNewTop1, strNewTop2, N1, N2)
def CompareToposGloballyNew(strTop1, strTop2, strProtein1, strProtein2, fpLog):  #{{{
    """Compare two aligned topologies over their whole length.

    The pair is passed through lcmp.RemoveUnnecessaryGap, then each
    topology is run through trimTopo and filterTopo before being counted
    and compared.

    Args:
        strTop1: first aligned topology string.
        strTop2: second aligned topology string.
        strProtein1: label for the first topology in log/error output.
        strProtein2: label for the second topology in log/error output.
        fpLog: writable file object receiving progress output, or 0 to
            disable logging.

    Returns:
        ("DIFF", 0, 0) when both processed topologies are empty, otherwise
        the result of compareTopos().

    Exits the process with status 1 when exactly one processed topology
    is empty.
    """
    topoA, topoB = lcmp.RemoveUnnecessaryGap([strTop1, strTop2])
    # Tuple elements are evaluated left to right, so the helper call order
    # (trim 1, trim 2, filter 1, filter 2) is kept.
    trimmedA, trimmedB = trimTopo(topoA), trimTopo(topoB)
    topoA, topoB = filterTopo(trimmedA), filterTopo(trimmedB)

    if fpLog != 0:
        print("Global", file=fpLog)
        for label, topo in ((strProtein1, topoA), (strProtein2, topoB)):
            print("%-20s:%s" % (label, topo), file=fpLog)
        print(file=fpLog)

    lenA = len(topoA)
    lenB = len(topoB)
    if lenA <= 0 and lenB <= 0:
        return ("DIFF", 0, 0)
    if lenA == 0 or lenB == 0:
        # Only one side survived trimming/filtering.
        print("%s %s global length does not match" % (strProtein1,
                                                      strProtein2),
              file=sys.stderr)
        sys.exit(1)

    (numMemA, ntermA) = counttopo(topoA)
    (numMemB, ntermB) = counttopo(topoB)
    return compareTopos(numMemA, numMemB, topoA, topoB, ntermA, ntermB)
def WriteSeqAlnHTML(seqAlnFileList, extTopoMSA, outfile):  # {{{
    """Write an HTML page of sequence alignments annotated by topology.

    For each alignment file, the companion topology MSA file is found by
    replacing the alignment file's extension with extTopoMSA.  Pairs where
    either file is missing are reported on stderr and skipped.

    Args:
        seqAlnFileList: paths of FASTA sequence alignment files.
        extTopoMSA: extension (without dot) of the topology MSA files.
        outfile: path of the HTML file to create.

    Returns:
        0 on success, 1 if outfile could not be opened for writing.
    """
    try:
        fpout = open(outfile, "w")
    except IOError:
        print("Failed to write to %s" % (outfile), file=sys.stderr)
        return 1

    # The context manager closes the output even if a helper below raises.
    with fpout:
        WriteHTMLHeader(
            'Alignment highlighted by <font color=%s>TM regions</font>' %
            ('red'), fpout)

        print("Processed alignments:")
        for alnfile in seqAlnFileList:
            alnroot = os.path.splitext(alnfile)[0]
            rootname_alnfile = os.path.basename(alnroot)
            topomsafile = '.'.join([alnroot, extTopoMSA])

            isMissing = False
            if not os.path.exists(alnfile):
                sys.stderr.write('alnfile %s does not exist\n' % (alnfile))
                isMissing = True
            if not os.path.exists(topomsafile):
                sys.stderr.write('topomsafile %s does not exist\n' %
                                 (topomsafile))
                isMissing = True
            if isMissing:
                continue

            (seqIDList, seqAnnoList, seqList) = myfunc.ReadFasta(alnfile)
            (topoIDList, topoAnnoList,
             topoList) = myfunc.ReadFasta(topomsafile)

            if g_params['removeUnnecessaryGap']:
                seqList = lcmp.RemoveUnnecessaryGap(seqList)
                topoList = lcmp.RemoveUnnecessaryGap(topoList)

            # since there is no shrinking, index map is always p->p
            final2seq_idxMapList = [
                {j: j for j in range(len(seqList[i]))}
                for i in range(len(seqIDList))
            ]

            print('\t' + rootname_alnfile)
            # topoList is passed twice, exactly as in the original call.
            WriteHTMLAlignment2(rootname_alnfile, topoIDList, topoAnnoList,
                                topoList, topoList, seqList,
                                final2seq_idxMapList, fpout)

        WriteHTMLTail(fpout)

    return 0
def compareToposLocally(strAlitopFile):  #{{{
    """Compare the two topologies in an alignment file after cutting off
    the unaligned ends.

    The pair is read with readAliTopo().  Leading and trailing columns in
    which either topology has a gap are removed, columns where both are
    gaps are dropped, and the remainder is passed through trimTopo and
    filterTopo before being counted and compared.  Debug output is printed
    to stdout.

    Args:
        strAlitopFile: path of the aligned-topology file.

    Returns:
        The result of compareTopos() on the processed topologies.
    """
    # Cut off unaligned ends:
    # Modified by Nanjiang from the original code 2010-08-11
    # compareToposLocally, that is
    # -----iiiMMMooo--  ___\  iiiMMMooo
    # iiiiii--MMMMoooo     /  i--MMMMoo
    print("###########\nCompareToposLocally\n")  #debug
    (strTop1, strTop2) = readAliTopo(strAlitopFile)
    [strTop1, strTop2] = lcmp.RemoveUnnecessaryGap([strTop1, strTop2])
    print("strTop1:%s" % (strTop1))  #debug
    print("strTop2:%s" % (strTop2))  #debug

    alnLength = len(strTop1)

    # 1. treat the beginning: skip leading columns that contain a gap.
    #    The loop condition also covers an empty alignment.
    nbegin = 0
    while nbegin < alnLength and (strTop1[nbegin] == '-' or
                                  strTop2[nbegin] == '-'):
        nbegin += 1

    # 2. treat the ending: skip trailing columns that contain a gap
    nend = alnLength
    while nend > 0 and (strTop1[nend - 1] == '-' or
                        strTop2[nend - 1] == '-'):
        nend -= 1

    tmpStrTop1 = strTop1[nbegin:nend]
    tmpStrTop2 = strTop2[nbegin:nend]

    # 3. remove unnecessary gaps (columns where both are gaps)
    cols1 = []
    cols2 = []
    for i, ch1 in enumerate(tmpStrTop1):
        ch2 = tmpStrTop2[i]
        if ch1 == '-' and ch2 == '-':
            continue
        cols1.append(ch1)
        cols2.append(ch2)
    strNewTop1 = ''.join(cols1)
    strNewTop2 = ''.join(cols2)

    print("After local treatment\n")  #debug
    print("strTop1:%s" % (strNewTop1))  #debug
    print("strTop2:%s" % (strNewTop2))  #debug

    # after local treatment, gaps may still exist in the alignment, use the
    # function trimTopo to remove these gaps
    strNewTop1 = trimTopo(strNewTop1)
    strNewTop2 = trimTopo(strNewTop2)

    strNewTop1 = filterTopo(strNewTop1)
    strNewTop2 = filterTopo(strNewTop2)

    (intMems1, N1) = counttopo(strNewTop1)
    (intMems2, N2) = counttopo(strNewTop2)
    return compareTopos(intMems1, intMems2, strNewTop1, strNewTop2, N1, N2)
def CompareToposLocallyNew(strTop1, strTop2, strProtein1, strProtein2, fpLog):  #{{{
    """Compare two aligned topologies after cutting off the unaligned ends.

    Leading and trailing columns in which either topology has a gap are
    removed, columns where both are gaps are dropped, and the remainder is
    passed through trimTopo and filterTopo before being counted and
    compared.

    Args:
        strTop1: first aligned topology string.
        strTop2: second aligned topology string.
        strProtein1: label for the first topology in log/error output.
        strProtein2: label for the second topology in log/error output.
        fpLog: writable file object receiving progress output, or 0 to
            disable logging.

    Returns:
        ("DIFF", 0, 0) when both processed topologies are empty, otherwise
        the result of compareTopos().

    Exits the process with status 1 when exactly one processed topology
    is empty.
    """
    # -----iiiMMMooo--  ___\  iiiMMMooo
    # iiiiii--MMMMoooo     /  i--MMMMoo
    [strTop1, strTop2] = lcmp.RemoveUnnecessaryGap([strTop1, strTop2])

    alnLength = len(strTop1)

    # 1. treat the beginning: skip leading columns that contain a gap.
    #    The loop condition also covers an empty alignment.
    nbegin = 0
    while nbegin < alnLength and (strTop1[nbegin] == '-' or
                                  strTop2[nbegin] == '-'):
        nbegin += 1

    # 2. treat the ending: skip trailing columns that contain a gap
    nend = alnLength
    while nend > 0 and (strTop1[nend - 1] == '-' or
                        strTop2[nend - 1] == '-'):
        nend -= 1

    tmpStrTop1 = strTop1[nbegin:nend]
    tmpStrTop2 = strTop2[nbegin:nend]

    # 3. remove unnecessary gaps (columns where both are gaps)
    cols1 = []
    cols2 = []
    for i, ch1 in enumerate(tmpStrTop1):
        ch2 = tmpStrTop2[i]
        if ch1 == '-' and ch2 == '-':
            continue
        cols1.append(ch1)
        cols2.append(ch2)

    # after local treatment, gaps may still exist in the alignment, use the
    # function trimTopo to remove these gaps
    strNewTop1 = trimTopo(''.join(cols1))
    strNewTop2 = trimTopo(''.join(cols2))

    strNewTop1 = filterTopo(strNewTop1)
    strNewTop2 = filterTopo(strNewTop2)

    if fpLog != 0:
        print("Locally", file=fpLog)
        print("%-20s:%s" % (strProtein1, strNewTop1), file=fpLog)
        print("%-20s:%s" % (strProtein2, strNewTop2), file=fpLog)
        print(file=fpLog)

    if len(strNewTop1) <= 0 and len(strNewTop2) <= 0:
        return ("DIFF", 0, 0)
    elif len(strNewTop1) == 0 or len(strNewTop2) == 0:
        # Exactly one side is empty: the pair cannot be compared.
        print("%s %s local length does not match" % (
            strProtein1, strProtein2), file=sys.stderr)
        sys.exit(1)

    (intMems1, N1) = counttopo(strNewTop1)
    (intMems2, N2) = counttopo(strNewTop2)
    return compareTopos(intMems1, intMems2, strNewTop1, strNewTop2, N1, N2)
def action(method, alnfile, outfile):
    """Strip unnecessary gaps from a FASTA alignment and write the result.

    Args:
        method: 0 selects lcmp.RemoveUnnecessaryGap_old; any other value
            selects lcmp.RemoveUnnecessaryGap.
        alnfile: path of the FASTA alignment to read.
        outfile: path of the file to write, or "" to write to stdout.

    Returns:
        0 on success, 1 if an IOError occurred while opening or writing
        the output.
    """
    (seqidList, seqAnnoList, seqList) = myfunc.ReadFasta(alnfile)
    if method == 0:
        newSeqList = lcmp.RemoveUnnecessaryGap_old(seqList)
    else:
        newSeqList = lcmp.RemoveUnnecessaryGap(seqList)

    try:
        fpout = sys.stdout if outfile == "" else open(outfile, "w")
        try:
            for i in range(len(seqidList)):
                fpout.write(">%s\n" % (seqAnnoList[i]))
                fpout.write("%s\n" % (newSeqList[i]))
        finally:
            # Close a real output file even when a write fails; never
            # close stdout.
            if fpout is not sys.stdout:
                fpout.close()
        return 0
    except IOError:
        click.echo("Failed to write to file %s" % (outfile))
        return 1
def AddPairwiseAlignmentFactor(pairlistDict, msapath, msaext,  #{{{
                               isLocalAlignment):
    """Append an alignment factor to every pair that can be found in its MSA.

    For each family id, the MSA file msapath + os.sep + famid + msaext is
    read.  For every pair whose two ids are both present, the pairwise
    alignment is extracted, unnecessary gaps are removed, and the value of
    lcmp.GetAlignmentFactorFromPairAlignment() is appended to the pair
    list in place.  Missing files, missing ids and bad alignments are
    reported on stderr and skipped.

    Args:
        pairlistDict: mapping famid -> list of pairs; each pair is a
            mutable list whose first two items are sequence ids.
        msapath: directory holding the MSA files.
        msaext: file name suffix appended to the family id.
        isLocalAlignment: forwarded to
            lcmp.GetAlignmentFactorFromPairAlignment().

    Returns:
        0 always.
    """
    verbose = g_params['verbose']
    for cntfamid, famid in enumerate(pairlistDict, 1):
        if verbose >= 2:
            print("Add pairwise alignment factor for %d: %s" % (cntfamid,
                                                                famid))
        msafile = msapath + os.sep + famid + msaext
        if not os.path.exists(msafile):
            print("msafile %s does not exist. Ignore" % msafile,
                  file=sys.stderr)
            continue
        (idList, annoList, seqList) = myfunc.ReadFasta(msafile)
        msaDict = dict(zip(idList, seqList))

        for pair in pairlistDict[famid]:
            id1 = pair[0]
            id2 = pair[1]
            if id1 in msaDict and id2 in msaDict:
                [seq1, seq2] = lcmp.RemoveUnnecessaryGap(
                    [msaDict[id1], msaDict[id2]])
                if len(seq1) != len(seq2):
                    print("Bad alignment for %s and %s" % (id1, id2),
                          file=sys.stderr)
                else:
                    alignFactor = lcmp.GetAlignmentFactorFromPairAlignment(
                        seq1, seq2, isLocalAlignment)
                    # Mutates the caller's pair list in place.
                    pair.append(alignFactor)
            else:
                if id1 not in msaDict:
                    print("%s not in msafile %s" % (id1, msafile),
                          file=sys.stderr)
                if id2 not in msaDict:
                    print("%s not in msafile %s" % (id2, msafile),
                          file=sys.stderr)
    return 0
def compareToposGaplessly(strAlitopFile):  #{{{
    """Compare the two topologies in an alignment file after dropping
    every gapped column.

    The pair is read with readAliTopo().  Every column in which either
    topology has a gap ('-') is removed, the remainder is passed through
    filterTopo, then counted and compared.  Debug output is printed to
    stdout.

    Args:
        strAlitopFile: path of the aligned-topology file.

    Returns:
        The result of compareTopos() on the gapless, filtered topologies.
    """
    # 1st version:
    # Cut off unaligned ends:
    # By Nanjiang 2010-08-11: this actually cuts off all unaligned regions,
    # not only the two endings. The code that really deals with the ends is
    # written in another def, and compareToposLocally is renamed as
    # compareToposGaplessly
    # that is
    # -----iiiMMMooo  ___\  iMMMooo
    # iiiiii--MMMMoo     /  iMMMMoo
    print("##########\nCompareToposGaplessly\n")  #debug
    (strTop1, strTop2) = readAliTopo(strAlitopFile)
    [strTop1, strTop2] = lcmp.RemoveUnnecessaryGap([strTop1, strTop2])
    print("strTop1:%s" % (strTop1))  #debug
    print("strTop2:%s" % (strTop2))  #debug

    # Keep only the columns where neither topology has a gap.  strTop2 is
    # indexed (not zipped) so a too-short second string still raises.
    keptCols = [(ch1, strTop2[i]) for i, ch1 in enumerate(strTop1)
                if not (ch1 == '-' or strTop2[i] == '-')]
    strNewTop1 = ''.join(col[0] for col in keptCols)
    strNewTop2 = ''.join(col[1] for col in keptCols)

    print("After Gapless treatment\n")  #debug
    print("strTop1:%s" % (strNewTop1))  #debug
    print("strTop2:%s" % (strNewTop2))  #debug

    strNewTop1 = filterTopo(strNewTop1)
    strNewTop2 = filterTopo(strNewTop2)

    (intMems1, N1) = counttopo(strNewTop1)
    (intMems2, N2) = counttopo(strNewTop2)
    return compareTopos(intMems1, intMems2, strNewTop1, strNewTop2, N1, N2)
def compareToposGlobally(strAlitopFile):  #{{{
    """Compare the two topologies in an alignment file over their whole
    length.

    The pair is read with readAliTopo(), passed through
    lcmp.RemoveUnnecessaryGap, trimTopo and filterTopo, then counted and
    compared.  Debug output is printed to stdout.

    Args:
        strAlitopFile: path of the aligned-topology file.

    Returns:
        The result of compareTopos() on the processed topologies.
    """
    print("########\nCompareToposGlobally")  #debug
    (strTop1, strTop2) = readAliTopo(strAlitopFile)
    [strTop1, strTop2] = lcmp.RemoveUnnecessaryGap([strTop1, strTop2])
    print("strTop1:%s" % (strTop1))  #debug
    print("strTop2:%s" % (strTop2))  #debug

    strTop1 = trimTopo(strTop1)
    strTop2 = trimTopo(strTop2)
    print("After trimming")  #debug
    print("strTop1:%s" % (strTop1))  #debug
    print("strTop2:%s" % (strTop2))  #debug

    strTop1 = filterTopo(strTop1)
    strTop2 = filterTopo(strTop2)

    (intNumMem1, Nterm1) = counttopo(strTop1)
    (intNumMem2, Nterm2) = counttopo(strTop2)
    return compareTopos(intNumMem1, intNumMem2, strTop1, strTop2, Nterm1,
                        Nterm2)
def WritePairAln(pairlistDict, msapath, msaext, outname):  #{{{
    """Write pairwise alignments, a summary table and the selected pair list.

    Three files are created: outname + ".pairaln" (FASTA-style pairwise
    alignments), outname + ".tableinfo" (one formatted row per pair) and
    outname + ".pairlistwithpfamid" (id1 id2 famid).  Pairs are taken from
    the family MSA msapath + os.sep + famid + msaext; missing MSA files
    and bad alignments are reported on stderr and skipped.

    Args:
        pairlistDict: mapping famid -> list of pairs; each pair holds two
            sequence ids followed by a dict with the keys 'seqidt0',
            'seqidt1', 'seqidt2', 'alnLength', 'seqLength1', 'seqLength2',
            'numIDT' and 'numGap'.
        msapath: directory holding the MSA files.
        msaext: file name suffix appended to the family id.
        outname: common prefix of the three output files.

    Returns:
        0 on success, 1 if any output file could not be opened.
    """
    verbose = g_params['verbose']
    outAlnFile = outname + ".pairaln"
    outTableFile = outname + ".tableinfo"
    outSelPairList = outname + ".pairlistwithpfamid"

    # Open the outputs one by one; on failure close those already opened
    # so no handle leaks.
    openedFiles = []
    for outpath in (outAlnFile, outTableFile, outSelPairList):
        try:
            openedFiles.append(open(outpath, "w"))
        except IOError:
            print("Failed to write to file", outpath, file=sys.stderr)
            for fp in openedFiles:
                fp.close()
            return 1
    fpout_aln, fpout_table, fpout_list = openedFiles

    try:
        fpout_table.write(
            "#%-15s %-15s %6s %6s %9s %6s %6s %9s %6s %6s %6s %6s %6s\n" % (
                "Seq1", "Seq2", "IDT0", "SIM0", "AlnLength", "Len1", "Len2",
                "Score", "N_IDT", "N_SIM", "N_GAP", "IDT1", "IDT2"))

        for famid in pairlistDict:
            if verbose >= 2:
                print("Write pairwise alignment for %s" % (famid))
            msafile = msapath + os.sep + famid + msaext
            if not os.path.exists(msafile):
                print("msafile %s does not exist. Ignore" % msafile,
                      file=sys.stderr)
                continue
            (idList, annoList, seqList) = myfunc.ReadFasta(msafile)
            msaDict = dict(zip(idList, seqList))
            annoDict = dict(zip(idList, annoList))

            for pair in pairlistDict[famid]:
                id1 = pair[0]
                id2 = pair[1]
                if not (id1 in msaDict and id2 in msaDict):
                    continue
                [seq1, seq2] = lcmp.RemoveUnnecessaryGap(
                    [msaDict[id1], msaDict[id2]])
                if len(seq1) != len(seq2):
                    print("Bad alignment for %s and %s" % (id1, id2),
                          file=sys.stderr)
                    continue

                rd = pair[2]
                fpout_aln.write(
                    ">%s aligned_to=%s seqIDT=%.1f seqIDT1=%.1f\n" % (
                        annoDict[id1], id2, rd['seqidt0'], rd['seqidt1']))
                fpout_aln.write("%s\n" % seq1)
                fpout_aln.write(
                    ">%s aligned_to=%s seqIDT=%.1f seqIDT1=%.1f\n" % (
                        annoDict[id2], id1, rd['seqidt0'], rd['seqidt1']))
                fpout_aln.write("%s\n" % seq2)
                # The SIM0, Score and N_SIM columns are written as fixed
                # placeholders (-1.0, -1.0, -1).
                fpout_table.write(
                    "%-16s %-15s %6.1f %6.1f %9d %6d %6d %9.1f %6d %6d %6d %6.1f %6.1f\n" % (
                        id1, id2, rd['seqidt0'], -1.0, rd['alnLength'],
                        rd['seqLength1'], rd['seqLength2'], -1.0,
                        rd['numIDT'], -1, rd['numGap'], rd['seqidt1'],
                        rd['seqidt2']))
                fpout_list.write("%s %s %s\n" % (id1, id2, famid))
    finally:
        for fp in openedFiles:
            fp.close()

    print("Result output to ")
    print("\t%s" % outAlnFile)
    print("\t%s" % outTableFile)
    return 0
def main():  #{{{
    """Developer scratchpad of ad-hoc experiments.

    Every experiment is guarded by a literal ``if 0:`` (disabled) or
    ``if 1:`` (enabled).  Only the final subprocess demonstration is
    enabled; it runs ``seq``, ``echo``, ``sleep`` and ``ls``.
    """
    if 0:  #{{{
        strTop1 = "---MMMM-----i-i-i---MMM----MMMM-ooo"
        strTop2 = "----MMMM-----i-ii-----MMM---MMM--oo"
        strProtein1 = "id1"
        strProtein2 = "id2"
        fpLog = sys.stdout
        class_gapless, num1_gapless, num2_gapless = ct.CompareToposGaplesslyNew(
            strTop1, strTop2, strProtein1, strProtein2, fpLog)
        # Note: calling the int, float, string will not change their original value
        # calling the dict, list will change their original value
        print("strTop1:", strTop1)
        print("strTop2:", strTop2)
    #}}}
    if 0:  #{{{
        PrintFuncName()
        print("this file name is: %s" % __file__)
    #}}}
    if 0:  #{{{
        # filename="/nanjiang/data/blastdb/uniprot_KW181_idt50.fasta"
        filename = sys.argv[1]
        print(filename)
        fp = open(filename, "r")
        lines = fp.readlines()
        fp.close()
    #}}}
    if 0:  #{{{
        # filename="/nanjiang/data/blastdb/uniprot_KW181_idt50.fasta"
        filename = sys.argv[1]
        print(filename)
        BLOCK_SIZE = 100000
        fp = open(filename, "r")
        buff = fp.read(BLOCK_SIZE)
        while buff:
            buff = fp.read(BLOCK_SIZE)
        fp.close()
    #}}}
    if 0:  #{{{
        # filename="/nanjiang/data/blastdb/uniprot_KW181_idt50.fasta"
        filename = sys.argv[1]
        print(filename)
        fp = open(filename, "r")
        line = fp.readline()
        while line:
            line = fp.readline()
        fp.close()
    #}}}
    if 0:  #{{{
        try:
            BLOCK_SIZE = 100000
            infile = sys.argv[1]
            # NOTE(review): binary mode combined with a str buffer only
            # works on Python 2 -- confirm what ReadFastaFromBuffer expects.
            fpin = open(infile, 'rb')
            unprocessedBuffer = ""
            isEOFreached = False
            while 1:
                buff = fpin.read(BLOCK_SIZE)
                if len(buff) < BLOCK_SIZE:
                    isEOFreached = True
                buff = unprocessedBuffer + buff
                recordList = []
                unprocessedBuffer = myfunc.ReadFastaFromBuffer(
                    buff, recordList, isEOFreached)
                if len(recordList) > 0:
                    for record in recordList:
                        sys.stdout.write(">%s\n" % record[1])
                        sys.stdout.write("%s\n" % record[2])
                if isEOFreached:
                    break
            fpin.close()
        except IOError:
            raise
    #}}}
    if 0:  #{{{
        try:
            infile = sys.argv[1]
            (annoList, seqList) = myfunc.ReadFasta_without_id(infile)
            for i in range(len(seqList)):
                sys.stdout.write(">%s\n" % annoList[i])
                sys.stdout.write("%s\n" % seqList[i])
        except IOError:
            raise
    #}}}
    if 0:  #{{{
        hhrfile = "hhsearch/A1RZ92-Q74DY9.hhr"
        if IsDuplicatedByHHSearch(hhrfile):
            print("yes")
    #}}}
    if 0:  #{{{
        import pairlistwithfamid2pairaln_by_msa
        seq1 = "--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------MLSSTATTMLRAGVSRSSGALQPMLLRSAACPCSPFSMNTKLSQPTSV-----RPLSTSPSALVLRFRAQQQAQLAQQQLRRASSSSSSSSSSTRPRSDAELDANAAEAAAAAQSAAHAGEPVLDWNTFFKLRKTRRRVQLAFSVIMTLITSGAGGAVLSTGVADAMVAQVPLEPMFAVGLMTASFGALGWLMGPAMGGMVFNALKSKYRGQMEIKEGQFFARIKKHRVDPSASSMGNPVPDFYGEKISSVAGYRQWLKDQRAFNKKRTTFV"
        seq2 = "MDILLAVLEQGFIFSIVCFGVYITYKILDFPDLSVDGTFPLGAAVAAAFLVKGYSPVLSSLAALVAGAIAGGITGILHVKFKITNLLSGILVMVGLYSINLRIMGKSNIPLFNKIHLFSDTMNPIIIITVFLLICKITLDLFLKTKAGFILKATGDNEQLVLSLGVNKDLVKIMGLMLSNALVALGGALMAQYQGFSDVGMGTGIVVMGLASVIIGESLFGRIKALNATTRVLLGALVYKLSVSI---ALTVGLAP-------TDLKLVTAIIVVIALSLNKNPLKIITKQKTKEGGIL------NASNTKSAQSVQ-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------"
        # The second pair overrides the first one above.
        seq1 = "---------------------------------------------------------------------------------------------------------------------------------------MALSSLFFTASALLLMFLAFLGGARNSNPLDRIYWLEAATGNIPGAPALSRWTYWNLCAVNSEGHNECGKSYPDYPFDPPSHRNFNTHVNIPAAFIGTRHYFLTSRFMFPFHIIALFFATCSLLTGFLAMCTRIGNWVSAFSAYFALTFQTITTCLMTAVYVQGRDKFNNNGQSSHLGVKAFAFMWTSVALLFLSCVIYCMGGAVGRKDGGYSGREQRRRGFFNSHRSGSLRSNKETAP"
        seq2 = "MRKIAAIGGIVFISFILTIVAMFTKLWISWSIGKFSYGIGIVPYHSNSAGWFTAASWMVFISFGLFIPLILVVLFTAYKVHHDGCCHSIRHCFNSICLICSIIAVLEIIAFVLMAVNASRYVKGASISEKKSLLQLGSSAYLDLVSAILIIVATVLSGHASHHDCH----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------"
        alignFactor = pairlistwithfamid2pairaln_by_msa.GetAlignmentFactorFromPairAlignment(
            seq1, seq2)
        print(alignFactor)
    #}}}
    if 0:  #{{{
        try:
            dbname = sys.argv[1]
            print(dbname)
            from myfunc import MyDB
            cls = MyDB(dbname)
            # print cls.idList
            record = cls.GetRecord("A0FGX9")
            if record:
                print(record)
                # for rd in cls.GetAllRecord():
                #     print rd
                (seqid, anno, seq) = myfunc.ExtractFromSeqWithAnno(record)
                print(seqid, anno, seq)
        except IndexError:
            pass
    #}}}
    if 0:  #{{{
        import my_extractdb
        # mimicking my_extractdb.py to see which one is faster
        try:
            dbname = sys.argv[1]
            idlistfile = sys.argv[2]
            cls = myfunc.MyDB(dbname)
            if cls.failure:
                print("MyDB init failed", file=sys.stderr)
            else:
                with open(idlistfile, "r") as fpin:
                    idlist = fpin.read().split("\n")
                fpout = sys.stdout
                for seqid in idlist:
                    if seqid:
                        record = cls.GetRecord(seqid)
                        fpout.write(record)
                        # for rd in cls.GetAllRecord():
                        #     print rd
                        # (seqid, anno, seq) = myfunc.ExtractFromSeqWithAnno(record)
                        # print (seqid, anno, seq)
        except IndexError:
            print("error")
    #}}}
    if 0:  #{{{
        #test ReadLineByBlock
        try:
            infile = sys.argv[1]
            from myfunc import ReadLineByBlock
            cls = ReadLineByBlock(infile)
            lines = cls.readlines()
            while lines is not None:
                for line in lines:
                    print(line)
                lines = cls.readlines()
        except IndexError:
            pass
    #}}}
    if 0:  #{{{
        #test speed of ReadLineByBlock
        # ReadLineByBlock is about 3 times faster than file.readline()
        try:
            from myfunc import ReadLineByBlock
            infile = sys.argv[1]

            start = time.time()
            hdl = ReadLineByBlock(infile)
            lines = hdl.readlines()
            while lines is not None:
                lines = hdl.readlines()
            hdl.close()
            end = time.time()
            msg = "Reading %s by ReadLineByBlock costs %.3fs seconds"
            print(msg % (infile, (end - start)))

            start = time.time()
            hdl = open(infile, "r")
            line = hdl.readline()
            while line:
                line = hdl.readline()
            hdl.close()
            end = time.time()
            msg = "Reading %s by readline() costs %.3fs seconds"
            print(msg % (infile, (end - start)))
        except IndexError:
            pass
    #}}}
    if 0:  #{{{
        #test readline
        try:
            infile = sys.argv[1]
            fp = open(infile, "r")
            line = fp.readline()
            while line:
                print(line)
                line = fp.readline()
            fp.close()
        except IndexError:
            pass
    #}}}
    if 0:  #{{{
        #test the speed of GetFirstWord
        try:
            nloop = int(sys.argv[1])
            string = "kjdafk jasdfj j"
            #string = "askdf askdf "
            # string = "kajsdfasdfsdfjakasjdfka"
            # string = "kajsdfasdf,sdfjakasjdfka"
            delimiter = " \t\r,.\n"
            delimiter = " "
            for i in range(nloop):
                #firstword = myfunc.GetFirstWord(string, delimiter)
                #firstword = string.split()[0]
                #firstword = string.partition(" ")[0]
                firstword = myfunc.GetFirstWord(string)
                #pass
                #print firstword
        except (IndexError, ValueError):
            pass
    #}}}
    if 0:  #{{{
        # read seq by SeqIO
        from Bio import SeqIO
        try:
            seqfile = sys.argv[1]
            # 1. SeqIO ####################
            start = time.time()
            # "rU" was removed in Python 3.11; text mode already uses
            # universal newlines.
            handle = open(seqfile, "r")
            cnt = 0
            for record in SeqIO.parse(handle, "fasta"):
                cnt += 1
            handle.close()
            end = time.time()
            msg = "Reading %d sequences by SeqIO costs %.3fs seconds"
            print(msg % (cnt, (end - start)))

            # 2. ReadFasta ####################
            start = time.time()
            seqfile = sys.argv[1]
            (idList, annoList, seqList) = myfunc.ReadFasta(seqfile)
            end = time.time()
            msg = "Reading %d sequences by ReadFasta costs %.3fs seconds"
            print(msg % (len(idList), (end - start)))

            # 3. ReadFasta from buffer
            BLOCK_SIZE = 100000
            start = time.time()
            cnt = 0
            # NOTE(review): binary mode combined with a str buffer only
            # works on Python 2 -- confirm what ReadFastaFromBuffer expects.
            fpin = open(seqfile, 'rb')
            unprocessedBuffer = ""
            isEOFreached = False
            while 1:
                buff = fpin.read(BLOCK_SIZE)
                if len(buff) < BLOCK_SIZE:
                    isEOFreached = True
                buff = unprocessedBuffer + buff
                recordList = []
                unprocessedBuffer = myfunc.ReadFastaFromBuffer(
                    buff, recordList, isEOFreached)
                cnt += len(recordList)
                if isEOFreached:
                    break
            fpin.close()
            end = time.time()
            msg = "Reading %d sequences by ReadFastaFromBuffer costs %.3fs seconds"
            print(msg % (cnt, (end - start)))

            # 4. ReadFastaByBlock ####################
            start = time.time()
            seqfile = sys.argv[1]
            hdl = myfunc.ReadFastaByBlock(seqfile, 0, 0)
            if hdl.failure:
                print("Failed to init ReadFastaByBlock", file=sys.stderr)
                return 1
            recordList = hdl.readseq()
            cnt = 0
            while recordList is not None:
                cnt += len(recordList)
                # for rd in recordList:
                #     print ">%s"%rd.description
                #     print rd.seq
                recordList = hdl.readseq()
            hdl.close()
            end = time.time()
            msg = "Reading %d sequences by ReadFastaByBlock costs %.3fs seconds"
            print(msg % (cnt, (end - start)))
        except (IndexError, ValueError):
            pass
    #}}}
    if 0:  #{{{
        #test RemoveUnnecessaryGap
        try:
            infile = sys.argv[1]
            start = time.time()
            (idList, seqList) = myfunc.ReadFasta_without_annotation(infile)
            seqList = lcmp.RemoveUnnecessaryGap_old(seqList)
            end = time.time()
            msg = "Run RemoveUnnecessaryGap_old for %s costs %.3fs seconds"
            print(msg % (infile, (end - start)), file=sys.stderr)
            for seq in seqList:
                print(seq)

            start = time.time()
            (idList, seqList) = myfunc.ReadFasta_without_annotation(infile)
            seqList = lcmp.RemoveUnnecessaryGap(seqList)
            end = time.time()
            msg = "Run RemoveUnnecessaryGap for %s costs %.3fs seconds"
            print(msg % (infile, (end - start)), file=sys.stderr)
            for seq in seqList:
                print(seq)
        except IndexError:
            pass
    #}}}
    if 0:  #{{{
        #test ReadMPAByBlock
        try:
            infile = sys.argv[1]
            hdl = myfunc.ReadMPAByBlock(infile)
            if hdl.failure:
                return
            recordList = hdl.readseq()
            while recordList is not None:
                for rd in recordList:
                    #print rd.seqid
                    print(">%s" % (rd.description))
                    print("%s" % (myfunc.mpa2seq(rd.mpa)))
                recordList = hdl.readseq()
            hdl.close()
        except IndexError:
            pass
    #}}}
    if 0:  #{{{
        try:
            dbname = sys.argv[1]
            print(dbname)
            from myfunc import MyDB
            cls = MyDB(dbname)
            # print cls.idList
            record = cls.GetRecord("A0FGX9")
            if record:
                print(record)
                # for rd in cls.GetAllRecord():
                #     print rd
                (seqid, anno, seq) = myfunc.ExtractFromSeqWithAnno(record)
                print(seqid, anno, seq)
        except IndexError:
            pass
    #}}}
    if 0:  #{{{
        #test subprocess
        import glob
        #invoke shell explicitly, not very good, may have security problems
        subprocess.call("seq 10", shell=True)
        subprocess.call("echo wait for 2 seconds...; sleep 2", shell=True)
        subprocess.call("ls topo*.py", shell=True)
    if 1:  #{{{
        #test subprocess
        import glob
        #invoke shell implicitly, recommended way
        subprocess.call(["seq", "10"], shell=False)
        subprocess.call(["echo", "wait for 1 seconds..."])
        subprocess.call(["sleep", "1"])
        try:
            # This will not work: without a shell the wildcard is passed
            # to ls literally.
            print(subprocess.check_call(["ls", "topo*.py"]))
        except subprocess.CalledProcessError as e:
            print("error message:", e)
        # Expand the wildcard in Python instead.
        subprocess.call(["ls"] + glob.glob("topo*.py"))