def FilterSignalPeptide(topofile, sigpepDict, outfile, isDeleteSeqWithSignalPeptide):
    """Filter signal-peptide regions out of a topology FASTA file.

    For each record in topofile whose seqid appears in sigpepDict, either drop
    the sequence entirely (isDeleteSeqWithSignalPeptide) or rewrite its topology
    with lcmp.FilterSignalPeptideInTopology.  Only topologies that still contain
    at least one TM segment are written to outfile.

    Returns 0 on success, 1 if the input file cannot be read.
    """
    hdl = myfunc.ReadFastaByBlock(topofile)
    if hdl.failure:
        return 1
    fpout = myfunc.myopen(outfile, sys.stdout, "w", False)
    batch = hdl.readseq()
    while batch != None:
        for record in batch:
            # -1 means "no signal peptide annotated for this sequence"
            sp_pos = sigpepDict.get(record.seqid, -1)
            if sp_pos == -1:
                newtopo = record.seq
            elif isDeleteSeqWithSignalPeptide:
                newtopo = ""
            else:
                newtopo = lcmp.FilterSignalPeptideInTopology(record.seq, sp_pos)
            # keep only topologies that still have at least one TM helix
            if newtopo != "" and myfunc.CountTM(newtopo) > 0:
                fpout.write(">%s\n"%(record.description))
                fpout.write("%s\n"%(newtopo))
        batch = hdl.readseq()
    hdl.close()
    myfunc.myclose(fpout)
    return 0
def MatchMSATopo_using_topofile( msafile, topofile, isIgnoreBadseq, #{{{
        method_match, outfile):
    """Map topologies from a topology file onto the sequences of an MSA.

    Each MSA record is matched to its topology (looked up by seqid) with
    MatchSeqToTopo.  Records whose match yields "BADSEQ" are skipped when
    isIgnoreBadseq is set.  Returns 0 on success, 1 on read failure.
    """
    topoDict = GetTopoDict(topofile)
    hdl = myfunc.ReadFastaByBlock(msafile)
    if hdl.failure:
        return 1
    fpout = myfunc.myopen(outfile, sys.stdout, "w", False)
    batch = hdl.readseq()
    while batch != None:
        for record in batch:
            if record.seqid in topoDict:
                topo = topoDict[record.seqid]
            else:
                # no topology available: match against the empty topology
                print("topo not found for ID %s" % (record.seqid), file=sys.stderr)
                topo = ""
            matchedtopo = MatchSeqToTopo(record.seq, topo, method_match)
            if matchedtopo == "BADSEQ" and isIgnoreBadseq:
                continue
            print(">%s" % (record.description), file=fpout)
            print("%s" % (matchedtopo), file=fpout)
        batch = hdl.readseq()
    myfunc.myclose(fpout)
    hdl.close()
    return 0
def MFA2MPA_obsolete(infile, fpout):#{{{
    # Obsolete converter from multi-FASTA alignment (MFA) to a per-residue
    # positional format: for every non-gap residue it writes "pos res ", where
    # the first residue uses its absolute column index and subsequent residues
    # use the offset from that first residue.
    # GAP is a module-level gap character constant (presumably '-' — defined
    # elsewhere in the file).
    # Returns 0 on success, 1 if the input cannot be read.
    hdl = myfunc.ReadFastaByBlock(infile, 0, 1)
    if hdl.failure:
        return 1
    lengthList = []  # alignment lengths, used for the consistency warning below
    recordList = hdl.readseq()
    while recordList != None:
        for rd in recordList:
            print >> fpout, ">%s"%(rd.description)
            print >> fpout, "Length: %d"%(len(rd.seq))
            seq = rd.seq
            firstrespos = -1  # column index of the first non-gap residue
            for i in xrange(len(seq)):
                if seq[i] != GAP:
                    if firstrespos == -1:
                        # first residue: record anchor and emit absolute index
                        firstrespos = i
                        fpout.write("%d %s "%(i, seq[i]))
                    else:
                        # later residues: emit offset relative to the anchor
                        fpout.write("%d %s "%(i-firstrespos, seq[i]))
            fpout.write("\n")
            lengthList.append(len(seq))
        recordList = hdl.readseq()
    hdl.close()
    # a proper MSA has equal-length rows; warn (but do not fail) otherwise
    if len(set(lengthList)) > 1:
        msg = "Warning! Length of the MSA file %s are not equal!"
        print >> sys.stderr, msg%(infile)
    return 0
def MFA2MPA(infile, fpout):#{{{
    # Convert a multi-FASTA alignment (MFA) to MPA format: runs of gaps are
    # replaced by their length (an integer), while residue stretches are kept
    # verbatim, e.g. "AAA--BB" -> "AAA 2 BB".
    # GAP is a module-level gap character constant (presumably '-' — defined
    # elsewhere in the file); myfunc.GetSegPos returns [(start, end), ...]
    # half-open spans of gap runs.
    # Returns 0 on success, 1 if the input cannot be read.
    hdl = myfunc.ReadFastaByBlock(infile, 0, 1)
    if hdl.failure:
        return 1
    lengthList = []  # alignment lengths, used for the consistency warning below
    recordList = hdl.readseq()
    while recordList != None:
        for rd in recordList:
            print >> fpout, ">%s"%(rd.description)
            seq = rd.seq
            gapPosList = myfunc.GetSegPos(seq, GAP)
            num = len(gapPosList)
            length = len(seq)
            if num < 1 :
                # no gaps at all: emit the sequence unchanged
                fpout.write("%s\n"%(seq))
            else:
                # leading residues before the first gap run, if any
                if gapPosList[0][0] > 0:
                    fpout.write("%s "%(seq[0:gapPosList[0][0]]))
                # alternate gap-run length / following residue stretch
                for i in xrange(num-1):
                    fpout.write("%d "%(gapPosList[i][1] - gapPosList[i][0]))
                    fpout.write("%s "%(seq[gapPosList[i][1]:gapPosList[i+1][0]]))
                # last gap run, then any trailing residues
                fpout.write("%d "%(gapPosList[num-1][1] - gapPosList[num-1][0]))
                if gapPosList[num-1][1] < length:
                    fpout.write("%s"%(seq[gapPosList[num-1][1]:length]))
                fpout.write("\n")
            lengthList.append(length)
        recordList = hdl.readseq()
    hdl.close()
    # a proper MSA has equal-length rows; warn (but do not fail) otherwise
    if len(set(lengthList)) > 1:
        msg = "Warning! Length of the MSA file %s are not equal!"
        print >> sys.stderr, msg%(infile)
    return 0
def IsUniqueSeq(infile, method, isUseMD5): #{{{
    """Check whether the records of a FASTA file are unique.

    method: "id" compares sequence IDs, "seq" compares sequence contents
            (optionally hashed with MD5 when isUseMD5 is set, to save memory).

    Return value
        yes    :  1
        no     :  0
        failed : -1
    """
    import hashlib  # local import; replaces the deprecated md5 module
    hdl = myfunc.ReadFastaByBlock(infile)
    if hdl.failure:
        return -1
    myset = set([])
    recordList = hdl.readseq()
    while recordList != None:
        for rd in recordList:
            if method == "id":
                key = rd.seqid
            elif method == "seq":
                if isUseMD5:
                    # hashlib.md5 produces the same digest as md5.new
                    key = hashlib.md5(rd.seq).digest()
                else:
                    key = rd.seq
            if key in myset: # duplicated
                hdl.close()  # fix: close the handle before the early return
                return 0 # not unique
            myset.add(key)
        recordList = hdl.readseq()
    hdl.close()
    return 1 #unique
def ExcludeConsensus(infile, g_outpath):
    """Copy a FASTA file, dropping records whose seqid contains 'consensus'.

    The output is written to <outpath>/<rootname>.nocons.fasta, where outpath
    defaults to the directory of infile when g_outpath is empty.

    Returns 0 on success, 1 on failure.
    """
    if g_outpath == "":
        outpath = os.path.dirname(infile)
        if outpath == "":
            outpath = "."
    else:
        outpath = g_outpath
    rootname = os.path.basename(os.path.splitext(infile)[0])
    outfile = "%s%s%s.nocons.fasta" % (outpath, os.sep, rootname)
    try:
        fpout = open(outfile, "w")
        hdl = myfunc.ReadFastaByBlock(infile)
        if hdl.failure:
            fpout.close()  # fix: do not leak the output handle on read failure
            return 1
        recordList = hdl.readseq()
        while recordList != None:
            for record in recordList:
                # keep everything that is not a consensus record
                if record.seqid.lower().find("consensus") == -1:
                    fpout.write(">%s\n%s\n" % (record.description, record.seq))
            recordList = hdl.readseq()
        fpout.close()
        hdl.close()
        return 0  # fix: explicit success status (was implicit None)
    except IOError:
        # fix: original `print sys.stderr, ...` was missing `>>` and printed
        # the stderr file object to stdout instead of the message to stderr
        sys.stderr.write("Failed to write to file %s\n" % (outfile))
        return 1
def IsUniqueSeq(infile, method):#{{{
    """Check whether the records of a FASTA file are unique.

    method: "id" compares sequence IDs, "seq" compares sequence contents.

    Returns 1 if unique, 0 if a duplicate exists, -1 on read failure.
    """
    hdl = myfunc.ReadFastaByBlock(infile)
    if hdl.failure:
        return -1
    isunique = 1 #init value
    myset = set([])
    recordList = hdl.readseq()
    while recordList != None:
        for rd in recordList:
            if method == "id":
                if rd.seqid in myset:
                    isunique = 0
                    break
                myset.add(rd.seqid)
            elif method == "seq":
                if rd.seq in myset:
                    isunique = 0
                    break
                myset.add(rd.seq)
        if isunique == 0:
            # fix: the inner break only left the for-loop; without this the
            # remainder of the file was still read although the answer was known
            break
        recordList = hdl.readseq()
    hdl.close()
    return isunique
def SplitPfamFasta(infile, outpath):
    """Split a FASTA file grouped by Pfam family into one file per family.

    Records are assumed to be sorted so that all members of a family are
    contiguous; each run of records sharing a Pfam ID is exported with
    ExportFastaForPfamID.  Returns 0 on success, 1 on read failure.
    """
    hdl = myfunc.ReadFastaByBlock(infile, 0, 0)
    if hdl.failure:
        return 1
    currentPfamID = ""
    currentGroup = []
    cnt = 0
    batch = hdl.readseq()
    while batch != None:
        for record in batch:
            pfamid = ExtractPfamIDFromDescription(record.description)
            if currentPfamID == "":
                currentPfamID = pfamid
            if pfamid != currentPfamID:
                # family boundary reached: flush the accumulated group
                ExportFastaForPfamID(currentPfamID, currentGroup, outpath, cnt)
                cnt += 1
                currentPfamID = pfamid
                currentGroup = []
            currentGroup.append(record)
        batch = hdl.readseq()
    # flush the trailing group, if any
    if len(currentGroup) > 0:
        ExportFastaForPfamID(currentPfamID, currentGroup, outpath, cnt)
        cnt += 1
    hdl.close()
    return 0
def ReWriteFasta(infile, outfile): #{{{
    """Re-emit a FASTA file with exactly one header line and one sequence
    line per record.  Returns 0 on success, 1 on read failure."""
    fpout = myfunc.myopen(outfile, sys.stdout, "w", False)
    hdl = myfunc.ReadFastaByBlock(infile, 0, 1)
    if hdl.failure:
        return 1
    batch = hdl.readseq()
    while batch != None:
        for record in batch:
            fpout.write(">%s\n%s\n" % (record.description, record.seq))
        batch = hdl.readseq()
    hdl.close()
    myfunc.myclose(fpout)
    return 0
def IsHasNonStandardAminoAcid(infile): #{{{
    """Return 1 if any sequence in the FASTA file contains a residue outside
    STD1CharAA_alphabet (a module-level string of standard one-letter amino
    acid codes), 0 otherwise, -1 on read failure."""
    hdl = myfunc.ReadFastaByBlock(infile, 0, 0)
    if hdl.failure:
        return -1
    recordList = hdl.readseq()
    while recordList != None:
        for rd in recordList:
            for aa in rd.seq:
                if STD1CharAA_alphabet.find(aa) == -1:
                    # fix: the original `break` only left the character loop
                    # and kept scanning the rest of the file; return as soon
                    # as the answer is known, closing the handle first
                    hdl.close()
                    return 1
        recordList = hdl.readseq()
    hdl.close()
    return 0
def CleanAASeq(infile, isOverWrite, fpout): #{{{
    # Replace non-standard amino acids in a FASTA file.
    # isOverWrite: rewrite infile in place (via a temporary file); otherwise
    # the cleaned records are written to fpout.
    # Returns 0 when nothing needed cleaning, -1 on failure; the in-place
    # rewrite path returns None implicitly.
    isHasNonStandardAminoAcid = IsHasNonStandardAminoAcid(infile)
    if isHasNonStandardAminoAcid == 0:
        # file is already clean
        if isOverWrite:
            msg = "seqfile %s is already cleaned. Ignore"
            print >> sys.stderr, msg % (infile)
        else:
            # plain block-wise copy of infile to fpout
            fpin = open(infile, "r")
            BLOCK_S = 10000
            buff = fpin.read(BLOCK_S)
            while buff:
                fpout.write(buff)
                buff = fpin.read(BLOCK_S)
            fpin.close()
        return 0
    elif isHasNonStandardAminoAcid == -1:
        return -1
    # choose the destination: caller-supplied stream, or a temp file that will
    # later replace infile when overwriting in place
    fpout_local = None
    if not isOverWrite:
        fpout_local = fpout
    else:
        try:
            fpout_local = tempfile.NamedTemporaryFile(delete=False)
        except IOError:
            msg = "Failed to write to temporary file for running seqfile %s"
            print >> sys.stderr, msg % (infile)
            return -1
    hdl = myfunc.ReadFastaByBlock(infile, 0, 0)
    if hdl.failure:
        return -1
    recordList = hdl.readseq()
    while recordList != None:
        for rd in recordList:
            seq = ReplaceNonStandardAminoAcid(rd.seq)
            fpout_local.write(">%s\n" % (rd.description))
            fpout_local.write("%s\n" % (seq))
        recordList = hdl.readseq()
    hdl.close()
    if isOverWrite:
        # atomically-ish replace the original file with the cleaned temp file
        # NOTE(review): shelling out to mv/chmod is fragile (unquoted paths);
        # shutil.move + os.chmod would be safer — left unchanged here
        fpout_local.close()
        #print "tmpfile=",fpout_local.name
        os.system("/bin/mv -f %s %s" % (fpout_local.name, infile))
        os.system("chmod 644 %s" % (infile))
        if not g_params['isQuiet']:
            msg = "seqfile %s cleaned"
            print msg % (infile)
def MatchMSATopo_using_topodb( msafile, topodb, isIgnoreBadseq, #{{{
        method_match, outfile):
    """Map topologies from a topology database onto the sequences of an MSA.

    Each MSA record's topology is fetched from the MyDB database by seqid and
    matched with MatchSeqToTopo.  Records whose match yields "BADSEQ" are
    skipped when isIgnoreBadseq is set.  Returns 0 on success, 1 on failure.
    """
    hdl_topo = myfunc.MyDB(topodb)
    if hdl_topo.failure:
        return 1
    hdl = myfunc.ReadFastaByBlock(msafile)
    if hdl.failure:
        return 1
    fpout = myfunc.myopen(outfile, sys.stdout, "w", False)
    batch = hdl.readseq()
    while batch != None:
        for record in batch:
            topo = ""
            topowithanno = hdl_topo.GetRecord(record.seqid)
            if topowithanno != None:
                (topoid, topoanno, topo) = myfunc.ExtractFromSeqWithAnno(topowithanno)
            else:
                # no topology in the DB: match against the empty topology
                print("topo not found for ID %s" % (record.seqid), file=sys.stderr)
            matchedtopo = MatchSeqToTopo(record.seq, topo, method_match)
            if matchedtopo == "BADSEQ" and isIgnoreBadseq:
                continue
            print(">%s" % (record.description), file=fpout)
            print("%s" % (matchedtopo), file=fpout)
        batch = hdl.readseq()
    myfunc.myclose(fpout)
    hdl.close()
    hdl_topo.close()
    return 0
def RemoveDupSeq(infile, g_outpath, method, isUseMD5): #{{{
    """Write a copy of a FASTA file with duplicated records removed.

    method: "id" deduplicates by sequence ID, "seq" by sequence content
            (optionally MD5-hashed when isUseMD5 is set, to save memory).
    The output file is <outpath>/<rootname>, where outpath defaults to the
    directory of infile when g_outpath is empty.

    Returns 0 on success, 1 if the output cannot be opened, -1 on read failure.
    """
    import hashlib  # local import; replaces the deprecated md5 module
    if g_outpath == "":
        outpath = myfunc.my_dirname(infile)
    else:
        outpath = g_outpath
    rootname = os.path.basename(os.path.splitext(infile)[0])
    outfile = "%s%s%s" % (outpath, os.sep, rootname)
    fpout = myfunc.myopen(outfile, None, "w", False)
    if fpout == None:
        return 1
    hdl = myfunc.ReadFastaByBlock(infile)
    if hdl.failure:
        myfunc.myclose(fpout)  # fix: do not leak the output handle on failure
        return -1
    myset = set([])
    recordList = hdl.readseq()
    while recordList != None:
        for rd in recordList:
            if method == "id":
                key = rd.seqid
            elif method == "seq":
                if isUseMD5:
                    # hashlib.md5 produces the same digest as md5.new
                    key = hashlib.md5(rd.seq).digest()
                else:
                    key = rd.seq
            # write only the first occurrence of each key
            if not key in myset:
                myset.add(key)
                fpout.write(">%s\n%s\n" % (rd.description, rd.seq))
        recordList = hdl.readseq()
    hdl.close()
    myfunc.myclose(fpout)
    return 0
def RunJob(infile, outpath, tmpdir, email, jobid, g_params):#{{{
    # Run a whole-file TOPCONS prediction job: build an id/description map for
    # the query sequences, run the external workflow script once on the whole
    # input, copy the results from tmpdir to outpath, write start/finish tag
    # files, and zip the result folder.
    # Errors are accumulated in g_params['runjob_err'] / ['runjob_log'].
    # NOTE(review): runscript, blastdir, blastdb and WriteTextResultFile are
    # module-level names defined elsewhere in the file.
    rootname = os.path.basename(os.path.splitext(infile)[0])
    starttagfile = "%s/runjob.start"%(outpath)
    runjob_errfile = "%s/runjob.err"%(outpath)
    runjob_logfile = "%s/runjob.log"%(outpath)
    finishtagfile = "%s/runjob.finish"%(outpath)
    rmsg = ""
    resultpathname = jobid
    outpath_result = "%s/%s"%(outpath, resultpathname)
    tarball = "%s.tar.gz"%(resultpathname)
    zipfile = "%s.zip"%(resultpathname)
    tarball_fullpath = "%s.tar.gz"%(outpath_result)
    zipfile_fullpath = "%s.zip"%(outpath_result)
    outfile = "%s/%s/Topcons/topcons.top"%(outpath_result, "seq_%d"%(0))
    resultfile_text = "%s/%s"%(outpath_result, "query.result.txt")
    tmp_outpath_result = "%s/%s"%(tmpdir, resultpathname)
    isOK = True
    try:
        os.makedirs(tmp_outpath_result)
        isOK = True
    except OSError:
        msg = "Failed to create folder %s"%(tmp_outpath_result)
        myfunc.WriteFile(msg+"\n", runjob_errfile, "a")
        isOK = False
    print "isOK =", isOK
    if isOK:
        # build the seqid -> index map for the query sequences
        tmp_mapfile = "%s/seqid_index_map.txt"%(tmp_outpath_result)
        maplist = []
        maplist_simple = []
        hdl = myfunc.ReadFastaByBlock(infile, method_seqid=0, method_seq=0)
        if hdl.failure:
            isOK = False
        else:
            recordList = hdl.readseq()
            cnt = 0
            while recordList != None:
                for rd in recordList:
                    # full map line includes the sequence, the simple one does not
                    maplist.append("%s\t%d\t%s\t%s"%("seq_%d"%cnt, len(rd.seq), rd.description, rd.seq))
                    maplist_simple.append("%s\t%d\t%s"%("seq_%d"%cnt, len(rd.seq), rd.description))
                    cnt += 1
                recordList = hdl.readseq()
            hdl.close()
            myfunc.WriteFile("\n".join(maplist_simple), tmp_mapfile)
    if isOK:
        # g_params['runjob_log'].append("tmpdir = %s"%(tmpdir))
        #cmd = [script_getseqlen, infile, "-o", tmp_outfile , "-printid"]
        # mark the job as started
        datetime = time.strftime("%Y-%m-%d %H:%M:%S")
        rt_msg = myfunc.WriteFile(datetime, starttagfile)
        if rt_msg:
            g_params['runjob_err'].append(rt_msg)
        # run the external prediction workflow on the whole input file
        cmd = [runscript, infile, tmp_outpath_result, blastdir, blastdb ]
        g_params['runjob_log'].append(" ".join(cmd))
        begin_time = time.time()
        try:
            rmsg = subprocess.check_output(cmd)
        except subprocess.CalledProcessError, e:
            g_params['runjob_err'].append(str(e)+"\n")
            g_params['runjob_err'].append(rmsg + "\n")
            # collect the queue-system output file, if any, for diagnostics
            suqoutfilelist = glob.glob("%s/*.sh.*.out"%(tmpdir))
            if len(suqoutfilelist)>0:
                suqoutfile = suqoutfilelist[0]
                g_params['runjob_err'].append(myfunc.ReadFile(suqoutfile))
        end_time = time.time()
        runtime_in_sec = end_time - begin_time
        # copy results from the scratch area to the final output location
        if os.path.exists(tmp_outpath_result):
            cmd = ["cp","-rf", tmp_outpath_result, outpath]
            try:
                subprocess.check_output(cmd)
            except subprocess.CalledProcessError, e:
                g_params['runjob_err'].append(str(e))
        if len(g_params['runjob_log']) > 0 :
            rt_msg = myfunc.WriteFile("\n".join(g_params['runjob_log']), runjob_logfile, "a")
            if rt_msg:
                g_params['runjob_err'].append(rt_msg)
        datetime = time.strftime("%Y-%m-%d %H:%M:%S")
        # the job is considered finished only if the expected topology file exists
        if os.path.exists(outfile):
            rt_msg = myfunc.WriteFile(datetime, finishtagfile)
            if rt_msg:
                g_params['runjob_err'].append(rt_msg)
            # now write the text output to a single file
            WriteTextResultFile(resultfile_text, maplist, runtime_in_sec)
            # now making zip instead (for windows users)
            pwd = os.getcwd()
            os.chdir(outpath)
            # cmd = ["tar", "-czf", tarball, resultpathname]
            cmd = ["zip", "-rq", zipfile, resultpathname]
            try:
                subprocess.check_output(cmd)
            except subprocess.CalledProcessError, e:
                g_params['runjob_err'].append(str(e))
def RunJob(infile, outpath, tmpdir, email, jobid, g_params): #{{{
    # Run a per-sequence TOPCONS prediction job with MD5 caching.
    # For every query sequence: reuse a cached result (symlinked by the MD5 of
    # the sequence) when available, otherwise queue it for a fresh run.  The
    # remaining sequences are pre-screened with SCAMPI to estimate their number
    # of TM helices and then run one at a time, longest-predicted first.
    # Errors/logs are accumulated in g_params['runjob_err'] / ['runjob_log'].
    # NOTE(review): runscript, script_scampi, path_md5cache, blastdir and
    # blastdb are module-level names defined elsewhere in the file.
    all_begin_time = time.time()
    rootname = os.path.basename(os.path.splitext(infile)[0])
    starttagfile = "%s/runjob.start" % (outpath)
    runjob_errfile = "%s/runjob.err" % (outpath)
    runjob_logfile = "%s/runjob.log" % (outpath)
    finishtagfile = "%s/runjob.finish" % (outpath)
    rmsg = ""
    resultpathname = jobid
    outpath_result = "%s/%s" % (outpath, resultpathname)
    tarball = "%s.tar.gz" % (resultpathname)
    zipfile = "%s.zip" % (resultpathname)
    tarball_fullpath = "%s.tar.gz" % (outpath_result)
    zipfile_fullpath = "%s.zip" % (outpath_result)
    outfile = "%s/%s/Topcons/topcons.top" % (outpath_result, "seq_%d" % (0))
    resultfile_text = "%s/%s" % (outpath_result, "query.result.txt")
    mapfile = "%s/seqid_index_map.txt" % (outpath_result)
    finished_seq_file = "%s/finished_seqs.txt" % (outpath_result)
    tmp_outpath_result = "%s/%s" % (tmpdir, resultpathname)
    isOK = True
    # create the scratch and final result folders
    try:
        os.makedirs(tmp_outpath_result)
        isOK = True
    except OSError:
        msg = "Failed to create folder %s" % (tmp_outpath_result)
        myfunc.WriteFile(msg + "\n", runjob_errfile, "a")
        isOK = False
        pass
    try:
        os.makedirs(outpath_result)
        isOK = True
    except OSError:
        msg = "Failed to create folder %s" % (outpath_result)
        myfunc.WriteFile(msg + "\n", runjob_errfile, "a")
        isOK = False
        pass
    if isOK:
        # truncate/create the progress file listing finished sequences
        try:
            open(finished_seq_file, 'w').close()
        except:
            pass
        #first getting result from caches
        # ==================================
        maplist = []
        maplist_simple = []
        toRunDict = {}  # origIndex -> [seq, numTM, description] for uncached seqs
        hdl = myfunc.ReadFastaByBlock(infile, method_seqid=0, method_seq=0)
        if hdl.failure:
            isOK = False
        else:
            # mark the job as started
            datetime = time.strftime("%Y-%m-%d %H:%M:%S")
            rt_msg = myfunc.WriteFile(datetime, starttagfile)
            recordList = hdl.readseq()
            cnt = 0
            origpath = os.getcwd()
            while recordList != None:
                for rd in recordList:
                    isSkip = False
                    # temp outpath for the sequence is always seq_0, and I feed
                    # only one seq a time to the workflow
                    tmp_outpath_this_seq = "%s/%s" % (tmp_outpath_result, "seq_%d" % 0)
                    outpath_this_seq = "%s/%s" % (outpath_result, "seq_%d" % cnt)
                    subfoldername_this_seq = "seq_%d" % (cnt)
                    if os.path.exists(tmp_outpath_this_seq):
                        try:
                            shutil.rmtree(tmp_outpath_this_seq)
                        except OSError:
                            pass
                    maplist.append("%s\t%d\t%s\t%s" % ("seq_%d" % cnt, len(rd.seq), rd.description, rd.seq))
                    maplist_simple.append("%s\t%d\t%s" % ("seq_%d" % cnt, len(rd.seq), rd.description))
                    if not g_params['isForceRun']:
                        # look up this sequence in the MD5-keyed result cache
                        md5_key = hashlib.md5(rd.seq).hexdigest()
                        subfoldername = md5_key[:2]
                        md5_link = "%s/%s/%s" % (path_md5cache, subfoldername, md5_key)
                        if os.path.exists(md5_link):
                            # create a symlink to the cache
                            rela_path = os.path.relpath(md5_link, outpath_result) #relative path
                            os.chdir(outpath_result)
                            os.symlink(rela_path, subfoldername_this_seq)
                            if os.path.exists(outpath_this_seq):
                                # cached result usable: record it as finished
                                runtime = 0.0 #in seconds
                                topfile = "%s/%s/topcons.top" % (outpath_this_seq, "Topcons")
                                top = myfunc.ReadFile(topfile).strip()
                                numTM = myfunc.CountTM(top)
                                posSP = myfunc.GetSPPosition(top)
                                if len(posSP) > 0:
                                    isHasSP = True
                                else:
                                    isHasSP = False
                                info_finish = [ "seq_%d" % cnt, str(len(rd.seq)), str(numTM), str(isHasSP), "cached", str(runtime), rd.description ]
                                myfunc.WriteFile("\t".join(info_finish) + "\n", finished_seq_file, "a", isFlush=True)
                                isSkip = True
                    if not isSkip:
                        # first try to delete the outfolder if exists
                        if os.path.exists(outpath_this_seq):
                            try:
                                shutil.rmtree(outpath_this_seq)
                            except OSError:
                                pass
                        origIndex = cnt
                        numTM = 0
                        toRunDict[origIndex] = [rd.seq, numTM, rd.description ] #init value for numTM is 0
                    cnt += 1
                recordList = hdl.readseq()
            hdl.close()
        myfunc.WriteFile("\n".join(maplist_simple) + "\n", mapfile)
        # run scampi single to estimate the number of TM helices and then run
        # the query sequences in the descending order of numTM
        torun_all_seqfile = "%s/%s" % (tmp_outpath_result, "query.torun.fa")
        dumplist = []
        for key in toRunDict:
            top = toRunDict[key][0]
            dumplist.append(">%s\n%s" % (str(key), top))
        myfunc.WriteFile("\n".join(dumplist) + "\n", torun_all_seqfile, "w")
        del dumplist
        topfile_scampiseq = "%s/%s" % (tmp_outpath_result, "query.torun.fa.topo")
        if os.path.exists(torun_all_seqfile):
            # run scampi to estimate the number of TM helices
            cmd = [ script_scampi, torun_all_seqfile, "-outpath", tmp_outpath_result ]
            try:
                rmsg = subprocess.check_output(cmd)
            except subprocess.CalledProcessError, e:
                g_params['runjob_err'].append(str(e) + "\n")
                pass
        if os.path.exists(topfile_scampiseq):
            # feed the estimated TM counts back into toRunDict
            (idlist_scampi, annolist_scampi, toplist_scampi) = myfunc.ReadFasta(topfile_scampiseq)
            for jj in xrange(len(idlist_scampi)):
                numTM = myfunc.CountTM(toplist_scampi[jj])
                try:
                    toRunDict[int(idlist_scampi[jj])][1] = numTM
                except (KeyError, ValueError, TypeError):
                    pass
        sortedlist = sorted(toRunDict.items(), key=lambda x: x[1][1], reverse=True)
        #format of sortedlist [(origIndex: [seq, numTM, description]), ...]
        # submit sequences one by one to the workflow according to orders in
        # sortedlist
        for item in sortedlist:
            # g_params['runjob_log'].append("tmpdir = %s"%(tmpdir))
            #cmd = [script_getseqlen, infile, "-o", tmp_outfile , "-printid"]
            origIndex = item[0]
            seq = item[1][0]
            description = item[1][2]
            outpath_this_seq = "%s/%s" % (outpath_result, "seq_%d" % origIndex)
            tmp_outpath_this_seq = "%s/%s" % (tmp_outpath_result, "seq_%d" % (0))
            if os.path.exists(tmp_outpath_this_seq):
                try:
                    shutil.rmtree(tmp_outpath_this_seq)
                except OSError:
                    pass
            # write a one-sequence FASTA file for this query
            seqfile_this_seq = "%s/%s" % (tmp_outpath_result, "query_%d.fa" % (origIndex))
            seqcontent = ">%d\n%s\n" % (origIndex, seq)
            myfunc.WriteFile(seqcontent, seqfile_this_seq, "w")
            if not os.path.exists(seqfile_this_seq):
                g_params['runjob_err'].append("failed to generate seq index %d" % (origIndex))
                continue
            # run the external prediction workflow for this single sequence
            cmd = [ runscript, seqfile_this_seq, tmp_outpath_result, blastdir, blastdb ]
            g_params['runjob_log'].append(" ".join(cmd))
            begin_time = time.time()
            try:
                rmsg = subprocess.check_output(cmd)
                g_params['runjob_log'].append("workflow:\n" + rmsg + "\n")
            except subprocess.CalledProcessError, e:
                g_params['runjob_err'].append(str(e) + "\n")
                g_params['runjob_err'].append(rmsg + "\n")
                pass
            #suqoutfilelist = glob.glob("%s/*.sh.*.out"%(tmpdir))
            #if len(suqoutfilelist)>0:
            #    suqoutfile = suqoutfilelist[0]
            #g_params['runjob_err'].append(myfunc.ReadFile(suqoutfile))
            end_time = time.time()
            runtime_in_sec = end_time - begin_time
            if os.path.exists(tmp_outpath_this_seq):
                # move the per-sequence result from scratch to its final folder
                cmd = ["mv", "-f", tmp_outpath_this_seq, outpath_this_seq]
                isCmdSuccess = False
                try:
                    subprocess.check_output(cmd)
                    isCmdSuccess = True
                except subprocess.CalledProcessError, e:
                    msg = "Failed to run prediction for sequence No. %d\n" % ( origIndex)
                    g_params['runjob_err'].append(msg)
                    g_params['runjob_err'].append(str(e) + "\n")
                    pass
                timefile = "%s/time.txt" % (tmp_outpath_result)
                targetfile = "%s/time.txt" % (outpath_this_seq)
                if os.path.exists(timefile) and os.path.exists( outpath_this_seq):
                    try:
                        shutil.move(timefile, targetfile)
                    except:
                        g_params['runjob_err'].append("Failed to move %s/time.txt" % (tmp_outpath_result) + "\n")
                        pass
                if isCmdSuccess:
                    # summarize the prediction and append to the progress file
                    runtime = runtime_in_sec #in seconds
                    topfile = "%s/%s/topcons.top" % (outpath_this_seq, "Topcons")
                    top = myfunc.ReadFile(topfile).strip()
                    numTM = myfunc.CountTM(top)
                    posSP = myfunc.GetSPPosition(top)
                    if len(posSP) > 0:
                        isHasSP = True
                    else:
                        isHasSP = False
                    info_finish = [ "seq_%d" % origIndex, str(len(seq)), str(numTM), str(isHasSP), "newrun", str(runtime), description ]
                    myfunc.WriteFile("\t".join(info_finish) + "\n", finished_seq_file, "a", isFlush=True)
                    # now write the text output for this seq
                    info_this_seq = "%s\t%d\t%s\t%s" % ( "seq_%d" % origIndex, len(seq), description, seq)
                    resultfile_text_this_seq = "%s/%s" % (outpath_this_seq, "query.result.txt")
                    myfunc.WriteTOPCONSTextResultFile(resultfile_text_this_seq, outpath_result, [info_this_seq], runtime_in_sec, g_params['base_www_url'])
                    # create or update the md5 cache
                    # create cache only on the front-end
                    if g_params['base_www_url'].find("topcons.net") != -1:
                        md5_key = hashlib.md5(seq).hexdigest()
                        subfoldername = md5_key[:2]
                        md5_subfolder = "%s/%s" % (path_md5cache, subfoldername)
                        md5_link = "%s/%s/%s" % (path_md5cache, subfoldername, md5_key)
                        if os.path.exists(md5_link):
                            # drop the stale cache link before re-creating it
                            try:
                                os.unlink(md5_link)
                            except:
                                pass
                        subfolder_md5 = "%s/%s" % (path_md5cache, subfoldername)
                        if not os.path.exists(subfolder_md5):
                            try:
                                os.makedirs(subfolder_md5)
                            except:
                                pass
                        rela_path = os.path.relpath( outpath_this_seq, md5_subfolder) #relative path
                        try:
                            os.chdir(md5_subfolder)
                            os.symlink(rela_path, md5_key)
                        except:
                            pass
def main(): #{{{ if 0: #{{{ strTop1 = "---MMMM-----i-i-i---MMM----MMMM-ooo" strTop2 = "----MMMM-----i-ii-----MMM---MMM--oo" strProtein1 = "id1" strProtein2 = "id2" fpLog = sys.stdout class_gapless, num1_gapless, num2_gapless = ct.CompareToposGaplesslyNew( strTop1, strTop2, strProtein1, strProtein2, fpLog) # Note: calling the int, float, string will not change their original value # calling the dict, list will change their original value print "strTop1:", strTop1 print "strTop2:", strTop2 #}}} if 0: #{{{ PrintFuncName() print("this file name is: %s" % __file__) #}}} if 0: #{{{ # filename="/nanjiang/data/blastdb/uniprot_KW181_idt50.fasta" filename = sys.argv[1] print filename fp = open(filename, "r") lines = fp.readlines() fp.close() #}}} if 0: #{{{ # filename="/nanjiang/data/blastdb/uniprot_KW181_idt50.fasta" filename = sys.argv[1] print filename BLOCK_SIZE = 100000 fp = open(filename, "r") buff = fp.read(BLOCK_SIZE) while buff: buff = fp.read(BLOCK_SIZE) fp.close() #}}} if 0: #{{{ # filename="/nanjiang/data/blastdb/uniprot_KW181_idt50.fasta" filename = sys.argv[1] print filename fp = open(filename, "r") line = fp.readline() while line: line = fp.readline() fp.close() #}}} if 0: #{{{ try: BLOCK_SIZE = 100000 infile = sys.argv[1] fpin = open(infile, 'rb') unprocessedBuffer = "" isEOFreached = False while 1: buff = fpin.read(BLOCK_SIZE) if len(buff) < BLOCK_SIZE: isEOFreached = True buff = unprocessedBuffer + buff recordList = [] unprocessedBuffer = myfunc.ReadFastaFromBuffer( buff, recordList, isEOFreached) if len(recordList) > 0: for record in recordList: sys.stdout.write(">%s\n" % record[1]) sys.stdout.write("%s\n" % record[2]) if isEOFreached == True: break fpin.close() except IOError: raise #}}} if 0: #{{{ try: infile = sys.argv[1] (annoList, seqList) = myfunc.ReadFasta_without_id(infile) for i in xrange(len(seqList)): sys.stdout.write(">%s\n" % annoList[i]) sys.stdout.write("%s\n" % seqList[i]) except IOError: raise #}}} if 0: #{{{ hhrfile = 
"hhsearch/A1RZ92-Q74DY9.hhr" if IsDuplicatedByHHSearch(hhrfile): print "yes" #}}} if 0: #{{{ import pairlistwithfamid2pairaln_by_msa seq1 = "--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------MLSSTATTMLRAGVSRSSGALQPMLLRSAACPCSPFSMNTKLSQPTSV-----RPLSTSPSALVLRFRAQQQAQLAQQQLRRASSSSSSSSSSTRPRSDAELDANAAEAAAAAQSAAHAGEPVLDWNTFFKLRKTRRRVQLAFSVIMTLITSGAGGAVLSTGVADAMVAQVPLEPMFAVGLMTASFGALGWLMGPAMGGMVFNALKSKYRGQMEIKEGQFFARIKKHRVDPSASSMGNPVPDFYGEKISSVAGYRQWLKDQRAFNKKRTTFV" seq2 = "MDILLAVLEQGFIFSIVCFGVYITYKILDFPDLSVDGTFPLGAAVAAAFLVKGYSPVLSSLAALVAGAIAGGITGILHVKFKITNLLSGILVMVGLYSINLRIMGKSNIPLFNKIHLFSDTMNPIIIITVFLLICKITLDLFLKTKAGFILKATGDNEQLVLSLGVNKDLVKIMGLMLSNALVALGGALMAQYQGFSDVGMGTGIVVMGLASVIIGESLFGRIKALNATTRVLLGALVYKLSVSI---ALTVGLAP-------TDLKLVTAIIVVIALSLNKNPLKIITKQKTKEGGIL------NASNTKSAQSVQ-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------" seq1 = "---------------------------------------------------------------------------------------------------------------------------------------MALSSLFFTASALLLMFLAFLGGARNSNPLDRIYWLEAATGNIPGAPALSRWTYWNLCAVNSEGHNECGKSYPDYPFDPPSHRNFNTHVNIPAAFIGTRHYFLTSRFMFPFHIIALFFATCSLLTGFLAMCTRIGNWVSAFSAYFALTFQTITTCLMTAVYVQGRDKFNNNGQSSHLGVKAFAFMWTSVALLFLSCVIYCMGGAVGRKDGGYSGREQRRRGFFNSHRSGSLRSNKETAP" seq2 = "MRKIAAIGGIVFISFILTIVAMFTKLWISWSIGKFSYGIGIVPYHSNSAGWFTAASWMVFISFGLFIPLILVVLFTAYKVHHDGCCHSIRHCFNSICLICSIIAVLEIIAFVLMAVNASRYVKGASISEKKSLLQLGSSAYLDLVSAILIIVATVLSGHASHHDCH----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------" alignFactor = pairlistwithfamid2pairaln_by_msa.GetAlignmentFactorFromPairAlignment( 
seq1, seq2) print alignFactor #}}} if 0: #{{{ try: dbname = sys.argv[1] print dbname from myfunc import MyDB cls = MyDB(dbname) # print cls.idList record = cls.GetRecord("A0FGX9") if record: print record # for rd in cls.GetAllRecord(): # print rd (seqid, anno, seq) = myfunc.ExtractFromSeqWithAnno(record) print(seqid, anno, seq) except IndexError: pass #}}} if 0: #{{{ import my_extractdb #miniking my_extractdb.py see which one is faster try: dbname = sys.argv[1] idlistfile = sys.argv[2] cls = myfunc.MyDB(dbname) if cls.failure: print >> sys.stderr, "MyDB init failed" else: idlist = open(idlistfile, "r").read().split("\n") fpout = sys.stdout for seqid in idlist: if seqid: record = cls.GetRecord(seqid) fpout.write(record) # for rd in cls.GetAllRecord(): # print rd # (seqid, anno, seq) = myfunc.ExtractFromSeqWithAnno(record) # print (seqid, anno, seq) except IndexError: print "error" pass #}}} if 0: #{{{ #test ReadLineByBlock try: infile = sys.argv[1] from myfunc import ReadLineByBlock cls = ReadLineByBlock(infile) lines = cls.readlines() while lines != None: for line in lines: print line lines = cls.readlines() except IndexError: pass #}}} if 0: #{{{ #test speed of ReadLineByBlock # ReadLineByBlock is about 3 times fater than file.readline() try: from myfunc import ReadLineByBlock infile = sys.argv[1] start = time.time() hdl = ReadLineByBlock(infile) lines = hdl.readlines() while lines != None: lines = hdl.readlines() hdl.close() end = time.time() msg = "Reading %s by ReadLineByBlock costs %.3fs seconds" print msg % (infile, (end - start)) start = time.time() hdl = open(infile, "r") line = hdl.readline() while line: line = hdl.readline() hdl.close() end = time.time() msg = "Reading %s by readline() costs %.3fs seconds" print msg % (infile, (end - start)) except IndexError: pass #}}} if 0: #{{{ #test readline try: infile = sys.argv[1] fp = open(infile, "r") line = fp.readline() while line: print line line = fp.readline() fp.close() except IndexError: pass #}}} if 0: 
#{{{ #test the speed of GetFirstWord try: nloop = int(sys.argv[1]) string = "kjdafk jasdfj j" #string = "askdf askdf " # string = "kajsdfasdfsdfjakasjdfka" # string = "kajsdfasdf,sdfjakasjdfka" delimiter = " \t\r,.\n" delimiter = " " for i in xrange(nloop): #firstword = myfunc.GetFirstWord(string, delimiter) #firstword = string.split()[0] #firstword = string.partition(" ")[0] firstword = myfunc.GetFirstWord(string) #pass #print firstword except (IndexError, ValueError): pass #}}} if 0: #{{{ # read seq by SeqIO from Bio import SeqIO try: seqfile = sys.argv[1] # 1. SeqIO #################### start = time.time() handle = open(seqfile, "rU") cnt = 0 for record in SeqIO.parse(handle, "fasta"): cnt += 1 handle.close() end = time.time() msg = "Reading %d sequences by SeqIO costs %.3fs seconds" print msg % (cnt, (end - start)) # 2. ReadFasta #################### start = time.time() seqfile = sys.argv[1] (idList, annoList, seqList) = myfunc.ReadFasta(seqfile) end = time.time() msg = "Reading %d sequences by ReadFasta costs %.3fs seconds" print msg % (len(idList), (end - start)) # 3. ReadFasta from buffer BLOCK_SIZE = 100000 start = time.time() cnt = 0 fpin = open(seqfile, 'rb') unprocessedBuffer = "" isEOFreached = False while 1: buff = fpin.read(BLOCK_SIZE) if len(buff) < BLOCK_SIZE: isEOFreached = True buff = unprocessedBuffer + buff recordList = [] unprocessedBuffer = myfunc.ReadFastaFromBuffer( buff, recordList, isEOFreached) cnt += len(recordList) if isEOFreached == True: break fpin.close() end = time.time() msg = "Reading %d sequences by ReadFastaFromBuffer costs %.3fs seconds" print msg % (cnt, (end - start)) # 4. 
ReadFastaByBlock #################### start = time.time() seqfile = sys.argv[1] hdl = myfunc.ReadFastaByBlock(seqfile, 0, 0) if hdl.failure: print >> sys.stderr, "Failed to init ReadFastaByBlock" return 1 recordList = hdl.readseq() cnt = 0 while recordList != None: cnt += len(recordList) # for rd in recordList: # print ">%s"%rd.description # print rd.seq recordList = hdl.readseq() hdl.close() end = time.time() msg = "Reading %d sequences by ReadFastaByBlock costs %.3fs seconds" print msg % (cnt, (end - start)) except (IndexError, ValueError): pass #}}} if 0: #{{{ #test RemoveUnnecessaryGap try: infile = sys.argv[1] start = time.time() (idList, seqList) = myfunc.ReadFasta_without_annotation(infile) seqList = lcmp.RemoveUnnecessaryGap_old(seqList) end = time.time() msg = "Run RemoveUnnecessaryGap_old for %s costs %.3fs seconds" print >> sys.stderr, msg % (infile, (end - start)) for seq in seqList: print seq start = time.time() (idList, seqList) = myfunc.ReadFasta_without_annotation(infile) seqList = lcmp.RemoveUnnecessaryGap(seqList) end = time.time() msg = "Run RemoveUnnecessaryGap for %s costs %.3fs seconds" print >> sys.stderr, msg % (infile, (end - start)) for seq in seqList: print seq except IndexError: pass #}}} if 0: #{{{ #test ReadMPAByBlock try: infile = sys.argv[1] hdl = myfunc.ReadMPAByBlock(infile) if hdl.failure: return recordList = hdl.readseq() while recordList != None: for rd in recordList: #print rd.seqid print ">%s" % (rd.description) print "%s" % (myfunc.mpa2seq(rd.mpa)) recordList = hdl.readseq() hdl.close() except IndexError: pass #}}} if 0: #{{{ try: dbname = sys.argv[1] print dbname from myfunc import MyDB cls = MyDB(dbname) # print cls.idList record = cls.GetRecord("A0FGX9") if record: print record # for rd in cls.GetAllRecord(): # print rd (seqid, anno, seq) = myfunc.ExtractFromSeqWithAnno(record) print(seqid, anno, seq) except IndexError: pass #}}} if 0: #{{{ #test subprocess import glob #invoke shell explicitly, not very good, may have 
security problems subprocess.call("seq 10", shell=True) subprocess.call("echo wait for 2 seconds...; sleep 2", shell=True) subprocess.call("ls topo*.py", shell=True) if 1: #{{{ #test subprocess import glob #invoke shell implicitly, recommended way subprocess.call(["seq", "10"], shell=False) subprocess.call(["echo", "wait for 1 seconds..."]) subprocess.call(["sleep", "1"]) try: print subprocess.check_call(["ls", "topo*.py"]) #This will not work except subprocess.CalledProcessError, e: print "error message:", e subprocess.call(["ls"] + glob.glob("topo*.py"))