def DrawSeqMSA(seqmsafile, outpath): print "Remove gaps from sequence" (idList, annotationList, seqList) = myfunc.ReadFasta(seqmsafile) rootname = os.path.basename(os.path.splitext(seqmsafile)[0]) basename = os.path.basename(seqmsafile) seqfile = outpath + os.sep + rootname + '.fa' fpout = open(seqfile, "w") for i in xrange(len(idList)): fpout.write(">%s\n" % annotationList[i]) fpout.write("%s\n" % seqList[i].replace("-", "").replace(".", "")) fpout.close() print "Predicting topologies..." scampi_exe = "%s/mySCAMPI_run.pl" % g_params['newscampiscriptpath'] scampi_dir = g_params['scampi_dir'] modhmm_bin = g_params['modhmm_bin'] cmd = "%s %s --scampipath %s --modhmmpath %s --outpath %s" % ( scampi_exe, seqfile, scampi_dir, modhmm_bin, outpath) os.system(cmd) os.system("rm -f %s/*.res" % outpath) print "Get topomsa" binpath = g_params['binpath'] topofile = outpath + os.sep + rootname + '.fa.topo' topomsafile = outpath + os.sep + rootname + '.topomsa.fa' cmd = "%s/matchMSAtopo -msa %s -topo %s -o %s" % (binpath, seqmsafile, topofile, topomsafile) os.system(cmd) print "Draw topomsa" cmd = "python %s/drawMSATopo.py %s -text y -outpath %s -aaseq %s" % ( binpath, topomsafile, outpath, seqfile) os.system(cmd)
def main(g_params):  #{{{
    """Print the TM-helix positions of every topology in a FASTA file.

    Command line (read from sys.argv):
      topofile | -i FILE   topology file in FASTA format
      -o FILE              output file (handled by myfunc.myopen with
                           sys.stdout as the default)
      -gapless             strip '-' and '.' before locating TM regions
      -q                   set g_params['isQuiet']
      -h, --help           print help

    Returns 1 on a usage or I/O error and 0 on success.
    """
    argv = sys.argv
    numArgv = len(argv)
    if numArgv < 2:
        PrintHelp()
        return 1

    topofile = ""
    outfile = ""
    isGapLess = False

    i = 1
    isNonOptionArg = False
    while i < numArgv:
        if isNonOptionArg:
            topofile = argv[i]
            isNonOptionArg = False
            i += 1
        elif argv[i] == "--":
            isNonOptionArg = True
            i += 1
        elif argv[i].startswith("-"):  # startswith: safe for an empty argument
            if argv[i] in ["-h", "--help"]:
                PrintHelp()
                return 1
            elif argv[i] in ["-o", "--o", "-i", "--i"]:
                # an option given last on the command line has no value
                if i + 1 >= numArgv:
                    print >> sys.stderr, \
                        "Error! Option %s requires an argument" % (argv[i])
                    return 1
                if argv[i] in ["-o", "--o"]:
                    outfile = argv[i + 1]
                else:
                    topofile = argv[i + 1]
                i += 2
            elif argv[i] in ["-gapless", "--gapless"]:
                isGapLess = True
                i += 1
            elif argv[i] in ["-q"]:
                g_params['isQuiet'] = True
                i += 1
            else:
                print >> sys.stderr, "Error! Wrong argument:", argv[i]
                return 1
        else:
            topofile = argv[i]
            i += 1

    if topofile == "":
        print >> sys.stderr, "topofile not set. exit"
        return 1

    try:
        (idList, annoList, seqList) = myfunc.ReadFasta(topofile)
        fpout = myfunc.myopen(outfile, sys.stdout, "w", False)
        try:
            for i in xrange(len(idList)):
                topo = seqList[i]
                seqid = idList[i]
                if isGapLess:
                    topo = topo.replace("-", "").replace(".", "")
                posTMList = myfunc.GetTMPosition(topo)
                print >> fpout, seqid, posTMList
        finally:
            myfunc.myclose(fpout)
    except (IOError, IndexError) as e:
        # report the failure instead of silently exiting with success
        print >> sys.stderr, "Failed to process topofile %s: %s" % (
            topofile, e)
        return 1
    return 0
def WriteSeqAlnHTML(seqAlnFileList, extTopoMSA, outfile):  # {{{
    """Write one HTML page showing sequence alignments with TM regions.

    For every alignment file the companion topology MSA is expected at
    "<alignment path without extension>.<extTopoMSA>".  A pair where
    either file is missing is reported on stderr and skipped.

    seqAlnFileList -- paths of sequence alignment files (FASTA)
    extTopoMSA     -- extension (without the dot) of the topology MSA files
    outfile        -- path of the HTML file to create

    Returns 1 if outfile cannot be opened, otherwise 0.
    """
    try:
        fpout = open(outfile, "w")
    except IOError:
        print("Failed to write to %s" % (outfile), file=sys.stderr)
        return 1

    title = 'Alignment highlighted by <font color=%s>TM regions</font>' % ('red')
    WriteHTMLHeader(title, fpout)

    print("Processed alignments:")
    for alnfile in seqAlnFileList:
        stem = os.path.splitext(alnfile)[0]
        rootname_alnfile = os.path.basename(stem)
        topomsafile = '.'.join([stem, extTopoMSA])

        hasAln = os.path.exists(alnfile)
        hasTopoMSA = os.path.exists(topomsafile)
        if not hasAln:
            sys.stderr.write('alnfile %s does not exist\n' % (alnfile))
        if not hasTopoMSA:
            sys.stderr.write('topomsafile %s does not exist\n' % (topomsafile))
        if not (hasAln and hasTopoMSA):
            continue

        (seqIDList, seqAnnoList, seqList) = myfunc.ReadFasta(alnfile)
        (topoIDList, topoAnnoList, topoList) = myfunc.ReadFasta(topomsafile)

        if g_params['removeUnnecessaryGap']:
            seqList = lcmp.RemoveUnnecessaryGap(seqList)
            topoList = lcmp.RemoveUnnecessaryGap(topoList)

        # nothing is shrunk here, so every column maps onto itself
        final2seq_idxMapList = [
            {pos: pos for pos in range(len(seqList[k]))}
            for k in range(len(seqIDList))
        ]

        print('\t' + rootname_alnfile)
        WriteHTMLAlignment2(rootname_alnfile, topoIDList, topoAnnoList,
                            topoList, topoList, seqList,
                            final2seq_idxMapList, fpout)

    WriteHTMLTail(fpout)
    fpout.close()
    return 0
def MatchTopoPairAln(queryTopoFile,alignFile, targetsTopologyFile, fpout):#{{{
    """Thread topologies onto pairwise sequence alignments and print them.

    queryTopoFile       -- FASTA file with the single query topology
    alignFile           -- alignment file parsed by ReadNeedleAlignment;
                           record i must belong to target i
    targetsTopologyFile -- FASTA file with the target topologies
    fpout               -- open stream receiving the topology alignments

    For every alignment column a '-' in the sequence becomes a '-' in the
    topology; any other character consumes the next topology state.
    A seqID mismatch is only reported on stderr.  Any exception is
    announced on stderr and re-raised.  Returns 0.
    """
    try:
        (queryID, queryAnnotation, queryTopology) = myfunc.ReadSingleFasta(
            queryTopoFile)
        alns = ReadNeedleAlignment(alignFile)
        (targetIDList, targetAnnotationList, targetTopoList) = myfunc.ReadFasta(
            targetsTopologyFile)

        print >> fpout, "#Number of alignments: %d" % len(targetIDList)
        for i in range(len(targetIDList)):
            seqID = targetIDList[i]
            alnseq1 = alns[i]['alnseq1']
            alnseq2 = alns[i]['alnseq2']
            if seqID != alns[i]['seqid2']:
                print >> sys.stderr, "seqID does not match, record %d" %i

            states1 = []   # topology columns of the query
            states2 = []   # topology columns of the target
            cnt1 = 0       # next unused state of the query topology
            cnt2 = 0       # next unused state of the target topology
            for j in range(len(alnseq1)):
                isResidue1 = alnseq1[j] != '-'
                isResidue2 = alnseq2[j] != '-'
                if isResidue1:
                    states1.append(queryTopology[cnt1])
                    cnt1 += 1
                else:
                    states1.append('-')
                if isResidue2:
                    states2.append(targetTopoList[i][cnt2])
                    cnt2 += 1
                else:
                    states2.append('-')
            topoaln1 = "".join(states1)
            topoaln2 = "".join(states2)

            print >> fpout, "#Topology alignment %d" %( i+1)
            print >> fpout, ">%s" % queryAnnotation
            print >> fpout, "%s" % topoaln1
            print >> fpout, ">%s" % targetAnnotationList[i]
            print >> fpout, "%s" % topoaln2
            print >> fpout
    except:
        print >>sys.stderr, "except for the function:%s"%sys._getframe().f_code.co_name
        raise
    return 0
def GetPairTopoAln(pairalnTopoFile):#{{{
    """Read a pairwise topology alignment file into a dict of pairs.

    Records are taken two at a time (a trailing unpaired record is
    ignored).  Each pair is stored under the key "<id1>-<id2>" as a dict
    with the keys id1, id2, anno1, anno2, seq1 and seq2.
    """
    (idList, annoList, seqList) = myfunc.ReadFasta(pairalnTopoFile)
    pairTopoAlnDict = {}
    for first in xrange(0, len(idList) - 1, 2):
        second = first + 1
        key = "%s-%s"%(idList[first], idList[second])
        pairTopoAlnDict[key] = {
            'id1': idList[first],
            'id2': idList[second],
            'anno1': annoList[first],
            'anno2': annoList[second],
            'seq1': seqList[first],
            'seq2': seqList[second],
        }
    return pairTopoAlnDict
def RandFasta(inFile, N, rand_seed, fpout):  #{{{
    """Write N randomly chosen records of a FASTA file to fpout.

    inFile    -- FASTA file, read with myfunc.ReadFasta(inFile, BLOCK_SIZE)
    N         -- number of records wanted; capped at the number available
    rand_seed -- seed passed to random.seed before sampling
    fpout     -- open stream receiving ">annotation" / sequence lines

    Returns -1 if the file could not be read, otherwise 0.
    """
    (idList, annotationList, seqList) = myfunc.ReadFasta(inFile, BLOCK_SIZE)
    if idList is None:
        print("Failed to read fastafile %s. Exit." % inFile, file=sys.stderr)
        return -1

    random.seed(rand_seed)
    Nseq = len(idList)
    if N > Nseq:
        N = Nseq
    # sample record indices without replacement, then emit in sampled order
    for idx in random.sample(list(range(Nseq)), N):
        fpout.write(">%s\n" % annotationList[idx])
        fpout.write("%s\n" % seqList[idx])
    return 0
def action(method, alnfile, outfile):
    """Remove unnecessary gaps from an alignment and write the result.

    method  -- 0 selects lcmp.RemoveUnnecessaryGap_old, any other value
               lcmp.RemoveUnnecessaryGap
    alnfile -- alignment in FASTA format
    outfile -- output path; "" writes to sys.stdout

    Returns 0 on success, 1 if the output cannot be written.
    """
    (seqidList, seqAnnoList, seqList) = myfunc.ReadFasta(alnfile)
    if method == 0:
        newSeqList = lcmp.RemoveUnnecessaryGap_old(seqList)
    else:
        newSeqList = lcmp.RemoveUnnecessaryGap(seqList)

    try:
        fpout = sys.stdout if outfile == "" else open(outfile, "w")
        try:
            for i in range(len(seqidList)):
                fpout.write(">%s\n" % (seqAnnoList[i]))
                fpout.write("%s\n" % (newSeqList[i]))
        finally:
            # close our own file even when a write failed; never close stdout
            if fpout is not sys.stdout:
                fpout.close()
        return 0
    except IOError:
        click.echo("Failed to write to file %s" % (outfile))
        return 1
def AddPairwiseAlignmentFactor(pairlistDict, msapath, msaext, #{{{ isLocalAlignment): cntfamid = 0 verbose = g_params['verbose'] for famid in pairlistDict: cntfamid += 1 if verbose >= 2: print "Add pairwise alignment factor for %d: %s"%(cntfamid, famid) msafile = msapath + os.sep + famid + msaext if not os.path.exists(msafile): print >> sys.stderr, "msafile %s does not exist. Ignore" % msafile continue (idList, annoList, seqList) = myfunc.ReadFasta(msafile) msaDict = {} for i in xrange(len(idList)): msaDict[idList[i]] = seqList[i] pairlist = pairlistDict[famid] #print "pairlist=", pairlist for i in xrange(len(pairlist)): pair = pairlist[i] #print "pair = ", pair seq1 = "" seq2 = "" id1 = pair[0] id2 = pair[1] if id1 in msaDict and id2 in msaDict: seq1 = msaDict[id1] seq2 = msaDict[id2] [seq1, seq2] = lcmp.RemoveUnnecessaryGap([seq1, seq2]) if len(seq1) != len(seq2): print >> sys.stderr, "Bad alignment for %s and %s" %(id1,id2) else: alignFactor = lcmp.GetAlignmentFactorFromPairAlignment( seq1,seq2, isLocalAlignment) pair.append(alignFactor) else: if id1 not in msaDict: print >> sys.stderr, "%s not in msafile %s"%(id1, msafile) if id2 not in msaDict: print >> sys.stderr, "%s not in msafile %s"%(id2, msafile) return 0
#!/usr/bin/env python import os, sys, myfunc from math import ceil file_pairalnfile="/data3/wk/MPTopo/pfamAna_refpro/cellular_filter_all/pairwise/withinClan/Pfam-A-full.perTM75_nseq20.nr100.filtered.withinclan.max30000.kalignP.pairaln" (idList, annoList, seqList) = myfunc.ReadFasta(file_pairalnfile) numseq = len(idList) outpath = "splitted" os.system("mkdir -p %s"%outpath) nsplit = 10 numPair = numseq / 2 pairPerSplit = int(ceil(float(numPair) / nsplit)) bp = 0 for i in xrange(nsplit): outfile=outpath + os.sep + "split_%d" %i + ".fa" fpout = open(outfile, "w") for p in range(bp, bp + pairPerSplit): if p < numPair: anno1 = annoList[2*p] anno2 = annoList[2*p+1] seq1 = seqList[2*p] seq2 = seqList[2*p+1] fpout.write(">%s\n"%anno1) fpout.write("%s\n"%seq1)
def main(g_params): #{{{
    """Benchmark predicted topologies against real topologies.

    Command line (read from sys.argv):
      -o | -outfile FILE     benchmark table (myfunc.myopen, default stdout)
      -owrong FILE           stream handed to Benchmark as fpout_wrong
      -realtopo FILE         real topologies in FASTA format (required)
      -seqfile FILE          optional amino acid sequences
      -mode tp|tps           if omitted, derived from -path_predtopo
      -path_predtopo PATH    location of the predicted topology files
      -basename STR          base name of the predicted topology files
      -restrictidlist FILE   only evaluate the ids listed in this file
      -q, -rmsp, -debug      set the matching flags in g_params
    Positional arguments are rejected.

    Returns 1 on any usage or input error and 0 on success.
    """
    argv = sys.argv
    numArgv = len(argv)
    if numArgv < 2:
        PrintHelp()
        return 1

    outfile = ""
    real_topofile = ""
    seqfile = ""
    restrictIDListFile = ""
    outfile_wrong_predtopo = ""

    i = 1
    isNonOptionArg = False
    while i < numArgv:
        if isNonOptionArg:
            # this program takes no positional argument, even after "--"
            print >> sys.stderr, "Error! Wrong argument:", argv[i]
            return 1
        elif argv[i] == "--":
            isNonOptionArg = True
            i += 1
        elif argv[i].startswith("-"):  # startswith: safe for an empty argument
            if argv[i] in ["-h", "--help"]:
                PrintHelp()
                return 1
            elif argv[i] in ["-o", "--o", "-outfile"]:
                (outfile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-owrong", "--owrong"]:
                (outfile_wrong_predtopo, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-realtopo", "--realtopo"]:
                (real_topofile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-seqfile", "--seqfile"]:
                (seqfile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-mode", "--mode"]:
                (g_params['mode'], i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-path_predtopo", "--path_predtopo"]:
                (g_params['path_predtopo'], i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-basename", "--basename"]:
                (g_params['basename'], i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-restrictidlist", "--restrictidlist"]:
                (restrictIDListFile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-q", "--q"]:
                g_params['isQuiet'] = True
                i += 1
            elif argv[i] in ["-rmsp", "--rmsp"]:
                g_params['isRMSP'] = True
                i += 1
            elif argv[i] in ["-debug", "--debug"]:
                g_params['isDEBUG'] = True
                i += 1
            else:
                print >> sys.stderr, "Error! Wrong argument:", argv[i]
                return 1
        else:
            print >> sys.stderr, "Error! Wrong argument:", argv[i]
            return 1

    if myfunc.checkfile(g_params['path_predtopo'], "path_predtopo") != 0:
        return 1
    if g_params['basename'] == "":
        print >> sys.stderr, "%s: basename not set. exit" % (argv[0])
        return 1
    if myfunc.checkfile(real_topofile, "real_topofile") != 0:
        return 1

    if restrictIDListFile != "":
        g_params['restrictIDset'] = set(myfunc.ReadIDList(restrictIDListFile))
        g_params['isRestrictIDList'] = True

    if g_params['mode'] == "":
        # "topcons_single" must be tested first: it also contains "topcons"
        if g_params['path_predtopo'].find("topcons_single") >= 0:
            g_params['mode'] = "tps"
        elif g_params['path_predtopo'].find("topcons") >= 0:
            g_params['mode'] = "tp"
        else:
            # path_predtopo only exists inside g_params, not as a local name
            print >> sys.stderr, "mode not set, and can not be recognized from path_predtopo=%s" % (
                g_params['path_predtopo'])
            return 1

    if not g_params['mode'] in ["tp", "tps"]:
        print >> sys.stderr, "Unrecognized mode = %s" % (g_params['mode'])
        return 1

    (real_idlist, real_annolist, real_topolist) = myfunc.ReadFasta(real_topofile)
    seqDict = {}
    if seqfile != "" and os.path.exists(seqfile):
        (seq_idlist, seq_annolist, seqlist) = myfunc.ReadFasta(seqfile)
        for i in xrange(len(seq_idlist)):
            seqDict[seq_idlist[i]] = seqlist[i]

    if len(real_idlist) <= 0:
        print >> sys.stderr, "Failed to read real_topofile %s" % (
            real_topofile)
        return 1

    real_topodict = {}
    for i in xrange(len(real_idlist)):
        real_topodict[real_idlist[i]] = real_topolist[i]

    fpout = myfunc.myopen(outfile, sys.stdout, "w", False)
    fpout_wrong = myfunc.myopen(outfile_wrong_predtopo, None, "w", False)

    # split the real topologies into single- and multi-spanning proteins
    idSet_single = set([])
    idSet_multi = set([])
    for seqid in real_topodict:
        numTM = myfunc.CountTM(real_topodict[seqid])
        if numTM == 1:
            idSet_single.add(seqid)
        elif numTM > 1:
            idSet_multi.add(seqid)

    #for TM_type in ["All_Alpha", "Single", "Multi"]:
    for TM_type in ["All_Alpha"]:
        if TM_type == "All_Alpha":
            sub_real_topodict = real_topodict
        else:
            sub_real_topodict = {}
            for seqid in real_topodict:
                topo = real_topodict[seqid]
                numTM = myfunc.CountTM(topo)
                if TM_type == "Single" and numTM == 1:
                    sub_real_topodict[seqid] = topo
                elif TM_type == "Multi" and numTM > 1:
                    sub_real_topodict[seqid] = topo
        Benchmark(sub_real_topodict, idSet_single, idSet_multi, TM_type,
                  fpout, fpout_wrong, seqDict)

    myfunc.myclose(fpout)
    # NOTE(review): assumes myopen hands back its default (None) when no
    # -owrong file was given -- confirm against myfunc.myopen
    if fpout_wrong is not None:
        myfunc.myclose(fpout_wrong)
    return 0
def Benchmark(real_topodict, idSet_single, idSet_multi, TM_type, fpout, fpout_wrong, seqDict): #{{{
    """Write one benchmark table for the predicted topology groups.

    real_topodict -- {seqid: real topology}
    idSet_single, idSet_multi -- accepted but not used here
    TM_type       -- label written in the table header and forwarded to
                     CountIdenticalTopology
    fpout         -- stream receiving the table
    fpout_wrong, seqDict -- forwarded to CountIdenticalTopology

    The groups and the names of the predicted topology files depend on
    g_params['mode'] ("tps" or "tp"), g_params['path_predtopo'],
    g_params['basename'] and, when g_params['isRMSP'] is set, the ".RMSP"
    infix.  With g_params['isRestrictIDList'] only ids contained in
    g_params['restrictIDset'] are counted.
    """
    mode = g_params['mode']
    if mode == "tps":
        itemlist = ["40", "41", "42", "43", "44", "All"]
    elif mode == "tp":
        itemlist = ["50", "51", "52", "53", "54", "55", "All"]

    isRestrictIDList = g_params['isRestrictIDList']
    addname = ".RMSP" if g_params['isRMSP'] else ""

    if isRestrictIDList:
        numRealTopo = len(g_params['restrictIDset'] & set(real_topodict.keys()))
    else:
        numRealTopo = len(real_topodict)

    # Step 1, read in predicted topology
    pred_topodict_list = []
    for item in itemlist:
        pred_topofile = ""
        isAllGroup = (item.upper() == "ALL")
        if mode == "tps":
            if isAllGroup:
                pred_topofile = "%s/%s.topcons-single_topcons_single%s.topo" % (
                    g_params['path_predtopo'], g_params['basename'], addname)
            else:
                pred_topofile = "%s/%s_topcons_single.m1.agree-%s%s.topo" % (
                    g_params['path_predtopo'], g_params['basename'], item,
                    addname)
        elif mode == "tp":
            if isAllGroup:
                pred_topofile = "%s/%s.topcons.result_TOPCONS%s.topo" % (
                    g_params['path_predtopo'], g_params['basename'], addname)
            else:
                pred_topofile = "%s/%s.topcons.result_TOPCONS.m1.agree-%s%s.topo" % (
                    g_params['path_predtopo'], g_params['basename'], item,
                    addname)

        (pred_idlist, pred_annolist, pred_topolist) = myfunc.ReadFasta(pred_topofile)
        if len(pred_idlist) <= 0:
            print >> sys.stderr, "Failed to read pred_topofile %s" % (
                pred_topofile)

        # keep every prediction, or only the restricted ids when requested
        pred_topodict_list.append(dict(
            (seqid, topo) for (seqid, topo) in zip(pred_idlist, pred_topolist)
            if (not isRestrictIDList) or seqid in g_params['restrictIDset']))

    # Step 2, calculate precision of the prediction
    fpout.write("#%s\n" % (TM_type))
    fpout.write("#%2s %7s %8s %8s %8s %8s %8s %8s %8s\n" % ("No", "Group", "nIDT", "nINV",
                "nPred", "PPV(%)", "NPV_INV", "NPV_Other", "nAllReal"))
    for i, item in enumerate(itemlist):
        pred_topodict = pred_topodict_list[i]
        numPredTopo = len(pred_topodict)
        (numIDTtopo, numINVtopo) = CountIdenticalTopology(
            pred_topodict, real_topodict, item, TM_type, fpout_wrong,
            seqDict, item)
        numOther = numPredTopo - numIDTtopo - numINVtopo
        ss = "%-3d %7s %8d %8d %8d %8.1f %8.1f %8.1f %8d" % (
            i, item, numIDTtopo, numINVtopo, numPredTopo,
            myfunc.FloatDivision(numIDTtopo, numPredTopo) * 100.0,
            myfunc.FloatDivision(numINVtopo, numPredTopo) * 100.0,
            myfunc.FloatDivision(numOther, numPredTopo) * 100.0,
            numRealTopo)
        fpout.write("%s\n" % (ss))
    fpout.write("\n")
def main(g_params):#{{{
    """Mask topologies by signal peptide predictions and write them out.

    Command line (read from sys.argv):
      infile               topology file in FASTA format (positional)
      -sp FILE             signal peptide prediction file (required)
      -f | -format FMT     "signalp" (default) or "phobius"
      -o FILE              output file (myfunc.myopen, default stdout)
      -q                   set g_params['isQuiet']
      -h, --help           print help

    Returns 1 on a usage or input error and 0 on success.
    """
    argv = sys.argv
    numArgv = len(argv)
    if numArgv < 2:
        PrintHelp()
        return 1

    outfile = ""
    infile = ""
    signalp_file = ""
    format_sp = "signalp"

    i = 1
    isNonOptionArg=False
    while i < numArgv:
        if isNonOptionArg:
            infile = argv[i]
            isNonOptionArg = False
            i += 1
        elif argv[i] == "--":
            isNonOptionArg = True
            i += 1
        elif argv[i].startswith("-"):  # startswith: safe for an empty argument
            if argv[i] in ["-h", "--help"]:
                PrintHelp()
                return 1
            elif argv[i] in ["-o", "--o", "-sp", "--sp",
                             "-f", "--f", "-format", "--format"]:
                # an option given last on the command line has no value
                if i + 1 >= numArgv:
                    print >> sys.stderr, \
                        "Error! Option %s requires an argument" % (argv[i])
                    return 1
                if argv[i] in ["-o", "--o"]:
                    outfile = argv[i+1]
                elif argv[i] in ["-sp", "--sp"]:
                    signalp_file = argv[i+1]
                else:
                    format_sp = argv[i+1]
                i += 2
            elif argv[i] in ["-q"]:
                g_params['isQuiet'] = True
                i += 1
            else:
                print >> sys.stderr, "Error! Wrong argument:", argv[i]
                return 1
        else:
            infile = argv[i]
            i += 1

    if infile == "" or not os.path.exists(infile):
        print >> sys.stderr, "infile not set or does not exist"
        return 1
    if signalp_file == "" or not os.path.exists(signalp_file):
        print >> sys.stderr, "signalp file not set or does not exist"
        return 1
    if not format_sp in ["signalp", "phobius"]:
        print >> sys.stderr, "format_sp = %s is not supported. Exit." %(
            format_sp)
        # stop here: the parser below only handles the formats listed above
        return 1

    signalpDict = ReadSignalPeptide(signalp_file, format_sp)
    (idList, annoList, topoList) = myfunc.ReadFasta(infile)
    newTopoList = MaskTopologyBySignalPeptide(idList, topoList, signalpDict)

    fpout = myfunc.myopen(outfile, sys.stdout, "w", False)
    for i in xrange(len(idList)):
        fpout.write(">%s\n"%(annoList[i]))
        fpout.write("%s\n"%(newTopoList[i]))
    myfunc.myclose(fpout)
    return 0
def ReadSeqDBDict(infile): #{{{
    """Read a FASTA file into a dict {seqid: (annotation, sequence)}.

    When an id occurs more than once the last record wins.
    """
    (idList, annotationList, seqList) = myfunc.ReadFasta(infile)
    return dict(
        (seqid, (anno, seq))
        for (seqid, anno, seq) in zip(idList, annotationList, seqList))
def main(g_params): #{{{
    """Write the database sequence of every record of an annotation file.

    Command line (read from sys.argv):
      infile            annotation file in FASTA format (positional)
      -seqdb FILE       sequence database, loaded with GetSeqDict (required)
      -outfile FILE     output file (myfunc.myopen, default stdout)
      -q                set g_params['isQuiet']
      -h, --help        print help

    For every record the annotation line and the database sequence are
    written, followed by the record's own content when it is not empty.
    Ids missing from the database are reported on stderr.
    Returns 1 on a usage or input error.
    """
    argv = sys.argv
    numArgv = len(argv)
    if numArgv < 2:
        PrintHelp()
        return 1

    outfile = ""
    seqdbfile = ""
    infile = ""

    i = 1
    isNonOptionArg = False
    while i < numArgv:
        arg = argv[i]
        if isNonOptionArg == True:
            infile = arg
            isNonOptionArg = False
            i += 1
        elif arg == "--":
            isNonOptionArg = True
            i += 1
        elif arg[0] != "-":
            infile = arg
            i += 1
        elif arg in ["-h", "--help"]:
            PrintHelp()
            return 1
        elif arg in ["-outfile", "--outfile"]:
            outfile = argv[i + 1]
            i += 2
        elif arg in ["-seqdb", "--seqdb"]:
            seqdbfile = argv[i + 1]
            i += 2
        elif arg in ["-q"]:
            g_params['isQuiet'] = True
            i += 1
        else:
            print >> sys.stderr, "Error! Wrong argument:", arg
            return 1

    if infile == "":
        print >> sys.stderr, "annotation file not set"
        return 1
    if not os.path.exists(infile):
        print >> sys.stderr, "annotation file %s does not exist" % (infile)
        return 1
    if seqdbfile == "":
        print >> sys.stderr, "seqdbfile file not set"
        return 1
    if not os.path.exists(seqdbfile):
        print >> sys.stderr, "seqdbfile file %s does not exist" % (seqdbfile)
        return 1

    seqDict = GetSeqDict(seqdbfile)
    if seqDict == {}:
        print >> sys.stderr, "Failed to read seqdbfile %s" % (seqdbfile)
        return 1

    (idList, annoList, contentList) = myfunc.ReadFasta(infile)
    fpout = myfunc.myopen(outfile, sys.stdout, "w", False)
    for k in xrange(len(idList)):
        seqid = idList[k]
        try:
            seq = seqDict[seqid]
        except KeyError:
            print >> sys.stderr, "seqid %s not found in seqdb" % (seqid)
        else:
            fpout.write(">%s\n" % (annoList[k]))
            fpout.write("%s\n" % (seq))
            if contentList[k] != "":
                fpout.write("%s\n" % (contentList[k]))
    myfunc.myclose(fpout)
# read in taxonomy def if not os.path.exists(fastafile): print("Error! file fastafile (%s) does not exist." % fastafile, file=sys.stderr) sys.exit(1) if not os.path.exists(treefile): print("Error! file treefile (%s) does not exist." % treefile, file=sys.stderr) sys.exit(1) t = Tree(treefile) leaves = t.get_leaves() leafNameList = [x.name for x in leaves] leafNameSet = set(leafNameList) (idList, annotationList, seqList) = myfunc.ReadFasta(fastafile) # write out taxdef fpout = sys.stdout numSeq = len(idList) # write settings dataset_settings = """\ TREE_COLORS #use this template to define branch colors and styles, colored ranges and label colors/font styles/backgrounds #lines starting with a hash are comments and ignored during parsing #=================================================================# # MANDATORY SETTINGS # #=================================================================# #select the separator which is used to delimit the data below (TAB,SPACE or COMMA).This separator must be used throughout this file (except in the SEPARATOR line, which uses space).
# Small manual test driver: the first command-line argument selects the test.
progname = os.path.basename(sys.argv[0])
general_usage = """ usage: %s TESTMODE options """ % (sys.argv[0])

numArgv = len(sys.argv)
if numArgv <= 1:
    print(general_usage)
    sys.exit(1)

TESTMODE = sys.argv[1]
g_params = {}

if TESTMODE == "loadpil":
    # load a TrueType font relative to the run directory
    g_params['font_dir'] = "%s/../fonts/truetype/ttf-dejavu/" % (rundir)
    g_params['font_size'] = 16
    fontpath = g_params['font_dir'] + "DejaVuSerif.ttf"
    print(fontpath)
    g_params['fntTMbox_label'] = ImageFont.truetype(fontpath, 10)
elif TESTMODE == "getgapposition":
    # second argument is the topology string itself
    topo = sys.argv[2]
    posGAP = myfunc.GetGapPosition(topo)
    print(posGAP)
elif TESTMODE == "readfasta":
    # second argument is the path of a FASTA file
    seqfile = sys.argv[2]
    (idList, annoList, seqList) = myfunc.ReadFasta(seqfile)
    print(idList)
    print(seqList)
def start_boctopus(infile, blastpath, modHome, hmmfilename, ws_cytosolic, ws_extracellular, ws_lipidfacing, ws_porefacing, \ fakedbpath, dbpath, blastpgppath, hhsearchpath, hhblitspath, rpath): print "boctopus2 will start with ", infile # f = open(infile, "r")#{{{ DELETED # lines = f.readlines() # f.close() # # pname = [] # seqname = [] # tempseq = "" # for line in lines: # line = line.strip() # # if line.startswith(">"): # pname.append(line[1:]) # if len(tempseq) > 0: # seqname.append(tempseq) # tempseq = "" # else: # tempseq += line # # if len(tempseq) > 0: # seqname.append(tempseq) # # print pname # print seqname # # if len(pname) != len(seqname): # print "number of pnames and seqs not the same." # else:#}}} # rewrite sequence reading part (seqidlist, seqannolist, seqlist) = myfunc.ReadFasta(infile) if len(seqidlist) <= 0: print >> sys.stderr, "No valid sequences read from file '%s'"%(infile) return 1 #for i in range(0, len(pname)): for i in xrange(len(seqidlist)): seqid = seqidlist[i] seq = seqlist[i] seqanno = seqannolist[i] print "processing ", i , seqanno subtmpdir = "%s/seq_%d"%(tmpdir, i) if os.path.exists(subtmpdir): shutil.rmtree(subtmpdir) os.makedirs(subtmpdir) singleseqfile = "%s/query.fa"%(subtmpdir) myfunc.WriteFile(">%s\n%s\n"%(seqanno, seq), singleseqfile, mode="w", isFlush=True) if not os.path.exists(singleseqfile): print >> sys.stderr, "Failed to write to singleseqfile %s"%(singleseqfile) continue command = "python "+ "%s/boctopus_startHMM.py "%(rundir) + singleseqfile + " " + blastpath + " " + modHome + " " + hmmfilename + " " + ws_cytosolic + " " + ws_extracellular + " " + ws_lipidfacing + " " + ws_porefacing + " " + rpath+ " " +fakedbpath+\ " " + dbpath+ " " + blastpgppath+ " " + hhsearchpath + " " + hhblitspath print command os.system(command) outpath_this_seq = "%s/seq_%d"%(outpath, i) if not os.path.exists(outpath_this_seq): os.makedirs(outpath_this_seq) filepair_to_copy = [ ("%s/query.fa"%subtmpdir, "%s/query.fa"%outpath_this_seq), 
("%s/output/query_ioIOS.prf.txt_svm_topo.png"%subtmpdir, "%s/query.predict.png"%(outpath_this_seq)), ("%s/output/query_topologies.txt"%(subtmpdir), "%s/query_topologies.txt"%outpath_this_seq), ("%s/svmoutput/query_ioIOS.prf.txt"%subtmpdir, "%s/profile.txt"%outpath_this_seq), ("%s/pssm/query.filtered.pssmvals"%subtmpdir, "%s/pssm.txt"%(outpath_this_seq)) ] for tup in filepair_to_copy: shutil.move(tup[0], tup[1]) return
def main(): #{{{ if 0: #{{{ strTop1 = "---MMMM-----i-i-i---MMM----MMMM-ooo" strTop2 = "----MMMM-----i-ii-----MMM---MMM--oo" strProtein1 = "id1" strProtein2 = "id2" fpLog = sys.stdout class_gapless, num1_gapless, num2_gapless = ct.CompareToposGaplesslyNew( strTop1, strTop2, strProtein1, strProtein2, fpLog) # Note: calling the int, float, string will not change their original value # calling the dict, list will change their original value print "strTop1:", strTop1 print "strTop2:", strTop2 #}}} if 0: #{{{ PrintFuncName() print("this file name is: %s" % __file__) #}}} if 0: #{{{ # filename="/nanjiang/data/blastdb/uniprot_KW181_idt50.fasta" filename = sys.argv[1] print filename fp = open(filename, "r") lines = fp.readlines() fp.close() #}}} if 0: #{{{ # filename="/nanjiang/data/blastdb/uniprot_KW181_idt50.fasta" filename = sys.argv[1] print filename BLOCK_SIZE = 100000 fp = open(filename, "r") buff = fp.read(BLOCK_SIZE) while buff: buff = fp.read(BLOCK_SIZE) fp.close() #}}} if 0: #{{{ # filename="/nanjiang/data/blastdb/uniprot_KW181_idt50.fasta" filename = sys.argv[1] print filename fp = open(filename, "r") line = fp.readline() while line: line = fp.readline() fp.close() #}}} if 0: #{{{ try: BLOCK_SIZE = 100000 infile = sys.argv[1] fpin = open(infile, 'rb') unprocessedBuffer = "" isEOFreached = False while 1: buff = fpin.read(BLOCK_SIZE) if len(buff) < BLOCK_SIZE: isEOFreached = True buff = unprocessedBuffer + buff recordList = [] unprocessedBuffer = myfunc.ReadFastaFromBuffer( buff, recordList, isEOFreached) if len(recordList) > 0: for record in recordList: sys.stdout.write(">%s\n" % record[1]) sys.stdout.write("%s\n" % record[2]) if isEOFreached == True: break fpin.close() except IOError: raise #}}} if 0: #{{{ try: infile = sys.argv[1] (annoList, seqList) = myfunc.ReadFasta_without_id(infile) for i in xrange(len(seqList)): sys.stdout.write(">%s\n" % annoList[i]) sys.stdout.write("%s\n" % seqList[i]) except IOError: raise #}}} if 0: #{{{ hhrfile = 
"hhsearch/A1RZ92-Q74DY9.hhr" if IsDuplicatedByHHSearch(hhrfile): print "yes" #}}} if 0: #{{{ import pairlistwithfamid2pairaln_by_msa seq1 = "--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------MLSSTATTMLRAGVSRSSGALQPMLLRSAACPCSPFSMNTKLSQPTSV-----RPLSTSPSALVLRFRAQQQAQLAQQQLRRASSSSSSSSSSTRPRSDAELDANAAEAAAAAQSAAHAGEPVLDWNTFFKLRKTRRRVQLAFSVIMTLITSGAGGAVLSTGVADAMVAQVPLEPMFAVGLMTASFGALGWLMGPAMGGMVFNALKSKYRGQMEIKEGQFFARIKKHRVDPSASSMGNPVPDFYGEKISSVAGYRQWLKDQRAFNKKRTTFV" seq2 = "MDILLAVLEQGFIFSIVCFGVYITYKILDFPDLSVDGTFPLGAAVAAAFLVKGYSPVLSSLAALVAGAIAGGITGILHVKFKITNLLSGILVMVGLYSINLRIMGKSNIPLFNKIHLFSDTMNPIIIITVFLLICKITLDLFLKTKAGFILKATGDNEQLVLSLGVNKDLVKIMGLMLSNALVALGGALMAQYQGFSDVGMGTGIVVMGLASVIIGESLFGRIKALNATTRVLLGALVYKLSVSI---ALTVGLAP-------TDLKLVTAIIVVIALSLNKNPLKIITKQKTKEGGIL------NASNTKSAQSVQ-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------" seq1 = "---------------------------------------------------------------------------------------------------------------------------------------MALSSLFFTASALLLMFLAFLGGARNSNPLDRIYWLEAATGNIPGAPALSRWTYWNLCAVNSEGHNECGKSYPDYPFDPPSHRNFNTHVNIPAAFIGTRHYFLTSRFMFPFHIIALFFATCSLLTGFLAMCTRIGNWVSAFSAYFALTFQTITTCLMTAVYVQGRDKFNNNGQSSHLGVKAFAFMWTSVALLFLSCVIYCMGGAVGRKDGGYSGREQRRRGFFNSHRSGSLRSNKETAP" seq2 = "MRKIAAIGGIVFISFILTIVAMFTKLWISWSIGKFSYGIGIVPYHSNSAGWFTAASWMVFISFGLFIPLILVVLFTAYKVHHDGCCHSIRHCFNSICLICSIIAVLEIIAFVLMAVNASRYVKGASISEKKSLLQLGSSAYLDLVSAILIIVATVLSGHASHHDCH----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------" alignFactor = pairlistwithfamid2pairaln_by_msa.GetAlignmentFactorFromPairAlignment( 
seq1, seq2) print alignFactor #}}} if 0: #{{{ try: dbname = sys.argv[1] print dbname from myfunc import MyDB cls = MyDB(dbname) # print cls.idList record = cls.GetRecord("A0FGX9") if record: print record # for rd in cls.GetAllRecord(): # print rd (seqid, anno, seq) = myfunc.ExtractFromSeqWithAnno(record) print(seqid, anno, seq) except IndexError: pass #}}} if 0: #{{{ import my_extractdb #miniking my_extractdb.py see which one is faster try: dbname = sys.argv[1] idlistfile = sys.argv[2] cls = myfunc.MyDB(dbname) if cls.failure: print >> sys.stderr, "MyDB init failed" else: idlist = open(idlistfile, "r").read().split("\n") fpout = sys.stdout for seqid in idlist: if seqid: record = cls.GetRecord(seqid) fpout.write(record) # for rd in cls.GetAllRecord(): # print rd # (seqid, anno, seq) = myfunc.ExtractFromSeqWithAnno(record) # print (seqid, anno, seq) except IndexError: print "error" pass #}}} if 0: #{{{ #test ReadLineByBlock try: infile = sys.argv[1] from myfunc import ReadLineByBlock cls = ReadLineByBlock(infile) lines = cls.readlines() while lines != None: for line in lines: print line lines = cls.readlines() except IndexError: pass #}}} if 0: #{{{ #test speed of ReadLineByBlock # ReadLineByBlock is about 3 times fater than file.readline() try: from myfunc import ReadLineByBlock infile = sys.argv[1] start = time.time() hdl = ReadLineByBlock(infile) lines = hdl.readlines() while lines != None: lines = hdl.readlines() hdl.close() end = time.time() msg = "Reading %s by ReadLineByBlock costs %.3fs seconds" print msg % (infile, (end - start)) start = time.time() hdl = open(infile, "r") line = hdl.readline() while line: line = hdl.readline() hdl.close() end = time.time() msg = "Reading %s by readline() costs %.3fs seconds" print msg % (infile, (end - start)) except IndexError: pass #}}} if 0: #{{{ #test readline try: infile = sys.argv[1] fp = open(infile, "r") line = fp.readline() while line: print line line = fp.readline() fp.close() except IndexError: pass #}}} if 0: 
#{{{ #test the speed of GetFirstWord try: nloop = int(sys.argv[1]) string = "kjdafk jasdfj j" #string = "askdf askdf " # string = "kajsdfasdfsdfjakasjdfka" # string = "kajsdfasdf,sdfjakasjdfka" delimiter = " \t\r,.\n" delimiter = " " for i in xrange(nloop): #firstword = myfunc.GetFirstWord(string, delimiter) #firstword = string.split()[0] #firstword = string.partition(" ")[0] firstword = myfunc.GetFirstWord(string) #pass #print firstword except (IndexError, ValueError): pass #}}} if 0: #{{{ # read seq by SeqIO from Bio import SeqIO try: seqfile = sys.argv[1] # 1. SeqIO #################### start = time.time() handle = open(seqfile, "rU") cnt = 0 for record in SeqIO.parse(handle, "fasta"): cnt += 1 handle.close() end = time.time() msg = "Reading %d sequences by SeqIO costs %.3fs seconds" print msg % (cnt, (end - start)) # 2. ReadFasta #################### start = time.time() seqfile = sys.argv[1] (idList, annoList, seqList) = myfunc.ReadFasta(seqfile) end = time.time() msg = "Reading %d sequences by ReadFasta costs %.3fs seconds" print msg % (len(idList), (end - start)) # 3. ReadFasta from buffer BLOCK_SIZE = 100000 start = time.time() cnt = 0 fpin = open(seqfile, 'rb') unprocessedBuffer = "" isEOFreached = False while 1: buff = fpin.read(BLOCK_SIZE) if len(buff) < BLOCK_SIZE: isEOFreached = True buff = unprocessedBuffer + buff recordList = [] unprocessedBuffer = myfunc.ReadFastaFromBuffer( buff, recordList, isEOFreached) cnt += len(recordList) if isEOFreached == True: break fpin.close() end = time.time() msg = "Reading %d sequences by ReadFastaFromBuffer costs %.3fs seconds" print msg % (cnt, (end - start)) # 4. 
ReadFastaByBlock #################### start = time.time() seqfile = sys.argv[1] hdl = myfunc.ReadFastaByBlock(seqfile, 0, 0) if hdl.failure: print >> sys.stderr, "Failed to init ReadFastaByBlock" return 1 recordList = hdl.readseq() cnt = 0 while recordList != None: cnt += len(recordList) # for rd in recordList: # print ">%s"%rd.description # print rd.seq recordList = hdl.readseq() hdl.close() end = time.time() msg = "Reading %d sequences by ReadFastaByBlock costs %.3fs seconds" print msg % (cnt, (end - start)) except (IndexError, ValueError): pass #}}} if 0: #{{{ #test RemoveUnnecessaryGap try: infile = sys.argv[1] start = time.time() (idList, seqList) = myfunc.ReadFasta_without_annotation(infile) seqList = lcmp.RemoveUnnecessaryGap_old(seqList) end = time.time() msg = "Run RemoveUnnecessaryGap_old for %s costs %.3fs seconds" print >> sys.stderr, msg % (infile, (end - start)) for seq in seqList: print seq start = time.time() (idList, seqList) = myfunc.ReadFasta_without_annotation(infile) seqList = lcmp.RemoveUnnecessaryGap(seqList) end = time.time() msg = "Run RemoveUnnecessaryGap for %s costs %.3fs seconds" print >> sys.stderr, msg % (infile, (end - start)) for seq in seqList: print seq except IndexError: pass #}}} if 0: #{{{ #test ReadMPAByBlock try: infile = sys.argv[1] hdl = myfunc.ReadMPAByBlock(infile) if hdl.failure: return recordList = hdl.readseq() while recordList != None: for rd in recordList: #print rd.seqid print ">%s" % (rd.description) print "%s" % (myfunc.mpa2seq(rd.mpa)) recordList = hdl.readseq() hdl.close() except IndexError: pass #}}} if 0: #{{{ try: dbname = sys.argv[1] print dbname from myfunc import MyDB cls = MyDB(dbname) # print cls.idList record = cls.GetRecord("A0FGX9") if record: print record # for rd in cls.GetAllRecord(): # print rd (seqid, anno, seq) = myfunc.ExtractFromSeqWithAnno(record) print(seqid, anno, seq) except IndexError: pass #}}} if 0: #{{{ #test subprocess import glob #invoke shell explicitly, not very good, may have 
security problems subprocess.call("seq 10", shell=True) subprocess.call("echo wait for 2 seconds...; sleep 2", shell=True) subprocess.call("ls topo*.py", shell=True) if 1: #{{{ #test subprocess import glob #invoke shell implicitly, recommended way subprocess.call(["seq", "10"], shell=False) subprocess.call(["echo", "wait for 1 seconds..."]) subprocess.call(["sleep", "1"]) try: print subprocess.check_call(["ls", "topo*.py"]) #This will not work except subprocess.CalledProcessError, e: print "error message:", e subprocess.call(["ls"] + glob.glob("topo*.py"))
def WritePairAln(pairlistDict, msapath, msaext, outname):#{{{ verbose = g_params['verbose'] outAlnFile = outname + ".pairaln" outTableFile = outname + ".tableinfo" outSelPairList = outname + ".pairlistwithpfamid" try: fpout_aln = open(outAlnFile, "w") except IOError: print >> sys.stderr, "Failed to write to file", outAlnFile return 1 try: fpout_table = open(outTableFile, "w") except IOError: print >> sys.stderr, "Failed to write to file", outTableFile return 1 try: fpout_list = open(outSelPairList, "w") except IOError: print >> sys.stderr, "Failed to write to file", outSelPairList return 1 fpout_table.write("#%-15s %-15s %6s %6s %9s %6s %6s %9s %6s %6s %6s %6s %6s\n" % ( "Seq1","Seq2", "IDT0", "SIM0", "AlnLength", "Len1","Len2", "Score","N_IDT", "N_SIM", "N_GAP", "IDT1", "IDT2")) for famid in pairlistDict: if verbose >= 2: print "Write pairwise alignment for %s"%(famid) msafile = msapath + os.sep + famid + msaext if not os.path.exists(msafile): print >> sys.stderr, "msafile %s does not exist. 
Ignore" % msafile continue (idList, annoList, seqList) = myfunc.ReadFasta(msafile) msaDict = {} annoDict = {} for i in xrange(len(idList)): msaDict[idList[i]] = seqList[i] annoDict[idList[i]] = annoList[i] pairlist = pairlistDict[famid] #print "pairlist2=", pairlist for pair in pairlist: #print "pair2 = ", pair seq1 = "" seq2 = "" id1 = pair[0] id2 = pair[1] if id1 in msaDict and id2 in msaDict: seq1 = msaDict[id1] seq2 = msaDict[id2] [seq1, seq2] = lcmp.RemoveUnnecessaryGap([seq1, seq2]) if len(seq1) != len(seq2): print >> sys.stderr, "Bad alignment for %s and %s" %(id1,id2) else: rd = pair[2] fpout_aln.write(">%s aligned_to=%s seqIDT=%.1f seqIDT1=%.1f\n"%( annoDict[id1], id2, rd['seqidt0'], rd['seqidt1'])) fpout_aln.write("%s\n"%seq1) fpout_aln.write(">%s aligned_to=%s seqIDT=%.1f seqIDT1=%.1f\n"%( annoDict[id2], id1, rd['seqidt0'], rd['seqidt1'])) fpout_aln.write("%s\n"%seq2) fpout_table.write("%-16s %-15s %6.1f %6.1f %9d %6d %6d %9.1f %6d %6d %6d %6.1f %6.1f\n"% ( id1, id2, rd['seqidt0'], -1.0, rd['alnLength'], rd['seqLength1'], rd['seqLength2'], -1.0, rd['numIDT'], -1, rd['numGap'], rd['seqidt1'], rd['seqidt2'])) fpout_list.write("%s %s %s\n"%(id1, id2, famid)) fpout_aln.close() fpout_table.close() fpout_list.close() print "Result output to " print "\t%s"%outAlnFile print "\t%s"%outTableFile return 0
PrintHelp(); sys.exit(0); elif sys.argv[i] == "-i" or sys.argv[i] == "--infile": inFile=sys.argv[i+1]; i = i + 2; elif sys.argv[i] == "-mintm" or sys.argv[i] == "--mintm": MIN_NUMTM=int(sys.argv[i+1]); i = i + 2; elif sys.argv[i] == "-o" or sys.argv[i] == "--out": outFile=sys.argv[i+1]; i = i + 2; else: print >> sys.stderr,("Error! Wrong argument:%s" % sys.argv[i]); sys.exit(1); else: inFile=sys.argv[i]; i+=1; if inFile == "": print >> sys.stderr,"Error! Topology file not set."; sys.exit(1); try : (idListTopo,annotationListTopo, topoList) = myfunc.ReadFasta(inFile); CleanSingleSpanTMPro(idListTopo, annotationListTopo, topoList); except : print >>sys.stderr, "except for the input file: %s" % inFile; raise ;
def DrawPairwiseTopo(pairtopoAlnFile, aaSeqDict, pairCmpclassDict, outpath): (idList, annoList, seqList) = myfunc.ReadFasta(pairtopoAlnFile) numSeq = len(idList) numPair = numSeq / 2 print "numSeq = ", numSeq print "numPair = ", numPair for i in range(numPair): id1 = idList[2 * i] id2 = idList[2 * i + 1] if len(seqList[2 * i]) != len(seqList[2 * i + 1]): print "Error for %s - %s " % (idList[2 * i], idList[2 * i + 1]) continue basename = "%s-%s" % (id1, id2) isSatisfied = True # if basename in pairCmpclassDict: # if g_params['cmpclassList'] != []: # if (not pairCmpclassDict[basename] in # g_params['cmpclassList']): # isSatisfied = False # elif pairCmpclassDict[basename] == 'OK': # isSatisfied = False if isSatisfied: outPairAlnFile = outpath + os.sep + "%s.topoaln.fa" % (basename) fpout = open(outPairAlnFile, 'w') print >> fpout, ">%s" % annoList[2 * i] print >> fpout, "%s" % seqList[2 * i] print >> fpout, ">%s" % annoList[2 * i + 1] print >> fpout, "%s" % seqList[2 * i + 1] fpout.close() outAASeqFile = outpath + os.sep + "%s.fa" % (basename) fpout = open(outAASeqFile, "w") if id1 in aaSeqDict: print >> fpout, ">%s" % id1 print >> fpout, "%s" % aaSeqDict[id1] if id2 in aaSeqDict: print >> fpout, ">%s" % id2 print >> fpout, "%s" % aaSeqDict[id2] fpout.close() # Output dgscan file dgpfile = outpath + os.sep + basename + '.dgscan' cmd = "%s %s -lmin 21 -lmax 21 -o %s" % (dgscanprog, outAASeqFile, dgpfile) os.system(cmd) outpngfile = outpath + os.sep + "%s.topoaln.png" % basename outShrinkedFile = (outpath + os.sep + "%s.topoaln.shrinked.png" % basename) thumb_outShrinkedFile = (outpath + os.sep + 'thumb.' + "%s.topoaln.shrinked.png" % basename) outNonShrinkedFile = (outpath + os.sep + "%s.topoaln.nonshrinked.png" % basename) thumb_outNonShrinkedFile = ( outpath + os.sep + 'thumb.' 
+ "%s.topoaln.nonshrinked.png" % basename) os.system( "python %s/drawMSATopo.py %s -pfm no -shrink yes -method mat" % (binpath, outPairAlnFile)) os.system("mv %s %s" % (outpngfile, outShrinkedFile)) os.system( "python %s/drawMSATopo.py %s -pfm no -shrink no -pdg yes -method yes -dgpfile %s" % (binpath, outPairAlnFile, dgpfile)) os.system("mv %s %s" % (outpngfile, outNonShrinkedFile)) os.system("convert -thumbnail 200 %s %s" % (outShrinkedFile, thumb_outShrinkedFile)) os.system("convert -thumbnail 200 %s %s" % (outNonShrinkedFile, thumb_outNonShrinkedFile)) os.system("rm -f %s %s" % (outAASeqFile, dgpfile))
def DumpPredictionTOPCONS2(seqfile, path_result, outfile, isWriteDG, isWriteRel): #{{{ (seqidlist, seqannolist, seqlist) = myfunc.ReadFasta(seqfile) outfile_fa = "%s.fa" % (outfile) outfile_unfinished_fa = "%s.unfinished.fa" % (outfile) numseq = len(seqidlist) fpout = None try: fpout = open(outfile, "w") except IOError: print >> sys.stderr, "Failed to write to file \"%s\"" % (outfile) return 1 fpout_fa = None try: fpout_fa = open(outfile_fa, "w") except IOError: print >> sys.stderr, "Failed to write to file \"%s\"" % (outfile_fa) return 1 fpout_unfinished_fa = None try: fpout_unfinished_fa = open(outfile_unfinished_fa, "w") except IOError: print >> sys.stderr, "Failed to write to file \"%s\"" % ( outfile_unfinished_fa) return 1 methodlist = [ 'TOPCONS', 'OCTOPUS', 'Philius', 'PolyPhobius', 'SCAMPI', 'SPOCTOPUS', 'Homology' ] cntUnFinished = 0 for iseq in xrange(len(seqidlist)): seq = seqlist[iseq] length = len(seq) desp = seqannolist[iseq] if g_params['resultPathFormat'] == "md5": md5_key2 = hashlib.md5(seq + "\n").hexdigest() md5_key1 = hashlib.md5(seq).hexdigest() subdirname = "seq_%d" % (0) isFound = False for md5_key in [md5_key1, md5_key2]: dir1 = md5_key[:2] dir2 = md5_key[2:4] datapath_this_seq = "%s%s%s%s%s%s%s" % ( path_result, os.sep, dir1, os.sep, dir2, os.sep, md5_key) subdir = "%s/%s" % (datapath_this_seq, subdirname) if os.path.exists(subdir): break else: subdirname = "seq_%d" % (iseq) subdir = "%s/%s" % (path_result, subdirname) if g_params['verbose']: print "subdir = %s" % (subdir) rstfile = "%s/Topcons/topcons.top" % (subdir) if os.path.exists(rstfile): print >> fpout, "Sequence number: %d" % (iseq + 1) print >> fpout, "Sequence name: %s" % (desp) print >> fpout, "Sequence length: %d aa." 
% (length) print >> fpout, "Sequence:\n%s\n\n" % (seq) topo_consensus = "" for i in xrange(len(methodlist)): method = methodlist[i] seqid = "" seqanno = "" top = "" if method == "TOPCONS": topfile = "%s/%s/topcons.top" % (subdir, "Topcons") elif method == "Philius": topfile = "%s/%s/query.top" % (subdir, "philius") elif method == "SCAMPI": topfile = "%s/%s/query.top" % (subdir, method + "_MSA") else: topfile = "%s/%s/query.top" % (subdir, method) if os.path.exists(topfile): (seqid, seqanno, top) = myfunc.ReadSingleFasta(topfile) else: top = "" if top == "": #top = "***No topology could be produced with this method topfile=%s***"%(topfile) top = "***No topology could be produced with this method***" if method == "TOPCONS": topo_consensus = top if method == "Homology": showtext_homo = method if seqid != "": showtext_homo = seqid print >> fpout, "%s:\n%s\n\n" % (showtext_homo, top) else: print >> fpout, "%s predicted topology:\n%s\n\n" % (method, top) if isWriteDG: dgfile = "%s/dg.txt" % (subdir) dg_content = "" if os.path.exists(dgfile): dg_content = myfunc.ReadFile(dgfile) lines = dg_content.split("\n") dglines = [] for line in lines: if line and line[0].isdigit(): dglines.append(line) if len(dglines) > 0: print >> fpout, "\nPredicted Delta-G-values (kcal/mol) "\ "(left column=sequence position; right column=Delta-G)\n" print >> fpout, "\n".join(dglines) if isWriteRel: reliability_file = "%s/Topcons/reliability.txt" % (subdir) reliability = "" if os.path.exists(reliability_file): reliability = myfunc.ReadFile(reliability_file) if reliability != "": print >> fpout, "\nPredicted TOPCONS reliability (left "\ "column=sequence position; right column=reliability)\n" print >> fpout, reliability print >> fpout, "##############################################################################" # write the concensus prediction in FASTA format print >> fpout_fa, ">%s" % (desp) print >> fpout_fa, topo_consensus else: # write unfinished fpout_unfinished_fa.write(">%s\n%s\n" % 
(desp, seq)) cntUnFinished += 1 if cntUnFinished > 1: print >> sys.stderr, "%s out of %d sequences are with unfinished predictions, please check." % ( cntUnFinished, numseq) for fp in [fpout, fpout_fa, fpout_unfinished_fa]: if fp: try: fp.close() except IOError: pass return 0
def main(g_params): #{{{
    """Command line entry point: run CalKRBias on every sequence.

    Recognised options (single or double dash): -outpath, -maxdist,
    -flankwin, -seqfile, -topofile, -l, -q, -debug, -h/--help.  Positional
    arguments are collected in an ID list that is currently not used.

    The sequences (-seqfile) and topologies (-topofile) are read as FASTA.
    The second whitespace separated token of every topology annotation is
    taken as the comparison class of the ID given by the first token;
    sequences without one are treated as "INV".  For each sequence that
    has a topology, CalKRBias is called and the result is passed to
    WriteResult, writing to <rootname>.krlist.txt.  The bias value is also
    written to <rootname>.krbias.txt when the class is "IDT" or "INV".
    Both files are created in the directory of the sequence file.

    Args:
        g_params: dict of global parameters; 'isQuiet' and 'isDEBUG' are set
            by -q and -debug, and 'isDEBUG' is read while processing.

    Returns:
        1 on usage or input errors, otherwise None.
    """
    argv = sys.argv
    numArgv = len(argv)
    if numArgv < 2:
        PrintHelp()
        return 1

    outpath = "./"
    idListFile = None
    idList = []
    seqfile = ""
    topofile = ""
    # maximal distance to the TM helix so that K, R residues are counted
    max_dist = 12
    # flanking window of the TM helix, residues at position
    # TMbeg-flank_win and TMend+flank_win are also counted
    flank_win = 5

    i = 1
    isNonOptionArg = False
    while i < numArgv:
        if isNonOptionArg == True:
            idList.append(argv[i])
            isNonOptionArg = False
            i += 1
        elif argv[i] == "--":
            isNonOptionArg = True
            i += 1
        elif argv[i][0] == "-":
            if argv[i] in ["-h", "--help"]:
                PrintHelp()
                return 1
            elif argv[i] in ["-outpath", "--outpath"]:
                (outpath, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-maxdist", "--maxdist"]:
                (max_dist, i) = myfunc.my_getopt_int(argv, i)
            elif argv[i] in ["-flankwin", "--flankwin"]:
                (flank_win, i) = myfunc.my_getopt_int(argv, i)
            elif argv[i] in ["-seqfile", "--seqfile"]:
                (seqfile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-topofile", "--topofile"]:
                (topofile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-l", "--l"]:
                (idListFile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-q"]:
                g_params['isQuiet'] = True
                i += 1
            elif argv[i] in ["-debug"]:
                g_params['isDEBUG'] = True
                i += 1
            else:
                print("Error! Wrong argument:", argv[i], file=sys.stderr)
                return 1
        else:
            idList.append(argv[i])
            i += 1

    # Both input files are mandatory; fail early with a clear message
    # instead of trying to read an empty file name.
    if seqfile == "":
        print("seqfile not set. exit", file=sys.stderr)
        return 1
    if topofile == "":
        print("topofile not set. exit", file=sys.stderr)
        return 1

    (idListSeq, annoListSeq, seqList) = myfunc.ReadFasta(seqfile)
    (idListTopo, annoListTopo, topoList) = myfunc.ReadFasta(topofile)
    numSeq = len(idListSeq)
    numTopo = len(idListTopo)
    if numSeq < 1 or numTopo < 1:
        print("No seq set", file=sys.stderr)
        return 1

    seqDict = dict(zip(idListSeq, seqList))
    topoDict = dict(zip(idListTopo, topoList))

    cmpclassDict = {}
    for anno in annoListTopo:
        strs = anno.lstrip(">").split()
        # Annotations without a second token carry no comparison class;
        # such sequences fall back to "INV" below instead of raising
        # IndexError here.
        if len(strs) >= 2:
            cmpclassDict[strs[0]] = strs[1]

    # NOTE(review): the value parsed from -outpath is overwritten here, so
    # the option currently has no effect on where the results are written.
    outpath = os.path.dirname(seqfile)
    if outpath == "":
        outpath = "."
    rootname = os.path.basename(os.path.splitext(seqfile)[0])
    outfile_kr_list = outpath + os.sep + rootname + ".krlist.txt"
    outfile_krbias = outpath + os.sep + rootname + ".krbias.txt"

    # The with-statement closes both files also when CalKRBias or
    # WriteResult raises.
    with open(outfile_kr_list, "w") as fpout_krlist, \
            open(outfile_krbias, "w") as fpout_krbias:
        for idd in idListSeq:
            if g_params['isDEBUG']:
                print("seqid: %s" % (idd))
            try:
                topo = topoDict[idd]
            except KeyError:
                print("no topo for %s" % idd, file=sys.stderr)
                continue
            # idd comes from idListSeq, so it is always a key of seqDict.
            seq = seqDict[idd]
            cmpclass = cmpclassDict.get(idd, "INV")

            (kr_bias, KR_pos_list, numTM) = CalKRBias(seq, topo, flank_win,
                                                      max_dist)
            WriteResult(idd, cmpclass, seq, numTM, kr_bias, KR_pos_list,
                        fpout_krlist)
            if cmpclass in ["IDT", "INV"]:
                fpout_krbias.write("%d\n" % kr_bias)
def main():#{{{
    """Command line entry point: run Topo2TMFrag on a topology file.

    Options: -i/--infile (topology file, may also be given as a positional
    argument), -f/--fasta (amino acid sequence file, mandatory),
    -printid/--printid y|n (sets the global isPrintSeqID),
    -o/--outfile (defaults to stdout) and -h/--help.

    The amino acid sequences are loaded into a dict keyed by sequence ID.
    The topology file is then read in blocks of BLOCK_SIZE bytes; the
    complete FASTA records of each block are handed to Topo2TMFrag.

    Returns:
        1 on usage or input errors, -1 if the topology file cannot be
        opened, otherwise None.
    """
    numArgv = len(sys.argv)
    if numArgv < 2:
        PrintHelp()
        return 1

    global isPrintSeqID

    outFile=""
    inFile=""
    fastaFile=""

    i = 1
    isNonOptionArg=False
    while i < numArgv:
        if isNonOptionArg == True:
            # The argument following "--" is the input file even if it
            # starts with a dash (it used to be dropped silently).
            inFile=sys.argv[i]
            isNonOptionArg=False
            i = i + 1
        elif sys.argv[i] == "--":
            isNonOptionArg=True
            i = i + 1
        elif sys.argv[i][0] == "-":
            if sys.argv[i] == "-h" or sys.argv[i] == "--help":
                PrintHelp()
                return 1
            elif sys.argv[i] == "-i" or sys.argv[i] == "--infile":
                inFile=sys.argv[i+1]
                i = i + 2
            elif sys.argv[i] == "-f" or sys.argv[i] == "--fasta":
                fastaFile=sys.argv[i+1]
                i = i + 2
            elif sys.argv[i] == "-printid" or sys.argv[i] == "--printid":
                if (sys.argv[i+1].lower())[0] == "y":
                    isPrintSeqID=True
                else:
                    isPrintSeqID=False
                i = i + 2
            elif sys.argv[i] == "-o" or sys.argv[i] == "--outfile":
                outFile=sys.argv[i+1]
                i = i + 2
            else:
                print >> sys.stderr,("Error! Wrong argument:%s" % sys.argv[i])
                return 1
        else:
            inFile=sys.argv[i]
            i+=1

    if inFile == "":
        print >> sys.stderr,"Error! Topology file not set."
        return 1
    if fastaFile == "":
        print >> sys.stderr,"Error! amino acid fasta file not set."
        return 1

    fpout = sys.stdout
    if outFile != "":
        # open() raises IOError on failure (it never returns a false
        # value), so the fallback to stdout has to live in the handler.
        try:
            fpout = open(outFile,"w")
        except IOError:
            print >> sys.stderr, "Failed to write to outfile %s. "%(outFile)
            print >> sys.stderr, "Reset output to stdout."
            fpout = sys.stdout

    try:
        sizeAASeqFile = os.path.getsize(fastaFile)
        if sizeAASeqFile > MAX_FASTA_AA_FILE_SIZE:
            print >> sys.stderr, ("size (%d)"%sizeAASeqFile
                    + " of fasta sequence file (%s)"%fastaFile
                    + " is over the limit (%d). Exit."%
                    MAX_FASTA_AA_FILE_SIZE)
            return 1

        (idListSeq, annotationListSeq, seqList) = myfunc.ReadFasta(fastaFile)
        if idListSeq == None:
            print >> sys.stderr, "%s exit with error."%sys.argv[0]
            return 1
        elif len(idListSeq) < 1:
            # The old test compared the list itself with 1, which is never
            # true in Python 2, so this warning was never shown.
            print >> sys.stderr, ("Warning! zero aa sequences have"
                    + " been read in for file %s" %fastaFile)

        aaSeqDict={}
        for i in xrange (len(idListSeq)):
            aaSeqDict[idListSeq[i]] = seqList[i]

        try:
            fpin = open (inFile, "rb")
        except IOError:
            print >> sys.stderr, "Failed to open input file %s"%(inFile)
            return -1

        try:
            # Text left over from the previous block (an incomplete record)
            # is prepended to the next block.
            unprocessedBuffer=""
            isEOFreached = False
            processedTopoIDSet = set([])
            while 1:
                buff = fpin.read(BLOCK_SIZE)
                if len(buff) < BLOCK_SIZE:
                    isEOFreached=True
                buff = unprocessedBuffer + buff
                recordList = []
                unprocessedBuffer = myfunc.ReadFastaFromBuffer(buff,recordList,
                        isEOFreached)
                if len(recordList) > 0:
                    idListTopo = [r[0] for r in recordList]
                    topoList = [r[2] for r in recordList]
                    Topo2TMFrag(idListTopo, topoList,aaSeqDict,
                            processedTopoIDSet, fpout)
                if isEOFreached == True:
                    break
        finally:
            fpin.close()
    finally:
        # Close the output file on every exit path, but never stdout.
        if fpout != None and fpout != sys.stdout:
            fpout.close()
def MakeTMplot(seqAlnFile, topAlnFile, outpath, tmpdir):# {{{
    """Make topology plot for TM family.

    Both alignment files are copied to tmpdir and all work is done there:

      1. drawMSATopo.py draws the topology alignment (<rootname>.png)
      2. mogrify resizes a copy of it to g_params['figure_resize']
      3. write_seqaln_colorTM.py writes the sequence alignment as HTML
      4. wkhtmltopdf (wrapped in xvfb-run on Debian/Ubuntu) converts the
         HTML to PDF, which is then cropped with pdfcrop
      5. merge_tmplot.sh merges the two figures into one PDF with one
         caption per sequence
      6. the merged PDF is cropped via a temporary copy and the result is
         copied to <outpath>/<rootname>.seqtopaln.pdf

    The working directory is changed to tmpdir while the figures are made
    and is restored before returning, also on failure.

    NOTE(review): outpath is used after the chdir, so a relative outpath is
    resolved against tmpdir.

    Args:
        seqAlnFile: sequence alignment file; its base name without the
            extension is used as rootname for all generated files.
        topAlnFile: topology alignment file.
        outpath: folder receiving the final PDF.
        tmpdir: existing working folder.

    Returns:
        0 on success, 1 if an external command fails or the topology
        figure is not produced.
    """
    rootname = os.path.basename(os.path.splitext(seqAlnFile)[0])
    basename_seqAlnFile = os.path.basename(seqAlnFile)
    basename_topAlnFile = os.path.basename(topAlnFile)
    ext_topAlnFile = os.path.splitext(topAlnFile)[1].lstrip('.')
    verbose = g_params['verbose']

    def _run(cmd, verbose_msg=None):
        """Run cmd with myfunc.RunCmd; print its message and return False on failure."""
        if verbose_msg is not None and verbose:
            print(verbose_msg)
        (isCmdSuccess, t_runtime, t_msg) = myfunc.RunCmd(cmd)
        if not isCmdSuccess:
            print(t_msg)
        return isCmdSuccess

    shutil.copy2(seqAlnFile, os.path.join(tmpdir, basename_seqAlnFile))
    shutil.copy2(topAlnFile, os.path.join(tmpdir, basename_topAlnFile))
    cwd = os.getcwd()
    os.chdir(tmpdir)
    try:
        # generate topology one line plot
        cmd = [python_exec, os.path.join(rundir, "drawMSATopo.py"),
               "-m-shrink", str(0), "-method", "pil", "-pfm", "no",
               "-text", "n", "-pdg", "n", "-pfm", "n", "-pmsa", "y",
               "-ptag", "y", "-showTMidx", "-sep", "n", "--advtopo",
               "-cleanplot", "-h2wratio", str(g_params["H2W_ratio"]),
               "-shrink", "no", "-showgap", basename_topAlnFile]
        if not _run(cmd, "Generating toplogy alignment figure for %s"%(rootname)):
            return 1
        topalnfigure = "%s.png"%(rootname)
        if not os.path.exists(topalnfigure):
            return 1

        # resize the figure file (on a copy; mogrify works in place)
        resized_topalnfigure = "%s.s%d.png"%(rootname, g_params['figure_resize'])
        shutil.copy2(topalnfigure, resized_topalnfigure)
        cmd = ["mogrify", "-resize", str(g_params['figure_resize']),
               resized_topalnfigure]
        if not _run(cmd, "Resizing the topology alignment figure for %s"%(rootname)):
            return 1

        # generate seqaln figure
        seqaln_htmlfigure = "%s.%s"%(rootname, "seqaln.html")
        cmd = [python_exec, os.path.join(rundir, "write_seqaln_colorTM.py"),
               basename_seqAlnFile, "-ext-topomsa", ext_topAlnFile,
               "-ws", str(g_params['window_size']), "-o", seqaln_htmlfigure,
               "-cleanplot", "-rmgap"]
        if g_params['isBreakTM']:
            cmd += ["-breakTM"]
        if not _run(cmd, "Generating sequence alignment highlighted by TM regions for %s"%(rootname)):
            return 1

        # convert html to pdf
        seqaln_pdffigure = "%s.%s"%(rootname, "seqaln.pdf")
        cmd = ["wkhtmltopdf", seqaln_htmlfigure, seqaln_pdffigure]
        if os_dist.lower() in ["debian", "ubuntu"]:
            cmd = ["xvfb-run"] + cmd
        if not _run(cmd, "Convert the html figure to PDF for sequence alignment"):
            return 1

        # crop the PDF figure
        if not _run(["pdfcrop", seqaln_pdffigure]):
            return 1
        seqaln_pdffigure_crop = "%s.%s"%(rootname, "seqaln-crop.pdf")

        # merge figures
        (seqIDList, seqAnnoList, seqList) = myfunc.ReadFasta(basename_seqAlnFile)
        outfile = "%s.seqtopaln.pdf"%(rootname)
        cmd = ["bash", os.path.join(rundir, "merge_tmplot.sh"),
               resized_topalnfigure, seqaln_pdffigure_crop,
               "-cap", "%s"%(rootname), "-o", outfile]
        # one caption per sequence, labelled by the entries of alphabet
        for i in range(len(seqIDList)):
            cmd += ["-cap", "%s: %s"%(alphabet[i], seqIDList[i])]
        if not _run(cmd, "Merging the topology alignment figure and sequence alignment figure for %s"%(rootname)):
            return 1

        # copy the pdf figure generated by latex to a tmp file (a hack for
        # the PDFcrop)
        tmpoutfile = "tt1.pdf"
        shutil.copy2(outfile, tmpoutfile)

        # crop the merged PDF figure
        if not _run(["pdfcrop", tmpoutfile]):
            return 1
        outfile_crop = "tt1-crop.pdf"
        if os.path.exists(outfile_crop):
            final_targetfile = os.path.join(outpath, "%s.seqtopaln.pdf"%(rootname))
            shutil.copy2(outfile_crop, final_targetfile)
            if verbose:
                print("Copy the result to final target %s"%(os.path.join(outpath, outfile)))
        return 0
    finally:
        # Restore the caller's working directory on every exit path; the
        # early "return 1" branches used to leave the process in tmpdir.
        os.chdir(cwd)
sys.exit(1) if topoWithDGScoreFile == "" and dgscanFile == "": print >> sys.stderr, "Error! Either topoWithDGScoreFile or dgscanFile should be set." sys.exit(1) if topoWithDGScoreFile != "" and dgscanFile != "": print >> sys.stderr, "Error! Only one of the topoWithDGScoreFile and dgscanFile can be set." sys.exit(1) fpout = sys.stdout if outFile != "": fpout = open(outFile, "w") try: gapopenList = [] topoWithDGScoreList = [] (idListSeq, annotationListSeq, seqList) = myfunc.ReadFasta(fastaFile) if topoWithDGScoreFile != "": (topoWithDGScoreList, indexID) = ReadTopoWithDGScore(topoWithDGScoreFile) gapopenList = GetGapOpenValues(topoWithDGScoreList) if not (len(gapopenList) == len(idListSeq) and len(gapopenList) == len(topoWithDGScoreList)): print >> sys.stderr, "length mismatch" print >> sys.stderr, "len(gapopenList)=", len(gapopenList) print >> sys.stderr, "len(idListSeq)=", len(idListSeq) print >> sys.stderr, "len(topoWithDGScoreList)=", len( topoWithDGScoreList) sys.exit(1) elif dgscanFile != "": (dgscanList, indexID) = ReadDGScan(dgscanFile) gapopenList = GetGapOpenValuesFromDGScan(dgscanList)
def RunJob(infile, outpath, tmpdir, email, jobid, g_params): #{{{
    """Run the prediction workflow for every sequence of a submitted job.

    Visible steps:
      1. Create the result folder <outpath>/<jobid> and the temporary
         folder <tmpdir>/<jobid>; failures are appended to
         <outpath>/runjob.err.
      2. Read infile block by block.  Each sequence gets the index cnt and
         the result folder seq_<cnt>.  Unless g_params['isForceRun'] is set,
         a cached result is looked up under path_md5cache by the MD5 hex
         digest of the sequence; when found, seq_<cnt> becomes a symlink to
         it and the sequence is logged as "cached" in finished_seqs.txt.
      3. All other sequences are written to query.torun.fa, script_scampi
         is run on that file, and the sequences are sorted by the number of
         TM helices counted in its output, in descending order.
      4. Each of these sequences is fed alone to runscript.  The result is
         moved from the temporary seq_0 folder to seq_<index>, logged as
         "newrun", written as a text result file and, when
         g_params['base_www_url'] contains "topcons.net", linked into the
         MD5 cache.

    Args:
        infile: FASTA file with the query sequences.
        outpath: job folder; receives the tag/log files and the result folder.
        tmpdir: folder for temporary files.
        email: not used in the part of the function shown here.
        jobid: job identifier, used as the name of the result folder.
        g_params: dict of global parameters; reads 'isForceRun' and
            'base_www_url', appends to the lists 'runjob_log' and
            'runjob_err'.

    The names path_md5cache, script_scampi, runscript, blastdir and blastdb
    are not defined in this function (presumably module-level settings).

    There is no explicit return statement in the visible body.
    """
    all_begin_time = time.time()
    rootname = os.path.basename(os.path.splitext(infile)[0])
    starttagfile = "%s/runjob.start" % (outpath)
    runjob_errfile = "%s/runjob.err" % (outpath)
    runjob_logfile = "%s/runjob.log" % (outpath)
    finishtagfile = "%s/runjob.finish" % (outpath)
    rmsg = ""
    resultpathname = jobid
    outpath_result = "%s/%s" % (outpath, resultpathname)
    tarball = "%s.tar.gz" % (resultpathname)
    zipfile = "%s.zip" % (resultpathname)
    tarball_fullpath = "%s.tar.gz" % (outpath_result)
    zipfile_fullpath = "%s.zip" % (outpath_result)
    outfile = "%s/%s/Topcons/topcons.top" % (outpath_result, "seq_%d" % (0))
    resultfile_text = "%s/%s" % (outpath_result, "query.result.txt")
    mapfile = "%s/seqid_index_map.txt" % (outpath_result)
    finished_seq_file = "%s/finished_seqs.txt" % (outpath_result)
    tmp_outpath_result = "%s/%s" % (tmpdir, resultpathname)
    isOK = True
    try:
        os.makedirs(tmp_outpath_result)
        isOK = True
    except OSError:
        msg = "Failed to create folder %s" % (tmp_outpath_result)
        myfunc.WriteFile(msg + "\n", runjob_errfile, "a")
        isOK = False
        pass
    # NOTE(review): a success here resets isOK to True even if creating the
    # temporary folder above failed.
    try:
        os.makedirs(outpath_result)
        isOK = True
    except OSError:
        msg = "Failed to create folder %s" % (outpath_result)
        myfunc.WriteFile(msg + "\n", runjob_errfile, "a")
        isOK = False
        pass
    if isOK:
        # create (or truncate) the list of finished sequences
        try:
            open(finished_seq_file, 'w').close()
        except:
            pass
        #first getting result from caches
        # ==================================
        maplist = []
        maplist_simple = []
        toRunDict = {}  # origIndex -> [seq, numTM, description]
        hdl = myfunc.ReadFastaByBlock(infile, method_seqid=0, method_seq=0)
        if hdl.failure:
            isOK = False
        else:
            datetime = time.strftime("%Y-%m-%d %H:%M:%S")
            rt_msg = myfunc.WriteFile(datetime, starttagfile)
            recordList = hdl.readseq()
            cnt = 0
            # NOTE(review): origpath is saved but the visible code never
            # changes back to it after the os.chdir() calls below.
            origpath = os.getcwd()
            while recordList != None:
                for rd in recordList:
                    isSkip = False
                    # temp outpath for the sequence is always seq_0, and I feed
                    # only one seq a time to the workflow
                    tmp_outpath_this_seq = "%s/%s" % (tmp_outpath_result,
                                                      "seq_%d" % 0)
                    outpath_this_seq = "%s/%s" % (outpath_result,
                                                  "seq_%d" % cnt)
                    subfoldername_this_seq = "seq_%d" % (cnt)
                    if os.path.exists(tmp_outpath_this_seq):
                        try:
                            shutil.rmtree(tmp_outpath_this_seq)
                        except OSError:
                            pass
                    maplist.append(
                        "%s\t%d\t%s\t%s" %
                        ("seq_%d" % cnt, len(rd.seq), rd.description, rd.seq))
                    maplist_simple.append(
                        "%s\t%d\t%s" %
                        ("seq_%d" % cnt, len(rd.seq), rd.description))
                    if not g_params['isForceRun']:
                        # cache entries live in <path_md5cache>/<first two
                        # hex digits>/<md5 of the sequence>
                        md5_key = hashlib.md5(rd.seq).hexdigest()
                        subfoldername = md5_key[:2]
                        md5_link = "%s/%s/%s" % (path_md5cache, subfoldername,
                                                 md5_key)
                        if os.path.exists(md5_link):
                            # create a symlink to the cache
                            rela_path = os.path.relpath(
                                md5_link, outpath_result)  #relative path
                            # the link target is relative, so the link is
                            # created from inside the result folder
                            os.chdir(outpath_result)
                            os.symlink(rela_path, subfoldername_this_seq)
                            if os.path.exists(outpath_this_seq):
                                runtime = 0.0  #in seconds
                                topfile = "%s/%s/topcons.top" % (
                                    outpath_this_seq, "Topcons")
                                top = myfunc.ReadFile(topfile).strip()
                                numTM = myfunc.CountTM(top)
                                posSP = myfunc.GetSPPosition(top)
                                if len(posSP) > 0:
                                    isHasSP = True
                                else:
                                    isHasSP = False
                                info_finish = [
                                    "seq_%d" % cnt,
                                    str(len(rd.seq)),
                                    str(numTM),
                                    str(isHasSP), "cached",
                                    str(runtime), rd.description
                                ]
                                myfunc.WriteFile("\t".join(info_finish) + "\n",
                                                 finished_seq_file,
                                                 "a",
                                                 isFlush=True)
                                isSkip = True
                    if not isSkip:
                        # first try to delete the outfolder if exists
                        if os.path.exists(outpath_this_seq):
                            try:
                                shutil.rmtree(outpath_this_seq)
                            except OSError:
                                pass
                        origIndex = cnt
                        numTM = 0
                        toRunDict[origIndex] = [rd.seq, numTM, rd.description
                                                ]  #init value for numTM is 0
                    cnt += 1
                recordList = hdl.readseq()
            hdl.close()
        myfunc.WriteFile("\n".join(maplist_simple) + "\n", mapfile)
        # run scampi single to estimate the number of TM helices and then run
        # the query sequences in the descending order of numTM
        torun_all_seqfile = "%s/%s" % (tmp_outpath_result, "query.torun.fa")
        dumplist = []
        for key in toRunDict:
            # despite its name, top holds the amino acid sequence here
            top = toRunDict[key][0]
            dumplist.append(">%s\n%s" % (str(key), top))
        myfunc.WriteFile("\n".join(dumplist) + "\n", torun_all_seqfile, "w")
        del dumplist
        topfile_scampiseq = "%s/%s" % (tmp_outpath_result,
                                       "query.torun.fa.topo")
        if os.path.exists(torun_all_seqfile):
            # run scampi to estimate the number of TM helices
            cmd = [
                script_scampi, torun_all_seqfile, "-outpath",
                tmp_outpath_result
            ]
            try:
                rmsg = subprocess.check_output(cmd)
            except subprocess.CalledProcessError, e:
                g_params['runjob_err'].append(str(e) + "\n")
                pass
        if os.path.exists(topfile_scampiseq):
            (idlist_scampi, annolist_scampi,
             toplist_scampi) = myfunc.ReadFasta(topfile_scampiseq)
            for jj in xrange(len(idlist_scampi)):
                numTM = myfunc.CountTM(toplist_scampi[jj])
                # the record IDs are the original indices written above;
                # anything that does not map back to toRunDict is ignored
                try:
                    toRunDict[int(idlist_scampi[jj])][1] = numTM
                except (KeyError, ValueError, TypeError):
                    pass
        sortedlist = sorted(toRunDict.items(),
                            key=lambda x: x[1][1],
                            reverse=True)
        #format of sortedlist [(origIndex: [seq, numTM, description]), ...]
        # submit sequences one by one to the workflow according to orders in
        # sortedlist
        for item in sortedlist:
            # g_params['runjob_log'].append("tmpdir = %s"%(tmpdir))
            #cmd = [script_getseqlen, infile, "-o", tmp_outfile , "-printid"]
            origIndex = item[0]
            seq = item[1][0]
            description = item[1][2]
            outpath_this_seq = "%s/%s" % (outpath_result,
                                          "seq_%d" % origIndex)
            tmp_outpath_this_seq = "%s/%s" % (tmp_outpath_result,
                                              "seq_%d" % (0))
            if os.path.exists(tmp_outpath_this_seq):
                try:
                    shutil.rmtree(tmp_outpath_this_seq)
                except OSError:
                    pass
            seqfile_this_seq = "%s/%s" % (tmp_outpath_result,
                                          "query_%d.fa" % (origIndex))
            seqcontent = ">%d\n%s\n" % (origIndex, seq)
            myfunc.WriteFile(seqcontent, seqfile_this_seq, "w")
            if not os.path.exists(seqfile_this_seq):
                g_params['runjob_err'].append(
                    "failed to generate seq index %d" % (origIndex))
                continue
            cmd = [
                runscript, seqfile_this_seq, tmp_outpath_result, blastdir,
                blastdb
            ]
            g_params['runjob_log'].append(" ".join(cmd))
            begin_time = time.time()
            try:
                rmsg = subprocess.check_output(cmd)
                g_params['runjob_log'].append("workflow:\n" + rmsg + "\n")
            except subprocess.CalledProcessError, e:
                g_params['runjob_err'].append(str(e) + "\n")
                # NOTE(review): check_output raised, so rmsg still holds the
                # output of an earlier call (or ""), not of this command.
                g_params['runjob_err'].append(rmsg + "\n")
                pass
            #suqoutfilelist = glob.glob("%s/*.sh.*.out"%(tmpdir))
            #if len(suqoutfilelist)>0:
            #    suqoutfile = suqoutfilelist[0]
            #g_params['runjob_err'].append(myfunc.ReadFile(suqoutfile))
            end_time = time.time()
            runtime_in_sec = end_time - begin_time
            if os.path.exists(tmp_outpath_this_seq):
                # move the result from the temporary seq_0 folder to its
                # final location seq_<origIndex>
                cmd = ["mv", "-f", tmp_outpath_this_seq, outpath_this_seq]
                isCmdSuccess = False
                try:
                    subprocess.check_output(cmd)
                    isCmdSuccess = True
                except subprocess.CalledProcessError, e:
                    msg = "Failed to run prediction for sequence No. %d\n" % (
                        origIndex)
                    g_params['runjob_err'].append(msg)
                    g_params['runjob_err'].append(str(e) + "\n")
                    pass
                timefile = "%s/time.txt" % (tmp_outpath_result)
                targetfile = "%s/time.txt" % (outpath_this_seq)
                if os.path.exists(timefile) and os.path.exists(
                        outpath_this_seq):
                    try:
                        shutil.move(timefile, targetfile)
                    except:
                        g_params['runjob_err'].append(
                            "Failed to move %s/time.txt" %
                            (tmp_outpath_result) + "\n")
                        pass
                if isCmdSuccess:
                    runtime = runtime_in_sec  #in seconds
                    topfile = "%s/%s/topcons.top" % (outpath_this_seq,
                                                     "Topcons")
                    top = myfunc.ReadFile(topfile).strip()
                    numTM = myfunc.CountTM(top)
                    posSP = myfunc.GetSPPosition(top)
                    if len(posSP) > 0:
                        isHasSP = True
                    else:
                        isHasSP = False
                    info_finish = [
                        "seq_%d" % origIndex,
                        str(len(seq)),
                        str(numTM),
                        str(isHasSP), "newrun",
                        str(runtime), description
                    ]
                    myfunc.WriteFile("\t".join(info_finish) + "\n",
                                     finished_seq_file,
                                     "a",
                                     isFlush=True)
                    # now write the text output for this seq
                    info_this_seq = "%s\t%d\t%s\t%s" % (
                        "seq_%d" % origIndex, len(seq), description, seq)
                    resultfile_text_this_seq = "%s/%s" % (outpath_this_seq,
                                                          "query.result.txt")
                    myfunc.WriteTOPCONSTextResultFile(resultfile_text_this_seq,
                                                      outpath_result,
                                                      [info_this_seq],
                                                      runtime_in_sec,
                                                      g_params['base_www_url'])
                    # create or update the md5 cache
                    # create cache only on the front-end
                    if g_params['base_www_url'].find("topcons.net") != -1:
                        md5_key = hashlib.md5(seq).hexdigest()
                        subfoldername = md5_key[:2]
                        md5_subfolder = "%s/%s" % (path_md5cache,
                                                   subfoldername)
                        md5_link = "%s/%s/%s" % (path_md5cache, subfoldername,
                                                 md5_key)
                        # replace an existing cache link with the new result
                        if os.path.exists(md5_link):
                            try:
                                os.unlink(md5_link)
                            except:
                                pass
                        subfolder_md5 = "%s/%s" % (path_md5cache,
                                                   subfoldername)
                        if not os.path.exists(subfolder_md5):
                            try:
                                os.makedirs(subfolder_md5)
                            except:
                                pass
                        rela_path = os.path.relpath(
                            outpath_this_seq, md5_subfolder)  #relative path
                        # best effort: a failure to create the cache link is
                        # ignored
                        try:
                            os.chdir(md5_subfolder)
                            os.symlink(rela_path, md5_key)
                        except:
                            pass
def main(g_params):
    """Command line entry point: output MSA records in a given order.

    Options (single or double dash): -o OUTFILE, -orderlist FILE,
    -msafile FILE (may also be given as a positional argument),
    -of/-outformat fasta|anno and -h/--help.

    The IDs returned by ReadOrderList are looked up in the FASTA records of
    the MSA file and written in that order: the annotation line only for
    the "anno" format, annotation line plus sequence for "fasta".  IDs that
    are missing from the MSA file are reported on stderr.

    Args:
        g_params: dict of global parameters (not used by this function).

    Returns:
        0 on success, 1 on usage errors (help requested, unknown option,
        option without value, bad outformat, missing orderlist or MSA file).
    """
    numArgv = len(sys.argv)
    if numArgv < 2:
        PrintHelp()
        return 1

    outFile = ""
    orderlistfile = ""
    msafile = ""
    outformat = "fasta" # fasta or anno

    # options that take a value
    valueOptions = ["-o", "--o", "-orderlist", "--orderlist",
                    "-msafile", "--msafile",
                    "-of", "--of", "-outformat", "--outformat"]

    i = 1
    isNonOptionArg = False
    while i < numArgv:
        arg = sys.argv[i]
        if isNonOptionArg == True:
            msafile = arg
            isNonOptionArg = False
            i = i + 1
        elif arg == "--":
            isNonOptionArg = True
            i = i + 1
        elif arg.startswith("-"):
            if arg == "-h" or arg == "--help":
                PrintHelp()
                return 1
            elif arg in valueOptions:
                # Report a missing value instead of failing with IndexError.
                if i + 1 >= numArgv:
                    print("Error! Option %s requires an argument" % arg,
                          file=sys.stderr)
                    return 1
                value = sys.argv[i+1]
                if arg in ["-o", "--o"]:
                    outFile = value
                elif arg in ["-orderlist", "--orderlist"]:
                    orderlistfile = value
                elif arg in ["-msafile", "--msafile"]:
                    msafile = value
                else:
                    outformat = value.lower()
                i = i + 2
            else:
                print(("Error! Wrong argument:%s" % arg), file=sys.stderr)
                return 1
        else:
            msafile = arg
            i += 1

    if not outformat in ["anno", "fasta"]:
        print("Unrecognized outformat \"%s\","%(outformat)
              + " should be either \"anno\" or \"fasta\".", file=sys.stderr)
        return 1
    if orderlistfile == "":
        print("orderlist file not set. Exit", file=sys.stderr)
        return 1
    if msafile == "":
        print("msafile not set. Exit", file=sys.stderr)
        # The message says "Exit": stop here, like the orderlist check
        # above, instead of going on to read an empty file name.
        return 1

    orderList = ReadOrderList(orderlistfile)
    (idList, annoList, seqList) = myfunc.ReadFasta(msafile)
    if len(orderList) > 0 and len(idList) > 0:
        fpout = myfunc.myopen(outFile, sys.stdout, "w", False)
        numSeq = len(idList)
        annoDict = {}
        seqDict = {}
        for i in range(numSeq):
            annoDict[idList[i]] = annoList[i]
        # sequences are only needed for the fasta output
        if outformat != "anno":
            for i in range(numSeq):
                seqDict[idList[i]] = seqList[i]

        for sid in orderList:
            if sid in annoDict:
                fpout.write(">%s\n"%annoDict[sid])
                if outformat != "anno":
                    fpout.write("%s\n"%seqDict[sid])
            else:
                print("seqid %s not in msafile %s"%(sid, msafile),
                      file=sys.stderr)
        myfunc.myclose(fpout)
    return 0
def DumpPredictionTOPCONS2(seqfile, path_result, outfile, isWriteDG, isWriteRel): #{{{
    """Dump the per-method topology predictions into a plain-text report.

    The records of ``seqfile`` are read with ``myfunc.ReadFasta``. For the
    record at 0-based index ``i`` the prediction files are looked up under
    ``<path_result>/seq_<i>/``:

    * ``Topcons/topcons.top``    for TOPCONS
    * ``philius/query.top``      for Philius
    * ``SCAMPI_MSA/query.top``   for SCAMPI
    * ``<method>/query.top``     for every other method
    * ``dg.txt``                 Delta-G values (only if ``isWriteDG``)
    * ``Topcons/reliability.txt`` reliability (only if ``isWriteRel``)

    A missing or empty topology file yields a fixed "no topology" text.
    Besides the report in ``outfile``, ``<outfile>.fa`` receives one FASTA
    record per sequence holding the TOPCONS entry (which is that same
    placeholder text when no TOPCONS topology was found).

    Args:
        seqfile: FASTA file with the query sequences.
        path_result: directory containing the ``seq_<i>`` sub-directories.
        outfile: path of the text report; ``outfile + ".fa"`` is written too.
        isWriteDG: when true, append the lines of ``dg.txt`` that start
            with a digit.
        isWriteRel: when true, append the content of ``reliability.txt``.

    Returns:
        0 on success, 1 if either output file cannot be opened (a message
        is written to stderr).
    """
    (seqidlist, seqannolist, seqlist) = myfunc.ReadFasta(seqfile)
    outfile_fa = "%s.fa" % (outfile)

    methodlist = [
        'TOPCONS', 'OCTOPUS', 'Philius', 'PolyPhobius', 'SCAMPI',
        'SPOCTOPUS', 'Homology'
    ]
    # Methods whose topology file does not live at "<method>/query.top".
    special_topfile = {
        "TOPCONS": "Topcons/topcons.top",
        "Philius": "philius/query.top",
        "SCAMPI": "SCAMPI_MSA/query.top",
    }
    no_topo_text = "***No topology could be produced with this method***"
    separator = "##############################################################################"

    # write() with an explicit trailing "\n" emits exactly what the former
    # "print >> file, text" statements did and runs on Python 2 and 3.
    fpout = None
    fpout_fa = None
    try:
        try:
            fpout = open(outfile, "w")
        except IOError:
            sys.stderr.write("Failed to write to file \"%s\"\n" % (outfile))
            return 1
        try:
            fpout_fa = open(outfile_fa, "w")
        except IOError:
            sys.stderr.write("Failed to write to file \"%s\"\n" % (outfile_fa))
            return 1

        for idx in range(len(seqidlist)):
            subdir = "%s/seq_%d" % (path_result, idx)
            seq = seqlist[idx]
            desp = seqannolist[idx]

            fpout.write("Sequence number: %d\n" % (idx + 1))
            fpout.write("Sequence name: %s\n" % (desp))
            fpout.write("Sequence length: %d aa.\n" % (len(seq)))
            fpout.write("Sequence:\n%s\n\n\n" % (seq))

            topo_consensus = ""
            for method in methodlist:
                relpath = special_topfile.get(method, "%s/query.top" % (method))
                topfile = "%s/%s" % (subdir, relpath)

                seqid = ""
                top = ""
                if os.path.exists(topfile):
                    (seqid, _seqanno, top) = myfunc.ReadSingleFasta(topfile)
                if top == "":
                    top = no_topo_text

                if method == "TOPCONS":
                    topo_consensus = top

                if method == "Homology":
                    # Label the homology entry with the ID stored in its
                    # topology file when there is one.
                    showtext_homo = seqid if seqid != "" else method
                    fpout.write("%s:\n%s\n\n\n" % (showtext_homo, top))
                else:
                    fpout.write("%s predicted topology:\n%s\n\n\n" % (method, top))

            if isWriteDG:
                dgfile = "%s/dg.txt" % (subdir)
                dg_content = ""
                if os.path.exists(dgfile):
                    dg_content = myfunc.ReadFile(dgfile)
                # Keep only the data rows, i.e. lines starting with a digit.
                dglines = [line for line in dg_content.split("\n")
                           if line and line[0].isdigit()]
                if len(dglines) > 0:
                    fpout.write("\nPredicted Delta-G-values (kcal/mol) "
                                "(left column=sequence position; right column=Delta-G)\n\n")
                    fpout.write("\n".join(dglines) + "\n")

            if isWriteRel:
                reliability_file = "%s/Topcons/reliability.txt" % (subdir)
                reliability = ""
                if os.path.exists(reliability_file):
                    reliability = myfunc.ReadFile(reliability_file)
                if reliability != "":
                    fpout.write("\nPredicted TOPCONS reliability (left "
                                "column=sequence position; right column=reliability)\n\n")
                    fpout.write(reliability + "\n")

            fpout.write(separator + "\n")

            # write the consensus prediction in FASTA format
            fpout_fa.write(">%s\n" % (desp))
            fpout_fa.write(topo_consensus + "\n")

        return 0
    finally:
        # Close whatever was opened, also on the early-return and error
        # paths; a failing close() stays best-effort as before.
        for fp in (fpout, fpout_fa):
            if fp is not None:
                try:
                    fp.close()
                except IOError:
                    pass