def GetTopoDict(topoDB, idList):
    """Collect the sequence field of the records found for the given IDs.

    Args:
        topoDB: database name passed to ``myfunc.MyDB``.
        idList: iterable of sequence IDs to look up.

    Returns:
        A dict mapping each ID that has a (truthy) record to the third field
        returned by ``myfunc.ExtractFromSeqWithAnno`` for that record
        (presumably the topology string -- confirm against myfunc). IDs
        without a record are omitted. An empty dict is returned when the
        database cannot be opened.
    """
    hdl = myfunc.MyDB(topoDB)
    if hdl.failure:
        return {}
    dt = {}
    try:
        for seqid in idList:
            data = hdl.GetRecord(seqid)
            if data:
                (_tmp_id, _tmp_anno, tmp_seq) = myfunc.ExtractFromSeqWithAnno(
                    data)
                dt[seqid] = tmp_seq
    finally:
        # Release the database handle even if a lookup or parse raises.
        hdl.close()
    return dt
def createHitDB(pfamList, prot_name, work_dir):
    """Write the records of the given Pfam IDs to a temporary hits file.

    Records are fetched from the MyDB database named by the module-level
    ``cddseqdb`` and written to ``work_dir + prot_name + ".hits.db.temp"``.
    ``work_dir`` is concatenated as-is, so it must already end with a path
    separator. ``my_uniqueseq.py`` is then run on the written file.

    Args:
        pfamList: iterable of IDs to look up in the database.
        prot_name: base name of the output file.
        work_dir: directory prefix of the output file.

    Returns:
        1 when the database cannot be opened or the helper script cannot be
        launched; None otherwise.
    """
    # Function-scope import: this block cannot rely on a module-level one.
    import subprocess

    hdl = myfunc.MyDB(cddseqdb)
    if hdl.failure:
        print("Error")
        return 1
    outfile_seqdb = work_dir + prot_name + ".hits.db.temp"
    try:
        with open(outfile_seqdb, "w") as outFile:
            for pfamid in pfamList:
                record = hdl.GetRecord(pfamid)
                if record:
                    outFile.write(record)
    finally:
        # Release the database handle even if writing fails.
        hdl.close()
    try:
        # Argument list instead of a shell string: the path is passed
        # verbatim, so spaces or shell metacharacters cannot break the call.
        subprocess.call(["python", "my_uniqueseq.py", outfile_seqdb])
    except OSError as err:
        print("Failed to run my_uniqueseq.py: %s" % (err,))
        return 1
def createHitDB_mydb(pfamList, prot_name, work_dir, pfamseqdb):  # {{{
    """Create the blast seqdb from pfam_scan hits, using the MyDB version of
    pfamfullseqdb.

    The records found for the IDs in ``pfamList`` are written to
    ``<work_dir>/<prot_name>.hits.db.temp`` and ``my_uniqueseq.py`` is then
    run on that file.

    Args:
        pfamList: iterable of IDs to look up in the database.
        prot_name: base name of the output file.
        work_dir: directory in which the output file is created.
        pfamseqdb: database name passed to ``myfunc.MyDB``.

    Returns:
        1 when the database cannot be opened or the helper script cannot be
        launched; None otherwise.
    """
    # Function-scope import: this block cannot rely on a module-level one.
    import subprocess

    outfile_seqdb = os.path.join(work_dir, prot_name + ".hits.db.temp")
    hdl = myfunc.MyDB(pfamseqdb)
    if hdl.failure:
        print("Failed to open pfamseqdb %s with MyDB()" % (pfamseqdb))
        return 1
    try:
        with open(outfile_seqdb, "w") as outFile:
            for pfamid in pfamList:
                record = hdl.GetRecord(pfamid)
                if record:
                    outFile.write(record)
    finally:
        # Release the database handle even if writing fails.
        hdl.close()
    try:
        # Argument list instead of a shell string: the path is passed
        # verbatim, so spaces or shell metacharacters cannot break the call.
        subprocess.call(["python", "my_uniqueseq.py", outfile_seqdb])
    except OSError as err:
        print("Failed to run my_uniqueseq.py: %s" % (err,))
        return 1
def MatchMSATopo_using_topodb(msafile, topodb, isIgnoreBadseq,  #{{{
                              method_match, outfile):
    """Match every sequence of an MSA file to its topology from a database.

    For each record read from ``msafile`` the topology record with the same
    ``seqid`` is fetched from ``topodb``. When none is found a warning is
    written to stderr and an empty topology is used. The result of
    ``MatchSeqToTopo(rd.seq, topo, method_match)`` is written in FASTA form
    (``>description`` line, then the matched topology).

    Args:
        msafile: alignment file read with ``myfunc.ReadFastaByBlock``.
        topodb: topology database name passed to ``myfunc.MyDB``.
        isIgnoreBadseq: when true, results equal to ``"BADSEQ"`` are not
            written.
        method_match: forwarded unchanged to ``MatchSeqToTopo``.
        outfile: output file name handed to ``myfunc.myopen`` with
            ``sys.stdout`` as the default stream.

    Returns:
        1 when the topology database or the MSA file cannot be opened,
        0 otherwise.
    """
    hdl_topo = myfunc.MyDB(topodb)
    if hdl_topo.failure:
        return 1
    hdl = myfunc.ReadFastaByBlock(msafile)
    if hdl.failure:
        # The topology DB was opened successfully above; do not leak it.
        hdl_topo.close()
        return 1
    fpout = myfunc.myopen(outfile, sys.stdout, "w", False)
    try:
        recordList = hdl.readseq()
        # readseq() signals the end of input with None.
        while recordList is not None:
            for rd in recordList:
                topowithanno = hdl_topo.GetRecord(rd.seqid)
                if topowithanno is not None:
                    (_topoid, _topoanno,
                     topo) = myfunc.ExtractFromSeqWithAnno(topowithanno)
                else:
                    print("topo not found for ID %s" % (rd.seqid),
                          file=sys.stderr)
                    topo = ""
                matchedtopo = MatchSeqToTopo(rd.seq, topo, method_match)
                if not (matchedtopo == "BADSEQ" and isIgnoreBadseq):
                    print(">%s" % (rd.description), file=fpout)
                    print("%s" % (matchedtopo), file=fpout)
            recordList = hdl.readseq()
    finally:
        # Close everything even when matching or writing raises.
        myfunc.myclose(fpout)
        hdl.close()
        hdl_topo.close()
    return 0
except subprocess.CalledProcessError, e: print e return 1 if myfunc.checkfile(pfamid2seqidFile, "pfamid2seqidFile") != 0: return 1 if myfunc.checkfile("%s0.db" % topodb, "topodb") != 0: return 1 if myfunc.checkfile("%s0.db" % seqdb, "seqdb") != 0: return 1 pfamid2seqidDict = myfunc.ReadFam2SeqidMap(pfamid2seqidFile) hdl_topo = myfunc.MyDB(topodb) if not hdl_topo.failure: idSet_topo = set(hdl_topo.indexedIDList) else: idSet_topo = set([]) print >> sys.stderr, "Failed to open topology database %s" % (topodb) return 1 hdl_seq = myfunc.MyDB(seqdb) if hdl_seq.failure: print >> sys.stderr, "Failed to open sequence database %s" % (seqdb) return 1 GetTMProList_per_family(pfamid2seqidDict, idSet_topo, hdl_seq, hdl_topo, outpath)
def main(g_params): #{{{ argv = sys.argv numArgv = len(argv) if numArgv < 2: PrintHelp() return 1 SPE_PAIR_LIST = [(2, 1), (2, 4), (2, 6), (2, 8), (3, 6), (3, 7), (4, 6), (4, 8), (4, 10), (5, 7), (5, 10), (6, 8), (6, 10), (6, 12), (7, 14), (8, 10), (8, 12), (10, 12), (10, 13), (11, 13), (12, 14)] outfile = "" infile = "" pfamDefFile = "%s/data/pfam/pfam26.0/Pfam-A.clans.tsv" % (DATADIR3) signalpFile = "%s/wk/MPTopo/pfamAna_refpro/pred_signalp/refpro20120604-celluar.selmaxlength-m1.nr100.signalp_list" % ( DATADIR3) #seqid2clanidMapFile = "%s/wk/MPTopo/pfamAna_refpro/pfammap_from_uniprot/refpro20120604-celluar.selmaxlength-m1.nr100.filter.fragmented.seqid2clanid"%(DATADIR3) #seqid2pfamidMapFile = "%s/wk/MPTopo/pfamAna_refpro/pfammap_from_uniprot/refpro20120604-celluar.selmaxlength-m1.nr100.filter.fragmented.seqid2pfamid"%(DATADIR3) seqid2clanidMapFile = "" seqid2pfamidMapFile = "" tm_pfamidListFile = "" tm_clanidListFile = "" pfamid2seqidMapFile = "" clanid2seqidMapFile = "" dbname_predTM = "" pairlistwithpfamidFile = "" pfamtype = "" pairListFile = "" #classList_TableNumTMHeatMap = ["ALL", "RMSP"] classList_TableNumTMHeatMap = ["ALL"] i = 1 isNonOptionArg = False while i < numArgv: if isNonOptionArg == True: infile = argv[i] isNonOptionArg = False i += 1 elif argv[i] == "--": isNonOptionArg = True i += 1 elif argv[i][0] == "-": if argv[i] in ["-h", "--help"]: PrintHelp() return 1 elif argv[i] in ["-o", "--o", "-outfile"]: (outfile, i) = myfunc.my_getopt_str(argv, i) elif argv[i] in ["-outpath", "--outpath"]: (g_params['outpath'], i) = myfunc.my_getopt_str(argv, i) elif argv[i] in ["-l", "--l"]: (fileListFile, i) = myfunc.my_getopt_str(argv, i) elif argv[i] in ["-pfamdef", "--pfamdef"]: (pfamDefFile, i) = myfunc.my_getopt_str(argv, i) elif argv[i] in ["-signalp", "--signalp"]: (signalpFile, i) = myfunc.my_getopt_str(argv, i) elif argv[i] in ["-mp", "--mp"]: g_params[ 'pairwise_comparison_method'], i = myfunc.my_getopt_int( argv, i) elif argv[i] in ["-mindiffpair", 
"--mindiffpair"]: g_params['mindiffpair'], i = myfunc.my_getopt_int(argv, i) elif argv[i] in ["-pfamtype", "--pfamtype"]: pfamtype, i = myfunc.my_getopt_str(argv, i) elif argv[i] in ["-clanidlist", "--clanidlist"]: (tm_clanidListFile, i) = myfunc.my_getopt_str(argv, i) elif argv[i] in ["-pfamidlist", "--pfamidlist"]: (tm_pfamidListFile, i) = myfunc.my_getopt_str(argv, i) elif argv[i] in ["-seqid2clanid", "--seqid2clanid"]: (seqid2clanidMapFile, i) = myfunc.my_getopt_str(argv, i) elif argv[i] in ["-seqid2pfamid", "--seqid2pfamid"]: (seqid2pfamidMapFile, i) = myfunc.my_getopt_str(argv, i) elif argv[i] in ["-pfamid2seqid", "--pfamid2seqid"]: (pfamid2seqidMapFile, i) = myfunc.my_getopt_str(argv, i) elif argv[i] in ["-clanid2seqid", "--clanid2seqid"]: (clanid2seqidMapFile, i) = myfunc.my_getopt_str(argv, i) elif argv[i] in ["-pairlistwithpfamid", "--pairlistwithpfamid"]: (pairlistwithpfamidFile, i) = myfunc.my_getopt_str(argv, i) elif argv[i] in ["-predTMdbname", "--predTMdbname"]: (dbname_predTM, i) = myfunc.my_getopt_str(argv, i) elif argv[i] in ["-pairlist", "--pairlist"]: (pairListFile, i) = myfunc.my_getopt_str(argv, i) elif argv[i] in ["-winsize", "--winsize"]: (g_params['winsize'], i) = myfunc.my_getopt_int(argv, i) elif argv[i] in ["-outname", "--outname"]: (g_params['outname'], i) = myfunc.my_getopt_str(argv, i) elif argv[i] in ["-q", "--q"]: g_params['isQuiet'] = True i += 1 elif argv[i] in ["-prokar", "--prokar"]: g_params['isOnlyAnaProkar'] = True i += 1 elif argv[i] in ["-eukar", "--eukar"]: g_params['isOnlyAnaEukar'] = True i += 1 else: print >> sys.stderr, "Error! 
Wrong argument:", argv[i] return 1 else: infile = argv[i] i += 1 if myfunc.checkfile( infile, "%s (line %d): infile" % (__file__, inspect.currentframe().f_lineno)) != 0: return 1 dirpath = myfunc.my_dirname(infile) # try to obtain Pfam family tag tag = "" if pfamtype != "": if pfamtype.upper().find("FAM") != -1: tag = ".Family" elif pfamtype.upper().find("DOM") != -1: tag = ".Domain" elif pfamtype.upper().find("REP") != -1: tag = ".Repeat" elif pfamtype.upper().find("MOT") != -1: tag = ".Motif" else: tag = "" else: if infile.find(".Family.") != -1: tag = ".Family" elif infile.find(".Domain.") != -1: tag = ".Domain" elif infile.find(".Repeat.") != -1: tag = ".Repeat" elif infile.find(".Motif.") != -1: tag = ".Motif" else: tag = "" if seqid2clanidMapFile == "": seqid2clanidMapFile = "%s/wk/MPTopo/pfamAna_refpro/pfammap_from_uniprot/Pfam-A-full.seqfrompfamfasta.percentTMpro_scampi.perTM75_nseq20.nr100.filter.fragmented.seqid2clanid" % ( DATADIR3) if myfunc.checkfile( seqid2clanidMapFile, "%s (line %d): seqid2clanidMapFile" % (__file__, inspect.currentframe().f_lineno)): return 1 if seqid2pfamidMapFile == "": seqid2pfamidMapFile = "%s/wk/MPTopo/pfamAna_refpro/pfammap_from_uniprot/Pfam-A-full.seqfrompfamfasta.percentTMpro_scampi.perTM75_nseq20%s.nr100.filter.fragmented.seqid2pfamid" % ( DATADIR3, tag) if myfunc.checkfile( seqid2pfamidMapFile, "%s (line %d): seqid2pfamidMapFile" % (__file__, inspect.currentframe().f_lineno)): return 1 if pfamid2seqidMapFile == "": pfamid2seqidMapFile = "%s/wk/MPTopo/pfamAna_refpro/pfammap_from_uniprot/Pfam-A-full.seqfrompfamfasta.percentTMpro_scampi.perTM75_nseq20.nr100.filter.fragmented.pfamid2seqid" % ( DATADIR3) if myfunc.checkfile( pfamid2seqidMapFile, "%s (line %d): pfamid2seqidMapFile" % (__file__, inspect.currentframe().f_lineno)): return 1 if clanid2seqidMapFile == "": clanid2seqidMapFile = 
"%s/wk/MPTopo/pfamAna_refpro/pfammap_from_uniprot/Pfam-A-full.seqfrompfamfasta.percentTMpro_scampi.perTM75_nseq20%s.nr100.filter.fragmented.clanid2seqid" % ( DATADIR3, tag) if myfunc.checkfile( clanid2seqidMapFile, "%s (line %d): clanid2seqidMapFile" % (__file__, inspect.currentframe().f_lineno)): return 1 if tm_pfamidListFile == "": tm_pfamidListFile = "%s/data/pfam/pfam26.0/Pfam-A-full.seqfrompfamfasta.percentTMpro_scampi.perTM75_nseq20%s.pfamidlist" % ( DATADIR3, tag) if myfunc.checkfile( tm_pfamidListFile, "%s (line %d): tm_pfamidListFile" % (__file__, inspect.currentframe().f_lineno)): return 1 if tm_clanidListFile == "": tm_clanidListFile = "%s/data/pfam/pfam26.0/Pfam-A-full.seqfrompfamfasta.percentTMpro_scampi.perTM75_nseq20.clanidlist" % ( DATADIR3) if myfunc.checkfile( tm_clanidListFile, "%s (line %d): tm_clanidListFile" % (__file__, inspect.currentframe().f_lineno)): return 1 if dbname_predTM == "": dbname_predTM = "%s/wk/MPTopo/pfamAna_refpro/pred_topcons_single_method4/refpro20120604-celluar.selmaxlength-m1.topcons-single_topcons_single.m1.agree-44.RMSP" % ( DATADIR3) if myfunc.checkfile( "%s0.db" % (dbname_predTM), "%s (line %d): dbname_predTM" % (__file__, inspect.currentframe().f_lineno)): return 1 if g_params['isOnlyAnaProkar']: prokarseqidfile = "%s/data/uniprot/reference_proteome/refpro20120604-celluar.selmaxlength-m1.nr100.filter.fragmented.Prokaryota.seqidlist" % ( DATADIR3) g_params['prokarSeqIDSet'] = set(myfunc.ReadIDList(prokarseqidfile)) if len(g_params['prokarSeqIDSet']) < 1: return 1 if g_params['isOnlyAnaEukar']: eukarseqidfile = "%s/data/uniprot/reference_proteome/refpro20120604-celluar.selmaxlength-m1.nr100.filter.fragmented.Eukaryota.seqidlist" % ( DATADIR3) g_params['eukarSeqIDSet'] = set(myfunc.ReadIDList(eukarseqidfile)) if len(g_params['eukarSeqIDSet']) < 1: return 1 if pairlistwithpfamidFile == "": pairlistwithpfamidFile = "%s/../../Pfam-.maxpair100.pairlistwithpfamid" % ( dirpath) if myfunc.checkfile( pairlistwithpfamidFile, "%s 
(line %d): pairlistwithpfamidFile" % (__file__, inspect.currentframe().f_lineno)): return 1 pfamid_2_seqidpair_Dict = ReadPairListWithFamID(pairlistwithpfamidFile) usedPfamIDSet = set( pfamid_2_seqidpair_Dict.keys()) # pfamids used in pair selection if pairListFile != "": li = myfunc.ReadPairList(pairListFile) SPE_PAIR_LIST = [] for tup in li: SPE_PAIR_LIST.append((int(tup[0]), int(tup[1]))) (pfamidDefDict, clanidDefDict) = ReadPfamDefFile(pfamDefFile) signalpDict = lcmp.ReadSignalPDict(signalpFile) seqid2clanidDict = myfunc.ReadFam2SeqidMap(seqid2clanidMapFile) seqid2pfamidDict = myfunc.ReadFam2SeqidMap(seqid2pfamidMapFile) clanid2seqidDict = myfunc.ReadFam2SeqidMap(clanid2seqidMapFile) pfamid2seqidDict = myfunc.ReadFam2SeqidMap(pfamid2seqidMapFile) tm_pfamidList = myfunc.ReadIDList(tm_pfamidListFile) tm_clanidList = myfunc.ReadIDList(tm_clanidListFile) tm_pfamidSet = set(tm_pfamidList) tm_clanidSet = set(tm_clanidList) hdl_predTM = myfunc.MyDB(dbname_predTM) if not hdl_predTM.failure: idSet_TMpro = set(hdl_predTM.indexedIDList) else: idSet_TMpro = set([]) #classList_TableNumTMHeatMap = ["ALL", "RMSP", "RMDUP"] #alignrangeList = ['FULL_ALIGNED', 'all', 'PART_ALIGNED'] alignrangeList = ['FULL_ALIGNED'] if g_params['outpath'] != "" and not os.path.exists(g_params['outpath']): cmd = ["mkdir", "-p", g_params['outpath']] try: subprocess.check_call(cmd) except subprocess.CalledProcessError, e: print e return 1
def main(g_params):  #{{{
    """Write GO information for the requested sequence IDs.

    Arguments are read from ``sys.argv``. IDs come from positional arguments
    and/or the ID list file given with ``-l``. For every ID that has a record
    in the MyDB database named by ``-uniprotdb``, the result of
    ``GetGOInfoFromUniprotData`` is handed to ``WriteGOInfo`` together with
    the output stream selected by ``-o`` (``sys.stdout`` is the default
    passed to ``myfunc.myopen``).

    Args:
        g_params: dict of run-time settings; ``-q`` sets ``isQuiet`` to True.

    Returns:
        0 on success; 1 on a usage error, when ``<uniprotdb>0.db`` fails
        ``myfunc.checkfile``, or when the database cannot be opened.
    """
    argv = sys.argv
    numArgv = len(argv)
    if numArgv < 2:
        PrintHelp()
        return 1

    # -outpath is still accepted on the command line, but its value is not
    # used anywhere in this function.
    outpath = "./"
    outfile = ""
    idListFile = ""
    uniprotDBname = ""
    idList = []

    i = 1
    isNonOptionArg = False
    while i < numArgv:
        if isNonOptionArg:
            # Only the single argument following "--" is forced positional.
            idList.append(argv[i])
            isNonOptionArg = False
            i += 1
        elif argv[i] == "--":
            isNonOptionArg = True
            i += 1
        elif argv[i].startswith("-"):
            # startswith() instead of argv[i][0]: an empty-string argument
            # must not raise IndexError.
            if argv[i] in ["-h", "--help"]:
                PrintHelp()
                return 1
            elif argv[i] in ["-o", "--o", "-outfile"]:
                (outfile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-outpath", "--outpath"]:
                (outpath, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-l", "--l"]:
                (idListFile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-uniprotdb", "--uniprotdb"]:
                (uniprotDBname, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-q", "--q"]:
                g_params['isQuiet'] = True
                i += 1
            else:
                sys.stderr.write("Error! Wrong argument: %s\n" % (argv[i]))
                return 1
        else:
            idList.append(argv[i])
            i += 1

    if idListFile != "":
        idList += myfunc.ReadIDList(idListFile)

    if uniprotDBname == "":
        sys.stderr.write("uniprotdb not set\n")
        return 1
    uniprotdbfile = "%s0.db" % uniprotDBname
    if myfunc.checkfile(uniprotdbfile, "uniprotdbfile") != 0:
        return 1

    # Open the database before the output stream so that a failed database
    # open neither leaks nor creates the output file.
    hdl = myfunc.MyDB(uniprotDBname)
    if hdl.failure:
        return 1
    try:
        fpout = myfunc.myopen(outfile, sys.stdout, "w", False)
        try:
            for seqid in idList:
                data = hdl.GetRecord(seqid)
                if data is not None:
                    goinfo = GetGOInfoFromUniprotData(data)
                    WriteGOInfo(seqid, goinfo, fpout)
        finally:
            myfunc.myclose(fpout)
    finally:
        hdl.close()
    return 0
def main(g_params):#{{{
    """Run ``GetSeqFromMSA`` for every requested (infile, outfile) pair.

    Arguments are read from ``sys.argv``. One input may be given directly
    (positional or ``-i``, with its output set by ``-o``). Further runs are
    read from the list file given with ``-l``: one run per line, the input
    file optionally followed by a TAB and the output file. Lines starting
    with ``#`` and lines without an input file are skipped.

    Args:
        g_params: dict of run-time settings; ``-q`` sets ``isQuiet`` to True.

    Returns:
        0 on success; 1 on a usage error, when no input is set, or when the
        sequence database (``-seqdb``) is unset, missing or cannot be opened.
    """
    argv = sys.argv
    numArgv = len(argv)
    if numArgv < 2:
        PrintHelp()
        return 1

    # Options that take a value, mapped to the setting they fill in.
    valueOptions = {
        "-o": "outfile", "--o": "outfile",
        "-outfile": "outfile", "--outfile": "outfile",
        "-i": "infile", "--i": "infile",
        "-l": "listfile", "--l": "listfile",
        "-seqdb": "seqdb", "--seqdb": "seqdb",
    }
    settings = {"outfile": "", "infile": "", "seqdb": "", "listfile": ""}

    i = 1
    isNonOptionArg = False
    while i < numArgv:
        arg = argv[i]
        if isNonOptionArg:
            # Only the single argument following "--" is forced positional.
            settings["infile"] = arg
            isNonOptionArg = False
            i += 1
        elif arg == "--":
            isNonOptionArg = True
            i += 1
        elif arg.startswith("-"):
            if arg in ["-h", "--help"]:
                PrintHelp()
                return 1
            elif arg in valueOptions:
                # A trailing option without its value used to raise
                # IndexError; report it as a usage error instead.
                if i + 1 >= numArgv:
                    sys.stderr.write(
                        "Error! Option %s requires a value\n" % (arg))
                    return 1
                settings[valueOptions[arg]] = argv[i + 1]
                i += 2
            elif arg == "-q":
                g_params['isQuiet'] = True
                i += 1
            else:
                sys.stderr.write("Error! Wrong argument: %s\n" % (arg))
                return 1
        else:
            settings["infile"] = arg
            i += 1

    infile = settings["infile"]
    outfile = settings["outfile"]
    seqdb = settings["seqdb"]
    listfile = settings["listfile"]

    runList = []  # runList is a list of tuples (infile, outfile)
    if infile != "":
        runList.append((infile, outfile))
    if listfile != "":
        if os.path.exists(listfile):
            try:
                # splitlines() handles \n, \r\n and \r line endings without
                # the "U" open mode, which was removed in Python 3.11.
                with open(listfile, "r") as fpin:
                    lines = fpin.read().splitlines()
            except IOError:
                sys.stderr.write("Failed to read file %s\n" % (listfile))
            else:
                for line in lines:
                    if not line or line[0] == "#":
                        continue
                    strs = line.split("\t")
                    listed_infile = strs[0].strip()
                    if not listed_infile:
                        # Blank lines used to be queued as ("", "") runs.
                        continue
                    listed_outfile = strs[1].strip() if len(strs) > 1 else ""
                    runList.append((listed_infile, listed_outfile))
        else:
            sys.stderr.write("listfile %s does not exist\n" % (listfile))

    numInput = len(runList)
    if numInput < 1:
        sys.stderr.write("No input set. Exit\n")
        return 1

    if seqdb == "":
        sys.stderr.write("seqdb not set.\n")
        return 1
    elif not os.path.exists(seqdb + "0.db"):
        sys.stderr.write("seqdb %s does not exist.\n" % (seqdb))
        return 1
    hdl_seqdb = myfunc.MyDB(seqdb)
    if hdl_seqdb.failure:
        sys.stderr.write("Failed to open seqdb %s\n" % (seqdb))
        return 1

    try:
        for (run_infile, run_outfile) in runList:
            GetSeqFromMSA(run_infile, run_outfile, hdl_seqdb)
    finally:
        # Release the database handle even if a run raises.
        hdl_seqdb.close()

    return 0
def main():  #{{{
    """Scratch driver: run external commands through ``subprocess``.

    Only the block that was enabled (``if 1:``) is kept. All other
    experiments in this function were wrapped in ``if 0:`` and therefore
    unreachable; they have been removed.

    The commands are started without a shell. The ``check_call`` on
    ``ls topo*.py`` is the demonstration case: without a shell the wildcard
    is not expanded, so ``ls`` receives the literal pattern and a
    ``CalledProcessError`` is expected, caught and printed. The last call
    expands the pattern with ``glob`` before handing the names to ``ls``.
    """
    #test subprocess
    import glob

    #invoke shell implicitly, recommended way
    subprocess.call(["seq", "10"], shell=False)
    subprocess.call(["echo", "wait for 1 seconds..."])
    subprocess.call(["sleep", "1"])
    try:
        print(subprocess.check_call(["ls", "topo*.py"]))  #This will not work
    except subprocess.CalledProcessError as e:
        print("error message: %s" % (e,))
    subprocess.call(["ls"] + glob.glob("topo*.py"))
def main(g_params):  #{{{
    """Run ``HHAlign2Pairaln`` on every input file.

    Arguments are read from ``sys.argv``. Input files come from positional
    arguments and/or the list file given with ``-l``. Up to three outputs
    are written: the main output (``-o``, default stdout), a table-info file
    (``-ot``) and a statistics file (``-os``). The two optional files get a
    header line before the first input is processed.

    Args:
        g_params: dict of run-time settings; ``-q`` sets ``isQuiet`` to True.

    Returns:
        0 on success; 1 on a usage error, when no input is given, or when the
        sequence database (``-seqdb``) is unset or cannot be opened.
    """
    argv = sys.argv
    numArgv = len(argv)
    if numArgv < 2:
        PrintHelp()
        return 1

    outfile = ""
    outfile_tableinfo = ""
    outfile_stat = ""
    fileListFile = ""
    fileList = []
    seqdb = ""
    evalue_threshold = 1e-3
    coverage_threshold = 0.0

    i = 1
    isNonOptionArg = False
    while i < numArgv:
        if isNonOptionArg:
            # Only the single argument following "--" is forced positional.
            fileList.append(argv[i])
            isNonOptionArg = False
            i += 1
        elif argv[i] == "--":
            isNonOptionArg = True
            i += 1
        elif argv[i].startswith("-"):
            # startswith() instead of argv[i][0]: an empty-string argument
            # must not raise IndexError.
            if argv[i] in ["-h", "--help"]:
                PrintHelp()
                return 1
            elif argv[i] in ["-o", "--o", "-outfile"]:
                (outfile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-ot", "--ot"]:
                (outfile_tableinfo, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-os", "--os"]:
                (outfile_stat, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-evalue", "--evalue"]:
                (evalue_threshold, i) = myfunc.my_getopt_float(argv, i)
            elif argv[i] in ["-coverage", "--coverage"]:
                (coverage_threshold, i) = myfunc.my_getopt_float(argv, i)
            elif argv[i] in ["-seqdb", "--seqdb"]:
                (seqdb, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-l", "--l"]:
                (fileListFile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-q", "--q"]:
                g_params['isQuiet'] = True
                i += 1
            else:
                sys.stderr.write("Error! Wrong argument: %s\n" % (argv[i]))
                return 1
        else:
            fileList.append(argv[i])
            i += 1

    if fileListFile != "":
        fileList += myfunc.ReadIDList(fileListFile)

    if len(fileList) < 1:
        sys.stderr.write("No input set. exit\n")
        return 1
    if seqdb == "":
        sys.stderr.write("Seqdb not set. exit\n")
        return 1

    hdl_seq = myfunc.MyDB(seqdb)
    if hdl_seq.failure:
        return 1

    fpout = sys.stdout
    fpout_tableinfo = None
    fpout_stat = None
    try:
        if outfile != "":
            fpout = myfunc.myopen(outfile, sys.stdout, "w", False)
        if outfile_tableinfo != "":
            fpout_tableinfo = myfunc.myopen(outfile_tableinfo, None, "w",
                                            False)
        if outfile_stat != "":
            fpout_stat = myfunc.myopen(outfile_stat, None, "w", False)

        if fpout_stat is not None:
            fpout_stat.write(
                "%-8s %-8s %7s %8s %6s %6s %6s %9s %4s %9s %4s\n" % (
                    "#ID1",
                    "ID2",
                    "Evalue",
                    "Coverage",
                    "IDT",
                    "Prob",
                    "AlnCol",
                    "PosQuery",
                    "LenQ",
                    "PosTemp",
                    "LenT",
                ))
        if fpout_tableinfo is not None:
            fpout_tableinfo.write(
                "#%-15s %-15s %6s %6s %9s %6s %6s %9s %6s %6s %6s %6s %6s\n"
                % ("Seq1", "Seq2", "IDT0", "SIM0", "AlnLength", "Len1",
                   "Len2", "Score", "N_IDT", "N_SIM", "N_GAP", "IDT1",
                   "IDT2"))

        for infile in fileList:
            HHAlign2Pairaln(infile, evalue_threshold, coverage_threshold,
                            hdl_seq, fpout, fpout_tableinfo, fpout_stat)
    finally:
        # Close the outputs and the sequence database even when a conversion
        # raises. The database handle was previously never closed.
        if outfile != "":
            myfunc.myclose(fpout)
        if fpout_tableinfo is not None:
            myfunc.myclose(fpout_tableinfo)
        if fpout_stat is not None:
            myfunc.myclose(fpout_stat)
        hdl_seq.close()
    return 0
try: subprocess.check_call(["mkdir", "-p", tmpdir]) except subprocess.CalledProcessError, e: return 1 # famid2seqidDict = myfunc.ReadFam2SeqidMap(mapfile) g_params['cdhit_wordsize'] = GetCDHitWordSize(g_params['nrlevel']) seqdbDict = None hdl_seqdb = None if g_params['isBigmem']: seqdbDict = ReadSeqDBDict(seqdb) else: hdl_seqdb = myfunc.MyDB(seqdb) if hdl_seqdb.failure: print >> sys.stderr, "Failed to load seqdb %s. exit" % (seqdb) return 1 pfamidList = [] extra_desp_dict = {} if extra_description_file != "": hdl_extra = myfunc.ReadLineByBlock(extra_description_file) if hdl_extra.failure: print >> sys.stderr, "Failed to read extra_description_file %s." % ( extra_description_file) return 1 lines = hdl_extra.readlines() while lines != None:
def main(g_params):  #{{{
    """Build the HTML report of pairwise topology comparisons.

    Arguments are read from ``sys.argv``; no positional arguments are
    accepted. The pairwise comparison records are read from
    ``<datapath>/<basename>.paircmp``, filtered with ``FilterPairCmpResult``
    and grouped into a per-family table. The pair-info files of every class
    in ``g_params['cmpClassList_mp3_cmpdup']`` are added to that table, and
    the table is written with ``WriteHTML`` into ``outpath``.

    Args:
        g_params: dict of run-time settings. It is updated with the parsed
            options, the loaded maps and the opened database handles. The
            handles are closed again before this function returns.

    Returns:
        0 on success; 1 on a usage error or when a required input fails its
        check.
    """
    # Function-scope import: only needed for the temp-directory cleanup.
    import shutil

    argv = sys.argv
    numArgv = len(argv)
    if numArgv < 2:
        PrintHelp()
        return 1

    outpath = ""
    shortid2fullidFile = ""
    seqid2pfamidMapFile = ""
    # Was never initialised: running without -pfam2seq raised
    # UnboundLocalError instead of failing the file check below.
    pfamid2seqidMapFile = ""
    pfamDefFile = '/data3/data/pfam/pfam27.0/Pfam-A.clans.tsv'
    topodb = ""
    seqdb = ""
    pdb2spFile = ""

    i = 1
    isNonOptionArg = False
    while i < numArgv:
        if isNonOptionArg:
            # Positional arguments are not accepted, not even after "--".
            sys.stderr.write("Error! Wrong argument: %s\n" % (argv[i]))
            return 1
        elif argv[i] == "--":
            isNonOptionArg = True
            i += 1
        elif argv[i].startswith("-"):
            if argv[i] in ["-h", "--help"]:
                PrintHelp()
                return 1
            elif argv[i] in ["-outpath", "--outpath"]:
                (outpath, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-topodb", "--topodb"]:
                (topodb, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-pdb2sp", "--pdb2sp", "-pdbtosp", "--pdbtosp"]:
                # "-pdb2sp" used to be listed twice; the second entry was
                # evidently meant to be the double-dash spelling.
                (pdb2spFile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-seqdb", "--seqdb"]:
                (seqdb, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-seqmsapath", "--seqmsapath"]:
                (g_params['seqmsapath'], i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-datapath", "--datapath"]:
                (g_params['datapath'], i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-seq2pfam", "--seq2pfam"]:
                (seqid2pfamidMapFile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-pfam2seq", "--pfam2seq"]:
                (pfamid2seqidMapFile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-description", "--description"]:
                (g_params['description'], i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-pfamdef", "--pfamdef"]:
                (pfamDefFile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-alignrange", "--alignrange"]:
                g_params['alignrange'], i = myfunc.my_getopt_str(argv, i)
                if g_params['alignrange'] not in ['all', 'full', 'part']:
                    sys.stderr.write(
                        "alignrange must be one of [all, full, part]\n")
                    return 1
                elif g_params['alignrange'] == 'full':
                    g_params['alignrange'] = 'FULL_ALIGNED'
                elif g_params['alignrange'] == 'part':
                    g_params['alignrange'] = 'PART_ALIGNED'
            elif argv[i] in ["-basename", "--basename"]:
                (g_params['basename'], i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-treepath", "--treepath"]:
                (g_params['treepath'], i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-pairalnpath", "--pairalnpath"]:
                (g_params['pairalnpath'], i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-maxperfamily", "--maxperfamily"]:
                (g_params['max_num_output_per_family'],
                 i) = myfunc.my_getopt_int(argv, i)
            elif argv[i] in ["-min-seqidt", "--min-seqidt"]:
                g_params['minSeqIDT'], i = myfunc.my_getopt_float(argv, i)
            elif argv[i] in ["-max-seqidt", "--max-seqidt"]:
                g_params['maxSeqIDT'], i = myfunc.my_getopt_float(argv, i)
            elif argv[i] in ["-shortid2fullid", "--shortid2fullid"]:
                (shortid2fullidFile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-debug", "--debug"]:
                # A trailing -debug without a value used to raise IndexError.
                if i + 1 >= numArgv:
                    sys.stderr.write(
                        "Error! Option %s requires a value\n" % (argv[i]))
                    return 1
                # Any value starting with "y"/"Y" enables debugging; the
                # slice keeps an empty value from raising.
                g_params['isDEBUG'] = (argv[i + 1][:1].lower() == 'y')
                i += 2
            elif argv[i] in ["-q", "--q"]:
                g_params['isQuiet'] = True
                i += 1
            else:
                sys.stderr.write("Error! Wrong argument: %s\n" % (argv[i]))
                return 1
        else:
            sys.stderr.write("Error! Wrong argument: %s\n" % (argv[i]))
            return 1

    if g_params['basename'] == "":
        sys.stderr.write("basename not set. exit\n")
        return 1
    if myfunc.checkfile(g_params['datapath'], "datapath") != 0:
        return 1
    if myfunc.checkfile(seqid2pfamidMapFile, "seqid2pfamidMapFile") != 0:
        return 1
    if myfunc.checkfile(pfamid2seqidMapFile, "pfamid2seqidMapFile") != 0:
        return 1
    if myfunc.checkfile(topodb + "0.db", "topodb") != 0:
        return 1
    if myfunc.checkfile(seqdb + "0.db", "seqdb") != 0:
        return 1
    if myfunc.checkfile(g_params['seqmsapath'], "seqmsapath") != 0:
        return 1

    if pdb2spFile != "":
        (g_params['pdb2uniprotMap'],
         g_params['uniprot2pdbMap']) = myfunc.ReadPDBTOSP(pdb2spFile)

    if g_params['datapath'] == "":
        sys.stderr.write("datapath not set\n")
        return 1
    elif not os.path.exists(g_params['datapath']):
        sys.stderr.write("datapath %s does not exist\n" %
                         (g_params['datapath']))
        return 1
    if outpath == "":
        sys.stderr.write("outpath not set\n")
        return 1
    elif not os.path.exists(outpath):
        cmd = ["mkdir", "-p", outpath]
        subprocess.check_call(cmd)

    paircmpfile = "%s/%s.paircmp" % (g_params['datapath'],
                                     g_params['basename'])
    if myfunc.checkfile(paircmpfile, "paircmpfile") != 0:
        return 1

    (g_params['pfamidDefDict'],
     g_params['clanidDefDict']) = lcmp.ReadPfamDefFile(pfamDefFile)
    g_params['seqid2pfamidDict'] = myfunc.ReadFam2SeqidMap(seqid2pfamidMapFile)
    g_params['pfamid2seqidDict'] = myfunc.ReadFam2SeqidMap(pfamid2seqidMapFile)

    tmpdir = tempfile.mkdtemp()
    try:
        # NOTE(review): 'msapath' is read here while the command line fills
        # 'seqmsapath' -- confirm that both keys are intended.
        if g_params['msapath'] == "":
            g_params['msapath'] = tmpdir
        if g_params['treepath'] == "":
            g_params['treepath'] = tmpdir
        if g_params['pairalnpath'] == "":
            g_params['pairalnpath'] = tmpdir

        pairCmpRecordList = []
        unprocessedBuffer = ""
        cntTotalReadInRecord = 0
        try:
            fpin = open(paircmpfile, "r")
        except IOError:
            sys.stderr.write("Failed to open input file %s\n" % (paircmpfile))
            return 1
        with fpin:
            while True:
                buff = fpin.read(myfunc.BLOCK_SIZE)
                isEOFreached = (buff == "")
                # Text left unparsed by the previous block is prepended. The
                # final pass at EOF parses whatever remains.
                rdList = []
                unprocessedBuffer = lcmp.ReadPairCmpResultFromBuffer(
                    unprocessedBuffer + buff, rdList)
                rdList = FilterPairCmpResult(rdList)
                cntTotalReadInRecord += len(rdList)
                pairCmpRecordList += rdList
                if isEOFreached:
                    break
        print("cntTotalReadInRecord = %s" % (cntTotalReadInRecord,))

        g_params['hdl_seqdb'] = myfunc.MyDB(seqdb)
        g_params['hdl_topodb'] = myfunc.MyDB(topodb)

        g_params['OS'] = os.uname()[0]
        if g_params['OS'].find('Linux') != -1:
            g_params['CP_EXE'] = "/bin/cp -uf"
        else:
            g_params['CP_EXE'] = "/bin/cp -f"

        if shortid2fullidFile != "":
            g_params['uniprotAC2FullSeqIDMap'] = myfunc.ReadID2IDMap(
                shortid2fullidFile)

        dataTable = {}
        # structure of dataTable
        # dataTable[pfamid] = {'set_seqid':set(), 'difftopopair':[{'INV':[(id1,id2)]},{'TM2GAP':},{}}
        # first read in pairCmpRecordList
        AddAllSeqInPairCmp(dataTable, pairCmpRecordList,
                           g_params['seqid2pfamidDict'])

        for cmpclass in g_params['cmpClassList_mp3_cmpdup'][0:]:
            ss = "%s/%s_.cmpdup.FULL_ALIGNED.%s.pairinfo.txt" % (
                g_params['datapath'], g_params['basename'], cmpclass)
            pairinfoList = ReadPairInfo_cmpclass(ss)
            AddPairInfo(dataTable, pairinfoList, cmpclass)

        if g_params['isDEBUG']:  #{{{
            for pfamid in dataTable:
                print(pfamid)
                print("\tset_seqid")
                print(dataTable[pfamid]['set_seqid'])
                print("\tdifftopopair")
                for cls in dataTable[pfamid]['difftopopair']:
                    # No space after the tabs: the Python 2 print statement
                    # wrote none after an item ending in a tab.
                    print("\t\t%s" % (cls,))
                    for tup in dataTable[pfamid]['difftopopair'][cls]:
                        print("\t\t\t%s" % (tup,))
        #}}}
        WriteHTML(dataTable, outpath)
    finally:
        # Release the database handles (previously never closed) and remove
        # the temporary directory on every exit path, including early returns
        # and exceptions.
        for key in ('hdl_seqdb', 'hdl_topodb'):
            hdl = g_params.get(key)
            if hdl is not None and not hdl.failure:
                hdl.close()
        shutil.rmtree(tmpdir, ignore_errors=True)
    return 0