def main(g_params):#{{{
    """Command-line entry point: report percent-TM values per clan.

    Parses ``sys.argv`` for an input file (positional), a map file
    (``-map``/``--map``/``-mapfile``), an optional output file
    (``-o``/``--o``/``-outfile``) and the quiet flag (``-q``/``--q``).
    The map file is loaded with ``myfunc.ReadFam2SeqidMap``, the input file
    with ``ReadPercentTM``, and both are handed to ``GetPercentTMOfClan``
    together with the opened output handle.

    Args:
        g_params: dict of global settings; ``g_params['isQuiet']`` is set
            to True when ``-q`` is given.

    Returns:
        0 on success, 1 on a usage error or when an input file fails
        ``myfunc.checkfile``.
    """
    argv = sys.argv
    numArgv = len(argv)
    if numArgv < 2:
        PrintHelp()
        return 1

    outfile = ""
    infile = ""
    mapfile = ""

    i = 1
    isNonOptionArg = False
    while i < numArgv:
        if isNonOptionArg:
            # Only the single argument right after "--" is forced positional.
            infile = argv[i]
            isNonOptionArg = False
            i += 1
        elif argv[i] == "--":
            isNonOptionArg = True
            i += 1
        elif argv[i].startswith("-"):
            # startswith() instead of argv[i][0]: an empty-string argument
            # must not raise IndexError.
            if argv[i] in ["-h", "--help"]:
                PrintHelp()
                return 1
            elif argv[i] in ["-o", "--o", "-outfile"]:
                (outfile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-map", "--map", "-mapfile"]:
                (mapfile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-q", "--q"]:
                g_params['isQuiet'] = True
                i += 1
            else:
                print >> sys.stderr, "Error! Wrong argument:", argv[i]
                return 1
        else:
            infile = argv[i]
            i += 1

    if myfunc.checkfile(infile) != 0:
        return 1
    if myfunc.checkfile(mapfile) != 0:
        return 1

    clanid2pfamidDict = myfunc.ReadFam2SeqidMap(mapfile)
    pfamPercentTMDict = ReadPercentTM(infile)

    fpout = myfunc.myopen(outfile, sys.stdout, "w", False)
    try:
        GetPercentTMOfClan(pfamPercentTMDict, clanid2pfamidDict, fpout)
    finally:
        # Always release the output handle, even if the writer raises.
        myfunc.myclose(fpout)
    # Explicit success code, matching the other entry points.
    return 0
def main(g_params): #{{{
    """Command-line entry point: generate sequence pairs within families.

    Collects family IDs from positional arguments and/or an ID list file
    (``-l``), loads the family-to-sequence map (``-mapfile``), optionally a
    PDB-to-UniProt map (``-pdbtosp``) and a restricting ID list
    (``-tmprolist``/``-restrictlist``), and then dispatches to one of
    ``GeneratePairWithinFam_m_0/1/2`` according to ``-method``.

    Args:
        g_params: dict of global settings. This function writes
            ``isQuiet``, ``isOnlyPDB``, ``uniprotidlist_with_pdb`` and
            ``uniprot2pdbMap`` and reads ``isOnlyPDB``.
            NOTE(review): ``g_params['isOnlyPDB']`` is read even when
            ``-onlypdb`` was not given, so the caller presumably
            pre-populates it - confirm.

    Returns:
        0 on success, 1 on a usage error, missing input, or an unsupported
        ``-method`` value.
    """
    argv = sys.argv
    numArgv = len(argv)
    if numArgv < 2:
        PrintHelp()
        return 1

    outfile = ""
    outfile_with_famid = ""
    outfile_with_pdb = ""
    outfile_fam2seqmap = ""
    idListFile = ""
    mapfile = "%s%s%s" % (
        DATADIR3, os.sep,
        "wk/MPTopo/pfamAna_refpro/pfammap_from_uniprot/refpro20120604-celluar.selmaxlength-m1.nr100.filter.fragmented.clanid2seqid"
    )
    restrictIDListFile = ""
    idList = []
    maxseq_for_fam = 200
    maxpair_for_fam = 300
    method = 0
    rand_seed = None
    pdbtospFile = ""

    i = 1
    isNonOptionArg = False
    while i < numArgv:
        if isNonOptionArg:
            # Only the single argument right after "--" is forced positional.
            idList.append(argv[i])
            isNonOptionArg = False
            i += 1
        elif argv[i] == "--":
            isNonOptionArg = True
            i += 1
        elif argv[i].startswith("-"):
            # startswith() instead of argv[i][0]: an empty-string argument
            # must not raise IndexError.
            if argv[i] in ["-h", "--help"]:
                PrintHelp()
                return 1
            elif argv[i] in ["-o", "--o", "-outfile", "--outfile"]:
                outfile, i = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-outwithfamid", "--outwithfamid"]:
                outfile_with_famid, i = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-outfam2seqmap", "--outfam2seqmap"]:
                outfile_fam2seqmap, i = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-outwithpdb", "--outwithpdb"]:
                outfile_with_pdb, i = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-tmprolist", "--tmprolist",
                             "-restrictlist", "--restrictlist"]:
                restrictIDListFile, i = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-mapfile", "--mapfile"]:
                mapfile, i = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-pdbtosp", "--pdbtosp"]:
                pdbtospFile, i = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-seed", "--seed"]:
                rand_seed, i = myfunc.my_getopt_int(argv, i)
            elif argv[i] in ["-l", "--l"]:
                idListFile, i = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-maxseq", "--maxseq"]:
                maxseq_for_fam, i = myfunc.my_getopt_int(argv, i)
            elif argv[i] in ["-maxpair", "--maxpair"]:
                maxpair_for_fam, i = myfunc.my_getopt_int(argv, i)
            elif argv[i] in ["-m", "--m", "-method", "--method"]:
                method, i = myfunc.my_getopt_int(argv, i)
            elif argv[i] in ["-q"]:
                g_params['isQuiet'] = True
                i += 1
            elif argv[i] in ["-onlypdb", "--onlypdb"]:
                g_params['isOnlyPDB'] = True
                i += 1
            else:
                print >> sys.stderr, "Error! Wrong argument:", argv[i]
                return 1
        else:
            idList.append(argv[i])
            i += 1

    # A non-existing ID list file is ignored here; the emptiness check
    # below catches the case where no ID was supplied at all.
    if os.path.exists(idListFile):
        idList += myfunc.ReadIDList(idListFile)
    if len(idList) < 1:
        print >> sys.stderr, "no ID set. exit"
        return 1

    if myfunc.checkfile(mapfile, "idMapFile") != 0:
        return 1
    idMapDict = myfunc.ReadFam2SeqidMap(mapfile)

    # Read in pdbtosp map
    if pdbtospFile != "":
        (pdb2uniprotMap, uniprot2pdbMap) = myfunc.ReadPDBTOSP(pdbtospFile)
        g_params['uniprotidlist_with_pdb'] = set(uniprot2pdbMap.keys())
        g_params['uniprot2pdbMap'] = uniprot2pdbMap

    if g_params['isOnlyPDB']:
        if pdbtospFile == "":
            print >> sys.stderr, "onlypdb is true but pdbtospFile is not set. exit."
            return 1
        elif g_params['uniprotidlist_with_pdb'] == set([]):
            print >> sys.stderr, "onlypdb is true but uniprotidlist_with_pdb is empty. exit."
            return 1

    restrictIDSet = set([])
    if restrictIDListFile != "":
        restrictIDSet = set(myfunc.ReadIDList(restrictIDListFile))

    # Reject unknown methods before any output file is opened; previously an
    # unsupported value silently produced empty output and returned 0.
    if method not in (0, 1, 2):
        print >> sys.stderr, "Error! Unsupported method:", method
        return 1

    fpout = myfunc.myopen(outfile, sys.stdout, "w", False)
    fpout_withfamid = myfunc.myopen(outfile_with_famid, None, "w", False)
    fpout_withpdb = myfunc.myopen(outfile_with_pdb, None, "w", False)
    fpout_fam2seqmap = myfunc.myopen(outfile_fam2seqmap, None, "w", False)
    try:
        if method == 0:
            GeneratePairWithinFam_m_0(idList, idMapDict, restrictIDSet,
                                      maxseq_for_fam, rand_seed, fpout,
                                      fpout_withfamid)
        elif method == 1:
            GeneratePairWithinFam_m_1(idList, idMapDict, restrictIDSet,
                                      maxpair_for_fam, rand_seed, fpout,
                                      fpout_withfamid, fpout_fam2seqmap)
        else:  # method == 2, all to all
            GeneratePairWithinFam_m_2(idList, idMapDict, restrictIDSet, fpout,
                                      fpout_withfamid, fpout_withpdb)
    finally:
        # Release every output handle even if pair generation raises.
        myfunc.myclose(fpout)
        myfunc.myclose(fpout_withfamid)
        myfunc.myclose(fpout_withpdb)
        myfunc.myclose(fpout_fam2seqmap)
    return 0
try: subprocess.check_output(["mkdir", "-p", outpath]) except subprocess.CalledProcessError, e: print e return 1 if myfunc.checkfile(pfamid2seqidFile, "pfamid2seqidFile") != 0: return 1 if myfunc.checkfile("%s0.db" % topodb, "topodb") != 0: return 1 if myfunc.checkfile("%s0.db" % seqdb, "seqdb") != 0: return 1 pfamid2seqidDict = myfunc.ReadFam2SeqidMap(pfamid2seqidFile) hdl_topo = myfunc.MyDB(topodb) if not hdl_topo.failure: idSet_topo = set(hdl_topo.indexedIDList) else: idSet_topo = set([]) print >> sys.stderr, "Failed to open topology database %s" % (topodb) return 1 hdl_seq = myfunc.MyDB(seqdb) if hdl_seq.failure: print >> sys.stderr, "Failed to open sequence database %s" % (seqdb) return 1 GetTMProList_per_family(pfamid2seqidDict, idSet_topo, hdl_seq, hdl_topo,
def main(g_params): #{{{
    """Command-line entry point: write a Pfam colour definition for a FASTA file.

    Reads sequence IDs from the input file with
    ``myfunc.ReadFasta_without_annotation``, drops the ``"Consensus"`` entry
    if present, loads the seqid-to-pfamid map with
    ``myfunc.ReadFam2SeqidMap`` and passes both to ``WritePfamColorDef``
    together with the opened output handle.

    Args:
        g_params: dict of global settings; ``g_params['isQuiet']`` is set
            to True when ``-q`` is given.

    Returns:
        0 on success, 1 on a usage error or when an input cannot be read.
    """
    argv = sys.argv
    numArgv = len(argv)
    if numArgv < 2:
        PrintHelp()
        return 1

    infile = ""
    outfile = ""
    seqid2pfamidFile = datadir3 + os.sep + "wk/MPTopo/pfamAna_refpro/pfammap_from_uniprot/refpro20120604-celluar.selmaxlength-m1.nr100.seqid2pfamid"

    i = 1
    isNonOptionArg = False
    while i < numArgv:
        if isNonOptionArg:
            # Only the single argument right after "--" is forced positional.
            infile = argv[i]
            isNonOptionArg = False
            i += 1
        elif argv[i] == "--":
            isNonOptionArg = True
            i += 1
        elif argv[i].startswith("-"):
            # startswith() instead of argv[i][0]: an empty-string argument
            # must not raise IndexError.
            if argv[i] in ["-h", "--help"]:
                PrintHelp()
                return 1
            elif argv[i] in ["-seqid2pfamid", "--seqid2pfamid", "-o", "--o"]:
                # These options take a value; a trailing option without one
                # used to raise IndexError on argv[i + 1].
                if i + 1 >= numArgv:
                    print("Error! Missing value for argument:", argv[i],
                          file=sys.stderr)
                    return 1
                if argv[i] in ["-o", "--o"]:
                    outfile = argv[i + 1]
                else:
                    seqid2pfamidFile = argv[i + 1]
                i += 2
            elif argv[i] in ["-q"]:
                g_params['isQuiet'] = True
                i += 1
            else:
                print("Error! Wrong argument:", argv[i], file=sys.stderr)
                return 1
        else:
            infile = argv[i]
            i += 1

    if infile == "" or not os.path.exists(infile):
        print("Error. Infile not set. exit", file=sys.stderr)
        return 1
    if seqid2pfamidFile == "" or not os.path.exists(seqid2pfamidFile):
        print("Error. seqid2pfamidFile does not exist. exit", file=sys.stderr)
        return 1

    seqid2pfamidDict = myfunc.ReadFam2SeqidMap(seqid2pfamidFile)
    # The original compared the *filename* to {} and so could never detect an
    # empty map; test the loaded dictionary instead.
    if not seqid2pfamidDict:
        print("Read seqid2pfamidFile failed.", file=sys.stderr)
        return 1

    (idList, _) = myfunc.ReadFasta_without_annotation(infile)
    if len(idList) < 1:
        print("Read infile failed.", file=sys.stderr)
        return 1
    # Drop the consensus pseudo-entry when present; a file without it must
    # not abort with ValueError.
    if "Consensus" in idList:
        idList.remove("Consensus")

    fpout = myfunc.myopen(outfile, sys.stdout, "w", False)
    try:
        WritePfamColorDef(idList, seqid2pfamidDict, fpout)
    finally:
        # Always release the output handle, even if the writer raises.
        myfunc.myclose(fpout)
    return 0
def main(g_params): #{{{
    """Command-line entry point: parse options and load the analysis inputs.

    Parses ``sys.argv``, fills in default data file locations under
    ``DATADIR3`` for every map/list file that was not given on the command
    line, verifies each with ``myfunc.checkfile`` and loads them. Finally
    the output directory ``g_params['outpath']`` is created if needed.

    ``g_params`` keys written here: ``outpath``,
    ``pairwise_comparison_method``, ``mindiffpair``, ``winsize``,
    ``outname``, ``isQuiet``, ``isOnlyAnaProkar``, ``isOnlyAnaEukar``,
    ``prokarSeqIDSet``, ``eukarSeqIDSet``. Keys read here: ``outpath``,
    ``isOnlyAnaProkar``, ``isOnlyAnaEukar``.

    Returns 1 on any usage error or failed file check.

    NOTE(review): the visible body ends right after the output directory is
    created; most loaded tables are never used and there is no success
    return, so the function looks truncated - confirm against the full file.
    """
    argv = sys.argv
    numArgv = len(argv)
    if numArgv < 2:
        PrintHelp()
        return 1

    # Default list of integer pairs; replaced below when -pairlist is given.
    SPE_PAIR_LIST = [(2, 1), (2, 4), (2, 6), (2, 8), (3, 6), (3, 7), (4, 6),
                     (4, 8), (4, 10), (5, 7), (5, 10), (6, 8), (6, 10),
                     (6, 12), (7, 14), (8, 10), (8, 12), (10, 12), (10, 13),
                     (11, 13), (12, 14)]

    outfile = ""
    infile = ""
    pfamDefFile = "%s/data/pfam/pfam26.0/Pfam-A.clans.tsv" % (DATADIR3)
    signalpFile = "%s/wk/MPTopo/pfamAna_refpro/pred_signalp/refpro20120604-celluar.selmaxlength-m1.nr100.signalp_list" % (
        DATADIR3)
    #seqid2clanidMapFile = "%s/wk/MPTopo/pfamAna_refpro/pfammap_from_uniprot/refpro20120604-celluar.selmaxlength-m1.nr100.filter.fragmented.seqid2clanid"%(DATADIR3)
    #seqid2pfamidMapFile = "%s/wk/MPTopo/pfamAna_refpro/pfammap_from_uniprot/refpro20120604-celluar.selmaxlength-m1.nr100.filter.fragmented.seqid2pfamid"%(DATADIR3)
    # Empty strings mean "not given"; defaults are filled in after parsing
    # because some of them depend on the Pfam type tag.
    seqid2clanidMapFile = ""
    seqid2pfamidMapFile = ""
    tm_pfamidListFile = ""
    tm_clanidListFile = ""
    pfamid2seqidMapFile = ""
    clanid2seqidMapFile = ""
    dbname_predTM = ""
    pairlistwithpfamidFile = ""
    pfamtype = ""
    pairListFile = ""

    #classList_TableNumTMHeatMap = ["ALL", "RMSP"]
    classList_TableNumTMHeatMap = ["ALL"]

    i = 1
    isNonOptionArg = False
    while i < numArgv:
        if isNonOptionArg == True:
            # Only the single argument right after "--" is forced positional.
            infile = argv[i]
            isNonOptionArg = False
            i += 1
        elif argv[i] == "--":
            isNonOptionArg = True
            i += 1
        # NOTE(review): argv[i][0] raises IndexError for an empty-string
        # argument.
        elif argv[i][0] == "-":
            if argv[i] in ["-h", "--help"]:
                PrintHelp()
                return 1
            elif argv[i] in ["-o", "--o", "-outfile"]:
                (outfile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-outpath", "--outpath"]:
                (g_params['outpath'], i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-l", "--l"]:
                # NOTE(review): fileListFile is not used anywhere in the
                # visible body.
                (fileListFile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-pfamdef", "--pfamdef"]:
                (pfamDefFile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-signalp", "--signalp"]:
                (signalpFile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-mp", "--mp"]:
                g_params[
                    'pairwise_comparison_method'], i = myfunc.my_getopt_int(
                        argv, i)
            elif argv[i] in ["-mindiffpair", "--mindiffpair"]:
                g_params['mindiffpair'], i = myfunc.my_getopt_int(argv, i)
            elif argv[i] in ["-pfamtype", "--pfamtype"]:
                pfamtype, i = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-clanidlist", "--clanidlist"]:
                (tm_clanidListFile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-pfamidlist", "--pfamidlist"]:
                (tm_pfamidListFile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-seqid2clanid", "--seqid2clanid"]:
                (seqid2clanidMapFile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-seqid2pfamid", "--seqid2pfamid"]:
                (seqid2pfamidMapFile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-pfamid2seqid", "--pfamid2seqid"]:
                (pfamid2seqidMapFile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-clanid2seqid", "--clanid2seqid"]:
                (clanid2seqidMapFile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-pairlistwithpfamid", "--pairlistwithpfamid"]:
                (pairlistwithpfamidFile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-predTMdbname", "--predTMdbname"]:
                (dbname_predTM, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-pairlist", "--pairlist"]:
                (pairListFile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-winsize", "--winsize"]:
                (g_params['winsize'], i) = myfunc.my_getopt_int(argv, i)
            elif argv[i] in ["-outname", "--outname"]:
                (g_params['outname'], i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-q", "--q"]:
                g_params['isQuiet'] = True
                i += 1
            elif argv[i] in ["-prokar", "--prokar"]:
                g_params['isOnlyAnaProkar'] = True
                i += 1
            elif argv[i] in ["-eukar", "--eukar"]:
                g_params['isOnlyAnaEukar'] = True
                i += 1
            else:
                print >> sys.stderr, "Error! Wrong argument:", argv[i]
                return 1
        else:
            infile = argv[i]
            i += 1

    # The checkfile label embeds this script's file name and the current
    # source line number.
    if myfunc.checkfile(
            infile, "%s (line %d): infile" %
        (__file__, inspect.currentframe().f_lineno)) != 0:
        return 1

    dirpath = myfunc.my_dirname(infile)

    # try to obtain Pfam family tag
    # The tag comes from -pfamtype when given (case-insensitive substring
    # match), otherwise from a ".Family."/".Domain."/... part of the input
    # file name; it stays empty when neither matches.
    tag = ""
    if pfamtype != "":
        if pfamtype.upper().find("FAM") != -1:
            tag = ".Family"
        elif pfamtype.upper().find("DOM") != -1:
            tag = ".Domain"
        elif pfamtype.upper().find("REP") != -1:
            tag = ".Repeat"
        elif pfamtype.upper().find("MOT") != -1:
            tag = ".Motif"
        else:
            tag = ""
    else:
        if infile.find(".Family.") != -1:
            tag = ".Family"
        elif infile.find(".Domain.") != -1:
            tag = ".Domain"
        elif infile.find(".Repeat.") != -1:
            tag = ".Repeat"
        elif infile.find(".Motif.") != -1:
            tag = ".Motif"
        else:
            tag = ""

    # For each input below: fall back to the default location when the
    # option was not given, then abort if checkfile returns a truthy value.
    if seqid2clanidMapFile == "":
        seqid2clanidMapFile = "%s/wk/MPTopo/pfamAna_refpro/pfammap_from_uniprot/Pfam-A-full.seqfrompfamfasta.percentTMpro_scampi.perTM75_nseq20.nr100.filter.fragmented.seqid2clanid" % (
            DATADIR3)
    if myfunc.checkfile(
            seqid2clanidMapFile, "%s (line %d): seqid2clanidMapFile" %
        (__file__, inspect.currentframe().f_lineno)):
        return 1

    if seqid2pfamidMapFile == "":
        seqid2pfamidMapFile = "%s/wk/MPTopo/pfamAna_refpro/pfammap_from_uniprot/Pfam-A-full.seqfrompfamfasta.percentTMpro_scampi.perTM75_nseq20%s.nr100.filter.fragmented.seqid2pfamid" % (
            DATADIR3, tag)
    if myfunc.checkfile(
            seqid2pfamidMapFile, "%s (line %d): seqid2pfamidMapFile" %
        (__file__, inspect.currentframe().f_lineno)):
        return 1

    if pfamid2seqidMapFile == "":
        pfamid2seqidMapFile = "%s/wk/MPTopo/pfamAna_refpro/pfammap_from_uniprot/Pfam-A-full.seqfrompfamfasta.percentTMpro_scampi.perTM75_nseq20.nr100.filter.fragmented.pfamid2seqid" % (
            DATADIR3)
    if myfunc.checkfile(
            pfamid2seqidMapFile, "%s (line %d): pfamid2seqidMapFile" %
        (__file__, inspect.currentframe().f_lineno)):
        return 1

    if clanid2seqidMapFile == "":
        clanid2seqidMapFile = "%s/wk/MPTopo/pfamAna_refpro/pfammap_from_uniprot/Pfam-A-full.seqfrompfamfasta.percentTMpro_scampi.perTM75_nseq20%s.nr100.filter.fragmented.clanid2seqid" % (
            DATADIR3, tag)
    if myfunc.checkfile(
            clanid2seqidMapFile, "%s (line %d): clanid2seqidMapFile" %
        (__file__, inspect.currentframe().f_lineno)):
        return 1

    if tm_pfamidListFile == "":
        tm_pfamidListFile = "%s/data/pfam/pfam26.0/Pfam-A-full.seqfrompfamfasta.percentTMpro_scampi.perTM75_nseq20%s.pfamidlist" % (
            DATADIR3, tag)
    if myfunc.checkfile(
            tm_pfamidListFile, "%s (line %d): tm_pfamidListFile" %
        (__file__, inspect.currentframe().f_lineno)):
        return 1

    if tm_clanidListFile == "":
        tm_clanidListFile = "%s/data/pfam/pfam26.0/Pfam-A-full.seqfrompfamfasta.percentTMpro_scampi.perTM75_nseq20.clanidlist" % (
            DATADIR3)
    if myfunc.checkfile(
            tm_clanidListFile, "%s (line %d): tm_clanidListFile" %
        (__file__, inspect.currentframe().f_lineno)):
        return 1

    if dbname_predTM == "":
        dbname_predTM = "%s/wk/MPTopo/pfamAna_refpro/pred_topcons_single_method4/refpro20120604-celluar.selmaxlength-m1.topcons-single_topcons_single.m1.agree-44.RMSP" % (
            DATADIR3)
    # The database is checked through its "<name>0.db" file.
    if myfunc.checkfile(
            "%s0.db" % (dbname_predTM), "%s (line %d): dbname_predTM" %
        (__file__, inspect.currentframe().f_lineno)):
        return 1

    # Optional restriction to prokaryotic / eukaryotic sequence IDs; an
    # empty ID set aborts without printing a message here.
    if g_params['isOnlyAnaProkar']:
        prokarseqidfile = "%s/data/uniprot/reference_proteome/refpro20120604-celluar.selmaxlength-m1.nr100.filter.fragmented.Prokaryota.seqidlist" % (
            DATADIR3)
        g_params['prokarSeqIDSet'] = set(myfunc.ReadIDList(prokarseqidfile))
        if len(g_params['prokarSeqIDSet']) < 1:
            return 1
    if g_params['isOnlyAnaEukar']:
        eukarseqidfile = "%s/data/uniprot/reference_proteome/refpro20120604-celluar.selmaxlength-m1.nr100.filter.fragmented.Eukaryota.seqidlist" % (
            DATADIR3)
        g_params['eukarSeqIDSet'] = set(myfunc.ReadIDList(eukarseqidfile))
        if len(g_params['eukarSeqIDSet']) < 1:
            return 1

    # Default pair list location is relative to the input file's directory.
    if pairlistwithpfamidFile == "":
        pairlistwithpfamidFile = "%s/../../Pfam-.maxpair100.pairlistwithpfamid" % (
            dirpath)
    if myfunc.checkfile(
            pairlistwithpfamidFile, "%s (line %d): pairlistwithpfamidFile" %
        (__file__, inspect.currentframe().f_lineno)):
        return 1
    pfamid_2_seqidpair_Dict = ReadPairListWithFamID(pairlistwithpfamidFile)
    usedPfamIDSet = set(
        pfamid_2_seqidpair_Dict.keys())  # pfamids used in pair selection

    # A user-supplied pair list overrides the built-in SPE_PAIR_LIST; both
    # fields of every entry are converted to int.
    if pairListFile != "":
        li = myfunc.ReadPairList(pairListFile)
        SPE_PAIR_LIST = []
        for tup in li:
            SPE_PAIR_LIST.append((int(tup[0]), int(tup[1])))

    (pfamidDefDict, clanidDefDict) = ReadPfamDefFile(pfamDefFile)
    signalpDict = lcmp.ReadSignalPDict(signalpFile)

    seqid2clanidDict = myfunc.ReadFam2SeqidMap(seqid2clanidMapFile)
    seqid2pfamidDict = myfunc.ReadFam2SeqidMap(seqid2pfamidMapFile)
    clanid2seqidDict = myfunc.ReadFam2SeqidMap(clanid2seqidMapFile)
    pfamid2seqidDict = myfunc.ReadFam2SeqidMap(pfamid2seqidMapFile)
    tm_pfamidList = myfunc.ReadIDList(tm_pfamidListFile)
    tm_clanidList = myfunc.ReadIDList(tm_clanidListFile)
    tm_pfamidSet = set(tm_pfamidList)
    tm_clanidSet = set(tm_clanidList)

    # If the database cannot be opened the ID set is left empty and
    # execution continues.
    hdl_predTM = myfunc.MyDB(dbname_predTM)
    if not hdl_predTM.failure:
        idSet_TMpro = set(hdl_predTM.indexedIDList)
    else:
        idSet_TMpro = set([])

    #classList_TableNumTMHeatMap = ["ALL", "RMSP", "RMDUP"]
    #alignrangeList = ['FULL_ALIGNED', 'all', 'PART_ALIGNED']
    alignrangeList = ['FULL_ALIGNED']

    # Create the output directory (with parents) when one was requested and
    # it does not exist yet.
    if g_params['outpath'] != "" and not os.path.exists(g_params['outpath']):
        cmd = ["mkdir", "-p", g_params['outpath']]
        try:
            subprocess.check_call(cmd)
        except subprocess.CalledProcessError, e:
            print e
            return 1
def main(g_params):#{{{
    """Command-line entry point: group sequences by domain set and report.

    Loads the seqid-to-pfamid map from the input file, groups sequence IDs
    that share the same tab-joined list of families, sorts the groups by
    size (largest first) and then calls ``WriteInfo`` and ``MakeAlignment``
    with the auxiliary tables read from the configured data files.

    Args:
        g_params: dict of global settings; ``outpath`` is always written and
            ``isQuiet`` is set to True when ``-q`` is given.

    Returns:
        0 on success, 1 on a usage error or when ``-outpath`` / the input
        file is not set.
    """
    argv = sys.argv
    numArgv = len(argv)
    if numArgv < 2:
        PrintHelp()
        return 1

    i = 1
    isNonOptionArg = False
    infile = ""
    pfamDefFile = '/data3/data/pfam/pfam26.0/Pfam-A.clans.tsv'
    seqDefFile = '/data3/wk/MPTopo/pfamAna/pfam2-selTM-giid-refseqid-pfamid-description.txt'
    idwithannoFile = "/data3/wk/MPTopo/pfamAna_refpro/pfammap_from_uniprot/Pfam-A-full.perTM75_nseq20.nr100.filter.fragmented.uniq.idwithanno"
    seqLengthFile = "/data3/wk/MPTopo/pfamAna_refpro/pfammap_from_uniprot/Pfam-A-full.perTM75_nseq20.nr100.filter.fragmented.uniq.seqlen"
    topoDB = "/data3/wk/MPTopo/pfamAna_refpro/pred_topcons/refpro20120604-celluar.selmaxlength-m1.topcons.result_TOPCONS.topo"
    seqDB = "/data3/wk/MPTopo/pfamAna_refpro/cellular_filter_fragment/Pfam-A-full.perTM75_nseq20.nr100.filter.fragmented.uniq"
    pfamscanFile = "/data3/wk/MPTopo/pfamAna_refpro/result_pfamscan/Pfam-A-full.perTM75_nseq20.nr100.filter.fragmented.pfamscan"
    outpath = ""
    outfile = ""
    htmlname = 'index'

    # Every value-taking option now goes through myfunc.my_getopt_str, as
    # "-o" already did, instead of indexing argv[i+1] directly (which raised
    # IndexError when the option was the last argument).
    while i < numArgv:#{{{
        if isNonOptionArg:
            # Only the single argument right after "--" is forced positional.
            infile = argv[i]
            isNonOptionArg = False
            i += 1
        elif argv[i] == "--":
            isNonOptionArg = True
            i += 1
        elif argv[i].startswith("-"):
            # startswith() instead of argv[i][0]: an empty-string argument
            # must not raise IndexError.
            if argv[i] == "-h" or argv[i] == "--help":
                PrintHelp()
                return 1
            elif argv[i] in ["-outpath", "--outpath"]:
                (outpath, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-o", "--o"]:
                (outfile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-htmlname", "--htmlname"]:
                (htmlname, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-seqlen", "--seqlen"]:
                (seqLengthFile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-topodb", "--topodb"]:
                (topoDB, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-seqdb", "--seqdb"]:
                (seqDB, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-pfamdef", "--pfamdef"]:
                (pfamDefFile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-pfamscan", "--pfamscan"]:
                (pfamscanFile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-seqdef", "--seqdef"]:
                # NOTE(review): seqDefFile is accepted but not used further
                # in this function.
                (seqDefFile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-q"]:
                # Previously assigned to a dead local "isQuiet"; the flag
                # belongs in g_params like in the sibling entry points.
                g_params['isQuiet'] = True
                i += 1
            else:
                print >> sys.stderr, "Error! Wrong argument:", argv[i]
                return 1
        else:
            infile = argv[i]
            i += 1
#}}}

    g_params['outpath'] = outpath
    if outpath == "":
        print >> sys.stderr, "outpath not set"
        return 1
    elif not os.path.exists(outpath):
        os.makedirs(outpath)

    if infile == "":
        print >> sys.stderr, "infile not set"
        return 1

    fpout = myfunc.myopen(outfile, sys.stdout, "w", False)
    try:
        seqid2pfamidDict = myfunc.ReadFam2SeqidMap(infile)

        # group sequences by domains: key is the tab-joined family list
        groupDict = {}
        for seqid in seqid2pfamidDict:
            ss = "\t".join(seqid2pfamidDict[seqid])
            groupDict.setdefault(ss, []).append(seqid)

        # (key, group size, members), largest group first
        groupList = [(ss, len(groupDict[ss]), groupDict[ss])
                     for ss in groupDict]
        groupList = sorted(groupList, key=lambda x: x[1], reverse=True)

        seqlenDict = ReadSeqLengthDict(seqLengthFile)
        seqannoDict = ReadIDWithAnnoInfo(idwithannoFile)
        (pfamidDefDict, clanidDefDict) = lcmp.ReadPfamDefFile(pfamDefFile)
        topoDict = GetTopoDict(topoDB, seqid2pfamidDict.keys())
        seqDict = GetTopoDict(seqDB, seqid2pfamidDict.keys())
        pfamScanDict = myfunc.ReadPfamScan(pfamscanFile)
        groupedPfamScanDict = GroupPfamScanDict(pfamScanDict)

        WriteInfo(groupList, seqlenDict, seqannoDict, pfamidDefDict,
                  clanidDefDict, topoDict, groupedPfamScanDict, htmlname,
                  fpout)
        MakeAlignment(groupList, seqDict, topoDict, outpath)
    finally:
        # Always release the output handle, even if a step above raises.
        myfunc.myclose(fpout)
    return 0
def main(g_params): #{{{ argv = sys.argv numArgv = len(argv) if numArgv < 2: PrintHelp() return 1 outpath = "" pairListFile = "" seqlenFile = "" shortid2fullidFile = "" seqid2pfamidMapFile = "" pfamDefFile = '/data3/data/pfam/pfam27.0/Pfam-A.clans.tsv' topodb = "" seqdb = "" pdb2spFile = "" i = 1 isNonOptionArg = False while i < numArgv: if isNonOptionArg == True: isNonOptionArg = False i += 1 return 1 elif argv[i] == "--": isNonOptionArg = True i += 1 elif argv[i][0] == "-": if argv[i] in ["-h", "--help"]: PrintHelp() return 1 elif argv[i] in ["-outpath", "--outpath"]: (outpath, i) = myfunc.my_getopt_str(argv, i) elif argv[i] in ["-topodb", "--topodb"]: (topodb, i) = myfunc.my_getopt_str(argv, i) elif argv[i] in ["-pdb2sp", "-pdb2sp", "-pdbtosp", "--pdbtosp"]: (pdb2spFile, i) = myfunc.my_getopt_str(argv, i) elif argv[i] in ["-seqdb", "--seqdb"]: (seqdb, i) = myfunc.my_getopt_str(argv, i) elif argv[i] in ["-seqmsapath", "--seqmsapath"]: (g_params['seqmsapath'], i) = myfunc.my_getopt_str(argv, i) elif argv[i] in ["-datapath", "--datapath"]: (g_params['datapath'], i) = myfunc.my_getopt_str(argv, i) elif argv[i] in ["-seq2pfam", "--seq2pfam"]: (seqid2pfamidMapFile, i) = myfunc.my_getopt_str(argv, i) elif argv[i] in ["-pfam2seq", "--pfam2seq"]: (pfamid2seqidMapFile, i) = myfunc.my_getopt_str(argv, i) elif argv[i] in ["-description", "--description"]: (g_params['description'], i) = myfunc.my_getopt_str(argv, i) elif argv[i] in ["-pfamdef", "--pfamdef"]: (pfamDefFile, i) = myfunc.my_getopt_str(argv, i) elif argv[i] in ["-alignrange", "--alignrange"]: g_params['alignrange'], i = myfunc.my_getopt_str(argv, i) if not g_params['alignrange'] in ['all', 'full', 'part']: print >> sys.stderr, "alignrange must be one of [all, full, part]" return 1 else: if g_params['alignrange'] == 'full': g_params['alignrange'] = 'FULL_ALIGNED' elif g_params['alignrange'] == 'part': g_params['alignrange'] = 'PART_ALIGNED' elif argv[i] in ["-basename", "--basename"]: (g_params['basename'], i) = 
myfunc.my_getopt_str(argv, i) elif argv[i] in ["-treepath", "--treepath"]: (g_params['treepath'], i) = myfunc.my_getopt_str(argv, i) elif argv[i] in ["-pairalnpath", "--pairalnpath"]: (g_params['pairalnpath'], i) = myfunc.my_getopt_str(argv, i) elif argv[i] in ["-maxperfamily", "--maxperfamily"]: (g_params['max_num_output_per_family'], i) = myfunc.my_getopt_int(argv, i) elif argv[i] in ["-min-seqidt", "--min-seqidt"]: g_params['minSeqIDT'], i = myfunc.my_getopt_float(argv, i) elif argv[i] in ["-max-seqidt", "--max-seqidt"]: g_params['maxSeqIDT'], i = myfunc.my_getopt_float(argv, i) elif argv[i] in ["-shortid2fullid", "--shortid2fullid"]: (shortid2fullidFile, i) = myfunc.my_getopt_str(argv, i) elif argv[i] in ["-debug", "--debug"]: if argv[i + 1][0].lower() == 'y': g_params['isDEBUG'] = True else: g_params['isDEBUG'] = False i += 2 elif argv[i] in ["-q", "--q"]: g_params['isQuiet'] = True i += 1 else: print >> sys.stderr, "Error! Wrong argument:", argv[i] return 1 else: print >> sys.stderr, "Error! Wrong argument:", argv[i] return 1 if g_params['basename'] == "": print >> sys.stderr, "basename not set. 
exit" return 1 if myfunc.checkfile(g_params['datapath'], "datapath") != 0: return 1 if myfunc.checkfile(seqid2pfamidMapFile, "seqid2pfamidMapFile") != 0: return 1 if myfunc.checkfile(pfamid2seqidMapFile, "pfamid2seqidMapFile") != 0: return 1 if myfunc.checkfile(topodb + "0.db", "topodb") != 0: return 1 if myfunc.checkfile(seqdb + "0.db", "seqdb") != 0: return 1 if myfunc.checkfile(g_params['seqmsapath'], "seqmsapath") != 0: return 1 if pdb2spFile != "": (g_params['pdb2uniprotMap'], g_params['uniprot2pdbMap']) = myfunc.ReadPDBTOSP(pdb2spFile) if g_params['datapath'] == "": print >> sys.stderr, "datapath not set" return 1 elif not os.path.exists(g_params['datapath']): print >> sys.stderr, "datapath %s does not exist" % ( g_params['datapath']) return 1 if outpath == "": print >> sys.stderr, "outpath not set" return 1 elif not os.path.exists(outpath): cmd = ["mkdir", "-p", outpath] subprocess.check_call(cmd) paircmpfile = "%s/%s.paircmp" % (g_params['datapath'], g_params['basename']) if myfunc.checkfile(paircmpfile, "paircmpfile") != 0: return 1 (g_params['pfamidDefDict'], g_params['clanidDefDict']) = lcmp.ReadPfamDefFile(pfamDefFile) g_params['seqid2pfamidDict'] = myfunc.ReadFam2SeqidMap(seqid2pfamidMapFile) g_params['pfamid2seqidDict'] = myfunc.ReadFam2SeqidMap(pfamid2seqidMapFile) tmpdir = tempfile.mkdtemp() if g_params['msapath'] == "": g_params['msapath'] = tmpdir if g_params['treepath'] == "": g_params['treepath'] = tmpdir if g_params['pairalnpath'] == "": g_params['pairalnpath'] = tmpdir pairCmpRecordList = [] unprocessedBuffer = "" cntTotalReadInRecord = 0 cntTotalOutputRecord = 0 isEOFreached = False try: fpin = open(paircmpfile, "r") except IOError: print >> sys.stderr, "Failed to open input file %s" % (paircmpfile) return 1 while 1: buff = fpin.read(myfunc.BLOCK_SIZE) if buff == "": isEOFreached = True buff = unprocessedBuffer + buff rdList = [] unprocessedBuffer = lcmp.ReadPairCmpResultFromBuffer(buff, rdList) rdList = FilterPairCmpResult(rdList) 
cntTotalReadInRecord += len(rdList) pairCmpRecordList += rdList if isEOFreached == True: break fpin.close() print "cntTotalReadInRecord =", cntTotalReadInRecord g_params['hdl_seqdb'] = myfunc.MyDB(seqdb) g_params['hdl_topodb'] = myfunc.MyDB(topodb) g_params['OS'] = os.uname()[0] if g_params['OS'].find('Linux') != -1: g_params['CP_EXE'] = "/bin/cp -uf" else: g_params['CP_EXE'] = "/bin/cp -f" if shortid2fullidFile != "": g_params['uniprotAC2FullSeqIDMap'] = myfunc.ReadID2IDMap( shortid2fullidFile) addname = "" if g_params['alignrange'] != 'all': addname += ".%s" % (g_params['alignrange']) dataTable = {} # structure of dataTable # dataTable[pfamid] = {'set_seqid':set(), 'difftopopair':[{'INV':[(id1,id2)]},{'TM2GAP':},{}} # first read in pairCmpRecordList AddAllSeqInPairCmp(dataTable, pairCmpRecordList, g_params['seqid2pfamidDict']) pairInfoFileList = [] for cmpclass in g_params['cmpClassList_mp3_cmpdup'][0:]: ss = "%s/%s_.cmpdup.FULL_ALIGNED.%s.pairinfo.txt" % ( g_params['datapath'], g_params['basename'], cmpclass) pairInfoFileList.append(ss) pairinfoList = ReadPairInfo_cmpclass(ss) AddPairInfo(dataTable, pairinfoList, cmpclass) # print "\n".join(pairInfoFileList) if g_params['isDEBUG']: #{{{ for pfamid in dataTable: print pfamid print "\tset_seqid" print dataTable[pfamid]['set_seqid'] print "\tdifftopopair" for cls in dataTable[pfamid]['difftopopair']: print "\t\t", cls for tup in dataTable[pfamid]['difftopopair'][cls]: print "\t\t\t", tup #}}} WriteHTML(dataTable, outpath) os.system("rm -rf %s" % (tmpdir))