def main():
    """Count, for each peptide file, how many peptides belong to each category.

    Peptide names are mapped to categories through the metadata file
    (``--meta``), using the ``--name`` column as key and the ``--cat`` column
    as value. The output (``--out``) is a tab-delimited matrix with one row
    per peptide file and one column per category seen in any file.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("pepFile", nargs='*',
                        help="A file containing a list of peptide names")
    parser.add_argument("-n", "--name", default="CodeName",
                        help="Column name for peptide names in metadata file")
    parser.add_argument("-c", "--cat", default="Species",
                        help="Column name for category of interest in metadata file")

    required = parser.add_argument_group('required arguments')
    required.add_argument("-o", "--out", required=True,
                          help="Output matrix file name")
    required.add_argument("-m", "--meta", required=True,
                          help="Metadata file name")

    args = parser.parse_args()

    # Peptide name -> category, taken from the metadata file
    pep2cat = io.fileDictHeader(args.meta, args.name, args.cat)

    # tallies[category][peptide file] -> number of peptides
    tallies = defaultdict(dict)
    for pepPath in args.pepFile:
        for pepName in io.fileList(pepPath, header=False):
            perFile = tallies[pep2cat[pepName]]
            perFile[pepPath] = perFile.get(pepPath, 0) + 1

    # Columns are the categories in sorted order
    categories = sorted(tallies)

    with open(args.out, "w") as fout:
        fout.write("File\t%s\n" % ("\t".join(categories)))
        for pepPath in args.pepFile:
            # A category never seen in this file is reported as 0
            row = "\t".join(
                str(tallies[cat].get(pepPath, 0)) for cat in categories)
            fout.write("%s\t%s\n" % (pepPath, row))
def main():
    """Drive a pepSIRF analysis from whichever score matrix the user supplies.

    Starting from the first matrix given on the command line (raw, colsum,
    diff, diff_ratio or Z score), the missing downstream files are produced
    by shelling out to the pepSIRF executable (``--binary``):

    * col_sum, diff and diff_ratio normalized matrices
    * a Z score matrix (needs ``--bins``)
    * a sample-name list, a sample-pairs file and a threshold file
    * ``p_enrich`` output

    When the module-level flag ``matplotReady`` is true, boxplots of the
    per-sample read counts and of the enriched-peptide counts are drawn too.
    Nothing is returned; progress and problems are reported with ``print``.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument("-e", "--binary", default="pepsirf",
                        help="How to call the pepSIRF binary/executable")

    inArgs = parser.add_argument_group('input files')
    inArgs.add_argument(
        "-r", "--raw",
        help="Input raw count matrix. This is an optional starting place.")
    inArgs.add_argument(
        "-c", "--colsum",
        help="Input colsum normalized count matrix. This is an optional starting place.")
    inArgs.add_argument(
        "-d", "--diff",
        help="Input diff normalized count matrix. This is an optional starting place.")
    inArgs.add_argument(
        "-f", "--diffratio",
        help="Input diff_ratio normalized count matrix. This is an optional starting place.")
    inArgs.add_argument(
        "-b", "--bins",
        help="Peptide bin file. Required for calculating Z scores.")
    inArgs.add_argument(
        "-z", "--zscore",
        help="Z score matrix. This is an optional starting place.")
    inArgs.add_argument(
        "-n", "--names",
        help="File containing sample names. This is an optional starting place.")
    inArgs.add_argument(
        "-p", "--pairs",
        help="File containing sample pairs. This is an optional starting place.")
    inArgs.add_argument(
        "-t", "--thresh",
        help="Threshold file for pEnrich. This is an optional starting place.")

    threshArgs = parser.add_argument_group('thresholds')
    threshArgs.add_argument(
        "--zThresh", default="6,10",
        help="Z score threshold. Can include up to two floating points separated by a comma.")
    threshArgs.add_argument(
        "--csThresh", default="20",
        help="Colum-Sum norm count threshold. Can include up to two floating points separated by a comma.")
    threshArgs.add_argument(
        "--sbdrThresh", default="4",
        help="Negative control ratio threshold. Can include up to two floating points separated by a comma.")
    threshArgs.add_argument(
        '--rawThresh', default="488000",
        help='Total raw read count for a sample to be included in enrichment analyses. Can include up to two floating points separated by a comma.')
    threshArgs.add_argument(
        '--hdi', default=0.95, type=float,
        help='The highest density interval to be used for calculation of mean and stdev in the zscore module.')

    controlArgs = parser.add_argument_group('control info')
    controlArgs.add_argument(
        "--negNormMat",
        help="Alternative colsum normalized matrix form which to obtain the negative controls.")
    controlArgs.add_argument(
        "--negative_id",
        help="Optional approach for identifying negative controls. Provide a unique string at the start of all negative control samples.")
    controlArgs.add_argument(
        "--negative_names",
        help="Optional approach for identifying negative controls. Comma-separated list of negative control sample names.")

    enrichArgs = parser.add_argument_group('enrich options')
    enrichArgs.add_argument(
        "--sEnrich", default=False, action="store_true",
        help="Generate lists of enriched peptides separately for each pulldown. Will actually run p_enrich, but with the same sample specified for each replicate.")
    enrichArgs.add_argument(
        "--inferPairs", default=False, action="store_true",
        help="Infer sample pairs from names. This option assumes names of replicates will be identical with the exception of a final string denoted with a '_'. For example, these names would be considered two replicates of the same sample: VW_100_1X_A and VW_100_1X_B")

    args = parser.parse_args()

    def runCmd(cmd):
        # Echo every command before running it. The shell is required for
        # the ">>" redirections, so file names are interpolated as given and
        # must not contain shell metacharacters.
        print(cmd)
        subprocess.run(cmd, shell=True)

    # Create base string for output files: the file name without its
    # extension and, for already-normalized inputs, without a fixed number
    # of trailing characters (presumably the "_CS" / "_SBD" / "_SBDR" style
    # suffixes this script appends below).
    if args.raw:
        base = ".".join(args.raw.split(".")[:-1])
    elif args.colsum:
        base = ".".join(args.colsum.split(".")[:-1])[:-3]
    elif args.diff:
        base = ".".join(args.diff.split(".")[:-1])[:-4]
    elif args.diffratio:
        base = ".".join(args.diffratio.split(".")[:-1])[:-5]
    elif args.zscore:
        base = ".".join(args.zscore.split(".")[:-1])[:-2]
    else:
        base = None
        print("Not going to be able to do much without a score matrix of some type.")

    # If a raw count matrix is provided, but a colsum norm matrix is NOT
    if args.raw and not args.colsum:
        args.colsum = "%s_CS.tsv" % (base)
        runCmd("%s norm -a col_sum -p %s -o %s >> norm.out" %
               (args.binary, args.raw, args.colsum))

    if args.colsum:
        if not args.negative_id and not args.negative_names:
            print("You must proivde either '--negative_id' or '--negative_names' to allow negative control based normalization.")
        else:
            # Generate string for specifying how to get neg control data
            negInfo = ""
            if args.negNormMat:
                negInfo += ' --negative_control "%s" ' % (args.negNormMat)
            if args.negative_id:
                negInfo += " --negative_id %s " % (args.negative_id)
            if args.negative_names:
                negInfo += " --negative_names %s " % (args.negative_names)

            # Generate other normalized files
            if not args.diff:
                args.diff = "%s_SBD.tsv" % (base)
                runCmd("%s norm -a diff -p %s -o %s %s >> norm.out" %
                       (args.binary, args.colsum, args.diff, negInfo))

            if not args.diffratio:
                args.diffratio = "%s_SBDR.tsv" % (base)
                runCmd("%s norm -a diff_ratio -p %s -o %s %s >> norm.out" %
                       (args.binary, args.colsum, args.diffratio, negInfo))

    if args.bins and args.diff:
        # Generate Z scores
        hdiPct = int(args.hdi * 100)
        args.zscore = "%s_Z-HDI%d.tsv" % (base, hdiPct)
        args.znan = "%s_Z-HDI%d.nan" % (base, hdiPct)
        runCmd('%s zscore -s %s -o %s -n %s -b "%s" -d %f >> zscore.out' %
               (args.binary, args.diff, args.zscore, args.znan, args.bins,
                args.hdi))
    elif not args.bins:
        print("You must proivde '--bins' for Z score calculation.")

    # Generate list of sample names, if not provided
    if not args.names and base:
        args.names = "%s_SN.tsv" % (base)
        # Use the first matrix that is available, in this order of preference
        infoSource = (args.raw or args.colsum or args.zscore or args.diff
                      or args.diffratio)
        if infoSource:
            runCmd('%s info -i %s -s %s >> info.out' %
                   (args.binary, infoSource, args.names))
        else:
            print("No file was providing for generating a list of sample names.")

    # Generate pairs file, if not provided
    if not args.pairs and args.names and base:
        sNames = io.fileList(args.names, header=False)

        if args.sEnrich:
            # Pair every sample with itself
            args.pairs = "%s_pseudoPN.tsv" % (base)
            with open(args.pairs, "w") as fout:
                for sn in sNames:
                    fout.write("%s\t%s\n" % (sn, sn))

        elif args.inferPairs:
            # Group replicates by everything before the final "_" in the name
            pDict = defaultdict(list)
            for each in sNames:
                simple = "_".join(each.split("_")[:-1])
                pDict[simple].append(each)

            args.pairs = "%s_PN.tsv" % (base)
            with open(args.pairs, "w") as fout:
                for k, v in pDict.items():
                    if len(v) == 2:
                        # Exactly two replicates: write them as one pair
                        fout.write("%s\n" % ("\t".join(v)))
                    elif len(v) > 2:
                        # More than two replicates: write all possible pairs
                        for a, b in it.combinations(v, 2):
                            fout.write("%s\t%s\n" % (a, b))
                    else:
                        # Report the group actually being examined (k, v),
                        # not the leftover variables of the grouping loop
                        print("Only one replicate found for %s: %s" %
                              (k, v[0]))

        else:
            print("To run p_enrich module, you must provide one of the following: '--pairs', '--inferPairs', '--sEnrich'")

    # Generate threshold file: one "matrix<TAB>threshold" line per matrix
    if not args.thresh and base:
        args.thresh = "%s_thresh.tsv" % (base)
        with open(args.thresh, "w") as fout:
            if args.zscore:
                fout.write("%s\t%s\n" % (args.zscore, args.zThresh))
            if args.colsum:
                fout.write("%s\t%s\n" % (args.colsum, args.csThresh))
            if args.diffratio:
                fout.write("%s\t%s\n" % (args.diffratio, args.sbdrThresh))

    # Run p_enrich module
    if args.thresh and args.pairs and base:
        enrDir = makeDirName(args)
        if args.raw:
            cmd = '%s p_enrich -t %s -s %s -r %s --raw_score_constraint %s -x _enriched.txt -o %s >> penrich.out' % (
                args.binary, args.thresh, args.pairs, args.raw,
                args.rawThresh, enrDir)
        else:
            cmd = '%s p_enrich -t %s -s %s -x _enriched.txt -o %s >> penrich.out' % (
                args.binary, args.thresh, args.pairs, enrDir)
        runCmd(cmd)

    if matplotReady:
        if args.raw:
            # Generate read counts file
            args.readCounts = "%s_RC.tsv" % (base)
            runCmd('%s info -i %s -c %s >> info.out' %
                   (args.binary, args.raw, args.readCounts))

            # Read in counts
            rcD = io.fileDictHeader(args.readCounts, "Sample name",
                                    "Sum of probe scores")
            boxplot(list(rcD.values()), "readCountBoxplot.png", args.rawThresh)

        # Same condition as the p_enrich step above, so enrDir is defined
        if args.thresh and args.pairs and base:
            enrFiles = glob.glob("%s/*enriched.txt" % (enrDir))
            enrCounts = [len(io.fileList(f, header=False)) for f in enrFiles]
            if len(enrCounts) > 0:
                boxplot(enrCounts, "enrichedCountBoxplot.png", "200")
def main():
    """Generate one or more scatter plots from columns of a data matrix.

    Three modes are supported, chosen from the command-line options:

    * ``--batch``: one plot per non-blank line of a tab-delimited file
      (xHeader, yHeader, outfile and an optional colorHeader).
    * ``--allByAll``: one plot for every pairwise combination of the column
      headers listed in the given file.
    * otherwise: a single plot described by ``--xHead`` / ``--yHead`` /
      ``--outfile``, after applying ``--include`` / ``--exclude`` filters.
    """
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    reqArgs = parser.add_argument_group('required arguments')
    reqArgs.add_argument('-d', '--data', help='Data matrix for generating scatterlots.', required=True)

    parser.add_argument('-o', '--outfile', help='Name for out file.')
    parser.add_argument('-x', '--xHead', help='Header in data file for x-axis.')
    parser.add_argument('-y', '--yHead', help='Header in data file for y-axis.')
    parser.add_argument('-c', '--color', help='Header name to use to color points in plot.')
    parser.add_argument('--cMap', help='Optional way of mapping the "color" variable in the data matrix to another categorical variable. 3 comma-separated variables should be provided: map file, key header, value header.')
    # The original --xLog help text said the float is added to the "y" values
    parser.add_argument('--xLog', default=False, type=float, help="Use if you want x-axis to be shown on a log-scale. Argument provided should be a float to add to the x values before calculating the log value.")
    parser.add_argument('--yLog', default=False, type=float, help="Use if you want y-axis to be shown on a log-scale. Argument provided should be a float to add to the y values before calculating the log value.")
    parser.add_argument('--delim', default="\t", help="Delimiter used in the data file.")
    parser.add_argument('--xLab', help="String for x-label. If not provided, --xHead is used.")
    parser.add_argument('--yLab', help="String for y-label. If not provided, --yHead is used.")
    parser.add_argument('--include', help="Header,Value pairs used to indicate a subset of rows to include.", nargs='*')
    parser.add_argument('--exclude', help="Header,Value pairs used to indicate a subset of rows to exclude.", nargs='*')
    parser.add_argument('--xLegend', default=0.1, type=float, help="x-coordinate to use for color legend.")
    parser.add_argument('--yLegend', default='top', help="Indicate whether you want the legend at the 'top' or 'bottom' of the plot.")
    parser.add_argument('--xeqy', default=False, action="store_true", help="Use if you want an x=y line included in the plot.")
    parser.add_argument('--markerSize', default=10, type=int, help="Size of marker used in plot.")
    parser.add_argument('--alpha', default=0.6, type=float, help="Alpha (transparency) value to use in plot.")
    parser.add_argument('-b', '--batch', help='An alternative way to provide input/output files that allows the generation of multiple plots with a single command. File provided should be tab-delimited, one line per output plot: xHeader, yHeader, outfile, colorHeader. Colorheader column is optional.')
    parser.add_argument('-a', '--allByAll', help='Optional way to specify xHead and yHead. Should be a list of column headers. A plot will be generated for all pairwise comparisons of the columns in this file. Output names will be generated based on the column name.')

    opts = parser.parse_args()

    # Replace the "mapFile,keyHeader,valueHeader" string with the mapping
    # read from that file
    if opts.cMap:
        mapF, kHead, vHead = opts.cMap.split(",")
        opts.cMap = io.fileDictHeader(mapF, kHead, vHead)

    # Read in data file
    dataD = io.fileDictFull(opts.data, opts.delim)

    def plotCurrentPair():
        # Plot the columns currently named by opts.xHead / opts.yHead
        # (each may be a comma-separated list of headers).
        subD = {k: dataD[k]
                for k in opts.xHead.split(",") + opts.yHead.split(",")}
        # Make sure data columns are formatted properly
        prepData(subD, opts)
        # Generate plot
        scatter(subD, opts)
        # Reset labels so they are not carried over to the next plot
        opts.xLab = None
        opts.yLab = None

    if opts.batch:
        with open(opts.batch) as fin:
            for line in fin:
                # Blank lines (e.g. a trailing newline) describe no plot
                if not line.strip():
                    continue
                cols = line.rstrip("\n").split("\t")
                opts.xHead = cols[0]
                opts.yHead = cols[1]
                opts.outfile = cols[2]
                # NOTE(review): a color header, once set, stays in effect for
                # later lines that have no fourth column - confirm intended.
                if len(cols) == 4:
                    opts.color = cols[3]
                plotCurrentPair()

    elif opts.allByAll:
        heads = io.fileList(opts.allByAll, header=False)
        for h1, h2 in it.combinations(heads, 2):
            opts.xHead = h1
            opts.yHead = h2
            opts.outfile = "%s_%s.png" % (h1, h2)
            plotCurrentPair()

    else:
        # Row filters are given as "Header,Value" strings
        if opts.include:
            for each in opts.include:
                header, val = each.split(",")
                dataD = onlyInclude(dataD, header, val)

        if opts.exclude:
            for each in opts.exclude:
                header, val = each.split(",")
                dataD = toExclude(dataD, header, val)

        # Make sure data columns are formatted properly
        prepData(dataD, opts)

        # Generate plot
        scatter(dataD, opts)
def main():
    """Plot alignment hits for each requested species ID.

    Reads the metadata file (``--meta``), the sample order (``--sampleOrder``),
    the per-species alignment info (``--alignInfo``) and, optionally, protein
    annotations (``--annots``), then calls ``plotAlignHits`` once for every
    ID in the comma-separated ``--speciesIDs`` list.

    ``--sampleOrder`` is effectively mandatory: the former fallback used
    names that are not defined in this function (``data``, ``mapDict``,
    ``opts``), so it could never run. A missing ``--sampleOrder`` is now
    reported as a normal usage error.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument("pepFile", nargs='*',
                        help="A file containing a list of peptide names")
    parser.add_argument("--name", default="CodeName",
                        help="Column name for peptide names in metadata file")
    parser.add_argument("--sid", default="SpeciesID",
                        help="Column name for species IDs in metadata file")
    parser.add_argument("--species", default="Species",
                        help="Column name for species names in metadata file")
    parser.add_argument(
        '-a', '--alignInfo',
        default="/Volumes/GoogleDrive/Shared drives/LadnerLab/Projects/panviral_pepseq/analysis/alignments/AlignmentInfoCoded.txt",
        help='Contains info for species-level seq alignments.')
    parser.add_argument(
        '--sampleOrder',
        help='Can provide a plain text file with the order you would like the samples to appear in the plot.')
    parser.add_argument(
        '--annots',
        help='File with annotation info. If provided, a graphical representation of the different proteins with also be plotted.')

    reqArgs = parser.add_argument_group('required arguments')
    reqArgs.add_argument("-o", "--out", required=True,
                         help="Output matrix file name")
    reqArgs.add_argument("-m", "--meta", required=True,
                         help="Metadata file name")
    reqArgs.add_argument('-s', '--speciesIDs', required=True,
                         help='Comma-separated species IDs of interest.')

    args = parser.parse_args()

    # Without a sample order there is no sample list to plot (see docstring)
    if not args.sampleOrder:
        parser.error("--sampleOrder must be provided: no other source of "
                     "sample names is available.")

    # Read in info from metadata file
    # NOTE(review): sidD is not referenced again in this function.
    sidD = io.fileDictHeader(args.meta, args.name, args.sid)
    id2name = io.fileDictHeader(args.meta, args.sid, args.species)

    # Read in sample order for plots
    sampOrderList = io.fileList(args.sampleOrder, header=False)

    # Read in alignment data: the first line is a header; every other line
    # maps column 0 to "<column 1>/<x>" for each remaining column x
    alInfo = {}
    with open(args.alignInfo, "r") as fin:
        next(fin, None)
        for line in fin:
            cols = line.rstrip("\n").split("\t")
            alInfo[cols[0]] = ["%s/%s" % (cols[1], x) for x in cols[2:]]

    # Read in annotation info, if provided: annotD[col0][col1] is a list of
    # comma-split items taken from the "~"-separated third column
    annotD = {}
    if args.annots:
        with open(args.annots, "r") as fin:
            next(fin, None)
            for line in fin:
                cols = line.rstrip("\n").split("\t")
                annotD.setdefault(cols[0], {})[cols[1]] = [
                    x.split(",") for x in cols[2].split("~")
                ]

    # Make probe count plots, one per requested species ID
    for sid in args.speciesIDs.split(","):
        plotAlignHits(sampOrderList, sid, alInfo, id2name[sid], annotD, args)
def main():
    """Cluster enriched peptides per protein and write aligned fasta files.

    Each positional argument is a file of peptide names. For every file, the
    non-control peptides are grouped by protein, clustered with
    ``clustProbes`` and each cluster is aligned and written to ``--outDir``.
    With ``--master``, peptides from all files are pooled and the same
    procedure writes to the master directory; if ``--plot`` is also given, a
    heat map is drawn for every master cluster.
    """
    usage = '%prog [options] probes1.txt [probes2.txt ...]'
    # Pass the usage string on so that --help actually shows it
    p = optparse.OptionParser(usage=usage)
    p.add_option('-m', '--meta', help='Metadata file. [None, REQ]')
    p.add_option(
        '-o', "--outDir", default="pepAligned",
        help='Name for dirctory in which output files will be generated. This Dirctory should NOT exist already. [pepAligned]')
    p.add_option(
        "--master",
        help='Use this option to provide a directory name to contain output alignments that will combine peptides enriched in all samples. [OPT]')
    p.add_option(
        "--plot",
        help='If this option is used, along with some type of matrix, then heat map plots will be generated for each alignment. [OPT]')
    p.add_option("--minToPlot", default=0, type="int",
                 help='Minimum score to include in plots. [0]')
    p.add_option(
        "--plotLog", default=False, action="store_true",
        help='Use this flag if you want the y-axis for plots to be on the log scale. [False]')

    opts, args = p.parse_args()

    # The metadata file is documented as required; fail with a usage message
    # instead of an error from the file reader
    if not opts.meta:
        p.error("A metadata file must be provided with -m/--meta.")

    # Check for output directory and create IF it doesn't already exist
    if not os.path.isdir(opts.outDir):
        os.mkdir(opts.outDir)

    # If master alignments are requested, generate output directory and
    # create dictionary to hold the pooled peptides
    if opts.master:
        if not os.path.isdir(opts.master):
            os.mkdir(opts.master)
        masterByProt = defaultdict(list)

    # Read in info from metadata file
    pepD = io.fileDictHeader(opts.meta, "CodeName", "Peptide")
    catD = io.fileDictHeader(opts.meta, "CodeName", "Category")
    protD = io.fileDictHeader(opts.meta, "CodeName", "Protein")
    startD = io.fileDictHeader(opts.meta, "CodeName", "AlignStart")
    startD = {k: int(v) for k, v in startD.items() if v}
    stopD = io.fileDictHeader(opts.meta, "CodeName", "AlignStop")
    stopD = {k: int(v) for k, v in stopD.items() if v}

    # Read in score matrix for generating heat maps, if provided
    if opts.plot:
        scoreDict = parseCounts(opts.plot)
        print("Read scores for plots")

    def writeClusters(pepList, outStr, outDir, withHeatMaps=False):
        # Cluster one protein's peptides, then align and write every cluster
        clu = clustProbes(pepList, pepD, startD, stopD)
        print(len(clu))

        # Check clusters
        for a, b in clu.items():
            if set(a) != set(b):
                print("Not all starting places resulted in the same cluster!!!",
                      a, sorted(b))

        for c in clu:
            names, seqs, bounds = alignSeqs({
                x: [pepD[x], list(range(startD[x], stopD[x]))]
                for x in c
            })
            # Write aligned fasta file
            write_fasta(names, seqs,
                        "%s/%s_%s.fasta" % (outDir, outStr, bounds))

            # Generate heat map, limited to the peptides of this cluster
            if withHeatMaps:
                heatMap(
                    names, seqs, {
                        k: {p: s for p, s in v.items() if p in c}
                        for k, v in scoreDict.items()
                    }, "%s/%s_%s.pdf" % (outDir, outStr, bounds),
                    opts.minToPlot, opts.plotLog)

    # Step through each list of probes provided
    for each in args:
        print(each)

        # Read in peptides of interest
        peps = filelist(each)
        print(len(peps))

        # Make separate lists of peptides from different proteins
        byProt = defaultdict(list)
        for pep in peps:
            if catD[pep] != "Control":
                byProt[protD[pep]].append(pep)
                if opts.master:
                    masterByProt[protD[pep]].append(pep)

        # Output names combine the protein and the input file's base name
        fileStem = ".".join(each.split("/")[-1].split(".")[:-1])
        for prot, pL in byProt.items():
            writeClusters(pL, "%s_%s" % (prot, fileStem), opts.outDir)

    # Generate master output, if requested
    if opts.master:
        for prot, pL in masterByProt.items():
            writeClusters(pL, "%s_master" % (prot), opts.master,
                          withHeatMaps=bool(opts.plot))
def main():
    """Draw one heat map per peptide cluster, pooling all samples.

    ``--inputInfo`` maps sample names to files of enriched peptide names.
    Non-control peptides from every sample are pooled by protein, clustered
    with ``clustProbes`` and aligned; for each cluster a heat map is written
    to ``--outDir`` in which every sample's enriched peptides carry the mean
    score (from the ``--plot`` matrix) of the replicates whose names start
    with the sample name. Peptides not enriched in a sample get 0.0001.
    With ``--addNegNorm``, the row labels are prefixed with the mean
    normalized read count of the negative controls.
    """
    usage = '%prog [options]'
    # Pass the usage string on so that --help actually shows it
    p = optparse.OptionParser(usage=usage)
    p.add_option('-i', '--inputInfo', help='Tab-delimited file indicating the sample names and enriched probe lists. If you want replicates to be averaged, probe name as a unique starting string common to all replicates. [None, REQ]')
    p.add_option('-m', '--meta', help='Metadata file. [None, REQ]')
    p.add_option('-o', "--outDir", default="pepAligned", help='Name for dirctory in which output files will be generated. This Dirctory should NOT exist already. [pepAligned]')
    p.add_option("--plot", help='If this option is used, along with some type of matrix, then heat map plots will be generated for each alignment. [None, REQ]')
    p.add_option("--minToPlot", default=1, type="float", help='Minimum score to include in plots. [1]')
    p.add_option("--plotLog", default=False, action="store_true", help='Use this flag if you want the y-axis for plots to be on the log scale. [False]')
    p.add_option("--addNegNorm", help='Use this flag if you want to add info about the norm read counts for negative controls y-axis labels. Argument should be tab-delimited file. First column should be comma-sep list of negative control names, the second the name of the norm read count matrix. [None, OPT]')

    opts, args = p.parse_args()

    # These options are documented as required and are used unconditionally
    # below; report a usage error if any is missing
    for optName, flag in (("inputInfo", "-i/--inputInfo"),
                          ("meta", "-m/--meta"),
                          ("plot", "--plot")):
        if not getattr(opts, optName):
            p.error("Option %s is required." % flag)

    # Check for output directory and create IF it doesn't already exist
    if not os.path.isdir(opts.outDir):
        os.mkdir(opts.outDir)

    # Get neg norm counts, if provided. Only the first line of the file is
    # used: negative control names, then the norm read count matrix.
    normNeg = {}
    if opts.addNegNorm:
        with open(opts.addNegNorm, "r") as fin:
            firstLine = fin.readline()
        if firstLine:
            cols = firstLine.rstrip("\n").split("\t")
            negNames = cols[0].split(",")
            normDict = parseCounts(cols[1])
            for pep in normDict[negNames[0]]:
                normNeg[pep] = np.mean([normDict[x][pep] for x in negNames])

    # Read in info from metadata file
    pepD = io.fileDictHeader(opts.meta, "CodeName", "Peptide")
    catD = io.fileDictHeader(opts.meta, "CodeName", "Category")
    protD = io.fileDictHeader(opts.meta, "CodeName", "Protein")
    startD = io.fileDictHeader(opts.meta, "CodeName", "AlignStart")
    startD = {k: int(v) for k, v in startD.items() if v}
    stopD = io.fileDictHeader(opts.meta, "CodeName", "AlignStop")
    stopD = {k: int(v) for k, v in stopD.items() if v}

    # Read in score matrix for generating heat maps
    scoreDict = parseCounts(opts.plot)

    # Reads in names and probe lists
    enrichD = io.fileDictHeader(opts.inputInfo, "Sample", "PepFile")

    # Generate replicate lists for each name provided
    repD = {s: [n for n in scoreDict if n.startswith(s)] for s in enrichD}

    # Step through each list of probes once, collecting both the pooled
    # per-protein peptide lists and each sample's replicate-averaged scores,
    # so the files are not re-read for every cluster
    masterByProt = defaultdict(list)
    sampleScores = {}
    for sName, pepFile in enrichD.items():
        scores = sampleScores[sName] = {}
        for pep in filelist(pepFile):
            if catD[pep] != "Control":
                masterByProt[protD[pep]].append(pep)
                avgV = np.mean([scoreDict[x][pep] for x in repD[sName]])
                # Cap infinite averages at a large finite value
                if avgV == float('inf'):
                    avgV = 10000
                scores[pep] = avgV

    for prot, pL in masterByProt.items():
        clu = clustProbes(pL, pepD, startD, stopD)

        # Check clusters
        for a, b in clu.items():
            if set(a) != set(b):
                print("Not all starting places resulted in the same cluster!!!", a, sorted(b))

        # String for output files
        outStr = "%s_master" % (prot)

        for c in clu:
            names, seqs, bounds = alignSeqs({x: [pepD[x], list(range(startD[x], stopD[x]))] for x in c})

            # Heat-map data: every cluster member starts at 0.0001 and is
            # overwritten where the sample has it enriched. Only cluster
            # members are included, so peptides from other clusters or
            # proteins do not leak into this plot.
            members = set(c)
            dta = {}
            for sN in repD:
                row = {pep: 0.0001 for pep in c}
                row.update((pep, val) for pep, val in sampleScores[sN].items()
                           if pep in members)
                dta[sN] = row

            if opts.addNegNorm:
                # Prefix each row label with the negative-control mean
                labels = ["%.0f %s" % (normNeg[names[i]], s)
                          for i, s in enumerate(seqs)]
            else:
                labels = seqs

            heatMap(names, labels, dta,
                    "%s/%s_%s.pdf" % (opts.outDir, outStr, bounds),
                    opts.minToPlot, opts.plotLog)