def main():
    """Tally, per input peptide file, how many peptides fall into each
    metadata category, and write the tallies as a tab-delimited matrix
    (one row per file, one column per category)."""
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("pepFile",
                        help="A file containing a list of peptide names",
                        nargs='*')
    parser.add_argument("-n", "--name",
                        help="Column name for peptide names in metadata file",
                        default="CodeName")
    parser.add_argument("-c", "--cat",
                        help="Column name for category of interest in metadata file",
                        default="Species")
    reqArgs = parser.add_argument_group('required arguments')
    reqArgs.add_argument("-o", "--out", help="Output matrix file name",
                         required=True)
    reqArgs.add_argument("-m", "--meta", help="Metadata file name",
                         required=True)
    args = parser.parse_args()

    # Map each peptide name to its category using the metadata file
    pep2cat = io.fileDictHeader(args.meta, args.name, args.cat)

    # counts[category][file] -> number of peptides from `file` in `category`
    counts = defaultdict(dict)
    for fname in args.pepFile:
        for pep in io.fileList(fname, header=False):
            cat = pep2cat[pep]
            counts[cat][fname] = counts[cat].get(fname, 0) + 1

    # Categories become columns, sorted for a stable layout
    allCats = sorted(counts)

    # Write output file
    with open(args.out, "w") as fout:
        fout.write("File\t%s\n" % ("\t".join(allCats)))
        for fname in args.pepFile:
            row = [str(counts[cat].get(fname, 0)) for cat in allCats]
            fout.write("%s\t%s\n" % (fname, "\t".join(row)))
def plotInfo(info, samps):
    """Build the x/y edge meshes and count matrix for a per-position
    coverage heatmap of one alignment.

    info: (geneName, alPos, thisMin, thisMax) tuple; alPos maps a peptide
          name to a record whose third element is an iterable of alignment
          positions it covers (assumed within [thisMin, thisMax] -- TODO confirm).
    samps: list of enriched-peptide file names, one per sample (rows).
    Returns (xmat, ymat, z) -- mesh edge matrices and the count matrix.
    """
    _gene, alPos, lo, hi = info
    positions = range(lo, hi + 1)

    # Per-sample counter: enriched peptides covering each alignment position
    hits = {s: dict.fromkeys(positions, 0) for s in samps}
    for sampFile in samps:
        for pep in io.fileList(sampFile, header=False):
            if pep in alPos:
                for pos in alPos[pep][2]:
                    hits[sampFile][pos] += 1

    # One row of counts per sample, columns ordered by position
    countRows = [[hits[s][pos] for pos in positions] for s in samps]

    # Mesh edges: one more edge than cells along each dimension;
    # y edges are reversed so the first sample plots at the top
    xEdges = np.linspace(lo, hi + 1, num=len(positions) + 1)
    yEdges = np.linspace(1, len(samps) + 1, num=len(samps) + 1)
    xmat = np.array([xEdges] * (len(samps) + 1))
    ymat = np.column_stack([yEdges[::-1]] * (len(positions) + 1))
    return xmat, ymat, np.array(countRows)
def main():
    """Drive a pepSIRF enrichment pipeline: generate normalized matrices,
    Z scores, sample name/pair files, a threshold file, run p_enrich, and
    (if matplotlib is available) summary boxplots.

    Fixes vs. original: the "Only one replicate" message now reports the
    current pair-dict entry instead of stale loop variables; user-facing
    typos corrected ("proivde", "providing", "form which", "Colum-Sum");
    an explicit message is printed when --bins is given but no diff matrix
    exists for Z score calculation.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("-e", "--binary",
                        help="How to call the pepSIRF binary/executable",
                        default="pepsirf")

    inArgs = parser.add_argument_group('input files')
    inArgs.add_argument(
        "-r", "--raw",
        help="Input raw count matrix. This is an optional starting place.")
    inArgs.add_argument(
        "-c", "--colsum",
        help="Input colsum normalized count matrix. This is an optional starting place.")
    inArgs.add_argument(
        "-d", "--diff",
        help="Input diff normalized count matrix. This is an optional starting place.")
    inArgs.add_argument(
        "-f", "--diffratio",
        help="Input diff_ratio normalized count matrix. This is an optional starting place.")
    inArgs.add_argument(
        "-b", "--bins",
        help="Peptide bin file. Required for calculating Z scores.")
    inArgs.add_argument(
        "-z", "--zscore",
        help="Z score matrix. This is an optional starting place.")
    inArgs.add_argument(
        "-n", "--names",
        help="File containing sample names. This is an optional starting place.")
    inArgs.add_argument(
        "-p", "--pairs",
        help="File containing sample pairs. This is an optional starting place.")
    inArgs.add_argument(
        "-t", "--thresh",
        help="Threshold file for pEnrich. This is an optional starting place.")

    threshArgs = parser.add_argument_group('thresholds')
    threshArgs.add_argument(
        "--zThresh", default="6,10",
        help="Z score threshold. Can include up to two floating points separated by a comma.")
    threshArgs.add_argument(
        "--csThresh", default="20",
        help="Column-Sum norm count threshold. Can include up to two floating points separated by a comma.")
    threshArgs.add_argument(
        "--sbdrThresh", default="4",
        help="Negative control ratio threshold. Can include up to two floating points separated by a comma.")
    threshArgs.add_argument(
        '--rawThresh', default="488000",
        help='Total raw read count for a sample to be included in enrichment analyses. Can include up to two floating points separated by a comma.')
    threshArgs.add_argument(
        '--hdi', default=0.95, type=float,
        help='The highest density interval to be used for calculation of mean and stdev in the zscore module.')

    controlArgs = parser.add_argument_group('control info')
    controlArgs.add_argument(
        "--negNormMat",
        help="Alternative colsum normalized matrix from which to obtain the negative controls.")
    controlArgs.add_argument(
        "--negative_id",
        help="Optional approach for identifying negative controls. Provide a unique string at the start of all negative control samples.")
    controlArgs.add_argument(
        "--negative_names",
        help="Optional approach for identifying negative controls. Comma-separated list of negative control sample names.")

    enrichArgs = parser.add_argument_group('enrich options')
    enrichArgs.add_argument(
        "--sEnrich", default=False, action="store_true",
        help="Generate lists of enriched peptides separately for each pulldown. Will actually run p_enrich, but with the same sample specified for each replicate.")
    enrichArgs.add_argument(
        "--inferPairs", default=False, action="store_true",
        help="Infer sample pairs from names. This option assumes names of replicates will be identical with the exception of a final string denoted with a '_'. For example, these names would be considered two replicates of the same sample: VW_100_1X_A and VW_100_1X_B")

    args = parser.parse_args()

    # Create base string for naming output files; each elif strips the
    # suffix characters specific to that matrix type (_CS, _SBD, _SBDR, _Z)
    if args.raw:
        base = ".".join(args.raw.split(".")[:-1])
    elif args.colsum:
        base = ".".join(args.colsum.split(".")[:-1])[:-3]
    elif args.diff:
        base = ".".join(args.diff.split(".")[:-1])[:-4]
    elif args.diffratio:
        base = ".".join(args.diffratio.split(".")[:-1])[:-5]
    elif args.zscore:
        base = ".".join(args.zscore.split(".")[:-1])[:-2]
    else:
        base = None
        print("Not going to be able to do much without a score matrix of some type.")

    # If a raw count matrix is provided, but a colsum norm matrix is NOT,
    # generate the colsum matrix.
    # NOTE(review): commands are built as strings and run with shell=True;
    # file names containing shell metacharacters will break or be unsafe.
    if args.raw and not args.colsum:
        args.colsum = "%s_CS.tsv" % (base)
        cmd = "%s norm -a col_sum -p %s -o %s >> norm.out" % (
            args.binary, args.raw, args.colsum)
        print(cmd)
        subprocess.run(cmd, shell=True)

    if args.colsum:
        if not args.negative_id and not args.negative_names:
            print("You must provide either '--negative_id' or '--negative_names' to allow negative control based normalization.")
        else:
            # Generate string for specifying how to get neg control data
            negInfo = ""
            if args.negNormMat:
                negInfo += ' --negative_control "%s" ' % (args.negNormMat)
            if args.negative_id:
                negInfo += " --negative_id %s " % (args.negative_id)
            if args.negative_names:
                negInfo += " --negative_names %s " % (args.negative_names)

            # Generate other normalized files, if not already provided
            if not args.diff:
                args.diff = "%s_SBD.tsv" % (base)
                cmd = "%s norm -a diff -p %s -o %s %s >> norm.out" % (
                    args.binary, args.colsum, args.diff, negInfo)
                print(cmd)
                subprocess.run(cmd, shell=True)
            if not args.diffratio:
                args.diffratio = "%s_SBDR.tsv" % (base)
                cmd = "%s norm -a diff_ratio -p %s -o %s %s >> norm.out" % (
                    args.binary, args.colsum, args.diffratio, negInfo)
                print(cmd)
                subprocess.run(cmd, shell=True)

    # Generate Z scores (requires a bin file and a diff matrix)
    if args.bins and args.diff:
        args.zscore = "%s_Z-HDI%d.tsv" % (base, int(args.hdi * 100))
        args.znan = "%s_Z-HDI%d.nan" % (base, int(args.hdi * 100))
        cmd = '%s zscore -s %s -o %s -n %s -b "%s" -d %f >> zscore.out' % (
            args.binary, args.diff, args.zscore, args.znan, args.bins, args.hdi)
        print(cmd)
        subprocess.run(cmd, shell=True)
    elif not args.bins:
        print("You must provide '--bins' for Z score calculation.")
    else:
        # Bins were provided but no diff matrix could be generated/located
        print("A diff normalized matrix is needed for Z score calculation.")

    # Generate list of sample names, if not provided, from the first
    # available matrix (raw > colsum > zscore > diff > diffratio)
    if not args.names and base:
        args.names = "%s_SN.tsv" % (base)
        if args.raw:
            cmd = '%s info -i %s -s %s >> info.out' % (args.binary, args.raw, args.names)
            print(cmd)
            subprocess.run(cmd, shell=True)
        elif args.colsum:
            cmd = '%s info -i %s -s %s >> info.out' % (args.binary, args.colsum, args.names)
            print(cmd)
            subprocess.run(cmd, shell=True)
        elif args.zscore:
            cmd = '%s info -i %s -s %s >> info.out' % (args.binary, args.zscore, args.names)
            print(cmd)
            subprocess.run(cmd, shell=True)
        elif args.diff:
            cmd = '%s info -i %s -s %s >> info.out' % (args.binary, args.diff, args.names)
            print(cmd)
            subprocess.run(cmd, shell=True)
        elif args.diffratio:
            cmd = '%s info -i %s -s %s >> info.out' % (args.binary, args.diffratio, args.names)
            print(cmd)
            subprocess.run(cmd, shell=True)
        else:
            print("No file was provided for generating a list of sample names.")

    # Generate pairs file, if not provided
    if not args.pairs and args.names and base:
        sNames = io.fileList(args.names, header=False)
        if args.sEnrich and args.names:
            # Pseudo-pairs: each sample paired with itself
            args.pairs = "%s_pseudoPN.tsv" % (base)
            with open(args.pairs, "w") as fout:
                for sn in sNames:
                    fout.write("%s\t%s\n" % (sn, sn))
        elif args.inferPairs:
            # Group replicates by the name minus its final '_'-delimited token
            pDict = defaultdict(list)
            for each in sNames:
                simple = "_".join(each.split("_")[:-1])
                pDict[simple].append(each)
            args.pairs = "%s_PN.tsv" % (base)
            with open(args.pairs, "w") as fout:
                for k, v in pDict.items():
                    if len(v) == 2:
                        # Exactly two replicates: add them to the pairs file
                        fout.write("%s\n" % ("\t".join(v)))
                    elif len(v) > 2:
                        # More than two replicates: add all possible pairs
                        for a, b in it.combinations(v, 2):
                            fout.write("%s\t%s\n" % (a, b))
                    else:
                        # Fix: report the entry being inspected (k, v[0]),
                        # not stale variables from the earlier loop
                        print("Only one replicate found for %s: %s" % (k, v[0]))
        else:
            print("To run p_enrich module, you must provide one of the following: '--pairs', '--inferPairs', '--sEnrich'")

    # Generate threshold file, one line per available matrix
    if not args.thresh and base:
        args.thresh = "%s_thresh.tsv" % (base)
        with open(args.thresh, "w") as fout:
            if args.zscore:
                fout.write("%s\t%s\n" % (args.zscore, args.zThresh))
            if args.colsum:
                fout.write("%s\t%s\n" % (args.colsum, args.csThresh))
            if args.diffratio:
                fout.write("%s\t%s\n" % (args.diffratio, args.sbdrThresh))

    # Run p_enrich module
    if args.thresh and args.pairs and base:
        enrDir = makeDirName(args)
        if args.raw:
            cmd = '%s p_enrich -t %s -s %s -r %s --raw_score_constraint %s -x _enriched.txt -o %s >> penrich.out' % (
                args.binary, args.thresh, args.pairs, args.raw, args.rawThresh, enrDir)
        else:
            cmd = '%s p_enrich -t %s -s %s -x _enriched.txt -o %s >> penrich.out' % (
                args.binary, args.thresh, args.pairs, enrDir)
        print(cmd)
        subprocess.run(cmd, shell=True)

    # Summary plots (only when matplotlib was importable)
    if matplotReady:
        if args.raw:
            # Generate read counts file
            args.readCounts = "%s_RC.tsv" % (base)
            cmd = '%s info -i %s -c %s >> info.out' % (
                args.binary, args.raw, args.readCounts)
            print(cmd)
            subprocess.run(cmd, shell=True)
            # Read in counts and plot their distribution
            rcD = io.fileDictHeader(args.readCounts, "Sample name", "Sum of probe scores")
            boxplot(list(rcD.values()), "readCountBoxplot.png", args.rawThresh)
        if args.thresh and args.pairs and base:
            enrFiles = glob.glob("%s/*enriched.txt" % (enrDir))
            enrCounts = [len(io.fileList(f, header=False)) for f in enrFiles]
            if len(enrCounts) > 0:
                boxplot(enrCounts, "enrichedCountBoxplot.png", "200")
def main():
    """Generate scatterplots from a delimited data matrix.

    Three modes: --batch (one plot per line of a batch file), --allByAll
    (all pairwise column combinations), or a single plot from -x/-y.

    Fixes vs. original: "scatterlots" typo in -d help; --xLog help wrongly
    described adding the offset to the *y* values.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    reqArgs = parser.add_argument_group('required arguments')
    reqArgs.add_argument('-d', '--data',
                         help='Data matrix for generating scatterplots.',
                         required=True)
    parser.add_argument('-o', '--outfile', help='Name for out file.')
    parser.add_argument('-x', '--xHead', help='Header in data file for x-axis.')
    parser.add_argument('-y', '--yHead', help='Header in data file for y-axis.')
    parser.add_argument('-c', '--color',
                        help='Header name to use to color points in plot.')
    parser.add_argument('--cMap',
                        help='Optional way of mapping the "color" variable in the data matrix to another categorical variable. 3 comma-separated variables should be provided: map file, key header, value header.')
    parser.add_argument('--xLog', default=False, type=float,
                        help="Use if you want x-axis to be shown on a log-scale. Argument provided should be a float to add to the x values before calculating the log value.")
    parser.add_argument('--yLog', default=False, type=float,
                        help="Use if you want y-axis to be shown on a log-scale. Argument provided should be a float to add to the y values before calculating the log value.")
    parser.add_argument('--delim', default="\t",
                        help="Delimiter used in the data file.")
    parser.add_argument('--xLab',
                        help="String for x-label. If not provided, --xHead is used.")
    parser.add_argument('--yLab',
                        help="String for y-label. If not provided, --yHead is used.")
    # parser.add_argument('--width', default=5, type=int, help="Figure width.")
    # parser.add_argument('--height', default=4, type=int, help="Figure height.")
    parser.add_argument('--include', nargs='*',
                        help="Header,Value pairs used to indicate a subset of rows to include.")
    parser.add_argument('--exclude', nargs='*',
                        help="Header,Value pairs used to indicate a subset of rows to exclude.")
    parser.add_argument('--xLegend', default=0.1, type=float,
                        help="x-coordinate to use for color legend.")
    parser.add_argument('--yLegend', default='top',
                        help="Indicate whether you want the legend at the 'top' or 'bottom' of the plot.")
    parser.add_argument('--xeqy', default=False, action="store_true",
                        help="Use if you want an x=y line included in the plot.")
    parser.add_argument('--markerSize', default=10, type=int,
                        help="Size of marker used in plot.")
    parser.add_argument('--alpha', default=0.6, type=float,
                        help="Alpha (transparency) value to use in plot.")
    parser.add_argument('-b', '--batch',
                        help='An alternative way to provide input/output files that allows the generation of multiple plots with a single command. File provided should be tab-delimited, one line per output plot: xHeader, yHeader, outfile, colorHeader. Colorheader column is optional.')
    parser.add_argument('-a', '--allByAll',
                        help='Optional way to specify xHead and yHead. Should be a list of column headers. A plot will be generated for all pairwise comparisons of the columns in this file. Output names will be generated based on the column name.')
    opts = parser.parse_args()

    # Optionally remap the color variable through a key->value lookup file
    if opts.cMap:
        mapF, kHead, vHead = opts.cMap.split(",")
        opts.cMap = io.fileDictHeader(mapF, kHead, vHead)

    # Read in data file
    dataD = io.fileDictFull(opts.data, opts.delim)

    if opts.batch:
        # One plot per batch-file line: xHeader, yHeader, outfile[, colorHeader]
        with open(opts.batch) as fin:
            for line in fin:
                cols = line.rstrip("\n").split("\t")
                opts.xHead = cols[0]
                opts.yHead = cols[1]
                opts.outfile = cols[2]
                if len(cols) == 4:
                    opts.color = cols[3]
                # NOTE(review): when a 3-column line follows a 4-column line,
                # opts.color carries over from the previous plot -- confirm
                # whether that is intended.
                # Make a subset dict that will be manipulated
                subD = {k: dataD[k]
                        for k in opts.xHead.split(",") + opts.yHead.split(",")}
                # Make sure data columns are formatted properly
                prepData(subD, opts)
                # Generate plot
                scatter(subD, opts)
                # Reset labels so the next plot derives its own from headers
                opts.xLab = None
                opts.yLab = None
    elif opts.allByAll:
        # All pairwise combinations of the listed headers
        heads = io.fileList(opts.allByAll, header=False)
        for h1, h2 in it.combinations(heads, 2):
            opts.xHead = h1
            opts.yHead = h2
            opts.outfile = "%s_%s.png" % (h1, h2)
            # Make a subset dict that will be manipulated
            subD = {k: dataD[k]
                    for k in opts.xHead.split(",") + opts.yHead.split(",")}
            # Make sure data columns are formatted properly
            prepData(subD, opts)
            # Generate plot
            scatter(subD, opts)
            # Reset labels
            opts.xLab = None
            opts.yLab = None
    else:
        # Single-plot mode; row filters only apply here
        if opts.include:
            for each in opts.include:
                header, val = each.split(",")
                dataD = onlyInclude(dataD, header, val)
        if opts.exclude:
            for each in opts.exclude:
                header, val = each.split(",")
                dataD = toExclude(dataD, header, val)
        # Make sure data columns are formatted properly
        prepData(dataD, opts)
        # Generate plot
        scatter(dataD, opts)
def main():
    """Plot the effect of read downsampling: cumulative norm difference,
    per-peptide norm-difference boxplots, and (optionally) enriched-peptide
    true/false counts against the full dataset.

    Fixes vs. original: -o help text was a copy-paste duplicate of -d's
    (it described the output option as an input matrix); stray quote in
    -e help; the unused `usage` string is now actually passed to
    OptionParser.
    """
    usage = '%prog [options]'
    p = optparse.OptionParser(usage)
    p.add_option('-d', '--data',
                 help='Downsample data matrix generated by "downsampleReads.py". [None, REQ]')
    p.add_option('-e', '--enrDir',
                 help='Directory containing enriched peptide files, if available. [None, REQ]')
    p.add_option('-o', '--out',
                 help='Base string used to name the output figure files. [None, REQ]')
    opts, args = p.parse_args()

    # Print command used
    print("Command run: '%s'" % (" ".join(sys.argv)))

    # Read in data file; keys are sample labels ("<reads>_<rep>" or "Full")
    # -- presumed from the parsing below, verify against downsampleReads.py
    dataD = mt.parseCounts(opts.data)

    # Cumulative norm diff figure: each subsample vs. the full dataset
    fig, ax = plt.subplots(figsize=(6, 5), facecolor='w')
    x = []
    y = []
    for s, d in dataD.items():
        if s != "Full":
            x.append(int(s.split("_")[0]))
            y.append(cumulativeNormDiff(dataD["Full"], d))
    ax.scatter(x, y, alpha=0.9, zorder=2, s=10)
    ax.set_xlabel("# Reads", fontsize=15)
    ax.set_ylabel("Cumulative Norm Difference", fontsize=15)
    # NOTE(review): Tick.label is deprecated in newer matplotlib (use label1)
    for tick in ax.xaxis.get_major_ticks() + ax.yaxis.get_major_ticks():
        tick.label.set_fontsize(12)
    fig.savefig("%s_cumulativeNormDiff.png" % (opts.out), dpi=300,
                bbox_inches='tight')

    # Norm diff boxplots, replicate "0" only, ordered by read count
    fig, ax = plt.subplots(figsize=(20, 5), facecolor='w')
    raw = []
    for s, d in dataD.items():
        if s != "Full" and s.split("_")[1] == "0":
            raw.append((int(s.split("_")[0]),
                        [dataD["Full"][p] - d[p] for p in d]))
    y = [r[1] for r in sorted(raw)]
    ax.boxplot(y)
    ax.set_xlabel("# Reads", fontsize=15)
    ax.set_ylabel("Norm Difference per Peptide", fontsize=15)
    # Reference line at zero; x-range 1..50 is hard-coded to the expected
    # number of boxplot positions -- TODO confirm
    ax.hlines([0], 1, 50, linestyle=":", color="b")
    for tick in ax.xaxis.get_major_ticks() + ax.yaxis.get_major_ticks():
        tick.label.set_fontsize(12)
    fig.savefig("%s_normDiffBoxplots.png" % (opts.out), dpi=300,
                bbox_inches='tight')

    # Enriched peptide plot: compare each downsampled enrichment list to
    # the full dataset's list
    if opts.enrDir:
        # Read in enriched peptides from full dataset
        fullF = glob.glob("%s/Full~Full*txt" % (opts.enrDir))[0]
        fullEnrD = io.fileEmptyDict(fullF, header=False)
        allFs = glob.glob("%s/*.txt" % (opts.enrDir))
        x = []
        total = []
        true = []
        false = []
        for eachF in allFs:
            if eachF != fullF:
                thisEnrL = io.fileList(eachF, header=False)
                thisTrue = 0
                thisFalse = 0
                for p in thisEnrL:
                    if p in fullEnrD:
                        thisTrue += 1
                    else:
                        thisFalse += 1
                # Read count is parsed from the file name's leading token
                x.append(int(eachF.split("/")[-1].split("~")[0].split("_")[0]))
                total.append(len(thisEnrL))
                true.append(thisTrue)
                false.append(thisFalse)
        fig, ax = plt.subplots(figsize=(6, 5), facecolor='w')
        ax.scatter(x, total, alpha=0.7, zorder=2, s=15, label="Total", color="b")
        ax.scatter(x, true, alpha=0.7, zorder=2, s=15, label="True", color="g")
        ax.scatter(x, false, alpha=0.7, zorder=2, s=15, label="False", color="r")
        ax.set_xlabel("# Reads", fontsize=15)
        ax.set_ylabel("Enriched peptides", fontsize=15)
        for tick in ax.xaxis.get_major_ticks() + ax.yaxis.get_major_ticks():
            tick.label.set_fontsize(12)
        ax.legend()
        fig.savefig("%s_enrichedPeps.png" % (opts.out), dpi=300,
                    bbox_inches='tight')
def main():
    """Plot alignment-position hit counts for selected species.

    Fixes vs. original: the sample-order file is now only read when
    --sampleOrder is provided (previously io.fileList(None, ...) crashed);
    the fallback branch called plotAlignHits with undefined names (data,
    each, mapDict, opts) left over from an older optparse version and
    always raised NameError -- it now fails fast with a clear message.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("pepFile",
                        help="A file containing a list of peptide names",
                        nargs='*')
    parser.add_argument("--name",
                        help="Column name for peptide names in metadata file",
                        default="CodeName")
    parser.add_argument("--sid",
                        help="Column name for species IDs in metadata file",
                        default="SpeciesID")
    parser.add_argument("--species",
                        help="Column name for species names in metadata file",
                        default="Species")
    parser.add_argument(
        '-a', '--alignInfo',
        default="/Volumes/GoogleDrive/Shared drives/LadnerLab/Projects/panviral_pepseq/analysis/alignments/AlignmentInfoCoded.txt",
        help='Contains info for species-level seq alignments.')
    parser.add_argument(
        '--sampleOrder',
        help='Can provide a plain text file with the order you would like the samples to appear in the plot.')
    parser.add_argument(
        '--annots',
        help='File with annotation info. If provided, a graphical representation of the different proteins will also be plotted.')
    reqArgs = parser.add_argument_group('required arguments')
    reqArgs.add_argument("-o", "--out", help="Output matrix file name",
                         required=True)
    reqArgs.add_argument("-m", "--meta", help="Metadata file name",
                         required=True)
    reqArgs.add_argument('-s', '--speciesIDs',
                         help='Comma-separated species IDs of interest.',
                         required=True)
    args = parser.parse_args()

    # Read in info from metadata file
    # NOTE(review): sidD is currently unused; kept for parity with the
    # original -- confirm whether it can be removed.
    sidD = io.fileDictHeader(args.meta, args.name, args.sid)
    id2name = io.fileDictHeader(args.meta, args.sid, args.species)

    # Read in sample order for plots, IF provided (previously read
    # unconditionally, crashing when --sampleOrder was absent)
    sampOrderList = (io.fileList(args.sampleOrder, header=False)
                     if args.sampleOrder else [])

    # Read in alignment data (first line is a header and is skipped)
    alInfo = {}
    with open(args.alignInfo, "r") as fin:
        lc = 0
        for line in fin:
            lc += 1
            if lc > 1:
                cols = line.rstrip("\n").split("\t")
                alInfo[cols[0]] = ["%s/%s" % (cols[1], x) for x in cols[2:]]

    # Read in annotation info, if provided (first line is a header)
    annotD = {}
    if args.annots:
        with open(args.annots, "r") as fin:
            lc = 0
            for line in fin:
                lc += 1
                if lc > 1:
                    cols = line.rstrip("\n").split("\t")
                    if cols[0] not in annotD:
                        annotD[cols[0]] = {}
                    annotD[cols[0]][cols[1]] = [
                        x.split(",") for x in cols[2].split("~")
                    ]

    # Make probe count plots, one per requested species ID
    for sid in args.speciesIDs.split(","):
        if args.sampleOrder:
            plotAlignHits(sampOrderList, sid, alInfo, id2name[sid], annotD,
                          args)
        else:
            # The original fallback referenced variables that do not exist
            # in this argparse version and could never run; exit with a
            # usable error instead of a NameError.
            parser.error("--sampleOrder is required to generate plots.")