Пример #1
0
def main():
    """Tabulate, per peptide-list file, how many peptides fall in each category.

    Reads the command line, looks every peptide of every input list up in
    the metadata-derived dictionary, and writes a tab-delimited matrix to
    ``--out`` with one row per input file and one column per category.
    """
    cli = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    cli.add_argument("pepFile",
                     nargs='*',
                     help="A file containing a list of peptide names")
    cli.add_argument("-n", "--name",
                     default="CodeName",
                     help="Column name for peptide names in metadata file")
    cli.add_argument("-c", "--cat",
                     default="Species",
                     help="Column name for category of interest in metadata file")

    required = cli.add_argument_group('required arguments')
    required.add_argument("-o", "--out",
                          required=True,
                          help="Output matrix file name")
    required.add_argument("-m", "--meta",
                          required=True,
                          help="Metadata file name")

    args = cli.parse_args()

    # Lookup built from the metadata file using the --name and --cat columns
    # (presumably peptide name -> category; verify against io.fileDictHeader).
    pep2cat = io.fileDictHeader(args.meta, args.name, args.cat)

    # tallies[category][peptide file] -> number of peptides seen
    tallies = defaultdict(dict)
    for pepPath in args.pepFile:
        for pepName in io.fileList(pepPath, header=False):
            perFile = tallies[pep2cat[pepName]]
            perFile[pepPath] = perFile.get(pepPath, 0) + 1

    # Columns appear in sorted category order
    categories = sorted(tallies)

    with open(args.out, "w") as fout:
        fout.write("File\t%s\n" % ("\t".join(categories)))
        for pepPath in args.pepFile:
            # A category never seen in this file is reported as 0
            row = [str(tallies[cat].get(pepPath, 0)) for cat in categories]
            fout.write("%s\t%s\n" % (pepPath, "\t".join(row)))
Пример #2
0
def main():
    """Drive a pepSIRF analysis from whichever matrices are supplied.

    Starting from the files given on the command line, each missing
    downstream file is produced by shelling out to the pepSIRF binary
    (``--binary``): col_sum, diff and diff_ratio normalized matrices,
    a Z score matrix, a sample-name list, a sample-pairs file, a
    threshold file and finally the ``p_enrich`` output directory.
    Every shell command is printed before it is executed.

    When the module-level flag ``matplotReady`` is true and a raw matrix
    was given, boxplots of read counts and enriched-peptide counts are
    drawn as well.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument("-e",
                        "--binary",
                        help="How to call the pepSIRF binary/executable",
                        default="pepsirf")

    inArgs = parser.add_argument_group('input files')
    inArgs.add_argument(
        "-r",
        "--raw",
        help="Input raw count matrix. This is an optional starting place.")
    inArgs.add_argument(
        "-c",
        "--colsum",
        help=
        "Input colsum normalized count matrix. This is an optional starting place."
    )
    inArgs.add_argument(
        "-d",
        "--diff",
        help=
        "Input diff normalized count matrix. This is an optional starting place."
    )
    inArgs.add_argument(
        "-f",
        "--diffratio",
        help=
        "Input diff_ratio normalized count matrix. This is an optional starting place."
    )
    inArgs.add_argument(
        "-b",
        "--bins",
        help="Peptide bin file. Required for calculating Z scores.")
    inArgs.add_argument(
        "-z",
        "--zscore",
        help="Z score matrix. This is an optional starting place.")
    inArgs.add_argument(
        "-n",
        "--names",
        help="File containing sample names. This is an optional starting place."
    )
    inArgs.add_argument(
        "-p",
        "--pairs",
        help="File containing sample pairs. This is an optional starting place."
    )
    inArgs.add_argument(
        "-t",
        "--thresh",
        help="Threshold file for pEnrich. This is an optional starting place.")

    threshArgs = parser.add_argument_group('thresholds')
    threshArgs.add_argument(
        "--zThresh",
        default="6,10",
        help=
        "Z score threshold. Can include up to two floating points separated by a comma."
    )
    threshArgs.add_argument(
        "--csThresh",
        default="20",
        help=
        "Colum-Sum norm count threshold. Can include up to two floating points separated by a comma."
    )
    threshArgs.add_argument(
        "--sbdrThresh",
        default="4",
        help=
        "Negative control ratio threshold. Can include up to two floating points separated by a comma."
    )
    threshArgs.add_argument(
        '--rawThresh',
        default="488000",
        help=
        'Total raw read count for a sample to be included in enrichment analyses. Can include up to two floating points separated by a comma.'
    )
    threshArgs.add_argument(
        '--hdi',
        default=0.95,
        type=float,
        help=
        'The highest density interval to be used for calculation of mean and stdev in the zscore module.'
    )

    controlArgs = parser.add_argument_group('control info')
    controlArgs.add_argument(
        "--negNormMat",
        help=
        "Alternative colsum normalized matrix form which to obtain the negative controls."
    )
    controlArgs.add_argument(
        "--negative_id",
        help=
        "Optional approach for identifying negative controls. Provide a unique string at the start of all negative control samples."
    )
    controlArgs.add_argument(
        "--negative_names",
        help=
        "Optional approach for identifying negative controls. Comma-separated list of negative control sample names."
    )

    enrichArgs = parser.add_argument_group('enrich options')
    enrichArgs.add_argument(
        "--sEnrich",
        default=False,
        action="store_true",
        help=
        "Generate lists of enriched peptides separately for each pulldown. Will actually run p_enrich, but with the same sample specified for each replicate."
    )
    enrichArgs.add_argument(
        "--inferPairs",
        default=False,
        action="store_true",
        help=
        "Infer sample pairs from names. This option assumes names of replicates will be identical with the exception of a final string denoted with a '_'. For example, these names would be considered two replicates of the same sample: VW_100_1X_A and VW_100_1X_B"
    )

    args = parser.parse_args()

    def runShell(cmd):
        """Echo *cmd*, then execute it through the shell."""
        # The commands use ">>" redirection, so a shell is required.
        print(cmd)
        subprocess.run(cmd, shell=True)

    # Create base string for output files: the first matrix provided, minus
    # its extension and minus the type suffix (e.g. "_CS") of that matrix.
    base = None
    for matrix, suffixLen in ((args.raw, 0), (args.colsum, 3),
                              (args.diff, 4), (args.diffratio, 5),
                              (args.zscore, 2)):
        if matrix:
            base = ".".join(matrix.split(".")[:-1])
            if suffixLen:
                base = base[:-suffixLen]
            break
    else:
        print(
            "Not going to be able to do much without a score matrix of some type."
        )

    # If a raw count matrix is provided, but a colsum norm matrix is NOT
    if args.raw and not args.colsum:
        args.colsum = "%s_CS.tsv" % (base)
        runShell("%s norm -a col_sum -p %s -o %s >> norm.out" %
                 (args.binary, args.raw, args.colsum))

    if args.colsum:

        if not args.negative_id and not args.negative_names:
            print(
                "You must proivde either '--negative_id' or '--negative_names' to allow negative control based normalization."
            )

        else:

            # Generate string for specifying how to get neg control data
            negInfo = ""
            if args.negNormMat:
                negInfo += ' --negative_control "%s" ' % (args.negNormMat)
            if args.negative_id:
                negInfo += " --negative_id %s " % (args.negative_id)
            if args.negative_names:
                negInfo += " --negative_names %s " % (args.negative_names)

            # Generate other normalized files
            if not args.diff:
                args.diff = "%s_SBD.tsv" % (base)
                runShell("%s norm -a diff -p %s -o %s %s >> norm.out" %
                         (args.binary, args.colsum, args.diff, negInfo))

            if not args.diffratio:
                args.diffratio = "%s_SBDR.tsv" % (base)
                runShell("%s norm -a diff_ratio -p %s -o %s %s >> norm.out" %
                         (args.binary, args.colsum, args.diffratio, negInfo))

    if args.bins and args.diff:
        # Generate Z scores (this replaces any --zscore given on the command line)
        args.zscore = "%s_Z-HDI%d.tsv" % (base, int(args.hdi * 100))
        args.znan = "%s_Z-HDI%d.nan" % (base, int(args.hdi * 100))
        runShell('%s zscore -s %s -o %s -n %s -b "%s" -d %f >> zscore.out' %
                 (args.binary, args.diff, args.zscore, args.znan, args.bins,
                  args.hdi))

    elif not args.bins:
        print("You must proivde '--bins' for Z score calculation.")

    # Generate list of sample names, if not provided
    if not args.names and base:
        args.names = "%s_SN.tsv" % (base)
        # Use the first available matrix, in this order of preference
        infoSource = next(
            (m for m in (args.raw, args.colsum, args.zscore, args.diff,
                         args.diffratio) if m), None)
        if infoSource:
            runShell('%s info -i %s -s %s >> info.out' %
                     (args.binary, infoSource, args.names))
        else:
            print(
                "No file was providing for generating a list of sample names.")

    # Generate pairs file, if not provided
    if not args.pairs and args.names and base:
        sNames = io.fileList(args.names, header=False)

        if args.sEnrich:
            # Pair every sample with itself
            args.pairs = "%s_pseudoPN.tsv" % (base)
            with open(args.pairs, "w") as fout:
                for sn in sNames:
                    fout.write("%s\t%s\n" % (sn, sn))

        elif args.inferPairs:
            # Group samples by their name minus the final "_"-delimited field
            pDict = defaultdict(list)
            for sn in sNames:
                pDict["_".join(sn.split("_")[:-1])].append(sn)

            args.pairs = "%s_PN.tsv" % (base)
            with open(args.pairs, "w") as fout:
                for prefix, reps in pDict.items():
                    if len(reps) == 2:
                        # Exactly two replicates: write them as one pair
                        fout.write("%s\n" % ("\t".join(reps)))
                    elif len(reps) > 2:
                        # More than two replicates: write all possible pairs
                        for a, b in it.combinations(reps, 2):
                            fout.write("%s\t%s\n" % (a, b))
                    else:
                        # Report the group actually being examined, not
                        # whatever was left over from the grouping loop.
                        print("Only one replicate found for %s: %s" %
                              (prefix, reps[0]))

        else:
            print(
                "To run p_enrich module, you must provide one of the following: '--pairs', '--inferPairs', '--sEnrich'"
            )

    # Generate threshold file
    if not args.thresh and base:
        args.thresh = "%s_thresh.tsv" % (base)
        with open(args.thresh, "w") as fout:
            if args.zscore:
                fout.write("%s\t%s\n" % (args.zscore, args.zThresh))
            if args.colsum:
                fout.write("%s\t%s\n" % (args.colsum, args.csThresh))
            if args.diffratio:
                fout.write("%s\t%s\n" % (args.diffratio, args.sbdrThresh))

    # Run p_enrich module
    canEnrich = bool(args.thresh and args.pairs and base)
    if canEnrich:
        enrDir = makeDirName(args)
        if args.raw:
            cmd = '%s p_enrich -t %s -s %s -r %s --raw_score_constraint %s -x _enriched.txt -o %s >> penrich.out' % (
                args.binary, args.thresh, args.pairs, args.raw, args.rawThresh,
                enrDir)
        else:
            cmd = '%s p_enrich -t %s -s %s -x _enriched.txt -o %s >> penrich.out' % (
                args.binary, args.thresh, args.pairs, enrDir)
        runShell(cmd)

    if matplotReady and args.raw:
        # Generate read counts file
        args.readCounts = "%s_RC.tsv" % (base)
        runShell('%s info -i %s -c %s >> info.out' %
                 (args.binary, args.raw, args.readCounts))

        # Read in counts
        rcD = io.fileDictHeader(args.readCounts, "Sample name",
                                "Sum of probe scores")
        boxplot(list(rcD.values()), "readCountBoxplot.png", args.rawThresh)

        if canEnrich:
            enrFiles = glob.glob("%s/*enriched.txt" % (enrDir))
            enrCounts = [len(io.fileList(f, header=False)) for f in enrFiles]
            if len(enrCounts) > 0:
                boxplot(enrCounts, "enrichedCountBoxplot.png", "200")
Пример #3
0
def main():
    """Generate one or more scatter plots from columns of a data matrix.

    Three modes are supported, checked in this order:

    * ``--batch``: one plot per line of a tab-delimited file
      (xHeader, yHeader, outfile and an optional colorHeader).
    * ``--allByAll``: one plot for every pairwise combination of the
      column headers listed in the given file.
    * otherwise: a single plot described by ``--xHead``/``--yHead``/
      ``--outfile``, after applying ``--include``/``--exclude`` filters.
    """
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    reqArgs = parser.add_argument_group('required arguments')
    reqArgs.add_argument('-d', '--data',  help='Data matrix for generating scatterlots.', required=True)

    parser.add_argument('-o', '--outfile', help='Name for out file.')
    parser.add_argument('-x', '--xHead',  help='Header in data file for x-axis.')
    parser.add_argument('-y', '--yHead',  help='Header in data file for y-axis.')

    parser.add_argument('-c', '--color',  help='Header name to use to color points in plot.')
    parser.add_argument('--cMap',  help='Optional way of mapping the "color" variable in the data matrix to another categorical variable. 3 comma-separated variables should be provided: map file, key header, value header.')
    # The original help text for --xLog referred to the y values (copy/paste slip).
    parser.add_argument('--xLog', default=False, type=float, help="Use if you want x-axis to be shown on a log-scale. Argument provided should be a float to add to the x values before calculating the log value.")
    parser.add_argument('--yLog', default=False, type=float, help="Use if you want y-axis to be shown on a log-scale. Argument provided should be a float to add to the y values before calculating the log value.")
    parser.add_argument('--delim', default="\t", help="Delimiter used in the data file.")
    parser.add_argument('--xLab', help="String for x-label. If not provided, --xHead is used.")
    parser.add_argument('--yLab', help="String for y-label. If not provided, --yHead is used.")
    parser.add_argument('--include', help="Header,Value pairs used to indicate a subset of rows to include.", nargs='*')
    parser.add_argument('--exclude', help="Header,Value pairs used to indicate a subset of rows to exclude.", nargs='*')
    parser.add_argument('--xLegend', default=0.1, type=float, help="x-coordinate to use for color legend.")
    parser.add_argument('--yLegend', default='top', help="Indicate whether you want the legend at the 'top' or 'bottom' of the plot.")
    parser.add_argument('--xeqy', default=False, action="store_true", help="Use if you want an x=y line included in the plot.")
    parser.add_argument('--markerSize', default=10, type=int, help="Size of marker used in plot.")
    parser.add_argument('--alpha', default=0.6, type=float, help="Alpha (transparency) value to use in plot.")

    parser.add_argument('-b', '--batch',  help='An alternative way to provide input/output files that allows the generation of multiple plots with a single command. File provided should be tab-delimited, one line per output plot: xHeader, yHeader, outfile, colorHeader. Colorheader column is optional.')
    parser.add_argument('-a', '--allByAll',  help='Optional way to specify xHead and yHead. Should be a list of column headers. A plot will be generated for all pairwise comparisons of the columns in this file. Output names will be generated based on the column name.')

    opts = parser.parse_args()

    # Replace the --cMap specification with the dictionary it describes
    if opts.cMap:
        mapF, kHead, vHead = opts.cMap.split(",")
        opts.cMap = io.fileDictHeader(mapF, kHead, vHead)

    # Read in data file
    dataD = io.fileDictFull(opts.data, opts.delim)

    def plotCurrentHeads():
        """Plot the columns currently named by opts.xHead / opts.yHead."""
        # Make a subset dict that will be manipulated
        subD = {k: dataD[k] for k in opts.xHead.split(",") + opts.yHead.split(",")}

        # Make sure data columns are formatted properly
        prepData(subD, opts)

        # Generate plot
        scatter(subD, opts)

        # Reset labels so they are not carried over to the next plot
        opts.xLab = None
        opts.yLab = None

    if opts.batch:
        # Colour requested on the command line; used for lines without a
        # fourth column so that one line's colour cannot leak into the next.
        defaultColor = opts.color
        with open(opts.batch) as fin:
            for line in fin:
                # Tolerate blank lines (e.g. a trailing newline at end of file)
                if not line.strip():
                    continue

                cols = line.rstrip("\n").split("\t")

                opts.xHead = cols[0]
                opts.yHead = cols[1]
                opts.outfile = cols[2]
                opts.color = cols[3] if len(cols) == 4 else defaultColor

                plotCurrentHeads()

    elif opts.allByAll:
        heads = io.fileList(opts.allByAll, header=False)
        for h1, h2 in it.combinations(heads, 2):
            opts.xHead = h1
            opts.yHead = h2
            opts.outfile = "%s_%s.png" % (h1, h2)

            plotCurrentHeads()

    else:

        if opts.include:
            for each in opts.include:
                header, val = each.split(",")
                dataD = onlyInclude(dataD, header, val)

        if opts.exclude:
            for each in opts.exclude:
                header, val = each.split(",")
                dataD = toExclude(dataD, header, val)

        # Make sure data columns are formatted properly
        prepData(dataD, opts)

        # Generate plot
        scatter(dataD, opts)
Пример #4
0
def main():
    """Plot alignment hits for each requested species ID.

    Reads the metadata file, the sample order file, the species-level
    alignment info and (optionally) an annotation file, then calls
    ``plotAlignHits`` once per species ID given with ``-s/--speciesIDs``.

    ``--sampleOrder`` is effectively mandatory: the previous fallback for a
    missing sample order referenced names that do not exist in this
    function (``data``, ``each``, ``mapDict``, ``opts``) and could only
    fail with a NameError, so a missing value is now rejected up front
    with a normal argparse error.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("pepFile",
                        help="A file containing a list of peptide names",
                        nargs='*')
    parser.add_argument("--name",
                        help="Column name for peptide names in metadata file",
                        default="CodeName")
    parser.add_argument("--sid",
                        help="Column name for species IDs in metadata file",
                        default="SpeciesID")
    parser.add_argument("--species",
                        help="Column name for species names in metadata file",
                        default="Species")
    parser.add_argument(
        '-a',
        '--alignInfo',
        default=
        "/Volumes/GoogleDrive/Shared drives/LadnerLab/Projects/panviral_pepseq/analysis/alignments/AlignmentInfoCoded.txt",
        help='Contains info for species-level seq alignments.')
    parser.add_argument(
        '--sampleOrder',
        help=
        'Can provide a plain text file with the order you would like the samples to appear in the plot.'
    )
    parser.add_argument(
        '--annots',
        help=
        'File with annotation info. If provided, a graphical representation of the different proteins with also be plotted.'
    )

    reqArgs = parser.add_argument_group('required arguments')
    reqArgs.add_argument("-o",
                         "--out",
                         help="Output matrix file name",
                         required=True)
    reqArgs.add_argument("-m",
                         "--meta",
                         help="Metadata file name",
                         required=True)
    reqArgs.add_argument('-s',
                         '--speciesIDs',
                         help='Comma-separated species IDs of interest.',
                         required=True)

    args = parser.parse_args()

    # Fail early and clearly instead of crashing after all inputs are read.
    if not args.sampleOrder:
        parser.error(
            "'--sampleOrder' must be provided; no default sample list is available."
        )

    # Read in info from metadata file
    # NOTE(review): sidD is not used below; the read is kept because it may
    # also serve as a check that the --name/--sid columns exist. Confirm.
    sidD = io.fileDictHeader(args.meta, args.name, args.sid)
    id2name = io.fileDictHeader(args.meta, args.sid, args.species)

    # Read in sample order for plots
    sampOrderList = io.fileList(args.sampleOrder, header=False)

    # Read in alignment data: first column is the key, second column is
    # joined with "/" to each of the remaining columns.
    alInfo = {}
    with open(args.alignInfo, "r") as fin:
        next(fin, None)  # skip header line
        for line in fin:
            cols = line.rstrip("\n").split("\t")
            alInfo[cols[0]] = ["%s/%s" % (cols[1], x) for x in cols[2:]]

    # Read in annotation info, if provided. The third column holds
    # "~"-separated entries, each of which is a comma-separated list.
    annotD = {}
    if args.annots:
        with open(args.annots, "r") as fin:
            next(fin, None)  # skip header line
            for line in fin:
                cols = line.rstrip("\n").split("\t")
                annotD.setdefault(cols[0], {})[cols[1]] = [
                    x.split(",") for x in cols[2].split("~")
                ]

    # Make probe count plots, one per species ID
    for sid in args.speciesIDs.split(","):
        plotAlignHits(sampOrderList, sid, alInfo, id2name[sid], annotD, args)
Пример #5
0
def main():
    """Cluster and align enriched peptides, per input list and optionally pooled.

    For every peptide-list file given as a positional argument, the
    non-control peptides are grouped by protein, clustered with
    ``clustProbes``, aligned with ``alignSeqs`` and written as FASTA files
    into ``--outDir``.  With ``--master``, the peptides of all input lists
    are pooled and processed the same way into the master directory; if
    ``--plot`` is also given, a heat map is drawn for each master alignment.
    """
    usage = '%prog [options] probes1.txt [probes2.txt ...]'
    # Pass the usage string so that --help actually shows it
    parser = optparse.OptionParser(usage)
    parser.add_option('-m', '--meta', help='Metadata file. [None, REQ]')
    parser.add_option(
        '-o',
        "--outDir",
        default="pepAligned",
        help=
        'Name for dirctory in which output files will be generated. This Dirctory should NOT exist already. [pepAligned]'
    )
    parser.add_option(
        "--master",
        help=
        'Use this option to provide a directory name to contain output alignments that will combine peptides enriched in all samples. [OPT]'
    )
    parser.add_option(
        "--plot",
        help=
        'If this option is used, along with some type of matrix, then heat map plots will be generated for each alignment. [OPT]'
    )
    parser.add_option("--minToPlot",
                      default=0,
                      type="int",
                      help='Minimum score to include in plots. [0]')
    parser.add_option(
        "--plotLog",
        default=False,
        action="store_true",
        help=
        'Use this flag if you want the y-axis for plots to be on the log scale. [False]'
    )

    opts, args = parser.parse_args()

    # --meta is documented as required; report its absence clearly
    if not opts.meta:
        parser.error("A metadata file must be provided with '-m/--meta'.")

    # Check for output directory and create IF it doesn't already exist
    if not os.path.isdir(opts.outDir):
        os.mkdir(opts.outDir)

    # If master alignments are requested, generate output directory
    if opts.master and not os.path.isdir(opts.master):
        os.mkdir(opts.master)
    # Pools peptides from all input lists, keyed by protein
    masterByProt = defaultdict(list)

    # Read in info from metadata file
    pepD = io.fileDictHeader(opts.meta, "CodeName", "Peptide")
    catD = io.fileDictHeader(opts.meta, "CodeName", "Category")
    protD = io.fileDictHeader(opts.meta, "CodeName", "Protein")
    startD = io.fileDictHeader(opts.meta, "CodeName", "AlignStart")
    startD = {k: int(v) for k, v in startD.items() if v}
    stopD = io.fileDictHeader(opts.meta, "CodeName", "AlignStop")
    stopD = {k: int(v) for k, v in stopD.items() if v}

    # Read in score matrix for generating heat maps, if provided
    scoreDict = None
    if opts.plot:
        scoreDict = parseCounts(opts.plot)
        print("Read scores for plots")

    def alignClusters(pepList, outDir, outStr, withPlots):
        """Cluster *pepList*, then write one aligned FASTA (and heat map) per cluster."""
        clu = clustProbes(pepList, pepD, startD, stopD)
        print(len(clu))

        # Check clusters
        for a, b in clu.items():
            if set(a) != set(b):
                print(
                    "Not all starting places resulted in the same cluster!!!",
                    a, sorted(b))

        for c in clu:
            # Write aligned fasta file
            names, seqs, bounds = alignSeqs({
                x: [pepD[x], list(range(startD[x], stopD[x]))]
                for x in c
            })
            write_fasta(names, seqs,
                        "%s/%s_%s.fasta" % (outDir, outStr, bounds))

            # Generate heat map from the scores of this cluster's peptides
            if withPlots:
                heatMap(
                    names, seqs, {
                        k: {pep: s
                            for pep, s in v.items() if pep in c}
                        for k, v in scoreDict.items()
                    }, "%s/%s_%s.pdf" % (outDir, outStr, bounds),
                    opts.minToPlot, opts.plotLog)

    # Step through each list of probes provided
    for each in args:
        print(each)
        # Read in peptides of interest
        peps = filelist(each)
        print(len(peps))

        # Make separate lists of peptides from different proteins
        byProt = defaultdict(list)
        for pep in peps:
            if catD[pep] != "Control":
                byProt[protD[pep]].append(pep)
                if opts.master:
                    masterByProt[protD[pep]].append(pep)

        # Input file name without directory or extension
        fileStem = ".".join(each.split("/")[-1].split(".")[:-1])

        # Generate clusters and alignments for each protein
        for prot, pL in byProt.items():
            alignClusters(pL, opts.outDir, "%s_%s" % (prot, fileStem), False)

    # Generate master output, if requested
    if opts.master:
        for prot, pL in masterByProt.items():
            alignClusters(pL, opts.master, "%s_master" % (prot),
                          bool(opts.plot))
Пример #6
0
def main():
    """Draw heat maps of replicate-averaged scores for pooled peptide alignments.

    Peptides from every list named in ``--inputInfo`` are pooled by
    protein, clustered with ``clustProbes`` and aligned with ``alignSeqs``.
    For each cluster a heat map is written to ``--outDir``; each sample's
    value for a peptide is the mean score over all columns of the
    ``--plot`` matrix whose name starts with the sample name.  With
    ``--addNegNorm`` the sequence labels are prefixed with the mean
    normalized count of the negative controls.
    """
    usage = '%prog [options]'
    # Pass the usage string so that --help actually shows it
    parser = optparse.OptionParser(usage)
    parser.add_option('-i', '--inputInfo',  help='Tab-delimited file indicating the sample names and enriched probe lists. If you want replicates to be averaged, probe name as a unique starting string common to all replicates. [None, REQ]')
    parser.add_option('-m', '--meta',  help='Metadata file. [None, REQ]')
    parser.add_option('-o', "--outDir", default="pepAligned", help='Name for dirctory in which output files will be generated. This Dirctory should NOT exist already. [pepAligned]')
    parser.add_option("--plot", help='If this option is used, along with some type of matrix, then heat map plots will be generated for each alignment. [None, REQ]')
    parser.add_option("--minToPlot", default=1, type="float", help='Minimum score to include in plots. [1]')
    parser.add_option("--plotLog", default=False, action="store_true", help='Use this flag if you want the y-axis for plots to be on the log scale. [False]')
    parser.add_option("--addNegNorm", help='Use this flag if you want to add info about the norm read counts for negative controls y-axis labels. Argument should be tab-delimited file. First column should be comma-sep list of negative control names, the second the name of the norm read count matrix. [None, OPT]')

    opts, args = parser.parse_args()

    # These options are documented as required; report their absence clearly
    for value, flag in ((opts.inputInfo, "-i/--inputInfo"),
                        (opts.meta, "-m/--meta"), (opts.plot, "--plot")):
        if not value:
            parser.error("Option '%s' is required." % flag)

    # Check for output directory and create IF it doesn't already exist
    if not os.path.isdir(opts.outDir):
        os.mkdir(opts.outDir)

    # Get neg norm counts, if provided. Only the first line of the file is used.
    normNeg = {}
    if opts.addNegNorm:
        with open(opts.addNegNorm, "r") as fin:
            cols = fin.readline().rstrip("\n").split("\t")
        if len(cols) < 2:
            parser.error(
                "The '--addNegNorm' file must contain two tab-delimited columns on its first line."
            )
        negNames = cols[0].split(",")
        normDict = parseCounts(cols[1])
        for pep in normDict[negNames[0]]:
            normNeg[pep] = np.mean([normDict[x][pep] for x in negNames])

    # Read in info from metadata file
    pepD = io.fileDictHeader(opts.meta, "CodeName", "Peptide")
    catD = io.fileDictHeader(opts.meta, "CodeName", "Category")
    protD = io.fileDictHeader(opts.meta, "CodeName", "Protein")
    startD = io.fileDictHeader(opts.meta, "CodeName", "AlignStart")
    startD = {k: int(v) for k, v in startD.items() if v}
    stopD = io.fileDictHeader(opts.meta, "CodeName", "AlignStop")
    stopD = {k: int(v) for k, v in stopD.items() if v}

    # Read in score matrix for generating heat maps
    scoreDict = parseCounts(opts.plot)

    # Reads in names and probe lists
    enrichD = io.fileDictHeader(opts.inputInfo, "Sample", "PepFile")

    # Generate replicate lists for each name provided
    repD = {s: [n for n in scoreDict if n.startswith(s)] for s in enrichD}

    # Read every probe list exactly once: remember each sample's non-control
    # peptides and pool them by protein.
    pepsBySample = {}
    masterByProt = defaultdict(list)
    for sName, pepFile in enrichD.items():
        keep = [pep for pep in filelist(pepFile) if catD[pep] != "Control"]
        pepsBySample[sName] = keep
        for pep in keep:
            masterByProt[protD[pep]].append(pep)

    # Replicate averages do not depend on the cluster, so memoize them
    avgCache = {}

    def avgScore(sName, pep):
        """Return the mean score of *pep* over the replicates of *sName*."""
        key = (sName, pep)
        if key not in avgCache:
            avgV = np.mean([scoreDict[x][pep] for x in repD[sName]])
            # Infinite scores are capped at 10000
            if avgV == float('inf'):
                avgV = 10000
            avgCache[key] = avgV
        return avgCache[key]

    for prot, pL in masterByProt.items():
        clu = clustProbes(pL, pepD, startD, stopD)
        # Check clusters
        for a, b in clu.items():
            if set(a) != set(b):
                print("Not all starting places resulted in the same cluster!!!", a, sorted(b))

        # String for output files
        outStr = "%s_master" % (prot)

        for c in clu:
            names, seqs, bounds = alignSeqs({x: [pepD[x], list(range(startD[x], stopD[x]))] for x in c})

            # Every peptide of the cluster starts at 0.0001 for every sample;
            # enriched peptides are then overwritten with their average.
            # NOTE(review): as before, averages are added for all of a
            # sample's enriched peptides, not only those in this cluster.
            dta = {sN: {pep: 0.0001 for pep in c} for sN in repD}
            for sN, sPeps in pepsBySample.items():
                for pep in sPeps:
                    dta[sN][pep] = avgScore(sN, pep)

            # Generate heat map
            outPdf = "%s/%s_%s.pdf" % (opts.outDir, outStr, bounds)
            if opts.addNegNorm:
                seqsPlus = ["%.0f %s" % (normNeg[name], seq) for name, seq in zip(names, seqs)]
                heatMap(names, seqsPlus, dta, outPdf, opts.minToPlot, opts.plotLog)
            else:
                heatMap(names, seqs, dta, outPdf, opts.minToPlot, opts.plotLog)