def plotDETagStats(infiles, outfile):
    '''plot differential expression statistics.

    Arguments
    ---------
    infiles : sequence
        Exactly two entries: the file handed to
        :func:`Expression.plotDETagStats` as its input, followed by the
        file handed over as ``additional_file``.
    outfile : string
        Output filename. It is passed to the plotting routine and
        touched afterwards via ``P.touch``.
    '''
    result_file, composition_table = infiles

    # the additional file is joined on the window coordinates and
    # contributes the two listed columns
    annotation_options = dict(
        additional_file=composition_table,
        join_columns=("contig", "start", "end"),
        additional_columns=("CpG_density", "length"))

    Expression.plotDETagStats(result_file, outfile, **annotation_options)
    P.touch(outfile)
def plotRNASEQTagData(infiles, outfile):
    '''plot tag (feature count) statistics for an RNA-seq geneset.

    No differential expression analysis is run here; the function only
    calls :func:`Expression.plotTagStats` on the feature counts table
    belonging to the geneset and then touches `outfile`.

    Arguments
    ---------
    infiles : sequence
        ``infiles[0]`` is the design file and ``infiles[1]`` the geneset
        file (name ending in ``.gtf.gz``). Further entries (e.g. bam
        files) are accepted but not used.
    outfile : string
        Output filename passed to the plotting routine and touched
        afterwards.
    '''
    design_file, geneset_file = infiles[0], infiles[1]

    # IMS: now running on feature counts
    infile = os.path.join(
        "feature_counts.dir",
        P.snip(geneset_file, ".gtf.gz") + ".feature_counts.tsv.gz")

    Expression.plotTagStats(infile, design_file, outfile)
    P.touch(outfile)
def loadCuffdiff(infile, outfile):
    '''load cuffdiff results by delegating to Expression.loadCuffdiff.

    All work happens in :func:`Expression.loadCuffdiff`; nothing is
    returned. According to the notes that accompanied this task, the
    loader parses the cuffdiff output, produces summary plots, converts
    ln(fold change) to log2 fold change and sets pairwise comparisons in
    which one gene is not expressed (fpkm < fpkm_silent) to status
    'NOCALL', although such transcripts might nevertheless be
    significant.

    Arguments
    ---------
    infile : string
        Passed through as the first argument of the loader.
    outfile : string
        Passed through as the second argument of the loader.
    '''
    loader = Expression.loadCuffdiff
    loader(infile, outfile)
def outputRegionsOfInterest(infiles, outfile,
                            max_per_sample=10, sum_per_group=40):
    '''output windows according to various filters.

    The output is a mock analysis similar to a differential expression
    result.

    A window passes the filter if, for one of the two groups, every
    sample count is below `max_per_sample` while the summed counts of
    the other group exceed `sum_per_group`.

    Arguments
    ---------
    infiles : sequence
        Two entries: the design file and the (gzipped) counts file.
    outfile : string
        Output filename (written gzip-compressed by the statement).
    max_per_sample : number
        Upper bound (exclusive) for every sample of the "low" group.
    sum_per_group : number
        Lower bound (exclusive) for the summed counts of the "high"
        group.

    Raises
    ------
    ValueError
        If the design does not define exactly two included groups
        (raised by the tuple unpacking) or a group has no samples.
    '''
    # NOTE: the local names below (job_options, counts_file, outfile,
    # design_file, upper_level*, sum_level*, statement) are referenced
    # by the statement placeholders / P.run() and must keep their names.
    job_options = "-l mem_free=64G"

    design_file, counts_file = infiles

    design = Expression.readDesignFile(design_file)

    # remove tracks not included in the design
    design = dict([(x, y) for x, y in design.items() if y.include])

    # define the two groups
    groups = sorted(set([x.group for x in design.values()]))
    groupA, groupB = groups

    def _buildFilter(func, group, comparison, threshold):
        '''build one clause of the csv_select filter expression.'''
        tracks = [x for x, y in design.items() if y.group == group]
        if not tracks:
            raise ValueError("no samples found for group '%s'" % group)
        if len(tracks) == 1:
            # max()/sum() over "(int(...))" would be applied to a bare
            # int, which is not iterable - compare the value directly.
            return "int(r['%s']) %s %f" % (tracks[0], comparison, threshold)
        return "%s( (%s) ) %s %f" % (
            func,
            ",".join(["int(r['%s'])" % x for x in tracks]),
            comparison,
            threshold)

    # build a filtering statement
    # note the naming: sum_levelA is the sum over group B and is paired
    # with upper_levelA (max over group A), and vice versa.
    upper_levelA = _buildFilter("max", groupA, "<", max_per_sample)
    sum_levelA = _buildFilter("sum", groupB, ">", sum_per_group)
    upper_levelB = _buildFilter("max", groupB, "<", max_per_sample)
    sum_levelB = _buildFilter("sum", groupA, ">", sum_per_group)

    statement = (
        ' zcat %(counts_file)s'
        ' | python %(scriptsdir)s/csv_select.py'
        ' --log=%(outfile)s.log'
        ' "(%(upper_levelA)s and %(sum_levelA)s) or'
        ' (%(upper_levelB)s and %(sum_levelB)s)"'
        ' | python %(scriptsdir)s/runExpression.py'
        ' --log=%(outfile)s.log'
        ' --filename-design=%(design_file)s'
        ' --filename-tags=-'
        ' --method=mock'
        ' --filter-min-counts-per-sample=0'
        ' | gzip > %(outfile)s ')

    P.run()
def runMEDIPSDMR(design_file, outfile):
    '''run differential methylation analysis using MEDIPS package.

    One statement template is collected for every pairwise combination
    of the groups found in the design; the templates are then handed to
    ``P.run()``.

    Arguments
    ---------
    design_file : string
        Filename of the experimental design, read with
        ``Expression.readDesignFile``. Track names from the design are
        turned into ``<track>.bam`` filenames.
    outfile : string
        Output filename in :term:`tsv` format.
    '''

    job_memory = "30G"

    design = Expression.readDesignFile(design_file)

    # remove data tracks not needed
    design = [(x, y) for x, y in design.items() if y.include]

    # build groups
    groups = set([y.group for x, y in design])

    statements = []
    for pair1, pair2 in itertools.combinations(groups, 2):
        # bam files of the first group act as treatment, those of the
        # second group as control
        treatment = ["%s.bam" % x for x, y in design if y.group == pair1]
        control = ["%s.bam" % x for x, y in design if y.group == pair2]

        treatment = ",".join(treatment)
        control = ",".join(control)
        # outfile contains directory prefix
        # NOTE(review): the template is appended un-interpolated.
        # Presumably P.run() fills the %(...)s placeholders from the
        # local variables and the pipeline configuration when it is
        # called - if so, pair1/pair2/treatment/control would carry the
        # values of the LAST loop iteration for every statement, and all
        # statements write to the same %(outfile)s. Confirm against
        # P.run().
        statements.append(
            """python %(scriptsdir)s/runMEDIPS.py --ucsc-genome=%(medips_genome)s --treatment=%(treatment)s --control=%(control)s --toolset=dmr --shift=%(medips_shift)s --extend=%(medips_extension)s --window-size=%(medips_window_size)i --output-filename-pattern="%(outfile)s_%(pair1)s_vs_%(pair2)s_%%s" --fdr-threshold=%(medips_fdr)f --log=%(outfile)s.log > %(outfile)s.log2; checkpoint; zcat %(outfile)s_%(pair1)s_vs_%(pair2)s_data.tsv.gz | python %(scriptsdir)s/runMEDIPS.py --treatment=%(pair1)s --control=%(pair2)s --toolset=convert --fdr-threshold=%(medips_fdr)f --log=%(outfile)s.log | gzip > %(outfile)s """)

    # NOTE(review): no statement is passed explicitly; presumably
    # P.run() picks up the local `statements` list - TODO confirm.
    # The placement after the loop is reconstructed from the collapsed
    # source layout.
    P.run()
def runCuffdiff(infiles, outfile):
    '''run a cuffdiff differential expression analysis.

    Arguments
    ---------
    infiles : sequence
        ``infiles[0]`` is the design file, ``infiles[1]`` the geneset
        file and ``infiles[2]`` the bam files.
    outfile : string
        Output filename handed to :func:`Expression.runCuffdiff`.

    Configuration is read from ``PARAMS``: ``cuffdiff_include_mask``,
    ``cuffdiff_options``, ``cufflinks_library_type``, ``cuffdiff_fdr``
    and, optionally, ``cuffdiff_threads`` (4 if absent).
    '''
    design_file, geneset_file, bamfiles = infiles[0], infiles[1], infiles[2]

    # the mask is always the file "geneset_mask.gtf" in the current
    # working directory, and only used when switched on
    mask_file = (os.path.abspath("geneset_mask.gtf")
                 if PARAMS["cuffdiff_include_mask"] else None)

    options = PARAMS["cuffdiff_options"] + (
        " --library-type %s" % PARAMS["cufflinks_library_type"])

    Expression.runCuffdiff(
        bamfiles,
        design_file,
        geneset_file,
        outfile,
        threads=PARAMS.get("cuffdiff_threads", 4),
        cuffdiff_options=options,
        fdr=PARAMS["cuffdiff_fdr"],
        mask_file=mask_file)
def plotDETagStats(infile, composition_file, outfile):
    '''plot differential expression statistics.

    Arguments
    ---------
    infile : string
        Filename with :term:`tsv` formatted list of differential
        methylation results output from :doc:`scripts/runExpression`.
    composition_file : string
        Filename with :term:`tsv` formatted data about nucleotide
        compositions of windows tested. It is joined to the results on
        the columns ``contig``, ``start`` and ``end`` and supplies the
        columns ``CpG_density`` and ``length``.
    outfile : string
        Output filename, used as sentinel only.
    '''
    window_key = ("contig", "start", "end")
    composition_columns = ("CpG_density", "length")

    Expression.plotDETagStats(infile,
                              outfile,
                              additional_file=composition_file,
                              join_columns=window_key,
                              additional_columns=composition_columns)

    # create/update the sentinel
    P.touch(outfile)
def runCuffdiff(infiles, outfile):
    '''perform differential expression analysis using cuffdiff.

    Arguments
    ---------
    infiles : sequence
        Design file, geneset file and bam files, in this order (only
        the first three entries are read).
    outfile : string
        Output filename passed on to :func:`Expression.runCuffdiff`.

    The ``PARAMS`` keys ``cuffdiff_include_mask``, ``cuffdiff_options``,
    ``cufflinks_library_type`` and ``cuffdiff_fdr`` are required;
    ``cuffdiff_threads`` falls back to 4.
    '''
    design_file, geneset_file, bamfiles = infiles[0], infiles[1], infiles[2]

    # only mask with "geneset_mask.gtf" when masking is enabled
    mask_file = None
    if PARAMS["cuffdiff_include_mask"]:
        mask_file = os.path.abspath("geneset_mask.gtf")

    # append the library type to the user supplied cuffdiff options
    options = PARAMS["cuffdiff_options"] + (
        " --library-type %s" % PARAMS["cufflinks_library_type"])

    keyword_arguments = dict(
        threads=PARAMS.get("cuffdiff_threads", 4),
        cuffdiff_options=options,
        fdr=PARAMS["cuffdiff_fdr"],
        mask_file=mask_file)

    Expression.runCuffdiff(bamfiles, design_file, geneset_file, outfile,
                           **keyword_arguments)
def buildProbeset2Gene(infile, outfile):
    """build map relating a probeset to an ENSEMBL gene_id.

    Thin wrapper: both arguments are forwarded unchanged to
    :func:`Expression.buildProbeset2Gene`; nothing is returned.
    """
    builder = Expression.buildProbeset2Gene
    builder(infile, outfile)
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Depending on ``--method`` the tag counts are loaded, filtered and
    handed to one of the ``Expression`` routines (deseq2, deseq, edger,
    mock, ttest), or summary/dump/plot/spike output is produced.

    If the tag counts are read from stdin (``--tags-tsv-file=-``) they
    are first copied into a temporary file; this file is removed again
    when the function leaves, including on early return or error.

    If an R runtime error occurs and ``--output-R-code`` was given, the
    R environment is saved before the error is re-raised.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-t", "--tags-tsv-file", dest="input_filename_tags",
                      type="string",
                      help="input file with tag counts [default=%default].")

    parser.add_option(
        "--result-tsv-file", dest="input_filename_result",
        type="string",
        help="input file with results (for plotdetagstats) "
        "[default=%default].")

    parser.add_option("-d", "--design-tsv-file", dest="input_filename_design",
                      type="string",
                      help="input file with experimental design "
                      "[default=%default].")

    parser.add_option("-o", "--outfile", dest="output_filename",
                      type="string",
                      help="output filename [default=%default].")

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=(
                          "deseq", "edger", "deseq2",
                          "ttest",
                          "mock", "summary",
                          "dump", "spike",
                          "plottagstats",
                          "plotdetagstats"),
                      help="differential expression method to apply "
                      "[default=%default].")

    parser.add_option("--deseq-dispersion-method",
                      dest="deseq_dispersion_method",
                      type="choice",
                      choices=("pooled", "per-condition", "blind"),
                      help="dispersion method for deseq [default=%default].")

    parser.add_option("--deseq-fit-type", dest="deseq_fit_type",
                      type="choice",
                      choices=("parametric", "local"),
                      help="fit type for deseq [default=%default].")

    parser.add_option("--deseq-sharing-mode",
                      dest="deseq_sharing_mode",
                      type="choice",
                      choices=("maximum", "fit-only", "gene-est-only"),
                      help="deseq sharing mode [default=%default].")

    parser.add_option(
        "--edger-dispersion",
        dest="edger_dispersion", type="float",
        help="dispersion value for edgeR if there are no replicates "
        "[default=%default].")

    parser.add_option("-f", "--fdr", dest="fdr", type="float",
                      help="fdr to apply [default=%default].")

    parser.add_option("-p", "--pseudocounts", dest="pseudo_counts",
                      type="float",
                      help="pseudocounts to add for mock analyis "
                      "[default=%default].")

    parser.add_option("-R", "--output-R-code", dest="save_r_environment",
                      type="string",
                      help="save R environment [default=%default].")

    # optparse only expands "%default"; the former "$default" was
    # printed verbatim in the help output.
    parser.add_option("-r", "--reference-group", dest="ref_group",
                      type="string",
                      help="Group to use as reference to compute "
                      "fold changes against [default=%default]")

    parser.add_option("--filter-min-counts-per-row",
                      dest="filter_min_counts_per_row",
                      type="int",
                      help="remove rows with less than this "
                      "number of counts in total [default=%default].")

    parser.add_option("--filter-min-counts-per-sample",
                      dest="filter_min_counts_per_sample",
                      type="int",
                      help="remove samples with a maximum count per sample of "
                      "less than this number   [default=%default].")

    parser.add_option("--filter-percentile-rowsums",
                      dest="filter_percentile_rowsums",
                      type="int",
                      help="remove percent of rows with "
                      "lowest total counts [default=%default].")

    parser.add_option("--deseq2-design-formula",
                      dest="model",
                      type="string",
                      help="Design formula for DESeq2")

    parser.add_option("--deseq2-contrasts",
                      dest="contrasts",
                      type="string",
                      help=("contrasts for post-hoc testing writen"
                            " variable:control:treatment,..."))

    parser.set_defaults(
        input_filename_tags=None,
        input_filename_result=None,
        input_filename_design=None,
        output_filename=sys.stdout,
        method="deseq",
        fdr=0.1,
        deseq_dispersion_method="pooled",
        deseq_fit_type="parametric",
        deseq_sharing_mode="maximum",
        edger_dispersion=0.4,
        ref_group=None,
        save_r_environment=None,
        filter_min_counts_per_row=1,
        filter_min_counts_per_sample=10,
        filter_percentile_rowsums=0,
        pseudo_counts=0,
        spike_foldchange_max=4.0,
        spike_expression_max=5.0,
        spike_expression_bin_width=0.5,
        spike_foldchange_bin_width=0.5,
        spike_max_counts_per_bin=50,
        model=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    # tag counts on stdin are spooled into a named temporary file so
    # that they can be addressed by filename
    if options.input_filename_tags == "-":
        fh = tempfile.NamedTemporaryFile(delete=False)
        fh.write("".join([x for x in options.stdin]))
        fh.close()
        options.input_filename_tags = fh.name
    else:
        fh = None

    try:
        # load tag data and filter
        if options.method in ("deseq2", "deseq", "edger", "mock", "ttest"):
            assert options.input_filename_tags and os.path.exists(
                options.input_filename_tags)
            assert options.input_filename_design and os.path.exists(
                options.input_filename_design)

            Expression.loadTagData(options.input_filename_tags,
                                   options.input_filename_design)

            nobservations, nsamples = Expression.filterTagData(
                filter_min_counts_per_row=options.filter_min_counts_per_row,
                filter_min_counts_per_sample=(
                    options.filter_min_counts_per_sample),
                filter_percentile_rowsums=options.filter_percentile_rowsums)

            if nobservations == 0:
                E.warn("no observations - no output")
                return

            if nsamples == 0:
                E.warn("no samples remain after filtering - no output")
                return

            sample_names = R('''colnames(countsTable)''')
            E.info("%i samples to test at %i observations: %s" %
                   (nsamples, nobservations,
                    ",".join(sample_names)))

        try:
            if options.method == "deseq2":
                Expression.runDESeq2(
                    outfile=options.output_filename,
                    outfile_prefix=options.output_filename_pattern,
                    fdr=options.fdr,
                    ref_group=options.ref_group,
                    model=options.model,
                    contrasts=options.contrasts,
                )

            elif options.method == "deseq":
                Expression.runDESeq(
                    outfile=options.output_filename,
                    outfile_prefix=options.output_filename_pattern,
                    fdr=options.fdr,
                    dispersion_method=options.deseq_dispersion_method,
                    fit_type=options.deseq_fit_type,
                    sharing_mode=options.deseq_sharing_mode,
                    ref_group=options.ref_group,
                )

            elif options.method == "edger":
                Expression.runEdgeR(
                    outfile=options.output_filename,
                    outfile_prefix=options.output_filename_pattern,
                    fdr=options.fdr,
                    ref_group=options.ref_group,
                    dispersion=options.edger_dispersion)

            elif options.method == "mock":
                Expression.runMockAnalysis(
                    outfile=options.output_filename,
                    outfile_prefix=options.output_filename_pattern,
                    ref_group=options.ref_group,
                    pseudo_counts=options.pseudo_counts,
                )

            elif options.method == "summary":
                Expression.outputTagSummary(
                    options.input_filename_tags,
                    options.stdout,
                    options.output_filename_pattern,
                    filename_design=options.input_filename_design
                )

            elif options.method == "dump":
                assert options.input_filename_tags and os.path.exists(
                    options.input_filename_tags)
                Expression.dumpTagData(options.input_filename_tags,
                                       options.input_filename_design,
                                       outfile=options.stdout)

            elif options.method == "plottagstats":
                assert options.input_filename_tags and os.path.exists(
                    options.input_filename_tags)
                Expression.plotTagStats(
                    options.input_filename_tags,
                    options.input_filename_design,
                    outfile_prefix=options.output_filename_pattern)

            elif options.method == "plotdetagstats":
                assert options.input_filename_result and os.path.exists(
                    options.input_filename_result)
                Expression.plotDETagStats(
                    options.input_filename_result,
                    outfile_prefix=options.output_filename_pattern)

            elif options.method == "spike":
                Expression.outputSpikeIns(
                    options.input_filename_tags,
                    options.stdout,
                    options.output_filename_pattern,
                    filename_design=options.input_filename_design,
                    foldchange_max=options.spike_foldchange_max,
                    expression_max=options.spike_expression_max,
                    max_counts_per_bin=options.spike_max_counts_per_bin,
                    expression_bin_width=options.spike_expression_bin_width,
                    foldchange_bin_width=options.spike_foldchange_bin_width,
                )

            elif options.method == "ttest":
                Expression.runTTest(
                    outfile=options.output_filename,
                    outfile_prefix=options.output_filename_pattern,
                    fdr=options.fdr)

        except rpy2.rinterface.RRuntimeError:
            # keep the R state for debugging, then propagate the error
            if options.save_r_environment:
                E.info("saving R image to %s" % options.save_r_environment)
                R['save.image'](options.save_r_environment)
            raise
    finally:
        # remove the temporary copy of stdin on every exit path
        if fh and os.path.exists(fh.name):
            os.unlink(fh.name)

    if options.save_r_environment:
        R['save.image'](options.save_r_environment)

    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    The part of the script shown here implements the ``convert``
    toolset: a tab-separated table is read from stdin, rows whose
    ``edgeR.p.value`` is ``NA`` are skipped, and every remaining row is
    converted into an ``Expression.GeneExpressionResult`` which is
    written to stdout via ``Expression.writeExpressionResults``.

    Raises
    ------
    ValueError
        If a numeric field of an input row can not be parsed. The
        message contains the original error and the offending row.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id", usage=globals()["__doc__"])

    parser.add_option("-u", "--ucsc-genome", dest="ucsc_genome", type="string",
                      help="UCSC genome identifier [default=%default].")

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genome [default=%default].")

    # NOTE(review): this option is stored as `extension`, while the
    # defaults below set `extend` - so `options.extension` defaults to
    # None and `options.extend` is never changed by the command line.
    # Confirm which name the remaining code reads before aligning them.
    parser.add_option("--extend", dest="extension", type="int",
                      help="extend tags by this number of bases "
                      "[default=%default].")

    parser.add_option("--shift-size", dest="shift", type="int",
                      help="shift tags by this number of bases "
                      "[default=%default].")

    parser.add_option("--window-size", dest="window_size", type="int",
                      help="window size to be used in the analysis"
                      "[default=%default].")

    parser.add_option("--saturation-iterations",
                      dest="saturation_iterations", type="int",
                      help="iterations for saturation analysis "
                      "[default=%default].")

    parser.add_option("-t", "--toolset", dest="toolset", type="choice",
                      action="append",
                      choices=("saturation", "coverage", "enrichment",
                               "dmr", "rms", "rpm", "all", "convert"),
                      help="actions to perform [default=%default].")

    parser.add_option("-w", "--bigwig-file", dest="bigwig",
                      action="store_true",
                      help="store wig files as bigwig files - requires a "
                      "genome file [default=%default]")

    parser.add_option("--treatment", dest="treatment_files", type="string",
                      action="append",
                      help="BAM files for treatment. At least one is required "
                      "[%default]")

    parser.add_option("--control", dest="control_files", type="string",
                      action="append",
                      help="BAM files for control for differential "
                      "methylation analysis. Optional [%default].")

    parser.add_option("--input", dest="input_files", type="string",
                      action="append",
                      help="BAM files for input correction. "
                      "Optional [%default].")

    parser.add_option("--is-not-medip",
                      dest="is_medip", action="store_false",
                      help="data is not MeDIP data and is not expected "
                      "to fit the calibration model. No CpG "
                      "density normalized rms data is computed"
                      "[default=%default].")

    parser.add_option("--output-rdata", dest="output_rdata",
                      action="store_true",
                      help="in dmr analysis, write R session to file. "
                      "The file name "
                      "is given by --ouptut-filename-pattern [%default].")

    parser.add_option("--rdata-file", dest="input_rdata",
                      type="string",
                      help="in dmr analysis, read saved R session from "
                      "file. This can be used to apply different "
                      "filters [%default]")

    parser.add_option("--fdr-threshold", dest="fdr_threshold", type="float",
                      help="FDR threshold to apply for selecting DMR "
                      "[default=%default].")

    parser.add_option("--fdr-method", dest="fdr_method", type="choice",
                      choices=("bonferroni", "BH", "holm", "hochberg",
                               "hommel", "BY", "fdr", "none"),
                      help="FDR method to apply for selecting DMR "
                      "[default=%default].")

    parser.set_defaults(
        input_format="bam",
        ucsc_genome="Hsapiens.UCSC.hg19",
        genome_file=None,
        extend=0,
        shift=0,
        window_size=300,
        saturation_iterations=10,
        toolset=[],
        bigwig=False,
        treatment_files=[],
        control_files=[],
        input_files=[],
        output_rdata=False,
        input_rdata=None,
        is_medip=True,
        fdr_threshold=0.1,
        fdr_method="BH",
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    if "convert" in options.toolset:

        results = []
        for line in CSV.DictReader(options.stdin,
                                   dialect="excel-tab"):
            if line['edgeR.p.value'] == "NA":
                continue

            # assumes only a single treatment/control
            treatment_name = options.treatment_files[0]
            control_name = options.control_files[0]
            status = "OK"
            try:
                adjusted_pvalue = float(line['edgeR.adj.p.value'])
                results.append(
                    Expression.GeneExpressionResult._make((
                        "%s:%i-%i" % (line['chr'],
                                      int(line['start']),
                                      int(line['stop'])),
                        treatment_name,
                        float(line['MSets1.rpkm.mean']),
                        0,
                        control_name,
                        float(line['MSets2.rpkm.mean']),
                        0,
                        float(line['edgeR.p.value']),
                        adjusted_pvalue,
                        float(line['edgeR.logFC']),
                        math.pow(2.0, float(line['edgeR.logFC'])),
                        float(line['edgeR.logFC']),  # no transform
                        # significance flag: "1" below the threshold
                        "1" if adjusted_pvalue < options.fdr_threshold
                        else "0",
                        status)))
            except ValueError as msg:
                raise ValueError("parsing error %s in line: %s" % (msg, line))

        Expression.writeExpressionResults(options.stdout, results)
        return
def plotDETagStats(infile, outfile):
    '''plot differential expression statistics.

    Forwards both filenames to :func:`Expression.plotDETagStats` and
    touches `outfile` afterwards via ``P.touch``.
    '''
    plotter = Expression.plotDETagStats
    plotter(infile, outfile)
    P.touch(outfile)
def outputRegionsOfInterest(design_file, counts_file, outfile,
                            max_per_sample=10, sum_per_group=40):
    '''output windows according to various filters.

    The output is a mock analysis similar to a differential expression
    result.

    A window is selected if, for one of the two groups, the counts of
    every sample are below `max_per_sample` while the summed counts of
    the other group are above `sum_per_group`.

    Arguments
    ---------
    design_file : string
        Filename with experimental design
    counts_file : string
        :term:`tsv` formatted file with counts per windows
    outfile : string
        Output filename in :term:`tsv` format
    max_per_sample : int
        Upper bound (exclusive) for the counts of each sample in the
        low group.
    sum_per_group : int
        Lower bound (exclusive) for the summed counts of the high
        group.

    Raises
    ------
    ValueError
        If the design does not contain exactly two included groups
        (raised by the tuple unpacking) or a group has no samples.
    '''
    # NOTE: job_memory, the *_level* names and the function arguments
    # are referenced by name from the statement placeholders / P.run()
    # and therefore must not be renamed.
    job_memory = "64G"

    design = Expression.readDesignFile(design_file)

    # remove tracks not included in the design
    design = dict((x, y) for x, y in design.items() if y.include)

    # define the two groups
    groups = sorted(set(x.group for x in design.values()))
    groupA, groupB = groups

    def _buildFilter(func, comparison, g, threshold):
        '''build one clause (max/sum over a group) of the filter.'''
        selected = [x for x, y in design.items() if y.group == g]
        if len(selected) > 1:
            return "%s((%s)) %s %f" % (
                func,
                ",".join(["int(r['%s'])" % x for x in selected]),
                comparison,
                threshold)
        elif len(selected) == 1:
            # a single value needs no aggregation
            return "int(r['%s']) %s %f" % (selected[0], comparison, threshold)
        # the message previously lacked a conversion specifier, so the
        # formatting itself raised a TypeError
        raise ValueError("no groups found for '%s'" % g)

    # build a filtering statement
    upper_levelA = _buildFilter("max", "<", groupA, max_per_sample)
    upper_levelB = _buildFilter("max", "<", groupB, max_per_sample)
    sum_levelA = _buildFilter("sum", ">", groupA, sum_per_group)
    sum_levelB = _buildFilter("sum", ">", groupB, sum_per_group)

    statement = (
        ' zcat %(counts_file)s'
        ' | cgat csv_select'
        ' --log=%(outfile)s.log'
        ' "(%(upper_levelA)s and %(sum_levelB)s) or'
        ' (%(upper_levelB)s and %(sum_levelA)s)"'
        ' | cgat runExpression'
        ' --log=%(outfile)s.log'
        ' --design-tsv-file=%(design_file)s'
        ' --tags-tsv-file=-'
        ' --method=mock'
        ' --filter-min-counts-per-sample=0'
        ' | gzip > %(outfile)s ')

    P.run(statement)
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Three methods are available (``--method``):

    filter
        remove samples/rows with low counts and write the table.
    normalize
        normalise the counts table and write it.
    spike
        shuffle rows (or clusters) to generate spike-ins and write
        them out.

    NOTE(review): the nesting of the loading section was reconstructed
    from a source in which indentation had been lost - verify it
    against the original file.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-d", "--design-tsv-file", dest="input_filename_design",
                      type="string",
                      help="input file with experimental design "
                      "[default=%default].")

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=("filter", "spike", "normalize"),
                      help="differential expression method to apply "
                      "[default=%default].")

    parser.add_option("--filter-min-counts-per-row",
                      dest="filter_min_counts_per_row",
                      type="int",
                      help="remove rows with less than this "
                      "number of counts in total [default=%default].")

    parser.add_option("--filter-min-counts-per-sample",
                      dest="filter_min_counts_per_sample",
                      type="int",
                      help="remove samples with a maximum count per sample of "
                      "less than this numer   [default=%default].")

    parser.add_option("--filter-percentile-rowsums",
                      dest="filter_percentile_rowsums",
                      type="int",
                      help="remove percent of rows with "
                      "lowest total counts [default=%default].")

    parser.add_option("--spike-change-bin-min", dest="min_cbin",
                      type="float",
                      help="minimum bin for change bins [default=%default].")

    parser.add_option("--spike-change-bin-max", dest="max_cbin",
                      type="float",
                      help="maximum bin for change bins [default=%default].")

    parser.add_option("--spike-change-bin-width", dest="width_cbin",
                      type="float",
                      help="bin width for change bins [default=%default].")

    parser.add_option("--spike-initial-bin-min", dest="min_ibin",
                      type="float",
                      help="minimum bin for initial bins[default=%default].")

    parser.add_option("--spike-initial-bin-max", dest="max_ibin",
                      type="float",
                      help="maximum bin for intitial bins[default=%default].")

    parser.add_option("--spike-initial-bin-width", dest="width_ibin",
                      type="float",
                      help="bin width intitial bins[default=%default].")

    # The help texts below used backslash continuations inside the
    # string literal, which embeds the source indentation in the help
    # output; adjacent literals avoid that.
    parser.add_option(
        "--spike-minimum", dest="min_spike", type="int",
        help="minimum number of spike-ins required within each bin "
        "[default=%default].")

    parser.add_option(
        "--spike-maximum", dest="max_spike", type="int",
        help="maximum number of spike-ins allowed within each bin "
        "[default=%default].")

    parser.add_option("--spike-difference-method", dest="difference",
                      type="choice",
                      choices=("relative", "logfold", "abs_logfold"),
                      help="method to use for calculating difference "
                      "[default=%default].")

    parser.add_option("--spike-iterations", dest="iterations", type="int",
                      help="number of iterations to generate spike-ins "
                      "[default=%default].")

    parser.add_option("--spike-cluster-maximum-distance",
                      dest="cluster_max_distance", type="int",
                      help="maximum distance between adjacent loci in cluster "
                      "[default=%default].")

    parser.add_option("--spike-cluster-minimum-size",
                      dest="cluster_min_size", type="int",
                      help="minimum number of loci required per cluster "
                      "[default=%default].")

    parser.add_option("--spike-type",
                      dest="spike_type", type="choice",
                      choices=("row", "cluster"),
                      help="spike in type [default=%default].")

    parser.add_option("--spike-subcluster-min-size",
                      dest="min_sbin", type="int",
                      help="minimum size of subcluster "
                      "[default=%default].")

    parser.add_option("--spike-subcluster-max-size",
                      dest="max_sbin", type="int",
                      help="maximum size of subcluster "
                      "[default=%default].")

    parser.add_option("--spike-subcluster-bin-width",
                      dest="width_sbin", type="int",
                      help="bin width for subcluster size "
                      "[default=%default].")

    parser.add_option("--spike-output-method",
                      dest="output_method", type="choice",
                      choices=("append", "seperate"),
                      help="defines whether the spike-ins should be appended "
                      "to the original table or seperately [default=%default].")

    parser.add_option("--spike-shuffle-column-suffix",
                      dest="shuffle_suffix", type="string",
                      help="the suffix of the columns which are to be shuffled "
                      "[default=%default].")

    parser.add_option("--spike-keep-column-suffix",
                      dest="keep_suffix", type="string",
                      help="a list of suffixes for the columns which are to be "
                      "keep along with the shuffled columns[default=%default].")

    parser.add_option("--normalization-method",
                      dest="normalization_method", type="choice",
                      choices=("deseq-size-factors",
                               "total-count",
                               "total-column",
                               "total-row"),
                      help="normalization method to apply [%default]")

    parser.add_option("-t", "--tags-tsv-file", dest="input_filename_tags",
                      type="string",
                      help="input file with tag counts [default=%default].")

    parser.set_defaults(
        input_filename_tags="-",
        method="filter",
        filter_min_counts_per_row=None,
        filter_min_counts_per_sample=None,
        filter_percentile_rowsums=None,
        output_method="seperate",
        difference="logfold",
        spike_type="row",
        min_cbin=0,
        max_cbin=100,
        width_cbin=100,
        min_ibin=0,
        max_ibin=100,
        width_ibin=100,
        max_spike=100,
        min_spike=None,
        iterations=1,
        cluster_max_distance=100,
        cluster_min_size=10,
        min_sbin=1,
        max_sbin=1,
        width_sbin=1,
        shuffle_suffix=None,
        keep_suffix=None,
        normalization_method="deseq-size-factors")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    # load
    if options.keep_suffix:
        # if using suffix, loadTagDataPandas will throw an error as it
        # looks for column names which exactly match the design
        # "tracks" need to write function in Counts.py to handle
        # counts table and design table + suffix
        # NOTE(review): in this branch `counts` and `design` are plain
        # pandas DataFrames, while the code further down uses
        # `counts.table`, `counts.removeSamples`, `design.firstPairOnly`
        # etc. - confirm that this branch is functional.
        counts = pd.read_csv(options.stdin, sep="\t", comment="#")
        inf = IOTools.openFile(options.input_filename_design)
        design = pd.read_csv(inf, sep="\t", index_col=0)
        inf.close()
        design = design[design["include"] != 0]

        if options.method in ("filter", "spike"):
            # NOTE(review): the design file has already been opened
            # above, so this check comes too late to be effective.
            if options.input_filename_design is None:
                raise ValueError("method '%s' requires a design file" %
                                 options.method)
    else:
        # create Counts object
        # TS if spike type is cluster, need to keep "contig" and "position"
        # columns out of index
        if options.spike_type == "cluster":
            # was `index = None,` - a stray trailing comma that created
            # the tuple (None,) instead of None
            index = None
        else:
            index = 0
        if options.input_filename_tags == "-":
            counts = Counts.Counts(pd.io.parsers.read_csv(
                options.stdin, sep="\t", index_col=index, comment="#"))
        else:
            # NOTE(review): unlike the stdin branch, the file handle and
            # the parser options go directly to Counts.Counts - confirm
            # that its constructor accepts these arguments.
            counts = Counts.Counts(
                IOTools.openFile(options.input_filename_tags, "r"),
                sep="\t", index_col=index, comment="#")

        # TS normalization doesn't require a design table
        if not options.method == "normalize":

            assert options.input_filename_design and os.path.exists(
                options.input_filename_design)

            # create Design object
            design = Expression.ExperimentalDesign(
                pd.read_csv(
                    IOTools.openFile(options.input_filename_design, "r"),
                    sep="\t", index_col=0, comment="#"))

    if options.method == "filter":

        assert (options.filter_min_counts_per_sample is not None or
                options.filter_min_counts_per_row is not None or
                options.filter_percentile_rowsums is not None), \
            "no filtering parameters have been suplied"

        # filter
        # remove sample with low counts
        if options.filter_min_counts_per_sample:
            counts.removeSamples(
                min_counts_per_sample=options.filter_min_counts_per_sample)

        # remove observations with low counts
        if options.filter_min_counts_per_row:
            counts.removeObservationsFreq(
                min_counts_per_row=options.filter_min_counts_per_row)

        # remove bottom percentile of observations
        if options.filter_percentile_rowsums:
            counts.removeObservationsPerc(
                percentile_rowsums=options.filter_percentile_rowsums)

        nobservations, nsamples = counts.table.shape

        if nobservations == 0:
            E.warn("no observations remaining after filtering- no output")
            return

        if nsamples == 0:
            E.warn("no samples remain after filtering - no output")
            return

        # write out
        counts.table.to_csv(options.stdout, sep="\t", header=True)

    elif options.method == "normalize":

        counts.normalise(method=options.normalization_method,
                         row_title="total")

        # write out
        counts.table.to_csv(options.stdout, sep="\t", header=True)

    elif options.method == "spike":
        # check parameters are sensible and set parameters where they
        # are not explicitly set
        if not options.min_spike:
            E.info("setting minimum number of spikes per bin to equal"
                   "maximum number of spikes per bin (%s)" % options.max_spike)
            options.min_spike = options.max_spike

        if options.spike_type == "cluster":

            assert options.max_sbin <= options.cluster_min_size, \
                ("max size of subscluster: %s is greater than min size of"
                 "cluster: %s" % (options.max_sbin, options.cluster_min_size))

            counts_columns = set(counts.table.columns.values.tolist())

            assert ("contig" in counts_columns and
                    "position" in counts_columns), \
                ("cluster analysis requires columns named 'contig' and"
                 "'position' in the dataframe")

            counts.sort(sort_columns=["contig", "position"], reset_index=True)

        # restrict design table to first pair only
        design.firstPairOnly()

        # get dictionaries to map group members to column names
        # use different methods depending on whether suffixes are supplied
        if options.keep_suffix:
            g_to_keep_tracks, g_to_spike_tracks = design.mapGroupsSuffix(
                options.shuffle_suffix, options.keep_suffix)
        else:
            # if no suffixes supplied, spike and keep tracks are the same
            g_to_track = design.getGroups2Samples()
            g_to_spike_tracks, g_to_keep_tracks = (g_to_track, g_to_track)

        # set up numpy arrays for change and initial values
        change_bins = np.arange(options.min_cbin,
                                options.max_cbin,
                                options.width_cbin)
        initial_bins = np.arange(options.min_ibin,
                                 options.max_ibin,
                                 options.width_ibin)

        E.info("Column boundaries are: %s" % str(change_bins))
        E.info("Row boundaries are: %s" % str(initial_bins))

        # shuffle rows/clusters
        if options.spike_type == "cluster":
            E.info("looking for clusters...")
            # NOTE(review): `counts_sort` and `groups` are not defined
            # anywhere in this function, so this branch raises a
            # NameError as written. Furthermore `bin_counts`, used
            # below, is only assigned in the "row" branch and `counts`
            # is rebound here. The intended arguments (possibly the
            # sorted `counts` object and `design.groups`) need to be
            # confirmed against Counts.findClusters/shuffleCluster.
            clusters_dict = Counts.findClusters(
                counts_sort, options.cluster_max_distance,
                options.cluster_min_size, g_to_spike_tracks, groups)
            if len(clusters_dict) == 0:
                raise Exception("no clusters were found, check parameters")

            E.info("shuffling subcluster regions...")
            output_indices, counts = Counts.shuffleCluster(
                initial_bins, change_bins, g_to_spike_tracks, groups,
                options.difference, options.max_spike,
                options.iterations, clusters_dict,
                options.max_sbin, options.min_sbin, options.width_sbin)

        elif options.spike_type == "row":

            E.info("shuffling rows...")
            output_indices, bin_counts = counts.shuffleRows(
                options.min_cbin, options.max_cbin, options.width_cbin,
                options.min_ibin, options.max_ibin, options.width_ibin,
                g_to_spike_tracks, design.groups, options.difference,
                options.max_spike, options.iterations)

        # keep only bins holding at least min_spike spike-ins
        filled_bins = Counts.thresholdBins(output_indices, bin_counts,
                                           options.min_spike)

        assert len(filled_bins) > 0, "No bins contained enough spike-ins"

        # write out
        counts.outputSpikes(
            filled_bins,
            g_to_keep_tracks, design.groups,
            output_method=options.output_method,
            spike_type=options.spike_type,
            min_cbin=options.min_cbin,
            width_cbin=options.width_cbin,
            max_cbin=options.max_cbin,
            min_ibin=options.min_ibin,
            width_ibin=options.width_ibin,
            max_ibin=options.max_ibin,
            min_sbin=options.min_sbin,
            width_sbin=options.width_sbin,
            max_sbin=options.max_sbin)

    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    When ``convert`` is among the selected toolsets, a tab-separated
    table with edgeR columns is read from stdin, rows whose
    ``edgeR.p.value`` is ``NA`` are dropped and the remaining rows are
    written to stdout through :func:`Expression.writeExpressionResults`.
    Nothing else is done by this revision of the script.

    Raises
    ------
    ValueError
        If a numeric column of an input row can not be parsed.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id",
        usage=globals()["__doc__"])

    parser.add_option("-u", "--ucsc-genome", dest="ucsc_genome",
                      type="string",
                      help="UCSC genome identifier [default=%default].")

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genome [default=%default].")

    # dest matches the ``extend`` default registered in set_defaults
    # below; the previous dest ("extension") had no default.
    parser.add_option("--extend", dest="extend", type="int",
                      help="extend tags by this number of bases "
                      "[default=%default].")

    parser.add_option("--shift-size", dest="shift", type="int",
                      help="shift tags by this number of bases "
                      "[default=%default].")

    parser.add_option("--window-size", dest="window_size", type="int",
                      help="window size to be used in the analysis"
                      "[default=%default].")

    parser.add_option("--saturation-iterations",
                      dest="saturation_iterations", type="int",
                      help="iterations for saturation analysis "
                      "[default=%default].")

    parser.add_option("-t", "--toolset", dest="toolset", type="choice",
                      action="append",
                      choices=("saturation", "coverage", "enrichment",
                               "dmr", "rms", "rpm", "all", "convert"),
                      help="actions to perform [default=%default].")

    parser.add_option("-w", "--bigwig-file", dest="bigwig",
                      action="store_true",
                      help="store wig files as bigwig files - requires a "
                      "genome file [default=%default]")

    parser.add_option("--treatment", dest="treatment_files", type="string",
                      action="append",
                      help="BAM files for treatment. At least one is required "
                      "[%default]")

    parser.add_option("--control", dest="control_files", type="string",
                      action="append",
                      help="BAM files for control for differential "
                      "methylation analysis. Optional [%default].")

    parser.add_option("--input", dest="input_files", type="string",
                      action="append",
                      help="BAM files for input correction. "
                      "Optional [%default].")

    parser.add_option("--is-not-medip",
                      dest="is_medip", action="store_false",
                      help="data is not MeDIP data and is not expected "
                      "to fit the calibration model. No CpG "
                      "density normalized rms data is computed"
                      "[default=%default].")

    parser.add_option("--output-rdata", dest="output_rdata",
                      action="store_true",
                      help="in dmr analysis, write R session to file. "
                      "The file name "
                      "is given by --output-filename-pattern [%default].")

    parser.add_option("--rdata-file", dest="input_rdata",
                      type="string",
                      help="in dmr analysis, read saved R session from "
                      "file. This can be used to apply different "
                      "filters [%default]")

    parser.add_option("--fdr-threshold", dest="fdr_threshold", type="float",
                      help="FDR threshold to apply for selecting DMR "
                      "[default=%default].")

    parser.add_option("--fdr-method", dest="fdr_method", type="choice",
                      choices=("bonferroni", "BH", "holm", "hochberg",
                               "hommel", "BY", "fdr", "none"),
                      help="FDR method to apply for selecting DMR "
                      "[default=%default].")

    parser.add_option("--bwa", dest="bwa", action="store_true",
                      help="alignment generated with bwa"
                      "[default=%default].")

    parser.add_option("--unique", dest="unique", type="float",
                      help="Threshold p-value to determine which read pile "
                      "ups are the result of PCR overamplification"
                      "[default=%default].")

    parser.add_option("--chroms", dest="chroms", type="str",
                      help="Comma delimited list of chromosomes to include"
                      "[default=%default].")

    parser.set_defaults(
        input_format="bam",
        ucsc_genome="Hsapiens.UCSC.hg19",
        genome_file=None,
        extend=0,
        shift=0,
        window_size=300,
        saturation_iterations=10,
        toolset=[],
        bigwig=False,
        treatment_files=[],
        control_files=[],
        input_files=[],
        output_rdata=False,
        input_rdata=None,
        is_medip=True,
        fdr_threshold=0.1,
        fdr_method="BH",
        bwa=False,
        unique=0.001,
        chroms=None
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    if "convert" in options.toolset:

        results = []
        for line in CSV.DictReader(options.stdin,
                                   dialect="excel-tab"):
            # windows without a test result carry no information
            if line['edgeR.p.value'] == "NA":
                continue

            # assumes only a single treatment/control
            treatment_name = options.treatment_files[0]
            control_name = options.control_files[0]
            status = "OK"
            try:
                results.append(
                    Expression.GeneExpressionResult._make((
                        "%s:%i-%i" % (line['chr'],
                                      int(line['start']),
                                      int(line['stop'])),
                        treatment_name,
                        float(line['MSets1.rpkm.mean']),
                        0,
                        control_name,
                        float(line['MSets2.rpkm.mean']),
                        0,
                        float(line['edgeR.p.value']),
                        float(line['edgeR.adj.p.value']),
                        float(line['edgeR.logFC']),
                        math.pow(2.0, float(line['edgeR.logFC'])),
                        float(line['edgeR.logFC']),  # no transform
                        ["0", "1"][float(line['edgeR.adj.p.value']) <
                                   options.fdr_threshold],
                        status)))
            # "as" form is valid on Python 2.6+ and Python 3; the former
            # "except ValueError, msg" is a syntax error on Python 3.
            except ValueError as msg:
                raise ValueError("parsing error %s in line: %s" % (msg, line))

        Expression.writeExpressionResults(options.stdout, results)
        return
def outputRegionsOfInterest(design_file, counts_file, outfile,
                            max_per_sample=10, sum_per_group=40):
    '''output windows according to various filters.

    The output is a mock analysis similar to a differential expression
    result.

    A window is kept if, in one group, every sample has fewer than
    `max_per_sample` counts while the counts summed over the samples
    of the other group exceed `sum_per_group` (tested in both
    directions).

    Arguments
    ---------
    design_file : string
        Filename with experimental design
    counts_file : string
        :term:`tsv` formatted file with counts per windows
    outfile : string
        Output filename in :term:`tsv` format
    max_per_sample : int
        Exclusive upper bound on the count of each sample in the
        "low" group.
    sum_per_group : int
        Exclusive lower bound on the summed counts of the "high" group.

    Raises
    ------
    ValueError
        If the included tracks do not form exactly two groups, or if a
        group has no tracks.
    '''
    # NOTE: P.run() interpolates the statement from the local variables
    # of this function (job_memory, counts_file, outfile, the *_level*
    # expressions, ...), so these names must not be renamed.
    job_memory = "64G"

    design = Expression.readDesignFile(design_file)

    # remove tracks not included in the design
    design = dict([(x, y) for x, y in design.items() if y.include])
    # define the two groups
    groups = sorted(set([x.group for x in design.values()]))

    if len(groups) != 2:
        raise ValueError(
            "expected exactly two groups in design, got %i: %s" %
            (len(groups), ",".join(map(str, groups))))

    # build a filtering statement
    groupA, groupB = groups

    def _buildFilter(aggregate, comparison, g, threshold):
        '''return a csv_select expression comparing the aggregate
        (``max`` or ``sum``) over all tracks of group *g* to
        *threshold*.'''
        selected = [x for x, y in design.items() if y.group == g]

        if len(selected) > 1:
            return "%s((%s)) %s %f" % (
                aggregate,
                ",".join(["int(r['%s'])" % x for x in selected]),
                comparison,
                threshold)
        elif len(selected) == 1:
            # a single column needs no aggregation
            return "int(r['%s']) %s %f" % (selected[0], comparison,
                                           threshold)
        else:
            # the previous message had no placeholder, so formatting it
            # raised TypeError instead of this ValueError
            raise ValueError("no groups found for '%s'" % g)

    def _buildMax(g, threshold):
        return _buildFilter("max", "<", g, threshold)

    def _buildSum(g, threshold):
        return _buildFilter("sum", ">", g, threshold)

    upper_levelA = _buildMax(groupA, max_per_sample)
    upper_levelB = _buildMax(groupB, max_per_sample)
    sum_levelA = _buildSum(groupA, sum_per_group)
    sum_levelB = _buildSum(groupB, sum_per_group)

    statement = '''
    zcat %(counts_file)s
    | python %(scriptsdir)s/csv_select.py
            --log=%(outfile)s.log
            "(%(upper_levelA)s and %(sum_levelB)s) or
             (%(upper_levelB)s and %(sum_levelA)s)"
    | python %(scriptsdir)s/runExpression.py
            --log=%(outfile)s.log
            --design-tsv-file=%(design_file)s
            --tags-tsv-file=-
            --method=mock
            --filter-min-counts-per-sample=0
    | gzip
    > %(outfile)s
    '''

    P.run()
def runCuffdiff(bamfiles,
                design_file,
                geneset_file,
                outfile,
                cuffdiff_options="",
                job_threads=4,
                job_memory="4G",
                fdr=0.1,
                mask_file=None):
    '''estimate differential expression using cuffdiff.

    Replicates within each track are grouped.

    Arguments
    ---------
    bamfiles : list
        List of filenames in :term:`bam` format.
    design_file : string
        Filename with experimental design in :term:`tsv` format.
    geneset_file : string
        Filename with geneset of interest in :term:`gtf` format.
    outfile : string
        Output filename. The output is :term:`tsv` formatted.
        cuffdiff itself writes into the directory ``outfile + ".dir"``
        and logs to ``outfile + ".log"``.
    cuffdiff_options : string
        Options to pass on to cuffdiff
    job_threads : int
        Number of threads to use.
    job_memory : string
        Memory to reserve. NOTE(review): currently overridden with
        "7G" before the job is run - confirm whether the argument
        should be honoured instead.
    fdr : float
        FDR threshold to apply.
    mask_file : string
        If given, ignore genes overlapping gene models in
        this :term:`gtf` formatted file.
    '''

    design = Expression.readDesignFile(design_file)

    outdir = outfile + ".dir"
    try:
        os.mkdir(outdir)
    except OSError:
        # an existing output directory is fine, anything else
        # (permissions, missing parent, ...) is a real error
        if not os.path.isdir(outdir):
            raise

    # replicates are separated by ","
    reps = collections.defaultdict(list)
    for bamfile in bamfiles:
        # .accepted.bam kept for legacy reasons (see rnaseq pipeline)
        track = P.snip(os.path.basename(bamfile), ".bam", ".accepted.bam")
        if track not in design:
            E.warn("bamfile '%s' not part of design - skipped" % bamfile)
            continue

        d = design[track]
        if not d.include:
            continue
        reps[d.group].append(bamfile)

    groups = sorted(reps.keys())
    labels = ",".join(groups)
    # one space-separated argument per group, replicates joined by ","
    reps = " ".join([",".join(reps[group]) for group in groups])

    # Nick - add mask gtf to not assess rRNA and ChrM
    extra_options = []

    if mask_file:
        extra_options.append(" -M %s" % os.path.abspath(mask_file))

    extra_options = " ".join(extra_options)

    # IMS added a checkpoint to catch cuffdiff errors
    # AH: removed log messages about BAM record error
    # These cause logfiles to grow several Gigs and are
    # frequent for BAM files not created by tophat.
    # Error is:
    # BAM record error: found spliced alignment without XS attribute
    # AH: compress output in outdir

    # NOTE: P.run() interpolates the statement from the local variables
    # of this function, so the names used below must stay as they are.
    job_memory = "7G"
    statement = '''date > %(outfile)s.log; hostname >> %(outfile)s.log;
    cuffdiff --output-dir %(outdir)s
             --verbose
             --num-threads %(job_threads)i
             --labels %(labels)s
             --FDR %(fdr)f
             %(extra_options)s
             %(cuffdiff_options)s
             <(gunzip < %(geneset_file)s )
             %(reps)s
    2>&1
    | grep -v 'BAM record error'
    >> %(outfile)s.log;
    checkpoint;
    gzip -f %(outdir)s/*;
    checkpoint;
    date >> %(outfile)s.log;
    '''
    P.run()

    results = parseCuffdiff(os.path.join(outdir, "gene_exp.diff.gz"))

    Expression.writeExpressionResults(outfile, results)
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Tag counts given on stdin (``--filename-tags=-``) are first copied
    to a temporary file, which is removed again before returning. For
    the methods ``deseq``, ``edger`` and ``mock`` the tag data are
    loaded and filtered before the analysis is run; nothing is output
    if no observations or samples remain. If R raises an error and
    ``--save-R`` was given, the R image is saved before the error is
    re-raised.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-t", "--filename-tags", dest="input_filename_tags",
                      type="string",
                      help="input file with tag counts [default=%default].")

    parser.add_option(
        "-d", "--filename-design", dest="input_filename_design",
        type="string",
        help="input file with experimental design [default=%default].")

    parser.add_option("-o", "--outfile", dest="output_filename",
                      type="string",
                      help="output filename [default=%default].")

    parser.add_option(
        "-m", "--method", dest="method", type="choice",
        choices=("deseq", "edger", "cuffdiff", "mock", "summary",
                 "dump", "spike"),
        help="differential expression method to apply [default=%default].")

    parser.add_option("--deseq-dispersion-method",
                      dest="deseq_dispersion_method", type="choice",
                      choices=("pooled", "per-condition", "blind"),
                      help="dispersion method for deseq [default=%default].")

    parser.add_option("--deseq-fit-type", dest="deseq_fit_type",
                      type="choice",
                      choices=("parametric", "local"),
                      help="fit type for deseq [default=%default].")

    parser.add_option("--deseq-sharing-mode", dest="deseq_sharing_mode",
                      type="choice",
                      choices=("maximum", "fit-only", "gene-est-only"),
                      help="deseq sharing mode [default=%default].")

    parser.add_option("-f", "--fdr", dest="fdr", type="float",
                      help="fdr to apply [default=%default].")

    parser.add_option(
        "-p", "--pseudo-counts", dest="pseudo_counts", type="float",
        help="pseudocounts to add for mock analyis [default=%default].")

    parser.add_option("-R", "--save-R", dest="save_r_environment",
                      type="string",
                      help="save R environment [default=%default].")

    # help previously used "$default", which optparse does not expand
    parser.add_option(
        "-r", "--reference-group", dest="ref_group", type="string",
        help="Group to use as reference to compute fold changes against "
        "[default=%default]")

    parser.add_option(
        "--filter-min-counts-per-row", dest="filter_min_counts_per_row",
        type="int",
        help="remove rows with less than this number of counts in total "
        "[default=%default].")

    parser.add_option("--filter-min-counts-per-sample",
                      dest="filter_min_counts_per_sample", type="int",
                      help="remove samples with a maximum count per sample of "
                      "less than this numer   [default=%default].")

    parser.add_option(
        "--filter-percentile-rowsums", dest="filter_percentile_rowsums",
        type="int",
        help="remove percent of rows with lowest total counts "
        "[default=%default].")

    parser.set_defaults(
        input_filename_tags="-",
        input_filename_design=None,
        output_filename=sys.stdout,
        method="deseq",
        fdr=0.1,
        deseq_dispersion_method="pooled",
        deseq_fit_type="parametric",
        deseq_sharing_mode="maximum",
        ref_group=None,
        save_r_environment=None,
        filter_min_counts_per_row=1,
        filter_min_counts_per_sample=10,
        filter_percentile_rowsums=0,
        pseudo_counts=0,
        spike_foldchange_max=4.0,
        spike_expression_max=5.0,
        spike_expression_bin_width=0.5,
        spike_foldchange_bin_width=0.5,
        spike_max_counts_per_bin=50,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    # the analysis functions take a filename, so stdin is spooled
    # into a temporary file
    if options.input_filename_tags == "-":
        fh = P.getTempFile()
        fh.write("".join(options.stdin))
        fh.close()
        options.input_filename_tags = fh.name
    else:
        fh = None

    try:
        # load tag data and filter
        if options.method in ("deseq", "edger", "mock"):
            assert options.input_filename_tags and os.path.exists(
                options.input_filename_tags)
            assert options.input_filename_design and os.path.exists(
                options.input_filename_design)

            Expression.loadTagData(options.input_filename_tags,
                                   options.input_filename_design)

            nobservations, nsamples = Expression.filterTagData(
                filter_min_counts_per_row=options.filter_min_counts_per_row,
                filter_min_counts_per_sample=(
                    options.filter_min_counts_per_sample),
                filter_percentile_rowsums=options.filter_percentile_rowsums)

            if nobservations == 0:
                E.warn("no observations - no output")
                return

            if nsamples == 0:
                E.warn("no samples remain after filtering - no output")
                return

            sample_names = R('''colnames(countsTable)''')
            E.info("%i samples to test at %i observations: %s" %
                   (nsamples, nobservations, ",".join(sample_names)))

        try:
            if options.method == "deseq":
                Expression.runDESeq(
                    outfile=options.output_filename,
                    outfile_prefix=options.output_filename_pattern,
                    fdr=options.fdr,
                    dispersion_method=options.deseq_dispersion_method,
                    fit_type=options.deseq_fit_type,
                    sharing_mode=options.deseq_sharing_mode,
                    ref_group=options.ref_group,
                )

            elif options.method == "edger":
                Expression.runEdgeR(
                    outfile=options.output_filename,
                    outfile_prefix=options.output_filename_pattern,
                    fdr=options.fdr,
                    ref_group=options.ref_group)

            elif options.method == "mock":
                Expression.runMockAnalysis(
                    outfile=options.output_filename,
                    outfile_prefix=options.output_filename_pattern,
                    ref_group=options.ref_group,
                    pseudo_counts=options.pseudo_counts,
                )

            elif options.method == "summary":
                Expression.outputTagSummary(
                    options.input_filename_tags,
                    options.stdout,
                    options.output_filename_pattern,
                    filename_design=options.input_filename_design)

            elif options.method == "dump":
                assert options.input_filename_tags and os.path.exists(
                    options.input_filename_tags)
                Expression.dumpTagData(options.input_filename_tags,
                                       options.input_filename_design,
                                       outfile=options.stdout)

            elif options.method == "spike":
                Expression.outputSpikeIns(
                    options.input_filename_tags,
                    options.stdout,
                    options.output_filename_pattern,
                    filename_design=options.input_filename_design,
                    foldchange_max=options.spike_foldchange_max,
                    expression_max=options.spike_expression_max,
                    max_counts_per_bin=options.spike_max_counts_per_bin,
                    expression_bin_width=options.spike_expression_bin_width,
                    foldchange_bin_width=options.spike_foldchange_bin_width,
                )

        # the former "except X, msg" form is a syntax error on Python 3;
        # the exception object itself was never used.
        except rpy2.rinterface.RRuntimeError:
            if options.save_r_environment:
                E.info("saving R image to %s" % options.save_r_environment)
                R['save.image'](options.save_r_environment)
            raise
    finally:
        # remove the temporary copy of stdin on every exit path
        if fh is not None and os.path.exists(fh.name):
            os.unlink(fh.name)
def loadCuffdiff(dbhandle, infile, outfile, min_fpkm=1.0):
    '''load results from cuffdiff analysis to database

    This functions parses and loads the results of a cuffdiff
    differential expression analysis.
    Parsing is performed by the parseCuffdiff function.

    Multiple tables will be created as cuffdiff outputs information
    on gene, isoform, tss, etc. levels. For each level the tables
    ``<prefix>_<level>_diff``, ``<prefix>_<level>_levels`` and
    ``<prefix>_<level>sample_fpkms`` are loaded, where ``<prefix>`` is
    derived from `outfile`.

    The method converts from ln(fold change) to log2 fold change.

    Pairwise comparisons in which one gene is not expressed (fpkm <
    `min_fpkm`) are set to status 'NOCALL'. These transcripts might
    nevertheless be significant.

    If the directory ``infile + ".dir"`` does not exist, `outfile` is
    touched and nothing is loaded.

    Arguments
    ---------
    dbhandle : object
        Database handle.
    infile : string
        Input filename, output from cuffdiff
    outfile : string
        Output filename in :term:`tsv` format.
    min_fpkm : float
        Minimum fpkm. Genes with an fpkm lower than this will
        be set to status `NOCALL`.

    Raises
    ------
    ValueError
        If a sample appears twice for the same gene in a read group
        tracking file.
    '''

    prefix = P.toTable(outfile)
    indir = infile + ".dir"

    if not os.path.exists(indir):
        P.touch(outfile)
        return

    # E.info( "building cummeRbund database" )
    # R('''library(cummeRbund)''')
    # cuff = R('''readCufflinks(dir = %(indir)s, dbfile=%(indir)s/csvdb)''' )
    # to be continued...

    tmpname = P.getTempFilename(shared=True)

    try:
        # ignore promoters and splicing - no fold change column, but
        # sqrt(JS)
        for fn, level in (("cds_exp.diff.gz", "cds"),
                          ("gene_exp.diff.gz", "gene"),
                          ("isoform_exp.diff.gz", "isoform"),
                          # ("promoters.diff.gz", "promotor"),
                          # ("splicing.diff.gz", "splice"),
                          ("tss_group_exp.diff.gz", "tss")):

            tablename = prefix + "_" + level + "_diff"

            results = parseCuffdiff(os.path.join(indir, fn),
                                    min_fpkm=min_fpkm)
            Expression.writeExpressionResults(tmpname, results)
            P.load(tmpname, outfile,
                   tablename=tablename,
                   options="--allow-empty-file "
                   "--add-index=treatment_name "
                   "--add-index=control_name "
                   "--add-index=test_id")
    finally:
        # the shared temporary file was previously left behind
        if os.path.exists(tmpname):
            os.unlink(tmpname)

    for fn, level in (("cds.fpkm_tracking.gz", "cds"),
                      ("genes.fpkm_tracking.gz", "gene"),
                      ("isoforms.fpkm_tracking.gz", "isoform"),
                      ("tss_groups.fpkm_tracking.gz", "tss")):

        tablename = prefix + "_" + level + "_levels"

        P.load(os.path.join(indir, fn), outfile,
               tablename=tablename,
               options="--allow-empty-file "
               "--add-index=tracking_id "
               "--add-index=control_name "
               "--add-index=test_id")

    # Jethro - load tables of sample specific cuffdiff fpkm values into csvdb
    # IMS: First read in lookup table for CuffDiff/Pipeline sample name
    # conversion
    sample_lookup = {}
    inf = IOTools.openFile(os.path.join(indir, "read_groups.info.gz"))
    try:
        inf.readline()  # skip header
        for line in inf:
            line = line.split("\t")
            our_sample_name = IOTools.snip(line[0])
            our_sample_name = re.sub("-", "_", our_sample_name)
            cuffdiff_sample_name = "%s_%s" % (line[1], line[2])
            sample_lookup[cuffdiff_sample_name] = our_sample_name
    finally:
        inf.close()

    for fn, level in (("cds.read_group_tracking.gz", "cds"),
                      ("genes.read_group_tracking.gz", "gene"),
                      ("isoforms.read_group_tracking.gz", "isoform"),
                      ("tss_groups.read_group_tracking.gz", "tss")):

        tablename = prefix + "_" + level + "sample_fpkms"

        inf = IOTools.openFile(os.path.join(indir, fn))
        try:
            lines = inf.readlines()
        finally:
            inf.close()

        seen_samples = set()
        genes = {}

        # first line is the header
        for line in lines[1:]:
            fields = line.split()
            gene_id = fields[0]
            condition = fields[1]
            replicate = fields[2]
            fpkm = fields[6]
            status = fields[8]

            sample_id = condition + "_" + replicate
            seen_samples.add(sample_id)

            # every (gene, sample) pair is recorded exactly once,
            # independently of whether the sample has been seen before
            sample_values = genes.setdefault(gene_id, {})
            if sample_id in sample_values:
                raise ValueError(
                    'sample_id %s appears twice in file for gene_id %s'
                    % (sample_id, gene_id))

            # values that are not "OK" are reported as their status; this
            # now also applies to the first record of each gene
            if status != "OK":
                sample_values[sample_id] = status
            else:
                sample_values[sample_id] = fpkm

        samples = sorted(seen_samples)

        # IMS - CDS files might be empty if not cds has been
        # calculated for the genes in the long term need to add CDS
        # annotation to denovo predicted genesets in meantime just
        # skip if cds tracking file is empty
        if len(samples) == 0:
            continue

        tmpf = P.getTempFilename(".")
        outf = IOTools.openFile(tmpf, "w")
        try:
            headers = "gene_id\t" + "\t".join(
                [sample_lookup[x] for x in samples])
            outf.write(headers + "\n")

            # one row per gene, one column per sample in sorted order.
            # The previous hand-written while loop tested an unrelated
            # variable and did not write all columns.
            for gene in genes:
                outf.write(
                    gene + "\t" +
                    "\t".join([genes[gene][s] for s in samples]) + "\n")
        finally:
            outf.close()

        try:
            P.load(tmpf,
                   outfile,
                   tablename=tablename,
                   options="--allow-empty-file "
                   " --add-index=gene_id")
        finally:
            os.unlink(tmpf)

    # build convenience table with tracks
    tablename = prefix + "_isoform_levels"
    tracks = Database.getColumnNames(dbhandle, tablename)
    tracks = [x[:-len("_FPKM")] for x in tracks if x.endswith("_FPKM")]

    tmpfile = P.getTempFile(dir=".")
    tmpfile.write("track\n")
    tmpfile.write("\n".join(tracks) + "\n")
    tmpfile.close()

    P.load(tmpfile.name, outfile)
    os.unlink(tmpfile.name)
def runCuffdiff(bamfiles,
                design_file,
                geneset_file,
                outfile,
                cuffdiff_options="",
                threads=4,
                fdr=0.1,
                mask_file=None):
    '''estimate differential expression using cuffdiff.

    Replicates within each track are grouped.

    Arguments
    ---------
    bamfiles : list
        bam files; files whose track is not part of the design or is
        not flagged as included are skipped.
    design_file : string
        design file describing which differential expression to test
    geneset_file : string
        geneset to use for the analysis (read through ``gunzip``).
    outfile : string
        Output filename. cuffdiff itself writes into the directory
        ``outfile + ".dir"`` and logs to ``outfile + ".log"``.
    cuffdiff_options : string
        Options to pass on to cuffdiff
    threads : int
        Number of threads to use.
    fdr : float
        FDR threshold passed to cuffdiff.
    mask_file : string
        If given, passed to cuffdiff with ``-M``.
    '''

    design = Expression.readDesignFile(design_file)

    outdir = outfile + ".dir"
    try:
        os.mkdir(outdir)
    except OSError:
        # an existing output directory is fine, anything else
        # (permissions, missing parent, ...) is a real error
        if not os.path.isdir(outdir):
            raise

    # NOTE: P.run() interpolates the statement from the local variables
    # of this function, so the names used below must stay as they are.
    job_threads = threads

    # replicates are separated by ","
    reps = collections.defaultdict(list)
    for bamfile in bamfiles:
        # .accepted.bam kept for legacy reasons (see rnaseq pipeline)
        track = P.snip(os.path.basename(bamfile), ".bam", ".accepted.bam")
        if track not in design:
            E.warn("bamfile '%s' not part of design - skipped" % bamfile)
            continue

        d = design[track]
        if not d.include:
            continue
        reps[d.group].append(bamfile)

    groups = sorted(reps.keys())
    labels = ",".join(groups)
    # one space-separated argument per group, replicates joined by ","
    reps = " ".join([",".join(reps[group]) for group in groups])

    # Nick - add mask gtf to not assess rRNA and ChrM
    extra_options = []

    if mask_file:
        extra_options.append(" -M %s" % os.path.abspath(mask_file))

    extra_options = " ".join(extra_options)

    # IMS added a checkpoint to catch cuffdiff errors
    # AH: removed log messages about BAM record error
    # These cause logfiles to grow several Gigs and are
    # frequent for BAM files not created by tophat.
    # Error is:
    # BAM record error: found spliced alignment without XS attribute
    # AH: compress output in outdir
    statement = '''date > %(outfile)s.log; hostname >> %(outfile)s.log;
    cuffdiff --output-dir %(outdir)s
             --verbose
             --num-threads %(threads)i
             --labels %(labels)s
             --FDR %(fdr)f
             %(extra_options)s
             %(cuffdiff_options)s
             <(gunzip < %(geneset_file)s )
             %(reps)s
    2>&1
    | grep -v 'BAM record error'
    >> %(outfile)s.log;
    checkpoint;
    gzip -f %(outdir)s/*;
    checkpoint;
    date >> %(outfile)s.log;
    '''
    P.run()

    results = parseCuffdiff(os.path.join(outdir, "gene_exp.diff.gz"))

    Expression.writeExpressionResults(outfile, results)
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Depending on ``--toolset`` the script either converts a MEDIPS
    result table read from stdin (``convert``) or runs MEDIPS through
    R on the treatment/control BAM files: saturation analysis, CpG
    coverage, CpG enrichment and the test for differential methylation
    (``dmr``). Without an explicit toolset, ``all`` is assumed.

    Raises
    ------
    ValueError
        If no treatment file is given, if bigwig output is requested
        without a genome file, or if a row can not be parsed in
        ``convert`` mode.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id",
                            usage=globals()["__doc__"])

    parser.add_option("-u", "--ucsc-genome", dest="ucsc_genome",
                      type="string",
                      help="UCSC genome identifier [default=%default].")

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genome [default=%default].")

    # dest used to be "extension" while the code below reads
    # options.extend, so --extend had no effect
    parser.add_option("--extend", dest="extend", type="int",
                      help="extend tags by this number of bases "
                      "[default=%default].")

    parser.add_option("--shift-size", dest="shift", type="int",
                      help="shift tags by this number of bases "
                      "[default=%default].")

    parser.add_option("--window-size", dest="window_size", type="int",
                      help="window size to be used in the analysis"
                      "[default=%default].")

    parser.add_option("--saturation-iterations",
                      dest="saturation_iterations", type="int",
                      help="iterations for saturation analysis "
                      "[default=%default].")

    parser.add_option("-t", "--toolset", dest="toolset", type="choice",
                      action="append",
                      choices=("saturation", "coverage", "enrichment",
                               "dmr", "rms", "rpm", "all", "convert"),
                      help="actions to perform [default=%default].")

    parser.add_option("-w", "--bigwig-file", dest="bigwig",
                      action="store_true",
                      help="store wig files as bigwig files - requires a "
                      "genome file [default=%default]")

    parser.add_option("--treatment", dest="treatment_files", type="string",
                      action="append",
                      help="BAM files for treatment. At least one is required "
                      "[%default]")

    parser.add_option("--control", dest="control_files", type="string",
                      action="append",
                      help="BAM files for control for differential "
                      "methylation analysis. Optional [%default].")

    parser.add_option("--input", dest="input_files", type="string",
                      action="append",
                      help="BAM files for input correction. "
                      "Optional [%default].")

    parser.add_option("--is-not-medip",
                      dest="is_medip", action="store_false",
                      help="data is not MeDIP data and is not expected "
                      "to fit the calibration model. No CpG "
                      "density normalized rms data is computed"
                      "[default=%default].")

    parser.add_option("--output-rdata", dest="output_rdata",
                      action="store_true",
                      help="in dmr analysis, write R session to file. "
                      "The file name "
                      "is given by --output-filename-pattern [%default].")

    parser.add_option("--rdata-file", dest="input_rdata",
                      type="string",
                      help="in dmr analysis, read saved R session from "
                      "file. This can be used to apply different "
                      "filters [%default]")

    parser.add_option("--fdr-threshold", dest="fdr_threshold", type="float",
                      help="FDR threshold to apply for selecting DMR "
                      "[default=%default].")

    parser.add_option("--fdr-method", dest="fdr_method", type="choice",
                      choices=("bonferroni", "BH", "holm", "hochberg",
                               "hommel", "BY", "fdr", "none"),
                      help="FDR method to apply for selecting DMR "
                      "[default=%default].")

    parser.add_option("--bwa", dest="bwa", action="store_true",
                      help="alignment generated with bwa"
                      "[default=%default].")

    parser.add_option("--unique", dest="unique", type="float",
                      help="Threshold p-value to determine which read pile "
                      "ups are the result of PCR overamplification"
                      "[default=%default].")

    parser.add_option("--chroms", dest="chroms", type="str",
                      help="Comma delimited list of chromosomes to include"
                      "[default=%default].")

    parser.set_defaults(input_format="bam",
                        ucsc_genome="Hsapiens.UCSC.hg19",
                        genome_file=None,
                        extend=0,
                        shift=0,
                        window_size=300,
                        saturation_iterations=10,
                        toolset=[],
                        bigwig=False,
                        treatment_files=[],
                        control_files=[],
                        input_files=[],
                        output_rdata=False,
                        input_rdata=None,
                        is_medip=True,
                        fdr_threshold=0.1,
                        fdr_method="BH",
                        bwa=False,
                        unique=0.001,
                        chroms=None)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    if "convert" in options.toolset:

        results = []
        for line in CSV.DictReader(options.stdin, dialect="excel-tab"):
            # windows without a test result carry no information
            if line['edgeR.p.value'] == "NA":
                continue

            # assumes only a single treatment/control
            treatment_name = options.treatment_files[0]
            control_name = options.control_files[0]
            status = "OK"
            try:
                results.append(
                    Expression.GeneExpressionResult._make((
                        "%s:%i-%i" % (line['chr'],
                                      int(line['start']),
                                      int(line['stop'])),
                        treatment_name,
                        float(line['MSets1.rpkm.mean']),
                        0,
                        control_name,
                        float(line['MSets2.rpkm.mean']),
                        0,
                        float(line['edgeR.p.value']),
                        float(line['edgeR.adj.p.value']),
                        float(line['edgeR.logFC']),
                        math.pow(2.0, float(line['edgeR.logFC'])),
                        float(line['edgeR.logFC']),  # no transform
                        ["0", "1"][float(line['edgeR.adj.p.value']) <
                                   options.fdr_threshold],
                        status)))
            except ValueError as msg:
                raise ValueError("parsing error %s in line: %s" % (msg, line))

        Expression.writeExpressionResults(options.stdout, results)
        return

    if len(options.treatment_files) < 1:
        raise ValueError("please specify a filename with sample data")

    if options.bigwig and not options.genome_file:
        raise ValueError("please provide a genome file when outputting bigwig")

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        # only needed by the (currently disabled) wig export below
        contig_sizes = fasta.getContigSizes()

    if len(options.toolset) == 0:
        options.toolset = ["all"]

    do_all = "all" in options.toolset

    if options.chroms is None:
        chrstring = ""
    else:
        chroms = options.chroms.split(",")
        chrstring = ' chr.select=c(\"%s\"), ' % '\",\"'.join(chroms)

    # load MEDIPS
    R.library('MEDIPS')
    genome_file = 'BSgenome.%s' % options.ucsc_genome
    R.library(genome_file)

    # NOTE: the R statements below are filled in with "% locals()", so
    # the following local names must not be renamed.
    window_size = options.window_size
    extend = options.extend
    shift = options.shift
    saturation_iterations = options.saturation_iterations

    uniq = float(options.unique)

    if options.bwa is True:
        BWA = "TRUE"
    else:
        BWA = "FALSE"

    if "saturation" in options.toolset or do_all:
        E.info("saturation analysis")
        for fn in options.treatment_files + options.control_files:
            paired = isPaired(fn)
            R('''sr = MEDIPS.saturation(
            file='%(fn)s',
            BSgenome='%(genome_file)s',
            shift=%(shift)i,
            extend=%(extend)i,
            window_size=%(window_size)i,
            uniq=%(uniq)s,
            nit = %(saturation_iterations)i,
            paired = %(paired)s,
            bwa = %(BWA)s,
            %(chrstring)s
            nrit = 1)''' % locals())

            R.png(E.get_output_file("%s_saturation.png" % fn))
            R('''MEDIPS.plotSaturation(sr)''')
            R('''dev.off()''')
            R('''write.table(sr$estimation, file ='%s', sep='\t')''' %
              E.get_output_file("%s_saturation_estimation.tsv" % fn))

            outfile = IOTools.open_file(
                E.get_output_file("%s_saturation.tsv" % fn), "w")
            outfile.write("category\tvalues\n")
            outfile.write("estimated_correlation\t%s\n" %
                          ",".join(["%f" % x for x in R('''sr$maxEstCor''')]))
            outfile.write("true_correlation\t%s\n" %
                          ",".join(["%f" % x for x in R('''sr$maxTruCor''')]))
            outfile.write("nreads\t%s\n" %
                          ",".join(["%i" % x for x in R('''sr$numberReads''')]))
            outfile.close()

    if "coverage" in options.toolset or do_all:
        E.info("CpG coverage analysis")
        for fn in options.treatment_files + options.control_files:
            paired = isPaired(fn)
            R('''cr = MEDIPS.seqCoverage(
            file='%(fn)s',
            BSgenome='%(genome_file)s',
            pattern='CG',
            shift=%(shift)i,
            extend=%(extend)i,
            paired=%(paired)s,
            bwa=%(BWA)s,
            %(chrstring)s
            uniq=%(uniq)s)''' % locals())

            R.png(E.get_output_file("%s_cpg_coverage_pie.png" % fn))
            R('''MEDIPS.plotSeqCoverage(seqCoverageObj=cr,
            type = "pie", cov.level = c(0, 1, 2, 3, 4, 5))''')
            R('''dev.off()''')

            R.png(E.get_output_file("%s_cpg_coverage_hist.png" % fn))
            R('''MEDIPS.plotSeqCoverage(seqCoverageObj=cr,
            type = "hist", t=15)''')
            R('''dev.off()''')

            # note: this file is large
            R('''write.table(cr$cov.res, file=gzfile('%s','w'),
            sep='\t')''' %
              E.get_output_file("%s_saturation_coveredpos.tsv.gz" % fn))

    if 'enrichment' in options.toolset or do_all:
        E.info("CpG enrichment analysis")
        outfile = IOTools.open_file(E.get_output_file("enrichment.tsv.gz"),
                                    "w")
        # (R slot name, output column name, format of the value)
        slotnames = (("regions.CG", "regions_CG", "%i"),
                     ("regions.C", "regions_C", "%s"),
                     ("regions.G", "regions_G", "%f"),
                     ("regions.relH", "regions_relH", "%i"),
                     ("regions.GoGe", "regions_GoGe", "%i"),
                     ("genome.CG", "genome_CG", "%s"),
                     ("genome.C", "genome_C", "%s"),
                     ("genome.G", "genome_G", "%i"),
                     ("genome.relH", "genome_relH", "%i"),
                     ("enrichment.score.relH", "enrichment_relH", "%s"),
                     ("enrichment.score.GoGe", "enrichment_GoGe", "%s"))

        outfile.write("\t".join(['sample'] +
                                [x[1] for x in slotnames]) + "\n")
        for fn in options.treatment_files + options.control_files:
            paired = isPaired(fn)
            R('''ce = MEDIPS.CpGenrich(
            file='%(fn)s',
            BSgenome='%(genome_file)s',
            shift=%(shift)i,
            extend=%(extend)i,
            paired=%(paired)s,
            bwa=%(BWA)s,
            %(chrstring)s
            uniq=%(uniq)s)''' % locals())

            outfile.write("%s" % fn)
            for slotname, label, pattern in slotnames:
                value = tuple(R('''ce$%s''' % slotname))
                if len(value) == 0:
                    # missing slot: emit an empty column. Indexing the
                    # empty replacement value used to raise IndexError.
                    outfile.write("\t")
                else:
                    outfile.write("\t" + pattern % value[0])
            outfile.write("\n")
        outfile.close()

    if options.input_rdata:
        E.info("reading R session info from '%s'" % options.input_rdata)
        R('''load('%s')''' % options.input_rdata)

    else:
        if "dmr" in options.toolset or "correlation" in options.toolset \
           or do_all:
            # build four sets
            for x, fn in enumerate(options.treatment_files):
                paired = isPaired(fn)
                E.info("loading '%s'" % fn)
                R('''treatment_R%(x)i = MEDIPS.createSet(
                file='%(fn)s',
                BSgenome='%(genome_file)s',
                shift=%(shift)i,
                extend=%(extend)i,
                window_size=%(window_size)i,
                paired=%(paired)s,
                bwa=%(BWA)s,
                %(chrstring)s
                uniq=%(uniq)s)''' % locals())
            R('''treatment_set = c(%s)''' %
              ",".join(["treatment_R%i" % x
                        for x in range(len(options.treatment_files))]))

            if options.control_files:
                for x, fn in enumerate(options.control_files):
                    paired = isPaired(fn)
                    E.info("loading '%s'" % fn)
                    R('''control_R%(x)i = MEDIPS.createSet(
                    file='%(fn)s',
                    BSgenome='%(genome_file)s',
                    shift=%(shift)i,
                    extend=%(extend)i,
                    window_size=%(window_size)i,
                    paired=%(paired)s,
                    bwa=%(BWA)s,
                    %(chrstring)s
                    uniq=%(uniq)s)''' % locals())
                R('''control_set = c(%s)''' %
                  ",".join(["control_R%i" % x
                            for x in range(len(options.control_files))]))

            # build coupling vector
            R('''CS = MEDIPS.couplingVector(pattern="CG",
            refObj = treatment_set[[1]])''')

            if "correlation" in options.toolset or do_all:
                R('''cor.matrix = MEDIPS.correlation(
                c(treatment_set, control_set))''')

                R('''write.table(cor.matrix,
                file='%s',
                sep="\t")''' % E.get_output_file("correlation"))

            if "dmr" in options.toolset or do_all:
                # Data that does not fit the model causes
                # "Error in 1:max_signal_index : argument of length 0"
                # The advice is to set MeDIP=FALSE
                # See: http://comments.gmane.org/
                # gmane.science.biology.informatics.conductor/52319

                if options.is_medip:
                    medip = "TRUE"
                else:
                    medip = "FALSE"
                fdr_method = options.fdr_method

                E.info("applying test for differential methylation")
                R('''meth = MEDIPS.meth(
                MSet1 = treatment_set,
                MSet2 = control_set,
                CSet = CS,
                ISet1 = NULL,
                ISet2 = NULL,
                p.adj = "%(fdr_method)s",
                diff.method = "edgeR",
                MeDIP = %(medip)s,
                CNV = F,
                minRowSum = 1)''' % locals())

                # Note: several Gb in size
                # Output full methylation data table
                R('''write.table(meth,
                file=gzfile('%s', 'w'),
                sep="\t",
                row.names=F,
                quote=F)''' % E.get_output_file("data.tsv.gz"))

            # save R session
            if options.output_rdata:
                R('''save.image(file='%s', safe=FALSE)''' %
                  E.get_output_file("session.RData"))

    # DMR analysis - test for windows and output
    if "dmr" in options.toolset:

        E.info("selecting differentially methylated windows")

        # test windows for differential methylation
        fdr_threshold = options.fdr_threshold
        R('''tested = MEDIPS.selectSig(meth,
        adj=T,
        ratio=NULL,
        p.value=%(fdr_threshold)f,
        bg.counts=NULL,
        CNV=F)''' % locals())

        R('''write.table(tested,
        file=gzfile('%s', 'w'),
        sep="\t",
        quote=F)''' % E.get_output_file("significant_windows.gz"))

        # select gain and merge adjacent windows
        try:
            R('''gain = tested[which(tested[, grep("logFC", colnames(tested))] > 0),];
            gain_merged = MEDIPS.mergeFrames(frames=gain, distance=1)''')
            E.info('gain output: %s, merged: %s' %
                   (str(R('''dim(gain)''')),
                    str(R('''dim(gain_merged)'''))))
            R('''of=gzfile('%s', 'w');
            write.table(gain_merged,
            file=of,
            sep="\t",
            quote=F,
            row.names=FALSE,
            col.names=FALSE);
            close(of)''' % E.get_output_file("gain.bed.gz"))
        except rpy2.rinterface.RRuntimeError as msg:
            E.warn("could not compute gain windows: msg=%s" % msg)

        # select loss and merge adjacent windows
        try:
            R('''loss = tested[which(tested[, grep("logFC", colnames(tested))] < 0),];
            loss_merged = MEDIPS.mergeFrames(frames=loss, distance=1)''')
            E.info('loss output: %s, merged: %s' %
                   (str(R('''dim(loss)''')),
                    str(R('''dim(loss_merged)'''))))

            R('''of=gzfile('%s', 'w');
            write.table(loss_merged,
            file=of,
            sep="\t",
            quote=F,
            row.names=F,
            col.names=F);
            close(of)''' % E.get_output_file("loss.bed.gz"))
        except rpy2.rinterface.RRuntimeError as msg:
            E.warn("could not compute loss windows: msg=%s" % msg)

    # if "rpm" in options.toolset or do_all:
    #     outputfile = E.get_output_file("rpm.wig")
    #     R('''MEDIPS.exportWIG(file = '%(outputfile)s',
    #     data = CONTROL.SET, raw = T, descr = "rpm")''' %
    #       locals())
    #     if options.bigwig:
    #         bigwig(outputfile, contig_sizes)
    #     else:
    #         compress(outputfile)

    # if "rms" in options.toolset or do_all:
    #     outputfile = E.get_output_file("rms.wig")
    #     R('''MEDIPS.exportWIG(file = '%(outputfile)s',
    #     data = CONTROL.SET, raw = F, descr = "rms")''' %
    #       locals())
    #     if options.bigwig:
    #         bigwig(outputfile, contig_sizes)
    #     else:
    #         compress(outputfile)

    # write footer and output benchmark information.
    E.stop()
def main(argv=None):
    """Run a differential expression analysis or report on tag counts.

    Parses command line options in sys.argv, unless *argv* is given.

    The action is chosen with ``--method`` and dispatched to the matching
    function in ``Expression``.  When ``--tags-tsv-file`` is ``-`` the
    counts are read from ``options.stdin`` and spooled to a temporary
    file, which is removed again before the function exits (also on early
    return or error).

    For the testing methods (deseq2, deseq, edger, mock, ttest) the tag
    data are loaded and filtered first; if no observations or no samples
    remain after filtering, a warning is issued and the function returns
    without output.

    If R raises an ``RRuntimeError`` and ``--output-R-code`` was given,
    the R workspace is saved to that file before the error is re-raised.
    The workspace is also saved after a successful run.
    """
    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-t", "--tags-tsv-file", dest="input_filename_tags",
                      type="string",
                      help="input file with tag counts [default=%default].")

    parser.add_option(
        "--result-tsv-file", dest="input_filename_result",
        type="string",
        help="input file with results (for plotdetagstats) "
        "[default=%default].")

    parser.add_option("-d", "--design-tsv-file", dest="input_filename_design",
                      type="string",
                      help="input file with experimental design "
                      "[default=%default].")

    parser.add_option("-o", "--outfile", dest="output_filename",
                      type="string",
                      help="output filename [default=%default].")

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=(
                          "deseq", "edger", "deseq2",
                          "ttest",
                          "mock", "summary", "dump", "spike",
                          "plottagstats", "plotdetagstats"),
                      help="differential expression method to apply "
                      "[default=%default].")

    parser.add_option("--deseq-dispersion-method",
                      dest="deseq_dispersion_method",
                      type="choice",
                      choices=("pooled", "per-condition", "blind"),
                      help="dispersion method for deseq [default=%default].")

    parser.add_option("--deseq-fit-type", dest="deseq_fit_type",
                      type="choice",
                      choices=("parametric", "local"),
                      help="fit type for deseq [default=%default].")

    parser.add_option("--deseq-sharing-mode",
                      dest="deseq_sharing_mode",
                      type="choice",
                      choices=("maximum", "fit-only", "gene-est-only"),
                      help="deseq sharing mode [default=%default].")

    parser.add_option(
        "--edger-dispersion",
        dest="edger_dispersion", type="float",
        help="dispersion value for edgeR if there are no replicates "
        "[default=%default].")

    parser.add_option("-f", "--fdr", dest="fdr", type="float",
                      help="fdr to apply [default=%default].")

    parser.add_option("-p", "--pseudocounts", dest="pseudo_counts",
                      type="float",
                      help="pseudocounts to add for mock analyis "
                      "[default=%default].")

    parser.add_option("-R", "--output-R-code", dest="save_r_environment",
                      type="string",
                      help="save R environment [default=%default].")

    # optparse only expands "%default"; the previous "$default" was
    # printed verbatim in the help output.
    parser.add_option("-r", "--reference-group", dest="ref_group",
                      type="string",
                      help="Group to use as reference to compute "
                      "fold changes against [default=%default]")

    parser.add_option("--filter-min-counts-per-row",
                      dest="filter_min_counts_per_row",
                      type="int",
                      help="remove rows with less than this "
                      "number of counts in total [default=%default].")

    parser.add_option("--filter-min-counts-per-sample",
                      dest="filter_min_counts_per_sample",
                      type="int",
                      help="remove samples with a maximum count per sample of "
                      "less than this number   [default=%default].")

    parser.add_option("--filter-percentile-rowsums",
                      dest="filter_percentile_rowsums",
                      type="int",
                      help="remove percent of rows with "
                      "lowest total counts [default=%default].")

    parser.add_option("--deseq2-design-formula",
                      dest="model",
                      type="string",
                      help="Design formula for DESeq2")

    parser.add_option("--deseq2-contrasts",
                      dest="contrasts",
                      type="string",
                      help=("contrasts for post-hoc testing writen"
                            " variable:control:treatment,..."))

    parser.add_option("--deseq2-plot",
                      dest="plot",
                      type="int",
                      help=("draw plots during deseq2 analysis"))

    parser.set_defaults(
        input_filename_tags=None,
        input_filename_result=None,
        input_filename_design=None,
        output_filename=sys.stdout,
        method="deseq",
        fdr=0.1,
        deseq_dispersion_method="pooled",
        deseq_fit_type="parametric",
        deseq_sharing_mode="maximum",
        edger_dispersion=0.4,
        ref_group=None,
        save_r_environment=None,
        filter_min_counts_per_row=1,
        filter_min_counts_per_sample=10,
        filter_percentile_rowsums=0,
        pseudo_counts=0,
        spike_foldchange_max=4.0,
        spike_expression_max=5.0,
        spike_expression_bin_width=0.5,
        spike_foldchange_bin_width=0.5,
        spike_max_counts_per_bin=50,
        model=None,
        plot=1
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    # Counts arriving on stdin are spooled to a named file because the
    # downstream functions take a filename.  Text mode is requested so
    # that the joined lines can be written as-is.
    fh = None
    if options.input_filename_tags == "-":
        fh = tempfile.NamedTemporaryFile(mode="w", delete=False)
        fh.write("".join([x for x in options.stdin]))
        fh.close()
        options.input_filename_tags = fh.name

    try:
        # load tag data and filter
        if options.method in ("deseq2", "deseq", "edger", "mock", "ttest"):
            assert options.input_filename_tags and os.path.exists(
                options.input_filename_tags)
            assert options.input_filename_design and os.path.exists(
                options.input_filename_design)

            Expression.loadTagData(options.input_filename_tags,
                                   options.input_filename_design)

            nobservations, nsamples = Expression.filterTagData(
                filter_min_counts_per_row=options.filter_min_counts_per_row,
                filter_min_counts_per_sample=(
                    options.filter_min_counts_per_sample),
                filter_percentile_rowsums=options.filter_percentile_rowsums)

            if nobservations == 0:
                E.warn("no observations - no output")
                return

            if nsamples == 0:
                E.warn("no samples remain after filtering - no output")
                return

            sample_names = R('''colnames(countsTable)''')
            E.info("%i samples to test at %i observations: %s" %
                   (nsamples, nobservations,
                    ",".join(sample_names)))

        try:
            if options.method == "deseq2":
                Expression.runDESeq2(
                    outfile=options.output_filename,
                    outfile_prefix=options.output_filename_pattern,
                    fdr=options.fdr,
                    ref_group=options.ref_group,
                    model=options.model,
                    contrasts=options.contrasts,
                    plot=options.plot
                )

            elif options.method == "deseq":
                Expression.runDESeq(
                    outfile=options.output_filename,
                    outfile_prefix=options.output_filename_pattern,
                    fdr=options.fdr,
                    dispersion_method=options.deseq_dispersion_method,
                    fit_type=options.deseq_fit_type,
                    sharing_mode=options.deseq_sharing_mode,
                    ref_group=options.ref_group,
                )

            elif options.method == "edger":
                Expression.runEdgeR(
                    outfile=options.output_filename,
                    outfile_prefix=options.output_filename_pattern,
                    fdr=options.fdr,
                    ref_group=options.ref_group,
                    dispersion=options.edger_dispersion)

            elif options.method == "mock":
                Expression.runMockAnalysis(
                    outfile=options.output_filename,
                    outfile_prefix=options.output_filename_pattern,
                    ref_group=options.ref_group,
                    pseudo_counts=options.pseudo_counts,
                )

            elif options.method == "summary":
                Expression.outputTagSummary(
                    options.input_filename_tags,
                    options.stdout,
                    options.output_filename_pattern,
                    filename_design=options.input_filename_design
                )

            elif options.method == "dump":
                assert options.input_filename_tags and os.path.exists(
                    options.input_filename_tags)
                Expression.dumpTagData(options.input_filename_tags,
                                       options.input_filename_design,
                                       outfile=options.stdout)

            elif options.method == "plottagstats":
                assert options.input_filename_tags and os.path.exists(
                    options.input_filename_tags)
                Expression.plotTagStats(
                    options.input_filename_tags,
                    options.input_filename_design,
                    outfile_prefix=options.output_filename_pattern)

            elif options.method == "plotdetagstats":
                assert options.input_filename_result and os.path.exists(
                    options.input_filename_result)
                Expression.plotDETagStats(
                    options.input_filename_result,
                    outfile_prefix=options.output_filename_pattern)

            elif options.method == "spike":
                Expression.outputSpikeIns(
                    options.input_filename_tags,
                    options.stdout,
                    options.output_filename_pattern,
                    filename_design=options.input_filename_design,
                    foldchange_max=options.spike_foldchange_max,
                    expression_max=options.spike_expression_max,
                    max_counts_per_bin=options.spike_max_counts_per_bin,
                    expression_bin_width=options.spike_expression_bin_width,
                    foldchange_bin_width=options.spike_foldchange_bin_width,
                )

            elif options.method == "ttest":
                Expression.runTTest(
                    outfile=options.output_filename,
                    outfile_prefix=options.output_filename_pattern,
                    fdr=options.fdr)

        except rpy2.rinterface.RRuntimeError:
            # keep the R workspace for post-mortem inspection, then
            # propagate the error unchanged
            if options.save_r_environment:
                E.info("saving R image to %s" % options.save_r_environment)
                R['save.image'](options.save_r_environment)
            raise
    finally:
        # remove the spooled stdin copy on every exit path (normal end,
        # early return and error)
        if fh and os.path.exists(fh.name):
            os.unlink(fh.name)

    if options.save_r_environment:
        R['save.image'](options.save_r_environment)

    E.Stop()
def main(argv=None):
    """Run a differential expression test and write results and plots.

    Parses command line options in sys.argv, unless *argv* is given.

    The experimental design is read from ``--design-tsv-file``.  For the
    methods ``sleuth`` and ``dexseq`` the expression data are read by the
    respective experiment class from a directory; for all other methods a
    counts table is read from ``--tag-tsv-file`` (``-`` for stdin),
    restricted to the samples in the design and optionally filtered.

    The results table is written to stdout, diagnostic plots are produced
    per contrast, and one ``<prefix>_<test_group>_summary.tsv`` file is
    written per entry in ``results.Summary``.

    Raises:
        ValueError: if no reference group is given for edger/sleuth, if a
            Wald test is requested (deseq2/sleuth) for a factor with more
            than two levels, or if the chosen method has no
            implementation here.
    """
    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-t", "--tag-tsv-file", dest="input_filename_tags",
                      type="string",
                      help="input file with tag counts [default=%default].")

    parser.add_option("-d", "--design-tsv-file", dest="input_filename_design",
                      type="string",
                      help="input file with experimental design "
                      "[default=%default].")

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=("ttest", "sleuth", "edger", "deseq2", "mock",
                               "dexseq"),
                      help="differential expression method to apply "
                      "[default=%default].")

    parser.add_option("--deseq2-dispersion-method",
                      dest="deseq2_dispersion_method",
                      type="choice",
                      choices=("pooled", "per-condition", "blind"),
                      help="dispersion method for deseq2 [default=%default].")

    parser.add_option("--deseq2-fit-type", dest="deseq2_fit_type",
                      type="choice",
                      choices=("parametric", "local"),
                      help="fit type for deseq2 [default=%default].")

    parser.add_option("--edger-dispersion",
                      dest="edger_dispersion", type="float",
                      help="dispersion value for edgeR if there are no "
                      "replicates [default=%default].")

    parser.add_option("-f", "--fdr", dest="fdr", type="float",
                      help="fdr to apply [default=%default].")

    # currently not implemented
    # parser.add_option("-R", "--output-R-code", dest="save_r_environment",
    #                  type="string",
    #                  help="save R environment to loc [default=%default]")

    # optparse only expands "%default"; "$default" was printed verbatim.
    parser.add_option("-r", "--reference-group", dest="ref_group",
                      type="string",
                      help="Group to use as reference to compute "
                      "fold changes against [default=%default]")

    parser.add_option("--filter-min-counts-per-row",
                      dest="filter_min_counts_per_row",
                      type="int",
                      help="remove rows with less than this "
                      "number of counts in total [default=%default].")

    parser.add_option("--filter-min-counts-per-sample",
                      dest="filter_min_counts_per_sample",
                      type="int",
                      help="remove samples with a maximum count per sample of "
                      "less than this number   [default=%default].")

    parser.add_option("--filter-percentile-rowsums",
                      dest="filter_percentile_rowsums",
                      type="int",
                      help="remove percent of rows with "
                      "lowest total counts [default=%default].")

    parser.add_option("--model",
                      dest="model",
                      type="string",
                      help=("model for GLM"))

    parser.add_option("--reduced-model",
                      dest="reduced_model",
                      type="string",
                      help=("reduced model for LRT"))

    parser.add_option("--contrast",
                      dest="contrast",
                      type="string",
                      help=("contrast for differential expression testing"))

    # The adjacent string fragments below are concatenated by Python, so
    # each fragment must carry its own separating space.
    parser.add_option("--sleuth-counts-dir",
                      dest="sleuth_counts_dir",
                      type="string",
                      help=("directory containing expression estimates "
                            "from sleuth. Sleuth expects counts "
                            "files to be called abundance.h5"))

    parser.add_option("--dexseq-counts-dir",
                      dest="dexseq_counts_dir",
                      type="string",
                      help=("directory containing counts for dexseq. DEXSeq "
                            "expects counts files to be called .txt and "
                            "to be generated by the DEXSeq_counts.py script"))

    parser.add_option("--dexseq-flattened-file",
                      dest="dexseq_flattened_file",
                      type="string",
                      help=("directory containing flat gtf for dexseq. DEXSeq "
                            "expects this to be generated by the "
                            "DEXSeq_prepare_annotations.py script"))

    parser.add_option(
        "--outfile-sleuth-count",
        dest="outfile_sleuth_count",
        type="string",
        help=("outfile for full count table generated by sleuth"))

    parser.add_option("--outfile-sleuth-tpm",
                      dest="outfile_sleuth_tpm",
                      type="string",
                      help=("outfile for full tpm table generated by sleuth"))

    parser.add_option("--use-ihw",
                      dest="use_ihw",
                      action="store_true",
                      help=("use the independent hypothesis weighting method "
                            "to obtain weighted FDR"))

    parser.add_option(
        "--sleuth-genewise",
        dest="sleuth_genewise",
        action="store_true",
        help=("run genewise, rather than transcript level testing"))

    parser.add_option("--gene-biomart",
                      dest="gene_biomart",
                      type="string",
                      help=("name of ensemble gene biomart"))

    parser.add_option("--de-test",
                      dest="DEtest",
                      type="choice",
                      choices=("wald", "lrt"),
                      help=("Differential expression test"))

    parser.add_option("--Rhistory",
                      dest="Rhistory",
                      type="string",
                      help=("Outfile for R history"))

    parser.add_option("--Rimage",
                      dest="Rimage",
                      type="string",
                      help=("Outfile for R image"))

    parser.set_defaults(input_filename_tags="-",
                        input_filename_design=None,
                        output_filename=sys.stdout,
                        method="deseq2",
                        fdr=0.1,
                        deseq2_dispersion_method="pooled",
                        deseq2_fit_type="parametric",
                        edger_dispersion=0.4,
                        ref_group=False,
                        filter_min_counts_per_row=None,
                        filter_min_counts_per_sample=None,
                        filter_percentile_rowsums=None,
                        spike_foldchange_max=4.0,
                        spike_expression_max=5.0,
                        spike_expression_bin_width=0.5,
                        spike_foldchange_bin_width=0.5,
                        spike_max_counts_per_bin=50,
                        model=None,
                        contrast=None,
                        output_filename_pattern=None,
                        sleuth_counts_dir=None,
                        dexseq_counts_dir=None,
                        dexseq_flattened_file=None,
                        outfile_sleuth_count=None,
                        outfile_sleuth_tpm=None,
                        use_ihw=False,
                        sleuth_genewise=False,
                        gene_biomart=None,
                        DEtest="wald",
                        reduced_model=None,
                        Rhistory=None,
                        Rimage=None)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    # an R wrapper that records history is only needed when the history
    # or the image is to be saved
    RH = None
    if options.Rhistory or options.Rimage:
        RH = R.R_with_History()

    outfile_prefix = options.output_filename_pattern

    # Expression.py currently expects a reference group for edgeR and
    # sleuth, regardless of which test is used.
    # The comparison must be by value: "is" tests object identity and is
    # not reliable for strings coming from the command line.
    if not options.ref_group and options.method in ("edger", "sleuth"):
        raise ValueError(
            "Must provide a reference group ('--reference-group')")

    # create Design object
    design = Expression.ExperimentalDesign(
        pd.read_csv(IOTools.openFile(options.input_filename_design, "r"),
                    sep="\t", index_col=0, comment="#"))

    contrast_levels = set(design.table[options.contrast])
    if len(contrast_levels) > 2:
        if options.method == "deseq2" or options.method == "sleuth":
            # NOTE(review): the original indentation of this else-branch
            # could not be recovered; it is paired with the Wald check
            # because the message describes LRT behaviour - confirm.
            if options.DEtest == "wald":
                raise ValueError(
                    "Factor must have exactly two levels for Wald Test. "
                    "If you have more than two levels in your factor, "
                    "consider LRT")
            else:
                E.info(
                    "There are more than 2 levels for the contrast "
                    "specified (%s:%s). The log2fold changes in the "
                    "results table and MA plots will be for the first two "
                    "levels in the contrast. The p-value will be the "
                    "p-value for the overall significance of the contrast. "
                    "Hence, some genes will have a signficant p-value but "
                    "0-fold change between the first two levels" %
                    (options.contrast, contrast_levels))

    # Sleuth reads in data itself so we don't need to create a counts object
    if options.method == "sleuth":
        assert options.sleuth_counts_dir, (
            "need to specify the location of the abundance.h5 counts files "
            " (--sleuth-counts-dir)")

        # validate design against counts and model
        design.validate(model=options.model)

        experiment = Expression.DEExperiment_Sleuth()
        results = experiment.run(design,
                                 base_dir=options.sleuth_counts_dir,
                                 model=options.model,
                                 contrast=options.contrast,
                                 outfile_prefix=outfile_prefix,
                                 counts=options.outfile_sleuth_count,
                                 tpm=options.outfile_sleuth_tpm,
                                 fdr=options.fdr,
                                 genewise=options.sleuth_genewise,
                                 gene_biomart=options.gene_biomart,
                                 DE_test=options.DEtest,
                                 ref_group=options.ref_group,
                                 reduced_model=options.reduced_model)

    # DEXSeq reads in data itself
    elif options.method == "dexseq":
        assert options.dexseq_counts_dir, (
            "need to specify the location of the .txt counts files")

        # the Design object created above is reused; it has not been
        # modified since it was read

        # validate design against counts and model
        # design.validate(model=options.model)

        experiment = Expression.DEExperiment_DEXSeq()
        results = experiment.run(design,
                                 base_dir=options.dexseq_counts_dir,
                                 model=options.model,
                                 contrast=options.contrast,
                                 ref_group=options.ref_group,
                                 outfile_prefix=outfile_prefix,
                                 flattenedfile=options.dexseq_flattened_file,
                                 fdr=options.fdr)

    else:
        # create Counts object
        if options.input_filename_tags == "-":
            counts = Counts.Counts(
                pd.io.parsers.read_csv(sys.stdin, sep="\t",
                                       index_col=0, comment="#"))
        else:
            counts = Counts.Counts(
                pd.io.parsers.read_csv(
                    IOTools.openFile(options.input_filename_tags, "r"),
                    sep="\t", index_col=0, comment="#"))

        # validate design against counts and model
        design.validate(counts, options.model)

        # restrict counts to samples in design table
        counts.restrict(design)

        # remove sample with low counts
        if options.filter_min_counts_per_sample:
            counts.removeSamples(
                min_counts_per_sample=options.filter_min_counts_per_sample)

        # remove observations with low counts
        if options.filter_min_counts_per_row:
            counts.removeObservationsFreq(
                min_counts_per_row=options.filter_min_counts_per_row)

        # remove bottom percentile of observations
        if options.filter_percentile_rowsums:
            counts.removeObservationsPerc(
                percentile_rowsums=options.filter_percentile_rowsums)

        # check samples are the same in counts and design following counts
        # filtering and, if not, restrict design table and re-validate
        design.revalidate(counts, options.model)

        # set up experiment and run tests
        if options.method == "ttest":
            experiment = Expression.DEExperiment_TTest()
            results = experiment.run(counts, design)

        elif options.method == "edger":
            experiment = Expression.DEExperiment_edgeR()
            results = experiment.run(counts,
                                     design,
                                     model=options.model,
                                     contrast=options.contrast,
                                     outfile_prefix=outfile_prefix,
                                     ref_group=options.ref_group,
                                     fdr=options.fdr,
                                     dispersion=options.edger_dispersion)

        elif options.method == "deseq2":
            experiment = Expression.DEExperiment_DESeq2()
            results = experiment.run(counts,
                                     design,
                                     model=options.model,
                                     contrast=options.contrast,
                                     outfile_prefix=outfile_prefix,
                                     fdr=options.fdr,
                                     fit_type=options.deseq2_fit_type,
                                     ref_group=options.ref_group,
                                     DEtest=options.DEtest,
                                     R=RH)

        else:
            # e.g. "mock": accepted by the parser but without a branch
            # here; fail explicitly instead of with a NameError below
            raise ValueError(
                "method '%s' is not implemented" % options.method)

    results.getResults(fdr=options.fdr)

    if options.use_ihw:
        results.calculateIHW(alpha=options.fdr)

    for contrast in set(results.table['contrast']):
        results.plotVolcano(contrast, outfile_prefix=outfile_prefix, R=RH)
        results.plotMA(contrast, outfile_prefix=outfile_prefix, R=RH)
        results.plotPvalueHist(contrast, outfile_prefix=outfile_prefix, R=RH)
        results.plotPvalueQQ(contrast, outfile_prefix=outfile_prefix, R=RH)

    results.table.to_csv(sys.stdout, sep="\t", na_rep="NA", index=False)

    results.summariseDEResults()

    # write out summary tables for each comparison/contrast
    for test_group in list(results.Summary.keys()):
        summary_filename = "_".join(
            [outfile_prefix, test_group, "summary.tsv"])
        # "with" closes the file even if formatting the table fails
        with IOTools.openFile(summary_filename, "w") as outf:
            outf.write("category\tcounts\n%s\n"
                       % results.Summary[test_group].asTable())

    if options.Rhistory:
        RH.saveHistory(options.Rhistory)
    if options.Rimage:
        RH.saveImage(options.Rimage)

    E.Stop()
def main(argv=None):
    """Run a differential expression analysis or report on tag counts.

    Parses command line options in sys.argv, unless *argv* is given.

    The action is chosen with ``--method`` and dispatched to the matching
    function in ``Expression``.  When ``--filename-tags`` is ``-`` (the
    default) the counts are read from ``options.stdin`` and spooled to a
    temporary file, which is removed again before the function exits.

    For deseq, edger and mock the tag data are loaded and filtered first;
    if no observations or no samples remain after filtering, a warning is
    issued and the function returns without output.

    If R raises an ``RRuntimeError`` and ``--save-R`` was given, the R
    workspace is saved to that file before the error is re-raised.

    NOTE(review): "cuffdiff" is an accepted ``--method`` choice but has no
    branch below, so selecting it does nothing - confirm this is intended.
    """
    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-t", "--filename-tags", dest="input_filename_tags",
                      type="string",
                      help="input file with tag counts [default=%default].")

    parser.add_option("-d", "--filename-design",
                      dest="input_filename_design", type="string",
                      help="input file with experimental design [default=%default].")

    parser.add_option("-o", "--outfile", dest="output_filename",
                      type="string",
                      help="output filename [default=%default].")

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=("deseq", "edger", "cuffdiff", "mock",
                               "summary", "dump", "spike"),
                      help="differential expression method to apply [default=%default].")

    parser.add_option("--deseq-dispersion-method",
                      dest="deseq_dispersion_method", type="choice",
                      choices=("pooled", "per-condition", "blind"),
                      help="dispersion method for deseq [default=%default].")

    parser.add_option("--deseq-fit-type", dest="deseq_fit_type",
                      type="choice",
                      choices=("parametric", "local"),
                      help="fit type for deseq [default=%default].")

    parser.add_option("--deseq-sharing-mode", dest="deseq_sharing_mode",
                      type="choice",
                      choices=("maximum", "fit-only", "gene-est-only"),
                      help="deseq sharing mode [default=%default].")

    parser.add_option("-f", "--fdr", dest="fdr", type="float",
                      help="fdr to apply [default=%default].")

    parser.add_option("-p", "--pseudo-counts", dest="pseudo_counts",
                      type="float",
                      help="pseudocounts to add for mock analyis [default=%default].")

    parser.add_option("-R", "--save-R", dest="save_r_environment",
                      type="string",
                      help="save R environment [default=%default].")

    # optparse only expands "%default"; "$default" was printed verbatim.
    parser.add_option("-r", "--reference-group", dest="ref_group",
                      type="string",
                      help="Group to use as reference to compute fold changes against [default=%default]")

    parser.add_option("--filter-min-counts-per-row",
                      dest="filter_min_counts_per_row", type="int",
                      help="remove rows with less than this number of counts in total [default=%default].")

    parser.add_option("--filter-min-counts-per-sample",
                      dest="filter_min_counts_per_sample", type="int",
                      help="remove samples with a maximum count per sample of "
                      "less than this numer   [default=%default].")

    parser.add_option("--filter-percentile-rowsums",
                      dest="filter_percentile_rowsums", type="int",
                      help="remove percent of rows with lowest total counts [default=%default].")

    parser.set_defaults(
        input_filename_tags="-",
        input_filename_design=None,
        output_filename=sys.stdout,
        method="deseq",
        fdr=0.1,
        deseq_dispersion_method="pooled",
        deseq_fit_type="parametric",
        deseq_sharing_mode="maximum",
        ref_group=None,
        save_r_environment=None,
        filter_min_counts_per_row=1,
        filter_min_counts_per_sample=10,
        filter_percentile_rowsums=0,
        pseudo_counts=0,
        spike_foldchange_max=4.0,
        spike_expression_max=5.0,
        spike_expression_bin_width=0.5,
        spike_foldchange_bin_width=0.5,
        spike_max_counts_per_bin=50,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    # Counts arriving on stdin are spooled to a named file because the
    # downstream functions take a filename.
    fh = None
    if options.input_filename_tags == "-":
        fh = P.getTempFile()
        fh.write("".join([x for x in options.stdin]))
        fh.close()
        options.input_filename_tags = fh.name

    try:
        # load tag data and filter
        if options.method in ("deseq", "edger", "mock"):
            assert options.input_filename_tags and os.path.exists(
                options.input_filename_tags)
            assert options.input_filename_design and os.path.exists(
                options.input_filename_design)

            Expression.loadTagData(options.input_filename_tags,
                                   options.input_filename_design)

            nobservations, nsamples = Expression.filterTagData(
                filter_min_counts_per_row=options.filter_min_counts_per_row,
                filter_min_counts_per_sample=(
                    options.filter_min_counts_per_sample),
                filter_percentile_rowsums=options.filter_percentile_rowsums)

            if nobservations == 0:
                E.warn("no observations - no output")
                return

            if nsamples == 0:
                E.warn("no samples remain after filtering - no output")
                return

            sample_names = R('''colnames(countsTable)''')
            E.info("%i samples to test at %i observations: %s" %
                   (nsamples, nobservations, ",".join(sample_names)))

        try:
            if options.method == "deseq":
                Expression.runDESeq(
                    outfile=options.output_filename,
                    outfile_prefix=options.output_filename_pattern,
                    fdr=options.fdr,
                    dispersion_method=options.deseq_dispersion_method,
                    fit_type=options.deseq_fit_type,
                    sharing_mode=options.deseq_sharing_mode,
                    ref_group=options.ref_group,
                )

            elif options.method == "edger":
                Expression.runEdgeR(
                    outfile=options.output_filename,
                    outfile_prefix=options.output_filename_pattern,
                    fdr=options.fdr,
                    ref_group=options.ref_group)

            elif options.method == "mock":
                Expression.runMockAnalysis(
                    outfile=options.output_filename,
                    outfile_prefix=options.output_filename_pattern,
                    ref_group=options.ref_group,
                    pseudo_counts=options.pseudo_counts,
                )

            elif options.method == "summary":
                Expression.outputTagSummary(
                    options.input_filename_tags,
                    options.stdout,
                    options.output_filename_pattern,
                    filename_design=options.input_filename_design)

            elif options.method == "dump":
                assert options.input_filename_tags and os.path.exists(
                    options.input_filename_tags)
                Expression.dumpTagData(options.input_filename_tags,
                                       options.input_filename_design,
                                       outfile=options.stdout)

            elif options.method == "spike":
                Expression.outputSpikeIns(
                    options.input_filename_tags,
                    options.stdout,
                    options.output_filename_pattern,
                    filename_design=options.input_filename_design,
                    foldchange_max=options.spike_foldchange_max,
                    expression_max=options.spike_expression_max,
                    max_counts_per_bin=options.spike_max_counts_per_bin,
                    expression_bin_width=options.spike_expression_bin_width,
                    foldchange_bin_width=options.spike_foldchange_bin_width,
                )

        # "except X, name" is a syntax error on Python 3; the bound name
        # was unused, so the clause is written without it.
        except rpy2.rinterface.RRuntimeError:
            # keep the R workspace for post-mortem inspection, then
            # propagate the error unchanged
            if options.save_r_environment:
                E.info("saving R image to %s" % options.save_r_environment)
                R['save.image'](options.save_r_environment)
            raise
    finally:
        # remove the spooled stdin copy on every exit path
        if fh and os.path.exists(fh.name):
            os.unlink(fh.name)
def main(argv=None):
    """Run a differential expression test and write results and plots.

    Parses command line options in sys.argv, unless *argv* is given.

    For ``sleuth`` the expression data are read by the experiment class
    from ``--sleuth-counts-dir``; for all other methods a counts table is
    read from ``--tags-tsv-file`` (``-`` for stdin), restricted to the
    samples in the design and optionally filtered.

    The results table is written to stdout, volcano and MA plots are
    produced per contrast, and one
    ``<pattern>_<method>_<test_group>_summary.tsv`` file is written per
    entry in ``results.Summary``.

    Raises:
        ValueError: if the chosen method has no implementation here.
    """
    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-t", "--tags-tsv-file", dest="input_filename_tags",
                      type="string",
                      help="input file with tag counts [default=%default].")

    parser.add_option(
        "--result-tsv-file", dest="input_filename_result",
        type="string",
        help="input file with results (for plotdetagstats) "
        "[default=%default].")

    parser.add_option("-d", "--design-tsv-file", dest="input_filename_design",
                      type="string",
                      help="input file with experimental design "
                      "[default=%default].")

    # "ttest" is listed because a branch for it exists below; without the
    # choice that branch could never be reached.
    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=("ttest", "sleuth", "edger", "deseq2", "mock"),
                      help="differential expression method to apply "
                      "[default=%default].")

    parser.add_option("--deseq-dispersion-method",
                      dest="deseq_dispersion_method",
                      type="choice",
                      choices=("pooled", "per-condition", "blind"),
                      help="dispersion method for deseq [default=%default].")

    parser.add_option("--deseq-fit-type", dest="deseq_fit_type",
                      type="choice",
                      choices=("parametric", "local"),
                      help="fit type for deseq [default=%default].")

    parser.add_option("--deseq-sharing-mode",
                      dest="deseq_sharing_mode",
                      type="choice",
                      choices=("maximum", "fit-only", "gene-est-only"),
                      help="deseq sharing mode [default=%default].")

    parser.add_option("--edger-dispersion",
                      dest="edger_dispersion", type="float",
                      help="dispersion value for edgeR if there are no "
                      "replicates [default=%default].")

    parser.add_option("-f", "--fdr", dest="fdr", type="float",
                      help="fdr to apply [default=%default].")

    parser.add_option("-R", "--output-R-code", dest="save_r_environment",
                      type="string",
                      help="save R environment [default=%default].")

    # optparse only expands "%default"; "$default" was printed verbatim.
    parser.add_option("-r", "--reference-group", dest="ref_group",
                      type="string",
                      help="Group to use as reference to compute "
                      "fold changes against [default=%default]")

    parser.add_option("--filter-min-counts-per-row",
                      dest="filter_min_counts_per_row",
                      type="int",
                      help="remove rows with less than this "
                      "number of counts in total [default=%default].")

    parser.add_option("--filter-min-counts-per-sample",
                      dest="filter_min_counts_per_sample",
                      type="int",
                      help="remove samples with a maximum count per sample of "
                      "less than this number   [default=%default].")

    parser.add_option("--filter-percentile-rowsums",
                      dest="filter_percentile_rowsums",
                      type="int",
                      help="remove percent of rows with "
                      "lowest total counts [default=%default].")

    parser.add_option("--model",
                      dest="model",
                      type="string",
                      help=("model for GLM"))

    parser.add_option("--contrasts",
                      dest="contrasts",
                      action="append",
                      help=("contrasts for post-hoc testing writen as comma "
                            "seperated list `condition,replicate` etc"))

    parser.add_option("--deseq2-fit-type",
                      dest="deseq2_fit_type",
                      type="string",
                      help=("fit type used for observed dispersion mean "
                            "relationship in deseq2"))

    parser.add_option("--sleuth-counts-dir",
                      dest="sleuth_counts_dir",
                      type="string",
                      help=("directory containing counts for sleuth. Sleuth "
                            "expects counts files to be called abundance.h5"))

    parser.add_option("--outfile-sleuth-count",
                      dest="outfile_sleuth_count",
                      type="string",
                      help=("outfile for full count table generated by sleuth"))

    parser.add_option("--outfile-sleuth-tpm",
                      dest="outfile_sleuth_tpm",
                      type="string",
                      help=("outfile for full tpm table generated by sleuth"))

    parser.add_option("--use-ihw",
                      dest="use_ihw",
                      action="store_true",
                      help=("use the independent hypothesis weighting method "
                            "to obtain weighted FDR"))

    parser.add_option("--sleuth-genewise",
                      dest="sleuth_genewise",
                      action="store_true",
                      help=("run genewise, rather than transcript level testing"))

    parser.add_option("--gene-biomart",
                      dest="gene_biomart",
                      type="string",
                      help=("name of ensemble gene biomart"))

    parser.set_defaults(
        input_filename_tags="-",
        input_filename_result=None,
        input_filename_design=None,
        output_filename=sys.stdout,
        method="deseq2",
        fdr=0.1,
        deseq_dispersion_method="pooled",
        deseq_fit_type="parametric",
        deseq_sharing_mode="maximum",
        edger_dispersion=0.4,
        ref_group=False,
        save_r_environment=None,
        filter_min_counts_per_row=None,
        filter_min_counts_per_sample=None,
        filter_percentile_rowsums=None,
        spike_foldchange_max=4.0,
        spike_expression_max=5.0,
        spike_expression_bin_width=0.5,
        spike_foldchange_bin_width=0.5,
        spike_max_counts_per_bin=50,
        model=None,
        contrasts=None,
        output_filename_pattern=None,
        deseq2_fit_type="parametric",
        sleuth_counts_dir=None,
        outfile_sleuth_count=None,
        outfile_sleuth_tpm=None,
        use_ihw=False,
        sleuth_genewise=False,
        gene_biomart=None
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    outfile_prefix = options.output_filename_pattern + "_" + options.method

    # Sleuth reads in data itself so we don't need to create a counts object
    if options.method == "sleuth":
        assert options.sleuth_counts_dir, (
            "need to specify the location of the abundance.h5 counts files")

        # create Design object
        design = Expression.ExperimentalDesign(
            pd.read_csv(IOTools.openFile(options.input_filename_design, "r"),
                        sep="\t", index_col=0, comment="#"))

        # validate design against counts and model
        design.validate(model=options.model)

        experiment = Expression.DEExperiment_Sleuth()
        results = experiment.run(design,
                                 base_dir=options.sleuth_counts_dir,
                                 model=options.model,
                                 contrasts=options.contrasts,
                                 outfile_prefix=outfile_prefix,
                                 counts=options.outfile_sleuth_count,
                                 tpm=options.outfile_sleuth_tpm,
                                 fdr=options.fdr,
                                 genewise=options.sleuth_genewise,
                                 gene_biomart=options.gene_biomart)

    else:
        # create Counts object
        if options.input_filename_tags == "-":
            counts = Counts.Counts(pd.io.parsers.read_csv(
                sys.stdin, sep="\t", index_col=0, comment="#"))
        else:
            counts = Counts.Counts(pd.io.parsers.read_csv(
                IOTools.openFile(options.input_filename_tags, "r"),
                sep="\t", index_col=0, comment="#"))

        # create Design object
        design = Expression.ExperimentalDesign(
            pd.read_csv(IOTools.openFile(options.input_filename_design, "r"),
                        sep="\t", index_col=0, comment="#"))

        # validate design against counts and model
        design.validate(counts, options.model)

        # restrict counts to samples in design table
        counts.restrict(design)

        # remove sample with low counts
        if options.filter_min_counts_per_sample:
            counts.removeSamples(
                min_counts_per_sample=options.filter_min_counts_per_sample)

        # remove observations with low counts
        if options.filter_min_counts_per_row:
            counts.removeObservationsFreq(
                min_counts_per_row=options.filter_min_counts_per_row)

        # remove bottom percentile of observations
        if options.filter_percentile_rowsums:
            counts.removeObservationsPerc(
                percentile_rowsums=options.filter_percentile_rowsums)

        # check samples are the same in counts and design following counts
        # filtering and, if not, restrict design table and re-validate
        design.revalidate(counts, options.model)

        # set up experiment and run tests
        if options.method == "ttest":
            experiment = Expression.DEExperiment_TTest()
            results = experiment.run(counts, design)

        elif options.method == "edger":
            experiment = Expression.DEExperiment_edgeR()
            # keyword was previously misspelt "disperion", so the
            # user-supplied dispersion never reached edgeR as intended
            results = experiment.run(counts,
                                     design,
                                     model=options.model,
                                     dispersion=options.edger_dispersion,
                                     ref_group=options.ref_group,
                                     contrasts=options.contrasts,
                                     outfile_prefix=outfile_prefix)

        elif options.method == "deseq2":
            experiment = Expression.DEExperiment_DESeq2()
            results = experiment.run(counts,
                                     design,
                                     model=options.model,
                                     contrasts=options.contrasts,
                                     outfile_prefix=outfile_prefix,
                                     fdr=options.fdr,
                                     fit_type=options.deseq2_fit_type,
                                     ref_group=options.ref_group)

        else:
            # e.g. "mock": accepted by the parser but without a branch
            # here; fail explicitly instead of with a NameError below
            raise ValueError(
                "method '%s' is not implemented" % options.method)

    results.getResults(fdr=options.fdr)

    if options.use_ihw:
        results.calculateIHW(alpha=options.fdr)

    for contrast in set(results.table['contrast']):
        results.plotVolcano(contrast, outfile_prefix=outfile_prefix)
        results.plotMA(contrast, outfile_prefix=outfile_prefix)

    results.table.to_csv(sys.stdout, sep="\t", na_rep="NA", index=False)

    results.summariseDEResults()

    # write out summary tables for each comparison/contrast
    for test_group in list(results.Summary.keys()):
        summary_filename = "_".join(
            [outfile_prefix, test_group, "summary.tsv"])
        # "with" closes the file even if formatting the table fails
        with IOTools.openFile(summary_filename, "w") as outf:
            outf.write("category\tcounts\n%s\n"
                       % results.Summary[test_group].asTable())

    E.Stop()
def loadCuffdiff(infile, outfile, min_fpkm=1.0):
    '''load results from a cuffdiff run into the database.

    The cuffdiff output is expected in the directory ``infile + ".dir"``.
    If that directory does not exist, *outfile* is touched and nothing
    is loaded.

    Three groups of tables are loaded through ``csv2db.py`` (where
    ``<prefix>`` is ``P.toTable(outfile)``):

    ``<prefix>_<level>_diff``
        differential expression results parsed by :func:`parseCuffdiff`
        from the ``*_exp.diff.gz`` files.
    ``<prefix>_<level>_levels``
        the ``*.fpkm_tracking.gz`` files, loaded as they are.
    ``<prefix>_<level>sample_fpkms``
        one row per gene and one column per sample, built from the
        ``*.read_group_tracking.gz`` files.

    Finally a table listing all tracks (derived from the ``*_FPKM``
    columns of ``<prefix>_isoform_levels``) is loaded into *outfile*.

    Notes carried over from the original documentation: fold changes are
    converted from ln(fold change) to log2 fold change, and pairwise
    comparisons in which one gene is not expressed (fpkm < fpkm_silent)
    are set to status 'NOCALL'. These transcripts might nevertheless be
    significant.

    Arguments
    ---------
    infile : string
        Base name of the cuffdiff results; ``.dir`` is appended.
    outfile : string
        Output filename; also determines table prefix and log file name.
    min_fpkm : float
        Passed on to :func:`parseCuffdiff`.

    Raises
    ------
    ValueError
        If a sample appears twice for the same gene in a
        read-group tracking file.
    '''

    prefix = P.toTable(outfile)
    indir = infile + ".dir"

    if not os.path.exists(indir):
        P.touch(outfile)
        return

    # E.info( "building cummeRbund database" )
    # R('''library(cummeRbund)''')
    # cuff = R('''readCufflinks(dir = %(indir)s, dbfile=%(indir)s/csvdb)''' )
    # to be continued

    dbhandle = sqlite3.connect(PARAMS["database"])

    # NOTE: P.run() interpolates the statement with the local variables
    # of this function - do not rename tmpname, tmpf, tablename, indir,
    # fn, outfile or statement.
    tmpname = P.getTempFilename(".")

    try:
        # ignore promoters and splicing - no fold change column, but
        # sqrt(JS)
        for fn, level in (("cds_exp.diff.gz", "cds"),
                          ("gene_exp.diff.gz", "gene"),
                          ("isoform_exp.diff.gz", "isoform"),
                          # ("promoters.diff.gz", "promotor"),
                          # ("splicing.diff.gz", "splice"),
                          ("tss_group_exp.diff.gz", "tss")):

            tablename = prefix + "_" + level + "_diff"

            infile = os.path.join(indir, fn)
            results = parseCuffdiff(infile, min_fpkm=min_fpkm)
            Expression.writeExpressionResults(tmpname, results)

            statement = ("cat %(tmpname)s"
                         " | python %(scriptsdir)s/csv2db.py"
                         " %(csv2db_options)s"
                         " --allow-empty-file"
                         " --add-index=treatment_name"
                         " --add-index=control_name"
                         " --add-index=test_id"
                         " --table=%(tablename)s"
                         " >> %(outfile)s.log")
            P.run()
    finally:
        # the temporary file is re-used for each level; remove it once
        # all levels have been loaded (or loading failed).
        if os.path.exists(tmpname):
            os.unlink(tmpname)

    for fn, level in (("cds.fpkm_tracking.gz", "cds"),
                      ("genes.fpkm_tracking.gz", "gene"),
                      ("isoforms.fpkm_tracking.gz", "isoform"),
                      ("tss_groups.fpkm_tracking.gz", "tss")):

        tablename = prefix + "_" + level + "_levels"

        statement = ("zcat %(indir)s/%(fn)s"
                     " | python %(scriptsdir)s/csv2db.py"
                     " %(csv2db_options)s"
                     " --allow-empty-file"
                     " --add-index=tracking_id"
                     " --table=%(tablename)s"
                     " >> %(outfile)s.log")
        P.run()

    # Jethro - load tables of sample specific cuffdiff fpkm values into csvdb
    # IMS: First read in lookup table for CuffDiff/Pipeline sample name
    # conversion
    inf = IOTools.openFile(os.path.join(indir, "read_groups.info.gz"))
    try:
        # skip header
        inf.readline()
        sample_lookup = {}
        for line in inf:
            line = line.split("\t")
            our_sample_name = IOTools.snip(line[0])
            our_sample_name = re.sub("-", "_", our_sample_name)
            cuffdiff_sample_name = "%s_%s" % (line[1], line[2])
            sample_lookup[cuffdiff_sample_name] = our_sample_name
    finally:
        inf.close()

    for fn, level in (("cds.read_group_tracking.gz", "cds"),
                      ("genes.read_group_tracking.gz", "gene"),
                      ("isoforms.read_group_tracking.gz", "isoform"),
                      ("tss_groups.read_group_tracking.gz", "tss")):

        # NOTE(review): there is no "_" between level and "sample_fpkms";
        # kept as is because existing tables/queries may rely on the name.
        tablename = prefix + "_" + level + "sample_fpkms"

        inf = IOTools.openFile(os.path.join(indir, fn))
        try:
            lines = inf.readlines()
        finally:
            inf.close()

        samples = set()
        genes = {}

        # the first line is the header
        for line in lines[1:]:
            line = line.split()
            gene_id = line[0]
            condition = line[1]
            replicate = line[2]
            fpkm = line[6]
            status = line[8]

            sample_id = condition + "_" + replicate

            samples.add(sample_id)

            # IMS: The following block is not part of the sample
            # bookkeeping above - it is executed for every line.
            # NOTE(review): the first record seen for a gene stores the
            # fpkm value regardless of status, later records store the
            # status if it is not "OK". Behaviour kept unchanged.
            if gene_id not in genes:
                genes[gene_id] = {}
                genes[gene_id][sample_id] = fpkm
            else:
                if sample_id in genes[gene_id]:
                    raise ValueError(
                        'sample_id %s appears twice in file for gene_id %s'
                        % (sample_id, gene_id))
                else:
                    if status != "OK":
                        genes[gene_id][sample_id] = status
                    else:
                        genes[gene_id][sample_id] = fpkm

        samples = sorted(samples)

        # IMS - CDS files might be empty if not cds has been
        # calculated for the genes in the long term need to add CDS
        # annotation to denovo predicted genesets in meantime just
        # skip if cds tracking file is empty. The check is done before
        # any temporary file is created so that nothing is left behind.
        if len(samples) == 0:
            continue

        headers = "gene_id\t" + "\t".join(
            [sample_lookup[sample] for sample in samples])

        tmpf = P.getTempFilename(".")
        try:
            outf = IOTools.openFile(tmpf, "w")
            try:
                outf.write(headers + "\n")
                for gene in genes:
                    # one tab-separated row per gene, values in the
                    # order of the sorted samples
                    values = [genes[gene][sample] for sample in samples]
                    outf.write("\t".join([gene] + values) + "\n")
            finally:
                outf.close()

            statement = ("cat %(tmpf)s |"
                         " python %(scriptsdir)s/csv2db.py "
                         " %(csv2db_options)s"
                         " --allow-empty-file"
                         " --add-index=gene_id"
                         " --table=%(tablename)s"
                         " >> %(outfile)s.log")
            P.run()
        finally:
            os.unlink(tmpf)

    # build convenience table with tracks
    tablename = prefix + "_isoform_levels"
    try:
        tracks = Database.getColumnNames(dbhandle, tablename)
    finally:
        dbhandle.close()
    tracks = [x[:-len("_FPKM")] for x in tracks if x.endswith("_FPKM")]

    tmpfile = P.getTempFile(dir=".")
    tmpfile.write("track\n")
    tmpfile.write("\n".join(tracks) + "\n")
    tmpfile.close()

    P.load(tmpfile.name, outfile)
    os.unlink(tmpfile.name)
def buildProbeset2Gene(infile, outfile):
    '''build a map that relates each probeset to an ENSEMBL gene_id.

    This is a thin wrapper: all work is delegated to
    :func:`Expression.buildProbeset2Gene`, which receives *infile* and
    *outfile* unchanged.
    '''
    builder = Expression.buildProbeset2Gene
    builder(infile, outfile)
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Depending on the toolsets selected with ``--toolset`` the script

    * ``convert``: converts a MEDIPS result table read from stdin into
      the expression results format and returns,
    * ``saturation``, ``coverage``, ``enrichment``: run the respective
      MEDIPS quality control analyses on all treatment and control files,
    * ``correlation``, ``dmr``: build MEDIPS sets and compute the
      correlation matrix and/or the differential methylation table,
    * ``dmr``: additionally selects significant windows and writes
      merged gain/loss windows.

    If no toolset is given, ``all`` is assumed.

    Raises
    ------
    ValueError
        If no treatment file is given, if bigwig output is requested
        without a genome file or if a line can not be parsed in
        ``convert`` mode.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id",
        usage=globals()["__doc__"])

    parser.add_option("-u", "--ucsc-genome", dest="ucsc_genome",
                      type="string",
                      help="UCSC genome identifier [default=%default].")

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genome [default=%default].")

    # dest must be "extend": this is the name used in set_defaults()
    # and read below. With the previous dest="extension" the option
    # was accepted but never had any effect.
    parser.add_option("--extend", dest="extend", type="int",
                      help="extend tags by this number of bases "
                      "[default=%default].")

    parser.add_option("--shift-size", dest="shift", type="int",
                      help="shift tags by this number of bases "
                      "[default=%default].")

    parser.add_option("--window-size", dest="window_size", type="int",
                      help="window size to be used in the analysis"
                      "[default=%default].")

    parser.add_option("--saturation-iterations",
                      dest="saturation_iterations", type="int",
                      help="iterations for saturation analysis "
                      "[default=%default].")

    # "correlation" is tested for below and thus needs to be a
    # permitted choice.
    parser.add_option("-t", "--toolset", dest="toolset", type="choice",
                      action="append",
                      choices=("saturation", "coverage", "enrichment",
                               "dmr", "rms", "rpm", "all", "convert",
                               "correlation"),
                      help="actions to perform [default=%default].")

    parser.add_option("-w", "--bigwig-file", dest="bigwig",
                      action="store_true",
                      help="store wig files as bigwig files - requires a "
                      "genome file [default=%default]")

    parser.add_option("--treatment", dest="treatment_files", type="string",
                      action="append",
                      help="BAM files for treatment. At least one is required "
                      "[%default]")

    parser.add_option("--control", dest="control_files", type="string",
                      action="append",
                      help="BAM files for control for differential "
                      "methylation analysis. Optional [%default].")

    parser.add_option("--input", dest="input_files", type="string",
                      action="append",
                      help="BAM files for input correction. "
                      "Optional [%default].")

    parser.add_option("--is-not-medip",
                      dest="is_medip", action="store_false",
                      help="data is not MeDIP data and is not expected "
                      "to fit the calibration model. No CpG "
                      "density normalized rms data is computed"
                      "[default=%default].")

    parser.add_option("--output-rdata", dest="output_rdata",
                      action="store_true",
                      help="in dmr analysis, write R session to file. "
                      "The file name "
                      "is given by --ouptut-filename-pattern [%default].")

    parser.add_option("--rdata-file", dest="input_rdata",
                      type="string",
                      help="in dmr analysis, read saved R session from "
                      "file. This can be used to apply different "
                      "filters [%default]")

    parser.add_option("--fdr-threshold", dest="fdr_threshold", type="float",
                      help="FDR threshold to apply for selecting DMR "
                      "[default=%default].")

    parser.add_option("--fdr-method", dest="fdr_method", type="choice",
                      choices=("bonferroni", "BH", "holm", "hochberg",
                               "hommel", "BY", "fdr", "none"),
                      help="FDR method to apply for selecting DMR "
                      "[default=%default].")

    parser.add_option("--bwa", dest="bwa", action="store_true",
                      help="alignment generated with bwa"
                      "[default=%default].")

    parser.add_option("--unique", dest="unique", type="float",
                      help="Threshold p-value to determine which read "
                      "pileups are the result of PCR overamplification"
                      "[default=%default].")

    parser.add_option("--chroms", dest="chroms", type="str",
                      help="Comma delimited list of chromosomes to include"
                      "[default=%default].")

    parser.set_defaults(
        input_format="bam",
        ucsc_genome="Hsapiens.UCSC.hg19",
        genome_file=None,
        extend=0,
        shift=0,
        window_size=300,
        saturation_iterations=10,
        toolset=[],
        bigwig=False,
        treatment_files=[],
        control_files=[],
        input_files=[],
        output_rdata=False,
        input_rdata=None,
        is_medip=True,
        fdr_threshold=0.1,
        fdr_method="BH",
        bwa=False,
        unique=0.001,
        chroms=None
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    if "convert" in options.toolset:

        results = []
        for line in CSV.DictReader(options.stdin,
                                   dialect="excel-tab"):
            if line['edgeR.p.value'] == "NA":
                continue

            # assumes only a single treatment/control
            treatment_name = options.treatment_files[0]
            control_name = options.control_files[0]
            status = "OK"
            try:
                adjusted_pvalue = float(line['edgeR.adj.p.value'])
                if adjusted_pvalue < options.fdr_threshold:
                    significant = "1"
                else:
                    significant = "0"

                results.append(
                    Expression.GeneExpressionResult._make((
                        "%s:%i-%i" % (line['chr'],
                                      int(line['start']),
                                      int(line['stop'])),
                        treatment_name,
                        float(line['MSets1.rpkm.mean']),
                        0,
                        control_name,
                        float(line['MSets2.rpkm.mean']),
                        0,
                        float(line['edgeR.p.value']),
                        adjusted_pvalue,
                        float(line['edgeR.logFC']),
                        math.pow(2.0, float(line['edgeR.logFC'])),
                        float(line['edgeR.logFC']),  # no transform
                        significant,
                        status)))
            except ValueError as msg:
                raise ValueError("parsing error %s in line: %s" % (msg, line))

        Expression.writeExpressionResults(options.stdout, results)
        return

    if len(options.treatment_files) < 1:
        raise ValueError("please specify a filename with sample data")

    if options.bigwig and not options.genome_file:
        raise ValueError("please provide a genome file when outputting bigwig")

    if options.genome_file:
        # only required by the (currently disabled) wig export below
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contig_sizes = fasta.getContigSizes()

    if len(options.toolset) == 0:
        options.toolset = ["all"]

    do_all = "all" in options.toolset

    if options.chroms is None:
        chrstring = ""
    else:
        chroms = options.chroms.split(",")
        chrstring = ' chr.select=c(\"%s\"), ' % '\",\"'.join(chroms)

    # load MEDIPS
    R.library('MEDIPS')
    genome_file = 'BSgenome.%s' % options.ucsc_genome
    R.library(genome_file)

    # NOTE: the variables below are interpolated into the R statements
    # via locals() - do not rename them.
    window_size = options.window_size
    extend = options.extend
    shift = options.shift
    saturation_iterations = options.saturation_iterations

    uniq = float(options.unique)

    if options.bwa is True:
        BWA = "TRUE"
    else:
        BWA = "FALSE"

    if "saturation" in options.toolset or do_all:
        E.info("saturation analysis")
        for fn in options.treatment_files + options.control_files:
            paired = isPaired(fn)
            R('''sr = MEDIPS.saturation(
            file='%(fn)s',
            BSgenome='%(genome_file)s',
            shift=%(shift)i,
            extend=%(extend)i,
            window_size=%(window_size)i,
            uniq=%(uniq)s,
            nit = %(saturation_iterations)i,
            paired = %(paired)s,
            bwa = %(BWA)s,
            %(chrstring)s
            nrit = 1)''' % locals())

            R.png(E.getOutputFile("%s_saturation.png" % fn))
            R('''MEDIPS.plotSaturation(sr)''')
            R('''dev.off()''')
            R('''write.table(sr$estimation, file ='%s', sep='\t')''' %
              E.getOutputFile("%s_saturation_estimation.tsv" % fn))

            outfile = IOTools.openFile(
                E.getOutputFile("%s_saturation.tsv" % fn), "w")
            outfile.write("category\tvalues\n")
            outfile.write(
                "estimated_correlation\t%s\n" %
                ",".join(["%f" % x for x in R('''sr$maxEstCor''')]))
            outfile.write(
                "true_correlation\t%s\n" %
                ",".join(["%f" % x for x in R('''sr$maxTruCor''')]))
            outfile.write(
                "nreads\t%s\n" %
                ",".join(["%i" % x for x in R('''sr$numberReads''')]))
            outfile.close()

    if "coverage" in options.toolset or do_all:
        E.info("CpG coverage analysis")
        for fn in options.treatment_files + options.control_files:
            paired = isPaired(fn)
            R('''cr = MEDIPS.seqCoverage(
            file='%(fn)s',
            BSgenome='%(genome_file)s',
            pattern='CG',
            shift=%(shift)i,
            extend=%(extend)i,
            paired=%(paired)s,
            bwa=%(BWA)s,
            %(chrstring)s
            uniq=%(uniq)s)''' % locals())

            R.png(E.getOutputFile("%s_cpg_coverage_pie.png" % fn))
            R('''MEDIPS.plotSeqCoverage(seqCoverageObj=cr,
            type = "pie", cov.level = c(0, 1, 2, 3, 4, 5))''')
            R('''dev.off()''')

            R.png(E.getOutputFile("%s_cpg_coverage_hist.png" % fn))
            R('''MEDIPS.plotSeqCoverage(seqCoverageObj=cr,
            type = "hist", t=15)''')
            R('''dev.off()''')

            # note: this file is large
            R('''write.table(cr$cov.res, file=gzfile('%s','w'),
            sep='\t')''' %
              E.getOutputFile("%s_saturation_coveredpos.tsv.gz" % fn))

    if 'enrichment' in options.toolset or do_all:
        E.info("CpG enrichment analysis")
        outfile = IOTools.openFile(E.getOutputFile("enrichment.tsv.gz"), "w")
        # (slot in R result, column label, format). relH and GoGe are
        # ratios and thus need a float format - an integer format
        # truncates them.
        slotnames = (("regions.CG", "regions_CG", "%i"),
                     ("regions.C", "regions_C", "%s"),
                     ("regions.G", "regions_G", "%f"),
                     ("regions.relH", "regions_relH", "%f"),
                     ("regions.GoGe", "regions_GoGe", "%f"),
                     ("genome.CG", "genome_CG", "%s"),
                     ("genome.C", "genome_C", "%s"),
                     ("genome.G", "genome_G", "%i"),
                     ("genome.relH", "genome_relH", "%f"),
                     ("enrichment.score.relH", "enrichment_relH", "%s"),
                     ("enrichment.score.GoGe", "enrichment_GoGe", "%s"))

        try:
            outfile.write("\t".join(['sample'] +
                                    [x[1] for x in slotnames]) + "\n")
            for fn in options.treatment_files + options.control_files:
                paired = isPaired(fn)
                R('''ce = MEDIPS.CpGenrich(
                file='%(fn)s',
                BSgenome='%(genome_file)s',
                shift=%(shift)i,
                extend=%(extend)i,
                paired=%(paired)s,
                bwa=%(BWA)s,
                %(chrstring)s
                uniq=%(uniq)s)''' % locals())

                outfile.write("%s" % fn)
                for slotname, label, pattern in slotnames:
                    value = tuple(R('''ce$%s''' % slotname))
                    if len(value) == 0:
                        # slot is empty: output an empty field instead
                        # of failing on value[0]
                        outfile.write("\t")
                    else:
                        outfile.write("\t" + pattern % value[0])
                outfile.write("\n")
        finally:
            outfile.close()

    if options.input_rdata:
        E.info("reading R session info from '%s'" % options.input_rdata)
        R('''load('%s')''' % options.input_rdata)

    else:
        if "dmr" in options.toolset or "correlation" in options.toolset \
           or do_all:
            # build four sets
            for x, fn in enumerate(options.treatment_files):
                paired = isPaired(fn)
                E.info("loading '%s'" % fn)
                R('''treatment_R%(x)i = MEDIPS.createSet(
                file='%(fn)s',
                BSgenome='%(genome_file)s',
                shift=%(shift)i,
                extend=%(extend)i,
                window_size=%(window_size)i,
                paired=%(paired)s,
                bwa=%(BWA)s,
                %(chrstring)s
                uniq=%(uniq)s)''' % locals())

            R('''treatment_set = c(%s)''' %
              ",".join(["treatment_R%i" % x
                        for x in range(len(options.treatment_files))]))

            if options.control_files:
                for x, fn in enumerate(options.control_files):
                    paired = isPaired(fn)
                    E.info("loading '%s'" % fn)
                    R('''control_R%(x)i = MEDIPS.createSet(
                    file='%(fn)s',
                    BSgenome='%(genome_file)s',
                    shift=%(shift)i,
                    extend=%(extend)i,
                    window_size=%(window_size)i,
                    paired=%(paired)s,
                    bwa=%(BWA)s,
                    %(chrstring)s
                    uniq=%(uniq)s)''' % locals())

                R('''control_set = c(%s)''' %
                  ",".join(["control_R%i" % x
                            for x in range(len(options.control_files))]))

            # build coupling vector
            R('''CS = MEDIPS.couplingVector(pattern="CG",
            refObj = treatment_set[[1]])''')

            if "correlation" in options.toolset or do_all:
                R('''cor.matrix = MEDIPS.correlation(
                c(treatment_set, control_set))''')

                R('''write.table(cor.matrix,
                file='%s',
                sep="\t")''' % E.getOutputFile("correlation"))

            if "dmr" in options.toolset or do_all:
                # Data that does not fit the model causes
                # "Error in 1:max_signal_index : argument of length 0"
                # The advice is to set MeDIP=FALSE
                # See: http://comments.gmane.org/
                # gmane.science.biology.informatics.conductor/52319

                if options.is_medip:
                    medip = "TRUE"
                else:
                    medip = "FALSE"
                fdr_method = options.fdr_method

                E.info("applying test for differential methylation")
                R('''meth = MEDIPS.meth(
                MSet1 = treatment_set,
                MSet2 = control_set,
                CSet = CS,
                ISet1 = NULL,
                ISet2 = NULL,
                p.adj = "%(fdr_method)s",
                diff.method = "edgeR",
                MeDIP = %(medip)s,
                CNV = F,
                minRowSum = 1)''' % locals())

                # Note: several Gb in size
                # Output full methylation data table
                R('''write.table(meth,
                file=gzfile('%s', 'w'),
                sep="\t",
                row.names=F,
                quote=F)''' % E.getOutputFile("data.tsv.gz"))

            # save R session
            if options.output_rdata:
                R('''save.image(file='%s', safe=FALSE)''' %
                  E.getOutputFile("session.RData"))

    # DMR analysis - test for windows and output
    # NOTE(review): this step is only run if "dmr" is given explicitly,
    # it is not part of "all" - confirm that this is intended.
    if "dmr" in options.toolset:

        E.info("selecting differentially methylated windows")

        # test windows for differential methylation
        fdr_threshold = options.fdr_threshold
        R('''tested = MEDIPS.selectSig(meth,
        adj=T,
        ratio=NULL,
        p.value=%(fdr_threshold)f,
        bg.counts=NULL,
        CNV=F)''' % locals())

        R('''write.table(tested,
        file=gzfile('%s', 'w'),
        sep="\t",
        quote=F)''' % E.getOutputFile("significant_windows.gz"))

        # select gain and merge adjacent windows
        try:
            R('''gain = tested[which(tested[, grep("logFC",
            colnames(tested))] > 0),];
            gain_merged = MEDIPS.mergeFrames(frames=gain, distance=1)''')
            E.info('gain output: %s, merged: %s' %
                   (str(R('''dim(gain)''')),
                    str(R('''dim(gain_merged)'''))))
            R('''of=gzfile('%s', 'w');
            write.table(gain_merged,
            file=of,
            sep="\t",
            quote=F,
            row.names=FALSE,
            col.names=FALSE);
            close(of)''' % E.getOutputFile("gain.bed.gz"))
        except rpy2.rinterface.RRuntimeError as msg:
            E.warn("could not compute gain windows: msg=%s" % msg)

        # select loss and merge adjacent windows
        try:
            R('''loss = tested[which(tested[, grep("logFC",
            colnames(tested))] < 0),];
            loss_merged = MEDIPS.mergeFrames(frames=loss, distance=1)''')
            E.info('loss output: %s, merged: %s' %
                   (str(R('''dim(loss)''')),
                    str(R('''dim(loss_merged)'''))))
            R('''of=gzfile('%s', 'w');
            write.table(loss_merged,
            file=of,
            sep="\t",
            quote=F,
            row.names=F,
            col.names=F);
            close(of)''' % E.getOutputFile("loss.bed.gz"))
        except rpy2.rinterface.RRuntimeError as msg:
            E.warn("could not compute loss windows: msg=%s" % msg)

    # The "rpm" and "rms" toolsets are accepted on the command line,
    # but the export is currently disabled:
    #
    # if "rpm" in options.toolset or do_all:
    #     outputfile = E.getOutputFile("rpm.wig")
    #     R('''MEDIPS.exportWIG(file = '%(outputfile)s',
    #     data = CONTROL.SET, raw = T, descr = "rpm")''' %
    #       locals())
    #     if options.bigwig:
    #         bigwig(outputfile, contig_sizes)
    #     else:
    #         compress(outputfile)

    # if "rms" in options.toolset or do_all:
    #     outputfile = E.getOutputFile("rms.wig")
    #     R('''MEDIPS.exportWIG(file = '%(outputfile)s',
    #     data = CONTROL.SET, raw = F, descr = "rms")''' %
    #       locals())
    #     if options.bigwig:
    #         bigwig(outputfile, contig_sizes)
    #     else:
    #         compress(outputfile)

    # write footer and output benchmark information.
    E.Stop()