def main(config_file):
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)
    # make the needed directories
    map(safe_makedir, config["dir"].values())
    # specific for thesis pipeline
    in_dir = config["dir"]["data"]
    id_file = config["id_file"]
    curr_files = input_files_from_dir(in_dir, id_file)
    logger.info("Running pipeline on %s." % (curr_files))
    for stage in config["run"]:
        if stage == "fastqc":
            logger.info("Running fastqc on %s." % (curr_files))
            stage_runner = fastqc.FastQC(config)
            view.map(stage_runner, curr_files, block=False)
        if stage == "cutadapt":
            logger.info("Running cutadapt on %s." % (curr_files))
            stage_runner = trim.Cutadapt(config)
            curr_files = view.map(stage_runner, curr_files)
        if stage == "bowtie":
            logger.info("Running bowtie on %s." % (curr_files))
            bowtie = Bowtie(config)
            curr_files = view.map(bowtie, curr_files)
            mapped = view.map(sam.only_mapped, curr_files)
            unmapped = view.map(sam.only_unmapped, curr_files)
            curr_files = mapped
            bam_files = view.map(sam.sam2bam, mapped)
            bam_sorted = view.map(sam.bamsort, bam_files)
            view.map(sam.bamindex, bam_sorted)
        if stage == "coverage":
            logger.info("Calculating RNASeq metrics on %s." % (curr_files))
            nrun = len(curr_files)
            ref = prepare_ref_file(config["stage"][stage]["ref"], config)
            ribo = config["stage"][stage]["ribo"]
            picard = BroadRunner(config["program"]["picard"])
            out_dir = os.path.join(config["dir"]["results"], stage)
            safe_makedir(out_dir)
            out_files = [replace_suffix(os.path.basename(x), "metrics")
                         for x in curr_files]
            out_files = [os.path.join(out_dir, x) for x in out_files]
            out_files = view.map(picardrun.picard_rnaseq_metrics,
                                 [picard] * nrun, curr_files,
                                 [ref] * nrun, [ribo] * nrun, out_files)
    stop_cluster()
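# A minimal sketch of the YAML layout main() above appears to expect, written
# out as the equivalent Python dict. The keys are inferred from the lookups in
# the pipeline (config["dir"], config["run"], config["stage"], ...); the
# concrete values are illustrative assumptions, not taken from a real project.
example_config = {
    "id_file": "sample_ids.txt",
    "dir": {"data": "data", "results": "results"},
    "run": ["fastqc", "cutadapt", "bowtie", "coverage"],
    "stage": {
        "coverage": {"ref": "refFlat",  # handed to prepare_ref_file()
                     "ribo": "rrna_intervals.list"},
    },
    "program": {"picard": "/usr/local/share/picard"},
}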
def main(config_file):
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)
    setup_logging(config)
    start_cluster(config)
    from bipy.cluster import view
    input_files = config["input"]
    for stage in config["run"]:
        if config["stage"][stage]["program"] == "tagdust":
            tagdust_config = config["stage"][stage]
            view.map(tagdust.run, input_files,
                     [tagdust_config] * len(input_files),
                     [config] * len(input_files))
    stop_cluster()
def main(config_file):
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)
    setup_logging(config)
    from bipy.log import logger
    start_cluster(config)
    from bipy.cluster import view
    input_dir = config["dir"]["input_dir"]
    results_dir = config["dir"]["results"]
    input_files = glob.glob(os.path.join(input_dir, "*.bam"))
    # example running with macs:
    # macs.run_with_config(input_file, config, control_file=None, stage=None)
    curr_files = input_files
    # first combine all the negative controls into one file
    negative_control = _merge_condition(input_files,
                                        config["groups"]["negative"])
    test_files = [_merge_condition(input_files, condition)
                  for condition in config["groups"]["test"]]
    test_files = [x for x in test_files if x]
    curr_files = test_files
    for stage in config["run"]:
        # run macs on each merged test file, using the pooled negative
        # control as the control file
        if stage == "macs":
            nfiles = len(curr_files)
            out_files = view.map(macs.run_with_config, curr_files,
                                 [config] * nfiles,
                                 [negative_control] * nfiles,
                                 [stage] * nfiles)
            # just use the peak files going forward
            peak_files = [x[0] for x in out_files]
            curr_files = peak_files
        if stage == "piranha":
            nfiles = len(curr_files)
            piranha_runner = piranha.PiranhaStage(config)
            out_files = view.map(piranha_runner, curr_files)
    stop_cluster()
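# _merge_condition() is called above but not defined in this file. A plausible
# sketch, assuming it pools the BAMs whose basenames contain the condition
# label into one merged BAM with samtools; the filename convention, the output
# naming and the sh-based merge call are assumptions for illustration only.
def _merge_condition(input_files, condition):
    import sh
    matched = [x for x in input_files if condition in os.path.basename(x)]
    if not matched:
        # no files for this condition; callers above filter out None
        return None
    out_file = os.path.join(os.path.dirname(matched[0]),
                            condition + ".merged.bam")
    if not os.path.exists(out_file):
        # samtools merge out.bam in1.bam in2.bam ...
        sh.samtools("merge", out_file, *matched)
    return out_file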
def main(config_file):
    # load yaml config file
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)
    # setup logging
    setup_logging(config)
    from bipy.log import logger
    # start cluster
    start_cluster(config)
    from bipy.cluster import view
    found = sh.find(config["dir"]["data"], "-name", "Variations")
    var_dirs = [str(x).strip() for x in found]
    logger.info("Var_dirs: %s" % (var_dirs))
    in_dirs = map(os.path.dirname, var_dirs)
    logger.info("in_dirs: %s" % (in_dirs))
    # XXX for testing, only load the first few directories
    #curr_files = in_dirs[0:5]
    curr_files = in_dirs
    # run the illumina fixer
    logger.info("Running illumina fixer on %s." % (curr_files))
    illf_class = STAGE_LOOKUP.get("illumina_fixer")
    illf = illf_class(config)
    curr_files = view.map(illf, curr_files)

    # sort the vcf files; the imports live inside the function so it is
    # self-contained when shipped to the cluster engines
    def sort_vcf(in_file):
        from bipy.utils import append_stem
        from bcbio.distributed.transaction import file_transaction
        from bcbio.utils import file_exists
        import sh
        out_file = append_stem(in_file, "sorted")
        if file_exists(out_file):
            return out_file
        with file_transaction(out_file) as tmp_out_file:
            sh.vcf_sort(in_file, _out=tmp_out_file)
        return out_file

    # combine
    out_file = os.path.join(config["dir"].get("results", "results"),
                            "geminiloader", "all_combined.vcf")
    logger.info("Combining files %s into %s." % (curr_files, out_file))
    if file_exists(out_file):
        curr_files = [out_file]
    else:
        curr_files = [genotype.combine_variant_files(curr_files, out_file,
                                                     config["ref"]["fasta"],
                                                     config)]
    # break the VCF files up by chromosome for speed
    logger.info("Breaking up %s by chromosome." % (curr_files))
    breakvcf_class = STAGE_LOOKUP.get("breakvcf")
    breakvcf = breakvcf_class(config)
    curr_files = view.map(breakvcf, curr_files)
    # run VEP on the separate files in parallel
    logger.info("Running VEP on %s." % (curr_files))
    vep_class = STAGE_LOOKUP.get("vep")
    vep = vep_class(config)
    curr_files = view.map(vep, list(flatten(curr_files)))
    curr_files = filter(file_exists, curr_files)
    # sort the vcf files
    logger.info("Sorting %s." % (curr_files))
    curr_files = view.map(sort_vcf, curr_files)
    # don't run the rest of this in parallel, so take the cluster down
    stop_cluster()
    out_file = os.path.join(config["dir"].get("results", "results"),
                            "geminiloader", "all_combined.vep.vcf")
    logger.info("Combining files %s into %s." % (curr_files, out_file))
    if file_exists(out_file):
        curr_files = [out_file]
    else:
        curr_files = [genotype.combine_variant_files(curr_files, out_file,
                                                     config["ref"]["fasta"],
                                                     config)]
    # load the files into gemini; this step does not run in parallel
    logger.info("Loading %s into gemini." % (curr_files))
    gemini_class = STAGE_LOOKUP.get("geminiloader")
    geminiloader = gemini_class(config)
    curr_files = map(geminiloader, curr_files)
    logger.info("Run complete.")
def main(config_file):
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)
    # make the needed directories
    map(safe_makedir, config["dir"].values())
    # specific for project
    input_dir = config["dir"]["data"]
    logger.info("Loading files from %s" % (input_dir))
    input_files = list(locate("*.fq", input_dir))
    input_files += list(locate("*.fastq", input_dir))
    logger.info("Input files: %s" % (input_files))
    results_dir = config["dir"]["results"]
    safe_makedir(results_dir)
    # make the stage repository
    repository = StageRepository(config)
    logger.info("Stages found: %s" % (repository.plugins))
    if config.get("test_pipeline", False):
        logger.info("Running a test pipeline on a subset of the reads.")
        results_dir = os.path.join(results_dir, "test_pipeline")
        config["dir"]["results"] = results_dir
        safe_makedir(results_dir)
        curr_files = map(make_test, input_files, [config] * len(input_files))
        logger.info("Converted %s to %s." % (input_files, curr_files))
    else:
        curr_files = input_files
    logger.info("Running RNASeq alignment pipeline on %s." % (curr_files))
    for stage in config["run"]:
        if stage == "fastqc":
            logger.info("Running fastqc on %s." % (curr_files))
            stage_runner = FastQC(config)
            view.map(stage_runner, curr_files)
        if stage == "cutadapt":
            curr_files = combine_pairs(curr_files)
            logger.info("Running cutadapt on %s." % (curr_files))
            stage_runner = Cutadapt(config)
            curr_files = view.map(stage_runner, curr_files)
        if stage == "tophat":
            logger.info("Running Tophat on %s." % (curr_files))
            #tophat = repository["tophat"](config)
            tophat = Tophat(config)
            tophat_outputs = view.map(tophat, curr_files)
            sortsam = view.map(sam.coordinate_sort_sam, tophat_outputs,
                               [config] * len(tophat_outputs))
            bamfiles = view.map(sam.sam2bam, sortsam)
            bamsort = view.map(sam.bamsort, bamfiles)
            view.map(sam.bamindex, bamsort)
            final_bamfiles = bamsort
            curr_files = tophat_outputs
        if stage == "disambiguate":
            logger.info("Disambiguating %s." % (curr_files))
            disambiguate = repository[stage](config)
            view.map(disambiguate, curr_files)
        if stage == "htseq-count":
            logger.info("Running htseq-count on %s." % (bamfiles))
            name_sorted = view.map(sam.bam_name_sort, bamfiles)
            curr_files = view.map(sam.bam2sam, name_sorted)
            htseq_args = zip(*product(curr_files, [config], [stage]))
            htseq_outputs = view.map(htseq_count.run_with_config, *htseq_args)
            htseq_count.combine_counts(htseq_outputs)
        if stage == "rnaseq_metrics":
            logger.info("Calculating RNASeq metrics on %s." % (curr_files))
            #coverage = repository[stage](config)
            coverage = RNASeqMetrics(config)
            view.map(coverage, curr_files)
        if stage == "rseqc":
            logger.info("Running rseqc on %s." % (curr_files))
            #rseq_args = zip(*product(curr_files, [config]))
            rseq_args = zip(*product(final_bamfiles, [config]))
            view.map(rseqc.bam_stat, *rseq_args)
            down_args = zip(*product(final_bamfiles, [40000000]))
            down_bam = view.map(sam.downsample_bam, *down_args)
            view.map(rseqc.genebody_coverage, down_bam,
                     [config] * len(down_bam))
            view.map(rseqc.junction_annotation, *rseq_args)
            view.map(rseqc.junction_saturation, *rseq_args)
            RPKM_args = zip(*product(final_bamfiles, [config]))
            RPKM_count_out = view.map(rseqc.RPKM_count, *RPKM_args)
            RPKM_count_fixed = view.map(rseqc.fix_RPKM_count_file,
                                        RPKM_count_out)
            #annotate_args = zip(*product(RPKM_count_fixed, ["gene_id"],
            #                             ["ensembl_gene_id"], ["human"]))
            #view.map(annotate.annotate_table_with_biomart, *annotate_args)
            view.map(rseqc.RPKM_saturation, *rseq_args)
            curr_files = tophat_outputs
    # end gracefully
    stop_cluster()
def main(config_file): """ this assumes that we are keeping the same order of the files throughout """ with open(config_file) as in_handle: config = yaml.load(in_handle) # make the needed directories map(safe_makedir, config["dir"].values()) input_dir = config["input_dir"] results_dir = config["dir"].get("results", "results") input_files = glob.glob(os.path.join(input_dir, "*.fq")) curr_files = _make_current_files(input_files) conditions = [os.path.basename(x).split("_")[0] for x in input_files] for stage in config["run"]: if stage == "fastqc": _emit_stage_message(stage, curr_files) fastqc_config = _get_stage_config(config, stage) fastqc_args = zip(*product(curr_files, [fastqc_config], [config])) fastqc_out = view.map(fastqc.run, *fastqc_args) logger.info("fastqc outfiles: %s" % (fastqc_out)) if stage == "cutadapt": _emit_stage_message(stage, curr_files) cutadapt_config = _get_stage_config(config, stage) cutadapt_args = zip(*product(curr_files, [cutadapt_config], [config])) cutadapt_outputs = view.map(cutadapt_tool.run, *cutadapt_args) curr_files = _make_current_files(cutadapt_outputs) if stage == "tophat": _emit_stage_message(stage, curr_files) tophat_config = _get_stage_config(config, stage) tophat_args = zip(*product(curr_files, [None], [config["ref"]], ["tophat"], [config])) tophat_outputs = view.map(tophat.run_with_config, *tophat_args) # convert to bam, sort and index bamfiles = view.map(sam.sam2bam, tophat_outputs) sorted_bf = view.map(sam.bamsort, bamfiles) view.map(sam.bamindex, sorted_bf) curr_files = sorted_bf if stage == "rseqc": _emit_stage_message(stage, curr_files) rseqc_config = _get_stage_config(config, stage) rseq_args = zip(*product(curr_files, [config])) view.map(rseqc.bam2bigwig, *rseq_args, block=False) view.map(rseqc.bam_stat, *rseq_args, block=False) view.map(rseqc.clipping_profile, *rseq_args, block=False) view.map(rseqc.genebody_coverage, *rseq_args, block=False) view.map(rseqc.junction_annotation, *rseq_args, block=False) view.map(rseqc.junction_saturation, *rseq_args, block=False) view.map(rseqc.RPKM_count, *rseq_args, block=False) view.map(rseqc.RPKM_saturation, *rseq_args, block=False) curr_files = tophat_outputs if stage == "coverage": logger.info("Calculating RNASeq metrics on %s." 
% (curr_files)) nrun = len(curr_files) ref = prepare_ref_file(config["stage"][stage]["ref"], config) ribo = config["stage"][stage]["ribo"] picard = BroadRunner(config["program"]["picard"]) out_dir = os.path.join(results_dir, stage) safe_makedir(out_dir) out_files = [replace_suffix(os.path.basename(x), "metrics") for x in curr_files] out_files = [os.path.join(out_dir, x) for x in out_files] out_files = view.map(picardrun.picard_rnaseq_metrics, [picard] * nrun, curr_files, [ref] * nrun, [ribo] * nrun, out_files) if stage == "htseq-count": _emit_stage_message(stage, curr_files) htseq_config = _get_stage_config(config, stage) htseq_args = zip(*product(curr_files, [config], [stage])) htseq_outputs = view.map(htseq_count.run_with_config, *htseq_args) combined_out = os.path.join(config["dir"]["results"], stage, "all_combined.counts") combined_out = htseq_count.combine_counts(htseq_outputs, None, out_file=combined_out) if stage == "deseq": _emit_stage_message(stage, curr_files) deseq_config = _get_stage_config(config, stage) out_dir = os.path.join(config["dir"]["results"], stage) safe_makedir(out_dir) for comparison in deseq_config["comparisons"]: comparison_name = "_vs_".join(comparison) out_dir = os.path.join(results_dir, stage, comparison_name) safe_makedir(out_dir) indexes = [x for x, y in enumerate(conditions) if y in comparison] htseq_files = [htseq_outputs[index] for index in indexes] htseq_columns = [conditions[index] for index in indexes] out_file = os.path.join(out_dir, comparison_name + ".counts.txt") combined_out = htseq_count.combine_counts(htseq_files, htseq_columns, out_file) deseq_conds = [conditions[index] for index in indexes] deseq_out = os.path.join(out_dir, comparison_name + ".deseq.txt") logger.info("Running deseq on %s with conditions %s " "and writing to %s" % (combined_out, conditions, deseq_out)) view.map(deseq.run, [combined_out], [deseq_conds], [deseq_out]) annotated_file = view.map(annotate.annotate_table_with_biomart, [deseq_out], ["id"], ["ensembl_gene_id"], ["zebrafish"]) # end gracefully stop_cluster()
def main(config_file):
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)
    # make the needed directories
    map(safe_makedir, config["dir"].values())
    # specific for thesis pipeline
    input_dirs = config["input_dirs"]
    results_dir = config["dir"].get("results", "results")
    input_files = _find_input_files(config)
    conditions = _group_input_by_condition(input_files)
    logger.info("Input_files: %s" % (input_files))
    logger.info("Condition groups %s" % (conditions))
    htseq_outdict = {}
    for condition, curr_files in conditions.items():
        condition_dir = os.path.join(results_dir, condition)
        safe_makedir(condition_dir)
        config["dir"]["results"] = condition_dir
        for stage in config["run"]:
            if stage == "fastqc":
                _emit_stage_message(stage, curr_files)
                fastqc_config = _get_stage_config(config, stage)
                fastqc_args = zip(*product(curr_files, [fastqc_config],
                                           [config]))
                view.map(fastqc.run, *fastqc_args)
            if stage == "cutadapt":
                _emit_stage_message(stage, curr_files)
                cutadapt_config = _get_stage_config(config, stage)
                cutadapt_args = zip(*product(curr_files, [cutadapt_config],
                                             [config]))
                cutadapt_outputs = view.map(cutadapt_tool.run, *cutadapt_args)
                curr_files = cutadapt_outputs
                logger.info("Fixing mate pair information.")
                pairs = combine_pairs(curr_files)
                first = [x[0] for x in pairs]
                second = [x[1] for x in pairs]
                logger.info("Forward: %s" % (first))
                logger.info("Reverse: %s" % (second))
                fixed = view.map(fastq.fix_mate_pairs_with_config,
                                 first, second, [config] * len(first))
                curr_files = list(flatten(fixed))
            if stage == "sickle":
                _emit_stage_message(stage, curr_files)
                pairs = combine_pairs(curr_files)
                first = [x[0] for x in pairs]
                second = [x[1] for x in pairs]
                fixed = view.map(sickle.run_with_config, first, second,
                                 [config] * len(first))
                curr_files = list(flatten(fixed))
            if stage == "tophat":
                _emit_stage_message(stage, curr_files)
                tophat_config = _get_stage_config(config, stage)
                pairs = combine_pairs(curr_files)
                first = [x[0] for x in pairs]
                second = [x[1] for x in pairs]
                logger.info("first %s" % (first))
                logger.info("second %s" % (second))
                #tophat_args = zip(*product(first, second, [config["ref"]],
                #                           ["tophat"], [config]))
                tophat_outputs = view.map(tophat.run_with_config,
                                          first, second,
                                          [config["ref"]] * len(first),
                                          ["tophat"] * len(first),
                                          [config] * len(first))
                bamfiles = view.map(sam.sam2bam, tophat_outputs)
                bamsort = view.map(sam.bamsort, bamfiles)
                view.map(sam.bamindex, bamsort)
                final_bamfiles = bamsort
                curr_files = tophat_outputs
            if stage == "htseq-count":
                _emit_stage_message(stage, curr_files)
                htseq_config = _get_stage_config(config, stage)
                htseq_args = zip(*product(curr_files, [config], [stage]))
                htseq_outputs = view.map(htseq_count.run_with_config,
                                         *htseq_args)
                htseq_outdict[condition] = htseq_outputs
            if stage == "coverage":
                logger.info("Calculating RNASeq metrics on %s." % (curr_files))
                nrun = len(curr_files)
                ref = prepare_ref_file(config["stage"][stage]["ref"], config)
                ribo = config["stage"][stage]["ribo"]
                picard = BroadRunner(config["program"]["picard"])
                out_dir = os.path.join(results_dir, stage)
                safe_makedir(out_dir)
                out_files = [replace_suffix(os.path.basename(x), "metrics")
                             for x in curr_files]
                out_files = [os.path.join(out_dir, x) for x in out_files]
                out_files = view.map(picardrun.picard_rnaseq_metrics,
                                     [picard] * nrun, curr_files,
                                     [ref] * nrun, [ribo] * nrun, out_files)
            if stage == "rseqc":
                _emit_stage_message(stage, curr_files)
                rseqc_config = _get_stage_config(config, stage)
                rseq_args = zip(*product(curr_files, [config]))
                view.map(rseqc.bam_stat, *rseq_args)
                view.map(rseqc.genebody_coverage, *rseq_args)
                view.map(rseqc.junction_annotation, *rseq_args)
                view.map(rseqc.junction_saturation, *rseq_args)
                RPKM_args = zip(*product(final_bamfiles, [config]))
                RPKM_count_out = view.map(rseqc.RPKM_count, *RPKM_args)
                RPKM_count_fixed = view.map(rseqc.fix_RPKM_count_file,
                                            RPKM_count_out)
                #annotate_args = zip(*product(RPKM_count_fixed, ["gene_id"],
                #                             ["ensembl_gene_id"], ["human"]))
                #view.map(annotate.annotate_table_with_biomart, *annotate_args)
                view.map(rseqc.RPKM_saturation, *rseq_args)
                curr_files = tophat_outputs
    # combine htseq-count files and run deseq on them
    conditions, htseq_files = dict_to_vectors(htseq_outdict)
    deseq_config = _get_stage_config(config, "deseq")
    cell_types = _group_input_by_cell_type(htseq_files)
    for cell_type, files in cell_types.items():
        for comparison in deseq_config["comparisons"]:
            comparison_name = "_vs_".join(comparison)
            deseq_dir = os.path.join(results_dir, "deseq", cell_type,
                                     comparison_name)
            safe_makedir(deseq_dir)
            out_file = os.path.join(deseq_dir, comparison_name + ".counts.txt")
            files_by_condition = _group_input_by_condition(files)
            _emit_stage_message("deseq", files_by_condition)
            c, f = dict_to_vectors(files_by_condition)
            combined_out = htseq_count.combine_counts(f, None, out_file)
            deseq_out = os.path.join(deseq_dir, comparison_name)
            logger.info("Running deseq on %s with conditions %s "
                        "and writing to %s" % (combined_out, conditions,
                                               deseq_out))
            deseq_out = view.map(deseq.run, [combined_out], [c], [deseq_out])
            annotate.annotate_table_with_biomart(deseq_out[0], "id",
                                                 "ensembl_gene_id", "human")
            #annotated_file = view.map(annotate.annotate_table_with_biomart,
            #                          [deseq_out], ["id"],
            #                          ["ensembl_gene_id"], ["human"])
    # end gracefully
    stop_cluster()
def main(config_file):
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)
    # make the needed directories
    map(safe_makedir, config["dir"].values())
    stage_dict = {"download_encode": _download_encode,
                  "fastqc": _run_fastqc}
    curr_files = config["encode_file"]
    results_dir = config["dir"].get("results", "results")
    for cell_type in config["cell_types"]:
        cell_type_dir = os.path.join(results_dir, cell_type)
        safe_makedir(cell_type_dir)
        config["dir"]["results"] = cell_type_dir
        in_files = glob.glob(os.path.join(config["dir"]["data"], cell_type,
                                          "*"))
        curr_files = in_files
        for stage in config["run"]:
            if stage == "fastqc":
                _emit_stage_message(stage, curr_files)
                fastqc_config = _get_stage_config(config, stage)
                fastqc_args = zip(*product(curr_files, [fastqc_config],
                                           [config]))
                view.map(fastqc.run, *fastqc_args)
            if stage == "cutadapt":
                _emit_stage_message(stage, curr_files)
                cutadapt_config = _get_stage_config(config, stage)
                cutadapt_args = zip(*product(curr_files, [cutadapt_config],
                                             [config]))
                cutadapt_outputs = view.map(cutadapt_tool.run, *cutadapt_args)
                curr_files = cutadapt_outputs
            if stage == "tophat":
                _emit_stage_message(stage, curr_files)
                tophat_config = _get_stage_config(config, stage)
                tophat_args = zip(*product(curr_files, [None], [config["ref"]],
                                           ["tophat"], [config]))
                tophat_outputs = view.map(tophat.run_with_config, *tophat_args)
                picard = BroadRunner(config["program"]["picard"])
                # convert to bam
                #args = zip(*product([picard], tophat_outputs))
                #bamfiles = view.map(picardrun.picard_formatconverter, *args)
                bamfiles = view.map(sam.sam2bam, tophat_outputs)
                sorted_bf = view.map(sam.bamsort, bamfiles)
                view.map(sam.bamindex, sorted_bf)
                curr_files = sorted_bf
            if stage == "rseqc":
                _emit_stage_message(stage, curr_files)
                rseqc_config = _get_stage_config(config, stage)
                rseq_args = zip(*product(curr_files, [config]))
                view.map(rseqc.bam2bigwig, *rseq_args, block=False)
                view.map(rseqc.bam_stat, *rseq_args, block=False)
                view.map(rseqc.clipping_profile, *rseq_args, block=False)
                view.map(rseqc.genebody_coverage, *rseq_args, block=False)
                view.map(rseqc.junction_annotation, *rseq_args, block=False)
                view.map(rseqc.junction_saturation, *rseq_args, block=False)
                RPKM_count_files = view.map(rseqc.RPKM_count, *rseq_args)
                dirs_to_process = list(set(map(os.path.dirname,
                                               RPKM_count_files)))
                logger.info("Count files: %s" % (RPKM_count_files))
                logger.info("dirnames to process: %s" % (dirs_to_process))
                RPKM_merged = view.map(rseqc.merge_RPKM, dirs_to_process)
                view.map(rseqc.RPKM_saturation, *rseq_args, block=False)
                curr_files = tophat_outputs
            if stage == "htseq-count":
                _emit_stage_message(stage, curr_files)
                htseq_config = _get_stage_config(config, stage)
                htseq_args = zip(*product(curr_files, [config], [stage]))
                htseq_outputs = view.map(htseq_count.run_with_config,
                                         *htseq_args)
                column_names = in_files
                out_file = os.path.join(config["dir"]["results"], stage,
                                        cell_type + ".combined.counts")
                combined_out = htseq_count.combine_counts(htseq_outputs,
                                                          column_names,
                                                          out_file)
                rpkm = htseq_count.calculate_rpkm(combined_out,
                                                  config["annotation"]["file"])
                rpkm_file = os.path.join(config["dir"]["results"], stage,
                                         cell_type + ".rpkm.txt")
                rpkm.to_csv(rpkm_file, sep="\t")
            if stage == "coverage":
                logger.info("Calculating RNASeq metrics on %s." % (curr_files))
                nrun = len(curr_files)
                ref = prepare_ref_file(config["stage"][stage]["ref"], config)
                ribo = config["stage"][stage]["ribo"]
                picard = BroadRunner(config["program"]["picard"])
                out_dir = os.path.join(config["dir"]["results"], stage)
                safe_makedir(out_dir)
                out_files = [replace_suffix(os.path.basename(x), "metrics")
                             for x in curr_files]
                out_files = [os.path.join(out_dir, x) for x in out_files]
                out_files = view.map(picardrun.picard_rnaseq_metrics,
                                     [picard] * nrun, curr_files,
                                     [ref] * nrun, [ribo] * nrun, out_files)
    # end gracefully: wait for queued jobs to finish, then exit
    view.wait()
    stop_cluster()
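# Several stages above queue work with view.map(..., block=False), which
# returns immediately; the view.wait() at the end is what keeps the process
# alive until those asynchronous jobs drain. A minimal sketch of the same
# pattern in isolation, assuming an IPython.parallel load-balanced view like
# the one bipy.cluster exposes (square() is a hypothetical task):
def square(x):
    return x * x

#async_result = view.map(square, range(10), block=False)  # returns at once
#view.wait()               # block until every queued task has finished
#print async_result.get()  # [0, 1, 4, 9, ...]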
def main(config_file):
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)
    setup_logging(config)
    from bipy.log import logger
    start_cluster(config)
    from bipy.cluster import view
    # view.push({'logger': logger})
    input_files = [os.path.join(config["dir"]["data"], x)
                   for x in config["input"]]
    results_dir = config["dir"]["results"]
    map(safe_makedir, config["dir"].values())
    curr_files = input_files
    for stage in config["run"]:
        if stage == "fastqc":
            nfiles = len(curr_files)
            logger.info("Running %s on %s" % (stage, str(curr_files)))
            fastqc_config = _get_stage_config(config, stage)
            fastqc_outputs = view.map(fastqc.run, curr_files,
                                      [fastqc_config] * nfiles,
                                      [config] * nfiles)
        if stage == "cutadapt":
            nfiles = len(curr_files)
            cutadapt_config = _get_stage_config(config, stage)
            cutadapt_outputs = view.map(cutadapt_tool.run, curr_files,
                                        [cutadapt_config] * nfiles,
                                        [config] * nfiles)
            curr_files = cutadapt_outputs
        if stage == "novoalign":
            nfiles = len(curr_files)
            novoalign_config = _get_stage_config(config, stage)
            #db = novoindex.run(config["ref"],
            #                   _get_stage_config(config, "novoindex"),
            #                   config)
            db = config["genome"]["file"]
            novoalign_outputs = view.map(novoalign.run, curr_files,
                                         [db] * nfiles,
                                         [novoalign_config] * nfiles,
                                         [config] * nfiles)
            picard = BroadRunner(config["program"]["picard"])
            # convert to bam
            args = zip(*itertools.product([picard], novoalign_outputs))
            bamfiles = view.map(picardrun.picard_formatconverter, *args)
            # sort bam
            args = zip(*itertools.product([picard], bamfiles))
            sorted_bf = view.map(picardrun.picard_sort, *args)
            # index bam
            args = zip(*itertools.product([picard], sorted_bf))
            view.map(picardrun.picard_index, *args)
            curr_files = novoalign_outputs
        if stage == "htseq-count":
            nfiles = len(curr_files)
            htseq_config = _get_stage_config(config, stage)
            htseq_outputs = view.map(htseq_count.run_with_config,
                                     curr_files, [config] * nfiles,
                                     [stage] * nfiles)
            column_names = _get_short_names(input_files)
            logger.info("Column names: %s" % (column_names))
            out_file = os.path.join(config["dir"]["results"], stage,
                                    "combined.counts")
            combined_out = htseq_count.combine_counts(htseq_outputs,
                                                      column_names, out_file)
            rpkm = htseq_count.calculate_rpkm(combined_out,
                                              config["annotation"]["file"])
            rpkm_file = os.path.join(config["dir"]["results"], stage,
                                     "rpkm.txt")
            rpkm.to_csv(rpkm_file, sep="\t")
        if stage == "coverage":
            logger.info("Calculating RNASeq metrics on %s." % (curr_files))
            nrun = len(curr_files)
            ref = blastn.prepare_ref_file(config["stage"][stage]["ref"],
                                          config)
            ribo = config["stage"][stage]["ribo"]
            picard = BroadRunner(config["program"]["picard"])
            out_dir = os.path.join(results_dir, stage)
            safe_makedir(out_dir)
            out_files = [replace_suffix(os.path.basename(x), "metrics")
                         for x in curr_files]
            out_files = [os.path.join(out_dir, x) for x in out_files]
            out_files = view.map(picardrun.picard_rnaseq_metrics,
                                 [picard] * nrun, curr_files,
                                 [ref] * nrun, [ribo] * nrun, out_files)
        if stage == "deseq":
            conditions = [os.path.basename(x).split("_")[0]
                          for x in input_files]
            deseq_config = _get_stage_config(config, stage)
            out_dir = os.path.join(results_dir, stage)
            safe_makedir(out_dir)
            for comparison in deseq_config["comparisons"]:
                comparison_name = "_vs_".join(comparison)
                out_dir = os.path.join(results_dir, stage, comparison_name)
                safe_makedir(out_dir)
                # get the indexes of the conditions that match this comparison
                indexes = [x for x, y in enumerate(conditions)
                           if y in comparison]
                # find the htseq_files to combine and combine them
                htseq_files = [htseq_outputs[index] for index in indexes]
                htseq_columns = [column_names[index] for index in indexes]
                logger.info(htseq_files)
                logger.info(htseq_columns)
                out_file = os.path.join(out_dir,
                                        comparison_name + ".counts.txt")
                combined_out = htseq_count.combine_counts(htseq_files,
                                                          htseq_columns,
                                                          out_file)
                deseq_conds = [conditions[index] for index in indexes]
                deseq_prefix = os.path.join(out_dir, comparison_name)
                deseq_out = view.map(deseq.run, [combined_out],
                                     [deseq_conds], [deseq_prefix])
                logger.info("Annotating %s." % (deseq_out))
                annotated_file = view.map(annotate.annotate_table_with_biomart,
                                          deseq_out, ["id"],
                                          ["ensembl_gene_id"], ["human"])
    stop_cluster()
def main(config_file):
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)
    # make the needed directories
    map(safe_makedir, config["dir"].values())
    # specific for project
    input_dir = config["input_dir"]
    logger.info("Loading files from %s" % (input_dir))
    input_files = list(locate("*.sam", os.path.join(input_dir,
                                                    "tophat_control")))
    input_files += list(locate("*.sam", os.path.join(input_dir,
                                                     "tophat_exposed")))
    input_files = [x for x in input_files if "accepted" not in x]
    input_files = [x for x in input_files if "innerdist_estimate" not in x]
    logger.info("Input files: %s" % (input_files))
    results_dir = config["dir"]["results"]
    safe_makedir(results_dir)
    # make the stage repository
    repository = StageRepository(config)
    logger.info("Stages found: %s" % (repository.plugins))
    curr_files = input_files
    logger.info("Running quantitation on %s." % (curr_files))
    for stage in config["run"]:
        if stage == "htseq-count":
            logger.info("Running htseq-count on %s." % (curr_files))
            htseq_args = zip(*product(curr_files, [config], [stage]))
            htseq_outputs = view.map(htseq_count.run_with_config, *htseq_args)
            htseq_count.combine_counts(htseq_outputs)
        if stage == "rnaseq_metrics":
            logger.info("Calculating RNASeq metrics on %s." % (curr_files))
            #coverage = repository[stage](config)
            curr_files = view.map(sam.bam2sam, curr_files)
            coverage = RNASeqMetrics(config)
            view.map(coverage, curr_files)
        if stage == "rseqc":
            logger.info("Running rseqc on %s." % (curr_files))
            rseq_args = zip(*product(curr_files, [config]))
            view.map(rseqc.bam_stat, *rseq_args)
            view.map(rseqc.genebody_coverage, *rseq_args)
            view.map(rseqc.junction_annotation, *rseq_args)
            view.map(rseqc.junction_saturation, *rseq_args)
            # the original referenced an undefined final_bamfiles here, a
            # leftover from the alignment pipelines; use the current file set
            RPKM_args = zip(*product(curr_files, [config]))
            RPKM_count_out = view.map(rseqc.RPKM_count, *RPKM_args)
            RPKM_count_fixed = view.map(rseqc.fix_RPKM_count_file,
                                        RPKM_count_out)
            #annotate_args = zip(*product(RPKM_count_fixed, ["gene_id"],
            #                             ["ensembl_gene_id"], ["human"]))
            #view.map(annotate.annotate_table_with_biomart, *annotate_args)
            view.map(rseqc.RPKM_saturation, *rseq_args)
    # end gracefully
    stop_cluster()
def main(config_file):
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)
    # make the needed directories
    map(safe_makedir, config["dir"].values())
    # specific for project
    input_dir = config["input_dir"]
    logger.info("Loading files from %s" % (input_dir))
    input_files = list(locate("*.fq", input_dir))
    input_files += list(locate("*.fastq", input_dir))
    logger.info("Input files: %s" % (input_files))
    results_dir = config["dir"]["results"]
    safe_makedir(results_dir)
    if config.get("test_pipeline", False):
        logger.info("Running a test pipeline on a subset of the reads.")
        results_dir = os.path.join(results_dir, "test_pipeline")
        config["dir"]["results"] = results_dir
        safe_makedir(results_dir)
        curr_files = map(make_test, input_files, [config] * len(input_files))
        logger.info("Converted %s to %s." % (input_files, curr_files))
    else:
        curr_files = input_files
    logger.info("Running RNASeq alignment pipeline on %s." % (curr_files))
    for stage in config["run"]:
        if stage == "fastqc":
            logger.info("Running fastqc on %s." % (curr_files))
            stage_runner = FastQC(config)
            view.map(stage_runner, curr_files)
        if stage == "cutadapt":
            curr_files = combine_pairs(curr_files)
            logger.info("Running cutadapt on %s." % (curr_files))
            stage_runner = Cutadapt(config)
            curr_files = view.map(stage_runner, curr_files)
            logger.info("Output of cutadapt: %s." % (curr_files))
        if stage == "bowtie":
            logger.info("Running Bowtie on %s." % (curr_files))
            bowtie = Bowtie(config)
            bowtie_outputs = view.map(bowtie, curr_files)
            bamfiles = view.map(sam.sam2bam, bowtie_outputs)
            bamsort = view.map(sam.bamsort, bamfiles)
            view.map(sam.bamindex, bamsort)
            final_bamfiles = bamsort
            curr_files = bowtie_outputs
        if stage == "htseq-count":
            logger.info("Running htseq-count on %s." % (curr_files))
            htseq_args = zip(*product(curr_files, [config], [stage]))
            htseq_outputs = view.map(htseq_count.run_with_config, *htseq_args)
            htseq_count.combine_counts(htseq_outputs)
        if stage == "rnaseq_metrics":
            logger.info("Calculating RNASeq metrics on %s." % (curr_files))
            coverage = RNASeqMetrics(config)
            view.map(coverage, curr_files)
        if stage == "rseqc":
            logger.info("Running rseqc on %s." % (curr_files))
            #rseq_args = zip(*product(curr_files, [config]))
            rseq_args = zip(*product(final_bamfiles, [config]))
            view.map(rseqc.bam_stat, *rseq_args)
            view.map(rseqc.genebody_coverage, *rseq_args)
            view.map(rseqc.junction_annotation, *rseq_args)
            view.map(rseqc.junction_saturation, *rseq_args)
            RPKM_args = zip(*product(final_bamfiles, [config]))
            RPKM_count_out = view.map(rseqc.RPKM_count, *RPKM_args)
            RPKM_count_fixed = view.map(rseqc.fix_RPKM_count_file,
                                        RPKM_count_out)
            #annotate_args = zip(*product(RPKM_count_fixed, ["gene_id"],
            #                             ["ensembl_gene_id"], ["human"]))
            #view.map(annotate.annotate_table_with_biomart, *annotate_args)
            view.map(rseqc.RPKM_saturation, *rseq_args)
            # the original assigned tophat_outputs here, a leftover from the
            # tophat version of this pipeline; this one aligns with bowtie
            curr_files = bowtie_outputs
    # end gracefully
    stop_cluster()
def main(config_file):
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)
    setup_logging(config)
    start_cluster(config)
    # after the cluster is up, import the view to it
    from bipy.cluster import view
    input_files = config["input"]
    results_dir = config["dir"]["results"]
    # make the needed directories
    map(safe_makedir, config["dir"].values())
    curr_files = input_files

    ## qc steps
    for stage in config["run"]:
        if stage == "fastqc":
            # run the basic fastqc
            logger.info("Running %s on %s" % (stage, str(curr_files)))
            fastqc_config = config["stage"][stage]
            fastqc_outputs = view.map(fastqc.run, curr_files,
                                      [fastqc_config] * len(curr_files),
                                      [config] * len(curr_files))
            # this does nothing for now, not implemented yet
            summary_file = _combine_fastqc(fastqc_outputs)
        if stage == "trim":
            logger.info("Trimming poor quality ends "
                        "from %s" % (str(curr_files)))
            nlen = len(curr_files)
            min_length = str(config["stage"][stage].get("min_length", 20))
            # trim low quality ends of reads
            # do this dirty for now
            out_dir = os.path.join(results_dir, "trimmed")
            safe_makedir(out_dir)
            out_files = [append_stem(os.path.basename(x), "trim")
                         for x in curr_files]
            out_files = [os.path.join(out_dir, x) for x in out_files]
            # XXX remove the magic default for the minimum read length to keep
            out_files = view.map(sickle.run, curr_files,
                                 ["se"] * nlen, ["sanger"] * nlen,
                                 [min_length] * nlen, out_files)
            curr_files = out_files
        if stage == "tagdust":
            input_files = curr_files
            # remove tags matching the other miRNA tested
            logger.info("Running %s on %s." % (stage, input_files))
            tagdust_config = config["stage"][stage]
            tagdust_outputs = view.map(tagdust.run, input_files,
                                       [tagdust_config] * len(input_files),
                                       [config] * len(input_files))
            curr_files = [x[0] for x in tagdust_outputs]
        if stage == "filter_length":
            # filter out reads below or above a certain length
            filter_config = config["stage"][stage]
            min_length = filter_config.get("min_length", 0)
            max_length = filter_config.get("max_length", MAX_READ_LENGTH)

            # length predicate
            def length_filter(x):
                return min_length < len(x.seq) < max_length

            # filter the input reads based on length; parallelizing this
            # doesn't seem to work, since ipython can't accept closures as
            # an argument to view.map() (see the sketch after this function)
            #filtered_fastq = view.map(filter_seqio, tagdust_outputs,
            #                          [lf] * len(tagdust_outputs),
            #                          ["filt"] * len(tagdust_outputs),
            #                          ["fastq"] * len(tagdust_outputs))
            out_files = [append_stem(os.path.basename(input_file[0]), "filt")
                         for input_file in tagdust_outputs]
            out_dir = os.path.join(config["dir"]["results"],
                                   "length_filtered")
            safe_makedir(out_dir)
            out_files = [os.path.join(out_dir, x) for x in out_files]
            filtered_fastq = [filter_seqio(x[0], length_filter, y, "fastq")
                              for x, y in zip(tagdust_outputs, out_files)]
            curr_files = filtered_fastq
        if stage == "count_ends":
            logger.info("Compiling nucleotide counts at 3' and 5' ends.")

            # count the nucleotide at the end of each read
            def count_ends(x, y):
                """ keeps a running count of an arbitrary set of keys
                during the reduce step """
                x[y] = x.get(y, 0) + 1
                return x

            def get_3prime_end(x):
                return str(x.seq[-1])

            def get_5prime_end(x):
                return str(x.seq[0])

            def output_counts(end_function, count_file):
                # if the count_file already exists, skip
                outdir = os.path.join(config["dir"]["results"], stage)
                safe_makedir(outdir)
                count_file = os.path.join(outdir, count_file)
                if os.path.exists(count_file):
                    return count_file
                # output a tab file of the counts at the ends
                # of the fastq files
                counts = [reduce(count_ends,
                                 apply_seqio(x, end_function, kind="fastq"),
                                 {}) for x in curr_files]
                df = pd.DataFrame(counts,
                                  index=map(_short_name, curr_files))
                df = df.astype(float)
                total = df.sum(axis=1)
                df = df.div(total, axis=0)
                df["total"] = total
                df.to_csv(count_file, sep="\t")

            output_counts(get_3prime_end, "3prime_counts.tsv")
            output_counts(get_5prime_end, "5prime_counts.tsv")
        if stage == "tophat":
            tophat_config = config["stage"][stage]
            logger.info("Running tophat on %s" % (str(curr_files)))
            nlen = len(curr_files)
            pair_file = None
            ref_file = tophat_config["annotation"]
            out_base = os.path.join(results_dir, "mirna")
            align_dir = os.path.join(results_dir, "tophat")
            tophat_files = view.map(tophat.align, curr_files,
                                    [pair_file] * nlen, [ref_file] * nlen,
                                    [out_base] * nlen, [align_dir] * nlen,
                                    [config] * nlen)
            curr_files = tophat_files
        if stage == "novoalign":
            logger.info("Running novoalign on %s" % (str(curr_files)))
            # align
            ref = config["genome"]["file"]
            novoalign_config = config["stage"][stage]
            aligned_outputs = view.map(novoalign.run, curr_files,
                                       [ref] * len(curr_files),
                                       [novoalign_config] * len(curr_files),
                                       [config] * len(curr_files))
            # convert sam to bam, sort and index
            picard = BroadRunner(config["program"]["picard"], None, {})
            bamfiles = view.map(picardrun.picard_formatconverter,
                                [picard] * len(aligned_outputs),
                                aligned_outputs)
            sorted_bf = view.map(picardrun.picard_sort,
                                 [picard] * len(bamfiles), bamfiles)
            view.map(picardrun.picard_index, [picard] * len(sorted_bf),
                     sorted_bf)
            # these files are the new starting point for the downstream
            # analyses, so copy them over into the data dir and set
            # them to read only
            #data_dir = os.path.join(config["dir"]["data"], stage)
            #safe_makedir(data_dir)
            #view.map(shutil.copy, sorted_bf, [data_dir] * len(sorted_bf))
            #new_files = [os.path.join(data_dir, x) for x in
            #             map(os.path.basename, sorted_bf)]
            #[os.chmod(x, stat.S_IREAD | stat.S_IRGRP) for x in new_files]
            # index the bam files for later use
            #view.map(picardrun.picard_index, [picard] * len(new_files),
            #         new_files)
            curr_files = sorted_bf
        if stage == "new_coverage":
            logger.info("Calculating RNASeq metrics on %s." % (curr_files))
            nrun = len(curr_files)
            ref = blastn.prepare_ref_file(config["stage"][stage]["ref"],
                                          config)
            ribo = config["stage"][stage]["ribo"]
            picard = BroadRunner(config["program"]["picard"], None, {})
            out_dir = os.path.join(results_dir, "new_coverage")
            safe_makedir(out_dir)
            out_files = [replace_suffix(os.path.basename(x), "metrics")
                         for x in curr_files]
            out_files = [os.path.join(out_dir, x) for x in out_files]
            out_files = view.map(picardrun.picard_rnaseq_metrics,
                                 [picard] * nrun, curr_files,
                                 [ref] * nrun, [ribo] * nrun, out_files)
            curr_files = out_files
        if stage == "coverage":
            gtf = blastn.prepare_ref_file(config["annotation"], config)
            logger.info("Calculating coverage of features in %s for %s"
                        % (gtf, str(sorted_bf)))
            out_files = [replace_suffix(x, "counts.bed") for x in sorted_bf]
            out_dir = os.path.join(results_dir, stage)
            safe_makedir(out_dir)
            logger.info(out_files)
            out_files = [os.path.join(out_dir, os.path.basename(x))
                         for x in out_files]
            logger.info(out_files)
            view.map(bedtools.count_overlaps, sorted_bf,
                     [gtf] * len(sorted_bf), out_files)
        if stage == "htseq-count":
            nfiles = len(curr_files)
            htseq_config = _get_stage_config(config, stage)
            htseq_outputs = view.map(htseq_count.run_with_config,
                                     aligned_outputs, [config] * nfiles,
                                     [stage] * nfiles)
            column_names = _get_short_names(input_files)
            logger.info("Column names: %s" % (column_names))
            out_file = os.path.join(config["dir"]["results"], stage,
                                    "combined.counts")
            combined_out = htseq_count.combine_counts(htseq_outputs,
                                                      column_names, out_file)
        if stage == "bedtools_intersect":
            bedfiles = config["stage"]["bedtools_intersect"].get("bed", None)
            out_dir = os.path.join(results_dir, stage)
            safe_makedir(out_dir)
            for bedfile in bedfiles:
                bedbase, bedext = os.path.splitext(bedfile)
                out_files = [remove_suffix(x) for x in sorted_bf]
                out_files = [os.path.join(out_dir, os.path.basename(x))
                             for x in out_files]
                out_files = ["_vs_".join([x, os.path.basename(bedbase)])
                             for x in out_files]
                out_files = [".".join([x, "bam"]) for x in out_files]
                test_out = map(bedtools.intersectbam2bed, sorted_bf,
                               [bedfile] * len(sorted_bf),
                               [False] * len(sorted_bf), out_files)
                count_files = [replace_suffix(x, "stats") for x in out_files]
                map(write_ratios, sorted_bf, out_files, count_files)
        if stage == "piranha":
            piranha_runner = piranha.PiranhaStage(config)
            out_files = view.map(piranha_runner, curr_files)
    stop_cluster()
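# The filter_length stage above falls back to a serial loop because a nested
# closure over min_length/max_length can't be shipped to the engines with
# view.map(). A minimal sketch of one workaround, assuming filter_seqio keeps
# the (in_file, predicate, out_file, kind) signature used above and that the
# engines can serialize functools.partial objects (true on recent Pythons;
# older setups may need dill): make the predicate a module-level function and
# freeze its bounds with partial.
from functools import partial

def length_between(record, min_length, max_length):
    # module-level predicate: picklable, unlike a nested closure
    return min_length < len(record.seq) < max_length

#length_filter = partial(length_between, min_length=min_length,
#                        max_length=max_length)
#filtered_fastq = view.map(filter_seqio,
#                          [x[0] for x in tagdust_outputs],
#                          [length_filter] * len(tagdust_outputs),
#                          out_files,
#                          ["fastq"] * len(tagdust_outputs))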
def main(config_file):
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)
    # make the needed directories
    map(safe_makedir, config["dir"].values())
    # specific for thesis pipeline
    input_dirs = config["input_dirs"]
    results_dir = config["dir"].get("results", "results")
    input_files = _find_input_files(config)
    conditions = _group_input_by_condition(input_files)
    logger.info("Input_files: %s" % (input_files))
    logger.info("Condition groups %s" % (conditions))
    htseq_outdict = {}
    for condition, curr_files in conditions.items():
        condition_dir = os.path.join(results_dir, condition)
        safe_makedir(condition_dir)
        config["dir"]["results"] = condition_dir
        for stage in config["run"]:
            if stage == "fastqc":
                logger.info("Running fastqc on %s." % (curr_files))
                stage_runner = FastQC(config)
                view.map(stage_runner, curr_files)
            if stage == "cutadapt":
                logger.info("Running cutadapt on %s." % (curr_files))
                stage_runner = Cutadapt(config)
                curr_files = view.map(stage_runner, curr_files)
            if stage == "tophat":
                logger.info("Running tophat on %s." % (curr_files))
                stage_runner = Tophat(config)
                tophat_outputs = view.map(stage_runner, curr_files)
                bamfiles = view.map(sam.sam2bam, tophat_outputs)
                bamsort = view.map(sam.bamsort, bamfiles)
                view.map(sam.bamindex, bamsort)
                final_bamfiles = bamsort
                curr_files = tophat_outputs
            if stage == "htseq-count":
                _emit_stage_message(stage, curr_files)
                htseq_config = _get_stage_config(config, stage)
                htseq_args = zip(*product(curr_files, [config], [stage]))
                htseq_outputs = view.map(htseq_count.run_with_config,
                                         *htseq_args)
                htseq_outdict[condition] = htseq_outputs
            if stage == "coverage":
                logger.info("Calculating RNASeq metrics on %s." % (curr_files))
                nrun = len(curr_files)
                ref = prepare_ref_file(config["stage"][stage]["ref"], config)
                ribo = config["stage"][stage]["ribo"]
                picard = BroadRunner(config["program"]["picard"])
                out_dir = os.path.join(results_dir, stage)
                safe_makedir(out_dir)
                out_files = [replace_suffix(os.path.basename(x), "metrics")
                             for x in curr_files]
                out_files = [os.path.join(out_dir, x) for x in out_files]
                out_files = view.map(picardrun.picard_rnaseq_metrics,
                                     [picard] * nrun, curr_files,
                                     [ref] * nrun, [ribo] * nrun, out_files)
            if stage == "rseqc":
                _emit_stage_message(stage, curr_files)
                rseqc_config = _get_stage_config(config, stage)
                rseq_args = zip(*product(curr_files, [config]))
                view.map(rseqc.bam_stat, *rseq_args)
                view.map(rseqc.genebody_coverage, *rseq_args)
                view.map(rseqc.junction_annotation, *rseq_args)
                view.map(rseqc.junction_saturation, *rseq_args)
                RPKM_args = zip(*product(final_bamfiles, [config]))
                RPKM_count_out = view.map(rseqc.RPKM_count, *RPKM_args)
                RPKM_count_fixed = view.map(rseqc.fix_RPKM_count_file,
                                            RPKM_count_out)
                #annotate_args = zip(*product(RPKM_count_fixed, ["gene_id"],
                #                             ["ensembl_gene_id"], ["human"]))
                #view.map(annotate.annotate_table_with_biomart, *annotate_args)
                view.map(rseqc.RPKM_saturation, *rseq_args)
                curr_files = tophat_outputs
    # combine htseq-count files and run deseq on them
    conditions, htseq_files = dict_to_vectors(htseq_outdict)
    deseq_config = _get_stage_config(config, "deseq")
    cell_types = _group_input_by_cell_type(htseq_files)
    for cell_type, files in cell_types.items():
        for comparison in deseq_config["comparisons"]:
            comparison_name = "_vs_".join(comparison)
            deseq_dir = os.path.join(results_dir, "deseq", cell_type,
                                     comparison_name)
            safe_makedir(deseq_dir)
            out_file = os.path.join(deseq_dir, comparison_name + ".counts.txt")
            files_by_condition = _group_input_by_condition(files)
            _emit_stage_message("deseq", files_by_condition)
            c, f = dict_to_vectors(files_by_condition)
            combined_out = htseq_count.combine_counts(f, None, out_file)
            deseq_out = os.path.join(deseq_dir, comparison_name)
            logger.info("Running deseq on %s with conditions %s "
                        "and writing to %s" % (combined_out, conditions,
                                               deseq_out))
            deseq_out = view.map(deseq.run, [combined_out], [c], [deseq_out])
            annotate.annotate_table_with_biomart(deseq_out[0], "id",
                                                 "ensembl_gene_id", "human")
    # end gracefully
    stop_cluster()
def main(config_file):
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)
    setup_logging(config)
    from bipy.log import logger
    start_cluster(config)
    data_dir = config["dir"]["data"]
    from bipy.cluster import view
    input_files = [glob.glob(os.path.join(data_dir, x, "*_rep*"))
                   for x in config["input_dirs"]]
    input_files = list(flatten(input_files))
    logger.info("Input files to process: %s" % (input_files))
    results_dir = config["dir"]["results"]
    map(safe_makedir, config["dir"].values())
    curr_files = input_files
    for stage in config["run"]:
        if stage == "fastqc":
            nfiles = len(curr_files)
            logger.info("Running %s on %s" % (stage, str(curr_files)))
            fastqc_config = _get_stage_config(config, stage)
            fastqc_outputs = view.map(fastqc.run, curr_files,
                                      [fastqc_config] * nfiles,
                                      [config] * nfiles)
        if stage == "cutadapt":
            nfiles = len(curr_files)
            cutadapt_config = _get_stage_config(config, stage)
            cutadapt_outputs = view.map(cutadapt_tool.run, curr_files,
                                        [cutadapt_config] * nfiles,
                                        [config] * nfiles)
            curr_files = cutadapt_outputs
        if stage == "novoalign":
            nfiles = len(curr_files)
            novoalign_config = _get_stage_config(config, stage)
            #db = novoindex.run(config["ref"],
            #                   _get_stage_config(config, "novoindex"),
            #                   config)
            db = config["genome"]["file"]
            novoalign_outputs = view.map(novoalign.run, curr_files,
                                         [db] * nfiles,
                                         [novoalign_config] * nfiles,
                                         [config] * nfiles)
            picard = BroadRunner(config["program"]["picard"])
            # convert to bam
            args = zip(*itertools.product([picard], novoalign_outputs))
            bamfiles = view.map(picardrun.picard_formatconverter, *args)
            # sort bam
            args = zip(*itertools.product([picard], bamfiles))
            sorted_bf = view.map(picardrun.picard_sort, *args)
            # index bam
            args = zip(*itertools.product([picard], sorted_bf))
            view.map(picardrun.picard_index, *args)
            curr_files = novoalign_outputs
        if stage == "htseq-count":
            logger.info("Running htseq-count on %s" % (curr_files))
            htseq_outputs = curr_files
            column_names = _get_short_names(input_files)
            logger.info("Column names: %s" % (column_names))
            out_file = os.path.join(config["dir"]["results"], stage,
                                    "combined.counts")
            out_dir = os.path.join(results_dir, stage)
            safe_makedir(out_dir)
            combined_out = htseq_count.combine_counts(htseq_outputs,
                                                      column_names, out_file)
            rpkm = htseq_count.calculate_rpkm(combined_out,
                                              config["annotation"]["file"])
            rpkm_file = os.path.join(config["dir"]["results"], stage,
                                     "rpkm.txt")
            rpkm.to_csv(rpkm_file, sep="\t")
        if stage == "coverage":
            logger.info("Calculating RNASeq metrics on %s." % (curr_files))
            nrun = len(curr_files)
            ref = blastn.prepare_ref_file(config["stage"][stage]["ref"],
                                          config)
            ribo = config["stage"][stage]["ribo"]
            picard = BroadRunner(config["program"]["picard"])
            out_dir = os.path.join(results_dir, stage)
            safe_makedir(out_dir)
            out_files = [replace_suffix(os.path.basename(x), "metrics")
                         for x in curr_files]
            out_files = [os.path.join(out_dir, x) for x in out_files]
            out_files = view.map(picardrun.picard_rnaseq_metrics,
                                 [picard] * nrun, curr_files,
                                 [ref] * nrun, [ribo] * nrun, out_files)
        if stage == "deseq":
            conditions = [os.path.basename(x).split("_")[0]
                          for x in input_files]
            deseq_config = _get_stage_config(config, stage)
            out_dir = os.path.join(results_dir, stage)
            safe_makedir(out_dir)
            for comparison in deseq_config["comparisons"]:
                comparison_name = "_vs_".join(comparison)
                out_dir = os.path.join(results_dir, stage, comparison_name)
                safe_makedir(out_dir)
                # get the indexes of the conditions that match this comparison
                indexes = [x for x, y in enumerate(conditions)
                           if y in comparison]
                # find the htseq_files to combine and combine them
                htseq_files = [htseq_outputs[index] for index in indexes]
                htseq_columns = [column_names[index] for index in indexes]
                logger.info(htseq_files)
                logger.info(htseq_columns)
                out_file = os.path.join(out_dir,
                                        comparison_name + ".counts.txt")
                combined_out = htseq_count.combine_counts(htseq_files,
                                                          htseq_columns,
                                                          out_file)
                deseq_conds = [conditions[index] for index in indexes]
                deseq_prefix = os.path.join(out_dir, comparison_name)
                deseq_out = view.map(deseq.run, [combined_out],
                                     [deseq_conds], [deseq_prefix])
                logger.info("Annotating %s." % (deseq_out))
                annotated_file = view.map(annotate.annotate_table_with_biomart,
                                          deseq_out, ["id"],
                                          ["ensembl_gene_id"], ["human"])
        if stage == "dss":
            conditions = [os.path.basename(x).split("_")[0]
                          for x in input_files]
            dss_config = _get_stage_config(config, stage)
            out_dir = os.path.join(results_dir, stage)
            safe_makedir(out_dir)
            for comparison in dss_config["comparisons"]:
                comparison_name = "_vs_".join(comparison)
                out_dir = os.path.join(results_dir, stage, comparison_name)
                safe_makedir(out_dir)
                # get the indexes of the conditions that match this comparison
                indexes = [x for x, y in enumerate(conditions)
                           if y in comparison]
                # find the htseq_files to combine and combine them
                htseq_files = [htseq_outputs[index] for index in indexes]
                htseq_columns = [column_names[index] for index in indexes]
                out_file = os.path.join(out_dir,
                                        comparison_name + ".counts.txt")
                combined_out = htseq_count.combine_counts(htseq_files,
                                                          htseq_columns,
                                                          out_file)
                dss_conds = [conditions[index] for index in indexes]
                dss_prefix = os.path.join(out_dir, comparison_name)
                logger.info("Running DSS on %s with conditions %s and "
                            "comparison %s." % (combined_out, dss_conds,
                                                comparison))
                dss_out = dss.run(combined_out, dss_conds, comparison,
                                  dss_prefix)
    stop_cluster()
def _args_valid_for_local(args):
    return not args.scheduler and not args.queue


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='generic launcher')
    parser.add_argument('--profile', required=True,
                        help="IPython profile name to use")
    parser.add_argument('--cores', required=True,
                        help="Number of IPython engines to start.")
    parser.add_argument('--queue', help="Name of queue to use.")
    parser.add_argument('--scheduler', default="",
                        help="Name of scheduler to use (LSF or SGE)")
    args = parser.parse_args()
    cluster_config = {"cluster": {"profile": args.profile,
                                  "cores": int(args.cores),
                                  "queue": args.queue,
                                  "scheduler": args.scheduler}}
    setup_logging(cluster_config)
    if _args_valid_for_scheduler(args) or _args_valid_for_local(args):
        start_cluster(cluster_config)
        from bipy.cluster import view
        main()
        stop_cluster()
    else:
        parser.print_usage()
        sys.exit(1)
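# _args_valid_for_scheduler() is called above but not defined in this file.
# A plausible sketch, assuming it mirrors _args_valid_for_local(): a scheduler
# run needs both a recognized scheduler name and a queue. The exact validation
# rules are an assumption for illustration.
def _args_valid_for_scheduler(args):
    return args.scheduler.upper() in ("LSF", "SGE") and bool(args.queue)

# example invocations (hypothetical profile and queue names):
#   python launcher.py --profile mycluster --cores 8 --scheduler LSF --queue short
#   python launcher.py --profile local --cores 4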
def main(config_file):
    """This assumes that we are keeping the same order of the files
    throughout."""
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)
    # make the needed directories
    map(safe_makedir, config["dir"].values())
    input_dir = config["input_dir"]
    results_dir = config["dir"].get("results", "results")
    input_files = glob.glob(os.path.join(input_dir, "*.fq"))
    curr_files = _make_current_files(input_files)
    conditions = [os.path.basename(x).split("_")[0] for x in input_files]

    for stage in config["run"]:
        if stage == "fastqc":
            _emit_stage_message(stage, curr_files)
            fastqc_config = _get_stage_config(config, stage)
            fastqc_args = zip(*product(curr_files, [fastqc_config],
                                       [config]))
            fastqc_out = view.map(fastqc.run, *fastqc_args)
            logger.info("fastqc outfiles: %s" % (fastqc_out))

        if stage == "cutadapt":
            _emit_stage_message(stage, curr_files)
            cutadapt_config = _get_stage_config(config, stage)
            cutadapt_args = zip(*product(curr_files, [cutadapt_config],
                                         [config]))
            cutadapt_outputs = view.map(cutadapt_tool.run, *cutadapt_args)
            curr_files = _make_current_files(cutadapt_outputs)

        if stage == "tophat":
            _emit_stage_message(stage, curr_files)
            tophat_config = _get_stage_config(config, stage)
            tophat_args = zip(*product(curr_files, [None], [config["ref"]],
                                       ["tophat"], [config]))
            tophat_outputs = view.map(tophat.run_with_config, *tophat_args)
            # convert to bam, sort and index
            bamfiles = view.map(sam.sam2bam, tophat_outputs)
            sorted_bf = view.map(sam.bamsort, bamfiles)
            view.map(sam.bamindex, sorted_bf)
            curr_files = sorted_bf

        if stage == "rseqc":
            _emit_stage_message(stage, curr_files)
            rseqc_config = _get_stage_config(config, stage)
            rseq_args = zip(*product(curr_files, [config]))
            view.map(rseqc.bam2bigwig, *rseq_args, block=False)
            view.map(rseqc.bam_stat, *rseq_args, block=False)
            view.map(rseqc.clipping_profile, *rseq_args, block=False)
            view.map(rseqc.genebody_coverage, *rseq_args, block=False)
            view.map(rseqc.junction_annotation, *rseq_args, block=False)
            view.map(rseqc.junction_saturation, *rseq_args, block=False)
            view.map(rseqc.RPKM_count, *rseq_args, block=False)
            view.map(rseqc.RPKM_saturation, *rseq_args, block=False)
            curr_files = tophat_outputs

        if stage == "coverage":
            logger.info("Calculating RNASeq metrics on %s." % (curr_files))
            nrun = len(curr_files)
            ref = prepare_ref_file(config["stage"][stage]["ref"], config)
            ribo = config["stage"][stage]["ribo"]
            picard = BroadRunner(config["program"]["picard"])
            out_dir = os.path.join(results_dir, stage)
            safe_makedir(out_dir)
            out_files = [replace_suffix(os.path.basename(x), "metrics")
                         for x in curr_files]
            out_files = [os.path.join(out_dir, x) for x in out_files]
            out_files = view.map(picardrun.picard_rnaseq_metrics,
                                 [picard] * nrun, curr_files, [ref] * nrun,
                                 [ribo] * nrun, out_files)

        if stage == "htseq-count":
            _emit_stage_message(stage, curr_files)
            htseq_config = _get_stage_config(config, stage)
            htseq_args = zip(*product(curr_files, [config], [stage]))
            htseq_outputs = view.map(htseq_count.run_with_config,
                                     *htseq_args)
            combined_out = os.path.join(config["dir"]["results"], stage,
                                        "all_combined.counts")
            combined_out = htseq_count.combine_counts(htseq_outputs, None,
                                                      out_file=combined_out)

        if stage == "deseq":
            _emit_stage_message(stage, curr_files)
            deseq_config = _get_stage_config(config, stage)
            out_dir = os.path.join(config["dir"]["results"], stage)
            safe_makedir(out_dir)
            for comparison in deseq_config["comparisons"]:
                comparison_name = "_vs_".join(comparison)
                out_dir = os.path.join(results_dir, stage, comparison_name)
                safe_makedir(out_dir)
                indexes = [x for x, y in enumerate(conditions)
                           if y in comparison]
                htseq_files = [htseq_outputs[index] for index in indexes]
                htseq_columns = [conditions[index] for index in indexes]
                out_file = os.path.join(out_dir,
                                        comparison_name + ".counts.txt")
                combined_out = htseq_count.combine_counts(htseq_files,
                                                          htseq_columns,
                                                          out_file)
                deseq_conds = [conditions[index] for index in indexes]
                deseq_out = os.path.join(out_dir,
                                         comparison_name + ".deseq.txt")
                logger.info("Running deseq on %s with conditions %s "
                            "and writing to %s" % (combined_out, conditions,
                                                   deseq_out))
                view.map(deseq.run, [combined_out], [deseq_conds],
                         [deseq_out])
                annotated_file = view.map(
                    annotate.annotate_table_with_biomart, [deseq_out],
                    ["id"], ["ensembl_gene_id"], ["zebrafish"])

    # end gracefully
    stop_cluster()
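# A minimal sketch of the YAML config this main() expects. The keys are
# inferred from the lookups above; all values are illustrative, not from
# the original project:
#
#   input_dir: data
#   dir:
#     results: results
#   run: [fastqc, cutadapt, tophat, coverage, htseq-count, deseq]
#   program:
#     picard: /path/to/picard
#   ref: <reference bundle handed to tophat.run_with_config>
#   stage:
#     coverage:
#       ref: <refFlat annotation for picard_rnaseq_metrics>
#       ribo: <ribosomal intervals file>
#     deseq:
#       comparisons:
#         - [control, treated]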
def main(config_file):
    """This assumes that we are keeping the same order of the files
    throughout."""
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)
    # make the needed directories
    map(safe_makedir, config["dir"].values())
    input_dict = config["input"]
    curr_files = _make_current_files(input_dict.keys())
    input_meta = input_dict.values()

    for stage in config["run"]:
        if stage == "fastqc":
            _emit_stage_message(stage, curr_files)
            fastqc_config = _get_stage_config(config, stage)
            fastqc_args = zip(*product(curr_files, [fastqc_config],
                                       [config]))
            view.map(fastqc.run, *fastqc_args)

        if stage == "cutadapt":
            _emit_stage_message(stage, curr_files)
            cutadapt_config = _get_stage_config(config, stage)
            cutadapt_args = zip(*product(curr_files, [cutadapt_config],
                                         [config]))
            cutadapt_outputs = view.map(cutadapt_tool.run, *cutadapt_args)
            curr_files = _make_current_files(cutadapt_outputs)

        if stage == "tophat":
            _emit_stage_message(stage, curr_files)
            tophat_config = _get_stage_config(config, stage)
            tophat_args = zip(*product(curr_files, [None], [config["ref"]],
                                       ["tophat"], [config]))
            tophat_outputs = view.map(tophat.run_with_config, *tophat_args)
            # convert to bam, sort and index
            bamfiles = view.map(sam.sam2bam, tophat_outputs)
            bamsort = view.map(sam.bamsort, bamfiles)
            view.map(sam.bamindex, bamsort)
            final_bamfiles = bamsort
            curr_files = tophat_outputs

        if stage == "htseq-count":
            _emit_stage_message(stage, curr_files)
            htseq_config = _get_stage_config(config, stage)
            htseq_args = zip(*product(curr_files, [config], [stage]))
            htseq_outputs = view.map(htseq_count.run_with_config,
                                     *htseq_args)
            combined_out = os.path.join(config["dir"]["results"], stage,
                                        "all_combined.counts")
            combined_out = htseq_count.combine_counts(htseq_outputs, None,
                                                      out_file=combined_out)

        if stage == "rseqc":
            _emit_stage_message(stage, curr_files)
            rseqc_config = _get_stage_config(config, stage)
            rseq_args = zip(*product(curr_files, [config]))
            view.map(rseqc.bam_stat, *rseq_args)
            view.map(rseqc.genebody_coverage, *rseq_args)
            view.map(rseqc.junction_annotation, *rseq_args)
            view.map(rseqc.junction_saturation, *rseq_args)
            RPKM_args = zip(*product(final_bamfiles, [config]))
            RPKM_count_out = view.map(rseqc.RPKM_count, *RPKM_args)
            RPKM_count_fixed = view.map(rseqc.fix_RPKM_count_file,
                                        RPKM_count_out)
            annotate_args = zip(*product(RPKM_count_fixed,
                                         ["gene_id"],
                                         ["ensembl_transcript_id"],
                                         ["mouse"]))
            view.map(annotate.annotate_table_with_biomart, *annotate_args)
            view.map(rseqc.RPKM_saturation, *RPKM_args)

        if stage == "coverage":
            logger.info("Calculating RNASeq metrics on %s." % (curr_files))
            nrun = len(curr_files)
            ref = prepare_ref_file(config["stage"][stage]["ref"], config)
            ribo = config["stage"][stage]["ribo"]
            picard = BroadRunner(config["program"]["picard"])
            out_dir = os.path.join(config["dir"]["results"], stage)
            safe_makedir(out_dir)
            out_files = [replace_suffix(os.path.basename(x), "metrics")
                         for x in curr_files]
            out_files = [os.path.join(out_dir, x) for x in out_files]
            out_files = view.map(picardrun.picard_rnaseq_metrics,
                                 [picard] * nrun, curr_files, [ref] * nrun,
                                 [ribo] * nrun, out_files)

        if stage == "deseq":
            _emit_stage_message(stage, curr_files)
            deseq_config = _get_stage_config(config, stage)
            out_dir = os.path.join(config["dir"]["results"], stage)
            safe_makedir(out_dir)
            for test in deseq_config["tests"]:
                indexes = [_find_file_index_for_test(input_meta, condition)
                           for condition in test]
                files = [htseq_outputs[x] for x in indexes]
                conditions = [input_meta[x]["condition"] for x in indexes]
                combined_out = os.path.join(out_dir, "_".join(conditions) +
                                            "_combined.counts")
                logger.info("Combining %s to %s." % (files, combined_out))
                count_file = htseq_count.combine_counts(
                    files, None, out_file=combined_out)
                out_file = os.path.join(out_dir, "_".join(conditions) +
                                        "_deseq.txt")
                logger.info("Running deseq on %s with conditions %s "
                            "and writing to %s" % (count_file, conditions,
                                                   out_file))
                view.map(deseq.run, [count_file], [conditions], [out_file])
                #deseq.run(count_file, conditions, out_file=out_file)

    # end gracefully
    stop_cluster()
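# The zip(*product(...)) idiom used throughout these pipelines expands
# per-file argument combinations into the parallel positional lists that
# view.map expects. For example:
#   zip(*product(["a.sam", "b.sam"], [config]))
#   == [("a.sam", "b.sam"), (config, config)]
# i.e. the first tuple supplies the first positional argument for every
# call, the second tuple the second, and so on.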
def main(config_file):
    # load yaml config file
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)
    # setup logging
    setup_logging(config)
    from bipy.log import logger
    # start cluster
    start_cluster(config)
    from bipy.cluster import view

    found = sh.find(config["dir"]["data"], "-name", "Variations")
    var_dirs = [str(x).strip() for x in found]
    logger.info("Var_dirs: %s" % (var_dirs))
    in_dirs = map(os.path.dirname, var_dirs)
    logger.info("in_dirs: %s" % (in_dirs))
    # XXX for testing, only load a few directories
    #curr_files = in_dirs[0:5]
    curr_files = in_dirs

    # run the illumina fixer
    logger.info("Running illumina fixer on %s." % (curr_files))
    illf_class = STAGE_LOOKUP.get("illumina_fixer")
    illf = illf_class(config)
    curr_files = view.map(illf, curr_files)

    # sort the vcf files
    def sort_vcf(in_file):
        # imports live inside the function so it can run on remote engines
        from bipy.utils import append_stem
        from bcbio.distributed.transaction import file_transaction
        from bcbio.utils import file_exists
        import sh
        out_file = append_stem(in_file, "sorted")
        if file_exists(out_file):
            return out_file
        with file_transaction(out_file) as tmp_out_file:
            sh.vcf_sort(in_file, _out=tmp_out_file)
        return out_file

    # combine
    out_file = os.path.join(config["dir"].get("results", "results"),
                            "geminiloader", "all_combined.vcf")
    logger.info("Combining files %s into %s." % (curr_files, out_file))
    if file_exists(out_file):
        curr_files = [out_file]
    else:
        curr_files = [genotype.combine_variant_files(curr_files, out_file,
                                                     config["ref"]["fasta"],
                                                     config)]

    # break the VCF files up by chromosome for speed
    logger.info("Breaking up %s by chromosome." % (curr_files))
    breakvcf_class = STAGE_LOOKUP.get("breakvcf")
    breakvcf = breakvcf_class(config)
    curr_files = view.map(breakvcf, curr_files)

    # run VEP on the separate files in parallel
    logger.info("Running VEP on %s." % (curr_files))
    vep_class = STAGE_LOOKUP.get("vep")
    vep = vep_class(config)
    curr_files = view.map(vep, list(flatten(curr_files)))
    curr_files = filter(file_exists, curr_files)

    # sort the vcf files
    logger.info("Sorting %s." % (curr_files))
    curr_files = view.map(sort_vcf, curr_files)

    # don't run the rest of this in parallel, so take the cluster down
    stop_cluster()

    out_file = os.path.join(config["dir"].get("results", "results"),
                            "geminiloader", "all_combined.vep.vcf")
    logger.info("Combining files %s into %s." % (curr_files, out_file))
    if file_exists(out_file):
        curr_files = [out_file]
    else:
        curr_files = [genotype.combine_variant_files(curr_files, out_file,
                                                     config["ref"]["fasta"],
                                                     config)]

    # load the files into gemini; don't run this in parallel
    logger.info("Loading %s into gemini." % (curr_files))
    gemini_class = STAGE_LOOKUP.get("geminiloader")
    geminiloader = gemini_class(config)
    curr_files = map(geminiloader, curr_files)
    logger.info("Run complete.")
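# Design note on the loader above: bcbio's file_transaction writes each
# output to a temporary location and only moves it into place on success,
# so an interrupted engine never leaves a truncated VCF behind; combined
# with the file_exists() checks, each step is idempotent and safe to re-run.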
def main(config_file):
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)
    setup_logging(config)
    start_cluster(config)
    # after the cluster is up, import the view to it
    from bipy.cluster import view
    input_files = config["input"]
    results_dir = config["dir"]["results"]
    # make the needed directories
    map(safe_makedir, config["dir"].values())
    curr_files = input_files

    ## qc steps
    for stage in config["run"]:
        if stage == "fastqc":
            # run the basic fastqc
            logger.info("Running %s on %s" % (stage, str(curr_files)))
            fastqc_config = config["stage"][stage]
            fastqc_outputs = view.map(fastqc.run, curr_files,
                                      [fastqc_config] * len(curr_files),
                                      [config] * len(curr_files))
            # this does nothing for now, not implemented yet
            summary_file = _combine_fastqc(fastqc_outputs)

        if stage == "trim":
            logger.info("Trimming poor quality ends from %s"
                        % (str(curr_files)))
            nlen = len(curr_files)
            min_length = str(config["stage"][stage].get("min_length", 20))
            # trim low quality ends of reads
            # do this dirty for now
            out_dir = os.path.join(results_dir, "trimmed")
            safe_makedir(out_dir)
            out_files = [append_stem(os.path.basename(x), "trim")
                         for x in curr_files]
            out_files = [os.path.join(out_dir, x) for x in out_files]
            out_files = view.map(sickle.run, curr_files,
                                 ["se"] * nlen,
                                 ["sanger"] * nlen,
                                 [min_length] * nlen,
                                 out_files)
            curr_files = out_files

        if stage == "tagdust":
            input_files = curr_files
            # remove tags matching the other miRNA tested
            logger.info("Running %s on %s." % (stage, input_files))
            tagdust_config = config["stage"][stage]
            tagdust_outputs = view.map(tagdust.run, input_files,
                                       [tagdust_config] * len(input_files),
                                       [config] * len(input_files))
            curr_files = [x[0] for x in tagdust_outputs]

        if stage == "filter_length":
            # filter out reads below or above a certain length
            filter_config = config["stage"][stage]
            min_length = filter_config.get("min_length", 0)
            max_length = filter_config.get("max_length", MAX_READ_LENGTH)

            # length predicate
            def length_filter(x):
                return min_length < len(x.seq) < max_length

            # filter the input reads based on length
            # parallelizing this doesn't seem to work:
            # IPython can't accept closures as an argument to view.map()
            """
            filtered_fastq = view.map(filter_seqio, tagdust_outputs,
                                      [lf] * len(tagdust_outputs),
                                      ["filt"] * len(tagdust_outputs),
                                      ["fastq"] * len(tagdust_outputs))"""
            out_files = [append_stem(os.path.basename(input_file[0]), "filt")
                         for input_file in tagdust_outputs]
            out_dir = os.path.join(config["dir"]["results"],
                                   "length_filtered")
            safe_makedir(out_dir)
            out_files = [os.path.join(out_dir, x) for x in out_files]
            filtered_fastq = [filter_seqio(x[0], length_filter, y, "fastq")
                              for x, y in zip(tagdust_outputs, out_files)]
            curr_files = filtered_fastq

        if stage == "count_ends":
            logger.info("Compiling nucleotide counts at 3' and 5' ends.")

            # count the nucleotide at the end of each read
            def count_ends(x, y):
                """Keeps a running count of an arbitrary set of keys
                during the reduce step."""
                x[y] = x.get(y, 0) + 1
                return x

            def get_3prime_end(x):
                return str(x.seq[-1])

            def get_5prime_end(x):
                return str(x.seq[0])

            def output_counts(end_function, count_file):
                # if the count_file already exists, skip
                outdir = os.path.join(config["dir"]["results"], stage)
                safe_makedir(outdir)
                count_file = os.path.join(outdir, count_file)
                if os.path.exists(count_file):
                    return count_file
                # output a tab-delimited file of the counts at the ends
                # of the fastq files
                counts = [reduce(count_ends,
                                 apply_seqio(x, end_function, kind="fastq"),
                                 {}) for x in curr_files]
                df = pd.DataFrame(counts,
                                  index=map(_short_name, curr_files))
                df = df.astype(float)
                total = df.sum(axis=1)
                df = df.div(total, axis=0)
                df["total"] = total
                df.to_csv(count_file, sep="\t")

            output_counts(get_3prime_end, "3prime_counts.tsv")
            output_counts(get_5prime_end, "5prime_counts.tsv")

        if stage == "tophat":
            tophat_config = config["stage"][stage]
            logger.info("Running tophat on %s" % (str(curr_files)))
            nlen = len(curr_files)
            pair_file = None
            ref_file = tophat_config["annotation"]
            out_base = os.path.join(results_dir, "mirna")
            align_dir = os.path.join(results_dir, "tophat")
            tophat_files = view.map(tophat.align,
                                    curr_files,
                                    [pair_file] * nlen,
                                    [ref_file] * nlen,
                                    [out_base] * nlen,
                                    [align_dir] * nlen,
                                    [config] * nlen)
            curr_files = tophat_files

        if stage == "novoalign":
            logger.info("Running novoalign on %s" % (str(curr_files)))
            # align
            ref = config["genome"]["file"]
            novoalign_config = config["stage"][stage]
            aligned_outputs = view.map(novoalign.run, curr_files,
                                       [ref] * len(curr_files),
                                       [novoalign_config] * len(curr_files),
                                       [config] * len(curr_files))
            # convert sam to bam, sort and index
            picard = BroadRunner(config["program"]["picard"], None, {})
            bamfiles = view.map(picardrun.picard_formatconverter,
                                [picard] * len(aligned_outputs),
                                aligned_outputs)
            sorted_bf = view.map(picardrun.picard_sort,
                                 [picard] * len(bamfiles), bamfiles)
            view.map(picardrun.picard_index, [picard] * len(sorted_bf),
                     sorted_bf)
            # these files are the new starting point for the downstream
            # analyses, so copy them over into the data dir and set
            # them to read only
            #data_dir = os.path.join(config["dir"]["data"], stage)
            #safe_makedir(data_dir)
            #view.map(shutil.copy, sorted_bf, [data_dir] * len(sorted_bf))
            #new_files = [os.path.join(data_dir, x) for x in
            #             map(os.path.basename, sorted_bf)]
            #[os.chmod(x, stat.S_IREAD | stat.S_IRGRP) for x in new_files]
            # index the bam files for later use
            #view.map(picardrun.picard_index, [picard] * len(new_files),
            #         new_files)
            curr_files = sorted_bf

        if stage == "new_coverage":
            logger.info("Calculating RNASeq metrics on %s." % (curr_files))
            nrun = len(curr_files)
            ref = blastn.prepare_ref_file(config["stage"][stage]["ref"],
                                          config)
            ribo = config["stage"][stage]["ribo"]
            picard = BroadRunner(config["program"]["picard"], None, {})
            out_dir = os.path.join(results_dir, "new_coverage")
            safe_makedir(out_dir)
            out_files = [replace_suffix(os.path.basename(x), "metrics")
                         for x in curr_files]
            out_files = [os.path.join(out_dir, x) for x in out_files]
            out_files = view.map(picardrun.picard_rnaseq_metrics,
                                 [picard] * nrun, curr_files, [ref] * nrun,
                                 [ribo] * nrun, out_files)
            curr_files = out_files

        if stage == "coverage":
            gtf = blastn.prepare_ref_file(config["annotation"], config)
            logger.info("Calculating coverage of features in %s for %s"
                        % (gtf, str(sorted_bf)))
            out_files = [replace_suffix(x, "counts.bed") for x in sorted_bf]
            out_dir = os.path.join(results_dir, stage)
            safe_makedir(out_dir)
            logger.info(out_files)
            out_files = [os.path.join(out_dir, os.path.basename(x))
                         for x in out_files]
            logger.info(out_files)
            view.map(bedtools.count_overlaps, sorted_bf,
                     [gtf] * len(sorted_bf), out_files)

        if stage == "htseq-count":
            nfiles = len(curr_files)
            htseq_config = _get_stage_config(config, stage)
            htseq_outputs = view.map(htseq_count.run_with_config,
                                     aligned_outputs,
                                     [config] * nfiles,
                                     [stage] * nfiles)
            column_names = _get_short_names(input_files)
            logger.info("Column names: %s" % (column_names))
            out_file = os.path.join(config["dir"]["results"], stage,
                                    "combined.counts")
            combined_out = htseq_count.combine_counts(htseq_outputs,
                                                      column_names,
                                                      out_file)

        if stage == "bedtools_intersect":
            bedfiles = config["stage"]["bedtools_intersect"].get("bed", None)
            out_dir = os.path.join(results_dir, stage)
            safe_makedir(out_dir)
            for bedfile in bedfiles:
                bedbase, bedext = os.path.splitext(bedfile)
                out_files = [remove_suffix(x) for x in sorted_bf]
                out_files = [os.path.join(out_dir, os.path.basename(x))
                             for x in out_files]
                out_files = ["_vs_".join([x, os.path.basename(bedbase)])
                             for x in out_files]
                out_files = [".".join([x, "bam"]) for x in out_files]
                test_out = map(bedtools.intersectbam2bed, sorted_bf,
                               [bedfile] * len(sorted_bf),
                               [False] * len(sorted_bf), out_files)
                count_files = [replace_suffix(x, "stats")
                               for x in out_files]
                map(write_ratios, sorted_bf, out_files, count_files)

        if stage == "piranha":
            piranha_runner = piranha.PiranhaStage(config)
            out_files = view.map(piranha_runner, curr_files)

    stop_cluster()
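# For illustration (toy values, not pipeline data): count_ends is a reduce
# step that tallies keys, so
#   reduce(count_ends, ["A", "C", "A"], {})  ->  {"A": 2, "C": 1}
# output_counts then converts each file's tallies to fractions and keeps
# the raw read count in a "total" column.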
def main(config_file):
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)
    # make the needed directories
    map(safe_makedir, config["dir"].values())
    # specific for project
    input_dir = config["dir"]["data"]
    logger.info("Loading files from %s" % (input_dir))
    input_files = list(locate("*.fq", input_dir))
    input_files += list(locate("*.fastq", input_dir))
    logger.info("Input files: %s" % (input_files))

    results_dir = config["dir"]["results"]
    safe_makedir(results_dir)

    # make the stage repository
    repository = StageRepository(config)
    logger.info("Stages found: %s" % (repository.plugins))

    if config.get("test_pipeline", False):
        logger.info("Running a test pipeline on a subset of the reads.")
        results_dir = os.path.join(results_dir, "test_pipeline")
        config["dir"]["results"] = results_dir
        safe_makedir(results_dir)
        curr_files = map(make_test, input_files,
                         [config] * len(input_files))
        logger.info("Converted %s to %s. " % (input_files, curr_files))
    else:
        curr_files = input_files

    logger.info("Running RNASeq alignment pipeline on %s." % (curr_files))
    for stage in config["run"]:
        if stage == "fastqc":
            logger.info("Running fastqc on %s." % (curr_files))
            stage_runner = FastQC(config)
            view.map(stage_runner, curr_files)

        if stage == "cutadapt":
            curr_files = combine_pairs(curr_files)
            logger.info("Running cutadapt on %s." % (curr_files))
            stage_runner = Cutadapt(config)
            curr_files = view.map(stage_runner, curr_files)

        if stage == "tophat":
            logger.info("Running Tophat on %s." % (curr_files))
            #tophat = repository["tophat"](config)
            tophat = Tophat(config)
            tophat_outputs = view.map(tophat, curr_files)
            bamfiles = view.map(sam.sam2bam, tophat_outputs)
            bamsort = view.map(sam.bamsort, bamfiles)
            view.map(sam.bamindex, bamsort)
            final_bamfiles = bamsort
            curr_files = tophat_outputs

        if stage == "disambiguate":
            logger.info("Disambiguating %s." % (curr_files))
            disambiguate = repository[stage](config)
            view.map(disambiguate, curr_files)

        if stage == "htseq-count":
            logger.info("Running htseq-count on %s." % (bamfiles))
            name_sorted = view.map(sam.bam_name_sort, bamfiles)
            curr_files = view.map(sam.bam2sam, name_sorted)
            htseq_args = zip(*product(curr_files, [config], [stage]))
            htseq_outputs = view.map(htseq_count.run_with_config,
                                     *htseq_args)
            htseq_count.combine_counts(htseq_outputs)

        if stage == "rnaseq_metrics":
            logger.info("Calculating RNASeq metrics on %s." % (curr_files))
            #coverage = repository[stage](config)
            coverage = RNASeqMetrics(config)
            view.map(coverage, curr_files)

        if stage == "rseqc":
            logger.info("Running rseqc on %s." % (curr_files))
            #rseq_args = zip(*product(curr_files, [config]))
            rseq_args = zip(*product(final_bamfiles, [config]))
            view.map(rseqc.bam_stat, *rseq_args)
            view.map(rseqc.genebody_coverage, *rseq_args)
            view.map(rseqc.junction_annotation, *rseq_args)
            view.map(rseqc.junction_saturation, *rseq_args)
            RPKM_args = zip(*product(final_bamfiles, [config]))
            RPKM_count_out = view.map(rseqc.RPKM_count, *RPKM_args)
            RPKM_count_fixed = view.map(rseqc.fix_RPKM_count_file,
                                        RPKM_count_out)
            """
            annotate_args = zip(*product(RPKM_count_fixed,
                                         ["gene_id"],
                                         ["ensembl_gene_id"],
                                         ["human"]))
            view.map(annotate.annotate_table_with_biomart, *annotate_args)
            """
            view.map(rseqc.RPKM_saturation, *rseq_args)
            curr_files = tophat_outputs

    # end gracefully
    stop_cluster()
def main(config, view):
    # make the needed directories
    map(safe_makedir, config["dir"].values())
    # specific for project
    input_dir = config["input_dir"]
    logger.info("Loading files from %s" % (input_dir))
    input_files = list(locate("*.disambiguous*.sorted.bam", input_dir))
    logger.info("Input files: %s" % (input_files))

    results_dir = config["dir"]["results"]
    safe_makedir(results_dir)

    # make the stage repository
    repository = StageRepository(config)
    logger.info("Stages found: %s" % (repository.plugins))

    curr_files = input_files
    logger.info("Running quantitation on %s." % (curr_files))
    for stage in config["run"]:
        if stage == "htseq-count":
            logger.info("Running htseq-count on %s." % (input_files))
            name_sorted = view.map(sam.bam_name_sort, input_files)
            curr_files = view.map(sam.bam2sam, name_sorted)
            htseq_args = zip(*product(curr_files, [config], [stage]))
            htseq_outputs = view.map(htseq_count.run_with_config,
                                     *htseq_args)
            htseq_count.combine_counts(htseq_outputs)

        if stage == "rnaseq_metrics":
            logger.info("Calculating RNASeq metrics on %s." % (curr_files))
            #coverage = repository[stage](config)
            #curr_files = view.map(sam.bam2sam, curr_files)
            coverage = RNASeqMetrics(config)
            view.map(coverage, curr_files)

        if stage == "rseqc":
            logger.info("Running rseqc on %s." % (curr_files))
            rseq_args = zip(*product(curr_files, [config]))
            view.map(rseqc.bam_stat, *rseq_args)
            view.map(rseqc.genebody_coverage, *rseq_args)
            view.map(rseqc.junction_annotation, *rseq_args)
            view.map(sam.bamindex, curr_files)
            RPKM_count_out = view.map(rseqc.RPKM_count, *rseq_args)
            view.map(rseqc.fix_RPKM_count_file, RPKM_count_out)
            #view.map(rseqc.junction_saturation, *rseq_args)
            #RPKM_args = zip(*product(final_bamfiles, [config]))
            #RPKM_count_out = view.map(rseqc.RPKM_count, *RPKM_args)
            #RPKM_count_fixed = view.map(rseqc.fix_RPKM_count_file,
            #                            RPKM_count_out)
            """
            annotate_args = zip(*product(RPKM_count_fixed,
                                         ["gene_id"],
                                         ["ensembl_gene_id"],
                                         ["human"]))
            view.map(annotate.annotate_table_with_biomart, *annotate_args)
            """
            #view.map(rseqc.RPKM_saturation, *rseq_args)

    # end gracefully
    stop_cluster()