def main(config_file):
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)

    # make the needed directories
    map(safe_makedir, config["dir"].values())

    stage_dict = {"download_encode": _download_encode,
                  "fastqc": _run_fastqc}

    curr_files = config["encode_file"]

    for stage in config["run"]:
        if stage == "download_encode":
            curr_files = _download_encode(config["encode_file"], config)
        elif stage == "fastqc":
            _run_fastqc(curr_files, config)
        elif stage == "trim":
            _run_trim(curr_files, config)
        elif stage == "align":
            _run_tophat(curr_files, config)

    cell_types = _get_cell_types(config["encode_file"])
    logger.info("files: %s" % (curr_files))
    logger.info("types: %s" % (cell_types))

    # end gracefully
    stop_cluster()
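# stage_dict above is built but never consulted; presumably it was meant to
# drive the dispatch instead of the if/elif chain. A hypothetical table-driven
# sketch of that idea, assuming every stage function takes (files, config) and
# returns the files to hand to the next stage (not how the source dispatches):
def _run_stages(curr_files, stage_dict, config):
    for stage in config["run"]:
        stage_fn = stage_dict.get(stage)
        if stage_fn is not None:
            curr_files = stage_fn(curr_files, config)
    return curr_files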
def main(config_file):
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)
    setup_logging(config)
    from rkinf.log import logger
    start_cluster(config)

    from rkinf.cluster import view
    input_files = [os.path.join(config["dir"]["data"], x) for x in
                   config["input"]]
    results_dir = config["dir"]["results"]
    map(safe_makedir, config["dir"].values())

    curr_files = input_files

    for stage in config["run"]:
        if stage == "fastqc":
            nfiles = len(curr_files)
            logger.info("Running %s on %s" % (stage, str(curr_files)))
            fastqc_config = _get_stage_config(config, stage)
            fastqc_outputs = view.map(fastqc.run, curr_files,
                                      [fastqc_config] * nfiles,
                                      [config] * nfiles)

        if stage == "cutadapt":
            nfiles = len(curr_files)
            cutadapt_config = _get_stage_config(config, stage)
            cutadapt_outputs = view.map(cutadapt_tool.run, curr_files,
                                        [cutadapt_config] * nfiles,
                                        [config] * nfiles)
            curr_files = cutadapt_outputs

        if stage == "novoalign":
            nfiles = len(curr_files)
            novoalign_config = _get_stage_config(config, stage)
            #db = novoindex.run(config["ref"],
            #                   _get_stage_config(config, "novoindex"),
            #                   config)
            db = config["genome"]["file"]
            novoalign_outputs = view.map(novoalign.run, curr_files,
                                         [db] * nfiles,
                                         [novoalign_config] * nfiles,
                                         [config] * nfiles)
            curr_files = novoalign_outputs

        if stage == "htseq-count":
            nfiles = len(curr_files)
            htseq_config = _get_stage_config(config, stage)
            htseq_outputs = view.map(htseq_count.run_with_config,
                                     curr_files,
                                     [config] * nfiles,
                                     [stage] * nfiles)
            combined_out = htseq_count.combine_counts(htseq_outputs,
                                                      "combined.counts")

    stop_cluster()
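# A rough sketch of the YAML layout this main() expects, inferred from the key
# lookups above. The values are illustrative only, and the per-stage options
# are assumed to live under a "stage" key as they do in the other scripts here:
EXAMPLE_CONFIG = """
dir:
  data: data
  results: results
input:
  - sample1.fastq
  - sample2.fastq
genome:
  file: /path/to/genome.fa
run: [fastqc, cutadapt, novoalign, htseq-count]
stage:
  fastqc: {}
  cutadapt: {}
  novoalign: {}
  htseq-count: {}
"""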
def main(config_file):
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)
    setup_logging(config)
    start_cluster(config)
    # after the cluster is up, import a view to it
    from rkinf.cluster import view
    in_file = config.get("query")

    # de-parallelize for now
    blast_results = []
    for stage in config["run"]:
        if config["stage"][stage]["program"] == "blastn":
            blastn_config = config["stage"][stage]
            blast_results = [blastn.run(in_file, ref, blastn_config, config)
                             for ref in config["refs"]]

            for identity in config["min_identity"]:
                filtered_results = []
                for blast_result in blast_results:
                    filtered_results.append(blastn.filter_results_by_length(
                        blast_result, identity))

                fasta_hits = set()
                for filtered_result in filtered_results:
                    fasta_hits.update(blastn.get_id_of_hits(filtered_result))

                def in_set_predicate(x):
                    return x.id in fasta_hits

                outfile = os.path.join(build_results_dir(blastn_config, config),
                                       append_stem(os.path.basename(in_file),
                                                   str(identity) + "_filt"))
                fasta_filtered = fasta.filter_fasta(in_file, in_set_predicate,
                                                    outfile)
                trimmed = _trim(fasta_filtered, filtered_results)
                org_names = [x["name"] for x in config["refs"]]
                logger.info(trimmed)
                logger.info(filtered_results)
                logger.info(org_names)
                combined = _make_combined_csv(trimmed, filtered_results,
                                              org_names)

    stop_cluster()
def main(config_file):
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)
    setup_logging(config)
    from rkinf.log import logger
    start_cluster(config)
    from rkinf.cluster import view
    input_dir = config["dir"]["input_dir"]
    results_dir = config["dir"]["results"]
    input_files = glob.glob(os.path.join(input_dir, "*.bam"))

    """
    example running with macs:
    macs.run_with_config(input_file, config, control_file=None, stage=None)
    """

    curr_files = input_files
    for stage in config["run"]:
        # for now just run macs on all of these files without the control
        # file
        if stage == "macs":
            nfiles = len(curr_files)
            out_files = view.map(macs.run_with_config, curr_files,
                                 [config] * nfiles,
                                 [None] * nfiles,
                                 [stage] * nfiles)
            # just use the peak files going forward
            peak_files = [x[0] for x in out_files]
            curr_files = peak_files

        if stage == "intersect":
            """
            1) loop over the ids in the negative group
               for each one pick out the files that match it
               combine them into one file
               output it as the union
            2) loop over the ids in the positive and test group
               find intersections of the ones that match the same id:
               intersectBed -wao -bed -f fraction -r -a bed1 -b bed2
               might have to try a range of f
            """

    stop_cluster()
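# The "intersect" stage above is only sketched in the notes. A minimal,
# hypothetical helper for step 2 (pairwise reciprocal intersection with
# bedtools), assuming intersectBed is on the PATH; the function name and
# default fraction are made up, not part of the source:
import subprocess

def intersect_pair(bed_a, bed_b, out_file, fraction=0.5):
    # -wao writes both entries plus the amount of overlap;
    # -f/-r require a reciprocal overlap of at least `fraction`
    cmd = ["intersectBed", "-wao", "-f", str(fraction), "-r",
           "-a", bed_a, "-b", bed_b]
    with open(out_file, "w") as out_handle:
        subprocess.check_call(cmd, stdout=out_handle)
    return out_file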
def main(config_file):
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)
    setup_logging(config)
    start_cluster(config)
    # after the cluster is up, import a view to it
    from rkinf.cluster import view
    in_file = config.get("query")
    org_names = [x["name"] for x in config["refs"]]

    curr_files = in_file

    for stage in config["run"]:
        if stage == "blastn":
            logger.info("Running %s on %s." % (stage, curr_files))
            blastn_config = config["stage"][stage]
            refs = config["refs"]
            args = zip(*itertools.product([curr_files], refs,
                                          [blastn_config], [config]))
            blastn_results = view.map(blastn.run, *args)
            curr_files = blastn_results

        if stage == "annotate":
            logger.info("Running %s on %s." % (stage, curr_files))
            # annotate the data frames
            args = zip(*itertools.product(curr_files, ["sseqid"],
                                          org_names))
            annotated = view.map(_annotate_df, *args)
            curr_files = annotated

        if stage == "combine":
            out_fname = os.path.join(os.path.dirname(curr_files[0]),
                                     append_stem(in_file, "combined"))
            logger.info("Combining %s into %s." % (curr_files, out_fname))
            org_names = [x["name"] for x in config["refs"]]
            # combined = _make_combined_csv(curr_files, org_names, out_fname)

    stop_cluster()
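# The zip(*itertools.product(...)) idiom above expands a cross product into
# per-parameter argument lists for view.map: product() yields one tuple of
# arguments per (query, ref) combination and zip(*...) transposes those tuples
# into parallel lists. A standalone sketch with plain map() standing in for the
# IPython view; all names and values here are illustrative:
import itertools

def _demo_blastn(query, ref, stage_config, config):
    return (query, ref["name"])

_demo_refs = [{"name": "mouse"}, {"name": "human"}]
_demo_args = list(zip(*itertools.product(["query.fa"], _demo_refs,
                                         [{"evalue": 1e-5}], [{}])))
_demo_results = list(map(_demo_blastn, *_demo_args))
# _demo_results == [("query.fa", "mouse"), ("query.fa", "human")]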
def main(config_file):
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)
    setup_logging(config)
    start_cluster(config)
    # after the cluster is up, import the view to it
    from rkinf.cluster import view
    input_files = config["input"]
    results_dir = config["dir"]["results"]

    # make the needed directories
    map(safe_makedir, config["dir"].values())

    curr_files = input_files

    ## qc steps
    for stage in config["run"]:
        if stage == "fastqc":
            # run the basic fastqc
            logger.info("Running %s on %s" % (stage, str(curr_files)))
            fastqc_config = config["stage"][stage]
            fastqc_outputs = view.map(fastqc.run, curr_files,
                                      [fastqc_config] * len(curr_files),
                                      [config] * len(curr_files))
            # this does nothing for now, not implemented yet
            summary_file = _combine_fastqc(fastqc_outputs)

        if stage == "trim":
            logger.info("Trimming poor quality ends "
                        "from %s" % (str(curr_files)))
            nlen = len(curr_files)
            min_length = str(config["stage"][stage].get("min_length", 20))

            # trim low quality ends of reads
            # do this dirty for now
            out_dir = os.path.join(results_dir, "trimmed")
            safe_makedir(out_dir)
            out_files = [append_stem(os.path.basename(x), "trim") for
                         x in curr_files]
            out_files = [os.path.join(out_dir, x) for x in out_files]
            # XXX remove the magic number of 10 the length of the
            # minimum read to keep
            out_files = view.map(sickle.run, curr_files,
                                 ["se"] * nlen,
                                 ["sanger"] * nlen,
                                 [min_length] * nlen,
                                 out_files)
            curr_files = out_files

        if stage == "tagdust":
            input_files = curr_files
            # remove tags matching the other miRNA tested
            logger.info("Running %s on %s." % (stage, input_files))
            tagdust_config = config["stage"][stage]
            tagdust_outputs = view.map(tagdust.run, input_files,
                                       [tagdust_config] * len(input_files),
                                       [config] * len(input_files))
            curr_files = [x[0] for x in tagdust_outputs]

        if stage == "filter_length":
            # filter out reads below or above a certain length
            filter_config = config["stage"][stage]
            min_length = filter_config.get("min_length", 0)
            max_length = filter_config.get("max_length", MAX_READ_LENGTH)

            # length predicate
            def length_filter(x):
                return min_length < len(x.seq) < max_length

            # filter the input reads based on length
            # parallelizing this doesn't seem to work
            # ipython can't accept closures as an argument to view.map()
            """
            filtered_fastq = view.map(filter_seqio, tagdust_outputs,
                                      [lf] * len(tagdust_outputs),
                                      ["filt"] * len(tagdust_outputs),
                                      ["fastq"] * len(tagdust_outputs))
            """
            out_files = [append_stem(os.path.basename(input_file[0]), "filt")
                         for input_file in tagdust_outputs]
            out_dir = os.path.join(config["dir"]["results"],
                                   "length_filtered")
            safe_makedir(out_dir)
            out_files = [os.path.join(out_dir, x) for x in out_files]

            filtered_fastq = [filter_seqio(x[0], length_filter, y, "fastq")
                              for x, y in zip(tagdust_outputs, out_files)]

            curr_files = filtered_fastq

        if stage == "count_ends":
            logger.info("Compiling nucleotide counts at 3' and 5' ends.")

            # count the nucleotide at the end of each read
            def count_ends(x, y):
                """ keeps a running count of an arbitrary set of keys
                during the reduce step """
                x[y] = x.get(y, 0) + 1
                return x

            def get_3prime_end(x):
                return str(x.seq[-1])

            def get_5prime_end(x):
                return str(x.seq[0])

            def output_counts(end_function, count_file):
                # if the count_file already exists, skip
                outdir = os.path.join(config["dir"]["results"], stage)
                safe_makedir(outdir)
                count_file = os.path.join(outdir, count_file)
                if os.path.exists(count_file):
                    return count_file
                # outputs a tab file of the counts at the end
                # of the fastq files
                counts = [reduce(count_ends,
                                 apply_seqio(x, end_function, kind="fastq"),
                                 {}) for x in curr_files]
                df = pd.DataFrame(counts,
                                  index=map(_short_name, curr_files))
                df = df.astype(float)
                total = df.sum(axis=1)
                df = df.div(total, axis=0)
                df["total"] = total
                df.to_csv(count_file, sep="\t")

            output_counts(get_3prime_end, "3prime_counts.tsv")
            output_counts(get_5prime_end, "5prime_counts.tsv")

        if stage == "tophat":
            tophat_config = config["stage"][stage]
            logger.info("Running tophat on %s" % (str(curr_files)))
            nlen = len(curr_files)
            pair_file = None
            ref_file = tophat_config["annotation"]
            out_base = os.path.join(results_dir, "mirna")
            align_dir = os.path.join(results_dir, "tophat")
            tophat_files = view.map(tophat.align,
                                    curr_files,
                                    [pair_file] * nlen,
                                    [ref_file] * nlen,
                                    [out_base] * nlen,
                                    [align_dir] * nlen,
                                    [config] * nlen)
            curr_files = tophat_files

        if stage == "novoalign":
            logger.info("Running novoalign on %s" % (str(curr_files)))
            # align
            ref = config["genome"]["file"]
            novoalign_config = config["stage"][stage]
            aligned_outputs = view.map(novoalign.run, curr_files,
                                       [ref] * len(curr_files),
                                       [novoalign_config] * len(curr_files),
                                       [config] * len(curr_files))
            # convert sam to bam, sort and index
            picard = BroadRunner(config["program"]["picard"])
            bamfiles = view.map(picardrun.picard_formatconverter,
                                [picard] * len(aligned_outputs),
                                aligned_outputs)
            sorted_bf = view.map(picardrun.picard_sort,
                                 [picard] * len(bamfiles),
                                 bamfiles)
            # these files are the new starting point for the downstream
            # analyses, so copy them over into the data dir and set
            # them to read only
            data_dir = os.path.join(config["dir"]["data"], stage)
            safe_makedir(data_dir)
            view.map(shutil.copy, sorted_bf, [data_dir] * len(sorted_bf))
            new_files = [os.path.join(data_dir, x) for x in
                         map(os.path.basename, sorted_bf)]
            [os.chmod(x, stat.S_IREAD | stat.S_IRGRP) for x in new_files]
            # index the bam files for later use
            view.map(picardrun.picard_index, [picard] * len(new_files),
                     new_files)
            curr_files = new_files

        if stage == "new_coverage":
            logger.info("Calculating RNASeq metrics on %s." % (curr_files))
            nrun = len(curr_files)
            ref = blastn.prepare_ref_file(config["stage"][stage]["ref"],
                                          config)
            ribo = config["stage"][stage]["ribo"]
            picard = BroadRunner(config["program"]["picard"])
            out_dir = os.path.join(results_dir, "new_coverage")
            safe_makedir(out_dir)
            out_files = [replace_suffix(os.path.basename(x), "metrics")
                         for x in curr_files]
            out_files = [os.path.join(out_dir, x) for x in out_files]
            out_files = view.map(picardrun.picard_rnaseq_metrics,
                                 [picard] * nrun,
                                 curr_files,
                                 [ref] * nrun,
                                 [ribo] * nrun,
                                 out_files)
            curr_files = out_files

        if stage == "coverage":
            gtf = blastn.prepare_ref_file(config["annotation"], config)
            logger.info("Calculating coverage of features in %s for %s"
                        % (gtf, str(sorted_bf)))
            out_files = [replace_suffix(x, "counts.bed") for x in sorted_bf]
            out_dir = os.path.join(results_dir, stage)
            safe_makedir(out_dir)
            logger.info(out_files)
            out_files = [os.path.join(out_dir, os.path.basename(x))
                         for x in out_files]
            logger.info(out_files)
            view.map(bedtools.count_overlaps, sorted_bf,
                     [gtf] * len(sorted_bf),
                     out_files)

    stop_cluster()
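# None of the scripts above show how main() is invoked; a minimal entry point
# consistent with the main(config_file) signature would look like the sketch
# below (hypothetical, not taken from the source; each script would carry its
# own copy):
if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(
        description="Run the pipeline described by a YAML config file.")
    parser.add_argument("config_file", help="YAML configuration file")
    args = parser.parse_args()
    main(args.config_file)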