def main(config_file):
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)

    # make the needed directories
    map(safe_makedir, config["dir"].values())

    stage_dict = {"download_encode": _download_encode,
                  "fastqc": _run_fastqc}

    curr_files = config["encode_file"]

    for stage in config["run"]:
        if stage == "download_encode":
            curr_files = _download_encode(config["encode_file"], config)
        elif stage == "fastqc":
            _run_fastqc(curr_files, config)
        elif stage == "trim":
            _run_trim(curr_files, config)
        elif stage == "align":
            _run_tophat(curr_files, config)

    cell_types = _get_cell_types(config["encode_file"])
    logger.info("files: %s" % (curr_files))
    logger.info("types: %s" % (cell_types))

    # end gracefully
    stop_cluster()
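# All of the drivers below are steered by a YAML file loaded into `config`.
# The dict sketched here is a hypothetical illustration of the shape this
# particular main() expects, inferred from the keys it reads (config["dir"],
# config["run"], config["encode_file"], config["stage"]); the values are
# placeholders, not the project's real settings.
example_config = {
    "encode_file": "encode_files.tsv",  # hypothetical TSV listing ENCODE files
    "dir": {"data": "data", "results": "results"},
    "run": ["download_encode", "fastqc", "trim", "align"],
    "stage": {
        "fastqc": {},  # options handed to fastqc.run
        "trim": {"min_length": 20, "pair": "se", "platform": "sanger"},
    },
}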
def _run_fastqc(curr_files, config):
    logger.info("Running fastqc on %s" % (str(curr_files)))
    nfiles = len(curr_files)
    fastqc_config = config["stage"]["fastqc"]
    out_files = view.map(fastqc.run, curr_files,
                         [fastqc_config] * nfiles,
                         [config] * nfiles)
    return out_files
def main(config_file):
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)
    setup_logging(config)
    from rkinf.log import logger
    start_cluster(config)
    from rkinf.cluster import view

    input_files = [os.path.join(config["dir"]["data"], x) for x in
                   config["input"]]
    results_dir = config["dir"]["results"]
    map(safe_makedir, config["dir"].values())

    curr_files = input_files

    for stage in config["run"]:
        if stage == "fastqc":
            nfiles = len(curr_files)
            logger.info("Running %s on %s" % (stage, str(curr_files)))
            fastqc_config = _get_stage_config(config, stage)
            fastqc_outputs = view.map(fastqc.run, curr_files,
                                      [fastqc_config] * nfiles,
                                      [config] * nfiles)

        if stage == "cutadapt":
            nfiles = len(curr_files)
            cutadapt_config = _get_stage_config(config, stage)
            cutadapt_outputs = view.map(cutadapt_tool.run, curr_files,
                                        [cutadapt_config] * nfiles,
                                        [config] * nfiles)
            curr_files = cutadapt_outputs

        if stage == "novoalign":
            nfiles = len(curr_files)
            novoalign_config = _get_stage_config(config, stage)
            #db = novoindex.run(config["ref"],
            #                   _get_stage_config(config, "novoindex"),
            #                   config)
            db = config["genome"]["file"]
            novoalign_outputs = view.map(novoalign.run, curr_files,
                                         [db] * nfiles,
                                         [novoalign_config] * nfiles,
                                         [config] * nfiles)
            curr_files = novoalign_outputs

        if stage == "htseq-count":
            nfiles = len(curr_files)
            htseq_config = _get_stage_config(config, stage)
            htseq_outputs = view.map(htseq_count.run_with_config,
                                     curr_files,
                                     [config] * nfiles,
                                     [stage] * nfiles)
            combined_out = htseq_count.combine_counts(htseq_outputs,
                                                      "combined.counts")

    stop_cluster()
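# _get_stage_config() is imported from elsewhere in the project.  Based on
# the direct config["stage"][stage] lookups used by the other drivers in this
# collection, it presumably reduces to something close to the sketch below;
# this is an assumption, not the project's actual definition.
def _get_stage_config(config, stage):
    # look up the per-stage options block from the YAML config
    return config["stage"][stage]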
def _combine_and_write(dataframes, out_file):
    from rkinf.log import logger
    import pandas as pd
    from bcbio.utils import file_exists
    logger.info("Writing combined file to %s." % (out_file))
    if file_exists(out_file):
        return out_file
    merged = pd.concat(dataframes)
    df_subset = merged[TO_KEEP]
    df_subset.to_csv(out_file, index=False, sep="\t")
    return out_file
def _download_encode(input_file, config):
    """ grab the ENCODE files listed in the input file """
    NAME_FIELD = 0
    if not os.path.exists(input_file):
        logger.info("Error %s does not exist, aborting." % (input_file))
        exit(-1)

    with open(input_file) as in_handle:
        reader = csv.reader(in_handle, delimiter="\t")
        files = [x[NAME_FIELD] for x in reader]

    logger.info("Downloading %s." % (files))
    data_dir = config["dir"].get("data", "data")
    out_files = view.map(_download_ref, files, [data_dir] * len(files))
    return out_files
def _run_trim(curr_files, config):
    logger.info("Trimming poor quality ends from %s" % (str(curr_files)))
    nfiles = len(curr_files)
    min_length = str(config["stage"]["trim"].get("min_length", 20))
    pair = str(config["stage"]["trim"].get("pair", "se"))
    platform = str(config["stage"]["trim"].get("platform", "sanger"))
    out_dir = os.path.join(config["dir"]["results"], "trimmed")
    safe_makedir(out_dir)
    out_files = [append_stem(os.path.basename(x), "trim") for x in curr_files]
    out_files = [os.path.join(out_dir, x) for x in out_files]
    out_files = view.map(sickle.run, curr_files,
                         [pair] * nfiles,
                         [platform] * nfiles,
                         [min_length] * nfiles,
                         out_files)
    return out_files
def _annotate_df(in_file, join_column, organism, out_file=None):
    from rkinf.log import logger
    from rkinf.utils import append_stem
    from rpy2 import robjects

    ORG_TO_ENSEMBL = {"opossum": {"gene_ensembl": "mdomestica_gene_ensembl",
                                  "gene_symbol": "hgnc_symbol"},
                      "mouse": {"gene_ensembl": "mmusculus_gene_ensembl",
                                "gene_symbol": "mgi_symbol"},
                      "human": {"gene_ensembl": "hsapiens_gene_ensembl",
                                "gene_symbol": "hgnc_symbol"},
                      "taz": {"gene_ensembl": "sharrisii_gene_ensembl",
                              "gene_symbol": "hgnc_symbol"}}

    if organism not in ORG_TO_ENSEMBL:
        logger.error("organism not supported")
        exit(1)

    logger.info("Annotating %s." % (organism))

    if not out_file:
        out_file = append_stem(in_file, "annotated")

    if os.path.exists(out_file):
        return out_file

    # use biomaRt to annotate the data file
    r = robjects.r
    r.assign('join_column', join_column)
    r.assign('in_file', in_file)
    r.assign('out_file', out_file)
    r.assign('ensembl_gene', ORG_TO_ENSEMBL[organism]["gene_ensembl"])
    r.assign('gene_symbol', ORG_TO_ENSEMBL[organism]["gene_symbol"])
    r('''
    library(biomaRt)
    ensembl = useMart("ensembl", dataset = ensembl_gene)
    d = read.table(in_file, header=TRUE)
    a = getBM(attributes=c("ensembl_transcript_id", "ensembl_gene_id",
                           gene_symbol, "description"),
              filters=c("ensembl_transcript_id"),
              values=d[,join_column], mart=ensembl)
    m = merge(d, a, by.x=join_column, by.y="ensembl_transcript_id")
    write.table(m, out_file, quote=FALSE, row.names=FALSE, sep="\t")
    ''')
    return out_file
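# A hypothetical call to _annotate_df(): "mouse_blast_hits.tsv" is a
# placeholder for a tab-delimited table whose join column ("sseqid", the same
# column the BLAST driver below passes) holds Ensembl transcript ids.  The
# annotated table is written next to the input via append_stem().
annotated_file = _annotate_df("mouse_blast_hits.tsv", "sseqid", "mouse")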
def main(config_file):
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)
    setup_logging(config)
    start_cluster(config)
    # after the cluster is up, import a view to it
    from rkinf.cluster import view

    in_file = config.get("query")

    # de-parallelize for now
    blast_results = []
    for stage in config["run"]:
        if config["stage"][stage]["program"] == "blastn":
            blastn_config = config["stage"][stage]
            blast_results = [blastn.run(in_file, ref, blastn_config, config)
                             for ref in config["refs"]]

    for identity in config["min_identity"]:
        filtered_results = []
        for blast_result in blast_results:
            filtered_results.append(blastn.filter_results_by_length(
                blast_result, identity))

        fasta_hits = set()
        for filtered_result in filtered_results:
            fasta_hits.update(blastn.get_id_of_hits(filtered_result))

        def in_set_predicate(x):
            return x.id in fasta_hits

        outfile = os.path.join(build_results_dir(blastn_config, config),
                               append_stem(os.path.basename(in_file),
                                           str(identity) + "_filt"))
        fasta_filtered = fasta.filter_fasta(in_file, in_set_predicate,
                                            outfile)
        trimmed = _trim(fasta_filtered, filtered_results)
        org_names = [x["name"] for x in config["refs"]]
        logger.info(trimmed)
        logger.info(filtered_results)
        logger.info(org_names)
        combined = _make_combined_csv(trimmed, filtered_results, org_names)

    stop_cluster()
def main(config_file):
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)
    setup_logging(config)
    start_cluster(config)
    # after the cluster is up, import a view to it
    from rkinf.cluster import view

    in_file = config.get("query")
    org_names = [x["name"] for x in config["refs"]]
    curr_files = in_file

    for stage in config["run"]:
        if stage == "blastn":
            logger.info("Running %s on %s." % (stage, curr_files))
            blastn_config = config["stage"][stage]
            refs = config["refs"]
            args = zip(*itertools.product([curr_files], refs,
                                          [blastn_config], [config]))
            blastn_results = view.map(blastn.run, *args)
            curr_files = blastn_results

        if stage == "annotate":
            logger.info("Running %s on %s." % (stage, curr_files))
            # annotate the data frames
            args = zip(*itertools.product(curr_files, ["sseqid"], org_names))
            annotated = view.map(_annotate_df, *args)
            curr_files = annotated

        if stage == "combine":
            out_fname = os.path.join(os.path.dirname(curr_files[0]),
                                     append_stem(in_file, "combined"))
            logger.info("Combining %s into %s." % (curr_files, out_fname))
            org_names = [x["name"] for x in config["refs"]]
            # combined = _make_combined_csv(curr_files, org_names, out_fname)

    stop_cluster()
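# The zip(*itertools.product(...)) idiom used in the blastn and annotate
# stages above turns a cross product of arguments into the parallel argument
# lists that view.map() expects.  A standalone illustration with toy values
# (all the names here are placeholders):
import itertools

queries = ["query.fa"]
refs = [{"name": "mouse"}, {"name": "human"}]
args = zip(*itertools.product(queries, refs, ["blastn_config"], ["config"]))
# args now holds four sequences -- queries, refs, stage configs, global
# configs -- each of length len(queries) * len(refs), ready to be unpacked
# as view.map(blastn.run, *args).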
def main(config_file):
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)
    setup_logging(config)
    start_cluster(config)
    # after the cluster is up, import the view to it
    from rkinf.cluster import view

    input_files = config["input"]
    results_dir = config["dir"]["results"]

    # make the needed directories
    map(safe_makedir, config["dir"].values())

    curr_files = input_files

    ## qc steps
    for stage in config["run"]:
        if stage == "fastqc":
            # run the basic fastqc
            logger.info("Running %s on %s" % (stage, str(curr_files)))
            fastqc_config = config["stage"][stage]
            fastqc_outputs = view.map(fastqc.run, curr_files,
                                      [fastqc_config] * len(curr_files),
                                      [config] * len(curr_files))
            # this does nothing for now, not implemented yet
            summary_file = _combine_fastqc(fastqc_outputs)

        if stage == "trim":
            logger.info("Trimming poor quality ends "
                        "from %s" % (str(curr_files)))
            nlen = len(curr_files)
            min_length = str(config["stage"][stage].get("min_length", 20))
            # trim low quality ends of reads
            # do this dirty for now
            out_dir = os.path.join(results_dir, "trimmed")
            safe_makedir(out_dir)
            out_files = [append_stem(os.path.basename(x), "trim") for x in
                         curr_files]
            out_files = [os.path.join(out_dir, x) for x in out_files]
            # XXX remove the magic number of 10 the length of the
            # minimum read to keep
            out_files = view.map(sickle.run, curr_files,
                                 ["se"] * nlen,
                                 ["sanger"] * nlen,
                                 [min_length] * nlen,
                                 out_files)
            curr_files = out_files

        if stage == "tagdust":
            input_files = curr_files
            # remove tags matching the other miRNA tested
            logger.info("Running %s on %s." % (stage, input_files))
            tagdust_config = config["stage"][stage]
            tagdust_outputs = view.map(tagdust.run, input_files,
                                       [tagdust_config] * len(input_files),
                                       [config] * len(input_files))
            curr_files = [x[0] for x in tagdust_outputs]

        if stage == "filter_length":
            # filter out reads below or above a certain length
            filter_config = config["stage"][stage]
            min_length = filter_config.get("min_length", 0)
            max_length = filter_config.get("max_length", MAX_READ_LENGTH)

            # length predicate
            def length_filter(x):
                return min_length < len(x.seq) < max_length

            # filter the input reads based on length
            # parallelizing this doesn't seem to work
            # ipython can't accept closures as an argument to view.map()
            """
            filtered_fastq = view.map(filter_seqio, tagdust_outputs,
                                      [lf] * len(tagdust_outputs),
                                      ["filt"] * len(tagdust_outputs),
                                      ["fastq"] * len(tagdust_outputs))"""
            out_files = [append_stem(os.path.basename(input_file[0]), "filt")
                         for input_file in tagdust_outputs]
            out_dir = os.path.join(config["dir"]["results"],
                                   "length_filtered")
            safe_makedir(out_dir)
            out_files = [os.path.join(out_dir, x) for x in out_files]

            filtered_fastq = [filter_seqio(x[0], length_filter, y, "fastq")
                              for x, y in zip(tagdust_outputs, out_files)]
            curr_files = filtered_fastq

        if stage == "count_ends":
            logger.info("Compiling nucleotide counts at 3' and 5' ends.")

            # count the nucleotide at the end of each read
            def count_ends(x, y):
                """ keeps a running count of an arbitrary set of keys
                during the reduce step """
                x[y] = x.get(y, 0) + 1
                return x

            def get_3prime_end(x):
                return str(x.seq[-1])

            def get_5prime_end(x):
                return str(x.seq[0])

            def output_counts(end_function, count_file):
                # if the count_file already exists, skip
                outdir = os.path.join(config["dir"]["results"], stage)
                safe_makedir(outdir)
                count_file = os.path.join(outdir, count_file)
                if os.path.exists(count_file):
                    return count_file
                # output a tab file of the counts at the end
                # of the fastq files
                counts = [reduce(count_ends,
                                 apply_seqio(x, end_function, kind="fastq"),
                                 {}) for x in curr_files]
                df = pd.DataFrame(counts,
                                  index=map(_short_name, curr_files))
                df = df.astype(float)
                total = df.sum(axis=1)
                df = df.div(total, axis=0)
                df["total"] = total
                df.to_csv(count_file, sep="\t")

            output_counts(get_3prime_end, "3prime_counts.tsv")
            output_counts(get_5prime_end, "5prime_counts.tsv")

        if stage == "tophat":
            tophat_config = config["stage"][stage]
            logger.info("Running tophat on %s" % (str(curr_files)))
            nlen = len(curr_files)
            pair_file = None
            ref_file = tophat_config["annotation"]
            out_base = os.path.join(results_dir, "mirna")
            align_dir = os.path.join(results_dir, "tophat")
            config = config
            tophat_files = view.map(tophat.align,
                                    curr_files,
                                    [pair_file] * nlen,
                                    [ref_file] * nlen,
                                    [out_base] * nlen,
                                    [align_dir] * nlen,
                                    [config] * nlen)
            curr_files = tophat_files

        if stage == "novoalign":
            logger.info("Running novoalign on %s" % (str(curr_files)))
            # align
            ref = config["genome"]["file"]
            novoalign_config = config["stage"][stage]
            aligned_outputs = view.map(novoalign.run, curr_files,
                                       [ref] * len(curr_files),
                                       [novoalign_config] * len(curr_files),
                                       [config] * len(curr_files))
            # convert sam to bam, sort and index
            picard = BroadRunner(config["program"]["picard"])
            bamfiles = view.map(picardrun.picard_formatconverter,
                                [picard] * len(aligned_outputs),
                                aligned_outputs)
            sorted_bf = view.map(picardrun.picard_sort,
                                 [picard] * len(bamfiles),
                                 bamfiles)
            # these files are the new starting point for the downstream
            # analyses, so copy them over into the data dir and set
            # them to read only
            data_dir = os.path.join(config["dir"]["data"], stage)
            safe_makedir(data_dir)
            view.map(shutil.copy, sorted_bf, [data_dir] * len(sorted_bf))
            new_files = [os.path.join(data_dir, x) for x in
                         map(os.path.basename, sorted_bf)]
            [os.chmod(x, stat.S_IREAD | stat.S_IRGRP) for x in new_files]
            # index the bam files for later use
            view.map(picardrun.picard_index, [picard] * len(new_files),
                     new_files)
            curr_files = new_files

        if stage == "new_coverage":
            logger.info("Calculating RNASeq metrics on %s." % (curr_files))
            nrun = len(curr_files)
            ref = blastn.prepare_ref_file(config["stage"][stage]["ref"],
                                          config)
            ribo = config["stage"][stage]["ribo"]
            picard = BroadRunner(config["program"]["picard"])
            out_dir = os.path.join(results_dir, "new_coverage")
            safe_makedir(out_dir)
            out_files = [replace_suffix(os.path.basename(x), "metrics") for x
                         in curr_files]
            out_files = [os.path.join(out_dir, x) for x in out_files]
            out_files = view.map(picardrun.picard_rnaseq_metrics,
                                 [picard] * nrun,
                                 curr_files,
                                 [ref] * nrun,
                                 [ribo] * nrun,
                                 out_files)
            curr_files = out_files

        if stage == "coverage":
            gtf = blastn.prepare_ref_file(config["annotation"], config)
            logger.info("Calculating coverage of features in %s for %s" %
                        (gtf, str(sorted_bf)))
            out_files = [replace_suffix(x, "counts.bed") for x in sorted_bf]
            out_dir = os.path.join(results_dir, stage)
            safe_makedir(out_dir)
            logger.info(out_files)
            out_files = [os.path.join(out_dir, os.path.basename(x)) for x in
                         out_files]
            logger.info(out_files)
            view.map(bedtools.count_overlaps, sorted_bf,
                     [gtf] * len(sorted_bf),
                     out_files)

    stop_cluster()