def run_cluster(*data):
    """Run seqcluster cluster to detect smallRNA clusters."""
    sample = data[0][0]
    tools = dd.get_expression_caller(data[0][0])
    work_dir = dd.get_work_dir(sample)
    out_dir = op.abspath(safe_makedir(op.join(work_dir, "seqcluster", "cluster")))
    prepare_dir = op.join(work_dir, "seqcluster", "prepare")
    bam_file = data[0][0]["cluster_bam"]
    if "seqcluster" in tools:
        # prefer a user-supplied transcriptome GTF, otherwise the smallRNA one
        gtf_file = dd.get_transcriptome_gtf(sample) or dd.get_srna_gtf_file(sample)
        sample["seqcluster"] = _cluster(bam_file, data[0][0]["seqcluster_prepare_ma"],
                                        out_dir, dd.get_ref_file(sample), gtf_file)
        sample["report"] = _report(sample, dd.get_ref_file(sample))
    if "mirge" in tools:
        sample["mirge"] = mirge.run(data)
    out_mirna = _make_isomir_counts(data, out_dir=op.join(work_dir, "mirbase"))
    if out_mirna:
        sample = dd.set_mirna_counts(sample, out_mirna[0])
        sample = dd.set_isomir_counts(sample, out_mirna[1])
    out_novel = _make_isomir_counts(data, "seqbuster_novel",
                                    op.join(work_dir, "mirdeep2"), "_novel")
    if out_novel:
        sample = dd.set_novel_mirna_counts(sample, out_novel[0])
        sample = dd.set_novel_isomir_counts(sample, out_novel[1])
    data[0][0] = sample
    data = spikein.combine_spikein(data)
    return data
def _detect_rRNA(data, out_dir):
    """Compute rRNA read count and rate from salmon quantification.

    Caches the result in out_dir/rRNA_metrics.txt; returns NA values when
    no rRNA transcripts or quantification file are available.
    """
    out_file = os.path.join(out_dir, "rRNA_metrics.txt")
    if not utils.file_exists(out_file):
        gtf_file = dd.get_transcriptome_gtf(data, default=dd.get_gtf_file(data))
        quant = tz.get_in(["quant", "tsv"], data)
        if not quant:
            # fall back to the salmon output location
            salmon_dir = dd.get_salmon_dir(data)
            if salmon_dir:
                quant = os.path.join(salmon_dir, "quant.sf")
        logger.info("Calculating RNA-seq rRNA metrics for %s." % quant)
        rrna_features = gtf.get_rRNA(gtf_file)
        transcripts = {x[1] for x in rrna_features if x}
        if not (transcripts and quant and utils.file_exists(quant)):
            return {'rRNA': "NA", "rRNA_rate": "NA"}
        sample_table = pd.read_csv(quant, sep="\t")
        is_rrna = sample_table["Name"].isin(transcripts)
        rrna = sum(float(x) for x in sample_table[is_rrna]["NumReads"])
        total = sum(float(x) for x in sample_table["NumReads"])
        rrna_rate = "NA" if total == 0 else float(rrna) / total
        with file_transaction(out_file) as tx_out_file:
            with open(tx_out_file, "w") as out_handle:
                out_handle.write(",".join(["rRNA", str(rrna)]) + "\n")
                out_handle.write(",".join(["rRNA_rate", str(rrna_rate)]) + "\n")
    return _read_memoized_rrna(out_file)
def run_rnaseq(bam_file, data, out_dir):
    """Run qualimap for a rnaseq bam file and parse results."""
    strandedness = {"firststrand": "strand-specific-forward",
                    "secondstrand": "strand-specific-reverse",
                    "unstranded": "non-strand-specific",
                    "auto": "non-strand-specific"}
    # Qualimap results should be saved to a directory named after sample.
    # MultiQC (for parsing additional data) picks the sample name after the dir as follows:
    #   <sample name>/raw_data_qualimapReport/insert_size_histogram.txt
    results_dir = os.path.join(out_dir, dd.get_sample_name(data))
    results_file = os.path.join(results_dir, "rnaseq_qc_results.txt")
    report_file = os.path.join(results_dir, "qualimapReport.html")
    config = data["config"]
    gtf_file = dd.get_transcriptome_gtf(data, default=dd.get_gtf_file(data))
    library = strandedness[dd.get_strandedness(data)]
    # don't run qualimap on the full bam by default
    if "qualimap_full" in tz.get_in(("config", "algorithm", "tools_on"), data, []):
        logger.info(f"Full qualimap analysis for {bam_file} may be slow.")
        ds_bam = bam_file
    else:
        logger.info(f"Downsampling {bam_file} for Qualimap run.")
        ds_bam = bam.downsample(bam_file, data, 1e7, work_dir=out_dir)
    bam_file = ds_bam if ds_bam else bam_file
    if not utils.file_exists(results_file):
        with file_transaction(data, results_dir) as tx_results_dir:
            utils.safe_makedir(tx_results_dir)
            bam.index(bam_file, config)
            cmd = _rnaseq_qualimap_cmd(data, bam_file, tx_results_dir, gtf_file, library)
            do.run(cmd, "Qualimap for {}".format(dd.get_sample_name(data)))
            # rewrite the recorded bam path so reports show the sample name
            tx_results_file = os.path.join(tx_results_dir, "rnaseq_qc_results.txt")
            cmd = "sed -i 's/bam file = .*/bam file = %s.bam/' %s" % (
                dd.get_sample_name(data), tx_results_file)
            do.run(cmd, "Fix Name Qualimap for {}".format(dd.get_sample_name(data)))
    metrics = _parse_rnaseq_qualimap_metrics(report_file)
    metrics.update(_detect_duplicates(bam_file, results_dir, data))
    metrics.update(_detect_rRNA(data, results_dir))
    metrics.update({"Average_insert_size": salmon.estimate_fragment_size(data)})
    metrics = _parse_metrics(metrics)
    # Qualimap output folder (results_dir) needs to be named after the sample (see comments above).
    # However, in order to keep its name after upload, we need to put the base QC file
    # (results_file) into the root directory (out_dir):
    base_results_file = os.path.join(out_dir, os.path.basename(results_file))
    shutil.copyfile(results_file, base_results_file)
    return {"base": base_results_file,
            "secondary": _find_qualimap_secondary_files(results_dir, base_results_file),
            "metrics": metrics}
def tagcount(data):
    """Count UMI tags per transcript with `umis tagcount`, writing a sparse matrix.

    Returns the data structure with the count file set.
    """
    bam = dd.get_transcriptome_bam(data)
    umi_dir = os.path.join(dd.get_work_dir(data), "umis")
    sample_dir = os.path.join(umi_dir, dd.get_sample_name(data))
    out_file = os.path.join(sample_dir, dd.get_sample_name(data) + ".mtx")
    if file_exists(out_file):
        data = dd.set_count_file(data, out_file)
        return [[data]]
    umis = config_utils.get_program("umis", data, default="umis")
    safe_makedir(sample_dir)
    cutoff = dd.get_minimum_barcode_depth(data)
    cb_histogram = os.path.join(sample_dir, "cb-histogram.txt")
    positional = "--positional" if dd.get_positional_umi(data, False) else ""
    gtf_file = dd.get_transcriptome_gtf(data, None)
    if gtf_file:
        # use only the GTF basename: os.path.join discards the earlier
        # components when handed an absolute path, so joining the full
        # splitext() path would write the tx2gene file next to the GTF
        # instead of under the work directory
        gene_map_file = os.path.join(
            dd.get_work_dir(data), "annotation",
            os.path.basename(os.path.splitext(gtf_file)[0]) + "-tx2gene.tsv")
        gene_map_file = gtf.tx2genefile(gtf_file, gene_map_file, tsv=True)
        gene_map_flag = " --genemap {0} ".format(gene_map_file)
    else:
        gene_map_flag = ""
    message = "Counting alignments of transcripts in %s." % bam
    cmd = ("{umis} tagcount {positional} --cb_cutoff {cutoff} --sparse "
           "{gene_map_flag}"
           "--cb_histogram {cb_histogram} {bam} {tx_out_file}")
    out_files = [out_file, out_file + ".rownames", out_file + ".colnames"]
    with file_transaction(out_files) as tx_out_files:
        tx_out_file = tx_out_files[0]
        do.run(cmd.format(**locals()), message)
    data = dd.set_count_file(data, out_file)
    return [[data]]
def tagcount(data):
    """Count UMI tags per transcript with `umis fasttagcount`.

    Produces a sparse count matrix (and optionally a duplicate/UMI matrix)
    and records the count file on the data structure.
    """
    bam = dd.get_transcriptome_bam(data)
    umi_dir = os.path.join(dd.get_work_dir(data), "umis")
    sample_dir = os.path.join(umi_dir, dd.get_sample_name(data))
    out_prefix = os.path.join(sample_dir, dd.get_sample_name(data))
    out_file = out_prefix + ".mtx"
    if file_exists(out_file):
        data = dd.set_count_file(data, out_file)
        return [[data]]
    umis = config_utils.get_program("umis", data, default="umis")
    safe_makedir(sample_dir)
    cutoff = dd.get_minimum_barcode_depth(data)
    cb_histogram = os.path.join(sample_dir, "cb-histogram.txt")
    positional = "--positional" if dd.get_positional_umi(data, False) else ""
    # fall back to the installed GTF when no custom transcriptome was given
    if use_installed_transcriptome(data):
        gtf_file = dd.get_gtf_file(data)
    else:
        gtf_file = dd.get_transcriptome_gtf(data, None)
    if gtf_file:
        gene_map_file = os.path.join(
            dd.get_work_dir(data), "annotation",
            os.path.basename(os.path.splitext(gtf_file)[0]) + "-tx2gene.tsv")
        gene_map_file = gtf.tx2genefile(gtf_file, gene_map_file, tsv=True)
        gene_map_flag = " --genemap {0} ".format(gene_map_file)
    else:
        gene_map_flag = ""
    message = "Counting alignments of transcripts in %s." % bam
    cmd = ("{umis} fasttagcount --cb_cutoff {cutoff} "
           "{gene_map_flag} "
           "{positional} "
           "--cb_histogram {cb_histogram}")
    out_files = [out_file, out_file + ".rownames", out_file + ".colnames"]
    umi_matrix_file = out_prefix + "-dupes.mtx"
    out_files += [umi_matrix_file, umi_matrix_file + ".rownames",
                  umi_matrix_file + ".colnames"]
    if has_umi_matrix(data):
        cmd += " --umi_matrix {tx_umi_matrix_full} "
    cmd += " {bam} {tx_out_file_full}"
    with file_transaction(out_files) as tx_out_files:
        tx_out_file = tx_out_files[0]
        tx_out_file_full = tx_out_file + ".full"
        tx_umi_matrix = tx_out_files[3]
        tx_umi_matrix_full = tx_out_files[3] + ".full"
        do.run(cmd.format(**locals()), message)
        # convert the dense outputs to sparse matrices
        cmd = ("{umis} sparse {tx_out_file_full} {tx_out_file}")
        message = "Converting %s to sparse format." % tx_out_file_full
        do.run(cmd.format(**locals()), message)
        if has_umi_matrix(data):
            cmd = ("{umis} sparse {tx_umi_matrix_full} {tx_umi_matrix}")
            message = "Converting %s to sparse format." % tx_umi_matrix_full
            do.run(cmd.format(**locals()), message)
    data = dd.set_count_file(data, out_file)
    return [[data]]
def use_installed_transcriptome(data):
    """Return True when no user-supplied transcriptome FASTA or GTF is set.

    In that case the installed reference transcriptome should be used.
    """
    user_fa = dd.get_transcriptome_fasta(data)
    user_gtf = dd.get_transcriptome_gtf(data)
    # idiomatic boolean return instead of if/else returning True/False
    return not user_fa and not user_gtf
def create_combined_tx2gene(data):
    """Build a single tx2gene.csv covering each (possibly disambiguated) genome."""
    out_dir = os.path.join(dd.get_work_dir(data), "inputs", "transcriptome")
    tx2gene_files = []
    # with disambiguation each split item carries its own build and GTF
    for item in disambiguate.split([data]):
        odata = item[0]
        gtf_file = dd.get_transcriptome_gtf(odata) or dd.get_gtf_file(odata)
        out_file = os.path.join(out_dir,
                                dd.get_genome_build(odata) + "-tx2gene.csv")
        if not file_exists(out_file):
            out_file = gtf.tx2genefile(gtf_file, out_file, tsv=False)
        tx2gene_files.append(out_file)
    combined_file = os.path.join(out_dir, "tx2gene.csv")
    if file_exists(combined_file):
        return combined_file
    tx2gene_file_string = " ".join(tx2gene_files)
    cmd = "cat {tx2gene_file_string} > {tx_out_file}"
    with file_transaction(data, combined_file) as tx_out_file:
        do.run(cmd.format(**locals()), "Combining tx2gene CSV files.")
    return combined_file
def run_salmon_index(*samples):
    """Create the salmon index for each sample's work directory."""
    for data in dd.sample_data_iterator(samples):
        salmon_dir = os.path.join(dd.get_work_dir(data), "salmon")
        gtf_file = dd.get_transcriptome_gtf(data, dd.get_gtf_file(data))
        salmon_index(gtf_file, data, salmon_dir)
    return samples
def use_installed_transcriptome(data):
    """True when the user supplied neither a transcriptome FASTA nor a GTF."""
    has_custom = dd.get_transcriptome_fasta(data) or dd.get_transcriptome_gtf(data)
    if has_custom:
        return False
    return True
def tagcount(data):
    """Count UMI tags per transcript with `umis fasttagcount`.

    Produces a sparse count matrix (and optionally a duplicate/UMI matrix)
    and records the count file on the data structure.
    """
    bam = dd.get_transcriptome_bam(data)
    umi_dir = os.path.join(dd.get_work_dir(data), "umis")
    sample_dir = os.path.join(umi_dir, dd.get_sample_name(data))
    out_prefix = os.path.join(sample_dir, dd.get_sample_name(data))
    out_file = out_prefix + ".mtx"
    if file_exists(out_file):
        data = dd.set_count_file(data, out_file)
        return [[data]]
    umis = config_utils.get_program("umis", data, default="umis")
    safe_makedir(sample_dir)
    cutoff = dd.get_minimum_barcode_depth(data)
    cb_histogram = os.path.join(sample_dir, "cb-histogram.txt")
    positional = "--positional" if dd.get_positional_umi(data, False) else ""
    if use_installed_transcriptome(data):
        gtf_file = dd.get_gtf_file(data)
    else:
        gtf_file = dd.get_transcriptome_gtf(data, None)
    if gtf_file:
        # use only the GTF basename: os.path.join discards the earlier
        # components when handed an absolute path, so joining the full
        # splitext() path would write the tx2gene file next to the GTF
        # instead of under the work directory
        gene_map_file = os.path.join(
            dd.get_work_dir(data), "annotation",
            os.path.basename(os.path.splitext(gtf_file)[0]) + "-tx2gene.tsv")
        gene_map_file = gtf.tx2genefile(gtf_file, gene_map_file, tsv=True)
        gene_map_flag = " --genemap {0} ".format(gene_map_file)
    else:
        gene_map_flag = ""
    message = "Counting alignments of transcripts in %s." % bam
    cmd = ("{umis} fasttagcount --cb_cutoff {cutoff} "
           "{gene_map_flag} "
           "{positional} "
           "--cb_histogram {cb_histogram}")
    out_files = [out_file, out_file + ".rownames", out_file + ".colnames"]
    umi_matrix_file = out_prefix + "-dupes.mtx"
    out_files += [umi_matrix_file, umi_matrix_file + ".rownames",
                  umi_matrix_file + ".colnames"]
    if has_umi_matrix(data):
        umi_matrix_flag = " --umi_matrix {tx_umi_matrix_full} "
    else:
        umi_matrix_flag = ""
    cmd += umi_matrix_flag
    cmd += " {bam} {tx_out_file_full}"
    with file_transaction(out_files) as tx_out_files:
        tx_out_file = tx_out_files[0]
        tx_out_file_full = tx_out_file + ".full"
        tx_umi_matrix = tx_out_files[3]
        tx_umi_matrix_full = tx_out_files[3] + ".full"
        do.run(cmd.format(**locals()), message)
        # convert the dense outputs to sparse matrices
        cmd = ("{umis} sparse {tx_out_file_full} {tx_out_file}")
        message = "Converting %s to sparse format." % tx_out_file_full
        do.run(cmd.format(**locals()), message)
        if has_umi_matrix(data):
            cmd = ("{umis} sparse {tx_umi_matrix_full} {tx_umi_matrix}")
            message = "Converting %s to sparse format." % tx_umi_matrix_full
            do.run(cmd.format(**locals()), message)
    data = dd.set_count_file(data, out_file)
    return [[data]]
def count(data):
    """
    count reads mapping to genes using featureCounts
    http://subread.sourceforge.net
    """
    in_bam = dd.get_work_bam(data) or dd.get_align_bam(data)
    out_dir = os.path.join(dd.get_work_dir(data), "align", dd.get_sample_name(data))
    if dd.get_aligner(data) == "star":
        out_dir = os.path.join(out_dir, "%s_%s" % (dd.get_sample_name(data),
                                                   dd.get_aligner(data)))
    # featureCounts needs queryname-sorted input for paired-end counting
    sorted_bam = bam.sort(in_bam, dd.get_config(data), order="queryname",
                          out_dir=safe_makedir(out_dir))
    gtf_file = dd.get_transcriptome_gtf(data, default=dd.get_gtf_file(data))
    work_dir = dd.get_work_dir(data)
    out_dir = os.path.join(work_dir, "htseq-count")
    safe_makedir(out_dir)
    count_file = os.path.join(out_dir, dd.get_sample_name(data)) + ".counts"
    summary_file = os.path.join(out_dir, dd.get_sample_name(data)) + ".counts.summary"
    if file_exists(count_file) and _is_fixed_count_file(count_file):
        return count_file
    featureCounts = config_utils.get_program("featureCounts", dd.get_config(data))
    paired_flag = _paired_flag(in_bam)
    strand_flag = _strand_flag(data)
    filtered_bam = bam.filter_primary(sorted_bam, data)
    cmd = ("{featureCounts} -a {gtf_file} -o {tx_count_file} -s {strand_flag} "
           "{paired_flag} {filtered_bam}")
    resources = config_utils.get_resources("featureCounts", data["config"])
    options = resources.get("options") if resources else None
    if options:
        cmd += " %s" % " ".join([str(x) for x in options])
    message = ("Count reads in {tx_count_file} mapping to {gtf_file} using "
               "featureCounts")
    with file_transaction(data, [count_file, summary_file]) as tx_files:
        tx_count_file, tx_summary_file = tx_files
        do.run(cmd.format(**locals()), message.format(**locals()))
    # rewrite outputs with the sample name as the column header
    fixed_count_file = _format_count_file(count_file, data)
    fixed_summary_file = _change_sample_name(summary_file, dd.get_sample_name(data),
                                             data=data)
    shutil.move(fixed_count_file, count_file)
    shutil.move(fixed_summary_file, summary_file)
    return count_file
def run_kallisto_index(*samples):
    """Create the kallisto index, checking that the required inputs exist."""
    for data in dd.sample_data_iterator(samples):
        kallisto_dir = os.path.join(dd.get_work_dir(data), "kallisto")
        gtf_file = dd.get_transcriptome_gtf(data, default=dd.get_gtf_file(data))
        assert file_exists(gtf_file), "%s was not found, exiting." % gtf_file
        fasta_file = dd.get_ref_file(data)
        assert file_exists(fasta_file), "%s was not found, exiting." % fasta_file
        kallisto_index(gtf_file, fasta_file, data, kallisto_dir)
    return samples
def run_arriba(data):
    """Call gene fusions with arriba on a STAR-aligned BAM, if the build is supported."""
    build = dd.get_genome_build(data)
    if build not in SUPPORTED_BUILDS:
        logger.info(f"{build} not supported for arriba, skipping.")
        return data
    arriba_dir = os.path.join(dd.get_work_dir(data), "arriba",
                              dd.get_sample_name(data))
    utils.safe_makedir(arriba_dir)
    bam_file = dd.get_work_bam(data)
    ref_file = dd.get_ref_file(data)
    gtf = dd.get_transcriptome_gtf(data, default=dd.get_gtf_file(data))
    arriba = config_utils.get_program("arriba", data)
    fusion_file = os.path.join(arriba_dir, "fusions.tsv")
    discarded_fusion_file = os.path.join(arriba_dir, "fusions.discarded.tsv")
    blacklist_file = get_arriba_blacklist_file(data)
    contig_list = ",".join(get_contigs(data))
    if utils.file_exists(fusion_file):
        data["arriba"] = {"fusions": fusion_file,
                          "discarded": discarded_fusion_file}
        return (data)
    with file_transaction(fusion_file) as tx_fusion_file, \
         file_transaction(discarded_fusion_file) as tx_discarded_fusion_file:
        cmd = (f"{arriba} -x {bam_file} -g {gtf} -a {ref_file} -o {tx_fusion_file} "
               f"-O {tx_discarded_fusion_file} -T -P "
               f"-i {contig_list} ")
        if blacklist_file:
            logger.info(f"arriba blacklist file found, running blacklisting with {blacklist_file}.")
            cmd += f"-b {blacklist_file} "
        else:
            logger.info("arriba blacklist file not found, disabling blacklist filtering.")
            cmd += "-f blacklist "
        if dd.get_known_fusions(data):
            cmd += f"-k {dd.get_known_fusions(data)} "
        do.run(cmd, f"Running arriba on {dd.get_sample_name(data)}.")
    data["arriba"] = {"fusions": fusion_file,
                      "discarded": discarded_fusion_file}
    return (data)
def run_kallisto_singlecell(data):
    """Quantify a single-cell sample with kallisto."""
    samplename = dd.get_sample_name(data)
    kallisto_dir = os.path.join(dd.get_work_dir(data), "kallisto", samplename)
    gtf_file = dd.get_transcriptome_gtf(data, dd.get_gtf_file(data))
    files = dd.get_input_sequence_files(data)
    fq1 = files[0]
    fq2 = files[1] if len(files) == 2 else None
    assert file_exists(gtf_file), "%s was not found, exiting." % gtf_file
    fasta_file = dd.get_ref_file(data)
    assert file_exists(fasta_file), "%s was not found, exiting." % fasta_file
    out_file = kallisto_singlecell(fq1, kallisto_dir, gtf_file, fasta_file, data)
    data = dd.set_kallisto_quant(data, out_file)
    return [[data]]
def index(ref_file, out_dir, data):
    """Create a bismark (bisulfite) index in the defined reference directory.

    NOTE(review): the previous command interpolated {sample} and {in_fastq},
    neither of which is defined in this function (an alignment command pasted
    into the index step), so cmd.format(**locals()) raised KeyError before
    anything ran; the do.run message also said "Index STAR". Replaced with
    bismark_genome_preparation, which builds the Bisulfite_Genome index for a
    directory containing the reference FASTA.
    """
    (ref_dir, local_file) = os.path.split(ref_file)
    bismark_prep = config_utils.find_program("bismark_genome_preparation",
                                             data["config"])
    if not utils.file_exists(out_dir):
        with tx_tmpdir(data, os.path.dirname(out_dir)) as tx_out_dir:
            # bismark_genome_preparation indexes every FASTA in a directory,
            # so expose the reference inside the transaction directory
            os.symlink(ref_file, os.path.join(tx_out_dir, local_file))
            other_opts = config_utils.get_resources("bismark", data["config"]).get("options", [])
            other_opts = " ".join([str(x) for x in other_opts]).strip()
            cmd = "{bismark_prep} {other_opts} --bowtie2 {tx_out_dir}"
            do.run(cmd.format(**locals()), "Index bismark")
            if os.path.exists(out_dir):
                shutil.rmtree(out_dir)
            shutil.move(tx_out_dir, out_dir)
    return out_dir
def run_kallisto_rnaseq(data):
    """Quantify a bulk RNA-seq sample with kallisto (paired-end only)."""
    samplename = dd.get_sample_name(data)
    kallisto_dir = os.path.join(dd.get_work_dir(data), "kallisto", samplename)
    gtf_file = dd.get_transcriptome_gtf(data, default=dd.get_gtf_file(data))
    files = dd.get_input_sequence_files(data)
    fq1 = files[0]
    fq2 = files[1] if len(files) == 2 else None
    assert file_exists(gtf_file), "%s was not found, exiting." % gtf_file
    fasta_file = dd.get_ref_file(data)
    assert file_exists(fasta_file), "%s was not found, exiting." % fasta_file
    assert fq2, ("We don't support kallisto for single-end reads and fusion "
                 "calling with pizzly does not accept single end reads.")
    out_file = kallisto_rnaseq(fq1, fq2, kallisto_dir, gtf_file, fasta_file, data)
    data = dd.set_kallisto_quant(data, out_file)
    return [[data]]
def index(ref_file, out_dir, data):
    """Create a STAR index in the defined reference directory."""
    ref_dir, local_file = os.path.split(ref_file)
    gtf_file = dd.get_transcriptome_gtf(data, dd.get_gtf_file(data))
    if not utils.file_exists(gtf_file):
        raise ValueError("%s not found, could not create a star index." % (gtf_file))
    if utils.file_exists(out_dir):
        return out_dir
    # build in a transactional temporary directory, then move into place
    with tx_tmpdir(data, os.path.dirname(out_dir)) as tx_out_dir:
        num_cores = dd.get_cores(data)
        cmd = ("STAR --genomeDir {tx_out_dir} --genomeFastaFiles {ref_file} "
               "--runThreadN {num_cores} "
               "--runMode genomeGenerate --sjdbOverhang 99 --sjdbGTFfile {gtf_file}")
        do.run(cmd.format(**locals()), "Index STAR")
        if os.path.exists(out_dir):
            shutil.rmtree(out_dir)
        shutil.move(tx_out_dir, out_dir)
    return out_dir
def run_pizzly(data):
    """Call fusions with pizzly from kallisto fusion output."""
    samplename = dd.get_sample_name(data)
    pizzlydir = os.path.join(dd.get_work_dir(data), "pizzly")
    gtf = dd.get_transcriptome_gtf(data) or dd.get_gtf_file(data)
    # use the supplied transcriptome FASTA when available
    gtf_fa = dd.get_transcriptome_fasta(data)
    if not gtf_fa:
        gtf_fa = sailfish.create_combined_fasta(data)
    # pizzly needs transcript names without version suffixes
    stripped_fa = os.path.splitext(os.path.basename(gtf_fa))[0] + "-noversions.fa"
    stripped_fa = os.path.join(pizzlydir, stripped_fa)
    gtf_fa = fasta.strip_transcript_versions(gtf_fa, stripped_fa)
    fraglength = get_fragment_length(data)
    cachefile = os.path.join(pizzlydir, "pizzly.cache")
    fusions = kallisto.get_kallisto_fusions(data)
    pizzlypath = config_utils.get_program("pizzly", dd.get_config(data))
    return pizzly(pizzlypath, gtf, gtf_fa, fraglength, cachefile, pizzlydir,
                  fusions, samplename, data)
def run_pizzly(data):
    """Call fusions with pizzly from kallisto fusion output."""
    samplename = dd.get_sample_name(data)
    work_dir = dd.get_work_dir(data)
    pizzlydir = os.path.join(work_dir, "pizzly")
    gtf = dd.get_transcriptome_gtf(data)
    if not gtf:
        gtf = dd.get_gtf_file(data)
    if dd.get_transcriptome_fasta(data):
        gtf_fa = dd.get_transcriptome_fasta(data)
    else:
        gtf_fa = sailfish.create_combined_fasta(data)
    # pizzly needs transcript names without version suffixes
    stripped_name = os.path.splitext(os.path.basename(gtf_fa))[0] + "-noversions.fa"
    stripped_fa = os.path.join(pizzlydir, stripped_name)
    gtf_fa = fasta.strip_transcript_versions(gtf_fa, stripped_fa)
    fraglength = get_fragment_length(data)
    cachefile = os.path.join(pizzlydir, "pizzly.cache")
    fusions = kallisto.get_kallisto_fusions(data)
    pizzlypath = config_utils.get_program("pizzly", dd.get_config(data))
    outdir = pizzly(pizzlypath, gtf, gtf_fa, fraglength, cachefile, pizzlydir,
                    fusions, samplename, data)
    return outdir
def run_salmon_decoy(data):
    """Quantify reads with salmon against a decoy-aware index."""
    data = utils.to_single_data(data)
    files = dd.get_input_sequence_files(data)
    if bam.is_bam(files[0]):
        # salmon needs FASTQ input; convert BAM first
        files = fastq.convert_bam_to_fastq(files[0], data["dirs"]["work"],
                                           data, data["dirs"], data["config"])
    samplename = dd.get_sample_name(data)
    salmon_dir = os.path.join(dd.get_work_dir(data), "salmon", samplename)
    gtf_file = dd.get_transcriptome_gtf(data, dd.get_gtf_file(data))
    fq1 = files[0]
    fq2 = files[1] if len(files) == 2 else None
    index = salmon_decoy_index(gtf_file, data, os.path.dirname(salmon_dir))
    out_file = salmon_quant_reads(fq1, fq2, salmon_dir, gtf_file, data, index)
    data = dd.set_salmon(data, out_file)
    data = dd.set_salmon_dir(data, salmon_dir)
    data = dd.set_salmon_fraglen_file(data, _get_fraglen_file(salmon_dir))
    data = dd.update_summary_qc(data, "salmon",
                                base=dd.get_salmon_fraglen_file(data))
    return [[data]]
def _stringtie_expression(bam, data, out_dir="."):
    """
    only estimate expression the Stringtie, do not assemble new transcripts

    Returns (exon_file, transcript_file, out_gtf) on both the cached and
    freshly-run paths.
    """
    gtf_file = dd.get_transcriptome_gtf(data, dd.get_gtf_file(data))
    num_cores = dd.get_num_cores(data)
    error_message = "The %s file for %s is missing. StringTie has an error."
    stringtie = config_utils.get_program("stringtie", data, default="stringtie")
    # don't assemble transcripts unless asked
    exp_flag = ("-e" if "stringtie" not in dd.get_transcript_assembler(data)
                else "")
    base_cmd = ("{stringtie} {exp_flag} -b {out_dir} -p {num_cores} -G {gtf_file} "
                "-o {out_gtf} {bam}")
    transcript_file = os.path.join(out_dir, "t_data.ctab")
    exon_file = os.path.join(out_dir, "e_data.ctab")
    out_gtf = os.path.join(out_dir, "stringtie-assembly.gtf")
    if file_exists(transcript_file):
        return exon_file, transcript_file, out_gtf
    cmd = base_cmd.format(**locals())
    do.run(cmd, "Running Stringtie on %s." % bam)
    assert file_exists(exon_file), error_message % ("exon", exon_file)
    assert file_exists(transcript_file), error_message % ("transcript",
                                                          transcript_file)
    # return the same triple as the cached fast path above; previously this
    # path returned only transcript_file, giving callers a different shape
    # depending on whether results were cached
    return exon_file, transcript_file, out_gtf
def create_combined_tx2gene(data):
    """Concatenate per-genome tx2gene CSVs into one combined tx2gene.csv."""
    out_dir = os.path.join(dd.get_work_dir(data), "inputs", "transcriptome")
    items = disambiguate.split([data])
    tx2gene_files = []
    for i in items:
        odata = i[0]
        gtf_file = dd.get_transcriptome_gtf(odata)
        if not gtf_file:
            gtf_file = dd.get_gtf_file(odata)
        out_file = os.path.join(out_dir,
                                dd.get_genome_build(odata) + "-tx2gene.csv")
        if file_exists(out_file):
            tx2gene_files.append(out_file)
        else:
            tx2gene_files.append(gtf.tx2genefile(gtf_file, out_file, tsv=False))
    combined_file = os.path.join(out_dir, "tx2gene.csv")
    if file_exists(combined_file):
        return combined_file
    tx2gene_file_string = " ".join(tx2gene_files)
    cmd = "cat {tx2gene_file_string} > {tx_out_file}"
    with file_transaction(data, combined_file) as tx_out_file:
        do.run(cmd.format(**locals()), "Combining tx2gene CSV files.")
    return combined_file
def combine_files(samples):
    """
    after quantitation, combine the counts/FPKM/TPM/etc into a single table with
    all samples

    Each per-metric combination is guarded on `combined` being set: with no
    featureCounts outputs, combine_count_files returns None and
    os.path.splitext(None) would raise TypeError.
    """
    data = samples[0][0]
    # prefer the supplied transcriptome gtf file
    gtf_file = dd.get_transcriptome_gtf(data, None)
    if not gtf_file:
        gtf_file = dd.get_gtf_file(data, None)
    dexseq_gff = dd.get_dexseq_gff(data)
    # combine featureCount files
    count_files = filter_missing([dd.get_count_file(x[0]) for x in samples])
    combined = count.combine_count_files(count_files, ext=".counts")
    annotated = count.annotate_combined_count_file(combined, gtf_file)
    # add tx2gene file
    tx2gene_file = os.path.join(dd.get_work_dir(data), "annotation", "tx2gene.csv")
    if gtf_file:
        tx2gene_file = sailfish.create_combined_tx2gene(data)
    # combine eXpress files
    express_counts_combined = combine_express(samples, combined)
    # combine Cufflinks files
    fpkm_files = filter_missing([dd.get_fpkm(x[0]) for x in samples])
    if fpkm_files and combined:
        fpkm_combined_file = os.path.splitext(combined)[0] + ".fpkm"
        fpkm_combined = count.combine_count_files(fpkm_files, fpkm_combined_file)
    else:
        fpkm_combined = None
    isoform_files = filter_missing([dd.get_fpkm_isoform(x[0]) for x in samples])
    if isoform_files and combined:
        fpkm_isoform_combined_file = os.path.splitext(combined)[0] + ".isoform.fpkm"
        fpkm_isoform_combined = count.combine_count_files(
            isoform_files, fpkm_isoform_combined_file, ".isoform.fpkm")
    else:
        fpkm_isoform_combined = None
    # combine DEXseq files
    to_combine_dexseq = filter_missing(
        [dd.get_dexseq_counts(data[0]) for data in samples])
    if to_combine_dexseq and combined:
        dexseq_combined_file = os.path.splitext(combined)[0] + ".dexseq"
        dexseq_combined = count.combine_count_files(to_combine_dexseq,
                                                    dexseq_combined_file, ".dexseq")
        if dexseq_combined:
            dexseq.create_dexseq_annotation(dexseq_gff, dexseq_combined)
    else:
        dexseq_combined = None
    samples = spikein.combine_spikein(samples)
    updated_samples = []
    for data in dd.sample_data_iterator(samples):
        if combined:
            data = dd.set_combined_counts(data, combined)
        if annotated:
            data = dd.set_annotated_combined_counts(data, annotated)
        if fpkm_combined:
            data = dd.set_combined_fpkm(data, fpkm_combined)
        if fpkm_isoform_combined:
            data = dd.set_combined_fpkm_isoform(data, fpkm_isoform_combined)
        if express_counts_combined:
            data = dd.set_express_counts(data, express_counts_combined['counts'])
            data = dd.set_express_tpm(data, express_counts_combined['tpm'])
            data = dd.set_express_fpkm(data, express_counts_combined['fpkm'])
            data = dd.set_isoform_to_gene(
                data, express_counts_combined['isoform_to_gene'])
        if dexseq_combined:
            data = dd.set_dexseq_counts(data, dexseq_combined_file)
        if gtf_file:
            data = dd.set_tx2gene(data, tx2gene_file)
        updated_samples.append([data])
    return updated_samples
def combine_files(samples):
    """
    after quantitation, combine the counts/FPKM/TPM/etc into a single table with
    all samples
    """
    data = samples[0][0]
    # prefer the supplied transcriptome gtf file
    gtf_file = dd.get_transcriptome_gtf(data, None)
    if not gtf_file:
        gtf_file = dd.get_gtf_file(data, None)
    dexseq_gff = dd.get_dexseq_gff(data)
    # combine featureCount files
    count_files = filter_missing([dd.get_count_file(x[0]) for x in samples])
    combined = count.combine_count_files(count_files, ext=".counts")
    annotated = count.annotate_combined_count_file(combined, gtf_file)
    # add tx2gene file
    tx2gene_file = os.path.join(dd.get_work_dir(data), "annotation", "tx2gene.csv")
    if gtf_file:
        tx2gene_file = sailfish.create_combined_tx2gene(data)
    # combine eXpress files
    express_counts_combined = combine_express(samples, combined)
    # combine Cufflinks files
    fpkm_combined = None
    fpkm_files = filter_missing([dd.get_fpkm(x[0]) for x in samples])
    if fpkm_files and combined:
        fpkm_combined = count.combine_count_files(
            fpkm_files, os.path.splitext(combined)[0] + ".fpkm")
    fpkm_isoform_combined = None
    isoform_files = filter_missing([dd.get_fpkm_isoform(x[0]) for x in samples])
    if isoform_files and combined:
        fpkm_isoform_combined = count.combine_count_files(
            isoform_files, os.path.splitext(combined)[0] + ".isoform.fpkm",
            ".isoform.fpkm")
    # combine DEXseq files
    dexseq_combined = None
    to_combine_dexseq = filter_missing(
        [dd.get_dexseq_counts(data[0]) for data in samples])
    if to_combine_dexseq and combined:
        dexseq_combined_file = os.path.splitext(combined)[0] + ".dexseq"
        dexseq_combined = count.combine_count_files(to_combine_dexseq,
                                                    dexseq_combined_file, ".dexseq")
        if dexseq_combined:
            dexseq.create_dexseq_annotation(dexseq_gff, dexseq_combined)
    samples = spikein.combine_spikein(samples)
    updated_samples = []
    for data in dd.sample_data_iterator(samples):
        if combined:
            data = dd.set_combined_counts(data, combined)
        if annotated:
            data = dd.set_annotated_combined_counts(data, annotated)
        if fpkm_combined:
            data = dd.set_combined_fpkm(data, fpkm_combined)
        if fpkm_isoform_combined:
            data = dd.set_combined_fpkm_isoform(data, fpkm_isoform_combined)
        if express_counts_combined:
            data = dd.set_express_counts(data, express_counts_combined['counts'])
            data = dd.set_express_tpm(data, express_counts_combined['tpm'])
            data = dd.set_express_fpkm(data, express_counts_combined['fpkm'])
            data = dd.set_isoform_to_gene(
                data, express_counts_combined['isoform_to_gene'])
        if dexseq_combined:
            data = dd.set_dexseq_counts(data, dexseq_combined_file)
        if gtf_file:
            data = dd.set_tx2gene(data, tx2gene_file)
        updated_samples.append([data])
    return updated_samples
def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    """Align reads with STAR, piping through to a sorted BAM."""
    if not ref_file:
        logger.error("STAR index not found. We don't provide the STAR indexes "
                     "by default because they are very large. You can install "
                     "the index for your genome with: bcbio_nextgen.py upgrade "
                     "--aligners star --genomes genome-build-name --data")
        sys.exit(1)
    max_hits = 10
    srna = data["analysis"].lower().startswith("smallrna-seq")
    srna_opts = ""
    if srna:
        # smallRNA: many multimappers allowed, no intron spanning
        max_hits = 1000
        srna_opts = "--alignIntronMax 1"
    config = data["config"]
    star_dirs = _get_star_dirnames(align_dir, data, names)
    if file_exists(star_dirs.final_out):
        data = _update_data(star_dirs.final_out, star_dirs.out_dir, names, data)
        return data
    star_path = config_utils.get_program("STAR", config)

    def _unpack_fastq(f):
        """Use process substitution instead of readFilesCommand for gzipped inputs.

        Prevents issues on shared filesystems that don't support FIFO:
        https://github.com/alexdobin/STAR/issues/143
        """
        if f and is_gzipped(f):
            return "<(gunzip -c %s)" % f
        else:
            return f

    if pair_file:
        fastq_files = " ".join([_unpack_fastq(fastq_file),
                                _unpack_fastq(pair_file)])
    else:
        fastq_files = _unpack_fastq(fastq_file)
    num_cores = dd.get_num_cores(data)
    gtf_file = dd.get_transcriptome_gtf(data)
    if not gtf_file:
        gtf_file = dd.get_gtf_file(data)
    if ref_file.endswith("chrLength"):
        ref_file = os.path.dirname(ref_file)
    with file_transaction(data, align_dir) as tx_align_dir:
        tx_star_dirnames = _get_star_dirnames(tx_align_dir, data, names)
        tx_out_dir, tx_out_file, tx_out_prefix, tx_final_out = tx_star_dirnames
        safe_makedir(tx_align_dir)
        safe_makedir(tx_out_dir)
        cmd = ("{star_path} --genomeDir {ref_file} --readFilesIn {fastq_files} "
               "--runThreadN {num_cores} --outFileNamePrefix {tx_out_prefix} "
               "--outReadsUnmapped Fastx --outFilterMultimapNmax {max_hits} "
               "--outStd BAM_Unsorted {srna_opts} "
               "--limitOutSJcollapsed 2000000 "
               "--outSAMtype BAM Unsorted "
               "--outSAMmapqUnique 60 "
               "--outSAMunmapped Within --outSAMattributes %s " % " ".join(ALIGN_TAGS))
        cmd += _add_sj_index_commands(fastq_file, ref_file, gtf_file) if not srna else ""
        cmd += _read_group_option(names)
        if dd.get_fusion_caller(data):
            if "arriba" in dd.get_fusion_caller(data):
                cmd += ("--chimSegmentMin 10 --chimOutType WithinBAM SoftClip Junctions "
                        "--chimJunctionOverhangMin 10 --chimScoreMin 1 --chimScoreDropMax 30 "
                        "--chimScoreJunctionNonGTAG 0 --chimScoreSeparation 1 "
                        "--alignSJstitchMismatchNmax 5 -1 5 5 "
                        "--chimSegmentReadGapMax 3 ")
            else:
                cmd += (" --chimSegmentMin 12 --chimJunctionOverhangMin 12 "
                        "--chimScoreDropMax 30 --chimSegmentReadGapMax 5 "
                        "--chimScoreSeparation 5 ")
                if "oncofuse" in dd.get_fusion_caller(data):
                    cmd += "--chimOutType Junctions "
                else:
                    cmd += "--chimOutType WithinBAM "
        strandedness = utils.get_in(data,
                                    ("config", "algorithm", "strandedness"),
                                    "unstranded").lower()
        if strandedness == "unstranded" and not srna:
            cmd += " --outSAMstrandField intronMotif "
        if not srna:
            cmd += " --quantMode TranscriptomeSAM "
        resources = config_utils.get_resources("star", data["config"])
        if resources.get("options", []):
            cmd += " " + " ".join([str(x) for x in resources.get("options", [])])
        cmd += " | " + postalign.sam_to_sortbam_cl(data, tx_final_out)
        cmd += " > {tx_final_out} "
        run_message = "Running STAR aligner on %s and %s" % (fastq_file, ref_file)
        do.run(cmd.format(**locals()), run_message, None)
    data = _update_data(star_dirs.final_out, star_dirs.out_dir, names, data)
    return data