def convert_to_kallisto(data):
    """Run ``umis kallisto`` on a sample's first input sequence file.

    The command writes into ``<work_dir>/kallisto/<sample>/fastq`` and this
    function returns the path of ``barcodes.batch`` inside that directory.
    If that file already exists the command is skipped.

    When a minimum barcode depth is set for the sample, the cellular barcode
    histogram at ``<work_dir>/umis/<sample>/cb-histogram.txt`` and the depth
    are forwarded as ``--cb_histogram`` / ``--cb_cutoff``.

    Only the first input sequence file is handed to the command, even when
    the sample lists two.
    """
    read_file = dd.get_input_sequence_files(data)[0]
    sample = dd.get_sample_name(data)
    base_dir = dd.get_work_dir(data)
    fastq_dir = os.path.join(base_dir, "kallisto", sample, "fastq")
    batch_file = os.path.join(fastq_dir, "barcodes.batch")
    # The program lookup intentionally stays ahead of the existence check.
    umis_exe = config_utils.get_program("umis", dd.get_config(data))
    if file_exists(batch_file):
        return batch_file

    barcode_args = ""
    depth = dd.get_minimum_barcode_depth(data)
    if depth:
        histogram = os.path.join(base_dir, "umis", sample, "cb-histogram.txt")
        barcode_args = "--cb_histogram {cb_histogram} --cb_cutoff {cb_cutoff}".format(
            cb_histogram=histogram, cb_cutoff=depth)

    template = "{umis} kallisto {cb_options} --out_dir {tx_kallisto_dir} {fq1}"
    with file_transaction(data, fastq_dir) as tx_dir:
        safe_makedir(tx_dir)
        note = "Transforming %s to Kallisto singlecell format. " % read_file
        command = template.format(
            umis=umis_exe,
            cb_options=barcode_args,
            tx_kallisto_dir=tx_dir,
            fq1=read_file,
        )
        do.run(command, note)
    return batch_file
def tagcount(data):
    """Count transcriptome alignments per cell with ``umis tagcount``.

    The sparse matrix is written to ``<work_dir>/umis/<sample>/<sample>.mtx``
    together with ``.rownames`` and ``.colnames`` companions, and the matrix
    path is recorded on the sample through ``dd.set_count_file``. If the
    matrix already exists the command is not rerun.

    When a transcriptome GTF is configured, a transcript-to-gene table is
    produced by ``gtf.tx2genefile`` under ``<work_dir>/annotation`` and passed
    to the command via ``--genemap``.

    Returns:
        ``[[data]]`` -- the updated sample wrapped in two nested lists.
    """
    bam = dd.get_transcriptome_bam(data)
    umi_dir = os.path.join(dd.get_work_dir(data), "umis")
    sample_dir = os.path.join(umi_dir, dd.get_sample_name(data))
    out_file = os.path.join(sample_dir, dd.get_sample_name(data) + ".mtx")
    if file_exists(out_file):
        data = dd.set_count_file(data, out_file)
        return [[data]]

    umis = config_utils.get_program("umis", data, default="umis")
    safe_makedir(sample_dir)
    cutoff = dd.get_minimum_barcode_depth(data)
    cb_histogram = os.path.join(sample_dir, "cb-histogram.txt")
    positional = "--positional" if dd.get_positional_umi(data, False) else ""

    gtf_file = dd.get_transcriptome_gtf(data, None)
    if gtf_file:
        # Use only the GTF's base name: os.path.join discards every earlier
        # component when a later one is absolute, so joining the full GTF
        # path would place the table beside the GTF instead of in the
        # work directory's annotation folder.
        gtf_stem = os.path.basename(os.path.splitext(gtf_file)[0])
        gene_map_file = os.path.join(
            dd.get_work_dir(data), "annotation", gtf_stem + "-tx2gene.tsv")
        gene_map_file = gtf.tx2genefile(gtf_file, gene_map_file, tsv=True)
        gene_map_flag = " --genemap {0} ".format(gene_map_file)
    else:
        gene_map_flag = ""

    message = "Counting alignments of transcripts in %s." % bam
    cmd = ("{umis} tagcount {positional} --cb_cutoff {cutoff} --sparse "
           "{gene_map_flag}"
           "--cb_histogram {cb_histogram} {bam} {tx_out_file}")
    out_files = [out_file, out_file + ".rownames", out_file + ".colnames"]
    with file_transaction(out_files) as tx_out_files:
        # Only the matrix path is named on the command line; the row/column
        # name files are listed so the transaction tracks them as well.
        tx_out_file = tx_out_files[0]
        do.run(
            cmd.format(
                umis=umis,
                positional=positional,
                cutoff=cutoff,
                gene_map_flag=gene_map_flag,
                cb_histogram=cb_histogram,
                bam=bam,
                tx_out_file=tx_out_file,
            ),
            message,
        )
    data = dd.set_count_file(data, out_file)
    return [[data]]
def tagcount(data):
    """Count transcriptome alignments per cell with ``umis fasttagcount``.

    Steps, all driven through ``do.run``:

    1. ``umis fasttagcount`` writes a ``.full`` matrix (plus a ``.full`` UMI
       matrix when ``has_umi_matrix(data)`` is true).
    2. ``umis sparse`` converts each ``.full`` file into its sparse
       counterpart.

    The final matrix is ``<work_dir>/umis/<sample>/<sample>.mtx`` and is
    recorded on the sample with ``dd.set_count_file``. If it already exists,
    nothing is rerun. The GTF used for the optional ``--genemap`` table comes
    from ``dd.get_gtf_file`` when ``use_installed_transcriptome(data)`` is
    true, otherwise from ``dd.get_transcriptome_gtf``.

    Returns:
        ``[[data]]`` -- the updated sample wrapped in two nested lists.
    """
    bam_file = dd.get_transcriptome_bam(data)
    work_dir = dd.get_work_dir(data)
    sample = dd.get_sample_name(data)
    sample_dir = os.path.join(work_dir, "umis", sample)
    prefix = os.path.join(sample_dir, sample)
    matrix = prefix + ".mtx"
    if file_exists(matrix):
        return [[dd.set_count_file(data, matrix)]]

    umis_exe = config_utils.get_program("umis", data, default="umis")
    safe_makedir(sample_dir)
    depth_cutoff = dd.get_minimum_barcode_depth(data)
    histogram = os.path.join(sample_dir, "cb-histogram.txt")
    positional_flag = ""
    if dd.get_positional_umi(data, False):
        positional_flag = "--positional"

    gtf_path = (dd.get_gtf_file(data) if use_installed_transcriptome(data)
                else dd.get_transcriptome_gtf(data, None))
    genemap_flag = ""
    if gtf_path:
        gtf_stem = os.path.basename(os.path.splitext(gtf_path)[0])
        tx2gene = gtf.tx2genefile(
            gtf_path,
            os.path.join(work_dir, "annotation", gtf_stem + "-tx2gene.tsv"),
            tsv=True,
        )
        genemap_flag = " --genemap {0} ".format(tx2gene)

    dupes_matrix = prefix + "-dupes.mtx"
    # Order matters: index 0 is the count matrix, index 3 the UMI matrix.
    outputs = [base + suffix
               for base in (matrix, dupes_matrix)
               for suffix in ("", ".rownames", ".colnames")]

    with_umi_matrix = has_umi_matrix(data)
    count_template = "".join([
        "{umis} fasttagcount --cb_cutoff {cutoff} ",
        "{gene_map_flag} ",
        "{positional} ",
        "--cb_histogram {cb_histogram}",
        " --umi_matrix {tx_umi_matrix_full} " if with_umi_matrix else "",
        " {bam} {tx_out_file_full}",
    ])

    with file_transaction(outputs) as tx_outputs:
        tx_matrix = tx_outputs[0]
        tx_dupes = tx_outputs[3]
        tx_matrix_full = tx_matrix + ".full"
        tx_dupes_full = tx_dupes + ".full"

        do.run(
            count_template.format(
                umis=umis_exe,
                cutoff=depth_cutoff,
                gene_map_flag=genemap_flag,
                positional=positional_flag,
                cb_histogram=histogram,
                tx_umi_matrix_full=tx_dupes_full,
                bam=bam_file,
                tx_out_file_full=tx_matrix_full,
            ),
            "Counting alignments of transcripts in %s." % bam_file,
        )

        do.run(
            "{umis} sparse {tx_out_file_full} {tx_out_file}".format(
                umis=umis_exe,
                tx_out_file_full=tx_matrix_full,
                tx_out_file=tx_matrix,
            ),
            "Converting %s to sparse format." % tx_matrix_full,
        )

        if with_umi_matrix:
            do.run(
                "{umis} sparse {tx_umi_matrix_full} {tx_umi_matrix}".format(
                    umis=umis_exe,
                    tx_umi_matrix_full=tx_dupes_full,
                    tx_umi_matrix=tx_dupes,
                ),
                "Converting %s to sparse format." % tx_dupes_full,
            )

    return [[dd.set_count_file(data, matrix)]]
def tagcount(data):
    """Count transcriptome alignments per cell with ``umis fasttagcount``.

    ``umis fasttagcount`` first writes a ``.full`` matrix, which ``umis
    sparse`` then converts to ``<work_dir>/umis/<sample>/<sample>.mtx``.
    When ``has_umi_matrix(data)`` is true, a UMI matrix
    (``<sample>-dupes.mtx``) is produced and converted the same way. The
    count matrix is recorded on the sample with ``dd.set_count_file``; an
    existing matrix short-circuits the whole run.

    If a GTF is available (from ``dd.get_gtf_file`` when
    ``use_installed_transcriptome(data)`` is true, otherwise from
    ``dd.get_transcriptome_gtf``), a transcript-to-gene table is generated
    under ``<work_dir>/annotation`` and passed via ``--genemap``.

    Returns:
        ``[[data]]`` -- the updated sample wrapped in two nested lists.
    """
    bam = dd.get_transcriptome_bam(data)
    umi_dir = os.path.join(dd.get_work_dir(data), "umis")
    sample_dir = os.path.join(umi_dir, dd.get_sample_name(data))
    out_prefix = os.path.join(sample_dir, dd.get_sample_name(data))
    out_file = out_prefix + ".mtx"
    if file_exists(out_file):
        data = dd.set_count_file(data, out_file)
        return [[data]]

    umis = config_utils.get_program("umis", data, default="umis")
    safe_makedir(sample_dir)
    cutoff = dd.get_minimum_barcode_depth(data)
    cb_histogram = os.path.join(sample_dir, "cb-histogram.txt")
    positional = "--positional" if dd.get_positional_umi(data, False) else ""

    if use_installed_transcriptome(data):
        gtf_file = dd.get_gtf_file(data)
    else:
        gtf_file = dd.get_transcriptome_gtf(data, None)

    if gtf_file:
        # Strip the directory from the GTF path before joining: an absolute
        # component makes os.path.join throw away the work-dir prefix, which
        # would put the table next to the GTF rather than in
        # <work_dir>/annotation.
        gtf_stem = os.path.basename(os.path.splitext(gtf_file)[0])
        gene_map_file = os.path.join(
            dd.get_work_dir(data), "annotation", gtf_stem + "-tx2gene.tsv")
        gene_map_file = gtf.tx2genefile(gtf_file, gene_map_file, tsv=True)
        gene_map_flag = " --genemap {0} ".format(gene_map_file)
    else:
        gene_map_flag = ""

    message = "Counting alignments of transcripts in %s." % bam
    cmd = ("{umis} fasttagcount --cb_cutoff {cutoff} "
           "{gene_map_flag} "
           "{positional} "
           "--cb_histogram {cb_histogram}")
    out_files = [out_file, out_file + ".rownames", out_file + ".colnames"]
    umi_matrix_file = out_prefix + "-dupes.mtx"
    out_files += [umi_matrix_file,
                  umi_matrix_file + ".rownames",
                  umi_matrix_file + ".colnames"]
    if has_umi_matrix(data):
        umi_matrix_flag = " --umi_matrix {tx_umi_matrix_full} "
    else:
        umi_matrix_flag = ""
    cmd += umi_matrix_flag
    cmd += " {bam} {tx_out_file_full}"

    with file_transaction(out_files) as tx_out_files:
        # The command templates are filled from locals(), so the variable
        # names below must match the placeholders in the templates.
        tx_out_file = tx_out_files[0]
        tx_out_file_full = tx_out_file + ".full"
        tx_umi_matrix = tx_out_files[3]
        tx_umi_matrix_full = tx_out_files[3] + ".full"
        do.run(cmd.format(**locals()), message)

        cmd = "{umis} sparse {tx_out_file_full} {tx_out_file}"
        message = "Converting %s to sparse format." % tx_out_file_full
        do.run(cmd.format(**locals()), message)

        if has_umi_matrix(data):
            cmd = "{umis} sparse {tx_umi_matrix_full} {tx_umi_matrix}"
            message = "Converting %s to sparse format." % tx_umi_matrix_full
            do.run(cmd.format(**locals()), message)

    data = dd.set_count_file(data, out_file)
    return [[data]]
def barcode_histogram(data):
    """Compute and filter the cellular barcode histogram for a sample.

    ``umis cb_histogram`` is run on the first input sequence file and its
    output redirected to ``<work_dir>/umis/<sample>/cb-histogram.txt``; this
    step is skipped when that file already exists. The histogram is then
    passed, with the sample's minimum barcode depth, to
    ``filter_barcode_histogram``, which is given
    ``cb-histogram-filtered.txt`` as its first argument.

    Returns:
        ``[[data]]`` -- the unmodified sample wrapped in two nested lists.
    """
    first_read = dd.get_input_sequence_files(data)[0]
    sample_dir = os.path.join(
        dd.get_work_dir(data), "umis", dd.get_sample_name(data))
    umis_exe = config_utils.get_program("umis", data, default="umis")
    safe_makedir(sample_dir)

    histogram = os.path.join(sample_dir, "cb-histogram.txt")
    filtered_histogram = os.path.join(sample_dir, "cb-histogram-filtered.txt")

    if not file_exists(histogram):
        with file_transaction(histogram) as tx_histogram:
            command = "{umis} cb_histogram {fq1_cmd} > {tx_out_file}".format(
                umis=umis_exe, fq1_cmd=first_read, tx_out_file=tx_histogram)
            do.run(command,
                   "Computing cellular barcode counts for %s." % first_read)

    # Filtering runs on every call, whether or not the histogram was rebuilt.
    depth_cutoff = dd.get_minimum_barcode_depth(data)
    filter_barcode_histogram(filtered_histogram, histogram, depth_cutoff)
    return [[data]]
def barcode_histogram(data):
    """Compute, filter and register the cellular barcode histogram.

    ``umis cb_histogram`` is run on the first input sequence file, writing
    ``<work_dir>/umis/<sample>/cb-histogram.txt`` unless it already exists.
    ``filter_barcode_histogram`` is then called with
    ``cb-histogram-filtered.txt``, the histogram and the sample's minimum
    barcode depth, and the filtered path is stored on the sample through
    ``dd.set_histogram_counts``.

    Returns:
        ``[[newdata]]`` -- the sample as returned by
        ``dd.set_histogram_counts``, wrapped in two nested lists.
    """
    first_read = dd.get_input_sequence_files(data)[0]
    sample_dir = os.path.join(
        dd.get_work_dir(data), "umis", dd.get_sample_name(data))
    umis_exe = config_utils.get_program("umis", data, default="umis")
    safe_makedir(sample_dir)

    histogram = os.path.join(sample_dir, "cb-histogram.txt")
    filtered_histogram = os.path.join(sample_dir, "cb-histogram-filtered.txt")

    histogram_missing = not file_exists(histogram)
    if histogram_missing:
        progress = "Computing cellular barcode counts for %s." % first_read
        with file_transaction(histogram) as tx_histogram:
            do.run(
                "{umis} cb_histogram {fq1_cmd} > {tx_out_file}".format(
                    umis=umis_exe,
                    fq1_cmd=first_read,
                    tx_out_file=tx_histogram,
                ),
                progress,
            )

    # Filtering and registration happen on every call, cached or not.
    filter_barcode_histogram(
        filtered_histogram, histogram, dd.get_minimum_barcode_depth(data))
    return [[dd.set_histogram_counts(data, filtered_histogram)]]
def tagcount(data):
    """Count transcriptome alignments with ``umis tagcount --positional``.

    Counts are written to ``<work_dir>/umis/<sample>/<sample>.counts`` and
    the path is recorded on the sample via ``dd.set_count_file``. If the
    counts file already exists the command is not rerun. The sample's
    minimum barcode depth and its ``cb-histogram.txt`` are passed as
    ``--cb_cutoff`` and ``--cb_histogram``.

    Returns:
        ``[[data]]`` -- the updated sample wrapped in two nested lists.
    """
    bam_file = dd.get_transcriptome_bam(data)
    sample_dir = os.path.join(
        dd.get_work_dir(data), "umis", dd.get_sample_name(data))
    counts_file = os.path.join(
        sample_dir, dd.get_sample_name(data) + ".counts")
    if file_exists(counts_file):
        return [[dd.set_count_file(data, counts_file)]]

    umis_exe = config_utils.get_program("umis", data, default="umis")
    safe_makedir(sample_dir)
    depth_cutoff = dd.get_minimum_barcode_depth(data)
    histogram = os.path.join(sample_dir, "cb-histogram.txt")

    template = ("{umis} tagcount --positional --cb_cutoff {cutoff} --cb_histogram "
                "{cb_histogram} {bam} {tx_out_file}")
    with file_transaction(counts_file) as tx_counts_file:
        do.run(
            template.format(
                umis=umis_exe,
                cutoff=depth_cutoff,
                cb_histogram=histogram,
                bam=bam_file,
                tx_out_file=tx_counts_file,
            ),
            "Counting alignments of transcripts in %s." % bam_file,
        )
    return [[dd.set_count_file(data, counts_file)]]
def tagcount(data):
    """Count transcriptome alignments into a sparse matrix with ``umis``.

    Runs ``umis tagcount --sparse`` and writes
    ``<work_dir>/umis/<sample>/<sample>.mtx`` with ``.rownames`` and
    ``.colnames`` companions. The matrix path is recorded on the sample via
    ``dd.set_count_file``. If the matrix already exists the command is not
    rerun. ``--positional`` is added only when
    ``dd.get_positional_umi(data, False)`` is truthy.

    Returns:
        ``[[data]]`` -- the updated sample wrapped in two nested lists.
    """
    bam_file = dd.get_transcriptome_bam(data)
    sample_dir = os.path.join(
        dd.get_work_dir(data), "umis", dd.get_sample_name(data))
    matrix = os.path.join(sample_dir, dd.get_sample_name(data) + ".mtx")
    if file_exists(matrix):
        return [[dd.set_count_file(data, matrix)]]

    umis_exe = config_utils.get_program("umis", data, default="umis")
    safe_makedir(sample_dir)
    depth_cutoff = dd.get_minimum_barcode_depth(data)
    histogram = os.path.join(sample_dir, "cb-histogram.txt")
    positional_flag = ""
    if dd.get_positional_umi(data, False):
        positional_flag = "--positional"

    template = ("{umis} tagcount {positional} --cb_cutoff {cutoff} --sparse "
                "--cb_histogram {cb_histogram} {bam} {tx_out_file}")
    # Index 0 is the matrix itself; the name files ride along in the
    # transaction but are not mentioned on the command line.
    outputs = [matrix + suffix for suffix in ("", ".rownames", ".colnames")]
    with file_transaction(outputs) as tx_outputs:
        do.run(
            template.format(
                umis=umis_exe,
                positional=positional_flag,
                cutoff=depth_cutoff,
                cb_histogram=histogram,
                bam=bam_file,
                tx_out_file=tx_outputs[0],
            ),
            "Counting alignments of transcripts in %s." % bam_file,
        )
    return [[dd.set_count_file(data, matrix)]]