def join(args, outs, chunk_defs, chunk_outs):
    if args.fragments is None:
        outs.raw_matrix = None
        outs.raw_matrix_mex = None
        return

    # Hstack barcodes to generate full peak matrix
    barcodes = []
    sp_matrix = None
    for chunk in chunk_outs:
        if chunk.raw_matrix is not None and os.path.exists(chunk.raw_matrix):
            cpm = cr_matrix.CountMatrix.load_h5_file(chunk.raw_matrix)
            # guard on sp_matrix (not chunk index) so a first chunk that
            # produced no matrix doesn't break the hstack
            if sp_matrix is None:
                sp_matrix = cpm.m
            else:
                sp_matrix = hstack([sp_matrix, cpm.m])
            barcodes.extend(cpm.bcs)

    genomes = utils.generate_genome_tag(args.reference_path)
    peaks_def = atac_feature_ref.from_peaks_bed(args.peaks, genomes)
    raw_matrix = cr_matrix.CountMatrix(peaks_def, barcodes, sp_matrix)
    raw_matrix.save_h5_file(outs.raw_matrix, sw_version=martian.get_pipelines_version())

    if not os.path.exists(outs.raw_matrix_mex):
        os.mkdir(outs.raw_matrix_mex)
    atac_matrix.save_mex(raw_matrix,
                         outs.raw_matrix_mex,
                         cr_lib_constants.ATACSEQ_LIBRARY_TYPE,
                         martian.get_pipelines_version())

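# Illustrative sketch (not pipeline code): why hstack-based joining works.
# Each chunk holds the same peak rows but a disjoint set of barcode columns,
# so stacking columns while concatenating the barcode lists keeps the matrix
# and the barcode axis aligned. All names below are hypothetical.
def _example_hstack_merge():
    import numpy as np
    from scipy.sparse import csc_matrix, hstack

    chunk_a = csc_matrix(np.array([[1, 0], [0, 2]]))  # peaks x barcodes {AA-1, AC-1}
    chunk_b = csc_matrix(np.array([[0], [3]]))        # peaks x barcodes {AG-1}
    merged = hstack([chunk_a, chunk_b]).tocsc()
    barcodes = ['AA-1', 'AC-1'] + ['AG-1']
    assert merged.shape == (2, 3) and len(barcodes) == merged.shape[1]
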
def join(args, outs, chunk_defs, chunk_outs):
    ref_mgr = ReferenceManager(args.reference_path)
    if args.filtered_matrix is None or args.peak_motif_hits is None or \
       len(ref_mgr.list_species()) > 1:
        outs.filtered_tf_bc_matrix = None
        outs.filtered_tf_bc_matrix_mex = None
        outs.tf_propZ_matrix = None
        return

    # motif scan is completed in ANNOTATE_PEAKS
    peaks = BedTool(args.peaks)
    motifs = Motifs(args.reference_path)
    peak_motif_hits = BedTool(args.peak_motif_hits)

    # extract peak coordinate to numerical index map
    peak_idx, n_peaks = _get_peak_indexes(peaks)

    # extract motif names to numerical index map
    motif_idx, n_motifs = _get_motif_indexes(motifs)

    # extract 3 lists: peak indexes, motif indexes and counts;
    # each entry corresponds to a peak-motif pair
    peak_coor, motif_coor, values = motifscan_bed_to_sparse_matrix(
        peak_motif_hits, peak_idx, motif_idx, format='binary')

    # convert to a sparse matrix (binary format by default);
    # motifs are rows and peaks are columns
    tf_peak_matrix = sp.csr_matrix((values, (motif_coor, peak_coor)),
                                   shape=(n_motifs, n_peaks),
                                   dtype='int32')

    # compute the motif-BC matrix via pooling:
    # count the number of hits for a motif inside the peaks of a barcode,
    # then cast as a CountMatrix
    peak_matrix = cr_matrix.CountMatrix.load_h5_file(args.filtered_matrix)
    motif_names = motif_idx.keys()
    barcodes = peak_matrix.bcs
    genomes = utils.generate_genome_tag(args.reference_path)
    motifs_def = atac_feature_ref.from_motif_list(motif_names, genomes)
    tf_matrix = cr_matrix.CountMatrix(motifs_def, barcodes,
                                      tf_peak_matrix * peak_matrix.m)

    # perform MAD-zscoring of proportion values
    propZ_matrix = np.array(tf_matrix.m / peak_matrix.m.sum(axis=0))
    propZ_matrix = MADzscore(propZ_matrix)

    outs.coerce_strings()

    # save to h5 and csv
    tf_matrix.save_h5_file(outs.filtered_tf_bc_matrix,
                           sw_version=martian.get_pipelines_version())
    if not os.path.exists(outs.filtered_tf_bc_matrix_mex):
        os.mkdir(outs.filtered_tf_bc_matrix_mex)
    atac_matrix.save_mex(
        tf_matrix,
        outs.filtered_tf_bc_matrix_mex,
        feature_type=cr_lib_constants.ATACSEQ_LIBRARY_DERIVED_TYPE,
        sw_version=martian.get_pipelines_version())

    # save propZ matrix as gz
    np.savetxt(outs.tf_propZ_matrix, propZ_matrix)

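# Illustrative sketch of MAD-zscoring, assuming the conventional robust
# z-score. The real MADzscore helper is defined elsewhere in the pipeline;
# the per-motif axis and the 1.4826 normal-consistency constant here are
# assumptions for illustration only.
def _example_mad_zscore(x, axis=1):
    """Center by the median and scale by the median absolute deviation,
    so a handful of outlier barcodes cannot inflate the scale the way
    they would with mean/std."""
    import numpy as np
    med = np.median(x, axis=axis, keepdims=True)
    mad = np.median(np.abs(x - med), axis=axis, keepdims=True)
    return (x - med) / (1.4826 * mad)
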
def join(args, outs, chunk_defs, chunk_outs):
    if args.raw_matrix is None:
        outs.filtered_matrix = None
        return

    # consume cell barcodes across all species and raise errors if not found
    if args.cell_barcodes is None:
        martian.exit("cell barcodes not provided")
    cell_barcodes = utils.load_cell_barcodes(args.cell_barcodes, with_species=True)

    # Read the peak matrix file and keep only cell barcodes.
    # Drop cell barcodes that were specified externally (such as in reanalyzer)
    # but are absent from the raw matrix because they're missing from the
    # fragments file.
    present_cell_barcodes = {}
    peak_matrix = cr_matrix.CountMatrix.load_h5_file(args.raw_matrix)
    peak_matrix_bcs = set(peak_matrix.bcs)
    for species in cell_barcodes:
        present_cell_barcodes[species] = set()
        for bc in cell_barcodes[species]:
            if bc not in peak_matrix_bcs:
                martian.log_info("{} not found in the raw peak-bc matrix".format(bc))
            else:
                present_cell_barcodes[species].add(bc)

    peak_matrix = peak_matrix.filter_barcodes(present_cell_barcodes)
    if peak_matrix.features_dim == 0:
        martian.log_info("data has no peaks, skipping the clustering analysis")
        outs.filtered_matrix = None
        outs.filtered_matrix_mex = None
        return

    peak_matrix = prune(peak_matrix,
                        num_analysis_bcs=args.num_analysis_bcs,
                        random_state=args.random_seed)

    if peak_matrix.bcs_dim <= analysis_constants.MAX_N_CLUSTERS_DEFAULT:
        martian.log_info("Insufficient number of cell barcodes present after processing")
        outs.filtered_matrix = None
        outs.filtered_matrix_mex = None
        return

    if peak_matrix.features_dim < analysis_constants.MAX_N_CLUSTERS_DEFAULT:
        martian.log_info("Insufficient number of peaks present after processing")
        outs.filtered_matrix = None
        outs.filtered_matrix_mex = None
        return

    # save processed matrix
    peak_matrix.save_h5_file(outs.filtered_matrix,
                             sw_version=martian.get_pipelines_version())
    if not os.path.exists(outs.filtered_matrix_mex):
        os.mkdir(outs.filtered_matrix_mex)
    atac_matrix.save_mex(peak_matrix,
                         outs.filtered_matrix_mex,
                         cr_lib_constants.ATACSEQ_LIBRARY_TYPE,
                         sw_version=martian.get_pipelines_version())

def get_pipeline_info(args, reference, debug):
    """Generates a table of general pipeline information."""
    metadata = reference.metadata

    def get_fastq_paths(sample_def):
        if sample_def is None:
            return ""
        else:
            paths = [x["read_path"] for x in sample_def]
            return "\n".join(paths)

    rows = [
        ['Sample ID', args.sample_id],
        ['Sample description', args.sample_desc],
        ['FASTQ path', get_fastq_paths(args.sample_def)],
        ['Pipeline version', martian.get_pipelines_version()],
        ['Reference path', args.reference_path],
    ]

    if metadata:
        rows.extend([
            ['Organism', metadata.get('organism')],
            ['Assembly', metadata.get('assembly')],
            ['Annotation', metadata.get('annotation')],
        ])

    if debug:
        rows.append(['Barcode Whitelist', args.barcode_whitelist])

    data = {'pipeline_info_table': {'rows': rows}}
    data['pipeline_helptext'] = {'title': 'Sample', 'data': []}
    return data

def simple_load_metrics(summary_metrics, metrics_fn):
    with open(metrics_fn, 'r') as infile:
        metrics = json.load(infile)
    summary_metrics.update(metrics)
    summary_metrics['cellranger-atac_version'] = martian.get_pipelines_version()
    return summary_metrics

def main(args, outs):
    genomes = cr_matrix.GeneBCMatrices.load_genomes_from_h5(args.filtered_matrices)
    chemistry = cr_matrix.GeneBCMatrices.load_chemistry_from_h5(args.filtered_matrices)
    total_cells = cr_matrix.GeneBCMatrices.count_cells_from_h5(args.filtered_matrices)

    summary = {
        'chemistry_description': chemistry,
        'filtered_bcs_transcriptome_union': total_cells,
    }
    with open(outs.summary, 'w') as f:
        json.dump(summary, f, indent=4, sort_keys=True)

    sample_properties = ReanalyzeSampleProperties(
        sample_id=args.analysis_id,
        sample_desc=args.analysis_desc,
        genomes=genomes,
        version=martian.get_pipelines_version())
    sample_properties = dict(sample_properties._asdict())

    sample_data_paths = cr_webshim_data.SampleDataPaths(
        summary_path=outs.summary,
        analysis_path=args.analysis,
    )

    sample_data = cr_webshim.load_sample_data(sample_properties, sample_data_paths)
    cr_webshim.build_web_summary_html(outs.web_summary, sample_properties,
                                      sample_data, PIPELINE_REANALYZE)

def main(args, outs):
    cr_report.merge_jsons(args.summaries, outs.metrics_summary_json)

    sample_data_paths = cr_webshim_data.SampleDataPaths(
        summary_path=outs.metrics_summary_json,
        barcode_summary_path=args.barcode_summary_h5,
        analysis_path=args.analysis,
        filtered_barcodes_path=args.filtered_barcodes,
    )

    genomes = cr_utils.get_reference_genomes(args.reference_path)
    sample_properties = CountSampleProperties(
        sample_id=args.sample_id,
        sample_desc=args.sample_desc,
        genomes=genomes,
        version=martian.get_pipelines_version())
    sample_properties = dict(sample_properties._asdict())

    sample_data = cr_webshim.load_sample_data(sample_properties, sample_data_paths)

    cr_webshim.build_web_summary_html(outs.web_summary, sample_properties,
                                      sample_data, PIPELINE_COUNT,
                                      alerts_output_filename=outs.alerts)
    cr_webshim.build_metrics_summary_csv(outs.metrics_summary_csv,
                                         sample_properties, sample_data,
                                         PIPELINE_COUNT)

def get_pipeline_info(args, reference, debug):
    """Generates a table of general pipeline information."""
    metadata = reference.metadata

    rows = [
        ['Sample ID', args.sample_id],
        ['Sample description', args.sample_desc],
        ['Pipeline version', martian.get_pipelines_version()],
        ['Reference path', args.reference_path],
    ]

    if metadata:
        rows.extend([
            ['Organism', metadata.get('organism')],
            ['Assembly', metadata.get('assembly')],
            ['Annotation', metadata.get('annotation')],
        ])

    if debug:
        rows.append(['Barcode Whitelist', args.barcode_whitelist])

    data = {'pipeline_info_table': {'rows': rows}}
    return data

def main(args, outs):
    metrics = {}
    for fname in args.metrics:
        if fname is not None:
            with open(fname, 'r') as f:
                metrics.update(json.load(f))

    # Normalize "NaN" values
    for key in metrics:
        value = metrics[key]
        if str(value) == 'NaN' or (isinstance(value, float) and np.isnan(value)):
            metrics[key] = None

    # add version info
    metrics['cellranger-atac_version'] = martian.get_pipelines_version()

    if len(metrics) > 0:
        martian.log_info('Writing out summary_metrics')
        with open(outs.metrics, 'w') as outfile:
            outfile.write(tenkit.safe_json.safe_jsonify(metrics, pretty=True))

    # compile summary.csv metrics:
    # load library info and fake libraries as species
    metric_registry = MetricAnnotations()
    metrics_csv_dict = {}
    if args.library_info is not None:
        with open(args.library_info, 'r') as f:
            library_info = pickle.load(f)
        library_list = [library_info[n]['library_id'] for n in library_info.keys()]
        metrics_csv_dict.update(
            metric_registry.compile_summary_metrics(metrics,
                                                    species_list=library_list))

    # load species level metrics
    ctg_mgr = ReferenceManager(args.reference_path)
    metrics_csv_dict.update(
        metric_registry.compile_summary_metrics(metrics,
                                                species_list=ctg_mgr.list_species()))
    write_dict_to_csv(outs.metrics_csv, metrics_csv_dict, sort=True)

def main(args, outs):
    args.coerce_strings()
    outs.coerce_strings()

    input_vfr = tk_io.VariantFileReader(args.input)

    bc_mix_prob = args.bc_mix_prob
    min_var_hap_conf = args.min_var_hap_conf
    min_junction_hap_conf = args.min_junction_hap_conf
    hap_block_size = args.hap_block_size
    hap_block_buffer_size = args.hap_block_buffer_size
    max_reassign_rounds = args.max_reassign_rounds
    chrom, start, stop = tk_io.get_locus_info(args.locus)

    output_file = open(outs.default.strip('.gz'), 'w')
    fragment_output_file = open(outs.fragment_phasing.strip('.gz'), 'w')
    vc_mode, _, _, _ = tk_io.get_vc_mode(args.vc_precalled, args.vc_mode)

    # Add the component name and the version of the phasing code
    new_source = "10X/pipelines/stages/snpindels/phase_snpindels %s" % \
        martian.get_pipelines_version()
    new_filters = [
        ("10X_PHASING_INCONSISTENT",
         "Uses haplotype information from the fragments and the alleles to filter some variants that are not consistent with phasing."),
        ("10X_HOMOPOLYMER_UNPHASED_INSERTION",
         "Unphased insertions in homopolymer regions tend to be false positives"),
    ]
    new_formats = [
        ("PS", 1, "Integer", "ID of Phase Set for Variant"),
        ("PQ", 1, "Integer",
         "Phred QV indicating probability at this variant is incorrectly phased"),
        ("JQ", 1, "Integer",
         "Phred QV indicating probability of a phasing switch error in gap prior to this variant"),
    ]
    vfw = tk_io.VariantFileWriter(output_file,
                                  template_file=open(args.input),
                                  new_source=new_source,
                                  new_format_fields=new_formats,
                                  new_filters=new_filters)

    if args.do_phasing:
        phaser = Phaser(input_vfr, args.fragments, chrom, start, stop,
                        bc_mix_prob, min_junction_hap_conf, min_var_hap_conf,
                        hap_block_buffer_size, hap_block_size,
                        max_reassign_rounds, vc_mode)
        phaser.call_haps(vfw, fragment_output_file)
    else:
        pass_variants(input_vfr, vfw, chrom, start, stop, strip_phasing_info=True)

    output_file.close()
    fragment_output_file.close()

    tk_tabix.sort_unique_tabix_vcf(outs.default.strip('.gz'))

def join_matrices(args, outs, chunk_defs, chunk_outs):
    chunk_h5s = [chunk_out.matrices_h5 for chunk_out in chunk_outs]
    matrix = cr_matrix.merge_matrices(chunk_h5s)

    matrix_attrs = cr_matrix.make_matrix_attrs_count(
        args.sample_id, args.gem_groups, cr_chem.get_description(args.chemistry_def))
    matrix.save_h5_file(outs.matrices_h5, extra_attrs=matrix_attrs)

    rna_matrix.save_mex(matrix, outs.matrices_mex, martian.get_pipelines_version())

def make_sample_info(args):
    p = {
        "sample_def": args.sample_def,
        "reference_path": args.reference_path,
        "sample_id": args.sample_id,
        "sample_desc": args.sample_desc,
        "version": martian.get_pipelines_version(),
    }
    return p

def join(args, outs, _chunk_defs, _chunk_outs):
    filtered_matrix = filter_barcodes(args, outs)

    matrix_attrs = cr_matrix.make_matrix_attrs_count(
        args.sample_id, args.gem_groups, cr_chem.get_description(args.chemistry_def))
    filtered_matrix.save_h5_file(outs.filtered_matrices_h5, extra_attrs=matrix_attrs)

    rna_matrix.save_mex(filtered_matrix,
                        outs.filtered_matrices_mex,
                        martian.get_pipelines_version())

def write_analysis_parameters(analysis_params_outfn):
    with open(analysis_params_outfn, 'w') as analysis_params_out:
        analysis_params = {
            'analysis_version': martian.get_pipelines_version(),
            # Dropping meowmix version -- we're moving to putting special
            # reference datasets into main repo
            'meowmix_version': "99.9.9",
            # Lena needs this set, even though we're not trimming
            'lead_trim': 0,
        }
        analysis_params_out.write(tenkit.safe_json.safe_jsonify(analysis_params))

def main(args, outs):
    args.coerce_strings()
    outs.coerce_strings()

    outs.raw_matrix_mex = None
    if args.fragments is None:
        outs.raw_matrix = None
        return

    with open(args.peaks, 'r') as infile:
        full_peaks = tk_bio.get_target_regions(infile)
    with open(args.peaks, 'r') as pfile:
        peaks_dict = OrderedDict(
            ("{}:{}-{}".format(*peak.strip("\n").split("\t")), num)
            for num, peak in enumerate(pfile))

    with open(args.barcodes, 'r') as barcode_file:
        barcodes_dict = OrderedDict(
            (bc.strip('\n'), num) for num, bc in enumerate(barcode_file))

    if len(barcodes_dict) == 0:
        outs.raw_matrix = None
        return

    # get matrix counts
    peak_bc_counts = Counter()
    for contig, start, stop, barcode, _ in open_fragment_file(args.fragments):
        if barcode not in barcodes_dict:
            continue
        for pos in (start, stop):
            if contig in full_peaks:
                peak = full_peaks[contig].get_region_containing_point(pos)
                if peak is not None:
                    peak_bc_counts[barcodes_dict[barcode],
                                   peaks_dict['{}:{}-{}'.format(contig, peak[0], peak[1])]] += 1

    data, col, row = (), (), ()
    if len(peak_bc_counts) > 0:
        data, col, row = zip(*[(val, key[0], key[1])
                               for key, val in peak_bc_counts.iteritems()])
    sp_matrix = csc_matrix(
        coo_matrix((data, (row, col)),
                   shape=(len(peaks_dict), len(barcodes_dict)),
                   dtype=int))

    # save as a CountMatrix
    genomes = utils.generate_genome_tag(args.reference_path)
    peaks_def = atac_feature_ref.from_peaks_bed(args.peaks, genomes)
    raw_matrix = cr_matrix.CountMatrix(peaks_def, barcodes_dict.keys(), sp_matrix)
    raw_matrix.save_h5_file(outs.raw_matrix, sw_version=martian.get_pipelines_version())

def main(args, outs):
    outs.coerce_strings()

    bam_in = tk_bam.create_bam_infile(args.bucket[0])
    bam_out, _ = tk_bam.create_bam_outfile(outs.default, None, None,
                                           template=bam_in,
                                           pg=tk_bam.make_pg_header(
                                               martian.get_pipelines_version(),
                                               "sort_reads_by_bc"))
    bam_in.close()

    outs.total_reads = merge_by_key(args.bucket, bc_sort_key, bam_out)
    bam_out.close()

def main(args, outs):
    summary = {}

    filtered_mat = cr_matrix.CountMatrix.load_h5_file(args.filtered_matrices_h5)
    genomes = filtered_mat.get_genomes()

    # get metrics from other summaries
    if args.analyze_matrices_summary:
        with open(args.analyze_matrices_summary) as reader:
            analysis_summary = json.load(reader)
        summary.update(analysis_summary)

    with open(args.normalize_depth_summary, 'r') as reader:
        summary.update(json.load(reader))
        agg_batches = summary['batches']

    with open(outs.summary, 'w') as f:
        json.dump(summary, f, indent=4, sort_keys=True)

    # build web summary
    sample_properties = AggrSampleProperties(
        sample_id=args.sample_id,
        sample_desc=args.sample_desc,
        genomes=genomes,
        version=martian.get_pipelines_version(),
        agg_batches=agg_batches)
    sample_properties = dict(sample_properties._asdict())

    sample_data_paths = cr_webshim_data.SampleDataPaths(
        summary_path=outs.summary,
        barcode_summary_path=args.barcode_summary_h5,
        analysis_path=args.analysis,
    )

    sample_data = cr_webshim.load_sample_data(sample_properties, sample_data_paths)
    cr_webshim.build_web_summary_html(outs.web_summary, sample_properties,
                                      sample_data, PIPELINE_AGGR)

def write_mask_bed(bedfile, store, chroms, window_size, ref, args):
    """Write a BED file corresponding to the mask=True regions for our profiles."""
    chroms = sorted(chroms)
    with open(bedfile, 'w') as outfile:
        version = martian.get_pipelines_version()
        outfile.write("#cellranger-dna {}\n".format(version))
        outfile.write("#reference genome: {}\n".format(args.reference_path))
        outfile.write("#chrom\tstart\tend\n")
        for chrom in chroms:
            chrom_length = ref.contig_lengths[chrom]
            mask = store['/masks/' + chrom]
            start, end = None, None
            in_mask = False
            for i in xrange(len(mask)):
                if not in_mask and mask[i]:
                    start = i
                    in_mask = True
                if in_mask and not mask[i]:
                    end = i
                    in_mask = False
                    outfile.write('\t'.join(str(s) for s in [
                        chrom,
                        start * window_size,
                        min(end * window_size, chrom_length),
                    ]) + os.linesep)
            # close out a masked run that extends to the end of the chromosome
            if in_mask and (start is not None) and \
               (end is None or end * window_size < chrom_length):
                outfile.write('\t'.join(str(s) for s in [
                    chrom,
                    start * window_size,
                    chrom_length,
                ]) + os.linesep)

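# Illustrative sketch (not pipeline code): the same mask-to-intervals scan,
# vectorized with numpy by diffing the zero-padded boolean mask. The helper
# name is hypothetical.
def _example_mask_to_intervals(mask, window_size, chrom_length):
    """Return (start, end) base-pair intervals for each run of True bins."""
    import numpy as np
    padded = np.concatenate([[False], mask, [False]])
    edges = np.flatnonzero(np.diff(padded.astype(np.int8)))
    starts, ends = edges[0::2], edges[1::2]
    return [(s * window_size, min(e * window_size, chrom_length))
            for s, e in zip(starts, ends)]
# e.g. _example_mask_to_intervals(np.array([0, 1, 1, 0, 1]) > 0, 1000, 4500)
# -> [(1000, 3000), (4000, 4500)]
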
def main(args, outs):
    summary = {}

    # add stats from matrices
    filtered_mats = cr_matrix.GeneBCMatrices.load_h5(args.filtered_matrices_h5)
    genomes = filtered_mats.get_genomes()
    cells_per_genome = {}
    for genome in genomes:
        matrix = filtered_mats.matrices[genome]
        cells_per_genome[genome] = matrix.bcs_dim
        median_gene_counts = np.median(
            matrix._sum(matrix.m >= cr_constants.MIN_READS_PER_GENE, axis=0))
        median_umi_counts = np.median(matrix._sum(matrix.m, axis=0))
        summary.update({
            '%s_filtered_bcs' % genome: cells_per_genome[genome],
            '%s_filtered_bcs_median_counts' % genome: median_umi_counts,
            '%s_filtered_bcs_median_unique_genes_detected' % genome: median_gene_counts,
        })
    del filtered_mats

    # get metrics from other summaries
    if args.analyze_matrices_summary:
        with open(args.analyze_matrices_summary) as reader:
            analysis_summary = json.load(reader)
        summary.update(analysis_summary)

    with open(args.normalize_depth_summary, 'r') as reader:
        data = json.load(reader)
        raw_conf_mapped_per_genome = data['raw_conf_mapped_per_genome']
        downsample_map = data['downsample_info']
        mol_counter_metrics = data['mol_counter_metrics']

    with open(args.count_genes_summary, 'r') as reader:
        data = json.load(reader)
        flt_conf_mapped_per_genome = data['flt_conf_mapped_per_genome']

    for genome in flt_conf_mapped_per_genome:
        frac_reads_in_cells = tk_stats.robust_divide(
            flt_conf_mapped_per_genome[genome], raw_conf_mapped_per_genome[genome])
        summary['%s_filtered_bcs_conf_mapped_barcoded_reads_cum_frac' % genome] = \
            frac_reads_in_cells

    # Pass chemistry metrics through to output
    summary.update({
        k: v for k, v in mol_counter_metrics.iteritems()
        if k.startswith('chemistry_')
    })

    # Molecule counter metrics
    gem_groups = []
    total_reads_per_gem_group = []
    downsampled_reads_per_gem_group = []
    for (gg, submetrics) in mol_counter_metrics[
            cr_mol_counter.GEM_GROUPS_METRIC].iteritems():
        gem_groups.append(gg)
        total_reads = submetrics[cr_mol_counter.GG_TOTAL_READS_METRIC]
        total_reads_per_gem_group.append(total_reads)
        # If metric is missing, assume no downsampling was done
        downsampled = submetrics.get(cr_mol_counter.GG_DOWNSAMPLED_READS_METRIC,
                                     total_reads)
        downsampled_reads_per_gem_group.append(downsampled)

    total_reads = sum(total_reads_per_gem_group)
    downsampled_reads = sum(downsampled_reads_per_gem_group)
    total_cells = sum(cells_per_genome.values())
    mean_reads_per_cell = tk_stats.robust_divide(total_reads, total_cells)
    downsampled_mean_reads_per_cell = tk_stats.robust_divide(downsampled_reads,
                                                             total_cells)
    summary.update({
        'pre_normalization_total_reads': total_reads,
        'post_normalization_total_reads': downsampled_reads,
        'filtered_bcs_transcriptome_union': total_cells,
        'pre_normalization_multi_transcriptome_total_raw_reads_per_filtered_bc': mean_reads_per_cell,
        'post_normalization_multi_transcriptome_total_raw_reads_per_filtered_bc': downsampled_mean_reads_per_cell,
    })

    # Downsampling metrics
    gem_group_index = args.gem_group_index
    agg_batches = []
    lowest_frac_reads_kept = 1.0
    for (gg, rpg) in zip(gem_groups, total_reads_per_gem_group):
        dinfo = downsample_map[str(gg)]
        (library_id, old_gg) = gem_group_index[str(gg)]
        batch = library_id + ('-%d' % old_gg if old_gg > 1 else '')
        agg_batches.append(batch)
        # calc summary metrics
        frac_reads_kept = dinfo['frac_reads_kept']
        lowest_frac_reads_kept = min(lowest_frac_reads_kept, frac_reads_kept)
        summary['%s_frac_reads_kept' % batch] = frac_reads_kept
        summary['%s_pre_normalization_raw_reads_per_filtered_bc' % batch] = \
            tk_stats.robust_divide(dinfo['total_reads'], dinfo['cells'])
        summary['%s_pre_normalization_cmb_reads_per_filtered_bc' % batch] = \
            tk_stats.robust_divide(dinfo['cmb_reads'], dinfo['cells'])
        # this is an internal metric, so keep using gem group instead of batch
        summary['%s_total_reads_per_gem_group' % gg] = frac_reads_kept * rpg

    summary['lowest_frac_reads_kept'] = lowest_frac_reads_kept

    with open(outs.summary, 'w') as f:
        json.dump(summary, f, indent=4, sort_keys=True)

    # build web summary
    sample_properties = cr_webshim.get_sample_properties(
        args.aggregation_id,
        args.aggregation_desc,
        genomes,
        version=martian.get_pipelines_version(),
        agg_batches=agg_batches)

    sample_data_paths = cr_webshim_data.SampleDataPaths(
        summary_path=outs.summary,
        barcode_summary_path=args.barcode_summary_h5,
        analysis_path=args.analysis,
    )

    sample_data = cr_webshim.load_sample_data(sample_properties, sample_data_paths)
    cr_webshim.build_web_summary_html(outs.web_summary, sample_properties,
                                      sample_data, PIPELINE_AGGR)

def main(args, outs):
    random.seed(0)

    paired_end = cr_chem.is_paired_end(args.chemistry_def)

    # Use the chemistry to get the locations of various sequences
    rna_read_def = cr_chem.get_rna_read_def(args.chemistry_def)
    rna_read2_def = cr_chem.get_rna_read2_def(args.chemistry_def)
    bc_read_def = cr_chem.get_barcode_read_def(args.chemistry_def)
    si_read_def = cr_chem.get_si_read_def(args.chemistry_def)
    umi_read_def = cr_chem.get_umi_read_def(args.chemistry_def)

    read_defs = [rna_read_def, rna_read2_def, bc_read_def, si_read_def, umi_read_def]
    read_tags = [
        None,
        None,
        (cr_constants.RAW_BARCODE_TAG, cr_constants.RAW_BARCODE_QUAL_TAG),
        (tk_constants.SAMPLE_INDEX_TAG, tk_constants.SAMPLE_INDEX_QUAL_TAG),
        (cr_constants.RAW_UMI_TAG, cr_constants.UMI_QUAL_TAG),
    ]

    # Determine which trimmed sequences need to be retained for bamtofastq
    trim_defs = get_bamtofastq_defs(read_defs, read_tags)
    outs.bam_comments = sorted(set(trim_defs.itervalues()))

    gem_groups = [chunk['gem_group'] for chunk in args.chunks]
    reporter = cr_report.Reporter(
        umi_length=cr_chem.get_umi_length(args.chemistry_def),
        primers=cr_utils.get_primers_from_dicts(args.primers),
        gem_groups=gem_groups)

    # Determine if barcode sequences need to be reverse complemented.
    bc_check_rc = FastqReader(args.read_chunks, bc_read_def,
                              args.reads_interleaved, None, None)
    barcode_whitelist = cr_utils.load_barcode_whitelist(args.barcode_whitelist)
    barcode_rc = infer_barcode_reverse_complement(barcode_whitelist,
                                                  bc_check_rc.in_iter)
    bc_check_rc.close()

    # Log the untrimmed read lengths to stdout
    r1_read_def = cr_constants.ReadDef(rna_read_def.read_type, 0, None)
    r1_reader = FastqReader(args.read_chunks, r1_read_def,
                            args.reads_interleaved, None, None)

    r1_untrimmed_len = 0
    for read in itertools.islice(r1_reader.in_iter,
                                 cr_constants.DETECT_CHEMISTRY_INITIAL_READS):
        r1_untrimmed_len = max(r1_untrimmed_len, len(read[1]))
    print "Read 1 untrimmed length = ", r1_untrimmed_len
    print "Input arg r1_length = ", args.r1_length
    r1_reader.close()

    if paired_end:
        r2_read_def = cr_constants.ReadDef(rna_read2_def.read_type, 0, None)
        r2_reader = FastqReader(args.read_chunks, r2_read_def,
                                args.reads_interleaved, None, None)
        r2_untrimmed_len = 0
        for read in itertools.islice(r2_reader.in_iter,
                                     cr_constants.DETECT_CHEMISTRY_INITIAL_READS):
            r2_untrimmed_len = max(r2_untrimmed_len, len(read[1]))
        print "Read 2 untrimmed length = ", r2_untrimmed_len
        print "Input arg r2_length = ", args.r2_length
        r2_reader.close()

    # Setup read iterators.
    r1_length = args.r1_length
    r2_length = args.r2_length

    rna_reads = FastqReader(args.read_chunks, rna_read_def,
                            args.reads_interleaved, r1_length, r2_length)
    rna_read2s = FastqReader(args.read_chunks, rna_read2_def,
                             args.reads_interleaved, r1_length, r2_length)
    bc_reads = FastqReader(args.read_chunks, bc_read_def,
                           args.reads_interleaved, r1_length, r2_length)
    si_reads = FastqReader(args.read_chunks, si_read_def,
                           args.reads_interleaved, r1_length, r2_length)

    if cr_chem.has_umis(args.chemistry_def):
        umi_reads = FastqReader(args.read_chunks, umi_read_def,
                                args.reads_interleaved, r1_length, r2_length)
    else:
        umi_reads = FastqReader(None, None, False, r1_length, r2_length)

    fastq_readers = (rna_reads, rna_read2s, bc_reads, si_reads, umi_reads)

    read1_writer = ChunkedFastqWriter(outs.reads, args.reads_per_file,
                                      compression=COMPRESSION)
    if paired_end:
        read2_writer = ChunkedFastqWriter(outs.read2s, args.reads_per_file,
                                          compression=COMPRESSION)

    bc_counter = BarcodeCounter(args.barcode_whitelist, outs.barcode_counts)

    all_read_iter = itertools.izip_longest(
        *[reader.in_iter for reader in fastq_readers])

    EMPTY_READ = (None, '', '')

    reporter.extract_reads_init()

    for extractions in itertools.islice(all_read_iter, args.initial_reads):
        # Downsample
        if random.random() > args.subsample_rate:
            continue

        rna_extraction, rna2_extraction, bc_extraction, si_extraction, umi_extraction = extractions

        rna_read = rna_extraction if rna_extraction is not None else EMPTY_READ
        rna_read2 = rna2_extraction if rna2_extraction is not None else EMPTY_READ
        bc_read = bc_extraction if bc_extraction is not None else EMPTY_READ
        si_read = si_extraction if si_extraction is not None else EMPTY_READ
        umi_read = umi_extraction if umi_extraction is not None else EMPTY_READ

        if (not rna_read[1]) or (paired_end and (not rna_read2[1])):
            # Read 1 is empty or read 2 is empty (if paired_end);
            # empty reads cause issues with the STAR aligner, so eliminate them here
            continue

        if bc_read != EMPTY_READ:
            # Reverse complement the barcode if necessary
            if barcode_rc:
                bc_read = (bc_read[0], tk_seq.get_rev_comp(bc_read[1]),
                           bc_read[2][::-1])
            # Track the barcode count distribution
            bc_counter.count(*bc_read)

        # Calculate metrics on raw sequences
        reporter.raw_fastq_cb(rna_read, rna_read2, bc_read, si_read, umi_read,
                              args.gem_group, skip_metrics=args.skip_metrics)

        # Construct new fastq headers
        fastq_header1 = AugmentedFastqHeader(rna_read[0])
        fastq_header1.set_tag(tk_constants.SAMPLE_INDEX_TAG, si_read[1])
        fastq_header1.set_tag(tk_constants.SAMPLE_INDEX_QUAL_TAG, si_read[2])
        fastq_header1.set_tag(cr_constants.RAW_BARCODE_TAG, bc_read[1])
        fastq_header1.set_tag(cr_constants.RAW_BARCODE_QUAL_TAG, bc_read[2])
        fastq_header1.set_tag(cr_constants.RAW_UMI_TAG, umi_read[1])
        fastq_header1.set_tag(cr_constants.UMI_QUAL_TAG, umi_read[2])
        fastq_header_str1 = fastq_header1.to_string()

        read1_writer.write((fastq_header_str1, rna_read[1], rna_read[2]))

        if paired_end:
            fastq_header2 = AugmentedFastqHeader(rna_read2[0])
            fastq_header2.set_tag(tk_constants.SAMPLE_INDEX_TAG, si_read[1])
            fastq_header2.set_tag(tk_constants.SAMPLE_INDEX_QUAL_TAG, si_read[2])
            fastq_header2.set_tag(cr_constants.RAW_BARCODE_TAG, bc_read[1])
            fastq_header2.set_tag(cr_constants.RAW_BARCODE_QUAL_TAG, bc_read[2])
            fastq_header2.set_tag(cr_constants.RAW_UMI_TAG, umi_read[1])
            fastq_header2.set_tag(cr_constants.UMI_QUAL_TAG, umi_read[2])
            read2_writer.write((fastq_header2.to_string(), rna_read2[1], rna_read2[2]))

    reporter.extract_reads_finalize()

    # Close input and output files.
    rna_reads.close()
    if paired_end:
        rna_read2s.close()
    bc_reads.close()
    si_reads.close()
    umi_reads.close()

    read1_writer.close()
    if paired_end:
        read2_writer.close()
    bc_counter.close()

    # Set stage output parameters.
    if len(read1_writer.file_paths) > 0:
        outs.reads = read1_writer.get_out_paths()
        if paired_end:
            outs.read2s = read2_writer.get_out_paths(len(outs.reads))
        else:
            outs.read2s = []
        outs.gem_groups = [args.gem_group] * len(outs.reads)
        outs.read_groups = [args.read_group] * len(outs.reads)
    else:
        outs.reads = []
        outs.read2s = []
        outs.gem_groups = []
        outs.read_groups = []

    assert len(outs.gem_groups) == len(outs.reads)
    if paired_end:
        assert len(outs.reads) == len(outs.read2s)

    # this is the first reporter stage, so store the pipeline metadata
    reporter.store_pipeline_metadata(martian.get_pipelines_version())

    reporter.save(outs.chunked_reporter)

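# Illustrative sketch of the tag-in-header idea used above. The real
# AugmentedFastqHeader lives in the cellranger library; the delimiter and
# layout below are assumptions chosen only to show the mechanism.
class _ExampleTaggedHeader(object):
    """Append (tag, value) pairs to a FASTQ read name so per-read metadata
    (barcode, UMI, sample index) survives tools that only preserve the
    read name."""
    SEP = '|||'

    def __init__(self, header):
        self.header = header
        self.tags = []

    def set_tag(self, key, value):
        self.tags.append((key, value))

    def to_string(self):
        parts = [self.header] + ['%s%s%s' % (k, self.SEP, v)
                                 for k, v in self.tags]
        return self.SEP.join(parts)
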
def main(args, outs): """Mark exact duplicate reads in the output BAM file while also writing out some summary statistics. PCR duplicates have the same read1 start site and read2 start site. """ args.coerce_strings() outs.coerce_strings() # Chunk output doesn't get indexed outs.fragments_index = None outs.index = None # Pull in prior likelihoods for barcodes raw_barcode_abundance = None barcode_whitelist = load_barcode_whitelist(args.barcode_whitelist) if args.raw_barcode_counts is not None and barcode_whitelist is not None: with open(args.raw_barcode_counts, 'r') as infile: raw_counts = json.load(infile) raw_barcode_abundance = { '{}-{}'.format(barcode, gem_group): count for gem_group, subdict in raw_counts.iteritems() for barcode, count in zip(barcode_whitelist, subdict['bc_counts']) } bam_in = create_bam_infile(args.input) bam_refs = bam_in.references bam_prefix, ext = os.path.splitext(outs.output) raw_bam_file = martian.make_path(bam_prefix + '_five_prime_pos_sorted' + ext) frag_prefix, ext = os.path.splitext(outs.fragments) raw_frag_file = martian.make_path(frag_prefix + '_raw' + ext) # only write CO line for one chunk, so we don't have duplicates after samtools merge if args.chunk_num == 0: COs = [ '10x_bam_to_fastq:R1(SEQ:QUAL,TR:TQ)', '10x_bam_to_fastq:R2(SEQ:QUAL,TR:TQ)', '10x_bam_to_fastq:I1(BC:QT)', '10x_bam_to_fastq:I2(CR:CY)', '10x_bam_to_fastq_seqnames:R1,R3,I1,R2' ] else: COs = None bam_out, _ = tk_bam.create_bam_outfile( raw_bam_file, None, None, template=bam_in, pgs=[ tk_bam.make_pg_header(martian.get_pipelines_version(), "mark_duplicates", TENX_PRODUCT_NAME) ], cos=COs) fragments_out = open(raw_frag_file, 'w') bam_in.reset() # Ensure the summary key indicates what kind of dup marking was actually performed. lane_coord_sys = tk_lane.LaneCoordinateSystem.from_dict(args.lane_map) reference_manager = ReferenceManager(args.reference_path) summarizer = DupSummary(split_bcs=False, lane_coordinate_system=lane_coord_sys, output_bam=bam_out, output_tsv=fragments_out, ref=reference_manager, bam_refs=bam_refs, priors=raw_barcode_abundance) # Now broadcast the selected reads to the summarizers consumers = [summarizer.read_consumer()] source = tk_bam.read_bam_chunk(bam_in, (args.chunk_start, args.chunk_end)) broadcast(source, consumers) # Close outfiles bam_out.close() fragments_out.close() # Feed the chunk barcode_counts data back to join() with open(outs.singlecell_mapping, 'w') as outfile: pickle.dump(summarizer.bc_counts, outfile) # Sort the output bam & tsv files sort_bam(raw_bam_file, outs.output, threads=martian.get_threads_allocation()) sort_bed(raw_frag_file, outs.fragments, genome=reference_manager.fasta_index, threads=martian.get_threads_allocation(), leave_key=True)
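# Illustrative sketch of the duplicate criterion from the docstring above
# (same read1 start site and same read2 start site). The real DupSummary
# also accounts for strand, barcodes, and optical/diffusion duplicates;
# the pysam-style attribute names below are the only assumption.
def _example_mark_dups(reads):
    seen = set()
    for read in reads:
        # 5' positions of the read and its mate identify a PCR duplicate group
        key = (read.reference_id, read.reference_start, read.next_reference_start)
        read.is_duplicate = key in seen
        seen.add(key)
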
def main(args, outs): """ Attaches barcodes. Attaches raw barcode to RAW_BC tag and filters those to form set of PROCESSES_BARCODES """ chunk = args.chunk #subsample_rate = 1.0 #if args.subsample_rate is not None: # subsample_rate = args.subsample_rate bam_in = tk_bam.create_bam_infile(args.align_chunk) bam_out, tids = tk_bam.create_bam_outfile(outs.output, None, None, template=bam_in, pgs=tk_bam.make_pg_header(martian.get_pipelines_version(), "attach_bcs")) if args.barcode_whitelist is None or args.bc_counts is None: # If there's no whitelist or counts then all high quality BC reads get allowed. barcode_whitelist = None wl_idxs = None bc_dist = None else: barcode_whitelist = tk_seq.load_barcode_whitelist(args.barcode_whitelist) # Load the bc counts for this GEM group counts = json.load(open(args.bc_counts, 'r')) counts = counts[str(chunk['gem_group'])]['bc_counts'] # Prior distribution over barcodes, with pseudo-count bc_dist = np.array(counts, dtype=np.float) + 1.0 bc_dist = bc_dist / bc_dist.sum() wl_idxs = { bc:idx for (idx,bc) in enumerate(sorted(list(barcode_whitelist))) } # set random seed to get deterministic subsampling random.seed(0) def open_maybe_gzip(fn): if fn[-2:] == "gz": return gzip.open(fn) else: return open(fn) if chunk['barcode']: processed_barcode_iter = get_raw_processed_barcodes(open_maybe_gzip(chunk['barcode']), barcode_whitelist, args.bc_confidence_threshold, chunk['gem_group'], chunk['barcode_reverse_complement'], wl_idxs, bc_dist) require_barcode_for_stringent = True else: processed_barcode_iter = itertools.repeat(None) require_barcode_for_stringent = False if chunk['sample_index']: sample_index_iter = tk_fasta.read_generator_fastq(open_maybe_gzip(chunk['sample_index'])) else: sample_index_iter = itertools.repeat(None) iters = itertools.izip(processed_barcode_iter, sample_index_iter) # First read read = bam_in.next() # Number of perfect reads -- used to compute down-sampling rates in mark_duplicates perfect_read_count = 0 # Due to secondary alignments, we must apply the tags to all # reads with the same cluster name. 
for (barcode_info, sample_index_info) in iters: tags = [] read_name = None if read is None: break if barcode_info: (bc_read_name, raw_bc_seq, processed_bc_seq, raw_bc_qual) = barcode_info tags.append((RAW_BARCODE_TAG, raw_bc_seq)) tags.append((RAW_BARCODE_QUAL_TAG, raw_bc_qual)) if processed_bc_seq is not None: tags.append((PROCESSED_BARCODE_TAG, processed_bc_seq)) read_name = bc_read_name.split()[0] if sample_index_info: (si_read_name, seq, qual) = sample_index_info tags.append((SAMPLE_INDEX_TAG, seq)) tags.append((SAMPLE_INDEX_QUAL_TAG, qual)) if read_name != None: if si_read_name.split()[0] != read_name: martian.log_info("mismatch: si_read_name: %s, bam_read_name: %s" % (si_read_name, read_name)) assert(si_read_name.split()[0] == read_name) else: read_name = si_read_name.split()[0] reads_attached = 0 #emit_read_pair = random.random() < subsample_rate emit_read_pair = True while read.qname == read_name or read_name == None: if len(tags) > 0: existing_tags = read.tags existing_tags.extend(tags) read.tags = existing_tags reads_attached += 1 if not (read_name is None): assert(read.qname == read_name) if emit_read_pair: # Count the perfect reads -- will be used when subsampling in dedup if tenkit.read_filter.stringent_read_filter(read, require_barcode_for_stringent): perfect_read_count += 1 if args.exclude_non_bc_reads: if not(tk_io.get_read_barcode(read) is None): bam_out.write(read) else: bam_out.write(read) try: read = bam_in.next() except StopIteration: read = None break # We may have more than 2 reads is there was a # secondary alignment, but less than 2 means # something went wrong assert(reads_attached >= 2) outs.perfect_read_count = perfect_read_count bam_out.close()
def join(args, outs, chunk_defs, chunk_outs):
    summary_files = [
        args.reads_summary,
        args.filter_umis_summary,
        args.filter_barcodes_summary,
        args.trim_reads_summary,
        args.filter_reads_summary,
        args.filter_contigs_summary,
        args.report_contigs_summary,
        args.report_contig_alignments_summary,
        args.raw_consensus_summary,
        args.group_clonotypes_summary,
    ]

    summary_files = [sum_file for sum_file in summary_files
                     if sum_file is not None]

    cr_report.merge_jsons(summary_files, outs.metrics_summary_json)

    # Copy barcode summary h5
    if args.barcode_summary:
        cr_utils.copy(args.barcode_summary, outs.barcode_summary)

    # Copy cell barcodes
    if args.cell_barcodes:
        cr_utils.copy(args.cell_barcodes, outs.cell_barcodes)

    # Copy barcode support
    if args.barcode_support:
        cr_utils.copy(args.barcode_support, outs.barcode_support)

    # Copy barcode umi summary
    if args.barcode_umi_summary:
        cr_utils.copy(args.barcode_umi_summary, outs.barcode_umi_summary)

    # Copy umi info
    if args.umi_info:
        cr_utils.copy(args.umi_info, outs.umi_info)

    sample_data_paths = cr_webshim_data.SampleDataPaths(
        summary_path=outs.metrics_summary_json,
        barcode_summary_path=args.barcode_summary,
        vdj_clonotype_summary_path=args.clonotype_summary,
        vdj_barcode_support_path=args.barcode_support,
    )

    # Determine chain type for the report
    if args.chain_type_spec == vdj_constants.AUTO_CHAIN_TYPE:
        chain_type = args.chain_type_auto
    elif args.chain_type_spec == vdj_constants.ALL_CHAIN_TYPES:
        chain_type = None
    else:
        chain_type = args.chain_type_spec

    sample_properties = VdjSampleProperties(
        sample_id=args.sample_id,
        sample_desc=args.sample_desc,
        chain_type=chain_type,
        version=martian.get_pipelines_version())
    sample_properties = dict(sample_properties._asdict())

    sample_data = cr_webshim.load_sample_data(sample_properties, sample_data_paths)

    if args.barcode_whitelist is not None:
        cr_webshim.build_web_summary_html(outs.web_summary, sample_properties,
                                          sample_data, PIPELINE_VDJ,
                                          alerts_output_filename=outs.alerts)
        cr_webshim.build_metrics_summary_csv(outs.metrics_summary_csv,
                                             sample_properties, sample_data,
                                             PIPELINE_VDJ)

def join(args, outs, chunk_defs, chunk_outs):
    ## merge gc params jsons
    node_gc_params = {}
    sc_gc_params = json.load(open(args.sc_gc_params, "r"))
    internal_gc_params = json.load(open(args.internal_gc_params, "r"))
    ncells = len(sc_gc_params['linear'])
    nnodes = 2 * ncells - 1
    for key in ["scale", "linear", "quadratic"]:
        node_gc_params[key] = sc_gc_params[key] + internal_gc_params[key]
    with open(outs.node_gc_params, "w") as out:
        json.dump(node_gc_params, out, indent=4)

    ref = contig_manager.contig_manager(args.reference_path)
    chroms = ref.primary_contigs(allow_sex_chromosomes=True)
    index_chrom = dict([(str(i), c) for i, c in enumerate(chroms)])
    chrom_index = dict([(c, str(i)) for i, c in enumerate(chroms)])

    tmp = martian.make_path('tmp.bed')
    tmp_dir = os.path.dirname(tmp)
    tmp_sorted = martian.make_path('tmp_sorted.bed')

    # renamed from `calls` so the outer list isn't shadowed by the loop variable
    call_sets = [[args.sc_cnv_calls, args.internal_cnv_calls],
                 [args.sc_unmerged_cnv_calls, args.internal_unmerged_cnv_calls]]
    out_calls = [outs.node_cnv_calls, outs.node_unmerged_cnv_calls]
    for calls, out in zip(call_sets, out_calls):
        with open(tmp, 'w') as outf:
            for f in calls:
                for l in open(f):
                    fields = l.split()
                    # offset internal node indices by ncells
                    if f == calls[1]:
                        fields[3] = str(int(fields[3]) + ncells)
                    # fix type of confidence field to integer
                    fields[-1] = str(int(float(fields[-1])))
                    # replace index number at start for sorting
                    fields[0] = chrom_index[fields[0]]
                    outf.write('\t'.join(fields) + '\n')

        no_unicode = dict(LC_ALL='C')
        tmp_mem_gib = max(1, int(np.ceil(float(os.path.getsize(tmp)) / (1024 ** 3))))
        try:
            subprocess.check_call(['sort', '-k1,1n', '-k2,2n', '-k3,3n',
                                   '--parallel=1',  # force sort to use 1 thread
                                   '-S', '{}G'.format(tmp_mem_gib),
                                   '-T', tmp_dir,
                                   '-o', tmp_sorted, tmp],
                                  env=no_unicode, stderr=sys.stderr)
        # on some systems, --parallel is unavailable
        except subprocess.CalledProcessError:
            subprocess.check_call(['sort', '-k1,1n', '-k2,2n', '-k3,3n',
                                   # will by default only use 1 thread
                                   '-S', '{}G'.format(tmp_mem_gib),
                                   '-T', tmp_dir,
                                   '-o', tmp_sorted, tmp],
                                  env=no_unicode, stderr=sys.stderr)

        # strip index column into outfile
        with open(out, 'w') as outf:
            version = martian.get_pipelines_version()
            outf.write("#cellranger-dna {}\n".format(version))
            outf.write("#reference genome: {}\n".format(args.reference_path))
            outf.write("#chrom\tstart\tend\tid\tcopy_number\tevent_confidence\n")
            for l in open(tmp_sorted):
                l = l.split('\t')
                l[0] = index_chrom[l[0]]
                outf.write('\t'.join(l))
    os.remove(tmp)
    os.remove(tmp_sorted)

    ## cnv tracks file
    sc_windows = load_h5(args.sc_cnv_tracks, "windows")
    internal_windows = load_h5(args.internal_cnv_tracks, "windows")
    windows = sc_windows.append(internal_windows).values
    constants = load_h5(args.sc_cnv_tracks, "constants")
    sc_ploidy_conf = scale_confidence_score(
        load_h5(args.sc_cnv_tracks, "ploidy_conf").values)
    internal_ploidy_conf = scale_confidence_score(
        load_h5(args.internal_cnv_tracks, "ploidy_conf").values)
    sc_scale_factor = load_h5(args.sc_cnv_tracks, "scale_factor")
    internal_scale_factor = load_h5(args.internal_cnv_tracks, "scale_factor")
    sc_rpb = load_h5(args.sc_cnv_tracks, "reads_per_bin")
    internal_rpb = load_h5(args.internal_cnv_tracks, "reads_per_bin")

    X = load_h5(args.sc_cnv_tracks, "cnv_tracks").values
    nbins = X.shape[1]
    Q = np.zeros((nnodes, nbins), dtype=X.dtype)
    Q[0:ncells, :] = X
    del X
    Q[ncells:, :] = load_h5(args.internal_cnv_tracks, "cnv_tracks").values

    store = pd.HDFStore(outs.node_cnv_tracks, "w")
    store["constants"] = constants
    store["windows"] = sc_windows.append(internal_windows)
    store["ploidy_conf"] = sc_ploidy_conf.append(internal_ploidy_conf)
    store["scale_factor"] = sc_scale_factor.append(internal_scale_factor)
    store["reads_per_bin"] = sc_rpb.append(internal_rpb)
    store["cnv_tracks"] = pd.DataFrame(Q)
    store.close()

    ## Compute heterogeneity and store in tree_data
    ref = contig_manager.contig_manager(args.reference_path)
    chroms = ref.primary_contigs(allow_sex_chromosomes=True)
    if args.tracks is None:
        gmask = np.ones(nbins, dtype=bool)
    else:
        gmask = []
        maptrack = pd.HDFStore(args.tracks, "r")
        for chrom in chroms:
            gmask.extend(maptrack["/map/" + chrom].values > MAPPABILITY_THRESHOLD)
        maptrack.close()
        gmask = np.array(gmask)

    ## update tree data
    # load tree
    store = pd.HDFStore(args.tree_data, "r")
    Z = store["/Z"].values
    distances = store["/distances"].values
    constants = store["/constants"]
    store.close()

    # Compute the heterogeneity at every *internal* node of the tree;
    # obviously the heterogeneity is zero at every leaf, so don't
    # store a bunch of zeros
    levels = 6
    het = compute_heterogeneity(Q, Z, gmask, windows, levels=levels)
    del Q

    # dump to disk
    store = pd.HDFStore(outs.tree_data, "w")
    store["Z"] = pd.DataFrame(Z)
    store["het"] = pd.DataFrame(het)
    store["distances"] = pd.Series(distances)
    store["windows"] = pd.Series(windows)
    store["constants"] = constants
    store.close()
    del het

    ## normalized profiles
    sc_store = pd.HDFStore(args.sc_norm_profiles, "r")
    internal_store = pd.HDFStore(args.internal_norm_profiles, "r")
    out_store = pd.HDFStore(outs.norm_node_profiles, "w")
    out_store["/constants"] = sc_store["/constants"]
    for chrom in chroms:
        ## first do the /contigs
        X = sc_store["/contigs/" + chrom].values
        Y = internal_store["/contigs/" + chrom].values
        assert X.shape[1] == Y.shape[1]
        nbins = X.shape[1]
        Z = np.zeros((2 * ncells - 1, nbins), dtype=X.dtype)
        Z[:ncells, :] = X
        Z[ncells:, :] = Y
        out_store["/contigs/" + chrom] = pd.DataFrame(Z)
        del X, Y, Z
        ## next do the /masks
        out_store["/masks/" + chrom] = sc_store["/masks/" + chrom]

    ## gc params
    for key in ["scale", "linear", "quadratic"]:
        out_store["/gc_params/" + key] = pd.concat(
            [sc_store["/gc_params/" + key], internal_store["/gc_params/" + key]],
            ignore_index=True)

    ## do the normalization metrics
    out_store["/normalization_metrics"] = sc_store["normalization_metrics"].append(
        internal_store["/normalization_metrics"], ignore_index=True)

    out_store.close()
    sc_store.close()
    internal_store.close()

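# Illustrative sketch (not pipeline code) of the chrom-index sort trick used
# above: replacing contig names with their numeric rank lets a plain numeric
# `sort` order contigs in reference order (chr2 before chr10), and the rank
# is mapped back to the name afterwards.
def _example_chrom_index_sort():
    chroms = ['chr1', 'chr2', 'chr10']
    chrom_index = dict((c, str(i)) for i, c in enumerate(chroms))
    index_chrom = dict((str(i), c) for i, c in enumerate(chroms))
    rows = [['chr10', '100'], ['chr2', '5']]
    encoded = sorted(([chrom_index[r[0]]] + r[1:] for r in rows),
                     key=lambda r: (int(r[0]), int(r[1])))
    decoded = [[index_chrom[r[0]]] + r[1:] for r in encoded]
    assert decoded == [['chr2', '5'], ['chr10', '100']]
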
def main(args, outs): """ Mark exact duplicate reads in the BAM file. Duplicates have the same read1 start site and read2 start site """ lane_coord_sys = tk_lane.LaneCoordinateSystem.from_dict(args.lane_map) args.coerce_strings() outs.coerce_strings() bam_in = tk_bam.create_bam_infile(args.input) template = BamTemplateShim(bam_in, keep_comments=(args.chunk_index==0)) if args.write_bam: bam_prefix, ext = os.path.splitext(outs.output) out_bam_name = bam_prefix + '_five_prime_pos_sorted' + ext bam_out, _ = tk_bam.create_bam_outfile(out_bam_name, None, None, template=template, pgs=[tk_bam.make_pg_header(martian.get_pipelines_version(), "mark_duplicates")]) outs.index = None # chunk bams don't get indexed else: bam_out = None outs.output = None outs.index = None # Determine whether the BAM has 10x barcodes bam_in.reset() has_barcodes = [crdna_io.read_has_barcode(x) for x in itertools.islice(bam_in, 1000)] have_barcodes = (float(sum(has_barcodes)) / len(has_barcodes)) > 0.1 # All read duplicate marking - these dup decisions are written to bam_out # the output bam has BC aware dup marking if available. # Ensure the summary key indicates what kind of dup marking was actually performed. if have_barcodes: no_filter_dups_bcs = DupSummary(False, 1.0, True, "no_filter_full_use_bcs", lane_coord_sys, output_bam=bam_out, threshold=args.diffusion_threshold) no_filter_dups_no_bcs = DupSummary(False, 1.0, False, "no_filter_full_ignore_bcs", lane_coord_sys, threshold=args.diffusion_threshold) else: no_filter_dups_bcs = DupSummary(False, 1.0, True, "no_filter_full_use_bcs", lane_coord_sys, threshold=args.diffusion_threshold) no_filter_dups_no_bcs = DupSummary(False, 1.0, False, "no_filter_full_ignore_bcs", lane_coord_sys, output_bam=bam_out, threshold=args.diffusion_threshold) # Dup marking on all perfect reads full_dups_bcs = DupSummary(True, 1.0, True, "full_use_bcs", lane_coord_sys, threshold=args.diffusion_threshold, tag_counts=True) full_dups_no_bcs = DupSummary(True, 1.0, False, "full_ignore_bcs", lane_coord_sys, threshold=args.diffusion_threshold) dup_sums = [full_dups_bcs, full_dups_no_bcs, no_filter_dups_bcs, no_filter_dups_no_bcs] # Now broadcast the selected reads to the summarizers # We can't do the points the require a sample_rate > 1.0 so, skip those. # If we don't have barcodes, don't run the set that are split by barcode. consumers = [x.read_consumer() for x in dup_sums if x.sample_rate <= 1.0 and ((not x.split_bcs) or have_barcodes)] source = tk_bam.read_bam_chunk(bam_in, (args.chunk_start, args.chunk_end)) broadcast(source, consumers) # We close the BAM if bam_out: bam_out.close() # Note - the indexing happens in join bam_prefix, _ = os.path.splitext(outs.output) tk_bam.sort(out_bam_name, bam_prefix) # Package up the summaries: dup_results = {} for x in dup_sums: (dups, optical_dups, diff_dups, custom_diff_dups) = x.result desc = x.description dup_results[desc] = dups optical_desc = "optical_" + desc dup_results[optical_desc] = optical_dups diff_desc = "diffusion_old_" + desc dup_results[diff_desc] = diff_dups custom_diff_desc = "diffusion_" + desc dup_results[custom_diff_desc] = custom_diff_dups if outs.duplicate_summary: with open(outs.duplicate_summary, 'w') as f: json.dump(dup_results, f, indent=4)
def main(args, outs):
    random.seed(0)

    paired_end = cr_chem.is_paired_end(args.chemistry_def)

    # Build the feature reference
    if args.reference_path:
        feature_ref = rna_feature_ref.from_transcriptome_and_csv(
            args.reference_path, args.feature_reference)
    else:
        feature_ref = rna_feature_ref.FeatureReference.empty()

    # Setup feature barcode extraction
    feature_extractor = rna_feature_ref.FeatureExtractor(
        feature_ref, use_feature_types=[args.library_type])

    # Use the chemistry to get the locations of various sequences
    rna_read_def = cr_chem.get_rna_read_def(args.chemistry_def)
    rna_read2_def = cr_chem.get_rna_read2_def(args.chemistry_def)
    bc_read_def = cr_chem.get_barcode_read_def(args.chemistry_def)
    si_read_def = cr_chem.get_si_read_def(args.chemistry_def)
    umi_read_def = cr_chem.get_umi_read_def(args.chemistry_def)

    read_defs = [rna_read_def, rna_read2_def, bc_read_def, si_read_def, umi_read_def]
    read_tags = [
        None,
        None,
        (cr_constants.RAW_BARCODE_TAG, cr_constants.RAW_BARCODE_QUAL_TAG),
        (tk_constants.SAMPLE_INDEX_TAG, tk_constants.SAMPLE_INDEX_QUAL_TAG),
        (cr_constants.RAW_UMI_TAG, cr_constants.UMI_QUAL_TAG),
    ]

    # Determine which trimmed sequences need to be retained for bamtofastq
    trim_defs = get_bamtofastq_defs(read_defs, read_tags)
    outs.bam_comments = sorted(set(trim_defs.itervalues()))

    num_libraries = len(args.library_info)
    reporter = cr_report.Reporter(
        umi_length=cr_chem.get_umi_length(args.chemistry_def),
        primers=cr_utils.get_primers_from_dicts(args.primers),
        num_libraries=num_libraries)

    # Determine if barcode sequences need to be reverse complemented.
    with FastqReader(args.read_chunks, bc_read_def, args.reads_interleaved,
                     None, None) as bc_check_rc:
        barcode_whitelist = cr_utils.load_barcode_whitelist(
            args.barcode_whitelist, True)
        barcode_rc = infer_barcode_reverse_complement(barcode_whitelist,
                                                      bc_check_rc.in_iter)

    # Log the untrimmed read lengths to stdout
    r1_read_def = cr_constants.ReadDef(rna_read_def.read_type, 0, None)
    r1_reader = FastqReader(args.read_chunks, r1_read_def,
                            args.reads_interleaved, None, None)

    r1_untrimmed_len = 0
    for read in itertools.islice(r1_reader.in_iter,
                                 cr_constants.DETECT_CHEMISTRY_INITIAL_READS):
        r1_untrimmed_len = max(r1_untrimmed_len, len(read[1]))
    print "Read 1 untrimmed length = ", r1_untrimmed_len
    print "Input arg r1_length = ", args.r1_length
    r1_reader.close()

    if paired_end:
        r2_read_def = cr_constants.ReadDef(rna_read2_def.read_type, 0, None)
        r2_reader = FastqReader(args.read_chunks, r2_read_def,
                                args.reads_interleaved, None, None)
        r2_untrimmed_len = 0
        for read in itertools.islice(r2_reader.in_iter,
                                     cr_constants.DETECT_CHEMISTRY_INITIAL_READS):
            r2_untrimmed_len = max(r2_untrimmed_len, len(read[1]))
        print "Read 2 untrimmed length = ", r2_untrimmed_len
        print "Input arg r2_length = ", args.r2_length
        r2_reader.close()

    # Setup read iterators.
    r1_length = args.r1_length
    r2_length = args.r2_length

    rna_reads = FastqReader(args.read_chunks, rna_read_def,
                            args.reads_interleaved, r1_length, r2_length)
    rna_read2s = FastqReader(args.read_chunks, rna_read2_def,
                             args.reads_interleaved, r1_length, r2_length)
    bc_reads = FastqReader(args.read_chunks, bc_read_def,
                           args.reads_interleaved, r1_length, r2_length)
    si_reads = FastqReader(args.read_chunks, si_read_def,
                           args.reads_interleaved, r1_length, r2_length)

    if cr_chem.has_umis(args.chemistry_def):
        umi_reads = FastqReader(args.read_chunks, umi_read_def,
                                args.reads_interleaved, r1_length, r2_length)
    else:
        umi_reads = FastqReader(None, None, False, r1_length, r2_length)

    # Record feature counts:
    feature_counts = np.zeros(feature_ref.get_num_features(), dtype=int)

    # If this library type has no feature barcodes, make the reader a NOOP
    if feature_extractor.has_features_to_extract():
        feature_reads = FastqFeatureReader(args.read_chunks, feature_extractor,
                                           args.reads_interleaved,
                                           r1_length, r2_length)
    else:
        feature_reads = FastqReader(None, None, None, r1_length, r2_length)

    fastq_readers = (rna_reads, rna_read2s, bc_reads, si_reads,
                     umi_reads, feature_reads)

    read1_writer = ChunkedFastqWriter(outs.reads, args.reads_per_file,
                                      compression=COMPRESSION)
    if paired_end:
        read2_writer = ChunkedFastqWriter(outs.read2s, args.reads_per_file,
                                          compression=COMPRESSION)

    tag_writer = None
    if not args.augment_fastq:
        tag_writer = ChunkedFastqWriter(outs.tags, args.reads_per_file,
                                        compression=COMPRESSION)

    bc_counter = BarcodeCounter(args.barcode_whitelist, outs.barcode_counts)

    all_read_iter = itertools.izip_longest(
        *[reader.in_iter for reader in fastq_readers])

    EMPTY_READ = (None, '', '')

    reporter.extract_reads_init()

    # Index of this library within library_info (loop-invariant, so hoisted
    # out of the per-read loop)
    lib_idx = [i for i, x in enumerate(args.library_info)
               if x['library_id'] == args.library_id][0]

    for extractions in itertools.islice(all_read_iter, args.chunk_initial_reads):
        # Downsample
        if random.random() > args.chunk_subsample_rate:
            continue

        rna_extraction, rna2_extraction, bc_extraction, si_extraction, umi_extraction, feature_extraction = extractions

        rna_read = rna_extraction if rna_extraction is not None else EMPTY_READ
        rna_read2 = rna2_extraction if rna2_extraction is not None else EMPTY_READ
        bc_read = bc_extraction if bc_extraction is not None else EMPTY_READ
        si_read = si_extraction if si_extraction is not None else EMPTY_READ
        umi_read = umi_extraction if umi_extraction is not None else EMPTY_READ

        if (not rna_read[1]) or (paired_end and (not rna_read2[1])):
            # Read 1 is empty or read 2 is empty (if paired_end);
            # empty reads cause issues with the STAR aligner, so eliminate them here
            continue

        if bc_read != EMPTY_READ:
            # Reverse complement the barcode if necessary
            if barcode_rc:
                bc_read = (bc_read[0], tk_seq.get_rev_comp(bc_read[1]),
                           bc_read[2][::-1])
            # Track the barcode count distribution
            bc_counter.count(*bc_read)

        # Calculate metrics on raw sequences
        reporter.raw_fastq_cb(rna_read, rna_read2, bc_read, si_read, umi_read,
                              lib_idx, skip_metrics=args.skip_metrics)

        # Construct new fastq headers
        fastq_header1 = AugmentedFastqHeader(rna_read[0])
        fastq_header1.set_tag(tk_constants.SAMPLE_INDEX_TAG, si_read[1])
        fastq_header1.set_tag(tk_constants.SAMPLE_INDEX_QUAL_TAG, si_read[2])
        fastq_header1.set_tag(cr_constants.RAW_BARCODE_TAG, bc_read[1])
        fastq_header1.set_tag(cr_constants.RAW_BARCODE_QUAL_TAG, bc_read[2])
        fastq_header1.set_tag(cr_constants.RAW_UMI_TAG, umi_read[1])
        fastq_header1.set_tag(cr_constants.UMI_QUAL_TAG, umi_read[2])

        feat_raw_bc = None
        feat_proc_bc = None
        feat_qual = None
        feat_ids = None

        if feature_extraction:
            if feature_extraction.barcode:
                feat_raw_bc = feature_extraction.barcode
                feat_qual = feature_extraction.qual

            if len(feature_extraction.ids) > 0:
                feat_proc_bc = feature_extraction.barcode
                feat_ids = ';'.join(feature_extraction.ids)

                # If hit a single feature ID, count its frequency
                if len(feature_extraction.ids) == 1:
                    feature_counts[feature_extraction.indices[0]] += 1

        if feat_raw_bc:
            fastq_header1.set_tag(cr_constants.RAW_FEATURE_BARCODE_TAG, feat_raw_bc)
            fastq_header1.set_tag(cr_constants.FEATURE_BARCODE_QUAL_TAG, feat_qual)
        if feat_ids:
            fastq_header1.set_tag(cr_constants.PROCESSED_FEATURE_BARCODE_TAG,
                                  feat_proc_bc)
            fastq_header1.set_tag(cr_constants.FEATURE_IDS_TAG, feat_ids)

        if args.augment_fastq:
            read1_writer.write((fastq_header1.to_string(), rna_read[1], rna_read[2]))
        else:
            read1_writer.write((rna_read[0], rna_read[1], rna_read[2]))
            tag_writer.write((fastq_header1.to_string(), '', ''))

        if paired_end:
            fastq_header2 = AugmentedFastqHeader(rna_read2[0])
            fastq_header2.set_tag(tk_constants.SAMPLE_INDEX_TAG, si_read[1])
            fastq_header2.set_tag(tk_constants.SAMPLE_INDEX_QUAL_TAG, si_read[2])
            fastq_header2.set_tag(cr_constants.RAW_BARCODE_TAG, bc_read[1])
            fastq_header2.set_tag(cr_constants.RAW_BARCODE_QUAL_TAG, bc_read[2])
            fastq_header2.set_tag(cr_constants.RAW_UMI_TAG, umi_read[1])
            fastq_header2.set_tag(cr_constants.UMI_QUAL_TAG, umi_read[2])

            if feat_raw_bc:
                fastq_header2.set_tag(cr_constants.RAW_FEATURE_BARCODE_TAG,
                                      feat_raw_bc)
                fastq_header2.set_tag(cr_constants.FEATURE_BARCODE_QUAL_TAG,
                                      feat_qual)
            if feat_ids:
                fastq_header2.set_tag(cr_constants.PROCESSED_FEATURE_BARCODE_TAG,
                                      feat_proc_bc)
                fastq_header2.set_tag(cr_constants.FEATURE_IDS_TAG, feat_ids)

            if args.augment_fastq:
                read2_writer.write((fastq_header2.to_string(),
                                    rna_read2[1], rna_read2[2]))
            else:
                read2_writer.write((rna_read2[0], rna_read2[1], rna_read2[2]))

    reporter.extract_reads_finalize()

    # Close input and output files.
    rna_reads.close()
    if paired_end:
        rna_read2s.close()
    bc_reads.close()
    si_reads.close()
    umi_reads.close()

    read1_writer.close()
    if paired_end:
        read2_writer.close()
    if not args.augment_fastq:
        tag_writer.close()
    bc_counter.close()

    # Write feature BC read counts
    with open(outs.feature_counts, 'w') as f:
        json.dump(tk_safe_json.json_sanitize(list(feature_counts)), f)

    # Set stage output parameters.
    if len(read1_writer.file_paths) > 0:
        outs.reads = read1_writer.get_out_paths()

        if paired_end:
            outs.read2s = read2_writer.get_out_paths(len(outs.reads))
        else:
            outs.read2s = []

        if args.augment_fastq:
            outs.tags = []
        else:
            outs.tags = tag_writer.get_out_paths(len(outs.tags))

        libraries = args.library_info
        library = [li for li in libraries
                   if li['library_id'] == args.library_id][0]

        outs.gem_groups = [library['gem_group']] * len(outs.reads)
        outs.library_types = [library['library_type']] * len(outs.reads)
        outs.library_ids = [library['library_id']] * len(outs.reads)
        outs.read_groups = [args.read_group] * len(outs.reads)
    else:
        outs.reads = []
        outs.read2s = []
        outs.tags = []
        outs.gem_groups = []
        outs.library_types = []
        outs.library_ids = []
        outs.read_groups = []

    assert len(outs.gem_groups) == len(outs.reads)
    assert args.augment_fastq or len(outs.tags) == len(outs.reads)
    if paired_end:
        assert len(outs.reads) == len(outs.read2s)

    # this is the first reporter stage, so store the pipeline metadata
    reporter.store_pipeline_metadata(martian.get_pipelines_version())

    reporter.save(outs.chunked_reporter)

def main(args, outs): random.seed(0) paired_end = cr_chem.is_paired_end(args.chemistry_def) # Use the chemistry to get the locations of various sequences rna_read_def = cr_chem.get_rna_read_def(args.chemistry_def) rna_read2_def = cr_chem.get_rna_read2_def(args.chemistry_def) bc_read_def = cr_chem.get_barcode_read_def(args.chemistry_def) si_read_def = cr_chem.get_si_read_def(args.chemistry_def) umi_read_def = cr_chem.get_umi_read_def(args.chemistry_def) read_defs = [ rna_read_def, rna_read2_def, bc_read_def, si_read_def, umi_read_def ] read_tags = [ None, None, (cr_constants.RAW_BARCODE_TAG, cr_constants.RAW_BARCODE_QUAL_TAG), (tk_constants.SAMPLE_INDEX_TAG, tk_constants.SAMPLE_INDEX_QUAL_TAG), (cr_constants.RAW_UMI_TAG, cr_constants.UMI_QUAL_TAG), ] # Determine which trimmed sequences need to be retained trim_defs = compute_trim_defs( read_defs, read_tags, args.chemistry_def.get('retain_trimmed_suffix_read')) outs.bam_comments = sorted( set([td.bam_to_fastq for td in trim_defs.itervalues()])) gem_groups = [chunk['gem_group'] for chunk in args.chunks] reporter = cr_report.Reporter( umi_length=cr_chem.get_umi_length(args.chemistry_def), primers=cr_utils.get_primers_from_dicts(args.primers), gem_groups=gem_groups) # Determine if barcode sequences need to be reverse complemented. bc_check_rc = FastqReader(args.read_chunks, bc_read_def, args.reads_interleaved, None) barcode_whitelist = cr_utils.load_barcode_whitelist(args.barcode_whitelist) barcode_rc = infer_barcode_reverse_complement(barcode_whitelist, bc_check_rc.in_iter) bc_check_rc.close() # Determine which read_iters need to retain trimmed sequence # (only one per read-type e.g., one per R1, one per R2, etc.) read_types_with_trim_def = set() rna_read_trim_defs = None rna_read2_trim_defs = None bc_read_trim_defs = None si_read_trim_defs = None umi_read_trim_defs = None if rna_read_def.read_type not in read_types_with_trim_def: rna_read_trim_defs = trim_defs read_types_with_trim_def.add(rna_read_def.read_type) if rna_read2_def.read_type not in read_types_with_trim_def: rna_read2_trim_defs = trim_defs read_types_with_trim_def.add(rna_read2_def.read_type) if bc_read_def.read_type not in read_types_with_trim_def: bc_read_trim_defs = trim_defs read_types_with_trim_def.add(bc_read_def.read_type) if si_read_def.read_type not in read_types_with_trim_def: si_read_trim_defs = trim_defs read_types_with_trim_def.add(si_read_def.read_type) if umi_read_def.read_type not in read_types_with_trim_def: umi_read_trim_defs = trim_defs read_types_with_trim_def.add(umi_read_def.read_type) # Setup read iterators. 
    # Set up read iterators.
    rna_reads = FastqReader(args.read_chunks, rna_read_def, args.reads_interleaved, rna_read_trim_defs)
    rna_read2s = FastqReader(args.read_chunks, rna_read2_def, args.reads_interleaved, rna_read2_trim_defs)
    bc_reads = FastqReader(args.read_chunks, bc_read_def, args.reads_interleaved, bc_read_trim_defs)
    si_reads = FastqReader(args.read_chunks, si_read_def, args.reads_interleaved, si_read_trim_defs)

    if cr_chem.has_umis(args.chemistry_def):
        umi_reads = FastqReader(args.read_chunks, umi_read_def, args.reads_interleaved, umi_read_trim_defs)
    else:
        umi_reads = FastqReader(None, None, False, None)

    fastq_readers = (rna_reads, rna_read2s, bc_reads, si_reads, umi_reads)

    # Compute the trim order of the readers; this ensures stability in the
    # ordering in which trimmed sequence is added to the TRIMMED_SEQ tags
    trim_order = list(np.argsort([
        reader.read_def.read_type for reader in fastq_readers
        if reader.read_def is not None
    ]))

    read1_writer = ChunkedFastqWriter(outs.reads, args.reads_per_file)
    if paired_end:
        read2_writer = ChunkedFastqWriter(outs.read2s, args.reads_per_file)

    bc_counter = BarcodeCounter(args.barcode_whitelist, outs.barcode_counts)

    all_read_iter = itertools.izip_longest(*[reader.in_iter for reader in fastq_readers])

    # BAM file to write auxiliary data to (data that won't fit in a FASTQ header / QNAME)
    trimmed_seq_writer = ChunkedBamWriter(outs.trimmed_seqs, args.reads_per_file)

    EMPTY_READ = (None, '', '')

    reporter.extract_reads_init()

    for extractions in itertools.islice(all_read_iter, args.initial_reads):
        # Downsample
        if random.random() > args.subsample_rate:
            continue

        rna_extraction, rna2_extraction, bc_extraction, si_extraction, umi_extraction = extractions

        rna_read = rna_extraction.read if rna_extraction is not None else EMPTY_READ
        rna_read2 = rna2_extraction.read if rna2_extraction is not None else EMPTY_READ
        bc_read = bc_extraction.read if bc_extraction is not None else EMPTY_READ
        si_read = si_extraction.read if si_extraction is not None else EMPTY_READ
        umi_read = umi_extraction.read if umi_extraction is not None else EMPTY_READ

        # Extra trimming for internal purposes
        if args.rna_read_length is not None:
            rna_read = (rna_read[0],
                        rna_read[1][0:args.rna_read_length],
                        rna_read[2][0:args.rna_read_length])

        # Accumulate trimmed sequence; ordering is by read type (I1, I2, R1, R2)
        # to ensure stability
        trimmed_seq = ''
        trimmed_qual = ''
        for i in trim_order:
            if extractions[i] is None:
                continue
            trimmed_seq += extractions[i].trimmed_seq
            trimmed_qual += extractions[i].trimmed_qual

        if bc_read != EMPTY_READ:
            # Reverse complement the barcode if necessary
            if barcode_rc:
                bc_read = (bc_read[0], tk_seq.get_rev_comp(bc_read[1]), bc_read[2][::-1])
            # Track the barcode count distribution
            bc_counter.count(*bc_read)

        # Calculate metrics on raw sequences
        reporter.raw_fastq_cb(rna_read, rna_read2, bc_read, si_read, umi_read,
                              args.gem_group, skip_metrics=args.skip_metrics)

        # Construct new FASTQ headers
        fastq_header1 = AugmentedFastqHeader(rna_read[0])
        fastq_header1.set_tag(tk_constants.SAMPLE_INDEX_TAG, si_read[1])
        fastq_header1.set_tag(tk_constants.SAMPLE_INDEX_QUAL_TAG, si_read[2])
        fastq_header1.set_tag(cr_constants.RAW_BARCODE_TAG, bc_read[1])
        fastq_header1.set_tag(cr_constants.RAW_BARCODE_QUAL_TAG, bc_read[2])
        fastq_header1.set_tag(cr_constants.RAW_UMI_TAG, umi_read[1])
        fastq_header1.set_tag(cr_constants.UMI_QUAL_TAG, umi_read[2])
        fastq_header_str1 = fastq_header1.to_string()

        read1_writer.write((fastq_header_str1, rna_read[1], rna_read[2]))

        # Write trimmed sequence data to a separate, unaligned BAM file.
        # Note: we assume there is only one trimmed sequence per read pair.
        trimmed_seq_data = pysam.AlignedSegment()
        trimmed_seq_data.query_name = fastq_header_str1.split(AugmentedFastqHeader.WORD_SEP)[0]
        trimmed_seq_data.flag = 4  # unmapped
        trimmed_seq_data.seq = trimmed_seq
        trimmed_seq_data.qual = trimmed_qual
        trimmed_seq_writer.write(trimmed_seq_data)

        if paired_end:
            fastq_header2 = AugmentedFastqHeader(rna_read2[0])
            fastq_header2.set_tag(tk_constants.SAMPLE_INDEX_TAG, si_read[1])
            fastq_header2.set_tag(tk_constants.SAMPLE_INDEX_QUAL_TAG, si_read[2])
            fastq_header2.set_tag(cr_constants.RAW_BARCODE_TAG, bc_read[1])
            fastq_header2.set_tag(cr_constants.RAW_BARCODE_QUAL_TAG, bc_read[2])
            fastq_header2.set_tag(cr_constants.RAW_UMI_TAG, umi_read[1])
            fastq_header2.set_tag(cr_constants.UMI_QUAL_TAG, umi_read[2])
            read2_writer.write((fastq_header2.to_string(), rna_read2[1], rna_read2[2]))

    reporter.extract_reads_finalize()

    # Close input and output files.
    rna_reads.close()
    if paired_end:
        rna_read2s.close()
    bc_reads.close()
    si_reads.close()
    umi_reads.close()

    read1_writer.close()
    if paired_end:
        read2_writer.close()
    bc_counter.close()
    trimmed_seq_writer.close()

    # Set stage output parameters.
    if len(read1_writer.file_paths) > 0:
        outs.reads = read1_writer.get_out_paths()

        if paired_end:
            outs.read2s = read2_writer.get_out_paths(len(outs.reads))
        else:
            outs.read2s = []

        outs.gem_groups = [args.gem_group] * len(outs.reads)
        outs.read_groups = [args.read_group] * len(outs.reads)
        outs.trimmed_seqs = trimmed_seq_writer.get_out_paths()
    else:
        outs.reads = []
        outs.read2s = []
        outs.gem_groups = []
        outs.read_groups = []
        outs.trimmed_seqs = []

    assert len(outs.gem_groups) == len(outs.reads)
    if paired_end:
        assert len(outs.reads) == len(outs.read2s)
    assert len(outs.trimmed_seqs) == len(outs.reads)

    # This is the first reporter stage, so store the pipeline metadata
    reporter.store_pipeline_metadata(martian.get_pipelines_version())
    reporter.save(outs.chunked_reporter)
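# Illustrative sketch, not pipeline code: infer_barcode_reverse_complement
# (used above) has to decide whether the barcode read matches the whitelist
# as sequenced or only after reverse complementing. A hypothetical version of
# that decision, assuming reads are (name, seq, qual) tuples like the
# FastqReader extractions above:
def infer_rc_sketch(whitelist, reads, sample_size=1000):
    """Return True if barcodes hit the whitelist more often reverse-complemented."""
    comp = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'N': 'N'}
    fwd_hits = rc_hits = 0
    for _name, seq, _qual in itertools.islice(reads, sample_size):
        if seq in whitelist:
            fwd_hits += 1
        if ''.join(comp.get(b, 'N') for b in reversed(seq)) in whitelist:
            rc_hits += 1
    return rc_hits > fwd_hits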
def main_mark_duplicates(args, outs):
    """Mark exact duplicate reads in the BAM file.

    Duplicates have the same read1 start site and read2 start site.
    """
    lane_coord_sys = tk_lane.LaneCoordinateSystem.from_dict(args.lane_map)

    args.coerce_strings()
    outs.coerce_strings()

    bam_in = tk_bam.create_bam_infile(args.input)
    bam_out, tids = tk_bam.create_bam_outfile(
        outs.output, None, None,
        template=bam_in,
        pg=tk_bam.make_pg_header(martian.get_pipelines_version(), "mark_duplicates"))

    # Determine whether the BAM has 10x barcodes
    bam_in.reset()
    has_barcodes = [tk_io.read_has_barcode(x) for x in itertools.islice(bam_in, 1000)]
    have_barcodes = (float(sum(has_barcodes)) / len(has_barcodes)) > 0.1

    # We do the subsampling to achieve the desired coverage on _perfect reads_,
    # as defined by tenkit.read_filter.stringent_read_filter. This is tallied
    # in ATTACH_BCS and passed in via the perfect_read_count argument. We will
    # fail if it's not supplied.
    total_coverage = args.estimated_coverage

    # Set a fixed random seed to eliminate noise in metrics
    random.seed(0)

    sampling_rates = []
    for sample_cov in DUPLICATE_SUBSAMPLE_COVERAGES:
        rate = tk_stats.robust_divide(float(sample_cov), total_coverage)
        sampling_rates.append((rate, sample_cov))

    # All-read duplicate marking -- these dup decisions are written to bam_out.
    # The output BAM has barcode-aware dup marking if barcodes are available.
    # Ensure the summary key indicates what kind of dup marking was actually performed.
    if have_barcodes:
        no_filter_dups_bcs = DupSummary(False, 1.0, True, "no_filter_full_use_bcs",
                                        lane_coord_sys, bam_out)
        no_filter_dups_no_bcs = DupSummary(False, 1.0, False, "no_filter_full_ignore_bcs",
                                           lane_coord_sys, write_to_stdout=False)
    else:
        no_filter_dups_bcs = DupSummary(False, 1.0, True, "no_filter_full_use_bcs",
                                        lane_coord_sys)
        no_filter_dups_no_bcs = DupSummary(False, 1.0, False, "no_filter_full_ignore_bcs",
                                           lane_coord_sys, bam_out, write_to_stdout=False)

    # Dup marking on all perfect reads
    full_dups_bcs = DupSummary(True, 1.0, True, "full_use_bcs", lane_coord_sys)
    full_dups_no_bcs = DupSummary(True, 1.0, False, "full_ignore_bcs", lane_coord_sys)

    # Make a battery of duplicate summaries at different coverages, with and
    # without barcode splitting
    split_options = [True, False]

    dup_sums = [full_dups_bcs, full_dups_no_bcs, no_filter_dups_bcs, no_filter_dups_no_bcs]

    # Duplicate marking on perfect reads subsampled to the requested coverage
    for (sr, cov) in sampling_rates:
        for split_bc in split_options:
            description = "cov_" + str(cov) + ('_use_bcs' if split_bc else '_ignore_bcs')
            dup_sums.append(DupSummary(True, sr, split_bc, description, lane_coord_sys))

    # Now broadcast the selected reads to the summarizers.
    # We can't do the points that require a sample_rate > 1.0, so skip those.
    # If we don't have barcodes, don't run the set that is split by barcode.
    consumers = [
        x.read_consumer() for x in dup_sums
        if x.sample_rate <= 1.0 and ((not x.split_bcs) or have_barcodes)
    ]

    source = tk_bam.read_bam_chunk(bam_in, (args.chunk_start, args.chunk_end))
    broadcast(source, consumers)

    # Close the output BAM. Note: the indexing happens in join.
    bam_out.close()

    # Package up the summaries
    dup_results = {}
    for x in dup_sums:
        (dups, optical_dups, diff_dups) = x.result
        desc = x.description
        dup_results[desc] = dups
        dup_results["optical_" + desc] = optical_dups
        dup_results["diffusion_" + desc] = diff_dups

    if outs.duplicate_summary:
        with open(outs.duplicate_summary, 'w') as f:
            json.dump(dup_results, f)
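# Illustrative sketch, not pipeline code: broadcast(source, consumers) above
# fans a single stream of BAM records out to every DupSummary at once, with
# each read_consumer() acting as a coroutine. A minimal, hypothetical version
# of that pattern:
def consumer_sketch(label, results):
    """Coroutine that tallies the items sent into it until closed."""
    count = 0
    try:
        while True:
            _item = (yield)
            count += 1
    except GeneratorExit:
        results[label] = count

def broadcast_sketch(source, consumers):
    """Send every item from source to every consumer, then close them all."""
    for item in source:
        for consumer in consumers:
            consumer.send(item)
    for consumer in consumers:
        consumer.close()

# Usage:
#   results = {}
#   cons = [consumer_sketch('a', results), consumer_sketch('b', results)]
#   for c in cons:
#       next(c)  # prime each coroutine to its first yield
#   broadcast_sketch(iter(range(10)), cons)  # results == {'a': 10, 'b': 10}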
def join(args, outs, chunk_defs, chunk_outs):
    if args.filtered_peak_bc_matrix is None or not args.reduction_summary['h5'].keys():
        outs.analysis = None
        outs.analysis_csv = None
        outs.feature_bc_matrix = None
        return

    # Build the joint peak + TF feature-barcode matrix (single-genome only),
    # combining peak annotations for single-genome analysis
    peak_annotation = None
    if args.peak_annotation:
        annotations = pd.read_csv(args.peak_annotation, sep='\t')[['gene', 'peak_type']]
        annotations = annotations.replace(np.nan, '', regex=True)
        annotations = annotations.values.astype(str).tolist()
        peak_annotation = []
        for row in annotations:
            genes = row[0].split(";")
            annotation = row[1].split(";")
            promoter = []
            nearby_gene = []
            assert len(annotation) == len(genes)
            for en, kind in enumerate(annotation):
                if kind == 'promoter':
                    promoter += [genes[en]]
                nearby_gene += [genes[en]]
            peak_annotation += [[';'.join(promoter), ';'.join(nearby_gene)]]

    fbm = cr_matrix.CountMatrix.load_h5_file(args.filtered_peak_bc_matrix)
    mapping = None
    if args.filtered_tf_bc_matrix:
        # Combine the matrices; ensure the barcodes are the same and ordered the same way
        tf_matrix = cr_matrix.CountMatrix.load_h5_file(args.filtered_tf_bc_matrix)
        assert (fbm.bcs == tf_matrix.bcs).all()
        if peak_annotation is not None:
            fbm.feature_ref = FeatureReference.addtags(
                fbm.feature_ref, ['promoter', 'nearby_gene'], peak_annotation)
            tf_matrix.feature_ref = FeatureReference.addtags(
                tf_matrix.feature_ref, ['promoter', 'nearby_gene'])
        combined_feature_defs = FeatureReference.join(fbm.feature_ref, tf_matrix.feature_ref)
        combined_matrix = vstack([fbm.m, tf_matrix.m])
        # Explicit map linking rows in diffexp to rows of the combined matrix
        mapping = np.zeros((tf_matrix.features_dim, 2))
        for x in range(tf_matrix.features_dim):
            mapping[x, 0] = x
            mapping[x, 1] = x + fbm.features_dim
        fbm = cr_matrix.CountMatrix(combined_feature_defs, fbm.bcs, combined_matrix)

    fbm.save_h5_file(outs.feature_bc_matrix, sw_version=martian.get_pipelines_version())

    # PyTables doesn't support variable-length strings, so use h5py first
    with h5.File(outs.feature_bc_matrix, 'r') as matrix, \
            h5.File(outs.analysis, 'w') as out:
        # TODO: copy the first group; fix me when we have a key
        name = matrix.keys()[0]
        matrix.copy(matrix[name], out, name='matrix')

    factorizations = args.reduction_summary['h5'].keys()
    USE_FACTORIZATION = (DEFAULT_FACTORIZATION
                         if DEFAULT_FACTORIZATION in factorizations
                         else factorizations[0])
    with tables.open_file(outs.analysis, 'a') as out:
        for summary, key in zip(
                [args.reduction_summary, args.clustering_summary,
                 args.tsne_summary, args.enrichment_analysis_summary],
                [USE_FACTORIZATION, 'clustering', 'tsne', 'enrichment']):
            if summary is None or not summary:
                continue
            print(key, summary)
            data_h5 = summary['h5'][USE_FACTORIZATION]
            with tables.open_file(data_h5, 'r') as indata:
                indata.copy_children(indata.root, out.root, recursive=True)
            dirname = os.path.join(outs.analysis_csv, key)
            cr_io.copytree(summary['csv'][USE_FACTORIZATION], dirname)

    # If mapping is present (single-genome case), so is the coloring matrix
    if mapping is not None:
        args.coerce_strings()
        tf_propZ_matrix = np.loadtxt(args.tf_propZ_matrix)
        with h5.File(outs.analysis, 'a') as out:
            out.create_dataset('feature_DE_map', data=mapping)
            out.create_dataset('diffexp_coloring_matrix', data=tf_propZ_matrix)
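# Illustrative sketch, not pipeline code: the join above appends the TF rows
# under the peak rows with a sparse vstack and keeps an explicit row map so
# diffexp results on the TF matrix can be traced back into the combined
# matrix. The helper below is hypothetical but shows the same bookkeeping:
def stack_with_mapping(peak_m, tf_m):
    """Return (combined, mapping) where mapping[i] = (tf_row, combined_row)."""
    import numpy as np
    import scipy.sparse as sp
    combined = sp.vstack([peak_m, tf_m]).tocsr()
    n_peaks = peak_m.shape[0]
    mapping = np.array([(i, i + n_peaks) for i in range(tf_m.shape[0])])
    return combined, mapping

# e.g. with 3 peak rows and 2 TF rows over the same 5 barcodes:
#   stack_with_mapping(sp.eye(3, 5), sp.eye(2, 5))[1]  ->  [[0, 3], [1, 4]]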
def join(args, outs, chunk_defs, chunk_outs):
    pred_to_match, _, pred_df, true_df, min_qv = merge_predictions(chunk_outs)

    # Change the TRANS type to DISTAL. This change only affects the type
    # reported, not the names of the metrics.
    new_info = []
    for _, row in pred_df.iterrows():
        sv_type = tk_sv_io.get_sv_type(row.info)
        if sv_type == 'TRANS':
            sv_type = 'DISTAL'
        new_info.append(tk_sv_io.update_info(row.info, ['TYPE'], [sv_type]))
    pred_df['info'] = new_info

    if true_df is not None:
        true_df.to_csv(outs.feasible_gt, index=False, header=True, sep='\t', na_rep='NaN')

    # Write BEDPE/VCF outputs
    tk_sv_io.write_sv_df_to_bedpe(pred_df, outs.sv_candidates)
    source_str = '10X/pipelines/stages/analyze_sv_calls {}'.format(martian.get_pipelines_version())
    sample_id = 'sample' if args.sample_id is None else args.sample_id

    # Note: trim the suffix explicitly rather than with str.strip('.gz'),
    # which strips any of the characters '.', 'g', 'z' from both ends.
    svs_vcf = outs.svs[:-len('.gz')] if outs.svs.endswith('.gz') else outs.svs
    tk_sv_io.bedpe_to_vcf(outs.sv_candidates, svs_vcf, sample_id, source_str,
                          args.reference_path)
    # This will sort and gzip
    tk_sv_io.index_sv_vcf(svs_vcf)
    outs.svs_index = outs.svs + '.tbi'
    # Delete the non-gzipped file
    os.remove(svs_vcf)

    if not pred_df.empty:
        call_df = pred_df[np.logical_or(pred_df['filters'] == '.',
                                        pred_df['filters'] == "PASS")]
    else:
        call_df = None
    tk_sv_io.write_sv_df_to_bedpe(call_df, outs.sv_calls)

    # Annotate each call with the matching ground-truth SVs. The resulting
    # dataframe might have multiple rows for the same call if there were
    # multiple matching ground-truth SVs.
    martian.log_info("merging calls and gt")
    if not pred_df.empty:
        pred_df = merge_calls_and_gt(pred_df, true_df, pred_to_match)

    martian.log_info("writing call_tsv")
    pred_df.to_csv(outs.call_tsv, index=False, header=True, sep='\t', na_rep='NaN')

    pred_df = pred_df[np.logical_not(pd.isnull(pred_df['name']))]

    max_dists = sorted(np.array(args.detect_dists))

    gt_sv_types = get_all_sv_types(true_df)
    call_sv_types = get_all_sv_types(pred_df)

    if true_df is not None:
        # Use the default MAX_PPV_TIER unless this is greater than the maximum
        # tier present in the data.
        max_ppv_tier = min(MAX_PPV_TIER, np.max(true_df.tier))
        # Use the default unless this is smaller than the minimum tier present
        # in the data.
        max_sens_tier = max(MAX_SENS_TIER, np.min(true_df.tier))
    else:
        max_ppv_tier = 1
        max_sens_tier = 1

    tiers = [max_ppv_tier, max_sens_tier]

    # All combinations of filters in the ground truth and the call set
    if args.targets is not None and args.target_dists is not None:
        target_dists = list(sorted(np.array(args.target_dists, dtype=np.float)))
        target_dists.append(float('NaN'))
    else:
        target_dists = [float('NaN')]

    combs = product([0, 1, 2, None], target_dists, gt_sv_types, tiers,
                    [True, False], call_sv_types, max_dists)

    metrics = defaultdict(list)

    gt_filters = ['genic_breaks', 'target_dist', 'gt_sv_type', 'tier']
    call_filters = ['call_filtered', 'call_sv_type', 'match_dist']

    for (genic_breaks, tdist, gt_sv_type, tier, is_filtered, call_sv_type, dist) in combs:
        if gt_sv_type != 'NA' and call_sv_type != 'NA' and gt_sv_type != call_sv_type:
            continue

        metrics['genic_breaks'].append(genic_breaks)
        metrics['target_dist'].append(tdist)
        metrics['gt_sv_type'].append(gt_sv_type)
        metrics['tier'].append(tier)
        metrics['call_filtered'].append(is_filtered)
        metrics['call_sv_type'].append(call_sv_type)
        metrics['match_dist'].append(dist)

        if true_df is None:
            sel_true_df = None
        else:
            sel_true_df = true_df
            if gt_sv_type != 'NA':
                sel_true_df = sel_true_df[sel_true_df.sv_type == gt_sv_type]
            if not np.isnan(tdist):
                sel_true_df = sel_true_df[sel_true_df.targ_dist <= tdist]
            sel_true_df = sel_true_df[sel_true_df.tier <= tier]
            # Restrict to genic or non-genic, or take everything if this is None.
            if genic_breaks is not None:
                sel_true_df = sel_true_df[sel_true_df.genic_breaks == genic_breaks]

            if len(sel_true_df) == 0:
                sel_true_df = None

        sel_pred_df = pred_df
        if is_filtered and not pred_df.empty:
            sel_pred_df = sel_pred_df[(sel_pred_df.filters == '.') |
                                      (sel_pred_df.filters == 'PASS')]
        if call_sv_type != 'NA' and not pred_df.empty:
            sel_pred_df = sel_pred_df[sel_pred_df.sv_type == call_sv_type]
        if not pred_df.empty and (args.min_rel_overlap is None or args.min_rel_overlap == 0):
            # Do not apply this filter if the matching is done based on overlap.
            sel_pred_df = sel_pred_df[np.logical_or(np.isnan(sel_pred_df.match_dist),
                                                    sel_pred_df.match_dist <= dist)]

        add_metrics(sel_pred_df, sel_true_df, metrics)

    column_names = gt_filters
    column_names.extend(call_filters)
    other_names = set(metrics.keys()).difference(set(column_names))
    column_names.extend(other_names)

    metric_df = pd.DataFrame(metrics)
    metric_df = metric_df[column_names]

    martian.log_info("writing summary tsv")
    metric_df.to_csv(outs.summary_tsv, index=False, header=True, sep='\t', na_rep='NaN')

    short_metrics = get_short_metrics(metric_df, other_names, max_ppv_tier,
                                      max_sens_tier, args)

    if args.call_summary is not None:
        with open(args.call_summary, 'r') as in_summary_fn:
            in_summary = json.load(in_summary_fn)
        for key, val in in_summary.iteritems():
            short_metrics[key] = val

    short_metrics['min_qv'] = min_qv

    with open(outs.summary, 'w') as out_file:
        out_file.write(tenkit.safe_json.safe_jsonify(short_metrics, pretty=True))
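# Illustrative sketch, not pipeline code: each add_metrics() call above
# reduces one (filtered calls, filtered truth) pair to summary numbers. The
# column names `matched` and `detected` below are hypothetical; the point is
# the sensitivity/PPV arithmetic over the selected frames:
def sens_ppv_sketch(sel_pred_df, sel_true_df):
    """Sensitivity = detected truth / all truth; PPV = matched calls / all calls."""
    if sel_pred_df is None or len(sel_pred_df) == 0:
        ppv = float('nan')
    else:
        ppv = float(sel_pred_df['matched'].sum()) / len(sel_pred_df)
    if sel_true_df is None or len(sel_true_df) == 0:
        sens = float('nan')
    else:
        sens = float(sel_true_df['detected'].sum()) / len(sel_true_df)
    return sens, ppv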