def main(args, outs):
    """Build raw and filtered outputs for one chunk of the molecule file.

    Opens the ``args.chunk_start`` / ``args.chunk_len`` slice of
    ``args.raw_molecules``, builds gene-barcode matrices from it, filters
    them to the barcodes listed in ``args.filtered_barcodes`` and writes:

    * the raw matrices (HDF5 and MEX) and the raw barcode summary,
    * the filtered matrices (HDF5 and MEX),
    * the filtered molecule file ``outs.filtered_molecules``,
    * the summary returned by ``write_filtered_molecules`` to
      ``outs.summary`` as JSON.

    Args:
        args: stage inputs; reads ``raw_molecules``, ``chunk_start``,
            ``chunk_len`` and ``filtered_barcodes``.
        outs: stage outputs; supplies the destination paths listed above.
    """
    # The input counter is held in a context manager so its handle is
    # released even when matrix construction or one of the writers raises
    # (previously it was never closed at all).
    with cr_mol_counter.MoleculeCounter.open(
            args.raw_molecules,
            'r',
            start=int(args.chunk_start),
            length=int(args.chunk_len)) as molecule_counter:
        filtered_bcs_per_genome = cr_utils.load_barcode_csv(
            args.filtered_barcodes)

        raw_matrices = cr_matrix.GeneBCMatrices.build_from_mol_counter(
            molecule_counter)
        filtered_matrices = raw_matrices.filter_barcodes(
            filtered_bcs_per_genome)

        raw_matrices.save_h5(outs.raw_matrices_h5)
        raw_matrices.save_mex(outs.raw_matrices_mex)
        raw_matrices.save_barcode_summary_h5(outs.barcode_summary_h5)

        filtered_matrices.save_h5(outs.filtered_matrices_h5)
        filtered_matrices.save_mex(outs.filtered_matrices_mex)

        genome_ids = molecule_counter.get_ref_column('genome_ids')

        with cr_mol_counter.MoleculeCounter.open(outs.filtered_molecules,
                                                 'w') as ctr_out:
            summary = write_filtered_molecules(molecule_counter, ctr_out,
                                               genome_ids,
                                               filtered_bcs_per_genome)

        with open(outs.summary, 'w') as f:
            tk_json.dump_numpy(summary, f, pretty=True)
def main(args, outs):
    """Subsample one chunk of the molecule info and save the resulting matrices.

    Does nothing when ``args.subsample_info`` carries no ``'subsample_rate'``
    (the placeholder chunk emitted when there is nothing to subsample).
    Otherwise builds subsampled raw matrices from the
    ``args.chunk_start`` / ``args.chunk_len`` slice of ``args.molecule_info``,
    filters them to ``args.filtered_barcodes``, records the subsampled
    duplication fraction in a reporter saved to ``outs.chunked_reporter``,
    and writes both matrices under ``outs.subsampled_matrices``.

    Args:
        args: stage inputs; reads ``subsample_info``, ``molecule_info``,
            ``chunk_start``, ``chunk_len`` and ``filtered_barcodes``.
        outs: stage outputs; ``chunked_reporter`` is written and
            ``subsampled_matrices`` is set to a dict of two HDF5 paths.
    """
    # Fixed seed; presumably consumed by the subsampling inside
    # build_from_mol_counter so that reruns are reproducible -- confirm.
    np.random.seed(0)

    subsample_info = args.subsample_info
    subsample_rate = subsample_info.get('subsample_rate')
    if subsample_rate is None:
        return

    # Use the counter as a context manager (as split() does) so the handle
    # is closed even if matrix construction or the reporter callback raises.
    with MoleculeCounter.open(args.molecule_info,
                              'r',
                              start=int(args.chunk_start),
                              length=int(args.chunk_len)) as mol_counter:
        # Subsample the matrices
        subsample_result = {}
        subsampled_raw_mats = cr_matrix.GeneBCMatrices.build_from_mol_counter(
            mol_counter,
            subsample_rate=subsample_rate,
            subsample_result=subsample_result)

        # Filter the subsampled matrices
        filtered_bcs_per_genome = cr_utils.load_barcode_csv(
            args.filtered_barcodes)
        subsampled_filt_mats = subsampled_raw_mats.filter_barcodes(
            filtered_bcs_per_genome)

        # Calculations for subsampled duplication rate
        reporter = cr_report.Reporter(
            genomes=map(str, mol_counter.get_ref_column('genome_ids')),
            subsample_types=cr_constants.ALL_SUBSAMPLE_TYPES,
            subsample_depths=subsample_info['all_target_rpc'])

        reporter.subsampled_duplication_frac_cb(
            subsampled_raw_mats,
            mol_counter,
            subsample_info['subsample_rate'],
            subsample_info['subsample_type'],
            subsample_info['target_rpc'],
            subsample_result['mapped_reads'],
        )

    reporter.save(outs.chunked_reporter)

    raw_path = martian.make_path('raw_matrices.h5')
    filtered_path = martian.make_path('filtered_matrices.h5')
    outs.subsampled_matrices = {
        'raw_matrices': raw_path,
        'filtered_matrices': filtered_path,
    }

    subsampled_raw_mats.save_h5(outs.subsampled_matrices['raw_matrices'])
    subsampled_filt_mats.save_h5(outs.subsampled_matrices['filtered_matrices'])
def main(args, outs):
    """Write the molecule info file for one barcode-sorted BAM chunk.

    Groups the reads of ``args.chunk_input`` by (gem group, barcode), counts
    reads per (UMI, library, feature) within each barcode, and appends the
    per-molecule columns to the ``MoleculeCounter`` opened at
    ``outs.output``. Per-library usable-read counts (reads counted for
    cell-associated barcodes) are stored as a metric before saving.

    Args:
        args: stage inputs; reads ``barcode_whitelist``, ``reference_path``,
            ``feature_reference``, ``chunk_input`` and ``filtered_barcodes``.
        outs: stage outputs; ``output`` is the molecule info path.
    """
    outs.coerce_strings()

    # Barcode whitelist and the position of every barcode within it
    whitelist = cr_utils.load_barcode_whitelist(args.barcode_whitelist)
    barcode_to_idx = OrderedDict(
        (seq, position) for position, seq in enumerate(whitelist))

    # Feature reference
    feature_ref = rna_feature_ref.from_transcriptome_and_csv(
        args.reference_path, args.feature_reference)

    # Library info comes from the BAM itself
    in_bam = tk_bam.create_bam_infile(args.chunk_input)
    library_info = rna_library.get_bam_library_info(in_bam)

    # Cell-associated barcodes: per genome, and as one combined set
    filtered_bcs_by_genome = cr_utils.load_barcode_csv(args.filtered_barcodes)
    filtered_bc_union = cr_utils.get_cell_associated_barcode_set(
        args.filtered_barcodes)

    barcode_info = MoleculeCounter.build_barcode_info(filtered_bcs_by_genome,
                                                      library_info, whitelist)

    # Output molecule info file
    mc = MoleculeCounter.open(outs.output,
                              mode='w',
                              feature_ref=feature_ref,
                              barcodes=whitelist,
                              library_info=library_info,
                              barcode_info=barcode_info)

    # One zeroed usable-reads counter per library, keyed by str(index)
    lib_metrics = {
        str(lib_idx): {cr_mol_counter.USABLE_READS_METRIC: 0}
        for lib_idx in xrange(len(library_info))
    }

    # Record read-counts per molecule. Note that UMIs are not contiguous
    # in the input because no sorting was done after UMI correction.
    prev_gem_group = None
    prev_barcode_idx = None

    grouped_reads = itertools.groupby(in_bam,
                                      key=cr_utils.barcode_sort_key_no_umi)
    for (gem_group, barcode_seq), reads_iter in grouped_reads:
        if barcode_seq is None:
            continue

        barcode_idx = barcode_to_idx[barcode_seq]

        # Assert expected sort order of input BAM
        assert gem_group >= prev_gem_group
        if gem_group == prev_gem_group:
            assert barcode_idx >= prev_barcode_idx

        formatted_barcode = cr_utils.format_barcode_seq(barcode_seq, gem_group)
        is_cell_barcode = formatted_barcode in filtered_bc_union

        # (umi, library index, feature index) -> number of reads
        counts = defaultdict(int)

        for read in reads_iter:
            # ignore read2 to avoid double-counting. the mapping + annotation
            # should be equivalent.
            if read.is_secondary or read.is_read2:
                continue
            if cr_utils.is_read_low_support_umi(read):
                continue
            if not cr_utils.is_read_conf_mapped_to_feature(read):
                continue

            umi_seq = cr_utils.get_read_umi(read)
            if umi_seq is None:
                continue

            umi_bits = MoleculeCounter.get_column_dtype('umi').itemsize * 8
            umi_int = MoleculeCounter.compress_umi_seq(umi_seq, umi_bits)

            feature_ids = cr_utils.get_read_gene_ids(read)
            assert len(feature_ids) == 1
            feature_int = feature_ref.id_map[feature_ids[0]].index

            library_idx = cr_utils.get_read_library_index(read)

            counts[(umi_int, library_idx, feature_int)] += 1

            if is_cell_barcode:
                usable = lib_metrics[str(library_idx)]
                usable[cr_mol_counter.USABLE_READS_METRIC] += 1

            # The sort-order watermark only advances once a read was counted.
            prev_gem_group = gem_group
            prev_barcode_idx = barcode_idx

        # Record data for this barcode
        n_rows = len(counts)

        gg_int = MoleculeCounter.get_column_dtype('gem_group').type(gem_group)
        mc.append_column('gem_group', np.repeat(gg_int, n_rows))

        bc_int = MoleculeCounter.get_column_dtype('barcode_idx').type(
            barcode_idx)
        mc.append_column('barcode_idx', np.repeat(bc_int, n_rows))

        feature_ints = np.fromiter(
            (key[2] for key in counts.iterkeys()),
            dtype=MoleculeCounter.get_column_dtype('feature_idx'),
            count=n_rows)

        # Sort by feature for fast matrix construction; the same permutation
        # is applied to every other per-molecule column below.
        order = np.argsort(feature_ints)
        mc.append_column('feature_idx', feature_ints[order])
        del feature_ints

        # Remaining key-derived columns: (column name, position in the key)
        for column, key_pos in (('library_idx', 1), ('umi', 0)):
            values = np.fromiter(
                (key[key_pos] for key in counts.iterkeys()),
                dtype=MoleculeCounter.get_column_dtype(column),
                count=n_rows)[order]
            mc.append_column(column, values)
            del values

        count_ints = np.fromiter(
            counts.itervalues(),
            dtype=MoleculeCounter.get_column_dtype('count'),
            count=n_rows)[order]
        mc.append_column('count', count_ints)
        del count_ints

    in_bam.close()

    mc.set_metric(cr_mol_counter.LIBRARIES_METRIC, dict(lib_metrics))

    mc.save()
def split(args):
    """Plan the subsampling chunks for the molecule info file.

    For every subsample type and target reads-per-cell depth whose
    subsample rate does not exceed 1.0, the molecule info is cut into
    fixed-length slices and one chunk is emitted per (subsampling, slice)
    pair. When there are no cell barcodes, or no subsampling qualifies, a
    single placeholder chunk with an empty ``subsample_info`` is returned.

    Args:
        args: stage inputs; reads ``filtered_barcodes`` and
            ``molecule_info``.

    Returns:
        dict with a ``'chunks'`` list and, except on the zero-cell early
        return, a ``'join'`` dict holding the join memory request.
    """
    # Count the distinct cell barcodes across all genomes
    barcodes_by_genome = cr_utils.load_barcode_csv(args.filtered_barcodes)
    cell_barcodes = set()
    for genome_barcodes in barcodes_by_genome.itervalues():
        cell_barcodes.update(genome_barcodes)
    n_cells = len(cell_barcodes)

    if n_cells == 0:
        # NOTE(review): chunk_start/chunk_len are ints here but strings in
        # every other chunk built below -- confirm which the stage expects.
        placeholder = {'chunk_start': 0, 'chunk_len': 0, 'subsample_info': {}}
        return {'chunks': [placeholder]}

    # Get required info from the mol info
    with MoleculeCounter.open(args.molecule_info, 'r') as mol_counter:
        n_entries = mol_counter.nrows()
        barcode_whitelist = mol_counter.get_barcode_whitelist()
        gem_groups = mol_counter.get_gem_groups()
        raw_reads = mol_counter.get_total_raw_reads()
        raw_rpc = tk_stats.robust_divide(raw_reads, n_cells)
        mapped_reads = mol_counter.get_total_conf_mapped_filtered_bc_reads()

    mapped_read_frac = tk_stats.robust_divide(mapped_reads, raw_reads)

    # Extra target depths at 10%, 20%, ... 100% of the raw reads per cell
    decile_rpcs = []
    if raw_reads > 0:
        decile_rpcs = [
            round(fraction * raw_rpc)
            for fraction in np.arange(0.1, 1.1, 0.1)
        ]

    # All target depths
    target_rpcs = cr_constants.SUBSAMPLE_READS_PER_CELL + decile_rpcs

    # Raw-read targets are scaled by the mapped fraction; mapped-read
    # targets are used as they are.
    type_multipliers = [
        (cr_constants.RAW_SUBSAMPLE_TYPE, mapped_read_frac),
        (cr_constants.MAPPED_SUBSAMPLE_TYPE, 1.0),
    ]

    subsamplings = []  # track subsample info definitions
    for subsample_type, rpc_multiplier in type_multipliers:
        for target_rpc in target_rpcs:
            target_mapped_reads = int(
                float(target_rpc) * float(n_cells) * rpc_multiplier)
            subsample_rate = tk_stats.robust_divide(target_mapped_reads,
                                                    mapped_reads)
            # Written as "> 1.0 -> skip" on purpose: a NaN rate compares
            # False and is therefore kept.
            if subsample_rate > 1.0:
                continue

            subsamplings.append({
                'subsample_type': subsample_type,
                'target_rpc': target_rpc,
                'subsample_rate': subsample_rate,
                'all_target_rpc': target_rpcs,
            })

    # Each chunk needs to store the entire gene-bc matrix and a piece of the
    # mol info h5
    matrix_mem_gb = cr_utils.get_mem_gb_request_from_barcode_whitelist(
        barcode_whitelist, gem_groups)
    chunk_len = cr_constants.NUM_MOLECULE_INFO_ENTRIES_PER_CHUNK
    chunk_mem_gb = matrix_mem_gb + MoleculeCounter.estimate_mem_gb(chunk_len)
    join_mem_gb = matrix_mem_gb

    # Split the molecule info h5 into equi-RAM chunks
    chunks = [
        {
            'chunk_start': str(chunk_start),
            'chunk_len': str(min(n_entries - chunk_start, chunk_len)),
            'subsample_info': subsample_info,
            '__mem_gb': chunk_mem_gb,
        }
        for subsample_info in subsamplings
        for chunk_start in xrange(0, n_entries, chunk_len)
    ]

    if not chunks:
        chunks.append({
            'chunk_start': str(0),
            'chunk_len': str(0),
            'subsample_info': {},
        })

    join = {
        '__mem_gb': join_mem_gb,
    }

    return {'chunks': chunks, 'join': join}
def main(args, outs):
    """Count reads per molecule for one BAM chunk and save a MoleculeCounter.

    Reads of ``args.chunk_input`` are grouped by (gem group, barcode, gene
    ids). Within each group, reads are tallied per (UMI, gene) molecule into
    the data columns reported by ``MoleculeCounter.get_data_columns()``, and
    every molecule is then added to the counter opened at ``outs.output``.
    Confidently mapped reads from cell-associated barcodes are additionally
    counted per gem group and stored as a metric before saving.

    Args:
        args: stage inputs; reads ``chunk_input``, ``reference_path``,
            ``filtered_barcodes`` and ``align``.
        outs: stage outputs; ``output`` is the molecule counter path.
    """
    outs.coerce_strings()
    in_bam = tk_bam.create_bam_infile(args.chunk_input)
    counter = cr_mol_counter.MoleculeCounter.open(outs.output, mode='w')
    mol_data_keys = cr_mol_counter.MoleculeCounter.get_data_columns()
    # Data column name -> slot in the per-molecule count array
    mol_data_columns = {key: idx for idx, key in enumerate(mol_data_keys)}
    gene_index = cr_reference.GeneIndex.load_pickle(
        cr_utils.get_reference_genes_index(args.reference_path))
    genomes = cr_utils.get_reference_genomes(args.reference_path)
    genome_index = cr_reference.get_genome_index(genomes)
    # One past the last gene index; used below as the gene of molecules whose
    # reads are unmapped or not confidently mapped.
    none_gene_id = len(gene_index.get_genes())
    # store reference index columns
    # NOTE - these must be cast to str first, as unicode is not supported
    counter.set_ref_column('genome_ids', [str(genome) for genome in genomes])
    counter.set_ref_column('gene_ids',
                           [str(gene.id) for gene in gene_index.genes])
    counter.set_ref_column('gene_names',
                           [str(gene.name) for gene in gene_index.genes])
    # Union of the cell-associated barcodes over all genomes
    filtered_bcs_per_genome = cr_utils.load_barcode_csv(args.filtered_barcodes)
    filtered_bcs = set()
    for _, bcs in filtered_bcs_per_genome.iteritems():
        filtered_bcs |= set(bcs)
    # Per gem group metrics; every new gem group starts with a zero count
    gg_metrics = collections.defaultdict(
        lambda: {cr_mol_counter.GG_CONF_MAPPED_FILTERED_BC_READS_METRIC: 0})
    for (gem_group, barcode, gene_ids), reads_iter in itertools.groupby(
            in_bam, key=cr_utils.barcode_sort_key):
        if barcode is None or gem_group is None:
            continue
        # Must be evaluated before gem_group is rebound to its compressed
        # form a few lines below.
        is_cell_barcode = cr_utils.format_barcode_seq(
            barcode, gem_group) in filtered_bcs
        # (umi, gene index) -> array with one counter per data column
        molecules = collections.defaultdict(
            lambda: np.zeros(len(mol_data_columns), dtype=np.uint64))
        compressed_barcode = cr_mol_counter.MoleculeCounter.compress_barcode_seq(
            barcode)
        # From here on gem_group holds the compressed value; it is used both
        # as the metrics key (via int()) and in counter.add().
        gem_group = cr_mol_counter.MoleculeCounter.compress_gem_group(
            gem_group)
        # (umi, gene index) -> set of (tid, pos) already seen for that molecule
        read_positions = collections.defaultdict(set)
        for read in reads_iter:
            umi = cr_utils.get_read_umi(read)
            # ignore read2 to avoid double-counting. the mapping + annotation should be equivalent.
            if read.is_secondary or umi is None or read.is_read2:
                continue
            raw_umi = cr_utils.get_read_raw_umi(read)
            # NOTE(review): raw_gg and proc_gg are never read afterwards.
            raw_bc, raw_gg = cr_utils.split_barcode_seq(
                cr_utils.get_read_raw_barcode(read))
            proc_bc, proc_gg = cr_utils.split_barcode_seq(
                cr_utils.get_read_barcode(read))
            # Classify the read: confidently mapped, unmapped, or mapped
            # without confidence. Only the first kind carries a real gene.
            if cr_utils.is_read_conf_mapped_to_transcriptome(
                    read, cr_utils.get_high_conf_mapq(args.align)):
                assert len(gene_ids) == 1
                mol_key, map_type = (umi, gene_index.gene_id_to_int(
                    gene_ids[0])), 'reads'
                read_pos = (read.tid, read.pos)
                # Test membership before adding, so the first read seen at a
                # position counts as unique and later ones do not.
                uniq_read_pos = read_pos not in read_positions[mol_key]
                read_positions[mol_key].add(read_pos)
                if is_cell_barcode:
                    gg_metrics[int(gem_group)][
                        cr_mol_counter.
                        GG_CONF_MAPPED_FILTERED_BC_READS_METRIC] += 1
            elif read.is_unmapped:
                mol_key, map_type, uniq_read_pos = (
                    umi, none_gene_id), 'unmapped_reads', False
            else:
                mol_key, map_type, uniq_read_pos = (
                    umi, none_gene_id), 'nonconf_mapped_reads', False
            # Tally the read under its mapping category, plus the correction
            # and unique-position counters.
            molecules[mol_key][mol_data_columns[map_type]] += 1
            molecules[mol_key][mol_data_columns['umi_corrected_reads']] += int(
                not raw_umi == umi)
            molecules[mol_key][mol_data_columns[
                'barcode_corrected_reads']] += int(not raw_bc == proc_bc)
            molecules[mol_key][mol_data_columns[
                'conf_mapped_uniq_read_pos']] += int(uniq_read_pos)
        # Emit this group's molecules ordered by (umi, gene index). Keys are
        # unique, so the sort never has to compare the count arrays.
        for mol_key, molecule in sorted(molecules.items()):
            umi, gene_id = mol_key
            genome = cr_utils.get_genome_from_str(
                gene_index.int_to_gene_id(gene_id), genomes)
            genome_id = cr_reference.get_genome_id(genome, genome_index)
            counter.add(
                barcode=compressed_barcode,
                gem_group=gem_group,
                umi=cr_mol_counter.MoleculeCounter.compress_umi_seq(umi),
                gene=gene_id,
                genome=genome_id,
                **{
                    key: molecule[col_idx]
                    for key, col_idx in mol_data_columns.iteritems()
                })
    in_bam.close()
    # Store a plain dict rather than the defaultdict itself
    counter.set_metric(cr_mol_counter.GEM_GROUPS_METRIC, dict(gg_metrics))
    counter.save()