def get_library_mapping(aggr_id, libraries):
    """Build lookup arrays from old gem groups / library indices to new ones.

    Every entry of ``libraries`` is tagged in place with an ``'index'`` key
    holding its position in the list; that position is used as the new
    library index.

    Args:
        aggr_id (str): The label given to a set of libraries in the aggr CSV file.
        libraries (list of dict): New library info.

    Returns:
        tuple of (gem_group_map, library_map) (np.array, np.array):
            gem_group_map maps the old gem group integer to the new one
            library_map maps the old library index integer to the new one
    """
    # Tag every library (not only the selected ones) with its list position.
    for position, entry in enumerate(libraries):
        entry['index'] = position

    selected = [entry for entry in libraries if entry['aggr_id'] == aggr_id]

    # The lookup arrays are indexed by the *old* values, so they must be long
    # enough to hold the largest old value seen for this aggr_id.
    gg_map_len = 1 + max(entry['old_gem_group'] for entry in selected)
    lib_map_len = 1 + max(entry['old_library_index'] for entry in selected)

    gem_group_map = np.zeros(
        gg_map_len, dtype=MoleculeCounter.get_column_dtype('gem_group'))
    lib_idx_map = np.zeros(
        lib_map_len, dtype=MoleculeCounter.get_column_dtype('library_idx'))

    for entry in selected:
        gem_group_map[entry['old_gem_group']] = entry['gem_group']
        lib_idx_map[entry['old_library_index']] = entry['index']

    return gem_group_map, lib_idx_map
def join(args, outs, chunk_defs, chunk_outs):
    """Concatenate the per-chunk molecule files and record barcode index ranges.

    Writes the merged molecule file to ``outs.merged_molecules`` (flagged as
    aggregated in its metrics) and fills ``outs.gem_group_barcode_ranges``
    with ``{str(gem_group): [barcode_idx_offset, barcode_idx_end]}``.
    """
    chunk_h5s = []
    for chunk_out in chunk_outs:
        chunk_h5s.append(chunk_out.molecule_h5)

    merged_metrics = MoleculeCounter.naive_concatenate_metrics(chunk_h5s)
    merged_metrics[cr_mol_counter.IS_AGGREGATED_METRIC] = True
    MoleculeCounter.concatenate(outs.merged_molecules,
                                chunk_h5s,
                                metrics=merged_metrics)

    # Record, for each gem group, the range of barcode indices it can contain.
    outs.gem_group_barcode_ranges = {}
    for chunk_def, chunk_out in zip(chunk_defs, chunk_outs):
        outs.gem_group_barcode_ranges.update({
            str(gem_group): [
                chunk_def.barcode_idx_offset,
                chunk_def.barcode_idx_end,
            ]
            for gem_group in chunk_out.new_gem_groups
        })
def main(args, outs):
    """Subsample one chunk of the molecule info and save the resulting matrices.

    Reads ``args.chunk_len`` molecule info rows starting at
    ``args.chunk_start``, builds subsampled raw matrices at the rate given in
    ``args.subsample_info``, filters them to the barcodes listed in
    ``args.filtered_barcodes``, and records the subsampled duplication
    fraction in a reporter.

    Outputs written:
        outs.chunked_reporter: the saved reporter.
        outs.subsampled_matrices: dict with the paths of the saved
            ``'raw_matrices'`` and ``'filtered_matrices'`` HDF5 files.

    Returns immediately, writing nothing, when ``args.subsample_info`` has no
    (or a ``None``) ``'subsample_rate'``.
    """
    # Fixed seed so that subsampling is reproducible between runs.
    np.random.seed(0)

    subsample_rate = args.subsample_info.get('subsample_rate')
    if subsample_rate is None:
        return

    mol_counter = MoleculeCounter.open(args.molecule_info,
                                       'r',
                                       start=int(args.chunk_start),
                                       length=int(args.chunk_len))
    # try/finally so the molecule info handle is released even if matrix
    # construction, filtering or reporting raises.
    try:
        # Subsample the matrices
        subsample_result = {}
        subsampled_raw_mats = cr_matrix.GeneBCMatrices.build_from_mol_counter(
            mol_counter,
            subsample_rate=subsample_rate,
            subsample_result=subsample_result)

        # Filter the subsampled matrices
        filtered_bcs_per_genome = cr_utils.load_barcode_csv(
            args.filtered_barcodes)
        subsampled_filt_mats = subsampled_raw_mats.filter_barcodes(
            filtered_bcs_per_genome)

        # Calculations for subsampled duplication rate
        reporter = cr_report.Reporter(
            genomes=map(str, mol_counter.get_ref_column('genome_ids')),
            subsample_types=cr_constants.ALL_SUBSAMPLE_TYPES,
            subsample_depths=args.subsample_info['all_target_rpc'])

        reporter.subsampled_duplication_frac_cb(
            subsampled_raw_mats,
            mol_counter,
            args.subsample_info['subsample_rate'],
            args.subsample_info['subsample_type'],
            args.subsample_info['target_rpc'],
            subsample_result['mapped_reads'],
        )
    finally:
        mol_counter.close()

    reporter.save(outs.chunked_reporter)

    outs.subsampled_matrices = {}
    outs.subsampled_matrices['raw_matrices'] = martian.make_path(
        'raw_matrices.h5')
    outs.subsampled_matrices['filtered_matrices'] = martian.make_path(
        'filtered_matrices.h5')

    subsampled_raw_mats.save_h5(outs.subsampled_matrices['raw_matrices'])
    subsampled_filt_mats.save_h5(outs.subsampled_matrices['filtered_matrices'])
def join(args, outs, chunk_defs, chunk_outs):
    """Merge the per-chunk molecule info files into ``outs.output``.

    The metrics stored with the merged file are derived from the merged
    upstream summaries plus reference metadata, the library info read from
    the first input BAM, and the per-library usable-read counts summed over
    the chunk outputs.
    """
    summary = cr_utils.merge_jsons_as_dict([
        args.extract_reads_summary,
        args.attach_bcs_and_umis_summary,
        args.mark_duplicates_summary,
    ])

    # Hack for getting reference metadata -
    # this used to be computed in prior stages.
    # This is needed for storage in the molecule_info HDF5.
    tmp_reporter = cr_report.Reporter()
    tmp_reporter.store_reference_metadata(args.reference_path,
                                          cr_constants.REFERENCE_TYPE,
                                          cr_constants.REFERENCE_METRIC_PREFIX)
    ref_metadata = tmp_reporter.report(cr_constants.DEFAULT_REPORT_TYPE)
    summary.update(ref_metadata)

    # Load library info from BAM. The BAM is only needed for this lookup, so
    # close it right away (even on error) instead of leaking the handle.
    in_bam = tk_bam.create_bam_infile(args.inputs[0])
    try:
        library_info = rna_library.get_bam_library_info(in_bam)
    finally:
        in_bam.close()

    metrics = MoleculeCounter.get_metrics_from_summary(summary, library_info,
                                                       args.recovered_cells,
                                                       args.force_cells)

    input_h5_filenames = [chunk_out.output for chunk_out in chunk_outs]

    # update with metrics that were computed in the chunks
    chunk_metric = cr_mol_counter.USABLE_READS_METRIC
    summed_lib_metrics = MoleculeCounter.sum_library_metric(
        input_h5_filenames, chunk_metric)
    for lib_key, value in summed_lib_metrics.iteritems():
        metrics[cr_mol_counter.LIBRARIES_METRIC][lib_key][chunk_metric] = value

    MoleculeCounter.concatenate(outs.output,
                                input_h5_filenames,
                                metrics=metrics)
def split(args):
    """Chunk the data by input library.

    Emits one chunk per entry of ``args.sample_defs``. Barcodes of samples
    sharing a barcode whitelist occupy the same index range in the merged
    barcode list; the merged list is pickled to a file whose path is passed
    to every chunk.
    """
    chunks = []
    merged_barcodes = []
    # barcode whitelist -> offset of its barcodes within merged_barcodes
    whitelist_offsets = {}
    next_offset = 0

    merged_barcodes_file = martian.make_path('merged_barcodes.pickle')

    for sample_def in args.sample_defs:
        h5_path = sample_def[cr_constants.AGG_H5_FIELD]
        with MoleculeCounter.open(h5_path, 'r') as mol_counter:
            mem_gb = int(
                1.5 * MoleculeCounter.estimate_mem_gb(mol_counter.nrows()))
            barcode_whitelist = mol_counter.get_barcode_whitelist()
            barcodes = mol_counter.get_barcodes()

            try:
                idx_offset = whitelist_offsets[barcode_whitelist]
            except KeyError:
                # First sample using this whitelist: reserve a new range.
                merged_barcodes.extend(barcodes)
                idx_offset = next_offset
                whitelist_offsets[barcode_whitelist] = idx_offset
                next_offset += len(barcodes)

            chunks.append({
                'aggr_id': sample_def[cr_constants.AGG_ID_FIELD],
                'molecule_h5': h5_path,
                '__mem_gb': mem_gb,
                'barcode_idx_offset': idx_offset,
                'barcode_idx_end': idx_offset + len(barcodes),
                'merged_barcodes': merged_barcodes_file,
            })

    with open(merged_barcodes_file, 'wb') as fp:
        cPickle.dump(merged_barcodes, fp, cPickle.HIGHEST_PROTOCOL)

    return {'chunks': chunks, 'join': {'__mem_gb': 6}}
def split(args):
    """Request join resources; this stage defines no chunks of its own.

    The join memory request is computed for a matrix with one column per
    (barcode, gem group) pair and ``args.raw_nnz`` nonzero entries.
    """
    with MoleculeCounter.open(args.molecules, 'r') as mc:
        # For memory request calculation
        gem_groups = {lib['gem_group'] for lib in mc.get_library_info()}
        # Number of barcodes in the full matrix
        num_barcodes = mc.get_ref_column_lazy('barcodes').shape[0]

    num_gem_groups = len(gem_groups)
    # Worst case number of nonzero elements in final matrix
    num_nonzero = args.raw_nnz

    join_request = {
        '__mem_gb': CountMatrix.get_mem_gb_from_matrix_dim(
            num_barcodes * num_gem_groups, num_nonzero),
        '__threads': 2,
    }
    return {'chunks': [], 'join': join_request}
def summarize_read_matrix(matrix, library_info, barcode_info, barcode_seqs):
    """Summarize matrix of read-pair counts.

    For every library type (and every genome, for library types that have
    genomes) the returned dict holds, keyed by metric prefix:
        ``<prefix>_raw_mapped_reads``: sum over the selected features,
        ``<prefix>_flt_mapped_reads``: the same sum restricted to filtered
            barcodes,
        ``<prefix>_filtered_bcs``: number of filtered barcodes.
    """
    summary = {}
    matrix_view = matrix.view()
    library_types = sorted({lib['library_type'] for lib in library_info})

    for lib_type in library_types:
        per_genome = rna_library.has_genomes(lib_type)

        if per_genome:
            genome_names = [str(g) for g in barcode_info.genomes]
        else:
            genome_names = [lib_constants.MULTI_REFS_PREFIX]

        for genome in genome_names:
            counts = matrix_view.select_features_by_type(lib_type)
            genome_idx = None
            if per_genome:
                counts = counts.select_features_by_genome(genome)
                genome_idx = barcode_info.genomes.index(genome)

            prefix = '%s%s' % (
                rna_library.get_library_type_metric_prefix(lib_type), genome)

            summary['%s_raw_mapped_reads' % prefix] = counts.sum()

            filtered_bcs = MoleculeCounter.get_filtered_barcodes(
                barcode_info,
                library_info,
                barcode_seqs,
                genome_idx=genome_idx,
                library_type=lib_type)
            filtered_counts = counts.select_barcodes_by_seq(filtered_bcs)

            summary['%s_flt_mapped_reads' % prefix] = filtered_counts.sum()
            summary['%s_filtered_bcs' % prefix] = len(filtered_bcs)

    return summary
def main(args, outs):
    """Count reads per molecule in one chunk BAM and write a molecule info file.

    Reads from ``args.chunk_input`` are grouped by (gem group, barcode). For
    each group, reads are tallied per (UMI, library index, feature) and the
    tallies are appended, sorted by feature index, to the molecule info file
    at ``outs.output``. The per-library count of usable reads (counted reads
    whose barcode is cell-associated) is stored as a metric.
    """
    outs.coerce_strings()

    # Load whitelist
    whitelist = cr_utils.load_barcode_whitelist(args.barcode_whitelist)
    # Barcode sequence -> position in the whitelist
    barcode_to_idx = OrderedDict((k, i) for i, k in enumerate(whitelist))

    # Load feature reference
    feature_ref = rna_feature_ref.from_transcriptome_and_csv(
        args.reference_path, args.feature_reference)

    # Load library info from BAM
    in_bam = tk_bam.create_bam_infile(args.chunk_input)
    library_info = rna_library.get_bam_library_info(in_bam)

    # Get cell-associated barcodes by genome
    filtered_bcs_by_genome = cr_utils.load_barcode_csv(args.filtered_barcodes)
    filtered_bc_union = cr_utils.get_cell_associated_barcode_set(
        args.filtered_barcodes)

    # Create the barcode info
    barcode_info = MoleculeCounter.build_barcode_info(filtered_bcs_by_genome,
                                                      library_info, whitelist)

    # Create the molecule info file
    mc = MoleculeCounter.open(outs.output,
                              mode='w',
                              feature_ref=feature_ref,
                              barcodes=whitelist,
                              library_info=library_info,
                              barcode_info=barcode_info)

    # Initialize per-library metrics
    lib_metrics = {}
    for lib_idx in xrange(len(library_info)):
        lib_metrics[str(lib_idx)] = {}
        lib_metrics[str(lib_idx)][cr_mol_counter.USABLE_READS_METRIC] = 0

    # Record read-counts per molecule. Note that UMIs are not contiguous
    # in the input because no sorting was done after UMI correction.

    prev_gem_group = None
    prev_barcode_idx = None

    for (gem_group, barcode_seq), reads_iter in \
        itertools.groupby(in_bam, key=cr_utils.barcode_sort_key_no_umi):
        # Reads without a barcode sequence are not counted.
        if barcode_seq is None:
            continue
        barcode_idx = barcode_to_idx[barcode_seq]

        # Assert expected sort order of input BAM
        assert gem_group >= prev_gem_group
        if gem_group == prev_gem_group:
            assert barcode_idx >= prev_barcode_idx

        is_cell_barcode = cr_utils.format_barcode_seq(
            barcode_seq, gem_group) in filtered_bc_union

        # (umi, library index, feature index) -> number of reads
        counts = defaultdict(int)

        for read in reads_iter:
            # ignore read2 to avoid double-counting. the mapping + annotation should be equivalent.
            if read.is_secondary or \
               read.is_read2 or \
               cr_utils.is_read_low_support_umi(read) or \
               not cr_utils.is_read_conf_mapped_to_feature(read):
                continue

            umi_seq = cr_utils.get_read_umi(read)
            if umi_seq is None:
                continue

            # Second argument is the bit width of the 'umi' column dtype.
            umi_int = MoleculeCounter.compress_umi_seq(
                umi_seq,
                MoleculeCounter.get_column_dtype('umi').itemsize * 8)

            feature_ids = cr_utils.get_read_gene_ids(read)
            assert len(feature_ids) == 1
            feature_int = feature_ref.id_map[feature_ids[0]].index

            library_idx = cr_utils.get_read_library_index(read)

            counts[(umi_int, library_idx, feature_int)] += 1

            if is_cell_barcode:
                lib_metrics[str(library_idx)][
                    cr_mol_counter.USABLE_READS_METRIC] += 1

            # NOTE(review): the nesting level of these two updates could not
            # be recovered with certainty from the collapsed source; here
            # they run once per counted read. Confirm against the original.
            prev_gem_group = gem_group
            prev_barcode_idx = barcode_idx

        # Record data for this barcode
        gg_int = MoleculeCounter.get_column_dtype('gem_group').type(gem_group)
        mc.append_column('gem_group', np.repeat(gg_int, len(counts)))
        bc_int = MoleculeCounter.get_column_dtype('barcode_idx').type(
            barcode_idx)
        mc.append_column('barcode_idx', np.repeat(bc_int, len(counts)))

        feature_ints = np.fromiter(
            (k[2] for k in counts.iterkeys()),
            dtype=MoleculeCounter.get_column_dtype('feature_idx'),
            count=len(counts))
        # Sort by feature for fast matrix construction
        order = np.argsort(feature_ints)
        feature_ints = feature_ints[order]
        mc.append_column('feature_idx', feature_ints)
        del feature_ints

        # The remaining columns are reordered with the same permutation so
        # that rows stay aligned across columns.
        li_ints = np.fromiter(
            (k[1] for k in counts.iterkeys()),
            dtype=MoleculeCounter.get_column_dtype('library_idx'),
            count=len(counts))[order]
        mc.append_column('library_idx', li_ints)
        del li_ints

        umi_ints = np.fromiter((k[0] for k in counts.iterkeys()),
                               dtype=MoleculeCounter.get_column_dtype('umi'),
                               count=len(counts))[order]
        mc.append_column('umi', umi_ints)
        del umi_ints

        count_ints = np.fromiter(
            counts.itervalues(),
            dtype=MoleculeCounter.get_column_dtype('count'),
            count=len(counts))[order]
        mc.append_column('count', count_ints)
        del count_ints

    in_bam.close()

    mc.set_metric(cr_mol_counter.LIBRARIES_METRIC, dict(lib_metrics))

    mc.save()
def main(args, outs):
    """Assign aggregate-wide gem groups, library records and batch ids.

    Every (input sample, old gem group) pair gets a fresh gem group number,
    counted from 1 across all samples. Each library of each input sample is
    copied and annotated with its new gem group, a unique library id, its
    old library index / gem group, its aggr id and its batch name / id.

    Outputs written:
        outs.libraries: list of the annotated library dicts.
        outs.gem_group_index: ``{new_gem_group: (aggr_id, old_gem_group)}``.
        outs.chemistry_batch_correction: True when a batch column was given
            and more than one batch is present.
        outs.gem_group_index_json: JSON file holding the gem group index.
    """
    next_gg = 0
    gg_index = {}
    libraries = []
    chemistry_batch_correction = False

    ### Batch info
    # If a column 'batch' is given in sample_defs (read from input csv), that
    # column will be used as batch identifier and chemistry_batch_correction will
    # be turned on. otherwise, aggr_id will be used as batch identifier.
    # Each batch will have a distinct batch_id, which is an increasing integer.
    batch_name_to_id = {}

    sample_defs = [] if args.sample_defs is None else args.sample_defs

    for sample_def in sample_defs:
        # Old gem group -> new gem group, for this input sample only.
        # Looking the new value up here (instead of reusing the running
        # counter) keeps the mapping right when a sample's libraries are not
        # grouped by gem group, e.g. old gem groups ordered 1, 2, 1.
        gg_remap = {}

        aggr_id = sample_def[cr_constants.AGG_ID_FIELD]

        if cr_constants.AGG_BATCH_FIELD in sample_def:
            chemistry_batch_correction = True
            batch_name = sample_def[cr_constants.AGG_BATCH_FIELD]
        else:
            batch_name = aggr_id

        if batch_name not in batch_name_to_id:
            batch_name_to_id[batch_name] = len(batch_name_to_id)

        with MoleculeCounter.open(sample_def[cr_constants.AGG_H5_FIELD],
                                  'r') as mc:
            old_libraries = mc.get_library_info()

            for lib_idx, old_lib in enumerate(old_libraries):
                # Remap gem groups
                old_gg = int(old_lib['gem_group'])

                # Allocate a new gem group the first time this old gem group
                # is seen within the current input sample.
                if old_gg not in gg_remap:
                    next_gg += 1
                    gg_remap[old_gg] = next_gg
                    gg_index[next_gg] = (aggr_id, old_gg)
                new_gg = gg_remap[old_gg]

                # Remap libraries
                new_lib = copy.deepcopy(old_lib)
                new_lib['gem_group'] = new_gg

                # Make the new library id unique
                new_lib['library_id'] += ".%d" % (new_gg)
                new_lib['old_library_index'] = lib_idx
                new_lib['old_gem_group'] = old_gg
                new_lib['aggr_id'] = aggr_id
                new_lib['batch_name'] = batch_name
                new_lib['batch_id'] = batch_name_to_id[batch_name]
                libraries.append(new_lib)

    # Batch correction needs at least two batches to compare.
    if chemistry_batch_correction is True and len(batch_name_to_id) <= 1:
        chemistry_batch_correction = False
        martian.log_info('Warning: only one batch sepecified in the input csv, chemistry_batch_correction is disabled.')

    outs.libraries = libraries
    outs.gem_group_index = gg_index
    outs.chemistry_batch_correction = chemistry_batch_correction

    # Write the "gem group index" (a legacy structure) for Loupe
    with open(outs.gem_group_index_json, 'w') as outfile:
        json.dump({"gem_group_index": gg_index}, outfile)
def split(args):
    """Plan the subsampling chunks over the molecule info file.

    One subsampling definition is produced per (subsample type, target reads
    per cell) whose required rate does not exceed 1; each definition is run
    over every fixed-size slice of the molecule info. A single empty
    placeholder chunk is returned when there are no cells or no chunks.
    """
    # Get the cell count
    bcs_by_genome = cr_utils.load_barcode_csv(args.filtered_barcodes)
    cell_barcodes = set()
    for genome_bcs in bcs_by_genome.itervalues():
        cell_barcodes.update(genome_bcs)
    n_cells = len(cell_barcodes)

    if n_cells == 0:
        placeholder = {'chunk_start': 0, 'chunk_len': 0, 'subsample_info': {}}
        return {'chunks': [placeholder]}

    # Get required info from the mol info
    with MoleculeCounter.open(args.molecule_info, 'r') as mol_counter:
        n_rows = mol_counter.nrows()
        barcode_whitelist = mol_counter.get_barcode_whitelist()
        gem_groups = mol_counter.get_gem_groups()
        raw_reads = mol_counter.get_total_raw_reads()
        mapped_reads = mol_counter.get_total_conf_mapped_filtered_bc_reads()

    raw_rpc = tk_stats.robust_divide(raw_reads, n_cells)
    mapped_read_frac = tk_stats.robust_divide(mapped_reads, raw_reads)

    # Calculate extra deciles to add in based on raw reads
    subsampling_deciles = []
    if raw_reads > 0:
        for decile in np.arange(0.1, 1.1, 0.1):
            subsampling_deciles.append(round(decile * raw_rpc))

    # All target depths
    target_rpcs = cr_constants.SUBSAMPLE_READS_PER_CELL + subsampling_deciles

    multiplier_by_type = (
        (cr_constants.RAW_SUBSAMPLE_TYPE, mapped_read_frac),
        (cr_constants.MAPPED_SUBSAMPLE_TYPE, 1.0),
    )

    # Generate subsampling definitions
    subsamplings = []
    for subsample_type, rpc_multiplier in multiplier_by_type:
        for target_rpc in target_rpcs:
            target_mapped_reads = int(
                float(target_rpc) * float(n_cells) * rpc_multiplier)
            subsample_rate = tk_stats.robust_divide(target_mapped_reads,
                                                    mapped_reads)
            # Skip depths that would need more reads than are available.
            if subsample_rate > 1.0:
                continue
            subsamplings.append({
                'subsample_type': subsample_type,
                'target_rpc': target_rpc,
                'subsample_rate': subsample_rate,
                'all_target_rpc': target_rpcs,
            })

    # Each chunk needs to store the entire gene-bc matrix and a piece of the mol info h5
    matrix_mem_gb = cr_utils.get_mem_gb_request_from_barcode_whitelist(
        barcode_whitelist, gem_groups)
    chunk_len = cr_constants.NUM_MOLECULE_INFO_ENTRIES_PER_CHUNK
    chunk_mem_gb = matrix_mem_gb + MoleculeCounter.estimate_mem_gb(chunk_len)
    join_mem_gb = matrix_mem_gb

    # Split the molecule info h5 into equi-RAM chunks
    chunks = []
    for subsample_info in subsamplings:
        chunks.extend({
            'chunk_start': str(chunk_start),
            'chunk_len': str(min(n_rows - chunk_start, chunk_len)),
            'subsample_info': subsample_info,
            '__mem_gb': chunk_mem_gb,
        } for chunk_start in xrange(0, n_rows, chunk_len))

    join = {
        '__mem_gb': join_mem_gb,
    }

    if not chunks:
        chunks.append({
            'chunk_start': str(0),
            'chunk_len': str(0),
            'subsample_info': {},
        })

    return {'chunks': chunks, 'join': join}
def join(args, outs, chunk_defs, chunk_outs):
    """Merge the per-chunk subsampling tallies and write summary metrics.

    Each chunk's pickled tally dict is summed key by key. For every
    subsampling task the summary then records, per genome, the median UMI
    count and median detected-feature count over cell-associated barcodes
    and the duplication fraction, plus one duplication fraction over the
    whole dataset. The summary is written as JSON to ``outs.summary``.
    """
    # Merge tallies
    data = None
    for chunk in chunk_outs:
        # Pickle data is binary; open it in binary mode.
        with open(chunk.metrics, 'rb') as f:
            chunk_data = cPickle.load(f)
        if data is None:
            data = chunk_data
        else:
            for k in data:
                data[k] += chunk_data[k]

    # Compute metrics for each subsampling rate
    summary = {}

    with MoleculeCounter.open(args.molecule_info, 'r') as mc:
        genomes = sorted(
            set(
                f.tags.get('genome', '')
                for f in mc.feature_reference.feature_defs))
        lib_types = sorted(set(lib['library_type'] for lib in mc.library_info))
    lib_type_map = dict((lt, idx) for (idx, lt) in enumerate(lib_types))
    cell_bcs_by_genome = get_cell_associated_barcodes(genomes,
                                                      args.filtered_barcodes)

    # Give each cell-associated barcode an integer index
    cell_bcs = sorted(cell_bcs_by_genome[''])
    cell_bc_to_int = {bc: i for i, bc in enumerate(cell_bcs)}

    # All chunks carry the same task list; with no chunks there is nothing
    # to summarize.
    subsample_info = chunk_defs[0].subsample_info if len(
        chunk_defs) > 0 else []

    for i, task in enumerate(subsample_info):
        lib_type = task['library_type']
        lib_type_idx = lib_type_map[lib_type]
        ss_type = task['subsample_type']
        ss_depth = task['target_read_pairs_per_cell']

        if rna_library.has_genomes(lib_type):
            genome_ints = list(range(data['umis_per_bc'].shape[1]))
        else:
            genome_ints = [0]

        # Per-genome metrics
        for g in genome_ints:
            if not data['lib_type_genome_any_reads'][lib_type_idx, g]:
                continue
            genome = genomes[g]

            # Only compute on cell-associated barcodes for this genome.
            # This only matters when there are multiple genomes present.
            cell_inds = np.array(
                sorted(cell_bc_to_int[bc]
                       for bc in cell_bcs_by_genome[genome]))

            median_umis_per_cell = np.median(
                data['umis_per_bc'][i, g, cell_inds])
            summary[make_metric_name('subsampled_filtered_bcs_median_counts',
                                     lib_type, genome, ss_type,
                                     ss_depth)] = median_umis_per_cell

            median_features_per_cell = np.median(
                data['features_det_per_bc'][i, g, cell_inds])
            summary[make_metric_name(
                'subsampled_filtered_bcs_median_unique_genes_detected',
                lib_type, genome, ss_type,
                ss_depth)] = median_features_per_cell

            dup_frac = compute_dup_frac(data['read_pairs'][i, g],
                                        data['umis'][i, g])
            summary[make_metric_name('subsampled_duplication_frac', lib_type,
                                     genome, ss_type, ss_depth)] = dup_frac

        # Whole-dataset duplication frac
        all_read_pairs = np.sum(data['read_pairs'][i, :])
        all_umis = np.sum(data['umis'][i, :])
        dup_frac = compute_dup_frac(all_read_pairs, all_umis)
        summary[make_metric_name('subsampled_duplication_frac', lib_type,
                                 lib_constants.MULTI_REFS_PREFIX, ss_type,
                                 ss_depth)] = dup_frac

    with open(outs.summary, 'w') as f:
        json.dump(tk_safe_json.json_sanitize(summary),
                  f,
                  indent=4,
                  sort_keys=True)
def main(args, outs):
    """Downsample one slice of the molecule info and build count matrices.

    Read pairs of each molecule in the slice
    ``[args.chunk_start, args.chunk_start + args.chunk_len)`` are binomially
    thinned at the keep-fraction of the molecule's library
    (``args.frac_reads_kept``). Molecules left with no reads are dropped. The
    survivors are turned into a raw UMI matrix and a filtered UMI matrix
    (saved as HDF5) and a read-count matrix that is only summarized.

    Outputs written: ``outs.raw_matrix_h5``, ``outs.raw_nnz``,
    ``outs.filtered_matrix_h5``, ``outs.filtered_nnz``, ``outs.chunk_summary``
    (JSON); ``outs.raw_matrices_mex`` and ``outs.filtered_matrices_mex`` are
    set to None.
    """
    # Fixed seed so that the binomial downsampling is reproducible.
    np.random.seed(0)

    LogPerf.mem()

    with MoleculeCounter.open(args.molecules, 'r') as mc:
        library_info = mc.get_library_info()
        barcode_info = mc.get_barcode_info()

        metrics_in = mc.get_all_metrics()
        # Work on a copy so the metrics read from the file stay untouched.
        metrics_out = copy.deepcopy(metrics_in)

        # Compute subsampling rate and approximate new total readpair count
        frac_reads_kept = np.array(args.frac_reads_kept, dtype=float)
        total_reads_in = mc.get_raw_read_pairs_per_library()
        total_reads_out = total_reads_in * frac_reads_kept

        for lib_idx, _ in enumerate(library_info):
            metrics_out[cr_mol_counter.LIBRARIES_METRIC][str(
                lib_idx)][cr_mol_counter.
                          DOWNSAMPLED_READS_METRIC] = total_reads_out[lib_idx]

        # downsample molecule info
        chunk = slice(args.chunk_start, args.chunk_start + args.chunk_len)
        mol_library_idx = mc.get_column_lazy('library_idx')[chunk]
        mol_read_pairs = mc.get_column_lazy('count')[chunk]

        # Per-molecule keep probability, looked up by the molecule's library.
        mol_rate = frac_reads_kept[mol_library_idx]
        del mol_library_idx

        new_read_pairs = np.random.binomial(mol_read_pairs, mol_rate)
        del mol_read_pairs
        del mol_rate

        # Keep only molecules that still have at least one read pair.
        keep_mol = np.flatnonzero(new_read_pairs)
        new_read_pairs = new_read_pairs[keep_mol]

        mol_gem_group = mc.get_column_lazy('gem_group')[chunk][keep_mol]
        mol_barcode_idx = mc.get_column_lazy('barcode_idx')[chunk][keep_mol]
        mol_feature_idx = mc.get_column_lazy('feature_idx')[chunk][keep_mol]

        # Assert that gem groups start at 1 and are contiguous
        gem_groups = sorted(set(lib['gem_group'] for lib in library_info))
        assert(min(gem_groups) == 1 and \
               np.all(np.diff(np.array(gem_groups,dtype=int)) == 1))

        feature_ref = mc.get_feature_ref()

        # Compute matrix dimensions
        # Get the range of possible barcode indices for each gem group.
        # Both arrays are indexed by gem group; slot 0 is never assigned
        # (gem groups start at 1) and stays zero.
        gg_barcode_idx_start = np.zeros(1 + len(gem_groups), dtype=int)
        gg_barcode_idx_len = np.zeros(1 + len(gem_groups), dtype=int)
        for gg_str, idx_range in sorted(
                args.gem_group_barcode_ranges.iteritems(),
                key=lambda kv: int(kv[0])):
            gg = int(gg_str)
            gg_barcode_idx_start[gg] = idx_range[0]
            gg_barcode_idx_len[gg] = idx_range[1] - idx_range[0]

        num_bcs = gg_barcode_idx_len.sum()
        num_features = feature_ref.get_num_features()

        print 'downsampled'
        LogPerf.mem()

        # Convert molecule barcode indices into matrix barcode indices
        # The molecule info barcode_idx is in this space:
        #  [W_0, W_1, ...] where W_i is distinct original whitelist i.
        # The matrix is in, e.g., this space:
        #  [w_0-1, w_1-2, w_0-3, ...] where w_i-j is a copy of whitelist i for gem group j.

        # Return to the original whitelist index
        mol_barcode_idx -= gg_barcode_idx_start.astype(
            np.uint64)[mol_gem_group]

        # Offset by the cumulative whitelist length up to a barcode's gem group
        # (entry gg - 1 of the cumulative sum is the total length of all
        # gem groups before gg, because slot 0 has length zero).
        gg_barcode_matrix_start = np.cumsum(gg_barcode_idx_len).astype(
            np.uint64)
        mol_barcode_idx += gg_barcode_matrix_start[mol_gem_group - 1]

        # One entry of value 1 per surviving molecule.
        ones = np.ones(len(mol_barcode_idx),
                       dtype=cr_matrix.DEFAULT_DATA_DTYPE)
        umi_matrix = sp_sparse.coo_matrix(
            (ones, (mol_feature_idx, mol_barcode_idx)),
            shape=(num_features, num_bcs))
        print 'created umi matrix'
        LogPerf.mem()

        # Create a read-count matrix so we can summarize reads per barcode
        read_matrix = sp_sparse.coo_matrix(
            (new_read_pairs, (mol_feature_idx, mol_barcode_idx)),
            shape=(num_features, num_bcs))
        del ones
        del mol_feature_idx
        del mol_barcode_idx
        del new_read_pairs

        # Get all barcodes strings for the raw matrix
        barcode_seqs = mc.get_barcodes()

        print len(barcode_seqs), len(gem_groups)

        print 'creating barcode strings'
        LogPerf.mem()

        # One formatted barcode per (gem group, barcode in that gem group's
        # index range), in gem group order to match the matrix columns.
        barcodes = []
        for gg in gem_groups:
            idx_start = gg_barcode_idx_start[gg]
            idx_end = idx_start + gg_barcode_idx_len[gg]
            gg_bcs = np.array([
                cr_utils.format_barcode_seq(bc, gg)
                for bc in barcode_seqs[idx_start:idx_end]
            ])
            barcodes.append(gg_bcs)
        barcodes = np.concatenate(barcodes)
        barcodes.flags.writeable = False

        print 'created barcode strings'
        LogPerf.mem()

        # Get mapped reads per barcode per library,genome
        read_summary = {}
        read_matrix = CountMatrix(feature_ref, barcodes, read_matrix)
        read_matrix.m = read_matrix.m.tocsc(copy=True)
        read_summary = summarize_read_matrix(read_matrix, library_info,
                                             barcode_info, barcode_seqs)
        del read_matrix

        print 'created read matrix'
        LogPerf.mem()

        # Construct the raw UMI matrix
        raw_umi_matrix = CountMatrix(feature_ref, barcodes, umi_matrix)
        raw_umi_matrix.save_h5_file(outs.raw_matrix_h5)
        outs.raw_nnz = raw_umi_matrix.m.nnz

        # Construct the filtered UMI matrix
        filtered_bcs = MoleculeCounter.get_filtered_barcodes(
            barcode_info, library_info, barcode_seqs)
        filtered_umi_matrix = raw_umi_matrix.select_barcodes_by_seq(
            filtered_bcs)
        filtered_umi_matrix.save_h5_file(outs.filtered_matrix_h5)
        outs.filtered_nnz = filtered_umi_matrix.m.nnz

        print 'created filtered umi matrix'
        LogPerf.mem()

    summary = {
        'read_summary': read_summary,
        'mol_metrics': metrics_out,
    }

    with open(outs.chunk_summary, 'w') as f:
        json.dump(tk_safe_json.json_sanitize(summary),
                  f,
                  indent=4,
                  sort_keys=True)

    # Don't write MEX from chunks.
    outs.raw_matrices_mex = None
    outs.filtered_matrices_mex = None
def join(args, outs, chunk_defs, chunk_outs):
    """Check that the aggregated matrix is consistent with its inputs.

    Compares the input molecule files (``args.input_sample_defs``) with the
    merged raw matrix (``args.merged_raw_gene_bc_matrices_h5``): the set of
    genomes, the set of features and the number of gem groups must match,
    and per (library, gem group) the barcode and feature count vectors must
    have equal length with no output count exceeding the input molecule
    count. Any violation is logged and passed to ``martian.exit``.
    Otherwise a small summary is written as JSON to ``outs.summary``.
    """
    # compute invariants on input data
    input_genomes = set()
    input_features = set()
    # (library_id, gem group) -> number of molecules per barcode / feature
    input_bc_counts = {}
    input_feature_counts = {}
    input_num_gem_groups = 0

    for sample_def in args.input_sample_defs:
        library_id = sample_def['library_id']
        with MoleculeCounter.open(sample_def[cr_constants.AGG_H5_FIELD],
                                  'r') as mc:
            input_genomes.update(mol_counter_genomes(mc))
            input_features.update(mol_counter_features_id_type(mc))
            gem_groups = mc.get_gem_groups()
            input_num_gem_groups += len(gem_groups)

            mol_gem_group = mc.get_column('gem_group')

            mol_barcode_idx = mc.get_column('barcode_idx')
            for gg in gem_groups:
                input_bc_counts[(library_id, gg)] = np.zeros(
                    len(mc.get_ref_column('barcodes')))
                bc_idx, counts = np.unique(
                    mol_barcode_idx[mol_gem_group == gg], return_counts=True)
                input_bc_counts[(library_id, gg)][bc_idx] = counts
            del mol_barcode_idx

            mol_feature_idx = mc.get_column('feature_idx')
            for gg in gem_groups:
                input_feature_counts[(library_id, gg)] = np.zeros(
                    len(mc.feature_reference.feature_defs))
                feature_idx, counts = np.unique(
                    mol_feature_idx[mol_gem_group == gg], return_counts=True)
                input_feature_counts[(library_id, gg)][feature_idx] = counts
            del mol_feature_idx

    # compute invariants on output
    output_matrix = cr_matrix.CountMatrix.load_h5_file(
        args.merged_raw_gene_bc_matrices_h5)
    output_genomes = set(output_matrix.get_genomes())
    output_features = set(count_matrix_features_id_type(output_matrix))
    output_bc_counts = {}
    output_feature_counts = {}
    output_gem_index = cr_matrix.get_gem_group_index(
        args.merged_raw_gene_bc_matrices_h5)
    output_num_gem_groups = len(output_gem_index)

    # Key the output counts by the original (library, gem group) so they can
    # be compared with the input counts.
    for gg in output_gem_index:
        library_id, old_gg = output_gem_index[gg]
        matrix_gg = output_matrix.select_barcodes_by_gem_group(gg)
        output_bc_counts[(library_id, old_gg)] = matrix_gg.get_counts_per_bc()
        output_feature_counts[(library_id,
                               old_gg)] = matrix_gg.get_counts_per_feature()

    # NOTE(review): the contact address in this literal looks like a
    # redaction placeholder - confirm the intended address.
    exit_message = (
        'An internal problem in the aggr pipeline has been detected '
        'that might lead to incorrect results. Please report this '
        'problem to [email protected].')

    if input_genomes != output_genomes:
        martian.log_info(
            'Genomes differ between input molecule files and aggregated matrix'
        )
        martian.exit(exit_message)
    if input_features != output_features:
        martian.log_info(
            'Features differ between input molecule files and aggregated matrix'
        )
        martian.exit(exit_message)
    if input_num_gem_groups != output_num_gem_groups:
        martian.log_info(
            'Number of GEM groups differs between input molecule files and aggregated matrix'
        )
        martian.exit(exit_message)
    for lib_gg in input_bc_counts.keys():
        if len(input_bc_counts[lib_gg]) != len(output_bc_counts[lib_gg]):
            martian.log_info(
                'Barcode list for library {}, GEM group {} has different length '
                'in aggregated output compared to input.'.format(
                    lib_gg[0], lib_gg[1]))
            martian.exit(exit_message)
        # The output may hold fewer counts than the input, never more.
        if np.any(input_bc_counts[lib_gg] < output_bc_counts[lib_gg]):
            martian.log_info(
                'Barcode(s) in library {}, GEM group {} have higher UMI counts '
                'in aggregated output compared to inputs'.format(
                    lib_gg[0], lib_gg[1]))
            martian.exit(exit_message)
        if len(input_feature_counts[lib_gg]) != len(
                output_feature_counts[lib_gg]):
            martian.log_info(
                'Feature list for library {}, GEM group {} has different length '
                'in aggregated output compared to input.'.format(
                    lib_gg[0], lib_gg[1]))
            martian.exit(exit_message)
        if np.any(
                input_feature_counts[lib_gg] < output_feature_counts[lib_gg]):
            martian.log_info(
                'Feature(s) in library {}, GEM group {} have higher UMI counts '
                'in aggregated output compared to inputs'.format(
                    lib_gg[0], lib_gg[1]))
            martian.exit(exit_message)

    summary = {
        'genomes_present': list(input_genomes),
        'num_features_in_ref': len(input_features),
        'num_gem_groups': input_num_gem_groups,
    }

    with open(outs.summary, 'w') as f:
        json.dump(tk_safe_json.json_sanitize(summary),
                  f,
                  indent=4,
                  sort_keys=True)
def join(args, outs, chunk_defs, chunk_outs):
    """Merge the chunk matrices, write them out, and extend the summary.

    Merges and saves the raw and the filtered matrix (HDF5 and MEX), writes
    a barcode summary HDF5 with per-barcode Gene Expression counts, and adds
    per (library type, genome) metrics to the summary loaded from
    ``args.summary``: median counts, median detected features and the
    fraction of mapped reads in filtered barcodes. The extended summary is
    written as JSON to ``outs.summary``.
    """
    version = martian.get_pipelines_version()

    with open(args.summary) as f:
        summary = json.load(f)

    with MoleculeCounter.open(args.molecules, 'r') as mc:
        library_info = mc.get_library_info()
        barcode_info = mc.get_barcode_info()
        barcode_seqs = mc.get_barcodes()

    lib_types = sorted(set(lib['library_type'] for lib in library_info))

    # make attrs for user-added columns in aggr csv
    extra_attrs = get_custom_aggr_columns(args.sample_defs)
    # track original library/gem info
    library_map = cr_matrix.make_library_map_aggr(args.gem_group_index)
    extra_attrs.update(library_map)

    # Merge raw matrix
    raw_matrix = cr_matrix.merge_matrices(args.raw_matrices_h5)
    raw_matrix.save_h5_file(outs.raw_matrix_h5, extra_attrs=extra_attrs)

    genomes = raw_matrix.get_genomes()

    # Create barcode summary HDF5 file w/ GEX data for the barcode rank plot
    with h5py.File(outs.barcode_summary_h5, 'w') as f:
        cr_io.create_hdf5_string_dataset(f, cr_constants.H5_BC_SEQUENCE_COL, raw_matrix.bcs)
        gex_bc_counts = raw_matrix.view().select_features_by_type(lib_constants.GENE_EXPRESSION_LIBRARY_TYPE).sum(axis=0).astype('uint64')
        # The dataset name carries the genome only when there is exactly one.
        genome_key = genomes[0] if len(genomes) == 1 else lib_constants.MULTI_REFS_PREFIX
        f.create_dataset('_%s_transcriptome_conf_mapped_deduped_barcoded_reads' % genome_key,
                         data=gex_bc_counts)

    rna_matrix.save_mex(raw_matrix,outs.raw_matrix_mex, version)
    del raw_matrix

    # Merge filtered matrix
    filt_mat = cr_matrix.merge_matrices(args.filtered_matrices_h5)
    filt_mat.save_h5_file(outs.filtered_matrix_h5, extra_attrs=extra_attrs)

    # Summarize the matrix across library types and genomes
    for lib_type in lib_types:
        libtype_prefix = rna_library.get_library_type_metric_prefix(lib_type)

        # Library types without genomes get a single pass with genome None.
        if rna_library.has_genomes(lib_type):
            genomes = filt_mat.get_genomes()
        else:
            genomes = [None]

        mat_lib = filt_mat.view().select_features_by_type(lib_type)

        for genome in genomes:
            if genome is None:
                mat = mat_lib
                genome_idx = None
            else:
                mat = mat_lib.select_features_by_genome(genome)
                genome_idx = barcode_info.genomes.index(genome)

            # Select barcodes passing filter for this (lib_type, genome)
            filtered_bcs = MoleculeCounter.get_filtered_barcodes(barcode_info,
                                                                 library_info,
                                                                 barcode_seqs,
                                                                 genome_idx=genome_idx,
                                                                 library_type=lib_type)
            mat = mat.select_barcodes_by_seq(filtered_bcs)

            median_features = np.median(mat.count_ge(axis=0,
                                                     threshold=cr_constants.MIN_COUNTS_PER_GENE))
            median_counts = np.median(mat.sum(axis=0))
            genome_prefix = genome if genome is not None else lib_constants.MULTI_REFS_PREFIX

            prefixes = (libtype_prefix, genome_prefix)
            # The per-genome fraction is only written for real genomes; the
            # multi-genome key is filled in after this loop.
            if genome is not None:
                flt_reads = summary['%s%s_flt_mapped_reads' % prefixes]
                raw_reads = summary['%s%s_raw_mapped_reads' % prefixes]
                frac_reads_in_cells = tk_stats.robust_divide(flt_reads, raw_reads)

                summary['%s%s_filtered_bcs_conf_mapped_barcoded_reads_cum_frac' % prefixes] = frac_reads_in_cells

            summary.update({
                '%s%s_filtered_bcs_median_counts' % prefixes: median_counts,
                '%s%s_filtered_bcs_median_unique_genes_detected' % prefixes: median_features,
            })

        # Compute frac reads in cells across all genomes
        prefixes = [(libtype_prefix, g) for g in genomes if g is not None]
        if len(prefixes) == 0:
            prefixes = [(libtype_prefix, lib_constants.MULTI_REFS_PREFIX)]
        flt_reads = sum(summary['%s%s_flt_mapped_reads' % p] for p in prefixes)
        raw_reads = sum(summary['%s%s_raw_mapped_reads' % p] for p in prefixes)

        frac_reads_in_cells = tk_stats.robust_divide(flt_reads, raw_reads)
        summary['%s%s_filtered_bcs_conf_mapped_barcoded_reads_cum_frac' % (
            libtype_prefix, lib_constants.MULTI_REFS_PREFIX)] = frac_reads_in_cells

    # Write MEX format (do it last because it converts the matrices to COO)
    rna_matrix.save_mex(filt_mat, outs.filtered_matrix_mex, version)

    with open(outs.summary, 'w') as f:
        json.dump(tk_safe_json.json_sanitize(summary), f, indent=4, sort_keys=True)
def split(args):
    """Plan the subsampling tasks and chunk the molecule info for `main`.

    For every library type, and for both the raw and the mapped subsample
    type, a list of target read-pairs-per-cell depths is turned into
    per-library subsampling rates. The molecule info is then split into
    chunks; every chunk receives the full list of subsampling definitions.

    Args:
        args: Stage arguments. Reads ``molecule_info`` and
            ``filtered_barcodes``.

    Returns:
        dict: ``{'chunks': [...], 'join': {...}}``, or ``{'chunks': []}``
        when no library has any cell-associated barcode.
    """
    # Open the molecule info with a context manager so that it is closed on
    # every exit path, including the early return below.
    with MoleculeCounter.open(args.molecule_info, 'r') as mc:
        # Get required info from the mol info
        genomes = sorted(
            set(f.tags.get('genome', '')
                for f in mc.feature_reference.feature_defs))
        cell_bcs_by_genome = get_cell_associated_barcodes(
            genomes, args.filtered_barcodes)

        # Get cell counts per gem group
        n_cells_per_gg = defaultdict(int)
        for bc in cell_bcs_by_genome['']:
            _, gem_group = cr_utils.split_barcode_seq(bc)
            n_cells_per_gg[gem_group] += 1

        library_info = mc.library_info

        # Assign gem group cell counts to their constituent libraries
        # TODO FIXME: Need to allow for per-library cell counts
        #   because some feature types might only have a subset of the GEX
        #   cell-assoc barcodes.
        n_cells_per_lib = np.zeros(len(library_info), dtype=int)
        for lib_idx, lib in enumerate(library_info):
            n_cells_per_lib[lib_idx] = n_cells_per_gg[lib['gem_group']]

        if n_cells_per_lib.sum() == 0:
            return {'chunks': []}

        raw_count_per_lib = np.array(mc.get_raw_read_pairs_per_library())
        raw_rppc_per_lib = raw_count_per_lib.astype(float) / n_cells_per_lib
        usable_count_per_lib = np.array(
            mc.get_usable_read_pairs_per_library())

        # Fraction of each library's raw read pairs that are usable. Used to
        # infer the usable depth needed to reach a given raw depth; it does
        # not depend on the target, so compute it once.
        usable_read_fracs = \
            usable_count_per_lib.astype(float) / raw_count_per_lib

        # Use deciles of the raw read pairs per cell.
        deciles = np.arange(0.1, 1.1, 0.1)

        subsamplings = []  # track subsample info definitions

        library_types = sorted(set(lib['library_type'] for lib in library_info))
        for library_type in library_types:
            # All libraries w/ this type
            lib_indexes = np.array([
                i for i, lib in enumerate(library_info)
                if lib['library_type'] == library_type
            ])

            # For plotting, we want a series of target depths that exist for
            # all libraries w/ the same library type. When there's a single
            # library per type (the common case), this is trivial - split it
            # into deciles. But if there are multiple libraries with
            # different depths, (e.g., because gem-group-aggregation was used
            # to increase cell numbers), we need to find depths that are
            # achievable for all libraries. For now, let the lowest-depth
            # library for a given type dictate this.
            min_raw_rppc = np.min(raw_rppc_per_lib[lib_indexes])
            plot_targets = [round(depth) for depth in min_raw_rppc * deciles]

            # TODO: separate this work (internal + non)
            # The raw and the mapped subsample types currently share the same
            # list of target depths.
            targets = sorted(set(
                int(depth)
                for depth in
                list(cr_constants.SUBSAMPLE_READS_PER_CELL) + plot_targets))

            for depth_type in (cr_constants.RAW_SUBSAMPLE_TYPE,
                               cr_constants.MAPPED_SUBSAMPLE_TYPE):
                for target_rppc in targets:
                    if depth_type == cr_constants.RAW_SUBSAMPLE_TYPE:
                        # Infer the usable depth required to achieve this
                        # raw depth
                        target_usable_counts = \
                            target_rppc * n_cells_per_lib * usable_read_fracs
                    else:
                        target_usable_counts = target_rppc * n_cells_per_lib

                    # Zero out libraries of the other types
                    rates = np.zeros(len(library_info), dtype=float)
                    rates[lib_indexes] = \
                        target_usable_counts[lib_indexes].astype(float) \
                        / usable_count_per_lib[lib_indexes]

                    # Clamp rates that are close to 1 to 1
                    rates[np.absolute(rates - 1) < 1e-3] = 1

                    # Zero out the libraries for which we have fewer reads
                    # than the target
                    rates[rates > 1] = 0.0

                    enough_data = np.any((rates > 0) & (rates <= 1))
                    if not enough_data:
                        rates = np.zeros(len(rates))

                    subsamplings.append({
                        'library_type': library_type,
                        'subsample_type': depth_type,
                        'target_read_pairs_per_cell': int(target_rppc),
                        'library_subsample_rates': [float(r) for r in rates],
                    })

        # Each chunk needs to store a piece of the mol info h5
        tgt_chunk_len = cr_constants.NUM_MOLECULE_INFO_ENTRIES_PER_CHUNK

        # Split the molecule info h5 into equi-RAM chunks
        chunks = []
        for chunk_start, chunk_len in mc.get_chunks(tgt_chunk_len,
                                                    preserve_boundaries=True):
            chunks.append({
                'chunk_start': chunk_start,
                'chunk_len': chunk_len,
                'subsample_info': subsamplings,
                # estimate_mem_gb only counts the memory usage of the
                # MoleculeCounter object, which under-estimates the actual
                # memory usage. Based on memory profiling with test case
                # fuzzer_114, actual memory usage is ~4x more than
                # estimate_mem_gb (without cap), here set scale = 6.
                '__mem_gb': MoleculeCounter.estimate_mem_gb(chunk_len, scale=6),
            })

    join = {
        '__mem_gb': 6,
    }

    # TODO: is this really necessary w/ martian 3
    if len(chunks) == 0:
        chunks.append({
            'chunk_start': str(0),
            'chunk_len': str(0),
            'subsample_info': [],
        })

    return {'chunks': chunks, 'join': join}
def join(args, outs, chunk_defs, chunk_outs):
    """Pass through the chunk matrices and write the normalization summary.

    Collects the per-chunk matrix paths and nonzero counts onto ``outs``,
    sums the per-chunk read summaries, and reports per-library and
    per-library-type normalization metrics in the JSON file at
    ``outs.summary``.

    Args:
        args: Stage arguments. Reads ``molecules``.
        outs: Stage outputs. Sets ``raw_matrices_h5``, ``raw_nnz``,
            ``filtered_matrices_h5`` and ``filtered_nnz``; writes to the
            path ``summary``.
        chunk_defs: Chunk definitions. ``frac_reads_kept`` and ``num_cells``
            are read from the first chunk only.
        chunk_outs: Chunk outputs. Reads ``raw_matrix_h5``, ``raw_nnz``,
            ``filtered_matrix_h5``, ``filtered_nnz`` and ``chunk_summary``.
    """
    # Pass through the matrix chunks and nnz counts
    outs.raw_matrices_h5 = [o.raw_matrix_h5 for o in chunk_outs]
    outs.raw_nnz = sum(o.raw_nnz for o in chunk_outs)
    outs.filtered_matrices_h5 = [o.filtered_matrix_h5 for o in chunk_outs]
    # The attribute name must mirror `raw_nnz`; it was previously misspelled
    # `filted_nnz`, which left `filtered_nnz` unset.
    outs.filtered_nnz = sum(o.filtered_nnz for o in chunk_outs)

    with MoleculeCounter.open(args.molecules, 'r') as mc:
        library_info = mc.get_library_info()

    lib_types = sorted(set(lib['library_type'] for lib in library_info))

    summary = {
        'frac_reads_kept': chunk_defs[0].frac_reads_kept,
        'num_cells_by_library': chunk_defs[0].num_cells,
    }

    # Merge read summary metrics
    read_summary = defaultdict(int)
    for filename in [co.chunk_summary for co in chunk_outs]:
        with open(filename) as f:
            chunk_read_summary = json.load(f)['read_summary']
        for key in chunk_read_summary:
            read_summary[key] += chunk_read_summary[key]
    summary.update(read_summary)

    # Get summary metrics (taken from the first chunk's summary)
    with open(chunk_outs[0].chunk_summary) as f:
        mol_metrics = json.load(f)['mol_metrics']
    for key in [k for k in mol_metrics if k.startswith('chemistry')]:
        summary[key] = mol_metrics[key]
    print(json.dumps(mol_metrics, indent=4, sort_keys=True))

    # Report normalization metrics

    # Ordered set of batch names (only the keys are used)
    all_batches = OrderedDict()

    # These are all per-library-type
    min_frac_reads_kept = np.ones(len(lib_types), dtype='float')
    total_raw_read_pairs = np.zeros(len(lib_types), dtype='uint64')
    total_ds_raw_read_pairs = np.zeros(len(lib_types), dtype='uint64')
    total_cells = np.zeros(len(lib_types), dtype='uint64')

    for lib_type_idx, lib_type in enumerate(lib_types):
        lib_inds = [
            i for i, lib in enumerate(library_info)
            if lib['library_type'] == lib_type
        ]
        for lib_idx in lib_inds:
            aggr_id = library_info[lib_idx]['aggr_id']
            old_gg = library_info[lib_idx]['old_gem_group']
            # The gem group suffix is only added for old gem groups > 1
            batch = aggr_id + ('-%d' % old_gg if old_gg > 1 else '')
            all_batches[batch] = None

            n_cells = summary['num_cells_by_library'][lib_idx]
            total_cells[lib_type_idx] += n_cells

            lib_metrics = \
                mol_metrics[cr_mol_counter.LIBRARIES_METRIC][str(lib_idx)]
            raw_read_pairs = lib_metrics[cr_mol_counter.TOTAL_READS_METRIC]
            mapped_read_pairs = lib_metrics[cr_mol_counter.USABLE_READS_METRIC]
            ds_read_pairs = \
                lib_metrics[cr_mol_counter.DOWNSAMPLED_READS_METRIC]

            total_raw_read_pairs[lib_type_idx] += raw_read_pairs
            total_ds_raw_read_pairs[lib_type_idx] += ds_read_pairs

            frac_reads_kept = summary['frac_reads_kept'][lib_idx]
            min_frac_reads_kept[lib_type_idx] = min(
                min_frac_reads_kept[lib_type_idx], frac_reads_kept)

            pre_norm_raw_rppc = tk_stats.robust_divide(raw_read_pairs, n_cells)
            pre_norm_mapped_rppc = tk_stats.robust_divide(
                mapped_read_pairs, n_cells)

            # Prefix with batch and library type
            if lib_type.lower().startswith(
                    rna_library.CUSTOM_LIBRARY_TYPE_PREFIX.lower()):
                lib_prefix = rna_library.CUSTOM_LIBRARY_TYPE_PREFIX + '_'
            else:
                lib_prefix = rna_library.get_library_type_metric_prefix(
                    lib_type)

            p = (batch, lib_prefix)
            summary.update({
                '%s_%sfrac_reads_kept' % p:
                frac_reads_kept,
                '%s_%spre_normalization_raw_reads_per_filtered_bc' % p:
                pre_norm_raw_rppc,
                '%s_%spre_normalization_cmb_reads_per_filtered_bc' % p:
                pre_norm_mapped_rppc,
            })

    summary['batches'] = list(all_batches.keys())

    for lib_type_idx, lib_type in enumerate(lib_types):
        mean_rppc = tk_stats.robust_divide(total_raw_read_pairs[lib_type_idx],
                                           total_cells[lib_type_idx])
        ds_mean_rppc = tk_stats.robust_divide(
            total_ds_raw_read_pairs[lib_type_idx], total_cells[lib_type_idx])

        p = rna_library.get_library_type_metric_prefix(lib_type)
        summary.update({
            '%spre_normalization_total_reads' % p:
            total_raw_read_pairs[lib_type_idx],
            '%spost_normalization_total_reads' % p:
            total_ds_raw_read_pairs[lib_type_idx],
            '%sfiltered_bcs_transcriptome_union' % p:
            total_cells[lib_type_idx],
            '%spre_normalization_multi_transcriptome_total_raw_reads_per_filtered_bc' % p:
            mean_rppc,
            '%spost_normalization_multi_transcriptome_total_raw_reads_per_filtered_bc' % p:
            ds_mean_rppc,
            '%slowest_frac_reads_kept' % p:
            min_frac_reads_kept[lib_type_idx],
        })

    with open(outs.summary, 'w') as f:
        json.dump(tk_safe_json.json_sanitize(summary),
                  f, indent=4, sort_keys=True)
def split(args):
    """Compute per-library downsampling fractions and chunk the molecule info.

    Unless ``args.normalization_mode`` is ``cr_constants.NORM_MODE_NONE``,
    every library is downsampled to the lowest usable read pairs per cell
    among the libraries sharing its library type. The molecule info is then
    split into chunks, each with a memory request covering the molecule info
    slice plus a worst-case chunk matrix.

    Args:
        args: Stage arguments. Reads ``normalization_mode`` and
            ``molecules``.

    Returns:
        dict: ``{'chunks': [...], 'join': {...}}``. Every chunk carries
        ``frac_reads_kept``, ``num_cells``, ``chunk_start``, ``chunk_len``
        and ``__mem_gb``.
    """
    # default to downsampling by mapped reads
    downsample = args.normalization_mode != cr_constants.NORM_MODE_NONE

    # compute downsample rates for each library
    with MoleculeCounter.open(args.molecules, 'r') as mc:
        library_info = mc.get_library_info()
        usable_reads = mc.get_usable_read_pairs_per_library()
        cells = np.array([
            mc.get_num_filtered_barcodes_for_library(lib_idx)
            for lib_idx in range(len(library_info))
        ])

    n_libs = len(library_info)

    print("Libraries: %s" % library_info)
    print("Usable reads: %s" % usable_reads)
    print("Cells: %s" % cells)

    # Libraries without any cells get a depth of 0.0
    usable_rpc = np.zeros(n_libs, dtype=float)
    for i in range(n_libs):
        if cells[i] > 0:
            usable_rpc[i] = tk_stats.robust_divide(usable_reads[i], cells[i])
        else:
            usable_rpc[i] = 0.0

    # Determine lowest depth for each library type
    rpcs_by_lib_type = defaultdict(list)
    for lib, rpc in zip(library_info, usable_rpc):
        rpcs_by_lib_type[lib['library_type']].append(rpc)
    min_rpc_by_lt = {
        lib_type: min(rpcs) for lib_type, rpcs in rpcs_by_lib_type.items()
    }

    for lib_idx, lib in enumerate(library_info):
        lib_type = lib['library_type']
        print("%s Usable read pairs per cell: %s" % (lib_type,
                                                     usable_rpc[lib_idx]))
        print("%s Minimum read pairs usable per cell: %d" % (
            lib_type, min_rpc_by_lt[lib_type]))

    if downsample:
        # Entries stay at 0 when the lowest depth for the library type is 0.
        frac_reads_kept = np.zeros(n_libs, dtype=float)
        for i, lib in enumerate(library_info):
            min_rpc = min_rpc_by_lt[lib['library_type']]
            if min_rpc != 0:
                frac_reads_kept[i] = tk_stats.robust_divide(
                    min_rpc, usable_rpc[i])
    else:
        frac_reads_kept = np.ones(n_libs, dtype=float)

    # Split the molecule info h5 into equi-RAM chunks, preserving
    # (barcode, gem_group) boundaries
    # Assumes the molecule_info is sorted by (gem_group, barcode)
    tgt_chunk_len = cr_constants.NUM_MOLECULE_INFO_ENTRIES_PER_CHUNK

    # For memory request calculation
    num_gem_groups = len({lib['gem_group'] for lib in library_info})

    chunks = []
    with MoleculeCounter.open(args.molecules, 'r') as mc:
        # Number of barcodes in the full matrix
        num_barcodes = mc.get_ref_column_lazy('barcodes').shape[0]

        for chunk_start, chunk_len in mc.get_chunks(tgt_chunk_len,
                                                    preserve_boundaries=True):
            mol_mem_gb = MoleculeCounter.estimate_mem_gb(
                chunk_len, scale=2.0, cap=False)
            print('molecule_info mem_gb = %d' % mol_mem_gb)

            # Worst case number of nonzero elements in chunk matrix
            num_nonzero = chunk_len
            matrix_mem_gb = CountMatrix.get_mem_gb_from_matrix_dim(
                num_barcodes * num_gem_groups, num_nonzero)
            print('matrix mem_gb = %d' % matrix_mem_gb)

            chunk = {
                'frac_reads_kept': list(frac_reads_kept),
                'num_cells': list(cells),
                'chunk_start': chunk_start,
                'chunk_len': chunk_len,
                # Request enough for two copies
                '__mem_gb': max(h5_constants.MIN_MEM_GB,
                                matrix_mem_gb + mol_mem_gb),
            }
            chunks.append(chunk)

    # Join is not loading the merged matrix, so it doesn't need much memory.
    # WRITE_MATRICES will use the precise nnz counts to make an appropriate
    # mem request.
    join_def = {'__mem_gb': 3, '__threads': 2}
    return {'chunks': chunks, 'join': join_def}
def main(args, outs):
    """Rewrite one molecule info file into the aggregated index space.

    Copies every column of ``args.molecule_h5`` to ``outs.molecule_h5``,
    remapping gem groups and library indices through the maps returned by
    ``get_library_mapping`` and shifting barcode indices by
    ``args.barcode_idx_offset``. The barcode info and the per-gem-group and
    per-library metrics are remapped the same way.

    Args:
        args: Stage arguments. Reads ``molecule_h5``, ``aggr_id``,
            ``libraries``, ``barcode_idx_offset`` and ``merged_barcodes``.
            ``get_library_mapping`` adds an ``'index'`` key to every entry
            of ``args.libraries`` before they are written as the output
            library info.
        outs: Stage outputs. Writes the file ``molecule_h5`` and sets
            ``new_gem_groups`` to the remapped gem groups that occur in the
            ``gem_group`` column.
    """
    with MoleculeCounter.open(args.molecule_h5, 'r') as in_mc:
        # Get the gem group and library mappings
        gg_map, lib_idx_map = get_library_mapping(args.aggr_id, args.libraries)

        # load merged barcode whitelists
        bc_idx_offset = args.barcode_idx_offset
        # Pickles must be read in binary mode to be portable across
        # platforms and pickle protocols.
        # NOTE(review): unpickling runs arbitrary code; this assumes
        # merged_barcodes is produced by this pipeline -- confirm.
        with open(args.merged_barcodes, 'rb') as fp:
            merged_barcodes = cPickle.load(fp)

        # FIXME: Handle heterogeneous feature references
        merged_feature_ref = in_mc.get_feature_ref()

        # Remap the barcode info. Column 0 of pass_filter is shifted like a
        # barcode index and column 1 is mapped like a library index. The
        # array from old_barcode_info is modified in place.
        old_barcode_info = in_mc.get_barcode_info()
        new_pass_filter = old_barcode_info.pass_filter
        new_pass_filter[:, 0] = new_pass_filter[:, 0] + bc_idx_offset
        new_pass_filter[:, 1] = lib_idx_map[new_pass_filter[:, 1]]

        new_barcode_info = cr_mol_counter.BarcodeInfo(
            pass_filter=new_pass_filter,
            genomes=old_barcode_info.genomes,
        )

        with MoleculeCounter.open(
                outs.molecule_h5,
                'w',
                feature_ref=merged_feature_ref,
                barcodes=merged_barcodes,
                library_info=args.libraries,
                barcode_info=new_barcode_info,
        ) as out_mc:

            # Copy the datasets, rewriting the ones we remap
            for col, ds in in_mc.columns.items():
                if col == 'gem_group':
                    new_gg = gg_map[ds[:]]
                    out_mc.append_column(col, new_gg)
                    # Gem groups that occur at least once after remapping
                    outs.new_gem_groups = np.flatnonzero(
                        np.bincount(new_gg)).tolist()
                elif col == 'library_idx':
                    out_mc.append_column(col, lib_idx_map[ds[:]])
                elif col == 'barcode_idx':
                    out_mc.append_column(col, ds[:] + bc_idx_offset)
                else:
                    out_mc.append_column(col, ds[:])

            # Copy over all standard metrics
            out_metrics = in_mc.get_all_metrics()

            # Remap the per-gem-group and per-library metrics; their keys
            # are the stringified old gem group / library index.
            old_gg_metrics = in_mc.get_metric(cr_mol_counter.GEM_GROUPS_METRIC)
            gg_metrics = {
                str(gg_map[int(og)]): m
                for og, m in old_gg_metrics.items()
            }
            old_lib_metrics = in_mc.get_metric(cr_mol_counter.LIBRARIES_METRIC)
            lib_metrics = {
                str(lib_idx_map[int(ol)]): m
                for ol, m in old_lib_metrics.items()
            }

            out_metrics[cr_mol_counter.GEM_GROUPS_METRIC] = gg_metrics
            out_metrics[cr_mol_counter.LIBRARIES_METRIC] = lib_metrics

            out_mc.set_all_metrics(out_metrics)
def main(args, outs):
    """Run every subsampling task on one chunk of the molecule info.

    For each entry of ``args.subsample_info`` the read pairs of every
    molecule in the chunk are binomially subsampled at the rate of the
    molecule's library. UMIs and detected features are tallied per
    (task, genome, cell-associated barcode), and read pairs and UMIs per
    (task, genome). The tallies are pickled to ``outs.metrics``.

    Args:
        args: Stage arguments. Reads ``molecule_info``,
            ``filtered_barcodes``, ``chunk_start``, ``chunk_len`` and
            ``subsample_info``.
        outs: Stage outputs. Writes a pickled dict to the path ``metrics``
            with the keys ``umis_per_bc``, ``features_det_per_bc``,
            ``read_pairs``, ``umis`` and ``lib_type_genome_any_reads``.
    """
    np.random.seed(0)

    # The context manager closes the molecule info once the chunk has been
    # processed; previously the handle was never closed.
    with MoleculeCounter.open(args.molecule_info, 'r') as mc:
        # Get cell-associated barcodes
        genomes = sorted(
            set(f.tags.get('genome', '')
                for f in mc.feature_reference.feature_defs))
        cell_bcs_by_genome = get_cell_associated_barcodes(
            genomes, args.filtered_barcodes)

        # Load chunk of relevant data from the mol_info
        chunk = slice(int(args.chunk_start),
                      int(args.chunk_start) + int(args.chunk_len))
        mol_library_idx = mc.get_column_lazy('library_idx')[chunk]
        mol_read_pairs = mc.get_column_lazy('count')[chunk]
        mol_gem_group = mc.get_column_lazy('gem_group')[chunk]
        mol_barcode_idx = mc.get_column_lazy('barcode_idx')[chunk]
        mol_feature_idx = mc.get_column_lazy('feature_idx')[chunk]

        barcodes = mc.get_ref_column('barcodes')

        # Give each cell-associated barcode an integer index
        cell_bcs = sorted(cell_bcs_by_genome[''])
        cell_bc_to_int = {bc: i for i, bc in enumerate(cell_bcs)}

        # Give each genome an integer index
        genome_to_int = {g: i for i, g in enumerate(genomes)}
        feature_int_to_genome_int = np.fromiter(
            (genome_to_int[f.tags.get('genome', '')]
             for f in mc.feature_reference.feature_defs),
            dtype=int)
        mol_genome_idx = feature_int_to_genome_int[mol_feature_idx]

        # determine which (library type, genome) pairs have any associated
        # reads
        lib_types = sorted(set(lib['library_type'] for lib in mc.library_info))
        lib_type_to_int = {l: i for i, l in enumerate(lib_types)}
        lib_idx_to_lib_type_idx = np.fromiter(
            (lib_type_to_int[lib['library_type']] for lib in mc.library_info),
            dtype=int)

        lib_type_genome_any_reads = np.zeros((len(lib_types), len(genomes)),
                                             dtype=bool)
        # Mark every (library type, genome) pair seen on a molecule with at
        # least one read pair, using a single indexed assignment.
        has_reads = mol_read_pairs > 0
        lib_type_genome_any_reads[
            lib_idx_to_lib_type_idx[mol_library_idx[has_reads]],
            mol_genome_idx[has_reads]] = True

        # Run each subsampling task on this chunk of data
        n_tasks = len(args.subsample_info)
        n_genomes = len(genomes)
        n_cells = len(cell_bcs)

        umis_per_bc = np.zeros((n_tasks, n_genomes, n_cells))
        features_det_per_bc = np.zeros((n_tasks, n_genomes, n_cells))
        read_pairs_per_task = np.zeros((n_tasks, n_genomes))
        umis_per_task = np.zeros((n_tasks, n_genomes))

        for task_idx, task in enumerate(args.subsample_info):
            # Per-library subsampling rates
            rates_per_library = np.array(task['library_subsample_rates'],
                                         dtype=float)

            # Tasks whose rates are all zero leave their tallies at zero
            if np.count_nonzero(rates_per_library) == 0:
                continue

            mol_rate = rates_per_library[mol_library_idx]

            # Subsampled read pairs per molecule
            new_read_pairs = np.random.binomial(mol_read_pairs, mol_rate)

            # Compute tallies for each barcode
            group_keys = (mol_gem_group, mol_barcode_idx)
            group_values = (mol_feature_idx, mol_genome_idx, new_read_pairs)
            for (gg, bc_idx), (feature_idx, genome_idx, read_pairs) in \
                    cr_utils.numpy_groupby(group_values, group_keys):

                barcode = cr_utils.format_barcode_seq(barcodes[bc_idx], gg)

                # NOTE(review): cell_idx is None for barcodes missing from
                # cell_bcs_by_genome['']; this assumes every per-genome cell
                # barcode is also in that set -- confirm.
                cell_idx = cell_bc_to_int.get(barcode)

                # Molecules that kept at least one read pair after
                # subsampling
                kept = read_pairs > 0

                for this_genome_idx, genome in enumerate(genomes):
                    in_genome = genome_idx == this_genome_idx
                    umis = np.flatnonzero(kept & in_genome)
                    this_genome_read_pairs = np.sum(read_pairs[in_genome])

                    # Tally UMIs and median features detected
                    if barcode in cell_bcs_by_genome[genome]:
                        # This is a cell-associated barcode for this genome
                        umis_per_bc[task_idx, this_genome_idx, cell_idx] = \
                            len(umis)
                        features_det_per_bc[task_idx, this_genome_idx,
                                            cell_idx] = np.count_nonzero(
                                                np.bincount(feature_idx[umis]))

                    # Tally numbers for duplicate fraction
                    read_pairs_per_task[task_idx, this_genome_idx] += \
                        this_genome_read_pairs
                    umis_per_task[task_idx, this_genome_idx] += len(umis)

    # HIGHEST_PROTOCOL is a binary pickle protocol, so the file has to be
    # opened in binary mode.
    with open(outs.metrics, 'wb') as f:
        data = {
            'umis_per_bc': umis_per_bc,
            'features_det_per_bc': features_det_per_bc,
            'read_pairs': read_pairs_per_task,
            'umis': umis_per_task,
            'lib_type_genome_any_reads': lib_type_genome_any_reads,
        }
        cPickle.dump(data, f, protocol=cPickle.HIGHEST_PROTOCOL)