def call_cell_barcodes(umi_info_path, gem_group):
    """ Call cell barcodes by UMI support.

    Args:
      umi_info_path (str) - path to umi info h5
      gem_group (int) - gem group
    Returns:
      (bc_support, cell_bcs, rt, ut, confidence) where
      bc_support = dict of { barcode: umi_count },
      cell_bcs = list(str) of cell barcodes,
      rt = read-pairs-per-umi threshold used,
      ut = umi threshold used,
      confidence = confidence value reported by the cell caller
    """

    # Get umi info for this gem group only
    bc_idx = vdj_umi_info.get_column(umi_info_path, 'barcode_idx')
    bc_str = vdj_umi_info.get_column(umi_info_path, 'barcodes')

    bc_gg = np.array([int(cr_utils.split_barcode_seq(bc)[1]) for bc in bc_str])
    bc_in_gg = bc_gg == gem_group
    umi_in_gg = bc_in_gg[bc_idx]

    umi_read_pairs = vdj_umi_info.get_column(umi_info_path, 'reads')

    rpu_threshold, umi_threshold, bc_support, confidence = vdj_stats.call_vdj_cells(
        umi_barcode_idx=bc_idx[umi_in_gg],
        umi_read_pairs=umi_read_pairs[umi_in_gg],
        barcodes=bc_str,
        rpu_mix_init_sd=RPU_MIX_INIT_SD,
        umi_mix_init_sd=UMI_MIX_INIT_SD,
        verbosity=1,
    )

    cell_bcs = [bc for bc, umis in bc_support.iteritems()
                if umis >= umi_threshold]

    return bc_support, cell_bcs, rpu_threshold, umi_threshold, confidence
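# Illustrative usage sketch (the h5 path is hypothetical; in the pipeline this
# is invoked once per gem-group chunk):
#
#   bc_support, cell_bcs, rpu_thresh, umi_thresh, conf = \
#       call_cell_barcodes('umi_info.h5', gem_group=1)
#   # Barcodes with bc_support[bc] >= umi_thresh are the ones returned in cell_bcs.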
def main(args, outs):
    np.random.seed(0)

    unique_gem_groups = np.unique(args.gem_groups).tolist()
    reporter = vdj_report.VdjReporter(gem_groups=unique_gem_groups)

    # Load the umi info
    umi_info = vdj_umi_info.read_umi_info(args.umi_info, args.start_row, args.end_row)
    chains = umi_info['chains']
    barcodes = umi_info['barcodes']
    bc_gg = [str(cr_utils.split_barcode_seq(bc)[1]) for bc in barcodes]

    # Compute N50 read pairs per UMI for this gem group
    umi_read_pairs = []
    total_read_pairs = {}
    chain_bad_read_pairs = {}

    for bc_idx, data_iter in itertools.groupby(itertools.izip(umi_info['barcode_idx'],
                                                              umi_info['umi_idx'],
                                                              umi_info['chain_idx'],
                                                              umi_info['reads']),
                                               key=lambda x: x[0]):
        bc_umi_read_pairs = {}
        for _, umi, chain_idx, reads in data_iter:
            bc_umi_read_pairs[umi] = bc_umi_read_pairs.get(umi, 0) + reads

            chain = chains[chain_idx]
            total_read_pairs[chain] = total_read_pairs.get(chain, 0) + reads
            total_read_pairs[cr_constants.MULTI_REFS_PREFIX] = \
                total_read_pairs.get(cr_constants.MULTI_REFS_PREFIX, 0) + reads

            if reads < args.min_readpairs_per_umi[bc_gg[bc_idx]]:
                chain_bad_read_pairs[chain] = chain_bad_read_pairs.get(chain, 0) + reads
                chain_bad_read_pairs[cr_constants.MULTI_REFS_PREFIX] = \
                    chain_bad_read_pairs.get(cr_constants.MULTI_REFS_PREFIX, 0) + reads

        for r in bc_umi_read_pairs.itervalues():
            umi_read_pairs.append(r)

    rppu_n50 = tk_stats.NX(umi_read_pairs, 0.5)
    if rppu_n50 is None:
        rppu_n50 = float('NaN')

    # Report bad read-pairs/umi
    for chain in reporter.vdj_genes:
        bad_count = chain_bad_read_pairs.get(chain, 0)
        total_count = total_read_pairs.get(chain, 0)
        reporter._get_metric_attr('vdj_recombinome_low_support_reads_frac',
                                  chain).set_value(bad_count, total_count)

    reporter._get_metric_attr('vdj_recombinome_readpairs_per_umi_n50',
                              cr_constants.MULTI_REFS_PREFIX,
                              args.gem_group).set_value(rppu_n50)

    reporter.save(outs.chunked_reporter)
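# Note: tk_stats.NX(values, 0.5) is the N50 statistic, i.e. the largest value
# x such that the elements >= x account for at least half of the total. E.g.
# for read-pair counts [1, 1, 2, 4] (total 8), the N50 is 4.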
def get_compressed_bc_iter(barcodes):
    """ Yields compressed barcode tuples that can be compared against
        a MoleculeCounter's data. Useful for filtering a MoleculeCounter by barcode.
    Args: barcodes (iterable) - list of barcode strings (e.g., ACGT-1)
    Yields: (compressed_bc, compressed_gem_group) tuples """
    for barcode in barcodes:
        barcode_seq, gem_group = cr_utils.split_barcode_seq(barcode)
        compressed_bc = MoleculeCounter.compress_barcode_seq(barcode_seq)
        compressed_gg = MoleculeCounter.compress_gem_group(gem_group)
        yield compressed_bc, compressed_gg
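# Sketch of the filtering use case from the docstring (illustrative only):
#
#   keep = set(get_compressed_bc_iter(['ACGT-1', 'TTTT-2']))
#   # A molecule-info row is then retained iff its (compressed barcode,
#   # compressed gem group) pair is in `keep`.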
def join(args, outs, chunk_defs, chunk_outs):
    chunks = zip(chunk_defs, chunk_outs)
    chunks.sort(key=lambda chunk: cr_utils.split_barcode_seq(chunk[0].prefix)[::-1])

    buckets = []
    outs.total_reads = 0

    for chunk in chunks:
        buckets.append(chunk[1].default)
        outs.total_reads += chunk[1].total_reads

    tk_bam.concatenate(outs.default, buckets)
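# The [::-1] in the sort key reverses (seq, gem_group) into (gem_group, seq),
# so buckets are concatenated in gem-group-major order: e.g. prefix 'TT-1'
# sorts before 'AA-2'.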
def main(args, outs):
    # Martian coerces dict keys to string
    # Coerce keys back to int
    args.chunks_per_gem_group = {int(k): v
                                 for k, v in args.chunks_per_gem_group.iteritems()}

    with open(args.read1s_chunk) as f1:
        read1s = [read for read in tk_fasta.read_generator_fastq(f1)]

    with open(args.read2s_chunk) as f2:
        read2s = [read for read in tk_fasta.read_generator_fastq(f2)]

    assert len(read1s) == len(read2s)

    fastqs_out = {}
    buckets = {}

    outs.buckets = {}

    for gem_group, bucket_name in enumerate_bucket_names(args.chunks_per_gem_group):
        filename = martian.make_path("%s.fastq" % bucket_name)
        fastqs_out[bucket_name] = open(filename, 'w')
        outs.buckets[bucket_name] = filename
        buckets[bucket_name] = []

    for read1, read2 in itertools.izip(read1s, read2s):
        barcode = vdj_utils.get_fastq_read_barcode(read1)

        # Exclude unbarcoded reads
        if barcode is None:
            continue

        assert barcode == vdj_utils.get_fastq_read_barcode(read2)

        barcode_seq, gem_group = cr_utils.split_barcode_seq(barcode)
        bucket_name = get_bucket_name(gem_group, barcode_seq,
                                      args.chunks_per_gem_group[gem_group])

        buckets[bucket_name].append(read1)
        buckets[bucket_name].append(read2)

    # Sort and write each bucket
    for bucket_name, bucket in buckets.iteritems():
        bucket.sort(key=vdj_utils.fastq_barcode_sort_key)

        fastq_out = fastqs_out[bucket_name]
        for read in bucket:
            tk_fasta.write_read_fastq(fastq_out, *read)

        fastq_out.close()
def main(args, outs):
    bam_in = tk_bam.create_bam_infile(args.chunk_input)

    # Get gem groups
    library_info = rna_library.get_bam_library_info(bam_in)
    gem_groups = sorted(list(set(lib['gem_group'] for lib in library_info)))

    # Define buckets
    bucket_names = []
    prefixes = cr_utils.get_seqs(args.nbases)
    for gg in gem_groups:
        for prefix in prefixes:
            bucket_names.append('%s-%d' % (prefix, gg))
    bucket_names.append('')

    # Read all records
    reads = [read for read in bam_in]

    # Bucket the records
    bams_out = {}
    outs.buckets = {}
    buckets = {}
    for bucket_name in bucket_names:
        filename = martian.make_path("bc-%s.bam" % bucket_name)
        bam_out, _ = tk_bam.create_bam_outfile(filename, None, None,
                                               template=bam_in,
                                               rgs=args.read_groups,
                                               replace_rg=True)
        bams_out[bucket_name] = bam_out
        outs.buckets[bucket_name] = filename
        buckets[bucket_name] = []

    for r in reads:
        barcode = cr_utils.get_read_barcode(r)
        if barcode is None:
            bucket_name = ''
        else:
            barcode_seq, gem_group = cr_utils.split_barcode_seq(barcode)
            prefix = barcode_seq[:args.nbases]
            bucket_name = '%s-%d' % (prefix, gem_group)
        buckets[bucket_name].append(r)

    for bucket_name, bucket in buckets.iteritems():
        bucket.sort(key=cr_utils.barcode_sort_key)
        bam_out = bams_out[bucket_name]
        for r in bucket:
            bam_out.write(r)
        bam_out.close()
def build_barcode_info(filtered_barcodes_by_genome, library_info, barcodes):
    """Generate numpy arrays for per-barcode info
    Args:
      filtered_barcodes_by_genome (dict of str:list(str)): Keys are genomes,
        values are lists of filtered barcode strings.
      library_info (list of dict): Per-library metadata.
      barcodes (list of str): All barcode sequences (e.g., ['ACGT', ...]).
    Returns:
      BarcodeInfo object
    """
    # Replace a genome string with its lexicographical rank
    genome_to_idx = {g: i for i, g in
                     enumerate(sorted(filtered_barcodes_by_genome.keys()))}

    libraries_for_gem_group = defaultdict(list)
    for lib_idx, lib in enumerate(library_info):
        libraries_for_gem_group[lib['gem_group']].append(lib_idx)

    # Map a barcode sequence to its index into the MoleculeCounter 'barcodes' array
    bc_seq_to_idx = {bc: i for i, bc in enumerate(barcodes)}

    # Populate the "pass filter" array of tuples
    pf_tuples = []
    for genome, bcs in filtered_barcodes_by_genome.iteritems():
        genome_idx = genome_to_idx[genome]
        for bc_str in bcs:
            seq, gg = cr_utils.split_barcode_seq(bc_str)
            barcode_idx = bc_seq_to_idx[seq]

            # FIXME: Assumes no per-library filtering, just per-gem-group
            library_inds = libraries_for_gem_group[gg]
            for library_idx in library_inds:
                pf_tuples.append((barcode_idx, library_idx, genome_idx))

    if len(pf_tuples) > 0:
        pass_filter = np.array(pf_tuples, dtype=BARCODE_INFO_DTYPES['pass_filter'])
    else:
        pass_filter = np.zeros((0, 3), dtype=BARCODE_INFO_DTYPES['pass_filter'])

    assert pass_filter.shape[0] == len(pf_tuples)
    assert pass_filter.shape[1] == 3

    # Sort by barcode index
    pass_filter = pass_filter[np.argsort(pass_filter[:, 0]), :]

    return BarcodeInfo(
        pass_filter,
        genomes=sorted(filtered_barcodes_by_genome.keys()),
    )
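# Minimal illustrative call (all values hypothetical), assuming one
# gene-expression library in gem group 1:
#
#   library_info = [{'gem_group': 1, 'library_type': 'Gene Expression'}]
#   info = build_barcode_info({'GRCh38': ['ACGT-1']}, library_info,
#                             ['ACGT', 'TTTT'])
#   # info.pass_filter then has a single row:
#   # (barcode_idx=0, library_idx=0, genome_idx=0)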
def call_cell_barcodes(umi_info_path, gem_group):
    """ Call cell barcodes by UMI support.

    Args:
      umi_info_path (str) - path to umi info h5
      gem_group (int) - gem group
    Returns:
      (bc_support, cell_bcs, rt, ut, confidence) where
      bc_support = dict of { barcode: umi_count },
      cell_bcs = list(str) of cell barcodes,
      rt = read-pairs-per-umi threshold used,
      ut = umi threshold used,
      confidence = confidence value reported by the cell caller
    """

    # Get umi info for this gem group only
    bc_str = vdj_umi_info.get_column(umi_info_path, 'barcodes')
    bc_gg = np.array([int(cr_utils.split_barcode_seq(bc)[1]) for bc in bc_str])
    bc_in_gg = bc_gg == gem_group

    umi_info = vdj_umi_info.read_umi_info(umi_info_path)

    umi_barcode_idx = []
    umi_read_pairs = []

    for bc_idx, data_iter in itertools.groupby(itertools.izip(umi_info['barcode_idx'],
                                                              umi_info['umi_idx'],
                                                              umi_info['reads']),
                                               key=lambda x: x[0]):
        if not bc_in_gg[bc_idx]:
            continue

        bc_umi_read_pairs = {}
        for _, umi, reads in data_iter:
            bc_umi_read_pairs[umi] = bc_umi_read_pairs.get(umi, 0) + reads

        for r in bc_umi_read_pairs.itervalues():
            umi_barcode_idx.append(bc_idx)
            umi_read_pairs.append(r)

    rpu_threshold, umi_threshold, bc_support, confidence = vdj_stats.call_vdj_cells(
        umi_barcode_idx=np.array(umi_barcode_idx,
                                 dtype=vdj_umi_info.get_dtype('barcode_idx')),
        umi_read_pairs=np.array(umi_read_pairs,
                                dtype=vdj_umi_info.get_dtype('reads')),
        barcodes=bc_str,
        rpu_mix_init_sd=RPU_MIX_INIT_SD,
        umi_mix_init_sd=UMI_MIX_INIT_SD,
        verbosity=1,
    )

    cell_bcs = [bc for bc, umis in bc_support.iteritems()
                if umis >= umi_threshold]

    return bc_support, cell_bcs, rpu_threshold, umi_threshold, confidence
def write_filtered_molecules(ctr_in, ctr_out, genomes, bcs_per_genome):
    ctr_out.set_all_metrics(ctr_in.get_all_metrics())

    filtered_bc_tuples = set()
    genome_ids = ctr_in.get_column('genome')
    genome_index = cr_reference.get_genome_index(genomes)

    for (genome, formatted_bcs) in bcs_per_genome.iteritems():
        genome_id = cr_reference.get_genome_id(genome, genome_index)
        for formatted_bc in formatted_bcs:
            (bc, gg) = cr_utils.split_barcode_seq(formatted_bc)
            cbc = cr_mol_counter.MoleculeCounter.compress_barcode_seq(bc)
            filtered_bc_tuples.add((genome_id, gg, cbc))

    def keep_molecule(genome_id, gem_group, barcode):
        tup = (genome_id, gem_group, barcode)
        return (tup in filtered_bc_tuples)

    filter_func = np.vectorize(keep_molecule)
    gem_groups = ctr_in.get_column('gem_group')
    barcodes = ctr_in.get_column('barcode')
    filter_index = filter_func(genome_ids, gem_groups, barcodes)

    for col in cr_mol_counter.MOLECULE_INFO_COLUMNS:
        data = ctr_in.get_column(col)
        filtered_data = data[filter_index]
        ctr_out.add_many(col, filtered_data)

    for col in cr_mol_counter.MOLECULE_REF_COLUMNS:
        ctr_out.set_ref_column(col, ctr_in.get_ref_column(col))

    # summarize filtered data
    genomes = ctr_out.get_ref_column('genome_ids')
    filtered_reads = ctr_out.get_column('reads')
    flt_conf_mapped_per_genome = {}
    if len(genomes) == 1:
        genome = genomes[0]
        flt_conf_mapped_per_genome[genome] = filtered_reads.sum()
    else:
        genome_ids = ctr_out.get_column('genome')
        genome_index = cr_reference.get_genome_index(genomes)
        for genome in genomes:
            genome_id = cr_reference.get_genome_id(genome, genome_index)
            flt_conf_mapped_per_genome[genome] = \
                filtered_reads[genome_ids == genome_id].sum()

    summary = {'flt_conf_mapped_per_genome': flt_conf_mapped_per_genome}
    return summary
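# Sketch of the keep rule above: a molecule survives iff its
# (genome_id, gem_group, compressed_barcode) triple matches a filtered cell
# barcode, e.g. 'ACGT-1' for a hypothetical genome 'GRCh38' contributes
# (get_genome_id('GRCh38', genome_index), 1, compress_barcode_seq('ACGT')).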
def join(args, outs, chunk_defs, chunk_outs):
    if args.skip:
        return

    gg_id_to_batch_id, batch_id_to_name = {}, {}

    for lib in args.library_info:
        gg_id_to_batch_id[lib['gem_group']] = lib['batch_id']
        batch_id_to_name[lib['batch_id']] = lib['batch_name']

    matrix = cr_matrix.CountMatrix.load_h5_file(args.matrix_h5)
    matrix = matrix.select_features_by_type(GENE_EXPRESSION_LIBRARY_TYPE)

    batch_ids = np.array([gg_id_to_batch_id[cr_util.split_barcode_seq(bc)[1]]
                          for bc in matrix.bcs])

    # select the intersection of features that are non-zero in every batch
    feature_mask = np.ones(matrix.features_dim)
    for b_id in batch_id_to_name:
        batch_bc_indices = np.where(batch_ids == b_id)[0]
        matrix_view = cr_matrix.CountMatrixView(matrix, bc_indices=batch_bc_indices)
        feature_mask = np.logical_and(feature_mask, matrix_view.sum(axis=1))

    matrix = matrix.select_features(np.flatnonzero(feature_mask))

    # filter barcodes with zero count
    bc_indices = np.flatnonzero(matrix.get_counts_per_bc())
    matrix = matrix.select_barcodes(bc_indices)

    # l2 norm
    matrix.m = matrix.m.astype('float64')
    cr_matrix.inplace_csc_column_normalize_l2(matrix.m)

    n_pcs = args.num_pcs if args.num_pcs is not None else analysis_constants.CBC_N_COMPONENTS_DEFAULT
    dimred_matrix = fbpca_reduce_dimension(matrix, n_pcs)

    outs.dimred_matrix = martian.make_path('dimred_matrix.pickle')
    with open(outs.dimred_matrix, 'wb') as fp:
        cPickle.dump(dimred_matrix, fp, cPickle.HIGHEST_PROTOCOL)

    bc_feature_info = {
        'barcodes': matrix.bcs,
        'features': matrix.feature_ref.feature_defs,
    }
    outs.matrix_barcode_feature_info = martian.make_path('matrix_barcode_feature_info.pickle')
    with open(outs.matrix_barcode_feature_info, 'wb') as fp:
        cPickle.dump(bc_feature_info, fp, cPickle.HIGHEST_PROTOCOL)
def split(args):
    # Get required info from the mol info
    mc = MoleculeCounter.open(args.molecule_info, 'r')

    genomes = sorted(set(f.tags.get('genome', '')
                         for f in mc.feature_reference.feature_defs))
    cell_bcs_by_genome = get_cell_associated_barcodes(genomes, args.filtered_barcodes)

    # Get cell counts per gem group
    n_cells_per_gg = defaultdict(int)
    for bc in cell_bcs_by_genome['']:
        _, gem_group = cr_utils.split_barcode_seq(bc)
        n_cells_per_gg[gem_group] += 1

    # Assign gem group cell counts to their constituent libraries
    # TODO FIXME: Need to allow for per-library cell counts
    #   because some feature types might only have a subset of the GEX cell-assoc barcodes.
    n_cells_per_lib = np.zeros(len(mc.library_info), dtype=int)
    for lib_idx, lib in enumerate(mc.library_info):
        n_cells_per_lib[lib_idx] = n_cells_per_gg[lib['gem_group']]

    if n_cells_per_lib.sum() == 0:
        return {'chunks': []}

    library_info = mc.library_info

    raw_count_per_lib = np.array(mc.get_raw_read_pairs_per_library())
    raw_rppc_per_lib = raw_count_per_lib.astype(float) / n_cells_per_lib
    usable_count_per_lib = np.array(mc.get_usable_read_pairs_per_library())

    subsamplings = list()  # track subsample info definitions

    library_types = sorted(set(lib['library_type'] for lib in library_info))

    for library_type in library_types:
        # All libraries w/ this type
        lib_indexes = np.array([i for i, lib in enumerate(library_info)
                                if lib['library_type'] == library_type])

        # For plotting, we want a series of target depths that exist for all
        # libraries w/ the same library type. When there's a single library
        # per type (the common case), this is trivial - split it into deciles.
        # But if there are multiple libraries with different depths (e.g.,
        # because gem-group-aggregation was used to increase cell numbers),
        # we need to find depths that are achievable for all libraries.
        # For now, let the lowest-depth library for a given type dictate this.
        min_raw_rppc = np.min(raw_rppc_per_lib[lib_indexes])

        # Use deciles of the raw read pairs per cell.
        deciles = np.arange(0.1, 1.1, 0.1)
        plot_targets = map(round, min_raw_rppc * deciles)

        # TODO: separate this work (internal + non)
        raw_targets = cr_constants.SUBSAMPLE_READS_PER_CELL + plot_targets

        # TODO: separate this work (internal + non)
        usable_targets = cr_constants.SUBSAMPLE_READS_PER_CELL + plot_targets

        for targets, depth_type in ((raw_targets, cr_constants.RAW_SUBSAMPLE_TYPE),
                                    (usable_targets, cr_constants.MAPPED_SUBSAMPLE_TYPE)):
            targets = sorted(list(set(map(int, targets))))

            for target_rppc in targets:
                if depth_type == cr_constants.RAW_SUBSAMPLE_TYPE:
                    # Infer the usable depth required to achieve this raw depth
                    usable_read_fracs = usable_count_per_lib.astype(float) / raw_count_per_lib
                    target_usable_counts = target_rppc * n_cells_per_lib * usable_read_fracs
                else:
                    target_usable_counts = target_rppc * n_cells_per_lib

                # Zero out libraries of the other types
                rates = np.zeros(len(library_info), dtype=float)
                rates[lib_indexes] = target_usable_counts[lib_indexes].astype(float) \
                    / usable_count_per_lib[lib_indexes]

                # Clamp rates that are close to 1 to 1
                rates[np.absolute(rates - 1) < 1e-3] = 1

                # Zero out the libraries for which we have fewer reads than the target
                rates[rates > 1] = 0.0

                enough_data = np.any((rates > 0) & (rates <= 1))
                if not enough_data:
                    rates = np.zeros(len(rates))

                subsamplings.append({
                    'library_type': library_type,
                    'subsample_type': depth_type,
                    'target_read_pairs_per_cell': int(target_rppc),
                    'library_subsample_rates': list(map(float, rates)),
                })

    # Each chunk needs to store a piece of the mol info h5
    tgt_chunk_len = cr_constants.NUM_MOLECULE_INFO_ENTRIES_PER_CHUNK

    # Split the molecule info h5 into equi-RAM chunks
    chunks = []
    for chunk_start, chunk_len in mc.get_chunks(tgt_chunk_len, preserve_boundaries=True):
        chunks.append({
            'chunk_start': chunk_start,
            'chunk_len': chunk_len,
            'subsample_info': subsamplings,
            # estimate_mem_gb only counts the memory used by the MoleculeCounter
            # object itself, which underestimates actual usage. Based on memory
            # profiling with test case fuzzer_114, actual usage is ~4x the
            # uncapped estimate, so set scale = 6 to leave headroom.
            '__mem_gb': MoleculeCounter.estimate_mem_gb(chunk_len, scale=6),
        })

    join = {
        '__mem_gb': 6,
    }

    mc.close()

    # TODO: is this really necessary w/ martian 3
    if len(chunks) == 0:
        chunks.append({
            'chunk_start': str(0),
            'chunk_len': str(0),
            'subsample_info': [],
        })

    return {'chunks': chunks, 'join': join}
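# Worked example of the rate computation (hypothetical numbers): with
# usable_count_per_lib = [1e6], n_cells_per_lib = [1000], and a mapped-depth
# target of target_rppc = 500, target_usable_counts = 500 * 1000 = 5e5 and
# rates = [0.5]. A target of 2000 gives a rate of 2.0, which is zeroed out
# (too few reads); a rate of 0.9995 would be clamped to 1 by the
# |rate - 1| < 1e-3 rule.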
def select_barcodes_by_gem_group(self, gem_group):
    return self.select_barcodes_by_seq(
        [bc for bc in self.bcs
         if gem_group == cr_utils.split_barcode_seq(bc)[1]])
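# Example: with self.bcs == ['ACGT-1', 'TTTT-2'], select_barcodes_by_gem_group(1)
# keeps only 'ACGT-1'; split_barcode_seq parses the '-1' suffix as the gem group.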
def filter_barcodes(args, outs):
    random.seed(0)
    np.random.seed(0)

    correction_data = pd.read_csv(args.barcode_correction_csv)
    raw_matrix = cr_matrix.CountMatrix.load_h5_file(args.matrices_h5)
    if np.isin(rna_library.ANTIBODY_LIBRARY_TYPE, correction_data.library_type):
        matrix, metrics_to_report, removed_bcs_df = remove_bcs_with_high_umi_corrected_reads(
            correction_data, raw_matrix)
        ### report all identified aggregate barcodes, together with their reads,
        ### umi corrected reads, fraction of corrected reads, and fraction of total reads
        removed_bcs_df.to_csv(outs.aggregate_barcodes)
        summary = metrics_to_report
    else:
        matrix = raw_matrix
        summary = {}

    if args.cell_barcodes is not None:
        method = FilterMethod.MANUAL
    elif args.force_cells is not None:
        method = FilterMethod.TOP_N_BARCODES
    else:
        method = FilterMethod.ORDMAG_NONAMBIENT

    summary['total_diversity'] = matrix.bcs_dim
    summary['filter_barcodes_method'] = get_filter_method_name(method)

    # Get unique gem groups
    unique_gem_groups = sorted(list(set(args.gem_groups)))

    # Get per-gem group cell load
    if args.recovered_cells is not None:
        gg_recovered_cells = int(float(args.recovered_cells) / float(len(unique_gem_groups)))
    else:
        gg_recovered_cells = cr_constants.DEFAULT_RECOVERED_CELLS_PER_GEM_GROUP

    if args.force_cells is not None:
        gg_force_cells = int(float(args.force_cells) / float(len(unique_gem_groups)))

    # Only use gene expression matrix for cell calling
    gex_matrix = matrix.view().select_features_by_type(lib_constants.GENE_EXPRESSION_LIBRARY_TYPE)

    # Make initial cell calls for each genome separately
    genomes = gex_matrix.get_genomes()

    # (gem_group, genome) => dict
    filtered_metrics_groups = OrderedDict()
    # (gem_group, genome) => list of barcode strings
    filtered_bcs_groups = OrderedDict()

    for genome in genomes:
        genome_matrix = gex_matrix.select_features_by_genome(genome)

        # Make initial cell calls for each gem group individually
        for gem_group in unique_gem_groups:
            gg_matrix = genome_matrix.select_barcodes_by_gem_group(gem_group)

            if method == FilterMethod.ORDMAG or \
               method == FilterMethod.ORDMAG_NONAMBIENT:
                gg_total_diversity = gg_matrix.bcs_dim
                gg_bc_counts = gg_matrix.get_counts_per_bc()
                gg_filtered_indices, gg_filtered_metrics, msg = cr_stats.filter_cellular_barcodes_ordmag(
                    gg_bc_counts, gg_recovered_cells, gg_total_diversity)
                gg_filtered_bcs = gg_matrix.ints_to_bcs(gg_filtered_indices)

            elif method == FilterMethod.MANUAL:
                with (open(args.cell_barcodes)) as f:
                    cell_barcodes = json.load(f)
                gg_filtered_bcs, gg_filtered_metrics, msg = cr_stats.filter_cellular_barcodes_manual(
                    gg_matrix, cell_barcodes)

            elif method == FilterMethod.TOP_N_BARCODES:
                gg_bc_counts = gg_matrix.get_counts_per_bc()
                gg_filtered_indices, gg_filtered_metrics, msg = cr_stats.filter_cellular_barcodes_fixed_cutoff(
                    gg_bc_counts, gg_force_cells)
                gg_filtered_bcs = gg_matrix.ints_to_bcs(gg_filtered_indices)

            else:
                martian.exit("Unsupported BC filtering method: %s" % method)

            if msg is not None:
                martian.log_info(msg)

            filtered_metrics_groups[(gem_group, genome)] = gg_filtered_metrics
            filtered_bcs_groups[(gem_group, genome)] = gg_filtered_bcs

    # Do additional cell calling
    outs.nonambient_calls = None

    if method == FilterMethod.ORDMAG_NONAMBIENT:
        # We need the full gene expression matrix instead of just a view
        full_gex_matrix = matrix.select_features_by_type(lib_constants.GENE_EXPRESSION_LIBRARY_TYPE)

        # Track these for recordkeeping
        eval_bcs_arrays = []
        umis_per_bc_arrays = []
        loglk_arrays = []
        pvalue_arrays = []
        pvalue_adj_arrays = []
        nonambient_arrays = []
        genome_call_arrays = []

        # Do it by gem group, but agnostic to genome
        for gg in unique_gem_groups:
            gg_matrix = full_gex_matrix.select_barcodes_by_gem_group(gg)

            # Take union of initial cell calls across genomes
            gg_bcs = sorted(list(reduce(set.union,
                                        [set(bcs) for group, bcs in filtered_bcs_groups.iteritems()
                                         if group[0] == gg])))

            result = cr_cell.find_nonambient_barcodes(gg_matrix, gg_bcs)
            if result is None:
                print 'Failed at attempt to call non-ambient barcodes in GEM group %s' % gg
                continue

            # Assign a genome to the cell calls by argmax genome counts
            genome_counts = []
            for genome in genomes:
                genome_counts.append(gg_matrix.view()
                                     .select_features_by_genome(genome)
                                     .select_barcodes(result.eval_bcs)
                                     .get_counts_per_bc())
            genome_counts = np.column_stack(genome_counts)
            genome_calls = np.array(genomes)[np.argmax(genome_counts, axis=1)]

            umis_per_bc = gg_matrix.get_counts_per_bc()

            eval_bcs_arrays.append(np.array(gg_matrix.bcs)[result.eval_bcs])
            umis_per_bc_arrays.append(umis_per_bc[result.eval_bcs])
            loglk_arrays.append(result.log_likelihood)
            pvalue_arrays.append(result.pvalues)
            pvalue_adj_arrays.append(result.pvalues_adj)
            nonambient_arrays.append(result.is_nonambient)
            genome_call_arrays.append(genome_calls)

            # Update the lists of cell-associated barcodes
            for genome in genomes:
                eval_bc_strs = np.array(gg_matrix.bcs)[result.eval_bcs]
                filtered_bcs_groups[(gg, genome)].extend(
                    eval_bc_strs[(genome_calls == genome) & (result.is_nonambient)])

        if len(eval_bcs_arrays) > 0:
            nonambient_summary = pd.DataFrame(OrderedDict([
                ('barcode', np.concatenate(eval_bcs_arrays)),
                ('umis', np.concatenate(umis_per_bc_arrays)),
                ('ambient_loglk', np.concatenate(loglk_arrays)),
                ('pvalue', np.concatenate(pvalue_arrays)),
                ('pvalue_adj', np.concatenate(pvalue_adj_arrays)),
                ('nonambient', np.concatenate(nonambient_arrays)),
                ('genome', np.concatenate(genome_call_arrays)),
            ]))
            nonambient_summary.to_csv(outs.nonambient_calls)

    # Record all filtered barcodes
    genome_filtered_bcs = defaultdict(set)
    filtered_bcs = set()
    for (gem_group, genome), bcs in filtered_bcs_groups.iteritems():
        genome_filtered_bcs[genome].update(bcs)
        filtered_bcs.update(bcs)

    # Combine initial-cell-calling metrics
    for genome in genomes:
        # Merge metrics over all gem groups for this genome
        txome_metrics = [v for k, v in filtered_metrics_groups.iteritems()
                         if k[1] == genome]
        txome_summary = cr_stats.merge_filtered_metrics(txome_metrics)

        # Append method name to metrics
        summary.update({
            ('%s_%s_%s' % (genome, key, get_filter_method_name(method))): txome_summary[key]
            for (key, _) in txome_summary.iteritems()})

        summary['%s_filtered_bcs' % genome] = len(genome_filtered_bcs[genome])

        # NOTE: This metric only applies to the initial cell calls
        summary['%s_filtered_bcs_cv' % genome] = txome_summary['filtered_bcs_cv']

    # Deduplicate and sort filtered barcode sequences
    # Sort by (gem_group, barcode_sequence)
    barcode_sort_key = lambda x: cr_utils.split_barcode_seq(x)[::-1]

    for genome, bcs in genome_filtered_bcs.iteritems():
        genome_filtered_bcs[genome] = sorted(list(set(bcs)), key=barcode_sort_key)
    filtered_bcs = sorted(list(set(filtered_bcs)), key=barcode_sort_key)

    # Re-compute various metrics on the filtered matrix
    reads_summary = cr_utils.merge_jsons_as_dict([args.raw_fastq_summary, args.attach_bcs_summary])
    matrix_summary = rna_report_mat.report_genomes(
        matrix,
        reads_summary=reads_summary,
        barcode_summary_h5_path=args.barcode_summary,
        recovered_cells=args.recovered_cells,
        cell_bc_seqs=genome_filtered_bcs)

    # Write metrics json
    combined_summary = matrix_summary.copy()
    combined_summary.update(summary)
    with open(outs.summary, 'w') as f:
        json.dump(tk_safe_json.json_sanitize(combined_summary), f,
                  indent=4, sort_keys=True)

    # Write the filtered barcodes file
    write_filtered_barcodes(outs.filtered_barcodes, genome_filtered_bcs)

    # Select cell-associated barcodes
    filtered_matrix = matrix.select_barcodes_by_seq(filtered_bcs)

    return filtered_matrix
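# Note on the per-gem-group cell load above: recovered_cells/force_cells are
# divided evenly across gem groups, e.g. recovered_cells=6000 with 2 gem
# groups gives gg_recovered_cells=3000 for each group's initial call.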
def split(args):
    if args.skip:
        return {'chunks': []}

    gg_id_to_batch_id, batch_id_to_name = {}, {}

    for lib in args.library_info:
        gg_id_to_batch_id[lib['gem_group']] = lib['batch_id']
        batch_id_to_name[lib['batch_id']] = lib['batch_name']

    # load the barcodes
    with open(args.matrix_barcode_feature_info) as fp:
        bc_feature_info = cPickle.load(fp)
    bcs = bc_feature_info.get('barcodes')

    batch_ids = np.array([gg_id_to_batch_id[cr_util.split_barcode_seq(bc)[1]]
                          for bc in bcs])

    with open(args.dimred_matrix) as fp:
        dimred_matrix = cPickle.load(fp)

    # re-order matrix such that barcodes from same batch are grouped together
    new_bc_indices = None
    batch_to_bc_indices = []
    idx_to_batch_id = np.full(dimred_matrix.shape[0], 0, dtype=np.int8)

    base = 0
    for b_id in range(len(batch_id_to_name)):
        batch_bc_indices = np.where(batch_ids == b_id)[0]
        if batch_bc_indices.shape[0] == 0:
            continue

        new_bc_indices = batch_bc_indices if new_bc_indices is None else \
            np.append(new_bc_indices, batch_bc_indices)
        batch_to_bc_indices.append((base, base + batch_bc_indices.shape[0]))
        idx_to_batch_id[base:base + batch_bc_indices.shape[0]] = b_id
        base += len(batch_bc_indices)

    # 1. check if needs re-order; 2. if needs re-order, store the original order
    need_reorder_barcode = (not np.all(np.diff(new_bc_indices) >= 0))
    if need_reorder_barcode:
        dimred_matrix = dimred_matrix[new_bc_indices]

        barcode_reorder_index = np.argsort(new_bc_indices)
        barcode_reorder_index_file = martian.make_path('barcode_reorder_index.pickle')
        with open(barcode_reorder_index_file, 'wb') as fp:
            cPickle.dump(barcode_reorder_index, fp, cPickle.HIGHEST_PROTOCOL)

        ordered_dimred_matrix_file = martian.make_path('ordered_dimred_matrix.pickle')
        with open(ordered_dimred_matrix_file, 'wb') as fp:
            cPickle.dump(dimred_matrix, fp, cPickle.HIGHEST_PROTOCOL)
    else:
        barcode_reorder_index_file, ordered_dimred_matrix_file = None, None

    idx_to_batch_id_file = martian.make_path('idx_to_batch_id.pickle')
    with open(idx_to_batch_id_file, 'wb') as fp:
        cPickle.dump(idx_to_batch_id, fp, cPickle.HIGHEST_PROTOCOL)

    nitem, ndim = dimred_matrix.shape
    nbatch = len(batch_to_bc_indices)
    cbc_knn = option(args.cbc_knn, analysis_constants.CBC_KNN)

    matrix_mem_gb = sys.getsizeof(dimred_matrix) / 1e9  # float(nitem * ndim) / NUM_ENTRIES_PER_MEM_GB
    # 72 for size of tuple, 32 * 2 for size of 2 np.int64's, and 40% for inefficient dictionaries
    nn_mem_gb = 1.4 * nbatch * nitem * cbc_knn * (72 + 2 * 32) / 1e9
    # presuming all in one batch: dimred_matrix, cur_matrix, ref_matrix
    main_mem_gb = max(int(3.0 * matrix_mem_gb + nn_mem_gb + 1.0), h5_constants.MIN_MEM_GB)

    chunks = []
    for batch_id in xrange(len(batch_to_bc_indices)):
        chunks.append({
            '__mem_gb': main_mem_gb,
            'batch_id': batch_id,
            'batch_to_bc_indices': batch_to_bc_indices,
            'ordered_dimred_matrix': ordered_dimred_matrix_file,
            'idx_to_batch_id': idx_to_batch_id_file,
            'need_reorder_barcode': need_reorder_barcode,
            'barcode_reorder_index': barcode_reorder_index_file,
        })

    return {'chunks': chunks, 'join': {'__mem_gb': JOIN_MEM_GB}}
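# Worked example of the kNN memory estimate (hypothetical sizes): with
# nbatch = 2, nitem = 100000 barcodes, and cbc_knn = 10 neighbors,
# nn_mem_gb = 1.4 * 2 * 100000 * 10 * (72 + 64) / 1e9 ~= 0.38 GB.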
def main(args, outs):
    outs.coerce_strings()

    in_bam = tk_bam.create_bam_infile(args.chunk_input)

    counter = cr_mol_counter.MoleculeCounter.open(outs.output, mode='w')

    mol_data_keys = cr_mol_counter.MoleculeCounter.get_data_columns()
    mol_data_columns = {key: idx for idx, key in enumerate(mol_data_keys)}

    gene_index = cr_reference.GeneIndex.load_pickle(
        cr_utils.get_reference_genes_index(args.reference_path))
    genomes = cr_utils.get_reference_genomes(args.reference_path)
    genome_index = cr_reference.get_genome_index(genomes)
    none_gene_id = len(gene_index.get_genes())

    # store reference index columns
    # NOTE - these must be cast to str first, as unicode is not supported
    counter.set_ref_column('genome_ids', [str(genome) for genome in genomes])
    counter.set_ref_column('gene_ids', [str(gene.id) for gene in gene_index.genes])
    counter.set_ref_column('gene_names', [str(gene.name) for gene in gene_index.genes])

    filtered_bcs_per_genome = cr_utils.load_barcode_csv(args.filtered_barcodes)
    filtered_bcs = set()
    for _, bcs in filtered_bcs_per_genome.iteritems():
        filtered_bcs |= set(bcs)

    gg_metrics = collections.defaultdict(
        lambda: {cr_mol_counter.GG_CONF_MAPPED_FILTERED_BC_READS_METRIC: 0})

    for (gem_group, barcode, gene_ids), reads_iter in itertools.groupby(
            in_bam, key=cr_utils.barcode_sort_key):
        if barcode is None or gem_group is None:
            continue

        is_cell_barcode = cr_utils.format_barcode_seq(barcode, gem_group) in filtered_bcs

        molecules = collections.defaultdict(
            lambda: np.zeros(len(mol_data_columns), dtype=np.uint64))

        compressed_barcode = cr_mol_counter.MoleculeCounter.compress_barcode_seq(barcode)
        gem_group = cr_mol_counter.MoleculeCounter.compress_gem_group(gem_group)

        read_positions = collections.defaultdict(set)
        for read in reads_iter:
            umi = cr_utils.get_read_umi(read)

            # ignore read2 to avoid double-counting. the mapping + annotation should be equivalent.
            if read.is_secondary or umi is None or read.is_read2:
                continue

            raw_umi = cr_utils.get_read_raw_umi(read)
            raw_bc, raw_gg = cr_utils.split_barcode_seq(cr_utils.get_read_raw_barcode(read))
            proc_bc, proc_gg = cr_utils.split_barcode_seq(cr_utils.get_read_barcode(read))

            if cr_utils.is_read_conf_mapped_to_transcriptome(
                    read, cr_utils.get_high_conf_mapq(args.align)):
                assert len(gene_ids) == 1

                mol_key, map_type = (umi, gene_index.gene_id_to_int(gene_ids[0])), 'reads'

                read_pos = (read.tid, read.pos)
                uniq_read_pos = read_pos not in read_positions[mol_key]
                read_positions[mol_key].add(read_pos)

                if is_cell_barcode:
                    gg_metrics[int(gem_group)][
                        cr_mol_counter.GG_CONF_MAPPED_FILTERED_BC_READS_METRIC] += 1

            elif read.is_unmapped:
                mol_key, map_type, uniq_read_pos = (umi, none_gene_id), 'unmapped_reads', False
            else:
                mol_key, map_type, uniq_read_pos = (umi, none_gene_id), 'nonconf_mapped_reads', False

            molecules[mol_key][mol_data_columns[map_type]] += 1
            molecules[mol_key][mol_data_columns['umi_corrected_reads']] += int(not raw_umi == umi)
            molecules[mol_key][mol_data_columns['barcode_corrected_reads']] += int(not raw_bc == proc_bc)
            molecules[mol_key][mol_data_columns['conf_mapped_uniq_read_pos']] += int(uniq_read_pos)

        for mol_key, molecule in sorted(molecules.items()):
            umi, gene_id = mol_key
            genome = cr_utils.get_genome_from_str(
                gene_index.int_to_gene_id(gene_id), genomes)
            genome_id = cr_reference.get_genome_id(genome, genome_index)

            counter.add(
                barcode=compressed_barcode,
                gem_group=gem_group,
                umi=cr_mol_counter.MoleculeCounter.compress_umi_seq(umi),
                gene=gene_id,
                genome=genome_id,
                **{key: molecule[col_idx]
                   for key, col_idx in mol_data_columns.iteritems()})

    in_bam.close()

    counter.set_metric(cr_mol_counter.GEM_GROUPS_METRIC, dict(gg_metrics))
    counter.save()
def split(args): """ Chunk the UMI info HDF5 file by gem group """ num_entries = vdj_umi_info.get_num_rows(args.umi_info) if num_entries > 1e9: print 'Warning: There are >1e9 entries in the umi_info - this could potentially cause an out-of-memory error.' # This will cause an OOM if there are >1.5e9 UMIs barcode_indices = vdj_umi_info.get_column(args.umi_info, 'barcode_idx') barcodes = vdj_umi_info.get_column(args.umi_info, 'barcodes') chunks = [] start_row = 0 prev_gem_group = None prev_barcode_idx = None for row, barcode_idx in enumerate(barcode_indices): if barcode_idx == prev_barcode_idx: continue _, gem_group = cr_utils.split_barcode_seq(barcodes[barcode_idx]) if prev_gem_group is not None and gem_group != prev_gem_group: # Write complete chunk end_row = row mem_gb = max( cr_constants.MIN_MEM_GB, 2 * int( np.ceil( vdj_umi_info.get_mem_gb(args.umi_info, start_row=start_row, end_row=end_row)))) chunks.append({ 'gem_group': prev_gem_group, 'start_row': start_row, 'end_row': end_row, '__mem_gb': mem_gb, }) start_row = end_row prev_gem_group = gem_group prev_barcode_idx = barcode_idx # Write final chunk end_row = vdj_umi_info.get_num_rows(args.umi_info) mem_gb = max( cr_constants.MIN_MEM_GB, 2 * int( np.ceil( vdj_umi_info.get_mem_gb( args.umi_info, start_row=start_row, end_row=end_row)))) # Handle case where umi info is empty by supplying a dummy gem group if prev_gem_group is None: prev_gem_group = args.gem_groups[0] chunks.append({ 'gem_group': prev_gem_group, 'start_row': start_row, 'end_row': end_row, '__mem_gb': mem_gb, }) return {'chunks': chunks}
def write_barcode_umi_summary(umi_info_filename, reporter, filename, threshold,
                              cell_barcode_set):
    """ Write a summary of UMI readpair-counts per (barcode, chain) tuple.
    Args:
      umi_info_filename (str) - path to umi info h5
      reporter - VdjReporter providing the chains to summarize
      filename - output filename
      threshold (int) - min read pairs per UMI used in asm
      cell_barcode_set - set of cell barcode strings
    """

    # Load the umi info
    umi_info = vdj_umi_info.read_umi_info(umi_info_filename)
    chains = umi_info['chains']
    barcodes = umi_info['barcodes']

    sep = ','

    with open(filename, 'w') as writer:
        field_names = ["bc"]
        field_names += [chain + "_all_umis" for chain in reporter.vdj_genes] + \
                       [chain + "_good_umis" for chain in reporter.vdj_genes]
        writer.write(sep.join(field_names))
        writer.write("\n")

        # Assume sorted by barcode
        for bc_idx, umi_iter in itertools.groupby(itertools.izip(umi_info['barcode_idx'],
                                                                 umi_info['chain_idx'],
                                                                 umi_info['reads']),
                                                  key=lambda x: x[0]):
            bc = barcodes[bc_idx]
            if bc not in cell_barcode_set:
                continue

            # Count UMIs
            umis = list(umi_iter)
            chain_counts = defaultdict(int)
            good_chain_counts = defaultdict(int)

            for bc_idx, chain_idx, reads in umis:
                chain = chains[chain_idx]
                chain_counts[chain] += 1
                chain_counts[cr_constants.MULTI_REFS_PREFIX] += 1

                _, gem_group = cr_utils.split_barcode_seq(barcodes[bc_idx])

                if reads >= threshold:
                    good_chain_counts[chain] += 1
                    good_chain_counts[cr_constants.MULTI_REFS_PREFIX] += 1

            # Report barcode totals
            flds = {}
            flds["bc"] = bc

            num_good_umis = good_chain_counts[cr_constants.MULTI_REFS_PREFIX]
            reporter._get_metric_attr(
                'vdj_recombinome_total_umis_per_cell_distribution').add(num_good_umis)
            reporter._get_metric_attr(
                'vdj_recombinome_total_umis_per_cell_median').add(num_good_umis)

            # Report per-chain totals for this barcode
            for chain in reporter.vdj_genes:
                chain_all_umis = chain_counts[chain]
                chain_good_umis = good_chain_counts[chain]

                flds[chain + "_all_umis"] = chain_all_umis
                flds[chain + "_good_umis"] = chain_good_umis

                reporter._get_metric_attr('vdj_recombinome_umis_per_cell_distribution',
                                          chain).add(chain_good_umis)
                reporter._get_metric_attr('vdj_recombinome_umis_per_cell_median',
                                          chain).add(chain_good_umis)

            writer.write(sep.join([str(flds[name]) for name in field_names]))
            writer.write("\n")
def main(args, outs):
    # Martian coerces dict keys to string
    # Coerce keys back to int
    args.chunks_per_gem_group = {int(k): v
                                 for k, v in args.chunks_per_gem_group.iteritems()}

    paired_end = args.read2s_chunk is not None

    # Lazy load R1
    r1_file = cr_io.open_maybe_gzip(args.read1s_chunk)
    read1s = tk_fasta.read_generator_fastq(r1_file)

    # Lazy load R2
    if paired_end:
        r2_file = cr_io.open_maybe_gzip(args.read2s_chunk)
        read2s = tk_fasta.read_generator_fastq(r2_file)
    else:
        read2s = []

    # Lazy load corrected BCs
    bc_file = cr_io.open_maybe_gzip(args.bcs)
    bcs = (line.strip() for line in bc_file)

    buckets = {}

    bucket_filenames = {}

    for gem_group, bucket_name in enumerate_bucket_names(args.chunks_per_gem_group):
        filename = martian.make_path("%s.fastq" % bucket_name)
        bucket_filenames[bucket_name] = filename
        buckets[bucket_name] = []

    for read1, read2, barcode in itertools.izip_longest(read1s, read2s, bcs):
        # Exclude unbarcoded reads
        if barcode == '':
            continue

        # Exclude short reads
        if len(read1[1]) < MIN_READ_LENGTH or \
           (read2 is not None and len(read2[1]) < MIN_READ_LENGTH):
            continue

        # Attach processed barcode to reads
        r1_hdr = cr_fastq.AugmentedFastqHeader(read1[0])
        r1_hdr.set_tag(cr_constants.PROCESSED_BARCODE_TAG, barcode)
        r1_new_qname = r1_hdr.to_string()

        if paired_end:
            r2_hdr = cr_fastq.AugmentedFastqHeader(read2[0])
            r2_hdr.set_tag(cr_constants.PROCESSED_BARCODE_TAG, barcode)
            r2_new_qname = r2_hdr.to_string()

        barcode_seq, gem_group = cr_utils.split_barcode_seq(barcode)
        bucket_name = get_bucket_name(gem_group, barcode_seq,
                                      args.chunks_per_gem_group[gem_group])

        buckets[bucket_name].append((r1_new_qname, read1[1], read1[2]))
        if paired_end:
            buckets[bucket_name].append((r2_new_qname, read2[1], read2[2]))

    outs.buckets = {}

    # Sort and write each bucket
    for bucket_name, bucket in buckets.iteritems():
        bucket.sort(key=vdj_utils.fastq_barcode_sort_key)

        # Don't create empty bucket files.
        # This is common when the reads are ordered by gem group
        # and a chunk sees only a single gem group.
        if len(bucket) == 0:
            continue

        filename = bucket_filenames[bucket_name]
        with cr_io.open_maybe_gzip(filename, 'w') as f:
            for read in bucket:
                tk_fasta.write_read_fastq(f, *read)

        outs.buckets[bucket_name] = bucket_filenames[bucket_name]
def call_cell_barcodes(umi_summary_filename, gem_group, min_umis, threshold_nx,
                       threshold_ratio):
    """ Call cell barcodes by contig/UMI read support.

    Args:
      umi_summary_filename (str) - path to umi summary tsv generated by vdj_asm
      gem_group (int) - gem group
      min_umis (int) - min passing UMIs on highest-passing-UMI-contig to call cell
      threshold_nx (float) - NX point (e.g., 0.5 for N50) used to estimate the
        high end of the read-pairs-per-UMI distribution
      threshold_ratio (float) - divisor applied to that high end to get the
        read-pair threshold
    Returns:
      (bc_support, cell_barcodes, threshold) where
      bc_support = dict of { barcode: best_contig_kth_umi_readpairs } with
        k = min_umis, and kth_umi_readpairs = 0 if the best contig has < k UMIs,
      cell_barcodes = list(str) of cell barcodes,
      threshold = read pair threshold used
    """
    with open(umi_summary_filename) as f:
        # First pass: compute threshold
        reader = csv.reader(f, delimiter='\t')

        hdr = next(reader)
        bc_col = hdr.index('barcode')
        umi_col = hdr.index('umi')
        reads_col = hdr.index('reads')
        thresh_col = hdr.index('min_umi_reads')
        good_col = hdr.index('good_umi')
        contigs_col = hdr.index('contigs')

        def use_umi(row):
            return (row[umi_col] != '') and \
                   (row[contigs_col] != '') and \
                   (row[good_col] == 'True')

        read_pairs = []
        assembly_rppu_threshold = 1
        bc_support = {}

        for row in reader:
            # Only take this gem group
            _, gg = cr_utils.split_barcode_seq(row[bc_col])
            if str(gg) != str(gem_group):
                continue

            # Initialize all barcodes
            bc_support[row[bc_col]] = 0

            if not use_umi(row):
                continue

            # Get the RPPU threshold that was used in assembly
            # The tsv reports reads per UMI, so divide by 2 for pairs.
            assembly_rppu_threshold = int(row[thresh_col]) / 2
            read_pairs.append(int(row[reads_col]) / 2)

        read_pairs = np.array(read_pairs, dtype=int)

        # Estimate the high end of the distribution
        if len(read_pairs) > 0:
            high_rppu = tk_stats.NX(read_pairs, threshold_nx)
        else:
            high_rppu = 1

        # Take UMIs within X of the high end, roughly corresponding to the
        # highest mode and therefore to molecules amplified from the first cycle.
        threshold = int(round(tk_stats.robust_divide(high_rppu, threshold_ratio)))

        # Don't drop below the looser threshold that was used in assembly.
        threshold = max(assembly_rppu_threshold, threshold)

        # Second pass: Call as cell BCs those with at least k UMIs
        # passing the strict threshold computed above.
        f.seek(0)
        reader = csv.reader(f, delimiter='\t')
        next(reader)

        cell_barcodes = []

        good_umi_iter = itertools.ifilter(use_umi, reader)
        bc_group_iter = itertools.groupby(good_umi_iter, key=lambda row: row[bc_col])

        for bc, rows in bc_group_iter:
            # Restrict to the current gem group
            bc_seq, gg = cr_utils.split_barcode_seq(bc)
            if str(gg) != str(gem_group):
                continue

            # Collect readpair support for all UMIs for all contigs
            contig_umis_readpairs = defaultdict(list)
            for row in rows:
                contig_umis_readpairs[row[contigs_col]].append(int(row[reads_col]) / 2)

            # Get the max (contig-kth-umi)
            best_kth_umi_readpairs = 0
            for contig, umi_readpairs in contig_umis_readpairs.iteritems():
                # Sort UMIs by readpairs, descending
                umi_readpairs = np.array(umi_readpairs, dtype=int)
                umi_readpairs[::-1].sort()

                # Get the kth UMI's readpair support, or 0 if there are fewer
                # than min_umis UMIs
                if len(umi_readpairs) >= min_umis:
                    kth_umi_readpairs = umi_readpairs[min_umis - 1]
                else:
                    kth_umi_readpairs = 0

                best_kth_umi_readpairs = max(best_kth_umi_readpairs, kth_umi_readpairs)

            bc_support[bc] = best_kth_umi_readpairs

            if best_kth_umi_readpairs >= threshold:
                cell_barcodes.append(bc)

    return bc_support, cell_barcodes, threshold
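# Worked example of the threshold arithmetic (hypothetical numbers): if
# tk_stats.NX(read_pairs, threshold_nx) == 80 and threshold_ratio == 10, then
# threshold = round(80 / 10) = 8 (raised to the assembly RPPU threshold if
# that was higher). A barcode is called a cell iff some contig's
# min_umis-th-highest UMI has >= 8 read pairs.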