def main(args, outs):
    molecule_counter = cr_mol_counter.MoleculeCounter.open(
        args.raw_molecules, 'r',
        start=int(args.chunk_start),
        length=int(args.chunk_len))

    filtered_bcs_per_genome = cr_utils.load_barcode_csv(args.filtered_barcodes)

    # Build raw matrices from this molecule-counter chunk, then subset the
    # barcode columns to the cell-associated barcodes for each genome.
    raw_matrices = cr_matrix.GeneBCMatrices.build_from_mol_counter(molecule_counter)
    filtered_matrices = raw_matrices.filter_barcodes(filtered_bcs_per_genome)

    raw_matrices.save_h5(outs.raw_matrices_h5)
    raw_matrices.save_mex(outs.raw_matrices_mex)
    raw_matrices.save_barcode_summary_h5(outs.barcode_summary_h5)
    filtered_matrices.save_h5(outs.filtered_matrices_h5)
    filtered_matrices.save_mex(outs.filtered_matrices_mex)

    # Write a molecule file restricted to the filtered barcodes.
    genome_ids = molecule_counter.get_ref_column('genome_ids')
    with cr_mol_counter.MoleculeCounter.open(outs.filtered_molecules, 'w') as ctr_out:
        summary = write_filtered_molecules(molecule_counter, ctr_out,
                                           genome_ids, filtered_bcs_per_genome)

    with open(outs.summary, 'w') as f:
        tk_json.dump_numpy(summary, f, pretty=True)
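# filter_barcodes above is Cell Ranger's internal API. The underlying idea
# (dropping matrix columns whose barcodes are not in a per-genome whitelist)
# can be sketched with plain numpy. Everything below is illustrative, not the
# real GeneBCMatrices implementation.
def _example_filter_barcodes():
    import numpy as np
    counts = np.array([[3, 0, 1, 2],    # genes x barcodes, dense for brevity
                       [0, 5, 0, 0],
                       [1, 1, 4, 0]])
    barcodes = ['AAAC-1', 'AAAG-1', 'AAAT-1', 'AACA-1']
    filtered_bcs = {'AAAG-1', 'AACA-1'}  # e.g. parsed from a filtered-barcode CSV
    keep = [i for i, bc in enumerate(barcodes) if bc in filtered_bcs]
    return counts[:, keep], [barcodes[i] for i in keep]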
def get_gem_group_index_json(args, outs):
    if args.gem_group_index_json:
        cr_utils.copy(args.gem_group_index_json, outs.gem_group_index_json)
    else:
        generated_index = cr_matrix.get_gem_group_index(
            args.filtered_gene_bc_matrices_h5)
        if generated_index:
            with open(outs.gem_group_index_json, 'w') as outfile:
                tk_json.dump_numpy({"gem_group_index": generated_index}, outfile)
    return outs.gem_group_index_json
def get_gem_group_index_json(args, outs):
    if args.gem_group_index_json:
        cr_io.copy(args.gem_group_index_json, outs.gem_group_index_json)
    else:
        generated_index = cr_matrix.get_gem_group_index(args.feature_barcode_matrix)
        if generated_index is not None:
            with open(outs.gem_group_index_json, 'w') as outfile:
                tk_json.dump_numpy({"gem_group_index": generated_index}, outfile)
        else:
            outs.gem_group_index_json = None
    return outs.gem_group_index_json
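# Both variants write a one-key JSON payload; the second also nulls the out
# when no index can be generated, so callers can distinguish "absent" from a
# dangling path. A hypothetical consumer (the file name is illustrative):
def _example_read_gem_group_index(path='gem_group_index.json'):
    import json
    with open(path) as f:
        return json.load(f)['gem_group_index']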
def join(args, outs, chunk_defs, chunk_outs):
    outs.corrected_bcs = [co.corrected_bcs for co in chunk_outs]

    # Write barcode counts (merged by library_type)
    bc_counters = cr_fastq.BarcodeCounter.merge_by(
        [co.corrected_barcode_counts for co in chunk_outs],
        [cd.library_type for cd in chunk_defs],
        args.barcode_whitelist, args.gem_groups)
    with open(outs.corrected_barcode_counts, 'w') as f:
        tk_safe_json.dump_numpy(bc_counters, f)

    outs.chunked_reporter = None
    reporter = cr_report.merge_reporters(
        [chunk_out.chunked_reporter for chunk_out in chunk_outs])
    reporter.report_summary_json(outs.summary)
    reporter.report_barcodes_h5(outs.barcode_summary)
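# BarcodeCounter.merge_by is internal; the pattern it implements, summing
# per-chunk barcode counts grouped by library type, can be sketched with
# plain dicts. All names below are illustrative.
def _example_merge_by(counts_per_chunk, library_types):
    # counts_per_chunk: list of {barcode: count} dicts, one per chunk
    # library_types:    parallel list giving each chunk's library type
    from collections import defaultdict
    merged = defaultdict(lambda: defaultdict(int))
    for lib_type, counts in zip(library_types, counts_per_chunk):
        for bc, n in counts.items():
            merged[lib_type][bc] += n
    return dict((lt, dict(c)) for lt, c in merged.items())

# _example_merge_by([{'AAAC': 2}, {'AAAC': 1}], ['Gene Expression'] * 2)
# -> {'Gene Expression': {'AAAC': 3}}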
def close(self):
    if self.barcode_seqs:
        with open(self.out_counts, 'w') as f:
            tk_safe_json.dump_numpy(self.to_json(), f)
def join(args, outs, chunk_defs, chunk_outs):
    outs.reads, outs.read2s, outs.tags = [], [], []
    outs.gem_groups, outs.library_types, outs.library_ids, outs.read_groups = [], [], [], []

    for chunk_out in chunk_outs:
        outs.reads += [read for read in chunk_out.reads]
        outs.read2s += [read2 for read2 in chunk_out.read2s]
        outs.tags += [tags for tags in chunk_out.tags]
        outs.gem_groups += [gem_group for gem_group in chunk_out.gem_groups]
        outs.library_types += [lt for lt in chunk_out.library_types]
        outs.library_ids += [li for li in chunk_out.library_ids]
        outs.read_groups += [read_group for read_group in chunk_out.read_groups]

    # Ensure that we have non-zero reads
    if not outs.reads:
        martian.exit("No reads found. Check the input fastqs and/or the chemistry definition")

    # Ensure consistency of BAM comments
    assert all(chunk_out.bam_comments == chunk_outs[0].bam_comments
               for chunk_out in chunk_outs)
    outs.bam_comments = chunk_outs[0].bam_comments

    # Write barcode counts (merged by library_type)
    bc_counters = BarcodeCounter.merge_by(
        [co.barcode_counts for co in chunk_outs],
        [cd.library_type for cd in chunk_defs],
        args.barcode_whitelist, outs.gem_groups)
    with open(outs.barcode_counts, 'w') as f:
        tk_safe_json.dump_numpy(bc_counters, f)

    # Write feature counts (elementwise sum across chunks)
    feature_counts = None
    for chunk_def, chunk_out in itertools.izip(chunk_defs, chunk_outs):
        with open(chunk_out.feature_counts) as f:
            chunk_counts = np.asarray(json.load(f), dtype=int)
        if feature_counts is None:
            feature_counts = chunk_counts
        else:
            feature_counts += chunk_counts
    with open(outs.feature_counts, 'w') as f:
        json.dump(tk_safe_json.json_sanitize(list(feature_counts)), f)

    outs.align = cr_utils.select_alignment_params(args.align)

    # Group reporters by library type
    outs.chunked_reporter = None
    reporter_groups = defaultdict(list)
    for chunk_def, chunk_out in zip(chunk_defs, chunk_outs):
        if not chunk_out.reads:
            continue
        chunk_lib_types = set(lt for lt in chunk_out.library_types)
        assert len(chunk_lib_types) == 1
        lib_type = list(chunk_lib_types)[0]
        reporter_groups[lib_type].append(chunk_out.chunked_reporter)

    # Merge reporters and prefix JSON keys by library type
    summary = {}
    for lib_type, reporters in reporter_groups.iteritems():
        j = cr_report.merge_reporters(reporters).to_json()
        prefix = rna_library.get_library_type_metric_prefix(lib_type)
        j_prefixed = dict((prefix + k, v) for k, v in j.iteritems())
        summary.update(j_prefixed)

    # Use a temporary reporter to generate the metadata (w/o a prefix)
    tmp_reporter = cr_report.Reporter()
    tmp_reporter.store_chemistry_metadata(args.chemistry_def)
    summary.update(tmp_reporter.to_json())

    # Write summary JSON
    with open(outs.summary, 'w') as f:
        tk_safe_json.dump_numpy(summary, f, pretty=True)
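# The per-library-type metric prefixing above relies on
# rna_library.get_library_type_metric_prefix. A self-contained sketch of the
# same idea, with an assumed prefix scheme (illustrative only):
def _example_prefix_metrics(summary_by_lib_type):
    # summary_by_lib_type: {library_type: {metric_name: value}}
    out = {}
    for lib_type, metrics in summary_by_lib_type.items():
        # Assumption for illustration: gene expression metrics stay
        # unprefixed, other library types get a sanitized "<type>_" prefix.
        prefix = '' if lib_type == 'Gene Expression' else lib_type.replace(' ', '_') + '_'
        for k, v in metrics.items():
            out[prefix + k] = v
    return out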
def main(args, outs):
    np.random.seed(0)

    with cr_mol_counter.MoleculeCounter.open(args.molecules, 'r',
                                             start=int(args.chunk_start),
                                             length=int(args.chunk_len)) as ctr_in:
        with cr_mol_counter.MoleculeCounter.open(outs.out_molecules, 'w') as ctr_out:
            metrics_in = ctr_in.get_all_metrics()
            metrics_out = metrics_in.copy()

            reads = ctr_in.get_column('reads')
            gem_groups = ctr_in.get_column('gem_group')

            if args.downsample and len(args.downsample_map) > 1:
                downsample_func = np.vectorize(
                    lambda gem_group, read_count: np.random.binomial(
                        read_count,
                        args.downsample_map[str(gem_group)]['frac_reads_kept']))

                # downsample metrics
                for gg in metrics_out[cr_mol_counter.GEM_GROUPS_METRIC]:
                    frac_reads_kept = args.downsample_map[str(gg)]['frac_reads_kept']
                    total_reads_in = metrics_in[cr_mol_counter.GEM_GROUPS_METRIC][gg][
                        cr_mol_counter.GG_TOTAL_READS_METRIC]
                    total_reads_out = round(frac_reads_kept * total_reads_in)
                    metrics_out[cr_mol_counter.GEM_GROUPS_METRIC][gg][
                        cr_mol_counter.GG_DOWNSAMPLED_READS_METRIC] = total_reads_out
                ctr_out.set_all_metrics(metrics_out)

                # downsample molecule info
                subsampled_reads = downsample_func(gem_groups, reads)
                for col in cr_mol_counter.MOLECULE_INFO_COLUMNS:
                    if col == 'reads':
                        data = subsampled_reads
                    else:
                        data = ctr_in.get_column(col)
                    ctr_out.add_many(col, data)

                # pass reference info through unchanged
                for col in cr_mol_counter.MOLECULE_REF_COLUMNS:
                    ctr_out.set_ref_column(col, ctr_in.get_ref_column(col))
            else:
                subsampled_reads = reads

            # collect summary stats
            genomes = ctr_in.get_ref_column('genome_ids')
            raw_conf_mapped_per_genome = {}
            if len(genomes) == 1:
                genome = genomes[0]
                raw_conf_mapped_per_genome[genome] = subsampled_reads.sum()
            else:
                genome_ids = ctr_in.get_column('genome')
                genome_index = cr_reference.get_genome_index(genomes)
                for genome in genomes:
                    genome_id = cr_reference.get_genome_id(genome, genome_index)
                    raw_conf_mapped_per_genome[genome] = \
                        subsampled_reads[genome_ids == genome_id].sum()

            summary = {
                'raw_conf_mapped_per_genome': raw_conf_mapped_per_genome,
                'mol_counter_metrics': metrics_out,
            }
            with open(outs.summary, 'w') as f:
                tk_json.dump_numpy(summary, f, pretty=True)
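# The heart of the downsampling above: each molecule's read count is thinned
# binomially with a per-gem-group retention rate. A minimal, self-contained
# sketch; the downsample_map shape mirrors args.downsample_map, but the
# values are made up.
def _example_downsample():
    import numpy as np
    np.random.seed(0)
    reads = np.array([10, 4, 7, 3])      # reads per molecule
    gem_groups = np.array([1, 1, 2, 2])  # gem group of each molecule
    downsample_map = {'1': {'frac_reads_kept': 0.5},
                      '2': {'frac_reads_kept': 0.25}}
    thin = np.vectorize(lambda gg, r: np.random.binomial(
        r, downsample_map[str(gg)]['frac_reads_kept']))
    return thin(gem_groups, reads)  # array of thinned read counts per molecule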
def main(args, outs):
    reporter = vdj_report.VdjReporter()

    cell_barcodes = set(vdj_utils.load_cell_barcodes_json(args.cell_barcodes))
    barcode_contigs = vdj_annot.load_cell_contigs_from_json(
        args.annotations, args.vdj_reference_path, group_key='barcode')

    # From CDR sequence to sequence id
    sequences = {}
    # From clonotype (tuple of CDR ids) to clonotype id
    clonotypes = {}
    # From barcode to clonotype id
    bc_clonotype_assignments = {}

    # First pass: just keep track of observed CDR3s
    for contig_list in barcode_contigs:
        # This will be a tuple of sequences like "TRA_<cdr seq>"
        barcode_clonotype_tuple = contig_list.clonotype_tuple(
            require_productive=not args.use_non_productive,
            require_full_len=True,
            require_high_conf=True)

        # Give unique numerical ids to the CDR3 sequences
        if barcode_clonotype_tuple:
            for cdr_seq in barcode_clonotype_tuple:
                sequences.setdefault(cdr_seq, len(sequences))

    # From sequence id to CDR sequence
    sequence_ids = {seq_id: seq for seq, seq_id in sequences.iteritems()}

    # Second pass: potentially use non-full-length contigs with a valid CDR3.
    for contig_list in barcode_contigs:
        if args.use_non_full_len:
            barcode_clonotype_tuple = []
            for c in contig_list.contigs():
                (_, cl_seq) = c.clonotype_seq()
                # If this contig has a CDR3 and we can infer the gene type of
                # that CDR3 (either from the contig itself or from other
                # full-length contigs that had this CDR3), then add it to the
                # clonotype tuple.
                if cl_seq in sequences:
                    # This rescues contigs that have a chain and CDR3 assigned
                    # but aren't full length.
                    barcode_clonotype_tuple.append(cl_seq)
        else:
            barcode_clonotype_tuple = contig_list.clonotype_tuple(
                require_productive=(not args.use_non_productive),
                require_full_len=True,
                require_high_conf=True)

        barcode_clonotype = tuple(
            sorted(list(set([sequences[s] for s in barcode_clonotype_tuple]))))

        if barcode_clonotype:
            clonotype_id = clonotypes.setdefault(barcode_clonotype, len(clonotypes))
            bc_clonotype_assignments[contig_list.name] = clonotype_id

    # From clonotype id to tuple of CDRs
    clonotype_ids = {
        clonotype_id: clonotype_tuple
        for clonotype_tuple, clonotype_id in clonotypes.iteritems()
    }

    out_clonotypes = vdj_annot.report_clonotypes(
        reporter, 'raw', cell_barcodes, clonotype_ids, sequence_ids,
        barcode_contigs, bc_clonotype_assignments)

    with open(outs.clonotype_assignments, 'w') as out_file:
        tk_safe_json.dump_numpy(tk_safe_json.json_sanitize(out_clonotypes),
                                out_file, pretty=True)

    # Add clonotype assignments to contig annotations
    del barcode_contigs
    with open(args.annotations) as f:
        all_contigs = vdj_annot.load_contig_list_from_json(f, args.vdj_reference_path)
    vdj_annot.label_contigs_with_consensus(out_clonotypes, all_contigs, 'raw')

    # Write augmented contig annotations
    with open(outs.contig_annotations, 'w') as out_file:
        vdj_annot.save_annotation_list_json(out_file, all_contigs)

    with open(outs.contig_annotations_csv, 'w') as out_file:
        vdj_annot.save_contig_list_csv(out_file, all_contigs, write_inferred=False)

    with open(outs.contig_annotations_pickle, 'w') as out_file:
        cPickle.dump(all_contigs, out_file, protocol=cPickle.HIGHEST_PROTOCOL)

    # Write filtered contig annotations
    with open(outs.filtered_contig_annotations_csv, 'w') as out_file:
        filtered_contigs = filter(lambda x: x.high_confidence and x.is_cell,
                                  all_contigs)
        vdj_annot.save_contig_list_csv(out_file, filtered_contigs,
                                       write_inferred=False)

    # Set a default value for paired clonotype diversity so that it will be
    # present in the metric summary csv even when there are no paired cells
    # or in denovo mode.
    paired_diversity_metric = reporter._get_metric_attr(
        'vdj_paired_clonotype_diversity', MULTI_REFS_PREFIX, 'raw')
    if not paired_diversity_metric.d:
        paired_diversity_metric.add(None, 0)

    reporter.report_summary_json(outs.summary)
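# The clonotype bookkeeping above is tuple interning: setdefault(key, len(d))
# hands out dense ids in first-seen order, and sorting the CDR3 ids makes the
# clonotype key order-independent. A minimal sketch with toy CDR3 strings
# (all data illustrative):
def _example_clonotype_ids():
    sequences, clonotypes, assignments = {}, {}, {}
    barcodes = {'bc1': ('TRA_CAV', 'TRB_CASS'),
                'bc2': ('TRB_CASS', 'TRA_CAV'),  # same pair, different order
                'bc3': ('TRA_CAL',)}
    for bc, cdrs in sorted(barcodes.items()):
        for s in cdrs:
            sequences.setdefault(s, len(sequences))
        key = tuple(sorted(set(sequences[s] for s in cdrs)))
        assignments[bc] = clonotypes.setdefault(key, len(clonotypes))
    return assignments  # bc1 and bc2 map to the same clonotype id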