def main(args, outs):
    genomes = cr_matrix.CountMatrix.get_genomes_from_h5(args.filtered_matrices)
    chemistry = cr_matrix.CountMatrix.load_chemistry_from_h5(args.filtered_matrices)
    total_cells = cr_matrix.CountMatrix.count_cells_from_h5(args.filtered_matrices)

    summary = {
        'chemistry_description': chemistry,
        'filtered_bcs_transcriptome_union': total_cells,
    }

    if args.analyze_matrices_summary:
        with open(args.analyze_matrices_summary) as reader:
            analysis_summary = json.load(reader)
        summary.update(analysis_summary)

    with open(outs.summary, 'w') as f:
        json.dump(tk_safe_json.json_sanitize(summary), f, indent=4, sort_keys=True)

    sample_properties = ReanalyzeSampleProperties(
        sample_id=args.sample_id,
        sample_desc=args.sample_desc,
        genomes=genomes,
        version=martian.get_pipelines_version())
    sample_properties = dict(sample_properties._asdict())

    sample_data_paths = cr_webshim_data.SampleDataPaths(
        summary_path=outs.summary,
        analysis_path=args.analysis,
    )

    sample_data = cr_webshim.load_sample_data(sample_properties, sample_data_paths)
    cr_webshim.build_web_summary_html(outs.web_summary, sample_properties,
                                      sample_data, PIPELINE_REANALYZE)

def plot_clonotype_table(chart, sample_properties, sample_data):
    if sample_data.vdj_clonotype_summary is None:
        return None

    clonotypes = sample_data.vdj_clonotype_summary.iloc[0:10]

    # This column used to be called 'cdr3s'; allow the webshim to work on older data
    cdr3_aa_col = 'cdr3s_aa'
    if cdr3_aa_col not in clonotypes:
        cdr3_aa_col = 'cdr3s'

    col_defs = collections.OrderedDict([
        ('clonotype_id', {
            'label': 'Clonotype ID',
            'format': 'string',
            'title': 'Clonotype ID',
            'style': 'text-align: left'
        }),
        (cdr3_aa_col, {
            'label': 'CDR3s',
            'format': 'string',
            'title': 'CDR3s in clonotype',
            'style': 'text-align: left'
        }),
        ('frequency', {
            'label': 'Frequency',
            'format': 'integer',
            'title': 'Number of cells with clonotype',
            'style': 'text-align: right'
        }),
        ('proportion', {
            'label': 'Proportion',
            'format': '%0.4f',
            'title': 'Fraction of cells with clonotype',
            'style': 'text-align: right'
        }),
    ])

    cols = []
    for name, col_def in col_defs.iteritems():
        if name not in clonotypes:
            raise ValueError('Column not found in clonotype summary: %s' % name)
        cols.append({
            'label': col_def['label'],
            'title': col_def['title'],
        })

    rows = []
    for _, cl_row in clonotypes.iterrows():
        row = []
        for col_name, col_def in col_defs.iteritems():
            value = cl_row[col_name]
            formatted_value = format_value(value, col_def['format'])
            # Make the CDR3 list a bit more readable
            formatted_value = formatted_value.replace(';', '; ')
            row.append({
                'v': tk_safe_json.json_sanitize(value),
                'f': formatted_value,
                's': col_def['style'],
            })
        rows.append(row)

    chart['table'].update({'rows': rows, 'cols': cols})
    return chart

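# A minimal, self-contained illustration (not part of the pipeline) of the
# per-cell dict convention used by the table builders here: 'v' is the raw
# value (kept JSON-safe for sorting), 'f' the display string, 's' inline CSS.
# The function name and defaults below are hypothetical.
def _example_table_cell(value, fmt='%0.4f', style='text-align: right'):
    return {'v': value, 'f': fmt % value, 's': style}

# _example_table_cell(0.12345) -> {'v': 0.12345, 'f': '0.1235', 's': 'text-align: right'}
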
def build_reference(self):
    print "Creating new reference folder at %s" % self.out_dir
    os.mkdir(self.out_dir)
    print "...done\n"

    print "Writing genome FASTA file into reference folder..."
    new_genome_fasta = os.path.join(self.out_dir, cr_constants.REFERENCE_FASTA_PATH)
    os.mkdir(os.path.dirname(new_genome_fasta))
    self.write_genome_fasta(new_genome_fasta)
    print "...done\n"

    print "Computing hash of genome FASTA file..."
    fasta_hash = cr_utils.compute_hash_of_file(new_genome_fasta)
    print "...done\n"

    print "Writing genes GTF file into reference folder..."
    new_gene_gtf = os.path.join(self.out_dir, cr_constants.REFERENCE_GENES_GTF_PATH)
    os.mkdir(os.path.dirname(new_gene_gtf))
    self.write_genome_gtf(new_gene_gtf)
    print "...done\n"

    print "Computing hash of genes GTF file..."
    gtf_hash = cr_utils.compute_hash_of_file(new_gene_gtf)
    print "...done\n"

    print "Writing genes index file into reference folder (may take over 10 minutes for a 3Gb genome)..."
    new_gene_index = os.path.join(self.out_dir, cr_constants.REFERENCE_GENES_INDEX_PATH)
    os.mkdir(os.path.dirname(new_gene_index))
    self.write_genome_gene_index(new_gene_index, new_gene_gtf, new_genome_fasta)
    print "...done\n"

    print "Writing genome metadata JSON file into reference folder..."
    metadata = {
        cr_constants.REFERENCE_GENOMES_KEY: self.genomes,
        cr_constants.REFERENCE_NUM_THREADS_KEY: int(math.ceil(float(self.mem_gb) / 8.0)),
        cr_constants.REFERENCE_MEM_GB_KEY: self.mem_gb,
        cr_constants.REFERENCE_FASTA_HASH_KEY: fasta_hash,
        cr_constants.REFERENCE_GTF_HASH_KEY: gtf_hash,
        cr_constants.REFERENCE_INPUT_FASTA_KEY: [os.path.basename(x) for x in self.in_fasta_fns],
        cr_constants.REFERENCE_INPUT_GTF_KEY: [os.path.basename(x) for x in self.in_gtf_fns],
        cr_constants.REFERENCE_VERSION_KEY: self.ref_version,
        cr_constants.REFERENCE_MKREF_VERSION_KEY: self.mkref_version,
    }
    new_metadata_json = os.path.join(self.out_dir, cr_constants.REFERENCE_METADATA_FILE)
    with open(new_metadata_json, 'w') as f:
        json.dump(tk_safe_json.json_sanitize(metadata), f, sort_keys=True, indent=4)
    print "...done\n"

    print "Generating STAR genome index (may take over 8 core hours for a 3Gb genome)..."
    new_star_path = os.path.join(self.out_dir, cr_constants.REFERENCE_STAR_PATH)
    star = STAR(new_star_path)
    star.index_reference_with_mem_gb(new_genome_fasta, new_gene_gtf,
                                     num_threads=self.num_threads,
                                     mem_gb=self.mem_gb)
    print "...done.\n"

    print ">>> Reference successfully created! <<<\n"
    print "You can now specify this reference on the command line:"
    print "cellranger --transcriptome=%s ..." % self.out_dir

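# Note on the metadata above: the stored thread count assumes roughly 8 GB of
# memory per STAR indexing thread, e.g. mem_gb=16 gives ceil(16 / 8.0) = 2 threads.
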
def save_gem_class_json(self, base_dir):
    json_file_path = MultiGenomeAnalysis.json_path(base_dir)
    cr_io.makedirs(os.path.dirname(json_file_path), allow_existing=True)
    with open(json_file_path, 'w') as f:
        json.dump(tk_safe_json.json_sanitize(self.result),
                  f, indent=4, sort_keys=True)

def report_summary_json(self, filename, summary_json_paths, barcode_summary_h5_path,
                        recovered_cells, cell_bc_seqs):
    """
    summary_json_paths: paths to summary jsons containing total_reads and *_conf_mapped_reads_frac
    barcode_summary_h5_path: path to barcode summary h5 file
    """
    d = self.report(summary_json_paths,
                    barcode_summary_h5_path=barcode_summary_h5_path,
                    recovered_cells=recovered_cells,
                    cell_bc_seqs=cell_bc_seqs)
    with open(filename, 'w') as f:
        json.dump(tk_safe_json.json_sanitize(d), f, indent=4, sort_keys=True)

def merge_by_barcode(in_filenames, r1_out_file, r2_out_file, bcs_out_file, paired_end):
    barcodes = set()

    # Note: The filehandle cache precludes the use of compressed files
    file_cache = tk_cache.FileHandleCache(mode='r', open_func=open)
    heap = []

    key_func = vdj_utils.fastq_barcode_sort_key

    for filename in in_filenames:
        try:
            fastq = tk_fasta.read_generator_fastq(file_cache.get(filename),
                                                  paired_end=paired_end)
            first_readpair = fastq.next()

            key = key_func(first_readpair[0:3])
            barcode = key[0]
            barcodes.add(barcode)

            heapq.heappush(heap, (key, first_readpair, filename))
        except StopIteration:
            pass

    while len(heap) > 0:
        # Get the minimum item and write it.
        key, readpair, in_filename = heapq.heappop(heap)

        fastq = tk_fasta.read_generator_fastq(file_cache.get(in_filename),
                                              paired_end=paired_end)

        tk_fasta.write_read_fastq(r1_out_file, *readpair[0:3])
        if paired_end:
            tk_fasta.write_read_fastq(r2_out_file, *readpair[3:6])

        # Get the next item from the source file we just wrote from.
        # If that file is out of items, then we leave that one out.
        try:
            next_readpair = fastq.next()

            key = key_func(next_readpair[0:3])
            barcode = key[0]
            barcodes.add(barcode)

            heapq.heappush(heap, (key, next_readpair, in_filename))
        except StopIteration:
            pass

    json.dump(tk_safe_json.json_sanitize(list(barcodes)), bcs_out_file)

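# merge_by_barcode is a k-way merge: seed a heap with the first record from
# each (pre-sorted) source, then repeatedly pop the minimum and refill from
# the source it came from. A minimal, self-contained sketch of the same
# pattern (names here are illustrative, not pipeline code):
def _example_kway_merge(sorted_lists):
    import heapq
    heap = []
    iters = [iter(lst) for lst in sorted_lists]
    for idx, it in enumerate(iters):
        for item in it:  # take at most one item, without raising StopIteration
            heapq.heappush(heap, (item, idx))
            break
    merged = []
    while heap:
        item, idx = heapq.heappop(heap)
        merged.append(item)
        for nxt in iters[idx]:  # refill from the source that was just popped
            heapq.heappush(heap, (nxt, idx))
            break
    return merged

# _example_kway_merge([[1, 4], [2, 3]]) -> [1, 2, 3, 4]
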
def build_web_summary_html(filename, sample_properties, sample_data, pipeline,
                           template_dir=None, alerts_output_filename=None):
    view = build_web_summary_json(sample_properties, sample_data, pipeline)

    if not view:
        return

    with open(filename, 'w') as f:
        f.write(template.convert_webshim_json_to_html(view, pipeline,
                                                      template_dir=template_dir))

    # Write raised alerts to a json file
    if alerts_output_filename is not None:
        with open(alerts_output_filename, 'w') as f:
            json.dump(tk_safe_json.json_sanitize(view.get('alarms', [])),
                      f, indent=4, sort_keys=True)

def convert_webshim_json_to_html(data, pipeline, template_dir=None):
    if template_dir is None:
        template_dir = DEFAULT_TEMPLATE_DIR

    loader = jinja2.FileSystemLoader(template_dir)
    env = jinja2.Environment(loader=loader,
                             trim_blocks=True,
                             lstrip_blocks=True,
                             variable_start_string='[[',
                             variable_end_string=']]')
    env.globals['include_file'] = lambda name: loader.get_source(env, name)[0]

    template_html = get_template_for_pipeline(pipeline, data)
    template = env.get_template(template_html)

    compressed_data = lz_string.compressToEncodedURIComponent(
        json.dumps(tk_safe_json.json_sanitize(data)))

    return template.render(data=data,
                           js_compressed_data=compressed_data).encode('utf-8')

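# The '[[ ]]' variable delimiters above keep Jinja2 from clashing with the
# '{{ }}' bindings the HTML templates leave for the browser-side framework.
# A minimal sketch of the same Environment configuration (the template
# string below is illustrative, not a pipeline template):
def _example_custom_delimiters():
    import jinja2
    env = jinja2.Environment(variable_start_string='[[',
                             variable_end_string=']]')
    template = env.from_string('{{ left_for_the_browser }} [[ name ]]')
    return template.render(name='web summary')

# _example_custom_delimiters() -> '{{ left_for_the_browser }} web summary'
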
def main(args, outs):
    exclusions = {}
    for filename in args.barcode_exclusions:
        if filename is None or not os.path.isfile(filename):
            continue
        with open(filename, "r") as infile:
            data = json.load(infile)
        reason = data["label"]
        for species, barcode_data in data["data"].iteritems():
            if species not in exclusions:
                exclusions[species] = {}
            for barcode, metric in barcode_data.iteritems():
                if barcode in exclusions[species]:
                    # This barcode was already excluded by another file
                    continue
                exclusions[species][barcode] = [reason, metric]
    with open(outs.excluded_barcodes, "w") as outfile:
        json.dump(json_sanitize(exclusions), outfile, indent=4, sort_keys=True)

def split(args):
    # Write BAM comments to json file
    bam_comment_fn = martian.make_path('bam_comments.json')
    with open(bam_comment_fn, 'w') as f:
        json.dump(args.bam_comments, f)

    # Write library info to a file
    libraries_fn = martian.make_path('libraries.json')
    with open(libraries_fn, 'w') as f:
        json.dump(tk_safe_json.json_sanitize(args.library_info), f,
                  indent=4, sort_keys=True)

    chunks = []
    for chunk_genome_input, tags, gem_group, library_type, library_id in itertools.izip_longest(
            args.genome_inputs, args.tags, args.gem_groups,
            args.library_types, args.library_ids):
        gem_group_str = str(gem_group)
        if gem_group_str in args.skip_translate and \
           library_type in args.skip_translate[gem_group_str]:
            this_skip_translate = args.skip_translate[gem_group_str][library_type]
        else:
            this_skip_translate = True
        chunks.append({
            'chunk_genome_input': chunk_genome_input,
            'chunk_tags': tags,
            'gem_group': gem_group,
            'library_type': library_type,
            'library_id': library_id,
            'library_info_json': libraries_fn,
            'bam_comments_json': bam_comment_fn,
            'skip_translate': this_skip_translate,
            '__mem_gb': 4,
        })

    join = {
        '__mem_gb': 12,
    }

    return {'chunks': chunks, 'join': join}

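# Illustrative (hypothetical) shape of args.skip_translate as consumed above:
# a dict keyed by gem group as a string, mapping library_type -> bool, e.g.
#   {'1': {'Gene Expression': False, 'Antibody Capture': True}}
# A chunk for gem group 1 / 'Gene Expression' gets skip_translate=False;
# any (gem group, library type) pair not present falls back to True.
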
def _plot_differential_expression(chart, analysis, clustering=None,
                                  diff_expr=None, original_cluster_sizes=None):
    n_clusters = clustering.clusters.max()

    # Get the union of top DE genes
    top_genes = set()

    # Limit the number of entries in the DE table
    n_genes = int(np.floor(float(ws_gex_constants.MAX_DE_TABLE_ENTRIES) / (n_clusters ** 2)))
    if n_genes < 1:
        n_genes = 1
    elif n_genes > ws_gex_constants.MAX_TOP_N_GENES:
        n_genes = ws_gex_constants.MAX_TOP_N_GENES

    cols = [
        {'type': 'string', 'label': 'Gene ID'},
        {'type': 'string', 'label': 'Gene name'},
    ]

    for i in xrange(n_clusters):
        # Filter genes by mean count and sort by log2 fold-change, descending
        means = diff_expr.data[:, 0 + 3 * i]
        log2fcs = diff_expr.data[:, 1 + 3 * i]

        keep_indices = np.flatnonzero(means >= ws_gex_constants.TOP_DE_GENES_MIN_MEAN)
        top_gene_indices = keep_indices[log2fcs[keep_indices].argsort()[::-1]][:n_genes]

        for j in top_gene_indices:
            top_genes.add(analysis.matrix.int_to_feature_id(j))

        cols.append({
            'type': 'number',
            'label': 'L2FC',
            'title': 'Log2 fold-change in cluster %d vs other cells' % (i + 1)
        })
        cols.append({
            'type': 'number',
            'label': 'p-value',
            'title': 'Adjusted p-value of differential expression in cluster %d' % (i + 1)
        })

    rows = []
    for gene_id in top_genes:
        i = analysis.matrix.feature_id_to_int(gene_id)
        gene_name = analysis.matrix.feature_id_to_name(gene_id)

        row = [gene_id, gene_name]

        for j in xrange(n_clusters):
            log2fc = diff_expr.data[i, 1 + (3 * j)]
            adj_p_value = diff_expr.data[i, 2 + (3 * j)]

            if log2fc <= 0 or adj_p_value >= ws_gex_constants.PVALUE_DEEMPHASIS_CUTOFF:
                style = '#DDD'
            else:
                style = '#000'

            row.append({
                'v': tk_safe_json.json_sanitize(log2fc),
                'f': format_value(log2fc, '%.2f'),
                's': style
            })
            row.append({
                'v': tk_safe_json.json_sanitize(adj_p_value),
                'f': format_value(adj_p_value, '%.0e'),
                's': style
            })

        rows.append(row)

    # Sort by log2fc, descending, in first cluster
    if n_clusters > 0:
        rows = sorted(rows, key=lambda row: row[2]['v'], reverse=True)

    chart['table'].update({'rows': rows, 'cols': cols})
    return chart

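# Layout of diff_expr.data assumed above: one row per feature and three
# columns per cluster i (0-based): 3*i is the mean count, 3*i + 1 the log2
# fold-change, 3*i + 2 the adjusted p-value. With two clusters a row reads:
#   [mean_0, l2fc_0, padj_0, mean_1, l2fc_1, padj_1]
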
def save_summary_json(self, filename):
    with open(filename, 'w') as f:
        json.dump(tk_safe_json.json_sanitize(self.summary), f,
                  indent=4, sort_keys=True)

def join(args, outs, chunk_defs, chunk_outs):
    contigs = []
    contig_fastqs = []
    contig_bams = []
    summary_df_parts = []
    umi_summary_df_parts = []

    for chunk_out in chunk_outs:
        if not os.path.isfile(chunk_out.contig_fasta):
            continue
        contigs.append(chunk_out.contig_fasta)
        contig_fastqs.append(chunk_out.contig_fastq)
        contig_bams.append(chunk_out.contig_bam)
        summary_df_parts.append(
            pd.read_csv(chunk_out.summary_tsv,
                        header=0, index_col=None, sep='\t',
                        dtype={'component': int, 'num_reads': int,
                               'num_pairs': int, 'num_umis': int}))
        umi_summary_df_parts.append(
            pd.read_csv(chunk_out.umi_summary_tsv,
                        header=0, index_col=None, sep='\t',
                        dtype={'umi_id': int, 'reads': int,
                               'min_umi_reads': int, 'contigs': str}))

    summary_df = pd.concat(summary_df_parts, ignore_index=True)
    umi_summary_df = pd.concat(umi_summary_df_parts, ignore_index=True)

    cr_utils.concatenate_files(outs.contig_fasta, contigs)

    if os.path.getsize(outs.contig_fasta) > 0:
        subprocess.check_call('samtools faidx %s' % outs.contig_fasta, shell=True)
        outs.contig_fasta_fai = outs.contig_fasta + '.fai'

    cr_utils.concatenate_files(outs.contig_fastq, contig_fastqs)

    if summary_df is not None:
        summary_df.to_csv(outs.summary_tsv, header=True, index=False, sep='\t')
    if umi_summary_df is not None:
        umi_summary_df.to_csv(outs.umi_summary_tsv, header=True, index=False, sep='\t')

    if contig_bams:
        tk_bam.merge(outs.contig_bam, contig_bams, threads=args.__threads)
        tk_bam.index(outs.contig_bam)

        # Make sure the Martian out matches the actual index filename
        outs.contig_bam_bai = outs.contig_bam + '.bai'

    # Merge the assembler summary jsons
    merged_summary = cr_utils.merge_jsons_single_level(
        [out.metrics_summary_json for out in chunk_outs])

    with open(outs.metrics_summary_json, 'w') as f:
        json.dump(tk_safe_json.json_sanitize(merged_summary), f,
                  indent=4, sort_keys=True)

def save_cell_barcodes_json(barcodes, filename):
    with open(filename, 'w') as f:
        json.dump(tk_safe_json.json_sanitize(sorted(list(barcodes))), f,
                  indent=4, sort_keys=True)

def join(args, outs, chunk_defs, chunk_outs):
    outs.reads, outs.read2s, outs.tags = [], [], []
    outs.gem_groups, outs.library_types, outs.library_ids, outs.read_groups = [], [], [], []

    for chunk_out in chunk_outs:
        outs.reads += chunk_out.reads
        outs.read2s += chunk_out.read2s
        outs.tags += chunk_out.tags
        outs.gem_groups += chunk_out.gem_groups
        outs.library_types += chunk_out.library_types
        outs.library_ids += chunk_out.library_ids
        outs.read_groups += chunk_out.read_groups

    # Ensure that we have non-zero reads
    if not outs.reads:
        martian.exit("No reads found. Check the input fastqs and/or the chemistry definition")

    # Ensure consistency of BAM comments
    assert all(chunk_out.bam_comments == chunk_outs[0].bam_comments
               for chunk_out in chunk_outs)
    outs.bam_comments = chunk_outs[0].bam_comments

    # Write barcode counts (merged by library_type)
    bc_counters = BarcodeCounter.merge_by(
        [co.barcode_counts for co in chunk_outs],
        [cd.library_type for cd in chunk_defs],
        args.barcode_whitelist, outs.gem_groups)
    with open(outs.barcode_counts, 'w') as f:
        tk_safe_json.dump_numpy(bc_counters, f)

    # Write feature counts
    feature_counts = None
    for chunk_def, chunk_out in itertools.izip(chunk_defs, chunk_outs):
        with open(chunk_out.feature_counts) as f:
            chunk_counts = np.asarray(json.load(f), dtype=int)
        if feature_counts is None:
            feature_counts = chunk_counts
        else:
            feature_counts += chunk_counts

    with open(outs.feature_counts, 'w') as f:
        json.dump(tk_safe_json.json_sanitize(list(feature_counts)), f)

    outs.align = cr_utils.select_alignment_params(args.align)

    # Group reporters by library type
    outs.chunked_reporter = None
    reporter_groups = defaultdict(list)
    for chunk_def, chunk_out in zip(chunk_defs, chunk_outs):
        if not chunk_out.reads:
            continue
        chunk_lib_types = set(chunk_out.library_types)
        assert len(chunk_lib_types) == 1
        lib_type = list(chunk_lib_types)[0]
        reporter_groups[lib_type].append(chunk_out.chunked_reporter)

    # Merge reporters and prefix JSON keys by library type
    summary = {}
    for lib_type, reporters in reporter_groups.iteritems():
        j = cr_report.merge_reporters(reporters).to_json()
        prefix = rna_library.get_library_type_metric_prefix(lib_type)
        j_prefixed = dict((prefix + k, v) for k, v in j.iteritems())
        summary.update(j_prefixed)

    # Use a temporary reporter to generate the metadata (w/o a prefix)
    tmp_reporter = cr_report.Reporter()
    tmp_reporter.store_chemistry_metadata(args.chemistry_def)
    summary.update(tmp_reporter.to_json())

    # Write summary JSON
    with open(outs.summary, 'w') as f:
        tk_safe_json.dump_numpy(summary, f, pretty=True)

def join(args, outs, chunk_defs, chunk_outs):
    version = martian.get_pipelines_version()

    with open(args.summary) as f:
        summary = json.load(f)

    with MoleculeCounter.open(args.molecules, 'r') as mc:
        library_info = mc.get_library_info()
        barcode_info = mc.get_barcode_info()
        barcode_seqs = mc.get_barcodes()

    lib_types = sorted(set(lib['library_type'] for lib in library_info))

    # make attrs for user-added columns in aggr csv
    extra_attrs = get_custom_aggr_columns(args.sample_defs)
    # track original library/gem info
    library_map = cr_matrix.make_library_map_aggr(args.gem_group_index)
    extra_attrs.update(library_map)

    # Merge raw matrix
    raw_matrix = cr_matrix.merge_matrices(args.raw_matrices_h5)
    raw_matrix.save_h5_file(outs.raw_matrix_h5, extra_attrs=extra_attrs)

    genomes = raw_matrix.get_genomes()

    # Create barcode summary HDF5 file w/ GEX data for the barcode rank plot
    with h5py.File(outs.barcode_summary_h5, 'w') as f:
        cr_io.create_hdf5_string_dataset(f, cr_constants.H5_BC_SEQUENCE_COL, raw_matrix.bcs)

        gex_bc_counts = raw_matrix.view() \
            .select_features_by_type(lib_constants.GENE_EXPRESSION_LIBRARY_TYPE) \
            .sum(axis=0).astype('uint64')
        genome_key = genomes[0] if len(genomes) == 1 else lib_constants.MULTI_REFS_PREFIX
        f.create_dataset('_%s_transcriptome_conf_mapped_deduped_barcoded_reads' % genome_key,
                         data=gex_bc_counts)

    rna_matrix.save_mex(raw_matrix, outs.raw_matrix_mex, version)
    del raw_matrix

    # Merge filtered matrix
    filt_mat = cr_matrix.merge_matrices(args.filtered_matrices_h5)
    filt_mat.save_h5_file(outs.filtered_matrix_h5, extra_attrs=extra_attrs)

    # Summarize the matrix across library types and genomes
    for lib_type in lib_types:
        libtype_prefix = rna_library.get_library_type_metric_prefix(lib_type)

        if rna_library.has_genomes(lib_type):
            genomes = filt_mat.get_genomes()
        else:
            genomes = [None]

        mat_lib = filt_mat.view().select_features_by_type(lib_type)

        for genome in genomes:
            if genome is None:
                mat = mat_lib
                genome_idx = None
            else:
                mat = mat_lib.select_features_by_genome(genome)
                genome_idx = barcode_info.genomes.index(genome)

            # Select barcodes passing filter for this (lib_type, genome)
            filtered_bcs = MoleculeCounter.get_filtered_barcodes(
                barcode_info, library_info, barcode_seqs,
                genome_idx=genome_idx, library_type=lib_type)
            mat = mat.select_barcodes_by_seq(filtered_bcs)

            median_features = np.median(
                mat.count_ge(axis=0, threshold=cr_constants.MIN_COUNTS_PER_GENE))
            median_counts = np.median(mat.sum(axis=0))

            genome_prefix = genome if genome is not None else lib_constants.MULTI_REFS_PREFIX

            prefixes = (libtype_prefix, genome_prefix)
            if genome is not None:
                flt_reads = summary['%s%s_flt_mapped_reads' % prefixes]
                raw_reads = summary['%s%s_raw_mapped_reads' % prefixes]
                frac_reads_in_cells = tk_stats.robust_divide(flt_reads, raw_reads)
                summary['%s%s_filtered_bcs_conf_mapped_barcoded_reads_cum_frac' % prefixes] = \
                    frac_reads_in_cells

            summary.update({
                '%s%s_filtered_bcs_median_counts' % prefixes: median_counts,
                '%s%s_filtered_bcs_median_unique_genes_detected' % prefixes: median_features,
            })

        # Compute frac reads in cells across all genomes
        prefixes = [(libtype_prefix, g) for g in genomes if g is not None]
        if len(prefixes) == 0:
            prefixes = [(libtype_prefix, lib_constants.MULTI_REFS_PREFIX)]
        flt_reads = sum(summary['%s%s_flt_mapped_reads' % p] for p in prefixes)
        raw_reads = sum(summary['%s%s_raw_mapped_reads' % p] for p in prefixes)
        frac_reads_in_cells = tk_stats.robust_divide(flt_reads, raw_reads)
        summary['%s%s_filtered_bcs_conf_mapped_barcoded_reads_cum_frac' %
                (libtype_prefix, lib_constants.MULTI_REFS_PREFIX)] = frac_reads_in_cells

    # Write MEX format (do it last because it converts the matrices to COO)
    rna_matrix.save_mex(filt_mat, outs.filtered_matrix_mex, version)

    with open(outs.summary, 'w') as f:
        json.dump(tk_safe_json.json_sanitize(summary), f, indent=4, sort_keys=True)

def main(args, outs):
    random.seed(0)

    paired_end = cr_chem.is_paired_end(args.chemistry_def)

    # Build the feature reference
    if args.reference_path:
        feature_ref = rna_feature_ref.from_transcriptome_and_csv(
            args.reference_path, args.feature_reference)
    else:
        feature_ref = rna_feature_ref.FeatureReference.empty()

    # Setup feature barcode extraction
    feature_extractor = rna_feature_ref.FeatureExtractor(
        feature_ref, use_feature_types=[args.library_type])

    # Use the chemistry to get the locations of various sequences
    rna_read_def = cr_chem.get_rna_read_def(args.chemistry_def)
    rna_read2_def = cr_chem.get_rna_read2_def(args.chemistry_def)
    bc_read_def = cr_chem.get_barcode_read_def(args.chemistry_def)
    si_read_def = cr_chem.get_si_read_def(args.chemistry_def)
    umi_read_def = cr_chem.get_umi_read_def(args.chemistry_def)

    read_defs = [rna_read_def, rna_read2_def, bc_read_def, si_read_def, umi_read_def]
    read_tags = [
        None,
        None,
        (cr_constants.RAW_BARCODE_TAG, cr_constants.RAW_BARCODE_QUAL_TAG),
        (tk_constants.SAMPLE_INDEX_TAG, tk_constants.SAMPLE_INDEX_QUAL_TAG),
        (cr_constants.RAW_UMI_TAG, cr_constants.UMI_QUAL_TAG),
    ]

    # Determine which trimmed sequences need to be retained for bamtofastq
    trim_defs = get_bamtofastq_defs(read_defs, read_tags)
    outs.bam_comments = sorted(set(trim_defs.itervalues()))

    num_libraries = len(args.library_info)
    reporter = cr_report.Reporter(
        umi_length=cr_chem.get_umi_length(args.chemistry_def),
        primers=cr_utils.get_primers_from_dicts(args.primers),
        num_libraries=num_libraries)

    # Determine if barcode sequences need to be reverse complemented.
    with FastqReader(args.read_chunks, bc_read_def, args.reads_interleaved,
                     None, None) as bc_check_rc:
        barcode_whitelist = cr_utils.load_barcode_whitelist(args.barcode_whitelist, True)
        barcode_rc = infer_barcode_reverse_complement(barcode_whitelist,
                                                      bc_check_rc.in_iter)

    # Log the untrimmed read lengths to stdout
    r1_read_def = cr_constants.ReadDef(rna_read_def.read_type, 0, None)
    r1_reader = FastqReader(args.read_chunks, r1_read_def, args.reads_interleaved,
                            None, None)

    r1_untrimmed_len = 0
    for read in itertools.islice(r1_reader.in_iter,
                                 cr_constants.DETECT_CHEMISTRY_INITIAL_READS):
        r1_untrimmed_len = max(r1_untrimmed_len, len(read[1]))
    print "Read 1 untrimmed length = ", r1_untrimmed_len
    print "Input arg r1_length = ", args.r1_length
    r1_reader.close()

    if paired_end:
        r2_read_def = cr_constants.ReadDef(rna_read2_def.read_type, 0, None)
        r2_reader = FastqReader(args.read_chunks, r2_read_def,
                                args.reads_interleaved, None, None)

        r2_untrimmed_len = 0
        for read in itertools.islice(r2_reader.in_iter,
                                     cr_constants.DETECT_CHEMISTRY_INITIAL_READS):
            r2_untrimmed_len = max(r2_untrimmed_len, len(read[1]))
        print "Read 2 untrimmed length = ", r2_untrimmed_len
        print "Input arg r2_length = ", args.r2_length
        r2_reader.close()

    # Setup read iterators.
    r1_length = args.r1_length
    r2_length = args.r2_length

    rna_reads = FastqReader(args.read_chunks, rna_read_def, args.reads_interleaved,
                            r1_length, r2_length)
    rna_read2s = FastqReader(args.read_chunks, rna_read2_def, args.reads_interleaved,
                             r1_length, r2_length)
    bc_reads = FastqReader(args.read_chunks, bc_read_def, args.reads_interleaved,
                           r1_length, r2_length)
    si_reads = FastqReader(args.read_chunks, si_read_def, args.reads_interleaved,
                           r1_length, r2_length)

    if cr_chem.has_umis(args.chemistry_def):
        umi_reads = FastqReader(args.read_chunks, umi_read_def, args.reads_interleaved,
                                r1_length, r2_length)
    else:
        umi_reads = FastqReader(None, None, False, r1_length, r2_length)

    # Record feature counts:
    feature_counts = np.zeros(feature_ref.get_num_features(), dtype=int)

    # If this library type has no feature barcodes, make the reader a NOOP
    if feature_extractor.has_features_to_extract():
        feature_reads = FastqFeatureReader(args.read_chunks, feature_extractor,
                                           args.reads_interleaved, r1_length, r2_length)
    else:
        feature_reads = FastqReader(None, None, None, r1_length, r2_length)

    fastq_readers = (rna_reads, rna_read2s, bc_reads, si_reads, umi_reads, feature_reads)

    read1_writer = ChunkedFastqWriter(outs.reads, args.reads_per_file,
                                      compression=COMPRESSION)
    if paired_end:
        read2_writer = ChunkedFastqWriter(outs.read2s, args.reads_per_file,
                                          compression=COMPRESSION)

    tag_writer = None
    if not args.augment_fastq:
        tag_writer = ChunkedFastqWriter(outs.tags, args.reads_per_file,
                                        compression=COMPRESSION)

    bc_counter = BarcodeCounter(args.barcode_whitelist, outs.barcode_counts)

    all_read_iter = itertools.izip_longest(*[reader.in_iter for reader in fastq_readers])

    EMPTY_READ = (None, '', '')

    reporter.extract_reads_init()

    for extractions in itertools.islice(all_read_iter, args.chunk_initial_reads):
        # Downsample
        if random.random() > args.chunk_subsample_rate:
            continue

        rna_extraction, rna2_extraction, bc_extraction, si_extraction, \
            umi_extraction, feature_extraction = extractions

        rna_read = rna_extraction if rna_extraction is not None else EMPTY_READ
        rna_read2 = rna2_extraction if rna2_extraction is not None else EMPTY_READ
        bc_read = bc_extraction if bc_extraction is not None else EMPTY_READ
        si_read = si_extraction if si_extraction is not None else EMPTY_READ
        umi_read = umi_extraction if umi_extraction is not None else EMPTY_READ

        if (not rna_read[1]) or (paired_end and (not rna_read2[1])):
            # Read 1 is empty, or read 2 is empty (if paired_end).
            # Empty reads cause issues with the STAR aligner, so eliminate them here.
            continue

        if bc_read != EMPTY_READ:
            # Reverse complement the barcode if necessary
            if barcode_rc:
                bc_read = (bc_read[0], tk_seq.get_rev_comp(bc_read[1]), bc_read[2][::-1])
            # Track the barcode count distribution
            bc_counter.count(*bc_read)

        # Calculate metrics on raw sequences
        lib_idx = [i for i, x in enumerate(args.library_info)
                   if x['library_id'] == args.library_id][0]
        reporter.raw_fastq_cb(rna_read, rna_read2, bc_read, si_read, umi_read,
                              lib_idx, skip_metrics=args.skip_metrics)

        # Construct new fastq headers
        fastq_header1 = AugmentedFastqHeader(rna_read[0])
        fastq_header1.set_tag(tk_constants.SAMPLE_INDEX_TAG, si_read[1])
        fastq_header1.set_tag(tk_constants.SAMPLE_INDEX_QUAL_TAG, si_read[2])
        fastq_header1.set_tag(cr_constants.RAW_BARCODE_TAG, bc_read[1])
        fastq_header1.set_tag(cr_constants.RAW_BARCODE_QUAL_TAG, bc_read[2])
        fastq_header1.set_tag(cr_constants.RAW_UMI_TAG, umi_read[1])
        fastq_header1.set_tag(cr_constants.UMI_QUAL_TAG, umi_read[2])

        feat_raw_bc = None
        feat_proc_bc = None
        feat_qual = None
        feat_ids = None

        if feature_extraction:
            if feature_extraction.barcode:
                feat_raw_bc = feature_extraction.barcode
                feat_qual = feature_extraction.qual

            if len(feature_extraction.ids) > 0:
                feat_proc_bc = feature_extraction.barcode
                feat_ids = ';'.join(feature_extraction.ids)

                # If we hit a single feature ID, count its frequency
                if len(feature_extraction.ids) == 1:
                    feature_counts[feature_extraction.indices[0]] += 1

        if feat_raw_bc:
            fastq_header1.set_tag(cr_constants.RAW_FEATURE_BARCODE_TAG, feat_raw_bc)
            fastq_header1.set_tag(cr_constants.FEATURE_BARCODE_QUAL_TAG, feat_qual)
        if feat_ids:
            fastq_header1.set_tag(cr_constants.PROCESSED_FEATURE_BARCODE_TAG, feat_proc_bc)
            fastq_header1.set_tag(cr_constants.FEATURE_IDS_TAG, feat_ids)

        if args.augment_fastq:
            read1_writer.write((fastq_header1.to_string(), rna_read[1], rna_read[2]))
        else:
            read1_writer.write((rna_read[0], rna_read[1], rna_read[2]))
            tag_writer.write((fastq_header1.to_string(), '', ''))

        if paired_end:
            fastq_header2 = AugmentedFastqHeader(rna_read2[0])
            fastq_header2.set_tag(tk_constants.SAMPLE_INDEX_TAG, si_read[1])
            fastq_header2.set_tag(tk_constants.SAMPLE_INDEX_QUAL_TAG, si_read[2])
            fastq_header2.set_tag(cr_constants.RAW_BARCODE_TAG, bc_read[1])
            fastq_header2.set_tag(cr_constants.RAW_BARCODE_QUAL_TAG, bc_read[2])
            fastq_header2.set_tag(cr_constants.RAW_UMI_TAG, umi_read[1])
            fastq_header2.set_tag(cr_constants.UMI_QUAL_TAG, umi_read[2])

            if feat_raw_bc:
                fastq_header2.set_tag(cr_constants.RAW_FEATURE_BARCODE_TAG, feat_raw_bc)
                fastq_header2.set_tag(cr_constants.FEATURE_BARCODE_QUAL_TAG, feat_qual)
            if feat_ids:
                fastq_header2.set_tag(cr_constants.PROCESSED_FEATURE_BARCODE_TAG, feat_proc_bc)
                fastq_header2.set_tag(cr_constants.FEATURE_IDS_TAG, feat_ids)

            if args.augment_fastq:
                read2_writer.write((fastq_header2.to_string(), rna_read2[1], rna_read2[2]))
            else:
                read2_writer.write((rna_read2[0], rna_read2[1], rna_read2[2]))

    reporter.extract_reads_finalize()

    # Close input and output files.
    rna_reads.close()
    if paired_end:
        rna_read2s.close()
    bc_reads.close()
    si_reads.close()
    umi_reads.close()

    read1_writer.close()
    if paired_end:
        read2_writer.close()
    if not args.augment_fastq:
        tag_writer.close()
    bc_counter.close()

    # Write feature BC read counts
    with open(outs.feature_counts, 'w') as f:
        json.dump(tk_safe_json.json_sanitize(list(feature_counts)), f)

    # Set stage output parameters.
    if len(read1_writer.file_paths) > 0:
        outs.reads = read1_writer.get_out_paths()

        if paired_end:
            outs.read2s = read2_writer.get_out_paths(len(outs.reads))
        else:
            outs.read2s = []

        if args.augment_fastq:
            outs.tags = []
        else:
            outs.tags = tag_writer.get_out_paths(len(outs.reads))

        libraries = args.library_info
        library = [li for li in libraries if li['library_id'] == args.library_id][0]

        outs.gem_groups = [library['gem_group']] * len(outs.reads)
        outs.library_types = [library['library_type']] * len(outs.reads)
        outs.library_ids = [library['library_id']] * len(outs.reads)
        outs.read_groups = [args.read_group] * len(outs.reads)
    else:
        outs.reads = []
        outs.read2s = []
        outs.tags = []
        outs.gem_groups = []
        outs.library_types = []
        outs.library_ids = []
        outs.read_groups = []

    assert len(outs.gem_groups) == len(outs.reads)
    assert args.augment_fastq or len(outs.tags) == len(outs.reads)
    if paired_end:
        assert len(outs.reads) == len(outs.read2s)

    # This is the first reporter stage, so store the pipeline metadata
    reporter.store_pipeline_metadata(martian.get_pipelines_version())

    reporter.save(outs.chunked_reporter)

def join(args, outs, chunk_defs, chunk_outs):
    # compute invariants on input data
    input_genomes = set()
    input_features = set()
    input_bc_counts = {}
    input_feature_counts = {}
    input_num_gem_groups = 0

    for sample_def in args.input_sample_defs:
        library_id = sample_def['library_id']
        with MoleculeCounter.open(sample_def[cr_constants.AGG_H5_FIELD], 'r') as mc:
            input_genomes.update(mol_counter_genomes(mc))
            input_features.update(mol_counter_features_id_type(mc))
            gem_groups = mc.get_gem_groups()
            input_num_gem_groups += len(gem_groups)

            mol_gem_group = mc.get_column('gem_group')

            mol_barcode_idx = mc.get_column('barcode_idx')
            for gg in gem_groups:
                input_bc_counts[(library_id, gg)] = np.zeros(len(mc.get_ref_column('barcodes')))
                bc_idx, counts = np.unique(mol_barcode_idx[mol_gem_group == gg],
                                           return_counts=True)
                input_bc_counts[(library_id, gg)][bc_idx] = counts
            del mol_barcode_idx

            mol_feature_idx = mc.get_column('feature_idx')
            for gg in gem_groups:
                input_feature_counts[(library_id, gg)] = np.zeros(
                    len(mc.feature_reference.feature_defs))
                feature_idx, counts = np.unique(mol_feature_idx[mol_gem_group == gg],
                                                return_counts=True)
                input_feature_counts[(library_id, gg)][feature_idx] = counts
            del mol_feature_idx

    # compute invariants on output
    output_matrix = cr_matrix.CountMatrix.load_h5_file(args.merged_raw_gene_bc_matrices_h5)
    output_genomes = set(output_matrix.get_genomes())
    output_features = set(count_matrix_features_id_type(output_matrix))
    output_bc_counts = {}
    output_feature_counts = {}
    output_gem_index = cr_matrix.get_gem_group_index(args.merged_raw_gene_bc_matrices_h5)
    output_num_gem_groups = len(output_gem_index)

    for gg in output_gem_index:
        library_id, old_gg = output_gem_index[gg]
        matrix_gg = output_matrix.select_barcodes_by_gem_group(gg)
        output_bc_counts[(library_id, old_gg)] = matrix_gg.get_counts_per_bc()
        output_feature_counts[(library_id, old_gg)] = matrix_gg.get_counts_per_feature()

    exit_message = ('An internal problem in the aggr pipeline has been detected '
                    'that might lead to incorrect results. Please report this '
                    'problem to support@10xgenomics.com.')

    if input_genomes != output_genomes:
        martian.log_info('Genomes differ between input molecule files and aggregated matrix')
        martian.exit(exit_message)
    if input_features != output_features:
        martian.log_info('Features differ between input molecule files and aggregated matrix')
        martian.exit(exit_message)
    if input_num_gem_groups != output_num_gem_groups:
        martian.log_info('Number of GEM groups differs between input molecule files and aggregated matrix')
        martian.exit(exit_message)

    for lib_gg in input_bc_counts.keys():
        if len(input_bc_counts[lib_gg]) != len(output_bc_counts[lib_gg]):
            martian.log_info(
                'Barcode list for library {}, GEM group {} has different length '
                'in aggregated output compared to input.'.format(lib_gg[0], lib_gg[1]))
            martian.exit(exit_message)
        if np.any(input_bc_counts[lib_gg] < output_bc_counts[lib_gg]):
            martian.log_info(
                'Barcode(s) in library {}, GEM group {} have higher UMI counts '
                'in aggregated output compared to inputs'.format(lib_gg[0], lib_gg[1]))
            martian.exit(exit_message)
        if len(input_feature_counts[lib_gg]) != len(output_feature_counts[lib_gg]):
            martian.log_info(
                'Feature list for library {}, GEM group {} has different length '
                'in aggregated output compared to input.'.format(lib_gg[0], lib_gg[1]))
            martian.exit(exit_message)
        if np.any(input_feature_counts[lib_gg] < output_feature_counts[lib_gg]):
            martian.log_info(
                'Feature(s) in library {}, GEM group {} have higher UMI counts '
                'in aggregated output compared to inputs'.format(lib_gg[0], lib_gg[1]))
            martian.exit(exit_message)

    summary = {
        'genomes_present': list(input_genomes),
        'num_features_in_ref': len(input_features),
        'num_gem_groups': input_num_gem_groups,
    }

    with open(outs.summary, 'w') as f:
        json.dump(tk_safe_json.json_sanitize(summary), f, indent=4, sort_keys=True)

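# The per-(library, gem group) tallies above use np.unique(..., return_counts=True)
# as a sparse bincount over index columns. A self-contained illustration
# (function name is ours, not pipeline code):
def _example_count_per_index(indices, n):
    import numpy as np
    counts = np.zeros(n, dtype=int)
    idx, c = np.unique(np.asarray(indices), return_counts=True)
    counts[idx] = c
    return counts

# _example_count_per_index([0, 2, 2, 3], 5) -> array([1, 0, 2, 1, 0])
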
def filter_barcodes(args, outs):
    random.seed(0)
    np.random.seed(0)

    matrices = cr_matrix.GeneBCMatrices.load_h5(args.matrices_h5)

    summary = {}

    total_diversity = len(matrices.matrices.values()[-1].bcs)

    if args.cell_barcodes is not None:
        method_name = cr_constants.FILTER_BARCODES_MANUAL
    elif args.force_cells is not None:
        method_name = cr_constants.FILTER_BARCODES_FIXED_CUTOFF
    else:
        method_name = cr_constants.FILTER_BARCODES_ORDMAG

    summary['total_diversity'] = total_diversity
    summary['filter_barcodes_method'] = method_name

    # Initialize filtered matrices object
    filtered_matrices = cr_matrix.GeneBCMatrices(
        matrices.matrices.keys(),
        [m.genes for m in matrices.matrices.values()],
        [m.bcs for m in matrices.matrices.values()][0])

    # Get unique gem groups
    unique_gem_groups = sorted(list(set(args.gem_groups)))

    # Get per-gem group cell load
    if args.recovered_cells is not None:
        gg_recovered_cells = int(float(args.recovered_cells) / float(len(unique_gem_groups)))
    else:
        gg_recovered_cells = cr_constants.DEFAULT_RECOVERED_CELLS_PER_GEM_GROUP

    if args.force_cells is not None:
        gg_force_cells = int(float(args.force_cells) / float(len(unique_gem_groups)))

    # Track filtered barcodes for each genome
    bcs_per_genome = collections.defaultdict(list)

    # Filter each genome's matrix
    for genome, matrix in matrices.matrices.iteritems():
        filtered_metrics = []
        filtered_bcs = []

        # Filter each gem group individually
        for gem_group in unique_gem_groups:
            gg_matrix = matrix.select_barcodes_by_gem_group(gem_group)

            if method_name == cr_constants.FILTER_BARCODES_ORDMAG:
                gg_total_diversity = len(gg_matrix.bcs)
                gg_bc_counts = gg_matrix.get_reads_per_bc()
                gg_filtered_indices, gg_filtered_metrics, msg = \
                    cr_stats.filter_cellular_barcodes_ordmag(
                        gg_bc_counts, gg_recovered_cells, gg_total_diversity)
                gg_filtered_bcs = gg_matrix.ints_to_bcs(gg_filtered_indices)

            elif method_name == cr_constants.FILTER_BARCODES_MANUAL:
                with open(args.cell_barcodes) as f:
                    cell_barcodes = json.load(f)
                gg_filtered_bcs, gg_filtered_metrics, msg = \
                    cr_stats.filter_cellular_barcodes_manual(gg_matrix, cell_barcodes)

            elif method_name == cr_constants.FILTER_BARCODES_FIXED_CUTOFF:
                gg_bc_counts = gg_matrix.get_reads_per_bc()
                gg_filtered_indices, gg_filtered_metrics, msg = \
                    cr_stats.filter_cellular_barcodes_fixed_cutoff(
                        gg_bc_counts, gg_force_cells)
                gg_filtered_bcs = gg_matrix.ints_to_bcs(gg_filtered_indices)

            else:
                martian.exit("Unsupported BC filtering method: %s" % method_name)

            if msg is not None:
                martian.log_info(msg)

            filtered_metrics.append(gg_filtered_metrics)
            filtered_bcs.extend(gg_filtered_bcs)
            bcs_per_genome[genome].extend(gg_filtered_bcs)

        # Merge metrics over all gem groups
        txome_summary = cr_stats.merge_filtered_metrics(filtered_metrics)

        # Append method name to metrics
        summary.update({
            '%s_%s_%s' % (genome, key, method_name): value
            for (key, value) in txome_summary.iteritems()})

        txome_filtered_matrix = matrix.select_barcodes_by_seq(filtered_bcs)
        filtered_matrices.matrices[genome] = txome_filtered_matrix

        summary['%s_filtered_bcs' % genome] = txome_summary['filtered_bcs']
        summary['%s_filtered_bcs_cv' % genome] = txome_summary['filtered_bcs_cv']

    # Re-compute various metrics on the filtered matrices
    matrix_summary = matrices.report(
        summary_json_paths=[args.raw_fastq_summary, args.attach_bcs_summary],
        barcode_summary_h5_path=args.barcode_summary,
        recovered_cells=args.recovered_cells,
        cell_bc_seqs=[mat.bcs for mat in filtered_matrices.matrices.itervalues()])

    # Write summary json
    combined_summary = matrix_summary.copy()
    combined_summary.update(summary)
    with open(outs.summary, 'w') as f:
        json.dump(tk_safe_json.json_sanitize(combined_summary), f,
                  indent=4, sort_keys=True)

    # Write the filtered barcodes file
    write_filtered_barcodes(outs.filtered_barcodes, bcs_per_genome)

    return filtered_matrices

def build_reference_fasta_from_fasta(fasta_path, reference_path, reference_name,
                                     ref_version, mkref_version):
    """Create cellranger-compatible vdj reference files from a V(D)J segment FASTA file.
    """
    seen_features = set()
    seen_ids = set()
    features = []

    print 'Checking FASTA entries...'

    with open(fasta_path) as f:
        for header, sequence in cr_utils.get_fasta_iter(f):
            feat = parse_fasta_entry(header, sequence)

            # Enforce unique feature IDs
            if feat.feature_id in seen_ids:
                raise ValueError('Duplicate feature ID found in input FASTA: %d.'
                                 % feat.feature_id)

            # Sanity check values
            if ' ' in feat.region_type:
                raise ValueError('Spaces not allowed in region type: "%s"' % feat.region_type)
            if ' ' in feat.gene_name:
                raise ValueError('Spaces not allowed in gene name: "%s"' % feat.gene_name)
            if ' ' in feat.record_id:
                raise ValueError('Spaces not allowed in record ID: "%s"' % feat.record_id)

            key = get_duplicate_feature_key(feat)
            if key in seen_features:
                print 'Warning: Skipping duplicate entry for %s (%s, %s).' % (
                    feat.display_name, feat.region_type, feat.record_id)
                continue

            # Strip Ns from termini
            seq = feat.sequence
            if 'N' in seq:
                print 'Warning: Feature %s contains Ns. Stripping from the ends.' % \
                    str((feat.display_name, feat.record_id, feat.region_type))
                seq = seq.strip('N')

            if len(seq) == 0:
                print 'Warning: Feature %s is all Ns. Skipping.' % \
                    str((feat.display_name, feat.record_id, feat.region_type))
                continue

            # Warn on features we couldn't classify properly
            if feat.chain_type not in vdj_constants.VDJ_CHAIN_TYPES:
                print 'Warning: Unknown chain type for: %s. Expected name to be in %s. Skipping.' % \
                    (str((feat.display_name, feat.record_id, feat.region_type)),
                     str(tuple(vdj_constants.VDJ_CHAIN_TYPES)))
                continue

            seen_ids.add(feat.feature_id)
            seen_features.add(key)

            # Update the sequence since we may have modified it
            feat_dict = feat._asdict()
            feat_dict.update({'sequence': seq})
            new_feat = VdjAnnotationFeature(**feat_dict)
            features.append(new_feat)
    print '...done.\n'

    print 'Writing sequences...'
    os.makedirs(os.path.dirname(get_vdj_reference_fasta(reference_path)))
    with open(get_vdj_reference_fasta(reference_path), 'w') as out_fasta:
        for feat in features:
            out_fasta.write(convert_vdj_feature_to_fasta_entry(feat) + '\n')
    print '...done.\n'

    print 'Computing hash of input FASTA file...'
    fasta_hash = cr_utils.compute_hash_of_file(fasta_path)
    print '...done.\n'

    print 'Writing metadata JSON file into reference folder...'
    metadata = {
        cr_constants.REFERENCE_GENOMES_KEY: reference_name,
        cr_constants.REFERENCE_FASTA_HASH_KEY: fasta_hash,
        cr_constants.REFERENCE_GTF_HASH_KEY: None,
        cr_constants.REFERENCE_INPUT_FASTA_KEY: os.path.basename(fasta_path),
        cr_constants.REFERENCE_INPUT_GTF_KEY: None,
        cr_constants.REFERENCE_VERSION_KEY: ref_version,
        cr_constants.REFERENCE_MKREF_VERSION_KEY: mkref_version,
        cr_constants.REFERENCE_TYPE_KEY: vdj_constants.REFERENCE_TYPE,
    }
    with open(os.path.join(reference_path, cr_constants.REFERENCE_METADATA_FILE), 'w') as json_file:
        json.dump(tk_safe_json.json_sanitize(metadata), json_file,
                  sort_keys=True, indent=4)
    print '...done.\n'

def write_json_from_dict(input_dict, out_file_name):
    with open(out_file_name, 'w') as f:
        json.dump(tk_safe_json.json_sanitize(input_dict), f,
                  indent=4, sort_keys=True)

def main(args, outs):
    reporter = vdj_report.VdjReporter()

    cell_barcodes = set(vdj_utils.load_cell_barcodes_json(args.cell_barcodes))

    barcode_contigs = vdj_annot.load_cell_contigs_from_json(
        args.annotations, args.vdj_reference_path, group_key='barcode')

    # From CDR sequence to sequence id
    sequences = {}
    # From clonotype (tuple of CDR ids) to clonotype id
    clonotypes = {}
    # From barcode to clonotype id
    bc_clonotype_assignments = {}

    # First pass: Just keep track of observed CDR3s
    for contig_list in barcode_contigs:
        # This will be a tuple of sequences like "TRA_<cdr seq>"
        barcode_clonotype_tuple = contig_list.clonotype_tuple(
            require_productive=not args.use_non_productive,
            require_full_len=True,
            require_high_conf=True)

        # Give unique numerical ids to the CDR3 sequences
        if barcode_clonotype_tuple:
            for cdr_seq in barcode_clonotype_tuple:
                sequences.setdefault(cdr_seq, len(sequences))

    # From sequence id to CDR sequence
    sequence_ids = {seq_id: seq for seq, seq_id in sequences.iteritems()}

    # Do a second pass to potentially use non-full length contigs with a valid CDR3.
    for contig_list in barcode_contigs:
        if args.use_non_full_len:
            barcode_clonotype_tuple = []

            for c in contig_list.contigs():
                (_, cl_seq) = c.clonotype_seq()
                # If this contig has a CDR3 and we can infer the gene type of
                # that CDR3 (either based on the contig itself or based on
                # other full-length contigs that had this CDR3), then add this
                # to the clonotype tuple.
                if cl_seq in sequences:
                    # this will rescue contigs that have a chain and CDR3 assigned
                    # but aren't full length
                    barcode_clonotype_tuple.append(cl_seq)
        else:
            barcode_clonotype_tuple = contig_list.clonotype_tuple(
                require_productive=(not args.use_non_productive),
                require_full_len=True,
                require_high_conf=True)

        barcode_clonotype = tuple(sorted(list(set(
            [sequences[s] for s in barcode_clonotype_tuple]))))

        if barcode_clonotype:
            clonotype_id = clonotypes.setdefault(barcode_clonotype, len(clonotypes))
            bc_clonotype_assignments[contig_list.name] = clonotype_id

    # From clonotype id to tuple of CDRs
    clonotype_ids = {clonotype_id: clonotype_tuple
                     for clonotype_tuple, clonotype_id in clonotypes.iteritems()}

    out_clonotypes = vdj_annot.report_clonotypes(
        reporter, 'raw', cell_barcodes, clonotype_ids,
        sequence_ids, barcode_contigs, bc_clonotype_assignments)

    with open(outs.clonotype_assignments, 'w') as out_file:
        tk_safe_json.dump_numpy(tk_safe_json.json_sanitize(out_clonotypes),
                                out_file, pretty=True)

    # Add clonotype assignments to contig annotations
    del barcode_contigs
    with open(args.annotations) as f:
        all_contigs = vdj_annot.load_contig_list_from_json(f, args.vdj_reference_path)

    vdj_annot.label_contigs_with_consensus(out_clonotypes, all_contigs, 'raw')

    # Write augmented contig annotations
    with open(outs.contig_annotations, 'w') as out_file:
        vdj_annot.save_annotation_list_json(out_file, all_contigs)

    with open(outs.contig_annotations_csv, 'w') as out_file:
        vdj_annot.save_contig_list_csv(out_file, all_contigs, write_inferred=False)

    with open(outs.contig_annotations_pickle, 'w') as out_file:
        cPickle.dump(all_contigs, out_file, protocol=cPickle.HIGHEST_PROTOCOL)

    # Write filtered contig annotations
    with open(outs.filtered_contig_annotations_csv, 'w') as out_file:
        filtered_contigs = filter(lambda x: x.high_confidence and x.is_cell, all_contigs)
        vdj_annot.save_contig_list_csv(out_file, filtered_contigs, write_inferred=False)

    # Set a default value for paired clonotype diversity so that it will be
    # present in the metric summary csv even when there are no paired cells
    # or in denovo mode
    paired_diversity_metric = reporter._get_metric_attr(
        'vdj_paired_clonotype_diversity', MULTI_REFS_PREFIX, 'raw')
    if not paired_diversity_metric.d:
        paired_diversity_metric.add(None, 0)

    reporter.report_summary_json(outs.summary)

def main(args, outs):
    np.random.seed(0)

    LogPerf.mem()

    with MoleculeCounter.open(args.molecules, 'r') as mc:
        library_info = mc.get_library_info()
        barcode_info = mc.get_barcode_info()

        metrics_in = mc.get_all_metrics()
        metrics_out = copy.deepcopy(metrics_in)

        # Compute subsampling rate and approximate new total readpair count
        frac_reads_kept = np.array(args.frac_reads_kept, dtype=float)
        total_reads_in = mc.get_raw_read_pairs_per_library()
        total_reads_out = total_reads_in * frac_reads_kept

        for lib_idx, _ in enumerate(library_info):
            metrics_out[cr_mol_counter.LIBRARIES_METRIC][str(lib_idx)][
                cr_mol_counter.DOWNSAMPLED_READS_METRIC] = total_reads_out[lib_idx]

        # downsample molecule info
        chunk = slice(args.chunk_start, args.chunk_start + args.chunk_len)
        mol_library_idx = mc.get_column_lazy('library_idx')[chunk]
        mol_read_pairs = mc.get_column_lazy('count')[chunk]

        mol_rate = frac_reads_kept[mol_library_idx]
        del mol_library_idx

        new_read_pairs = np.random.binomial(mol_read_pairs, mol_rate)
        del mol_read_pairs
        del mol_rate

        keep_mol = np.flatnonzero(new_read_pairs)
        new_read_pairs = new_read_pairs[keep_mol]

        mol_gem_group = mc.get_column_lazy('gem_group')[chunk][keep_mol]
        mol_barcode_idx = mc.get_column_lazy('barcode_idx')[chunk][keep_mol]
        mol_feature_idx = mc.get_column_lazy('feature_idx')[chunk][keep_mol]

        # Assert that gem groups start at 1 and are contiguous
        gem_groups = sorted(set(lib['gem_group'] for lib in library_info))
        assert (min(gem_groups) == 1 and
                np.all(np.diff(np.array(gem_groups, dtype=int)) == 1))

        feature_ref = mc.get_feature_ref()

        # Compute matrix dimensions:
        # get the range of possible barcode indices for each gem group.
        gg_barcode_idx_start = np.zeros(1 + len(gem_groups), dtype=int)
        gg_barcode_idx_len = np.zeros(1 + len(gem_groups), dtype=int)
        for gg_str, idx_range in sorted(args.gem_group_barcode_ranges.iteritems(),
                                        key=lambda kv: int(kv[0])):
            gg = int(gg_str)
            gg_barcode_idx_start[gg] = idx_range[0]
            gg_barcode_idx_len[gg] = idx_range[1] - idx_range[0]
        num_bcs = gg_barcode_idx_len.sum()
        num_features = feature_ref.get_num_features()

        print 'downsampled'
        LogPerf.mem()

        # Convert molecule barcode indices into matrix barcode indices.
        # The molecule info barcode_idx is in this space:
        #   [W_0, W_1, ...] where W_i is distinct original whitelist i.
        # The matrix is in, e.g., this space:
        #   [w_0-1, w_1-2, w_0-3, ...] where w_i-j is a copy of whitelist i for gem group j.

        # Return to the original whitelist index
        mol_barcode_idx -= gg_barcode_idx_start.astype(np.uint64)[mol_gem_group]
        # Offset by the cumulative whitelist length up to a barcode's gem group
        gg_barcode_matrix_start = np.cumsum(gg_barcode_idx_len).astype(np.uint64)
        mol_barcode_idx += gg_barcode_matrix_start[mol_gem_group - 1]

        ones = np.ones(len(mol_barcode_idx), dtype=cr_matrix.DEFAULT_DATA_DTYPE)
        umi_matrix = sp_sparse.coo_matrix(
            (ones, (mol_feature_idx, mol_barcode_idx)),
            shape=(num_features, num_bcs))
        print 'created umi matrix'
        LogPerf.mem()

        # Create a read-count matrix so we can summarize reads per barcode
        read_matrix = sp_sparse.coo_matrix(
            (new_read_pairs, (mol_feature_idx, mol_barcode_idx)),
            shape=(num_features, num_bcs))
        del ones
        del mol_feature_idx
        del mol_barcode_idx
        del new_read_pairs

        # Get all barcode strings for the raw matrix
        barcode_seqs = mc.get_barcodes()

        print len(barcode_seqs), len(gem_groups)
        print 'creating barcode strings'
        LogPerf.mem()

        barcodes = []
        for gg in gem_groups:
            idx_start = gg_barcode_idx_start[gg]
            idx_end = idx_start + gg_barcode_idx_len[gg]
            gg_bcs = np.array([cr_utils.format_barcode_seq(bc, gg)
                               for bc in barcode_seqs[idx_start:idx_end]])
            barcodes.append(gg_bcs)
        barcodes = np.concatenate(barcodes)
        barcodes.flags.writeable = False

        print 'created barcode strings'
        LogPerf.mem()

        # Get mapped reads per barcode per library,genome
        read_matrix = CountMatrix(feature_ref, barcodes, read_matrix)
        read_matrix.m = read_matrix.m.tocsc(copy=True)
        read_summary = summarize_read_matrix(read_matrix, library_info,
                                             barcode_info, barcode_seqs)
        del read_matrix

        print 'created read matrix'
        LogPerf.mem()

        # Construct the raw UMI matrix
        raw_umi_matrix = CountMatrix(feature_ref, barcodes, umi_matrix)
        raw_umi_matrix.save_h5_file(outs.raw_matrix_h5)
        outs.raw_nnz = raw_umi_matrix.m.nnz

        # Construct the filtered UMI matrix
        filtered_bcs = MoleculeCounter.get_filtered_barcodes(
            barcode_info, library_info, barcode_seqs)
        filtered_umi_matrix = raw_umi_matrix.select_barcodes_by_seq(filtered_bcs)
        filtered_umi_matrix.save_h5_file(outs.filtered_matrix_h5)
        outs.filtered_nnz = filtered_umi_matrix.m.nnz

        print 'created filtered umi matrix'
        LogPerf.mem()

        summary = {
            'read_summary': read_summary,
            'mol_metrics': metrics_out,
        }

        with open(outs.chunk_summary, 'w') as f:
            json.dump(tk_safe_json.json_sanitize(summary), f,
                      indent=4, sort_keys=True)

    # Don't write MEX from chunks.
    outs.raw_matrices_mex = None
    outs.filtered_matrices_mex = None

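# Worked example of the barcode index remapping above, with hypothetical
# numbers: gem groups 1 and 2 both draw on the same whitelist W_0 of length 3, so
#   gg_barcode_idx_start    = [0, 0, 0]
#   gg_barcode_idx_len      = [0, 3, 3]
#   gg_barcode_matrix_start = cumsum(gg_barcode_idx_len) = [0, 3, 6]
# A molecule in gem group 2 with barcode_idx 1 maps to matrix column
#   1 - gg_barcode_idx_start[2] + gg_barcode_matrix_start[2 - 1] = 1 + 3 = 4,
# i.e. the second barcode within gem group 2's copy of the whitelist.
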
def join(args, outs, chunk_defs, chunk_outs):
    # Pass through the matrix chunks and nnz counts
    outs.raw_matrices_h5 = [o.raw_matrix_h5 for o in chunk_outs]
    outs.raw_nnz = sum(o.raw_nnz for o in chunk_outs)
    outs.filtered_matrices_h5 = [o.filtered_matrix_h5 for o in chunk_outs]
    outs.filtered_nnz = sum(o.filtered_nnz for o in chunk_outs)

    with MoleculeCounter.open(args.molecules, 'r') as mc:
        library_info = mc.get_library_info()

    lib_types = sorted(set(lib['library_type'] for lib in library_info))

    summary = {
        'frac_reads_kept': chunk_defs[0].frac_reads_kept,
        'num_cells_by_library': chunk_defs[0].num_cells,
    }

    # Merge read summary metrics
    read_summary = defaultdict(int)
    for filename in [co.chunk_summary for co in chunk_outs]:
        with open(filename) as f:
            d = json.load(f)
            for k in d['read_summary'].iterkeys():
                read_summary[k] += d['read_summary'][k]
    summary.update(read_summary)

    # Get summary metrics
    with open(chunk_outs[0].chunk_summary) as f:
        mol_metrics = json.load(f)['mol_metrics']
    chem_keys = [k for k in mol_metrics.iterkeys() if k.startswith('chemistry')]
    for k in chem_keys:
        summary[k] = mol_metrics[k]
    print json.dumps(mol_metrics, indent=4, sort_keys=True)

    # Report normalization metrics
    all_batches = OrderedDict()

    # These are all per-library-type
    min_frac_reads_kept = np.ones(len(lib_types), dtype='float')
    total_raw_read_pairs = np.zeros(len(lib_types), dtype='uint64')
    total_ds_raw_read_pairs = np.zeros(len(lib_types), dtype='uint64')
    total_cells = np.zeros(len(lib_types), dtype='uint64')

    for lib_type_idx, lib_type in enumerate(lib_types):
        lib_inds = [i for i, lib in enumerate(library_info)
                    if lib['library_type'] == lib_type]
        for lib_idx in lib_inds:
            aggr_id = library_info[lib_idx]['aggr_id']
            old_gg = library_info[lib_idx]['old_gem_group']
            batch = aggr_id + ('-%d' % old_gg if old_gg > 1 else '')
            all_batches[batch] = None

            n_cells = summary['num_cells_by_library'][lib_idx]
            total_cells[lib_type_idx] += n_cells

            lib_metrics = mol_metrics[cr_mol_counter.LIBRARIES_METRIC][str(lib_idx)]
            raw_read_pairs = lib_metrics[cr_mol_counter.TOTAL_READS_METRIC]
            mapped_read_pairs = lib_metrics[cr_mol_counter.USABLE_READS_METRIC]
            ds_read_pairs = lib_metrics[cr_mol_counter.DOWNSAMPLED_READS_METRIC]

            total_raw_read_pairs[lib_type_idx] += raw_read_pairs
            total_ds_raw_read_pairs[lib_type_idx] += ds_read_pairs

            frac_reads_kept = summary['frac_reads_kept'][lib_idx]
            min_frac_reads_kept[lib_type_idx] = min(min_frac_reads_kept[lib_type_idx],
                                                    frac_reads_kept)

            pre_norm_raw_rppc = tk_stats.robust_divide(raw_read_pairs, n_cells)
            pre_norm_mapped_rppc = tk_stats.robust_divide(mapped_read_pairs, n_cells)

            # Prefix with batch and library type
            if lib_type.lower().startswith(rna_library.CUSTOM_LIBRARY_TYPE_PREFIX.lower()):
                lib_prefix = rna_library.CUSTOM_LIBRARY_TYPE_PREFIX + '_'
            else:
                lib_prefix = rna_library.get_library_type_metric_prefix(lib_type)

            p = (batch, lib_prefix)
            summary.update({
                '%s_%sfrac_reads_kept' % p: frac_reads_kept,
                '%s_%spre_normalization_raw_reads_per_filtered_bc' % p: pre_norm_raw_rppc,
                '%s_%spre_normalization_cmb_reads_per_filtered_bc' % p: pre_norm_mapped_rppc,
            })

    summary['batches'] = all_batches.keys()

    for lib_type_idx, lib_type in enumerate(lib_types):
        mean_rppc = tk_stats.robust_divide(total_raw_read_pairs[lib_type_idx],
                                           total_cells[lib_type_idx])
        ds_mean_rppc = tk_stats.robust_divide(total_ds_raw_read_pairs[lib_type_idx],
                                              total_cells[lib_type_idx])

        p = rna_library.get_library_type_metric_prefix(lib_type)
        summary.update({
            '%spre_normalization_total_reads' % p: total_raw_read_pairs[lib_type_idx],
            '%spost_normalization_total_reads' % p: total_ds_raw_read_pairs[lib_type_idx],
            '%sfiltered_bcs_transcriptome_union' % p: total_cells[lib_type_idx],
            '%spre_normalization_multi_transcriptome_total_raw_reads_per_filtered_bc' % p: mean_rppc,
            '%spost_normalization_multi_transcriptome_total_raw_reads_per_filtered_bc' % p: ds_mean_rppc,
            '%slowest_frac_reads_kept' % p: min_frac_reads_kept[lib_type_idx],
        })

    with open(outs.summary, 'w') as f:
        json.dump(tk_safe_json.json_sanitize(summary), f, indent=4, sort_keys=True)

def build_reference_fasta_from_ensembl(gtf_paths, transcripts_to_remove_path, genome_fasta_path, reference_path, reference_name, ref_version, mkref_version): """Create cellranger-compatible vdj reference files from a list of ENSEMBL-like GTF files. Input files are concatenated. No attempt to merge/reconcile information across them is made. Providing the files in a different order might change the output in cases where there are multiple entries with the same transcript id and the same feature type (eg. V-region). """ transcripts = collections.defaultdict(list) if transcripts_to_remove_path: with open(transcripts_to_remove_path) as f: rm_transcripts = set([line.strip() for line in f.readlines()]) else: rm_transcripts = set() # Note: We cannot symlink here because some filesystems in the wild # do not support symlinks. print 'Copying genome reference sequence...' os.makedirs(os.path.dirname(get_vdj_reference_fasta(reference_path))) tmp_genome_fa_path = os.path.join(reference_path, 'genome.fasta') cr_utils.copy(genome_fasta_path, tmp_genome_fa_path) print '...done.\n' print 'Indexing genome reference sequence...' tk_subproc.check_call(['samtools', 'faidx', tmp_genome_fa_path]) print '...done.\n' print 'Loading genome reference sequence...' genome_fasta = pysam.FastaFile(tmp_genome_fa_path) print '...done.\n' print 'Computing hash of genome FASTA file...' fasta_hash = cr_utils.compute_hash_of_file(tmp_genome_fa_path) print '...done.\n' for gtf in gtf_paths: print 'Reading GTF {}'.format(gtf) for line_no, entry in enumerate(get_gtf_iter(open(gtf))): if not entry.feature in [ ENSEMBL_FIVE_PRIME_UTR_FEATURE, ENSEMBL_CDS_FEATURE ]: continue entry = parse_attributes(entry) transcript_id = entry.attributes.get('transcript_id') transcript_biotype = entry.attributes.get('transcript_biotype') gene_biotype = entry.attributes.get('gene_biotype') gene_name = entry.attributes.get('gene_name') # Skip irrelevant biotypes if transcript_biotype not in ENSEMBL_VDJ_BIOTYPES and not gene_biotype in ENSEMBL_VDJ_BIOTYPES: continue # Skip blacklisted gene names if transcript_id in rm_transcripts: continue # Warn and skip if transcript_id missing if transcript_id is None: print 'Warning: Entry on row %d has no transcript_id' % line_no continue # Warn and skip if gene_name missing if gene_name is None: print 'Warning: Transcript %s on row %d has biotype %s but no gene_name. Skipping.' % ( transcript_id, line_no, transcript_biotype) continue # Infer region type from biotype if transcript_biotype in ENSEMBL_VDJ_BIOTYPES: vdj_feature = infer_ensembl_vdj_feature_type( entry.feature, transcript_biotype) else: vdj_feature = infer_ensembl_vdj_feature_type( entry.feature, gene_biotype) # Warn and skip if region type could not be inferred if vdj_feature is None: print 'Warning: Transcript %s has biotype %s. Could not infer VDJ gene type. Skipping.' % ( transcript_id, transcript_biotype) continue # Features that share a transcript_id and feature type are presumably exons # so keep them together. transcripts[(transcript_id, vdj_feature)].append(entry) print '...done.\n' print 'Computing hash of genes GTF files...' digest = hashlib.sha1() # concatenate all the hashes into a string and then hash that string digest.update( reduce(lambda x, y: x + y, [cr_utils.compute_hash_of_file(gtf) for gtf in gtf_paths])) gtf_hash = digest.hexdigest() print '...done.\n' print 'Fetching sequences...' 
    out_fasta = open(get_vdj_reference_fasta(reference_path), 'w')

    feature_id = 1
    seen_features = set()

    for (transcript_id, region_type), regions in transcripts.iteritems():
        if not all(r.chrom == regions[0].chrom for r in regions):
            chroms = sorted(set(r.chrom for r in regions))
            print 'Warning: Transcript %s spans multiple contigs: %s. Skipping.' % (
                transcript_id, str(chroms))
            continue

        if not all(r.strand == regions[0].strand for r in regions):
            print 'Warning: Transcript %s spans multiple strands. Skipping.' % transcript_id
            continue

        chrom = regions[0].chrom
        strand = regions[0].strand
        ens_gene_name = standardize_ensembl_gene_name(regions[0].attributes['gene_name'])
        transcript_id = regions[0].attributes['transcript_id']

        if chrom not in genome_fasta:
            print 'Warning: Transcript %s is on contig "%s" which is not in the provided reference fasta. Skipping.' % (
                transcript_id, chrom)
            continue

        # Build sequence
        regions.sort(key=lambda r: r.start)
        seq = ''
        for region in regions:
            # GTF coordinates are 1-based
            start, end = int(region.start) - 1, int(region.end)
            seq += genome_fasta.fetch(chrom, start, end)

        # Revcomp if the transcript is on the reverse strand
        if strand == '-':
            seq = tk_seq.get_rev_comp(seq)

        # Strip Ns from termini
        if 'N' in seq:
            print 'Warning: Feature %s contains Ns. Stripping from the ends.' % str(
                (ens_gene_name, transcript_id, region_type))
            seq = seq.strip('N')

        if len(seq) == 0:
            print 'Warning: Feature %s is all Ns. Skipping.' % str(
                (ens_gene_name, transcript_id, region_type))
            continue

        # Infer various attributes from the Ensembl gene name
        record_id = transcript_id
        gene_name = ens_gene_name
        display_name = make_display_name(gene_name=gene_name, allele_name=None)
        chain = infer_ensembl_vdj_chain(gene_name)
        chain_type = infer_ensembl_vdj_chain_type(gene_name)
        # Ensembl doesn't encode alleles
        allele_name = '00'

        # Disallow spaces in these fields
        if ' ' in region_type:
            raise ValueError('Spaces not allowed in region type: "%s"' % region_type)
        if ' ' in gene_name:
            raise ValueError('Spaces not allowed in gene name: "%s"' % gene_name)
        if ' ' in record_id:
            raise ValueError('Spaces not allowed in record ID: "%s"' % record_id)

        # Warn on features we couldn't classify properly
        if chain_type not in vdj_constants.VDJ_CHAIN_TYPES:
            print ('Warning: Could not infer chain type for: %s. '
                   'Expected the first two characters of the gene name to be in %s. '
                   'Feature skipped.') % \
                (str((gene_name, record_id, region_type)),
                 str(tuple(vdj_constants.VDJ_CHAIN_TYPES)))
            continue

        if region_type in vdj_constants.VDJ_C_FEATURE_TYPES and \
           chain in vdj_constants.CHAINS_WITH_ISOTYPES:
            isotype = infer_ensembl_isotype(ens_gene_name)
        else:
            isotype = None

        feature = VdjAnnotationFeature(
            feature_id=feature_id,
            record_id=record_id,
            display_name=display_name,
            gene_name=gene_name,
            region_type=region_type,
            chain_type=chain_type,
            chain=chain,
            isotype=isotype,
            allele_name=allele_name,
            sequence=seq,
        )

        # Don't add duplicate entries
        feat_key = get_duplicate_feature_key(feature)
        if feat_key in seen_features:
            print 'Warning: Skipping duplicate entry for %s (%s, %s).' % (
                display_name, region_type, record_id)
            continue
        seen_features.add(feat_key)

        feature_id += 1

        out_fasta.write(convert_vdj_feature_to_fasta_entry(feature) + '\n')
    # Close the output handle explicitly so the file is flushed before hashing
    # or downstream use (the original left this to interpreter shutdown).
    out_fasta.close()
    print '...done.\n'

    print 'Deleting copy of genome fasta...'
    os.remove(tmp_genome_fa_path)
    os.remove(tmp_genome_fa_path + '.fai')
    print '...done.\n'

    print 'Writing metadata JSON file into reference folder...'
    metadata = {
        cr_constants.REFERENCE_GENOMES_KEY: reference_name,
        cr_constants.REFERENCE_FASTA_HASH_KEY: fasta_hash,
        cr_constants.REFERENCE_GTF_HASH_KEY: gtf_hash,
        cr_constants.REFERENCE_INPUT_FASTA_KEY: os.path.basename(genome_fasta_path),
        cr_constants.REFERENCE_INPUT_GTF_KEY: ','.join([os.path.basename(gtf_path) for gtf_path in gtf_paths]),
        cr_constants.REFERENCE_VERSION_KEY: ref_version,
        cr_constants.REFERENCE_MKREF_VERSION_KEY: mkref_version,
        cr_constants.REFERENCE_TYPE_KEY: vdj_constants.REFERENCE_TYPE,
    }
    with open(os.path.join(reference_path, cr_constants.REFERENCE_METADATA_FILE), 'w') as json_file:
        json.dump(tk_safe_json.json_sanitize(metadata), json_file, sort_keys=True, indent=4)
    print '...done.\n'
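# A minimal usage sketch for the builder above. All paths and version strings
# here are illustrative placeholders, not pipeline defaults; the output folder
# is created by the function and should not already contain reference files.
if __name__ == '__main__':
    build_reference_fasta_from_ensembl(
        gtf_paths=['Homo_sapiens.GRCh38.94.chr.gtf'],    # one or more ENSEMBL-like GTFs
        transcripts_to_remove_path=None,                 # optional blacklist, one transcript ID per line
        genome_fasta_path='GRCh38.primary_assembly.fa',  # genome the GTF coordinates refer to
        reference_path='vdj_GRCh38_ref',
        reference_name='GRCh38-vdj',
        ref_version='94',
        mkref_version='x.y.z',
    )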
def join(args, outs, chunk_defs, chunk_outs):
    contigs = []
    contig_fastqs = []
    contig_bams = []

    if len(chunk_outs) == 0:
        # No input reads
        # Create empty BAM file
        with open(outs.contig_bam, 'w') as f:
            pass
        outs.contig_bam_bai = None
        # Create empty contig FASTA
        with open(outs.contig_fasta, 'w') as f:
            pass
        outs.contig_fasta_fai = None
        # Create empty contig FASTQ
        with open(outs.contig_fastq, 'w') as f:
            pass
        outs.metrics_summary_json = None
        outs.summary_tsv = None
        outs.umi_summary_tsv = None
        return

    summary_tsvs = []
    umi_summary_tsvs = []

    for chunk_out in chunk_outs:
        if not os.path.isfile(chunk_out.contig_fasta):
            continue
        contigs.append(chunk_out.contig_fasta)
        contig_fastqs.append(chunk_out.contig_fastq)
        contig_bams.append(chunk_out.contig_bam)
        summary_tsvs.append(chunk_out.summary_tsv)
        umi_summary_tsvs.append(chunk_out.umi_summary_tsv)

    cr_io.concatenate_files(outs.contig_fasta, contigs)

    if os.path.getsize(outs.contig_fasta) > 0:
        tk_subproc.check_call('samtools faidx %s' % outs.contig_fasta, shell=True)
        outs.contig_fasta_fai = outs.contig_fasta + '.fai'

    cr_io.concatenate_files(outs.contig_fastq, contig_fastqs)

    if len(summary_tsvs) > 0:
        cr_io.concatenate_headered_files(outs.summary_tsv, summary_tsvs)
    if len(umi_summary_tsvs) > 0:
        cr_io.concatenate_headered_files(outs.umi_summary_tsv, umi_summary_tsvs)

    if contig_bams:
        # Merge every N BAMs. Trying to merge them all at once
        # risks hitting the filehandle limit.
        n_merged = 0
        while len(contig_bams) > 1:
            to_merge = contig_bams[0:MERGE_BAMS_N]
            tmp_bam = martian.make_path('merged-%04d.bam' % n_merged)
            n_merged += 1

            print "Merging %d BAMs into %s ..." % (len(to_merge), tmp_bam)
            tk_bam.merge(tmp_bam, to_merge, threads=args.__threads)

            # Delete any temporary bams that have been merged
            for in_bam in to_merge:
                if os.path.basename(in_bam).startswith('merged-'):
                    cr_io.remove(in_bam)

            # Pop the input bams and push the merged bam
            contig_bams = contig_bams[len(to_merge):] + [tmp_bam]

        if os.path.basename(contig_bams[0]).startswith('merged-'):
            # We merged at least two chunks together.
            # Rename it to the output bam.
            cr_io.move(contig_bams[0], outs.contig_bam)
        else:
            # There was only a single chunk, so copy it from the input
            cr_io.copy(contig_bams[0], outs.contig_bam)

        tk_bam.index(outs.contig_bam)

        # Make sure the Martian out matches the actual index filename
        outs.contig_bam_bai = outs.contig_bam + '.bai'

    # Merge the assembler summary jsons
    merged_summary = cr_io.merge_jsons_single_level(
        [out.metrics_summary_json for out in chunk_outs])

    with open(outs.metrics_summary_json, 'w') as f:
        json.dump(tk_safe_json.json_sanitize(merged_summary), f, indent=4, sort_keys=True)
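# The batched BAM merge above bounds how many files are open at once. A
# self-contained sketch of the same reduction pattern, with a generic
# combine() standing in for tk_bam.merge and batch_size for MERGE_BAMS_N:
def batched_reduce(items, combine, batch_size=64):
    # Repeatedly combine the first batch_size items into one and push the
    # result to the back of the queue, until a single item remains.
    # Peak fan-in is batch_size rather than len(items).
    assert batch_size >= 2, 'a batch of 1 would never shrink the queue'
    items = list(items)
    while len(items) > 1:
        batch, items = items[:batch_size], items[batch_size:]
        items.append(combine(batch))
    return items[0]

# Example: summing in batches of 3 gives the same result as summing at once.
assert batched_reduce(range(10), sum, batch_size=3) == sum(range(10))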
def filter_barcodes(args, outs):
    random.seed(0)
    np.random.seed(0)

    correction_data = pd.read_csv(args.barcode_correction_csv)
    raw_matrix = cr_matrix.CountMatrix.load_h5_file(args.matrices_h5)

    if np.isin(rna_library.ANTIBODY_LIBRARY_TYPE, correction_data.library_type):
        matrix, metrics_to_report, removed_bcs_df = remove_bcs_with_high_umi_corrected_reads(
            correction_data, raw_matrix)
        # Report all identified aggregate barcodes, together with their reads,
        # UMI-corrected reads, fraction of corrected reads, and fraction of total reads
        removed_bcs_df.to_csv(outs.aggregate_barcodes)
        summary = metrics_to_report
    else:
        matrix = raw_matrix
        summary = {}

    if args.cell_barcodes is not None:
        method = FilterMethod.MANUAL
    elif args.force_cells is not None:
        method = FilterMethod.TOP_N_BARCODES
    else:
        method = FilterMethod.ORDMAG_NONAMBIENT

    summary['total_diversity'] = matrix.bcs_dim
    summary['filter_barcodes_method'] = get_filter_method_name(method)

    # Get unique gem groups
    unique_gem_groups = sorted(list(set(args.gem_groups)))

    # Get per-gem group cell load
    if args.recovered_cells is not None:
        gg_recovered_cells = int(float(args.recovered_cells) / float(len(unique_gem_groups)))
    else:
        gg_recovered_cells = cr_constants.DEFAULT_RECOVERED_CELLS_PER_GEM_GROUP

    if args.force_cells is not None:
        gg_force_cells = int(float(args.force_cells) / float(len(unique_gem_groups)))

    # Only use gene expression matrix for cell calling
    gex_matrix = matrix.view().select_features_by_type(lib_constants.GENE_EXPRESSION_LIBRARY_TYPE)

    # Make initial cell calls for each genome separately
    genomes = gex_matrix.get_genomes()

    # (gem_group, genome) => dict
    filtered_metrics_groups = OrderedDict()
    # (gem_group, genome) => list of barcode strings
    filtered_bcs_groups = OrderedDict()

    for genome in genomes:
        genome_matrix = gex_matrix.select_features_by_genome(genome)

        # Make initial cell calls for each gem group individually
        for gem_group in unique_gem_groups:
            gg_matrix = genome_matrix.select_barcodes_by_gem_group(gem_group)

            if method in (FilterMethod.ORDMAG, FilterMethod.ORDMAG_NONAMBIENT):
                gg_total_diversity = gg_matrix.bcs_dim
                gg_bc_counts = gg_matrix.get_counts_per_bc()
                gg_filtered_indices, gg_filtered_metrics, msg = cr_stats.filter_cellular_barcodes_ordmag(
                    gg_bc_counts, gg_recovered_cells, gg_total_diversity)
                gg_filtered_bcs = gg_matrix.ints_to_bcs(gg_filtered_indices)

            elif method == FilterMethod.MANUAL:
                with open(args.cell_barcodes) as f:
                    cell_barcodes = json.load(f)
                gg_filtered_bcs, gg_filtered_metrics, msg = cr_stats.filter_cellular_barcodes_manual(
                    gg_matrix, cell_barcodes)

            elif method == FilterMethod.TOP_N_BARCODES:
                gg_bc_counts = gg_matrix.get_counts_per_bc()
                gg_filtered_indices, gg_filtered_metrics, msg = cr_stats.filter_cellular_barcodes_fixed_cutoff(
                    gg_bc_counts, gg_force_cells)
                gg_filtered_bcs = gg_matrix.ints_to_bcs(gg_filtered_indices)

            else:
                martian.exit("Unsupported BC filtering method: %s" % method)

            if msg is not None:
                martian.log_info(msg)

            filtered_metrics_groups[(gem_group, genome)] = gg_filtered_metrics
            filtered_bcs_groups[(gem_group, genome)] = gg_filtered_bcs

    # Do additional cell calling
    outs.nonambient_calls = None

    if method == FilterMethod.ORDMAG_NONAMBIENT:
        # We need the full gene expression matrix instead of just a view
        full_gex_matrix = matrix.select_features_by_type(lib_constants.GENE_EXPRESSION_LIBRARY_TYPE)

        # Track these for recordkeeping
        eval_bcs_arrays = []
        umis_per_bc_arrays = []
        loglk_arrays = []
        pvalue_arrays = []
        pvalue_adj_arrays = []
        nonambient_arrays = []
        genome_call_arrays = []

        # Do it by gem group, but agnostic to genome
        for gg in unique_gem_groups:
            gg_matrix = full_gex_matrix.select_barcodes_by_gem_group(gg)

            # Take union of initial cell calls across genomes
            gg_bcs = sorted(list(reduce(set.union,
                                        [set(bcs) for group, bcs in filtered_bcs_groups.iteritems()
                                         if group[0] == gg])))

            result = cr_cell.find_nonambient_barcodes(gg_matrix, gg_bcs)
            if result is None:
                print 'Failed at attempt to call non-ambient barcodes in GEM group %s' % gg
                continue

            # Assign a genome to the cell calls by argmax genome counts
            genome_counts = []
            for genome in genomes:
                genome_counts.append(gg_matrix.view()
                                     .select_features_by_genome(genome)
                                     .select_barcodes(result.eval_bcs)
                                     .get_counts_per_bc())
            genome_counts = np.column_stack(genome_counts)
            genome_calls = np.array(genomes)[np.argmax(genome_counts, axis=1)]

            umis_per_bc = gg_matrix.get_counts_per_bc()

            eval_bcs_arrays.append(np.array(gg_matrix.bcs)[result.eval_bcs])
            umis_per_bc_arrays.append(umis_per_bc[result.eval_bcs])
            loglk_arrays.append(result.log_likelihood)
            pvalue_arrays.append(result.pvalues)
            pvalue_adj_arrays.append(result.pvalues_adj)
            nonambient_arrays.append(result.is_nonambient)
            genome_call_arrays.append(genome_calls)

            # Update the lists of cell-associated barcodes
            for genome in genomes:
                eval_bc_strs = np.array(gg_matrix.bcs)[result.eval_bcs]
                filtered_bcs_groups[(gg, genome)].extend(
                    eval_bc_strs[(genome_calls == genome) & (result.is_nonambient)])

        if len(eval_bcs_arrays) > 0:
            nonambient_summary = pd.DataFrame(OrderedDict([
                ('barcode', np.concatenate(eval_bcs_arrays)),
                ('umis', np.concatenate(umis_per_bc_arrays)),
                ('ambient_loglk', np.concatenate(loglk_arrays)),
                ('pvalue', np.concatenate(pvalue_arrays)),
                ('pvalue_adj', np.concatenate(pvalue_adj_arrays)),
                ('nonambient', np.concatenate(nonambient_arrays)),
                ('genome', np.concatenate(genome_call_arrays)),
            ]))
            nonambient_summary.to_csv(outs.nonambient_calls)

    # Record all filtered barcodes
    genome_filtered_bcs = defaultdict(set)
    filtered_bcs = set()
    for (gem_group, genome), bcs in filtered_bcs_groups.iteritems():
        genome_filtered_bcs[genome].update(bcs)
        filtered_bcs.update(bcs)

    # Combine initial-cell-calling metrics
    for genome in genomes:
        # Merge metrics over all gem groups for this genome
        txome_metrics = [v for k, v in filtered_metrics_groups.iteritems() if k[1] == genome]
        txome_summary = cr_stats.merge_filtered_metrics(txome_metrics)

        # Append method name to metrics
        summary.update({
            ('%s_%s_%s' % (genome, key, get_filter_method_name(method))): value
            for (key, value) in txome_summary.iteritems()})

        summary['%s_filtered_bcs' % genome] = len(genome_filtered_bcs[genome])

        # NOTE: This metric only applies to the initial cell calls
        summary['%s_filtered_bcs_cv' % genome] = txome_summary['filtered_bcs_cv']

    # Deduplicate and sort filtered barcode sequences
    # Sort by (gem_group, barcode_sequence)
    barcode_sort_key = lambda x: cr_utils.split_barcode_seq(x)[::-1]

    for genome, bcs in genome_filtered_bcs.iteritems():
        genome_filtered_bcs[genome] = sorted(list(set(bcs)), key=barcode_sort_key)
    filtered_bcs = sorted(list(set(filtered_bcs)), key=barcode_sort_key)

    # Re-compute various metrics on the filtered matrix
    reads_summary = cr_utils.merge_jsons_as_dict(
        [args.raw_fastq_summary, args.attach_bcs_summary])
    matrix_summary = rna_report_mat.report_genomes(
        matrix,
        reads_summary=reads_summary,
        barcode_summary_h5_path=args.barcode_summary,
        recovered_cells=args.recovered_cells,
        cell_bc_seqs=genome_filtered_bcs)

    # Write metrics json
    combined_summary = matrix_summary.copy()
    combined_summary.update(summary)
    with open(outs.summary, 'w') as f:
        json.dump(tk_safe_json.json_sanitize(combined_summary), f, indent=4, sort_keys=True)

    # Write the filtered barcodes file
    write_filtered_barcodes(outs.filtered_barcodes, genome_filtered_bcs)

    # Select cell-associated barcodes
    filtered_matrix = matrix.select_barcodes_by_seq(filtered_bcs)

    return filtered_matrix
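# The barcode_sort_key above orders barcodes by (gem_group, sequence). A
# sketch of the idea, assuming the usual 'SEQUENCE-GEMGROUP' barcode format;
# split_bc is an illustrative stand-in for cr_utils.split_barcode_seq:
def split_bc(bc):
    # 'ACGT-2' -> ('ACGT', 2); reversing the tuple puts gem group first
    seq, gg = bc.split('-')
    return seq, int(gg)

bcs = ['TTTT-1', 'AAAA-2', 'CCCC-1']
print sorted(bcs, key=lambda bc: split_bc(bc)[::-1])
# ['CCCC-1', 'TTTT-1', 'AAAA-2']  -- gem group first, then sequence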
def join(args, outs, chunk_defs, chunk_outs):
    # Merge tallies
    data = None
    for chunk in chunk_outs:
        with open(chunk.metrics) as f:
            chunk_data = cPickle.load(f)
        if data is None:
            data = chunk_data
        else:
            for k, v in data.iteritems():
                data[k] += chunk_data[k]

    # Compute metrics for each subsampling rate
    summary = {}

    with MoleculeCounter.open(args.molecule_info, 'r') as mc:
        genomes = sorted(set(f.tags.get('genome', '')
                             for f in mc.feature_reference.feature_defs))
        lib_types = sorted(set(lib['library_type'] for lib in mc.library_info))
        lib_type_map = dict((lt, idx) for (idx, lt) in enumerate(lib_types))

    cell_bcs_by_genome = get_cell_associated_barcodes(genomes, args.filtered_barcodes)

    # Give each cell-associated barcode an integer index
    cell_bcs = sorted(list(cell_bcs_by_genome['']))
    cell_bc_to_int = {bc: i for i, bc in enumerate(cell_bcs)}

    subsample_info = chunk_defs[0].subsample_info if len(chunk_defs) > 0 else []

    for i, task in enumerate(subsample_info):
        lib_type = task['library_type']
        lib_type_idx = lib_type_map[lib_type]
        ss_type = task['subsample_type']
        ss_depth = task['target_read_pairs_per_cell']

        if rna_library.has_genomes(lib_type):
            genome_ints = list(range(data['umis_per_bc'].shape[1]))
        else:
            genome_ints = [0]

        # Per-genome metrics
        for g in genome_ints:
            if not data['lib_type_genome_any_reads'][lib_type_idx, g]:
                continue
            genome = genomes[g]

            # Only compute on cell-associated barcodes for this genome.
            # This only matters when there are multiple genomes present.
            cell_inds = np.array(sorted(cell_bc_to_int[bc]
                                        for bc in cell_bcs_by_genome[genome]))

            median_umis_per_cell = np.median(data['umis_per_bc'][i, g, cell_inds])
            summary[make_metric_name('subsampled_filtered_bcs_median_counts',
                                     lib_type, genome, ss_type, ss_depth)] = median_umis_per_cell

            median_features_per_cell = np.median(data['features_det_per_bc'][i, g, cell_inds])
            summary[make_metric_name('subsampled_filtered_bcs_median_unique_genes_detected',
                                     lib_type, genome, ss_type, ss_depth)] = median_features_per_cell

            dup_frac = compute_dup_frac(data['read_pairs'][i, g], data['umis'][i, g])
            summary[make_metric_name('subsampled_duplication_frac',
                                     lib_type, genome, ss_type, ss_depth)] = dup_frac

        # Whole-dataset duplication frac
        all_read_pairs = np.sum(data['read_pairs'][i, :])
        all_umis = np.sum(data['umis'][i, :])

        dup_frac = compute_dup_frac(all_read_pairs, all_umis)
        summary[make_metric_name('subsampled_duplication_frac',
                                 lib_type, lib_constants.MULTI_REFS_PREFIX,
                                 ss_type, ss_depth)] = dup_frac

    with open(outs.summary, 'w') as f:
        json.dump(tk_safe_json.json_sanitize(summary), f, indent=4, sort_keys=True)
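# compute_dup_frac above is defined elsewhere in the pipeline. A minimal
# sketch of the standard definition it presumably follows: the fraction of
# read pairs that collapse onto an already-seen UMI, guarded against zero
# reads. This is an assumption for illustration, not the exact implementation.
def dup_frac_sketch(read_pairs, umis):
    if read_pairs == 0:
        return 0.0
    return 1.0 - float(umis) / float(read_pairs)

# Example: 100 read pairs collapsing to 40 UMIs means 60% of reads were
# duplicates of an existing molecule.
assert dup_frac_sketch(100, 40) == 0.6
assert dup_frac_sketch(0, 0) == 0.0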