def get_cell_barcodes(filename, ref, with_species=False):
    """Read singlecell.csv and emit barcodes"""
    scdf = pd.read_csv(filename, sep=',')
    ctg_mgr = ReferenceManager(ref)
    if not with_species:
        cell_barcodes = set()
        for species in ctg_mgr.list_species():
            species_cell_mask = scdf['is_{}_cell_barcode'.format(species)] == 1
            cell_barcodes.update(scdf[species_cell_mask]['barcode'].values.tolist())
    else:
        cell_barcodes = {}
        for species in ctg_mgr.list_species():
            species_cell_mask = scdf['is_{}_cell_barcode'.format(species)] == 1
            cell_barcodes[species] = set(scdf[species_cell_mask]['barcode'].values.tolist())
    return cell_barcodes
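# A minimal, self-contained sketch of the barcode-masking logic used by
# get_cell_barcodes(), run on a toy DataFrame instead of a real singlecell.csv
# and reference; the species name 'hg19' is an assumption for illustration only.
import pandas as pd

_toy = pd.DataFrame({
    'barcode': ['AAAC-1', 'AAAG-1', 'AAAT-1'],
    'is_hg19_cell_barcode': [1, 0, 1],
})
_mask = _toy['is_hg19_cell_barcode'] == 1
assert set(_toy[_mask]['barcode']) == {'AAAC-1', 'AAAT-1'}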
def split(args):
    ctg_mgr = ReferenceManager(args.reference_path)
    species = ctg_mgr.list_species()
    if args.filtered_peak_bc_matrix is None or len(species) > 1:
        return {'chunks': [{'__mem_gb': h5_constants.MIN_MEM_GB}]}

    chunks = []
    matrix_mem_gb = 0.
    if args.filtered_tf_bc_matrix is not None:
        matrix_mem_gb = cr_matrix.CountMatrix.get_mem_gb_from_matrix_h5(args.filtered_tf_bc_matrix) * 1.5
    matrix_mem_gb += cr_matrix.CountMatrix.get_mem_gb_from_matrix_h5(args.filtered_peak_bc_matrix)
    chunk_mem_gb = int(np.ceil(max(matrix_mem_gb, h5_constants.MIN_MEM_GB)))

    if not set(args.factorization).issubset(ALLOWED_FACTORIZATIONS):
        raise ValueError('Invalid factorization provided')

    # create a chunk for each method x clustering x cluster combination
    for method in args.factorization:
        clustering_h5 = args.clustering_summary['h5'][method]
        for key in SingleGenomeAnalysis.load_clustering_keys_from_h5(clustering_h5):
            clustering = SingleGenomeAnalysis.load_clustering_from_h5(clustering_h5, key)
            for cluster in set(clustering.clusters):
                chunks.append({
                    'method': method,
                    'clustering_key': key,
                    'cluster': cluster,
                    '__mem_gb': chunk_mem_gb,
                    '__vmem_gb': chunk_mem_gb + int(np.ceil(ctg_mgr.get_vmem_est())) + 1,
                    '__threads': 1,
                })
    return {'chunks': chunks, 'join': {'__mem_gb': 3}}
def check_reference_format(reference_path):
    """Check file formats for files present in the reference"""
    try:
        contig_manager = ReferenceManager(reference_path)
    except Exception as e:
        martian.exit("Contig manager could not be initialized, Error:\n%s" % str(e))

    # formatting
    error_msg = contig_manager.verify_contig_defs()
    if error_msg is not None:
        martian.exit(error_msg)

    # filecheck
    contig_manager.genes

    # check if motif file is in right format (naming convention)
    if len(contig_manager.list_species()) == 1:
        motif_format_checker(contig_manager.motifs)

    # checks for valid bed file formats in regions/
    faidx_file = os.path.join(reference_path, 'fasta', 'genome.fa.fai')
    bed_format_checker(contig_manager.tss_track, faidx_file)
    bed_format_checker(contig_manager.transcripts_track, faidx_file)
    bed_format_checker(contig_manager.ctcf_track, faidx_file)
    bed_format_checker(contig_manager.blacklist_track, faidx_file)
    bed_format_checker(contig_manager.dnase_track, faidx_file)
    bed_format_checker(contig_manager.enhancer_track, faidx_file)
    bed_format_checker(contig_manager.promoter_track, faidx_file)
def split(args): """Compute base background in split and use it in each chunk.""" ref_mgr = ReferenceManager(args.reference_path) npeaks = utils.quick_line_count(args.peaks) if args.peaks else 0 if len(ref_mgr.list_species() ) > 1 or npeaks == 0 or ref_mgr.motifs is None: chunk_def = [{'skip': True}] return {'chunks': chunk_def} with open(args.globalGCdict, 'r') as f: GCdict = pickle.load(f) GCdict_paths = {} GCbins = sorted(GCdict.keys()) for gc in GCbins: GCdict_paths[gc] = martian.make_path('GCdict_{}_{}'.format( gc[0], gc[1])) with open(GCdict_paths[gc], 'w') as dump: pickle.dump(GCdict[gc], dump) # write rows of each chunk to a new peak file mem_in_gb = 8 chunk_def = [{ '__mem_gb': mem_in_gb, '__vmem_gb': mem_in_gb + int(np.ceil(ref_mgr.get_vmem_est())) + 1, 'skip': False, 'GCdict': GCdict_paths[chunk] } for chunk in GCbins] return {'chunks': chunk_def}
def join(args, outs, chunk_defs, chunk_outs):
    ref_mgr = ReferenceManager(args.reference_path)
    if args.filtered_matrix is None or args.peak_motif_hits is None or len(ref_mgr.list_species()) > 1:
        outs.filtered_tf_bc_matrix = None
        outs.filtered_tf_bc_matrix_mex = None
        outs.tf_propZ_matrix = None
        return

    # motif scan is completed in ANNOTATE_PEAKS
    peaks = BedTool(args.peaks)
    motifs = Motifs(args.reference_path)
    peak_motif_hits = BedTool(args.peak_motif_hits)

    # extract peak coordinate to numerical index map
    peak_idx, n_peaks = _get_peak_indexes(peaks)

    # extract motif names to numerical index map
    motif_idx, n_motifs = _get_motif_indexes(motifs)

    # extract 3 lists: peak indexes, motif indexes and counts; each entry corresponds to a peak-motif pair
    peak_coor, motif_coor, values = motifscan_bed_to_sparse_matrix(
        peak_motif_hits, peak_idx, motif_idx, format='binary')

    # convert to a sparse matrix, default is binary format; motifs are rows and peaks are columns
    tf_peak_matrix = sp.csr_matrix((values, (motif_coor, peak_coor)),
                                   shape=(n_motifs, n_peaks), dtype='int32')

    # compute the motif-BC matrix via pooling:
    # count the number of hits for a motif inside the peaks in a barcode,
    # then cast as a CountMatrix
    peak_matrix = cr_matrix.CountMatrix.load_h5_file(args.filtered_matrix)
    motif_names = motif_idx.keys()
    barcodes = peak_matrix.bcs
    genomes = utils.generate_genome_tag(args.reference_path)
    motifs_def = atac_feature_ref.from_motif_list(motif_names, genomes)
    tf_matrix = cr_matrix.CountMatrix(motifs_def, barcodes, tf_peak_matrix * peak_matrix.m)

    # perform MAD-zscoring of proportion values
    propZ_matrix = np.array(tf_matrix.m / peak_matrix.m.sum(axis=0))
    propZ_matrix = MADzscore(propZ_matrix)

    outs.coerce_strings()

    # save to h5 and csv
    tf_matrix.save_h5_file(outs.filtered_tf_bc_matrix, sw_version=martian.get_pipelines_version())
    if not os.path.exists(outs.filtered_tf_bc_matrix_mex):
        os.mkdir(outs.filtered_tf_bc_matrix_mex)
    atac_matrix.save_mex(tf_matrix,
                         outs.filtered_tf_bc_matrix_mex,
                         feature_type=cr_lib_constants.ATACSEQ_LIBRARY_DERIVED_TYPE,
                         sw_version=martian.get_pipelines_version())
    # save propZ matrix as gz
    np.savetxt(outs.tf_propZ_matrix, propZ_matrix)
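# MADzscore() is defined elsewhere in the codebase; below is a minimal,
# self-contained sketch of MAD-based z-scoring under the assumption that it
# robustly standardizes each row (motif) using the median and the median
# absolute deviation with the usual 1.4826 consistency constant.
import numpy as np

def _mad_zscore_sketch(matrix, axis=1):
    # median and MAD per row; keepdims so broadcasting works
    med = np.median(matrix, axis=axis, keepdims=True)
    mad = np.median(np.abs(matrix - med), axis=axis, keepdims=True)
    return (matrix - med) / (1.4826 * mad)

_demo = np.array([[1.0, 2.0, 3.0, 10.0],
                  [5.0, 5.0, 6.0, 4.0]])
print(_mad_zscore_sketch(_demo))  # outliers (e.g. 10.0) get large |z|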
def generate_genome_tag(ref_path):
    """Replace empty genome name for single genomes with valid genome name"""
    # For a single species reference, use contents of <reference_path>/genome
    ref_contig_manager = ReferenceManager(ref_path)
    genomes = ref_contig_manager.list_species()
    if (len(genomes) == 1 and genomes[0] == '') or len(genomes) == 0:
        genomes = [ref_contig_manager.genome]
    return genomes
def main(args, outs):
    reference = ReferenceManager(args.reference_path)
    species_list = reference.list_species()
    is_barnyard = len(species_list) > 1 and args.singlecell is not None

    summary_data = None
    if args.summary_results:
        with open(args.summary_results, 'r') as infile:
            summary_data = json.load(infile)

    # Pull up the correct template information
    template_path = os.path.dirname(os.path.abspath(__file__))
    template_file = os.path.join(
        template_path,
        '{}{}.html'.format('barnyard' if is_barnyard else 'single',
                           '_debug' if args.debug else ''))
    with open(template_file, 'r') as infile:
        template = infile.read()

    metadata = MetricAnnotations()
    websummary_data = {
        'alarms': {'alarms': []},
        'sample': {
            'id': args.sample_id,
            'description': args.sample_desc,
            'pipeline': "Cell Ranger ATAC Reanalyzer"
        }
    }

    singlecell_df = pd.read_csv(args.singlecell) if args.singlecell is not None else None

    add_data(websummary_data,
             get_hero_metric_data(metadata, summary_data, species_list, args.debug))
    add_data(websummary_data, get_pipeline_info(args, reference, args.debug))
    add_data(websummary_data,
             get_clustering_plots(metadata, summary_data, args.analysis,
                                  args.filtered_peak_bc_matrix, species_list,
                                  singlecell_df, is_barnyard))

    # Modify plot titles for consistent styling with sample ID/description
    for key, subdata in websummary_data.iteritems():
        if "layout" in subdata:
            subdata["layout"]["title"] += '<br><sup>Sample {} - {}</sup>'.format(
                args.sample_id, args.sample_desc)
            subdata["layout"]["hovermode"] = "closest"
            subdata["config"] = PLOT_CONFIG_KWARGS

    with open(outs.web_summary, 'w') as outfile:
        summarize.generate_html_summary(websummary_data, template, template_path, outfile)
def main(args, outs):
    if args.singlecell_mapping is None or args.singlecell_targets is None or args.singlecell_cells is None:
        outs.singlecell = None
        outs.summary = None
        return

    ref = ReferenceManager(args.reference_path)
    species_list = ref.list_species()

    # Merge the input singlecell data into a single dataframe and write it out
    mapping = pd.read_csv(args.singlecell_mapping)
    cells = pd.read_csv(args.singlecell_cells)
    targeting = pd.read_csv(args.singlecell_targets)

    merged = mapping.merge(cells, how="left", on="barcode", sort=False, validate="one_to_one")
    merged["cell_id"] = merged["cell_id"].fillna("None")
    for column in merged.columns:
        if column.endswith("_cell_barcode") or \
                column.startswith("passed_filters_") or \
                column.startswith("peak_region_fragments_"):
            merged[column] = merged[column].fillna(0).astype(int)
    merged = merged.merge(targeting, how="left", on="barcode", sort=False, validate="one_to_one")

    keys = ["{}_fragments".format(region) for region in [
        "TSS", "DNase_sensitive_region", "enhancer_region",
        "promoter_region", "on_target", "blacklist_region", "peak_region"
    ]] + ["peak_region_cutsites"]
    for column in keys:
        merged[column] = merged[column].fillna(0).astype(int)
    merged.to_csv(outs.singlecell, index=None)

    summary_info = {}
    summary_info = add_bulk_targeting_metrics(summary_info, merged, species_list)
    summary_info = add_doublet_rate_metrics(summary_info, merged, species_list)
    summary_info = add_purity_metrics(summary_info, merged, species_list)
    summary_info = add_bulk_mapping_metrics(summary_info, merged, species_list)
    summary_info = add_singlecell_sensitivity_metrics(summary_info, merged, species_list)

    with open(outs.summary, 'w') as summary_file:
        summary_file.write(json.dumps(summary_info, indent=4))
def join(args, outs, chunk_defs, chunk_outs):
    ctg_mgr = ReferenceManager(args.reference_path)
    species = ctg_mgr.list_species()
    if args.filtered_peak_bc_matrix is None or len(species) > 1:
        outs.enrichment_analysis = None
        outs.enrichment_analysis_summary = {}
        return

    peak_matrix_features = cr_matrix.CountMatrix.load_feature_ref_from_h5_file(args.filtered_peak_bc_matrix)
    tf_matrix_features = cr_matrix.CountMatrix.load_feature_ref_from_h5_file(args.filtered_tf_bc_matrix) \
        if args.filtered_tf_bc_matrix is not None else None
    outs.enrichment_analysis_summary = {'h5': {}, 'csv': {}}

    # for each method, we merge h5 files and copy csv directories to one place
    cr_io.mkdir(outs.enrichment_analysis, allow_existing=True)
    for method in args.factorization:
        method_dir = os.path.join(outs.enrichment_analysis, method)
        cr_io.mkdir(method_dir, allow_existing=True)

        _h5 = os.path.join(method_dir, '{}_enrichment_h5.h5'.format(method))
        outs.enrichment_analysis_summary['h5'][method] = _h5
        chunk_h5s = []

        _csv = os.path.join(method_dir, '{}_enrichment_csv'.format(method))
        outs.enrichment_analysis_summary['csv'][method] = _csv

        diffexp_prefixes = [(fr.id, fr.name) for fr in peak_matrix_features.feature_defs]
        if args.filtered_tf_bc_matrix is not None:
            diffexp_prefixes += [(fr.id, fr.name) for fr in tf_matrix_features.feature_defs]

        clustering_h5 = args.clustering_summary['h5'][method]
        for key in SingleGenomeAnalysis.load_clustering_keys_from_h5(clustering_h5):
            chunk_outs_def_method_clustering = sorted(
                [[chunk_out, chunk_def] for chunk_out, chunk_def in zip(chunk_outs, chunk_defs)
                 if chunk_def.clustering_key == key],
                key=lambda x: x[1].cluster)
            chunk_outs_method_clustering = [c[0] for c in chunk_outs_def_method_clustering]

            # load 1 vs rest tests in sorted order of chunks and combine into one output per clustering
            diffexp = cr_diffexp.DIFFERENTIAL_EXPRESSION(
                np.hstack([np.loadtxt(com.tmp_diffexp, delimiter=',')[:, 0:3]
                           for com in chunk_outs_method_clustering]))

            # write out h5
            chunk_h5 = martian.make_path('{}_enrichment_h5.h5'.format(key))
            with analysis_io.open_h5_for_writing(chunk_h5) as f:
                cr_diffexp.save_differential_expression_h5(f, key, diffexp)
            chunk_h5s += [chunk_h5]

            # write out csv
            cr_diffexp.save_differential_expression_csv_from_features(key, diffexp, diffexp_prefixes, _csv)

        analysis_io.combine_h5_files(chunk_h5s, _h5,
                                     [analysis_constants.ANALYSIS_H5_DIFFERENTIAL_EXPRESSION_GROUP,
                                      analysis_constants.ANALYSIS_H5_MAP_DE[method]])
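# Each chunk's 1-vs-rest test contributes a (features x 3) block; the column
# semantics (e.g. mean, log2 fold-change, adjusted p-value) are an assumption
# here. Hstacking the per-cluster blocks in sorted cluster order yields the
# (features x 3*K) layout handed to DIFFERENTIAL_EXPRESSION. A toy illustration:
import numpy as np

_per_cluster = [np.full((4, 3), k, dtype=float) for k in (1, 2, 3)]
_combined = np.hstack(_per_cluster)
assert _combined.shape == (4, 9)  # 4 features, 3 clusters x 3 columns each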
def split(args): """Compute base background in split and use it in each chunk """ n_peaks = utils.quick_line_count(args.peaks) if args.peaks else 0 ref_mgr = ReferenceManager(args.reference_path) if len(ref_mgr.list_species()) > 1 or n_peaks == 0 or ref_mgr.tss_track is None: chunk_def = [{'skip': True}] return {'chunks': chunk_def} # write rows of each chunk to a new peak file mem_in_gb = 4.0 chunk_def = [{'__mem_gb': mem_in_gb, 'skip': False, 'chunk_start': chunk[0], 'chunk_end': chunk[1]} for chunk in utils.get_chunks(n_peaks, chunks=20)] return {'chunks': chunk_def}
def main(args, outs): """Run this for each method x clustering key combination from split""" ctg_mgr = ReferenceManager(args.reference_path) species = ctg_mgr.list_species() if args.filtered_peak_bc_matrix is None or len(species) > 1: return # Load the peak-BC matrix and a clustering and perform DE peak_matrix = cr_matrix.CountMatrix.load_h5_file(args.filtered_peak_bc_matrix) clustering_h5 = args.clustering_summary['h5'][args.method] clustering = SingleGenomeAnalysis.load_clustering_from_h5(clustering_h5, args.clustering_key) mask = clustering.clusters == args.cluster clustering.clusters[mask] = 1 clustering.clusters[np.logical_not(mask)] = 2 # find depth using peak matrix and normalize scale = np.array(peak_matrix.m.sum(axis=0)).squeeze() depth = (scale + 1) / np.median(scale) cov_peak = [np.log(depth)] diffexp_peak = nb2_diffexp.run_differential_expression(peak_matrix.m, clustering.clusters, model='poisson', impute_rest=True, test_params={'cov': cov_peak}, verbose=True) # find empirical estimates of alpha tf_matrix = None diffexp_tf = None # do DE on tf-BC matrix if args.filtered_tf_bc_matrix is not None: tf_matrix = cr_matrix.CountMatrix.load_h5_file(args.filtered_tf_bc_matrix) ntfmatrix = normalize_matrix(tf_matrix.m, scale) alpha_tf = nb2_diffexp.empirical_dispersion(ntfmatrix) barcode_GC = get_barcode_gc(args.reference_path, args.peaks, peak_matrix) cov_tf = [barcode_GC, np.log(depth)] diffexp_tf = nb2_diffexp.run_differential_expression(tf_matrix.m, clustering.clusters, model='nb', impute_rest=True, test_params={'cov': cov_tf, 'alpha': alpha_tf}, verbose=True) # vstack diffexp = diffexp_peak if tf_matrix is None else cr_diffexp.DIFFERENTIAL_EXPRESSION(np.vstack([diffexp_peak.data, diffexp_tf.data])) # write out temp file np.savetxt(outs.tmp_diffexp, diffexp.data, delimiter=',') outs.enrichment_analysis = None outs.enrichment_analysis_summary = None
def split(args):
    ref_mgr = ReferenceManager(args.reference_path)
    if args.filtered_matrix is None or args.peak_motif_hits is None or len(ref_mgr.list_species()) > 1:
        return {'chunks': []}

    matrix_mem_gb = cr_matrix.CountMatrix.get_mem_gb_from_matrix_h5(args.filtered_matrix)
    npeaks, nbcs, nnz = cr_matrix.CountMatrix.load_dims_from_h5(args.filtered_matrix)

    # assume we will never test more than 1000 TFs and
    # the relative hit-rate of a TF is a generous 1 out of every 10 peaks
    MAX_TF_COUNT = 1000
    MAX_TF_PEAK_SPARSITY = 0.1
    BYTES_PER_INT = np.dtype(int).itemsize
    BYTES_PER_FLOAT = np.dtype(float).itemsize
    BYTES_PER_GB = 1024**3
    ENTRIES_PER_VAL = 3
    predicted_tf_peak_matrix_mem_gb = ENTRIES_PER_VAL * MAX_TF_PEAK_SPARSITY * npeaks * MAX_TF_COUNT * BYTES_PER_INT / BYTES_PER_GB
    predicted_tf_matrix_mem_gb = ENTRIES_PER_VAL * nbcs * MAX_TF_COUNT * BYTES_PER_INT / BYTES_PER_GB
    predicted_tf_propZ_matrix_mem_gb = ENTRIES_PER_VAL * nbcs * MAX_TF_COUNT * BYTES_PER_FLOAT / BYTES_PER_GB

    chunk_mem_gb = int(
        np.ceil(
            max(matrix_mem_gb +
                predicted_tf_peak_matrix_mem_gb * 2 +
                predicted_tf_matrix_mem_gb * 2 +
                predicted_tf_propZ_matrix_mem_gb * 2,
                h5_constants.MIN_MEM_GB)))
    vmem_peak_motif_hits = int(
        np.ceil(predicted_tf_peak_matrix_mem_gb) * 3 + predicted_tf_peak_matrix_mem_gb)

    # HACK - give big jobs more threads in order to avoid overloading a node
    threads = cr_io.get_thread_request_from_mem_gb(chunk_mem_gb)

    return {
        'chunks': [],
        'join': {
            '__mem_gb': chunk_mem_gb,
            '__vmem_gb': chunk_mem_gb + vmem_peak_motif_hits + 1,
            '__threads': threads
        }
    }
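# A runnable version of the memory estimate above with toy dimensions; the
# 1000-TF cap and 0.1 hit-rate mirror the assumptions stated in the comment.
import numpy as np

_npeaks, _nbcs = 100000, 5000
_BYTES_PER_INT = np.dtype(int).itemsize
_BYTES_PER_GB = 1024.0**3
_tf_peak_gb = 3 * 0.1 * _npeaks * 1000 * _BYTES_PER_INT / _BYTES_PER_GB
print(round(_tf_peak_gb, 3))  # predicted sparse TF x peak footprint in GB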
def join(args, outs, chunk_defs, chunk_outs): """Compute base background in each peak.""" ref_mgr = ReferenceManager(args.reference_path) npeaks = utils.quick_line_count(args.peaks) if args.peaks else 0 if len(ref_mgr.list_species() ) > 1 or npeaks == 0 or ref_mgr.motifs is None: outs.GCdist = None return # get peak-GC distribution genome_fa = pyfasta.Fasta(ref_mgr.fasta, key_fn=lambda x: x.split()[0]) GCdist = [ utils.get_peak_GC_counts(peak, genome_fa, counts=False) for peak in peak_reader(args.peaks) ] # compute base background from peaks in bins # merge extreme GC bins with adjoining ones if they're too narrow for motif scanner to work correctly GCbounds = [] nbins = NBINS for n, gc in enumerate( np.percentile(GCdist, np.linspace(0, 100, nbins + 1, endpoint=True), interpolation='lower')): if n == 0 or n == nbins: GCbounds += [gc] continue if gc >= LOW_GC and gc < HIGH_GC: GCbounds += [gc] GCbins = sorted(list(set(zip(GCbounds, GCbounds[1:])))) # uniqify peaks = peak_reader(args.peaks) GCdict = get_GCbinned_peaks_and_bg(peaks, genome_fa, GCbins) # dump with open(outs.GCdict, 'w') as f: pickle.dump(GCdict, f)
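# A runnable toy of the percentile-based binning above: bin edges come from
# percentiles of the GC distribution, interior edges outside an allowed GC
# window are dropped, and consecutive edges are paired into bins. The LOW/HIGH
# window values below are illustrative, not the pipeline's constants.
import numpy as np

_gc = np.random.RandomState(0).uniform(0.2, 0.8, size=1000)
_edges = np.percentile(_gc, np.linspace(0, 100, 6), interpolation='lower')
_LOW_GC, _HIGH_GC = 0.25, 0.75  # assumed window for illustration
_kept = [g for i, g in enumerate(_edges)
         if i in (0, len(_edges) - 1) or _LOW_GC <= g < _HIGH_GC]
_bins = sorted(set(zip(_kept, _kept[1:])))
print(_bins)  # pairs of (low, high) GC bounds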
def join(args, outs, chunk_defs, chunk_outs):
    args.coerce_strings()
    outs.coerce_strings()
    if args.fragments is None:
        outs.regenerated_metrics = None
        outs.singlecell = None
        return

    target_counts_by_barcode = {}
    ref_mgr = ReferenceManager(args.reference_path)
    for chunk_in, chunk_out in zip(chunk_defs, chunk_outs):
        with open(chunk_out.target_counts_by_barcode, 'r') as infile:
            chunk_counts = pickle.load(infile)
        for barcode, barcode_counts in chunk_counts.iteritems():
            if barcode not in target_counts_by_barcode:
                target_counts_by_barcode[barcode] = barcode_counts
            else:
                for key, value in barcode_counts.iteritems():
                    if key == 'cell_id':
                        target_counts_by_barcode[barcode][key] = value
                    else:
                        target_counts_by_barcode[barcode][key] += value

    species_list = ref_mgr.list_species()
    keys = ["{region}_fragments".format(region=reg)
            for reg in ["TSS", "DNase_sensitive_region", "enhancer_region",
                        "promoter_region", "on_target", "blacklist_region", "peak_region"]] + \
           ["peak_region_cutsites", "passed_filters", "duplicate", "cell_id"] + \
           ["is_{}_cell_barcode".format(species) for species in species_list]
    if len(species_list) > 1:
        keys += ["passed_filters_{}".format(species) for species in species_list] + \
                ["peak_region_fragments_{}".format(species) for species in species_list]

    with open(outs.singlecell, 'w') as outfile:
        outfile.write("barcode,")
        outfile.write(",".join(keys))
        outfile.write("\n")
        for barcode in sorted(target_counts_by_barcode.keys()):
            outfile.write("{},".format(barcode))
            outfile.write(",".join([str(target_counts_by_barcode[barcode][key]) for key in keys]))
            outfile.write("\n")

    # write cell barcodes if uniques > 0 (i.e. subsampling didn't lose barcodes);
    # overwrite the singlecell.csv with updated cell calls
    scdf = pd.read_csv(outs.singlecell, sep=',')
    scdf['cell_id'] = np.full(len(scdf), "None")
    ctg_mgr = ReferenceManager(args.reference_path)
    for species in ctg_mgr.list_species():
        species_cell_mask = (scdf['is_{}_cell_barcode'.format(species)] >= 1) & (scdf['passed_filters'] > 0)
        scdf['is_{}_cell_barcode'.format(species)] = np.where(species_cell_mask, 1, 0)
        scdf['cell_id'][species_cell_mask] = np.array(
            ["{}_cell_{}".format(species, num) for num in xrange(np.sum(species_cell_mask))])
    scdf.to_csv(outs.singlecell, sep=',', index=False)

    cell_barcodes = get_cell_barcodes(outs.singlecell, args.reference_path, with_species=True)
    with open(outs.cell_barcodes, 'w') as f:
        for species in cell_barcodes:
            f.write(species + "," + ",".join(cell_barcodes[species]) + "\n")

    # write fragment metrics
    summary_info = {}
    summary_info = add_bulk_targeting_metrics(summary_info, scdf, species_list)
    summary_info = add_doublet_rate_metrics(summary_info, scdf, species_list)
    summary_info = add_purity_metrics(summary_info, scdf, species_list)
    summary_info = add_singlecell_sensitivity_metrics(summary_info, scdf, species_list)
    for species in species_list:
        key_suffix = "" if len(species_list) == 1 else "_{}".format(species)
        summary_info['annotated_cells{}'.format(key_suffix)] = \
            scdf['is_{}_cell_barcode'.format(species)].sum()

    with open(outs.regenerated_metrics, 'w') as summary_file:
        summary_file.write(json.dumps(summary_info, indent=4))
def join(args, outs, chunk_defs, chunk_outs):
    # Sample ID / pipestance name
    check_sample_id(args.sample_id)

    # force_cells
    check_force_cells(args.force_cells, ulimit=10000000)  # allow arbitrarily large limit for reanalyzer

    # Reference
    # ref directory structure and timestamps
    ok, msg = check_refdata(args.reference_path, max_contigs=None)
    if ok:
        martian.log_info(msg)
    else:
        martian.exit(msg)

    # formatting
    check_reference_format(args.reference_path)
    contig_manager = ReferenceManager(args.reference_path)

    # peaks format check and nonoverlapping
    if args.peaks is None:
        martian.exit("peaks file not provided")
    exists_and_readable(args.peaks, "peaks")
    bed_format_checker(args.peaks, contig_manager.fasta_index)
    contain_three_columns(args.peaks)
    if is_overlapping(args.peaks):
        martian.exit("{} contains overlapping peak regions".format(args.peaks))

    # check parameters files
    if args.parameters is not None:
        if not os.path.exists(args.parameters):
            martian.exit("{} does not exist".format(args.parameters))

    # fragments checks
    whitelist_barcodes = load_barcode_whitelist(args.barcode_whitelist)
    species_list = contig_manager.list_species()
    observed_gem_groups = set()
    observed_species = set()
    if args.fragments is None:
        martian.exit("fragments file not provided")
    exists_and_readable(args.fragments, "fragments")
    contig_lens = contig_manager.get_contig_lengths()
    # check bounds and matching contigs in reference and species
    for chrom, start, stop, bc, _ in open_fragment_file(args.fragments):
        spec = chrom.split("_")
        observed_species.add(spec[0] if spec[0] != chrom else "")
        barcode, gem_group = bc.split("-")
        observed_gem_groups.add(gem_group)
        if args.check_executables:  # run this only non-locally
            if barcode not in whitelist_barcodes:
                martian.exit("{} is not a valid whitelist barcode".format(barcode))
        if chrom not in contig_lens:
            martian.exit("contig {} not present in reference".format(chrom))
        if stop > contig_lens[chrom]:
            martian.exit("fragment {}:{}-{} boundaries exceed contig size ({} bp)"
                         .format(chrom, start, stop, contig_lens[chrom]))

    # ensure fragments are on the correct reference
    for species in observed_species:
        if species not in species_list:
            martian.exit("{} contains fragments mapped to species not recognized in the reference"
                         .format(args.fragments))

    if len(observed_gem_groups) > 1:
        martian.log_info("multiple gem groups present in {}, likely generated in a previous aggregation run"
                         .format(args.fragments))

    # fragments index is synced with fragments
    if args.fragments_index is None:
        martian.exit("fragments index file not provided")
    if not os.path.exists(args.fragments_index):
        martian.exit("{} does not exist".format(args.fragments_index))
    try:
        all_contigs = contig_manager.primary_contigs(allow_sex_chromosomes=True)
        for contig in all_contigs:
            en = 0
            for chrom, start, end, bc, dups in parsed_fragments_from_contig(
                    contig, args.fragments, index=args.fragments_index):
                if en >= FRAGMENTS_SCAN_SIZE:
                    break
                en += 1
    except Exception:
        martian.exit("fragments index is not in sync with the fragments file")

    # aggr csv checks
    if args.aggregation_csv is not None:
        check_aggr_csv(args.aggregation_csv, args.reference_path, cursory=True)

    # cell barcode checks
    if args.cell_barcodes is not None:
        if not os.path.exists(args.cell_barcodes):
            martian.exit("{} does not exist".format(args.cell_barcodes))
        check_singlecell_format(args.cell_barcodes, species_list, whitelist_barcodes)

    # Open file handles limit
    if args.check_executables:
        check_filehandle_limit()

    martian.log_info(tk_preflight.record_package_versions())
def join(args, outs, chunk_defs, chunk_outs):
    if args.fragments is None:
        outs.connect_matrix = None
        outs.gel_bead_doublet_summary = None
        outs.gel_bead_doublet_barcodes = None
        return

    with open(args.barcode_counts, "r") as infile:
        barcode_counts = Counter(json.load(infile))

    with open(chunk_defs[0].valid_barcodes, 'r') as f:
        valid_barcodes = np.array(f.readlines()[0].strip("\n").split(","))

    barcode_seqs, gem_groups = query_barcodes_and_gem_groups(valid_barcodes)
    barcode_seq_count = max([len(barcode_seqs[gg]) for gg in gem_groups])
    n_gem_groups = len(gem_groups)

    index_by_barcode = {gg: {bc: i for i, bc in enumerate(barcode_seqs[gg])}
                        for gg in gem_groups}
    index_by_gg = {gg: i for i, gg in enumerate(gem_groups)}

    connect_matrix = np.zeros((n_gem_groups, barcode_seq_count, barcode_seq_count),
                              dtype=np.uint32)

    # This can be memory intensive since each chunk's full matrix is loaded
    # before being accumulated
    for chunk_out in chunk_outs:
        with open(chunk_out.connect_matrix, "r") as infile:
            connect_matrix += np.load(infile)

    # Write out the raw matrix
    with open(outs.connect_matrix, "w") as outfile:
        for gg in gem_groups:
            outfile.write(",".join([merge_barcode_and_gem_group(bc, gg)
                                    for bc in barcode_seqs[gg]]))
            outfile.write("\n")
            for i in range(len(barcode_seqs[gg])):
                outfile.write(",".join((str(count) for count in connect_matrix[index_by_gg[gg], i, :])))
                outfile.write("\n")

    # Identify mutual nearest neighbors as putative doublets
    putative_doublets = []
    for barcode in valid_barcodes:
        bc_seq, gg = split_barcode_and_gem_group(barcode)
        gg_index = index_by_gg[gg]
        bc_index = index_by_barcode[gg][bc_seq]
        neighbor = nearest_neighbor(connect_matrix, bc_index, gg_index)
        if nearest_neighbor(connect_matrix, neighbor, gg_index) == bc_index:
            if bc_index < neighbor:
                putative_doublets.append(
                    (barcode, merge_barcode_and_gem_group(barcode_seqs[gg][neighbor], gg)))

    # Generate the exclusions. Note we write it out once per species since
    # cell calling is species-specific but these exclusions are not.
    ref = ReferenceManager(args.reference_path)
    species_list = ref.list_species()
    excluded_barcodes = {
        "label": "gel_bead_doublet",
        "data": {species: {} for species in species_list}
    }
    for pair in putative_doublets:
        if barcode_counts[pair[0]] < barcode_counts[pair[1]]:
            excluded_bc, major_bc = pair
        else:
            major_bc, excluded_bc = pair
        for species in species_list:
            excluded_barcodes["data"][species][excluded_bc] = major_bc

    with open(outs.gel_bead_doublet_barcodes, "w") as outfile:
        outfile.write(json.dumps(excluded_barcodes))

    estimated_doublet_gelbeads = len(putative_doublets)
    metrics = {"putative_gelbead_doublets_found": estimated_doublet_gelbeads}
    with open(outs.gel_bead_doublet_summary, "w") as outfile:
        outfile.write(json.dumps(metrics))
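# A self-contained toy of the mutual-nearest-neighbor test above: two barcodes
# are flagged as a putative gel bead doublet when each is the other's strongest
# off-diagonal link in the connection matrix. nearest_neighbor() is assumed to
# return the argmax over the off-diagonal entries of a row.
import numpy as np

def _nearest_neighbor_sketch(matrix, i):
    row = matrix[i].astype(float).copy()
    row[i] = -1  # exclude self-links
    return int(np.argmax(row))

_connect = np.array([[0, 9, 1],
                     [9, 0, 2],
                     [1, 2, 0]])
_i = 0
_j = _nearest_neighbor_sketch(_connect, _i)
print("{} {}".format(_j, _nearest_neighbor_sketch(_connect, _j) == _i))  # 1 True -> mutual NN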
def main(args, outs): """Compute the depth and signal per library""" # read lib_id = args.n + 1 aggr_df = pd.read_csv(args.aggr_csv, sep=',') library_info = {lib_id: {}} for label in aggr_df.columns.values.tolist(): library_info[lib_id][label] = str(aggr_df.iloc[args.n][label]) # if no normalization, don't waste compute if args.normalization is None: with open(outs.library_info, 'w') as f: pickle.dump(library_info, f) return # set ref properties ctg_mgr = ReferenceManager(args.reference_path) contig_lens = ctg_mgr.get_contig_lengths() max_contig_len = max(contig_lens.values()) curr_chrom = None count_dict = Counter() chrom_len = 1 half_window = WINDOW_SIZE // 2 # traverse fragments file and count stats fragments_f = aggr_df.iloc[args.n]['fragments'] Cuts = None special_normalization = (args.normalization in ["signal_mean", "signal_noise_threshold"]) if special_normalization: Cuts = np.zeros(max_contig_len, dtype='int32') for chrom, start, stop, bc, dups in open_fragment_file( filename=fragments_f): if chrom != curr_chrom: curr_chrom = chrom if chrom not in contig_lens: martian.exit( "fragment {}:{}-{} in {} is mapped to a contig not in the reference" .format(chrom, start, stop, fragments_f)) if special_normalization: count_dict += Counter( Cuts[i] for i in xrange(chrom_len) if Cuts[i] > 0) # only traverse chrom len Cuts[:] = 0 # reset and reuse chrom_len = contig_lens[chrom] if special_normalization: Cuts[max(0, start - half_window):min(start + half_window + 1, chrom_len)] += 1 Cuts[max(0, stop - half_window):min(stop + half_window + 1, chrom_len)] += 1 if special_normalization: count_dict += Counter(Cuts[i] for i in xrange(chrom_len) if Cuts[i] > 0) # only traverse chrom len scdf = pd.read_csv(library_info[lib_id]['cells'], sep=',') cell_mask = np.full(len(scdf), False) for species in ctg_mgr.list_species(): cell_mask |= scdf['is_{}_cell_barcode'.format(species)] == 1 library_info[lib_id]['total_fragments_per_cell'] = np.median( scdf[cell_mask]['total'] if 'total' in scdf[cell_mask].columns else scdf[cell_mask]['passed_filters'] + scdf[cell_mask]['duplicate']) library_info[lib_id]['unique_fragments_per_cell'] = np.median( scdf[cell_mask]['passed_filters']) # do peak calling fit on the count dict and get signal fit if args.normalization in ["signal_mean", "signal_noise_threshold"]: threshold, params = estimate_final_threshold(count_dict, PEAK_ODDS_RATIO) library_info[lib_id]['original_threshold'] = threshold library_info[lib_id]['signal_mean'] = 1 / params.p_signal # dump library info with open(outs.library_info, 'w') as f: pickle.dump(library_info, f)
def check_aggr_csv(aggr_csv, reference_path, cursory=False):
    """Check aggr csv has correct columns, then run progressively stronger checks on
    duplicates and formatting of files. These stronger checks are enabled by default,
    unless you want to test the basic minimum, for example in reanalyzer"""
    contig_manager = ReferenceManager(reference_path)

    # aggr_csv checks
    exists_and_readable(aggr_csv, "aggr_csv")
    if cursory:
        nlibs, library_info, msg = parse_aggr_csv(aggr_csv, whitelist=["library_id"], blacklist=None)
    else:
        nlibs, library_info, msg = parse_aggr_csv(aggr_csv)
    if msg is not None:
        martian.exit(msg)

    # At least one library should be there
    if nlibs == 0:
        martian.exit("aggregation csv does not include any library. Provide at least two libraries.")
    if cursory:
        return

    # Enable aggr(count1) to run
    if nlibs == 1:
        martian.log_info("Aggregator should be run on more than one library")

    # avoid aggr of duplicate files (assessed by filename).
    species_list = contig_manager.list_species()
    for aggr_key in library_info[1]:  # at least one library is present
        files = {}
        for lib_id in library_info:
            fname = library_info[lib_id][aggr_key]
            if fname in files:
                martian.exit("File {} already specified for a different library under {}".format(fname, aggr_key))
            files[fname] = lib_id  # record so later libraries with the same file are rejected

            # singlecell.csv should contain 'barcode' and 'is_{}_cell_barcode' columns with the correct type
            if aggr_key == "cells":
                check_singlecell_format(fname, species_list, allow_multi_gem_groups=False)

            # peaks.bed needs to be formatted correctly with the right contigs if provided in aggr.csv;
            # also check that peaks are non-overlapping
            if aggr_key == "peaks":
                exists_and_readable(fname, "peaks")
                bed_format_checker(fname, contig_manager.fasta_index)
                contain_three_columns(fname)
                if is_overlapping(fname):
                    martian.exit("{} contains overlapping peak regions".format(fname))

            # checks on fragments
            contig_lens = contig_manager.get_contig_lengths()
            if aggr_key == "fragments":
                observed_gem_groups = set()
                observed_species = set()
                exists_and_readable(fname, "fragments")
                en = 0
                for chrom, start, stop, bc, _ in open_fragment_file(fname):
                    if en >= FRAGMENTS_SCAN_SIZE:
                        break
                    spec = chrom.split("_")
                    observed_species.add(spec[0] if spec[0] != chrom else "")
                    observed_gem_groups.add(bc.split("-")[1])
                    if chrom not in contig_lens:
                        martian.exit("fragment {}:{}-{} in {} is mapped to a contig not in the reference"
                                     .format(chrom, start, stop, fname))
                    if stop > contig_lens[chrom]:
                        martian.exit("fragment {}:{}-{} boundaries exceed contig size ({} bp)"
                                     .format(chrom, start, stop, contig_lens[chrom]))
                    en += 1
                for species in observed_species:
                    if species not in species_list:
                        martian.exit("{} contains fragments mapped to species not recognized in the reference"
                                     .format(fname))
                if len(observed_gem_groups) > 1:
                    martian.exit("multiple gem groups present in {}, likely generated in a previous aggregation run"
                                 .format(fname))
def get_counts_by_barcode(reference_path, peaks, fragments, fragments_index=None, contig=None, known_cells=None):
    """Generate targeting, raw and dup counts per barcode.
    If cell identity is known, then also return that as part of the counts
    """
    def load_reference_track(track, padding=0):
        if track is not None:
            with open(track, 'r') as infile:
                regions = regtools.get_target_regions(infile, padding=padding)
        else:
            regions = None
        return regions

    def point_is_in_target(contig, position, target_regions):
        if target_regions is None:
            return False
        if contig not in target_regions:
            return False
        return target_regions[contig].contains_point(position)

    def fragment_overlaps_target(contig, start, stop, target_regions):
        if target_regions is None:
            return False
        if contig not in target_regions:
            return False
        return target_regions[contig].overlaps_region(start, stop)

    ref_manager = ReferenceManager(reference_path)

    # Load in and pad TSS/CTCF regions if present
    tss_regions = load_reference_track(ref_manager.tss_track, padding=2000)
    ctcf_regions = load_reference_track(ref_manager.ctcf_track, padding=250)

    # Load in regions from reference-associated tracks
    dnase_regions = load_reference_track(ref_manager.dnase_track)
    enhancer_regions = load_reference_track(ref_manager.enhancer_track)
    promoter_regions = load_reference_track(ref_manager.promoter_track)
    blacklist_regions = load_reference_track(ref_manager.blacklist_track)
    peak_regions = load_reference_track(peaks)

    # load cell - species map
    cell_barcodes = {}
    species_list = ref_manager.list_species()
    if known_cells is not None:
        with open(known_cells, 'r') as infile:
            for line in infile:
                items = line.strip("\n").split(",")
                for barcode in items[1:]:
                    if barcode != "null":
                        if barcode not in cell_barcodes:
                            cell_barcodes[barcode] = []
                        cell_barcodes[barcode] += [items[0]]

    # get cell index
    cell_index = {}
    spnum = {species: 0 for species in species_list}
    for species in species_list:
        for barcode in cell_barcodes:
            if species in cell_barcodes[barcode]:
                label = "{}_cell_{}".format(species, spnum[species])
                spnum[species] += 1
                cell_index[barcode] = label if barcode not in cell_index else '_'.join([cell_index[barcode], label])

    counts_by_barcode = {}
    tss_relpos = Counter()
    ctcf_relpos = Counter()
    read_count = 0
    iterator = open_fragment_file(fragments) if contig is None else \
        parsed_fragments_from_contig(contig, fragments, index=fragments_index)
    for contig, start, stop, barcode, dups in iterator:
        read_count += 2
        if barcode not in counts_by_barcode:
            counts_by_barcode[barcode] = Counter()
            if known_cells is not None:
                cell_species = cell_barcodes.get(barcode, [])
                counts_by_barcode[barcode]["cell_id"] = cell_index.get(barcode, "None")
                for species in species_list:
                    if species in cell_species:
                        counts_by_barcode[barcode]["is_{}_cell_barcode".format(species)] = 1
                    else:
                        counts_by_barcode[barcode]["is_{}_cell_barcode".format(species)] = 0

        # species splits
        if known_cells is not None and len(species_list) > 1:
            contig_species = ref_manager.species_from_contig(contig)
            counts_by_barcode[barcode]["passed_filters_{}".format(contig_species)] += 1
            if fragment_overlaps_target(contig, start, stop, peak_regions):
                counts_by_barcode[barcode]["peak_region_fragments_{}".format(contig_species)] += 1

        # raw mapping
        counts_by_barcode[barcode]["passed_filters"] += 1
        counts_by_barcode[barcode]["total"] += dups
        counts_by_barcode[barcode]["duplicate"] += dups - 1

        # Count up transposition site targeting
        for position in (start, stop):
            if point_is_in_target(contig, position, tss_regions):
                region = tss_regions[contig].get_region_containing_point(position)
                tss_relpos[region.get_relative_position(position)] += 1
            if point_is_in_target(contig, position, ctcf_regions):
                region = ctcf_regions[contig].get_region_containing_point(position)
                ctcf_relpos[region.get_relative_position(position)] += 1
            if point_is_in_target(contig, position, peak_regions):
                counts_by_barcode[barcode]["peak_region_cutsites"] += 1

        # Count up fragment overlap targeting
        is_targeted = False
        if fragment_overlaps_target(contig, start, stop, tss_regions):
            counts_by_barcode[barcode]["TSS_fragments"] += 1
            is_targeted = True
        if fragment_overlaps_target(contig, start, stop, dnase_regions):
            counts_by_barcode[barcode]["DNase_sensitive_region_fragments"] += 1
            is_targeted = True
        if fragment_overlaps_target(contig, start, stop, enhancer_regions):
            counts_by_barcode[barcode]["enhancer_region_fragments"] += 1
            is_targeted = True
        if fragment_overlaps_target(contig, start, stop, promoter_regions):
            counts_by_barcode[barcode]["promoter_region_fragments"] += 1
            is_targeted = True
        if is_targeted:
            counts_by_barcode[barcode]["on_target_fragments"] += 1
        if fragment_overlaps_target(contig, start, stop, blacklist_regions):
            counts_by_barcode[barcode]["blacklist_region_fragments"] += 1
        if fragment_overlaps_target(contig, start, stop, peak_regions):
            counts_by_barcode[barcode]["peak_region_fragments"] += 1

    return read_count, counts_by_barcode, tss_relpos, ctcf_relpos
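# regtools.get_target_regions() is assumed to return a per-contig object with
# contains_point() and overlaps_region(); below is a minimal stand-in over
# sorted, non-overlapping intervals, using bisect, to make the targeting checks
# above concrete. This is a sketch, not the library's actual implementation.
import bisect

class _RegionsSketch(object):
    def __init__(self, intervals):  # list of (start, end), sorted and disjoint
        self.starts = [s for s, _ in intervals]
        self.ends = [e for _, e in intervals]

    def contains_point(self, pos):
        i = bisect.bisect_right(self.starts, pos) - 1
        return i >= 0 and pos < self.ends[i]

    def overlaps_region(self, start, stop):
        i = bisect.bisect_right(self.starts, stop) - 1
        return i >= 0 and start < self.ends[i] and stop > self.starts[i]

_tss = _RegionsSketch([(100, 200), (500, 600)])
print("{} {}".format(_tss.contains_point(150), _tss.overlaps_region(250, 400)))  # True False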
def main(args, outs):
    reference = ReferenceManager(args.reference_path)
    species_list = reference.list_species()
    is_barnyard = len(species_list) > 1 and args.singlecell is not None

    singlecell_df = pd.read_csv(args.singlecell) if args.singlecell is not None else None

    summary_data = None
    if args.summary_results:
        with open(args.summary_results, 'r') as infile:
            summary_data = json.load(infile)

    # Pull up the correct template information
    template_path = os.path.dirname(os.path.abspath(__file__))
    template_file = os.path.join(
        template_path,
        '{}{}.html'.format('barnyard' if is_barnyard else 'single',
                           '_debug' if args.debug else ''))
    with open(template_file, 'r') as infile:
        template = infile.read()

    metadata = MetricAnnotations()
    websummary_data = {
        'alarms': {'alarms': []},
        'sample': {
            'id': args.sample_id,
            'description': args.sample_desc,
            'pipeline': "Cell Ranger ATAC"
        }
    }

    # Pull out all the general-purpose information
    add_data(websummary_data,
             get_hero_metric_data(metadata, summary_data, species_list, args.debug))
    add_data(websummary_data, get_pipeline_info(args, reference, args.debug))
    add_data(websummary_data,
             get_sequencing_info(metadata, summary_data, species_list, args.debug))
    add_data(websummary_data,
             get_cell_metrics_data(metadata, summary_data, species_list,
                                   singlecell_df, args.excluded_barcodes, args.debug))
    add_data(websummary_data,
             get_clustering_plots(args.analysis, args.filtered_peak_bc_matrix,
                                  species_list, singlecell_df, is_barnyard))
    add_data(websummary_data,
             get_insertsize_data(metadata, summary_data, singlecell_df,
                                 args.insert_sizes, species_list, args.debug))
    add_data(websummary_data,
             get_targeting_data(metadata, summary_data, species_list, singlecell_df,
                                args.tss_relpos, args.ctcf_relpos, args.debug,
                                DOWNSAMPLE_TARGETING))
    add_data(websummary_data,
             get_complexity_data(metadata, summary_data, args.bulk_complexity,
                                 args.singlecell_complexity, species_list, args.debug))

    # For barnyard samples only
    if is_barnyard:
        add_data(websummary_data,
                 get_barnyard_data(metadata, summary_data, species_list,
                                   singlecell_df, args.debug, DOWNSAMPLE_BARNYARD))

    # For PD runs only
    if args.debug:
        add_data(websummary_data,
                 get_peakcalling_data(metadata, summary_data, species_list, args.debug))
        add_data(websummary_data,
                 get_wasted_data(metadata, summary_data, singlecell_df, species_list, args.debug))
        add_data(websummary_data,
                 get_master_table(metadata, summary_data, species_list, is_barnyard, args.debug))

    # Modify plot titles for consistent styling with sample ID/description
    for key, subdata in websummary_data.iteritems():
        if "layout" in subdata:
            subdata["layout"]["title"] += '<br><sup>Sample {} - {}</sup>'.format(
                args.sample_id, args.sample_desc)
            subdata["layout"]["hovermode"] = "closest"
            subdata["config"] = PLOT_CONFIG_KWARGS

    with open(outs.web_summary, 'w') as outfile:
        summarize.generate_html_summary(websummary_data, template, template_path, outfile)
def main(args, outs):
    metrics = {}
    for fname in args.metrics:
        if fname is not None:
            with open(fname, 'r') as f:
                metrics.update(json.load(f))

    # Normalize "NaN" values
    for key in metrics:
        value = metrics[key]
        if str(value) == 'NaN' or (isinstance(value, float) and np.isnan(value)):
            metrics[key] = None

    # add version info
    metrics['cellranger-atac_version'] = martian.get_pipelines_version()

    if len(metrics) > 0:
        martian.log_info('Writing out summary_metrics')
        with open(outs.metrics, 'w') as outfile:
            outfile.write(tenkit.safe_json.safe_jsonify(metrics, pretty=True))

    # compile summary.csv metrics:
    # load library info and treat libraries as species for metric compilation
    metric_registry = MetricAnnotations()
    metrics_csv_dict = {}
    if args.library_info is not None:
        with open(args.library_info, 'r') as f:
            library_info = pickle.load(f)
        library_list = [library_info[n]['library_id'] for n in library_info.keys()]
        metrics_csv_dict.update(
            metric_registry.compile_summary_metrics(metrics, species_list=library_list))

    # load species level metrics
    ctg_mgr = ReferenceManager(args.reference_path)
    metrics_csv_dict.update(
        metric_registry.compile_summary_metrics(metrics, species_list=ctg_mgr.list_species()))
    write_dict_to_csv(outs.metrics_csv, metrics_csv_dict, sort=True)
def join(args, outs, chunk_defs, chunk_outs):
    if args.fragments is None:
        outs.barcode_multiplets = None
        outs.barcode_multiplets_summary = None
        return

    with open(args.barcode_counts, "r") as infile:
        barcode_counts = Counter(json.load(infile))

    valid_barcodes = barcode_counts.keys()
    part_a_seqs, part_c_seqs, part_b_seqs, gem_group_seqs = query_barcode_subsequences(valid_barcodes)

    part_a_count = max([len(part_a_seqs[c]) for c in part_c_seqs])
    part_b_count = max([len(part_b_seqs[c]) for c in part_c_seqs])
    part_c_count = len(part_c_seqs)

    index_by_part_a = {part_c: {part_a: i for i, part_a in enumerate(part_a_seqs[part_c])}
                       for part_c in part_c_seqs}
    index_by_part_b = {part_c: {part_b: i for i, part_b in enumerate(part_b_seqs[part_c])}
                       for part_c in part_c_seqs}
    index_by_part_c = {part_c: i for i, part_c in enumerate(part_c_seqs)}

    part_a_linkage_matrix = np.zeros(
        (part_c_count, part_b_count, part_a_count, part_a_count), dtype=np.uint32)
    part_b_linkage_matrix = np.zeros(
        (part_c_count, part_a_count, part_b_count, part_b_count), dtype=np.uint32)

    # Search for contaminants as barcodes with higher similarity to a major barcode
    # with some minimum signal than self-similarity.
    barcode_multiplets = {}

    # group chunks by gem group and aggregate across contigs for post-processing
    for gem_group_seq in gem_group_seqs:
        part_a_linkage_matrix[:, :, :, :] = 0
        part_b_linkage_matrix[:, :, :, :] = 0
        for chunk_in, chunk_out in zip(chunk_defs, chunk_outs):
            if gem_group_seq != chunk_in.gem_group:
                continue
            # aggregate across contigs
            infile = gzip.GzipFile(chunk_out.part_a_linkage_matrix, 'r')
            part_a_linkage_matrix += np.load(infile)
            infile.close()
            infile = gzip.GzipFile(chunk_out.part_b_linkage_matrix, 'r')
            part_b_linkage_matrix += np.load(infile)
            infile.close()

        for major_barcode, count in barcode_counts.iteritems():
            if count < MINIMUM_COUNT:
                continue
            part_a, part_c, part_b, gem_group = split_barcode(major_barcode, return_gg=True)
            if gem_group != gem_group_seq:
                continue
            part_a_index = index_by_part_a[part_c][part_a]
            part_b_index = index_by_part_b[part_c][part_b]
            part_c_index = index_by_part_c[part_c]

            for other_part_a in part_a_seqs[part_c]:
                if other_part_a == part_a:
                    continue
                minor_barcode = merge_barcode(other_part_a, part_c, part_b, gem_group)
                other_part_a_index = index_by_part_a[part_c][other_part_a]
                self_signal = part_a_linkage_matrix[
                    part_c_index, part_b_index, other_part_a_index, other_part_a_index]
                major_signal = part_a_linkage_matrix[
                    part_c_index, part_b_index, other_part_a_index, part_a_index]
                if major_signal > (self_signal * SELF_SIGNAL_THRESHOLD_MULTIPLIER):
                    if minor_barcode not in barcode_multiplets:
                        barcode_multiplets[minor_barcode] = major_barcode
                    else:
                        old_major = barcode_multiplets[minor_barcode]
                        old_a, _, _ = split_barcode(old_major)
                        old_a_index = index_by_part_a[part_c][old_a]
                        old_signal = part_a_linkage_matrix[
                            part_c_index, part_b_index, other_part_a_index, old_a_index]
                        if major_signal > old_signal:
                            barcode_multiplets[minor_barcode] = major_barcode

            for other_part_b in part_b_seqs[part_c]:
                if other_part_b == part_b:
                    continue
                minor_barcode = merge_barcode(part_a, part_c, other_part_b, gem_group)
                other_part_b_index = index_by_part_b[part_c][other_part_b]
                self_signal = part_b_linkage_matrix[
                    part_c_index, part_a_index, other_part_b_index, other_part_b_index]
                major_signal = part_b_linkage_matrix[
                    part_c_index, part_a_index, other_part_b_index, part_b_index]
                if major_signal > (self_signal * SELF_SIGNAL_THRESHOLD_MULTIPLIER):
                    if minor_barcode not in barcode_multiplets:
                        barcode_multiplets[minor_barcode] = major_barcode
                    else:
                        old_major = barcode_multiplets[minor_barcode]
                        _, _, old_b = split_barcode(old_major)
                        old_b_index = index_by_part_b[part_c][old_b]
                        old_signal = part_b_linkage_matrix[
                            part_c_index, part_a_index, other_part_b_index, old_b_index]
                        if major_signal > old_signal:
                            barcode_multiplets[minor_barcode] = major_barcode

    # Post-screen the contaminants for pairs that are linked to each other. In that
    # case, remove the pair where we've excluded the larger barcode
    for minor_barcode in barcode_multiplets.keys():
        if minor_barcode not in barcode_multiplets:
            # Because we've popped it off before we got here
            continue
        major_barcode = barcode_multiplets[minor_barcode]
        if major_barcode in barcode_multiplets and barcode_multiplets[major_barcode] == minor_barcode:
            if barcode_counts[major_barcode] > barcode_counts[minor_barcode]:
                barcode_multiplets.pop(major_barcode)
            else:
                barcode_multiplets.pop(minor_barcode)

    # Post-screen barcode multiplets for those where the major barcode is itself
    # linked to another barcode
    for minor_barcode, major_barcode in barcode_multiplets.iteritems():
        if major_barcode in barcode_multiplets:
            major_barcode = barcode_multiplets[major_barcode]
            barcode_multiplets[minor_barcode] = major_barcode

    # Generate the exclusions. Note we write it out once per species since
    # cell calling is species-specific but these exclusions are not.
    ref = ReferenceManager(args.reference_path)
    species_list = ref.list_species()
    excluded_barcodes = {
        "label": "whitelist_contam",
        "data": {species: barcode_multiplets for species in species_list}
    }
    with open(outs.barcode_multiplets, "w") as outfile:
        outfile.write(json.dumps(excluded_barcodes))

    # Generate some reporting metrics
    summary_metrics = {
        "putative_barcode_multiplets_found": len(barcode_multiplets),
    }
    with open(outs.barcode_multiplets_summary, "w") as outfile:
        outfile.write(json.dumps(summary_metrics))
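# A toy of the contaminant test above: a minor barcode is linked to a major
# barcode when its signal against the major exceeds its self-signal by the
# multiplier. SELF_SIGNAL_THRESHOLD_MULTIPLIER's real value lives elsewhere in
# the codebase; 2.0 below is illustrative only.
_SELF_SIGNAL_THRESHOLD_MULTIPLIER = 2.0
_self_signal, _major_signal = 10, 25
print(_major_signal > _self_signal * _SELF_SIGNAL_THRESHOLD_MULTIPLIER)  # True -> flag as multiplet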
def join(args, outs, chunk_defs, chunk_outs):
    args.coerce_strings()
    outs.coerce_strings()
    if args.fragments is None:
        outs.cell_barcodes = None
        outs.cell_calling_summary = None
        outs.singlecell = None
        return

    if args.excluded_barcodes is not None:
        with open(args.excluded_barcodes, 'r') as infile:
            excluded_barcodes = json.load(infile)
    else:
        excluded_barcodes = None

    # Merge the chunk inputs
    ref = ReferenceManager(args.reference_path)
    species_list = ref.list_species()

    barcode_counts_by_species = {species: Counter() for species in species_list}
    targeted_counts_by_species = {species: Counter() for species in species_list}

    fragment_depth = 0
    for chunk_in, chunk_out in zip(chunk_defs, chunk_outs):
        species = ref.species_from_contig(chunk_in.contig)
        with open(chunk_out.barcode_counts, 'r') as infile:
            barcode_counts_by_species[species] += pickle.load(infile)
        with open(chunk_out.targeted_counts, 'r') as infile:
            targeted_counts_by_species[species] += pickle.load(infile)
        fragment_depth += chunk_out.fragment_depth
    print('Total fragments across all chunks: {}'.format(fragment_depth))

    barcodes = list({bc for species in species_list
                     for bc in barcode_counts_by_species[species]})
    # guard against excluded_barcodes being None (no exclusions provided)
    if excluded_barcodes is None:
        non_excluded_barcodes = {species: barcodes for species in species_list}
    else:
        non_excluded_barcodes = {
            species: [bc for bc in barcodes if bc not in excluded_barcodes[species]]
            for species in species_list
        }
    print('Total barcodes observed: {}'.format(len(barcodes)))

    retained_counts = {}
    for species in species_list:
        if excluded_barcodes is None:
            retained_counts[species] = np.array(
                [targeted_counts_by_species[species][bc] for bc in barcodes])
        else:
            retained_counts[species] = np.array([
                targeted_counts_by_species[species][bc]
                for bc in barcodes if bc not in excluded_barcodes[species]
            ])
            print('Barcodes excluded for species {}: {}'.format(
                species, len(excluded_barcodes[species])))
            print('Barcodes remaining for species {}: {}'.format(
                species, len(non_excluded_barcodes[species])))

    parameters = {}

    whitelist_length = len(load_barcode_whitelist(args.barcode_whitelist))
    count_shift = max(MINIMUM_COUNT,
                      int(fragment_depth * WHITELIST_CONTAM_RATE / whitelist_length))
    print('Count shift for whitelist contamination: {}'.format(count_shift))

    for (species, count_data) in retained_counts.iteritems():
        print('Analyzing species {}'.format(species))
        # Subtract the count shift from all counts to remove the effects of whitelist contamination
        shifted_data = count_data[count_data >= count_shift] - count_shift
        print('Number of barcodes analyzed: {}'.format(len(shifted_data)))
        count_dict = Counter(shifted_data)
        parameters[species] = {}

        forced_cell_count = None
        if args.force_cells is not None:
            if species in args.force_cells:
                forced_cell_count = int(args.force_cells[species])
            elif "default" in args.force_cells:
                forced_cell_count = int(args.force_cells["default"])
            if forced_cell_count > MAXIMUM_CELLS_PER_SPECIES:
                forced_cell_count = MAXIMUM_CELLS_PER_SPECIES
                martian.log_info('Attempted to force cells to {}. Overriding to maximum allowed cells.'
                                 .format(forced_cell_count))

        # Initialize parameters to empty
        parameters[species]['noise_mean'] = None
        parameters[species]['noise_dispersion'] = None
        parameters[species]['signal_mean'] = None
        parameters[species]['signal_dispersion'] = None
        parameters[species]['fraction_noise'] = None
        parameters[species]['cell_threshold'] = None
        parameters[species]['goodness_of_fit'] = None
        parameters[species]['estimated_cells_present'] = 0

        # Corner case where FRIP is 0 because the number of peaks is tiny (fuzzer tests)
        if len(count_dict) < 10:
            parameters[species]['cells_detected'] = 0
            forced_cell_count = None
        elif forced_cell_count is None:
            print('Estimating parameters')
            fitted_params = estimate_parameters(count_dict)
            signal_threshold = estimate_threshold(fitted_params, CELL_CALLING_THRESHOLD) + count_shift
            print('Primary threshold: {}'.format(signal_threshold))

            parameters[species]['noise_mean'] = fitted_params.mu_noise
            parameters[species]['noise_dispersion'] = fitted_params.alpha_noise
            parameters[species]['signal_mean'] = fitted_params.mu_signal
            parameters[species]['signal_dispersion'] = fitted_params.alpha_signal
            parameters[species]['fraction_noise'] = fitted_params.frac_noise
            parameters[species]['cell_threshold'] = signal_threshold
            parameters[species]['goodness_of_fit'] = goodness_of_fit(shifted_data, fitted_params)

            called_cell_count = np.sum(count_data >= signal_threshold)
            parameters[species]['cells_detected'] = called_cell_count
            parameters[species]['estimated_cells_present'] = int(
                (1 - fitted_params.frac_noise) * len(shifted_data))
            if called_cell_count > MAXIMUM_CELLS_PER_SPECIES:
                # Abort the model fitting and instead force cells to the maximum
                forced_cell_count = MAXIMUM_CELLS_PER_SPECIES

        if forced_cell_count is not None:
            print('Forcing cells to {}'.format(forced_cell_count))
            if forced_cell_count <= 0:
                raise ValueError("Force cells must be positive")
            else:
                adj_data = shifted_data[shifted_data > 0]
                print('Total barcodes considered for forcing cells: {}'.format(len(adj_data)))
                parameters[species]['cell_threshold'] = min(adj_data) if forced_cell_count >= len(adj_data) else \
                    sorted(adj_data, reverse=True)[forced_cell_count - 1]
                parameters[species]['cell_threshold'] += count_shift
                parameters[species]['cells_detected'] = np.sum(
                    count_data >= parameters[species]['cell_threshold'])

    # For barnyard samples, mask out the noise distribution and re-fit to get cleaner separation
    if len(retained_counts) == 2 and (args.force_cells is None or not args.force_cells):
        print('Estimating secondary thresholds')
        sp1, sp2 = species_list

        # fall back to -1 (no filtering) when a primary threshold was not estimated
        sp1_threshold = parameters[sp1]['cell_threshold'] if parameters[sp1]['cell_threshold'] is not None else -1
        sp2_threshold = parameters[sp2]['cell_threshold'] if parameters[sp2]['cell_threshold'] is not None else -1

        if parameters[sp1]['cell_threshold'] is not None:
            sp1_counts = np.array([
                targeted_counts_by_species[sp1][bc]
                for bc in non_excluded_barcodes[sp1]
                if (targeted_counts_by_species[sp1][bc] > sp1_threshold) and
                   (targeted_counts_by_species[sp2][bc] > sp2_threshold)
            ])
            sp1_params = estimate_parameters(Counter(sp1_counts), threshold=sp1_threshold)
            if not np.isnan(sp1_params.frac_noise):
                parameters[sp1]['cell_threshold'] = max(sp1_threshold, estimate_threshold(sp1_params, 20))
                parameters[sp1]['cells_detected'] = np.sum(sp1_counts >= parameters[sp1]['cell_threshold'])
            else:
                parameters[sp1]['cells_detected'] = 0

        if parameters[sp2]['cell_threshold'] is not None:
            sp2_counts = np.array([
                targeted_counts_by_species[sp2][bc]
                for bc in non_excluded_barcodes[sp2]
                if (targeted_counts_by_species[sp1][bc] > sp1_threshold) and
                   (targeted_counts_by_species[sp2][bc] > sp2_threshold)
            ])
            sp2_params = estimate_parameters(Counter(sp2_counts), threshold=sp2_threshold)
            if not np.isnan(sp2_params.frac_noise):
                parameters[sp2]['cell_threshold'] = max(sp2_threshold, estimate_threshold(sp2_params, 20))
                parameters[sp2]['cells_detected'] = np.sum(sp2_counts >= parameters[sp2]['cell_threshold'])
            else:
                parameters[sp2]['cells_detected'] = 0

        print('Secondary threshold ({}): {}'.format(sp1, parameters[sp1]['cell_threshold']))
        print('Secondary threshold ({}): {}'.format(sp2, parameters[sp2]['cell_threshold']))

    print('Writing out cell barcodes')
    cell_barcodes = {}
    for (species, count_data) in retained_counts.iteritems():
        threshold = parameters[species]['cell_threshold']
        cell_barcodes[species] = {}
        print('Cell threshold for species {}: {}'.format(species, threshold))
        if threshold is not None:
            for count, barcode in zip(count_data, non_excluded_barcodes[species]):
                if count >= threshold:
                    print('{} - Total {}, Targeted {}, Count {}, Threshold {}'.format(
                        barcode, barcode_counts_by_species[species][barcode],
                        targeted_counts_by_species[species][barcode], count, threshold))
                    cell_barcodes[species][barcode] = count
        if len(cell_barcodes[species]) != parameters[species]['cells_detected']:
            print(len(cell_barcodes[species]), parameters[species]['cells_detected'])
            raise ValueError('Mismatch in called cells identified - failure in threshold setting')
        print('Selected {} barcodes of species {}'.format(len(cell_barcodes[species]), species))

    with open(outs.cell_barcodes, 'w') as outfile:
        # low mem reduce op to merge-sort bcs across species
        for species in cell_barcodes.keys():
            outfile.write(species + ",")
            outfile.write(",".join(cell_barcodes[species]) + "\n")

    cell_index = compute_cell_index(species_list, cell_barcodes)

    with open(outs.singlecell, 'w') as outfile:
        outfile.write("barcode,cell_id,")
        outfile.write(",".join(["is_{}_cell_barcode".format(species) for species in species_list]))
        if len(species_list) > 1:
            for species in species_list:
                outfile.write(",passed_filters_{}".format(species))
                outfile.write(",peak_region_fragments_{}".format(species))
        outfile.write("\n")
        for barcode in [NO_BARCODE] + sorted(barcodes):
            outfile.write("{},".format(barcode))
            outfile.write("{},".format(cell_index.get(barcode, "None")))
            values = [str(int(species in cell_barcodes and barcode in cell_barcodes[species]))
                      for species in species_list]
            outfile.write(",".join(values))
            if len(species_list) > 1:
                for species in species_list:
                    outfile.write(",{:d}".format(barcode_counts_by_species[species][barcode]))
                    outfile.write(",{:d}".format(targeted_counts_by_species[species][barcode]))
            outfile.write("\n")

    # process data into summary metrics
    summary_info = {}
    summary_info.update(generate_cell_calling_metrics(parameters, cell_barcodes))
    summary_info.update(generate_gb_metrics(cell_barcodes, excluded_barcodes))

    with open(outs.cell_calling_summary, 'w') as outfile:
        outfile.write(json.dumps(summary_info, indent=4))
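# A runnable toy of the count-shift logic above: a depth-scaled constant is
# subtracted from every barcode's count so that low-level whitelist
# contamination does not inflate the noise component before model fitting.
# The numbers below are illustrative only.
import numpy as np
from collections import Counter

_counts = np.array([1, 2, 3, 50, 60, 200])
_count_shift = 2
_shifted = _counts[_counts >= _count_shift] - _count_shift
print(Counter(int(c) for c in _shifted))  # barcodes below the shift drop out entirely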
def join(args, outs, chunk_defs, chunk_outs):
    args.coerce_strings()
    outs.coerce_strings()
    if args.fragments is None:
        outs.low_targeting_barcodes = None
        outs.low_targeting_summary = None
        return

    # Merge the chunk inputs
    ref = ReferenceManager(args.reference_path)
    species_list = ref.list_species()

    barcode_counts_by_species = {species: Counter() for species in species_list}
    targeted_counts_by_species = {species: Counter() for species in species_list}
    peak_bp_by_species = {species: 0 for species in species_list}
    genome_bp_by_species = {species: 0 for species in species_list}
    fragment_lengths = {padding: Counter() for padding in PADDING_VALUES}
    covered_bases = {padding: Counter() for padding in PADDING_VALUES}

    for chunk_in, chunk_out in zip(chunk_defs, chunk_outs):
        species = ref.species_from_contig(chunk_in.contig)
        with open(chunk_out.fragment_counts, "r") as infile:
            barcode_counts_by_species[species] += pickle.load(infile)
        with open(chunk_out.targeted_counts, "r") as infile:
            targeted_counts_by_species[species] += pickle.load(infile)
        with open(chunk_out.fragment_lengths, "r") as infile:
            data = pickle.load(infile)
            for padding in PADDING_VALUES:
                fragment_lengths[padding] += data[padding]
        with open(chunk_out.covered_bases, "r") as infile:
            data = pickle.load(infile)
            for padding in PADDING_VALUES:
                covered_bases[padding] += data[padding]
        peak_bp_by_species[species] += chunk_out.peak_coverage
        genome_bp_by_species[species] += ref.contig_lengths[chunk_in.contig]

    # explicit float conversion guards against integer division
    frac_genome_in_peaks_by_species = {
        species: float(peak_bp_by_species[species]) / genome_bp_by_species[species]
        for species in species_list
    }

    # Identify barcodes that have lower fraction of reads overlapping peaks than the
    # genomic coverage of the peaks
    low_targeting_barcodes = {
        "label": "low_targeting",
        "data": {species: {} for species in species_list}
    }
    for species in species_list:
        for barcode, total_count in barcode_counts_by_species[species].iteritems():
            barcode_frac_peaks = float(targeted_counts_by_species[species][barcode]) / total_count
            if barcode_frac_peaks < frac_genome_in_peaks_by_species[species]:
                low_targeting_barcodes["data"][species][barcode] = barcode_frac_peaks

    # Sum up the total fragment counts per barcode across all species
    total_barcode_counts = Counter()
    for species, barcode_counts in barcode_counts_by_species.iteritems():
        total_barcode_counts += barcode_counts
    with open(outs.barcode_counts, "w") as outfile:
        outfile.write(json.dumps(total_barcode_counts, indent=4))

    summary_data = {}
    for species in species_list:
        key_suffix = "" if len(species_list) == 1 else "_{}".format(species)
        summary_data["number_of_low_targeting_barcodes{}".format(key_suffix)] = \
            len(low_targeting_barcodes["data"][species])
        summary_data["fraction_of_genome_within_{}bp_of_peaks{}".format(DISTANCE, key_suffix)] = \
            frac_genome_in_peaks_by_species[species]
    with open(outs.low_targeting_summary, "w") as outfile:
        outfile.write(json.dumps(summary_data, indent=4))
    with open(outs.low_targeting_barcodes, "w") as outfile:
        outfile.write(json.dumps(low_targeting_barcodes, indent=4))
    with open(outs.fragment_lengths, "w") as outfile:
        outfile.write(json.dumps(fragment_lengths, indent=4))
    with open(outs.covered_bases, "w") as outfile:
        outfile.write(json.dumps(covered_bases, indent=4))
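# A runnable toy of the low-targeting test above: a barcode whose fraction of
# fragments in peaks falls below the genome-wide fraction of bases in peaks is
# flagged, since that is roughly what uniform background scatter would produce.
_frac_genome_in_peaks = 0.05
_per_barcode = {'AAAC-1': (100, 2), 'AAAG-1': (100, 30)}  # barcode -> (total, in peaks)
_flagged = [bc for bc, (total, targeted) in _per_barcode.items()
            if float(targeted) / total < _frac_genome_in_peaks]
print(_flagged)  # ['AAAC-1']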
def main(args, outs):
    reference = ReferenceManager(args.reference_path)

    martian.log_info('Writing analysis parameters')
    write_analysis_parameters(outs.analysis_params)

    martian.log_info('Initializing summary metrics')
    summary_metrics = {}
    summary_metrics = simple_load_metrics(summary_metrics, args.basic_results)

    if args.singlecell_results is not None:
        martian.log_info('Loading single cell results')
        summary_metrics = simple_load_metrics(summary_metrics, args.singlecell_results)
    if args.insert_summary is not None:
        martian.log_info('Loading insert summary')
        summary_metrics = simple_load_metrics(summary_metrics, args.insert_summary)
    if args.complexity_summary is not None:
        martian.log_info('Loading complexity summary')
        summary_metrics = simple_load_metrics(summary_metrics, args.complexity_summary)
    if args.error_results_summary is not None:
        martian.log_info('Loading error summary')
        summary_metrics = simple_load_metrics(summary_metrics, args.error_results_summary)
    if args.downsample_info is not None:
        martian.log_info('Loading downsampling information')
        summary_metrics = simple_load_metrics(summary_metrics, args.downsample_info)
    if args.contam_results is not None:
        martian.log_info('Loading contamination results')
        summary_metrics = simple_load_metrics(summary_metrics, args.contam_results)
    if args.peak_results is not None:
        martian.log_info('Loading peak results')
        summary_metrics = simple_load_metrics(summary_metrics, args.peak_results)
    if args.enrichment_results is not None:
        martian.log_info('Loading TSS and CTCF scores')
        summary_metrics = simple_load_metrics(summary_metrics, args.enrichment_results)
    if args.cell_calling_summary is not None:
        martian.log_info('Loading cell calling parameters')
        summary_metrics = simple_load_metrics(summary_metrics, args.cell_calling_summary)

    # Normalize "NaN" values
    for key in summary_metrics:
        value = summary_metrics[key]
        if str(value) == 'NaN' or (isinstance(value, float) and np.isnan(value)):
            summary_metrics[key] = None

    if reference.metadata:
        # If we have reference metadata, copy the data over to summary.json
        for (key, value) in reference.metadata.items():
            summary_metrics["reference_" + key] = value

    martian.log_info('Writing out summary_metrics')
    with open(outs.summary, 'w') as outfile:
        outfile.write(tenkit.safe_json.safe_jsonify(summary_metrics, pretty=True))

    # compile summary.csv metrics
    metric_registry = MetricAnnotations()
    species_list = reference.list_species()
    summary_csv_dict = metric_registry.compile_summary_metrics(summary_metrics,
                                                               species_list=species_list)
    write_dict_to_csv(outs.summary_csv, summary_csv_dict, sort=True)