def join(args, outs, chunk_defs, chunk_outs):
    outs.coerce_strings()

    # Concatenate chunks
    if len(chunk_outs) == 1:
        subprocess.call(['mv', chunk_outs[0].phased_possorted_bam, outs.phased_possorted_bam])
    else:
        tk_bam.concatenate(outs.phased_possorted_bam,
                           [out.phased_possorted_bam for out in chunk_outs])
    tk_bam.index(outs.phased_possorted_bam)
    outs.phased_possorted_bam_index = outs.phased_possorted_bam + ".bai"

    total_reads = 0
    phased_reads = 0
    molecule_tagged_reads = 0
    for chunk_out in chunk_outs:
        total_reads += chunk_out.total_reads
        phased_reads += chunk_out.phased_reads
        molecule_tagged_reads += chunk_out.molecule_tagged_reads
    outs.total_reads = total_reads
    outs.phased_reads = phased_reads
    outs.molecule_tagged_reads = molecule_tagged_reads

    fract_reads_phased = tk_stats.robust_divide(float(phased_reads), float(total_reads))
    fract_reads_molecule_id = tk_stats.robust_divide(float(molecule_tagged_reads), float(total_reads))

    stats = {
        "fract_reads_phased": fract_reads_phased,
        "fract_reads_molecule_id": fract_reads_molecule_id,
    }
    with open(outs.summary, 'w') as summary_file:
        json.dump(tenkit.safe_json.json_sanitize(stats), summary_file)
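# tk_stats.robust_divide is used throughout this section. It is assumed here to
# be a division helper that tolerates a zero denominator instead of raising; a
# minimal stand-in with that assumed behavior (not the tenkit source):
def _robust_divide_sketch(numerator, denominator):
    """Return numerator/denominator as a float, or NaN when denominator is 0."""
    if denominator == 0:
        return float('NaN')
    return float(numerator) / float(denominator)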
def evaluate_snp_cluster_calls(cluster_assignment, thresholded_calls, actual):
    """
    Args:
    - cluster_assignment: list(int)
    - thresholded_calls: list(int), None if no call
    - actual: list(int)
    """
    cluster_assignment = np.array(cluster_assignment, dtype=int)
    actual = np.array(actual, dtype=int)

    # The minor class is the complement of the modal (majority) class
    minor_called_class = 1 - sp_stats.mode(cluster_assignment).mode[0]
    minor_actual_class = 1 - sp_stats.mode(actual).mode[0]

    # Only evaluate loci where a call was made
    was_called = np.array([x is not None for x in thresholded_calls])
    called_pos = (cluster_assignment == minor_called_class)[was_called]
    actual_pos = (actual == minor_actual_class)[was_called]

    nc = sum(np.logical_not(was_called))
    tp = sum(called_pos & actual_pos)
    tn = sum(np.logical_not(called_pos) & np.logical_not(actual_pos))
    fp = sum(called_pos & np.logical_not(actual_pos))
    fn = sum(np.logical_not(called_pos) & actual_pos)

    return {
        'tp': tp,
        'tn': tn,
        'fp': fp,
        'fn': fn,
        'sensitivity': tk_stats.robust_divide(tp, tp + fn),
        'ppv': tk_stats.robust_divide(tp, tp + fp),
        'no_call_rate': tk_stats.robust_divide(nc, len(actual)),
    }
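# A small worked example of evaluate_snp_cluster_calls (hypothetical inputs):
# five loci, one of which was not called; the minor class in both the calls
# and the truth is 1.
#
#   evaluate_snp_cluster_calls(cluster_assignment=[0, 0, 0, 1, 0],
#                              thresholded_calls=[0, 0, None, 1, 0],
#                              actual=[0, 0, 0, 1, 1])
#
# yields tp=1, tn=2, fp=0, fn=1, sensitivity=0.5, ppv=1.0, no_call_rate=0.2.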
def checkOverlap(self, chrom, start, end):
    if start > end:
        raise Exception("errRegionFormat",
                        "the stop position is smaller than the start position " +
                        " ".join([str(start), str(end)]))
    # self.content (database_bed) is the object returned by read_database_bed.
    # Returns an OverlapInfo with:
    # 1. True or False for finding or not finding an overlap
    # 2. Total overlapping base pairs
    # 3. The fraction of the query that is overlapped
    # 4. The largest overlap fraction among the overlapped database regions
    # 5. The size of the largest overlapped database region
    # 6. The number of regions overlapped
    # plus the track name.
    if chrom not in self.content:
        return OverlapInfo(False, 0, 0, 0, 0, 0, self.name)

    overlapping_regions = self.content[chrom].overlapping_regions(start, end)
    if len(overlapping_regions) == 0:
        return OverlapInfo(False, 0, 0, 0, 0, 0, self.name)

    for r in overlapping_regions:
        key = "_".join([chrom, str(r[0]), str(r[1])])
        self.found[key] = 1

    region_sizes = [r[1] - r[0] for r in overlapping_regions]
    overlapping_sizes = [min(end, r[1]) - max(start, r[0]) for r in overlapping_regions]
    overlapping_fractions = [robust_divide(o * 1.0, s)
                             for s, o in zip(region_sizes, overlapping_sizes)]
    total_overlap_size = sum(overlapping_sizes)
    fraction_as_query = robust_divide(total_overlap_size * 1.0, end - start)

    return OverlapInfo(total_overlap_size > 0, total_overlap_size, fraction_as_query,
                       max(overlapping_fractions), max(region_sizes),
                       len(overlapping_sizes), self.name)
def split(args):
    # default to downsampling by mapped reads
    downsample = True
    use_raw_reads = False
    if args.normalization_mode == cr_constants.NORM_MODE_RAW:
        use_raw_reads = True
    elif args.normalization_mode == cr_constants.NORM_MODE_NONE:
        downsample = False

    # compute downsample rates for each gem group
    downsample_map = args.detect_cells_gg_metrics
    with cr_mol_counter.MoleculeCounter.open(args.molecules, 'r') as mol_counter:
        for (gg, submetrics) in mol_counter.get_metric(cr_mol_counter.GEM_GROUPS_METRIC).iteritems():
            info = downsample_map[str(gg)]
            info['total_reads'] = submetrics[cr_mol_counter.GG_TOTAL_READS_METRIC]
            reads = info['total_reads'] if use_raw_reads else info['cmb_reads']
            cells = info['cells']
            info['rpc'] = tk_stats.robust_divide(reads, cells) if cells > 0 else 0.0

    lowest_rpc = min([gg['rpc'] for gg in downsample_map.values()])
    for gg, info in downsample_map.iteritems():
        if downsample and len(downsample_map) > 1:
            if lowest_rpc == 0:
                # one or more samples are empty. just do the naive thing for now.
                frac_reads_kept = 0.0
            else:
                frac_reads_kept = tk_stats.robust_divide(lowest_rpc, info['rpc'])
        else:
            frac_reads_kept = 1.0
        info['frac_reads_kept'] = frac_reads_kept

    # Split the molecule info h5 into equi-RAM chunks, preserving (barcode, gem_group)
    # boundaries. Assumes the molecule_info is sorted by (barcode, gem_group).
    chunks = []
    with cr_mol_counter.MoleculeCounter.open(args.molecules, 'r') as mol_counter:
        for chunk_start, chunk_len in mol_counter.get_chunks(
                cr_constants.NUM_MOLECULE_INFO_ENTRIES_PER_CHUNK, preserve_boundaries=False):
            chunks.append({
                'downsample': downsample,
                'downsample_map': downsample_map,
                'chunk_start': str(chunk_start),
                'chunk_len': str(chunk_len),
                '__mem_gb': cr_mol_counter.MoleculeCounter.estimate_mem_gb(chunk_len),
            })
    return {'chunks': chunks}
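# Worked example of the normalization above (hypothetical numbers): gem group 1
# at 50,000 reads per cell and gem group 2 at 25,000 reads per cell give
# lowest_rpc = 25,000, so frac_reads_kept is 0.5 for group 1 and 1.0 for
# group 2, equalizing per-cell read depth across gem groups.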
def compute_summary_metrics(misc_sm):
    """Called in the join step to extract summary metrics from the pooled summarizer objects"""
    metrics = misc_sm.get_summarizer('metrics')
    metrics['r1_q30_bases_fract'] = robust_divide(metrics['r1_q30_bases'], metrics['r1_tot_bases'])
    metrics['r2_q30_bases_fract'] = robust_divide(metrics['r2_q30_bases'], metrics['r2_tot_bases'])
    metrics['si_q30_bases_fract'] = robust_divide(metrics['si_q30_bases'], metrics['si_tot_bases'])
    metrics['bc_q30_bases_fract'] = robust_divide(metrics['bc_q30_bases'], metrics['bc_tot_bases'])
    return metrics
def add_doublet_rate_metrics(summary_info, singlecell_df, species_list):
    """Infer doublet rate from observed doublets"""

    def infer_multiplets_from_observed(n_obs_multiplets, n_cells0, n_cells1):
        """Estimates the number of real multiplets based on the number observed
        from a barnyard (mixed species) experiment"""
        if n_cells0 == 0 or n_cells1 == 0 or n_obs_multiplets == 0:
            return 0

        # Prior probability of a doublet given counts for each cell type (ignore N_cells > 2).
        # Cast to float to avoid Python 2 integer division.
        p_obs_multiplet = (2 * (float(n_cells0) / (n_cells0 + n_cells1)) *
                           (float(n_cells1) / (n_cells0 + n_cells1)))

        # Brute force MLE of binomial n
        likelihood = scipy.stats.binom.pmf(n_obs_multiplets,
                                           np.arange(0, n_cells0 + n_cells1),
                                           p_obs_multiplet)
        return np.argmax(likelihood)

    has_species_info = (species_list != [""])
    if not has_species_info or len(species_list) < 2:
        return summary_info

    counts = []
    cell_barcodes_dict = {}
    for species in species_list:
        species_cell_mask = singlecell_df["is_%s_cell_barcode" % species] == 1
        cell_barcodes_dict[species] = singlecell_df['barcode'][species_cell_mask].values.tolist()
        counts.append(len(cell_barcodes_dict[species]))

    total_unique_cell_barcodes = {bc
                                  for barcodes in cell_barcodes_dict.values()
                                  for bc in barcodes}
    total_cell_barcodes = sum(counts)
    summary_info['cells_detected'] = len(total_unique_cell_barcodes)

    observed_doublets = total_cell_barcodes - len(total_unique_cell_barcodes)
    observed_doublet_rate = robust_divide(observed_doublets, total_cell_barcodes)
    inferred_doublets = infer_multiplets_from_observed(observed_doublets, counts[0], counts[1])
    inferred_doublet_rate = robust_divide(inferred_doublets, total_cell_barcodes)

    summary_info['observed_doublets'] = observed_doublets
    summary_info['observed_doublet_rate'] = observed_doublet_rate
    summary_info['inferred_doublets'] = inferred_doublets
    summary_info['inferred_doublet_rate'] = inferred_doublet_rate
    return summary_info
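# Worked example of infer_multiplets_from_observed (hypothetical numbers):
# with 500 cells of each species and 50 observed cross-species barcodes,
# p_obs_multiplet = 2 * 0.5 * 0.5 = 0.5, and the binomial MLE lands near
# n = 100 total multiplets -- cross-species doublets are only about half of
# all doublets, because same-species doublets are invisible in this assay.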
def _compute_count_purity(counts0, counts1):
    """ Compute fraction of counts in putative single-cell GEMs
    originating from the non-cell transcriptome """
    gem_occupancy = MultiGenomeAnalysis._classify_gems(counts0, counts1)
    frac0 = counts0.astype(float) / (counts0 + counts1).astype(float)
    purity0 = frac0[gem_occupancy == cr_constants.GEM_CLASS_GENOME0]
    purity1 = 1 - frac0[gem_occupancy == cr_constants.GEM_CLASS_GENOME1]
    overall_purity = np.concatenate([purity0, purity1])

    # Compute number of purity outliers
    threshold0, threshold1 = 1.0, 1.0
    fit_purity0 = purity0[np.logical_and(purity0 > 0, purity0 < 1)]
    fit_purity1 = purity1[np.logical_and(purity1 > 0, purity1 < 1)]
    if len(fit_purity0) > 1 and len(fit_purity1) > 1:
        try:
            alpha0, beta0, _, _ = scipy.stats.beta.fit(fit_purity0, floc=0, fscale=1)
            alpha1, beta1, _, _ = scipy.stats.beta.fit(fit_purity1, floc=0, fscale=1)
            threshold0 = scipy.stats.beta.ppf(
                cr_constants.COUNT_PURITY_OUTLIER_PROB_THRESHOLD, alpha0, beta0)
            threshold1 = scipy.stats.beta.ppf(
                cr_constants.COUNT_PURITY_OUTLIER_PROB_THRESHOLD, alpha1, beta1)
        except scipy.stats._continuous_distns.FitSolverError as e:
            print >> sys.stderr, e
            threshold0, threshold1 = 1.0, 1.0
        except scipy.stats._continuous_distns.FitDataError as e:
            print >> sys.stderr, e
            threshold0, threshold1 = 1.0, 1.0

    outlier0 = np.logical_and(gem_occupancy == cr_constants.GEM_CLASS_GENOME0,
                              frac0 < threshold0)
    outlier1 = np.logical_and(gem_occupancy == cr_constants.GEM_CLASS_GENOME1,
                              (1 - frac0) < threshold1)
    n_outlier0 = sum(outlier0)
    n_outlier1 = sum(outlier1)
    frac_outlier0 = tk_stats.robust_divide(n_outlier0, len(purity0))
    frac_outlier1 = tk_stats.robust_divide(n_outlier1, len(purity1))
    is_outlier = np.logical_or(outlier0, outlier1).astype(int)

    return (purity0.mean(), purity1.mean(), overall_purity.mean(),
            n_outlier0, n_outlier1, frac_outlier0, frac_outlier1, is_outlier)
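# A sketch of the beta-fit outlier rule above on synthetic purities, assuming
# only numpy and scipy (the 0.01 tail probability is illustrative; the real
# value comes from cr_constants.COUNT_PURITY_OUTLIER_PROB_THRESHOLD):
#
#   import numpy as np
#   import scipy.stats
#
#   purities = np.random.beta(20, 2, size=1000)      # mostly-pure GEMs
#   a, b, _, _ = scipy.stats.beta.fit(purities, floc=0, fscale=1)
#   threshold = scipy.stats.beta.ppf(0.01, a, b)     # 1% lower tail
#   outliers = purities < threshold                  # flagged as unusually impure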
def join(args, outs, chunk_defs, chunk_outs):
    os.mkdir(outs.demultiplexed_fastq_path)

    # Move output files to final location
    for chunk_out in chunk_outs:
        for f in os.listdir(chunk_out.demultiplexed_fastq_path):
            in_file = os.path.join(chunk_out.demultiplexed_fastq_path, f)
            subprocess.call(['mv', in_file, outs.demultiplexed_fastq_path])

    # Combine result data
    r = {'num_reads': 0, 'num_clusters': 0, 'invalid_count': 0, 'sample_index_counts': {}}
    for chunk_out in chunk_outs:
        # We count each end of a paired-end read separately in the summary file.
        summary_counts = json.load(open(chunk_out.demultiplex_summary))
        num_clusters = sum(summary_counts.values())
        num_reads = 2 * num_clusters
        invalid_reads = summary_counts[INVALID_SAMPLE_INDEX]
        del summary_counts[INVALID_SAMPLE_INDEX]
        summary_counts = {k: 2 * v for (k, v) in summary_counts.iteritems()}

        r['num_clusters'] += num_clusters
        r['num_reads'] += num_reads
        r['invalid_count'] += invalid_reads
        r['sample_index_counts'] = tk_dict.add_dicts(r['sample_index_counts'],
                                                     summary_counts, depth=1)

    r['invalid_frac'] = tk_stats.robust_divide(r['invalid_count'], r['num_clusters'])
    json.dump(r, open(outs.demultiplex_summary, 'w'))
def split(args):
    # Count the fastq files for each library and gem group:
    # {gem_group: {library_type: count_of_fastq_files}}
    chunk_counts = defaultdict(lambda: defaultdict(int))
    for chunk in args.chunks:
        chunk_counts[chunk["gem_group"]][chunk["library_type"]] += 1

    single_library = True
    for gem_group in chunk_counts:
        if len(chunk_counts[gem_group]) > 1:
            single_library = False

    if single_library:
        martian.log_info('Single library in input. No need to check barcode compatibility.')
        # An empty chunk list skips the main phase
        return {'chunks': [], 'join': {}}

    num_reads_to_check_barcode = (cr_constants.NUM_READS_TO_CHECK_BARCODE
                                  if args.num_reads_to_check_barcode is None
                                  else args.num_reads_to_check_barcode)

    chunks = []
    for chunk in args.chunks:
        chunk_def = chunk
        chunk_def['num_reads_per_chunk_to_check_barcode'] = int(
            tk_stats.robust_divide(num_reads_to_check_barcode,
                                   chunk_counts[chunk["gem_group"]][chunk["library_type"]]))
        chunks.append(chunk_def)

    return {'chunks': chunks, 'join': {'__mem_gb': 4}}
def main(args, outs):
    args.coerce_strings()
    outs.coerce_strings()

    if args.confident_regions is None:
        confident_regions = None
    else:
        confident_regions = tk_io.get_target_regions(open(args.confident_regions))

    outfile = open(outs.confident_windows, "w")
    for (chrom, start, end) in (tk_io.get_locus_info(l) for l in args.loci):
        conf_regions = get_conf_regions(chrom, confident_regions)
        location = start
        while location < end:
            region = tk_regions.Regions(regions=[(location, location + args.window_size)])
            isect = region.intersect(conf_regions)
            size = isect.get_total_size()
            percent = tk_stats.robust_divide(float(size), float(args.window_size))
            row = [chrom, location, location + args.window_size, percent]
            outfile.write("\t".join(map(str, row)) + "\n")
            location += args.window_size
    outfile.close()
def infer_barcode_reverse_complement(barcode_whitelist, barcode_files):
    if not barcode_whitelist:
        return [False] * len(barcode_files)

    barcode_rc = []
    for barcode_file in barcode_files:
        # Tally whitelist hits for the barcodes and their reverse complements,
        # counting each file independently
        rc_valid_count = 0
        reg_valid_count = 0
        read_num = 0

        if barcode_file.endswith(".gz"):
            barcode_open_file = gzip.open(barcode_file)
        else:
            barcode_open_file = open(barcode_file, 'r')

        read_iter = tk_fasta.read_generator_fastq(barcode_open_file)
        for (name, seq, qual) in read_iter:
            if seq in barcode_whitelist:
                reg_valid_count += 1
            if tk_seq.get_rev_comp(seq) in barcode_whitelist:
                rc_valid_count += 1
            if read_num > 1000:
                break
            read_num += 1

        frac_rc = tk_stats.robust_divide(float(rc_valid_count),
                                         float(rc_valid_count + reg_valid_count))
        barcode_rc.append(frac_rc > 0.75)
        barcode_open_file.close()

    return barcode_rc
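# tk_seq.get_rev_comp is assumed to return the reverse complement of a barcode
# sequence; a minimal stand-in for illustration (not the tenkit source):
def _get_rev_comp_sketch(seq):
    complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'N': 'N'}
    return ''.join(complement[base] for base in reversed(seq))

# e.g. _get_rev_comp_sketch('ACGTT') -> 'AACGT'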
def generate_cell_calling_metrics(parameters, cell_barcodes):
    summary_info = {}
    species_list = parameters.keys()
    for species in species_list:
        key_suffix = "" if len(species_list) == 1 else "_{}".format(species)

        # Cell calling metrics
        summary_info["fitted_mean_noise{}".format(key_suffix)] = parameters[species]["noise_mean"]
        summary_info["fitted_dispersion_noise{}".format(key_suffix)] = parameters[species]["noise_dispersion"]
        summary_info["fitted_mean_signal{}".format(key_suffix)] = parameters[species]["signal_mean"]
        summary_info["fitted_dispersion_signal{}".format(key_suffix)] = parameters[species]["signal_dispersion"]
        summary_info["fraction_cell_calling_noise{}".format(key_suffix)] = parameters[species]["fraction_noise"]
        summary_info["cell_threshold{}".format(key_suffix)] = parameters[species]["cell_threshold"]
        summary_info["goodness_of_fit{}".format(key_suffix)] = parameters[species]["goodness_of_fit"]
        summary_info["estimated_cells_present{}".format(key_suffix)] = parameters[species]["estimated_cells_present"]
        summary_info["annotated_cells{}".format(key_suffix)] = len(cell_barcodes[species])
        summary_info["estimated_fraction_cells_annotated{}".format(key_suffix)] = \
            robust_divide(len(cell_barcodes[species]), parameters[species]["estimated_cells_present"])

    summary_info["cells_detected"] = len(
        {bc for barcodes in cell_barcodes.values() for bc in barcodes})
    return summary_info
def get_cov_frac(black_regions, chrom, start, stop):
    regions = tk_sv_utils.strictly_overlapping_regions(black_regions, chrom, start, stop)
    tot_black = np.sum([r[1] - r[0] for r in regions])
    tot_len = float(stop - start)
    black_frac = tk_stats.robust_divide(tot_black, tot_len)
    return black_frac
def split(args):
    input_bam = tk_bam.create_bam_infile(args.bam_infile)
    chroms = input_bam.references
    chrom_lengths = input_bam.lengths

    cov_hist = p.read_csv(args.cov_hist)
    weighted_count = cov_hist.counts[1:] * cov_hist.coverage[1:]
    mean_pos_cov = tk_stats.robust_divide(weighted_count.sum(), cov_hist.counts[1:].sum())

    primary_contigs = tenkit.reference.load_primary_contigs(args.reference_path) - {'chrM', 'chrY'}
    loci = tk_chunks.chunk_by_locus(chroms, chrom_lengths,
                                    tenkit.constants.PARALLEL_LOCUS_SIZE * 2,
                                    contig_whitelist=primary_contigs,
                                    extra_args={'mean': mean_pos_cov})

    # Handle empty case
    if len(loci) == 0:
        loci = [{'locus': None, 'mean': None}]

    return {'chunks': loci, 'join': {'__mem_gb': 12.0}}
def load_barcode_dist(filename, barcode_whitelist, gem_group, proportions=True):
    """ Load barcode count distribution from a json file """
    # Input barcode whitelist must be an ordered type;
    # safeguard against it going out of sync with the distribution file
    assert barcode_whitelist is None or isinstance(barcode_whitelist, list)

    if not os.path.isfile(filename):
        return None

    with open(filename, 'r') as f:
        values = json.load(f)

    start = (gem_group - 1) * len(barcode_whitelist)
    end = gem_group * len(barcode_whitelist)
    barcode_counts = {bc: value for bc, value in zip(barcode_whitelist, values[start:end])}

    if proportions:
        total_barcode_counts = sum(barcode_counts.values())
        barcode_dist = {bc: tk_stats.robust_divide(float(value), float(total_barcode_counts))
                        for bc, value in barcode_counts.iteritems()}
        return barcode_dist
    else:
        return barcode_counts
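# The distribution file is assumed to hold one count per whitelist barcode for
# every gem group, concatenated in whitelist order, so gem group g occupies the
# slice values[(g - 1) * len(whitelist) : g * len(whitelist)]. For example,
# with a 3-barcode whitelist, gem group 2 reads values[3:6].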
def _compute_frac_barcodes_on_whitelist(fastqs, barcode_whitelist_set, reads_interleaved, read_def):
    """ Compute fraction of observed barcodes on the barcode whitelist """
    num_reads = 0
    barcodes_on_whitelist = 0

    for fastq in fastqs:
        barcode_reads = cr_fastq.FastqReader({read_def.read_type: fastq},
                                             read_def, reads_interleaved, None, None)

        for read in barcode_reads.in_iter:
            if num_reads == cr_constants.DETECT_CHEMISTRY_INITIAL_READS:
                break

            _, barcode, _ = read
            num_reads += 1

            if barcode in barcode_whitelist_set:
                barcodes_on_whitelist += 1

        if num_reads == cr_constants.DETECT_CHEMISTRY_INITIAL_READS:
            break

    if num_reads > 0:
        return tk_stats.robust_divide(barcodes_on_whitelist, num_reads)
    else:
        return 0.0
def merge_filtered_metrics(filtered_metrics):
    result = {
        'filtered_bcs': 0,
        'filtered_bcs_lb': 0,
        'filtered_bcs_ub': 0,
        'max_filtered_bcs': 0,
        'filtered_bcs_var': 0,
        'filtered_bcs_cv': 0,
    }
    for i, fm in enumerate(filtered_metrics):
        # Add per-gem group metrics
        result.update({'gem_group_%d_%s' % (i + 1, key): value
                       for key, value in fm.iteritems()})

        # Compute metrics over all gem groups
        result['filtered_bcs'] += fm['filtered_bcs']
        result['filtered_bcs_lb'] += fm['filtered_bcs_lb']
        result['filtered_bcs_ub'] += fm['filtered_bcs_ub']
        result['max_filtered_bcs'] += fm['max_filtered_bcs']
        result['filtered_bcs_var'] += fm['filtered_bcs_var']

    # Estimate CV based on the summed variances and means
    result['filtered_bcs_cv'] = tk_stats.robust_divide(
        np.sqrt(result['filtered_bcs_var']), result['filtered_bcs'])

    return result
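# The pooled CV above relies on variances of independent gem groups adding:
#
#   CV_pooled = sqrt(sum_i var_i) / sum_i mean_i
#
# e.g. two gem groups each with mean 1000 and variance 400 give
# sqrt(800) / 2000 ~= 0.014, versus sqrt(400) / 1000 = 0.02 for either alone.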
def join(args, outs, chunk_defs, chunk_outs):
    if args.output_format == 'bam':
        tenkit.bam.concatenate(outs.barcoded_unaligned,
                               [c.barcoded_unaligned for c in chunk_outs])
        outs.barcoded = None
    elif args.output_format == 'fastq':
        fqs = [c.barcoded for c in chunk_outs]
        subprocess.check_call('cat ' + ' '.join(fqs) + ' | bgzip -c > ' + outs.barcoded,
                              shell=True)
        outs.barcoded_unaligned = None

    # Make a basic set of metrics
    num_pairs = sum(c.num_pairs for c in chunk_outs)
    correct_bc_pairs = sum(c.correct_bc_pairs for c in chunk_outs)

    stats = {}
    stats['num_read_pairs'] = num_pairs
    stats['bc_on_whitelist'] = tk_stats.robust_divide(float(correct_bc_pairs), num_pairs)

    if args.bc_counts is not None:
        # Load the bc counts for this GEM group
        counts = json.load(open(args.bc_counts, 'r'))
        count_arrays = [np.array(gem_group['bc_counts'], dtype=np.float)
                        for gem_group in counts.values()]

        # Compute effective BC diversity and n90 bc count
        bc_df = pandas.DataFrame({'bc_num_reads': np.concatenate(count_arrays)})

        # Read-based effective diversity
        reads = bc_df.bc_num_reads.values
        sum_sq = (reads ** 2.0).sum()
        effective_diversity = tk_stats.robust_divide((reads.sum() ** 2.0), float(sum_sq))
        stats['barcode_diversity'] = effective_diversity
    else:
        stats['barcode_diversity'] = None

    basic_stats = pandas.DataFrame(stats, index=[0])
    basic_stats.to_csv(outs.basic_stats, index=False)
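# The "effective diversity" above is the inverse Simpson index of the per-
# barcode read counts, (sum_i r_i)^2 / sum_i r_i^2: the number of equally
# loaded barcodes that would spread reads as evenly as the observed set.
#
#   reads = np.array([100., 100., 100., 100.])   # 4 even barcodes -> 4.0
#   reads = np.array([400., 1., 1., 1.])         # one dominant barcode -> ~1.02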
def _summarize_per_barcode(a):
    mean = np.mean(a)
    stddev = np.std(a)
    return {
        'mean': mean,
        'median': np.median(a),
        'cv': tk_stats.robust_divide(float(stddev), float(mean)),
        'iqr': np.percentile(a, 75) - np.percentile(a, 25),
    }
def sample_bcs_and_het_snps(vcf, contigs):
    block_size = int(1e6)
    num_samples = 10
    total_recs = 0
    het_snp_recs = 0
    het_snp_bcs = 0
    total_size = 0
    for i in xrange(num_samples):
        (chrom, length) = random.choice(contigs)
        start = random.randint(0, max(0, length - block_size))
        end = min(start + block_size, length)
        (tot, snp, bcs) = sample_by_locus(vcf, (chrom, start, end))
        total_recs += tot
        het_snp_recs += snp
        het_snp_bcs += bcs
        total_size += (end - start)
    return (tk_stats.robust_divide(het_snp_recs, total_recs),
            tk_stats.robust_divide(het_snp_bcs, het_snp_recs),
            tk_stats.robust_divide(het_snp_recs, total_size))
def summarize_bootstrapped_top_n(top_n_boot):
    top_n_bcs_mean = np.mean(top_n_boot)
    top_n_bcs_sd = np.std(top_n_boot)
    top_n_bcs_var = np.var(top_n_boot)
    result = {}
    result['filtered_bcs_var'] = top_n_bcs_var
    result['filtered_bcs_cv'] = tk_stats.robust_divide(top_n_bcs_sd, top_n_bcs_mean)
    result['filtered_bcs_lb'] = round(sp_stats.norm.ppf(0.025, top_n_bcs_mean, top_n_bcs_sd))
    result['filtered_bcs_ub'] = round(sp_stats.norm.ppf(0.975, top_n_bcs_mean, top_n_bcs_sd))
    result['filtered_bcs'] = int(round(top_n_bcs_mean))
    return result
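# Example use of summarize_bootstrapped_top_n (hypothetical replicates):
#
#   top_n_boot = np.random.poisson(5000, size=100)   # bootstrapped cell counts
#   summarize_bootstrapped_top_n(top_n_boot)
#
# filtered_bcs_lb and filtered_bcs_ub are the 2.5% and 97.5% points of a
# normal fitted to the replicates, i.e. an approximate 95% confidence
# interval on the filtered barcode count.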
def get_protospacer_call_metrics(ps_calls_summary, num_gex_cbs, report_prefix):
    metrics_dict = {}

    num_cells_with_multiple_protospacers = ps_calls_summary.loc[
        'More than 1 protospacer expressed', 'num_cells']
    num_cells_with_protospacer = (ps_calls_summary.loc['1 protospacer expressed', 'num_cells'] +
                                  num_cells_with_multiple_protospacers)

    frac_cells_with_protospacer = tk_stats.robust_divide(
        num_cells_with_protospacer, num_gex_cbs)
    frac_cells_with_multiple_protospacer = tk_stats.robust_divide(
        num_cells_with_multiple_protospacers, num_gex_cbs)

    metrics_dict.update({
        report_prefix + 'frac_cells_with_protospacer': frac_cells_with_protospacer,
        report_prefix + 'frac_cells_with_multiple_protospacer': frac_cells_with_multiple_protospacer,
    })

    return metrics_dict
def infer_barcode_reverse_complement(barcode_whitelist, read_iter):
    if barcode_whitelist is None:
        return False

    reg_valid_count = 0
    rc_valid_count = 0
    for name, seq, qual in itertools.islice(read_iter,
                                            cr_constants.NUM_CHECK_BARCODES_FOR_ORIENTATION):
        if seq in barcode_whitelist:
            reg_valid_count += 1
        if tk_seq.get_rev_comp(seq) in barcode_whitelist:
            rc_valid_count += 1

    frac_rc = tk_stats.robust_divide(rc_valid_count, rc_valid_count + reg_valid_count)
    return frac_rc >= cr_constants.REVCOMP_BARCODE_THRESHOLD
def checkCallPerformance(self, SENIDX=0, PPVIDX=2, trackAvoided=[], overlapThr={}):
    self.clearStatus()

    # Fall back to the default thresholds if any track is missing or out of range
    overlapThrsGood = True
    for n in self.genomeTracks:
        if n not in overlapThr:
            overlapThrsGood = False
            break
        if overlapThr[n] < 0.0 or overlapThr[n] > 1.0:
            overlapThrsGood = False
            break
    if not overlapThrsGood:
        overlapThr = self.genomeTracksOverlapThr

    validTrackAvoided = []
    for n in trackAvoided:
        if n in self.genomeTracks:
            validTrackAvoided.append(n)

    for i in range(self.TotalEvent):
        evt = self.AllEvents[i]
        isPassTrackFilter = True
        for n in self.genomeTracks:
            if self.Status[i].trackInfo[n].queryFraction > overlapThr[n]:
                isPassTrackFilter = False
                break

        if isPassTrackFilter:
            self.Status[i].isPass = True
            self.AllPos += 1
            for j in range(self.NumTD):
                if self.TruthData[j].checkOverlap(evt.chrom, evt.start, evt.end)[0]:
                    self.NPos[j] += 1
                    self.Status[i].TDStatus[j].isTrue = True
                    self.Status[i].TDStatus[j].isFalse = False
                else:
                    self.Status[i].TDStatus[j].isTrue = False
                    self.Status[i].TDStatus[j].isFalse = True
        else:
            self.Status[i].isPass = False

    self.HasPerformance = True

    if self.AllPos <= 0:
        return self.TruthData[SENIDX].getSensitivity(), 0.0, self.AllPos
    else:
        return (self.TruthData[SENIDX].getSensitivity(),
                robust_divide(self.NPos[PPVIDX] * 1.0, self.AllPos),
                self.AllPos)
def construct_perturbation_efficiency_summary(f_change, f_change_ci,
                                              num_cells_per_perturbation, by_feature,
                                              summary_columns=PERTURBATION_EFFICIENCY_SUMMARY_COLUMNS):
    if (f_change is None) or (f_change_ci is None):
        return None

    # Copy before editing so the module-level column list is not mutated
    summary_columns = list(summary_columns)
    if by_feature:
        summary_columns[1] = 'Target Guide'
    else:
        summary_columns[1] = 'Target Gene'

    this_df = pd.DataFrame(columns=summary_columns)
    counter = 0
    control_num_cells = num_cells_per_perturbation['Non-Targeting']
    for key in sorted(f_change.keys()):
        this_key_results = f_change.get(key)
        this_key_ci = f_change_ci.get(key)
        if this_key_results is None:
            continue
        this_num_cells = num_cells_per_perturbation[key]

        for (ps, results) in this_key_results.iteritems():
            lower_bound = this_key_ci.get(ps)[0]
            upper_bound = this_key_ci.get(ps)[1]
            this_df.loc[counter] = (key, ps,
                                    results[0], results[1],
                                    lower_bound, upper_bound,
                                    this_num_cells,
                                    tk_stats.robust_divide(results[2], this_num_cells),
                                    control_num_cells,
                                    tk_stats.robust_divide(results[3], control_num_cells))
            counter += 1

    this_df.sort_values(by=['Log2 Fold Change'], ascending=True, inplace=True)
    return this_df
def get_depth_positional_cv(info, trim_tail):
    fixed_info = {int(x): y for (x, y) in info.iteritems()}
    total_count = sum(fixed_info.values())
    cutoff_count = total_count * trim_tail

    # Find the depth cutoff that trims off the top trim_tail fraction of positions
    seen_count = 0
    for depth in sorted(fixed_info.iterkeys(), reverse=True):
        seen_count += fixed_info[depth]
        if seen_count >= cutoff_count:
            cutoff = depth
            break

    trimmed_info = {x: y for (x, y) in fixed_info.iteritems() if x <= cutoff}
    mean_val, var_val = tk_stats.mean_var_from_counts(trimmed_info)
    if mean_val > var_val:
        return float('NaN')
    return tk_stats.robust_divide(numpy.sqrt(var_val - mean_val), mean_val)
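# The numerator sqrt(var - mean) treats sequencing depth as Poisson sampling
# noise layered on positional variation: for a pure Poisson profile var == mean,
# so subtracting the mean isolates the excess (overdispersion) before forming
# the CV. E.g. mean depth 100 with variance 500 gives sqrt(400) / 100 = 0.2,
# while a purely Poisson profile (var == 100) gives 0.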
def split(args):
    # Need to store umi_info and a json with a dict containing 1 key per barcode
    umi_info_mem_gb = 2 * int(np.ceil(vdj_umi_info.get_mem_gb(args.umi_info)))
    bc_diversity = len(cr_utils.load_barcode_whitelist(args.barcode_whitelist))
    assemble_summary_mem_gb = tk_stats.robust_divide(bc_diversity, DICT_BCS_PER_MEM_GB)
    return {
        'chunks': [{
            '__mem_gb': int(np.ceil(max(cr_constants.MIN_MEM_GB,
                                        umi_info_mem_gb + assemble_summary_mem_gb))),
        }]
    }
def get_depth_info_json(info):
    fixed_info = {int(x): y for (x, y) in info.iteritems()}
    total_depth_counts = sum(fixed_info.values())

    median_depth = None
    sorted_depths = sorted(fixed_info.keys())
    seen_depth_count = 0
    mean_depth = 0.0
    for depth in sorted_depths:
        seen_depth_count += fixed_info[depth]
        mean_depth += float(depth * fixed_info[depth]) / float(total_depth_counts)
        if seen_depth_count > total_depth_counts / 2 and median_depth is None:
            median_depth = depth

    zero_cov_fract = tk_stats.robust_divide(float(fixed_info.get(0, 0.0)),
                                            float(total_depth_counts))

    return (mean_depth, median_depth, zero_cov_fract)
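# Example (hypothetical histogram): info = {0: 10, 1: 20, 2: 40, 3: 30} covers
# 100 positions; mean = (0*10 + 1*20 + 2*40 + 3*30) / 100 = 1.9, the median
# depth is 2 (the cumulative count passes 50 there), and zero_cov_fract = 0.1.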
def _compute_frac_barcodes_on_whitelist(fastqs, barcode_whitelist_set, reads_interleaved,
                                        read_def, tolerate_n=True):
    """ Compute fraction of observed barcodes on the barcode whitelist """
    num_reads = 0
    barcodes_on_whitelist = 0

    for fastq in fastqs:
        barcode_reads = cr_fastq.FastqReader({read_def.read_type: fastq},
                                             read_def, reads_interleaved, None, None)

        for read in barcode_reads.in_iter:
            if num_reads == cr_constants.DETECT_CHEMISTRY_INITIAL_READS:
                break

            _, barcode, _ = read
            num_reads += 1

            if barcode in barcode_whitelist_set:
                barcodes_on_whitelist += 1
            elif tolerate_n and 'N' in barcode:
                # If there's a single N in the barcode, check if we can replace
                # the N with a valid base and get a whitelist hit. This makes us
                # robust to N-cycles.
                npos = barcode.find("N")
                a = array.array('c', barcode)
                for base in ['A', 'C', 'G', 'T']:
                    a[npos] = base
                    new_barcode = a.tostring()
                    if new_barcode in barcode_whitelist_set:
                        barcodes_on_whitelist += 1
                        break

        if num_reads == cr_constants.DETECT_CHEMISTRY_INITIAL_READS:
            break

    if num_reads > 0:
        return tk_stats.robust_divide(barcodes_on_whitelist, num_reads)
    else:
        return 0.0
def add_bulk_targeting_metrics(summary_info, singlecell_df, species_list):
    """Take singlecell targeting data and calculate bulk targeting metrics from them."""
    for species in species_list:
        species_cell_mask = singlecell_df["is_%s_cell_barcode" % species] == 1
        key_suffix = "" if len(species_list) == 1 else "_{}".format(species)

        total = singlecell_df[species_cell_mask]["passed_filters"].sum()
        tss = singlecell_df[species_cell_mask]["TSS_fragments"].sum()
        dnase = singlecell_df[species_cell_mask]["DNase_sensitive_region_fragments"].sum()
        enhancer = singlecell_df[species_cell_mask]["enhancer_region_fragments"].sum()
        promoter = singlecell_df[species_cell_mask]["promoter_region_fragments"].sum()
        ontarget = singlecell_df[species_cell_mask]["on_target_fragments"].sum()
        blacklist = singlecell_df[species_cell_mask]["blacklist_region_fragments"].sum()
        peaks = singlecell_df[species_cell_mask]["peak_region_fragments"].sum()

        summary_info['frac_fragments_overlapping_targets{}'.format(key_suffix)] = robust_divide(ontarget, total)
        summary_info['frac_fragments_overlapping_tss{}'.format(key_suffix)] = robust_divide(tss, total)
        summary_info['frac_fragments_overlapping_dnase{}'.format(key_suffix)] = robust_divide(dnase, total)
        summary_info['frac_fragments_overlapping_enhancer{}'.format(key_suffix)] = robust_divide(enhancer, total)
        summary_info['frac_fragments_overlapping_promoter{}'.format(key_suffix)] = robust_divide(promoter, total)
        summary_info['frac_fragments_overlapping_blacklist{}'.format(key_suffix)] = robust_divide(blacklist, total)
        summary_info['frac_fragments_overlapping_peaks{}'.format(key_suffix)] = robust_divide(peaks, total)

    cell_mask = singlecell_df['cell_id'] != 'None'
    cut_frags_in_peaks = singlecell_df[cell_mask]["peak_region_cutsites"].sum()
    total = singlecell_df[cell_mask]["passed_filters"].sum()
    summary_info['frac_cut_fragments_in_peaks'] = robust_divide(cut_frags_in_peaks, 2 * total)

    return summary_info
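# Note the denominator 2 * total in frac_cut_fragments_in_peaks: each
# passed-filter fragment contributes two cut sites (one per end), so the
# fraction is taken over cut sites rather than fragments.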