def test_targets(self):
    bam_bc_file = tk_test.in_path("namesort_test.bam")
    read_info_out = tk_test.out_path("read_info.h5")
    barcode_whitelist = tk_seq.load_barcode_whitelist("737K-april-2014")

    targets_filename = tk_test.in_path('agilent_kinome_targs.bed')
    targets_file = open(targets_filename, 'r')
    target_regions = tk_io.get_target_regions(targets_file)

    bam_in = tk_bam.create_bam_infile(bam_bc_file)

    r = compute_basic_stats(bam_in, target_regions, 1000, bam_in.references,
                            barcode_whitelist=barcode_whitelist,
                            read_h5_out=read_info_out)
    # insert_size_dists, nearest_targ_dists, summary_metrics, bc_table, mapq_counts, insert_size_hist = r
    misc_sm, bc_sms = r

    nearest_targ_dists = bc_sms.get('nearest_targ_dists')
    maxTargetDist = max(nearest_targ_dists.get_summarizer(60).dict.keys())
    minTargetDist = min(nearest_targ_dists.get_summarizer(60).dict.keys())
    self.assertEqual(minTargetDist, 130)
    self.assertEqual(maxTargetDist, 10000)
def test_attach_bcs(self):
    # --align_input alignment_output.bam --barcode_input phix_I2.fastq
    # --output test2.out --complete ~/c --stats ~/s
    args = {
        'barcode_whitelist':     IN_WHITELIST,
        'align_chunk':           IN_BAM,
        'barcode_chunk':         IN_I2,
        'sample_index_chunk':    IN_I1,
        'gem_group':             None,
        'paired_end':            True,
        'exclude_non_bc_reads':  False,
        'max_expected_bc_error': 0.75,
        'subsample_rate':        1.0,
    }
    outs = {'output': OUT_BAM}

    args = martian.Record(args)
    outs = martian.Record(outs)

    main(args, outs)

    # Get the barcodes
    barcode_whitelist = tk_seq.load_barcode_whitelist(IN_WHITELIST)

    # Ensure each read has a barcode
    out_bam = pysam.Samfile(OUT_BAM)
    for r in out_bam:
        tag_dict = {k: v for (k, v) in r.tags}
        tag_names = [k for (k, v) in r.tags]
        self.assertTrue(RAW_BARCODE_TAG in tag_names)

        if tag_dict[RAW_BARCODE_TAG] in barcode_whitelist:
            self.assertTrue(PROCESSED_BARCODE_TAG in tag_names)

        self.assertTrue(SAMPLE_INDEX_TAG in tag_names)

    # Make sure we put out the full BAM file
    out_len = len([x for x in pysam.Samfile(OUT_BAM)])
    in_len = len([x for x in pysam.Samfile(IN_BAM)])
    self.assertEqual(out_len, in_len)

    def get_bc(r):
        tags = {k: v for (k, v) in r.tags}
        return tags[RAW_BARCODE_TAG]

    # Ensure both reads in a pair have the same barcode
    out_bam = pysam.Samfile(OUT_BAM)
    reads = [x for x in out_bam]
    for (grp, pair_reads) in groupby(reads, lambda x: x.qname):
        bcs = set(tk_io.get_read_barcode(r) for r in pair_reads)
        self.assertEqual(len(bcs), 1)
def main(args, outs):
    """ Count the raw barcode sequences in a chunk against the barcode whitelist. """

    # Bail out if there are no barcodes or no whitelist
    if args.barcode_whitelist is None or args.chunk['barcode'] is None:
        outs.bc_counts = None
        return

    def open_maybe_gzip(fn):
        if fn.endswith("gz"):
            return gzip.open(fn)
        else:
            return open(fn)

    barcode_whitelist = sorted(list(tk_seq.load_barcode_whitelist(args.barcode_whitelist)))
    bc_idx = {bc: idx for (idx, bc) in enumerate(barcode_whitelist)}
    bc_counts = np.zeros(len(barcode_whitelist), dtype=np.int32)
    bad_count = 0

    barcode_file = open_maybe_gzip(args.chunk['barcode'])
    bc_iterator = tk_fasta.read_generator_fastq(barcode_file)

    for (bc_read, raw_bc_seq, raw_bc_qual) in bc_iterator:
        idx = bc_idx.get(raw_bc_seq)
        if idx is not None:
            bc_counts[idx] += 1
        else:
            bad_count += 1

    # Write the per-barcode count array and the bad count to JSON
    result = {}
    result['bad_bc_count'] = bad_count
    result['bc_counts'] = list(bc_counts)
    with open(outs.bc_counts, 'w') as bc_counts_out:
        tenkit.safe_json.dump_numpy(result, bc_counts_out)
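# A minimal sketch (not part of the stage) of how the bc_counts JSON written above
# can be read back and summarized, mirroring what the downstream join step does.
# The path "bc_counts.json" is a hypothetical placeholder.
def _example_read_bc_counts(path="bc_counts.json"):
    import json
    import numpy as np

    with open(path) as f:
        r = json.load(f)
    counts = np.array(r['bc_counts'], dtype=np.int32)
    bad = r['bad_bc_count']
    # Fraction of barcode reads that did not match the whitelist
    error_rate = float(bad) / max(1.0, float(bad) + counts.sum())
    return counts, error_rate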
def test_barcode_counts(self):
    bam_bc_file = tk_test.in_path("attach_bcs/attach_bcs_output.bam")
    read_info_out = tk_test.out_path("read_info.h5")
    barcode_whitelist = tk_seq.load_barcode_whitelist("737K-april-2014")

    bam_in = tk_bam.create_bam_infile(bam_bc_file)
    r = compute_basic_stats(bam_in, {}, 2000, bam_in.references,
                            barcode_whitelist=barcode_whitelist,
                            read_h5_out=read_info_out)
    # insert_size_dists, nearest_targ_dists, summary_metrics, bc_table, mapq_counts, insert_size_hist = r
    misc_sm, bc_sms = r

    # Look at the barcode results -- there should be a raw bc count for each read pair
    # n_raw_bcs = bc_table["count"].sum()
    n_reads = len([x for x in tk_bam.create_bam_infile(bam_bc_file)])
    # self.assertEqual(n_raw_bcs, n_reads / 2)

    # Load the per-cluster table -- there should be a row for each read pair
    read_info = tenkit.hdf5.read_data_frame(read_info_out)
    self.assertEqual(read_info.shape[0], n_reads / 2)
def split(args):
    if args.input is None or args.barcode_whitelist is None:
        chunk_defs = [{'chunk_start': "0", 'chunk_end': "0"}]
        return {'chunks': chunk_defs}

    # Some R&D barcode sets have very small diversity -- don't run on them
    barcode_whitelist = tk_seq.load_barcode_whitelist(args.barcode_whitelist)
    if len(barcode_whitelist) < 100:
        chunk_defs = [{'chunk_start': "0", 'chunk_end': "0"}]
        return {'chunks': chunk_defs}

    min_chunks = 5
    if len(barcode_whitelist) > 1e6:
        min_chunks = 10

    bam_in = tk_bam.create_bam_infile(args.input)
    chunks = tk_bam.chunk_bam_records(bam_in, chunk_split_func,
                                      chunk_size_gb=8.0,
                                      min_chunks=min_chunks)
    for c in chunks:
        c['__mem_gb'] = 7.0

    return {'chunks': chunks, 'join': {'__mem_gb': 32.0}}
def join(args, outs, chunk_defs, chunk_outs):
    final_chunks = []
    for cl in chunk_outs:
        final_chunks.extend(cl.chunks)
    outs.chunks = final_chunks

    valid_counts = [c.bc_counts for c in chunk_outs if c.bc_counts is not None]

    # No counts if there's no whitelist or no actual counts
    if args.barcode_whitelist is None or len(valid_counts) == 0:
        outs.bc_counts = None
        outs.lot_info = None
        return

    result = {}
    for (c_out, c_def) in zip(chunk_outs, chunk_defs):
        gem_group = c_def.chunk['gem_group']
        if c_out.bc_counts is None:
            continue

        with open(c_out.bc_counts) as f:
            r = json.load(f)

        gg_result = result.setdefault(gem_group, {'bad_bc_count': 0, 'bc_counts': None})
        gg_result['bad_bc_count'] += r['bad_bc_count']

        if gg_result['bc_counts'] is None:
            gg_result['bc_counts'] = np.array(r['bc_counts'], dtype=np.int32)
        else:
            gg_result['bc_counts'] += np.array(r['bc_counts'], dtype=np.int32)

    for gg in result.keys():
        rgg = result[gg]
        rgg['bc_error_rate'] = tk_stats.robust_divide(
            float(rgg['bad_bc_count']),
            float(rgg['bad_bc_count'] + rgg['bc_counts'].sum()))

    # Possibly do gel bead lot detection
    lot_detection = {}
    lot_map = WHITELIST_TO_LOT_MAP.get(args.barcode_whitelist)
    if lot_map is not None:
        # Get the BC counts histogram; for now, just sum over all gem groups
        bc_seq = sorted(list(tk_seq.load_barcode_whitelist(args.barcode_whitelist)))
        bc_cts = np.sum([ggr['bc_counts'] for ggr in result.values()], axis=0)
        bc_hist = {seq: cts for (seq, cts) in zip(bc_seq, bc_cts)}

        (gelbead_lot, gelbead_lot_confidence, gelbead_lot_counts) = identify_gelbead_lot(bc_hist, lot_map)

        # Only report on lots with nonzero counts
        gelbead_lot_counts_nonzero = {lot: count for (lot, count) in gelbead_lot_counts.items() if count > 0}

        lot_detection['gelbead_lot'] = gelbead_lot
        lot_detection['gelbead_lot_confidence'] = gelbead_lot_confidence
        lot_detection['gelbead_lot_counts'] = gelbead_lot_counts_nonzero

        martian.log_info("Gelbead lot detected: %s, reason (if None): %s" % (gelbead_lot, gelbead_lot_confidence))

    with open(outs.lot_info, 'w') as f:
        tenkit.safe_json.dump_numpy(lot_detection, f)

    with open(outs.bc_counts, 'w') as f:
        tenkit.safe_json.dump_numpy(result, f)
def main(args, outs):
    """ Trim the reads in a series of fastq files """

    # Set a fixed random seed to eliminate noise in metrics
    random.seed(0)

    chunk = args.chunk
    interleaved = chunk['reads_interleaved']
    have_read2 = chunk['read2'] is not None
    paired = interleaved or have_read2

    read1_trim = args.read1_trim_length
    read2_trim = args.read2_trim_length

    subsample_rate = chunk['subsample_rate']

    # BC config -- barcodes come from a separate fastq, or are embedded in R1 or R2
    have_barcode = False
    bc_in_read1 = False
    bc_in_read2 = False
    bc_in_fastq = False

    # If we have a BC in the read, use that & ignore a separate BC read
    if chunk.get('bc_in_read', None) is not None and chunk.get('bc_length', 0) > 0:
        have_barcode = True
        bc_length = chunk['bc_length']
        if chunk['bc_in_read'] == 1:
            bc_in_read1 = True
            read1_trim += bc_length
        elif chunk['bc_in_read'] == 2:
            bc_in_read2 = True
            read2_trim += bc_length
        else:
            martian.exit("bc_in_read configuration incorrect -- read must be 1 or 2")

    # Otherwise use the BC file
    elif chunk['barcode'] is not None:
        have_barcode = True
        bc_in_fastq = True

    have_sample_index = chunk['sample_index'] is not None

    output_directory = os.path.dirname(os.path.realpath(outs.placeholder))
    max_read_num = args.max_read_num

    # Counter for sub-chunked files
    file_number = 1

    # Open the available read files and make the appropriate iterators
    if interleaved:
        read_in = openfq(chunk['read1'])
        read_iter = tk_fasta.read_generator_fastq(read_in, paired_end=True)
    else:
        if have_read2:
            read1_in = openfq(chunk['read1'])
            read1_iter = tk_fasta.read_generator_fastq(read1_in)

            read2_in = openfq(chunk['read2'])
            read2_iter = tk_fasta.read_generator_fastq(read2_in)

            read_iter = itertools.imap(
                lambda x, y: (x[0], x[1], x[2], y[0], y[1], y[2]),
                read1_iter, read2_iter)
        else:
            read1_in = openfq(chunk['read1'])
            read_iter = tk_fasta.read_generator_fastq(read1_in)

    # Open the first read output file
    read_name = output_directory + "/read" + str(file_number) + ".fastq"
    read_names = [read_name]
    out_read_fastq = open(read_name, 'w')

    # Bail out of barcode counting if there's no whitelist
    if args.barcode_whitelist is None:
        outs.bc_counts = None
        bc_idx = None
    else:
        barcode_whitelist = sorted(list(tk_seq.load_barcode_whitelist(args.barcode_whitelist)))
        bc_idx = {bc: idx for (idx, bc) in enumerate(barcode_whitelist)}
        bc_counts = np.zeros(len(barcode_whitelist), dtype=np.int32)
        bad_count = 0

    # Open the barcode output file if there is one
    if have_barcode:
        bc_name = output_directory + "/BC" + str(file_number) + ".fastq"
        out_bc_fastq = open(bc_name, 'w')
        bc_names = [bc_name]

        if bc_in_fastq:
            bc_in = openfq(chunk['barcode'])
            bc_iter = tk_fasta.read_generator_fastq(bc_in)
        elif bc_in_read1 or bc_in_read2:
            # BC in read -- have an output file but no input file
            bc_iter = itertools.repeat(None)
    else:
        bc_iter = itertools.repeat(None)
        bc_names = [None]
        outs.bc_counts = None

    # Open the sample index file if there is one
    if have_sample_index:
        si_name = output_directory + "/SI" + str(file_number) + ".fastq"
        out_si_fastq = open(si_name, 'w')
        si_in = openfq(chunk['sample_index'])
        si_iter = tk_fasta.read_generator_fastq(si_in)
        si_names = [si_name]
    else:
        si_iter = itertools.repeat(None)
        si_names = [None]

    # Loop through reads
    read_num = 0
    for read, barcode_read, sample_index_read in itertools.izip(read_iter, bc_iter, si_iter):
        if read_num > 0 and random.random() > subsample_rate:
            continue

        if paired:
            (name1, seq1, qual1, name2, seq2, qual2) = read
        else:
            (name1, seq1, qual1) = read

        new_seq1 = seq1[read1_trim:]
        new_qual1 = qual1[read1_trim:]
        if paired:
            new_seq2 = seq2[read2_trim:]
            new_qual2 = qual2[read2_trim:]

        # Get the BC sequence out of the read, for BC-in-read schemes
        if bc_in_read1:
            barcode_read = (name1, seq1[:bc_length], qual1[:bc_length])
        if bc_in_read2:
            barcode_read = (name2, seq2[:bc_length], qual2[:bc_length])

        read_num += 1
        if read_num > max_read_num:
            read_num = 1
            file_number += 1

            read_name = output_directory + "/read" + str(file_number) + ".fastq"
            out_read_fastq.close()
            out_read_fastq = open(read_name, 'w')
            read_names.append(read_name)

            if have_barcode:
                bc_name = output_directory + "/BC" + str(file_number) + ".fastq"
                out_bc_fastq.close()
                out_bc_fastq = open(bc_name, 'w')
                bc_names.append(bc_name)
            else:
                bc_names.append(None)

            if have_sample_index:
                si_name = output_directory + "/SI" + str(file_number) + ".fastq"
                out_si_fastq.close()
                out_si_fastq = open(si_name, 'w')
                si_names.append(si_name)
            else:
                si_names.append(None)

        if have_barcode:
            barcode_seq = barcode_read[1]
            barcode_qual = barcode_read[2]
            if chunk['barcode_reverse_complement']:
                barcode_seq = tk_seq.get_rev_comp(barcode_seq)
                barcode_qual = barcode_qual[::-1]  # reverse the quality string

            if bc_idx is not None:
                idx = bc_idx.get(barcode_seq)
                if idx is not None:
                    bc_counts[idx] += 1
                else:
                    bad_count += 1

            tk_fasta.write_read_fastq(out_bc_fastq, barcode_read[0], barcode_seq, barcode_qual)

        if have_sample_index:
            tk_fasta.write_read_fastq(out_si_fastq, sample_index_read[0],
                                      sample_index_read[1], sample_index_read[2])

        tk_fasta.write_read_fastq(out_read_fastq, name1, new_seq1, new_qual1)
        if paired:
            tk_fasta.write_read_fastq(out_read_fastq, name2, new_seq2, new_qual2)

    if have_barcode:
        out_bc_fastq.close()
        # Only emit BC counts if we had a whitelist
        if outs.bc_counts is not None:
            result = {}
            result['bad_bc_count'] = bad_count
            result['bc_counts'] = list(bc_counts)
            with open(outs.bc_counts, 'w') as bc_counts_out:
                tenkit.safe_json.dump_numpy(result, bc_counts_out)

    if have_sample_index:
        out_si_fastq.close()

    out_read_fastq.close()

    chunks = []
    for (r, bc, si) in zip(read_names, bc_names, si_names):
        new_chunk = {
            'read1': r,
            'read2': None,
            'barcode': bc,
            'sample_index': si,
            'barcode_reverse_complement': False,
            'reads_interleaved': have_read2 or interleaved,
            'gem_group': chunk['gem_group'],
            'read_group': chunk['read_group'],
        }
        chunks.append(new_chunk)
    outs.chunks = chunks
def main(args, outs):
    """ Attaches barcodes. Attaches the raw barcode to the RAW_BC tag and filters
        those to form the set of PROCESSED_BARCODES """

    chunk = args.chunk

    #subsample_rate = 1.0
    #if args.subsample_rate is not None:
    #    subsample_rate = args.subsample_rate

    bam_in = tk_bam.create_bam_infile(args.align_chunk)
    bam_out, tids = tk_bam.create_bam_outfile(
        outs.output, None, None, template=bam_in,
        pgs=tk_bam.make_pg_header(martian.get_pipelines_version(), "attach_bcs"))

    if args.barcode_whitelist is None or args.bc_counts is None:
        # If there's no whitelist or no counts, then all high-quality BC reads get allowed.
        barcode_whitelist = None
        wl_idxs = None
        bc_dist = None
    else:
        barcode_whitelist = tk_seq.load_barcode_whitelist(args.barcode_whitelist)

        # Load the BC counts for this GEM group
        counts = json.load(open(args.bc_counts, 'r'))
        counts = counts[str(chunk['gem_group'])]['bc_counts']

        # Prior distribution over barcodes, with pseudo-count
        bc_dist = np.array(counts, dtype=np.float) + 1.0
        bc_dist = bc_dist / bc_dist.sum()
        wl_idxs = {bc: idx for (idx, bc) in enumerate(sorted(list(barcode_whitelist)))}

    # Set the random seed to get deterministic subsampling
    random.seed(0)

    def open_maybe_gzip(fn):
        if fn.endswith("gz"):
            return gzip.open(fn)
        else:
            return open(fn)

    if chunk['barcode']:
        processed_barcode_iter = get_raw_processed_barcodes(
            open_maybe_gzip(chunk['barcode']), barcode_whitelist,
            args.bc_confidence_threshold, chunk['gem_group'],
            chunk['barcode_reverse_complement'], wl_idxs, bc_dist)
        require_barcode_for_stringent = True
    else:
        processed_barcode_iter = itertools.repeat(None)
        require_barcode_for_stringent = False

    if chunk['sample_index']:
        sample_index_iter = tk_fasta.read_generator_fastq(open_maybe_gzip(chunk['sample_index']))
    else:
        sample_index_iter = itertools.repeat(None)

    iters = itertools.izip(processed_barcode_iter, sample_index_iter)

    # First read
    read = bam_in.next()

    # Number of perfect reads -- used to compute down-sampling rates in mark_duplicates
    perfect_read_count = 0

    # Due to secondary alignments, we must apply the tags to all
    # reads with the same cluster (query) name.
    for (barcode_info, sample_index_info) in iters:
        tags = []
        read_name = None

        if read is None:
            break

        if barcode_info:
            (bc_read_name, raw_bc_seq, processed_bc_seq, raw_bc_qual) = barcode_info
            tags.append((RAW_BARCODE_TAG, raw_bc_seq))
            tags.append((RAW_BARCODE_QUAL_TAG, raw_bc_qual))
            if processed_bc_seq is not None:
                tags.append((PROCESSED_BARCODE_TAG, processed_bc_seq))
            read_name = bc_read_name.split()[0]

        if sample_index_info:
            (si_read_name, seq, qual) = sample_index_info
            tags.append((SAMPLE_INDEX_TAG, seq))
            tags.append((SAMPLE_INDEX_QUAL_TAG, qual))

            if read_name is not None:
                if si_read_name.split()[0] != read_name:
                    martian.log_info("mismatch: si_read_name: %s, bam_read_name: %s" % (si_read_name, read_name))
                assert si_read_name.split()[0] == read_name
            else:
                read_name = si_read_name.split()[0]

        reads_attached = 0
        #emit_read_pair = random.random() < subsample_rate
        emit_read_pair = True

        while read.qname == read_name or read_name is None:
            if len(tags) > 0:
                existing_tags = read.tags
                existing_tags.extend(tags)
                read.tags = existing_tags

            reads_attached += 1
            if read_name is not None:
                assert read.qname == read_name

            if emit_read_pair:
                # Count the perfect reads -- will be used when subsampling in dedup
                if tenkit.read_filter.stringent_read_filter(read, require_barcode_for_stringent):
                    perfect_read_count += 1

                if args.exclude_non_bc_reads:
                    if tk_io.get_read_barcode(read) is not None:
                        bam_out.write(read)
                else:
                    bam_out.write(read)

            try:
                read = bam_in.next()
            except StopIteration:
                read = None
                break

        # We may have more than 2 reads if there was a secondary alignment,
        # but fewer than 2 means something went wrong.
        assert reads_attached >= 2

    outs.perfect_read_count = perfect_read_count
    bam_out.close()
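# An illustrative sketch (not part of the stage) of the whitelist prior built in
# main above: per-barcode counts plus a pseudo-count of 1, normalized into a
# probability distribution, which is the bc_dist handed to
# get_raw_processed_barcodes for barcode correction. The counts are made-up values.
def _example_barcode_prior():
    import numpy as np

    counts = np.array([120, 0, 35, 7], dtype=np.float)   # hypothetical per-whitelist-barcode counts
    prior = (counts + 1.0) / (counts + 1.0).sum()         # pseudo-count, then normalize
    assert abs(prior.sum() - 1.0) < 1e-9                  # a proper probability distribution
    return prior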
def join(args, outs, chunk_defs, chunk_outs):
    args_dict = {}
    args_dict["bc_allow_indel"] = args.bc_allow_indel
    args_dict["bc_max_error_allowed"] = args.bc_max_error_allowed
    args_dict["bc_pseudo_count"] = args.bc_pseudo_count
    args_dict["bc_use_mapping"] = args.bc_use_mapping
    args_dict["bc_mapq"] = args.bc_mapq
    args_dict["frag_no_merging"] = args.frag_no_merging
    args_dict["frag_mapq"] = args.frag_mapq
    args_dict["frag_pval"] = args.frag_pval
    args_dict["frag_freq"] = args.frag_freq

    fsummary = open(outs.summary, "w")
    fsummary.write(safe_json.safe_jsonify(args_dict))
    fsummary.close()

    tk_bam.concatenate(out_file_name=outs.pos_sorted_bam,
                       all_in_file_names=[chunk.pos_sorted_bam for chunk in chunk_outs])
    tk_bam.index(outs.pos_sorted_bam)
    outs.pos_sorted_bam_index = outs.pos_sorted_bam + '.bai'

    bam_in = tk_bam.create_bam_infile(outs.pos_sorted_bam)
    chroms = bam_in.references

    barcode_whitelist = list(tk_seq.load_barcode_whitelist(args.barcode_whitelist))
    barcode_whitelist.sort()

    # Combine the per-chunk fragment csv files into a single h5 file
    in_csv_files = [co.fragments + "_" + cd.tid + ".csv"
                    for (cd, co) in zip(chunk_defs, chunk_outs)
                    if os.path.exists(co.fragments + "_" + cd.tid + ".csv")]

    nfrags = 0
    if len(in_csv_files) > 0:
        bc_num_frags = defaultdict(int)
        bc_num_reads = defaultdict(int)
        bc_num_single_reads = defaultdict(int)
        bc_num_lens = defaultdict(int)

        temp_csv_barcodes = outs.barcodes + "_temp.csv"
        nfrags = 0

        for f in in_csv_files:
            # TODO - sequentially append to the fragments.h5 file to keep memory under control
            #      - handle multiple GEM groups properly
            #      - ensure the chroms column has string/categorical type in hdf5
            #      - same fixes for the barcodes.h5 file
            #      - handle 0-length outputs -- does that result in None file outs?
            frag_in = p.read_csv(f, names=["tid", "start_pos", "end_pos", "bc_id", "num_reads"])
            frag_in["obs_len"] = frag_in.end_pos - frag_in.start_pos
            frag_in.loc[frag_in.num_reads <= 1, "obs_len"] = 1000

            frag_in["est_len"] = np.maximum(
                1, frag_in["obs_len"] * (frag_in.num_reads + 1) / np.maximum(1, frag_in.num_reads - 1)).astype("int")
            frag_in.loc[frag_in.num_reads <= 1, "est_len"] = 1000

            barcode_seqs = []
            molecule_ids = []

            for (i, row) in frag_in.iterrows():
                bc_num_frags[row.bc_id] += 1
                bc_num_reads[row.bc_id] += row.num_reads
                bc_num_lens[row.bc_id] += row.est_len

                bc_wl_id = int(row.bc_id) % len(barcode_whitelist)
                gg = int(row.bc_id) / len(barcode_whitelist) + 1
                barcode_seq = "%s-%d" % (barcode_whitelist[bc_wl_id], gg)
                barcode_seqs.append(barcode_seq)
                molecule_ids.append(nfrags)

                nfrags += 1

            frag_in["bc"] = p.Categorical(barcode_seqs)
            frag_in["chrom"] = p.Categorical.from_codes(frag_in.tid, chroms)
            frag_in["molecule_id"] = molecule_ids
            del frag_in["tid"]
            del frag_in["bc_id"]

            if len(frag_in) > 0:
                tenkit.hdf5.append_data_frame(outs.fragments, frag_in)

        with open(temp_csv_barcodes, "w") as csv_out:
            csv_out.write("bc,bc_est_len,bc_linked_read_fraction,bc_linked_fragment_fraction,bc_mean_reads_per_fragment,bc_num_fragments,bc_num_reads\n")
            for bc_id in range(len(barcode_whitelist)):
                bc = barcode_whitelist[bc_id] + "-1"
                if bc_id in bc_num_frags:
                    bc_est_len = bc_num_lens[bc_id]
                    bc_linked_read_fraction = 1.0 - bc_num_single_reads[bc_id] * 1.0 / bc_num_reads[bc_id]
                    bc_linked_fragment_fraction = 1.0 - bc_num_single_reads[bc_id] * 1.0 / bc_num_frags[bc_id]
                    bc_mean_reads_per_fragment = bc_num_reads[bc_id] * 1.0 / bc_num_frags[bc_id]
                    csv_out.write("%s,%d,%f,%f,%f,%d,%d\n" %
                                  (bc, bc_est_len, bc_linked_read_fraction,
                                   bc_linked_fragment_fraction, bc_mean_reads_per_fragment,
                                   bc_num_frags[bc_id], bc_num_reads[bc_id]))

        if nfrags == 0:
            outs.fragments = None
            outs.barcodes = None
        else:
            tenkit.hdf5.create_tabix_index(outs.fragments, 'chrom', 'start_pos', 'end_pos')

            df_barcodes = p.read_csv(temp_csv_barcodes)
            tenkit.hdf5.append_data_frame(outs.barcodes, df_barcodes)
    else:
        outs.fragments = None
        outs.barcodes = None

    summary = {}

    # Compute high-level BC summary metrics
    # Load the BC data
    if outs.barcodes:
        bc_df = tenkit.hdf5.read_data_frame(outs.barcodes)
        fragment_df = tenkit.hdf5.read_data_frame(
            outs.fragments, query_cols=['bc', 'num_reads', 'est_len', 'chrom', 'start_pos'])

        bc_df.sort('bc_num_reads', inplace=True)

        # Bin the BC counts and write a json histogram file
        n_reads = bc_df.bc_num_reads.values
        max_val = np.percentile(n_reads, 99.99) * 1.3
        min_val = n_reads.min()
        num_bins = 400
        step = math.ceil((max_val - min_val) / num_bins)
        bins = np.arange(min_val, max_val, step)
        (hist, edges) = np.histogram(n_reads, bins=bins)
        bc_count_hist = {int(edges[i]): hist[i] for i in range(len(bins) - 1)}

        # Summarize properties of the n50 and n90 BC sets
        bc_df['cum_reads'] = np.cumsum(bc_df.bc_num_reads)

        n50_read_thresh = sum(bc_df.bc_num_reads) * 0.5
        n50_bcs = bc_df[bc_df.cum_reads > n50_read_thresh]
        n50_fra = fragment_df[fragment_df.bc.isin(n50_bcs.bc)]
        n50_stats = high_level_stats("n50", n50_fra, n50_bcs)
        del n50_fra

        n90_read_thresh = sum(bc_df.bc_num_reads) * 0.1
        n90_bcs = bc_df[bc_df.cum_reads > n90_read_thresh]
        n90_fra = fragment_df[fragment_df.bc.isin(n90_bcs.bc)]
        n90_stats = high_level_stats("n90", n90_fra, n90_bcs)
        del n90_fra

        for (k, v) in n50_stats.iteritems():
            summary[k] = v
        for (k, v) in n90_stats.iteritems():
            summary[k] = v

        # Generate a fragment length histogram
        fragment_df['len_bin'] = np.floor_divide(
            fragment_df.est_len.values, FRAG_LEN_HIST_BIN_SIZE).astype(int) * FRAG_LEN_HIST_BIN_SIZE

        multi_read_frags = fragment_df[fragment_df.num_reads > 1]
        len_bins = multi_read_frags.groupby(['len_bin']).apply(len)
        del multi_read_frags

        len_hist = {k: v for (k, v) in len_bins.iteritems()}

        # Write the fragment length histogram to json
        with open(outs.fragment_size, 'w') as fragment_size_file:
            tenkit.safe_json.dump_numpy(len_hist, fragment_size_file)

        # Estimate total DNA per partition by looking at the hottest 1000 GEMs,
        # or the GEMs with bc_mean_reads_per_fragment > 2, whichever is fewer
        hot_bcs = bc_df[np.logical_and(bc_df.bc_mean_reads_per_fragment > 2.0,
                                       bc_df.bc_num_reads > 25)]
        hot_bcs.sort('bc_mean_reads_per_fragment', inplace=True)
        if len(hot_bcs) > 50:
            hot_bcs = hot_bcs[-NUM_BCS_LOADING_ESTIMATE:]
            summary['estimated_dna_per_partition'] = round(
                scipy.stats.tmean(hot_bcs.bc_est_len,
                                  scipy.percentile(hot_bcs.bc_est_len, (1, 99))))
        else:
            summary['estimated_dna_per_partition'] = None

        # Read-based effective diversity
        reads = bc_df.bc_num_reads.values
        sum_sq = (reads**2.0).sum()
        effective_diversity = tk_stats.robust_divide((reads.sum()**2.0), float(sum_sq))
        summary['effective_diversity_reads'] = effective_diversity

        # Fragment-based effective diversity
        fragments = bc_df.bc_num_fragments.values
        sum_sq = (fragments**2.0).sum()
        effective_diversity = tk_stats.robust_divide((fragments.sum()**2.0), float(sum_sq))
        summary['effective_diversity_fragments'] = effective_diversity

    else:
        # No fragment_size file emitted
        outs.fragment_size = None

        n50_stats = high_level_stats("n50", None, None)
        n90_stats = high_level_stats("n90", None, None)

        for (k, v) in n50_stats.iteritems():
            summary[k] = v
        for (k, v) in n90_stats.iteritems():
            summary[k] = v

        bc_count_hist = {}

        summary['estimated_dna_per_partition'] = None
        summary['effective_diversity_reads'] = None
        summary['effective_diversity_fragments'] = None

    with open(outs.barcode_histogram, 'w') as barcode_hist_file:
        tenkit.safe_json.dump_numpy(bc_count_hist, barcode_hist_file)

    # Write summary to json
    with open(outs.single_partition, 'w') as summary_file:
        tenkit.safe_json.dump_numpy(summary, summary_file, pretty=True)
def join(args, outs, chunk_defs, chunk_outs):
    final_chunks = []
    for cl in chunk_outs:
        final_chunks.extend(cl.chunks)
    outs.chunks = final_chunks

    valid_counts = [c.bc_counts for c in chunk_outs if c.bc_counts is not None]

    # No counts if there's no whitelist or no actual counts
    if args.barcode_whitelist is None or len(valid_counts) == 0:
        outs.bc_counts = None
        outs.lot_info = None
        return

    result = {}
    for (c_out, c_def) in zip(chunk_outs, chunk_defs):
        gem_group = c_def.chunk['gem_group']
        if c_out.bc_counts is None:
            continue

        with open(c_out.bc_counts) as f:
            r = json.load(f)

        gg_result = result.setdefault(gem_group, {'bad_bc_count': 0, 'bc_counts': None})
        gg_result['bad_bc_count'] += r['bad_bc_count']

        if gg_result['bc_counts'] is None:
            gg_result['bc_counts'] = np.array(r['bc_counts'], dtype=np.int32)
        else:
            gg_result['bc_counts'] += np.array(r['bc_counts'], dtype=np.int32)

    total_counts = 0
    total_errors = 0
    for gg in result.keys():
        rgg = result[gg]
        rgg['bc_error_rate'] = tk_stats.robust_divide(
            float(rgg['bad_bc_count']),
            float(rgg['bad_bc_count'] + rgg['bc_counts'].sum()))
        total_counts += float(rgg['bad_bc_count'] + rgg['bc_counts'].sum())
        total_errors += float(rgg['bad_bc_count'])

    # Hardcoded bail-out if the overall barcode error rate is extremely high
    bc_error_rate = total_errors / total_counts
    if bc_error_rate > 0.97:
        martian.exit("Extremely high rate of incorrect barcodes observed (%.2f %%). "
                     "Check that input is 10x Chromium data, and that there are no "
                     "missing cycles in the first 16bp of Read 1. Please note that "
                     "Supernova does not support GemCode data." % (bc_error_rate * 100.0))

    # Possibly do gel bead lot detection
    lot_detection = {}
    lot_map = WHITELIST_TO_LOT_MAP.get(args.barcode_whitelist)
    if lot_map is not None:
        # Get the BC counts histogram; for now, just sum over all gem groups
        bc_seq = sorted(list(tk_seq.load_barcode_whitelist(args.barcode_whitelist)))
        bc_cts = np.sum([ggr['bc_counts'] for ggr in result.values()], axis=0)
        bc_hist = {seq: cts for (seq, cts) in zip(bc_seq, bc_cts)}

        (gelbead_lot, gelbead_lot_confidence, gelbead_lot_counts) = identify_gelbead_lot(bc_hist, lot_map)

        # Only report on lots with nonzero counts
        gelbead_lot_counts_nonzero = {lot: count for (lot, count) in gelbead_lot_counts.items() if count > 0}

        lot_detection['gelbead_lot'] = gelbead_lot
        lot_detection['gelbead_lot_confidence'] = gelbead_lot_confidence
        lot_detection['gelbead_lot_counts'] = gelbead_lot_counts_nonzero

        martian.log_info("Gelbead lot detected: %s, reason (if None): %s" % (gelbead_lot, gelbead_lot_confidence))

    with open(outs.lot_info, 'w') as f:
        tenkit.safe_json.dump_numpy(lot_detection, f, pretty=True)

    with open(outs.bc_counts, 'w') as f:
        tenkit.safe_json.dump_numpy(result, f)
def main_report_length_mass(args, outs):
    tmp_dir = os.path.dirname(outs.summary)

    empty_stats = {
        'alpha': [],
        'alpha_mean': None,
        'alpha_cv': None,
        'mean_frags': None,
        'total_frags': [],
        'length_distribution': {},
        'empirical_length_distribution': {},
        'inferred_mean_length': None,
        'inferred_lw_mean_length': None,
        'inferred_total_mass_ng': None,
        'inferred_bp_per_bc': [],
        'mean_bp_per_bc': 0,
        'occupied_bcs': 0,
        'inferred_number_gems': 0,
    }

    if args.barcodes is None or args.barcode_whitelist is None or not os.path.exists(args.barcodes):
        return empty_stats

    barcode_whitelist = tk_seq.load_barcode_whitelist(args.barcode_whitelist)
    if len(barcode_whitelist) < 1000:
        return empty_stats

    if args.targets_file is None:
        targeted = False
        num_frags = NUM_FRAGS
    else:
        targeted = True
        num_frags = NUM_FRAGS_TARGETED

    bc_df = tenkit.hdf5.read_data_frame(args.barcodes)
    frag_df = tenkit.hdf5.read_data_frame(
        args.fragments, ['bc', 'chrom', 'start_pos', 'obs_len', 'num_reads', 'est_len'])
    input_num_frags = len(frag_df)

    gem_group = [int(bc.split('-')[1]) for bc in bc_df.bc]
    num_gem_groups = len(set(gem_group))

    # Start with data about all barcodes.
    # First filter out any barcode that doesn't have at least one molecule with > 1 read;
    # this eliminates most of the background contamination of barcodes.
    bc_df = bc_df[bc_df.bc_mean_reads_per_fragment > 1.0].copy()
    bc_df.sort('bc_num_reads', inplace=True)

    # Subset to the N99 barcodes (i.e. the barcodes that account for 99% of reads)
    # that also have at least 1 valid fragment. A valid fragment must have >= 1 MAPQ30 read.
    bc_df['cum_reads'] = np.cumsum(bc_df.bc_num_reads)
    prod_bc_thresh = 0.01 * bc_df.bc_num_reads.sum()
    occupied_bcs_df = bc_df[np.logical_and(bc_df.cum_reads > prod_bc_thresh,
                                           bc_df.bc_num_fragments > 0)]

    if len(occupied_bcs_df) == 0:
        martian.log_info("No valid barcodes for length/mass inference -- exiting")
        return empty_stats

    # Figure out the subset of BCs likely to be singleton BCs and only run the
    # estimation on that subset. Infer the expected total GEM count that should
    # have been present.
    occupied_bcs = len(occupied_bcs_df)
    total_diversity = len(barcode_whitelist) * num_gem_groups

    # Poisson correction -- we know how many barcodes have >= 1 GEM, and we know
    # how many total barcodes are possible, so use the Poisson distribution to
    # back-calculate the number of GEMs that must have been present.
    # For Chromium there are 4.2M barcodes.
    p_occupied = float(occupied_bcs) / total_diversity
    mean_gems_per_bc = min(100, -np.log(1 - p_occupied))
    p_singleton = scipy.stats.poisson.pmf(1, mean_gems_per_bc)
    n_singleton = p_singleton * total_diversity

    # n_gems gets reported out as 'Gems Detected' in Loupe
    n_gems = int(round(mean_gems_per_bc * total_diversity))

    # Only use the bottom 90% of singleton BCs, to avoid contamination at the high end
    bc_df_frags = occupied_bcs_df.sort('bc_num_fragments')
    singleton_bcs = bc_df_frags[int(round(n_singleton * 0.1)):int(round(n_singleton * 0.9))]

    martian.log_info("Read Count Threshold for Occupied Barcodes: %f" % occupied_bcs_df.iloc[0].bc_num_reads)
    martian.log_info("Occupied Barcodes: %d" % occupied_bcs)
    martian.log_info("Singleton Barcodes: %f" % n_singleton)
    martian.log_info("Number of GEMs in slice used for inference: %d" % len(singleton_bcs))
    martian.log_info("Inferred Number of GEMs: %f" % n_gems)

    # Get the empirical fragment length distribution
    obs_len = frag_df.obs_len.values

    # It's possible for multi-read fragments to have a size of zero, which
    # causes a vanishing density -- set a lower limit
    obs_len = np.maximum(obs_len, 200)
    empirical_dist = empirical_length_distribution(frag_df)

    # Cap the obs_len at a reasonable value, then set the length bins accordingly
    if targeted:
        max_len_adj_factor = 1.6
    else:
        max_len_adj_factor = 1.3

    # Select the max length for the fragment length distribution
    max_len = np.int32(np.percentile(obs_len, 99.97) * max_len_adj_factor)
    max_len = np.maximum(max_len, 100000)
    obs_len = np.minimum(obs_len, max_len, dtype=np.int32)

    max_bin = max_len * 1.01
    bin_data = gen_bin_length(NUM_LENGTH_BINS, min_len=500, max_len=max_bin)

    martian.log_info("Fragments trimmed to max length of %d" % max_len)

    # Select a random subset of BCs to work with.
    # Fix the random seed so that we get repeatable results.
    num_bcs = max(MIN_BCS, float(num_frags) / singleton_bcs.bc_num_fragments.mean())
    np.random.seed(0)
    if len(singleton_bcs) > 0:
        sel_bcs = singleton_bcs.irow(np.random.randint(0, len(singleton_bcs), num_bcs)).copy()
    sel_bcs['bc_id'] = np.arange(1, len(sel_bcs) + 1)

    sel_frags = frag_df[frag_df.bc.isin(sel_bcs.bc)].copy()
    sel_frags['bc_string'] = sel_frags.bc.astype('string')
    sel_frags.sort(['bc_string'], inplace=True)
    martian.log_info("Using %d fragments" % len(sel_frags))

    bc_id_lookup = {}
    for (bc, bc_id) in zip(sel_bcs.bc, sel_bcs.bc_id):
        bc_id_lookup[bc] = bc_id

    # Write out the fragment data for stan to consume
    nbcs = len(sel_bcs)

    obs_len = sel_frags.obs_len.values

    # It's possible for multi-read fragments to have a size of zero, which
    # causes a vanishing density -- set a lower limit
    obs_len = np.maximum(obs_len, 200)

    # obs_len for single-read fragments is 1000 in the fragment file -- remap to 0
    obs_len[sel_frags.num_reads.values == 1] = 0.0
    obs_len = np.minimum(obs_len, max_len, dtype=np.int32)

    # Data to be passed to stan
    data = {
        # Data sizes
        'N': len(sel_frags),
        'BC': nbcs,

        # Per-BC stats
        'bc_observed_frags': sel_bcs.bc_num_fragments,

        # Fragment data: bc_id maps fragments to a BC; num_reads and obs_length are per-fragment stats
        'bc_id': [bc_id_lookup[bc] for bc in sel_frags.bc],
        'num_reads': sel_frags.num_reads,
        'obs_length': obs_len,
    }

    # The number and sizes of the length bins
    data.update(bin_data)

    # Add extra data for targeting if necessary
    if args.targets_file is not None:
        targets = tk_io.get_target_regions_dict(open(args.targets_file))
        fasta = tenkit.reference.open_reference(args.reference_path)
        ctg_sizes = [(name, len(seq)) for (name, seq) in fasta.items()]
        genome_size = float(sum(l for (name, l) in ctg_sizes))

        gb_size = 1024
        ctg_round_sizes = np.array([math.ceil(float(sz) / gb_size) * gb_size
                                    for (name, sz) in ctg_sizes])
        ctg_starts = np.cumsum(np.concatenate([[0], ctg_round_sizes[:-1]]))
        ctg_start_series = p.Series(np.array(ctg_starts, dtype=np.int64),
                                    index=[name for (name, l) in ctg_sizes])

        targ_cs_ctgs = []
        on_target_bps = {}
        rsum = 0

        for ((ctg, sz), round_sz) in zip(ctg_sizes, ctg_round_sizes):
            targs = np.zeros(round_sz, dtype=np.int32)

            # Mark bases as targeted
            for (s, e) in targets.get(ctg, []):
                targs[s:e] = 1

            for frag_len in data['bin_length']:
                on_target_chrom = np.zeros(round_sz, dtype=np.int8)
                for (s, e) in targets.get(ctg, []):
                    ss = max(0, s - int(frag_len))
                    ee = min(round_sz, e)
                    on_target_chrom[ss:ee] = 1

                # Determine the probability that a fragment of a given length will touch an exon
                on_target_bps[frag_len] = on_target_bps.get(frag_len, 0) + on_target_chrom.sum()
                del on_target_chrom

            # Running sum over chromosomes
            targs_cs = np.cumsum(targs) + rsum
            rsum += np.sum(targs)

            targ_cs_bins = targs_cs[::gb_size].copy()
            del targs
            del targs_cs

            targ_cs_ctgs.append(targ_cs_bins)

        total_target_size = sum((e - s) for regs in targets.values() for (s, e) in regs)
        print "Total target size: %d" % total_target_size

        on_target_fracs = {k: float(v) / genome_size for (k, v) in on_target_bps.items()}
        print on_target_fracs

        # Stan will use this to interpolate the target sizes
        cum_target_bins = np.concatenate(targ_cs_ctgs)
        assert cum_target_bins.shape[0] == int(np.sum(ctg_round_sizes / gb_size))

        # Get the position of each fragment on the laid-out genome, with the position decimated by 8
        ctg_starts = ctg_start_series[sel_frags.chrom].values
        stan_pos = ((ctg_starts + sel_frags.start_pos) / 8).astype(np.int32)
        sel_frags['stan_pos'] = stan_pos
        print sel_frags.head(20)

        data['pos'] = sel_frags.stan_pos
        data['genome_size'] = genome_size
        data['gb_size'] = gb_size
        data['GB'] = len(cum_target_bins)
        data['cum_target_bases'] = cum_target_bins

    # Write out the stan input data
    input_fn = os.path.join(tmp_dir, "input.R")
    write_stan_input(input_fn, data)

    # Generate initial values for the optimization
    ramp = np.linspace(1, 0.1, NUM_LENGTH_BINS)
    ramp = ramp / ramp.sum()

    # Assume that fragments with 1 read were 2kb when setting the initial alpha
    seen_dna = sel_frags.obs_len.sum() + 2000.0 * (sel_frags.num_reads == 1).sum()
    mean_alpha = float(sel_frags.num_reads.sum()) / seen_dna
    frags_mu = sel_bcs.bc_num_fragments.mean()

    # Initial values of the parameters to be estimated by Stan
    init_data = {
        # BC amp rate
        'alpha': [mean_alpha] * nbcs,
        # Length distribution
        'theta': list(ramp),
        # Average number of fragments
        'mean_frags': frags_mu,
        # Number of unobserved fragments
        'bc_unobserved_frags': [100] * nbcs,
        'read_disp': 10,
        'amp_length_k': 1.0 / 200000,
    }
    init_fn = os.path.join(tmp_dir, "init.R")
    write_stan_input(init_fn, init_data)

    # Check whether we have valid data for stan: we need some observed fragments
    # and a minimal number of reads per fragment
    mean_rpf = sel_frags.num_reads.mean()
    martian.log_info("Mean LPM of molecules selected for inference: %f" % mean_rpf)

    success = 0
    if len(sel_frags) > 0 and mean_rpf > MIN_RPF and (not targeted or total_target_size >= MIN_TARGET_SIZE):
        success = run_model(tmp_dir, targeted)
    else:
        if targeted and total_target_size < MIN_TARGET_SIZE:
            martian.log_info("Target size is too small for length/mass inference: %d" % total_target_size)
        if len(sel_frags) == 0:
            martian.log_info("Aborting length-mass inference: no fragments")
        if mean_rpf < MIN_RPF:
            martian.log_info("Reads per fragment too low for length-mass inference: %f" % mean_rpf)

    if success:
        res = load_stan_output(os.path.join(tmp_dir, "output.csv"))

        # If targeted, adjust the fragment length distribution and mass according
        # to the fragment visibility function
        if targeted:
            theta = res['theta']
            bl = data['bin_length']
            vis_func = np.array([on_target_fracs[l] for l in bl])
            print vis_func

            adj_theta = theta / vis_func
            adj_theta = adj_theta / adj_theta.sum()

            missing_factor = 1.0 / (adj_theta * vis_func).sum()

            # Put back in the adjusted values
            res['theta'] = adj_theta
            res['mean_frags'] = missing_factor * res['mean_frags']
            res['bc_total_frags'] = missing_factor * res['bc_total_frags']

        # Report the mass and alpha distributions
        mean_length = (data['bin_length'] * res['theta']).sum()
        mean_length_weighted = np.average(data['bin_length'],
                                          weights=data['bin_length'] * res['theta'])

        # Mass conversion
        ng_per_bp = 1.025e-12
        bases_per_bc = res['bc_total_frags'] * mean_length
        total_bases = res['bc_total_frags'].mean() * mean_length * n_gems
        total_mass_ng = total_bases * ng_per_bp

        # Loaded-mass calculation
        bp_per_ng = 9.76e11

        # Try to calculate the input mass:
        #   bp_per_gem = loaded_mass * bp_per_ng * z2_vol_per_gem / total_z2_vol_input
        #   z2_vol_per_gem -- fluidics number, corrected for empty GEMs (144 pL)
        #   total_z2_vol_input = 65 uL
        # FIXME -- the product configuration needs to be passed in & fixed for future products
        fluidics_params = FLUIDICS_PARAMS['Chromium']
        loaded_mass = (np.mean(bases_per_bc) * fluidics_params['total_z2_vol_input'] /
                       bp_per_ng / fluidics_params['z2_vol_per_gem'])

        # Empirically derived correction factor: ad-hoc correction for the apparent
        # 'denaturation' of the input material, which leads to double counting of input DNA
        DENATURATION_FACTOR = 1.6
        corrected_loaded_mass = loaded_mass / DENATURATION_FACTOR

        stats = {
            'alpha': list(res['alpha']),
            'alpha_mean': np.mean(res['alpha']),
            'alpha_cv': tk_stats.robust_divide(np.std(res['alpha']), np.mean(res['alpha'])),
            'mean_frags': res['mean_frags'],
            'total_frags': res['bc_total_frags'],
            'length_distribution': {str(l): frac for (l, frac) in
                                    zip(data['bin_length'], input_num_frags * res['theta'])},
            'empirical_length_distribution': empirical_dist,
            'inferred_mean_length': mean_length,
            'inferred_lw_mean_length': mean_length_weighted,
            'inferred_total_mass_ng': total_mass_ng,
            'inferred_bp_per_bc': bases_per_bc,
            'mean_bp_per_bc': np.mean(bases_per_bc),
            'loaded_mass_ng': loaded_mass,
            'corrected_loaded_mass_ng': corrected_loaded_mass,
        }
    else:
        len_dist_default = {str(k): 1.0 / k for k in data['bin_length']}
        stats = {
            'alpha': [],
            'alpha_mean': None,
            'alpha_cv': None,
            'mean_frags': None,
            'total_frags': [],
            'length_distribution': len_dist_default,
            'empirical_length_distribution': empirical_dist,
            'inferred_mean_length': None,
            'inferred_lw_mean_length': None,
            'inferred_total_mass_ng': None,
            'inferred_bp_per_bc': [],
            'mean_bp_per_bc': None,
            'loaded_mass_ng': None,
            'corrected_loaded_mass_ng': None,
        }

    stats['occupied_bcs'] = occupied_bcs
    stats['inferred_number_gems'] = n_gems

    return stats
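# A minimal standalone sketch (not part of the pipeline) of the Poisson occupancy
# correction used in main_report_length_mass above: given the number of occupied
# barcodes and the total barcode diversity, back-calculate the implied mean GEMs
# per barcode, the expected number of singleton barcodes, and the implied GEM count.
# The numbers in the example call below are made up.
def _example_poisson_gem_estimate(occupied_bcs, total_diversity):
    import numpy as np
    import scipy.stats

    p_occupied = float(occupied_bcs) / total_diversity
    mean_gems_per_bc = min(100, -np.log(1 - p_occupied))   # from P(occupied) = 1 - exp(-mean)
    p_singleton = scipy.stats.poisson.pmf(1, mean_gems_per_bc)
    n_singleton = p_singleton * total_diversity
    n_gems = int(round(mean_gems_per_bc * total_diversity))
    return mean_gems_per_bc, n_singleton, n_gems

# e.g. _example_poisson_gem_estimate(100000, 4200000) gives a small mean occupancy,
# so nearly all occupied barcodes are expected to be singletons.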