def join(args, outs, chunk_defs, chunk_outs):
    outs.coerce_strings()

    # Concatenate chunks
    if len(chunk_outs) == 1:
        subprocess.call(['mv', chunk_outs[0].phased_possorted_bam, outs.phased_possorted_bam])
    else:
        tk_bam.concatenate(outs.phased_possorted_bam, [out.phased_possorted_bam for out in chunk_outs])
    tk_bam.index(outs.phased_possorted_bam)
    outs.phased_possorted_bam_index = outs.phased_possorted_bam + ".bai"

    total_reads = 0
    phased_reads = 0
    molecule_tagged_reads = 0
    for chunk_out in chunk_outs:
        total_reads += chunk_out.total_reads
        phased_reads += chunk_out.phased_reads
        molecule_tagged_reads += chunk_out.molecule_tagged_reads

    outs.total_reads = total_reads
    outs.phased_reads = phased_reads
    outs.molecule_tagged_reads = molecule_tagged_reads

    fract_reads_phased = tk_stats.robust_divide(float(phased_reads), float(total_reads))
    fract_reads_molecule_id = tk_stats.robust_divide(float(molecule_tagged_reads), float(total_reads))

    stats = {
        "fract_reads_phased": fract_reads_phased,
        "fract_reads_molecule_id": fract_reads_molecule_id,
    }

    with open(outs.summary, 'w') as summary_file:
        json.dump(tenkit.safe_json.json_sanitize(stats), summary_file)

def join(args, outs, chunk_defs, chunk_outs):
    outs.coerce_strings()

    outs.buckets = {}
    outs.qnames = []
    for out in chunk_outs:
        outs.qnames += out.qnames
        for prefix, filename in out.buckets.iteritems():
            if prefix not in outs.buckets:
                outs.buckets[prefix] = []
            outs.buckets[prefix].append(filename)

    # Concatenate all non-barcode reads into a single bam file for the next bucketing pass
    if len(chunk_outs) == 1:
        subprocess.call(['mv', chunk_outs[0].default, outs.default])
    else:
        tk_bam.concatenate(outs.default, [out.default for out in chunk_outs])

    # Remove duplicates and sort all non-barcode qnames from the chunks to determine
    # the qname bucket keys for the next bucketing pass
    outs.qnames = list(set(outs.qnames))
    outs.qnames.sort()
    n = int(len(outs.qnames) / len(outs.buckets))
    outs.qnames = [outs.qnames[i] for i in xrange(0, len(outs.qnames), max(1, n))]
    # Need to add the empty string to qnames to catch all qname strings before the first qname bucket key
    outs.qnames.insert(0, '')

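# A minimal standalone sketch (not part of the stage above) of the qname bucket-key
# scheme: de-duplicate and sort the qnames, keep every n-th one as a bucket boundary,
# and prepend '' so every qname falls at or after some key. The bisect-based lookup
# below is an assumption about how a later bucketing pass might map a qname to its
# bucket; the stage itself only emits the keys.
import bisect

def make_qname_keys(qnames, num_buckets):
    qnames = sorted(set(qnames))
    n = max(1, int(len(qnames) / num_buckets))
    keys = [qnames[i] for i in range(0, len(qnames), n)]
    keys.insert(0, '')  # catch qnames that sort before the first sampled key
    return keys

def bucket_for(qname, keys):
    # index of the right-most key <= qname
    return bisect.bisect_right(keys, qname) - 1

keys = make_qname_keys(['r3', 'r1', 'r9', 'r5', 'r7', 'r1'], num_buckets=2)
assert bucket_for('r0', keys) == 0            # sorts before the first sampled qname
assert bucket_for('r9', keys) == len(keys) - 1
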
def main(args, outs):
    args.coerce_strings()

    tmp_bam = martian.make_path(str(args.cluster_id) + '.unsorted.bam')
    tk_bam.concatenate(tmp_bam, args.cluster_bams)

    outs.merged_bam = martian.make_path('{}.bam'.format(args.cluster_id))
    subprocess.check_call([
        'sambamba', 'sort',
        '-t', str(args.__threads),
        '-o', outs.merged_bam,
        tmp_bam
    ])
    os.remove(tmp_bam)

def join(args, outs, chunk_defs, chunk_outs):
    chunks = zip(chunk_defs, chunk_outs)
    chunks.sort(key=lambda chunk: chunk[0].prefix)

    buckets = []
    outs.total_reads = 0
    for chunk in chunks:
        buckets.append(chunk[1].default)
        outs.total_reads += chunk[1].total_reads

    tk_bam.concatenate(outs.default, buckets)

def join(args, outs, chunk_defs, chunk_outs):
    outs.coerce_strings()
    input_bams = [str(chunk.output) for chunk in chunk_outs]

    # merge and index the chunk BAMs with sambamba
    args_merge = ['sambamba', 'merge', '-t', str(args.__threads), 'output_merge.bam']
    # extend the argument list with the input BAMs
    args_merge.extend(input_bams)
    subprocess.check_call(args_merge)

    # sambamba merge already wrote the merged BAM and its index; move both into place
    os.rename('output_merge.bam', outs.output)
    os.rename('output_merge.bam.bai', outs.output + '.bai')

def join(args, outs, chunk_defs, chunk_outs):
    chunk_lists = [[], []]
    outs.total_reads = 0
    for chunk in zip(chunk_defs, chunk_outs):
        index = chunk[0].index
        chunk_lists[index].append(chunk)
        outs.total_reads += chunk[1].total_reads

    # Sanity check vs. position-sorted BAM
    with tk_bam.create_bam_infile(args.possorted_bam) as possorted_bam_in:
        assert possorted_bam_in.unmapped + possorted_bam_in.mapped == outs.total_reads

    buckets = []
    for chunks in chunk_lists:
        chunks = sorted(chunks, key=lambda chunk: chunk[0].prefix)
        buckets += [chunk[1].default for chunk in chunks]

    tk_bam.concatenate(outs.default, buckets)

def join(args, outs, chunk_defs, chunk_outs):
    outs.coerce_strings()

    # combine the duplicate summary counts
    dup_summaries = [json.load(open(out.duplicate_summary)) for out in chunk_outs]
    combined_dups = reduce(lambda x, y: tenkit.dict_utils.add_dicts(x, y, 2), dup_summaries)
    combined_dups['read_counts'] = {}
    combined_dups['read_counts']['perfect_read_count'] = args.perfect_read_count

    with open(outs.duplicate_summary, 'w') as f:
        json.dump(combined_dups, f)

    # combine & index the chunks of the BAM
    tk_bam.concatenate(outs.output, [c.output for c in chunk_outs])
    tk_bam.index(outs.output)
    outs.index = outs.output + '.bai'

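# Hedged sketch: a hypothetical stand-in for tenkit.dict_utils.add_dicts(x, y, depth)
# as it is used above -- summing counters element-wise and recursing into nested dicts
# up to the given depth. The real tenkit implementation may differ in details.
def add_dicts_sketch(d1, d2, depth):
    out = dict(d1)
    for key, val in d2.items():
        if key not in out:
            out[key] = val
        elif isinstance(val, dict) and depth > 1:
            out[key] = add_dicts_sketch(out[key], val, depth - 1)
        else:
            out[key] = out[key] + val
    return out

a = {'dup_counts': {'optical': 2, 'pcr': 5}}
b = {'dup_counts': {'optical': 1, 'non_proximal': 3}}
merged = add_dicts_sketch(a, b, 2)
assert merged == {'dup_counts': {'optical': 3, 'pcr': 5, 'non_proximal': 3}}
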
def join(args, outs, chunk_defs, chunk_outs):
    chunk_lists = [[], []]
    outs.total_reads = 0
    for chunk in zip(chunk_defs, chunk_outs):
        index = chunk[0].index
        chunk_lists[index].append(chunk)
        outs.total_reads += chunk[1].total_reads

    # Sanity check vs. position-sorted BAM
    with create_bam_infile(args.possorted_bam) as possorted_bam_in:
        assert possorted_bam_in.unmapped + possorted_bam_in.mapped == outs.total_reads

    buckets = []
    for chunks in chunk_lists:
        chunks = sorted(chunks, key=lambda chunk: chunk[0].prefix)
        buckets += [chunk[1].bcsorted_bam for chunk in chunks]

    tk_bam.concatenate(outs.bcsorted_bam, buckets)

    print "%s indexing BAM file" % PROCESSED_BARCODE_TAG
    index = tenkit.bam.BamBCIndex(outs.bcsorted_bam)
    index.save_index()
    outs.bcsorted_bam_index = outs.bcsorted_bam + ".bxi"
    print "Wrote bx index to %s" % outs.bcsorted_bam_index

def join(args, outs, chunk_defs, chunk_outs):
    outs.coerce_strings()
    input_bams = [str(chunk.output) for chunk in chunk_outs]
    tk_bam.concatenate(outs.output, input_bams)
    tk_bam.index(outs.output)

def join(args, outs, chunk_defs, chunk_outs):
    args_dict = {}
    args_dict["bc_allow_indel"] = args.bc_allow_indel
    args_dict["bc_max_error_allowed"] = args.bc_max_error_allowed
    args_dict["bc_pseudo_count"] = args.bc_pseudo_count
    args_dict["bc_use_mapping"] = args.bc_use_mapping
    args_dict["bc_mapq"] = args.bc_mapq
    args_dict["frag_no_merging"] = args.frag_no_merging
    args_dict["frag_mapq"] = args.frag_mapq
    args_dict["frag_pval"] = args.frag_pval
    args_dict["frag_freq"] = args.frag_freq

    fsummary = open(outs.summary, "w")
    fsummary.write(safe_json.safe_jsonify(args_dict))
    fsummary.close()

    tk_bam.concatenate(out_file_name=outs.pos_sorted_bam,
                       all_in_file_names=[chunk.pos_sorted_bam for chunk in chunk_outs])
    tk_bam.index(outs.pos_sorted_bam)
    outs.pos_sorted_bam_index = outs.pos_sorted_bam + '.bai'

    bam_in = tk_bam.create_bam_infile(outs.pos_sorted_bam)
    chroms = bam_in.references

    barcode_whitelist = list(tk_seq.load_barcode_whitelist(args.barcode_whitelist))
    barcode_whitelist.sort()

    # Combine fragment csv files into a single h5 file
    in_csv_files = [co.fragments + "_" + cd.tid + ".csv"
                    for (cd, co) in zip(chunk_defs, chunk_outs)
                    if os.path.exists(co.fragments + "_" + cd.tid + ".csv")]

    nfrags = 0
    if len(in_csv_files) > 0:
        bc_num_frags = defaultdict(int)
        bc_num_reads = defaultdict(int)
        bc_num_single_reads = defaultdict(int)
        bc_num_lens = defaultdict(int)

        temp_csv_barcodes = outs.barcodes + "_temp.csv"
        nfrags = 0

        for f in in_csv_files:
            # TODO - sequentially append to fragments.h5 file to keep memory under control
            #      - handle multiple GEM groups properly;
            #        ensure the chroms column has string / categorical type in hdf5
            #      - same fixes for barcodes.h5 file
            #      - handle 0-length outputs -- does that result in None file outs?
            frag_in = p.read_csv(f, names=["tid", "start_pos", "end_pos", "bc_id", "num_reads"])
            frag_in["obs_len"] = frag_in.end_pos - frag_in.start_pos
            frag_in.loc[frag_in.num_reads <= 1, "obs_len"] = 1000

            frag_in["est_len"] = np.maximum(1, frag_in["obs_len"] * (frag_in.num_reads + 1) / np.maximum(1, frag_in.num_reads - 1)).astype("int")
            frag_in.loc[frag_in.num_reads <= 1, "est_len"] = 1000

            barcode_seqs = []
            molecule_ids = []

            for (i, row) in frag_in.iterrows():
                bc_num_frags[row.bc_id] += 1
                bc_num_reads[row.bc_id] += row.num_reads
                bc_num_lens[row.bc_id] += row.est_len

                bc_wl_id = int(row.bc_id) % len(barcode_whitelist)
                gg = int(row.bc_id) / len(barcode_whitelist) + 1
                barcode_seq = "%s-%d" % (barcode_whitelist[bc_wl_id], gg)
                barcode_seqs.append(barcode_seq)
                molecule_ids.append(nfrags)

                nfrags += 1

            frag_in["bc"] = p.Categorical(barcode_seqs)
            frag_in["chrom"] = p.Categorical.from_codes(frag_in.tid, chroms)
            frag_in["molecule_id"] = molecule_ids

            del frag_in["tid"]
            del frag_in["bc_id"]

            if len(frag_in) > 0:
                tenkit.hdf5.append_data_frame(outs.fragments, frag_in)

        with open(temp_csv_barcodes, "w") as csv_out:
            csv_out.write("bc,bc_est_len,bc_linked_read_fraction,bc_linked_fragment_fraction,bc_mean_reads_per_fragment,bc_num_fragments,bc_num_reads\n")
            for bc_id in range(len(barcode_whitelist)):
                bc = barcode_whitelist[bc_id] + "-1"
                if bc_id in bc_num_frags:
                    bc_est_len = bc_num_lens[bc_id]
                    bc_linked_read_fraction = 1.0 - bc_num_single_reads[bc_id] * 1.0 / bc_num_reads[bc_id]
                    bc_linked_fragment_fraction = 1.0 - bc_num_single_reads[bc_id] * 1.0 / bc_num_frags[bc_id]
                    bc_mean_reads_per_fragment = bc_num_reads[bc_id] * 1.0 / bc_num_frags[bc_id]
                    csv_out.write("%s,%d,%f,%f,%f,%d,%d\n" % (bc, bc_est_len, bc_linked_read_fraction, bc_linked_fragment_fraction, bc_mean_reads_per_fragment, bc_num_frags[bc_id], bc_num_reads[bc_id]))

        if nfrags == 0:
            outs.fragments = None
            outs.barcodes = None
        else:
            tenkit.hdf5.create_tabix_index(outs.fragments, 'chrom', 'start_pos', 'end_pos')

            df_barcodes = p.read_csv(temp_csv_barcodes)
            tenkit.hdf5.append_data_frame(outs.barcodes, df_barcodes)

    else:
        outs.fragments = None
        outs.barcodes = None

    summary = {}

    # Compute high-level BC summary metrics
    # Load BC data
    if outs.barcodes:
        bc_df = tenkit.hdf5.read_data_frame(outs.barcodes)
        fragment_df = tenkit.hdf5.read_data_frame(outs.fragments, query_cols=['bc', 'num_reads', 'est_len', 'chrom', 'start_pos'])

        bc_df.sort('bc_num_reads', inplace=True)

        # bin the bc counts and write a json histogram file
        n_reads = bc_df.bc_num_reads.values
        max_val = np.percentile(n_reads, 99.99) * 1.3
        min_val = n_reads.min()
        num_bins = 400
        step = math.ceil((max_val - min_val) / num_bins)
        bins = np.arange(min_val, max_val, step)
        (hist, edges) = np.histogram(n_reads, bins=bins)
        bc_count_hist = {int(edges[i]): hist[i] for i in range(len(bins) - 1)}

        # Summarize properties of the n50 and n90 BC sets
        bc_df['cum_reads'] = np.cumsum(bc_df.bc_num_reads)

        n50_read_thresh = sum(bc_df.bc_num_reads) * 0.5
        n50_bcs = bc_df[bc_df.cum_reads > n50_read_thresh]
        n50_fra = fragment_df[fragment_df.bc.isin(n50_bcs.bc)]
        n50_stats = high_level_stats("n50", n50_fra, n50_bcs)
        del n50_fra

        n90_read_thresh = sum(bc_df.bc_num_reads) * 0.1
        n90_bcs = bc_df[bc_df.cum_reads > n90_read_thresh]
        n90_fra = fragment_df[fragment_df.bc.isin(n90_bcs.bc)]
        n90_stats = high_level_stats("n90", n90_fra, n90_bcs)
        del n90_fra

        for (k, v) in n50_stats.iteritems():
            summary[k] = v
        for (k, v) in n90_stats.iteritems():
            summary[k] = v

        # Generate a fragment length histogram
        fragment_df['len_bin'] = np.floor_divide(fragment_df.est_len.values, FRAG_LEN_HIST_BIN_SIZE).astype(int) * FRAG_LEN_HIST_BIN_SIZE

        multi_read_frags = fragment_df[fragment_df.num_reads > 1]
        len_bins = multi_read_frags.groupby(['len_bin']).apply(len)
        del multi_read_frags

        len_hist = {k: v for (k, v) in len_bins.iteritems()}

        # Write fragment length hist to json
        with open(outs.fragment_size, 'w') as fragment_size_file:
            tenkit.safe_json.dump_numpy(len_hist, fragment_size_file)

        # Estimate total DNA per partition by looking at the hottest GEMs
        # with bc_mean_reads_per_fragment > 2, whichever is fewer
        hot_bcs = bc_df[np.logical_and(bc_df.bc_mean_reads_per_fragment > 2.0, bc_df.bc_num_reads > 25)]
        hot_bcs.sort('bc_mean_reads_per_fragment', inplace=True)
        if len(hot_bcs) > 50:
            hot_bcs = hot_bcs[-NUM_BCS_LOADING_ESTIMATE:]
            summary['estimated_dna_per_partition'] = round(scipy.stats.tmean(hot_bcs.bc_est_len, scipy.percentile(hot_bcs.bc_est_len, (1, 99))))
        else:
            summary['estimated_dna_per_partition'] = None

        # Read-based effective diversity
        reads = bc_df.bc_num_reads.values
        sum_sq = (reads ** 2.0).sum()
        effective_diversity = tk_stats.robust_divide((reads.sum() ** 2.0), float(sum_sq))
        summary['effective_diversity_reads'] = effective_diversity

        # Fragment-based effective diversity
        fragments = bc_df.bc_num_fragments.values
        sum_sq = (fragments ** 2.0).sum()
        effective_diversity = tk_stats.robust_divide((fragments.sum() ** 2.0), float(sum_sq))
        summary['effective_diversity_fragments'] = effective_diversity

    else:
        # No fragment_size file emitted
        outs.fragment_size = None

        n50_stats = high_level_stats("n50", None, None)
        n90_stats = high_level_stats("n90", None, None)

        for (k, v) in n50_stats.iteritems():
            summary[k] = v
        for (k, v) in n90_stats.iteritems():
            summary[k] = v

        bc_count_hist = {}

        summary['estimated_dna_per_partition'] = None
        summary['effective_diversity_reads'] = None
        summary['effective_diversity_fragments'] = None

    with open(outs.barcode_histogram, 'w') as barcode_hist_file:
        tenkit.safe_json.dump_numpy(bc_count_hist, barcode_hist_file)

    # Write summary to json
    with open(outs.single_partition, 'w') as summary_file:
        tenkit.safe_json.dump_numpy(summary, summary_file, pretty=True)

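# Worked example (standalone, illustrative only) of the "effective diversity"
# metric computed above: (sum of reads)^2 / (sum of squared reads). For a
# perfectly even read distribution it equals the number of barcodes; skew
# toward a few hot barcodes pulls it down.
import numpy as np

def effective_diversity(reads):
    return reads.sum() ** 2.0 / (reads ** 2.0).sum()

even = np.array([100.0, 100.0, 100.0, 100.0])
skewed = np.array([370.0, 10.0, 10.0, 10.0])

print(effective_diversity(even))    # 4.0   -> all four barcodes contribute equally
print(effective_diversity(skewed))  # ~1.17 -> dominated by a single barcode
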
def join(args, outs, chunk_defs, chunk_outs):
    buckets = []
    for _, out in sorted(zip(chunk_defs, chunk_outs), key=lambda x: x[0].prefix):
        buckets.append(out.bcsorted_bam)
    tk_bam.concatenate(outs.bcsorted_bam, buckets)