def join(args, outs, chunk_defs, chunk_outs):
    join_df = None
    for chunk in chunk_outs:
        df = sv_io.read_sv_bedpe_to_df(chunk.sv_calls)
        join_df = pd.concat([join_df, df], ignore_index=True)
    sv_io.write_sv_df_to_bedpe(join_df, outs.sv_calls)
def split(args):
    sv_df = tk_sv_io.read_sv_bedpe_to_df(args.variants)
    gt_df = tk_sv_io.read_sv_bedpe_to_df(args.gt_variants)
    tk_sv_io.check_sv_names(gt_df)

    sv_df["name"] = ["call_%d" % idx for idx in range(len(sv_df))]
    variants_bedpe = os.path.join(os.getcwd(), "variants.bedpe")
    tk_sv_io.write_sv_df_to_bedpe(sv_df, variants_bedpe)

    nsvs = sv_df.shape[0]
    nbreaks_per_chunk = max(100, int(np.ceil(nsvs / 32.0)))  # avoid overchunking
    nchunks = int(np.ceil(nsvs / float(nbreaks_per_chunk)))

    chunk_defs = []
    for i in range(nchunks):
        chunk_start = i * nbreaks_per_chunk
        chunk_end = min(nsvs, (i + 1) * nbreaks_per_chunk)
        chunk_defs.append({'renamed_variants': variants_bedpe,
                           'start_idx': chunk_start,
                           'stop_idx': chunk_end,
                           '__mem_gb': 12})
    if len(chunk_defs) == 0:
        chunk_defs = [{'renamed_variants': variants_bedpe,
                       'start_idx': 0,
                       'stop_idx': 0,
                       '__mem_gb': 12}]
    return {'chunks': chunk_defs, 'join': {'__mem_gb': 16}}
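# Worked example of the chunking arithmetic above (hypothetical numbers):
# with nsvs = 1000, nbreaks_per_chunk = max(100, ceil(1000 / 32.0)) = 100,
# so nchunks = ceil(1000 / 100.0) = 10 and the last chunk spans
# start_idx = 900, stop_idx = 1000.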
def join(args, outs, chunk_defs, chunk_outs):
    out_df = None
    for chunk in chunk_outs:
        tmp_df = tk_sv_io.read_sv_bedpe_to_df(chunk.del_candidates)
        out_df = pd.concat([out_df, tmp_df], ignore_index=True)
    out_df['name'] = np.arange(len(out_df))
    tk_sv_io.write_sv_df_to_bedpe(out_df, outs.del_candidates)
def join(args, outs, chunk_defs, chunk_outs):
    join_df = None
    for chunk in chunk_outs:
        bedpe_df = tk_sv_io.read_sv_bedpe_to_df(chunk.sv_variants)
        join_df = pd.concat([join_df, bedpe_df], ignore_index=True)
    if not args.best_only:
        join_df['name'] = np.arange(len(join_df))
    tk_sv_io.write_sv_df_to_bedpe(join_df, outs.sv_variants)
def join(args, outs, chunk_defs, chunk_outs):
    out_calls = None
    out_pileups = None
    for c in chunk_outs:
        if not os.path.isfile(c.sv_calls):
            continue
        calls = tk_sv_io.read_sv_bedpe_to_df(c.sv_calls)
        pileups = tk_sv_io.read_sv_bedpe_to_df(c.pileups)
        out_calls = pd.concat([out_calls, calls], ignore_index=True)
        out_pileups = pd.concat([out_pileups, pileups], ignore_index=True)
    tk_sv_io.write_sv_df_to_bedpe(out_calls, outs.sv_calls)
    tk_sv_io.write_sv_df_to_bedpe(out_pileups, outs.pileups)
def main(args, outs):
    rust_env = os.environ.copy()
    rust_env["RUST_BACKTRACE"] = "1"

    if args.chunk_bed is None:
        sv_io.write_sv_df_to_bedpe(None, outs.sv_calls)
        return

    # Run PVC
    fasta = tenkit.reference.get_fasta(args.reference_path)
    pvc_args = ['pvc',
                '--min-kmer-obs', str(args.min_kmer_obs),
                '--coverage-json', args.coverage_json]
    pvc_args.extend(["call-bed", "-o", outs.sv_calls,
                     fasta, args.possorted_bam, args.chunk_bed])
    subprocess.check_call(pvc_args, env=rust_env)
def join(args, outs, chunk_defs, chunk_outs):
    join_df = None
    non_pass_join_df = None
    for chunk in chunk_outs:
        df = tk_sv_io.read_sv_bedpe_to_df(chunk.sv_calls)
        non_pass_df = tk_sv_io.read_sv_bedpe_to_df(chunk.non_pass_sv_calls)
        join_df = pd.concat([join_df, df], ignore_index=True)
        non_pass_join_df = pd.concat([non_pass_join_df, non_pass_df], ignore_index=True)

    join_df['name'] = np.arange(len(join_df))
    tk_sv_io.write_sv_df_to_bedpe(join_df, outs.sv_calls)

    non_pass_join_df['name'] = np.arange(len(join_df), len(join_df) + len(non_pass_join_df))
    tk_sv_io.write_sv_df_to_bedpe(non_pass_join_df, outs.non_pass_sv_calls)
def join(args, outs, chunk_defs, chunk_outs):
    out_bedpe = None
    for c in chunk_outs:
        if not os.path.isfile(c.sv_variants):
            continue
        in_bedpe = sv_io.read_sv_bedpe_to_df(c.sv_variants)
        if in_bedpe is not None:
            out_bedpe = pd.concat([out_bedpe, in_bedpe], ignore_index=True)

    if out_bedpe is not None:
        out_bedpe['name'] = np.arange(len(out_bedpe))
    sv_io.write_sv_df_to_bedpe(out_bedpe, outs.sv_variants)

    if chunk_outs[0] is not None and os.path.exists(chunk_outs[0].summary):
        shutil.copyfile(chunk_outs[0].summary, outs.summary)
    else:
        outs.summary = None
def main(args, outs):
    callsets = []
    if args.calls1 is not None:
        c = sv_io.read_sv_bedpe_to_df(args.calls1)
        callsets.append(c)
    if args.calls2 is not None:
        c = sv_io.read_sv_bedpe_to_df(args.calls2)
        callsets.append(c)
    if args.calls3 is not None:
        c = sv_io.read_sv_bedpe_to_df(args.calls3)
        callsets.append(c)

    # Merge overlapping calls, keeping the widest call in each overlap group
    merged = merge_overlapping(callsets, select_widest())
    sv_io.write_sv_df_to_bedpe(merged, outs.merged)
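# `merge_overlapping` and `select_widest` are defined elsewhere; a minimal,
# hypothetical sketch of the selector contract assumed above: given the rows
# of one overlap group, return the single row to keep (here, the widest span).
def select_widest_sketch():
    def selector(rows):
        # rows: BEDPE records (e.g. pandas Series) that mutually overlap
        return max(rows, key=lambda r: r.stop2 - r.start1)
    return selector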
def join(args, outs, chunk_defs, chunk_outs):
    out_bedpe = None
    for c in chunk_outs:
        if not os.path.isfile(c.sv_variants):
            continue
        in_bedpe = tk_sv_io.read_sv_bedpe_to_df(c.sv_variants)
        if in_bedpe is not None:
            out_bedpe = pd.concat([out_bedpe, in_bedpe], ignore_index=True)

    if out_bedpe is None:
        col_names = ['chrom1', 'start1', 'stop1', 'chrom2', 'start2', 'stop2',
                     'name', 'qual', 'strand1', 'strand2', 'filters', 'info']
        out_bedpe = pd.DataFrame(columns=col_names)

    # Assign unique names; this must set the 'name' column, not a
    # DataFrame attribute (out_bedpe.names would silently do nothing useful)
    out_bedpe['name'] = np.arange(len(out_bedpe))
    out_bedpe = out_bedpe[out_bedpe.qual >= args.sv_min_qv]
    tk_sv_io.write_sv_df_to_bedpe(out_bedpe, outs.sv_variants)
def main(args, outs):
    if args.chrom is None or len(args.starts) == 0 or args.barcode_whitelist is None:
        tk_sv_io.write_sv_df_to_bedpe(None, outs.sv_calls)
        return

    max_insert, ins_logsf_fun = tk_sv_utils.get_insert_size_info(args.insert_sizes,
                                                                 MAX_INSERT_SIZE_PRC)
    if max_insert is None:
        martian.throw('No Q60 reads')

    # This is slightly bigger than the maximum "normal" insert
    min_call_insert, _ = tk_sv_utils.get_insert_size_info(args.insert_sizes,
                                                          MIN_SV_INSERT_SIZE_PRC)
    min_sv_len = max(args.min_sv_len, min_call_insert)
    martian.log_info('Setting min_sv_len to {}'.format(min_sv_len))

    with open(args.basic_summary, 'r') as f:
        summary = json.load(f)

    chimera_rate_del = summary['far_chimera_rate']
    chimera_rate_inv = summary['far_chimera_rate'] + summary['same_dir_chimera_rate']
    chimera_rate_dup = summary['far_chimera_rate'] + summary['outward_dir_chimera_rate']
    chimera_rates = {tk_readpairs.DEL_STR: chimera_rate_del,
                     tk_readpairs.INV_STR: chimera_rate_inv,
                     tk_readpairs.TDUP_STR: chimera_rate_dup,
                     tk_readpairs.TRANS_STR: summary['far_chimera_rate']}

    df, read_counts, _ = tk_readpairs.get_discordant_loci(
        args.possorted_bam, chrom=str(args.chrom),
        starts=args.starts, stops=args.stops,
        min_mapq=args.min_mapq, min_insert=0, max_insert=max_insert,
        max_merge_range=args.merge_range_factor * max_insert,
        min_sv_len=min_sv_len, max_sv_len=args.max_sv_len,
        ins_logsf_fun=ins_logsf_fun,
        min_lr_to_call=args.min_lr_to_call,
        min_reads_to_call=args.min_reads_to_call,
        chimera_rate=chimera_rates, reads_as_qual=True)

    # Need to convert to dict because defaultdict doesn't get pickled properly
    read_counts['split'] = dict(read_counts['split'])
    read_counts['pair'] = dict(read_counts['pair'])

    tk_sv_io.write_sv_df_to_bedpe(df, outs.sv_calls)
    with open(outs.discordant_read_counts, 'w') as f:
        f.write(tenkit.safe_json.safe_jsonify(read_counts))
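# The chimera-rate computation above reads exactly three keys from
# args.basic_summary; an illustrative (made-up) example of that JSON:
# {
#     "far_chimera_rate": 1.2e-05,
#     "same_dir_chimera_rate": 3.0e-06,
#     "outward_dir_chimera_rate": 2.0e-06
# }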
def main(args, outs):
    if args.barcode_whitelist is None:
        # write empty dataframe
        tk_sv_io.write_sv_df_to_bedpe(None, outs.del_candidates)
        martian.log_info('Data seem un-barcoded. No deletion candidates will be computed.')
        return

    # pvc command-line usage, for reference:
    #
    #   pvc [options] call-one <fasta> <bam> <locus>
    #   pvc [options] call-bed -o <out> <fasta> <bam> <bed> [<which>]
    #   pvc [options] bam-svs <out> <bam>
    #   pvc [options] cands <bam> <locus> <out>
    #   pvc (-h | --help)
    #   pvc --version
    #
    # Options:
    #   --min-size=<m>      Minimum event size
    #   --min-kmer-obs=<k>  Minimum number of kmer observations
    #   -h --help           Show this screen.
    #   --version           Show version.
    #   --trace             Trace logging
    #   -d --debug          Debug logging

    for locus in args.loci:
        tmp_file = "tmp.bedpe"
        min_detect_size = 25
        pvc_args = ['pvc',
                    '--het-read-prob=%f' % args.het_read_prob,
                    '--min-size=%d' % min_detect_size,
                    'cands', args.possorted_bam, locus, tmp_file]
        subprocess.check_call(pvc_args)
        subprocess.check_call('cat %s >> %s' % (tmp_file, outs.del_candidates), shell=True)
def main(args, outs):
    sv_df = tk_sv_io.read_sv_bedpe_to_df(args.sv_variants)
    sv_df["info2"] = "SV"
    cnv_df = tk_sv_io.read_sv_bedpe_to_df(args.cnv_variants)
    cnv_df["info2"] = "CNV"
    sv_df = pd.concat([sv_df, cnv_df], ignore_index=True)
    sv_df['name'] = np.arange(len(sv_df))
    sv_df.sort(['chrom1', 'chrom2'], inplace=True)

    res_df = None
    for _, tmp_df in sv_df.groupby(['chrom1', 'chrom2']):
        tmp_df.sort(['chrom1', 'start1', 'stop1', 'chrom2', 'start2', 'stop2'], inplace=True)
        # cluster the loci in the group based on proximity
        groups = tk_sv_utils.get_break_groups(tmp_df, args.max_dist)

        # for each cluster, get the row with max qual;
        # tmp_df.loc[g] gets the subset of tmp_df in the cluster,
        # then idxmax gets the max index
        out_df = pd.DataFrame(columns=sv_df.columns)
        idx = 0
        for g in groups:
            row = tmp_df.loc[tmp_df.loc[g]['qual'].idxmax()]
            # prefer an SV call over a CNV call if the cluster contains both
            if (tmp_df.loc[g]['info2'] == 'SV').any():
                row = tmp_df.loc[(tmp_df.loc[g]['info2'] == 'SV').idxmax()]
            source = list(set(tmp_df.loc[g]['info2']))
            row['info'] += (";SOURCE=" + ",".join(source))
            out_df.loc[idx] = row
            idx += 1

        out_df.sort(['start1', 'stop1', 'start2', 'stop2'], inplace=True)
        res_df = pd.concat([res_df, out_df], ignore_index=True)

    tk_sv_io.write_sv_df_to_bedpe(res_df, outs.sv_variants)
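# `tk_sv_utils.get_break_groups` is used above but not shown in this section.
# A rough, hypothetical stand-in for its contract: walk a coordinate-sorted
# BEDPE dataframe and emit lists of row indices whose breakpoints lie within
# max_dist of the previous call's breakpoints.
def get_break_groups_sketch(df, max_dist):
    groups = []
    prev = None
    for idx, row in df.iterrows():
        if prev is not None and \
           abs(row.start1 - prev.start1) <= max_dist and \
           abs(row.start2 - prev.start2) <= max_dist:
            groups[-1].append(idx)
        else:
            groups.append([idx])
        prev = row
    return groups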
def join(args, outs, chunk_defs, chunk_outs):
    join_df = None
    read_counts = {}
    read_counts['split'] = defaultdict(int)
    read_counts['pair'] = defaultdict(int)

    for chunk in chunk_outs:
        bedpe_df = tk_sv_io.read_sv_bedpe_to_df(chunk.sv_calls)
        join_df = pd.concat([join_df, bedpe_df], ignore_index=True)
        if not os.path.isfile(chunk.discordant_read_counts):
            continue
        with open(chunk.discordant_read_counts, 'r') as f:
            counts = json.load(f)
        for t, c in counts['split'].iteritems():
            read_counts['split'][t] += c
        for t, c in counts['pair'].iteritems():
            read_counts['pair'][t] += c

    join_df['name'] = [str(i) for i in np.arange(len(join_df))]
    tk_sv_io.write_sv_df_to_bedpe(join_df, outs.sv_calls)

    read_counts['split'] = dict(read_counts['split'])
    read_counts['pair'] = dict(read_counts['pair'])

    with open(args.basic_summary, 'r') as f:
        num_reads = float(json.load(f)['num_reads']) / 2.0

    read_counts['frac_split'] = {}
    read_counts['frac_pair'] = {}
    for t, c in read_counts['split'].iteritems():
        read_counts['frac_split'][t] = c / num_reads
    for t, c in read_counts['pair'].iteritems():
        read_counts['frac_pair'][t] = c / num_reads

    with open(outs.discordant_read_counts, 'w') as f:
        f.write(tenkit.safe_json.safe_jsonify(read_counts))
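# Shape of the discordant_read_counts JSON written above (illustrative values):
# {
#     "split":      {"DEL": 1200, "INV": 340},
#     "pair":       {"DEL": 5100, "INV": 760},
#     "frac_split": {"DEL": 1.2e-06, "INV": 3.4e-07},
#     "frac_pair":  {"DEL": 5.1e-06, "INV": 7.6e-07}
# }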
def main(args, outs):
    pred_df = tk_sv_io.read_sv_bedpe_to_df(args.sv_calls)
    pred_df = tk_sv_utils.get_dataframe_loc(pred_df,
                                            list(range(args.start_idx, args.stop_idx)))

    in_bam = tk_bam.create_bam_infile(args.possorted_bam)

    frac_changed = np.zeros((len(pred_df),), dtype=np.float)
    for i, (_, row) in enumerate(pred_df.iterrows()):
        frac_changed[i] = get_frac_mapq_changed(
            in_bam,
            row.chrom1, max(0, row.start1 - BREAK_EXT), row.stop1 + BREAK_EXT,
            row.chrom2, max(0, row.start2 - BREAK_EXT), row.stop2 + BREAK_EXT,
            min_mapq=60)

    pileups = pred_df[frac_changed > args.max_frac_low_mapq]
    pred_df = pred_df[frac_changed <= args.max_frac_low_mapq]
    tk_sv_io.write_sv_df_to_bedpe(pred_df, outs.sv_calls)
    tk_sv_io.write_sv_df_to_bedpe(pileups, outs.pileups)
def main(args, outs):
    pred_df = tk_sv_io.read_sv_bedpe_to_df(args.sv_calls)
    pred_df = tk_sv_utils.get_dataframe_loc(pred_df,
                                            list(range(args.start_idx, args.stop_idx)))

    in_bam = tk_bam.create_bam_infile(args.possorted_bam)
    cov_reader = tk_hdf5.DataFrameReader(args.hap_coverage)
    sel_cols = ['cov_q30_hap0', 'cov_q30_hap1', 'cov_q30_hap2']

    has_pileups = np.zeros((len(pred_df),), dtype=np.bool)
    for i, (_, row) in enumerate(pred_df.iterrows()):
        has_clipped1 = has_too_many_clipped(in_bam, row.chrom1,
                                            max(0, row.start1 - BREAK_EXT),
                                            row.stop1 + BREAK_EXT,
                                            max_clipped_frac=args.max_clipped_frac)
        has_clipped2 = has_too_many_clipped(in_bam, row.chrom2,
                                            max(0, row.start2 - BREAK_EXT),
                                            row.stop2 + BREAK_EXT,
                                            max_clipped_frac=args.max_clipped_frac)
        has_clipped = has_clipped1 and has_clipped2

        if row.chrom1 != row.chrom2 or row.start2 - row.stop1 > MAX_FRAG_SIZE:
            has_pileups[i] = has_clipped
            continue

        cov = cov_reader.query((row.chrom1, max(0, row.start1 - BREAK_EXT),
                                row.stop2 + BREAK_EXT))
        cov['bin'] = np.array(cov['pos'] / BIN_WIN, dtype=np.int)
        if 'coverage_deduped' not in cov.columns:
            cov['coverage_deduped'] = cov[sel_cols].sum(axis=1)
        cov_arr = np.array(cov.groupby('bin').mean()['coverage_deduped'])
        median_cov = np.median(cov_arr)

        # Rescue for deletions or duplications with breakpoints on the pileups
        sv_len = row.stop2 - row.start1
        side_cov = cov_reader.query((row.chrom1,
                                     max(0, row.start1 - BREAK_EXT - sv_len / 2),
                                     row.start1 - BREAK_EXT))
        side_cov = pd.concat([side_cov,
                              cov_reader.query((row.chrom2, row.stop2 + BREAK_EXT,
                                                row.stop2 + BREAK_EXT + sv_len / 2))],
                             ignore_index=True)
        if 'coverage_deduped' not in side_cov.columns:
            side_cov['coverage_deduped'] = side_cov[sel_cols].sum(axis=1)

        # Ignore pileups, enough evidence for a large-scale copy number variant
        if np.median(cov.coverage_deduped) < DEL_REL_COV * np.median(side_cov.coverage_deduped):
            continue
        if np.median(cov.coverage_deduped) > DUP_REL_COV * np.median(side_cov.coverage_deduped):
            continue

        # Filter out the call if there are pileups very close to the breakpoints
        has_pileups[i] = len(cov_arr) > 4 and \
            np.any(cov_arr[[0, 1, -2, -1]] > args.min_rel_depth * median_cov)
        has_pileups[i] = has_pileups[i] or has_clipped

    pileups = pred_df[has_pileups]
    pred_df = pred_df[np.logical_not(has_pileups)]
    tk_sv_io.write_sv_df_to_bedpe(pred_df, outs.sv_calls)
    tk_sv_io.write_sv_df_to_bedpe(pileups, outs.pileups)
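# Numeric illustration of the rescue logic above (constants hypothetical):
# with DEL_REL_COV = 0.5 and flanking median coverage of 40x, a candidate
# whose inner median coverage is below 20x is kept as a likely deletion even
# if its breakpoint bins look like pileups; symmetrically, DUP_REL_COV
# rescues candidates with clearly elevated inner coverage as duplications.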
def main(args, outs):
    if args.barcode_whitelist is None:
        # write empty dataframe
        tk_sv_io.write_sv_df_to_bedpe(None, outs.del_candidates)
        martian.log_info('Data seem un-barcoded. No deletion candidates will be computed.')
        return

    in_bam = tk_bam.create_bam_infile(args.possorted_bam)

    del_loci = []
    for (chrom, start, stop) in (tk_io.get_locus_info(l) for l in args.loci):
        cov_df = get_hap_coverage(in_bam, None, chrom, start, stop, cov_quals=[30])
        best_path = get_candidate_del_loci(cov_df,
                                           transition_prob=args.transition_prob,
                                           het_read_prob=args.het_read_prob)

        # Get regions with good coverage for a het del (not too high, not too low)
        bad_cov = np.logical_or(cov_df['total_cov'] < MIN_COV,
                                cov_df['total_cov'] > MAX_COV)
        bad_regions = tk_regions.Regions([(s, e)
                                          for (s, e) in group_bit_arr(bad_cov, start)
                                          if e - s > args.min_bad_region])

        # Group the states of the HMM and exclude bad regions
        pos = start
        out_loci = []
        for bit, group in groupby(best_path):
            group_size = len(list(group))
            group_start = pos
            group_stop = group_start + group_size
            if bit and group_size >= args.min_del_len and group_size <= args.max_del_len and \
               not bad_regions.overlapping_regions(group_start, group_stop):
                out_loci.append((chrom, group_start, group_stop))
            pos += group_size

        # Get regions that look like hom dels
        hom_del_loci = group_bit_arr(cov_df['total_cov'] < MIN_COV, start)
        out_loci.extend([(chrom, s, e) for (s, e) in hom_del_loci])
        out_loci = sorted(out_loci)

        # Now merge deletion candidates that are separated by short non-dels
        if out_loci:
            new_out_loci = []
            last_locus = out_loci[0]
            for i, locus in enumerate(out_loci[1:]):
                if locus[1] - last_locus[2] > MIN_GAP:
                    new_out_loci.append(last_locus)
                    last_locus = locus
                else:
                    last_locus = (last_locus[0],
                                  min(locus[1], last_locus[1]),
                                  max(locus[2], last_locus[2]))
            new_out_loci.append(last_locus)
            del_loci.extend(new_out_loci)

    final_loci = [locus for locus in del_loci
                  if locus[2] - locus[1] >= args.min_del_len
                  and locus[2] - locus[1] <= args.max_del_len]
    info_strs = ['TYPE=DEL' for _ in final_loci]

    in_bam.close()

    chroms = [locus[0] for locus in final_loci]
    starts1 = np.array([locus[1] for locus in final_loci], dtype=np.int)
    starts2 = np.array([locus[2] for locus in final_loci], dtype=np.int)
    sv_df = tk_sv_io.create_sv_df(chroms, starts1, starts1 + 1,
                                  chroms, starts2, starts2 + 1,
                                  np.arange(len(chroms)), 1, info_strs=info_strs)
    tk_sv_io.write_sv_df_to_bedpe(sv_df, outs.del_candidates)
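# `group_bit_arr` is referenced above but not defined in this section. A
# hypothetical sketch of the assumed behavior, mirroring the groupby idiom
# used in the HMM-state loop: convert a boolean array into (start, end)
# intervals of consecutive True values, offset by the region start.
from itertools import groupby

def group_bit_arr_sketch(bits, start):
    pos = start
    for bit, group in groupby(bits):
        size = len(list(group))
        if bit:
            yield (pos, pos + size)
        pos += size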
def join(args, outs, chunk_defs, chunk_outs):
    pred_to_match, _, pred_df, true_df, min_qv = merge_predictions(chunk_outs)

    # Change TRANS type to DISTAL. This change will only
    # affect the type reported, not the names of the metrics.
    new_info = []
    for _, row in pred_df.iterrows():
        sv_type = tk_sv_io.get_sv_type(row.info)
        if sv_type == 'TRANS':
            sv_type = 'DISTAL'
        new_info.append(tk_sv_io.update_info(row.info, ['TYPE'], [sv_type]))
    pred_df['info'] = new_info

    if true_df is not None:
        true_df.to_csv(outs.feasible_gt, index=False, header=True, sep='\t', na_rep='NaN')

    ##### Write BEDPE/VCF outputs
    tk_sv_io.write_sv_df_to_bedpe(pred_df, outs.sv_candidates)
    source_str = '10X/pipelines/stages/analyze_sv_calls {}'.format(
        martian.get_pipelines_version())
    sample_id = 'sample' if args.sample_id is None else args.sample_id
    # Drop the '.gz' suffix by slicing; str.strip('.gz') would strip
    # characters, not the suffix
    vcf_path = outs.svs[:-len('.gz')]
    tk_sv_io.bedpe_to_vcf(outs.sv_candidates, vcf_path, sample_id, source_str,
                          args.reference_path)
    # this will sort and gzip
    tk_sv_io.index_sv_vcf(vcf_path)
    outs.svs_index = outs.svs + '.tbi'
    # delete the non-gzipped file
    os.remove(vcf_path)

    if not pred_df.empty:
        call_df = pred_df[np.logical_or(pred_df['filters'] == '.',
                                        pred_df['filters'] == "PASS")]
    else:
        call_df = None
    tk_sv_io.write_sv_df_to_bedpe(call_df, outs.sv_calls)

    # Annotate each call with the matching ground truth svs. The resulting
    # dataframe might have multiple rows for the same call if there were multiple
    # matching ground truth svs.
    martian.log_info("merging calls and gt")
    if not pred_df.empty:
        pred_df = merge_calls_and_gt(pred_df, true_df, pred_to_match)

    martian.log_info("writing call_tsv")
    pred_df.to_csv(outs.call_tsv, index=False, header=True, sep='\t', na_rep='NaN')

    pred_df = pred_df[np.logical_not(pd.isnull(pred_df['name']))]
    max_dists = sorted(np.array(args.detect_dists))

    gt_sv_types = get_all_sv_types(true_df)
    call_sv_types = get_all_sv_types(pred_df)

    if true_df is not None:
        # Use the default MAX_PPV_TIER unless this is greater than the maximum
        # tier present in the data.
        max_ppv_tier = min(MAX_PPV_TIER, np.max(true_df.tier))
        # Use the default unless this is smaller than the minimum tier present
        # in the data.
        max_sens_tier = max(MAX_SENS_TIER, np.min(true_df.tier))
    else:
        max_ppv_tier = 1
        max_sens_tier = 1

    tiers = [max_ppv_tier, max_sens_tier]

    # All combinations of filters in ground truth and call set
    if args.targets is not None and args.target_dists is not None:
        target_dists = list(sorted(np.array(args.target_dists, dtype=np.float)))
        target_dists.append(float('NaN'))
    else:
        target_dists = [float('NaN')]

    combs = product([0, 1, 2, None], target_dists, gt_sv_types, tiers,
                    [True, False], call_sv_types, max_dists)

    metrics = defaultdict(list)
    gt_filters = ['genic_breaks', 'target_dist', 'gt_sv_type', 'tier']
    call_filters = ['call_filtered', 'call_sv_type', 'match_dist']

    for (genic_breaks, tdist, gt_sv_type, tier,
         is_filtered, call_sv_type, dist) in combs:
        if gt_sv_type != 'NA' and call_sv_type != 'NA' and gt_sv_type != call_sv_type:
            continue

        metrics['genic_breaks'].append(genic_breaks)
        metrics['target_dist'].append(tdist)
        metrics['gt_sv_type'].append(gt_sv_type)
        metrics['tier'].append(tier)
        metrics['call_filtered'].append(is_filtered)
        metrics['call_sv_type'].append(call_sv_type)
        metrics['match_dist'].append(dist)

        if true_df is None:
            sel_true_df = None
        else:
            sel_true_df = true_df
            if gt_sv_type != 'NA':
                sel_true_df = sel_true_df[sel_true_df.sv_type == gt_sv_type]
            if not np.isnan(tdist):
                sel_true_df = sel_true_df[sel_true_df.targ_dist <= tdist]
            sel_true_df = sel_true_df[sel_true_df.tier <= tier]
            # Restrict to genic or non-genic or take everything if this is None.
            if genic_breaks is not None:
                sel_true_df = sel_true_df[sel_true_df.genic_breaks == genic_breaks]
            if len(sel_true_df) == 0:
                sel_true_df = None

        sel_pred_df = pred_df
        if is_filtered and not pred_df.empty:
            sel_pred_df = sel_pred_df[(sel_pred_df.filters == '.') |
                                      (sel_pred_df.filters == 'PASS')]
        if call_sv_type != 'NA' and not pred_df.empty:
            sel_pred_df = sel_pred_df[sel_pred_df.sv_type == call_sv_type]
        if not pred_df.empty and (args.min_rel_overlap is None or args.min_rel_overlap == 0):
            # Do not apply this filter if the matching is done based on overlap.
            sel_pred_df = sel_pred_df[np.logical_or(np.isnan(sel_pred_df.match_dist),
                                                    sel_pred_df.match_dist <= dist)]

        add_metrics(sel_pred_df, sel_true_df, metrics)

    column_names = gt_filters
    column_names.extend(call_filters)
    other_names = set(metrics.keys()).difference(set(column_names))
    column_names.extend(other_names)

    metric_df = pd.DataFrame(metrics)
    metric_df = metric_df[column_names]

    martian.log_info("writing summary tsv")
    metric_df.to_csv(outs.summary_tsv, index=False, header=True, sep='\t', na_rep='NaN')

    short_metrics = get_short_metrics(metric_df, other_names,
                                      max_ppv_tier, max_sens_tier, args)

    if args.call_summary is not None:
        with open(args.call_summary, 'r') as in_summary_fn:
            in_summary = json.load(in_summary_fn)
        for key, val in in_summary.iteritems():
            short_metrics[key] = val

    short_metrics['min_qv'] = min_qv

    with open(outs.summary, 'w') as out_file:
        out_file.write(tenkit.safe_json.safe_jsonify(short_metrics, pretty=True))
def merge_breaks(bedpe_df, out_bedpe, merge_win=10000, max_range=np.inf,
                 max_nmates=np.inf, cluster_qual_factor=0.2):
    """Merges a set of SVs into a non-redundant set.

    Args:
    - bedpe_df: Either a bedpe file or a DataFrame like the one returned by
      tk_sv_io.read_sv_bedpe_to_df.
    - out_bedpe: Path to file where output will be written.
    - merge_win: Breakpoints will be merged if they are within this distance
      from each other. Two SVs will be merged if both their breakpoints can
      be merged.
    - max_range: See max_range field of cluster_loci.
    - max_nmates: Two extra info fields will be added to the output BEDPE,
      NMATES1 and NMATES2. NMATES1 is the number of mate breakpoints (after
      merging, so breakpoint clusters) of the first breakpoint of an SV. SVs
      whose breakpoints both exceed the max_nmates cutoff will not be
      included in the output.

    Return value:
    The output BEDPE.
    """
    if not isinstance(bedpe_df, pd.DataFrame):
        bedpe_df = tk_sv_io.read_sv_bedpe_to_df(bedpe_df)
    else:
        bedpe_df = pd.DataFrame(bedpe_df)

    breaks = []
    for i in range(bedpe_df.shape[0]):
        breaks.append((bedpe_df.iloc[i, 0], bedpe_df.iloc[i, 1],
                       bedpe_df.iloc[i, 2], (bedpe_df.iloc[i, 6], 1)))
        breaks.append((bedpe_df.iloc[i, 3], bedpe_df.iloc[i, 4],
                       bedpe_df.iloc[i, 5], (bedpe_df.iloc[i, 6], 2)))

    _, mem_to_cluster, _ = cluster_loci(breaks, merge_win, max_range=max_range)

    cluster_pairs = {}
    for i in range(bedpe_df.shape[0]):
        name = bedpe_df.iloc[i]['name']
        cluster_idx1 = mem_to_cluster[(name, 1)]
        cluster_idx2 = mem_to_cluster[(name, 2)]
        if not (cluster_idx1, cluster_idx2) in cluster_pairs:
            cluster_pairs[(cluster_idx1, cluster_idx2)] = [i]
        else:
            old_pair = cluster_pairs[(cluster_idx1, cluster_idx2)][0]
            # Make sure the old and the new pair have breaks on the same chromosomes
            assert bedpe_df.iloc[old_pair, 0] == bedpe_df.iloc[i, 0]
            assert bedpe_df.iloc[old_pair, 3] == bedpe_df.iloc[i, 3]
            cluster_pairs[(cluster_idx1, cluster_idx2)].append(i)

    new_cluster_pairs = {}
    cluster_dist_ratio = {}
    for p, pos_list in cluster_pairs.iteritems():
        pos_arr = np.array(pos_list)
        tmp_df = get_dataframe_loc(bedpe_df, pos_arr)
        quals = np.array(tmp_df.qual)
        best_call = pos_arr[np.argmax(quals)]
        close_calls = np.where(quals >= cluster_qual_factor * np.max(quals))[0]
        close_df = get_dataframe_loc(tmp_df, close_calls)
        same_chrom = bedpe_df.iloc[best_call]['chrom2'] == bedpe_df.iloc[best_call]['chrom1']
        min_break_dist = np.min(close_df.start2) - np.max(close_df.stop1)
        max_break_dist = bedpe_df.iloc[best_call]['start2'] - bedpe_df.iloc[best_call]['stop1']
        new_cluster_pairs[p] = best_call
        if not same_chrom or max_break_dist > MAX_FRAG_SIZE:
            cluster_dist_ratio[p] = '.'
        elif min_break_dist <= 0:
            cluster_dist_ratio[p] = float('NaN')
        else:
            cluster_dist_ratio[p] = float(max_break_dist) / min_break_dist
    cluster_pairs = new_cluster_pairs

    def clusters_close(i, j):
        chrom1, start1, stop1 = bedpe_df.iloc[i, 0], bedpe_df.iloc[i, 1], bedpe_df.iloc[i, 2]
        chrom2, start2, stop2 = bedpe_df.iloc[i, 3], bedpe_df.iloc[i, 4], bedpe_df.iloc[i, 5]
        next_chrom1, next_start1, next_stop1 = bedpe_df.iloc[j, 0], bedpe_df.iloc[j, 1], bedpe_df.iloc[j, 2]
        next_chrom2, next_start2, next_stop2 = bedpe_df.iloc[j, 3], bedpe_df.iloc[j, 4], bedpe_df.iloc[j, 5]
        dist1 = max(next_start1 - stop1, start1 - next_stop1)
        dist2 = max(next_start2 - stop2, start2 - next_stop2)
        return (chrom1 == next_chrom1 and chrom2 == next_chrom2 and
                dist1 <= merge_win and dist2 <= merge_win)

    # The "chain-breaking" in cluster_loci might still leave some redundancy.
    # In particular, we might leave some almost touching clusters that were
    # separated only because of chain-breaking. Do a second round of clustering
    # where you go through consecutive pairs of clusters and merge them if
    # they're merge-able.
    new_cluster_pairs = {}
    for (cluster1, cluster2) in sorted(cluster_pairs.keys()):
        if cluster_pairs[(cluster1, cluster2)] == -1:
            continue
        # Consider all neighboring clusters after this cluster.
        # Notice that the cluster indices are sorted by genomic coordinates.
        neigh_clusters = [(cluster1, cluster2 + 1),
                          (cluster1 + 1, cluster2 - 1),
                          (cluster1 + 1, cluster2),
                          (cluster1 + 1, cluster2 + 1)]
        idx = cluster_pairs[(cluster1, cluster2)]
        # Best cluster among neighboring clusters
        max_cluster = ((cluster1, cluster2), idx)
        for next_cluster1, next_cluster2 in neigh_clusters:
            if not (next_cluster1, next_cluster2) in cluster_pairs:
                continue
            if cluster_pairs[(next_cluster1, next_cluster2)] == -1:
                continue
            next_idx = cluster_pairs[(next_cluster1, next_cluster2)]
            if clusters_close(idx, next_idx):
                cluster_pairs[(next_cluster1, next_cluster2)] = -1
                if bedpe_df.iloc[idx]['qual'] < bedpe_df.iloc[next_idx]['qual']:
                    max_cluster = ((next_cluster1, next_cluster2), next_idx)
        new_cluster_pairs[max_cluster[0]] = max_cluster[1]
    cluster_pairs = new_cluster_pairs

    # Now compute the number of mate breakpoints for each cluster
    num_mates = {}
    for (cluster1, cluster2) in cluster_pairs.keys():
        if not cluster1 in num_mates:
            num_mates[cluster1] = 0
        if not cluster2 in num_mates:
            num_mates[cluster2] = 0
        num_mates[cluster1] += 1
        if cluster2 != cluster1:
            num_mates[cluster2] += 1

    sel_loc = []
    new_info_strs = []
    for (cluster1, cluster2) in sorted(cluster_pairs.keys()):
        sv_loc = cluster_pairs[(cluster1, cluster2)]
        if num_mates[cluster1] > max_nmates and num_mates[cluster2] > max_nmates:
            continue
        sel_loc.append(sv_loc)
        new_info_strs.append(tk_sv_io.update_info(
            bedpe_df.iloc[sv_loc]['info'],
            ['NMATES1', 'NMATES2', 'RESOLUTION'],
            [num_mates[cluster1], num_mates[cluster2],
             cluster_dist_ratio[(cluster1, cluster2)]]))

    if len(sel_loc) > 0:
        bedpe_df = bedpe_df.iloc[sel_loc]
        bedpe_df['info'] = new_info_strs
    else:
        bedpe_df = pd.DataFrame(columns=bedpe_df.columns)

    if out_bedpe is not None:
        tk_sv_io.write_sv_df_to_bedpe(bedpe_df, out_bedpe)
    return bedpe_df
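# Example usage of merge_breaks (paths are hypothetical): collapse calls whose
# breakpoints both fall within 1 kb of each other, and drop SVs where both
# breakpoints have more than 10 mate clusters.
merged_df = merge_breaks('raw_calls.bedpe', 'merged_calls.bedpe',
                         merge_win=1000, max_nmates=10)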
def main(args, outs):
    """SV calling on a subset of the input loci."""

    #### Prepare input files and parameters ####
    if not isfile(args.fragment_histogram) or not isfile(args.fragments) or \
       not isfile(args.fragment_phasing):
        martian.log_info('One or more files needed for SV-calling are missing. No calls will be made.')
        tk_sv_io.write_sv_df_to_bedpe(None, outs.sv_variants)
        return

    # Get candidate loci and subset to the loci for this chunk.
    overlap_loci, input_names = prepare_loci(args)
    overlap_loci = [overlap_loci[i] for i in range(int(args.start_idx), int(args.stop_idx))]
    if input_names is not None:
        input_names = [input_names[i] for i in range(int(args.start_idx), int(args.stop_idx))]

    # Get molecule size distribution.
    frag_res = tk_sv_stats.read_frag_hist(args.fragment_histogram, MIN_FRAG_SIZE)
    frag_sizes, frag_counts = frag_res

    # Get fragment phasing info. This will be used to get the barcode phasing.
    frag_phasing = tk_tabix.create_tabix_infile(args.fragment_phasing)

    # Estimate the Poisson read rate alpha.
    fragment_df = tk_hdf5.read_data_frame_limited(args.fragments,
                                                  query_cols=['obs_len', 'num_reads'],
                                                  max_rows=20000000)
    fragment_df = fragment_df[fragment_df.num_reads > MIN_READS_PER_FRAG]
    alpha = np.median(np.array(fragment_df['num_reads']) /
                      np.array(fragment_df['obs_len'], dtype=np.float))
    martian.log_info('Using alpha = {}'.format(alpha))

    sv_model = tk_sv_read_model.ReadModel(alpha, frag_sizes, frag_counts,
                                          p_ov_mol=args.p_ov_mol, step=1000)

    if args.targets is not None:
        msg = 'Read-based SV-calling from targeted data not supported.'
        martian.log_info(msg)
        return

    # Get a set of barcodes to remove from SV-calling.
    if args.barcode_blacklist is not None:
        tmp_df = pd.read_csv(args.barcode_blacklist, sep='\t', index_col=False)
        blacklist_barcodes = set(tmp_df.bc)
    else:
        blacklist_barcodes = set([])

    in_bam = tk_bam.create_bam_infile(args.possorted_bam)
    #### End prepare input files and parameters ####

    old_locus = None
    old_reads = None
    res_arr = []
    group_ids = []

    # Iterate through all loci and evaluate them for the presence of SVs
    for locus_idx, (c1, s1, e1, c2, s2, e2, _) in enumerate(overlap_loci):
        print >> sys.stderr, 'Evaluating locus', c1, s1, e1, c2, s2, e2

        # Candidate locus too wide. Skip.
        if e1 - s1 > MAX_REGION_LEN or e2 - s2 > MAX_REGION_LEN:
            print >> sys.stderr, 'Locus too wide. Skipping.'
            continue

        # Candidate loci too close to each other. Skip.
        if c1 == c2 and s2 - e1 < 2 * BREAK_EXT:
            print >> sys.stderr, 'Breakpoints too close. Skipping.'
            continue

        # Evaluate for proximal SVs (DEL, INV, DUP) if the distance between
        # breakpoints is < MAX_FRAG_SIZE. Otherwise, the event will be called
        # a translocation and we'll try to infer the signal orientation.
        if c1 == c2 and s2 - e1 < MAX_FRAG_SIZE:
            if old_locus is not None and loci_close(old_locus, (c1, s1, e1, c2, s2, e2)):
                in_read_df = old_reads
            else:
                in_read_df = None
            res, reads = call_proximal(sv_model, c1, s1, e1, s2, e2, in_bam,
                                       in_read_df, frag_phasing,
                                       blacklist_barcodes, args)
            old_locus = (c1, s1, e1, c2, s2, e2)
            old_reads = reads
        else:
            res = call_distal(sv_model, c1, max(0, s1 - BREAK_EXT), e1 + BREAK_EXT,
                              c2, max(0, s2 - BREAK_EXT), e2 + BREAK_EXT,
                              in_bam, frag_phasing, blacklist_barcodes, args)

        if res is not None:
            res_arr.extend(res)
            group_ids.extend(locus_idx * np.ones((len(res),), dtype=np.int))

    in_bam.close()

    out_df = tk_sv_call.SvCall.svs_to_dataframe(res_arr)
    tk_sv_io.write_sv_df_to_bedpe(out_df, outs.sv_variants)
def main(args, outs):
    if args.fragments is None:
        outs.bc_cnv = None
        outs.bc_large_cnv = None
        return

    rust_env = os.environ.copy()
    rust_env["RUST_BACKTRACE"] = "1"

    final_blacklist = lr_gt.get_genomic_track(args.blacklist, "terminal_cnv",
                                              args.reference_path,
                                              "default_blacklist.bed")
    if final_blacklist is None:
        final_blacklist = args.possorted_bam + "_tmp"
        open(final_blacklist, 'w').close()

    if args.subcommand == "bc" and args.fragments:
        frag_csv = outs.bc_cnv + ".csv"
        bin_size, frag_version = convert_fragments_to_csv(args.fragments, frag_csv,
                                                          args.bin_size,
                                                          args.allow_bin_size_adj)
        cnv_args = ['hmm-bc-cnv', args.subcommand, frag_csv, args.possorted_bam,
                    final_blacklist, outs.bc_cnv,
                    "--fragver", str(frag_version),
                    "--binsize", str(bin_size),
                    "--probchange", str(args.status_change_penalty),
                    "--minprob", str(args.min_prob)]
    elif args.subcommand == "read":
        # No fragment CSV in this mode; use the requested bin size so that
        # bin_size is defined for the filtering and outs below.
        bin_size = args.bin_size
        cnv_args = ['hmm-bc-cnv', args.subcommand, args.possorted_bam,
                    final_blacklist, outs.bc_cnv,
                    "--binsize", str(bin_size),
                    "--probchange", str(args.status_change_penalty)]
    elif args.subcommand == "asread":
        frag_csv = outs.bc_cnv + ".csv"
        bin_size, frag_version = convert_fragments_to_csv(args.fragments, frag_csv,
                                                          args.bin_size,
                                                          args.allow_bin_size_adj)
        cnv_args = ['hmm-bc-cnv', args.subcommand, frag_csv, args.possorted_bam,
                    final_blacklist, outs.bc_cnv,
                    "--fragver", str(frag_version),
                    "--binsize", str(bin_size),
                    "--probchange", str(args.status_change_penalty),
                    "--minprob", str(args.min_prob)]

    print cnv_args
    subprocess.check_call(cnv_args, env=rust_env)
    outs.final_bin_size = bin_size

    chroms = []
    starts1 = []
    end1 = []
    starts2 = []
    end2 = []
    info_strs = []
    quals = []

    primary_contigs = tk_reference.load_primary_contigs(args.reference_path)
    spikes = tk_io.get_target_regions(open(args.spikes))

    with open(outs.bc_cnv) as fin:
        for line in fin.readlines():
            if line.startswith('#') or line.startswith('browser') or \
               line.startswith('track') or line.startswith('-browser') or \
               line.startswith('-track'):
                continue
            infos = line.strip().split("\t")
            cp = int(infos[3])
            ch = infos[0]
            s = int(infos[1])
            e = int(infos[2])

            # Some basic filtering
            if primary_contigs and ch not in primary_contigs:
                continue
            if cp == 2 or (e - s) < args.minimal_cnv_size:
                continue
            if cp > 2:
                if ch not in spikes:
                    continue
                overlaps = spikes[ch].overlapping_regions(max(0, s - bin_size),
                                                          e + bin_size)
                ln = len(overlaps)
                if ln > 0 and \
                   overlap(s - bin_size, s + bin_size, overlaps[0][0], overlaps[0][1]) and \
                   overlap(e - bin_size, e + bin_size, overlaps[ln - 1][0], overlaps[ln - 1][1]):
                    continue

            chroms.append(infos[0])
            starts1.append(s)
            end1.append(s + 1)
            starts2.append(e)
            end2.append(e + 1)

            pval = float(infos[4])
            #if pval > args.max_pval:
            #    continue
            if pval < 1e-100:
                qual = 1000
            else:
                qual = int(-log10(pval) * 10)
            quals.append(qual)

            if cp > 2:
                info_strs.append('TYPE=DUP;COPY=%d' % cp)
            elif cp < 2:
                info_strs.append('TYPE=DEL;COPY=%d' % cp)

    sv_df = tk_sv_io.create_sv_df(chroms, starts1, end1, chroms, starts2, end2,
                                  np.arange(len(chroms)), quals,
                                  info_strs=info_strs)
    tk_sv_io.write_sv_df_to_bedpe(sv_df, outs.bc_large_cnv)
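# Illustrative line of the hmm-bc-cnv BED output parsed above (made-up values):
#   chr1    1000000    1250000    1    3.2e-12
# i.e. chrom, start, end, copy number, and p-value; a copy number of 1 with
# that p-value would be emitted as TYPE=DEL;COPY=1 with qual = int(-log10(p) * 10).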
def main(args, outs):
    sv_df = read_bedpes(args)
    sv_df = tk_sv_utils.get_dataframe_loc(sv_df,
                                          list(range(int(args.start_idx), int(args.stop_idx))))

    max_insert, ins_logsf_fun = tk_sv_utils.get_insert_size_info(args.insert_sizes)
    print >> sys.stderr, 'max insert', max_insert

    if max_insert is None:
        tk_sv_io.write_sv_df_to_bedpe(None, outs.sv_calls)
        tk_sv_io.write_sv_df_to_bedpe(sv_df, outs.non_pass_sv_calls)
        return

    with open(args.basic_summary, 'r') as f:
        summary = json.load(f)

    chimera_rate_del = summary['far_chimera_rate']
    chimera_rate_inv = summary['far_chimera_rate'] + summary['same_dir_chimera_rate']
    chimera_rate_dup = summary['far_chimera_rate'] + summary['outward_dir_chimera_rate']
    chimera_rates = {tk_readpairs.DEL_STR: chimera_rate_del,
                     tk_readpairs.INV_STR: chimera_rate_inv,
                     tk_readpairs.TDUP_STR: chimera_rate_dup,
                     tk_readpairs.TRANS_STR: summary['far_chimera_rate']}

    in_bam = tk_bam.create_bam_infile(args.possorted_bam)
    frag_phasing = tk_tabix.create_tabix_infile(args.fragment_phasing)

    pass_calls = []
    non_pass_calls = []

    for i, (_, row) in enumerate(sv_df.iterrows()):
        sv_type = tk_sv_io.get_sv_type(row.info)
        middle = int(0.5 * (row.stop1 + row.start2))

        # Bail out on all non-deletions
        if sv_type != tk_readpairs.DEL_STR:
            continue

        if row.chrom1 == row.chrom2:
            r1 = (max(0, row.start1 - args.break_pad),
                  min(middle, row.stop1 + args.break_pad))
            r2 = (max(middle, row.start2 - args.break_pad),
                  row.stop2 + args.break_pad)
            if row.start2 - row.stop1 > 4 * args.break_pad:
                starts = [r1[0], r2[0]]
                stops = [r1[1], r2[1]]
                chroms = [row.chrom1, row.chrom2]
            else:
                starts = [r1[0]]
                stops = [r2[1]]
                chroms = [row.chrom1]
        else:
            r1 = (max(0, row.start1 - args.break_pad), row.stop1 + args.break_pad)
            r2 = (max(0, row.start2 - args.break_pad), row.stop2 + args.break_pad)
            starts = [r1[0], r2[0]]
            stops = [r1[1], r2[1]]
            chroms = [row.chrom1, row.chrom2]

        bc_cov1 = len(get_frag_coverage(frag_phasing, row.chrom1, r1[0], r1[1]))
        bc_cov2 = len(get_frag_coverage(frag_phasing, row.chrom2, r2[0], r2[1]))
        if sv_type == tk_readpairs.DEL_STR and max(bc_cov1, bc_cov2) > MAX_DEL_BC_DEPTH:
            print >> sys.stderr, 'Too many barcodes in DEL candidate', row.chrom1, row.start1, row.stop2
            continue

        readpairs = tk_readpairs.get_readpairs(in_bam, chroms, starts, stops,
                                               max_insert=max_insert,
                                               min_mapq=args.min_mapq)

        normal_readpairs = [rp for rp in readpairs
                            if rp.sv_type == tk_readpairs.NORMAL_STR]
        if len(normal_readpairs) > MAX_DEL_READPAIRS:
            sel = np.random.choice(len(normal_readpairs), MAX_DEL_READPAIRS)
        else:
            sel = np.arange(len(normal_readpairs))
        normal_readpairs = [normal_readpairs[ridx] for ridx in sel]

        # Distal readpairs across the breakpoints
        dist_readpairs = [rp for rp in readpairs
                          if rp.sv_type == sv_type and
                          ((tk_readpairs.pos_overlaps(rp.read1.pos, r1) and
                            tk_readpairs.pos_overlaps(rp.read2.pos, r2)) or
                           (tk_readpairs.pos_overlaps(rp.read1.pos, r2) and
                            tk_readpairs.pos_overlaps(rp.read2.pos, r1)))]
        if len(dist_readpairs) > MAX_DEL_READPAIRS:
            sel = np.random.choice(len(dist_readpairs), MAX_DEL_READPAIRS)
        else:
            sel = np.arange(len(dist_readpairs))
        dist_readpairs = [dist_readpairs[ridx] for ridx in sel]

        dist_readpairs.extend(normal_readpairs)

        if sv_type == tk_readpairs.DEL_STR and len(starts) == 2:
            more_readpairs = tk_readpairs.get_readpairs(in_bam, [row.chrom1],
                                                        [r1[1] + 1], [r2[0] - 1],
                                                        max_insert=max_insert,
                                                        min_mapq=args.min_mapq,
                                                        normal_only=True)
            if len(more_readpairs) > MAX_DEL_READPAIRS:
                sel = np.random.choice(len(more_readpairs), MAX_DEL_READPAIRS)
            else:
                sel = np.arange(len(more_readpairs))
            dist_readpairs.extend([more_readpairs[ridx] for ridx in sel
                                   if more_readpairs[ridx].sv_type == tk_readpairs.NORMAL_STR])

        # Sort by barcode so groupby collects all readpairs of a barcode together
        dist_readpairs = sorted(dist_readpairs, key=lambda x: x.barcode)
        read_groups = {}
        for bc, read_group_iter in groupby(dist_readpairs, lambda x: x.barcode):
            read_groups[bc] = list(read_group_iter)
        bc_set = set(read_groups.keys())
        bc_list = sorted(read_groups.keys())

        phase_set1 = tk_sv_utils.get_phase_set(frag_phasing, row.chrom1, r1[0], r1[1])
        phase_set2 = tk_sv_utils.get_phase_set(frag_phasing, row.chrom2, r2[0], r2[1])

        if len(bc_list) < 1:
            print >> sys.stderr, 'Not enough barcodes. Skipping'
            continue

        bc_phase_sets1 = tk_sv_utils.get_barcode_phase_probs(frag_phasing, row.chrom1,
                                                             r1[0], r1[1], bc_set,
                                                             in_ps=phase_set1)
        bc_phase_sets2 = tk_sv_utils.get_barcode_phase_probs(frag_phasing, row.chrom2,
                                                             r2[0], r2[1], bc_set,
                                                             in_ps=phase_set2)

        cand_breaks1 = np.arange(r1[0], r1[1] + 1, 5)
        cand_breaks2 = np.arange(r2[0], r2[1] + 1, 5)

        res = tk_readpairs.eval_sv_em(read_groups, cand_breaks1, cand_breaks2,
                                      sv_type, chimera_rates,
                                      phase_set1, phase_set2,
                                      bc_phase_sets1, bc_phase_sets2,
                                      max_insert, ins_logsf_fun,
                                      em_iters=args.em_iters)
        ((no_sv_max, sv_max, het_sv_max), max_locus, zygosity,
         max_hap, prior_hap_probs, hap_probs, support) = res

        lr = sv_max - no_sv_max if max_hap is None else het_sv_max - no_sv_max

        hap_probs1 = hap_probs[:, 0:2]
        hap_probs2 = hap_probs[:, 2:]

        new_call = sv_call.SvCall.from_em_results(row.chrom1, row.chrom2,
                                                  phase_set1, phase_set2,
                                                  (no_sv_max, sv_max, het_sv_max),
                                                  max_locus,
                                                  sv_call._SvType(sv_type, ('.', '.')),
                                                  zygosity, max_hap, support,
                                                  (hap_probs1, hap_probs2, None))

        # the break interval is inclusive
        if lr >= args.min_lr and new_call.qual >= args.min_qv and \
           new_call.break2[0] - new_call.break1[1] + 1 >= args.min_sv_len:
            pass_calls.append(new_call)
        else:
            # Leave breakpoints unchanged
            new_call.break1 = (row.start1, row.stop1)
            new_call.break2 = (row.start2, row.stop2)
            non_pass_calls.append(new_call)

    out_df = sv_call.SvCall.svs_to_dataframe(pass_calls)
    tk_sv_io.write_sv_df_to_bedpe(out_df, outs.sv_calls)

    out_df = sv_call.SvCall.svs_to_dataframe(non_pass_calls)
    tk_sv_io.write_sv_df_to_bedpe(out_df, outs.non_pass_sv_calls)

    in_bam.close()
    frag_phasing.close()
def main(args, outs):
    if not isfile(args.fragment_histogram) \
       or not isfile(args.barcode_blacklist) or not isfile(args.coverage):
        martian.log_info('One or more files needed for SV-calling are missing. No calls will be made.')
        return

    in_bam = tk_bam.create_bam_infile(args.input)
    genome_size = np.sum(np.array(in_bam.lengths))

    frag_hist_file = args.fragment_histogram
    barcode_blacklist_file = args.barcode_blacklist

    if args.targets is None:
        martian.exit('You should use CALL_STRUCTVARS for WGS samples.')
    else:
        target_regions = sv_utils.bed_to_region_map(args.targets, merge=True)
        target_coverage = sv_utils.region_cum_coverage_map(target_regions, TARGET_COV_BIN)
        link_distance = SV_FRAGMENT_LINK_DISTANCE_TARGET
        with open(args.coverage, 'r') as f:
            cov_sum = json.load(f)['target_info']
        if 'on_target_bases' in cov_sum:
            prob_off_target = 1 - cov_sum['on_target_bases'] / float(cov_sum['total_bases'])
        else:
            prob_off_target = 0.001
        corr_factor = sv_stats.off_target_amp_corr_factor(target_regions,
                                                          prob_off_target,
                                                          genome_size=genome_size)

    res = sv_stats.get_frag_data(frag_hist_file, barcode_blacklist_file,
                                 min_frag_size=0,
                                 frag_size_prc_cutoff=args.frag_size_prc)
    frag_sizes, frag_counts, frag_prc, blacklist_barcodes = res

    frag_phasing = tk_tabix.create_tabix_infile(args.fragment_phasing)

    min_sv_len = int(max(0.8 * args.min_call_dist, link_distance))
    if frag_prc is not None and args.targets is not None:
        min_sv_len = int(max(min_sv_len, frag_prc))
    martian.log_info('Calling SVs with min length: {:d}'.format(min_sv_len))

    fragment_df = tk_hdf5.read_data_frame(args.fragments,
                                          query_cols=['obs_len', 'num_reads'])
    fragment_df = fragment_df[fragment_df.num_reads > MIN_READS_PER_FRAG_TARGET]
    alpha = np.median(np.array(fragment_df['num_reads']) /
                      np.array(fragment_df['obs_len'], dtype=np.float))
    martian.log_info('Using alpha = {}'.format(alpha))

    summary = {}
    summary['min_sv_len'] = min_sv_len
    with open(outs.summary, 'w') as out_fn:
        out_fn.write(tenkit.safe_json.safe_jsonify(summary, pretty=True))

    model = sv_stats.FragModel(frag_sizes, frag_counts, blacklist_barcodes,
                               target_coverage, cov_bin=TARGET_COV_BIN,
                               corr_factor=corr_factor, genome_size=genome_size,
                               target_regions=target_regions, alpha=alpha,
                               p_ov_mol=args.p_ov_mol)

    with open(args.overlap_loci, 'rb') as f:
        overlap_loci = cPickle.load(f)
    overlap_loci = [overlap_loci[i]
                    for i in range(int(args.start_idx), int(args.stop_idx))]

    final_res = []
    for i, (c1, s1, e1, c2, s2, e2) in enumerate(overlap_loci):
        frags1, frags2 = get_frags_from_reads(
            in_bam,
            c1, max(0, s1 - args.break_ext), e1 + args.break_ext,
            c2, max(0, s2 - args.break_ext), e2 + args.break_ext,
            min_mapq=args.min_mapq,
            min_sv_len=SV_FRAGMENT_LINK_DISTANCE_TARGET,
            min_frag_size=args.min_frag_size,
            min_reads_per_frag=args.min_reads_per_frag)
        bc_set = set(frags1.bc).union(set(frags2.bc))
        ps1 = sv_utils.get_phase_set(frag_phasing, c1, s1, e1)
        ps2 = sv_utils.get_phase_set(frag_phasing, c2, s2, e2)
        bc_phase_set_dict1 = sv_utils.get_barcode_phase_probs(frag_phasing, c1, s1, e1,
                                                              bc_set, in_ps=ps1)
        bc_phase_set_dict2 = sv_utils.get_barcode_phase_probs(frag_phasing, c2, s2, e2,
                                                              bc_set, in_ps=ps2)

        print >> sys.stderr, 'Evaluating locus', c1, s1, e1, c2, s2, e2
        res = model.eval_sv(frags1, frags2, (c1, s1, e1), (c2, s2, e2),
                            min_dist=min_sv_len, ps1=ps1, ps2=ps2,
                            phase_set_dict1=bc_phase_set_dict1,
                            phase_set_dict2=bc_phase_set_dict2,
                            grid_len=args.grid_len)
        if res is not None and res.qual >= args.sv_min_qv:
            final_res.append(res)

    in_bam.close()

    out_df = sv_call.SvCall.svs_to_dataframe(final_res)
    sv_io.write_sv_df_to_bedpe(out_df, outs.sv_variants)
def main(args, outs):
    bedpe_df = tk_sv_io.read_sv_bedpe_to_df(args.sv_variants)
    bedpe_df = tk_sv_utils.get_dataframe_loc(bedpe_df,
                                             list(range(int(args.start_idx), int(args.stop_idx))))

    max_insert, ins_logsf_fun = tk_sv_utils.get_insert_size_info(args.insert_sizes)
    if max_insert is None:
        martian.throw('No Q60 reads')

    with open(args.basic_summary, 'r') as f:
        summary = json.load(f)

    chimera_rate_del = summary['far_chimera_rate']
    chimera_rate_inv = summary['far_chimera_rate'] + summary['same_dir_chimera_rate']
    chimera_rate_trans = summary['far_chimera_rate']
    chimera_rate_dup = summary['far_chimera_rate'] + summary['outward_dir_chimera_rate']
    chimera_rates = {tk_readpairs.DEL_STR: chimera_rate_del,
                     tk_readpairs.INV_STR: chimera_rate_inv,
                     tk_readpairs.TDUP_STR: chimera_rate_dup,
                     tk_readpairs.TRANS_FF_STR: chimera_rate_trans,
                     tk_readpairs.TRANS_FR_STR: chimera_rate_trans,
                     tk_readpairs.TRANS_RR_STR: chimera_rate_trans,
                     tk_readpairs.TRANS_RF_STR: chimera_rate_trans}

    in_bam = tk_bam.create_bam_infile(args.input)

    out_quals = []
    out_infos = []
    out_chroms1 = []
    out_starts1 = []
    out_stops1 = []
    out_chroms2 = []
    out_starts2 = []
    out_stops2 = []

    for i, (_, row) in enumerate(bedpe_df.iterrows()):
        in_svt = tk_sv_io.get_sv_type(row.info)

        if row.chrom1 == row.chrom2:
            max_ext = min(args.break_extend, int((row.start2 - row.stop1) / 3.0))
            r1 = (max(0, row.start1 - args.break_extend), row.stop1 + max_ext)
            r2 = (max(0, row.start2 - max_ext), row.stop2 + args.break_extend)
            if r1[1] > r2[0]:
                starts = [r1[0]]
                stops = [r2[1]]
                chroms = [row.chrom1]
            else:
                starts = [r1[0], r2[0]]
                stops = [r1[1], r2[1]]
                chroms = [row.chrom1, row.chrom2]
        else:
            r1 = (max(0, row.start1 - args.break_extend), row.stop1 + args.break_extend)
            r2 = (max(0, row.start2 - args.break_extend), row.stop2 + args.break_extend)
            starts = [r1[0], r2[0]]
            stops = [r1[1], r2[1]]
            chroms = [row.chrom1, row.chrom2]

        readpairs = tk_readpairs.get_readpairs2(in_bam, chroms, starts, stops,
                                                max_insert=max_insert,
                                                min_mapq=args.min_mapq)

        # Distal readpairs across the breakpoints
        dist_readpairs = filter(filter_fun(in_bam, row.chrom1, row.chrom2, r1, r2),
                                readpairs)

        if len(dist_readpairs) > MAX_READPAIRS:
            sel = np.random.choice(len(dist_readpairs), MAX_READPAIRS)
        else:
            sel = np.arange(len(dist_readpairs))
        dist_readpairs = [dist_readpairs[ridx] for ridx in sel]

        res_arr = tk_readpairs.get_sv_lr(dist_readpairs, ins_logsf_fun,
                                         max_insert, chimera_rates)

        if len(res_arr) == 0:
            out_quals.append(row.qual)
            out_chroms1.append(row.chrom1)
            out_starts1.append(row.start1)
            out_stops1.append(row.stop1)
            out_chroms2.append(row.chrom2)
            out_starts2.append(row.start2)
            out_stops2.append(row.stop2)
            out_infos.append(row['info'])
        else:
            if args.best_only:
                res_arr = sorted(res_arr, key=lambda x: x[0], reverse=True)
                res_arr = [res_arr[0]]

            for (lr, num_split, num_pairs, sv_len,
                 support_range, svt, support_readpairs) in res_arr:
                range1, range2 = support_range
                if num_split + num_pairs >= args.min_reads_to_call and \
                   lr >= args.min_lr_to_call and \
                   range1 is not None and range2 is not None:
                    out_quals.append(row.qual + args.rp_lr_multiplier * lr)
                    out_chroms1.append(row.chrom1)
                    out_starts1.append(range1[0])
                    out_stops1.append(range1[1])
                    out_chroms2.append(row.chrom2)
                    out_starts2.append(range2[0])
                    out_stops2.append(range2[1])
                    if svt != in_svt and in_svt != 'TRANS':
                        in_svt = 'UNK'
                else:
                    out_quals.append(row.qual)
                    out_chroms1.append(row.chrom1)
                    out_starts1.append(row.start1)
                    out_stops1.append(row.stop1)
                    out_chroms2.append(row.chrom2)
                    out_starts2.append(row.start2)
                    out_stops2.append(row.stop2)
                out_infos.append(tk_sv_io.update_info(
                    row['info'],
                    ['NPAIRS', 'NSPLIT', 'RP_LR', 'RP_TYPE', 'TYPE'],
                    [num_pairs, num_split, lr, svt, in_svt]))

    in_bam.close()

    if args.best_only:
        out_names = [n for n in bedpe_df['name']]
    else:
        out_names = np.arange(len(bedpe_df))

    out_df = tk_sv_io.create_sv_df(out_chroms1, out_starts1, out_stops1,
                                   out_chroms2, out_starts2, out_stops2,
                                   out_names, out_quals, out_infos)
    tk_sv_io.write_sv_df_to_bedpe(out_df, outs.sv_variants)