def add_filters(pred_df, pred_to_match, black_dists1, black_dists2,
                black_names1, black_names2, seg_dup_calls, all_bad_regions, args):
    if args.targets is not None:
        min_call_qv = args.min_call_qv_target
    else:
        min_call_qv = args.min_call_qv_wgs

    if args.coverage is None:
        # used for WGS
        max_bc_cov = SV_DEFAULT_MAX_BC_COV
        bc_mean_depth = 200
    else:
        # used for exome
        with open(args.coverage, 'r') as f:
            cov_res = json.load(f)
        bc_summary_depth_info = cov_res['summary_bc_depth_info']
        bc_mean_depth, _, _ = get_depth_info_json(bc_summary_depth_info)
        max_bc_cov = args.max_bc_cov_factor * bc_mean_depth

    if args.keep_filters:
        filter_strs = [s for s in pred_df.filters]
    else:
        filter_strs = ['.' for i in range(len(pred_df))]

    info_strs = [s for s in pred_df['info']]
    rps = np.zeros((len(pred_df), ), dtype=np.int)

    def get_cov_frac(black_regions, chrom, start, stop):
        regions = tk_sv_utils.strictly_overlapping_regions(
            black_regions, chrom, start, stop)
        tot_black = np.sum([r[1] - r[0] for r in regions])
        tot_len = float(stop - start)
        black_frac = tk_stats.robust_divide(tot_black, tot_len)
        return black_frac

    for i, (_, row) in enumerate(pred_df.iterrows()):
        npairs = tk_sv_io.get_npairs(row['info'])
        nsplit = tk_sv_io.get_nsplit(row['info'])
        rps[i] = npairs + nsplit
        sv_type = tk_sv_io.get_sv_type(row['info'])
        name = row['name']
        qual = row.qual

        ####### Filtering for read-pair calls #######
        frac_on_hap = tk_sv_io.extract_sv_info(row.info, ['FRAC_HAP_SUPPORT'])[0]
        allelic_frac = tk_sv_io.extract_sv_info(row.info, ['HAP_ALLELIC_FRAC'])[0]
        if allelic_frac != '':
            allelic_frac = float(allelic_frac)

        if args.is_germline is None:
            if qual < min_call_qv:
                filter_strs[i] = tk_sv_io.update_filters(filter_strs[i], 'LOWQ', None)
            if args.min_allelic_frac is not None and frac_on_hap is not None and \
               frac_on_hap != '' and float(frac_on_hap) < args.min_allelic_frac:
                filter_strs[i] = tk_sv_io.update_filters(filter_strs[i], 'LOWQ', None)
            if args.min_allelic_frac is not None and allelic_frac != '' and \
               float(allelic_frac) < args.min_allelic_frac:
                filter_strs[i] = tk_sv_io.update_filters(filter_strs[i], 'LOWQ', None)
        elif args.targets is None:
            if args.is_germline:
                martian.log_info('Mean barcode depth {}'.format(bc_mean_depth))
                min_call_qv = max(min_call_qv, bc_mean_depth / 10.0)
                martian.log_info('Support cutoff: {} barcodes'.format(min_call_qv))
                enough_bcs = qual >= min_call_qv
                is_good = allelic_frac > 0.8 or (sv_type == 'INV' and allelic_frac > 0.6)
                is_good = is_good and enough_bcs
                if not is_good:
                    filter_strs[i] = tk_sv_io.update_filters(filter_strs[i], 'LOWQ', None)
            else:
                min_call_qv = max(min_call_qv, 4)
                is_good = allelic_frac > 0.6 and qual >= min_call_qv
                if not is_good:
                    filter_strs[i] = tk_sv_io.update_filters(filter_strs[i], 'LOWQ', None)
        else:
            if args.is_germline:
                # Harder to get confident support in exome
                min_call_qv = max(min_call_qv, bc_mean_depth / 10.0)
                martian.log_info('Support cutoff: {} barcodes'.format(min_call_qv))
                # Apply a very lenient filter on allelic fraction because
                # lots of barcodes can be unphased.
                is_good = qual >= min_call_qv and allelic_frac > 0.05
                af = tk_sv_io.extract_sv_info(row.info, ['ALLELIC_FRAC'])[0]
                if af != '':
                    af = float(af)
                    is_good = is_good and af > 0.04
            else:
                min_call_qv = max(min_call_qv, 4)
                is_good = qual >= min_call_qv
            if not is_good:
                filter_strs[i] = tk_sv_io.update_filters(filter_strs[i], 'LOWQ', None)

        if black_dists1 is not None:
            chrom1, chrom2 = row.chrom1, row.chrom2
            black_dist1, black_dist2 = black_dists1[i], black_dists2[i]
            if chrom1 == chrom2:
                if chrom1 in all_bad_regions:
                    black_frac = get_cov_frac(all_bad_regions, chrom1,
                                              row.stop1, row.start2)
                else:
                    black_frac = 0.0
            else:
                black_frac = float('NaN')
        else:
            black_dist1 = np.inf
            black_dist2 = np.inf
            black_frac = float('NaN')

        filter_strs[i] = tk_sv_io.update_filters(filter_strs[i], 'BLACK_DIST',
                                                 min(black_dist1, black_dist2),
                                                 args.min_dist_from_black)
        filter_strs[i] = tk_sv_io.update_filters(filter_strs[i], 'BLACK_FRAC',
                                                 black_frac, 0, args.max_frac_black)

        bname1 = '.'
        bname2 = '.'
        if black_dist1 < args.min_dist_from_black or re.search('BLACK_FRAC', filter_strs[i]):
            bname1 = black_names1[i]
        if black_dist2 < args.min_dist_from_black or re.search('BLACK_FRAC', filter_strs[i]):
            bname2 = black_names2[i]

        if name in seg_dup_calls:
            filter_strs[i] = tk_sv_io.update_filters(filter_strs[i], 'SEG_DUP', None)
            seg_dup_match = ','.join(list(seg_dup_calls[name]))
        else:
            seg_dup_match = '.'

        nbcs1 = tk_sv_io.get_nbcs1(row.info)
        nbcs2 = tk_sv_io.get_nbcs2(row.info)
        if nbcs1 is not None and nbcs2 is not None and \
           (nbcs1 > max_bc_cov or nbcs2 > max_bc_cov):
            filter_strs[i] = tk_sv_io.update_filters(filter_strs[i], 'HIGH_BC_COV', None)

        filter_strs[i] = tk_sv_io.update_filters(filter_strs[i], 'READ_SUPPORT',
                                                 npairs + nsplit,
                                                 min_val=args.min_read_support)

        match_str = ','.join([str(s) for s in pred_to_match.get(name, '.')])

        if args.targets is not None:
            # Disable orientation reporting in exome
            info_strs[i] = tk_sv_io.update_info(info_strs[i], ['ORIENT'], [None])

        info_strs[i] = tk_sv_io.update_info(
            info_strs[i],
            ['BLACK_DIST1', 'BLACK_DIST2', 'BLACK_FRAC', 'BLACK1', 'BLACK2',
             'MATCHES', 'SEG_DUP'],
            [black_dist1, black_dist2, black_frac, bname1, bname2,
             match_str, seg_dup_match])

    pred_df['filters'] = filter_strs
    pred_df['info'] = info_strs
    pred_df['read_support'] = rps
    return pred_df, min_call_qv
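
# The filter columns written above follow the convention used throughout this
# stage: '.' (or 'PASS') means unfiltered, and failing filter names accumulate
# as a ';'-separated list. The sketch below is a minimal, self-contained
# illustration of those semantics; it is NOT the tk_sv_io implementation,
# whose update_filters also handles more cases.
def _example_update_filters(filter_str, filter_name, value=None,
                            min_val=None, max_val=None):
    """Illustrative only: append filter_name when value is None (unconditional
    flag, as in the LOWQ calls above) or falls outside [min_val, max_val]."""
    failed = (value is None or
              (min_val is not None and value < min_val) or
              (max_val is not None and value > max_val))
    if not failed:
        return filter_str
    if filter_str in ('.', 'PASS'):
        return filter_name
    return filter_str + ';' + filter_name

# E.g. _example_update_filters('.', 'LOWQ') == 'LOWQ', and a call that then
# fails read support would become 'LOWQ;READ_SUPPORT'. join() below keeps
# only rows whose filter string is '.' or 'PASS'.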
def join(args, outs, chunk_defs, chunk_outs):
    pred_to_match, _, pred_df, true_df, min_qv = merge_predictions(chunk_outs)

    # Change TRANS type to DISTAL. This change will only affect the type
    # reported, not the names of the metrics.
    new_info = []
    for _, row in pred_df.iterrows():
        sv_type = tk_sv_io.get_sv_type(row.info)
        if sv_type == 'TRANS':
            sv_type = 'DISTAL'
        new_info.append(tk_sv_io.update_info(row.info, ['TYPE'], [sv_type]))
    pred_df['info'] = new_info

    if true_df is not None:
        true_df.to_csv(outs.feasible_gt, index=False, header=True,
                       sep='\t', na_rep='NaN')

    ##### Write BEDPE/VCF outputs
    tk_sv_io.write_sv_df_to_bedpe(pred_df, outs.sv_candidates)
    source_str = '10X/pipelines/stages/analyze_sv_calls {}'.format(
        martian.get_pipelines_version())
    sample_id = 'sample' if args.sample_id is None else args.sample_id
    tk_sv_io.bedpe_to_vcf(outs.sv_candidates, outs.svs.strip('.gz'), sample_id,
                          source_str, args.reference_path)
    # this will sort and gzip
    tk_sv_io.index_sv_vcf(outs.svs.strip(".gz"))
    outs.svs_index = outs.svs + '.tbi'
    # delete the non-gzipped file
    os.remove(outs.svs.strip('.gz'))

    if not pred_df.empty:
        call_df = pred_df[np.logical_or(pred_df['filters'] == '.',
                                        pred_df['filters'] == 'PASS')]
    else:
        call_df = None
    tk_sv_io.write_sv_df_to_bedpe(call_df, outs.sv_calls)

    # Annotate each call with the matching ground truth svs. The resulting
    # dataframe might have multiple rows for the same call if there were
    # multiple matching ground truth svs.
    martian.log_info("merging calls and gt")
    if not pred_df.empty:
        pred_df = merge_calls_and_gt(pred_df, true_df, pred_to_match)

    martian.log_info("writing call_tsv")
    pred_df.to_csv(outs.call_tsv, index=False, header=True, sep='\t', na_rep='NaN')

    pred_df = pred_df[np.logical_not(pd.isnull(pred_df['name']))]

    max_dists = sorted(np.array(args.detect_dists))

    gt_sv_types = get_all_sv_types(true_df)
    call_sv_types = get_all_sv_types(pred_df)

    if true_df is not None:
        # Use the default MAX_PPV_TIER unless this is greater than the maximum
        # tier present in the data.
        max_ppv_tier = min(MAX_PPV_TIER, np.max(true_df.tier))
        # Use the default unless this is smaller than the minimum tier present
        # in the data.
        max_sens_tier = max(MAX_SENS_TIER, np.min(true_df.tier))
    else:
        max_ppv_tier = 1
        max_sens_tier = 1

    tiers = [max_ppv_tier, max_sens_tier]

    # All combinations of filters in ground truth and call set
    if args.targets is not None and args.target_dists is not None:
        target_dists = list(sorted(np.array(args.target_dists, dtype=np.float)))
        target_dists.append(float('NaN'))
    else:
        target_dists = [float('NaN')]

    combs = product([0, 1, 2, None], target_dists, gt_sv_types, tiers,
                    [True, False], call_sv_types, max_dists)

    metrics = defaultdict(list)

    gt_filters = ['genic_breaks', 'target_dist', 'gt_sv_type', 'tier']
    call_filters = ['call_filtered', 'call_sv_type', 'match_dist']

    for (genic_breaks, tdist, gt_sv_type, tier,
         is_filtered, call_sv_type, dist) in combs:
        if gt_sv_type != 'NA' and call_sv_type != 'NA' and gt_sv_type != call_sv_type:
            continue

        metrics['genic_breaks'].append(genic_breaks)
        metrics['target_dist'].append(tdist)
        metrics['gt_sv_type'].append(gt_sv_type)
        metrics['tier'].append(tier)
        metrics['call_filtered'].append(is_filtered)
        metrics['call_sv_type'].append(call_sv_type)
        metrics['match_dist'].append(dist)

        if true_df is None:
            sel_true_df = None
        else:
            sel_true_df = true_df
            if gt_sv_type != 'NA':
                sel_true_df = sel_true_df[sel_true_df.sv_type == gt_sv_type]
            if not np.isnan(tdist):
                sel_true_df = sel_true_df[sel_true_df.targ_dist <= tdist]
            sel_true_df = sel_true_df[sel_true_df.tier <= tier]
            # Restrict to genic or non-genic, or take everything if this is None.
            if genic_breaks is not None:
                sel_true_df = sel_true_df[sel_true_df.genic_breaks == genic_breaks]

            if len(sel_true_df) == 0:
                sel_true_df = None

        sel_pred_df = pred_df
        if is_filtered and not pred_df.empty:
            sel_pred_df = sel_pred_df[(sel_pred_df.filters == '.') |
                                      (sel_pred_df.filters == 'PASS')]
        if call_sv_type != 'NA' and not pred_df.empty:
            sel_pred_df = sel_pred_df[sel_pred_df.sv_type == call_sv_type]
        if not pred_df.empty and (args.min_rel_overlap is None or
                                  args.min_rel_overlap == 0):
            # Do not apply this filter if the matching is done based on overlap.
            sel_pred_df = sel_pred_df[np.logical_or(
                np.isnan(sel_pred_df.match_dist),
                sel_pred_df.match_dist <= dist)]

        add_metrics(sel_pred_df, sel_true_df, metrics)

    column_names = gt_filters
    column_names.extend(call_filters)
    other_names = set(metrics.keys()).difference(set(column_names))
    column_names.extend(other_names)

    metric_df = pd.DataFrame(metrics)
    metric_df = metric_df[column_names]

    martian.log_info("writing summary tsv")
    metric_df.to_csv(outs.summary_tsv, index=False, header=True,
                     sep='\t', na_rep='NaN')

    short_metrics = get_short_metrics(metric_df, other_names,
                                      max_ppv_tier, max_sens_tier, args)

    if args.call_summary is not None:
        with open(args.call_summary, 'r') as in_summary_fn:
            in_summary = json.load(in_summary_fn)
            for key, val in in_summary.iteritems():
                short_metrics[key] = val

    short_metrics['min_qv'] = min_qv

    with open(outs.summary, 'w') as out_file:
        out_file.write(tenkit.safe_json.safe_jsonify(short_metrics, pretty=True))
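
# join() computes metrics over the cross product of ground-truth and call-set
# filter settings, appending one value per combination to parallel lists and
# turning the result into a DataFrame with one row per combination. A minimal,
# self-contained sketch of that pattern (the column names here are
# hypothetical; add_metrics() fills the real values in the stage code):
def _example_metric_sweep():
    metrics = defaultdict(list)
    for tier, sv_type in product([1, 2], ['DEL', 'INV']):
        metrics['tier'].append(tier)
        metrics['sv_type'].append(sv_type)
        metrics['num_calls'].append(0)  # placeholder metric value
    # One row per (tier, sv_type) combination.
    return pd.DataFrame(metrics)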
def merge_breaks(bedpe_df, out_bedpe, merge_win=10000, max_range=np.inf,
                 max_nmates=np.inf, cluster_qual_factor=0.2):
    """Merges a set of SVs into a non-redundant set.

    Args:
    - bedpe_df: Either a BEDPE file or a DataFrame like the one returned by
      tk_sv_io.read_sv_bedpe_to_df.
    - out_bedpe: Path to file where output will be written.
    - merge_win: Breakpoints will be merged if they are within this distance
      from each other. Two SVs will be merged if both their breakpoints can
      be merged.
    - max_range: See max_range field of cluster_loci.
    - max_nmates: Two extra info fields will be added to the output BEDPE,
      NMATES1 and NMATES2. NMATES1 is the number of mate breakpoints (after
      merging, so breakpoint clusters) of the first breakpoint of an SV.
      SVs whose breakpoints both exceed the max_nmates cutoff will not be
      included in the output.

    Return value:
    The output BEDPE.
    """
    if not isinstance(bedpe_df, pd.DataFrame):
        bedpe_df = tk_sv_io.read_sv_bedpe_to_df(bedpe_df)
    else:
        bedpe_df = pd.DataFrame(bedpe_df)

    breaks = []
    for i in range(bedpe_df.shape[0]):
        breaks.append((bedpe_df.iloc[i, 0], bedpe_df.iloc[i, 1],
                       bedpe_df.iloc[i, 2], (bedpe_df.iloc[i, 6], 1)))
        breaks.append((bedpe_df.iloc[i, 3], bedpe_df.iloc[i, 4],
                       bedpe_df.iloc[i, 5], (bedpe_df.iloc[i, 6], 2)))

    _, mem_to_cluster, _ = cluster_loci(breaks, merge_win, max_range=max_range)

    cluster_pairs = {}
    for i in range(bedpe_df.shape[0]):
        name = bedpe_df.iloc[i]['name']
        cluster_idx1 = mem_to_cluster[(name, 1)]
        cluster_idx2 = mem_to_cluster[(name, 2)]
        if not (cluster_idx1, cluster_idx2) in cluster_pairs:
            cluster_pairs[(cluster_idx1, cluster_idx2)] = [i]
        else:
            old_pair = cluster_pairs[(cluster_idx1, cluster_idx2)][0]
            # Make sure the old and the new pair have breaks on the same chromosomes
            assert (bedpe_df.iloc[old_pair, 0] == bedpe_df.iloc[i, 0])
            assert (bedpe_df.iloc[old_pair, 3] == bedpe_df.iloc[i, 3])
            cluster_pairs[(cluster_idx1, cluster_idx2)].append(i)

    new_cluster_pairs = {}
    cluster_dist_ratio = {}
    for p, pos_list in cluster_pairs.iteritems():
        pos_arr = np.array(pos_list)
        tmp_df = get_dataframe_loc(bedpe_df, pos_arr)
        quals = np.array(tmp_df.qual)
        best_call = pos_arr[np.argmax(quals)]
        close_calls = np.where(quals >= cluster_qual_factor * np.max(quals))[0]
        close_df = get_dataframe_loc(tmp_df, close_calls)

        same_chrom = (bedpe_df.iloc[best_call]['chrom2'] ==
                      bedpe_df.iloc[best_call]['chrom1'])
        min_break_dist = np.min(close_df.start2) - np.max(close_df.stop1)
        max_break_dist = (bedpe_df.iloc[best_call]['start2'] -
                          bedpe_df.iloc[best_call]['stop1'])
        new_cluster_pairs[p] = best_call
        if not same_chrom or max_break_dist > MAX_FRAG_SIZE:
            cluster_dist_ratio[p] = '.'
        elif min_break_dist <= 0:
            cluster_dist_ratio[p] = float('NaN')
        else:
            cluster_dist_ratio[p] = float(max_break_dist) / min_break_dist

    cluster_pairs = new_cluster_pairs

    def clusters_close(i, j):
        chrom1, start1, stop1 = (bedpe_df.iloc[i, 0], bedpe_df.iloc[i, 1],
                                 bedpe_df.iloc[i, 2])
        chrom2, start2, stop2 = (bedpe_df.iloc[i, 3], bedpe_df.iloc[i, 4],
                                 bedpe_df.iloc[i, 5])
        next_chrom1, next_start1, next_stop1 = (bedpe_df.iloc[j, 0],
                                                bedpe_df.iloc[j, 1],
                                                bedpe_df.iloc[j, 2])
        next_chrom2, next_start2, next_stop2 = (bedpe_df.iloc[j, 3],
                                                bedpe_df.iloc[j, 4],
                                                bedpe_df.iloc[j, 5])
        dist1 = max(next_start1 - stop1, start1 - next_stop1)
        dist2 = max(next_start2 - stop2, start2 - next_stop2)
        return (chrom1 == next_chrom1 and chrom2 == next_chrom2 and
                dist1 <= merge_win and dist2 <= merge_win)

    # The "chain-breaking" in cluster_loci might still leave some redundancy.
    # In particular, we might leave some almost touching clusters that were
    # separated only because of chain-breaking. Do a second round of clustering
    # where you go through consecutive pairs of clusters and merge them if
    # they're merge-able.
    new_cluster_pairs = {}
    for (cluster1, cluster2) in sorted(cluster_pairs.keys()):
        if cluster_pairs[(cluster1, cluster2)] == -1:
            continue
        # Consider all neighboring clusters after this cluster.
        # Notice that the cluster indices are sorted by genomic coordinates.
        neigh_clusters = [(cluster1, cluster2 + 1), (cluster1 + 1, cluster2 - 1),
                          (cluster1 + 1, cluster2), (cluster1 + 1, cluster2 + 1)]
        idx = cluster_pairs[(cluster1, cluster2)]
        # Best cluster among neighboring clusters
        max_cluster = ((cluster1, cluster2), idx)
        for next_cluster1, next_cluster2 in neigh_clusters:
            if not (next_cluster1, next_cluster2) in cluster_pairs:
                continue
            if cluster_pairs[(next_cluster1, next_cluster2)] == -1:
                continue
            next_idx = cluster_pairs[(next_cluster1, next_cluster2)]
            if clusters_close(idx, next_idx):
                cluster_pairs[(next_cluster1, next_cluster2)] = -1
                if bedpe_df.iloc[idx]['qual'] < bedpe_df.iloc[next_idx]['qual']:
                    max_cluster = ((next_cluster1, next_cluster2), next_idx)
        new_cluster_pairs[max_cluster[0]] = max_cluster[1]

    cluster_pairs = new_cluster_pairs

    # Now compute the number of mate breakpoints for each cluster
    num_mates = {}
    for (cluster1, cluster2) in cluster_pairs.keys():
        if not cluster1 in num_mates:
            num_mates[cluster1] = 0
        if not cluster2 in num_mates:
            num_mates[cluster2] = 0
        num_mates[cluster1] += 1
        if cluster2 != cluster1:
            num_mates[cluster2] += 1

    sel_loc = []
    new_info_strs = []
    for (cluster1, cluster2) in sorted(cluster_pairs.keys()):
        sv_loc = cluster_pairs[(cluster1, cluster2)]
        if num_mates[cluster1] > max_nmates and num_mates[cluster2] > max_nmates:
            continue
        sel_loc.append(sv_loc)
        new_info_strs.append(tk_sv_io.update_info(
            bedpe_df.iloc[sv_loc]['info'],
            ['NMATES1', 'NMATES2', 'RESOLUTION'],
            [num_mates[cluster1], num_mates[cluster2],
             cluster_dist_ratio[(cluster1, cluster2)]]))

    if len(sel_loc) > 0:
        bedpe_df = bedpe_df.iloc[sel_loc]
        bedpe_df['info'] = new_info_strs
    else:
        bedpe_df = pd.DataFrame(columns=bedpe_df.columns)

    if out_bedpe is not None:
        tk_sv_io.write_sv_df_to_bedpe(bedpe_df, out_bedpe)
    return bedpe_df
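
# merge_breaks() relies on cluster_loci() (defined elsewhere) to group nearby
# breakpoints. The sketch below shows the single-linkage idea assumed here --
# sort the loci, then chain consecutive ones that lie within merge_win of the
# open cluster -- without cluster_loci's chain-breaking via max_range. It is
# illustrative only, not the real implementation.
def _example_cluster_loci(loci, merge_win):
    """Illustrative only. loci: iterable of (chrom, start, stop, name) tuples.

    Returns a dict mapping each name to a cluster index."""
    mem_to_cluster = {}
    cluster_idx = -1
    prev_chrom = None
    prev_stop = -np.inf
    for chrom, start, stop, name in sorted(loci):
        if chrom != prev_chrom or start - prev_stop > merge_win:
            # Too far from the open cluster: start a new one.
            cluster_idx += 1
            prev_stop = stop
        else:
            prev_stop = max(prev_stop, stop)
        prev_chrom = chrom
        mem_to_cluster[name] = cluster_idx
    return mem_to_cluster

# Under this scheme two SVs merge only if both their breakpoints land in the
# same pair of clusters, which is exactly what cluster_pairs above keys on.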
def compare_multiple_breaks(in_bedpes, sample_names, out_bedpe,
                            merge_win=0, max_range=np.inf):
    """Compares multiple BEDPE files.

    Args:
    - in_bedpes: A list of BEDPE files to compare.
    - sample_names: A list of the same size with unique names for the input
      samples.
    - out_bedpe: Where the union BEDPE will be written.

    Return value:
    A DataFrame with the union of calls and information about which calls are
    present in which input files. This DataFrame will have one entry per call
    in the union and will include (among other things) columns <sample>_qual,
    <sample>_filtered, <sample>_correct, and <sample>_dist for each of the
    input BEDPEs.
    """
    assert (len(sample_names) == len(in_bedpes))

    # Merge all the input files. This will get rid of redundant entries.
    # The quality in the output will be the maximum quality across all files.
    merged_df = merge_multiple_breaks(in_bedpes, out_bedpe,
                                      merge_win=merge_win, max_range=max_range)
    num_merged = len(merged_df)

    # Map the name of each entry in the union to its index in the DataFrame.
    name_to_ind = {}
    for i, n in enumerate(merged_df['name']):
        name_to_ind[n] = i

    new_filters = [set([]) for i in range(num_merged)]
    new_matches = [set([]) for i in range(num_merged)]

    # For each of the input BEDPEs find which of the entries in the union it
    # overlaps. This is somewhat duplicated work, but it's simpler this way.
    for sample, bedpe in zip(sample_names, in_bedpes):
        in_df = tk_sv_io.read_sv_bedpe_to_df(bedpe)
        name_to_ind2 = {}
        for i, n in enumerate(in_df['name']):
            name_to_ind2[n] = i

        matched_qual = np.zeros((num_merged, ), dtype=np.int)
        is_correct = np.zeros((num_merged, ), dtype=np.bool)
        is_filtered = np.zeros((num_merged, ), dtype=np.bool)
        tmp_dist = np.zeros((num_merged, ), dtype=np.int)
        matched_names = ['' for i in range(num_merged)]

        # merged_to_this will be a dictionary from a name in the union to a
        # set of names in the input bedpe
        merged_to_this, _, _ = compare_breaks(merged_df, bedpe, max_dist=merge_win)
        for name1, name2_set in merged_to_this.iteritems():
            ind1 = name_to_ind[name1]
            matched_names[ind1] = ';'.join([str(s) for s in name2_set])
            for name2 in name2_set:
                ind2 = name_to_ind2[name2]
                matched_qual[ind1] = max(matched_qual[ind1],
                                         in_df.iloc[ind2]['qual'])
                match = tk_sv_io.extract_sv_info(in_df.iloc[ind2]['info'],
                                                 ['MATCHES'])[0]
                is_match_correct = (match != '.' and match != '' and
                                    match is not None)
                if is_match_correct:
                    new_matches[ind1].add(match)
                    # Never set back to False if it was set to True.
                    is_correct[ind1] = True
                is_filtered[ind1] = in_df.iloc[ind2]['filters'] != '.'
                if in_df.iloc[ind2]['filters'] != '.':
                    new_filters[ind1] = new_filters[ind1].union(
                        set(in_df.iloc[ind2]['filters'].split(';')))
                if in_df.iloc[ind2]['chrom1'] != in_df.iloc[ind2]['chrom2']:
                    tmp_dist[ind1] = -1
                else:
                    tmp_dist[ind1] = (in_df.iloc[ind2]['start2'] -
                                      in_df.iloc[ind2]['stop1'])

        merged_df[str(sample) + '_matches'] = matched_names
        merged_df[str(sample) + '_qual'] = matched_qual
        merged_df[str(sample) + '_correct'] = is_correct
        merged_df[str(sample) + '_filtered'] = is_filtered
        merged_df[str(sample) + '_dist'] = tmp_dist

    info_strs = ['.' for i in range(num_merged)]
    filter_strs = ['.' for i in range(num_merged)]
    for i in range(num_merged):
        match_str = ','.join(new_matches[i]) if len(new_matches[i]) > 0 else '.'
        info_strs[i] = tk_sv_io.update_info('.', ['MATCHES'], [match_str])
        filter_strs[i] = ';'.join(new_filters[i]) if len(new_filters[i]) > 0 else '.'

    merged_df['qual'] = np.array(
        np.max(merged_df[[str(s) + '_qual' for s in sample_names]], axis=1),
        dtype=np.int)
    merged_df['filters'] = filter_strs
    merged_df['info'] = info_strs
    merged_df.sort(['qual', 'chrom1', 'start1', 'stop1', 'chrom2', 'start2', 'stop2'],
                   ascending=[0, 1, 1, 1, 1, 1, 1], inplace=True)
    return merged_df
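
# A hedged usage sketch for compare_multiple_breaks(); the file names and
# sample names below are hypothetical:
#
#   union_df = compare_multiple_breaks(['run_a.bedpe', 'run_b.bedpe'],
#                                      ['run_a', 'run_b'], 'union.bedpe')
#   # Calls found only in run_a should have union_df['run_b_qual'] == 0,
#   # union_df['run_a_filtered'] marks calls that run_a had filtered, and
#   # union_df['qual'] is the per-call maximum across the two runs.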
def main(args, outs):
    bedpe_df = tk_sv_io.read_sv_bedpe_to_df(args.sv_variants)
    bedpe_df = tk_sv_utils.get_dataframe_loc(
        bedpe_df, list(range(int(args.start_idx), int(args.stop_idx))))

    max_insert, ins_logsf_fun = tk_sv_utils.get_insert_size_info(args.insert_sizes)
    if max_insert is None:
        martian.throw('No Q60 reads')

    with open(args.basic_summary, 'r') as f:
        summary = json.load(f)

    chimera_rate_del = summary['far_chimera_rate']
    chimera_rate_inv = summary['far_chimera_rate'] + summary['same_dir_chimera_rate']
    chimera_rate_trans = summary['far_chimera_rate']
    chimera_rate_dup = summary['far_chimera_rate'] + summary['outward_dir_chimera_rate']

    chimera_rates = {
        tk_readpairs.DEL_STR: chimera_rate_del,
        tk_readpairs.INV_STR: chimera_rate_inv,
        tk_readpairs.TDUP_STR: chimera_rate_dup,
        tk_readpairs.TRANS_FF_STR: chimera_rate_trans,
        tk_readpairs.TRANS_FR_STR: chimera_rate_trans,
        tk_readpairs.TRANS_RR_STR: chimera_rate_trans,
        tk_readpairs.TRANS_RF_STR: chimera_rate_trans
    }

    in_bam = tk_bam.create_bam_infile(args.input)

    out_quals = []
    out_infos = []
    out_chroms1 = []
    out_starts1 = []
    out_stops1 = []
    out_chroms2 = []
    out_starts2 = []
    out_stops2 = []

    for i, (_, row) in enumerate(bedpe_df.iterrows()):
        in_svt = tk_sv_io.get_sv_type(row.info)

        if row.chrom1 == row.chrom2:
            max_ext = min(args.break_extend, int((row.start2 - row.stop1) / 3.0))
            r1 = (max(0, row.start1 - args.break_extend), row.stop1 + max_ext)
            r2 = (max(0, row.start2 - max_ext), row.stop2 + args.break_extend)
            if r1[1] > r2[0]:
                # Overlapping windows: fetch the whole region once.
                starts = [r1[0]]
                stops = [r2[1]]
                chroms = [row.chrom1]
            else:
                starts = [r1[0], r2[0]]
                stops = [r1[1], r2[1]]
                chroms = [row.chrom1, row.chrom2]
        else:
            r1 = (max(0, row.start1 - args.break_extend),
                  row.stop1 + args.break_extend)
            r2 = (max(0, row.start2 - args.break_extend),
                  row.stop2 + args.break_extend)
            starts = [r1[0], r2[0]]
            stops = [r1[1], r2[1]]
            chroms = [row.chrom1, row.chrom2]

        readpairs = tk_readpairs.get_readpairs2(in_bam, chroms, starts, stops,
                                                max_insert=max_insert,
                                                min_mapq=args.min_mapq)

        # Distal readpairs across the breakpoints
        dist_readpairs = filter(filter_fun(in_bam, row.chrom1, row.chrom2, r1, r2),
                                readpairs)

        if len(dist_readpairs) > MAX_READPAIRS:
            sel = np.random.choice(len(dist_readpairs), MAX_READPAIRS)
        else:
            sel = np.arange(len(dist_readpairs))
        dist_readpairs = [dist_readpairs[ridx] for ridx in sel]

        res_arr = tk_readpairs.get_sv_lr(dist_readpairs, ins_logsf_fun,
                                         max_insert, chimera_rates)

        if len(res_arr) == 0:
            out_quals.append(row.qual)
            out_chroms1.append(row.chrom1)
            out_starts1.append(row.start1)
            out_stops1.append(row.stop1)
            out_chroms2.append(row.chrom2)
            out_starts2.append(row.start2)
            out_stops2.append(row.stop2)
            out_infos.append(row['info'])
        else:
            if args.best_only:
                res_arr = sorted(res_arr, key=lambda x: x[0], reverse=True)
                res_arr = [res_arr[0]]

            for (lr, num_split, num_pairs, sv_len, support_range, svt,
                 support_readpairs) in res_arr:
                range1, range2 = support_range
                if num_split + num_pairs >= args.min_reads_to_call and \
                   lr >= args.min_lr_to_call and \
                   range1 is not None and range2 is not None:
                    out_quals.append(row.qual + args.rp_lr_multiplier * lr)
                    out_chroms1.append(row.chrom1)
                    out_starts1.append(range1[0])
                    out_stops1.append(range1[1])
                    out_chroms2.append(row.chrom2)
                    out_starts2.append(range2[0])
                    out_stops2.append(range2[1])
                    if svt != in_svt and in_svt != 'TRANS':
                        in_svt = 'UNK'
                else:
                    out_quals.append(row.qual)
                    out_chroms1.append(row.chrom1)
                    out_starts1.append(row.start1)
                    out_stops1.append(row.stop1)
                    out_chroms2.append(row.chrom2)
                    out_starts2.append(row.start2)
                    out_stops2.append(row.stop2)
                out_infos.append(tk_sv_io.update_info(
                    row['info'],
                    ['NPAIRS', 'NSPLIT', 'RP_LR', 'RP_TYPE', 'TYPE'],
                    [num_pairs, num_split, lr, svt, in_svt]))

    in_bam.close()

    if args.best_only:
        out_names = [n for n in bedpe_df['name']]
    else:
        out_names = np.arange(len(bedpe_df))

    out_df = tk_sv_io.create_sv_df(out_chroms1, out_starts1, out_stops1,
                                   out_chroms2, out_starts2, out_stops2,
                                   out_names, out_quals, out_infos)
    tk_sv_io.write_sv_df_to_bedpe(out_df, outs.sv_variants)
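
# Worked example of the breakpoint-window logic in main() above: for a
# same-chromosome call with stop1=10000 and start2=11200 and
# args.break_extend=500, the inner extension is
# max_ext = min(500, int(1200 / 3.0)) = 400, giving
# r1 = (start1 - 500, 10400) and r2 = (10800, stop2 + 500). Since
# r1[1] = 10400 <= r2[0] = 10800, the two windows are fetched separately;
# had the breakpoints been closer, r1 and r2 would overlap and collapse into
# a single fetch over (r1[0], r2[1]).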