Example #1
def get_hap_coverage(in_bam, ps_h5, chrom, start, stop, cov_quals):
    """Return a dataframe with coverage per haplotype.

    Args:
    - in_bam: reader for a position-sorted BAM
    - ps_h5: HDF5 with phase set coordinates
    - chrom, start, stop: region to get coverage
    - cov_quals: Array of MAPQ cutoffs.

    Return value:
    A dataframe with columns:
    - chrom
    - pos
    - cov_q<M>_hap<H> for every M in cov_quals and H in [0, 1, 2]: the coverage
    on haplotype H using reads with MAPQ >= M. Haplotype 2 corresponds to
    unphased reads.
    - phase_set: -1 if ps_h5 is missing or no phase set covers the position.
    """
    coverages = [np.zeros((stop - start, 3)) for _ in cov_quals]

    for _, read in enumerate(in_bam.fetch(str(chrom), int(start), int(stop))):
        if not read.is_unmapped and not read.aend is None and not read.is_secondary and not read.is_duplicate:
            hap = tk_io.get_read_haplotype(read)
            hap_idx = 2 if hap is None else hap - 1
            range_start = max(0, read.pos - start)
            range_stop = min(stop, read.aend) - start
            for qi, q in enumerate(cov_quals):
                if read.mapq >= q:
                    coverages[qi][range_start:range_stop + 1, hap_idx] += 1

    base_df = pd.DataFrame({'chrom': chrom, 'pos': np.arange(start, stop)})
    dfs = map(
        lambda x: pd.DataFrame(
            x[0],
            columns=['cov_q' + str(x[1]) + '_hap' + str(i) for i in range(3)]),
        zip(coverages, cov_quals))
    df = pd.concat([base_df, pd.concat(dfs, axis=1)], axis=1)

    phase_sets = -np.ones((stop - start, ), dtype=np.int)

    # This can be None if, for example, the input is unbarcoded.
    if not ps_h5 is None:
        ps_df = tk_hdf5.read_data_frame(ps_h5)
        ps_df = ps_df[np.logical_and(
            ps_df.chrom == chrom,
            np.logical_and(ps_df.end >= start, ps_df.start < stop))]

        for _, row in ps_df.iterrows():
            range_start = max(0, row.start - start)
            range_stop = min(stop, row.end) - start
            phase_sets[range_start:range_stop + 1] = row.phase_set

    df['phase_set'] = phase_sets
    return df
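
A minimal usage sketch for get_hap_coverage (the BAM path, phase-set HDF5 path, and locus below are hypothetical, and pysam plus the tenkit modules used inside the function are assumed to be available):

import pysam

in_bam = pysam.AlignmentFile('phased_possorted.bam', 'rb')   # hypothetical path
cov_df = get_hap_coverage(in_bam, 'fragment_phasing.h5', 'chr1',
                          1000000, 1010000, cov_quals=[0, 30])
# One row per position; cov_q30_hap0/1/2 hold per-haplotype depth for MAPQ >= 30 reads.
print(cov_df[['pos', 'cov_q30_hap0', 'cov_q30_hap1', 'cov_q30_hap2', 'phase_set']].head())
in_bam.close()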
Example #2
def join(args, outs, chunk_defs, chunk_outs):
    out_calls = None
    out_pileups = None
    for c in chunk_outs:
        if not os.path.isfile(c.sv_calls):
            continue
        calls = tk_sv_io.read_sv_bedpe_to_df(c.sv_calls)
        pileups = tk_sv_io.read_sv_bedpe_to_df(c.pileups)
        out_calls = pd.concat([out_calls, calls], ignore_index=True)
        out_pileups = pd.concat([out_pileups, pileups], ignore_index=True)

    tk_sv_io.write_sv_df_to_bedpe(out_calls, outs.sv_calls)
    tk_sv_io.write_sv_df_to_bedpe(out_pileups, outs.pileups)
Example #3
def join(args, outs, chunk_defs, chunk_outs):
    join_df = None
    non_pass_join_df = None
    for chunk in chunk_outs:
        df = tk_sv_io.read_sv_bedpe_to_df(chunk.sv_calls)
        non_pass_df = tk_sv_io.read_sv_bedpe_to_df(chunk.non_pass_sv_calls)
        join_df = pd.concat([join_df, df], ignore_index=True)
        non_pass_join_df = pd.concat([non_pass_join_df, non_pass_df],
                                     ignore_index=True)

    join_df['name'] = np.arange(len(join_df))
    tk_sv_io.write_sv_df_to_bedpe(join_df, outs.sv_calls)
    non_pass_join_df['name'] = np.arange(len(join_df),
                                         len(join_df) + len(non_pass_join_df))
    tk_sv_io.write_sv_df_to_bedpe(non_pass_join_df, outs.non_pass_sv_calls)
Example #4
def join(args, outs, chunk_defs, chunk_outs):
    out_df = None
    for chunk in chunk_outs:
        tmp_df = tk_sv_io.read_sv_bedpe_to_df(chunk.del_candidates)
        out_df = pd.concat([out_df, tmp_df], ignore_index=True)

    out_df['name'] = np.arange(len(out_df))
    tk_sv_io.write_sv_df_to_bedpe(out_df, outs.del_candidates)
Example #5
def read_bedpes(args):
    sv_df = tk_sv_io.read_sv_bedpe_to_df(args.sv_calls)
    if not args.sv_calls2 is None:
        sv_df = pd.concat(
            [sv_df, tk_sv_io.read_sv_bedpe_to_df(args.sv_calls2)],
            ignore_index=True)
        sv_df['name'] = np.arange(len(sv_df))
    return sv_df
Example #6
def make_output_dataframes(bcs_frags_in):
    fragments = []
    bcs = []

    bc_dfs = []
    fragment_dfs = []

    for (bc_stats, frags) in bcs_frags_in:
        # Denormalize selected bc columns into the fragments dataframe
        for (k, v) in bc_stats.items():
            if k in [
                    'bc', 'bc_num_reads', 'bc_mean_reads_per_fragment',
                    'bc_est_len', 'bc_num_unmapped_reads'
            ]:
                for frag in frags:
                    frag[k] = v

        fragments.extend(frags)
        bcs.append(bc_stats)

        if len(fragments) > 2e6:
            (frag_df, bc_df) = make_df_chunk(fragments, bcs)

            fragment_dfs.append(frag_df)
            fragments = []

            bc_dfs.append(bc_df)
            bcs = []

    (frag_df, bc_df) = make_df_chunk(fragments, bcs)
    fragment_dfs.append(frag_df)
    bc_dfs.append(bc_df)

    frag_dfs = [x for x in fragment_dfs if x is not None]
    bc_dfs = [x for x in bc_dfs if x is not None]

    if len(bc_dfs) > 0:
        frag_df = p.concat(frag_dfs)
        bc_df = p.concat(bc_dfs)
    else:
        frag_df = None
        bc_df = None

    return (frag_df, bc_df)
Example #7
def join(args, outs, chunk_defs, chunk_outs):
    join_df = None
    for chunk in chunk_outs:
        bedpe_df = tk_sv_io.read_sv_bedpe_to_df(chunk.sv_variants)
        join_df = pd.concat([join_df, bedpe_df], ignore_index=True)

    if not args.best_only:
        join_df['name'] = np.arange(len(join_df))

    tk_sv_io.write_sv_df_to_bedpe(join_df, outs.sv_variants)
Example #8
def main(args, outs):

    sv_df = tk_sv_io.read_sv_bedpe_to_df(args.sv_variants)
    sv_df["info2"] = "SV"

    cnv_df = tk_sv_io.read_sv_bedpe_to_df(args.cnv_variants)
    cnv_df["info2"] = "CNV"

    sv_df = pd.concat([sv_df, cnv_df], ignore_index=True)
    sv_df['name'] = np.arange(len(sv_df))
    sv_df.sort(['chrom1', 'chrom2'], inplace=True)

    res_df = None
    for _, tmp_df in sv_df.groupby(['chrom1', 'chrom2']):
        tmp_df.sort(['chrom1', 'start1', 'stop1', 'chrom2', 'start2', 'stop2'],
                    inplace=True)
        # cluster the loci in the group based on proximity
        groups = tk_sv_utils.get_break_groups(tmp_df, args.max_dist)

        # for each cluster, get the row with max qual
        # tmp_df.loc[g] gets the subset of tmp_df in the cluster.
        # then idxmax gets the max index

        out_df = pd.DataFrame(columns=sv_df.columns)
        idx = 0
        for g in groups:
            row = tmp_df.loc[tmp_df.loc[g]['qual'].idxmax()]
            if (tmp_df.loc[g]['info2'] == 'SV').any():
                row = tmp_df.loc[(tmp_df.loc[g]['info2'] == 'SV').idxmax()]

            source = list(set(tmp_df.loc[g]['info2']))
            row['info'] += (";SOURCE=" + ",".join(source))
            out_df.loc[idx] = row
            idx += 1

        out_df.sort(['start1', 'stop1', 'start2', 'stop2'], inplace=True)
        res_df = pd.concat([res_df, out_df], ignore_index=True)

    tk_sv_io.write_sv_df_to_bedpe(res_df, outs.sv_variants)
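
The cluster-then-pick-best logic in the loop above can be traced on a toy frame; this sketch replaces tk_sv_utils.get_break_groups with a precomputed 'cluster' column, which is an assumption for illustration only:

import pandas as pd

toy = pd.DataFrame({'qual': [10, 50, 30, 20],
                    'info2': ['CNV', 'SV', 'CNV', 'CNV'],
                    'cluster': [0, 0, 1, 1]})   # stand-in for get_break_groups output
best = []
for _, grp in toy.groupby('cluster'):
    # Default: keep the max-qual row; prefer an SV-sourced row if the cluster has one.
    row = grp.loc[grp['qual'].idxmax()]
    if (grp['info2'] == 'SV').any():
        row = grp.loc[(grp['info2'] == 'SV').idxmax()]
    best.append(row)
print(pd.DataFrame(best))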
Example #9
def get_reads(in_bam, chrom, start, stop, in_read_df=None,
              min_mapq=30, max_reads=500000, blacklist_barcodes=None):
    poses = []
    ends = []
    bcs = []

    if not in_read_df is None and len(in_read_df) > 0:
        ret_df = in_read_df.sort('pos')
        old_poses = np.array(ret_df['pos'])
        # Subtracting the read length is roughly right; ideally we would sort
        # by aend.
        # Loci are considered in an ordered fashion, so we should never fetch
        # reads "earlier" in the bam.
        start = max(old_poses[0], max(0, start - MAX_READ_LEN))
        if start >= old_poses[0] and start <= old_poses[-1]:
            start_idx = bisect.bisect_left(old_poses, start)
            if stop >= old_poses[0] and stop <= old_poses[-1]:
                stop_idx = min(len(ret_df), bisect.bisect(old_poses, stop))
            else:
                stop_idx = len(ret_df)
            # Remove all positions that are smaller than the input start
            ret_df = ret_df.iloc[start_idx:stop_idx]
            # Set the new start to the end of the input data frame.
            # Add an overlap of READ_LEN to capture reads that were right on
            # the boundary between the old and new data frame.
            start = max(0, old_poses[stop_idx - 1] - MAX_READ_LEN)
            stop = max(start, stop)
    else:
        ret_df = None

    for i, read in enumerate(in_bam.fetch(str(chrom), int(start), int(stop))):
        if i > max_reads:
            break
        bc = tk_io.get_read_barcode(read)
        if read.pos < start:
            continue
        if not blacklist_barcodes is None and bc in blacklist_barcodes:
            continue
        if not read.is_secondary and not read.is_duplicate and read.is_read1 and \
           not read.is_unmapped and read.mapq >= min_mapq and read.is_proper_pair and \
           not bc is None:
            poses.append(read.pos)
            ends.append(read.aend)
            bcs.append(tk_io.get_read_barcode(read))

    tmp_ret_df = pd.DataFrame({'chrom':chrom, 'pos':poses, 'aend':ends, 'bc':bcs})

    ret_df = pd.concat([ret_df, tmp_ret_df], ignore_index=True)
    ret_df.sort(['bc', 'pos'], inplace=True)
    return ret_df
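
A sketch of how the in_read_df argument is meant to be used when scanning consecutive loci left to right (the BAM path, chromosome, and window size are hypothetical; the tenkit helpers and constants used inside get_reads are assumed importable):

import pysam

in_bam = pysam.AlignmentFile('possorted.bam', 'rb')   # hypothetical path
reads = None
# Each call passes back the previous dataframe so overlapping reads are reused
# via the bisect logic above instead of being re-fetched from the BAM.
for win_start in range(0, 1000000, 100000):
    reads = get_reads(in_bam, 'chr1', win_start, win_start + 100000,
                      in_read_df=reads, min_mapq=30)
in_bam.close()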
Example #10
def merge_calls_and_gt(call_df, gt_df, call_to_gt):

    if not gt_df is None:
        gt_df.index = gt_df['name']
    else:
        call_to_gt = {}

    out_call_df = None
    for _, row in call_df.iterrows():
        sv_type = tk_sv_io.get_sv_type(row.info)
        orient = tk_sv_io.get_break_orientation(row.info)
        row['orient'] = orient

        # revert sv type name from DISTAL to TRANS to match ground truth
        # conventions
        if sv_type == 'DISTAL':
            sv_type = 'TRANS'
        row['sv_type'] = sv_type

        matches = list(call_to_gt.get(row['name'], [None]))
        # One output row per match
        for m in matches:
            row['match'] = m
            if not m is None and not gt_df is None:
                x = gt_df.loc[m]
                row['match_dist'] = max(
                    dist_to_breaks(int((row.start1 + row.stop1) / 2), x.start1,
                                   x.stop1),
                    dist_to_breaks(int((row.start2 + row.stop2) / 2), x.start2,
                                   x.stop2))
            else:
                row['match_dist'] = float('NaN')

            out_call_df = pd.concat(
                [out_call_df, pd.DataFrame([row])], ignore_index=True)

    if not gt_df is None:
        out_call_df = pd.merge(out_call_df,
                               gt_df,
                               left_on='match',
                               right_on='name',
                               how='outer',
                               suffixes=['', '_gt'])
        out_call_df.drop(['filters_gt', 'dist'], axis=1, inplace=True)
    out_call_df.sort('name', inplace=True)

    return out_call_df
Example #11
def join(args, outs, chunk_defs, chunk_outs):
    # Combine the per-chunk bait coverage CSV files
    frame = p.DataFrame()
    list_ = []
    if args.baits_file_map and outs.bait_csv:
        in_files = [
            out.bait_csv for (cdef, out) in zip(chunk_defs, chunk_outs)
        ]
        for file_ in in_files:
            df = p.read_csv(file_, index_col=None, header=0)
            list_.append(df)
        frame = p.concat(list_)

        # write csv
        frame.to_csv(outs.bait_csv)
    else:
        outs.target_coverage = None
Example #12
def join(args, outs, chunk_defs, chunk_outs):
    out_bedpe = None
    for c in chunk_outs:
        if not os.path.isfile(c.sv_variants):
            continue
        in_bedpe = sv_io.read_sv_bedpe_to_df(c.sv_variants)
        if not in_bedpe is None:
            out_bedpe = pd.concat([out_bedpe, in_bedpe], ignore_index=True)

    if not out_bedpe is None:
        out_bedpe['name'] = np.arange(len(out_bedpe))
    sv_io.write_sv_df_to_bedpe(out_bedpe, outs.sv_variants)

    if chunk_outs[0] is not None and os.path.exists(chunk_outs[0].summary):
        shutil.copyfile(chunk_outs[0].summary, outs.summary)
    else:
        outs.summary = None
Example #13
def join(args, outs, chunk_defs, chunk_outs):
    out_loci = []
    summary_df = None
    for chunk in chunk_outs:
        with open(chunk.loci, 'r') as f:
            out_loci.extend(cPickle.load(f))
        summary_df = pd.concat([
            summary_df,
            pd.read_csv(chunk.cov_summary, sep='\t', header=0, index_col=None)
        ],
                               ignore_index=True)

    # There might be some overlapping loci due to the overlap between chunks
    # but we hope that subsequent stages will deal with this.
    with open(outs.loci, 'w') as f:
        cPickle.dump(out_loci, f)
    summary_df.to_csv(outs.cov_summary, sep='\t', header=True, index=False)
Example #14
def merge_multiple_breaks(in_bedpes,
                          out_bedpe,
                          merge_win=10000,
                          max_range=np.inf):
    assert (len(in_bedpes) > 0)
    in_bedpe_df = None
    for bi, bedpe in enumerate(in_bedpes):
        bedpe_df = tk_sv_io.read_sv_bedpe_to_df(bedpe)
        assert (bedpe_df.shape[1] > 11)
        bedpe_df = bedpe_df.iloc[:, 0:12]
        # Make sure that all names from all files are unique
        bedpe_df['name'] = [str(n) + '_' + str(bi) for n in bedpe_df['name']]
        in_bedpe_df = pd.concat([in_bedpe_df, bedpe_df], ignore_index=True)

    return merge_breaks(in_bedpe_df,
                        out_bedpe,
                        merge_win=merge_win,
                        max_range=max_range)
Example #15
def join(args, outs, chunk_defs, chunk_outs):
    out_bedpe = None
    for c in chunk_outs:
        if not os.path.isfile(c.sv_variants):
            continue
        in_bedpe = tk_sv_io.read_sv_bedpe_to_df(c.sv_variants)
        if not in_bedpe is None:
            out_bedpe = pd.concat([out_bedpe, in_bedpe], ignore_index=True)

    if out_bedpe is None:
        col_names = ['chrom1', 'start1', 'stop1',
                     'chrom2', 'start2', 'stop2', 'name', 'qual',
                     'strand1', 'strand2', 'filters', 'info']
        out_bedpe = pd.DataFrame(columns=col_names)
    out_bedpe['name'] = np.arange(len(out_bedpe))

    out_bedpe = out_bedpe[out_bedpe.qual >= args.sv_min_qv]
    tk_sv_io.write_sv_df_to_bedpe(out_bedpe, outs.sv_variants)
Example #16
def read_data_frame_filtered(fn,
                             filter_func,
                             query_cols=[],
                             chunk_size=5000000):
    ''' Load a pandas DataFrame from an HDF5 file. If a column list is specified, only load the matching columns.
        filter_func should take a DataFrame and return a boolean vector of the rows to keep.
        Rows are loaded from the file and filtered in chunks to keep peak memory usage small. '''

    f = h5py.File(fn, 'r')

    column_names = f.attrs.get("column_names")
    column_names = get_column_intersection(column_names, query_cols)
    column_index = p.Index(column_names)

    sz = f[column_names[0]].shape[0]
    starts = np.arange(0, sz, chunk_size)
    ends = np.minimum(sz, starts + chunk_size)

    chunks = []

    for (start, end) in zip(starts, ends):
        cols = {}
        for name in column_names:
            ds = f[name]
            if has_levels(ds):
                indices = ds[start:end]
                uniques = get_levels(ds)
                col = uniques[indices]
            else:
                col = ds[start:end]

            cols[name] = col
        df = p.DataFrame(cols, columns=column_index)
        df = df[filter_func(df)]

        if len(df) > 0 or len(chunks) == 0:
            chunks.append(df)

    f.close()

    result = p.concat(chunks, ignore_index=True)
    return result
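
A usage sketch for read_data_frame_filtered (the HDF5 path and column names are hypothetical; the only requirement is that filter_func maps a chunk DataFrame to a boolean vector):

# Keep only chr1 fragments with at least 10 reads, loading the file in chunks.
df = read_data_frame_filtered(
    'fragments.h5',                                  # hypothetical tenkit-style HDF5
    lambda chunk: (chunk['chrom'] == 'chr1') & (chunk['num_reads'] >= 10),
    query_cols=['chrom', 'start_pos', 'num_reads'])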
Example #17
def merge_overlapping(callsets, selection_fun=select_first()):
    """Merge overlapping calls and remove redundancies based on the specified function.

    - callsets: list of call dataframes
    - selection_fun: given a group of overlapping calls, this function decides
    which of these calls should be output. Default is to pick the first in each group.
    """

    # Make sure there are no inter-chromosomal calls
    assert all(not has_inter_chromosomal(callset) for callset in callsets)
    callset = pd.concat(callsets, ignore_index=True)
    # The selection functions might break if this is empty
    if callset.empty:
        return None
    callset.sort(['chrom1', 'start1', 'start2'], inplace=True)
    groups = group_overlapping(callset)

    # agg returns a multilevel dataframe. reset_index removes one level and
    # adds a group column, which we drop.
    return groups.agg(selection_fun).reset_index().drop('group', axis=1)
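
The agg/reset_index step at the end can be illustrated on a toy grouped frame; the 'group' column and the first-row reducer below are assumptions standing in for group_overlapping and select_first, which are not shown in this example:

import pandas as pd

calls = pd.DataFrame({'group': [0, 0, 1],
                      'chrom1': ['chr1', 'chr1', 'chr1'],
                      'start1': [100, 150, 900],
                      'qual': [10, 40, 5]})
groups = calls.groupby('group')
# A selection function passed to .agg is applied column-by-column within each group;
# taking the first element of every column reproduces the "pick the first call" default.
merged = groups.agg(lambda col: col.iloc[0]).reset_index().drop('group', axis=1)
print(merged)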
Example #18
def read_data_frame_indexed(fn, queries, query_cols=[], coords=True):
    ''' Read rows from the HDF5 data frame that match each tabix query in the
    queries list.  A tabix query is in the form ('chr1', 100, 200). query_cols
    is a list of columns you want to return. If coords is True, then it will
    return coordinates regardless of query_cols. If coords is False, it will
    only return the columns specified in query_cols. Returns a concatenated
    pandas DataFrame. WARNING: If the queries overlap in coordinates, the same
    region will appear more than once. In such cases, use
    read_data_frame_indexed_no_concat().'''

    dfs = read_data_frame_indexed_no_concat(fn, queries, query_cols, coords)

    if len(dfs) == 1:
        d = dfs[0]
    else:
        # Return the union of the queries
        d = p.concat(dfs)

    d.reset_index(drop=True, inplace=True)
    return d
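
A usage sketch with two tabix-style queries (the file path, regions, and column names are hypothetical):

queries = [('chr1', 100000, 200000), ('chr2', 500000, 600000)]
frags = read_data_frame_indexed('fragments.h5', queries,
                                query_cols=['bc', 'num_reads'], coords=True)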
Example #19
def merge_predictions(chunk_outs):
    join_pred_to_match = {}
    join_true_to_match = defaultdict(set)
    join_pred_df = None
    feasible_gt = None
    for ci, chunk in enumerate(chunk_outs):
        with open(re.sub('.json', '.pickle', chunk.summary)) as f:
            pred_to_match = cPickle.load(f)
            true_to_match = cPickle.load(f)
            pred_df, min_qv = cPickle.load(f)
        join_pred_df = pd.concat([join_pred_df, pred_df], ignore_index=True)
        for pred, matches in pred_to_match.iteritems():
            join_pred_to_match[pred] = matches
        for sv, matches in true_to_match.iteritems():
            join_true_to_match[sv] = join_true_to_match[sv].union(matches)
        if ci == 0 and os.path.exists(chunk.feasible_gt):
            feasible_gt = pd.read_csv(chunk.feasible_gt,
                                      header=0,
                                      index_col=None,
                                      sep='\t')
    return (join_pred_to_match, join_true_to_match, join_pred_df, feasible_gt,
            min_qv)
Example #20
def join(args, outs, chunk_defs, chunk_outs):
    join_df = None
    read_counts = {}
    read_counts['split'] = defaultdict(int)
    read_counts['pair'] = defaultdict(int)

    for chunk in chunk_outs:
        bedpe_df = tk_sv_io.read_sv_bedpe_to_df(chunk.sv_calls)
        join_df = pd.concat([join_df, bedpe_df], ignore_index = True)

        if not os.path.isfile(chunk.discordant_read_counts):
            continue
        with open(chunk.discordant_read_counts, 'r') as f:
            counts = json.load(f)
        for t, c in counts['split'].iteritems():
            read_counts['split'][t] += c
        for t, c in counts['pair'].iteritems():
            read_counts['pair'][t] += c

    join_df['name'] = [str(i) for i in np.arange(len(join_df))]
    tk_sv_io.write_sv_df_to_bedpe(join_df, outs.sv_calls)

    read_counts['split'] = dict(read_counts['split'])
    read_counts['pair'] = dict(read_counts['pair'])

    with open(args.basic_summary, 'r') as f:
        num_reads = float(json.load(f)['num_reads']) / 2.0

    read_counts['frac_split'] = {}
    read_counts['frac_pair'] = {}
    for t, c in read_counts['split'].iteritems():
        read_counts['frac_split'][t] = c / num_reads
    for t, c in read_counts['pair'].iteritems():
        read_counts['frac_pair'][t] = c / num_reads

    with open(outs.discordant_read_counts, 'w') as f:
        f.write(tenkit.safe_json.safe_jsonify(read_counts))
Example #21
def main(args, outs):
    sv_df = read_sv_bedpe_to_df(args.gt_variants)
    sv_df = get_dataframe_loc(sv_df, list(range(args.start_idx,
                                                args.stop_idx)))

    if not isfile(args.fragments) or not isfile(args.fragment_histogram) or not isfile(args.barcodes) \
            or not isfile(args.barcode_blacklist) or not isfile(args.coverage):
        sv_df['qual'] = 0
        if 'info' in sv_df.columns:
            info_strs = [s for s in sv_df['info']]
        else:
            info_strs = ['.' for i in range(len(sv_df))]
        for i in range(len(info_strs)):
            info_strs[i] = update_info(info_strs[i],
                                       ['BCOV', 'NBCS1', 'NBCS2', 'NOOV'],
                                       [0, 0, 0, 0])
        sv_df['info'] = info_strs
        sv_df['strand1'] = '.'
        sv_df['strand2'] = '.'
        sv_df['filters'] = '.'
        write_sv_df_to_bedpe(sv_df, outs.summary)
        martian.log_info(
            'One or more files needed for computing quality scores are missing.'
        )
        return

    input_bam = tk_bam.create_bam_infile(args.input)
    genome_size = np.sum(np.array(input_bam.lengths))
    input_bam.close()

    frag_file = args.fragments
    frag_hist_file = args.fragment_histogram
    barcode_file = args.barcodes
    barcode_blacklist_file = args.barcode_blacklist

    if args.targets is None:
        target_coverage = None
        corr_factor = 1.0
        min_frag_size = MIN_FRAG_SIZE_WGS
        min_reads_per_frag = MIN_READS_PER_FRAG_WGS
        link_distance = SV_FRAGMENT_LINK_DISTANCE_WGS
    else:
        target_regions = bed_to_region_map(args.targets, merge=True)
        target_coverage = region_cum_coverage_map(target_regions,
                                                  TARGET_COV_BIN)

        with open(args.coverage, 'r') as f:
            cov_sum = json.load(f)['target_info']
        if 'on_target_bases' in cov_sum:
            prob_off_target = 1 - cov_sum['on_target_bases'] / float(
                cov_sum['total_bases'])
        else:
            prob_off_target = 0.001

        corr_factor = off_target_amp_corr_factor(target_regions,
                                                 prob_off_target,
                                                 genome_size=genome_size)
        min_frag_size = MIN_FRAG_SIZE_TARGET
        min_reads_per_frag = MIN_READS_PER_FRAG_TARGET
        link_distance = SV_FRAGMENT_LINK_DISTANCE_TARGET

    frag_sizes, frag_counts, frag_prc, blacklist_barcodes, frag_filter_fun = get_frag_data(
        frag_hist_file,
        barcode_file,
        barcode_blacklist_file,
        nx=args.nx,
        min_frag_size=min_frag_size,
        min_reads_per_frag=min_reads_per_frag)

    min_sv_len = link_distance
    prob_store = {}

    required_cols = [
        'bc', 'bc_num_reads', 'bc_est_len', 'bc_mean_reads_per_fragment',
        'chrom', 'start_pos', 'end_pos', 'num_reads', 'obs_len', 'est_len'
    ]
    fragment_reader = tk_hdf5.DataFrameReader(frag_file)

    out_df = None

    for (chrom1, chrom2), g in sv_df.groupby(['chrom1', 'chrom2']):
        query1 = (chrom1, max(0,
                              np.min(g['start1']) - MAX_FRAG_SIZE),
                  np.max(g['stop1']) + MAX_FRAG_SIZE)
        filt = frag_filter_fun(*query1)
        frags1 = filt(
            fragment_reader.query(query1,
                                  query_cols=required_cols,
                                  id_column='fragment_id'))

        query2 = (chrom2, max(0,
                              np.min(g['start2']) - MAX_FRAG_SIZE),
                  np.max(g['stop2']) + MAX_FRAG_SIZE)
        filt = frag_filter_fun(*query2)
        frags2 = filt(
            fragment_reader.query(query2,
                                  query_cols=required_cols,
                                  id_column='fragment_id'))

        lrs = np.zeros((len(g), ), dtype=np.int)
        if 'info' in g.columns:
            info_strs = [s for s in g['info']]
        else:
            info_strs = ['.' for i in range(len(g))]

        for i, (s1, e1, s2, e2) in enumerate(
                zip(g['start1'], g['stop1'], g['start2'], g['stop2'])):
            if chrom1 == chrom2:
                middle = 0.5 * (e1 + s2)
                locus1 = (chrom1, max(0, s1 - args.extend_win),
                          min(middle, e1 + args.extend_win))
                locus2 = (chrom2, max(middle, s2 - args.extend_win),
                          e2 + args.extend_win)
            else:
                locus1 = (chrom1, max(0, s1 - args.extend_win),
                          e1 + args.extend_win)
                locus2 = (chrom2, max(0, s2 - args.extend_win),
                          e2 + args.extend_win)

            tmp_frags1 = overlap_frags(max(0, s1 - args.extend_win),
                                       e1 + args.extend_win, frags1)
            tmp_frags2 = overlap_frags(max(0, s2 - args.extend_win),
                                       e2 + args.extend_win, frags2)

            (lr, pb, nov, ndiscordant, new_start, new_stop,
             _) = get_lr_from_frag_df(frags1,
                                      frags2,
                                      locus1,
                                      locus2,
                                      prob_store,
                                      frag_sizes,
                                      frag_counts,
                                      blacklist_barcodes,
                                      target_coverage,
                                      corr_factor,
                                      genome_size=genome_size,
                                      min_dist=min_sv_len)
            lrs[i] = lr
            if new_start[0] is None or new_start[1] is None:
                new_start = (s1, e1)
            if new_stop[0] is None or new_stop[1] is None:
                new_stop = (s2, e2)
            info_strs[i] = update_info(
                info_strs[i],
                ['BCOV', 'NBCS1', 'NBCS2', 'NOOV', 'P_SV', 'START', 'STOP'], [
                    nov,
                    len(set(tmp_frags1.bc)),
                    len(set(tmp_frags2.bc)), ndiscordant, pb, '-'.join([
                        str(p) for p in new_start
                    ]), '-'.join([str(p) for p in new_stop])
                ])

        g_cp = g.copy()
        g_cp['qual'] = np.maximum(lrs, 0)
        g_cp['info'] = info_strs
        g_cp['strand1'] = '+'
        g_cp['strand2'] = '+'
        g_cp['filters'] = '.'
        out_df = pd.concat([out_df, g_cp])
    write_sv_df_to_bedpe(out_df, outs.summary)
Example #22
def join(args, outs, chunk_defs, chunk_outs):

    summary = {}
    # Compute high-level BC summary metrics
    # Load BC data
    if args.barcodes:
        bc_df = tenkit.hdf5.read_data_frame(args.barcodes)
        fragment_df = tenkit.hdf5.read_data_frame(args.fragments, query_cols=['bc', 'chrom', 'start_pos'])

        bc_df.sort('bc_num_reads', inplace=True)
        bc_df['cum_reads'] = np.cumsum(bc_df.bc_num_reads)

        # Measure coalescence rate on all BCs that could conceivably be used
        # to call SVs - i.e. ignore BCs that contribute the cumulative bottom 1% of reads
        n99_read_thresh = sum(bc_df.bc_num_reads) * 0.01
        n99_bcs = bc_df[bc_df.cum_reads > n99_read_thresh]
        martian.log_info("number of bcs to screen for coalescence: %d" % len(n99_bcs))
        martian.log_info("subsetting fragments to use")

        if len(n99_bcs) > 1:
            selected_frags = fragment_df[fragment_df.bc.isin(n99_bcs.bc)]
            del fragment_df
            martian.log_info("Doing coalescence calculation")
            coa_calc = coalescence.BcSimilarity(selected_frags, set(n99_bcs.bc), args.input)
            coa_bc_tbl = coa_calc.coalescence_analysis()

            # Also add barcodes that are extreme outliers in the number of fragments observed
            med_frags_per_bc = n99_bcs.bc_num_fragments.median()
            high_quantile = n99_bcs.bc_num_fragments.quantile(0.98)
            bc_num_fragments_threshold = max(med_frags_per_bc*5.0, high_quantile)

            med_reads_per_bc = n99_bcs.bc_num_reads.median()
            high_quantile = n99_bcs.bc_num_reads.quantile(0.98)
            bc_num_reads_threshold = max(med_reads_per_bc*5.0, high_quantile)

            overloaded_bcs = n99_bcs[(n99_bcs.bc_num_fragments > bc_num_fragments_threshold) | (n99_bcs.bc_num_reads > bc_num_reads_threshold)]
            summary['fract_bcs_overloaded'] = float(len(overloaded_bcs)) / len(n99_bcs)

            # Remove bcs that are already in the blacklist
            nr_overloaded_bcs = overloaded_bcs[~overloaded_bcs.bc.isin(coa_bc_tbl.bc)]

            # Add overloaded bcs to blacklist
            overloaded_bc_tbl = p.DataFrame({'bc': nr_overloaded_bcs.bc, 'cluster_id': -1, 'cluster_size': -1})

            # Write barcode blacklist
            bad_bc_tbl = p.concat([coa_bc_tbl, overloaded_bc_tbl])
            bad_bc_tbl.to_csv(outs.barcode_blacklist, sep="\t", index=False)

            # Compute coalescence stats
            summary['fract_bcs_in_clusters_all'] = float(len(coa_bc_tbl)) / len(n99_bcs)
            summary['fract_bcs_in_clusters_eq_2'] = float((coa_bc_tbl.cluster_size == 2).sum()) / len(n99_bcs)
            summary['fract_bcs_in_clusters_gt_2'] = float((coa_bc_tbl.cluster_size > 2).sum()) / len(n99_bcs)
            summary['num_clusters_gt_8'] = (coa_bc_tbl.cluster_size > 8).sum()

            # Compute stats ignoring clusters of Hamming distance 2
            hd2_clusters = []
            for cluster in coa_bc_tbl.groupby('cluster_id'):
                if all_within_hamming_distance(cluster[1].bc.values, 2):
                    hd2_clusters.append(cluster[0])

            coa_tbl_no_hd2 = coa_bc_tbl[~coa_bc_tbl.cluster_id.isin(hd2_clusters)]
            summary['fract_bcs_in_clusters_all_no_hd2'] = float(len(coa_tbl_no_hd2)) / len(n99_bcs)
            summary['fract_bcs_in_clusters_eq_2_no_hd2'] = float((coa_tbl_no_hd2.cluster_size == 2).sum()) / len(n99_bcs)
            summary['fract_bcs_in_clusters_gt_2_no_hd2'] = float((coa_tbl_no_hd2.cluster_size > 2).sum()) / len(n99_bcs)

        else:
            empty_df = p.DataFrame({'bc':[], 'cluster_id':[], 'cluster_size':[]})
            empty_df.to_csv(outs.barcode_blacklist, sep="\t", index=False)

            # null coalescence stats
            summary['fract_bcs_overloaded'] = None
            summary['fract_bcs_in_clusters_all'] = None
            summary['fract_bcs_in_clusters_eq_2'] = None
            summary['fract_bcs_in_clusters_gt_2'] = None
            summary['num_clusters_gt_8'] = None
            summary['fract_bcs_in_clusters_all_no_hd2'] = None
            summary['fract_bcs_in_clusters_eq_2_no_hd2'] = None
            summary['fract_bcs_in_clusters_gt_2_no_hd2'] = None

    else:
        outs.barcode_blacklist = None
        summary['fract_bcs_overloaded'] = None
        summary['fract_bcs_in_clusters_all'] = None
        summary['fract_bcs_in_clusters_eq_2'] = None
        summary['fract_bcs_in_clusters_gt_2'] = None
        summary['num_clusters_gt_8'] = None
        summary['fract_bcs_in_clusters_all_no_hd2'] = None
        summary['fract_bcs_in_clusters_eq_2_no_hd2'] = None
        summary['fract_bcs_in_clusters_gt_2_no_hd2'] = None


    # Write summary to json
    with open(outs.filter_barcodes_results, 'w') as results_file:
        tenkit.safe_json.dump_numpy(summary, results_file, pretty=True)
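
The n99 screen near the top of the barcoded branch (drop barcodes whose reads fall in the cumulative bottom 1%) can be traced on toy counts; the counts are made up and already ascending, so the sort step is omitted:

import numpy as np
import pandas as pd

bc_df = pd.DataFrame({'bc': list('abcdef'),
                      'bc_num_reads': [1, 2, 3, 100, 200, 300]})
bc_df['cum_reads'] = np.cumsum(bc_df.bc_num_reads)
n99_read_thresh = sum(bc_df.bc_num_reads) * 0.01     # 6.06
n99_bcs = bc_df[bc_df.cum_reads > n99_read_thresh]
print(list(n99_bcs.bc))   # ['d', 'e', 'f']: a, b and c together hold under 1% of reads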
Example #23
def join(args, outs, chunk_defs, chunk_outs):
    out_bedpe = None
    for c in chunk_outs:
        in_bedpe = read_sv_bedpe_to_df(c.summary)
        out_bedpe = pd.concat([out_bedpe, in_bedpe], ignore_index=True)
    write_sv_df_to_bedpe(out_bedpe, outs.summary)
Example #24
def call_distal(sv_model, c1, s1, e1, c2, s2, e2, in_bam,
                frag_phasing, blacklist_barcodes, args):
    """Evaluate a pair of loci (c1, s1, e1), (c2, s2, e2) for distal SV calls.

    Return value: a list of SvCall objects.
    """

    # Set the random seed so if we run this for the same locus multiple times we
    # always get the same answer.
    seed(1)

    # Fetch reads from each of the two loci
    reads1 = get_reads(in_bam, c1, max(0, s1 - FRAG_EXTEND), e1 + FRAG_EXTEND,
                       min_mapq=args.min_mapq, max_reads=MAX_READS_TO_READ,
                       blacklist_barcodes=blacklist_barcodes)
    reads1 = reads1.groupby('bc').filter(lambda x: len(x) <= MAX_READS_PER_FRAG and \
                                         len(x) >= MIN_READS_PER_FRAG / 2 and \
                                         np.max(x.pos) - np.min(x.pos) > MIN_FRAG_SIZE / 2)

    reads2 = get_reads(in_bam, c2, max(0, s2 - FRAG_EXTEND), e2 + FRAG_EXTEND,
                       min_mapq=args.min_mapq, max_reads=MAX_READS_TO_READ,
                       blacklist_barcodes=blacklist_barcodes)
    reads2 = reads2.groupby('bc').filter(lambda x: len(x) <= MAX_READS_PER_FRAG and \
                                         len(x) >= MIN_READS_PER_FRAG / 2 and \
                                         np.max(x.pos) - np.min(x.pos) > MIN_FRAG_SIZE / 2)

    common_bcs = (set(reads1.bc).intersection(set(reads2.bc)))
    if len(common_bcs) < args.min_bcs:
        return None

    common_reads1 = reads1[np.array([bc in common_bcs for bc in reads1.bc], dtype=np.bool)]
    common_reads2 = reads2[np.array([bc in common_bcs for bc in reads2.bc], dtype=np.bool)]

    # Split each of the loci into small windows and count how many
    # molecules end or start in each region of the resulting grid.
    ticks1 = np.arange(np.min(common_reads1.pos), np.max(common_reads1.pos), args.grid_len)
    ticks2 = np.arange(np.min(common_reads2.pos), np.max(common_reads2.pos), args.grid_len)
    tick_counts = np.zeros((len(ticks1), len(ticks2)), dtype=np.int)
    nticks = len(ticks1) * len(ticks2)
    # barcode x cell, 1 if there is barcode overlap in the cell for that barcode
    bc_tick_counts = np.zeros((len(common_bcs), nticks), dtype=np.bool)

    common_reads1 = common_reads1.groupby('bc')
    common_reads2 = common_reads2.groupby('bc')

    common_bcs = list(common_bcs)
    for i, bc in enumerate(common_bcs):
        r1 = common_reads1.get_group(bc)
        r2 = common_reads2.get_group(bc)
        x = (np.array([np.min(r1.pos), np.max(r1.pos)], dtype=np.int) - ticks1[0]) / args.grid_len
        y = (np.array([np.min(r2.pos), np.max(r2.pos)], dtype=np.int) - ticks2[0]) / args.grid_len
        x = x[np.logical_and(x >= 0, x < len(ticks1))]
        y = y[np.logical_and(y >= 0, y < len(ticks2))]
        vals = list(set(product(x, y)))
        a = np.array([v[0] for v in vals], dtype=np.int)
        b = np.array([v[1] for v in vals], dtype=np.int)
        tick_counts[a, b] += 1
        bc_tick_counts[i, np.ravel_multi_index((a, b), (len(ticks1), len(ticks2)))] = True

    rounds = 0
    final_res = []
    ps1 = sv_utils.get_phase_set(frag_phasing, c1, s1, e1)
    ps2 = sv_utils.get_phase_set(frag_phasing, c2, s2, e2)

    # Get the cell of the grid with the maximum barcode overlap, get the
    # barcodes/molecules involved, figure out the direction of the signal,
    # get a better estimate of the breakpoints, and compute the SV score at
    # these breakpoints. Then, subtract the contribution of the involved
    # barcodes from the barcode overlap grid and repeat.
    # In practice, we only do it once.
    while np.max(tick_counts) > 2 and len(common_reads1) > 2 and len(common_reads2) > 2 and rounds < 1:
        rounds += 1

        # Get region of maximum barcode overlap. m is flat index
        m = np.argmax(tick_counts)
        x, y = np.unravel_index(m, (len(ticks1), len(ticks2)))
        new_start1 = (x * args.grid_len + ticks1[0])
        new_start2 = (y * args.grid_len + ticks2[0])

        # Get the fragments overlapping the region of max barcode overlap
        common_reads1 = common_reads1.filter(lambda x: not(np.max(x.pos) < new_start1 - 5000 or
                                                           np.min(x.pos) > new_start1 + 5000))
        common_reads2 = common_reads2.filter(lambda x: not(np.max(x.pos) < new_start2 - 5000 or
                                                           np.min(x.pos) > new_start2 + 5000))
        # Get fragments with common barcodes that overlap the selected cell
        # This is a superset of np.where(bc_tick_counts[:, m])[0] because it doesn't require that
        # the fragment starts or ends within the cell.
        new_common_bcs = set(common_reads1.bc).intersection(set(common_reads2.bc))

        # Infer orientation based on ends of fragments
        sv_types, cand_breaks1, cand_breaks2 = get_orient(common_reads1.groupby('bc'),
                                                          common_reads2.groupby('bc'),
                                                          new_start1, new_start2,
                                                          new_common_bcs,
                                                          max_cand_breaks=args.max_cand_breaks)
        if len(cand_breaks1) == 0 or len(cand_breaks2) == 0 or len(sv_types) == 0:
            return None

        svt = sv_types[0]

        # Use the inferred orientation of the signal to remove barcodes that are
        # clearly irrelevant.
        tmp_read_groups1 = reads1.groupby('bc').filter(filter_irrelevant_frags1(svt, cand_breaks1))
        tmp_read_groups1['break'] = 1

        tmp_read_groups2 = reads2.groupby('bc').filter(filter_irrelevant_frags2(svt, cand_breaks2))
        tmp_read_groups2['break'] = 2

        read_groups = pd.concat([tmp_read_groups1, tmp_read_groups2], ignore_index=True)

        if len(read_groups) == 0:
            return None

        sel_bcs = set(read_groups.bc)
        if len(sel_bcs) > MAX_READ_GROUPS:
            sel_bcs = list(sel_bcs)
            sel_bcs = set([sel_bcs[i] for i in choice(len(sel_bcs), MAX_READ_GROUPS, replace=False)])

        read_groups = read_groups[[_b in sel_bcs for _b in read_groups.bc]]

        read_groups.sort('bc', inplace=True)
        bc_set = set(read_groups.bc)
        read_groups = read_groups.groupby('bc')

        bc_phase_set_dict1 = sv_utils.get_barcode_phase_probs(frag_phasing, c1, cand_breaks1[0], cand_breaks1[-1], bc_set, in_ps=ps1)
        bc_phase_set_dict2 = sv_utils.get_barcode_phase_probs(frag_phasing, c2, cand_breaks2[0], cand_breaks2[-1], bc_set, in_ps=ps2)

        loci_pairs = [(_a, _b) for _a, _b in product(cand_breaks1, cand_breaks2)]

        read_group_dict = {}
        for bc, read_group in read_groups:
            read_group_dict[bc] = tk_sv_read_model.ReadInfo(read_group.chrom, read_group.pos, group_ids=read_group['break'])

        res = sv_model.em_it_away(loci_pairs, read_group_dict, ps1, ps2,
                                  bc_phase_set_dict1, bc_phase_set_dict2,
                                  em_iters=5, proximal=False)

        (max_lrs, max_locus, sv_type, zygosity, max_hap, support, posterior_probs) = res
        sv_call = tk_sv_call.SvCall.from_em_results(c1, c2, ps1, ps2, max_lrs,
                                                    max_locus, sv_type, zygosity, max_hap,
                                                    support, posterior_probs)
        final_res.append(sv_call)

    return final_res
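
The "region of maximum barcode overlap" step reduces to an argmax over a 2-D count matrix; a minimal sketch of the flat-index/unravel_index pattern used above, on a toy matrix:

import numpy as np

tick_counts = np.array([[0, 1, 0],
                        [2, 5, 1]])                # toy barcode-overlap grid
m = np.argmax(tick_counts)                         # flat index of the hottest cell
x, y = np.unravel_index(m, tick_counts.shape)      # back to (row, col) = (1, 1)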
Example #25
def join(args, outs, chunk_defs, chunk_outs):
    bam_in = tk_bam.create_bam_infile(args.input)
    # Combine fragment h5 files
    in_files = [
        out.fragments for out in chunk_outs
        if out.fragments and os.path.exists(out.fragments)
    ]
    nfrags = 0

    chrom_partition = partition_chroms(bam_in.references, bam_in.lengths)

    if len(in_files) > 0:
        readers = [tenkit.hdf5.DataFrameReader(f) for f in in_files]

        for chrom_set in chrom_partition:
            chunks = [r.query_chroms(chrom_set) for r in readers]
            chunk = p.concat(chunks)
            chunk.sort(['chrom', 'start_pos', 'bc'], inplace=True)

            # Always save the BC as categorical
            chunk['bc'] = chunk['bc'].astype('category')

            chunk['molecule_id'] = np.arange(len(chunk),
                                             dtype=np.int32) + nfrags
            nfrags += len(chunk)
            tenkit.hdf5.append_data_frame(outs.fragments, chunk)

        for r in readers:
            r.close()

        tenkit.hdf5.create_tabix_index(outs.fragments, 'chrom', 'start_pos',
                                       'end_pos')

    else:
        outs.fragments = None

    # Combine BC h5 files
    in_files = [
        out.barcodes for out in chunk_outs
        if out.barcodes and os.path.exists(out.barcodes)
    ]
    if len(in_files) > 0:
        tenkit.hdf5.combine_data_frame_files(outs.barcodes, in_files)
    else:
        outs.barcodes = None

    summary = {}

    # Compute high-level BC summary metrics
    # Load BC data
    if outs.barcodes:
        bc_df = tenkit.hdf5.read_data_frame(outs.barcodes)
        fragment_df = tenkit.hdf5.read_data_frame(
            outs.fragments,
            query_cols=['bc', 'num_reads', 'est_len', 'chrom', 'start_pos'])

        bc_df.sort('bc_num_reads', inplace=True)

        # bin the bc counts and write a json histogram file
        n_reads = bc_df.bc_num_reads.values
        max_val = np.percentile(n_reads, 99.99) * 1.3
        min_val = n_reads.min()
        num_bins = 400
        step = math.ceil((max_val - min_val) / num_bins)
        bins = np.arange(min_val, max_val, step)
        (hist, edges) = np.histogram(n_reads, bins=bins)
        bc_count_hist = {int(edges[i]): hist[i] for i in range(len(bins) - 1)}

        # Summarize properties of n50 and n90 BC set
        bc_df['cum_reads'] = np.cumsum(bc_df.bc_num_reads)
        n50_read_thresh = sum(bc_df.bc_num_reads) * 0.5
        n50_bcs = bc_df[bc_df.cum_reads > n50_read_thresh]
        n50_fra = fragment_df[fragment_df.bc.isin(n50_bcs.bc)]
        n50_stats = high_level_stats("n50", n50_fra, n50_bcs)
        del n50_fra

        n90_read_thresh = sum(bc_df.bc_num_reads) * 0.1
        n90_bcs = bc_df[bc_df.cum_reads > n90_read_thresh]
        n90_fra = fragment_df[fragment_df.bc.isin(n90_bcs.bc)]
        n90_stats = high_level_stats("n90", n90_fra, n90_bcs)
        del n90_fra

        for (k, v) in n50_stats.iteritems():
            summary[k] = v

        for (k, v) in n90_stats.iteritems():
            summary[k] = v

        # Generate a fragment length histogram
        fragment_df['len_bin'] = np.floor_divide(
            fragment_df.est_len.values,
            FRAG_LEN_HIST_BIN_SIZE).astype(int) * FRAG_LEN_HIST_BIN_SIZE

        multi_read_frags = fragment_df[fragment_df.num_reads > 1]
        len_bins = multi_read_frags.groupby(['len_bin']).apply(len)
        del multi_read_frags

        len_hist = {k: v for (k, v) in len_bins.iteritems()}

        # Write fragment length hist to json
        with open(outs.fragment_size, 'w') as fragment_size_file:
            tenkit.safe_json.dump_numpy(len_hist, fragment_size_file)

        # Estimate total DNA per partition by looking at hottest 1000 GEMs or GEMs w/ bc_mean_reads_per_fragment > 2, whichever is fewer
        hot_bcs = bc_df[np.logical_and(bc_df.bc_mean_reads_per_fragment > 2.0,
                                       bc_df.bc_num_reads > 25)]
        hot_bcs.sort('bc_mean_reads_per_fragment', inplace=True)
        if len(hot_bcs) > 50:
            hot_bcs = hot_bcs[-NUM_BCS_LOADING_ESTIMATE:]
            summary['estimated_dna_per_partition'] = round(
                scipy.stats.tmean(
                    hot_bcs.bc_est_len,
                    scipy.percentile(hot_bcs.bc_est_len, (1, 99))))
        else:
            summary['estimated_dna_per_partition'] = None

        # Read-based effective diversity
        reads = bc_df.bc_num_reads.values
        sum_sq = (reads**2.0).sum()
        effective_diversity = tk_stats.robust_divide((reads.sum()**2.0),
                                                     float(sum_sq))
        summary['effective_diversity_reads'] = effective_diversity

        # Fragment-based effective diversity
        fragments = bc_df.bc_num_fragments.values
        sum_sq = (fragments**2.0).sum()
        effective_diversity = tk_stats.robust_divide((fragments.sum()**2.0),
                                                     float(sum_sq))
        summary['effective_diversity_fragments'] = effective_diversity

    else:
        # No fragment_size file emitted
        outs.fragment_size = None

        n50_stats = high_level_stats("n50", None, None)
        n90_stats = high_level_stats("n90", None, None)

        for (k, v) in n50_stats.iteritems():
            summary[k] = v

        for (k, v) in n90_stats.iteritems():
            summary[k] = v

        bc_count_hist = {}

        summary['estimated_dna_per_partition'] = None
        summary['effective_diversity_reads'] = None
        summary['effective_diversity_fragments'] = None

    with open(outs.barcode_histogram, 'w') as barcode_hist_file:
        tenkit.safe_json.dump_numpy(bc_count_hist, barcode_hist_file)

    # Write summary to json
    with open(outs.single_partition, 'w') as summary_file:
        tenkit.safe_json.dump_numpy(summary, summary_file, pretty=True)
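
The effective-diversity metric computed above is the inverse Simpson index of the per-barcode read counts, (sum r)^2 / sum r^2; a small worked sketch on toy counts:

import numpy as np

reads = np.array([100.0, 100.0, 100.0, 10.0])          # toy per-barcode read counts
eff_div = reads.sum() ** 2.0 / (reads ** 2.0).sum()
print(eff_div)   # ~3.19: three equally loaded barcodes plus one minor contributor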
Example #26
def main(args, outs):
    pred_df = tk_sv_io.read_sv_bedpe_to_df(args.sv_calls)
    pred_df = tk_sv_utils.get_dataframe_loc(
        pred_df, list(range(args.start_idx, args.stop_idx)))

    in_bam = tk_bam.create_bam_infile(args.possorted_bam)

    cov_reader = tk_hdf5.DataFrameReader(args.hap_coverage)
    sel_cols = ['cov_q30_hap0', 'cov_q30_hap1', 'cov_q30_hap2']

    has_pileups = np.zeros((len(pred_df), ), dtype=np.bool)

    for i, (_, row) in enumerate(pred_df.iterrows()):
        has_clipped1 = has_too_many_clipped(
            in_bam,
            row.chrom1,
            max(0, row.start1 - BREAK_EXT),
            row.stop1 + BREAK_EXT,
            max_clipped_frac=args.max_clipped_frac)
        has_clipped2 = has_too_many_clipped(
            in_bam,
            row.chrom2,
            max(0, row.start2 - BREAK_EXT),
            row.stop2 + BREAK_EXT,
            max_clipped_frac=args.max_clipped_frac)
        has_clipped = has_clipped1 and has_clipped2

        if row.chrom1 != row.chrom2 or row.start2 - row.stop1 > MAX_FRAG_SIZE:
            has_pileups[i] = has_clipped
            continue

        cov = cov_reader.query(
            (row.chrom1, max(0,
                             row.start1 - BREAK_EXT), row.stop2 + BREAK_EXT))
        cov['bin'] = np.array(cov['pos'] / BIN_WIN, dtype=np.int)
        if not 'coverage_deduped' in cov.columns:
            cov['coverage_deduped'] = cov[sel_cols].sum(axis=1)
        cov_arr = np.array(cov.groupby('bin').mean()['coverage_deduped'])
        median_cov = np.median(cov_arr)

        # Rescue for deletions or duplications with breakpoints on the pileups
        sv_len = row.stop2 - row.start1
        side_cov = cov_reader.query(
            (row.chrom1, max(0, row.start1 - BREAK_EXT - sv_len / 2),
             row.start1 - BREAK_EXT))
        side_cov = pd.concat([
            side_cov,
            cov_reader.query((row.chrom2, row.stop2 + BREAK_EXT,
                              row.stop2 + BREAK_EXT + sv_len / 2))
        ],
                             ignore_index=True)
        if not 'coverage_deduped' in side_cov.columns:
            side_cov['coverage_deduped'] = side_cov[sel_cols].sum(axis=1)

        # Skip the pileup check; there is enough evidence for a large-scale copy number variant
        if np.median(cov.coverage_deduped) < DEL_REL_COV * np.median(
                side_cov.coverage_deduped):
            continue
        if np.median(cov.coverage_deduped) > DUP_REL_COV * np.median(
                side_cov.coverage_deduped):
            continue

        # Filter out the call if there are pileups very close to the breakpoints
        has_pileups[i] = len(cov_arr) > 4 and np.any(
            cov_arr[[0, 1, -2, -1]] > args.min_rel_depth * median_cov)
        has_pileups[i] = has_pileups[i] or has_clipped

    pileups = pred_df[has_pileups]
    pred_df = pred_df[np.logical_not(has_pileups)]

    tk_sv_io.write_sv_df_to_bedpe(pred_df, outs.sv_calls)
    tk_sv_io.write_sv_df_to_bedpe(pileups, outs.pileups)
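
The pileup test boils down to comparing per-bin mean coverage against the median bin; a toy sketch of the binning step (BIN_WIN here is a stand-in for the real constant):

import numpy as np
import pandas as pd

BIN_WIN = 100
cov = pd.DataFrame({'pos': np.arange(0, 400),
                    'coverage_deduped': np.r_[np.full(300, 30.0), np.full(100, 90.0)]})
cov['bin'] = np.array(cov['pos'] / BIN_WIN, dtype=int)
cov_arr = np.array(cov.groupby('bin').mean()['coverage_deduped'])
print(cov_arr)             # [30. 30. 30. 90.] -- the last bin looks like a pileup
print(np.median(cov_arr))  # 30.0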
Example #27
def main(args, outs):
    reader = tk_hdf5.DataFrameReader(args.hap_coverage)
    sel_cols = ['cov_q30_hap0', 'cov_q30_hap1', 'cov_q30_hap2']
    ext_cols = list(sel_cols)
    ext_cols.append('total_cov')

    out_loci = []
    summary_df = None
    for (chrom, start, stop) in (tk_io.get_locus_info(l) for l in args.loci):
        cov = reader.query((chrom, start, stop))
        cov['bin'] = np.array(cov['pos'] / args.bin_size, dtype=np.int)
        cov['total_cov'] = cov[sel_cols].sum(axis=1)
        mean_cov = np.mean(cov['total_cov'])
        summary_df = pd.concat([
            summary_df,
            pd.DataFrame(
                {
                    'chrom': chrom,
                    'start': start,
                    'stop': stop,
                    'mean_cov': mean_cov
                },
                index=[0])
        ],
                               ignore_index=True)
        # Remove very small phase sets. These tend to be single-SNP phase sets
        # and can result from erroneous SNPs.
        cov = cov.groupby('phase_set').filter(lambda x: len(x) > 1000)
        sum_df = cov.groupby(['bin',
                              'phase_set'])[ext_cols].mean().reset_index()
        sum_df['low'] = sum_df.total_cov < 0.8 * mean_cov
        sum_df['low_hap0'] = np.logical_and(
            sum_df.total_cov < mean_cov,
            sum_df.cov_q30_hap0 < 0.8 * sum_df.cov_q30_hap1)
        sum_df['low_hap1'] = np.logical_and(
            sum_df.total_cov < mean_cov,
            sum_df.cov_q30_hap1 < 0.8 * sum_df.cov_q30_hap0)

        if not sum_df.empty:
            any_low = np.logical_or(
                sum_df.low, np.logical_or(sum_df.low_hap1, sum_df.low_hap0))

            bins = np.array(sum_df['bin'])
            bins = np.concatenate([bins, [np.max(bins) + 1]])
            pos = 0
            # Get runs of 0s and 1s in any_low
            for bit, group in groupby(any_low):
                group_size = len(list(group))
                group_start = bins[pos] * args.bin_size
                group_stop = bins[pos + group_size] * args.bin_size
                region_len = group_stop - group_start
                if bit and region_len >= args.min_len:
                    out_loci.append((chrom, max(0,
                                                group_start - args.bin_size),
                                     group_start + args.bin_size, chrom,
                                     max(0, group_stop - args.bin_size),
                                     group_stop + args.bin_size))
                pos += group_size

    with open(outs.loci, 'w') as f:
        cPickle.dump(out_loci, f)

    summary_df.to_csv(outs.cov_summary, sep='\t', header=True, index=False)
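
The low-coverage run detection at the end relies on itertools.groupby collapsing consecutive equal values in any_low; a minimal sketch of that run-length pattern on a toy flag vector:

from itertools import groupby

flags = [False, True, True, True, False, True]
pos = 0
for bit, group in groupby(flags):
    size = len(list(group))
    if bit:
        print('run of %d low bins starting at bin %d' % (size, pos))
    pos += size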