Example #1
def join(args, outs, chunk_defs, chunk_outs):
    join_df = None
    for chunk in chunk_outs:
        df = sv_io.read_sv_bedpe_to_df(chunk.sv_calls)
        join_df = pd.concat([join_df, df], ignore_index = True)

    sv_io.write_sv_df_to_bedpe(join_df, outs.sv_calls)
Example #2
def split(args):
    sv_df = tk_sv_io.read_sv_bedpe_to_df(args.variants)
    gt_df = tk_sv_io.read_sv_bedpe_to_df(args.gt_variants)
    tk_sv_io.check_sv_names(gt_df)

    sv_df["name"] = ["call_%d" % idx for idx in range(len(sv_df))]

    variants_bedpe = os.path.join(os.getcwd(), "variants.bedpe")
    tk_sv_io.write_sv_df_to_bedpe(sv_df, variants_bedpe)

    nsvs = sv_df.shape[0]
    nbreaks_per_chunk = max(100,
                            int(np.ceil(nsvs / 32.0)))  # avoid overchunking
    nchunks = int(np.ceil(nsvs / float(nbreaks_per_chunk)))
    chunk_defs = []

    for i in range(nchunks):
        chunk_start = i * nbreaks_per_chunk
        chunk_end = min(nsvs, (i + 1) * nbreaks_per_chunk)
        chunk_defs.append({
            'renamed_variants': variants_bedpe,
            'start_idx': chunk_start,
            'stop_idx': chunk_end,
            '__mem_gb': 12
        })

    if len(chunk_defs) == 0:
        chunk_defs = [{
            'renamed_variants': variants_bedpe,
            'start_idx': 0,
            'stop_idx': 0,
            '__mem_gb': 12
        }]

    return {'chunks': chunk_defs, 'join': {'__mem_gb': 16}}
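The chunking arithmetic above caps the number of chunks at roughly 32 while keeping at least 100 SV calls per chunk. A minimal, self-contained sketch of that calculation (the helper name chunk_bounds and the example numbers are illustrative, not part of the pipeline):

import numpy as np

def chunk_bounds(nsvs, max_chunks=32, min_per_chunk=100):
    # Return (start, stop) index pairs covering nsvs items, using at least
    # min_per_chunk items per chunk so the chunk count stays at or below
    # roughly max_chunks ("avoid overchunking").
    per_chunk = max(min_per_chunk, int(np.ceil(nsvs / float(max_chunks))))
    nchunks = int(np.ceil(nsvs / float(per_chunk)))
    return [(i * per_chunk, min(nsvs, (i + 1) * per_chunk))
            for i in range(nchunks)]

print(chunk_bounds(250))  # [(0, 100), (100, 200), (200, 250)]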
Example #3
def join(args, outs, chunk_defs, chunk_outs):
    out_df = None
    for chunk in chunk_outs:
        tmp_df = tk_sv_io.read_sv_bedpe_to_df(chunk.del_candidates)
        out_df = pd.concat([out_df, tmp_df], ignore_index=True)

    out_df['name'] = np.arange(len(out_df))
    tk_sv_io.write_sv_df_to_bedpe(out_df, outs.del_candidates)
Example #4
def join(args, outs, chunk_defs, chunk_outs):
    join_df = None
    for chunk in chunk_outs:
        bedpe_df = tk_sv_io.read_sv_bedpe_to_df(chunk.sv_variants)
        join_df = pd.concat([join_df, bedpe_df], ignore_index=True)

    if not args.best_only:
        join_df['name'] = np.arange(len(join_df))

    tk_sv_io.write_sv_df_to_bedpe(join_df, outs.sv_variants)
Example #5
def join(args, outs, chunk_defs, chunk_outs):
    out_calls = None
    out_pileups = None
    for c in chunk_outs:
        if not os.path.isfile(c.sv_calls):
            continue
        calls = tk_sv_io.read_sv_bedpe_to_df(c.sv_calls)
        pileups = tk_sv_io.read_sv_bedpe_to_df(c.pileups)
        out_calls = pd.concat([out_calls, calls], ignore_index=True)
        out_pileups = pd.concat([out_pileups, pileups], ignore_index=True)

    tk_sv_io.write_sv_df_to_bedpe(out_calls, outs.sv_calls)
    tk_sv_io.write_sv_df_to_bedpe(out_pileups, outs.pileups)
Example #6
def main(args, outs):

    rust_env = os.environ.copy()
    rust_env["RUST_BACKTRACE"] = "1"

    if args.chunk_bed is None:
        sv_io.write_sv_df_to_bedpe(None, outs.sv_calls)
        return

    # Run PVC
    fasta = tenkit.reference.get_fasta(args.reference_path)
    pvc_args = ['pvc', '--min-kmer-obs', str(args.min_kmer_obs), '--coverage-json', args.coverage_json]
    pvc_args.extend(["call-bed", "-o", outs.sv_calls, fasta, args.possorted_bam, args.chunk_bed])
    subprocess.check_call(pvc_args, env=rust_env)
Example #7
def join(args, outs, chunk_defs, chunk_outs):
    join_df = None
    non_pass_join_df = None
    for chunk in chunk_outs:
        df = tk_sv_io.read_sv_bedpe_to_df(chunk.sv_calls)
        non_pass_df = tk_sv_io.read_sv_bedpe_to_df(chunk.non_pass_sv_calls)
        join_df = pd.concat([join_df, df], ignore_index=True)
        non_pass_join_df = pd.concat([non_pass_join_df, non_pass_df],
                                     ignore_index=True)

    join_df['name'] = np.arange(len(join_df))
    tk_sv_io.write_sv_df_to_bedpe(join_df, outs.sv_calls)
    non_pass_join_df['name'] = np.arange(len(join_df),
                                         len(join_df) + len(non_pass_join_df))
    tk_sv_io.write_sv_df_to_bedpe(non_pass_join_df, outs.non_pass_sv_calls)
Example #8
def join(args, outs, chunk_defs, chunk_outs):
    out_bedpe = None
    for c in chunk_outs:
        if not os.path.isfile(c.sv_variants):
            continue
        in_bedpe = sv_io.read_sv_bedpe_to_df(c.sv_variants)
        if not in_bedpe is None:
            out_bedpe = pd.concat([out_bedpe, in_bedpe], ignore_index=True)

    if not out_bedpe is None:
        out_bedpe['name'] = np.arange(len(out_bedpe))
    sv_io.write_sv_df_to_bedpe(out_bedpe, outs.sv_variants)

    if chunk_outs[0] is not None and os.path.exists(chunk_outs[0].summary):
        shutil.copyfile(chunk_outs[0].summary, outs.summary)
    else:
        outs.summary = None
Example #9
def main(args, outs):
    callsets = []

    if args.calls1 is not None:
        c = sv_io.read_sv_bedpe_to_df(args.calls1)
        callsets.append(c)

    if args.calls2 is not None:
        c = sv_io.read_sv_bedpe_to_df(args.calls2)
        callsets.append(c)

    if args.calls3 is not None:
        c = sv_io.read_sv_bedpe_to_df(args.calls3)
        callsets.append(c)

    # Merge overlapping calls across the input callsets, keeping the widest call
    merged = merge_overlapping(callsets, select_widest())
    sv_io.write_sv_df_to_bedpe(merged, outs.merged)
Example #10
def join(args, outs, chunk_defs, chunk_outs):
    out_bedpe = None
    for c in chunk_outs:
        if not os.path.isfile(c.sv_variants):
            continue
        in_bedpe = tk_sv_io.read_sv_bedpe_to_df(c.sv_variants)
        if not in_bedpe is None:
            out_bedpe = pd.concat([out_bedpe, in_bedpe], ignore_index=True)

    if out_bedpe is None:
        col_names = ['chrom1', 'start1', 'stop1',
                     'chrom2', 'start2', 'stop2', 'name', 'qual',
                     'strand1', 'strand2', 'filters', 'info']
        out_bedpe = pd.DataFrame(columns=col_names)
    out_bedpe['name'] = np.arange(len(out_bedpe))

    out_bedpe = out_bedpe[out_bedpe.qual >= args.sv_min_qv]
    tk_sv_io.write_sv_df_to_bedpe(out_bedpe, outs.sv_variants)
Example #11
def main(args, outs):
    if args.chrom is None or len(args.starts) == 0 or args.barcode_whitelist is None:
        tk_sv_io.write_sv_df_to_bedpe(None, outs.sv_calls)
        return

    max_insert, ins_logsf_fun = tk_sv_utils.get_insert_size_info(args.insert_sizes, MAX_INSERT_SIZE_PRC)
    if max_insert is None:
        martian.throw('No Q60 reads')

    # This is slightly bigger than the maximum "normal" insert
    min_call_insert, _ = tk_sv_utils.get_insert_size_info(args.insert_sizes, MIN_SV_INSERT_SIZE_PRC)
    min_sv_len = max(args.min_sv_len, min_call_insert)
    martian.log_info('Setting min_sv_len to {}'.format(min_sv_len))
    
    with open(args.basic_summary, 'r') as f:
        summary = json.load(f)

    chimera_rate_del = summary['far_chimera_rate']
    chimera_rate_inv = summary['far_chimera_rate'] + summary['same_dir_chimera_rate']
    chimera_rate_dup = summary['far_chimera_rate'] + summary['outward_dir_chimera_rate']
    chimera_rates = {tk_readpairs.DEL_STR:chimera_rate_del,
                     tk_readpairs.INV_STR:chimera_rate_inv,
                     tk_readpairs.TDUP_STR:chimera_rate_dup,
                     tk_readpairs.TRANS_STR:summary['far_chimera_rate']}

    df, read_counts, _ = tk_readpairs.get_discordant_loci(args.possorted_bam, chrom = str(args.chrom),
                                                          starts = args.starts, stops = args.stops,
                                                          min_mapq = args.min_mapq, min_insert = 0,
                                                          max_insert = max_insert,
                                                          max_merge_range = args.merge_range_factor * max_insert,
                                                          min_sv_len = min_sv_len, max_sv_len = args.max_sv_len,
                                                          ins_logsf_fun = ins_logsf_fun,
                                                          min_lr_to_call = args.min_lr_to_call,
                                                          min_reads_to_call = args.min_reads_to_call,
                                                          chimera_rate = chimera_rates, reads_as_qual = True)

    # Need to convert to dict because defaultdict doesn't get pickled properly
    read_counts['split'] = dict(read_counts['split'])
    read_counts['pair'] = dict(read_counts['pair'])
    tk_sv_io.write_sv_df_to_bedpe(df, outs.sv_calls)
    with open(outs.discordant_read_counts, 'w') as f:
        f.write(tenkit.safe_json.safe_jsonify(read_counts))
Example #12
def main(args, outs):

    if args.barcode_whitelist is None:
        # write empty dataframe
        tk_sv_io.write_sv_df_to_bedpe(None, outs.del_candidates)
        martian.log_info(
            'Data seem un-barcoded. No deletion candidates will be computed.')
        return

    if True:
        '''
          pvc [options] call-one <fasta> <bam> <locus>
          pvc [options] call-bed -o <out> <fasta> <bam> <bed> [<which>]
          pvc [options] bam-svs <out> <bam>
          pvc [options] cands <bam> <locus> <out>
          pvc (-h | --help)
          pvc --version

        Options:
          --min-size=<m>       Minimum event size
          --min-kmer-obs=<k>   Minimum number of kmer observations
          -h --help            Show this screen.
          --version            Show version.
          --trace              Trace logging
          -d --debug           Debug logging
        '''

        for locus in args.loci:
            tmp_file = "tmp.bedpe"

            min_detect_size = 25
            pvc_args = [
                'pvc',
                '--het-read-prob=%f' % args.het_read_prob,
                '--min-size=%d' % min_detect_size, 'cands', args.possorted_bam,
                locus, tmp_file
            ]
            subprocess.check_call(pvc_args)
            subprocess.check_call('cat %s >> %s' %
                                  (tmp_file, outs.del_candidates),
                                  shell=True)
Example #13
def main(args, outs):

    sv_df = tk_sv_io.read_sv_bedpe_to_df(args.sv_variants)
    sv_df["info2"] = "SV"

    cnv_df = tk_sv_io.read_sv_bedpe_to_df(args.cnv_variants)
    cnv_df["info2"] = "CNV"

    sv_df = pd.concat([sv_df, cnv_df], ignore_index=True)
    sv_df['name'] = np.arange(len(sv_df))
    sv_df.sort(['chrom1', 'chrom2'], inplace=True)

    res_df = None
    for _, tmp_df in sv_df.groupby(['chrom1', 'chrom2']):
        tmp_df.sort(['chrom1', 'start1', 'stop1', 'chrom2', 'start2', 'stop2'],
                    inplace=True)
        # cluster the loci in the group based on proximity
        groups = tk_sv_utils.get_break_groups(tmp_df, args.max_dist)

        # for each cluster, get the row with max qual
        # tmp_df.loc[g] gets the subset of tmp_df in the cluster.
        # then idxmax gets the max index

        out_df = pd.DataFrame(columns=sv_df.columns)
        idx = 0
        for g in groups:
            row = tmp_df.loc[tmp_df.loc[g]['qual'].idxmax()]
            if (tmp_df.loc[g]['info2'] == 'SV').any():
                row = tmp_df.loc[(tmp_df.loc[g]['info2'] == 'SV').idxmax()]

            source = list(set(tmp_df.loc[g]['info2']))
            row['info'] += (";SOURCE=" + ",".join(source))
            out_df.loc[idx] = row
            idx += 1

        out_df.sort(['start1', 'stop1', 'start2', 'stop2'], inplace=True)
        res_df = pd.concat([res_df, out_df], ignore_index=True)

    tk_sv_io.write_sv_df_to_bedpe(res_df, outs.sv_variants)
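The per-cluster selection in this example leans on DataFrame.idxmax() returning an index label, so the winning row can be pulled back out with .loc. A tiny self-contained illustration of the same pattern; the data and the two proximity clusters are made up rather than produced by get_break_groups:

import pandas as pd

df = pd.DataFrame({'qual':  [10, 50, 30, 70],
                   'info2': ['CNV', 'SV', 'SV', 'CNV']})

groups = [[0, 1], [2, 3]]  # hypothetical proximity clusters of row labels

for g in groups:
    sub = df.loc[g]
    row = df.loc[sub['qual'].idxmax()]        # highest-qual row in the cluster
    if (sub['info2'] == 'SV').any():          # but prefer an SV call when one exists
        row = df.loc[(sub['info2'] == 'SV').idxmax()]
    print('%s %s' % (row['qual'], row['info2']))   # prints "50 SV", then "30 SV"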
Example #14
def join(args, outs, chunk_defs, chunk_outs):
    join_df = None
    read_counts = {}
    read_counts['split'] = defaultdict(int)
    read_counts['pair'] = defaultdict(int)

    for chunk in chunk_outs:
        bedpe_df = tk_sv_io.read_sv_bedpe_to_df(chunk.sv_calls)
        join_df = pd.concat([join_df, bedpe_df], ignore_index = True)

        if not os.path.isfile(chunk.discordant_read_counts):
            continue
        with open(chunk.discordant_read_counts, 'r') as f:
            counts = json.load(f)
        for t, c in counts['split'].iteritems():
            read_counts['split'][t] += c
        for t, c in counts['pair'].iteritems():
            read_counts['pair'][t] += c

    join_df['name'] = [str(i) for i in np.arange(len(join_df))]
    tk_sv_io.write_sv_df_to_bedpe(join_df, outs.sv_calls)

    read_counts['split'] = dict(read_counts['split'])
    read_counts['pair'] = dict(read_counts['pair'])

    with open(args.basic_summary, 'r') as f:
        num_reads = float(json.load(f)['num_reads']) / 2.0

    read_counts['frac_split'] = {}
    read_counts['frac_pair'] = {}
    for t, c in read_counts['split'].iteritems():
        read_counts['frac_split'][t] = c / num_reads
    for t, c in read_counts['pair'].iteritems():
        read_counts['frac_pair'][t] = c / num_reads

    with open(outs.discordant_read_counts, 'w') as f:
        f.write(tenkit.safe_json.safe_jsonify(read_counts))
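This join accumulates per-chunk counters into defaultdicts, converts them to plain dicts before serialization, and normalizes by the number of read pairs. A small sketch of the accumulation step (the chunk values and categories are illustrative only):

from collections import defaultdict

chunk_counts = [{'DEL': 3, 'INV': 1}, {'DEL': 2}]   # per-chunk 'split' counts, made up

totals = defaultdict(int)
for counts in chunk_counts:
    for sv_type, n in counts.items():
        totals[sv_type] += n

num_read_pairs = 100.0
fracs = dict((sv_type, n / num_read_pairs) for sv_type, n in totals.items())
print(dict(totals))   # {'DEL': 5, 'INV': 1}
print(fracs)          # {'DEL': 0.05, 'INV': 0.01}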
Example #15
def main(args, outs):
    pred_df = tk_sv_io.read_sv_bedpe_to_df(args.sv_calls)
    pred_df = tk_sv_utils.get_dataframe_loc(
        pred_df, list(range(args.start_idx, args.stop_idx)))

    in_bam = tk_bam.create_bam_infile(args.possorted_bam)

    frac_changed = np.zeros((len(pred_df), ), dtype=np.float)

    for i, (_, row) in enumerate(pred_df.iterrows()):
        frac_changed[i] = get_frac_mapq_changed(in_bam,
                                                row.chrom1,
                                                max(0, row.start1 - BREAK_EXT),
                                                row.stop1 + BREAK_EXT,
                                                row.chrom2,
                                                max(0, row.start2 - BREAK_EXT),
                                                row.stop2 + BREAK_EXT,
                                                min_mapq=60)

    pileups = pred_df[frac_changed > args.max_frac_low_mapq]
    pred_df = pred_df[frac_changed <= args.max_frac_low_mapq]

    tk_sv_io.write_sv_df_to_bedpe(pred_df, outs.sv_calls)
    tk_sv_io.write_sv_df_to_bedpe(pileups, outs.pileups)
Example #16
def main(args, outs):
    pred_df = tk_sv_io.read_sv_bedpe_to_df(args.sv_calls)
    pred_df = tk_sv_utils.get_dataframe_loc(
        pred_df, list(range(args.start_idx, args.stop_idx)))

    in_bam = tk_bam.create_bam_infile(args.possorted_bam)

    cov_reader = tk_hdf5.DataFrameReader(args.hap_coverage)
    sel_cols = ['cov_q30_hap0', 'cov_q30_hap1', 'cov_q30_hap2']

    has_pileups = np.zeros((len(pred_df), ), dtype=np.bool)

    for i, (_, row) in enumerate(pred_df.iterrows()):
        has_clipped1 = has_too_many_clipped(
            in_bam,
            row.chrom1,
            max(0, row.start1 - BREAK_EXT),
            row.stop1 + BREAK_EXT,
            max_clipped_frac=args.max_clipped_frac)
        has_clipped2 = has_too_many_clipped(
            in_bam,
            row.chrom2,
            max(0, row.start2 - BREAK_EXT),
            row.stop2 + BREAK_EXT,
            max_clipped_frac=args.max_clipped_frac)
        has_clipped = has_clipped1 and has_clipped2

        if row.chrom1 != row.chrom2 or row.start2 - row.stop1 > MAX_FRAG_SIZE:
            has_pileups[i] = has_clipped
            continue

        cov = cov_reader.query(
            (row.chrom1, max(0,
                             row.start1 - BREAK_EXT), row.stop2 + BREAK_EXT))
        cov['bin'] = np.array(cov['pos'] / BIN_WIN, dtype=np.int)
        if not 'coverage_deduped' in cov.columns:
            cov['coverage_deduped'] = cov[sel_cols].sum(axis=1)
        cov_arr = np.array(cov.groupby('bin').mean()['coverage_deduped'])
        median_cov = np.median(cov_arr)

        # Rescue for deletions or duplications with breakpoints on the pileups
        sv_len = row.stop2 - row.start1
        side_cov = cov_reader.query(
            (row.chrom1, max(0, row.start1 - BREAK_EXT - sv_len / 2),
             row.start1 - BREAK_EXT))
        side_cov = pd.concat([
            side_cov,
            cov_reader.query((row.chrom2, row.stop2 + BREAK_EXT,
                              row.stop2 + BREAK_EXT + sv_len / 2))
        ],
                             ignore_index=True)
        if not 'coverage_deduped' in side_cov.columns:
            side_cov['coverage_deduped'] = side_cov[sel_cols].sum(axis=1)

        # Ignore pileups; there is enough evidence for a large-scale copy number variant
        if np.median(cov.coverage_deduped) < DEL_REL_COV * np.median(
                side_cov.coverage_deduped):
            continue
        if np.median(cov.coverage_deduped) > DUP_REL_COV * np.median(
                side_cov.coverage_deduped):
            continue

        # Filter out the call if there are pileups very close to the breakpoints
        has_pileups[i] = len(cov_arr) > 4 and np.any(
            cov_arr[[0, 1, -2, -1]] > args.min_rel_depth * median_cov)
        has_pileups[i] = has_pileups[i] or has_clipped

    pileups = pred_df[has_pileups]
    pred_df = pred_df[np.logical_not(has_pileups)]

    tk_sv_io.write_sv_df_to_bedpe(pred_df, outs.sv_calls)
    tk_sv_io.write_sv_df_to_bedpe(pileups, outs.pileups)
Example #17
def main(args, outs):

    if args.barcode_whitelist is None:
        # write empty dataframe
        tk_sv_io.write_sv_df_to_bedpe(None, outs.del_candidates)
        martian.log_info('Data seem un-barcoded. No deletion candidates will be computed.')
        return

    in_bam = tk_bam.create_bam_infile(args.possorted_bam)

    del_loci = []
    for (chrom, start, stop) in (tk_io.get_locus_info(l) for l in args.loci):
        cov_df = get_hap_coverage(in_bam, None, chrom, start, stop, cov_quals=[30])
        best_path = get_candidate_del_loci(cov_df, transition_prob=args.transition_prob, het_read_prob=args.het_read_prob)

        # Get regions with good coverage for a het del (not too high, not too low)
        bad_cov = np.logical_or(cov_df['total_cov'] < MIN_COV,
                                cov_df['total_cov'] > MAX_COV)
        bad_regions = tk_regions.Regions([ (s,e) for (s,e) in group_bit_arr(bad_cov, start) if e-s > args.min_bad_region])

        # Group the states of the HMM and exclude bad regions
        pos = start
        out_loci = []
        for bit, group in groupby(best_path):
            group_size = len(list(group))
            group_start = pos
            group_stop = group_start + group_size
            if bit and group_size >= args.min_del_len and group_size <= args.max_del_len and \
               not bad_regions.overlapping_regions(group_start, group_stop):
                out_loci.append((chrom, group_start, group_stop))
            pos += group_size

        # Get regions that look like hom dels
        hom_del_loci = group_bit_arr(cov_df['total_cov'] < MIN_COV, start)
        out_loci.extend([(chrom, s, e) for (s, e) in hom_del_loci])
        out_loci = sorted(out_loci)

        # Now merge deletion candidates that are separated by short non-dels
        if out_loci:
            new_out_loci = []
            last_locus = out_loci[0]
            for i, locus in enumerate(out_loci[1:]):
                if locus[1] - last_locus[2] > MIN_GAP:
                    new_out_loci.append(last_locus)
                    last_locus = locus
                else:
                    last_locus = (last_locus[0], min(locus[1], last_locus[1]), max(locus[2], last_locus[2]))
            new_out_loci.append(last_locus)

            del_loci.extend(new_out_loci)

    final_loci = [locus for locus in del_loci if locus[2] - locus[1] >= args.min_del_len and locus[2] - locus[1] <= args.max_del_len]
    info_strs = ['TYPE=DEL' for _ in final_loci]
    in_bam.close()

    chroms = [locus[0] for locus in final_loci]
    starts1 = np.array([locus[1] for locus in final_loci], dtype=np.int)
    starts2 = np.array([locus[2] for locus in final_loci], dtype=np.int)
    sv_df = tk_sv_io.create_sv_df(chroms, starts1, starts1 + 1,
                                  chroms, starts2, starts2 + 1,
                                  np.arange(len(chroms)), 1, info_strs = info_strs)
    tk_sv_io.write_sv_df_to_bedpe(sv_df, outs.del_candidates)
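The grouping of the HMM state path into candidate intervals is a plain run-length encoding: consecutive identical states are collapsed with itertools.groupby and only the runs of deletion states are kept. A simplified stand-in for the project's group_bit_arr helper (the function name runs_of_ones and the example coordinates are illustrative):

from itertools import groupby

def runs_of_ones(bits, start):
    # Yield (start, stop) intervals for each run of truthy values in bits,
    # where start is the genomic coordinate of bits[0].
    pos = start
    for bit, group in groupby(bits):
        size = len(list(group))
        if bit:
            yield (pos, pos + size)
        pos += size

print(list(runs_of_ones([0, 1, 1, 0, 0, 1], start=1000)))
# [(1001, 1003), (1005, 1006)]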
Example #18
def join(args, outs, chunk_defs, chunk_outs):
    pred_to_match, _, pred_df, true_df, min_qv = merge_predictions(chunk_outs)

    # Change TRANS type to DISTAL. This change will only
    # affect the type reported, not the names of the metrics.
    new_info = []
    for _, row in pred_df.iterrows():
        sv_type = tk_sv_io.get_sv_type(row.info)
        if sv_type == 'TRANS':
            sv_type = 'DISTAL'
        new_info.append(tk_sv_io.update_info(row.info, ['TYPE'], [sv_type]))
    pred_df['info'] = new_info

    if not true_df is None:
        true_df.to_csv(outs.feasible_gt,
                       index=False,
                       header=True,
                       sep='\t',
                       na_rep='NaN')

    ##### Write BEDPE/VCF outputs
    tk_sv_io.write_sv_df_to_bedpe(pred_df, outs.sv_candidates)
    source_str = '10X/pipelines/stages/analyze_sv_calls {}'.format(
        martian.get_pipelines_version())
    sample_id = 'sample' if args.sample_id is None else args.sample_id
    tk_sv_io.bedpe_to_vcf(outs.sv_candidates, outs.svs.strip('.gz'), sample_id,
                          source_str, args.reference_path)
    # this will sort and gzip
    tk_sv_io.index_sv_vcf(outs.svs.strip(".gz"))
    outs.svs_index = outs.svs + '.tbi'
    # delete the non-gzipped file
    os.remove(outs.svs.strip('.gz'))

    if not pred_df.empty:
        call_df = pred_df[np.logical_or(pred_df['filters'] == '.',
                                        pred_df['filters'] == "PASS")]
    else:
        call_df = None
    tk_sv_io.write_sv_df_to_bedpe(call_df, outs.sv_calls)

    # Annotate each call with the matching ground truth svs. The resulting
    # dataframe might have multiple rows for the same call if there were multiple
    # matching ground truth svs.
    martian.log_info("merging calls and gt")
    if not pred_df.empty:
        pred_df = merge_calls_and_gt(pred_df, true_df, pred_to_match)

    martian.log_info("writing call_tsv")
    pred_df.to_csv(outs.call_tsv,
                   index=False,
                   header=True,
                   sep='\t',
                   na_rep='NaN')

    pred_df = pred_df[np.logical_not(pd.isnull(pred_df['name']))]

    max_dists = sorted(np.array(args.detect_dists))

    gt_sv_types = get_all_sv_types(true_df)
    call_sv_types = get_all_sv_types(pred_df)

    if not true_df is None:
        # Use the default MAX_PPV_TIER unless this is greater than the maximum tier
        # present in the data.
        max_ppv_tier = min(MAX_PPV_TIER, np.max(true_df.tier))
        # Use the default unless this is smaller than the minimum tier present in
        # the data.
        max_sens_tier = max(MAX_SENS_TIER, np.min(true_df.tier))
    else:
        max_ppv_tier = 1
        max_sens_tier = 1

    tiers = [max_ppv_tier, max_sens_tier]

    # All combinations of filters in ground truth and call set
    if not args.targets is None and not args.target_dists is None:
        target_dists = list(sorted(np.array(args.target_dists,
                                            dtype=np.float)))
        target_dists.append(float('NaN'))
    else:
        target_dists = [float('NaN')]

    combs = product([0, 1, 2, None], target_dists, gt_sv_types, tiers,
                    [True, False], call_sv_types, max_dists)

    metrics = defaultdict(list)

    gt_filters = ['genic_breaks', 'target_dist', 'gt_sv_type', 'tier']
    call_filters = ['call_filtered', 'call_sv_type', 'match_dist']

    for (genic_breaks, tdist, gt_sv_type, tier, is_filtered, call_sv_type,
         dist) in combs:
        if gt_sv_type != 'NA' and call_sv_type != 'NA' and gt_sv_type != call_sv_type:
            continue

        metrics['genic_breaks'].append(genic_breaks)
        metrics['target_dist'].append(tdist)
        metrics['gt_sv_type'].append(gt_sv_type)
        metrics['tier'].append(tier)
        metrics['call_filtered'].append(is_filtered)
        metrics['call_sv_type'].append(call_sv_type)
        metrics['match_dist'].append(dist)

        if true_df is None:
            sel_true_df = None
        else:
            sel_true_df = true_df
            if gt_sv_type != 'NA':
                sel_true_df = sel_true_df[sel_true_df.sv_type == gt_sv_type]
            if not np.isnan(tdist):
                sel_true_df = sel_true_df[sel_true_df.targ_dist <= tdist]
            sel_true_df = sel_true_df[sel_true_df.tier <= tier]
            # Restrict to genic or non-genic or take everything if this is None.
            if not genic_breaks is None:
                sel_true_df = sel_true_df[sel_true_df.genic_breaks ==
                                          genic_breaks]

            if len(sel_true_df) == 0:
                sel_true_df = None

        sel_pred_df = pred_df

        if is_filtered and not pred_df.empty:
            sel_pred_df = sel_pred_df[(sel_pred_df.filters == '.') |
                                      (sel_pred_df.filters == 'PASS')]
        if call_sv_type != 'NA' and not pred_df.empty:
            sel_pred_df = sel_pred_df[sel_pred_df.sv_type == call_sv_type]
        if not pred_df.empty and (args.min_rel_overlap is None
                                  or args.min_rel_overlap == 0):
            # Do not apply this filter if the matching is done based on overlap.
            sel_pred_df = sel_pred_df[np.logical_or(
                np.isnan(sel_pred_df.match_dist),
                sel_pred_df.match_dist <= dist)]

        add_metrics(sel_pred_df, sel_true_df, metrics)

    column_names = gt_filters
    column_names.extend(call_filters)
    other_names = set(metrics.keys()).difference(set(column_names))
    column_names.extend(other_names)

    metric_df = pd.DataFrame(metrics)
    metric_df = metric_df[column_names]

    martian.log_info("writing summary tsv")
    metric_df.to_csv(outs.summary_tsv,
                     index=False,
                     header=True,
                     sep='\t',
                     na_rep='NaN')

    short_metrics = get_short_metrics(metric_df, other_names, max_ppv_tier,
                                      max_sens_tier, args)

    if not args.call_summary is None:
        with open(args.call_summary, 'r') as in_summary_fn:
            in_summary = json.load(in_summary_fn)
            for key, val in in_summary.iteritems():
                short_metrics[key] = val

    short_metrics['min_qv'] = min_qv

    with open(outs.summary, 'w') as out_file:
        out_file.write(
            tenkit.safe_json.safe_jsonify(short_metrics, pretty=True))
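The metrics loop in this join builds a long-format table by appending one value per filter column for every combination produced by itertools.product, and only at the end turns the dict of lists into a DataFrame. A stripped-down sketch of that pattern (the columns and the placeholder metric are made up):

from collections import defaultdict
from itertools import product

import pandas as pd

metrics = defaultdict(list)
for sv_type, tier in product(['DEL', 'INV'], [1, 2]):
    metrics['sv_type'].append(sv_type)
    metrics['tier'].append(tier)
    metrics['n_calls'].append(0)   # placeholder for a real metric value

metric_df = pd.DataFrame(metrics)[['sv_type', 'tier', 'n_calls']]
print(metric_df)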
Example #19
File: utils.py Project: umccr/longranger
def merge_breaks(bedpe_df,
                 out_bedpe,
                 merge_win=10000,
                 max_range=np.inf,
                 max_nmates=np.inf,
                 cluster_qual_factor=0.2):
    """Merges a set of SVs into a non-redundant set.
    Args:
    - bedpe_df: Either a bedpe file or a DataFrame like the one returned by
    tk_sv_io.read_sv_bedpe_to_df.
    - out_bedpe: Path to file where output will be written.
    - merge_win: Breakpoints will be merged if they are within this distance from
    each other. Two SVs will be merged if both their breakpoints can be merged.
    - max_range: See max_range field of cluster_loci.
    - max_nmates: Two extra info fields will be added to the output BEDPE, NMATES1
    and NMATES2. NMATES1 is the number of mate breakpoints (after merging, i.e.
    breakpoint clusters) of the first breakpoint of an SV; NMATES2 is the same for
    the second breakpoint. SVs whose breakpoints both exceed the max_nmates cutoff
    will not be included in the output.

    Return value:
    The output BEDPE.
    """
    if not isinstance(bedpe_df, pd.DataFrame):
        bedpe_df = tk_sv_io.read_sv_bedpe_to_df(bedpe_df)
    else:
        bedpe_df = pd.DataFrame(bedpe_df)
    breaks = []
    for i in range(bedpe_df.shape[0]):
        breaks.append((bedpe_df.iloc[i, 0], bedpe_df.iloc[i, 1],
                       bedpe_df.iloc[i, 2], (bedpe_df.iloc[i, 6], 1)))
        breaks.append((bedpe_df.iloc[i, 3], bedpe_df.iloc[i, 4],
                       bedpe_df.iloc[i, 5], (bedpe_df.iloc[i, 6], 2)))
    _, mem_to_cluster, _ = cluster_loci(breaks, merge_win, max_range=max_range)

    cluster_pairs = {}
    for i in range(bedpe_df.shape[0]):
        name = bedpe_df.iloc[i]['name']
        cluster_idx1 = mem_to_cluster[(name, 1)]
        cluster_idx2 = mem_to_cluster[(name, 2)]
        if not (cluster_idx1, cluster_idx2) in cluster_pairs:
            cluster_pairs[(cluster_idx1, cluster_idx2)] = [i]
        else:
            old_pair = cluster_pairs[(cluster_idx1, cluster_idx2)][0]
            # Make sure the old and the new pair have breaks on the same chromosomes
            assert (bedpe_df.iloc[old_pair, 0] == bedpe_df.iloc[i, 0])
            assert (bedpe_df.iloc[old_pair, 3] == bedpe_df.iloc[i, 3])
            cluster_pairs[(cluster_idx1, cluster_idx2)].append(i)

    new_cluster_pairs = {}
    cluster_dist_ratio = {}
    for p, pos_list in cluster_pairs.iteritems():
        pos_arr = np.array(pos_list)
        tmp_df = get_dataframe_loc(bedpe_df, pos_arr)
        quals = np.array(tmp_df.qual)
        best_call = pos_arr[np.argmax(quals)]
        close_calls = np.where(quals >= cluster_qual_factor * np.max(quals))[0]
        close_df = get_dataframe_loc(tmp_df, close_calls)

        same_chrom = bedpe_df.iloc[best_call]['chrom2'] == bedpe_df.iloc[
            best_call]['chrom1']
        min_break_dist = np.min(close_df.start2) - np.max(close_df.stop1)
        max_break_dist = bedpe_df.iloc[best_call]['start2'] - bedpe_df.iloc[
            best_call]['stop1']

        new_cluster_pairs[p] = best_call
        if not same_chrom or max_break_dist > MAX_FRAG_SIZE:
            cluster_dist_ratio[p] = '.'
        elif min_break_dist <= 0:
            cluster_dist_ratio[p] = float('NaN')
        else:
            cluster_dist_ratio[p] = float(max_break_dist) / min_break_dist

    cluster_pairs = new_cluster_pairs

    def clusters_close(i, j):
        chrom1, start1, stop1 = bedpe_df.iloc[i, 0], bedpe_df.iloc[
            i, 1], bedpe_df.iloc[i, 2]
        chrom2, start2, stop2 = bedpe_df.iloc[i, 3], bedpe_df.iloc[
            i, 4], bedpe_df.iloc[i, 5]
        next_chrom1, next_start1, next_stop1 = bedpe_df.iloc[
            j, 0], bedpe_df.iloc[j, 1], bedpe_df.iloc[j, 2]
        next_chrom2, next_start2, next_stop2 = bedpe_df.iloc[
            j, 3], bedpe_df.iloc[j, 4], bedpe_df.iloc[j, 5]
        dist1 = max(next_start1 - stop1, start1 - next_stop1)
        dist2 = max(next_start2 - stop2, start2 - next_stop2)
        return (chrom1 == next_chrom1 and chrom2 == next_chrom2
                and dist1 <= merge_win and dist2 <= merge_win)

    # The "chain-breaking" in cluster_loci might still leave some redundancy.
    # In particular, we might leave some almost touching clusters that were
    # separated only because of chain-breaking. Do a second round of clustering
    # where you go through consecutive pairs of cluster and merge them if they're merge-able.
    new_cluster_pairs = {}
    for (cluster1, cluster2) in sorted(cluster_pairs.keys()):
        if cluster_pairs[(cluster1, cluster2)] == -1:
            continue
        # Consider all neighboring clusters after this cluster.
        # Notice that the cluster indices are sorted by genomic coordinates.
        neigh_clusters = [
            (cluster1, cluster2 + 1), (cluster1 + 1, cluster2 - 1),
            (cluster1 + 1, cluster2), (cluster1 + 1, cluster2 + 1)
        ]
        idx = cluster_pairs[(cluster1, cluster2)]
        # Best cluster among neighboring clusters
        max_cluster = ((cluster1, cluster2), idx)
        for next_cluster1, next_cluster2 in neigh_clusters:
            if not (next_cluster1, next_cluster2) in cluster_pairs:
                continue
            if cluster_pairs[(next_cluster1, next_cluster2)] == -1:
                continue
            next_idx = cluster_pairs[(next_cluster1, next_cluster2)]
            if clusters_close(idx, next_idx):
                cluster_pairs[(next_cluster1, next_cluster2)] = -1
                if bedpe_df.iloc[idx]['qual'] < bedpe_df.iloc[next_idx]['qual']:
                    max_cluster = ((next_cluster1, next_cluster2), next_idx)
        new_cluster_pairs[max_cluster[0]] = max_cluster[1]

    cluster_pairs = new_cluster_pairs

    # Now compute the number of mate breakpoints for each cluster
    num_mates = {}
    for (cluster1, cluster2) in cluster_pairs.keys():
        if not cluster1 in num_mates:
            num_mates[cluster1] = 0
        if not cluster2 in num_mates:
            num_mates[cluster2] = 0
        num_mates[cluster1] += 1
        if cluster2 != cluster1:
            num_mates[cluster2] += 1

    sel_loc = []
    new_info_strs = []
    for (cluster1, cluster2) in sorted(cluster_pairs.keys()):
        sv_loc = cluster_pairs[(cluster1, cluster2)]
        if num_mates[cluster1] > max_nmates and num_mates[
                cluster2] > max_nmates:
            continue
        sel_loc.append(sv_loc)
        new_info_strs.append(
            tk_sv_io.update_info(bedpe_df.iloc[sv_loc]['info'],
                                 ['NMATES1', 'NMATES2', 'RESOLUTION'], [
                                     num_mates[cluster1], num_mates[cluster2],
                                     cluster_dist_ratio[(cluster1, cluster2)]
                                 ]))
    if len(sel_loc) > 0:
        bedpe_df = bedpe_df.iloc[sel_loc]
        bedpe_df['info'] = new_info_strs
    else:
        bedpe_df = pd.DataFrame(columns=bedpe_df.columns)
    if not out_bedpe is None:
        tk_sv_io.write_sv_df_to_bedpe(bedpe_df, out_bedpe)

    return bedpe_df
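As a rough usage sketch, merge_breaks accepts either a BEDPE path or an already-loaded DataFrame and writes the non-redundant call set back out; the paths and parameter values below are placeholders, not settings from the original pipeline:

merged_df = merge_breaks('calls.bedpe', 'merged.bedpe',
                         merge_win=10000,  # breakpoints within 10 kb can be merged
                         max_nmates=10)    # drop SVs whose breakpoints both have >10 mate clusters
print('%d merged SV calls' % len(merged_df))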
Example #20
def main(args, outs):
    """SV calling on a subset of the input loci."""

    #### Prepare input files and parameters ####
    if not isfile(args.fragment_histogram) or not isfile(args.fragments) or \
       not isfile(args.fragment_phasing):
        martian.log_info('One or more files needed for SV-calling are missing. No calls will be made.')
        tk_sv_io.write_sv_df_to_bedpe(None, outs.sv_variants)
        return

    # Get candidate loci and subset to the loci for this chunk.
    overlap_loci, input_names = prepare_loci(args)

    overlap_loci = [overlap_loci[i] for i in range(int(args.start_idx), int(args.stop_idx))]
    if not input_names is None:
        input_names = [input_names[i] for i in range(int(args.start_idx), int(args.stop_idx))]

    # Get molecule size distribution.
    frag_res = tk_sv_stats.read_frag_hist(args.fragment_histogram, MIN_FRAG_SIZE)
    frag_sizes, frag_counts = frag_res

    # Get fragment phasing info. This will be used to get the barcode phasing.
    frag_phasing = tk_tabix.create_tabix_infile(args.fragment_phasing)

    # Estimate the Poisson reads rate alpha.
    fragment_df = tk_hdf5.read_data_frame_limited(args.fragments, query_cols=['obs_len', 'num_reads'], max_rows=20000000)
    fragment_df = fragment_df[fragment_df.num_reads > MIN_READS_PER_FRAG]
    alpha = np.median(np.array(fragment_df['num_reads']) / np.array(fragment_df['obs_len'], dtype=np.float))
    martian.log_info('Using alpha = {}'.format(alpha))

    sv_model = tk_sv_read_model.ReadModel(alpha, frag_sizes, frag_counts,
                                          p_ov_mol=args.p_ov_mol, step=1000)

    if not args.targets is None:
        msg = 'Read-based SV-calling from targeted data not supported.'
        martian.log_info(msg)
        return

    # Get a set of barcodes to remove from SV-calling.
    if not args.barcode_blacklist is None:
        tmp_df = pd.read_csv(args.barcode_blacklist, sep='\t', index_col=False)
        blacklist_barcodes = set(tmp_df.bc)
    else:
        blacklist_barcodes = set([])

    in_bam = tk_bam.create_bam_infile(args.possorted_bam)
    #### End prepare input files and parameters ####

    old_locus = None
    old_reads = None

    res_arr = []
    group_ids = []

    # Iterate through all loci and evaluate them for the presence of SVs
    for locus_idx, (c1, s1, e1, c2, s2, e2, _) in enumerate(overlap_loci):

        print >> sys.stderr, 'Evaluating locus', c1, s1, e1, c2, s2, e2

        # Candidate locus too wide. Skip.
        if e1 - s1 > MAX_REGION_LEN or e2 - s2 > MAX_REGION_LEN:
            print >> sys.stderr, 'Locus too wide. Skipping.'
            continue

        # Candidate loci too close to each other. Skip.
        if c1 == c2 and s2 - e1 < 2 * BREAK_EXT:
            print >> sys.stderr, 'Breakpoints too close to each other. Skipping.'
            continue

        # Evaluate for proximal SVs (DEL, INV, DUP) if the distance between
        # breakpoints is < MAX_FRAG_SIZE. Otherwise, the event will be called
        # a translocation and we'll try to infer the signal orientation.
        if c1 == c2 and s2 - e1 < MAX_FRAG_SIZE:
            if not old_locus is None and loci_close(old_locus, (c1, s1, e1, c2, s2, e2)):
                in_read_df = old_reads
            else:
                in_read_df = None
            res, reads = call_proximal(sv_model, c1, s1, e1, s2, e2,
                                       in_bam, in_read_df, frag_phasing,
                                       blacklist_barcodes, args)

            old_locus = (c1, s1, e1, c2, s2, e2)
            old_reads = reads
        else:
            res = call_distal(sv_model, c1, max(0, s1 - BREAK_EXT), e1 + BREAK_EXT,
                              c2, max(0, s2 - BREAK_EXT), e2 + BREAK_EXT, in_bam,
                              frag_phasing, blacklist_barcodes, args)

        if not res is None:
            res_arr.extend(res)
            group_ids.extend(locus_idx * np.ones((len(res),), dtype=np.int))

    in_bam.close()

    out_df = tk_sv_call.SvCall.svs_to_dataframe(res_arr)

    tk_sv_io.write_sv_df_to_bedpe(out_df, outs.sv_variants)
Example #21
def main(args, outs):

    if args.fragments is None:
        outs.bc_cnv = None
        outs.bc_large_cnv = None
        return

    rust_env = os.environ.copy()
    rust_env["RUST_BACKTRACE"] = "1"
    final_blacklist = lr_gt.get_genomic_track(args.blacklist, "terminal_cnv",
                                              args.reference_path,
                                              "default_blacklist.bed")
    if final_blacklist is None:
        final_blacklist = args.possorted_bam + "_tmp"
        open(final_blacklist, 'w').close()

    if args.subcommand == "bc" and args.fragments:
        frag_csv = outs.bc_cnv + ".csv"
        bin_size, frag_version = convert_fragments_to_csv(
            args.fragments, frag_csv, args.bin_size, args.allow_bin_size_adj)
        cnv_args = [
            'hmm-bc-cnv', args.subcommand, frag_csv, args.possorted_bam,
            final_blacklist, outs.bc_cnv, "--fragver",
            str(frag_version), "--binsize",
            str(bin_size), "--probchange",
            str(args.status_change_penalty), "--minprob",
            str(args.min_prob)
        ]
    elif args.subcommand == "read":
        cnv_args = [
            'hmm-bc-cnv', args.subcommand, args.possorted_bam, final_blacklist,
            outs.bc_cnv, "--binsize",
            str(args.bin_size), "--probchange",
            str(args.status_change_penalty)
        ]
    elif args.subcommand == "asread":
        frag_csv = outs.bc_cnv + ".csv"
        bin_size, frag_version = convert_fragments_to_csv(
            args.fragments, frag_csv, args.bin_size, args.allow_bin_size_adj)
        cnv_args = [
            'hmm-bc-cnv', args.subcommand, frag_csv, args.possorted_bam,
            final_blacklist, outs.bc_cnv, "--fragver",
            str(frag_version), "--binsize",
            str(bin_size), "--probchange",
            str(args.status_change_penalty), "--minprob",
            str(args.min_prob)
        ]

    print cnv_args
    subprocess.check_call(cnv_args, env=rust_env)
    outs.final_bin_size = bin_size

    chroms = []
    starts1 = []
    end1 = []
    starts2 = []
    end2 = []
    info_strs = []
    quals = []

    primary_contigs = tk_reference.load_primary_contigs(args.reference_path)

    spikes = tk_io.get_target_regions(open(args.spikes))
    with open(outs.bc_cnv) as fin:
        for line in fin.readlines():
            if line.startswith('#') or line.startswith(
                    'browser') or line.startswith('track') or line.startswith(
                        '-browser') or line.startswith('-track'):
                continue
            infos = line.strip().split("\t")
            cp = int(infos[3])
            ch = infos[0]
            s = int(infos[1])
            e = int(infos[2])

            # Some basic filtering
            if primary_contigs and ch not in primary_contigs:
                continue

            if cp == 2 or (e - s) < args.minimal_cnv_size:
                continue

            if cp > 2:
                if ch not in spikes: continue
                overlaps = spikes[ch].overlapping_regions(
                    max(0, s - bin_size), e + bin_size)
                ln = len(overlaps)
                if ln > 0 and \
                    overlap(s-bin_size, s+bin_size, overlaps[0][0], overlaps[0][1]) and \
                    overlap(e-bin_size, e+bin_size, overlaps[ln-1][0], overlaps[ln-1][1]):
                    continue

            chroms.append(infos[0])
            starts1.append(s)
            end1.append(s + 1)
            starts2.append(e)
            end2.append(e + 1)
            pval = float(infos[4])
            #if pval > args.max_pval:
            #    continue
            if pval < 1e-100:
                qual = 1000
            else:
                qual = int(-log10(pval) * 10)
            quals.append(qual)
            if cp > 2:
                info_strs.append('TYPE=DUP;COPY=%d' % cp)
            elif cp < 2:
                info_strs.append('TYPE=DEL;COPY=%d' % cp)

    sv_df = tk_sv_io.create_sv_df(chroms,
                                  starts1,
                                  end1,
                                  chroms,
                                  starts2,
                                  end2,
                                  np.arange(len(chroms)),
                                  quals,
                                  info_strs=info_strs)
    tk_sv_io.write_sv_df_to_bedpe(sv_df, outs.bc_large_cnv)
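The quality attached to each CNV segment above is a Phred-style transform of the HMM p-value, with a cap so that vanishingly small p-values do not overflow. A minimal sketch of that mapping (the helper name pval_to_qual and the example values are illustrative):

from math import log10

def pval_to_qual(pval, cap=1000):
    # Phred-style quality: -10 * log10(p), capped for p below 1e-100.
    if pval < 1e-100:
        return cap
    return int(-log10(pval) * 10)

print(pval_to_qual(0.01))    # 20
print(pval_to_qual(1e-12))   # 120
print(pval_to_qual(0.0))     # 1000 (cap)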
Example #22
def main(args, outs):
    sv_df = read_bedpes(args)
    sv_df = tk_sv_utils.get_dataframe_loc(
        sv_df, list(range(int(args.start_idx), int(args.stop_idx))))

    max_insert, ins_logsf_fun = tk_sv_utils.get_insert_size_info(
        args.insert_sizes)
    print >> sys.stderr, 'max insert', max_insert

    if max_insert is None:
        tk_sv_io.write_sv_df_to_bedpe(None, outs.sv_calls)
        tk_sv_io.write_sv_df_to_bedpe(sv_df, outs.non_pass_sv_calls)
        return

    with open(args.basic_summary, 'r') as f:
        summary = json.load(f)

    chimera_rate_del = summary['far_chimera_rate']
    chimera_rate_inv = summary['far_chimera_rate'] + summary[
        'same_dir_chimera_rate']
    chimera_rate_dup = summary['far_chimera_rate'] + summary[
        'outward_dir_chimera_rate']
    chimera_rates = {
        tk_readpairs.DEL_STR: chimera_rate_del,
        tk_readpairs.INV_STR: chimera_rate_inv,
        tk_readpairs.TDUP_STR: chimera_rate_dup,
        tk_readpairs.TRANS_STR: summary['far_chimera_rate']
    }

    in_bam = tk_bam.create_bam_infile(args.possorted_bam)
    frag_phasing = tk_tabix.create_tabix_infile(args.fragment_phasing)

    pass_calls = []
    non_pass_calls = []

    for i, (_, row) in enumerate(sv_df.iterrows()):
        sv_type = tk_sv_io.get_sv_type(row.info)

        middle = int(0.5 * (row.stop1 + row.start2))

        # Bail out on all non-deletions
        if sv_type != tk_readpairs.DEL_STR:
            continue

        if row.chrom1 == row.chrom2:
            r1 = (max(0, row.start1 - args.break_pad),
                  min(middle, row.stop1 + args.break_pad))
            r2 = (max(middle,
                      row.start2 - args.break_pad), row.stop2 + args.break_pad)

            if row.start2 - row.stop1 > 4 * args.break_pad:
                starts = [r1[0], r2[0]]
                stops = [r1[1], r2[1]]
                chroms = [row.chrom1, row.chrom2]
            else:
                starts = [r1[0]]
                stops = [r2[1]]
                chroms = [row.chrom1]
        else:
            r1 = (max(0,
                      row.start1 - args.break_pad), row.stop1 + args.break_pad)
            r2 = (max(0,
                      row.start2 - args.break_pad), row.stop2 + args.break_pad)
            starts = [r1[0], r2[0]]
            stops = [r1[1], r2[1]]
            chroms = [row.chrom1, row.chrom2]

        bc_cov1 = len(get_frag_coverage(frag_phasing, row.chrom1, r1[0],
                                        r1[1]))
        bc_cov2 = len(get_frag_coverage(frag_phasing, row.chrom2, r2[0],
                                        r2[1]))
        if sv_type == tk_readpairs.DEL_STR and max(bc_cov1,
                                                   bc_cov2) > MAX_DEL_BC_DEPTH:
            print >> sys.stderr, 'Too many barcodes in DEL candidate', row.chrom1, row.start1, row.stop2
            continue

        readpairs = tk_readpairs.get_readpairs(in_bam,
                                               chroms,
                                               starts,
                                               stops,
                                               max_insert=max_insert,
                                               min_mapq=args.min_mapq)

        normal_readpairs = [
            rp for rp in readpairs if rp.sv_type == tk_readpairs.NORMAL_STR
        ]
        if len(normal_readpairs) > MAX_DEL_READPAIRS:
            sel = np.random.choice(len(normal_readpairs), MAX_DEL_READPAIRS)
        else:
            sel = np.arange(len(normal_readpairs))
        normal_readpairs = [normal_readpairs[ridx] for ridx in sel]

        # Distal readpairs across the breakpoints
        dist_readpairs = [
            rp for rp in readpairs if rp.sv_type == sv_type and (
                (tk_readpairs.pos_overlaps(rp.read1.pos, r1)
                 and tk_readpairs.pos_overlaps(rp.read2.pos, r2)) or
                (tk_readpairs.pos_overlaps(rp.read1.pos, r2)
                 and tk_readpairs.pos_overlaps(rp.read2.pos, r1)))
        ]
        if len(dist_readpairs) > MAX_DEL_READPAIRS:
            sel = np.random.choice(len(dist_readpairs), MAX_DEL_READPAIRS)
        else:
            sel = np.arange(len(dist_readpairs))
        dist_readpairs = [dist_readpairs[ridx] for ridx in sel]

        dist_readpairs.extend(normal_readpairs)
        if sv_type == tk_readpairs.DEL_STR and len(starts) == 2:
            more_readpairs = tk_readpairs.get_readpairs(in_bam, [row.chrom1],
                                                        [r1[1] + 1],
                                                        [r2[0] - 1],
                                                        max_insert=max_insert,
                                                        min_mapq=args.min_mapq,
                                                        normal_only=True)
            if len(more_readpairs) > MAX_DEL_READPAIRS:
                sel = np.random.choice(len(more_readpairs), MAX_DEL_READPAIRS)
            else:
                sel = np.arange(len(more_readpairs))
            dist_readpairs.extend([
                more_readpairs[ridx] for ridx in sel
                if more_readpairs[ridx].sv_type == tk_readpairs.NORMAL_STR
            ])

        readpairs = sorted(dist_readpairs, key=lambda x: x.barcode)
        read_groups = {}
        for bc, read_group_iter in groupby(dist_readpairs,
                                           lambda x: x.barcode):
            read_groups[bc] = list(read_group_iter)

        bc_set = set(read_groups.keys())
        bc_list = sorted(read_groups.keys())
        phase_set1 = tk_sv_utils.get_phase_set(frag_phasing, row.chrom1, r1[0],
                                               r1[1])
        phase_set2 = tk_sv_utils.get_phase_set(frag_phasing, row.chrom2, r2[0],
                                               r2[1])

        if len(bc_list) < 1:
            print >> sys.stderr, 'Not enough barcodes. Skipping'
            continue

        bc_phase_sets1 = tk_sv_utils.get_barcode_phase_probs(frag_phasing,
                                                             row.chrom1,
                                                             r1[0],
                                                             r1[1],
                                                             bc_set,
                                                             in_ps=phase_set1)
        bc_phase_sets2 = tk_sv_utils.get_barcode_phase_probs(frag_phasing,
                                                             row.chrom2,
                                                             r2[0],
                                                             r2[1],
                                                             bc_set,
                                                             in_ps=phase_set2)

        cand_breaks1 = np.arange(r1[0], r1[1] + 1, 5)
        cand_breaks2 = np.arange(r2[0], r2[1] + 1, 5)

        res = tk_readpairs.eval_sv_em(read_groups,
                                      cand_breaks1,
                                      cand_breaks2,
                                      sv_type,
                                      chimera_rates,
                                      phase_set1,
                                      phase_set2,
                                      bc_phase_sets1,
                                      bc_phase_sets2,
                                      max_insert,
                                      ins_logsf_fun,
                                      em_iters=args.em_iters)

        ((no_sv_max, sv_max, het_sv_max), max_locus, zygosity, max_hap,
         prior_hap_probs, hap_probs, support) = res

        lr = sv_max - no_sv_max if max_hap is None else het_sv_max - no_sv_max

        hap_probs1 = hap_probs[:, 0:2]
        hap_probs2 = hap_probs[:, 2:]

        new_call = sv_call.SvCall.from_em_results(
            row.chrom1, row.chrom2, phase_set1, phase_set2,
            (no_sv_max, sv_max, het_sv_max), max_locus,
            sv_call._SvType(sv_type, ('.', '.')), zygosity, max_hap, support,
            (hap_probs1, hap_probs2, None))

        # the break interval is inclusive
        if lr >= args.min_lr and new_call.qual >= args.min_qv and new_call.break2[
                0] - new_call.break1[1] + 1 >= args.min_sv_len:
            pass_calls.append(new_call)
        else:
            # Leave breakpoints unchanged
            new_call.break1 = (row.start1, row.stop1)
            new_call.break2 = (row.start2, row.stop2)
            non_pass_calls.append(new_call)

    out_df = sv_call.SvCall.svs_to_dataframe(pass_calls)
    tk_sv_io.write_sv_df_to_bedpe(out_df, outs.sv_calls)

    out_df = sv_call.SvCall.svs_to_dataframe(non_pass_calls)
    tk_sv_io.write_sv_df_to_bedpe(out_df, outs.non_pass_sv_calls)
    in_bam.close()
    frag_phasing.close()
Example #23
def main(args, outs):
    if not isfile(args.fragment_histogram) \
        or not isfile(args.barcode_blacklist) or not isfile(args.coverage):
        martian.log_info(
            'One or more files needed for SV-calling are missing. No calls will be made.'
        )
        return

    in_bam = tk_bam.create_bam_infile(args.input)
    genome_size = np.sum(np.array(in_bam.lengths))

    frag_hist_file = args.fragment_histogram
    barcode_blacklist_file = args.barcode_blacklist

    if args.targets is None:
        martian.exit('You should use CALL_STRUCTVARS for WGS samples.')
    else:
        target_regions = sv_utils.bed_to_region_map(args.targets, merge=True)
        target_coverage = sv_utils.region_cum_coverage_map(
            target_regions, TARGET_COV_BIN)
        link_distance = SV_FRAGMENT_LINK_DISTANCE_TARGET
        with open(args.coverage, 'r') as f:
            cov_sum = json.load(f)['target_info']
        if 'on_target_bases' in cov_sum:
            prob_off_target = 1 - cov_sum['on_target_bases'] / float(
                cov_sum['total_bases'])
        else:
            prob_off_target = 0.001
        corr_factor = sv_stats.off_target_amp_corr_factor(
            target_regions, prob_off_target, genome_size=genome_size)

    res = sv_stats.get_frag_data(frag_hist_file,
                                 barcode_blacklist_file,
                                 min_frag_size=0,
                                 frag_size_prc_cutoff=args.frag_size_prc)
    frag_sizes, frag_counts, frag_prc, blacklist_barcodes = res

    frag_phasing = tk_tabix.create_tabix_infile(args.fragment_phasing)

    min_sv_len = int(max(0.8 * args.min_call_dist, link_distance))
    if not frag_prc is None and not args.targets is None:
        min_sv_len = int(max(min_sv_len, frag_prc))
    martian.log_info('Calling SVs with min length: {:d}'.format(min_sv_len))

    fragment_df = tk_hdf5.read_data_frame(args.fragments,
                                          query_cols=['obs_len', 'num_reads'])
    fragment_df = fragment_df[
        fragment_df.num_reads > MIN_READS_PER_FRAG_TARGET]
    alpha = np.median(
        np.array(fragment_df['num_reads']) /
        np.array(fragment_df['obs_len'], dtype=np.float))
    martian.log_info('Using alpha = {}'.format(alpha))

    summary = {}
    summary['min_sv_len'] = min_sv_len
    with open(outs.summary, 'w') as out_fn:
        out_fn.write(tenkit.safe_json.safe_jsonify(summary, pretty=True))

    model = sv_stats.FragModel(frag_sizes,
                               frag_counts,
                               blacklist_barcodes,
                               target_coverage,
                               cov_bin=TARGET_COV_BIN,
                               corr_factor=corr_factor,
                               genome_size=genome_size,
                               target_regions=target_regions,
                               alpha=alpha,
                               p_ov_mol=args.p_ov_mol)

    with open(args.overlap_loci, 'rb') as f:
        overlap_loci = cPickle.load(f)

    overlap_loci = [
        overlap_loci[i] for i in range(int(args.start_idx), int(args.stop_idx))
    ]

    final_res = []

    for i, (c1, s1, e1, c2, s2, e2) in enumerate(overlap_loci):
        frags1, frags2 = get_frags_from_reads(
            in_bam,
            c1,
            max(0, s1 - args.break_ext),
            e1 + args.break_ext,
            c2,
            max(0, s2 - args.break_ext),
            e2 + args.break_ext,
            min_mapq=args.min_mapq,
            min_sv_len=SV_FRAGMENT_LINK_DISTANCE_TARGET,
            min_frag_size=args.min_frag_size,
            min_reads_per_frag=args.min_reads_per_frag)

        bc_set = set(frags1.bc).union(set(frags2.bc))

        ps1 = sv_utils.get_phase_set(frag_phasing, c1, s1, e1)
        ps2 = sv_utils.get_phase_set(frag_phasing, c2, s2, e2)
        bc_phase_set_dict1 = sv_utils.get_barcode_phase_probs(frag_phasing,
                                                              c1,
                                                              s1,
                                                              e1,
                                                              bc_set,
                                                              in_ps=ps1)
        bc_phase_set_dict2 = sv_utils.get_barcode_phase_probs(frag_phasing,
                                                              c2,
                                                              s2,
                                                              e2,
                                                              bc_set,
                                                              in_ps=ps2)

        print >> sys.stderr, 'Evaluating locus', c1, s1, e1, c2, s2, e2

        res = model.eval_sv(frags1,
                            frags2, (c1, s1, e1), (c2, s2, e2),
                            min_dist=min_sv_len,
                            ps1=ps1,
                            ps2=ps2,
                            phase_set_dict1=bc_phase_set_dict1,
                            phase_set_dict2=bc_phase_set_dict2,
                            grid_len=args.grid_len)

        if not res is None and res.qual >= args.sv_min_qv:
            final_res.append(res)

    in_bam.close()

    out_df = sv_call.SvCall.svs_to_dataframe(final_res)
    sv_io.write_sv_df_to_bedpe(out_df, outs.sv_variants)
Example #24
def main(args, outs):

    bedpe_df = tk_sv_io.read_sv_bedpe_to_df(args.sv_variants)
    bedpe_df = tk_sv_utils.get_dataframe_loc(
        bedpe_df, list(range(int(args.start_idx), int(args.stop_idx))))

    max_insert, ins_logsf_fun = tk_sv_utils.get_insert_size_info(
        args.insert_sizes)
    if max_insert is None:
        martian.throw('No Q60 reads')

    with open(args.basic_summary, 'r') as f:
        summary = json.load(f)

    chimera_rate_del = summary['far_chimera_rate']
    chimera_rate_inv = summary['far_chimera_rate'] + summary[
        'same_dir_chimera_rate']
    chimera_rate_trans = summary['far_chimera_rate']
    chimera_rate_dup = summary['far_chimera_rate'] + summary[
        'outward_dir_chimera_rate']

    chimera_rates = {
        tk_readpairs.DEL_STR: chimera_rate_del,
        tk_readpairs.INV_STR: chimera_rate_inv,
        tk_readpairs.TDUP_STR: chimera_rate_dup,
        tk_readpairs.TRANS_FF_STR: chimera_rate_trans,
        tk_readpairs.TRANS_FR_STR: chimera_rate_trans,
        tk_readpairs.TRANS_RR_STR: chimera_rate_trans,
        tk_readpairs.TRANS_RF_STR: chimera_rate_trans
    }

    in_bam = tk_bam.create_bam_infile(args.input)

    out_quals = []
    out_infos = []
    out_chroms1 = []
    out_starts1 = []
    out_stops1 = []
    out_chroms2 = []
    out_starts2 = []
    out_stops2 = []

    for i, (_, row) in enumerate(bedpe_df.iterrows()):
        in_svt = tk_sv_io.get_sv_type(row.info)

        if row.chrom1 == row.chrom2:
            max_ext = min(args.break_extend, int(
                (row.start2 - row.stop1) / 3.0))
            r1 = (max(0, row.start1 - args.break_extend), row.stop1 + max_ext)
            r2 = (max(0, row.start2 - max_ext), row.stop2 + args.break_extend)
            if r1[1] > r2[0]:
                starts = [r1[0]]
                stops = [r2[1]]
                chroms = [row.chrom1]
            else:
                starts = [r1[0], r2[0]]
                stops = [r1[1], r2[1]]
                chroms = [row.chrom1, row.chrom2]
        else:
            r1 = (max(0, row.start1 - args.break_extend),
                  row.stop1 + args.break_extend)
            r2 = (max(0, row.start2 - args.break_extend),
                  row.stop2 + args.break_extend)
            starts = [r1[0], r2[0]]
            stops = [r1[1], r2[1]]
            chroms = [row.chrom1, row.chrom2]

        readpairs = tk_readpairs.get_readpairs2(in_bam,
                                                chroms,
                                                starts,
                                                stops,
                                                max_insert=max_insert,
                                                min_mapq=args.min_mapq)

        # Distal readpairs across the breakpoints
        dist_readpairs = filter(
            filter_fun(in_bam, row.chrom1, row.chrom2, r1, r2), readpairs)

        if len(dist_readpairs) > MAX_READPAIRS:
            sel = numpy.random.choice(len(dist_readpairs), MAX_READPAIRS)
        else:
            sel = np.arange(len(dist_readpairs))
        dist_readpairs = [dist_readpairs[ridx] for ridx in sel]

        res_arr = tk_readpairs.get_sv_lr(dist_readpairs, ins_logsf_fun,
                                         max_insert, chimera_rates)

        if len(res_arr) == 0:
            out_quals.append(row.qual)
            out_chroms1.append(row.chrom1)
            out_starts1.append(row.start1)
            out_stops1.append(row.stop1)
            out_chroms2.append(row.chrom2)
            out_starts2.append(row.start2)
            out_stops2.append(row.stop2)
            out_infos.append(row['info'])
        else:
            if args.best_only:
                res_arr = sorted(res_arr, key=lambda x: x[0], reverse=True)
                res_arr = [res_arr[0]]

            for (lr, num_split, num_pairs, sv_len, support_range, svt,
                 support_readpairs) in res_arr:
                range1, range2 = support_range
                if num_split + num_pairs >= args.min_reads_to_call and lr >= args.min_lr_to_call and not range1 is None and not range2 is None:
                    out_quals.append(row.qual + args.rp_lr_multiplier * lr)
                    out_chroms1.append(row.chrom1)
                    out_starts1.append(range1[0])
                    out_stops1.append(range1[1])
                    out_chroms2.append(row.chrom2)
                    out_starts2.append(range2[0])
                    out_stops2.append(range2[1])
                    if svt != in_svt and in_svt != 'TRANS':
                        in_svt = 'UNK'
                else:
                    out_quals.append(row.qual)
                    out_chroms1.append(row.chrom1)
                    out_starts1.append(row.start1)
                    out_stops1.append(row.stop1)
                    out_chroms2.append(row.chrom2)
                    out_starts2.append(row.start2)
                    out_stops2.append(row.stop2)

                out_infos.append(
                    tk_sv_io.update_info(
                        row['info'],
                        ['NPAIRS', 'NSPLIT', 'RP_LR', 'RP_TYPE', 'TYPE'],
                        [num_pairs, num_split, lr, svt, in_svt]))

    in_bam.close()

    if args.best_only:
        out_names = [n for n in bedpe_df['name']]
    else:
        out_names = np.arange(len(bedpe_df))

    out_df = tk_sv_io.create_sv_df(out_chroms1, out_starts1, out_stops1,
                                   out_chroms2, out_starts2, out_stops2,
                                   out_names, out_quals, out_infos)
    tk_sv_io.write_sv_df_to_bedpe(out_df, outs.sv_variants)