Example #1
from tqdm import tqdm
# `unflatten` (a nested-dict helper, e.g. from kipoi_utils.utils),
# SeqMutationTree, SeqNode and get_scores are assumed to come from the
# surrounding module.


def single_motif_sim(bpnet,
                     motif,
                     max_hamming_dist=1,
                     center_coords=[450, 550],
                     repeat=128,
                     importance=['count', 'weighted']):
    """Explore the space of different motif mutations

    Args:
      bpnet: BPNet
      motif: which motif to use
      max_hamming_dist: maximum hamming distance to compute
      repeat: how many simulations to run for
      importance: list of importance scores to compute

    Returns:
      dictionary with key=hamming distance from `motif`, values= list of all possible
        sequences `hamming distance` away from `motif`
    """
    seq_by_depth = SeqMutationTree.create(
        SeqNode(motif),
        max_hamming_distance=max_hamming_dist).get_seq_by_depth()
    out = {}
    tasks = bpnet.tasks
    seqlen = bpnet.input_seqlen()
    motif = seq_by_depth[0][0]  # the tree root is the original motif

    # get the reference value
    ref_pred = unflatten(
        bpnet.sim_pred(seq_by_depth[0][0],
                       repeat=repeat,
                       importance=importance), "/")
    out[0] = [{
        "seq": motif,
        "scores": get_scores(ref_pred, ref_pred, tasks, motif, seqlen,
                             center_coords)
    }]

    for hamming_dist in range(1, len(seq_by_depth)):

        # prepare the output list
        out[hamming_dist] = [None] * len(seq_by_depth[hamming_dist])

        # loop through all the sequences
        for i, alt_seq in enumerate(tqdm(seq_by_depth[hamming_dist])):
            # get the predictions for the current sequence
            alt_pred = unflatten(
                bpnet.sim_pred(alt_seq, repeat=repeat, importance=importance),
                "/")

            # compare ref and alt predictions
            out[hamming_dist][i] = {
                "seq": alt_seq,
                "scores": get_scores(ref_pred, alt_pred, tasks, alt_seq,
                                     seqlen, center_coords)
            }
    return out
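A minimal usage sketch, assuming `bpnet` is the same model wrapper used above; the motif string is illustrative, not from the source:

# Hypothetical example: score every single-base mutant of a motif.
results = single_motif_sim(bpnet, "TTTGCATAACAA", max_hamming_dist=1)
ref = results[0][0]            # the unmutated reference motif
for entry in results[1]:       # all sequences one mutation away
    print(entry["seq"], entry["scores"])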
Example #2
def unflatten(self):
    # `__init__` returns None, so the original `return super().__init__(...)`
    # always returned None. Build and return a new instance instead
    # (assumes the constructor signature is `(data, attrs=...)`).
    return type(self)(unflatten(self.data), attrs=deepcopy(self.attrs))
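For context, a minimal sketch of the kind of container this method would live on; the class name and constructor signature are assumptions, not from the source:

from copy import deepcopy


class NestedDataWrapper:  # hypothetical container
    def __init__(self, data, attrs=None):
        self.data = data              # possibly flat dict, e.g. {"a/b": 1}
        self.attrs = attrs or {}

    def unflatten(self):
        # return a NEW wrapper holding the nested version of `data`,
        # using the same `unflatten` helper as in the other examples
        return type(self)(unflatten(self.data), attrs=deepcopy(self.attrs))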
Example #3
import pandas as pd
from tqdm import tqdm
# `flatten`/`unflatten` and `get_scores` are assumed to come from the
# surrounding module (see Example #1).


def generate_sim(bpnet,
                 central_motif,
                 side_motif,
                 side_distances,
                 center_coords=[450, 550],
                 repeat=128,
                 importance=['count', 'weighted'],
                 correct=False):
    """Simulate the effect of placing `side_motif` at different distances
    from `central_motif`.

    Returns:
      (pd.DataFrame with one row per task and distance,
       list of (distance, alt_preds) tuples)
    """
    outl = []
    tasks = bpnet.tasks
    seqlen = bpnet.input_seqlen()
    # ref_preds = sim_pred(model, central_motif)
    ref_preds = unflatten(
        bpnet.sim_pred(central_motif, repeat=repeat, importance=importance),
        "/")
    # predictions for a sequence without any motif ("0" in the comments below)
    none_preds = unflatten(
        bpnet.sim_pred('', '', [], repeat=repeat, importance=importance), "/")

    alt_profiles = []
    for dist in tqdm(side_distances):
        # alt_preds = sim_pred(model, central_motif, side_motif, [dist])

        # Note: bpnet.sim_pred already averages the predictions
        alt_preds = unflatten(
            bpnet.sim_pred(central_motif,
                           side_motif, [dist],
                           repeat=repeat,
                           importance=importance), "/")
        if correct:
            # Correct for the 'shoulder' effect
            #
            # this performs: AB - (B - 0)
            # Where:
            # - AB: contains both, central and side_motif
            # - B : contains only side_motif
            # - 0 : doesn't contain any motif
            edge_only_preds = unflatten(
                bpnet.sim_pred('',
                               side_motif, [dist],
                               repeat=repeat,
                               importance=importance), "/")

            alt_preds_f = flatten(alt_preds, '/')
            # ref_preds_f = flatten(ref_preds, '/')
            none_preds_f = flatten(none_preds, "/")
            # subtract the side-motif-only counts and add back the baseline
            alt_preds = unflatten(
                {
                    k: alt_preds_f[k] - v + none_preds_f[k]
                    for k, v in flatten(edge_only_preds, "/").items()
                }, "/")
            # ref_preds = unflatten({k: ref_preds_f[k] - v  for k,v in flatten(none_preds, "/").items()}, "/")
        alt_profiles.append((dist, alt_preds))

        # This normalizes the score by `A` finally yielding:
        # (AB - B + 0) / A
        scores = get_scores(ref_preds, alt_preds, tasks, central_motif, seqlen,
                            center_coords)

        # compute the distance metrics
        for task in bpnet.tasks:
            d = scores[task]

            # book-keeping
            d['task'] = task
            d['central_motif'] = central_motif
            d['side_motif'] = side_motif
            d['position'] = dist
            d['distance'] = dist - seqlen // 2

            outl.append(d)

    return pd.DataFrame(outl), alt_profiles
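A hedged usage sketch; the motifs and positions are illustrative, not from the source:

# Hypothetical example: scan a side motif downstream of the sequence
# center (position seqlen // 2 = 500 for a 1 kb input).
df, profiles = generate_sim(bpnet,
                            central_motif="TTTGCATAACAA",  # illustrative
                            side_motif="TGACTCA",          # illustrative
                            side_distances=range(510, 700, 10),
                            correct=True)
print(df[["task", "position", "distance"]].head())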
Example #4
import numpy as np
from collections import OrderedDict
# FastaExtractor is assumed to come from genomelake.extractors;
# resize_interval, moving_average, trim_seq, to_neg, flatten_list and
# unflatten are helpers from the surrounding module.


def interval_predict(bpnet,
                     dataspec,
                     interval,
                     tasks,
                     smooth_obs_n=0,
                     neg_rev=True,
                     incl_pred=False):
    input_seqlen = (1000 - bpnet.body.get_len_change()
                    - bpnet.heads[0].net.get_len_change())

    int_len = interval.end - interval.start
    if int_len != input_seqlen:
        print(f"resizing the interval of length {int_len} to {input_seqlen}")
        interval = resize_interval(interval, input_seqlen)

    # fetch the sequence
    fe = FastaExtractor(dataspec.fasta_file)
    seq = fe([interval])
    # Fetch read counts
    obs = {
        task: dataspec.task_specs[task].load_counts([interval])[0]
        for task in tasks
    }
    if smooth_obs_n > 0:
        obs = {k: moving_average(v, n=smooth_obs_n) for k, v in obs.items()}

    # TODO: add a helper that computes the right trimming
    trim_i, trim_j = trim_seq(input_seqlen, 1000)

    # Compute importance scores
    imp_scores = bpnet.imp_score_all(seq, preact_only=True)

    # Make predictions
    # x = bpnet.neutral_bias_inputs(1000, 1000)
    # x['seq'] = seq
    # preds = bpnet.predict(x)

    # Compile everything into a single ordered dict
    if incl_pred:
        preds = bpnet.predict(seq)

        def proc_pred(preds, task, neg_rev):
            # scale the predicted profile by the predicted total counts
            out = preds[f"{task}/profile"][0] * np.exp(
                preds[f"{task}/counts"][0])
            return to_neg(out) if neg_rev else out

        viz_dict = OrderedDict(
            flatten_list([
                [
                    (f"{task} Obs",
                     to_neg(obs[task]) if neg_rev else obs[task]),
                    (f"{task} Pred", proc_pred(preds, task, neg_rev)),
                    # (f"{task} Imp counts", sum(pred['grads'][task_idx]['counts'].values()) / 2 * seq),
                ] + [(f"{task} Imp profile", (v * seq)[0])
                     for imp_score, v in unflatten(imp_scores, "/")[task]
                     ['profile'].items() if imp_score == 'wn']
                for task_idx, task in enumerate(tasks)
            ]))
    else:
        viz_dict = OrderedDict(
            flatten_list([
                [
                    # (f"{task} Pred", to_neg(preds[f"{task}/profile"][0])),
                    (f"{task} Obs", to_neg(obs[task]) if neg_rev else obs[task]
                     ),
                    # (f"{task} Imp counts", sum(pred['grads'][task_idx]['counts'].values()) / 2 * seq),
                ] + [(f"{task} Imp profile", (v * seq)[0])
                     for imp_score, v in unflatten(imp_scores, "/")[task]
                     ['profile'].items() if imp_score == 'wn']
                for task_idx, task in enumerate(tasks)
            ]))
    return viz_dict, seq, imp_scores
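A hedged usage sketch; the interval, task names and smoothing window are illustrative, and the interval type is assumed to expose `.start`/`.end` (e.g. pybedtools.Interval):

# Hypothetical example: build visualization tracks for one region.
from pybedtools import Interval

interval = Interval("chr1", 10000, 11000)
viz_dict, seq, imp_scores = interval_predict(bpnet, dataspec, interval,
                                             tasks=["Oct4", "Sox2"],
                                             smooth_obs_n=5,
                                             incl_pred=True)
# viz_dict maps track names (e.g. "Oct4 Obs", "Oct4 Pred",
# "Oct4 Imp profile") to arrays ready for a track-plotting helper.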