from collections import OrderedDict
from copy import deepcopy

import numpy as np
import pandas as pd
from tqdm import tqdm

# NOTE: the remaining helpers used below (flatten, unflatten, SeqMutationTree,
# SeqNode, get_scores, resize_interval, FastaExtractor, moving_average,
# trim_seq, to_neg, flatten_list) are assumed to be provided by the
# surrounding package.


def single_motif_sim(bpnet,
                     motif,
                     max_hamming_dist=1,
                     center_coords=[450, 550],
                     repeat=128,
                     importance=['count', 'weighted']):
    """Explore the space of different motif mutations

    Args:
      bpnet: BPNet model
      motif: motif to mutate
      max_hamming_dist: maximum hamming distance to explore
      center_coords: [start, end] of the central window in which
        to compare the predictions
      repeat: how many simulations to run
      importance: list of importance scores to compute

    Returns:
      dictionary with key=hamming distance from `motif` and value=list of
      dicts (one per sequence that hamming distance away from `motif`),
      each holding the sequence and its scores relative to `motif`
    """
    seq_by_depth = SeqMutationTree.create(
        SeqNode(motif),
        max_hamming_distance=max_hamming_dist).get_seq_by_depth()
    out = {}
    tasks = bpnet.tasks
    seqlen = bpnet.input_seqlen()

    motif = seq_by_depth[0][0]
    # get the reference prediction
    ref_pred = unflatten(
        bpnet.sim_pred(seq_by_depth[0][0], repeat=repeat,
                       importance=importance), "/")
    out[0] = [{"seq": motif,
               "scores": get_scores(ref_pred, ref_pred, tasks,
                                    motif, seqlen, center_coords)}]
    for hamming_dist in range(1, len(seq_by_depth)):
        # prepare the output list
        out[hamming_dist] = [None] * len(seq_by_depth[hamming_dist])

        # loop through all the sequences
        for i, alt_seq in enumerate(tqdm(seq_by_depth[hamming_dist])):
            # get the predictions for the current sequence
            alt_pred = unflatten(
                bpnet.sim_pred(alt_seq, repeat=repeat,
                               importance=importance), "/")

            # compare ref and alt predictions
            out[hamming_dist][i] = {
                "seq": alt_seq,
                "scores": get_scores(ref_pred, alt_pred, tasks,
                                     alt_seq, seqlen, center_coords)
            }
    return out
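
# Hypothetical usage sketch (not part of the original module): `bpnet` is
# assumed to be a loaded BPNet model and the motif string is a stand-in.
def _demo_single_motif_sim(bpnet):
    """Score every single-base mutant of a motif against the reference."""
    out = single_motif_sim(bpnet, 'TTGACTCA', max_hamming_dist=1, repeat=32)
    ref = out[0][0]    # reference motif and its self-comparison scores
    mutants = out[1]   # all sequences one substitution away, with ref-vs-alt scores
    return ref, mutants
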
# This method appears to have been extracted from a dict-like container class
# (with `data` and `attrs` attributes) whose definition is not part of this
# section. It is kept inside a stub class here so that it does not shadow the
# module-level `unflatten` helper used below. The original body returned
# `super().__init__(...)`, which always evaluates to None; constructing a new
# instance of the same class is presumably what was intended.
class _FlatContainerStub:
    def unflatten(self):
        return self.__class__(unflatten(self.data), attrs=deepcopy(self.attrs))
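
# The flatten/unflatten helpers used throughout this module convert between a
# nested dict and a flat dict with "/"-joined keys. A minimal sketch of that
# assumed contract:
def _demo_flatten_roundtrip():
    nested = {'task1': {'profile': 1, 'counts': 2}}
    flat = flatten(nested, '/')    # {'task1/profile': 1, 'task1/counts': 2}
    assert unflatten(flat, '/') == nested
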
def generate_sim(bpnet,
                 central_motif,
                 side_motif,
                 side_distances,
                 center_coords=[450, 550],
                 repeat=128,
                 importance=['count', 'weighted'],
                 correct=False):
    outl = []
    tasks = bpnet.tasks
    seqlen = bpnet.input_seqlen()
    # ref_preds = sim_pred(model, central_motif)
    ref_preds = unflatten(
        bpnet.sim_pred(central_motif, repeat=repeat,
                       importance=importance), "/")
    none_preds = unflatten(
        bpnet.sim_pred('', '', [], repeat=repeat,
                       importance=importance), "/")
    alt_profiles = []
    for dist in tqdm(side_distances):
        # alt_preds = sim_pred(model, central_motif, side_motif, [dist])
        # NOTE: bpnet.sim_pred already averages the predictions
        alt_preds = unflatten(
            bpnet.sim_pred(central_motif, side_motif, [dist],
                           repeat=repeat, importance=importance), "/")
        if correct:
            # Correct for the 'shoulder' effect
            #
            # this performs: AB - (B - 0)
            # where:
            # - AB: contains both the central and the side motif
            # - B : contains only the side motif
            # - 0 : doesn't contain any motif
            edge_only_preds = unflatten(
                bpnet.sim_pred('', side_motif, [dist],
                               repeat=repeat, importance=importance), "/")

            alt_preds_f = flatten(alt_preds, '/')
            # ref_preds_f = flatten(ref_preds, '/')
            none_preds_f = flatten(none_preds, "/")
            # subtract the side-motif-only counts
            alt_preds = unflatten(
                {k: alt_preds_f[k] - v + none_preds_f[k]
                 for k, v in flatten(edge_only_preds, "/").items()}, "/")
            # ref_preds = unflatten({k: ref_preds_f[k] - v for k, v in flatten(none_preds, "/").items()}, "/")

        alt_profiles.append((dist, alt_preds))

        # get_scores normalizes the score by `A`, finally yielding:
        # (AB - B + 0) / A
        scores = get_scores(ref_preds, alt_preds, tasks,
                            central_motif, seqlen, center_coords)

        # compute the distance metrics
        for task in bpnet.tasks:
            d = scores[task]

            # book-keeping
            d['task'] = task
            d['central_motif'] = central_motif
            d['side_motif'] = side_motif
            d['position'] = dist
            d['distance'] = dist - seqlen // 2
            outl.append(d)
    return pd.DataFrame(outl), alt_profiles
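
# Hypothetical usage sketch: scan a side motif across positions to the right
# of a fixed central motif and collect per-task summary scores. The motif
# strings and the distance range are stand-ins.
def _demo_generate_sim(bpnet):
    seqlen = bpnet.input_seqlen()
    center = seqlen // 2
    dfm, alt_profiles = generate_sim(bpnet,
                                     central_motif='TTGACTCA',
                                     side_motif='GGGGCTTTTC',
                                     side_distances=range(center + 10, center + 150, 10),
                                     correct=True)
    # dfm holds one row per (task, side-motif position) with effect scores;
    # alt_profiles holds the (distance, predictions) pairs for plotting
    return dfm, alt_profiles
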
def interval_predict(bpnet, dataspec, interval, tasks,
                     smooth_obs_n=0, neg_rev=True, incl_pred=False):
    input_seqlen = (1000 - bpnet.body.get_len_change()
                    - bpnet.heads[0].net.get_len_change())
    int_len = interval.end - interval.start
    if int_len != input_seqlen:
        print(f"resizing the interval of length {int_len} to {input_seqlen}")
        interval = resize_interval(interval, input_seqlen)

    # fetch the sequence
    fe = FastaExtractor(dataspec.fasta_file)
    seq = fe([interval])

    # fetch read counts
    obs = {task: dataspec.task_specs[task].load_counts([interval])[0]
           for task in tasks}
    if smooth_obs_n > 0:
        obs = {k: moving_average(v, n=smooth_obs_n) for k, v in obs.items()}

    # TODO: use a dedicated function to get the right trimming
    trim_i, trim_j = trim_seq(input_seqlen, 1000)

    # compute importance scores
    imp_scores = bpnet.imp_score_all(seq, preact_only=True)

    # make predictions
    # x = bpnet.neutral_bias_inputs(1000, 1000)
    # x['seq'] = seq
    # preds = bpnet.predict(x)

    # compile everything into a single ordered dict
    if incl_pred:
        preds = bpnet.predict(seq)

        def proc_pred(preds, task, neg_rev):
            # scale the profile probabilities by the predicted total counts
            out = preds[f"{task}/profile"][0] * np.exp(preds[f"{task}/counts"][0])
            if neg_rev:
                return to_neg(out)
            else:
                return out

        viz_dict = OrderedDict(flatten_list([
            [(f"{task} Obs", to_neg(obs[task]) if neg_rev else obs[task]),
             (f"{task} Pred", proc_pred(preds, task, neg_rev)),
             # (f"{task} Imp counts", sum(pred['grads'][task_idx]['counts'].values()) / 2 * seq),
             ] +
            [(f"{task} Imp profile", (v * seq)[0])
             for imp_score, v in unflatten(imp_scores, "/")[task]['profile'].items()
             if imp_score == 'wn']
            for task_idx, task in enumerate(tasks)
        ]))
    else:
        viz_dict = OrderedDict(flatten_list([
            [  # (f"{task} Pred", to_neg(preds[f"{task}/profile"][0])),
             (f"{task} Obs", to_neg(obs[task]) if neg_rev else obs[task]),
             # (f"{task} Imp counts", sum(pred['grads'][task_idx]['counts'].values()) / 2 * seq),
             ] +
            [(f"{task} Imp profile", (v * seq)[0])
             for imp_score, v in unflatten(imp_scores, "/")[task]['profile'].items()
             if imp_score == 'wn']
            for task_idx, task in enumerate(tasks)
        ]))
    return viz_dict, seq, imp_scores
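
# Hypothetical usage sketch: assemble observed counts, model predictions and
# importance-score tracks for one genomic interval, ready for track plotting.
# `dataspec` is assumed to be a loaded DataSpec and `interval` an object with
# `start`/`end` attributes (e.g. a pybedtools Interval).
def _demo_interval_predict(bpnet, dataspec, interval):
    viz_dict, seq, imp_scores = interval_predict(bpnet, dataspec, interval,
                                                 tasks=bpnet.tasks,
                                                 smooth_obs_n=5,
                                                 incl_pred=True)
    return viz_dict  # OrderedDict of named tracks: Obs, Pred, Imp profile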