def preprocessing_function(nucleotide_sequence: str) -> np.ndarray:
    """One-hot encode a single nucleotide sequence.

    The kipoi deepbind model does not accept a raw string of nucleotides,
    so the sequence is wrapped into a one-element batch and handed to
    ``encodeDNA``.

    Args:
        nucleotide_sequence (str): defined to be of length 101, though
            other lengths might be accepted.

    Returns:
        np.ndarray: of shape `[1, len(nucleotide_sequence), 4]`
    """
    # encodeDNA works on a collection of sequences, hence the singleton list.
    single_sequence_batch = [nucleotide_sequence]
    return encodeDNA(single_sequence_batch)
def onehot_dinucl_shuffle(seqs):
    """Di-nucleotide shuffle one-hot encoded sequences.

    The input is decoded back to strings with ``one_hot2string`` (using the
    ``DNA`` vocabulary), every string is passed through ``dinuc_shuffle``,
    and the shuffled strings are re-encoded with ``encodeDNA``.
    """
    decoded_strings = one_hot2string(seqs, vocab=DNA)
    shuffled_strings = [dinuc_shuffle(sequence) for sequence in decoded_strings]
    return encodeDNA(shuffled_strings)
def encode_dna(seq):
    """Forward ``seq`` unchanged to ``encodeDNA`` and return its result."""
    encoded = encodeDNA(seq)
    return encoded
def data(n_bases=10,
         spline_order=3,
         pos_class_weight=1.0,
         truncate=True,
         encode_splines=True,
         cache=True,
         minmax_scale=False):
    """Load the branchpoint train/test tables and build the model inputs.

    Reads ``wide_data.csv`` from the ``train`` and ``test`` folders under
    ``BR_DATA``, one-hot encodes the sequences, loads the RNA secondary
    structure features and pre-processes the positional features listed in
    ``pos_columns``.

    Args:
        n_bases: forwarded to ``encodeSplines_common``; only used when
            ``encode_splines`` is true.
        spline_order: forwarded to ``encodeSplines_common``; only used when
            ``encode_splines`` is true.
        pos_class_weight: sample weight given to every position where
            ``y_train == 1``; all other positions get weight 1.
        truncate: if true, every positional feature is passed through
            ``truncate_values``.
        encode_splines: if true, positional features are encoded with
            ``encodeSplines_common``; otherwise they are NaN-filled and
            given a trailing axis.
        cache: forwarded to ``get_structure``.
        minmax_scale: only considered when ``encode_splines`` is false; if
            true, one ``MinMaxScaler`` per positional feature is fitted on
            the training values and applied to both splits.

    Returns:
        tuple: ``(train, test)`` where ``train`` is
        ``(x_train, y_train, sample_weight, pos_columns,
        get_branchpoint_pwm_list(), position_stats, minmax_scalers)`` and
        ``test`` is ``(x_test, y_test)``. ``minmax_scalers`` is ``None``
        unless scalers were fitted.
    """
    dtw_train = pd.read_csv(BR_DATA + "/train/wide_data.csv")
    dtw_test = pd.read_csv(BR_DATA + "/test/wide_data.csv")

    # Replace the column names containing a dot.
    renamed_columns = {"dist.1": "dist1", "dist.2": "dist2"}
    for table in (dtw_train, dtw_test):
        table.rename(columns=renamed_columns, inplace=True)

    # NOTE(review): the extra -1 argument is passed for the training labels
    # only; kept as in the original - confirm whether this asymmetry with
    # the test labels is intended.
    y_train = col2num_array(dtw_train["is_branchpoint"], -1)
    x_train_pos = {col: col2num_array(dtw_train[col]) for col in pos_columns}
    x_train_seq = encodeDNA(dtw_train["seq"])

    y_test = col2num_array(dtw_test["is_branchpoint"])
    x_test_pos = {col: col2num_array(dtw_test[col]) for col in pos_columns}
    x_test_seq = encodeDNA(dtw_test["seq"])

    # Secondary structure is precomputed on a wider sequence window
    # ("seq_wide"). Sanity check: trimming 100 characters from each side of
    # the wide sequence must give back the model sequence.
    assert dtw_train["seq_wide"][0][100:-100] == dtw_train["seq"][0]

    x_train_struct = get_structure(dtw_train["seq_wide"],
                                   BR_DATA + "/train/rna_structure.npy",
                                   cache)
    x_train_struct = x_train_struct[:, 100:-100]

    x_test_struct = get_structure(dtw_test["seq_wide"],
                                  BR_DATA + "/test/rna_structure.npy",
                                  cache)
    # Fix: trim the *test* structure. The original sliced x_train_struct
    # here, which discarded the loaded test structure and replaced it with a
    # doubly-trimmed copy of the training structure.
    x_test_struct = x_test_struct[:, 100:-100]

    if truncate:
        x_train_pos = {
            k: truncate_values(k, v) for k, v in x_train_pos.items()
        }
        x_test_pos = {k: truncate_values(k, v) for k, v in x_test_pos.items()}

    # Ranges are computed only on the training set, hence the training
    # features are passed for both arguments.
    position_stats = get_minmax_range(x_train_pos, x_train_pos)

    minmax_scalers = None
    if encode_splines:
        x_train_pos_spl = encodeSplines_common(x_train_pos, position_stats,
                                               n_bases, spline_order)
        x_test_pos_spl = encodeSplines_common(x_test_pos, position_stats,
                                              n_bases, spline_order)
    else:
        # No spline encoding: fill NaNs (presumably with 0, the fill_nan
        # default - TODO confirm) and add a trailing feature axis.
        x_train_pos_spl = {
            k: fill_nan(v)[:, :, np.newaxis] for k, v in x_train_pos.items()
        }
        x_test_pos_spl = {
            k: fill_nan(v)[:, :, np.newaxis] for k, v in x_test_pos.items()
        }

        if minmax_scale:
            # Fit one scaler per feature on the training values only, then
            # apply it to both splits. Values are flattened to a single
            # column for the scaler and restored to their original shape.
            minmax_scalers = {
                k: MinMaxScaler().fit(v.reshape(-1, 1))
                for k, v in x_train_pos_spl.items()
            }
            x_train_pos_spl = {
                k: minmax_scalers[k].transform(v.reshape(
                    (-1, 1))).reshape(v.shape)
                for k, v in x_train_pos_spl.items()
            }
            x_test_pos_spl = {
                k: minmax_scalers[k].transform(v.reshape(
                    (-1, 1))).reshape(v.shape)
                for k, v in x_test_pos_spl.items()
            }

    x_train = merge_dicts(x_train_pos_spl, {
        "seq": x_train_seq,
        "struct": x_train_struct
    })
    x_test = merge_dicts(x_test_pos_spl, {
        "seq": x_test_seq,
        "struct": x_test_struct
    })

    # NaNs in the labels become -1, and a trailing axis is added.
    y_train = fill_nan(y_train, -1)[:, :, np.newaxis]
    y_test = fill_nan(y_test, -1)[:, :, np.newaxis]

    # Positive positions get pos_class_weight, everything else weight 1;
    # the trailing label axis is squeezed away again.
    sample_weight = np.squeeze(np.where(y_train == 1, pos_class_weight, 1),
                               -1)

    # TODO: update the return structure.
    return (x_train, y_train, sample_weight, pos_columns,
            get_branchpoint_pwm_list(), position_stats,
            minmax_scalers), (x_test, y_test)