示例#1
0
def preprocessing_function(nucleotide_sequence: str) -> np.ndarray:
    """One-hot encode a single nucleotide sequence.

    The kipoi deepbind model does not accept a raw string of nucleotides,
    so the sequence is wrapped in a one-element list and passed to
    `encodeDNA`.

    Args:
        nucleotide_sequence (str): sequence to encode; defined to be of
            length 101, though other lengths might be accepted.

    Returns:
        np.ndarray: of shape `[1, len(nucleotide_sequence), 4]`
    """
    single_sequence_batch = [nucleotide_sequence]
    return encodeDNA(single_sequence_batch)
示例#2
0
def onehot_dinucl_shuffle(seqs):
    """Di-nucleotide shuffle one-hot encoded sequences.

    `seqs` is converted to strings with `one_hot2string` (using the `DNA`
    vocabulary), every string is passed through `dinuc_shuffle`, and the
    shuffled strings are encoded again with `encodeDNA`.
    """
    sequence_strings = one_hot2string(seqs, vocab=DNA)
    shuffled_strings = list(map(dinuc_shuffle, sequence_strings))
    return encodeDNA(shuffled_strings)
示例#3
0
def encode_dna(seq):
    """Encode `seq` by delegating to `encodeDNA` and return its result."""
    encoded = encodeDNA(seq)
    return encoded
示例#4
0
def data(n_bases=10,
         spline_order=3,
         pos_class_weight=1.0,
         truncate=True,
         encode_splines=True,
         cache=True,
         minmax_scale=False):
    """Load the branchpoint train/test tables and assemble model inputs.

    Reads `wide_data.csv` from the `train` and `test` sub-directories of
    `BR_DATA`, encodes the sequence column with `encodeDNA`, obtains the
    secondary-structure features through `get_structure`, and prepares the
    positional features listed in `pos_columns`.

    Args:
        n_bases: forwarded to `encodeSplines_common`; only used when
            `encode_splines` is true.
        spline_order: forwarded to `encodeSplines_common`; only used when
            `encode_splines` is true.
        pos_class_weight: sample weight assigned where `y_train == 1`;
            every other position gets weight 1.
        truncate: if true, pass every positional feature through
            `truncate_values` before further processing.
        encode_splines: if true, encode positional features with
            `encodeSplines_common`; otherwise NaN-fill them and append a
            trailing axis.
        cache: forwarded to `get_structure`.
        minmax_scale: only honoured when `encode_splines` is false. Fits one
            `MinMaxScaler` per positional feature on the training data and
            applies it to both the training and the test features.

    Returns:
        tuple: `(train, test)` where `train` is
        `(x_train, y_train, sample_weight, pos_columns,
        get_branchpoint_pwm_list(), position_stats, minmax_scalers)` and
        `test` is `(x_test, y_test)`. `minmax_scalers` is `None` unless
        scalers were fitted.
    """
    dtw_train = pd.read_csv(BR_DATA + "/train/wide_data.csv")
    dtw_test = pd.read_csv(BR_DATA + "/test/wide_data.csv")

    # replace some column names (same mapping for both tables)
    renamed_columns = {"dist.1": "dist1", "dist.2": "dist2"}
    dtw_train.rename(columns=renamed_columns, inplace=True)
    dtw_test.rename(columns=renamed_columns, inplace=True)

    # NOTE(review): the training labels pass an extra `-1` argument to
    # col2num_array that the test labels do not - confirm this is intended.
    y_train = col2num_array(dtw_train["is_branchpoint"], -1)
    x_train_pos = {col: col2num_array(dtw_train[col]) for col in pos_columns}
    x_train_seq = encodeDNA(dtw_train["seq"])
    y_test = col2num_array(dtw_test["is_branchpoint"])
    x_test_pos = {col: col2num_array(dtw_test[col]) for col in pos_columns}
    x_test_seq = encodeDNA(dtw_test["seq"])

    # get secondary structure - it is precomputed on a larger sequence
    # window ("seq_wide") and then trimmed back to the "seq" window.
    # Sanity check: dropping 100 characters on each side of the wide
    # sequence must give back the plain sequence.
    assert dtw_train["seq_wide"][0][100:-100] == dtw_train["seq"][
        0]  # correct values
    x_train_struct = get_structure(dtw_train["seq_wide"],
                                   BR_DATA + "/train/rna_structure.npy", cache)
    x_train_struct = x_train_struct[:, 100:-100]
    x_test_struct = get_structure(dtw_test["seq_wide"],
                                  BR_DATA + "/test/rna_structure.npy", cache)
    # Trim the *test* structure. Slicing x_train_struct here would discard
    # the test features and re-trim the already trimmed training array.
    x_test_struct = x_test_struct[:, 100:-100]

    if truncate:
        x_train_pos = {
            k: truncate_values(k, v)
            for k, v in x_train_pos.items()
        }
        x_test_pos = {k: truncate_values(k, v) for k, v in x_test_pos.items()}

    # Ranges computed only on the training set (hence passed twice)
    position_stats = get_minmax_range(x_train_pos, x_train_pos)

    minmax_scalers = None
    if encode_splines:
        x_train_pos_spl = encodeSplines_common(x_train_pos, position_stats,
                                               n_bases, spline_order)
        x_test_pos_spl = encodeSplines_common(x_test_pos, position_stats,
                                              n_bases, spline_order)
    else:
        # NaN-fill the raw positional features (presumably with 0, the
        # fill_nan default - TODO confirm) and append a trailing axis.
        x_train_pos_spl = {
            k: fill_nan(v)[:, :, np.newaxis]
            for k, v in x_train_pos.items()
        }
        x_test_pos_spl = {
            k: fill_nan(v)[:, :, np.newaxis]
            for k, v in x_test_pos.items()
        }
        if minmax_scale:
            # Scalers are fitted on the training features only and then
            # applied to both splits.
            minmax_scalers = {
                k: MinMaxScaler().fit(v.reshape(-1, 1))
                for k, v in x_train_pos_spl.items()
            }
            x_train_pos_spl = {
                k: minmax_scalers[k].transform(v.reshape(
                    (-1, 1))).reshape(v.shape)
                for k, v in x_train_pos_spl.items()
            }
            x_test_pos_spl = {
                k: minmax_scalers[k].transform(v.reshape(
                    (-1, 1))).reshape(v.shape)
                for k, v in x_test_pos_spl.items()
            }

    x_train = merge_dicts(x_train_pos_spl, {
        "seq": x_train_seq,
        "struct": x_train_struct
    })
    x_test = merge_dicts(x_test_pos_spl, {
        "seq": x_test_seq,
        "struct": x_test_struct
    })

    # NaN labels become -1; a trailing axis is appended to the label arrays
    y_train = fill_nan(y_train, -1)[:, :, np.newaxis]
    y_test = fill_nan(y_test, -1)[:, :, np.newaxis]

    # todo update return
    # Positive positions get pos_class_weight, everything else weight 1;
    # the trailing axis added above is squeezed away again.
    sample_weight = np.squeeze(np.where(y_train == 1, pos_class_weight, 1), -1)
    return (x_train, y_train, sample_weight, pos_columns,
            get_branchpoint_pwm_list(), position_stats,
            minmax_scalers), (x_test, y_test)