Example #1
def test_nan_euclidean_distances_infinite_values(X, Y):

    with pytest.raises(ValueError) as excinfo:
        nan_euclidean_distances(X, Y=Y)

    exp_msg = ("Input contains infinity or a value too large for "
               "dtype('float64').")
    assert exp_msg == str(excinfo.value)
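The listing drops this test's parametrization, so X and Y look undefined; in context they are inputs containing np.inf. A minimal self-contained reproduction of the same check (hypothetical input values):

import numpy as np
import pytest
from sklearn.metrics.pairwise import nan_euclidean_distances

X_inf = np.array([[np.inf, 0.0], [1.0, 2.0]])  # np.inf triggers the ValueError
with pytest.raises(ValueError, match="infinity"):
    nan_euclidean_distances(X_inf, Y=X_inf)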
Example #2
def test_nan_euclidean_distances_complete_nan(missing_value):
    X = np.array([[missing_value, missing_value], [0, 1]])

    exp_dist = np.array([[np.nan, np.nan], [np.nan, 0]])

    dist = nan_euclidean_distances(X, missing_values=missing_value)
    assert_allclose(exp_dist, dist)

    dist = nan_euclidean_distances(X, X.copy(), missing_values=missing_value)
    assert_allclose(exp_dist, dist)
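Example #3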
def test_nan_euclidean_distances_one_feature_match_positive(missing_value):
    # The first feature is the only one that is non-nan in both samples,
    # and its value is identical in the two rows, so every distance should
    # be (close to) zero. With squared=True the result must still be
    # non-negative despite floating-point cancellation.
    X = np.array([[-122.27, 648., missing_value, 37.85],
                  [-122.27, missing_value, 2.34701493, missing_value]])

    dist_squared = nan_euclidean_distances(X, missing_values=missing_value,
                                           squared=True)
    assert np.all(dist_squared >= 0)

    dist = nan_euclidean_distances(X, missing_values=missing_value,
                                   squared=False)
    assert_allclose(dist, 0.0)
Example #4
def test_nan_euclidean_distances_2x2(X, X_diag, missing_value):

    exp_dist = np.array([[0., X_diag], [X_diag, 0]])

    dist = nan_euclidean_distances(X, missing_values=missing_value)
    assert_allclose(exp_dist, dist)

    dist_sq = nan_euclidean_distances(
        X, squared=True, missing_values=missing_value)
    assert_allclose(exp_dist**2, dist_sq)

    dist_two = nan_euclidean_distances(X, X, missing_values=missing_value)
    assert_allclose(exp_dist, dist_two)

    dist_two_copy = nan_euclidean_distances(
        X, X.copy(), missing_values=missing_value)
    assert_allclose(exp_dist, dist_two_copy)
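X and X_diag come from parametrization the listing omits. Hypothetical concrete values consistent with the test's contract, chosen so the weighting rule is easy to verify by hand:

import numpy as np
from sklearn.metrics.pairwise import nan_euclidean_distances

X = np.array([[0.0, np.nan], [1.0, 1.0]])
# The rows share only the first of two features, so the weighted distance
# is sqrt(2 / 1 * (1 - 0)**2) = sqrt(2).
X_diag = np.sqrt(2.0)
assert np.allclose(nan_euclidean_distances(X), [[0.0, X_diag], [X_diag, 0.0]])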
Example #5
def test_nan_euclidean_distances_equal_to_euclidean_distance(squared):
    # with no nan values
    rng = np.random.RandomState(1337)
    X = rng.randn(3, 4)
    Y = rng.randn(4, 4)

    normal_distance = euclidean_distances(X, Y=Y, squared=squared)
    nan_distance = nan_euclidean_distances(X, Y=Y, squared=squared)
    assert_allclose(normal_distance, nan_distance)
Example #6
def test_nan_euclidean_distances_not_trival(missing_value):
    X = np.array([[1., missing_value, 3., 4., 2.],
                  [missing_value, 4., 6., 1., missing_value],
                  [3., missing_value, missing_value, missing_value, 1.]])

    Y = np.array([[missing_value, 7., 7., missing_value, 2.],
                  [missing_value, missing_value, 5., 4., 7.],
                  [missing_value, missing_value, missing_value, 4., 5.]])

    # Check for symmetry
    D1 = nan_euclidean_distances(X, Y, missing_values=missing_value)
    D2 = nan_euclidean_distances(Y, X, missing_values=missing_value)

    assert_almost_equal(D1, D2.T)

    # Check with explicit formula and squared=True
    assert_allclose(
        nan_euclidean_distances(X[:1],
                                Y[:1],
                                squared=True,
                                missing_values=missing_value),
        [[5.0 / 2.0 * ((7 - 3)**2 + (2 - 2)**2)]])

    # Check with explicit formula and squared=False
    assert_allclose(
        nan_euclidean_distances(X[1:2],
                                Y[1:2],
                                squared=False,
                                missing_values=missing_value),
        [[np.sqrt(5.0 / 2.0 * ((6 - 5)**2 + (1 - 4)**2))]])

    # Check when Y = X is explicitly passed
    D3 = nan_euclidean_distances(X, missing_values=missing_value)
    D4 = nan_euclidean_distances(X, X, missing_values=missing_value)
    D5 = nan_euclidean_distances(X, X.copy(), missing_values=missing_value)
    assert_allclose(D3, D4)
    assert_allclose(D4, D5)

    # Check copy = True against copy = False
    D6 = nan_euclidean_distances(X, Y, copy=True)
    D7 = nan_euclidean_distances(X, Y, copy=False)
    assert_allclose(D6, D7)
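The explicit-formula checks apply the documented weighting rule: the squared distance over the mutually present coordinates is rescaled by n_features / n_present. Worked out for the first check, where X[0] = [1, nan, 3, 4, 2] and Y[0] = [nan, 7, 7, nan, 2] share only coordinates 2 and 4:

n_features, n_present = 5, 2
sq_dist = (n_features / n_present) * ((7 - 3) ** 2 + (2 - 2) ** 2)
assert sq_dist == 40.0  # the value the squared=True assertion compares against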
Example #7
    def findKNeighbors(self):
        # For each feature column, rank all other columns by the NaN-aware
        # Euclidean distance between the column vectors and keep the k
        # nearest as that feature's "neighbors".
        corr_features = dict()
        for col1 in self.X_train.columns:
            d = []
            for col2 in self.X_train.columns:
                dist = nan_euclidean_distances(
                    self.X_train[col1].values.reshape(1, -1),
                    self.X_train[col2].values.reshape(1, -1))
                d.append((dist[0, 0], col2))  # extract the scalar distance
            # sort by distance; col1 itself is included at distance 0
            d.sort(key=lambda x: x[0])
            corr_features[col1] = [col for _, col in d[:self.k]]

        return corr_features
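The loop above makes one pairwise call per column pair; a single call on the transposed matrix yields the same distances at once. A sketch, with X_train and k standing in for the attributes used above:

import numpy as np
from sklearn.metrics.pairwise import nan_euclidean_distances

D = nan_euclidean_distances(X_train.T.values)  # feature-by-feature distances
nearest = np.argsort(D, axis=1)[:, :k]         # each column's k nearest columns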
Example #8
def discr_stat(X,
               Y,
               dissimilarity="euclidean",
               remove_isolates=True,
               return_rdfs=True):
    """
    Computes the discriminability statistic.

    Parameters
    ----------
    X : array, shape (n_samples, n_features) or (n_samples, n_samples)
        Input data. If dissimilarity=='precomputed', the input should be the dissimilarity matrix.
    Y : 1d-array, shape (n_samples)
        Input labels.
    dissimilarity : str, {"euclidean" (default), "cosine", "haversine",
        "manhattan", "precomputed"}
        Dissimilarity measure: one of the named pairwise distances between
        points in the dataset ("euclidean" is computed with
        nan_euclidean_distances, so NaN entries are allowed), or
        'precomputed' (X is already a dissimilarity matrix).
    remove_isolates : bool, optional, default=True
        Whether to remove samples whose label occurs only once.
    return_rdfs : bool, optional, default=True
        Whether to return the rdf for each data point.

    Returns
    -------
    stat : float
        Discriminability statistic.
    rdfs : array, shape (n_samples, max{len(id)})
        Rdfs for each sample. Only returned if ``return_rdfs==True``.

    """
    check_X_y(X, Y, accept_sparse=True)

    uniques, counts = np.unique(Y, return_counts=True)
    if (counts != 1).sum() <= 1:
        msg = "You have passed a vector containing only a single unique sample id."
        raise ValueError(msg)
    if remove_isolates:
        idx = np.isin(Y, uniques[counts != 1])
        labels = Y[idx]

        if dissimilarity in ("euclidean", "cosine", "haversine",
                             "manhattan", "mahalanobis"):
            X = X[idx]
        else:
            X = X[np.ix_(idx, idx)]
    else:
        labels = Y

    if dissimilarity == "euclidean":
        dissimilarities = nan_euclidean_distances(X)
    elif dissimilarity == "cosine":
        dissimilarities = cosine_distances(X)
    elif dissimilarity == "haversine":
        dissimilarities = haversine_distances(X)
    elif dissimilarity == "manhattan":
        dissimilarities = manhattan_distances(X)
    else:
        # "precomputed"; note that "mahalanobis" has no branch above and
        # also falls through to here
        dissimilarities = X

    rdfs = _discr_rdf(dissimilarities, labels)
    rdfs[rdfs < 0.5] = np.nan
    stat = np.nanmean(rdfs)

    if return_rdfs:
        return stat, rdfs
    else:
        return stat
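A usage sketch for discr_stat on synthetic repeated-measurement data; it assumes the function's own dependencies (check_X_y, the sklearn pairwise distances, and the project's _discr_rdf helper) are importable:

import numpy as np

rng = np.random.RandomState(0)
X = rng.randn(30, 4) + np.repeat(rng.randn(10, 4), 3, axis=0)  # 10 subjects, 3 scans each
Y = np.repeat(np.arange(10), 3)                                # subject labels

stat, rdfs = discr_stat(X, Y, dissimilarity="euclidean")
print("discriminability:", round(stat, 3))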
Example #9
def test_knn_imputer_weight_distance(na):
    X = np.array([[0, 0], [na, 2], [4, 3], [5, 6], [7, 7], [9, 8], [11, 10]])

    # Test with "distance" weight
    nn = KNeighborsRegressor(metric="euclidean", weights="distance")
    X_rows_idx = [0, 2, 3, 4, 5, 6]
    nn.fit(X[X_rows_idx, 1:], X[X_rows_idx, 0])
    knn_imputed_value = nn.predict(X[1:2, 1:])[0]

    # Manual calculation
    X_neighbors_idx = [0, 2, 3, 4, 5]
    dist = nan_euclidean_distances(X[1:2, :], X, missing_values=na)
    weights = 1 / dist[:, X_neighbors_idx].ravel()
    manual_imputed_value = np.average(X[X_neighbors_idx, 0], weights=weights)

    X_imputed_distance1 = np.array([[0, 0], [manual_imputed_value, 2], [4, 3],
                                    [5, 6], [7, 7], [9, 8], [11, 10]])

    # NearestNeighbor calculation
    X_imputed_distance2 = np.array([[0, 0], [knn_imputed_value, 2], [4, 3],
                                    [5, 6], [7, 7], [9, 8], [11, 10]])

    imputer = KNNImputer(weights="distance", missing_values=na)
    assert_allclose(imputer.fit_transform(X), X_imputed_distance1)
    assert_allclose(imputer.fit_transform(X), X_imputed_distance2)

    # Test with weights = "distance" and n_neighbors=2
    X = np.array([
        [na, 0, 0],
        [2, 1, 2],
        [3, 2, 3],
        [4, 5, 5],
    ])

    # the neighbors of row 0 are rows 1 and 2; their nan_euclidean distances are:
    dist_0_1 = np.sqrt((3 / 2) * ((1 - 0)**2 + (2 - 0)**2))
    dist_0_2 = np.sqrt((3 / 2) * ((2 - 0)**2 + (3 - 0)**2))
    imputed_value = np.average([2, 3], weights=[1 / dist_0_1, 1 / dist_0_2])

    X_imputed = np.array([
        [imputed_value, 0, 0],
        [2, 1, 2],
        [3, 2, 3],
        [4, 5, 5],
    ])

    imputer = KNNImputer(n_neighbors=2, weights="distance", missing_values=na)
    assert_allclose(imputer.fit_transform(X), X_imputed)

    # Test with varying missingness patterns
    X = np.array([
        [1, 0, 0, 1],
        [0, na, 1, na],
        [1, 1, 1, na],
        [0, 1, 0, 0],
        [0, 0, 0, 0],
        [1, 0, 1, 1],
        [10, 10, 10, 10],
    ])

    # Get weights of donor neighbors
    dist = nan_euclidean_distances(X, missing_values=na)
    r1c1_nbor_dists = dist[1, [0, 2, 3, 4, 5]]
    r1c3_nbor_dists = dist[1, [0, 3, 4, 5, 6]]
    r1c1_nbor_wt = 1 / r1c1_nbor_dists
    r1c3_nbor_wt = 1 / r1c3_nbor_dists

    r2c3_nbor_dists = dist[2, [0, 3, 4, 5, 6]]
    r2c3_nbor_wt = 1 / r2c3_nbor_dists

    # Collect donor values
    col1_donor_values = np.ma.masked_invalid(X[[0, 2, 3, 4, 5], 1]).copy()
    col3_donor_values = np.ma.masked_invalid(X[[0, 3, 4, 5, 6], 3]).copy()

    # Final imputed values
    r1c1_imp = np.ma.average(col1_donor_values, weights=r1c1_nbor_wt)
    r1c3_imp = np.ma.average(col3_donor_values, weights=r1c3_nbor_wt)
    r2c3_imp = np.ma.average(col3_donor_values, weights=r2c3_nbor_wt)

    X_imputed = np.array([
        [1, 0, 0, 1],
        [0, r1c1_imp, 1, r1c3_imp],
        [1, 1, 1, r2c3_imp],
        [0, 1, 0, 0],
        [0, 0, 0, 0],
        [1, 0, 1, 1],
        [10, 10, 10, 10],
    ])

    imputer = KNNImputer(weights="distance", missing_values=na)
    assert_allclose(imputer.fit_transform(X), X_imputed)

    X = np.array([
        [0, 0, 0, na],
        [1, 1, 1, na],
        [2, 2, na, 2],
        [3, 3, 3, 3],
        [4, 4, 4, 4],
        [5, 5, 5, 5],
        [6, 6, 6, 6],
        [na, 7, 7, 7],
    ])

    dist = pairwise_distances(X,
                              metric="nan_euclidean",
                              squared=False,
                              missing_values=na)

    # Calculate weights
    r0c3_w = 1.0 / dist[0, 2:-1]
    r1c3_w = 1.0 / dist[1, 2:-1]
    r2c2_w = 1.0 / dist[2, (0, 1, 3, 4, 5)]
    r7c0_w = 1.0 / dist[7, 2:7]

    # Calculate weighted averages
    r0c3 = np.average(X[2:-1, -1], weights=r0c3_w)
    r1c3 = np.average(X[2:-1, -1], weights=r1c3_w)
    r2c2 = np.average(X[(0, 1, 3, 4, 5), 2], weights=r2c2_w)
    r7c0 = np.average(X[2:7, 0], weights=r7c0_w)

    X_imputed = np.array([
        [0, 0, 0, r0c3],
        [1, 1, 1, r1c3],
        [2, 2, r2c2, 2],
        [3, 3, 3, 3],
        [4, 4, 4, 4],
        [5, 5, 5, 5],
        [6, 6, 6, 6],
        [r7c0, 7, 7, 7],
    ])

    imputer_comp_wt = KNNImputer(missing_values=na, weights="distance")
    assert_allclose(imputer_comp_wt.fit_transform(X), X_imputed)
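Every manual weight in this test relies on the same NaN-aware distance definition; spelled out once against the n_neighbors=2 block above (a sketch with na = np.nan):

import numpy as np
from sklearn.metrics.pairwise import nan_euclidean_distances

X = np.array([[np.nan, 0, 0], [2, 1, 2], [3, 2, 3], [4, 5, 5]])
d = nan_euclidean_distances(X[:1], X[1:3])  # row 0 against its two donors
expected = np.sqrt(3 / 2 * np.array([1 ** 2 + 2 ** 2, 2 ** 2 + 3 ** 2]))
assert np.allclose(d.ravel(), expected)  # dist_0_1 and dist_0_2 from above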
Example #10
def test_pairwise_distances():
    # Test the pairwise_distance helper function.
    rng = np.random.RandomState(0)

    # The "euclidean" string metric should be equivalent to calling
    # euclidean_distances directly.
    X = rng.random_sample((5, 4))
    S = pairwise_distances(X, metric="euclidean")
    S2 = euclidean_distances(X)
    assert_array_almost_equal(S, S2)

    # Euclidean distance, with Y != X.
    Y = rng.random_sample((2, 4))
    S = pairwise_distances(X, Y, metric="euclidean")
    S2 = euclidean_distances(X, Y)
    assert_array_almost_equal(S, S2)
    # Check to ensure NaNs work with pairwise_distances.
    X_masked = rng.random_sample((5, 4))
    Y_masked = rng.random_sample((2, 4))
    X_masked[0, 0] = np.nan
    Y_masked[0, 0] = np.nan
    S_masked = pairwise_distances(X_masked, Y_masked, metric="nan_euclidean")
    S2_masked = nan_euclidean_distances(X_masked, Y_masked)
    assert_array_almost_equal(S_masked, S2_masked)
    # Test with tuples as X and Y
    X_tuples = tuple([tuple([v for v in row]) for row in X])
    Y_tuples = tuple([tuple([v for v in row]) for row in Y])
    S2 = pairwise_distances(X_tuples, Y_tuples, metric="euclidean")
    assert_array_almost_equal(S, S2)

    # Test haversine distance
    # The data should be valid latitude and longitude
    X = rng.random_sample((5, 2))
    X[:, 0] = (X[:, 0] - 0.5) * 2 * np.pi / 2
    X[:, 1] = (X[:, 1] - 0.5) * 2 * np.pi
    S = pairwise_distances(X, metric="haversine")
    S2 = haversine_distances(X)
    assert_array_almost_equal(S, S2)

    # Test haversine distance, with Y != X
    Y = rng.random_sample((2, 2))
    Y[:, 0] = (Y[:, 0] - 0.5) * 2 * np.pi / 2
    Y[:, 1] = (Y[:, 1] - 0.5) * 2 * np.pi
    S = pairwise_distances(X, Y, metric="haversine")
    S2 = haversine_distances(X, Y)
    assert_array_almost_equal(S, S2)

    # "cityblock" uses scikit-learn metric, cityblock (function) is
    # scipy.spatial.
    S = pairwise_distances(X, metric="cityblock")
    S2 = pairwise_distances(X, metric=cityblock)
    assert S.shape[0] == S.shape[1]
    assert S.shape[0] == X.shape[0]
    assert_array_almost_equal(S, S2)

    # The manhattan metric should be equivalent to cityblock.
    S = pairwise_distances(X, Y, metric="manhattan")
    S2 = pairwise_distances(X, Y, metric=cityblock)
    assert S.shape[0] == X.shape[0]
    assert S.shape[1] == Y.shape[0]
    assert_array_almost_equal(S, S2)

    # Test cosine as a string metric versus the cosine callable.
    # The string "cosine" uses sklearn.metrics, while the cosine
    # function is scipy.spatial.distance.cosine.
    S = pairwise_distances(X, Y, metric="cosine")
    S2 = pairwise_distances(X, Y, metric=cosine)
    assert S.shape[0] == X.shape[0]
    assert S.shape[1] == Y.shape[0]
    assert_array_almost_equal(S, S2)

    # Test with sparse X and Y,
    # currently only supported for Euclidean, L1 and cosine.
    X_sparse = csr_matrix(X)
    Y_sparse = csr_matrix(Y)
    S = pairwise_distances(X_sparse, Y_sparse, metric="euclidean")
    S2 = euclidean_distances(X_sparse, Y_sparse)
    assert_array_almost_equal(S, S2)
    S = pairwise_distances(X_sparse, Y_sparse, metric="cosine")
    S2 = cosine_distances(X_sparse, Y_sparse)
    assert_array_almost_equal(S, S2)
    S = pairwise_distances(X_sparse, Y_sparse.tocsc(), metric="manhattan")
    S2 = manhattan_distances(X_sparse.tobsr(), Y_sparse.tocoo())
    assert_array_almost_equal(S, S2)
    S2 = manhattan_distances(X, Y)
    assert_array_almost_equal(S, S2)

    # Test with scipy.spatial.distance metric, with a kwd
    kwds = {"p": 2.0}
    S = pairwise_distances(X, Y, metric="minkowski", **kwds)
    S2 = pairwise_distances(X, Y, metric=minkowski, **kwds)
    assert_array_almost_equal(S, S2)

    # same with Y = None
    kwds = {"p": 2.0}
    S = pairwise_distances(X, metric="minkowski", **kwds)
    S2 = pairwise_distances(X, metric=minkowski, **kwds)
    assert_array_almost_equal(S, S2)

    # Test that scipy distance metrics throw an error if sparse matrix given
    with pytest.raises(TypeError):
        pairwise_distances(X_sparse, metric="minkowski")
    with pytest.raises(TypeError):
        pairwise_distances(X, Y_sparse, metric="minkowski")

    # Test that a value error is raised if the metric is unknown
    with pytest.raises(ValueError):
        pairwise_distances(X, Y, metric="blah")
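The metric strings that scikit-learn implements itself, including "nan_euclidean", can be listed at runtime with the public distance_metrics() helper (scipy metrics such as "minkowski" are forwarded separately):

from sklearn.metrics.pairwise import distance_metrics
print(sorted(distance_metrics()))  # 'cityblock', 'cosine', ..., 'nan_euclidean', ...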
Example #11
def retrieve_closest_indices(
    df,
    num_indices,
    forecast_length,
    window_size: int = 10,
    distance_metric: str = "braycurtis",
    stride_size: int = 1,
    start_index: int = None,
    include_differenced: bool = False,
    include_last: bool = True,
    verbose: int = 0,
):
    """Find next indicies closest to the final segment of forecast_length

    Args:
        df (pd.DataFrame): source data in wide format
        num_indices (int): number of indices to return
        forecast_length (int): length of forecast
        window_size (int): length of comparison
        distance_metric (str): distance measure from scipy and nan_euclidean
        stride_size (int): length of spacing between windows
        start_index (int): index to begin creation of windows from
        include_difference (bool): if True, also compare on differences
    """
    array = df.to_numpy()
    index = df.index
    tlt_len = array.shape[0]
    combined_window_size = window_size + forecast_length
    # trim the extra rows so the final segment itself is never included
    # and the last window ends evenly
    max_steps = array.shape[0] - combined_window_size
    if not include_last:
        max_steps = max_steps - forecast_length
    if start_index is None:
        # handle massive stride size relative to data
        start_index = 0
        if stride_size * 6 < array.shape[0]:
            start_index = max_steps % stride_size
    if num_indices > (max_steps / stride_size):
        raise ValueError(
            "num_validations/num_indices too high for this dataset")
    window_idxs = window_id_maker(
        window_size=combined_window_size,
        start_index=start_index,
        max_steps=max_steps,
        stride_size=stride_size,
        skip_size=1,
    )
    # calculate distance between all points and last window of history
    if distance_metric == "nan_euclidean":
        from sklearn.metrics.pairwise import nan_euclidean_distances

        res = np.array([
            nan_euclidean_distances(
                array[:, a][window_idxs[:, :window_size]],
                array[(tlt_len - window_size):tlt_len, a].reshape(1, -1),
            ) for a in range(array.shape[1])
        ])
        if include_differenced:
            array_diff = np.diff(array, n=1, axis=0)
            array_diff = np.concatenate([array_diff[0:1], array_diff])
            res_diff = np.array([
                nan_euclidean_distances(
                    array_diff[:, a][window_idxs[:, :window_size]],
                    array_diff[(tlt_len - window_size):tlt_len,
                               a].reshape(1, -1),
                ) for a in range(array_diff.shape[1])
            ])
            res = np.mean([res, res_diff], axis=0)
    else:
        from scipy.spatial.distance import cdist

        res = np.array([
            cdist(
                array[:, a][window_idxs[:, :window_size]],
                array[(tlt_len - window_size):tlt_len, a].reshape(1, -1),
                metric=distance_metric,
            ) for a in range(array.shape[1])
        ])
        if include_differenced:
            array_diff = np.diff(array, n=1, axis=0)
            array_diff = np.concatenate([array_diff[0:1], array_diff])
            res_diff = np.array([
                cdist(
                    array_diff[:, a][window_idxs[:, :window_size]],
                    array_diff[(tlt_len - window_size):tlt_len,
                               a].reshape(1, -1),
                    metric=distance_metric,
                ) for a in range(array_diff.shape[1])
            ])
            res = np.mean([res, res_diff], axis=0)
    # find the lowest distance historical windows
    res_sum = np.nansum(res, axis=0)
    num_top = num_indices
    # partial then full sort
    res_idx = np.argpartition(res_sum, num_top, axis=0)[0:num_top]
    res_idx = res_idx[np.argsort(res_sum[res_idx].flatten())]
    if verbose > 1:
        print(
            f"similarity validation distance metrics: {res_sum[res_idx].flatten()} with last window: {res_sum[-1].item()}"
        )
    select_index = index.to_numpy()[window_idxs[res_idx]]
    if select_index.ndim == 3:
        res_shape = select_index.shape
        select_index = select_index.reshape((res_shape[0], res_shape[2]))
    return select_index
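A usage sketch with hypothetical data; it assumes window_id_maker and the other helpers this function relies on are importable alongside it:

import numpy as np
import pandas as pd

idx = pd.date_range("2021-01-01", periods=120, freq="D")
df = pd.DataFrame({"series_a": np.random.randn(120).cumsum()}, index=idx)

closest = retrieve_closest_indices(
    df, num_indices=3, forecast_length=7,
    window_size=10, distance_metric="nan_euclidean",
)
# closest[i] holds the index values of the i-th most similar window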
Example #12
File: kg.py Project: ricky4235/Pro
from sklearn.impute import KNNImputer

import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import nan_euclidean_distances
# cross-validation
from sklearn.model_selection import cross_val_score
# repeated stratified K-fold
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

X = [[1, 2, np.nan], [3, 4, 3], [np.nan, 6, 5], [8, 8, 7]]
imputer = KNNImputer(n_neighbors=2, metric='nan_euclidean')
imputer.fit_transform(X)

nan_euclidean_distances([[np.nan, 6, 5], [3, 4, 3]],
                        [[3, 4, 3], [1, 2, np.nan], [8, 8, 7]])
# ≈ [[3.4641, 6.9282, 3.4641],
#    [0.    , 3.4641, 7.5498]]
input_file = './horse-colic.csv'
df_data = pd.read_csv(input_file, header=None, na_values='?')

data = df_data.values
ix = [i for i in range(data.shape[1]) if i != 23]
X, y = data[:, ix], data[:, 23]


for i in range(df_data.shape[1]):
    n_miss = df_data[[i]].isnull().sum()
    perc = n_miss / df_data.shape[0] * 100

    if n_miss.values[0] > 0:
        # report each column that contains missing values
        print('> %d, Missing: %d (%.1f%%)' % (i, n_miss.values[0], perc.values[0]))
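The Pipeline, cross_val_score, and RepeatedStratifiedKFold imports at the top suggest this snippet was building toward a cross-validated imputation pipeline. A hedged sketch of that standard pattern, with RandomForestClassifier as a hypothetical stand-in model:

from sklearn.ensemble import RandomForestClassifier

model = Pipeline(steps=[("impute", KNNImputer(n_neighbors=5)),
                        ("model", RandomForestClassifier())])
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(model, X, y, scoring="accuracy", cv=cv)
print("accuracy: %.3f (%.3f)" % (scores.mean(), scores.std()))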
Example #13
    def __init__(self, path, missing_ratio, knn_impute=False, n_neighbor=5):
        scaler = MinMaxScaler()
        data = np.load(path, allow_pickle=True)
        data = data.item()
        self.missing_ratio = missing_ratio

        self.x = data["x"]
        self.y = data["y"]
        """ the argsort of numpy seems have bugs"""
        if missing_ratio == 0.0:
            pdist_x = cosine_distances(self.x, self.x)
        else:
            pdist_x = nan_euclidean_distances(self.x, self.x)
        graph_knn = np.zeros_like(pdist_x)
        for i in range(self.x.shape[0]):
            sorted_row = np.sort(pdist_x[i, :])
            sort_index = np.argsort(pdist_x[i, :])
            # Handle ties: when several points sit at identical distance,
            # force point i to sort first so the diagonal stays 0.
            if sort_index[0] != i:
                j = np.where(sort_index == i)
                sort_index[j] = sort_index[0]
                sort_index[0] = i

            # mark the point itself plus its n_neighbor nearest neighbors
            selected_index = sort_index[:1 + n_neighbor]
            graph_knn[i, selected_index] = 1
            thresh = sorted_row[n_neighbor + 1]
            pdist_x[i, :] = (pdist_x[i, :] < thresh).astype(float)
        # drop the self-loops so each row sums to exactly n_neighbor
        graph_knn = graph_knn - np.eye(self.x.shape[0])
        np.testing.assert_equal(np.sum(graph_knn, axis=1),
                                n_neighbor * np.ones(self.x.shape[0]))

        n, d = self.x.shape
        mask = np.random.rand(n, d)
        mask = (mask > self.missing_ratio).astype(float)
        self.m = mask
        if missing_ratio > 0.0:

            self.x[mask == 0] = np.nan
            # imputer = KNNImputer(n_neighbors=2)
            imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
            self.x = imputer.fit_transform(self.x)
            self.m = np.ones_like(self.x)
            scaler.fit(self.x)
            self.x = scaler.transform(self.x)
        # gather the neighbor features for each point
        x_nn = []
        for i in range(self.x.shape[0]):
            index = np.where(graph_knn[i, :] == 1)
            x_i = self.x[index, :]
            x_nn.append(x_i.squeeze())
        self.x_nn = np.stack(x_nn)
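Example #14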
    def calculateSimilarity(self, x1, x2):
        # sim = pairwise.cosine_similarity(x1.fillna(0), x2.fillna(0))
        sim = pairwise.nan_euclidean_distances(x1, x2)
        return sim