Example #1
def test_callable_metric():

    # Define callable metric that returns the l1 norm:
    def custom_callable(x, y, missing_values="NaN", squared=False):
        x = np.ma.array(x, mask=np.isnan(x))
        y = np.ma.array(y, mask=np.isnan(y))
        dist = np.nansum(np.abs(x-y))
        return dist

    X = np.array([
        [4, 3, 3, np.nan],
        [6, 9, 6, 9],
        [4, 8, 6, 9],
        [np.nan, 9, 11, 10.]
    ])

    X_imputed = np.array([
        [4, 3, 3, 9],
        [6, 9, 6, 9],
        [4, 8, 6, 9],
        [5, 9, 11, 10.]
    ])

    imputer = KNNImputer(n_neighbors=2, metric=custom_callable)
    assert_array_equal(imputer.fit_transform(X), X_imputed)
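A minimal standalone check of the callable metric above (a sketch, assuming custom_callable is in scope and NumPy is imported as np; the two rows come from X in this test):

row_a = np.array([4, 3, 3, np.nan])
row_b = np.array([6, 9, 6, 9])
# |4 - 6| + |3 - 9| + |3 - 6| = 11; the pair involving NaN contributes nothing
print(custom_callable(row_a, row_b))  # 11.0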
Example #2
def test_complete_features():

    # Test with use_complete=True
    X = np.array([
        [0,      np.nan,    0,       np.nan],
        [1,      1,         1,       np.nan],
        [2,      2,         np.nan,  2],
        [3,      3,         3,       3],
        [4,      4,         4,       4],
        [5,      5,         5,       5],
        [6,      6,         6,       6],
        [np.nan, 7,         7,       7]
    ])

    r0c1 = np.mean(X[1:6, 1])
    r0c3 = np.mean(X[2:-1, -1])
    r1c3 = np.mean(X[2:-1, -1])
    r2c2 = np.nanmean(X[:6, 2])
    r7c0 = np.mean(X[2:-1, 0])

    X_imputed = np.array([
        [0,     r0c1,   0,    r0c3],
        [1,     1,      1,    r1c3],
        [2,     2,      r2c2, 2],
        [3,     3,      3,    3],
        [4,     4,      4,    4],
        [5,     5,      5,    5],
        [6,     6,      6,    6],
        [r7c0,  7,      7,    7]
    ])

    imputer_comp = KNNImputer()
    assert_array_almost_equal(imputer_comp.fit_transform(X), X_imputed)
Example #3
def impute_values(df: pd.DataFrame, method: str = 'mean', **kwargs):
    """
    Impute missing values in DataFrame (np.nan or None).
    ------------------------
    Args:
        * df: pd.DataFrame of (samples x features)
        * method: string for what method of imputation to use
            ** 'mean': mean imputation
            ** 'knn': K-NN imputation (see missingpy.KNNImputer)
            ** 'rf': random forest imputation (see missingpy.MissForest)

    Returns:
        * pd.DataFrame: imputed values (samples x features)
    """
    assert method in ('mean','knn','rf'), '{} not yet implemented.'.format(method)

    if method=='mean':
        return df.fillna(df.mean(0))
    elif method=='knn':
        X = df.values
        imputer = KNNImputer(**kwargs)
        X_impute = imputer.fit_transform(X)
        return pd.DataFrame(X_impute, index=df.index, columns=df.columns)
    elif method=='rf':
        X = df.values
        imputer = MissForest(**kwargs)
        X_impute = imputer.fit_transform(X)
        return pd.DataFrame(X_impute, index=df.index, columns=df.columns)
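A minimal usage sketch for impute_values, assuming numpy/pandas are imported as np/pd and missingpy is installed; the toy DataFrame is made up, and keyword arguments such as n_neighbors are forwarded to KNNImputer through **kwargs:

toy = pd.DataFrame({
    "a": [1.0, 2.0, np.nan, 4.0, 5.0, 6.0],
    "b": [1.0, 2.0, 3.0, 4.0, np.nan, 6.0],
    "c": [2.0, 3.0, 4.0, 5.0, 6.0, 7.0],
})

toy_mean = impute_values(toy, method='mean')               # column-mean fill
toy_knn = impute_values(toy, method='knn', n_neighbors=2)  # passed through to KNNImputer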
Example #4
def pre_processing(data_route):

    data_frame = pd.read_csv(data_route)
    # Missing value imputation (KNNImputer)
    real_columns = data_frame.columns

    def handle_column_negative(x):
        return x.map(lambda x: x * (-1) if x < 0 else x)

    numericData = data_frame.copy()
    # Preparing numeric data for imputation
    numericData = numericData.drop(["cluster", "date", "country"], axis=1)
    numericData = numericData.apply(lambda x: handle_column_negative(x),
                                    axis=1)
    numericData = numericData.replace([np.inf, -np.inf], np.nan)

    # applying KNN imputation
    knn_imputer = KNNImputer()
    knn_result = knn_imputer.fit_transform(numericData)
    data_frame_processed = pd.DataFrame(knn_result)

    # adding removed fields
    data_frame_processed.insert(0, column='date', value=data_frame['date'])
    data_frame_processed.insert(0,
                                column='cluster',
                                value=data_frame['cluster'])
    data_frame_processed.insert(0,
                                column='country',
                                value=data_frame['country'])

    data_frame_processed.columns = real_columns
    return data_frame_processed
Example #5
def test_default_with_invalid_input():
    # Test imputation with default values and invalid input

    # Test with % missing in a column > col_max_missing
    X = np.array([
        [np.nan, 0, 0, 0, 5],
        [np.nan, 1, 0, np.nan, 3],
        [np.nan, 2, 0, 0, 0],
        [np.nan, 6, 0, 5, 13],
        [np.nan, 7, 0, 7, 8],
        [np.nan, 8, 0, 8, 9],
    ])
    imputer = KNNImputer()
    msg = "Some column(s) have more than {}% missing values".format(
        imputer.col_max_missing * 100)
    assert_raise_message(ValueError, msg, imputer.fit, X)

    # Test with insufficient number of neighbors
    X = np.array([
        [1, 1, 1, 2, np.nan],
        [2, 1, 2, 2, 3],
        [3, 2, 3, 3, 8],
        [6, 6, 2, 5, 13],
    ])
    msg = "There are only %d samples, but n_neighbors=%d." % \
          (X.shape[0], imputer.n_neighbors)
    assert_raise_message(ValueError, msg, imputer.fit, X)

    # Test with inf present
    X = np.array([
        [np.inf, 1, 1, 2, np.nan],
        [2, 1, 2, 2, 3],
        [3, 2, 3, 3, 8],
        [np.nan, 6, 0, 5, 13],
        [np.nan, 7, 0, 7, 8],
        [6, 6, 2, 5, 7],
    ])
    msg = "+/- inf values are not allowed."
    assert_raise_message(ValueError, msg, KNNImputer().fit, X)

    # Test with inf present in matrix passed in transform()
    X = np.array([
        [np.inf, 1, 1, 2, np.nan],
        [2, 1, 2, 2, 3],
        [3, 2, 3, 3, 8],
        [np.nan, 6, 0, 5, 13],
        [np.nan, 7, 0, 7, 8],
        [6, 6, 2, 5, 7],
    ])

    X_fit = np.array([
        [0, 1, 1, 2, np.nan],
        [2, 1, 2, 2, 3],
        [3, 2, 3, 3, 8],
        [np.nan, 6, 0, 5, 13],
        [np.nan, 7, 0, 7, 8],
        [6, 6, 2, 5, 7],
    ])
    msg = "+/- inf values are not allowed in data to be transformed."
    assert_raise_message(ValueError, msg, KNNImputer().fit(X_fit).transform, X)
Example #6
def test_knn_n_neighbors():

    X = np.array([
        [0,       0],
        [np.nan,  2],
        [4,       3],
        [5,       np.nan],
        [7,       7],
        [np.nan,  8],
        [14,      13]
    ])
    statistics_mean = np.nanmean(X, axis=0)

    # Test with 1 neighbor
    X_imputed_1NN = np.array([
        [0,      0],
        [4,      2],
        [4,      3],
        [5,      3],
        [7,      7],
        [7,      8],
        [14,     13]
    ])

    n_neighbors = 1
    imputer = KNNImputer(n_neighbors=n_neighbors)

    assert_array_equal(imputer.fit_transform(X), X_imputed_1NN)
    assert_array_equal(imputer.statistics_, statistics_mean)

    # Test with 6 neighbors
    X = np.array([
        [0,      0],
        [np.nan, 2],
        [4,      3],
        [5,      np.nan],
        [7,      7],
        [np.nan, 8],
        [14,      13]
    ])

    X_imputed_6NN = np.array([
        [0,      0],
        [6,      2],
        [4,      3],
        [5,      5.5],
        [7,      7],
        [6,      8],
        [14,     13]
    ])

    n_neighbors = 6
    imputer = KNNImputer(n_neighbors=6)
    imputer_plus1 = KNNImputer(n_neighbors=n_neighbors + 1)

    assert_array_equal(imputer.fit_transform(X), X_imputed_6NN)
    assert_array_equal(imputer.statistics_, statistics_mean)
    assert_array_equal(imputer.fit_transform(X), imputer_plus1.fit(
        X).transform(X))
Example #7
    def __init__(self, missing_val_rep=0.0, k=10, copy=False):

        self.missing_val_rep = missing_val_rep
        self.imputer = KNNImputer(missing_val_rep,
                                  k,
                                  copy=copy,
                                  col_max_missing=1.0,
                                  row_max_missing=1.0)
Example #8
def imputate_using_knn(dataset, k):
    cols = dataset.columns
    knn_impu = KNNImputer(n_neighbors=k, weights="uniform")
    result = knn_impu.fit_transform(dataset)

    result = pd.DataFrame(result)

    result.columns = cols
    return result
Example #9
def knn_impute(data, n_neighbors=3):

    imputer = KNNImputer(n_neighbors=n_neighbors,
                         missing_values=np.nan,
                         weights='distance')

    imputed_df = pd.DataFrame(imputer.fit_transform(data))

    imputed_df.columns = data.columns

    return (imputed_df)
Example #10
    def __init__(self, missing_values='nan', strategy='mean', n_neighbors=5):
        '''
            Imputation of feature values using either sklearn, missingpy or
            (WIP) fancyimpute approaches.

            Parameters
            ----------
            missing_values : number, string, np.nan (default) or None
                The placeholder for the missing values. All occurrences of
                `missing_values` will be imputed.


            strategy : string, optional (default="mean")
                The imputation strategy.

                Supported using sklearn:
                - If "mean", then replace missing values using the mean along
                  each column. Can only be used with numeric data.
                - If "median", then replace missing values using the median along
                  each column. Can only be used with numeric data.
                - If "most_frequent", then replace missing using the most frequent
                  value along each column. Can be used with strings or numeric data.
                - If "constant", then replace missing values with fill_value. Can be
                  used with strings or numeric data.

                Supported using missingpy:
                - If 'knn', then use a nearest neighbor search. Can be
                  used with strings or numeric data.

                WIP: More strategies using fancyimpute

            n_neighbors : int, optional (default = 5)
                Number of neighboring samples to use for imputation if method
                is knn.

            '''

        # Set parameters to objects
        self.missing_values = missing_values
        self.strategy = strategy
        self.n_neighbors = n_neighbors

        # Depending on the imputations strategy, use a specific toolbox
        if strategy in ['mean', 'median', 'most_frequent', 'constant']:
            self.Imputer =\
             SimpleImputer(missing_values=self.missing_values,
                           strategy=self.strategy)
        elif strategy == 'knn':
            if missing_values == 'nan':
                # Slightly different API for missingpy
                self.missing_values = 'NaN'
            self.Imputer = KNNImputer(missing_values=self.missing_values,
                                      n_neighbors=self.n_neighbors)
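For reference, a minimal sketch of the two imputers the constructor above selects between; note the differing missing-value placeholders (missingpy's KNNImputer defaults to the string 'NaN', sklearn's SimpleImputer to np.nan), which is why the snippet maps 'nan' to 'NaN' for the knn strategy:

import numpy as np
from sklearn.impute import SimpleImputer
from missingpy import KNNImputer

mean_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')  # sklearn path
knn_imputer = KNNImputer(missing_values='NaN', n_neighbors=5)         # missingpy path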
Example #11
def outlier_treatment(train_data_frame):
    numericData = train_data_frame.loc[:, "expenses":"volume"]
    cleaned_data = numericData.copy()
    cleaned_data[~(np.abs(stats.zscore(cleaned_data)) < 3).all(
        axis=1)] = np.nan
    imputer = KNNImputer()
    result = imputer.fit_transform(cleaned_data)
    cdp = pd.DataFrame(result)
    cdp.insert(0, column='date', value=train_data_frame['date'])
    cdp.insert(0, column='cluster', value=train_data_frame['cluster'])
    cdp.insert(0, column='country', value=train_data_frame['country'])
    cdp.columns = train_data_frame.columns.copy()
    return cdp
Example #12
def test_knn_imputation_shape():
    # Verify the shapes of the imputed matrix for different weights and
    # number of neighbors.
    n_rows = 10
    n_cols = 2
    X = np.random.rand(n_rows, n_cols)
    X[0, 0] = np.nan

    for weights in ['uniform', 'distance']:
        for n_neighbors in range(1, 6):
            imputer = KNNImputer(n_neighbors=n_neighbors, weights=weights)
            X_imputed = imputer.fit_transform(X)
            assert_equal(X_imputed.shape, (n_rows, n_cols))
Example #13
    def do_impute(self, matrix_to_impute):
        parameter_set = self.get_parameter_set()
        np.savetxt('test_cur_matrix_missing.csv', matrix_to_impute)
        if self.parameters.impute_mode == parameter_set.constants.v_unsupervised_parameters_impute_mode_randomforest:
            imputed_cur_matrix = np.transpose(
                self.rfimpute.miss_forest_imputation(
                    np.transpose(matrix_to_impute)))
        elif self.parameters.impute_mode == parameter_set.constants.v_unsupervised_parameters_impute_mode_knn:
            imputer = KNNImputer(n_neighbors=2,
                                 row_max_missing=1,
                                 col_max_missing=1)
            imputed_cur_matrix = np.transpose(
                imputer.fit_transform(np.transpose(matrix_to_impute)))

        return imputed_cur_matrix
Example #14
def test_knn_imputation_zero():
    # Test imputation when missing_values == 0
    missing_values = 0
    n_neighbors = 2
    imputer = KNNImputer(missing_values=missing_values,
                         n_neighbors=n_neighbors,
                         weights="uniform")

    # Test with missing_values=0 when NaN present
    X = np.array([
        [np.nan, 0, 0, 0, 5],
        [np.nan, 1, 0, np.nan, 3],
        [np.nan, 2, 0, 0, 0],
        [np.nan, 6, 0, 5, 13],
    ])
    msg = "Input contains NaN, infinity or a value too large for %r." % X.dtype
    assert_raise_message(ValueError, msg, imputer.fit, X)

    # Test with % zeros in column > col_max_missing
    X = np.array([
        [1, 0, 0, 0, 5],
        [2, 1, 0, 2, 3],
        [3, 2, 0, 0, 0],
        [4, 6, 0, 5, 13],
    ])
    msg = "Some column(s) have more than {}% missing values".format(
        imputer.col_max_missing * 100)
    assert_raise_message(ValueError, msg, imputer.fit, X)
Example #15
def test_knn_imputation_zero_p2():
    # Test with an imputable matrix and also compare with missing_values="NaN"
    X_zero = np.array([
        [1, 0, 1, 1, 1.],
        [2, 2, 2, 2, 2],
        [3, 3, 3, 3, 0],
        [6, 6, 0, 6, 6],
    ])

    X_nan = np.array([
        [1, np.nan, 1,      1,      1.],
        [2, 2,      2,      2,      2],
        [3, 3,      3,      3,      np.nan],
        [6, 6,      np.nan, 6,      6],
    ])
    statistics_mean = np.nanmean(X_nan, axis=0)

    X_imputed = np.array([
        [1, 2.5,    1,   1, 1.],
        [2, 2,      2,   2, 2],
        [3, 3,      3,   3, 1.5],
        [6, 6,      2.5, 6, 6],
    ])

    imputer_zero = KNNImputer(missing_values=0, n_neighbors=2,
                              weights="uniform")

    imputer_nan = KNNImputer(missing_values="NaN",
                             n_neighbors=2,
                             weights="uniform")

    assert_array_equal(imputer_zero.fit_transform(X_zero), X_imputed)
    assert_array_equal(imputer_zero.statistics_, statistics_mean)
    assert_array_equal(imputer_zero.fit_transform(X_zero),
                       imputer_nan.fit_transform(X_nan))
Example #16
class KNN_impute:
    def __init__(self, missing_val_rep=0.0, k=10, copy=False):

        self.missing_val_rep = missing_val_rep
        self.imputer = KNNImputer(missing_val_rep,
                                  k,
                                  copy=copy,
                                  col_max_missing=1.0,
                                  row_max_missing=1.0)

    def add_medians(self, X, y):

        X['labels'] = y
        label_meds = remove_rows(X).groupby(by='labels').median()
        #print(label_meds)
        for l in tqdm(label_meds.index):
            X[X['labels'] == l] = X[X['labels'] == l].replace(
                self.missing_val_rep, label_meds.loc[l, :].to_dict())

        X.drop(columns=['labels'], inplace=True)

    def fit(self, X, y):

        self.add_medians(X, y)
        print('INSIDE IMPUTER: Beginning the fit')
        self.imputer.fit(X)
        print('INSIDE IMPUTER: Completed the fit')
        return None
        '''
        def add_median(df):
            medians = df.median(axis=0)
            return df.replace(self.missing_val_rep, medians.to_dict())
        
        X['labels'] = y
        X_median = X.groupby(by='labels').apply(add_median)
        #print(X_median)
        X.drop(columns=['labels'],inplace=True)
        X_median.drop(columns=['labels'],inplace=True)

        self.imputer.fit(X_median)
        '''

    def transform(self, X):
        return self.imputer.transform(X)
Example #17
def test_complete_features_weighted():

    # Test with use_complete=True
    X = np.array([
        [0,      0,     0,       np.nan],
        [1,      1,     1,       np.nan],
        [2,      2,     np.nan,  2],
        [3,      3,     3,       3],
        [4,      4,     4,       4],
        [5,      5,     5,       5],
        [6,      6,     6,       6],
        [np.nan, 7,     7,       7]
    ])

    dist = pairwise_distances(X,
                              metric="masked_euclidean",
                              squared=False)

    # Calculate weights
    r0c3_w = 1.0 / dist[0, 2:-1]
    r1c3_w = 1.0 / dist[1, 2:-1]
    r2c2_w = 1.0 / dist[2, (0, 1, 3, 4, 5)]
    r7c0_w = 1.0 / dist[7, 2:7]

    # Calculate weighted averages
    r0c3 = np.average(X[2:-1, -1], weights=r0c3_w)
    r1c3 = np.average(X[2:-1, -1], weights=r1c3_w)
    r2c2 = np.average(X[(0, 1, 3, 4, 5), 2], weights=r2c2_w)
    r7c0 = np.average(X[2:7, 0], weights=r7c0_w)

    X_imputed = np.array([
        [0,     0,  0,    r0c3],
        [1,     1,  1,    r1c3],
        [2,     2,  r2c2, 2],
        [3,     3,  3,    3],
        [4,     4,  4,    4],
        [5,     5,  5,    5],
        [6,     6,  6,    6],
        [r7c0,  7,  7,    7]
    ])

    imputer_comp_wt = KNNImputer(weights="distance")
    assert_array_almost_equal(imputer_comp_wt.fit_transform(X), X_imputed)
Example #18
def impute_missing_for_dataframe(dataframe, target='job_performance'):
    """ The imputer function should be used on a dataframe that has already been numerically encoded """
    from missingpy import KNNImputer #, MissForest
    
    X = dataframe.loc[:, dataframe.columns != target].values
    y = dataframe[target].values

    # imputer object
    knn = KNNImputer(n_neighbors=5, 
                    weights="uniform",
                    metric="masked_euclidean",
                    row_max_missing=0.8,
                    col_max_missing=0.8, 
                    copy=True)
    knn_missing_imputation = knn.fit_transform(X)
    imputed_dataframe = pd.DataFrame(knn_missing_imputation, 
                                     columns = dataframe.columns[dataframe.columns != target])
    imputed_dataframe[target] = pd.Series(y)
    return imputed_dataframe
Example #19
    def fit(self, dataset):
        """Train standard imputation model.
    
        Args:
            - dataset: incomplete dataset
        """
        if dataset.static_feature is not None:
            # MICE
            if self.imputation_model_name == 'mice':
                self.imputation_model = IterativeImputer()
            # MissForest
            elif self.imputation_model_name == 'missforest':
                self.imputation_model = MissForest()
            # KNN
            elif self.imputation_model_name == 'knn':
                self.imputation_model = KNNImputer()

            self.imputation_model.fit(dataset.static_feature)

        return
Example #20
def impute_times(final,
                 times_open,
                 times_closed,
                 columns,
                 imputation_method="mean"):
    """
    Impute open work items times with different methods
    :param final: Complete preprocessed dataframe
    :param times_open: Dataframe of work items that are not closed
    :param times_closed: Dataframe of work items that are closed
    :param columns: Columns to impute
    :param imputation_method: Choose between 'mean', 'KNN', 'forest'
    :return: Dataframe of open work items with imputed values
    """
    if imputation_method == "mean":
        for col in columns:
            mean = times_closed[col].mean()
            mask = (times_open[col] == 0)
            times_open[col].mask(mask, mean, inplace=True)
    if imputation_method in ["KNN", "forest"]:
        if imputation_method == "KNN":
            imputer = KNNImputer(missing_values=0, col_max_missing=0.9)
        if imputation_method == "forest":
            imputer = MissForest(missing_values=0)
        for col in columns:
            try:
                val = imputer.fit_transform(pd.DataFrame(final[col]))[:, 0]
                other = pd.DataFrame(index=final.index,
                                     data=val,
                                     columns=[col])
                mask = (times_open[col] == 0)
                times_open.loc[mask, col] = other
            except ValueError:
                imputer = KNNImputer(missing_values=0, col_max_missing=0.99)
    return times_open
Example #21
def test_weight_uniform():
    X = np.array([
        [0,      0],
        [np.nan, 2],
        [4,      3],
        [5,      6],
        [7,      7],
        [9,      8],
        [11,     10]
    ])

    # Test with "uniform" weight (or unweighted)
    X_imputed_uniform = np.array([
        [0,      0],
        [5,      2],
        [4,      3],
        [5,      6],
        [7,      7],
        [9,      8],
        [11,     10]
    ])

    imputer = KNNImputer(weights="uniform")
    assert_array_equal(imputer.fit_transform(X), X_imputed_uniform)

    # Test with "callable" weight
    def no_weight(dist=None):
        return None

    imputer = KNNImputer(weights=no_weight)
    assert_array_equal(imputer.fit_transform(X), X_imputed_uniform)
Example #22
    def create_2d_velocity_field(self,
                                 radii,
                                 v_rot,
                                 n_interp_r=150,
                                 n_interp_theta=150):
        '''
        uses tilted ring model parameters to calculate velocity field
        using eqn 1-3 of 1709.02049 and v_rot from mass model

        it is easier to loop through polar coordinates and then map the v_los to the
        nearest x,y point

        returns 2d velocity field array
        '''
        v_field = np.empty(shape=(self.image_ydim, self.image_xdim))
        v_field[:] = np.nan
        v_rot_interp = interp1d(radii, v_rot)
        radii_interp = np.linspace(np.min(radii), np.max(radii), n_interp_r)
        for r in radii_interp:
            v = v_rot_interp(r)
            for theta in np.linspace(0, 2. * np.pi, n_interp_theta):
                x, y, v_los = self._calc_v_los_at_r_theta(v, r, theta)
                if (self.image_xdim - 1 > x > 0 and y < self.image_ydim - 1
                        and y > 0):
                    arr_x, arr_y = int(np.round(x, 0)), int(np.round(y, 0))
                    try:
                        v_field[arr_y][arr_x] = v_los
                    except:
                        print(arr_x, arr_y, v_los)
        near_neighbors_mask = create_blurred_mask(v_field)
        imputer = KNNImputer(n_neighbors=3, weights="distance")
        v_field = imputer.fit_transform(
            np.where(near_neighbors_mask == 1, v_field, 0.))
        v_field[v_field == 0] = np.nan

        # rotate to match the fits data field
        v_field = np.rot90(v_field, 3)
        return v_field
Example #23
def Impute_Data_KNN(X_train, y_train, X_test, y_test, vals_mask, cols, data,
                    var, min_vals, max_vals):

    XY_incomplete_train = np.concatenate((X_train, y_train.reshape(-1, 1)),
                                         axis=1)
    XY_incomplete_test = np.concatenate((X_test, y_test.reshape(-1, 1)),
                                        axis=1)

    imputer = KNNImputer(n_neighbors=5)
    XY_completed_train = imputer.fit_transform(XY_incomplete_train)
    XY_completed_test = imputer.transform(XY_incomplete_test)

    X_train_imp = (XY_completed_train[:, 0:data.shape[1]])
    y_train_imp_orig = np.array(XY_completed_train[:, data.shape[1]],
                                dtype="int16")
    y_train_imp = np.array(XY_completed_train[:, data.shape[1]] >= 5,
                           dtype="int16")
    X_test_imp = (XY_completed_test[:, 0:data.shape[1]])
    y_test_imp = np.array(XY_completed_test[:, data.shape[1]] >= 5,
                          dtype="int16")
    y_test_imp_orig = np.array(XY_completed_test[:, data.shape[1]],
                               dtype="int16")

    for j in range(0, X_train_imp.shape[1]):
        if var.iloc[j]['type'] == 'cat':
            X_train_imp[:, j] = np.clip(np.round(X_train_imp[:, j]),
                                        min_vals[j], max_vals[j])
            X_test_imp[:, j] = np.clip(np.round(X_test_imp[:, j]), min_vals[j],
                                       max_vals[j])
        else:
            X_train_imp[:, j] = np.round(X_train_imp[:, j], decimals=1)
            X_test_imp[:, j] = np.round(X_test_imp[:, j], decimals=1)

    #min_vals_imp=np.nanmin(np.concatenate((X_train_imp,X_test_imp),axis=0),axis=0)
    #max_vals_imp=np.nanmax(np.concatenate((X_train_imp,X_test_imp),axis=0),axis=0)

    return (X_train_imp, y_train_imp, X_test_imp, y_test_imp, y_train_imp_orig,
            y_test_imp_orig)
Example #24
def clean_dragon(save=False):
    source = os.path.join(DATA_DIR, "cids-smiles-dragon.txt")
    df = pd.read_csv(source).set_index("CID")
    df = df.iloc[:, 1:]  # Drop SMILES column

    # Scale to mean 0, variance 1
    ss = StandardScaler()
    good = df.columns[df.isnull().sum() < 500]
    df = df[good]
    scaled = ss.fit_transform(df.astype("float"))
    df = pd.DataFrame(scaled, index=df.index, columns=df.columns)

    # Impute missing values
    knn = KNNImputer(n_neighbors=5)
    imputed = knn.fit_transform(df.values)
    df = pd.DataFrame(imputed, index=df.index, columns=df.columns)

    # Optionally save to disk
    if save:
        dest = os.path.join(DATA_DIR, "cids-smiles-dragon-scaled-imputed.txt")
        df.to_csv(dest)

    return df
Example #25
def test_metric_type():
    X = np.array([
        [0,      0],
        [np.nan, 2],
        [4,      3],
        [5,      6],
        [7,      7],
        [9,      8],
        [11,     10]
    ])

    # Test with a metric type without NaN support
    imputer = KNNImputer(metric="euclidean")
    bad_metric_msg = "The selected metric does not support NaN values."
    assert_raise_message(ValueError, bad_metric_msg, imputer.fit, X)
Example #26
    def fit(self, dataset):
        """Train standard imputation model.
        
        Args:
            - dataset: incomplete dataset
        """
        if dataset.static_feature is not None:
            # MICE
            if self.imputation_model_name == "mice":
                # TODO: Resolve the below:
                raise NotImplementedError(
                    "IterativeImputer not implemented due to versioning issues with fancyimpute"
                )
                # self.imputation_model = IterativeImputer()
            # MissForest
            elif self.imputation_model_name == "missforest":
                self.imputation_model = MissForest()
            # KNN
            elif self.imputation_model_name == "knn":
                self.imputation_model = KNNImputer()

            self.imputation_model.fit(dataset.static_feature)

        return
Example #27
def imputeMatrix(dataM):
	imputer = KNNImputer(n_neighbors=10)
	dataT = imputer.fit_transform(dataM)
	return dataT
Example #28
        out[index1, index2] = np.nan
        Mask[index1, index2] = 0
    Missing = Image.fromarray(rgbArray)
    plt.imshow(Missing)
    plt.show()
    return out


SelectedImage = showImagesRandomImages(
    3)  # select an image randomly from the MNIST dataset
missingPercentage = 0.2  # missing rate percentage
missingImage = generateMissingFig(
    SelectedImage,
    missingPercentage)  # insert missing values into the original image

imputer = KNNImputer(n_neighbors=2, weights="uniform")
imputed_by_KNN = imputer.fit_transform(missingImage)
KNNImputed_RMSE = mean_squared_error(SelectedImage, imputed_by_KNN)
#plt.imshow(imputed_by_KNN, cmap='gray', vmin=0, vmax=1)
#plt.show()

imputer = MissForest()
MissForest_imputed = imputer.fit_transform(missingImage)
MissForest_RMSE = mean_squared_error(SelectedImage, MissForest_imputed)
#plt.imshow(MissForest_imputed, cmap='gray', vmin=0, vmax=1)
#plt.show()

imputer = IterativeImputer()
MICE_imputed = imputer.fit_transform(missingImage)
MICE_RMSE = mean_squared_error(SelectedImage, MICE_imputed)
#plt.imshow(MICE_imputed, cmap='gray', vmin=0, vmax=1)
Example #29
def imputeKNN(data, **kwargs):
    imputer = KNNImputer(**kwargs)
    imputedData = imputer.fit_transform(data)
    imputedData = pd.DataFrame(imputedData, index=data.index, columns=data.columns)
    return imputedData
Example #30
def perform_knn_imputation(dfs):
    knn_imputed_datasets = [KNNImputer(n_neighbors=100, copy = True).fit_transform(dfs[i]) for i in range(len(dfs))]
    return [pd.DataFrame(data=knn_imputed_datasets[i]) for i in range(len(dfs))]