Exemplo n.º 1
0
def deploy(file_name):
    """Impute the ``wdir`` and ``gph`` columns of a CSV file with MissForest.

    Reads ``<file_name>.csv``, keeps its last 30000 rows, treats ``-9999`` as
    the missing-value marker, imputes a three-column table (original row
    label, ``wdir``, ``gph``) and writes the result to ``1filmiss.xls``.

    Args:
        file_name: Path of the input CSV file without the ``.csv`` suffix.

    Returns:
        None. The imputed table is written to disk and previews are printed.
    """
    df = pd.read_csv(file_name + '.csv')
    df = df.tail(30000)
    df = df.replace(to_replace=-9999, value=np.nan)

    df_new = pd.DataFrame()
    df_new['wdir_new'] = df['wdir']
    df_new['gph'] = df['gph']
    # reset_index() turns the original row labels into a leading "index"
    # column, so the imputer sees three columns: index, wdir_new, gph.
    df_new.reset_index(inplace=True)
    print(df_new.head())

    imputer = MissForest()
    df_new = pd.DataFrame(imputer.fit_transform(df_new))
    # rename() returns a new frame; the result has to be kept, otherwise the
    # columns silently stay 0, 1, 2.
    df_new = df_new.rename(columns={0: 'a', 1: 'b', 2: 'c'})
    print(df_new.columns)
    print(df_new.head())

    # The former ``df = df.join(df_new)`` was dropped: its result was never
    # used, and the two frames do not share row labels after tail()/reset.
    df_new.to_excel("1filmiss.xls")
Exemplo n.º 2
0
def reconstruct(dataset, mask):
    """Impute the masked-out entries of ``dataset`` with MissForest.

    Args:
        dataset: DataFrame of numeric values whose row labels include
            ``0 .. n_rows - 1`` (rows are looked up by those labels).
        mask: DataFrame with the same row labels and column count. A ``0``
            marks the entry as missing; any other value multiplies the
            corresponding data value.

    Returns:
        DataFrame holding the imputed data, with default integer row and
        column labels.

    Raises:
        KeyError: If a row label in ``0 .. n_rows - 1`` is absent from
            ``dataset`` or ``mask``.
    """
    print('Reconstructing using MissForest...')

    n_rows, _ = np.shape(dataset)
    row_labels = list(range(n_rows))

    # MissForest identifies missing entries as NaN, so zeros in the mask are
    # turned into NaN before multiplying. Working on a float copy keeps the
    # caller's mask untouched and also works for integer or read-only masks.
    values = dataset.loc[row_labels].to_numpy(dtype=float)
    weights = mask.loc[row_labels].to_numpy(dtype=float)
    weights = np.where(weights == 0, np.nan, weights)
    incomplete_dataset = pd.DataFrame(values * weights)

    imputer = MissForest(max_iter=5, verbose=0)
    reconstructed_dataset = imputer.fit_transform(incomplete_dataset)

    print(np.shape(reconstructed_dataset))
    print(reconstructed_dataset)

    return pd.DataFrame(reconstructed_dataset)
Exemplo n.º 3
0
def Impute_Data_RF(X_train, y_train, X_test, y_test, vals_mask, cols):
    """Impute features and target jointly with MissForest, then post-process.

    The target is appended to the features as a last column, the imputer is
    fitted on the training block and applied to the test block. The imputed
    target is binarised with ``>= 5``; imputed features are rounded, and
    categorical ones are additionally clipped to ``[min_vals[j], max_vals[j]]``.

    Reads the module-level names ``data``, ``var``, ``min_vals`` and
    ``max_vals``. ``vals_mask`` and ``cols`` are accepted but not used.

    Returns:
        Tuple ``(X_train_imp, y_train_imp, X_test_imp, y_test_imp)``.
    """
    train_block = np.concatenate((X_train, y_train.reshape(-1, 1)), axis=1)
    test_block = np.concatenate((X_test, y_test.reshape(-1, 1)), axis=1)

    forest = MissForest(random_state=1, n_jobs=-1)
    train_filled = forest.fit_transform(train_block)
    test_filled = forest.transform(test_block)

    # Column count of the global feature table marks where the target sits.
    n_features = data.shape[1]
    X_train_imp = train_filled[:, 0:n_features]
    y_train_imp = np.array(train_filled[:, n_features] >= 5, dtype="int16")
    X_test_imp = test_filled[:, 0:n_features]
    y_test_imp = np.array(test_filled[:, n_features] >= 5, dtype="int16")

    for j in range(0, X_train_imp.shape[1]):
        is_categorical = var.iloc[j]['type'] == 'cat'
        for block in (X_train_imp, X_test_imp):
            if is_categorical:
                # Categorical codes: nearest integer, kept inside the known range.
                block[:, j] = np.clip(np.round(block[:, j]), min_vals[j],
                                      max_vals[j])
            else:
                block[:, j] = np.round(block[:, j], decimals=1)

    return (X_train_imp, y_train_imp, X_test_imp, y_test_imp)
def test_statstics_fit_transform():
    """statistics_ must describe the fit() data even when transform() gets other data."""
    X_fit = np.array([
        [1, 0, 0, 1],
        [2, 1, 2, 2],
        [3, 2, 3, 2],
        [np.nan, 4, 5, 5],
        [6, 7, 6, 7],
        [8, 8, 8, 8],
        [16, 15, 18, 19],
    ])
    expected_means = np.nanmean(X_fit, axis=0)

    # Differs from X_fit in the first, second and sixth rows.
    X_other = np.array([
        [0, 0, 0, 0],
        [2, 2, 2, 1],
        [3, 2, 3, 2],
        [np.nan, 4, 5, 5],
        [6, 7, 6, 7],
        [9, 9, 8, 8],
        [16, 15, 18, 19],
    ])

    forest = MissForest()
    forest.fit(X_fit).transform(X_other)
    observed_means = forest.statistics_.get('col_means')
    assert_array_equal(observed_means, expected_means)
def test_missforest_imputation_shape():
    """The imputed matrix must have the same shape as the generated input."""
    expected_shape = (10, 2)
    X = gen_array(expected_shape[0], expected_shape[1])
    forest = MissForest()
    result = forest.fit_transform(X)
    assert_equal(result.shape, expected_shape)
Exemplo n.º 6
0
def rf_imputing(data):
  """Impute every column except 'VALUE_PER_UNIT' with MissForest.

  Returns whatever the imputer's fit_transform returns for the remaining
  columns; 'VALUE_PER_UNIT' is not re-attached.
  """
  features = data.drop(columns='VALUE_PER_UNIT')
  forest = MissForest(verbose=True)
  return forest.fit_transform(features)
def test_missforest_categorical_multiple():
    """Compare MissForest with a hand-written classifier loop on categorical data.

    Columns 0 and 2 are declared categorical and each contains one NaN. The
    reference result is rebuilt with RandomForestClassifier, starting from the
    column modes and running for the iteration count the imputer reports.
    """
    # Test with two missing values for multiple iterations
    df = np.array([
        [0, 0, np.nan, 1],
        [0, 1, 1, 2],
        [0, 2, 1, 2],
        [np.nan, 4, 1, 5],
        [1, 7, 0, 7],
        [1, 8, 0, 8],
        [1, 15, 0, 19],
        [1, 18, 0, 17],
    ])
    cat_vars = [0, 2]
    # Per-column mode ignoring NaNs (presumably scipy.stats.mode -- confirm import)
    statistics_mode = mode(df, axis=0, nan_policy='omit').mode[0]
    n_rows, n_cols = df.shape

    # Fit missforest and transform
    imputer = MissForest(random_state=1337)
    df_imp1 = imputer.fit_transform(df, cat_vars=cat_vars)

    # Get iterations used by missforest above
    max_iter = imputer.iter_count_

    # Get NaN mask
    nan_mask = np.isnan(df)
    nan_rows, nan_cols = np.where(nan_mask)

    # Make initial guess for missing values
    df_imp2 = df.copy()
    df_imp2[nan_rows, nan_cols] = np.take(statistics_mode, nan_cols)

    # Loop for max_iter count over the columns with NaNs
    for _ in range(max_iter):
        for c in nan_cols:
            # Identify all other columns (i.e. predictors)
            not_c = np.setdiff1d(np.arange(n_cols), c)
            # Identify rows with NaN and those without in 'c'
            y = df_imp2[:, c]
            X = df_imp2[:, not_c]
            good_rows = np.where(~nan_mask[:, c])[0]
            bad_rows = np.where(nan_mask[:, c])[0]

            # Fit model and predict
            rf = RandomForestClassifier(n_estimators=100, random_state=1337)
            rf.fit(X=X[good_rows], y=y[good_rows])
            pred_val = rf.predict(X[bad_rows])

            # Fill in values
            df_imp2[bad_rows, c] = pred_val

    # The imputer output must match the manual reconstruction exactly
    assert_array_equal(df_imp1, df_imp2)
    # Stored modes must equal the modes of the categorical columns
    assert_array_equal(
        imputer.statistics_.get('col_modes')[0], statistics_mode[cat_vars])
def main(p_miss=0.5, dataset="drive", mode="mcar", para=0.5, train=None, rand_seed=42):
    """Load a dataset with missing entries, impute it with MissForest and score it.

    Seeds NumPy's global random generator with ``rand_seed``, forwards the
    remaining arguments to ``load_data``, imputes the returned incomplete
    matrix and prints the error computed by ``mse_own``.

    Returns:
        Tuple ``(x_filled, mse)``: the imputed matrix and its error value.
    """
    np.random.seed(rand_seed)

    # load_data returns seven values; only three of them are needed here.
    _, _, xmiss, _, mask, data_x, _ = load_data(
        p_miss, dataset=dataset, mode=mode, para=para, train=train,
        rand_seed=rand_seed)

    forest = MissForest(decreasing=True, random_state=rand_seed, verbose=True)
    x_filled = forest.fit_transform(xmiss)

    mse = mse_own(x_filled, data_x, mask)
    print("MSE for MissForest: ", mse)

    return x_filled, mse
Exemplo n.º 9
0
def mf_impute(inp, subject=None, cols=None, categorical_variables=None):
    """Impute missing values of selected columns/subjects with MissForest.

    Args:
        inp: Input DataFrame. It is deep-copied, so the caller's frame is
            not modified.
        subject: Iterable of subject identifiers to impute, matched through
            ``get_subject`` on the first column. ``None`` selects all rows.
        cols: Columns to impute. ``None`` selects every column but the first.
        categorical_variables: Column names to treat as categorical. Those
            with object dtype are label-encoded before imputation and
            decoded afterwards.

    Returns:
        Copy of ``inp`` in which the selected rows/columns are replaced by
        their imputed values.

    Raises:
        Exception: If fewer than two columns are selected.
    """
    data = copy.deepcopy(inp)
    # Prepare input
    if cols is None:
        cols = data.columns[1:]
    if subject is None:
        # Explicit copy: encoded columns are written into this frame below
        # and must not touch (or warn about) ``data``.
        inp = data[cols].copy()
    else:
        # DataFrame.append was removed in pandas 2.0; collect the
        # per-subject frames and concatenate them once instead.
        frames = [
            get_subject(data, s, data.columns[0]).loc[:, cols]
            for s in subject
        ]
        inp = pandas.concat(frames) if frames else pandas.DataFrame()
    if len(inp.columns) < 2:
        raise Exception("Multiple variables must be given as input")

    # Encode string columns (only categorical variables are encoded) and
    # translate categorical column names into positional indices.
    labels = {}
    cat_vars = None
    if categorical_variables is not None:
        for col in categorical_variables:
            if inp[col].dtype == np.dtype(object):
                encoded, _, label = label_encode(inp[col])
                inp[col] = encoded
                labels[col] = label
        column_names = list(inp.columns)
        cat_vars = [column_names.index(name) for name in categorical_variables]

    # Fit and transform the input
    imputer = MissForest()
    res = imputer.fit_transform(inp.values, cat_vars=cat_vars)
    res = pandas.DataFrame(res, index=inp.index, columns=inp.columns)

    # Convert encoded columns back to strings
    for col, encoder in labels.items():
        res[col] = encoder.inverse_transform(res[col].astype(int))

    data.loc[res.index, res.columns] = res
    return data
Exemplo n.º 10
0
def impute_times(final,
                 times_open,
                 times_closed,
                 columns,
                 imputation_method="mean"):
    """
    Impute open work items times with different methods.

    A value of ``0`` in ``times_open`` is treated as missing. ``times_open``
    is modified in place and also returned.

    :param final: Complete preprocessed dataframe
    :param times_open: Dataframe of work items that are not closed
    :param times_closed: Dataframe of work items that are closed
    :param columns: Columns to impute
    :param imputation_method: Choose between 'mean', 'KNN', 'forest'
    :return: Dataframe of open work items with imputed values
    """
    if imputation_method == "mean":
        for col in columns:
            mean = times_closed[col].mean()
            mask = (times_open[col] == 0)
            # Assign the masked column back instead of calling
            # ``times_open[col].mask(..., inplace=True)``: the in-place call
            # works on a temporary Series and is not guaranteed to reach
            # ``times_open`` (it never does under copy-on-write).
            times_open[col] = times_open[col].mask(mask, mean)
    if imputation_method in ["KNN", "forest"]:
        if imputation_method == "KNN":
            imputer = KNNImputer(missing_values=0, col_max_missing=0.9)
        else:
            imputer = MissForest(missing_values=0)
        for col in columns:
            try:
                val = imputer.fit_transform(pd.DataFrame(final[col]))[:, 0]
                other = pd.DataFrame(index=final.index,
                                     data=val,
                                     columns=[col])
                mask = (times_open[col] == 0)
                times_open.loc[mask, col] = other
            except ValueError:
                # NOTE(review): the failing column is left as-is and every
                # following column uses this more permissive KNN imputer,
                # even when 'forest' was requested -- confirm this is intended.
                imputer = KNNImputer(missing_values=0, col_max_missing=0.99)
    return times_open
Exemplo n.º 11
0
class MissForestImputer(object):
    """MissForest wrapper that accepts DataFrames with non-numeric columns.

    Non-numeric columns are ordinal-encoded, imputed as categorical
    variables and decoded back to their original categories.
    """

    def __init__(self):
        self.imputer = MissForest(verbose=0)

    def encode_cat(self, X_c):
        """Ordinal-encode the non-null values of one column.

        Args:
            X_c: Series holding one categorical column.

        Returns:
            Tuple ``(encoded, encoder)``: an object-dtype copy of ``X_c``
            whose non-null values are replaced by ordinal codes, and the
            fitted ``OrdinalEncoder``.
        """
        # Work on an object-dtype copy so numeric codes can be stored
        # whatever the source dtype is (str, category, bool, ...).
        data = X_c.astype(object)
        observed = data.notnull()
        encoder = OrdinalEncoder()
        codes = encoder.fit_transform(X_c.dropna().values.reshape(-1, 1))
        data.loc[observed] = codes.ravel()
        return data, encoder

    def decode_cat(self, X_c, encoder):
        """Map (possibly non-integer) codes back to the original categories.

        Codes are rounded and clipped to the valid range of ``encoder``
        before decoding. Null entries are left untouched.
        """
        # Decoded categories are generally not floats; use an object-dtype
        # copy so they are not written into a float Series.
        data = X_c.astype(object)
        observed = data.notnull()
        codes = X_c.dropna().values.reshape(-1, 1)
        n_cat = len(encoder.categories_[0])
        codes = np.round(codes).clip(0, n_cat - 1)
        data.loc[observed] = encoder.inverse_transform(codes).ravel()
        return data

    def fit_transform(self, X):
        """Impute all missing values of ``X``.

        Args:
            X: DataFrame with numeric and/or non-numeric columns.

        Returns:
            DataFrame with the columns and row index of ``X`` and no
            missing values.
        """
        num_X = X.select_dtypes(include='number')
        cat_X = X.select_dtypes(exclude='number')
        has_categorical = cat_X.shape[1] > 0

        # encode the categorical columns to numeric columns
        if has_categorical:
            cat_encoders = {}
            encoded_columns = []
            for c in cat_X.columns:
                X_c_enc, encoder = self.encode_cat(cat_X[c])
                encoded_columns.append(X_c_enc)
                cat_encoders[c] = encoder
            X_enc = pd.concat([num_X, pd.concat(encoded_columns, axis=1)],
                              axis=1)
            cat_columns = cat_X.columns
            cat_indices = [
                i for i, c in enumerate(X_enc.columns) if c in cat_columns
            ]
        else:
            X_enc = X
            cat_indices = None

        X_imp = self.imputer.fit_transform(X_enc.values.astype(float),
                                           cat_vars=cat_indices)
        # Keep the caller's row labels so the result aligns with ``X``.
        X_imp = pd.DataFrame(X_imp, columns=X_enc.columns, index=X_enc.index)

        if has_categorical:
            num_X_imp = X_imp[num_X.columns]
            decoded_columns = [
                self.decode_cat(X_imp[c], cat_encoders[c])
                for c in cat_X.columns
            ]
            X_imp = pd.concat(
                [num_X_imp, pd.concat(decoded_columns, axis=1)], axis=1)

        # Restore the original column order
        X_imp = X_imp[X.columns]
        return X_imp
Exemplo n.º 12
0
def impute_values(df: pd.DataFrame, method: str = 'mean', **kwargs):
    """
    Fill the missing entries (np.nan or None) of a DataFrame.
    ------------------------
    Args:
        * df: pd.DataFrame of (samples x features)
        * method: name of the imputation method
            ** 'mean': replace by the column mean
            ** 'knn': K-NN imputation (see missingpy.KNNImputer)
            ** 'rf': random forest imputation (see missingpy.MissForest)
        * kwargs: forwarded to the 'knn' / 'rf' imputer constructor

    Returns:
        * pd.DataFrame: imputed values (samples x features)
    """
    assert method in ('mean','knn','rf'), '{} not yet implemented.'.format(method)

    if method == 'mean':
        column_means = df.mean(0)
        return df.fillna(column_means)

    if method == 'knn':
        imputer = KNNImputer(**kwargs)
    elif method == 'rf':
        imputer = MissForest(**kwargs)
    else:
        # Only reachable when assertions are disabled.
        return None

    filled = imputer.fit_transform(df.values)
    return pd.DataFrame(filled, index=df.index, columns=df.columns)
Exemplo n.º 13
0
def super_fillna(pre_tr_x, pre_te_x, target_col, how="mean"):
    """Fill missing values in copies of the train and test frames.

    For ``how`` equal to "mean" or "median" the statistic is taken from
    ``target_col`` of the training frame and used as the fill value for both
    frames. For "rf" a MissForest imputer is fitted on the training
    ``target_col`` selection and applied to both frames. Any other ``how``
    returns the unmodified copies.

    Returns:
        Tuple ``(tr_x, te_x)`` of filled copies; the inputs are not modified.
    """
    tr_x, te_x = pre_tr_x.copy(), pre_te_x.copy()

    if how in ("mean", "median"):
        train_target = tr_x[target_col]
        if how == "mean":
            fill_value = train_target.mean()
        else:
            fill_value = train_target.median()
        for frame in (tr_x, te_x):
            frame.fillna(fill_value, inplace=True)
    elif how == "rf":
        forest = MissForest()
        tr_x[target_col] = forest.fit_transform(tr_x[target_col])
        te_x[target_col] = forest.transform(te_x[target_col])

    return tr_x, te_x
def test_default_with_invalid_input():
    """Default-parameter imputation must reject unusable input with ValueError."""
    inf_msg = "+/- inf values are not supported."

    # A column in which every row is missing
    X_all_missing = np.array([
        [np.nan, 0, 0, 1],
        [np.nan, 1, 2, np.nan],
        [np.nan, 2, 3, np.nan],
        [np.nan, 4, 5, 5],
    ])
    seeded = MissForest(random_state=1337)
    assert_raise_message(ValueError,
                         "One or more columns have all rows missing.",
                         seeded.fit, X_all_missing)

    # inf present in the matrix passed to fit()
    X_inf = np.array([
        [np.inf, 1, 1, 2, np.nan],
        [2, 1, 2, 2, 3],
        [3, 2, 3, 3, 8],
        [np.nan, 6, 0, 5, 13],
        [np.nan, 7, 0, 7, 8],
        [6, 6, 2, 5, 7],
    ])
    assert_raise_message(ValueError, inf_msg, MissForest().fit, X_inf)

    # inf present in the matrix passed to transform() after a valid fit()
    X_inf_transform = X_inf.copy()
    X_fit = np.array([
        [0, 1, 1, 2, np.nan],
        [2, 1, 2, 2, 3],
        [3, 2, 3, 3, 8],
        [np.nan, 6, 0, 5, 13],
        [np.nan, 7, 0, 7, 8],
        [6, 6, 2, 5, 7],
    ])
    fitted = MissForest().fit(X_fit)
    assert_raise_message(ValueError, inf_msg, fitted.transform,
                         X_inf_transform)
Exemplo n.º 15
0
    def imputer(self, _steps, _answers, train_dataset, _X_train, _y_train,
                test_dataset, _X_test, _y_test, _headers):
        """Impute the train and test features with the imputer chosen in the steps.

        The arguments are stored on the instance. The imputer is selected
        from the answer of each 'imputer' step ('Miss Forest' or
        'KNN Miss Values'); when several steps match, the last one wins. It
        is fitted on the training features and applied to both sets.
        ``train_dataset`` and ``test_dataset`` are accepted but not used.

        Returns:
            Tuple ``(new_train_dataset, new_test_dataset)`` of DataFrames
            whose feature columns are named ``_headers[:-1]`` and whose last
            column, ``_headers[-1]``, holds the target.

        Raises:
            ValueError: If no step selects a supported imputer.
        """
        self.steps = _steps
        self.answers = _answers
        self.X_train = _X_train
        self.y_train = _y_train
        self.X_test = _X_test
        self.y_test = _y_test
        self.headers = _headers

        self.train_pipe_steps = []

        selected = None
        for i, s in enumerate(self.steps):
            if s != 'imputer':
                continue
            choice = self.answers[i][s]
            if choice == 'Miss Forest':
                selected = MissForest()
            elif choice == 'KNN Miss Values':
                selected = KNNImputer(n_neighbors=2)

        # Fail with a clear message instead of an UnboundLocalError further down.
        if selected is None:
            raise ValueError(
                "No supported imputer ('Miss Forest' or 'KNN Miss Values') "
                "was selected in the pipeline steps.")

        selected.fit(self.X_train, self.y_train)
        self.X_train = selected.transform(self.X_train)
        self.X_test = selected.transform(self.X_test)

        feature_names = self.headers[:-1]
        target_name = self.headers[-1]

        self.new_train_dataset = pd.DataFrame(self.X_train,
                                              columns=feature_names)
        self.new_train_dataset[target_name] = self.y_train

        self.new_test_dataset = pd.DataFrame(self.X_test,
                                             columns=feature_names)
        self.new_test_dataset[target_name] = self.y_test

        return self.new_train_dataset, self.new_test_dataset
Exemplo n.º 16
0
def missforest_imputer(pd_data, random_state=None):
    """
    Fill the missing values of a DataFrame with MissForest.

    Inputs:
        pd_data: (DataFrame) Data containing missing values.
        random_state: (int, optional) Seed handed to the imputer's
            pseudo random number generator.

    Returns:
        pd_imputed: (DataFrame) Imputed data carrying the index and
            columns of ``pd_data``.
    """
    forest = MissForest(random_state=random_state)
    filled = forest.fit_transform(pd_data)
    return pd.DataFrame(filled, index=pd_data.index, columns=pd_data.columns)
def test_missforest_numerical_single():
    """One missing numerical value must be imputed like a plain regressor would.

    The expected value is predicted by a RandomForestRegressor configured
    with the same ``n_estimators`` and ``random_state`` as the imputer.
    """
    # Test with a single missing value
    df = np.array([
        [1, 0, 0, 1],
        [2, 1, 2, 2],
        [3, 2, 3, 2],
        [np.nan, 4, 5, 5],
        [6, 7, 6, 7],
        [8, 8, 8, 8],
        [16, 15, 18, 19],
    ])
    statistics_mean = np.nanmean(df, axis=0)

    y = df[:, 0]
    X = df[:, 1:]
    good_rows = np.where(~np.isnan(y))[0]
    bad_rows = np.where(np.isnan(y))[0]

    rf = RandomForestRegressor(n_estimators=10, random_state=1337)
    rf.fit(X=X[good_rows], y=y[good_rows])
    # predict() returns a length-1 array; take the scalar so the literal
    # below stays a regular (non-ragged) nested list.
    pred_val = rf.predict(X[bad_rows])[0]

    df_imputed = np.array([
        [1, 0, 0, 1],
        [2, 1, 2, 2],
        [3, 2, 3, 2],
        [pred_val, 4, 5, 5],
        [6, 7, 6, 7],
        [8, 8, 8, 8],
        [16, 15, 18, 19],
    ])

    imputer = MissForest(n_estimators=10, random_state=1337)
    assert_array_equal(imputer.fit_transform(df), df_imputed)
    assert_array_equal(imputer.statistics_.get('col_means'), statistics_mean)
def test_missforest_categorical_single():
    """One missing categorical value must be imputed like a plain classifier would.

    The expected value is predicted by a RandomForestClassifier configured
    with the same ``n_estimators`` and ``random_state`` as the imputer.
    ``cat_vars`` is checked both as a scalar and as a list.
    """
    # Test with a single missing value
    df = np.array([
        [0, 0, 0, 1],
        [0, 1, 2, 2],
        [0, 2, 3, 2],
        [np.nan, 4, 5, 5],
        [1, 7, 6, 7],
        [1, 8, 8, 8],
        [1, 15, 18, 19],
    ])

    y = df[:, 0]
    X = df[:, 1:]
    good_rows = np.where(~np.isnan(y))[0]
    bad_rows = np.where(np.isnan(y))[0]

    rf = RandomForestClassifier(n_estimators=10, random_state=1337)
    rf.fit(X=X[good_rows], y=y[good_rows])
    # predict() returns a length-1 array; take the scalar so the literal
    # below stays a regular (non-ragged) nested list.
    pred_val = rf.predict(X[bad_rows])[0]

    df_imputed = np.array([
        [0, 0, 0, 1],
        [0, 1, 2, 2],
        [0, 2, 3, 2],
        [pred_val, 4, 5, 5],
        [1, 7, 6, 7],
        [1, 8, 8, 8],
        [1, 15, 18, 19],
    ])

    imputer = MissForest(n_estimators=10, random_state=1337)
    assert_array_equal(imputer.fit_transform(df, cat_vars=0), df_imputed)
    assert_array_equal(imputer.fit_transform(df, cat_vars=[0]), df_imputed)
Exemplo n.º 19
0
 def define_imputer(self, impute_type):
     '''Create the imputer used for every iteration and store it on the instance.

     Input:
         impute_type: string, one of 'simple' (SimpleImputer),
         'iterative' (IterativeImputer) or 'forest' (MissForest).
         Any other value leaves self.imputer untouched.
     Output:
         None; the imputer object is stored in self.imputer.
     '''
     if impute_type == 'forest':
         self.imputer = MissForest(random_state=self.random_state, n_jobs=-2)
         return

     if impute_type == 'simple':
         self.imputer = SimpleImputer(
             missing_values=np.nan,
             strategy='median',
             add_indicator=self.model_args['add_missing_indicator'])
         return

     if impute_type == 'iterative':
         self.imputer = IterativeImputer(
             missing_values=np.nan,
             initial_strategy='median',
             add_indicator=self.model_args['add_missing_indicator'])
def test_missforest_zero_part2():
    """Imputing a zero-coded matrix must match imputing its NaN-coded counterpart."""
    X_zero = gen_array(min_val=1, missing_values=0)
    X_nan = gen_array(min_val=1, missing_values=np.nan)
    expected_means = np.nanmean(X_nan, axis=0)

    zero_coded = MissForest(missing_values=0, random_state=1337)
    nan_coded = MissForest(missing_values=np.nan, random_state=1337)

    imputed_zero = zero_coded.fit_transform(X_zero)
    imputed_nan = nan_coded.fit_transform(X_nan)

    assert_array_equal(imputed_zero, imputed_nan)
    assert_array_equal(zero_coded.statistics_.get("col_means"),
                       expected_means)
def test_missforest_zero():
    """With missing_values == 0, fit() must reject NaN input and all-zero columns."""
    zero_imputer = MissForest(missing_values=0, random_state=0)

    # NaN present while 0 is the declared missing marker
    X_with_nan = gen_array(min_val=0)
    nan_msg = ("Input contains NaN, infinity or a value too large for %r."
               % X_with_nan.dtype)
    assert_raise_message(ValueError, nan_msg, zero_imputer.fit, X_with_nan)

    # Third column consists of zeroes only, i.e. is entirely missing
    X_zero_column = np.array([
        [1, 0, 0, 0, 5],
        [2, 1, 0, 2, 3],
        [3, 2, 0, 0, 0],
        [4, 6, 0, 5, 13],
    ])
    assert_raise_message(ValueError,
                         "One or more columns have all rows missing.",
                         zero_imputer.fit, X_zero_column)
Exemplo n.º 22
0
    def fit(self, dataset):
        """Fit the configured standard imputation model on the static features.

        Nothing happens when ``dataset.static_feature`` is None. A model name
        other than 'mice', 'missforest' or 'knn' creates no new model; the
        existing ``self.imputation_model`` is fitted in that case.

        Args:
          - dataset: incomplete dataset
        """
        static = dataset.static_feature
        if static is None:
            return

        model_name = self.imputation_model_name
        if model_name == 'mice':
            # MICE
            self.imputation_model = IterativeImputer()
        elif model_name == 'missforest':
            # MissForest
            self.imputation_model = MissForest()
        elif model_name == 'knn':
            # KNN
            self.imputation_model = KNNImputer()

        self.imputation_model.fit(static)
        return
Exemplo n.º 23
0
 def Missforest_Imputation(self, train_index, test_index, final):
     """Impute self.full_miss_data with MissForest.

     With ``final`` truthy the whole table is fitted and transformed in one
     step. Otherwise the imputer is fitted on the ``train_index`` rows and
     applied to the ``test_index`` rows. Columns listed in
     ``self.miss_info["obj_col"]`` are passed to the imputer as categorical.

     Side effects: sets ``self.numMI`` (all cases except final with
     categorical columns) or ``self.MI_pd`` (final with categorical columns).

     Returns:
         The imputed sample: a DataFrame for final with categorical columns,
         otherwise the array produced by the imputer (or ``.values`` of the
         rebuilt frame for non-final with categorical columns).
     """
     miss_info = self.miss_info
     obj_col = deepcopy(miss_info["obj_col"])
     original_columns = miss_info["original_column"]
     cat_var = [
         position for position, column in enumerate(original_columns)
         if column in obj_col
     ]
     numeric_only = obj_col == []

     if final:
         if numeric_only:
             self.numMI = MissForest(max_depth=5).fit_transform(
                 X=self.full_miss_data.values)
             return self.numMI
         forest = MissForest(verbose=0, n_jobs=-1, max_depth=5)
         imputed = forest.fit_transform(X=self.full_miss_data.values,
                                        cat_vars=cat_var)
         self.MI_pd = pd.DataFrame(imputed, columns=original_columns)
         return self.MI_pd

     if numeric_only:
         fitted = MissForest(max_depth=5).fit(
             X=self.full_miss_data.iloc[train_index, :].values)
         self.numMI = fitted.transform(
             X=self.full_miss_data.iloc[test_index, :].values)
         return self.numMI

     fitted = MissForest(verbose=0, n_jobs=-1, max_depth=5).fit(
         X=self.full_miss_data.iloc[train_index, :].values,
         cat_vars=cat_var)
     imputed = fitted.transform(
         self.full_miss_data.iloc[test_index, :].values)
     MI_pd = pd.DataFrame(imputed, columns=original_columns)
     # Only the non-object columns are kept as the numeric result.
     self.numMI = MI_pd[self.notobj].values
     return MI_pd.values
Exemplo n.º 24
0
    def fit(self, dataset):
        """Fit the configured standard imputation model on the static features.

        Nothing happens when ``dataset.static_feature`` is None.

        Args:
            - dataset: incomplete dataset

        Raises:
            NotImplementedError: when the configured model name is "mice".
        """
        static = dataset.static_feature
        if static is None:
            return

        model_name = self.imputation_model_name
        if model_name == "mice":
            # MICE
            # TODO: Resolve the below:
            raise NotImplementedError(
                "IterativeImputer not implemented due to versioning issues with fancyimpute"
            )
        if model_name == "missforest":
            # MissForest
            self.imputation_model = MissForest()
        elif model_name == "knn":
            # KNN
            self.imputation_model = KNNImputer()

        self.imputation_model.fit(static)
        return
Exemplo n.º 25
0

# Compare four imputation methods (KNN, MissForest, IterativeImputer, PPCA)
# on a single image from which a fraction of the values has been removed.
# NOTE(review): the *_RMSE variables hold the raw result of
# mean_squared_error; no square root is taken in this script -- confirm
# whether MSE or RMSE is intended.
SelectedImage = showImagesRandomImages(
    3)  # select an image at random from the MNIST dataset
missingPercentage = 0.2  # missing rate passed to generateMissingFig
missingImage = generateMissingFig(
    SelectedImage,
    missingPercentage)  # insert missing values into the original image

# K-nearest-neighbours imputation
imputer = KNNImputer(n_neighbors=2, weights="uniform")
imputed_by_KNN = imputer.fit_transform(missingImage)
KNNImputed_RMSE = mean_squared_error(SelectedImage, imputed_by_KNN)
#plt.imshow(imputed_by_KNN, cmap='gray', vmin=0, vmax=1)
#plt.show()

# MissForest imputation with default settings
imputer = MissForest()
MissForest_imputed = imputer.fit_transform(missingImage)
MissForest_RMSE = mean_squared_error(SelectedImage, MissForest_imputed)
#plt.imshow(MissForest_imputed, cmap='gray', vmin=0, vmax=1)
#plt.show()

# IterativeImputer with default settings (results stored under MICE_* names)
imputer = IterativeImputer()
MICE_imputed = imputer.fit_transform(missingImage)
MICE_RMSE = mean_squared_error(SelectedImage, MICE_imputed)
#plt.imshow(MICE_imputed, cmap='gray', vmin=0, vmax=1)
#plt.show()

# PPCA
# NOTE(review): unlike the imputers above, PPCA is fitted on SelectedImage
# (the complete image); only transform() receives missingImage -- confirm
# this is intended.
ppca = PPCA()
ppca.fit(data=SelectedImage, d=100, verbose=True)
PPCA_imputed = ppca.transform(missingImage)
PPCA_RMSE = mean_squared_error(SelectedImage, PPCA_imputed)
Exemplo n.º 26
0
# Visualise the missing-value pattern of df
# NOTE(review): the plots below read from `dataset` while the imputation
# uses `df` -- confirm these are meant to be two different frames.
msno.matrix(df) 

# histograms and density plots
dataset['horseLevel'].plot.hist(bins=10, alpha=0.5)
dataset['sireLevel'].plot.hist(bins=10, alpha=0.5)
dataset['damLevel'].plot.hist(bins=10, alpha=0.5)
dataset['sireOfdamLevel'].plot.hist(bins=10, alpha=0.5)

# Density curve of horseLevel; hist=False is passed, so presumably bins and
# hist_kws have no visible effect -- verify against the seaborn version used.
sns.distplot(dataset['horseLevel'], hist=False, kde=True, 
             bins=int(180/5), color = 'darkblue', 
             hist_kws={'edgecolor':'black'},
             kde_kws={'linewidth': 4})


# random forest imputation (the result gets df's column names but a fresh
# default row index)
imputer = MissForest()
imputedData = imputer.fit_transform(df)
imputedData = pd.DataFrame(imputedData, columns = df.columns)


# create train/test df: each row goes to train when its random draw is < 0.8
msk = np.random.rand(len(imputedData)) < 0.8
train = imputedData[msk]
test = imputedData[~msk]

# OLS
# NOTE(review): 'const' is added to train but is not part of the exog
# column list below, so the model is fitted without it -- confirm intent.
train['const'] = 1
reg1 = sm.OLS(endog=train['horseLevel'], exog=train[['damLevel', 'sireLevel', 'sireOfdamLevel']], 
    missing='drop')

results1 = reg1.fit()
Exemplo n.º 27
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Author  : qichun tang
# @Contact    : [email protected]
from copy import deepcopy

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

df = pd.read_csv("train_classification.csv")
df_ce = deepcopy(df)
# Label-encode the observed values of each listed column; missing entries
# stay NaN so that the imputer below can fill them.
for name in ["Name", "Sex", "Ticket", "Fare", "Cabin", "Embarked"]:
    raw = df_ce[name]
    observed = ~raw.isna()
    # Build the encoded column on an object-dtype copy and assign it back as
    # a whole. Writing into a Series pulled out of the frame
    # (``col[mask] = ...``) is chained assignment and is not guaranteed to
    # reach df_ce.
    col = raw.astype(object)
    col[observed] = LabelEncoder().fit_transform(raw[observed])
    df_ce[name] = col

from missingpy import MissForest

# Impute the fully numeric table. The returned array is not stored here.
imputer = MissForest()
imputer.fit_transform(df_ce.values.astype("float"))
Exemplo n.º 28
0
 def _random_forest(self, df):
     """Impute df with MissForest (random_state=10).

     Returns a DataFrame with df's column labels and a default row index.
     """
     forest = MissForest(random_state=10)
     filled = forest.fit_transform(df)
     return pd.DataFrame(filled, columns=df.columns)
# Expose the index of miss_sum as a regular 'Name' column for plotting
miss_sum.index.names = ['Name']
miss_sum['Name'] = miss_sum.index

# plot the missing value count
sns.set(style="whitegrid", color_codes=True)
sns.barplot(x='Name', y='count', data=miss_sum)
plt.xticks(rotation=90)
plt.show()

# change Period variable: drop the characters at positions 4..13 of each string
train_data['Period'] = train_data['Period'].str.slice_replace(4, 14, '')
test_data['Period'] = test_data['Period'].str.slice_replace(4, 14, '')

# Impute missing values
from missingpy import MissForest
imputer = MissForest()
train_data_imputed = imputer.fit_transform(train_data)

# Wrap the imputed array in a DataFrame with a 0..n-1 index and the column
# names stored in train_data_columns (defined outside this section)
train_data_imputed = pd.DataFrame(
    data=train_data_imputed[0:, 0:],
    index=[i for i in range(train_data_imputed.shape[0])],
    columns=train_data_columns)

# The bare expressions below produce no output when run as a script;
# presumably left over from interactive use.
train_data_imputed.columns

#train_data_imputed.reset_index(drop=True).reset_index(drop=True)

type(train_data_imputed)
train_data_imputed.head(10)
# write the imputed table to an Excel file (no index column)
train_data_imputed.to_excel('train_data_imputed.xlsx', index=False)
def test_missforest_mixed_multiple():
    """Compare MissForest with a hand-written loop on mixed-type data.

    Column 0 is declared categorical and the remaining columns numerical;
    columns 0 and 3 each contain one NaN. The reference result is rebuilt
    with a RandomForestClassifier for the categorical column and a
    RandomForestRegressor for the numerical one, starting from the column
    mode / mean and running for the iteration count the imputer reports.
    """
    # Test with mixed data type
    df = np.array([
        [np.nan, 0, 0, 1],
        [0, 1, 2, 2],
        [0, 2, 3, 2],
        [1, 4, 5, 5],
        [1, 7, 6, 7],
        [1, 8, 8, 8],
        [1, 15, 18, np.nan],
    ])

    n_rows, n_cols = df.shape
    cat_vars = [0]
    num_vars = np.setdiff1d(range(n_cols), cat_vars)
    # Per-column mode ignoring NaNs (presumably scipy.stats.mode -- confirm import)
    statistics_mode = mode(df, axis=0, nan_policy='omit').mode[0]
    statistics_mean = np.nanmean(df, axis=0)

    # Fit missforest and transform
    imputer = MissForest(random_state=1337)
    df_imp1 = imputer.fit_transform(df, cat_vars=cat_vars)

    # Get iterations used by missforest above
    max_iter = imputer.iter_count_

    # Get NaN mask
    nan_mask = np.isnan(df)
    nan_rows, nan_cols = np.where(nan_mask)

    # Make initial guess for missing values: mode for the categorical
    # column, mean for the numerical one
    df_imp2 = df.copy()
    df_imp2[0, 0] = statistics_mode[0]
    df_imp2[6, 3] = statistics_mean[3]

    # Loop for max_iter count over the columns with NaNs
    for _ in range(max_iter):
        for c in nan_cols:
            # Identify all other columns (i.e. predictors)
            not_c = np.setdiff1d(np.arange(n_cols), c)
            # Identify rows with NaN and those without in 'c'
            y = df_imp2[:, c]
            X = df_imp2[:, not_c]
            good_rows = np.where(~nan_mask[:, c])[0]
            bad_rows = np.where(nan_mask[:, c])[0]

            # Fit model and predict
            if c in cat_vars:
                rf = RandomForestClassifier(n_estimators=100,
                                            random_state=1337)
            else:
                rf = RandomForestRegressor(n_estimators=100, random_state=1337)
            rf.fit(X=X[good_rows], y=y[good_rows])
            pred_val = rf.predict(X[bad_rows])

            # Fill in values
            df_imp2[bad_rows, c] = pred_val

    # The imputer output must match the manual reconstruction exactly
    assert_array_equal(df_imp1, df_imp2)
    # Stored statistics must equal the means / modes of the matching columns
    assert_array_equal(imputer.statistics_.get('col_means'),
                       statistics_mean[num_vars])
    assert_array_equal(
        imputer.statistics_.get('col_modes')[0], statistics_mode[cat_vars])