def test_missforest_zero_part2():
    """Zero-coded and NaN-coded missing values must impute identically.

    Two matrices are produced by ``gen_array`` with the same arguments
    except for the missing-value marker (0 versus NaN).  Two imputers that
    share a seed are then expected to return equal results, and the
    zero-marker imputer must report the NaN-aware column means of the
    NaN-coded matrix as its ``col_means`` statistic.
    """
    seed = 1337
    zero_coded = gen_array(min_val=1, missing_values=0)
    nan_coded = gen_array(min_val=1, missing_values=np.nan)
    expected_means = np.nanmean(nan_coded, axis=0)

    forest_zero = MissForest(missing_values=0, random_state=seed)
    forest_nan = MissForest(missing_values=np.nan, random_state=seed)

    imputed_from_zero = forest_zero.fit_transform(zero_coded)
    imputed_from_nan = forest_nan.fit_transform(nan_coded)

    assert_array_equal(imputed_from_zero, imputed_from_nan)
    assert_array_equal(forest_zero.statistics_.get("col_means"),
                       expected_means)
Exemplo n.º 2
0
def impute_times(final,
                 times_open,
                 times_closed,
                 columns,
                 imputation_method="mean"):
    """
    Impute open work items times with different methods.

    A value of 0 in ``times_open`` is treated as missing.  ``times_open`` is
    updated in place (its columns are reassigned) and is also returned.

    :param final: Complete preprocessed dataframe; its columns are the input
        to the 'KNN' and 'forest' imputers (unused for 'mean')
    :param times_open: Dataframe of work items that are not closed
    :param times_closed: Dataframe of work items that are closed; source of
        the column means for the 'mean' method
    :param columns: Columns to impute
    :param imputation_method: Choose between 'mean', 'KNN', 'forest'; any
        other value leaves ``times_open`` untouched
    :return: Dataframe of open work items with imputed values
    """
    if imputation_method == "mean":
        for col in columns:
            closed_mean = times_closed[col].mean()
            is_missing = times_open[col] == 0
            # Reassign the column rather than calling
            # ``times_open[col].mask(..., inplace=True)``: the in-place call
            # works on a temporary Series and is not guaranteed to write
            # through to the DataFrame.
            times_open[col] = times_open[col].mask(is_missing, closed_mean)
    elif imputation_method in ("KNN", "forest"):
        if imputation_method == "KNN":
            imputer = KNNImputer(missing_values=0, col_max_missing=0.9)
        else:
            imputer = MissForest(missing_values=0)
        for col in columns:
            try:
                imputed = _impute_single_column(imputer, final, col)
            except ValueError:
                # Fall back to a more permissive KNN imputer and retry this
                # column; previously the column was silently skipped.  The
                # fallback stays active for the remaining columns.
                imputer = KNNImputer(missing_values=0, col_max_missing=0.99)
                try:
                    imputed = _impute_single_column(imputer, final, col)
                except ValueError:
                    # Best effort: leave the column as it is.
                    continue
            is_missing = times_open[col] == 0
            # ``imputed`` is indexed like ``final``; Series.mask aligns it
            # on the index of ``times_open``.
            times_open[col] = times_open[col].mask(is_missing, imputed)
    return times_open


def _impute_single_column(imputer, final, col):
    """Fit ``imputer`` on ``final[col]`` alone and return the result as a Series."""
    values = imputer.fit_transform(pd.DataFrame(final[col]))[:, 0]
    return pd.Series(values, index=final.index, name=col)
Exemplo n.º 3
0
    def imputer(self, _steps, _answers, train_dataset, _X_train, _y_train,
                test_dataset, _X_test, _y_test, _headers):
        """Impute the train and test features with the imputer chosen in the answers.

        The arguments are stored on the instance, the imputer named by the
        answer of the 'imputer' step is fitted on the training features and
        applied to both feature sets, and two DataFrames are rebuilt with the
        target appended as the last column.

        Args:
            _steps: sequence of step names; entries equal to 'imputer' are
                inspected.
            _answers: sequence parallel to ``_steps``; ``_answers[i][step]``
                is the selected option ('Miss Forest' or 'KNN Miss Values').
            train_dataset: unused.
            _X_train, _y_train: training features and target.
            test_dataset: unused.
            _X_test, _y_test: test features and target.
            _headers: column names; all but the last label the features and
                the last one labels the target.

        Returns:
            Tuple ``(new_train_dataset, new_test_dataset)`` of DataFrames.

        Raises:
            ValueError: if no 'imputer' step selects a supported imputer
                (previously this surfaced as an UnboundLocalError).
        """
        self.steps = _steps
        self.answers = _answers
        self.X_train = _X_train
        self.y_train = _y_train
        self.X_test = _X_test
        self.y_test = _y_test
        self.headers = _headers

        self.train_pipe_steps = []

        # When several 'imputer' steps are present the last supported choice
        # wins, as before.
        model = None
        for i, s in enumerate(self.steps):
            if s != 'imputer':
                continue
            choice = self.answers[i][s]
            if choice == 'Miss Forest':
                model = MissForest()
            elif choice == 'KNN Miss Values':
                model = KNNImputer(n_neighbors=2)

        if model is None:
            raise ValueError(
                "No supported imputer was selected: expected an 'imputer' "
                "step answered with 'Miss Forest' or 'KNN Miss Values'.")

        model.fit(self.X_train, self.y_train)
        self.X_train = model.transform(self.X_train)
        self.X_test = model.transform(self.X_test)

        feature_names = self.headers[:-1]
        target_name = self.headers[-1]

        self.new_train_dataset = pd.DataFrame(self.X_train,
                                              columns=feature_names)
        self.new_train_dataset[target_name] = self.y_train

        self.new_test_dataset = pd.DataFrame(self.X_test,
                                             columns=feature_names)
        self.new_test_dataset[target_name] = self.y_test

        return self.new_train_dataset, self.new_test_dataset
Exemplo n.º 4
0
def deploy(file_name):
    """Impute missing ``wdir``/``gph`` readings of a CSV file and export them.

    Reads ``<file_name>.csv``, keeps the last 30000 rows, treats -9999 as
    missing, runs MissForest on the former row index together with the
    ``wdir`` and ``gph`` columns, and writes the imputed table to
    ``1filmiss.xls`` in the current working directory.

    Args:
        file_name: path of the CSV file without its ``.csv`` extension.

    Returns:
        None. The result is only written to disk and printed.
    """
    csv_path = file_name + '.csv'
    df = pd.read_csv(csv_path)
    df = df.tail(30000)
    df = df.replace(to_replace=-9999, value=np.nan)

    df_new = pd.DataFrame()
    df_new['wdir_new'] = df['wdir']
    df_new['gph'] = df['gph']
    # reset_index turns the original row index into the first column, so the
    # imputer receives three columns: index, wdir_new, gph.
    df_new.reset_index(inplace=True)
    print(df_new.head())

    imputer = MissForest()
    df_new = pd.DataFrame(imputer.fit_transform(df_new))
    # rename() returns a new frame; the result used to be discarded, leaving
    # the columns labelled 0, 1, 2.
    df_new = df_new.rename(columns={0: 'a', 1: 'b', 2: 'c'})
    print(df_new.columns)
    print(df_new.head())

    # The former ``df = df.join(df_new)`` was dropped: its result was never
    # used and the two frames do not share an index.

    # NOTE(review): writing ``.xls`` depends on an Excel writer engine that
    # recent pandas releases may no longer provide — confirm, or switch the
    # target to ``.xlsx``.
    df_new.to_excel("1filmiss.xls")
Exemplo n.º 5
0
def impute_values(df: pd.DataFrame, method: str = 'mean', **kwargs):
    """
    Fill the missing entries (np.nan or None) of a DataFrame.
    ------------------------
    Args:
        * df: pd.DataFrame of (samples x features)
        * method: name of the imputation strategy
            ** 'mean': replace by the column mean (``kwargs`` are ignored)
            ** 'knn': K-NN imputation (see missingpy.KNNImputer)
            ** 'rf': random forest imputation (see missingpy.MissForest)
        * kwargs: forwarded to the KNNImputer / MissForest constructor

    Returns:
        * pd.DataFrame: imputed values (samples x features), carrying the
          index and columns of ``df``
    """
    assert method in ('mean','knn','rf'), '{} not yet implemented.'.format(method)

    if method == 'mean':
        column_means = df.mean(0)
        return df.fillna(column_means)

    raw_values = df.values
    if method == 'knn':
        estimator = KNNImputer(**kwargs)
    elif method == 'rf':
        estimator = MissForest(**kwargs)
    else:
        # Only reachable when assertions are disabled (``python -O``).
        return None

    filled = estimator.fit_transform(raw_values)
    return pd.DataFrame(filled, index=df.index, columns=df.columns)
Exemplo n.º 6
0
def Impute_Data_RF(X_train, y_train, X_test, y_test, vals_mask, cols):
    """Impute train and test data with MissForest and post-process the columns.

    Features and target are stacked side by side, the imputer is fitted on
    the training stack and applied to the test stack.  The target is then
    binarised (1 where the imputed value is >= 5, stored as int16), and each
    feature column is rounded: columns flagged ``'cat'`` in ``var`` are
    rounded to integers and clipped to ``[min_vals[j], max_vals[j]]``, all
    others are rounded to one decimal.

    NOTE(review): ``data``, ``var``, ``min_vals`` and ``max_vals`` are not
    parameters; they are resolved from the enclosing module at call time.
    ``vals_mask`` and ``cols`` are accepted but never used.

    Returns:
        Tuple ``(X_train_imp, y_train_imp, X_test_imp, y_test_imp)``.
    """
    train_stack = np.concatenate((X_train, y_train.reshape(-1, 1)), axis=1)
    test_stack = np.concatenate((X_test, y_test.reshape(-1, 1)), axis=1)

    forest = MissForest(random_state=1, n_jobs=-1)
    train_filled = forest.fit_transform(train_stack)
    test_filled = forest.transform(test_stack)

    # Number of feature columns, taken from the module-level ``data``.
    n_features = data.shape[1]

    X_train_imp = train_filled[:, :n_features]
    y_train_imp = (train_filled[:, n_features] >= 5).astype("int16")
    X_test_imp = test_filled[:, :n_features]
    y_test_imp = (test_filled[:, n_features] >= 5).astype("int16")

    for j in range(X_train_imp.shape[1]):
        is_categorical = var.iloc[j]['type'] == 'cat'
        for features in (X_train_imp, X_test_imp):
            if is_categorical:
                features[:, j] = np.clip(np.round(features[:, j]),
                                         min_vals[j], max_vals[j])
            else:
                features[:, j] = np.round(features[:, j], decimals=1)

    return (X_train_imp, y_train_imp, X_test_imp, y_test_imp)
Exemplo n.º 7
0
def reconstruct(dataset, mask):
    """Reconstruct the masked-out entries of ``dataset`` with MissForest.

    Every entry whose mask value is 0 is replaced by NaN, the remaining
    entries are multiplied by their mask value, and the resulting matrix is
    imputed with ``MissForest(max_iter=5, verbose=0)``.

    Args:
        dataset: DataFrame of numeric values whose row labels are
            ``0 .. len(dataset) - 1``.
        mask: DataFrame with the same row labels and a shape compatible with
            ``dataset``; 0 marks an entry as missing.  It is no longer
            modified by this function.

    Returns:
        DataFrame holding the imputed matrix (default integer labels).
    """
    print('Reconstructing using MissForest...')

    n_rows, _ = np.shape(dataset)
    # Rows are addressed by label, as in the former per-row ``.loc[i, :]``.
    row_labels = list(range(n_rows))

    values = dataset.loc[row_labels, :].to_numpy(dtype=float)
    # Work on a private float copy: the old code wrote NaN into the mask's
    # own row values, which could alter the caller's mask, failed for
    # integer masks and had no effect on boolean masks.
    keep = mask.loc[row_labels, :].to_numpy(dtype=float, copy=True)
    # MissForest identifies missing entries by NaN.
    keep[keep == 0] = np.nan

    incomplete_dataset = pd.DataFrame(values * keep)

    imputer = MissForest(max_iter=5, verbose=0)
    reconstructed_dataset = imputer.fit_transform(incomplete_dataset)

    print(np.shape(reconstructed_dataset))
    print(reconstructed_dataset)

    return pd.DataFrame(reconstructed_dataset)
def test_statstics_fit_transform():
    """``statistics_`` must describe the matrix passed to ``fit``.

    The imputer is fitted on one matrix and then transforms a different
    one; afterwards its ``col_means`` statistic has to equal the NaN-aware
    column means of the fitting matrix.
    """
    fit_matrix = np.array([
        [1, 0, 0, 1],
        [2, 1, 2, 2],
        [3, 2, 3, 2],
        [np.nan, 4, 5, 5],
        [6, 7, 6, 7],
        [8, 8, 8, 8],
        [16, 15, 18, 19],
    ])
    expected_means = np.nanmean(fit_matrix, axis=0)

    transform_matrix = np.array([
        [0, 0, 0, 0],
        [2, 2, 2, 1],
        [3, 2, 3, 2],
        [np.nan, 4, 5, 5],
        [6, 7, 6, 7],
        [9, 9, 8, 8],
        [16, 15, 18, 19],
    ])

    forest = MissForest()
    fitted = forest.fit(fit_matrix)
    fitted.transform(transform_matrix)

    observed_means = forest.statistics_.get('col_means')
    assert_array_equal(observed_means, expected_means)
def test_default_with_invalid_input():
    """Default-configured imputers must reject invalid input with ValueError.

    Three cases are covered: a column consisting only of NaN passed to
    ``fit``, an infinite value passed to ``fit``, and an infinite value
    passed to ``transform`` of an imputer fitted on finite data.
    """
    all_missing_msg = "One or more columns have all rows missing."
    inf_msg = "+/- inf values are not supported."

    def matrix_with_inf():
        # A fresh array for each use, as in the original test.
        return np.array([
            [np.inf, 1, 1, 2, np.nan],
            [2, 1, 2, 2, 3],
            [3, 2, 3, 3, 8],
            [np.nan, 6, 0, 5, 13],
            [np.nan, 7, 0, 7, 8],
            [6, 6, 2, 5, 7],
        ])

    # Case 1: every row of the first column is missing.
    all_missing_column = np.array([
        [np.nan, 0, 0, 1],
        [np.nan, 1, 2, np.nan],
        [np.nan, 2, 3, np.nan],
        [np.nan, 4, 5, 5],
    ])
    seeded_forest = MissForest(random_state=1337)
    assert_raise_message(ValueError, all_missing_msg, seeded_forest.fit,
                         all_missing_column)

    # Case 2: inf present in the matrix passed to fit().
    assert_raise_message(ValueError, inf_msg, MissForest().fit,
                         matrix_with_inf())

    # Case 3: inf present in the matrix passed to transform().
    transform_input = matrix_with_inf()
    finite_fit_input = np.array([
        [0, 1, 1, 2, np.nan],
        [2, 1, 2, 2, 3],
        [3, 2, 3, 3, 8],
        [np.nan, 6, 0, 5, 13],
        [np.nan, 7, 0, 7, 8],
        [6, 6, 2, 5, 7],
    ])
    fitted_forest = MissForest().fit(finite_fit_input)
    assert_raise_message(ValueError, inf_msg, fitted_forest.transform,
                         transform_input)
def test_missforest_imputation_shape():
    """The imputed matrix must have the shape requested from ``gen_array``."""
    expected_shape = (10, 2)
    generated = gen_array(*expected_shape)
    forest = MissForest()
    imputed = forest.fit_transform(generated)
    assert_equal(imputed.shape, expected_shape)
Exemplo n.º 11
0
def rf_imputing(data):
    """Impute every column of ``data`` except ``VALUE_PER_UNIT`` with MissForest.

    Returns whatever ``MissForest.fit_transform`` produces for the remaining
    columns; the ``VALUE_PER_UNIT`` column is not added back.
    """
    forest = MissForest(verbose=True)
    predictors = data.drop('VALUE_PER_UNIT', axis=1)
    return forest.fit_transform(predictors)
def test_missforest_categorical_multiple():
    """Two missing categorical values, imputed over several iterations.

    MissForest output is compared with a manual re-implementation: missing
    cells start at the column mode and are then re-predicted, column by
    column, with a ``RandomForestClassifier`` for as many iterations as the
    imputer reports in ``iter_count_``.
    """
    df = np.array([
        [0, 0, np.nan, 1],
        [0, 1, 1, 2],
        [0, 2, 1, 2],
        [np.nan, 4, 1, 5],
        [1, 7, 0, 7],
        [1, 8, 0, 8],
        [1, 15, 0, 19],
        [1, 18, 0, 17],
    ])
    cat_vars = [0, 2]
    # ``mode(...).mode`` is 2-D on some SciPy releases and 1-D on others
    # (presumably depending on the ``keepdims`` default); atleast_2d makes
    # ``[0]`` yield the per-column modes in both cases.
    statistics_mode = np.atleast_2d(
        mode(df, axis=0, nan_policy='omit').mode)[0]
    n_rows, n_cols = df.shape

    # Fit missforest and transform
    imputer = MissForest(random_state=1337)
    df_imp1 = imputer.fit_transform(df, cat_vars=cat_vars)

    # Get iterations used by missforest above
    max_iter = imputer.iter_count_

    # Get NaN mask
    nan_mask = np.isnan(df)
    nan_rows, nan_cols = np.where(nan_mask)

    # Make initial guess for missing values
    df_imp2 = df.copy()
    df_imp2[nan_rows, nan_cols] = np.take(statistics_mode, nan_cols)

    # Loop for max_iter count over the columns with NaNs
    for _ in range(max_iter):
        for c in nan_cols:
            # Identify all other columns (i.e. predictors)
            not_c = np.setdiff1d(np.arange(n_cols), c)
            # Identify rows with NaN and those without in 'c'
            y = df_imp2[:, c]
            X = df_imp2[:, not_c]
            good_rows = np.where(~nan_mask[:, c])[0]
            bad_rows = np.where(nan_mask[:, c])[0]

            # Fit model and predict
            rf = RandomForestClassifier(n_estimators=100, random_state=1337)
            rf.fit(X=X[good_rows], y=y[good_rows])
            pred_val = rf.predict(X[bad_rows])

            # Fill in values
            df_imp2[bad_rows, c] = pred_val

    assert_array_equal(df_imp1, df_imp2)
    assert_array_equal(
        imputer.statistics_.get('col_modes')[0], statistics_mode[cat_vars])
def main(p_miss=0.5, dataset="drive", mode="mcar", para=0.5, train=None, rand_seed=42):
    """Run MissForest on a dataset loaded with missing entries and report its MSE.

    NumPy's global RNG is seeded with ``rand_seed``, the data is obtained
    from ``load_data`` (all arguments are forwarded to it), imputed with
    ``MissForest(decreasing=True, random_state=rand_seed, verbose=True)``
    and scored with ``mse_own`` against the loaded reference data and mask.

    Returns:
        Tuple ``(x_filled, mse)``: the imputed matrix and its error.
    """
    np.random.seed(rand_seed)

    loaded = load_data(p_miss, dataset=dataset, mode=mode, para=para,
                       train=train, rand_seed=rand_seed)
    # Only the incomplete matrix, the mask and the reference data are used.
    _n, _p, xmiss, _xhat_0, mask, data_x, _data_y = loaded

    forest = MissForest(decreasing=True, random_state=rand_seed, verbose=True)
    x_filled = forest.fit_transform(xmiss)

    mse = mse_own(x_filled, data_x, mask)
    print("MSE for MissForest: ", mse)

    return x_filled, mse
Exemplo n.º 14
0
 def Missforest_Imputation(self, train_index, test_index, final):
     """Impute ``self.full_miss_data`` with MissForest.

     With ``final`` truthy the imputer is fitted on, and applied to, the
     whole table.  Otherwise it is fitted on the ``train_index`` rows and
     applied to the ``test_index`` rows.  Columns listed in
     ``self.miss_info["obj_col"]`` are passed to MissForest as categorical
     variables, identified by their position in
     ``self.miss_info["original_column"]``.

     Side effects: ``self.numMI`` is set in every case except the final run
     with categorical columns, which sets ``self.MI_pd`` instead.

     Returns:
         The imputed data: a DataFrame for the final run with categorical
         columns, otherwise the array produced by the imputer (or the
         ``.values`` of the rebuilt frame).
     """
     info = self.miss_info
     obj_col = deepcopy(info["obj_col"])
     column_names = info["original_column"]
     cat_positions = [
         position for position, name in enumerate(column_names)
         if name in obj_col
     ]
     table = self.full_miss_data

     if obj_col == []:
         # Purely numerical data: no categorical hints are needed.
         forest = MissForest(max_depth=5)
         if final:
             self.numMI = forest.fit_transform(X=table.values)
         else:
             fitted = forest.fit(X=table.iloc[train_index, :].values)
             self.numMI = fitted.transform(
                 X=table.iloc[test_index, :].values)
         return self.numMI

     forest = MissForest(verbose=0, n_jobs=-1, max_depth=5)
     if final:
         filled = forest.fit_transform(X=table.values,
                                       cat_vars=cat_positions)
         self.MI_pd = pd.DataFrame(filled, columns=column_names)
         return self.MI_pd

     fitted = forest.fit(X=table.iloc[train_index, :].values,
                         cat_vars=cat_positions)
     filled = fitted.transform(table.iloc[test_index, :].values)
     filled_frame = pd.DataFrame(filled, columns=column_names)
     self.numMI = filled_frame[self.notobj].values
     return filled_frame.values
Exemplo n.º 15
0
def mf_impute(inp, subject=None, cols=None, categorical_variables=None):
    """Impute missing values of selected columns with MissForest.

    Args:
        inp: DataFrame to impute; it is deep-copied and never modified.
        subject: optional iterable of subject identifiers.  When given, only
            the rows returned by ``get_subject`` for these subjects (looked
            up through the first column) are imputed.
        cols: columns to impute; defaults to every column but the first.
        categorical_variables: optional iterable of column names treated as
            categorical.  Those with ``object`` dtype are encoded with
            ``label_encode`` before imputation and decoded afterwards.

    Returns:
        A copy of ``inp`` in which the selected rows/columns are imputed.

    Raises:
        Exception: if fewer than two columns are selected.
    """
    data = copy.deepcopy(inp)
    # Prepare input
    # If cols is None, perform for all columns (except first column)
    if cols is None:
        cols = data.columns[1:]
    # If subject is None, perform for all subjects
    if subject is None:
        # Explicit copy: the label encoding below assigns into ``inp`` and
        # must not operate on a slice of ``data``.
        inp = data[cols].copy()
    else:
        # DataFrame.append no longer exists in pandas 2.x; collect the
        # per-subject frames and concatenate them once.
        frames = [
            get_subject(data, s, data.columns[0]).loc[:, cols]
            for s in subject
        ]
        inp = pandas.concat(frames) if frames else pandas.DataFrame()
    if len(inp.columns) < 2:
        raise Exception("Multiple variables must be given as input")

    # Encode string columns
    # Note: only categorical variables are encoded
    labels = {}
    if categorical_variables is not None:
        for col in categorical_variables:
            if inp[col].dtype == np.dtype(object):
                encoded, mapping, label = label_encode(inp[col])
                # Convert string column to encoded result
                inp[col] = encoded
                labels[col] = label

    # Prepare MissForest Imputer
    imputer = MissForest()
    cat_vars = None
    if categorical_variables is not None:
        column_order = list(inp.columns)
        cat_vars = [column_order.index(name) for name in categorical_variables]

    # Fit and Transform the input
    res = imputer.fit_transform(inp.values, cat_vars=cat_vars)
    res = pandas.DataFrame(res, index=inp.index, columns=inp.columns)

    # Convert encoded columns back to strings
    for col, encoder in labels.items():
        res[col] = encoder.inverse_transform(res[col].astype(int))

    data.loc[res.index, res.columns] = res
    return data
Exemplo n.º 16
0
def super_fillna(pre_tr_x, pre_te_x, target_col, how="mean"):
    """Fill missing values of train and test frames using train statistics.

    Both inputs are copied first, so the callers' frames stay untouched.

    * ``how="mean"`` / ``how="median"``: the statistic of
      ``pre_tr_x[target_col]`` is used as the fill value for *all* missing
      entries of both frames.
    * ``how="rf"``: a MissForest imputer is fitted on the training
      ``target_col`` selection and applied to the test selection.
    * any other value: the copies are returned unchanged.

    Returns:
        Tuple ``(tr_x, te_x)`` of filled copies.
    """
    train_filled = pre_tr_x.copy()
    test_filled = pre_te_x.copy()

    if how in ("mean", "median"):
        reference = train_filled[target_col]
        fill_value = reference.mean() if how == "mean" else reference.median()
        train_filled = train_filled.fillna(fill_value)
        test_filled = test_filled.fillna(fill_value)
    elif how == "rf":
        forest = MissForest()
        train_filled[target_col] = forest.fit_transform(
            train_filled[target_col])
        test_filled[target_col] = forest.transform(test_filled[target_col])

    return train_filled, test_filled
Exemplo n.º 17
0
 def define_imputer(self,impute_type):
     '''Create the imputer used for every iteration and store it on ``self.imputer``.

     Input:
         impute_type: string selecting the imputer:
             'simple'    -> SimpleImputer with the median strategy
             'iterative' -> IterativeImputer initialised with the median
             'forest'    -> MissForest seeded with ``self.random_state``
         Any other value leaves ``self.imputer`` untouched.
     Output:
         None; the imputer object is assigned to ``self.imputer``.
     '''
     if impute_type == 'forest':
         self.imputer = MissForest(random_state=self.random_state, n_jobs=-2)
         return

     if impute_type == 'simple':
         with_indicator = self.model_args['add_missing_indicator']
         self.imputer = SimpleImputer(missing_values=np.nan,
                                      strategy='median',
                                      add_indicator=with_indicator)
         return

     if impute_type == 'iterative':
         with_indicator = self.model_args['add_missing_indicator']
         self.imputer = IterativeImputer(missing_values=np.nan,
                                         initial_strategy='median',
                                         add_indicator=with_indicator)
Exemplo n.º 18
0
def missforest_imputer(pd_data, random_state=None):
    """
    Fill the missing values of a DataFrame with MissForest.

    Inputs:
        pd_data: (DataFrame) Data containing missing values.
        random_state: (int, optional) Seed handed to the MissForest
            constructor.

    Returns:
        pd_imputed: (DataFrame) Imputed data carrying the index and the
            columns of ``pd_data``.
    """
    forest = MissForest(random_state=random_state)
    filled_values = forest.fit_transform(pd_data)

    return pd.DataFrame(filled_values,
                        index=pd_data.index,
                        columns=pd_data.columns)
def test_missforest_zero():
    """Fitting with ``missing_values=0`` must reject two kinds of input.

    * a matrix from ``gen_array(min_val=0)``, for which a NaN/infinity
      ValueError is expected;
    * a matrix with a column made up solely of zeros, i.e. a column whose
      rows are all missing.
    """
    forest = MissForest(missing_values=0, random_state=0)

    # missing_values=0 while NaN is present in the input.
    with_nan = gen_array(min_val=0)
    nan_msg = "Input contains NaN, infinity or a value too large for %r." % with_nan.dtype
    assert_raise_message(ValueError, nan_msg, forest.fit, with_nan)

    # Third column contains nothing but the missing marker.
    zero_column = np.array([
        [1, 0, 0, 0, 5],
        [2, 1, 0, 2, 3],
        [3, 2, 0, 0, 0],
        [4, 6, 0, 5, 13],
    ])
    all_missing_msg = "One or more columns have all rows missing."
    assert_raise_message(ValueError, all_missing_msg, forest.fit, zero_column)
Exemplo n.º 20
0
    def fit(self, dataset):
        """Fit the configured imputation model on the static features.

        Nothing happens when ``dataset.static_feature`` is None.  Otherwise
        ``self.imputation_model_name`` selects the model ('mice',
        'missforest' or 'knn'), which is stored on ``self.imputation_model``
        and fitted.  For any other name no new model is created and
        whatever ``self.imputation_model`` already holds is fitted.

        Args:
          - dataset: incomplete dataset
        """
        static_data = dataset.static_feature
        if static_data is None:
            return

        model_name = self.imputation_model_name
        if model_name == 'mice':
            # MICE
            self.imputation_model = IterativeImputer()
        elif model_name == 'missforest':
            # MissForest
            self.imputation_model = MissForest()
        elif model_name == 'knn':
            # KNN
            self.imputation_model = KNNImputer()

        self.imputation_model.fit(static_data)
def test_missforest_numerical_single():
    """Single missing numerical value, imputed with a 10-tree forest.

    The expected matrix is built by training a ``RandomForestRegressor``
    with the same seed on the complete rows and predicting the one missing
    cell of the first column.
    """
    df = np.array([
        [1, 0, 0, 1],
        [2, 1, 2, 2],
        [3, 2, 3, 2],
        [np.nan, 4, 5, 5],
        [6, 7, 6, 7],
        [8, 8, 8, 8],
        [16, 15, 18, 19],
    ])
    statistics_mean = np.nanmean(df, axis=0)

    y = df[:, 0]
    X = df[:, 1:]
    good_rows = np.where(~np.isnan(y))[0]
    bad_rows = np.where(np.isnan(y))[0]

    rf = RandomForestRegressor(n_estimators=10, random_state=1337)
    rf.fit(X=X[good_rows], y=y[good_rows])
    pred_val = rf.predict(X[bad_rows])

    # ``pred_val`` is an array with one entry per missing row.  Writing it
    # into a copy avoids nesting an array inside a list literal, which
    # yields a ragged array that current NumPy refuses to build.
    df_imputed = df.copy()
    df_imputed[bad_rows, 0] = pred_val

    imputer = MissForest(n_estimators=10, random_state=1337)
    assert_array_equal(imputer.fit_transform(df), df_imputed)
    assert_array_equal(imputer.statistics_.get('col_means'), statistics_mean)
Exemplo n.º 22
0
    def fit(self, dataset):
        """Fit the configured imputation model on the static features.

        Nothing happens when ``dataset.static_feature`` is None.  Otherwise
        ``self.imputation_model_name`` selects the model: 'missforest' and
        'knn' create a new model on ``self.imputation_model``; 'mice' is
        rejected.  For any other name whatever ``self.imputation_model``
        already holds is fitted.

        Args:
            - dataset: incomplete dataset

        Raises:
            NotImplementedError: when the model name is "mice".
        """
        static_data = dataset.static_feature
        if static_data is None:
            return

        model_name = self.imputation_model_name
        if model_name == "mice":
            # MICE
            # TODO: Resolve the below:
            raise NotImplementedError(
                "IterativeImputer not implemented due to versioning issues with fancyimpute"
            )
        if model_name == "missforest":
            # MissForest
            self.imputation_model = MissForest()
        elif model_name == "knn":
            # KNN
            self.imputation_model = KNNImputer()

        self.imputation_model.fit(static_data)
def test_missforest_categorical_single():
    """Single missing categorical value, imputed with a 10-tree forest.

    The expected matrix is built by training a ``RandomForestClassifier``
    with the same seed on the complete rows and predicting the one missing
    cell of the first column.  ``cat_vars`` is passed both as a scalar and
    as a list; both forms must give the same result.
    """
    df = np.array([
        [0, 0, 0, 1],
        [0, 1, 2, 2],
        [0, 2, 3, 2],
        [np.nan, 4, 5, 5],
        [1, 7, 6, 7],
        [1, 8, 8, 8],
        [1, 15, 18, 19],
    ])

    y = df[:, 0]
    X = df[:, 1:]
    good_rows = np.where(~np.isnan(y))[0]
    bad_rows = np.where(np.isnan(y))[0]

    rf = RandomForestClassifier(n_estimators=10, random_state=1337)
    rf.fit(X=X[good_rows], y=y[good_rows])
    pred_val = rf.predict(X[bad_rows])

    # ``pred_val`` is an array with one entry per missing row.  Writing it
    # into a copy avoids nesting an array inside a list literal, which
    # yields a ragged array that current NumPy refuses to build.
    df_imputed = df.copy()
    df_imputed[bad_rows, 0] = pred_val

    imputer = MissForest(n_estimators=10, random_state=1337)
    assert_array_equal(imputer.fit_transform(df, cat_vars=0), df_imputed)
    assert_array_equal(imputer.fit_transform(df, cat_vars=[0]), df_imputed)
def test_missforest_mixed_multiple():
    """Mixed categorical/numerical data with two missing values.

    MissForest output is compared with a manual re-implementation: the
    missing categorical cell starts at the column mode, the missing
    numerical cell at the column mean, and both are re-predicted with a
    classifier or regressor respectively for as many iterations as the
    imputer reports in ``iter_count_``.
    """
    df = np.array([
        [np.nan, 0, 0, 1],
        [0, 1, 2, 2],
        [0, 2, 3, 2],
        [1, 4, 5, 5],
        [1, 7, 6, 7],
        [1, 8, 8, 8],
        [1, 15, 18, np.nan],
    ])

    n_rows, n_cols = df.shape
    cat_vars = [0]
    num_vars = np.setdiff1d(range(n_cols), cat_vars)
    # ``mode(...).mode`` is 2-D on some SciPy releases and 1-D on others
    # (presumably depending on the ``keepdims`` default); atleast_2d makes
    # ``[0]`` yield the per-column modes in both cases.
    statistics_mode = np.atleast_2d(
        mode(df, axis=0, nan_policy='omit').mode)[0]
    statistics_mean = np.nanmean(df, axis=0)

    # Fit missforest and transform
    imputer = MissForest(random_state=1337)
    df_imp1 = imputer.fit_transform(df, cat_vars=cat_vars)

    # Get iterations used by missforest above
    max_iter = imputer.iter_count_

    # Get NaN mask
    nan_mask = np.isnan(df)
    nan_rows, nan_cols = np.where(nan_mask)

    # Make initial guess for missing values
    df_imp2 = df.copy()
    df_imp2[0, 0] = statistics_mode[0]
    df_imp2[6, 3] = statistics_mean[3]

    # Loop for max_iter count over the columns with NaNs
    for _ in range(max_iter):
        for c in nan_cols:
            # Identify all other columns (i.e. predictors)
            not_c = np.setdiff1d(np.arange(n_cols), c)
            # Identify rows with NaN and those without in 'c'
            y = df_imp2[:, c]
            X = df_imp2[:, not_c]
            good_rows = np.where(~nan_mask[:, c])[0]
            bad_rows = np.where(nan_mask[:, c])[0]

            # Fit model and predict
            if c in cat_vars:
                rf = RandomForestClassifier(n_estimators=100,
                                            random_state=1337)
            else:
                rf = RandomForestRegressor(n_estimators=100, random_state=1337)
            rf.fit(X=X[good_rows], y=y[good_rows])
            pred_val = rf.predict(X[bad_rows])

            # Fill in values
            df_imp2[bad_rows, c] = pred_val

    assert_array_equal(df_imp1, df_imp2)
    assert_array_equal(imputer.statistics_.get('col_means'),
                       statistics_mean[num_vars])
    assert_array_equal(
        imputer.statistics_.get('col_modes')[0], statistics_mode[cat_vars])
Exemplo n.º 25
0
 def __init__(self):
     """Create the MissForest imputer (``verbose=0``) kept on ``self.imputer``."""
     forest = MissForest(verbose=0)
     self.imputer = forest
Exemplo n.º 26
0

# Compare four imputation methods (KNN, MissForest, IterativeImputer, PPCA)
# on one image with artificially removed pixels.
# NOTE(review): the *_RMSE names below hold the value returned by
# mean_squared_error called with its default arguments — presumably the MSE,
# not its square root; confirm before reporting these numbers as RMSE.
SelectedImage = showImagesRandomImages(
    3)  # select an image randomly from the MNIST dataset
missingPercentage = 0.2  # missing rate percentage
missingImage = generateMissingFig(
    SelectedImage,
    missingPercentage)  # insert missing values into the original image

# K-nearest-neighbours imputation
imputer = KNNImputer(n_neighbors=2, weights="uniform")
imputed_by_KNN = imputer.fit_transform(missingImage)
KNNImputed_RMSE = mean_squared_error(SelectedImage, imputed_by_KNN)
#plt.imshow(imputed_by_KNN, cmap='gray', vmin=0, vmax=1)
#plt.show()

# MissForest imputation
imputer = MissForest()
MissForest_imputed = imputer.fit_transform(missingImage)
MissForest_RMSE = mean_squared_error(SelectedImage, MissForest_imputed)
#plt.imshow(MissForest_imputed, cmap='gray', vmin=0, vmax=1)
#plt.show()

# IterativeImputer imputation (stored under the MICE_* names)
imputer = IterativeImputer()
MICE_imputed = imputer.fit_transform(missingImage)
MICE_RMSE = mean_squared_error(SelectedImage, MICE_imputed)
#plt.imshow(MICE_imputed, cmap='gray', vmin=0, vmax=1)
#plt.show()

# PPCA
# NOTE(review): unlike the imputers above, PPCA is fitted on the complete
# image and only applied to the incomplete one — confirm this is intended.
ppca = PPCA()
ppca.fit(data=SelectedImage, d=100, verbose=True)
PPCA_imputed = ppca.transform(missingImage)
PPCA_RMSE = mean_squared_error(SelectedImage, PPCA_imputed)
Exemplo n.º 27
0
def prepare_data(data,
                 data_idxs,
                 outcome,
                 convert_categorical=True,
                 keep_cols=None,
                 scaler=None,
                 imputer=None,
                 verbose=False,
                 seed=None):
    """Build the feature matrix and outcome frame for one outcome.

    Steps, in order: take all but the last six columns of ``data`` as
    features, drop the columns named in ``EXCLUDE_VARS``, optionally
    one-hot encode ``ethnicity`` and ``gender``, keep only rows whose event
    time is positive, select columns, impute with MissForest and
    standardise the ``NUMERICAL_VARS`` columns.

    Args:
        data: source DataFrame containing features and outcome columns.
        data_idxs: identifiers aligned with the rows of ``data``; filtered
            together with the rows.
        outcome: outcome name used to build the ``censor_or_<outcome>_days``
            and ``<outcome>_indicator`` column names.
        convert_categorical: one-hot encode ``ethnicity`` and ``gender``.
        keep_cols: columns to keep, in that order; missing ones are created
            and filled with 0.0.  When None, all-zero columns are dropped.
        scaler: fitted scaler to reuse; a StandardScaler is fitted if None.
        imputer: fitted imputer to reuse; a MissForest is fitted if None.
        verbose: print shapes, columns and summary statistics.
        seed: random state of a newly created MissForest.

    Returns:
        Tuple ``(X, y, scaler, imputer, data_idxs)``.

    NOTE(review): after imputation ``X`` is rebuilt without an explicit
    index, so it gets default integer labels while ``y`` keeps the filtered
    labels of ``data`` — confirm callers do not align the two by index.
    """
    X = data.iloc[:, 0:-6]  # TODO: get rid of magic number

    # remove excluded variables
    for excluded in EXCLUDE_VARS:
        if excluded not in X.columns:
            continue
        print('dropped {} column...'.format(excluded))
        X = X.drop([excluded], axis=1)

    # convert categorical variables
    if convert_categorical:
        for categorical in ('ethnicity', 'gender'):
            dummies = pd.get_dummies(X[categorical])
            X = pd.concat([X, dummies], axis=1)
        X = X.drop(['ethnicity', 'gender'], axis=1)
        X = X.drop(['Other', 'Female'], axis=1)  # to avoid colinearity

    ## Extract outcomes
    time_column = 'censor_or_{}_days'.format(outcome)
    event_column = '{}_indicator'.format(outcome)
    y = data[[time_column, event_column]]

    ## Filter for appropriate samples
    prev_ct = len(y)
    pos_events = y.iloc[:, 0] > 0  # event times > 0
    X = X.loc[pos_events]
    y = y.loc[pos_events]
    data_idxs = [
        idx for idx, keep in zip(data_idxs, pos_events.tolist()) if keep
    ]
    print('filtered out {} events with times < 0'.format(prev_ct - len(y)))

    if keep_cols is None:
        has_nonzero = (X != 0).any(axis=0)
        X = X.loc[:, has_nonzero]  # drop columns w/ all zero
    else:
        for vr in keep_cols:
            if vr not in X.columns:
                X[vr] = 0.0  # impute with zero by default
        X = X[keep_cols]

    # check for nulls and impute
    x_null = np.sum(pd.isnull(X))
    y_null = np.sum(pd.isnull(y))
    if (x_null.sum() > 0) or (y_null.sum() > 0):
        print('Will impute...')
        print('NULL (X, y):', x_null, y_null)

    fit_new_imputer = imputer is None
    if fit_new_imputer:
        print('Fitting MissForest...')
        imputer = MissForest(random_state=seed)
        X_data = imputer.fit_transform(X)
    else:
        X_data = imputer.transform(X)
    X = pd.DataFrame(data=X_data, columns=X.columns)
    if fit_new_imputer:
        print('Fitted.')

    # scale numerical values
    if scaler is None:
        scaler = StandardScaler()
        scaled = scaler.fit_transform(X[NUMERICAL_VARS])
    else:
        scaled = scaler.transform(X[NUMERICAL_VARS])
    X[NUMERICAL_VARS] = scaled

    if verbose:
        print('X.shape: {}, y.shape: {}'.format(X.shape, y.shape))
        print('Columns: {}'.format(X.columns))
        print('---------------- X ----------------\n{}'.format(X.describe()))
        print('---------------- y ----------------\n{}'.format(y.describe()))

    return X, y, scaler, imputer, data_idxs
Exemplo n.º 28
0
def panel_data(train, years_ahead=1):
    """
    Impute predictor series with MissForest, then fit a panel-data model.

    A random forest (MissForest) trained on the observed values of a data
    matrix (selected series codes, excluding the nine target series) predicts
    the missing values. After that, one PanelOLS model with entity effects is
    fitted per target on the years before 2007 and used to predict 2007.
    The RMSE against the observed 2007 targets is printed.

    Args:
      train: wide DataFrame; the code reads its first 38 columns, of which
        the first 36 are melted as yearly values alongside 'Country Name'
        and 'Series Code'.
      years_ahead: number of rows the predictor table is shifted by before
        it is merged with the targets.

    Returns:
      Ypred: DataFrame with one column ('y1' .. 'y9') of predictions per
        target.

    NOTE(review): the constants 9 (target series), 1297 and 214 below are
    hard-coded; presumably they are the number of targets, predictor series
    + 1 and countries in the competition data — confirm before reusing this
    on other data.
    """
    # Long format: one row per (country, series, year).
    train_melt = pd.melt(train.iloc[:, 0:38],
                         id_vars=['Country Name', 'Series Code'],
                         value_vars=train.columns[0:36],
                         var_name='year',
                         value_name='value')
    # The first four characters of each year column name are parsed as the year.
    train_melt['year'] = train_melt['year'].str[:4].astype(int)
    # Back to wide: rows indexed by (country, year), one column per series code.
    panel = train_melt.groupby(['Country Name', 'year',
                                'Series Code'])['value'].mean().unstack()

    # Keep a series (columns from position 9 on) only if no country has more
    # than 18 missing years for it.
    left_feature = panel.iloc[:, 9:].isna().groupby('Country Name').sum().max(
        axis=0) <= 18
    pred = panel.iloc[:, 9:].iloc[:, left_feature.values]

    # construct matrix of features across countries
    # (the per-country blocks are placed side by side, in sorted country order)
    df = []
    ct_list = list(set(pred.index.get_level_values(0)))
    ct_list = sorted(ct_list)
    for i in ct_list:
        df.append(pred.loc[i])
    predictors = pd.concat(df, axis=1)

    # random forest imputation
    imputer = MissForest()
    predictors_imputed = imputer.fit_transform(predictors)

    # Flatten the index and give targets / predictors generic names.
    panel.reset_index(inplace=True)
    panel.columns = ['Country Name', 'year'] + [
        'y' + str(i) for i in range(1, 10)
    ] + ['x' + str(i) for i in range(1, 1297)]
    # Split the imputed wide matrix back into equally wide per-country blocks
    # and stack them vertically.
    nfeature = int(predictors.shape[1] / 214)
    split = list(range(nfeature, predictors_imputed.shape[1], nfeature))
    _ = np.split(predictors_imputed, split, 1)
    predictors_new = pd.DataFrame(np.vstack(_))
    predictors_new['year'] = panel.year
    predictors_new['Country Name'] = panel['Country Name']
    predictors_new.columns = [
        'x' + str(i) for i in range(1, pred.shape[1] + 1)
    ] + ['year', 'Country Name']

    # combine the updated feature matrix and responses
    feature = predictors_new.isna().sum() <= 0  # change to 1
    panel_left = predictors_new.iloc[:, feature.values]
    # Shift the predictors by ``years_ahead`` rows before merging with the
    # identifier and target columns.
    panel_comb = pd.merge(panel.iloc[:, 0:11], panel_left.shift(years_ahead))

    # Split prediction and target
    panel_train = panel_comb.loc[panel_comb.year < 2007]
    panel_train = panel_train.set_index(['Country Name', 'year'])
    panel_test = panel_comb.loc[panel_comb.year == 2007]
    panel_test = panel_test.set_index(['Country Name', 'year'])

    # panel data model
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore")
        Ypred = pd.DataFrame()
        for i in range(1, 10):
            # One model per target: the columns from position 11 on are the
            # regressors, plus an intercept and entity effects.
            formula = 'y' + str(i) + '~1+' + '+'.join(
                panel_train.columns[11:].values) + '+EntityEffects'
            mod = PanelOLS.from_formula(formula, panel_train)
            res = mod.fit(cov_type='clustered', cluster_entity=True)
            Ypred['y' + str(i)] = res.predict(data=panel_test).predictions

    # Eval
    Yval = panel_test.iloc[:, :9]
    rmse = np.sqrt(np.nanmean(np.power(Ypred - Yval, 2)))
    print(rmse)

    return Ypred
Exemplo n.º 29
0
 def _random_forest(self, df):
     """Impute ``df`` with ``MissForest(random_state=10)``.

     Returns a new DataFrame with the column labels of ``df`` and a default
     integer index.
     """
     forest = MissForest(random_state=10)
     filled = forest.fit_transform(df)
     result = pd.DataFrame(filled)
     result.columns = df.columns
     return result
Exemplo n.º 30
0
# Build the test feature table with the same preparation helpers
# (title_extract, dummy_encode) that are applied here to the raw columns.
my_test_data1 = test.loc[:, ['Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Embarked']]
test_data = title_extract(my_test_data1)
test_data = dummy_encode(test_data, 2, 6, 0, 7)

# Feature scaling (Age)
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
CV_data[['Age']] = sc.fit_transform(CV_data[['Age']])
# Reuse the scaling fitted above instead of re-fitting on the test set, so
# that both sets are expressed on the same scale.
test_data[['Age']] = sc.transform(test_data[['Age']])


from missingpy import MissForest

# Make an instance and fit it on the training features only
imputer = MissForest(random_state=0)
my_imp = imputer.fit(train_data.drop(['Survived', 'Weight'], axis=1))

# Impute the cross-validation features and re-attach the Survived column.
CV_data_missforest = imputer.transform(CV_data.drop('Survived', axis=1))
CV_data_missforest = pd.DataFrame(CV_data_missforest, columns=CV_data.columns[1:])
CV_data_missforest = pd.concat([CV_data.Survived, CV_data_missforest], axis=1)

# Impute the test features.
test_data_missforest = imputer.transform(test_data)
test_data_missforest = pd.DataFrame(test_data_missforest, columns=test_data.columns)

## The individuals in the training set now have their new weights, and the missing values in the cross-validation and test sets
## have been imputed with MissForest. The three tables are exported so that the weighted logistic model can be fitted in R.

train_data.to_excel(r'train_data.xlsx', index = False)
CV_data_missforest.to_excel(r'CV_data.xlsx', index = False)
test_data_missforest.to_excel(r'test_data.xlsx', index = False)