def test_missforest_zero_part2():
    """Imputing with ``missing_values=0`` must match imputing with NaN markers."""
    # Two arrays from gen_array that differ only in the requested
    # missing-value marker (presumably otherwise identical -- see gen_array).
    zero_marked = gen_array(min_val=1, missing_values=0)
    nan_marked = gen_array(min_val=1, missing_values=np.nan)
    expected_means = np.nanmean(nan_marked, axis=0)

    zero_imputer = MissForest(missing_values=0, random_state=1337)
    nan_imputer = MissForest(missing_values=np.nan, random_state=1337)

    imputed_from_zero = zero_imputer.fit_transform(zero_marked)
    imputed_from_nan = nan_imputer.fit_transform(nan_marked)
    assert_array_equal(imputed_from_zero, imputed_from_nan)

    # The fitted column means must equal the NaN-aware means of the input.
    fitted_means = zero_imputer.statistics_.get("col_means")
    assert_array_equal(fitted_means, expected_means)
Пример #2
0
class MissForestImputer(object):
    """Impute mixed numeric/categorical DataFrames with ``MissForest``.

    Non-numeric columns are ordinal-encoded, handed to ``MissForest`` as
    categorical variables, and decoded back to their original labels after
    imputation.
    """

    def __init__(self):
        self.imputer = MissForest(verbose=0)

    def encode_cat(self, X_c):
        """Ordinal-encode the non-null entries of a single column.

        Args:
            X_c: Series holding one categorical column; nulls are left as-is.

        Returns:
            Tuple ``(encoded, encoder)``: an object-dtype copy of ``X_c``
            whose non-null entries are replaced by their ordinal codes, and
            the fitted ``OrdinalEncoder`` needed to decode them again.
        """
        # Object dtype lets numeric codes replace labels of any source dtype.
        data = X_c.astype(object)
        observed = data.notnull()
        encoder = OrdinalEncoder()
        codes = encoder.fit_transform(data[observed].values.reshape(-1, 1))
        # ravel() stays 1-D even when only a single value is observed.
        data.loc[observed] = codes.ravel()
        return data, encoder

    def decode_cat(self, X_c, encoder):
        """Map (possibly fractional) ordinal codes back to category labels.

        Args:
            X_c: Series of ordinal codes produced by the imputer.
            encoder: The ``OrdinalEncoder`` returned by ``encode_cat``.

        Returns:
            Object-dtype copy of ``X_c`` with non-null codes replaced by the
            labels they stand for.
        """
        data = X_c.astype(object)
        observed = data.notnull()
        codes = data[observed].values.astype(float).reshape(-1, 1)
        n_cat = len(encoder.categories_[0])
        # Round to the nearest code and clamp into the range the encoder knows.
        codes = np.round(codes).clip(0, n_cat - 1)
        labels = encoder.inverse_transform(codes)
        data.loc[observed] = labels.ravel()
        return data

    def fit_transform(self, X):
        """Impute missing values in ``X`` and return a completed DataFrame.

        Args:
            X: DataFrame with numeric and/or non-numeric columns.

        Returns:
            DataFrame with the same columns (same order) and the same row
            index as ``X``, with missing entries imputed.
        """
        num_X = X.select_dtypes(include='number')
        cat_X = X.select_dtypes(exclude='number')
        has_categorical = cat_X.shape[1] > 0

        # Encode the categorical columns to numeric columns.
        if has_categorical:
            cat_encoders = {}
            encoded_columns = []
            for c in cat_X.columns:
                X_c_enc, encoder = self.encode_cat(cat_X[c])
                encoded_columns.append(X_c_enc)
                cat_encoders[c] = encoder
            X_enc = pd.concat([num_X] + encoded_columns, axis=1)
            cat_columns = cat_X.columns
            cat_indices = [
                i for i, c in enumerate(X_enc.columns) if c in cat_columns
            ]
        else:
            X_enc = X
            cat_indices = None

        X_imp = self.imputer.fit_transform(X_enc.values.astype(float),
                                           cat_vars=cat_indices)
        # Keep the caller's row index so the result aligns with the input.
        X_imp = pd.DataFrame(X_imp, index=X.index, columns=X_enc.columns)

        if has_categorical:
            decoded_columns = [
                self.decode_cat(X_imp[c], cat_encoders[c])
                for c in cat_X.columns
            ]
            X_imp = pd.concat([X_imp[num_X.columns]] + decoded_columns,
                              axis=1)

        # Restore the original column order.
        return X_imp[X.columns]
Пример #3
0
def reconstruct(dataset, mask):
    """Reconstruct masked-out entries of ``dataset`` with MissForest.

    Args:
        dataset: 2-D numeric table (DataFrame or array-like).
        mask: Table of the same shape; entries equal to 0 mark values that
            are treated as missing. Other entries multiply the data value
            (presumably they are 1 -- confirm against the caller).

    Returns:
        DataFrame holding the imputed data.
    """
    print('Reconstructing using MissForest...')

    # Float copies: the caller's dataset and mask are never modified, and
    # integer or boolean masks can hold NaN after conversion.
    values = np.array(dataset, dtype=float)
    keep = np.array(mask, dtype=float)

    # MissForest identifies missing entries as NaN, so every zero in the
    # mask becomes NaN and propagates through the element-wise product.
    keep[keep == 0] = np.nan
    incomplete_dataset = pd.DataFrame(values * keep)

    imputer = MissForest(max_iter=5, verbose=0)
    reconstructed_dataset = imputer.fit_transform(incomplete_dataset)

    print(np.shape(reconstructed_dataset))
    print(reconstructed_dataset)

    return pd.DataFrame(reconstructed_dataset)
Пример #4
0
def Impute_Data_RF(X_train, y_train, X_test, y_test, vals_mask, cols):
    """Impute train/test features and targets jointly with MissForest.

    The target is appended as a last column so it takes part in the
    imputation; the imputer is fitted on the training matrix only and then
    applied to the test matrix. ``vals_mask`` and ``cols`` are not used.

    Relies on module-level ``data``, ``var``, ``min_vals`` and ``max_vals``.

    Returns:
        Tuple ``(X_train_imp, y_train_imp, X_test_imp, y_test_imp)`` where the
        targets are binarised as ``value >= 5`` (int16).
    """
    train_matrix = np.concatenate((X_train, y_train.reshape(-1, 1)), axis=1)
    test_matrix = np.concatenate((X_test, y_test.reshape(-1, 1)), axis=1)

    forest = MissForest(random_state=1, n_jobs=-1)
    train_filled = forest.fit_transform(train_matrix)
    test_filled = forest.transform(test_matrix)

    # The feature/target split position comes from the global `data`.
    n_features = data.shape[1]
    X_train_imp = train_filled[:, 0:n_features]
    y_train_imp = np.array(train_filled[:, n_features] >= 5, dtype="int16")
    X_test_imp = test_filled[:, 0:n_features]
    y_test_imp = np.array(test_filled[:, n_features] >= 5, dtype="int16")

    for j in range(X_train_imp.shape[1]):
        is_categorical = var.iloc[j]['type'] == 'cat'
        for block in (X_train_imp, X_test_imp):
            if is_categorical:
                # Categorical: snap to an integer code inside the known range.
                block[:, j] = np.clip(np.round(block[:, j]), min_vals[j],
                                      max_vals[j])
            else:
                # Numeric: keep one decimal place.
                block[:, j] = np.round(block[:, j], decimals=1)

    return (X_train_imp, y_train_imp, X_test_imp, y_test_imp)
Пример #5
0
def deploy(file_name):
    """Impute the ``wdir`` and ``gph`` columns of a CSV and export them.

    Reads ``<file_name>.csv``, keeps the last 30000 rows, treats -9999 as
    missing, imputes with MissForest and writes the result to
    ``1filmiss.xls``.

    Args:
        file_name: Path of the CSV file without the ``.csv`` extension.
    """
    file_name = file_name + '.csv'
    df = pd.read_csv(file_name)
    df = df.tail(30000)
    df = df.replace(to_replace=-9999, value=np.nan)

    df_new = pd.DataFrame()
    df_new['wdir_new'] = df['wdir']
    df_new['gph'] = df['gph']
    # reset_index adds the old row index as a first column, so the imputed
    # matrix has three columns: index, wdir_new, gph.
    df_new.reset_index(inplace=True)
    print(df_new.head())

    imputer = MissForest()
    df_new = imputer.fit_transform(df_new)
    df_new = pd.DataFrame(df_new)
    # rename() returns a new frame; the result must be kept for the new
    # column names to take effect.
    df_new = df_new.rename(columns={0: 'a', 1: 'b', 2: 'c'})
    print(df_new.columns)
    print(df_new.head())

    # NOTE(review): writing the legacy .xls format depends on the installed
    # pandas/Excel engine -- confirm it is still supported in deployment.
    df_new.to_excel("1filmiss.xls")
Пример #6
0
def rf_imputing(data):
    """Impute every column of ``data`` except ``VALUE_PER_UNIT``.

    Returns whatever ``MissForest.fit_transform`` produces for the remaining
    columns; ``VALUE_PER_UNIT`` is not re-attached.
    """
    forest = MissForest(verbose=True)
    features = data.drop('VALUE_PER_UNIT', axis=1)
    return forest.fit_transform(features)
def test_missforest_imputation_shape():
    """The imputed matrix must keep the shape of the input matrix."""
    expected_shape = (10, 2)
    X = gen_array(*expected_shape)
    X_imputed = MissForest().fit_transform(X)
    assert_equal(X_imputed.shape, expected_shape)
def test_missforest_categorical_multiple():
    """Two missing categorical values, imputed over multiple iterations.

    Re-implements the MissForest loop by hand with ``RandomForestClassifier``
    and checks that it matches the imputer's output and fitted modes.
    """
    df = np.array([
        [0, 0, np.nan, 1],
        [0, 1, 1, 2],
        [0, 2, 1, 2],
        [np.nan, 4, 1, 5],
        [1, 7, 0, 7],
        [1, 8, 0, 8],
        [1, 15, 0, 19],
        [1, 18, 0, 17],
    ])
    cat_vars = [0, 2]
    # ravel() gives one mode per column whether SciPy returns the modes with
    # shape (1, n_cols) or (n_cols,), which differs between versions.
    statistics_mode = np.ravel(mode(df, axis=0, nan_policy='omit').mode)
    n_rows, n_cols = df.shape

    # Fit missforest and transform
    imputer = MissForest(random_state=1337)
    df_imp1 = imputer.fit_transform(df, cat_vars=cat_vars)

    # Get iterations used by missforest above
    max_iter = imputer.iter_count_

    # Get NaN mask
    nan_mask = np.isnan(df)
    nan_rows, nan_cols = np.where(nan_mask)

    # Make initial guess for missing values
    df_imp2 = df.copy()
    df_imp2[nan_rows, nan_cols] = np.take(statistics_mode, nan_cols)

    # Loop for max_iter count over the columns with NaNs
    for _ in range(max_iter):
        for c in nan_cols:
            # Identify all other columns (i.e. predictors)
            not_c = np.setdiff1d(np.arange(n_cols), c)
            # Identify rows with NaN and those without in 'c'
            y = df_imp2[:, c]
            X = df_imp2[:, not_c]
            good_rows = np.where(~nan_mask[:, c])[0]
            bad_rows = np.where(nan_mask[:, c])[0]

            # Fit model and predict
            rf = RandomForestClassifier(n_estimators=100, random_state=1337)
            rf.fit(X=X[good_rows], y=y[good_rows])
            pred_val = rf.predict(X[bad_rows])

            # Fill in values
            df_imp2[bad_rows, c] = pred_val

    assert_array_equal(df_imp1, df_imp2)
    assert_array_equal(
        imputer.statistics_.get('col_modes')[0], statistics_mode[cat_vars])
def main(p_miss=0.5, dataset="drive", mode="mcar", para=0.5, train=None, rand_seed=42):
    """Impute a loaded dataset with MissForest and report its MSE.

    Seeds NumPy's global RNG, loads the data through ``load_data``, imputes
    the incomplete matrix and scores it with ``mse_own``.

    Returns:
        Tuple ``(x_filled, mse)``.
    """
    np.random.seed(rand_seed)

    loaded = load_data(p_miss, dataset=dataset, mode=mode, para=para, train=train, rand_seed=rand_seed)
    # Only the incomplete matrix, the mask and the reference data are used.
    _n, _p, xmiss, _xhat_0, mask, data_x, _data_y = loaded

    forest = MissForest(decreasing=True, random_state=rand_seed, verbose=True)
    x_filled = forest.fit_transform(xmiss)

    mse = mse_own(x_filled, data_x, mask)
    print("MSE for MissForest: ", mse)

    return x_filled, mse
Пример #10
0
def mf_impute(inp, subject=None, cols=None, categorical_variables=None):
    """Impute missing values of selected subjects/columns with MissForest.

    Args:
        inp: Input DataFrame; it is deep-copied and never modified.
        subject: Iterable of subject identifiers to impute, looked up via
            ``get_subject`` on the first column. ``None`` means all rows.
        cols: Columns to impute. ``None`` means every column but the first.
        categorical_variables: Column names to treat as categorical;
            object-dtype ones are label-encoded before imputation and decoded
            afterwards.

    Returns:
        Copy of the input with the selected rows/columns imputed.

    Raises:
        Exception: If fewer than two columns are selected.
    """
    data = copy.deepcopy(inp)

    # If cols is None, perform for all columns (except the first column).
    if cols is None:
        cols = data.columns[1:]

    if subject is None:
        # Copy so the encoded columns below never write through to `data`.
        inp = data[cols].copy()
    else:
        # Stack the rows of all selected subjects. DataFrame.append is gone
        # in pandas 2.0, so build the frame with a single concat.
        frames = [
            get_subject(data, s, data.columns[0]).loc[:, cols]
            for s in subject
        ]
        inp = pandas.concat(frames) if frames else pandas.DataFrame()
    if len(inp.columns) < 2:
        raise Exception("Multiple variables must be given as input")

    # Encode string columns. Note: only categorical variables are encoded.
    labels = {}
    if categorical_variables is not None:
        for col in categorical_variables:
            if inp[col].dtype == np.dtype(object):
                encoded, _mapping, label = label_encode(inp[col])
                # Convert string column to encoded result
                inp[col] = encoded
                labels[col] = label

    # MissForest expects positional indices for the categorical columns.
    imputer = MissForest()
    cat_vars = None
    if categorical_variables is not None:
        column_names = list(inp.columns)
        cat_vars = [column_names.index(name)
                    for name in categorical_variables]

    # Fit and transform the input
    res = imputer.fit_transform(inp.values, cat_vars=cat_vars)
    res = pandas.DataFrame(res, index=inp.index, columns=inp.columns)

    # Convert encoded columns back to strings
    for col, label in labels.items():
        res[col] = label.inverse_transform(res[col].astype(int))

    data.loc[res.index, res.columns] = res
    return data
def test_missforest_categorical_single():
    """A single missing categorical value, default parameters otherwise.

    The expected value is the prediction of a ``RandomForestClassifier``
    trained on the complete rows with the same seed and tree count.
    """
    df = np.array([
        [0, 0, 0, 1],
        [0, 1, 2, 2],
        [0, 2, 3, 2],
        [np.nan, 4, 5, 5],
        [1, 7, 6, 7],
        [1, 8, 8, 8],
        [1, 15, 18, 19],
    ])

    y = df[:, 0]
    X = df[:, 1:]
    good_rows = np.where(~np.isnan(y))[0]
    bad_rows = np.where(np.isnan(y))[0]

    rf = RandomForestClassifier(n_estimators=10, random_state=1337)
    rf.fit(X=X[good_rows], y=y[good_rows])
    pred_val = rf.predict(X[bad_rows])

    # Build the expected matrix by assignment: putting the length-1
    # prediction array inside a nested list literal creates a ragged array.
    df_imputed = df.copy()
    df_imputed[bad_rows, 0] = pred_val

    imputer = MissForest(n_estimators=10, random_state=1337)
    # cat_vars may be given as a scalar index or as a list of indices.
    assert_array_equal(imputer.fit_transform(df, cat_vars=0), df_imputed)
    assert_array_equal(imputer.fit_transform(df, cat_vars=[0]), df_imputed)
Пример #12
0
def super_fillna(pre_tr_x, pre_te_x, target_col, how="mean"):
    """Fill missing values in train/test frames using train statistics.

    Args:
        pre_tr_x: Training DataFrame (left untouched; a copy is filled).
        pre_te_x: Test DataFrame (left untouched; a copy is filled).
        target_col: Column selection used to compute the fill statistic
            ("mean"/"median") or to impute with MissForest ("rf").
        how: "mean", "median" or "rf". Any other value returns plain copies.

    Returns:
        Tuple ``(tr_x, te_x)`` of filled copies.
    """
    tr_x, te_x = pre_tr_x.copy(), pre_te_x.copy()

    if how in ("mean", "median"):
        # The statistic always comes from the training data only.
        fill_value = getattr(tr_x[target_col], how)()
        for frame in (tr_x, te_x):
            frame.fillna(fill_value, inplace=True)
    elif how == "rf":
        forest = MissForest()
        tr_x[target_col] = forest.fit_transform(tr_x[target_col])
        te_x[target_col] = forest.transform(te_x[target_col])

    return tr_x, te_x
Пример #13
0
def missforest_imputer(pd_data, random_state=None):
    """
    Fill in missing values of a DataFrame with the MissForest imputer.

    Inputs:
        pd_data: (DataFrame) Data containing missing values.
        random_state: (int, optional) Seed of the pseudo
            random number generator to use.

    Returns:
        pd_imputed: (DataFrame) Imputed data carrying the index and
            columns of ``pd_data``.
    """
    forest = MissForest(random_state=random_state)
    imputed_values = forest.fit_transform(pd_data)

    # Re-attach the original labels to the imputed values.
    return pd.DataFrame(imputed_values,
                        index=pd_data.index,
                        columns=pd_data.columns)
def test_missforest_numerical_single():
    """A single missing numerical value, default parameters otherwise.

    The expected value is the prediction of a ``RandomForestRegressor``
    trained on the complete rows with the same seed and tree count.
    """
    df = np.array([
        [1, 0, 0, 1],
        [2, 1, 2, 2],
        [3, 2, 3, 2],
        [np.nan, 4, 5, 5],
        [6, 7, 6, 7],
        [8, 8, 8, 8],
        [16, 15, 18, 19],
    ])
    statistics_mean = np.nanmean(df, axis=0)

    y = df[:, 0]
    X = df[:, 1:]
    good_rows = np.where(~np.isnan(y))[0]
    bad_rows = np.where(np.isnan(y))[0]

    rf = RandomForestRegressor(n_estimators=10, random_state=1337)
    rf.fit(X=X[good_rows], y=y[good_rows])
    pred_val = rf.predict(X[bad_rows])

    # Build the expected matrix by assignment: putting the length-1
    # prediction array inside a nested list literal creates a ragged array.
    df_imputed = df.copy()
    df_imputed[bad_rows, 0] = pred_val

    imputer = MissForest(n_estimators=10, random_state=1337)
    assert_array_equal(imputer.fit_transform(df), df_imputed)
    assert_array_equal(imputer.statistics_.get('col_means'), statistics_mean)
Пример #15
0
 def _random_forest(self, df):
     """Impute ``df`` with MissForest (seed 10); keeps the column names.

     The result is a new DataFrame with a default row index.
     """
     forest = MissForest(random_state=10)
     return pd.DataFrame(forest.fit_transform(df), columns=df.columns)
# Expose the index of miss_sum as a regular column so it can serve as the
# x-axis of the bar plot.
miss_sum['Name'] = miss_sum.index

# Plot the missing-value count per variable.
sns.set(style="whitegrid", color_codes=True)
sns.barplot(x='Name', y='count', data=miss_sum)
plt.xticks(rotation=90)
plt.show()

# Shorten the Period variable: characters 4..13 are replaced by an empty
# string in both the training and the test data.
train_data['Period'] = train_data['Period'].str.slice_replace(4, 14, '')
test_data['Period'] = test_data['Period'].str.slice_replace(4, 14, '')

# Impute missing values with MissForest.
from missingpy import MissForest
imputer = MissForest()
train_data_imputed = imputer.fit_transform(train_data)

# Wrap the imputed matrix in a DataFrame with a fresh 0..n-1 index and the
# column names held in train_data_columns (defined outside this snippet).
train_data_imputed = pd.DataFrame(
    data=train_data_imputed[0:, 0:],
    index=[i for i in range(train_data_imputed.shape[0])],
    columns=train_data_columns)

# Notebook-style inspection; has no effect when run as a script.
train_data_imputed.columns

#train_data_imputed.reset_index(drop=True).reset_index(drop=True)

# Notebook-style inspection; has no effect when run as a script.
type(train_data_imputed)
train_data_imputed.head(10)
# Write the imputed training data to an Excel workbook (without the index).
train_data_imputed.to_excel('train_data_imputed.xlsx', index=False)
# Missing-data imputation (second method: MissForest)
# =============================================================================

# Copy the initial dataset (so it can be reused for the second imputation method)
df_housing_impute2=df_housing_copy.copy()

# Encode the qualitative variables as integer category codes
for var in list(var_qualitative.columns):
    codes = pd.Series(df_housing_impute2[var], dtype="category").cat.codes
    # cat.codes marks missing values with -1; turn those back into NaN in
    # one vectorised step. The cast to float is what allows NaN in the
    # integer code column.
    df_housing_impute2[var] = codes.astype(float).where(codes != -1)

# Imputation with MissForest
imputer = MissForest(missing_values = np.nan)
Ny_imputed = imputer.fit_transform(df_housing_impute2)
df_housing_impute2=pd.DataFrame(Ny_imputed, columns=df_housing_impute2.columns.values.tolist())
df_housing_impute2.isnull().sum()

# =============================================================================
# Variable selection
# =============================================================================
# Data without the target variable
X2=df_housing_impute2.loc[:, df_housing_impute2.columns != "Class_prix"]

# Target variable
y2=df_housing_impute2.Class_prix

# Create an instance of the classifier
lr1 = LogisticRegression()
Пример #18
0
def main(args):
    '''Impute a dataset with GAIN, MissForest and MICE and report each RMSE.

    Loads the data, introduces missingness, runs the three imputers in turn,
    prints their RMSE and saves the incomplete and imputed matrices as CSV
    files under ``data/``.

  Args (attributes read from ``args``):
    - data_name: letter or spam
    - miss_rate: probability of missing components
    - batch_size: batch size
    - hint_rate: hint rate
    - alpha: hyperparameter
    - iterations: iterations

  Returns:
    - imputed_data_x: data imputed by GAIN
    - rmse: Root Mean Squared Error of the GAIN imputation
  '''

    data_name = args.data_name
    miss_rate = args.miss_rate

    gain_parameters = {
        'batch_size': args.batch_size,
        'hint_rate': args.hint_rate,
        'alpha': args.alpha,
        'iterations': args.iterations
    }

    # Load data and introduce missingness
    ori_data_x, miss_data_x, data_m = data_loader(data_name, miss_rate)

    # Impute missing data
    imputed_data_x = gain(miss_data_x, gain_parameters)

    # Report the RMSE performance
    rmse = rmse_loss(ori_data_x, imputed_data_x, data_m)
    print()
    # Save the incomplete matrix; miss_data is only used to print its shape.
    mi_data = miss_data_x.astype(float)
    no, dim = imputed_data_x.shape
    miss_data = np.reshape(mi_data, (no, dim))
    np.savetxt("data/missing_data.csv", mi_data, delimiter=',', fmt='%1.2f')
    print('Shape of miss data: ', miss_data.shape)
    print('Save results in missing_data.csv')

    print()
    print('=== GAIN RMSE ===')
    print('RMSE Performance: ' + str(np.round(rmse, 6)))
    # (disabled) print the size of the output file: imputed_data_x.shape
    # fmt='%d' writes the imputed values as integers.
    np.savetxt("data/imputed_data.csv",
               imputed_data_x,
               delimiter=',',
               fmt='%d')
    print('Save results in Imputed_data.csv')

    # MissForest

    print()
    print('=== MissForest RMSE ===')
    data = miss_data_x
    imp_mean = MissForest(max_iter=5)
    miss_f = imp_mean.fit_transform(data)
    #miss_f = pd.DataFrame(imputed_train_df)
    rmse_MF = rmse_loss(ori_data_x, miss_f, data_m)
    print('RMSE Performance: ' + str(np.round(rmse_MF, 6)))
    np.savetxt("data/imputed_data_MF.csv", miss_f, delimiter=',', fmt='%d')
    print('Save results in Imputed_data_MF.csv')

    # MICE From Auto Impute
    print()
    print('=== MICE of Auto Impute RMSE ===')
    data_mice = pd.DataFrame(miss_data_x)
    mi = MiceImputer(k=1,
                     imp_kwgs=None,
                     n=1,
                     predictors='all',
                     return_list=True,
                     seed=None,
                     strategy='default predictive',
                     visit='default')
    mice_out = mi.fit_transform(data_mice)
    # Take element [1] of the first entry of the returned list
    # (presumably the imputed frame of the single imputation -- confirm
    # against the MiceImputer return format) and convert it to an array.
    c = [list(x) for x in mice_out]
    c1 = c[0]
    c2 = c1[1]
    c3 = np.asarray(c2)
    mice_x = c3
    #print('here :', mice_x, miss_f, miss_f.shape)
    rmse_MICE = rmse_loss(ori_data_x, mice_x, data_m)
    print('=== MICE of Auto Impute RMSE ===')
    print('RMSE Performance: ' + str(np.round(rmse_MICE, 6)))
    np.savetxt("data/imputed_data_MICE.csv", mice_x, delimiter=',', fmt='%d')
    print('Save results in Imputed_data_MICE.csv')

    # Only the GAIN result is returned; the other imputations are saved to disk.
    return imputed_data_x, rmse
Пример #19
0
    # Drop outlier rows in place. Each call removes the rows matching one
    # condition, so later, stricter thresholds on the same column
    # (NumberWorse2 > 10, NumberRealEstateLoansOrLines > 5) supersede the
    # earlier, looser ones (> 40).
    data.drop(data[data.DebtRatio > 1].index, inplace=True)
    data.drop(data[data.age <= 0].index, inplace=True)
    data.drop(data[data.age > 100].index, inplace=True)
    data.drop(data[(data.NumberWorse1 > 20)].index, inplace=True)
    data.drop(data[(data.NumberRealEstateLoansOrLines > 40)].index,
              inplace=True)
    data.drop(data[(data.NumberWorse2 > 40)].index, inplace=True)
    data.drop(data[(data.NumberOfDependents > 15)].index, inplace=True)
    data.drop(data[data.NumberWorse2 > 10].index, inplace=True)
    data.drop(data[data.NumberRealEstateLoansOrLines > 5].index, inplace=True)
    data.drop(data[data.NumberOfOpenCreditLinesAndLoans > 20].index,
              inplace=True)

    # Filling missing values
    imputer = MissForest()
    data2 = imputer.fit_transform(data)
    # imputer = KNNImputer(n_neighbors=2, weights="uniform")
    # imp = SimpleImputer(missing_values=np.nan, strategy='mean')
    # data2 = imp.fit_transform(test_data)

    # Re-label the imputed matrix with the names in `attributes`
    # (defined outside this fragment; assumed to match data's column
    # order -- TODO confirm).
    data2 = pd.DataFrame(data2, columns=attributes)

    # Round these columns back to whole numbers after imputation.
    data2['age'] = data2['age'].round(0)
    data2['NumberWorse1'] = data2['NumberWorse1'].round(0)
    data2['MonthlyIncome'] = data2['MonthlyIncome'].round(0)
    data2['NumberOfOpenCreditLinesAndLoans'] = data2[
        'NumberOfOpenCreditLinesAndLoans'].round(0)
    data2['NumberOfTimes90DaysLate'] = data2['NumberOfTimes90DaysLate'].round(
        0)
    data2['NumberRealEstateLoansOrLines'] = data2[
        'NumberRealEstateLoansOrLines'].round(0)
Пример #20
0
# Missing Forest imputation attempt

# Import dependencies
import numpy as np
import pandas as pd
from missingpy import MissForest

# Load the training data and remember its column names, which are lost when
# the frame is converted to a NumPy array for imputation.
train = pd.read_csv("/home/nishant/Desktop/IDA Project/mod_data/train.csv")
cols = train.columns.tolist()

# Impute values
# fit_transform returns a NumPy ndarray, which is converted back to a
# DataFrame below.
imputer = MissForest()

print("[INFO] Imputation started")
X_imputed = imputer.fit_transform(train.values)

print("[INFO] Imputation complete")
train_mf = pd.DataFrame(X_imputed, columns=cols)

# Save the imputed DataFrame to disk (without the row index)
train_mf.to_csv("/home/nishant/Desktop/IDA Project/mod_data/train_mf.csv",
                index=False)
def test_missforest_mixed_multiple():
    """Mixed data: one categorical and one numerical missing value.

    Re-implements the MissForest loop by hand (classifier for categorical
    columns, regressor for numerical ones) and checks that it matches the
    imputer's output and fitted statistics.
    """
    df = np.array([
        [np.nan, 0, 0, 1],
        [0, 1, 2, 2],
        [0, 2, 3, 2],
        [1, 4, 5, 5],
        [1, 7, 6, 7],
        [1, 8, 8, 8],
        [1, 15, 18, np.nan],
    ])

    n_rows, n_cols = df.shape
    cat_vars = [0]
    num_vars = np.setdiff1d(range(n_cols), cat_vars)
    # ravel() gives one mode per column whether SciPy returns the modes with
    # shape (1, n_cols) or (n_cols,), which differs between versions.
    statistics_mode = np.ravel(mode(df, axis=0, nan_policy='omit').mode)
    statistics_mean = np.nanmean(df, axis=0)

    # Fit missforest and transform
    imputer = MissForest(random_state=1337)
    df_imp1 = imputer.fit_transform(df, cat_vars=cat_vars)

    # Get iterations used by missforest above
    max_iter = imputer.iter_count_

    # Get NaN mask
    nan_mask = np.isnan(df)
    nan_rows, nan_cols = np.where(nan_mask)

    # Make initial guess for missing values: mode for the categorical
    # column, mean for the numerical one
    df_imp2 = df.copy()
    df_imp2[0, 0] = statistics_mode[0]
    df_imp2[6, 3] = statistics_mean[3]

    # Loop for max_iter count over the columns with NaNs
    for _ in range(max_iter):
        for c in nan_cols:
            # Identify all other columns (i.e. predictors)
            not_c = np.setdiff1d(np.arange(n_cols), c)
            # Identify rows with NaN and those without in 'c'
            y = df_imp2[:, c]
            X = df_imp2[:, not_c]
            good_rows = np.where(~nan_mask[:, c])[0]
            bad_rows = np.where(nan_mask[:, c])[0]

            # Fit model and predict
            if c in cat_vars:
                rf = RandomForestClassifier(n_estimators=100,
                                            random_state=1337)
            else:
                rf = RandomForestRegressor(n_estimators=100, random_state=1337)
            rf.fit(X=X[good_rows], y=y[good_rows])
            pred_val = rf.predict(X[bad_rows])

            # Fill in values
            df_imp2[bad_rows, c] = pred_val

    assert_array_equal(df_imp1, df_imp2)
    assert_array_equal(imputer.statistics_.get('col_means'),
                       statistics_mean[num_vars])
    assert_array_equal(
        imputer.statistics_.get('col_modes')[0], statistics_mode[cat_vars])
Пример #22
0
# Count the missing values per column (notebook-style inspection).
dataset.isnull().sum()


# The *missingpy* library supports the following algorithms: <br>
# * __k-Nearest Neighbours__ imputation - the _KNNImputer_ class completes missing values using the _k-Nearest Neighbors_ approach. This algorithm requires normalized data, because it is based on Euclidean distance.
# * __MissForest__ imputes missing values using _Random Forests_ in an iterative fashion. It does not require normalization, but all categorical data should be one-hot-encoded.<br><br>
# 
# For this dataset the __MissForest__ method was used.

# In[53]:


# Impute on the raw NumPy values; the column names are re-attached below.
imputer = MissForest()
dataset_to_convert = dataset.to_numpy()
dataset_without_nan = imputer.fit_transform(dataset_to_convert)


# In[54]:


df = pd.DataFrame(dataset_without_nan, columns = dataset.columns)


# The dataset without missing values was saved as _df_.

# In[55]:


df.head()
Пример #23
0
def imputeMatrix(dataM):
	"""Return ``dataM`` with its missing values imputed by MissForest.

	Uses a default-configured ``MissForest``; the result is whatever its
	``fit_transform`` returns for ``dataM``.
	"""
	imputer = MissForest()
	return imputer.fit_transform(dataM)
Пример #24
0
    "country_x", "Year_x", "country_y", "Year_y", "BORROWERS_CTY_x",
    "BORROWERS_CTY_y", "Year_y", "level_0"
])

# In[20]:

# Convert every column except the country code and the year to numeric;
# values that cannot be parsed become NaN (errors='coerce').
cols = full.columns.drop(["ISO2 Code", "year"])

full[cols] = full[cols].apply(pd.to_numeric, errors='coerce')

# In[20]:

# Impute the missing values and rebuild the frame with the original labels.
# NOTE(review): "ISO2 Code" and "year" are passed to the imputer without
# numeric conversion -- confirm they are already numeric at this point.
from missingpy import MissForest
imputer = MissForest()
full_imp = imputer.fit_transform(full)

full = pd.DataFrame(data=full_imp, columns=full.columns, index=full.index)

# In[83]:

# Create derived variables

# Keep the country code aside: it is restored after the group-wise forward
# fill below.
iso = full["ISO2 Code"]

full = full.groupby('ISO2 Code').ffill()

full["ISO2 Code"] = iso

# Per-country percentage change of NGDP_R_K_IX over 12 rows.
full["gdp_growth"] = full.groupby(
    'ISO2 Code', sort=False).NGDP_R_K_IX.apply(lambda x: x.pct_change(12))
Пример #25
0
def prepare_data(data,
                 data_idxs,
                 outcome,
                 convert_categorical=True,
                 keep_cols=None,
                 scaler=None,
                 imputer=None,
                 verbose=False,
                 seed=None):
    """Build the feature matrix and survival outcome for one ``outcome``.

    Steps: select features, drop excluded variables, one-hot encode
    ethnicity/gender, keep samples with positive event time, select columns,
    impute with MissForest and standardise the numerical variables.

    Args:
        data: DataFrame whose last six columns are not features.
        data_idxs: Sample identifiers, one per row of ``data``.
        outcome: Outcome name used to build the time/event column names.
        convert_categorical: Whether to one-hot encode ethnicity and gender.
        keep_cols: Columns to keep (missing ones are created as 0.0). When
            None, all-zero columns are dropped instead.
        scaler: Fitted scaler to reuse; a ``StandardScaler`` is fitted if None.
        imputer: Fitted imputer to reuse; a ``MissForest`` is fitted if None.
        verbose: Print shapes and summaries of the result.
        seed: Random state for a newly created imputer.

    Returns:
        Tuple ``(X, y, scaler, imputer, data_idxs)``. Note that ``X`` is
        rebuilt after imputation with a default row index, while ``y`` keeps
        the filtered index of ``data``.
    """
    X = data.iloc[:, 0:-6]  # TODO: get rid of magic number

    # remove excluded variables
    for excluded in EXCLUDE_VARS:
        if excluded not in X.columns:
            continue
        print('dropped {} column...'.format(excluded))
        X = X.drop([excluded], axis=1)

    # convert categorical variables
    if convert_categorical:
        for cat_col in ('ethnicity', 'gender'):
            X = pd.concat([X, pd.get_dummies(X[cat_col])], axis=1)
        X = X.drop(['ethnicity', 'gender'], axis=1)
        X = X.drop(['Other', 'Female'], axis=1)  # to avoid colinearity

    ## Extract outcomes
    time_col = 'censor_or_{}_days'.format(outcome)
    event_col = '{}_indicator'.format(outcome)
    y = data[[time_col, event_col]]

    ## Filter for appropriate samples
    prev_ct = len(y)
    pos_events = y.iloc[:, 0] > 0  # event times > 0
    X = X.loc[pos_events]
    y = y.loc[pos_events]
    data_idxs = [
        idx for idx, included in zip(data_idxs, pos_events.tolist())
        if included
    ]
    print('filtered out {} events with times < 0'.format(prev_ct - len(y)))

    if keep_cols is None:
        X = X.loc[:, (X != 0).any(axis=0)]  # drop columns w/ all zero
    else:
        for vr in keep_cols:
            if vr not in X.columns:
                X[vr] = 0.0  # impute with zero by default
        X = X[keep_cols]

    # check for nulls and impute
    x_null = np.sum(pd.isnull(X))
    y_null = np.sum(pd.isnull(y))
    if (x_null.sum() > 0) or (y_null.sum() > 0):
        print('Will impute...')
        print('NULL (X, y):', x_null, y_null)

    # The imputer runs regardless of whether nulls were reported above.
    fit_new_imputer = imputer is None
    if fit_new_imputer:
        print('Fitting MissForest...')
        imputer = MissForest(random_state=seed)
        X_data = imputer.fit_transform(X)
    else:
        X_data = imputer.transform(X)
    X = pd.DataFrame(data=X_data, columns=X.columns)
    if fit_new_imputer:
        print('Fitted.')

    # scale numerical values
    if scaler is None:
        scaler = StandardScaler()
        scaled = scaler.fit_transform(X[NUMERICAL_VARS])
    else:
        scaled = scaler.transform(X[NUMERICAL_VARS])
    X[NUMERICAL_VARS] = scaled

    if verbose:
        print('X.shape: {}, y.shape: {}'.format(X.shape, y.shape))
        print('Columns: {}'.format(X.columns))
        print('---------------- X ----------------\n{}'.format(X.describe()))
        print('---------------- y ----------------\n{}'.format(y.describe()))

    return X, y, scaler, imputer, data_idxs
Пример #26
0
def panel_data(train, years_ahead=1):
    """
    Impute predictor series with MissForest, then predict with a panel model.

    A random forest trained on the observed values of a data matrix
    (selected series codes, excluding the first nine columns used as
    responses) predicts the missing values. A PanelOLS model with entity
    effects is then fitted per response on years before 2007 and used to
    predict year 2007. The RMSE against the observed 2007 responses is
    printed.

    Args:
      train: wide DataFrame with year columns plus 'Country Name' and
        'Series Code' (assumed layout: 36 year columns first -- TODO confirm).
      years_ahead: number of rows the imputed predictors are shifted by
        before they are merged with the responses.
    Returns:
      y_pred: prediction values of target
    """
    # Reshape to long format, then pivot to (country, year) x series code.
    train_melt = pd.melt(train.iloc[:, 0:38],
                         id_vars=['Country Name', 'Series Code'],
                         value_vars=train.columns[0:36],
                         var_name='year',
                         value_name='value')
    # The year is the first four characters of the original column name.
    train_melt['year'] = train_melt['year'].str[:4].astype(int)
    panel = train_melt.groupby(['Country Name', 'year',
                                'Series Code'])['value'].mean().unstack()

    # Keep only series whose missing count is at most 18 in every country.
    # NOTE(review): the original comment read "at least one observed value
    # across 36 years in each country" -- confirm which rule is intended.
    left_feature = panel.iloc[:, 9:].isna().groupby('Country Name').sum().max(
        axis=0) <= 18
    pred = panel.iloc[:, 9:].iloc[:, left_feature.values]

    # construct matrix of features across countries
    # (one block of columns per country, side by side)
    df = []
    ct_list = list(set(pred.index.get_level_values(0)))
    ct_list = sorted(ct_list)
    for i in ct_list:
        df.append(pred.loc[i])
    predictors = pd.concat(df, axis=1)

    # random forest imputation
    imputer = MissForest()
    predictors_imputed = imputer.fit_transform(predictors)

    panel.reset_index(inplace=True)
    # Hard-coded layout: 9 responses (y1..y9) and 1296 predictors (x1..x1296).
    panel.columns = ['Country Name', 'year'] + [
        'y' + str(i) for i in range(1, 10)
    ] + ['x' + str(i) for i in range(1, 1297)]
    # 214 is hard-coded (presumably the number of countries -- TODO confirm).
    nfeature = int(predictors.shape[1] / 214)
    # Split the wide imputed matrix back into per-country blocks and stack
    # them vertically.
    split = list(range(nfeature, predictors_imputed.shape[1], nfeature))
    _ = np.split(predictors_imputed, split, 1)
    predictors_new = pd.DataFrame(np.vstack(_))
    predictors_new['year'] = panel.year
    predictors_new['Country Name'] = panel['Country Name']
    predictors_new.columns = [
        'x' + str(i) for i in range(1, pred.shape[1] + 1)
    ] + ['year', 'Country Name']

    # combine the updated feature matrix and responses
    feature = predictors_new.isna().sum() <= 0  # change to 1
    panel_left = predictors_new.iloc[:, feature.values]
    panel_comb = pd.merge(panel.iloc[:, 0:11], panel_left.shift(years_ahead))

    # Split prediction and target
    panel_train = panel_comb.loc[panel_comb.year < 2007]
    panel_train = panel_train.set_index(['Country Name', 'year'])
    panel_test = panel_comb.loc[panel_comb.year == 2007]
    panel_test = panel_test.set_index(['Country Name', 'year'])

    # panel data model: one regression per response y1..y9
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore")
        Ypred = pd.DataFrame()
        for i in range(1, 10):
            formula = 'y' + str(i) + '~1+' + '+'.join(
                panel_train.columns[11:].values) + '+EntityEffects'
            mod = PanelOLS.from_formula(formula, panel_train)
            res = mod.fit(cov_type='clustered', cluster_entity=True)
            Ypred['y' + str(i)] = res.predict(data=panel_test).predictions

    # Eval: RMSE over all responses, ignoring NaNs
    Yval = panel_test.iloc[:, :9]
    rmse = np.sqrt(np.nanmean(np.power(Ypred - Yval, 2)))
    print(rmse)

    return Ypred
Пример #27
0
# Histograms and density plots of the four level variables
dataset['horseLevel'].plot.hist(bins=10, alpha=0.5)
dataset['sireLevel'].plot.hist(bins=10, alpha=0.5)
dataset['damLevel'].plot.hist(bins=10, alpha=0.5)
dataset['sireOfdamLevel'].plot.hist(bins=10, alpha=0.5)

# Density curve only: hist=False, so bins and hist_kws are passed but no
# histogram is drawn.
sns.distplot(dataset['horseLevel'], hist=False, kde=True, 
             bins=int(180/5), color = 'darkblue', 
             hist_kws={'edgecolor':'black'},
             kde_kws={'linewidth': 4})


# Random forest imputation
# NOTE(review): imputation runs on `df`, while the plots above use
# `dataset` -- confirm both refer to the intended data.
imputer = MissForest()
imputedData = imputer.fit_transform(df)
imputedData = pd.DataFrame(imputedData, columns = df.columns)


# Create train/test frames with a random split (about 80% train)
msk = np.random.rand(len(imputedData)) < 0.8
train = imputedData[msk]
test = imputedData[~msk]

# OLS of horseLevel on the three parent levels
# The 'const' column is added to train but is not part of exog below, so
# the regression is fitted without an intercept.
train['const'] = 1
reg1 = sm.OLS(endog=train['horseLevel'], exog=train[['damLevel', 'sireLevel', 'sireOfdamLevel']], 
    missing='drop')

results1 = reg1.fit()
Пример #28
0
# Select the raw features, derive the title via title_extract, then put the
# target back as first column and encode through dummy_encode.
my_train_data1 = my_train_data.loc[:, ['Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Embarked']]
my_train_data1 = title_extract(my_train_data1)
train_data = pd.concat([my_train_data['Survived'].reset_index(drop=True), my_train_data1.reset_index(drop=True)], axis=1)
train_data = dummy_encode(train_data, 3, 7, 1, 8)

# Feature scaling (Age)
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
train_data[['Age']] = sc.fit_transform(train_data[['Age']])

from missingpy import MissForest

# Make an instance and perform the imputation
# The target column is excluded from imputation, so after this line
# train_data no longer contains 'Survived'.
imputer = MissForest(random_state=0)
train_data = pd.DataFrame(imputer.fit_transform(train_data.drop(['Survived'], axis=1)), columns=train_data.columns[1:])

# The same preparation for the CV and test sets

my_CV_data1 = my_CV_data.loc[:, ['Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Embarked']]
CV_data = title_extract(my_CV_data1)
CV_data = dummy_encode(CV_data, 3, 7, 1, 8) 

# The test set has no 'Survived' column, hence the shifted positional
# arguments to dummy_encode.
my_test_data1 = test.loc[:, ['Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Embarked']]
test_data = title_extract(my_test_data1)
test_data = dummy_encode(test_data, 2, 6, 0, 7)

# Feature scaling (Age)
from sklearn.preprocessing import StandardScaler

# A fresh scaler replaces the one fitted on the training data above.
sc = StandardScaler()
Пример #29
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Author  : qichun tang
# @Contact    : [email protected]
from copy import deepcopy

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

df = pd.read_csv("train_classification.csv")
df_ce = deepcopy(df)
# Label-encode the observed values of each listed column, leaving missing
# entries as NaN so that MissForest can impute them.
for name in ["Name", "Sex", "Ticket", "Fare", "Cabin", "Embarked"]:
    observed = df_ce[name].notna()
    # Write through df_ce.loc: assigning into a column selected with
    # df_ce[name] is chained assignment and may not reach the frame.
    df_ce.loc[observed, name] = LabelEncoder().fit_transform(
        df_ce.loc[observed, name])

from missingpy import MissForest

imputer = MissForest()
imputer.fit_transform(df_ce.values.astype("float"))
Пример #30
0
def rf(data):
    """Impute ``data`` with a default-configured MissForest."""
    from missingpy import MissForest
    forest = MissForest()
    imputed = forest.fit_transform(data)
    return imputed