示例#1
0
 def impute(self, df):
     """Fill missing values in *df* and return a new DataFrame.

     Uses KNN imputation when ``self.knn`` is truthy, otherwise an
     IterativeImputer (MICE-style chained equations). Column labels are
     preserved.
     """
     solver = KNN() if self.knn else IterativeImputer()
     filled = solver.fit_transform(df)
     return pd.DataFrame(filled, columns=df.columns)
示例#2
0
 def impute(self, df):
     """Impute missing values in ``df`` and return a new DataFrame.

     Uses KNN imputation when ``self.knn`` is truthy, otherwise an
     IterativeImputer (MICE-style chained equations). Column labels are
     preserved; the rebuilt frame gets a fresh default RangeIndex.
     """
     if self.knn:
         knn = KNN()
         return pd.DataFrame(knn.fit_transform(df), columns=df.columns)
     else:
         mice = IterativeImputer()
         return pd.DataFrame(mice.fit_transform(df), columns=df.columns)
def test_iterative_imputer_with_low_rank_random_matrix():
    """IterativeImputer must reconstruct the low-rank matrix with mean
    absolute error below 0.1 on the held-out (missing) entries."""
    solver = IterativeImputer(n_iter=50, random_state=0)
    completed = solver.fit_transform(XY_incomplete)
    _, missing_mae = reconstruction_error(
        XY, completed, missing_mask, name="IterativeImputer")
    assert missing_mae < 0.1, "Error too high with IterativeImputer method!"
def multi_imp(data,m):
    """Run *m* independent stochastic imputations of *data*.

    Each pass fits a fresh IterativeImputer (``sample_posterior=True``)
    with a distinct random seed, and the completed rows are collected.

    NOTE(review): ``extend`` flattens the m completed matrices row-wise,
    so the result has shape (m * n_rows, n_cols) rather than
    (m, n_rows, n_cols) -- confirm callers expect the stacked-rows layout.
    """
    completed_rows = []
    for seed in range(m):
        solver = IterativeImputer(n_iter=5, sample_posterior=True,
                                  random_state=seed)
        completed_rows.extend(solver.fit_transform(data))
    return np.array(completed_rows)
示例#5
0
 def get_predict(self, flag, in_data):
   """Impute the entries of one measurement vector that are masked out.

   Copies *in_data* into an (M_NUM, 1) column, blanks the positions where
   ``flag`` is False, appends it as one extra column to the stored
   ``self.t_measure`` history, and runs MICE over the transposed stack so
   the blanks are filled from the historical measurements.

   NOTE(review): assumes ``flag`` is a boolean mask aligned with
   *in_data* and that ``self.t_measure`` stacks M_NUM-length measurement
   vectors column-wise -- confirm against callers.
   """
   output = in_data.copy()
   output.shape = (utils.M_NUM, 1)
   output[~flag] = np.nan
   solver = MICE()
   tmp = self.t_measure.copy()
   tmp = np.column_stack((tmp, output)).transpose()
   tmp = solver.fit_transform(tmp)
   # After the transpose, the last row is the (now imputed) new vector.
   output = np.array(tmp[-1, :]).reshape(utils.M_NUM, 1)
   return output
def test_iterative_imputer_with_low_rank_random_matrix_approximate():
    """Same low-rank recovery check, restricting each per-feature
    regression to the 5 nearest features; MAE must stay below 0.1."""
    solver = IterativeImputer(n_iter=50, n_nearest_features=5, random_state=0)
    completed = solver.fit_transform(XY_incomplete)
    _, missing_mae = reconstruction_error(
        XY,
        completed,
        missing_mask,
        name="IterativeImputer with n_nearest_features=5")
    assert missing_mae < 0.1, "Error too high with IterativeImputer " \
                              "method using n_nearest_features=5!"
示例#7
0
def mice_imputer_wo_target(df):
    """MICE-impute *df* (features only, no target column) and rebuild a
    DataFrame under the fixed HR-dataset column names."""
    column_names = [
        'city', 'city_development_index', 'gender',
        'relevent_experience', 'enrolled_university',
        'education_level', 'major_discipline',
        'experience', 'company_size', 'company_type',
        'last_new_job', 'training_hours'
    ]
    filled = IterativeImputer().fit_transform(df)
    return pd.DataFrame(filled, columns=column_names)
def test_iterative_imputer_train_test_with_low_rank_random_matrix():
    """Fit on the first 250 rows, impute the rest via transform();
    held-out MAE on the missing entries must stay below 0.1."""
    train_rows = XY_incomplete[:250]
    test_rows = XY_incomplete[250:]
    truth_test = XY[250:]
    solver = IterativeImputer(n_iter=50, random_state=0)
    solver.fit(train_rows)
    completed_test = solver.transform(test_rows)
    _, missing_mae = reconstruction_error(
        truth_test,
        completed_test,
        missing_mask,
        name="IterativeImputer Train/Test")
    assert missing_mae < 0.1, "Error too high with IterativeImputer train/test method!"
def test_iterative_imputer_as_mice_with_low_rank_random_matrix_approximate():
    """MICE-style check: average five posterior-sampled imputations and
    require MAE below 0.1 on the missing entries."""
    completions = [
        IterativeImputer(n_iter=5,
                         sample_posterior=True,
                         random_state=seed).fit_transform(XY_incomplete)
        for seed in range(5)
    ]
    _, missing_mae = reconstruction_error(XY,
                                          np.mean(completions, axis=0),
                                          missing_mask,
                                          name="IterativeImputer as MICE")
    assert missing_mae < 0.1, "Error too high with IterativeImputer as MICE!"
示例#10
0
    def fit(self, X, y=None):
        """Fit one or more IterativeImputer instances on *X*.

        Three modes:
          - ``y`` given: one group of ``self.multiple`` imputers per class
            of ``y``, each fitted on that class's rows.
          - ``self.groupby`` set: same, but the groups come from the
            values of the ``X[self.groupby]`` column. ``y`` and
            ``groupby`` are mutually exclusive.
          - neither: a flat list of ``self.multiple`` imputers fitted on
            all of X.

        Returns self.
        """
        assert isinstance(X, pd.DataFrame)
        y_present = y is not None
        groupby_present = self.groupby is not None
        self.imputers = []
        if y_present or groupby_present:
            assert not (groupby_present and y_present)
            if y_present:
                classes = np.unique(y)
                gen_mask = lambda c: y == c
            if groupby_present:
                classes = X[self.groupby].unique()
                gen_mask = lambda c: X[self.groupby] == c
            self.imputers = {
                c: {
                    "impute": [
                        IterativeImputer(n_iter=self.n_iter,
                                         sample_posterior=True,
                                         random_state=ix,
                                         **self.kwargs)
                        for ix in range(self.multiple)
                    ],
                    "mask":
                    gen_mask(c),
                }
                for c in classes
            }

            msg = """Imputation transformer: {} imputers x {} classes""".format(
                self.multiple, len(classes))
            logger.info(msg)

            for c, d in self.imputers.items():
                for imp in d["impute"]:
                    # BUG FIX: a pandas DataFrame cannot be indexed with a
                    # (mask, :) tuple -- X[d["mask"], :] raises. Boolean
                    # row selection must go through .loc.
                    imp.fit(X.loc[d["mask"], :])

        else:
            for ix in range(self.multiple):
                self.imputers.append(
                    IterativeImputer(n_iter=self.n_iter,
                                     sample_posterior=True,
                                     random_state=ix,
                                     **self.kwargs))
            msg = """Imputation transformer: {} imputers""".format(
                self.multiple)
            logger.info(msg)
            for ix in range(self.multiple):
                self.imputers[ix].fit(X)

        return self
示例#11
0
 def __init__(self, method, **kwargs):
     """Build the underlying imputation solver selected by *method*.

     Supported: "SoftImpute", "KNN", "Naive" (SimpleFill). "II"
     (IterativeImputer) is declared but deliberately disabled.

     Raises:
         NotImplementedError: for "II" (untested) or any unknown method.
     """
     self.clf = None
     self.method = method
     if method == "SoftImpute":
         self.clf = SoftImpute(**kwargs)
     elif method == "KNN":
         self.clf = KNN(**kwargs)
     elif method == "Naive":
         self.clf = SimpleFill()
     elif method == 'II':
         # BUG FIX: ``raise ('NOT TESTED')`` raises a bare string, which
         # is itself a TypeError; raise a real exception type instead.
         raise NotImplementedError('NOT TESTED')
         self.clf = IterativeImputer(min_value=0)  # unreachable by design
     else:
         # BUG FIX: same raise-a-string defect as above.
         raise NotImplementedError("Not Implemented method")
示例#12
0
class DFIterativeImputer(BaseEstimator, TransformerMixin):
    """Thin sklearn-style wrapper around IterativeImputer that keeps the
    DataFrame index and column labels of its input."""

    def __init__(self, max_iter=10):
        self.imputer = None
        self.max_iter = max_iter

    def fit(self, X, y=None):
        """Fit a fresh IterativeImputer (bounded by max_iter) on X."""
        self.imputer = IterativeImputer(max_iter=self.max_iter)
        self.imputer.fit(X)
        return self

    def transform(self, X):
        """Impute X and re-wrap the result with X's index/columns."""
        filled = self.imputer.transform(X)
        return pd.DataFrame(filled, index=X.index, columns=X.columns)
示例#13
0
def deal_mar(df):
    """Deal with missing data with missing at random pattern.

    Runs three imputers (KNN, matrix factorization, multiple imputation),
    scores each completed matrix with compute_imputation_score(), and
    returns the name of the best-scoring method.
    """

    Xy_incomplete = df.values

    # knn
    with NoStdStreams():
        Xy_filled_knn = KNN().fit_transform(Xy_incomplete)
    score_knn = compute_imputation_score(Xy_filled_knn)
    print("Imputation score of knn is {}".format(score_knn))
    # matrix factorization
    with NoStdStreams():
        Xy_filled_mf = MatrixFactorization().fit_transform(Xy_incomplete)
    score_mf = compute_imputation_score(Xy_filled_mf)
    # BUG FIX: this line previously reported score_knn instead of score_mf.
    print("Imputation score of matrix factorization is {}".format(score_mf))
    # multiple imputation
    with NoStdStreams():
        Xy_filled_ii = IterativeImputer().fit_transform(Xy_incomplete)
    score_ii = compute_imputation_score(Xy_filled_ii)
    print("Imputation score of multiple imputation is {}".format(score_ii))

    score_dict = {'knn': score_knn,
                  'matrix factorization': score_mf, 'multiple imputation': score_ii}
    recommend = max(score_dict, key=score_dict.get)
    # typo fix: "socre" -> "score"
    print("Imputation method with the highest score is {}".format(recommend))
    return recommend
示例#14
0
def baseline_inpute(X_incomplete, method='mean', level=0):
    """Impute *X_incomplete* with one of several baseline methods.

    method: 'mean' | 'knn' | 'svd' | 'mice' | 'spectral'; *level* (0-2)
    selects an increasingly aggressive hyper-parameter for the chosen
    method. Raises NotImplementedError for any other method name.
    """
    if method == 'mean':
        return SimpleFill().fit_transform(X_incomplete)
    if method == 'knn':
        neighbours = [3, 10, 50][level]
        return KNN(k=neighbours, verbose=False).fit_transform(X_incomplete)
    if method == 'svd':
        n_features = X_incomplete.shape[1]
        rank_choices = [
            np.ceil((n_features - 1) / 10),
            np.ceil((n_features - 1) / 5),
            n_features - 1,
        ]
        solver = IterativeSVD(rank=int(rank_choices[level]), verbose=False)
        return solver.fit_transform(X_incomplete)
    if method == 'mice':
        iterations = [3, 10, 50][level]
        return IterativeImputer(max_iter=iterations).fit_transform(X_incomplete)
    if method == 'spectral':
        # default value for the sparsity level is with respect to the maximum
        # singular value; this is now done in a heuristic way
        shrinkage = [0.5, None, 3][level]
        return SoftImpute(shrinkage_value=shrinkage).fit_transform(X_incomplete)
    raise NotImplementedError
示例#15
0
def clean_missing(df, features, setting):
    """Clean missing values in the dataset.
    Parameters
    ----------
    df : DataFrame
    features : List
        List of feature names.
    setting : str
        Missingness mechanism: 'mcar', 'mar' or 'mnar'; anything else
        falls back to MAR.
    Returns
    -------
    features_new : List
        List of feature names after cleaning.
    Xy_filled : array-like
        Numpy array where missing values have been cleaned.
    Raises
    ------
    ValueError
        If the recommended approach has no implementation here (e.g.
        'list deletion', which deal_mcar can recommend).
    """

    df_preprocessed, features_new = missing_preprocess(df, features)
    if setting == 'mcar':
        recommend = deal_mcar(df_preprocessed)
    elif setting == 'mar':
        recommend = deal_mar(df_preprocessed)
    elif setting == 'mnar':
        recommend = deal_mnar(df_preprocessed)
    else:
        print("Default MAR")
        recommend = deal_mar(df_preprocessed)

    if recommend == 'mean':
        print("Applying mean imputation ...")
        Xy_filled = Imputer(missing_values=np.nan,
                            strategy='mean').fit_transform(
                                df_preprocessed.values)
        print("Missing values cleaned!")
    elif recommend == 'mode':
        print("Applying mode imputation ...")
        Xy_filled = Imputer(missing_values=np.nan,
                            strategy='most_frequent').fit_transform(
                                df_preprocessed.values)
        print("Missing values cleaned!")
    elif recommend == 'knn':
        print("Applying knn imputation ...")
        with NoStdStreams():
            Xy_filled = KNN().fit_transform(df_preprocessed.values)
        print("Missing values cleaned!")
    elif recommend == 'matrix factorization':
        print("Applying matrix factorization ...")
        with NoStdStreams():
            Xy_filled = MatrixFactorization().fit_transform(
                df_preprocessed.values)
        print("Missing values cleaned!")
    elif recommend == 'multiple imputation':
        print("Applying multiple imputation ...")
        with NoStdStreams():
            Xy_filled = IterativeImputer().fit_transform(
                df_preprocessed.values)
        print("Missing values cleaned!")
    else:
        # BUG FIX: previously this branch only printed an error and fell
        # through to the return with Xy_filled unbound (NameError).
        raise ValueError(
            "Approach not available: {!r}".format(recommend))
    return features_new, Xy_filled
示例#16
0
class vk_sensing():
    """Matrix-completion wrapper: builds an imputation solver by name and
    (optionally) selects its rank via a random hold-out in CVfit()."""

    def __init__(self, method, **kwargs):
        """Select the solver for *method*.

        Raises:
            NotImplementedError: for 'II' (untested) or unknown names.
        """
        self.clf = None
        self.method = method
        if method == "SoftImpute":
            self.clf = SoftImpute(**kwargs)
        elif method == "KNN":
            self.clf = KNN(**kwargs)
        elif method == "Naive":
            self.clf = SimpleFill()
        elif method == 'II':
            # BUG FIX: ``raise ('NOT TESTED')`` raised a bare string,
            # which is itself a TypeError; raise a real exception type.
            raise NotImplementedError('NOT TESTED')
            self.clf = IterativeImputer(min_value=0)  # unreachable by design
        else:
            # BUG FIX: same raise-a-string defect as above.
            raise NotImplementedError("Not Implemented method")

    def fit_transform(self, X_train):
        """Impute X_train; an all-NaN input becomes all zeros, a NaN-free
        input is returned unchanged."""
        assert (self.clf is not None)
        X_est = None
        if np.isnan(X_train).any():
            if np.isnan(X_train).all():
                X_est = np.zeros_like(X_train)
            else:
                X_est = massage_imputed_matrix(self.clf.fit_transform(X_train))
        else:
            X_est = X_train
        assert (not np.isnan(X_est).any())
        return X_est

    def CVfit(self, X, val_ratio=0.2):
        """Sweep GLOB_IMPUTE_K_SWEEP ranks on a random observed-entry
        split and rebuild self.clf with the best-MAE rank.

        NOTE(review): X_train keeps only the entries where sample_mask is
        True (~val_ratio of the observed cells) while X_val keeps the
        rest, which looks inverted relative to the name ``val_ratio`` --
        confirm the intended split before relying on it.
        """
        mask = np.invert(np.isnan(X))
        sample_mask = np.random.rand(*X.shape) < val_ratio
        X_train = X.copy()
        X_train[mask & (~sample_mask)] = np.nan
        X_val = X.copy()
        X_val[mask & (sample_mask)] = np.nan
        cur_best_err = np.inf
        cur_best_k = None
        for k in GLOB_IMPUTE_K_SWEEP:
            clf = construct_low_rank_imputer(self.method, k)
            if np.isnan(X_train).any():
                if np.isnan(X_train).all():
                    X_est = np.zeros_like(X_train)
                else:
                    X_est = massage_imputed_matrix(clf.fit_transform(X_train))
            else:
                X_est = X_train
            err = MAE(X_est, X_val)
            if err < cur_best_err:
                cur_best_err = err
                cur_best_k = k
        if cur_best_k is None:
            cur_best_k = 1
        self.clf = construct_low_rank_imputer(self.method, cur_best_k)
示例#17
0
def deal_mcar(df):
    """Deal with missing data with missing completely at random pattern.

    If fewer than 5% of the rows contain missing values, recommends
    'list deletion'; otherwise scores five imputers and returns the name
    of the best-scoring one.
    """
    # number of instances
    num_instances = df.shape[0]

    # number of rows containing missing
    num_missing_instances = df.isnull().sum(axis=1).astype(bool).sum()

    # missing percentage
    missing_percentage = num_missing_instances / num_instances
    print("Missing percentage is {}".format(missing_percentage))

    if missing_percentage < 0.05:
        recommend = 'list deletion'
    else:
        Xy_incomplete = df.values
        # mean
        Xy_filled_mean = Imputer(missing_values=np.nan,
                                 strategy='mean').fit_transform(Xy_incomplete)
        score_mean = compute_imputation_score(Xy_filled_mean)
        print("Imputation score of mean is {}".format(score_mean))
        # mode
        Xy_filled_mode = Imputer(
            missing_values=np.nan,
            strategy='most_frequent').fit_transform(Xy_incomplete)
        score_mode = compute_imputation_score(Xy_filled_mode)
        print("Imputation score of mode is {}".format(score_mode))
        # knn
        with NoStdStreams():
            Xy_filled_knn = KNN().fit_transform(Xy_incomplete)
        score_knn = compute_imputation_score(Xy_filled_knn)
        print("Imputation score of knn is {}".format(score_knn))
        # matrix factorization
        with NoStdStreams():
            Xy_filled_mf = MatrixFactorization().fit_transform(Xy_incomplete)
        score_mf = compute_imputation_score(Xy_filled_mf)
        # BUG FIX: this line previously reported score_knn, not score_mf.
        print(
            "Imputation score of matrix factorization is {}".format(score_mf))
        # multiple imputation
        with NoStdStreams():
            Xy_filled_ii = IterativeImputer().fit_transform(Xy_incomplete)
        score_ii = compute_imputation_score(Xy_filled_ii)
        print("Imputation score of multiple imputation is {}".format(score_ii))

        score_dict = {
            'mean': score_mean,
            'mode': score_mode,
            'knn': score_knn,
            'matrix factorization': score_mf,
            'multiple imputation': score_ii
        }
        recommend = max(score_dict, key=score_dict.get)
        # typo fix: "socre" -> "score"
        print("Imputation method with the highest score is {}".format(
            recommend))
    return recommend
示例#18
0
    def __init__(self):
        """Build the PTSD study train/test datasets.

        Loads the main PTSD sheet plus three PCL questionnaire waves,
        merges them on ID, optionally MICE-imputes the merged frame, and
        creates nested stratified train/test splits on PCL_Strict3.

        NOTE(review): depends on a module-level ``mew`` flag and on class
        attributes (features, ID, target_features, features_2,
        target_features_2) defined elsewhere -- confirm before reuse.
        """
        path = "C:\‏‏PycharmProjects\PTSD\Data\PTSD.xlsx"
        df = pd.read_excel(path)
        # Keep only labeled rows with positive military exposure.
        df = df[~df['PCL_Strict3'].isna()]
        df = df[df["military_exp18_t3"] > 0]
        df = df[self.features + self.ID + self.target_features]
        df_pcl3 = pd.read_excel("C:\‏‏PycharmProjects\PTSD\Data\questionnaire6PCL3.xlsx")
        df_pcl3 = PCL_calculator(df_pcl3)
        df_pcl2 = pd.read_excel("C:\‏‏PycharmProjects\PTSD\Data\questionnaire6PCL2.xlsx")
        df_pcl2 = PCL_calculator(df_pcl2)
        df_pcl1 = pd.read_excel("C:\‏‏PycharmProjects\PTSD\Data\questionnaire6PCL1.xlsx")
        df_pcl1 = PCL_calculator(df_pcl1)

        # Outer-join the three questionnaire waves onto the main sheet.
        df = df.merge(df_pcl1, on="ID", how='outer')
        df = df.merge(df_pcl2, suffixes=('_pcl1', '_pcl2'), on="ID", how='outer')
        df = df.merge(df_pcl3.drop(['PCL3_Strict', 'pcl3', 'PCL3_Broad'], axis=1), on="ID", how='outer')

        # Outer joins can introduce unlabeled rows; drop them again.
        df = df[~df['PCL_Strict3'].isna()]
        #df = df[~df['tred_cutoff'].isna()]
        df.drop(self.ID, inplace=True, axis=1)
        if mew:
            # Optional MICE imputation of the merged frame.
            mice = IterativeImputer()
            df = pd.DataFrame(mice.fit_transform(df), columns=df.columns)

        all_x_col = self.features + self.features_2 + self.target_features_2
        #all_x_col = self.features + self.features_2
        #y_col = ["tred_cutoff"]
        y_col = ["PCL_Strict3"]
        X = df[all_x_col]
        Y = df[y_col]
        # Nested stratified splits: 25% outer test set, then 25% of the
        # remainder as an inner test set.
        X_train_0, X_test_0, y_train_0, y_test_0 = train_test_split(X, Y, test_size=0.25, random_state=271828, stratify=Y)
        X_train, X_test, y_train, y_test = train_test_split(X_train_0, y_train_0, test_size=0.25, random_state=271828, stratify=y_train_0)
        df = pd.concat([X_train, y_train], axis=1)
        self.X_test = X_test
        self.y_test =y_test

        self.X_train_0 = X_train_0
        self.X_test_0 = X_test_0
        self.y_train_0 = y_train_0
        self.y_test_0 = y_test_0

        self.df = df
def mice_impute(data):
    """MICE-impute *data* (a DataFrame), write the completed frame to
    ``mice_imputed_data.csv`` and return it.

    The filled frame keeps the original index and column labels.
    """
    print("imputing data using mice")
    data_matrix = data.values
    filled_data = pd.DataFrame(
        IterativeImputer(imputation_order='random',
                         n_iter=5,
                         sample_posterior=True).fit_transform(data_matrix))
    filled_data.columns = data.columns
    filled_data.index = data.index
    filled_data.to_csv('mice_imputed_data.csv')
    print("data imputed using mice")
    # BUG FIX: the completed frame was computed and saved but never
    # returned -- callers always received None. Returning it is
    # backward-compatible (existing callers simply ignore it).
    return filled_data
示例#20
0
def obj_sim(path):
    """Read the wine CSV at *path*, impute its 5th value column with
    IterativeImputer, and hand the completed single-column frame to
    draw()."""
    raw = pd.read_csv(path,
                      header=0,
                      index_col=0,
                      engine='python',
                      encoding='utf-8').values
    column = raw[:, 4].reshape(-1, 1)
    completed = IterativeImputer().fit_transform(column)
    draw(pd.DataFrame(completed), 0)
示例#21
0
def fill_ii(df):
    '''
    Use IterativeImputer to fill null number
    ----------------------------------
    df: the pandas.dataframe going to fill missing value

    Returns a new DataFrame with df's index and column labels.
    '''
    # BUG FIX: DataFrame.as_matrix() was removed in pandas 1.0; .values
    # is the drop-in replacement.
    df_filled_ii = pd.DataFrame(IterativeImputer().fit_transform(df.values))
    df_filled_ii.columns = df.columns
    df_filled_ii.index = df.index

    return df_filled_ii
示例#22
0
def construct_low_rank_imputer(method, k):
    """Return a rank-k imputation solver for *method*.

    method: "SoftImpute" (max_rank=k), "KNN" (k neighbours) or 'II'
    (IterativeImputer with non-negative values; ignores k).

    Raises:
        NotImplementedError: for any other method name.
    """
    clf = None
    if method == "SoftImpute":
        clf = SoftImpute(max_rank=k, verbose=False)
    elif method == "KNN":
        clf = KNN(k=k, verbose=False)
    elif method == 'II':
        clf = IterativeImputer(min_value=0)
    else:
        # BUG FIX: ``raise ("Not implemented")`` raised a bare string,
        # which is itself a TypeError; raise a real exception type.
        raise NotImplementedError("Not implemented")
    return clf
示例#23
0
def main(p_miss=0.5, dataset="drive", mode="mcar", para=0.5, train=None, rand_seed=42):
    """Load a masked dataset, impute with IterativeImputer (MICE), and
    report/return the masked-entry MSE with the filled matrix."""
    np.random.seed(rand_seed)

    n, p, xmiss, xhat_0, mask, data_x, data_y = load_data(
        p_miss, dataset=dataset, mode=mode, para=para, train=train,
        rand_seed=rand_seed)

    x_filled = IterativeImputer().fit_transform(xmiss)
    mse = mse_own(x_filled, data_x, mask)
    print("MSE for MICE: " + str(mse))

    return x_filled, mse
示例#24
0
    def iterative_imputer(self, estimator, max_iter, tol, n_nearest_feature,
                          initial_strategy, imputation_order, skip_complete,
                          min_value, max_value, verbose, random_state):
        """Impute self.data in place with sklearn's IterativeImputer.

        *estimator* selects the per-feature regressor by name
        (BayesianRidge, DecisionTreeRegressor, ExtraTreesRegressor,
        KNeighborsRegressor, DecisionTreeClassifier); any other name
        leaves it None, so IterativeImputer uses its default. The
        remaining parameters are forwarded verbatim. The imputed matrix
        replaces self.data as a DataFrame with the stored feature names.
        """
        print("Iterative Imputer")  # typo fix: was "Interative"
        print(n_nearest_feature)
        my_estimator = None

        if estimator == 'BayesianRidge':
            my_estimator = BayesianRidge()
        if estimator == 'DecisionTreeRegressor':
            my_estimator = DecisionTreeRegressor()
        if estimator == 'ExtraTreesRegressor':
            my_estimator = ExtraTreesRegressor()
        if estimator == 'KNeighborsRegressor':
            my_estimator = KNeighborsRegressor()
        if estimator == 'DecisionTreeClassifier':
            # BUG FIX: the class itself was assigned (missing parentheses),
            # handing an uninstantiated class to IterativeImputer.
            my_estimator = DecisionTreeClassifier()

        imp = IterativeImputer(
            estimator=my_estimator,
            missing_values=np.nan,  # BUG FIX: np.NAN alias removed in NumPy 2.0
            # sample_posterior=sample_posterior,
            max_iter=max_iter,
            tol=tol,
            n_nearest_features=n_nearest_feature,
            initial_strategy=initial_strategy,
            imputation_order=imputation_order,
            skip_complete=skip_complete,
            min_value=min_value,
            max_value=max_value,
            verbose=verbose,
            random_state=random_state,
            # add_indicator=add_indicator
        )

        print("Iterative Imputer is created")
        self.data = imp.fit_transform(self.data)
        self.data = pd.DataFrame(self.data)
        self.data.columns = self.featuresName
        self.data = self.data.infer_objects()
示例#25
0
def Data_prep(df, flag):
    """One-hot encode the predictor columns of *df* and MICE-impute them.

    Drops the first (id) column, and additionally the last (target)
    column when ``flag != 1``. Returns the imputed DataFrame with the
    dummy-encoded column names.
    """
    # Removing Loan Id and Loan Status for One-Hot encoding and Imputation
    # ########################################################
    predictors = df.columns
    predictors = np.delete(predictors, 0)  # drop the leading id column
    if flag != 1:
        predictors = np.delete(predictors, -1)  # drop the trailing target
        print("train module flag 0")
        flag = 1

    # One-Hot Encoding
    # ########################################################
    # (No-op REPL-style expression statements -- predictors[1],
    # df[predictors], df_dummy.count(), .head() etc. -- were removed;
    # they computed values that were immediately discarded.)
    df_dummy = pd.get_dummies(df[predictors], dummy_na=True)
    newcols = df_dummy.columns

    # Data Imputations
    # ########################################################
    from fancyimpute import IterativeImputer
    df_imputed = IterativeImputer().fit_transform(df_dummy)
    df_imputed = pd.DataFrame(df_imputed, columns=newcols)

    return (df_imputed)
def mice_imputation(train, test):
    """Fit IterativeImputer on the training value channel and impute the
    test value channel.

    NOTE(review): assumes a 3-D layout (samples, channels, time) where
    channel 0 holds the values and channel 1 a missingness mask in which
    1.0 marks a missing sample -- confirm against the data pipeline.
    """

    data_mice_train = np.copy(train)
    data_mice_test = np.copy(test)

    # Blank out the entries flagged in the mask channel so the imputer
    # sees them as NaN.
    for ind in range(data_mice_train[:, 0, :].shape[0]):
        data_mice_train[ind, 0, :][np.argwhere(
            data_mice_train[ind, 1, :] == 1.0)] = np.nan

    for ind in range(data_mice_test[:, 0, :].shape[0]):
        data_mice_test[ind,
                       0, :][np.argwhere(data_mice_test[ind,
                                                        1, :] == 1.0)] = np.nan

    mice_impute = IterativeImputer()

    #check if all columns have values if not impute 0
    for col in range(data_mice_train[:, 0, :].shape[1]):
        if (np.all(np.isnan(data_mice_train[:, 0, :][:, col]))):
            data_mice_train[:, 0, :][:, col] = 0.0

    mice_impute.fit(data_mice_train[:, 0, :])
    return mice_impute.transform(data_mice_test[:, 0, :])
示例#27
0
def replace_mice(method):
    """Fill missing values of the (module-level) ``var`` column via
    averaged multiple imputation.

    Reads the CSV at the module-level ``path``, drops non-numeric
    columns, runs 10 posterior-sampling IterativeImputer passes, averages
    and rounds them, then writes |imputed value| back into the rows of a
    fresh copy of the CSV where ``var`` is null. Returns the patched
    DataFrame.

    NOTE(review): the *method* parameter is unused -- confirm callers.
    """
    train_df = pd.read_csv(path, parse_dates=True, encoding='utf-8')
    # MICE needs numeric input; drop object-dtype columns.
    for col in train_df.select_dtypes(include=['object']).columns:
        train_df = train_df.drop([col], axis=1)
    # Position of ``var`` among the remaining columns.
    for position, col in enumerate(train_df.columns):
        if col == var:
            inx = position
    n_imputations = 10
    XY_completed = []
    for i in range(n_imputations):
        imputer = IterativeImputer(n_iter=n_imputations,
                                   sample_posterior=True, random_state=i)
        # BUG FIX: DataFrame.as_matrix() was removed in pandas 1.0;
        # .values is the drop-in replacement.
        XY_completed.append(imputer.fit_transform(train_df.values))
    XY_completed = np.mean(XY_completed, 0)
    XY_completed = np.round(XY_completed)
    new_df = pd.read_csv(path, parse_dates=True, encoding='utf-8')
    data_null_len = len(new_df[new_df[var].isnull()])
    for i in range(data_null_len):
        xx = train_df[train_df[var].isnull()].index[i]
        # BUG FIX: chained assignment (new_df[var].loc[xx] = ...) can
        # write to a temporary; use a single .loc indexer instead.
        new_df.loc[xx, var] = abs(XY_completed[xx][inx])
    return new_df
示例#28
0
 def datainput(self):
     """Read self.file as CSV, optionally clean missing values (drop rows
     with non-numeric NAs, or impute numeric columns with
     IterativeImputer), and return a sequential 70/30 train/test split."""
     full_data = pd.read_csv(self.file, header=0)
     print('\nMissing values for each columns')
     print(full_data.isnull().sum()) # print # of mssing values
     numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
     df_n = full_data.select_dtypes(include=numerics)
     col_names = list(df_n.columns)
     df_c = full_data.select_dtypes(exclude=numerics)
     ipt = input('\nIs there any missing values? (y/n) : ')
     if ipt == 'y':
         ct = input('Is there any missing values which is not digit? (y/n) : ')
         if ct == 'y':
             # BUG FIX: dropna() is not in-place; the result was
             # discarded, so rows with missing values were silently kept.
             full_data = full_data.dropna()
         else:
             impute = IterativeImputer()
             df_n = impute.fit_transform(df_n) #process mssing values using imputer
             df_n = pd.DataFrame(df_n)
             df_n.columns = col_names
             full_data = pd.concat([df_n,df_c], axis=1)
         print('\nMissing values after processing')
         print(full_data.isnull().sum())  # print # of missing values
     train, test = train_test_split(full_data, test_size=0.3, shuffle = False) # train,test set split , default = shuffle
     return train, test
示例#29
0
def get_imputer(imputer_name, **add_params):
    """Look up a fancyimpute solver by (case-insensitive) name.

    Supported: 'knn', 'nnm', 'soft', 'iterative', 'biscaler'; extra
    keyword arguments are forwarded to the solver's constructor.

    Raises:
        ValueError: for an unknown name. (Previously this branch only
        printed a hint and implicitly returned None, deferring the
        failure to the caller as an opaque AttributeError.)
    """
    imputer_name = imputer_name.lower()

    if imputer_name == 'knn':
        return KNN(**add_params)
    elif imputer_name == 'nnm':  # redundant second .lower() removed
        return NuclearNormMinimization(**add_params)
    elif imputer_name == 'soft':
        return SoftImpute(**add_params)
    elif imputer_name == 'iterative':
        return IterativeImputer(**add_params)
    elif imputer_name == 'biscaler':
        return BiScaler(**add_params)
    else:
        raise ValueError(
            'Choose one of predefined imputers: knn, nnm, soft, '
            'iterative, biscaler (got {!r})'.format(imputer_name))
示例#30
0
def nan_imputing(df):
    """
    There is only one feature with nans: donor age at diagnosis.
    It is imputed with IterativeImputer over the one-hot-encoded frame.
    :param df: input DataFrame (left unmodified).
    :return: a copy of df with donor_age_at_diagnosis imputed (as int).
    """
    # BUG FIX: ``fancy_imputed = df`` aliased the argument, so the
    # imputed column was written back into the caller's DataFrame.
    fancy_imputed = df.copy()
    dummies = pd.get_dummies(df)
    imputed = pd.DataFrame(data=IterativeImputer().fit_transform(dummies),
                           columns=dummies.columns,
                           index=dummies.index)
    fancy_imputed.donor_age_at_diagnosis = imputed.donor_age_at_diagnosis
    # BUG FIX: the np.int alias was removed in NumPy 1.24; builtin int is
    # the documented replacement.
    fancy_imputed['donor_age_at_diagnosis'] = fancy_imputed[
        'donor_age_at_diagnosis'].astype(int)
    return fancy_imputed
示例#31
0
    def _handle_na(self, columns, fillna_strategy):
        """
        Handle the missing values for Numerical Features
        :param columns: columns/features name in the dataframe
        :param fillna_strategy: NA handling strategy; one of
            'mean'/'median'/'most_frequent'/'mode' (SimpleImputer),
            'new' (0/1 indicator column), 'end_distribution'
            (mean + 3*std fill), 'mice', 'knn', 'softimpute'
            (fancyimpute solvers).
        """
        if fillna_strategy in ['mean', 'median', 'most_frequent', 'mode']:
            # Change mode to most_frequent
            fillna_strategy = 'most_frequent' if fillna_strategy == 'mode' else fillna_strategy

            imp = SimpleImputer(missing_values=np.nan,
                                strategy=fillna_strategy)
            self.output_df[columns] = imp.fit_transform(self.df[columns])
        elif fillna_strategy == 'new':
            for column in columns:
                new_col_name = column + '_new'
                # BUG FIX: isnull().count() counts ALL entries (it is the
                # length of the boolean series), so this guard was always
                # true; .any() is the real "has missing values" test.
                if self.output_df[column].isnull().any():
                    self.output_df[new_col_name] = np.where(
                        self.output_df[column].isnull(), 1, 0)
        elif fillna_strategy == 'end_distribution':
            for column in columns:
                # BUG FIX: same always-true isnull().count() guard.
                if self.output_df[column].isnull().any():
                    extreme = self.df[column].mean(
                    ) + 3 * self.df[column].std()
                    self.output_df[column] = self.output_df[column].fillna(
                        extreme)
        elif fillna_strategy == 'mice':
            from fancyimpute import IterativeImputer
            imp = IterativeImputer()
            self.output_df[columns] = imp.fit_transform(
                self.output_df[columns])
        elif fillna_strategy == 'knn':
            from fancyimpute import KNN
            imp = KNN()
            self.output_df[columns] = imp.fit_transform(
                self.output_df[columns])
        elif fillna_strategy == 'softimpute':
            from fancyimpute import SoftImpute
            imp = SoftImpute()
            self.output_df[columns] = imp.fit_transform(
                self.output_df[columns])
示例#32
0
    def fit(self, dataset):
        """Train standard imputation model.

        Args:
          - dataset: incomplete dataset; only its ``static_feature``
            (when not None) is used for fitting.

        Raises:
          ValueError: if ``self.imputation_model_name`` is not one of
            'mice', 'missforest', 'knn'. (Previously an unknown name fell
            through and ``.fit`` crashed on a stale or None model with an
            opaque AttributeError.)
        """
        if dataset.static_feature is not None:
            # MICE
            if self.imputation_model_name == 'mice':
                self.imputation_model = IterativeImputer()
            # MissForest
            elif self.imputation_model_name == 'missforest':
                self.imputation_model = MissForest()
            # KNN
            elif self.imputation_model_name == 'knn':
                self.imputation_model = KNNImputer()
            else:
                raise ValueError('Unknown imputation model: {!r}'.format(
                    self.imputation_model_name))

            self.imputation_model.fit(dataset.static_feature)

        return