Example #1
import numpy as np
import pandas as pd
from fancyimpute import MICE

def MultipleImputation(dataset, features):
    '''
    Takes a dataset and a set of feature names that refer to features to be imputed in the dataset.
    Facilitates multiple imputation technique on missing data and returns the imputed dataset.
    
    dataset: dataset with missing values (dataframe)
    features: set with feature names specifying what features to be grouped for imputation (set or list)
    '''

    # make copy of original dataset to prevent changes in original dataset
    dataset_copy = dataset.copy()

    # convert deferred_income to positive values in order to allow log10 transformation
    if "deferred_income" in features:
        dataset_copy["deferred_income"] *= -1

    # do log10 transformation; +1 to transform 0 values
    data_log = np.log10(dataset_copy[list(features)] + 1)

    # restrict min value to 0 to avoid <0 imputed values
    # --> important when fitting imputation model with feature values close to 0
    data_filled = MICE(n_imputations=500, verbose=False,
                       min_value=0).complete(np.array(data_log))
    data_filled = pd.DataFrame(data_filled)
    data_filled.index = dataset.index
    data_filled.columns = data_log.columns

    # transform back to linear scale; subtract 1 to obtain original non-imputed values
    data_filled = 10**data_filled - 1

    # convert deferred_income back to negative values (original values)
    if "deferred_income" in features:
        data_filled["deferred_income"] *= -1

    return data_filled
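A minimal usage sketch for the helper above (reusing the imports at the top); the toy DataFrame and its column names are hypothetical:

# hypothetical toy data: two skewed financial features with gaps
df = pd.DataFrame({
    "salary": [50000.0, np.nan, 120000.0, 75000.0, 64000.0, 91000.0],
    "deferred_income": [-1000.0, -2500.0, np.nan, -800.0, -1200.0, np.nan],
})
imputed = MultipleImputation(df, {"salary", "deferred_income"})
print(imputed)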
Example #2
def estimate_by_mice(df):
    df_estimated_variables = df.copy()
    random.seed(14)
    mice = MICE()  # model=RandomForestClassifier(n_estimators=100))
    result = mice.complete(np.asarray(df.values, dtype=float))
    df_estimated_variables.loc[:, df.columns] = result
    return df_estimated_variables
Example #3
def test_mice_column_with_low_rank_random_matrix():
    mice = MICE(n_imputations=100, impute_type='col')
    XY_completed = mice.complete(XY_incomplete)
    _, missing_mae = reconstruction_error(
        XY,
        XY_completed,
        missing_mask,
        name="MICE (impute_type=col)")
    assert missing_mae < 0.1, "Error too high with column method!"
Example #4
def test_mice_row_with_low_rank_random_matrix_approximate():
    mice = MICE(n_imputations=100, impute_type='pmm', n_nearest_columns=5)
    XY_completed = mice.complete(XY_incomplete)
    _, missing_mae = reconstruction_error(
        XY,
        XY_completed,
        missing_mask,
        name="MICE (impute_type=row)")
    assert missing_mae < 0.1, "Error too high with approximate PMM method!"
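Both tests above rely on module-level fixtures (XY, XY_incomplete, missing_mask) and a reconstruction_error helper defined elsewhere. A sketch of how such fixtures are commonly built for a low-rank random matrix; the shapes, rank, and missing rate here are assumptions, not the library's actual test setup:

import numpy as np

rng = np.random.RandomState(0)
# rank-1 matrix (outer product of two random vectors), so it is easy to recover
XY = np.dot(rng.randn(200, 1), rng.randn(1, 10))
# hide roughly 25% of the entries at random
missing_mask = rng.rand(*XY.shape) < 0.25
XY_incomplete = XY.copy()
XY_incomplete[missing_mask] = np.nan

def reconstruction_error(X_true, X_filled, mask, name=""):
    # MAE over all entries and over just the originally missing entries
    all_mae = np.abs(X_true - X_filled).mean()
    missing_mae = np.abs((X_true - X_filled)[mask]).mean()
    print(name, "- missing MAE:", missing_mae)
    return all_mae, missing_mae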
Example #5
def get_predict(self, flag, in_data):
    output = in_data.copy()
    output.shape = (utils.M_NUM, 1)
    # mark unmeasured entries as missing
    output[~flag] = np.nan
    solver = MICE()
    # stack the partly missing column onto the stored measurements
    tmp = self.t_measure.copy()
    tmp = np.column_stack((tmp, output)).transpose()
    tmp = solver.complete(tmp)
    # the last row holds the completed new measurement
    output = np.array(tmp[-1, :]).reshape(utils.M_NUM, 1)
    return output
Example #6
    def Impute_the_data(self,
                        imputer="MEANMEDIAN",
                        x_data=[],
                        y_data=[],
                        con_cols=[],
                        cat_cols=[],
                        misper=[]):

        import pandas as pd
        from imblearn.under_sampling import RandomUnderSampler
        import warnings
        warnings.simplefilter('ignore', DeprecationWarning)

        if imputer == "MICE":

            # savename for the imputed file for reuse

            x_filtered_savename = './Data/x_filtered_' + imputer + '_' + str(
                misper) + '.csv'

            # try to see if the old file is saved and use it

            try:
                x_filtered = pd.read_csv(x_filtered_savename)
                x_filtered = pd.DataFrame(x_filtered, columns=x_data.columns)
                print("Loaded from pre-saved file")

            # if the old file does not exist, process the data and save a new file

            except FileNotFoundError:
                print("Could not find the saved file. Generating a new one with MICE imputation.\n")

                from fancyimpute import MICE

                impute = MICE()
                x_filtered = x_data
                if int(misper) > 0:
                    x_filtered = impute.complete(x_filtered)
                x_filtered = pd.DataFrame(x_filtered, columns=x_data.columns)
                x_filtered.to_csv(x_filtered_savename, index=False)

        ros = RandomUnderSampler()

        X_resampled, y_resampled = ros.fit_sample(x_filtered.values,
                                                  y_data.values.ravel())

        train_x = pd.DataFrame(X_resampled, columns=x_data.columns)
        train_y = pd.DataFrame(y_resampled, columns=y_data.columns)

        return {
            'train_y': train_y,
            'train_x': train_x,
            'X_resampled': X_resampled,
            'y_resampled': y_resampled
        }
Example #7
def get_data(filename, from_pickle=False):
    '''
    Input: filename (csv if from_pickle=False,
                     pickle file if from_pickle=True)

    Output: scaled X, y
    '''
    if from_pickle:
        df = pd.read_csv('train.csv')
        df['last_trip_date'] = pd.to_datetime(df['last_trip_date'])
        df['active'] = (df['last_trip_date'].dt.month >= 6).astype(int)

        y = df.pop('active').values
        npz = np.load(filename + '.npz')
        X_filled = npz[filename]
        return X_filled, y

    else:
        df = pd.read_csv(filename)
        df['last_trip_date'] = pd.to_datetime(df['last_trip_date'])
        df['signup_date'] = pd.to_datetime(df['signup_date'])
        df['active'] = (df['last_trip_date'].dt.month >= 6).astype(int)
        df = pd.get_dummies(df, columns=['city', 'phone'])
        df.drop(['last_trip_date', 'signup_date'], axis=1, inplace=True)

        y = df.pop('active').values
        X = df.values.astype(float)

        # impute first, then scale the completed matrix
        X_filled = MICE(n_imputations=6690).complete(X)
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X_filled)
        return X_scaled, y
Example #8
def select_feature():
    complete_cols = ["f" + str(j) for j in range(1, 20) if j != 5]
    df_train = pd.read_csv("../transdata/train_impute_v1.csv", header=0, index_col=None)
    print(df_train.describe())
    # Use 3 nearest rows which have a feature to fill in each row's missing features
    # df_train =fi.KNN(k=2).complete(df_train)
    # df_train = knn_impute_few_observed(df_train,k=3,missing_mask=df_train.shape)

    train_cols = list(set(df_train.columns) - {"label"} - set(complete_cols))
    df_cols = df_train[complete_cols]
    for col in train_cols:
        print(col)
        # impute each incomplete column together with the complete columns
        impute_col = complete_cols + [col]
        df_col = df_train[impute_col]
        da_col = MICE().complete(df_col.values)
        df_cols[col] = pd.Series(da_col[:, -1])
    # df_train = knnimput.KNN(k=1).complete(df_train.values)
    # df_train = pd.DataFrame(df_train,columns=train_cols)
    df_cols.to_csv("../transdata/train_imputed_v2.csv", header=True, index=False)
    print("imputation over!")
    del df_train
    gc.collect()
    df_test = pd.read_csv("../transdata/test_impute_v1.csv", header=0, index_col=None).astype(np.float16)
    print(df_test.describe())
    # df_test.drop(labels=["date"], axis=1, inplace=True)
    # df_test = fi.KNN(k=3).complete(df_test)
    df_test.to_csv("../transdata/test_imputed_v2.csv", header=True, index=False)
    return
Example #9
def impute_missing_values(numerical_features):
    imputed_numerical_features = pd.DataFrame(
        MICE().complete(numerical_features))
    imputed_numerical_features.columns = numerical_features.columns
    imputed_numerical_features.set_index(numerical_features.index,
                                         inplace=True)

    return imputed_numerical_features
Example #10
def smart_impute(features, features_to_impute=[]):
    g = features.columns.to_series().groupby(features.dtypes).groups
    #print(g)
    if len(features_to_impute) == 0:
        features_to_impute = features.select_dtypes(include=['float64'])
    else:
        features_to_impute = features[features_to_impute]
    imputed_features = pd.DataFrame(MICE().complete(features_to_impute),
                                    index=features_to_impute.index.values,
                                    columns=features_to_impute.columns.values)
    return pd.concat([imputed_features,
                      features.drop(features_to_impute.columns, axis=1)], axis=1)
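A short usage sketch for smart_impute, assuming a mixed-dtype DataFrame; the column names are hypothetical. Non-float columns are passed through untouched:

import numpy as np
import pandas as pd

features = pd.DataFrame({
    "age": [23.0, np.nan, 41.0, 35.0],
    "income": [np.nan, 52000.0, 61000.0, 48000.0],
    "city": ["a", "b", "a", "c"],  # object dtype: passed through, not imputed
})
completed = smart_impute(features)
print(completed)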
Example #11
def __init__(self, classifier, impute=True, impute_mode=MICE()):
    """
    INPUT:
    - classifier = Model classifier object
    - impute = Bool, runs imputation
    - impute_mode = fancyimpute solver used when impute is True
    """
    self.clf = classifier
    self.solver = impute_mode
    self.impute = impute
Example #12
def imputeAndCalculate(train_unimp, test_unimp, obj):
    from copy import copy
    from fancyimpute import MICE
    train = copy(train_unimp)
    test = copy(test_unimp)
    # Columns 2:12 are the numerical features to be imputed with MICE.
    # (An earlier per-column mean-imputation version is kept for reference:)
    #    for i in range(2,11):
    #        train[:,i][np.isnan(train[:,i])] = np.nanmean(train[:,i])
    #        test[:,i][np.isnan(test[:,i])] = np.nanmean(train[:,i])

    train[:, 2:12] = MICE().complete(train[:, 2:12])
    test[:, 2:12] = MICE().complete(test[:, 2:12])

    # Calculated Features
    train = updateCalculatedFeatures(train)
    test = updateCalculatedFeatures(test)
    obj.X_tr_imp.append(train)
    obj.X_ts_imp.append(test)
Example #13
def estimate_by_mice(df, _iscategorical, group):
    df_estimated_var = df.copy()
    random.seed(129)
    mice = MICE()  # model=RandomForestClassifier(n_estimators=100))
    array_X = np.asarray(df.values, dtype=float)
    # note: complete() is called with extra arguments here, which assumes a
    # customized MICE solver rather than the stock fancyimpute API
    if array_X.ndim < 2:
        array_X = array_X.reshape(array_X.shape[0], -1)
        res = mice.complete(array_X, _iscategorical, group)
        df_estimated_var.loc[:, :] = res
    else:
        res = mice.complete(array_X, _iscategorical, group)
    if group == 3:
        df_estimated_var['restecg'] = res
    elif group == 4:
        df_estimated_var['slope'] = res
    else:
        df_estimated_var.loc[:, df.columns] = res
    return df_estimated_var
Example #14
def main6(path: str) -> None:
    """Impute a CSV file with MICE and drop any columns that still contain NaNs.

    Args:
        path (str): path to the input CSV file
    """
    fi: DataFrame = pd.read_csv(path, low_memory=False)
    fi = pd.DataFrame(MICE().fit_transform(fi))
    print(fi.shape)
    fi = fi.dropna(axis=1)
    print(fi.shape)
Example #15
    def impute_missing_values(self, value_set, strategy):
        """
        Fill missing values in the raw data matrix.
        :param value_set: the raw data matrix to process
        :param strategy: 1: drop missing values; 2: fill with the most frequent value;
                         3: fill via attribute correlations; 4: fill via data-object similarity
        :return: the filled data matrix, as a list of columns: (col1, col2, ...)
        """
        # strategy 1: drop samples that contain missing values
        if strategy == 1:
            new_value_set = []
            for data_sample in value_set:
                new_data_sample = []
                if None in data_sample or 'NA' in data_sample:
                    continue
                else:
                    for data in data_sample:
                        new_data_sample.append(float(data))
                new_value_set.append(new_data_sample)
            value_array = np.array(new_value_set)

        elif strategy in [2, 3, 4]:
            # convert value_set to a NumPy matrix, replacing missing entries with np.nan
            new_value_set = []
            for data_sample in value_set:
                new_data_sample = []
                for data in data_sample:
                    if data and data != 'NA':
                        new_data_sample.append(float(data))
                    else:
                        new_data_sample.append(np.nan)
                new_value_set.append(new_data_sample)
            value_array = np.array(new_value_set)

            # strategy 2: fill with the most frequent value; since all attributes are
            # probability-like numerics, the mean is used instead
            if strategy == 2:
                value_array = SimpleFill(
                    fill_method="mean").complete(value_array)

            # strategy 3: fill using attribute correlations, with the three most correlated columns
            elif strategy == 3:
                value_array = MICE(n_nearest_columns=3).complete(value_array)

            # strategy 4: fill using data-object similarity, with the 10 most similar rows
            elif strategy == 4:
                for batch in range(len(value_array) // 1000 + 1):
                    value_array[batch*1000 : min(batch*1000+1000, len(value_array))] = \
                        KNN(k = 10).complete(value_array[batch*1000 : min(batch*1000+1000, len(value_array))])
        else:
            raise ArgInputError("The strategy should be in (1,2,3,4)!")

        # split the filled matrix into a list of n feature columns
        feature_col_list = []
        for i in range(len(value_array[0])):
            feature_col_list.append(value_array[:, i].tolist())
        return feature_col_list
Example #16
def impute_data(X):
    """Impute the data using Matrix Factorization

    Parameters
    ----------
    X: np.array
       Matrix of predictors

    Returns
    -------
    impute_data_filled: np.array
       X, with missing values filled
    """
    #impute_data = X
    #data_index = X.index
    #data_cols = df.columns

    #solver = MatrixFactorization(verbose=False)
    solver = MICE()
    impute_data = solver.complete(X)
    #impute_df = pd.DataFrame(impute_data_filled, index=data_index, columns=data_cols)
    return impute_data
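A usage sketch, assuming X is a NumPy float array with np.nan marking the gaps:

import numpy as np

X = np.array([[1.0, 2.0, np.nan],
              [2.0, np.nan, 6.0],
              [3.0, 6.0, 9.0],
              [np.nan, 8.0, 12.0]])
X_filled = impute_data(X)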
Example #17
def test_create_imputed_datasets_two_alleles():
    dataset = Dataset.from_nested_dictionary({
        "HLA-A*02:01": {
            "A" * 9: 20.0,
            "C" * 9: 40000.0,
        },
        "HLA-A*02:05": {
            "S" * 9: 500.0,
            "A" * 9: 25.0,
        },
    })
    imputed_dataset = dataset.impute_missing_values(MICE(n_imputations=25))
    eq_(imputed_dataset.unique_alleles(), {"HLA-A*02:01", "HLA-A*02:05"})
    expected_peptides = {"A" * 9, "C" * 9, "S" * 9}
    for allele_name, allele_data in imputed_dataset.groupby_allele():
        eq_(set(allele_data.peptides), expected_peptides)
Example #18
def calculate_imputation_error(feature, numerical_data, numerical_features):
    numerical_data = numerical_data.copy(deep=True)
    feature_data = numerical_data[feature][0:200].copy().reset_index(drop=True)
    # use .loc (not chained indexing) so the assignment actually modifies the frame
    numerical_data.loc[numerical_data.index[0:200], feature] = np.nan
    completed_numerical_data = pd.DataFrame(
        MICE(verbose=False).complete(numerical_data))
    completed_numerical_data.columns = numerical_features
    imputed_feature = completed_numerical_data[feature][0:200]
    imputed_data = pd.DataFrame([feature_data, imputed_feature]).T
    imputed_data.columns = ['Real value', 'Imputed value']
    imputed_data['Imputation error (%)'] = np.abs(
        (imputed_data['Real value'] - imputed_data['Imputed value']) /
        imputed_data['Real value']) * 100
    imputation_error = np.mean(imputed_data['Imputation error (%)'])
    print('Imputation error for', feature, ': ', imputation_error)

    return [feature, imputation_error]
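A usage sketch for the error check above; the synthetic 500-row DataFrame (offset away from zero so the percentage error stays finite) is an assumption:

import numpy as np
import pandas as pd

rng = np.random.RandomState(42)
numerical_data = pd.DataFrame(rng.randn(500, 3) + 10, columns=["f1", "f2", "f3"])
feature, error = calculate_imputation_error("f1", numerical_data,
                                            numerical_data.columns)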
Example #19
def nan_imputing(df):
    """
    There is only one feature with NaNs: donor age at diagnosis.
    We impute it using the MICE strategy.
    :param df:
    :return:
    """
    # impute missing data with MICE (copy so the caller's frame is not mutated)
    fancy_imputed = df.copy()
    dummies = pd.get_dummies(df)
    imputed = pd.DataFrame(data=MICE(verbose=False).complete(dummies),
                           columns=dummies.columns,
                           index=dummies.index)
    fancy_imputed.donor_age_at_diagnosis = imputed.donor_age_at_diagnosis
    fancy_imputed['donor_age_at_diagnosis'] = fancy_imputed[
        'donor_age_at_diagnosis'].astype(int)
    return fancy_imputed
Example #20
def complex_imputation(df, method='mice', neighbors=3):
    """
	Inputs:
	df -- dataframe of incomplete data
	method -- method of imputation
		- 'knn': Imputes using K Nearest Neighbors of completed rows
		- 'soft_impute': Imputes using iterative soft thresholding of SVD decompositions
		- 'mice': Imputes using Multiple Imputation by Chained Equations method
		- 'nuclear_nm': Imputation using Exact Matrix Completion via Convex Optimization method
		- 'matrix_factorization': Imputes by factorization of matrix in low-rank U and V
								  with L1 sparsity on U elements and L2 sparsity on V elements
		- 'iterative_svd': Imputes based on iterative low-rank SVD decomposition
	neighbors -- parameter for KNN imputation
	
	Output:
	Completed matrix
	"""
    # Create matrix of features
    X_incomplete = df.values
    # Normalize matrix by std and mean (0 mean, 1 variance)
    biscaler = BiScaler()
    X_incomplete_normalized = biscaler.fit_transform(X_incomplete)

    if method == 'knn':
        X_complete = KNN(neighbors).complete(X_incomplete)
        return fill_values(df, X_complete)

    if method == 'soft_impute':
        X_complete_normalized = SoftImpute().complete(X_incomplete_normalized)
        # invert with the fitted scaler (a fresh BiScaler would not be fitted)
        X_complete = biscaler.inverse_transform(X_complete_normalized)
        return fill_values(df, X_complete)
        return fill_values(df, X_complete)

    if method == 'mice':
        X_complete = MICE().complete(X_incomplete)
        return fill_values(df, X_complete)

    if method == 'nuclear_nm':
        X_complete = NuclearNormMinimization().complete(X_incomplete)
        return fill_values(df, X_complete)

    if method == 'matrix_factorization':
        X_complete = MatrixFactorization().complete(X_incomplete)
        return fill_values(df, X_complete)

    if method == 'iterative_svd':
        X_complete = IterativeSVD().complete(X_incomplete)
        return fill_values(df, X_complete)

    raise ValueError("Unknown imputation method: %s" % method)
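A usage sketch, assuming fill_values(df, X_complete), defined elsewhere in the original module, writes the completed matrix back into a DataFrame:

import numpy as np
import pandas as pd

df = pd.DataFrame({
    "a": [1.0, np.nan, 3.0, 4.0],
    "b": [2.0, 4.0, np.nan, 8.0],
})
completed = complex_imputation(df, method='mice')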
Example #21
def Do_impute(df):
    print('Imputing ...')
    df = pd.DataFrame(df)
    tmp_df = df[[
        'image_top_1', 'param_2', 'param_3', 'city', 'region', 'param_1',
        'category_name', 'parent_category_name', 'user_type'
    ]]
    tmp_df = tmp_df.replace(0, np.nan)
    tmp_df = pd.DataFrame(data=MICE().complete(tmp_df),
                          columns=tmp_df.columns,
                          index=tmp_df.index)
    df.drop([
        'image_top_1', 'param_2', 'param_3', 'city', 'region', 'param_1',
        'category_name', 'parent_category_name', 'user_type'
    ], axis=1, inplace=True)
    # join returns a new frame; return it (join is not in-place)
    return df.join(tmp_df)
Example #22
def clean_fill_nulls(df):
    active_date = date(2014, 6, 1)
    df['last_trip_date'] = pd.to_datetime(df['last_trip_date'])
    df['signup_date'] = pd.to_datetime(df['signup_date'])
    #df['iPhone'] = (df['phone'] == 'iPhone').astype(int)
    df['luxury_car_user'] = df['luxury_car_user'].astype(int)
    df['active'] = (df['last_trip_date'] > active_date).astype(int)
    df.drop(['signup_date', 'last_trip_date'], axis=1, inplace=True)
    df = pd.get_dummies(df, columns=['city', 'phone'])

    #y = df.pop('active').values
    array = df.values.astype(float)
    array_filled_mice = MICE(n_imputations=6700).complete(array)
    scaler = StandardScaler()
    array_filled_mice = scaler.fit_transform(array_filled_mice)
    columns = [
        'avg_dist', 'avg_rating_by_driver', 'avg_rating_of_driver',
        'avg_surge', 'surge_pct', 'trips_in_first_30_days', 'luxury_car_user',
        'weekday_pct', 'active', 'city_Astapor', "city_King's Landing",
        'city_Winterfell', 'phone_Android', 'phone_iPhone'
    ]
    return pd.DataFrame(array_filled_mice, columns=columns)
Example #23
def impute(city, methods="KNN"):
    filename = base_path_2 + city + "_airquality_processing.csv"
    if city == 'bj':
        attr_need = ["station_id_num", "PM25_Concentration", "PM10_Concentration", "O3_Concentration", "time_week",
                     "time_month", "time_day", "time_hour", "CO_Concentration", "NO2_Concentration",
                     "SO2_Concentration"]
    else:
        attr_need = ["station_id_num", "PM25_Concentration", "PM10_Concentration", "time_week",
                     "time_month", "time_day", "time_hour", "NO2_Concentration"]
    df = pd.read_csv(filename, sep=',')
    df['time'] = pd.to_datetime(df['time'])
    df.index = df['time']
    df[df < 0] = np.nan
    station_groups = df.groupby(['station_id'])
    stations = load_station()
    city_station = stations[city]
    stations_group = {}
    for station, group in station_groups:
        df1 = group
        df1['station_id_num'] = df1.apply(
            lambda row: float(city_station[str(row.station_id)]['station_num_id']), axis=1)
        XY_incomplete = df1[attr_need].values
        # print(XY_incomplete)
        if methods == "KNN":
            XY_completed = KNN(k=5).complete(XY_incomplete)
        # print(XY_completed)
        if methods == "MICE":
            # print(XY_incomplete)
            try:
                XY_completed = MICE(n_imputations=100).complete(XY_incomplete)
            except Exception:
                # MICE can fail on a degenerate station group; skip it
                continue
        # print(XY_completed)
        group.loc[:, attr_need] = XY_completed
        stations_group[station] = group
    import pickle
    with open(base_path_3 + city + '_data_history_' + methods + '.pkl', 'wb') as f1:
        pickle.dump(stations_group, f1, True)
Example #24
def Do_impute(df):
    print('Imputing ...')
    #        df = pd.DataFrame(df)
    #       'image_top_1', 'param_2', 'param_3',
    tmp_df = df[[
        "param_2", "city", "parent_category_name", "user_type",
        "category_name", "image_top_1", "param_1", "param_3",
        "image"
    ]]
    #        tmp_df = tmp_df.replace(-1234, np.nan)
    #        cols = ["image_top_1", "param_1", "param_2", "param_3", 'param_1', 'category_name', 'parent_category_name','user_type']
    tmp_df = pd.DataFrame(data=MICE().complete(tmp_df),
                          columns=tmp_df.columns,
                          index=tmp_df.index)
    #        tmp_df[cols].apply(pd.to_numeric, errors='coerce', axis=1)
    df.drop([
        "param_2", "city", "parent_category_name", "user_type",
        "category_name", "image_top_1", "param_1", "param_3",
        "image"
    ], axis=1, inplace=True)
    # join returns a new frame; return it (join is not in-place)
    return df.join(tmp_df)
Example #25
def treat_missing_valuesMICE(X):
    X_filled = MICE(init_fill_method='median', n_imputations=10,
                    n_burn_in=5).complete(X)
    return X_filled
Example #26
# Import the libraries
import numpy as np
import pandas as pd

# Import data; the file marks missing values with 1e99, so replace them with NaN
dataset = pd.read_csv('MissingData1.csv', sep=",", header=None)
dataset = dataset.replace(1e99, np.nan)

#MICE - Multiple Imputation by Chained Equations
from fancyimpute import MICE
solver = MICE()
Imputed_dataframe = solver.complete(dataset)

#write to output file
np.savetxt('induriMissingResult1.txt',
           Imputed_dataframe,
           delimiter=',',
           newline='\n')
Example #27
def test(self, flag, data):
    # nothing to impute when every measurement is present
    if (flag == 1).sum() == self.data.m_num:
        return data
    else:
        solver = MICE()
        return self.imputate(flag, data, solver)
Example #28
rt['Box Office'] = sf(rt['Box Office'])
#rt['audience - User Ratings'] = sf(rt['audience - User Ratings'])
#rt['audience - Average Rating'] = sf(rt['audience - Average Rating'])
rt['actor1_star'] = sf(rt['actor1_star'])
rt['actor2_star'] = sf(rt['actor2_star'])
rt['actor3_star'] = sf(rt['actor3_star'])
rt['length'] = sf(rt['length'])
rt['director1_star'] = sf(rt['director1_star'])
rt['actor3_bignominations'] = sf(rt['actor3_bignominations'])

movie = rt['movie_id']
movie = pd.DataFrame(movie)

#rt = rt.replace(np.nan, '')
rating = rt['diff_rating']
rating = pd.DataFrame(rating)

rt = rt.drop(['id', 'diff_rating', 'movie_id'], axis=1)
rt = rt.astype(float)

X_fill = MICE().complete(rt.values)  # .as_matrix() was removed from pandas
X_fill = pd.DataFrame(X_fill)
X_fill.columns = rt.columns

# concatenate rating and the categorical frames back on (only once)
X_fill = pd.concat(
    [X_fill, rating, studio, movie, actor1, actor2, actor3, director1], axis=1)

X_fill.to_csv('rotten_impute.csv', encoding='utf-8')
#dum = pd.concat([dum, ])
Example #29
test['TARGET'] = test.TARGET.apply(lambda x: 1 if x >= th else 0)

# check: the test rows line up with the null-TARGET rows of df
print(df[df.TARGET.isnull()].SK_ID_CURR.tolist() == test.SK_ID_CURR.tolist())

# new TARGET field
TARGET = df[df.TARGET.notnull()].TARGET.tolist() + test.TARGET.tolist()
# check
print(len(df[df.TARGET == 0]) + len(df[df.TARGET == 1]) == len(df))

#---------
# setting
#---------
log_dir = '../log_mice_inputation'
init_logging(log_dir)
X_missing = df[df.TARGET == 1]
X_missing = X_missing.drop(['TARGET'], axis=1)
#-------------------
# core algorithm: input should be array
#-------------------
from fancyimpute import MICE  # for imputing

logging.info('visit_sequence: {}'.format('monotone'))
logging.info('impute_type: {}'.format('col'))
logging.info('init_fill_method: {}'.format('mean'))
logging.info('target == 1')
X_filled1 = MICE(visit_sequence='monotone',
                 impute_type='col',
                 init_fill_method='mean').complete(X_missing.values)
Example #30
def complete(self, data):
    results = []
    for i in range(self.imputations):
        results.append(
            MICE(n_imputations=1, verbose=self.verbose).complete(data))
    return results
Example #31
# convert 'None' strings to NaN using replace
production_miss_num.replace('None', np.nan, inplace=True)

# convert dtypes for the mice function (mice builds regression models internally)
production_miss_num['thickness'] = \
  production_miss_num['thickness'].astype('float64')
production_miss_num['type'] = \
  production_miss_num['type'].astype('category')
production_miss_num['fault_flg'] = \
  production_miss_num['fault_flg'].astype('category')

# create dummy variables (covered in detail in Chapter 9, "Categorical types")
production_dummy_flg = pd.get_dummies(
  production_miss_num[['type', 'fault_flg']], drop_first=True)

# run multiple imputation with the mice function, specifying PMM
# n_imputations is the number of imputed datasets to draw
# n_burn_in is the number of iterations to run before values are drawn
mice = MICE(n_imputations=10, n_burn_in=50, impute_type='pmm')

# TensorFlow is used internally
production_mice = mice.multiple_imputations(
  # concatenate the numeric columns with the dummy variables
  pd.concat([production_miss_num[['length', 'thickness']],
             production_dummy_flg], axis=1)
)

# the imputed values are stored below
production_mice[0]
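multiple_imputations returns several draws rather than one completed matrix. A pooling sketch, assuming (as in older fancyimpute releases) the return value is a pair of (list of imputed-value arrays, missing mask); averaging the draws gives a single point estimate:

import numpy as np
import pandas as pd

# assumed return shape: (list of per-draw imputed values, boolean missing mask)
imputed_arrays, missing_mask = production_mice
# pool the 10 draws by averaging (Rubin's rule for a point estimate of the mean)
pooled_values = np.mean(imputed_arrays, axis=0)
# write the pooled values back into the matrix that was imputed
X = pd.concat([production_miss_num[['length', 'thickness']],
               production_dummy_flg], axis=1).values
X[missing_mask] = pooled_values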