Пример #1
0
    def datawig_imputation(self):
        """Impute missing feature values with one datawig model per column.

        Every column of ``self.X_train`` that contains missing values gets its
        own ``datawig.SimpleImputer``; the columns that are complete in the
        training frame are used as predictors.  Only cells that are actually
        missing are overwritten, in both the train and the test frame.

        The prediction is read from the ``<col>_imputed`` column by name.
        The previous code sliced prediction frames by position, which for
        categorical targets dropped the imputation columns and handed back
        the frame with its missing values still in place.

        :return: tuple ``(X_train, X_test, external_data)`` of copies of the
            corresponding attributes; ``external_data`` is not modified.
        """
        X_train = self.X_train.copy()
        X_test = self.X_test.copy()
        external_data = self.external_data.copy()

        has_missing = X_train.isnull().any()
        cols_no_missings = X_train.columns[~has_missing].tolist()
        cols_missings = X_train.columns[has_missing].tolist()

        for col in cols_missings:
            imputer = datawig.SimpleImputer(input_columns=cols_no_missings,
                                            output_column=col,
                                            output_path='imputer_model')
            imputer.fit(train_df=X_train, num_epochs=10)
            imputed_col = col + '_imputed'
            for frame in (X_train, X_test):
                mask = frame[col].isnull()
                if not mask.any():
                    # Nothing to fill in this frame; skip the prediction.
                    continue
                predictions = imputer.predict(frame.copy())[imputed_col]
                frame.loc[mask, col] = predictions[mask]
        return X_train, X_test, external_data
Пример #2
0
def secondImpute(seshat):
    """Impute every imputable variable that is not one of the CCs.

    For each such variable, rows with a known value form the training set
    and rows with a missing value are predicted.  A saved model is loaded
    when ``modelExists`` says one is available, otherwise a new one is
    fitted.  ``seshat`` is updated one column at a time and returned.
    """
    # The CCs are handled elsewhere, so only the remaining variables are done.
    remaining = [name for name in IMPUTABLE_VARS if name not in CCs]
    for predictVar in tqdm(remaining):
        print("Imputing: {}".format(predictVar))
        candidates = seshat[IMPUTABLE_VARS]
        isMissing = candidates[predictVar].isnull()
        # Rows with a known target value are used for training.
        known = candidates[~isMissing]
        # Every row is known: nothing left to impute for this variable.
        if known.shape[0] == seshat.shape[0]:
            continue
        # The rest are the rows to predict.
        unknown = candidates[isMissing]

        modelPath = 'model/{}_imputer'.format(predictVar.replace('/', ''))
        if not modelExists(predictVar):
            imputer = datawig.SimpleImputer(
                input_columns=lDel(IMPUTABLE_VARS, predictVar),
                output_column=predictVar,
                output_path=modelPath)
            imputer.fit(train_df=known, num_epochs=1000)
        else:
            imputer = datawig.SimpleImputer.load(modelPath)
            imputer.load_hpo_model(hpo_name=0)

        imputedValues = imputer.predict(unknown)['{}_imputed'.format(
            predictVar)]
        # Stitch observed and imputed values back into the original row order.
        observed = seshat[predictVar].dropna()
        merged = pd.concat([observed, imputedValues])
        seshat[predictVar] = merged.reindex_like(seshat[predictVar])
    return seshat
Пример #3
0
def imputeCCs(seshat):
    """Impute the missing values of every variable listed in ``CCs``.

    The training set comes from ``getCCTrainSet`` and the model columns from
    ``ccVars``.  A saved model is loaded when ``modelExists`` reports one;
    otherwise a new model is fitted with hyper-parameter optimisation.
    ``seshat`` is updated column by column and returned.
    """
    trainSet = getCCTrainSet(seshat)
    modelVars = ccVars(seshat)
    for predictVar in CCs:
        modelDir = 'model/{}_imputer'.format(predictVar)
        imputedName = '{}_imputed'.format(predictVar)

        # Only rows whose target value is missing need a prediction.
        subset = seshat[modelVars]
        toPredict = subset[subset[predictVar].isnull()]

        if not modelExists(predictVar):
            imputer = datawig.SimpleImputer(
                input_columns=lDel(modelVars, predictVar),
                output_column=predictVar,
                output_path=modelDir)
            imputer.fit_hpo(train_df=trainSet,
                            num_epochs=1000,
                            user_defined_scores=[(p2Score, 'p2_prediction')])
        else:
            imputer = datawig.SimpleImputer.load(modelDir)
            imputer.load_hpo_model(hpo_name=0)

        pred = imputer.predict(toPredict)[imputedName]
        # Stitch observed and imputed values back into the original row order.
        observed = seshat[predictVar].dropna()
        seshat[predictVar] = pd.concat([observed, pred]).reindex_like(
            seshat[predictVar])
    return seshat
Пример #4
0
def ngram(df):
    """Fit a datawig imputer for 'petal length' and predict its missing rows.

    The model is trained on 85% of the rows that have no missing value at
    all.  The rows of ``df`` whose 'petal length' is missing are then passed
    to the imputer, and the resulting frame is returned with an extra
    'actual' column looked up from ``df`` by index.
    """
    target = 'petal length'

    complete_rows = df.dropna()
    df_train, df_test = train_test_split(complete_rows,
                                         test_size=0.15,
                                         random_state=RANDOM_SEED)

    imputer = datawig.SimpleImputer(
        # column(s) carrying information about the column to impute
        input_columns=['sepal length', 'sepal width', 'petal width', 'class'],
        # the column to impute values for
        output_column=target,
    )

    # Hyper-parameter optimisation for numerical data
    imputer.fit_hpo(
        train_df=df_train,
        num_epochs=10,
        learning_rate_candidates=[1e-3, 1e-4],
        final_fc_hidden_units=[[100]],
    )
    # NOTE(review): fitting again right after fit_hpo presumably retrains
    # with default settings; confirm this second fit is intended.
    imputer.fit(train_df=df_train)

    # Predict only the rows where the target is missing
    rows_to_impute = df[df[target].isnull()]
    imputed_mv = imputer.predict(rows_to_impute)

    # Look the original values up by index.
    # NOTE(review): these rows were selected because the target is null in
    # df, so 'actual' looks like it will always be NaN; verify the intent.
    imputed_mv['actual'] = imputed_mv.index.map(
        df.set_index(df.index)[target])
    return imputed_mv
Пример #5
0
    def __init__(self,
                 df,
                 impute_atts,
                 na_mark=None,
                 output_path="datawig/",
                 num_epochs=50):
        """
        Fit one datawig imputer per attribute listed in ``impute_atts``.

        :param df: pandas dataframe, stores the data to fit the imputer.
        :param impute_atts: list of str, each str represents the name of column to be imputed using datawig model. Column can be categorical or numerical.
        :param na_mark: str, represents the symbol of missing values. Default is None, i.e. NaN represents the missing values.
        :param output_path: str, the path to store the learned datawig model.
        :param num_epochs: integer, the maximum iteration of datawig model.
        """
        super().__init__("@".join(["DatawigImputer"] + impute_atts),
                         df,
                         focus_atts=impute_atts,
                         fit_flag=False,
                         na_mark=na_mark)

        learned_imputers = {}
        for ai in impute_atts:
            # Every column except the target is a predictor.  Compare by name:
            # set.difference(ai) would iterate the *characters* of the string
            # and leave the target among the inputs.  The comprehension also
            # keeps the frame's column order instead of arbitrary set order.
            predictors = [col for col in df.columns if col != ai]
            # NOTE(review): every imputer is given the same output_path, so
            # artefacts written there are presumably overwritten by the next
            # attribute; confirm whether per-attribute paths are needed.
            learned_imputers[ai] = datawig.SimpleImputer(
                input_columns=predictors,
                output_column=ai,
                output_path=output_path).fit(train_df=df,
                                             num_epochs=num_epochs)
        self.step = learned_imputers
def imputer_v1(source_columns, target_column_name):
    """Return an unfitted ``datawig.SimpleImputer``.

    :param source_columns: column(s) carrying information about the target.
    :param target_column_name: the column to impute values for.
    """
    settings = {
        'input_columns': source_columns,
        'output_column': target_column_name,
        'output_path': 'imputer_model',  # stores model data and metrics
    }
    return datawig.SimpleImputer(**settings)
def missing_values(incsv_file, outcsv_file):
    """Fill the missing cells of a CSV file with datawig and write the result.

    For every column that contains missing values an imputer is fitted on
    the rows where that column is present and used to predict the rows
    where it is missing.  The filled frame is written to ``outcsv_file``
    and a per-column count of predicted rows is printed.

    :param incsv_file: path of the CSV file to read.
    :param outcsv_file: path of the CSV file to write.
    :raises SystemExit: with status 1 when the input file cannot be opened.
    """
    try:
        dataset = pd.read_csv(incsv_file)
    except OSError:
        print('cannot open', incsv_file)
        # A failed read is an error; exit status 0 would report success.
        sys.exit(1)

    columns_null = dataset.columns[dataset.isnull().any()]
    # Scratch frame holding the predictions, aligned on the row index.
    dataset_filled = pd.DataFrame(0,
                                  index=np.arange(len(dataset)),
                                  columns=columns_null)
    missing_value_count = []

    for col in columns_null:
        null_cells = dataset[col].isnull()
        filled_cells = dataset[col].notnull()
        imputer = datawig.SimpleImputer(
            list(dataset.columns[dataset.columns != col]),
            col,
            'imputer_model')
        imputer.fit(dataset[filled_cells])
        predicted = imputer.predict(dataset[null_cells])
        # Rows that were not predicted become NaN here and are ignored by
        # fillna below.
        dataset_filled[col] = predicted[col + '_imputed']
        missing_value_count.append("number of missing values replaced in " +
                                   str(col) + " is " +
                                   str(predicted.shape[0]))

    dataset = dataset.fillna(dataset_filled)
    dataset.to_csv(outcsv_file)

    for i in missing_value_count:
        print("\n\n", i)
Пример #8
0
def missing(dataset):
    """Return ``dataset`` with its missing cells filled by datawig models.

    Each column containing missing values is predicted from all the other
    columns: the imputer is fitted on the rows where the column is present
    and applied to the rows where it is missing.  A per-column count of
    predicted rows is printed.  For a dataset with no rows the function
    prints "empty dataset" and returns ``None``.
    """
    if dataset.shape[0] == 0:
        return print("empty dataset")

    null_columns = dataset.columns[dataset.isnull().any()]
    # Scratch frame holding the predictions, aligned on the row index.
    replacements = pd.DataFrame(0,
                                index=np.arange(len(dataset)),
                                columns=null_columns)
    report = []
    for target in null_columns:
        is_null = dataset[target].isnull()
        predictors = dataset.columns[dataset.columns != target]
        imputer = datawig.SimpleImputer(predictors, target, 'imputer_model')
        imputer.fit(dataset[~is_null])
        predicted = imputer.predict(dataset[is_null])
        replacements[target] = predicted[target + '_imputed']
        report.append("number of missing values replaced in {} is {}".format(
            target, predicted.shape[0]))

    dataset = dataset.fillna(replacements)
    for line in report:
        print("\n\n", line)
    return dataset
Пример #9
0
    def data_wig_impute(self):
        """Construct a datawig imputer for column '0'.

        The imputer is only constructed here; it is neither fitted, stored
        on ``self`` nor returned.
        """
        # column(s) carrying information about the column to impute
        feature_columns = ['1', '2', '3', '4', '5', '6', '7', 'target']
        imputer = datawig.SimpleImputer(
            input_columns=feature_columns,
            output_column='0',  # the column to impute values for
            output_path='imputer_model',  # stores model data and metrics
        )
Пример #10
0
def set_missing_value(raw_data, input_columns, output_column, num_epochs):
    """Fill the missing values of ``output_column`` in place using datawig.

    A ``SimpleImputer`` is fitted on the training part of a random split of
    ``raw_data`` and then applied to the full frame.  Predictions, rounded
    to one decimal, are written only into the rows where ``output_column``
    is missing.

    :param raw_data: pandas DataFrame; modified in place and returned.
    :param input_columns: columns used as predictors.
    :param output_column: name of the column to impute.
    :param num_epochs: number of training epochs passed to ``fit``.
    :return: ``raw_data`` with the missing values filled.
    """
    import datawig
    # Only the training part of the split is used to fit the model.
    rd_train, _ = datawig.utils.random_split(raw_data)
    # Initialise and fit a simple imputer model
    imputer = datawig.SimpleImputer(
        input_columns=input_columns,
        output_column=output_column,
        output_path='imputer_model',  # stores model data and metrics
    ).fit(rd_train, num_epochs=num_epochs)
    imputed = imputer.predict(raw_data)
    # The mask is built from raw_data itself; the previous code referenced an
    # undefined name ``data`` here.
    missing_mask = raw_data[output_column].isnull()
    raw_data.loc[missing_mask, output_column] = (
        imputed.loc[missing_mask, str(output_column + '_imputed')]
        .apply(lambda x: float(round(x, 1))))
    return raw_data
Пример #11
0
    def fit_transform(self, df_train, df_corrupted, predictors):
        """Impute the configured columns of ``df_corrupted`` with datawig.

        For every column in ``self.categorical_columns`` and
        ``self.numerical_columns`` a ``SimpleImputer`` is fitted on
        ``df_train`` (all other columns as inputs) and its predictions fill
        the missing cells of that column.

        :param df_train: frame used to fit each imputer.
        :param df_corrupted: frame whose missing values are filled; it is
            copied, not modified.
        :param predictors: unused; kept for interface compatibility.
        :return: imputed copy with the same columns as ``df_corrupted``.
        """
        df_imputed = df_corrupted.copy()

        for col in self.categorical_columns + self.numerical_columns:
            # Keep the frame's column order rather than arbitrary set order so
            # the model input is deterministic between runs.
            input_cols = [name for name in df_train.columns if name != col]

            print(f'Fitting model for column: {col}')
            model = datawig.SimpleImputer(input_cols, col, 'imputer_model')
            model.fit(df_train)

            df_imputed = model.predict(df_imputed)
            # Assign the result back explicitly: an in-place fillna on the
            # selected column is not guaranteed to write through to the frame.
            df_imputed[col] = df_imputed[col].fillna(
                df_imputed[col + '_imputed'])
            # Drop the helper columns added by predict.
            df_imputed = df_imputed[df_corrupted.columns]

        return df_imputed
Пример #12
0
def testImpute(data, modelVars):
    """Fit an imputer for 'CC_PolPop' on a random split and print its score.

    The target column of the test split is blanked out before prediction,
    and the predictions are compared with the held-back values through
    ``p2prediction``.
    """
    predictVar = 'CC_PolPop'
    imputedName = '{}_imputed'.format(predictVar)

    train, test = datawig.utils.random_split(data)
    # Keep the true values aside, then hide them from the model.
    actual = test[predictVar].copy()
    test[predictVar] = test[predictVar].map(lambda _: np.nan)

    imputer = datawig.SimpleImputer(
        input_columns=lDel(modelVars, predictVar),
        output_column=predictVar,
        output_path='model/test_imputer')
    imputer.fit_hpo(train_df=train,
                    num_epochs=1000,
                    user_defined_scores=[(p2Score, 'p2_prediction')])

    predicted = imputer.predict(test)[imputedName]
    print('Pred: {}'.format(p2prediction(predicted, actual)))
Пример #13
0
def imputate(data, target_column, num_epochs, logs_path):
    """Fit a datawig imputer for ``target_column`` and impute the test split.

    ``data`` is split at random; the model is fitted on the training part
    and applied to the test part.  Missing target values in the test part
    are replaced by the predictions and the helper ``<target>_imputed``
    column is removed from the returned frame.

    :param data: pandas DataFrame with the target and predictor columns.
    :param target_column: name of the column to impute.
    :param num_epochs: number of training epochs (also used as patience).
    :param logs_path: directory where datawig stores model data and metrics.
    :return: ``(imputed, mse)``: the imputed test split and the root of the
        mean squared difference between predictions and observed targets.
    """
    imputed_column = target_column + "_imputed"
    df_train, df_test = datawig.utils.random_split(data)
    imputer = datawig.SimpleImputer(
        # All columns except the target are predictors; feeding the target to
        # the model as an input would leak the value being predicted.
        input_columns=[col for col in data.columns if col != target_column],
        output_column=target_column,  # the column to impute values for
        output_path=logs_path  # stores model data and metrics
    )
    imputer.fit(train_df=df_train, num_epochs=num_epochs, patience=num_epochs)
    imputed = imputer.predict(df_test)
    mse = np.mean(
        (imputed[imputed_column] - imputed[target_column])**2)**0.5

    # Build the mask on the predicted frame itself: a mask computed on the
    # full ``data`` does not align with the test split, and ``.at`` only
    # accepts scalar labels, so ``.loc`` is required here.
    null_rows = imputed[target_column].isnull()
    imputed.loc[null_rows, target_column] = imputed.loc[null_rows,
                                                        imputed_column]
    # drop() returns a new frame; keep the result.
    imputed = imputed.drop(imputed_column, axis=1)
    return imputed, mse
Пример #14
0
def impute_data(df, output_column, input_columns, num_epochs=50):
    """Predict ``output_column`` for every row of ``df`` with datawig.

    The imputer is fitted on the rows where ``output_column`` is present.
    When the column is string-typed and has at least as many unique values
    as non-missing values, a message is printed and ``df`` is returned
    unchanged.

    :return: the frame returned by ``imputer.predict(df)``, or ``df`` itself
        when imputation is skipped.
    """
    target = df[output_column]
    if is_string_dtype(target) and \
            len(target.unique()) >= len(target.dropna()):
        print(
            output_column,
            'is categorical and only has unique values, cannot do imputation')
        return df

    # Train only on rows whose target value is known.
    known_rows = df.dropna(subset=[output_column])
    imputer = datawig.SimpleImputer(
        input_columns=input_columns,  # column(s) carrying info about the target
        output_column=output_column,  # the column to impute values for
    )
    imputer.fit(train_df=known_rows, num_epochs=num_epochs)

    # Predictions are appended to a copy of the full frame.
    return imputer.predict(df)
Пример #15
0
def test():
    """Smoke-test the imputation helpers on a population data set.

    The CSV is downloaded, about 30% of its cells are masked at random, and
    the masked frame is passed to ``impute_all_data``.  A single
    ``SimpleImputer`` for 'pop_2007' is then fitted on a random split and
    used to predict the test part.  Nothing is returned.
    """
    wanted_columns = [
        'pop_1992', 'pop_1997', 'pop_2002', 'pop_2007', 'country', 'continent'
    ]
    df_orig = pd.read_csv("https://goo.gl/ioc2Td", usecols=wanted_columns)
    # Mask roughly 30% of the cells at random.
    hide = np.random.random(df_orig.shape) < 0.3
    df = df_orig.mask(hide)

    res = impute_all_data(df)

    df_train, df_test = datawig.utils.random_split(df)
    imputer = datawig.SimpleImputer(
        # column(s) carrying information about the column to impute
        input_columns=['pop_1992', 'pop_1997', 'pop_2002', 'country'],
        # the column to impute values for
        output_column='pop_2007',
    )
    imputer.fit(train_df=df_train, num_epochs=50)
    imputed = imputer.predict(df_test)
Пример #16
0
def missing(data):
    """Return ``data`` with its missing cells filled by datawig models.

    Each column containing missing values is predicted from all the other
    columns: the imputer is fitted on the rows where the column is present
    and applied to the rows where it is missing.  A per-column count of
    predicted rows is printed.  For a frame with no rows the function prints
    "empty dataset" and returns ``None``.
    """
    if data.shape[0] == 0:
        return print("empty dataset")

    columns_to_fill = data.columns[data.isnull().any()]
    # Scratch frame holding the predictions, aligned on the row index.
    predictions = pd.DataFrame(0,
                               index=np.arange(len(data)),
                               columns=columns_to_fill)
    messages = []
    for name in columns_to_fill:
        absent = data[name].isnull()
        other_columns = data.columns[data.columns != name]
        imputer = datawig.SimpleImputer(other_columns, name, 'imputer_model')
        imputer.fit(data[~absent])
        result = imputer.predict(data[absent])
        predictions[name] = result[name + '_imputed']
        messages.append("number of missing values replaced in {} is {}".format(
            name, result.shape[0]))

    data = data.fillna(predictions)
    print("\n\n\n")
    for message in messages:
        print("\n", message)
    return data
Пример #17
0
    def train_model_for_categorical_variables(self):
        """Fit one datawig imputer per entry of ``self.variables`` typed "string".

        Each entry is indexed as ``c[0]`` (name) and ``c[1]`` (type).  For
        every entry whose type is ``"string"`` an imputer is fitted on
        ``self.df`` with all other entries as inputs, stored in
        ``self.models`` under the entry itself, and the entry's name is
        appended to ``self.categorical_variables``.

        :return: ``True`` once all models are trained.
        """
        time = str(datetime.datetime.now())
        # Iterate the instance's own list: the loop body copies and edits
        # ``self.variables``, so the bare name ``variables`` used before was
        # either undefined or a different object.
        for c in self.variables:
            if c[1] != "string":
                continue

            var = self.variables.copy()
            var.remove(c)

            # NOTE(review): ``c`` is indexed as a (name, type) pair, yet the
            # whole pair is passed as the column name below; presumably the
            # name ``c[0]`` is what datawig needs. Confirm against callers.
            imputer = datawig.SimpleImputer(
                input_columns=var,  # column(s) carrying info about the target
                output_column=c,  # the column to impute values for
                output_path='lib/imputer_models' + time + '/' +
                str(c)  # stores model data and metrics
            )
            imputer.fit(train_df=self.df, num_epochs=5)
            self.models[c] = imputer
            self.categorical_variables.append(c[0])

            print("Training completed to treat the categorical variable: ",
                  c[0])

        return True
Пример #18
0
def main():
    """Cross-validate the datawig imputers for every variable in ``CCs``.

    For each variable the rows with a known value are split into 5 folds by
    ``crossValKFold``.  Per fold a model is loaded from disk when its
    directory exists, otherwise fitted, and its predictions on the held-out
    fold are scored.  Per-fold results are appended to
    ``validationCategorical.csv`` or ``validationRegression.csv``, and the
    per-variable means of the regression scores to ``final.csv``.
    """
    seshat = pd.read_csv('model/seshat-with-regression-vars.csv')
    seshat = seshat.groupby(['BasePolity']).first()
    betterWithAllVars = [
        'CC_Govt', 'CC_Hier', 'CC_Infra', 'CC_Money', 'CC_Texts', 'CC_Writing'
    ]
    fewVars = ccVars(seshat)
    for predictVar in CCs:
        r2s = []
        p2s = []
        p2ms = []
        # Choose the predictor set per variable.  Previously ``modelVars``
        # was overwritten once and then reused, so later 'few' variables
        # were silently trained on the full variable list.
        if predictVar in betterWithAllVars:
            varSet = 'many'
            modelVars = IMPUTABLE_VARS
        else:
            varSet = 'few'
            modelVars = fewVars
        print('Validating {}'.format(predictVar))
        imputedName = '{}_imputed'.format(predictVar)
        # Select known values
        knownVals = seshat[~seshat[predictVar].isna()]

        # Generate train and test sets on the known values
        for i, df_train, df_test in crossValKFold(knownVals, 5):
            modelPath = 'model/test_{}_{}_imputer_{}'.format(
                i, predictVar.replace('/', ''), varSet)
            if os.path.isdir(modelPath):
                imputer = datawig.SimpleImputer.load(modelPath)
                imputer.load_hpo_model(hpo_name=0)
            else:
                imputer = datawig.SimpleImputer(
                    input_columns=lDel(modelVars, predictVar),
                    output_column=predictVar,
                    output_path=modelPath)
                imputer.fit(train_df=df_train, num_epochs=1000)
            # Predict the values in the test set
            predicted = imputer.predict(df_test)

            if predictVar in IMPUTABLE_CATEGORICAL_VARS:
                # Macro averaging yields one scalar per metric, so the row is
                # valid CSV.  The previous code wrote ``r2``/``p2``, which are
                # not computed in this branch.
                precision, recall, fscore, _ = precision_recall_fscore_support(
                    predicted[predictVar],
                    predicted[imputedName],
                    average='macro')
                with open('validationCategorical.csv', 'a') as out:
                    out.write('{},{},{},{}\n'.format(predictVar, precision,
                                                     recall, fscore))
                continue

            try:
                # Compute fidelity metrics
                r2, p2m, p2 = score(
                    np.array(predicted[predictVar]).astype(np.float64),
                    np.array(predicted[imputedName]).astype(np.float64))
            except Exception as err:
                # Best effort: a fold that cannot be scored is skipped, but
                # the reason is reported instead of silently swallowed.
                print('Skipping fold {} of {}: {}'.format(i, predictVar, err))
                continue
            r2s.append(r2)
            p2s.append(p2)
            p2ms.append(p2m)
            with open('validationRegression.csv', 'a') as out:
                # Five values need five placeholders; ``p2`` used to be
                # dropped from the row.
                out.write('{},{},{},{},{}\n'.format(i, predictVar, r2, p2m,
                                                    p2))
        with open('final.csv', 'a') as out:
            out.write('{},{},{},{}\n'.format(predictVar, np.mean(r2s),
                                             np.mean(p2s), np.mean(p2ms)))
Пример #19
0
# MICE - works on, and only accepts, numerical variables
from impyute.imputation.cs import mice
# Start the MICE training (can be applied to all numerical variables that have missing values)
Df_NumericalVars = Df.select_dtypes(include = np.number)
Df_Imputed_MICE = pd.DataFrame(data=mice(Df_NumericalVars.values), columns=Df_NumericalVars.columns, index=Df_NumericalVars.index)

# DataWig imputation - https://github.com/awslabs/datawig - takes a lot of time
import datawig
# Var1 needs to be imputed
# Split the data into observations with Var1 present and Var1 missing
# (the column is spelled 'Var1' in both selections; 'var1' was a different attribute)
X_train = X[pd.notnull(X.Var1)] # rows with Var1 present are used for training
X_test = X[pd.isnull(X.Var1)] # rows with Var1 missing
# Parameters
# NOTE(review): the split above is on Var1 while output_column is 'revol_util';
# presumably one of the two is a leftover and both should name the same column.
imputer = datawig.SimpleImputer(
    input_columns=['Var2','Var3','Var4','Var5','Var6', 'Var7'], # column(s), categorical & numerical; these may themselves have missing data
    output_column='revol_util', # the column to impute values for; only one column at a time
    output_path = 'imputer_model') # stores model data and metrics
# Fit an imputer model on the train data
imputer.fit(train_df=X_train, num_epochs=50)    # num_epochs is not needed when imputing a categorical variable
# Impute missing values and return the original dataframe with predictions
imputed = imputer.predict(X_test)


### Soft Probability Imputation - Implementation - Didn't check - https://gist.github.com/Vernal-Inertia/bf2e75e23ea0a508bbebfeadb0aafabe
valueCounts = {}


def CountAll():
    """Collect per-column statistics of the global ``df`` into module globals.

    Sets ``all_columns`` to the list of column labels and ``nanCounts`` to
    the per-column number of missing values, and stores each column's
    ``value_counts()`` in ``valueCounts`` under the column label.
    """
    global all_columns, nanCounts, valueCounts
    all_columns = list(df)
    nanCounts = df.isnull().sum()
    valueCounts.update(
        (name, df[name].value_counts()) for name in all_columns)
Пример #20
0
# Start the MICE training
imputed_training = mice(train.values)

## 6.) Imputing using deep neural networks (Datawig)
## This method works well with both numeric and categorical variables. Datawig is a library that learns ML models
## using DNNs to impute missing values. It supports both CPU and GPU for training.
## Advantages: it is quite accurate compared to other imputation techniques, and it can handle categorical data
## with a 'Feature Encoder'.
## Disadvantages: it is slow with large datasets, and you need to specify the columns that contain information
## about the target column that will be imputed.

## Example code for imputation using neural networks

import datawig
df_train, df_test = datawig.utils.random_split(train)

# Initialize a SimpleImputer model
imputer = datawig.SimpleImputer(
    # column(s) containing information about the column we want to impute
    input_columns=['1', '2', '3', '4', '5', '6', '7', 'target'],
    output_column='0',  # the column we'd like to impute values for
    output_path='imputer_model'  # stores model data and metrics
)

train = data_split[0].copy()
## For the above data we will use various imputation methods.
train2 = train.copy()
train3 = train.copy()
train4 = train.copy()

impute_methods = ['std', 'robust', 'minmax', 'normal', 'knn', 'nn', 'mice', '']
# SimpleImputer is provided by sklearn.impute; sklearn.preprocessing does not export it.
from sklearn.impute import SimpleImputer
Пример #21
0
# Randomly replace 30% of the 'Skew2' column with NaN values
column = X['Skew2']
print(column.size)
missing_pct = int(column.size * 0.3)
# Sample row positions without replacement so that exactly `missing_pct`
# distinct cells are blanked (repeated random.choice can pick a row twice).
i = random.sample(range(column.shape[0]), missing_pct)
# Write through the frame itself, translating positions to index labels;
# assigning into a Series taken from X is not guaranteed to modify X.
X.loc[X.index[i], 'Skew2'] = np.nan
column = X['Skew2']
print(column.shape[0])
print(column)

import datawig

# Initialize a SimpleImputer model
imputer = datawig.SimpleImputer(
    input_columns=[
        'EK2'
    ],  # column(s) containing information about the column we want to impute
    output_column='Skew2',  # the column we'd like to impute values for
    output_path='imputer_model'  # stores model data and metrics
)

# Fit an imputer model on the train data
imputer.fit(train_df=X)

# Impute missing values and return the original dataframe with predictions
X = imputer.predict(X)
# Only fill the cells that are missing; observed values are kept rather than
# being replaced by model predictions.
X['Skew2'] = X['Skew2'].fillna(X['Skew2_imputed'])
del X['Skew2_imputed']
print(X)

from sklearn.model_selection import train_test_split
# Split dataset into training set and test set
Пример #22
0
# Recorded results (the metric is sklearn's r2_score, imported below as `score`):
# CPI r2_score         :      0.35543474289083
# Weekly_Sales r2_score:  0.5619225329783258

import datawig
import pandas as pd
from sklearn.metrics import r2_score as score

df = pd.read_csv('impute_2013_final.csv')
df = df.fillna(0)

df_train, df_test = datawig.utils.random_split(df, split_ratios=[0.8, 0.2])

imputer = datawig.SimpleImputer(
    input_columns = ['Store', 'Fuel_Price', 'Dept', 'Temperature', 'Promotion1', 'Promotion2', 'Promotion3',
          'Promotion4', 'Promotion5', 'Promotion1_imputed', 'Promotion2_imputed', 'Promotion3_imputed',
          'Promotion4_imputed', 'Promotion5_imputed', 'Unemployment', 'Size (sq ft)', 'CPI_imputed'],
    output_column = 'Weekly_Sales',
    output_path = 'imputer_model'
)

imputer.fit(train_df=df_train, num_epochs = 100)

imputed = imputer.predict(df_test)

new_dataframe = pd.DataFrame(data = imputed)

new_dataframe.to_csv('2013_sales_complete.csv')

# `score` is r2_score, so report the value under that name.
r2 = score(imputed['Weekly_Sales'], imputed['Weekly_Sales_imputed'])
f1 = r2  # previous variable name, kept for any code that still reads it
print('Weekly_Sales r2_score: ', r2)
Пример #23
0
 def fit(self, df):
     """Fit one datawig imputer per column in ``self.columns_to_impute``.

     Each imputer uses every column of ``df`` except the label column and
     the column being imputed, and is stored in ``self.imputers`` under the
     column name.
     """
     for column in self.columns_to_impute:
         excluded = {self.label_column, column}
         input_columns = list(set(df.columns) - excluded)
         imputer = datawig.SimpleImputer(input_columns=input_columns,
                                         output_column=column,
                                         output_path=self.out)
         self.imputers[column] = imputer.fit(train_df=df)
    # Cross-validate the estimator (negative mean squared error, N_SPLITS
    # folds) and store the scores under the estimator's class name.
    score_TMI_imputer[impute_estimator.__class__.__name__] = \
        cross_val_score(
            impute_estimator, newdata, y_missing, scoring='neg_mean_squared_error',
            cv=N_SPLITS
        ) 

seed(7)
# Wrap the data in a frame and prefix every column label with 'V_'.
X_tech=pd.DataFrame(X_missing) 
X_tech.columns=['V_'+str(i) for i in X_tech.columns]
# newdata1 receives the imputed values; X_tech itself is left untouched.
newdata1=X_tech.copy()
type_var=X_tech.dtypes
for i in list(X_tech.columns):
    # Numeric column (int64/float64) with at least one missing value.
    if sum(pd.isna(X_tech[i])==True)>0 and type_var[i] in ['int64', 'float64']:
        imputer = datawig.SimpleImputer(
            input_columns=list(X_tech.drop(labels=i,axis=1).columns), # column(s) containing information about the column we want to impute
            output_column= i, # the column we'd like to impute values for
            output_path = 'imputer_model' # stores model data and metrics
            )
        imputer.fit(train_df=X_tech, num_epochs=50)
        
        imputed = imputer.predict(X_tech)
        # Copy the last column of the prediction frame into the rows where
        # the value was missing (presumably '<column>_imputed'; TODO confirm).
        newdata1.loc[np.where(pd.isna(X_tech[i])==True)[0],i]=imputed.iloc[np.where(pd.isna(X_tech[i])==True)[0],len(imputed.columns)-1]
    # Column of any other dtype with at least one missing value.
    elif sum(pd.isna(X_tech[i])==True)>0 and type_var[i] not in ['int64', 'float64']:
        imputer = datawig.SimpleImputer(
            input_columns=list(X_tech.drop(labels=i,axis=1).columns), # column(s) containing information about the column we want to impute
            output_column= i, # the column we'd like to impute values for
            output_path = 'imputer_model' # stores model data and metrics
            )
        imputer.fit(train_df=X_tech, num_epochs=50)
        
        # NOTE(review): in this view the branch ends after predict, without
        # writing anything into newdata1; the remainder may be cut off.
        imputed = imputer.predict(X_tech)