예제 #1
0
def test_output_file_exists(test_data, results_path):
    """Run ``impy.util.compare`` and check the first line of its log file.

    The fixture data is imputed with ``impy.mode`` and ``impy.mean`` (each
    on its own copy), the results are handed to ``impy.util.compare`` with
    ``results_path`` as the log target, and the first logged line is parsed
    as a Python literal and compared with the expected scores.
    """
    raw = test_data(SHAPE)
    targets = np.array([1, 0, 1, 1, 0])
    candidates = [
        [name, (impute(np.copy(raw)), targets)]
        for name, impute in (("mode", impy.mode), ("mean", impy.mean))
    ]

    impy.util.compare(candidates, log_path=results_path)

    expected = {'mode': [('SVC', 0.0)], 'mean': [('SVC', 0.0)]}
    with open(results_path, 'r') as log_file:
        first_line = next(log_file)
    assert ast.literal_eval(first_line) == expected
예제 #2
0
 def test_impute(self):
     """Compare Python mean imputation with R ``mice`` for one column.

     Columns 1..9 of ``self.df`` are imputed with ``impy.mean`` and the
     column at position 5 is rounded to whole numbers.  Every row of that
     column must equal the ``bare_nuclei`` column of the first completed
     dataset that R's ``mice`` produces for ``bc_data[, 2:10]``.

     NOTE(review): ``bc_data`` is not created here; it presumably already
     exists in the R session -- confirm against the test setup.
     """
     imputed_frame = impy.mean(self.df.iloc[:, 1:10])
     imputed_frame.iloc[:, 5] = imputed_frame.iloc[:, 5].apply(
         lambda value: np.around(value, decimals=0), 1)

     # Run the reference imputation on the R side.
     robjects.r('library(mice)')
     robjects.r('dataset_impute <- mice(bc_data[, 2:10], print=FALSE)')
     r_column = robjects.r('mice::complete(dataset_impute,1)$bare_nuclei')

     for row in range(imputed_frame.shape[0]):
         py_value = imputed_frame.iloc[:, 5][row]
         r_value = r_column[row]
         print('P ' + str(py_value) + 'R ' + str(r_value))
         self.assertEqual(py_value, r_value)
예제 #3
0
 def setUp(self):
     """Prepare ``self.imputed_mode`` for the comparison tests.

     A dataset is requested from ``impy.dataset.test_data`` with a 5x5
     boolean mask whose only True cell is (0, 0).  It is imputed with
     ``impy.mode`` and ``impy.mean`` (each on its own copy) and the
     results are stored, paired with a fixed label vector, as
     ``[name, (imputed, labels)]`` entries.
     """
     missing_mask = np.zeros((5, 5), dtype=bool)
     missing_mask[0, 0] = True
     incomplete = impy.dataset.test_data(mask=missing_mask)
     targets = np.array([1, 0, 1, 1, 0])

     self.imputed_mode = []
     for name, impute in (("mode", impy.mode), ("mean", impy.mean)):
         entry = [name, (impute(np.copy(incomplete)), targets)]
         self.imputed_mode.append(entry)
예제 #4
0
def impute_mean(df_soc, numerical_col):
    """Impute the NaN-containing columns of ``df_soc`` with ``impy.mean``.

    Args:
        df_soc: DataFrame to process; it is modified in place and also
            returned.
        numerical_col: Iterable of column labels to inspect.  ``np.isnan``
            is applied to each column, so the columns must hold numeric
            data.

    Returns:
        The same ``df_soc`` object.  Every listed column that contained at
        least one NaN has been replaced by the output of ``impy.mean``;
        the other columns are untouched.
    """
    for e in numerical_col:
        # View the column as an (n_rows, 1) array once; both the NaN check
        # and the imputer work on this 2-D form.
        column = df_soc[e].to_numpy().reshape(-1, 1)

        has_nan = np.isnan(column)
        print(has_nan)
        if has_nan.any():
            imputed = impy.mean(column)
            print("Imputed: ", e)
            df_soc[e] = imputed

    return df_soc

#impy.mean(arr)
예제 #5
0
def imputeValues(filename, output_path="../Frontend/comets_final.csv"):
    """Impute the numeric columns of a CSV file and write the result out.

    Only columns whose dtype is exactly ``float64`` or ``int64`` are passed
    to ``impy.mean``; all other columns are carried over unchanged.

    Args:
        filename: Path of the CSV file to read.
        output_path: Where the resulting CSV is written (without the
            index).  Defaults to the location the function has always
            used.

    Returns:
        The DataFrame read from ``filename`` with its numeric columns
        replaced by their imputed versions.
    """
    df = pd.read_csv(filename)

    numeric_cols = [
        col for col in df.columns
        if df[col].dtype == np.float64 or df[col].dtype == np.int64
    ]

    # Nothing to impute when the file has no numeric column; skip the
    # imputer instead of handing it an empty frame.
    if numeric_cols:
        numeric = df[numeric_cols].copy()
        imputed = impy.mean(numeric)
        # Re-attach the original labels to the imputer's output.
        imputed.columns = numeric.columns
        for col in numeric_cols:
            df[col] = imputed[col]

    df.to_csv(output_path, index=False)
    return df
예제 #6
0
import numpy as np

# Draw an n x n matrix of uniform samples (upper bound 6), then overwrite
# three randomly chosen cells with NaN (the same cell may be picked twice).
n = 5
arr = np.random.uniform(high=6, size=(n, n))
for _ in range(3):
    row, col = np.random.randint(n), np.random.randint(n)
    arr[row, col] = np.nan
print(arr)
print('_' * 20)

#np.array([[0.25288643, 1.8149261 , 4.79943748, 0.54464834, np.nan],
#          [4.44798362, 0.93518716, 3.24430922, 2.50915032, 5.75956805],
#          [0.79802036, np.nan, 0.51729349, 5.06533123, 3.70669172],
#          [1.30848217, 2.08386584, 2.29894541, np.nan, 3.38661392],
#          [2.70989501, 3.13116687, 0.25851597, 4.24064355, 1.99607231]])

import impyute as impy
# Print the result of impy.fast_knn and then impy.mean on the array,
# each followed by a separator line.
for impute in (impy.fast_knn, impy.mean):
    print(impute(arr))
    print('_' * 20)
#%%
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# impyute is used to fill in the missing values
import impyute as impy

# Load the Ruspini table; every "?" cell is replaced by NaN.
data_ruspini = pd.read_csv("data_ruspini_missing.csv").replace("?", np.nan)

# Float array of the raw table, and the output of impy.mean on it.
data_ruspini_array = np.array(data_ruspini, dtype=float)
data_baru = impy.mean(data_ruspini_array)

# Labels for the first three array columns, in order.
column_names = ('x', 'y', 'label')

data_frame_ruspini_missing = pd.DataFrame(
    {name: data_ruspini_array[:, idx] for idx, name in enumerate(column_names)}
)

data_frame_ruspini_baru = pd.DataFrame(
    {name: data_baru[:, idx] for idx, name in enumerate(column_names)}
)

print(data_frame_ruspini_baru)
print(data_frame_ruspini_missing)

# visualization
예제 #8
0
def test_mean_impute_missing_values():
    """Mean imputation must leave no NaN in the result."""
    nan_flags = np.isnan(impy.mean(data_m))
    assert not nan_flags.any()
예제 #9
0
 def test_mean_impute_missing_values(self):
     """Mean imputation must leave no NaN in the result."""
     result = impy.mean(self.data_m)
     still_missing = np.isnan(result)
     self.assertFalse(still_missing.any())
예제 #10
0
"""test_compare.py"""
import numpy as np
import impyute as impy

# Shared fixture: a test dataset built from a 5x5 mask whose only True cell
# is (0, 0), imputed with impy.mode and impy.mean (each on its own copy).
mask = np.zeros((5, 5), dtype=bool)
mask[0, 0] = True
data_m = impy.dataset.test_data(mask=mask)
labels = np.array([1, 0, 1, 1, 0])
imputed_mode = [
    [name, (impute(np.copy(data_m)), labels)]
    for name, impute in (("mode", impy.mode), ("mean", impy.mean))
]

def test_output_file_exists():
    """Smoke test: ``impy.util.compare`` runs without failing.

    The comparison log is written to ``./results.txt``.
    """
    impy.util.compare(imputed_mode, log_path="./results.txt")
예제 #11
0
 def __init__(self, T, mask, algo, miss_info, kf, notobj, obj, target):
     """Set up the masked datasets and the table of imputation methods.

     Args:
         T: Complete dataset.  It is indexed by column label and exposes
             ``.columns``, so presumably a pandas DataFrame -- confirm
             against the caller.
         mask: Array-like that is wrapped into a DataFrame with ``T``'s
             columns; cells equal to 1 are the ones overwritten with NaN.
         algo: Stored unchanged as ``self.algo``.
         miss_info: Mapping read for the keys ``"ord_col"`` and
             ``"num_col"`` and, when ``obj`` is given, ``"ce_encoder"``.
         kf: Stored unchanged as ``self.kf``.
         notobj: Column selection used to build ``self.X``, ``self.T``
             and ``self.mask``.
         obj: Columns handled by the categorical branch; ``None`` or
             ``[]`` selects the numeric-only branch.
         target: Column label of ``T`` kept as ``self.target_y``, or
             ``None``.

     NOTE(review): every exception raised in this constructor is printed
     and then swallowed, so the instance can be left partially
     initialised.
     """
     try:
         self.miss_info = miss_info
         self.columns = notobj
         # "ord_col" entries followed by "num_col" entries (presumably
         # list concatenation -- confirm the value types).
         self.ord_num_col = self.miss_info["ord_col"] + self.miss_info[
             "num_col"]
         # Template for the per-metric result stores; deep-copied below so
         # that self.cv and self.MSE do not share the inner dicts.
         metric = {"rmse": {}, "nrmse": {}}
         self.rawT = T
         self.target = target
         if target is not None: self.target_y = T[target]
         else: self.target_y = None
         self.cv = {}
         self.cv.update(deepcopy(metric))
         self.kf = kf
         self.MSE = {}
         self.MSE.update(deepcopy(metric))
         self.result = {}
         self.time_ck = {}
         # Work on a copy of T and blank out every cell whose mask value is 1.
         X = deepcopy(T)
         mask = pd.DataFrame(mask, columns=T.columns.tolist())
         self.rawmask = mask
         X[(mask == 1).values] = np.nan
         # Normalise "no categorical columns" ([] or None) to None.
         if obj in [None, []]: obj = None
         else: pass
         ##########################################
         # Restrict the masked data, the complete data and the mask to the
         # notobj columns.
         self.X = X[notobj]
         self.T = T[notobj]
         self.mask = mask[notobj]
         self.notobj = notobj
         ##########################################
         if obj is not None:
             ############ Numeric + Category  #################
             # Fill the masked cells of the obj columns with each column's
             # most frequent value.
             cat_impute = SimpleImputer(strategy="most_frequent")
             X[obj] = cat_impute.fit_transform(X[obj])
             self.true_obj = T[obj]
             self.pd_obj = X[obj]
             ###################################################
             # Encoded variant: replace the values of every column listed in
             # the encoder's category_mapping, then re-apply the mask.
             TT = deepcopy(T)
             cat_encoder = miss_info["ce_encoder"]
             for k in cat_encoder.category_mapping:
                 # map_ is not used below; the mapping is read from k directly.
                 col, map_ = k["col"], k["mapping"]
                 TT[col] = TT[col].replace(
                     dict(zip(k["mapping"].index, k["mapping"].values)))
             self.full_miss_data = TT
             self.full_miss_data[(mask == 1).values] = np.nan
             # MICE variant: prefix the values of every obj column with
             # "Cols_", then re-apply the mask.
             mice_data = deepcopy(T)
             for obj_col in obj:
                 mice_data[obj_col] = "Cols_" + mice_data[obj_col]
             self.full_mice_data = mice_data
             self.full_mice_data[(mask == 1).values] = np.nan
         else:
             ########## Numeric  ###############################
             # Numeric-only case: both variants are independent copies of
             # the masked notobj data.
             num_data = deepcopy(self.X)
             num_data[(self.mask == 1).values] = np.nan
             self.full_miss_data = deepcopy(num_data)
             self.full_mice_data = deepcopy(num_data)
             ###################################################
         self.algo = algo
         # Method name -> callable that takes the data to impute.
         # NOTE(review): the "MissForest" entry returns whatever .fit(x)
         # returns, while the other entries return an imputer's output --
         # confirm this is intended.
         self.method = {
             "MissForest" : lambda x : MissForest(verbose = 0, n_jobs  = -1 ).fit(x) ,
             "mean" : lambda x : impy.mean(x) ,
             "median" : lambda x : impy.median(x) ,
             "mode" : lambda x : impy.mode(x) ,
             "knn" : lambda x : impy.fast_knn(x) ,
             "MICE" : lambda x : impy.mice(x) ,
             "EM" : lambda x : impy.em(x),
             "MultipleImputer" : lambda x : MultipleImputer(n=1, return_list = True).\
             fit_transform(pd.DataFrame(x)).values,
         }
     except Exception as e:
         # Best-effort construction: the error is only printed, never re-raised.
         print(e)
         pass
예제 #12
0
def test_mean(test_data):
    """Impute the fixture data with ``impy.mean`` and pass it to ``return_na_check``."""
    return_na_check(impy.mean(test_data(SHAPE)))