/
demo_GINN_imputation.py
156 lines (134 loc) · 6.46 KB
/
demo_GINN_imputation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
'''Main function impute with Graph Imputation Neural Networks (GINN)'''
# Necessary packages
import datetime
import csv
import numpy as np
from sklearn import model_selection, preprocessing
import argparse
from tqdm import tqdm
import warnings
import os
warnings.filterwarnings("ignore")
# My packages
from data_helper import file_list, data_K_Fold, dictionary_datasets, imputed_dataset
from utils import csv_reader, mask_generation, data2onehot, write_file
from GINN.ginn.core import GINN
from utils import inverse_onehot, order_by_address, check_approximation
'''Start code Python'''
def main(args):
    '''Main function for imputation with GINN.

    For each dataset file in ``file_list[from_id:to_id]``, for each fold and
    each missingness level, reads the pre-generated missing-data K-fold split,
    one-hot encodes it, min-max scales it, imputes with GINN, inverts the
    encoding/scaling, and writes the imputed train/test matrices to disk.

    Args:
    - from_id: start index into file_list
    - to_id: end index into file_list (exclusive)
    - fold_size: number of folds + 1 (folds are indexed from 1)

    Returns:
    - None; writes imputed data files as a side effect via write_file.
    '''
    # Input parameters
    from_id = args.from_id
    to_id = args.to_id
    fold_size = args.fold_size
    # Initial parameters
    missingness_flag = [0, 10, 20, 30, 40, 50]  # t% missing data
    seed = 42
    # Main program
    for i_file in range(from_id, to_id):
        file_name = file_list[i_file]
        print(datetime.datetime.now(), "File {}: {}".format(i_file, file_name))
        for i in tqdm(range(1, fold_size)):
            for missingness in missingness_flag:
                # Load the pre-generated train/test split containing missing values.
                (D_miss_train, D_miss_test) = csv_reader(data_K_Fold, file_name, i, method='data_missing',
                                                         missingness=missingness)
                # Last column is the label; the rest are features.
                x_train = D_miss_train[:, :(D_miss_train.shape[1] - 1)]
                y_train = D_miss_train[:, -1]
                x_test = D_miss_test[:, :(D_miss_test.shape[1] - 1)]
                y_test = D_miss_test[:, -1]
                missing_train, missing_train_mask = mask_generation(x_train)
                missing_test, missing_test_mask = mask_generation(x_test)
                # Re-attach the label column; labels are never missing, so their
                # mask entries are all ones.
                cx_train = np.c_[missing_train, y_train]
                cx_test = np.c_[missing_test, y_test]
                mask_train = np.c_[missing_train_mask, np.ones(y_train.shape)]
                mask_test = np.c_[missing_test_mask, np.ones(y_test.shape)]
                # Here we preprocess the data applying a one-hot encoding for the categorical variables.
                # We get the encoded dataset, three different masks that indicate the missing features
                # and whether these features are categorical or numerical, plus the new columns for the
                # categorical variables with their one-hot range.
                numerical_columns = dictionary_datasets['{}'.format(file_name)]['numerical']
                categorical_columns = dictionary_datasets['{}'.format(file_name)]['categorical']
                [oh_data, oh_mask, oh_numerical_mask, oh_categorical_mask, oh_categorical_columns, classes_dictionary] = data2onehot(
                    np.r_[cx_train, cx_test], np.r_[mask_train, mask_test], numerical_columns, categorical_columns)
                # Split the stacked (train + test) one-hot arrays back into train/test parts.
                # BUG FIX: the numerical and categorical masks must come from
                # oh_numerical_mask / oh_categorical_mask (as returned by data2onehot),
                # not from the generic oh_mask, which was previously copied three times.
                oh_data_train = oh_data[:x_train.shape[0], :]
                oh_data_test = oh_data[x_train.shape[0]:, :]
                oh_mask_train = oh_mask[:x_train.shape[0], :]
                oh_num_mask_train = oh_numerical_mask[:x_train.shape[0], :]
                oh_cat_mask_train = oh_categorical_mask[:x_train.shape[0], :]
                oh_mask_test = oh_mask[x_train.shape[0]:, :]
                oh_num_mask_test = oh_numerical_mask[x_train.shape[0]:, :]
                oh_cat_mask_test = oh_categorical_mask[x_train.shape[0]:, :]
                # We scale the features with a min-max scaler that preserves the one-hot encoding.
                # NOTE(review): train and test are scaled with independently fitted scalers;
                # presumably intentional here, but confirm this matches the evaluation protocol.
                scaler_train = preprocessing.MinMaxScaler()
                oh_data_train = scaler_train.fit_transform(oh_data_train)
                scaler_test = preprocessing.MinMaxScaler()
                oh_data_test = scaler_test.fit_transform(oh_data_test)
                # Now we are ready to impute the missing values on the training set!
                imputer_train = GINN(
                    oh_data_train,
                    oh_mask_train,
                    oh_num_mask_train,
                    oh_cat_mask_train,
                    oh_categorical_columns,
                    numerical_columns,
                    categorical_columns
                )
                # Fit and transform: impute the training set, undoing the scaling afterwards.
                imputer_train.fit(epochs=1)
                imputed_train = scaler_train.inverse_transform(imputer_train.transform())
                # Impute the test set by adding it to the already-fitted imputer;
                # transform() then returns train + test rows, so slice off the test part.
                imputer_train.add_data(
                    oh_data_test,
                    oh_mask_test,
                    oh_num_mask_test,
                    oh_cat_mask_test
                )
                imputed_test = imputer_train.transform()
                imputed_test = scaler_test.inverse_transform(imputed_test[x_train.shape[0]:])
                # Rebuild the original column layout when categorical variables were one-hot encoded.
                if categorical_columns:
                    # Rebuild train
                    D_inverse_tr = inverse_onehot(cx_train.shape, imputed_train, oh_categorical_columns, classes_dictionary)
                    imputed_train = order_by_address(D_inverse_tr, num_cols=numerical_columns, cat_cols=categorical_columns)
                    # Rebuild test
                    D_inverse_te = inverse_onehot(cx_test.shape, imputed_test, oh_categorical_columns, classes_dictionary)
                    imputed_test = order_by_address(D_inverse_te, num_cols=numerical_columns, cat_cols=categorical_columns)
                # Check the approximation of each element against the observed values.
                imputed_train_checked = check_approximation(imputed_train, cx_train)
                imputed_test_checked = check_approximation(imputed_test, cx_test)
                # Write result
                imputed_path = os.path.join(imputed_dataset, file_name)
                write_file(imputed_train_checked, imputed_test_checked, imputed_path, 'GINN', missingness, i)
if __name__ == "__main__":
    # Command-line entry point: collect the run parameters and dispatch to main().
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('--from_id', type=int, default=0,
                            help='start index to file list')
    arg_parser.add_argument('--to_id', type=int, default=len(file_list),
                            help='end index to file list')
    arg_parser.add_argument('--fold_size', type=int, default=11,
                            help='fold_size start from index 1')
    # Parse CLI arguments and run the imputation pipeline.
    main(arg_parser.parse_args())
'''Code Finished'''