def gain(data_x, feature_name, onehotencoder, ori_data_dim, gain_parameters):
    '''Impute missing values in data_x with a GAIN network (TF1 graph mode).

    Args:
      - data_x: original data with missing values (NaN marks a missing cell)
      - feature_name: feature name list of the original data
      - onehotencoder: one-hot encoder fitted on this data
      - ori_data_dim: dimensions of the original (pre-encoding) data
      - gain_parameters: GAIN network parameters:
        - data_name: file name of the dataset (used to build the checkpoint path)
        - batch_size: Batch size
        - hint_rate: Hint rate
        - alpha: Hyperparameter weighting the reconstruction (MSE) loss
        - iterations: Iterations
        - onehot: the number of features for the one-hot encoder (starting from the first feature)
        - predict: when True, restore a saved model instead of training (if a checkpoint exists)

    Returns:
      - imputed_data: imputed data
    '''
    # Mask matrix: 1 = observed, 0 = missing.
    data_m = 1 - np.isnan(data_x)

    # System parameters
    data_name = gain_parameters['data_name']
    batch_size = gain_parameters['batch_size']
    hint_rate = gain_parameters['hint_rate']
    alpha = gain_parameters['alpha']
    iterations = gain_parameters['iterations']
    onehot = gain_parameters['onehot']
    predict = gain_parameters['predict']

    # Checkpoint path used for save/restore below.
    model_path = 'model/' + data_name

    # Other parameters
    no, dim = data_x.shape

    # Hidden state dimensions (same width as the data)
    h_dim = int(dim)

    # Normalization; missing entries become 0 after nan_to_num.
    norm_data, norm_parameters = normalization(data_x)
    # NOTE(review): the second positional argument of np.nan_to_num is `copy`,
    # not the fill value. NaNs are still replaced by 0.0 (the default), but
    # passing 0 here means copy=False, i.e. `norm_data` is modified in place —
    # confirm this aliasing is intended.
    norm_data_x = np.nan_to_num(norm_data, 0)

    ## GAIN architecture
    # Input placeholders
    # Data vector
    X = tf.placeholder(tf.float32, shape=[None, dim], name='X')
    # Mask vector
    M = tf.placeholder(tf.float32, shape=[None, dim], name='M')
    # Hint vector
    H = tf.placeholder(tf.float32, shape=[None, dim], name='H')

    # Discriminator variables
    D_W1 = tf.Variable(xavier_init([dim * 2, h_dim]), name='D_W1')  # Data + Hint as inputs
    D_b1 = tf.Variable(tf.zeros(shape=[h_dim]), name='D_b1')
    D_W2 = tf.Variable(xavier_init([h_dim, h_dim]), name='D_W2')
    D_b2 = tf.Variable(tf.zeros(shape=[h_dim]), name='D_b2')
    D_W3 = tf.Variable(xavier_init([h_dim, dim]), name='D_W3')
    D_b3 = tf.Variable(tf.zeros(shape=[dim]), name='D_b3')  # Multi-variate outputs
    theta_D = [D_W1, D_W2, D_W3, D_b1, D_b2, D_b3]

    # Generator variables
    # Data + Mask as inputs (random noise fills the missing components)
    G_W1 = tf.Variable(xavier_init([dim * 2, h_dim]), name='G_W1')
    G_b1 = tf.Variable(tf.zeros(shape=[h_dim]), name='G_b1')
    G_W2 = tf.Variable(xavier_init([h_dim, h_dim]), name='G_W2')
    G_b2 = tf.Variable(tf.zeros(shape=[h_dim]), name='G_b2')
    G_W3 = tf.Variable(xavier_init([h_dim, dim]), name='G_W3')
    G_b3 = tf.Variable(tf.zeros(shape=[dim]), name='G_b3')
    theta_G = [G_W1, G_W2, G_W3, G_b1, G_b2, G_b3]

    ## GAIN functions
    # Generator: maps (data, mask) -> reconstruction in [0, 1] (sigmoid output).
    def generator(x, m):
        # Concatenate Mask and Data
        inputs = tf.concat(values=[x, m], axis=1)
        G_h1 = tf.nn.relu(tf.matmul(inputs, G_W1) + G_b1)
        G_h2 = tf.nn.relu(tf.matmul(G_h1, G_W2) + G_b2)
        # MinMax normalized output
        G_prob = tf.nn.sigmoid(tf.matmul(G_h2, G_W3) + G_b3)
        return G_prob

    # Discriminator: maps (imputed data, hint) -> per-feature probability of
    # being an observed (real) component.
    def discriminator(x, h):
        # Concatenate Data and Hint
        inputs = tf.concat(values=[x, h], axis=1)
        D_h1 = tf.nn.relu(tf.matmul(inputs, D_W1) + D_b1)
        D_h2 = tf.nn.relu(tf.matmul(D_h1, D_W2) + D_b2)
        D_logit = tf.matmul(D_h2, D_W3) + D_b3
        D_prob = tf.nn.sigmoid(D_logit)
        return D_prob

    ## GAIN structure
    # Generator
    G_sample = generator(X, M)
    # Combine with observed data: keep observed cells, use generated ones elsewhere.
    Hat_X = X * M + G_sample * (1 - M)
    # Discriminator
    D_prob = discriminator(Hat_X, H)

    ## GAIN loss
    # Cross-entropy over observed (M=1, should score 1) vs imputed (M=0, should score 0).
    D_loss_temp = -tf.reduce_mean(M * tf.log(D_prob + 1e-8) \
                                  + (1-M) * tf.log(1. - D_prob + 1e-8))
    # Generator tries to make imputed components look observed.
    G_loss_temp = -tf.reduce_mean((1 - M) * tf.log(D_prob + 1e-8))
    # Reconstruction error on observed components only.
    MSE_loss = \
        tf.reduce_mean((M * X - M * G_sample)**2) / tf.reduce_mean(M)
    D_loss = D_loss_temp
    G_loss = G_loss_temp + alpha * MSE_loss

    ## GAIN solver
    D_solver = tf.train.AdamOptimizer().minimize(D_loss, var_list=theta_D)
    G_solver = tf.train.AdamOptimizer().minimize(G_loss, var_list=theta_G)

    ## Iterations
    sess = tf.Session()
    saver = tf.train.Saver()
    # NOTE(review): reconstructed control flow — training runs only when no
    # checkpoint was restored; confirm against the upstream source.
    if predict is True and os.path.exists(model_path + '.ckpt.meta'):
        print("Model Restore")
        saver.restore(sess, model_path + '.ckpt')
    else:
        sess.run(tf.global_variables_initializer())
        # Start Iterations
        for it in tqdm(range(iterations)):
            # Sample batch
            batch_idx = sample_batch_index(no, batch_size)
            X_mb = norm_data_x[batch_idx, :]
            M_mb = data_m[batch_idx, :]
            # Sample random vectors (noise for the missing components)
            Z_mb = uniform_sampler(0, 0.01, batch_size, dim)
            # Sample hint vectors: reveal a subset of the mask to D.
            H_mb_temp = binary_sampler(hint_rate, batch_size, dim)
            H_mb = M_mb * H_mb_temp
            # Combine random vectors with observed vectors
            X_mb = M_mb * X_mb + (1 - M_mb) * Z_mb

            _, D_loss_curr = sess.run([D_solver, D_loss_temp],
                                      feed_dict={
                                          M: M_mb,
                                          X: X_mb,
                                          H: H_mb
                                      })
            _, G_loss_curr, MSE_loss_curr = \
                sess.run([G_solver, G_loss_temp, MSE_loss],
                         feed_dict = {X: X_mb, M: M_mb, H: H_mb})

    # Persist the trained model when in training mode.
    if predict is False:
        save_path = saver.save(sess, model_path + '.ckpt')

    ## Return imputed data
    Z_mb = uniform_sampler(0, 0.01, no, dim)
    M_mb = data_m
    X_mb = norm_data_x
    X_mb = M_mb * X_mb + (1 - M_mb) * Z_mb
    imputed_data = sess.run([G_sample], feed_dict={X: X_mb, M: M_mb})[0]
    # Keep observed values; only fill the missing cells with generator output.
    imputed_data = data_m * norm_data_x + (1 - data_m) * imputed_data
    # Renormalization (undo the min-max scaling)
    imputed_data = renormalization(imputed_data, norm_parameters)
    # Rounding (restore integer-valued columns)
    imputed_data = rounding(imputed_data, data_x)
    # Reverse encoding: map one-hot columns back to the original categorical features.
    if onehot > 0:
        imputed_data = reverse_encoding(imputed_data, feature_name, onehotencoder,
                                        onehot, ori_data_dim)
    return imputed_data
def cph(data_x, cph_parameters, data_image):
    # Fix all RNG seeds for reproducibility (done before graph construction).
    seed = 25
    random.seed(seed)
    np.random.seed(seed)
    tf.set_random_seed(seed)
    # NOTE(review): the string below is not a real docstring (statements
    # precede it); kept in place to preserve statement order.
    '''Impute missing values in data_x

    Args:
      - data_x: original data with missing values
      - parameters: CPH network parameters:
        - batch_size: Batch size
        - hint_rate: Hint rate
        - alpha: Hyperparameter
        - iterations: Iterations

    Returns:
      - imputed_data: imputed data
    '''
    # Mask matrix: 1 = observed, 0 = missing.
    data_m = 1 - np.isnan(data_x)

    # System parameters
    batch_size = cph_parameters['batch_size']
    hint_rate = cph_parameters['hint_rate']
    alpha = cph_parameters['alpha']
    iterations = cph_parameters['iterations']

    # Other parameters
    no, dim = data_x.shape

    # Hidden state dimensions
    h_dim = int(dim)
    #print(h_dim)

    # Normalization
    norm_data, norm_parameters = normalization(data_x)
    #norm_data_x = np.nan_to_num(norm_data, 0)
    # NOTE(review): the normalized data is computed but the raw data is used
    # here (see the commented-out line above) — confirm this is intentional;
    # renormalization at the end is also commented out, consistent with it.
    norm_data_x = np.nan_to_num(data_x, 0)

    ## CPH architecture
    # Input placeholders
    # 4-D input: [1, 483, dim, 3]. NOTE(review): 483 rows and 3 channels are
    # hard-coded; presumably tied to this dataset's shape — confirm.
    X_pre = tf.placeholder(tf.float32, shape=[1, 483, dim, 3])  # Data vector
    #X = tf.placeholder(tf.float32, shape = [None, dim])
    # Mask vector
    M = tf.placeholder(tf.float32, shape=[None, dim])
    # Hint vector
    H = tf.placeholder(tf.float32, shape=[None, dim])

    # Discriminator variables
    D_W1 = tf.Variable(xavier_init([dim * 2, h_dim]))  # Data + Hint as inputs
    D_b1 = tf.Variable(tf.zeros(shape=[h_dim]))
    D_W2 = tf.Variable(xavier_init([h_dim, h_dim]))
    D_b2 = tf.Variable(tf.zeros(shape=[h_dim]))
    D_W3 = tf.Variable(xavier_init([h_dim, dim]))
    D_b3 = tf.Variable(tf.zeros(shape=[dim]))  # Multi-variate outputs
    theta_D = [D_W1, D_W2, D_W3, D_b1, D_b2, D_b3]

    # Generator variables
    # Convolutional front-end filters (1x4 kernels).
    conv_filter_w1 = tf.Variable(tf.random_normal([1, 4, 3, 3]))
    conv_filter_b1 = tf.Variable(tf.random_normal([3]))
    conv_filter_w2 = tf.Variable(tf.random_normal([1, 4, 3, 1]))
    conv_filter_b2 = tf.Variable(tf.random_normal([1]))
    # Data + Mask as inputs (random noise is in missing components)
    G_W1 = tf.Variable(xavier_init([dim * 2, h_dim]))
    G_b1 = tf.Variable(tf.zeros(shape=[h_dim]))
    G_W2 = tf.Variable(xavier_init([h_dim, h_dim]))
    G_b2 = tf.Variable(tf.zeros(shape=[h_dim]))
    G_W3 = tf.Variable(xavier_init([h_dim, dim]))
    G_b3 = tf.Variable(tf.zeros(shape=[dim]))
    theta_G = [
        G_W1, G_W2, G_W3, G_b1, G_b2, G_b3, conv_filter_w1, conv_filter_b1,
        conv_filter_w2, conv_filter_b2
    ]

    ## CPH functions
    # CNN + Generator: two conv/max-pool stages over the 4-D input, flattened
    # back to [483, dim] before the fully-connected generator layers.
    def generator(x, m):
        relu_feature_maps1 = tf.nn.relu( \
            tf.nn.conv2d(x, conv_filter_w1, strides=[1, 1, 1, 1],
                         padding='SAME') + conv_filter_b1)
        max_pool1 = tf.nn.max_pool(relu_feature_maps1,
                                   ksize=[1, 1, 4, 1],
                                   strides=[1, 1, 1, 1],
                                   padding='SAME')
        relu_feature_maps2 = tf.nn.relu( \
            tf.nn.conv2d(max_pool1, conv_filter_w2, strides=[1, 1, 1, 1],
                         padding='SAME') + conv_filter_b2)
        max_pool2 = tf.nn.max_pool(relu_feature_maps2,
                                   ksize=[1, 1, 4, 1],
                                   strides=[1, 1, 1, 1],
                                   padding='SAME')
        # Drop batch/channel dims: [1, 483, dim, 1] -> [483, dim].
        x2 = tf.reshape(max_pool2, [483, dim])
        # Concatenate Mask and Data
        inputs = tf.concat(values=[x2, m], axis=1)
        G_h1 = tf.nn.relu(tf.matmul(inputs, G_W1) + G_b1)
        G_h2 = tf.nn.relu(tf.matmul(G_h1, G_W2) + G_b2)
        # MinMax normalized output
        G_prob = tf.nn.sigmoid(tf.matmul(G_h2, G_W3) + G_b3)
        return G_prob

    # Discriminator: (imputed data, hint) -> per-feature realness probability.
    def discriminator(x, h):
        # Concatenate Data and Hint
        inputs = tf.concat(values=[x, h], axis=1)
        D_h1 = tf.nn.relu(tf.matmul(inputs, D_W1) + D_b1)
        D_h2 = tf.nn.relu(tf.matmul(D_h1, D_W2) + D_b2)
        D_logit = tf.matmul(D_h2, D_W3) + D_b3
        D_prob = tf.nn.sigmoid(D_logit)
        return D_prob

    ## CPH structure
    # Generator
    G_sample = generator(X_pre, M)
    # Channel 0 of the 4-D input is the data plane.
    X2 = X_pre[0, :, :, 0]
    # Combine with observed data
    Hat_X = X2 * M + G_sample * (1 - M)
    # Discriminator
    D_prob = discriminator(Hat_X, H)

    ## CPH loss
    D_loss_temp = -tf.reduce_mean(M * tf.log(D_prob + 1e-8) \
                                  + (1-M) * tf.log(1. - D_prob + 1e-8))
    G_loss_temp = -tf.reduce_mean((1 - M) * tf.log(D_prob + 1e-8))
    # Reconstruction error on observed components only.
    MSE_loss = \
        tf.reduce_mean((M * X2 - M * G_sample)**2) / tf.reduce_mean(M)
    D_loss = D_loss_temp
    G_loss = G_loss_temp + alpha * MSE_loss

    ## CPH solver
    D_solver = tf.train.AdamOptimizer().minimize(D_loss, var_list=theta_D)
    G_solver = tf.train.AdamOptimizer().minimize(G_loss, var_list=theta_G)

    ## Iterations
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    # Start Iterations
    for it in tqdm(range(iterations)):
        # Sample batch
        batch_idx = sample_batch_index(no, batch_size)
        #print(len(batch_idx))
        # NOTE(review): X_pre expects 483 rows, so batch_size is presumably
        # 483 for this dataset — confirm against the caller.
        image_mb = data_image[:, batch_idx, :, :]
        X_mb = norm_data_x[batch_idx, :]
        M_mb = data_m[batch_idx, :]
        # Sample random vectors
        Z_mb = uniform_sampler(0, 0.01, batch_size, dim)
        # Sample hint vectors
        H_mb_temp = binary_sampler(hint_rate, batch_size, dim)
        H_mb = M_mb * H_mb_temp
        # Combine random vectors with observed vectors
        X_mb = M_mb * X_mb + (1 - M_mb) * Z_mb
        # Write the (noised) data batch into channel 0 of the image batch.
        image_mb[0, :, :, 0] = X_mb

        _, D_loss_curr = sess.run([D_solver, D_loss_temp],
                                  feed_dict={
                                      M: M_mb,
                                      X_pre: image_mb,
                                      H: H_mb
                                  })
        _, G_loss_curr, MSE_loss_curr = \
            sess.run([G_solver, G_loss_temp, MSE_loss],
                     feed_dict = {X_pre: image_mb, M: M_mb, H: H_mb})

    ## Return imputed data
    Z_mb = uniform_sampler(0, 0.01, no, dim)
    M_mb = data_m
    X_mb = norm_data_x
    X_mb = M_mb * X_mb + (1 - M_mb) * Z_mb
    # NOTE(review): this aliases (does not copy) data_image; the assignment
    # below mutates the caller's array in place — confirm this is acceptable.
    image_mb = data_image
    image_mb[0, :, :, 0] = X_mb
    imputed_data = sess.run([G_sample], feed_dict={
        X_pre: image_mb,
        M: M_mb
    })[0]
    # Keep observed values; only fill missing cells with generator output.
    imputed_data = data_m * norm_data_x + (1 - data_m) * imputed_data
    # Renormalization (disabled — raw data was used above, not normalized data)
    #imputed_data = renormalization(imputed_data, norm_parameters)
    # Rounding
    imputed_data = rounding(imputed_data, data_x)
    return imputed_data
def PC_GAIN (incomplete_data_x , gain_parameters , data_m):
    '''Impute missing values in incomplete_data_x with PC-GAIN
    (GAIN plus pseudo-label supervision from clustering + an SVM classifier).

    Args:
      - incomplete_data_x: original data with missing values
      - gain_parameters: PC_GAIN network parameters:
        - batch_size: Batch size, 64
        - hint_rate: Hint rate, 0.9
        - alpha: Hyperparameter, 200
        - beta: Hyperparameter, 20 (weight of the pseudo-label loss)
        - lambda_: Hyperparameter, 0.2 (fraction of low-missingness rows used for pre-training)
        - k: Hyperparameter, 4 (number of clusters / pseudo-classes)
        - iterations: Iterations, 10000
      - data_m: mask matrix (1 = observed, 0 = missing)

    Returns:
      - imputed_data: imputed data
    '''
    # System parameters
    batch_size = gain_parameters['batch_size']
    hint_rate = gain_parameters['hint_rate']
    alpha = gain_parameters['alpha']
    beta = gain_parameters['beta']
    lambda_ = gain_parameters['lambda_']
    k = gain_parameters['k']
    iterations = gain_parameters['iterations']
    cluster_species = gain_parameters['cluster_species']

    # Other parameters
    no, dim = incomplete_data_x.shape

    # Hidden state dimensions
    h_dim = int(dim)

    # Normalization
    norm_data , norm_parameters = normalization(incomplete_data_x)
    # NOTE(review): second positional arg of np.nan_to_num is `copy`; NaNs are
    # still filled with the default 0.0, but this operates in place — confirm.
    norm_data_x = np.nan_to_num(norm_data, 0)

    ## PC_GAIN architecture
    X = tf.placeholder(tf.float32, shape = [None, dim])  # Data vector
    M = tf.placeholder(tf.float32, shape = [None, dim])  # Mask vector
    H = tf.placeholder(tf.float32, shape = [None, dim])  # Hint vector
    # NOTE(review): Z is declared but never fed or used below.
    Z = tf.placeholder(tf.float32, shape = [None, dim])
    Y = tf.placeholder(tf.float32, shape = [None, k])    # Pseudo-label probabilities

    # Discriminator variables
    D_W1 = tf.Variable(xavier_init([dim*2, h_dim]))  # Data + Hint as inputs
    D_b1 = tf.Variable(tf.zeros(shape = [h_dim]))
    D_W2 = tf.Variable(xavier_init([h_dim, h_dim]))
    D_b2 = tf.Variable(tf.zeros(shape = [h_dim]))
    D_W3 = tf.Variable(xavier_init([h_dim, dim]))
    D_b3 = tf.Variable(tf.zeros(shape = [dim]))  # Multi-variate outputs
    theta_D = [D_W1, D_W2, D_W3, D_b1, D_b2, D_b3]

    # Generator variables
    # Data + Mask as inputs (random noise is in missing components)
    G_W1 = tf.Variable(xavier_init([dim*2, h_dim]))
    G_b1 = tf.Variable(tf.zeros(shape = [h_dim]))
    G_W2 = tf.Variable(xavier_init([h_dim, h_dim]))
    G_b2 = tf.Variable(tf.zeros(shape = [h_dim]))
    G_W3 = tf.Variable(xavier_init([h_dim, dim]))
    G_b3 = tf.Variable(tf.zeros(shape = [dim]))
    theta_G = [G_W1, G_W2, G_W3, G_b1, G_b2, G_b3]

    # Classifier variables (neural-network classifier alternative; see classer()).
    C_W1 = tf.Variable(xavier_init([dim, h_dim]))
    C_b1 = tf.Variable(tf.zeros(shape = [h_dim]))
    C_W2 = tf.Variable(xavier_init([h_dim, h_dim]))
    C_b2 = tf.Variable(tf.zeros(shape = [h_dim]))
    C_W3 = tf.Variable(xavier_init([h_dim, k]))
    C_b3 = tf.Variable(tf.zeros(shape = [k]))  # Classifier
    theta_C = [C_W1, C_b1, C_W2, C_b2, C_W3, C_b3]

    ## PC_GAIN functions
    # Generator
    def generator(x,m):
        # Concatenate Mask and Data
        inputs = tf.concat(values = [x, m], axis = 1)
        G_h1 = tf.nn.relu(tf.matmul(inputs, G_W1) + G_b1)
        G_h2 = tf.nn.relu(tf.matmul(G_h1, G_W2) + G_b2)
        # MinMax normalized output
        G_prob = tf.nn.sigmoid(tf.matmul(G_h2, G_W3) + G_b3)
        return G_prob

    # Discriminator (also returns the pre-sigmoid logits)
    def discriminator(x, h):
        # Concatenate Data and Hint
        inputs = tf.concat(values = [x, h], axis = 1)
        D_h1 = tf.nn.relu(tf.matmul(inputs, D_W1) + D_b1)
        D_h2 = tf.nn.relu(tf.matmul(D_h1, D_W2) + D_b2)
        D_logit = tf.matmul(D_h2, D_W3) + D_b3
        D_prob = tf.nn.sigmoid(D_logit)
        return D_prob, D_logit

    # Classer (neural network classifier mentioned in the paper).
    # NOTE(review): defined (with theta_C) but not used below — the SVM path
    # is used instead.
    def classer(feature):
        C_h1 = tf.nn.relu(tf.matmul(feature, C_W1) + C_b1)
        C_h2 = tf.nn.relu(tf.matmul(C_h1, C_W2) + C_b2)
        C_h3 = tf.matmul(C_h2, C_W3) + C_b3
        C_prob = tf.nn.softmax(C_h3)
        return C_prob

    ## PC_GAIN structure
    # Generator
    G_sample = generator(X, M)
    # Combine with observed data
    Hat_X = X * M + G_sample * (1-M)
    # Discriminator
    D_prob, D_logit = discriminator(Hat_X, H)

    ## PC_GAIN loss
    D_loss_temp = -tf.reduce_mean(M * tf.log(D_prob + 1e-8) \
                                  + (1-M) * tf.log(1. - D_prob + 1e-8))
    G_loss_temp = -tf.reduce_mean((1-M) * tf.log(D_prob + 1e-8))
    # NOTE(review): this is the entropy of the fed-in pseudo-label
    # distribution Y; since Y is a placeholder computed outside the graph,
    # this term has zero gradient w.r.t. theta_G — possibly
    # classer(G_sample) or similar was intended. Confirm against the paper.
    G_loss_with_C = -tf.reduce_mean(Y * tf.log(Y + 1e-8))
    MSE_loss = tf.reduce_mean((M * X - M * G_sample) * (M * X - M * G_sample)) / tf.reduce_mean(M)
    D_loss = D_loss_temp
    G_loss_pre = G_loss_temp + alpha * MSE_loss
    G_loss = G_loss_temp + alpha * MSE_loss + beta * G_loss_with_C

    ## PC_GAIN solver
    D_solver = tf.train.AdamOptimizer().minimize(D_loss, var_list=theta_D)
    G_solver_pre = tf.train.AdamOptimizer().minimize(G_loss_pre, var_list=theta_G)
    G_solver = tf.train.AdamOptimizer().minimize(G_loss, var_list=theta_G)

    ## Iterations
    init = tf.global_variables_initializer()
    with tf.Session() as sess:
        sess.run(init)

        ## Select pre-training data: rows with the fewest missing values.
        loss_rate = []
        for i in range(no):
            index = 0
            for j in range(dim):
                if data_m[i,j] == 0:
                    index = index + 1
            loss_rate.append([index , i])  # [missing count, row index]
        loss_rate = sorted(loss_rate,key=(lambda x:x[0]))
        no_x_L = int(no * lambda_)  # number of low-missingness rows to keep
        index_x_L = []
        for i in range(no_x_L):
            index_x_L.append(loss_rate[i][1])
        norm_data_x_L = norm_data_x[index_x_L, :]
        data_m_L = data_m[index_x_L, :]

        ## Pre-training: plain GAIN on the low-missingness subset (70% of iterations).
        print('...Pre-training')
        for it in tqdm(range(int(iterations * 0.7))):
            batch_idx = sample_batch_index(no_x_L, batch_size)
            X_mb = norm_data_x_L[batch_idx, :]
            M_mb = data_m_L[batch_idx, :]
            Z_mb = uniform_sampler(0, 0.01, batch_size, dim)
            H_mb_temp = binary_sampler(hint_rate, batch_size, dim)
            H_mb = M_mb * H_mb_temp
            X_mb = M_mb * X_mb + (1-M_mb) * Z_mb
            _, D_loss_curr, D_logit_curr, D_prob_curr = sess.run(
                [D_solver, D_loss_temp, D_logit, D_prob],
                feed_dict = {M: M_mb, X: X_mb, H:H_mb})
            _, G_loss_curr, MSE_loss_curr = sess.run(
                [G_solver_pre, G_loss_temp, MSE_loss],
                feed_dict = {X: X_mb, M: M_mb, H:H_mb})

        # Impute the pre-training subset with the pre-trained generator.
        Z_mb = uniform_sampler(0, 0.01, no_x_L, dim)
        M_mb = data_m_L
        X_mb = norm_data_x_L
        X_mb = M_mb * X_mb + (1-M_mb) * Z_mb
        imputed_data_L = sess.run([G_sample], feed_dict = {X: X_mb, M: M_mb})[0]
        imputed_data_L = data_m_L * norm_data_x_L + (1 - data_m_L) * imputed_data_L

        ## Select different clustering methods (pseudo-labels for supervision).
        if cluster_species == 'KM':
            data_c , data_class = KM(imputed_data_L, k)
        elif cluster_species == 'SC':
            data_c , data_class = SC(imputed_data_L, k)
        elif cluster_species == 'AC':
            data_c , data_class = AC(imputed_data_L, k)
        elif cluster_species == 'KMPP':
            data_c , data_class = KMPP(imputed_data_L, k)
        else:
            exit('have not this cluster methods')

        ## Pseudo-label training multi-classification SVM
        ## You can also choose other classifiers,
        ## such as the neural network classifier mentioned in the paper
        coder = preprocessing.OneHotEncoder()
        model = svm.SVC(kernel="linear", decision_function_shape="ovo")
        coder.fit(data_class.reshape(-1,1))
        model.fit(imputed_data_L, data_class)

        ## Update the generator G and the discriminator D on the full data.
        ## To avoid the effects of pre-training,
        ## you can also choose to reinitialize the generator parameters
        for it in tqdm(range(iterations)):
            batch_idx = sample_batch_index(no, batch_size)
            X_mb = norm_data_x[batch_idx, :]
            M_mb = data_m[batch_idx, :]
            Z_mb = uniform_sampler(0, 0.01, batch_size, dim)
            H_mb_temp = binary_sampler(hint_rate, batch_size, dim)
            H_mb = M_mb * H_mb_temp
            X_mb = M_mb * X_mb + (1-M_mb) * Z_mb
            _, D_loss_curr, D_logit_curr, D_prob_curr = sess.run(
                [D_solver, D_loss_temp, D_logit, D_prob],
                feed_dict = {M: M_mb, X: X_mb, H:H_mb})
            ## Introducing pseudo-label supervision: classify the current
            ## imputations with the SVM and feed the one-hot probabilities as Y.
            Hat_X_curr = sess.run(Hat_X, feed_dict = {X: X_mb, M: M_mb, H:H_mb})
            y_pred = model.predict(Hat_X_curr)
            sample_prob = coder.transform(y_pred.reshape(-1,1)).toarray()
            _, G_loss_curr, MSE_loss_curr , G_loss_with_C_curr = sess.run(
                [G_solver, G_loss_temp, MSE_loss, G_loss_with_C],
                feed_dict = {X: X_mb, M: M_mb, H:H_mb , Y:sample_prob})

        ## Return imputed data
        Z_mb = uniform_sampler(0, 0.01, no, dim)
        M_mb = data_m
        X_mb = norm_data_x
        X_mb = M_mb * X_mb + (1-M_mb) * Z_mb
        imputed_data = sess.run([G_sample], feed_dict = {X: X_mb, M: M_mb})[0]
        # Keep observed values; only fill missing cells with generator output.
        imputed_data = data_m * norm_data_x + (1-data_m) * imputed_data
        # Renormalization (undo min-max scaling)
        imputed_data = renormalization(imputed_data, norm_parameters)
        return imputed_data
def gain(data_x, gain_parameters):
    '''Impute missing values in data_x with a GAIN network (TF1 graph mode).

    Args:
      - data_x: original data with missing values (NaN marks a missing cell)
      - gain_parameters: GAIN network parameters:
        - batch_size: Batch size
        - hint_rate: Hint rate
        - alpha: Hyperparameter weighting the reconstruction (MSE) loss
        - iterations: Iterations

    Returns:
      - imputed_data: imputed data
    '''
    # Mask matrix: 1 = observed, 0 = missing.
    data_m = 1 - np.isnan(data_x)

    # System parameters
    batch_size = gain_parameters['batch_size']
    hint_rate = gain_parameters['hint_rate']
    alpha = gain_parameters['alpha']
    iterations = gain_parameters['iterations']

    # Other parameters
    no, dim = data_x.shape

    # Hidden state dimensions
    h_dim = int(dim)

    # Normalization.
    # FIX: the original called np.nan_to_num(norm_data, 0), which passes 0 to
    # the `copy` parameter (second positional), not the NaN fill value — it
    # only worked because the default fill is 0.0, and it silently mutated
    # norm_data in place. Name the fill value explicitly and leave norm_data
    # untouched (matches the sibling implementation that uses nan=0).
    norm_data, norm_parameters = normalization(data_x)
    norm_data_x = np.nan_to_num(norm_data, nan=0.0)

    ## GAIN architecture
    # Input placeholders
    # Data vector
    X = tf.placeholder(tf.float32, shape=[None, dim])
    # Mask vector
    M = tf.placeholder(tf.float32, shape=[None, dim])
    # Hint vector
    H = tf.placeholder(tf.float32, shape=[None, dim])

    # Discriminator variables
    D_W1 = tf.Variable(xavier_init([dim * 2, h_dim]))  # Data + Hint as inputs
    D_b1 = tf.Variable(tf.zeros(shape=[h_dim]))
    D_W2 = tf.Variable(xavier_init([h_dim, h_dim]))
    D_b2 = tf.Variable(tf.zeros(shape=[h_dim]))
    D_W3 = tf.Variable(xavier_init([h_dim, dim]))
    D_b3 = tf.Variable(tf.zeros(shape=[dim]))  # Multi-variate outputs
    theta_D = [D_W1, D_W2, D_W3, D_b1, D_b2, D_b3]

    # Generator variables
    # Data + Mask as inputs (random noise fills the missing components)
    G_W1 = tf.Variable(xavier_init([dim * 2, h_dim]))
    G_b1 = tf.Variable(tf.zeros(shape=[h_dim]))
    G_W2 = tf.Variable(xavier_init([h_dim, h_dim]))
    G_b2 = tf.Variable(tf.zeros(shape=[h_dim]))
    G_W3 = tf.Variable(xavier_init([h_dim, dim]))
    G_b3 = tf.Variable(tf.zeros(shape=[dim]))
    theta_G = [G_W1, G_W2, G_W3, G_b1, G_b2, G_b3]

    ## GAIN functions
    # Generator: maps (data, mask) -> reconstruction in [0, 1] (sigmoid output).
    def generator(x, m):
        # Concatenate Mask and Data
        inputs = tf.concat(values=[x, m], axis=1)
        G_h1 = tf.nn.relu(tf.matmul(inputs, G_W1) + G_b1)
        G_h2 = tf.nn.relu(tf.matmul(G_h1, G_W2) + G_b2)
        # MinMax normalized output
        G_prob = tf.nn.sigmoid(tf.matmul(G_h2, G_W3) + G_b3)
        return G_prob

    # Discriminator: maps (imputed data, hint) -> per-feature probability of
    # being an observed (real) component.
    def discriminator(x, h):
        # Concatenate Data and Hint
        inputs = tf.concat(values=[x, h], axis=1)
        D_h1 = tf.nn.relu(tf.matmul(inputs, D_W1) + D_b1)
        D_h2 = tf.nn.relu(tf.matmul(D_h1, D_W2) + D_b2)
        D_logit = tf.matmul(D_h2, D_W3) + D_b3
        D_prob = tf.nn.sigmoid(D_logit)
        return D_prob

    ## GAIN structure
    # Generator
    G_sample = generator(X, M)
    # Combine with observed data: keep observed cells, generated ones elsewhere.
    Hat_X = X * M + G_sample * (1 - M)
    # Discriminator
    D_prob = discriminator(Hat_X, H)

    ## GAIN loss
    # Cross-entropy: observed cells (M=1) should score 1, imputed cells 0.
    D_loss_temp = -tf.reduce_mean(M * tf.log(D_prob + 1e-8) \
                                  + (1-M) * tf.log(1. - D_prob + 1e-8))
    # Generator tries to make imputed components look observed.
    G_loss_temp = -tf.reduce_mean((1 - M) * tf.log(D_prob + 1e-8))
    # Reconstruction error on observed components only.
    MSE_loss = \
        tf.reduce_mean((M * X - M * G_sample)**2) / tf.reduce_mean(M)
    D_loss = D_loss_temp
    G_loss = G_loss_temp + alpha * MSE_loss

    ## GAIN solver
    D_solver = tf.train.AdamOptimizer().minimize(D_loss, var_list=theta_D)
    G_solver = tf.train.AdamOptimizer().minimize(G_loss, var_list=theta_G)

    ## Iterations
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    # Start Iterations
    for it in tqdm(range(iterations)):
        # Sample batch
        batch_idx = sample_batch_index(no, batch_size)
        X_mb = norm_data_x[batch_idx, :]
        M_mb = data_m[batch_idx, :]
        # Sample random vectors (noise for missing components)
        Z_mb = uniform_sampler(0, 0.01, batch_size, dim)
        # Sample hint vectors: reveal a subset of the mask to D.
        H_mb_temp = binary_sampler(hint_rate, batch_size, dim)
        H_mb = M_mb * H_mb_temp
        # Combine random vectors with observed vectors
        X_mb = M_mb * X_mb + (1 - M_mb) * Z_mb

        _, D_loss_curr = sess.run([D_solver, D_loss_temp],
                                  feed_dict={
                                      M: M_mb,
                                      X: X_mb,
                                      H: H_mb
                                  })
        _, G_loss_curr, MSE_loss_curr = \
            sess.run([G_solver, G_loss_temp, MSE_loss],
                     feed_dict = {X: X_mb, M: M_mb, H: H_mb})

    ## Return imputed data
    Z_mb = uniform_sampler(0, 0.01, no, dim)
    M_mb = data_m
    X_mb = norm_data_x
    X_mb = M_mb * X_mb + (1 - M_mb) * Z_mb
    imputed_data = sess.run([G_sample], feed_dict={X: X_mb, M: M_mb})[0]
    # Keep observed values; only fill missing cells with generator output.
    imputed_data = data_m * norm_data_x + (1 - data_m) * imputed_data
    # Renormalization (undo the min-max scaling)
    imputed_data = renormalization(imputed_data, norm_parameters)
    # Rounding (restore integer-valued columns)
    imputed_data = rounding(imputed_data, data_x)
    return imputed_data
def egain(miss_data_x, gain_parameters):
    #def Egain(miss_data_x, gain_parameters):
    '''Impute missing values in miss_data_x with an evolutionary GAIN
    (E-GAN-style: a population of generator candidates is mutated with three
    different adversarial losses and selected by a fitness score).

    Args:
      - miss_data_x: missing data (NaN marks a missing cell)
      - gain_parameters: GAIN network parameters:
        - batch_size: Batch size
        - alpha: Hyperparameter weighting the reconstruction (MSE) loss
        - iterations: Iterations

    Returns:
      - imputed_data: imputed data
    '''
    # Mask matrix: 1 = observed, 0 = missing.
    m = 1 - np.isnan(miss_data_x)

    # System parameters
    batch_size = gain_parameters['batch_size']
    # hint_rate = gain_parameters['hint_rate']
    alpha = gain_parameters['alpha']
    iterations = gain_parameters['iterations']
    # Mutation losses tried for each candidate generator per iteration.
    loss_type = ['trickLogD', 'minimax', 'ls']
    nloss = 3
    beta = 1.0    # weight of the diversity term in the fitness score
    ncandi = 1    #1#3  population size
    nbest = 1     #1#3  number of best generators kept
    nD = 1        # # of discrim updates for each gen update

    # Other parameters
    no, dim = miss_data_x.shape

    # Hidden state dimensions
    h_dim = int(dim)

    # Normalization
    norm_data, norm_parameters = normalization(miss_data_x)
    # NOTE(review): second positional arg of np.nan_to_num is `copy`; NaNs are
    # still filled with the default 0.0, but this operates in place — confirm.
    norm_data_x = np.nan_to_num(norm_data, 0)

    ## GAIN architecture
    #tf.reset_default_graph()
    # NOTE(review): this call's return value is discarded — it does not reset
    # the graph; presumably tf.compat.v1.reset_default_graph() was intended
    # (see the commented-out line above). Confirm.
    tf.compat.v1.get_default_graph()

    # Input placeholders
    # Data vector
    X = tf1.placeholder(tf.float32, shape=[None, dim])
    # Mask vector
    M = tf1.placeholder(tf.float32, shape=[None, dim])
    # B vector (random reveal pattern used to build the hint)
    B = tf1.placeholder(tf.float32, shape=[None, dim])

    # Discriminator variables
    D_W1 = tf.Variable(xavier_init([dim * 2, h_dim]))  # Data + Hint as inputs
    D_b1 = tf.Variable(tf.zeros(shape=[h_dim]))
    D_W2 = tf.Variable(xavier_init([h_dim, h_dim]))
    D_b2 = tf.Variable(tf.zeros(shape=[h_dim]))
    D_W3 = tf.Variable(xavier_init([h_dim, dim]))
    D_b3 = tf.Variable(tf.zeros(shape=[dim]))  # Multi-variate outputs
    theta_D = [D_W1, D_W2, D_W3, D_b1, D_b2, D_b3]

    # Generator variables
    # Data + Mask as inputs (random noise is in missing components)
    G_W1 = tf.Variable(xavier_init([dim * 2, h_dim]))
    G_b1 = tf.Variable(tf.zeros(shape=[h_dim]))
    G_W2 = tf.Variable(xavier_init([h_dim, h_dim]))
    G_b2 = tf.Variable(tf.zeros(shape=[h_dim]))
    G_W3 = tf.Variable(xavier_init([h_dim, dim]))
    G_b3 = tf.Variable(tf.zeros(shape=[dim]))
    theta_G = [G_W1, G_W2, G_W3, G_b1, G_b2, G_b3]

    ## GAIN functions
    # Generator
    def generator(x, m):
        # Concatenate Mask and Data
        inputs = tf.concat(values=[x, m], axis=1)
        G_h1 = tf.nn.relu(tf.matmul(inputs, G_W1) + G_b1)
        G_h2 = tf.nn.relu(tf.matmul(G_h1, G_W2) + G_b2)
        # MinMax normalized output
        G_prob = tf.nn.sigmoid(tf.matmul(G_h2, G_W3) + G_b3)
        return G_prob

    # Discriminator
    def discriminator(x, h):
        # Concatenate Data and Hint
        inputs = tf.concat(values=[x, h], axis=1)
        D_h1 = tf.nn.relu(tf.matmul(inputs, D_W1) + D_b1)
        D_h2 = tf.nn.relu(tf.matmul(D_h1, D_W2) + D_b2)
        D_logit = tf.matmul(D_h2, D_W3) + D_b3
        D_prob = tf.nn.sigmoid(D_logit)
        return D_prob

    ## GAIN structure
    # Hint vector built from B: revealed cells carry the mask, hidden ones 0.5.
    H = B * M + 0.5 * (1 - B)  # 0.5 => 0.1
    # Generator
    G_sample = generator(X, M)
    # D applied directly to the current generator's imputation (for G losses).
    D_prob_g = discriminator(X * M + G_sample * (1 - M), H)
    # Separately-fed fake data (lets D train against cached candidate samples).
    fake_X = tf1.placeholder(tf.float32, shape=[None, dim])
    Hat_X = X * M + fake_X * (1 - M)
    # D loss
    D_prob = discriminator(Hat_X, H)
    D_loss_temp = -tf.reduce_mean(
        (M * tf1.log(D_prob + 1e-8) + (1 - M) * tf1.log(1. - D_prob + 1e-8)))
    D_loss = D_loss_temp
    # Updated parameter
    D_solver = tf1.train.AdamOptimizer(learning_rate=0.002,
                                       beta1=0.5,
                                       beta2=0.99).minimize(D_loss,
                                                            var_list=theta_D)

    # G loss
    # Three alternative generator objectives (mutations):
    #   logD trick: -log D, minimax: log(1 - D), least-squares: (D - 1)^2.
    G_loss_logD = -tf.reduce_mean((1 - M) * 1 / 2 * tf1.log(D_prob_g + 1e-8))
    G_loss_minimax = tf.reduce_mean(
        (1 - M) * 1 / 2 * tf1.log(1. - D_prob_g + 1e-8))
    G_loss_ls = tf1.reduce_mean((1 - M) * tf1.square(D_prob_g - 1))
    # Reconstruction error on observed components only.
    MSE_loss = tf.reduce_mean((M * X - M * G_sample)**2) / tf.reduce_mean(M)
    G_loss_logD_all = G_loss_logD + alpha * MSE_loss
    G_loss_minimax_all = G_loss_minimax + alpha * MSE_loss
    G_loss_ls_all = G_loss_ls + alpha * MSE_loss

    # Update parameter: one solver per mutation objective.
    G_solver_logD = tf1.train.AdamOptimizer(learning_rate=0.002,
                                            beta1=0.5,
                                            beta2=0.99).minimize(
                                                G_loss_logD_all,
                                                var_list=theta_G)
    G_solver_minimax = tf1.train.AdamOptimizer(learning_rate=0.002,
                                               beta1=0.5,
                                               beta2=0.99).minimize(
                                                   G_loss_minimax_all,
                                                   var_list=theta_G)
    G_solver_ls = tf1.train.AdamOptimizer(learning_rate=0.002,
                                          beta1=0.5,
                                          beta2=0.99).minimize(
                                              G_loss_ls_all,
                                              var_list=theta_G)

    # Fitness function: quality (D's score on imputed cells) plus a diversity
    # term based on the norm of D's gradients.
    Fq_score = tf.reduce_mean((1 - M) * D_prob)
    Fd_score = -tf1.log(
        tf.reduce_sum(tf.square(tf.gradients(D_loss_temp, theta_D[0]))) +
        tf.reduce_sum(tf.square(tf.gradients(D_loss_temp, theta_D[1]))) +
        tf.reduce_sum(tf.square(tf.gradients(D_loss_temp, theta_D[2]))) +
        tf.reduce_sum(tf.square(tf.gradients(D_loss_temp, theta_D[3]))) +
        tf.reduce_sum(tf.square(tf.gradients(D_loss_temp, theta_D[4]))) +
        tf.reduce_sum(tf.square(tf.gradients(D_loss_temp, theta_D[5]))))

    ## Iterations
    sess = tf1.Session()

    # Start Iterations
    gen_new_params = []                 # weights of the current candidate population
    fitness_best = np.zeros(nbest)      # fitness of the best generators kept
    fitness_candi = np.zeros(ncandi)    # fitness of the current candidates
    # for it in tqdm(range(iterations)):
    for it in tqdm(range(iterations)):
        # Train candidates G
        if it == 0:
            # Initialization: create ncandi independently-initialized candidates.
            for can_i in range(0, ncandi):
                sess.run(tf1.global_variables_initializer())
                batch_idx = sample_batch_index(no, batch_size)
                X_mb = norm_data_x[batch_idx, :]
                M_mb = m[batch_idx, :]
                Z_mb = uniform_sampler(0.0, 0.01, batch_size, dim)
                X_mb = M_mb * X_mb + (1 - M_mb) * Z_mb
                B_mb = sample_batch_binary(dim, batch_size)
                gen_samples = sess.run([G_sample],
                                       feed_dict={
                                           X: X_mb,
                                           M: M_mb
                                       })[0]
                fq_score, fd_score = sess.run([Fq_score, Fd_score],
                                              feed_dict={
                                                  X: X_mb,
                                                  M: M_mb,
                                                  fake_X: gen_samples,
                                                  B: B_mb
                                              })
                fitness = fq_score + beta * fd_score
                fitness_best[can_i] = fitness
                # Snapshot this candidate's generator weights.
                params = []
                for param in theta_G:
                    params.append(sess.run(param))
                gen_new_params.append(params)
            gen_best_params = copy.deepcopy(gen_new_params)
        else:
            # Generate new candidates: mutate each parent with each loss type.
            gen_old_params = copy.deepcopy(gen_new_params)
            # print(gen_old_params[0][-1])
            # print(it)
            for can_i in range(ncandi):
                for type_i in range(nloss):
                    batch_idx = sample_batch_index(no, batch_size)
                    X_mb = norm_data_x[batch_idx, :]
                    M_mb = m[batch_idx, :]
                    Z_mb = uniform_sampler(0.0, 0.01, batch_size,
                                           dim)  # update 1.0 ==> 0.01
                    X_mb = M_mb * X_mb + (1 - M_mb) * Z_mb
                    B_mb = sample_batch_binary(dim, batch_size)
                    # Load the parent's weights before applying the mutation.
                    for i in range(len(theta_G)):
                        theta_G[i].load(gen_old_params[can_i][i], sess)
                    loss = loss_type[type_i]
                    # NOTE(review): 'trickLogD' runs G_solver_minimax and
                    # 'minimax' runs G_solver_logD — the mapping looks
                    # swapped relative to the loss names; confirm against
                    # the E-GAN reference implementation.
                    if loss == 'trickLogD':
                        sess.run([G_solver_minimax],
                                 feed_dict={
                                     X: X_mb,
                                     M: M_mb,
                                     B: B_mb
                                 })
                    elif loss == 'minimax':
                        sess.run([G_solver_logD],
                                 feed_dict={
                                     X: X_mb,
                                     M: M_mb,
                                     B: B_mb
                                 })
                    elif loss == 'ls':
                        sess.run([G_solver_ls],
                                 feed_dict={
                                     X: X_mb,
                                     M: M_mb,
                                     B: B_mb
                                 })
                    # calculate fitness score of the mutated candidate
                    gen_samples = sess.run([G_sample],
                                           feed_dict={
                                               X: X_mb,
                                               M: M_mb
                                           })[0]
                    fq_score, fd_score = sess.run([Fq_score, Fd_score],
                                                  feed_dict={
                                                      X: X_mb,
                                                      M: M_mb,
                                                      fake_X: gen_samples,
                                                      B: B_mb
                                                  })
                    fitness = fq_score + beta * fd_score
                    # print(fitness)
                    # Replace the worst "best" generator if this one beats it.
                    gap = fitness_best - fitness
                    if min(gap) < 0:
                        idx_replace = np.argmin(gap)
                        params = []
                        for param in theta_G:
                            params.append(sess.run(param))
                        gen_best_params[idx_replace] = params
                        fitness_best[idx_replace] = fitness
                    # Fill/replace the candidate population by fitness.
                    if can_i * nloss + type_i < ncandi:
                        idx = can_i * nloss + type_i
                        params = []
                        for param in theta_G:
                            params.append(sess.run(param))
                        gen_new_params[idx] = params
                        fitness_candi[idx] = fitness
                    else:
                        gap = fitness_candi - fitness
                        if min(gap) < 0:
                            idx_replace = np.argmin(gap)
                            params = []
                            for param in theta_G:
                                params.append(sess.run(param))
                            gen_new_params[idx_replace] = params
                            fitness_candi[idx_replace] = fitness

        # Train D: each candidate imputes an equal slice of the batch.
        for i in range(nD):
            batch_idx = sample_batch_index(no, batch_size)
            X_mb = norm_data_x[batch_idx, :]
            M_mb = m[batch_idx, :]
            Z_mb = uniform_sampler(0.0, 0.01, batch_size, dim)  # 1.0 ==> 0.01
            X_mb = M_mb * X_mb + (1 - M_mb) * Z_mb
            B_mb = sample_batch_binary(dim, batch_size)
            # impute data for each candidate
            for can_i in range(ncandi):
                for w in range(len(theta_G)):
                    theta_G[w].load(gen_new_params[can_i][w], sess)
                if can_i == ncandi - 1:
                    # Last candidate takes the remainder of the batch.
                    gen_samples_cani = sess.run(
                        [G_sample],
                        feed_dict={
                            X: X_mb[can_i * batch_size // ncandi:],
                            M: M_mb[can_i * batch_size // ncandi:]
                        })[0]
                else:
                    gen_samples_cani = sess.run(
                        [G_sample],
                        feed_dict={
                            X:
                            X_mb[can_i * batch_size // ncandi:(can_i + 1) *
                                 batch_size // ncandi],
                            M:
                            M_mb[can_i * batch_size // ncandi:(can_i + 1) *
                                 batch_size // ncandi]
                        })[0]
                # print(gen_samples_cani.shape)
                if can_i == 0:
                    gen_samples = gen_samples_cani
                else:
                    gen_samples = np.append(gen_samples,
                                            gen_samples_cani,
                                            axis=0)
            sess.run([D_solver],
                     feed_dict={
                         X: X_mb,
                         M: M_mb,
                         fake_X: gen_samples,
                         B: B_mb
                     })

    ## Return imputed data: use the best generator found.
    idx = np.argmax(fitness_best)
    # print(idx)
    for i in range(len(theta_G)):
        theta_G[i].load(gen_best_params[idx][i], sess)
    Z_mb = uniform_sampler(0.0, 0.01, no, dim)
    M_mb = m
    X_mb = norm_data_x
    X_mb = M_mb * X_mb + (1 - M_mb) * Z_mb
    imputed_data = sess.run([G_sample], feed_dict={X: X_mb, M: M_mb})[0]
    sess.close()
    # Keep observed values; only fill missing cells with generator output.
    imputed_data = m * norm_data_x + (1 - m) * imputed_data
    # Renormalization (undo the min-max scaling)
    imputed_data = renormalization(imputed_data, norm_parameters)
    # Rounding (restore integer-valued columns)
    imputed_data = rounding(imputed_data, miss_data_x)
    return imputed_data
def gain(data_x, gain_parameters, ori_data_x, train_index, test_index, mechanism):
    '''Impute missing values in data_x with a two-discriminator GAIN (PyTorch).

    Args:
      - data_x: original data with missing values
      - gain_parameters: GAIN network parameters:
        - batch_size: Batch size
        - hint_rate: Hint rate
        - alpha: Hyper-parameter
        - iterations: Iterations
      - ori_data_x: complete reference data, forwarded to the final test() call
      - train_index: row indices used for training batches
      - test_index: row indices used for evaluation
      - mechanism: missingness mechanism; 'mar' selects the MAR hint sampler

    Note: evaluation happens via test() at the end; nothing is returned.
    '''
    # Mask matrix: 1 where a value is observed, 0 where it is missing.
    data_m = 1 - np.isnan(data_x)

    # Unpack system parameters.
    batch_size = gain_parameters['batch_size']
    hint_rate = gain_parameters['hint_rate']
    alpha = gain_parameters['alpha']
    iterations = gain_parameters['iterations']

    # Dimensions: `no` rows overall, `no_train` rows in the training split.
    no, dim = data_x.shape
    no_train = len(train_index)

    # Hidden width matches the input width.
    h_dim = int(dim)

    # Normalize to [0, 1] and zero-fill the missing entries.
    norm_data, norm_parameters = normalization(data_x)
    norm_data_x = np.nan_to_num(norm_data, nan=0)

    # One generator, two independently-hinted discriminators.
    net_g = Generator(dim, h_dim)
    net_d1 = Discriminator(dim, h_dim)
    net_d2 = Discriminator(dim, h_dim)

    # Optimizers: Adam for G, plain SGD for both discriminators.
    opt_g = torch.optim.Adam(net_g.parameters(), lr=0.0001, betas=(0.5, 0.999))
    opt_d1 = torch.optim.SGD(net_d1.parameters(), lr=0.0001)
    opt_d2 = torch.optim.SGD(net_d2.parameters(), lr=0.0001)

    for _ in tqdm(range(iterations), desc='pytorch'):
        # Draw a mini-batch restricted to the training rows.
        batch_idx = sample_batch_index(no_train, batch_size)
        x_mb = norm_data_x[train_index][batch_idx, :]
        m_mb = data_m[train_index][batch_idx, :]

        # Random noise for the missing slots.
        z_mb = uniform_sampler(0, 0.01, batch_size, dim)

        # One hint vector per discriminator, drawn independently.
        if mechanism == 'mar':
            h_mb = hint_for_mar(hint_rate, m_mb)
            h_mb_2 = hint_for_mar(hint_rate, m_mb)
        else:
            h_mb = m_mb * binary_sampler(hint_rate, batch_size, dim)
            h_mb_2 = m_mb * binary_sampler(hint_rate, batch_size, dim)

        # Observed values stay; missing slots get noise.
        x_mb = m_mb * x_mb + (1 - m_mb) * z_mb

        x_mb = torch.Tensor(x_mb)
        m_mb = torch.Tensor(m_mb)
        h_mb = torch.Tensor(h_mb)
        h_mb_2 = torch.Tensor(h_mb_2)

        # Forward pass: impute, then score the combined sample twice.
        g_sample = net_g(x_mb, m_mb)
        hat_x = x_mb * m_mb + g_sample * (1 - m_mb)
        d_prob_1 = net_d1(hat_x, h_mb)
        d_prob_2 = net_d2(hat_x, h_mb_2)

        d_loss_1 = d_loss(m_mb, d_prob_1)
        d_loss_2 = d_loss(m_mb, d_prob_2)
        # The generator is trained against the average of both discriminators.
        g_loss_value = g_loss(m_mb, 0.5 * d_prob_1 + 0.5 * d_prob_2,
                              alpha, x_mb, g_sample)

        # Zero grads, backprop all three losses on the shared graph, then step.
        # The ordering below is deliberate and preserved from the original.
        opt_d2.zero_grad()
        opt_d1.zero_grad()
        opt_g.zero_grad()
        g_loss_value.backward(retain_graph=True)
        d_loss_1.backward(retain_graph=True)
        d_loss_2.backward(retain_graph=True)
        opt_g.step()
        opt_d1.step()
        opt_d2.step()

    # Final evaluation on the held-out rows.
    test(data_m, data_x, dim, net_g, no, norm_data_x, norm_parameters,
         ori_data_x, test_index)
def gain(data_x, gain_parameters):
    '''Impute missing values in data_x with GAIN (TensorFlow 2 / Keras).

    Args:
      - data_x: original data with missing values
      - gain_parameters: GAIN network parameters:
        - batch_size: Batch size
        - hint_rate: Hint rate
        - alpha: Hyperparameter
        - iterations: Iterations

    Returns:
      - imputed_data: imputed data
    '''
    # Define mask matrix (1 = observed, 0 = missing)
    data_m = (1 - np.isnan(data_x)).astype(float)

    # System parameters
    batch_size = gain_parameters['batch_size']
    hint_rate = gain_parameters['hint_rate']
    alpha = gain_parameters['alpha']
    iterations = gain_parameters['iterations']

    # Other parameters
    no, dim = data_x.shape

    # Hidden state dimensions
    h_dim = int(dim)

    # Normalization
    norm_data, norm_parameters = normalization(data_x)
    # BUG FIX: np.nan_to_num's second positional argument is `copy`, not the
    # fill value, so `np.nan_to_num(norm_data, 0)` silently meant copy=False.
    # Pass the fill value by keyword so the intent (NaN -> 0) is explicit.
    norm_data_x = np.nan_to_num(norm_data, nan=0.0)

    # Parameter initialization: full data and mask as float32 tensors.
    X = tf.convert_to_tensor(norm_data_x)
    X = tf.dtypes.cast(X, tf.float32)
    M = tf.convert_to_tensor(data_m)
    M = tf.dtypes.cast(M, tf.float32)
    X_input = tf.concat(values=[X, M], axis=1)

    ## GAIN architecture
    # Generator: (data, mask) concatenated -> imputed data in [0, 1].
    class Generator(tf.keras.Model):

        def __init__(self):
            super().__init__()
            self.flatten = layers.Flatten(input_shape=[dim * 2])
            self.dense1 = layers.Dense(h_dim, activation='relu')
            self.dense2 = layers.Dense(h_dim, activation='relu')
            self.dense_output = layers.Dense(dim, activation='sigmoid')

        def call(self, inputs, training=None):
            x = self.flatten(inputs)
            x = self.dense1(x)
            x = self.dense2(x)
            return self.dense_output(x)

    # Discriminator: (data, hint) concatenated -> per-feature probability
    # that each entry is observed rather than imputed.
    class Discriminator(tf.keras.Model):

        def __init__(self):
            super().__init__()
            self.flatten = layers.Flatten(input_shape=[dim * 2])
            self.dense1 = layers.Dense(h_dim, activation='relu')
            self.dense2 = layers.Dense(h_dim, activation='relu')
            self.dense_output = layers.Dense(dim, activation='sigmoid')

        def call(self, inputs, training=None):
            x = self.flatten(inputs)
            x = self.dense1(x)
            x = self.dense2(x)
            return self.dense_output(x)

    ## GAIN loss
    # Generator loss: fool the discriminator on missing entries plus a
    # reconstruction (MSE) term on observed entries weighted by alpha.
    def generator_loss(generator, discriminator, x, m):
        generator.trainable = True
        discriminator.trainable = False
        G_input = tf.concat(values=[x, m], axis=1)
        G_sample = generator(G_input)
        MSE_loss = tf.reduce_mean(
            (m * x - m * G_sample)**2) / tf.reduce_mean(m)
        D_input = tf.concat(values=[G_sample, m], axis=1)
        D_prob = discriminator(D_input)
        G_loss_tmp = -tf.reduce_mean((1 - m) * tf.math.log(D_prob + 1e-8))
        return G_loss_tmp + alpha * MSE_loss

    # Discriminator loss: standard cross-entropy against the mask, using the
    # hint vector h as the discriminator's side information.
    def discriminator_loss(generator, discriminator, x, m, h):
        generator.trainable = False
        discriminator.trainable = True
        G_input = tf.concat(values=[x, m], axis=1)
        G_sample = generator(G_input)
        x_hat = x * m + G_sample * (1 - m)
        D_input = tf.concat(values=[x_hat, h], axis=1)
        D_prob = discriminator(D_input)
        return -tf.reduce_mean(m * tf.math.log(D_prob + 1e-8) \
                               + (1 - m) * tf.math.log(1. - D_prob + 1e-8))

    # Build both networks and their optimizers.
    generator = Generator()
    generator.build(input_shape=(None, 2 * dim))
    g_optimizer = tf.keras.optimizers.Adam()
    discriminator = Discriminator()
    discriminator.build(input_shape=(None, 2 * dim))
    d_optimizer = tf.keras.optimizers.Adam()

    # Training
    one_tensor = tf.constant(1., shape=(batch_size, dim), dtype=float)
    for _ in tqdm(range(iterations)):
        # Sample batch
        batch_idx = sample_batch_index(no, batch_size)
        X_mb = tf.gather(X, batch_idx)
        M_mb = tf.gather(M, batch_idx)
        Z_mb = tf.convert_to_tensor(uniform_sampler(0, 0.01, batch_size, dim),
                                    dtype=float)
        H_mb_tmp = tf.convert_to_tensor(binary_sampler(hint_rate, batch_size, dim),
                                        dtype=float)
        H_mb = tf.math.multiply(M_mb, H_mb_tmp)
        # Combine random vectors with observed vectors:
        # X_mb = M_mb * X_mb + (1 - M_mb) * Z_mb
        X_mb = tf.math.add(tf.math.multiply(M_mb, X_mb), \
                           tf.math.multiply(tf.math.subtract(one_tensor, M_mb), Z_mb))

        # Train the discriminator first, then the generator.
        with tf.GradientTape() as tape:
            d_loss = discriminator_loss(generator, discriminator, X_mb, M_mb, H_mb)
        grads = tape.gradient(d_loss, discriminator.trainable_variables)
        d_optimizer.apply_gradients(
            zip(grads, discriminator.trainable_variables))

        with tf.GradientTape() as tape:
            g_loss = generator_loss(generator, discriminator, X_mb, M_mb)
        grads = tape.gradient(g_loss, generator.trainable_variables)
        g_optimizer.apply_gradients(zip(grads, generator.trainable_variables))

    ## Return imputed data: run the trained generator over the full dataset
    # in batches (idiomatic `for batch in dataset` instead of manual iter/next).
    imputed_data = np.array([]).reshape(0, dim)
    train_data = tf.data.Dataset.from_tensor_slices(X_input).batch(batch_size)
    for batch in train_data:
        X_tmp = generator(batch).numpy()
        imputed_data = np.vstack([imputed_data, X_tmp])

    # Renormalization
    imputed_data = renormalization(imputed_data, norm_parameters)

    # Recovery: keep observed values, use generator output only where missing.
    imputed_data = data_m * np.nan_to_num(data_x) + (1 - data_m) * imputed_data

    # Rounding
    imputed_data = rounding(imputed_data, data_x)

    return imputed_data
# Generate MCAR benchmark samples: draws random subsamples of the house
# dataset and writes both a complete and a missing-value version of each.
import os

import numpy as np
import pandas as pd
from tqdm import trange

from utils import sample_batch_index, binary_sampler

if __name__ == '__main__':
    # Load data
    file_name = 'data/house.csv'
    house_df = pd.read_csv(file_name)
    no, dim = house_df.shape
    data_x = house_df.values.astype(np.float32)

    num_samples = 200
    miss_rate = 0.3

    # Ensure the output directories exist; np.savetxt does not create them.
    os.makedirs("./samples/complete", exist_ok=True)
    os.makedirs("./samples/MCAR", exist_ok=True)

    for i in trange(num_samples):
        # Random subsample of 10,000 rows; save the complete version.
        sample_idx = sample_batch_index(no, 10000)
        data_x_i = data_x[sample_idx, :]
        no_i, dim_i = data_x_i.shape
        np.savetxt("./samples/complete/sample_{}.csv".format(i),
                   data_x_i,
                   delimiter=",")

        # Introduce missing data completely at random at rate `miss_rate`.
        data_m = binary_sampler(1 - miss_rate, no_i, dim_i)
        miss_data_x = data_x_i.copy()
        miss_data_x[data_m == 0] = np.nan
        np.savetxt("./samples/MCAR/sample_{}.csv".format(i),
                   miss_data_x,
                   delimiter=",")
def gain(miss_data_x, gain_parameters):
    '''Impute missing values in data_x (TF1 graph-mode GAIN variant).

    Uses a B-vector hint H = B*M + 0.5*(1-B) instead of a hint_rate mask, and
    keeps before/after snapshots of the generator weights so the discriminator
    and generator updates each start from a chosen parameter set.

    Args:
      - miss_data_x: missing data
      - gain_parameters: GAIN network parameters:
        - batch_size: Batch size
        - alpha: Hyperparameter
        - iterations: Iterations

    Returns:
      - imputed_data: imputed data
    '''
    # Define mask matrix (1 = observed, 0 = missing)
    m = 1 - np.isnan(miss_data_x)

    # System parameters
    batch_size = gain_parameters['batch_size']
    alpha = gain_parameters['alpha']
    iterations = gain_parameters['iterations']

    # Other parameters
    no, dim = miss_data_x.shape

    # Hidden state dimensions
    h_dim = int(dim)

    # Normalization
    norm_data, norm_parameters = normalization(miss_data_x)
    # BUG FIX: np.nan_to_num's second positional argument is `copy`, not the
    # fill value, so `np.nan_to_num(norm_data, 0)` silently meant copy=False.
    # Pass the fill value by keyword so NaN -> 0 is explicit.
    norm_data_x = np.nan_to_num(norm_data, nan=0.0)

    ## GAIN architecture
    tf1.reset_default_graph()

    # Input placeholders
    X = tf1.placeholder(tf.float32, shape=[None, dim])  # data vector
    M = tf1.placeholder(tf.float32, shape=[None, dim])  # mask vector
    B = tf1.placeholder(tf.float32, shape=[None, dim])  # hint-selection vector

    # Discriminator variables (Data + Hint as inputs)
    D_W1 = tf1.Variable(xavier_init([dim * 2, h_dim]))
    D_b1 = tf1.Variable(tf.zeros(shape=[h_dim]))
    D_W2 = tf1.Variable(xavier_init([h_dim, h_dim]))
    D_b2 = tf1.Variable(tf.zeros(shape=[h_dim]))
    D_W3 = tf1.Variable(xavier_init([h_dim, dim]))
    D_b3 = tf1.Variable(tf.zeros(shape=[dim]))  # multi-variate outputs
    theta_D = [D_W1, D_W2, D_W3, D_b1, D_b2, D_b3]

    # Generator variables (Data + Mask as inputs; noise fills missing slots).
    # Consistency fix: use tf1.Variable like the discriminator above — the
    # training loop relies on the v1 Variable.load() API for these.
    G_W1 = tf1.Variable(xavier_init([dim * 2, h_dim]))
    G_b1 = tf1.Variable(tf.zeros(shape=[h_dim]))
    G_W2 = tf1.Variable(xavier_init([h_dim, h_dim]))
    G_b2 = tf1.Variable(tf.zeros(shape=[h_dim]))
    G_W3 = tf1.Variable(xavier_init([h_dim, dim]))
    G_b3 = tf1.Variable(tf.zeros(shape=[dim]))
    theta_G = [G_W1, G_W2, G_W3, G_b1, G_b2, G_b3]

    ## GAIN functions
    # Generator
    def generator(x, m):
        # Concatenate Mask and Data
        inputs = tf.concat(values=[x, m], axis=1)
        G_h1 = tf.nn.relu(tf.matmul(inputs, G_W1) + G_b1)
        G_h2 = tf.nn.relu(tf.matmul(G_h1, G_W2) + G_b2)
        # MinMax normalized output
        G_prob = tf.nn.sigmoid(tf.matmul(G_h2, G_W3) + G_b3)
        return G_prob

    # Discriminator
    def discriminator(x, h):
        # Concatenate Data and Hint
        inputs = tf.concat(values=[x, h], axis=1)
        D_h1 = tf.nn.relu(tf.matmul(inputs, D_W1) + D_b1)
        D_h2 = tf.nn.relu(tf.matmul(D_h1, D_W2) + D_b2)
        D_logit = tf.matmul(D_h2, D_W3) + D_b3
        D_prob = tf.nn.sigmoid(D_logit)
        return D_prob

    ## GAIN structure
    G_sample = generator(X, M)
    # Hint: reveal the true mask where B == 1, uninformative 0.5 elsewhere.
    H = B * M + 0.5 * (1 - B)
    # Discriminator output used for the generator's loss (live G_sample).
    D_prob_g = discriminator(X * M + G_sample * (1 - M), H)

    # Discriminator is trained against pre-computed fake samples fed here.
    fake_X = tf1.placeholder(tf.float32, shape=[None, dim])
    Hat_X = X * M + fake_X * (1 - M)
    D_prob = discriminator(Hat_X, H)

    # GAIN loss
    D_loss_temp = -tf.reduce_mean((M * tf1.log(D_prob + 1e-8) \
                                   + (1 - M) * tf1.log(1. - D_prob + 1e-8)))
    G_loss_temp = -tf.reduce_mean((1 - M) * tf1.log(D_prob_g + 1e-8))
    MSE_loss = tf.reduce_mean((M * X - M * G_sample)**2) / tf.reduce_mean(M)
    D_loss = D_loss_temp
    G_loss = G_loss_temp + alpha * MSE_loss

    ## GAIN solver
    D_solver = tf1.train.AdamOptimizer().minimize(D_loss, var_list=theta_D)
    G_solver = tf1.train.AdamOptimizer().minimize(G_loss, var_list=theta_G)

    ## Iterations
    sess = tf1.Session()
    sess.run(tf1.global_variables_initializer())

    # Snapshot the freshly initialized generator weights.
    gen_new_params = []
    params = [sess.run(param) for param in theta_G]
    gen_new_params.append(params)

    for it in range(iterations):
        # Weights as they were before this iteration's discriminator step;
        # the generator update below restarts from this snapshot.
        gen_old_params = copy.deepcopy(gen_new_params)

        # ---- Discriminator step ----
        batch_idx = sample_batch_index(no, batch_size)
        X_mb = norm_data_x[batch_idx, :]
        M_mb = m[batch_idx, :]
        Z_mb = uniform_sampler(0.0, 0.01, batch_size, dim)
        B_mb = sample_batch_binary(dim, batch_size)
        # Combine random vectors with observed vectors
        X_mb = M_mb * X_mb + (1 - M_mb) * Z_mb

        # Fake samples from the current generator, fed as constants to D.
        f_mb = sess.run([G_sample], feed_dict={X: X_mb, M: M_mb})[0]
        # Restore the latest generator snapshot before the D update.
        for w in range(len(theta_G)):
            theta_G[w].load(gen_new_params[0][w], sess)
        _, D_loss_curr = sess.run([D_solver, D_loss_temp],
                                  feed_dict={
                                      X: X_mb,
                                      M: M_mb,
                                      fake_X: f_mb,
                                      B: B_mb
                                  })

        # ---- Generator step (fresh batch) ----
        batch_idx = sample_batch_index(no, batch_size)
        X_mb = norm_data_x[batch_idx, :]
        M_mb = m[batch_idx, :]
        Z_mb = uniform_sampler(0.0, 0.01, batch_size, dim)
        B_mb = sample_batch_binary(dim, batch_size)
        # Combine random vectors with observed vectors
        X_mb = M_mb * X_mb + (1 - M_mb) * Z_mb

        # Start the generator update from the pre-iteration snapshot.
        for w in range(len(theta_G)):
            theta_G[w].load(gen_old_params[0][w], sess)
        _, G_loss_curr, MSE_loss_curr = \
            sess.run([G_solver, G_loss_temp, MSE_loss],
                     feed_dict={X: X_mb, M: M_mb, B: B_mb})

        # Save the updated generator weights for the next iteration.
        gen_new_params[0] = [sess.run(param) for param in theta_G]

    ## Return imputed data
    Z_mb = uniform_sampler(0.0, 0.01, no, dim)
    M_mb = m
    X_mb = norm_data_x
    X_mb = M_mb * X_mb + (1 - M_mb) * Z_mb
    for w in range(len(theta_G)):
        theta_G[w].load(gen_new_params[0][w], sess)
    imputed_data = sess.run([G_sample], feed_dict={X: X_mb, M: M_mb})[0]
    sess.close()

    # Keep observed values; use generator output only where data was missing.
    imputed_data = m * norm_data_x + (1 - m) * imputed_data

    # Renormalization
    imputed_data = renormalization(imputed_data, norm_parameters)

    # Rounding
    imputed_data = rounding(imputed_data, miss_data_x)

    return imputed_data