import os
import re
import time

import numpy as np
import pandas as pd
import tensorflow as tf

import graph_new
import parser_arguments
import read_functions


def enc_network(settings):
    """get s and z samples as embeddings as well as the original dataframe (with relevelled factors & NA's=0!)"""
    argvals = settings.split()
    args = parser_arguments.getArgs(argvals)
    print(args)

    # Create a directory for the save file
    if not os.path.exists('./Saved_Networks/' + args.save_file):
        os.makedirs('./Saved_Networks/' + args.save_file)

    network_file_name = './Saved_Networks/' + args.save_file + '/' + args.save_file + '.ckpt'
    log_file_name = './Saved_Networks/' + args.save_file + '/log_file_' + args.save_file + '.txt'

    # Creating graph
    sess_HVAE = tf.Graph()

    with sess_HVAE.as_default():
        tf_nodes = graph_new.HVAE_graph(
            args.model_name,
            args.types_file,
            args.batch_size,
            learning_rate=args.learning_rate,
            z_dim=args.dim_latent_z,
            y_dim=args.dim_latent_y,
            s_dim=args.dim_latent_s,
            y_dim_partition=args.dim_latent_y_partition)

    train_data, types_dict, miss_mask, true_miss_mask, n_samples = read_functions.read_data(
        args.data_file, args.types_file, args.miss_file, args.true_miss_file)

    # Get an integer number of batches
    n_batches = int(np.floor(np.shape(train_data)[0] / args.batch_size))

    # Compute the real miss_mask
    miss_mask = np.multiply(miss_mask, true_miss_mask)

    with tf.Session(graph=sess_HVAE) as session:
        # Add ops to save and restore all the variables.
        saver = tf.train.Saver()
        saver.restore(session, network_file_name)
        print("Model restored: " + network_file_name)

        start_time = time.time()

        # Training cycle
        loglik_epoch = []
        testloglik_epoch = []
        epoch = 0
        avg_loss = 0.
        avg_loss_reg = 0.
        avg_KL_s = 0.
        avg_KL_y = 0.
        avg_KL_z = 0.
        samples_list = []
        samples_list_test = []
        p_params_list = []
        q_params_list = []
        log_p_x_total = []
        log_p_x_missing_total = []

        # Constant Gumbel-Softmax parameter (where we have finished the annealing)
        tau = 1e-3

        for i in range(n_batches):
            # Create train minibatch
            data_list, miss_list = read_functions.next_batch(
                train_data, types_dict, miss_mask, args.batch_size,
                index_batch=i)
            # Delete not known data
            data_list_observed = [
                data_list[i] * np.reshape(miss_list[:, i], [args.batch_size, 1])
                for i in range(len(data_list))
            ]

            # Create feed dictionary
            feedDict = {
                i: d
                for i, d in zip(tf_nodes['ground_batch'], data_list)
            }
            feedDict.update({
                i: d
                for i, d in zip(tf_nodes['ground_batch_observed'],
                                data_list_observed)
            })
            feedDict[tf_nodes['miss_list']] = miss_list
            feedDict[tf_nodes['miss_list_VP']] = np.ones(
                miss_list.shape)  # unused
            feedDict[tf_nodes['tau_GS']] = tau
            feedDict[tf_nodes['zcodes']] = np.ones(args.batch_size).reshape(
                (args.batch_size, 1))
            feedDict[tf_nodes['scodes']] = np.ones(args.batch_size).reshape(
                (args.batch_size, 1))

            # Get samples from the model
            KL_s, loss, samples, log_p_x, log_p_x_missing, loss_total, KL_z, p_params, q_params, loss_reg = session.run(
                [
                    tf_nodes['KL_s'], tf_nodes['loss_re'], tf_nodes['samples'],
                    tf_nodes['log_p_x'], tf_nodes['log_p_x_missing'],
                    tf_nodes['loss'], tf_nodes['KL_z'], tf_nodes['p_params'],
                    tf_nodes['q_params'], tf_nodes['loss_reg']
                ],
                feed_dict=feedDict)

            samples_list.append(samples)
            q_params_list.append(q_params)

            # Compute average loss
            avg_loss += np.mean(loss)
            avg_loss_reg += np.mean(loss_reg)
            avg_KL_s += np.mean(KL_s)
            avg_KL_z += np.mean(KL_z)

        # Transform discrete variables to original values (this is for getting the original data frame)
        train_data_transformed = read_functions.discrete_variables_transformation(
            train_data, types_dict)

        # Create global dictionary of the distribution parameters
        q_params_complete = read_functions.q_distribution_params_concatenation(
            q_params_list, args.dim_latent_z, args.dim_latent_s)

        # Return the deterministic s and z codes and the reconstructed dataframe (now imputed)
        encs = np.argmax(q_params_complete['s'], 1)
        encz = q_params_complete['z'][0, :, :]
        return [encs, encz, train_data_transformed]
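
# Usage sketch for enc_network (kept commented out so the module imports cleanly).
# The flag names and file paths below are placeholders/assumptions and must match
# what parser_arguments.getArgs actually accepts; verify against parser_arguments.py.
#
#   settings = ('--model_name model_HIVAE_inputDropout --save_file my_module '
#               '--data_file data_python/my_module.csv '
#               '--types_file data_python/my_module_types.csv '
#               '--miss_file data_python/my_module_miss.csv '
#               '--true_miss_file data_python/my_module_true_miss.csv '
#               '--batch_size 32 --learning_rate 1e-3 '
#               '--dim_latent_s 3 --dim_latent_z 1 --dim_latent_y 5')
#   encs, encz, recon_df = enc_network(settings)
#   # encs: most probable s component per sample (argmax of q(s|x))
#   # encz: mean of q(z|x) per sample
#   # recon_df: training data with discrete variables mapped back to original values
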
def dec_network(settings, zcodes, scodes, VP=False):
    """decode using set s and z values (if generated, provide a generated miss_list) and return decoded data"""
    argvals = settings.split()
    args = parser_arguments.getArgs(argvals)
    print(args)

    # Create a directory for the save file
    if not os.path.exists('./Saved_Networks/' + args.save_file):
        os.makedirs('./Saved_Networks/' + args.save_file)

    network_file_name = './Saved_Networks/' + args.save_file + '/' + args.save_file + '.ckpt'
    log_file_name = './Saved_Networks/' + args.save_file + '/log_file_' + args.save_file + '.txt'

    # Creating graph
    sess_HVAE = tf.Graph()

    with sess_HVAE.as_default():
        tf_nodes = graph_new.HVAE_graph(
            args.model_name,
            args.types_file,
            args.batch_size,
            learning_rate=args.learning_rate,
            z_dim=args.dim_latent_z,
            y_dim=args.dim_latent_y,
            s_dim=args.dim_latent_s,
            y_dim_partition=args.dim_latent_y_partition)

    train_data, types_dict, miss_mask, true_miss_mask, n_samples = read_functions.read_data(
        args.data_file, args.types_file, args.miss_file, args.true_miss_file)

    # Get an integer number of batches
    n_batches = int(np.floor(np.shape(train_data)[0] / args.batch_size))

    # Compute the real miss_mask
    miss_mask = np.multiply(miss_mask, true_miss_mask)

    with tf.Session(graph=sess_HVAE) as session:
        # Add ops to save and restore all the variables.
        saver = tf.train.Saver()
        saver.restore(session, network_file_name)
        print("Model restored: " + network_file_name)
        print('::::::DECODING:::::::::')

        start_time = time.time()

        # Training cycle
        epoch = 0
        samples_list = []

        # Constant Gumbel-Softmax parameter (where we have finished the annealing)
        tau = 1e-3

        for i in range(n_batches):
            data_list, miss_list = read_functions.next_batch(
                train_data, types_dict, miss_mask, args.batch_size,
                index_batch=i)

            # Create inputs for the feed_dict; delete not known data
            data_list_observed = [
                data_list[i] * np.reshape(miss_list[:, i], [args.batch_size, 1])
                for i in range(len(data_list))
            ]

            # Create feed dictionary
            feedDict = {
                i: d
                for i, d in zip(tf_nodes['ground_batch'], data_list)
            }
            feedDict.update({
                i: d
                for i, d in zip(tf_nodes['ground_batch_observed'],
                                data_list_observed)
            })
            feedDict[tf_nodes['miss_list']] = miss_list
            if VP == True:
                vpfile = 'VP_misslist/' + re.sub(
                    'data_python/|.csv', '', args.data_file) + '_vpmiss.csv'
                print('::::::::::::' + vpfile)
                feedDict[tf_nodes['miss_list_VP']] = pd.read_csv(vpfile,
                                                                 header=None)
            elif VP == 'nomiss':
                print(':::::::::::: ones for miss list VP')
                feedDict[tf_nodes['miss_list_VP']] = np.ones(miss_list.shape)
            else:
                feedDict[tf_nodes['miss_list_VP']] = miss_list
            feedDict[tf_nodes['tau_GS']] = tau
            feedDict[tf_nodes['zcodes']] = np.array(zcodes).reshape(
                (len(zcodes), 1))
            feedDict[tf_nodes['scodes']] = np.array(scodes).reshape(
                (len(scodes), 1))

            # Get samples from the fixed decoder function
            samples_zgen, log_p_x_test, log_p_x_missing_test, test_params = session.run(
                [
                    tf_nodes['samples_zgen'], tf_nodes['log_p_x_zgen'],
                    tf_nodes['log_p_x_missing_zgen'],
                    tf_nodes['test_params_zgen']
                ],
                feed_dict=feedDict)
            samples_list.append(samples_zgen)

        # Separate the samples from the batch list
        s_aux, z_aux, y_total, est_data = read_functions.samples_concatenation(
            samples_list)

        # Transform discrete variables to original values
        est_data_transformed = read_functions.discrete_variables_transformation(
            est_data, types_dict)

        return est_data_transformed
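
# Usage sketch for dec_network, feeding back the codes produced by enc_network
# (kept commented out; same placeholder settings string as above). This assumes
# dim_latent_z == 1 and a batch size equal to the number of samples, so the z/s
# code vectors line up with a single decoding batch.
#
#   encs, encz, _ = enc_network(settings)
#   decoded = dec_network(settings, zcodes=encz, scodes=encs)          # observed miss_list
#   decoded_full = dec_network(settings, encz, encs, VP='nomiss')      # decode every feature
#   # With VP=True a virtual-patient miss_list is read from
#   # 'VP_misslist/<data file stem>_vpmiss.csv'.
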
# Unused leftover from the original HI-VAE training/test loop; it references
# variables (p_params_list, train_data_aux, miss_mask_aux, ...) that are not
# defined in the functions above, so it stays commented out.
#
# p_params_list.append(p_params)
# q_params_list.append(q_params)
# log_p_x_total.append(log_p_x_test)
# log_p_x_missing_total.append(log_p_x_missing_test)
#
# # Compute average loss
# avg_loss += np.mean(loss)
# avg_KL_s += np.mean(KL_s)
# avg_KL_z += np.mean(KL_z)
#
# # Concatenate samples in arrays
# s_total, z_total, y_total, est_data = read_functions.samples_concatenation(
#     samples_list)
#
# # Transform discrete variables back to the original values
# train_data_transformed = read_functions.discrete_variables_transformation(
#     train_data_aux[:n_batches * args.batch_size, :], types_dict)
# est_data_transformed = read_functions.discrete_variables_transformation(
#     est_data, types_dict)
# est_data_imputed = read_functions.mean_imputation(
#     train_data_transformed, miss_mask_aux[:n_batches * args.batch_size, :],
#     types_dict)
# # est_data_transformed[np.isinf(est_data_transformed)] = 1e20
#
# # Create global dictionary of the distribution parameters
# p_params_complete = read_functions.p_distribution_params_concatenation(
#     p_params_list, types_dict, args.dim_latent_z, args.dim_latent_s)
# q_params_complete = read_functions.q_distribution_params_concatenation(
#     q_params_list, args.dim_latent_z, args.dim_latent_s)