def data_loader(data_name, miss_rate, target_column=None):
    """Loads a CSV dataset, splits it into train/test, and introduces missingness.

    Args:
      - data_name: name of the CSV file under data/
      - miss_rate: the probability of missing components
      - target_column: optional label column to split off as y

    Returns:
      (ori_train_x, train_x, train_y, miss_train_x),
      (ori_test_x, test_x, test_y, miss_test_x)
    """
    file_name = 'data/' + data_name + '.csv'
    print(file_name)
    data_x = pd.read_csv(file_name, delimiter=',')
    data_x.fillna(0, inplace=True)

    # Drop the datetime column if present
    try:
        _ = data_x.pop('datetime')
    except KeyError:
        pass

    # Stratify only when a target column is given (the original call stratified
    # unconditionally, which fails when target_column is None)
    stratify = data_x[target_column].values if target_column is not None else None
    train_x, test_x = train_test_split(data_x,
                                       test_size=0.35,
                                       random_state=666,
                                       shuffle=True,
                                       stratify=stratify)

    if target_column is not None:
        train_y = train_x.pop(target_column)
        test_y = test_x.pop(target_column)
    else:
        train_y = None
        test_y = None

    # Parameters
    no_train, dim_train = train_x.shape
    no_test, dim_test = test_x.shape

    # Introduce missing data
    data_m_train = binary_sampler(1 - miss_rate, no_train, dim_train)
    data_m_test = binary_sampler(1 - miss_rate, no_test, dim_test)

    miss_train_x = train_x.astype('float32').copy()
    ori_train_x = train_x.astype('float32').copy()
    miss_test_x = test_x.astype('float32').copy()
    ori_test_x = test_x.astype('float32').copy()

    miss_train_x = miss_train_x.values
    miss_test_x = miss_test_x.values
    miss_train_x[data_m_train == 0] = np.nan
    miss_test_x[data_m_test == 0] = np.nan

    miss_train_x = pd.DataFrame(data=miss_train_x, columns=train_x.columns)
    miss_test_x = pd.DataFrame(data=miss_test_x, columns=test_x.columns)

    return (ori_train_x, train_x, train_y, miss_train_x), \
           (ori_test_x, test_x, test_y, miss_test_x)

def prepare_train_pipeline(norm_train_x, data_m):
    """Prepares the training data as a tf.data.Dataset.

    :param norm_train_x: normalized training data (DataFrame, no NaNs)
    :param data_m: mask DataFrame of observed components (1 = observed)
    :return: shuffled, batched tf.data.Dataset of (X, M, H) tuples
    """
    # Perform all the data augmentation before the training loop
    rows, columns = norm_train_x.shape
    X_mb = norm_train_x.values
    M_mb = data_m.values

    # Sample random vectors
    Z_mb = uniform_sampler(0, 0.01, rows, columns)

    # Sample hint vectors
    H_mb_temp = binary_sampler(0.9, rows, columns)
    H_mb = M_mb * H_mb_temp

    # Combine random vectors with observed vectors
    X_mb = M_mb * X_mb + (1 - M_mb) * Z_mb

    tf_data = tf.data.Dataset.from_tensor_slices(
        (X_mb.astype('float32'), M_mb.astype('float32'),
         H_mb.astype('float32'))).shuffle(100000).batch(256)
    return tf_data

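# A minimal usage sketch (an illustrative assumption, not part of the original
# code): it wires the pandas-based data_loader above into
# prepare_train_pipeline and inspects one batch. It assumes data/spam.csv
# exists; the mask is recomputed from the NaN pattern of miss_train_x, and
# fillna(0) stands in for a proper normalization step.
def _inspect_pipeline_example():
    (_, train_x, _, miss_train_x), _ = data_loader('spam', miss_rate=0.2)
    data_m = 1 - miss_train_x.isna().astype('float32')
    norm_train_x = miss_train_x.fillna(0)
    tf_data = prepare_train_pipeline(norm_train_x, data_m)
    for X_mb, M_mb, H_mb in tf_data.take(1):
        print('X:', X_mb.shape, 'M:', M_mb.shape, 'H:', H_mb.shape)
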
def data_loader(data_name, miss_rate):
    '''Loads datasets and introduces missingness.

    Args:
      - data_name: letter, spam, breast, credit, news, or mnist
      - miss_rate: the probability of missing components

    Returns:
      data_x: original data
      miss_data_x: data with missing values
      data_m: indicator matrix for missing components
    '''
    # Load data
    if data_name in ['letter', 'spam', 'breast', 'credit', 'news']:
        file_name = 'data/' + data_name + '.csv'
        data_x = np.loadtxt(file_name, delimiter=",", skiprows=1)
    elif data_name == 'mnist':
        (data_x, _), _ = mnist.load_data()
        data_x = np.reshape(np.asarray(data_x), [60000, 28 * 28]).astype(float)

    # Parameters
    no, dim = data_x.shape

    # Introduce missing data
    data_m = binary_sampler(1 - miss_rate, no, dim)
    miss_data_x = data_x.copy()
    miss_data_x[data_m == 0] = np.nan

    return data_x, miss_data_x, data_m

def data_loader(data_name, miss_rate, onehot, predict):
    '''Loads datasets and introduces missingness.

    Args:
      - data_name: the filename of the dataset
      - miss_rate: the probability of missing components
      - onehot: the number of features for the one-hot encoder (starting from the first feature)
      - predict: the option for prediction mode

    Returns:
      data_x: original data
      miss_data_x: data with missing values
      data_m: indicator matrix for missing components
      feature_name: feature name list of the original data
      onehotencoder: one-hot encoder fitted on this data
      ori_data_dim: dimensions of the original data
    '''
    # Load data
    file_name = 'data/' + data_name + '.csv'
    data = pd.read_csv(file_name)
    feature_name = list(data.columns)
    data = np.array(data)

    # One-hot encoding; if the encoded columns already contain missing values, skip encoding
    onehotencoder = OneHotEncoder()
    if np.sum(np.isnan(data[:, :onehot])) == 0 and onehot > 0:
        data_x = data[:, :onehot]
        onehotencoder.fit(data_x)
        data_x = onehotencoder.transform(data_x).toarray()
        data_x = np.concatenate((data_x, data[:, onehot:]), axis=1)
    elif onehot == 0:
        data_x = np.array(data)
    else:
        print("Missing values exist, skipping one-hot encoding")
        data_x = np.array(data)

    # Parameters
    ori_data_dim = data.shape[1]
    no, dim = data_x.shape

    # Introduce missing data
    if predict is False:
        data_m = binary_sampler(1 - miss_rate, no, dim)
    else:
        data_m = 1 - np.isnan(data_x)

    miss_data_x = data_x.copy()
    miss_data_x[data_m == 0] = np.nan

    return data_x, miss_data_x, data_m, feature_name, onehotencoder, ori_data_dim

def data_loader(data_name, miss_rate, mechanism):
    '''Loads datasets and introduces missingness.

    Args:
      - data_name: letter, spam, mnist, breast, news, or credit
      - miss_rate: the probability of missing components
      - mechanism: missingness mechanism; 'mcar' for MCAR, otherwise a MAR mask is used

    Returns:
      data_x: original data
      miss_data_x: data with missing values
      data_m: indicator matrix for missing components
    '''
    # Load data
    if data_name in ['letter', 'spam']:
        file_name = 'data/' + data_name + '.csv'
        data_x = np.loadtxt(file_name, delimiter=",", skiprows=1)
    elif data_name == 'mnist':
        (data_x, _), _ = mnist.load_data()
        data_x = np.reshape(np.asarray(data_x), [60000, 28 * 28]).astype(float)
    elif data_name == 'breast':
        data_x = load_breast_cancer()['data']
    elif data_name == 'news':
        data_x = np.loadtxt('data/OnlineNewsPopularity1.csv', delimiter=",", skiprows=1)
    elif data_name == 'credit':
        data_x = np.loadtxt('data/default_of_credit_cards_clients.csv', delimiter=",", skiprows=2)
    else:
        raise Exception('Unknown dataset.')

    # Parameters
    no, dim = data_x.shape

    if mechanism == 'mcar':
        # Introduce missing data completely at random
        data_m = binary_sampler(1 - miss_rate, no, dim)
    else:
        data_m = 1 - MAR_mask(np.array(data_x, dtype=np.float32),
                              p=miss_rate, p_obs=miss_rate)

    miss_data_x = data_x.copy()
    miss_data_x[data_m == 0] = np.nan

    return data_x, miss_data_x, data_m

def data_loader(data_name, miss_rate):
    file_name = 'datasets/' + data_name + '.csv'
    complete_data_x = np.loadtxt(file_name, delimiter=",", skiprows=1)
    no, dim = complete_data_x.shape
    np.random.shuffle(complete_data_x)

    # Limit the amount of data and drop the last column
    if no > 10000:
        complete_data_x = complete_data_x[0:10000, 0:dim - 1]
        no = 10000
        dim = dim - 1
    else:
        complete_data_x = complete_data_x[0:no, 0:dim - 1]
        dim = dim - 1

    data_m = binary_sampler(1 - miss_rate, no, dim)
    incomplete_data_x = complete_data_x.copy()
    incomplete_data_x[data_m == 0] = np.nan

    return complete_data_x, incomplete_data_x, data_m

def data_loader(data_name, miss_rate):
    if data_name in ['spam', 'letter']:
        file_name = 'data/' + data_name + '.csv'
        x = np.loadtxt(file_name, delimiter=",", skiprows=1)
        y = []
    elif data_name in ['spam_full', 'breast_full']:
        file_name = 'data/' + data_name + '.csv'
        x = np.loadtxt(file_name, delimiter=",", skiprows=1)
        print("begin ", x.shape)
        # Split the last column off as the label (the original indexing took the
        # last row, which left x and y with mismatched lengths)
        y = x[:, -1].reshape(-1, 1)
        x = x[:, :-1]
        print("x.shape & y", x.shape, y.shape)
    else:
        file_name = 'data/' + data_name + '.arff'
        data, _ = arff.loadarff(file_name)
        data = data.tolist()
        x = np.array([item[:-1] for item in data])
        y = np.array([item[-1] for item in data])
        le = LabelEncoder()
        y = le.fit_transform(y)
        print('Num of classes: ', len(le.classes_))

    # Parameters
    no, dim = x.shape
    print('Num of samples:', no)
    print('Num of features: ', dim)
    print(type(x[0][0]))

    # Introduce missing data
    m = binary_sampler(1 - miss_rate, no, dim)
    miss_x = x.copy()
    miss_x[m == 0] = np.nan

    return x, y, miss_x, m

def data_loader(data_name, miss_rate):
    '''Loads datasets and introduces missingness.

    Args:
      - data_name: letter, spam, mnist, breast_original, Wine_original, or one of the vals_test_df variants
      - miss_rate: the probability of missing components

    Returns:
      data_x: original data
      miss_data_x_new: training split concatenated with the data containing missing values
      data_m: indicator matrix for missing components
      data_y: labels
    '''
    usecols = 0

    # Load data
    if data_name in ['letter', 'spam']:
        file_name = 'data/' + data_name + '.csv'
        data_x = np.loadtxt(file_name, delimiter=",", skiprows=1)
    elif data_name == 'breast_original':
        file_name = 'data/' + data_name + '.csv'
        data_x = np.loadtxt(file_name, delimiter=",", usecols=(range(30)), skiprows=1)
        data_y = np.loadtxt(file_name, delimiter=",", usecols=(30), skiprows=1)
        usecols = range(30)
    elif data_name == 'Wine_original':
        file_name = 'data/' + data_name + '.csv'
        data_x = np.loadtxt(file_name, delimiter=",", usecols=(range(12)), skiprows=1)
        data_y = np.loadtxt(file_name, delimiter=",", usecols=(13), skiprows=1)
        usecols = range(12)
    elif data_name == 'mnist':
        (data_x, data_y), _ = mnist.load_data()
        data_x = np.reshape(np.asarray(data_x), [60000, 28 * 28]).astype(float)
        data_y = np.reshape(np.asarray(data_y), [60000, 1]).astype(float)
    elif data_name == 'vals_test_df':
        train_data_name = "vals_train_df.csv"
        file_name = data_name + '.csv'
        data_x = np.loadtxt(file_name, delimiter=",", usecols=(range(1, 10)), skiprows=1)
        data_y = np.loadtxt(file_name, delimiter=",", usecols=(0), skiprows=1)
    elif data_name == 'vals_test_df_test_type1':
        train_data_name = "vals_train_df_test_type1.csv"
        file_name = data_name + '.csv'
        data_x = np.loadtxt(file_name, delimiter=",", usecols=(range(1, 10)), skiprows=1)
        data_y = np.loadtxt(file_name, delimiter=",", usecols=(0), skiprows=1)
    elif data_name == 'vals_test_df_test_type2':
        train_data_name = "vals_train_df_test_type2.csv"
        file_name = data_name + '.csv'
        data_x = np.loadtxt(file_name, delimiter=",", usecols=(range(1, 10)), skiprows=1)
        data_y = np.loadtxt(file_name, delimiter=",", usecols=(0), skiprows=1)

    # Parameters
    no, dim = data_x.shape
    print(data_x.shape)

    # Introduce missing data: reuse a saved missingness index file if one exists
    filename = "{dname}_Missing_rate_{value}_Index.csv".format(dname=data_name, value=miss_rate)
    missing_file_exist = path.exists(filename)
    if missing_file_exist:
        file = open(filename)
        data_m = np.loadtxt(file, delimiter=",", usecols=(range(9)), skiprows=1)
    else:
        data_m = binary_sampler(1 - miss_rate, no, dim)

    miss_data_x = data_x.copy()
    miss_data_x[data_m == 0] = np.nan

    # Prepend the complete training split so it is imputed together with the test split
    data_train_x = np.loadtxt(train_data_name, delimiter=",", usecols=(range(1, 10)), skiprows=1)
    data_train_y = np.loadtxt(train_data_name, delimiter=",", usecols=(0), skiprows=1)
    miss_data_x_new = np.concatenate([data_train_x, miss_data_x])

    # Save the missingness indices
    missing_index = pd.DataFrame(data_m)
    missing_index.to_csv(filename, index=False)

    data_x_x = pd.DataFrame(miss_data_x)
    data_y_y = pd.DataFrame(data_y)
    data_x_s = pd.concat([data_y_y, data_x_x], ignore_index=True, axis=1)
    data_x_s.to_csv('{dbname}_generated.csv'.format(dbname=data_name), index=False)

    return data_x, miss_data_x_new, data_m, data_y

def cph(data_x, cph_parameters, data_image):
    '''Impute missing values in data_x.

    Args:
      - data_x: original data with missing values
      - cph_parameters: CPH network parameters:
        - batch_size: Batch size
        - hint_rate: Hint rate
        - alpha: Hyperparameter
        - iterations: Iterations
      - data_image: image-shaped view of the data fed to the CNN front end

    Returns:
      - imputed_data: imputed data
    '''
    seed = 25
    random.seed(seed)
    np.random.seed(seed)
    tf.set_random_seed(seed)

    # Define mask matrix
    data_m = 1 - np.isnan(data_x)

    # System parameters
    batch_size = cph_parameters['batch_size']
    hint_rate = cph_parameters['hint_rate']
    alpha = cph_parameters['alpha']
    iterations = cph_parameters['iterations']

    # Other parameters
    no, dim = data_x.shape

    # Hidden state dimensions
    h_dim = int(dim)

    # Normalization
    norm_data, norm_parameters = normalization(data_x)
    # norm_data_x = np.nan_to_num(norm_data, 0)
    norm_data_x = np.nan_to_num(data_x, 0)

    ## CPH architecture
    # Input placeholders
    # Data tensor (image-shaped)
    X_pre = tf.placeholder(tf.float32, shape=[1, 483, dim, 3])
    # Mask vector
    M = tf.placeholder(tf.float32, shape=[None, dim])
    # Hint vector
    H = tf.placeholder(tf.float32, shape=[None, dim])

    # Discriminator variables
    D_W1 = tf.Variable(xavier_init([dim * 2, h_dim]))  # Data + Hint as inputs
    D_b1 = tf.Variable(tf.zeros(shape=[h_dim]))
    D_W2 = tf.Variable(xavier_init([h_dim, h_dim]))
    D_b2 = tf.Variable(tf.zeros(shape=[h_dim]))
    D_W3 = tf.Variable(xavier_init([h_dim, dim]))
    D_b3 = tf.Variable(tf.zeros(shape=[dim]))  # Multi-variate outputs
    theta_D = [D_W1, D_W2, D_W3, D_b1, D_b2, D_b3]

    # Generator variables
    conv_filter_w1 = tf.Variable(tf.random_normal([1, 4, 3, 3]))
    conv_filter_b1 = tf.Variable(tf.random_normal([3]))
    conv_filter_w2 = tf.Variable(tf.random_normal([1, 4, 3, 1]))
    conv_filter_b2 = tf.Variable(tf.random_normal([1]))
    # Data + Mask as inputs (random noise is in missing components)
    G_W1 = tf.Variable(xavier_init([dim * 2, h_dim]))
    G_b1 = tf.Variable(tf.zeros(shape=[h_dim]))
    G_W2 = tf.Variable(xavier_init([h_dim, h_dim]))
    G_b2 = tf.Variable(tf.zeros(shape=[h_dim]))
    G_W3 = tf.Variable(xavier_init([h_dim, dim]))
    G_b3 = tf.Variable(tf.zeros(shape=[dim]))
    theta_G = [G_W1, G_W2, G_W3, G_b1, G_b2, G_b3,
               conv_filter_w1, conv_filter_b1, conv_filter_w2, conv_filter_b2]

    ## CPH functions
    # CNN + Generator
    def generator(x, m):
        relu_feature_maps1 = tf.nn.relu(
            tf.nn.conv2d(x, conv_filter_w1, strides=[1, 1, 1, 1], padding='SAME') + conv_filter_b1)
        max_pool1 = tf.nn.max_pool(relu_feature_maps1, ksize=[1, 1, 4, 1],
                                   strides=[1, 1, 1, 1], padding='SAME')
        relu_feature_maps2 = tf.nn.relu(
            tf.nn.conv2d(max_pool1, conv_filter_w2, strides=[1, 1, 1, 1], padding='SAME') + conv_filter_b2)
        max_pool2 = tf.nn.max_pool(relu_feature_maps2, ksize=[1, 1, 4, 1],
                                   strides=[1, 1, 1, 1], padding='SAME')
        x2 = tf.reshape(max_pool2, [483, dim])
        # Concatenate Mask and Data
        inputs = tf.concat(values=[x2, m], axis=1)
        G_h1 = tf.nn.relu(tf.matmul(inputs, G_W1) + G_b1)
        G_h2 = tf.nn.relu(tf.matmul(G_h1, G_W2) + G_b2)
        # MinMax normalized output
        G_prob = tf.nn.sigmoid(tf.matmul(G_h2, G_W3) + G_b3)
        return G_prob

    # Discriminator
    def discriminator(x, h):
        # Concatenate Data and Hint
        inputs = tf.concat(values=[x, h], axis=1)
        D_h1 = tf.nn.relu(tf.matmul(inputs, D_W1) + D_b1)
        D_h2 = tf.nn.relu(tf.matmul(D_h1, D_W2) + D_b2)
        D_logit = tf.matmul(D_h2, D_W3) + D_b3
        D_prob = tf.nn.sigmoid(D_logit)
        return D_prob

    ## CPH structure
    # Generator
    G_sample = generator(X_pre, M)
    X2 = X_pre[0, :, :, 0]
    # Combine with observed data
    Hat_X = X2 * M + G_sample * (1 - M)
    # Discriminator
    D_prob = discriminator(Hat_X, H)

    ## CPH loss
    D_loss_temp = -tf.reduce_mean(M * tf.log(D_prob + 1e-8)
                                  + (1 - M) * tf.log(1. - D_prob + 1e-8))
    G_loss_temp = -tf.reduce_mean((1 - M) * tf.log(D_prob + 1e-8))
    MSE_loss = tf.reduce_mean((M * X2 - M * G_sample) ** 2) / tf.reduce_mean(M)
    D_loss = D_loss_temp
    G_loss = G_loss_temp + alpha * MSE_loss

    ## CPH solver
    D_solver = tf.train.AdamOptimizer().minimize(D_loss, var_list=theta_D)
    G_solver = tf.train.AdamOptimizer().minimize(G_loss, var_list=theta_G)

    ## Iterations
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    # Start Iterations
    for it in tqdm(range(iterations)):
        # Sample batch
        batch_idx = sample_batch_index(no, batch_size)
        image_mb = data_image[:, batch_idx, :, :]
        X_mb = norm_data_x[batch_idx, :]
        M_mb = data_m[batch_idx, :]
        # Sample random vectors
        Z_mb = uniform_sampler(0, 0.01, batch_size, dim)
        # Sample hint vectors
        H_mb_temp = binary_sampler(hint_rate, batch_size, dim)
        H_mb = M_mb * H_mb_temp
        # Combine random vectors with observed vectors
        X_mb = M_mb * X_mb + (1 - M_mb) * Z_mb
        image_mb[0, :, :, 0] = X_mb

        _, D_loss_curr = sess.run([D_solver, D_loss_temp],
                                  feed_dict={M: M_mb, X_pre: image_mb, H: H_mb})
        _, G_loss_curr, MSE_loss_curr = \
            sess.run([G_solver, G_loss_temp, MSE_loss],
                     feed_dict={X_pre: image_mb, M: M_mb, H: H_mb})

    ## Return imputed data
    Z_mb = uniform_sampler(0, 0.01, no, dim)
    M_mb = data_m
    X_mb = norm_data_x
    X_mb = M_mb * X_mb + (1 - M_mb) * Z_mb
    image_mb = data_image
    image_mb[0, :, :, 0] = X_mb

    imputed_data = sess.run([G_sample], feed_dict={X_pre: image_mb, M: M_mb})[0]
    imputed_data = data_m * norm_data_x + (1 - data_m) * imputed_data

    # Renormalization
    # imputed_data = renormalization(imputed_data, norm_parameters)

    # Rounding
    imputed_data = rounding(imputed_data, data_x)

    return imputed_data

def gain(data_x, feature_name, onehotencoder, ori_data_dim, gain_parameters):
    '''Impute missing values in data_x.

    Args:
      - data_x: original data with missing values
      - feature_name: feature name list of the original data
      - onehotencoder: one-hot encoder fitted on this data
      - ori_data_dim: dimensions of the original data
      - gain_parameters: GAIN network parameters:
        - data_name: the file name of the dataset
        - batch_size: Batch size
        - hint_rate: Hint rate
        - alpha: Hyperparameter
        - iterations: Iterations
        - onehot: the number of features for the one-hot encoder (starting from the first feature)
        - predict: option for prediction mode

    Returns:
      - imputed_data: imputed data
    '''
    # Define mask matrix
    data_m = 1 - np.isnan(data_x)

    # System parameters
    data_name = gain_parameters['data_name']
    batch_size = gain_parameters['batch_size']
    hint_rate = gain_parameters['hint_rate']
    alpha = gain_parameters['alpha']
    iterations = gain_parameters['iterations']
    onehot = gain_parameters['onehot']
    predict = gain_parameters['predict']

    # Model path
    model_path = 'model/' + data_name

    # Other parameters
    no, dim = data_x.shape

    # Hidden state dimensions
    h_dim = int(dim)

    # Normalization
    norm_data, norm_parameters = normalization(data_x)
    norm_data_x = np.nan_to_num(norm_data, 0)

    ## GAIN architecture
    # Input placeholders
    # Data vector
    X = tf.placeholder(tf.float32, shape=[None, dim], name='X')
    # Mask vector
    M = tf.placeholder(tf.float32, shape=[None, dim], name='M')
    # Hint vector
    H = tf.placeholder(tf.float32, shape=[None, dim], name='H')

    # Discriminator variables
    D_W1 = tf.Variable(xavier_init([dim * 2, h_dim]), name='D_W1')  # Data + Hint as inputs
    D_b1 = tf.Variable(tf.zeros(shape=[h_dim]), name='D_b1')
    D_W2 = tf.Variable(xavier_init([h_dim, h_dim]), name='D_W2')
    D_b2 = tf.Variable(tf.zeros(shape=[h_dim]), name='D_b2')
    D_W3 = tf.Variable(xavier_init([h_dim, dim]), name='D_W3')
    D_b3 = tf.Variable(tf.zeros(shape=[dim]), name='D_b3')  # Multi-variate outputs
    theta_D = [D_W1, D_W2, D_W3, D_b1, D_b2, D_b3]

    # Generator variables
    # Data + Mask as inputs (random noise is in missing components)
    G_W1 = tf.Variable(xavier_init([dim * 2, h_dim]), name='G_W1')
    G_b1 = tf.Variable(tf.zeros(shape=[h_dim]), name='G_b1')
    G_W2 = tf.Variable(xavier_init([h_dim, h_dim]), name='G_W2')
    G_b2 = tf.Variable(tf.zeros(shape=[h_dim]), name='G_b2')
    G_W3 = tf.Variable(xavier_init([h_dim, dim]), name='G_W3')
    G_b3 = tf.Variable(tf.zeros(shape=[dim]), name='G_b3')
    theta_G = [G_W1, G_W2, G_W3, G_b1, G_b2, G_b3]

    ## GAIN functions
    # Generator
    def generator(x, m):
        # Concatenate Mask and Data
        inputs = tf.concat(values=[x, m], axis=1)
        G_h1 = tf.nn.relu(tf.matmul(inputs, G_W1) + G_b1)
        G_h2 = tf.nn.relu(tf.matmul(G_h1, G_W2) + G_b2)
        # MinMax normalized output
        G_prob = tf.nn.sigmoid(tf.matmul(G_h2, G_W3) + G_b3)
        return G_prob

    # Discriminator
    def discriminator(x, h):
        # Concatenate Data and Hint
        inputs = tf.concat(values=[x, h], axis=1)
        D_h1 = tf.nn.relu(tf.matmul(inputs, D_W1) + D_b1)
        D_h2 = tf.nn.relu(tf.matmul(D_h1, D_W2) + D_b2)
        D_logit = tf.matmul(D_h2, D_W3) + D_b3
        D_prob = tf.nn.sigmoid(D_logit)
        return D_prob

    ## GAIN structure
    # Generator
    G_sample = generator(X, M)
    # Combine with observed data
    Hat_X = X * M + G_sample * (1 - M)
    # Discriminator
    D_prob = discriminator(Hat_X, H)

    ## GAIN loss
    D_loss_temp = -tf.reduce_mean(M * tf.log(D_prob + 1e-8)
                                  + (1 - M) * tf.log(1. - D_prob + 1e-8))
    G_loss_temp = -tf.reduce_mean((1 - M) * tf.log(D_prob + 1e-8))
    MSE_loss = tf.reduce_mean((M * X - M * G_sample) ** 2) / tf.reduce_mean(M)
    D_loss = D_loss_temp
    G_loss = G_loss_temp + alpha * MSE_loss

    ## GAIN solver
    D_solver = tf.train.AdamOptimizer().minimize(D_loss, var_list=theta_D)
    G_solver = tf.train.AdamOptimizer().minimize(G_loss, var_list=theta_G)

    ## Iterations
    sess = tf.Session()
    saver = tf.train.Saver()

    if predict is True and os.path.exists(model_path + '.ckpt.meta'):
        print("Model Restore")
        saver.restore(sess, model_path + '.ckpt')
    else:
        sess.run(tf.global_variables_initializer())
        # Start Iterations
        for it in tqdm(range(iterations)):
            # Sample batch
            batch_idx = sample_batch_index(no, batch_size)
            X_mb = norm_data_x[batch_idx, :]
            M_mb = data_m[batch_idx, :]
            # Sample random vectors
            Z_mb = uniform_sampler(0, 0.01, batch_size, dim)
            # Sample hint vectors
            H_mb_temp = binary_sampler(hint_rate, batch_size, dim)
            H_mb = M_mb * H_mb_temp
            # Combine random vectors with observed vectors
            X_mb = M_mb * X_mb + (1 - M_mb) * Z_mb

            _, D_loss_curr = sess.run([D_solver, D_loss_temp],
                                      feed_dict={M: M_mb, X: X_mb, H: H_mb})
            _, G_loss_curr, MSE_loss_curr = \
                sess.run([G_solver, G_loss_temp, MSE_loss],
                         feed_dict={X: X_mb, M: M_mb, H: H_mb})

        if predict is False:
            save_path = saver.save(sess, model_path + '.ckpt')

    ## Return imputed data
    Z_mb = uniform_sampler(0, 0.01, no, dim)
    M_mb = data_m
    X_mb = norm_data_x
    X_mb = M_mb * X_mb + (1 - M_mb) * Z_mb

    imputed_data = sess.run([G_sample], feed_dict={X: X_mb, M: M_mb})[0]
    imputed_data = data_m * norm_data_x + (1 - data_m) * imputed_data

    # Renormalization
    imputed_data = renormalization(imputed_data, norm_parameters)

    # Rounding
    imputed_data = rounding(imputed_data, data_x)

    # Reverse encoding
    if onehot > 0:
        imputed_data = reverse_encoding(imputed_data, feature_name, onehotencoder,
                                        onehot, ori_data_dim)

    return imputed_data

def gain(data_x, gain_parameters):
    '''Impute missing values in data_x.

    Args:
      - data_x: original data with missing values
      - gain_parameters: GAIN network parameters:
        - batch_size: Batch size
        - hint_rate: Hint rate
        - alpha: Hyperparameter
        - iterations: Iterations

    Returns:
      - imputed_data: imputed data
    '''
    # Define mask matrix
    data_m = 1 - np.isnan(data_x)

    # System parameters
    batch_size = gain_parameters['batch_size']
    hint_rate = gain_parameters['hint_rate']
    alpha = gain_parameters['alpha']
    iterations = gain_parameters['iterations']

    # Other parameters
    no, dim = data_x.shape

    # Hidden state dimensions
    h_dim = int(dim)

    # Normalization
    norm_data, norm_parameters = normalization(data_x)
    norm_data_x = np.nan_to_num(norm_data, 0)

    ## GAIN architecture
    # Input placeholders
    # Data vector
    X = tf.placeholder(tf.float32, shape=[None, dim])
    # Mask vector
    M = tf.placeholder(tf.float32, shape=[None, dim])
    # Hint vector
    H = tf.placeholder(tf.float32, shape=[None, dim])

    # Discriminator variables
    D_W1 = tf.Variable(xavier_init([dim * 2, h_dim]))  # Data + Hint as inputs
    D_b1 = tf.Variable(tf.zeros(shape=[h_dim]))
    D_W2 = tf.Variable(xavier_init([h_dim, h_dim]))
    D_b2 = tf.Variable(tf.zeros(shape=[h_dim]))
    D_W3 = tf.Variable(xavier_init([h_dim, dim]))
    D_b3 = tf.Variable(tf.zeros(shape=[dim]))  # Multi-variate outputs
    theta_D = [D_W1, D_W2, D_W3, D_b1, D_b2, D_b3]

    # Generator variables
    # Data + Mask as inputs (random noise is in missing components)
    G_W1 = tf.Variable(xavier_init([dim * 2, h_dim]))
    G_b1 = tf.Variable(tf.zeros(shape=[h_dim]))
    G_W2 = tf.Variable(xavier_init([h_dim, h_dim]))
    G_b2 = tf.Variable(tf.zeros(shape=[h_dim]))
    G_W3 = tf.Variable(xavier_init([h_dim, dim]))
    G_b3 = tf.Variable(tf.zeros(shape=[dim]))
    theta_G = [G_W1, G_W2, G_W3, G_b1, G_b2, G_b3]

    ## GAIN functions
    # Generator
    def generator(x, m):
        # Concatenate Mask and Data
        inputs = tf.concat(values=[x, m], axis=1)
        G_h1 = tf.nn.relu(tf.matmul(inputs, G_W1) + G_b1)
        G_h2 = tf.nn.relu(tf.matmul(G_h1, G_W2) + G_b2)
        # MinMax normalized output
        G_prob = tf.nn.sigmoid(tf.matmul(G_h2, G_W3) + G_b3)
        return G_prob

    # Discriminator
    def discriminator(x, h):
        # Concatenate Data and Hint
        inputs = tf.concat(values=[x, h], axis=1)
        D_h1 = tf.nn.relu(tf.matmul(inputs, D_W1) + D_b1)
        D_h2 = tf.nn.relu(tf.matmul(D_h1, D_W2) + D_b2)
        D_logit = tf.matmul(D_h2, D_W3) + D_b3
        D_prob = tf.nn.sigmoid(D_logit)
        return D_prob

    ## GAIN structure
    # Generator
    G_sample = generator(X, M)
    # Combine with observed data
    Hat_X = X * M + G_sample * (1 - M)
    # Discriminator
    D_prob = discriminator(Hat_X, H)

    ## GAIN loss
    D_loss_temp = -tf.reduce_mean(M * tf.log(D_prob + 1e-8)
                                  + (1 - M) * tf.log(1. - D_prob + 1e-8))
    G_loss_temp = -tf.reduce_mean((1 - M) * tf.log(D_prob + 1e-8))
    MSE_loss = tf.reduce_mean((M * X - M * G_sample) ** 2) / tf.reduce_mean(M)
    D_loss = D_loss_temp
    G_loss = G_loss_temp + alpha * MSE_loss

    ## GAIN solver
    D_solver = tf.train.AdamOptimizer().minimize(D_loss, var_list=theta_D)
    G_solver = tf.train.AdamOptimizer().minimize(G_loss, var_list=theta_G)

    ## Iterations
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    # Start Iterations
    for it in tqdm(range(iterations)):
        # Sample batch
        batch_idx = sample_batch_index(no, batch_size)
        X_mb = norm_data_x[batch_idx, :]
        M_mb = data_m[batch_idx, :]
        # Sample random vectors
        Z_mb = uniform_sampler(0, 0.01, batch_size, dim)
        # Sample hint vectors
        H_mb_temp = binary_sampler(hint_rate, batch_size, dim)
        H_mb = M_mb * H_mb_temp
        # Combine random vectors with observed vectors
        X_mb = M_mb * X_mb + (1 - M_mb) * Z_mb

        _, D_loss_curr = sess.run([D_solver, D_loss_temp],
                                  feed_dict={M: M_mb, X: X_mb, H: H_mb})
        _, G_loss_curr, MSE_loss_curr = \
            sess.run([G_solver, G_loss_temp, MSE_loss],
                     feed_dict={X: X_mb, M: M_mb, H: H_mb})

    ## Return imputed data
    Z_mb = uniform_sampler(0, 0.01, no, dim)
    M_mb = data_m
    X_mb = norm_data_x
    X_mb = M_mb * X_mb + (1 - M_mb) * Z_mb

    imputed_data = sess.run([G_sample], feed_dict={X: X_mb, M: M_mb})[0]
    imputed_data = data_m * norm_data_x + (1 - data_m) * imputed_data

    # Renormalization
    imputed_data = renormalization(imputed_data, norm_parameters)

    # Rounding
    imputed_data = rounding(imputed_data, data_x)

    return imputed_data

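# A minimal end-to-end sketch (an illustrative assumption, not part of the
# original code): it pairs the numpy-based data_loader above, which returns
# (data_x, miss_data_x, data_m), with this gain() implementation and reports
# RMSE on the artificially removed entries. It assumes data/spam.csv exists;
# the hyperparameter values are illustrative defaults, and the RMSE is
# computed here on the original data scale.
def _run_gain_example():
    gain_parameters = {'batch_size': 128, 'hint_rate': 0.9,
                       'alpha': 100, 'iterations': 10000}
    ori_data_x, miss_data_x, data_m = data_loader('spam', miss_rate=0.2)
    imputed_data_x = gain(miss_data_x, gain_parameters)
    # RMSE over the missing entries only
    rmse = np.sqrt(np.sum(((1 - data_m) * ori_data_x
                           - (1 - data_m) * imputed_data_x) ** 2)
                   / np.sum(1 - data_m))
    print('RMSE on missing entries:', rmse)
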
def PC_GAIN(incomplete_data_x, gain_parameters, data_m):
    '''Impute missing values in incomplete_data_x.

    Args:
      - incomplete_data_x: original data with missing values
      - gain_parameters: PC_GAIN network parameters:
        - batch_size: Batch size, 64
        - hint_rate: Hint rate, 0.9
        - alpha: Hyperparameter, 200
        - beta: Hyperparameter, 20
        - lambda_: Hyperparameter, 0.2
        - k: Hyperparameter, 4
        - iterations: Iterations, 10000
      - data_m: indicator matrix for missing components

    Returns:
      - imputed_data: imputed data
    '''
    # System parameters
    batch_size = gain_parameters['batch_size']
    hint_rate = gain_parameters['hint_rate']
    alpha = gain_parameters['alpha']
    beta = gain_parameters['beta']
    lambda_ = gain_parameters['lambda_']
    k = gain_parameters['k']
    iterations = gain_parameters['iterations']
    cluster_species = gain_parameters['cluster_species']

    # Other parameters
    no, dim = incomplete_data_x.shape

    # Hidden state dimensions
    h_dim = int(dim)

    # Normalization
    norm_data, norm_parameters = normalization(incomplete_data_x)
    norm_data_x = np.nan_to_num(norm_data, 0)

    ## PC_GAIN architecture
    X = tf.placeholder(tf.float32, shape=[None, dim])
    M = tf.placeholder(tf.float32, shape=[None, dim])
    H = tf.placeholder(tf.float32, shape=[None, dim])
    Z = tf.placeholder(tf.float32, shape=[None, dim])
    Y = tf.placeholder(tf.float32, shape=[None, k])

    # Discriminator variables
    D_W1 = tf.Variable(xavier_init([dim * 2, h_dim]))  # Data + Hint as inputs
    D_b1 = tf.Variable(tf.zeros(shape=[h_dim]))
    D_W2 = tf.Variable(xavier_init([h_dim, h_dim]))
    D_b2 = tf.Variable(tf.zeros(shape=[h_dim]))
    D_W3 = tf.Variable(xavier_init([h_dim, dim]))
    D_b3 = tf.Variable(tf.zeros(shape=[dim]))  # Multi-variate outputs
    theta_D = [D_W1, D_W2, D_W3, D_b1, D_b2, D_b3]

    # Generator variables
    # Data + Mask as inputs (random noise is in missing components)
    G_W1 = tf.Variable(xavier_init([dim * 2, h_dim]))
    G_b1 = tf.Variable(tf.zeros(shape=[h_dim]))
    G_W2 = tf.Variable(xavier_init([h_dim, h_dim]))
    G_b2 = tf.Variable(tf.zeros(shape=[h_dim]))
    G_W3 = tf.Variable(xavier_init([h_dim, dim]))
    G_b3 = tf.Variable(tf.zeros(shape=[dim]))
    theta_G = [G_W1, G_W2, G_W3, G_b1, G_b2, G_b3]

    # Classifier variables
    C_W1 = tf.Variable(xavier_init([dim, h_dim]))
    C_b1 = tf.Variable(tf.zeros(shape=[h_dim]))
    C_W2 = tf.Variable(xavier_init([h_dim, h_dim]))
    C_b2 = tf.Variable(tf.zeros(shape=[h_dim]))
    C_W3 = tf.Variable(xavier_init([h_dim, k]))
    C_b3 = tf.Variable(tf.zeros(shape=[k]))
    theta_C = [C_W1, C_b1, C_W2, C_b2, C_W3, C_b3]

    ## PC_GAIN functions
    # Generator
    def generator(x, m):
        # Concatenate Mask and Data
        inputs = tf.concat(values=[x, m], axis=1)
        G_h1 = tf.nn.relu(tf.matmul(inputs, G_W1) + G_b1)
        G_h2 = tf.nn.relu(tf.matmul(G_h1, G_W2) + G_b2)
        # MinMax normalized output
        G_prob = tf.nn.sigmoid(tf.matmul(G_h2, G_W3) + G_b3)
        return G_prob

    # Discriminator
    def discriminator(x, h):
        # Concatenate Data and Hint
        inputs = tf.concat(values=[x, h], axis=1)
        D_h1 = tf.nn.relu(tf.matmul(inputs, D_W1) + D_b1)
        D_h2 = tf.nn.relu(tf.matmul(D_h1, D_W2) + D_b2)
        D_logit = tf.matmul(D_h2, D_W3) + D_b3
        D_prob = tf.nn.sigmoid(D_logit)
        return D_prob, D_logit

    # Classifier (the neural network classifier mentioned in the paper)
    def classer(feature):
        C_h1 = tf.nn.relu(tf.matmul(feature, C_W1) + C_b1)
        C_h2 = tf.nn.relu(tf.matmul(C_h1, C_W2) + C_b2)
        C_h3 = tf.matmul(C_h2, C_W3) + C_b3
        C_prob = tf.nn.softmax(C_h3)
        return C_prob

    ## PC_GAIN structure
    # Generator
    G_sample = generator(X, M)
    # Combine with observed data
    Hat_X = X * M + G_sample * (1 - M)
    # Discriminator
    D_prob, D_logit = discriminator(Hat_X, H)

    ## PC_GAIN loss
    D_loss_temp = -tf.reduce_mean(M * tf.log(D_prob + 1e-8)
                                  + (1 - M) * tf.log(1. - D_prob + 1e-8))
    G_loss_temp = -tf.reduce_mean((1 - M) * tf.log(D_prob + 1e-8))
    G_loss_with_C = -tf.reduce_mean(Y * tf.log(Y + 1e-8))
    MSE_loss = tf.reduce_mean((M * X - M * G_sample) * (M * X - M * G_sample)) / tf.reduce_mean(M)
    D_loss = D_loss_temp
    G_loss_pre = G_loss_temp + alpha * MSE_loss
    G_loss = G_loss_temp + alpha * MSE_loss + beta * G_loss_with_C

    ## PC_GAIN solver
    D_solver = tf.train.AdamOptimizer().minimize(D_loss, var_list=theta_D)
    G_solver_pre = tf.train.AdamOptimizer().minimize(G_loss_pre, var_list=theta_G)
    G_solver = tf.train.AdamOptimizer().minimize(G_loss, var_list=theta_G)

    ## Iterations
    init = tf.global_variables_initializer()
    with tf.Session() as sess:
        sess.run(init)

        ## Select pre-training data: rank samples by how many components are missing
        loss_rate = []
        for i in range(no):
            index = 0
            for j in range(dim):
                if data_m[i, j] == 0:
                    index = index + 1
            loss_rate.append([index, i])
        loss_rate = sorted(loss_rate, key=(lambda x: x[0]))
        no_x_L = int(no * lambda_)
        index_x_L = []
        for i in range(no_x_L):
            index_x_L.append(loss_rate[i][1])
        norm_data_x_L = norm_data_x[index_x_L, :]
        data_m_L = data_m[index_x_L, :]

        ## Pre-training
        print('...Pre-training')
        for it in tqdm(range(int(iterations * 0.7))):
            batch_idx = sample_batch_index(no_x_L, batch_size)
            X_mb = norm_data_x_L[batch_idx, :]
            M_mb = data_m_L[batch_idx, :]
            Z_mb = uniform_sampler(0, 0.01, batch_size, dim)
            H_mb_temp = binary_sampler(hint_rate, batch_size, dim)
            H_mb = M_mb * H_mb_temp
            X_mb = M_mb * X_mb + (1 - M_mb) * Z_mb
            _, D_loss_curr, D_logit_curr, D_prob_curr = sess.run(
                [D_solver, D_loss_temp, D_logit, D_prob],
                feed_dict={M: M_mb, X: X_mb, H: H_mb})
            _, G_loss_curr, MSE_loss_curr = sess.run(
                [G_solver_pre, G_loss_temp, MSE_loss],
                feed_dict={X: X_mb, M: M_mb, H: H_mb})

        Z_mb = uniform_sampler(0, 0.01, no_x_L, dim)
        M_mb = data_m_L
        X_mb = norm_data_x_L
        X_mb = M_mb * X_mb + (1 - M_mb) * Z_mb
        imputed_data_L = sess.run([G_sample], feed_dict={X: X_mb, M: M_mb})[0]
        imputed_data_L = data_m_L * norm_data_x_L + (1 - data_m_L) * imputed_data_L

        ## Select the clustering method
        if cluster_species == 'KM':
            data_c, data_class = KM(imputed_data_L, k)
        elif cluster_species == 'SC':
            data_c, data_class = SC(imputed_data_L, k)
        elif cluster_species == 'AC':
            data_c, data_class = AC(imputed_data_L, k)
        elif cluster_species == 'KMPP':
            data_c, data_class = KMPP(imputed_data_L, k)
        else:
            exit('Unknown clustering method')

        ## Train a multi-class SVM on the pseudo-labels.
        ## You can also choose other classifiers,
        ## such as the neural network classifier mentioned in the paper.
        coder = preprocessing.OneHotEncoder()
        model = svm.SVC(kernel="linear", decision_function_shape="ovo")
        coder.fit(data_class.reshape(-1, 1))
        model.fit(imputed_data_L, data_class)

        ## Update the generator G and the discriminator D.
        ## To avoid the effects of pre-training,
        ## you can also choose to reinitialize the generator parameters.
        for it in tqdm(range(iterations)):
            batch_idx = sample_batch_index(no, batch_size)
            X_mb = norm_data_x[batch_idx, :]
            M_mb = data_m[batch_idx, :]
            Z_mb = uniform_sampler(0, 0.01, batch_size, dim)
            H_mb_temp = binary_sampler(hint_rate, batch_size, dim)
            H_mb = M_mb * H_mb_temp
            X_mb = M_mb * X_mb + (1 - M_mb) * Z_mb
            _, D_loss_curr, D_logit_curr, D_prob_curr = sess.run(
                [D_solver, D_loss_temp, D_logit, D_prob],
                feed_dict={M: M_mb, X: X_mb, H: H_mb})

            ## Introduce pseudo-label supervision
            Hat_X_curr = sess.run(Hat_X, feed_dict={X: X_mb, M: M_mb, H: H_mb})
            y_pred = model.predict(Hat_X_curr)
            sample_prob = coder.transform(y_pred.reshape(-1, 1)).toarray()
            _, G_loss_curr, MSE_loss_curr, G_loss_with_C_curr = sess.run(
                [G_solver, G_loss_temp, MSE_loss, G_loss_with_C],
                feed_dict={X: X_mb, M: M_mb, H: H_mb, Y: sample_prob})

        ## Return imputed data
        Z_mb = uniform_sampler(0, 0.01, no, dim)
        M_mb = data_m
        X_mb = norm_data_x
        X_mb = M_mb * X_mb + (1 - M_mb) * Z_mb
        imputed_data = sess.run([G_sample], feed_dict={X: X_mb, M: M_mb})[0]
        imputed_data = data_m * norm_data_x + (1 - data_m) * imputed_data
        imputed_data = renormalization(imputed_data, norm_parameters)

        return imputed_data

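# A minimal usage sketch (an illustrative assumption, not part of the original
# code): the hyperparameter values mirror the defaults listed in the PC_GAIN
# docstring above, with k-means ('KM') as the clustering method. It assumes a
# data_loader that returns (complete_data_x, incomplete_data_x, data_m), such
# as the one defined earlier in this collection, and a 'spam' CSV file.
def _run_pc_gain_example():
    gain_parameters = {'batch_size': 64, 'hint_rate': 0.9, 'alpha': 200,
                       'beta': 20, 'lambda_': 0.2, 'k': 4,
                       'iterations': 10000, 'cluster_species': 'KM'}
    complete_data_x, incomplete_data_x, data_m = data_loader('spam', miss_rate=0.2)
    imputed_data = PC_GAIN(incomplete_data_x, gain_parameters, data_m)
    print('Imputed shape:', imputed_data.shape)
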
def gain(data_x, gain_parameters, ori_data_x, train_index, test_index, mechanism):
    '''Impute missing values in data_x (PyTorch variant with two discriminators).

    Args:
      - data_x: original data with missing values
      - gain_parameters: GAIN network parameters:
        - batch_size: Batch size
        - hint_rate: Hint rate
        - alpha: Hyper-parameter
        - iterations: Iterations
      - ori_data_x: fully observed reference data used for evaluation
      - train_index / test_index: row indices of the train and test splits
      - mechanism: missingness mechanism ('mar' uses MAR-specific hint vectors)

    Evaluation of the imputation is delegated to test(...) at the end.
    '''
    # Define mask matrix
    data_m = 1 - np.isnan(data_x)

    # System parameters
    batch_size = gain_parameters['batch_size']
    hint_rate = gain_parameters['hint_rate']
    alpha = gain_parameters['alpha']
    iterations = gain_parameters['iterations']

    # Other parameters
    no, dim = data_x.shape
    no_train = len(train_index)

    # Hidden state dimensions
    h_dim = int(dim)

    # Normalization
    norm_data, norm_parameters = normalization(data_x)
    norm_data_x = np.nan_to_num(norm_data, nan=0)

    # PyTorch models: one generator, two discriminators
    generator = Generator(dim, h_dim)
    discriminator = Discriminator(dim, h_dim)
    discriminator2 = Discriminator(dim, h_dim)

    # Optimizers
    generator_optimizer = torch.optim.Adam(generator.parameters(), lr=0.0001, betas=(0.5, 0.999))
    discriminator_optimizer = torch.optim.SGD(discriminator.parameters(), lr=0.0001)
    discriminator2_optimizer = torch.optim.SGD(discriminator2.parameters(), lr=0.0001)

    for i in tqdm(range(iterations), desc='pytorch'):
        # Sample batch
        batch_idx = sample_batch_index(no_train, batch_size)
        X_mb = norm_data_x[train_index][batch_idx, :]
        M_mb = data_m[train_index][batch_idx, :]

        # Sample random vectors
        Z_mb = uniform_sampler(0, 0.01, batch_size, dim)

        # Sample hint vectors
        if mechanism == 'mar':
            H_mb = hint_for_mar(hint_rate, M_mb)
            H_mb_2 = hint_for_mar(hint_rate, M_mb)
        else:
            H_mb = M_mb * binary_sampler(hint_rate, batch_size, dim)
            H_mb_2 = M_mb * binary_sampler(hint_rate, batch_size, dim)

        # Combine random vectors with observed vectors
        X_mb = M_mb * X_mb + (1 - M_mb) * Z_mb

        X_mb = torch.Tensor(X_mb)
        M_mb = torch.Tensor(M_mb)
        H_mb = torch.Tensor(H_mb)
        H_mb_2 = torch.Tensor(H_mb_2)

        G_sample = generator(X_mb, M_mb)
        Hat_X = X_mb * M_mb + G_sample * (1 - M_mb)
        D_prob = discriminator(Hat_X, H_mb)
        D_prob2 = discriminator2(Hat_X, H_mb_2)

        d_loss_value = d_loss(M_mb, D_prob)
        d_loss_value2 = d_loss(M_mb, D_prob2)
        # The generator is trained against the average of the two discriminators
        g_loss_value = g_loss(M_mb, 0.5 * D_prob + 0.5 * D_prob2, alpha, X_mb, G_sample)

        discriminator2_optimizer.zero_grad()
        discriminator_optimizer.zero_grad()
        generator_optimizer.zero_grad()
        g_loss_value.backward(retain_graph=True)
        d_loss_value.backward(retain_graph=True)
        d_loss_value2.backward(retain_graph=True)
        generator_optimizer.step()
        discriminator_optimizer.step()
        discriminator2_optimizer.step()

    test(data_m, data_x, dim, generator, no, norm_data_x, norm_parameters, ori_data_x, test_index)

def gain(data_x, gain_parameters):
    '''Impute missing values in data_x.

    Args:
      - data_x: original data with missing values
      - gain_parameters: GAIN network parameters:
        - batch_size: Batch size
        - hint_rate: Hint rate
        - alpha: Hyperparameter
        - iterations: Iterations

    Returns:
      - imputed_data: imputed data
    '''
    # Define mask matrix
    data_m = (1 - np.isnan(data_x)).astype(float)

    # System parameters
    batch_size = gain_parameters['batch_size']
    hint_rate = gain_parameters['hint_rate']
    alpha = gain_parameters['alpha']
    iterations = gain_parameters['iterations']

    # Other parameters
    no, dim = data_x.shape

    # Hidden state dimensions
    h_dim = int(dim)

    # Normalization
    norm_data, norm_parameters = normalization(data_x)
    norm_data_x = np.nan_to_num(norm_data, 0)

    # Parameter initialization
    X = tf.convert_to_tensor(norm_data_x)
    X = tf.dtypes.cast(X, tf.float32)
    M = tf.convert_to_tensor(data_m)
    M = tf.dtypes.cast(M, tf.float32)
    X_input = tf.concat(values=[X, M], axis=1)

    ## GAIN architecture
    # Generator
    class Generator(tf.keras.Model):
        def __init__(self):
            super().__init__()
            self.flatten = layers.Flatten(input_shape=[dim * 2])
            self.dense1 = layers.Dense(h_dim, activation='relu')
            self.dense2 = layers.Dense(h_dim, activation='relu')
            self.dense_output = layers.Dense(dim, activation='sigmoid')

        def call(self, inputs, training=None):
            x = self.flatten(inputs)
            x = self.dense1(x)
            x = self.dense2(x)
            x = self.dense_output(x)
            return x

    # Discriminator
    class Discriminator(tf.keras.Model):
        def __init__(self):
            super().__init__()
            self.flatten = layers.Flatten(input_shape=[dim * 2])
            self.dense1 = layers.Dense(h_dim, activation='relu')
            self.dense2 = layers.Dense(h_dim, activation='relu')
            self.dense_output = layers.Dense(dim, activation='sigmoid')

        def call(self, inputs, training=None):
            x = self.flatten(inputs)
            x = self.dense1(x)
            x = self.dense2(x)
            x = self.dense_output(x)
            return x

    ## GAIN loss
    # Generator
    def generator_loss(generator, discriminator, x, m):
        generator.trainable = True
        discriminator.trainable = False
        G_input = tf.concat(values=[x, m], axis=1)
        G_sample = generator(G_input)
        MSE_loss = tf.reduce_mean((m * x - m * G_sample) ** 2) / tf.reduce_mean(m)
        D_input = tf.concat(values=[G_sample, m], axis=1)
        D_prob = discriminator(D_input)
        G_loss_tmp = -tf.reduce_mean((1 - m) * tf.math.log(D_prob + 1e-8))
        return G_loss_tmp + alpha * MSE_loss

    # Discriminator
    def discriminator_loss(generator, discriminator, x, m, h):
        generator.trainable = False
        discriminator.trainable = True
        G_input = tf.concat(values=[x, m], axis=1)
        G_sample = generator(G_input)
        x_hat = x * m + G_sample * (1 - m)
        D_input = tf.concat(values=[x_hat, h], axis=1)
        D_prob = discriminator(D_input)
        return -tf.reduce_mean(m * tf.math.log(D_prob + 1e-8)
                               + (1 - m) * tf.math.log(1. - D_prob + 1e-8))

    # Build
    generator = Generator()
    generator.build(input_shape=(None, 2 * dim))
    g_optimizer = tf.keras.optimizers.Adam()
    discriminator = Discriminator()
    discriminator.build(input_shape=(None, 2 * dim))
    d_optimizer = tf.keras.optimizers.Adam()

    # Training
    one_tensor = tf.constant(1., shape=(batch_size, dim), dtype=float)
    for _ in tqdm(range(iterations)):
        # Sample batch
        batch_idx = sample_batch_index(no, batch_size)
        X_mb = tf.gather(X, batch_idx)
        M_mb = tf.gather(M, batch_idx)
        Z_mb = tf.convert_to_tensor(uniform_sampler(0, 0.01, batch_size, dim), dtype=float)
        H_mb_tmp = tf.convert_to_tensor(binary_sampler(hint_rate, batch_size, dim), dtype=float)
        H_mb = tf.math.multiply(M_mb, H_mb_tmp)

        # Combine random vectors with observed vectors: X_mb = M_mb * X_mb + (1 - M_mb) * Z_mb
        X_mb = tf.math.add(tf.math.multiply(M_mb, X_mb),
                           tf.math.multiply(tf.math.subtract(one_tensor, M_mb), Z_mb))

        # Train the discriminator
        with tf.GradientTape() as tape:
            d_loss = discriminator_loss(generator, discriminator, X_mb, M_mb, H_mb)
        grads = tape.gradient(d_loss, discriminator.trainable_variables)
        d_optimizer.apply_gradients(zip(grads, discriminator.trainable_variables))

        # Train the generator
        with tf.GradientTape() as tape:
            g_loss = generator_loss(generator, discriminator, X_mb, M_mb)
        grads = tape.gradient(g_loss, generator.trainable_variables)
        g_optimizer.apply_gradients(zip(grads, generator.trainable_variables))

    ## Return imputed data
    imputed_data = np.array([]).reshape(0, dim)
    train_data = tf.data.Dataset.from_tensor_slices(X_input).batch(batch_size)
    for batch in train_data:
        X_tmp = generator(batch).numpy()
        imputed_data = np.vstack([imputed_data, X_tmp])

    # Renormalization
    imputed_data = renormalization(imputed_data, norm_parameters)

    # Recovery
    imputed_data = data_m * np.nan_to_num(data_x) + (1 - data_m) * imputed_data

    # Rounding
    imputed_data = rounding(imputed_data, data_x)

    return imputed_data

import numpy as np
import pandas as pd

from utils import sample_batch_index, binary_sampler
from tqdm import trange

if __name__ == '__main__':
    # Load data
    file_name = 'data/house.csv'
    house_df = pd.read_csv(file_name)
    no, dim = house_df.shape
    data_x = house_df.values.astype(np.float32)

    num_samples = 200
    miss_rate = 0.3

    for i in trange(num_samples):
        # Random sub-samples of 10,000 rows
        sample_idx = sample_batch_index(no, 10000)
        data_x_i = data_x[sample_idx, :]
        no_i, dim_i = data_x_i.shape
        np.savetxt("./samples/complete/sample_{}.csv".format(i), data_x_i, delimiter=",")

        # Introduce missing data
        data_m = binary_sampler(1 - miss_rate, no_i, dim_i)
        miss_data_x = data_x_i.copy()
        miss_data_x[data_m == 0] = np.nan
        np.savetxt("./samples/MCAR/sample_{}.csv".format(i), miss_data_x, delimiter=",")