Example #1
File: reward.py  Project: microsoft/VAEM
 def completion(self, x, mask, vae):
     '''
     Generate new samples conditioned on observations.
     :param x: underlying partially observed data
     :param mask: mask of missingness
     :param vae: a pre-trained VAE
     Instance attributes used: self._M (number of MC samples), self._cat_dims
     (number of possible outcomes per categorical variable), self._dic_var_type
     (indicator of whether each variable is continuous), and self._list_discrete
     (list of discrete variables).
     :return: sampled missing data, an M x N x D array, where M is the number of samples.
     '''
     # Decompress the mask: expand each categorical variable's mask entry
     # across its one-hot columns, then append the continuous-variable mask.
     mask_flt = mask[:, np.ndarray.flatten(np.argwhere(self._dic_var_type == 0))]
     mask_cat_oh = np.array([]).reshape(x.shape[0], 0)
     for d in range(len(self._cat_dims)):
         temp = np.ones((x.shape[0], self._cat_dims[d]))
         temp[mask[:, d] == 0, :] = 0
         mask_cat_oh = np.concatenate([mask_cat_oh, temp], 1)
     mask = np.concatenate([mask_cat_oh, mask_flt], 1)
     im = np.zeros((self._M, x.shape[0], x.shape[1]))
     for m in range(self._M):
         np.random.seed(42 + m)  # fixed seed per sample; added for bar plots only
         noisy_samples = vae.im(x, mask)
         # Keep observed entries of x; fill missing entries with VAE samples.
         noisy_samples_mix = x * mask + noisy_samples * (1 - mask)
         inverted_samples = process.invert_noise(noisy_samples_mix, self._list_discrete, self._records_d)
         im[m, :, :] = inverted_samples
     return im
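
The mask-decompression step above expands a per-variable missingness mask into the model's one-hot input space. Below is a minimal standalone sketch of the same idea; the toy values of cat_dims, dic_var_type, and mask are hypothetical, not taken from the VAEM repository.

import numpy as np

# Hypothetical setup: two categorical variables (3 and 2 outcomes) followed by
# one continuous variable; in the mask, 1 = observed and 0 = missing.
cat_dims = np.array([3, 2])
dic_var_type = np.array([1, 1, 0])
mask = np.array([[1., 0., 1.],
                 [0., 1., 1.]])

# Continuous columns pass through unchanged.
mask_flt = mask[:, np.ndarray.flatten(np.argwhere(dic_var_type == 0))]

# Each categorical mask entry is repeated across that variable's one-hot columns.
mask_cat_oh = np.array([]).reshape(mask.shape[0], 0)
for d in range(len(cat_dims)):
    temp = np.ones((mask.shape[0], cat_dims[d]))
    temp[mask[:, d] == 0, :] = 0
    mask_cat_oh = np.concatenate([mask_cat_oh, temp], 1)

decompressed = np.concatenate([mask_cat_oh, mask_flt], 1)
print(decompressed)
# [[1. 1. 1. 0. 0. 1.]
#  [0. 0. 0. 1. 1. 1.]]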
Example #2
 def get_imputation(self, x, mask_obs, cat_dims, dic_var_type):
     '''
     Impute x under mask_obs; return the decoded data, the latent posterior z,
     and the per-category probabilities.
     '''
     # Decompress the mask into one-hot space (same scheme as in Example #1).
     mask_flt = mask_obs[:, np.ndarray.flatten(np.argwhere(dic_var_type == 0))]
     mask_cat_oh = np.array([]).reshape(x.shape[0], 0)
     for d in range(len(cat_dims)):
         temp = np.ones((x.shape[0], cat_dims[d]))
         temp[mask_obs[:, d] == 0, :] = 0
         mask_cat_oh = np.concatenate([mask_cat_oh, temp], 1)
     mask_obs = np.concatenate([mask_cat_oh, mask_flt], 1)
     # Run the decoder and the latent posterior in the active TF session.
     decoded_noisy = self._sesh.run(self.decoded,
                                    feed_dict={self.x: x, self.mask: mask_obs,
                                               self.x_induce: self._x_train})
     z_posterior = self._sesh.run(self.z,
                                  feed_dict={self.x: x, self.mask: mask_obs,
                                             self.x_induce: self._x_train})
     decoded = process.invert_noise(decoded_noisy, self._list_discrete, self._records_d)
     # Revert the decoded one-hot blocks back to integer categories.
     dim_cat = len(np.argwhere(cat_dims != -1))
     decoded_cat = decoded[:, 0:self._DIM_CAT]
     decoded_flt = decoded[:, self._DIM_CAT:]
     decoded_cat_int = np.zeros((decoded.shape[0], dim_cat))
     cumsum_cat_dims = np.concatenate(([0], np.cumsum(cat_dims)))
     decoded_cat_p = []
     for d in range(len(cat_dims)):
         # Normalize this variable's one-hot block into row-wise probabilities.
         decoded_cat_int_p = decoded_cat[:, cumsum_cat_dims[d]:cumsum_cat_dims[d + 1]]
         decoded_cat_int_p = decoded_cat_int_p / np.sum(decoded_cat_int_p, 1, keepdims=True)
         if d == 0:
             decoded_cat_p = decoded_cat_int_p
         else:
             decoded_cat_p = np.concatenate([decoded_cat_p, decoded_cat_int_p], 1)
         # Sample one integer category per row from those probabilities.
         for n in range(decoded.shape[0]):
             decoded_cat_int[n, d] = np.random.choice(len(decoded_cat_int_p[n, :]), 1, p=decoded_cat_int_p[n, :])
         print(decoded_cat_int[:, d].max())  # debug: largest sampled category index
     decoded = np.concatenate((decoded_cat_int, decoded_flt), axis=1)
     return decoded, z_posterior, decoded_cat_p
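
The loop above normalizes each decoded one-hot block into a probability vector and samples an integer category per row. A minimal standalone sketch of that step, with hypothetical decoder outputs:

import numpy as np

# Hypothetical decoder output for one categorical variable with three
# outcomes and two rows; values need not sum to one before normalization.
decoded_block = np.array([[0.2, 0.5, 0.3],
                          [4.0, 1.0, 1.0]])

# Normalize each row into a probability vector.
probs = decoded_block / np.sum(decoded_block, 1, keepdims=True)

# Sample one integer category per row.
rng = np.random.default_rng(0)
cats = np.array([rng.choice(len(p), p=p) for p in probs])
print(cats)  # one sampled category index per row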
Example #3
 def predictive_loss(self, x, mask, cat_dims, dic_var_type, M):
     '''
     Compute predictive losses (negative log-likelihood) for the active
     learning phase. We assume that the last column of x is the target
     variable of interest.
     :param x: data matrix; the last column is the target variable of interest
     :param mask: mask that indicates observed and missing data locations
     :param cat_dims: number of possible outcomes per categorical variable
     :param dic_var_type: indicator of whether each variable is continuous
     :param M: number of MC samples
     :return: negative log-likelihood (nllh) and mean absolute error per datapoint (ae)
     '''
     lh = 0
     rmse = 0
     ae = 0
     uncertainty_data = np.zeros((x.shape[0], M))
     # Decompress the mask into one-hot space (same scheme as in Example #1).
     mask_flt = mask[:, np.ndarray.flatten(np.argwhere(dic_var_type == 0))]
     mask_cat_oh = np.array([]).reshape(x.shape[0], 0)
     for d in range(len(cat_dims)):
         temp = np.ones((x.shape[0], cat_dims[d]))
         temp[mask[:, d] == 0, :] = 0
         mask_cat_oh = np.concatenate([mask_cat_oh, temp], 1)
     mask = np.concatenate([mask_cat_oh, mask_flt], 1)
     # Output noise std from the model, used as sigma in the Gaussian
     # likelihood below.
     auto_std = self._sesh.run(self.auto_std,
                               feed_dict={
                                   self.x: x,
                                   self.mask: mask,
                                   self.x_induce: self._x_train
                               })
     for m in range(M):
         decoded_noisy = self._sesh.run(self.decoded,
                                        feed_dict={
                                            self.x: x,
                                            self.mask: mask,
                                            self.x_induce: self._x_train
                                        })
         decoded = process.invert_noise(decoded_noisy, self._list_discrete,
                                        self._records_d)
         target = x[:, -1]
         output = decoded[:, -1]
         uncertainty_data[:, m] = decoded[:, -1]
         lh += np.exp(-0.5 * np.square(target - output) /
                      (np.square(auto_std[:, -1])) -
                      np.log(auto_std[:, -1]) - 0.5 * np.log(2 * np.pi))
         rmse += np.sqrt(
             np.sum(np.square(target - output)) / mask.shape[0])
         ae += np.abs(target - output)
     nllh = -np.log(lh / M)  # negative log of the MC-averaged likelihood
     rmse /= M  # computed but not returned
     ae /= M
     return nllh, ae
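
The likelihood accumulation above is a Monte Carlo estimate of the predictive density: each pass evaluates a Gaussian pdf of the target under one decoded sample, the M values are averaged, and the negative log is taken. A minimal standalone sketch with hypothetical arrays standing in for the decoder outputs and auto_std:

import numpy as np

# Hypothetical targets and M = 3 Monte Carlo decodes of the target column.
target = np.array([1.0, 2.0])
outputs = np.array([[0.9, 2.2],
                    [1.1, 1.8],
                    [1.0, 2.1]])
std = np.array([0.5, 0.5])  # stands in for auto_std[:, -1]

# Gaussian pdf of the target under each sample, averaged over the M samples.
lh = np.exp(-0.5 * np.square(target - outputs) / np.square(std)
            - np.log(std) - 0.5 * np.log(2 * np.pi)).mean(axis=0)
nllh = -np.log(lh)  # per-datapoint negative log-likelihood
print(nllh)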
Example #4
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import tensorflow as tf  # VAEM uses the TF1 graph/session API
# process, p_vae_active_learning, and args are provided by the VAEM repository.


def encode2(data_decode, list_discrete, records_d, fast_plot):
    # Unpack the masked, decompressed data produced by the load_data pipeline.
    Data_train_decomp, Data_train_noisy_decomp, mask_train_decomp, Data_test_decomp, mask_test_comp, mask_test_decomp, cat_dims, DIM_FLT, dic_var_type = data_decode

    vae = p_vae_active_learning(Data_train_decomp, Data_train_noisy_decomp,
                                mask_train_decomp, Data_test_decomp,
                                mask_test_comp, mask_test_decomp, cat_dims,
                                DIM_FLT, dic_var_type, args, list_discrete,
                                records_d)

    x_real = process.compress_data(
        Data_train_decomp, cat_dims,
        dic_var_type)  ## x_real still needs conversion
    x_real_cat_p = Data_train_decomp[:, 0:(cat_dims.sum()).astype(int)]
    tf.reset_default_graph()

    # An all-zero mask marks every entry as missing, so the call below
    # reconstructs the whole data matrix from the model.
    x_recon, z_posterior, x_recon_cat_p = vae.get_imputation(
        Data_train_noisy_decomp, mask_train_decomp * 0, cat_dims,
        dic_var_type)  ## one-hot already converted to integer

    # Min-max scale the real data into [min_Data, max_Data] for plotting;
    # scaling_factor lets a caller map scaled values back to the original range.
    max_Data = 0.7
    min_Data = 0.3
    Data_std = (x_real - x_real.min(axis=0)) / (x_real.max(axis=0) -
                                                x_real.min(axis=0))
    scaling_factor = (x_real.max(axis=0) - x_real.min(axis=0)) / (max_Data -
                                                                  min_Data)
    Data_real = Data_std * (max_Data - min_Data) + min_Data

    fast_plot = 1  # note: overrides the fast_plot argument, forcing the fast path

    sub_id = [1, 2, 10]  # subset of columns to visualize

    if fast_plot:
        Data_real = pd.DataFrame(Data_real[:, sub_id])
        g = sns.pairplot(Data_real.sample(min(1000, x_real.shape[0])),
                         diag_kind='kde')
        g = g.map_diag(sns.distplot, bins=50, norm_hist=True)
        g.set(xlim=(min_Data, max_Data), ylim=(min_Data, max_Data))
    else:
        Data_real = pd.DataFrame(Data_real[:, sub_id])
        g = sns.pairplot(Data_real.sample(min(10000, x_real.shape[0])),
                         diag_kind='kde')
        g = g.map_diag(sns.distplot, bins=50, norm_hist=True)
        g = g.map_upper(plt.scatter, marker='+')
        g = g.map_lower(sns.kdeplot, cmap="hot", shade=True, bw=.1)
        g.set(xlim=(min_Data, max_Data), ylim=(min_Data, max_Data))

    # Scale the imputed (fake) data with the real data's min/max so the two
    # pairplots share a common range.
    Data_fake_noisy = x_recon
    Data_fake = process.invert_noise(Data_fake_noisy, list_discrete, records_d)

    Data_std = (Data_fake - x_real.min(axis=0)) / (x_real.max(axis=0) -
                                                   x_real.min(axis=0))
    Data_fake = Data_std * (max_Data - min_Data) + min_Data

    sub_id = [1, 2, 10]

    if fast_plot:
        g = sns.pairplot(pd.DataFrame(Data_fake[:, sub_id]).sample(
            min(1000, x_real.shape[0])),
                         diag_kind='kde')
        g = g.map_diag(sns.distplot, bins=50, norm_hist=True)
        g.set(xlim=(min_Data, max_Data), ylim=(min_Data, max_Data))
    else:
        g = sns.pairplot(pd.DataFrame(Data_fake[:, sub_id]).sample(
            min(1000, x_real.shape[0])),
                         diag_kind='kde')
        g = g.map_diag(sns.distplot, bins=50, norm_hist=True)
        g = g.map_upper(plt.scatter, marker='+')
        g = g.map_lower(sns.kdeplot, cmap="hot", shade=True, bw=.1)
        g.set(xlim=(min_Data, max_Data), ylim=(min_Data, max_Data))

    return vae, scaling_factor
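
encode2 min-max scales both the real and the imputed data into [0.3, 0.7] and returns scaling_factor, with which a caller can undo the scaling. A minimal standalone sketch of that round trip, using hypothetical data rather than the VAEM pipeline:

import numpy as np

# Hypothetical raw data with two columns on different scales.
x_real = np.array([[0.0, 10.0],
                   [5.0, 20.0],
                   [10.0, 30.0]])
max_Data, min_Data = 0.7, 0.3

# Forward: min-max scale into [min_Data, max_Data], as encode2 does.
lo, hi = x_real.min(axis=0), x_real.max(axis=0)
scaling_factor = (hi - lo) / (max_Data - min_Data)
scaled = (x_real - lo) / (hi - lo) * (max_Data - min_Data) + min_Data

# Inverse: recover the original values using scaling_factor.
recovered = (scaled - min_Data) * scaling_factor + lo
assert np.allclose(recovered, x_real)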
Example #5
[cell output: one numeric value per categorical variable, printed by get_imputation]

In [7]:
Data_fake_noisy = x_recon
Data_fake = process.invert_noise(Data_fake_noisy, list_discrete_compressed, records_d)

# Scale the imputed data with the real data's min/max, as in encode2.
Data_std = (Data_fake - x_real.min(axis=0)) / (x_real.max(axis=0) - x_real.min(axis=0))
Data_fake = Data_std * (max_Data - min_Data) + min_Data

sub_id = [1, 2, 10]

if fast_plot == 1:
    g = sns.pairplot(pd.DataFrame(Data_fake[:, sub_id]).sample(min(1000, x_real.shape[0])), diag_kind='kde')
    g = g.map_diag(sns.distplot, bins=50, norm_hist=True)
    g.set(xlim=(min_Data, max_Data), ylim=(min_Data, max_Data))
else:
    g = sns.pairplot(pd.DataFrame(Data_fake[:, sub_id]).sample(min(1000, x_real.shape[0])), diag_kind='kde')
    g = g.map_diag(sns.distplot, bins=50, norm_hist=True)
    g = g.map_upper(plt.scatter, marker='+')
    # (completing the truncated branch as in Example #4)
    g = g.map_lower(sns.kdeplot, cmap="hot", shade=True, bw=.1)
    g.set(xlim=(min_Data, max_Data), ylim=(min_Data, max_Data))