def ens_john(X, y, x):
    from sklearn.cluster import KMeans
    from utils import dist
    from itertools import chain
    from torch import distributions as D
    # NOTE: the sampler helpers below are assumed to live in locality_sampler.py,
    # matching the imports used by the other training functions in this file.
    from locality_sampler import gen_Qw, locality_sampler2, get_pseupoch

    n_neurons = 50  # hidden-layer width; not defined in the original snippet (assumed value)

    # locality-sampler configuration
    mean_psu = 1
    mean_ssu = 50
    mean_M = 60
    var_psu = 3
    var_ssu = 7
    var_M = 10

    # cluster centres used to measure how far a query point is from the training data
    kmeans = KMeans(n_clusters=10)
    kmeans.fit(np.concatenate([X], axis=0))
    c = torch.tensor(kmeans.cluster_centers_, dtype=torch.float32)

    class translatedSigmoid(nn.Module):
        def __init__(self):
            super(translatedSigmoid, self).__init__()
            self.beta = nn.Parameter(torch.tensor([1.5]))

        def forward(self, x):
            beta = torch.nn.functional.softplus(self.beta)
            alpha = -beta * (6.9077542789816375)
            return torch.sigmoid((x + alpha) / beta)

    class GPNNModel(nn.Module):
        def __init__(self):
            super(GPNNModel, self).__init__()
            self.mean = nn.Sequential(nn.Linear(1, n_neurons),
                                      nn.Sigmoid(),
                                      nn.Linear(n_neurons, 1))
            self.alph = nn.Sequential(nn.Linear(1, n_neurons),
                                      nn.Sigmoid(),
                                      nn.Linear(n_neurons, 1),
                                      nn.Softplus())
            self.bet = nn.Sequential(nn.Linear(1, n_neurons),
                                     nn.Sigmoid(),
                                     nn.Linear(n_neurons, 1),
                                     nn.Softplus())
            self.trans = translatedSigmoid()

        def forward(self, x, switch):
            d = dist(x, c)
            d_min = d.min(dim=1, keepdim=True)[0]
            s = self.trans(d_min)
            mean = self.mean(x)
            if switch:
                a = self.alph(x)
                b = self.bet(x)
                gamma_dist = D.Gamma(a + 1e-8, 1.0 / (b + 1e-8))
                if self.training:
                    samples_var = gamma_dist.rsample(torch.Size([50]))
                    x_var = (1.0 / (samples_var + 1e-8))
                else:
                    samples_var = gamma_dist.rsample(torch.Size([2000]))
                    x_var = (1.0 / (samples_var + 1e-8))
                var = (1 - s) * x_var + s * torch.tensor([3.5**2])  # HYPERPARAMETER
            else:
                var = torch.tensor([0.05])
            return mean, var

    ens_mean, ens_var = [], []
    for i in range(5):  # train an ensemble of 5 models
        model = GPNNModel()
        optimizer = torch.optim.Adam(model.mean.parameters(), lr=1e-2)
        optimizer2 = torch.optim.Adam(chain(model.alph.parameters(),
                                            model.bet.parameters(),
                                            model.trans.parameters()), lr=1e-3)
        n_iter = 6000
        it = 0
        mean_Q, mean_w = gen_Qw(X, mean_psu, mean_ssu, mean_M)
        var_Q, var_w = gen_Qw(X, var_psu, var_ssu, var_M)
        mean_pseupoch = get_pseupoch(mean_w, 0.5)
        var_pseupoch = get_pseupoch(var_w, 0.5)
        opt_switch = 1
        mean_w = torch.Tensor(mean_w)
        var_w = torch.Tensor(var_w)
        model.train()

        while it < n_iter:
            model.train()
            switch = 1.0 if it > 5000 else 0.0
            if it % 11:
                opt_switch = opt_switch + 1  # change between var and mean optimizer
            if not switch:
                # phase 1: fit the mean with a fixed observation variance
                optimizer.zero_grad()
                m, v = model(X, switch)
                loss = -(-v.log() - (m.flatten() - y)**2 / (2 * v)).sum()
                loss.backward()
                optimizer.step()
            else:
                # phase 2: alternate between mean and variance updates with locality sampling
                if opt_switch % 2 == 0:
                    for b in range(mean_pseupoch):
                        optimizer.zero_grad()
                        batch = locality_sampler2(mean_psu, mean_ssu, mean_Q, mean_w)
                        m, v = model(X[batch], switch)
                        loss = -t_likelihood(y[batch], m, v, mean_w[batch])
                        loss = loss.sum()
                        loss.backward()
                        optimizer.step()
                else:
                    for b in range(var_pseupoch):
                        optimizer2.zero_grad()
                        batch = locality_sampler2(var_psu, var_ssu, var_Q, var_w)
                        m, v = model(X[batch], switch)
                        loss = -t_likelihood(y[batch], m, v, var_w[batch])
                        loss = loss.sum()
                        loss.backward()
                        optimizer2.step()
            if it % 500 == 0:
                model.eval()
                m, v = model(X, switch)
                loss = -(-v.log() - (m.flatten() - y)**2 / (2 * v)).mean()
                print('Iter {0}/{1}, Loss {2}'.format(it, n_iter, loss.item()))
            it += 1

        model.eval()
        with torch.no_grad():
            mean, var = model(x, switch)
        ens_mean.append(mean)
        ens_var.append(var.mean(dim=0))

    # combine the ensemble via the law of total variance
    ens_mean = torch.stack(ens_mean)
    ens_var = torch.stack(ens_var)
    mean = ens_mean.mean(dim=0)
    var = (ens_var + ens_mean**2).mean(dim=0) - mean**2
    return mean.numpy(), var.sqrt().numpy()
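# NOTE: `t_likelihood` is used throughout this file but defined elsewhere in the
# repository. The sketch below is only an assumption of its behaviour, inferred
# from the call sites above (and from the commented-out expression they replaced):
# a Gaussian log-likelihood of the targets under Monte Carlo samples of the
# variance, optionally reweighted by locality-sampler weights, averaged over the
# sample dimension. `_t_likelihood_sketch` is a hypothetical stand-in, not the
# repository's implementation.
def _t_likelihood_sketch(y, mean, var, w=None):
    """Hypothetical stand-in for t_likelihood(y, mean, var, w).

    y:    (N, D) targets
    mean: (N, D) predicted means
    var:  (S, N, D) variance samples (use (1, N, D) when no sampling is done)
    w:    (N,) optional locality-sampler inclusion weights
    """
    import torch
    from torch import distributions as D
    log_px = D.Normal(mean, var.sqrt()).log_prob(y)   # broadcasts to (S, N, D)
    if w is not None:
        log_px = log_px / w.reshape(1, -1, 1)         # reweight by inclusion probability
    return log_px.mean(dim=0).sum()                   # average over samples, sum over data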
def john(args, X, y, Xval, yval):
    from sklearn.cluster import KMeans
    from utils import dist
    from itertools import chain
    from torch import distributions as D
    from locality_sampler import gen_Qw, locality_sampler2
    from sklearn.decomposition import PCA

    if args.dataset == 'protein' or args.dataset == 'year_prediction':
        n_neurons = 100
    else:
        n_neurons = 50
    args.n_clusters = min(args.n_clusters, X.shape[0])

    y, y_mean, y_std = normalize_y(y)

    # locality-sampler configuration
    mean_psu = 1
    mean_ssu = 40
    mean_M = 50
    var_psu = 2
    var_ssu = 10
    var_M = 10
    num_draws_train = 20

    kmeans = KMeans(n_clusters=args.n_clusters)
    if args.dataset != 'year_prediction':
        kmeans.fit(np.concatenate([X], axis=0))
    else:
        kmeans.fit(X[np.random.randint(0, X.shape[0], size=(10000))])
    c = torch.tensor(kmeans.cluster_centers_, dtype=torch.float32)
    if torch.cuda.is_available() and args.cuda:
        c = c.to(torch.float32).to('cuda')
    else:
        c = c.to(torch.float32)

    class translatedSigmoid(torch.nn.Module):
        def __init__(self):
            super(translatedSigmoid, self).__init__()
            self.beta = torch.nn.Parameter(torch.tensor([1.5]))

        def forward(self, x):
            beta = torch.nn.functional.softplus(self.beta)
            alpha = -beta * (6.9077542789816375)
            return torch.sigmoid((x + alpha) / beta)

    class GPNNModel(torch.nn.Module):
        def __init__(self):
            super(GPNNModel, self).__init__()
            self.mean = torch.nn.Sequential(torch.nn.Linear(X.shape[1], n_neurons),
                                            torch.nn.ReLU(),
                                            torch.nn.Linear(n_neurons, y.shape[1]))
            self.alph = torch.nn.Sequential(torch.nn.Linear(X.shape[1], n_neurons),
                                            torch.nn.ReLU(),
                                            torch.nn.Linear(n_neurons, y.shape[1]),
                                            torch.nn.Softplus())
            self.bet = torch.nn.Sequential(torch.nn.Linear(X.shape[1], n_neurons),
                                           torch.nn.ReLU(),
                                           torch.nn.Linear(n_neurons, y.shape[1]),
                                           torch.nn.Softplus())
            self.trans = translatedSigmoid()

        def forward(self, x, switch):
            d = dist(x, c)
            d_min = d.min(dim=1, keepdim=True)[0]
            s = self.trans(d_min)
            mean = self.mean(x)
            if switch:
                a = self.alph(x)
                b = self.bet(x)
                gamma_dist = D.Gamma(a + 1e-8, 1.0 / (b + 1e-8))
                if self.training:
                    samples_var = gamma_dist.rsample(torch.Size([num_draws_train]))
                    x_var = (1.0 / (samples_var + 1e-8))
                else:
                    samples_var = gamma_dist.rsample(torch.Size([2000]))
                    x_var = (1.0 / (samples_var + 1e-8))
                var = (1 - s) * x_var + s * y_std ** 2
            else:
                var = 0.05 * torch.ones_like(mean)
            return mean, var

    model = GPNNModel()
    if torch.cuda.is_available() and args.cuda:
        model.cuda()
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')

    optimizer = torch.optim.Adam(model.mean.parameters(), lr=1e-2)
    optimizer2 = torch.optim.Adam(chain(model.alph.parameters(),
                                        model.bet.parameters(),
                                        model.trans.parameters()), lr=1e-4)

    mean_Q, mean_w = gen_Qw(X, mean_psu, mean_ssu, mean_M)
    if X.shape[0] > 100000 and X.shape[1] > 10:
        # reduce dimensionality before building the variance sampler on large data
        pca = PCA(n_components=0.5)
        temp = pca.fit_transform(X)
        var_Q, var_w = gen_Qw(temp, var_psu, var_ssu, var_M)
    else:
        var_Q, var_w = gen_Qw(X, var_psu, var_ssu, var_M)
    #mean_pseupoch = get_pseupoch(mean_w, 0.5)
    #var_pseupoch = get_pseupoch(var_w, 0.5)
    opt_switch = 1
    mean_w = torch.tensor(mean_w).to(torch.float32).to(device)
    var_w = torch.tensor(var_w).to(torch.float32).to(device)
    model.train()

    X = torch.tensor(X).to(torch.float32).to(device)
    y = torch.tensor(y).to(torch.float32).to(device)
    batches = batchify(X, y, batch_size=args.batch_size, shuffel=args.shuffel)

    # validation data and performance measures
    ll_list = []
    mae_list = []
    rmse_list = []
    x_eval = torch.tensor(Xval).to(torch.float32).to(device)
    y_eval = torch.tensor(yval).to(torch.float32).to(device)
    y_mean = torch.tensor(y_mean).to(torch.float32).to(device)
    y_std = torch.tensor(y_std).to(torch.float32).to(device)

    it = 0
    its_per_epoch = int(np.ceil(X.shape[0] / args.batch_size))
    epochs = round(args.iters / its_per_epoch)
    while it < args.iters:
        switch = 1.0 if it > args.iters / 2.0 else 0.0
        if it % 11:
            opt_switch = opt_switch + 1  # change between var and mean optimizer
        with torch.autograd.detect_anomaly():
            data, label = next(batches)
            if not switch:
                # phase 1: fit the mean with a fixed observation variance
                optimizer.zero_grad()
                m, v = model(data, switch)
                loss = -t_likelihood(label, m, v.unsqueeze(0))
                loss.backward()
                optimizer.step()
            else:
                # phase 2: alternate between mean and variance updates with locality sampling
                if opt_switch % 2 == 0:
                    #for b in range(mean_pseupoch):
                    optimizer.zero_grad()
                    batch = locality_sampler2(mean_psu, mean_ssu, mean_Q, mean_w)
                    m, v = model(X[batch], switch)
                    loss = -t_likelihood(y[batch], m, v, mean_w[batch])
                    loss.backward()
                    optimizer.step()
                else:
                    #for b in range(var_pseupoch):
                    optimizer2.zero_grad()
                    batch = locality_sampler2(var_psu, var_ssu, var_Q, var_w)
                    m, v = model(X[batch], switch)
                    loss = -t_likelihood(y[batch], m, v, var_w[batch])
                    loss.backward()
                    optimizer2.step()

        # test on validation set once per epoch
        if it % its_per_epoch == 0:
            model.eval()
            with torch.no_grad():
                m, v = model(x_eval, switch)
                m = m * y_std + y_mean
                v = v * y_std ** 2
                if switch == 0:
                    ll = t_likelihood(y_eval, m, v.unsqueeze(0)).item()
                else:
                    ll = t_likelihood(y_eval, m, v).item()
                # if it % (500 * its_per_epoch) == 0:
                #     print('Epoch {:d}/{:d},'.format(it // its_per_epoch, epochs), 'Loss {:.4f},'.format(ll))

                # log validation performance once we are stable in the second optimization regime
                if it > args.iters * 0.60:
                    ll_list.append(ll)
                    error = torch.norm(y_eval - m, p=2, dim=1)
                    mae_list.append(error.abs().mean().item())
                    rmse_list.append((error ** 2).mean().sqrt().item())
            model.train()

            # early stop check (guarded so argmax is never taken of an empty list)
            if len(ll_list) > 0 and len(ll_list) - np.argmax(ll_list) > 50:
                it = args.iters
                print('Early Stop!')

        it += 1

    # get best LL
    i_best = np.argmax(ll_list)

    # evaluate model moments
    model.eval()
    with torch.no_grad():
        m, v = model(x_eval, 1.0)
        m = m * y_std + y_mean
        v = v * y_std ** 2
    return ll_list[i_best], rmse_list[i_best], m.cpu().numpy(), v.cpu().numpy()
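# NOTE: `normalize_y` is defined elsewhere in the repository. A minimal sketch of
# the assumed behaviour, inferred from how its outputs are used above
# (standardize the targets, then un-normalize predictions with `m * y_std + y_mean`).
# `_normalize_y_sketch` is a hypothetical stand-in, not the repository's function.
def _normalize_y_sketch(y):
    """Hypothetical stand-in for normalize_y: z-score the targets."""
    import numpy as np
    y = np.asarray(y, dtype=np.float64).reshape(y.shape[0], -1)  # 2-D targets, matching y.shape[1] usage
    y_mean = y.mean(axis=0)
    y_std = y.std(axis=0) + 1e-10                                # avoid division by zero
    return (y - y_mean) / y_std, y_mean, y_std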
def fit(self, Xtrain, x_test, Xplot, n_iters=100, lr=1e-3, batch_size=250,
        n_clusters=50, beta=1.0, its_per_epoch=2500):
    self.train()
    if self.device == torch.device('cuda'):
        self.cuda()
    optimizer1 = torch.optim.Adam(
        chain(
            #self.enc_mu.parameters(),
            #self.enc_var.parameters(),
            self.enc.parameters(),
            self.dec_mu.parameters()),
        lr=lr)
    optimizer2 = torch.optim.Adam(
        chain(
            #self.enc_mu.parameters(),
            #self.enc_var.parameters(),
            self.enc.parameters(),
            self.dec_mu.parameters()),
        lr=lr)
    optimizer3 = torch.optim.Adam(chain(self.alpha.parameters(),
                                        self.beta.parameters()), lr=lr)
    it = 0
    batches = batchify(Xtrain, batch_size=batch_size, shuffel=True)
    local_batches = local_batchify(Xtrain)
    progressBar = tqdm(desc='training', total=n_iters, unit='iter')
    loss, var = [[], [], []], []
    x_plot = torch.tensor(Xplot).to(torch.float32).to(self.device)
    ll_best = -np.inf
    epoch_best = np.inf

    while it < n_iters:
        self.switch = 1.0 if it > n_iters / 2 else 0.0
        anneling = np.minimum(1, it / (n_iters / 2)) * beta
        #self.opt_switch = (self.opt_switch+1) if (it % 11 == 0 and self.switch) else self.opt_switch

        # build cluster centres in latent space once the second phase starts
        # if self.switch and (it % 1000 == 0 or not hasattr(self, "C")):
        if self.switch and not hasattr(self, "C"):
            kmeans = KMeans(n_clusters=n_clusters)
            kmeans.fit(self.encoder(torch.tensor(Xtrain).to(self.device))[0].detach().cpu().numpy())
            self.C = torch.tensor(kmeans.cluster_centers_, dtype=torch.float32).to(self.device)

        if not self.switch:
            # phase 1: ordinary ELBO updates on the mean parameters
            x = next(batches)
            x = torch.tensor(x).to(torch.float32).to(self.device)
            optimizer1.zero_grad()
            elbo, log_px, kl, x_mu, x_var, z, z_mu, z_var = self.forward(x, anneling)
            (-elbo).backward()
            optimizer1.step()
        else:
            # phase 2: locality-sampled batches, alternating mean and variance updates
            x, mean_w, var_w = next(local_batches)
            x = torch.tensor(x).to(torch.float32).to(self.device)
            mean_w = torch.tensor(mean_w).to(torch.float32).to(self.device)
            var_w = torch.tensor(var_w).to(torch.float32).to(self.device)
            elbo, logpx, kl, x_mu, x_var, z, z_mu, z_var = self.forward(x, anneling)
            if self.opt_switch % 2 == 0:
                optimizer2.zero_grad()
                elbo = t_likelihood(x, x_mu, x_var, mean_w) / Xtrain.shape[0] - kl.mean()
                (-elbo).backward()
                optimizer2.step()
            else:
                optimizer3.zero_grad()
                elbo = t_likelihood(x, x_mu, x_var, var_w) / Xtrain.shape[0] - kl.mean()
                (-elbo).backward()
                optimizer3.step()
            elbo, log_px, kl, x_mu, x_var, z, z_mu, z_var = self.forward(x, anneling)

        progressBar.update()
        progressBar.set_postfix({
            'elbo': (-elbo).item(),
            'x_var': x_var.mean().item(),
            'anneling': anneling
        })
        # loss[0].append((-elbo).item())
        # loss[1].append(log_px.mean().item())
        # loss[2].append(kl.mean().item())
        # var.append(x_var.mean().item())

        # test-set evaluation once per epoch
        if it % its_per_epoch == 0:
            self.eval()
            with torch.no_grad():
                ll = []
                mean_x = []
                var_x = []
                for i in range(int(np.ceil(x_test.shape[0] / batch_size))):
                    i_start = i * batch_size
                    i_end = min((i + 1) * batch_size, x_test.shape[0])
                    x = torch.tensor(x_test[i_start:i_end]).to(torch.float32).to(self.device)
                    _, l, _, m, v, _, _, _ = self.forward(x, anneling)
                    ll.append(l.cpu().numpy())
                    mean_x.append(m.cpu().numpy())
                    var_x.append(v.cpu().numpy())
                ll = np.mean(np.concatenate(ll))
                mean_x = np.concatenate(mean_x, axis=0)
                var_x = np.concatenate(var_x, axis=0)
                print('\nEpoch {:d}/{:d}: LL = {:.4f}'.format(
                    it // its_per_epoch, n_iters // its_per_epoch, ll))

                if ll > ll_best and it > n_iters * 0.6:
                    # new best likelihood: record metrics and plotting moments
                    ll_best = ll
                    px = D.Independent(
                        D.Normal(
                            torch.tensor(mean_x).to(torch.float32).to(self.device),
                            torch.tensor(var_x**0.5).to(torch.float32).to(self.device)),
                        1)
                    rmse_best = np.sqrt(np.mean((x_test - px.sample().cpu().numpy())**2))
                    h_best = px.entropy().mean().item()
                    epoch_best = it // its_per_epoch
                    _, _, _, mean_x, var_x, _, _, _ = self.forward(x_plot, anneling)
                    px = D.Independent(D.Normal(mean_x, var_x**0.5), 1)
                    mean_best = mean_x.cpu().numpy()
                    var_best = var_x.cpu().numpy()
                    sample_best = px.sample().cpu().numpy()
                elif self.switch and it // its_per_epoch > epoch_best + 50:
                    print('Early Stop!')
                    break
            self.train()

        it += 1

    progressBar.close()
    return ll_best, rmse_best, h_best, mean_best, var_best, sample_best
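# NOTE: `batchify` and `local_batchify` are repository helpers that are not shown
# in this file. The sketch below only illustrates the assumed behaviour of the
# unsupervised `batchify` used by the `fit` methods above: an endless generator of
# random minibatches (the `local_batchify` variant is assumed to additionally
# yield locality-sampler mean/variance weights). `_batchify_sketch` is a
# hypothetical stand-in, not the repository's implementation.
def _batchify_sketch(X, batch_size=250, shuffel=True):
    """Hypothetical stand-in for batchify: endless random minibatches of X."""
    import numpy as np
    N = X.shape[0]
    while True:
        idx = np.random.permutation(N) if shuffel else np.arange(N)
        for i in range(0, N, batch_size):
            yield X[idx[i:i + batch_size]]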
def john(args, X, y, Xval, yval):
    from sklearn.cluster import KMeans
    from utils import dist
    from itertools import chain
    from torch import distributions as D
    from locality_sampler import gen_Qw, locality_sampler2
    from sklearn.decomposition import PCA

    if args.dataset == 'protein' or args.dataset == 'year_prediction':
        n_neurons = 100
    else:
        n_neurons = 50
    args.n_clusters = min(args.n_clusters, X.shape[0])

    y, y_mean, y_std = normalize_y(y)

    # locality-sampler configuration
    mean_psu = 1
    mean_ssu = 40
    mean_M = 50
    var_psu = 2
    var_ssu = 10
    var_M = 10
    num_draws_train = 20

    kmeans = KMeans(n_clusters=args.n_clusters)
    if args.dataset != 'year_prediction':
        kmeans.fit(np.concatenate([X], axis=0))
    else:
        kmeans.fit(X[np.random.randint(0, X.shape[0], size=(10000))])
    c = torch.tensor(kmeans.cluster_centers_, dtype=torch.float32)
    if torch.cuda.is_available() and args.cuda:
        c = c.to(torch.float32).to('cuda')
    else:
        c = c.to(torch.float32)

    class translatedSigmoid(torch.nn.Module):
        def __init__(self):
            super(translatedSigmoid, self).__init__()
            self.beta = torch.nn.Parameter(torch.tensor([1.5]))

        def forward(self, x):
            beta = torch.nn.functional.softplus(self.beta)
            alpha = -beta * (6.9077542789816375)
            return torch.sigmoid((x + alpha) / beta)

    class GPNNModel(torch.nn.Module):
        def __init__(self):
            super(GPNNModel, self).__init__()
            self.mean = torch.nn.Sequential(
                torch.nn.Linear(X.shape[1], n_neurons), torch.nn.ReLU(),
                torch.nn.Linear(n_neurons, 1))
            self.alph = torch.nn.Sequential(
                torch.nn.Linear(X.shape[1], n_neurons), torch.nn.ReLU(),
                torch.nn.Linear(n_neurons, 1), torch.nn.Softplus())
            self.bet = torch.nn.Sequential(
                torch.nn.Linear(X.shape[1], n_neurons), torch.nn.ReLU(),
                torch.nn.Linear(n_neurons, 1), torch.nn.Softplus())
            self.trans = translatedSigmoid()

        def forward(self, x, switch):
            d = dist(x, c)
            d_min = d.min(dim=1, keepdim=True)[0]
            s = self.trans(d_min)
            mean = self.mean(x)
            if switch:
                a = self.alph(x)
                b = self.bet(x)
                gamma_dist = D.Gamma(a + 1e-8, 1.0 / (b + 1e-8))
                if self.training:
                    samples_var = gamma_dist.rsample(torch.Size([num_draws_train]))
                    x_var = (1.0 / (samples_var + 1e-8))
                else:
                    samples_var = gamma_dist.rsample(torch.Size([1000]))
                    x_var = (1.0 / (samples_var + 1e-8))
                var = (1 - s) * x_var + s * torch.tensor([y_std**2], device=x.device)  # HYPERPARAMETER
            else:
                var = 0.05 * torch.ones_like(mean)
            return mean, var

    model = GPNNModel()
    if torch.cuda.is_available() and args.cuda:
        model.cuda()
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')

    optimizer = torch.optim.Adam(model.mean.parameters(), lr=1e-2)
    optimizer2 = torch.optim.Adam(chain(model.alph.parameters(),
                                        model.bet.parameters(),
                                        model.trans.parameters()), lr=1e-4)

    mean_Q, mean_w = gen_Qw(X, mean_psu, mean_ssu, mean_M)
    if X.shape[0] > 100000 and X.shape[1] > 10:
        # reduce dimensionality before building the variance sampler on large data
        pca = PCA(n_components=0.5)
        temp = pca.fit_transform(X)
        var_Q, var_w = gen_Qw(temp, var_psu, var_ssu, var_M)
    else:
        var_Q, var_w = gen_Qw(X, var_psu, var_ssu, var_M)
    #mean_pseupoch = get_pseupoch(mean_w, 0.5)
    #var_pseupoch = get_pseupoch(var_w, 0.5)
    opt_switch = 1
    mean_w = torch.tensor(mean_w).to(torch.float32).to(device)
    var_w = torch.tensor(var_w).to(torch.float32).to(device)
    model.train()

    X = torch.tensor(X).to(torch.float32).to(device)
    y = torch.tensor(y).to(torch.float32).to(device)
    batches = batchify(X, y, batch_size=args.batch_size, shuffel=args.shuffel)

    it = 0
    while it < args.iters:
        switch = 1.0 if it > args.iters / 2.0 else 0.0
        if it % 11:
            opt_switch = opt_switch + 1  # change between var and mean optimizer
        with torch.autograd.detect_anomaly():
            data, label = next(batches)
            if not switch:
                # phase 1: fit the mean with a fixed observation variance
                optimizer.zero_grad()
                m, v = model(data, switch)
                loss = -t_likelihood(label.reshape(-1, 1), m, v.reshape(1, -1, 1)) / X.shape[0]
                loss.backward()
                optimizer.step()
            else:
                # phase 2: alternate between mean and variance updates with locality sampling
                if opt_switch % 2 == 0:
                    #for b in range(mean_pseupoch):
                    optimizer.zero_grad()
                    batch = locality_sampler2(mean_psu, mean_ssu, mean_Q, mean_w)
                    m, v = model(X[batch], switch)
                    loss = -t_likelihood(y[batch].reshape(-1, 1), m, v, mean_w[batch]) / X.shape[0]
                    loss.backward()
                    optimizer.step()
                else:
                    #for b in range(var_pseupoch):
                    optimizer2.zero_grad()
                    batch = locality_sampler2(var_psu, var_ssu, var_Q, var_w)
                    m, v = model(X[batch], switch)
                    loss = -t_likelihood(y[batch].reshape(-1, 1), m, v, var_w[batch]) / X.shape[0]
                    loss.backward()
                    optimizer2.step()

        if it % 500 == 0:
            m, v = model(data, switch)
            loss = -(-v.log() / 2 - ((m.flatten() - label)**2).reshape(1, -1, 1) / (2 * v)).mean()
            print('Iter {0}/{1}, Loss {2}'.format(it, args.iters, loss.item()))
        it += 1

    # final evaluation on the validation set
    model.eval()
    data = torch.tensor(Xval).to(torch.float32).to(device)
    label = torch.tensor(yval).to(torch.float32).to(device)
    with torch.no_grad():
        m, v = model(data, switch)
        m = m * y_std + y_mean
        v = v * y_std**2
        log_px = t_likelihood(label.reshape(-1, 1), m, v) / Xval.shape[0]
        rmse = ((label - m.flatten())**2).mean().sqrt()
    return log_px.mean().item(), rmse.item()
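# NOTE: `gen_Qw` and `locality_sampler2` come from `locality_sampler.py`, which is
# not part of this file. The sketch below only illustrates the assumed idea behind
# the calls above: precompute each point's M nearest neighbours (Q) and per-point
# inclusion weights (w), then draw a minibatch by picking a few "primary" points
# and sampling "secondary" points from their neighbourhoods. Both functions here
# are hypothetical stand-ins, not the repository's implementations.
def _gen_Qw_sketch(X, psu, ssu, M):
    """Hypothetical stand-in for gen_Qw: neighbourhoods and inclusion weights."""
    import numpy as np
    from sklearn.neighbors import NearestNeighbors
    nn = NearestNeighbors(n_neighbors=M).fit(X)
    Q = nn.kneighbors(X, return_distance=False)                      # (N, M) neighbour indices
    # crude inclusion weight: expected number of times each point is drawn
    counts = np.bincount(Q.ravel(), minlength=X.shape[0]).astype(np.float64)
    w = counts * (psu / X.shape[0]) * (ssu / M)
    return Q, w

def _locality_sampler2_sketch(psu, ssu, Q, w):
    """Hypothetical stand-in for locality_sampler2: draw one locality minibatch."""
    import numpy as np
    primaries = np.random.choice(Q.shape[0], size=psu, replace=False)
    batch = [np.random.choice(Q[p], size=ssu, replace=False) for p in primaries]
    return np.unique(np.concatenate(batch))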
def jnlsmv(args, X, y, Xval, yval):
    from sklearn.cluster import KMeans
    from utils import dist
    from torch import distributions as D
    from itertools import chain

    if args.dataset == 'protein' or args.dataset == 'year_prediction':
        n_neurons = 100
    else:
        n_neurons = 50
    args.n_clusters = min(args.n_clusters, X.shape[0])

    y, y_mean, y_std = normalize_y(y)

    kmeans = KMeans(n_clusters=args.n_clusters)
    kmeans.fit(np.concatenate([X], axis=0))
    c = torch.tensor(kmeans.cluster_centers_, dtype=torch.float32)
    if torch.cuda.is_available() and args.cuda:
        c = c.to(torch.float32).to('cuda')
    else:
        c = c.to(torch.float32)

    class translatedSigmoid(torch.nn.Module):
        def __init__(self):
            super(translatedSigmoid, self).__init__()
            self.beta = torch.nn.Parameter(torch.tensor([1.5]))

        def forward(self, x):
            beta = torch.nn.functional.softplus(self.beta)
            alpha = -beta * (6.9077542789816375)
            return torch.sigmoid((x + alpha) / beta)

    class GPNNModel(torch.nn.Module):
        def __init__(self):
            super(GPNNModel, self).__init__()
            self.mean = torch.nn.Sequential(
                torch.nn.Linear(X.shape[1], n_neurons), torch.nn.Sigmoid(),
                torch.nn.Linear(n_neurons, 1))
            self.alph = torch.nn.Sequential(
                torch.nn.Linear(X.shape[1], n_neurons), torch.nn.ReLU(),
                torch.nn.Linear(n_neurons, 1), torch.nn.Softplus())
            self.bet = torch.nn.Sequential(
                torch.nn.Linear(X.shape[1], n_neurons), torch.nn.ReLU(),
                torch.nn.Linear(n_neurons, 1), torch.nn.Softplus())
            self.trans = translatedSigmoid()

        def forward(self, x, switch):
            d = dist(x, c)
            d_min = d.min(dim=1, keepdim=True)[0]
            s = self.trans(d_min)
            mean = self.mean(x)
            if switch:
                a = self.alph(x)
                b = self.bet(x)
                gamma_dist = D.Gamma(a + 1e-8, 1.0 / (b + 1e-8))
                if self.training:
                    samples_var = gamma_dist.rsample(torch.Size([20]))
                    x_var = (1.0 / (samples_var + 1e-8))
                else:
                    samples_var = gamma_dist.rsample(torch.Size([1000]))
                    x_var = (1.0 / (samples_var + 1e-8)).mean(dim=0)
                var = (1 - s) * x_var + s * torch.tensor([3.5**2], device=x.device)  # HYPERPARAMETER
            else:
                var = torch.tensor([0.05], device=x.device)
            return mean, var

    model = GPNNModel()
    if torch.cuda.is_available() and args.cuda:
        model.cuda()
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')

    optimizer = torch.optim.Adam(model.mean.parameters(), lr=1e-2)
    optimizer2 = torch.optim.Adam(chain(model.alph.parameters(),
                                        model.bet.parameters(),
                                        model.trans.parameters()), lr=1e-4)

    it = 0
    opt_switch = 0
    progressBar = tqdm(desc='Training nn', total=args.iters, unit='iter')
    batches = local_batchify(X, y, batch_size=args.batch_size, shuffel=args.shuffel)

    while it < args.iters:
        switch = 1.0 if it > args.iters / 2 else 0.0
        if it % 11 == 0 and switch:
            opt_switch = opt_switch + 1  # change between var and mean optimizer
        data, label, mean_w, var_w = next(batches)
        data = torch.tensor(data).to(torch.float32).to(device)
        label = torch.tensor(label).to(torch.float32).to(device)
        mean_w = torch.tensor(mean_w).to(torch.float32).to(device)
        var_w = torch.tensor(var_w).to(torch.float32).to(device)
        if opt_switch % 2 == 0:
            #for b in range(mean_pseupoch):
            optimizer.zero_grad()
            #batch = locality_sampler2(mean_psu, mean_ssu, mean_Q, mean_w)
            m, v = model(data, switch)
            loss = -t_likelihood(label.reshape(-1, 1), m, v, mean_w) / X.shape[0]
            loss.backward()
            optimizer.step()
        else:
            #for b in range(var_pseupoch):
            optimizer2.zero_grad()
            #batch = locality_sampler2(var_psu, var_ssu, var_Q, var_w)
            m, v = model(data, switch)
            loss = -t_likelihood(label.reshape(-1, 1), m, v, var_w) / X.shape[0]
            loss.backward()
            optimizer2.step()
        it += 1
        progressBar.update()
        progressBar.set_postfix({'loss': loss.item()})

    progressBar.close()

    # final evaluation on the validation set
    model.eval()
    data = torch.tensor(Xval).to(torch.float32).to(device)
    label = torch.tensor(yval).to(torch.float32).to(device)
    with torch.no_grad():
        m, v = model(data, switch)
        m = m * y_std + y_mean
        v = v * y_std**2
        log_px = t_likelihood(label.reshape(-1, 1), m, v)
        rmse = ((label - m.flatten())**2).mean().sqrt()
    return log_px.mean().item(), rmse.item()
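# NOTE: `dist` is imported from `utils` above but not defined in this file. A
# minimal sketch of the assumed behaviour (pairwise squared Euclidean distances
# between the inputs and the k-means centres, so that `d.min(dim=1)` gives each
# point's distance to its nearest centre). `_dist_sketch` is a hypothetical
# stand-in, not the repository's implementation.
def _dist_sketch(x, c):
    """Hypothetical stand-in for utils.dist: (N, K) squared distances."""
    xx = (x ** 2).sum(dim=1, keepdim=True)              # (N, 1)
    cc = (c ** 2).sum(dim=1, keepdim=True).t()          # (1, K)
    return (xx + cc - 2.0 * x @ c.t()).clamp_min(0.0)   # (N, K)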
def fit(self, Xtrain, n_iters=100, lr=1e-3, batch_size=250, n_clusters=50, beta=1.0):
    self.train()
    if self.device == torch.device('cuda'):
        self.cuda()
    optimizer1 = torch.optim.Adam(chain(self.enc_mu.parameters(),
                                        self.enc_var.parameters(),
                                        self.dec_mu.parameters()), lr=lr)
    optimizer2 = torch.optim.Adam(chain(self.enc_mu.parameters(),
                                        self.enc_var.parameters(),
                                        self.dec_mu.parameters()), lr=lr)
    optimizer3 = torch.optim.Adam(chain(self.alpha.parameters(),
                                        self.beta.parameters()), lr=lr)
    it = 0
    batches = batchify(Xtrain, batch_size=batch_size, shuffel=True)
    local_batches = local_batchify(Xtrain)
    progressBar = tqdm(desc='training', total=n_iters, unit='iter')
    loss, var = [[], [], []], []

    while it < n_iters:
        self.switch = 1.0 if it > n_iters / 2 else 0.0
        anneling = np.minimum(1, it / (n_iters / 2)) * beta
        #self.opt_switch = (self.opt_switch+1) if (it % 11 == 0 and self.switch) else self.opt_switch

        # refresh cluster centres in latent space during the second phase
        if self.switch and (it % 1000 == 0 or not hasattr(self, "C")):
            kmeans = KMeans(n_clusters=n_clusters)
            kmeans.fit(self.encoder(torch.tensor(Xtrain).to(self.device))[0].detach().cpu().numpy())
            self.C = torch.tensor(kmeans.cluster_centers_, dtype=torch.float32).to(self.device)

        if not self.switch:
            # phase 1: ordinary ELBO updates
            x = next(batches)
            x = torch.tensor(x).to(torch.float32).to(self.device)
            optimizer1.zero_grad()
            elbo, log_px, kl, x_mu, x_var, z, z_mu, z_var = self.forward(x, anneling)
            (-elbo).backward()
            optimizer1.step()
        else:
            # phase 2: locality-sampled batches, alternating mean and variance updates
            x, mean_w, var_w = next(local_batches)
            x = torch.tensor(x).to(torch.float32).to(self.device)
            mean_w = torch.tensor(mean_w).to(torch.float32).to(self.device)
            var_w = torch.tensor(var_w).to(torch.float32).to(self.device)
            elbo, logpx, kl, x_mu, x_var, z, z_mu, z_var = self.forward(x, anneling)
            if self.opt_switch % 2 == 0:
                optimizer2.zero_grad()
                elbo = t_likelihood(x, x_mu, x_var, mean_w) / Xtrain.shape[0] - kl.mean()
                (-elbo).backward()
                optimizer2.step()
            else:
                optimizer3.zero_grad()
                elbo = t_likelihood(x, x_mu, x_var, var_w) / Xtrain.shape[0] - kl.mean()
                (-elbo).backward()
                optimizer3.step()
            elbo, log_px, kl, x_mu, x_var, z, z_mu, z_var = self.forward(x, anneling)

        progressBar.update()
        progressBar.set_postfix({
            'elbo': (-elbo).item(),
            'x_var': x_var.mean().item(),
            'anneling': anneling
        })
        loss[0].append((-elbo).item())
        loss[1].append(log_px.mean().item())
        loss[2].append(kl.mean().item())
        var.append(x_var.mean().item())

        it += 1
        if it % 2500 == 0:
            self.save_something('it' + str(it), Xtrain[::20])

    progressBar.close()
    return loss, var
def fit(self, Xtrain, x_test, Xplot, n_iters=100, lr=1e-3, batch_size=250,
        n_clusters=50, beta=1.0, its_per_epoch=2500):
    self.train()
    if self.device == torch.device('cuda'):
        self.cuda()
    optimizer1 = torch.optim.Adam(
        chain(
            #self.enc_mu.parameters(),
            #self.enc_var.parameters(),
            self.enc.parameters(),
            self.dec_mu.parameters()),
        lr=lr)
    optimizer2 = torch.optim.Adam(
        chain(
            #self.enc_mu.parameters(),
            #self.enc_var.parameters(),
            self.enc.parameters(),
            self.dec_mu.parameters()),
        lr=lr)
    optimizer3 = torch.optim.Adam(chain(self.alpha.parameters(),
                                        self.beta.parameters()), lr=lr)
    it = 0
    batches = batchify(Xtrain, batch_size=batch_size, shuffel=True)
    local_batches = local_batchify(Xtrain)
    progressBar = tqdm(desc='training', total=n_iters, unit='iter')
    x_plot = torch.tensor(Xplot).to(torch.float32).to(self.device)
    ll_best = -np.inf
    epoch_best = np.inf

    while it < n_iters:
        self.switch = 1.0 if it > n_iters / 2 else 0.0
        anneling = np.minimum(1, it / (n_iters / 2)) * beta
        #self.opt_switch = (self.opt_switch+1) if (it % 11 == 0 and self.switch) else self.opt_switch

        # build cluster centres in latent space once the second phase starts
        # if self.switch and (it % 1000 == 0 or not hasattr(self, "C")):
        if self.switch and not hasattr(self, "C"):
            kmeans = KMeans(n_clusters=n_clusters)
            kmeans.fit(self.encoder(torch.tensor(Xtrain).to(self.device))[0].detach().cpu().numpy())
            self.C = torch.tensor(kmeans.cluster_centers_, dtype=torch.float32).to(self.device)

        if not self.switch:
            # phase 1: ordinary ELBO updates on the mean parameters
            x = next(batches)
            x = torch.tensor(x).to(torch.float32).to(self.device)
            optimizer1.zero_grad()
            elbo, log_px, kl, x_mu, x_var, z, z_mu, z_var = self.forward(x, anneling)
            (-elbo).backward()
            optimizer1.step()
        else:
            # phase 2: locality-sampled batches, alternating mean and variance updates
            x, mean_w, var_w = next(local_batches)
            x = torch.tensor(x).to(torch.float32).to(self.device)
            mean_w = torch.tensor(mean_w).to(torch.float32).to(self.device)
            var_w = torch.tensor(var_w).to(torch.float32).to(self.device)
            elbo, logpx, kl, x_mu, x_var, z, z_mu, z_var = self.forward(x, anneling)
            if self.opt_switch % 2 == 0:
                optimizer2.zero_grad()
                elbo = t_likelihood(x, x_mu, x_var, mean_w) - kl.mean()
                (-elbo).backward()
                optimizer2.step()
            else:
                optimizer3.zero_grad()
                elbo = t_likelihood(x, x_mu, x_var, var_w) - kl.mean()
                (-elbo).backward()
                optimizer3.step()
            elbo, log_px, kl, x_mu, x_var, z, z_mu, z_var = self.forward(x, anneling)

        progressBar.update()
        progressBar.set_postfix({
            'elbo': (-elbo).item(),
            'x_var': x_var.mean().item(),
            'anneling': anneling
        })

        # epoch complete and in second phase of training (i.e. fitting variance)
        if it % its_per_epoch == 0 and self.switch:
            self.eval()
            with torch.no_grad():
                # initialize containers
                ll = []
                elbo = []
                mean_residuals = []
                var_residuals = []
                sample_residuals = []
                # loop over batches
                for i in range(int(np.ceil(x_test.shape[0] / batch_size))):
                    # run Detlefsen network
                    i_start = i * batch_size
                    i_end = min((i + 1) * batch_size, x_test.shape[0])
                    x = torch.tensor(x_test[i_start:i_end]).to(torch.float32).to(self.device)
                    _, _, _, mu_x, sigma2_x, _, _, _ = self.forward(x, anneling)
                    # reuses the KL term from the most recent training batch
                    elbo_test = t_likelihood(x, mu_x, sigma2_x) - kl.mean()
                    mean = mu_x.cpu().numpy()
                    variance = sigma2_x.cpu().numpy()

                    # create p(x|x): a uniform mixture of Normals over the variance samples
                    components = []
                    for v in tf.unstack(variance):
                        normal = tfp.distributions.Normal(loc=mean, scale=v**0.5)
                        components.append(
                            tfp.distributions.Independent(normal, reinterpreted_batch_ndims=1))
                    cat = tfp.distributions.Categorical(
                        logits=tf.ones((variance.shape[1], variance.shape[0])))
                    px_x = tfp.distributions.Mixture(cat=cat, components=components)

                    # append results
                    x = x.cpu().numpy()
                    elbo.append(elbo_test.cpu().numpy())
                    ll.append(px_x.log_prob(x))
                    mean_residuals.append(px_x.mean() - x)
                    var_residuals.append(px_x.variance() - mean_residuals[-1]**2)
                    sample_residuals.append(px_x.sample() - x)

                # if mean likelihood is new best
                ll = tf.reduce_mean(tf.concat(ll, axis=0)).numpy()
                if ll > ll_best and it > n_iters * 0.6:
                    # record best ll and best epoch (used by the early-stop window below)
                    ll_best = ll
                    epoch_best = it // its_per_epoch
                    # compute metrics
                    metrics = {
                        'LL': ll_best,
                        'ELBO': tf.reduce_mean(tf.concat(elbo, axis=0)).numpy(),
                        'Best Epoch': it // its_per_epoch,
                        'Mean Bias': tf.reduce_mean(tf.concat(mean_residuals, axis=0)).numpy(),
                        'Mean RMSE': tf.sqrt(tf.reduce_mean(tf.concat(mean_residuals, axis=0)**2)).numpy(),
                        'Var Bias': tf.reduce_mean(tf.concat(var_residuals, axis=0)).numpy(),
                        'Var RMSE': tf.sqrt(tf.reduce_mean(tf.concat(var_residuals, axis=0)**2)).numpy(),
                        'Sample Bias': tf.reduce_mean(tf.concat(sample_residuals, axis=0)).numpy(),
                        'Sample RMSE': tf.sqrt(tf.reduce_mean(tf.concat(sample_residuals, axis=0)**2)).numpy()
                    }
                    # get p(x|x) for the held-out plotting data
                    _, _, _, mu_x, sigma2_x, _, _, _ = self.forward(x_plot, anneling)
                    mean = mu_x.cpu().numpy()
                    variance = sigma2_x.cpu().numpy()
                    components = []
                    for v in tf.unstack(variance):
                        normal = tfp.distributions.Normal(loc=mean, scale=v**0.5)
                        components.append(
                            tfp.distributions.Independent(normal, reinterpreted_batch_ndims=1))
                    cat = tfp.distributions.Categorical(
                        logits=tf.ones((variance.shape[1], variance.shape[0])))
                    px_x = tfp.distributions.Mixture(cat=cat, components=components)
                    # save first two moments and samples for the plotting data
                    reconstruction = {
                        'mean': px_x.mean().numpy(),
                        'std': px_x.stddev().numpy(),
                        'sample': px_x.sample().numpy()
                    }
                # early stop check
                elif self.switch and it // its_per_epoch > epoch_best + 50:
                    print('Early Stop!')
                    break
            self.train()

        it += 1

    progressBar.close()
    return metrics, reconstruction