# NOTE: the functions below assume module-level imports and helpers defined
# elsewhere in this file/repo: `numpy as np`, `torch`, `torch.nn as nn`,
# `torch.distributions as D`, `batchify`, `normalize_y`, `t_likelihood`,
# `gen_Qw`, `locality_sampler2` and `get_pseupoch`.


def local_batchify(*arrays, **kwargs):
    from locality_sampler import gen_Qw, locality_sampler2

    # locality-sampler hyperparameters for the mean and variance networks
    mean_psu = 1
    mean_ssu = 100
    mean_M = 200

    var_psu = 1
    var_ssu = 50
    var_M = 60

    mean_Q, mean_w = gen_Qw(arrays[0], mean_psu, mean_ssu, mean_M)
    var_Q, var_w = gen_Qw(arrays[0], var_psu, var_ssu, var_M)
    arrays = (*arrays, mean_w, var_w)

    # alternate between batches drawn with the mean sampler and the variance sampler
    count = 0
    while True:
        if count % 2 == 0:
            batch = locality_sampler2(mean_psu, mean_ssu, mean_Q, mean_w).astype(np.int32)
        else:
            batch = locality_sampler2(var_psu, var_ssu, var_Q, var_w).astype(np.int32)
        count += 1
        yield [a[batch] for a in arrays]
def local_batchify(*arrays, **kwargs):
    from locality_sampler import gen_Qw, locality_sampler, locality_sampler2

    mean_psu = 1
    mean_ssu = 100
    mean_M = 150

    var_psu = 3
    var_ssu = 7
    var_M = 15

    mean_Q, mean_w = gen_Qw(arrays[0], mean_psu, mean_ssu, mean_M)
    var_Q, var_w = gen_Qw(arrays[0], var_psu, var_ssu, var_M)
    arrays = (*arrays, mean_w, var_w)

    # this variant only draws batches with the mean sampler
    while True:
        batch = locality_sampler2(mean_psu, mean_ssu, mean_Q, mean_w).astype(np.int32)
        yield [a[batch] for a in arrays]
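# Usage sketch (illustrative only, not part of the original code): how a
# `local_batchify` generator above might be consumed in a training loop. It
# assumes `X` and `y` are NumPy arrays, and that the extra `mean_w`/`var_w`
# arrays appended by the generator serve as importance weights for the loss.
# The helper name `_example_local_batchify_usage` is hypothetical.
def _example_local_batchify_usage(X, y, n_steps=10):
    batches = local_batchify(X, y)
    for step in range(n_steps):
        x_batch, y_batch, mean_w_batch, var_w_batch = next(batches)
        # ... forward pass and loss, weighted by mean_w_batch or var_w_batch ...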
def john(args, X, y, Xval, yval):
    from sklearn.cluster import KMeans
    from utils import dist
    from itertools import chain
    from torch import distributions as D
    from locality_sampler import gen_Qw, locality_sampler2
    from sklearn.decomposition import PCA

    if args.dataset == 'protein' or args.dataset == 'year_prediction':
        n_neurons = 100
    else:
        n_neurons = 50
    args.n_clusters = min(args.n_clusters, X.shape[0])

    y, y_mean, y_std = normalize_y(y)

    # locality-sampler hyperparameters
    mean_psu = 1
    mean_ssu = 40
    mean_M = 50

    var_psu = 2
    var_ssu = 10
    var_M = 10

    num_draws_train = 20

    kmeans = KMeans(n_clusters=args.n_clusters)
    if args.dataset != 'year_prediction':
        kmeans.fit(np.concatenate([X], axis=0))
    else:
        # year_prediction is too large to cluster in full; fit on a random subsample
        kmeans.fit(X[np.random.randint(0, X.shape[0], size=(10000))])
    c = torch.tensor(kmeans.cluster_centers_, dtype=torch.float32)
    if torch.cuda.is_available() and args.cuda:
        c = c.to(torch.float32).to('cuda')
    else:
        c = c.to(torch.float32)

    class translatedSigmoid(torch.nn.Module):
        def __init__(self):
            super(translatedSigmoid, self).__init__()
            self.beta = torch.nn.Parameter(torch.tensor([1.5]))

        def forward(self, x):
            beta = torch.nn.functional.softplus(self.beta)
            alpha = -beta * (6.9077542789816375)
            return torch.sigmoid((x + alpha) / beta)

    class GPNNModel(torch.nn.Module):
        def __init__(self):
            super(GPNNModel, self).__init__()
            self.mean = torch.nn.Sequential(torch.nn.Linear(X.shape[1], n_neurons),
                                            torch.nn.ReLU(),
                                            torch.nn.Linear(n_neurons, y.shape[1]))
            self.alph = torch.nn.Sequential(torch.nn.Linear(X.shape[1], n_neurons),
                                            torch.nn.ReLU(),
                                            torch.nn.Linear(n_neurons, y.shape[1]),
                                            torch.nn.Softplus())
            self.bet = torch.nn.Sequential(torch.nn.Linear(X.shape[1], n_neurons),
                                           torch.nn.ReLU(),
                                           torch.nn.Linear(n_neurons, y.shape[1]),
                                           torch.nn.Softplus())
            self.trans = translatedSigmoid()

        def forward(self, x, switch):
            d = dist(x, c)
            d_min = d.min(dim=1, keepdim=True)[0]
            s = self.trans(d_min)
            mean = self.mean(x)
            if switch:
                a = self.alph(x)
                b = self.bet(x)
                gamma_dist = D.Gamma(a + 1e-8, 1.0 / (b + 1e-8))
                if self.training:
                    samples_var = gamma_dist.rsample(torch.Size([num_draws_train]))
                else:
                    samples_var = gamma_dist.rsample(torch.Size([2000]))
                x_var = 1.0 / (samples_var + 1e-8)
                # interpolate between learned and prior variance, based on the
                # distance to the nearest cluster center
                var = (1 - s) * x_var + s * y_std ** 2
            else:
                var = 0.05 * torch.ones_like(mean)
            return mean, var

    model = GPNNModel()
    if torch.cuda.is_available() and args.cuda:
        model.cuda()
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')

    optimizer = torch.optim.Adam(model.mean.parameters(), lr=1e-2)
    optimizer2 = torch.optim.Adam(chain(model.alph.parameters(),
                                        model.bet.parameters(),
                                        model.trans.parameters()), lr=1e-4)

    mean_Q, mean_w = gen_Qw(X, mean_psu, mean_ssu, mean_M)
    if X.shape[0] > 100000 and X.shape[1] > 10:
        # for very large, high-dimensional data, build the variance sampler in a PCA subspace
        pca = PCA(n_components=0.5)
        temp = pca.fit_transform(X)
        var_Q, var_w = gen_Qw(temp, var_psu, var_ssu, var_M)
    else:
        var_Q, var_w = gen_Qw(X, var_psu, var_ssu, var_M)

    #mean_pseupoch = get_pseupoch(mean_w, 0.5)
    #var_pseupoch = get_pseupoch(var_w, 0.5)
    opt_switch = 1
    mean_w = torch.tensor(mean_w).to(torch.float32).to(device)
    var_w = torch.tensor(var_w).to(torch.float32).to(device)
    model.train()

    X = torch.tensor(X).to(torch.float32).to(device)
    y = torch.tensor(y).to(torch.float32).to(device)
    batches = batchify(X, y, batch_size=args.batch_size, shuffel=args.shuffel)

    # validation data and performance measures
    ll_list = []
    mae_list = []
    rmse_list = []
    x_eval = torch.tensor(Xval).to(torch.float32).to(device)
    y_eval = torch.tensor(yval).to(torch.float32).to(device)
    y_mean = torch.tensor(y_mean).to(torch.float32).to(device)
    y_std = torch.tensor(y_std).to(torch.float32).to(device)

    it = 0
    its_per_epoch = int(np.ceil(X.shape[0] / args.batch_size))
    epochs = round(args.iters / its_per_epoch)
    while it < args.iters:
        # train only the mean network for the first half of the iterations
        switch = 1.0 if it > args.iters / 2.0 else 0.0

        if it % 11:
            opt_switch = opt_switch + 1  # change between var and mean optimizer

        with torch.autograd.detect_anomaly():
            data, label = next(batches)
            if not switch:
                optimizer.zero_grad()
                m, v = model(data, switch)
                loss = -t_likelihood(label, m, v.unsqueeze(0))
                loss.backward()
                optimizer.step()
            else:
                if opt_switch % 2 == 0:
                    #for b in range(mean_pseupoch):
                    optimizer.zero_grad()
                    batch = locality_sampler2(mean_psu, mean_ssu, mean_Q, mean_w)
                    m, v = model(X[batch], switch)
                    loss = -t_likelihood(y[batch], m, v, mean_w[batch])
                    loss.backward()
                    optimizer.step()
                else:
                    #for b in range(var_pseupoch):
                    optimizer2.zero_grad()
                    batch = locality_sampler2(var_psu, var_ssu, var_Q, var_w)
                    m, v = model(X[batch], switch)
                    loss = -t_likelihood(y[batch], m, v, var_w[batch])
                    loss.backward()
                    optimizer2.step()

        # test on validation set once per epoch
        if it % its_per_epoch == 0:
            model.eval()
            with torch.no_grad():
                m, v = model(x_eval, switch)
                m = m * y_std + y_mean
                v = v * y_std ** 2
                if switch == 0:
                    ll = t_likelihood(y_eval, m, v.unsqueeze(0)).item()
                else:
                    ll = t_likelihood(y_eval, m, v).item()
            # if it % (500 * its_per_epoch) == 0:
            #     print('Epoch {:d}/{:d},'.format(it // its_per_epoch, epochs), 'Loss {:.4f},'.format(ll))

            # log validation performance after we are stable in the second optimization regime
            if it > args.iters * 0.60:
                ll_list.append(ll)
                error = torch.norm(y_eval - m, p=2, dim=1)
                mae_list.append(error.abs().mean().item())
                rmse_list.append((error ** 2).mean().sqrt().item())

                # early stop check: no improvement in validation LL for 50 epochs
                if len(ll_list) - np.argmax(ll_list) > 50:
                    it = args.iters
                    print('Early Stop!')

            model.train()

        it += 1

    # get best LL
    i_best = np.argmax(ll_list)

    # evaluate model moments
    model.eval()
    with torch.no_grad():
        m, v = model(x_eval, 1.0)
        m = m * y_std + y_mean
        v = v * y_std ** 2

    return ll_list[i_best], rmse_list[i_best], m.cpu().numpy(), v.cpu().numpy()
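# Usage sketch (illustrative, not part of the original code): a minimal driver
# for the `john` routine above. The concrete values in the Namespace are
# assumptions for demonstration; only the attribute names (`dataset`,
# `n_clusters`, `cuda`, `batch_size`, `shuffel`, `iters`) are taken from how
# `john` reads `args`. The helper name `_example_run_john` is hypothetical.
def _example_run_john(X, y, Xval, yval):
    import argparse
    args = argparse.Namespace(dataset='boston', n_clusters=10, cuda=False,
                              batch_size=256, shuffel=True, iters=10000)
    ll, rmse, mu, var = john(args, X, y, Xval, yval)
    print('best LL: {0:.4f}, best RMSE: {1:.4f}'.format(ll, rmse))
    return mu, var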
def ens_john(X, y, x):
    from sklearn.cluster import KMeans
    from utils import dist
    from itertools import chain

    # NOTE: this function additionally relies on names assumed to be available
    # at module scope: `nn`, `D` (torch.distributions), `gen_Qw`,
    # `locality_sampler2`, `get_pseupoch`, `t_likelihood` and `n_neurons`.

    mean_psu = 1
    mean_ssu = 50
    mean_M = 60

    var_psu = 3
    var_ssu = 7
    var_M = 10

    kmeans = KMeans(n_clusters=10)
    kmeans.fit(np.concatenate([X], axis=0))
    c = torch.tensor(kmeans.cluster_centers_, dtype=torch.float32)

    class translatedSigmoid(nn.Module):
        def __init__(self):
            super(translatedSigmoid, self).__init__()
            self.beta = nn.Parameter(torch.tensor([1.5]))

        def forward(self, x):
            beta = torch.nn.functional.softplus(self.beta)
            alpha = -beta * (6.9077542789816375)
            return torch.sigmoid((x + alpha) / beta)

    class GPNNModel(nn.Module):
        def __init__(self):
            super(GPNNModel, self).__init__()
            self.mean = nn.Sequential(nn.Linear(1, n_neurons),
                                      nn.Sigmoid(),
                                      nn.Linear(n_neurons, 1))
            self.alph = nn.Sequential(nn.Linear(1, n_neurons),
                                      nn.Sigmoid(),
                                      nn.Linear(n_neurons, 1),
                                      nn.Softplus())
            self.bet = nn.Sequential(nn.Linear(1, n_neurons),
                                     nn.Sigmoid(),
                                     nn.Linear(n_neurons, 1),
                                     nn.Softplus())
            self.trans = translatedSigmoid()

        def forward(self, x, switch):
            d = dist(x, c)
            d_min = d.min(dim=1, keepdim=True)[0]
            s = self.trans(d_min)
            mean = self.mean(x)
            if switch:
                a = self.alph(x)
                b = self.bet(x)
                gamma_dist = D.Gamma(a + 1e-8, 1.0 / (b + 1e-8))
                if self.training:
                    samples_var = gamma_dist.rsample(torch.Size([50]))
                else:
                    samples_var = gamma_dist.rsample(torch.Size([2000]))
                x_var = 1.0 / (samples_var + 1e-8)
                var = (1 - s) * x_var + s * torch.tensor([3.5**2])  # HYPERPARAMETER
            else:
                var = torch.tensor([0.05])
            return mean, var

    ens_mean, ens_var = [], []
    for i in range(5):
        model = GPNNModel()
        optimizer = torch.optim.Adam(model.mean.parameters(), lr=1e-2)
        optimizer2 = torch.optim.Adam(chain(model.alph.parameters(),
                                            model.bet.parameters(),
                                            model.trans.parameters()), lr=1e-3)
        n_iter = 6000
        it = 0
        mean_Q, mean_w = gen_Qw(X, mean_psu, mean_ssu, mean_M)
        var_Q, var_w = gen_Qw(X, var_psu, var_ssu, var_M)
        mean_pseupoch = get_pseupoch(mean_w, 0.5)
        var_pseupoch = get_pseupoch(var_w, 0.5)
        opt_switch = 1
        mean_w = torch.Tensor(mean_w)
        var_w = torch.Tensor(var_w)
        model.train()

        while it < n_iter:
            model.train()
            switch = 1.0 if it > 5000 else 0.0

            if it % 11:
                opt_switch = opt_switch + 1  # change between var and mean optimizer

            if not switch:
                optimizer.zero_grad()
                m, v = model(X, switch)
                loss = -(-v.log() - (m.flatten() - y)**2 / (2 * v)).sum()
                loss.backward()
                optimizer.step()
            else:
                if opt_switch % 2 == 0:
                    for b in range(mean_pseupoch):
                        optimizer.zero_grad()
                        batch = locality_sampler2(mean_psu, mean_ssu, mean_Q, mean_w)
                        m, v = model(X[batch], switch)
                        # alternative: -(-v.log() - ((m.flatten()-y[batch])**2).reshape(1,-1,1) / (2*v)) / mean_w[batch].reshape(1,-1,1)
                        loss = -t_likelihood(y[batch], m, v, mean_w[batch])
                        loss = loss.sum()
                        loss.backward()
                        optimizer.step()
                else:
                    for b in range(var_pseupoch):
                        optimizer2.zero_grad()
                        batch = locality_sampler2(var_psu, var_ssu, var_Q, var_w)
                        m, v = model(X[batch], switch)
                        # alternative: -(-(diff.log() / 2 + diff/v + v.log() / 2)) / var_w[batch].reshape(1,-1,1)
                        loss = -t_likelihood(y[batch], m, v, var_w[batch])
                        loss = loss.sum()
                        loss.backward()
                        optimizer2.step()

            if it % 500 == 0:
                model.eval()
                m, v = model(X, switch)
                loss = -(-v.log() - (m.flatten() - y)**2 / (2 * v)).mean()
                print('Iter {0}/{1}, Loss {2}'.format(it, n_iter, loss.item()))
            it += 1

        model.eval()
        with torch.no_grad():
            mean, var = model(x, switch)
        ens_mean.append(mean)
        ens_var.append(var.mean(dim=0))

    ens_mean = torch.stack(ens_mean)
    ens_var = torch.stack(ens_var)

    # combine the ensemble members via the law of total variance
    mean = ens_mean.mean(dim=0)
    var = (ens_var + ens_mean**2).mean(dim=0) - mean**2
    return mean.numpy(), var.sqrt().numpy()
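# Usage sketch (illustrative, not part of the original code): `ens_john` above
# is written for 1-D inputs (its sub-networks start with nn.Linear(1, n_neurons))
# and returns the ensemble predictive mean and standard deviation at the query
# points `x`. The toy data, the assumption that torch tensors are acceptable
# inputs to `gen_Qw`, and the helper name `_example_run_ens_john` are all
# hypothetical, for demonstration only.
def _example_run_ens_john():
    X_toy = torch.linspace(-3, 3, 200).reshape(-1, 1)
    y_toy = torch.sin(X_toy).flatten() + 0.1 * torch.randn(200)
    x_query = torch.linspace(-5, 5, 400).reshape(-1, 1)
    mu, sd = ens_john(X_toy, y_toy, x_query)
    return mu, sd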
def john(args, X, y, Xval, yval):
    from sklearn.cluster import KMeans
    from utils import dist
    from itertools import chain
    from torch import distributions as D
    from locality_sampler import gen_Qw, locality_sampler2
    from sklearn.decomposition import PCA

    if args.dataset == 'protein' or args.dataset == 'year_prediction':
        n_neurons = 100
    else:
        n_neurons = 50
    args.n_clusters = min(args.n_clusters, X.shape[0])

    y, y_mean, y_std = normalize_y(y)

    # locality-sampler hyperparameters
    mean_psu = 1
    mean_ssu = 40
    mean_M = 50

    var_psu = 2
    var_ssu = 10
    var_M = 10

    num_draws_train = 20

    kmeans = KMeans(n_clusters=args.n_clusters)
    if args.dataset != 'year_prediction':
        kmeans.fit(np.concatenate([X], axis=0))
    else:
        kmeans.fit(X[np.random.randint(0, X.shape[0], size=(10000))])
    c = torch.tensor(kmeans.cluster_centers_, dtype=torch.float32)
    if torch.cuda.is_available() and args.cuda:
        c = c.to(torch.float32).to('cuda')
    else:
        c = c.to(torch.float32)

    class translatedSigmoid(torch.nn.Module):
        def __init__(self):
            super(translatedSigmoid, self).__init__()
            self.beta = torch.nn.Parameter(torch.tensor([1.5]))

        def forward(self, x):
            beta = torch.nn.functional.softplus(self.beta)
            alpha = -beta * (6.9077542789816375)
            return torch.sigmoid((x + alpha) / beta)

    class GPNNModel(torch.nn.Module):
        def __init__(self):
            super(GPNNModel, self).__init__()
            self.mean = torch.nn.Sequential(torch.nn.Linear(X.shape[1], n_neurons),
                                            torch.nn.ReLU(),
                                            torch.nn.Linear(n_neurons, 1))
            self.alph = torch.nn.Sequential(torch.nn.Linear(X.shape[1], n_neurons),
                                            torch.nn.ReLU(),
                                            torch.nn.Linear(n_neurons, 1),
                                            torch.nn.Softplus())
            self.bet = torch.nn.Sequential(torch.nn.Linear(X.shape[1], n_neurons),
                                           torch.nn.ReLU(),
                                           torch.nn.Linear(n_neurons, 1),
                                           torch.nn.Softplus())
            self.trans = translatedSigmoid()

        def forward(self, x, switch):
            d = dist(x, c)
            d_min = d.min(dim=1, keepdim=True)[0]
            s = self.trans(d_min)
            mean = self.mean(x)
            if switch:
                a = self.alph(x)
                b = self.bet(x)
                gamma_dist = D.Gamma(a + 1e-8, 1.0 / (b + 1e-8))
                if self.training:
                    samples_var = gamma_dist.rsample(torch.Size([num_draws_train]))
                else:
                    samples_var = gamma_dist.rsample(torch.Size([1000]))
                x_var = 1.0 / (samples_var + 1e-8)
                var = (1 - s) * x_var + s * torch.tensor([y_std**2], device=x.device)  # HYPERPARAMETER
            else:
                var = 0.05 * torch.ones_like(mean)
            return mean, var

    model = GPNNModel()
    if torch.cuda.is_available() and args.cuda:
        model.cuda()
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')

    optimizer = torch.optim.Adam(model.mean.parameters(), lr=1e-2)
    optimizer2 = torch.optim.Adam(chain(model.alph.parameters(),
                                        model.bet.parameters(),
                                        model.trans.parameters()), lr=1e-4)

    mean_Q, mean_w = gen_Qw(X, mean_psu, mean_ssu, mean_M)
    if X.shape[0] > 100000 and X.shape[1] > 10:
        pca = PCA(n_components=0.5)
        temp = pca.fit_transform(X)
        var_Q, var_w = gen_Qw(temp, var_psu, var_ssu, var_M)
    else:
        var_Q, var_w = gen_Qw(X, var_psu, var_ssu, var_M)

    #mean_pseupoch = get_pseupoch(mean_w, 0.5)
    #var_pseupoch = get_pseupoch(var_w, 0.5)
    opt_switch = 1
    mean_w = torch.tensor(mean_w).to(torch.float32).to(device)
    var_w = torch.tensor(var_w).to(torch.float32).to(device)
    model.train()

    X = torch.tensor(X).to(torch.float32).to(device)
    y = torch.tensor(y).to(torch.float32).to(device)
    batches = batchify(X, y, batch_size=args.batch_size, shuffel=args.shuffel)

    it = 0
    while it < args.iters:
        switch = 1.0 if it > args.iters / 2.0 else 0.0

        if it % 11:
            opt_switch = opt_switch + 1  # change between var and mean optimizer

        with torch.autograd.detect_anomaly():
            data, label = next(batches)
            if not switch:
                optimizer.zero_grad()
                m, v = model(data, switch)
                loss = -t_likelihood(label.reshape(-1, 1), m, v.reshape(1, -1, 1)) / X.shape[0]
                loss.backward()
                optimizer.step()
            else:
                if opt_switch % 2 == 0:
                    #for b in range(mean_pseupoch):
                    optimizer.zero_grad()
                    batch = locality_sampler2(mean_psu, mean_ssu, mean_Q, mean_w)
                    m, v = model(X[batch], switch)
                    loss = -t_likelihood(y[batch].reshape(-1, 1), m, v, mean_w[batch]) / X.shape[0]
                    loss.backward()
                    optimizer.step()
                else:
                    #for b in range(var_pseupoch):
                    optimizer2.zero_grad()
                    batch = locality_sampler2(var_psu, var_ssu, var_Q, var_w)
                    m, v = model(X[batch], switch)
                    loss = -t_likelihood(y[batch].reshape(-1, 1), m, v, var_w[batch]) / X.shape[0]
                    loss.backward()
                    optimizer2.step()

        if it % 500 == 0:
            m, v = model(data, switch)
            loss = -(-v.log() / 2 - ((m.flatten() - label)**2).reshape(1, -1, 1) / (2 * v)).mean()
            print('Iter {0}/{1}, Loss {2}'.format(it, args.iters, loss.item()))
        it += 1

    model.eval()
    data = torch.tensor(Xval).to(torch.float32).to(device)
    label = torch.tensor(yval).to(torch.float32).to(device)
    with torch.no_grad():
        m, v = model(data, switch)
        m = m * y_std + y_mean
        v = v * y_std**2
        #log_px = normal_log_prob(label, m, v).mean(dim=0)  # check for correctness
        log_px = t_likelihood(label.reshape(-1, 1), m, v) / Xval.shape[0]  # check
        rmse = ((label - m.flatten())**2).mean().sqrt()

    return log_px.mean().item(), rmse.item()