def rsample(self, sample_shape=torch.Size()):
    """Draw a reparameterized sample ``loc + eps * scale``.

    ``eps`` is standard-Cauchy noise, filled in place into a tensor created
    from ``self.loc`` via ``Tensor.new``.

    Args:
        sample_shape: Extra sample shape forwarded to
            ``self._extended_shape``.

    Returns:
        The tensor ``self.loc + eps * self.scale``.
    """
    full_shape = self._extended_shape(sample_shape)
    noise = self.loc.new(full_shape)
    noise.cauchy_()
    scaled_noise = noise * self.scale
    return self.loc + scaled_noise
def testALEBOGP(self):
    """Exercise ALEBOGP model construction helpers and MAP state-dict extraction.

    Covers the non-batch model from ``get_map_model``, the batch model from
    ``get_batch_model``, the combined ``get_fitted_model`` path, and
    ``extract_map_statedict`` for a single model and a ``ModelListGP``.
    """
    dbl = torch.double
    B = torch.tensor(
        [[1.0, 2.0, 3.0, 4.0, 5.0], [2.0, 3.0, 4.0, 5.0, 6.0]], dtype=dbl)
    train_X = torch.tensor([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=dbl)
    train_Y = torch.tensor([[1.0], [2.0], [3.0]], dtype=dbl)
    train_Yvar = 0.1 * torch.ones(3, 1, dtype=dbl)
    # Training data shared by every model-building helper below.
    data = dict(B=B, train_X=train_X, train_Y=train_Y, train_Yvar=train_Yvar)

    def check_mvn_shapes(mvn):
        # Three query points -> 3-vector mean/variance and 3x3 covariance.
        self.assertEqual(mvn.mean.shape, torch.Size([3]))
        self.assertEqual(mvn.variance.shape, torch.Size([3]))
        self.assertEqual(mvn.covariance_matrix.shape, torch.Size([3, 3]))

    expected_keys = {
        "covar_module.base_kernel.Uvec",
        "covar_module.raw_outputscale",
        "mean_module.constant",
        "covar_module.raw_outputscale_constraint.lower_bound",
        "covar_module.raw_outputscale_constraint.upper_bound",
    }

    def check_map_statedict(state_dict):
        self.assertEqual(len(state_dict), 5)
        self.assertEqual(set(state_dict), expected_keys)
        self.assertEqual(
            state_dict["covar_module.base_kernel.Uvec"].shape,
            torch.Size([3]))

    # First non-batch
    mll = get_map_model(restarts=1, init_state_dict=None, **data)
    m = mll.model
    m.eval()
    self.assertIsInstance(m, ALEBOGP)
    self.assertIsInstance(m.covar_module.base_kernel, ALEBOKernel)

    X = torch.tensor([[2.0, 2.0], [3.0, 3.0], [4.0, 4.0]], dtype=dbl)
    check_mvn_shapes(m(X))

    # Batch: replicate the fitted hyperparameters five times
    kernel = m.covar_module.base_kernel
    m_b = get_batch_model(
        Uvec_batch=kernel.Uvec.repeat(5, 1),
        mean_constant_batch=m.mean_module.constant.repeat(5, 1),
        output_scale_batch=m.covar_module.raw_outputscale.repeat(5),
        **data,
    )
    self.assertEqual(m_b._aug_batch_shape, torch.Size([5]))
    check_mvn_shapes(m_b(X))
    self.assertEqual(
        m_b.posterior(X).mvn.covariance_matrix.shape, torch.Size([3, 3]))

    # The whole process in get_fitted_model
    m_b2 = get_fitted_model(
        restarts=1,
        nsamp=5,
        init_state_dict=m.state_dict(),
        **data,
    )
    self.assertEqual(m_b2._aug_batch_shape, torch.Size([5]))

    # Test extract_map_statedict
    map_sds = extract_map_statedict(m_b=m_b, num_outputs=1)
    self.assertEqual(len(map_sds), 1)
    check_map_statedict(map_sds[0])

    ml = ModelListGP(m_b, m_b2)
    map_sds = extract_map_statedict(m_b=ml, num_outputs=2)
    self.assertEqual(len(map_sds), 2)
    for state_dict in map_sds[:2]:
        check_map_statedict(state_dict)
def test_phase_vocoder(complex_specgrams, rate, hop_length):
    """Compare ``F.phase_vocoder`` with ``librosa.phase_vocoder``.

    Checks both the stretched output shape and, for the first item along
    every leading dimension, the complex values.
    """
    # Using a decorator here causes parametrize to fail on Python 2
    if not IMPORT_LIBROSA:
        raise unittest.SkipTest('Librosa is not available')

    # Due to cumulative sum, numerical error in using torch.float32 will
    # result in bottom right values of the stretched spectrogram to not
    # match with librosa.
    complex_specgrams = complex_specgrams.type(torch.float64)
    phase_advance = torch.linspace(
        0, np.pi * hop_length, complex_specgrams.shape[-3],
        dtype=torch.float64)[..., None]

    complex_specgrams_stretch = F.phase_vocoder(
        complex_specgrams, rate=rate, phase_advance=phase_advance)

    # == Test shape
    expected_size = list(complex_specgrams.size())
    expected_size[-2] = int(np.ceil(expected_size[-2] / rate))

    assert complex_specgrams.dim() == complex_specgrams_stretch.dim()
    assert complex_specgrams_stretch.size() == torch.Size(expected_size)

    # == Test values
    # Multi-axis indices must be a tuple; indexing with a list of
    # ints/slices is deprecated and can be read as advanced indexing.
    index = tuple([0] * (complex_specgrams.dim() - 3) + [slice(None)] * 3)
    mono_complex_specgram = complex_specgrams[index].numpy()
    # Last axis holds (real, imag); fold it into a complex array.
    mono_complex_specgram = mono_complex_specgram[..., 0] + \
        mono_complex_specgram[..., 1] * 1j
    expected_complex_stretch = librosa.phase_vocoder(
        mono_complex_specgram, rate=rate, hop_length=hop_length)

    complex_stretch = complex_specgrams_stretch[index].numpy()
    complex_stretch = complex_stretch[..., 0] + 1j * complex_stretch[..., 1]
    assert np.allclose(complex_stretch, expected_complex_stretch, atol=1e-5)


def test_torchscript_create_fb_matrix(self):
    """Run ``F.create_fb_matrix`` through ``_test_torchscript_functional``."""
    n_stft = 100
    f_min = 0.0
    f_max = 20.0
    n_mels = 10
    sample_rate = 16000

    _test_torchscript_functional(
        F.create_fb_matrix, n_stft, f_min, f_max, n_mels, sample_rate)


def test_torchscript_amplitude_to_DB(self):
    """Run ``F.amplitude_to_DB`` through ``_test_torchscript_functional``."""
    spec = torch.rand((6, 201))
    multiplier = 10.0
    amin = 1e-10
    db_multiplier = 0.0
    top_db = 80.0

    _test_torchscript_functional(
        F.amplitude_to_DB, spec, multiplier, amin, db_multiplier, top_db)


def test_torchscript_create_dct(self):
    """Run ``F.create_dct`` through ``_test_torchscript_functional``."""
    n_mfcc = 40
    n_mels = 128
    norm = "ortho"

    _test_torchscript_functional(F.create_dct, n_mfcc, n_mels, norm)


def test_torchscript_mu_law_encoding(self):
    """Run ``F.mu_law_encoding`` through ``_test_torchscript_functional``."""
    tensor = torch.rand((1, 10))
    qc = 256

    _test_torchscript_functional(F.mu_law_encoding, tensor, qc)


def test_torchscript_mu_law_decoding(self):
    """Run ``F.mu_law_decoding`` through ``_test_torchscript_functional``."""
    tensor = torch.rand((1, 10))
    qc = 256

    _test_torchscript_functional(F.mu_law_decoding, tensor, qc)


def test_torchscript_complex_norm(self):
    """Run ``F.complex_norm`` through ``_test_torchscript_functional``."""
    # No trailing comma: the argument must be a tensor, not a 1-tuple.
    complex_tensor = torch.randn(1, 2, 1025, 400, 2)
    power = 2

    _test_torchscript_functional(F.complex_norm, complex_tensor, power)


def test_mask_along_axis(self):
    """Run ``F.mask_along_axis`` through ``_test_torchscript_functional``."""
    # No trailing comma: the argument must be a tensor, not a 1-tuple.
    specgram = torch.randn(2, 1025, 400)
    mask_param = 100
    mask_value = 30.
    axis = 2

    _test_torchscript_functional(
        F.mask_along_axis, specgram, mask_param, mask_value, axis)


def test_mask_along_axis_iid(self):
    """Run ``F.mask_along_axis_iid`` through ``_test_torchscript_functional``."""
    # No trailing comma: the argument must be a tensor, not a 1-tuple.
    specgrams = torch.randn(4, 2, 1025, 400)
    mask_param = 100
    mask_value = 30.
    axis = 2

    _test_torchscript_functional(
        F.mask_along_axis_iid, specgrams, mask_param, mask_value, axis)


def test_torchscript_gain(self):
    """Run ``F.gain`` through ``_test_torchscript_functional``."""
    tensor = torch.rand((1, 1000))
    gainDB = 2.0

    _test_torchscript_functional(F.gain, tensor, gainDB)


def test_torchscript_dither(self):
    """Run ``F.dither`` (default, "RPDF", "GPDF") through the TorchScript check."""
    tensor = torch.rand((1, 1000))

    _test_torchscript_functional(F.dither, tensor)
    _test_torchscript_functional(F.dither, tensor, "RPDF")
    _test_torchscript_functional(F.dither, tensor, "GPDF")
def paired_transform_torch(image, transform, output_size, block_size=(1, 1)):
    """Run a paired augmentation on ``image`` and re-sample it with PyTorch.

    The image is passed through ``transform`` (followed by a tensor
    conversion step) to obtain two augmented views and their affine
    matrices. The same matrices are then converted to PyTorch grid form and
    applied with ``F.affine_grid`` / ``F.grid_sample`` so the OpenCV-side and
    PyTorch-side results can be compared.

    Args:
        image: Input image array; indexed as ``image.shape[:2]`` and
            transposed with ``(2, 0, 1)``, so presumably H x W x C.
        transform: Segmentation transform composed ahead of the
            tensor-conversion step.
        output_size: Pair ``(height, width)`` of the sampled output grids.
        block_size: Forwarded to ``seg_transforms.SegDataPipeline``.

    Returns:
        dict with keys ``x0``, ``torch0``, ``x1``, ``torch1``, ``x01`` and
        ``m01`` holding channel-last NumPy arrays for the first batch item
        (``m01`` keeps only channel 0).
    """
    transform = seg_transforms.SegTransformCompose([
        transform,
        datapipe.seg_transforms_cv.SegCVTransformNormalizeToTensor(None, None)
    ])
    pipe = seg_transforms.SegDataPipeline(block_size, transform)

    torch_device = torch.device('cpu')

    x0, m0, xf0, x1, m1, xf1 = pipe.prepare_unsupervised_paired_batch([image])

    padded_shape = x0.shape[2:4]

    # Transform mapping view 0 onto view 1: xf1 composed with inverse(xf0).
    xf0_to_1 = affine.cat_nx2x3(xf1, affine.inv_nx2x3(xf0))

    t_image_xf0 = affine.cv_to_torch(xf0, padded_shape, image.shape[:2])
    t_image_xf1 = affine.cv_to_torch(xf1, padded_shape, image.shape[:2])
    t_xf0_to_1 = affine.cv_to_torch(xf0_to_1, padded_shape)

    image_f = img_as_float(image).astype(np.float32)

    t_image = torch.tensor(image_f.transpose(2, 0, 1)[None, ...],
                           dtype=torch.float, device=torch_device)
    t_x0 = torch.tensor(x0, dtype=torch.float, device=torch_device)
    t_m0 = torch.tensor(m0, dtype=torch.float, device=torch_device)
    t_m1 = torch.tensor(m1, dtype=torch.float, device=torch_device)
    t_image_xf0 = torch.tensor(t_image_xf0, dtype=torch.float,
                               device=torch_device)
    t_image_xf1 = torch.tensor(t_image_xf1, dtype=torch.float,
                               device=torch_device)
    t_xf0_to_1 = torch.tensor(t_xf0_to_1, dtype=torch.float,
                              device=torch_device)

    # torch.Size takes ONE iterable; passing the four dimensions as separate
    # positional arguments raises TypeError.
    output_shape = torch.Size(
        (len(x0), 3, output_size[0], output_size[1]))
    grid_image0 = F.affine_grid(t_image_xf0, output_shape)
    grid_image1 = F.affine_grid(t_image_xf1, output_shape)
    grid_0to1 = F.affine_grid(t_xf0_to_1, output_shape)

    t_a = F.grid_sample(t_image, grid_image0)
    t_b = F.grid_sample(t_image, grid_image1)
    t_x01 = F.grid_sample(t_x0, grid_0to1)
    # Warped mask of view 0, restricted to where view 1's mask is set.
    t_m01 = F.grid_sample(t_m0, grid_0to1) * t_m1

    t_a_np = t_a.detach().cpu().numpy()[0].transpose(1, 2, 0)
    t_b_np = t_b.detach().cpu().numpy()[0].transpose(1, 2, 0)
    t_x01_np = t_x01.detach().cpu().numpy()[0].transpose(1, 2, 0)
    t_m01_np = t_m01.detach().cpu().numpy()[0].transpose(1, 2, 0)

    x0 = x0[0].transpose(1, 2, 0)
    x1 = x1[0].transpose(1, 2, 0)

    return dict(x0=x0, torch0=t_a_np, x1=x1, torch1=t_b_np,
                x01=t_x01_np, m01=t_m01_np[:, :, 0])
def testAcq(self):
    """Check ``ei_or_nei`` acquisition selection and ``alebo_acqf_optimizer``.

    Verifies the acquisition class and ``best_f`` for both objective signs,
    then patches ``optimize_acqf`` to inspect how the ALEBO optimizer calls
    it for ``n=1`` and ``n=2``.
    """
    dbl = torch.double
    B = torch.tensor(
        [[1.0, 2.0, 3.0, 4.0, 5.0], [2.0, 3.0, 4.0, 5.0, 6.0]], dtype=dbl)
    train_X = torch.tensor([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=dbl)
    train_Y = torch.tensor([[1.0], [2.0], [3.0]], dtype=dbl)
    train_Yvar = 0.1 * torch.ones(3, 1, dtype=dbl)
    m = ALEBOGP(B=B, train_X=train_X, train_Y=train_Y, train_Yvar=train_Yvar)
    m.eval()

    def build_acq(weight, noiseless):
        # Single-outcome acquisition with the given objective weight.
        return ei_or_nei(
            model=m,
            objective_weights=torch.tensor([weight], dtype=dbl),
            outcome_constraints=None,
            X_observed=train_X,
            X_pending=None,
            q=1,
            noiseless=noiseless,
        )

    def run_optimizer(acq_function, n):
        # Run the ALEBO optimizer with optimize_acqf patched out and hand
        # back the mock so the caller can inspect the recorded calls.
        with mock.patch(
                "ax.models.torch.alebo.optimize_acqf",
                autospec=True,
                return_value=(train_X, train_Y),
        ) as patched:
            alebo_acqf_optimizer(
                acq_function=acq_function,
                bounds=None,
                n=n,
                inequality_constraints=5.0,
                fixed_features=None,
                rounding_func=None,
                raw_samples=100,
                num_restarts=5,
                B=B,
            )
        return patched

    acq = build_acq(1.0, noiseless=True)
    self.assertIsInstance(acq, ExpectedImprovement)
    self.assertEqual(acq.best_f.item(), 3.0)

    acq = build_acq(-1.0, noiseless=True)
    self.assertEqual(acq.best_f.item(), 1.0)

    optim_mock = run_optimizer(acq, n=1)
    self.assertEqual(optim_mock.call_count, 1)
    self.assertIsInstance(
        optim_mock.mock_calls[0][2]["acq_function"], ExpectedImprovement)

    acq = build_acq(-1.0, noiseless=False)
    self.assertIsInstance(acq, qNoisyExpectedImprovement)

    optim_mock = run_optimizer(acq, n=2)
    self.assertEqual(optim_mock.call_count, 2)
    first_call_kwargs = optim_mock.mock_calls[0][2]
    self.assertIsInstance(
        first_call_kwargs["acq_function"], qNoisyExpectedImprovement)
    self.assertEqual(first_call_kwargs["num_restarts"], 5)
    self.assertEqual(first_call_kwargs["inequality_constraints"], 5.0)
    X = first_call_kwargs["batch_initial_conditions"]
    self.assertEqual(X.shape, torch.Size([5, 1, 2]))
    # Make sure initialization is inside subspace
    start_points = X[:, 0, :]
    Z = (B @ torch.pinverse(B) @ start_points.t()).t()
    self.assertTrue(torch.allclose(Z, start_points))
def forward(self, theta):
    """Build a sampling grid from a batch of affine matrices.

    Args:
        theta: Batch of affine parameters; its first dimension is the
            batch size.

    Returns:
        The result of ``F.affine_grid`` for an output of size
        ``(batch, self.out_ch, self.out_h, self.out_w)``.
    """
    theta = theta.contiguous()
    n_batch = theta.size(0)
    target_size = torch.Size([n_batch, self.out_ch, self.out_h, self.out_w])
    return F.affine_grid(theta, target_size)
def make_sparse_eye(size):
    """Return a ``size x size`` identity matrix as a sparse COO float32 tensor.

    Args:
        size: Number of rows (and columns) of the identity matrix.

    Returns:
        Sparse COO tensor of shape ``(size, size)`` with ones on the diagonal.
    """
    diag = torch.arange(size)
    # On the diagonal the row index equals the column index: shape (2, size).
    indices = torch.stack([diag, diag], dim=0)
    values = torch.ones(size, dtype=torch.float32)
    # torch.sparse_coo_tensor replaces the deprecated legacy constructor
    # torch.sparse.FloatTensor(indices, values, size).
    return torch.sparse_coo_tensor(indices, values, (size, size))
features = sp.coo_matrix(sp.identity(adj.shape[0])) # featureless support = None num_supports = None model_func = None if cfg.model == 'gcn': support = preprocess_adj(adj) num_supports = 1 model_func = GCN values = features.data indices = np.vstack((features.row, features.col)) i = torch.LongTensor(indices) v = torch.FloatTensor(values) shape = features.shape t_features = torch.sparse.FloatTensor(i, v, torch.Size(shape)) t_features = t_features.float() t_y_train = torch.from_numpy(y_train) t_y_val = torch.from_numpy(y_val) t_y_test = torch.from_numpy(y_test) t_train_mask = torch.from_numpy(train_mask.astype(np.float32)) tm_train_mask = torch.transpose(torch.unsqueeze(t_train_mask, 0), 1, 0).repeat(1, y_train.shape[1]) values = support.data indices = np.vstack((support.row, support.col)) i = torch.LongTensor(indices) v = torch.FloatTensor(values) shape = support.shape t_support = torch.sparse.FloatTensor(i, v, torch.Size(shape))
def rsample(self, sample_shape=torch.Size()):
    """Draw a reparameterized sample between ``self.low`` and ``self.high``.

    Uniform noise from ``torch.rand`` (dtype and device taken from
    ``self.low``) is scaled by ``high - low`` and shifted by ``low``.

    Args:
        sample_shape: Extra sample shape forwarded to
            ``self._extended_shape``.

    Returns:
        The tensor ``low + rand * (high - low)``.
    """
    full_shape = self._extended_shape(sample_shape)
    lower = self.low
    span = self.high - lower
    unit_noise = torch.rand(full_shape, dtype=lower.dtype, device=lower.device)
    return lower + unit_noise * span
def __init__(self, d_in, d_rule, d_out):
    """Create the layer with an all-zero coefficient tensor.

    Args:
        d_in: Number of inputs; each coefficient row has ``d_in + 1`` entries.
        d_rule: Size of the first coefficient dimension.
        d_out: Size of the second coefficient dimension.
    """
    super(ConsequentLayer, self).__init__()
    # Shape (d_rule, d_out, d_in + 1), tracked by autograd.
    coeff_dims = (d_rule, d_out, d_in + 1)
    self._coeff = torch.zeros(coeff_dims, dtype=dtype, requires_grad=True)
def test_shufflenetv1_backbone():
    """Test ShuffleNetV1: argument validation, freezing, norm state and output shapes."""

    def forward_in_train_mode(model, norm_cls):
        # Initialise, switch to train mode, check every norm layer has the
        # expected class, then run one forward pass on a random 224x224 image.
        model.init_weights()
        model.train()
        for m in model.modules():
            if is_norm(m):
                assert isinstance(m, norm_cls)
        imgs = torch.randn(1, 3, 224, 224)
        return model(imgs)

    def check_feature_shapes(feats, channels, sizes):
        assert len(feats) == len(channels)
        for out, n_ch, hw in zip(feats, channels, sizes):
            assert out.shape == torch.Size((1, n_ch, hw, hw))

    invalid_kwargs = (
        # frozen_stages must be in range(-1, 4)
        dict(frozen_stages=10),
        # the item in out_indices must be in range(0, 4)
        dict(out_indices=[5]),
        # groups must be in [1, 2, 3, 4, 8]
        dict(groups=10),
    )
    for kwargs in invalid_kwargs:
        with pytest.raises(ValueError):
            ShuffleNetV1(**kwargs)

    with pytest.raises(TypeError):
        # pretrained must be str or None
        model = ShuffleNetV1()
        model.init_weights(pretrained=1)

    # Test ShuffleNetV1 norm state
    model = ShuffleNetV1()
    model.init_weights()
    model.train()
    assert check_norm_state(model.modules(), True)

    # Test ShuffleNetV1 with first stage frozen
    frozen_stages = 1
    model = ShuffleNetV1(frozen_stages=frozen_stages, out_indices=(0, 1, 2))
    model.init_weights()
    model.train()
    for param in model.conv1.parameters():
        assert param.requires_grad is False
    for layer in model.layers[:frozen_stages]:
        for mod in layer.modules():
            if isinstance(mod, _BatchNorm):
                assert mod.training is False
        for param in layer.parameters():
            assert param.requires_grad is False

    # Test ShuffleNetV1 forward with groups=1, 2, 3, 4, 8
    all_sizes = (28, 14, 7)
    channels_by_groups = (
        (1, (144, 288, 576)),
        (2, (200, 400, 800)),
        (3, (240, 480, 960)),
        (4, (272, 544, 1088)),
        (8, (384, 768, 1536)),
    )
    for groups, channels in channels_by_groups:
        model = ShuffleNetV1(groups=groups, out_indices=(0, 1, 2))
        feat = forward_in_train_mode(model, _BatchNorm)
        check_feature_shapes(feat, channels, all_sizes)

    # Test ShuffleNetV1 forward with GroupNorm forward
    model = ShuffleNetV1(
        groups=3,
        norm_cfg=dict(type='GN', num_groups=2, requires_grad=True),
        out_indices=(0, 1, 2))
    feat = forward_in_train_mode(model, GroupNorm)
    check_feature_shapes(feat, (240, 480, 960), all_sizes)

    # Test ShuffleNetV1 forward with layers 1, 2 forward
    model = ShuffleNetV1(groups=3, out_indices=(1, 2))
    feat = forward_in_train_mode(model, _BatchNorm)
    check_feature_shapes(feat, (480, 960), (14, 7))

    # Test ShuffleNetV1 forward with layers 2 forward
    model = ShuffleNetV1(groups=3, out_indices=(2, ))
    feat = forward_in_train_mode(model, _BatchNorm)
    assert isinstance(feat, torch.Tensor)
    assert feat.shape == torch.Size((1, 960, 7, 7))

    # Test ShuffleNetV1 forward with checkpoint forward
    model = ShuffleNetV1(groups=3, with_cp=True)
    for m in model.modules():
        if is_block(m):
            assert m.with_cp

    # Test ShuffleNetV1 with norm_eval
    model = ShuffleNetV1(norm_eval=True)
    model.init_weights()
    model.train()
    assert check_norm_state(model.modules(), False)
import scipy
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns

# Script: feed samples from a standard normal base distribution through a
# dense autoregressive parameter network and plot the distribution of one
# coordinate of the predicted mean.

# Settings
#torch.manual_seed(0)
batch_dim = 100000  # number of base samples drawn
input_dim = 128  # dimensionality of each sample

# Create non-lazy parameters
base_dist = torch.distributions.Normal(torch.zeros(input_dim), torch.ones(input_dim))
bijection = bijectors.AffineAutoregressive()
lazy_params = params.DenseAutoregressive(hidden_dims=[256,256,256,256,256,256,256]) #, permutation=torch.Tensor([0, 1, 2, 3]))
# NOTE: this rebinds ``params`` from the imported module to the instantiated
# parameter object; ``params.DenseAutoregressive`` is unavailable afterwards.
params = lazy_params(torch.Size([input_dim]), bijection.param_shapes(base_dist))

x = base_dist.rsample(torch.Size([batch_dim]))
# ``params(x)`` yields the two parameter tensors, unpacked as mean and
# log-scale and converted to NumPy.
mean, log_scale = [y.detach().numpy() for y in params(x)]

print(mean.shape, log_scale.shape)
#print(mean[:10,0])
#print(mean[:10,1])
# Summary statistics of column 1 of the predicted mean.
print(mean[:,1].mean(), mean[:,1].std())

#plt.plot(mean[:,0], mean[:,1], 'o', color='blue', alpha=0.7, label='mean')
# KDE-only plot (no histogram) of column 1 of the predicted mean.
sns.distplot(mean[:,1], hist = False, kde = True, kde_kws = {'linewidth': 3}, label = 'mean')
#plt.plot(z_base[:,0], z_base[:,1], 'o', color='red', alpha=0.7, label='base')
plt.title('Samples from MADE')
#plt.xlabel('$x_1$')
def output_shape(self):
    """Return ``self._output_shape`` without its first entry, as a ``torch.Size``."""
    trailing_dims = self._output_shape[1:]
    return torch.Size(trailing_dims)
def predict(self, triplet, s_hist, o_hist, global_model):
    """Score one test triplet, first rolling the predicted graph forward if time changed.

    When the triplet's timestamp differs from ``self.latest_time``, the
    method samples ``self.num_k`` candidate subjects and objects from
    ``global_model``, keeps the top-``num_k`` scored candidates on each side
    in the history caches, builds the graph for ``self.latest_time`` from
    those caches, stores its global embedding, moves the caches into the
    per-entity test histories and finally sets ``self.latest_time`` to the
    new timestamp. It then scores the query triplet itself.

    Args:
        triplet: Indexable with four entries ``(s, r, o, t)``; ``t`` must
            support ``.cpu()``, and ``s``/``o`` must support ``.view``.
        s_hist: Subject history; only ``len(s_hist[0])`` is inspected here.
        o_hist: Object history; only ``len(o_hist[0])`` is inspected here.
        global_model: Object whose ``predict(time, graph_dict, subject=...)``
            returns a 3-tuple (embedding, scores, probabilities).

    Returns:
        Tuple ``(loss, sub_pred, ob_pred)`` where ``loss`` is the sum of
        the criterion on the object and subject predictions.
    """
    # Query triplet. These names must stay intact until the end of the
    # method, so the candidate loops below use their own variable names.
    s = triplet[0]
    r = triplet[1]
    o = triplet[2]
    t = triplet[3].cpu()

    if self.latest_time != t:
        # ---- candidate subjects at the previous timestamp ----
        _, _, prob_sub = global_model.predict(self.latest_time,
                                              self.graph_dict,
                                              subject=True)
        m = torch.distributions.categorical.Categorical(prob_sub)
        subjects = m.sample(torch.Size([self.num_k]))
        prob_subjects = prob_sub[subjects]
        # NOTE(review): the sampled ids appear to be 0-dim tensors, which
        # hash by identity, so this set may never detect duplicates; confirm
        # before relying on the de-duplication.
        s_done = set()
        for cand_s, prob_s in zip(subjects, prob_subjects):
            if cand_s in s_done:
                continue
            s_done.add(cand_s)
            ss = torch.LongTensor([cand_s]).repeat(self.num_rels)
            rr = torch.arange(0, self.num_rels)
            probs = prob_s * self.pred_r_rank2(ss, rr, subject=True)
            probs, indices = torch.topk(probs.view(-1), self.num_k,
                                        sorted=False)
            self.preds_list_s[cand_s] = probs.view(-1)
            self.preds_ind_s[cand_s] = indices.view(-1)

        # Flatten the per-candidate top-k scores into one tensor and keep
        # the overall top-k (subject, relation, object) combinations.
        s_to_id = dict()
        s_num = len(self.preds_list_s.keys())
        prob_tensor = torch.zeros(s_num * self.num_k)
        for i, cand_s in enumerate(self.preds_list_s.keys()):
            s_to_id[i] = cand_s
            prob_tensor[i * self.num_k:(i + 1) *
                        self.num_k] = self.preds_list_s[cand_s]
        _, triple_candidates = torch.topk(prob_tensor, self.num_k,
                                          sorted=False)
        indices = triple_candidates // self.num_k
        for i, idx in enumerate(indices):
            cand_s = s_to_id[idx.item()]
            num_r_num_s = self.preds_ind_s[cand_s][triple_candidates[i] %
                                                   self.num_k]
            # The flat index encodes relation * in_dim + entity.
            rr = num_r_num_s // self.in_dim
            o_s = num_r_num_s % self.in_dim
            self.s_his_cache[cand_s] = self.update_cache(
                self.s_his_cache[cand_s], rr, o_s.view(-1, 1))
            self.s_his_cache_t[cand_s] = self.latest_time.item()

        # ---- candidate objects at the previous timestamp ----
        # Query the global model at self.latest_time, like the subject side
        # and the embedding call below; the graph for ``t`` is not built yet.
        _, ob, _ = global_model.predict(self.latest_time,
                                        self.graph_dict,
                                        subject=False)
        prob_ob = torch.softmax(ob.view(-1), dim=0)
        m = torch.distributions.categorical.Categorical(prob_ob)
        objects = m.sample(torch.Size([self.num_k]))
        prob_objects = prob_ob[objects]
        o_done = set()
        for cand_o, prob_o in zip(objects, prob_objects):
            if cand_o in o_done:
                continue
            o_done.add(cand_o)
            oo = torch.LongTensor([cand_o]).repeat(self.num_rels)
            rr = torch.arange(0, self.num_rels)
            probs = prob_o * self.pred_r_rank2(oo, rr, subject=False)
            probs, indices = torch.topk(probs.view(-1), self.num_k,
                                        sorted=False)
            self.preds_list_o[cand_o] = probs.view(-1)
            self.preds_ind_o[cand_o] = indices.view(-1)

        o_to_id = dict()
        o_num = len(self.preds_list_o.keys())
        prob_tensor = torch.zeros(o_num * self.num_k)
        for i, cand_o in enumerate(self.preds_list_o.keys()):
            o_to_id[i] = cand_o
            prob_tensor[i * self.num_k:(i + 1) *
                        self.num_k] = self.preds_list_o[cand_o]
        _, triple_candidates = torch.topk(prob_tensor, self.num_k,
                                          sorted=False)
        indices = triple_candidates // self.num_k
        for i, idx in enumerate(indices):
            cand_o = o_to_id[idx.item()]
            num_r_num_o = self.preds_ind_o[cand_o][triple_candidates[i] %
                                                   self.num_k]
            rr = num_r_num_o // self.in_dim
            s_o = num_r_num_o % self.in_dim
            self.o_his_cache[cand_o] = self.update_cache(
                self.o_his_cache[cand_o], rr, s_o.view(-1, 1))
            self.o_his_cache_t[cand_o] = self.latest_time.item()

        # ---- build the predicted graph and its global embedding ----
        self.data = get_data(self.s_his_cache, self.o_his_cache)
        self.graph_dict[self.latest_time.item()] = get_big_graph(
            self.data, self.num_rels)
        global_emb_prev_t, _, _ = global_model.predict(self.latest_time,
                                                       self.graph_dict,
                                                       subject=True)
        self.global_emb[self.latest_time.item()] = global_emb_prev_t

        # ---- move caches into the bounded per-entity test histories ----
        for ee in range(self.in_dim):
            if len(self.s_his_cache[ee]) != 0:
                # Keep at most seq_len entries, dropping the oldest first.
                while len(self.s_hist_test[ee]) >= self.seq_len:
                    self.s_hist_test[ee].pop(0)
                    self.s_hist_test_t[ee].pop(0)
                self.s_hist_test[ee].append(
                    self.s_his_cache[ee].cpu().numpy().copy())
                self.s_hist_test_t[ee].append(self.s_his_cache_t[ee])
                self.s_his_cache[ee] = []
                self.s_his_cache_t[ee] = None
            if len(self.o_his_cache[ee]) != 0:
                while len(self.o_hist_test[ee]) >= self.seq_len:
                    self.o_hist_test[ee].pop(0)
                    self.o_hist_test_t[ee].pop(0)
                self.o_hist_test[ee].append(
                    self.o_his_cache[ee].cpu().numpy().copy())
                self.o_hist_test_t[ee].append(self.o_his_cache_t[ee])
                self.o_his_cache[ee] = []
                self.o_his_cache_t[ee] = None

        self.latest_time = t
        self.data = None
        # Reset the per-timestamp candidate stores.
        self.preds_list_s = defaultdict(lambda: torch.zeros(self.num_k))
        self.preds_ind_s = defaultdict(lambda: torch.zeros(self.num_k))
        self.preds_list_o = defaultdict(lambda: torch.zeros(self.num_k))
        self.preds_ind_o = defaultdict(lambda: torch.zeros(self.num_k))

    # ---- score the query triplet ----
    if len(s_hist[0]) == 0 or len(self.s_hist_test[s]) == 0:
        s_h = torch.zeros(self.h_dim).cuda()
    else:
        s_history = self.s_hist_test[s]
        s_history_t = self.s_hist_test_t[s]
        inp, _ = self.aggregator.predict((s_history, s_history_t),
                                         s,
                                         r,
                                         self.ent_embeds,
                                         self.rel_embeds[:self.num_rels],
                                         self.graph_dict,
                                         self.global_emb,
                                         reverse=False)
        tt, s_h = self.encoder(inp.view(1, len(s_history), 4 * self.h_dim))
        s_h = s_h.squeeze()

    if len(o_hist[0]) == 0 or len(self.o_hist_test[o]) == 0:
        o_h = torch.zeros(self.h_dim).cuda()
    else:
        o_history = self.o_hist_test[o]
        o_history_t = self.o_hist_test_t[o]
        inp, _ = self.aggregator.predict((o_history, o_history_t),
                                         o,
                                         r,
                                         self.ent_embeds,
                                         self.rel_embeds[self.num_rels:],
                                         self.graph_dict,
                                         self.global_emb,
                                         reverse=True)
        tt, o_h = self.encoder(inp.view(1, len(o_history), 4 * self.h_dim))
        o_h = o_h.squeeze()

    # The first num_rels relation embeddings are used for the subject-side
    # query, the remaining ones for the object-side (reverse) query.
    ob_pred = self.linear(
        torch.cat(
            (self.ent_embeds[s], s_h, self.rel_embeds[:self.num_rels][r]),
            dim=0))
    sub_pred = self.linear(
        torch.cat(
            (self.ent_embeds[o], o_h, self.rel_embeds[self.num_rels:][r]),
            dim=0))

    loss_sub = self.criterion(ob_pred.view(1, -1), o.view(-1))
    loss_ob = self.criterion(sub_pred.view(1, -1), s.view(-1))

    loss = loss_sub + loss_ob

    return loss, sub_pred, ob_pred
def test_condition_on_observations(self):
    """Check ``condition_on_observations`` shapes and batch/non-batch agreement.

    Runs over every combination of fidelity flags, model batch shape,
    number of outputs and dtype. For batched models evaluated on a 2-d
    input, the fantasy posterior is compared with the one from an
    equivalent non-batched model built from the first batch element.
    """
    configs = [
        (iteration_fidelity, data_fidelity, batch_shape, num_outputs, double)
        for (iteration_fidelity, data_fidelity) in [
            (False, True),
            (True, False),
            (True, True),
        ]
        for batch_shape in (torch.Size(), torch.Size([2]))
        for num_outputs in (1, 2)
        for double in (False, True)
    ]
    for (iteration_fidelity, data_fidelity, batch_shape, num_outputs,
         double) in configs:
        num_dim = 1 + iteration_fidelity + data_fidelity
        tkwargs = {
            "device": self.device,
            "dtype": torch.double if double else torch.float,
        }
        fidelity_kwargs = dict(
            train_iteration_fidelity=iteration_fidelity,
            train_data_fidelity=data_fidelity,
        )
        model, model_kwargs = self._get_model_and_data(
            batch_shape=batch_shape,
            num_outputs=num_outputs,
            **fidelity_kwargs,
            **tkwargs,
        )
        fixed_noise = isinstance(model, FixedNoiseGP)

        # evaluate model
        model.posterior(torch.rand(torch.Size([4, num_dim]), **tkwargs))

        # test condition_on_observations
        fant_shape = torch.Size([2])

        # fantasize at different input points
        X_fant, Y_fant = _get_random_data_with_fidelity(
            fant_shape + batch_shape,
            num_outputs,
            n=3,
            **fidelity_kwargs,
            **tkwargs,
        )
        noise_kwargs = ({
            "noise": torch.full_like(Y_fant, 0.01)
        } if fixed_noise else {})
        cm = model.condition_on_observations(X_fant, Y_fant, **noise_kwargs)

        # fantasize at different same input points
        noise_kwargs_same_inputs = ({
            "noise": torch.full_like(Y_fant[0], 0.01)
        } if fixed_noise else {})
        cm_same_inputs = model.condition_on_observations(
            X_fant[0], Y_fant, **noise_kwargs_same_inputs)

        test_Xs = [
            # test broadcasting single input across fantasy and
            # model batches
            torch.rand(4, num_dim, **tkwargs),
            # separate input for each model batch and broadcast across
            # fantasy batches
            torch.rand(batch_shape + torch.Size([4, num_dim]), **tkwargs),
            # separate input for each model and fantasy batch
            torch.rand(
                fant_shape + batch_shape + torch.Size([4, num_dim]),
                **tkwargs,
            ),
        ]
        expected_mean_shape = (
            fant_shape + batch_shape + torch.Size([4, num_outputs]))

        for test_X in test_Xs:
            posterior = cm.posterior(test_X)
            self.assertEqual(posterior.mean.shape, expected_mean_shape)
            posterior_same_inputs = cm_same_inputs.posterior(test_X)
            self.assertEqual(
                posterior_same_inputs.mean.shape, expected_mean_shape)

            # check that fantasies of batched model are correct
            if len(batch_shape) == 0 or test_X.dim() != 2:
                continue

            # Take the first batch element of every multi-element entry.
            state_dict_non_batch = {}
            for key, val in model.state_dict().items():
                state_dict_non_batch[key] = (
                    val[0] if val.numel() > 1 else val)
            model_kwargs_non_batch = {
                "train_X": model_kwargs["train_X"][0],
                "train_Y": model_kwargs["train_Y"][0],
                "train_iteration_fidelity":
                    model_kwargs["train_iteration_fidelity"],
                "train_data_fidelity":
                    model_kwargs["train_data_fidelity"],
            }
            if "train_Yvar" in model_kwargs:
                model_kwargs_non_batch["train_Yvar"] = (
                    model_kwargs["train_Yvar"][0])
            model_non_batch = type(model)(**model_kwargs_non_batch)
            model_non_batch.load_state_dict(state_dict_non_batch)
            model_non_batch.eval()
            model_non_batch.likelihood.eval()
            model_non_batch.posterior(
                torch.rand(torch.Size([4, num_dim]), **tkwargs))
            noise_kwargs_non_batch = ({
                "noise": torch.full_like(Y_fant[0, 0, :], 0.01)
            } if fixed_noise else {})
            cm_non_batch = model_non_batch.condition_on_observations(
                X_fant[0][0], Y_fant[:, 0, :], **noise_kwargs_non_batch)
            non_batch_posterior = cm_non_batch.posterior(test_X)

            means_match = torch.allclose(
                posterior_same_inputs.mean[:, 0, ...],
                non_batch_posterior.mean,
                atol=1e-3,
            )
            self.assertTrue(means_match)
            covariances_match = torch.allclose(
                posterior_same_inputs.mvn.covariance_matrix[:, 0, :, :],
                non_batch_posterior.mvn.covariance_matrix,
                atol=1e-3,
            )
            self.assertTrue(covariances_match)
def test_multitask_forward():
    """Test multitask forward."""
    # build MultiTask detector: regression head fed through a pooling neck
    regression_head = dict(
        type='DeepposeRegressionHead',
        in_channels=2048,
        num_joints=17,
        loss_keypoint=dict(type='SmoothL1Loss', use_target_weight=False))
    model = MultiTask(
        backbone=dict(type='ResNet', depth=50),
        heads=[regression_head],
        necks=[dict(type='GlobalAveragePooling')],
        head2neck={0: 0},
        pretrained=None,
    )

    # build inputs and target
    mm_inputs = _demo_mm_inputs()
    inputs = mm_inputs['img']
    img_metas = mm_inputs['img_metas']
    target_weight = [mm_inputs['target_weight']]
    target = [mm_inputs['target_keypoints']]

    # Test forward train
    losses = model(inputs, target, target_weight, return_loss=True)
    assert 'reg_loss' in losses and 'acc_pose' in losses

    # Test forward test
    outputs = model(inputs, img_metas=img_metas, return_loss=False)
    assert 'preds' in outputs

    # Test dummy forward
    outputs = model.forward_dummy(inputs)
    assert outputs[0].shape == torch.Size([1, 17, 2])

    # Build multitask detector with no neck
    heatmap_head = dict(
        type='TopdownHeatmapSimpleHead',
        in_channels=2048,
        out_channels=17,
        num_deconv_layers=3,
        num_deconv_filters=(256, 256, 256),
        num_deconv_kernels=(4, 4, 4),
        loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True))
    model = MultiTask(
        backbone=dict(type='ResNet', depth=50),
        heads=[heatmap_head],
        pretrained=None,
    )

    # build inputs and target (same inputs, heatmap targets this time)
    target = [mm_inputs['target_heatmap']]

    # Test forward train
    losses = model(inputs, target, target_weight, return_loss=True)
    assert 'heatmap_loss' in losses and 'acc_pose' in losses

    # Test forward test
    outputs = model(inputs, img_metas=img_metas, return_loss=False)
    assert 'preds' in outputs

    # Test dummy forward
    outputs = model.forward_dummy(inputs)
    assert outputs[0].shape == torch.Size([1, 17, 64, 64])
def test_gp(self):
    """Fit the model briefly and check its modules, parameter sizes and posterior shapes.

    Runs over every combination of fidelity flags, model batch shape,
    number of outputs and dtype.
    """
    configs = [
        (iteration_fidelity, data_fidelity, batch_shape, num_outputs, double)
        for (iteration_fidelity, data_fidelity) in [
            (False, True),
            (True, False),
            (True, True),
        ]
        for batch_shape in (torch.Size(), torch.Size([2]))
        for num_outputs in (1, 2)
        for double in (False, True)
    ]
    for (iteration_fidelity, data_fidelity, batch_shape, num_outputs,
         double) in configs:
        num_dim = 1 + iteration_fidelity + data_fidelity
        tkwargs = {
            "device": self.device,
            "dtype": torch.double if double else torch.float,
        }
        model, _ = self._get_model_and_data(
            batch_shape=batch_shape,
            num_outputs=num_outputs,
            train_iteration_fidelity=iteration_fidelity,
            train_data_fidelity=data_fidelity,
            **tkwargs,
        )
        mll = ExactMarginalLogLikelihood(model.likelihood, model)
        mll = mll.to(**tkwargs)
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", category=OptimizationWarning)
            # A single optimizer iteration is enough for this test.
            fit_gpytorch_model(mll, sequential=False,
                               options={"maxiter": 1})

        # test init
        self.assertIsInstance(model.mean_module, ConstantMean)
        self.assertIsInstance(model.covar_module, ScaleKernel)

        # test param sizes
        expected_numel = (
            num_outputs * torch.tensor(batch_shape).prod().item())
        for _name, param in dict(model.named_parameters()).items():
            self.assertEqual(param.numel(), expected_numel)

        # test posterior: first without, then with an extra leading
        # batch dimension of size 2 on the inputs
        for extra_batch in (torch.Size(), torch.Size([2])):
            lead_shape = extra_batch + batch_shape
            X = torch.rand(lead_shape + torch.Size([3, num_dim]), **tkwargs)
            posterior = model.posterior(X)
            self.assertIsInstance(posterior, GPyTorchPosterior)
            self.assertEqual(
                posterior.mean.shape,
                lead_shape + torch.Size([3, num_outputs]),
            )
node_attns[out_id] = a_recv # for node 1-16 for nd_id in range(1, 17): x = None for in_id in trans_attns[nd_id]: trans_a = trans_attns[nd_id][in_id] * trans_norm_factors[nd_id][in_id] out, subb_idx = node_outs[in_id] subb_idx2 = trans_a.data[subb_idx].nonzero().squeeze(1) if subb_idx2.size(0) == 0: continue out = out.index_select(0, subb_idx2) w = a2w_fn(trans_a[subb_idx][subb_idx2]) sp_out = torch.sparse_coo_tensor(torch.unsqueeze(subb_idx[subb_idx2], 0), out * w.view(w.size(0), 1), torch.Size([batch_size, emb_dims])) x = sp_out if x is None else x + sp_out if x is None: continue x = x.coalesce() aggr_x = x.values() subbat_idx = x.indices().squeeze(0) out = transform_fn[nd_id](aggr_x) node_outs[nd_id] = (out, subbat_idx) if nd_id == master_output: break out_nei_ids = nodes[nd_id]['out_neis']
def train_GP_NPL(xtrain,
                 ytrain,
                 x_prior_dist,
                 y_prior_dist,
                 xtest=None,
                 return_mean=True,
                 return_samples=False,
                 num_bootstraps=1,
                 samples_per_bootstrap=10,
                 num_iters=5000,
                 alpha=1.,
                 num_pseudo=10,
                 verbose=True):
    '''
    Samples from the NPL

    Each bootstrap draws Dirichlet weights over the training points plus
    ``num_pseudo`` pseudo-points sampled from the priors, then fits a weighted
    GP via ``train_GP_weighted``.

    xtrain: NxD torch tensor of training covariates
    ytrain: Nx1 torch tensor of training targets
    x_prior_dist: torch prior distribution over covariates
    y_prior_dist: torch prior distribution over targets
    xtest (optional): MxD torch tensor of test covariates
    return_mean: Boolean, if true and xtest is not None, includes mean and
        standard deviation of predictions at xtest
    return_samples: Boolean, if true and xtest is not None, includes the raw
        samples in the return object
    num_bootstraps: Integer, number of bootstrap samples
    samples_per_bootstrap: Integer, number of predictive samples to generate
        per bootstrap (only used if xtest is not None)
    num_iters: number of iterations, forwarded to every weighted GP fit
    alpha: concentration parameter (>0)
    num_pseudo: number of pseudo-samples
    verbose: forwarded to ``train_GP_weighted``

    returns: Dict with following entries:
        'lengthscale': Sampled lengthscales (one entry per bootstrap)
        'sigma_n': Sampled noise standard deviations (one entry per bootstrap)
        'mean': Mean function at xtest (only included if xtest is not None
            and return_mean=True)
        'std': Standard deviation at xtest (only included if xtest is not
            None and return_mean=True)
        'samples': Samples from the posterior (only included if xtest is not
            None and return_samples=True)
    '''
    num_train = len(ytrain)
    dirichlet_weight = torch.cat(
        (torch.ones(num_train),
         (alpha / num_pseudo) * torch.ones(num_pseudo)), 0)
    weight_generator = d.Dirichlet(dirichlet_weight)

    have_test = xtest is not None
    lengthscale = []
    sigma_n = []
    # Seed with an empty (0, M) array so the stacked result keeps its column
    # count even when no bootstrap is run.
    sample_chunks = [np.zeros((0, len(xtest)))] if have_test else []

    for _ in range(num_bootstraps):
        weights = weight_generator.sample() * (num_train + alpha)
        pseudo_x = x_prior_dist.sample(sample_shape=torch.Size([num_pseudo]))
        pseudo_y = y_prior_dist.sample(sample_shape=torch.Size([num_pseudo]))
        both_x = torch.cat((xtrain, pseudo_x), 0)
        both_y = torch.cat((ytrain, pseudo_y), 0)
        bres = train_GP_weighted(both_x,
                                 both_y,
                                 xtest=xtest,
                                 return_mean=False,
                                 num_samples=samples_per_bootstrap,
                                 # Honour the caller's setting instead of a
                                 # hard-coded 5000.
                                 num_iters=num_iters,
                                 weights=weights,
                                 verbose=verbose)
        if have_test:
            sample_chunks.append(bres['samples'])
        lengthscale.append(bres['lengthscale'])
        sigma_n.append(bres['sigma_n'])

    res = {'lengthscale': lengthscale, 'sigma_n': sigma_n}
    # Predictive summaries only exist when test covariates were supplied.
    if have_test:
        samples = np.vstack(sample_chunks)
        if return_mean:
            res['mean'] = np.mean(samples, axis=0)
            res['std'] = np.std(samples, axis=0)
        if return_samples:
            res['samples'] = samples
    return res
def load_gcn_data(prefix):
    """Load a GCN dataset, using ``<prefix>_gcn.npz`` as a preprocessing cache.

    If the cache file exists it is read directly. Otherwise the raw
    ``data/ind.<dataset>.*`` files are loaded and normalized, and the cache
    file is written. Both paths return values in the same form.

    Args:
        prefix: Path prefix of the cache file; ``"_gcn.npz"`` is appended.

    Returns:
        Tuple ``(num_data, train_adj, full_adj, feats, train_feats,
        test_feats, labels, train_data, val_data, test_data, adj_t)`` where
        the adjacency / aggregated-feature entries are scipy CSR matrices,
        ``feats`` is the dense feature matrix, ``labels`` holds the argmax
        class index per row, ``*_data`` are index arrays and ``adj_t`` is
        ``full_adj`` as a torch sparse COO tensor.
    """
    #npz_file = 'data/{}/{}_{}.npz'.format(dataset_str, dataset_str, "gcn")
    npz_file = prefix + "_" + "gcn.npz"

    def _outputs_for_torch(full_adj, feats, labels):
        # Shared post-processing so the cached and the freshly-built path
        # hand back identical types.
        labels = np.argmax(labels, axis=1)
        coo_adj = full_adj.tocoo()
        indices = np.vstack((coo_adj.row, coo_adj.col))
        i = torch.LongTensor(indices)
        v = torch.FloatTensor(coo_adj.data)
        adj_t = torch.sparse_coo_tensor(i, v, torch.Size(coo_adj.shape))
        return adj_t, feats.todense(), labels

    if os.path.exists(npz_file):
        start_time = time()
        print('Found preprocessed dataset {}, loading...'.format(npz_file))
        # Context manager closes the underlying zip file once arrays are read.
        with np.load(npz_file) as data:

            def _csr(name):
                # Rebuild a CSR matrix from its four stored component arrays.
                return sp.csr_matrix(
                    (data[name + '_data'], data[name + '_indices'],
                     data[name + '_indptr']),
                    shape=tuple(data[name + '_shape']))

            num_data = data['num_data']
            labels = data['labels']
            train_data = data['train_data']
            val_data = data['val_data']
            test_data = data['test_data']
            train_adj = _csr('train_adj')
            full_adj = _csr('full_adj')
            feats = _csr('feats')
            train_feats = _csr('train_feats')
            test_feats = _csr('test_feats')

        adj_t, feats, labels = _outputs_for_torch(full_adj, feats, labels)
        print('Finished in {} seconds.'.format(time() - start_time))
    else:
        # Load data.
        # NOTE(review): `dataset_str` is not defined inside this function; it
        # must exist as a module-level name or this branch raises NameError.
        # Confirm how it relates to `prefix`.
        names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph']
        objects = []
        for name in names:
            with open("data/ind.{}.{}".format(dataset_str, name), 'rb') as f:
                # NOTE: unpickling executes arbitrary code; only load trusted
                # dataset files.
                if sys.version_info > (3, 0):
                    objects.append(pkl.load(f, encoding='latin1'))
                else:
                    objects.append(pkl.load(f))

        x, y, tx, ty, allx, ally, graph = tuple(objects)

        if dataset_str != 'nell':
            test_idx_reorder = parse_index_file(
                "data/ind.{}.test.index".format(dataset_str))
            test_idx_range = np.sort(test_idx_reorder)

            if dataset_str == 'citeseer':
                # Fix citeseer dataset (there are some isolated nodes in the graph)
                # Find isolated nodes, add them as zero-vecs into the right position
                test_idx_range_full = range(min(test_idx_reorder),
                                            max(test_idx_reorder) + 1)
                tx_extended = sp.lil_matrix(
                    (len(test_idx_range_full), x.shape[1]))
                tx_extended[test_idx_range - min(test_idx_range), :] = tx
                tx = tx_extended
                ty_extended = np.zeros((len(test_idx_range_full), y.shape[1]))
                ty_extended[test_idx_range - min(test_idx_range), :] = ty
                ty = ty_extended

            features = sp.vstack((allx, tx)).tolil()
            features[test_idx_reorder, :] = features[test_idx_range, :]
            adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph))

            labels = np.vstack((ally, ty))
            labels[test_idx_reorder, :] = labels[test_idx_range, :]

            idx_test = test_idx_range.tolist()
            idx_train = range(len(y))
            idx_val = range(len(y), len(y) + 500)
        else:
            test_idx_reorder = parse_index_file(
                "data/ind.{}.test.index".format(dataset_str))
            features = allx.tocsr()
            adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph))
            labels = ally
            idx_test = test_idx_reorder
            idx_train = range(len(y))
            idx_val = range(len(y), len(y) + 969)

        train_mask = sample_mask(idx_train, labels.shape[0])
        val_mask = sample_mask(idx_val, labels.shape[0])
        test_mask = sample_mask(idx_test, labels.shape[0])

        # Populate all three splits for every dataset so that the summed
        # label matrix below carries train, val and test labels.
        y_train = np.zeros(labels.shape)
        y_val = np.zeros(labels.shape)
        y_test = np.zeros(labels.shape)
        y_train[train_mask, :] = labels[train_mask, :]
        y_val[val_mask, :] = labels[val_mask, :]
        y_test[test_mask, :] = labels[test_mask, :]

        # num_data, (v, coords), feats, labels, train_d, val_d, test_d
        num_data = features.shape[0]

        def gcn_normalize_adj(adj):
            # Add self-loops, then scale rows and columns by degree^-1/2.
            adj = adj + sp.eye(adj.shape[0])
            rowsum = np.array(adj.sum(1)) + 1e-20
            d_inv_sqrt = np.power(rowsum, -0.5).flatten()
            d_inv_sqrt[np.isinf(d_inv_sqrt)] = 0.
            d_mat_inv_sqrt = sp.diags(d_inv_sqrt, 0)
            adj = adj.dot(d_mat_inv_sqrt).transpose().dot(d_mat_inv_sqrt)
            adj = adj.tocoo()
            coords = np.array((adj.row, adj.col)).astype(np.int32)
            return adj.data.astype(np.float32), coords

        # Normalize features
        rowsum = np.array(features.sum(1)) + 1e-9
        r_inv = np.power(rowsum, -1).flatten()
        r_inv[np.isinf(r_inv)] = 0.
        r_mat_inv = sp.diags(r_inv, 0)
        features = r_mat_inv.dot(features)

        #if FLAGS.normalization == 'gcn':
        full_v, full_coords = gcn_normalize_adj(adj)
        full_v = full_v.astype(np.float32)
        full_coords = full_coords.astype(np.int32)
        # The training graph is the full graph here.
        train_v, train_coords = full_v, full_coords

        labels = (y_train + y_val + y_test).astype(np.float32)
        train_data = np.nonzero(train_mask)[0].astype(np.int32)
        val_data = np.nonzero(val_mask)[0].astype(np.int32)
        test_data = np.nonzero(test_mask)[0].astype(np.int32)

        feats = (features.data, features.indices, features.indptr,
                 features.shape)

        def _get_adj(data, coords):
            adj = sp.csr_matrix((data, (coords[0, :], coords[1, :])),
                                shape=(num_data, num_data))
            return adj

        train_adj = _get_adj(train_v, train_coords)
        full_adj = _get_adj(full_v, full_coords)
        feats = sp.csr_matrix((feats[0], feats[1], feats[2]),
                              shape=feats[-1],
                              dtype=np.float32)
        train_feats = train_adj.dot(feats)
        test_feats = full_adj.dot(feats)

        with open(npz_file, 'wb') as fwrite:
            np.savez(fwrite,
                     num_data=num_data,
                     train_adj_data=train_adj.data,
                     train_adj_indices=train_adj.indices,
                     train_adj_indptr=train_adj.indptr,
                     train_adj_shape=train_adj.shape,
                     full_adj_data=full_adj.data,
                     full_adj_indices=full_adj.indices,
                     full_adj_indptr=full_adj.indptr,
                     full_adj_shape=full_adj.shape,
                     feats_data=feats.data,
                     feats_indices=feats.indices,
                     feats_indptr=feats.indptr,
                     feats_shape=feats.shape,
                     train_feats_data=train_feats.data,
                     train_feats_indices=train_feats.indices,
                     train_feats_indptr=train_feats.indptr,
                     train_feats_shape=train_feats.shape,
                     test_feats_data=test_feats.data,
                     test_feats_indices=test_feats.indices,
                     test_feats_indptr=test_feats.indptr,
                     test_feats_shape=test_feats.shape,
                     labels=labels,
                     train_data=train_data,
                     val_data=val_data,
                     test_data=test_data)

        # Without this the first (non-cached) run had no `adj_t` to return.
        adj_t, feats, labels = _outputs_for_torch(full_adj, feats, labels)

    return num_data, train_adj, full_adj, feats, train_feats, test_feats, labels, train_data, val_data, test_data, adj_t
('pow', uniform_scalar(1e-3, requires_grad=True), (3.14, ), 'scalar_constant'), ('__rpow__', uniform_scalar(1e-3, requires_grad=True), (3.14, ), 'scalar_constant'), ('transpose', (1, 2, 3), (1, 2), 'dim', [0, 1]), ('transpose', (), (0, 0), 'scalar'), ('transpose', (1, ), (0, 0), '1d'), ('transpose', torch.rand(L, L), (0, 1), '2d'), ('transpose', torch.rand(S, S, S), (2, 0), '3d'), ('t', (1, 2), NO_ARGS), ( 'view', (S, S, S), (S * S, S), ), ('view', (S, S, S), (torch.Size([S * S, S]), ), 'size'), ('view', (S, ), (S, ), '1d'), ('view', (), (dont_convert(()), ), 'scalar_to_scalar'), ('view', (), (1, ), 'scalar_to_1d'), ( 'reshape', (S, S, S), (S * S, S), ), ('reshape', (S, S, S), (torch.Size([S * S, S]), ), 'size'), ('reshape', (S, ), (S, ), '1d'), ('reshape', (), (dont_convert(()), ), 'scalar_to_scalar'), ('reshape', (), (1, ), 'scalar_to_1d'), ('reshape_as', (S, S, S), (non_differentiable(torch.rand(S * S, S)), )), ('reshape_as', (), (non_differentiable(torch.tensor(42.)), ), 'scalar'), ('reshape_as', (), (non_differentiable(torch.rand(1, 1)), ),
def load_graphsage_data(prefix, normalize=True):
    """Load a GraphSAGE-format dataset, using ``<prefix>.npz`` as a cache.

    If the cache exists it is read directly. Otherwise the raw files
    ``<prefix>-G.json``, ``<prefix>-feats.npy``, ``<prefix>-id_map.json`` and
    ``<prefix>-class_map.json`` are loaded and preprocessed, and the cache is
    written.

    Args:
        prefix: Path prefix shared by the raw files and the cache file.
        normalize: If true, standardize features with a scaler fitted on the
            training nodes (only applied when building from raw files).

    Returns:
        Tuple ``(num_data, train_adj, full_adj, feats, train_feats,
        test_feats, labels, train_data, val_data, test_data, adj_t)``;
        ``adj_t`` is ``full_adj`` as a torch sparse COO tensor.

    Raises:
        AssertionError: If the installed networkx is newer than 1.11.
    """
    # Compare (major, minor) as one tuple: checking the two numbers
    # independently rejects versions such as 0.99.
    major, minor = (int(part) for part in nx.__version__.split('.')[:2])
    assert (major, minor) <= (
        1, 11
    ), "networkx major version must be <= 1.11 in order to load graphsage data"

    def _to_torch_sparse(adj):
        # to make it compatible with pytorch gcn
        coo_adj = adj.tocoo()
        indices = np.vstack((coo_adj.row, coo_adj.col))
        i = torch.LongTensor(indices)
        v = torch.FloatTensor(coo_adj.data)
        return torch.sparse_coo_tensor(i, v, torch.Size(coo_adj.shape))

    # Save normalized version
    max_degree = -1
    if max_degree == -1:
        npz_file = prefix + '.npz'
    else:
        npz_file = '{}_deg{}.npz'.format(prefix, max_degree)

    if os.path.exists(npz_file):
        start_time = time()
        print('Found preprocessed dataset {}, loading...'.format(npz_file))
        # Context manager closes the underlying zip file once arrays are read.
        with np.load(npz_file) as data:
            num_data = data['num_data']
            feats = data['feats']
            train_feats = data['train_feats']
            test_feats = data['test_feats']
            labels = data['labels']
            train_data = data['train_data']
            val_data = data['val_data']
            test_data = data['test_data']
            train_adj = sp.csr_matrix(
                (data['train_adj_data'], data['train_adj_indices'],
                 data['train_adj_indptr']),
                shape=tuple(data['train_adj_shape']))
            full_adj = sp.csr_matrix(
                (data['full_adj_data'], data['full_adj_indices'],
                 data['full_adj_indptr']),
                shape=tuple(data['full_adj_shape']))

        adj_t = _to_torch_sparse(full_adj)
        # NOTE(review): labels stay one-hot on this path, while the
        # build-from-raw path below returns argmax indices. Confirm which
        # form callers expect before unifying.
        #labels = np.argmax(labels,axis=1)
        print('Finished in {} seconds.'.format(time() - start_time))
    else:
        print('Loading data...')
        start_time = time()
        with open(prefix + "-G.json") as fp:
            G_data = json.load(fp)
        G = json_graph.node_link_graph(G_data)

        feats = np.load(prefix + "-feats.npy").astype(np.float32)

        with open(prefix + "-id_map.json") as fp:
            id_map = json.load(fp)
        # Node ids are stored as JSON strings; turn numeric ones back to int.
        if list(id_map)[0].isdigit():
            conversion = int
        else:
            conversion = lambda n: n
        id_map = {conversion(k): int(v) for k, v in id_map.items()}

        with open(prefix + "-class_map.json") as fp:
            class_map = json.load(fp)
        # A list-valued class map already holds one label vector per node.
        multi_label = isinstance(list(class_map.values())[0], list)
        if multi_label:
            lab_conversion = lambda n: n
        else:
            lab_conversion = int
        class_map = {
            conversion(k): lab_conversion(v)
            for k, v in class_map.items()
        }

        ## Remove all nodes that do not have val/test annotations
        ## (necessary because of networkx weirdness with the Reddit data)
        to_remove = [node for node in G.nodes() if node not in id_map]
        broken_count = len(to_remove)
        for node in to_remove:
            G.remove_node(node)
        print(
            "Removed {:d} nodes that lacked proper annotations due to networkx versioning issues"
            .format(broken_count))

        # Construct adjacency matrix
        print("Loaded data ({} seconds).. now preprocessing..".format(
            time() - start_time))
        start_time = time()

        edges = [(id_map[edge[0]], id_map[edge[1]])
                 for edge in G.edges()
                 if edge[0] in id_map and edge[1] in id_map]
        print('{} edges'.format(len(edges)))
        num_data = len(id_map)

        if max_degree != -1:
            print('Subsampling edges...')
            edges = subsample_edges(edges, num_data, max_degree)

        val_data = np.array([id_map[n] for n in G.nodes() if G.node[n]['val']],
                            dtype=np.int32)
        test_data = np.array(
            [id_map[n] for n in G.nodes() if G.node[n]['test']],
            dtype=np.int32)
        # Builtin bool: the `np.bool` alias no longer exists in recent NumPy.
        is_train = np.ones((num_data), dtype=bool)
        is_train[val_data] = False
        is_train[test_data] = False
        train_data = np.array([n for n in range(num_data) if is_train[n]],
                              dtype=np.int32)

        train_edges = [(e[0], e[1]) for e in edges
                       if is_train[e[0]] and is_train[e[1]]]
        edges = np.array(edges, dtype=np.int32)
        train_edges = np.array(train_edges, dtype=np.int32)

        # Process labels
        if multi_label:
            # dict views are not subscriptable on Python 3; go through a list.
            num_classes = len(list(class_map.values())[0])
            labels = np.zeros((num_data, num_classes), dtype=np.float32)
            for k in class_map.keys():
                labels[id_map[k], :] = np.array(class_map[k])
        else:
            num_classes = len(set(class_map.values()))
            labels = np.zeros((num_data, num_classes), dtype=np.float32)
            for k in class_map.keys():
                labels[id_map[k], class_map[k]] = 1

        if normalize:
            from sklearn.preprocessing import StandardScaler
            train_ids = np.array([
                id_map[n] for n in G.nodes()
                if not G.node[n]['val'] and not G.node[n]['test']
            ])
            train_feats = feats[train_ids]
            scaler = StandardScaler()
            scaler.fit(train_feats)
            feats = scaler.transform(feats)

        def _normalize_adj(edges):
            # Symmetrize the edge list, then divide each row by its sum.
            adj = sp.csr_matrix((np.ones((edges.shape[0]), dtype=np.float32),
                                 (edges[:, 0], edges[:, 1])),
                                shape=(num_data, num_data))
            adj += adj.transpose()

            rowsum = np.array(adj.sum(1)).flatten()
            d_inv = 1.0 / (rowsum + 1e-20)
            d_mat_inv = sp.diags(d_inv, 0)
            adj = d_mat_inv.dot(adj).tocoo()
            coords = np.array((adj.row, adj.col)).astype(np.int32)
            return adj.data, coords

        train_v, train_coords = _normalize_adj(train_edges)
        full_v, full_coords = _normalize_adj(edges)

        def _get_adj(data, coords):
            adj = sp.csr_matrix((data, (coords[0, :], coords[1, :])),
                                shape=(num_data, num_data))
            return adj

        train_adj = _get_adj(train_v, train_coords)
        full_adj = _get_adj(full_v, full_coords)
        train_feats = train_adj.dot(feats)
        test_feats = full_adj.dot(feats)

        print("Done. {} seconds.".format(time() - start_time))
        with open(npz_file, 'wb') as fwrite:
            #print('Saving {} edges'.format(full_adj.nnz))
            np.savez(fwrite,
                     num_data=num_data,
                     train_adj_data=train_adj.data,
                     train_adj_indices=train_adj.indices,
                     train_adj_indptr=train_adj.indptr,
                     train_adj_shape=train_adj.shape,
                     full_adj_data=full_adj.data,
                     full_adj_indices=full_adj.indices,
                     full_adj_indptr=full_adj.indptr,
                     full_adj_shape=full_adj.shape,
                     feats=feats,
                     train_feats=train_feats,
                     test_feats=test_feats,
                     labels=labels,
                     train_data=train_data,
                     val_data=val_data,
                     test_data=test_data)

        # Pytorch stuff
        adj_t = _to_torch_sparse(full_adj)
        labels = np.argmax(labels, axis=1)

    return num_data, train_adj, full_adj, feats, train_feats, test_feats, labels, train_data, val_data, test_data, adj_t
def get_fantasy_strategy(self, inputs, targets, full_inputs, full_targets, full_output, **kwargs):
    """
    Returns a new PredictionStrategy that incorporates the specified inputs and targets as new training data.

    This method is primarily responsible for updating the mean and covariance caches. To add fantasy data to a
    GP model, use the :meth:`~gpytorch.models.ExactGP.get_fantasy_model` method.

    As a side effect, ``inputs`` and ``targets`` are stored on ``self`` as ``fantasy_inputs`` and
    ``fantasy_targets``.

    Args:
        - :attr:`inputs` (Tensor `b1 x ... x bk x m x d` or `f x b1 x ... x bk x m x d`): Locations of fantasy
            observations.
        - :attr:`targets` (Tensor `b1 x ... x bk x m` or `f x b1 x ... x bk x m`): Labels of fantasy observations.
        - :attr:`full_inputs` (Tensor `b1 x ... x bk x n+m x d` or `f x b1 x ... x bk x n+m x d`): Training data
            concatenated with fantasy inputs
        - :attr:`full_targets` (Tensor `b1 x ... x bk x n+m` or `f x b1 x ... x bk x n+m`): Training labels
            concatenated with fantasy labels.
        - :attr:`full_output` (:class:`gpytorch.distributions.MultivariateNormal`): Prior called on full_inputs
        - :attr:`kwargs`: Forwarded to ``likelihood.get_fantasy_likelihood`` and to the fantasy likelihood call.

    Returns:
        - :class:`DefaultPredictionStrategy`
            A `DefaultPredictionStrategy` model with `n + m` training examples, where the `m` fantasy examples have
            been added and all test-time caches have been updated.
    """
    full_mean, full_covar = full_output.mean, full_output.lazy_covariance_matrix

    # full_inputs is indexed as a sequence of tensors; the batch shape is taken from the first one.
    batch_shape = full_inputs[0].shape[:-2]

    full_mean = full_mean.view(*batch_shape, -1)
    num_train = self.num_train

    # Evaluate fant x train and fant x fant covariance matrices, leave train x train unevaluated.
    fant_fant_covar = full_covar[..., num_train:, num_train:]
    fant_mean = full_mean[..., num_train:]
    mvn = self.train_prior_dist.__class__(fant_mean, fant_fant_covar)
    fant_likelihood = self.likelihood.get_fantasy_likelihood(**kwargs)
    mvn_obs = fant_likelihood(mvn, inputs, **kwargs)

    # From here on fant_fant_covar is the covariance after applying the fantasy likelihood.
    fant_fant_covar = mvn_obs.covariance_matrix
    fant_train_covar = delazify(full_covar[..., num_train:, :num_train])

    self.fantasy_inputs = inputs
    self.fantasy_targets = targets

    """
    Compute a new mean cache given the old mean cache.

    We have \\alpha = K^{-1}y, and we want to solve [K U; U' S][a; b] = [y; y_f], where U' is fant_train_covar,
    S is fant_fant_covar, and y_f is (targets - fant_mean)

    To do this, we solve the bordered linear system of equations for [a; b]:
        AQ = U  # Q = fant_solve
        [S - U'Q]b = y_f - U'\\alpha   ==> b = [S - U'Q]^{-1}(y_f - U'\\alpha)
        a = \\alpha - Qb
    """
    # Get cached K inverse decomp. (or compute if we somehow don't already have the covariance cache)
    K_inverse = self.lik_train_train_covar.root_inv_decomposition()
    fant_solve = K_inverse.matmul(fant_train_covar.transpose(-2, -1))

    # Solve for "b", the lower portion of the *new* \\alpha corresponding to the fantasy points.
    schur_complement = fant_fant_covar - fant_train_covar.matmul(fant_solve)

    # we'd like to use a less hacky approach for the following, but einsum can be much faster
    # than unsqueezing/squeezing here (esp. in backward passes), unfortunately it currently has some
    # issues with broadcasting: https://github.com/pytorch/pytorch/issues/15671
    # `prefix` names the extra leading dims that fant_train_covar has beyond those of mean_cache.
    prefix = string.ascii_lowercase[:max(fant_train_covar.dim() - self.mean_cache.dim() - 1, 0)]
    ftcm = torch.einsum(prefix + "...yz,...z->" + prefix + "...y", [fant_train_covar, self.mean_cache])

    small_system_rhs = targets - fant_mean - ftcm
    small_system_rhs = small_system_rhs.unsqueeze(-1)
    # Schur complement of a spd matrix is guaranteed to be positive definite
    fant_cache_lower = torch.cholesky_solve(small_system_rhs, psd_safe_cholesky(schur_complement))

    # Get "a", the new upper portion of the cache corresponding to the old training points.
    fant_cache_upper = self.mean_cache.unsqueeze(-1) - fant_solve.matmul(fant_cache_lower)

    fant_cache_upper = fant_cache_upper.squeeze(-1)
    fant_cache_lower = fant_cache_lower.squeeze(-1)

    # New mean cache.
    fant_mean_cache = torch.cat((fant_cache_upper, fant_cache_lower), dim=-1)

    """
    Compute a new covariance cache given the old covariance cache.

    We have access to K \\approx LL' and K^{-1} \\approx R^{-1}R^{-T}, where L and R are low rank matrices
    resulting from Lanczos (see the LOVE paper).

    To update R^{-1}, we first update L:
        [K U; U' S] = [L 0; A B][L' A'; 0 B']
    Solving this matrix equation, we get:
        K = LL' ==>       L = L
        U = LA' ==>       A = UR^{-1}
        S = AA' + BB' ==> B = cholesky(S - AA')

    Once we've computed Z = [L 0; A B], we have that the new kernel matrix [K U; U' S] \approx ZZ'. Therefore,
    we can form a pseudo-inverse of Z directly to approximate [K U; U' S]^{-1/2}.
    """
    # [K U; U' S] = [L 0; lower_left schur_root]
    batch_shape = fant_train_covar.shape[:-2]

    L_inverse = self.covar_cache
    L = delazify(self.lik_train_train_covar.root_decomposition().root)
    m, n = L.shape[-2:]

    lower_left = fant_train_covar.matmul(L_inverse)
    schur_root = psd_safe_cholesky(fant_fant_covar - lower_left.matmul(lower_left.transpose(-2, -1)))

    # Form new root Z = [L 0; lower_left schur_root]
    num_fant = schur_root.size(-2)
    # (m and n are recomputed here; the values are the same as above)
    m, n = L.shape[-2:]
    new_root = torch.zeros(*batch_shape, m + num_fant, n + num_fant, device=L.device, dtype=L.dtype)
    new_root[..., :m, :n] = L
    new_root[..., m:, :n] = lower_left
    new_root[..., m:, n:] = schur_root

    # Use pseudo-inverse of Z as new inv root
    Q, R = torch.qr(new_root)
    Rdiag = torch.diagonal(R, dim1=-2, dim2=-1)
    # if R is almost singular, add jitter (Rdiag is a view, so this works)
    zeroish = Rdiag.abs() < 1e-6
    if torch.any(zeroish):
        # can't use in-place operation here b/c it would mess up backward pass
        # haven't found a more elegant way to add a jitter diagonal yet...
        jitter_diag = 1e-6 * torch.sign(Rdiag) * zeroish.to(Rdiag)
        R = R + torch.diag_embed(jitter_diag)
    new_covar_cache = torch.triangular_solve(Q.transpose(-2, -1), R)[0].transpose(-2, -1)

    # Expand inputs accordingly if necessary (for fantasies at the same points)
    if full_inputs[0].dim() <= full_targets.dim():
        # The leading dim of full_targets is used as the fantasy batch dim.
        fant_batch_shape = full_targets.shape[:1]
        n_batch = len(full_mean.shape[:-1])
        repeat_shape = fant_batch_shape + torch.Size([1] * n_batch)
        full_inputs = [fi.expand(fant_batch_shape + fi.shape) for fi in full_inputs]
        full_mean = full_mean.expand(fant_batch_shape + full_mean.shape)
        full_covar = BatchRepeatLazyTensor(full_covar, repeat_shape)
        new_root = BatchRepeatLazyTensor(NonLazyTensor(new_root), repeat_shape)
        # no need to repeat the covar cache, broadcasting will do the right thing

    # Create new DefaultPredictionStrategy object
    fant_strat = self.__class__(
        train_inputs=full_inputs,
        train_prior_dist=self.train_prior_dist.__class__(full_mean, full_covar),
        train_labels=full_targets,
        likelihood=fant_likelihood,
        root=new_root,
        inv_root=new_covar_cache,
    )
    # Pre-populate the new strategy's cache dict with the caches computed above.
    fant_strat._memoize_cache = {"mean_cache": fant_mean_cache, "covar_cache": new_covar_cache}

    return fant_strat
def BetaSample(alpha, beta, sample_shape=torch.Size()):
    """Return component 0 of ``_Dirichlet`` applied to the stacked ``(alpha, beta)``.

    ``alpha`` and ``beta`` are stacked along a new last axis, expanded with
    ``sample_shape`` prepended, passed through ``_Dirichlet.apply``, and the
    first entry of the last axis is returned.
    """
    params = torch.stack([alpha, beta], -1)
    batch_dims, event_dims = params.shape[:-1], params.shape[-1:]
    target_shape = sample_shape + batch_dims + event_dims
    dirichlet_draw = _Dirichlet.apply(params.expand(target_shape))
    return dirichlet_draw.select(-1, 0)
def testALEBO(self):
    """Exercise ALEBO construction, fit, predict, gen, update and cross-validation."""

    def dbl(values):
        # Every tensor literal in this test is double precision.
        return torch.tensor(values, dtype=torch.double)

    def box():
        # Fresh bounds list per call.
        return [(-1, 1)] * 5

    def project(X):
        # Map points through B.
        return (B @ X.t()).t()

    B = dbl([[1.0, 2.0, 3.0, 4.0, 5.0], [2.0, 3.0, 4.0, 5.0, 6.0]])
    train_X = dbl([[v] * 5 for v in (0.0, 1.0, 2.0)])
    train_Y = dbl([[1.0], [2.0], [3.0]])
    train_Yvar = 0.1 * torch.ones(3, 1, dtype=torch.double)

    m = ALEBO(B=B, laplace_nsamp=5, fit_restarts=1)
    self.assertTrue(torch.equal(B, m.B))
    expected_attrs = (
        ("laplace_nsamp", 5),
        ("fit_restarts", 1),
        ("refit_on_update", True),
        ("refit_on_cv", False),
        ("warm_start_refitting", False),
    )
    for attr_name, attr_value in expected_attrs:
        self.assertEqual(getattr(m, attr_name), attr_value)

    # Test fit
    digest = SearchSpaceDigest(feature_names=[], bounds=box())
    m.fit(
        Xs=[train_X, train_X],
        Ys=[train_Y, train_Y],
        Yvars=[train_Yvar, train_Yvar],
        search_space_digest=digest,
        metric_names=[],
    )
    self.assertIsInstance(m.model, ModelListGP)
    self.assertTrue(torch.allclose(m.Xs[0], project(train_X)))

    # Test predict
    f, cov = m.predict(X=B)
    self.assertEqual(f.shape, torch.Size([2, 2]))
    self.assertEqual(cov.shape, torch.Size([2, 2, 2]))

    # Test best point
    objective_weights = dbl([1.0, 0.0])
    with self.assertRaises(NotImplementedError):
        m.best_point(bounds=box(), objective_weights=objective_weights)

    # Test gen
    acqf_target = "ax.models.torch.alebo.optimize_acqf"

    # With clipping
    clipped_return = (m.Xs[0], torch.tensor([]))
    with mock.patch(acqf_target, autospec=True, return_value=clipped_return):
        Xopt, _, _, _ = m.gen(
            n=1, bounds=box(), objective_weights=dbl([1.0, 0.0]))
    self.assertFalse(torch.allclose(Xopt, train_X))
    self.assertTrue(Xopt.min() >= -1)
    self.assertTrue(Xopt.max() <= 1)

    # Without
    plain_return = (torch.ones(1, 2, dtype=torch.double), torch.tensor([]))
    with mock.patch(acqf_target, autospec=True, return_value=plain_return):
        Xopt, _, _, _ = m.gen(
            n=1, bounds=box(), objective_weights=dbl([1.0, 0.0]))
    self.assertTrue(
        torch.allclose(Xopt, dbl([[-0.2, -0.1, 0.0, 0.1, 0.2]])))

    # Test update
    train_X2 = dbl([[v] * 5 for v in (3.0, 1.0, 2.0)])

    def run_update():
        m.update(
            Xs=[train_X, train_X2],
            Ys=[train_Y, train_Y],
            Yvars=[train_Yvar, train_Yvar],
        )

    run_update()
    self.assertTrue(torch.allclose(m.Xs[0], project(train_X)))
    self.assertTrue(torch.allclose(m.Xs[1], project(train_X2)))
    m.refit_on_update = False
    run_update()

    # Test get_and_fit with single metric
    gp = m.get_and_fit_model(
        Xs=[project(train_X)], Ys=[train_Y], Yvars=[train_Yvar])
    self.assertIsInstance(gp, ALEBOGP)

    # Test cross_validate
    def check_cross_validate():
        f, cov = m.cross_validate(
            Xs_train=[train_X],
            Ys_train=[train_Y],
            Yvars_train=[train_Yvar],
            X_test=train_X2,
        )
        self.assertEqual(f.shape, torch.Size([3, 1]))
        self.assertEqual(cov.shape, torch.Size([3, 1, 1]))

    check_cross_validate()
    m.refit_on_cv = True
    check_cross_validate()
def _log_prob_shape(dist, x_size=torch.Size()):
    """Return the broadcast of ``dist.shape()`` and ``x_size`` minus trailing event dims.

    The two shapes are combined with ``broadcast_shape(..., strict=True)``;
    when ``dist.event_shape`` is non-empty, that many trailing dimensions are
    removed from the result.
    """
    num_event_dims = len(dist.event_shape)
    broadcasted = broadcast_shape(dist.shape(), x_size, strict=True)
    if not num_event_dims:
        # No event dimensions to strip.
        return broadcasted
    return broadcasted[:-num_event_dims]
input = input.flatten(1, 2) hidden = torch.nn.functional.leaky_relu(self.fc1(input)) return self.fc2(hidden) class Receiver(nn.Module): def __init__(self, n_hidden, n_features, n_attributes): super(Receiver, self).__init__() self.fc1 = nn.Linear(n_hidden, n_features) self.fc2 = nn.Linear(n_features, n_features) def forward(self, input, _): hidden = torch.nn.functional.leaky_relu(self.fc1(input)) return self.fc2(hidden).squeeze(dim=0) if __name__ == "__main__": from teamwork.data import TupleDataset from torch.utils import data train, dev = TupleDataset.create_train_and_dev( perceptual_dimensions=[10, 10]) loader = data.DataLoader(train, batch_size=16, drop_last=True, shuffle=True) batch = next(iter(loader)) (idx1, idx2), target = batch sender = Sender(n_hidden=200, n_features=10, n_attributes=2) emb = sender((idx1, idx2)) assert emb.shape == torch.Size((32, 200))
def build_adj_matrix(corpus,
                     vocabulary,
                     num_documents,
                     doc_offset=0,
                     window_size=20):
    """
    Builds the adjacency matrix A for the text-document graph

    A_ij = max(0, PMI(i,j)) if i, j are words (0 if PMI <= 0)
         = TF-IDF(i,j) if i is a document and j is a word (or opposite)
         = 1 if i = j
         = 0 otherwise

    Word vertices occupy indices ``[0, len(vocabulary))``; document ``i`` of
    ``corpus`` gets index ``len(vocabulary) + doc_offset + i``.

    Tokens missing from ``vocabulary`` are counted as ``'<unk>'`` in both the
    per-document and the per-window statistics. If ``'<unk>'`` itself is not
    in ``vocabulary`` those counts yield no entries.

    Args:
        corpus: Iterable of documents; each must expose a ``text`` sequence
            of tokens. It is iterated twice.
        vocabulary: Sequence of word strings defining the word vertices.
        num_documents: Number of document vertices (also the IDF numerator).
        doc_offset: Offset added to each document's position in ``corpus``.
        window_size: Sliding-window length used for co-occurrence counts.

    Returns:
        A ``num_vertices x num_vertices`` float sparse COO tensor, where
        ``num_vertices = len(vocabulary) + num_documents``.
    """
    num_words = len(vocabulary)
    num_vertices = num_words + num_documents
    word_to_index = {w: i for i, w in enumerate(vocabulary)}

    # nonzero entries in the adjacency matrix
    entries = []
    # indices for the entries
    row = []
    col = []

    print('Building word frequencies per doc')
    word_freqs_per_doc = {}
    for i, doc in enumerate(tqdm(corpus)):
        doc_idx = num_words + doc_offset + i
        word_freq = defaultdict(int)
        for word in doc.text:
            # Dict membership is O(1); scanning `vocabulary` itself is linear
            # per token when it is a list.
            if word not in word_to_index:
                word = '<unk>'
            word_freq[word] += 1
        word_freqs_per_doc[doc_idx] = dict(word_freq)

    ### PMI calculations
    print('Building word frequencies per window')
    num_windows = 0
    word_window_occurrences = defaultdict(int)
    # keys are tuples (x,y) where x < y are strings
    word_pair_window_occurrences = defaultdict(int)
    for doc in tqdm(corpus):
        text = doc.text
        for window_start_idx in range(max(1, len(text) - window_size + 1)):
            num_windows += 1
            window = text[window_start_idx:window_start_idx + window_size]
            # Apply the same '<unk>' mapping as the per-document counts so
            # that every counted word can be resolved to a vertex below.
            # Sorting makes each pair (word1, word2) satisfy word1 < word2.
            distinct_words = sorted(
                {w if w in word_to_index else '<unk>' for w in window})
            for word in distinct_words:
                word_window_occurrences[word] += 1
            for pos, word1 in enumerate(distinct_words):
                for word2 in distinct_words[pos + 1:]:
                    word_pair_window_occurrences[(word1, word2)] += 1

    # get rid of defaultdict behavior
    word_window_occurrences = dict(word_window_occurrences)
    word_pair_window_occurrences = dict(word_pair_window_occurrences)

    print('Calculating PMIs')
    for pair, pair_freq in tqdm(word_pair_window_occurrences.items()):
        word1, word2 = pair
        w1_idx = word_to_index.get(word1)
        w2_idx = word_to_index.get(word2)
        if w1_idx is None or w2_idx is None:
            # Only reachable for '<unk>' when it is not part of `vocabulary`.
            continue
        freq1 = word_window_occurrences[word1]
        freq2 = word_window_occurrences[word2]
        # PMI = log(P(i and j) / (P(i) * P(j)))
        # where P(i and j) = #windows with i and j / #windows
        # and P(i) = #windows with i / #windows
        pmi = log((pair_freq * num_windows) / (freq1 * freq2))
        if pmi <= 0:
            continue
        entries.append(pmi)
        row.append(w1_idx)
        col.append(w2_idx)
        entries.append(pmi)
        row.append(w2_idx)
        col.append(w1_idx)

    ### TF-IDF calculations
    print('Calculating TF-IDF')
    # Invert the per-document counts once (word -> [(doc_idx, count), ...] in
    # document order) so each vocabulary word only visits documents that
    # contain it.
    postings = defaultdict(list)
    for doc_idx, freqs in word_freqs_per_doc.items():
        for word, count in freqs.items():
            postings[word].append((doc_idx, count))

    for w_idx, word in enumerate(tqdm(vocabulary)):
        docs_with_word = postings.get(word)
        if not docs_with_word:
            continue
        idf = log(num_documents / len(docs_with_word))
        for doc_idx, count in docs_with_word:
            tfidf = count * idf
            entries.append(tfidf)
            row.append(w_idx)
            col.append(doc_idx)
            entries.append(tfidf)
            row.append(doc_idx)
            col.append(w_idx)

    ### 1s for identities
    print('Identities')
    for i in trange(num_vertices):
        entries.append(1)
        row.append(i)
        col.append(i)

    indices = torch.LongTensor([row, col])
    entries = torch.FloatTensor(entries)
    # sparse_coo_tensor replaces the deprecated sparse.FloatTensor constructor.
    return torch.sparse_coo_tensor(indices, entries,
                                   torch.Size([num_vertices, num_vertices]))
def forward(self, query, key, value, key_padding_mask=None, incremental_state=None, need_weights=True, static_kv=False, attn_mask=None, fast_weights=None):
    """Compute multi-head attention of `query` over `key`/`value`.

    Input shape: Time x Batch x Channel

    Self-attention can be implemented by passing in the same arguments for
    query, key and value. Timesteps can be masked by supplying a T x T mask
    in the `attn_mask` argument. Padding elements can be excluded from
    the key by passing a binary ByteTensor (`key_padding_mask`) with shape:
    batch x src_len, where padding elements are indicated by 1s.

    Args:
        query: tensor unpacked as (tgt_len, bsz, embed_dim); the last
            dimension must equal `self.embed_dim` (asserted).
        key, value: tensors of equal size (asserted).
        key_padding_mask: optional mask of shape (bsz, src_len) (asserted).
            A zero-dimensional tensor is treated the same as `None`.
        incremental_state: when not `None`, cached keys/values are read
            through `self._get_input_buffer` and the updated ones are
            written back through `self._set_input_buffer`.
        need_weights: when true, the attention weights averaged over heads
            are returned as the second element; otherwise `None` is.
        static_kv: when true and a cached `'prev_key'` exists, `key` and
            `value` are not projected again and the cached tensors are used
            unchanged.
        attn_mask: optional mask that is added to the raw attention scores
            before the softmax.
        fast_weights: accepted but not referenced anywhere in this method.

    Returns:
        Tuple `(attn, attn_weights)`. `attn` is the result of
        `self.out_proj` applied to a (tgt_len, bsz, embed_dim) tensor.
        `attn_weights` has shape (bsz, tgt_len, src_len) when
        `need_weights` is true, else it is `None`.
    """
    # Inputs are recognised as "the same" by comparing their data pointers,
    # which selects the projection path below.
    qkv_same = query.data_ptr() == key.data_ptr() == value.data_ptr()
    kv_same = key.data_ptr() == value.data_ptr()

    tgt_len, bsz, embed_dim = query.size()
    assert embed_dim == self.embed_dim
    assert list(query.size()) == [tgt_len, bsz, embed_dim]
    assert key.size() == value.size()

    if incremental_state is not None:
        saved_state = self._get_input_buffer(incremental_state)
        if 'prev_key' in saved_state:
            # previous time steps are cached - no need to recompute
            # key and value if they are static
            if static_kv:
                assert kv_same and not qkv_same
                key = value = None
    else:
        saved_state = None

    if qkv_same:
        # self-attention
        q, k, v = self.in_proj_qkv(query)
    elif kv_same:
        # encoder-decoder attention
        q = self.in_proj_q(query)
        if key is None:
            # key/value were dropped above because static cached ones exist
            assert value is None
            k = v = None
        else:
            k, v = self.in_proj_kv(key)
    else:
        q = self.in_proj_q(query)
        k = self.in_proj_k(key)
        v = self.in_proj_v(value)
    # in-place scaling of the projected query
    q *= self.scaling

    if self.bias_k is not None:
        assert self.bias_v is not None
        # append the bias as one extra key/value position (concatenated
        # along dim 0) and widen both masks by one zero column to match
        k = torch.cat([k, self.bias_k.repeat(1, bsz, 1)])
        v = torch.cat([v, self.bias_v.repeat(1, bsz, 1)])
        if attn_mask is not None:
            attn_mask = torch.cat(
                [attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)], dim=1)
        if key_padding_mask is not None:
            key_padding_mask = torch.cat([
                key_padding_mask,
                key_padding_mask.new_zeros(key_padding_mask.size(0), 1)
            ], dim=1)

    # split the channel dimension into heads:
    # (len, bsz, embed) -> (bsz * num_heads, len, head_dim)
    q = q.contiguous().view(tgt_len, bsz * self.num_heads, self.head_dim).transpose(0, 1)
    if k is not None:
        k = k.contiguous().view(-1, bsz * self.num_heads, self.head_dim).transpose(0, 1)
    if v is not None:
        v = v.contiguous().view(-1, bsz * self.num_heads, self.head_dim).transpose(0, 1)

    if saved_state is not None:
        # saved states are stored with shape (bsz, num_heads, seq_len, head_dim)
        if 'prev_key' in saved_state:
            prev_key = saved_state['prev_key'].view(
                bsz * self.num_heads, -1, self.head_dim)
            if static_kv:
                k = prev_key
            else:
                # extend the cached keys with the newly projected ones
                k = torch.cat((prev_key, k), dim=1)
        if 'prev_value' in saved_state:
            prev_value = saved_state['prev_value'].view(
                bsz * self.num_heads, -1, self.head_dim)
            if static_kv:
                v = prev_value
            else:
                v = torch.cat((prev_value, v), dim=1)
        # store the (possibly extended) keys/values for the next call
        saved_state['prev_key'] = k.view(bsz, self.num_heads, -1, self.head_dim)
        saved_state['prev_value'] = v.view(bsz, self.num_heads, -1, self.head_dim)

        self._set_input_buffer(incremental_state, saved_state)

    src_len = k.size(1)

    # This is part of a workaround to get around fork/join parallelism
    # not supporting Optional types.
    if key_padding_mask is not None and key_padding_mask.shape == torch.Size(
            []):
        key_padding_mask = None

    if key_padding_mask is not None:
        assert key_padding_mask.size(0) == bsz
        assert key_padding_mask.size(1) == src_len

    if self.add_zero_attn:
        # append one all-zero key/value position and widen the masks with a
        # zero column accordingly
        src_len += 1
        k = torch.cat([k, k.new_zeros((k.size(0), 1) + k.size()[2:])], dim=1)
        v = torch.cat([v, v.new_zeros((v.size(0), 1) + v.size()[2:])], dim=1)
        if attn_mask is not None:
            attn_mask = torch.cat(
                [attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)], dim=1)
        if key_padding_mask is not None:
            key_padding_mask = torch.cat([
                key_padding_mask,
                torch.zeros(key_padding_mask.size(0), 1).type_as(key_padding_mask)
            ], dim=1)

    # raw attention scores: (bsz * num_heads, tgt_len, src_len)
    attn_weights = torch.bmm(q, k.transpose(1, 2))
    assert list(
        attn_weights.size()) == [bsz * self.num_heads, tgt_len, src_len]

    if attn_mask is not None:
        attn_mask = attn_mask.unsqueeze(0)
        if self.onnx_trace:
            # when tracing, the mask is repeated explicitly for every
            # batch-head entry before it is added
            attn_mask = attn_mask.repeat(attn_weights.size(0), 1, 1)
        attn_weights += attn_mask

    if key_padding_mask is not None:
        # don't attend to padding symbols
        attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
        if self.onnx_trace:
            attn_weights = torch.where(
                key_padding_mask.unsqueeze(1).unsqueeze(2),
                torch.Tensor([float("-Inf")]),
                attn_weights.float()).type_as(attn_weights)
        else:
            # NOTE(review): this branch fills with the finite value -1e30
            # while the ONNX branch uses -Inf; presumably this keeps fully
            # padded rows from producing NaN in the softmax - confirm.
            attn_weights = attn_weights.float().masked_fill(
                key_padding_mask.unsqueeze(1).unsqueeze(2) == 1,
                float('-1e30'),
            ).type_as(attn_weights)  # FP16 support: cast to float and back
        attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

    # function-scope import of the project's softmax helper
    from fairseq import utils
    attn_weights = utils.softmax(
        attn_weights, dim=-1, onnx_trace=self.onnx_trace,
    ).type_as(attn_weights)
    attn_weights = F.dropout(attn_weights, p=self.dropout, training=self.training)

    attn = torch.bmm(attn_weights, v)
    assert list(
        attn.size()) == [bsz * self.num_heads, tgt_len, self.head_dim]
    if (self.onnx_trace and attn.size(1) == 1):
        # when ONNX tracing a single decoder step (sequence length == 1)
        # the transpose is a no-op copy before view, thus unnecessary
        attn = attn.contiguous().view(tgt_len, bsz, embed_dim)
    else:
        # merge the heads back: (bsz * num_heads, tgt_len, head_dim)
        # -> (tgt_len, bsz, embed_dim)
        attn = attn.transpose(0, 1).contiguous().view(tgt_len, bsz, embed_dim)
    attn = self.out_proj(attn)

    if need_weights:
        # average attention weights over heads
        attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
        attn_weights = attn_weights.sum(dim=1) / self.num_heads
    else:
        attn_weights = None

    return attn, attn_weights
def sample_params(self):
    """Ask the filter's state-space model to sample its parameters.

    The shape passed to ``self.filter.ssm.sample_params`` is
    ``self.particles`` with one extra trailing dimension of size 1 when
    ``self.filter`` is a ``ParticleFilter``, and ``self.particles`` unchanged
    for any other filter.
    """
    if isinstance(self.filter, ParticleFilter):
        # extra trailing singleton dimension for particle filters
        sample_shape = torch.Size((*self.particles, 1))
    else:
        sample_shape = self.particles

    self.filter.ssm.sample_params(sample_shape)