def l2l_validate(model, cluster_center, n_epoch=100):
    val_accuracy = []
    for epoch in range(n_epoch):
        data_l = generate_data_l(cluster_center)
        data_n = generate_data_n(cluster_center, model.n_class_n)
        x_l, y_l = Variable(torch.from_numpy(data_l[0])).float(), Variable(
            torch.from_numpy(data_l[1]))
        x_n, y_n = Variable(torch.from_numpy(data_n[0])).float(), Variable(
            torch.from_numpy(data_n[1]))
        pred_ll, pred_nl, w, b = model(x_l, x_n)
        M = Variable(torch.zeros(model.n_class_n, model.n_dim))
        B = Variable(torch.zeros(model.n_class_n))
        for k in range(model.n_class_n):
            M[k] = torch.cat((w[:, 0][y_n == model.n_class_l + k].view(-1, 1),
                              w[:, 1][y_n == model.n_class_l + k].view(-1, 1)), 1).mean(0)
            B[k] = b[y_n == model.n_class_l + k].mean()
        pred_ln = torch.mm(x_l, M.t()) + B.view(1, -1).expand_as(torch.mm(x_l, M.t()))
        pred_nn = torch.mm(x_n, M.t()) + B.view(1, -1).expand_as(torch.mm(x_n, M.t()))
        pred = torch.cat((torch.cat((pred_ll, pred_nl)), torch.cat((pred_ln, pred_nn))), 1)
        pred = pred.data.max(1)[1]
        y = torch.cat((y_l, y_n))
        accuracy = pred.eq(y.data).cpu().sum() * 1.0 / y.size()[0]
        # print('accuracy: %.2f' % accuracy)
        val_accuracy.append(accuracy)
        acc_l = pred.eq(y.data).cpu()[0:100].sum() * 1.0 / 100
        acc_n = pred.eq(y.data).cpu()[100:150].sum() * 1.0 / 50
        print('accuracy: %.2f, lifelong accuracy: %.2f, new accuracy: %.2f' %
              (accuracy, acc_l, acc_n))

    return numpy.mean(numpy.asarray(val_accuracy))
def backward(ctx, grad_output):
    input1, input2, weight, bias = ctx.saved_variables
    grad_input1 = grad_input2 = grad_weight = grad_bias = None
    buff = Variable(input1.data.new())

    if ctx.needs_input_grad[0] or ctx.needs_input_grad[1]:
        grad_input1 = torch.mm(input2, weight[0].t())
        grad_input1 = grad_input1.mul(grad_output.narrow(1, 0, 1).expand(grad_input1.size()))
        grad_input2 = torch.mm(input1, weight[0])
        grad_input2 = grad_input2.mul(grad_output.narrow(1, 0, 1).expand(grad_input2.size()))
        for k in range(1, weight.size(0)):
            buff = input2.mm(weight[k].t())
            buff = buff.mul(grad_output.narrow(1, k, 1).expand(grad_input1.size()))
            grad_input1.add_(buff)
            buff = input1.mm(weight[k])
            buff = buff.mul(grad_output.narrow(1, k, 1).expand(grad_input2.size()))
            grad_input2.add_(buff)

    grad_weight = Variable(weight.data.new(weight.size()))
    if ctx.needs_input_grad[2]:
        # accumulate parameter gradients:
        for k in range(weight.size(0)):
            buff = input1.mul(grad_output.narrow(1, k, 1).expand_as(input1))
            grad_weight[k] = torch.mm(buff.t(), input2)

    if bias is not None and ctx.needs_input_grad[3]:
        grad_bias = grad_output.sum(0, keepdim=False)

    return grad_input1, grad_input2, grad_weight, grad_bias
def updateGradInput(self, input, gradOutput):
    if self.gradInput is None:
        return

    self._assertInputGradOutput(input, gradOutput)
    # compute d output / d input:
    self.gradInput[0].resize_as_(input[0]).fill_(0)
    self.gradInput[1].resize_as_(input[1]).fill_(0)

    #: first slice of weight tensor (k = 0)
    self.gradInput[0].addmm_(input[1], self.weight[0].t())
    self.gradInput[0].mul_(gradOutput.narrow(1, 0, 1).expand(self.gradInput[0].size(0),
                                                             self.gradInput[0].size(1)))
    self.gradInput[1].addmm_(input[0], self.weight[0])
    self.gradInput[1].mul_(gradOutput.narrow(1, 0, 1).expand(self.gradInput[1].size(0),
                                                             self.gradInput[1].size(1)))

    #: remaining slices of weight tensor (k >= 1)
    if self.weight.size(0) > 1:
        if self.buff1 is None:
            self.buff1 = input[0].new()
        self.buff1.resize_as_(input[0])
        # buff2 must be allocated lazily as well, mirroring buff1
        if self.buff2 is None:
            self.buff2 = input[1].new()
        self.buff2.resize_as_(input[1])

        for k in range(1, self.weight.size(0)):
            torch.mm(input[1], self.weight[k].t(), out=self.buff1)
            self.buff1.mul_(gradOutput.narrow(1, k, 1).expand(self.gradInput[0].size(0),
                                                              self.gradInput[0].size(1)))
            self.gradInput[0].add_(self.buff1)

            torch.mm(input[0], self.weight[k], out=self.buff2)
            self.buff2.mul_(gradOutput.narrow(1, k, 1).expand(self.gradInput[1].size(0),
                                                              self.gradInput[1].size(1)))
            self.gradInput[1].add_(self.buff2)

    return self.gradInput
def forward(self, input_, hx):
    """
    Args:
        input_: A (batch, input_size) tensor containing input features.
        hx: A tuple (h_0, c_0), which contains the initial hidden
            and cell state, where the size of both states is
            (batch, hidden_size).

    Returns:
        h_1, c_1: Tensors containing the next hidden and cell state.
    """
    h_0, c_0 = hx
    batch_size = h_0.size(0)
    bias_batch = (self.bias.unsqueeze(0)
                  .expand(batch_size, *self.bias.size()))
    wh = torch.mm(h_0, self.weight_hh)
    wi = torch.mm(input_, self.weight_ih)
    bn_wh = self.bn_hh(wh)
    bn_wi = self.bn_ih(wi)
    f, i, o, g = torch.split(bn_wh + bn_wi + bias_batch,
                             split_size=self.hidden_size, dim=1)
    c_1 = torch.sigmoid(f) * c_0 + torch.sigmoid(i) * torch.tanh(g)
    h_1 = torch.sigmoid(o) * torch.tanh(self.bn_c(c_1))
    return h_1, c_1
def backward(ctx, grad_output):
    matrix1, matrix2 = ctx.saved_variables
    grad_add_matrix = grad_matrix1 = grad_matrix2 = None

    if ctx.needs_input_grad[0]:
        grad_add_matrix = maybe_unexpand(grad_output, ctx.add_matrix_size)
        if ctx.alpha != 1:
            grad_add_matrix = grad_add_matrix.mul(ctx.alpha)

    if ctx.needs_input_grad[1]:
        if matrix1.stride() == (1, matrix1.size(0)):
            # column major gradient if input is column major
            grad_matrix1 = torch.mm(matrix2, grad_output.t()).t()
        else:
            grad_matrix1 = torch.mm(grad_output, matrix2.t())
        if ctx.beta != 1:
            grad_matrix1 *= ctx.beta

    if ctx.needs_input_grad[2]:
        if matrix2.stride() == (1, matrix2.size(0)):
            # column major gradient if input is column major
            grad_matrix2 = torch.mm(grad_output.t(), matrix1).t()
        else:
            grad_matrix2 = torch.mm(matrix1.t(), grad_output)
        if ctx.beta != 1:
            grad_matrix2 *= ctx.beta

    return grad_add_matrix, grad_matrix1, grad_matrix2, None, None, None
def forward(self, attn_mem, n_step):
    """attn_mem: Tensor of size [num_sents, input_dim]"""
    attn_feat = torch.mm(attn_mem, self._attn_wm)
    hop_feat = torch.mm(attn_mem, self._hop_wm)
    outputs = []
    lstm_in = self._init_i.unsqueeze(0)
    lstm_states = (self._init_h.unsqueeze(1), self._init_c.unsqueeze(1))
    for _ in range(n_step):
        h, c = self._lstm_cell(lstm_in, lstm_states)
        query = h[:, -1, :]
        for _ in range(self._n_hop):
            query = PtrExtractorRL.attention(hop_feat, query,
                                             self._hop_v, self._hop_wq)
        score = PtrExtractorRL.attention_score(
            attn_feat, query, self._attn_v, self._attn_wq)
        if self.training:
            prob = F.softmax(score, dim=-1)
            out = torch.distributions.Categorical(prob)
        else:
            for o in outputs:
                score[0, o[0, 0].item()][0] = -1e18
            out = score.max(dim=1, keepdim=True)[1]
        outputs.append(out)
        lstm_in = attn_mem[out[0, 0].item()].unsqueeze(0)
        lstm_states = (h, c)
    return outputs
def l2l_train(model, cluster_center, n_epoch=10000, trunc_step=10):
    optimizer = optim.Adam(model.parameters(), lr=0.01)
    M_all = Variable(torch.zeros(model.n_class, model.n_dim))
    B_all = Variable(torch.zeros(model.n_class))
    for epoch in range(n_epoch):
        loss = 0
        M_step, B_step = [], []
        for step in range(trunc_step):
            data = generate_data(cluster_center)
            optimizer.zero_grad()
            x, y = Variable(torch.from_numpy(data[0])).float(), Variable(torch.from_numpy(data[1]))
            w, b = model(x)
            M = Variable(torch.zeros(model.n_class_n, model.n_dim))
            B = Variable(torch.zeros(model.n_class_n))
            for k in range(model.n_class_n):
                M[k] = torch.cat((w[:, 0][y == model.n_class_l + k].view(-1, 1),
                                  w[:, 1][y == model.n_class_l + k].view(-1, 1)), 1).mean(0)
                B[k] = b[y == model.n_class_l + k].mean()
            if step == 0:
                M_ = M
                B_ = B
            else:
                M_ = step / (step + 1) * M_step[-1] + 1 / (step + 1) * M
                B_ = step / (step + 1) * B_step[-1] + 1 / (step + 1) * B
            M_step.append(M_)
            B_step.append(B_)
            pred = torch.mm(x, M_.t()) + B_.view(1, -1).expand_as(torch.mm(x, M_.t()))
            loss += F.cross_entropy(pred, y)
        loss.backward()
        optimizer.step()
        print('Train Epoch: {}\tLoss: {:.6f}'.format(epoch, loss.data[0]))

    return M_all, B_all, cluster_center
def forward(self, embeddings, label):
    if self.device_id is None:
        kernel_norm = l2_norm(self.kernel, axis=0)
        cos_theta = torch.mm(embeddings, kernel_norm)
    else:
        x = embeddings
        sub_kernels = torch.chunk(self.kernel, len(self.device_id), dim=1)
        temp_x = x.cuda(self.device_id[0])
        kernel_norm = l2_norm(sub_kernels[0], axis=0).cuda(self.device_id[0])
        cos_theta = torch.mm(temp_x, kernel_norm)
        for i in range(1, len(self.device_id)):
            temp_x = x.cuda(self.device_id[i])
            kernel_norm = l2_norm(sub_kernels[i], axis=0).cuda(self.device_id[i])
            cos_theta = torch.cat((cos_theta, torch.mm(temp_x, kernel_norm).cuda(self.device_id[0])), dim=1)

    cos_theta = cos_theta.clamp(-1, 1)  # for numerical stability
    phi = cos_theta - self.m
    label = label.view(-1, 1)  # size=(B,1)
    index = cos_theta.data * 0.0  # size=(B,Classnum)
    index.scatter_(1, label.data.view(-1, 1), 1)
    index = index.byte()
    output = cos_theta * 1.0
    output[index] = phi[index]  # only change the correctly predicted output
    output *= self.s  # scale up to make softmax work; first introduced in NormFace
    return output
def forward(self, input, hidden, encoder_outputs, enc_padding_mask,
            context, extra_zeros, enc_batch_extend_vocab, coverage):
    """
    :param input: (B)
    :param hidden: (1, B, H), (1, B, H)
    :param encoder_outputs: (B, L, 2*H)
    :param enc_padding_mask: (B, L)
    :param context: (B, 2*H); since beam search will use context, we need to send context out.
    :param extra_zeros: (B, n)
    :param enc_batch_extend_vocab: (B, L)
    :param coverage: (B, L)
    :return: (B, V), ((1, B, H), (1, B, H)), (B, 2*H), (B, L), (B, 1), (B, L)
    """
    input = self.embed(input)  # B -> (B, D)
    x = self.x_context(torch.cat((context, input), 1))  # (B, 2*H), (B, D) -> (B, 2*H + D) -> (B, D)
    output, hidden = self.lstm(x.unsqueeze(1), hidden)  # (B, 1, D), ((1, B, H), (1, B, H)) -> (B, 1, H), hidden

    h_decoder, c_decoder = hidden  # (1, B, H), (1, B, H)
    hidden_hat = torch.cat((h_decoder.view(-1, self.args.hidden_dim),
                            c_decoder.view(-1, self.args.hidden_dim)), 1)  # (B, H), (B, H) -> (B, 2*H)
    context, attn_dist, coverage = self.attention(
        hidden_hat, encoder_outputs, enc_padding_mask,
        coverage)  # (B, 2*H), (B, L), (B, L) <- (B, 2*H), (B, L, 2*H), (B, L), (B, L)

    p_gen = None
    if self.args.pointer_gen:
        p_gen_input = torch.cat((context, hidden_hat, x), 1)  # (B, 2*H), (B, 2*H), (B, D) -> (B, 2*2*H + D)
        p_gen = self.p_gen_linear(p_gen_input)  # (B, 2*2*H + D) -> (B, 1)
        p_gen = torch.sigmoid(p_gen)  # (B, 1)

    output = torch.cat((output.view(-1, self.args.hidden_dim), context), 1)  # (B, H), (B, 2*H) -> (B, 3*H)
    output = self.out_linear(output)  # (B, 3*H) -> (B, H)
    # output = F.relu(output)

    ## map (B, H) -> (B, V)
    # output = self.out2(output)  # (B, H) -> (B, V); changed to the matrix multiply below
    output_pos = self.hidden2dim_pos(output)  # (B, H) -> (B, D)
    output_neg = self.hidden2dim_neg(output)  # (B, H) -> (B, D)
    output_pos = F.relu(torch.mm(output_pos, self.embed.weight.t()))  # (B, D) * (D, V) -> (B, V)
    output_neg = F.relu(torch.mm(output_neg, self.embed.weight.t()))  # (B, D) * (D, V) -> (B, V)
    output = output_pos - output_neg  # (B, V)

    ## change output to vocab_dist
    vocab_dist = F.softmax(output, dim=1)  # (B, V)

    if self.args.pointer_gen:
        vocab_dist_ = p_gen * vocab_dist  # (B, 1) * (B, V) -> (B, V)
        attn_dist_ = (1 - p_gen) * attn_dist  # (B, 1) * (B, L) -> (B, L)
        if extra_zeros is not None:
            vocab_dist_ = torch.cat([vocab_dist_, extra_zeros], 1)  # (B, V), (B, n) -> (B, V + n)
        final_dist = vocab_dist_.scatter_add(1, enc_batch_extend_vocab, attn_dist_)  # (B, V) -> (B, V + n)
    else:
        final_dist = vocab_dist  # (B, V)

    return final_dist, hidden, context, attn_dist, p_gen, coverage
    # (B, V), ((1, B, H), (1, B, H)), (B, 2*H), (B, L), (B, 1), (B, L)
def addDecovRegularizer(loss, regParam, activations):
    for i in range(len(activations)):
        x = activations[i]
        batch_size = x.shape[0]
        # print("x.shape is: ", x.shape)  # 2048, 100
        h_centered = x - torch.mean(x, dim=0, keepdim=True)  # mean-center activations
        covariance = torch.mm(h_centered.t(), h_centered)  # get the small x small covariance matrix
        n = covariance.shape[0]
        # zero out the diagonal of the covariance matrix
        # (we don't want to penalize neurons against themselves)
        covariance[np.diag_indices(n)] = 0
        # alternative: t[torch.eye(n).byte()] = 5
        covariance /= batch_size  # normalize by the minibatch size
        cost = (0.5 * regParam) * torch.sum(torch.mm(covariance, covariance))
        loss += cost  # in-place tensor add, so the caller's loss is updated
def merge(tbl):
    inp = scn.InputBatch(2, spatial_size)
    center = spatial_size.float().view(1, 2) / 2
    p = torch.LongTensor(2)
    v = torch.FloatTensor([1, 0, 0])
    for char in tbl['input']:
        inp.addSample()
        m = torch.eye(2)
        r = random.randint(1, 3)
        alpha = random.uniform(-0.2, 0.2)
        # branch on the random choice r, not on alpha
        if r == 1:
            m[0][1] = alpha
        elif r == 2:
            m[1][0] = alpha
        else:
            m = torch.mm(m, torch.FloatTensor(
                [[math.cos(alpha), math.sin(alpha)],
                 [-math.sin(alpha), math.cos(alpha)]]))
        c = center + torch.FloatTensor(1, 2).uniform_(-8, 8)
        for stroke in char:
            stroke = stroke.float() / 255 - 0.5
            stroke = c.expand_as(stroke) + \
                torch.mm(stroke, m * (Scale - 0.01))
            ###############################################################
            # To avoid GIL problems use a helper function:
            scn.dim_fn(2, 'drawCurve')(inp.metadata.ffi, inp.features, stroke)
            ###############################################################
            # Above is equivalent to:
            # x1,x2,y1,y2,l=0,stroke[0][0],0,stroke[0][1],0
            # for i in range(1,stroke.size(0)):
            #     x1=x2
            #     y1=y2
            #     x2=stroke[i][0]
            #     y2=stroke[i][1]
            #     l=1e-10+((x2-x1)**2+(y2-y1)**2)**0.5
            #     v[1]=(x2-x1)/l
            #     v[2]=(y2-y1)/l
            #     l=max(x2-x1,y2-y1,x1-x2,y1-y2,0.9)
            #     for j in numpy.arange(0,1,1/l):
            #         p[0]=math.floor(x1*j+x2*(1-j))
            #         p[1]=math.floor(y1*j+y2*(1-j))
            #         inp.setLocation(p,v,False)
            ###############################################################
    inp.precomputeMetadata(precomputeStride)
    return {'input': inp, 'target': torch.LongTensor(tbl['target']) - 1}
def train(self, x):
    self.model.train()
    o = self.model(x)
    loss = torch.mean(torch.pow(torch.mm(o, self.B.t()) - x, 2))
    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()
    U, _, V = torch.svd(torch.mm(x.t().data, o.data))
    self.B = torch.autograd.Variable(torch.mm(U, V.t()))
    return loss.data.cpu().numpy()
def anomalyScore(args, model, dataset, mean, cov, channel_idx=0, score_predictor=None):
    predictions = []
    rearranged = []
    errors = []
    hiddens = []
    predicted_scores = []
    with torch.no_grad():
        # Turn on evaluation mode, which disables dropout.
        model.eval()
        pasthidden = model.init_hidden(1)
        for t in range(len(dataset)):
            out, hidden = model.forward(dataset[t].unsqueeze(0), pasthidden)
            predictions.append([])
            rearranged.append([])
            errors.append([])
            hiddens.append(model.extract_hidden(hidden))
            if score_predictor is not None:
                predicted_scores.append(score_predictor.predict(model.extract_hidden(hidden).numpy()))

            predictions[t].append(out.data.cpu()[0][0][channel_idx])
            pasthidden = model.repackage_hidden(hidden)
            for prediction_step in range(1, args.prediction_window_size):
                out, hidden = model.forward(out, hidden)
                predictions[t].append(out.data.cpu()[0][0][channel_idx])

            if t >= args.prediction_window_size:
                for step in range(args.prediction_window_size):
                    rearranged[t].append(
                        predictions[step + t - args.prediction_window_size][args.prediction_window_size - 1 - step])
                rearranged[t] = torch.FloatTensor(rearranged[t]).to(args.device).unsqueeze(0)
                errors[t] = rearranged[t] - dataset[t][0][channel_idx]
            else:
                rearranged[t] = torch.zeros(1, args.prediction_window_size).to(args.device)
                errors[t] = torch.zeros(1, args.prediction_window_size).to(args.device)

    predicted_scores = np.array(predicted_scores)
    scores = []
    for error in errors:
        mult1 = error - mean.unsqueeze(0)  # [ 1 * prediction_window_size ]
        mult2 = torch.inverse(cov)  # [ prediction_window_size * prediction_window_size ]
        mult3 = mult1.t()  # [ prediction_window_size * 1 ]
        score = torch.mm(mult1, torch.mm(mult2, mult3))
        scores.append(score[0][0])

    scores = torch.stack(scores)
    rearranged = torch.cat(rearranged, dim=0)
    errors = torch.cat(errors, dim=0)

    return scores, rearranged, errors, hiddens, predicted_scores
def forward(self, input_n, hidden, phi, nh):
    self.batch_size = input_n.size()[0]
    hidden = torch.cat((hidden, input_n), 2)
    # Aggregate representations
    h_conv = torch.div(torch.bmm(phi, hidden), nh)
    hidden = hidden.view(-1, self.hidden_size + self.input_size)
    h_conv = h_conv.view(-1, self.hidden_size + self.input_size)
    # h_conv has shape (batch_size, n, hidden_size + input_size)
    m1 = (torch.mm(hidden, self.W1)
          .view(self.batch_size, -1, self.hidden_size))
    m2 = (torch.mm(h_conv, self.W2)
          .view(self.batch_size, -1, self.hidden_size))
    m3 = self.b.unsqueeze(0).unsqueeze(1).expand_as(m2)
    hidden = torch.sigmoid(m1 + m2 + m3)
    return hidden
def count_accuracy(X, true_counts, air, batch_size):
    assert X.size(0) == true_counts.size(0), 'Size mismatch.'
    assert X.size(0) % batch_size == 0, 'Input size must be multiple of batch_size.'
    counts = torch.LongTensor(3, 4).zero_()
    error_latents = []
    error_indicators = []

    def count_vec_to_mat(vec, max_index):
        out = torch.LongTensor(vec.size(0), max_index + 1).zero_()
        out.scatter_(1, vec.type(torch.LongTensor).view(vec.size(0), 1), 1)
        return out

    for i in range(X.size(0) // batch_size):
        X_batch = X[i * batch_size:(i + 1) * batch_size]
        true_counts_batch = true_counts[i * batch_size:(i + 1) * batch_size]
        z_where, z_pres = air.guide(X_batch, batch_size)
        inferred_counts = sum(z.cpu() for z in z_pres).squeeze().data
        true_counts_m = count_vec_to_mat(true_counts_batch, 2)
        inferred_counts_m = count_vec_to_mat(inferred_counts, 3)
        counts += torch.mm(true_counts_m.t(), inferred_counts_m)
        error_ind = 1 - (true_counts_batch == inferred_counts)
        error_ix = error_ind.nonzero().squeeze()
        error_latents.append(latents_to_tensor((z_where, z_pres)).index_select(0, error_ix))
        error_indicators.append(error_ind)

    acc = counts.diag().sum().float() / X.size(0)
    error_indices = torch.cat(error_indicators).nonzero().squeeze()
    if X.is_cuda:
        error_indices = error_indices.cuda()
    return acc, counts, torch.cat(error_latents), error_indices
def _step(self, tok, states, attention):
    prev_states, prev_out = states
    lstm_in = torch.cat(
        [self._embedding(tok).squeeze(1), prev_out],
        dim=1
    )
    states = self._lstm(lstm_in, prev_states)
    lstm_out = states[0][-1]
    query = torch.mm(lstm_out, self._attn_w)
    attention, attn_mask, extend_src, extend_vsize = attention
    context, score = step_attention(
        query, attention, attention, attn_mask)
    dec_out = self._projection(torch.cat([lstm_out, context], dim=1))

    # extend generation prob to extended vocabulary
    gen_prob = self._compute_gen_prob(dec_out, extend_vsize)
    # compute the probability of copying
    copy_prob = torch.sigmoid(self._copy(context, states[0][-1], lstm_in))
    # add the copy prob to the existing vocab distribution
    lp = torch.log(
        ((-copy_prob + 1) * gen_prob
         ).scatter_add(
            dim=1,
            index=extend_src.expand_as(score),
            source=score * copy_prob
        ) + 1e-8)  # numerical stability for log
    return lp, (states, dec_out), score
def _step(self, tok, states, attention):
    prev_states, prev_out = states
    lstm_in = torch.cat(
        [self._embedding(tok).squeeze(1), prev_out],
        dim=1
    )
    states = self._lstm(lstm_in, prev_states)
    lstm_out = states[0][-1]
    query = torch.mm(lstm_out, self._attn_w)
    attention, attn_mask = attention
    context, score = step_attention(
        query, attention, attention, attn_mask)
    dec_out = self._projection(torch.cat([lstm_out, context], dim=1))
    states = (states, dec_out)
    logit = torch.mm(dec_out, self._embedding.weight.t())
    return logit, states, score
def test_shape(di, dj, dk):
    x = self._gen_sparse(2, 20, [di, dj])[0]
    y = self.randn(dj, dk)

    res = torch.hsmm(x, y)
    expected = torch.mm(x.to_dense(), y)
    self.assertEqual(res.to_dense(), expected)
def test_shape(di, dj, dk):
    x = self._gen_sparse(2, 20, [di, dj])[0]
    y = self.randn(dj, dk)

    res = torch.dsmm(x, y)
    expected = torch.mm(self.safeToDense(x), y)
    self.assertEqual(res, expected)
def forward(self, X, posterior_mean=False):
    """
    Function call to generate the output; every time we call it, the dynamic
    graph is created. The forward pass can differ between training and test:
        - In dropout we do not zero neurons at test time.
        - In Variational Inference we do not randomly sample from the posterior.

    We create the forward pass by performing operations between the input X
    (Nsam_batch, Ndim) and the parameters of the model that we should have
    initialized in __init__.
    """
    ## We need to sample from the posterior!
    self.sample_posterior(posterior_mean)

    o1 = self.linear1(X)
    # o1 = torch.mm(X, self.W1) + self.b1
    # print("x shape: ", X.shape, "W1 shape: ", self.W1.shape, "b1 shape: ", self.b1.shape)
    # print("o1 shape: ", o1.shape)
    # print("W2 shape: ", self.W2.shape, "b2 shape: ", self.b2.shape)

    ## Apply non-linearity
    o1 = self.cf_a.activation_func(o1)
    o1 = F.dropout(o1, p=self.cf_a.dop, training=self.training)
    o2 = torch.mm(o1, self.W2) + self.b2
    # print("o2 shape: ", o2.shape)
    return o2
def NN(epoch, net, lemniscate, trainloader, testloader, recompute_memory=0):
    net.eval()
    net_time = AverageMeter()
    cls_time = AverageMeter()
    losses = AverageMeter()
    correct = 0.
    total = 0
    testsize = testloader.dataset.__len__()

    trainFeatures = lemniscate.memory.t()
    if hasattr(trainloader.dataset, 'imgs'):
        trainLabels = torch.LongTensor([y for (p, y) in trainloader.dataset.imgs]).cuda()
    else:
        trainLabels = torch.LongTensor(trainloader.dataset.train_labels).cuda()

    if recompute_memory:
        transform_bak = trainloader.dataset.transform
        trainloader.dataset.transform = testloader.dataset.transform
        temploader = torch.utils.data.DataLoader(trainloader.dataset, batch_size=100,
                                                 shuffle=False, num_workers=1)
        for batch_idx, (inputs, targets, indexes) in enumerate(temploader):
            inputs, targets = inputs.cuda(), targets.cuda()
            inputs, targets = Variable(inputs, volatile=True), Variable(targets)
            batchSize = inputs.size(0)
            features = net(inputs)
            trainFeatures[:, batch_idx * batchSize:batch_idx * batchSize + batchSize] = features.data.t()
        trainLabels = torch.LongTensor(temploader.dataset.train_labels).cuda()
        trainloader.dataset.transform = transform_bak

    end = time.time()
    for batch_idx, (inputs, targets, indexes) in enumerate(testloader):
        inputs, targets = inputs.cuda(), targets.cuda()
        inputs, targets = Variable(inputs, volatile=True), Variable(targets)
        batchSize = inputs.size(0)
        features = net(inputs)
        net_time.update(time.time() - end)
        end = time.time()

        dist = torch.mm(features.data, trainFeatures)
        yd, yi = dist.topk(1, dim=1, largest=True, sorted=True)
        candidates = trainLabels.view(1, -1).expand(batchSize, -1)
        retrieval = torch.gather(candidates, 1, yi)

        retrieval = retrieval.narrow(1, 0, 1).clone().view(-1)
        yd = yd.narrow(1, 0, 1)

        total += targets.size(0)
        correct += retrieval.eq(targets.data).cpu().sum()

        cls_time.update(time.time() - end)
        end = time.time()

        print('Test [{}/{}]\t'
              'Net Time {net_time.val:.3f} ({net_time.avg:.3f})\t'
              'Cls Time {cls_time.val:.3f} ({cls_time.avg:.3f})\t'
              'Top1: {:.2f}'.format(
                  total, testsize, correct * 100. / total,
                  net_time=net_time, cls_time=cls_time))

    return correct / total
def l2l_validate(model, cluster_center, n_epoch=100):
    val_accuracy = []
    for epoch in range(n_epoch):
        batch = generate_data(cluster_center)
        x, y = Variable(torch.from_numpy(batch[0])).float(), Variable(torch.from_numpy(batch[1]))
        w, b = model(x)
        M = Variable(torch.zeros(model.n_class, model.n_dim))
        B = Variable(torch.zeros(model.n_class))
        for k in range(model.n_class):
            M[k] = torch.cat((w[:, 0][y == k].view(-1, 1),
                              w[:, 1][y == k].view(-1, 1)), 1).mean(0)
            B[k] = b[y == k].mean()
        pred = torch.mm(x, M.t()) + B.view(1, -1).expand_as(torch.mm(x, M.t()))
        pred = pred.data.max(1)[1]
        # multiply by 1.0 so the ratio is computed in floating point
        accuracy = pred.eq(y.data).cpu().sum() * 1.0 / y.size()[0]
        print('accuracy: %.2f' % accuracy)
        val_accuracy.append(accuracy)

    return numpy.mean(numpy.asarray(val_accuracy))
def memModel(contxtWords, aspectWords, position, sentLength):
    vaspect = aspectWords
    for i in range(hopNumber):
        Vi = 1.0 - position / sentLength - (i / vectorLength) * (1.0 - 2.0 * (position / sentLength))
        Mi = Vi.expand_as(contxtWords) * contxtWords

        attentionInputs = torch.cat([Mi, vaspect.expand(vectorLength, sentLength)])
        attentionA = torch.mm(attention_W, attentionInputs)
        gi = torch.tanh(attentionA + attention_b.expand_as(attentionA))
        alpha = softmax(gi)

        linearLayerOut = torch.mm(linearLayer_W, vaspect) + linearLayer_b
        vaspect = torch.sum(alpha.expand_as(Mi) * Mi, 1) + linearLayerOut

    finallinearLayerOut = torch.mm(softmaxLayer_W, vaspect) + softmaxLayer_b
    return finallinearLayerOut
def nn(self, word, k):
    embedding = self.mu.weight.data.cpu()  # [dict, embed_size]
    vector = embedding[self.dset.stoi[word], :].view(-1, 1)  # [embed_size, 1]
    distance = torch.mm(embedding, vector).squeeze() / torch.norm(embedding, 2, 1)
    distance = distance / torch.norm(vector, 2, 0)[0]
    distance = distance.numpy()
    # argsort is ascending, so take the last k indices (largest cosine
    # similarity) in reverse order to get the k nearest neighbours
    index = np.argsort(distance)[-k:][::-1]
    return [self.dset.itos[x] for x in index]
def _run_attention(self, h_all, return_weights=False):
    if not self.has_batch_dim:
        att_raw = torch.mm(h_all, self.attention_map[:, None])
        att = F.softmax(att_raw.squeeze(), dim=0)
        if return_weights:
            return att
        else:
            return torch.mm(att[None, :], h_all).squeeze()
    else:
        att_raw = torch.bmm(h_all, self.attention_map[:, :, None])
        att = F.softmax(att_raw.squeeze(), dim=0)
        if return_weights:
            return att
        else:
            return torch.bmm(att[:, None, :], h_all).squeeze()
def l2l_train(model, cluster_center, n_epoch=10000):
    optimizer = optim.Adam(model.parameters(), lr=0.01)
    for epoch in range(n_epoch):
        batch = generate_data(cluster_center)
        x, y = Variable(torch.from_numpy(batch[0])).float(), Variable(torch.from_numpy(batch[1]))
        optimizer.zero_grad()
        w, b = model(x)
        M = Variable(torch.zeros(model.n_class, model.n_dim))
        B = Variable(torch.zeros(model.n_class))
        for k in range(model.n_class):
            M[k] = torch.cat((w[:, 0][y == k].view(-1, 1),
                              w[:, 1][y == k].view(-1, 1)), 1).mean(0)
            B[k] = b[y == k].mean()
        pred = torch.mm(x, M.t()) + B.view(1, -1).expand_as(torch.mm(x, M.t()))
        loss = F.cross_entropy(pred, y)
        loss.backward()
        optimizer.step()
        print('Train Epoch: {}\tLoss: {:.6f}'.format(epoch, loss.data[0]))
def forward(self, input_, c_input, hx):
    """
    Args:
        batch = 1
        input_: A (batch, input_size) tensor containing input features.
        c_input: A list of size c_num; each element is the input ct
            from a skip word (batch, hidden_size).
        hx: A tuple (h_0, c_0), which contains the initial hidden
            and cell state, where the size of both states is
            (batch, hidden_size).

    Returns:
        h_1, c_1: Tensors containing the next hidden and cell state.
    """
    h_0, c_0 = hx
    batch_size = h_0.size(0)
    # assert(batch_size == 1)
    bias_batch = (self.bias.unsqueeze(0).expand(batch_size, *self.bias.size()))
    wh_b = torch.addmm(bias_batch, h_0, self.weight_hh)
    wi = torch.mm(input_, self.weight_ih)
    i, o, g = torch.split(wh_b + wi, split_size_or_sections=self.hidden_size, dim=1)
    i = torch.sigmoid(i)
    g = torch.tanh(g)
    o = torch.sigmoid(o)
    c_num = len(c_input)
    if c_num == 0:
        f = 1 - i
        c_1 = f * c_0 + i * g
        h_1 = o * torch.tanh(c_1)
    else:
        c_input_var = torch.cat(c_input, 0)
        alpha_bias_batch = (self.alpha_bias.unsqueeze(0).expand(batch_size, *self.alpha_bias.size()))
        c_input_var = c_input_var.squeeze(1)  ## (c_num, hidden_dim)
        alpha_wi = torch.addmm(self.alpha_bias, input_, self.alpha_weight_ih).expand(c_num, self.hidden_size)
        alpha_wh = torch.mm(c_input_var, self.alpha_weight_hh)
        alpha = torch.sigmoid(alpha_wi + alpha_wh)
        ## alpha = i concat alpha
        alpha = torch.exp(torch.cat([i, alpha], 0))
        alpha_sum = alpha.sum(0)
        ## alpha = softmax for each hidden element
        alpha = torch.div(alpha, alpha_sum)
        merge_i_c = torch.cat([g, c_input_var], 0)
        c_1 = merge_i_c * alpha
        c_1 = c_1.sum(0).unsqueeze(0)
        h_1 = o * torch.tanh(c_1)
    return h_1, c_1
def forward(ctx, input1, input2, weight, bias=None):
    ctx.save_for_backward(input1, input2, weight, bias)

    output = input1.new(input1.size(0), weight.size(0))
    buff = input1.new()

    # compute output scores:
    for k, w in enumerate(weight):
        torch.mm(input1, w, out=buff)
        buff.mul_(input2)
        torch.sum(buff, 1, keepdim=True, out=output.narrow(1, k, 1))

    if bias is not None:
        output.add_(bias.expand_as(output))

    return output
def test_shape(di, dj, dk):
    x, _, _ = self._gen_sparse(2, 20, [di, dj])
    t = torch.randn(di, dk)
    y = torch.randn(dj, dk)
    alpha = random.random()
    beta = random.random()

    res = torch.addmm(alpha, t, beta, x, y)
    expected = torch.addmm(alpha, t, beta, self.safeToDense(x), y)
    self.assertEqual(res, expected)

    res = torch.addmm(t, x, y)
    expected = torch.addmm(t, self.safeToDense(x), y)
    self.assertEqual(res, expected)

    res = torch.mm(x, y)
    expected = torch.mm(self.safeToDense(x), y)
    self.assertEqual(res, expected)
def test_shape(di, dj, dk):
    x = self._gen_sparse(2, 20, [di, dj])[0]
    y = self.randn(dj, dk)

    res = torch.hsmm(x, y)
    # TODO: use self.safeToDense(), but this triggers
    # https://github.com/pytorch/pytorch/issues/3170
    expected = torch.mm(x.to_dense(), y)
    self.assertEqual(res.to_dense(), expected)
def forward(self, user_X, item_X):
    # ----------------------------------------GCN layer----------------------------------------

    user_X = self.sparse_dropout(user_X)
    item_X = self.sparse_dropout(item_X)

    embeddings = []
    if self.accum == 'sum':
        wu = 0.
        wv = 0.
        for i in range(self.num_support):
            # weight sharing
            wu = self.weights_u[i] + wu
            wv = self.weights_v[i] + wv

            # multiply feature matrices with weights
            if self.sparse_feature:
                temp_u = torch.sparse.mm(user_X, wu)
                temp_v = torch.sparse.mm(item_X, wv)
            else:
                temp_u = torch.mm(user_X, wu)
                temp_v = torch.mm(item_X, wv)
            all_embedding = torch.cat([temp_u, temp_v])

            # then multiply with adj matrices
            graph_A = self.support[i]
            all_emb = torch.sparse.mm(graph_A, all_embedding)
            embeddings.append(all_emb)

        embeddings = torch.stack(embeddings, dim=1)
        embeddings = torch.sum(embeddings, dim=1)
    else:
        for i in range(self.num_support):
            # multiply feature matrices with weights
            if self.sparse_feature:
                temp_u = torch.sparse.mm(user_X, self.weights_u[i])
                temp_v = torch.sparse.mm(item_X, self.weights_v[i])
            else:
                temp_u = torch.mm(user_X, self.weights_u[i])
                temp_v = torch.mm(item_X, self.weights_v[i])
            all_embedding = torch.cat([temp_u, temp_v])

            # then multiply with adj matrices
            graph_A = self.support[i]
            all_emb = torch.sparse.mm(graph_A, all_embedding)
            embeddings.append(all_emb)

        embeddings = torch.cat(embeddings, dim=1)

    users, items = torch.split(embeddings, [self.num_users, self.num_items])

    u_hidden = self.activate(users)
    v_hidden = self.activate(items)

    # ----------------------------------------Dense Layer----------------------------------------

    u_hidden = self.dropout(u_hidden)
    v_hidden = self.dropout(v_hidden)

    u_hidden = self.dense_layer_u(u_hidden)
    v_hidden = self.dense_layer_u(v_hidden)

    u_outputs = self.dense_activate(u_hidden)
    v_outputs = self.dense_activate(v_hidden)

    return u_outputs, v_outputs
epoch = 5000  # Setting training iterations
lr = 0.1  # Setting learning rate
inputlayer_neurons = X.shape[1]  # number of features in data set
hiddenlayer_neurons = 3  # number of hidden layer neurons
output_neurons = 1  # number of neurons at output layer

# weight and bias initialization
wh = torch.randn(inputlayer_neurons, hiddenlayer_neurons).type(torch.FloatTensor)
bh = torch.randn(1, hiddenlayer_neurons).type(torch.FloatTensor)
wout = torch.randn(hiddenlayer_neurons, output_neurons)
bout = torch.randn(1, output_neurons)

for i in range(epoch):
    # Forward Propagation
    hidden_layer_input1 = torch.mm(X, wh)
    hidden_layer_input = hidden_layer_input1 + bh
    hidden_layer_activations = sigmoid(hidden_layer_input)
    output_layer_input1 = torch.mm(hidden_layer_activations, wout)
    output_layer_input = output_layer_input1 + bout
    # use the bias-added pre-activation, not output_layer_input1
    output = sigmoid(output_layer_input)

    # Backpropagation
    E = y - output
    slope_output_layer = derivatives_sigmoid(output)
    slope_hidden_layer = derivatives_sigmoid(hidden_layer_activations)
    d_output = E * slope_output_layer
    Error_at_hidden_layer = torch.mm(d_output, wout.t())
    d_hiddenlayer = Error_at_hidden_layer * slope_hidden_layer
    wout += torch.mm(hidden_layer_activations.t(), d_output) * lr
    # the remaining parameter updates, completing the backward pass
    bout += d_output.sum() * lr
    wh += torch.mm(X.t(), d_hiddenlayer) * lr
    bh += d_hiddenlayer.sum() * lr
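# The loop above calls sigmoid() and derivatives_sigmoid() without defining
# them. A minimal sketch of the two helpers it appears to assume (they would
# need to be defined before the training loop runs):
def sigmoid(x):
    return 1 / (1 + torch.exp(-x))

def derivatives_sigmoid(x):
    # derivative of the sigmoid, expressed in terms of its own output
    return x * (1 - x)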
def sample_v(self, y):  # y stands for hidden nodes
    # since the weight matrix is laid out for p_v_given_h, no transpose is needed here
    wy = torch.mm(y, self.W)
    activation = wy + self.b.expand_as(wy)
    p_v_given_h = torch.sigmoid(activation)
    return p_v_given_h, torch.bernoulli(p_v_given_h)
# mean
print(
    '\nmean',
    '\nnumpy: ', np.mean(data),      # 0.0
    '\ntorch: ', torch.mean(tensor)  # 0.0
)

# matrix multiplication
data = [[1, 2], [3, 4]]
tensor = torch.FloatTensor(data)  # 32-bit floating point
# correct method
print(
    '\nmatrix multiplication (matmul)',
    '\nnumpy: ', np.matmul(data, data),    # [[7, 10], [15, 22]]
    '\ntorch: ', torch.mm(tensor, tensor)  # [[7, 10], [15, 22]]
)
# incorrect method
data = np.array(data)
print(
    '\nmatrix multiplication (dot)',
    '\nnumpy: ', data.dot(data),     # [[7, 10], [15, 22]]
    '\ntorch: ', tensor.dot(tensor)  # this flattens the tensor to [1, 2, 3, 4], so you'll get 30.0
)
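# Added note (not part of the original tutorial): torch.matmul() follows
# numpy-style semantics, so it matches np.matmul() on 2-D inputs and also
# broadcasts over leading batch dimensions on higher-rank inputs:
print('\ntorch.matmul: ', torch.matmul(tensor, tensor))  # [[7, 10], [15, 22]]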
def train(self):
    """Train generator and discriminator."""
    fixed_noise = self.to_variable(torch.randn(self.batch_size, self.z_dim))
    total_step = len(self.data_loader)
    for epoch in range(self.num_epochs):
        for i, images in enumerate(self.data_loader):

            # ===================== Train D =====================#
            images = self.to_variable(images)
            batch_size = images.size(0)
            noise = self.to_variable(torch.randn(batch_size, self.z_dim))

            # Train D to recognize real images as real.
            outputs = self.discriminator(images)
            real_loss = torch.mean(torch.sum((outputs - images) ** 2, 1))

            # Train D to recognize fake images as fake.
            fake_images = self.generator(noise)
            outputs = self.discriminator(fake_images)
            fake_loss = torch.mean(torch.sum((outputs - fake_images) ** 2, 1))

            # Backprop + optimize
            d_loss = real_loss + torch.nn.functional.relu(1 - fake_loss)  # 1 is the margin
            self.discriminator.zero_grad()
            d_loss.backward()
            self.d_optimizer.step()

            # ===================== Train G =====================#
            noise = self.to_variable(torch.randn(batch_size, self.z_dim))

            # Train G so that D recognizes G(z) as real.
            fake_images = self.generator(noise)
            outputs = self.discriminator(fake_images)
            g_loss = torch.mean(torch.sum((outputs - fake_images) ** 2, 1))

            # Generator PT regularizer term
            sample = fake_images.view(-1, batch_size)  # 12288 x 32
            nom = torch.mm(torch.transpose(sample, 0, 1), sample)  # 32 x 32
            denoms = torch.zeros((64 * 64 * 3, batch_size))
            denom_column = torch.sqrt(torch.sum(torch.pow(sample, 2), 0))
            denoms[0, :] = denom_column.data
            denom = torch.mm(torch.transpose(denoms, 0, 1), denoms)  # should be 32 x 32
            denom = denom.cuda()
            pt = torch.pow(torch.div(nom.data, denom), 2)  # 32 x 32

            # Remove diagonal terms
            pt -= torch.diag(torch.diag(pt, 0))

            # Final PT value
            pt = torch.sum(pt) / (batch_size * (batch_size - 1))
            g_loss = g_loss + 0.1 * pt

            # Backprop + optimize
            self.generator.zero_grad()
            g_loss.backward()
            self.g_optimizer.step()

            # print the log info
            if (i + 1) % self.log_step == 0:
                print('Epoch [%d/%d], Step[%d/%d], d_real_loss: %.4f, '
                      'd_fake_loss: %.4f, g_loss: %.4f'
                      % (epoch + 1, self.num_epochs, i + 1, total_step,
                         real_loss.data[0], fake_loss.data[0], g_loss.data[0]))

            # save the sampled images
            if (i + 1) % self.sample_step == 0:
                fake_images = self.generator(fixed_noise)
                torchvision.utils.save_image(
                    self.denorm(fake_images.data),
                    os.path.join(self.sample_path,
                                 'fake_samples-%d-%d.png' % (epoch + 1, i + 1)))

        # save the model parameters for each epoch
        g_path = os.path.join(self.model_path, 'generator-%d.pkl' % (epoch + 1))
        d_path = os.path.join(self.model_path, 'discriminator-%d.pkl' % (epoch + 1))
        torch.save(self.generator.state_dict(), g_path)
        torch.save(self.discriminator.state_dict(), d_path)
def non_max_suppression(prediction, conf_thres=0.25, iou_thres=0.45, classes=None,
                        agnostic=False, multi_label=False, labels=()):
    """Runs Non-Maximum Suppression (NMS) on inference results

    Returns:
        list of detections, one (n, 6) tensor per image [xyxy, conf, cls]
    """

    nc = prediction.shape[2] - 5  # number of classes
    xc = prediction[..., 4] > conf_thres  # candidates

    # Settings
    min_wh, max_wh = 2, 4096  # (pixels) minimum and maximum box width and height
    max_det = 300  # maximum number of detections per image
    max_nms = 30000  # maximum number of boxes into torchvision.ops.nms()
    time_limit = 10.0  # seconds to quit after
    redundant = True  # require redundant detections
    multi_label &= nc > 1  # multiple labels per box (adds 0.5ms/img)
    merge = False  # use merge-NMS

    t = time.time()
    output = [torch.zeros((0, 6), device=prediction.device)] * prediction.shape[0]
    for xi, x in enumerate(prediction):  # image index, image inference
        # Apply constraints
        # x[((x[..., 2:4] < min_wh) | (x[..., 2:4] > max_wh)).any(1), 4] = 0  # width-height
        x = x[xc[xi]]  # confidence

        # Cat apriori labels if autolabelling
        if labels and len(labels[xi]):
            l = labels[xi]
            v = torch.zeros((len(l), nc + 5), device=x.device)
            v[:, :4] = l[:, 1:5]  # box
            v[:, 4] = 1.0  # conf
            v[range(len(l)), l[:, 0].long() + 5] = 1.0  # cls
            x = torch.cat((x, v), 0)

        # If none remain process next image
        if not x.shape[0]:
            continue

        # Compute conf
        x[:, 5:] *= x[:, 4:5]  # conf = obj_conf * cls_conf

        # Box (center x, center y, width, height) to (x1, y1, x2, y2)
        box = xywh2xyxy(x[:, :4])

        # Detections matrix nx6 (xyxy, conf, cls)
        if multi_label:
            i, j = (x[:, 5:] > conf_thres).nonzero(as_tuple=False).T
            x = torch.cat((box[i], x[i, j + 5, None], j[:, None].float()), 1)
        else:  # best class only
            conf, j = x[:, 5:].max(1, keepdim=True)
            x = torch.cat((box, conf, j.float()), 1)[conf.view(-1) > conf_thres]

        # Filter by class
        if classes is not None:
            x = x[(x[:, 5:6] == torch.tensor(classes, device=x.device)).any(1)]

        # Apply finite constraint
        # if not torch.isfinite(x).all():
        #     x = x[torch.isfinite(x).all(1)]

        # Check shape
        n = x.shape[0]  # number of boxes
        if not n:  # no boxes
            continue
        elif n > max_nms:  # excess boxes
            x = x[x[:, 4].argsort(descending=True)[:max_nms]]  # sort by confidence

        # Batched NMS
        c = x[:, 5:6] * (0 if agnostic else max_wh)  # classes
        boxes, scores = x[:, :4] + c, x[:, 4]  # boxes (offset by class), scores
        i = torchvision.ops.nms(boxes, scores, iou_thres)  # NMS
        if i.shape[0] > max_det:  # limit detections
            i = i[:max_det]
        if merge and (1 < n < 3E3):  # Merge NMS (boxes merged using weighted mean)
            # update boxes as boxes(i,4) = weights(i,n) * boxes(n,4)
            iou = box_iou(boxes[i], boxes) > iou_thres  # iou matrix
            weights = iou * scores[None]  # box weights
            x[i, :4] = torch.mm(weights, x[:, :4]).float() / weights.sum(1, keepdim=True)  # merged boxes
            if redundant:
                i = i[iou.sum(1) > 1]  # require redundancy

        output[xi] = x[i]
        if (time.time() - t) > time_limit:
            print(f'WARNING: NMS time limit {time_limit}s exceeded')
            break  # time limit exceeded

    return output
def forward(self, input):
    x = input.view(-1, visible_size)
    x = torch.sigmoid(torch.mm(x, self.encoder))
    x = torch.sigmoid(torch.mm(x, torch.transpose(self.encoder, 0, 1)))
    return x.view_as(input)
def model(self, x):
    x = F.relu(torch.add(torch.mm(x, self.weights[0]), self.weights[1]))
    x = F.relu(torch.add(torch.mm(x, self.weights[2]), self.weights[3]))
    x = F.softmax(torch.add(torch.mm(x, self.weights[4]), self.weights[5]))
    return x
def net(x):
    h = relu(torch.mm(x.view((-1, num_inputs)), w1) + b1)
    # h = torch.tensor(h, dtype=torch.float, requires_grad=True)
    return softmax(torch.mm(h, w2) + b2)
def forward(self, src_inputs, src_mask, src_langs, tgt_inputs, tgt_mask, tgt_langs,
            src_neg_inputs=None, src_neg_mask=None, src_neg_langs=None,
            tgt_neg_inputs=None, tgt_neg_mask=None, tgt_neg_langs=None,
            normalize: bool = False):
    "Take in and process masked src and target sequences."
    device = self.encoder.embeddings.word_embeddings.weight.device
    src_langs = src_langs.unsqueeze(-1).expand(-1, src_inputs.size(-1))
    src_inputs = src_inputs.to(device)
    src_langs = src_langs.to(device)
    if src_mask.device != device:
        src_mask = src_mask.to(device)
    src_embed = self.encode(src_inputs, src_mask, src_langs)

    tgt_langs = tgt_langs.unsqueeze(-1).expand(-1, tgt_inputs.size(-1)).to(device)
    if tgt_inputs.device != device:
        tgt_inputs = tgt_inputs.to(device)
        tgt_mask = tgt_mask.to(device)
    tgt_embed = self.encode(tgt_inputs, tgt_mask, tgt_langs)

    src_norm = torch.norm(src_embed, dim=-1, p=2).unsqueeze(-1) + 1e-4
    src_embed = torch.div(src_embed, src_norm)
    tgt_norm = torch.norm(tgt_embed, dim=-1, p=2).unsqueeze(-1) + 1e-4
    tgt_embed = torch.div(tgt_embed, tgt_norm)

    if normalize:
        if src_neg_langs is not None:
            src_neg_langs = src_neg_langs.unsqueeze(-1).expand(-1, src_neg_inputs.size(-1))
            src_neg_inputs = src_neg_inputs.to(device)
            src_neg_langs = src_neg_langs.to(device)
            if src_neg_mask.device != device:
                src_neg_mask = src_neg_mask.to(device)
            src_neg_embed = self.encode(src_neg_inputs, src_neg_mask, src_neg_langs)
            src_neg_norm = torch.norm(src_neg_embed, dim=-1, p=2).unsqueeze(-1) + 1e-4
            src_neg_embed = torch.div(src_neg_embed, src_neg_norm)

            tgt_neg_langs = tgt_neg_langs.unsqueeze(-1).expand(-1, tgt_neg_inputs.size(-1))
            tgt_neg_inputs = tgt_neg_inputs.to(device)
            tgt_neg_langs = tgt_neg_langs.to(device)
            if tgt_neg_mask.device != device:
                tgt_neg_mask = tgt_neg_mask.to(device)
            tgt_neg_embed = self.encode(tgt_neg_inputs, tgt_neg_mask, tgt_neg_langs)
            tgt_neg_norm = torch.norm(tgt_neg_embed, dim=-1, p=2).unsqueeze(-1) + 1e-4
            tgt_neg_embed = torch.div(tgt_neg_embed, tgt_neg_norm)

            tgt_neg_embd = torch.cat([tgt_neg_embed, tgt_embed])
            src_neg_embd = torch.cat([src_neg_embed, src_embed])

            nominator = torch.sum(src_embed * tgt_embed, dim=-1) + 1e-4
            cross_dot = torch.mm(src_embed, tgt_neg_embd.T)
            cross_dot_rev = torch.mm(tgt_embed, src_neg_embd.T)
            cross_dot_all = torch.cat([cross_dot, cross_dot_rev], dim=1)
            denom = torch.log(torch.sum(torch.exp(cross_dot_all), dim=-1) + 1e-4)
            log_neg = torch.sum(denom - nominator) / len(cross_dot)
        else:
            cross_dot = torch.mm(src_embed, tgt_embed.T)
            denom = torch.log(torch.sum(torch.exp(cross_dot), dim=-1) + 1e-4)
            nominator = torch.diagonal(cross_dot[:, :], 0) + 1e-4
            log_neg = torch.sum(denom - nominator) / len(cross_dot)
        return log_neg
    else:
        dot_prod = torch.sum(src_embed * tgt_embed, dim=-1)
        return dot_prod
def P(z):
    h = nn.relu(torch.mm(z, Wzh) + bzh.repeat(z.size(0), 1))
    X = nn.sigmoid(torch.mm(h, Whx) + bhx.repeat(h.size(0), 1))
    return X
# # > **Exercise:** Flatten the batch of images `images`. Then build a multi-layer network with 784 input units, 256 hidden units, and 10 output units using random tensors for the weights and biases. For now, use a sigmoid activation for the hidden layer. Leave the output layer without an activation; we'll add one that gives us a probability distribution next.

# In[5]:

def activation(x):
    return 1 / (1 + torch.exp(-x))

inputs = images.view(images.shape[0], -1)
w1 = torch.randn(784, 256)
b1 = torch.randn(256)
w2 = torch.randn(256, 10)
b2 = torch.randn(10)
h = activation(torch.mm(inputs, w1) + b1)
out = torch.mm(h, w2) + b2
print(out)

# Now we have 10 outputs for our network. We want to pass in an image to our network and get out a probability distribution over the classes that tells us the likely class(es) the image belongs to. Something that looks like this:
# <img src='assets/image_distribution.png' width=500px>
#
# Here we see that the probability for each class is roughly the same. This represents an untrained network: it hasn't seen any data yet, so it just returns a uniform distribution with equal probabilities for each class.
#
# To calculate this probability distribution, we often use the [**softmax** function](https://en.wikipedia.org/wiki/Softmax_function). Mathematically this looks like
#
# $$
# \Large \sigma(x_i) = \cfrac{e^{x_i}}{\sum_k^K{e^{x_k}}}
# $$
#
# What this does is squish each input $x_i$ between 0 and 1 and normalize the values to give you a proper probability distribution where the probabilities sum up to one.
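# A direct translation of the formula above (a sketch; exponentiate the logits,
# then divide each row by its sum so every row becomes a proper probability
# distribution). Summing across dim=1 with a (-1, 1) view keeps the division
# broadcasting over the (batch, 10) output:

def softmax(x):
    return torch.exp(x) / torch.sum(torch.exp(x), dim=1).view(-1, 1)

probabilities = softmax(out)
print(probabilities.sum(dim=1))  # each row should sum to 1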
def initialize_weights_(self, Dict=None, L1_weight=None, init_type='ista', mu=None):
    """
    Fully initializes the encoder using given weight matrices
    (or randomly, if none are given).
    """
    self.init_type = init_type

    # fix-up L1 weights and mu's.
    if self.L1_weight is None:
        if L1_weight is None:
            print('Using default L1 weight (0.1).')
            self.L1_weight = 0.1
        else:
            self.L1_weight = L1_weight
    if self.mu is None:
        if mu is None:
            self.mu = 1
            if init_type == 'salsa':
                print('Using default mu value (1).')
        else:
            self.mu = mu

    # If a dictionary is not provided for initialization, initialize randomly.
    if Dict is None:
        Dict = dictionary(self.data_size, self.code_size, use_cuda=False)

    #-------------------------------------
    # Initialize the loss function.
    self.initialize_cvx_lossFcn_(Dict)

    #-------------------------------------
    # Initialize ISTA-style (first order).
    Wd = Dict.getDecWeights().cpu()
    if init_type == 'ista':
        # Get the maximum eigenvalue.
        Dict.getMaxEigVal()
        self.L = Dict.maxEig
        # Initialize.
        self.We.weight.data = (1 / self.L) * (Wd.detach()).t()
        self.S.weight.data = torch.eye(Dict.n) - (1 / self.L) * (torch.mm(Wd.t(), Wd)).detach()
        self.thresh = (self.L1_weight / self.L)
        # Set up the nonlinearity, aka soft-thresholding function.

    #-------------------------------------
    # Initialize FISTA-style (first order).
    elif init_type == 'fista':
        # Get the maximum eigenvalue.
        Dict.getMaxEigVal()
        self.L = Dict.maxEig
        # Initialize.
        self.We.weight.data = Wd.detach().t()
        self.thresh = (self.L1_weight / self.L)
        # Set up the nonlinearity, aka soft-thresholding function.

    #---------------------------------------
    # Initialize SALSA-style (second order).
    elif init_type == 'salsa':
        # Initialize matrices.
        self.We.weight.data = Wd.detach().t()
        AA = torch.mm(Wd.t(), Wd).cpu()
        S_weights = (self.mu * torch.eye(Dict.n) + AA).inverse()
        self.S.weight.data = S_weights.detach()
        self.thresh = (self.L1_weight / self.mu)
        # Set up the nonlinearity, aka soft-thresholding function.
    else:
        raise ValueError(
            'Encoders can only be initialized for "ista" and "salsa" like families.')

    #-------------------------------------
    # Print status of the newly created encoder.
    # print('Encoder, threshold, and loss functions are initialized for {}-type algorithms.'.format(init_type))

    #-------------------------------------
    # Finally, put to device if requested.
    self.We = self.We.to(self.device)
    self.S = self.S.to(self.device)
def forward(self, query, context, context_mask):
    batch_size, item_len, dimensions = context.size()

    # mask context
    # In (batch_size, item_len)
    # Out (batch_size, item_len, 1)
    context_masked = context * context_mask.unsqueeze(2)

    # element-wise matrix product
    # In (batch_size, 1, dimensions) * (batch_size, item_len, dimensions) ->
    # Out (batch_size, item_len, dimensions)
    pq = query.unsqueeze(1) * context_masked  # (batch_size, item_len, dimensions)
    pq_ = pq.view(-1, dimensions)  # (batch_size * item_len, dimensions)

    # a linear layer
    # In (batch_size * item_len, dimensions)
    # Out (batch_size * item_len, dimensions)
    linear_output = self.linear_layer(pq_)
    if self.activation == 0:
        linear_output_ = torch.relu(linear_output)
    elif self.activation == 1:
        linear_output_ = torch.sigmoid(linear_output)
    elif self.activation == 2:
        linear_output_ = torch.tanh(linear_output)

    # attention score
    # In (batch_size * item_len, dimensions), (dimensions, 1)
    # Out (batch_size * item_len, 1), reshaped to (batch_size, item_len)
    A_1 = torch.mm(linear_output_, self.h).view(batch_size, item_len)

    # use a mask to filter data: keep mask(1) and clear mask(0)
    # In (batch_size, item_len) and (batch_size, item_len)
    # Out (batch_size, item_len)
    A = A_1 * context_mask

    # through softmax for normalization
    # In (batch_size, item_len)
    # Out (batch_size, item_len)
    if self.beta == 1:
        attention_weight = self.softmax(A)  # (5, 7)
        attention_weight_ = attention_weight.unsqueeze(2)
    else:
        # compute softmax on non-zero rows only
        A_without_zero = A.sum(1) != 0
        if A_without_zero.sum() != A.shape[0]:  # there exist zero rows
            A_rest = A[A_without_zero.to(torch.device('cpu'))]
            context_mask_rest = context_mask[A_without_zero.to(torch.device('cpu'))]
            # compute the upper part of \frac{exp(f(i,j))}{\sum_k exp(f(i,j))}
            A_ = torch.exp(A_rest) * context_mask_rest
            # compute the lower part
            smoothing_softmax_denominator = A_.sum(1).pow(-self.beta)
            smoothing_softmax_denominator_ = smoothing_softmax_denominator.unsqueeze(1)
            # compute \frac{exp(f(i,j))}{\sum_k exp(f(i,j))}
            attention_weight = A_ * smoothing_softmax_denominator_
            # restore results
            attention_weight_ = torch.zeros(A.shape).to(self.device)
            attention_weight_[A_without_zero.to(torch.device('cpu'))] = attention_weight
            # In (batch_size, item_len)
            # Out (batch_size, item_len, 1)
            attention_weight_ = attention_weight_.unsqueeze(2)
        else:
            # a_ij = \frac{exp(f(p_i, q_j))}{(\sum_j exp(f(p_i, q_j)))^beta}
            # multiplying by the mask here is important to avoid an unnecessary sum
            A_ = torch.exp(A) * context_mask
            # get the sum in the denominator
            smoothing_softmax_denominator = A_.sum(1).pow(-self.beta)
            # In (batch)
            # Out (batch, 1)
            smoothing_softmax_denominator_ = smoothing_softmax_denominator.unsqueeze(1)
            # if A[i,:] is all zero, the weight will grow infinitely large, so filter them again.
            attention_weight = A_ * smoothing_softmax_denominator_
            # In (batch_size, item_len)
            # Out (batch_size, item_len, 1)
            attention_weight_ = attention_weight.unsqueeze(2)  # (5, 7, 1)

    # set up the final attended score by element-wise matrix product
    # In (batch_size, item_len, 1) * (batch_size, item_len, dimensions)
    # Out (batch_size, item_len, dimensions)
    ret = attention_weight_ * context_masked  # (5, 7, 10)

    # sum all items
    # In (batch_size, item_len, dimensions)
    # Out (batch_size, dimensions)
    output = ret.sum(1)  # (5, 10), the final result

    return output, attention_weight
def payload(self):
    x = torch.randn(10, 10).cuda()
    y = torch.randn(10, 10).cuda()
    z = torch.mm(x, y)
    z = z + y
    z = z.cpu()
def train(self, v0, vk, ph0, phk):
    self.W += torch.mm(v0.t(), ph0) - torch.mm(vk.t(), phk)
    self.b += torch.sum((v0 - vk), 0)
    self.a += torch.sum((ph0 - phk), 0)
def forward(self, X, A_hat):  ### 1-layer GCN architecture
    X = torch.mm(X, self.weight)
    if self.bias is not None:
        X = (X + self.bias)
    X = F.relu(torch.mm(A_hat, X))
    return X
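# The layer above expects a pre-normalized adjacency A_hat. How it was built
# is not shown in the source; a common choice (sketched here as an assumption)
# is the Kipf & Welling GCN normalization A_hat = D^{-1/2} (A + I) D^{-1/2},
# for a dense adjacency matrix A:
def normalize_adjacency(A):
    A_tilde = A + torch.eye(A.size(0))    # add self-loops, so degrees are >= 1
    d = A_tilde.sum(1)                    # degree vector of A + I
    D_inv_sqrt = torch.diag(d.pow(-0.5))  # D^{-1/2}
    return torch.mm(torch.mm(D_inv_sqrt, A_tilde), D_inv_sqrt)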
def logistic_regression(x):
    return torch.sigmoid(torch.mm(x, w) + b)
def train(self, v0, vk, ph0, phk):
    self.W += torch.mm(v0.t(), ph0) - torch.mm(vk.t(), phk)
    # summing (v0 - vk) over dim 0 keeps the bias dimensions consistent
    self.b += torch.sum((v0 - vk), 0)
    self.a += torch.sum((ph0 - phk), 0)
def sample_h(self, x):
    wx = torch.mm(x, self.W.t())
    activation = wx + self.a.expand_as(wx)
    p_h_given_v = torch.sigmoid(activation)
    return p_h_given_v, torch.bernoulli(p_h_given_v)
def sample_h(self, x):  # x stands for visible nodes
    # torch.mm multiplies the two tensors; since the weight matrix is laid out
    # for p_v_given_h, we take the transpose of the weights here
    wx = torch.mm(x, self.W.t())
    # the bias self.a must be applied to every row of the batch, hence expand_as(wx)
    activation = wx + self.a.expand_as(wx)
    # probability that a hidden node is activated given the visible nodes
    p_h_given_v = torch.sigmoid(activation)
    # return the probabilities and a Bernoulli sample of the hidden nodes
    return p_h_given_v, torch.bernoulli(p_h_given_v)
def sample_v(self, y):
    wy = torch.mm(y, self.W)
    activation = wy + self.b.expand_as(wy)
    p_v_given_h = torch.sigmoid(activation)
    return p_v_given_h, torch.bernoulli(p_v_given_h)
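# How the three RBM methods above fit together: a minimal contrastive-
# divergence (CD-k) step, sketched under the assumption of an `rbm` object
# exposing sample_h, sample_v, and train as defined above, and a visible
# batch `v0` (this driver loop is not part of the original source):
def cd_k_step(rbm, v0, k=10):
    ph0, _ = rbm.sample_h(v0)       # positive-phase statistics
    vk = v0
    for _ in range(k):              # Gibbs chain: h ~ p(h|v), then v ~ p(v|h)
        _, hk = rbm.sample_h(vk)
        _, vk = rbm.sample_v(hk)
    phk, _ = rbm.sample_h(vk)       # negative-phase statistics
    rbm.train(v0, vk, ph0, phk)     # apply the contrastive-divergence update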
def forward(self, sentence, seq_lengths, mask, label_sent, label_mask):
    '''
    sentence: (batch, len)
    '''
    '''label descriptions'''
    label_embeds = self.word_embeddings_bow(label_sent)  # (12, len, emb_size)
    label_reps = torch.sum(label_embeds * label_mask.unsqueeze(2), dim=1)  # (12, emb_size)
    label_hidden_reps = (self.emb2hidden(label_reps)).tanh()  # (12, hidden_size)

    '''neural BOW'''
    embeds_bow = self.word_embeddings_bow(sentence)
    bow = torch.sum(embeds_bow * mask.unsqueeze(2), dim=1)  # (batch, emb_size)

    '''LSTM'''
    embeds_lstm = self.word_embeddings_bow(sentence)
    lstm_output = LSTM(embeds_lstm, seq_lengths, self.lstm, False)

    '''multi-channel CNN'''
    embeds_cnn = self.word_embeddings_bow(sentence)
    conv_output = multi_channel_conv_and_pool(embeds_cnn, mask, self.conv_1, self.conv_2)
    dot_cnn_dataless = (torch.mm(
        conv_output.reshape(2 * self.batch_size, self.hidden_dim),
        label_hidden_reps.t()).reshape(self.batch_size, 2 * self.tagset_size)).tanh()

    '''attentive convolution'''
    embeds_acnn = self.word_embeddings_bow(sentence)
    aconv_output = attentive_convolution(embeds_acnn, embeds_acnn, mask, mask,
                                         self.conv_self, self.conv_context)
    aconv_output2 = attentive_convolution(embeds_acnn, embeds_acnn, mask, mask,
                                          self.conv_self2, self.conv_context2)

    '''dataless'''
    dataless_cos = (cosine_two_matrices(bow, label_reps)).sigmoid()  # (batch, 12)

    '''dataless top-30 fine-grained cosine'''
    sent_side = embeds_bow * mask.unsqueeze(2)  # (batch, sent_len, emb_size)
    label_side = label_embeds * label_mask.unsqueeze(2)  # (12, label_len, emb_size)
    cosine_matrix = cosine_two_matrices(
        label_side.view(-1, self.emb_size),
        sent_side.view(-1, self.emb_size))  # (12*label_len, batch*sent_len)
    # print('cosine_matrix:', cosine_matrix)
    dot_prod_tensor4 = cosine_matrix.reshape(
        self.batch_size, sent_side.size(1), 12,
        label_side.size(1)).permute(0, 2, 3, 1)  # (batch, 12, label_len, sent_len)
    dot_prod_tensor3_new = dot_prod_tensor4.reshape(
        self.batch_size, 12,
        label_side.size(1) * sent_side.size(1))  # (batch, 12, label_len*sent_len)
    sorted, indices = torch.sort(dot_prod_tensor3_new, descending=True)
    top_k_sorted = sorted[:, :, :50]
    dataless_top_30 = top_k_sorted.mean(dim=-1)  # (batch, 12)
    # print('dataless_top_30:', top_k_sorted.var(dim=-1))

    '''combine all output representations'''
    '''len = self.emb_size + 3*self.hidden_dim + 4*self.tagset_size'''
    # combine_rep_batch = torch.cat([bow, lstm_output, conv_output, dataless_cos,
    #                                dataless_top_30, dot_cnn_dataless, aconv_output, aconv_output2], 1)
    combine_rep_batch = torch.cat([bow, aconv_output, aconv_output2, conv_output], 1)
    tag_space = self.hidden2tag(combine_rep_batch)
    tag_prob = tag_space.sigmoid()
    return tag_prob
def Q(X):
    h = nn.relu(torch.mm(X, Wxh) + bxh.repeat(X.size(0), 1))
    z_mu = torch.mm(h, Whz_mu) + bhz_mu.repeat(h.size(0), 1)
    z_var = torch.mm(h, Whz_var) + bhz_var.repeat(h.size(0), 1)
    return z_mu, z_var
start_time = time.time()

# loss_ = PMF()
# loss = loss_(rating_mat, user_features, movie_features)
optimizer = torch.optim.SGD([user_features, movie_features], lr=0.01, weight_decay=0.5)
pmferr = PMF(u_lambda=rating_var, v_lambda=rating_var)

for step, epoch in enumerate(range(10)):
    optimizer.zero_grad()
    loss = pmferr(rating_mat, user_features, movie_features)
    loss.backward()
    optimizer.step()
    if step % 50 == 0:
        print(f'Step {step}, {loss:.3f}')

dev_csv_path = 'data/dev.csv'
dev_df = pd.read_csv(dev_csv_path, names=['movie', 'user'])
file = open('eval/PMF_%d.txt' % latent_vectors, 'w')
for i in range(len(dev_df.movie)):
    dev_movie = dev_df.iloc[i].movie
    dev_user = dev_df.iloc[i].user
    pred = torch.mm(user_features[dev_user, :].view(1, -1), movie_features.t())
    # pred_rate = (pred*(max_rate-min_rate)+min_rate)
    pred_result = pred[0, dev_movie].data.tolist()
    file.writelines('%s\n' % (str(pred_result)))
    print('---------predicting for instance number %d' % i)
file.close()

print('%f secs elapsed' % (time.time() - start_time))
def gram_matrix(input):
    N, C, H, W = input.size()
    features = input.view(N * C, H * W)
    G = torch.mm(features, features.t())  # compute the gram product XX^t
    return G.div(N * C * H * W)  # normalize
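# Typical use of the Gram matrix in neural style transfer: the MSE between
# the Gram matrices of two feature maps gives the style loss. A sketch under
# the assumptions that `input` and `target` are feature maps from the same
# layer and that torch.nn.functional is imported as F:
def style_loss(input, target):
    G = gram_matrix(input)
    A = gram_matrix(target).detach()  # target statistics are fixed, so detach
    return F.mse_loss(G, A)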
def forward(self, graph, x):
    output = torch.mm(x, self.weight)
    output = output + self.bias if self.bias is not None else output
    output = self.bn(output)
    return self.sigma(output)
def forward(self, features, all_phrase_ids, targets, precomp_boxes,
            precomp_score, precomp_det_label, image_scale, all_sent_sgs,
            all_sentences, image_unique_id, det_label_embedding):
    """
    :param features: feature maps from the backbone
    :param all_phrase_ids: phrase ids for each image
    :param targets: ground-truth boxes for each phrase
    :param precomp_boxes: precomputed proposals for each image
    :param all_sent_sgs: sentence scene graphs
    :return: predictions and losses; the first dimension indexes images
    """
    img_num_per_gpu = len(features)

    batch_decode_logits = []
    batch_topk_decoder_logits = []
    batch_pred_similarity = []
    batch_precomp_boxes = []
    batch_topk_precomp_boxes = []
    batch_pred_boxes = []
    batch_topk_pred_boxes = []
    batch_topk_fusion_pred_boxes = []
    batch_topk_pred_similarity = []
    batch_topk_fusion_similarity = []
    batch_boxes_targets = []
    batch_ctx_embed = []
    batch_ctx_s1_embed = []
    batch_rel_reconst_s0 = []
    batch_rel_reconst_s1 = []
    batch_rel_cls_s0 = []
    batch_rel_cls_s1 = []
    batch_rel_cls_gt = []
    batch_pred_targets = []
    batch_topk_pred_targets = []

    # Language embedding
    batch_phrase_ids, batch_phrase_types, batch_phrase_embed, batch_phrase_len, \
        batch_phrase_dec_ids, batch_phrase_mask, batch_decoder_word_embed, \
        batch_phrase_glove_embed, batch_relation_conn, batch_sent_embed, \
        batch_decoder_rel_word_embed, batch_rel_mask, batch_rel_dec_idx = \
        self.phrase_embed(all_sentences, all_phrase_ids, all_sent_sgs)

    h, w = features.shape[-2:]
    self.storage = get_event_storage()

    for bid in range(img_num_per_gpu):
        # Visual embedding
        precomp_boxes_bid = precomp_boxes[bid].to(self.device)  # (100, 4)
        order = []
        for phr_ids in batch_phrase_ids[bid]:
            order.append(all_phrase_ids[bid].index(phr_ids))
        target_filter = targets[bid][np.array(order)]
        batch_boxes_targets.append(target_filter.to(self.device))
        batch_precomp_boxes.append(precomp_boxes_bid)

        img_feat_bid = features[[bid]]
        visual_features_bid = self.rcnn_top(
            self.det_roi_pooler([img_feat_bid], [precomp_boxes_bid])
        ).mean(dim=[2, 3]).contiguous()

        if cfg.MODEL.VG.SPATIAL_FEAT:
            spa_feat = meshgrid_generation(h, w)
            spa_feat = self.det_roi_pooler(
                [spa_feat], [precomp_boxes_bid]).view(
                    visual_features_bid.shape[0], -1)
            spa_feat = self.spatial_trans(spa_feat)
            visual_features_bid = torch.cat(
                (visual_features_bid, spa_feat), dim=1)

        visual_features_bid = self.visual_embedding(visual_features_bid)
        visual_features_bid = self.vis_batchnorm(visual_features_bid)

        # Noun phrase embedding
        phrase_embed_bid = batch_phrase_embed[bid]
        if phrase_embed_bid.shape[0] == 1 and self.training:
            # BatchNorm needs more than one sample in training mode,
            # so duplicate the single phrase and keep only the first row
            phrase_embed_bid = self.phr_batchnorm(
                phrase_embed_bid.repeat(2, 1))[[0]]
        else:
            phrase_embed_bid = self.phr_batchnorm(phrase_embed_bid)

        # Similarity and attention prediction
        num_box = precomp_boxes_bid.tensor.size(0)
        num_phrase = phrase_embed_bid.size(0)
        phr_inds, obj_inds = self.make_pair(num_phrase, num_box)
        pred_similarity_bid, pred_targets_bid = self.similarity(
            visual_features_bid, phrase_embed_bid, obj_inds, phr_inds)
        pred_similarity_bid = pred_similarity_bid.reshape(num_phrase, num_box)
        pred_targets_bid = pred_targets_bid.reshape(num_phrase, num_box, 4)
        batch_pred_targets.append(pred_targets_bid)

        if cfg.MODEL.VG.USING_DET_KNOWLEDGE:
            det_label_embedding_bid = det_label_embedding[bid].to(self.device)
            sim = self.cal_det_label_sim_max(det_label_embedding_bid,
                                             batch_phrase_glove_embed[bid])
            pred_similarity_bid = pred_similarity_bid * sim
            sim_mask = (sim > 0).float()
            atten_bid = numerical_stability_masked_softmax(
                pred_similarity_bid, sim_mask, dim=1)
        else:
            atten_bid = F.softmax(pred_similarity_bid, dim=1)

        # reconstruct visual features from the attention over proposals
        visual_reconst_bid = torch.mm(atten_bid, visual_features_bid)
        decode_phr_logits = self.phrase_decoder(
            visual_reconst_bid, batch_decoder_word_embed[bid])
        batch_decode_logits.append(decode_phr_logits)

        atten_score_topk, atten_ranking_topk = torch.topk(
            atten_bid, dim=1, k=self.s2_topk)  # (num_phrase, s2_topk)
        ind_phr_topk = np.arange(num_phrase).repeat(self.s2_topk)

        # ---------------------------------------------------- #
        # crop stage-2 features
        # ---------------------------------------------------- #
        if self.storage.iter <= cfg.SOLVER.REG_START_ITER:
            visual_features_topk_bid = visual_features_bid[
                atten_ranking_topk.reshape(-1)]
            precomp_boxes_topk_bid = precomp_boxes_bid[
                atten_ranking_topk.reshape(-1)]
            batch_topk_precomp_boxes.append(precomp_boxes_topk_bid)
        else:
            topk_box_ids = atten_ranking_topk.reshape(-1) + torch.as_tensor(
                ind_phr_topk, dtype=torch.long).to(self.device) * num_box
            precomp_boxes_tensor, box_size = \
                precomp_boxes_bid.tensor, precomp_boxes_bid.size
            precomp_boxes_topk_tensor = precomp_boxes_tensor[
                atten_ranking_topk.reshape(-1)]  # (num_phrase * s2_topk, 4)
            pred_targets_s0 = pred_targets_bid.view(-1, 4)[topk_box_ids]
            precomp_boxes_topk_bid = self.box2box_translation.apply_deltas(
                pred_targets_s0, precomp_boxes_topk_tensor)
            precomp_boxes_topk_bid = Boxes(precomp_boxes_topk_bid, box_size)
            precomp_boxes_topk_bid.clip()
            batch_topk_precomp_boxes.append(precomp_boxes_topk_bid)

            visual_features_topk_bid = self.rcnn_top(
                self.det_roi_pooler([img_feat_bid], [precomp_boxes_topk_bid])
            ).mean(dim=[2, 3]).contiguous()
            if cfg.MODEL.VG.SPATIAL_FEAT:
                spa_feat = meshgrid_generation(h, w)
                spa_feat = self.det_roi_pooler(
                    [spa_feat], [precomp_boxes_topk_bid]).view(
                        visual_features_topk_bid.shape[0], -1)
                spa_feat = self.spatial_trans(spa_feat)
                visual_features_topk_bid = torch.cat(
                    (visual_features_topk_bid, spa_feat), dim=1)
            visual_features_topk_bid = self.visual_embedding(
                visual_features_topk_bid)  # (num_phrase * s2_topk, 1024)
            visual_features_topk_bid = self.vis_batchnorm(
                visual_features_topk_bid)

        if cfg.MODEL.RELATION.IS_ON:
            relation_conn_bid = batch_relation_conn[bid]
            if len(relation_conn_bid) > 0:
                relation_conn_bids = [rel[:2] for rel in relation_conn_bid]
                phr_sub_idx, phr_obj_idx = torch.as_tensor(
                    relation_conn_bids).to(self.device).long().transpose(0, 1)
                # phrase context: attention-weighted sum of its top-k features
                visual_ctx_topk = (atten_score_topk.unsqueeze(2) *
                                   visual_features_topk_bid.reshape(
                                       num_phrase, self.s2_topk, -1)
                                   ).sum(1)  # (num_phrase, 1024)
                visual_trans = self.rel_trans(visual_ctx_topk)
                # scaled dot-product gate between related phrase pairs
                ent_gate = (visual_trans[phr_sub_idx] *
                            visual_trans[phr_obj_idx]).sum(1) / 512 ** 0.5
                gate_mat = torch.zeros([num_phrase, num_phrase]).to(self.device)
                gate_mat[phr_sub_idx, phr_obj_idx] = F.relu(ent_gate)
                gate_mat[phr_obj_idx, phr_sub_idx] = F.relu(ent_gate)
                gate_mask = (gate_mat != 0).float()
                gate_mat = numerical_stability_masked_softmax(
                    gate_mat, gate_mask, dim=1)
                cxt_feat = self.agg_trans(torch.mm(gate_mat, visual_ctx_topk))
                visual_features_topk_bid = visual_features_topk_bid + torch.bmm(
                    atten_score_topk.unsqueeze(2),
                    cxt_feat.unsqueeze(1)).view(-1, self.visual_embed_dim)

        pred_similarity_topk_bid, pred_targets_topk_bid = self.similarity_topk(
            visual_features_topk_bid, phrase_embed_bid, ind_phr_topk)
        pred_similarity_topk_bid = pred_similarity_topk_bid.reshape(
            num_phrase, self.s2_topk)
        pred_targets_topk_bid = pred_targets_topk_bid.reshape(
            num_phrase, self.s2_topk, 4)
        batch_topk_pred_targets.append(pred_targets_topk_bid)

        if cfg.MODEL.VG.USING_DET_KNOWLEDGE:
            sim_topk = torch.gather(sim, dim=1, index=atten_ranking_topk.long())
            sim_mask = (sim_topk > 0).float()
            pred_similarity_topk_bid = pred_similarity_topk_bid * sim_topk
            atten_topk_bid = numerical_stability_masked_softmax(
                pred_similarity_topk_bid, sim_mask, dim=1)
        else:
            atten_topk_bid = F.softmax(pred_similarity_topk_bid, dim=1)

        atten_fusion = atten_topk_bid * atten_score_topk  # (num_phrase, s2_topk)
        visual_features_topk_bid = visual_features_topk_bid.view(
            num_phrase, self.s2_topk, -1)
        visual_reconst_topk_bid = (atten_fusion.unsqueeze(2) *
                                   visual_features_topk_bid).sum(1)  # (num_phrase, 1024)
        decoder_phr_topk_logits = self.phrase_decoder(
            visual_reconst_topk_bid, batch_decoder_word_embed[bid])
        batch_topk_decoder_logits.append(decoder_phr_topk_logits)

        if cfg.MODEL.RELATION.IS_ON:
            relation_conn_bid = batch_relation_conn[bid]
            if len(relation_conn_bid) > 0:
                ent_idx = []
                rel_cates = torch.zeros([
                    len(relation_conn_bid), self.phrase_embed.rel_cater_size
                ]).to(self.device).float()
                for rel_i, conn in enumerate(relation_conn_bid):
                    phr_sub_id, phr_obj_id, rel_cate, rel_id = conn
                    rel_cates[rel_i][rel_cate] = 1  # one-hot target relation
                    ent_idx.append([phr_sub_id, phr_obj_id])
                phr_sub_ids, phr_obj_ids = torch.as_tensor(
                    ent_idx).long().to(self.device).transpose(0, 1)

                visual_rel_feats_s0 = torch.cat(
                    (visual_reconst_bid[phr_sub_ids],
                     visual_reconst_bid[phr_obj_ids]), dim=1)
                visual_rel_feats_s0 = self.relation_merge(visual_rel_feats_s0)
                rel_logits_s0 = self.vis2rel(visual_rel_feats_s0)
                batch_rel_cls_s0.append(rel_logits_s0)

                visual_rel_feats_s1 = torch.cat(
                    (visual_reconst_topk_bid[phr_sub_ids],
                     visual_reconst_topk_bid[phr_obj_ids]), dim=1)
                visual_rel_feats_s1 = self.relation_merge(visual_rel_feats_s1)
                rel_logits_s1 = self.vis2rel(visual_rel_feats_s1)
                batch_rel_cls_s1.append(rel_logits_s1)
                batch_rel_cls_gt.append(rel_cates)  # ground truth as cuda tensor
                batch_rel_reconst_s0.append(None)
                batch_rel_reconst_s1.append(None)
            else:
                batch_rel_cls_s0.append(None)
                batch_rel_cls_s1.append(None)
                batch_rel_reconst_s0.append(None)
                batch_rel_reconst_s1.append(None)
                batch_rel_cls_gt.append(None)
        else:
            batch_rel_cls_s0.append(None)
            batch_rel_cls_s1.append(None)
            batch_rel_reconst_s0.append(None)
            batch_rel_reconst_s1.append(None)
            batch_rel_cls_gt.append(None)

        # embeddings for the image-sentence discriminative loss
        batch_ctx_s1_embed.append(
            self.visual_mlp(visual_reconst_bid.mean(0, keepdim=True)))
        batch_ctx_embed.append(
            self.visual_mlp(visual_reconst_topk_bid.mean(0, keepdim=True)))

        batch_pred_similarity.append(atten_bid)
        batch_topk_pred_similarity.append(atten_topk_bid)
        batch_topk_fusion_similarity.append(atten_fusion)

        # transform boxes for stage-1
        num_phrase_indices = torch.arange(num_phrase).long().to(self.device)
        max_box_ind = atten_bid.detach().cpu().numpy().argmax(1)
        precomp_boxes_delta_max = pred_targets_bid[
            num_phrase_indices, max_box_ind]  # (num_phrase, 4)
        max_topk_id = torch.topk(atten_topk_bid, dim=1, k=1)[1].long().squeeze(1)
        precomp_boxes_delta_max_topk = pred_targets_topk_bid[
            num_phrase_indices, max_topk_id]  # (num_phrase, 4)
        precomp_boxes_topk_bid_tensor = precomp_boxes_topk_bid.tensor.reshape(
            -1, self.s2_topk, 4)
        max_fusion_topk_id = torch.topk(atten_fusion, dim=1,
                                        k=1)[1].long().squeeze()
        precomp_boxes_delta_max_topk_fusion = pred_targets_topk_bid[
            num_phrase_indices, max_fusion_topk_id]  # (num_phrase, 4)
        phr_index = torch.arange(num_phrase).to(self.device) * self.s2_topk

        if self.storage.iter <= cfg.SOLVER.REG_START_ITER:
            max_select_boxes = precomp_boxes_bid[max_box_ind]
            max_precomp_boxes = precomp_boxes_topk_bid[max_topk_id + phr_index]
            max_fusion_precomp_boxes = precomp_boxes_topk_bid[
                max_fusion_topk_id + phr_index]
        else:
            max_select_boxes = Boxes(
                self.box2box_translation.apply_deltas(
                    precomp_boxes_delta_max,
                    precomp_boxes_bid[max_box_ind].tensor),
                precomp_boxes_bid.size)
            max_precomp_boxes = Boxes(
                self.box2box_translation.apply_deltas(
                    precomp_boxes_delta_max_topk,
                    precomp_boxes_topk_bid_tensor[num_phrase_indices,
                                                  max_topk_id]),
                precomp_boxes_bid.size)
            max_fusion_precomp_boxes = Boxes(
                self.box2box_translation.apply_deltas(
                    precomp_boxes_delta_max_topk_fusion,
                    precomp_boxes_topk_bid_tensor[num_phrase_indices,
                                                  max_fusion_topk_id]),
                precomp_boxes_bid.size)

        batch_pred_boxes.append(max_select_boxes)
        batch_topk_pred_boxes.append(max_precomp_boxes)
        batch_topk_fusion_pred_boxes.append(max_fusion_precomp_boxes)

    batch_ctx_sim, batch_ctx_sim_s1 = self.generate_image_sent_discriminative(
        batch_sent_embed, batch_ctx_embed, batch_ctx_s1_embed)

    noun_reconst_loss, noun_topk_reconst_loss, disc_img_sent_loss_s1, \
        disc_img_sent_loss_s2, reg_loss, reg_loss_s1, rel_cls_loss, \
        rel_cls_loss_s1, rel_const_loss, rel_const_loss_s1 = self.VGLoss(
            batch_phrase_mask, batch_decode_logits, batch_topk_decoder_logits,
            batch_phrase_dec_ids, batch_ctx_sim, batch_ctx_sim_s1,
            batch_pred_similarity, batch_topk_pred_similarity,
            batch_boxes_targets, batch_precomp_boxes, batch_pred_targets,
            batch_topk_pred_targets, batch_topk_precomp_boxes,
            batch_rel_cls_s0, batch_rel_cls_s1, batch_rel_cls_gt,
            batch_rel_reconst_s0, batch_rel_reconst_s1, batch_rel_mask,
            batch_rel_dec_idx)

    all_loss = dict(noun_reconst_loss=noun_reconst_loss,
                    noun_topk_reconst_loss=noun_topk_reconst_loss,
                    disc_img_sent_loss_s1=disc_img_sent_loss_s1,
                    disc_img_sent_loss_s2=disc_img_sent_loss_s2,
                    reg_loss_s1=reg_loss,
                    reg_loss_s2=reg_loss_s1,
                    rel_cls_loss=rel_cls_loss,
                    rel_cls_loss_s1=rel_cls_loss_s1,
                    rel_const_loss=rel_const_loss,
                    rel_const_loss_s1=rel_const_loss_s1)

    if self.training:
        return all_loss, None
    else:
        return all_loss, (batch_phrase_ids, batch_phrase_types,
                          move2cpu(batch_pred_boxes),
                          move2cpu(batch_pred_similarity),
                          move2cpu(batch_boxes_targets),
                          move2cpu(batch_precomp_boxes), image_unique_id,
                          move2cpu(batch_topk_pred_similarity),
                          move2cpu(batch_topk_fusion_similarity),
                          move2cpu(batch_topk_pred_boxes),
                          move2cpu(batch_topk_fusion_pred_boxes),
                          move2cpu(batch_topk_precomp_boxes),
                          move2cpu(batch_topk_pred_targets),
                          move2cpu(batch_pred_targets))
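# The grounding forward pass above relies on `numerical_stability_masked_softmax`,
# which is not defined in this file. A minimal sketch of such a helper (an
# assumption, not the repo's exact implementation): push masked logits to a very
# negative value, subtract the row max before exponentiating, and zero out
# masked entries so excluded proposals receive no attention weight. Rows whose
# mask is all zero come out as all zeros rather than NaN.
import torch

def masked_softmax_sketch(logits, mask, dim=1, eps=1e-12):
    # masked positions get the dtype's most negative value, so exp() underflows
    neg_inf = torch.finfo(logits.dtype).min
    masked_logits = logits.masked_fill(mask == 0, neg_inf)
    # subtract the per-row max for numerical stability
    masked_logits = masked_logits - masked_logits.max(dim=dim, keepdim=True).values
    exp = torch.exp(masked_logits) * mask
    return exp / (exp.sum(dim=dim, keepdim=True) + eps)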
def forward(self, sentence, p_sentence, pos_tags, lengths, target_idx_in,
            region_marks, local_roles_voc, frames, local_roles_mask,
            sent_pred_lemmas_idx, dep_tags, dep_heads, targets,
            predicate_identification, all_l_ids, Predicate_link,
            Predicate_Labels_nd, Predicate_Labels,
            unlabeled_sentence_in=False, p_unlabeled_sentence_in=False,
            unlabeled_sen_lengths=False, test=False, cvt_train=False):
    # (disabled) ELMo word-embedding path:
    # elmo_embedding_0 = self.elmo_embeddings_0(sentence).view(self.batch_size, len(sentence[0]), 1024)
    # elmo_embedding_1 = self.elmo_embeddings_1(sentence).view(self.batch_size, len(sentence[0]), 1024)
    # w = F.softmax(self.elmo_word, dim=0)
    # elmo_emb = self.elmo_gamma_word * (w[0] * elmo_embedding_0 + w[1] * elmo_embedding_1)
    # elmo_emb_word = self.elmo_mlp_word(elmo_emb)

    region_marks = self.region_embeddings(region_marks).view(
        self.batch_size, len(sentence[0]), 16)
    fixed_embeds = self.word_fixed_embeddings(p_sentence)
    fixed_embeds = fixed_embeds.view(self.batch_size, len(sentence[0]),
                                     self.word_emb_dim)
    sent_pred_lemmas_embeds = self.p_lemma_embeddings(sent_pred_lemmas_idx)
    embeds_SRL = self.word_embeddings_SRL(sentence)
    embeds_SRL = embeds_SRL.view(self.batch_size, len(sentence[0]),
                                 self.word_emb_dim)
    pos_embeds = self.pos_embeddings(pos_tags)

    SRL_hidden_states = torch.cat((embeds_SRL, fixed_embeds, region_marks), 2)
    SRL_hidden_states = self.SRL_input_dropout(SRL_hidden_states)

    # SRL layer: first BiLSTM (sort -> pack -> run -> pad -> unsort)
    embeds_sort, lengths_sort, unsort_idx = self.sort_batch(
        SRL_hidden_states, lengths)
    embeds_sort = rnn.pack_padded_sequence(embeds_sort, lengths_sort,
                                           batch_first=True)
    self.hidden = self.init_hidden_spe()
    hidden_states, self.hidden = self.BiLSTM_0(embeds_sort, self.hidden)
    # pad_packed_sequence with batch_first=True already yields (batch, time, hidden)
    hidden_states, lens = rnn.pad_packed_sequence(hidden_states,
                                                  batch_first=True)
    hidden_states_0 = hidden_states[unsort_idx]

    # second BiLSTM, same pattern
    embeds_sort, lengths_sort, unsort_idx = self.sort_batch(
        hidden_states_0, lengths)
    embeds_sort = rnn.pack_padded_sequence(embeds_sort, lengths_sort,
                                           batch_first=True)
    self.hidden_1 = self.init_hidden_spe()
    hidden_states, self.hidden_1 = self.BiLSTM_1(embeds_sort, self.hidden_1)
    hidden_states, lens = rnn.pad_packed_sequence(hidden_states,
                                                  batch_first=True)
    hidden_states_1 = hidden_states[unsort_idx]

    hidden_states_0 = self.hidden_state_dropout_0(hidden_states_0)
    hidden_states_1 = self.hidden_state_dropout_1(hidden_states_1)
    hidden_states = torch.cat((hidden_states_0, hidden_states_1), 2)

    # biaffine-style scoring between word and predicate representations
    hidden_states_word = self.dropout_1(
        F.relu(self.Non_Predicate_Proj(hidden_states)))
    predicate_embeds = hidden_states[np.arange(0, hidden_states.size()[0]),
                                     target_idx_in]
    hidden_states_predicate = self.dropout_2(
        F.relu(self.Predicate_Proj(predicate_embeds)))

    # append a bias column before multiplying with the role weight tensor W_R
    bias_one = torch.ones((self.batch_size, len(sentence[0]), 1)).to(device)
    hidden_states_word = torch.cat((hidden_states_word, Variable(bias_one)), 2)

    left_part = torch.mm(
        hidden_states_word.view(self.batch_size * len(sentence[0]), -1),
        self.W_R)
    left_part = left_part.view(self.batch_size,
                               len(sentence[0]) * self.dep_size, -1)
    hidden_states_predicate = hidden_states_predicate.view(
        self.batch_size, -1, 1)
    tag_space = torch.bmm(left_part, hidden_states_predicate).view(
        len(sentence[0]) * self.batch_size, -1)
    SRLprobs = F.softmax(tag_space, dim=1)

    # placeholder statistics kept only to preserve the fixed-arity return signature
    wrong_l_nums = 0.0
    all_l_nums = 0.0
    right_noNull_predict = 10.0
    noNull_predict = 10.0
    noNUll_truth = 10.0

    loss_function = nn.CrossEntropyLoss(ignore_index=0)
    SRLloss = loss_function(tag_space, Predicate_Labels_nd.view(-1))
    return SRLloss, SRLloss, SRLloss, SRLprobs, wrong_l_nums, all_l_nums, \
        wrong_l_nums, all_l_nums, \
        right_noNull_predict, noNull_predict, noNUll_truth, \
        right_noNull_predict, noNull_predict, noNUll_truth
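# Both BiLSTM layers above run the same sort/pack/unsort cycle, since
# pack_padded_sequence expects batches sorted by descending length. A
# self-contained sketch of that pattern; `run_packed_bilstm` is a hypothetical
# stand-in for the model's `sort_batch` + pack + unsort sequence, and it
# assumes `lengths` is a 1-D tensor of valid sequence lengths.
import torch
import torch.nn as nn
import torch.nn.utils.rnn as rnn

def run_packed_bilstm(lstm, inputs, lengths):
    # sort by descending length, as pack_padded_sequence requires by default
    lengths_sort, sort_idx = torch.sort(lengths, descending=True)
    unsort_idx = torch.argsort(sort_idx)
    packed = rnn.pack_padded_sequence(inputs[sort_idx], lengths_sort.cpu(),
                                      batch_first=True)
    output, _ = lstm(packed)
    output, _ = rnn.pad_packed_sequence(output, batch_first=True)
    return output[unsort_idx]  # restore the original batch order

# usage sketch:
# lstm = nn.LSTM(64, 32, batch_first=True, bidirectional=True)
# hidden = run_packed_bilstm(lstm, torch.randn(4, 9, 64), torch.tensor([9, 7, 5, 3]))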
if __name__ == '__main__':
    plt.close('all')
    mlp.rcParams['font.family'] = ['times new roman']  # default is sans-serif
    rc('text', usetex=True)
    f, ax = plt.subplots(1, 1, figsize=(8, 4))
    f.suptitle('Figure 3.17, pg. 172', fontsize=14)

    m = 10  # number of basis functions
    alpha = 2.0
    beta = 11.1
    m0 = th.DoubleTensor(m, 1).zero_()
    X_train, T_train = generateData(1, 30, np.sqrt(1 / beta))

    blr = BayesianLinearReg(m, m0, np.exp(-5), beta, basis='guassian')
    phi = blr.getBasis(X_train)
    # eigenvalues of beta * Phi^T Phi, used for the effective number of parameters
    e, v = th.eig(beta * th.mm(th.transpose(phi, 0, 1), phi))

    X0 = np.linspace(-10, 10, 100)
    gamma = np.zeros(len(X0))
    W0 = np.zeros((m, len(X0)))
    for idx, val in enumerate(X0):
        blr = BayesianLinearReg(m, m0, np.exp(val), beta, basis='guassian')
        blr.posterUpdate(X_train, T_train)
        # Eq. 3.91: gamma = sum_i lambda_i / (alpha + lambda_i), with alpha = exp(val)
        gamma[idx] = Variable(th.sum(th.div(e, np.exp(val) + e),
                                     0)).data.numpy()[0, 0]
        W0[:, idx] = Variable(blr.getWeightsMAP().squeeze()).data.numpy()
    cmap = plt.cm.get_cmap("gnuplot")
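# The loop above sweeps the regularization strength alpha = exp(val) and
# evaluates Bishop's Eq. 3.91, the effective number of well-determined
# parameters gamma(alpha) = sum_i lambda_i / (alpha + lambda_i), where the
# lambda_i are eigenvalues of beta * Phi^T Phi. A NumPy-only sketch of the
# same quantity (function and variable names here are illustrative):
import numpy as np

def effective_num_params(phi, alpha, beta):
    # beta * Phi^T Phi is symmetric, so eigvalsh is the appropriate solver
    lam = np.linalg.eigvalsh(beta * phi.T @ phi)
    return np.sum(lam / (alpha + lam))

# usage sketch: gamma approaches m for alpha -> 0 and 0 for alpha -> infinity
# phi = np.random.randn(30, 10); print(effective_num_params(phi, 2.0, 11.1))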