import torch
import torch.nn as nn


def retrieval_ref(model, batch_query, extra_memory_loader, evaluation, device):
    """
    Given a batch of queries and an extra memory, search for the best matched reference image.

    Params:
        model (torch.nn.Module): feature extractor
        batch_query (torch.Tensor): a batch of queries
        extra_memory_loader (torch.utils.data.DataLoader): loader over the extra memory dataset
        evaluation (torch.nn.Module or other distance function): typically cosine or L2 distance
        device (torch.device): device to move the model and data to (GPU or CPU)

    Return:
        a batch of images, the best matched image for each query
    """
    batch_query = batch_query.to(device)
    model = model.to(device)
    batch_qf = model(batch_query)
    # Initialize the running minima to +inf so the first memory batch always updates them
    best_min_val = torch.full((batch_qf.size(0),), float('inf'), device=device)
    best_min_img = torch.zeros_like(batch_query)
    for i, data in enumerate(extra_memory_loader):
        imgs = data[0]
        imgs = imgs.to(device)
        ref_f = model(imgs)
        # dist is an (n_query, n_ref) matrix; row i holds the distance
        # between the i-th query and each image in memory
        dist = evaluation(batch_qf, ref_f)
        min_val, min_ind = torch.min(dist, 1)
        # Update the best (smallest) distances and the corresponding images
        min_cmp = min_val < best_min_val
        best_min_val[min_cmp] = min_val[min_cmp]
        best_min_img[min_cmp] = imgs[min_ind[min_cmp]]
    return best_min_img
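# Usage sketch (not from the original source): a pairwise L2 distance is assumed for
# `evaluation`; it must return an (n_query, n_ref) matrix so that torch.min(dist, 1)
# picks the closest memory image per query. `memory_dataset` is a hypothetical dataset.
def l2_pairwise(query_features, ref_features):
    # (n_query, n_ref) Euclidean distances between feature rows
    return torch.cdist(query_features, ref_features, p=2)

# extra_memory_loader = torch.utils.data.DataLoader(memory_dataset, batch_size=64)
# best_refs = retrieval_ref(model, batch_query, extra_memory_loader, l2_pairwise,
#                           torch.device('cuda' if torch.cuda.is_available() else 'cpu'))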
def generate_caption_w_target(self, h_0, target_caption):
    """
    h_0 (sentence embedding), target caption -> LSTM -> list of word probability distributions.
    Used for training (teacher forcing).

    @h_0: (batch, hidden_size)
    @target_caption: (batch, num_of_words, len_of_vocab) (batch * (sequence of one-hot vectors))
    * Should the sequence start from <START>?
    """
    batch_size = h_0.size(0)
    num_of_words = target_caption.size(1)
    h = torch.zeros((batch_size, self.hidden_size))  # hidden state
    c = torch.zeros((batch_size, self.hidden_size))  # cell state
    hat_y_s = torch.empty((batch_size, num_of_words, self.vocab_size))  # output states (i.e. prob. distributions)
    target_caption_embedding = self.pretrained_embedding(target_caption)  # (batch, len(target_caption), embedding_size)
    for t in range(num_of_words):
        if t == 0:
            h, c = self.lstm_cell(h_0, (h, c))
            # If the caption starts from <START>, shouldn't this step also feed an embedding, as in the else branch?
            # There seem to be two architectures: feed the feature vector 1) as h_0, or 2) as x_1 (with h_0 a zero vector).
        else:
            h, c = self.lstm_cell(target_caption_embedding[:, t, :], (h, c))
        word_pb_distribution = nn.functional.log_softmax(self.linear_words(h), dim=1)
        hat_y_s[:, t, :] = word_pb_distribution
    return hat_y_s
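# Training sketch (assumed names, not from the original source): `decoder` is the module
# defining generate_caption_w_target, `h_0` comes from the sentence/image encoder, and
# `target_ids` holds word indices of shape (batch, num_of_words). The outputs are
# log-probabilities, so NLLLoss applies directly.
log_probs = decoder.generate_caption_w_target(h_0, target_ids)                      # (batch, T, vocab)
loss = nn.NLLLoss()(log_probs.reshape(-1, decoder.vocab_size), target_ids.reshape(-1))
loss.backward()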
def generate_caption_wo_target(self, h_0, num_of_words=20):
    """
    h_0 (sentence embedding) -> LSTM -> list of word probability distributions.
    Used for testing. Greedy decoding (argmax).
    """
    batch_size = h_0.size(0)
    # NOTE: the original read num_of_words from an undefined `target_caption`;
    # a maximum caption length is taken as a parameter here instead (default assumed).
    h = torch.zeros((batch_size, self.hidden_size))  # hidden state
    c = torch.zeros((batch_size, self.hidden_size))  # cell state
    hat_y_s = torch.empty((batch_size, num_of_words, self.vocab_size))  # output states (i.e. prob. distributions)
    nextword_embedding = torch.zeros((batch_size, self.embed_size))
    for t in range(num_of_words):
        # Shouldn't generation stop once the <END> token is produced?
        if t == 0:
            h, c = self.lstm_cell(h_0, (h, c))
        else:
            h, c = self.lstm_cell(nextword_embedding, (h, c))
        word_pb_distribution = nn.functional.log_softmax(self.linear_words(h), dim=1)  # (batch, vocab_size)
        hat_y_s[:, t, :] = word_pb_distribution
        max_pb_idx = torch.argmax(word_pb_distribution, dim=1)       # (batch,)
        nextword_embedding = self.pretrained_embedding(max_pb_idx)   # (batch, embed_size)
    return hat_y_s
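# Inference sketch (assumed: `idx2word` maps indices back to tokens; <END> handling is left
# to the caller, since the loop above always runs for num_of_words steps).
with torch.no_grad():
    log_probs = decoder.generate_caption_wo_target(h_0)   # (batch, T, vocab)
pred_ids = log_probs.argmax(dim=2)                          # (batch, T)
# captions = [[idx2word[i.item()] for i in row] for row in pred_ids]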
def inference(self, input, target):
    # Intended for Beam Search; the code below decodes greedily (argmax)
    # batch_size = 1 in this function
    # input  = [batch_size, input_len, vocab_size]
    # target = [batch_size, target_len, vocab_size]
    batch_size = input.shape[0]
    input_len = input.shape[1]  # maximum number of tokens
    vocab_size = self.decoder.cn_vocab_size
    # Allocate a tensor to store the outputs
    outputs = torch.zeros(batch_size, input_len, vocab_size).to(self.device)
    # Feed the input through the encoder
    encoder_outputs, hidden = self.encoder(input)
    # The encoder's final hidden state initializes the decoder;
    # encoder_outputs is mainly used for attention.
    # The encoder is a bidirectional RNN, so the two directions of each layer are concatenated:
    # hidden = [num_layers * directions, batch_size, hid_dim] -> [num_layers, directions, batch_size, hid_dim]
    hidden = hidden.view(self.encoder.n_layers, 2, batch_size, -1)
    hidden = torch.cat((hidden[:, -2, :, :], hidden[:, -1, :, :]), dim=2)
    # Take the <BOS> token
    input = target[:, 0]
    preds = []
    for t in range(1, input_len):
        output, hidden = self.decoder(input, hidden, encoder_outputs)
        # Store the prediction
        outputs[:, t] = output
        # Take the word with the highest probability
        top1 = output.argmax(1)
        input = top1
        preds.append(top1.unsqueeze(1))
    preds = torch.cat(preds, 1)
    return outputs, preds
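# Usage sketch (assumed names): this method expects batch_size = 1, so a single source/target
# pair gets a leading batch dimension before the call. `model` is the seq2seq module owning
# this method, and `cn_index2word` is an assumed index-to-token mapping for the decoder vocabulary.
# src = src_tokens.unsqueeze(0).to(device)
# trg = trg_tokens.unsqueeze(0).to(device)
# outputs, preds = model.inference(src, trg)
# translation = [cn_index2word[idx.item()] for idx in preds[0]]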
import matplotlib.pyplot as plt


def plot_durations():
    plt.figure(2)
    plt.clf()
    durations_t = torch.tensor(episode_durations, dtype=torch.float)
    plt.title('Training...')
    plt.xlabel('Episode')
    plt.ylabel('Duration')
    plt.plot(durations_t.numpy())
    # Take 100-episode running averages and plot them too
    if len(durations_t) >= 100:
        means = durations_t.unfold(0, 100, 1).mean(1).view(-1)
        means = torch.cat((torch.zeros(99), means))
        plt.plot(means.numpy())
    plt.pause(0.001)  # give the plot time to update
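# Usage sketch (assumed: `episode_durations` is a module-level list appended to after each
# episode, in the style of the PyTorch DQN tutorial this helper resembles).
# episode_durations = []
# for i_episode in range(num_episodes):
#     ...run one episode for t steps...
#     episode_durations.append(t + 1)
#     plot_durations()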
def init_hidden(self):
    num_directions = 2 if self.bidirectional else 1
    # (num_directions * num_layers, batch=1, hidden_dim)
    return torch.zeros(num_directions * self.num_layer, 1, self.hidden_dim, device=device)
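# Standalone shape check (a sketch with arbitrary sizes): for a bidirectional RNN the hidden
# state has num_directions * num_layers rows, which is what init_hidden allocates for batch size 1.
gru = torch.nn.GRU(input_size=8, hidden_size=16, num_layers=3, bidirectional=True)
h0 = torch.zeros(2 * 3, 1, 16)              # (num_directions * num_layers, batch=1, hidden_dim)
out, h_n = gru(torch.randn(7, 1, 8), h0)    # seq_len=7, batch=1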
def __init_hidden(self, batch_size):
    n, hs = self.num_layers, self.hidden_size
    # (num_layers * num_directions, batch, hidden_size); * 1 because the LSTM is unidirectional
    return (torch.zeros(n * 1, batch_size, hs),
            torch.zeros(n * 1, batch_size, hs))
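# Standalone shape check (a sketch with arbitrary sizes): a unidirectional nn.LSTM expects the
# initial (h0, c0) tuple with tensors of shape (num_layers * 1, batch, hidden_size),
# matching what __init_hidden returns.
lstm = torch.nn.LSTM(input_size=8, hidden_size=16, num_layers=2)
h0 = torch.zeros(2 * 1, 4, 16)                           # (num_layers * num_directions, batch, hidden_size)
c0 = torch.zeros(2 * 1, 4, 16)
out, (h_n, c_n) = lstm(torch.randn(5, 4, 8), (h0, c0))   # seq_len=5, batch=4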