def nll(self, scores, volatile):
    """Average logistic loss over score columns: column 0 is scored as the
    positive (target +1), every remaining column as a negative (target -1)."""
    n_rows = scores.size()[0]
    n_cols = scores.size()[1]
    pos_targets = util.to_var(np.ones(n_rows, dtype='float32'), volatile=volatile)
    neg_targets = util.to_var(-1. * np.ones(n_rows, dtype='float32'), volatile=volatile)
    total = self.logistic(scores[:, 0], pos_targets)
    for col in range(1, n_cols):
        total = total + self.logistic(scores[:, col], neg_targets)
    return total / n_cols
def logistic(self, scores):
    """Average BCE loss over score columns: column 0 is the positive
    (target 1), every remaining column a negative (target 0)."""
    n_rows = scores.size()[0]
    n_cols = scores.size()[1]
    pos_targets = util.to_var(np.ones(n_rows, dtype='float32'), requires_grad=False)
    neg_targets = util.to_var(np.zeros(n_rows, dtype='float32'), requires_grad=False)
    total = self.bce(scores[:, 0], pos_targets)
    for col in range(1, n_cols):
        total = total + self.bce(scores[:, col], neg_targets)
    return total / n_cols
def mixup(image, label, num_classes):
    """Mixup augmentation: convex-combine each sample with a shuffled partner.

    Args:
        image: batch tensor whose first dimension is the batch size (any rank >= 1).
        label: integer class labels, shape (batch,).
        num_classes: number of classes for the one-hot targets.

    Returns:
        (x_mixed, y_soft): mixed images and soft (mixed one-hot) labels.
    """
    alpha = 1.0  # Beta(alpha, alpha) mixing distribution
    rand_idx = torch.randperm(label.shape[0])
    image2 = image[rand_idx].clone()
    label2 = label[rand_idx].clone()
    # NOTE(review): one-hot matrices are created on CUDA unconditionally — this
    # assumes `label` lives on the GPU; confirm against callers.
    y_one_hot = torch.eye(num_classes, device='cuda')[label].clone()
    y2_one_hot = torch.eye(num_classes, device='cuda')[label2].clone()
    mix_rate = np.random.beta(alpha, alpha, image.shape[0])
    # Fix/generalization: the original only handled image.ndim == 2 or 4 and
    # left the broadcast tensor as None (crash) for any other rank. Build a
    # (batch, 1, ..., 1) shape matching the image rank instead.
    img_shape = (image.shape[0],) + (1,) * (image.ndim - 1)
    mix_rate_img = util.to_var(torch.from_numpy(mix_rate.reshape(img_shape)).float())
    mix_rate_lbl = util.to_var(
        torch.from_numpy(mix_rate.reshape((image.shape[0], 1))).float())
    x_mixed = image.clone() * mix_rate_img + image2.clone() * (1 - mix_rate_img)
    y_soft = y_one_hot * mix_rate_lbl + y2_one_hot * (1 - mix_rate_lbl)
    # util.save_images(x_mixed)
    return x_mixed, y_soft
def init_hidden(self, num_layers, batch_size, hidden_dim):
    """Build zeroed LSTM initial states (h0, c0)."""
    state_shape = (num_layers, batch_size, hidden_dim)
    h0 = util.to_var(torch.zeros(*state_shape))
    c0 = util.to_var(torch.zeros(*state_shape))
    return h0, c0
def output(self, entities, rels, is_target):
    """Translation-style combination of entity and relation embeddings.

    Adds the relation when predicting targets, subtracts it when predicting
    sources; returns the flattened result as a numpy array.
    """
    ent_emb = self.entities(util.to_var(entities, True)).unsqueeze(2)
    rel_emb = self.rels(util.to_var(rels, True))
    combined = ent_emb + rel_emb if is_target else ent_emb - rel_emb
    flat = combined.view(-1, combined.size()[1] * combined.size()[2])
    return flat.data.cpu().numpy()
def sample_old(self, n=4, z=None, max_length=60):
    """Autoregressively sample up to `max_length` tokens per sequence from the
    decoder, stopping each sequence individually once it emits EOS.

    Args:
        n: number of sequences to sample when `z` is not given.
        z: optional latent batch (batch, latent_size); sampled from N(0, I) if None.
        max_length: hard cap on generated length.

    Returns:
        LongTensor (batch, max_length) of token ids, PAD-filled after EOS.
    """
    if z is None:
        z = to_var(torch.randn([n, self.latent_size]))
    batch_size, l = z.size()
    hidden = self.tohidden(z)
    # assumes a 2-layer (or 2-direction) decoder state — TODO confirm
    hidden = hidden.view(2, batch_size, self.hidden_size)
    # required for dynamic stopping of sentence generation
    sequence_idx = torch.arange(0, batch_size, out=self.tensor()).long()  # all idx of batch
    sequence_running = torch.arange(0, batch_size, out=self.tensor()).long()  # all idx of batch which are still generating
    sequence_mask = torch.ones(batch_size, out=self.tensor()).byte()
    running_seqs = torch.arange(0, batch_size, out=self.tensor()).long()  # idx of still generating sequences with respect to current loop
    generations = self.tensor(batch_size, max_length).fill_(PAD).long()
    t = 0
    while t < max_length and len(running_seqs) > 0:
        if t == 0:
            # first input: a batch of SOS tokens
            input_sequence = to_var(torch.Tensor(batch_size).fill_(SOS).long())
        input_sequence = input_sequence.unsqueeze(1)
        input_embedding = self.embedding(input_sequence)
        output, hidden = self.decoder_rnn(input_embedding, hidden)
        logits = self.outputs2vocab(output)
        input_sequence = self._sample_old(logits)
        # save next input
        generations = self._save_sample(generations, input_sequence,
                                        sequence_running, t)
        # update global running sequence
        sequence_mask[sequence_running] = (input_sequence != EOS).data
        sequence_running = sequence_idx.masked_select(sequence_mask)
        # update local running sequences
        running_mask = (input_sequence != EOS).data
        running_seqs = running_seqs.masked_select(running_mask)
        # prune input and hidden state according to local update
        # NOTE(review): `len(running_seqs.size()) > 0` is always true for a 1-D
        # tensor; upstream implementations guard on `len(running_seqs) > 0` —
        # verify this still behaves when every sequence finishes in one step.
        if len(running_seqs.size()) > 0:
            input_sequence = input_sequence[running_seqs]
            hidden = hidden[:, running_seqs]
            running_seqs = torch.arange(0, len(running_seqs),
                                        out=self.tensor()).long()
        t += 1
    return generations
def eval_epoch(self, model, data_loader, criterion):
    """Run one evaluation pass and return total loss normalized per token.

    Inputs are wrapped with volatile=True since no backward pass is needed
    (legacy pre-0.4 PyTorch autograd API).
    """
    loss_sum = 0.
    word_count = 0.
    for data, target in data_loader:
        # x, y: (None, sequence_len + 1)
        x = util.to_var(data, volatile=True)
        y = util.to_var(target, volatile=True)
        logits = model(x)  # (None, vocab_size, sequence_len + 1)
        batch_loss = criterion(logits, y)
        loss_sum += batch_loss.data.cpu()[0]
        word_count += x.size(0) * x.size(1)
    return loss_sum / word_count
def train():
    """Train the model over all TRAIN_FILES for `num_epochs` epochs with SGD,
    printing batch accuracy every `args.print_every` batches and the per-epoch
    average training loss."""
    train_file_idxs = np.arange(0, len(TRAIN_FILES))
    np.random.shuffle(train_file_idxs)
    model = get_model()
    print(model)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=0.001)
    losses = []
    # writer = SummaryWriter()
    global_step = 0
    for epoch in range(num_epochs):
        running_loss = 0
        num_total_btches = 0
        for i in range(len(TRAIN_FILES)):
            train_data, current_labels = util.loadDataFile(
                TRAIN_FILES[train_file_idxs[i]])
            train_data = util.to_var(torch.from_numpy(train_data))
            current_labels = util.to_var(torch.from_numpy(current_labels))
            num_batches = train_data.shape[0] // args.batch_size
            print('Training file: {:5d} |num of batches: {:5d}'.format(
                i, num_batches))
            for btch in range(num_batches):
                optimizer.zero_grad()
                start_idx = btch * args.batch_size
                end_idx = (btch + 1) * args.batch_size
                current_train = train_data[start_idx:end_idx, :, :]
                btch_label = current_labels[start_idx:end_idx, :].type(
                    torch.long).cuda()
                logits = model(current_train)
                loss = criterion(logits, btch_label.view(-1))
                loss.backward()
                optimizer.step()
                preds = F.log_softmax(logits, 1)
                pred_choice = preds.data.max(1)[1]
                # Fix: flatten labels before comparing. `btch_label` is (B, 1)
                # while `pred_choice` is (B,); the original eq() broadcast to
                # (B, B) and produced a wrong `correct` count (the sibling
                # trainer below already uses .view(-1)).
                correct = pred_choice.eq(btch_label.view(-1).data).cpu().sum()
                running_loss += loss.item() * args.batch_size
                losses.append(loss.item())
                # writer.add_scalar('loss', loss.item(), global_step)
                if btch % args.print_every == 0:
                    print(
                        'Epoch [{:5d}/{:5d}] | loss: {:6.4f} | accuracy:{:6.4f}'
                        .format(epoch + 1, num_epochs, loss.item(),
                                correct.item() / float(args.batch_size)))
                global_step += 1
                num_total_btches += 1
        total_training_samples = num_total_btches * args.batch_size
        print("Training loss {:6.4f}".format(running_loss /
                                             total_training_samples))
def output(self, entities, rels, is_target):
    """Score entity/relation pairs with real and imaginary embedding parts
    (ComplEx-style bilinear products); returns a numpy array."""
    ent_var = util.to_var(entities, True)
    rel_var = util.to_var(rels, True)
    ent_im = self.entities_i(ent_var).unsqueeze(2)
    rel_im = self.rels_i(rel_var)
    ent_re = self.entities(ent_var).unsqueeze(2)
    rel_re = self.rels(rel_var)
    common = ent_re * rel_re + ent_im * rel_re
    if is_target:
        out = common + rel_im * ent_re - rel_im * ent_im
    else:
        out = common + rel_im * ent_im - rel_im * ent_re
    return out.squeeze(2).data.cpu().numpy()
def train_epoch(self, model, data_loader, criterion, optim):
    """Run one training pass over `data_loader`; returns total loss per token."""
    loss_sum = 0.
    word_count = 0.
    for data, target in data_loader:
        optim.zero_grad()
        # x, y: (None, sequence_len + 1)
        x = util.to_var(data)
        y = util.to_var(target)
        logits = model(x)  # (None, vocab_size, sequence_len + 1)
        batch_loss = criterion(logits, y)
        loss_sum += batch_loss.data.cpu()[0]
        word_count += x.size(0) * x.size(1)
        batch_loss.backward()
        optim.step()
    return loss_sum / word_count
def update(self):
    """One DDPG update step: soft-update targets, sample a replay batch,
    update the critic with a TD target, then the actor with the policy
    gradient, and decay the exploration noise."""
    # Skip updating while too few transitions have been stored.
    if self.memory_counter <= 5000:
        return
    # Softly copy evaluate-network parameters into the target networks.
    self.softCopy()
    # Choose batch indices (only sample filled slots before memory wraps).
    if self.memory_counter > self.memory_size:
        sample_idx = np.random.choice(self.memory_size, size=self.batch_size)
    else:
        sample_idx = np.random.choice(self.memory_counter, size=self.batch_size)
    # Slice the sampled transitions out of replay memory:
    # layout is [state | action | reward | next_state] per row.
    batch_data = self.memory[sample_idx, :]
    batch_s = batch_data[:, :self.n_state]
    batch_a = batch_data[:, self.n_state:self.n_state + self.n_action]
    batch_r = batch_data[:, -self.n_state - 1:-self.n_state]
    batch_s_ = batch_data[:, -self.n_state:]
    # Wrap as PyTorch variables.
    batch_s = to_var(batch_s)
    batch_a = to_var(batch_a)
    batch_r = to_var(batch_r)
    batch_s_ = to_var(batch_s_)
    # Compute the target Q value with the target networks.
    next_q_target = self.target_critic(batch_s_, self.target_actor(batch_s_))
    q_target = batch_r + self.gamma * next_q_target
    # Update the critic towards the TD target.
    self.critic_optimizer.zero_grad()
    q_batch = self.eval_critic(batch_s, batch_a)
    value_loss = F.mse_loss(input=q_batch, target=q_target)
    value_loss.backward()
    self.critic_optimizer.step()
    # Update the actor to maximize the critic's value of its actions.
    self.actor_optimizer.zero_grad()
    policy_loss = -self.eval_critic(batch_s, self.eval_actor(batch_s)).mean()
    policy_loss.backward()
    self.actor_optimizer.step()
    # Decay the breadth of random action exploration.
    self.var *= .9995
def train_gan(self, backend):
    """Adversarial training loop (SeqGAN-style): alternately update the
    generator via policy gradient with rollout rewards and the discriminator
    on real vs. generated samples.

    Fix: the original used Python-2 `print` statements, which are syntax
    errors under Python 3 (the rest of this file uses Python-3-only
    f-strings); converted to print() calls.
    """
    rollout = Rollout(self.generator, self.discriminator, self.update_rate)
    print('\nStart Adeversatial Training......')
    gen_optim = torch.optim.Adam(self.generator.parameters(), self.lr)
    dis_optim = torch.optim.Adam(self.discriminator.parameters(), self.lr)
    dis_criterion = util.to_cuda(nn.BCEWithLogitsLoss(size_average=False))
    gen_criterion = util.to_cuda(nn.CrossEntropyLoss(size_average=False,
                                                     reduce=True))
    for epoch in range(self.gan_epochs):
        start = time.time()
        # --- generator update via policy gradient ---
        for _ in range(1):
            samples = self.generator.sample(
                self.batch_size, self.sequence_len)  # (batch_size, sequence_len)
            zeros = util.to_var(
                torch.zeros(self.batch_size, 1).long())  # (batch_size, 1)
            # shift right: prepend start token, drop last sampled token
            inputs = torch.cat([samples, zeros], dim=1)[:, :-1]
            rewards = rollout.reward(samples, 16)  # (batch_size, sequence_len)
            rewards = util.to_var(torch.from_numpy(rewards))
            logits = self.generator(inputs)  # (None, vocab_size, sequence_len)
            pg_loss = self.pg_loss(logits, samples, rewards)
            gen_optim.zero_grad()
            pg_loss.backward()
            gen_optim.step()
            print('generator updated via policy gradient......')
        # --- periodic evaluation against the oracle LSTM ---
        if epoch % 10 == 0:
            util.generate_samples(self.generator, self.batch_size,
                                  self.sequence_len, self.generate_sum,
                                  self.eval_file)
            eval_data = GenData(self.eval_file)
            eval_data_loader = DataLoader(eval_data,
                                          batch_size=self.batch_size,
                                          shuffle=True, num_workers=8)
            loss = self.eval_epoch(self.target_lstm, eval_data_loader,
                                   gen_criterion)
            print('epoch: [{0:d}], true loss: [{1:.4f}]'.format(epoch, loss))
        # --- discriminator update on fresh fake samples ---
        for _ in range(1):
            util.generate_samples(self.generator, self.batch_size,
                                  self.sequence_len, self.generate_sum,
                                  self.fake_file)
            dis_data = DisData(self.real_file, self.fake_file)
            dis_data_loader = DataLoader(dis_data,
                                         batch_size=self.batch_size,
                                         shuffle=True, num_workers=8)
            for _ in range(1):
                loss = self.train_epoch(self.discriminator, dis_data_loader,
                                        dis_criterion, dis_optim)
            print('discriminator updated via gan loss......')
        rollout.update_params()
        end = time.time()
        print('time: [{:.3f}s/epoch] in {}'.format(end - start, backend))
def init_h(self, batch_size=None, hidden=None):
    """Return the RNN initial state, reusing `hidden` when supplied.

    Produces an (h0, c0) pair for LSTMs, a single h0 tensor otherwise.
    """
    if hidden is not None:
        return hidden
    state_shape = (self.num_layers * self.num_directions, batch_size,
                   self.hidden_size)
    if self.use_lstm:
        return (to_var(torch.zeros(*state_shape)),
                to_var(torch.zeros(*state_shape)))
    return to_var(torch.zeros(*state_shape))
def sample(self, batch_size, sequence_len, x=None):
    """Sample token sequences of length `sequence_len`.

    If `x` is given it primes the first x.size(1) steps; generation then
    continues autoregressively. If `x` is None, sampling starts from the
    start token (id 0).

    Fix: in the from-scratch branch the sampled token was never fed back as
    the next input (the constant start token was re-fed every step, unlike
    the primed branch), so outputs ignored the sampled history.
    """
    flag = False
    if x is None:
        x = util.to_var(torch.zeros(batch_size, 1).long())
        flag = True
    h, c = self.init_hidden(self.num_layers, batch_size, self.hidden_dim)
    samples = []
    if flag:
        for _ in range(sequence_len):
            logits, h, c = self.step(x, h, c)
            probs = F.softmax(logits, dim=1)
            sample = probs.multinomial(1)  # (batch_size, 1)
            samples.append(sample)
            x = sample  # feed the sampled token back as next input
    else:
        given_len = x.size(1)
        lis = x.chunk(x.size(1), dim=1)
        for i in range(given_len):
            logits, h, c = self.step(lis[i], h, c)
            samples.append(lis[i])
            x = F.softmax(logits, dim=1).multinomial(1)
        for i in range(given_len, sequence_len):
            samples.append(x)
            logits, h, c = self.step(x, h, c)
            x = F.softmax(logits, dim=1).multinomial(1)
    output = torch.cat(samples, 1)
    return output  # (batch_size, sequence_len)
def softmax(self, scores, volatile):
    """Cross-entropy loss where class 0 (the positive column) is the gold
    label for every row.

    Targets are rebuilt each call because batch sizes may vary; one could
    cache them per batch size, but the cost is negligible even for very
    large batches.
    """
    targets = util.to_var(np.zeros(scores.size()[0], dtype='int'),
                          volatile=volatile)
    return self.cross_ent(scores, targets)
def update(self):
    """One DQN update step: periodically sync the target network, sample a
    replay batch, and regress Q(s, a) towards r + gamma * max_a' Q_target."""
    # Skip updating if too few transitions have been stored.
    if self.memory_counter < self.batch_size:
        return
    # Periodically copy all evaluate-network parameters into the target network.
    if self.learn_step_counter % self.update_iter == 0:
        self.target_net.load_state_dict(self.eval_net.state_dict())
    self.learn_step_counter += 1
    # Randomly pick batch_size transitions to update on (only sample filled
    # slots until the memory has wrapped around once).
    if self.had_fill_memory:
        sample_idx = np.random.choice(self.memory_size, self.batch_size)
    else:
        sample_idx = np.random.choice(self.memory_counter, self.batch_size)
    # Randomly drawn transitions from replay memory (game re-play);
    # row layout is [state | action | reward | next_state].
    batch_memory = self.memory[sample_idx, :]
    batch_s = to_var(batch_memory[:, :self.n_state])
    batch_a = to_var(batch_memory[:, self.n_state:self.n_state + 1].astype(int),
                     to_float=False)
    batch_r = to_var(batch_memory[:, self.n_state + 1:self.n_state + 2])
    batch_s_ = to_var(batch_memory[:, -self.n_state:])
    # Move everything to the GPU when available.
    if torch.cuda.is_available():
        batch_s = batch_s.cuda()
        batch_a = batch_a.cuda()
        batch_r = batch_r.cuda()
        batch_s_ = batch_s_.cuda()
    # ------------------------------------------------
    # Update:
    # 1. Predict with eval_net and gather the Q value of the taken action.
    # 2. Predict the next-state values with target_net (detached).
    # 3. Treat the bootstrapped value as the answer via the DQN update rule.
    # 4. Step the optimizer.
    # ------------------------------------------------
    q_eval = self.eval_net(batch_s).gather(1, batch_a)  # 1
    q_next = self.target_net(batch_s_).detach()  # 2
    q_target = batch_r + self.gamma * q_next.max(1)[0].view(
        self.batch_size, 1)  # 3
    loss = self.criterion(q_eval, q_target)
    self.optimizer.zero_grad()
    loss.backward()  # 4
    self.optimizer.step()
def output(self, entities, rels, is_target):
    '''
    Given source and rels output the target or given target and rels output
    the source vector.

    :param entities: source or target entity ids
    :param rels: rel ids
    :param is_target: True for predicting targets
    :return: flattened projected vectors as a numpy array
    '''
    ent = self.entities(util.to_var(entities, True)).unsqueeze(2)
    # each relation embedding is reshaped into a (dim x dim) projection matrix
    rel = self.rels(util.to_var(rels, True)).view(-1, self.dim, self.dim)
    if is_target:
        projected = torch.bmm(ent.transpose(1, 2), rel)
    else:
        projected = torch.bmm(rel, ent)
    flat = projected.view(-1, projected.size()[1] * projected.size()[2])
    return flat.data.cpu().numpy()
def chooseAction(self, s):
    """Map a state to a real-valued action via the evaluate actor, adding
    clipped Gaussian exploration noise while self.var > 0."""
    state_var = to_var(s)
    action = self.eval_actor(state_var).cpu().data.numpy()
    if self.var > 0:
        action = np.clip(np.random.normal(action, self.var), -2, 2)
    return action
def random_noise(image):
    """Add small Gaussian noise to a batch of images (data augmentation).

    Generalization: the original hard-coded a 4-D shape by reading
    `np.array(image.data).shape[k]` four times (crashing on any other rank
    and needlessly copying the tensor); the noise now matches `image.shape`
    for any rank.
    """
    noise_scale = 0.001
    noise = np.random.randn(*image.shape)
    image = image + util.to_var(torch.from_numpy(noise_scale * noise).float())
    # util.save_images(image)
    return image
def chooseAction(self, x):
    """Epsilon-greedy action selection: usually pick the argmax action from
    the evaluate network, occasionally pick a random action."""
    x = to_var(x)
    x = x.cuda() if torch.cuda.is_available() else x
    if np.random.uniform() > self.epsilon:
        # Let the DQN decide the action (greedy w.r.t. predicted values).
        action_value = self.eval_net(x)
        _, action = torch.max(action_value, 0)
        # legacy pre-0.4 API: unwrap the Variable to a plain int
        action = action[0].cpu().data.numpy()[0]
    else:
        # Pick a uniformly random action.
        action = np.random.randint(0, self.n_action)
    return action
def sample(self, batch_size, sequence_len):
    """Autoregressively sample token sequences starting from token id 0.

    Fix: the sampled token is now fed back as the next step's input; the
    original re-fed the constant start token every step, so the sampled
    history never influenced subsequent outputs.
    """
    x = util.to_var(torch.zeros(batch_size, 1).long())
    h, c = self.init_hidden(self.num_layers, batch_size, self.hidden_dim)
    samples = []
    for _ in range(sequence_len):
        logits, h, c = self.step(x, h, c)
        probs = F.softmax(logits, dim=1)
        sample = probs.multinomial(1)  # (batch_size, 1)
        samples.append(sample)
        x = sample  # feed back as next input
    output = torch.cat(samples, 1)
    return output  # (batch_size, sequence_len)
def sample(self, z=None, n=4, max_length=60, temperature=1.0):
    """Greedy/temperature sampling from the decoder, one token at a time.

    :param z: Batch of latent vectors
    :param n: Ignored if z is given
    :param max_length: maximum tokens generated per sequence
    :param temperature: softmax temperature for util.sample_logits
    :return: LongTensor (batch, max_length) of token ids starting with SOS
    """
    if z is None:
        z = to_var(torch.randn([n, self.latent_size]))
    batch, _ = z.size()
    hidden = self.tohidden(z)
    tokens = self.tensor(batch, max_length).fill_(PAD).long()
    tokens[:, 0] = SOS
    # re-run the decoder over the whole prefix each step and read logits at t
    for step in range(max_length - 1):
        emb = self.embedding(tokens)
        out, _ = self.decoder_rnn(emb, hidden.unsqueeze(0))
        step_logits = self.outputs2vocab(out)[:, step, :]
        tokens[:, step + 1] = util.sample_logits(step_logits, temperature)
    return tokens
def forward(self, input_sentences, input_sentence_length,
            input_conversation_length, input_masks):
    """
    Encode each sentence with BERT, re-group sentence vectors by
    conversation, run the context encoder over each conversation, and
    classify every sentence.

    Args:
        input_sentences: (Variable, LongTensor) [num_sentences, seq_len]
        input_sentence_length: per-sentence lengths (unused here — TODO confirm)
        input_conversation_length: (LongTensor) sentences per conversation
        input_masks: attention masks for BERT, same shape as input_sentences
    Return:
        output: per-sentence logits [num_sentences, num_classes]
    """
    num_sentences = input_sentences.size(0)  # NOTE(review): unused below
    max_len = input_conversation_length.max().item()
    # BERT encoding: take the [CLS] (position 0) vector from each of the
    # first `num_bert_layers` layers and average them.
    all_encoder_layers, _ = self.encoder(input_sentences,
                                         token_type_ids=None,
                                         attention_mask=input_masks)
    bert_output = []
    for idx in range(self.config.num_bert_layers):
        layer = all_encoder_layers[idx]
        bert_output.append(layer[:, 0, :])
    bert_output = torch.stack(bert_output, dim=1)
    bert_output = torch.mean(bert_output, dim=1, keepdim=False)
    # encoder_hidden: [num_sentences, hidden_size]
    encoder_hidden = bert_output
    # start offsets of each conversation in the flat sentence batch
    start = torch.cumsum(
        torch.cat((to_var(input_conversation_length.data.new(1).zero_()),
                   input_conversation_length[:-1])), 0)
    # pad and pack encoder_hidden:
    # encoder_hidden: [batch_size, max_len, hidden_size]
    encoder_hidden = torch.stack([
        pad(encoder_hidden.narrow(0, s, l), max_len) for s, l in zip(
            start.data.tolist(), input_conversation_length.data.tolist())
    ], 0)
    # context_outputs: [batch_size, max_len, context_size]
    context_outputs, context_last_hidden = self.context_encoder(
        encoder_hidden, input_conversation_length)
    # flatten outputs back to the sentence level, dropping the padding:
    # context_outputs: [num_sentences, context_size]
    context_outputs = torch.cat([
        context_outputs[i, :l, :]
        for i, l in enumerate(input_conversation_length.data)
    ])
    context_outputs = self.dropoutLayer(context_outputs)
    # project context_outputs to decoder init state, then to logits
    decoder_init = self.context2decoder(context_outputs)
    output = self.decoder2output(decoder_init)
    return output
def max_margin(self, scores):
    """Margin ranking loss of the positive column (0) against each negative
    column, averaged over the negatives."""
    n_rows = scores.size()[0]
    n_cols = scores.size()[1]
    ones = util.to_var(np.ones(n_rows, dtype='float32'), requires_grad=False)
    total = self.mm(scores[:, 0], scores[:, 1], ones)
    for col in range(2, n_cols):
        total = total + self.mm(scores[:, 0], scores[:, col], ones)
    return total / (n_cols - 1.)
def evaluate(self, data_loader, mode=None):
    """Run the model over `data_loader` without gradients and return
    (mean cross-entropy loss, weighted F1, per-sentence predictions)."""
    assert (mode is not None)
    self.model.eval()
    batch_loss_history, predictions, ground_truth = [], [], []
    for batch_i, (conversations, labels, conversation_length, sentence_length,
                  type_ids, masks) in enumerate(data_loader):
        # conversations: (batch_size) list of conversations
        #   conversation: list of sentences
        #   sentence: list of tokens
        # conversation_length: list of int
        # sentence_length: (batch_size) list of conversation list of sentence_lengths
        input_conversations = conversations
        # flatten input and target conversations
        input_sentences = [
            sent for conv in input_conversations for sent in conv
        ]
        input_labels = [label for conv in labels for label in conv]
        input_sentence_length = [
            l for len_list in sentence_length for l in len_list
        ]
        input_conversation_length = [l for l in conversation_length]
        input_masks = [mask for conv in masks for mask in conv]
        # keep the plain-python labels for metric computation
        orig_input_labels = input_labels
        with torch.no_grad():
            # transfering the input to cuda (to_var also wraps as tensors)
            input_sentences = to_var(torch.LongTensor(input_sentences))
            input_labels = to_var(torch.LongTensor(input_labels))
            input_sentence_length = to_var(
                torch.LongTensor(input_sentence_length))
            input_conversation_length = to_var(
                torch.LongTensor(input_conversation_length))
            input_masks = to_var(torch.LongTensor(input_masks))
            sentence_logits = self.model(input_sentences,
                                         input_sentence_length,
                                         input_conversation_length,
                                         input_masks)
            present_predictions = list(
                np.argmax(sentence_logits.detach().cpu().numpy(), axis=1))
            loss_function = nn.CrossEntropyLoss()
            batch_loss = loss_function(sentence_logits, input_labels)
            predictions += present_predictions
            ground_truth += orig_input_labels
        assert not isnan(batch_loss.item())
        batch_loss_history.append(batch_loss.item())
    epoch_loss = np.mean(batch_loss_history)
    print_str = f'{mode} loss: {epoch_loss:.3f}\n'  # NOTE(review): built but never printed
    w_f1_score = self.print_metric(ground_truth, predictions, mode)
    return epoch_loss, w_f1_score, predictions
def main(args):
    """Train a (Goal)VAE on the configured dataset and periodically save
    checkpoints plus loss curves.

    Fixes: `raw_input` does not exist in Python 3 (this file already uses
    Python-3-only features) -> `input`; and the description file was opened
    in binary mode ('wb') while a str was written, which raises TypeError in
    Python 3 -> text mode ('w').
    """
    # cfg_file = os.path.join(args.example_config_path, args.primitive) + ".yaml"
    cfg = get_vae_defaults()
    # cfg.merge_from_file(cfg_file)
    cfg.freeze()
    batch_size = args.batch_size
    dataset_size = args.total_data_size
    if args.experiment_name is None:
        experiment_name = args.model_name
    else:
        experiment_name = args.experiment_name
    if not os.path.exists(os.path.join(args.log_dir, experiment_name)):
        os.makedirs(os.path.join(args.log_dir, experiment_name))
    description_txt = input('Please enter experiment notes: \n')
    if isinstance(description_txt, str):
        with open(
                os.path.join(args.log_dir, experiment_name,
                             experiment_name + '_description.txt'), 'w') as f:
            f.write(description_txt)
    writer = SummaryWriter(os.path.join(args.log_dir, experiment_name))
    # torch_seed = np.random.randint(low=0, high=1000)
    # np_seed = np.random.randint(low=0, high=1000)
    torch_seed = 0
    np_seed = 0
    torch.manual_seed(torch_seed)
    np.random.seed(np_seed)
    trained_model_path = os.path.join(args.model_path, args.model_name)
    if not os.path.exists(trained_model_path):
        os.makedirs(trained_model_path)
    # --- build the model for the requested task ---
    if args.task == 'contact':
        if args.start_rep == 'keypoints':
            start_dim = 24
        elif args.start_rep == 'pose':
            start_dim = 7
        if args.goal_rep == 'keypoints':
            goal_dim = 24
        elif args.goal_rep == 'pose':
            goal_dim = 7
        if args.skill_type == 'pull':
            # + 7 because single arm palm pose
            input_dim = start_dim + goal_dim + 7
        else:
            # + 14 because both arms palm pose
            input_dim = start_dim + goal_dim + 14
        output_dim = 7
        decoder_input_dim = start_dim + goal_dim
        vae = VAE(input_dim,
                  output_dim,
                  args.latent_dimension,
                  decoder_input_dim,
                  hidden_layers=cfg.ENCODER_HIDDEN_LAYERS_MLP,
                  lr=args.learning_rate)
    elif args.task == 'goal':
        if args.start_rep == 'keypoints':
            start_dim = 24
        elif args.start_rep == 'pose':
            start_dim = 7
        if args.goal_rep == 'keypoints':
            goal_dim = 24
        elif args.goal_rep == 'pose':
            goal_dim = 7
        input_dim = start_dim + goal_dim
        output_dim = goal_dim
        decoder_input_dim = start_dim
        vae = GoalVAE(input_dim,
                      output_dim,
                      args.latent_dimension,
                      decoder_input_dim,
                      hidden_layers=cfg.ENCODER_HIDDEN_LAYERS_MLP,
                      lr=args.learning_rate)
    elif args.task == 'transformation':
        input_dim = args.input_dimension
        output_dim = args.output_dimension
        decoder_input_dim = args.input_dimension - args.output_dimension
        vae = GoalVAE(input_dim,
                      output_dim,
                      args.latent_dimension,
                      decoder_input_dim,
                      hidden_layers=cfg.ENCODER_HIDDEN_LAYERS_MLP,
                      lr=args.learning_rate)
    else:
        raise ValueError('training task not recognized')
    if torch.cuda.is_available():
        vae.encoder.cuda()
        vae.decoder.cuda()
    # --- optionally resume from a checkpoint ---
    if args.start_epoch > 0:
        start_epoch = args.start_epoch
        num_epochs = args.num_epochs
        fname = os.path.join(
            trained_model_path,
            args.model_name + '_epoch_%d.pt' % args.start_epoch)
        torch_seed, np_seed = load_seed(fname)
        load_net_state(vae, fname)
        load_opt_state(vae, fname)
        # restore the args the checkpoint was trained with, but keep the
        # freshly requested epoch range
        args = load_args(fname)
        args.start_epoch = start_epoch
        args.num_epochs = num_epochs
        torch.manual_seed(torch_seed)
        np.random.seed(np_seed)
    data_dir = args.data_dir
    data_loader = DataLoader(data_dir=data_dir)
    data_loader.create_random_ordering(size=dataset_size)
    dataset = data_loader.load_dataset(start_rep=args.start_rep,
                                       goal_rep=args.goal_rep,
                                       task=args.task)
    total_loss = []
    start_time = time.time()
    print('Saving models to: ' + trained_model_path)
    kl_weight = 1.0
    print('Starting on epoch: ' + str(args.start_epoch))
    for epoch in range(args.start_epoch, args.start_epoch + args.num_epochs):
        print('Epoch: ' + str(epoch))
        epoch_total_loss = 0
        epoch_kl_loss = 0
        epoch_pos_loss = 0
        epoch_ori_loss = 0
        epoch_recon_loss = 0
        # anneal the KL coefficient towards 1 over epochs
        kl_coeff = 1 - kl_weight
        kl_weight = args.kl_anneal_rate * kl_weight
        print('KL coeff: ' + str(kl_coeff))
        for i in range(0, dataset_size, batch_size):
            vae.optimizer.zero_grad()
            input_batch, decoder_input_batch, target_batch = \
                data_loader.sample_batch(dataset, i, batch_size)
            input_batch = to_var(torch.from_numpy(input_batch))
            decoder_input_batch = to_var(torch.from_numpy(decoder_input_batch))
            z, recon_mu, z_mu, z_logvar = vae.forward(input_batch,
                                                      decoder_input_batch)
            kl_loss = vae.kl_loss(z_mu, z_logvar)
            if args.task == 'contact':
                output_r, output_l = recon_mu
                if args.skill_type == 'grasp':
                    target_batch_right = to_var(
                        torch.from_numpy(target_batch[:, 0]))
                    target_batch_left = to_var(
                        torch.from_numpy(target_batch[:, 1]))
                    pos_loss_right = vae.mse(output_r[:, :3],
                                             target_batch_right[:, :3])
                    ori_loss_right = vae.rotation_loss(
                        output_r[:, 3:], target_batch_right[:, 3:])
                    pos_loss_left = vae.mse(output_l[:, :3],
                                            target_batch_left[:, :3])
                    ori_loss_left = vae.rotation_loss(output_l[:, 3:],
                                                      target_batch_left[:, 3:])
                    pos_loss = pos_loss_left + pos_loss_right
                    ori_loss = ori_loss_left + ori_loss_right
                elif args.skill_type == 'pull':
                    target_batch = to_var(
                        torch.from_numpy(target_batch.squeeze()))
                    # TODO add flags for when we're training both arms
                    # output = recon_mu[0]  # right arm is index [0]
                    # output = recon_mu[1]  # left arm is index [1]
                    pos_loss_right = vae.mse(output_r[:, :3],
                                             target_batch[:, :3])
                    ori_loss_right = vae.rotation_loss(output_r[:, 3:],
                                                       target_batch[:, 3:])
                    pos_loss = pos_loss_right
                    ori_loss = ori_loss_right
            elif args.task == 'goal':
                target_batch = to_var(torch.from_numpy(target_batch.squeeze()))
                output = recon_mu
                if args.goal_rep == 'pose':
                    pos_loss = vae.mse(output[:, :3], target_batch[:, :3])
                    ori_loss = vae.rotation_loss(output[:, 3:],
                                                 target_batch[:, 3:])
                elif args.goal_rep == 'keypoints':
                    pos_loss = vae.mse(output, target_batch)
                    ori_loss = torch.zeros(pos_loss.shape)
            elif args.task == 'transformation':
                target_batch = to_var(torch.from_numpy(target_batch.squeeze()))
                output = recon_mu
                pos_loss = vae.mse(output[:, :3], target_batch[:, :3])
                ori_loss = vae.rotation_loss(output[:, 3:],
                                             target_batch[:, 3:])
            recon_loss = pos_loss + ori_loss
            loss = kl_coeff * kl_loss + recon_loss
            loss.backward()
            vae.optimizer.step()
            epoch_total_loss = epoch_total_loss + loss.data
            epoch_kl_loss = epoch_kl_loss + kl_loss.data
            epoch_pos_loss = epoch_pos_loss + pos_loss.data
            epoch_ori_loss = epoch_ori_loss + ori_loss.data
            epoch_recon_loss = epoch_recon_loss + recon_loss.data
            writer.add_scalar('loss/train/ori_loss', ori_loss.data, i)
            writer.add_scalar('loss/train/pos_loss', pos_loss.data, i)
            writer.add_scalar('loss/train/kl_loss', kl_loss.data, i)
            if (i / batch_size) % args.batch_freq == 0:
                if args.skill_type == 'pull' or args.task == 'goal' or args.task == 'transformation':
                    print(
                        'Train Epoch: %d [%d/%d (%f)]\tLoss: %f\tKL: %f\tPos: %f\t Ori: %f'
                        % (epoch, i, dataset_size,
                           100.0 * i / dataset_size / batch_size, loss.item(),
                           kl_loss.item(), pos_loss.item(), ori_loss.item()))
                elif args.skill_type == 'grasp' and args.task == 'contact':
                    print(
                        'Train Epoch: %d [%d/%d (%f)]\tLoss: %f\tKL: %f\tR Pos: %f\t R Ori: %f\tL Pos: %f\tL Ori: %f'
                        % (epoch, i, dataset_size,
                           100.0 * i / dataset_size / batch_size, loss.item(),
                           kl_loss.item(), pos_loss_right.item(),
                           ori_loss_right.item(), pos_loss_left.item(),
                           ori_loss_left.item()))
        print(' --avgerage loss: ')
        print(epoch_total_loss / (dataset_size / batch_size))
        loss_dict = {
            'epoch_total': epoch_total_loss / (dataset_size / batch_size),
            'epoch_kl': epoch_kl_loss / (dataset_size / batch_size),
            'epoch_pos': epoch_pos_loss / (dataset_size / batch_size),
            'epoch_ori': epoch_ori_loss / (dataset_size / batch_size),
            'epoch_recon': epoch_recon_loss / (dataset_size / batch_size)
        }
        total_loss.append(loss_dict)
        if epoch % args.save_freq == 0:
            print('\n--Saving model\n')
            print('time: ' + str(time.time() - start_time))
            save_state(net=vae,
                       torch_seed=torch_seed,
                       np_seed=np_seed,
                       args=args,
                       fname=os.path.join(
                           trained_model_path,
                           args.model_name + '_epoch_' + str(epoch) + '.pt'))
            np.savez(os.path.join(
                trained_model_path,
                args.model_name + '_epoch_' + str(epoch) + '_loss.npz'),
                     loss=np.asarray(total_loss))
    print('Done!')
    save_state(net=vae,
               torch_seed=torch_seed,
               np_seed=np_seed,
               args=args,
               fname=os.path.join(
                   trained_model_path,
                   args.model_name + '_epoch_' + str(epoch) + '.pt'))
def relation_vectors(self, ids):
    """Look up the relation embeddings for `ids` and return them as numpy."""
    embedded = self.rels(util.to_var(ids, volatile=True))
    return embedded.data.cpu().numpy()
def train():
    """Train with Adam over TRAIN_FILES, periodically evaluate on TEST_FILES,
    and save the final model/optimizer checkpoints.

    Fix: the periodic-evaluation block reset the test loss with the invalid
    statement `test_loss+ = 0` (a syntax error); corrected to `test_loss = 0`.
    """
    train_file_idxs = np.arange(0, len(TRAIN_FILES))
    test_file_idxs = np.arange(0, len(TEST_FILES))
    np.random.shuffle(train_file_idxs)
    model = get_model()
    print(model)
    criterion = nn.CrossEntropyLoss()
    # optimizer = optim.SGD(model.parameters(), lr=0.001)
    optimizer = optim.Adam(model.parameters(), lr=0.001, betas=(0.9, 0.999))
    losses = []
    # writer = SummaryWriter()
    global_step = 0
    if args.continue_latest == 1:
        model = load_model(args.checkpoint_dir, 'latest.pth', model)
        optimizer = load_optimizer(args.checkpoint_dir, 'latest.pth', optimizer)
    for epoch in range(num_epochs):
        running_loss = 0
        num_total_btches = 0
        total_correct = 0
        total_training_samples = 0
        for i in range(len(TRAIN_FILES)):
            train_data, current_labels = util.loadDataFile(
                TRAIN_FILES[train_file_idxs[i]])
            train_data = util.to_var(torch.from_numpy(train_data))
            current_labels = util.to_var(torch.from_numpy(current_labels))
            num_batches = train_data.shape[0] // args.batch_size
            print('Training file: {:5d} |num of batches: {:5d}'.format(
                i, num_batches))
            for btch in range(num_batches):
                optimizer.zero_grad()
                start_idx = btch * args.batch_size
                end_idx = (btch + 1) * args.batch_size
                current_train = train_data[start_idx:end_idx, :, :]
                btch_label = current_labels[start_idx:end_idx, :].type(
                    torch.long)
                logits = model(current_train)
                loss = criterion(logits, btch_label.view(-1))
                loss.backward()
                optimizer.step()
                preds = F.log_softmax(logits, 1)
                pred_choice = preds.data.max(1)[1]
                correct = pred_choice.eq(btch_label.view(-1).data).cpu().sum()
                total_correct += correct.item()
                running_loss += loss.item() * args.batch_size
                losses.append(loss.item())
                total_training_samples += btch_label.shape[0]
                # writer.add_scalar('loss', loss.item(), global_step)
                if btch % args.print_every == 0:
                    print(
                        'Epoch [{:5d}/{:5d}] | loss: {:6.4f} | accuracy:{:6.4f}'
                        .format(epoch + 1, num_epochs, loss.item(),
                                correct.item() / float(args.batch_size)))
                global_step += 1
                num_total_btches += 1
        print(num_total_btches * args.batch_size, total_training_samples)
        print("Epoch {} : Total training loss {:6.4f} and accuracy {:6.4f}".
              format(epoch, running_loss / total_training_samples,
                     total_correct / total_training_samples))
        log_value('training_loss', running_loss / total_training_samples,
                  epoch)
        log_value('accuracy', total_correct / total_training_samples, epoch)
        # periodic evaluation on the held-out files
        if (epoch % evaluation_epoch == 0 and epoch != 0):
            model.eval()
            pred_score = 0
            test_loss = 0
            total_test_samples = 0
            num_test_batches = 0
            with torch.no_grad():
                for i in range(len(TEST_FILES)):
                    test_data, gt = util.loadDataFile(
                        TEST_FILES[test_file_idxs[i]])
                    test_data = util.to_var(torch.from_numpy(test_data))
                    gt = util.to_var(torch.from_numpy(gt)).type(torch.long)
                    num_batches = test_data.shape[0] // args.batch_size
                    for btch in range(num_batches):
                        start_indx = btch * args.batch_size
                        end_indx = (btch + 1) * args.batch_size
                        current_test = test_data[start_indx:end_indx, :, :]
                        logits = model(current_test)
                        gt_btch = gt[start_indx:end_indx, :]
                        loss = criterion(logits, gt_btch.view(-1))
                        test_loss += loss.item() * args.batch_size
                        preds = F.log_softmax(logits, 1)
                        predictions = preds.data.max(1)[1]
                        actuals = predictions.eq(
                            gt_btch.view(-1).data).cpu().sum()
                        pred_score += actuals.item()
                        num_test_batches += 1
                        total_test_samples += gt_btch.shape[0]
            print('Evaluation loss {:6.4f} | Accuracy {:6.4f}'.format(
                test_loss / total_test_samples,
                pred_score / total_test_samples))
            log_value('evaluation_accuracy', pred_score / total_test_samples,
                      epoch)
            model.train()
    # writer.close()
    # final evaluation over the full test set
    model.eval()
    pred_score = 0
    test_loss = 0
    total_test_samples = 0
    num_test_batches = 0
    with torch.no_grad():
        for i in range(len(TEST_FILES)):
            test_data, gt = util.loadDataFile(TEST_FILES[test_file_idxs[i]])
            test_data = util.to_var(torch.from_numpy(test_data))
            gt = util.to_var(torch.from_numpy(gt)).type(torch.long)
            num_batches = test_data.shape[0] // args.batch_size
            for btch in range(num_batches):
                start_indx = btch * args.batch_size
                end_indx = (btch + 1) * args.batch_size
                current_test = test_data[start_indx:end_indx, :, :]
                logits = model(current_test)
                gt_btch = gt[start_indx:end_indx, :]
                loss = criterion(logits, gt_btch.view(-1))
                test_loss += loss.item() * args.batch_size
                preds = F.log_softmax(logits, 1)
                predictions = preds.data.max(1)[1]
                actuals = predictions.eq(gt_btch.view(-1).data).cpu().sum()
                pred_score += actuals.item()
                num_test_batches += 1
                total_test_samples += gt_btch.shape[0]
    print('Final test loss {:6.4f} | accuracy {:6.4f}'.format(
        test_loss / total_test_samples, pred_score / total_test_samples))
    save_model(args.checkpoint_dir, 'latest.pth', model, num_epochs)
    save_optimizer(args.checkpoint_dir, 'latest.pth', optimizer, num_epochs)
def train(self):
    """Run the training loop with early stopping on validation loss.

    Trains ``self.model`` for up to ``self.config.n_epoch`` epochs,
    resuming from ``self.epoch_i``. After every epoch the model is
    evaluated on the validation and test sets; whenever the validation
    loss improves, the test metrics from that epoch are recorded.
    Training stops early once the validation loss has failed to improve
    for more than ``self.config.patience`` consecutive epochs.

    Returns:
        tuple: ``(best_test_loss, best_test_f1_w, best_epoch)`` — the
        test loss and weighted test F1 at the epoch with the lowest
        validation loss, and that epoch's 1-based index. If no epoch
        ever improves (or the loop never runs) the sentinels
        ``(inf, 0.0, -1)`` are returned instead of raising NameError.
    """
    min_val_loss = np.inf
    # Sentinels so the return values are always defined, even when the
    # epoch loop never records an improvement (the original raised
    # NameError in that case).
    best_test_loss = np.inf
    best_test_f1_w = 0.0
    best_epoch = -1
    patience_counter = 0
    # Hoisted out of the batch loop: the loss module is stateless, so
    # re-creating it per batch was pure overhead.
    loss_function = nn.CrossEntropyLoss()
    for epoch_i in range(self.epoch_i, self.config.n_epoch):
        self.epoch_i = epoch_i
        batch_loss_history = []
        predictions, ground_truth = [], []
        self.model.train()
        for batch_i, (conversations, labels, conversation_length,
                      sentence_length, type_ids, masks) in enumerate(
                          tqdm(self.train_data_loader, ncols=80)):
            # conversations: (batch_size) list of conversations
            # conversation: list of sentences; sentence: list of tokens
            # conversation_length: list of int
            # sentence_length: (batch_size) list of per-conversation
            #   sentence lengths
            input_conversations = conversations

            # Flatten the nested per-conversation structure into flat
            # per-sentence lists.
            input_sentences = [
                sent for conv in input_conversations for sent in conv
            ]
            input_labels = [label for utt in labels for label in utt]
            input_sentence_length = [
                l for len_list in sentence_length for l in len_list
            ]
            input_conversation_length = [l for l in conversation_length]
            input_masks = [mask for conv in masks for mask in conv]

            # Keep a plain-Python copy of the labels for the metric
            # computation below (input_labels is rebound to a tensor).
            orig_input_labels = input_labels

            # Transfer the inputs to the model's device.
            input_sentences = to_var(torch.LongTensor(input_sentences))
            input_labels = to_var(torch.LongTensor(input_labels))
            input_sentence_length = to_var(
                torch.LongTensor(input_sentence_length))
            input_conversation_length = to_var(
                torch.LongTensor(input_conversation_length))
            input_masks = to_var(torch.LongTensor(input_masks))

            # Reset gradients before the forward pass.
            self.optimizer.zero_grad()
            sentence_logits = self.model(input_sentences,
                                         input_sentence_length,
                                         input_conversation_length,
                                         input_masks)
            present_predictions = list(
                np.argmax(sentence_logits.detach().cpu().numpy(), axis=1))
            batch_loss = loss_function(sentence_logits, input_labels)

            predictions += present_predictions
            ground_truth += orig_input_labels

            # NOTE(review): assert is stripped under `python -O`; kept
            # for parity with the original NaN-loss abort behavior.
            assert not isnan(batch_loss.item())
            batch_loss_history.append(batch_loss.item())

            if batch_i % self.config.print_every == 0:
                tqdm.write(
                    f'Epoch: {epoch_i+1}, iter {batch_i}: loss = {batch_loss.item()}'
                )

            # Back-propagation
            batch_loss.backward()
            # Gradient clipping
            torch.nn.utils.clip_grad_norm_(self.model.parameters(),
                                           self.config.clip)
            # Run optimizer
            self.optimizer.step()

        epoch_loss = np.mean(batch_loss_history)
        self.epoch_loss = epoch_loss
        print(f'Epoch {epoch_i+1} loss average: {epoch_loss:.3f}')

        self.w_train_f1 = self.print_metric(ground_truth, predictions,
                                            "train")
        self.validation_loss, self.w_valid_f1, valid_predictions = self.evaluate(
            self.valid_data_loader, mode="valid")
        self.test_loss, self.w_test_f1, test_predictions = self.evaluate(
            self.test_data_loader, mode="test")
        print(self.epoch_loss, self.w_train_f1, self.w_valid_f1,
              self.w_test_f1)

        # Early-stopping bookkeeping: remember the test metrics from the
        # epoch with the lowest validation loss seen so far.
        if self.validation_loss < min_val_loss:
            min_val_loss = self.validation_loss
            best_test_loss = self.test_loss
            best_test_f1_w = self.w_test_f1
            best_epoch = self.epoch_i + 1
            patience_counter = 0
        else:
            patience_counter += 1
        print(f'Patience counter: {patience_counter}')
        if patience_counter > self.config.patience:
            break
    return best_test_loss, best_test_f1_w, best_epoch
def main(args):
    """Train per-language projectors into a shared embedding space.

    For every language in ``args.langs`` this loads monolingual word and
    character embeddings, builds a ``ProjLanguage`` projector, and trains
    all projectors jointly with a cosine-embedding loss over bilingual
    dictionary pairs, character representations, and linguistic feature
    matrices. Periodically (every ``save_step`` batches, after the first
    epoch) the projected embeddings of all languages are written to
    ``args.common_emb_eval`` and scored on word-similarity, translation,
    QVEC and CVEC dev sets; the best-scoring embeddings and projector
    weights are kept. Checkpoints and the best encoders are saved under
    ``args.model_path``.
    """
    langs = args.langs
    embedding_path = args.mono_embedding_path
    bilingual_dict_path = args.bilingual_dict_path
    prefix = args.mono_emb_prefix
    char_prefix = args.mono_char_prefix
    model_path = args.model_path
    output_file = args.common_emb_eval
    output_file_best = args.common_emb_best
    linguistic_vec_path = args.linguistic_vec_path
    mono_dict_path = args.mono_dict_path

    # initialize model parameters
    batch_size = args.batch_size
    num_epochs = args.num_epochs
    learning_rate = args.learning_rate
    save_step = args.save_step
    log_step = save_step
    emb_size = args.word_embedding_size
    common_size = args.common_embedding_size
    kernel_num = args.kernel_num
    patience = args.patience
    max_word_length = args.max_word_length
    char_vec_size = emb_size
    filter_withs = args.filter_widths
    num_workers = args.num_workers
    top_k = args.top_k
    lg = args.lg  # weight of the linguistic-vector loss term

    # using dev sets in multilingual eval repro to select the best parameters
    eval_data_path = args.eval_data_path
    trans_path = os.path.join(eval_data_path,
                              "word_translation/wiktionary.da+en+it.dev")
    word_sim_path = os.path.join(eval_data_path, "wordsim/en+it-mws353-dev")
    mono_sim_path = os.path.join(eval_data_path, "wordsim/EN-MEN-TR-3k")
    mono_qvec_path = os.path.join(eval_data_path, "qvec/dev-en")
    mono_qvec_cca_path = os.path.join(eval_data_path, "qvec/dev-en")
    multi_qvec_cca_path = os.path.join(eval_data_path, "qvec/dev-en-da-it")

    # create model directory
    if not os.path.exists(model_path):
        os.makedirs(model_path)

    lang_matrixs = load_linguistic_vector(langs, linguistic_vec_path)
    context_langs = {}
    for lang in langs:
        context_langs[lang] = load_top_k(
            os.path.join(mono_dict_path, lang + ".top50.dict"))

    # Load vocabulary wrapper plus frozen word/char embedding tables
    # for every language.
    vocab_langs = {}
    vectors_langs = {}
    embedding_langs = {}
    char_vocab_langs = {}
    char_vectors_langs = {}
    char_embedding_langs = {}
    word2char_langs = {}
    for lang in langs:
        vocab, vectors, char_vocab, char_vectors, word_2_char = \
            build_vocab(os.path.join(embedding_path, lang + prefix),
                        os.path.join(embedding_path, lang + char_prefix),
                        emb_size, max_word_length, char_vec_size, head=True)
        embedding = nn.Embedding(len(vectors), len(vectors[0]))
        char_embedding = nn.Embedding(len(char_vectors),
                                      len(char_vectors[0]))
        vectors_langs[lang] = vectors
        vectors = convert2tensor(vectors)
        char_vectors = convert2tensor(char_vectors)
        # Word embeddings are frozen; char embeddings stay trainable
        # (no requires_grad=False on char_embedding in the original).
        embedding.weight = nn.Parameter(vectors)
        embedding.weight.requires_grad = False
        char_embedding.weight = nn.Parameter(char_vectors)
        if torch.cuda.is_available():
            embedding.cuda()
            char_embedding.cuda()
        vocab_langs[lang] = vocab
        embedding_langs[lang] = embedding
        char_vocab_langs[lang] = char_vocab
        char_vectors_langs[lang] = char_vectors
        char_embedding_langs[lang] = char_embedding
        word2char_langs[lang] = word_2_char

    # Build the models (one projector per language)
    projectors = {}
    for lang in langs:
        projector = ProjLanguage(emb_size, common_size, kernel_num,
                                 char_vec_size, filter_withs)
        if torch.cuda.is_available():
            projector.cuda()
        projectors[lang] = projector

    # Loss and Optimizer: a single Adadelta over all projectors' params.
    criterion = nn.CosineEmbeddingLoss(margin=0)
    params = []
    for lang in langs:
        params += list(projectors[lang].parameters())
    optimizer = torch.optim.Adadelta(params, lr=learning_rate)

    start = time.time()
    best_score = 0
    best_sim_score = 0
    best_model_dict = {}

    # Build data loader
    print("start to load data ... ")
    data_loader_set = get_loader_bilingual_context_char(
        bilingual_dict_path,
        langs,
        vocab_langs,
        batch_size,
        word2char_langs,
        shuffle=True,
        num_workers=num_workers,
        top_k=top_k)
    print("finish loading data. \nstart to train models ")

    total_step = 0
    for new_data_loader in data_loader_set:
        (lang1, lang2, data_loader) = new_data_loader
        total_step = len(data_loader)
        print("total step ", total_step)

    # One monolingual loader per language, used only when dumping the
    # projected embeddings for evaluation.
    data_loader_mono_set = {}
    for lang in langs:
        vocab_lang = vocab_langs[lang]
        context_lang = context_langs[lang]
        word2char_lang = word2char_langs[lang]
        data_loader = get_loader_mono_context_char(os.path.join(
            embedding_path, lang + prefix),
                                                   vocab_lang,
                                                   context_lang,
                                                   word2char_lang,
                                                   head=True,
                                                   batch_size=batch_size,
                                                   shuffle=False,
                                                   num_workers=num_workers,
                                                   top_k=top_k)
        data_loader_mono_set[lang] = data_loader

    i0 = 0  # global batch counter across epochs
    current_patience = 0
    for epoch in range(num_epochs):
        # Stop once the decayed learning rate falls below the threshold.
        if learning_rate < 0.01:
            break
        epoch_start = time.time()
        # NOTE(review): only the first bilingual pair is trained on here;
        # confirm this is intended when more than two languages are given.
        (lang01, lang02, data_loader_0) = data_loader_set[0]
        (matrix01_orig, matrix02_orig) = lang_matrixs[lang01 + "#" + lang02]
        for ids01, ids02, ids01_context, ids02_context, char_ids01, char_ids02 in data_loader_0:
            # Set mini-batch dataset
            ids01 = torch.FloatTensor(ids01).long()
            ids02 = torch.FloatTensor(ids02).long()
            ids01_context = torch.FloatTensor(ids01_context).long()
            ids02_context = torch.FloatTensor(ids02_context).long()
            char_ids01 = torch.FloatTensor(char_ids01).long()
            char_ids02 = torch.FloatTensor(char_ids02).long()

            # Sample a random batch-size window of the linguistic
            # feature matrices when they are larger than a batch.
            if len(matrix01_orig) > batch_size:
                gap = len(matrix01_orig) - batch_size
                rand = random.randint(0, gap)
                matrix01 = matrix01_orig[rand:rand + batch_size][:]
                matrix02 = matrix02_orig[rand:rand + batch_size][:]
            else:
                matrix01 = matrix01_orig[:][:]
                matrix02 = matrix02_orig[:][:]
            matrix01 = torch.from_numpy(matrix01).float()
            matrix02 = torch.from_numpy(matrix02).float()

            if torch.cuda.is_available():
                ids01 = to_var(ids01)
                ids02 = to_var(ids02)
                ids01_context = to_var(ids01_context)
                ids02_context = to_var(ids02_context)
                char_ids01 = to_var(char_ids01)
                char_ids02 = to_var(char_ids02)
                matrix01 = to_var(matrix01)
                matrix02 = to_var(matrix02)

            for langTmp in langs:
                projectors[langTmp].zero_grad()

            input01 = embedding_langs[lang01](ids01)
            input02 = embedding_langs[lang02](ids02)
            input01_context = embedding_langs[lang01](ids01_context)
            input02_context = embedding_langs[lang02](ids02_context)
            # Average the context-word embeddings into one vector each.
            input01_context = torch.mean(input01_context, 1)
            input02_context = torch.mean(input02_context, 1)

            # Flatten (batch, word_len) char ids, embed, and restore the
            # (batch, word_len, char_dim) shape.
            char_ids01_tmp = char_ids01.view(
                char_ids01.size(0) * char_ids01.size(1))
            char_ids02_tmp = char_ids02.view(
                char_ids02.size(0) * char_ids02.size(1))
            input_char01 = char_embedding_langs[lang01](char_ids01_tmp)
            input_char02 = char_embedding_langs[lang02](char_ids02_tmp)
            input_char01 = input_char01.view(char_ids01.size(0),
                                             char_ids01.size(1), -1)
            input_char02 = input_char02.view(char_ids02.size(0),
                                             char_ids02.size(1), -1)

            # Forward, Backward and Optimize. First pass gets lang01's
            # features; the cross-decoding passes then condition each
            # language's decoder on the other language's features.
            features01, output_char01, decoded_input01, decoded_input01_context = \
                projectors[lang01].forward(input01, input01_context, input_char01)
            features02, output_char02, decoded_input02, decoded_input02_context, \
                cross_decoded_input02, cross_decoded_input02_context = \
                projectors[lang02].forward(input02, input02_context, input_char02, features01)
            features01, output_char01, decoded_input01, decoded_input01_context, \
                cross_decoded_input01, cross_decoded_input01_context = \
                projectors[lang01].forward(input01, input01_context, input_char01, features02)

            linguistic_encoded_01, linguistic_decoded_01 = projectors[
                lang01].forward(matrix01)
            linguistic_encoded_02, linguistic_decoded_02, cross_linguistic_decoded_02 = \
                projectors[lang02].forward(matrix02, cross_encoded=linguistic_encoded_01)
            linguistic_encoded_01, linguistic_decoded_01, cross_linguistic_decoded_01 = \
                projectors[lang01].forward(matrix01, cross_encoded=linguistic_encoded_02)

            # Target labels of +1: CosineEmbeddingLoss pulls pairs together.
            linguistic_label0 = Variable(
                torch.ones(linguistic_encoded_01.size(0)))
            label00 = Variable(torch.ones(features01.size(0)))
            if torch.cuda.is_available():
                features01 = features01.cuda()
                features02 = features02.cuda()
                decoded_input01 = decoded_input01.cuda()
                decoded_input02 = decoded_input02.cuda()
                cross_decoded_input01 = cross_decoded_input01.cuda()
                cross_decoded_input02 = cross_decoded_input02.cuda()
                decoded_input01_context = decoded_input01_context.cuda()
                decoded_input02_context = decoded_input02_context.cuda()
                cross_decoded_input01_context = cross_decoded_input01_context.cuda()
                cross_decoded_input02_context = cross_decoded_input02_context.cuda()
                label00 = label00.cuda()
                output_char01 = output_char01.cuda()
                output_char02 = output_char02.cuda()
                linguistic_encoded_01 = linguistic_encoded_01.cuda()
                linguistic_encoded_02 = linguistic_encoded_02.cuda()
                linguistic_label0 = linguistic_label0.cuda()

            # Alignment + (cross-)reconstruction losses on words and contexts.
            loss = 0
            loss += criterion(features01, features02, label00)
            loss += criterion(input01, decoded_input01, label00)
            loss += criterion(input02, decoded_input02, label00)
            loss += criterion(input01, cross_decoded_input01, label00)
            loss += criterion(input02, cross_decoded_input02, label00)
            loss += criterion(input01_context, decoded_input01_context,
                              label00)
            loss += criterion(input01_context,
                              cross_decoded_input01_context, label00)
            loss += criterion(input02_context, decoded_input02_context,
                              label00)
            loss += criterion(input02_context,
                              cross_decoded_input02_context, label00)
            char_loss = 0
            char_loss += criterion(output_char01, output_char02, label00)
            linguistic_loss = 0
            linguistic_loss += criterion(linguistic_encoded_01,
                                         linguistic_encoded_02,
                                         linguistic_label0)
            loss = loss + char_loss + lg * linguistic_loss
            loss.backward()
            optimizer.step()

            # Print log info / periodic evaluation
            if epoch > 0 and i0 % log_step == 0:
                if os.path.exists(output_file):
                    os.remove(output_file)
                # `with` ensures the dump file is closed even if an
                # evaluation below raises (the original leaked the handle).
                with open(output_file, "w") as out:
                    for langTmp in langs:
                        data_loader = data_loader_mono_set[langTmp]
                        for i, (ids, context_ids,
                                char_ids) in enumerate(data_loader):
                            ids = torch.FloatTensor(ids).long()
                            context_ids = torch.FloatTensor(
                                context_ids).long()
                            char_ids = torch.FloatTensor(char_ids).long()
                            if torch.cuda.is_available():
                                ids = to_var(ids)
                                context_ids = to_var(context_ids)
                                char_ids = to_var(char_ids)
                            proj = projectors[langTmp]
                            input1 = embedding_langs[langTmp](ids)
                            input1_contexts = embedding_langs[langTmp](
                                context_ids)
                            input1_contexts = torch.mean(input1_contexts, 1)
                            char_ids_tmp = char_ids.view(
                                char_ids.size(0) * char_ids.size(1))
                            input_char = char_embedding_langs[langTmp](
                                char_ids_tmp)
                            input_char = input_char.view(
                                char_ids.size(0), char_ids.size(1), -1)
                            features, output_char, decoded_input, decoded_input_context = \
                                proj.forward(input1, input1_contexts, input_char)
                            # Final common embedding = word features + char features.
                            features = torch.cat((features, output_char), 1)
                            vocab = vocab_langs[langTmp]
                            features = features.data.cpu().numpy()
                            ids = ids.data.cpu().numpy()
                            for j in range(0, len(ids)):
                                word = vocab.idx2word[ids[j]]
                                out.write(langTmp + ":" + word)
                                for m in range(len(features[j])):
                                    out.write(" " + str(features[j][m]))
                                out.write("\n")

                mono_sim_score, mono_sim_coverate = evaluate_word_sim(
                    mono_sim_path, output_file)
                multi_sim_score, multi_sim_coverage = evaluate_word_sim(
                    word_sim_path, output_file)
                multi_trans_score, multi_trans_coverage = evaluate(
                    trans_path, output_file)
                mono_qvec_score, mono_qvec_coverate = evaluate_qvec(
                    mono_qvec_path, output_file)
                multi_qvec_score, multi_qvec_coverate = evaluate_qvec(
                    multi_qvec_cca_path, output_file)
                mono_cvec_score, mono_cvec_coverate = evaluate_cvec(
                    mono_qvec_cca_path, output_file)
                multi_cvec_score, multi_cvec_coverate = evaluate_cvec(
                    multi_qvec_cca_path, output_file)
                # NOTE(review): multi_qvec_score is printed but excluded
                # from the selection score — confirm this is intended.
                score = mono_sim_score + multi_sim_score + multi_trans_score + \
                    mono_qvec_score + mono_cvec_score + multi_cvec_score
                print(
                    "mono_sim: %.4f, multi_sim: %.4f, multi_trans: %.4f, mono_qvec: %.4f, multi_qvec: %.4f, "
                    "mono_cvec: %.4f, multi_cvec: %.4f" %
                    (mono_sim_score, multi_sim_score, multi_trans_score,
                     mono_qvec_score, multi_qvec_score, mono_cvec_score,
                     multi_cvec_score))
                print("\n")
                if score > best_score:
                    shutil.copyfile(output_file, output_file_best)
                    current_patience = 0
                    best_score = score
                    for tmp in langs:
                        best_model_dict[tmp] = projectors[tmp].state_dict()
                else:
                    current_patience += 1
                if current_patience > patience:
                    # Halve the learning rate on a score plateau.
                    # BUGFIX: the original only rebound the local
                    # variable, so the decay never reached the optimizer;
                    # push the new rate into every param group.
                    learning_rate = learning_rate * 0.5
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = learning_rate
                    current_patience = 0
                epoch_end = time.time()
                epoch_time = epoch_end - epoch_start
                # Consistent with the rest of the file: .item() instead
                # of the deprecated loss.data[0].
                loss_value = loss.item()
                print(
                    'Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Best Score: %.4f, Best WordSim: %.4f, Learning_Rate: '
                    '%.4f, CurrentPatience: %.4f, Perplexity: %5.4f, Time: %d'
                    % (epoch, num_epochs, i0, total_step, loss_value,
                       best_score, best_sim_score, learning_rate,
                       current_patience, np.exp(loss_value), epoch_time))
                epoch_start = time.time()

            # Save the models
            if (epoch + 1) % save_step == 0:
                for tmp in langs:
                    torch.save(
                        projectors[tmp].state_dict(),
                        os.path.join(
                            model_path,
                            tmp + '-encoder-%d-%d.pkl' % (epoch + 1, i0 + 1)))
            i0 += 1

    end = time.time()
    all_time = end - start
    print('Overall training time %d' % all_time)
    for lang in langs:
        # Guard: if no evaluation ever improved the score the dict is
        # empty; the original raised KeyError here.
        if lang in best_model_dict:
            torch.save(best_model_dict[lang],
                       os.path.join(model_path, lang + '-best-encoder.pkl'))