def get_prediction(source_path, target_path, model, batch_size=32, max_len=50):
    """Decode translations for every line in *source_path* with greedy search.

    NOTE(review): the original comment said "use beam search", but the code
    performs greedy (argmax) decoding — one token per step, no beam.

    Args:
        source_path: path to the source-language (fr) lines file.
        target_path: path to the target-language (en) lines file.
        model: encoder/decoder exposing ``encode(input, mask)`` and
            ``decode(input, state)`` returning (logits, state) — assumed from
            usage here; confirm against the model class.
        batch_size: decoding batch size (previously hard-coded to 32).
        max_len: maximum number of decoding steps per sentence.

    Returns:
        (preds, targets): list of predicted token lists and the corresponding
        reference token lists.
    """
    preds = []
    targets = []
    base_dir = "/home/FuDawei/NLP/Machine_Translation/dataset/en-fr-no/"
    source_lines, source_word2id, source_id2word = get_origin_data(
        source_path, base_dir + "fr_word2id.json", base_dir + "fr_id2word.json")
    target_lines, target_word2id, target_id2word = get_origin_data(
        target_path, base_dir + "en_word2id.json", base_dir + "en_id2word.json")
    # Inference only: disable autograd to save memory and time.
    with torch.no_grad():
        for start_index in range(0, len(source_lines), batch_size):
            source_input, _, _, _ = generate_batch(
                source_lines, source_word2id, source_id2word, batch_size,
                start_index, True, True, max_len)
            # Only the reference tokens are needed from the target side; the
            # original also built target tensors on the device and never used
            # them.
            _, _, _, target_output_token = generate_batch(
                target_lines, target_word2id, target_id2word, batch_size,
                start_index, True, True, max_len)
            source_input = torch.tensor(source_input).to(device)
            cur_batch = source_input.size(0)  # last batch may be smaller
            source_input_mask = compute_mask(source_input)
            batch_preds = [[] for _ in range(cur_batch)]
            finished = [0] * cur_batch  # 1 once a sentence emitted EOS
            encode_state = model.encode(source_input, source_input_mask)
            # Start every sentence from the start-of-sentence token.
            decode_input = torch.tensor(SOS_ID).repeat(cur_batch).view(
                cur_batch, 1).to(device)
            for _ in range(max_len):
                # output_logits: (cur_batch, 1, vocab_size)
                output_logits, decoder_state = model.decode(
                    decode_input, encode_state)
                output_logits = output_logits.squeeze(1)
                output_id = torch.argmax(output_logits, 1).tolist()
                for idx in range(cur_batch):
                    if finished[idx]:
                        continue
                    token_id = output_id[idx]  # renamed: `id` shadowed builtin
                    if token_id == EOS_ID:
                        finished[idx] = 1
                    else:
                        batch_preds[idx].append(target_id2word[str(token_id)])
                if sum(finished) == cur_batch:
                    break
                # Feed the predicted tokens back as the next decoder input.
                decode_input = torch.tensor(
                    output_id, device=device).view(cur_batch, 1)
                encode_state = decoder_state
            preds.extend(batch_preds)
            targets.extend(target_output_token)
    print(preds[:2])
    print(targets[:2])
    return preds, targets
def train(opt):
    """Train a Transformer NMT model (fr -> en), periodically dumping dev
    predictions/targets to JSON and halving the learning rate each epoch.

    Args:
        opt: dict of hyper-parameters — vocab sizes, d_model,
            key/value hidden sizes, heads, ff_size, drop_rate, lr_rate, epoch.
    """
    source_vocab_size = opt["source_vocab_size"]
    target_vocab_size = opt["target_vocab_size"]
    d_model = opt["d_model"]
    key_hidden_size = opt["key_hidden_size"]
    value_hidden_size = opt["value_hidden_size"]
    heads = opt["heads"]
    ff_size = opt["ff_size"]
    drop_rate = opt["drop_rate"]
    lr_rate = opt["lr_rate"]
    model = Transformer(source_vocab_size, target_vocab_size, d_model,
                        key_hidden_size, value_hidden_size, heads, ff_size,
                        drop_rate).to(device)
    base_dir = "/home/FuDawei/NLP/Machine_Translation/dataset/en-fr-no/"
    source_lines, source_word2id, source_id2word = get_origin_data(
        base_dir + "frtrain_lines", base_dir + "fr_word2id.json",
        base_dir + "fr_id2word.json")
    target_lines, target_word2id, target_id2word = get_origin_data(
        base_dir + "entrain_lines", base_dir + "en_word2id.json",
        base_dir + "en_id2word.json")
    # BUG FIX: filter() is a one-shot iterator.  The original handed it to
    # optim.Adam (which exhausted it) and then to clip_grad_norm_ below, which
    # therefore received an empty iterable and clipped nothing.  Materialize
    # the parameter list once.
    parameters = [p for p in model.parameters() if p.requires_grad]
    optimizer = optim.Adam(parameters, lr=lr_rate)
    print_every = 1000
    cnt = 0
    total = 0
    save_dir = "/home/FuDawei/NLP/Machine_Translation/dataset/save_res/"
    batch_size = 64
    max_len = 50
    for ep in range(opt["epoch"]):
        for start_index in range(0, len(source_lines), batch_size):
            source_input, _, source_input_token, _ = generate_batch(
                source_lines, source_word2id, source_id2word, batch_size,
                start_index, False, True, max_len)
            target_input, target_output, target_input_token, target_output_token = generate_batch(
                target_lines, target_word2id, target_id2word, batch_size,
                start_index, True, True, max_len)
            source_input = torch.tensor(source_input).to(device)
            target_input, target_output = torch.tensor(target_input).to(
                device), torch.tensor(target_output).to(device)
            decoder_logits = model(source_input, target_input)
            loss = model.compute_loss(decoder_logits, target_output)
            optimizer.zero_grad()
            loss.backward()
            nn.utils.clip_grad_norm_(parameters, 5)
            optimizer.step()
            cnt += 1
            total += loss.item()
            if cnt % print_every == 0:
                # BUG FIX: average over print_every, not a hard-coded 1000,
                # so the reported loss stays correct if print_every changes.
                print(total / print_every)
                total = 0
            if cnt % 10000 == 0:
                # Evaluate on the dev set and persist predictions/references.
                model.eval()
                preds, targets = get_prediction(base_dir + "frdev_lines",
                                                base_dir + "endev_lines",
                                                model)
                with open(
                        save_dir + str(cnt) + "origin_transformer_preds.json",
                        "w") as f:
                    json.dump(preds, f)
                with open(
                        save_dir + str(cnt) + "origin_transformer_targets.json",
                        "w") as f:
                    json.dump(targets, f)
                model.train()
        print(ep)
        # Simple step decay: halve the learning rate after every epoch.
        lr_rate = lr_rate * 0.5
        for param_group in optimizer.param_groups:
            param_group["lr"] = lr_rate
def train(opt):
    """Train a basic (non-attention) seq2seq NMT model (fr -> en),
    periodically dumping dev predictions/targets to JSON.

    Args:
        opt: dict of hyper-parameters — vocab sizes, hidden/embedding sizes,
            batch_size, max_len, learning_rate, epoch.
    """
    source_vocab_size = opt["source_vocab_size"]
    target_vocab_size = opt["target_vocab_size"]
    source_hidden_size = opt["source_hidden_size"]
    target_hidden_size = opt["target_hidden_size"]
    source_emb_size = opt["source_emb_size"]
    target_emb_size = opt["target_emb_size"]
    batch_size = opt["batch_size"]
    max_len = opt["max_len"]
    # BUG FIX(review): the original passed source_hidden_size twice, leaving
    # target_hidden_size unused — pass the target size as intended.  Confirm
    # against Basic_NMT's constructor signature.
    model = Basic_NMT(source_vocab_size, target_vocab_size, source_emb_size,
                      target_emb_size, source_hidden_size,
                      target_hidden_size).to(device)
    base_dir = "/home/FuDawei/NLP/Machine_Translation/dataset/en-fr-no/"
    source_lines, source_word2id, source_id2word = get_origin_data(
        base_dir + "frtrain_lines", base_dir + "fr_word2id.json",
        base_dir + "fr_id2word.json")
    target_lines, target_word2id, target_id2word = get_origin_data(
        base_dir + "entrain_lines", base_dir + "en_word2id.json",
        base_dir + "en_id2word.json")
    # BUG FIX: filter() is a one-shot iterator; after optim.Adam consumed it,
    # the clip_grad_norm_ call below received an empty iterable and clipped
    # nothing.  Materialize the parameter list once.
    parameters = [p for p in model.parameters() if p.requires_grad]
    optimizer = optim.Adam(parameters, lr=opt["learning_rate"])
    print_every = 1000
    cnt = 0
    total = 0
    save_dir = "/home/FuDawei/NLP/Machine_Translation/dataset/save_res/"
    for ep in range(opt["epoch"]):
        for start_index in range(0, len(source_lines), batch_size):
            source_input, _, source_input_token, _ = generate_batch(
                source_lines, source_word2id, source_id2word, batch_size,
                start_index, False, True, max_len)
            target_input, target_output, target_input_token, target_output_token = generate_batch(
                target_lines, target_word2id, target_id2word, batch_size,
                start_index, True, True, max_len)
            source_input = torch.tensor(source_input).to(device)
            source_input_mask = compute_mask(source_input)
            target_input, target_output = torch.tensor(target_input).to(
                device), torch.tensor(target_output).to(device)
            decoder_logits = model(source_input, source_input_mask,
                                   target_input)
            loss = model.compute_loss(decoder_logits, target_output)
            optimizer.zero_grad()
            loss.backward()
            nn.utils.clip_grad_norm_(parameters, 10)
            optimizer.step()
            cnt += 1
            total += loss.item()
            if cnt % print_every == 0:
                # BUG FIX: average over print_every, not a hard-coded 1000.
                print(total / print_every)
                total = 0
            if cnt % 10000 == 0:
                # Evaluate on the dev set and persist predictions/references.
                model.eval()
                preds, targets = get_prediction(base_dir + "frdev_lines",
                                                base_dir + "endev_lines",
                                                model)
                with open(save_dir + str(cnt) + "preds.json", "w") as f:
                    json.dump(preds, f)
                with open(save_dir + str(cnt) + "targets.json", "w") as f:
                    json.dump(targets, f)
                model.train()
def train(opt):
    """Train an attention-based seq2seq NMT model (fr -> en), periodically
    dumping dev predictions/targets to JSON and decaying the learning rate
    every other epoch.

    Args:
        opt: dict of hyper-parameters — vocab sizes, hidden/embedding sizes,
            batch_size, max_len, learning_rate, epoch.
    """
    source_vocab_size = opt["source_vocab_size"]
    target_vocab_size = opt["target_vocab_size"]
    source_hidden_size = opt["source_hidden_size"]
    target_hidden_size = opt["target_hidden_size"]
    source_emb_size = opt["source_emb_size"]
    target_emb_size = opt["target_emb_size"]
    batch_size = opt["batch_size"]
    max_len = opt["max_len"]
    lr_rate = opt["learning_rate"]
    # BUG FIX(review): the original passed source_hidden_size twice, leaving
    # target_hidden_size unused — pass the target size as intended.  Confirm
    # against Attention_NMT's constructor signature.
    model = Attention_NMT(source_vocab_size, target_vocab_size,
                          source_emb_size, target_emb_size,
                          source_hidden_size, target_hidden_size).to(device)
    base_dir = "/home/FuDawei/NLP/Machine_Translation/dataset/en-fr-no/"
    source_lines, source_word2id, source_id2word = get_origin_data(
        base_dir + "frtrain_lines", base_dir + "fr_word2id.json",
        base_dir + "fr_id2word.json")
    target_lines, target_word2id, target_id2word = get_origin_data(
        base_dir + "entrain_lines", base_dir + "en_word2id.json",
        base_dir + "en_id2word.json")
    # BUG FIX: filter() is a one-shot iterator; after optim.Adam consumed it,
    # the clip_grad_norm_ call below received an empty iterable and clipped
    # nothing.  Materialize the parameter list once.
    parameters = [p for p in model.parameters() if p.requires_grad]
    optimizer = optim.Adam(parameters, lr=lr_rate)
    print_every = 1000
    cnt = 0
    total = 0
    save_dir = "/home/FuDawei/NLP/Machine_Translation/dataset/save_res/"
    for ep in range(opt["epoch"]):
        for start_index in range(0, len(source_lines), batch_size):
            source_input, _, source_input_token, _ = generate_batch(
                source_lines, source_word2id, source_id2word, batch_size,
                start_index, False, True, max_len)
            target_input, target_output, target_input_token, target_output_token = generate_batch(
                target_lines, target_word2id, target_id2word, batch_size,
                start_index, True, True, max_len)
            source_input = torch.tensor(source_input).to(device)
            source_input_mask = compute_mask(source_input)
            target_input, target_output = torch.tensor(target_input).to(
                device), torch.tensor(target_output).to(device)
            decoder_logits = model(source_input, source_input_mask,
                                   target_input)
            loss = model.compute_loss(decoder_logits, target_output)
            optimizer.zero_grad()
            loss.backward()
            nn.utils.clip_grad_norm_(parameters, 10)
            optimizer.step()
            cnt += 1
            total += loss.item()
            if cnt % print_every == 0:
                # BUG FIX: average over print_every, not a hard-coded 1000.
                print(total / print_every)
                total = 0
            if cnt % 10000 == 0:
                # Evaluate on the dev set and persist predictions/references.
                model.eval()
                preds, targets = get_prediction(base_dir + "frdev_lines",
                                                base_dir + "endev_lines",
                                                model)
                with open(save_dir + str(cnt) + "attention_preds.json",
                          "w") as f:
                    json.dump(preds, f)
                with open(save_dir + str(cnt) + "attention_targets.json",
                          "w") as f:
                    json.dump(targets, f)
                model.train()
        # Halve the learning rate every other epoch (after epoch 0).
        if ep > 0 and ep % 2 == 0:
            lr_rate *= 0.5
            for param_group in optimizer.param_groups:
                param_group["lr"] = lr_rate