def predict():
    import torch
    from torch.utils.data import DataLoader
    from utils import FileIO, Utils
    from word_process import WordProcess
    # TextData2, Encoder3, AttnDecoder3, evaluate, use_cuda and batch_size
    # are defined elsewhere in this module.

    # Path to the data txt file on disk.
    path_base = '../data/'
    path_file = path_base + 'bytecup.corpus.train.0.50k.txt'

    fio = FileIO()
    word = WordProcess(path_base, is_model_load=False, is_dict_load=True)
    contents, titles = fio.load_from_json(path_file)

    # 80/20 train/validation split.
    total_size = len(titles)
    num_samples = int(total_size * 0.8)
    num_test = total_size - num_samples
    print('num samples:', num_samples, 'num tests:', num_test)

    # Longest sequence on each side, plus two slots for start/end markers.
    max_encoder_seq_length = max(len(txt) for txt in contents) + 2
    max_decoder_seq_length = max(len(txt) for txt in titles) + 2
    print('max_lengths:', max_encoder_seq_length, ' ', max_decoder_seq_length)

    train_data = {
        'contents': contents[:num_samples],
        'titles': titles[:num_samples]
    }
    test_data = {
        'contents': contents[num_samples:],
        'titles': titles[num_samples:]
    }
    datasets = {
        'train': TextData2(train_data, word.dic,
                           train_len=max_encoder_seq_length,
                           label_len=max_decoder_seq_length),
        'val': TextData2(test_data, word.dic,
                         train_len=max_encoder_seq_length,
                         label_len=max_decoder_seq_length)
    }
    # Built for parity with main(); evaluate() below reads `datasets`
    # directly, so these loaders go unused here.
    data_loads = {
        x: DataLoader(datasets[x], batch_size=batch_size,
                      shuffle=True, num_workers=15)
        for x in ['train', 'val']
    }

    encoder = Encoder3(voca_size=84031, embedd_size=128, hidden_size=256)
    decoder = AttnDecoder3(hidden_size=256, vocab_size=84031)
    if use_cuda:
        encoder.cuda()
        decoder.cuda()

    # Restore the best checkpoint saved by training, converting GPU-saved
    # weights so they load on the current device.
    best_model = torch.load(path_base + '50k.1.best_model_wts')
    best_model = Utils().gpu_model_to_cpu_model(best_model)
    encoder.load_state_dict(best_model[0])
    decoder.load_state_dict(best_model[1])

    out = evaluate(encoder, decoder, datasets)
    with open(path_base + '50k.1.predict', 'a') as file1:
        for i, o in enumerate(out):
            # Map predicted token ids back to words and append the
            # reference title for side-by-side comparison.
            file1.write(str([word.dic[int(t)] for t in o.data[0]]))
            file1.write(str(test_data['titles'][i]) + '\n')
    print('predict done!')
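# evaluate() above is defined elsewhere in this module. The sketch below is a
# hypothetical stand-in, assuming a greedy decoding loop and assumed call
# signatures (encoder(inputs) -> (encoder_outputs, hidden);
# decoder(token, hidden, encoder_outputs) -> (logits, hidden)), with made-up
# sos_id/eos_id markers. It also returns plain id lists rather than the
# tensor objects the write loop above indexes with .data[0]; the real
# implementation may differ.
def evaluate_sketch(encoder, decoder, datasets, max_len=20, sos_id=1, eos_id=2):
    outputs = []
    for content, _title in datasets['val']:
        inputs = content.unsqueeze(0)                # add a batch dimension
        if use_cuda:
            inputs = inputs.cuda()
        encoder_outputs, hidden = encoder(inputs)
        token = inputs.new_full((1, 1), sos_id)      # assumed start marker
        decoded = []
        for _ in range(max_len):
            logits, hidden = decoder(token, hidden, encoder_outputs)
            token = logits.argmax(dim=-1)            # greedy pick per step
            if int(token) == eos_id:                 # assumed end marker
                break
            decoded.append(int(token))
        outputs.append(decoded)
    return outputs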
def main():
    import torch.nn as nn
    import torch.optim as optim
    from torch.utils.data import DataLoader
    from utils import FileIO
    from word_process import WordProcess
    # TextData2, Encoder3, AttnDecoder3, train_model, HistoryLoss and
    # batch_size are defined elsewhere in this module.

    # Path to the data txt file on disk.
    path_base = '../data/'
    path_file = path_base + 'bytecup.corpus.train.0.50k.txt'

    fio = FileIO()
    word = WordProcess(path_base, is_model_load=False, is_dict_load=True)
    dic = word.dic
    contents, titles = fio.load_from_json(path_file)

    # 80/20 train/validation split.
    total_size = len(titles)
    num_samples = int(total_size * 0.8)
    num_test = total_size - num_samples
    print('num samples:', num_samples, 'num tests:', num_test)

    # Longest sequence on each side, plus two slots for start/end markers.
    max_encoder_seq_length = max(len(txt) for txt in contents) + 2
    max_decoder_seq_length = max(len(txt) for txt in titles) + 2
    print('max_lengths:', max_encoder_seq_length, ' ', max_decoder_seq_length)

    train_data = {
        'contents': contents[:num_samples],
        'titles': titles[:num_samples]
    }
    test_data = {
        'contents': contents[num_samples:],
        'titles': titles[num_samples:]
    }
    datasets = {
        'train': TextData2(train_data, dic,
                           train_len=max_encoder_seq_length,
                           label_len=max_decoder_seq_length),
        'val': TextData2(test_data, dic,
                         train_len=max_encoder_seq_length,
                         label_len=max_decoder_seq_length)
    }
    data_loads = {
        x: DataLoader(datasets[x], batch_size=batch_size,
                      shuffle=True, num_workers=15)
        for x in ['train', 'val']
    }

    encoder = Encoder3(voca_size=84031, embedd_size=128, hidden_size=256)
    decoder = AttnDecoder3(hidden_size=256, vocab_size=84031)

    # One SGD optimizer over both models, with one parameter group each so
    # the scheduler can decay them independently.
    optimizer = optim.SGD(
        [{'params': encoder.parameters(), 'lr': 0.01},
         {'params': decoder.parameters(), 'lr': 0.01}],
        lr=0.01, momentum=0.9)

    # Exponential decay: each group's lr is scaled by 0.95 ** epoch.
    lambda2 = lambda epoch: 0.95 ** epoch
    scheduler = optim.lr_scheduler.LambdaLR(optimizer,
                                            lr_lambda=[lambda2, lambda2])

    # Criterion bundle consumed by train_model: cosine similarity over the
    # sequence dimension for the loss, MSE as an accuracy-style metric.
    criterion = {'loss': nn.CosineSimilarity(dim=2), 'acc': nn.MSELoss()}
    loss_history = HistoryLoss()
    train_model(encoder, decoder, data_loads, criterion, scheduler,
                loss_history)
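# Hypothetical script entry point (not part of the original module): run
# training by default, or prediction when 'predict' is passed on the
# command line. The sys.argv handling here is illustrative only.
if __name__ == '__main__':
    import sys
    if len(sys.argv) > 1 and sys.argv[1] == 'predict':
        predict()
    else:
        main()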