def eval_split(models, crits, loader, json_path, eval_kwargs={}, flag_eval_what='tap', debug=False):
    split = eval_kwargs.get('split', 'val')
    lang_eval = eval_kwargs.get('language_eval', 1)
    val_score_thres = eval_kwargs.get('val_score_thres', 0)
    nms_threshold = eval_kwargs.get('nms_threshold', 0)
    is_reranking = eval_kwargs.get('reranking', False)
    print('is_reranking', is_reranking)
    topN = eval_kwargs.get('topN', 1000)
    get_eval_loss = eval_kwargs.get('get_eval_loss', 1)

    tap_model, cg_model = models
    tap_crit, cg_crit = crits
    for model in models:
        model.eval()

    loader.reset_iterator(split)
    n = 0
    loss_sum = [0, 0, 0, 0, 0]
    loss_evals = 1e-8
    predictions = []
    tap_cg_pred = {}
    iter = 0
    bad_vid_num = 0
    time_consumption = {}

    with torch.set_grad_enabled(False):
        while True:
            data = loader.get_batch(split)
            n = n + 1
            if iter % int(len(loader) / 100) == 0:
                print('generating result.json:{:.3f}%'.format(100 * iter / len(loader)))
            if data.get('proposal_num', 1) == 0 or data['fc_feats'].shape[0] <= 1:
                continue

            tmp = [data['fc_feats'], data['att_feats'], data['lda_feats']]
            tmp = [Variable(torch.from_numpy(_)).cuda() for _ in tmp]
            c3d_feats, att_feats, lda_feats = tmp

            torch.cuda.synchronize()
            t0 = time.time()
            tap_feats, pred_proposals = tap_model(c3d_feats)
            torch.cuda.synchronize()
            t1 = time.time()

            # Select the top-scoring proposals (up to topN).
            cg_gts = data['cg_gts'] if data.get('cg_labels', None) is not None else []

            if flag_eval_what == 'cg':
                ind_select_list = data['gts_ind_select_list']
                soi_select_list = data['gts_soi_select_list']
                cg_select_list = data['gts_cg_select_list']
                # good_time_stamps = [loader.featstamp_to_time(s, e, len(data['fc_feats']), data['duration'])
                #                     for (s, e) in soi_select_list]
                good_time_stamps = data['gt_timestamps']
                tap_prob = [1] * len(ind_select_list)

            elif flag_eval_what == 'cg_extend':
                ind_select_list, soi_select_list, cg_select_list, sampled_ids = (
                    data['ind_select_list'], data['soi_select_list'],
                    data['cg_select_list'], data['sampled_ids'])
                good_time_stamps = [
                    loader.featstamp_to_time(s, e, len(data['fc_feats']), data['duration'])
                    for (s, e) in soi_select_list
                ]
                tap_prob = [1] * len(ind_select_list)

            elif flag_eval_what == 'SOTA_TEP':
                if data['SOTA_Prop_score'] is None:
                    print('bad video for SOTA_TEP, vid:{}'.format(data['vid']))
                    bad_vid_num += 1
                    continue
                _ind_select_list, _soi_select_list, _cg_select_list = (
                    data['SOTA_ind_select_list'], data['SOTA_soi_select_list'],
                    data['SOTA_cg_select_list'])
                # _good_time_stamps = [loader.featstamp_to_time(s, e, len(data['fc_feats']), data['duration'])
                #                      for (s, e) in _soi_select_list]
                _good_time_stamps = data['SOTA_timestamps']
                _tap_prob = data['SOTA_Prop_score']

                ind_select_list, soi_select_list, cg_select_list, good_time_stamps, tap_prob = [], [], [], [], []
                if nms_threshold > 0:
                    _, _, pick = gettopN_nms(_good_time_stamps, _tap_prob, _tap_prob,
                                             nms_overlap=nms_threshold, topN=1000)
                else:
                    pick = list(range(len(_tap_prob)))
                for i, p_score in enumerate(_tap_prob):
                    if i not in pick:
                        continue
                    if p_score >= val_score_thres:
                        ind_select_list.append(_ind_select_list[i])
                        soi_select_list.append(_soi_select_list[i])
                        if len(_cg_select_list):
                            cg_select_list.append(_cg_select_list[i])
                        good_time_stamps.append(_good_time_stamps[i])
                        tap_prob.append(_tap_prob[i])
                    if len(ind_select_list) >= topN:
                        break

            elif flag_eval_what in ('cg', 'tap_cg', 'tap'):
                if nms_threshold != 0:
                    ind_select_list, soi_select_list, cg_select_list, good_time_stamps, tap_prob = \
                        gettop1000_nms(pred_proposals.data, data['tap_masks_for_loss'], cg_gts,
                                       data['duration'], loader.featstamp_to_time,
                                       overlap=nms_threshold, topN=topN)
                else:
                    ind_select_list, soi_select_list, cg_select_list, good_time_stamps, tap_prob = \
                        gettop1000(pred_proposals.data, data['tap_masks_for_loss'], cg_gts,
                                   data['duration'], loader.featstamp_to_time,
                                   val_score_thres=val_score_thres, topN=topN)
            else:
                raise ValueError('unknown flag_eval_what: {}'.format(flag_eval_what))

            t2 = time.time()

            if (len(cg_select_list) == 0) and (split != 'test'):
                sents = []
            else:
                if flag_eval_what == 'tap':
                    sents = [0] * len(ind_select_list)
                    cg_prob = [0] * len(ind_select_list)
                    cg_score = [0] * len(ind_select_list)
                else:
                    seq, cg_prob = cg_model(tap_feats, c3d_feats, lda_feats, [],
                                            ind_select_list, soi_select_list, mode='eval')
                    if len(seq) == 0:
                        sents = []
                    else:
                        cg_score = cg_prob.sum(1).cpu().numpy().astype('float')
                        # cg_prob = np.round(cg_prob, 3).tolist()
                        sents = utils.decode_sequence(loader.get_vocab(), seq)  # [proposal_num, max_sent_len]

            torch.cuda.synchronize()
            t3 = time.time()

            # Compute validation losses.
            if get_eval_loss and tap_crit and (data.get('cg_labels', None) is not None) \
                    and len(cg_select_list) and (split != 'test'):
                tmp = [
                    data['tap_labels'], data['tap_masks_for_loss'],
                    data['cg_labels'][cg_select_list], data['cg_masks'][cg_select_list],
                    data['w1']
                ]
                tmp = [Variable(torch.from_numpy(_), requires_grad=False).cuda() for _ in tmp]
                tap_labels, tap_masks_for_loss, cg_labels, cg_masks, w1 = tmp

                tap_loss = tap_crit(pred_proposals, tap_masks_for_loss, tap_labels, w1)
                loss_sum[0] = loss_sum[0] + tap_loss.item()

                if flag_eval_what != 'tap':
                    pred_captions = cg_model(tap_feats, c3d_feats, lda_feats, cg_labels,
                                             ind_select_list, soi_select_list, mode='train')
                    cg_loss = cg_crit(pred_captions, cg_labels[:, 1:], cg_masks[:, 1:])
                    loss_sum[1] = loss_sum[1] + cg_loss.item()
                    total_loss = eval_kwargs['lambda1'] * tap_loss + eval_kwargs['lambda2'] * cg_loss
                    loss_sum[2] = loss_sum[2] + total_loss.item()

            vid_info = []
            for i, sent in enumerate(sents):
                proposal_info = {}
                proposal_info['sentence'] = sent
                proposal_info['timestamp'] = good_time_stamps[i]
                # proposal_info['cg_prob'] = cg_prob[i]
                proposal_info['sentence_confidence'] = cg_score[i]
                proposal_info['proposal_score'] = tap_prob[i]
                proposal_info['re_score'] = 10 * tap_prob[i] + cg_score[i]
                proposal_info['num'] = [i, len(sents)]
                vid_info.append(proposal_info)

            if len(vid_info) != 0:
                if is_reranking:
                    vid_info = reranking(vid_info)
                tap_cg_pred[data['vid']] = vid_info

            if data['bounds']['wrapped']:
                loader.reset_iterator(split)
                break
            if iter == eval_kwargs['num_vids_eval']:
                loader.reset_iterator(split)
                break

            '''
            if iter % 500 == 0:
                pred2json = {'results': tap_lm_pred,
                             'version': "VERSION 1.0",
                             "external_data": {
                                 "used": True,
                                 "details": "First fully-connected layer from VGG-16 pre-trained on ILSVRC-2012 training set"
                             }}
                with open(json_path + 'iter{}'.format(iter), 'w') as f:
                    json.dump(pred2json, f)
            '''

            time_consumption[iter] = {
                'tep': t1 - t0,
                'cg': t3 - t2,
                'postprocess': t2 - t1
            }
            iter += 1
            # relation_analyse(data['vid'], vid_info)
            # torch.cuda.empty_cache()

    pred2json = {
        'results': tap_cg_pred,
        'version': "VERSION 1.0",
        "external_data": {
            "used": True,
            "details": "First fully-connected layer from VGG-16 pre-trained on ILSVRC-2012 training set"
        }
    }
    with open(json_path, 'w') as f:
        json.dump(pred2json, f)
    json.dump(time_consumption, open(json_path + '.time_consumption.json', 'w'))

    sys.path.append('external_tool/densevid_eval')
    sys.path.append('external_tool/densevid_eval/coco-caption')

    score = {'ARAN': 0}
    if lang_eval:
        from evaluate import eval_score
        sample_score = eval_score(json_path, flag_eval_what == 'tap',
                                  eval_kwargs['val_all_metrics'])
        for key in sample_score.keys():
            score[key] = np.array(sample_score[key])

    print('valid vid num:{}, bad_num:{}'.format(
        (eval_kwargs['num_vids_eval'] - bad_vid_num), bad_vid_num))

    if flag_eval_what == 'tap':
        import external_tool.eval_ARAN.get_proposal_performance as eval_score_tap
        eval_tap_opt = {}
        eval_tap_opt['ground_truth_filename'] = '/data/huichengzheng/wangteng/dvc2_pytorch04/data/captiondata/val_forARAN.json'
        eval_tap_opt['proposal_filename'] = json_path
        score['ARAN'] = eval_score_tap.main(**eval_tap_opt)

    # Switch back to training mode.
    for model in models:
        model.train()

    return tap_cg_pred, score, np.array(loss_sum) / iter
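# Minimal usage sketch for eval_split(), assuming the TAP/CG models, criterions
# and the data loader are built elsewhere in this repo. The eval_kwargs keys
# below are exactly the ones read inside eval_split(); the concrete values and
# the json_path are hypothetical.
#
#   eval_kwargs = {
#       'split': 'val',
#       'language_eval': 1,        # run densevid_eval language metrics
#       'val_score_thres': 0,      # minimum proposal score to keep
#       'nms_threshold': 0,        # 0 disables NMS over proposals
#       'reranking': False,
#       'topN': 1000,              # max proposals per video
#       'get_eval_loss': 1,
#       'num_vids_eval': 100,      # hypothetical number of videos to evaluate
#       'lambda1': 1.0,            # weight of the TAP loss in the total loss
#       'lambda2': 1.0,            # weight of the CG loss in the total loss
#       'val_all_metrics': 0,
#   }
#   predictions, score, val_loss = eval_split(
#       (tap_model, cg_model), (tap_crit, cg_crit), loader,
#       json_path='save/val_result.json', eval_kwargs=eval_kwargs,
#       flag_eval_what='tap_cg')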
def fit(self, train_loader, test_loader, vaild_loader=False, loop_conf=False):
    # Dictionary holding the TensorBoard writers/logs.
    ts_writer = {}

    # Create the working directory if necessary.
    if not os.path.exists(self.params["working_dir"]):
        os.makedirs(self.params["working_dir"])
    date_time_now = str(datetime.datetime.now()).replace(" ", "_").replace(":", "_")

    # Create sub_working_dir.
    sub_working_dir = os.path.join(
        self.params["working_dir"] + self.params['sub_name'] + date_time_now)
    if not os.path.exists(sub_working_dir):
        os.makedirs(sub_working_dir)
    self.params["sub_working_dir"] = sub_working_dir
    logging.info("sub working dir: %s" % sub_working_dir)

    # Create the TensorBoard summary writer.
    ts_writer["tensorboard_writer"] = SummaryWriter(sub_working_dir)
    logging.info("Please use 'python -m tensorboard.main --logdir={}'".format(sub_working_dir))

    # Optimizer
    optimizer_dic = {
        'sgd': torch.optim.SGD(self.module_list.parameters(),
                               lr=self.params["learning_rate"],
                               momentum=self.params["momentum"],
                               weight_decay=self.params["decay"]),
        'adam': torch.optim.Adam(self.module_list.parameters(),
                                 lr=self.params["learning_rate"],
                                 weight_decay=self.params["decay"])
    }
    optimizer = optimizer_dic[self.params['optimizer'].lower()]

    # Initialize the global step.
    self.params["global_step"] = 0

    # Initialize the learning-rate scheduler.
    lr_scheduler = optim.lr_scheduler.StepLR(
        optimizer,
        step_size=self.params["steps"],
        gamma=self.params["scales"])

    self.train()
    map_results_names = ["best_map", "best_ap", "best_conf",
                         "specific_conf_map", "specific_conf_ap"]

    # Start the training loop.
    logging.info("Start training.")
    for epoch in range(self.params["epochs"]):
        save = 1
        eva = 1
        if self.params['loop_epoch'] and epoch > self.params['loop_epoch']:
            loop_conf = True
        for step, samples in enumerate(train_loader):
            if self.params['cuda']:
                images, labels = samples["image"].to('cuda'), samples["label"]
            else:
                images, labels = samples["image"], samples["label"]
            start_time = time.time()
            self.params["global_step"] += 1

            # Forward and backward.
            optimizer.zero_grad()
            batch_size = images.size(0)
            losses = self(images, is_training=True, labels=labels)
            loss = losses[0]
            if torch.isnan(loss):
                continue
            loss.backward()
            optimizer.step()

            if step > 0 and step % self.params['loss_step'] == 0:
                _loss = loss.item()
                duration = float(time.time() - start_time)
                example_per_second = batch_size / duration
                lr = optimizer.param_groups[0]['lr']
                logging.info(
                    "epoch [%.3d] iter = %d loss = %.2f example/sec = %.3f lr = %.5f" %
                    (epoch, step, _loss, example_per_second, lr))
                ts_writer["tensorboard_writer"].add_scalar(
                    "lr", lr, self.params["global_step"])
                ts_writer["tensorboard_writer"].add_scalar(
                    "example/sec", example_per_second, self.params["global_step"])
                for i, name in enumerate(self.losses_name):
                    value = _loss if i == 0 else losses[i]
                    ts_writer["tensorboard_writer"].add_scalar(
                        name, value, self.params["global_step"])

            if eva and (epoch + 1) % self.params['eva_epoch'] == 0:
                self.train(False)
                logging.info(f"test epoch number {epoch+1}")
                # map_results consists of best_map, best_ap, best_conf,
                # specific_conf_map, specific_conf_ap.
                map_results = get_map(self, test_loader, train=True, loop_conf=loop_conf)
                self.params['best_map'] = map_results[0]
                self.params['confidence'] = map_results[2]
                for index, mr_name in enumerate(map_results_names):
                    try:
                        ts_writer["tensorboard_writer"].add_scalar(
                            mr_name, map_results[index], self.params["global_step"])
                    except AttributeError:
                        continue
                evaluate_running_loss = eval_score(self, test_loader)
                logging.info(f"evaluate_running_loss: {evaluate_running_loss[0]}")
                for i, name in enumerate(self.losses_name):
                    ts_writer["tensorboard_writer"].add_scalar(
                        "evel_" + name, evaluate_running_loss[i],
                        self.params["global_step"])
                if vaild_loader:
                    self.params['test_best_map'] = get_map(
                        self, vaild_loader,
                        confidence=[self.params['confidence']])[0]
                    ts_writer["tensorboard_writer"].add_scalar(
                        "test_best_map", self.params['test_best_map'],
                        self.params["global_step"])
                self.train(True)
                eva = 0

            if save and (epoch + 1) % self.params['save_epoch'] == 0:
                _save_checkpoint(self)
                save = 0

        lr_scheduler.step()

    # best_map, best_ap, best_conf, specific_conf_map, specific_conf_ap, \
    #     map_frame = get_map(self, test_loader, train=False, loop_conf=True)
    map_results = get_map(self, test_loader, train=False, loop_conf=True)
    self.params['best_map'] = map_results[0]
    self.params['confidence'] = map_results[2]
    if vaild_loader:
        self.params['test_best_map'] = get_map(
            self, vaild_loader, confidence=[self.params['confidence']])[0]
        ts_writer["tensorboard_writer"].add_scalar(
            "test_best_map", self.params['test_best_map'],
            self.params["global_step"])
    _save_checkpoint(self)
    for index, mr_name in enumerate(map_results_names):
        try:
            ts_writer["tensorboard_writer"].add_scalar(
                mr_name, map_results[index], self.params["global_step"])
        except AttributeError:
            continue
    # model.train(True)
    logging.info("Bye~")
    if self.params['return_csv']:
        map_results[5].to_csv(
            f"{self.params['sub_working_dir']}/final_performance.csv",
            index=True)
    return tuple(map_results)
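# Minimal usage sketch for fit(), assuming a detector class that exposes it.
# Every key listed below is read somewhere inside fit(); the concrete values,
# the model class, and the loaders are assumptions for illustration only.
#
#   params = {
#       "working_dir": "runs/",
#       "sub_name": "yolo_",
#       "learning_rate": 1e-3,
#       "momentum": 0.9,
#       "decay": 5e-4,
#       "optimizer": "sgd",      # or "adam"
#       "steps": 30,             # StepLR step_size
#       "scales": 0.1,           # StepLR gamma
#       "epochs": 100,
#       "cuda": True,
#       "loss_step": 50,         # log every N batches
#       "eva_epoch": 5,          # evaluate every N epochs
#       "save_epoch": 5,         # checkpoint every N epochs
#       "loop_epoch": 0,         # loop over confidences after this epoch (0 = off)
#       "return_csv": False,
#   }
#   model = SomeDetector(params)   # hypothetical model class
#   model.fit(train_loader, test_loader, vaild_loader=val_loader)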
def main():
    print("preparing data...")
    SRC = data.Field(tokenize=tokenizer, init_token='<sos>', eos_token='<eos>', lower=True)
    TRG = data.Field(tokenize=tokenizer, init_token='<sos>', eos_token='<eos>', lower=True)
    train, val, test, filename = choose_dataset(False, SRC, TRG)
    SRC.build_vocab(train)
    TRG.build_vocab(train)

    train_batch_size = 128
    test_batch_size = 32
    eval_batch_size = 128
    train_iter, val_iter, test_iter = data.BucketIterator.splits(
        (train, val, test),
        sort=False,
        batch_sizes=(train_batch_size, eval_batch_size, test_batch_size),
        device=device)

    print("building model...")
    in_tokens = len(SRC.vocab.stoi)   # size of the source vocabulary
    out_tokens = len(TRG.vocab.stoi)  # size of the target vocabulary
    emsize = 768    # embedding dimension
    nhid = 1024     # dimension of the feedforward network in nn.TransformerEncoder and nn.TransformerDecoder
    nlayers = 3     # number of nn.TransformerEncoderLayer / nn.TransformerDecoderLayer
    nhead = 2       # number of heads in the multi-head attention models
    dropout = 0.3   # dropout value
    model = TransformerModel(in_tokens, out_tokens, emsize, nhead, nhid, nlayers, dropout).to(device)
    print(model)

    criterion = nn.CrossEntropyLoss(ignore_index=TRG.vocab.stoi["<unk>"])
    lr = 0.0001  # learning rate
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)

    best_val_loss = float("inf")
    epochs = 100  # number of epochs
    best_model = None
    # model.init_weights()

    print("training...")
    for epoch in range(1, epochs + 1):
        epoch_start_time = time.time()
        t_loss = train_model(model, train_iter, optimizer, criterion, SRC)
        val_loss = evaluate_model(model, val_iter, criterion)
        print('-' * 65)
        print('| epoch {:3d} | time: {:3d}m {:3d}s | train loss {:5.2f} | valid loss {:5.2f}'
              .format(epoch,
                      int((time.time() - epoch_start_time) / 60),
                      int((time.time() - epoch_start_time) % 60),
                      t_loss, val_loss))
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_model = model
        scheduler.step()

        # Generate a sample translation after each epoch.
        model.eval()
        sentence = "今日はいい日ですね"  # sample input sentence
        output = []
        sentence = SRC.preprocess(sentence)
        # print(sentence)
        index = ([SRC.vocab.stoi[SRC.init_token]]
                 + [SRC.vocab.stoi[i] for i in sentence]
                 + [SRC.vocab.stoi[SRC.eos_token]])
        src_tensor = torch.LongTensor([index]).T.to(device)
        trg = torch.LongTensor([[TRG.vocab.stoi[TRG.init_token]]]).to(device)
        for i in range(25):
            pred = model(src_tensor, trg)
            pred_index = pred.argmax(2)[-1].item()
            # print(pred_index)
            output.append(pred_index)
            if pred_index == TRG.vocab.stoi[TRG.eos_token]:
                break
            pred_index = torch.LongTensor([[pred_index]]).to(device)
            # print(pred_index.size())
            trg = torch.cat((trg, pred_index))
        print("source sentence: ", sentence)
        print("output sentence: ", [TRG.vocab.itos[i] for i in output])

    test_loss = evaluate_model(best_model, test_iter, criterion)
    print('=' * 89)
    print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
        test_loss, math.exp(test_loss)))
    print('=' * 89)

    torch.save(model.state_dict(), "../model/transformer.pth")
    model.load_state_dict(torch.load("../model/transformer.pth", map_location=device))
    # print(model.state_dict())

    # The validation data is not used for the interim presentation.
    print("generating sentences from text...")
    path = "../data/test.tsv"
    test_input, test_output, test_pred = gen_sentence_list(model, path, SRC, TRG)
    path = "../data/train.tsv"
    train_input, train_output, train_pred = gen_sentence_list(model, path, SRC, TRG)

    train_df = convert_list_to_df(train_input, train_output, train_pred)
    test_df = convert_list_to_df(test_input, test_output, test_pred)

    test_df = prepare_df(test_df)
    test_percentage, test_kinds, test_bleu = eval_score(test_df)
    train_df = prepare_df(train_df)
    train_percentage, train_kinds, train_bleu = eval_score(train_df)

    train_df.to_csv("../csv/train/result_transformer.csv")
    test_df.to_csv("../csv/test/result_transformer.csv")

    print(f"TEST DATA: match rate: {test_percentage}, distinct outputs: {test_kinds}, BLEU: {test_bleu}")
    print(f"TRAIN DATA: match rate: {train_percentage}, distinct outputs: {train_kinds}, BLEU: {train_bleu}")
    with open("./score/score_transformer.txt", mode="w") as f:
        f.write(f"TEST DATA: match rate: {test_percentage}, distinct outputs: {test_kinds}, BLEU: {test_bleu}\n")
        f.write(f"TRAIN DATA: match rate: {train_percentage}, distinct outputs: {train_kinds}, BLEU: {train_bleu}")
    print("done!")
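# The per-epoch sample generation in main() above decodes greedily, one token
# at a time. A minimal sketch of the same logic factored into a helper is shown
# below; the function name and signature are hypothetical, and it relies on the
# script's existing torch import and `device`, but the steps mirror the loop in
# main() (prepend <sos>, feed the growing target back in, stop at <eos> or after
# max_len tokens).
def greedy_decode(model, sentence, SRC, TRG, max_len=25):
    model.eval()
    tokens = SRC.preprocess(sentence)
    index = ([SRC.vocab.stoi[SRC.init_token]]
             + [SRC.vocab.stoi[t] for t in tokens]
             + [SRC.vocab.stoi[SRC.eos_token]])
    src_tensor = torch.LongTensor([index]).T.to(device)
    trg = torch.LongTensor([[TRG.vocab.stoi[TRG.init_token]]]).to(device)
    output = []
    for _ in range(max_len):
        with torch.no_grad():
            pred = model(src_tensor, trg)
        pred_index = pred.argmax(2)[-1].item()
        output.append(pred_index)
        if pred_index == TRG.vocab.stoi[TRG.eos_token]:
            break
        trg = torch.cat((trg, torch.LongTensor([[pred_index]]).to(device)))
    return [TRG.vocab.itos[i] for i in output]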
writerb2 = csv.writer(output_bleu2, delimiter=',')
for i in range(len(los) - 1):
    print(i)
    prob_list = []
    probability = model.predict(
        [x_val[los[i]:los[i + 1]], sp_val[los[i]:los[i + 1]]])
    # print(probability)
    for j in range(len(probability)):
        prob_list.append(probability[j][0])
    # print(prob_list, y_val[los[i]:los[i+1]])
    # calc_test_result(prob_list, y_val[los[i]:los[i+1]])

    b01, b11, b21, b31, p11, r11, f11, p21, r21, f21, p31, r31, f31 = eval_score(
        prob_list, sent[los[i]:los[i + 1]], summ[i])
    writer1.writerow([i, p11, r11, f11, p21, r21, f21, p31, r31, f31])
    writerb1.writerow([i, b01, b11, b21, b31])

    divstr = mmr(sent[los[i]:los[i + 1]], prob_list)
    b02, b12, b22, b32, p12, r12, f12, p22, r22, f22, p32, r32, f32 = eval_str(
        divstr, summ[i])
    writer2.writerow([i, p12, r12, f12, p22, r22, f22, p32, r32, f32])
    writerb2.writerow([i, b02, b12, b22, b32])

output1.close()
output_bleu1.close()
output2.close()
output_bleu2.close()
"""

scores_dir = "test_scores/"
def main():
    os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
    SEED = 1234
    random.seed(SEED)
    np.random.seed(SEED)
    torch.manual_seed(SEED)
    torch.cuda.manual_seed(SEED)
    torch.backends.cudnn.deterministic = True

    print("preparing data...")
    paths = ["../data/train.tsv", "../data/val.tsv"]
    src, trg, tmp = [], [], []
    for path in paths:
        with open(path, mode='r', encoding="utf-8") as f:
            for line in f:
                sentence = line.split("\t")
                tmp.append(sentence)
    # random.shuffle(tmp)
    for sentence in tmp:
        src.append(sentence[0])
        trg.append(sentence[1].replace("\n", ""))

    src_tensors = tok(text=src, padding=True, return_tensors='pt', return_attention_mask=False)
    trg_tensors = tok(text=trg, padding=True, return_tensors='pt', return_attention_mask=False)

    dataset = torch.utils.data.TensorDataset(src_tensors['input_ids'], trg_tensors['input_ids'])
    train_size = int(len(dataset) * 0.8)
    valid_size = len(dataset) - train_size
    train_data, valid_data = torch.utils.data.random_split(dataset, [train_size, valid_size])

    batch_size = 128
    # batch_size = 8
    train_data_loader = torch.utils.data.DataLoader(train_data, batch_size)
    valid_data_loader = torch.utils.data.DataLoader(valid_data, batch_size)

    print("building model...")
    emsize = 768   # embedding dimension
    nhid = 1024    # dimension of the feedforward network in nn.TransformerEncoder
    nlayers = 1    # number of nn.TransformerEncoderLayer in nn.TransformerEncoder
    nhead = 2      # number of heads in the multi-head attention models
    dropout = 0.3  # dropout value
    model = TransformerModel(emsize, nhead, nhid, nlayers, dropout).to(device)
    print(model)

    criterion = nn.CrossEntropyLoss(ignore_index=tok.convert_tokens_to_ids("[UNK]"))
    lr = 0.0001  # learning rate
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)

    best_val_loss = float("inf")
    epochs = 100  # number of epochs
    best_model = None
    model.init_weights()

    train_loss_list, eval_loss_list = [], []
    print("training model...")
    for epoch in range(1, epochs + 1):
        epoch_start_time = time.time()
        t_loss = train(model, train_data_loader, optimizer, criterion)
        val_loss = evaluate(model, valid_data_loader, criterion)
        print('-' * 89)
        print('| epoch {:3d} | time: {:3d}m {:3d}s | train loss {:5.2f} | valid loss {:5.2f} | '
              .format(epoch,
                      int((time.time() - epoch_start_time) / 60),
                      int((time.time() - epoch_start_time) % 60),
                      t_loss, val_loss))
        train_loss_list.append(t_loss)
        eval_loss_list.append(val_loss)
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_model = model

        # Generate a sample translation after each epoch.
        model.eval()
        sentence = "今日は良い日ですね"  # sample input sentence
        sentence = tok.tokenize(sentence)
        # src = [tok.convert_tokens_to_ids("[CLS]")] + tok.convert_tokens_to_ids(sentence) + [tok.convert_tokens_to_ids("[SEP]")]
        src = tok.convert_tokens_to_ids(sentence)  # + [tok.convert_tokens_to_ids("[SEP]")]
        src = torch.LongTensor([src])
        src = torch.t(src)
        src = src.to(device)
        trg = tok.convert_tokens_to_ids("[CLS]")
        trg = torch.LongTensor([[trg]]).to(device)
        output = []
        for i in range(25):
            with torch.no_grad():
                pred = model(src, trg)
            pred_word_index = pred.argmax(2)[-1].item()
            output.append(pred_word_index)
            if pred_word_index == 3:  # presumably [SEP] in this tokenizer's vocabulary
                break
            last_index = torch.LongTensor([[pred_word_index]]).to(device)
            trg = torch.cat((trg, last_index))
        predict = tok.convert_ids_to_tokens(output)
        print("source sentence: ", sentence)
        print("predicted sentence: ", predict)
        scheduler.step()

    torch.save(best_model.state_dict(), "../model/bert_embedded_transformer.pth")
    # model.init_weights()
    model.load_state_dict(torch.load("../model/bert_embedded_transformer.pth"))

    print("generating sentences from text...")
    path = "../data/test.tsv"
    test_input, test_output, test_pred = gen_sentence_list(model, path)
    path = "../data/train.tsv"
    train_input, train_output, train_pred = gen_sentence_list(model, path)

    print("converting lists to dataframes")
    train_df = convert_list_to_df(train_input, train_output, train_pred)
    test_df = convert_list_to_df(test_input, test_output, test_pred)

    test_df = prepare_df(test_df)
    test_percentage, test_kinds, test_bleu = eval_score(test_df)
    train_df = prepare_df(train_df)
    train_percentage, train_kinds, train_bleu = eval_score(train_df)

    train_df.to_csv("../csv/train/result_bert_embedded_transformer.csv")
    test_df.to_csv("../csv/test/result_bert_embedded_transformer.csv")

    print(f"TEST DATA: match rate: {test_percentage}, distinct outputs: {test_kinds}, BLEU: {test_bleu}")
    print(f"TRAIN DATA: match rate: {train_percentage}, distinct outputs: {train_kinds}, BLEU: {train_bleu}")
    with open("./score/score_bert_embedded_transformer.txt", mode="w") as f:
        f.write(f"TEST DATA: match rate: {test_percentage}, distinct outputs: {test_kinds}, BLEU: {test_bleu}\n")
        f.write(f"TRAIN DATA: match rate: {train_percentage}, distinct outputs: {train_kinds}, BLEU: {train_bleu}")
    print("done!")
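# `tok` and `device` are used in main() above but defined elsewhere in the
# script. A minimal setup sketch compatible with the calls made here (callable
# tokenizer returning input_ids, [CLS]/[SEP]/[UNK] special tokens) is a Hugging
# Face BERT tokenizer; the checkpoint name below is an assumption, not taken
# from the original code.
#
#   import torch
#   from transformers import AutoTokenizer
#
#   device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#   tok = AutoTokenizer.from_pretrained(
#       "cl-tohoku/bert-base-japanese-whole-word-masking")  # hypothetical checkpoint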
def main():
    # Define the torchtext data fields used to feed PyTorch (important!).
    print("preparing data...")
    SRC = data.Field(sequential=True, tokenize=tokenizer,
                     init_token='<sos>', eos_token='<eos>', lower=True)
    TRG = data.Field(sequential=True, tokenize=tokenizer,
                     init_token='<sos>', eos_token='<eos>', lower=True)
    train, val, test, filename = choose_dataset(False, SRC, TRG)

    # Build the vocabularies.
    SRC.build_vocab(train)
    TRG.build_vocab(train)

    # Batch each split.
    train_batch_size = 128
    test_batch_size = 32
    eval_batch_size = 32
    train_iter, val_iter, test_iter = data.BucketIterator.splits(
        (train, val, test),
        sort=False,
        batch_sizes=(train_batch_size, eval_batch_size, test_batch_size),
        device=device)

    # Hyperparameters
    INPUT_DIM = len(SRC.vocab)
    OUTPUT_DIM = len(TRG.vocab)
    ENC_EMB_DIM = 768
    DEC_EMB_DIM = 768
    ENC_HID_DIM = 1024
    DEC_HID_DIM = 1024
    N_LAYERS = 1
    ENC_DROPOUT = 0.3
    DEC_DROPOUT = 0.3

    enc = Encoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, N_LAYERS, ENC_DROPOUT)
    dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, DEC_HID_DIM, N_LAYERS, DEC_DROPOUT)
    model = Seq2Seq(enc, dec, device).to(device)
    print(model)

    # Initialize the model weights.
    model.apply(init_weights)

    # Set up the optimizer.
    optimizer = optim.Adam(model.parameters(), lr=0.0001)
    SRC_PAD_IDX = SRC.vocab.stoi[SRC.pad_token]
    # Loss function.
    criterion = nn.CrossEntropyLoss(ignore_index=SRC_PAD_IDX)
    # Learning-rate scheduler.
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)

    epochs = 100
    clip = 1
    best_model = None

    print("training...")
    best_valid_loss = float('inf')
    for epoch in range(epochs):
        start_time = time.time()
        train_loss = train_model(model, train_iter, optimizer, criterion, clip, TRG)
        valid_loss = evaluate_model(model, val_iter, criterion)
        scheduler.step()
        end_time = time.time()
        epoch_mins, epoch_secs = epoch_time(start_time, end_time)
        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            best_model = model
            # torch.save(model.state_dict(), 'tut1-model.pt')
        print("-" * 65)
        print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s | '
              f'Train Loss: {train_loss:.3f} | Val. Loss: {valid_loss:.3f}')

    # Save the best weights and reload them.
    torch.save(best_model.state_dict(), '../model/seq2seq.pth')
    model.load_state_dict(torch.load("../model/seq2seq.pth"))

    print("generating sentences...")
    # Generate sentences for the test data.
    path = "../data/test.tsv"
    test_input, test_output, test_pred = gen_sentence_list(model, path, SRC, TRG)
    test_df = convert_list_to_df(test_input, test_output, test_pred)
    # Generate sentences for the training data.
    path = "../data/train.tsv"
    train_input, train_output, train_pred = gen_sentence_list(model, path, SRC, TRG)
    train_df = convert_list_to_df(train_input, train_output, train_pred)

    # Compute the scores.
    test_df = prepare_df(test_df)
    test_percentage, test_kinds, test_bleu = eval_score(test_df)
    train_df = prepare_df(train_df)
    train_percentage, train_kinds, train_bleu = eval_score(train_df)

    # Save the results.
    train_df.to_csv("../csv/train/result_Seq2seq.csv")
    test_df.to_csv("../csv/test/result_Seq2seq.csv")
    print(f"TEST DATA: match rate: {test_percentage}, distinct outputs: {test_kinds}, BLEU: {test_bleu}")
    print(f"TRAIN DATA: match rate: {train_percentage}, distinct outputs: {train_kinds}, BLEU: {train_bleu}")
    with open("./score/score_seq2seq.txt", mode="w") as f:
        f.write(f"TEST DATA: match rate: {test_percentage}, distinct outputs: {test_kinds}, BLEU: {test_bleu}\n")
        f.write(f"TRAIN DATA: match rate: {train_percentage}, distinct outputs: {train_kinds}, BLEU: {train_bleu}")
    print("done!")
def main():
    print("preparing data...")
    paths = ["../data/train.tsv", "../data/val.tsv"]
    src, trg, tmp = [], [], []
    for path in paths:
        with open(path, mode='r', encoding="utf-8") as f:
            for line in f:
                sentence = line.split("\t")
                tmp.append(sentence)
    # random.shuffle(tmp)
    for sentence in tmp:
        src.append(sentence[0])
        trg.append(sentence[1].replace("\n", ""))

    src_tensors = tok(text=src, padding=True, return_tensors='pt', return_attention_mask=False)
    trg_tensors = tok(text=trg, padding=True, return_tensors='pt', return_attention_mask=False)

    dataset = torch.utils.data.TensorDataset(src_tensors['input_ids'], trg_tensors['input_ids'])
    train_size = int(len(dataset) * 0.8)
    valid_size = len(dataset) - train_size
    train_data, valid_data = torch.utils.data.random_split(dataset, [train_size, valid_size])

    batch_size = 128
    train_data_loader = torch.utils.data.DataLoader(train_data, batch_size)
    valid_data_loader = torch.utils.data.DataLoader(valid_data, batch_size)

    print("building model...")
    OUTPUT_DIM = tok.vocab_size
    # OUTPUT_DIM = 3454
    ENC_EMB_DIM = 768
    DEC_EMB_DIM = 768
    ENC_HID_DIM = 1024
    DEC_HID_DIM = 1024
    N_LAYERS = 1
    ENC_DROPOUT = 0.3
    DEC_DROPOUT = 0.3

    enc = Encoder(ENC_EMB_DIM, ENC_HID_DIM, N_LAYERS, ENC_DROPOUT)
    dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, DEC_HID_DIM, N_LAYERS, DEC_DROPOUT)
    model = Seq2Seq(enc, dec, device).to(device)
    print(model)
    # model.apply(init_weights)

    optimizer = optim.Adam(model.parameters())
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)
    criterion = nn.CrossEntropyLoss(ignore_index=0)

    epochs = 100
    clip = 1
    best_valid_loss = float('inf')
    best_model = None

    print("training...")
    for epoch in range(epochs):
        start_time = time.time()
        train_loss = train(model, train_data_loader, optimizer, criterion, clip)
        valid_loss = evaluate(model, valid_data_loader, criterion)
        scheduler.step()
        end_time = time.time()
        epoch_mins, epoch_secs = epoch_time(start_time, end_time)
        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            best_model = model
        print("-" * 65)
        print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s | '
              f'Train Loss: {train_loss:.3f} | Val. Loss: {valid_loss:.3f} |')

    torch.save(best_model.state_dict(), '../model/bert_embedded_seq2seq.pth')
    # model.apply(init_weights)
    # model.load_state_dict(torch.load("../model/bert_embedded_seq2seq.pth"))

    print("generating sentences...")
    path = "../data/test.tsv"
    test_input, test_output, test_pred = gen_sentence_list(model, path, tok)
    # print(test_pred)
    path = "../data/train.tsv"
    train_input, train_output, train_pred = gen_sentence_list(model, path, tok)

    train_df = convert_list_to_df(train_input, train_output, train_pred)
    test_df = convert_list_to_df(test_input, test_output, test_pred)

    test_df = prepare_df(test_df)
    test_percentage, test_kinds, test_bleu = eval_score(test_df)
    train_df = prepare_df(train_df)
    train_percentage, train_kinds, train_bleu = eval_score(train_df)

    train_df.to_csv("../csv/train/result_bert_embedded_Seq2seq.csv")
    test_df.to_csv("../csv/test/result_bert_embedded_Seq2seq.csv")

    print(f"TEST DATA: match rate: {test_percentage}, distinct outputs: {test_kinds}, BLEU: {test_bleu}")
    print(f"TRAIN DATA: match rate: {train_percentage}, distinct outputs: {train_kinds}, BLEU: {train_bleu}")
    with open("./score/bert_embedded_score_seq2seq.txt", mode="w") as f:
        f.write(f"TEST DATA: match rate: {test_percentage}, distinct outputs: {test_kinds}, BLEU: {test_bleu}\n")
        f.write(f"TRAIN DATA: match rate: {train_percentage}, distinct outputs: {train_kinds}, BLEU: {train_bleu}")
    print("done!")
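# epoch_time() is called in both training loops above but is defined elsewhere
# in the repo. A minimal sketch consistent with how it is used (whole minutes
# plus remaining seconds for an epoch) would be:
def epoch_time(start_time, end_time):
    elapsed = end_time - start_time
    elapsed_mins = int(elapsed / 60)
    elapsed_secs = int(elapsed - elapsed_mins * 60)
    return elapsed_mins, elapsed_secs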