def train(nvsm, device, optimizer, epochs, train_loader, eval_loader, k_values,
          loss_function, lamb, print_every):
    """Run the NVSM training loop, reporting recall@k every *print_every* batches.

    The model is flipped to eval mode for each intermediate evaluation and
    back to train mode afterwards.
    """
    for epoch in tqdm(range(epochs), desc='Epochs', ncols=70):
        batch_bar = tqdm(
            enumerate(train_loader),
            desc='Batch',
            total=len(train_loader),
            ncols=70,
            leave=True,
        )
        for batch_idx, (n_grams, doc_ids) in batch_bar:
            n_grams, doc_ids = n_grams.to(device), doc_ids.to(device)
            optimizer.zero_grad()
            pred_proba = nvsm(n_grams, doc_ids)
            loss = loss_function(nvsm, pred_proba, lamb)
            loss.backward()
            optimizer.step()
            if batch_idx % print_every != 0:
                continue
            # Periodic held-out evaluation; restore train mode afterwards.
            nvsm.eval()
            recall_at_ks = evaluate(
                nvsm=nvsm,
                device=device,
                eval_loader=eval_loader,
                recalls=k_values,
                loss_function=loss_function,
            )
            nvsm.train()
            model_eval = generate_eval(k_values, recall_at_ks)
            print(f' [{epoch:3}, {batch_idx:5d}]: {loss:5.4f} || {model_eval}')
def main():
    """Extract features from the input video, score them with a pickled
    model, and write the resulting clip document to the output file."""
    feature_rows = np.array(list(extract_features.extract(args.video)))
    timestamps = feature_rows[:, 0]
    feature_matrix = feature_rows[:, 1:]
    # NOTE(review): pickle.load executes arbitrary code — only load
    # trusted model files.
    with open(args.model, 'rb') as model_file:
        classifier = pickle.load(model_file)
    predictions = evaluate_model.evaluate(feature_matrix, classifier)
    document = write_clips.create_document(timestamps, predictions, args.video)
    with open(args.output, 'wb') as output_file:
        document.write(output_file)
def do_run(models):
    """Build and evaluate every model configuration in *models*.

    Args:
        models: sized iterable of (model_name, dataset_key, eval_kwargs,
            expected_score) tuples; expected_score may be None to skip
            the score assertion.

    Returns:
        (failed, msgBody): failed is True if any run raised or mismatched;
        msgBody is a human-readable report prefixed with total runtime.
    """
    # FIX: hoisted out of the per-model loop — one import is enough, and a
    # missing module now fails loudly instead of being reported per model.
    import evaluate_model

    start_time = datetime.datetime.now()
    save_last_run()
    msgBody = ""
    log = ""
    failed = False
    datasets = makeDatasets()
    for m, dataset, args, expected in models:
        try:
            ds = datasets[dataset]
            log += sh("rake create_model_%s_%s" % (m, ds.rake_suffix))
            log += "\n\n"
            msgBody += "**********************************\n"
            msgBody += "%s %s " % (m, args)
            eval_start = datetime.datetime.now()
            actual, out_fname = evaluate_model.evaluate(
                run_description=str(args),
                corpus_fn=ds.corpus_fn,
                model_fn="%s/models/%s.pck" % (ds.dir, m),
                output_dir="%s/output" % ds.dir,
                gtruth_tag_fn=ds.gtruth_tag_fn,
                map_fn=ds.map_fn,
                **args)
            eval_stop = datetime.datetime.now()
            msgBody += "\n(outfile: %s)\n" % out_fname
            # FIX: identity comparison for None (was `expected != None`).
            if expected is not None:
                assert actual == expected, (actual, expected)
                msgBody += "got %d, as expected, in %s\n" % (
                    actual, str(eval_stop - eval_start))
            else:
                msgBody += "Didn't check expected. got %d\n" % (actual)
        # FIX: was a bare `except:` (also swallowed KeyboardInterrupt).
        except Exception as exc:
            stackTrace = traceback.format_exc()
            failed = True
            # FIX: the "%s" placeholder was never formatted — the literal
            # "failed with %s!" used to end up in the report verbatim.
            msgBody += "failed with %s!\n\n" % exc
            msgBody += stackTrace
    end_time = datetime.datetime.now()
    msgBody = "Ran %d tests in " % len(models) + str(
        end_time - start_time) + "\n" + msgBody
    return failed, msgBody
def create_subdirectory_if_not_exists(dir_name):
    """Create *dir_name* if it is missing, reporting what was done."""
    if os.path.exists(dir_name):
        print("Directory", dir_name, "already exists.")
    else:
        os.mkdir(dir_name)
        print("Directory", dir_name, "created.")


# Create output directory to store preprocessed data and trained model
create_subdirectory_if_not_exists("out")

# Locations of the cached artifacts produced by the pipeline.
train_data = "out/preprocessed_train.npz"
test_data = "out/preprocessed_test.npz"
model_file = "out/model.h5"

# Preprocess only when either cached artifact is missing.
if os.path.isfile(train_data) and os.path.isfile(test_data):
    print("Preprocessed data exists.")
else:
    preprocess(train_data=train_data, test_data=test_data)
    print("Data preprocessed and saved locally.")

# Train model
build(train_data=train_data, save_file=model_file)

# Evaluate model
scores = evaluate(model_file, test_data)
print("Final scores:", scores)
import os
import shutil

from keras import backend

import settings
from evaluate_model import evaluate
from train_model import train

# Train and evaluate every configured test, archiving artifacts as we go.
for config in settings.TESTS_CONFIG:
    label = str(config)
    print("Training model ", label)
    train(config)
    # Archive the finished test config and the trained model directory.
    base_name = os.path.basename(config.file_name)
    os.rename(config.file_name,
              os.path.join(settings.completed_tests_folder, base_name))
    shutil.move(config.model_dir,
                os.path.join(settings.results_folder, label))
    print("Evaluating model ", label)
    evaluate(config)
    # Free Keras/TensorFlow graph memory between runs.
    backend.clear_session()
# 7, 8, 9, 10 new_y.append(2) y = new_y X = dataset[:, 0:11] # Divide Dataset: 20% Test and 80% Train x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=5) # Now we're going to Tunning Hyperparameters only with Train Data best_parameters = svc_tunning(x_train, y_train) # best_parameters = {'C': 10, 'gamma': 0.001, 'kernel': 'rbf'} # best_parameters = {'C': 10, 'gamma': 0.9 , 'kernel': 'rbf'} # Pass the best parameters to train, and the Train Data trained_model = svc_train(x_train, y_train, c=best_parameters["C"], gamma=best_parameters["gamma"], kernel=best_parameters["kernel"]) # Evaluate the model print("Evaluate model\n") evaluate(trained_model, x_train, y_train) # Test the model print("Train model\n") test(trained_model, x_test, y_test)
def main():
    """Train an NVSM-BERT model, evaluate recall@k, and write ranked
    per-query retrieval results to Willll/result.txt."""
    # mypath = r'/home/connlab/108IR/will/final/NVSM_pytorch/'
    mypath = r'C:/Users/willll/Desktop/WIillll/IRCLass/Final/NVSM_pytorch'
    pretrained_model = 'bert-base-uncased'
    glove_path = Path(mypath + '/glove')
    model_folder = Path(mypath + '/models')
    # data_folder = Path(mypath + '/data/processed')
    data_folder = Path(mypath + '/Willll/fakedoc')
    testing_query_folder = Path(mypath + '/Willll/test/query')
    model_path = model_folder / 'nvsm_bert.pt'
    batch_size = 140  # for 150, 8053 / 8113MB GPU memory, to tweak
    epochs = 1
    docs, queries, tokenizer = load_data(data_folder, testing_query_folder,
                                         pretrained_model)
    doc_names = [doc['name'] for doc in docs]
    n_grams, document_ids = create_dataset(
        tok_docs=[doc['tokens'] for doc in docs],
        tokenizer=tokenizer,
        n=30)
    print('N-grams number', len(n_grams))
    k_values = [1, 3, 5, 10]
    (train_data, eval_data,
     eval_train_data) = create_pytorch_datasets(n_grams, document_ids)
    print('Train dataset size', len(train_data))
    print('Eval dataset size', len(eval_data))
    print('Eval (training) dataset size', len(eval_train_data))
    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
    eval_loader = DataLoader(eval_data, batch_size=batch_size, shuffle=False)
    eval_train_loader = DataLoader(eval_train_data, batch_size=batch_size,
                                   shuffle=False)
    device = torch.device('cuda')
    lamb = 1e-3  # regularization weight in the loss
    nvsm = NVSMBERT(
        pretrained_model=pretrained_model,
        n_doc=len(doc_names),
        dim_doc_emb=20,
        neg_sampling_rate=10,
    ).to(device)
    # BERT custom optimizer: no weight decay for biases / LayerNorm params.
    param_optimizer = list(nvsm.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [p for n, p in param_optimizer
                   if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.01
    }, {
        'params': [p for n, p in param_optimizer
                   if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]
    optimizer = BertAdam(params=optimizer_grouped_parameters,
                         lr=5e-5,
                         warmup=0.1,
                         t_total=len(train_loader) * epochs)
    train(nvsm=nvsm,
          device=device,
          optimizer=optimizer,
          epochs=epochs,
          train_loader=train_loader,
          eval_loader=eval_train_loader,
          k_values=k_values,
          loss_function=loss_function,
          lamb=lamb,
          print_every=10000)
    torch.save(nvsm.state_dict(), model_path)
    nvsm.eval()
    recall_at_ks = evaluate(
        nvsm=nvsm,
        device=device,
        eval_loader=eval_loader,
        recalls=k_values,
        loss_function=loss_function,
    )
    print(generate_eval(k_values, recall_at_ks))
    queries_text = [query['tokens'] for query in queries]
    queries_name = [query['name'] for query in queries]
    # BUG FIX: the second return value (full rankings per query, as unpacked
    # in the companion eval script) was previously dropped, so the
    # `ranksResults` lookup below raised NameError when writing the file.
    evaluation_results, ranksResults = evaluate_queries_bert(
        nvsm, queries_text, doc_names, tokenizer, batch_size, device)
    print(evaluation_results)
    for query_name, query_text, doc_idx in zip(queries_name, queries_text,
                                               evaluation_results):
        # str() keeps the :35 format spec valid whether the query tokens are
        # a string or a list (lists reject format specs).
        print(f'{query_name} {str(query_text):35} -> {doc_names[doc_idx]}')
    # BUG FIX: was mypath + './Willll/result.txt', which injected a stray
    # 'NVSM_pytorch./' component into the output path.
    with open(mypath + '/Willll/result.txt', 'w') as f:
        f.write('Query,RetrievedDocuments\n')
        resuList = ' '
        for qIndex, qName in enumerate(queries_name):
            f.write(f'{qName},')
            f.write(
                f'{resuList.join(doc_names[x] for x in ranksResults[qIndex])}\n'
            )
def main():
    """Train the linear NVSM model, persist it, report recall@k, and print
    the best-matching document for each demo query."""
    model_folder = Path('../../models')
    data_folder = Path('../../data/processed')
    model_path = model_folder / 'nvsm_30_20_10.pt'
    batch_size = 1000
    voc, stoi, itos, docs = load_data(model_folder, data_folder)
    doc_names = [d['name'] for d in docs]
    print('Vocabulary size', len(voc))
    n_grams, document_ids = create_dataset(
        tok_docs=[d['tokens'] for d in docs], stoi=stoi, n=10,
    )
    print('N-grams number', len(n_grams))
    k_values = [1, 3, 5, 10]
    train_data, eval_data, eval_train_data = create_pytorch_datasets(
        n_grams, document_ids,
    )
    print('Train dataset size', len(train_data))
    print('Eval dataset size', len(eval_data))
    print('Eval (training) dataset size', len(eval_train_data))
    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
    eval_loader = DataLoader(eval_data, batch_size=batch_size, shuffle=False)
    eval_train_loader = DataLoader(
        eval_train_data, batch_size=batch_size, shuffle=False,
    )
    device = torch.device('cuda')
    lamb = 1e-3  # regularization weight in the loss
    nvsm = NVSMLinear(
        n_doc=len(doc_names),
        n_tok=len(stoi),
        dim_doc_emb=20,
        dim_tok_emb=30,
        neg_sampling_rate=10,
        pad_token_id=stoi['<PAD>'],
    ).to(device)
    optimizer = optim.Adam(nvsm.parameters(), lr=1e-3)
    train(
        nvsm=nvsm,
        device=device,
        optimizer=optimizer,
        epochs=120,
        train_loader=train_loader,
        eval_loader=eval_train_loader,
        k_values=k_values,
        loss_function=loss_function,
        lamb=lamb,
        print_every=500,
    )
    torch.save(nvsm.state_dict(), model_path)
    nvsm.eval()
    recall_at_ks = evaluate(
        nvsm=nvsm,
        device=device,
        eval_loader=eval_loader,
        recalls=k_values,
        loss_function=loss_function,
    )
    print(generate_eval(k_values, recall_at_ks))
    queries_text = [
        'violence king louis decapitated',
        'domain language translate',
        'governement robespierre',
        'perfect imperfect information',
        'ontology translation',
        'high levels of political violence',
        'state education system which promotes civic values',
        'political struggles',
        'Almost all future revolutionary movements looked back to the Revolution as their predecessor',
        'Habermas argued that the dominant cultural model in 17th century France was a "representational" culture',
        'mathematical model winning strategy',
        'solutions for two-person zero-sum games',
        'cooperative coalitions bargaining',
        'eigenvalue',
        'graph, dimension and components',
        'inner product vertex',
    ]
    evaluation_results = evaluate_queries(
        nvsm, queries_text, doc_names, stoi, batch_size, device,
    )
    for query, doc_idx in zip(queries_text, evaluation_results):
        print(f'{query:35} -> {doc_names[doc_idx]}')
def train(data_loader, data_size, batch_size, embedding_dim, hidden_dim,
          sentence_length, num_layers, epochs, learning_rate, tag2id,
          model_saved_path, train_log_path, validate_log_path,
          train_history_image_path):
    """Train a BiLSTM-CRF NER model, validating after every epoch.

    Args:
        data_loader: dict of iterable loaders, keys "train" / "validation".
        data_size: dict of sample counts, keys "train" / "validation".
        batch_size: samples per batch.
        embedding_dim: character embedding size.
        hidden_dim: LSTM hidden size.
        sentence_length: maximum text length.
        num_layers: number of stacked LSTM layers.
        epochs: number of training epochs.
        learning_rate: optimizer learning rate.
        tag2id: tag -> id mapping.
        model_saved_path: path where the trained state dict is saved.
        train_log_path: per-epoch training metrics log path.
        validate_log_path: per-epoch validation metrics log path.
        train_history_image_path: destination for metric history plots.
    """
    # Character-to-id vocabulary consumed by the embedding layer.
    with open("./data/char_to_id.json", mode="r", encoding="utf-8") as vocab_file:
        char2id = json.load(vocab_file)

    model = BiLSTM_CRF(vocab_size=len(char2id),
                       tag_to_ix=tag2id,
                       embedding_dim=embedding_dim,
                       hidden_dim=hidden_dim,
                       batch_size=batch_size,
                       num_layers=num_layers,
                       sequence_length=sentence_length)

    # Adam optimizer; the SGD alternative is kept for reference (pytorch's
    # GPU-accelerated Embedding supports SGD / SparseAdam).
    # optimizer = optim.SGD(params=model.parameters(), lr=learning_rate,
    #                       momentum=0.85, weight_decay=1e-4)
    optimizer = optim.Adam(params=model.parameters(), lr=learning_rate,
                           betas=(0.9, 0.999), eps=1e-8, weight_decay=1e-4)
    # Optional decay schedule (lr *= 0.8 every 5 epochs), currently disabled:
    # scheduler = optim.lr_scheduler.StepLR(optimizer=optimizer, step_size=5, gamma=0.8)

    # Per-epoch metric history used for the plots written at the end.
    train_loss_list, train_acc_list = [], []
    train_recall_list, train_f1_list = [], []
    validate_loss_list, validate_acc_list = [], []
    validate_recall_list, validate_f1_list = [], []

    # Reverse mappings for decoding predictions back to tags / characters.
    id2tag = {v: k for k, v in tag2id.items()}
    id2char = {v: k for k, v in char2id.items()}

    # FIX: both log files are now closed deterministically (they used to
    # stay open for the life of the process).
    with open(train_log_path, mode="w", encoding="utf-8") as train_log_file, \
            open(validate_log_path, mode="w", encoding="utf-8") as validate_log_file:
        for epoch in range(epochs):
            tqdm.write("Epoch {}/{}".format(epoch + 1, epochs))

            # ---------------- training phase ----------------
            total_acc_entities_length = 0
            total_predict_entities_length = 0
            total_gold_entities_length = 0
            step, total_loss = 1, 0.0
            # FIX: pre-initialize so a zero denominator below can no longer
            # leave these names unbound (former NameError risk at the
            # `total_acc + total_recall` check).
            total_acc, total_recall = 0, 0
            for inputs, labels in tqdm(data_loader["train"]):
                inputs, labels = Variable(inputs), Variable(labels)
                # Gradients accumulate in pytorch: clear them per batch.
                optimizer.zero_grad()
                loss = model.neg_log_likelihood(inputs, labels)
                total_loss += loss.data
                # Best decoded tag paths via BiLSTM_CRF.forward().
                best_path_list = model(inputs)
                step_acc, step_recall, f1_score, acc_entities_length, \
                    predict_entities_length, gold_entities_length = evaluate(
                        inputs.tolist(), labels.tolist(), best_path_list,
                        id2char, id2tag)
                total_acc_entities_length += acc_entities_length
                total_predict_entities_length += predict_entities_length
                total_gold_entities_length += gold_entities_length
                loss.backward()
                optimizer.step()
                step += 1

            # Epoch-level metrics (mean loss over the whole train split).
            epoch_loss = total_loss / data_size["train"]
            if total_predict_entities_length > 0:
                total_acc = total_acc_entities_length / total_predict_entities_length
            if total_gold_entities_length > 0:
                total_recall = total_acc_entities_length / total_gold_entities_length
            total_f1 = 0
            if total_acc + total_recall != 0:
                total_f1 = 2 * total_acc * total_recall / (total_acc + total_recall)
            # FIX: "scroe" typo corrected in the log line.
            log_text = "Epoch: %s " \
                       "| mean loss: %.5f " \
                       "| total acc: %.5f " \
                       "| total recall: %.5f " \
                       "| total f1 score: %.5f" % (epoch, epoch_loss, total_acc,
                                                   total_recall, total_f1)
            # scheduler.step() would go here — only after the optimizer steps.
            train_loss_list.append(epoch_loss)
            train_acc_list.append(total_acc)
            train_recall_list.append(total_recall)
            train_f1_list.append(total_f1)
            train_log_file.write(log_text + "\n")

            # ---------------- validation phase ----------------
            total_acc_entities_length = 0
            total_predict_entities_length = 0
            total_gold_entities_length = 0
            step, total_loss = 1, 0.0
            total_acc, total_recall = 0, 0
            with torch.no_grad():
                for inputs, labels in tqdm(data_loader["validation"]):
                    inputs, labels = Variable(inputs), Variable(labels)
                    # Best-effort: skip batches the CRF cannot score.
                    # (FIX: was a bare `except:`, which also swallowed
                    # KeyboardInterrupt/SystemExit.)
                    try:
                        loss = model.neg_log_likelihood(inputs, labels)
                    except Exception:
                        continue
                    total_loss += loss.data
                    best_path_list = model(inputs)
                    step_acc, step_recall, f1_score, acc_entities_length, \
                        predict_entities_length, gold_entities_length = evaluate(
                            inputs.tolist(), labels.tolist(), best_path_list,
                            id2char, id2tag)
                    total_acc_entities_length += acc_entities_length
                    total_predict_entities_length += predict_entities_length
                    total_gold_entities_length += gold_entities_length
                    step += 1

            epoch_loss = total_loss / data_size["validation"]
            if total_predict_entities_length > 0:
                total_acc = total_acc_entities_length / total_predict_entities_length
            if total_gold_entities_length > 0:
                total_recall = total_acc_entities_length / total_gold_entities_length
            total_f1 = 0
            if total_acc + total_recall != 0.0:
                total_f1 = 2 * total_acc * total_recall / (total_acc + total_recall)
            log_text = "Epoch: %s " \
                       "| mean loss: %.5f " \
                       "| total acc: %.5f " \
                       "| total recall: %.5f " \
                       "| total f1 score: %.5f" % (epoch, epoch_loss, total_acc,
                                                   total_recall, total_f1)
            validate_loss_list.append(epoch_loss)
            validate_acc_list.append(total_acc)
            validate_recall_list.append(total_recall)
            validate_f1_list.append(total_f1)
            validate_log_file.write(log_text + "\n")

    # Persist the trained weights, then plot every metric's history.
    torch.save(model.state_dict(), model_saved_path)
    save_train_history_image(train_loss_list, validate_loss_list,
                             train_history_image_path, "Loss")
    save_train_history_image(train_acc_list, validate_acc_list,
                             train_history_image_path, "Acc")
    save_train_history_image(train_recall_list, validate_recall_list,
                             train_history_image_path, "Recall")
    save_train_history_image(train_f1_list, validate_f1_list,
                             train_history_image_path, "F1")
    print("train Finished".center(100, "-"))
def train(data_loader, data_size, batch_size, embedding_dim, hidden_dim,
          sentence_length, num_layers, epochs, learning_rate, tag2id,
          model_saved_path, train_log_path, validate_log_path,
          train_history_image_path):
    """Train a plain BiLSTM NER model (no CRF, no validation loop).

    Takes the same parameters as the BiLSTM-CRF trainer for interface
    parity; only the "train" loader and size are actually consumed.
    """
    # Character-to-id vocabulary consumed by the embedding layer.
    with open("./data/char_to_id.json", mode="r", encoding="utf-8") as vocab_file:
        char2id = json.load(vocab_file)

    model = BiLSTM(vocab_size=len(char2id),
                   tag_to_ix=tag2id,
                   embedding_dim=embedding_dim,
                   hidden_dim=hidden_dim,
                   batch_size=batch_size,
                   num_layers=num_layers,
                   sequence_length=sentence_length)

    # SGD with momentum (torch's GPU-accelerated Embedding supports SGD
    # and SparseAdam).
    optimizer = optim.SGD(params=model.parameters(), lr=learning_rate,
                          momentum=0.85)
    # Decay the learning rate by `gamma` every `step_size` epochs, e.g. with
    # lr=0.5: 0.5 for epoch<5, 0.1 for 5<=epoch<10, ...
    scheduler = optim.lr_scheduler.StepLR(optimizer=optimizer, step_size=5,
                                          gamma=0.2)

    train_loss_list, train_acc_list = [], []
    train_recall_list, train_f1_list = [], []
    # NOTE(review): this trainer has no validation loop, so these series
    # stay empty; save_train_history_image is assumed to tolerate empty
    # series — confirm.
    validate_loss_list, validate_acc_list = [], []
    validate_recall_list, validate_f1_list = [], []

    # Reverse mappings for decoding predictions back to tags / characters.
    id2tag = {v: k for k, v in tag2id.items()}
    id2char = {v: k for k, v in char2id.items()}

    # FIX: log files are now closed via `with` (they previously leaked; the
    # validation log is opened for parity but never written to).
    with open(train_log_path, mode="w", encoding="utf-8") as train_log_file, \
            open(validate_log_path, mode="w", encoding="utf-8") as validate_log_file:
        for epoch in range(epochs):
            tqdm.write("Epoch {}/{}".format(epoch + 1, epochs))
            total_acc_entities_length = 0
            total_predict_entities_length = 0
            total_gold_entities_length = 0
            step, total_loss = 1, 0.0
            # FIX: pre-initialize and guard the divisions below — an epoch
            # with no predicted/gold entities used to raise
            # ZeroDivisionError (the BiLSTM-CRF trainer already guards).
            total_acc, total_recall = 0, 0
            for inputs, labels in tqdm(data_loader["train"]):
                inputs, labels = Variable(inputs), Variable(labels)
                # Gradients accumulate in pytorch: clear them per batch.
                optimizer.zero_grad()
                loss = model.neg_log_likelihood(inputs, labels)
                step_loss = loss.data
                total_loss += step_loss
                # Best decoded tag paths via the model's forward().
                best_path_list = model(inputs)
                step_acc, step_recall, f1_score, acc_entities_length, \
                    predict_entities_length, gold_entities_length = evaluate(
                        inputs.tolist(), labels.tolist(), best_path_list,
                        id2char, id2tag)
                log_text = "Epoch: %s | Step: %s " \
                           "| loss: %.5f " \
                           "| acc: %.5f " \
                           "| recall: %.5f " \
                           "| f1 score: %.5f" % \
                           (epoch, step, step_loss, step_acc, step_recall,
                            f1_score)
                total_acc_entities_length += acc_entities_length
                total_predict_entities_length += predict_entities_length
                total_gold_entities_length += gold_entities_length
                loss.backward()
                optimizer.step()
                train_log_file.write(log_text + "\n")
                step += 1

            epoch_loss = total_loss / data_size["train"]
            if total_predict_entities_length > 0:
                total_acc = total_acc_entities_length / total_predict_entities_length
            if total_gold_entities_length > 0:
                total_recall = total_acc_entities_length / total_gold_entities_length
            total_f1 = 0
            if total_acc + total_recall != 0:
                total_f1 = 2 * total_acc * total_recall / (total_acc + total_recall)
            # FIX: "scroe" typo corrected in the epoch summary line.
            log_text = "Epoch: %s " \
                       "| mean loss: %.5f " \
                       "| total acc: %.5f " \
                       "| total recall: %.5f " \
                       "| total f1 score: %.5f" % (epoch, epoch_loss, total_acc,
                                                   total_recall, total_f1)
            # Update the learning rate only after the optimizer has stepped.
            scheduler.step()
            train_loss_list.append(epoch_loss)
            train_acc_list.append(total_acc)
            train_recall_list.append(total_recall)
            train_f1_list.append(total_f1)
            train_log_file.write(log_text + "\n")

    # Persist the trained weights, then plot every metric's history.
    torch.save(model.state_dict(), model_saved_path)
    save_train_history_image(train_loss_list, validate_loss_list,
                             train_history_image_path, "Loss")
    save_train_history_image(train_acc_list, validate_acc_list,
                             train_history_image_path, "Acc")
    save_train_history_image(train_recall_list, validate_recall_list,
                             train_history_image_path, "Recall")
    save_train_history_image(train_f1_list, validate_f1_list,
                             train_history_image_path, "F1")
    print("train Finished".center(100, "-"))
def main():
    """Load a trained NVSM-BERT model, evaluate recall@k on the held-out
    n-grams, and write ranked retrieval results for the test queries."""
    # mypath = r'/home/connlab/108IR/will/final/NVSM_pytorch/'
    mypath = r'C:/Users/willll/Desktop/WIillll/IRfinal/NVSM_pytorch'
    print(mypath)
    pretrained_model = 'bert-base-uncased'
    model_folder = Path(mypath + '/models')
    # data_folder = Path(mypath + '/data/processed')
    data_folder = Path(mypath + '/Willll/fakedoc')
    testing_query_folder = Path(mypath + '/Willll/test/query')
    model_path = model_folder / 'nvsm_bert.pt'
    batch_size = 140  # for 150, 8053 / 8113MB GPU memory, to tweak
    docs, queries, tokenizer = load_data(
        data_folder, testing_query_folder, pretrained_model
    )
    doc_names = [doc['name'] for doc in docs]
    n_grams, document_ids = create_dataset(
        tok_docs=[doc['tokens'] for doc in docs],
        tokenizer=tokenizer,
        n=10
    )
    print('N-grams number', len(n_grams))
    k_values = [1, 3, 5, 10]
    (train_data, eval_data,
     eval_train_data) = create_pytorch_datasets(n_grams, document_ids)
    print('Train dataset size', len(train_data))
    print('Eval dataset size', len(eval_data))
    print('Eval (training) dataset size', len(eval_train_data))
    # Evaluation-only script: unused locals from the training variant
    # (glove_path, epochs, lamb, eval_train_loader) were removed.
    eval_loader = DataLoader(eval_data, batch_size=batch_size, shuffle=False)
    device = torch.device('cuda')
    nvsm = NVSMBERT(
        pretrained_model=pretrained_model,
        n_doc=len(doc_names),
        dim_doc_emb=20,
        neg_sampling_rate=10,
    ).to(device)
    # Restore previously trained weights; no training happens here.
    nvsm.load_state_dict(torch.load(model_path))
    nvsm.eval()
    recall_at_ks = evaluate(
        nvsm=nvsm,
        device=device,
        eval_loader=eval_loader,
        recalls=k_values,
        loss_function=loss_function,
    )
    print(generate_eval(k_values, recall_at_ks))
    queries_text = [query['tokens'] for query in queries]
    queries_name = [query['name'] for query in queries]
    evaluation_results, ranksResults = evaluate_queries_bert(
        nvsm, queries_text, doc_names, tokenizer, batch_size, device
    )
    print(evaluation_results)
    for query_name, query_text, doc_idx in zip(queries_name, queries_text,
                                               evaluation_results):
        # FIX: str() keeps the :35 format spec valid even when the query
        # tokens are a list (lists reject format specs with a TypeError).
        print(f'{query_name} {str(query_text):35} -> {doc_names[doc_idx]}')
    # FIX: was mypath + './Willll/result.txt', which injected a stray
    # 'NVSM_pytorch./' component into the output path.
    with open(mypath + '/Willll/result.txt', 'w') as f:
        f.write('Query,RetrievedDocuments\n')
        resuList = ' '
        for qIndex, qName in enumerate(queries_name):
            f.write(f'{qName},')
            f.write(f'{resuList.join(doc_names[x] for x in ranksResults[qIndex])}\n')
y_train, penalty=best_parameters["model_non_regularized"]["Penalty"], C=best_parameters["model_non_regularized"]["C"], solver=best_parameters["model_non_regularized"]["Solver"], multi_class=best_parameters["model_non_regularized"]["MultiClass"], max_iter=1000) lg = LogisticRegression( penalty=best_parameters["model_regularized"]["Penalty"], C=best_parameters["model_regularized"]["C"], solver=best_parameters["model_regularized"]["Solver"], multi_class=best_parameters["model_regularized"]["MultiClass"], max_iter=1000) cvs = cross_val_score(lg, x_train, y_train, cv=4) # Evaluate the model print("\n Evaluate the model \n") print("\nRegularized:") evaluate(trained_model_regularized, x_train, y_train) print("\n Cross_validation") print(cvs) print("\nNon Regularized:") evaluate(trained_model_non_regularized, x_train, y_train) # Test the model print("\n Test the model \n") print("\nRegularized:") test(trained_model_regularized, x_test, y_test) print("\nNon Regularized:") test(trained_model_non_regularized, x_test, y_test)
def main():
    """Train NVSM-BERT on the processed corpus, persist it, then report
    recall@k and the best-matching document for each demo query."""
    pretrained_model = 'bert-base-uncased'
    glove_path = Path('../../glove')
    model_folder = Path('../../models')
    data_folder = Path('../../data/processed')
    model_path = model_folder / 'nvsm_bert.pt'
    batch_size = 140  # for 150, 8053 / 8113MB GPU memory, to tweak
    epochs = 3
    docs, tokenizer = load_data(data_folder, pretrained_model)
    doc_names = [d['name'] for d in docs]
    n_grams, document_ids = create_dataset(
        tok_docs=[d['tokens'] for d in docs],
        tokenizer=tokenizer,
        n=10,
    )
    print('N-grams number', len(n_grams))
    k_values = [1, 3, 5, 10]
    train_data, eval_data, eval_train_data = create_pytorch_datasets(
        n_grams, document_ids,
    )
    print('Train dataset size', len(train_data))
    print('Eval dataset size', len(eval_data))
    print('Eval (training) dataset size', len(eval_train_data))
    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
    eval_loader = DataLoader(eval_data, batch_size=batch_size, shuffle=False)
    eval_train_loader = DataLoader(
        eval_train_data, batch_size=batch_size, shuffle=False,
    )
    device = torch.device('cuda')
    lamb = 1e-3  # regularization weight in the loss
    nvsm = NVSMBERT(
        pretrained_model=pretrained_model,
        n_doc=len(doc_names),
        dim_doc_emb=20,
        neg_sampling_rate=10,
    ).to(device)
    # BERT-style optimizer: exempt biases and LayerNorm weights from decay.
    decay_exempt = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    named_params = list(nvsm.named_parameters())
    grouped_params = [
        {
            'params': [p for name, p in named_params
                       if not any(marker in name for marker in decay_exempt)],
            'weight_decay': 0.01,
        },
        {
            'params': [p for name, p in named_params
                       if any(marker in name for marker in decay_exempt)],
            'weight_decay': 0.0,
        },
    ]
    optimizer = BertAdam(params=grouped_params,
                         lr=5e-5,
                         warmup=0.1,
                         t_total=len(train_loader) * epochs)
    train(nvsm=nvsm,
          device=device,
          optimizer=optimizer,
          epochs=epochs,
          train_loader=train_loader,
          eval_loader=eval_train_loader,
          k_values=k_values,
          loss_function=loss_function,
          lamb=lamb,
          print_every=500)
    torch.save(nvsm.state_dict(), model_path)
    nvsm.eval()
    recall_at_ks = evaluate(
        nvsm=nvsm,
        device=device,
        eval_loader=eval_loader,
        recalls=k_values,
        loss_function=loss_function,
    )
    print(generate_eval(k_values, recall_at_ks))
    queries_text = [
        'violence king louis decapitated',
        'domain language translate',
        'governement robespierre',
        'perfect imperfect information',
        'ontology translation',
        'high levels of political violence',
        'state education system which promotes civic values',
        'political struggles',
        'Almost all future revolutionary movements looked back to the Revolution as their predecessor',
        'Habermas argued that the dominant cultural model in 17th century France was a "representational" culture',
        'mathematical model winning strategy',
        'solutions for two-person zero-sum games',
        'cooperative coalitions bargaining',
        'eigenvalue',
        'graph, dimension and components',
        'inner product vertex',
    ]
    evaluation_results = evaluate_queries_bert(nvsm, queries_text, doc_names,
                                               tokenizer, batch_size, device)
    for query, doc_idx in zip(queries_text, evaluation_results):
        print(f'{query:35} -> {doc_names[doc_idx]}')
import warnings

# Silence noisy framework deprecation chatter before the heavy imports.
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=Warning)

# Custom helpers for evaluation and video recording
from evaluate_model import evaluate
from record_model import record

from nes_py.wrappers import JoypadSpace
import gym_tetris
from gym_tetris.actions import MOVEMENT, SIMPLE_MOVEMENT, TRAIN_MOVEMENT
from stable_baselines.common.vec_env import DummyVecEnv, VecVideoRecorder
from stable_baselines.deepq.policies import MlpPolicy
from stable_baselines import DQN

# Build the Tetris environment with the restricted training action set,
# then wrap it as a single-env vectorized environment. Naming the wrapped
# env avoids the late-binding `lambda: env` closure of the original.
wrapped_env = JoypadSpace(gym_tetris.make('TetrisA-v3'), TRAIN_MOVEMENT)
env = DummyVecEnv([lambda: wrapped_env])

# Restore the pretrained DQN agent and score it over 20 rendered episodes.
model = DQN.load("TetrisA-v2_DQN_200k", env=env, verbose=1)
mean_reward = evaluate(model=model, env=env, episode=20, render=True)
print(mean_reward)