import os


def main():
    # Step 1: load data
    X, Y, groups = load_data(track_path=TRACK_PATH, bed_path=BED_PATH,
                             group_path=GROUP_PATH)
    x_train, y_train, x_test, y_test = split_train_test(X, Y, groups, N_SPLITS)

    # Step 2: run models
    models = [
        CNN_Builder.build_Sequential(model_config)
        for model_config in model_configs
    ]
    for model in models:
        compile_model(model, metric_names=("auprc", "auroc"), optimizer="adam")
    histories = [
        fit_model(model, x_train, y_train, x_test, y_test,
                  batch_size=BATCH_SIZE, epochs=EPOCHS,
                  use_class_weight=USE_CLASS_WEIGHT,
                  use_sample_weight=USE_SAMPLE_WEIGHT,
                  use_reduce_rl=USE_REDUCE_RL,
                  use_early_stopping=USE_EARLY_STOPPING,
                  verbose=VERBOSE)
        for model in models
    ]

    # Step 3: save artifacts
    script_fn = os.path.basename(__file__)
    folder = script_fn.split(".py")[0]
    model_names = [
        get_model_name(model_configs[i], i + 1)
        for i in range(len(model_configs))
    ]
    save_single_metric_of_multi_models(folder, model_names, histories, "val_auprc")
    save_all_metrics_of_multi_models(folder, model_names, histories)
    save_history_copies(folder, model_names, histories)
    print("{} finished!".format(script_fn))
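
# compile_model is a project helper whose body is not shown here. As a minimal
# sketch of what it might do, assuming a tf.keras model (suggested by the
# "val_auprc" history key above): map the metric names to keras.metrics.AUC
# instances and compile. The function name, loss choice, and signature are
# illustrative assumptions, not the project's actual implementation.
import tensorflow as tf


def compile_model_sketch(model, metric_names=("auprc", "auroc"),
                         optimizer="adam"):
    # "auprc" -> area under the precision-recall curve,
    # "auroc" -> area under the ROC curve.
    metric_map = {
        "auprc": tf.keras.metrics.AUC(curve="PR", name="auprc"),
        "auroc": tf.keras.metrics.AUC(curve="ROC", name="auroc"),
    }
    metrics = [metric_map[name] for name in metric_names]
    # Binary cross-entropy is assumed here for a binary classification task.
    model.compile(optimizer=optimizer, loss="binary_crossentropy",
                  metrics=metrics)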
import torch
from torch import optim
from torch.nn import NLLLoss


def test():
    """
    Test routine: simulates training, evaluating, and saving the model.
    :return:
    """
    bad_sentences = extract_sentences_from_file('data/bad.txt')
    good_sentences = extract_sentences_from_file('data/good.txt')

    # Fit on all sentences to build the vocabulary
    word_sequence = Word_Sequence()
    word_sequence.fit(good_sentences + bad_sentences)
    good_sentences_vec = word_sequence.transfroms(good_sentences, max_len=25)
    bad_sentences_vec = word_sequence.transfroms(bad_sentences, max_len=25)

    # Build the combined dataset and labels: good -> 0, bad -> 1
    data_sets = list()
    labels = list()
    for sent_vec in good_sentences_vec:
        data_sets.append(sent_vec)
        labels.append(0)
    for sent_vec in bad_sentences_vec:
        data_sets.append(sent_vec)
        labels.append(1)
    (X_train, Y_train, X_valid, Y_valid, X_test, Y_test) = \
        split_train_test(data_sets, labels, test_size=0.2, valid_size=0.1)

    # Wrap the splits in dataloaders
    train_dataloader = pick_dataloader(X_train, Y_train, shuffle=False)
    test_dataloader = pick_dataloader(X_test, Y_test, shuffle=False)
    valid_dataloader = pick_dataloader(X_valid, Y_valid, shuffle=False)

    # Build the model
    lstm_model = LSTM_Model(len(word_sequence), embed_size=256,
                            hidden_size=256, output_size=2)

    # Set up the optimizer and loss function
    optimizer = optim.Adam(lstm_model.parameters(), lr=0.0001)
    criterion = NLLLoss()
    # PyTorch device strings are 'cuda'/'cpu', not 'GPU'/'CPU'
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # Train the model
    train_info = train_model(lstm_model, train_dataloader,
                             optimizer=optimizer, criterion=criterion,
                             epochs=2, print_every=100,
                             valid_dataloader=valid_dataloader,
                             device=device, use_accuracy=True, use_valid=True)
    # print(train_info)

    # Evaluate the model
    test_model(lstm_model, test_dataloader=test_dataloader,
               criterion=criterion, device=device, use_accuracy=True)

    # Save the model
    save_model(lstm_model)

    # Save the extracted vocabulary
    save_word_sequence(word_sequence)
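
# extract_sentences_from_file is a project helper whose body is not shown. A
# hedged sketch of what it might look like, assuming one sentence per line in
# a UTF-8 text file; the real implementation may add tokenization or cleaning.
def extract_sentences_from_file_sketch(file_path):
    sentences = []
    with open(file_path, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line:  # skip blank lines
                sentences.append(line)
    return sentences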
def create_dataset():
    # 1. Extract the labels and sentence data, keyed by label in a dict
    data_dict = {}
    for label, file in config.data_files.items():
        sentences = extract_sentences_from_file(file)
        data_dict[label] = sentences

    # 2. Build the vocabulary
    # 2.1 Collect every sentence into all_sentences to fit the vocabulary on
    all_sentences = list()
    for sentences in data_dict.values():
        all_sentences = all_sentences + sentences
    # 2.2 Fit the vocabulary
    word_sequence = Word_Sequence()
    word_sequence.fit(all_sentences)

    # 3. Create the train/test/valid datasets
    # Convert the data to vectors, stored in datasets
    datasets = []
    # Keep the label names in insertion order
    label_name = []
    for label, dataset in data_dict.items():
        # LSTM model: padded index sequences
        if config.model.lower() == 'lstm':
            dataset_vec = word_sequence.transfroms(dataset,
                                                   max_len=config.max_len)
        # FNN model: bag-of-words vectors
        elif config.model.lower() == 'fnn':
            dataset_vec = word_sequence.transfroms_word_bag(dataset)
        else:
            # guard against dataset_vec being unbound below
            raise ValueError("unsupported model type: {}".format(config.model))
        datasets.append(dataset_vec)
        label_name.append(label)

    # Label the data: every sentence vector in a dataset gets the index of its
    # class, e.g. sentences of the "good" class get 0 and "bad" sentences get 1
    datasets_vec = list()
    labels = list()
    for i, dataset_vec in enumerate(datasets):
        for sentence_vec in dataset_vec:
            datasets_vec.append(sentence_vec)
            labels.append(i)

    # Split into train/test/valid sets
    (X_train, Y_train, X_valid, Y_valid, X_test, Y_test) = \
        split_train_test(datasets_vec, labels, test_size=config.test_size,
                         valid_size=config.valid_size)

    # 4. Wrap the splits in dataloaders
    train_dataloader = pick_dataloader(X_train, Y_train,
                                       batch_size=config.batch_size,
                                       shuffle=True, data_type=config.model)
    test_dataloader = pick_dataloader(X_test, Y_test,
                                      batch_size=config.batch_size,
                                      shuffle=False, data_type=config.model)
    valid_dataloader = pick_dataloader(X_valid, Y_valid,
                                       batch_size=config.batch_size,
                                       shuffle=False, data_type=config.model)

    save_word_sequence(word_sequence,
                       save_path=config.word_sequence_save_path,
                       file_path=config.word_sequence_save_file)
    # label_file_name was undefined here; assuming it lives on config like the
    # other settings
    save_object(label_name, config.model_save_path, config.label_file_name)

    # Return the dataloaders, the vocabulary, and the label names
    return (train_dataloader, valid_dataloader, test_dataloader,
            word_sequence, label_name)
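
# split_train_test is a project helper whose body is not shown. A minimal
# sketch of how it could be built on sklearn's train_test_split; the two-stage
# split and the return order are assumptions inferred from how it is called
# above, not the project's actual implementation.
from sklearn.model_selection import train_test_split


def split_train_test_sketch(data, labels, test_size=0.2, valid_size=0.1):
    # First carve off the test set, then take the validation set from the rest
    x_rest, x_test, y_rest, y_test = train_test_split(
        data, labels, test_size=test_size, shuffle=True)
    # Rescale valid_size so it is a fraction of the original data,
    # not a fraction of the remainder
    valid_fraction = valid_size / (1.0 - test_size)
    x_train, x_valid, y_train, y_valid = train_test_split(
        x_rest, y_rest, test_size=valid_fraction, shuffle=True)
    return x_train, y_train, x_valid, y_valid, x_test, y_test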