示例#1
0
def main():
    """Build, train and record every model described by ``model_configs``.

    Workflow:
      1. Load ``X``, ``Y`` and ``groups`` with ``load_data`` and split them
         into train/test parts with ``split_train_test``.
      2. Build one model per config with ``CNN_Builder.build_Sequential``,
         compile all of them, then fit them one after another on the same
         split.
      3. Pass the collected histories to the ``save_*`` helpers, using this
         script's base name (extension removed) as the output folder name.

    Paths and hyper-parameters are read from module-level names
    (``TRACK_PATH``, ``BED_PATH``, ``GROUP_PATH``, ``N_SPLITS``,
    ``BATCH_SIZE``, ``EPOCHS``, ``USE_*``, ``VERBOSE``, ``model_configs``).
    The function takes no arguments and returns ``None``.
    """
    # Step 1: load data
    X, Y, groups = load_data(track_path=TRACK_PATH,
                             bed_path=BED_PATH,
                             group_path=GROUP_PATH)
    x_train, y_train, x_test, y_test = split_train_test(X, Y, groups, N_SPLITS)

    # Step 2: run models
    # All models are built and compiled before any of them is fitted.
    models = [
        CNN_Builder.build_Sequential(model_config)
        for model_config in model_configs
    ]
    for model in models:
        compile_model(model, metric_names=("auprc", "auroc"), optimizer="adam")

    # NOTE(review): x_test/y_test are handed to every fit; presumably they
    # serve as validation data (a "val_auprc" metric is saved below) --
    # confirm in fit_model.
    histories = [
        fit_model(model,
                  x_train,
                  y_train,
                  x_test,
                  y_test,
                  batch_size=BATCH_SIZE,
                  epochs=EPOCHS,
                  use_class_weight=USE_CLASS_WEIGHT,
                  use_sample_weight=USE_SAMPLE_WEIGHT,
                  use_reduce_rl=USE_REDUCE_RL,
                  use_early_stopping=USE_EARLY_STOPPING,
                  verbose=VERBOSE) for model in models
    ]

    # Step 3: save artifacts
    script_fn = os.path.basename(__file__)
    # splitext strips only the final extension; splitting on ".py" would cut
    # the name at the first ".py" substring (e.g. "exp.pytorch.py" -> "exp").
    folder = os.path.splitext(script_fn)[0]
    # Model names are numbered from 1, in the same order as model_configs.
    model_names = [
        get_model_name(model_config, position)
        for position, model_config in enumerate(model_configs, start=1)
    ]

    save_single_metric_of_multi_models(folder, model_names, histories,
                                       "val_auprc")
    save_all_metrics_of_multi_models(folder, model_names, histories)
    save_history_copies(folder, model_names, histories)

    print("{} finished!".format(script_fn))
示例#2
0
def test():
    """
    Test routine: simulates training, evaluating and saving a model end to end.

    Sentences are read from 'data/bad.txt' and 'data/good.txt', turned into
    vectors, split into train/valid/test sets, used to train and test an
    LSTM model, and finally the model and the vocabulary are saved.
    :return: None
    """
    negative_sentences = extract_sentences_from_file('data/bad.txt')
    positive_sentences = extract_sentences_from_file('data/good.txt')

    # Fit the vocabulary on every sentence, then vectorise each group.
    vocabulary = Word_Sequence()
    vocabulary.fit(positive_sentences + negative_sentences)
    positive_vecs = vocabulary.transfroms(positive_sentences, max_len=25)
    negative_vecs = vocabulary.transfroms(negative_sentences, max_len=25)

    # Assemble samples and labels: good sentences -> 0, bad sentences -> 1.
    data_sets = []
    labels = []
    for class_id, group in enumerate((positive_vecs, negative_vecs)):
        for sent_vec in group:
            data_sets.append(sent_vec)
            labels.append(class_id)

    # Create the training, validation and test sets.
    parts = split_train_test(data_sets, labels, test_size=0.2, valid_size=0.1)
    X_train, Y_train, X_valid, Y_valid, X_test, Y_test = parts

    # Wrap each split in a dataloader (none of them is shuffled here).
    train_loader = pick_dataloader(X_train, Y_train, shuffle=False)
    test_loader = pick_dataloader(X_test, Y_test, shuffle=False)
    valid_loader = pick_dataloader(X_valid, Y_valid, shuffle=False)

    # Build the model.
    model = LSTM_Model(len(vocabulary),
                       embed_size=256,
                       hidden_size=256,
                       output_size=2)

    # Set up the optimizer and the loss function.
    optimizer = optim.Adam(model.parameters(), lr=0.0001)
    criterion = NLLLoss()
    # NOTE(review): the device strings are 'GPU'/'CPU' rather than torch's
    # 'cuda'/'cpu'; presumably train_model/test_model translate them -- confirm.
    if torch.cuda.is_available():
        device = 'GPU'
    else:
        device = 'CPU'

    # Keyword arguments common to the training and the test run.
    shared_options = dict(criterion=criterion, device=device, use_accuracy=True)

    # Train the model; the returned info is kept only for ad-hoc inspection.
    train_info = train_model(model, train_loader,
                             optimizer=optimizer,
                             epochs=2,
                             print_every=100,
                             valid_dataloader=valid_loader,
                             use_valid=True,
                             **shared_options)

    # Evaluate the model on the test split.
    test_model(model, test_dataloader=test_loader, **shared_options)

    # Save the model, then the fitted vocabulary.
    save_model(model)
    save_word_sequence(vocabulary)
示例#3
0
def create_dataset():
    """Build the train/valid/test dataloaders and the vocabulary from ``config``.

    Steps:
      1. Read the sentences of every ``label -> file`` entry in
         ``config.data_files``.
      2. Fit a ``Word_Sequence`` on all sentences.
      3. Vectorise each label's sentences according to ``config.model``
         ('lstm' -> ``transfroms`` with ``config.max_len``,
         'fnn' -> ``transfroms_word_bag``), tag every sentence vector with
         the position of its label in ``config.data_files``, and split the
         result with ``split_train_test``.
      4. Wrap the three splits in dataloaders; only the training one is
         shuffled.
      5. Save the word sequence and the list of label names.

    :return: tuple ``(train_dataloader, valid_dataloader, test_dataloader,
        word_sequence, label_name)``
    :raises ValueError: if ``config.model`` is neither 'lstm' nor 'fnn'
        (compared case-insensitively).
    """
    # Validate the model type first: previously an unsupported value left
    # ``dataset_vec`` unbound and surfaced as an opaque UnboundLocalError.
    model_type = config.model.lower()
    if model_type not in ('lstm', 'fnn'):
        raise ValueError(
            "Unsupported config.model {!r}; expected 'lstm' or 'fnn'".format(
                config.model))

    # 1. Collect the sentences of every label in a dict (label -> sentences).
    data_dict = {
        label: extract_sentences_from_file(file_path)
        for label, file_path in config.data_files.items()
    }

    # 2. Build the vocabulary.
    # 2.1 Gather every sentence; extend() avoids re-copying the growing list
    #     on each iteration, as ``a = a + b`` did.
    all_sentences = []
    for sentences in data_dict.values():
        all_sentences.extend(sentences)

    # 2.2 Fit the vocabulary on all sentences.
    word_sequence = Word_Sequence()
    word_sequence.fit(all_sentences)

    # 3. Create the training, test and validation data.
    # Vectorised sentences per label, plus the label names in the same order.
    datasets = []
    label_name = []
    for label, sentences in data_dict.items():
        if model_type == 'lstm':
            # Dataset for the lstm model.
            dataset_vec = word_sequence.transfroms(sentences,
                                                   max_len=config.max_len)
        else:
            # Dataset for the fnn model.
            dataset_vec = word_sequence.transfroms_word_bag(sentences)

        datasets.append(dataset_vec)
        label_name.append(label)

    # Tag every sentence vector with the index of its label, i.e. the
    # label's position in config.data_files (first label -> 0, second -> 1).
    datasets_vec = []
    labels = []
    for label_index, dataset_vec in enumerate(datasets):
        for sentence_vec in dataset_vec:
            datasets_vec.append(sentence_vec)
            labels.append(label_index)

    # Split into training, test and validation sets.
    (X_train, Y_train, X_valid, Y_valid, X_test, Y_test) = \
        split_train_test(datasets_vec, labels,
                         test_size=config.test_size,
                         valid_size=config.valid_size)

    # 4. Wrap the splits in dataloaders.
    train_dataloader = pick_dataloader(X_train,
                                       Y_train,
                                       batch_size=config.batch_size,
                                       shuffle=True,
                                       data_type=config.model)

    test_dataloader = pick_dataloader(X_test,
                                      Y_test,
                                      batch_size=config.batch_size,
                                      shuffle=False,
                                      data_type=config.model)

    valid_dataloader = pick_dataloader(X_valid,
                                       Y_valid,
                                       batch_size=config.batch_size,
                                       shuffle=False,
                                       data_type=config.model)

    # 5. Save the vocabulary and the label names.
    save_word_sequence(word_sequence,
                       save_path=config.word_sequence_save_path,
                       file_path=config.word_sequence_save_file)
    # NOTE(review): label_file_name is not defined inside this function; it
    # must come from module scope -- verify it exists there.
    save_object(label_name, config.model_save_path, label_file_name)

    # Return the dataloaders, the word sequence and the label names.
    return (train_dataloader, valid_dataloader, test_dataloader, word_sequence,
            label_name)