示例#1
0
def train_base(opt, train_feature, dev_feature=None, test_feature=None):
    """Prepare the train/dev/test datasets, build a model and start training.

    The ``ent2id`` mapping is read from
    ``<opt.mid_data_dir>/<opt.task_type>_ent2id.json``. Each feature split is
    passed through ``gen_mrc_data`` and wrapped in an ``NERDataset``; the
    model is chosen from ``opt.task_type`` (``'crf'``, ``'mrc'``, anything
    else builds the ``'span'`` model) and everything is handed to ``train``.

    Args:
        opt: Run options; this function reads ``mid_data_dir``, ``task_type``,
            ``bert_dir``, ``dropout_prob`` and, depending on the task,
            ``use_type_embed`` and ``loss_type``.
        train_feature: Training features handed to ``gen_mrc_data``.
        dev_feature: Dev features; forwarded to ``gen_mrc_data`` as-is, even
            when left at the default ``None``.
        test_feature: Test features; forwarded the same way as ``dev_feature``.
    """
    # Load the entity mapping table.
    mapping_file = os.path.join(opt.mid_data_dir,
                                f'{opt.task_type}_ent2id.json')
    with open(mapping_file, encoding='utf-8') as mapping_fh:
        ent2id = json.load(mapping_fh)

    # Convert all three splits first, then build the datasets, so the helper
    # calls happen in the order train -> dev -> test for each stage.
    raw_splits = (
        (train_feature, 'train'),
        (dev_feature, 'dev'),
        (test_feature, 'test'),
    )
    converted = [
        gen_mrc_data(features, ent2id, split_name)
        for features, split_name in raw_splits
    ]
    train_dataset, dev_dataset, test_dataset = [
        NERDataset(features, opt, ent2id) for features in converted
    ]

    # Collect the per-task arguments, then build the model in one place.
    task = opt.task_type
    if task == 'crf':
        positional = ('crf', opt.bert_dir)
        keywords = dict(num_tags=len(ent2id),
                        dropout_prob=opt.dropout_prob)
    elif task == 'mrc':
        positional = ('mrc', opt.bert_dir, opt)
        keywords = dict(dropout_prob=opt.dropout_prob,
                        use_type_embed=opt.use_type_embed,
                        loss_type=opt.loss_type)
    else:
        positional = ('span', opt.bert_dir, opt)
        keywords = dict(num_tags=len(ent2id) + 1,
                        dropout_prob=opt.dropout_prob,
                        loss_type=opt.loss_type)
    model = build_model(*positional, **keywords)

    train(opt, model, train_dataset, dev_dataset, test_dataset, ent2id)
示例#2
0
def train_model(train_graph_list, y_train, test_graph_list, y_test, topic):
    """Vectorize the train/test graphs and pass them to ``trainer.train``.

    Behaviour is controlled by module-level settings read inside this
    function: ``use_tfidf``, ``embed_retrain``, ``embed_update``,
    ``embed_type``, ``embed_model`` and ``model_type``.

    * Training vectors use a TF-IDF model fitted on the training graphs only
      (or ``None`` when ``use_tfidf`` is falsy).
    * Test vectors use a TF-IDF model re-fitted on train + test graphs.
    * The embedding is either ``embed_model`` or, when ``embed_retrain`` is
      set, a model trained on the training graphs' node content. When
      ``embed_update`` is set, that embedding is further updated with the
      test graphs' node content before the test graphs are vectorized.

    Args:
        train_graph_list: Graphs used for training.
        y_train: Labels for ``train_graph_list``; forwarded to the trainer.
        test_graph_list: Graphs used for evaluation.
        y_test: Labels for ``test_graph_list``; forwarded to the trainer.
        topic: Currently unused. It was only referenced by a disabled code
            path that loaded per-topic pretrained embeddings from disk.

    Returns:
        Whatever ``trainer.train(X_train, X_test, y_train, y_test,
        model_type)`` returns.
    """
    tfidf = train_tfidf(train_graph_list) if use_tfidf else None

    # Resolve the embedding for the training graphs once instead of
    # re-deciding inside the loop.
    if embed_retrain:
        train_tokens = graph_utils.extract_node_content(train_graph_list)
        base_embedding = train_w2v.direct_train_w2v(train_tokens)
    else:
        base_embedding = embed_model

    X_train = np.vstack([
        get_graph_vector(graph, tfidf, embed_type, base_embedding)
        for graph in train_graph_list
    ])

    # The test side sees a TF-IDF model fitted on both splits.
    if use_tfidf:
        tfidf = train_tfidf(train_graph_list + test_graph_list)

    if embed_update:
        # Update whichever embedding the training side used. Previously this
        # always referenced the retrained model, which is undefined (and
        # raised UnboundLocalError) when embed_retrain is off.
        # NOTE(review): update_model may modify its argument in place, in
        # which case the shared embed_model would be changed - confirm.
        test_embedding = train_w2v.update_model(
            base_embedding,
            graph_utils.extract_node_content(test_graph_list))
    else:
        test_embedding = base_embedding

    X_test = np.vstack([
        get_graph_vector(graph, tfidf, embed_type, test_embedding)
        for graph in test_graph_list
    ])

    return trainer.train(X_train, X_test, y_train, y_test, model_type)
示例#3
0
def train_base(opt, train_examples, dev_examples=None):
    """Train an NER model and, given dev data, keep only the best checkpoint.

    The ``ent2id`` mapping is read from
    ``<opt.mid_data_dir>/<opt.task_type>_ent2id.json``. The training examples
    are converted to features, wrapped in an ``NERDataset`` and passed to
    ``train`` with a model built for ``opt.task_type`` (``'crf'``, ``'mrc'``;
    any other value builds the ``'span'`` model).

    When ``dev_examples`` is provided, every checkpoint returned by
    ``get_model_path_list(opt.output_dir)`` is loaded and evaluated on the dev
    set. The per-step metrics are appended to
    ``<opt.output_dir>/eval_metric.txt``, the best checkpoint path is appended
    to ``./best_ckpt_path.txt`` and the directories of all other checkpoints
    are removed.

    Args:
        opt: Run options; this function reads ``mid_data_dir``, ``task_type``,
            ``max_seq_len``, ``bert_dir``, ``use_type_embed``,
            ``dropout_prob``, ``loss_type``, ``eval_batch_size``,
            ``output_dir`` and ``gpu_ids``.
        train_examples: Examples converted into training features.
        dev_examples: Optional dev examples; when ``None`` no evaluation or
            checkpoint clean-up takes place.
    """
    import shutil

    ent2id_path = os.path.join(opt.mid_data_dir,
                               f'{opt.task_type}_ent2id.json')
    with open(ent2id_path, encoding='utf-8') as f:
        ent2id = json.load(f)

    # Only the features (first element of the result) are needed for training.
    train_features = convert_examples_to_features(opt.task_type,
                                                  train_examples,
                                                  opt.max_seq_len,
                                                  opt.bert_dir, ent2id)[0]

    train_dataset = NERDataset(opt.task_type,
                               train_features,
                               'train',
                               use_type_embed=opt.use_type_embed)

    if opt.task_type == 'crf':
        model = build_model('crf',
                            opt.bert_dir,
                            num_tags=len(ent2id),
                            dropout_prob=opt.dropout_prob)
    elif opt.task_type == 'mrc':
        model = build_model('mrc',
                            opt.bert_dir,
                            dropout_prob=opt.dropout_prob,
                            use_type_embed=opt.use_type_embed,
                            loss_type=opt.loss_type)
    else:
        model = build_model('span',
                            opt.bert_dir,
                            num_tags=len(ent2id) + 1,
                            dropout_prob=opt.dropout_prob,
                            loss_type=opt.loss_type)

    train(opt, model, train_dataset)

    if dev_examples is None:
        return

    dev_features, dev_callback_info = convert_examples_to_features(
        opt.task_type, dev_examples, opt.max_seq_len, opt.bert_dir, ent2id)

    dev_dataset = NERDataset(opt.task_type,
                             dev_features,
                             'dev',
                             use_type_embed=opt.use_type_embed)

    dev_loader = DataLoader(dev_dataset,
                            batch_size=opt.eval_batch_size,
                            shuffle=False,
                            num_workers=0)

    dev_info = (dev_loader, dev_callback_info)

    model_path_list = get_model_path_list(opt.output_dir)

    metric_parts = []

    max_f1 = 0.
    max_f1_step = 0
    max_f1_path = ''

    for model_path in model_path_list:
        # The step is the suffix after the last '-' in the name of the
        # directory that holds the checkpoint file.
        tmp_step = model_path.split('/')[-2].split('-')[-1]

        model, device = load_model_and_parallel(model,
                                                opt.gpu_ids[0],
                                                ckpt_path=model_path)

        if opt.task_type == 'crf':
            tmp_metric_str, tmp_f1 = crf_evaluation(
                model, dev_info, device, ent2id)
        elif opt.task_type == 'mrc':
            tmp_metric_str, tmp_f1 = mrc_evaluation(
                model, dev_info, device)
        else:
            tmp_metric_str, tmp_f1 = span_evaluation(
                model, dev_info, device, ent2id)

        step_report = f'In step {tmp_step}:\n {tmp_metric_str}'
        logger.info(step_report)
        metric_parts.append(step_report + '\n\n')

        # Always adopt the first evaluated checkpoint so that a run whose
        # F1 never rises above 0 still has a "best" checkpoint; otherwise
        # the clean-up below would delete every checkpoint.
        if not max_f1_path or tmp_f1 > max_f1:
            max_f1 = tmp_f1
            max_f1_step = tmp_step
            max_f1_path = model_path

    max_metric_str = f'Max f1 is: {max_f1}, in step {max_f1_step}'

    logger.info(max_metric_str)

    metric_str = ''.join(metric_parts) + max_metric_str + '\n'

    eval_save_path = os.path.join(opt.output_dir, 'eval_metric.txt')

    with open(eval_save_path, 'a', encoding='utf-8') as f1:
        f1.write(metric_str)

    with open('./best_ckpt_path.txt', 'a', encoding='utf-8') as f2:
        f2.write(max_f1_path + '\n')

    def _ckpt_dir(path):
        # Directory (under the output dir) that contains a checkpoint file.
        return os.path.join(opt.output_dir, path.split('/')[-2])

    # Never remove the directory holding the best checkpoint, and list each
    # directory once so rmtree is not called on an already-deleted path.
    keep_dir = _ckpt_dir(max_f1_path) if max_f1_path else None
    del_dir_list = []
    for path in model_path_list:
        ckpt_dir = _ckpt_dir(path)
        if ckpt_dir != keep_dir and ckpt_dir not in del_dir_list:
            del_dir_list.append(ckpt_dir)

    for x in del_dir_list:
        shutil.rmtree(x)
        logger.info('{}已删除'.format(x))
示例#4
0
def train_base(opt, train_examples, dev_examples=None):
    """Train an NER model and, given dev data, keep only the best checkpoint.

    The ``ent2id`` mapping is read from
    ``<opt.mid_data_dir>/<opt.task_type>_ent2id.json`` and printed to stdout.
    The training examples are converted to features, wrapped in an
    ``NERDataset`` and passed to ``train`` with a model built for
    ``opt.task_type`` (``"crf"``, ``"mrc"``; any other value builds the
    ``"span"`` model).

    When ``dev_examples`` is provided, every checkpoint returned by
    ``get_model_path_list(opt.output_dir)`` is loaded (with ``strict=False``)
    and evaluated on the dev set. The per-step metrics are appended to
    ``<opt.output_dir>/eval_metric.txt``, the best checkpoint path is appended
    to ``./best_ckpt_path.txt`` and the directories of all other checkpoints
    are removed.

    Args:
        opt: Run options; this function reads ``mid_data_dir``, ``task_type``,
            ``max_seq_len``, ``bert_dir``, ``use_type_embed``,
            ``dropout_prob``, ``loss_type``, ``eval_batch_size``,
            ``output_dir`` and ``gpu_ids``.
        train_examples: Examples converted into training features.
        dev_examples: Optional dev examples; when ``None`` no evaluation or
            checkpoint clean-up takes place.
    """
    import shutil

    ent2id_path = os.path.join(opt.mid_data_dir,
                               f"{opt.task_type}_ent2id.json")
    with open(ent2id_path, encoding="utf-8") as f:
        ent2id = json.load(f)

    # Only the features (first element of the result) are needed for training.
    train_features = convert_examples_to_features(opt.task_type,
                                                  train_examples,
                                                  opt.max_seq_len,
                                                  opt.bert_dir, ent2id)[0]

    train_dataset = NERDataset(opt.task_type,
                               train_features,
                               "train",
                               use_type_embed=opt.use_type_embed)

    # Diagnostic output of the label mapping used to size the model.
    print(f"len(ent2id): {len(ent2id)}")
    print(f"ent2id: {ent2id}")

    if opt.task_type == "crf":
        model = build_model("crf",
                            opt.bert_dir,
                            num_tags=len(ent2id),
                            dropout_prob=opt.dropout_prob)
    elif opt.task_type == "mrc":
        model = build_model(
            "mrc",
            opt.bert_dir,
            dropout_prob=opt.dropout_prob,
            use_type_embed=opt.use_type_embed,
            loss_type=opt.loss_type,
        )
    else:
        model = build_model(
            "span",
            opt.bert_dir,
            num_tags=len(ent2id) + 1,
            dropout_prob=opt.dropout_prob,
            loss_type=opt.loss_type,
        )

    train(opt, model, train_dataset)

    if dev_examples is None:
        return

    dev_features, dev_callback_info = convert_examples_to_features(
        opt.task_type, dev_examples, opt.max_seq_len, opt.bert_dir, ent2id)

    dev_dataset = NERDataset(opt.task_type,
                             dev_features,
                             "dev",
                             use_type_embed=opt.use_type_embed)

    dev_loader = DataLoader(dev_dataset,
                            batch_size=opt.eval_batch_size,
                            shuffle=False,
                            num_workers=0)

    dev_info = (dev_loader, dev_callback_info)

    model_path_list = get_model_path_list(opt.output_dir)

    metric_parts = []

    max_f1 = 0.0
    max_f1_step = 0
    max_f1_path = ""

    for model_path in model_path_list:
        # The step is the suffix after the last "-" in the name of the
        # directory that holds the checkpoint file.
        tmp_step = model_path.split("/")[-2].split("-")[-1]

        model, device = load_model_and_parallel(model,
                                                opt.gpu_ids[0],
                                                ckpt_path=model_path,
                                                strict=False)

        if opt.task_type == "crf":
            tmp_metric_str, tmp_f1 = crf_evaluation(
                model, dev_info, device, ent2id)
        elif opt.task_type == "mrc":
            tmp_metric_str, tmp_f1 = mrc_evaluation(
                model, dev_info, device)
        else:
            tmp_metric_str, tmp_f1 = span_evaluation(
                model, dev_info, device, ent2id)

        step_report = f"In step {tmp_step}:\n {tmp_metric_str}"
        logger.info(step_report)
        metric_parts.append(step_report + "\n\n")

        # Always adopt the first evaluated checkpoint so that a run whose
        # F1 never rises above 0 still has a "best" checkpoint; otherwise
        # the clean-up below would delete every checkpoint.
        if not max_f1_path or tmp_f1 > max_f1:
            max_f1 = tmp_f1
            max_f1_step = tmp_step
            max_f1_path = model_path

    max_metric_str = f"Max f1 is: {max_f1}, in step {max_f1_step}"

    logger.info(max_metric_str)

    metric_str = "".join(metric_parts) + max_metric_str + "\n"

    eval_save_path = os.path.join(opt.output_dir, "eval_metric.txt")

    with open(eval_save_path, "a", encoding="utf-8") as f1:
        f1.write(metric_str)

    with open("./best_ckpt_path.txt", "a", encoding="utf-8") as f2:
        f2.write(max_f1_path + "\n")

    def _ckpt_dir(path):
        # Directory (under the output dir) that contains a checkpoint file.
        return os.path.join(opt.output_dir, path.split("/")[-2])

    # Never remove the directory holding the best checkpoint, and list each
    # directory once so rmtree is not called on an already-deleted path.
    keep_dir = _ckpt_dir(max_f1_path) if max_f1_path else None
    del_dir_list = []
    for path in model_path_list:
        ckpt_dir = _ckpt_dir(path)
        if ckpt_dir != keep_dir and ckpt_dir not in del_dir_list:
            del_dir_list.append(ckpt_dir)

    for x in del_dir_list:
        shutil.rmtree(x)
        logger.info("{}已删除".format(x))