示例#1
0
def train(config=None):
    """Run the whole training pipeline: features, training, evaluation, summary.

    According to the original notes this is the entry point behind
    ``pkuseg.train(trainFile, testFile, savedir, train_iter=20, init_model=None)``
    where ``trainFile``/``testFile`` are text-file paths, ``savedir`` is where
    the model is stored, ``train_iter`` is the number of epochs and
    ``init_model`` is an optional path to a model to start from.

    Steps performed here:
        1. Create (or load from ``config.init_model``) a feature extractor,
           build it from the training file and save it.
        2. Convert the train/test text files to feature files, then to
           index files.
        3. Open the log / raw-result / tune output files under
           ``config.outDir``.
        4. Load the datasets, train for ``config.ttlIter`` epochs, testing
           after every epoch and logging the first score.
        5. Write the collected statistics, optionally save the model, close
           the output files and summarize the results.

    Args:
        config: Settings object. When ``None`` a default ``Config()`` is
            created.

    Note:
        The output files are closed in a ``finally`` block, so they are
        released (and their buffers flushed) even if training raises.
    """
    if config is None:
        config = Config()

    if config.init_model is None:
        feature_extractor = FeatureExtractor()
    else:
        feature_extractor = FeatureExtractor.load(config.init_model)

    # Original notes on `build()` (not verifiable from this function alone):
    #   1. read the training text line by line,
    #   2. strip newlines and separators,
    #   3. normalise digits and latin letters,
    #   4. extract features per character in 15 different ways,
    #   5. define 5 tags,
    #   6. map features and tags to integer ids.
    # Original notes on `save()`: writes a binary "features.pkl" holding
    #   {'unigram': .., 'bigram': .., 'feature_to_idx': .., 'tag_to_idx': ..}
    feature_extractor.build(config.trainFile)
    feature_extractor.save()

    # Text files -> feature files
    # (e.g. "small_training.utf8" -> "train.conll.txt", "train.feat.txt").
    feature_extractor.convert_text_file_to_feature_file(
        config.trainFile, config.c_train, config.f_train
    )
    feature_extractor.convert_text_file_to_feature_file(
        config.testFile, config.c_test, config.f_test
    )

    # Feature files -> id files
    # (e.g. "train.feat.txt" -> "ftrain.txt", "gtrain.txt").
    feature_extractor.convert_feature_file_to_idx_file(
        config.f_train, config.fFeatureTrain, config.fGoldTrain
    )
    feature_extractor.convert_feature_file_to_idx_file(
        config.f_test, config.fFeatureTest, config.fGoldTest
    )

    # Per the original comment: sets the evaluation metric and some
    # training parameters.
    config.globalCheck()

    # (attribute receiving the handle, attribute holding the file name)
    output_files = (
        ("swLog", "fLog"),
        ("swResRaw", "fResRaw"),
        ("swTune", "fTune"),
    )
    opened_handles = []
    try:
        # Open inside the try so a failure on a later file still closes the
        # ones that were already opened.
        for handle_attr, name_attr in output_files:
            handle = open(
                os.path.join(config.outDir, getattr(config, name_attr)), "w"
            )
            setattr(config, handle_attr, handle)
            opened_handles.append(handle)

        print("\nstart training...")
        config.swLog.write("\nstart training...\n")

        print("\nreading training & test data...")
        config.swLog.write("\nreading training & test data...\n")

        trainset = DataSet.load(config.fFeatureTrain, config.fGoldTrain)
        testset = DataSet.load(config.fFeatureTest, config.fGoldTest)

        # Per the original comment: optionally enlarges (by repeating data)
        # or shrinks (by taking a subset of) the training set.
        trainset = trainset.resize(config.trainSizeScale)

        print("done! train/test data sizes: {}/{}".format(len(trainset),
                                                          len(testset)))
        config.swLog.write("done! train/test data sizes: {}/{}\n".format(
            len(trainset), len(testset)))

        config.swLog.write("\nr: {}\n".format(config.reg))
        print("\nr: {}".format(config.reg))
        if config.rawResWrite:
            # NOTE(review): "%r" is emitted literally (str.format does not
            # interpret "%"); kept as-is in case it is a marker in the file.
            config.swResRaw.write("\n%r: {}\n".format(config.reg))

        trainer = Trainer(config, trainset, feature_extractor)

        time_list = []  # wall-clock seconds spent in each train_epoch() call
        err_list = []
        diff_list = []
        score_list_list = []

        for i in range(config.ttlIter):
            time_s = time.time()
            # The middle return value (sample size) is not used here.
            err, _, diff = trainer.train_epoch()
            time_t = time.time() - time_s

            time_list.append(time_t)
            err_list.append(err)
            diff_list.append(diff)

            score_list = trainer.test(testset, i)
            score_list_list.append(score_list)
            score = score_list[0]

            logstr = "iter{}  diff={:.2e}  train-time(sec)={:.2f}  {}={:.2f}%".format(
                i, diff, time_t, config.metric, score)
            config.swLog.write(logstr + "\n")
            config.swLog.write(
                "------------------------------------------------\n")
            config.swLog.flush()
            print(logstr)

        res_summarize.write(config, time_list, err_list, diff_list,
                            score_list_list)
        if config.save == 1:
            trainer.model.save()
    finally:
        for handle in opened_handles:
            handle.close()

    # Runs only after the output files have been closed, as in the original.
    res_summarize.summarize(config)

    print("finished.")
示例#2
0
def train(config=None):
    """Build features, train the model, evaluate each epoch and summarize.

    Args:
        config: Settings object providing the file paths and training
            options read below. A default ``Config()`` is used when ``None``.

    The log, raw-result and tune files are opened under ``config.outDir``
    and exposed as ``config.swLog``, ``config.swResRaw`` and
    ``config.swTune`` while training runs. They are managed by a ``with``
    statement so they are closed even when a step raises.
    """
    if config is None:
        config = Config()

    # Start from scratch unless an initial model location was given.
    if config.init_model is None:
        feature_extractor = FeatureExtractor()
    else:
        feature_extractor = FeatureExtractor.load(config.init_model)
    feature_extractor.build(config.trainFile)
    feature_extractor.save()

    # Raw text -> feature files.
    feature_extractor.convert_text_file_to_feature_file(
        config.trainFile, config.c_train, config.f_train)
    feature_extractor.convert_text_file_to_feature_file(
        config.testFile, config.c_test, config.f_test)

    # Feature files -> index files.
    feature_extractor.convert_feature_file_to_idx_file(
        config.f_train, config.fFeatureTrain, config.fGoldTrain)
    feature_extractor.convert_feature_file_to_idx_file(
        config.f_test, config.fFeatureTest, config.fGoldTest)

    config.globalCheck()

    log_path = os.path.join(config.outDir, config.fLog)
    raw_path = os.path.join(config.outDir, config.fResRaw)
    tune_path = os.path.join(config.outDir, config.fTune)

    with open(log_path, "w") as log_file, \
            open(raw_path, "w") as raw_file, \
            open(tune_path, "w") as tune_file:
        # Other code reaches the open files through the config object.
        config.swLog = log_file
        config.swResRaw = raw_file
        config.swTune = tune_file

        print("\nstart training...")
        log_file.write("\nstart training...\n")

        print("\nreading training & test data...")
        log_file.write("\nreading training & test data...\n")

        trainset = DataSet.load(config.fFeatureTrain, config.fGoldTrain)
        testset = DataSet.load(config.fFeatureTest, config.fGoldTest)

        trainset = trainset.resize(config.trainSizeScale)

        sizes = (len(trainset), len(testset))
        print("done! train/test data sizes: {}/{}".format(*sizes))
        log_file.write("done! train/test data sizes: {}/{}\n".format(*sizes))

        log_file.write("\nr: {}\n".format(config.reg))
        print("\nr: {}".format(config.reg))
        if config.rawResWrite:
            # NOTE(review): "%r" is written literally, since str.format does
            # not interpret "%"; left unchanged on purpose.
            raw_file.write("\n%r: {}\n".format(config.reg))

        trainer = Trainer(config, trainset, feature_extractor)

        time_list = []
        err_list = []
        diff_list = []
        score_list_list = []

        for epoch in range(config.ttlIter):
            started = time.time()
            # The second return value (sample size) is not needed here.
            err, _, diff = trainer.train_epoch()
            elapsed = time.time() - started

            time_list.append(elapsed)
            err_list.append(err)
            diff_list.append(diff)

            score_list = trainer.test(testset, epoch)
            score_list_list.append(score_list)

            logstr = "iter{}  diff={:.2e}  train-time(sec)={:.2f}  {}={:.2f}%".format(
                epoch, diff, elapsed, config.metric, score_list[0])
            log_file.write(logstr + "\n")
            log_file.write(
                "------------------------------------------------\n")
            log_file.flush()
            print(logstr)

        res_summarize.write(config, time_list, err_list, diff_list,
                            score_list_list)
        if config.save == 1:
            trainer.model.save()

    # The files are closed at this point, matching the original ordering.
    res_summarize.summarize(config)

    print("finished.")