def train(config=None):
    """Main entry point for model training.

    Pipeline, in order:

    1. Create a ``FeatureExtractor`` (or load one from ``config.init_model``),
       build it from the training text and save it.
    2. Convert the training and test text files to feature files, then to
       index files.
    3. Load both data sets, resize the training set by
       ``config.trainSizeScale`` and run ``config.ttlIter`` epochs, calling
       ``trainer.test`` on the test set after every epoch.
    4. Write the per-epoch statistics, save the model when
       ``config.save == 1`` and produce the final summary.

    The original notes describe the public wrapper as
    ``pkuseg.train(trainFile, testFile, savedir, train_iter=20,
    init_model=None)`` with:

    * ``trainFile``  - path of the training file (multi-line text).
    * ``testFile``   - path of the test file.
    * ``savedir``    - directory in which the trained model is saved.
    * ``train_iter`` - number of training iterations.
    * ``init_model`` - model used for initialisation; ``None`` means default
      initialisation, otherwise a path such as ``init_model='./models/'``.

    Args:
        config: Settings object read throughout the run. A fresh ``Config()``
            is created when ``None``. The open log/result/tune file objects
            are stored on it as ``swLog``, ``swResRaw`` and ``swTune`` for
            the duration of training.

    Returns:
        None. Results are written to files under ``config.outDir`` and
        progress is printed to stdout.
    """
    if config is None:
        config = Config()

    if config.init_model is None:
        feature_extractor = FeatureExtractor()
    else:
        feature_extractor = FeatureExtractor.load(config.init_model)

    # Per the original notes, `build()`:
    #   1. reads the training text line by line,
    #   2. strips newlines and separators,
    #   3. normalises digits and English letters,
    #   4. extracts features per character in 15 different ways,
    #   5. defines 5 kinds of tags,
    #   6. maps both features and tags to ids.
    # `save()` writes a binary file such as "xxx/models/ctb8/features.pkl"
    # holding a dict of the form
    #   {'unigram': .., 'bigram': .., 'feature_to_idx': .., 'tag_to_idx': ..}
    feature_extractor.build(config.trainFile)
    feature_extractor.save()

    # Convert the text files to feature files, e.g.
    # ("xxx/data/small_training.utf8", "xxx/train.conll.txt", "xxx/train.feat.txt")
    feature_extractor.convert_text_file_to_feature_file(
        config.trainFile, config.c_train, config.f_train
    )
    # ("xxx/data/small_test.utf8", "xxx/test.conll.txt", "xxx/test.feat.txt")
    feature_extractor.convert_text_file_to_feature_file(
        config.testFile, config.c_test, config.f_test
    )

    # Convert the features in the feature files to ids, e.g.
    # ("xxx/train.feat.txt", "xxx/ftrain.txt", "xxx/gtrain.txt")
    feature_extractor.convert_feature_file_to_idx_file(
        config.f_train, config.fFeatureTrain, config.fGoldTrain
    )
    # ("xxx/test.feat.txt", "xxx/ftest.txt", "xxx/gtest.txt")
    feature_extractor.convert_feature_file_to_idx_file(
        config.f_test, config.fFeatureTest, config.fGoldTest
    )

    # Sets the evaluation metric and some training parameters
    # (per the original notes).
    config.globalCheck()

    # Example values from the original notes:
    #   config.outDir  : 'xxx/output/'
    #   config.fTune   : 'xxx/output/tune.txt'
    #   config.fLog    : 'xxx/output/trainLog.txt'
    #   config.fResRaw : 'xxx/output/rawResult.txt'
    log_path = os.path.join(config.outDir, config.fLog)
    raw_path = os.path.join(config.outDir, config.fResRaw)
    tune_path = os.path.join(config.outDir, config.fTune)

    # The `with` statement guarantees the three files are closed even when
    # loading, training or evaluation raises; previously they were closed
    # only on the success path.
    with open(log_path, "w") as log_file, \
            open(raw_path, "w") as raw_file, \
            open(tune_path, "w") as tune_file:
        config.swLog = log_file
        config.swResRaw = raw_file
        config.swTune = tune_file

        print("\nstart training...")
        config.swLog.write("\nstart training...\n")

        print("\nreading training & test data...")
        config.swLog.write("\nreading training & test data...\n")

        # e.g. ('ftrain.txt', 'gtrain.txt') and ('ftest.txt', 'gtest.txt')
        trainset = DataSet.load(config.fFeatureTrain, config.fGoldTrain)
        testset = DataSet.load(config.fFeatureTest, config.fGoldTest)

        # Optionally enlarge or shrink the training set: per the original
        # notes, enlarging repeats samples and shrinking keeps only a part.
        trainset = trainset.resize(config.trainSizeScale)

        print("done! train/test data sizes: {}/{}".format(
            len(trainset), len(testset)))
        config.swLog.write("done! train/test data sizes: {}/{}\n".format(
            len(trainset), len(testset)))

        config.swLog.write("\nr: {}\n".format(config.reg))
        print("\nr: {}".format(config.reg))
        if config.rawResWrite:
            # "%r" is literal text here: str.format() only substitutes "{}".
            # NOTE(review): presumably a marker expected by the raw-result
            # reader - confirm before changing.
            config.swResRaw.write("\n%r: {}\n".format(config.reg))

        # Initialise the trainer with the training set.
        trainer = Trainer(config, trainset, feature_extractor)

        time_list = []  # wall-clock seconds spent in each train_epoch() call
        err_list = []
        diff_list = []
        score_list_list = []

        for i in range(config.ttlIter):
            time_s = time.time()
            err, _sample_size, diff = trainer.train_epoch()
            time_t = time.time() - time_s

            time_list.append(time_t)
            err_list.append(err)
            diff_list.append(diff)

            score_list = trainer.test(testset, i)
            score_list_list.append(score_list)
            score = score_list[0]

            logstr = "iter{} diff={:.2e} train-time(sec)={:.2f} {}={:.2f}%".format(
                i, diff, time_t, config.metric, score)
            config.swLog.write(logstr + "\n")
            config.swLog.write(
                "------------------------------------------------\n")
            # Flush every epoch so progress is visible in the log file
            # while training is still running.
            config.swLog.flush()
            print(logstr)

        res_summarize.write(
            config, time_list, err_list, diff_list, score_list_list)
        if config.save == 1:
            trainer.model.save()

    # The output files are closed at this point, as in the original ordering.
    res_summarize.summarize(config)

    print("finished.")
def train(config=None):
    """Run the full training pipeline described by ``config``.

    Steps, in order: build (or load) and save the feature extractor, convert
    the training/test text files into feature files and then index files,
    load the data sets, train for ``config.ttlIter`` epochs with a call to
    ``trainer.test`` after each one, write the collected statistics, save the
    model when ``config.save == 1`` and finally summarise the results.

    Args:
        config: Settings object read throughout the run. A fresh ``Config()``
            is created when ``None``. While training runs, the open
            log/result/tune file objects are stored on it as ``swLog``,
            ``swResRaw`` and ``swTune``.

    Returns:
        None. Output goes to files under ``config.outDir`` and to stdout.
    """
    if config is None:
        config = Config()

    # Start from scratch unless an initial model location was supplied.
    if config.init_model is None:
        feature_extractor = FeatureExtractor()
    else:
        feature_extractor = FeatureExtractor.load(config.init_model)

    feature_extractor.build(config.trainFile)
    feature_extractor.save()

    # Text files -> feature files.
    feature_extractor.convert_text_file_to_feature_file(
        config.trainFile, config.c_train, config.f_train)
    feature_extractor.convert_text_file_to_feature_file(
        config.testFile, config.c_test, config.f_test)

    # Feature files -> index files.
    feature_extractor.convert_feature_file_to_idx_file(
        config.f_train, config.fFeatureTrain, config.fGoldTrain)
    feature_extractor.convert_feature_file_to_idx_file(
        config.f_test, config.fFeatureTest, config.fGoldTest)

    config.globalCheck()

    log_path = os.path.join(config.outDir, config.fLog)
    raw_path = os.path.join(config.outDir, config.fResRaw)
    tune_path = os.path.join(config.outDir, config.fTune)

    # Context managers close all three files on every exit path; the
    # explicit close() calls they replace were skipped whenever training
    # raised, leaking the handles and any unflushed log output.
    with open(log_path, "w") as log_file, \
            open(raw_path, "w") as raw_file, \
            open(tune_path, "w") as tune_file:
        config.swLog = log_file
        config.swResRaw = raw_file
        config.swTune = tune_file

        print("\nstart training...")
        config.swLog.write("\nstart training...\n")

        print("\nreading training & test data...")
        config.swLog.write("\nreading training & test data...\n")

        trainset = DataSet.load(config.fFeatureTrain, config.fGoldTrain)
        testset = DataSet.load(config.fFeatureTest, config.fGoldTest)
        trainset = trainset.resize(config.trainSizeScale)

        print("done! train/test data sizes: {}/{}".format(
            len(trainset), len(testset)))
        config.swLog.write("done! train/test data sizes: {}/{}\n".format(
            len(trainset), len(testset)))

        config.swLog.write("\nr: {}\n".format(config.reg))
        print("\nr: {}".format(config.reg))
        if config.rawResWrite:
            # "%r" is emitted verbatim: str.format() only fills the "{}".
            # NOTE(review): presumably a marker the raw-result consumer
            # looks for - confirm before changing.
            config.swResRaw.write("\n%r: {}\n".format(config.reg))

        trainer = Trainer(config, trainset, feature_extractor)

        # Per-epoch statistics handed to res_summarize.write() afterwards.
        time_list = []
        err_list = []
        diff_list = []
        score_list_list = []

        for i in range(config.ttlIter):
            time_s = time.time()
            err, _sample_size, diff = trainer.train_epoch()
            time_t = time.time() - time_s

            time_list.append(time_t)
            err_list.append(err)
            diff_list.append(diff)

            score_list = trainer.test(testset, i)
            score_list_list.append(score_list)
            # Only the first score is reported in the per-epoch log line.
            score = score_list[0]

            logstr = "iter{} diff={:.2e} train-time(sec)={:.2f} {}={:.2f}%".format(
                i, diff, time_t, config.metric, score)
            config.swLog.write(logstr + "\n")
            config.swLog.write(
                "------------------------------------------------\n")
            # Flush each epoch so the log can be followed during training.
            config.swLog.flush()
            print(logstr)

        res_summarize.write(
            config, time_list, err_list, diff_list, score_list_list)
        if config.save == 1:
            trainer.model.save()

    # Files are closed here, matching the original close-then-summarize order.
    res_summarize.summarize(config)

    print("finished.")