import re

from util.util_parameter import UtilParameter as UParam
from util.util_logging import UtilLogging as ULog


def split_sent(paragraph):
    """Split a paragraph into sentence/clause segments on Chinese and ASCII punctuation."""
    # The capturing group keeps the delimiters in re.split's output so they can be filtered below.
    delimiters = r'([。,,!!.??])'
    sents = re.split(delimiters, paragraph)
    res = []
    for sent in sents:
        # Keep only non-empty segments that are not themselves delimiter tokens.
        if len(sent) != 0 and not re.match(delimiters, sent):
            res.append(sent)
    return res


if __name__ == "__main__":
    # Set up parameters
    param = UParam()
    param.read_config_file("config_roberta_large")
    args = param.get_config(param.PREDICT)
    # Initialize logging
    logger = ULog(param)
    app_name = args["app_name"]

    # Constant definitions
    file_name = "File_Directory/results/{}.json".format(app_name)
    new_data_name = "{}_re_predict_data".format(app_name)
    new_result_name = "{}_re_predict_out".format(app_name)
    final_result_name = "{}_final_out".format(app_name)
    threshold = args["re_predict_threshold"]
    mix_rate = args["re_predict_mix_rate"]
    decay_rate = args["re_predict_decay_rate"]
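# Example usage of split_sent (a minimal sketch; the sample paragraph and expected
# output are illustrative assumptions, not project test data):
#   >>> split_sent("今天天气很好。我们去公园,好吗?好!")
#   ['今天天气很好', '我们去公园', '好吗', '好']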
import numpy as np
import time
import sys

from engine.train_for_multitask import TrainEngineForMergeModel as TrainEngine
from engine.predict import PredictEngine as PredictEngine
from data.Dataset import Dataset
from preprocess.preprocess_for_mt import ProcessorForMergeModel as PreProcess
from util.util_parameter import UtilParameter as UParam
from util.util_logging import UtilLogging as ULog
import util.util_tool as util_tool

if __name__ == "__main__":
    # Set up parameters
    param = UParam()
    param.read_config_file("config_test")
    param.set_config(sys.argv[1:])
    args = param.get_config(param.GLOBAL)
    # Initialize logging
    logger = ULog(param)
    app_name = args["app_name"]

    # corpus_cleaner = Corpus_cleaner()
    # # corpus_cleaner.read_from_json("pretrain_corpus.json")
    # corpus_cleaner.read_from_src()
    # docs = corpus_cleaner.get_docs()
    # for i in range(10):
    #     print(docs[i])
    #     print("###########################################################")
import sys

from dataset.dataset_for_mrc_squad import DatasetForMrcSquad as Dataset
from preprocess.preprocess_for_mrc import PreprocessForMRCChinese as Preprocess
from model.network.mrc_net import MRCNet as Net
from engine.mrc_train_engine import MRCTrainEngine as TrainEngine
from engine.mrc_predict_engine import MRCPredictEngine as PredictEngine
from util.util_parameter import UtilParameter as UParam
from util.util_logging import UtilLogging as ULog

if __name__ == "__main__":
    # Set up parameters
    param = UParam()
    param.read_config_file("config_ernie")
    param.set_config(sys.argv[1:])
    args = param.get_config(param.GLOBAL)
    # Initialize logging
    logger = ULog(args, params=param)
    app_name = args["app_name"]
    dataset_args = param.get_config(param.DATASET)

    # Preprocess the training data: read raw examples (cached to disk), convert them
    # to model features (also cached), and build a batch generator for training.
    train_dataset = Dataset(dataset_args)
    train_dataset.read_from_srcfile(
        dataset_args['train_file_path'],
        cache=dataset_args['train_example_file_name'],
        is_training=True)
    train_preprocess = Preprocess(
        args=dataset_args,
        examples=train_dataset.get_examples(),
        cache=dataset_args['train_feature_file_name'])
    train_data_generator = train_preprocess.do_preprocess()
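# A minimal sketch of the example -> feature -> batch pattern the script above follows.
# The helpers below are hypothetical stand-ins for illustration only; the real logic
# lives in the repo's Dataset and Preprocess classes, and a JSONL source format is assumed.
import json
import os


def load_examples(src_path, cache_path):
    # Reuse cached examples if present; otherwise parse the source file once and cache it.
    if os.path.exists(cache_path):
        with open(cache_path, encoding="utf-8") as f:
            return json.load(f)
    with open(src_path, encoding="utf-8") as f:
        examples = [json.loads(line) for line in f]
    with open(cache_path, "w", encoding="utf-8") as f:
        json.dump(examples, f, ensure_ascii=False)
    return examples


def batch_generator(features, batch_size):
    # Yield fixed-size batches of already-featurized examples.
    for i in range(0, len(features), batch_size):
        yield features[i:i + batch_size]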
import numpy as np
import time
import sys

from engine.train_for_multitask import PredictEngineForMergeModel
from engine.predict import PredictEngine as PredictEngine
from data.Dataset import Dataset
from preprocess.preprocess_for_mt import ProcessorForMergeModel as PreProcess
from util.util_parameter import UtilParameter as UParam
from util.util_logging import UtilLogging as ULog
import util.util_tool as util_tool

if __name__ == "__main__":
    # Set up parameters
    param = UParam()
    param.read_config_file("config_merge_test")
    param.set_config(sys.argv[1:])
    args = param.get_config(param.GLOBAL)
    # Initialize logging
    logger = ULog(param)
    app_name = args["app_name"]

    # corpus_cleaner = Corpus_cleaner()
    # # corpus_cleaner.read_from_json("pretrain_corpus.json")
    # corpus_cleaner.read_from_src()
    # docs = corpus_cleaner.get_docs()
    # for i in range(10):
    #     print(docs[i])
    #     print("###########################################################")
import sys

from engine.predict import PredictEngine as PredictEngine
from engine.pretrain_engine import PreTrainEngine
from data.Dataset import Dataset
from data.Corpus_cleaner import Corpus_cleaner
from preprocess.preprocess import PreProcess
from preprocess.preprocess_for_pretrain import ProcessorForPretraining
from preprocess.preprocess_for_qa import ProcessorForPretrainingQa
from util.util_parameter import UtilParameter as UParam
from util.util_logging import UtilLogging as ULog
import util.util_tool as util_tool

if __name__ == "__main__":
    # Set up parameters
    param = UParam()
    param.read_config_file("config_pretrain")
    param.set_config(sys.argv[1:])
    args = param.get_config(param.GLOBAL)
    # Initialize logging
    logger = ULog(param)
    app_name = args["app_name"]

    """
    # Read the dataset
    datasets = Dataset(logger=logger, args=param.get_config(param.DATASET))
    # datasets.read_dataset(div_nums=[7, 2, 1])
    datasets.load_examples()
    trainset, validset, testset = datasets.get_split()
    # These three functions need revision; split should check whether the data has already been divided
    # datasets.save_example()
    for example in validset:
        print(1)