# Project components, imported under short aliases.
from dataset.dataset_for_mrc_squad import DatasetForMrcSquad as Dataset
from preprocess.preprocess_for_mrc import PreprocessForMRCChinese as Preprocess
from model.network.mrc_net import MRCNet as Net
from engine.mrc_train_engine import MRCTrainEngine as TrainEngine
from engine.mrc_predict_engine import MRCPredictEngine as PredictEngine
from util.util_parameter import UtilParameter as UParam
from util.util_logging import UtilLogging as ULog

# Script entry point: load the configuration, create the logger, then read
# and preprocess the training data.
if __name__ == "__main__":
    # Set up parameters: read the "config_ernie" configuration, then hand the
    # command-line arguments (script name excluded) to set_config.
    # presumably set_config lets the command line override file values -- TODO confirm
    # NOTE(review): `sys` is not imported in the visible part of this file --
    # confirm that `import sys` appears above these imports.
    param = UParam()
    param.read_config_file("config_ernie")
    param.set_config(sys.argv[1:])
    args = param.get_config(param.GLOBAL)
    # Initialize logging from the GLOBAL configuration section.
    logger = ULog(args, params=param)
    app_name = args["app_name"]
    dataset_args = param.get_config(param.DATASET)
    # Preprocess the training data: read the examples from the source file,
    # then run the preprocessing step over them.
    # NOTE(review): the `cache` arguments look like names of cache files for
    # the parsed examples / computed features -- confirm against Dataset and
    # Preprocess.
    train_dataset = Dataset(dataset_args)
    train_dataset.read_from_srcfile(
        dataset_args['train_file_path'],
        cache=dataset_args['train_example_file_name'],
        is_training=True)
    train_preprocess = Preprocess(
        args=dataset_args,
        examples=train_dataset.get_examples(),
        cache=dataset_args['train_feature_file_name'])
    # Result of do_preprocess(); judging by the name, a generator over the
    # training data -- verify against Preprocess.do_preprocess.
    train_data_generator = train_preprocess.do_preprocess()
sents = re.split('(。|,|,|!|\!|\.|?|\?)', paragraph) res = [] for sent in sents: if len(sent) != 0 and not re.match('(。|,|,|!|\!|\.|?|\?)', sent): res.append(sent) return res if __name__ == "__main__": # 设置参数 param = UParam() param.read_config_file("config_roberta_large") args = param.get_config(param.PREDICT) # 初始化日志 logger = ULog(param) app_name = args["app_name"] ''' 常数定义 ''' file_name = "File_Directory/results/{}.json".format(app_name) new_data_name = "{}_re_predict_data".format(app_name) new_result_name = "{}_re_predict_out".format(app_name) final_result_name = "{}_final_out".format(app_name) threshold = args["re_predict_threshold"] mix_rate = args['re_predict_mix_rate'] decay_rate = args['re_predict_decay_rate'] select_threshold = args['re_predict_select_threshold'] ''' 预测过程
def __init__(self, args):
    """Keep the configuration, start with no examples, and create a logger.

    :param args: configuration object; stored on the instance as ``self.args``
        and also passed to the logger constructor.
    """
    # Configuration first, then the (initially empty) example container.
    self.args = args
    self.examples = []
    # The logger receives the same configuration plus this module's name.
    self.logger = ULog(args, __name__)