示例#1
0

def split_sent(paragraph):
    """Split a paragraph into fragments at sentence/clause punctuation.

    The text is cut at every Chinese full stop and at every comma, full
    stop, exclamation mark and question mark (ASCII or full-width). The
    delimiters themselves and empty fragments are discarded.

    Args:
        paragraph: The text to split. An empty or ``None`` value yields
            an empty list.

    Returns:
        A list of the non-empty fragments, in their original order.
    """
    if not paragraph:
        return []
    # A character class replaces the former alternation
    # '(。|,|,|!|\!|\.|?|\?)': its bare '?' alternative makes ``re`` raise
    # "nothing to repeat", so that pattern could never be compiled.  The
    # duplicated alternatives suggest the full-width forms were meant to
    # sit next to the ASCII ones, so both are listed here.
    delimiters = r'[。,,!!.??]'
    # Without a capturing group re.split() drops the delimiters, so only
    # the empty strings between adjacent delimiters need filtering out.
    return [piece for piece in re.split(delimiters, paragraph) if piece]


if __name__ == "__main__":

    # Set up parameters: read "config_roberta_large" and fetch the PREDICT
    # configuration from it.
    param = UParam()
    param.read_config_file("config_roberta_large")
    args = param.get_config(param.PREDICT)
    # Initialize logging
    logger = ULog(param)

    app_name = args["app_name"]

    # Constant definitions: names derived from the application name ...
    file_name = f"File_Directory/results/{app_name}.json"
    new_data_name = f"{app_name}_re_predict_data"
    new_result_name = f"{app_name}_re_predict_out"
    final_result_name = f"{app_name}_final_out"
    # ... and the re-prediction settings taken from the configuration.
    threshold = args["re_predict_threshold"]
    mix_rate = args["re_predict_mix_rate"]
    decay_rate = args["re_predict_decay_rate"]
示例#2
0
import numpy as np
import time
import sys
from engine.train_for_multitask import TrainEngineForMergeModel as TrainEngine
from engine.predict import PredictEngine as PredictEngine
from data.Dataset import Dataset
from preprocess.preprocess_for_mt import ProcessorForMergeModel as PreProcess

from util.util_parameter import UtilParameter as UParam
from util.util_logging import UtilLogging as ULog
import util.util_tool as util_tool

if __name__ == "__main__":
    # Script entry point: read the "config_test" configuration, hand the
    # command-line arguments to the parameter object, and create the logger.
    # Set up parameters
    param = UParam()
    param.read_config_file("config_test")
    # Everything after the script name is passed to set_config
    # (presumably command-line overrides of config values -- TODO confirm).
    param.set_config(sys.argv[1:])
    args = param.get_config(param.GLOBAL)
    # Initialize logging
    logger = ULog(param)

    app_name = args["app_name"]

    # NOTE: disabled corpus-cleaner debugging snippet, kept unchanged.
    # corpus_cleaner = Corpus_cleaner()
    # # corpus_cleaner.read_from_json("pretrain_corpus.json")
    # corpus_cleaner.read_from_src()
    # docs = corpus_cleaner.get_docs()
    # for i in range(10):
    #     print(docs[i])
    #     print("###########################################################")
示例#3
0
import sys
from dataset.dataset_for_mrc_squad import DatasetForMrcSquad as Dataset
from preprocess.preprocess_for_mrc import PreprocessForMRCChinese as Preprocess
from model.network.mrc_net import MRCNet as Net
from engine.mrc_train_engine import MRCTrainEngine as TrainEngine
from engine.mrc_predict_engine import MRCPredictEngine as PredictEngine
from util.util_parameter import UtilParameter as UParam
from util.util_logging import UtilLogging as ULog

if __name__ == "__main__":
    # Script entry point: read the "config_ernie" configuration, create the
    # logger, then load the training data and run its preprocessing.
    # Set up parameters
    param = UParam()
    param.read_config_file("config_ernie")
    # Everything after the script name is passed to set_config
    # (presumably command-line overrides of config values -- TODO confirm).
    param.set_config(sys.argv[1:])
    args = param.get_config(param.GLOBAL)
    # Initialize logging
    logger = ULog(args, params=param)
    app_name = args["app_name"]
    dataset_args = param.get_config(param.DATASET)

    # Preprocess the training data
    train_dataset = Dataset(dataset_args)
    # Load examples from the training source file; `cache` receives the
    # configured example file name (presumably a cache location for the
    # parsed examples -- TODO confirm against Dataset.read_from_srcfile).
    train_dataset.read_from_srcfile(
        dataset_args['train_file_path'],
        cache=dataset_args['train_example_file_name'],
        is_training=True)
    # Build the preprocessor from the loaded examples; `cache` receives the
    # configured feature file name.
    train_preprocess = Preprocess(
        args=dataset_args,
        examples=train_dataset.get_examples(),
        cache=dataset_args['train_feature_file_name'])
    # The result of do_preprocess() is kept as the training data generator.
    train_data_generator = train_preprocess.do_preprocess()
import numpy as np
import time
import sys
from engine.train_for_multitask import PredictEngineForMergeModel
from engine.predict import PredictEngine as PredictEngine
from data.Dataset import Dataset
from preprocess.preprocess_for_mt import ProcessorForMergeModel as PreProcess

from util.util_parameter import UtilParameter as UParam
from util.util_logging import UtilLogging as ULog
import util.util_tool as util_tool

if __name__ == "__main__":
    # Script entry point: read the "config_merge_test" configuration, hand
    # the command-line arguments to the parameter object, and create the
    # logger.
    # Set up parameters
    param = UParam()
    param.read_config_file("config_merge_test")
    # Everything after the script name is passed to set_config
    # (presumably command-line overrides of config values -- TODO confirm).
    param.set_config(sys.argv[1:])
    args = param.get_config(param.GLOBAL)
    # Initialize logging
    logger = ULog(param)

    app_name = args["app_name"]

    # NOTE: disabled corpus-cleaner debugging snippet, kept unchanged.
    # corpus_cleaner = Corpus_cleaner()
    # # corpus_cleaner.read_from_json("pretrain_corpus.json")
    # corpus_cleaner.read_from_src()
    # docs = corpus_cleaner.get_docs()
    # for i in range(10):
    #     print(docs[i])
    #     print("###########################################################")
示例#5
0
from engine.predict import PredictEngine as PredictEngine
from engine.pretrain_engine import PreTrainEngine
from data.Dataset import Dataset
from data.Corpus_cleaner import Corpus_cleaner
from preprocess.preprocess import PreProcess
from preprocess.preprocess_for_pretrain import ProcessorForPretraining
from preprocess.preprocess_for_qa import ProcessorForPretrainingQa

from util.util_parameter import UtilParameter as UParam
from util.util_logging import UtilLogging as ULog
import util.util_tool as util_tool

if __name__ == "__main__":
    # 设置参数
    param = UParam()
    param.read_config_file("config_pretrain")
    param.set_config(sys.argv[1:])
    args = param.get_config(param.GLOBAL)
    # 初始化日志
    logger = ULog(param)

    app_name = args["app_name"]
    """
    # 读取数据集
    datasets = Dataset(logger=logger, args=param.get_config(param.DATASET))
    # datasets.read_dataset(div_nums=[7, 2, 1])
    datasets.load_examples()
    trainset, validset, testset = datasets.get_split()  # 这三个函数要修改,split应该检查是否已分割
    # datasets.save_example()
    for example in validset:
        print(1)