def training(opt):
    """Single-run training: fold the dev split into the train split and launch train_base.

    NOTE(review): pseudo-labelled data is deliberately excluded here
    (see the disabled line below) — confirm before re-enabling.
    """
    processor = NERProcessor(opt.raw_data_dir)
    train_feature, dev_feature, test_feature, pseudo_feature = processor.get_data_examples()
    # train_feature = train_feature + pseudo_feature
    combined_train = train_feature + dev_feature
    train_base(opt, combined_train, dev_feature, test_feature)
def training(opt):
    """Single-fold training and prediction, with start/end timestamps printed.

    Trains on train + pseudo + dev features and predicts on the
    second-round (fu) test features.
    """
    processor = NERProcessor(opt.raw_data_dir)
    print("开始单折训练和预测:{}".format(time.strftime("%Y-%m-%d-%H:%M:%S", time.localtime())))
    (train_feature, dev_feature, fu_test_feature,
     pseudo_feature, chu_test_feature) = processor.get_data_examples()
    # Extend the training set with pseudo-labelled and dev samples.
    all_train = train_feature + pseudo_feature + dev_feature
    train_base(opt, all_train, dev_feature, fu_test_feature)
    print("结束单折训练和预测:{}".format(time.strftime("%Y-%m-%d-%H:%M:%S", time.localtime())))
def stacking(opt):
    """5-fold stacking: train one model per fold on the stack + pseudo data."""
    logger.info('Start to KFold stack attribution model')
    # MRC queries consume 62 tokens of the sequence-length budget.
    seq_len = opt.max_seq_len - 62 if args.task_type == 'mrc' else opt.max_seq_len
    processor = NERProcessor(seq_len)

    stack_raw_examples = processor.read_json(os.path.join(opt.raw_data_dir, 'stack.json'))
    pseudo_raw_examples = processor.read_json(os.path.join(opt.raw_data_dir, 'pseudo.json'))
    base_output_dir = opt.output_dir

    splitter = KFold(5, shuffle=True, random_state=42)
    for fold, (train_ids, dev_ids) in enumerate(splitter.split(stack_raw_examples)):
        logger.info(f'Start to train the {fold} fold')
        # Fold-specific train split, extended with the pseudo-labelled examples.
        fold_train_raw = [stack_raw_examples[idx] for idx in train_ids] + pseudo_raw_examples
        train_examples = processor.get_examples(fold_train_raw, 'train')
        fold_dev_raw = [stack_raw_examples[idx] for idx in dev_ids]
        dev_info = processor.get_examples(fold_dev_raw, 'dev')
        # Each fold checkpoints into its own sub-directory under the base dir.
        opt.output_dir = os.path.join(base_output_dir, f'v{fold}')
        train_base(opt, train_examples, dev_info)
def training(opt):
    """Train a single model on train.json, optionally evaluating on dev.json.

    The processor's max sequence length is reduced by 62 tokens for the
    MRC task (the MRC query occupies that budget). Raw examples are read
    from opt.raw_data_dir and handed to train_base.

    NOTE(review): earlier revisions trained on stack.json and appended
    pseudo.json; both paths are removed here — confirm train.json alone
    is the intended source before re-enabling pseudo data.
    """
    if args.task_type == "mrc":
        # 62 tokens are reserved for the MRC query.
        processor = NERProcessor(opt.max_seq_len - 62)
    else:
        processor = NERProcessor(opt.max_seq_len)

    train_raw_examples = processor.read_json(
        os.path.join(opt.raw_data_dir, "train.json"))
    train_examples = processor.get_examples(train_raw_examples, "train")

    # Dev evaluation is optional; train_base accepts None to skip it.
    dev_examples = None
    if opt.eval_model:
        dev_raw_examples = processor.read_json(
            os.path.join(opt.raw_data_dir, "dev.json"))
        dev_examples = processor.get_examples(dev_raw_examples, "dev")

    train_base(opt, train_examples, dev_examples)
def ready_pretrain_data():
    """Collect plain text strings from all data splits for pretraining.

    Each sample's character list is joined into one string and stripped
    of surrounding whitespace. The pseudo-labelled split is skipped.

    NOTE(review): an external corpus ('./data/raw_data/addr_sample') was
    once merged here as well; re-add it if outside data is needed again.
    """
    processor = NERProcessor('./data/raw_data')
    train_feature, dev_feature, fu_test_feature, _, chu_test_feature = (
        processor.get_data_examples())
    splits = (train_feature, dev_feature, fu_test_feature, chu_test_feature)
    # Flatten every split into one list of text strings.
    return [''.join(sample.text).strip() for split in splits for sample in split]
def stacking(opt):
    """5-fold CV training, then either inference on the fu test set or pseudo-label generation.

    When opt.cv_infer is set, the fu test features are used; otherwise
    fu + chu test features are scored to build a pseudo dataset.
    """
    processor = NERProcessor(opt.max_seq_len)
    (train_feature, dev_feature, fu_test_feature,
     pseudo_feature, chu_test_feature) = processor.get_data_examples()
    full_train = train_feature + dev_feature

    if opt.cv_infer:
        print("开始进行cv训练和推理:{}".format(time.strftime("%Y-%m-%d-%H:%M:%S", time.localtime())))
        test_feature = fu_test_feature
    else:
        print("开始cv生成pseudo数据:{}".format(time.strftime("%Y-%m-%d-%H:%M:%S", time.localtime())))
        test_feature = fu_test_feature + chu_test_feature

    base_output_dir = opt.output_dir
    fold_models = []
    splitter = KFold(5, shuffle=True, random_state=42)
    for fold, (train_ids, dev_ids) in enumerate(splitter.split(full_train)):
        logger.info(f'Start to train the {fold} fold')
        # Fold train split plus the pseudo-labelled features.
        fold_train = [full_train[j] for j in train_ids] + pseudo_feature
        fold_dev = [full_train[j] for j in dev_ids]
        # Fold-specific output directory and index.
        opt.output_dir = os.path.join(base_output_dir, f'v{fold}')
        opt.cv_num = fold
        model, device = train_base(opt, fold_train, fold_dev, test_feature)
        fold_models.append(model)

    # Build the pseudo-labelled dataset from the fold predictions.
    if opt.cv_infer:
        # ensemble_infer(opt, test_feature, fold_models, device)
        generate_pseudos(opt)
        print("结束cv训练和推理:{}".format(time.strftime("%Y-%m-%d-%H:%M:%S", time.localtime())))
    else:
        generate_pseudos(opt)
        print("结束cv生成pseudo数据:{}".format(time.strftime("%Y-%m-%d-%H:%M:%S", time.localtime())))
def training(opt):
    """Train one model on train.json + pseudo.json, with optional dev evaluation."""
    # MRC queries take 62 tokens out of the sequence-length budget.
    seq_len = opt.max_seq_len - 62 if args.task_type == 'mrc' else opt.max_seq_len
    processor = NERProcessor(seq_len)

    train_raw_examples = processor.read_json(os.path.join(opt.raw_data_dir, 'train.json'))
    pseudo_raw_examples = processor.read_json(os.path.join(opt.raw_data_dir, 'pseudo.json'))
    # Pseudo-labelled examples are appended to the real training data.
    train_examples = processor.get_examples(train_raw_examples + pseudo_raw_examples, 'train')

    # Dev evaluation is optional; None skips it in train_base.
    dev_examples = None
    if opt.eval_model:
        dev_raw_examples = processor.read_json(os.path.join(opt.raw_data_dir, 'dev.json'))
        dev_examples = processor.get_examples(dev_raw_examples, 'dev')

    train_base(opt, train_examples, dev_examples)