def main():
    NUM_TRAIN_DATA = 150000
    NUM_TEST_DATA = 5000
    MODEL_DIR = './albert_base'
    MAX_LEN = 512
    BATCH_SIZE = 16 * 2  # 8gpu * 16
    LR = 1e-5
    NUM_LABELS = 33
    EPOCHS = 4

    # read data
    content, target = read_data('../../corpus/ettoday_2017.json')

    # train dataloader
    examples = DataProcessor().get_train_examples(content[:NUM_TRAIN_DATA], target[:NUM_TRAIN_DATA])
    train_dataset = convert_examples_to_features(
        examples, max_length=MAX_LEN,
        tokenizer=BertTokenizerFast.from_pretrained(MODEL_DIR))
    train_loader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE)

    # test dataloader
    examples = DataProcessor().get_test_examples(
        content[NUM_TRAIN_DATA:NUM_TEST_DATA + NUM_TRAIN_DATA],
        target[NUM_TRAIN_DATA:NUM_TEST_DATA + NUM_TRAIN_DATA])
    test_dataset = convert_examples_to_features(
        examples, max_length=MAX_LEN,
        tokenizer=BertTokenizerFast.from_pretrained(MODEL_DIR))
    test_loader = DataLoader(test_dataset, shuffle=False, batch_size=BATCH_SIZE)

    # start training and callback for eval
    # train(train_loader, MODEL_DIR, num_labels=NUM_LABELS, epochs=EPOCHS, eval_callback=evaluate, test_loader=train_loader)
    train(train_loader, MODEL_DIR, num_labels=NUM_LABELS, lr=LR, epochs=EPOCHS,
          eval_callback=evaluate, test_loader=test_loader)
def build_features(input_path, tokenizer, poss, labels, config, mode='train', w_tokenizer=None, glabels={}):
    logger.info("[Creating features from file] %s", input_path)
    examples = read_examples_from_file(config, input_path, mode=mode)
    features = convert_examples_to_features(
        config, examples, poss, labels, config['n_ctx'], tokenizer,
        cls_token=tokenizer.cls_token,
        cls_token_segment_id=0,
        sep_token=tokenizer.sep_token,
        # roberta uses an extra separator b/w pairs of sentences,
        # cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
        sep_token_extra=bool(config['emb_class'] in ['roberta']),
        pad_token=tokenizer.pad_token,
        pad_token_id=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
        pad_token_pos_id=config['pad_pos_id'],
        pad_token_label_id=config['pad_label_id'],
        pad_token_segment_id=0,
        sequence_a_segment_id=0,
        glabel_map=glabels,
        w_tokenizer=w_tokenizer)
    return features
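# A minimal usage sketch for build_features. The config keys mirror the ones read in the
# function above (n_ctx, emb_class, pad_pos_id, pad_label_id); the file path, tokenizer
# name, and the poss/labels maps are hypothetical placeholders, not part of the original code.
from transformers import AutoTokenizer

config = {'n_ctx': 180, 'emb_class': 'bert', 'pad_pos_id': 0, 'pad_label_id': 0}
poss = {'<pad>': 0, 'NN': 1, 'VB': 2}        # hypothetical POS-tag map
labels = {'<pad>': 0, 'O': 1, 'B-PER': 2}    # hypothetical label map
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
features = build_features('data/train.txt', tokenizer, poss, labels, config, mode='train')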
def main():
    NUM_TRAIN_DATA = 60000
    MODEL_DIR = './rbtl3'
    MAX_LEN = 512
    BATCH_SIZE = 12
    EPOCHS = 4

    # read data
    content, target = read_data()

    # train dataloader
    examples = DataProcessor().get_train_examples(content[:NUM_TRAIN_DATA], target[:NUM_TRAIN_DATA])
    train_dataset = convert_examples_to_features(
        examples, max_length=MAX_LEN,
        tokenizer=BertTokenizer.from_pretrained(MODEL_DIR))
    train_loader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE)

    # test dataloader
    examples = DataProcessor().get_test_examples(content[NUM_TRAIN_DATA:], target[NUM_TRAIN_DATA:])
    test_dataset = convert_examples_to_features(
        examples, max_length=MAX_LEN,
        tokenizer=BertTokenizer.from_pretrained(MODEL_DIR))
    test_loader = DataLoader(test_dataset, shuffle=False, batch_size=BATCH_SIZE)

    # start training and callback for eval
    # train(train_loader, MODEL_DIR, num_labels=18, epochs=EPOCHS, eval_callback=evaluate, test_loader=train_loader)
    train(train_loader, MODEL_DIR, num_labels=18, epochs=EPOCHS,
          eval_callback=evaluate, test_loader=test_loader)
def map_eval(eval_file, token_length, tokenizer, device, model, label_list):
    model.eval()
    datasets, labels = get_datasets(eval_file)
    total_batches = 0
    total_avp = 0.0
    total_mrr = 0.0
    # scores, labels = [], []
    for k, dataset in tqdm(datasets.items(), desc="Eval datasets"):
        examples = []
        for i, data in enumerate(dataset):
            examples.append(InputExample(i, data[0], data[1], '0'))
        eval_features = convert_examples_to_features(examples, label_list, token_length, tokenizer)
        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long).to(device)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long).to(device)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long).to(device)
        # all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long).to(device)
        x_input_ids = torch.tensor([f.input_ids_x for f in eval_features], dtype=torch.long).to(device)
        x_input_mask = torch.tensor([f.input_mask_x for f in eval_features], dtype=torch.long).to(device)
        x_segment_ids = torch.tensor([f.segment_ids_x for f in eval_features], dtype=torch.long).to(device)
        y_input_ids = torch.tensor([f.input_ids_y for f in eval_features], dtype=torch.long).to(device)
        y_input_mask = torch.tensor([f.input_mask_y for f in eval_features], dtype=torch.long).to(device)
        y_segment_ids = torch.tensor([f.segment_ids_y for f in eval_features], dtype=torch.long).to(device)
        with torch.no_grad():
            logits = model(x_input_ids, x_input_mask, x_segment_ids,
                           y_input_ids, y_input_mask, y_segment_ids,
                           all_input_ids, all_segment_ids, all_input_mask)
        score = F.softmax(logits, dim=1)[:, 1].cpu().numpy()
        label = np.array(list(map(int, labels[k])))
        # print(score, label)
        # scores.append(score)
        # labels.append(label)
        total_avp += mean_average_precision(label, score)
        total_mrr += mean_reciprocal_rank(label, score)
        total_batches += 1
    mAP = total_avp / total_batches
    mRR = total_mrr / total_batches
    logger.info("map is : {}, mrr is : {}".format(mAP, mRR))
    data = {'map': mAP, 'mrr': mRR}
    with open('./result.json', 'w', encoding='utf-8') as f:
        json.dump(data, f)
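# mean_average_precision and mean_reciprocal_rank are not defined in this snippet.
# A minimal sketch of what they might compute, assuming `label` is a binary relevance
# vector and `score` holds the model scores for one query's candidate list:
import numpy as np

def mean_reciprocal_rank(label, score):
    # rank candidates by descending score; reciprocal rank of the first relevant one
    order = np.argsort(-np.asarray(score))
    ranked = np.asarray(label)[order]
    hits = np.flatnonzero(ranked)
    return 1.0 / (hits[0] + 1) if hits.size else 0.0

def mean_average_precision(label, score):
    # average precision over the relevant candidates of a single query
    order = np.argsort(-np.asarray(score))
    ranked = np.asarray(label)[order]
    if ranked.sum() == 0:
        return 0.0
    precisions = np.cumsum(ranked) / (np.arange(len(ranked)) + 1)
    return float((precisions * ranked).sum() / ranked.sum())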
def load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, data_file, is_test=False, is_ens=False):
    if args.local_rank not in [-1, 0] and not is_test:
        # Make sure only the first process in distributed training processes the dataset,
        # and the others will use the cache
        torch.distributed.barrier()

    # Load data features from cache or dataset file
    cached_features_file = os.path.join(
        args.data_dir,
        "cached_{}_{}_{}".format(
            data_file,
            list(filter(None, args.model_name_or_path.split("/"))).pop(),
            str(args.max_seq_length),
        ),
    )
    if os.path.exists(cached_features_file) and not args.overwrite_cache:
        logger.info("Loading features from cached file %s", cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", args.data_dir)
        examples = read_examples_from_file(args.data_dir, data_file, is_test=is_test, is_ens=is_ens)
        features = convert_examples_to_features(
            examples,
            labels,
            args.max_seq_length,
            tokenizer,
            cls_token_at_end=bool(args.model_type in ["xlnet"]),  # xlnet has a cls token at the end
            cls_token=tokenizer.cls_token,
            cls_token_segment_id=2 if args.model_type in ["xlnet"] else 0,
            sep_token=tokenizer.sep_token,
            # roberta uses an extra separator b/w pairs of sentences,
            # cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
            sep_token_extra=bool("roberta" in args.model_type),
            pad_on_left=bool(args.model_type in ["xlnet"]),  # pad on the left for xlnet
            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
            pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0,
            pad_token_label_id=pad_token_label_id,
        )
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
            torch.save(features, cached_features_file)

    if args.local_rank == 0 and not is_test:
        # Make sure only the first process in distributed training processes the dataset,
        # and the others will use the cache
        torch.distributed.barrier()

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)

    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
    return dataset
def get_dataloader(processor, args, tokenizer, mode='test'):
    eval_examples = processor.get_test_examples() if mode == 'test' \
        else processor.get_dev_examples()
    eval_examples = eval_examples[:1000]
    label_list = processor.get_labels()
    eval_features = convert_examples_to_features(eval_examples, label_list, args.max_seq_length, tokenizer)
    logger.info("***** Running evaluation *****")
    logger.info(" Num examples = %d", len(eval_examples))
    logger.info(" Batch size = %d", args.eval_batch_size)
    all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
    x_input_ids = torch.tensor([f.input_ids_x for f in eval_features], dtype=torch.long)
    x_input_mask = torch.tensor([f.input_mask_x for f in eval_features], dtype=torch.long)
    x_segment_ids = torch.tensor([f.segment_ids_x for f in eval_features], dtype=torch.long)
    y_input_ids = torch.tensor([f.input_ids_y for f in eval_features], dtype=torch.long)
    y_input_mask = torch.tensor([f.input_mask_y for f in eval_features], dtype=torch.long)
    y_segment_ids = torch.tensor([f.segment_ids_y for f in eval_features], dtype=torch.long)
    eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids,
                              x_input_ids, x_input_mask, x_segment_ids,
                              y_input_ids, y_input_mask, y_segment_ids)
    # Run prediction for full data
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
    return eval_dataloader
def read_data(file):
    df = pd.read_json(file)
    df = shuffle(df)
    content = (df['title'] + ' ' + df['content']).to_list()
    target = df['category'].to_list()
    return content, target


if __name__ == '__main__':
    import pandas as pd

    NUM_TEST_DATA = 50016
    MODEL_DIR = './electra_chinese_base'
    MAX_LEN = 512
    BATCH_SIZE = 16 * 2  # 8gpu * 16
    NUM_LABELS = 33
    os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"

    content, target = read_data('../../corpus/ettoday_2017.json')
    examples = DataProcessor().get_test_examples(content[:NUM_TEST_DATA], target[:NUM_TEST_DATA])
    test_dataset = convert_examples_to_features(
        examples, max_length=MAX_LEN,
        tokenizer=ElectraTokenizerFast.from_pretrained(MODEL_DIR))
    test_loader = DataLoader(test_dataset, shuffle=False, batch_size=BATCH_SIZE)
    evaluate(test_loader, MODEL_DIR, 'step_18749.ckpt', NUM_LABELS)
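# read_data expects a JSON file that pandas can load with `title`, `content`, and
# `category` fields per record; the two hypothetical rows below (not from the original
# corpus) illustrate the schema that feeds `content` (title + ' ' + content) and
# `target` (category).
sample_records = [
    {"title": "Headline A", "content": "Body text A ...", "category": "sports"},
    {"title": "Headline B", "content": "Body text B ...", "category": "finance"},
]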
def train(model, processor, task_name, optimizer, train_examples, label_list, args,
          tokenizer, device, n_gpu, num_train_optimization_steps, valid=False):
    # model.train()
    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0
    # train_features = convert_examples_to_features(
    #     train_examples, label_list, args.max_seq_length, tokenizer)
    if os.path.exists('./cache_cmed/train_features.pkl'):
        with open('./cache_cmed/train_features.pkl', 'rb') as f:
            train_features = pickle.load(f)[:50000]
    else:
        train_features = convert_examples_to_features(train_examples, label_list, args.max_seq_length, tokenizer)
        with open('./cache_cmed/train_features.pkl', 'wb') as f:
            pickle.dump(train_features, f)
    logger.info("***** Running training *****")
    logger.info(" Num examples = %d", len(train_examples))
    logger.info(" Batch size = %d", args.train_batch_size)
    logger.info(" Num steps = %d", num_train_optimization_steps)
    all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
    x_input_ids = torch.tensor([f.input_ids_x for f in train_features], dtype=torch.long)
    x_input_mask = torch.tensor([f.input_mask_x for f in train_features], dtype=torch.long)
    x_segment_ids = torch.tensor([f.segment_ids_x for f in train_features], dtype=torch.long)
    y_input_ids = torch.tensor([f.input_ids_y for f in train_features], dtype=torch.long)
    y_input_mask = torch.tensor([f.input_mask_y for f in train_features], dtype=torch.long)
    y_segment_ids = torch.tensor([f.segment_ids_y for f in train_features], dtype=torch.long)
    train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids,
                               x_input_ids, x_input_mask, x_segment_ids,
                               y_input_ids, y_input_mask, y_segment_ids)
    if args.local_rank == -1:
        train_sampler = RandomSampler(train_data)
    else:
        train_sampler = DistributedSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

    for _ in trange(int(args.num_train_epochs), desc="Epoch"):
        model.train()
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
            batch = tuple(t.to(device) for t in batch)
            (input_ids, input_mask, segment_ids, label_ids,
             x_input_ids, x_input_mask, x_segment_ids,
             y_input_ids, y_input_mask, y_segment_ids) = batch
            loss = model(x_input_ids, x_input_mask, x_segment_ids,
                         y_input_ids, y_input_mask, y_segment_ids,
                         input_ids, segment_ids, input_mask, label_ids)
            if n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu.
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            if args.fp16:
                optimizer.backward(loss)
            else:
                loss.backward()
            logger.info(loss.item())
            tr_loss += loss.item()
            nb_tr_examples += input_ids.size(0)
            nb_tr_steps += 1
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    # modify learning rate with special warm up BERT uses
                    # if args.fp16 is False, BertAdam is used that handles this automatically
                    lr_this_step = args.learning_rate * warmup_linear(
                        global_step / num_train_optimization_steps, args.warmup_proportion)
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = lr_this_step
                optimizer.step()
                optimizer.zero_grad()
                # global_step += 1

        if valid:
            logging.info('Start eval the dev set')
            if task_name in ['lcqmc', 'mrpc', 'qqp', 'cmedqa']:
                eval_dataloader = get_dataloader(processor, args, tokenizer, mode='dev')
                eval(model, eval_dataloader, device)
            else:
                dev_file = os.path.join(args.data_dir, 'dev.tsv')
                map_eval(dev_file, args.max_seq_length, tokenizer, device, model, label_list)
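# warmup_linear is not defined in this snippet; in the old pytorch-pretrained-bert
# package it is a simple linear warmup followed by linear decay. A sketch under that
# assumption:
def warmup_linear(x, warmup=0.002):
    # x is the fraction of training completed (global_step / num_train_optimization_steps)
    if x < warmup:
        return x / warmup   # linear warmup to the peak learning rate
    return 1.0 - x          # then linear decay toward zero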