def train_process(config, train_load, train_sampler, model_name):
    # load source bert weights
    model_config = BertConfig.from_pretrained(
        pretrained_model_name_or_path="../user_data/bert_source/{}_config.json".format(model_name))
    # model_config = BertConfig()
    model_config.vocab_size = len(
        pd.read_csv('../user_data/vocab', names=["score"]))
    model = BertForSequenceClassification(config=model_config)
    checkpoint = torch.load(
        '../user_data/save_bert/{}_checkpoint.pth.tar'.format(model_name),
        map_location=torch.device('cpu'))
    model.load_state_dict(checkpoint['status'], strict=False)
    print('***********load pretrained mlm {} weight*************'.format(model_name))
    for param in model.parameters():
        param.requires_grad = True

    # 4) move the model to its GPU before wrapping it
    model = model.to(config.device)

    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters()
                       if not any(nd in n for nd in no_decay)],
            "weight_decay": config.weight_decay,
        },
        {
            "params": [p for n, p in model.named_parameters()
                       if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=config.learning_rate)
    # t_total = len(train_load) * config.num_train_epochs
    # scheduler = get_linear_schedule_with_warmup(
    #     optimizer, num_warmup_steps=t_total * config.warmup_proportion, num_training_steps=t_total
    # )

    cudnn.benchmark = True
    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
    # 5) wrap the model for distributed training
    model = torch.nn.parallel.DistributedDataParallel(
        model, device_ids=[config.local_rank])
    model.train()

    if config.fgm:
        fgm = FGM(model)

    for epoch in range(config.num_train_epochs):
        train_sampler.set_epoch(epoch)
        torch.cuda.empty_cache()
        for batch, (input_ids, token_type_ids, attention_mask,
                    label) in enumerate(train_load):
            input_ids = input_ids.cuda(config.local_rank, non_blocking=True)
            attention_mask = attention_mask.cuda(config.local_rank, non_blocking=True)
            token_type_ids = token_type_ids.cuda(config.local_rank, non_blocking=True)
            label = label.cuda(config.local_rank, non_blocking=True)
            outputs = model(input_ids=input_ids,
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids,
                            labels=label)
            loss = outputs.loss
            model.zero_grad()
            loss.backward()
            # torch.nn.utils.clip_grad_norm_(model.parameters(), config.max_grad_norm)
            if config.fgm:
                fgm.attack()  # add the adversarial perturbation on the embeddings
                loss_adv = model(input_ids=input_ids,
                                 attention_mask=attention_mask,
                                 token_type_ids=token_type_ids,
                                 labels=label).loss
                # backward pass: accumulate the adversarial gradient on top of the normal gradient
                loss_adv.backward()
                fgm.restore()  # restore the original embedding parameters
            optimizer.step()
            # scheduler.step()

        # dev_auc = model_evaluate(config, model, valid_load)
        # synchronize all processes and compute the distributed loss
        torch.distributed.barrier()
        # reduce_dev_auc = reduce_auc(dev_auc, config.nprocs).item()
        # if reduce_dev_auc > best_dev_auc:
        #     best_dev_auc = reduce_dev_auc
        #     is_best = True

        now = strftime("%Y-%m-%d %H:%M:%S", localtime())
        msg = 'model_name:{},time:{},epoch:{}/{}'
        if config.local_rank in [0, -1]:
            print(msg.format(model_name, now, epoch + 1, config.num_train_epochs))
            checkpoint = {"status": model.module.state_dict()}
            torch.save(
                checkpoint,
                '../user_data/save_model' + os.sep +
                '{}_checkpoint.pth.tar'.format(model_name))
            del checkpoint
        torch.distributed.barrier()
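# The training loop above calls an FGM helper (fgm.attack() / fgm.restore()) that is not
# defined in this snippet. Below is a minimal sketch of such a helper following the usual
# Fast Gradient Method adversarial-training recipe; the epsilon value and the embedding
# parameter name are assumptions, not values taken from the original project.
import torch


class FGM:
    """Minimal FGM sketch matching the attack()/restore() interface used above."""

    def __init__(self, model, epsilon=1.0, emb_name="word_embeddings"):
        self.model = model
        self.epsilon = epsilon      # perturbation radius (assumed default)
        self.emb_name = emb_name    # substring identifying embedding weights (assumed)
        self.backup = {}

    def attack(self):
        # Perturb the embedding weights along the current gradient direction.
        for name, param in self.model.named_parameters():
            if param.requires_grad and self.emb_name in name and param.grad is not None:
                self.backup[name] = param.data.clone()
                norm = torch.norm(param.grad)
                if norm != 0 and not torch.isnan(norm):
                    param.data.add_(self.epsilon * param.grad / norm)

    def restore(self):
        # Restore the original embedding weights after the adversarial pass.
        for name, param in self.model.named_parameters():
            if name in self.backup:
                param.data = self.backup[name]
        self.backup = {}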
def inference(args):
    # Check for CUDA
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    tokenizer = BertTokenizer.from_pretrained(args.bert_name)
    # Prepare jsons
    ind2organ = json.load(
        open(os.path.join(args.organs_dir_path, "ind2organ.json")))
    organ2label = json.load(
        open(os.path.join(args.organs_dir_path, "organ2label.json")))
    organ2voxels = json.load(
        open(os.path.join(args.organs_dir_path, "organ2voxels.json")))
    test_dataset = VoxelSentenceMappingTestRegDataset(args.test_json_path,
                                                      tokenizer, ind2organ)
    test_loader = DataLoader(
        test_dataset,
        batch_size=args.batch_size,
        collate_fn=collate_pad_sentence_reg_test_batch,
    )
    # Create model
    config = BertConfig.from_pretrained(args.bert_name)
    model = nn.DataParallel(
        RegModel(args.bert_name, config, final_project_size=3)).to(device)
    # Load model
    model.load_state_dict(torch.load(args.checkpoint_path, map_location=device))
    # Set model in evaluation mode
    model.train(False)
    # Create evaluator
    evaluator = InferenceEvaluatorPerOrgan(
        ind2organ,
        organ2label,
        organ2voxels,
        args.voxelman_images_path,
        test_dataset.organ2count,
        len(test_dataset),
    )
    center = torch.from_numpy(VOXELMAN_CENTER)
    # Restart counters
    evaluator.reset_counters()
    for input_batch, organs_indices, _ in tqdm(test_loader):
        input_batch = {key: val.to(device) for key, val in input_batch.items()}
        output_mappings = (model(
            input_ids=input_batch["sentences"],
            attention_mask=input_batch["attn_mask"],
        ).cpu() * center)
        for output_mapping, organ_indices in zip(output_mappings, organs_indices):
            evaluator.update_counters(output_mapping.numpy(),
                                      organ_indices.numpy())
    print(
        "The avg IOR on the test set is: "
        f"{evaluator.get_current_ior()} +/- {evaluator.get_ior_error_bar()}"
    )
    print(
        "The avg distance on the test set is: "
        f"{evaluator.get_current_distance()} +/- {evaluator.get_distance_error_bar()}"
    )
    print(
        "The avg miss distance on the test set is: "
        f"{evaluator.get_current_miss_distance()} +/- {evaluator.get_miss_distance_error_bar()}"
    )
    print("============================================")
    for organ_name in evaluator.organ2count.keys():
        if evaluator.get_current_ior_for_organ(organ_name) > -1:
            print(f"The avg IOR for {organ_name} is: "
                  f"{evaluator.get_current_ior_for_organ(organ_name)} +/- "
                  f"{evaluator.get_ior_error_bar_for_organ(organ_name)}")
            print(
                f"The avg NVD {organ_name} is: "
                f"{evaluator.get_current_distance_for_organ(organ_name)} +/- "
                f"{evaluator.get_distance_error_bar_for_organ(organ_name)}"
            )
            print(
                f"The avg NVD-O {organ_name} is: "
                f"{evaluator.get_current_miss_distance_for_organ(organ_name)} +/- "
                f"{evaluator.get_miss_distance_error_bar_for_organ(organ_name)}"
            )
            print("============================================")
mode="test", ) val_dataloader = DataLoader( val_dataset, batch_size=args.batch_size, sampler=SequentialSampler(val_dataset), collate_fn=nlpbook.data_collator, drop_last=False, num_workers=0, ) # %% initialize model from transformers import BertConfig, BertForSequenceClassification pt_model_config = BertConfig.from_pretrained( args.pretrained_model_name, num_labels=corpus.num_labels, ) model = BertForSequenceClassification.from_pretrained( args.pretrained_model_name, config=pt_model_config, ) # %% prepare training from ratsnlp.nlpbook.classification import ClassificationTask task = ClassificationTask(model, args) # %% trainer = nlpbook.get_trainer(args) # %%
if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--model_type", type=str, required=True, choices=["rbert", "bert_em_cls", "bert_em_es", "bert_em_all"], help="Model type") parser.add_argument("--model_dir", type=str, required=True, help="Path to model directory") parser.add_argument("--input_file", type=str, required=True, help="Path to input file") parser.add_argument("--output_file", type=str, required=True, help="Path to output file (to store predicted labels)") parser.add_argument("--eval_batch_size", type=int, default=32, help="Batch size for evaluation.") parser.add_argument("--no_cuda", action="store_true", help="Whether to use GPU for evaluation.") parser.add_argument("--overwrite_cache", action="store_true", help="Whether to overwrite cached feature file.") args = parser.parse_args() init_logger() logger.info("%s" % args) config = BertConfig.from_pretrained(args.model_dir) train_args = torch.load(os.path.join(args.model_dir, "training_args.bin")) logger.info("Training args: {}".format(train_args)) train_args.eval_batch_size = args.eval_batch_size train_args.overwrite_cache = args.overwrite_cache # For BERT-EM, we have to use GPU because we fix device="cuda" in the code args.device = "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu" # Check whether model exists if not os.path.exists(args.model_dir): raise Exception("Model doesn't exists! Train first!") # Load tokenizer tokenizer = load_tokenizer(train_args)
except ValueError:
    tpu = None

if tpu:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
else:
    # Default distribution strategy in Tensorflow. Works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()

print("REPLICAS: ", strategy.num_replicas_in_sync)

model_name = 'bert-base-multilingual-cased'
config = BertConfig.from_pretrained(model_name)
config.output_hidden_states = False
tokenizer = BertTokenizerFast.from_pretrained(
    pretrained_model_name_or_path=model_name, config=config)
transformer_model = TFBertModel.from_pretrained(model_name, config=config)

train = pd.read_csv('/kaggle/input/dataset/train.csv')
valid = pd.read_csv('/kaggle/input/dataset/val.csv')
test = pd.read_csv('/kaggle/input/dataset/test.csv')

# IMP DATA FOR CONFIG
AUTO = tf.data.experimental.AUTOTUNE

# Configuration
EPOCHS = 3
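# The snippet stops right after defining AUTO and EPOCHS. Below is a hedged sketch (not
# part of the original notebook) of how the tokenizer and AUTOTUNE constant above might
# feed a tf.data pipeline. The column names ("comment_text", "toxic"), MAX_LEN and
# BATCH_SIZE are assumptions made purely for illustration.
MAX_LEN = 192
BATCH_SIZE = 16 * strategy.num_replicas_in_sync


def encode_texts(texts, max_len=MAX_LEN):
    # Batch-encode raw strings into fixed-length id matrices suitable for TFBertModel.
    enc = tokenizer(list(texts), max_length=max_len, truncation=True,
                    padding="max_length", return_tensors="np")
    return enc["input_ids"]


x_train = encode_texts(train["comment_text"].values)   # assumed text column
y_train = train["toxic"].values                        # assumed label column

train_ds = (
    tf.data.Dataset.from_tensor_slices((x_train, y_train))
    .shuffle(2048)
    .batch(BATCH_SIZE)
    .prefetch(AUTO)
)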
def create_long_model(save_model_to, attention_window, max_pos, pretrained_config, pretrained_checkpoint, pretrained_tokenizer): """ Convert RoBERTa into Long-Version :param save_model_to: the model save path :param attention_window: the long-attention defined above :param max_pos: extend the position embedding to max_pos=4096 :return: modified model and tokenizer """ config = BertConfig.from_pretrained(pretrained_config) model = BertForMaskedLM.from_pretrained(pretrained_checkpoint, config=config) tokenizer = BertTokenizerFast.from_pretrained(pretrained_tokenizer, model_max_length=max_pos) # extend position embedding tokenizer.model_max_length = max_pos tokenizer.init_kwargs['model_max_length'] = max_pos current_max_pos, embed_size = model.bert.embeddings.position_embeddings.weight.shape # RoBERTa has position 0,1 reserved, embedding size = max_pos + 2 #max_pos += 2 # ??? is this fit for BERT-based RoBerta_zh? """ RoBERTa reserved position 0 1, However, Bert-based RoBERTa_zh did not. """ config.max_position_embeddings = max_pos assert max_pos > current_max_pos # allocate a larger position embedding matrix new_pos_embed = model.bert.embeddings.position_embeddings.weight.new_empty( max_pos, embed_size) # init by duplication k = 0 step = current_max_pos while k < max_pos - 1: new_pos_embed[k:( k + step)] = model.bert.embeddings.position_embeddings.weight[0:] k += step model.bert.embeddings.position_embeddings.weight.data = new_pos_embed # The next problem is that: BERT_Based RoBERTa has not attribute [position_ids] for [bert.embeddings] # model.bert.embeddings.position_ids.data = torch.tensor([i for i in range(max_pos)]).reshape(1, max_pos) # replace the modeling_bert.BertSelfAttention obj with LongformerSelfAttention config.attention_window = [attention_window] * config.num_hidden_layers for i, layer in enumerate(model.bert.encoder.layer): longformer_self_attn = LongformerSelfAttention(config, layer_id=i) longformer_self_attn.query = layer.attention.self.query longformer_self_attn.key = layer.attention.self.key longformer_self_attn.value = layer.attention.self.value longformer_self_attn.query_global = copy.deepcopy( layer.attention.self.query) longformer_self_attn.key_global = copy.deepcopy( layer.attention.self.key) longformer_self_attn.value_global = copy.deepcopy( layer.attention.self.value) layer.attention.self = longformer_self_attn logger.info(f'saving model to {save_model_to}') model.save_pretrained(save_model_to) tokenizer.save_pretrained(save_model_to) return model, tokenizer
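# Hedged usage sketch for create_long_model(); the output directory, attention window
# and the source checkpoint name are illustrative placeholders, not values taken from
# the original project.
if __name__ == "__main__":
    long_model, long_tokenizer = create_long_model(
        save_model_to="./bert_zh_long_4096",                    # assumed output dir
        attention_window=512,                                   # typical Longformer window
        max_pos=4096,
        pretrained_config="hfl/chinese-roberta-wwm-ext",        # assumed source checkpoint
        pretrained_checkpoint="hfl/chinese-roberta-wwm-ext",
        pretrained_tokenizer="hfl/chinese-roberta-wwm-ext",
    )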
from transformers import BertForSequenceClassification, BertConfig, BertTokenizer, AdamW
from torch.utils.data import DataLoader
import torch
import numpy as np
from tools import start_debugger_on_exception
from dataset import DataSetBert

start_debugger_on_exception()

train_dataset = DataSetBert(data_file='./data/data_train/train.csv')
val_dataset = DataSetBert(data_file='./data/data_train/val.csv')
test_dataset = DataSetBert(data_file='./data/data_train/test.csv')

device = torch.device('cuda:6')
train_dataloader = DataLoader(train_dataset, batch_size=11, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=11, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=11, shuffle=True)

model_config = BertConfig.from_pretrained('bert-base-chinese')
model_config.num_hidden_layers = 3
model = BertForSequenceClassification(model_config)

tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
model.resize_token_embeddings(len(tokenizer))
model.config.pad_token_id = model.config.eos_token_id
model.config.max_position_embeddings = 1024

model.train()
model.to(device)

import pdb
pdb.set_trace()

optimizer = AdamW(model.parameters(), lr=1e-5)
no_decay = ['bias', 'LayerNorm.weight']
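# The snippet ends right after defining `no_decay`. Below is a hedged continuation sketch
# (not the original code): it rebuilds the optimizer with weight-decay parameter groups
# and runs one pass over the training loader. The batch structure (raw texts plus a label
# tensor) is an assumption about what DataSetBert yields.
optimizer_grouped_parameters = [
    {"params": [p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)],
     "weight_decay": 0.01},
    {"params": [p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)],
     "weight_decay": 0.0},
]
optimizer = AdamW(optimizer_grouped_parameters, lr=1e-5)

for texts, labels in train_dataloader:
    enc = tokenizer(list(texts), padding=True, truncation=True,
                    max_length=512, return_tensors="pt").to(device)
    labels = labels.to(device)
    outputs = model(**enc, labels=labels)
    loss = outputs.loss
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()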
def main(): args = parse_args() os.makedirs(args.output_dir, exist_ok=True) set_seed(args.seed) logging.basicConfig(level=logging.INFO, format='%(asctime)s :: %(levelname)s :: %(message)s') if args.numnet_model is not None: config = BertConfig.from_pretrained( args.model_name, num_labels=1) # 1 label for regression # if args.contrastive: # model = ContrastiveElectra.from_pretrained(args.model_name, config=config) # else: model = BertForSequenceClassification.from_pretrained(args.model_name, config=config) state_dicts = torch.load(args.numnet_model) if "model" in state_dicts: logging.info("Loading in mutual electra format state_dicts.") model.load_state_dict(state_dicts["model"], strict=False) else: logging.info("Loading model weights only.") model.load_state_dict(state_dicts, strict=False) else: config = ElectraConfig.from_pretrained( args.model_name, num_labels=1) # 1 label for regression model = ElectraForSequenceClassification.from_pretrained( args.model_name, config=config) if args.local_model_path is not None: state_dicts = torch.load(args.local_model_path) model.load_state_dict(state_dicts["model"]) tokenizer = ElectraTokenizer.from_pretrained(args.model_name, do_lower_case=True) device = "cuda" if torch.cuda.is_available() else "cpu" model.to(device) # TODO enable multi-gpu training if necessary pretrain_train_dataset = DapoDataset(args.data_dir, "train", tokenizer) if args.pretrain else None pretrain_dev_dataset = DapoDataset(args.data_dir, "dev", tokenizer) if args.pretrain else None if args.train: if args.contrastive: train_dataset = ContrastiveDataset(args.data_dir, "train", tokenizer) train_dataloader = DataLoader(train_dataset, batch_size=args.train_batch_size, shuffle=False, num_workers=8, collate_fn=mutual_contrast_collate) dev_dataset = ContrastiveDataset( args.data_dir, "dev", tokenizer) if args.eval or args.test else None dev_dataloader = DataLoader(dev_dataset, batch_size=args.train_batch_size, shuffle=False, num_workers=8, collate_fn=mutual_contrast_collate ) if dev_dataset is not None else None else: train_dataset = MutualDataset(args.data_dir, "train", tokenizer) train_dataloader = DataLoader(train_dataset, batch_size=args.train_batch_size, shuffle=True, num_workers=8, collate_fn=mutual_collate) dev_dataset = MutualDataset( args.data_dir, "dev", tokenizer) if args.eval or args.test else None dev_dataloader = DataLoader( dev_dataset, batch_size=args.train_batch_size, shuffle=False, num_workers=8, collate_fn=mutual_collate) if dev_dataset is not None else None else: train_dataset, train_dataloader = None, None # TODO: add test_dataset if we want to submit to leaderboard pretrain_train_dataloader = DataLoader( pretrain_train_dataset, batch_size=args.train_batch_size, shuffle=True, num_workers=8, collate_fn=dapo_collate ) if pretrain_train_dataset is not None else None pretrain_dev_dataloader = DataLoader( pretrain_dev_dataset, batch_size=args.train_batch_size, shuffle=False, num_workers=8, collate_fn=dapo_collate) if pretrain_dev_dataset is not None else None # currently eval_batch_size = train_batch_size if args.pretrain: logging.info("Start pretraining...") args.eval = True trainer = Trainer(args, model, device, pretrain_train_dataloader, pretrain_dev_dataloader) trainer.train() return # fine-tuning should be done separately if args.train: logging.info("Start training...") trainer = Trainer(args, model, device, train_dataloader, dev_dataloader) trainer.train() # TODO: currently testing is on the dev set if args.test: logging.info("Start testing...") tester = 
Tester(args, model, device, dev_dataset, dev_dataloader) tester.test()
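# main() instantiates Tester(...).test(), which is not shown in this snippet. Below is a
# minimal hedged sketch matching that interface. Because the model above is configured
# with num_labels=1 (regression), the sketch only collects the per-example scores; the
# project's actual ranking metrics (e.g. R@1 / MRR for MuTual) are not reproduced, and
# the assumption that batches arrive as dicts of tensors comes from mutual_collate.
import logging
import torch


class Tester:
    def __init__(self, args, model, device, dataset, dataloader):
        self.args = args
        self.model = model
        self.device = device
        self.dataset = dataset
        self.dataloader = dataloader

    @torch.no_grad()
    def test(self):
        self.model.eval()
        all_scores = []
        for batch in self.dataloader:
            batch = {k: v.to(self.device) for k, v in batch.items() if torch.is_tensor(v)}
            logits = self.model(**batch).logits          # shape (batch, 1) for num_labels=1
            all_scores.extend(logits.squeeze(-1).tolist())
        logging.info("Scored %d dev examples (mean score %.4f)",
                     len(all_scores), sum(all_scores) / max(len(all_scores), 1))
        return all_scores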
torch.backends.cudnn.benchmark = False

DEVICE: str = "cuda" if torch.cuda.is_available() and USE_GPU else "cpu"

# Some paths for the training phase
DATASET_PATH: str = '../../data/train.json'
DATASET_DEV_PATH: str = '../../data/dev.json'
DATASET_TEST_PATH: str = '../../data/test.json'
GLOVE_PATH: str = "../../model/glove.6B.300d.txt"  # pre-trained GloVe embeddings path

# read the dataset
sentences, labels = read_dataset(DATASET_PATH)
sentences_dev, labels_dev = read_dataset(DATASET_DEV_PATH)

# -- Initialize BERT --
bert_config = BertConfig.from_pretrained(model_name, output_hidden_states=True)
bert_tokenizer = BertTokenizer.from_pretrained(model_name)
bert_model = BertModel.from_pretrained(model_name, config=bert_config)

# -- Net configuration -- it improves code modularity
net_configuration: dict = net_configurator(
    use_bert_embeddings=USE_BERT_EMBEDDINGS,
    use_crf=USE_CRF,
    use_biaffine_layer=USE_BIAFFINE_LAYER,
    use_pretrained=USE_GLOVE,
    use_dependecy_heads=USE_DEPENDENCY_HEADS,
    use_predicates=False,
    use_syntagnet=USE_SYNTAGNET)

dataset_train: SRL_Dataset = SRL_Dataset(sentences, labels,
def zero_percent_no_finetuning(): parser = argparse.ArgumentParser() parser.add_argument("--test_data_path", required=True, type=str) parser.add_argument("--output_dir", required=True, type=str) parser.add_argument("--data_column", required=True, type=str) parser.add_argument("--label_column", required=True, type=str) parser.add_argument("--model_type", required=True) #parser.add_argument("--eval_split", # default=0.1, # type=float) #parser.add_argument("--test_split", # default=0.1, # type=float) parser.add_argument("--max_len", default=256, type=int) parser.add_argument("--batch_size", default=16, type=int) parser.add_argument("--num_epochs", default=4, type=int) parser.add_argument("--learning_rate", default=2e-5, type=float) parser.add_argument("--weight_decay", default=0.01, type=float) parser.add_argument("--warmup_proportion", default=0.1, type=float) parser.add_argument("--adam_epsilon", default=1e-8, type=float) args = parser.parse_args() print("Setting the random seed...") random.seed(42) np.random.seed(42) torch.manual_seed(42) torch.backends.cudnn.deterministic = True torch.backends.cudnn.benchmark = False print("Reading data...") df_test_data = pd.read_csv(args.test_data_path, sep="\t") test_data = df_test_data[args.data_column].tolist() test_labels = df_test_data[args.label_column].tolist() label_set = sorted(list(set(df_test_data[args.label_column].values))) test_labels = encode_labels(test_labels, label_set) print("loading model...") device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') if args.model_type == "shebert": tokenizer = BertTokenizer.from_pretrained( "../models/crosloengual-bert-pytorch/vocab.txt") config = BertConfig.from_pretrained( "../models/crosloengual-bert-pytorch/bert_config.json", num_labels=len(label_set)) model = BertForSequenceClassification.from_pretrained( "../models/crosloengual-bert-pytorch/pytorch_model.bin", config=config) elif args.model_type == "mbert": tokenizer = BertTokenizer.from_pretrained( 'bert-base-multilingual-cased', do_lower_case=True) model = BertForSequenceClassification.from_pretrained( 'bert-base-multilingual-cased', num_labels=len(label_set)) else: print("Wrong argument value for model type") sys.exit() output_dir = args.output_dir if not os.path.exists(output_dir): os.mkdir(output_dir) log_path = os.path.join(args.output_dir, "log") print("Evaluating on the test set...") test_dataloader = prepare_labeled_data(test_data, test_labels, tokenizer, args.max_len, args.batch_size) metrics = bert_evaluate(model, test_dataloader, device) with open(log_path, 'a') as f: f.write("Acc: " + str(metrics['accuracy']) + "\n") f.write("F1: " + str(metrics['f1']) + "\n") print("Done.")
def zero_percent(): parser = argparse.ArgumentParser() parser.add_argument("--test_data_path", required=True, type=str) parser.add_argument("--output_dir", required=True, type=str) parser.add_argument("--data_column", required=True, type=str) parser.add_argument("--label_column", required=True, type=str) parser.add_argument("--offensive_label", required=True, type=str) parser.add_argument("--tokenizer_file", type=str) parser.add_argument("--config_file", type=str, required=True) parser.add_argument("--model_file", type=str, required=True) #parser.add_argument("--eval_split", # default=0.1, # type=float) #parser.add_argument("--test_split", # default=0.1, # type=float) parser.add_argument("--max_len", default=256, type=int) parser.add_argument("--batch_size", default=16, type=int) parser.add_argument("--num_epochs", default=4, type=int) parser.add_argument("--learning_rate", default=2e-5, type=float) parser.add_argument("--weight_decay", default=0.01, type=float) parser.add_argument("--warmup_proportion", default=0.1, type=float) parser.add_argument("--adam_epsilon", default=1e-8, type=float) args = parser.parse_args() output_dir = args.output_dir if not os.path.exists(output_dir): os.mkdir(output_dir) log_path = os.path.join(args.output_dir, "log") print("Reading data...") df_test_data = pd.read_csv(args.test_data_path, sep="\t") df_test_data = consolidate_dataset_modified(df_test_data, args.data_column, args.label_column, args.offensive_label) test_data = df_test_data["data"].tolist() test_labels = df_test_data["labels"].tolist() print(test_labels) label_set = sorted(list(set(df_test_data["labels"].values))) test_labels = encode_labels(test_labels, label_set) print(test_labels) print("loading model...") device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') if args.tokenizer_file is not None: tokenizer = BertTokenizer.from_pretrained(args.tokenizer_file) else: tokenizer = BertTokenizer.from_pretrained( 'bert-base-multilingual-cased', do_lower_case=True) config = BertConfig.from_pretrained(args.config_file, num_labels=len(label_set)) model = BertForSequenceClassification.from_pretrained(args.model_file, config=config) print("Evaluating on the test set...") test_dataloader = prepare_labeled_data(test_data, test_labels, tokenizer, args.max_len, args.batch_size) metrics = bert_evaluate(model, test_dataloader, device) with open(log_path, 'a') as f: f.write("Acc: " + str(metrics['accuracy']) + "\n") f.write("F1: " + str(metrics['f1']) + "\n") print("Done.")
def run(self): num_train_epochs = 3 gradient_accumulation_steps = 1 weight_decay = 0.0 learning_rate = 5e-5 adam_epsilon = 1e-8 warmup_steps = 0 seed = 42 logging_steps = 50 train_batch_size = 16 * max(1, torch.cuda.device_count()) train_dataloader = DataLoader(SnliDataset(config_file=self.config_file, mode=TRAIN).get_dataset(), batch_size=train_batch_size, shuffle=True) dev_loader = DataLoader(SnliDataset(config_file=self.config_file, mode=DEV).get_dataset(), batch_size=train_batch_size, shuffle=True) test_loader = DataLoader(SnliDataset(config_file=self.config_file, mode=TEST).get_dataset(), batch_size=train_batch_size, shuffle=True) t_total = len( train_dataloader) // gradient_accumulation_steps * num_train_epochs if self.mode == 'train': self.logger.info('Loading pretrained model') config = BertConfig.from_pretrained(BERT_MODEL, num_labels=3) model = BertForSequenceClassification.from_pretrained( BERT_MODEL, config=config) else: self.logger.info('Loading trained model from local directory') config = BertConfig.from_json_file( f'{self.path}/checkpoint-best/config.json') model = BertForSequenceClassification.from_pretrained( f'{self.path}/checkpoint-best/pytorch_model.bin', config=config) if torch.cuda.device_count() == 1: model = model.cuda() self.logger.info('GPUs used: 1') elif torch.cuda.device_count() > 1: model = model.cuda() model = torch.nn.DataParallel(model) self.logger.info(f'GPUs used: {torch.cuda.device_count()}') else: self.logger.warn('No GPUs used!') no_decay = ['bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], 'weight_decay': weight_decay }, { 'params': [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], 'weight_decay': 0.0 }] optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=adam_epsilon) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total) global_step, accuracy = 0, 0 tr_loss, logging_loss = 0.0, 0.0 model.zero_grad() train_iterator = trange(int(num_train_epochs), desc="Epoch") # Added here for reproductibility (even between python 2 and 3) self.set_seed(seed) if self.mode == 'train': self.logger.info('Running training') for _ in train_iterator: epoch_iterator = tqdm(train_dataloader, desc="Train Iteration") for step, batch in enumerate(epoch_iterator): model.train() batch = tuple(t.cuda() for t in batch) inputs = { 'input_ids': batch[0], 'attention_mask': batch[1], 'token_type_ids': batch[2], 'labels': batch[3] } outputs = model(**inputs) # model outputs are always tuple in transformers (see doc) loss = outputs[0] if torch.cuda.device_count() > 1: loss = loss.mean( ) # mean() to average on multi-gpu parallel training if gradient_accumulation_steps > 1: loss = loss / gradient_accumulation_steps loss.backward() tr_loss += loss.item() if (step + 1) % gradient_accumulation_steps == 0: optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 if global_step % logging_steps == 0: epoch_iterator.set_description( f'Loss: {(tr_loss - logging_loss)/logging_steps}' ) logging_loss = tr_loss eval_acc = self.evaluate(dev_loader, model) self.logger.info(f'Dev accuracy: {eval_acc}') if accuracy < eval_acc: output_dir = os.path.join(self.path, 'checkpoint-{}'.format('best')) self.logger.info(f'Saving best model to {output_dir}') if not os.path.exists(output_dir): os.makedirs(output_dir) # Take care of distributed/parallel training 
model_to_save = model.module if hasattr( model, 'module') else model model_to_save.save_pretrained(output_dir) # torch.save(args, os.path.join(output_dir, 'training_args.bin')) else: eval_acc = self.evaluate(train_dataloader, model) self.logger.info(f'Train Accuracy: {eval_acc}') eval_acc = self.evaluate(dev_loader, model) self.logger.info(f'Dev Accuracy: {eval_acc}') eval_acc = self.evaluate(test_loader, model) self.logger.info(f'Test Accuracy: {eval_acc}')
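    # run() above calls self.evaluate(...), which is not shown in this snippet. Below is a
    # hedged sketch of such a method: plain accuracy over a dataloader whose batches follow
    # the same (input_ids, attention_mask, token_type_ids, labels) layout used in the
    # training loop. It assumes a CUDA device is available, as the surrounding code does.
    def evaluate(self, dataloader, model):
        model.eval()
        correct, total = 0, 0
        with torch.no_grad():
            for batch in dataloader:
                batch = tuple(t.cuda() for t in batch)
                inputs = {
                    'input_ids': batch[0],
                    'attention_mask': batch[1],
                    'token_type_ids': batch[2],
                }
                logits = model(**inputs)[0]
                preds = logits.argmax(dim=-1)
                correct += (preds == batch[3]).sum().item()
                total += batch[3].size(0)
        return correct / max(total, 1)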
def write_results(dataset, lvl, tokens, epochs, batch, test_labels, train_in, test_in, model): """ evaluate all runs for a model and generate the result table row with all results. For flat and per_level approaches :param dataset: dataset to test on :param lvl: lvl to test :param tokens: maximal token length :param epochs: maximal epochs the model was trained on :param batch: batch size for evaluating, same as for training :param test_labels: labels to used for testing :param train_in: what was used for training :param test_in: what will used for testing :param model: path to model :return: the result table row corresponding to the analysis of the given model """ # Simulate config file arguments = { 'model_name': 'bert-base-uncased', 'max_length': tokens, 'epochs': epochs, 'batch_size': batch, 'data_path': dataset, 'lvl': lvl, 'test_labels': test_labels } # Prepare tokenization for evaluation model_name = arguments['model_name'] config = BertConfig.from_pretrained(model_name) config.output_hidden_states = False data, trunest_class_names, test_target = BERT_per_lvl.get_test_data( arguments) # Get test data x = BERT_per_lvl.get_tokenized(model_name, config, data, tokens) # Tokenize test data runs = [ filename for filename in glob.iglob(model + "/**/model", recursive=True) ] # get the 3 runs for each model res_list = [] for run in runs: # for each run evaluate res_list.append(evaluate(run, x, batch, test_target)) # f1_score, accuracy_score # Mean and std for the 3 runs f1_mean, accu_mean = np.mean(res_list, axis=0) f1_std, accu_std = np.std(res_list, axis=0) f1_string = '{:.3f}({:.3f})'.format(f1_mean, f1_std) acc_string = '{:.3f}({:.3f})'.format(accu_mean, accu_std) # For the levels not predicted by this model give "-" out aux = ['-'] * 6 aux[(lvl - 1) * 2] = acc_string aux[(lvl - 1) * 2 + 1] = f1_string # Get the maximum of how many epochs the runs trained before early stopping kicked in _, _, leng, _ = get_model_plot(model) used_ep = len(leng[0]) # Format data to generate a row of the results table table_data = [ "Per_lvl", dataset, '{}({})'.format(epochs, used_ep), tokens, batch, len(runs), train_in, "Cat" + str(lvl), test_in ] + aux return table_data
import torch
from collections import Counter
from transformers import BertTokenizer, BertConfig, BertForQuestionAnswering

model = BertForQuestionAnswering.from_pretrained("./final_model_split")
tokenizer = BertTokenizer.from_pretrained("./final_model_split")
config = BertConfig.from_pretrained("./final_model_split")


def f1_score(pred, ref):
    pred_tokens = list(pred)
    ref_tokens = list(ref)
    common = Counter(pred_tokens) & Counter(ref_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(pred_tokens)
    recall = 1.0 * num_same / len(ref_tokens)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1


def evaluate(predictions, references):
    f1 = total = 0
    for ref, pred in zip(references, predictions):
        total += 1
        f1 += f1_score(pred, ref)
    f1 = 100.0 * f1 / total
    return f1
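# Hedged usage sketch: producing a span prediction for one (question, context) pair with
# the model loaded above and scoring it with evaluate(). The example strings assume a
# Chinese QA checkpoint (the character-level F1 above suggests one) and are illustrative only.
def predict(question, context, max_length=384):
    inputs = tokenizer(question, context, return_tensors="pt",
                       truncation=True, max_length=max_length)
    with torch.no_grad():
        outputs = model(**inputs)
    start = int(outputs[0].argmax())   # start logits
    end = int(outputs[1].argmax())     # end logits
    span_ids = inputs["input_ids"][0][start:end + 1]
    return tokenizer.decode(span_ids, skip_special_tokens=True)


pred = predict("法国的首都是哪里?", "法国的首都是巴黎。")
print(evaluate([pred], ["巴黎"]))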
                                                     start_labels)
    end_loss = nn.CrossEntropyLoss(ignore_index=-1)(end_preds, end_labels)
    class_loss = nn.CrossEntropyLoss()(class_preds, class_labels)
    return start_loss + end_loss + class_loss


def loss_fn_classifier(preds, labels):
    _, _, class_preds = preds
    _, _, class_labels = labels
    class_loss = nn.CrossEntropyLoss()(class_preds, class_labels)
    return class_loss


config = BertConfig.from_pretrained(bert_model)
config.num_labels = 5
tokenizer = BertTokenizer.from_pretrained(bert_model, do_lower_case=True)
model = BertForQuestionAnswering.from_pretrained(
    '/data/sv/CS230_Spring-2020/Guanshuo_TFQA_1stplace/code', config=config)
model = model.to(device)

param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [{
    'params':
    [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
    'weight_decay': 0.01
}, {
    'params':
def main(): parser = argparse.ArgumentParser() parser.add_argument('--adv_type', default='fgm', type=str, choices=['fgm', None]) # /home/bert/ernie1.0_base_zh/torch # /home/bert/bert_base_zh/torch # /home/bert/chinese_roberta_wwm_large_ext_pytorch # /home/bert/roberta_wwm_base_ext_zh/torch parser.add_argument( "--model_name_or_path", default='/home/bert/chinese_roberta_wwm_large_ext_pytorch', type=str, help="Path to pre-trained model ", ) parser.add_argument( "--data_dir", default='../data/Dataset/', type=str, help="Path to data ", ) parser.add_argument( "--task_name", default='pair', type=str, help="The name of the task to train selected in the list: " + ", ".join(PROCESSORS.keys()), ) parser.add_argument( "--output_dir", default='../user_data/tmp_data/checkpoints', type=str, help= "The output directory where the model predictions and checkpoints will be written.", ) # Other parameters parser.add_argument( "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name", ) parser.add_argument( "--tokenizer_name", default="", type=str, help="Pretrained tokenizer name or path if not the same as model_name", ) parser.add_argument( "--max_seq_length", default=64, type=int, help= "The maximum total input sequence length after tokenization. Sequences longer " "than this will be truncated, sequences shorter will be padded.", ) parser.add_argument("--do_train", action="store_true", help="Whether to run training.") parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.") parser.add_argument( "--do_eval_during_train", action="store_true", help="Run evaluation during training at each logging step.", ) parser.add_argument( "--do_lower_case", default=True, type=bool, help="Set this flag if you are using an uncased model.", ) parser.add_argument( "--per_gpu_train_batch_size", default=32, type=int, help="Batch size per GPU/CPU for training.", ) parser.add_argument( "--per_gpu_eval_batch_size", default=32, type=int, help="Batch size per GPU/CPU for evaluation.", ) parser.add_argument( "--gradient_accumulation_steps", type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass.", ) parser.add_argument("--learning_rate", default=2e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--weight_decay", default=0.01, type=float, help="Weight decay if we apply some.") parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") parser.add_argument( "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.", ) parser.add_argument( "--max_steps", default=-1, type=int, help= "If > 0: set total number of training steps to perform. 
Override num_train_epochs.", ) parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.") parser.add_argument("--warmup_rate", default=0.1, type=int, help="Linear warmup over warmup_steps.") parser.add_argument("--logging_steps", type=int, default=50, help="Log every X updates steps.") parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available") parser.add_argument( "--overwrite_output_dir", type=bool, default=True, help="Overwrite the content of the output directory", ) parser.add_argument("--seed", type=int, default=42, help="random seed for initialization") parser.add_argument( "--fp16", action="store_true", help= "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit", ) parser.add_argument( "--fp16_opt_level", type=str, default="O1", help= "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." "See details at https://nvidia.github.io/apex/amp.html", ) parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank") parser.add_argument( "--threads", type=int, default=10, help="multiple threads for converting example to features") args = parser.parse_args() # args.output_dir = os.path.join(args.output_dir, args.task_name) if (os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir): raise ValueError( "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome." .format(args.output_dir)) # Setup CUDA, GPU & distributed training if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") args.n_gpu = torch.cuda.device_count() else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) torch.distributed.init_process_group(backend="nccl") args.n_gpu = 1 args.device = device # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN, ) logger.warning( "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16, ) # Set seed set_seed(args) # Prepare GLUE task args.task_name = args.task_name.lower() if args.task_name not in PROCESSORS: raise ValueError("Task not found: %s" % (args.task_name)) processor = PROCESSORS[args.task_name]() label_list = processor.get_labels() num_labels = len(label_list) # Load pretrained model and tokenizer if args.local_rank not in [-1, 0]: torch.distributed.barrier( ) # Make sure only the first process in distributed training will download model & vocab logger.info("Training/evaluation parameters %s", args) # Training if args.do_train: config = BertConfig.from_pretrained( args.config_name if args.config_name else args.model_name_or_path, num_labels=num_labels) tokenizer = BertTokenizer.from_pretrained( args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, do_lower_case=args.do_lower_case, ) model = BertForSequenceClassification.from_pretrained( args.model_name_or_path, from_tf=bool(".ckpt" in args.model_name_or_path), config=config) if args.local_rank == 0: torch.distributed.barrier( ) # Make sure only the first process in distributed training will download model & 
vocab model.to(args.device) global_step = train(args, tokenizer, model) if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]: os.makedirs(args.output_dir) # Save the trained model and the tokenizer if (args.local_rank == -1 or torch.distributed.get_rank() == 0) and (not args.do_eval_during_train): output_dir = args.output_dir if not os.path.exists(output_dir) and args.local_rank in [-1, 0]: os.makedirs(output_dir) logger.info("Saving model checkpoint to %s", output_dir) model_to_save = ( model.module if hasattr(model, "module") else model ) # Take care of distributed/parallel training model_to_save.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) if args.do_eval and args.local_rank in [-1, 0]: output_dir = args.output_dir model = BertForSequenceClassification.from_pretrained(output_dir) model.to(args.device) tokenizer = BertTokenizer.from_pretrained( output_dir, do_lower_case=args.do_lower_case) result, _ = evaluate(args, model, tokenizer=tokenizer) output_eval_file = os.path.join(output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for k, v in result.items(): logger.info(" {} : {}".format(k, v)) writer.write("{} : {}\n".format(k, v))
def main(args, _=None): """Run the ``catalyst-data text2embeddings`` script.""" batch_size = args.batch_size num_workers = args.num_workers max_length = args.max_length pooling_groups = args.pooling.split(",") utils.set_global_seed(args.seed) utils.prepare_cudnn(args.deterministic, args.benchmark) if getattr(args, "in_huggingface", False): model_config = BertConfig.from_pretrained(args.in_huggingface) model_config.output_hidden_states = args.output_hidden_states model = BertModel.from_pretrained( args.in_huggingface, config=model_config ) tokenizer = BertTokenizer.from_pretrained(args.in_huggingface) else: model_config = BertConfig.from_pretrained(args.in_config) model_config.output_hidden_states = args.output_hidden_states model = BertModel(config=model_config) tokenizer = BertTokenizer.from_pretrained(args.in_vocab) if getattr(args, "in_model", None) is not None: checkpoint = utils.load_checkpoint(args.in_model) checkpoint = {"model_state_dict": checkpoint} utils.unpack_checkpoint(checkpoint=checkpoint, model=model) model = model.eval() model, _, _, _, device = utils.process_components(model=model) df = pd.read_csv(args.in_csv) df = df.dropna(subset=[args.txt_col]) df.to_csv(f"{args.out_prefix}.df.csv", index=False) df = df.reset_index().drop("index", axis=1) df = list(df.to_dict("index").values()) num_samples = len(df) open_fn = LambdaReader( input_key=args.txt_col, output_key=None, lambda_fn=partial( tokenize_text, strip=args.strip, lowercase=args.lowercase, remove_punctuation=args.remove_punctuation, ), tokenizer=tokenizer, max_length=max_length, ) dataloader = utils.get_loader( df, open_fn, batch_size=batch_size, num_workers=num_workers, ) features = {} dataloader = tqdm(dataloader) if args.verbose else dataloader with torch.no_grad(): for idx, batch_input in enumerate(dataloader): batch_input = utils.any2device(batch_input, device) batch_output = model(**batch_input) mask = ( batch_input["attention_mask"].unsqueeze(-1) if args.mask_for_max_length else None ) if utils.check_ddp_wrapped(model): # using several gpu hidden_size = model.module.config.hidden_size hidden_states = model.module.config.output_hidden_states else: # using cpu or one gpu hidden_size = model.config.hidden_size hidden_states = model.config.output_hidden_states batch_features = process_bert_output( bert_output=batch_output, hidden_size=hidden_size, output_hidden_states=hidden_states, pooling_groups=pooling_groups, mask=mask, ) # create storage based on network output if idx == 0: for layer_name, layer_value in batch_features.items(): layer_name = ( layer_name if isinstance(layer_name, str) else f"{layer_name:02d}" ) _, embedding_size = layer_value.shape features[layer_name] = np.memmap( f"{args.out_prefix}.{layer_name}.npy", dtype=np.float32, mode="w+", shape=(num_samples, embedding_size), ) indices = np.arange( idx * batch_size, min((idx + 1) * batch_size, num_samples) ) for layer_name2, layer_value2 in batch_features.items(): layer_name2 = ( layer_name2 if isinstance(layer_name2, str) else f"{layer_name2:02d}" ) features[layer_name2][indices] = _detach(layer_value2)
from transformers import BertTokenizer, BertModel, BertConfig
from torch.nn.utils.rnn import pad_sequence

if len(sys.argv) < 4:
    print("Usage: python makecache.py cacheword_file processed_dir output_file",
          file=sys.stderr)
    sys.exit(-1)

cachewordfile = sys.argv[1]
processed_dir = sys.argv[2]
outputfile = sys.argv[3]

print('Initializing model', file=sys.stderr)
# adjust your model here
config = BertConfig.from_pretrained("bert-base-german-cased", output_hidden_states=True)
tokenizer = BertTokenizer.from_pretrained("bert-base-german-cased")
model = BertModel.from_pretrained("bert-base-german-cased", config=config)
assert model.config.output_hidden_states
model.to('cuda')
model.eval()
torch.set_grad_enabled(False)

with open(cachewordfile) as f:
    cachewords = [x.strip() for x in f.readlines()]
word_index_map = dict(zip(cachewords, range(len(cachewords))))

tokenized_docs = [os.path.join(processed_dir, f) for f in os.listdir(processed_dir)]
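# Hedged sketch (not part of the original script) of how one tokenized document could be
# pushed through the model above to collect contextual vectors for the cache words. It
# assumes each processed file holds plain text, one sentence per line, and only matches
# cache words that survive as whole WordPiece tokens.
cache = {w: [] for w in cachewords}

for path in tokenized_docs:
    with open(path) as doc:
        for line in doc:
            text = line.strip()
            if not text:
                continue
            enc = tokenizer(text, truncation=True, max_length=512,
                            return_tensors="pt").to('cuda')
            hidden = model(**enc)[0][0]          # (seq_len, hidden_size) last layer
            tokens = tokenizer.convert_ids_to_tokens(enc["input_ids"][0].tolist())
            for pos, tok in enumerate(tokens):
                if tok in word_index_map:        # only whole-word tokens match
                    cache[tok].append(hidden[pos].cpu())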
def main(): parser = argparse.ArgumentParser() parser.add_argument("--train_file", default=None, type=str) parser.add_argument("--eval_file", default=None, type=str) parser.add_argument("--model_name_or_path", default=None, type=str) parser.add_argument("--output_dir", default=None, type=str) parser.add_argument( "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model name") parser.add_argument("--vocab_file", default="", type=str, help="vocab file path if not the same as model name") # parser.add_argument("--tokenizer_name", default="", type=str) parser.add_argument("--max_query_len", default=64, type=int) parser.add_argument("--max_seq_len", default=512, type=int) parser.add_argument("--do_train", action="store_true") parser.add_argument("--do_eval", action="store_true") parser.add_argument("--epoch", default=10, type=int) parser.add_argument("--train_batch_size", default=32, type=int) parser.add_argument("--eval_batch_size", default=32, type=int) parser.add_argument("--learning_rate", default=1e-6, type=float) parser.add_argument("--num_training_steps", default=10000, type=int) parser.add_argument("--num_labels", default=2, type=int) args = parser.parse_args() device = torch.device("cuda" if torch.cuda.is_available() else "cpu") args.device = device params = { 'batch_size': args.train_batch_size, 'shuffle': True, 'num_workers': 8, 'collate_fn': my_collate_fn } tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file, do_lower_case=True) config = BertConfig.from_pretrained(args.config_name, num_labels=args.num_labels) config.num_labels = 2 print((config)) model = BertForPassageRerank.from_pretrained(args.model_name_or_path, config=config) model.to(args.device) if args.do_train: print("training...") params = { 'batch_size': args.train_batch_size, 'shuffle': True, 'num_workers': 2, 'collate_fn': my_collate_fn } # tokenizer = BertTokenizer.from_pretrained( # args.tokenizer_name, do_lower_case=True) train_set = PassageData(args.train_file, tokenizer, args.max_query_len, args.max_seq_len) dataloader = DataLoader(train_set, **params) num_train_each_epoch = len(dataloader) print("step: ", num_train_each_epoch) num_training_steps = len(dataloader) * args.epoch no_decay = ['bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], 'weight_decay': 0.01 }, { 'params': [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], 'weight_decay': 0.0 }] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=0, num_training_steps=num_training_steps) for ep in tqdm(range(args.epoch)): running_loss = 0.0 step = 0 for data in tqdm(dataloader): step += 1 # data = data.to(args.device) inputs_ids, masks, segments_ids, \ labels = [x.to(args.device) for x in data] outputs = model(inputs_ids, masks, segments_ids, labels) loss = outputs optimizer.zero_grad() loss.backward() optimizer.step() scheduler.step() if step % 500 == 0: print("loss:", loss) running_loss += loss.item() elif args.do_eval: print("evaling...") model.eval() idmap = "./data/ids_map.json" with open(idmap, 'r') as f: id_map = json.load(f) q_ids = id_map['q_id'] qid_to_pid = id_map['qid_to_pid'] params = { 'batch_size': args.eval_batch_size, 'shuffle': False, 'num_workers': 4, 'collate_fn': my_collate_fn } eval_set = PassageData(args.eval_file, tokenizer, args.max_query_len, args.max_seq_len) 
dataloader = DataLoader(eval_set, **params) results = [] count = 0 fw = open('./data/output.tsv', 'w') i = 0 for data in dataloader: if count == 2: break i += 1 print(i) inputs_ids, masks, \ segments_ids = [x.to(args.device) for x in data] with torch.no_grad(): result = model(inputs_ids, masks, segments_ids) # print("result: ", result) for res in result: results.append(res[1]) if len(results) == 1000: print('greater than 1000') q_id = q_ids[count] pred_passages = torch.argsort(result[:, 1], descending=True, dim=-1) rank = 1 for idx in pred_passages: p_id = qid_to_pid[q_id][idx.item()] if p_id != '000000': fw.write(q_id + '\t' + p_id + '\t' + str(rank + 1) + '\n') rank += 1 count += 1 results = [] fw.close()
        label = torch.tensor(data=label).type(torch.LongTensor)
        return input_ids, token_type_ids, attention_mask, label


print("***********load test data*****************")
config = roBerta_Config()
vocab = Vocab()
train_data, valid_data, test_data = vocab.get_train_dev_test()
test_dataset = BuildDataSet(test_data)
test_load = DataLoader(dataset=test_dataset,
                       batch_size=config.batch_size,
                       shuffle=False,
                       collate_fn=collate_fn)

print("***********load model weight*****************")
model_config = BertConfig.from_pretrained(
    pretrained_model_name_or_path="bert_source/bert_config.json")
model = BertForSequenceClassification(config=model_config)
model.load_state_dict(torch.load('save_bert/best_model.pth.tar'))
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
config.device = device

print("***********make predict for test file*****************")
predict = model_infer(model, config, test_load)
submit_result(predict)
print("***********done*****************")
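# The script above calls model_infer(), which is not shown here. Below is a hedged sketch
# of such a helper; it only assumes that the test loader yields the same
# (input_ids, token_type_ids, attention_mask, label) tuples produced by the collate_fn
# shown at the top of the snippet, and that positive-class probabilities are wanted.
def model_infer(model, config, data_loader):
    model.eval()
    predictions = []
    with torch.no_grad():
        for input_ids, token_type_ids, attention_mask, _ in data_loader:
            input_ids = input_ids.to(config.device)
            token_type_ids = token_type_ids.to(config.device)
            attention_mask = attention_mask.to(config.device)
            logits = model(input_ids=input_ids,
                           token_type_ids=token_type_ids,
                           attention_mask=attention_mask)[0]
            predictions.extend(torch.softmax(logits, dim=-1)[:, 1].cpu().tolist())
    return predictions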
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm, trange
from transformers import (WEIGHTS_NAME, BertConfig,
                          BertForSequenceClassification, BertTokenizer)
from transformers import AdamW, WarmupLinearSchedule
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
import csv

testDataPath = './Data/Bert/bert_end_to_end.csv'
maxLen = 512
pretrained_weights = 'bert-base-uncased'

config = BertConfig.from_pretrained('./Models/Bert', num_labels=2)
tokenizer = BertTokenizer.from_pretrained(pretrained_weights)
model = BertForSequenceClassification.from_pretrained('./Models/Bert', config=config)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

Q, T, A, L = [], [], [], []
with open(testDataPath) as csvfile:
    csv_reader = csv.reader(csvfile, delimiter=',')
    line_count = 0
    for row in csv_reader:
        if line_count == 0:
            line_count += 1
        else:
def inference(onnx_model, model_dir, examples, fast_tokenizer, num_threads): quantized_str = '' if 'quantized' in onnx_model: quantized_str = 'quantized' onnx_inference = [] pytorch_inference = [] # onnx session options = ort.SessionOptions() options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL options.intra_op_num_threads = 1 ort_session = ort.InferenceSession(onnx_model, options) # pytorch pretrained model and tokenizer if fast_tokenizer: tokenizer = BertTokenizerFast.from_pretrained(model_dir) tokenizer_str = "BertTokenizerFast" else: tokenizer = BertTokenizer.from_pretrained(model_dir) tokenizer_str = "BertTokenizer" config = BertConfig.from_pretrained(model_dir) model = BertForSequenceClassification.from_pretrained(model_dir, config=config) #model.to("cpu") print( "**************** {} ONNX inference with batch tokenization and with {} tokenizer****************" .format(quantized_str, tokenizer_str)) start_onnx_inference_batch = time.time() start_batch_tokenization = time.time() tokens_dict = tokenizer.batch_encode_plus(examples, max_length=128) total_batch_tokenization_time = time.time() - start_batch_tokenization total_inference_time = 0 total_build_label_time = 0 for i in range(len(examples)): """ Onnx inference with batch tokenization """ tokens = get_tokens(tokens_dict, i) #inference start_inference = time.time() ort_outs = ort_session.run(None, tokens) total_inference_time = total_inference_time + (time.time() - start_inference) #build label start_build_label = time.time() torch_onnx_output = torch.tensor(ort_outs[0], dtype=torch.float32) onnx_logits = F.softmax(torch_onnx_output, dim=1) logits_label = torch.argmax(onnx_logits, dim=1) label = logits_label.detach().cpu().numpy() onnx_inference.append(label[0]) total_build_label_time = total_build_label_time + (time.time() - start_build_label) end_onnx_inference_batch = time.time() print("Total batch tokenization time (in seconds): ", total_batch_tokenization_time) print("Total inference time (in seconds): ", total_inference_time) print("Total build label time (in seconds): ", total_build_label_time) print( "Duration ONNX inference (in seconds) with {} and batch tokenization: " .format(tokenizer_str), end_onnx_inference_batch - start_onnx_inference_batch) print( "****************{} ONNX inference without batch tokenization and with {} tokenizer****************" .format(quantized_str, tokenizer_str)) start_onnx_inference_no_batch = time.time() total_tokenization_time = 0 total_inference_time = 0 total_build_label_time = 0 for example in examples: """ Onnx inference without batch tokenization """ #input_ids, input_mask, segment_ids = preprocess(tokenizer, example) #tokenization start_tokenization = time.time() tokens = tokenizer.encode_plus(example) tokens = {name: np.atleast_2d(value) for name, value in tokens.items()} total_tokenization_time = total_tokenization_time + ( time.time() - start_tokenization) #inference start_inference = time.time() ort_outs = ort_session.run(None, tokens) total_inference_time = total_inference_time + (time.time() - start_inference) #build_label start_build_label = time.time() torch_onnx_output = torch.tensor(ort_outs[0], dtype=torch.float32) onnx_logits = F.softmax(torch_onnx_output, dim=1) logits_label = torch.argmax(onnx_logits, dim=1) label = logits_label.detach().cpu().numpy() onnx_inference.append(label[0]) total_build_label_time = total_build_label_time + (time.time() - start_build_label) end_onnx_inference_no_batch 
= time.time() print("One-by-one total tokenization time (in seconds): ", total_tokenization_time) print("Total inference time (in seconds): ", total_inference_time) print("Total build label time (in seconds): ", total_build_label_time) print( "Duration ONNX inference (in seconds) with {} and one-by-one tokenization: " .format(tokenizer_str), end_onnx_inference_no_batch - start_onnx_inference_no_batch) print( "****************Torch inference without batch tokenization, without quantization and with {} tokenizer****************" .format(tokenizer_str)) start_torch_inference_no_quantization = time.time() total_tokenization_time = 0 total_inference_time = 0 total_build_label_time = 0 for example in examples: """ Pretrained bert pytorch model """ # tokenization start_tokenization = time.time() input_ids, input_mask, segment_ids = preprocess(tokenizer, example) total_tokenization_time = total_tokenization_time + ( time.time() - start_tokenization) # inference start_inference = time.time() torch_out = inference_pytorch(model, input_ids, input_mask, segment_ids, quantization=False, num_threads=num_threads) total_inference_time = total_inference_time + (time.time() - start_inference) # build label start_build_label = time.time() logits_label = torch.argmax(torch_out, dim=1) label = logits_label.detach().cpu().numpy() pytorch_inference.append(label[0]) total_build_label_time = total_build_label_time + (time.time() - start_build_label) end_torch_inference_no_quantization = time.time() print("One-by-one total tokenization time (in seconds): ", total_tokenization_time) print("Total inference time (in seconds): ", total_inference_time) print("Total build label time (in seconds): ", total_build_label_time) print( "Duration PyTorch inference (in seconds) with {}, without quantization and with {} threads: " .format(tokenizer_str, num_threads), end_torch_inference_no_quantization - start_torch_inference_no_quantization) print( "****************Torch inference without batch tokenization, with quantization and with {} tokenizer****************" .format(tokenizer_str)) start_torch_inference_w_quantization = time.time() total_tokenization_time = 0 total_inference_time = 0 total_build_label_time = 0 for example in examples: """ Pretrained bert pytorch model """ # tokenization start_tokenization = time.time() input_ids, input_mask, segment_ids = preprocess(tokenizer, example) total_tokenization_time = total_tokenization_time + ( time.time() - start_tokenization) # inference start_inference = time.time() torch_out = inference_pytorch(model, input_ids, input_mask, segment_ids, quantization=False, num_threads=num_threads) total_inference_time = total_inference_time + (time.time() - start_inference) # build label start_build_label = time.time() logits_label = torch.argmax(torch_out, dim=1) label = logits_label.detach().cpu().numpy() pytorch_inference.append(label[0]) total_build_label_time = total_build_label_time + (time.time() - start_build_label) end_torch_inference_w_quantization = time.time() print("One-by-one total tokenization time (in seconds): ", total_tokenization_time) print("Total inference time (in seconds): ", total_inference_time) print("Total build label time (in seconds): ", total_build_label_time) print( "Duration PyTorch inference (in seconds) with {} and with quantization and with {} threads: " .format(tokenizer_str, num_threads), end_torch_inference_w_quantization - start_torch_inference_w_quantization) # # # compare ONNX Runtime and PyTorch results # 
np.testing.assert_allclose(to_numpy(torch_out), onnx_logits, rtol=1e-03, atol=1e-05) # # print("Exported model has been tested with ONNXRuntime, and the result looks good!") return onnx_inference, pytorch_inference
    def __init__(self):
        super().__init__()
        config = BertConfig.from_pretrained("bert-base-uncased")
        self.model = BertModel(config)
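    # Hedged sketch of a forward pass for the wrapper above (not part of the original
    # snippet): it returns the final hidden state of the [CLS] token. The argument names
    # simply mirror BertModel's API; note that BertModel(config) above starts from random
    # weights rather than the pretrained checkpoint.
    def forward(self, input_ids, attention_mask=None, token_type_ids=None):
        outputs = self.model(input_ids=input_ids,
                             attention_mask=attention_mask,
                             token_type_ids=token_type_ids)
        return outputs[0][:, 0]    # (batch, hidden_size) [CLS] vectors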
def train_and_test(): visual_features = pkl.load( open('tf_features/visual_features_facenet.pkl', 'rb')) audio_features = pkl.load(open('tf_features/audio_features.pkl', 'rb')) x = pkl.load(open('tf_features/linguistic_features.pkl', 'rb')) token_type_ids = pkl.load(open('tf_features/token_type_ids.pkl', 'rb')) attention_mask = pkl.load(open('tf_features/attention_mask.pkl', 'rb')) labels = pkl.load(open('tf_features/labels.pkl', 'rb')) cv5_ids = pkl.load(open('tf_features/cv5_ids.pkl', 'rb')) visual_dim = visual_features.shape[-1] audio_dim = audio_features.shape[-1] print(visual_dim, audio_dim) sp = cv5_ids[0] train_l, train_labels = x[sp[0]], labels[sp[0]] train_v = visual_features[sp[0]] train_a = audio_features[sp[0]] test_l, test_labels = x[sp[1]], labels[sp[1]] test_v = visual_features[sp[1]] test_a = audio_features[sp[1]] print(train_v.shape) train_token_type_ids, test_token_type_ids, train_attention_mask, test_attention_mask = token_type_ids[sp[0]], \ token_type_ids[sp[1]], attention_mask[sp[0]], attention_mask[sp[1]] # shuffle training data for batch reading n_train = len(train_v) n_eval = len(test_v) perm = np.random.permutation(n_train) train_l, train_a, train_v = train_l[perm], train_a[perm], train_v[perm] print(train_l.shape, train_a.shape, train_v.shape) train_labels = np.array(train_labels)[perm] train_token_type_ids, train_attention_mask = train_token_type_ids[ perm], train_attention_mask[perm] train_l, test_l, train_labels, test_labels, train_token_type_ids, test_token_type_ids = torch.LongTensor(train_l), \ torch.LongTensor(test_l), \ torch.LongTensor(train_labels), \ torch.LongTensor(test_labels), \ torch.LongTensor(train_token_type_ids), \ torch.LongTensor(test_token_type_ids) train_a, test_a, train_v, test_v = torch.FloatTensor(train_a), torch.FloatTensor(test_a), \ torch.FloatTensor(train_v), torch.FloatTensor(test_v) train_attention_mask, test_attention_mask = torch.FloatTensor(train_attention_mask), \ torch.FloatTensor(test_attention_mask) config = BertConfig.from_pretrained('bert-base-uncased', num_labels=3) config.visual_dim = visual_dim config.audio_dim = audio_dim bert_external = BertModel.from_pretrained('bert-base-uncased').to('cuda') bert_insert = mBertModel(config) bert_insert.embeddings = bert_external.embeddings bert_insert.encoder = bert_external.encoder bert_insert.pooler = bert_external.pooler model = mBertModel(config).to('cuda') eval_every = 5 batch_size = 32 test_batch_size = 4 max_epochs = 500 t_total = math.ceil(n_train / batch_size) * max_epochs lr = 2e-5 epsilon = 1e-8 max_grad_norm = 1.0 weight_decay = 0.0 optimizer, scheduler = get_optimizers(model, learning_rate=lr, adam_epsilon=epsilon, weight_decay=weight_decay, num_training_steps=t_total) # loss_fn = torch.nn.CrossEntropyLoss().cuda() model.train() model.zero_grad() day = time.localtime().tm_mday minute = time.localtime().tm_min hour = time.localtime().tm_hour save_dir = 'fine_tuning_checkpoints/' + '-%d-%d-%d/' % (day, hour, minute) # os.mkdir(save_dir) for ep in range(max_epochs): idx = 0 avg_loss = 0 n_batch = 0 model.train() while idx < n_train: optimizer.zero_grad() batch_l = train_l[idx:(idx + batch_size)].to('cuda') batch_v = train_v[idx:(idx + batch_size)].to('cuda') batch_a = train_a[idx:(idx + batch_size)].to('cuda') batch_ty = train_token_type_ids[idx:(idx + batch_size)].to('cuda') batch_am = train_attention_mask[idx:(idx + batch_size)].to('cuda') ans = train_labels[idx:(idx + batch_size)].to('cuda') idx += batch_size preds = model(input_ids=batch_l, input_visual=batch_v, 
input_audio=batch_a, token_type_ids=batch_ty, attention_mask=batch_am, labels=ans) loss = preds[0] # print(preds, ans) loss.backward() # print(loss.data.cpu().numpy()) avg_loss += loss.data.cpu().numpy() n_batch += 1. torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm) optimizer.step() scheduler.step() model.zero_grad() torch.cuda.empty_cache() avg_loss = avg_loss / n_batch print("epoch: %d avg_loss: %f" % (ep + 1, avg_loss)) del batch_l, batch_v, batch_a, batch_ty, batch_am, ans torch.cuda.empty_cache() # time.sleep(20) if ep % eval_every == 0: idx = 0 model.eval() eval_preds = np.array([]) while idx < n_eval: test_batch_v = test_v[idx:(idx + test_batch_size)].to('cuda') test_batch_l = test_l[idx:(idx + test_batch_size)].to('cuda') test_batch_a = test_a[idx:(idx + test_batch_size)].to('cuda') test_batch_ty = test_token_type_ids[idx:( idx + test_batch_size)].to('cuda') test_batch_am = test_attention_mask[idx:( idx + test_batch_size)].to('cuda') test_ans = test_labels[idx:(idx + test_batch_size)].to('cuda') # time.sleep(20) # exit() test_pred = model(input_ids=test_batch_l, input_visual=test_batch_v, input_audio=test_batch_a, token_type_ids=test_batch_ty, attention_mask=test_batch_am, labels=test_ans) scores = test_pred[1] _, batch_eval_preds = scores.data.cpu().max(1) eval_preds = np.concatenate((eval_preds, batch_eval_preds), axis=-1) idx += test_batch_size torch.cuda.empty_cache() del test_batch_l, test_batch_v, test_batch_a, test_batch_ty, test_batch_am, test_ans torch.cuda.empty_cache() # metrics precison, recall, fscore, support = precision_recall_fscore_support( test_labels.cpu().numpy(), eval_preds, labels=[0, 1, 2], average=None) print( float(sum(eval_preds == test_labels.cpu().numpy())) / len(eval_preds)) print(precison, recall, fscore, support) print('saving:') '''model_dir = save_dir + '%d' % (ep+1)
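# get_optimizers is defined elsewhere in this project; below is a minimal sketch of
# what it plausibly does, assuming torch's AdamW plus transformers' linear warmup/decay
# schedule. The num_warmup_steps default is an assumption, not taken from the original code.
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup


def get_optimizers(model, learning_rate, adam_epsilon, weight_decay,
                   num_training_steps, num_warmup_steps=0):
    optimizer = AdamW(model.parameters(), lr=learning_rate, eps=adam_epsilon,
                      weight_decay=weight_decay)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_training_steps)
    return optimizer, scheduler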
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--data_dir", default=None, type=str, required=True,
                        help="Data directory; should contain train.text and dev.text")
    parser.add_argument("--vob_file", default=None, type=str, required=True,
                        help="Vocabulary file")
    parser.add_argument("--model_config", default=None, type=str, required=True,
                        help="Model configuration JSON file")
    parser.add_argument("--pre_train_model", default=None, type=str, required=True,
                        help="Pretrained model file (weight matrices); loaded if it exists")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="Directory for output results")
    # Other parameters
    parser.add_argument("--max_seq_length", default=128, type=int,
                        help="Maximum input length fed to BERT; normally should not exceed 512")
    parser.add_argument("--do_train", action='store_true',
                        help="Whether to run training")
    parser.add_argument("--train_batch_size", default=8, type=int,
                        help="Batch size for the training set")
    parser.add_argument("--eval_batch_size", default=8, type=int,
                        help="Batch size for the validation set")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of gradient accumulation steps, to compensate for limited GPU memory")
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="Learning rate")
    parser.add_argument("--weight_decay", default=0.0, type=float,
                        help="Weight decay")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float,
                        help="Maximum gradient norm for clipping")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Number of training epochs")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")
    parser.add_argument("--warmup_steps", default=0, type=int,
                        help="Number of steps over which the learning rate warms up to its peak; after warmup_steps it decays to 0")
    args = parser.parse_args()

    assert os.path.exists(args.data_dir)
    assert os.path.exists(args.vob_file)
    assert os.path.exists(args.model_config)
    assert os.path.exists(args.pre_train_model)

    args.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO)  # filename='./output/bert-sim.log',

    processor = SimProcessor()
    tokenizer_inputs = ()
    tokenizer_kwards = {
        'do_lower_case': False,
        'max_len': args.max_seq_length,
        'vocab_file': args.vob_file
    }
    tokenizer = BertTokenizer(*tokenizer_inputs, **tokenizer_kwards)

    train_dataset = load_and_cache_example(args, tokenizer, processor, 'train')
    eval_dataset = load_and_cache_example(args, tokenizer, processor, 'dev')
    test_dataset = load_and_cache_example(args, tokenizer, processor, 'test')

    bert_config = BertConfig.from_pretrained(args.model_config)
    bert_config.num_labels = len(processor.get_labels())
    model_kwargs = {'config': bert_config}
    model = BertForSequenceClassification.from_pretrained(args.pre_train_model, **model_kwargs)
    model = model.to(args.device)

    if args.do_train:
        trains(args, train_dataset, eval_dataset, model)
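# SimProcessor and load_and_cache_example are not shown in this snippet. The sketch
# below is only an assumption about the processor's interface (two similarity labels,
# tab-separated "text_a<TAB>text_b<TAB>label" rows); the real class may differ.
import os


class SimProcessorSketch:
    """Hypothetical stand-in for SimProcessor: sentence-pair similarity data."""

    def get_labels(self):
        return ["0", "1"]  # assumed: not-similar / similar

    def get_examples(self, data_dir, split):
        examples = []
        with open(os.path.join(data_dir, "{}.text".format(split)), encoding="utf-8") as f:
            for i, line in enumerate(f):
                text_a, text_b, label = line.rstrip("\n").split("\t")
                examples.append({"guid": "{}-{}".format(split, i),
                                 "text_a": text_a, "text_b": text_b, "label": label})
        return examples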
import json
import pickle
from pprint import pprint

import torch
from transformers import AutoModel, AutoTokenizer, BertConfig

# CAIPDataset, DecoderWithLoss, EncoderDecoderWithLoss and beam_search are
# project-local helpers of the craftassist semantic parser and are assumed to be
# importable from the surrounding package.

model = "python/craftassist/models/semantic_parser/ttad_bert_updated/caip_test_model.pth"
args_path = "python/craftassist/models/semantic_parser/ttad_bert_updated/caip_test_model_args.pk"
args = pickle.load(open(args_path, "rb"))
tokenizer = AutoTokenizer.from_pretrained(args.pretrained_encoder_name)
full_tree, tree_i2w = json.load(open(args.tree_voc_file))
dataset = CAIPDataset(tokenizer, args, prefix="", full_tree_voc=(full_tree, tree_i2w))

enc_model = AutoModel.from_pretrained(args.pretrained_encoder_name)
bert_config = BertConfig.from_pretrained("bert-base-uncased")
bert_config.is_decoder = True
bert_config.vocab_size = len(tree_i2w) + 8
bert_config.num_hidden_layers = args.num_decoder_layers
dec_with_loss = DecoderWithLoss(bert_config, args, tokenizer)
encoder_decoder = EncoderDecoderWithLoss(enc_model, dec_with_loss, args)
encoder_decoder.load_state_dict(torch.load(model))
encoder_decoder = encoder_decoder.cuda()
_ = encoder_decoder.eval()


def get_beam_tree(chat, noop_thres=0.95, beam_size=5, well_formed_pen=1e2):
    btr = beam_search(chat, encoder_decoder, tokenizer, dataset, beam_size, well_formed_pen)
    if btr[0][0].get("dialogue_type",
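# A small usage sketch for the parser set up above. The chat string is made up and the
# exact tree depends on the loaded checkpoint; get_beam_tree is assumed to return the
# highest-scoring action dictionary.
if __name__ == "__main__":
    tree = get_beam_tree("build a small red cube next to me", beam_size=5)
    pprint(tree)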
def main(): parser = argparse.ArgumentParser() # Required parameters parser.add_argument("--task_name", default=None, type=str, required=True, help="The name of task is selected in [imdb, amazon]") parser.add_argument("--data_dir", default=None, type=str, required=True, help="The input data dir.") parser.add_argument("--cache_dir", default='../cache', type=str, help="The cache data dir.") parser.add_argument( '--model_type', default=None, type=str, required=True, help="Model type selected in [bert, xlnet, xlm, cnn, lstm]") parser.add_argument( '--model_name_or_path', default='bert-base-uncased', type=str, help="Shortcut name is selected in [bert-base-uncased, ]") parser.add_argument( '--output_dir', default='../out', type=str, help= "The output directory where the model predictions and checkpoints will be written." ) parser.add_argument( "--skip", default=20, type=int, help="Evaluate one testing point every skip testing point.") parser.add_argument("--num_random_sample", default=5000, type=int, help="The number of random samples of each texts.") parser.add_argument("--similarity_threshold", default=0.8, type=float, help="The similarity constraint to be " "considered as synonym.") parser.add_argument("--perturbation_constraint", default=100, type=int, help="The maximum size of perturbation " "set of each word.") parser.add_argument( "--mc_error", default=0.01, type=float, help="Monte Carlo Error based on concentration inequality.") parser.add_argument("--train_type", default='normal', type=str, help="Train type is selected in [normal, rs].") # other parameters parser.add_argument( "--max_seq_length", default=128, type=int, help="The maximum total input sequence length after tokenization.") parser.add_argument("--batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.") parser.add_argument("--ckpt", default=-1, type=int, help="Which ckpt to load.") parser.add_argument("--seed", default=42, type=int, help="Random seed for initializaiton.") args = parser.parse_args() device = torch.device("cuda" if torch.cuda.is_available() else "cpu") args.device = device logging.basicConfig( format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt='%m/%d/%Y %H:%M:%S', level=logging.INFO) logger.warning("model type: %s, task name: %s, device: %s, train_type: %s", args.model_type, args.task_name, device, args.train_type) set_seed(args) if args.task_name not in processors: raise ValueError("Task not found: %s" % args.task_name) task_class = processors[args.task_name]() label_list = task_class.get_labels() num_labels = len(label_list) args.num_labels = num_labels # load vocab. 
word2index = None if args.model_type != 'bert': with open( args.cache_dir + '/{}_vocab_train.pkl'.format(args.task_name), 'rb') as f: vocab = pickle.load(f) index2word = vocab['index2word'] word2index = vocab['word2index'] word_mat = vocab['word_mat'] args.word_mat = word_mat args.vocab_size = len(index2word) tokenizer = None if args.model_type == 'bert': tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path, do_lower_case=True) args.vocab_size = tokenizer.vocab_size config = BertConfig.from_pretrained(args.model_name_or_path, num_labels=num_labels, finetuning_task=args.task_name) model = BertForSequenceClassification.from_pretrained( args.model_name_or_path, config=config) elif args.model_type == 'bow': args.embed_size = 300 args.hidden_size = 100 model = BOWModel(word_mat, n_vocab=args.vocab_size, embed_size=args.embed_size, hidden_size=args.hidden_size, num_classes=args.num_labels) elif args.model_type == 'decom_att': # No using args.embed_size = 300 args.hidden_size = 100 model = DecompAttentionModel(word_mat, n_vocab=args.vocab_size, embed_size=args.embed_size, hidden_size=args.hidden_size, num_classes=args.num_labels) elif args.model_type == 'esim': args.embed_size = 300 args.hidden_size = 100 model = ESIM(vocab_size=args.vocab_size, embedding_dim=args.embed_size, hidden_size=args.hidden_size, embeddings=torch.tensor(word_mat).float(), padding_idx=0, dropout=0.1, num_classes=args.num_labels, device=args.device) else: raise ValueError('model type is not found!') model.to(device) similarity_threshold = args.similarity_threshold perturbation_constraint = args.perturbation_constraint perturbation_file = args.cache_dir + '/' + args.task_name + '_perturbation_constraint_pca' + str( similarity_threshold) + "_" + str(perturbation_constraint) + '.pkl' with open(perturbation_file, 'rb') as f: perturb = pickle.load(f) # random smooth random_smooth = WordSubstitute(perturb) # generate randomized data randomize_testset(args, random_smooth, similarity_threshold, perturbation_constraint) # calculate total variation calculate_tv_perturb(args, perturb) # Evaluation if args.ckpt < 0: checkpoints = glob.glob( args.output_dir + '/{}_{}_{}_checkpoint-*'.format( args.train_type, args.task_name, args.model_type)) checkpoints.sort(key=lambda x: int(x.split('-')[-1])) checkpoint = checkpoints[-1] else: checkpoint = os.path.join( args.output_dir, '{}_{}_{}_checkpoint-{}'.format(args.train_type, args.task_name, args.model_type, args.ckpt)) print("Evaluation result, load model from {}".format(checkpoint)) model = load(args, checkpoint) randomized_evaluate(args, model, tokenizer, word2index)
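# WordSubstitute comes from the project and is constructed from the pickled perturbation
# sets loaded above. The class below is only an illustrative sketch, assuming perturb
# maps each word to a collection of allowed substitutions; the real pickle layout and
# method names may differ.
import random


class WordSubstituteSketch:
    def __init__(self, perturb):
        self.perturb = perturb  # word -> candidate substitutions within the similarity threshold

    def sample(self, sentence, num_samples):
        """Return num_samples randomly word-substituted copies of the sentence."""
        words = sentence.split()
        out = []
        for _ in range(num_samples):
            substituted = [
                random.choice(list(self.perturb[w])) if self.perturb.get(w) else w
                for w in words
            ]
            out.append(" ".join(substituted))
        return out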
def main(): parser = argparse.ArgumentParser() parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") args = parser.parse_args() torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) torch.distributed.init_process_group(backend="nccl") args.device = device seed = 30004 random.seed(seed) np.random.seed(seed) torch.manual_seed(seed) torch.cuda.manual_seed(seed) torch.backends.cudnn.deterministic = True # prepare input import pickle with open('../1_1/distribution_dict1.pickle', 'rb') as f: distribution_dict1 = pickle.load(f) with open('../1_1/distribution_dict2.pickle', 'rb') as f: distribution_dict2 = pickle.load(f) with open('../1_1/distribution_dict3.pickle', 'rb') as f: distribution_dict3 = pickle.load(f) with open('../1_1/distribution_dict4.pickle', 'rb') as f: distribution_dict4 = pickle.load(f) json_dir = '../../input/simplified-nq-train.jsonl' max_data = 9999999999 id_list = [] neg_id_list = [] data_dict = {} neg_data_dict = {} with open(json_dir) as f: for n, line in tqdm(enumerate(f)): if n > max_data: break data = json.loads(line) is_pos = False annotations = data['annotations'][0] if annotations['yes_no_answer'] == 'YES': is_pos = True elif annotations['yes_no_answer'] == 'NO': is_pos = True elif annotations['short_answers']: is_pos = True elif annotations['long_answer']['candidate_index'] != -1: is_pos = True if is_pos and len(data['long_answer_candidates'])>1: data_id = data['example_id'] id_list.append(data_id) # random sampling if data_id in distribution_dict1: candidate_index_list = np.array(distribution_dict1[data_id]['candidate_index_list']) prob_list = np.power(np.array(distribution_dict1[data_id]['prob_list']),1) elif data_id in distribution_dict2: candidate_index_list = np.array(distribution_dict2[data_id]['candidate_index_list']) prob_list = np.power(np.array(distribution_dict2[data_id]['prob_list']),1) elif data_id in distribution_dict3: candidate_index_list = np.array(distribution_dict3[data_id]['candidate_index_list']) prob_list = np.power(np.array(distribution_dict3[data_id]['prob_list']),1) else: candidate_index_list = np.array(distribution_dict4[data_id]['candidate_index_list']) prob_list = np.power(np.array(distribution_dict4[data_id]['prob_list']),1) prob_list /= sum(prob_list) negative_candidate_index = random_sample_negative_candidates(candidate_index_list, prob_list) # doc_words = data['document_text'].split() # negative candidate = data['long_answer_candidates'][negative_candidate_index] negative_candidate_words = doc_words[candidate['start_token']:candidate['end_token']] negative_candidate_start = candidate['start_token'] negative_candidate_end = candidate['end_token'] # positive candidate = data['long_answer_candidates'][annotations['long_answer']['candidate_index']] positive_candidate_words = doc_words[candidate['start_token']:candidate['end_token']] positive_candidate_start = candidate['start_token'] positive_candidate_end = candidate['end_token'] # initialize data_dict data_dict[data_id] = { 'question_text': data['question_text'], 'annotations': data['annotations'], 'positive_text': positive_candidate_words, 'positive_start': positive_candidate_start, 'positive_end': positive_candidate_end, 'negative_text': negative_candidate_words, 'negative_start': negative_candidate_start, 'negative_end': negative_candidate_end, } elif (not is_pos) and len(data['long_answer_candidates'])>=1: data_id = data['example_id'] neg_id_list.append(data_id) # random sampling if data_id in 
distribution_dict1: candidate_index_list = np.array(distribution_dict1[data_id]['candidate_index_list']) prob_list = np.power(np.array(distribution_dict1[data_id]['prob_list']),1) elif data_id in distribution_dict2: candidate_index_list = np.array(distribution_dict2[data_id]['candidate_index_list']) prob_list = np.power(np.array(distribution_dict2[data_id]['prob_list']),1) elif data_id in distribution_dict3: candidate_index_list = np.array(distribution_dict3[data_id]['candidate_index_list']) prob_list = np.power(np.array(distribution_dict3[data_id]['prob_list']),1) else: candidate_index_list = np.array(distribution_dict4[data_id]['candidate_index_list']) prob_list = np.power(np.array(distribution_dict4[data_id]['prob_list']),1) prob_list /= sum(prob_list) negative_candidate_index = random_sample_negative_candidates(candidate_index_list, prob_list) # doc_words = data['document_text'].split() # negative candidate = data['long_answer_candidates'][negative_candidate_index] negative_candidate_words = doc_words[candidate['start_token']:candidate['end_token']] negative_candidate_start = candidate['start_token'] negative_candidate_end = candidate['end_token'] # initialize data_dict neg_data_dict[data_id] = { 'question_text': data['question_text'], 'negative_text': negative_candidate_words, 'negative_start': negative_candidate_start, 'negative_end': negative_candidate_end, } print(len(id_list), len(neg_id_list)) random.shuffle(id_list) random.shuffle(neg_id_list) # length of neg_id_list must be longer than id_list otherwise data generator will error # hyperparameters max_seq_len = 360 max_question_len = 64 learning_rate = 0.000002 batch_size = 3 ep = 0 # build model if args.local_rank not in [-1, 0]: # Make sure only the first process in distributed training will download model & vocab torch.distributed.barrier() model_path = 'model/' config = BertConfig.from_pretrained(model_path) config.num_labels = 5 config.vocab_size = 30531 tokenizer = BertTokenizer.from_pretrained(model_path, do_lower_case=True) model = BertForQuestionAnswering.from_pretrained('weights/epoch2/', config=config) # add new tokens new_token_dict = { '<P>':'qw1', '<Table>':'qw2', '<Tr>':'qw3', '<Ul>':'qw4', '<Ol>':'qw5', '<Fl>':'qw6', '<Li>':'qw7', '<Dd>':'qw8', '<Dt>':'qw9', } new_token_list = [ 'qw1', 'qw2', 'qw3', 'qw4', 'qw5', 'qw6', 'qw7', 'qw8', 'qw9', ] num_added_toks = tokenizer.add_tokens(new_token_list) print('We have added', num_added_toks, 'tokens') model.resize_token_embeddings(len(tokenizer)) if args.local_rank == 0: # Make sure only the first process in distributed training will download model & vocab torch.distributed.barrier() model.to(args.device) optimizer = optim.Adam(model.parameters(), lr=learning_rate) model, optimizer = amp.initialize(model, optimizer, opt_level="O1",verbosity=0) model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True ) # training # iterator for training train_datagen = TFQADataset(id_list=id_list, neg_id_list=neg_id_list) train_sampler = DistributedSampler(train_datagen) train_collate = Collator(id_list=id_list, neg_id_list=neg_id_list, data_dict=data_dict, neg_data_dict=neg_data_dict, new_token_dict=new_token_dict, tokenizer=tokenizer, max_seq_len=max_seq_len, max_question_len=max_question_len) train_generator = DataLoader(dataset=train_datagen, sampler=train_sampler, collate_fn=train_collate, batch_size=batch_size, num_workers=3, pin_memory=True) # train losses1 = AverageMeter() # start losses2 = 
AverageMeter()  # end
    losses3 = AverageMeter()  # class
    accuracies1 = AverageMeter()  # start
    accuracies2 = AverageMeter()  # end
    accuracies3 = AverageMeter()  # class
    model.train()
    for j, (batch_input_ids, batch_attention_mask, batch_token_type_ids,
            batch_y_start, batch_y_end, batch_y) in enumerate(train_generator):
        batch_input_ids = batch_input_ids.cuda()
        batch_attention_mask = batch_attention_mask.cuda()
        batch_token_type_ids = batch_token_type_ids.cuda()
        labels1 = batch_y_start.cuda()
        labels2 = batch_y_end.cuda()
        labels3 = batch_y.cuda()
        logits1, logits2, logits3 = model(batch_input_ids, batch_attention_mask,
                                          batch_token_type_ids)
        y_true = (batch_y_start, batch_y_end, batch_y)
        loss1, loss2, loss3 = loss_fn((logits1, logits2, logits3),
                                      (labels1, labels2, labels3))
        loss = loss1 + loss2 + loss3
        acc1, n_position1 = get_position_accuracy(logits1, labels1)
        acc2, n_position2 = get_position_accuracy(logits2, labels2)
        acc3, n_position3 = get_position_accuracy(logits3, labels3)
        losses1.update(loss1.item(), n_position1)
        losses2.update(loss2.item(), n_position2)
        losses3.update(loss3.item(), n_position3)
        accuracies1.update(acc1, n_position1)
        accuracies2.update(acc2, n_position2)
        accuracies3.update(acc3, n_position3)
        optimizer.zero_grad()
        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()
        optimizer.step()
    # log the epoch summary and save the fine-tuned weights (rank 0 only)
    if args.local_rank == 0:
        print('epoch: {}, train_loss1: {}, train_loss2: {}, train_loss3: {}, '
              'train_acc1: {}, train_acc2: {}, train_acc3: {}'.format(
                  ep, losses1.avg, losses2.avg, losses3.avg,
                  accuracies1.avg, accuracies2.avg, accuracies3.avg), flush=True)
        out_dir = 'weights/epoch3/'
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)
        torch.save(model.module.state_dict(), out_dir + 'pytorch_model.bin')
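# AverageMeter, get_position_accuracy, and loss_fn come from the surrounding project.
# Below is a sketch of how the two bookkeeping helpers are commonly written; the exact
# definitions used here are assumptions.
class AverageMeter:
    """Running average, weighted by the number of items passed to each update."""

    def __init__(self):
        self.sum = 0.0
        self.count = 0
        self.avg = 0.0

    def update(self, val, n=1):
        self.sum += val * n
        self.count += n
        self.avg = self.sum / max(self.count, 1)


def get_position_accuracy(logits, labels):
    """Fraction of examples whose argmax prediction matches the label, plus the count."""
    preds = logits.argmax(dim=-1)
    correct = (preds == labels).sum().item()
    n = labels.size(0)
    return correct / n, n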
import math
import random
import re
import time
from random import shuffle

import numpy as np
# explicit imports for names used below; some may also be re-exported by EncDecStructure
import torch
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.translate.bleu_score import sentence_bleu
from sentence_transformers import SentenceTransformer
from transformers import BertTokenizer, BertForMaskedLM, BertConfig

from EncDecStructure import *

nltk.download('stopwords')
nltk.download('punkt')
stop_words = set(stopwords.words('english'))

model_version = 'bert-base-uncased'
config = BertConfig.from_pretrained(model_version, output_hidden_states=False)
model = BertForMaskedLM.from_pretrained(model_version, config=config)
model.train()
cuda = torch.cuda.is_available()
if cuda:
    model = model.cuda()

tokenizer = BertTokenizer.from_pretrained(model_version,
                                          do_lower_case=model_version.endswith("uncased"))
CLS = '[CLS]'
SEP = '[SEP]'
MASK = '[MASK]'
mask_id = tokenizer.convert_tokens_to_ids([MASK])[0]
sep_id = tokenizer.convert_tokens_to_ids([SEP])[0]
cls_id = tokenizer.convert_tokens_to_ids([CLS])[0]

# sentence embedding model used for similarity scoring
model2 = SentenceTransformer('bert-base-nli-mean-tokens')
model2.eval()
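# Quick sanity check for the masked-LM setup above: mask one token and look at BERT's
# top prediction. The sentence is made up; the predicted token depends on the model.
example = "the capital of france is [MASK] ."
example_tokens = [CLS] + tokenizer.tokenize(example) + [SEP]
example_ids = torch.tensor([tokenizer.convert_tokens_to_ids(example_tokens)])
if cuda:
    example_ids = example_ids.cuda()

model.eval()  # temporarily leave train mode for this check
with torch.no_grad():
    mlm_logits = model(input_ids=example_ids).logits  # (1, seq_len, vocab_size)
mask_pos = example_tokens.index(MASK)
top_id = mlm_logits[0, mask_pos].argmax().item()
print(tokenizer.convert_ids_to_tokens([top_id])[0])  # e.g. "paris"
model.train()  # restore the original mode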
def train(self, train_path: str, valid_path: str, types_path: str, input_reader_cls: BaseInputReader): args = self.args train_label, valid_label = 'train', 'valid' self._logger.info("Datasets: %s, %s" % (train_path, valid_path)) self._logger.info("Model type: %s" % args.model_type) # create log csv files self._init_train_logging(train_label) self._init_eval_logging(valid_label) # read datasets input_reader = input_reader_cls(types_path, self._tokenizer, args.neg_term_count, args.neg_relation_count, args.max_span_size, self._logger) input_reader.read({train_label: train_path, valid_label: valid_path}) self._log_datasets(input_reader) train_dataset = input_reader.get_dataset(train_label) train_sample_count = train_dataset.document_count updates_epoch = train_sample_count // args.train_batch_size updates_total = updates_epoch * args.epochs validation_dataset = input_reader.get_dataset(valid_label) self._logger.info("Updates per epoch: %s" % updates_epoch) self._logger.info("Updates total: %s" % updates_total) # create model model_class = models.get_model(self.args.model_type) # load model config = BertConfig.from_pretrained(self.args.model_path, cache_dir=self.args.cache_path) util.check_version(config, model_class, self.args.model_path) config.model_version = model_class.VERSION model = model_class.from_pretrained( self.args.model_path, config=config, cls_token=self._tokenizer.convert_tokens_to_ids('[CLS]'), relation_types=input_reader.relation_type_count - 1, term_types=input_reader.term_type_count, max_pairs=self.args.max_pairs, prop_drop=self.args.prop_drop, size_embedding=self.args.size_embedding, freeze_transformer=self.args.freeze_transformer, args=self.args, beta=self.args.beta, alpha=self.args.alpha, sigma=self.args.sigma) model.to(self._device) # create optimizer optimizer_params = self._get_optimizer_params(model) optimizer = AdamW(optimizer_params, lr=args.lr, weight_decay=args.weight_decay, correct_bias=False) # create scheduler scheduler = transformers.get_linear_schedule_with_warmup( optimizer, num_warmup_steps=args.lr_warmup * updates_total, num_training_steps=updates_total) # create loss function rel_criterion = torch.nn.BCEWithLogitsLoss(reduction='none') term_criterion = torch.nn.CrossEntropyLoss(reduction='none') compute_loss = SynFueLoss(rel_criterion, term_criterion, model, optimizer, scheduler, args.max_grad_norm) # eval validation set if args.init_eval: self._eval(model, validation_dataset, input_reader, 0, updates_epoch) # train best_f1 = 0.0 for epoch in range(args.epochs): # train epoch self._train_epoch(model, compute_loss, optimizer, train_dataset, updates_epoch, epoch) # eval validation sets if not args.final_eval or (epoch == args.epochs - 1): rel_nec_eval = self._eval(model, validation_dataset, input_reader, epoch + 1, updates_epoch) if best_f1 < rel_nec_eval[-1]: # save final model best_f1 = rel_nec_eval[-1] extra = dict(epoch=args.epochs, updates_epoch=updates_epoch, epoch_iteration=0) global_iteration = args.epochs * updates_epoch self._save_model(self._save_path, model, self._tokenizer, global_iteration, optimizer=optimizer if self.args.save_optimizer else None, save_as_best=True, extra=extra, include_iteration=False) self._logger.info("Logged in: %s" % self._log_path) self._logger.info("Saved in: %s" % self._save_path) self._close_summary_writer()