def train(args, device):
    """Fine-tune an XLNet sequence classifier on MNLI and run validation."""
    args.dataset_name = "MNLI"  # TODO: parametrize
    model_name = args.model_name
    log = get_train_logger(args)

    # Fix all RNG seeds so runs are reproducible.
    seed = 42
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

    log.info(f'Using device {device}')

    tokenizer = XLNetTokenizer.from_pretrained(model_name, do_lower_case=True)
    config = XLNetConfig.from_pretrained(
        model_name,
        output_hidden_states=True,
        output_attentions=True,
        num_labels=3,
        finetuning_task=args.dataset_name)
    model = XLNetForSequenceClassification.from_pretrained(model_name, config=config)
    model.to(device)

    # Build the train/validation dataloaders from the dataset files.
    reader = MNLIDatasetReader(args, tokenizer, log)
    train_path = os.path.join(args.base_path, args.train_file)
    val_path = os.path.join(args.base_path, args.val_file)
    train_loader = reader.load_train_dataloader(train_path)
    val_loader = reader.load_val_dataloader(val_path)

    trainer = TrainModel(train_loader, val_loader, log)
    trainer.train(model, device, args)
def create_model(self):
    """Instantiate the configured classifier and move it to `device`."""
    cfg = self.model_configuration
    # XLNet checkpoints get the XLNet head; everything else is treated as BERT.
    if cfg.bert_model in ("xlnet-base-cased",):
        model = XLNetForSequenceClassification.from_pretrained(
            cfg.bert_model, num_labels=cfg.num_labels)
    else:
        model = BertForSequenceClassification.from_pretrained(
            cfg.bert_model, num_labels=cfg.num_labels)
    # NOTE(review): `device` is not defined in this method — it is resolved
    # from an enclosing/global scope; confirm it is set before this is called.
    model.to(device)
    return model
def run(args):
    """Score every premise/hypothesis pair in data/dados.tsv with a saved
    XLNet NLI model and write the per-pair class and scores to a TSV."""
    nli_model_path = 'saved_models/xlnet-base-cased/'
    model_file = os.path.join(nli_model_path, 'pytorch_model.bin')
    config_file = os.path.join(nli_model_path, 'config.json')
    log = get_logger('conduct_test')

    model_name = 'xlnet-base-cased'
    tokenizer = XLNetTokenizer.from_pretrained(model_name, do_lower_case=True)
    xlnet_config = XLNetConfig.from_pretrained(config_file)
    model = XLNetForSequenceClassification.from_pretrained(model_file, config=xlnet_config)

    dataset_reader = ConductDatasetReader(args, tokenizer, log)
    file_lines = dataset_reader.get_file_lines('data/dados.tsv')

    softmax_fn = torch.nn.Softmax(dim=1)
    results = []
    model.eval()
    with torch.no_grad():
        for line in tqdm(file_lines):
            premise, hypothesys, conflict = dataset_reader.parse_line(line)
            word_ids, input_mask, segment_ids = dataset_reader.convert_text_to_features(
                premise, hypothesys)

            # Single-example batch on the target device.
            model_input = {
                'input_ids': torch.tensor([word_ids], dtype=torch.long,
                                          device=args.device),
                'attention_mask': torch.tensor([input_mask], dtype=torch.long,
                                               device=args.device),
                'token_type_ids': torch.tensor([segment_ids], dtype=torch.long,
                                               device=args.device),
            }
            logits = model(**model_input)[0]

            nli_scores, nli_class = get_scores_and_class(logits, softmax_fn)
            nli_scores = nli_scores.detach().cpu().numpy()
            results.append({
                "conduct": premise,
                "complaint": hypothesys,
                "nli_class": nli_class,
                "nli_contradiction_score": nli_scores[0],
                "nli_entailment_score": nli_scores[1],
                "nli_neutral_score": nli_scores[2],
                "conflict": conflict,
            })

    pd.DataFrame(results).to_csv('results/final_results.tsv', sep='\t', index=False)
def predict_model(args, save=True):
    """Run a saved BERT/XLNet checkpoint over a pickled test dataset.

    Returns the concatenated logits (on CPU); when `save` is true they are
    also pickled next to the model outputs.
    """
    dataset_name = args.dataset_name[0]
    model_type = args.model_type

    dataset = pickle_load(path_tensor_dataset / f"{model_type}_{dataset_name}.pkl")
    loader = DataLoader(dataset,
                        batch_size=args.batch_size,
                        pin_memory=True,
                        num_workers=4,
                        shuffle=False)

    model_dir = path_model / f"{args.model_type}_{args.model_name}/checkpoint_epoch{args.epoch_num}"
    if model_type == "bert":
        model = BertForSequenceClassification.from_pretrained(model_dir, num_labels=126)
    elif model_type == "xlnet":
        model = XLNetForSequenceClassification.from_pretrained(model_dir, num_labels=126)
    else:
        raise ValueError("")

    model.zero_grad()
    model.eval()
    model = model.cuda(args.gpu_device_ids[0])
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model, device_ids=args.gpu_device_ids)

    res = []
    for batch in tqdm(loader, desc="Iteration"):
        batch = tuple(x.cuda(args.gpu_device_ids[0]) for x in batch)
        inputs = {
            "input_ids": batch[0],
            "attention_mask": batch[1],
            "token_type_ids": batch[2],
        }
        with torch.no_grad():
            res.append(model(**inputs)[0])

    res = torch.cat(res, 0).cpu()
    if save:
        filename = f"{model_type}_{dataset_name}_epoch{args.epoch_num}_res.pkl"
        pickle_save(res, path_model_output / filename)
    return res
def __init__(self, args, task_name, weight_file=None, config_file=None):
    """Build tokenizer, dataset reader and XLNet classifier for `task_name`.

    When `weight_file` / `config_file` are omitted, both the weights and the
    configuration are resolved from `args.model_name`.
    """
    self.args = args
    self.device = args.device
    self.log = self.get_train_logger(args, task_name)
    self.softmax = Softmax(dim=1)
    self.tokenizer = XLNetTokenizer.from_pretrained(args.model_name,
                                                    do_lower_case=True)
    self.dataset_reader = init_dataset_reader(task_name, args,
                                              self.tokenizer, self.log)

    config_source = config_file if config_file is not None else args.model_name
    weight_source = weight_file if weight_file is not None else args.model_name
    xlnet_config = XLNetConfig.from_pretrained(config_source,
                                               output_hidden_states=True,
                                               output_attentions=True,
                                               num_labels=3,
                                               finetuning_task=task_name)
    self.model = XLNetForSequenceClassification.from_pretrained(
        weight_source, config=xlnet_config).to(args.device)
def validate_on_test_set(args, device):
    """Evaluate a saved XLNet MNLI model on the Kaggle matched and
    mismatched test sets and write the result files."""
    log = get_logger("test-results")

    # Deterministic runs.
    seed = 42
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    log.info(f'Using device {device}')

    model_name = 'xlnet-base-cased'
    tokenizer = XLNetTokenizer.from_pretrained(model_name, do_lower_case=True)
    xlnet_config = XLNetConfig.from_pretrained(args.config_file)
    data_reader = KaggleMNLIDatasetReader(args, tokenizer, log)

    model = XLNetForSequenceClassification.from_pretrained(args.model_file,
                                                           config=xlnet_config)
    model.to(device)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)
        log.info(f'Running on {args.n_gpu} GPUS')

    test_executor = KaggleTest(tokenizer, log, data_reader)
    write_kaggle_results("matched", args.test_matched_file, test_executor,
                         device, model)
    write_kaggle_results("mismatched", args.test_mismatched_file,
                         test_executor, device, model)
batch_size = 12

# Training DataLoader: random order each epoch.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data,
                              sampler=train_sampler,
                              batch_size=batch_size)

# Validation DataLoader: fixed order.
validation_data = TensorDataset(validation_inputs, validation_masks,
                                validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data,
                                   sampler=validation_sampler,
                                   batch_size=batch_size)

# Pretrained XLNet with a single linear classification layer on top
# (14 output classes), running on the GPU.
model = XLNetForSequenceClassification.from_pretrained("xlnet-base-cased",
                                                       num_labels=14)
model.cuda()

# Parameter groups: apply weight decay to everything except bias/gamma/beta.
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer
                if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0},
]
def main(log_in_file, lm_path, lm_type, data_path, usegpu, n_fold, total_step,
         eval_every, early_stop, lr, weight_decay, lr_decay_in_layers,
         wd_decay_in_layers, max_length, max_title_rate, content_head_rate,
         batch_size, lr_scheduler_type, input_pattern, clean_method,
         warmup_rate, classifier_dropout, classifier_active, seed):
    """K-fold fine-tuning of a BERT/XLNet 3-class text classifier.

    Trains `n_fold` models on KFold splits of the training CSVs, averages the
    fold predictions on the test CSV, and writes `submit_<ts>.csv` (argmax
    labels) plus `probability_<ts>.csv` (per-class probabilities).

    Fix vs. original: removed the duplicated `optimizer = optimizer = AdamW(...)`
    assignment (harmless but clearly a typo).
    """
    # Must run first: captures exactly the function arguments for logging.
    arg_name_value_pairs = deepcopy(locals())
    prefix = time.strftime('%Y%m%d_%H%M')

    # Logging: always to stderr, optionally also to a timestamped file.
    logger = logging.getLogger('default')
    formatter = logging.Formatter("%(asctime)s %(message)s")
    if log_in_file:
        handler1 = logging.FileHandler(prefix + '.log')
        handler1.setFormatter(formatter)
        handler1.setLevel(logging.DEBUG)
        logger.addHandler(handler1)
    handler2 = logging.StreamHandler()
    handler2.setFormatter(formatter)
    handler2.setLevel(logging.DEBUG)
    logger.addHandler(handler2)
    logger.setLevel(logging.DEBUG)
    for arg_name, arg_value in arg_name_value_pairs.items():
        logger.info(f'{arg_name}: {arg_value}')

    # The tokenizer and special-token ids are shared module-wide.
    global tokenizer
    if lm_type == 'bert':
        tokenizer = BertTokenizer(os.path.join(lm_path, 'vocab.txt'))
    else:
        tokenizer = XLNetTokenizer(os.path.join(lm_path, 'spiece.model'))
    global PAD, PAD_t, CLS_t, SEP_t
    PAD_t = '<pad>'
    CLS_t = '<cls>'
    SEP_t = '<sep>'
    PAD = tokenizer.convert_tokens_to_ids([PAD_t])[0]
    logger.info(f'padding token is {PAD}')

    processed_train = preprocess(
        os.path.join(data_path, 'Train_DataSet.csv'),
        os.path.join(data_path, 'Train_DataSet_Label.csv'), tokenizer,
        max_length, input_pattern, clean_method, max_title_rate,
        content_head_rate, logger)
    processed_test = preprocess(os.path.join(data_path, 'Test_DataSet.csv'),
                                False, tokenizer, max_length, input_pattern,
                                clean_method, max_title_rate,
                                content_head_rate, logger)

    logger.info('seed everything and create model')
    seed_everything(seed)
    no_decay = ['.bias', 'LayerNorm.bias', 'LayerNorm.weight']
    if lm_type == 'xlnet':
        model = XLNetForSequenceClassification.from_pretrained(
            lm_path, num_labels=3, summary_last_dropout=classifier_dropout)
        if classifier_active == 'relu':
            model.sequence_summary.activation = nn.ReLU()
        if usegpu:
            model = model.cuda()
        # Layer names ordered bottom (embeddings) -> top (classifier head);
        # used below for per-layer lr / weight-decay scaling.
        model_layer_names = [
            'transformer.mask_emb', 'transformer.word_embedding.weight'
        ]
        model_layer_names += [
            f'transformer.layer.{i}.' for i in range(model.config.n_layer)
        ]
        model_layer_names += ['sequence_summary.summary', 'logits_proj']
    else:
        model = BertForSequenceClassification.from_pretrained(
            lm_path, num_labels=3, hidden_dropout_prob=classifier_dropout)
        if classifier_active == 'relu':
            model.bert.pooler.activation = nn.ReLU()
        if usegpu:
            model = model.cuda()
        model_layer_names = ['bert.embeddings']
        model_layer_names += [
            'bert.encoder.layer.{}.'.format(i)
            for i in range(model.config.num_hidden_layers)
        ]
        model_layer_names += ['bert.pooler', 'classifier']

    # Top layers (last in model_layer_names) get the base lr; each layer
    # below is scaled by lr_decay_in_layers**i (same idea for weight decay).
    # Params matching `no_decay` get weight decay 0.
    optimizer = AdamW([{
        'params': [
            p for n, p in model.named_parameters()
            if layer_name in n and not any(nd in n for nd in no_decay)
        ],
        'lr': lr * (lr_decay_in_layers**i),
        'weight_decay': weight_decay * (wd_decay_in_layers**i)
    } for i, layer_name in enumerate(model_layer_names[::-1])] + [{
        'params': [
            p for n, p in model.named_parameters()
            if layer_name in n and any(nd in n for nd in no_decay)
        ],
        'lr': lr * (lr_decay_in_layers**i),
        'weight_decay': .0
    } for i, layer_name in enumerate(model_layer_names[::-1])])

    # NOTE(review): `warmup_rate` is passed as `warmup_steps`; confirm callers
    # pass a step count rather than a fraction of total_step.
    if lr_scheduler_type == 'linear':
        lr_scheduler = WarmupLinearSchedule(optimizer,
                                            warmup_steps=warmup_rate,
                                            t_total=total_step)
    elif lr_scheduler_type == 'constant':
        lr_scheduler = WarmupConstantSchedule(optimizer,
                                              warmup_steps=warmup_rate)
    else:
        raise ValueError

    # Snapshot initial weights/optimizer state so every fold starts identically.
    model_state_0 = deepcopy(model.state_dict())
    optimizer_state_0 = deepcopy(optimizer.state_dict())

    test_iter = get_data_iter(processed_test,
                              batch_size * 4,
                              collect_test_func,
                              shuffle=False)
    pred = np.zeros((len(processed_test), 3))
    val_scores = []
    for fold_idx, (train_idx, val_idx) in enumerate(
            KFold(n_splits=n_fold, shuffle=True,
                  random_state=seed).split(processed_train)):
        model.load_state_dict(model_state_0)
        optimizer.load_state_dict(optimizer_state_0)
        # Re-create the scheduler so its step counter restarts each fold.
        if lr_scheduler_type == 'linear':
            lr_scheduler = WarmupLinearSchedule(optimizer,
                                                warmup_steps=warmup_rate,
                                                t_total=total_step)
        elif lr_scheduler_type == 'constant':
            lr_scheduler = WarmupConstantSchedule(optimizer,
                                                  warmup_steps=warmup_rate)
        else:
            raise ValueError
        train_iter = get_data_iter([processed_train[i] for i in train_idx],
                                   batch_size, collect_func)
        val_iter = get_data_iter([processed_train[i] for i in val_idx],
                                 batch_size * 4,
                                 collect_func,
                                 shuffle=False)
        best_model, best_score = training(model=model,
                                          optimizer=optimizer,
                                          lr_scheduler=lr_scheduler,
                                          train_iter=train_iter,
                                          val_iter=val_iter,
                                          total_step=total_step,
                                          tokenizer=tokenizer,
                                          usegpu=usegpu,
                                          eval_every=eval_every,
                                          logger=logger,
                                          early_stop=early_stop,
                                          fold_idx=fold_idx)
        model.load_state_dict(best_model)
        val_scores.append(best_score)
        pred += predict(model, test_iter, usegpu)

    logger.info(f'average: {np.mean(val_scores):.6f}')
    pred = pred / n_fold

    # Submission (argmax label) and per-class probability files.
    prob_df = pd.DataFrame()
    submit = pd.DataFrame()
    submit['id'] = [i['id'] for i in processed_test]
    submit['label'] = pred.argmax(-1)
    prob_df['id'] = [i['id'] for i in processed_test]
    prob_df['0'] = pred[:, 0]
    prob_df['1'] = pred[:, 1]
    prob_df['2'] = pred[:, 2]
    submit.to_csv(f'submit_{prefix}.csv', index=False)
    prob_df.to_csv(f'probability_{prefix}.csv', index=False)
# drop_last=True: discarding the ragged final batch keeps batch statistics
# uniform during training.
train_dataloader = DataLoader(train_data,
                              sampler=train_sampler,
                              batch_size=batch_num,
                              drop_last=True)

# Validation set is iterated in a fixed, sequential order.
valid_data = TensorDataset(val_inputs, val_masks, val_segs, val_tags)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data,
                              sampler=valid_sampler,
                              batch_size=batch_num)

# --- Train model ---
# Local directory holding the XLNet config (json) and weights (bin); both are
# picked up by from_pretrained(). Download them beforehand from:
#   https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-pytorch_model.bin
#   https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-config.json
# (or use the hub name 'xlnet-base-cased' directly)
model_file_address = '/home/saul/deeplearning/xlnet'
model = XLNetForSequenceClassification.from_pretrained(model_file_address,
                                                       num_labels=len(tag2idx))

# Move to GPU when available, and wrap for multi-GPU machines.
model.to(device)
if n_gpu > 1:
    model = torch.nn.DataParallel(model)

# Training hyper-parameters.
epochs = 10
max_grad_norm = 1.0

# Total optimizer steps = batches per epoch * epochs.
num_train_optimization_steps = int(math.ceil(len(tr_inputs) / batch_num)) * epochs
def Train(inputIds, attention_masks, labels, batch_size=24, epochs=10):
    """Fine-tune xlnet-base-cased as a binary sequence classifier.

    Splits the data 80/20 into train/validation (fixed seed 2020), trains for
    `epochs` epochs, and returns:
        (model, trainLoss, valAcc)
    where trainLoss[i] is the summed training loss of epoch i and valAcc[i]
    is the summed per-batch flat accuracy of epoch i (divide by the number of
    batches for averages, as the progress prints do).

    Fix vs. original: the device is now chosen *before* moving the model, and
    `model.to(device)` is used instead of an unconditional `model.cuda()`,
    which crashed on CPU-only machines (the CPU fallback `device` was already
    being computed, just too late to matter).
    """
    train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(
        inputIds, labels, random_state=2020, test_size=0.2)
    # Split the masks with the same seed/ratio so they line up with the inputs.
    train_masks, validation_masks, _, _ = train_test_split(attention_masks,
                                                           inputIds,
                                                           random_state=2020,
                                                           test_size=0.2)

    # Turn data into torch tensors.
    train_inputs = torch.tensor(train_inputs)
    validation_inputs = torch.tensor(validation_inputs)
    train_labels = torch.tensor(train_labels)
    validation_labels = torch.tensor(validation_labels)
    train_masks = torch.tensor(train_masks)
    validation_masks = torch.tensor(validation_masks)

    # Iterators over the two splits.
    train_data = TensorDataset(train_inputs, train_masks, train_labels)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=batch_size)
    validation_data = TensorDataset(validation_inputs, validation_masks,
                                    validation_labels)
    validation_sampler = SequentialSampler(validation_data)
    validation_dataloader = DataLoader(validation_data,
                                       sampler=validation_sampler,
                                       batch_size=batch_size)

    # Pick the device first so CPU-only machines still work.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    model = XLNetForSequenceClassification.from_pretrained('xlnet-base-cased',
                                                           num_labels=2)
    model.to(device)

    # Weight decay for everything except bias/gamma/beta parameters.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [{
        'params': [p for n, p in param_optimizer
                   if not any(nd in n for nd in no_decay)],
        'weight_decay_rate': 0.01
    }, {
        'params': [p for n, p in param_optimizer
                   if any(nd in n for nd in no_decay)],
        'weight_decay_rate': 0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters, lr=2e-5)

    trainLoss = []
    valAcc = []
    for _ in trange(epochs, desc='Epoch'):
        # --- Training pass ---
        model.train()
        trainLoss.append(0)
        nb_tr_steps = 0
        for batch in train_dataloader:
            batch = tuple(t.to(device) for t in batch)
            b_input_ids, b_input_mask, b_labels = batch
            optimizer.zero_grad()
            # Forward pass; the model returns (loss, logits, ...) when
            # labels are supplied.
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask,
                            labels=b_labels)
            loss = outputs[0]
            loss.backward()
            optimizer.step()
            trainLoss[-1] += loss.item()
            nb_tr_steps += 1
        print('\nTrain loss: {}'.format(trainLoss[-1] / nb_tr_steps))

        # --- Validation pass ---
        model.eval()
        nb_eval_steps = 0
        valAcc.append(0)
        for batch in validation_dataloader:
            batch = tuple(t.to(device) for t in batch)
            b_input_ids, b_input_mask, b_labels = batch
            # No gradients needed for evaluation.
            with torch.no_grad():
                output = model(b_input_ids,
                               token_type_ids=None,
                               attention_mask=b_input_mask)
            logits = output[0].detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()
            valAcc[-1] += flat_accuracy(logits, label_ids)
            nb_eval_steps += 1
        print('\nValidation Accuracy: {}\n'.format(valAcc[-1] / nb_eval_steps))

    return model, trainLoss, valAcc
def main(_):
    """Entry point: build the TF XLNet classification graph and, while
    training it, mirror every step into an equivalent PyTorch model
    (side-by-side TF-vs-PyTorch parity run), logging both losses."""
    if FLAGS.server_ip and FLAGS.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(FLAGS.server_ip, FLAGS.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    # Seed TF and numpy for reproducibility.
    tf.set_random_seed(FLAGS.seed)
    np.random.seed(FLAGS.seed)

    tf.logging.set_verbosity(tf.logging.INFO)

    #### Validate flags
    if FLAGS.save_steps is not None:
        # Never log less often than we checkpoint.
        FLAGS.log_step_count_steps = min(FLAGS.log_step_count_steps,
                                         FLAGS.save_steps)

    if FLAGS.do_predict:
        predict_dir = FLAGS.predict_dir
        if not tf.gfile.Exists(predict_dir):
            tf.gfile.MakeDirs(predict_dir)

    # Task name -> example processor.
    processors = {
        "mnli_matched": MnliMatchedProcessor,
        "mnli_mismatched": MnliMismatchedProcessor,
        'sts-b': StsbProcessor,
        'imdb': ImdbProcessor,
        "yelp5": Yelp5Processor
    }

    if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict:
        raise ValueError(
            "At least one of `do_train`, `do_eval, `do_predict` or "
            "`do_submit` must be True.")

    if not tf.gfile.Exists(FLAGS.output_dir):
        tf.gfile.MakeDirs(FLAGS.output_dir)

    if not tf.gfile.Exists(FLAGS.model_dir):
        tf.gfile.MakeDirs(FLAGS.model_dir)

    # ########################### LOAD PT model
    # ########################### LOAD PT model
    # import torch
    # from pytorch_transformers import CONFIG_NAME, TF_WEIGHTS_NAME, XLNetTokenizer, XLNetConfig, XLNetForSequenceClassification
    # save_path = os.path.join(FLAGS.model_dir, TF_WEIGHTS_NAME)
    # tf.logging.info("Model loaded from path: {}".format(save_path))
    # device = torch.device("cuda", 4)
    # config = XLNetConfig.from_pretrained('xlnet-large-cased', finetuning_task=u'sts-b')
    # config_path = os.path.join(FLAGS.model_dir, CONFIG_NAME)
    # config.to_json_file(config_path)
    # pt_model = XLNetForSequenceClassification.from_pretrained(FLAGS.model_dir, from_tf=True, num_labels=1)
    # pt_model.to(device)
    # pt_model = torch.nn.DataParallel(pt_model, device_ids=[4, 5, 6, 7])
    # from torch.optim import Adam
    # optimizer = Adam(pt_model.parameters(), lr=0.001, betas=(0.9, 0.999),
    #                  eps=FLAGS.adam_epsilon, weight_decay=FLAGS.weight_decay,
    #                  amsgrad=False)
    # ########################### LOAD PT model
    # ########################### LOAD PT model

    task_name = FLAGS.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    # Regression tasks (e.g. sts-b) have no discrete label list.
    label_list = processor.get_labels() if not FLAGS.is_regression else None

    sp = spm.SentencePieceProcessor()
    sp.Load(FLAGS.spiece_model_file)

    def tokenize_fn(text):
        # Normalize text, then encode it to sentence-piece ids.
        text = preprocess_text(text, lower=FLAGS.uncased)
        return encode_ids(sp, text)

    # run_config = model_utils.configure_tpu(FLAGS)
    # model_fn = get_model_fn(len(label_list) if label_list is not None else None)

    spm_basename = os.path.basename(FLAGS.spiece_model_file)

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    # estimator = tf.estimator.Estimator(
    #     model_fn=model_fn,
    #     config=run_config)

    if FLAGS.do_train:
        # Materialize the training examples as a tfrecord file (cached by name).
        train_file_base = "{}.len-{}.train.tf_record".format(
            spm_basename, FLAGS.max_seq_length)
        train_file = os.path.join(FLAGS.output_dir, train_file_base)
        tf.logging.info("Use tfrecord file {}".format(train_file))

        train_examples = processor.get_train_examples(FLAGS.data_dir)
        tf.logging.info("Num of train samples: {}".format(len(train_examples)))

        file_based_convert_examples_to_features(train_examples, label_list,
                                                FLAGS.max_seq_length,
                                                tokenize_fn, train_file,
                                                FLAGS.num_passes)

        train_input_fn = file_based_input_fn_builder(
            input_file=train_file,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            drop_remainder=True)

        # estimator.train(input_fn=train_input_fn, max_steps=FLAGS.train_steps)

        ##### Create input tensors / placeholders
        bsz_per_core = FLAGS.train_batch_size // FLAGS.num_core_per_host

        params = {
            "batch_size": FLAGS.train_batch_size  # the whole batch
        }
        train_set = train_input_fn(params)

        example = train_set.make_one_shot_iterator().get_next()

        if FLAGS.num_core_per_host > 1:
            # Shard each feature tensor along the batch axis, one shard per core.
            examples = [{} for _ in range(FLAGS.num_core_per_host)]
            for key in example.keys():
                vals = tf.split(example[key], FLAGS.num_core_per_host, 0)
                for device_id in range(FLAGS.num_core_per_host):
                    examples[device_id][key] = vals[device_id]
        else:
            examples = [example]

        ##### Create computational graph
        # One tower (loss/grads/inputs/hidden states/logits) per core,
        # with variables shared across towers.
        tower_losses, tower_grads_and_vars, tower_inputs, tower_hidden_states, tower_logits = [], [], [], [], []

        for i in range(FLAGS.num_core_per_host):
            reuse = True if i > 0 else None
            with tf.device(assign_to_gpu(i, "/gpu:0")), \
                    tf.variable_scope(tf.get_variable_scope(), reuse=reuse):
                loss_i, grads_and_vars_i, inputs_i, hidden_states_i, logits_i = single_core_graph(
                    is_training=True,
                    features=examples[i],
                    label_list=label_list)

                tower_losses.append(loss_i)
                tower_grads_and_vars.append(grads_and_vars_i)
                tower_inputs.append(inputs_i)
                tower_hidden_states.append(hidden_states_i)
                tower_logits.append(logits_i)

        ## average losses and gradients across towers
        if len(tower_losses) > 1:
            loss = tf.add_n(tower_losses) / len(tower_losses)
            grads_and_vars = average_grads_and_vars(tower_grads_and_vars)
            inputs = dict((n, tf.concat([t[n] for t in tower_inputs], 0))
                          for n in tower_inputs[0])
            hidden_states = list(
                tf.concat(t, 0) for t in zip(*tower_hidden_states))
            logits = tf.concat(tower_logits, 0)
        else:
            loss = tower_losses[0]
            grads_and_vars = tower_grads_and_vars[0]
            inputs = tower_inputs[0]
            hidden_states = tower_hidden_states[0]
            logits = tower_logits[0]

        # Summaries
        merged = tf.summary.merge_all()

        ## get train op
        train_op, learning_rate, gnorm = model_utils.get_train_op(
            FLAGS, None, grads_and_vars=grads_and_vars)
        global_step = tf.train.get_global_step()

        ##### Training loop
        saver = tf.train.Saver(max_to_keep=FLAGS.max_save)

        gpu_options = tf.GPUOptions(allow_growth=True)

        #### load pretrained models
        model_utils.init_from_checkpoint(FLAGS, global_vars=True)

        writer = tf.summary.FileWriter(logdir=FLAGS.model_dir,
                                       graph=tf.get_default_graph())

        with tf.Session(config=tf.ConfigProto(
                allow_soft_placement=True, gpu_options=gpu_options)) as sess:
            sess.run(tf.global_variables_initializer())

            #########
            ##### PYTORCH
            # Export the freshly initialized TF weights and load them into a
            # PyTorch XLNet so both models start from identical parameters.
            import torch
            from torch.optim import Adam
            from pytorch_transformers import CONFIG_NAME, TF_WEIGHTS_NAME, WEIGHTS_NAME, XLNetTokenizer, XLNetConfig, XLNetForSequenceClassification, BertAdam

            save_path = os.path.join(FLAGS.model_dir, TF_WEIGHTS_NAME + '-00')
            saver.save(sess, save_path)
            tf.logging.info("Model saved in path: {}".format(save_path))

            # NOTE(review): GPU index 4 and device_ids [4, 5, 6, 7] are
            # hard-coded — this requires a machine with >= 8 GPUs.
            device = torch.device("cuda", 4)
            config = XLNetConfig.from_pretrained('xlnet-large-cased',
                                                 finetuning_task=u'sts-b',
                                                 num_labels=1)
            tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
            # pt_model = XLNetForSequenceClassification.from_pretrained('xlnet-large-cased', num_labels=1)
            pt_model = XLNetForSequenceClassification.from_pretrained(
                save_path, from_tf=True, config=config)
            pt_model.to(device)
            pt_model = torch.nn.DataParallel(pt_model, device_ids=[4, 5, 6, 7])

            optimizer = Adam(pt_model.parameters(), lr=0.001, betas=(0.9, 0.999),
                             eps=FLAGS.adam_epsilon, weight_decay=FLAGS.weight_decay,
                             amsgrad=False)
            # optimizer = BertAdam(pt_model.parameters(), lr=FLAGS.learning_rate, t_total=FLAGS.train_steps, warmup=FLAGS.warmup_steps / FLAGS.train_steps,
            #                      eps=FLAGS.adam_epsilon, weight_decay=FLAGS.weight_decay)
            ##### PYTORCH
            #########

            fetches = [
                loss, global_step, gnorm, learning_rate, train_op, merged,
                inputs, hidden_states, logits
            ]

            total_loss, total_loss_pt, prev_step, gnorm_pt = 0., 0., -1, 0.0
            total_logits = None
            total_labels = None
            while True:
                feed_dict = {}
                # for i in range(FLAGS.num_core_per_host):
                #     for key in tower_mems_np[i].keys():
                #         for m, m_np in zip(tower_mems[i][key], tower_mems_np[i][key]):
                #             feed_dict[m] = m_np

                # One TF optimization step; also fetch the batch that was
                # consumed so it can be replayed through the PyTorch model.
                fetched = sess.run(fetches)

                loss_np, curr_step, gnorm_np, learning_rate_np, _, summary_np, inputs_np, hidden_states_np, logits_np = fetched
                total_loss += loss_np

                # Accumulate logits/labels across steps.
                if total_logits is None:
                    total_logits = logits_np
                    total_labels = inputs_np['label_ids']
                else:
                    total_logits = np.append(total_logits, logits_np, axis=0)
                    total_labels = np.append(total_labels,
                                             inputs_np['label_ids'],
                                             axis=0)

                #########
                ##### PYTORCH
                # Replay the exact same batch through the PyTorch model.
                f_inp = torch.tensor(inputs_np["input_ids"],
                                     dtype=torch.long,
                                     device=device)
                f_seg_id = torch.tensor(inputs_np["segment_ids"],
                                        dtype=torch.long,
                                        device=device)
                f_inp_mask = torch.tensor(inputs_np["input_mask"],
                                          dtype=torch.float,
                                          device=device)
                f_label = torch.tensor(inputs_np["label_ids"],
                                       dtype=torch.float,
                                       device=device)

                # with torch.no_grad():
                #     _, hidden_states_pt, _ = pt_model.transformer(f_inp, f_seg_id, f_inp_mask)
                #     logits_pt, _ = pt_model(f_inp, token_type_ids=f_seg_id, input_mask=f_inp_mask)
                pt_model.train()
                outputs = pt_model(f_inp,
                                   token_type_ids=f_seg_id,
                                   input_mask=f_inp_mask,
                                   labels=f_label)
                loss_pt = outputs[0]
                # Mean over the DataParallel replicas' per-device losses.
                loss_pt = loss_pt.mean()
                total_loss_pt += loss_pt.item()

                # # hidden_states_pt = list(t.detach().cpu().numpy() for t in hidden_states_pt)
                # # special_pt = special_pt.detach().cpu().numpy()

                # # Optimizer pt
                pt_model.zero_grad()
                loss_pt.backward()
                gnorm_pt = torch.nn.utils.clip_grad_norm_(
                    pt_model.parameters(), FLAGS.clip)
                # Keep the PyTorch lr locked to the TF schedule's current lr.
                for param_group in optimizer.param_groups:
                    param_group['lr'] = learning_rate_np
                optimizer.step()
                ##### PYTORCH
                #########

                if curr_step > 0 and curr_step % FLAGS.log_step_count_steps == 0:
                    curr_loss = total_loss / (curr_step - prev_step)
                    curr_loss_pt = total_loss_pt / (curr_step - prev_step)
                    tf.logging.info(
                        "[{}] | gnorm {:.2f} lr {:8.6f} "
                        "| loss {:.2f} | pplx {:>7.2f}, bpc {:>7.4f}".format(
                            curr_step, gnorm_np, learning_rate_np, curr_loss,
                            math.exp(curr_loss), curr_loss / math.log(2)))

                    #########
                    ##### PYTORCH
                    tf.logging.info(
                        " PT [{}] | gnorm PT {:.2f} lr PT {:8.6f} "
                        "| loss PT {:.2f} | pplx PT {:>7.2f}, bpc PT {:>7.4f}".
                        format(curr_step, gnorm_pt, learning_rate_np,
                               curr_loss_pt, math.exp(curr_loss_pt),
                               curr_loss_pt / math.log(2)))
                    ##### PYTORCH
                    #########

                    total_loss, total_loss_pt, prev_step = 0., 0., curr_step

                writer.add_summary(summary_np, global_step=curr_step)

                if curr_step > 0 and curr_step % FLAGS.save_steps == 0:
                    save_path = os.path.join(FLAGS.model_dir,
                                             "model.ckpt-{}".format(curr_step))
                    saver.save(sess, save_path)
                    tf.logging.info(
                        "Model saved in path: {}".format(save_path))

                    #########
                    ##### PYTORCH
                    # Save a trained model, configuration and tokenizer
                    model_to_save = pt_model.module if hasattr(
                        pt_model, 'module') else pt_model  # Only save the model it-self
                    # If we save using the predefined names, we can load using `from_pretrained`
                    output_dir = os.path.join(
                        FLAGS.output_dir, "pytorch-ckpt-{}".format(curr_step))
                    if not tf.gfile.Exists(output_dir):
                        tf.gfile.MakeDirs(output_dir)
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)
                    tf.logging.info(
                        "PyTorch Model saved in path: {}".format(output_dir))
                    ##### PYTORCH
                    #########

                if curr_step >= FLAGS.train_steps:
                    break

    if FLAGS.do_eval:
        # TPU requires a fixed batch size for all batches, therefore the number
        # of examples must be a multiple of the batch size, or else examples
        # will get dropped. So we pad with fake examples which are ignored
        # later on. These do NOT count towards the metric (all tf.metrics
        # support a per-instance weight, and these get a weight of 0.0).
        #
        # Modified in XL: We also adopt the same mechanism for GPUs.

        # NOTE(review): `eval_examples` is referenced here but never assigned
        # in this function — presumably a processor.get_dev_examples(...) call
        # was lost; as written this raises NameError. Confirm against history.
        while len(eval_examples) % FLAGS.eval_batch_size != 0:
            eval_examples.append(PaddingInputExample())

        eval_file_base = "{}.len-{}.{}.eval.tf_record".format(
            spm_basename, FLAGS.max_seq_length, FLAGS.eval_split)
        eval_file = os.path.join(FLAGS.output_dir, eval_file_base)
        file_based_convert_examples_to_features(eval_examples, label_list,
                                                FLAGS.max_seq_length,
                                                tokenize_fn, eval_file)

        assert len(eval_examples) % FLAGS.eval_batch_size == 0
        eval_steps = int(len(eval_examples) // FLAGS.eval_batch_size)

        eval_input_fn = file_based_input_fn_builder(
            input_file=eval_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=True)

        # NOTE(review): this eval branch reuses `gpu_options`, `loss`,
        # `global_step`, `train_op`, etc. from the do_train branch above, so
        # it only works when do_train is also set; it also re-runs the
        # *training* fetches and its `while True` loop has no visible break
        # (it presumably ends via tf.errors.OutOfRangeError) — confirm intent.
        with tf.Session(config=tf.ConfigProto(
                allow_soft_placement=True, gpu_options=gpu_options)) as sess:
            sess.run(tf.global_variables_initializer())

            ########################### LOAD PT model
            # import torch
            # from pytorch_transformers import CONFIG_NAME, TF_WEIGHTS_NAME, WEIGHTS_NAME, XLNetTokenizer, XLNetConfig, XLNetForSequenceClassification, BertAdam
            # save_path = os.path.join(FLAGS.model_dir, TF_WEIGHTS_NAME)
            # saver.save(sess, save_path)
            # tf.logging.info("Model saved in path: {}".format(save_path))
            # device = torch.device("cuda", 4)
            # config = XLNetConfig.from_pretrained('xlnet-large-cased', finetuning_task=u'sts-b', num_labels=1)
            # tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
            # config_path = os.path.join(FLAGS.model_dir, CONFIG_NAME)
            # config.to_json_file(config_path)
            # # pt_model = XLNetForSequenceClassification.from_pretrained('xlnet-large-cased', num_labels=1)
            # pt_model = XLNetForSequenceClassification.from_pretrained(FLAGS.model_dir, from_tf=True)
            # pt_model.to(device)
            # pt_model = torch.nn.DataParallel(pt_model, device_ids=[4, 5, 6, 7])
            # from torch.optim import Adam
            # optimizer = Adam(pt_model.parameters(), lr=0.001, betas=(0.9, 0.999),
            #                  eps=FLAGS.adam_epsilon, weight_decay=FLAGS.weight_decay,
            #                  amsgrad=False)
            # optimizer = BertAdam(pt_model.parameters(), lr=FLAGS.learning_rate, t_total=FLAGS.train_steps, warmup=FLAGS.warmup_steps / FLAGS.train_steps,
            #                      eps=FLAGS.adam_epsilon, weight_decay=FLAGS.weight_decay)
            ##### PYTORCH
            #########

            fetches = [
                loss, global_step, gnorm, learning_rate, train_op, merged,
                inputs, hidden_states, logits
            ]

            total_loss, total_loss_pt, prev_step, gnorm_pt = 0., 0., -1, 0.0
            total_logits = None
            total_labels = None
            while True:
                feed_dict = {}
                # for i in range(FLAGS.num_core_per_host):
                #     for key in tower_mems_np[i].keys():
                #         for m, m_np in zip(tower_mems[i][key], tower_mems_np[i][key]):
                #             feed_dict[m] = m_np

                fetched = sess.run(fetches)

                loss_np, curr_step, gnorm_np, learning_rate_np, _, summary_np, inputs_np, hidden_states_np, logits_np = fetched
                total_loss += loss_np

                if total_logits is None:
                    total_logits = logits_np
                    total_labels = inputs_np['label_ids']
                else:
                    total_logits = np.append(total_logits, logits_np, axis=0)
                    total_labels = np.append(total_labels,
                                             inputs_np['label_ids'],
                                             axis=0)
def train_model(args):
    """Fine-tune a 126-label BERT/XLNet classifier on the concatenation of
    the pickled tensor datasets named in args.dataset_name, saving a
    checkpoint (plus the args) after every epoch."""
    set_seed(args.seed)

    # One pre-tokenized TensorDataset pickle per dataset name, concatenated.
    train_dataset = [
        path_tensor_dataset / f"{args.model_type}_{x}.pkl"
        for x in args.dataset_name
    ]
    train_dataset = [pickle_load(x) for x in train_dataset]
    train_dataset = ConcatDataset(train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  batch_size=args.batch_size,
                                  pin_memory=True,
                                  num_workers=4,
                                  shuffle=True)

    if args.model_type == "bert":
        model = BertForSequenceClassification.from_pretrained(path_bert_model,
                                                              num_labels=126)
    elif args.model_type == "xlnet":
        model = XLNetForSequenceClassification.from_pretrained(
            path_xlnet_model, num_labels=126)
    else:
        raise ValueError("")
    model.zero_grad()
    model = model.cuda(args.gpu_device_ids[0])
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model, device_ids=args.gpu_device_ids)

    # AdamW with weight decay on everything except biases and LayerNorm weights.
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [{
        "params": [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        "weight_decay": 0.01
    }, {
        "params": [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        "weight_decay": 0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)

    # Linear warmup/decay over the number of *optimizer* steps.
    total_steps = len(
        train_dataloader) * args.epoch_num // args.gradient_accumulation_steps
    warmup_steps = int(total_steps * args.warmup_proportion)
    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=warmup_steps,
                                     t_total=total_steps)

    global_step = 0
    train_iterator = trange(int(args.epoch_num), desc="Epoch")
    for i in train_iterator:
        epoch = i + 1
        epoch_iterator = tqdm(train_dataloader, desc="Iteration")
        for batch in epoch_iterator:
            model.train()
            batch = tuple(x.cuda(args.gpu_device_ids[0]) for x in batch)
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
                "labels": batch[3]
            }
            outputs = model(**inputs)
            loss = outputs[0]
            if args.n_gpu > 1:
                # DataParallel returns one loss per replica.
                loss = loss.mean()
            # NOTE(review): with gradient accumulation the loss is usually
            # divided by gradient_accumulation_steps before backward();
            # confirm the omission is intentional.
            loss.backward()
            # Step every gradient_accumulation_steps batches. NOTE(review):
            # because global_step starts at 0, the first batch triggers a
            # step immediately; confirm this off-by-one is intended.
            if global_step % args.gradient_accumulation_steps == 0:
                optimizer.step()
                scheduler.step()
                model.zero_grad()
            global_step += 1

        # End of epoch: save model weights and the training args.
        output_dir = f"{args.model_type}_{args.model_name}/checkpoint_epoch{epoch}"
        output_dir = path_model / output_dir
        output_dir.mkdir(parents=True, exist_ok=True)
        # Unwrap DataParallel before saving so the checkpoint loads anywhere.
        model_to_save = model.module if hasattr(model, "module") else model
        model_to_save.save_pretrained(output_dir)
        pickle_save(args, os.path.join(output_dir, "training_args.pkl"))