sep_sentence.remove("<pad>") pad_num += 1 for i, j in enumerate(sep_sentence[1:-2]): out[i + pad_num + 1] = max(char_label[current_idx:current_idx + len(j)]) if j == "<unk>": current_idx = current_idx + 1 else: current_idx = current_idx + len(j) return out.tolist() config = AutoConfig.from_pretrained(model_path) tokenizer = XLNetTokenizer.from_pretrained(model_path, unk_token=unk_token) model = XLNetForTokenClassification.from_pretrained(model_path, num_labels=13) if torch.cuda.is_available(): device = torch.device("cuda") print('There are %d GPU(s) available.' % torch.cuda.device_count()) print('We will use the GPU:', torch.cuda.get_device_name(0)) else: print('No GPU available, using the CPU instead.') # device = torch.device("cpu") model.to(device) train_input_ids = [] train_labels = [] train_masks = []
def _encode_pad_and_mask(tokenizer, sentences):
    """Tokenize ``sentences``, right-pad them to a common length and build masks.

    Args:
        tokenizer: tokenizer exposing ``encode(text, add_special_tokens=True)``.
        sentences: iterable of strings to encode.

    Returns:
        A ``(padded_ids, attention_masks)`` pair. ``padded_ids`` is whatever
        ``pad_sequences`` returns (padding value 0, appended at the end) and
        ``attention_masks`` is a list of 0/1 lists of the same width.

    Raises:
        ValueError: if ``sentences`` is empty (there is no maximum length).
    """
    encoded = [
        tokenizer.encode(sent, add_special_tokens=True) for sent in sentences
    ]
    max_len = max(len(ids) for ids in encoded)

    # Build the mask from the real sequence lengths instead of testing
    # `token_id > 0` on the padded matrix: in the XLNet vocabulary id 0 is
    # `<unk>` (the pad token has its own, different id), so the value test
    # would also hide genuine unknown tokens from the model.
    masks = [[1] * len(ids) + [0] * (max_len - len(ids)) for ids in encoded]

    # "post" pads (and would truncate) at the end of each sequence.
    padded = pad_sequences(encoded, maxlen=max_len, dtype="long", value=0,
                           truncating="post", padding="post")
    return padded, masks


def main(args):
    """Fine-tune an XLNet sequence classifier on prompt/response pairs.

    Reads prompt indices, the prompt distribution, the unique prompts and the
    training responses from the paths in ``args``, tokenizes everything,
    trains for ``args.n_epochs`` epochs with on-the-fly shuffling of prompts
    against responses, and saves the whole model object under
    ``args.save_path``.

    Args:
        args: parsed command-line namespace. The attributes read here are
            ``seed``, ``train_prompts_idxs_path``,
            ``unique_prompts_distribution_path``, ``unique_prompts_path``,
            ``train_resps_path``, ``batch_size``, ``learning_rate``,
            ``adam_epsilon``, ``n_epochs``, ``num_topics``, ``reverse`` and
            ``save_path``.
    """
    # Keep a log of every command line this script was invoked with.
    os.makedirs('CMDs', exist_ok=True)
    with open('CMDs/train.cmd', 'a') as f:
        f.write(' '.join(sys.argv) + '\n')
        f.write('--------------------------------\n')

    # Seed every RNG in play so that runs are reproducible.
    seed_val = args.seed
    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)
    torch.cuda.manual_seed_all(seed_val)

    device = get_default_device()

    prompts_train_idxs = np.loadtxt(args.train_prompts_idxs_path,
                                    dtype=np.int64)
    topics_dist = np.loadtxt(args.unique_prompts_distribution_path,
                             dtype=np.int32)
    # L1-normalise the prompt counts.
    topics_dist = topics_dist / np.linalg.norm(topics_dist, 1)

    print('Loading XLNet tokenizer...')
    tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased',
                                               do_lower_case=True)

    # Strip surrounding whitespace and lowercase both prompts and responses.
    with open(args.unique_prompts_path) as f:
        topics = [line.strip().lower() for line in f.readlines()]
    with open(args.train_resps_path) as f:
        responses = [line.strip().lower() for line in f.readlines()]

    print('\nPadding token: "{:}", ID: {:}'.format(tokenizer.pad_token,
                                                   tokenizer.pad_token_id))

    topic_ids, attention_masks_topic = _encode_pad_and_mask(tokenizer, topics)
    resp_ids, attention_masks_resp = _encode_pad_and_mask(tokenizer, responses)

    # Convert to torch tensors. The prompt indices stay on the CPU; each
    # batch is moved to `device` inside the training loop.
    prompts_train_idxs = torch.from_numpy(prompts_train_idxs).long()
    topic_ids = torch.tensor(topic_ids).long().to(device)
    attention_masks_topic = torch.tensor(attention_masks_topic).long().to(device)
    resp_ids = torch.tensor(resp_ids).long().to(device)
    attention_masks_resp = torch.tensor(attention_masks_resp).long().to(device)

    # Sanity check: the three tensors below must have the same first dimension.
    print(prompts_train_idxs.size(0))
    print(resp_ids.size(0))
    print(attention_masks_resp.size(0))

    train_data = TensorDataset(prompts_train_idxs, resp_ids,
                               attention_masks_resp)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler,
                                  batch_size=args.batch_size)

    # Pretrained XLNet with a sequence-classification head on top.
    model = XLNetForSequenceClassification.from_pretrained(
        "xlnet-base-cased",
        num_labels=2,  # binary classification
        output_attentions=False,
        output_hidden_states=False,
    )
    model.to(device)

    optimizer = AdamW(model.parameters(), lr=args.learning_rate,
                      eps=args.adam_epsilon)
    loss_values = []

    # One scheduler step per batch, for every epoch.
    total_steps = len(train_dataloader) * args.n_epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=total_steps)

    for epoch in range(args.n_epochs):
        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch + 1,
                                                         args.n_epochs))
        print('Training...')

        t0 = time.time()
        total_loss = 0
        model.train()

        for step, batch in enumerate(train_dataloader):
            # Progress update every 40 batches.
            if step % 40 == 0 and not step == 0:
                elapsed = format_time(time.time() - t0)
                print(' Batch {:>5,} of {:>5,}. Elapsed: {:}.'.format(
                    step, len(train_dataloader), elapsed))

            p_id = batch[0].to(device)
            r = batch[1].to(device)
            r_msk = batch[2].to(device)

            # Dynamic shuffling; `_shuffle` also supplies the labels.
            p_id, r, r_msk, y_true, batch_size = _shuffle(
                p_id, r, r_msk, topics_dist, args.num_topics, device)

            # Look the prompts up from their indices.
            p, p_msk = _get_prompts(p_id, topic_ids, attention_masks_topic)
            p, p_msk = p.to(device), p_msk.to(device)

            # Concatenate prompts and responses.
            pr_resp, pr_resp_msk = _join_pr_resp(p, p_msk, r, r_msk,
                                                 args.reverse)
            pr_resp, pr_resp_msk = pr_resp.to(device), pr_resp_msk.to(device)

            model.zero_grad()

            # Passing `labels` makes the loss the first element of the output.
            outputs = model(pr_resp,
                            token_type_ids=None,
                            attention_mask=pr_resp_msk,
                            labels=y_true)
            loss = outputs[0]

            # `.item()` extracts the Python float from the scalar tensor.
            total_loss += loss.item()

            loss.backward()

            # Clip the gradient norm to 1.0 to guard against exploding
            # gradients.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            optimizer.step()
            scheduler.step()

        # Average loss over the batches of this epoch.
        avg_train_loss = total_loss / len(train_dataloader)
        loss_values.append(avg_train_loss)

        print("")
        print(" Average training loss: {0:.2f}".format(avg_train_loss))
        print(" Training epoch took: {:}".format(
            format_time(time.time() - t0)))

    # TODO: add validation - see the rest of the tutorial at
    # https://medium.com/@aniruddha.choudhury94/part-2-bert-fine-tuning-tutorial-with-pytorch-for-text-classification-on-the-corpus-of-linguistic-18057ce330e1

    # Save the whole model object (not just its state dict) to a file.
    file_path = args.save_path + 'xlnet_classifier_seed' + str(
        args.seed) + '.pt'
    torch.save(model, file_path)
res2['Patient_is_Pro_Vaccination__c']), 'Hesitancy_Classification__c': max(res1['Hesitancy_Classification__c'], res2['Hesitancy_Classification__c']), 'timestamp': str(datetime.now()) } return model_res checkpoint = torch.load("xlnet_vaccine.bin") model_state_dict = checkpoint['state_dict'] model = XLNetForMultiLabelSequenceClassification( num_labels=model_state_dict["classifier.weight"].size()[0]) model.load_state_dict(model_state_dict) tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased', do_lower_case=True) @api_view(['GET', 'POST']) def predictXLNET(request): sessionID = str(request.GET.get('session')) print(sessionID) surveydata = read_bq(sessionID) print(surveydata) label_cols = [ 'Conspiracy: Distrust of government, organizations, big pharma', 'Fear of Critical side-effects (Autism, Brain Damage, SIDS/Death)', 'Fear of Non-critical side-effects (Rash, Pain, Fever, GI problems, Bump on arm)', 'Holistic or alternative medicine', 'Logistic Concerns', 'Pro-vax', 'Religious Beliefs', 'Right to choose',
def main():
    """Train (or load) one binary XLNet classifier per toxicity label.

    Command line: ``python3 XLNet.py <label> <device_no(int)>`` where
    ``<label>`` is the index of the first label to process; up to three
    consecutive labels starting there are handled. For each label the model
    is trained (or loaded), scored with ROC-AUC on the held-out split, and
    its test predictions are written to submission CSV files in the current
    working directory.
    """
    num_embeddings = 512
    batch_size = 32
    num_labels = 2
    num_epochs = 5
    # train_mode: True ==> training, False ==> predict with saved weights.
    train_mode = True
    load_trained = False

    label_cols = [
        'toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
        'identity_hate'
    ]

    # Validate the command line before doing any expensive I/O.
    if len(sys.argv) < 3:
        print("Example: python3 XLNet.py <label> <device_no(int)>")
        sys.exit()
    label_start = int(sys.argv[1])
    device_no = sys.argv[2]

    train = pd.read_csv("./data/train.csv")
    test = pd.read_csv("./data/test.csv")

    print("GPU Available: {}".format(torch.cuda.is_available()))
    n_gpu = torch.cuda.device_count()
    print("Number of GPU Available: {}".format(n_gpu))
    device = torch.device(
        "cuda:{}".format(device_no) if torch.cuda.is_available() else "cpu")
    print("using device: {}".format(device))

    os.makedirs("./submission", exist_ok=True)
    sample = pd.read_csv("./data/sample_submission.csv")

    # Everything up to the label loop is independent of the label, so it is
    # computed once instead of once per label.
    tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased',
                                               do_lower_case=True)
    train_input_ids = tokenize_inputs(train["comment_text"].values,
                                      tokenizer,
                                      num_embeddings=num_embeddings)
    test_input_ids = tokenize_inputs(test["comment_text"].values,
                                     tokenizer,
                                     num_embeddings=num_embeddings)

    # Store input ids and attention masks on the dataframes so that they
    # follow the rows through the train/validation split.
    train["features"] = train_input_ids.tolist()
    train["masks"] = create_attn_masks(train_input_ids)
    test["features"] = test_input_ids.tolist()
    test["masks"] = create_attn_masks(test_input_ids)

    # Fixed random_state: the split is identical for every label.
    training, valid = train_test_split(train, test_size=0.2, random_state=23)

    X_train = torch.tensor(training["features"].values.tolist())
    X_valid = torch.tensor(valid["features"].values.tolist())
    train_masks = torch.tensor(training["masks"].values.tolist(),
                               dtype=torch.long)
    valid_masks = torch.tensor(valid["masks"].values.tolist(),
                               dtype=torch.long)

    for label in label_cols[label_start:label_start + 3]:
        print("Label: {}".format(label))

        Y_train = torch.tensor(y_split(training, label), dtype=torch.long)
        Y_valid = torch.tensor(y_split(valid, label), dtype=torch.long)

        train_data = TensorDataset(X_train, train_masks, Y_train)
        train_dataloader = DataLoader(train_data,
                                      sampler=RandomSampler(train_data),
                                      batch_size=batch_size)

        validation_data = TensorDataset(X_valid, valid_masks, Y_valid)
        validation_dataloader = DataLoader(
            validation_data,
            sampler=SequentialSampler(validation_data),
            batch_size=batch_size)

        # NOTE(review): the weights file name hard-codes "3ep" regardless of
        # `num_epochs` - confirm this is intended.
        model_save_path = "xlnet_{}_{}embed_{}ep_weights.bin".format(
            label, num_embeddings, 3)

        if load_trained:
            model, epochs, lowest_eval_loss, train_loss_hist, valid_loss_hist = load_model(
                model_save_path)
        else:
            model = XLNetForMultiLabelSequenceClassification(
                num_labels=num_labels)
            # Fine-tune the XLNet body as well as the classification head.
            model.unfreeze_xlnet_decoder()

        optimizer = AdamW(model.parameters(),
                          lr=2e-5,
                          weight_decay=0.01,
                          correct_bias=False)

        if train_mode:
            model, train_loss_set, valid_loss_set = train_model(
                model,
                num_epochs=num_epochs,
                optimizer=optimizer,
                train_dataloader=train_dataloader,
                valid_dataloader=validation_dataloader,
                model_save_path=model_save_path,
                device=device)
        else:
            model, epochs, lowest_eval_loss, train_loss_hist, valid_loss_hist = load_model(
                model_save_path)

        # Validation score on the held-out split.
        valid_preds = generate_predictions(model,
                                           valid,
                                           num_labels,
                                           device=device,
                                           batch_size=batch_size)
        score = roc_auc_score_FIXED(Y_valid, valid_preds)
        print("Label: {}, ROC_AUC: {}".format(label, score))

        predicts = generate_predictions(model,
                                        test,
                                        num_labels,
                                        device=device,
                                        batch_size=batch_size)
        sample[label] = predicts

        # Dated submission file ...
        output_filename = "submission_XLNET_{}_{}_{}ep.csv".format(
            datetime.datetime.now().date(), label, num_epochs)
        sample.to_csv(output_filename, index=False)
        print("Label: {}, Output: {}".format(label, output_filename))

        # ... and an undated copy that is overwritten on every run.
        sample.to_csv("submission_XLNET_{}_{}ep.csv".format(label, num_epochs),
                      index=False)
def __init__(self, vocab_path, do_lower_case):
    """Create the wrapped ``XLNetTokenizer`` and store it on the instance.

    Args:
        vocab_path: forwarded as the first positional argument of
            ``XLNetTokenizer``.
        do_lower_case: forwarded as the second positional argument of
            ``XLNetTokenizer``.
    """
    xlnet_tokenizer = XLNetTokenizer(vocab_path, do_lower_case)
    self.tokenizer = xlnet_tokenizer
def test_full_tokenizer(self):
    """Check tokenize / tokens->ids / ids->tokens on the sample vocabulary."""
    tokenizer = XLNetTokenizer(SAMPLE_VOCAB, keep_accents=True)

    # Plain ASCII sentence: every piece is in the vocabulary.
    simple_pieces = tokenizer.tokenize("This is a test")
    self.assertListEqual(simple_pieces, ["▁This", "▁is", "▁a", "▁t", "est"])
    self.assertListEqual(
        tokenizer.convert_tokens_to_ids(simple_pieces),
        [285, 46, 10, 170, 382],
    )

    # Sentence containing pieces ("9" and "é") that map to id 0.
    expected_pieces = [
        SPIECE_UNDERLINE + "I",
        SPIECE_UNDERLINE + "was",
        SPIECE_UNDERLINE + "b",
        "or",
        "n",
        SPIECE_UNDERLINE + "in",
        SPIECE_UNDERLINE + "",
        "9",
        "2",
        "0",
        "0",
        "0",
        ",",
        SPIECE_UNDERLINE + "and",
        SPIECE_UNDERLINE + "this",
        SPIECE_UNDERLINE + "is",
        SPIECE_UNDERLINE + "f",
        "al",
        "s",
        "é",
        ".",
    ]
    expected_ids = [
        8, 21, 84, 55, 24, 19, 7, 0, 602, 347, 347, 347, 3, 12, 66, 46, 72,
        80, 6, 0, 4
    ]

    pieces = tokenizer.tokenize("I was born in 92000, and this is falsé.")
    self.assertListEqual(pieces, expected_pieces)

    piece_ids = tokenizer.convert_tokens_to_ids(pieces)
    self.assertListEqual(piece_ids, expected_ids)

    # Round trip: pieces whose id is 0 come back as "<unk>", the rest are
    # returned unchanged.
    expected_round_trip = [
        "<unk>" if piece_id == 0 else piece
        for piece, piece_id in zip(expected_pieces, expected_ids)
    ]
    round_trip = tokenizer.convert_ids_to_tokens(piece_ids)
    self.assertListEqual(round_trip, expected_round_trip)
'epoch_size': 4 } # Set up logger logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.DEBUG) if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument('--model', default='xlnet-base-cased', help='model name or path') args = parser.parse_args() config = XLNetConfig.from_pretrained(args.model) model = XLNetModel.from_pretrained(args.model, config=config) tokenizer = XLNetTokenizer.from_pretrained(args.model) params_senteval['model'] = model.cuda().eval() params_senteval['tokenizer'] = tokenizer se = senteval.engine.SE(params_senteval, batcher, prepare) transfer_tasks = [ 'STS12', 'STS13', 'STS14', 'STS15', 'STS16', 'MR', 'CR', 'MPQA', 'SUBJ', 'SST2', 'SST5', 'TREC', 'MRPC', 'SICKEntailment', 'SICKRelatedness', 'STSBenchmark', 'Length', 'WordContent', 'Depth', 'TopConstituents', 'BigramShift', 'Tense', 'SubjNumber', 'ObjNumber', 'OddManOut', 'CoordinationInversion', 'ImageCaptionRetrieval', 'SNLI' ] results = se.eval(transfer_tasks) sts_task_list = [
def main():
    """Fine-tune ``XLNetForMultipleChoice`` on RACE, evaluating every epoch.

    All hyper-parameters (``random_seed``, ``output_dir``, ``learning_rate``,
    ``data_path``, batch sizes, ``gradient_accumulation_steps``,
    ``num_train_epochs``, ``num_warmup_steps``) are read from module-level
    names. After each epoch the dev-set loss/accuracy are appended to
    ``<output_dir>/results.txt`` and the model weights are saved as
    ``pytorch_model_<epoch>epoch.bin``.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    print('gpu count:', n_gpu)

    random.seed(random_seed)
    np.random.seed(random_seed)
    torch.manual_seed(random_seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(random_seed)

    os.makedirs(output_dir, exist_ok=True)

    model = XLNetForMultipleChoice.from_pretrained('xlnet-base-cased')
    model.to(device)
    tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')

    no_decay = ['bias', 'LayerNorm.weight']
    ## note: no weight decay according to XLNet paper
    # Both groups therefore use 0.0; the split is kept so that a decay value
    # can be introduced for the first group without restructuring.
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay': 0.0
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay': 0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=1e-6)

    train_data = load_and_cache_examples(data_path, 'race', tokenizer)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=train_batch_size)

    # The dev set does not change between epochs, so build it once.
    eval_data = load_and_cache_examples(data_path,
                                        'race',
                                        tokenizer,
                                        evaluate=True)
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data,
                                 sampler=eval_sampler,
                                 batch_size=eval_batch_size)

    num_train_steps = len(
        train_dataloader) // gradient_accumulation_steps * num_train_epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_train_steps)

    logger.info("***** Running training *****")
    # Report examples, not batches (len(dataloader) is the batch count).
    logger.info(" Num examples = %d", len(train_data))
    logger.info(" Batch size = %d", train_batch_size)
    logger.info(" Num steps = %d", num_train_steps)

    global_step = 0
    for ep in range(int(num_train_epochs)):
        model.train()
        tr_loss = 0
        nb_tr_steps = 0

        for step, batch in enumerate(train_dataloader):
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, label_ids = batch

            output = model(input_ids=input_ids,
                           token_type_ids=segment_ids,
                           attention_mask=input_mask,
                           labels=label_ids)
            loss = output.loss
            if gradient_accumulation_steps > 1:
                # Scale so the accumulated gradient matches one large batch.
                loss = loss / gradient_accumulation_steps
            loss.backward()

            tr_loss += loss.item()
            nb_tr_steps += 1

            if (step + 1) % gradient_accumulation_steps == 0:
                # Enough gradients accumulated: apply them.
                optimizer.step()
                scheduler.step()
                model.zero_grad()
                global_step += 1

            if step % 800 == 0:
                logger.info("Training loss: {}, global step: {}".format(
                    tr_loss / nb_tr_steps, global_step))

        logger.info("***** Running Dev Evaluation *****")
        logger.info(" Num examples = %d", len(eval_data))
        logger.info(" Batch size = %d", eval_batch_size)

        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0

        for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)

            with torch.no_grad():
                eval_output = model(input_ids=input_ids,
                                    token_type_ids=segment_ids,
                                    attention_mask=input_mask,
                                    labels=label_ids)
            tmp_eval_loss = eval_output.loss
            logits = eval_output.logits.detach().cpu().numpy()
            label_ids = label_ids.to('cpu').numpy()

            # The per-batch value is summed and later divided by the number
            # of examples.
            tmp_eval_accuracy = accuracy(logits, label_ids.reshape(-1))

            eval_loss += tmp_eval_loss.mean().item()
            eval_accuracy += tmp_eval_accuracy
            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1

        eval_loss = eval_loss / nb_eval_steps
        eval_accuracy = eval_accuracy / nb_eval_examples

        result = {
            'eval_loss': eval_loss,
            'eval_accuracy': eval_accuracy,
            'global_step': global_step,
            'loss': tr_loss / nb_tr_steps
        }

        logger.info(" Epoch: %d", (ep + 1))
        logger.info("***** Eval results *****")
        for key in sorted(result.keys()):
            logger.info(" %s = %s", key, str(result[key]))

        output_eval_file = os.path.join(output_dir, "results.txt")
        with open(output_eval_file, "a+") as writer:
            # Terminate the header line so it is not fused with the first
            # "key = value" entry.
            writer.write(" Epoch: " + str(ep + 1) + "\n")
            for key in sorted(result.keys()):
                writer.write("%s = %s\n" % (key, str(result[key])))

        # Unwrap a parallel wrapper (anything exposing `.module`) so that
        # only the model itself is saved.
        model_to_save = model.module if hasattr(model, 'module') else model
        output_model_file = os.path.join(
            output_dir, "pytorch_model_{}epoch.bin".format(ep + 1))
        torch.save(model_to_save.state_dict(), output_model_file)
def main():
    """Train and/or evaluate XLNet with permutation language modeling.

    Arguments come either from a single ``.json`` file given as the only
    command-line argument or from regular command-line flags, parsed into
    ``ModelArguments``, ``DataTrainingArguments`` and ``TrainingArguments``.

    Returns:
        dict: evaluation results; contains ``"perplexity"`` when
        ``training_args.do_eval`` is set, otherwise empty.

    Raises:
        ValueError: if the output directory is non-empty and may not be
            overwritten, if no dataset name or data file is supplied, or if
            neither a tokenizer name nor a model path is given.
    """
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # A single .json argument is parsed as the full argument set.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if (
        os.path.exists(training_args.output_dir)
        and os.listdir(training_args.output_dir)
        and training_args.do_train
        and not training_args.overwrite_output_dir
    ):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. "
            "Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if is_main_process(training_args.local_rank) else logging.WARN,
    )

    # Log a short summary on every process.
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    # Raise the Transformers logger verbosity on the main process only.
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Get the datasets: either a public dataset name from the hub, or local
    # train/validation files. ".fasta" files go through the project's own
    # loader; any other extension is handed to `load_dataset`.
    if data_args.dataset_name is not None:
        datasets = load_dataset(data_args.dataset_name,
                                data_args.dataset_config_name)
    else:
        data_files = {}
        if data_args.train_file is not None:
            data_files["train"] = data_args.train_file
        if data_args.validation_file is not None:
            data_files["validation"] = data_args.validation_file
        if not data_files:
            raise ValueError(
                "Need either a dataset name or a training/validation file."
            )
        # Use the train file when present, otherwise the validation file, so
        # that an evaluation-only run does not crash on a missing train file.
        extension = next(iter(data_files.values())).split(".")[-1]
        if extension == "fasta":
            datasets = load_dataset_fasta(data_files, data_args.max_seq_length)
        else:
            if extension == "txt":
                extension = "text"
            datasets = load_dataset(extension, data_files=data_files)

    # Load pretrained model and tokenizer.
    if model_args.config_name:
        config = AutoConfig.from_pretrained(model_args.config_name,
                                            cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        config = AutoConfig.from_pretrained(model_args.model_name_or_path,
                                            cache_dir=model_args.cache_dir)
    else:
        config = XLNetConfig()
        logger.warning("You are instantiating a new config instance from scratch.")

    if model_args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.tokenizer_name,
            cache_dir=model_args.cache_dir,
            use_fast=model_args.use_fast_tokenizer,
        )
    elif model_args.model_name_or_path:
        tokenizer = XLNetTokenizer.from_pretrained(
            model_args.model_name_or_path,
            cache_dir=model_args.cache_dir,
            use_fast=model_args.use_fast_tokenizer,
        )
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported by this script. "
            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
        )

    if model_args.model_name_or_path:
        model = XLNetLMHeadModel.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
        )
    else:
        logger.info("Training new model from scratch")
        # Concrete model classes are built from a config through their
        # constructor; `from_config` only exists on the Auto* factories.
        model = XLNetLMHeadModel(config)

    model.resize_token_embeddings(len(tokenizer))

    # Preprocessing: tokenize the 'sequences' column of every split and wrap
    # the encodings in the project's torch dataset.
    tokenized_datasets = dict()
    for dataset_key, dataset in datasets.items():
        encodings = tokenizer(
            dataset['sequences'],
            truncation=True,
            padding='max_length',  # TODO get from args passed in
            max_length=data_args.max_seq_length,
            return_special_tokens_mask=True,
            return_token_type_ids=False,
            return_attention_mask=False,
        )
        tokenized_datasets[dataset_key] = FastaDataset(encodings)

    # Data collator
    data_collator = DataCollatorForPermutationLanguageModeling(
        tokenizer=tokenizer,
        plm_probability=data_args.plm_probability,
        max_span_length=data_args.max_span_length,
    )

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"] if training_args.do_train else None,
        eval_dataset=tokenized_datasets["validation"] if training_args.do_eval else None,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    # Training
    if training_args.do_train:
        # Only a local checkpoint directory is passed on to `train`.
        model_path = (
            model_args.model_name_or_path
            if (model_args.model_name_or_path is not None
                and os.path.isdir(model_args.model_name_or_path))
            else None
        )
        trainer.train(model_path=model_path)
        trainer.save_model()  # Saves the tokenizer too for easy upload

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        eval_output = trainer.evaluate()
        perplexity = math.exp(eval_output["eval_loss"])
        results["perplexity"] = perplexity

        output_eval_file = os.path.join(training_args.output_dir,
                                        "eval_results_plm.txt")
        if trainer.is_world_process_zero():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key, value in results.items():
                    logger.info(f" {key} = {value}")
                    writer.write(f"{key} = {value}\n")

    return results
def make_xlnet_tokenizer() -> PreTrainedTokenizer:
    """Load and return the pretrained ``xlnet-base-cased`` tokenizer."""
    return XLNetTokenizer.from_pretrained("xlnet-base-cased")
parser = ArgumentParser() parser.add_argument('--path') parser.add_argument('--xlnet', action='store_true', default=False, help='whether using xlnet tokenizer for preprocessing') parser.add_argument('--roberta', action='store_true', default=False) parser.add_argument('--bert', action='store_true', default=False) args = parser.parse_args() return args if __name__ == '__main__': args = argparser() index, text, gold = load_data(args.path) if args.xlnet: model_version = 'xlnet-base-cased' tokenizer = XLNetTokenizer.from_pretrained(model_version, do_lower_case=True) elif args.bert: model_version = 'bert-base-uncased' tokenizer = BertTokenizer.from_pretrained(model_version, do_lower_case=True) elif args.roberta: model_version = 'roberta-base' tokenizer = RobertaTokenizer.from_pretrained(model_version, do_lower_case=True) context, text_attention_mask = tokenization(text, tokenizer, args) save_preprocessing(context, text_attention_mask, gold)
def __len__(self):
    """Return the number of stored examples."""
    return len(self.examples)


def __getitem__(self, item):
    """Return the example stored at position ``item``."""
    return self.examples[item]


def _str_to_bool(value):
    """Parse a command-line string into a bool.

    ``type=bool`` is not usable with argparse: ``bool("False")`` is ``True``
    because every non-empty string is truthy. This converter accepts the
    usual spellings explicitly and rejects anything else.

    Raises:
        argparse.ArgumentTypeError: if ``value`` is not a recognised spelling.
    """
    if isinstance(value, bool):
        return value
    text = value.strip().lower()
    if text in ("true", "t", "yes", "y", "1"):
        return True
    if text in ("false", "f", "no", "n", "0", ""):
        return False
    raise argparse.ArgumentTypeError(
        "expected a boolean value, got {!r}".format(value))


if __name__ == '__main__':
    parser = argparse.ArgumentParser()

    # `--overwrite_cache` alone or `--overwrite_cache True` enables the flag;
    # `--overwrite_cache False` now really disables it.
    parser.add_argument(
        "--overwrite_cache",
        default=False,
        type=_str_to_bool,
        nargs="?",
        const=True,
    )
    parser.add_argument(
        "--model_type",
        type=str,
        default="xlnet",
        help="The model architecture to be trained or fine-tuned.",
    )
    args = parser.parse_args()

    # NOTE(review): machine-specific paths - consider exposing them as
    # command-line options.
    path = "/Users/eyalorbach/data/movie_plots_short/valid"
    tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased",
                                               cache_dir="/tmp/cache")
    mds = MaskedPlotDataset(tokenizer, args, path)
def main():
    """Evaluate a fine-tuned ``XLNetForMultipleChoice`` on the mc500 test set.

    The weights are read from the module-level ``output_model_file``; the
    resulting loss and accuracy are logged and appended to
    ``<output_dir>/results.txt``. ``random_seed``, ``output_dir``,
    ``data_path`` and ``eval_batch_size`` are likewise module-level names.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    print('gpu count:', n_gpu)

    random.seed(random_seed)
    np.random.seed(random_seed)
    torch.manual_seed(random_seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(random_seed)

    os.makedirs(output_dir, exist_ok=True)

    # Start from the pretrained architecture but use the saved weights.
    model_state_dict = torch.load(output_model_file, map_location=device)
    model = XLNetForMultipleChoice.from_pretrained(
        'xlnet-base-cased', state_dict=model_state_dict)
    model.to(device)
    tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')

    eval_data = load_and_cache_examples(data_path,
                                        'mc500',
                                        tokenizer,
                                        test=True)
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data,
                                 sampler=eval_sampler,
                                 batch_size=eval_batch_size)

    logger.info("***** Running Evaluation *****")
    # Report examples, not batches (len(dataloader) is the batch count).
    logger.info(" Num examples = %d", len(eval_data))
    logger.info(" Batch size = %d", eval_batch_size)

    model.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0

    for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)
        segment_ids = segment_ids.to(device)
        label_ids = label_ids.to(device)

        with torch.no_grad():
            eval_output = model(input_ids=input_ids,
                                token_type_ids=segment_ids,
                                attention_mask=input_mask,
                                labels=label_ids)
        tmp_eval_loss = eval_output.loss
        logits = eval_output.logits.detach().cpu().numpy()
        label_ids = label_ids.to('cpu').numpy()

        # The per-batch value is summed and later divided by the number of
        # examples.
        tmp_eval_accuracy = accuracy(logits, label_ids.reshape(-1))

        eval_loss += tmp_eval_loss.mean().item()
        eval_accuracy += tmp_eval_accuracy
        nb_eval_examples += input_ids.size(0)
        nb_eval_steps += 1

    eval_loss = eval_loss / nb_eval_steps
    eval_accuracy = eval_accuracy / nb_eval_examples

    result = {'eval_loss': eval_loss, 'eval_accuracy': eval_accuracy}

    logger.info("***** Eval results *****")
    for key in sorted(result.keys()):
        logger.info(" %s = %s", key, str(result[key]))

    output_eval_file = os.path.join(output_dir, "results.txt")
    with open(output_eval_file, "a+") as writer:
        for key in sorted(result.keys()):
            writer.write("%s = %s\n" % (key, str(result[key])))
summary = splitted[SUMMARY_INDEX].strip() text = splitted[TEXT_INDEX] for junk in JUNK_HEADER_TEXT: text = text.replace(junk, "").strip() # Don't accept content with too small of text content or title content. Often these are very bad examples. if len(text) < 1024: return None if len(summary) < 30: return None return {"summary": summary, "text": text} tok = XLNetTokenizer.from_pretrained("xlnet-base-cased") # This is a map function for processing reviews. It returns a dict: # { 'text' { input_ids_as_tensor }, # 'target' { input_ids_as_tensor } } def map_tokenize_news(processed): text = processed["text"] text_enc = tok.encode(text, add_special_tokens=False, max_length=None, pad_to_max_length=False) title = processed["summary"] # Insert the title as the second sentence, forcing the proper token types. title_enc = tok.encode(title,
def load(cls, pretrained_model_name_or_path, revision=None, tokenizer_class=None, use_fast=True, **kwargs):
    """
    Enables loading of different Tokenizer classes with a uniform interface. Either infer the class from
    model config or define it manually via `tokenizer_class`.

    :param pretrained_model_name_or_path: The path of the saved pretrained model or its name (e.g. `bert-base-uncased`)
    :type pretrained_model_name_or_path: str
    :param revision: The version of model to use from the HuggingFace model hub. Can be tag name, branch name, or commit hash.
    :type revision: str
    :param tokenizer_class: (Optional) Name of the tokenizer class to load (e.g. `BertTokenizer`)
    :type tokenizer_class: str
    :param use_fast: (Optional, True by default) Load the fast (`...Fast`) variant of the tokenizer when True,
        otherwise the Python one. `EmbeddingTokenizer` has no fast variant; the regular one is used
        and an error is logged.
    :type use_fast: bool
    :param kwargs: Forwarded to the tokenizer's `from_pretrained` (together with `revision`).
    :return: Tokenizer
    :raises Exception: If no known tokenizer matches `tokenizer_class` or loading returned None.
    """
    pretrained_model_name_or_path = str(pretrained_model_name_or_path)
    kwargs["revision"] = revision

    if tokenizer_class is None:
        tokenizer_class = cls._infer_tokenizer_class(pretrained_model_name_or_path)

    logger.info(f"Loading tokenizer of type '{tokenizer_class}'")

    # Pick the tokenizer type. Matching is by substring, so the order matters:
    # "XLMRobertaTokenizer" must be tested before "RobertaTokenizer" and
    # "DistilBertTokenizer" before "BertTokenizer".
    selected = None
    extra_kwargs = {}
    if "AlbertTokenizer" in tokenizer_class:
        selected = AlbertTokenizerFast if use_fast else AlbertTokenizer
        extra_kwargs = {"keep_accents": True}
    elif "XLMRobertaTokenizer" in tokenizer_class:
        selected = XLMRobertaTokenizerFast if use_fast else XLMRobertaTokenizer
    elif "RobertaTokenizer" in tokenizer_class:
        selected = RobertaTokenizerFast if use_fast else RobertaTokenizer
    elif "DistilBertTokenizer" in tokenizer_class:
        selected = DistilBertTokenizerFast if use_fast else DistilBertTokenizer
    elif "BertTokenizer" in tokenizer_class:
        selected = BertTokenizerFast if use_fast else BertTokenizer
    elif "XLNetTokenizer" in tokenizer_class:
        selected = XLNetTokenizerFast if use_fast else XLNetTokenizer
        extra_kwargs = {"keep_accents": True}
    elif "ElectraTokenizer" in tokenizer_class:
        selected = ElectraTokenizerFast if use_fast else ElectraTokenizer
    elif tokenizer_class == "EmbeddingTokenizer":
        if use_fast:
            logger.error(
                'EmbeddingTokenizerFast is not supported! Using EmbeddingTokenizer instead.'
            )
        selected = EmbeddingTokenizer
    elif "CamembertTokenizer" in tokenizer_class:
        selected = CamembertTokenizerFast if use_fast else CamembertTokenizer
    elif "DPRQuestionEncoderTokenizer" in tokenizer_class:
        selected = DPRQuestionEncoderTokenizerFast if use_fast else DPRQuestionEncoderTokenizer
    elif "DPRContextEncoderTokenizer" in tokenizer_class:
        selected = DPRContextEncoderTokenizerFast if use_fast else DPRContextEncoderTokenizer

    ret = None
    if selected is not None:
        ret = selected.from_pretrained(pretrained_model_name_or_path, **extra_kwargs, **kwargs)

    if ret is None:
        raise Exception("Unable to load tokenizer")
    return ret
def main():
    """Fine-tune ``xlnet-large-cased`` for sequence classification on SST.

    Parses the command line, sets up the device (optionally distributed),
    builds the model and optimizer, converts all examples to tensors and
    splits them into a train and an evaluation part. When ``--do_train`` is
    given, every epoch is followed by an evaluation pass whose predictions
    and metrics are written to ``output_dir`` and TensorBoard, and by a
    checkpoint of the model weights.

    Raises:
        ValueError: If ``accumulate_gradients`` is below 1, neither
            ``--do_train`` nor ``--do_eval`` is set, the output directory
            exists and is not empty, or the task name is unknown.
    """
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the .tsv files (or other data files) for the task."
    )
    parser.add_argument("--task_name",
                        default=None,
                        type=str,
                        required=True,
                        help="The name of the task to train.")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help="The output directory where the model checkpoints will be written."
    )

    ## Other parameters
    parser.add_argument(
        "--init_checkpoint",
        default=None,
        type=str,
        help="Initial checkpoint (usually from a pre-trained XLNet model).")
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument("--do_train",
                        default=False,
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        default=False,
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--discr",
                        default=False,
                        action='store_true',
                        help="Whether to do discriminative fine-tuning.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--save_checkpoints_steps",
                        default=1000,
                        type=int,
                        help="How often to save the model checkpoint.")
    parser.add_argument("--no_cuda",
                        default=False,
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument(
        "--accumulate_gradients",
        type=int,
        default=1,
        help=
        "Number of steps to accumulate gradient on (divide the batch_size and accumulate)"
    )
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumualte before performing a backward/update pass."
    )
    parser.add_argument(
        '--layers',
        type=int,
        nargs='+',
        default=[-2],
        help="choose the layers that used for downstream tasks, "
        "-2 means use pooled output, -1 means all layer,"
        "else means the detail layers. default is -2")
    parser.add_argument('--num_datas',
                        default=None,
                        type=int,
                        help="the number of data examples")
    parser.add_argument('--num_test_datas',
                        default=None,
                        type=int,
                        help="the number of data examples")
    parser.add_argument('--pooling_type',
                        default=None,
                        type=str,
                        choices=[None, 'mean', 'max'])
    args = parser.parse_args()

    processors = {
        "sst": SSTProcessor,
    }

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device %s n_gpu %d distributed training %r", device, n_gpu,
                bool(args.local_rank != -1))

    if args.accumulate_gradients < 1:
        raise ValueError(
            "Invalid accumulate_gradients parameter: {}, should be >= 1".
            format(args.accumulate_gradients))

    args.train_batch_size = int(args.train_batch_size /
                                args.accumulate_gradients)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    os.makedirs(args.output_dir, exist_ok=True)

    summary_writer = SummaryWriter(args.output_dir)

    task_name = args.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    label_list = processor.get_labels()

    tokenizer = XLNetTokenizer.from_pretrained("xlnet-large-cased")
    model = XLNetForSequenceClassification.from_pretrained("xlnet-large-cased")
    model.to(device)

    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    # The optimizer reads the per-group key 'weight_decay'. The previous key
    # 'weight_decay_rate' was silently ignored, so no decay was ever applied.
    optimizer_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay': 0.01
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay': 0.0
    }]

    optimizer = AdamW(optimizer_parameters,
                      lr=args.learning_rate,
                      correct_bias=False)

    global_step = 0  # counts optimizer updates
    global_train_step = 0  # counts evaluation batches (TensorBoard x-axis)

    all_examples = processor.get_all_examples(args.data_dir)
    all_features = convert_examples_to_features(all_examples, label_list,
                                                args.max_seq_length,
                                                tokenizer)
    all_input_ids = all_features['input_ids']
    all_input_mask = all_features['attention_mask']
    all_segment_ids = all_features['token_type_ids']
    all_label_ids = all_features['labels']
    all_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                             all_label_ids)

    # NOTE(review): the split sizes are hard-coded, so random_split raises if
    # the dataset does not contain exactly 112428 examples.
    train_data, eval_data = random_split(all_data, [100000, 12428])
    eval_dataloader = DataLoader(eval_data,
                                 batch_size=args.eval_batch_size,
                                 shuffle=False)

    # NOTE(review): evaluation only happens inside the training loop, so
    # passing --do_eval without --do_train currently does no work.
    if args.do_train:
        logger.info("***** Running training *****")
        logger.info(" Batch size = %d", args.train_batch_size)

        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)
        print("TOTAL STEPS: ",
              (len(train_dataloader) * int(args.num_train_epochs)))

        epoch = 0
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            epoch += 1
            model.train()
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, token_type_ids, label_ids = batch

                loss, _ = model(input_ids,
                                attention_mask=input_mask,
                                token_type_ids=token_type_ids,
                                labels=label_ids)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                loss.backward()

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1

                if (step + 1) % args.gradient_accumulation_steps == 0:
                    # Clip before the update: clipping after optimizer.step()
                    # (as was done before) cannot influence the step and the
                    # gradients are zeroed right afterwards anyway.
                    max_grad_norm = 1.0
                    _clip_grad_norm(optimizer_parameters, max_grad_norm)
                    optimizer.step()  # We have accumulated enough gradients
                    summary_writer.add_scalar('Loss/train', loss.item(),
                                              global_step)
                    model.zero_grad()
                    global_step += 1

            # ---- Evaluation after every epoch ----
            model.eval()
            eval_loss, eval_accuracy = 0, 0
            pos_eval_prec, pos_eval_recall, pos_eval_f1 = 0, 0, 0
            neg_eval_prec, neg_eval_recall, neg_eval_f1 = 0, 0, 0
            nb_eval_steps, nb_eval_examples = 0, 0

            # One predicted class index per line, in evaluation order.
            with open(
                    os.path.join(args.output_dir,
                                 "results_ep" + str(epoch) + ".txt"),
                    "w") as f:
                for input_ids, input_mask, segment_ids, label_ids in tqdm(
                        eval_dataloader, desc="Evaluate"):
                    input_ids = input_ids.to(device)
                    input_mask = input_mask.to(device)
                    segment_ids = segment_ids.to(device)
                    label_ids = label_ids.to(device)

                    with torch.no_grad():
                        tmp_eval_loss, logits = model(
                            input_ids,
                            token_type_ids=segment_ids,
                            attention_mask=input_mask,
                            labels=label_ids)

                    logits = logits.detach().cpu().numpy()
                    label_ids = label_ids.detach().to('cpu').numpy()
                    outputs = np.argmax(logits, axis=1)
                    for output in outputs:
                        f.write(str(output) + "\n")

                    # Number of correct predictions in this batch.
                    tmp_eval_accuracy = np.sum(outputs == label_ids)
                    tmp_eval_prec, tmp_eval_recall, tmp_eval_f1 = get_analytics_neg_sent(
                        outputs, label_ids)

                    eval_loss += tmp_eval_loss.mean().item()
                    eval_accuracy += tmp_eval_accuracy
                    neg_eval_prec += tmp_eval_prec
                    neg_eval_recall += tmp_eval_recall
                    neg_eval_f1 += tmp_eval_f1

                    tmp_eval_prec, tmp_eval_recall, tmp_eval_f1 = get_analytics_pos_sent(
                        outputs, label_ids)
                    pos_eval_prec += tmp_eval_prec
                    pos_eval_recall += tmp_eval_recall
                    pos_eval_f1 += tmp_eval_f1

                    global_train_step += 1
                    summary_writer.add_scalar("Loss/test",
                                              tmp_eval_loss.mean().item(),
                                              global_train_step)
                    summary_writer.add_scalar("Accuracy/test",
                                              tmp_eval_accuracy,
                                              global_train_step)

                    nb_eval_examples += input_ids.size(0)
                    nb_eval_steps += 1

            # Accuracy is averaged per example, everything else per batch.
            eval_loss = eval_loss / nb_eval_steps
            eval_accuracy = eval_accuracy / nb_eval_examples
            pos_eval_prec = pos_eval_prec / nb_eval_steps
            pos_eval_recall = pos_eval_recall / nb_eval_steps
            pos_eval_f1 = pos_eval_f1 / nb_eval_steps
            neg_eval_prec = neg_eval_prec / nb_eval_steps
            neg_eval_recall = neg_eval_recall / nb_eval_steps
            neg_eval_f1 = neg_eval_f1 / nb_eval_steps

            result = {
                'eval_loss': eval_loss,
                'eval_accuracy': eval_accuracy,
                'global_step': global_step,
                'loss': tr_loss / nb_tr_steps,
                'pos_eval_precision': pos_eval_prec,
                'neg_eval_precision': neg_eval_prec,
                'pos_eval_recall': pos_eval_recall,
                'neg_eval_recall': neg_eval_recall,
                'pos_eval_f1': pos_eval_f1,
                'neg_eval_f1': neg_eval_f1
            }

            summary_writer.add_scalar("Epoch_loss/train", tr_loss, epoch)
            summary_writer.add_scalar("Epoch_loss/test", eval_loss, epoch)
            summary_writer.add_scalar("Epoch_accuracy/test", eval_accuracy,
                                      epoch)
            summary_writer.add_scalar("Epoch_positive_precision/test",
                                      pos_eval_prec, epoch)
            summary_writer.add_scalar("Epoch_negative_precision/test",
                                      neg_eval_prec, epoch)
            summary_writer.add_scalar("Epoch_positive_recall/test",
                                      pos_eval_recall, epoch)
            summary_writer.add_scalar("Epoch_negative_recall/test",
                                      neg_eval_recall, epoch)
            summary_writer.add_scalar("Epoch_positive_f1/test", pos_eval_f1,
                                      epoch)
            summary_writer.add_scalar("Epoch_negative_f1/test", neg_eval_f1,
                                      epoch)

            output_eval_file = os.path.join(
                args.output_dir, "eval_results_ep" + str(epoch) + ".txt")
            print("output_eval_file=", output_eval_file)
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key in sorted(result.keys()):
                    logger.info(" %s = %s", key, str(result[key]))
                    writer.write("%s = %s\n" % (key, str(result[key])))

            print("Saving model")
            # Only DataParallel / DistributedDataParallel wrappers expose
            # `.module`; on CPU or a single GPU the model is not wrapped and
            # `model.module` would raise AttributeError.
            model_to_save = model.module if hasattr(model, "module") else model
            torch.save(
                model_to_save.state_dict(),
                os.path.join(
                    args.output_dir,
                    "sst-phrases-finetuned-xlnet-model_" + str(epoch) +
                    ".pth"))

    # Flush pending TensorBoard events before returning.
    summary_writer.close()
'test.json') args.embedding_path = os.path.join(args.tokenizer_dir, 'embedding.bin') args.config = os.path.join(args.tokenizer_dir, 'knowledge_config.json') print(args) if tokenization == 'BERT': tokenizer = BertTokenizer.from_pretrained(args.tokenizer_dir) banwords = tokenizer.convert_tokens_to_ids( ['It', 'She', 'They', 'He', 'it', 'she', 'he', 'they']) elif tokenization == 'GPT2': tokenizer = GPT2Tokenizer.from_pretrained(args.tokenizer_dir) banwords = tokenizer.convert_tokens_to_ids( ['It', 'She', 'They', 'He', 'it', 'she', 'he', 'they']) elif tokenization == 'XLNET': tokenizer = XLNetTokenizer.from_pretrained(args.tokenizer_dir) banwords = tokenizer.convert_tokens_to_ids( ['It', 'She', 'They', 'He', 'it', 'she', 'he', 'they']) else: raise NotImplementedError with open(args.config, 'r') as f: knowledge_config = json.load(f) config = SimpleNamespace(**knowledge_config) print(config) if args.option == 'compute_bleu': with open('decoded_results.json', 'r') as f: results = json.load(f) with open(args.test_path, 'r') as f: references = json.load(f)
""" import torch from transformers import XLNetTokenizer from tqdm import tqdm from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, \ confusion_matrix, classification_report import time import sys sys.path.append('/home/xijian/pycharm_projects/document-level-classification/') from xlnet_hierarchical_attn.config import * from xlnet_hierarchical_attn.prepare_data import load_data from xlnet_hierarchical_attn.train import MyXLNetModel, printbar tokenizer = XLNetTokenizer.from_pretrained(xlnet_model_dir+'spiece.model') ngpu = 4 # 4 use_cuda = torch.cuda.is_available() # 检测是否有可用的gpu device = torch.device("cuda:0" if (use_cuda and ngpu>0) else "cpu") print('*'*8, 'device:', device) # checkpoint = save_dir + 'epoch011_valacc0.971_ckpt.tar' checkpoint = save_dir + last_new_checkpoint @torch.no_grad() def eval_step(model, inps, labs): input_ids, token_type_ids, attention_mask = inps
def load_tokenizer() -> XLNetTokenizer:
    """Create the XLNet tokenizer configured by the module-level ``configs``.

    The vocabulary is read from ``configs.data.path`` and the length limit is
    taken from ``configs.model.max_length``. The ``return_attention_mask``
    attribute of the tokenizer is reset to ``None`` before it is returned.
    """
    vocab_source = configs.data.path
    length_limit = configs.model.max_length
    xlnet_tokenizer = XLNetTokenizer.from_pretrained(vocab_source,
                                                     max_len=length_limit,
                                                     add_special_token=False)
    xlnet_tokenizer.return_attention_mask = None
    return xlnet_tokenizer
parser.add_argument('--dataset', type=str, default='one-billion-words', choices=['yelp', 'amazon', 'one-billion-words']) args = parser.parse_args() os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu forward_model_path = '../checkpoints/forward_xlnet/{}'.format(args.dataset) backward_model_path = '../checkpoints/backward_xlnet/{}'.format( args.dataset) forward_model = XLNetLMHeadModel.from_pretrained(forward_model_path) backward_model = XLNetLMHeadModel.from_pretrained(backward_model_path) forward_tokenizer = XLNetTokenizer.from_pretrained(forward_model_path) backward_tokenizer = XLNetTokenizer.from_pretrained(backward_model_path) device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") print("device:", device) forward_model = forward_model.to(device) backward_model = backward_model.to(device) forward_testset = XLNetDataset( args.dataset, "test", tokenizer=forward_tokenizer, max_sentence_length=args.max_sentence_length, is_forward=1) backward_testset = XLNetDataset( args.dataset,
def run(gpu_id, options, distributed=False):
    """Build model, tokeniser, data loaders and optimiser, then call ``train``.

    Args:
        gpu_id: Index of this process/GPU. It is used as the distributed rank
            and CUDA device id; process 0 is the primary one (its logger is
            enabled and it builds the argument parser for logging).
        options: Parsed options object. Attributes read here include ``seed``,
            ``no_cuda``, ``name``, ``checkpoint``, ``pre_trained``,
            ``model_kind``, ``vocab``, ``train_text``, ``validation_text``,
            ``batch_size``, ``actual_num_workers``, ``num_gpus``, ``lr``,
            ``reset_lr``, ``weight_decay``, ``adam_eps``, ``lr_warmup``,
            ``num_epochs`` and ``fp16``.
        distributed: When true, the NCCL process group is initialised, the
            data loaders use ``DistributedSampler`` and the model is wrapped
            in ``DistributedDataParallel``.

    Raises:
        AssertionError: If the model kind is ``gpt2-german`` and no
            pre-trained path is available.
        Exception: If the model kind is not one of the handled kinds.
    """
    if distributed:
        dist.init_process_group(
            backend="nccl",
            rank=gpu_id,
            world_size=options.num_gpus,
            init_method="env://",
        )
        torch.cuda.set_device(gpu_id)
    torch.manual_seed(options.seed)
    use_cuda = torch.cuda.is_available() and not options.no_cuda
    device = torch.device("cuda" if use_cuda else "cpu")

    # Only the primary process (gpu_id 0) gets an enabled logger.
    logger = lavd.Logger(options.name, disabled=gpu_id != 0)
    # Parser needs to be rebuilt, since it can't be serialised and it is needed to even
    # detect the number of GPUs, but here it's only used to log it.
    parser = build_parser() if gpu_id == 0 else None

    spinner = logger.spinner("Initialising")
    spinner.start()

    # Without a checkpoint directory the default (empty) checkpoint is used.
    checkpoint = (default_checkpoint if options.checkpoint is None else
                  load_checkpoint(
                      os.path.join(options.checkpoint, "stats.pt")))
    # Either use the checkpoint directory as the configuration or use one of the
    # available pre-trained models.
    pre_trained = options.checkpoint or options.pre_trained

    # All but the primary GPU wait here, so that only the primary process loads the
    # pre-trained model and the rest uses the cached version.
    if distributed and gpu_id != 0:
        torch.distributed.barrier()

    # The kind stored in the checkpoint takes precedence over the option.
    model_kind = checkpoint["model"].get("kind") or options.model_kind

    # Defaults for the BERT variants; the GPT-2 variants switch both off.
    use_special = True
    masked_lm = True
    if model_kind == "bert":
        if pre_trained is None:
            pre_trained = "bert-base-german-cased"
        config = BertConfig.from_pretrained(pre_trained)
        model = BertForMaskedLM.from_pretrained(pre_trained, config=config)
        tokeniser = BertTokenizer.from_pretrained(pre_trained)
    elif model_kind == "bert-scratch":
        # The pre_trained here is only for the configuration (num layers etc.)
        # But the weights are not loaded
        if pre_trained is None:
            pre_trained = "bert-base-german-cased"
        # Use either the provided vocabulary or the pre_trained one.
        vocab = options.vocab or pre_trained
        tokeniser = BertTokenizer.from_pretrained(vocab)
        config = BertConfig.from_pretrained(pre_trained)
        config.vocab_size = tokeniser.vocab_size
        model = BertForMaskedLM(config)
    elif model_kind == "gpt2":
        if pre_trained is None:
            pre_trained = "gpt2"
        config = GPT2Config.from_pretrained(pre_trained)
        model = GPT2LMHeadModel.from_pretrained(pre_trained, config=config)
        tokeniser = GPT2Tokenizer.from_pretrained(pre_trained)
        masked_lm = False
        use_special = False
    elif model_kind == "gpt2-german":
        assert pre_trained is not None, "--pre-trained must be given for gpt2-german"
        config = GPT2Config.from_pretrained(pre_trained)
        model = GPT2LMHeadModel.from_pretrained(pre_trained, config=config)
        # Using the XLNetTokenizer because the pre-trained German GPT-2 model uses
        # SentencePiece and that's easiest way to use it.
        # That also means that the automatic tokenisation cannot be done, because XLNet
        # uses different placing of the special tokens.
        tokeniser = XLNetTokenizer.from_pretrained(
            pre_trained,
            keep_accents=True,
            unk_token="<unk>",
            # start and end of sequence use the same token
            bos_token="<endoftext>",
            eos_token="<endoftext>",
        )
        masked_lm = False
        use_special = False
    elif model_kind == "gpt2-scratch":
        # The pre_trained here is only for the configuration (num layers etc.)
        # But the weights are not loaded
        if pre_trained is None:
            pre_trained = "gpt2"
        # Use either the provided vocabulary or the pre_trained one.
        vocab = options.vocab or pre_trained
        tokeniser = GPT2Tokenizer.from_pretrained(vocab)
        config = GPT2Config.from_pretrained(pre_trained)
        config.vocab_size = tokeniser.vocab_size
        model = GPT2LMHeadModel(config)
        masked_lm = False
        use_special = False
    else:
        raise Exception("No model available for {}".format(model_kind))
    model = model.to(device)

    # Primary process has loaded the model and the other can now load the cached
    # version.
    if distributed and gpu_id == 0:
        torch.distributed.barrier()

    train_dataset = TextDataset(
        options.train_text,
        tokeniser,
        use_special=use_special,
        manual_special=model_kind == "gpt2-german",
    )
    train_sampler = (DistributedSampler(train_dataset,
                                        num_replicas=options.num_gpus,
                                        rank=gpu_id)
                     if distributed else None)
    train_data_loader = DataLoader(
        train_dataset,
        batch_size=options.batch_size,
        # Only shuffle when not using a sampler
        shuffle=train_sampler is None,
        num_workers=options.actual_num_workers,
        sampler=train_sampler,
        pin_memory=True,
    )

    # Each validation entry is either "path" or "name=path".
    validation_data_loaders = []
    for val_file in options.validation_text:
        vals = val_file.split("=", 1)
        if len(vals) > 1:
            # Remove whitespace around the name
            name = vals[0].strip()
            # Expand the ~ to the full path as it won't be done automatically since it's
            # not at the beginning of the word.
            file_path = os.path.expanduser(vals[1])
        else:
            name = None
            file_path = vals[0]
        validation_dataset = TextDataset(
            file_path,
            tokeniser,
            name=name,
            use_special=use_special,
            manual_special=model_kind == "gpt2-german",
        )
        validation_sampler = (DistributedSampler(
            validation_dataset, num_replicas=options.num_gpus, rank=gpu_id)
                              if distributed else None)
        # NOTE(review): without a sampler the validation data is shuffled as
        # well - confirm that this is intended.
        validation_data_loader = DataLoader(
            validation_dataset,
            batch_size=options.batch_size,
            # Only shuffle when not using a sampler
            shuffle=validation_sampler is None,
            num_workers=options.actual_num_workers,
            sampler=validation_sampler,
            pin_memory=True,
        )
        validation_data_loaders.append(validation_data_loader)

    initial_lr = options.lr
    # Only restore the learning rate if resuming from a checkpoint and not manually
    # resetting the learning rate.
    if len(checkpoint["train"]["lr"]) > 0 and not options.reset_lr:
        initial_lr = checkpoint["train"]["lr"][-1]

    # Parameters whose name contains one of these get no weight decay.
    no_decay = ["bias", "LayerNorm.weight"]
    optimiser_grouped_parameters = [
        {
            "params": [
                param for name, param in model.named_parameters()
                if not any(nd in name for nd in no_decay)
            ],
            "weight_decay": options.weight_decay,
        },
        {
            "params": [
                param for name, param in model.named_parameters()
                if any(nd in name for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
    ]
    optimiser = AdamW(optimiser_grouped_parameters,
                      lr=initial_lr,
                      eps=options.adam_eps)
    # NOTE(review): the total number of scheduler steps is the number of
    # epochs, so the scheduler is presumably stepped once per epoch - verify
    # against train().
    lr_scheduler = get_linear_schedule_with_warmup(
        optimiser,
        num_warmup_steps=options.lr_warmup,
        num_training_steps=options.num_epochs,
    )

    # Gradient scaling is only created for fp16 on CUDA.
    amp_scaler = amp.GradScaler() if use_cuda and options.fp16 else None

    if distributed:
        model = DistributedDataParallel(model,
                                        device_ids=[gpu_id],
                                        find_unused_parameters=True)

    validation_details = [
        OrderedDict(
            name=data_loader.dataset.name,
            path=data_loader.dataset.path,
            size=len(data_loader.dataset),
        ) for data_loader in validation_data_loaders
    ]
    experiment = OrderedDict(
        model_kind=model_kind,
        train=OrderedDict(path=train_dataset.path, size=len(train_dataset)),
        validation=validation_details,
        options=options,
    )
    log_experiment(logger, experiment)
    logger.log_command(parser, options)

    # Wait for all processes to load everything before starting training.
    # Not strictly necessary, since they will wait once the actual model is run, but
    # this makes it nicer to show the spinner until all of them are ready.
    if distributed:
        torch.distributed.barrier()
    spinner.stop()

    # When resuming, log the last recorded stats of the checkpoint first.
    if options.checkpoint is not None:
        resume_text = "Resuming from - Epoch {epoch}".format(
            epoch=checkpoint["epoch"])
        logger.set_prefix(resume_text)
        epoch_results = [
            OrderedDict(
                name="Train",
                stats=OrderedDict(
                    loss=checkpoint["train"]["stats"]["loss"][-1],
                    perplexity=checkpoint["train"]["stats"]["perplexity"][-1],
                ),
            )
        ] + [
            OrderedDict(
                name=val_name,
                stats=OrderedDict(
                    loss=val_result["stats"]["loss"][-1],
                    perplexity=val_result["stats"]["perplexity"][-1],
                ),
            ) for val_name, val_result in checkpoint["validation"].items()
        ]
        log_epoch_stats(logger, epoch_results, metrics)

    train(
        logger,
        model,
        optimiser,
        train_data_loader,
        validation_data_loaders,
        lr_scheduler=lr_scheduler,
        device=device,
        num_epochs=options.num_epochs,
        checkpoint=checkpoint,
        model_kind=model_kind,
        amp_scaler=amp_scaler,
        masked_lm=masked_lm,
    )
args.output_file = os.path.join(output_path, suffix) if args.started_sentence_id==1 and os.path.exists(args.output_file): os.remove(args.output_file) print('The output file is ', args.output_file) args.input_file = os.path.join(args.input_file, f'''{args.dataset}/{args.keywords}keywords.txt''') device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") print("device:", device) if args.random==0: classifier_model_path = '../checkpoints/xlnet_classifier/{}'.format(args.dataset) args.classifier_model_path = classifier_model_path classifier_model = XLNetForTokenClassification.from_pretrained(classifier_model_path,num_labels=4) classifier_model_tokenizer = XLNetTokenizer.from_pretrained(classifier_model_path) logger.logger.info('Initialize backward XLNetForTokenClassification from checkpoint {}.'.format(classifier_model_path)) classifier_model = classifier_model.to(device) classifier_model.eval() else: classifier_model = None classifier_model_tokenizer = None if args.model_name == 'LSTMLMGenerate': forward_lm_path = '../checkpoints/forward_lstm_lm/{}/best.pt'.format(args.dataset) backward_lm_path = '../checkpoints/backward_lstm_lm/{}/best.pt'.format(args.dataset) args.forward_lm_path = forward_lm_path args.backward_lm_path = backward_lm_path
class XlnetProcessor(object):
    """Turn raw (text, label) lines into XLNet examples, features and datasets."""

    def __init__(self, vocab_path, do_lower_case):
        """Create the processor with an ``XLNetTokenizer`` built from ``vocab_path``."""
        self.tokenizer = XLNetTokenizer(vocab_path, do_lower_case)

    def get_train(self, data_file):
        """Gets a collection of `InputExample`s for the train set."""
        return self.read_data(data_file)

    def get_dev(self, data_file):
        """Gets a collection of `InputExample`s for the dev set."""
        return self.read_data(data_file)

    def get_test(self, lines):
        """Return the test lines unchanged."""
        return lines

    def get_labels(self):
        """Gets the list of labels for this data set."""
        return [
            "toxic", "severe_toxic", "obscene", "threat", "insult",
            "identity_hate"
        ]

    @classmethod
    def read_data(cls, input_file, quotechar=None):
        """Load the lines from a pickle file, or pass the input through.

        When the string form of ``input_file`` contains ``'pkl'`` it is loaded
        with ``load_pickle``; anything else is returned as is. ``quotechar``
        is accepted for interface compatibility and is not used.
        """
        if 'pkl' in str(input_file):
            lines = load_pickle(input_file)
        else:
            lines = input_file
        return lines

    def truncate_seq_pair(self, tokens_a, tokens_b, max_length):
        """Shorten the two token lists in place until they fit ``max_length``.

        This is a simple heuristic which always truncates the longer sequence
        one token at a time (``tokens_b`` on a tie). This makes more sense
        than truncating an equal percent of tokens from each, since if one
        sequence is very short then each token that's truncated likely
        contains more information than a longer sequence.
        """
        while True:
            total_length = len(tokens_a) + len(tokens_b)
            if total_length <= max_length:
                break
            if len(tokens_a) > len(tokens_b):
                tokens_a.pop()
            else:
                tokens_b.pop()

    def create_examples(self, lines, example_type, cached_examples_file):
        """Create ``InputExample`` objects from the lines, using a file cache.

        Args:
            lines: Sequence of items whose first element is the text and
                whose second element is the label, either a comma separated
                string or an iterable of numbers.
            example_type: Prefix of the generated guids
                (``"<example_type>-<index>"``).
            cached_examples_file: Path-like object; when it exists the
                examples are loaded from it, otherwise they are built and
                saved to it.

        Returns:
            The list of examples.
        """
        pbar = ProgressBar(n_total=len(lines))
        if cached_examples_file.exists():
            logger.info("Loading examples from cached file %s",
                        cached_examples_file)
            examples = torch.load(cached_examples_file)
        else:
            examples = []
            for i, line in enumerate(lines):
                guid = '%s-%d' % (example_type, i)
                text_a = line[0]
                label = line[1]
                # `np.float` was only an alias of the builtin `float` and has
                # been removed from NumPy (1.24), so use the builtin directly.
                if isinstance(label, str):
                    label = [float(x) for x in label.split(",")]
                else:
                    label = [float(x) for x in label]
                text_b = None
                example = InputExample(guid=guid,
                                       text_a=text_a,
                                       text_b=text_b,
                                       label=label)
                examples.append(example)
                pbar.batch_step(step=i, info={}, bar_type='create examples')
            logger.info("Saving examples into cached file %s",
                        cached_examples_file)
            torch.save(examples, cached_examples_file)
        return examples

    def create_features(self, examples, max_seq_len, cached_features_file):
        """Convert examples to padded ``InputFeature`` objects, using a file cache.

        The layout built here puts the classification token at the end and
        pads on the left:

            (a) sequence pairs:   <pad> ... A <sep> B <sep> <cls>
                segment ids:        4   ... 0   0   1   1     2
            (b) single sequences: <pad> ... A <sep> <cls>
                segment ids:        4   ... 0   0     2

        Args:
            examples: Examples with ``text_a``, ``text_b``, ``label`` and
                ``guid`` attributes.
            max_seq_len: Length every feature is truncated/padded to.
            cached_features_file: Path-like object; when it exists the
                features are loaded from it, otherwise they are built and
                saved to it.

        Returns:
            The list of features.
        """
        # Load data features from cache or dataset file
        pbar = ProgressBar(n_total=len(examples))
        if cached_features_file.exists():
            logger.info("Loading features from cached file %s",
                        cached_features_file)
            features = torch.load(cached_features_file)
        else:
            features = []
            pad_token = self.tokenizer.convert_tokens_to_ids(
                [self.tokenizer.pad_token])[0]
            cls_token = self.tokenizer.cls_token
            sep_token = self.tokenizer.sep_token
            cls_token_segment_id = 2
            pad_token_segment_id = 4

            for ex_id, example in enumerate(examples):
                tokens_a = self.tokenizer.tokenize(example.text_a)
                tokens_b = None
                label_id = example.label

                if example.text_b:
                    tokens_b = self.tokenizer.tokenize(example.text_b)
                    # Modifies `tokens_a` and `tokens_b` in place so that the total
                    # length is less than the specified length.
                    # Account for the two separators and the cls token with "- 3"
                    self.truncate_seq_pair(tokens_a,
                                           tokens_b,
                                           max_length=max_seq_len - 3)
                else:
                    # Account for the separator and the cls token with '-2'
                    if len(tokens_a) > max_seq_len - 2:
                        tokens_a = tokens_a[:max_seq_len - 2]

                # xlnet has a cls token at the end
                tokens = tokens_a + [sep_token]
                segment_ids = [0] * len(tokens)
                if tokens_b:
                    tokens += tokens_b + [sep_token]
                    segment_ids += [1] * (len(tokens_b) + 1)
                tokens += [cls_token]
                segment_ids += [cls_token_segment_id]

                input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
                input_mask = [1] * len(input_ids)
                input_len = len(input_ids)
                padding_len = max_seq_len - len(input_ids)

                # pad on the left for xlnet
                input_ids = ([pad_token] * padding_len) + input_ids
                input_mask = ([0] * padding_len) + input_mask
                segment_ids = ([pad_token_segment_id] * padding_len) + segment_ids

                assert len(input_ids) == max_seq_len
                assert len(input_mask) == max_seq_len
                assert len(segment_ids) == max_seq_len

                # Log the first two examples for a visual sanity check.
                if ex_id < 2:
                    logger.info("*** Example ***")
                    # The f-string is already complete; a trailing `% ()`
                    # would raise for guids that contain a '%' character.
                    logger.info(f"guid: {example.guid}")
                    logger.info(
                        f"tokens: {' '.join([str(x) for x in tokens])}")
                    logger.info(
                        f"input_ids: {' '.join([str(x) for x in input_ids])}")
                    logger.info(
                        f"input_mask: {' '.join([str(x) for x in input_mask])}"
                    )
                    logger.info(
                        f"segment_ids: {' '.join([str(x) for x in segment_ids])}"
                    )

                feature = InputFeature(input_ids=input_ids,
                                       input_mask=input_mask,
                                       segment_ids=segment_ids,
                                       label_id=label_id,
                                       input_len=input_len)
                features.append(feature)
                pbar.batch_step(step=ex_id,
                                info={},
                                bar_type='create features')
            logger.info("Saving features into cached file %s",
                        cached_features_file)
            torch.save(features, cached_features_file)
        return features

    def create_dataset(self, features, is_sorted=False):
        """Stack the features into a ``TensorDataset``.

        Args:
            features: Objects with ``input_ids``, ``input_mask``,
                ``segment_ids``, ``label_id`` and ``input_len`` attributes.
            is_sorted: When true, order the features by ``input_len``,
                longest first, before building the tensors.

        Returns:
            ``TensorDataset`` of (input ids, input mask, segment ids,
            label ids), all of dtype ``torch.long``.
        """
        # Convert to Tensors and build dataset
        if is_sorted:
            logger.info("sorted data by th length of input")
            features = sorted(features,
                              key=lambda x: x.input_len,
                              reverse=True)
        all_input_ids = torch.tensor([f.input_ids for f in features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in features],
                                       dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in features],
                                     dtype=torch.long)
        dataset = TensorDataset(all_input_ids, all_input_mask,
                                all_segment_ids, all_label_ids)
        return dataset
def load_tokenizer(self, model_path):
    """Load an XLNet tokenizer.

    Args:
        model_path: Value forwarded unchanged to
            ``XLNetTokenizer.from_pretrained``.

    Returns:
        The tokenizer object returned by ``from_pretrained``.
    """
    return XLNetTokenizer.from_pretrained(model_path)
# Remaining command-line options, registered on the ``parser`` created earlier
# in this script.
parser.add_argument('--load_from_checkpoint', type=str)
parser.add_argument('--continue_training', type=str)
parser.add_argument('--output_directory', type=str)
parser.add_argument('--tokenizer_path', type=str)
parser.add_argument('--max_len', type=int, default=256)
parser.add_argument('--max_steps', type=int, default=500)
parser.add_argument('--batch_size', type=int, default=8)
parser.add_argument('--num_gpus', type=int, default=4)
args = parser.parse_args()

# Use CUDA when it is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()

task = 'offense_rating'
# Raw string: the previous literal relied on the invalid escape sequence "\s",
# which newer Python versions warn about. The resulting value is unchanged.
path2spiece = r'xlnet_base_cased\spiece.model'
# NOTE(review): this constant is independent of the --max_len option parsed
# above (default 256) -- confirm which of the two is meant to be used.
max_len = 64
tokenizer = XLNetTokenizer(vocab_file=path2spiece, do_lower_case=False)

# Only the text column and the target column are read from the training CSV.
data_path = r'C:\Users\krish\hamze\SemEval-2021-Task-7-Hahackathon\xlnet\data\train.csv'
df_data = pd.read_csv(data_path, sep=",", encoding="utf-8",
                      usecols=['text', 'offense_rating'])
# print(df_data.columns)
print(df_data.head(n=20))
# print(df_data.offense_rating.unique())
# print(df_data.offense_rating.value_counts())

sentences = df_data.text.to_list()
labels = df_data.offense_rating.to_list()
print(sentences[0], labels[0])

# Label name -> index, and the inverse mapping index -> label name.
tag2idx = {'0': 0, '1': 1}
tag2name = {idx: tag for tag, idx in tag2idx.items()}

#Tokenization and Segmentation
def __init__(self, args: dict, doLower: bool, train_batchSize: int,
             testval_batchSize: int, learningRate: float,
             doLearningRateScheduler: bool, target_columns: list,
             smartBatching: bool = True, mixedPrecision: bool = True,
             labelSentences: dict = None, max_label_len=None, model=None,
             optimizer=None, loss_fct=None, device="cpu"):
    """Store the training configuration and build the requested model.

    The model is selected by ``args["model"]``:

    * ``distilbert``, ``bert``, ``xlnet``, ``roberta``, ``distilroberta``:
      a pretrained sequence-classification model and its tokenizer are
      loaded; ``doLower`` only changes the checkpoint for ``distilbert``
      and ``bert`` (uncased vs. cased).
    * ``gradboost``, ``randomforest``, ``naivebayes``, ``naivebayes_norm``,
      ``sgd``: a scikit-learn estimator is created.
    * anything else: an error is logged and the process exits via
      ``sys.exit``.

    For the first group (and ``"lstm"``) ``self.loss_fct`` is set to
    ``loss_fct`` or, when that is falsy, a new ``BCEWithLogitsLoss``, and
    ``self.num_labels`` is 1 when ``args["binaryClassification"]`` is
    truthy, otherwise the number of keys in ``labelSentences``.

    NOTE(review): ``"lstm"`` passes the first membership test but no model
    is created for it, so ``self.model`` stays unset -- confirm intent.
    NOTE(review): the ``model`` parameter is not used in this constructor.
    """
    # Plain configuration, stored as given.
    self.args = args
    self.labelSentences = labelSentences
    self.tokenizer = None
    self.device = device
    self.train_batchSize = train_batchSize
    self.testval_batchSize = testval_batchSize
    self.learningRate = learningRate
    self.optimizer = optimizer
    self.doLearningRateScheduler = doLearningRateScheduler
    self.learningRateScheduler = None
    self.smartBatching = smartBatching
    self.mixedPrecision = mixedPrecision
    self.max_label_len = max_label_len
    self.target_columns = target_columns
    self.input_multiclass_as_one = False

    model_key = self.args["model"]

    if model_key in ["distilbert", "bert", "xlnet", "lstm", "roberta", "distilroberta"]:
        # Loss function: caller-supplied one wins, otherwise the default.
        self.loss_fct = loss_fct if loss_fct else BCEWithLogitsLoss()

        # Number of labels to classify.
        if self.args["binaryClassification"]:
            self.num_labels = 1
        else:
            self.num_labels = len(self.labelSentences.keys())

        # Pick (model class, tokenizer class, checkpoint) for the model name.
        if model_key == "distilbert":
            pretrained = (
                DistilBertForSequenceClassification,
                DistilBertTokenizer,
                'distilbert-base-uncased' if doLower else 'distilbert-base-cased',
            )
        elif model_key == "bert":
            pretrained = (
                BertForSequenceClassification,
                BertTokenizer,
                'bert-base-uncased' if doLower else 'bert-base-cased',
            )
        elif model_key == "xlnet":
            # No lowercase version exists, so the cased checkpoint is used
            # regardless of doLower.
            pretrained = (
                XLNetForSequenceClassification,
                XLNetTokenizer,
                'xlnet-base-cased',
            )
        elif model_key == "roberta":
            pretrained = (
                RobertaForSequenceClassification,
                RobertaTokenizer,
                'roberta-base',
            )
        elif model_key == "distilroberta":
            pretrained = (
                RobertaForSequenceClassification,
                RobertaTokenizer,
                'distilroberta-base',
            )
        else:
            # "lstm": nothing is built here.
            pretrained = None

        if pretrained is not None:
            model_cls, tokenizer_cls, checkpoint = pretrained
            self.model = model_cls.from_pretrained(
                checkpoint,
                num_labels=self.num_labels,
                output_attentions=False,
                output_hidden_states=False,
                torchscript=True,
            )
            self.tokenizer = tokenizer_cls.from_pretrained(checkpoint)
        #elif self.args["model"] == "CNN":
        #    self.model = MyLSTM(num_labels=self.num_labels)
    elif model_key == "gradboost":
        self.model = GradientBoostingClassifier(
            learning_rate=self.learningRate,
            n_estimators=self.args["n_estimators"],
            max_depth=self.args["max_depth"],
            verbose=1,
        )
        self.input_multiclass_as_one = True
    elif model_key == "randomforest":
        self.model = RandomForestClassifier(
            n_estimators=self.args["n_estimators"],
            max_depth=self.args["max_depth"],
            verbose=1,
            n_jobs=-1,
        )
        self.input_multiclass_as_one = True
    elif model_key == "naivebayes":
        self.model = OneVsRestClassifier(MultinomialNB(alpha=self.learningRate))
    elif model_key == "naivebayes_norm":
        self.model = Pipeline([
            ("nb_norm", MinMaxScaler()),
            ("nb_clf", OneVsRestClassifier(MultinomialNB(alpha=self.learningRate))),
        ])
    elif model_key == "sgd":
        self.model = OneVsRestClassifier(
            SGDClassifier(alpha=self.learningRate, loss='hinge', penalty='l2')
        )
    else:
        # Unknown model name: report it and stop the program.
        failure = "Define a model in the args dict."
        logging.error(failure)
        sys.exit(failure)
} bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', lowercase=True, add_special_tokens=True) albert_tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2', lowercase=True, add_special_tokens=True) roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base', lowercase=True, add_special_tokens=True) xlnet_tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased', lowercase=True, add_special_tokens=True) def data_generator(f_path, params): with open(f_path) as f: for line in f: line = line.rstrip() text, slot_intent = line.split('\t') words = text.split()[1:-1] slot_intent = slot_intent.split() slots, intent = slot_intent[1:-1], slot_intent[-1] words = [ params['word2idx'].get(w, len(params['word2idx'])) for w in words ]
def setupXLNetSentimentAnalysis(modelName):
    """Create a ``sentiment-analysis`` pipeline from an XLNet checkpoint.

    Args:
        modelName: Passed unchanged to the ``from_pretrained`` call of both
            ``XLNetTokenizer`` and ``XLNetForSequenceClassification``.

    Returns:
        Whatever ``pipeline`` returns for the loaded model and tokenizer.
    """
    # Dict literals evaluate in order, so the tokenizer is loaded first and
    # the model second.
    components = {
        "tokenizer": XLNetTokenizer.from_pretrained(modelName),
        "model": XLNetForSequenceClassification.from_pretrained(modelName),
    }
    return pipeline(task="sentiment-analysis", **components)
from transformers.modeling_utils import (
    WEIGHTS_NAME,
    PretrainedConfig,
    PreTrainedModel,
    SequenceSummary,
    PoolerAnswerClass,
    PoolerEndLogits,
    PoolerStartLogits,
)
from transformers import (
    XLNetTokenizer,
    XLNetForSequenceClassification,
    XLNetPreTrainedModel,
    XLNetModel,
)
from torch.nn import CrossEntropyLoss, BCEWithLogitsLoss
from transformers import get_linear_schedule_with_warmup
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from torch.utils.data.dataset import ConcatDataset
from XLNet import (
    XLNetForMultiSequenceClassification,
    Dataset_multi,
    Dataset_3Way,
    get_predictions,
)
import pandas as pd
import numpy as np
import random
from IPython.display import clear_output
from tqdm.notebook import tqdm, trange

# Tokenizer and training data; samples are drawn in random order, one per batch.
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
trainset = Dataset_3Way("RTE5_train", tokenizer=tokenizer, three_tasks=False)
train_sampler = RandomSampler(trainset)
train_dataloader = DataLoader(trainset, sampler=train_sampler, batch_size=1)

# First CUDA device when available, CPU otherwise.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("device:", device)


def set_seed(seed):
    """Seed Python's ``random``, NumPy and torch (CPU and all CUDA devices)."""
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)


PRETRAINED_MODEL_NAME = "xlnet-base-cased"
model = XLNetForMultiSequenceClassification.from_pretrained(
    PRETRAINED_MODEL_NAME,
    output_attentions=True,
)
def __init__(self, model='bert', model_size='base', cased=True,
             fine_tune=False, use_proj=False, proj_dim=256):
    """Load a pretrained encoder with its tokenizer and set up the extras.

    Args:
        model: Encoder family; must be in ``MODEL_LIST``.
        model_size: Size part of the checkpoint name. For names containing
            ``'bert'`` it must be in ``BERT_MODEL_SIZES``.
        cased: Only affects ``model == 'bert'``; when false the uncased
            checkpoint is used and the tokenizer lower-cases its input.
        fine_tune: When false, every parameter of the pretrained model gets
            ``requires_grad = False``.
        use_proj: When true, ``self.proj`` is a linear layer from the
            model's hidden size to ``proj_dim`` and ``self.hidden_size``
            becomes ``proj_dim``; otherwise ``self.proj`` is ``None``.
        proj_dim: Output size of the optional projection.
    """
    super(Encoder, self).__init__()
    assert (model in MODEL_LIST)

    self.base_name = model
    self.model = None
    self.tokenizer = None
    self.num_layers = None
    self.hidden_size = None
    self.fine_tune = fine_tune

    # Lower-casing only applies to uncased BERT; for other models this choice
    # doesn't make sense since they are trained on cased text.
    uncased_bert = (model == 'bert') and (not cased)
    lower = uncased_bert

    # First initialize the model and tokenizer.
    checkpoint = ''
    if 'bert' in model:
        # Model is one of the BERT variants.
        assert (model_size in BERT_MODEL_SIZES)
        if uncased_bert:
            # Only original BERT supports uncased models.
            suffix = '-uncased'
        elif model == 'roberta':
            # RoBERTa model types have no casing suffix in the HuggingFace
            # map, so the name is left unmodified.
            suffix = ''
        else:
            suffix = '-cased'
        checkpoint = model + "-" + model_size + suffix

        if model == 'bert':
            self.model = BertModel.from_pretrained(
                checkpoint, output_hidden_states=True)
            self.tokenizer = BertTokenizer.from_pretrained(
                checkpoint, do_lower_case=lower)
        elif model == 'roberta':
            self.model = RobertaModel.from_pretrained(
                checkpoint, output_hidden_states=True)
            self.tokenizer = RobertaTokenizer.from_pretrained(
                checkpoint, do_lower_case=lower)
        elif model == 'spanbert':
            # SpanBERT is loaded in a different way. Earlier
            # "pytorch_transformers" required a .tar.gz URL/file, while the
            # updated "transformers" library requires pytorch_model.bin and
            # config.json separately. That's why the SpanBERT codebase
            # (based on pytorch_pretrained_bert) is kept around and used to
            # initialize the model.
            # NOTE: By default transformer models are initialized to eval()
            # mode! Not using the eval() mode will result in randomness.
            self.model = SpanbertModel.from_pretrained(checkpoint).eval()
            # SpanBERT uses the same tokenizer as BERT, hence the slice that
            # drops the leading "span" from the name. The tokenizer from
            # "transformers" is used since it provides an almost unified API.
            self.tokenizer = BertTokenizer.from_pretrained(
                checkpoint[4:], do_lower_case=lower)

        self.num_layers = self.model.config.num_hidden_layers + 1
        self.hidden_size = self.model.config.hidden_size
    elif model == "xlnet":
        checkpoint = model + "-" + model_size + "-cased"
        self.model = XLNetModel.from_pretrained(
            checkpoint, output_hidden_states=True)
        self.tokenizer = XLNetTokenizer.from_pretrained(
            checkpoint, do_lower_case=lower)
        self.num_layers = self.model.config.num_hidden_layers + 1
        self.hidden_size = self.model.config.hidden_size

    # Set the model name.
    self.model_name = checkpoint

    # Set shift size due to introduction of special tokens.
    if self.base_name == 'xlnet':
        self.start_shift, self.end_shift = 0, 2
    else:
        self.start_shift = 1 if self.tokenizer._cls_token else 0
        self.end_shift = 1 if self.tokenizer._sep_token else 0

    # Freeze the pretrained weights unless fine-tuning was requested.
    if not fine_tune:
        for weight in self.model.parameters():
            weight.requires_grad = False

    if use_proj:
        # Apply a projection layer to the output of the pretrained model and
        # report the projected size as the hidden size.
        self.proj = nn.Linear(self.hidden_size, proj_dim)
        self.hidden_size = proj_dim
    else:
        self.proj = None

    # Parameters required on top of the pretrained model: one weight per
    # layer, initialised to one.
    self.weighing_params = nn.Parameter(torch.ones(self.num_layers))