    type=str,
    help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
parser.add_argument("--no_cuda", action='store_true',
                    help="Avoid using CUDA when available")
args = parser.parse_args()

dir = args.dir
mc_model_path = args.mc_model_path
mc_model = RobertaForMultipleChoice.from_pretrained(mc_model_path)
mc_tokenizer = RobertaTokenizer.from_pretrained(mc_model_path)
mc_model.eval()

device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
sent_encoder = SentenceTransformer('roberta-base-nli-stsb-mean-tokens', device=device)
mc_model.to(device)

tagger = spacy.load("en_core_web_lg")
word_vector = gensim.models.KeyedVectors.load_word2vec_format(
    '/net/nfs.websail/yyv959/counter-fitted-vectors.txt', binary=False)
stop_words = stopwords.words('english')
def __init__(self, model_name='microsoft/codebert-base'):
    # device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    self.tokenizer = RobertaTokenizer.from_pretrained(model_name)
    self.model = RobertaModel.from_pretrained(model_name)
    self.vector_length = self.urls[model_name]
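# Hedged sketch (not part of the original class): one plausible way this CodeBERT
# wrapper could turn a code snippet into a fixed-length vector, assuming torch is
# imported and self.tokenizer / self.model are set up as above. The method name
# `embed` is illustrative, not taken from the source.
def embed(self, code):
    inputs = self.tokenizer(code, return_tensors='pt', truncation=True, max_length=512)
    with torch.no_grad():
        outputs = self.model(**inputs)
    # Mean-pool the token embeddings from the last hidden layer into one vector.
    return outputs.last_hidden_state.mean(dim=1).squeeze(0)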
def main(): parser = argparse.ArgumentParser() parser.add_argument("--input_pattern", default=None, type=str, required=True) parser.add_argument("--output_dir", default=None, type=str, required=True) parser.add_argument("--vocab_file", default=None, type=str, required=True) parser.add_argument("--do_lower_case", action='store_true') parser.add_argument("--max_seq_length", default=512, type=int) parser.add_argument("--doc_stride", default=128, type=int) parser.add_argument("--max_query_length", default=64, type=int) parser.add_argument("--include_unknowns", default=0.03, type=float) parser.add_argument("--max_position", default=50, type=int) parser.add_argument("--num_threads", default=16, type=int) parser.add_argument('--seed', type=int, default=42) parser.add_argument('--start_num', type=int, default=-1) parser.add_argument('--end_num', type=int, default=-1) parser.add_argument('--generate_count', type=int, default=100) parser.add_argument('--hard_mode',type=bool,default=False) parser.add_argument('--DataName',type=str,default="SST") args = parser.parse_args() #tokenizer = BertTokenizer(vocab_file=args.vocab_file, do_lower_case=args.do_lower_case) tokenizer = RobertaTokenizer.from_pretrained("roberta-base") print("Vocab SIze!",tokenizer.vocab_size) prefix = "cached_{0}_{1}_{2}_{3}".format(str(args.max_seq_length), str(args.doc_stride), str(args.max_query_length),args.DataName) prefix = os.path.join(args.output_dir, prefix) os.makedirs(prefix, exist_ok=True) for input_path in glob(args.input_pattern): if args.start_num >= 0 and args.end_num >= 0: continue cached_path = os.path.join(prefix, os.path.split(input_path)[1] + ".pkl") if os.path.exists(cached_path): logging.info("{} already exists.".format(cached_path)) continue is_training = True if input_path.find("train") != -1 else False logging.info("train:{}".format(is_training)) examples = [] train = pd.read_csv(args.input_pattern, sep='\t', header=0) for i in range(len(train)): examples.append(NqExample(train['sentence'][i].split(' '),train['label'][i])) # for dirname in os.listdir(args.input_pattern): # label = dirname.split(".")[0] # dirname = os.path.join(args.input_pattern,dirname) # for filename in os.listdir(dirname): # filepath = os.path.join(dirname,filename) # with open(filepath,'r',encoding='utf-8') as f: # doc_tokens = f.read().split(' ') # examples.append( # NqExample( # doc_tokens=doc_tokens, # label=label)) run_convert_examples_to_features(args=args, examples=examples, tokenizer=tokenizer, is_training=is_training, cached_path=cached_path)
###### Set the seed for generating random numbers
torch.manual_seed(args.seed)
if use_cuda:
    torch.cuda.manual_seed(args.seed)

###### INSTANTIATE MODEL
tokenizer = None
config = None
model = None
successful_download = False
retries = 0

while (retries < 5 and not successful_download):
    try:
        tokenizer = RobertaTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)
        config = RobertaConfig.from_pretrained(PRE_TRAINED_MODEL_NAME,
                                               num_labels=len(CLASS_NAMES),
                                               id2label={0: -1, 1: 0, 2: 1},
                                               label2id={-1: 0, 0: 1, 1: 2})
        config.output_attentions = True
        model = RobertaForSequenceClassification.from_pretrained(
def __init__(self, model_path, base_model='roberta'):
    self.model = ReRanker(base_model=base_model)
    self.model.load_state_dict(torch.load(model_path))
    self.tokenizer = RobertaTokenizer.from_pretrained('roberta-base', max_len=512)
    print("load reranker model from ", model_path)
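# Hedged sketch: how this wrapper might score a (query, passage) pair. The ReRanker
# forward signature and the scalar output assumed here are illustrative guesses, not
# taken from the original code.
def score(self, query, passage):
    inputs = self.tokenizer(query, passage, return_tensors='pt',
                            truncation=True, max_length=512)
    self.model.eval()
    with torch.no_grad():
        return self.model(inputs['input_ids'], inputs['attention_mask']).item()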
from argparse import ArgumentParser
import h5py
import numpy as np
import json

argp = ArgumentParser()
argp.add_argument('--input_path')
argp.add_argument('--output_path')
argp.add_argument('--bert_model', help='code_bert or graph_code_bert')
args = argp.parse_args()
print(args)

# Load pre-trained model tokenizer (vocabulary)
# Crucially, do not do basic tokenization; PTB is tokenized. Just do wordpiece tokenization.
if args.bert_model == 'code_bert':
    tokenizer = RobertaTokenizer.from_pretrained('microsoft/codebert-base')
    model = RobertaModel.from_pretrained('microsoft/codebert-base', output_hidden_states=True)
    LAYER_COUNT = 12
    FEATURE_COUNT = 768
elif args.bert_model == 'graph_code_bert':
    tokenizer = RobertaTokenizer.from_pretrained('microsoft/graphcodebert-base')
    model = RobertaModel.from_pretrained('microsoft/graphcodebert-base', output_hidden_states=True)
    LAYER_COUNT = 12
    FEATURE_COUNT = 768
else:
    raise ValueError("--bert_model must be 'code_bert' or 'graph_code_bert'")

code_list = []  # list of all examples from CodeSearchNet
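# Hedged sketch of the step the LAYER_COUNT / FEATURE_COUNT constants point toward:
# dumping per-layer hidden states for each snippet in code_list to an HDF5 file.
# Assumes code_list has been filled from the input file; the one-dataset-per-example
# layout is an assumption, not taken from the source.
import torch

model.eval()
with h5py.File(args.output_path, 'w') as fout:
    for index, code in enumerate(code_list):
        encoding = tokenizer(code, return_tensors='pt', truncation=True, max_length=512)
        with torch.no_grad():
            outputs = model(**encoding)
        # hidden_states holds the embedding layer plus one tensor per transformer layer;
        # drop the embedding output and stack the remaining LAYER_COUNT layers.
        layers = torch.stack(outputs.hidden_states[1:], dim=0).squeeze(1)
        fout.create_dataset(str(index), data=layers.numpy())  # [LAYER_COUNT, seq_len, FEATURE_COUNT]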
def __init__(self, pretrain_path, max_length, cat_entity_rep=False):
    nn.Module.__init__(self)
    self.roberta = RobertaModel.from_pretrained(pretrain_path)
    self.max_length = max_length
    self.tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    self.cat_entity_rep = cat_entity_rep
    x = torch.mean(x, 0)
    x = self.dropout(x)
    if task_id == 0:
        ret = self.classifier(x)
    elif task_id == 1:
        ret = self.topic_classifier(x)
    return ret


# In[8]:

# Saved only on the first run.
# The order of the token ids could not be fixed with seed_everything, so it changes on every run.
# tokenizer = RobertaTokenizer.from_pretrained('roberta-base', additional_special_tokens=sorted(topic_tokens))
# tokenizer.save_pretrained('../models/topic_tokenizer/')
tokenizer = RobertaTokenizer.from_pretrained('../models/topic_tokenizer/')


# ### Adding topic tokens

# In[9]:

X_val = '[' + val_df.topic_id.map(str).values + '] </s> ' + val_df.description.values
X_val2 = '[' + val2_df.topic_id.map(str).values + '] </s> ' + val2_df.description.values
test_X = '[' + test_df.topic_id.map(str).values + '] </s> ' + test_df.description.values

X_val = np.array(X_val)
X_val2 = np.array(X_val2)
test_X = np.array(test_X)
def __init__(self):
    self.tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
    self.model = RobertaForSequenceClassification.from_pretrained(
        "ghanashyamvtatti/roberta-fake-news")
    self.softmax_fn = torch.nn.Softmax(dim=1)
    self.client = language_v1.LanguageServiceClient()
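# Hedged sketch (not shown in the original snippet): how this detector might score a
# piece of text with the roberta-fake-news checkpoint loaded above. The method name
# and the meaning of the two output columns are assumptions.
def predict_proba(self, text):
    inputs = self.tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        logits = self.model(**inputs).logits
    return self.softmax_fn(logits).squeeze(0).tolist()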
def predict(): """Determine which are yes-ands are not from a given dialogue data set with a finetuned BERT yes-and classifier""" parser = ArgumentParser() parser.add_argument( "--model", default="bert-base-uncased", help= "Provide pretrained model type that is consisten with BERT model that was fine-tuned." ) parser.add_argument( "--model_checkpoint", default="runs/yesand_cornell_bert_base_iter1", help="Provide a directory for a pretrained BERT model.") parser.add_argument( "--data_path", default="data/reformatted_cornell.json", help="Provide a datapath for which predictions will be made.") parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu", help="Device (cuda or cpu)") parser.add_argument( "--predictions_folder", default="data/opus_predictions/", help="Provide a folderpath for which predictions will be saved to.") parser.add_argument("--test", default=False, dest='test', action='store_true', help='runs validation after 1 training step') args = parser.parse_args() logging.basicConfig(level=logging.INFO) logger.info("Arguments: {}".format(pformat(args))) logger.info("Loading model and tokenizer.") if 'roberta' in args.model: model = RobertaForSequenceClassification.from_pretrained( args.model_checkpoint) tokenizer = RobertaTokenizer.from_pretrained(args.model_checkpoint) args.max_len = ROBERTA_MAX_LEN elif 'bert' in args.model: model = BertForSequenceClassification.from_pretrained( args.model_checkpoint) tokenizer = BertTokenizer.from_pretrained(args.model_checkpoint) args.max_len = BERT_MAX_LEN else: error = f"Invalid model type given for args.model: {args.model}. Must either contain 'bert' or 'roberta" logger.info(error) return logger.info("Loading data to predict: {}".format(args.data_path)) if 'opus' in args.data_path: data_to_predict = get_opus_data(args.data_path) else: data_to_predict = get_list_data(args.data_path) logger.info("Building data loader...") prediction_dataloader = get_data_loader(args, data_to_predict, tokenizer) logger.info("Making predictions...") predictions = predict_label(args, model, prediction_dataloader, data_to_predict) logger.info("Predictions complete for {} dialogue pairs. ".format( len(predictions))) logger.info("Saving predictions...") if not Path(args.predictions_folder).is_dir(): Path(args.predictions_folder).mkdir(parents=True, exist_ok=False) identifier = Path(args.data_path).name checkpoint = Path(args.model_checkpoint).name predictions_fp = f"{args.predictions_folder}pred_{checkpoint}_{identifier}" with open(predictions_fp, 'w') as f: json.dump(predictions, f, indent=4) logger.info("Predictions saved to {}.".format(predictions_fp))
def main(): parser = argparse.ArgumentParser() ## Required parameters ############### parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written." ) parser.add_argument("--pretrain_model", default='bert-case-uncased', type=str, required=True, help="Pre-trained model") parser.add_argument("--num_labels_task", default=None, type=int, required=True, help="num_labels_task") parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", default=False, action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", default=False, action='store_true', help="Whether to run eval on the dev set.") parser.add_argument( "--do_lower_case", default=False, action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", default=False, action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( '--fp16', default=False, action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.") parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") parser.add_argument( '--fp16_opt_level', type=str, default='O1', help= "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." 
"See details at https://nvidia.github.io/apex/amp.html") parser.add_argument("--task", default=None, type=int, required=True, help="Choose Task") ############### args = parser.parse_args() processors = Processor_1 num_labels = args.num_labels_task if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info( "device: {}, n_gpu: {}, distributed training: {}, 16-bits training: {}" .format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") if os.path.exists(args.output_dir) and os.listdir( args.output_dir) and args.do_train: raise ValueError( "Output directory ({}) already exists and is not empty.".format( args.output_dir)) os.makedirs(args.output_dir, exist_ok=True) tokenizer = RobertaTokenizer.from_pretrained(args.pretrain_model) train_examples = None num_train_steps = None aspect_list = None sentiment_list = None processor = processors() num_labels = num_labels train_examples, aspect_list, sentiment_list = processor.get_train_examples( args.data_dir) if args.task == 1: num_labels = len(aspect_list) elif args.task == 2: num_labels = len(sentiment_list) else: print("What's task?") exit() num_train_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs) # Prepare model #model = RobertaForSequenceClassification.from_pretrained(args.pretrain_model, num_labels=args.num_labels_task, output_hidden_states=False, output_attentions=False, return_dict=True) model = RobertaForMaskedLMDomainTask.from_pretrained( args.pretrain_model, num_labels=args.num_labels_task, output_hidden_states=False, output_attentions=False, return_dict=True) # Prepare optimizer t_total = num_train_steps if args.local_rank != -1: t_total = t_total // torch.distributed.get_world_size() model.to(device) param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] #no_decay = ['bias', 'LayerNorm.weight'] no_grad = [ 'bert.encoder.layer.11.output.dense_ent', 'bert.encoder.layer.11.output.LayerNorm_ent' ] param_optimizer = [(n, p) for n, p in param_optimizer if not any(nd in n for nd in no_grad)] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=int(t_total * 0.1), num_training_steps=t_total) if args.fp16: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to 
use fp16 training." ) exit() model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) # multi-gpu training (should be after apex fp16 initialization) if n_gpu > 1: model = torch.nn.DataParallel(model) # Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) global_step = 0 if args.do_train: train_features = convert_examples_to_features(train_examples, aspect_list, sentiment_list, args.max_seq_length, tokenizer, args.task) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_attention_mask = torch.tensor( [f.attention_mask for f in train_features], dtype=torch.long) if args.task == 1: print("Excuting the task 1") elif args.task == 2: all_segment_ids = torch.tensor( [f.segment_ids for f in train_features], dtype=torch.long) else: print("Wrong here2") all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) if args.task == 1: train_data = TensorDataset(all_input_ids, all_attention_mask, all_label_ids) elif args.task == 2: train_data = TensorDataset(all_input_ids, all_attention_mask, all_segment_ids, all_label_ids) else: print("Wrong here1") ''' print("========") print(train_data) print(type(train_data)) exit() ''' if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) output_loss_file = os.path.join(args.output_dir, "loss") loss_fout = open(output_loss_file, 'w') model.train() ##########Pre-Pprocess######### ############################### for epoch in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): #batch = tuple(t.to(device) if i != 3 else t for i, t in enumerate(batch)) batch = tuple(t.to(device) for i, t in enumerate(batch)) if args.task == 1: input_ids, attention_mask, label_ids = batch elif args.task == 2: input_ids, attention_mask, segment_ids, label_ids = batch else: print("Wrong here3") if args.task == 1: #loss, logits, hidden_states, attentions #output = model(input_ids=input_ids, token_type_ids=None, attention_mask=attention_mask, labels=label_ids) #loss = output.loss loss, logit = model(input_ids_org=input_ids, token_type_ids=None, attention_mask=attention_mask, sentence_label=label_ids, func="task_class") elif args.task == 2: #loss, logits, hidden_states, attentions #output = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=attention_mask, labels=label_ids) #output = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=attention_mask, labels=label_ids) #output = model(input_ids=input_ids, token_type_ids=None, attention_mask=attention_mask, labels=label_ids) #loss = output.loss loss, logit = model(input_ids_org=input_ids, token_type_ids=None, attention_mask=attention_mask, sentence_label=label_ids, func="task_class") else: print("Wrong!!") if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: ### #optimizer.backward(loss) with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() ### else: loss.backward() loss_fout.write("{}\n".format(loss.item())) tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: # modify learning rate with special warm up BERT uses ### if args.fp16: torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), args.max_grad_norm) else: torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step() model.zero_grad() global_step += 1 ### if epoch < 2: continue else: model_to_save = model.module if hasattr(model, 'module') else model #output_model_file = os.path.join(args.output_dir, "pytorch_model.bin_{}".format(global_step)) output_model_file = os.path.join( args.output_dir, "pytorch_model.bin_{}".format(epoch)) torch.save(model_to_save.state_dict(), output_model_file) # Save a trained model model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_model_file = os.path.join(args.output_dir, "pytorch_model.bin") torch.save(model_to_save.state_dict(), output_model_file)
def convert_luke_checkpoint(checkpoint_path, metadata_path, entity_vocab_path, pytorch_dump_folder_path, model_size):
    # Load configuration defined in the metadata file
    with open(metadata_path) as metadata_file:
        metadata = json.load(metadata_file)
    config = LukeConfig(use_entity_aware_attention=True, **metadata["model_config"])

    # Load in the weights from the checkpoint_path
    state_dict = torch.load(checkpoint_path, map_location="cpu")

    # Load the entity vocab file
    entity_vocab = load_entity_vocab(entity_vocab_path)

    tokenizer = RobertaTokenizer.from_pretrained(metadata["model_config"]["bert_model_name"])

    # Add special tokens to the token vocabulary for downstream tasks
    entity_token_1 = AddedToken("<ent>", lstrip=False, rstrip=False)
    entity_token_2 = AddedToken("<ent2>", lstrip=False, rstrip=False)
    tokenizer.add_special_tokens(dict(additional_special_tokens=[entity_token_1, entity_token_2]))
    config.vocab_size += 2

    print(f"Saving tokenizer to {pytorch_dump_folder_path}")
    tokenizer.save_pretrained(pytorch_dump_folder_path)

    with open(os.path.join(pytorch_dump_folder_path, LukeTokenizer.vocab_files_names["entity_vocab_file"]), "w") as f:
        json.dump(entity_vocab, f)

    tokenizer = LukeTokenizer.from_pretrained(pytorch_dump_folder_path)

    # Initialize the embeddings of the special tokens
    word_emb = state_dict["embeddings.word_embeddings.weight"]
    ent_emb = word_emb[tokenizer.convert_tokens_to_ids(["@"])[0]].unsqueeze(0)
    ent2_emb = word_emb[tokenizer.convert_tokens_to_ids(["#"])[0]].unsqueeze(0)
    state_dict["embeddings.word_embeddings.weight"] = torch.cat([word_emb, ent_emb, ent2_emb])

    # Initialize the query layers of the entity-aware self-attention mechanism
    for layer_index in range(config.num_hidden_layers):
        for matrix_name in ["query.weight", "query.bias"]:
            prefix = f"encoder.layer.{layer_index}.attention.self."
            state_dict[prefix + "w2e_" + matrix_name] = state_dict[prefix + matrix_name]
            state_dict[prefix + "e2w_" + matrix_name] = state_dict[prefix + matrix_name]
            state_dict[prefix + "e2e_" + matrix_name] = state_dict[prefix + matrix_name]

    # Initialize the embedding of the [MASK2] entity using that of the [MASK] entity for downstream tasks
    entity_emb = state_dict["entity_embeddings.entity_embeddings.weight"]
    entity_emb[entity_vocab["[MASK2]"]] = entity_emb[entity_vocab["[MASK]"]]

    model = LukeModel(config=config).eval()

    missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False)
    if not (len(missing_keys) == 1 and missing_keys[0] == "embeddings.position_ids"):
        raise ValueError(f"Missing keys {', '.join(missing_keys)}. Expected only missing embeddings.position_ids")
    if not (all(key.startswith("entity_predictions") or key.startswith("lm_head") for key in unexpected_keys)):
        raise ValueError(
            "Unexpected keys"
            f" {', '.join([key for key in unexpected_keys if not (key.startswith('entity_predictions') or key.startswith('lm_head'))])}"
        )

    # Check outputs
    tokenizer = LukeTokenizer.from_pretrained(pytorch_dump_folder_path, task="entity_classification")

    text = (
        "Top seed Ana Ivanovic said on Thursday she could hardly believe her luck as a fortuitous netcord helped the"
        " new world number one avoid a humiliating second- round exit at Wimbledon ."
    )
    span = (39, 42)
    encoding = tokenizer(text, entity_spans=[span], add_prefix_space=True, return_tensors="pt")

    outputs = model(**encoding)

    # Verify word hidden states
    if model_size == "large":
        expected_shape = torch.Size((1, 42, 1024))
        expected_slice = torch.tensor([[0.0133, 0.0865, 0.0095],
                                       [0.3093, -0.2576, -0.7418],
                                       [-0.1720, -0.2117, -0.2869]])
    else:  # base
        expected_shape = torch.Size((1, 42, 768))
        expected_slice = torch.tensor([[0.0037, 0.1368, -0.0091],
                                       [0.1099, 0.3329, -0.1095],
                                       [0.0765, 0.5335, 0.1179]])
    if not (outputs.last_hidden_state.shape == expected_shape):
        raise ValueError(
            f"Outputs.last_hidden_state.shape is {outputs.last_hidden_state.shape}, Expected shape is {expected_shape}"
        )
    if not torch.allclose(outputs.last_hidden_state[0, :3, :3], expected_slice, atol=1e-4):
        raise ValueError

    # Verify entity hidden states
    if model_size == "large":
        expected_shape = torch.Size((1, 1, 1024))
        expected_slice = torch.tensor([[0.0466, -0.0106, -0.0179]])
    else:  # base
        expected_shape = torch.Size((1, 1, 768))
        expected_slice = torch.tensor([[0.1457, 0.1044, 0.0174]])
    if not (outputs.entity_last_hidden_state.shape == expected_shape):
        raise ValueError(
            f"Outputs.entity_last_hidden_state.shape is {outputs.entity_last_hidden_state.shape}, Expected shape is"
            f" {expected_shape}"
        )
    if not torch.allclose(outputs.entity_last_hidden_state[0, :3, :3], expected_slice, atol=1e-4):
        raise ValueError

    # Finally, save our PyTorch model and tokenizer
    print("Saving PyTorch model to {}".format(pytorch_dump_folder_path))
    model.save_pretrained(pytorch_dump_folder_path)
    DistilBertTokenizer, DistilBertForMaskedLM, \
    RobertaTokenizer, RobertaForMaskedLM

albert_tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
albert_model = AlbertForMaskedLM.from_pretrained('albert-base-v2').eval()

albert_large_tokenizer = AlbertTokenizer.from_pretrained('albert-large-v2')
albert_large_model = AlbertForMaskedLM.from_pretrained('albert-large-v2').eval()

distilbert_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
distilbert_model = DistilBertForMaskedLM.from_pretrained('distilbert-base-cased').eval()

roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-large')
roberta_model = RobertaForMaskedLM.from_pretrained('roberta-large').eval()

top_k = 10


def decode(tokenizer, pred_idx, top_clean):
    ignore_tokens = string.punctuation + '[PAD]'
    tokens = []
    for w in pred_idx:
        token = ''.join(tokenizer.decode(w).split())
        if token not in ignore_tokens:
            tokens.append(token.replace('##', ''))
    return '\n'.join(tokens[:top_clean])
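# Hedged usage sketch for the RoBERTa pair above: fill a single masked position and
# feed the top-k token ids into decode(). This helper is illustrative, not from the
# source; it assumes torch is imported and the input marks the blank with '[MASK]'.
def roberta_top_k(text_with_mask, top_clean=5):
    text = text_with_mask.replace('[MASK]', roberta_tokenizer.mask_token)
    input_ids = roberta_tokenizer.encode(text, return_tensors='pt')
    mask_positions = (input_ids == roberta_tokenizer.mask_token_id).nonzero(as_tuple=True)[1]
    with torch.no_grad():
        logits = roberta_model(input_ids).logits
    # take the k most likely vocabulary ids at the first masked position
    pred_idx = logits[0, mask_positions[0], :].topk(top_k).indices
    return decode(roberta_tokenizer, pred_idx, top_clean)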
def test(config, model_name="ddi_e-5-0.9229.pkl"): lable_error = {1: 0, 2: 0, 3: 0, 4: 0} # vocab = torch.load(os.path.join(config.ROOT_DIR, 'vocab.pt')) #+++ # logging.info('Load pretrained vectors: {}*{}'.format(vocab.word_num, vocab.word_dim)) # logging.info('Number of classes: {}'.format(vocab.class_num)) if config.BERT_MODE == 2: logging.info('Model: {}'.format(config.pretrained_model_name)) tokenizer = RobertaTokenizer.from_pretrained( config.pretrained_model_name, do_lower_case=config.do_lower_case) tokenizer.add_special_tokens( {"additional_special_tokens": ["<e1>", "</e1>", "<e2>", "</e2>"]}) bert_config = RobertaConfig.from_pretrained( config.pretrained_model_name, num_labels=num_labels, finetuning_task=config.task) model = MyRoberta(bert_config, config) model.resize_token_embeddings(len(tokenizer)) if config.BERT_MODE == 3: logging.info('Model: {}'.format(config.pretrained_model_name)) tokenizer = BertTokenizer.from_pretrained( config.pretrained_model_name, do_lower_case=config.do_lower_case) tokenizer.add_special_tokens({ "additional_special_tokens": [ "<e1>", "</e1>", "<e2>", "</e2>", "<e10>", "<e11>", "<e12>", "<e13>", "</e10>", "</e11>", "</e12>", "</e13>", "<e20>", "<e21>", "<e22>", "<e23>", "</e20>", "</e21>", "</e22>", "</e23>" ] }) bert_config = BertConfig.from_pretrained(config.pretrained_model_name, num_labels=num_labels, finetuning_task=config.task) model = Mybert_without_entity_information(bert_config, config) model.resize_token_embeddings(len(tokenizer)) if config.BERT_MODE == 1: logging.info('Model: {}'.format(config.pretrained_model_name)) tokenizer = BertTokenizer.from_pretrained( config.pretrained_model_name, do_lower_case=config.do_lower_case) tokenizer.add_special_tokens({ "additional_special_tokens": [ "<e1>", "</e1>", "<e2>", "</e2>", "<e10>", "<e11>", "<e12>", "<e13>", "</e10>", "</e11>", "</e12>", "</e13>", "<e20>", "<e21>", "<e22>", "<e23>", "</e20>", "</e21>", "</e22>", "</e23>", "drug1", "drug2" ] }) bert_config = BertConfig.from_pretrained(config.pretrained_model_name, num_labels=num_labels, finetuning_task=config.task) model = Mybert(bert_config, config) model.resize_token_embeddings(len(tokenizer)) if config.BERT_MODE == 5: logging.info('Model: {}'.format(config.pretrained_model_name)) tokenizer = BertTokenizer.from_pretrained( config.pretrained_model_name, do_lower_case=config.do_lower_case) tokenizer.add_special_tokens({ "additional_special_tokens": [ "<e1>", "</e1>", "<e2>", "</e2>", "<e10>", "<e11>", "<e12>", "<e13>", "</e10>", "</e11>", "</e12>", "</e13>", "<e20>", "<e21>", "<e22>", "<e23>", "</e20>", "</e21>", "</e22>", "</e23>" ] }) bert_config = BertConfig.from_pretrained(config.pretrained_model_name, num_labels=num_labels, finetuning_task=config.task) model = Mybert_without_attention(bert_config, config) model.resize_token_embeddings(len(tokenizer)) if config.BERT_MODE == 6: logging.info('Model: {}'.format(config.pretrained_model_name)) tokenizer = BertTokenizer.from_pretrained( config.pretrained_model_name, do_lower_case=config.do_lower_case) tokenizer.add_special_tokens({ "additional_special_tokens": [ "<e1>", "</e1>", "<e2>", "</e2>", "<e10>", "<e11>", "<e12>", "<e13>", "</e10>", "</e11>", "</e12>", "</e13>", "<e20>", "<e21>", "<e22>", "<e23>", "</e20>", "</e21>", "</e22>", "</e23>" ] }) bert_config = BertConfig.from_pretrained(config.pretrained_model_name, num_labels=num_labels, finetuning_task=config.task) model = Mybert_without_packedBiGRU(bert_config, config) model.resize_token_embeddings(len(tokenizer)) if config.BERT_MODE == 7: 
logging.info('Model: {}'.format(config.pretrained_model_name)) logging.info('Model: {}'.format("Mybert_startent")) tokenizer = BertTokenizer.from_pretrained( config.pretrained_model_name, do_lower_case=config.do_lower_case) tokenizer.add_special_tokens({ "additional_special_tokens": [ "<e1>", "</e1>", "<e2>", "</e2>", "<e11>", "</e11>", "<e12>", "</e12>", "<e10>", "</e10>", "<e13>", "</e13>", "<e20>", "</e20>", "<e23>", "</e23>", "<e21>", "</e21>", "<e22>", "</e22>" ] }) bert_config = BertConfig.from_pretrained(config.pretrained_model_name, num_labels=num_labels, finetuning_task=config.task) model = Mybert_startent(bert_config, config) model.resize_token_embeddings(len(tokenizer)) test_dataset = torch.load(os.path.join(config.ROOT_DIR, 'test_c.pt')) test_loader = DataLoader(test_dataset, config.BATCH_SIZE, shuffle=True) logging.info('Number of test pair: {}'.format(len(test_dataset))) # num_params = sum(np.prod(p.size()) for p in model.parameters()) # num_embedding_params = np.prod(model.word_emb.weight.size()) + np.prod(model.tag_emb.weight.size()) # print('# of parameters: {}'.format(num_params)) # print('# of word embedding parameters: {}'.format(num_embedding_params)) # print('# of parameters (excluding embeddings): {}'.format(num_params - num_embedding_params)) if model_name is None: model_path = utils.best_model_path(config.SAVE_DIR, config.DATA_SET, i=0) logging.info( 'Loading the best model on validation set: {}'.format(model_path)) model.load_state_dict(torch.load(model_path, map_location='cpu')) else: model_path = os.path.join(config.SAVE_DIR, config.DATA_SET, model_name) model_path = r"checkpoint/BioBert\drugmask\addClassifieddata0.25effect0.125Int0.5\lossweight\lossweight-0.8411.pkl" # model_path = os.path.join('checkpoint/BioBert/biobert_gru2_drop00_ddi_e-5', model_name) logging.info('Loading the model: {}'.format(model_path)) model.load_state_dict(torch.load(model_path, map_location='cpu')) model.eval() model.to(DEVICE) # model.display() torch.set_grad_enabled(False) def run_iter(batch): sent = batch[0].to(DEVICE) mask = batch[1].to(DEVICE) segment = batch[2].to(DEVICE) label = batch[3].to(DEVICE) e1_mask = batch[4].to(DEVICE) e2_mask = batch[5].to(DEVICE) length = batch[6].to(DEVICE) logits = model(input_ids=sent, attention_mask=mask, token_type_ids=segment, labels=label, e1_mask=e1_mask, e2_mask=e2_mask, length=length) label_pred = logits.max(1)[1] return label_pred.cpu() test_labels = [] test_preds = [] for test_batch in test_loader: test_pred = run_iter(batch=test_batch) test_labels.extend(test_batch[3]) test_preds.extend(test_pred) test_p, test_r, test_f1, _ = metrics.precision_recall_fscore_support( test_labels, test_preds, labels=[1, 2, 3, 4], average='micro') test_p_n, test_r_n, test_f1_n, _ = metrics.precision_recall_fscore_support( test_labels, test_preds, labels=[0], average='micro') test_p_a, test_r_a, test_f1_a, _ = metrics.precision_recall_fscore_support( test_labels, test_preds, labels=[1], average='micro') test_p_e, test_r_e, test_f1_e, _ = metrics.precision_recall_fscore_support( test_labels, test_preds, labels=[2], average='micro') test_p_m, test_r_m, test_f1_m, _ = metrics.precision_recall_fscore_support( test_labels, test_preds, labels=[3], average='micro') test_p_i, test_r_i, test_f1_i, _ = metrics.precision_recall_fscore_support( test_labels, test_preds, labels=[4], average='micro') # plt.figure("ROC Curve") # plt.title("ROC Curve") # plt.xlabel('Recall') # plt.ylabel('Precision') # precision, recall, _ = metrics.roc_curve(test_labels, test_preds) # 
plt.plot(recall,precision) # plt.show() # for i, l in enumerate(test_labels): # if l!=test_preds[i] and int(l)!=0: # lable_error[int(l)]+=1 logging.info( 'precision = {:.4f}: recall = {:.4f}, fscore = {:.4f}'.format( test_p, test_r, test_f1)) logging.info( 'negative: precision = {:.4f}: recall = {:.4f}, fscore = {:.4f}'. format(test_p_n, test_r_n, test_f1_n)) logging.info( 'advise: precision = {:.4f}: recall = {:.4f}, fscore = {:.4f}'.format( test_p_a, test_r_a, test_f1_a)) logging.info( 'effect: precision = {:.4f}: recall = {:.4f}, fscore = {:.4f}'.format( test_p_e, test_r_e, test_f1_e)) logging.info( 'mechanism: precision = {:.4f}: recall = {:.4f}, fscore = {:.4f}'. format(test_p_m, test_r_m, test_f1_m)) logging.info( 'int: precision = {:.4f}: recall = {:.4f}, fscore = {:.4f}'.format( test_p_i, test_r_i, test_f1_i))
num_labels = 3 if task == 'c' else 2

# Set tokenizer for different models
if model_name == 'bert':
    if task == 'all':
        model = MTL_Transformer_LSTM(model_name, model_size, args=args)
    else:
        model = BERT(model_size, args=args, num_labels=num_labels)
    tokenizer = BertTokenizer.from_pretrained(f'bert-{model_size}-uncased')
elif model_name == 'roberta':
    if task == 'all':
        model = MTL_Transformer_LSTM(model_name, model_size, args=args)
    else:
        model = RoBERTa(model_size, args=args, num_labels=num_labels)
    tokenizer = RobertaTokenizer.from_pretrained(f'roberta-{model_size}')
elif model_name == 'bert-gate' and task == 'all':
    model_name = model_name.replace('-gate', '')
    model = GatedModel(model_name, model_size, args=args)
    tokenizer = BertTokenizer.from_pretrained(f'bert-{model_size}-uncased')
elif model_name == 'roberta-gate' and task == 'all':
    model_name = model_name.replace('-gate', '')
    model = GatedModel(model_name, model_size, args=args)
    tokenizer = RobertaTokenizer.from_pretrained(f'roberta-{model_size}')

# Move model to correct device
model = model.to(device=device)

if args['ckpt'] != '':
    model.load_state_dict(load(args['ckpt']))
def run(args, config, train_data, valid_data, test_data=None): ############################ PARAMETER SETTING ########################## num_workers = config['dataloader']['n_jobs'] batch_size = config['dataloader']['batch_size'] # learning_rate = config['optimizer']['learning_rate'] # warmup_proportion = config['optimizer']['warmup_proportion'] # save_ckpt_dir = os.path.join(args.save_path, 'checkpoints') audio_length = 3000 epochs = args.epochs tokenizer = RobertaTokenizer.from_pretrained(args.tokenizer_path) ############################## PREPARE DATASET ########################## train_dataset = DownstreamDataset(train_data, tokenizer, audio_length) train_loader = torch.utils.data.DataLoader( dataset=train_dataset, batch_size=batch_size, collate_fn=lambda x: collate(x, tokenizer, config['upstream'][ 'acoustic']), shuffle=True, num_workers=num_workers) valid_dataset = DownstreamDataset(valid_data, tokenizer, audio_length) valid_loader = torch.utils.data.DataLoader( dataset=valid_dataset, batch_size=batch_size, collate_fn=lambda x: collate(x, tokenizer, config['upstream'][ 'acoustic']), shuffle=False, num_workers=num_workers) if test_data is None: test_data = valid_data test_dataset = DownstreamDataset(test_data, tokenizer, audio_length) test_loader = torch.utils.data.DataLoader( dataset=test_dataset, batch_size=batch_size, collate_fn=lambda x: collate(x, tokenizer, config['upstream'][ 'acoustic']), shuffle=False, num_workers=num_workers) ########################### CREATE MODEL ################################# model = MultiModalEncoderDecoder( ckpt_path=args.ckpt_path, num_classes=config['downstream']['label_num']) model.cuda() no_decay = ['bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], 'weight_decay': 0.01 }, { 'params': [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], 'weight_decay': 0.0 }] optimizer = AdamW(optimizer_grouped_parameters, lr=1e-5) scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs) ########################### TRAINING ##################################### count, best_metric, save_metric, best_epoch = 0, -np.inf, None, 0 for epoch in range(epochs): epoch_train_loss = [] model.train() start_time = time.time() time.sleep( 2 ) # avoid the deadlock during the switch between the different dataloaders progress = tqdm(train_loader, desc='Epoch {:0>3d}'.format(epoch)) for acoustic_inputs, semantic_inputs, label_inputs, _ in progress: speech_inputs = acoustic_inputs[0].cuda() speech_attention_mask = acoustic_inputs[1].cuda() text_inputs = semantic_inputs[0].cuda() text_attention_mask = semantic_inputs[1].cuda() label_inputs = label_inputs.cuda() model.zero_grad() logits, _ = model( text_encoder_inputs=speech_inputs, text_encoder_attention_mask=speech_attention_mask, text_decoder_inputs=text_inputs, text_decoder_attention_mask=text_attention_mask, speech_encoder_inputs=text_inputs, speech_encoder_attention_mask=text_attention_mask, speech_decoder_inputs=speech_inputs, speech_decoder_attention_mask=speech_attention_mask, ) loss = loss_fn(logits, label_inputs, num_classes=config['downstream']['label_num']) epoch_train_loss.append(loss) loss.backward() optimizer.step() scheduler.step() count += 1 acc_train_loss = torch.mean( torch.tensor(epoch_train_loss)).cpu().detach().numpy() progress.set_description("Epoch {:0>3d} - Loss {:.4f}".format( epoch, acc_train_loss)) model.eval() pred_y, true_y = [], [] with 
torch.no_grad(): time.sleep( 2 ) # avoid the deadlock during the switch between the different dataloaders for acoustic_inputs, semantic_inputs, label_inputs, _ in valid_loader: speech_inputs = acoustic_inputs[0].cuda() speech_attention_mask = acoustic_inputs[1].cuda() text_inputs = semantic_inputs[0].cuda() text_attention_mask = semantic_inputs[1].cuda() true_y.extend(list(label_inputs.numpy())) logits, hiddens = model( text_encoder_inputs=speech_inputs, text_encoder_attention_mask=speech_attention_mask, text_decoder_inputs=text_inputs, text_decoder_attention_mask=text_attention_mask, speech_encoder_inputs=text_inputs, speech_encoder_attention_mask=text_attention_mask, speech_decoder_inputs=speech_inputs, speech_decoder_attention_mask=speech_attention_mask, ) if config['downstream']['label_num'] == 1: prediction = logits.view(-1) label_outputs = prediction.cpu().detach().numpy().astype( float) else: if args.task_name == "verification": # for speaker verification we take the hidden before the classifier as the output label_outputs = hiddens.cpu().detach().numpy().astype( float) else: prediction = torch.argmax(logits, axis=1) label_outputs = prediction.cpu().detach().numpy( ).astype(int) pred_y.extend(list(label_outputs)) # think about the metric calculation key_metric, report_metric = downstream_metrics(pred_y, true_y, args.task_name) epoch_train_loss = torch.mean( torch.tensor(epoch_train_loss)).cpu().detach().numpy() elapsed_time = time.time() - start_time print("The time elapse of epoch {:03d}".format(epoch) + " is: " + time.strftime("%H: %M: %S", time.gmtime(elapsed_time))) print('Valid Metric: {} - Train Loss: {:.3f}'.format( ' - '.join([ '{}: {:.3f}'.format(key, value) for key, value in report_metric.items() ]), epoch_train_loss)) if key_metric > best_metric: best_metric, best_epoch = key_metric, epoch print('Better Metric found on dev, calculate performance on Test') pred_y, true_y = [], [] with torch.no_grad(): time.sleep( 2 ) # avoid the deadlock during the switch between the different dataloaders for acoustic_inputs, semantic_inputs, label_inputs, _ in test_loader: speech_inputs = acoustic_inputs[0].cuda() speech_attention_mask = acoustic_inputs[1].cuda() text_inputs = semantic_inputs[0].cuda() text_attention_mask = semantic_inputs[1].cuda() true_y.extend(list(label_inputs.numpy())) logits, hiddens = model( text_encoder_inputs=speech_inputs, text_encoder_attention_mask=speech_attention_mask, text_decoder_inputs=text_inputs, text_decoder_attention_mask=text_attention_mask, speech_encoder_inputs=text_inputs, speech_encoder_attention_mask=text_attention_mask, speech_decoder_inputs=speech_inputs, speech_decoder_attention_mask=speech_attention_mask, ) if config['downstream']['label_num'] == 1: prediction = logits.view(-1) label_outputs = prediction.cpu().detach().numpy( ).astype(float) else: if args.task_name == "verification": label_outputs = hiddens.cpu().detach().numpy( ).astype(float) else: prediction = torch.argmax(logits, axis=1) label_outputs = prediction.cpu().detach().numpy( ).astype(int) pred_y.extend(list(label_outputs)) _, save_metric = downstream_metrics(pred_y, true_y, args.task_name) print("Test Metric: {}".format(' - '.join([ '{}: {:.3f}'.format(key, value) for key, value in save_metric.items() ]))) print("End. Best epoch {:03d}: {}".format( best_epoch, ' - '.join([ '{}: {:.3f}'.format(key, value) for key, value in save_metric.items() ]))) return save_metric
def _define_tokenizer(self):
    return RobertaTokenizer.from_pretrained(self.params["model_name"],
                                            do_lower_case=True)
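# Hedged usage sketch: one way the helper above could be wired into an encode step,
# assuming self.params["model_name"] names a RoBERTa checkpoint such as 'roberta-base'.
# The method name and max_length default are illustrative, not from the source.
def _encode(self, text, max_length=128):
    tokenizer = self._define_tokenizer()
    return tokenizer(text, truncation=True, padding='max_length',
                     max_length=max_length, return_tensors='pt')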
def train(train_Xy, n_epochs=4, batch_size=4): # val_Xy tokenizer = RobertaTokenizer.from_pretrained("allenai/biomed_roberta_base") model = RobertaForSequenceClassification.from_pretrained( "allenai/biomed_roberta_base").to(device=device) #from transformers import Adam, AdamW from transformers import AdamW #optimizer = AdamW(model.parameters()) optimizer = torch.optim.SGD(model.parameters(), lr=0.01) best_val = np.inf train_epoch_loss = 0 for epoch in range(n_epochs): model.train() print("on epoch ", epoch) train_epoch_loss = 0 batch_X, batch_y = [], [] cur_batch_size = 0 for i, article in enumerate(train_Xy): if (i % 100) == 0: print("on article", i) # sample instances from current article cur_X, cur_y = instances_from_article(article, max_instances=batch_size - cur_batch_size) batch_X.extend(cur_X) batch_y.extend(cur_y) cur_batch_size += len(cur_X) if cur_batch_size >= batch_size: optimizer.zero_grad() batch_X_tensor = tokenizer.batch_encode_plus( batch_X[:batch_size], max_length=512, add_special_tokens=True, pad_to_max_length=True) batch_y_tensor = torch.tensor(batch_y[:batch_size]) loss, logits = model( torch.tensor( batch_X_tensor['input_ids']).to(device=device), attention_mask=torch.tensor( batch_X_tensor['attention_mask']).to(device=device), labels=batch_y_tensor.to(device=device)) train_epoch_loss += loss.cpu().detach().numpy() #import pdb; pdb.set_trace() #print("batch loss: {}".format(loss)) loss.backward() optimizer.step() # empty out current batch cur_batch_size = 0 batch_X, batch_y = [], [] print("total epoch train loss {}".format(train_epoch_loss)) #### # eval on val set ### print("evaluating on val...") model.eval() total_correct, total_preds = 0, 0 val_loss = 0 for j, article in enumerate(val_Xy): val_X, val_y = instances_from_article(article, max_instances=batch_size) val_X_tensor = tokenizer.batch_encode_plus(val_X[:batch_size], max_length=512, add_special_tokens=True, pad_to_max_length=True) val_y_tensor = torch.tensor(val_y[:batch_size]) loss, logits = model( torch.tensor(val_X_tensor['input_ids']).to(device=device), attention_mask=torch.tensor( val_X_tensor['attention_mask']).to(device=device), labels=torch.tensor(val_y_tensor).to(device=device)) val_loss += loss.cpu().detach().numpy() class_preds = torch.argmax(logits, dim=1).detach().cpu() total_correct += (class_preds == val_y_tensor).sum() total_preds += len(val_X) #import pdb; pdb.set_trace() val_acc = total_correct / float( total_preds) # note that the baseline depends on neg samples print("val loss, acc after epoch {} is: {}, {}".format( epoch, val_loss, val_acc)) if val_loss < best_val: print("new best loss: {}".format(val_loss)) best_val = val_loss torch.save(model.state_dict(), "inference.model")
def evaluate(args): """ Evaluate a masked language model using CrowS-Pairs dataset. """ print("Evaluating:") print("Input:", args.input_file) print("Model:", args.lm_model) print("=" * 100) logging.basicConfig(level=logging.INFO) # load data into panda DataFrame df_data = read_data(args.input_file) # supported masked language models if args.lm_model == "scibert-bert": tokenizer = BertTokenizer.from_pretrained('allenai/scibert_scivocab_uncased') model = BertForMaskedLM.from_pretrained('allenai/scibert_scivocab_uncased') uncased = True elif args.lm_model == "biobert-bert": tokenizer = BertTokenizer.from_pretrained('dmis-lab/biobert-v1.1') model = BertForMaskedLM.from_pretrained('dmis-lab/biobert-v1.1') uncased = True elif args.lm_model == "scibert-roberta": tokenizer = RobertaTokenizer.from_pretrained('allenai/scibert_scivocab_uncased') model = RobertaForMaskedLM.from_pretrained('allenai/scibert_scivocab_uncased') uncased = True elif args.lm_model == "biobert-roberta": tokenizer = RobertaTokenizer.from_pretrained('dmis-lab/biobert-v1.1') model = RobertaForMaskedLM.from_pretrained('dmis-lab/biobert-v1.1') uncased = True elif args.lm_model == "roberta": tokenizer = RobertaTokenizer.from_pretrained('roberta-large') model = RobertaForMaskedLM.from_pretrained('roberta-large') uncased = False elif args.lm_model == "bert": tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') model = BertForMaskedLM.from_pretrained('bert-base-uncased') uncased = True model.eval() model.to('cuda') mask_token = tokenizer.mask_token log_softmax = torch.nn.LogSoftmax(dim=0) vocab = tokenizer.get_vocab() with open(args.lm_model + ".vocab", "w") as f: f.write(json.dumps(vocab)) lm = {"model": model, "tokenizer": tokenizer, "mask_token": mask_token, "log_softmax": log_softmax, "uncased": uncased } # score each sentence. # each row in the dataframe has the sentid and score for pro and anti stereo. 
df_score = pd.DataFrame(columns=['sent_more', 'sent_less', 'sent_more_score', 'sent_less_score', 'score', 'stereo_antistereo', 'bias_type']) total_stereo, total_antistereo = 0, 0 stereo_score, antistereo_score = 0, 0 N = 0 neutral = 0 total = len(df_data.index) with tqdm(total=total) as pbar: for index, data in df_data.iterrows(): direction = data['direction'] bias = data['bias_type'] score = mask_unigram(data, lm) for stype in score.keys(): score[stype] = round(score[stype], 3) N += 1 pair_score = 0 pbar.update(1) if score['sent1_score'] == score['sent2_score']: neutral += 1 else: if direction == 'stereo': total_stereo += 1 if score['sent1_score'] > score['sent2_score']: stereo_score += 1 pair_score = 1 elif direction == 'antistereo': total_antistereo += 1 if score['sent2_score'] > score['sent1_score']: antistereo_score += 1 pair_score = 1 sent_more, sent_less = '', '' if direction == 'stereo': sent_more = data['sent1'] sent_less = data['sent2'] sent_more_score = score['sent1_score'] sent_less_score = score['sent2_score'] else: sent_more = data['sent2'] sent_less = data['sent1'] sent_more_score = score['sent1_score'] sent_less_score = score['sent2_score'] df_score = df_score.append({'sent_more': sent_more, 'sent_less': sent_less, 'sent_more_score': sent_more_score, 'sent_less_score': sent_less_score, 'score': pair_score, 'stereo_antistereo': direction, 'bias_type': bias }, ignore_index=True) df_score.to_csv(args.output_file) print('=' * 100) print('Total examples:', N) print('Metric score:', round((stereo_score + antistereo_score) / N * 100, 2)) print('Stereotype score:', round(stereo_score / total_stereo * 100, 2)) if antistereo_score != 0: print('Anti-stereotype score:', round(antistereo_score / total_antistereo * 100, 2)) print("Num. neutral:", neutral, round(neutral / N * 100, 2)) print('=' * 100) print()
def main(): parser = argparse.ArgumentParser() ## Required parameters ############### parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written." ) parser.add_argument("--pretrain_model", default='bert-case-uncased', type=str, required=True, help="Pre-trained model") parser.add_argument("--num_labels_task", default=None, type=int, required=True, help="num_labels_task") parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", default=False, action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", default=False, action='store_true', help="Whether to run eval on the dev set.") parser.add_argument( "--do_lower_case", default=False, action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--eval_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", default=False, action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( '--fp16', default=False, action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.") parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") parser.add_argument( '--fp16_opt_level', type=str, default='O1', help= "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." 
"See details at https://nvidia.github.io/apex/amp.html") parser.add_argument("--task", default=None, type=int, required=True, help="Choose Task") ############### args = parser.parse_args() #print(args.do_train, args.do_eval) #exit() processors = Processor_1 num_labels = args.num_labels_task if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info( "device: {}, n_gpu: {}, distributed training: {}, 16-bits training: {}" .format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) #args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") ''' if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train: raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir)) ''' os.makedirs(args.output_dir, exist_ok=True) tokenizer = RobertaTokenizer.from_pretrained(args.pretrain_model) train_examples = None num_train_steps = None aspect_list = None sentiment_list = None processor = processors() num_labels = num_labels #train_examples, aspect_list, sentiment_list = processor.get_train_examples(args.data_dir) filenames = os.listdir(args.output_dir) filenames = [x for x in filenames if "pytorch_model.bin_" in x] print(filenames) file_mark = [] model_performace = dict() for x in filenames: #file_mark.append([x, True]) file_mark.append([x, False]) #### #### test_examples, aspect_list, sentiment_list = processor.get_test_examples( args.data_dir) if args.task == 1: num_labels = len(aspect_list) elif args.task == 2: num_labels = len(sentiment_list) else: print("What's task?") exit() test = convert_examples_to_features(test_examples, aspect_list, sentiment_list, args.max_seq_length, tokenizer, args.task) eval_examples = test_examples ### for x, mark in file_mark: print(x, mark) output_model_file = os.path.join(args.output_dir, x) #model = RobertaForSequenceClassification.from_pretrained(args.pretrain_model, num_labels=num_labels, output_hidden_states=False, output_attentions=False, return_dict=True) model = RobertaForMaskedLMDomainTask.from_pretrained( args.pretrain_model, output_hidden_states=False, output_attentions=False, return_dict=True, num_labels=args.num_labels_task) model.load_state_dict(torch.load(output_model_file), strict=False) #strict False: ignore non-matching keys #param_optimizer = [para[0] for para in model.named_parameters()] #param_optimizer = [para for para in model.named_parameters()][-2] #print(param_optimizer) model.to(device) if mark: eval_features = dev else: eval_features = test logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_attention_mask = torch.tensor( 
[f.attention_mask for f in eval_features], dtype=torch.long) if args.task == 1: print("Excuting the task 1") elif args.task == 2: all_segment_ids = torch.tensor( [f.segment_ids for f in eval_features], dtype=torch.long) else: print("Wrong here2") all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) if args.task == 1: eval_data = TensorDataset(all_input_ids, all_attention_mask, all_label_ids) elif args.task == 2: eval_data = TensorDataset(all_input_ids, all_attention_mask, all_segment_ids, all_label_ids) else: print("Wrong here1") if args.local_rank == -1: eval_sampler = RandomSampler(eval_data) else: eval_sampler = DistributedSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) if mark: output_eval_file = os.path.join( args.output_dir, "eval_results_{}.txt".format(x.split("_")[-1])) output_file_pred = os.path.join( args.output_dir, "eval_pred_{}.txt".format(x.split("_")[-1])) output_file_glod = os.path.join( args.output_dir, "eval_gold_{}.txt".format(x.split("_")[-1])) else: output_eval_file = os.path.join( args.output_dir, "test_results_{}.txt".format(x.split("_")[-1])) output_file_pred = os.path.join( args.output_dir, "test_pred_{}.txt".format(x.split("_")[-1])) output_file_glod = os.path.join( args.output_dir, "test_gold_{}.txt".format(x.split("_")[-1])) fpred = open(output_file_pred, "w") fgold = open(output_file_glod, "w") model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for step, batch in enumerate(tqdm(eval_dataloader, desc="Iteration")): #batch = tuple(t.to(device) if i != 3 else t for i, t in enumerate(batch)) batch = tuple(t.to(device) for i, t in enumerate(batch)) if args.task == 1: input_ids, attention_mask, label_ids = batch elif args.task == 2: input_ids, attention_mask, segment_ids, label_ids = batch else: print("Wrong here3") if args.task == 1: #loss, logits, hidden_states, attentions ''' output = model(input_ids=input_ids, token_type_ids=None, attention_mask=attention_mask, labels=label_ids) logits = output.logits tmp_eval_loss = output.loss ''' # tmp_eval_loss, logits = model(input_ids_org=input_ids, sentence_label=label_ids, attention_mask=attention_mask, func="task_class") #logits = output.logits #tmp_eval_loss = output.loss elif args.task == 2: #loss, logits, hidden_states, attentions ''' output = model(input_ids=input_ids, token_type_ids=None, attention_mask=attention_mask, labels=label_ids) logits = output.logits tmp_eval_loss = output.loss ''' # tmp_eval_loss, logits = model(input_ids_org=input_ids, sentence_label=label_ids, attention_mask=attention_mask, func="task_class") #exit() #logits = output.logits #tmp_eval_loss = output.loss else: print("Wrong!!") logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() tmp_eval_accuracy, pred = accuracy(logits, label_ids) for a, b in zip(pred, label_ids): fgold.write("{}\n".format(b)) fpred.write("{}\n".format(a)) eval_loss += tmp_eval_loss.mean().item() eval_accuracy += tmp_eval_accuracy nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 eval_loss = eval_loss / nb_eval_steps eval_accuracy = eval_accuracy / nb_eval_examples result = {'eval_loss': eval_loss, 'eval_accuracy': eval_accuracy} with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) model_performace[x] = eval_accuracy ################# 
################# '''
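# The evaluation loop above relies on an `accuracy(logits, label_ids)` helper
# that is not shown in this excerpt. A minimal sketch of what such a helper
# typically looks like (hypothetical, numpy-based), returning the number of
# correct predictions and the predicted class ids so it matches the usage
# `tmp_eval_accuracy, pred = accuracy(logits, label_ids)`:
import numpy as np

def accuracy(logits, label_ids):
    """Return (number of correct predictions, predicted class ids)."""
    preds = np.argmax(logits, axis=1)
    return int((preds == label_ids).sum()), preds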
def __init__(self, pretrain_path, max_length):
    nn.Module.__init__(self)
    self.roberta = RobertaForSequenceClassification.from_pretrained(
        pretrain_path, num_labels=2)
    self.max_length = max_length
    self.tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
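# The rest of this wrapper class is not shown. A hedged sketch of how a forward
# pass for such a module might look; the `forward` below is hypothetical and
# assumes raw sentence pairs are passed in and tokenized on the fly.
def forward(self, sentences_a, sentences_b):
    inputs = self.tokenizer(sentences_a, sentences_b,
                            padding=True, truncation=True,
                            max_length=self.max_length, return_tensors='pt')
    outputs = self.roberta(**inputs)
    return outputs.logits  # shape: (batch_size, 2)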
                    help='output model path and name')
parser.add_argument('--benchmark',
                    action='store_true',
                    default=False,
                    help='Get benchmark performance of quantized model.')
parser.add_argument('--benchmark_nums',
                    type=int,
                    default=1000,
                    help="Number of samples to benchmark")
parser.add_argument('--mode',
                    type=str,
                    default='performance',
                    choices=['performance', 'accuracy'],
                    help="Benchmark mode")
args = parser.parse_args()

tokenizer = RobertaTokenizer.from_pretrained(args.input_dir, do_lower_case=True)
eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (args.task_name, )
eval_dataset = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=True)

# Note that DistributedSampler samples randomly
eval_sampler = SequentialSampler(eval_dataset)
eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler,
                             batch_size=args.eval_batch_size)

def eval_func(model):
    return evaluate_onnxrt(args, model, tokenizer, eval_dataloader)
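# `evaluate_onnxrt` is not shown in this excerpt. Below is a hedged, minimal
# sketch of what an ONNX Runtime evaluation over `eval_dataloader` could look
# like; it assumes the exported model takes input_ids and attention_mask and
# that batches unpack as (input_ids, attention_mask, labels) — both assumptions.
import numpy as np
import onnxruntime as ort

def evaluate_onnxrt_sketch(model_path, eval_dataloader):
    session = ort.InferenceSession(model_path)
    input_names = [inp.name for inp in session.get_inputs()]
    correct, total = 0, 0
    for input_ids, attention_mask, labels in eval_dataloader:
        feeds = dict(zip(input_names, [input_ids.numpy(), attention_mask.numpy()]))
        logits = session.run(None, feeds)[0]
        correct += int((np.argmax(logits, axis=1) == labels.numpy()).sum())
        total += labels.shape[0]
    return correct / total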
    'cnn_filters': 300,
    'cnn_kernel_size': 5,
    'init_lr': 1e-4,
    'max_lr': 8e-4
}

bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                               lowercase=True,
                                               add_special_tokens=True)
albert_tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2',
                                                   lowercase=True,
                                                   add_special_tokens=True)
roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base',
                                                     lowercase=True,
                                                     add_special_tokens=True)
xlnet_tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased',
                                                 lowercase=True,
                                                 add_special_tokens=True)

def data_generator(f_path, params):
    with open(f_path) as f:
        for line in f:
            line = line.rstrip()
            text, slot_intent = line.split('\t')
            words = text.split()[1:-1]
            slot_intent = slot_intent.split()
            slots, intent = slot_intent[1:-1], slot_intent[-1]
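# For reference, a small hedged example of the tab-separated line format that
# data_generator above appears to assume: both the text and the slot/intent
# sequence carry leading/trailing markers that the [1:-1] slices strip, and the
# last slot token is the utterance intent. The sample values are illustrative.
sample = "BOS i want to fly to boston EOS\tO O O O O O B-toloc.city_name atis_flight"
text, slot_intent = sample.split('\t')
words = text.split()[1:-1]                          # ['i', 'want', 'to', 'fly', 'to', 'boston']
slot_intent = slot_intent.split()
slots, intent = slot_intent[1:-1], slot_intent[-1]  # 6 slots aligned with words; intent = 'atis_flight'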
def setup_python_tokenizer(self):
    self.base_tokenizer = RobertaTokenizer.from_pretrained('roberta-base',
                                                           do_lower_case=False,
                                                           cache_dir=self.test_dir)
# In[2]:

#################################################################
### Step 1
#################################################################

from torch.utils.data import TensorDataset, random_split
from torch.utils.data import DataLoader, SequentialSampler
import json

from transformers import RobertaTokenizer

# Load the RoBERTa tokenizer.
print('Loading RoBERTa tokenizer...')
tokenizer = RobertaTokenizer.from_pretrained('roberta-large', do_lower_case=True)

from transformers import RobertaForSequenceClassification, AdamW, RobertaConfig

# Load RobertaForSequenceClassification, the pretrained RoBERTa model with a single
# linear classification layer on top.
model = RobertaForSequenceClassification.from_pretrained(
    './step_1_casual_sentence_classifier_model',  # use my stored model
    num_labels=2,  # The number of output labels--2 for binary classification.
                   # You can increase this for multi-class tasks.
    output_attentions=False,  # Whether the model returns attention weights.
    output_hidden_states=False,  # Whether the model returns all hidden-states.
)

# Tell pytorch to run this model on the GPU.
model.cuda()
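# A hedged sketch of the tokenization step that typically follows in this kind
# of notebook: encode a (placeholder) list of sentences with the tokenizer
# loaded above and wrap the tensors in a TensorDataset/DataLoader, matching the
# imports at the top of this cell. The sentences and batch size are assumptions.
import torch

sentences = ["An example sentence.", "Another example sentence."]  # placeholder data
encoded = tokenizer(sentences,
                    padding='max_length',
                    truncation=True,
                    max_length=128,
                    return_tensors='pt')
dataset = TensorDataset(encoded['input_ids'], encoded['attention_mask'])
dataloader = DataLoader(dataset, sampler=SequentialSampler(dataset), batch_size=32)

model.eval()
with torch.no_grad():
    for input_ids, attention_mask in dataloader:
        logits = model(input_ids.cuda(), attention_mask=attention_mask.cuda())[0]
        preds = torch.argmax(logits, dim=1)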
                    type=str,
                    help='directory that contains corpus files to be encoded, in jsonl format.',
                    required=True)
parser.add_argument('--index',
                    type=str,
                    help='directory to store brute force index of corpus',
                    required=True)
parser.add_argument('--batch', type=int, help='batch size', default=8)
parser.add_argument('--device',
                    type=str,
                    help='device cpu or cuda [cuda:0, cuda:1...]',
                    default='cuda:0')
args = parser.parse_args()

tokenizer = RobertaTokenizer.from_pretrained(args.encoder)
model = AnceEncoder.from_pretrained(args.encoder)
model.to(args.device)

index = faiss.IndexFlatIP(args.dimension)

if not os.path.exists(args.index):
    os.mkdir(args.index)

texts = []
with open(os.path.join(args.index, 'docid'), 'w') as id_file:
    for file in sorted(os.listdir(args.corpus)):
        file = os.path.join(args.corpus, file)
        if file.endswith('json') or file.endswith('jsonl'):
            print(f'Loading {file}')
            with open(file, 'r') as corpus:
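# The corpus-reading loop above is truncated in this excerpt. A hedged sketch of
# the usual continuation for brute-force dense indexing: read jsonl records,
# encode them in batches with the encoder, and add the vectors to the
# IndexFlatIP while recording doc ids. The 'id'/'contents' field names and the
# pooled-embedding return value of AnceEncoder are assumptions.
import json
import numpy as np
import torch

def encode_batch(batch_texts):
    inputs = tokenizer(batch_texts, padding=True, truncation=True,
                       max_length=512, return_tensors='pt').to(args.device)
    with torch.no_grad():
        embeddings = model(**inputs)  # assumed to return (batch, dim) pooled embeddings
    return np.asarray(embeddings.cpu().numpy(), dtype='float32')

def index_jsonl(corpus_file, id_file, batch_size):
    batch = []
    for line in corpus_file:
        record = json.loads(line)
        id_file.write(record['id'] + '\n')
        batch.append(record['contents'])
        if len(batch) == batch_size:
            index.add(encode_batch(batch))
            batch = []
    if batch:
        index.add(encode_batch(batch))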
        dev_set.append(line)

print('building dataloaders ...')
if args.model == 'visualbert':
    with open(f'{args.images_features}/images_features_dict.pkl', 'rb') as f:
        images_features_dict = pickle.load(f)
else:
    images_features_dict = None

if args.model == 'visualbert':
    config = BertConfig.from_pretrained('bert-base-uncased')
    tkz = BertTokenizer.from_pretrained('bert-base-uncased')
else:
    config = RobertaConfig.from_pretrained('roberta-base')
    tkz = RobertaTokenizer.from_pretrained('roberta-base')

print("train set")
train_dataloader = create(data=train_set,
                          datatype='train',
                          batch_size=args.train_batch_size,
                          images_features_dict=images_features_dict,
                          tkz=tkz,
                          config=config)
print("dev set")
dev_dataloader = create(data=dev_set,
                        datatype='dev',
                        batch_size=args.dev_batch_size,
                        images_features_dict=images_features_dict,
                        tkz=tkz,
                        config=config)
    cluster_flag = False
    pass

if cluster_flag:
    uncompress_object(args.pretrained, ".")
    train_df = pd.read_csv(args.traindata)
    test_df = pd.read_csv(args.testdata)
else:
    print("local file reading")
    train_df = pd.read_csv('notebooks/files/unlabel_train1.csv')
    test_df = pd.read_csv('notebooks/files/unlabel_test1.csv')

Num_label = len(train_df.label_id.value_counts())
device = torch.device(args.device)

tokenizer = RobertaTokenizer.from_pretrained("./pretrained", do_lower_case=False)
model = TransferRobertaNet(path="./pretrained",
                           embedding_dim=768,
                           num_class=Num_label,
                           num_class1=args.classes)
criterion = FocalLoss(alpha=0.97, reduce=True)
model.to(device)
criterion.to(device)
optimizer = Adam(model.parameters(), lr=0.00008)

if args.scheduler == "cosine":
    scheduler = lr_scheduler.CosineAnnealingLR(optimizer=optimizer, T_max=10, eta_min=0)
else:
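# `FocalLoss` is not defined in this excerpt. A hedged sketch of a common
# multi-class focal loss formulation, FL = -alpha * (1 - p_t)^gamma * log(p_t),
# with constructor arguments mirroring the call above; gamma=2 is an assumption.
import torch
import torch.nn as nn
import torch.nn.functional as F

class FocalLoss(nn.Module):
    def __init__(self, alpha=0.97, gamma=2.0, reduce=True):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.reduce = reduce

    def forward(self, logits, targets):
        ce = F.cross_entropy(logits, targets, reduction='none')  # per-sample -log(p_t)
        pt = torch.exp(-ce)                                      # p_t
        loss = self.alpha * (1 - pt) ** self.gamma * ce
        return loss.mean() if self.reduce else loss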
parser.add_argument("--t_model", default="MIXED", help="Type of trained model ('MRPC', 'MIXED')") # parser.add_argument("--f_model", default="MRPC_model.pkl", help="Trained model file") # parser.add_argument("--t_model", default="MRPC", help="Type of trained model ('MRPC', 'MIXED')") parser.add_argument("--max_seq_length", default=128, help="Max sequence length") args = parser.parse_args() # Set device device = set_device(args.device) # Define Tokenizer tokenizer = RobertaTokenizer.from_pretrained("roberta-base") # Define dataset and data loader raw_train_data = load_dataset('glue', 'mrpc', split='train') raw_val_data = load_dataset('glue', 'mrpc', split='validation') raw_test_data = load_dataset('glue', 'mrpc', split='test') # Define model manager manager = ModelManager(args.models_dir) # Define model if args.t_model == 'MRPC': model = ROBERTAOnMRPC() elif args.t_model == 'MIXED': model = ROBERTA_FT_MRPC(ROBERTAOnSTS()) else: raise ("Expected 'MRPC' or 'MIXED', got '{}'".format(args.t_model))
from transformers import RobertaTokenizer, RobertaModel
import os
import torch
import xml.etree.ElementTree as ET
import nltk
from nltk.tokenize import sent_tokenize
import numpy as np
from os import listdir
from os.path import isfile, join

os.environ["CUDA_VISIBLE_DEVICES"] = "0"
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', unk_token='<unk>')

import spacy
nlp = spacy.load("en_core_web_sm")
#model = RobertaModel.from_pretrained('roberta-base')
space = ' '
#dir_name = "/shared/why16gzl/logic_driven/Quizlet/Quizlet_2/LDC2020E20_KAIROS_Quizlet_2_TA2_Source_Data_V1.0/data/ltf/ltf/"
#file_name = "K0C03N4LR.ltf.xml"  # Use ltf_reader
#dir_name = "/home1/w/why16gzl/KAIROS/hievents_v2/processed/"
#file_name = "article-10901.tsvx"  # Use tsvx_reader

# ============================
# PoS Tagging
# ============================
pos_tags = [
    "ADJ", "ADP", "ADV", "AUX", "CONJ", "CCONJ", "DET", "INTJ", "NOUN", "NUM",
    "PART", "PRON", "PROPN", "PUNCT", "SCONJ", "SYM", "VERB", "X", "SPACE"
]
identity_matrix = np.identity(len(pos_tags))
postag_to_OneHot = {}
postag_to_OneHot["None"] = np.zeros(len(pos_tags))
for (index, item) in enumerate(pos_tags):
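# The loop body is cut off in this excerpt. A hedged completion consistent with
# the setup above: map every tag to its one-hot row of the identity matrix, then
# look tags up from spaCy's token.pos_ attribute. The lookup helper and example
# sentence are illustrative additions, not part of the original file.
for (index, item) in enumerate(pos_tags):
    postag_to_OneHot[item] = identity_matrix[index]

def postag_onehot(token):
    """Return the one-hot PoS vector for a spaCy token (illustrative helper)."""
    return postag_to_OneHot.get(token.pos_, postag_to_OneHot["None"])

doc = nlp("RoBERTa tokenizers split text into subwords.")
vectors = [postag_onehot(tok) for tok in doc]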