def main():
    """Score one hard-coded sentence pair with ``bert_seq`` and print the result."""
    checkpoint = 'bert-base-cased'
    first_sentence = 'I like cookies !'
    second_sentence = 'Do you like them ?'

    # The model is loaded before the tokenizer, both from the same checkpoint.
    model = BertForNextSentencePrediction.from_pretrained(checkpoint)
    tokenizer = BertTokenizer.from_pretrained(checkpoint)

    print(bert_seq(model, tokenizer, first_sentence, second_sentence))
def add_sc(data):
    """Annotate every example with a next-sentence "coherence" score.

    ``ex["summary"]`` is split into sentences with ``sent_tokenize`` and each
    adjacent sentence pair is scored with ``BertForNextSentencePrediction``.
    ``ex["coherence"]`` is set to the fraction of pairs whose softmax
    probability at index 0 exceeds 0.5, or to ``1`` when the summary has at
    most one sentence.

    Args:
        data: iterable of dict-like examples, each holding a ``"summary"``
            string. The examples are modified in place; nothing is returned.
    """
    print("Computing Semantic Coherence")
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    model = BertForNextSentencePrediction.from_pretrained("bert-base-uncased")
    softmax = torch.nn.Softmax(dim=1)
    model.eval()

    for ex in tqdm(data):
        sentences = sent_tokenize(ex["summary"])
        if len(sentences) <= 1:
            ex["coherence"] = 1
            continue

        coherent_pairs = 0
        for prev, curr in zip(sentences, sentences[1:]):
            s = "[CLS] " + prev + " [SEP] " + curr + " [SEP]"
            tokenized_text = tokenizer.tokenize(s)
            # Segment A runs up to AND including the first [SEP]; only the
            # second sentence and the closing [SEP] belong to segment B.
            boundary = tokenized_text.index("[SEP]") + 1
            segment_ids = ([0] * boundary
                           + [1] * (len(tokenized_text) - boundary))
            indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
            tokens_tensor = torch.tensor([indexed_tokens])
            segments_tensors = torch.tensor([segment_ids])
            with torch.no_grad():
                prediction = model(tokens_tensor,
                                   token_type_ids=segments_tensors)[0]
            prediction_sm = softmax(prediction)[0].tolist()
            if prediction_sm[0] > 0.5:
                coherent_pairs += 1

        # There are len(sentences) - 1 adjacent pairs.
        ex["coherence"] = coherent_pairs / (len(sentences) - 1)
def __init__(self, bert_device):
    """Load the uncased BERT NSP model onto a CUDA device, plus its tokenizer.

    :param bert_device: device argument forwarded to ``.cuda()`` for the model
    """
    checkpoint = 'bert-base-uncased'
    self.bert_device = bert_device

    nsp_model = BertForNextSentencePrediction.from_pretrained(checkpoint)
    self.model = nsp_model.cuda(self.bert_device)
    self.tokenizer = BertTokenizer.from_pretrained(checkpoint)

    self.max_seq_len = 128  #TODO: Dont hard code this
def __init__(self, categories: list):
    """Keep the category list and load the uncased BERT NSP model in eval mode.

    :param categories: stored unchanged on ``self.categories``
    """
    checkpoint = 'bert-base-uncased'
    self.categories = categories

    # Tokenizer first, then the pretrained model switched to evaluation mode.
    self.tokenizer = BertTokenizer.from_pretrained(checkpoint)
    self.model = BertForNextSentencePrediction.from_pretrained(checkpoint)
    self.model.eval()
def main():
    """Rank candidate sentences for one TED-Ed quiz question with BERT NSP.

    Reads the crawled data, keeps the entries that have at least one
    'multiple-choices' question, builds hint candidates for the second such
    entry and scores every candidate against the question text. Only the
    first question is processed (the loop ends with ``break``).
    """
    path = '/home/shuo/Documents/AI_learning/LearningQ/data/teded/teded_crawled_data/'
    path_new = '/home/shuo/Documents/AI_learning/LearningQ/data/teded/teded_crawled_data/'
    analysis = text_analysis()
    # analysis.read_realtion(path)
    analysis.read_videoinfo(path_new)
    # questions=analysis.gather_question()
    question = analysis.video_question
    # Layout notes kept from the original author:
    #   self.video_question[title]: video_link', 'video_title_length',
    #       'video_description', 'quizzes', 'video_youtube_link
    #   quizzes: quiz_description', 'question_type', 'quiz_options', 'hint',
    #       'answer'
    #   question types: multiple-choices, open-ended
    scripts = analysis.gather_transcripts(path)
    temp_dic = analysis.build_question_transcripts(path_new)

    # Keep every entry that owns at least one multiple-choice question.
    temp = []
    for item in temp_dic:
        print(item)
        has_multiple_choice = any(
            quiz['question_type'] == 'multiple-choices'
            for quiz in temp_dic[item]['questions'])
        if has_multiple_choice:
            temp.append(temp_dic[item])

    new_element = get_question_hint_sentence_x(temp[1])

    tokenizer = BertTokenizer.from_pretrained('bert-large-cased')
    model = BertForNextSentencePrediction.from_pretrained('bert-large-cased')
    #tokenizer = MobileBertTokenizer.from_pretrained('google/mobilebert-uncased')
    #model = MobileBertForNextSentencePrediction.from_pretrained('google/mobilebert-uncased')
    print(temp[1])

    for i in range(len(new_element['question'])):
        entry = new_element['question'][i]
        first_sentence = entry['quiz_description'].strip(' ')
        next_sentences = entry['responding_candidate']

        x = []
        for sentence in next_sentences:
            encoding = tokenizer(first_sentence, sentence,
                                 return_tensors='pt')
            outputs = model(**encoding, labels=torch.LongTensor([1]))
            # Softmax probability at index 0 of the NSP logits.
            x.append(softmax(outputs.logits, dim=1)[0][0].item())
        a = argmax(x)

        print(first_sentence)
        print(next_sentences)
        print(new_element['question'][i]['video_answer_hinted'])
        break
def __init__(self, cache_dir=DEFAULT_CACHE_DIR, verbose=False):
    """Download the 'bert.botxo.pytorch' model and load tokenizer and NSP model.

    :param cache_dir: directory forwarded to ``download_model``
    :param verbose: verbosity flag forwarded to ``download_model``
    """
    from transformers import BertForNextSentencePrediction, BertTokenizer

    # Fetch the model files; the returned path is used for both loads below.
    self.path_model = download_model(
        'bert.botxo.pytorch',
        cache_dir,
        process_func=_unzip_process_func,
        verbose=verbose,
    )

    self.tokenizer = BertTokenizer.from_pretrained(self.path_model)

    # output_hidden_states=True: the model returns all hidden states.
    self.model = BertForNextSentencePrediction.from_pretrained(
        self.path_model, output_hidden_states=True)
def test_get_probability_of_next_sentence_portuguesee(self):
    """Check NSP probabilities of the Portuguese model on two reply sentences."""
    checkpoint = 'models/neuralmind/bert-base-portuguese-cased'
    tokenizer = BertTokenizer.from_pretrained(checkpoint)
    model = BertForNextSentencePrediction.from_pretrained(checkpoint)

    question = "Quantos anos você tem?"
    unrelated_reply = "A Torre Eiffel fica em Paris"
    related_reply = "Eu tenho 22 anos"

    # Both probabilities are computed before anything is asserted.
    probabilities = [
        get_probability_of_next_sentence(tokenizer, model, question, reply)
        for reply in (unrelated_reply, related_reply)
    ]

    assert_almost_equal(probabilities[0], 0.5229671)
    assert_almost_equal(probabilities[1], 0.9979677)
def test_get_probability_of_next_sentence(self):
    """Check NSP probabilities of 'bert-base-cased' on two reply sentences."""
    checkpoint = 'bert-base-cased'
    tokenizer = BertTokenizer.from_pretrained(checkpoint)
    model = BertForNextSentencePrediction.from_pretrained(checkpoint)

    question = "How old are you?"
    unrelated_reply = "The Eiffel Tower is in Paris"
    related_reply = "I am 22 years old"

    # Both probabilities are computed before anything is asserted.
    probabilities = [
        get_probability_of_next_sentence(tokenizer, model, question, reply)
        for reply in (unrelated_reply, related_reply)
    ]

    assert_almost_equal(probabilities[0], 0.0149559)
    assert_almost_equal(probabilities[1], 0.9997911)
def get_nsp_model_and_optimizer(params, device):
    """Load a BERT NSP model onto ``device`` and build its optimizer.

    Args:
        params: mapping that provides ``"model_name"``; it is also handed to
            ``get_weighted_adam_optimizer`` unchanged.
        device: target passed to ``model.to``.

    Returns:
        Tuple ``(nsp_model, optimizer)``.

    Raises:
        AssertionError: if the model name does not start with "bert".
    """
    model_name = params["model_name"]
    is_bert = model_name.startswith("bert")
    assert is_bert, f"Non-BERT models not supported for NSP. Supplied model name was {model_name}"

    nsp_model = BertForNextSentencePrediction.from_pretrained(model_name).to(device)
    return nsp_model, get_weighted_adam_optimizer(nsp_model, params)
def __init__(self, **kwargs):
    """
    Set up the uncased BERT tokenizer and NSP model.

    The model is switched to eval mode and moved to CUDA when available.

    :param batch_size: [int] batch size to use for BERT (required keyword)
    """
    super().__init__()
    checkpoint = 'bert-base-uncased'
    self.batch_size = kwargs['batch_size']

    self.tokenizer = BertTokenizer.from_pretrained(checkpoint)
    self.model = BertForNextSentencePrediction.from_pretrained(checkpoint)
    self.model.eval()

    use_gpu = torch.cuda.is_available()
    if use_gpu:
        self.model.cuda()
def __init__(self):
    """Set default attribute values and load the uncased BERT tokenizer and NSP model."""
    # Fixed settings.
    self.filename = 'name'
    self.verbose_response = True
    #self.kernel = aiml_std.Kernel()

    # State that starts out empty.
    self.output = ""
    self.tree = None
    self.root = None
    self.l = []
    self.score = []
    self.memory = {}
    self.index = -1
    self.incomplete = False

    # Pretrained components.
    checkpoint = 'bert-base-uncased'
    self.tokenizer = BertTokenizer.from_pretrained(checkpoint)
    self.model = BertForNextSentencePrediction.from_pretrained(checkpoint)
def __init__(self,
             surprise_weight: float = 2.0,
             sentence_weight: float = 1.0):
    """Load tokenizer, NSP model and masked-LM model, and store both weights.

    All three components come from ``self.PRETRAINED_MODEL_NAME``; both
    models are put into eval mode.

    :param surprise_weight: stored on ``self.surprise_weight``
    :param sentence_weight: stored on ``self.sentence_weight``
    """
    checkpoint = self.PRETRAINED_MODEL_NAME
    self.tokenizer = BertTokenizer.from_pretrained(checkpoint)

    self.sentence_model = BertForNextSentencePrediction.from_pretrained(checkpoint)
    self.sentence_model.eval()

    self.language_model = BertForMaskedLM.from_pretrained(checkpoint)
    self.language_model.eval()

    self.surprise_weight = surprise_weight
    self.sentence_weight = sentence_weight
def load(self, fname=None):
    """Create ``self.model`` and move it to ``self.device``.

    The model comes from ``self.pretrained_bert`` when that is set and is not
    a path to an existing file; otherwise it is built from the JSON config at
    ``self.bert_config_file``, with dropout overridden by the optional
    keep-probabilities.

    :param fname: when not None, stored as ``self.load_path``
    :raises ConfigError: if neither source is usable
    """
    if fname is not None:
        self.load_path = fname

    if self.pretrained_bert and not Path(self.pretrained_bert).is_file():
        self.model = BertForNextSentencePrediction.from_pretrained(
            self.pretrained_bert,
            output_attentions=False,
            output_hidden_states=False)
    elif not (self.bert_config_file
              and Path(self.bert_config_file).is_file()):
        raise ConfigError("No pre-trained BERT model is given.")
    else:
        config_path = str(expand_path(self.bert_config_file))
        self.bert_config = BertConfig.from_json_file(config_path)

        # Keep-probabilities are converted to dropout probabilities.
        if self.attention_probs_keep_prob is not None:
            self.bert_config.attention_probs_dropout_prob = 1.0 - self.attention_probs_keep_prob
        if self.hidden_keep_prob is not None:
            self.bert_config.hidden_dropout_prob = 1.0 - self.hidden_keep_prob

        self.model = BertForNextSentencePrediction(config=self.bert_config)

    self.model.to(self.device)
def prep_for_training(num_training_steps):
    """Build model, optimizer, LR scheduler and tokenizer for training.

    The model family is selected by the module-level ``args.model``.

    Args:
        num_training_steps: total number of optimizer steps; also used to
            derive the warmup step count from ``args.warmup_ratio``.

    Returns:
        Tuple ``(model, optimizer, scheduler, tokenizer)``.

    Raises:
        ValueError: if ``args.model`` is neither "Albert" nor "BertNSP".
    """
    if args.model == "Albert":
        model = AlbertForDebateSequenceClassification.from_pretrained(
            "albert-base-v2", newly_added_config=args)
        tokenizer = AlbertTokenizer.from_pretrained("albert-base-v2")
    elif args.model == "BertNSP":
        model = BertForNextSentencePrediction.from_pretrained(
            "bert-base-uncased")
        tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    else:
        raise ValueError("request model is not available")

    model.to(DEVICE)

    # Split parameters in one pass: names matching `no_decay` get no weight
    # decay, all others get 0.01.
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    with_decay, without_decay = [], []
    for name, param in model.named_parameters():
        if any(nd in name for nd in no_decay):
            without_decay.append(param)
        else:
            with_decay.append(param)

    optimizer_grouped_parameters = [
        {"params": with_decay, "weight_decay": 0.01},
        {"params": without_decay, "weight_decay": 0.0},
    ]

    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
    warmup_steps = int(num_training_steps * args.warmup_ratio)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=warmup_steps,
        num_training_steps=num_training_steps,
    )
    return model, optimizer, scheduler, tokenizer
def task_2():
    """Task 2: next-sentence prediction on two hard-coded sentence pairs.

    Both pairs are encoded as one padded batch and the complete encoding
    (input ids, segment ids and attention mask) is passed to the model.
    Without the segment ids the model cannot tell where the first sentence
    ends, which makes the next-sentence prediction unreliable.

    Prints the predicted class index of every pair.
    """
    sample_1 = ("今天天气怎么样", "今天天气很好")
    sample_2 = ("小明今年几岁了", "我不喜欢学习")

    tokenizer = BertTokenizer.from_pretrained(bert_path)
    # Sentence pairs can be passed as List[Tuple[str, str]]. Padding keeps
    # the batch rectangular even when the pairs differ in length.
    sen_code = tokenizer.batch_encode_plus(
        [sample_1, sample_2], padding=True, return_tensors="pt")

    model = BertForNextSentencePrediction.from_pretrained(bert_path)
    model.eval()

    with torch.no_grad():  # inference only, no gradients needed
        outputs = model(
            input_ids=sen_code["input_ids"],
            token_type_ids=sen_code["token_type_ids"],
            attention_mask=sen_code["attention_mask"],
        )

    seq_relationship_scores = outputs.logits  # torch.Size([batch, 2])
    pred_lst = seq_relationship_scores.argmax(axis=1)  # torch.Size([batch])
    for pred in pred_lst:
        # 0: the second sentence follows the first; 1: it does not.
        print(f"预测结果:{pred}")
def test_get_probability_of_next_sentence(self):
    """Check ``analyse_single_wsc_bert`` with 'bert-base-cased' in both argument orders."""
    checkpoint = 'bert-base-cased'
    tokenizer = BertTokenizer.from_pretrained(checkpoint)
    model = BertForNextSentencePrediction.from_pretrained(checkpoint)

    wrong_sentence = "How old are you? The Eiffel Tower is in Paris"
    correct_sentence = "How old are you? I am 22 years old"

    # Correct sentence passed first: `full` is expected to be truthy.
    full, partial = analyse_single_wsc_bert(
        model, tokenizer, correct_sentence, wrong_sentence)
    assert full
    assert partial == 0

    # Arguments swapped: `full` is expected to be falsy.
    full, partial = analyse_single_wsc_bert(
        model, tokenizer, wrong_sentence, correct_sentence)
    assert not full
    assert partial == 0
def test_get_probability_of_next_sentence_multilingual(self):
    """Check NSP probabilities of the multilingual model in English and Portuguese."""
    checkpoint = 'bert-base-multilingual-cased'
    tokenizer = BertTokenizer.from_pretrained(checkpoint)
    model = BertForNextSentencePrediction.from_pretrained(checkpoint)

    # (question, first reply, second reply, expected prob 1, expected prob 2)
    cases = (
        ("How old are you?",
         "The Eiffel Tower is in Paris",
         "I am 22 years old",
         0.5525756, 0.9784408),
        ("Quantos anos você tem?",
         "A Torre Eiffel fica em Paris",
         "Eu tenho 22 anos",
         0.8567284, 0.9410717),
    )

    for text1, text2, text3, expected1, expected2 in cases:
        prob1 = get_probability_of_next_sentence(tokenizer, model, text1, text2)
        prob2 = get_probability_of_next_sentence(tokenizer, model, text1, text3)
        assert_almost_equal(prob1, expected1)
        assert_almost_equal(prob2, expected2)
def __init__(self, extractor, config):
    """Build the score-combination layer and load the Birch BERT NSP model.

    Args:
        extractor: not used by this constructor.
        config: mapping read for the keys ``"hidden"``, ``"topk"``,
            ``"pretrained"`` and ``"finetune"``.

    When ``config["finetune"]`` is falsy, every BERT parameter is frozen and
    ``self.bert_context`` is ``torch.no_grad``; otherwise it is
    ``contextlib.nullcontext``.
    """
    super().__init__()
    self.config = config

    if config["hidden"] == 0:
        # Single linear layer initialised to a uniform average of the
        # top-k scores.
        self.combine = nn.Linear(config["topk"], 1, bias=False)
        with torch.no_grad():
            self.combine.weight = nn.Parameter(
                torch.ones_like(self.combine.weight) / config["topk"])
    else:
        assert config["hidden"] > 0
        self.combine = nn.Sequential(
            nn.Linear(config["topk"], config["hidden"]),
            nn.ReLU(),
            nn.Linear(config["hidden"], 1))

    # Notes on earlier weight sources, kept from the original author:
    # - original model file (requires apex):
    #   /GW/NeuralIR/nobackup/birch-emnlp_bert4ir_v2/models/saved.msmarco_mb_1
    # - saved.msmarco_mb_1 weights exported from the official apex model:
    #   bert-large-uncased +
    #   /GW/NeuralIR/nobackup/birch-emnlp_bert4ir_v2/models/converted
    #   (converted_weights.msmarco_mb)
    # - kevin's base model: bert-base-uncased +
    #   /GW/NeuralIR/nobackup/birch/models/saved.tmp_1, also
    #   /GW/NeuralIR/nobackup/birch-emnlp_bert4ir_v2/models/export/birch-bert-base-kevin
    self.bert = BertForNextSentencePrediction.from_pretrained(
        f"Capreolus/birch-bert-large-{config['pretrained']}")

    if not config["finetune"]:
        # Assigning `requires_grad` on the module itself only creates a plain
        # attribute; the flag has to be cleared on each parameter to freeze
        # the weights.
        for param in self.bert.parameters():
            param.requires_grad = False
        self.bert_context = torch.no_grad
    else:
        self.bert_context = contextlib.nullcontext
def test_get_probability_of_next_sentence_multilingual(self):
    """Check ``analyse_single_wsc_bert`` with the multilingual model, in English and Portuguese."""
    checkpoint = 'bert-base-multilingual-cased'
    tokenizer = BertTokenizer.from_pretrained(checkpoint)
    model = BertForNextSentencePrediction.from_pretrained(checkpoint)

    # (wrong sentence, correct sentence) per language, English first.
    sentence_pairs = (
        ("How old are you? The Eiffel Tower is in Paris",
         "How old are you? I am 22 years old"),
        ("Quantos anos você tem? A Torre Eiffel fica em Paris",
         "Quantos anos você tem? Eu tenho 22 anos"),
    )

    for wrong_sentence, correct_sentence in sentence_pairs:
        # Correct sentence passed first: `full` is expected to be truthy.
        full, partial = analyse_single_wsc_bert(
            model, tokenizer, correct_sentence, wrong_sentence)
        assert full
        assert partial == 0

        # Arguments swapped: `full` is expected to be falsy.
        full, partial = analyse_single_wsc_bert(
            model, tokenizer, wrong_sentence, correct_sentence)
        assert not full
        assert partial == 0
def test_get_probability_of_next_sentence_portuguesee(self):
    """Check ``analyse_single_wsc_bert`` with the Portuguese model in both argument orders."""
    checkpoint = 'models/neuralmind/bert-base-portuguese-cased'
    tokenizer = BertTokenizer.from_pretrained(checkpoint)
    model = BertForNextSentencePrediction.from_pretrained(checkpoint)

    wrong_sentence = "Quantos anos você tem? A Torre Eiffel fica em Paris"
    correct_sentence = "Quantos anos você tem? Eu tenho 22 anos"

    # Correct sentence passed first: `full` is expected to be truthy.
    full, partial = analyse_single_wsc_bert(
        model, tokenizer, correct_sentence, wrong_sentence)
    assert full
    assert partial == 0

    # Arguments swapped: `full` is expected to be falsy.
    full, partial = analyse_single_wsc_bert(
        model, tokenizer, wrong_sentence, correct_sentence)
    assert not full
    assert partial == 0
def main():
    """Print NSP statistics for the first subject category of the TED-Ed data.

    Loads the crawled data and the corrected questions, groups the aligned
    question/transcript entries by subject and calls ``print_stats`` on the
    first category only (the loop ends with ``break``).
    """
    path_new = '/home/shuo/Documents/AI_learning/LearningQ/data/teded/teded_crawled_data/'
    correct_path = '/home/shuo/Documents/AI_learning/LearningQ/code/teded/video_hint/question_corrected.txt'

    analysis = text_analysis()
    analysis.read_relation(path_new)
    analysis.read_videoinfo(path_new)
    # questions=analysis.gather_question()
    question = analysis.video_question
    analysis.read_video_questions_from_JSON(correct_path)
    # Layout notes kept from the original author:
    #   self.video_question[title]: video_link', 'video_title_length',
    #       'video_description', 'quizzes', 'video_youtube_link
    #   quizzes: quiz_description', 'question_type', 'quiz_options', 'hint',
    #       'answer'
    #   question types: multiple-choices, open-ended

    tokenizer = BertTokenizer.from_pretrained('bert-large-cased')
    model = BertForNextSentencePrediction.from_pretrained('bert-large-cased')

    scripts = analysis.gather_transcripts(path_new)
    temp_dic = analysis.align_subject(
        analysis.build_question_transcripts(path_new))
    cateloged = gather_subjects(temp_dic)

    for c in cateloged:
        # Categories with more than 20 entries are sampled down to 10.
        if len(cateloged[c]) > 20:
            temp = random.sample(cateloged[c], 10)
        else:
            temp = cateloged[c]
        print(c + '\n')
        # for i in range(2, 5):
        # NOTE(review): `i` is not bound anywhere in this function now that
        # the loop header above is commented out -- confirm it exists at
        # module level, otherwise this call raises NameError.
        print_stats(temp, model, tokenizer, i, add_answer=False)
        break
def __init__(self,
             number_of_sentence,
             adjust_weight,
             trained_baseline_model=None,
             transform=True):
    """Assemble the NSP model, the sentence encoder and the linear heads.

    :param number_of_sentence: stored on ``self.number_of_sentence``
    :param adjust_weight: stored on ``self.adjust_weight``
    :param trained_baseline_model: when truthy, its ``bert`` and ``linear``
        attributes are reused instead of creating fresh ones
    :param transform: when truthy, a bias-free 768x768 linear layer is added
        as ``self.transform``
    """
    super(MultiBERTsModel, self).__init__()
    checkpoint = 'bert-base-chinese'
    self.number_of_sentence = number_of_sentence
    self.adjust_weight = adjust_weight

    self.bertNSP = BertForNextSentencePrediction.from_pretrained(checkpoint)
    self.softmax = nn.Softmax(dim=1)
    #self.linear = nn.Linear(768 * self.number_of_sentence, 1)
    #self.bert = BertModel.from_pretrained('bert-base-chinese')

    if not trained_baseline_model:
        # No baseline given: start from the pretrained encoder and a new head.
        self.bert = BertModel.from_pretrained(checkpoint)
        self.sp_linear = nn.Linear(768, 1)
    else:
        self.bert = trained_baseline_model.bert
        self.sp_linear = trained_baseline_model.linear

    if transform:
        self.transform = nn.Linear(768, 768, bias=False)
def predict_next(sen1, sen2, tokenizer, model_path):
    """Print BERT's next-sentence-prediction logits for ``(sen1, sen2)``.

    The pair is fed to the model as ``[CLS] sen1 [SEP] sen2 [SEP]``. The
    special tokens are added only where the caller has not already put them
    into the text, so inputs that embed the markers keep working.

    Args:
        sen1: first sentence.
        sen2: candidate next sentence.
        tokenizer: BERT tokenizer providing ``tokenize`` and
            ``convert_tokens_to_ids``.
        model_path: name or path passed to
            ``BertForNextSentencePrediction.from_pretrained``.

    Prints the token list, the segment-id tensor, the raw logits and whether
    the logit at index 1 exceeds the logit at index 0. Returns nothing.
    """
    model = BertForNextSentencePrediction.from_pretrained(model_path)
    model.eval()  # make sure dropout is disabled for inference

    tokenized_sen1 = tokenizer.tokenize(sen1)
    tokenized_sen2 = tokenizer.tokenize(sen2)

    # BERT expects "[CLS] A [SEP] B [SEP]"; add whichever marker is missing.
    if not tokenized_sen1 or tokenized_sen1[0] != "[CLS]":
        tokenized_sen1 = ["[CLS]"] + tokenized_sen1
    if tokenized_sen1[-1] != "[SEP]":
        tokenized_sen1 = tokenized_sen1 + ["[SEP]"]
    if not tokenized_sen2 or tokenized_sen2[-1] != "[SEP]":
        tokenized_sen2 = tokenized_sen2 + ["[SEP]"]

    tokenized_text = tokenized_sen1 + tokenized_sen2
    print(tokenized_text)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

    # Segment 0 covers [CLS] .. first [SEP]; segment 1 covers the rest.
    segments_ids = [0] * len(tokenized_sen1) + [1] * len(tokenized_sen2)
    segments_tensors = torch.tensor([segments_ids])
    print(segments_tensors)

    with torch.no_grad():
        outputs = model(torch.tensor([indexed_tokens]),
                        token_type_ids=segments_tensors)
    predictions = outputs[0].cpu().numpy()
    result = predictions[0][1] > predictions[0][0]
    print(predictions)
    print(result)
# Process one partition of 1000 test books; the partition index is the first
# command-line argument.
partition = int(sys.argv[1])
from_idx = partition * 1000
to_idx = from_idx + 1000
test_book_ids = test_book_ids[from_idx:to_idx]
print(len(test_book_ids), 'books')

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
cls, sep = tokenizer.convert_tokens_to_ids(["[CLS]", "[SEP]"])

# Partition k runs on GPU k + 1 when CUDA is available, else on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:{}".format(partition + 1))
else:
    device = torch.device("cpu")
print(device)
print(torch.cuda.device_count(), "GPUs")

model = BertForNextSentencePrediction.from_pretrained(model_dir).to(device)

for book_id in test_book_ids:
    print(book_id)
    try:
        process_book(bert_tok_dir, pred_scores_dir, model, device, cls, sep,
                     book_id)
    except Exception as e:
        # Best effort: report the failing book and continue with the next.
        print(book_id, e)
print('Done!')
def _get_next_sentence_prediction(self):
    """
    Load ``BertForNextSentencePrediction`` from ``self.model`` into
    ``self.nsp`` and switch it to eval mode.
    """
    checkpoint = self.model
    self.nsp = BertForNextSentencePrediction.from_pretrained(checkpoint)
    self.nsp.eval()
from transformers import (
    AutoTokenizer,
    AutoModelForQuestionAnswering,
    BertForNextSentencePrediction,
    BertTokenizer,
)
# import gpt_2_simple as gpt2

# Question-answering tokenizer and model, both from the same checkpoint.
tokenizer_qa = AutoTokenizer.from_pretrained("deepset/bert-large-uncased-whole-word-masking-squad2")
model_qa = AutoModelForQuestionAnswering.from_pretrained("deepset/bert-large-uncased-whole-word-masking-squad2")
print("BERT For Q/A downloaded")

# Next-sentence-prediction model and tokenizer.
model_nsp = BertForNextSentencePrediction.from_pretrained('bert-base-cased')
tokenizer_nsp = BertTokenizer.from_pretrained('bert-base-cased')
print("BERT NSP downloaded")

# sess = gpt2.start_tf_sess()
# gpt2.load_gpt2(sess, run_name='run1_topical_token')
# print("GPT2 loaded")
# model-related
parser.add_argument("--model",
                    type=str,
                    default="albert-base-v2",
                    help="Model Name")
parser.add_argument("--batch_size",
                    type=int,
                    default=2,
                    help="Training batch size")

# data-related
parser.add_argument("--data_path",
                    type=str,
                    default="./data/augmented_data.csv")
parser.add_argument("--train_size",
                    type=float,
                    default=0.5,
                    help="Training Size (ratio)")

args = parser.parse_args()

from transformers import BertForNextSentencePrediction, BertTokenizer

# NOTE(review): the value parsed for --model is replaced here by a loaded
# model object, so the command-line option has no effect -- confirm intended.
args.model = BertForNextSentencePrediction.from_pretrained("bert-base-uncased")
args.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

seed()
main(args)
def main():
    """Fine-tune ``BertForNextSentencePrediction`` from the command line.

    Steps, in order: parse the arguments, select the device (single process
    or distributed via ``--local_rank``), seed the RNGs, build the tokenizer
    and ``BERTDataset``, load the model (optionally merging a state dict from
    ``--trained_model_dir``), build the optimizer, run the training loop and
    write the weights and config into ``--output_dir``.

    Raises:
        ValueError: if ``--gradient_accumulation_steps`` is below 1 or
            ``--do_train`` is not set.
        ImportError: if apex is needed (distributed or fp16) but missing.
    """
    parser = argparse.ArgumentParser()
    ## Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the .tsv files (or other data files) for the task."
    )
    parser.add_argument(
        "--bert_model",
        default=None,
        type=str,
        required=True,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese."
    )
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help="The output directory where the model checkpoints will be written."
    )
    ## Other parameters
    parser.add_argument("--trained_model_dir",
                        default="",
                        type=str,
                        help="Where is the fine-tuned BERT model?")
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--learning_rate",
                        default=3e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help=
        "Whether to lower case the input text. True for uncased models, False for cased models."
    )
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumualte before performing a backward/update pass."
    )
    parser.add_argument(
        '--fp16',
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    # NOTE(review): --loss_scale is parsed but never read in this function.
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")
    args = parser.parse_args()

    # Device selection: single process (all visible GPUs or CPU) versus one
    # GPU per process in distributed mode.
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    # From here on train_batch_size is the per-step (micro) batch size.
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    # Seed every RNG in use.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train:
        raise ValueError(
            "Training is currently the only implemented execution option. Please set `do_train`."
        )

    # A non-empty output directory only produces a warning; it is reused.
    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        #raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
        print("WARNING: Output directory already exists and is not empty.")
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)

    #train_examples = None
    num_train_optimization_steps = None
    if args.do_train:
        train_dataset = BERTDataset(args.data_dir,
                                    tokenizer,
                                    seq_len=args.max_seq_length)
        # num_train_epochs is a float, so this product is a float as well.
        num_train_optimization_steps = int(
            len(train_dataset) / args.train_batch_size /
            args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size(
            )

    # Prepare model
    if args.trained_model_dir:
        # Start from weights already in output_dir when present, then overlay
        # the weights found in trained_model_dir.
        if os.path.exists(os.path.join(args.output_dir, WEIGHTS_NAME)):
            previous_state_dict = torch.load(
                os.path.join(args.output_dir, WEIGHTS_NAME))
        else:
            from collections import OrderedDict
            previous_state_dict = OrderedDict()
        distant_state_dict = torch.load(
            os.path.join(args.trained_model_dir, WEIGHTS_NAME))
        previous_state_dict.update(
            distant_state_dict
        )  # note that the final layers of previous model and distant model must have different attribute names!
        model = BertForNextSentencePrediction.from_pretrained(
            args.trained_model_dir, state_dict=previous_state_dict)
    else:
        model = BertForNextSentencePrediction.from_pretrained(args.bert_model)
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    # Parameters whose name contains an entry of `no_decay` get weight decay
    # 0.0; all others get 0.01.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    if args.fp16:
        try:
            from apex.optimizers import FusedAdam
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False)
        model, optimizer = amp.initialize(model, optimizer, opt_level="O2")
    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_optimization_steps)

    global_step = 0
    if args.do_train:
        logger.info("***** Running training *****")
        logger.info(" Num examples = %d", len(train_dataset))
        logger.info(" Batch size = %d", args.train_batch_size)
        logger.info(" Num steps = %d", num_train_optimization_steps)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_dataset)
        else:
            #TODO: check if this works with current data generator from disk that relies on next(file)
            # (it doesn't return item back by index)
            train_sampler = DistributedSampler(train_dataset)
        train_dataloader = DataLoader(train_dataset,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                # Each batch is unpacked into exactly these four tensors.
                input_ids, attention_masks, token_type_ids, next_sentence_labels = batch
                output = model(input_ids,
                               attention_mask=attention_masks,
                               token_type_ids=token_type_ids,
                               labels=next_sentence_labels)
                loss = output['loss']
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                if args.fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                # Step the optimizer once per accumulation window.
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        # modify learning rate with special warm up BERT uses
                        # if args.fp16 is False, BertAdam is used that handles this automatically
                        lr_this_step = args.learning_rate * warmup_linear(
                            global_step / num_train_optimization_steps,
                            args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

        # Save a trained model
        logger.info("** ** * Saving fine - tuned model ** ** * ")
        model_to_save = model.module if hasattr(
            model, 'module') else model  # Only save the model it-self
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        torch.save(model_to_save.state_dict(), output_model_file)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
        with open(output_config_file, 'w') as f:
            f.write(model_to_save.config.to_json_string())
def _build_arg_parser():
    """Build the command-line parser for the NSP fine-tuning script."""
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--train_data_file",
                        default=None,
                        type=str,
                        required=True,
                        help="The input training data file (a text file).")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )

    ## Other parameters
    parser.add_argument(
        "--eval_data_file",
        default=None,
        type=str,
        help=
        "An optional input evaluation data file to evaluate the perplexity on (a text file)."
    )
    parser.add_argument(
        "--model_name_or_path",
        default="bert-base-cased",
        type=str,
        help="The model checkpoint for weights initialization.")
    parser.add_argument(
        "--config_name",
        default="",
        type=str,
        help=
        "Optional pretrained config name or path if not the same as model_name_or_path"
    )
    parser.add_argument(
        "--tokenizer_name",
        default="",
        type=str,
        help=
        "Optional pretrained tokenizer name or path if not the same as model_name_or_path"
    )
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help=
        "Optional directory to store the pre-trained models downloaded from s3 (instread of the default one)"
    )
    parser.add_argument(
        "--block_size",
        default=-1,
        type=int,
        help="Optional input sequence length after tokenization."
        "The training dataset will be truncated in block of this size for training."
        "Default to the model max input length for single sentence inputs (take into account special tokens)."
    )

    # ====== Training ======
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    # Validation
    parser.add_argument(
        "--evaluate_during_training",
        action='store_true',
        help="Run evaluation during training at each logging step.")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")

    # ====== Training options ======
    parser.add_argument("--method",
                        default="method1",
                        type=str,
                        help="NSP method.")
    parser.add_argument("--nsp_swap_ratio",
                        default=0.5,
                        type=float,
                        help="random Swap ratio of next sntences.")
    parser.add_argument("--per_gpu_train_batch_size",
                        default=4,
                        type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size",
                        default=4,
                        type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    # Weight decay
    parser.add_argument("--weight_decay",
                        default=0.0,
                        type=float,
                        help="Weight deay if we apply some.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm",
                        default=1.0,
                        type=float,
                        help="Max gradient norm.")
    parser.add_argument("--num_train_epochs",
                        default=1.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help=
        "If > 0: set total number of training steps to perform. Override num_train_epochs."
    )
    parser.add_argument("--warmup_steps",
                        default=0,
                        type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument('--logging_steps',
                        type=int,
                        default=50,
                        help="Log every X updates steps.")
    parser.add_argument('--save_steps',
                        type=int,
                        default=50,
                        help="Save checkpoint every X updates steps.")
    parser.add_argument(
        '--save_total_limit',
        type=int,
        default=None,
        help=
        'Limit the total amount of checkpoints, delete the older checkpoints in the output_dir, does not delete by default'
    )
    parser.add_argument(
        "--eval_all_checkpoints",
        action='store_true',
        help=
        "Evaluate all checkpoints starting with the same prefix as model_name_or_path ending and ending with step number"
    )
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Avoid using CUDA when available")
    parser.add_argument('--overwrite_output_dir',
                        action='store_true',
                        help="Overwrite the content of the output directory")
    parser.add_argument(
        '--overwrite_cache',
        action='store_true',
        help="Overwrite the cached training and evaluation sets")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--fp16',
        action='store_true',
        help=
        "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit"
    )
    parser.add_argument(
        '--fp16_opt_level',
        type=str,
        default='O1',
        help=
        "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html")
    return parser


def _freeze_bert_except_top(bert):
    """Freeze every BERT parameter except the last encoder layer and the pooler.

    The previous code deep-copied those two modules, froze the whole encoder
    and then swapped the copies back in; re-enabling gradients in place leaves
    the same set of trainable parameters without duplicating the weights.
    """
    for param in bert.parameters():
        param.requires_grad = False
    for module in (bert.encoder.layer[-1], bert.pooler):
        if module is None:
            continue
        for param in module.parameters():
            param.requires_grad = True


def _load_nsp_model(args, config, weights_path):
    """Load the NSP model selected by ``args.method`` from ``weights_path``.

    "method1" and "method3" use ``BertForNextSentencePrediction``; any other
    value uses ``BertSepInputNSPModel`` (marked as unfinished by the original
    author). The encoder is partially frozen and the model is moved to
    ``args.device`` before it is returned.
    """
    single_input = args.method == "method1" or args.method == "method3"
    model_class = (BertForNextSentencePrediction
                   if single_input else BertSepInputNSPModel)
    model = model_class.from_pretrained(
        weights_path,
        from_tf=bool('.ckpt' in weights_path),
        config=config,
        cache_dir=args.cache_dir if args.cache_dir else None)
    bert = model.bert if single_input else model.bert_for_double.bert_model
    _freeze_bert_except_top(bert)
    model.to(args.device)
    return model


def _checkpoint_weights_path(args, checkpoint):
    """Return ``checkpoint`` if it holds saved weights, else the base model path.

    The fallback keeps eval-only runs working when ``--output_dir`` contains no
    model and the weights to evaluate are given via ``--model_name_or_path``.
    """
    for weights_file in (WEIGHTS_NAME, "model.safetensors"):
        if os.path.isfile(os.path.join(checkpoint, weights_file)):
            return checkpoint
    logger.warning("No saved weights found in %s; evaluating %s instead",
                   checkpoint, args.model_name_or_path)
    return args.model_name_or_path


def main():
    """Fine-tune and/or evaluate a BERT next-sentence-prediction model.

    Parses the command line, selects the device, loads config, tokenizers and
    model, optionally trains and saves to ``--output_dir``, and optionally
    evaluates one or all checkpoints.

    Returns:
        dict: evaluation metrics keyed as ``"<metric>_<global_step>"``; empty
        when ``--do_eval`` is not given.

    Raises:
        ValueError: if ``--do_eval`` is given without ``--eval_data_file``, or
            if training would write into a non-empty ``--output_dir`` without
            ``--overwrite_output_dir``.
    """
    args = _build_arg_parser().parse_args()

    if args.eval_data_file is None and args.do_eval:
        raise ValueError(
            "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
            "or remove the --do_eval argument.")

    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir
    ) and args.do_train and not args.overwrite_output_dir:
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome."
            .format(args.output_dir))

    # Setup CUDA / GPU. Honour --no_cuda and fall back to CPU when no GPU is
    # present instead of unconditionally requesting cuda:0.
    use_cuda = torch.cuda.is_available() and not args.no_cuda
    device = torch.device('cuda:0' if use_cuda else 'cpu')
    args.n_gpu = torch.cuda.device_count() if use_cuda else 0
    args.device = device

    # Setup logging
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO)
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        -1, device, args.n_gpu, False, args.fp16)

    # Set seed
    set_seed(args)

    # Load pretrained config and tokenizers
    cache_dir = args.cache_dir if args.cache_dir else None
    config = BertConfig.from_pretrained(
        args.config_name if args.config_name else args.model_name_or_path,
        cache_dir=cache_dir)
    bert_tokenizer = BertTokenizer.from_pretrained(
        args.tokenizer_name
        if args.tokenizer_name else args.model_name_or_path,
        do_lower_case=args.do_lower_case,
        cache_dir=cache_dir)
    jp_tokenizer = JumanTokenizer()

    if args.block_size <= 0:
        # Our input block size will be the max possible for the model
        args.block_size = bert_tokenizer.max_len_single_sentence
    else:
        args.block_size = min(args.block_size,
                              bert_tokenizer.max_len_single_sentence)

    model = _load_nsp_model(args, config, args.model_name_or_path)

    logger.info("Training/evaluation parameters %s", args)

    # Training
    if args.do_train:
        train_dataset = load_and_cache_examples(args,
                                                bert_tokenizer,
                                                jp_tokenizer,
                                                evaluate=False)
        global_step, tr_loss = train(args, train_dataset, model,
                                     bert_tokenizer, jp_tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step,
                    tr_loss)

        # Saving best-practices: if you use save_pretrained for the model and
        # tokenizer, you can reload them using from_pretrained()
        os.makedirs(args.output_dir, exist_ok=True)

        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Take care of distributed/parallel training
        model_to_save = model.module if hasattr(model, 'module') else model
        model_to_save.save_pretrained(args.output_dir)
        bert_tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))

        # Reload the fine-tuned model and vocabulary from output_dir. Loading
        # from model_name_or_path here would discard the training just done.
        bert_tokenizer = BertTokenizer.from_pretrained(
            args.output_dir, do_lower_case=args.do_lower_case)
        model = _load_nsp_model(args, config, args.output_dir)

    # Evaluation
    results = {}
    if args.do_eval:
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
            checkpoints = [
                os.path.dirname(c) for c in sorted(
                    glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME,
                              recursive=True))
            ]
            logging.getLogger("transformers.modeling_utils").setLevel(
                logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            global_step = checkpoint.split(
                '-')[-1] if len(checkpoints) > 1 else ""
            prefix = checkpoint.split(
                '/')[-1] if checkpoint.find('checkpoint') != -1 else ""

            # Evaluate the weights stored in this checkpoint rather than the
            # initial pre-trained model.
            model = _load_nsp_model(
                args, config, _checkpoint_weights_path(args, checkpoint))
            result = evaluate(args,
                              model,
                              bert_tokenizer,
                              jp_tokenizer,
                              prefix=prefix)
            result = {
                k + '_{}'.format(global_step): v
                for k, v in result.items()
            }
            results.update(result)

    return results
print('Reading training data file...') df = pd.read_csv(training_data_loc, usecols=[ 'para1_tokens', 'para2_tokens', 'para1_len', 'para2_len', 'label' ]) df = df[(df['para1_len'] > 0) & (df['para2_len'] > 0)] df['para1_tokens'] = df['para1_tokens'].apply(literal_eval) df['para2_tokens'] = df['para2_tokens'].apply(literal_eval) print('Loading tokenizer and BertNSP...') tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') BertNSP = BertForNextSentencePrediction.from_pretrained( 'bert-base-uncased') cls, sep = tokenizer.convert_tokens_to_ids(["[CLS]", "[SEP]"]) # Use appropriate locations try: with open(os.path.join(output_loc, 'input_tokens.pkl'), 'rb') as f: input_tokens = pickle.load(f) with open(os.path.join(output_loc, 'input_seg_ids.pkl'), 'rb') as f: input_seg_ids = pickle.load(f) with open(os.path.join(output_loc, 'labels.pkl'), 'rb') as f: labels = pickle.load(f) except: print('Generating training input...') input_tokens = list()