def load(cls, model_fqdn, weights_path=None, **model_kwargs):
    cache_dir = os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), "distributed_-1")
    # "uncase" matches the "uncased" model names, e.g. "bert-base-uncased".
    tokenizer = BertTokenizer.from_pretrained(model_fqdn,
                                              do_lower_case="uncase" in model_fqdn)
    model = BertForMaskedLM.from_pretrained(model_fqdn, cache_dir=cache_dir)
    if weights_path is not None:
        # strict=False tolerates checkpoints that do not cover every parameter.
        model.load_state_dict(torch.load(weights_path), strict=False)
    return cls(model, tokenizer, **model_kwargs)
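# A minimal stand-alone sketch of what load() above does, assuming the usual
# pytorch_pretrained_bert imports; "finetuned_mlm.bin" is only a placeholder
# filename for an optional fine-tuned state dict.
import os

import torch
from pytorch_pretrained_bert import BertForMaskedLM, BertTokenizer
from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE

model_fqdn = "bert-base-uncased"
cache_dir = os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), "distributed_-1")

tokenizer = BertTokenizer.from_pretrained(model_fqdn,
                                          do_lower_case="uncased" in model_fqdn)
model = BertForMaskedLM.from_pretrained(model_fqdn, cache_dir=cache_dir)

weights_path = None  # e.g. "finetuned_mlm.bin" (placeholder)
if weights_path is not None:
    model.load_state_dict(torch.load(weights_path, map_location="cpu"), strict=False)
model.eval()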
def __init__(self, model_name, do_lower_case=True, base_model="bert-base-uncased",
             use_untuned=False, use_stop=False):
    self.model_name = model_name
    bert_model = base_model
    self.tokenizer = BertTokenizer.from_pretrained(bert_model,
                                                   do_lower_case=do_lower_case)
    if use_untuned:
        cache_dir = PYTORCH_PRETRAINED_BERT_CACHE
        self.model = BertForMaskedLM.from_pretrained(bert_model, cache_dir=cache_dir)
        # Swap the pre-trained 2-way segment embedding for a freshly initialised
        # 5-way one so that more than two segment ids can be used.
        self.model.bert.embeddings.token_type_embeddings = torch.nn.Embedding(5, 768)
        self.model.bert.embeddings.token_type_embeddings.weight.data.normal_(
            mean=0.0, std=0.02)
    else:
        weights_path = os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), model_name)
        self.model = torch.load(weights_path)
    self.model.cuda()
    self.MAX_LEN = 10
    self.__segment_proc_flag = True
    if use_stop:
        self.__stop_words = set(stopwords.words('english'))
    else:
        self.__stop_words = []
def __init__(self, ds_path, model_name, base_model="bert-base-uncased",
             do_lower_case=True, num_epochs=4):
    self.path = ds_path
    self.num_epochs = num_epochs
    self.save_path = os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), str(model_name))
    self.model_name = model_name
    self.batch_size = 32
    self.max_seq_len = 64
    self.masked_lm_prob = 0.15
    self.max_predictions_per_seq = 20
    self.max_token = 30000
    bert_model = base_model
    cache_dir = PYTORCH_PRETRAINED_BERT_CACHE
    self.model = BertForMaskedLM.from_pretrained(bert_model, cache_dir=cache_dir)
    # Swap the pre-trained 2-way segment embedding for a freshly initialised
    # 5-way one so that more than two segment ids can be used.
    self.model.bert.embeddings.token_type_embeddings = torch.nn.Embedding(5, 768)
    self.model.bert.embeddings.token_type_embeddings.weight.data.normal_(
        mean=0.0, std=0.02)
    self.tokenizer = BertTokenizer.from_pretrained(bert_model,
                                                   do_lower_case=do_lower_case)
def bertForMaskedLM(*args, **kwargs):
    """
    BertForMaskedLM includes the BertModel Transformer followed by the
    (possibly) pre-trained masked language modeling head.
    """
    model = BertForMaskedLM.from_pretrained(*args, **kwargs)
    return model
def bertForMaskedLM(*args, **kwargs):
    """
    BertForMaskedLM includes the BertModel Transformer followed by the
    (possibly) pre-trained masked language modeling head.

    Example:
        # Load the tokenizer
        >>> import torch
        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)

        # Prepare tokenized input
        >>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
        >>> tokenized_text = tokenizer.tokenize(text)
        >>> masked_index = 8
        >>> tokenized_text[masked_index] = '[MASK]'
        >>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
        >>> segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
        >>> tokens_tensor = torch.tensor([indexed_tokens])
        >>> segments_tensors = torch.tensor([segments_ids])

        # Load bertForMaskedLM
        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertForMaskedLM', 'bert-base-cased')
        >>> model.eval()

        # Predict all tokens
        >>> with torch.no_grad():
        ...     predictions = model(tokens_tensor, segments_tensors)
        >>> predicted_index = torch.argmax(predictions[0, masked_index]).item()
        >>> predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
        >>> predicted_token
        'henson'
    """
    model = BertForMaskedLM.from_pretrained(*args, **kwargs)
    return model
def test_BertForMaskedLM():
    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
    input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
    token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
    config = BertConfig(vocab_size_or_config_json_file=32000,
                        hidden_size=768,
                        num_hidden_layers=12,
                        num_attention_heads=12,
                        intermediate_size=3072)
    model = BertForMaskedLM(config)
    print(model(input_ids, token_type_ids, input_mask))
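# For reference, a small self-contained sketch of what the test above exercises:
# in the pytorch_pretrained_bert API, BertForMaskedLM returns prediction scores of
# shape (batch, seq_len, vocab_size) when no labels are given, and the masked-LM
# loss when masked_lm_labels is passed. The tiny config sizes here are illustrative.
import torch
from pytorch_pretrained_bert.modeling import BertConfig, BertForMaskedLM

config = BertConfig(vocab_size_or_config_json_file=100, hidden_size=32,
                    num_hidden_layers=2, num_attention_heads=2, intermediate_size=64)
model = BertForMaskedLM(config)
model.eval()

input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])

with torch.no_grad():
    scores = model(input_ids, token_type_ids, input_mask)
assert scores.shape == (2, 3, config.vocab_size)  # (batch, seq_len, vocab)

# With masked_lm_labels the model returns the cross-entropy loss instead;
# positions labelled -1 are ignored (ignore_index=-1).
labels = torch.LongTensor([[-1, 51, -1], [-1, -1, -1]])
loss = model(input_ids, token_type_ids, input_mask, masked_lm_labels=labels)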
def load_model(bison_args, device, data_handler, output_model_file=None):
    """
    Load a model.

    :param bison_args: instance of :py:class:BisonArguments
    :param device: the device to move the model to
    :param data_handler: the dataset handler, an instance of :py:class:BitextHandler or a subclass
    :param output_model_file: the location of the model to load
    :return: the loaded model
    """
    model_state_dict = None
    if output_model_file is not None:
        model_state_dict = torch.load(output_model_file)
    if bison_args.bert_model == 'bert-vanilla':
        # randomly initialises BERT weights instead of using a pre-trained model
        model = BertForMaskedLM(BertConfig.from_default_settings())
    else:
        model = BertForMaskedLM.from_pretrained(bison_args.bert_model,
                                                state_dict=model_state_dict)
    model.to(device)
    return model
def __init__(self, label_list, device):
    self._label_list = label_list
    self._tokenizer = BertTokenizer.from_pretrained(BERT_MODEL, do_lower_case=True)
    self._model = BertForMaskedLM.from_pretrained(BERT_MODEL)
    if len(self._label_list) != 2:
        # Widen the segment (token type) embedding so that one segment id per
        # label is available, then re-initialise it.
        self._model.bert.embeddings.token_type_embeddings = \
            nn.Embedding(len(label_list), 768)
        self._model.bert.embeddings.token_type_embeddings.weight.data.\
            normal_(mean=0.0, std=0.02)
    self._device = device
    self._model.to(self._device)
    self._optimizer = None
    self._dataset = {}
    self._data_loader = {}
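# Several snippets above widen bert.embeddings.token_type_embeddings so that more
# than two segment ids can be fed. A minimal self-contained sketch of the same
# trick, assuming the standard bert-base-uncased checkpoint (hidden size 768);
# the input ids and segment layout below are purely illustrative.
import torch
import torch.nn as nn
from pytorch_pretrained_bert import BertForMaskedLM

NUM_SEGMENTS = 5  # the snippets above use 5 or len(label_list)

model = BertForMaskedLM.from_pretrained("bert-base-uncased")
# Replace the pre-trained 2-way segment embedding with a wider, freshly initialised one.
model.bert.embeddings.token_type_embeddings = nn.Embedding(NUM_SEGMENTS, 768)
model.bert.embeddings.token_type_embeddings.weight.data.normal_(mean=0.0, std=0.02)
model.eval()

# Segment ids in [0, NUM_SEGMENTS) are now legal inputs.
input_ids = torch.LongTensor([[101, 2054, 2003, 2023, 102]])
token_type_ids = torch.LongTensor([[0, 1, 2, 3, 4]])
attention_mask = torch.ones_like(input_ids)
with torch.no_grad():
    scores = model(input_ids, token_type_ids, attention_mask)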
def __init__(self, featQty, headFeatQty, useTfIdfTransform=False, useWordFeatures=True,
             useBERT=False, useHeadBERT=False, bertModelPath=None,
             torch_device='cuda', bertModelType='bert-base-uncased'):
    self.useWordFeatures = useWordFeatures
    if self.useWordFeatures:
        self.featQty = featQty
        self.countVect = CountVectorizer(ngram_range=(1, 1))
        self.tfidf = TfidfTransformer() if useTfIdfTransform else None
        self.headFeatQty = headFeatQty
        self.headCountVect = CountVectorizer(ngram_range=(1, 1))
        self.headTfidf = TfidfTransformer() if useTfIdfTransform else None
    self.useBERT = useBERT
    self.useHeadBERT = useHeadBERT
    if useBERT or useHeadBERT:
        self.torch_device = torch.device(torch_device)
        if bertModelPath is not None:
            print('Loading fine-tuned model from file:', bertModelPath)
            self.bertModelWrapper = BertForPreTraining.from_pretrained(bertModelType)
            self.bertModelWrapper.load_state_dict(torch.load(bertModelPath))
        else:
            print('Loading standard pre-trained model')
            self.bertModelWrapper = BertForMaskedLM.from_pretrained(bertModelType)
        self.bertModelWrapper.eval()
        self.bertModelWrapper.to(torch_device)
        self.bert_tokenizer = BertTokenizer.from_pretrained(bertModelType,
                                                            do_lower_case=True)
def model_transfer():
    model = BertForMaskedLM(
        config=BertConfig.from_json_file(args.bert_config_json))
    # print('language_model', model.state_dict()['bert.embeddings.word_embeddings.weight'])
    # print('language_model', model.state_dict()['bert.embeddings.LayerNorm.weight'])
    # print('language_model', model.state_dict()['bert.encoder.layer.0.attention.self.key.weight'])
    model = model.bert
    # print('bert_model', model.state_dict()['embeddings.word_embeddings.weight'])
    # print('bert_model', model.state_dict()['embeddings.LayerNorm.weight'])
    # print('bert_model', model.state_dict()['encoder.layer.0.attention.self.key.weight'])
    model_dict = model.state_dict()
    lm_dict = torch.load('./lm_smallBert/outputs/1.41_150000_step')
    for k, v in lm_dict.items():
        print(k, v)
    # print('lm_dict', lm_dict['bert.embeddings.word_embeddings.weight'])
    # print('lm_dict', lm_dict['bert.embeddings.LayerNorm.weight'])
    # print('lm_dict', lm_dict['bert.encoder.layer.0.attention.self.key.weight'])
    # Strip the leading "bert." prefix (5 characters) so the MLM checkpoint keys
    # match the bare BertModel's keys.
    pretrained_dict = {
        k[5:]: v
        for k, v in lm_dict.items() if k[5:] in model_dict.keys()
    }
    # print('pretrained_dict', pretrained_dict)
    model.load_state_dict(pretrained_dict)
    torch.save(model.state_dict(), '1.41_bert_weight.bin')
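# A hedged sanity check for the key remapping above: strip the "bert." prefix and
# confirm which tensors transfer, which stay randomly initialised, and which are
# left unused (e.g. the MLM head). The config path is a placeholder; the checkpoint
# path is the one hard-coded in model_transfer().
import torch
from pytorch_pretrained_bert.modeling import BertConfig, BertForMaskedLM

config = BertConfig.from_json_file('./lm_smallBert/bert_config.json')  # placeholder path
bert = BertForMaskedLM(config).bert
model_dict = bert.state_dict()

lm_dict = torch.load('./lm_smallBert/outputs/1.41_150000_step', map_location='cpu')
remapped = {k[len('bert.'):]: v for k, v in lm_dict.items() if k.startswith('bert.')}
kept = {k: v for k, v in remapped.items() if k in model_dict}

missing = sorted(set(model_dict) - set(kept))                # stay randomly initialised
unused = sorted(set(lm_dict) - {'bert.' + k for k in kept})  # e.g. the cls/MLM head
print('transferred {} tensors, {} missing, {} unused'.format(
    len(kept), len(missing), len(unused)))
bert.load_state_dict(kept, strict=False)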
def main(): parser = create_parser() args = parser.parse_args() logger.info(args) random.seed(args.seed) with open(ARGUMENT_RATE) as fi: argument_w = { line.split()[0]: float(line.rstrip("\n").split()[-1]) for line in fi } test_create_mask_indices(argument_w) if path.exists(args.out_file): raise FileExistsError("Already exists: {}".format(args.out_file)) if args.where_mask not in WHERE_MASK: raise ValueError("Unsupported mode = '{}'\nChoose from: {}".format( args.where_mask, WHERE_MASK)) if args.which_arg not in WHICH_ARG: raise ValueError("Unsupported mode = '{}'\nChoose from: {}".format( args.which_arg, WHICH_ARG)) logger.info("Where to mask: '{}'".format(args.where_mask)) logger.info("Whether to mask the argument: '{}'".format(args.which_arg)) logger.info("Random rate: {}".format(args.random_rate)) logger.info("Minus: {}".format(args.minus)) logger.info("How select tokens: {}".format(args.how_select)) logger.info("How many tokens to predict at once: {}".format(args.how_many)) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") logger.info("device: {}".format(device)) logger.info("BERT model: {}".format(args.bert_model)) logger.debug("Loading BERT model...") max_seq_length = 128 model = BertForMaskedLM.from_pretrained(args.bert_model) model.to(device) model.eval() tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=False) black_list = [] logger.debug("sort by length of tokens") instances = [instance for instance in tqdm(read_file(args.in_file))] sorted_instances = sorted(instances, key=lambda x: len(x["surfaces"])) logger.debug("sort is done") fo = open(args.out_file, "w") logger.debug("Start to fill the mask") for instance in tqdm(sorted_instances[10000:15000]): for pas in instance["pas"]: if len(set(pas["args"])) == 1: continue if "zero" not in pas["types"]: continue predict_sents = [] mask_indices = create_mask_indices(instance=instance, pas=pas, where_mask=args.where_mask, which_arg=args.which_arg, random_rate=args.random_rate, minus=args.minus, argument_w=argument_w) if not mask_indices: continue original_tokens = copy.deepcopy(instance["surfaces"]) masked_tokens = [ MASK if idx in mask_indices else surf for idx, surf in enumerate(instance["surfaces"]) ] feature = InputFeatures(tokens=masked_tokens, tokenizer=tokenizer, max_seq_length=max_seq_length) if feature.len > max_seq_length: continue if args.how_select == "beam": output_sents, output_tokens = prediction_with_beam_search( device=device, model=model, feature=feature, tokenizer=tokenizer, black_list=black_list, k=args.topk) for sent in output_sents: predict_sents.append(sent[1:feature.len - 1]) else: if args.how_many == "single": predict_tokens = prediction_single( device=device, model=model, feature=feature, tokenizer=tokenizer, how_select=args.how_select, black_list=black_list) elif args.how_many == "multi": predict_tokens = prediction_multi( device=device, model=model, feature=feature, tokenizer=tokenizer, how_select=args.how_select, black_list=black_list) else: raise ValueError("Unsupported value: {}".format( args.how_many)) assert len(predict_tokens) == len(feature.token_mask_ids) # tokens = feature.tokens # for idx, p_token in zip(feature.token_mask_ids, predict_tokens): # tokens[idx] = p_token filled_tokens = copy.deepcopy(masked_tokens) for idx, p_token in zip(sorted(list(mask_indices)), predict_tokens): filled_tokens[idx] = p_token predict_sents.append(filled_tokens) print("{}: {}".format(instance["file name"], instance["sentence id"]), file=fo) for idx, tokens in enumerate( 
[original_tokens, masked_tokens, *predict_sents]): case_ids = [(c_id, case) for c_id, case in enumerate(pas["args"]) if case != 3] tokens[pas["p_id"]] = add_color(tokens[pas["p_id"]], "underline") for c_id, case in case_ids: tokens[c_id] = add_color(tokens[c_id], CASE_COLOR[case]) print("{} :{}".format(idx, " ".join(tokens)), file=fo) print("\n", file=fo) fo.close() logger.info("done")
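# The script above relies on helpers such as prediction_single / prediction_multi
# that are not shown. As context, a hedged sketch (hypothetical, not the original
# implementation) of a single-pass greedy filler: one forward pass, argmax at every
# [MASK] position, skipping any token in black_list.
import torch

MASK = '[MASK]'


def greedy_fill_masks(model, tokenizer, tokens, device, black_list=()):
    """Return the predicted tokens for every [MASK] position in `tokens`
    (no padding/segment handling; a sketch only)."""
    mask_ids = [i for i, t in enumerate(tokens) if t == MASK]
    input_ids = torch.tensor([tokenizer.convert_tokens_to_ids(tokens)]).to(device)
    with torch.no_grad():
        scores = model(input_ids)          # (1, seq_len, vocab)
    bad_ids = set(tokenizer.convert_tokens_to_ids(list(black_list))) if black_list else set()
    filled = list(tokens)
    for i in mask_ids:
        ranked = torch.argsort(scores[0, i], descending=True).tolist()
        best = next(idx for idx in ranked if idx not in bad_ids)
        filled[i] = tokenizer.convert_ids_to_tokens([best])[0]
    return [filled[i] for i in mask_ids]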
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--bert_model_or_config_file", default=None, type=str, required=True, help= "Directory containing pre-trained BERT model or path of configuration file (if no pre-training)." ) parser.add_argument("--train_file", default=None, type=str, required=True, help="The input train corpus.") parser.add_argument( "--output_dir", default=None, type=str, required=True, help="The output directory where the model checkpoints will be written." ) ## Other parameters parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--learning_rate", default=3e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument( "--on_memory", action='store_true', help="Whether to load train samples into memory or use disk") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument( "--num_gpus", type=int, default=-1, help="Num GPUs to use for training (0 for none, -1 for all available)") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumualte before performing a backward/update pass." ) parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") args = parser.parse_args() # Check whether bert_model_or_config_file is a file or directory if os.path.isdir(args.bert_model_or_config_file): pretrained = True targets = [WEIGHTS_NAME, CONFIG_NAME, "tokenizer.pkl"] for t in targets: path = os.path.join(args.bert_model_or_config_file, t) if not os.path.exists(path): msg = "File '{}' not found".format(path) raise ValueError(msg) fp = os.path.join(args.bert_model_or_config_file, CONFIG_NAME) config = BertConfig(fp) else: pretrained = False config = BertConfig(args.bert_model_or_config_file) # What GPUs do we use? 
if args.num_gpus == -1: device = torch.device("cuda" if torch.cuda.is_available() else "cpu") n_gpu = torch.cuda.device_count() device_ids = None else: device = torch.device("cuda" if torch.cuda.is_available() and args.num_gpus > 0 else "cpu") n_gpu = args.num_gpus if n_gpu > 1: device_ids = list(range(n_gpu)) if args.local_rank != -1: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) # Check some other args if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps if not args.do_train: raise ValueError( "Training is currently the only implemented execution option. Please set `do_train`." ) # Seed RNGs random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) # Prepare output directory if os.path.exists(args.output_dir) and os.listdir(args.output_dir): raise ValueError( "Output directory ({}) already exists and is not empty.".format( args.output_dir)) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) # Make tokenizer if pretrained: fp = os.path.join(args.bert_model_or_config_file, "tokenizer.pkl") with open(fp, "rb") as f: tokenizer = pickle.load(f) else: training_data = [ line.strip() for line in open(args.train_file).readlines() ] tokenizer = CuneiformCharTokenizer(training_data=training_data) tokenizer.trim_vocab(config.min_freq) # Adapt vocab size in config config.vocab_size = len(tokenizer.vocab) print("Size of vocab: {}".format(len(tokenizer.vocab))) # Get training data num_train_optimization_steps = None if args.do_train: print("Loading Train Dataset", args.train_file) train_dataset = BERTDataset(args.train_file, tokenizer, seq_len=args.max_seq_length, corpus_lines=None, on_memory=args.on_memory) num_train_optimization_steps = int( len(train_dataset) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) # Prepare model if pretrained: model = BertForMaskedLM.from_pretrained(args.bert_model_or_config_file) else: model = BertForMaskedLM(config) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." 
) model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model, device_ids=device_ids) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) # Prepare training log output_log_file = os.path.join(args.output_dir, "training_log.txt") with open(output_log_file, "w") as f: f.write("Steps\tTrainLoss\n") # Start training global_step = 0 total_tr_steps = 0 if args.do_train: logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) if args.local_rank == -1: train_sampler = RandomSampler(train_dataset) else: #TODO: check if this works with current data generator from disk that relies on next(file) # (it doesn't return item back by index) train_sampler = DistributedSampler(train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) model.train() for _ in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, lm_label_ids = batch loss = model(input_ids, segment_ids, input_mask, lm_label_ids) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear( global_step / num_train_optimization_steps, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 avg_loss = tr_loss / nb_tr_examples # Update training log total_tr_steps += nb_tr_steps log_data = [str(total_tr_steps), "{:.5f}".format(avg_loss)] with open(output_log_file, "a") as f: f.write("\t".join(log_data) + "\n") # Save model logger.info("** ** * Saving model ** ** * ") model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) torch.save(model_to_save.state_dict(), output_model_file) output_config_file = os.path.join(args.output_dir, CONFIG_NAME) with open(output_config_file, 'w') as f: f.write(model_to_save.config.to_json_string()) fn = os.path.join(args.output_dir, "tokenizer.pkl") with open(fn, "wb") as f: pickle.dump(tokenizer, f)
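# The BERTDataset used above produces lm_label_ids but its masking code is not
# shown. For context, a hedged sketch of the standard BERT masking recipe it
# presumably follows (assuming a tokenizer with a .vocab dict, as BertTokenizer has;
# the custom CuneiformCharTokenizer may differ): labels of -1 are ignored by the
# model's CrossEntropyLoss(ignore_index=-1).
import random


def random_word(tokens, tokenizer, mask_prob=0.15):
    """Sketch: pick each token with probability mask_prob; replace it with [MASK]
    80% of the time, a random vocabulary token 10%, or leave it unchanged 10%.
    Returns the modified tokens and label ids (-1 = not predicted)."""
    labels = []
    for i, token in enumerate(tokens):
        if random.random() < mask_prob:
            labels.append(tokenizer.vocab.get(token, tokenizer.vocab['[UNK]']))
            dice = random.random()
            if dice < 0.8:
                tokens[i] = '[MASK]'
            elif dice < 0.9:
                tokens[i] = random.choice(list(tokenizer.vocab))
            # else: keep the original token
        else:
            labels.append(-1)
    return tokens, labels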
import plotly.offline as offline
import numpy as np
from pytorch_pretrained_bert.tokenization import load_vocab, BertTokenizer
from pytorch_pretrained_bert.modeling import BertForPreTraining, BertConfig, BertForMaskedLM
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch
import argparse
from tqdm import tqdm, trange
import os

base_path = os.path.dirname(os.path.abspath(__file__))
tokenizer = BertTokenizer(vocab_file='{}/data/vocab.txt'.format(base_path),
                          do_lower_case=True)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
model.to(device)
model.eval()
vocab = load_vocab(vocab_file='{}/data/vocab.txt'.format(base_path))
inv_vocab = {v: k for k, v in vocab.items()}


def getMI(sentence):
    tokens = tokenizer.tokenize(sentence)
    tokens.insert(0, "[CLS]")
    tokens.append("[SEP]")
    tokens_length = len(tokens)
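# getMI is truncated above. For context, a hedged sketch (not the original
# implementation) of the usual building block for such scores: mask one position
# at a time and read off the model's log-probability of the original token.
# It reuses the module-level tokenizer, model and device defined above.
def masked_token_logprobs(sentence):
    """Sketch: per-position log P(original token | rest of sentence) under the
    masked LM, skipping [CLS] and [SEP]."""
    tokens = ['[CLS]'] + tokenizer.tokenize(sentence) + ['[SEP]']
    ids = tokenizer.convert_tokens_to_ids(tokens)
    mask_id = tokenizer.convert_tokens_to_ids(['[MASK]'])[0]
    logprobs = []
    for i in range(1, len(tokens) - 1):
        masked = list(ids)
        masked[i] = mask_id
        input_ids = torch.tensor([masked]).to(device)
        with torch.no_grad():
            scores = model(input_ids)          # (1, seq_len, vocab)
        log_softmax = torch.log_softmax(scores[0, i], dim=-1)
        logprobs.append(log_softmax[ids[i]].item())
    return logprobs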
def main(): save_every_epoch = False args, train_dataloader, t_total, device, n_gpu = load_data() print("**********************************************************") print(args) # Prepare model model = BertForMaskedLM.from_pretrained( args.bert_model, cache_dir=PYTORCH_PRETRAINED_BERT_CACHE) model.to(device) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'gamma', 'beta'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0 }] optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=t_total) global_step = 0 model.train() save_model_dir = os.path.join(PYTORCH_PRETRAINED_BERT_CACHE, task_name) if not os.path.exists(save_model_dir): os.mkdir(save_model_dir) for e in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss, avg_loss, avg_acc = 0, 0, 0. nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate(train_dataloader): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch loss, _ = model(input_ids, segment_ids, input_mask, label_ids) if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps loss.backward() tr_loss += loss.item() avg_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: optimizer.step() model.zero_grad() global_step += 1 if (step + 1) % 50 == 0: print("avg_loss: {}".format(avg_loss / 50)) avg_loss = 0 if save_every_epoch: save_model_name = "BertForMaskedLM_" + task_name + "_epoch_" + str( e + 1) + modified save_model_path = os.path.join(save_model_dir, save_model_name) torch.save(model, save_model_path) else: if (e + 1) % 10 == 0: save_model_name = "BertForMaskedLM_" + task_name + "_epoch_" + str( e + 1) + modified save_model_path = os.path.join(save_model_dir, save_model_name) torch.save(model, save_model_path)
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument("--eval_dir", default=None, type=str, required=True, help="The evaluation data dir.") parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument( "--output_SR_file", default=None, type=str, required=True, help="The output directory of writing substitution selection.") parser.add_argument("--word_embeddings", default=None, type=str, required=True, help="The path of word embeddings") parser.add_argument("--word_frequency", default=None, type=str, required=True, help="The path of word frequency.") ## Other parameters parser.add_argument( "--cache_dir", default="", type=str, help= "Where do you want to store the pre-trained models downloaded from s3") parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--num_selections", default=10, type=int, help="Total number of training epochs to perform.") parser.add_argument("--num_eval_epochs", default=1, type=int, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") args = parser.parse_args() if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logging.basicConfig( format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt='%m/%d/%Y %H:%M:%S', level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN) logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_eval: raise ValueError("At least `do_eval` must be True.") tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) train_examples = None num_train_optimization_steps = None # Prepare model cache_dir = args.cache_dir if args.cache_dir else os.path.join( str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format( args.local_rank)) model = BertForMaskedLM.from_pretrained(args.bert_model, cache_dir=cache_dir) if args.fp16: model.half() model.to(device) output_sr_file = open(args.output_SR_file, "a+") print("Loading embeddings ...") wordVecPath = args.word_embeddings #wordVecPath = "/media/qiang/ee63f41d-4004-44fe-bcfd-522df9f2eee8/glove.840B.300d.txt" fasttext_dico, fasttext_emb = getWordmap(wordVecPath) stopword = set(stopwords.words('english')) word_count_path = args.word_frequency #word_count_path = "word_frequency_wiki.txt" word_count = getWordCount(word_count_path) ps = PorterStemmer() SS = [] substitution_words = [] source_words = [] num_selection = args.num_selections bre_i = 0 window_context = 11 if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): fileName = args.eval_dir.split('/')[-1][:-4] if fileName == 'lex.mturk': eval_examples, mask_words, mask_labels = read_eval_dataset( args.eval_dir) else: eval_examples, mask_words, mask_labels = read_eval_index_dataset( args.eval_dir) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) #logger.info(" Batch size = %d", args.eval_batch_size) model.eval() eval_size = len(eval_examples) for i in range(eval_size): print('Sentence {} rankings: '.format(i)) #output_sr_file.write(str(i)) #output_sr_file.write(' sentence: ') #output_sr_file.write('\n') tokens, words, position = convert_sentence_to_token( eval_examples[i], args.max_seq_length, tokenizer) assert len(words) == len(position) mask_index = words.index(mask_words[i]) mask_context = extract_context(words, mask_index, window_context) len_tokens = len(tokens) 
mask_position = position[mask_index] if isinstance(mask_position, list): feature = convert_whole_word_to_feature( tokens, mask_position, args.max_seq_length, tokenizer) else: feature = convert_token_to_feature(tokens, mask_position, args.max_seq_length, tokenizer) tokens_tensor = torch.tensor([feature.input_ids]) token_type_ids = torch.tensor([feature.input_type_ids]) attention_mask = torch.tensor([feature.input_mask]) tokens_tensor = tokens_tensor.to('cuda') token_type_ids = token_type_ids.to('cuda') attention_mask = attention_mask.to('cuda') # Predict all tokens with torch.no_grad(): prediction_scores = model(tokens_tensor, token_type_ids, attention_mask) if isinstance(mask_position, list): predicted_top = prediction_scores[0, mask_position[0]].topk(20) else: predicted_top = prediction_scores[0, mask_position].topk(20) #print(predicted_top[0].cpu().numpy()) pre_tokens = tokenizer.convert_ids_to_tokens( predicted_top[1].cpu().numpy()) #print(pre_tokens) #print(predicted_top[0].cpu().numpy()) #break ss = substitution_selection(mask_words[i], pre_tokens, predicted_top[0].cpu().numpy(), ps, num_selection) print('ssss------') print(ss) SS.append(ss) #break #print(mask_words[i], ":", ss) source_words.append(mask_words[i]) #pre_word = substitution_ranking2(mask_words[i], ss, fasttext_dico, fasttext_emb,word_count) pre_word = substitution_ranking(mask_words[i], mask_context, ss, fasttext_dico, fasttext_emb, word_count, tokenizer, model, mask_labels[i]) substitution_words.append(pre_word) #if(bre_i==5): # break #bre_i += 1 potential, precision, recall, F_score = evaulation_SS_scores( SS, mask_labels) print("The score of evaluation for substitution selection") output_sr_file.write(str(args.num_selections)) output_sr_file.write('\t') output_sr_file.write(str(precision)) output_sr_file.write('\t') output_sr_file.write(str(recall)) output_sr_file.write('\t') output_sr_file.write(str(F_score)) output_sr_file.write('\t') print(potential, precision, recall, F_score) precision, accuracy, changed_proportion = evaulation_pipeline_scores( substitution_words, source_words, mask_labels) print("The score of evaluation for full LS pipeline") print(precision, accuracy, changed_proportion) output_sr_file.write(str(precision)) output_sr_file.write('\t') output_sr_file.write(str(accuracy)) output_sr_file.write('\n')
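# substitution_selection() is called above but not shown. A hedged, hypothetical
# sketch of that step: keep the highest-ranked BERT predictions that are whole
# words and not mere morphological variants of the complex word (pre_tokens are
# assumed to arrive already sorted by model score, as in the call above).
def substitution_selection_sketch(complex_word, pre_tokens, ps, num_selection=10):
    selected = []
    source_stem = ps.stem(complex_word)
    for token in pre_tokens:
        if token.startswith('##'):            # drop WordPiece continuations
            continue
        if token.lower() == complex_word.lower():
            continue
        if ps.stem(token) == source_stem:     # drop inflections of the same word
            continue
        selected.append(token)
        if len(selected) == num_selection:
            break
    return selected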
def attack(fuzz_val, top_k_words, qrs, wts, sample_index, text_ls, true_label, predictor, stop_words_set, word2idx, idx2word, cos_sim, word_embedding, sim_predictor=None, import_score_threshold=-1., sim_score_threshold=0.5, sim_score_window=15, synonym_num=50, batch_size=32): rows = [] nlp = spacy.load('en_core_web_sm') masked_lang_model = BertForMaskedLM.from_pretrained('bert-base-uncased') tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') changed_with = [] doc = nlp(' '.join(text_ls)) text = [] for sent in doc.sents: for token in sent: text.append(token.text) tok_text = [] for item in text: ap = item.find("'") if ap >= 0: tok_text.append(item[0:ap]) tok_text.append("'") tok_text.append(item[ap + 1:len(item)]) else: tok_text.append(item) text = [] for item in tok_text: if len(item) > 0: text.append(item) text_ls = text[:] # first check the prediction of the original text orig_probs = predictor([text_ls]).squeeze() orig_label = torch.argmax(orig_probs) orig_prob = orig_probs.max() if true_label != orig_label: return '', 0, orig_label, orig_label, 0, [], [] else: len_text = len(text_ls) if len_text < sim_score_window: sim_score_threshold = 0.1 # shut down the similarity thresholding function half_sim_score_window = (sim_score_window - 1) // 2 num_queries = 1 # get the pos and verb tense info pos_ls = criteria.get_pos(text_ls) # get importance score leave_1_texts = [ text_ls[:ii] + ['<oov>'] + text_ls[min(ii + 1, len_text):] for ii in range(len_text) ] leave_1_probs = predictor(leave_1_texts, batch_size=batch_size) num_queries += len(leave_1_texts) leave_1_probs_argmax = torch.argmax(leave_1_probs, dim=-1) import_scores = ( orig_prob - leave_1_probs[:, orig_label] + (leave_1_probs_argmax != orig_label).float() * (leave_1_probs.max(dim=-1)[0] - torch.index_select( orig_probs, 0, leave_1_probs_argmax))).data.cpu().numpy() # get words to perturb ranked by importance score for word in words_perturb words_perturb = [] for idx, score in sorted(enumerate(import_scores), key=lambda x: x[1], reverse=True): try: if score > import_score_threshold and text_ls[ idx] not in stop_words_set and len(text_ls[idx]) > 2: words_perturb.append((idx, score)) except: print(idx, len(text_ls), import_scores.shape, text_ls, len(leave_1_texts)) #return '', 0, orig_label, orig_label, 0, [], words_perturb # find synonyms words_perturb_idx = [ word2idx[word] for idx, word in words_perturb if word in word2idx ] #synonym_words, synonym_values, synonyms_dict = pick_most_similar_words_batch(words_perturb_idx, cos_sim, idx2word, synonym_num, -1.0) # start replacing and attacking text_prime = text_ls[:] sims = [] text_cache = text_prime[:] num_changed = 0 for idx, score in words_perturb: #print(text_ls[idx]) text_range_min, text_range_max = calc_window(idx, 3, 10, len_text) sliced_text = text_prime[text_range_min:text_range_max] #print(sliced_text) new_index = idx - text_range_min #print(sliced_text[new_index]) masked_idx = new_index tokens, words, position = gen.convert_sentence_to_token( ' '.join(sliced_text), 1000, tokenizer) assert len(words) == len(position) len_tokens = len(tokens) mask_position = position[masked_idx] if isinstance(mask_position, list): feature = gen.convert_whole_word_to_feature( tokens, mask_position, 1000, tokenizer) else: feature = gen.convert_token_to_feature(tokens, mask_position, 1000, tokenizer) tokens_tensor = torch.tensor([feature.input_ids]) token_type_ids = torch.tensor([feature.input_type_ids]) attention_mask = torch.tensor([feature.input_mask]) tokens_tensor = 
tokens_tensor.to('cuda') token_type_ids = token_type_ids.to('cuda') attention_mask = attention_mask.to('cuda') #new_probs = predictor(new_texts, batch_size=batch_size) masked_lang_model.to('cuda') masked_lang_model.eval() ps = PorterStemmer() with torch.no_grad(): prediction_scores = masked_lang_model(tokens_tensor, token_type_ids, attention_mask) if isinstance(mask_position, list): predicted_top = prediction_scores[0, mask_position[0]].topk(50) else: predicted_top = prediction_scores[0, mask_position].topk(50) pre_tokens = tokenizer.convert_ids_to_tokens( predicted_top[1].cpu().numpy()) synonyms_initial = gen.substitution_generation( words[masked_idx], pre_tokens, predicted_top[0].cpu().numpy(), ps, 50) new_texts = [] avg = [] synonyms = [] assert words[masked_idx] == text_ls[idx] #print(synonyms) for candidate_word in synonyms_initial: if candidate_word in word_embedding and words[ masked_idx] in word_embedding: candidate_similarity = calc_similarity( word_embedding[words[masked_idx]], word_embedding[candidate_word]) avg.append(candidate_similarity) #print(words[masked_idx], candidate_similarity, candidate_word) if candidate_similarity >= 0.2: new_texts.append(text_prime[:idx] + [candidate_word] + text_prime[min(idx + 1, len_text):]) synonyms.append(candidate_word) else: new_texts.append(text_prime[:idx] + [candidate_word] + text_prime[min(idx + 1, len_text):]) synonyms.append(candidate_word) #print(len(new_texts)) if len(new_texts) == 0: continue text_range_min, text_range_max = calc_window( idx, half_sim_score_window, sim_score_window, len_text) semantic_sims = \ sim_predictor.semantic_sim([' '.join(text_cache[text_range_min:text_range_max])] * len(new_texts), list(map(lambda x: ' '.join(x[text_range_min:text_range_max]), new_texts)))[0] sims.append(np.sum(semantic_sims) / len(semantic_sims)) new_probs_mask = np.ones( len(new_texts) ) #(orig_label != torch.argmax(new_probs, dim=-1)).data.cpu().numpy() # prevent bad synonyms new_probs_mask *= (semantic_sims >= sim_score_threshold) # prevent incompatible pos synonyms_pos_ls = [ criteria.get_pos(new_text[max(idx - 4, 0):idx + 5])[min(4, idx)] if len(new_text) > 10 else criteria.get_pos(new_text)[idx] for new_text in new_texts ] pos_mask = np.array( criteria.pos_filter(pos_ls[idx], synonyms_pos_ls)) new_probs_mask *= pos_mask new_vals = semantic_sims * new_probs_mask index = [] mini = 2 for i in range(len(new_vals)): if new_vals[i] > 0: index.append((new_vals[i], i)) if len(index) == 0: continue new_texts1 = [new_texts[ind] for val, ind in index] #print(len(new_texts1)) num_queries += len(new_texts1) if num_queries > qrs: return '', 0, orig_label, orig_label, 0, [], [] new_probs = predictor(new_texts1, batch_size=batch_size) if len(new_probs.shape) < 2: new_probs = new_probs.unsqueeze(0) pr = (orig_label != torch.argmax(new_probs, dim=-1)).data.cpu().numpy() if np.sum(pr) > 0: text_prime[idx] = synonyms[index[pr.argmax( )][1]] #synonyms[(new_probs_mask * semantic_sims).argmax()] num_changed += 1 break else: new_label_probs = new_probs[:, orig_label] new_label_prob_min, new_label_prob_argmin = torch.min( new_label_probs, dim=-1) if new_label_prob_min < orig_prob: text_prime[idx] = synonyms[index[new_label_prob_argmin][1]] num_changed += 1 text_cache = text_prime[:] if fuzz.token_set_ratio(' '.join(text_ls), ' '.join(text_cache)) < fuzz_val: return ' '.join( text_prime), num_changed, orig_label, torch.argmax( predictor([text_prime ])), num_queries, words_perturb, sims return ' '.join(text_prime), num_changed, orig_label, torch.argmax( 
predictor([text_prime])), num_queries, words_perturb, sims
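# calc_window(idx, half_window, window_size, text_len) is used above but not shown.
# A hedged sketch, in the style of the TextFooler similarity window: take roughly
# window_size tokens around position idx, clamped to the text boundaries.
def calc_window(idx, half_window, window_size, text_len):
    if idx >= half_window and text_len - idx - 1 >= half_window:
        return idx - half_window, idx + half_window + 1
    elif idx < half_window <= text_len - idx - 1:
        return 0, window_size
    elif idx >= half_window > text_len - idx - 1:
        return max(0, text_len - window_size), text_len
    else:
        return 0, text_len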
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument("--eval_dir", default=None, type=str, required=True, help="The evaluation data dir.") parser.add_argument("--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument("--output_SR_file", default=None, type=str, required=True, help="The output directory of writing substitution selection.") parser.add_argument("--word_embeddings", default=None, type=str, required=True, help="The path of word embeddings") parser.add_argument("--word_frequency", default=None, type=str, required=True, help="The path of word frequency.") ## Other parameters parser.add_argument("--cache_dir", default="", type=str, help="Where do you want to store the pre-trained models downloaded from s3") parser.add_argument("--max_seq_length", default=250, type=int, help="The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--num_selections", default=20, type=int, help="Total number of training epochs to perform.") parser.add_argument("--num_eval_epochs", default=1, type=int, help="Total number of training epochs to perform.") parser.add_argument("--prob_mask", default=0.5, type=float, help="Proportion of the masked words in first sentence. 
" "E.g., 0.1 = 10%% of training.") parser.add_argument("--ppdb", default="./ppdb-2.0-tldr", type=str, required=True, help="The path of word frequency.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") args = parser.parse_args() if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_eval: raise ValueError("At least `do_eval` must be True.") tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) # Prepare model model = BertForMaskedLM.from_pretrained(args.bert_model,output_attentions=True) model.to(device) output_sr_file = open(args.output_SR_file,"a+") print("Loading embeddings ...") wordVecPath = args.word_embeddings #wordVecPath = "/media/qiang/ee63f41d-4004-44fe-bcfd-522df9f2eee8/glove.840B.300d.txt" fasttext_dico, fasttext_emb = getWordmap(wordVecPath) #stopword = set(stopwords.words('english')) word_count_path = args.word_frequency #word_count_path = "word_frequency_wiki.txt" word_count = getWordCount(word_count_path) ps = PorterStemmer() print("loading PPDB ...") ppdb_path = args.ppdb ppdb_model = Ppdb(ppdb_path) CGBERT = [] substitution_words = [] num_selection = args.num_selections window_context = 11 if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): fileName = args.eval_dir.split('/')[-1][:-4] if fileName=='lex.mturk': eval_examples, mask_words, mask_labels = read_eval_dataset(args.eval_dir) else: eval_examples, mask_words, mask_labels = read_eval_index_dataset(args.eval_dir) eval_size = len(eval_examples) print("***** Running evaluation *****") print(" Num examples = %d", eval_size) #logger.info(" Batch size = %d", args.eval_batch_size) model.eval() for i in range(eval_size): print('Sentence {} rankings: '.format(i)) #output_sr_file.write(str(i)) #output_sr_file.write(' sentence: ') #output_sr_file.write('\n') print(eval_examples[i]) print(mask_words[i]) tokens, words, position = convert_sentence_to_token(eval_examples[i], args.max_seq_length, tokenizer) assert len(words)==len(position) mask_index = words.index(mask_words[i]) mask_context = extract_context(words,mask_index,window_context) len_tokens = len(tokens) mask_position = position[mask_index] if isinstance(mask_position,list): feature = convert_whole_word_to_feature(tokens, mask_position, args.max_seq_length, tokenizer, args.prob_mask) else: feature = convert_token_to_feature(tokens, mask_position, args.max_seq_length, tokenizer, args.prob_mask) tokens_tensor = torch.tensor([feature.input_ids]) token_type_ids = torch.tensor([feature.input_type_ids]) attention_mask = torch.tensor([feature.input_mask]) tokens_tensor = tokens_tensor.to('cuda') token_type_ids = token_type_ids.to('cuda') attention_mask = attention_mask.to('cuda') # Predict all tokens with torch.no_grad(): all_attentions,prediction_scores = 
model(tokens_tensor, token_type_ids,attention_mask) if isinstance(mask_position,list): predicted_top = prediction_scores[0, mask_position[0]].topk(80) else: predicted_top = prediction_scores[0, mask_position].topk(80) #print(predicted_top[0].cpu().numpy()) pre_tokens = tokenizer.convert_ids_to_tokens(predicted_top[1].cpu().numpy()) #print(predicted_top[0].cpu().numpy()) sentence = eval_examples[i].lower() words = word_tokenize(sentence) words_tag = nltk.pos_tag(words) complex_word_index = words.index(mask_words[i]) complex_word_tag = words_tag[complex_word_index][1] complex_word_tag = preprocess_tag(complex_word_tag) cgPPDB = ppdb_model.predict(mask_words[i],complex_word_tag) cgBERT = BERT_candidate_generation(mask_words[i], pre_tokens, predicted_top[0].cpu().numpy(), ps, args.num_selections) print(cgBERT) CGBERT.append(cgBERT) pre_word = substitution_ranking(mask_words[i], mask_context, cgBERT, fasttext_dico, fasttext_emb,word_count,cgPPDB,tokenizer,model,mask_labels[i]) substitution_words.append(pre_word) potential,precision,recall,F_score=evaulation_SS_scores(CGBERT, mask_labels) print("The score of evaluation for BERT candidate generation") print(potential,precision,recall,F_score) output_sr_file.write(str(args.num_selections)) output_sr_file.write('\t') output_sr_file.write(str(potential)) output_sr_file.write('\t') output_sr_file.write(str(precision)) output_sr_file.write('\t') output_sr_file.write(str(recall)) output_sr_file.write('\t') output_sr_file.write(str(F_score)) output_sr_file.write('\t') precision,accuracy,changed_proportion=evaulation_pipeline_scores(substitution_words, mask_words, mask_labels) print("The score of evaluation for full LS pipeline") print(precision,accuracy,changed_proportion) output_sr_file.write(str(precision)) output_sr_file.write('\t') output_sr_file.write(str(accuracy)) output_sr_file.write('\t') output_sr_file.write(str(changed_proportion)) output_sr_file.write('\n') output_sr_file.close()
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument("--train_corpus", default=None, type=str, required=True, help="The input train corpus.") parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese." ) parser.add_argument( "--output_dir", default=None, type=str, required=True, help="The output directory where the model checkpoints will be written." ) ## Other parameters parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--learning_rate", default=3e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument( "--on_memory", action='store_true', help="Whether to load train samples into memory or use disk") parser.add_argument( "--do_lower_case", action='store_true', help= "Whether to lower case the input text. True for uncased models, False for cased models." ) parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumualte before performing a backward/update pass." ) parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument('--discriminative_finetuning', action='store_true', help='Whether to use discriminative fine-tuning') args = parser.parse_args() if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". 
format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train: raise ValueError( "Training is currently the only implemented execution option. Please set `do_train`." ) if os.path.exists(args.output_dir) and os.listdir(args.output_dir): raise ValueError( "Output directory ({}) already exists and is not empty.".format( args.output_dir)) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) #train_examples = None num_train_optimization_steps = None if args.do_train: print("Loading Train Dataset", args.train_corpus) train_dataset = BERTDataset(args.train_corpus, tokenizer, seq_len=args.max_seq_length, corpus_lines=None, on_memory=args.on_memory) num_train_optimization_steps = int( len(train_dataset) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) # Prepare model ############################################################################# # model = BertForPreTraining.from_pretrained(args.bert_model) model = BertForMaskedLM.from_pretrained(args.bert_model) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." 
) model = DDP(model) elif n_gpu > 1: # model = torch.nn.DataParallel(model) model = DataParallelModel(model) # Prepare optimizer if args.do_train: param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] if args.discriminative_finetuning: group1 = ['layer.0', 'layer.1.'] group2 = ['layer.2', 'layer.3'] group3 = ['layer.4', 'layer.5'] group4 = ['layer.6', 'layer.7'] group5 = ['layer.8', 'layer.9'] group6 = ['layer.10', 'layer.11'] group_all = ['layer.0', 'layer.1', 'layer.2', 'layer.3', 'layer.4', 'layer.5', \ 'layer.6', 'layer.7', 'layer.8', 'layer.9', 'layer.10', 'layer.11'] optimizer_grouped_parameters = [ {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay) and not any(nd in n for nd in group_all)], \ 'weight_decay': 0.01}, {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay) and any(nd in n for nd in group1)], \ 'weight_decay': 0.01, 'lr': args.learning_rate/2.6**5}, {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay) and any(nd in n for nd in group2)], \ 'weight_decay': 0.01, 'lr': args.learning_rate/2.6**4}, {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay) and any(nd in n for nd in group3)], \ 'weight_decay': 0.01, 'lr': args.learning_rate/2.6**3}, {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay) and any(nd in n for nd in group4)], \ 'weight_decay': 0.01, 'lr': args.learning_rate/2.6**2}, {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay) and any(nd in n for nd in group5)], \ 'weight_decay': 0.01, 'lr': args.learning_rate/2.6}, {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay) and any(nd in n for nd in group6)], \ 'weight_decay': 0.01, 'lr': args.learning_rate}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay) and not any(nd in n for nd in group_all)], \ 'weight_decay': 0.0}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay) and any(nd in n for nd in group1)], \ 'weight_decay': 0.0, 'lr': args.learning_rate/2.6**5}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay) and any(nd in n for nd in group2)], \ 'weight_decay': 0.0, 'lr': args.learning_rate/2.6**4}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay) and any(nd in n for nd in group3)], \ 'weight_decay': 0.0, 'lr': args.learning_rate/2.6**3}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay) and any(nd in n for nd in group4)], \ 'weight_decay': 0.0, 'lr': args.learning_rate/2.6**2}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay) and any(nd in n for nd in group5)], \ 'weight_decay': 0.0, 'lr': args.learning_rate/2.6}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay) and any(nd in n for nd in group6)], \ 'weight_decay': 0.0, 'lr': args.learning_rate}, ] else: optimizer_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], 'weight_decay': 0.01 }, { 'params': [ p for n, p in param_optimizer if any(nd in n for nd in no_decay) ], 'weight_decay': 0.0 }] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." 
) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) warmup_linear = WarmupLinearSchedule( warmup=args.warmup_proportion, t_total=num_train_optimization_steps) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) global_step = 0 if args.do_train: logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) if args.local_rank == -1: train_sampler = RandomSampler(train_dataset) else: #TODO: check if this works with current data generator from disk that relies on next(file) # (it doesn't return item back by index) train_sampler = DistributedSampler(train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, drop_last=True) model.train() for _ in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, lm_label_ids, is_next = batch logits = model(input_ids, segment_ids, input_mask) loss_fct = CrossEntropyLoss(ignore_index=-1) loss_fct = DataParallelCriterion(loss_fct) logits = [ logits[i].view(-1, model.module.config.vocab_size) for i in range(len(logits)) ] loss = loss_fct(logits, lm_label_ids.view(-1)) # loss = model(input_ids, segment_ids, input_mask, lm_label_ids, is_next) # loss = model(input_ids, segment_ids, input_mask, lm_label_ids) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear.get_lr( global_step, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 # Save a trained model logger.info("** ** * Saving fine - tuned model ** ** * ") model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) output_config_file = os.path.join(args.output_dir, CONFIG_NAME) if args.do_train: torch.save(model_to_save.state_dict(), output_model_file) model_to_save.config.to_json_file(output_config_file) tokenizer.save_vocabulary(args.output_dir)
def main(): parser = argparse.ArgumentParser() parser.add_argument("--output_file", default=None, type=str, required=True) parser.add_argument("--data_dir", default="/work01/ryuto/data/NTC_processed", type=str) parser.add_argument( "--bert_model", default="/home/ryuto/data/jap_BERT/", type=str, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese." ) parser.add_argument( "--vocab", default="/home/ryuto/data/NTC_Matsu_original/wordIndex.txt", type=str) # model parameters parser.add_argument( "--do_lower_case", action='store_true', help= "Set this flag if you are using an uncased model. (If Japanese model, set false)" ) parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. Sequences longer " "than this will be truncated, and sequences shorter than this will be padded." ) # Hyper parameter parser.add_argument('--seed', type=int, default=2020) parser.add_argument('--insert_max', type=int, default=10) parser.add_argument('--insert_min', type=int, default=3) parser.add_argument('--target_max', type=int, default=3) parser.add_argument('--target_min', type=int, default=1) parser.add_argument('--iteration', type=int, default=3) parser.add_argument('--data_ratio', type=float, default=100) args = parser.parse_args() # Seed random.seed(args.seed) # vocab & tokenizer vocab = set_vocab(args.vocab) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) # Extract predicates predicates = extract_predicates(vocab=vocab, data_dir=args.data_dir) random.shuffle(predicates) # model device = torch.device("cuda" if torch.cuda.is_available() else "cpu") model = BertForMaskedLM.from_pretrained(args.bert_model) model.to(device) model.eval() counter = 0 data_size = int(len(predicates) * args.data_ratio / 100) with open(args.output_file, "w", encoding='utf-8') as writer: for predicate in tqdm(predicates[:data_size]): for case in CASES: for _ in range(args.iteration): # insert MASK and case n_target = random.randint(args.target_min, args.target_max) text_a = [CLS] + [MASK] * n_target + [case, predicate, SEP] tokens = tokenizer.tokenize(" ".join(text_a)) mask_ids = [ idx for idx, token in enumerate(tokens) if token == MASK ] trg_id = mask_ids[-1] black_list = [predicate] # predict MASK tokens = prediction(model=model, seq_length=args.max_seq_length, device=device, tokenizer=tokenizer, tokens=tokens, mask_ids=mask_ids, black_list=black_list, how_select="sample") # insert MASK n_insert = random.randint(args.insert_min, args.insert_max) tokens = tokens[:trg_id + 2] + [MASK] * n_insert + tokens[trg_id + 2:] mask_ids2 = [ idx for idx, token in enumerate(tokens) if token == MASK ] # predict MASK tokens = prediction(model=model, seq_length=args.max_seq_length, device=device, tokenizer=tokenizer, tokens=tokens, mask_ids=mask_ids2, black_list=black_list, how_select="argmax") target = tokens[mask_ids[0]:mask_ids[-1] + 2] chunk = tokens[mask_ids2[0]:mask_ids2[-1] + 1] prd = tokens[mask_ids2[-1] + 1:len(tokens) - 1] target_tokens, target_ids = convert_bert_predicts_to_ids( target, vocab) chunk_tokens, chunk_ids = convert_bert_predicts_to_ids( chunk, vocab) predicate_tokens, predicate_ids = convert_bert_predicts_to_ids( prd, vocab) concat_surfs = target_tokens + chunk_tokens + predicate_tokens concat_ids = target_ids + chunk_ids + predicate_ids p_id = len(concat_surfs) - 1 labels = [3] * len(concat_surfs) 
labels[len(target_tokens) - 2] = CASES[case] instance = { "tokens": concat_ids, "surfaces": concat_surfs, "pas": [{ "p_id": p_id, "args": labels }] } print(json.dumps(instance), file=writer) if counter < 5: counter += 1 logger.info("{} + {} = {} {} {}".format( predicate, case, "".join(target_tokens), "".join(chunk_tokens), "".join(predicate_tokens)))
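# The prediction(...) helper used above lives elsewhere in this repo; a minimal
# sketch of one fill step under the same interface assumptions (tokens already
# contain '[MASK]' entries, black-listed tokens are suppressed, how_select is
# 'argmax' or 'sample'). fill_masks_once is an illustrative name only.
import torch

def fill_masks_once(model, tokenizer, tokens, mask_ids, device,
                    black_list=(), how_select="argmax"):
    input_ids = torch.tensor([tokenizer.convert_tokens_to_ids(tokens)]).to(device)
    with torch.no_grad():
        scores = model(input_ids)  # [1, seq_len, vocab_size]
    bad_ids = tokenizer.convert_tokens_to_ids(list(black_list)) if black_list else []
    for idx in mask_ids:
        logits = scores[0, idx].clone()
        if bad_ids:
            logits[bad_ids] = -1e10  # never emit a black-listed token
        if how_select == "sample":
            pred_id = torch.multinomial(torch.softmax(logits, dim=-1), 1).item()
        else:
            pred_id = torch.argmax(logits).item()
        tokens[idx] = tokenizer.convert_ids_to_tokens([pred_id])[0]
    return tokens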
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument("--train_file", default=None, type=str, required=True, help="The input train corpus.") parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese." ) parser.add_argument( "--output_dir", default=None, type=str, required=True, help="The output directory where the model checkpoints will be written." ) ## Other parameters parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--learning_rate", default=3e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument( "--on_memory", action='store_true', help="Whether to load train samples into memory or use disk") parser.add_argument( "--do_lower_case", action='store_true', help= "Whether to lower case the input text. True for uncased models, False for cased models." ) parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumualte before performing a backward/update pass." ) parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument("--hybrid_attention", action='store_true', help="Whether to use hybrid attention") parser.add_argument("--continue_training", action='store_true', help="Continue training from a checkpoint") args = parser.parse_args() if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". 
format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train: raise ValueError( "Training is currently the only implemented execution option. Please set `do_train`." ) if os.path.exists(args.output_dir) and os.listdir( args.output_dir) and not args.continue_training: raise ValueError( "Output directory ({}) already exists and is not empty.".format( args.output_dir)) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) #train_examples = None num_train_optimization_steps = None if args.do_train: print("Loading Train Dataset", args.train_file) train_dataset = BERTDataset(args.train_file, tokenizer, seq_len=args.max_seq_length, corpus_lines=None, on_memory=args.on_memory) num_train_optimization_steps = int( len(train_dataset) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) # Prepare model model = BertForMaskedLM.from_pretrained(args.bert_model) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." 
) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) if args.hybrid_attention: max_seq_length = args.max_seq_length attention_mask = torch.ones(12, max_seq_length, max_seq_length, dtype=torch.long) # left attention attention_mask[:2, :, :] = torch.tril( torch.ones(max_seq_length, max_seq_length, dtype=torch.long)) # right attention attention_mask[2:4, :, :] = torch.triu( torch.ones(max_seq_length, max_seq_length, dtype=torch.long)) # local attention, window size = 3 attention_mask[4:6, :, :] = torch.triu( torch.tril( torch.ones(max_seq_length, max_seq_length, dtype=torch.long), 1), -1) attention_mask = torch.cat( [attention_mask.unsqueeze(0) for _ in range(8)]) attention_mask = attention_mask.to(device) else: attention_mask = None global_step = 0 epoch_start = 0 if args.do_train: if args.continue_training: # if checkpoint file exists, find the last checkpoint if os.path.exists(args.output_dir) and os.listdir(args.output_dir): all_cp = os.listdir(args.output_dir) steps = [ int(re.search('_\d+', cp).group()[1:]) for cp in all_cp if re.search('_\d+', cp) ] if len(steps) == 0: raise ValueError( "No existing checkpoint. Please do not use --continue_training." ) max_step = max(steps) # load checkpoint checkpoint = torch.load( os.path.join(args.output_dir, 'checkpoints_' + str(max_step) + '.pt')) logger.info("***** Loading checkpoint *****") logger.info(" Num steps = %d", checkpoint['global_step']) logger.info(" Num epoch = %d", checkpoint['epoch']) logger.info(" Loss = %d, %d", checkpoint['loss'], checkpoint['loss_now']) model.module.load_state_dict(checkpoint['model']) optimizer.load_state_dict(checkpoint['optimizer']) global_step = checkpoint['global_step'] epoch_start = checkpoint['epoch'] del checkpoint else: raise ValueError( "No existing checkpoint. Please do not use --continue_training." ) writer = SummaryWriter(log_dir=os.environ['HOME']) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) if args.local_rank == -1: train_sampler = RandomSampler(train_dataset) else: #TODO: check if this works with current data generator from disk that relies on next(file) # (it doesn't return item back by index) train_sampler = DistributedSampler(train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) model.train() tr_loss_1000 = 0 for ep in trange(epoch_start, int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, lm_label_ids = batch loss = model(input_ids, segment_ids, input_mask, lm_label_ids, hybrid_mask=attention_mask) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() tr_loss += loss.item() tr_loss_1000 += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear( global_step / num_train_optimization_steps, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 # log the training loss for every 1000 steps if global_step % 1000 == 999: writer.add_scalar('data/loss', tr_loss_1000 / 1000, global_step) logger.info("training steps: %s", global_step) logger.info("training loss per 1000: %s", tr_loss_1000 / 1000) tr_loss_1000 = 0 # save the checkpoint for every 10000 steps if global_step % 10000 == 0: model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_file = os.path.join( args.output_dir, "checkpoints_" + str(global_step) + ".pt") checkpoint = { 'model': model_to_save.state_dict(), 'optimizer': optimizer.state_dict(), 'epoch': ep, 'global_step': global_step, 'loss': tr_loss / nb_tr_steps, 'loss_now': tr_loss_1000 } if args.do_train: torch.save(checkpoint, output_file) model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_model_file = os.path.join(args.output_dir, "pytorch_model.bin_" + str(ep)) if args.do_train: torch.save(model_to_save.state_dict(), output_model_file) logger.info("training loss: %s", tr_loss / nb_tr_steps) # Save a trained model logger.info("** ** * Saving fine - tuned model ** ** * ") model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_model_file = os.path.join(args.output_dir, "pytorch_model.bin") if args.do_train: torch.save(model_to_save.state_dict(), output_model_file)
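# Quick sanity check of the three head-wise patterns built for the hybrid
# attention mask above, on a toy length-5 sequence (illustrative only):
# tril gives left-only context, triu gives right-only context, and
# triu(tril(., 1), -1) gives a local window of width 3.
import torch
L = 5
ones = torch.ones(L, L, dtype=torch.long)
left_mask = torch.tril(ones)                      # attend to self and left context
right_mask = torch.triu(ones)                     # attend to self and right context
local_mask = torch.triu(torch.tril(ones, 1), -1)  # previous, current and next token
print(local_mask)
# tensor([[1, 1, 0, 0, 0],
#         [1, 1, 1, 0, 0],
#         [0, 1, 1, 1, 0],
#         [0, 0, 1, 1, 1],
#         [0, 0, 0, 1, 1]])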
def run_aug(args, save_every_epoch=False): processors = { # you can your processor here "TREC": AugProcessor, "stsa.fine": AugProcessor, "stsa.binary": AugProcessor, "mpqa": AugProcessor, "rt-polarity": AugProcessor, "subj": AugProcessor, } task_name = args.task_name if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) args.data_dir = os.path.join(args.data_dir, task_name) args.output_dir = os.path.join(args.output_dir, task_name) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) os.makedirs(args.output_dir, exist_ok=True) processor = processors[task_name]() label_list = processor.get_labels(task_name) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) train_examples = None num_train_steps = None train_examples = processor.get_train_examples(args.data_dir) #dev_examples = processor.get_dev_examples(args.data_dir) #train_examples.extend(dev_examples) num_train_steps = int( len(train_examples) / args.train_batch_size * args.num_train_epochs) # Prepare model model = BertForMaskedLM.from_pretrained( args.bert_model, cache_dir=PYTORCH_PRETRAINED_BERT_CACHE) if task_name == 'stsa.fine': model.bert.embeddings.token_type_embeddings = torch.nn.Embedding( 5, 768) model.bert.embeddings.token_type_embeddings.weight.data.normal_( mean=0.0, std=0.02) elif task_name == 'TREC': model.bert.embeddings.token_type_embeddings = torch.nn.Embedding( 6, 768) model.bert.embeddings.token_type_embeddings.weight.data.normal_( mean=0.0, std=0.02) model.cuda() # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'gamma', 'beta'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0 }] t_total = num_train_steps optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=t_total) global_step = 0 train_features = convert_examples_to_features(train_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_steps) all_init_ids = torch.tensor([f.init_ids for f in train_features], dtype=torch.long) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_masked_lm_labels = torch.tensor( [f.masked_lm_labels for f in train_features], dtype=torch.long) train_data = TensorDataset(all_init_ids, all_input_ids, all_input_mask, all_segment_ids, all_masked_lm_labels) train_sampler = RandomSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() save_model_dir = os.path.join(PYTORCH_PRETRAINED_BERT_CACHE, task_name) if not os.path.exists(save_model_dir): os.mkdir(save_model_dir) for e in trange(int(args.num_train_epochs), desc="Epoch"): avg_loss = 0. 
for step, batch in enumerate(train_dataloader): batch = tuple(t.cuda() for t in batch) _, input_ids, input_mask, segment_ids, masked_ids = batch loss = model(input_ids, segment_ids, input_mask, masked_ids) loss.backward() avg_loss += loss.item() optimizer.step() model.zero_grad() if (step + 1) % 50 == 0: print("avg_loss: {}".format(avg_loss / 50)) avg_loss = 0 if save_every_epoch: save_model_name = "BertForMaskedLM_" + task_name + "_epoch_" + str( e + 1) save_model_path = os.path.join(save_model_dir, save_model_name) torch.save(model, save_model_path) else: if (e + 1) % 10 == 0: save_model_name = "BertForMaskedLM_" + task_name + "_epoch_" + str( e + 1) save_model_path = os.path.join(save_model_dir, save_model_name) torch.save(model, save_model_path)
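# Because the token_type embedding table was resized to the number of class
# labels, this "conditional BERT" variant feeds the label in as segment_ids.
# A minimal sketch of a single conditioned forward pass, reusing the model and
# tokenizer prepared above; the example text and label value are made up.
import torch
text = "[CLS] the movie was [MASK] [SEP]"
tokens = tokenizer.tokenize(text)
mask_pos = tokens.index('[MASK]')
input_ids = torch.tensor([tokenizer.convert_tokens_to_ids(tokens)]).cuda()
segment_ids = torch.full_like(input_ids, 3)   # every position carries class label 3
input_mask = torch.ones_like(input_ids)
with torch.no_grad():
    scores = model(input_ids, segment_ids, input_mask)  # [1, seq_len, vocab_size]
top_ids = torch.topk(scores[0, mask_pos], 5)[1].tolist()
print(tokenizer.convert_ids_to_tokens(top_ids))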
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument("--input_file", default=None, type=str, required=True) parser.add_argument("--output_file", default=None, type=str, required=True) parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese." ) ## Other parameters parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--layers", default="-1,-2,-3,-4", type=str) parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. Sequences longer " "than this will be truncated, and sequences shorter than this will be padded." ) parser.add_argument("--batch_size", default=32, type=int, help="Batch size for predictions.") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--case_file", type=str) parser.add_argument("--json_file", type=str) parser.add_argument("--base2index", type=str) parser.add_argument("--n_best", type=int, default=5) parser.add_argument("--n_sample", type=int, default=10) parser.add_argument("--sampling_prob", type=float, default=0.5) parser.add_argument("--fill_mode", type=str, default=None, help="Choose from 'best_n', 'best_n_surface', ...") args = parser.parse_args() if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info("device: {} n_gpu: {} distributed training: {}".format( device, n_gpu, bool(args.local_rank != -1))) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) if args.fill_mode.startswith("predicate"): examples, sent_options, instance_options = read_examples_and_mask_pred( args.input_file, args.json_file) else: examples, sent_options, instance_options = read_examples_and_mask( args.input_file, args.case_file) # add # split_sentence_dir = "/work01/ryuto/data/NTC_BERT_split" # split_sentence_file = os.path.join(split_sentence_dir, os.path.basename(args.output_file)) # if os.path.exists(split_sentence_file): # os.remove(split_sentence_file) features = convert_examples_to_features(examples=examples, seq_length=args.max_seq_length, tokenizer=tokenizer) # features = convert_examples_to_features( # examples=examples, seq_length=args.max_seq_length, tokenizer=tokenizer) unique_id_to_feature = {} for feature in features: unique_id_to_feature[feature.unique_id] = feature model = BertForMaskedLM.from_pretrained(args.bert_model) model.to(device) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank) elif n_gpu > 1: model = torch.nn.DataParallel(model) all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long) all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long) 
all_subwords = [feature.tokens for feature in features] eval_data = TensorDataset(all_input_ids, all_input_mask, all_example_index) if args.local_rank == -1: eval_sampler = SequentialSampler(eval_data) else: eval_sampler = DistributedSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.batch_size) model.eval() config = FillerConfig(subword_vocab=tokenizer.vocab, pre_trained_vocab_file=args.base2index, json_file=args.json_file, sent_options=sent_options, instance_options=instance_options, all_subwords=all_subwords) if args.fill_mode == "n_best": filler = BestNTokenFiller(config, n_best=args.n_best, mode="json") elif args.fill_mode == "n_best_surface": filler = BestNTokenFiller(config, n_best=args.n_best, mode="surface") elif args.fill_mode == "multi_sampling": filler = MultiTokenFiller(config, n_sample=args.n_sample, prob=args.sampling_prob, mode="json") elif args.fill_mode == "multi_sampling_surface": filler = MultiTokenFiller(config, n_sample=args.n_sample, prob=args.sampling_prob, mode="surface") elif args.fill_mode == "predicate": filler = BestNTokenPredicateFiller(config, n_best=args.n_best, mode="json") elif args.fill_mode == "predicate_surface": filler = BestNTokenPredicateFiller(config, n_best=args.n_best, mode="surface") elif args.fill_mode == "random": filler = RandomNTokenFiller(config, n_sample=args.n_best, mode="json") elif args.fill_mode == "random_surface": filler = RandomNTokenFiller(config, n_sample=args.n_best, mode="surface") elif args.fill_mode == "sampling": filler = SamplingTokenFiller(config, n_sample=args.n_best, mode="json") elif args.fill_mode == "sampling_surface": filler = SamplingTokenFiller(config, n_sample=args.n_best, mode="surface") else: raise ValueError("Unsupported Value: {}".format(args.fill_mode)) with open(args.output_file, "w", encoding='utf-8') as writer: for input_ids, input_mask, example_indices in tqdm(eval_dataloader): input_ids = input_ids.to(device) input_mask = input_mask.to(device) prediction = model(input_ids, token_type_ids=None, attention_mask=input_mask) for scores in prediction: instances = filler(scores) if instances is not None: print("\n".join(instances), file=writer) instances = filler.pop() if instances: print("\n".join(instances), file=writer) with open(args.output_file + ".distribution", "w") as fo: json.dump(filler.predict_token_distribution, fo)
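# The *Filler classes above consume the raw prediction scores; a minimal sketch
# of the core operation they share -- collecting the top-n candidate tokens for
# each '[MASK]' position of one example. top_n_candidates is illustrative only;
# pass one example's [seq_len, vocab_size] scores together with its subword tokens.
import torch

def top_n_candidates(scores, tokens, tokenizer, n=5):
    candidates = {}
    for pos, token in enumerate(tokens):
        if token == '[MASK]':
            top_ids = torch.topk(scores[pos], n)[1].tolist()
            candidates[pos] = tokenizer.convert_ids_to_tokens(top_ids)
    return candidates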
def run_aug(args, save_every_epoch=False):
    processors = {
        # you can add your processor here
        "TREC": AugProcessor,
        "stsa.fine": AugProcessor,
        "stsa.binary": AugProcessor,
        "mpqa": AugProcessor,
        "rt-polarity": AugProcessor,
        "subj": AugProcessor,
        "squad": SquadProcessor,
    }
    task_name = args.task_name
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))
    args.data_dir = os.path.join(args.data_dir, task_name)
    args.output_dir = os.path.join(args.output_dir, task_name)
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    os.makedirs(args.output_dir, exist_ok=True)
    processor = processors[task_name]()
    label_list = processor.get_labels(task_name)
    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)
    train_examples = None
    num_train_steps = None
    train_examples = processor.get_train_examples(args.data_dir)
    #dev_examples = processor.get_dev_examples(args.data_dir)
    #train_examples.extend(dev_examples)
    num_train_steps = int(len(train_examples) / args.train_batch_size * args.num_train_epochs)
    # Prepare model
    model = BertForMaskedLM.from_pretrained(args.bert_model,
                                            cache_dir=PYTORCH_PRETRAINED_BERT_CACHE)
    if task_name == 'stsa.fine':
        model.bert.embeddings.token_type_embeddings = torch.nn.Embedding(5, 768)
        model.bert.embeddings.token_type_embeddings.weight.data.normal_(mean=0.0, std=0.02)
    elif task_name == 'TREC':
        model.bert.embeddings.token_type_embeddings = torch.nn.Embedding(6, 768)
        model.bert.embeddings.token_type_embeddings.weight.data.normal_(mean=0.0, std=0.02)
    model.cuda()
    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
    ]
    t_total = num_train_steps
    optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate,
                         warmup=args.warmup_proportion, t_total=t_total)
    global_step = 0
    if task_name == "squad":
        train_features = convert_examples_to_features_squad(
            train_examples, label_list, args.max_seq_length, tokenizer)
from pytorch_pretrained_bert.modeling import BertForMaskedLM
from pytorch_pretrained_bert import BertTokenizer
import torch

bert_model = 'bert-large-uncased'
model = BertForMaskedLM.from_pretrained(bert_model)
tokenizer = BertTokenizer.from_pretrained(bert_model)

question = 'who invented the telephone'  # "the telephone was invented by whom"
tokenized_question = tokenizer.tokenize(question)
masked_index = 0
tokenized_question[masked_index] = '[MASK]'
question_ids = tokenizer.convert_tokens_to_ids(tokenized_question)
combined_ids = question_ids
segments_ids = [0] * len(question_ids)
tokens_tensor = torch.tensor([combined_ids])
segments_tensor = torch.tensor([segments_ids])

model.eval()
with torch.no_grad():  # inference only, so skip building the autograd graph
    predictions = model(tokens_tensor, segments_tensor)  # 1 x len(combined_ids) x vocab size
predicted_index = torch.topk(predictions[0, masked_index], 20)[1].tolist()
print(predicted_index)
predicted_token = tokenizer.convert_ids_to_tokens(predicted_index)
print(predicted_token)
import numpy as np
from pytorch_pretrained_bert.tokenization import load_vocab, BertTokenizer
from pytorch_pretrained_bert.modeling import BertForPreTraining, BertConfig, BertForMaskedLM
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch
import argparse
from tqdm import tqdm, trange
import os
import re

base_path = os.path.dirname(os.path.abspath(__file__))
tokenizer = BertTokenizer(vocab_file='{}/data/vocab.txt'.format(base_path), do_lower_case=True)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BertForMaskedLM.from_pretrained('checkpoint/')
model.to(device)
model.eval()
vocab = load_vocab('{}/data/vocab.txt'.format(base_path))
inv_vocab = {v: k for k, v in vocab.items()}

def getMI(sentence):
    tokens = tokenizer.tokenize(sentence)
    tokens.insert(0, "[CLS]")
    tokens.append("[SEP]")
    tokens_length = len(tokens)
    result = []
    for i, token in enumerate(tokens):
        # tokens preprocessing
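        # The original body is cut off here; what follows is a minimal sketch of
        # the usual mask-one-token-at-a-time MLM scoring loop such a helper sets
        # up (an assumption about the intent, not the author's exact code).
        if token in ("[CLS]", "[SEP]"):
            continue
        masked_tokens = list(tokens)
        masked_tokens[i] = "[MASK]"
        input_ids = torch.tensor([tokenizer.convert_tokens_to_ids(masked_tokens)]).to(device)
        with torch.no_grad():
            scores = model(input_ids)  # [1, tokens_length, vocab_size]
        probs = torch.softmax(scores[0, i], dim=-1)
        token_id = tokenizer.convert_tokens_to_ids([token])[0]
        result.append((token, probs[token_id].item()))  # probability of the original token in context
    return result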
def run_aug(args, save_every_epoch=False): processors = { # you can your processor here "TREC": AugProcessor, "stsa.fine": AugProcessor, "stsa.binary": AugProcessor, "mpqa": AugProcessor, "rt-polarity": AugProcessor, "subj": AugProcessor, } task_name = args.task_name if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) args.data_dir = os.path.join(args.data_dir, task_name) args.output_dir = os.path.join(args.output_dir, task_name) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) os.makedirs(args.output_dir, exist_ok=True) processor = processors[task_name]() label_list = processor.get_labels(task_name) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) train_examples = None num_train_steps = None train_examples = processor.get_train_examples(args.data_dir) #dev_examples = processor.get_dev_examples(args.data_dir) #train_examples.extend(dev_examples) num_train_steps = int( len(train_examples) / args.train_batch_size * args.num_train_epochs) # Prepare model model = BertForMaskedLM.from_pretrained( args.bert_model, cache_dir=PYTORCH_PRETRAINED_BERT_CACHE) if task_name == 'stsa.fine': model.bert.embeddings.token_type_embeddings = torch.nn.Embedding( 5, 768) model.bert.embeddings.token_type_embeddings.weight.data.normal_( mean=0.0, std=0.02) elif task_name == 'TREC': model.bert.embeddings.token_type_embeddings = torch.nn.Embedding( 6, 768) model.bert.embeddings.token_type_embeddings.weight.data.normal_( mean=0.0, std=0.02) model.cuda() # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'gamma', 'beta'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0 }] t_total = num_train_steps optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=t_total) global_step = 0 train_features = convert_examples_to_features(train_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_steps) all_init_ids = torch.tensor([f.init_ids for f in train_features], dtype=torch.long) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_masked_lm_labels = torch.tensor( [f.masked_lm_labels for f in train_features], dtype=torch.long) train_data = TensorDataset(all_init_ids, all_input_ids, all_input_mask, all_segment_ids, all_masked_lm_labels) train_sampler = RandomSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) save_model_dir = os.path.join(PYTORCH_PRETRAINED_BERT_CACHE, task_name) if not os.path.exists(save_model_dir): os.mkdir(save_model_dir) MASK_id = tokenizer.convert_tokens_to_ids(['[MASK]'])[0] origin_train_path = os.path.join(args.output_dir, "train_origin.tsv") save_train_path = os.path.join(args.output_dir, "train.tsv") shutil.copy(origin_train_path, save_train_path) best_test_acc = train_text_classifier.train("aug_data") print("before augment best 
acc:{}".format(best_test_acc)) for e in trange(int(args.num_train_epochs), desc="Epoch"): avg_loss = 0. for step, batch in enumerate(train_dataloader): model.train() batch = tuple(t.cuda() for t in batch) _, input_ids, input_mask, segment_ids, masked_ids = batch loss = model(input_ids, segment_ids, input_mask, masked_ids) loss.backward() avg_loss += loss.item() optimizer.step() model.zero_grad() if (step + 1) % 50 == 0: print("avg_loss: {}".format(avg_loss / 50)) avg_loss = 0 torch.cuda.empty_cache() shutil.copy(origin_train_path, save_train_path) save_train_file = open(save_train_path, 'a') tsv_writer = csv.writer(save_train_file, delimiter='\t') #tsv_writer.writerow(['sentence', 'label']) for step, batch in enumerate(train_dataloader): model.eval() batch = tuple(t.cuda() for t in batch) init_ids, _, input_mask, segment_ids, _ = batch input_lens = [sum(mask).item() for mask in input_mask] #masked_idx = np.squeeze([np.random.randint(1, l-1, 1) for l in input_lens]) masked_idx = np.squeeze( [np.random.randint(0, l, max(l // 7, 2)) for l in input_lens]) for ids, idx in zip(init_ids, masked_idx): ids[idx] = MASK_id predictions = model(init_ids, segment_ids, input_mask) for ids, idx, preds, seg in zip(init_ids, masked_idx, predictions, segment_ids): #pred = torch.argsort(pred)[:,-e-1][idx] ''' pred = torch.argsort(preds)[:,-1][idx] ids[idx] = pred new_str = tokenizer.convert_ids_to_tokens(ids.cpu().numpy()) new_str = rev_wordpiece(new_str) tsv_writer.writerow([new_str, seg[0].item()]) ''' pred = torch.argsort(preds)[:, -2][idx] ids[idx] = pred new_str = tokenizer.convert_ids_to_tokens(ids.cpu().numpy()) new_str = rev_wordpiece(new_str) tsv_writer.writerow([new_str, seg[0].item()]) torch.cuda.empty_cache() predictions = predictions.detach().cpu() torch.cuda.empty_cache() bak_train_path = os.path.join(args.output_dir, "train_epoch_{}.tsv".format(e)) shutil.copy(save_train_path, bak_train_path) best_test_acc = train_text_classifier.train("aug_data") print("epoch {} augment best acc:{}".format(e, best_test_acc)) if save_every_epoch: save_model_name = "BertForMaskedLM_" + task_name + "_epoch_" + str( e + 1) save_model_path = os.path.join(save_model_dir, save_model_name) torch.save(model, save_model_path) else: if (e + 1) % 10 == 0: save_model_name = "BertForMaskedLM_" + task_name + "_epoch_" + str( e + 1) save_model_path = os.path.join(save_model_dir, save_model_name) torch.save(model, save_model_path)
def main(): parser = argparse.ArgumentParser() parser.add_argument( "--bert_model", default="/home/ryuto/data/jawiki-kurohashi-bert", type=str, help= "Please fill the path to directory of BERT model, or the name of BERT model." "Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese." ) # BERT model parameters parser.add_argument( "--do_lower_case", action='store_true', help= "Set this flag if you are using an uncased model. (If Japanese model, set false)" ) parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. Sequences longer " "than this will be truncated, and sequences shorter than this will be padded." ) parser.add_argument("--language", type=str, default="ja", help="Choose from 'ja' or 'en' (default='ja').") # Data Augmentation Option parser.add_argument( "--how_select", dest='how_select', default="argmax", type=str, help="Choose from 'argmax' or 'sample' or 'beam'. (default='argmax')") parser.add_argument( "--how_many", default='multi', type=str, help="Choose from 'single' or 'multi'. (default='multi')") parser.add_argument('--topk', type=int, default=5, help="for beam search") # Hyper parameter parser.add_argument('--seed', type=int, default=2020) args = parser.parse_args() logger.info(args) # Seed random.seed(args.seed) logger.info("Seed: {}".format(args.seed)) # Model device = torch.device("cuda" if torch.cuda.is_available() else "cpu") logger.info("device: {}".format(device)) logger.info("language: {}".format(args.language)) logger.info("BERT model: {}".format(args.bert_model)) logger.debug("Loading BERT model...") model = BertForMaskedLM.from_pretrained(args.bert_model) logger.debug("Sending BERT model to device...") model.to(device) model.eval() # Tokenizer tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) # Input the sentence logger.info("How select tokens: {}".format(args.how_select)) logger.info("How many tokens to predict at once: {}".format(args.how_many)) print("Mask token is 'M'.") while True: text = input("Sentence: ") if text == "q": break black_list = input( "Black list of tokens (separator is ','): ").replace( " ", "").replace(" ", "").split(",") # Input feature feature = InputFeatures(text=text, tokenizer=tokenizer, max_seq_length=args.max_seq_length, language=args.language) logger.debug(feature.tokens) if len(feature.token_mask_ids) == 0: print("Not found mask token (mask token is 'M').") continue if args.how_select == "beam": output_sents, output_tokens = prediction_with_beam_search( device=device, model=model, feature=feature, tokenizer=tokenizer, black_list=black_list, k=args.topk) for sent in output_sents: print(" ".join(sent[1:feature.len - 1])) else: if args.how_many == "single": predict_tokens = prediction_single(device=device, model=model, feature=feature, tokenizer=tokenizer, how_select=args.how_select, black_list=black_list) elif args.how_many == "multi": predict_tokens = prediction_multi(device=device, model=model, feature=feature, tokenizer=tokenizer, how_select=args.how_select, black_list=black_list) else: raise ValueError("Unsupported value: {}".format(args.how_many)) assert len(predict_tokens) == len(feature.token_mask_ids) # tokens = feature.tokens # for idx, p_token in zip(feature.token_mask_ids, predict_tokens): # tokens[idx] = p_token # print(" ".join(tokens[1:feature.len - 1])) filled_tokens = 
copy.deepcopy(feature.original_tokens) for idx, p_token in zip(feature.original_token_mask_ids, predict_tokens): filled_tokens[idx] = p_token print(" ".join(filled_tokens))
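# prediction_with_beam_search / prediction_single / prediction_multi are
# repo-local helpers; a rough sketch of the beam idea they appear to implement
# (fill the mask positions left to right, keeping the k highest-scoring
# hypotheses) -- an assumption, not the exact implementation:
import torch

def beam_fill(model, tokenizer, tokens, mask_ids, device, k=5):
    beams = [(0.0, list(tokens))]  # (cumulative log-prob, tokens)
    for idx in mask_ids:
        candidates = []
        for score, toks in beams:
            input_ids = torch.tensor([tokenizer.convert_tokens_to_ids(toks)]).to(device)
            with torch.no_grad():
                log_probs = torch.log_softmax(model(input_ids)[0, idx], dim=-1)
            top_lp, top_id = torch.topk(log_probs, k)
            for lp, tid in zip(top_lp.tolist(), top_id.tolist()):
                new_toks = list(toks)
                new_toks[idx] = tokenizer.convert_ids_to_tokens([tid])[0]
                candidates.append((score + lp, new_toks))
        beams = sorted(candidates, key=lambda b: b[0], reverse=True)[:k]
    return beams  # k best (score, filled_tokens) pairs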
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument("--eval_dir", default=None, type=str, required=True, help="The evaluation data dir.") parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument( "--output_SR_file", default=None, type=str, required=True, help="The output directory of writing substitution selection.") parser.add_argument("--word_embeddings", default=None, type=str, required=True, help="The path of word embeddings") parser.add_argument("--word_frequency", default=None, type=str, required=True, help="The path of word frequency.") ## Other parameters parser.add_argument( "--cache_dir", default="", type=str, help= "Where do you want to store the pre-trained models downloaded from s3") parser.add_argument( "--max_seq_length", default=250, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--num_selections", default=20, type=int, help="Total number of training epochs to perform.") parser.add_argument("--num_eval_epochs", default=1, type=int, help="Total number of training epochs to perform.") parser.add_argument( "--prob_mask", default=0.5, type=float, help="Proportion of the masked words in first sentence. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--ppdb", default="./ppdb-2.0-tldr", type=str, required=True, help="The path of word frequency.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") args = parser.parse_args() if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_eval: raise ValueError("At least `do_eval` must be True.") tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) # Prepare model model = BertForMaskedLM.from_pretrained(args.bert_model, output_attentions=True) model.to(device) ranker = Ranker() ranker.read_features(args.word_embeddings, args.word_frequency, args.ppdb) #one_sent = "John composed these verses." output_sr_file = open(args.output_SR_file, "w") one_sent = "alessandro mazzola -lrb- born 8 november , 1942 -rrb- is a former italian football player ." 
simple_sent = simplified_sentence(one_sent, model, tokenizer, ranker, args.max_seq_length, threshold=0.5, num_selections=args.num_selections) print(simple_sent) with open(args.eval_dir, "r") as reader: while True: one_sent = reader.readline() one_sent = one_sent.strip() if one_sent == "": break #output_sr_file.write(one_sent) #output_sr_file.write(' ||| ') print(one_sent) simple_sent = simplified_sentence( one_sent, model, tokenizer, ranker, args.max_seq_length, threshold=0.5, num_selections=args.num_selections) #simple_sent = "---------" output_sr_file.write(simple_sent) print(simple_sent) output_sr_file.write('\n') output_sr_file.close()
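# simplified_sentence() and Ranker are repo-specific; a minimal sketch of the
# BERT side of such a lexical-simplification pipeline -- propose substitution
# candidates for one target word by masking it in context. This is an assumption
# about the general approach (with a plain BertForMaskedLM, i.e. no
# output_attentions), not the exact method used above.
import torch

def substitution_candidates(model, tokenizer, words, target_idx, device, n=10):
    tokens = ['[CLS]'] + list(words) + ['[SEP]']
    tokens[target_idx + 1] = '[MASK]'   # +1 for the [CLS] offset
    input_ids = torch.tensor([tokenizer.convert_tokens_to_ids(tokens)]).to(device)
    with torch.no_grad():
        scores = model(input_ids)       # [1, seq_len, vocab_size]
    top_ids = torch.topk(scores[0, target_idx + 1], n)[1].tolist()
    candidates = tokenizer.convert_ids_to_tokens(top_ids)
    return [c for c in candidates if c != words[target_idx]]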
def main(): if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logging.basicConfig( format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt='%m/%d/%Y %H:%M:%S', level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN) logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if os.path.exists(args.output_dir) and os.listdir( args.output_dir) and args.do_train: raise ValueError( "Output directory ({}) already exists and is not empty.".format( args.output_dir)) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) tokenizer = BertTokenizer(vocab_file=args.vocab_file) train_examples = None num_train_optimization_steps = None vocab_list = [] with open(args.vocab_file, 'r') as fr: for line in fr: vocab_list.append(line.strip("\n")) if args.do_train: train_examples = create_examples( data_path=args.pretrain_train_path, max_seq_length=args.max_seq_length, masked_lm_prob=args.masked_lm_prob, max_predictions_per_seq=args.max_predictions_per_seq, vocab_list=vocab_list) num_train_optimization_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) model = BertForMaskedLM( config=BertConfig.from_json_file(args.bert_config_json)) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." 
) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) global_step = 0 best_loss = 100000 if args.do_train: train_features = convert_examples_to_features(train_examples, args.max_seq_length, tokenizer) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() for e in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch # masked_lm_loss loss = model(input_ids, segment_ids, input_mask, label_ids) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear( global_step / num_train_optimization_steps, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 if nb_tr_steps > 0 and nb_tr_steps % 100 == 0: logger.info( "===================== -epoch %d -train_step %d -train_loss %.4f\n" % (e, nb_tr_steps, tr_loss / nb_tr_steps)) if nb_tr_steps > 0 and nb_tr_steps % 2000 == 0: eval_examples = create_examples( data_path=args.pretrain_dev_path, max_seq_length=args.max_seq_length, masked_lm_prob=args.masked_lm_prob, max_predictions_per_seq=args.max_predictions_per_seq, vocab_list=vocab_list) eval_features = convert_examples_to_features( eval_examples, args.max_seq_length, tokenizer) all_input_ids = torch.tensor( [f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor( [f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor( [f.segment_ids for f in eval_features], dtype=torch.long) all_label_ids = torch.tensor( [f.label_id for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss = 0 nb_eval_steps = 0 for input_ids, 
input_mask, segment_ids, label_ids in tqdm( eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): loss = model(input_ids, segment_ids, input_mask, label_ids) eval_loss += loss.item() nb_eval_steps += 1 eval_loss = eval_loss / nb_eval_steps if eval_loss < best_loss: # Save a trained model, configuration and tokenizer model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self # If we save using the predefined names, we can load using `from_pretrained` output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) torch.save(model_to_save.state_dict(), output_model_file) best_loss = eval_loss logger.info( "============================ -epoch %d -train_loss %.4f -eval_loss %.4f\n" % (e, tr_loss / nb_tr_steps, eval_loss))
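# The script above trains BertForMaskedLM from scratch, building it from a
# config json instead of from_pretrained(). A minimal sketch of an equivalent
# small configuration constructed directly in code (the sizes are illustrative,
# not the values used in this repo):
from pytorch_pretrained_bert.modeling import BertConfig, BertForMaskedLM
small_config = BertConfig(vocab_size_or_config_json_file=30000,
                          hidden_size=256,
                          num_hidden_layers=4,
                          num_attention_heads=4,
                          intermediate_size=1024,
                          max_position_embeddings=512,
                          type_vocab_size=2)
scratch_model = BertForMaskedLM(small_config)  # randomly initialised weights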
from torch.nn import CrossEntropyLoss, MSELoss
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import matthews_corrcoef, f1_score
from pytorch_pretrained_bert.file_utils import WEIGHTS_NAME, CONFIG_NAME
from pytorch_pretrained_bert.modeling import BertForMaskedLM, BertConfig
from pytorch_pretrained_bert.tokenization import BertTokenizer
from pytorch_pretrained_bert.optimization import BertAdam

print(args.bert_config_json)
vocab_list = []
with open(args.vocab_file, 'r') as fr:
    for line in fr:
        vocab_list.append(line.strip("\n"))
tokenizer = BertTokenizer(vocab_file=args.vocab_file)
model = BertForMaskedLM(config=BertConfig.from_json_file(args.bert_config_json))
model.load_state_dict(torch.load('/home/hing/bert/Pretraining_Bert_From_Scratch/lm_smallBert/outputs/60000_pytorch_model.bin'))
for k, v in model.named_parameters():
    print(k, v)
# BertForMaskedLM expects a BertConfig instance, not the path to the json file
pretrain_ = BertForMaskedLM(config=BertConfig.from_json_file(args.bert_config_json))
eval_examples = create_examples(data_path=args.pretrain_dev_path,
                                max_seq_length=args.max_seq_length,
                                masked_lm_prob=args.masked_lm_prob,
                                max_predictions_per_seq=args.max_predictions_per_seq,
                                vocab_list=vocab_list)
eval_features = convert_examples_to_features(eval_examples, args.max_seq_length, tokenizer)
all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
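# The snippet above stops after building the evaluation tensors; a minimal
# sketch of how the masked-LM evaluation typically continues, mirroring the
# pretraining script earlier in this collection (an assumption, not the
# original continuation; `device` and the batch size are placeholders):
import torch
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
eval_dataloader = DataLoader(eval_data, sampler=SequentialSampler(eval_data), batch_size=8)

model.to(device)
model.eval()
eval_loss, nb_eval_steps = 0.0, 0
for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
    input_ids, input_mask = input_ids.to(device), input_mask.to(device)
    segment_ids, label_ids = segment_ids.to(device), label_ids.to(device)
    with torch.no_grad():
        loss = model(input_ids, segment_ids, input_mask, label_ids)  # masked_lm_loss
    eval_loss += loss.item()
    nb_eval_steps += 1
print("eval loss: {:.4f}".format(eval_loss / nb_eval_steps))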