def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path):
    config_path = os.path.abspath(bert_config_file)
    tf_path = os.path.abspath(tf_checkpoint_path)
    print("Converting TensorFlow checkpoint from {} with config at {}".format(tf_path, config_path))

    # Load weights from TF model
    init_vars = tf.train.list_variables(tf_path)
    names = []
    arrays = []
    for name, shape in init_vars:
        print("Loading TF weight {} with shape {}".format(name, shape))
        array = tf.train.load_variable(tf_path, name)
        names.append(name)
        arrays.append(array)

    # Initialise PyTorch model
    config = BertConfig.from_json_file(bert_config_file)
    print("Building PyTorch model from configuration: {}".format(str(config)))
    model = BertForPreTraining(config)

    for name, array in zip(names, arrays):
        name = name.split('/')
        # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculate m and v,
        # which are not required for using the pretrained model
        if any(n in ["adam_v", "adam_m"] for n in name):
            print("Skipping {}".format("/".join(name)))
            continue
        pointer = model
        for m_name in name:
            if re.fullmatch(r'[A-Za-z]+_\d+', m_name):
                l = re.split(r'_(\d+)', m_name)
            else:
                l = [m_name]
            if l[0] == 'kernel' or l[0] == 'gamma':
                pointer = getattr(pointer, 'weight')
            elif l[0] == 'output_bias' or l[0] == 'beta':
                pointer = getattr(pointer, 'bias')
            elif l[0] == 'output_weights':
                pointer = getattr(pointer, 'weight')
            else:
                pointer = getattr(pointer, l[0])
            if len(l) >= 2:
                num = int(l[1])
                pointer = pointer[num]
        if m_name[-11:] == '_embeddings':
            pointer = getattr(pointer, 'weight')
        elif m_name == 'kernel':
            array = np.transpose(array)
        try:
            assert pointer.shape == array.shape
        except AssertionError as e:
            e.args += (pointer.shape, array.shape)
            raise
        print("Initialize PyTorch weight {}".format(name))
        pointer.data = torch.from_numpy(array)

    # Save pytorch-model
    print("Save PyTorch model to {}".format(pytorch_dump_path))
    torch.save(model.state_dict(), pytorch_dump_path)
def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path):
    # Initialise PyTorch model
    config = BertConfig.from_json_file(bert_config_file)
    print("Building PyTorch model from configuration: {}".format(str(config)))
    model = BertForPreTraining(config)

    # Load weights from tf checkpoint
    load_tf_weights_in_bert(model, tf_checkpoint_path)

    # Save pytorch-model
    print("Save PyTorch model to {}".format(pytorch_dump_path))
    torch.save(model.state_dict(), pytorch_dump_path)
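The two conversion helpers above are usually driven from the command line. A minimal sketch of such a wrapper follows; the flag names are assumptions for illustration, not the original script's interface.

# Hypothetical CLI wrapper around convert_tf_checkpoint_to_pytorch (sketch only).
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--tf_checkpoint_path", type=str, required=True,
                        help="Path to the TensorFlow checkpoint (e.g. .../bert_model.ckpt).")
    parser.add_argument("--bert_config_file", type=str, required=True,
                        help="Path to the bert_config.json describing the pre-trained model.")
    parser.add_argument("--pytorch_dump_path", type=str, required=True,
                        help="Where to write the converted pytorch_model.bin.")
    cli_args = parser.parse_args()
    convert_tf_checkpoint_to_pytorch(cli_args.tf_checkpoint_path,
                                     cli_args.bert_config_file,
                                     cli_args.pytorch_dump_path)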
def test1():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    processor = SentProcessor()
    bert_path = '/home/liuxg/.pytorch_pretrained_bert'
    tokenizer = BertTokenizer.from_pretrained(bert_path, do_lower_case=True)

    # Prepare model
    model = BertForPreTraining.from_pretrained(bert_path)
    model.to(device)
    model.eval()

    sents = ['which computer do you like', 'what app are you most using']
    label_list = processor.get_labels()
    eval_examples = processor.get_dev_examples(sents=sents)
    # for e in eval_examples:
    #     print('----------------', e.text_a)
    max_seq_length = 15
    eval_features = convert_examples_to_features(eval_examples, label_list, max_seq_length, tokenizer)

    input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
    input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
    segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
    label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)

    input_ids = input_ids.to(device)
    input_mask = input_mask.to(device)
    segment_ids = segment_ids.to(device)

    ans = model(input_ids, segment_ids, input_mask)[0]  # 1,768
    print(ans)
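The `convert_examples_to_features` helper used above is not shown. A rough sketch of the tensors it is expected to produce for a single sentence (token ids padded to max_seq_length, a 0/1 attention mask, all-zero segment ids) could look like this; `encode_sentence` is a hypothetical name.

# Sketch of the per-sentence encoding test1() relies on, using only the tokenizer
# loaded above and plain PyTorch.
import torch

def encode_sentence(tokenizer, sentence, max_seq_length=15):
    tokens = ["[CLS]"] + tokenizer.tokenize(sentence)[:max_seq_length - 2] + ["[SEP]"]
    ids = tokenizer.convert_tokens_to_ids(tokens)
    mask = [1] * len(ids)
    padding = [0] * (max_seq_length - len(ids))
    input_ids = torch.tensor([ids + padding], dtype=torch.long)
    input_mask = torch.tensor([mask + padding], dtype=torch.long)
    segment_ids = torch.zeros_like(input_ids)  # single sentence -> all segment 0
    return input_ids, input_mask, segment_ids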
def bertForPreTraining(*args, **kwargs):
    """
    BERT model with pre-training heads.
    This module comprises the BERT model followed by the two pre-training heads
        - the masked language modeling head, and
        - the next sentence classification head.
    """
    model = BertForPreTraining.from_pretrained(*args, **kwargs)
    return model
def convert_all_tensorflow_bert_weights_to_pytorch(self, input_folder: str) -> None:
    """
    Tensorflow to Pytorch weight conversion based on huggingface's library

    Parameters
    ----------
    input_folder: `str`, required
        The folder containing the tensorflow files
    """
    files = [e for e in os.listdir(input_folder)
             if os.path.isfile(os.path.join(input_folder, e))]
    folders = [os.path.join(input_folder, e) for e in os.listdir(input_folder)
               if os.path.isdir(os.path.join(input_folder, e))]

    flag = -4
    for file in files:
        if file == 'vocab.txt' or \
                file.endswith('.data-00000-of-00001') or \
                file.endswith('.index') or \
                file.endswith('.meta') or \
                file.endswith('.json'):
            flag += 1
        if file.endswith('.json'):
            config_file = file

    if flag > 0:
        assert type(config_file) == str, "no valid config file, but is attempting to convert"
        pytorch_path = os.path.join(input_folder, 'pytorch')
        tensorflow_path = os.path.join(input_folder, 'tensorflow')
        force_folder_to_exist(pytorch_path)
        force_folder_to_exist(tensorflow_path)
        os.system('mv ' + os.path.join(input_folder, '*.*') + ' ' + tensorflow_path)
        os.system('cp ' + os.path.join(tensorflow_path, '*.txt') + ' ' + pytorch_path)
        os.system('cp ' + os.path.join(tensorflow_path, '*.json') + ' ' + pytorch_path)

        config = BertConfig.from_json_file(os.path.join(tensorflow_path, config_file))
        model = BertForPreTraining(config)
        load_tf_weights_in_bert(model=model,
                                tf_checkpoint_path=os.path.join(tensorflow_path, 'bert_model.ckpt'))
        torch.save(model.state_dict(), os.path.join(pytorch_path, 'pytorch_model.bin'))
    else:
        for folder in folders:
            self.convert_all_tensorflow_bert_weights_to_pytorch(input_folder=folder)
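The shell-outs above (`os.system('mv ...')`) could equally be written with the standard library, which avoids depending on a POSIX shell. A sketch of the same file shuffling with shutil/glob, under the same folder layout:

# Illustrative alternative to the os.system calls above, using shutil and glob.
import glob
import os
import shutil

def split_tf_and_pytorch_folders(input_folder):
    pytorch_path = os.path.join(input_folder, 'pytorch')
    tensorflow_path = os.path.join(input_folder, 'tensorflow')
    os.makedirs(pytorch_path, exist_ok=True)
    os.makedirs(tensorflow_path, exist_ok=True)
    for path in glob.glob(os.path.join(input_folder, '*.*')):
        shutil.move(path, tensorflow_path)                  # mv *.* tensorflow/
    for pattern in ('*.txt', '*.json'):
        for path in glob.glob(os.path.join(tensorflow_path, pattern)):
            shutil.copy(path, pytorch_path)                 # cp vocab/config next to the .bin
    return pytorch_path, tensorflow_path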
def test_BertForPreTraining():
    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
    input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
    token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])

    config = BertConfig(vocab_size_or_config_json_file=32000,
                        hidden_size=768,
                        num_hidden_layers=12,
                        num_attention_heads=12,
                        intermediate_size=3072)

    model = BertForPreTraining(config)
    print(model(input_ids, token_type_ids, input_mask))
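The test above only prints the raw outputs. A hedged sketch of inspecting them, assuming the older pytorch-pretrained-bert convention of returning a (prediction_scores, seq_relationship_score) pair when no labels are passed:

# Sketch of reading the two pre-training heads' outputs (convention may differ by library version).
import torch

def inspect_pretraining_outputs(model, input_ids, token_type_ids, attention_mask):
    with torch.no_grad():
        prediction_scores, seq_relationship_score = model(input_ids, token_type_ids, attention_mask)
    # prediction_scores: [batch, seq_len, vocab_size] -> most likely token id per position
    predicted_token_ids = prediction_scores.argmax(dim=-1)
    # seq_relationship_score: [batch, 2] -> index 0 conventionally means "B follows A"
    is_next_prob = torch.softmax(seq_relationship_score, dim=-1)[:, 0]
    return predicted_token_ids, is_next_prob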
def load_checkpoint(filename, device, n_gpu):
    logger.info('Loading model %s' % filename)
    saved_params = torch.load(filename, map_location=lambda storage, loc: storage)
    args = saved_params['args']
    global_step = saved_params['global_step']
    model_dict = saved_params['model_dict']
    optimizer_dict = saved_params['optimizer']
    iter_id = saved_params['iter_id']
    model = BertForPreTraining.from_pretrained(args.bert_model, state_dict=model_dict)
    return args, global_step, iter_id, model, optimizer_dict
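load_checkpoint expects a particular dictionary layout on disk. A matching save-side sketch (the key names simply mirror what the loader reads; the function name is hypothetical):

# Save-side counterpart to load_checkpoint above: write the keys the loader expects.
import torch

def save_checkpoint(filename, args, global_step, iter_id, model, optimizer):
    model_to_save = model.module if hasattr(model, 'module') else model
    torch.save({
        'args': args,
        'global_step': global_step,
        'iter_id': iter_id,
        'model_dict': model_to_save.state_dict(),
        'optimizer': optimizer.state_dict(),
    }, filename)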
def load_bert(bert_model_name, bert_load_mode, all_state, bert_config_json_path=None):
    if bert_config_json_path is None:
        bert_config_json_path = os.path.join(get_bert_config_path(bert_model_name), "bert_config.json")
    if bert_load_mode == "model_only":
        state_dict = all_state
    elif bert_load_mode in ["state_model_only", "state_all", "state_full_model"]:
        state_dict = all_state["model"]
    else:
        raise KeyError(bert_load_mode)

    if bert_load_mode == "state_full_model":
        model = BertForPreTraining.from_state_dict_full(
            config_file=bert_config_json_path,
            state_dict=state_dict,
        )
    else:
        model = BertForPreTraining.from_state_dict(
            config_file=bert_config_json_path,
            state_dict=state_dict,
        )
    return model
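The `all_state` argument is either a bare model state dict or a training state that nests the model under a "model" key. A sketch of producing both layouts with plain torch.save (function names are illustrative):

# The two on-disk layouts load_bert() above accepts.
import torch

def save_model_only(model, path):
    torch.save(model.state_dict(), path)          # loaded with bert_load_mode="model_only"

def save_full_training_state(model, optimizer, global_step, path):
    torch.save({
        'model': model.state_dict(),              # loaded with bert_load_mode="state_*"
        'optimizer': optimizer.state_dict(),
        'global_step': global_step,
    }, path)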
def __init__(self, config, num_labels, model_param_fp=None):
    super(Model, self).__init__(config)
    self.num_labels = num_labels
    if model_param_fp is not None:
        # Build a pre-training model from the config, load the saved parameters,
        # and keep only its BERT encoder. (The original chained
        # from_pretrained(config).load_state_dict(...).bert, which cannot work:
        # load_state_dict does not return the model.)
        pretraining_model = BertForPreTraining(config)
        pretraining_model.load_state_dict(torch.load(open(model_param_fp, 'rb')))
        self.bert = pretraining_model.bert
    else:
        self.bert = BertModel(config)
    self.dropout = nn.Dropout(config.hidden_dropout_prob)
    self.classifier = nn.Linear(config.hidden_size, num_labels)
    self.apply(self.init_bert_weights)
def __init__(self,
             vocab: Vocabulary,
             bert_model_name: str,
             remap_segment_embeddings: int = None,
             regularizer: RegularizerApplicator = None):
    super().__init__(vocab, regularizer)

    pretrained_bert = BertForPreTraining.from_pretrained(bert_model_name)
    self.pretraining_heads = pretrained_bert.cls
    self.bert = pretrained_bert

    self.remap_segment_embeddings = remap_segment_embeddings
    if remap_segment_embeddings is not None:
        new_embeddings = self._remap_embeddings(self.bert.bert.embeddings.token_type_embeddings.weight)
        if new_embeddings is not None:
            del self.bert.bert.embeddings.token_type_embeddings
            self.bert.bert.embeddings.token_type_embeddings = new_embeddings
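The `_remap_embeddings` helper is not shown here. A plausible sketch, assuming its job is to grow the 2-row token-type embedding table to `remap_segment_embeddings` rows while keeping the pretrained rows; this mirrors the intent of the constructor above, not necessarily its exact implementation.

# Hypothetical sketch of _remap_embeddings.
import torch
from torch import nn

def remap_token_type_embeddings(old_weight: torch.Tensor, num_segments: int):
    old_num, hidden = old_weight.shape
    if num_segments <= old_num:
        return None  # nothing to do; the caller keeps the original table
    new_embeddings = nn.Embedding(num_segments, hidden)
    with torch.no_grad():
        new_embeddings.weight[:old_num] = old_weight           # keep pretrained segment rows
        new_embeddings.weight[old_num:] = old_weight.mean(0)   # initialise extra rows near the mean
    return new_embeddings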
def load_bert(self):
    with torch.no_grad():
        model = BertForPreTraining.from_pretrained('bert-base-uncased').to('cuda')
        '''if args.fp16:
            model.half()
        model.to(device)
        if args.local_rank != -1:
            try:
                from apex.parallel import DistributedDataParallel as DDP
            except ImportError:
                raise ImportError(
                    "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
            model = DDP(model)
        elif n_gpu > 1:
            model = torch.nn.DataParallel(model)'''
        model.eval()
        return model
def __init__(self, featQty, headFeatQty,
             useTfIdfTransform=False,
             useWordFeatures=True,
             useBERT=False, useHeadBERT=False,
             bertModelPath=None,
             torch_device='cuda',
             bertModelType='bert-base-uncased'):
    self.useWordFeatures = useWordFeatures
    if self.useWordFeatures:
        self.featQty = featQty
        self.countVect = CountVectorizer(ngram_range=(1, 1))
        self.tfidf = TfidfTransformer() if useTfIdfTransform else None

        self.headFeatQty = headFeatQty
        self.headCountVect = CountVectorizer(ngram_range=(1, 1))
        self.headTfidf = TfidfTransformer() if useTfIdfTransform else None

    self.useBERT = useBERT
    self.useHeadBERT = useHeadBERT
    if useBERT or useHeadBERT:
        self.torch_device = torch.device(torch_device)
        if bertModelPath is not None:
            print('Loading fine-tuned model from file:', bertModelPath)
            self.bertModelWrapper = BertForPreTraining.from_pretrained(bertModelType)
            self.bertModelWrapper.load_state_dict(torch.load(bertModelPath))
        else:
            print('Loading standard pre-trained model')
            self.bertModelWrapper = BertForMaskedLM.from_pretrained(bertModelType)
        self.bertModelWrapper.eval()
        self.bertModelWrapper.to(torch_device)
        self.bert_tokenizer = BertTokenizer.from_pretrained(bertModelType, do_lower_case=True)
def bertForPreTraining(*args, **kwargs):
    """
    BERT model with pre-training heads.
    This module comprises the BERT model followed by the two pre-training heads
        - the masked language modeling head, and
        - the next sentence classification head.

    Example:
        # Load the tokenizer
        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
        #  Prepare tokenized input
        >>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
        >>> tokenized_text = tokenizer.tokenize(text)
        >>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
        >>> segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
        >>> tokens_tensor = torch.tensor([indexed_tokens])
        >>> segments_tensors = torch.tensor([segments_ids])
        # Load bertForPreTraining
        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertForPreTraining', 'bert-base-cased')
        >>> masked_lm_logits_scores, seq_relationship_logits = model(tokens_tensor, segments_tensors)
    """
    model = BertForPreTraining.from_pretrained(*args, **kwargs)
    return model
def main(): parser = ArgumentParser() parser.add_argument('--pregenerated_data', type=Path, required=True) parser.add_argument('--output_dir', type=Path, required=True) parser.add_argument( "--bert_model", type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese." ) parser.add_argument("--do_lower_case", action="store_true") parser.add_argument( "--reduce_memory", action="store_true", help= "Store training data as on-disc memmaps to massively reduce memory usage" ) parser.add_argument("--epochs", type=int, default=3, help="Number of epochs to train for") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--learning_rate", default=3e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") args = parser.parse_args() assert args.pregenerated_data.is_dir(), \ "--pregenerated_data should point to the folder of files made by pregenerate_training_data.py!" samples_per_epoch = [] for i in range(args.epochs): epoch_file = args.pregenerated_data / f"epoch_{i}.json" metrics_file = args.pregenerated_data / f"epoch_{i}_metrics.json" if epoch_file.is_file() and metrics_file.is_file(): metrics = json.loads(metrics_file.read_text()) samples_per_epoch.append(metrics['num_training_examples']) else: if i == 0: exit("No training data was found!") print( f"Warning! There are fewer epochs of pregenerated data ({i}) than training epochs ({args.epochs})." ) print( "This script will loop over the available data, but training diversity may be negatively impacted." ) num_data_epochs = i break else: num_data_epochs = args.epochs if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logging.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". 
format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if args.output_dir.is_dir() and list(args.output_dir.iterdir()): logging.warning( f"Output directory ({args.output_dir}) already exists and is not empty!" ) args.output_dir.mkdir(parents=True, exist_ok=True) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) total_train_examples = 0 for i in range(args.epochs): # The modulo takes into account the fact that we may loop over limited epochs of data total_train_examples += samples_per_epoch[i % len(samples_per_epoch)] num_train_optimization_steps = int(total_train_examples / args.train_batch_size / args.gradient_accumulation_steps) if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) # Prepare model model = BertForPreTraining.from_pretrained(args.bert_model) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." 
) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) warmup_linear = WarmupLinearSchedule( warmup=args.warmup_proportion, t_total=num_train_optimization_steps) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) global_step = 0 logging.info("***** Running training *****") logging.info(f" Num examples = {total_train_examples}") logging.info(" Batch size = %d", args.train_batch_size) logging.info(" Num steps = %d", num_train_optimization_steps) model.train() for epoch in range(args.epochs): epoch_dataset = PregeneratedDataset( epoch=epoch, training_path=args.pregenerated_data, tokenizer=tokenizer, num_data_epochs=num_data_epochs, reduce_memory=args.reduce_memory) if args.local_rank == -1: train_sampler = RandomSampler(epoch_dataset) else: train_sampler = DistributedSampler(epoch_dataset) train_dataloader = DataLoader(epoch_dataset, sampler=train_sampler, batch_size=args.train_batch_size) tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 with tqdm(total=len(train_dataloader), desc=f"Epoch {epoch}") as pbar: for step, batch in enumerate(train_dataloader): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, lm_label_ids, is_next = batch loss = model(input_ids, segment_ids, input_mask, lm_label_ids, is_next) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 pbar.update(1) mean_loss = tr_loss * args.gradient_accumulation_steps / nb_tr_steps pbar.set_postfix_str(f"Loss: {mean_loss:.5f}") if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear.get_lr( global_step, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 # Save a trained model logging.info("** ** * Saving fine-tuned model ** ** * ") model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) output_config_file = os.path.join(args.output_dir, CONFIG_NAME) torch.save(model_to_save.state_dict(), output_model_file) model_to_save.config.to_json_file(output_config_file) tokenizer.save_vocabulary(args.output_dir)
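Two patterns recur throughout the training scripts above: exempting biases and LayerNorm parameters from weight decay, and scaling the loss for gradient accumulation. A self-contained sketch of both with a plain PyTorch optimizer (torch.optim.AdamW stands in here for BertAdam/FusedAdam; the helper names are illustrative):

# Isolated sketch of the no-decay parameter grouping and gradient accumulation above.
import torch

def build_optimizer(model, lr=3e-5, weight_decay=0.01):
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    grouped = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
         'weight_decay': weight_decay},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0},
    ]
    return torch.optim.AdamW(grouped, lr=lr)

def train_with_accumulation(model, optimizer, dataloader, loss_fn, accumulation_steps=2):
    # dataloader is assumed to yield (inputs, targets) pairs
    model.train()
    for step, (inputs, targets) in enumerate(dataloader):
        loss = loss_fn(model(inputs), targets) / accumulation_steps  # scale so gradients average
        loss.backward()
        if (step + 1) % accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()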
def main(): """ main function that saves embeddings of each article with their id as individual joblib pickle files """ parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--data_file", default=None, type=str, required=True, help= "The input data dir. Should contain the .csv files (or other data files) for the task." ) parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument( "--model_file", default=None, type=str, required=True, help="Where the pre-trained/fine-tuned model is stored for loading.") parser.add_argument("--override_features", default=False, type=bool, required=True, help="Override pickled feature files.") parser.add_argument( "--output_dir", default=None, type=str, required=True, help="The output directory where the output files will be written.") ## Other parameters parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be split into two, then combined by averaging \n" "than this will be padded.") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--tuned', action='store_true', help="Whether to use fine-tuned BERT on finance articles") args = parser.parse_args() device = torch.device( "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) model = BertForPreTraining.from_pretrained(args.bert_model) if args.tuned: model.load_state_dict(torch.load(args.model_file)) print('Loaded model') if args.fp16: model.half() model.to(device) # Prepare optimizer processor = SamplesProcessor() if not os.path.isfile('eval_features.gz'): # save processed articles into features eval_examples = processor.get_dev_examples(args.data_file) eval_features = convert_examples_to_features(eval_examples, args.max_seq_length, tokenizer) dump(eval_features, 'eval_features.gz') else: if not args.override_features: eval_features = load('eval_features.gz') else: # override processed articles into features eval_examples = processor.get_dev_examples(args.data_file) eval_features = convert_examples_to_features( eval_examples, args.max_seq_length, tokenizer) dump(eval_features, 'eval_features.gz') logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_features)) model.eval() for eval_count, eval_feature in enumerate(eval_features): if os.path.isfile(args.output_dir + '/embedding_' + str(eval_feature.text_id) + '.gz'): continue para_embed_list = [] for para in range(len(eval_feature.input_ids)): # if segment has no overlap if eval_feature.overlap[para] == 0: encoded_layer_array = np.zeros((0, args.max_seq_length, 1024)) 
input_ids = torch.tensor(eval_feature.input_ids[para]).view( 1, -1) input_mask = torch.tensor(eval_feature.input_mask[para]).view( 1, -1) segment_ids = torch.tensor( eval_feature.segment_ids[para]).view(1, -1) para_len = np.sum(np.array(eval_feature.input_mask[para]) != 0) input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) encoded_layers, _ = model.bert.forward(input_ids, segment_ids, input_mask) for encoded_layer in encoded_layers: encoded_layer_array = np.concatenate( (encoded_layer.detach().cpu().numpy(), encoded_layer_array)) encoded_layers = encoded_layer_array[-2, 1:para_len - 1, :] else: # if segment has overlap encoded_layer_array_1 = np.zeros( (0, args.max_seq_length, 1024)) encoded_layer_array_2 = np.zeros( (0, args.max_seq_length, 1024)) para_len_1 = np.sum( np.array(eval_feature.input_mask[para][0]) != 0) para_len_2 = np.sum( np.array(eval_feature.input_mask[para][1]) != 0) input_ids_1 = torch.tensor( eval_feature.input_ids[para][0]).view(1, -1) input_mask_1 = torch.tensor( eval_feature.input_mask[para][0]).view(1, -1) segment_ids_1 = torch.tensor( eval_feature.segment_ids[para][0]).view(1, -1) input_ids_1 = input_ids_1.to(device) input_mask_1 = input_mask_1.to(device) segment_ids_1 = segment_ids_1.to(device) encoded_layers_1, _ = model.bert.forward( input_ids_1, segment_ids_1, input_mask_1) for encoded_layer in encoded_layers_1: encoded_layer_array_1 = np.concatenate( (encoded_layer.detach().cpu().numpy(), encoded_layer_array_1)) encoded_layers_1 = encoded_layer_array_1[-2, 1:para_len_1 - 1, :] input_ids_2 = torch.tensor( eval_feature.input_ids[para][1]).view(1, -1) input_mask_2 = torch.tensor( eval_feature.input_mask[para][1]).view(1, -1) segment_ids_2 = torch.tensor( eval_feature.segment_ids[para][1]).view(1, -1) input_ids_2 = input_ids_2.to(device) input_mask_2 = input_mask_2.to(device) segment_ids_2 = segment_ids_2.to(device) encoded_layers_2, _ = model.bert.forward( input_ids_2, segment_ids_2, input_mask_2) for encoded_layer in encoded_layers_2: encoded_layer_array_2 = np.concatenate( (encoded_layer.detach().cpu().numpy(), encoded_layer_array_2)) encoded_layers_2 = encoded_layer_array_2[-2, 1:para_len_2 - 1, :] # average the overlapped portion overlap = eval_feature.overlap[para] encoded_overlap = (encoded_layers_1[-overlap:, :] + encoded_layers_2[:overlap, :]) / 2 encoded_layers = np.concatenate( (encoded_layers_1[:-overlap, :], encoded_overlap, encoded_layers_2[overlap:, :])) para_embed_list.append(encoded_layers) dump( para_embed_list, args.output_dir + '/embedding_' + str(eval_feature.text_id) + '.gz')
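The overlap handling above averages the embeddings of the tokens shared between two consecutive windows before concatenating them. A small numpy sketch of just that stitching step (the function name is illustrative):

# Window stitching used above: average the shared span, then concatenate the pieces.
import numpy as np

def stitch_windows(encoded_1: np.ndarray, encoded_2: np.ndarray, overlap: int) -> np.ndarray:
    # encoded_1, encoded_2: [num_tokens_i, hidden] arrays from two consecutive windows
    if overlap == 0:
        return np.concatenate((encoded_1, encoded_2))
    averaged = (encoded_1[-overlap:, :] + encoded_2[:overlap, :]) / 2
    return np.concatenate((encoded_1[:-overlap, :], averaged, encoded_2[overlap:, :]))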
def main(): local_rank = -1 parser = argparse.ArgumentParser() parser.add_argument("--config", "-c", type=str, required=True) args, _ = parser.parse_known_args() options = argconf.options_from_json("confs/options.json") config = argconf.config_from_json(args.config) args = edict(argconf.parse_args(options, config)) args.local_rank = local_rank args.on_memory = True if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train: raise ValueError( "Training is currently the only implemented execution option. Please set `do_train`." ) if os.path.exists(args.workspace) and os.listdir(args.workspace): raise ValueError( "Output directory ({}) already exists and is not empty.".format( args.workspace)) if not os.path.exists(args.workspace): os.makedirs(args.workspace) tokenizer = BertTokenizer.from_pretrained(args.model_file, do_lower_case=True) #train_examples = None num_train_optimization_steps = None if args.do_train: print("Loading Train Dataset", args.train_file) train_dataset = BERTDataset(args.train_file, tokenizer, seq_len=args.max_seq_length, corpus_lines=None, on_memory=args.on_memory) num_train_optimization_steps = int( len(train_dataset) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) # Prepare model model = BertForPreTraining.from_pretrained(args.model_file) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." 
) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) global_step = 0 if args.do_train: logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) if args.local_rank == -1: train_sampler = RandomSampler(train_dataset) else: #TODO: check if this works with current data generator from disk that relies on next(file) # (it doesn't return item back by index) train_sampler = DistributedSampler(train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) model.train() loss_fct = nn.CrossEntropyLoss(ignore_index=-1) for _ in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, lm_label_ids = batch prediction_scores, _ = model(input_ids, segment_ids, input_mask, lm_label_ids) loss = loss_fct( prediction_scores.view(-1, model.module.config.vocab_size), lm_label_ids.view(-1)).mean() if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear( global_step / num_train_optimization_steps, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 # Save a trained model logger.info("** ** * Saving fine - tuned model ** ** * ") model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_model_file = os.path.join(args.workspace, "pytorch_model.bin") if args.do_train: torch.save(model_to_save.state_dict(), output_model_file)
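The loop above computes a masked-LM-only loss by flattening the prediction scores and ignoring positions labelled -1. A standalone sketch of that loss with dummy shapes:

# Masked-LM-only loss as used above: unmasked positions carry -1 and are ignored.
import torch
from torch import nn

def masked_lm_loss(prediction_scores: torch.Tensor, lm_label_ids: torch.Tensor) -> torch.Tensor:
    # prediction_scores: [batch, seq_len, vocab_size]; lm_label_ids: [batch, seq_len] with -1 padding
    vocab_size = prediction_scores.size(-1)
    loss_fct = nn.CrossEntropyLoss(ignore_index=-1)
    return loss_fct(prediction_scores.view(-1, vocab_size), lm_label_ids.view(-1))

# Example with dummy tensors:
# scores = torch.randn(2, 8, 30522)
# labels = torch.full((2, 8), -1, dtype=torch.long); labels[0, 3] = 1037
# print(masked_lm_loss(scores, labels))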
def main(): parser = ArgumentParser() parser.add_argument('--pregenerated_data', type=Path, required=True) parser.add_argument('--dist_url', type=str, default="tcp://172.31.38.122:23456") parser.add_argument('--rank', type=int, default=0) parser.add_argument('--output_dir', type=Path, required=True) parser.add_argument('--use_all_gpus', action="store_true") parser.add_argument('--world_size', type=int, default=1) parser.add_argument('--output_file', type=str, default="pytorch_model.bin") parser.add_argument( "--bert_model", type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese." ) parser.add_argument("--no_sentence_loss", action="store_true", help="Whether not to use sentence level loss.") parser.add_argument('--tokeniser', type=str, default="vocab.txt") parser.add_argument("--do_lower_case", action="store_true") parser.add_argument( "--reduce_memory", action="store_true", help= "Store training data as on-disc memmaps to massively reduce memory usage" ) parser.add_argument("--epochs", type=int, default=3, help="Number of epochs to train for") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument('--training', type=bool, default=True, help="Whether to train the model or not") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--learning_rate", default=3e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument('--verbose', action='store_true', help="Whether to print more details along the way") parser.add_argument('--tensorboard', action='store_true', help="Whether to use Tensorboard ") parser.add_argument('--save', type=bool, default=True, help="Whether to save") parser.add_argument( '--bert_finetuned', type=str, default=None, help='Model finetuned to use instead of pretrained models') args = parser.parse_args() # if args.tensorboard : # from modeling import BertForPreTraining assert args.pregenerated_data.is_dir(), \ "--pregenerated_data should point to the folder of files made by pregenerate_training_data.py!" samples_per_epoch = [] for i in range(args.epochs): epoch_file = args.pregenerated_data / f"epoch_{i}.json" metrics_file = args.pregenerated_data / f"epoch_{i}_metrics.json" if epoch_file.is_file() and metrics_file.is_file(): metrics = json.loads(metrics_file.read_text()) samples_per_epoch.append(metrics['num_training_examples']) else: if i == 0: exit("No training data was found!") print( f"Warning! 
There are fewer epochs of pregenerated data ({i}) than training epochs ({args.epochs})." ) print( "This script will loop over the available data, but training diversity may be negatively impacted." ) num_data_epochs = i break else: num_data_epochs = args.epochs if args.local_rank == -1 or args.no_cuda: n_gpu = torch.cuda.device_count() device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") else: if args.use_all_gpus: device = torch.device("cuda") n_gpu = torch.cuda.device_count() dp_device_ids = list(range(min(n_gpu, args.train_batch_size))) else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) dp_device_ids = [args.local_rank] n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs print("Initialize Process Group...") # torch.distributed.init_process_group(backend='nccl') # Number of distributed processes world_size = args.world_size # Distributed backend type dist_backend = 'gloo' start = time.time() torch.distributed.init_process_group(backend=dist_backend, init_method=args.dist_url, rank=args.rank, world_size=world_size) end = time.time() print('done within :', end - start) logging.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) ##### COMBIEN DE GPUs UTILLISER args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if args.output_dir.is_dir() and list(args.output_dir.iterdir()): logging.warning( f"Output directory ({args.output_dir}) already exists and is not empty!" 
) args.output_dir.mkdir(parents=True, exist_ok=True) # tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) tokenizer = BertTokenizer.from_pretrained('vocab.txt', do_lower_case=args.do_lower_case) total_train_examples = 0 for i in range(args.epochs): # The modulo takes into account the fact that we may loop over limited epochs of data total_train_examples += samples_per_epoch[i % len(samples_per_epoch)] num_train_optimization_steps = int(total_train_examples / args.train_batch_size / args.gradient_accumulation_steps) if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) # num_train_optimization_steps = num_train_optimization_ste ps // n_gpu # Prepare model try: model = BertForPreTraining.from_pretrained( args.bert_model, verbose=args.verbose, tokeniser=args.tokeniser, train_batch_size=args.train_batch_size, device=device) except: model = BertForPreTraining.from_pretrained(args.bert_model) if args.bert_finetuned is not None: model_dict = torch.load(args.bert_finetuned) model.load_state_dict(model_dict) if args.fp16: model.half() if args.local_rank != -1: # try: # from apex.parallel import DistributedDataParallel as DDP # except ImportError: # raise ImportError( # "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") print("Initialize Model...") # model = DDP(model, device_ids = dp_device_ids,output_device=args.local_rank) model.to(device) model = torch.nn.parallel.DistributedDataParallel( model, device_ids=dp_device_ids, output_device=args.local_rank) n_gpu_used = model.device_ids print('number gpu used', n_gpu_used) elif n_gpu > 1: # torch.cuda.set_device(list(range(min(args.train_batch_size, n_gpu)))) model = torch.nn.DataParallel( model ) #, device_ids = list(range(min(args.train_batch_size, n_gpu)))) n_gpu_used = model.device_ids print('number gpu used', n_gpu_used) elif n_gpu == 1: print("Only 1 GPU used") n_gpu_used = [1] model.to(device) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." 
) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) global_step = 0 logging.info("***** Running training *****") logging.info(f" Num examples = {total_train_examples}") logging.info(" Batch size = %d", args.train_batch_size) logging.info(" Num steps = %d", num_train_optimization_steps) model.train() for epoch in range(args.epochs): epoch_dataset = PregeneratedDataset( epoch=epoch, training_path=args.pregenerated_data, tokenizer=tokenizer, num_data_epochs=num_data_epochs) if args.local_rank == -1: train_sampler = RandomSampler(epoch_dataset) else: train_sampler = DistributedSampler(epoch_dataset) train_dataloader = DataLoader(epoch_dataset, sampler=train_sampler, num_workers=0, batch_size=args.train_batch_size, pin_memory=False) tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 with tqdm(total=len(train_dataloader), desc=f"Epoch {epoch}") as pbar: for step, batch in enumerate(train_dataloader): if args.training: model.train() batch = tuple( t.to(device, non_blocking=True) for t in batch) input_ids, input_mask, segment_ids, lm_label_ids, is_next, mask_index = batch if args.no_sentence_loss: is_next = None loss = model(input_ids, segment_ids, input_mask, lm_label_ids, is_next, mask_index) if args.verbose: # print('input_ids : ', input_ids) # # print('input_mask : ', input_mask) # print('segment_ids : ', segment_ids) # print('lm_label_ids : ', lm_label_ids) # print('is_next : ', is_next) print('loss : ', loss) # if n_gpu > 1: if len(n_gpu_used) > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: # print('backwards') loss.backward() # print('backwards done') tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 pbar.update(1) mean_loss = tr_loss * args.gradient_accumulation_steps / nb_tr_steps pbar.set_postfix_str(f"Loss: {mean_loss:.5f}") if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear( global_step / num_train_optimization_steps, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 else: with torch.no_grad(): model.eval() batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, lm_label_ids, is_next = batch loss = model(input_ids, segment_ids, input_mask, lm_label_ids, is_next) if args.verbose: print('input_ids : ', input_ids) print('input_mask : ', input_mask) print('segment_ids : ', segment_ids) print('lm_label_ids : ', lm_label_ids) print('is_next : ', is_next) print('loss : ', loss) if n_gpu > 1: loss = loss.mean( ) # mean() to average on multi-gpu. 
if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 pbar.update(1) mean_loss = tr_loss * args.gradient_accumulation_steps / nb_tr_steps pbar.set_postfix_str(f"Loss: {mean_loss:.5f}") if (step + 1) % args.gradient_accumulation_steps == 0: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically global_step += 1 # Save a trained model if args.save: # pickle.dump(model.df,open('results.p','wb')) logging.info("** ** * Saving fine-tuned model ** ** * ") model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_model_file = args.output_dir / args.output_file torch.save(model_to_save.state_dict(), str(output_model_file))
dev_label_features = data_loader.convert_examples_to_features(
    dev_label_examples, label_list, MAX_SEQ_LEN, tokenizer, "classification", all_name_array)
dev_label_dataloader = data_loader.make_data_loader(
    dev_label_features, batch_size=args.batch_size_label - 2, fp16=args.fp16,
    sampler='sequential', metric_option=args.metric_option)
# torch.save( dev_label_dataloader, os.path.join( args.qnli_dir, "dev_label_dataloader"+name_add_on+".pytorch") )

print('\ndev_label_examples {}'.format(len(dev_label_examples)))  # dev_label_examples 7661

## **** make model ****

# bert model
bert_config = BertConfig(os.path.join(args.bert_model, "bert_config.json"))

cache_dir = args.cache_dir if args.cache_dir else os.path.join(
    str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank))

bert_lm_sentence = BertForPreTraining.from_pretrained(args.bert_model, cache_dir=cache_dir)  # @num_labels is yes/no
if args.fp16:
    bert_lm_sentence.half()  ## don't send to cuda, we will send to cuda with the joint model

# entailment model
ent_model = entailment_model.entailment_model(
    num_labels, bert_config.hidden_size, args.def_emb_dim,
    weight=torch.FloatTensor([1.5, .75]))  # torch.FloatTensor([1.5,.75])

# cosine model
# **** in using cosine model, we are not using the training sample A->B then B not-> A
other = {'metric_option': args.metric_option}
cosine_loss = encoder_model.cosine_distance_loss(bert_config.hidden_size, args.def_emb_dim, args)

metric_pass_to_joint_model = {'entailment': ent_model, 'cosine': cosine_loss}
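The entailment head above is built with class weights torch.FloatTensor([1.5, .75]). A sketch of how such weights feed a weighted cross-entropy over a pooled BERT representation; the class below is illustrative, not the repo's actual entailment_model.

# Hypothetical 2-way classification head with per-class loss weights.
import torch
from torch import nn

class WeightedPairClassifier(nn.Module):
    def __init__(self, hidden_size, num_labels=2, class_weight=None):
        super().__init__()
        self.classifier = nn.Linear(hidden_size, num_labels)
        self.loss_fct = nn.CrossEntropyLoss(weight=class_weight)

    def forward(self, pooled_output, labels=None):
        logits = self.classifier(pooled_output)
        if labels is None:
            return logits
        return self.loss_fct(logits, labels), logits

# head = WeightedPairClassifier(hidden_size=768, class_weight=torch.FloatTensor([1.5, .75]))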
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument("--train_file", default=None, type=str, required=True, help="The input train corpus.") parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese." ) parser.add_argument( "--output_dir", default=None, type=str, required=True, help="The output directory where the model checkpoints will be written." ) ## Other parameters parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=3e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument( "--on_memory", action='store_true', help="Whether to load train samples into memory or use disk") parser.add_argument( "--do_lower_case", action='store_true', help= "Whether to lower case the input text. True for uncased models, False for cased models." ) parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumualte before performing a backward/update pass." ) parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") args = parser.parse_args() if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". 
format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") if os.path.exists(args.output_dir) and os.listdir(args.output_dir): raise ValueError( "Output directory ({}) already exists and is not empty.".format( args.output_dir)) os.makedirs(args.output_dir, exist_ok=True) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) #train_examples = None num_train_steps = None if args.do_train: print("Loading Train Dataset", args.train_file) train_dataset = BERTDataset(args.train_file, tokenizer, seq_len=args.max_seq_length, corpus_lines=None, on_memory=args.on_memory) num_train_steps = int( len(train_dataset) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs) # Prepare model #model = BertForPreTraining.from_pretrained(args.bert_model) bert_config = BertConfig.from_json_file('bert_config.json') model = BertForPreTraining(bert_config) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." 
) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_steps) global_step = 0 if args.do_train: logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_steps) if args.local_rank == -1: train_sampler = RandomSampler(train_dataset) else: #TODO: check if this works with current data generator from disk that relies on file.__next__ # (it doesn't return item back by index) train_sampler = DistributedSampler(train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) model.train() for _ in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, lm_label_ids, is_next = batch loss = model(input_ids, segment_ids, input_mask, lm_label_ids, is_next) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: # modify learning rate with special warm up BERT uses lr_this_step = args.learning_rate * warmup_linear( global_step / num_train_steps, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 # Save a trained model logger.info("** ** * Saving fine - tuned model ** ** * ") model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_model_file = os.path.join(args.output_dir, "pytorch_model.bin") if args.do_train: torch.save(model_to_save.state_dict(), output_model_file)
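The non-fp16 path above relies on BertAdam's built-in schedule, while the fp16 path recomputes the learning rate by hand with `warmup_linear`. A sketch of that schedule as a plain function (a rough reconstruction of the helper's behaviour, not a verified copy of the library code):

# Linear warmup-then-decay schedule used in the fp16 branch above.
def warmup_linear(x, warmup=0.002):
    # ramp up for the first `warmup` fraction of training, then decay linearly
    if x < warmup:
        return x / warmup
    return 1.0 - x

# lr_this_step = args.learning_rate * warmup_linear(global_step / num_train_steps,
#                                                   args.warmup_proportion)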
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument("-train_file", required=True, help="The input train corpus.") parser.add_argument( "-output_dir", default='../trained_model', help="The output directory where the model checkpoints will be written." ) parser.add_argument( "-bert_model", default='bert-base-cased', help="bert-base-uncased, bert-large-uncased, bert-base-cased") parser.add_argument("-max_seq_length", default=128, type=int, help="sequence length after WordPiece tokenization. \ longer will be truncated, shorter will be padded." ) parser.add_argument("-train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("-learning_rate", default=3e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("-num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "-warmup_proportion", default=0.1, type=float, help="Proportion of training to perform linear lr rate warmup. ") parser.add_argument("-local_rank", default=-1, type=int, help="local_rank for distributed training on gpus") parser.add_argument('-seed', default=42, type=int, help="random seed for initialization") parser.add_argument('-gradient_accumulation_steps', default=1, type=int, help="") parser.add_argument('-loss_scale', default=0, type=float, help="") parser.add_argument("-no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument( "-on_memory", action='store_true', help="Whether to load train samples into memory or use disk") parser.add_argument("-do_lower_case", action='store_true', help="Whether to lower case the input text.") parser.add_argument("-do_train", action='store_true', help="Whether to run training.") args = parser.parse_args() device = torch.device( "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() logger.info("device: {} n_gpu: {}, distributed training: {}".format( device, n_gpu, bool(args.local_rank != -1))) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train: raise ValueError( "Training is currently the only implemented execution option. Please set `do_train`." 
) if not os.path.exists(args.output_dir): logger.info('creating model output dir {}'.format(args.output_dir)) os.makedirs(args.output_dir) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) # import pdb # pdb.set_trace() #train_examples = None num_train_optimization_steps = None if args.do_train: print("Loading Train Dataset", args.train_file) train_dataset = BERTDataset(args.train_file, tokenizer, seq_len=args.max_seq_length, corpus_lines=None, on_memory=args.on_memory) num_train_optimization_steps = int( len(train_dataset) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs # Prepare model model = BertForPreTraining.from_pretrained(args.bert_model) model.to(device) if n_gpu > 1: # true most of the time model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) global_step = 0 if args.do_train: logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) train_sampler = RandomSampler(train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) model.train() for _ in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, lm_label_ids, is_next = batch loss, encoded_seq = model( input_ids, segment_ids, input_mask, lm_label_ids, is_next, get_encoded_seq=True) # Language modeling loss if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: optimizer.step() optimizer.zero_grad() global_step += 1 # Save a trained model logger.info("** ** * Saving fine - tuned model ** ** * ") model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_model_file = os.path.join( args.output_dir, "pytorch_model_{}.bin".format( time.strftime("%Y_%m_%d_%H_%M_%S", time.localtime()))) if args.do_train: torch.save(model_to_save.state_dict(), output_model_file)
def main(dn, dev, batch_size, epochs): pregenerated_data = Dir(f'data/{dn}.pretrain.temp') output_dir = Dir(f'temp/{dn}.bert.pt') bert_model = 'bert-base-uncased' do_lower_case = True reduce_memory = True local_rank = -1 no_cuda = (dev == 'cpu') gradient_accumulation_steps = 1 train_batch_size = batch_size fp16 = False loss_scale = 0 warmup_proportion = 0.1 learning_rate = 3e-5 seed = 42 samples_per_epoch = [] for i in range(epochs): epoch_file = pregenerated_data / f'epoch_{i}.json' metrics_file = pregenerated_data / f'epoch_{i}_metrics.json' if epoch_file.isFile() and metrics_file.isFile(): metrics = json.loads(metrics_file.file().read()) samples_per_epoch.append(metrics['num_training_examples']) else: if i == 0: exit("No training data was found!") print( f"Warning! There are fewer epochs of pregenerated data ({i}) than training epochs ({epochs})." ) print( "This script will loop over the available data, but training diversity may be negatively impacted." ) num_data_epochs = i break else: num_data_epochs = epochs if no_cuda: device, n_gpu = 'cpu', 0 elif local_rank == -1: device, n_gpu = 'cuda', torch.cuda.device_count() else: torch.cuda.set_device(local_rank) device, n_gpu = f'cuda:{local_rank}', 1 # Initializes the distributed backend which will take care of synchronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') pr(device=device, n_gpu=n_gpu, distributed=(local_rank != -1), float16=fp16) train_batch_size = train_batch_size // gradient_accumulation_steps random.seed(seed) np.random.seed(seed) torch.manual_seed(seed) if n_gpu > 0: torch.cuda.manual_seed_all(seed) tokenizer = BertTokenizer.from_pretrained(bert_model, do_lower_case=do_lower_case) total_train_examples = 0 for i in range(epochs): # The modulo takes into account the fact that we may loop over limited epochs of data total_train_examples += samples_per_epoch[i % len(samples_per_epoch)] num_train_optimization_steps = int( total_train_examples / train_batch_size / gradient_accumulation_steps) if local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) # Prepare model model = BertForPreTraining.from_pretrained(bert_model) if fp16: model.half() model.to(device) if local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model) elif n_gpu > 1: model = nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] warmup_linear = None if fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
) optimizer = FusedAdam(optimizer_grouped_parameters, lr=learning_rate, bias_correction=False, max_grad_norm=1.0) if loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=loss_scale) warmup_linear = WarmupLinearSchedule( warmup=warmup_proportion, t_total=num_train_optimization_steps) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=learning_rate, warmup=warmup_proportion, t_total=num_train_optimization_steps) global_step = 0 pr('***** Running training *****') pr(num_examples=total_train_examples) pr(batch_size=train_batch_size) pr(num_steps=num_train_optimization_steps) model.train() for epoch in range(epochs): epoch_dataset = PregeneratedDataset( epoch=epoch, training_path=pregenerated_data, tokenizer=tokenizer, num_data_epochs=num_data_epochs, reduce_memory=reduce_memory, ) if local_rank == -1: train_sampler = RandomSampler(epoch_dataset) else: train_sampler = DistributedSampler(epoch_dataset) train_dataloader = DataLoader(epoch_dataset, sampler=train_sampler, batch_size=train_batch_size) tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 with tqdm(total=len(train_dataloader), desc=f"epoch-{epoch}") as pbar: for step, batch in enumerate(train_dataloader): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, lm_label_ids, is_next = batch loss = model(input_ids, segment_ids, input_mask, lm_label_ids, is_next) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if gradient_accumulation_steps > 1: loss = loss / gradient_accumulation_steps if fp16: optimizer.backward(loss) else: loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 pbar.update(1) mean_loss = tr_loss * gradient_accumulation_steps / nb_tr_steps pbar.set_postfix_str(f"Loss: {mean_loss:.5f}") if (step + 1) % gradient_accumulation_steps == 0: if fp16: # modify learning rate with special warm up BERT uses # if fp16 is False, BertAdam is used that handles this automatically lr_this_step = learning_rate * warmup_linear.get_lr( global_step / num_train_optimization_steps, warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 # Save a trained model pr('***** Saving fine-tuned model *****') model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_model_file = output_dir.add().div('pytorch_model.bin').file() torch.save(model_to_save.state_dict(), output_model_file.pathstr())
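A minimal invocation sketch for the function above, assuming pregenerated epoch files already exist under data/<name>.pretrain.temp/; the corpus name 'wiki' is a hypothetical placeholder.

if __name__ == '__main__':
    # Hypothetical example: 'wiki' stands in for a real corpus whose pregenerated files
    # (epoch_0.json, epoch_0_metrics.json, ...) live in data/wiki.pretrain.temp/.
    # Train for 3 epochs at batch size 32 on GPU; pass dev='cpu' to force CPU.
    main('wiki', 'cuda', 32, 3)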
len(quant75))) other_params['GoCount'] = GO_counter other_params['quant25'] = quant25 other_params['quant75'] = quant75 other_params['betweenQ25Q75'] = betweenQ25Q75 ## **** make BERT model # bert language mask + next sentence model bert_config = BertConfig(os.path.join(args.bert_model, "bert_config.json")) cache_dir = args.cache_dir if args.cache_dir else os.path.join( str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format( args.local_rank)) bert_lm_sentence = BertForPreTraining.from_pretrained(args.bert_model, cache_dir=cache_dir) # cosine model # **** in using cosine model, we are not using the training sample A->B then B not-> A cosine_loss = BERT_encoder_model.cosine_distance_loss(bert_config.hidden_size, args.def_emb_dim, args) metric_pass_to_joint_model = {'entailment': None, 'cosine': cosine_loss} #* **** add yes/no classifier to BERT **** ## init joint model GOEncoder = BERT_encoder_model.encoder_model( bert_lm_sentence, metric_pass_to_joint_model[args.metric_option], args, tokenizer, **other_params) if args.go_enc_model_load is not None: print('\n\nload back best model for GO encoder {}'.format(
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help="The input data dir. Should contain the data files for the task.") parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese." ) parser.add_argument("--task_name", default=None, type=str, required=True, help="The name of the task to train.") parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory is where the model checkpoints will be saved.") ## Other parameters parser.add_argument( "--max_seq_length", default=512, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument( "--do_lower_case", default=False, action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=10, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", default=False, action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( '--optimize_on_cpu', default=False, action='store_true', help= "Whether to perform optimization and keep the optimizer averages on CPU" ) parser.add_argument( '--fp16', default=False, action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=128, help= 'Loss scaling, positive power of 2 values can improve fp16 convergence.' 
) args = parser.parse_args() processors = {"semeval": SemevalProcessor} if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') if args.fp16: logger.info( "16-bits training currently not supported in distributed training" ) args.fp16 = False # (see https://github.com/pytorch/pytorch/pull/13496) logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(args.local_rank != -1)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = int(args.train_batch_size // args.gradient_accumulation_steps) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) os.makedirs(args.output_dir, exist_ok=True) task_name = args.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) train_examples = processor.get_train_examples(args.data_dir) num_train_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs) # Prepare model model = BertForPreTraining.from_pretrained( args.bert_model, cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(args.local_rank)) model_path = Path(args.output_dir, "model.pth") if model_path.exists(): model.load_state_dict(torch.load(model_path)) if args.fp16: model.half() model.to(device) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer if args.fp16: param_optimizer = [(n, param.clone().detach().to('cpu').float().requires_grad_()) \ for n, param in model.named_parameters()] elif args.optimize_on_cpu: param_optimizer = [(n, param.clone().detach().to('cpu').requires_grad_()) \ for n, param in model.named_parameters()] else: param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'gamma', 'beta'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0 }] t_total = num_train_steps if args.local_rank != -1: t_total = t_total // torch.distributed.get_world_size() optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=t_total) train_data = PretrainingDataset(train_examples, args.max_seq_length, tokenizer) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_steps) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size, num_workers=0, pin_memory=True) model.train() for _ in 
trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 with tqdm(train_dataloader, desc="Iteration") as pbar: for step, batch in enumerate(pbar): input_ids, input_mask, segment_ids, masked_lm_labels, next_sentence_labels = batch loss = model(input_ids, segment_ids, input_mask, masked_lm_labels, next_sentence_labels) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.fp16 and args.loss_scale != 1.0: # rescale loss for fp16 training # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html loss = loss * args.loss_scale if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16 or args.optimize_on_cpu: if args.fp16 and args.loss_scale != 1.0: # scale down gradients for fp16 training for param in model.parameters(): if param.grad is not None: param.grad.data = param.grad.data / args.loss_scale is_nan = set_optimizer_params_grad( param_optimizer, model.named_parameters(), test_nan=True) if is_nan: logger.info( "FP16 TRAINING: Nan in gradients, reducing loss scaling" ) args.loss_scale = args.loss_scale / 2 model.zero_grad() continue optimizer.step() copy_optimizer_params_to_model( model.named_parameters(), param_optimizer) else: optimizer.step() model.zero_grad() pbar.set_postfix(loss="%.3f" % loss.item()) if n_gpu > 1: torch.save(model.module.state_dict(), model_path) else: torch.save(model.state_dict(), model_path)
random.seed(42) np.random.seed(42) torch.manual_seed(42) if n_gpu > 0: torch.cuda.manual_seed_all(42) total_train_examples = 0 for i in range(epochs): # The modulo takes into account the fact that we may loop over limited epochs of data total_train_examples += samples_per_epoch[i % len(samples_per_epoch)] num_train_optimization_steps = int(total_train_examples / train_batch_size) # Prepare model model = BertForPreTraining.from_pretrained( pretrained_model_name_or_path=bert_model_path, cache_dir=bert_data_path) model.to(device) if n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
def main(): parser = ArgumentParser() parser.add_argument('--pregenerated_training_data', type=Path, required=True) parser.add_argument('--pregenerated_dev_data', type=Path, required=True) parser.add_argument('--output_dir', type=Path, required=True) parser.add_argument( '--bert_model', type=str, required=True, help='Bert pre-trained model selected in the list: bert-base-uncased, ' 'bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.' ) parser.add_argument('--do_lower_case', action='store_true') parser.add_argument( '--reduce_memory', action='store_true', help= 'Store training data as on-disc memmaps to massively reduce memory usage' ) parser.add_argument('--epochs', type=int, default=3, help='Number of epochs to train for') parser.add_argument('--local_rank', type=int, default=-1, help='local_rank for distributed training on gpus') parser.add_argument('--no_cuda', action='store_true', help='Whether not to use CUDA when available') parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= 'Number of updates steps to accumulate before performing a backward/update pass.' ) parser.add_argument('--train_batch_size', default=32, type=int, help='Total batch size for training.') parser.add_argument( '--fp16', action='store_true', help='Whether to use 16-bit float precision instead of 32-bit') parser.add_argument( '--loss_scale', type=float, default=0, help= 'Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n' '0 (default value): dynamic loss scaling.\n' 'Positive power of 2: static loss scaling value.\n') parser.add_argument( '--warmup_proportion', default=0.1, type=float, help= 'Proportion of training to perform linear learning rate warmup for. ' 'E.g., 0.1 = 10%% of training.') parser.add_argument('--learning_rate', default=3e-5, type=float, help='The initial learning rate for Adam.') parser.add_argument('--seed', type=int, default=42, help='random seed for initialization') args = parser.parse_args() assert args.pregenerated_training_data.is_dir(), \ '--pregenerated_training_data should point to the folder of files made by pregenerate_training_data.py!' samples_per_epoch = [] for i in range(args.epochs): epoch_file = args.pregenerated_training_data / f'epoch_{i}.json' metrics_file = args.pregenerated_training_data / f'epoch_{i}_metrics.json' if epoch_file.is_file() and metrics_file.is_file(): metrics = json.loads(metrics_file.read_text()) samples_per_epoch.append(metrics['num_training_examples']) else: if i == 0: exit('No training data was found!') print( f'Warning! There are fewer epochs of pregenerated data ({i}) than training epochs ({args.epochs}).' ) print( 'This script will loop over the available data, but training diversity may be negatively impacted.' ) num_data_epochs = i break else: num_data_epochs = args.epochs if args.local_rank == -1 or args.no_cuda: device = torch.device('cuda' if torch.cuda.is_available() and not args.no_cuda else 'cpu') n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device('cuda', args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logging.info( 'device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}'. 
format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( 'Invalid gradient_accumulation_steps parameter: {}, should be >= 1' .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if args.output_dir.is_dir() and list(args.output_dir.iterdir()): logging.warning( f'Output directory ({args.output_dir}) already exists and is not empty!' ) args.output_dir.mkdir(parents=True, exist_ok=True) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) total_train_examples = 0 for i in range(args.epochs): # The modulo takes into account the fact that we may loop over limited epochs of data total_train_examples += samples_per_epoch[i % len(samples_per_epoch)] num_train_optimization_steps = int(total_train_examples / args.train_batch_size / args.gradient_accumulation_steps) if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) # Prepare model model = BertForPreTraining.from_pretrained(args.bert_model) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( 'Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.' ) model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( 'Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.' 
) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) warmup_linear = WarmupLinearSchedule( warmup=args.warmup_proportion, t_total=num_train_optimization_steps) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) # Track loss train_loss_history = list() dev_loss_history = list() # Start training global_step = 0 logging.info('***** Running training *****') logging.info(f' Num examples = {total_train_examples}') logging.info(f' Batch size = {args.train_batch_size}') logging.info(f' Num steps = {num_train_optimization_steps} \n') for epoch in range(args.epochs): # Train model model.train() epoch_dataset = PregeneratedDataset( epoch=epoch, training_path=args.pregenerated_training_data, tokenizer=tokenizer, num_data_epochs=num_data_epochs, train_or_dev='train', reduce_memory=args.reduce_memory) if args.local_rank == -1: train_sampler = RandomSampler(epoch_dataset) else: train_sampler = DistributedSampler(epoch_dataset) train_dataloader = DataLoader(epoch_dataset, sampler=train_sampler, batch_size=args.train_batch_size) tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 with tqdm(total=len(train_dataloader), desc=f'Epoch {epoch}') as train_pbar: for step, batch in enumerate(train_dataloader): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, lm_label_ids, is_next = batch loss = model(input_ids, segment_ids, input_mask, lm_label_ids, is_next) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 train_pbar.update(1) mean_train_loss = tr_loss * args.gradient_accumulation_steps / nb_tr_steps if step % 10 == 0: train_loss_history.append((epoch, mean_train_loss)) train_pbar.set_postfix_str(f'Loss: {mean_train_loss:.5f}') if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear.get_lr( global_step, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 # Evaluate dev loss model.eval() dev_dataset = PregeneratedDataset( epoch=epoch, training_path=args.pregenerated_dev_data, tokenizer=tokenizer, num_data_epochs=num_data_epochs, train_or_dev='dev', reduce_memory=args.reduce_memory) if args.local_rank == -1: train_sampler = RandomSampler(dev_dataset) else: train_sampler = DistributedSampler(dev_dataset) dev_dataloader = DataLoader(dev_dataset, sampler=train_sampler, batch_size=args.train_batch_size) dev_loss = 0 nb_dev_examples, nb_dev_steps = 0, 0 with tqdm(total=len(dev_dataloader), desc=f'Epoch {epoch}') as dev_pbar: for step, batch in enumerate(dev_dataloader): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, lm_label_ids, is_next = batch loss = model(input_ids, segment_ids, input_mask, lm_label_ids, is_next) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() dev_loss += loss.item() nb_dev_examples += input_ids.size(0) nb_dev_steps += 1 dev_pbar.update(1) mean_dev_loss = dev_loss * args.gradient_accumulation_steps / nb_dev_steps dev_pbar.set_postfix_str(f'Loss: {mean_dev_loss:.5f}') dev_loss_history.append( (epoch, mean_dev_loss)) # Only collect final mean dev loss # Save training progress with optimizer logging.info('** ** * Saving training progress * ** **') Path(args.output_dir / f'{epoch}/').mkdir(exist_ok=True) output_model_file = args.output_dir / f'{epoch}/model_and_opt.bin' torch.save( { 'epoch': epoch, 'model_state_dict': model.state_dict(), 'optimizer_state_dict': optimizer.state_dict(), 'loss': tr_loss, }, str(output_model_file)) # Save easily-loadable model module logging.info(f'** ** * Saving fine-tuned model {epoch} * ** ** \n') model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_model_file = args.output_dir / f'{epoch}/{WEIGHTS_NAME}' output_config_file = args.output_dir / f'{epoch}/{CONFIG_NAME}' torch.save(model_to_save.state_dict(), str(output_model_file)) model_to_save.config.to_json_file(str(output_config_file)) tokenizer.save_vocabulary(args.output_dir) # Save loss history after every epoch with open(args.output_dir / f'{epoch}/loss_history.json', 'a') as h: hist = {'dev': dev_loss_history, 'train': train_loss_history} h.write(f'{json.dumps(hist)}\n')
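A small follow-up sketch for inspecting the loss history written by the loop above, assuming the directory layout it creates (one numbered subdirectory per epoch, each holding a loss_history.json whose last line carries the accumulated 'train'/'dev' histories); the output path is a placeholder.

import json
from pathlib import Path

output_dir = Path('temp/bert_output')  # placeholder for the --output_dir passed to the script
last_epoch = max(int(p.name) for p in output_dir.iterdir() if p.name.isdigit())
# Each json.dumps call appends one line; the last line contains the full history so far.
hist = json.loads((output_dir / f'{last_epoch}/loss_history.json').read_text().splitlines()[-1])
best_epoch, best_dev = min(hist['dev'], key=lambda t: t[1])
print(f'Lowest mean dev loss {best_dev:.5f} at epoch {best_epoch}')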
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument("--data_dir", default=None, type=str, required=True, help="The input data dir. Should contain the .tsv files (or other data files) for the task.") parser.add_argument("--dataset_name", default="top300_kl", type=str, required=True, help="The name of the dataset to run inference on (without extension, e.g. top300_kl)") parser.add_argument("--model_type", default="baseline_tfidf", type=str, required=True, help="baseline, baseline_tfidf, ir-v0, ir-v1") parser.add_argument("--model_path", default=None, type=str, required=True, help="Path to the model directory") parser.add_argument("--output_dir", default=None, type=str, required=True, help="Directory where the output ranking file will be saved") ## Other parameters parser.add_argument("--bert_model", default="bert-base-multilingual-cased", type=str, help="Default: bert-base-multilingual-cased. " "Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument("--model_file", default="pytorch_model.bin", type=str, help="The model weights file (.bin); the default is pytorch_model.bin.\n" "Set the file name here if a specific file is needed.") parser.add_argument("--max_seq_length", default=384, type=int, help="The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument('--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") args = parser.parse_args() if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of synchronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format( device, n_gpu, bool(args.local_rank != -1), args.fp16)) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) processor = IRProcessor() label_list = processor.get_labels() num_labels = len(label_list) print("model:", args.model_type) if args.model_type == "baseline": # load model (finetuned baseline on IR) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=False) config = BertConfig(os.path.join(args.model_path, "bert_config.json")) model = BertForPreTraining(config) model.load_state_dict(torch.load(os.path.join(args.model_path, args.model_file))) elif args.model_type == "baseline_tfidf": # load model (baseline_tfidf) tokenizer = BertTFIDFTokenizer.from_pretrained(args.bert_model, do_lower_case=False, do_tf_idf=True) TFIDFconfig = modeling.BertConfig(os.path.join(args.model_path, "bert_config.json")) model = modeling.BertTFIDFForPreTraining(TFIDFconfig)
model.load_state_dict(torch.load(os.path.join(args.model_path, args.model_file))) elif args.model_type == "ir-v0": # load model (*-head) tokenizer = BertTFIDFTokenizer.from_pretrained(args.bert_model, do_lower_case=False, do_tf_idf=True) head_config = modeling_ir.BertForIRConfig(os.path.join(args.model_path, "bert_config.json")) model = modeling_ir.BertForIRForPreTraining(head_config) model.load_state_dict(torch.load(os.path.join(args.model_path, args.model_file))) elif args.model_type == "ir-v1": # load model (*-head) tokenizer = BertTFIDFTokenizer.from_pretrained(args.bert_model, do_lower_case=False, do_tf_idf=True) head_config = modeling_ir_2.BertForIRConfig(os.path.join(args.model_path, "bert_config.json")) model = modeling_ir_2.BertForIRForPreTraining(head_config) model.load_state_dict(torch.load(os.path.join(args.model_path, args.model_file))) if args.fp16: model.half() model.to(device) tfidf_dict = pickle_load(os.path.join(args.data_dir, args.dataset_name + '_tfidf.pkl')) results_logit = dict() results_softmax = dict() eval_set, documents, queries = processor.make_eval_set(args.data_dir, args.dataset_name) logger.info("***** Running evaluation *****") logger.info(" Batch size = %d", args.eval_batch_size) for q_num, query in tqdm(enumerate(queries), total=len(queries), desc="Evaluating"): # for query in queries[0:1]: # for testing logger.info(f"Current Query Num : {q_num}") eval_examples = processor._create_examples(eval_set, query, documents) # logger.info(" Num examples = %d", len(eval_examples)) if args.model_type == "baseline": # baseline or baseline_finetuned eval_features = convert_examples_to_features_for_vanilla( eval_examples, label_list, args.max_seq_length, tokenizer) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_label_ids = torch.tensor([f.label for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss = 0 nb_eval_steps = 0 preds = [] for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Query"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): _, logits = model(input_ids, segment_ids, input_mask) # loss_fct = CrossEntropyLoss() # tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1)) # eval_loss += tmp_eval_loss.mean().item() # nb_eval_steps += 1 if len(preds) == 0: preds.append(logits.detach().cpu().numpy()) else: preds[0] = np.append( preds[0], logits.detach().cpu().numpy(), axis=0) else: # baseline_tfidf or *-head model eval_data = LazyDatasetClassifier(eval_examples, label_list, args.max_seq_length, tokenizer, tfidf_dict) eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss = 0 nb_eval_steps = 0 preds = [] for batch in tqdm(eval_dataloader, desc="Query"): batch = tuple(t.to(device) for t in batch) input_ids, input_weights, input_mask, segment_ids, label_ids = batch with torch.no_grad(): _, logits = model(input_ids, input_weights, segment_ids, input_mask)
# loss_fct = CrossEntropyLoss() # tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1)) # eval_loss += tmp_eval_loss.mean().item() nb_eval_steps += 1 if len(preds) == 0: preds.append(logits.detach().cpu().numpy()) else: preds[0] = np.append( preds[0], logits.detach().cpu().numpy(), axis=0) # eval_loss = eval_loss / nb_eval_steps preds = preds[0] results_logit[query] = [] for i, pred in enumerate(preds): # rank by raw logit score pair = dict() pair["score"] = pred[1] pair["doc_id"] = list(documents.keys())[i] results_logit[query].append(pair) results_logit[query].sort(reverse=True, key=lambda x: x["score"]) results_softmax[query] = [] for i, pred in enumerate(softmax(preds)): # using softmax pair = dict() pair["score"] = pred[1] pair["doc_id"] = list(documents.keys())[i] results_softmax[query].append(pair) results_softmax[query].sort(reverse=True, key=lambda x: x["score"]) ranked_doc_list = [] for doc in results_logit[query]: ranked_doc_list.append(doc["doc_id"]) results_logit[query] = ranked_doc_list ranked_doc_list = [] for doc in results_softmax[query]: ranked_doc_list.append(doc["doc_id"]) results_softmax[query] = ranked_doc_list save_name2 = args.model_path.split('/')[0] + '_' + args.model_file.split('.')[0] \ + '_' + args.dataset_name + '_output.json' path2 = os.path.join(args.output_dir, save_name2) with open(path2, 'w', encoding="utf8") as f: json.dump(results_softmax, f, indent=4, sort_keys=True, ensure_ascii=False)
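A brief sketch of consuming the ranking file written above, assuming the JSON maps each query string to a list of doc_ids sorted by descending softmax score; the file name is a hypothetical example of the generated save_name2.

import json

# Hypothetical output file name; in practice it is args.output_dir joined with save_name2.
with open('output/baseline_pytorch_model_top300_kl_output.json', encoding='utf8') as f:
    rankings = json.load(f)
for query, doc_ids in rankings.items():
    print(query, '->', doc_ids[:5])  # top-5 ranked documents per query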
cosine_loss = encoder_model.cosine_distance_loss( args.def_emb_dim, args.def_emb_dim, args) ## remember to turn on reduce flag ??? # entailment model # ent_model = entailment_model.entailment_model (num_labels,args.gcnn_dim,args.def_emb_dim,weight=torch.FloatTensor([1.5,.75])) # torch.FloatTensor([1.5,.75]) metric_pass_to_joint_model = {'entailment': None, 'cosine': cosine_loss} ## NEED TO MAKE THE BERT MODEL # use BERT tokenizer bert_config = BertConfig(os.path.join(args.bert_model, "bert_config.json")) other = {'metric_option': args.metric_option} bert_lm_sentence = BertForPreTraining.from_pretrained(args.bert_model) bert_lm_ent_model = BERT_encoder_model.encoder_model( bert_lm_sentence, metric_pass_to_joint_model[args.metric_option], args, tokenizer, **other) ## make GCN model model = encoder_model.encoder_with_bert( args, bert_lm_ent_model, metric_pass_to_joint_model[args.metric_option], **other_params) print('\nmodel is\n') print(model) if args.use_cuda: print('\n\n send model to gpu\n\n') model.cuda()
def main(): parser = ArgumentParser() parser.add_argument('--pregenerated_data', type=Path, required=True) parser.add_argument('--output_dir', type=Path, required=True) parser.add_argument("--bert_model", type=str, required=True, choices=["bert-base-uncased", "bert-large-uncased", "bert-base-cased", "bert-base-multilingual-cased", "bert-base-chinese"]) parser.add_argument("--do_lower_case", action="store_true") parser.add_argument("--reduce_memory", action="store_true", help="Store training data as on-disc memmaps to massively reduce memory usage") parser.add_argument("--epochs", type=int, default=3, help="Number of epochs to train for") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument('--gradient_accumulation_steps', type=int, default=1, help="Number of updates steps to accumulate before performing a backward/update pass.") parser.add_argument("--train_batch_size", default=16, type=int, help="Total batch size for training.") parser.add_argument('--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument('--loss_scale', type=float, default=0, help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument("--warmup_proportion", default=0.1, type=float, help="Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--learning_rate", default=3e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument('--seed', type=int, default=None, help="random seed for initialization") args = parser.parse_args() assert args.pregenerated_data.is_dir(), \ "--pregenerated_data should point to the folder of files made by pregenerate_training_data.py!" samples_per_epoch = [] for i in range(args.epochs): epoch_file = args.pregenerated_data / f"epoch_{i}.json" metrics_file = args.pregenerated_data / f"epoch_{i}_metrics.json" if epoch_file.is_file() and metrics_file.is_file(): metrics = json.loads(metrics_file.read_text()) samples_per_epoch.append(metrics['num_training_examples']) else: if i == 0: exit("No training data was found!") print(f"Warning! 
There are fewer epochs of pregenerated data ({i}) than training epochs ({args.epochs}).") print("This script will loop over the available data, but training diversity may be negatively impacted.") num_data_epochs = i break else: num_data_epochs = args.epochs print(samples_per_epoch) if args.gradient_accumulation_steps < 1: raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format( args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps if args.seed: random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if torch.cuda.is_available(): torch.cuda.manual_seed_all(args.seed) if args.output_dir.is_dir() and list(args.output_dir.iterdir()): logging.warning(f"Output directory ({args.output_dir}) already exists and is not empty!") args.output_dir.mkdir(parents=True, exist_ok=True) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) total_train_examples = 0 for i in range(args.epochs): # The modulo takes into account the fact that we may loop over limited epochs of data total_train_examples += samples_per_epoch[i % len(samples_per_epoch)] num_train_optimization_steps = int( total_train_examples / args.train_batch_size / args.gradient_accumulation_steps) # Prepare model model = BertForPreTraining.from_pretrained(args.bert_model) model = torch.nn.DataParallel(model) # Prepare optimizer optimizer = BertAdam train_dataloader = DataLoader( PregeneratedData(args.pregenerated_data, tokenizer, args.epochs, args.train_batch_size), batch_size=args.train_batch_size, ) data = DataBunch(train_dataloader, train_dataloader) global_step = 0 logging.info("***** Running training *****") logging.info(f" Num examples = {total_train_examples}") logging.info(" Batch size = %d", args.train_batch_size) logging.info(" Num steps = %d", num_train_optimization_steps) def loss(x, y): return x.mean() learn = Learner(data, model, optimizer, loss_func=loss, true_wd=False, path='learn', layer_groups=bert_layer_list(model), ) lr = args.learning_rate layers = len(bert_layer_list(model)) lrs = learn.lr_range(slice(lr / (2.6**4), lr)) for epoch in range(args.epochs): learn.fit_one_cycle(1, lrs, wd=0.01) # save model at half way point if epoch == args.epochs // 2: savem = learn.model.module.bert if hasattr(learn.model, 'module') else learn.model.bert output_model_file = args.output_dir / (f"pytorch_fastai_model_{args.bert_model}_{epoch}.bin") torch.save(savem.state_dict(), str(output_model_file)) print(f'Saved bert to {output_model_file}') savem = learn.model.module.bert if hasattr(learn.model, 'module') else learn.model.bert output_model_file = args.output_dir / (f"pytorch_fastai_model_{args.bert_model}_{args.epochs}.bin") torch.save(savem.state_dict(), str(output_model_file)) print(f'Saved bert to {output_model_file}')
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument("--train_file", default=None, type=str, required=True, help="The input train corpus.") parser.add_argument( "--output_dir", default='../trained_model', type=str, help="The output directory where the model checkpoints will be written." ) parser.add_argument( "--bert_model", default='bert-base-cased', type=str, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese." ) ## Other parameters parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--learning_rate", default=3e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument( "--on_memory", action='store_true', help="Whether to load train samples into memory or use disk") parser.add_argument( "--do_lower_case", action='store_true', help= "Whether to lower case the input text. True for uncased models, False for cased models." ) parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of update steps to accumulate before performing a backward/update pass." ) parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") args = parser.parse_args() device = torch.device( "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train: raise ValueError( "Training is currently the only implemented execution option. Please set `do_train`."
) if not os.path.exists(args.output_dir): logger.info('creating model output dir {}'.format(args.output_dir)) os.makedirs(args.output_dir) dataset = args.train_file.split('/')[-3] voc_fname = '/'.join(args.train_file.split('/')[:-2] + ['vocab.txt']) tokenizer = BertTokenizer(voc_fname, do_lower_case=args.do_lower_case) num_train_optimization_steps = None print("Loading Train Dataset", args.train_file) train_dataset = BERTDataset(args.train_file, tokenizer, seq_len=args.max_seq_length, corpus_lines=None, on_memory=args.on_memory) num_train_optimization_steps = int( len(train_dataset) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs # Prepare model. TODO: change to create fresh model. vocab = utils.read(voc_fname) config = BertConfig(vocab_size_or_config_json_file=len(vocab), hidden_size=768, num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) model = BertForPreTraining(config) model.to(device) if n_gpu > 1: # true most of the time model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) global_step = 0 if args.do_train: logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) # true most of the time train_sampler = RandomSampler(train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) model.train() for _ in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, lm_label_ids, is_next = batch loss = model(input_ids, segment_ids, input_mask, lm_label_ids, is_next) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) # batch_size nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: optimizer.step() optimizer.zero_grad() global_step += 1 # Save a trained model logger.info("** ** * Saving fine - tuned model ** ** * ") model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_model_file = os.path.join( args.output_dir, "pretraining_{}.bin".format( time.strftime("%Y_%m_%d_%H_%M_%S", time.localtime()))) if args.do_train: torch.save(model_to_save.state_dict(), output_model_file)
def create_from_pretrained(bert_model_name, cache_dir): model = BertForPreTraining.from_pretrained( pretrained_model_name_or_path=bert_model_name, cache_dir=cache_dir, ) return model
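A short usage sketch for the helper above; the model name and cache directory are example values only.

import torch

# Build the pre-training model (MLM + NSP heads) from a published checkpoint and save a local copy.
model = create_from_pretrained('bert-base-uncased', cache_dir='/tmp/bert_cache')
model.eval()
torch.save(model.state_dict(), 'bert_base_uncased_pretraining.bin')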