def __init__(self, hparams):
    """
    input:
        hparams: namespace with the following items:
            'data_dir' (str): Data Directory. default: './official/ebm_nlp_1_00'
            'bioelmo_dir' (str): BioELMo Directory. default: './models/bioelmo'
            'max_length' (int): Max Length. default: 1024
            'lr' (float): Learning Rate. default: 1e-2
            'fine_tune_bioelmo' (bool): Whether to Fine Tune BioELMo. default: False
            'lr_bioelmo' (float): Learning Rate in BioELMo Fine-tuning. default: 1e-4
    """
    super().__init__(hparams)

    # Load pretrained BioBERT from a TensorFlow checkpoint
    DIR_BERT = Path(str(self.hparams.biobert_dir))
    BERT_CKPT_PATH = os.path.splitext(glob(str(DIR_BERT / '*ckpt*'))[0])[0]
    self.bertconfig = BertConfig.from_pretrained('bert-base-cased')
    self.berttokenizer = BertTokenizer.from_pretrained('bert-base-cased')
    self.biobert_for_pretraining = BertForPreTraining.from_pretrained('bert-base-cased')
    self.biobert_for_pretraining.load_tf_weights(self.bertconfig, BERT_CKPT_PATH)
    self.biobert = self.biobert_for_pretraining.bert
    self.biobert_pad_token = self.berttokenizer.pad_token
    self.biobert_output_dim = self.bertconfig.hidden_size

    # Intermediate affine layer: BERT hidden states -> tag logits
    self.hidden_to_tag = nn.Linear(int(self.biobert_output_dim), len(self.itol))
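The TF-checkpoint pattern above is self-contained enough to lift out; a minimal standalone sketch (paths are hypothetical, and `load_tf_weights` requires TensorFlow to be installed):

from transformers import BertConfig, BertForPreTraining

config = BertConfig.from_pretrained('bert-base-cased')
model = BertForPreTraining.from_pretrained('bert-base-cased')
# pass the checkpoint prefix, i.e. the .ckpt path without the .index/.data suffix
model.load_tf_weights(config, './models/biobert/biobert_model.ckpt')
bert_encoder = model.bert  # bare encoder, reusable as a feature extractor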
def build_model(cls, args, task):
    """Build a new model instance."""
    # make sure all arguments are present in older models
    base_architecture(args)
    if not hasattr(args, 'max_source_positions'):
        args.max_source_positions = DEFAULT_MAX_SOURCE_POSITIONS
    if not hasattr(args, 'max_target_positions'):
        args.max_target_positions = DEFAULT_MAX_TARGET_POSITIONS

    src_dict, tgt_dict = task.source_dictionary, task.target_dictionary
    if len(task.datasets) > 0:
        src_berttokenizer = next(iter(task.datasets.values())).berttokenizer
    else:
        src_berttokenizer = BertTokenizer.from_pretrained(args.bert_model_name)

    def build_embedding(dictionary, embed_dim, path=None):
        num_embeddings = len(dictionary)
        padding_idx = dictionary.pad()
        emb = Embedding(num_embeddings, embed_dim, padding_idx)
        # if provided, load from preloaded dictionaries
        if path:
            embed_dict = utils.parse_embedding(path)
            utils.load_embedding(embed_dict, dictionary, emb)
        return emb

    if args.share_all_embeddings:
        if src_dict != tgt_dict:
            raise ValueError('--share-all-embeddings requires a joined dictionary')
        if args.encoder_embed_dim != args.decoder_embed_dim:
            raise ValueError(
                '--share-all-embeddings requires --encoder-embed-dim to match --decoder-embed-dim')
        if args.decoder_embed_path and (args.decoder_embed_path != args.encoder_embed_path):
            raise ValueError('--share-all-embeddings not compatible with --decoder-embed-path')
        encoder_embed_tokens = build_embedding(
            src_dict, args.encoder_embed_dim, args.encoder_embed_path)
        decoder_embed_tokens = encoder_embed_tokens
        args.share_decoder_input_output_embed = True
    else:
        encoder_embed_tokens = build_embedding(
            src_dict, args.encoder_embed_dim, args.encoder_embed_path)
        decoder_embed_tokens = build_embedding(
            tgt_dict, args.decoder_embed_dim, args.decoder_embed_path)

    from_tf = getattr(args, 'from_tf', False)
    bertencoder = BertForPreTraining.from_pretrained(
        args.bert_model_name, from_tf=from_tf, output_hidden_states=True).bert
    args.bert_out_dim = bertencoder.config.hidden_size
    args.bert_num_layers = bertencoder.config.num_hidden_layers
    encoder = cls.build_encoder(args, src_dict, encoder_embed_tokens)
    decoder = cls.build_decoder(args, tgt_dict, decoder_embed_tokens)
    return BertTransformerModel(encoder, decoder, bertencoder,
                                src_berttokenizer, args.mask_cls_sep, args)
def __init__(self, BERT_PATH):
    self.config = BertConfig.from_json_file(BERT_PATH + "/bert_config.json")
    self.model = BertForPreTraining.from_pretrained(
        BERT_PATH + "/bert_model.ckpt", from_tf=True, config=self.config)
    self.tokenizer = BertTokenizer(BERT_PATH + "/vocab.txt")
    self.model.eval()
    self.model.cuda(args.gpu_id)  # relies on a module-level `args`
def load_pretrained_bert(config: BertConfig, model_path: str):
    if model_path.endswith(".index"):
        # TensorFlow checkpoint: load through BertForPreTraining, keep only the encoder
        bert_model = BertForPreTraining.from_pretrained(
            model_path, config=config, from_tf=True).bert
    elif model_path.endswith(".pth"):
        bert_model = BertModel.from_pretrained(model_path, config=config)
    else:
        raise ValueError(f"Wrong model path ({model_path})")
    return bert_model
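A hedged usage sketch for load_pretrained_bert; the checkpoint paths are hypothetical:

from transformers import BertConfig

config = BertConfig.from_pretrained('bert-base-uncased')
# TF checkpoint: point at the .index file so the from_tf branch is taken
tf_encoder = load_pretrained_bert(config, './ckpt/bert_model.ckpt.index')
# PyTorch weights: point at a .pth file
pt_encoder = load_pretrained_bert(config, './ckpt/bert_model.pth')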
def __init__(self, number_of_classes=16):
    super(BERTClass, self).__init__()
    # self.bert = transformers.BertModel.from_pretrained('bert-base-uncased')
    # self.reset_weight(self.bert)
    self.bert = BertForPreTraining.from_pretrained('bert-base-uncased',
                                                   return_dict=False)
    # Replace the next-sentence-prediction head with a classification head
    self.bert.cls.seq_relationship = torch.nn.Sequential(
        torch.nn.Dropout(0.3),
        torch.nn.Linear(768, 512),
        torch.nn.Linear(512, 256),
        torch.nn.ReLU(),
        torch.nn.Linear(256, number_of_classes),
        torch.nn.Softmax(dim=-1))  # explicit dim avoids the implicit-dim deprecation
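With return_dict=False requested above, BertForPreTraining returns a plain tuple, so the replaced head can be exercised directly; a hedged sketch (dummy inputs, no trained weights for the new layers):

import torch

model = BERTClass(number_of_classes=16)
input_ids = torch.randint(0, 30522, (2, 16))  # dummy batch of 2 sequences
mlm_logits, class_probs = model.bert(input_ids)
print(class_probs.shape)  # torch.Size([2, 16]) -- softmax over the 16 classes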
def create_baseline_bert_model(self):
    model = BertForPreTraining.from_pretrained(
        self.model_weight_name,
        num_labels=self.num_labels,  # stored on the config; the pre-training heads do not use it
        output_attentions=False,     # whether the model returns attention weights
        output_hidden_states=False,  # whether the model returns all hidden states
    )
    return model
def get_bert_save_dict():
    import os
    state_path = 'data/bert-large.pt'
    if os.path.exists(state_path):
        state = torch.load(state_path)
    else:
        model = BertForPreTraining.from_pretrained(globals.bert_model)
        state = model.state_dict()
        # cache state
        torch.save(state, state_path)
    return state
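A hedged usage sketch: the cached state dict can warm-start a compatible module without re-downloading; strict=False tolerates missing or extra head parameters (custom_model is hypothetical):

state = get_bert_save_dict()
missing, unexpected = custom_model.load_state_dict(state, strict=False)
print(len(missing), len(unexpected))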
def __init__(self, args, base_model_name='bert-base-uncased'):
    super(DialogBERT, self).__init__()
    if args.language == 'chinese':
        base_model_name = 'bert-base-chinese'

    self.tokenizer = BertTokenizer.from_pretrained(base_model_name, cache_dir='./cache/')
    if args.model_size == 'tiny':
        self.encoder_config = BertConfig(vocab_size=30522, hidden_size=256,
                                         num_hidden_layers=6, num_attention_heads=2,
                                         intermediate_size=1024)
        self.utt_encoder = BertForPreTraining(self.encoder_config)
    elif args.model_size == 'small':
        self.encoder_config = BertConfig(vocab_size=30522, hidden_size=512,
                                         num_hidden_layers=8, num_attention_heads=4,
                                         intermediate_size=2048)
        self.utt_encoder = BertForPreTraining(self.encoder_config)
    else:
        self.encoder_config = BertConfig.from_pretrained(base_model_name, cache_dir='./cache/')
        self.utt_encoder = BertForPreTraining.from_pretrained(
            base_model_name, config=self.encoder_config, cache_dir='./cache/')

    # context encoder: encode context to vector
    self.context_encoder = BertModel(self.encoder_config)

    self.mlm_mode = 'mse'  # 'mdn', 'mse'
    if self.mlm_mode == 'mdn':
        self.context_mlm_trans = MixtureDensityNetwork(
            self.encoder_config.hidden_size, self.encoder_config.hidden_size, 3)
    else:
        # transform context hidden states back to utterance encodings
        self.context_mlm_trans = BertPredictionHeadTransform(self.encoder_config)

    self.dropout = nn.Dropout(self.encoder_config.hidden_dropout_prob)
    self.context_order_trans = SelfSorting(self.encoder_config.hidden_size)
    # self.context_order_trans = MLP(self.encoder_config.hidden_size, '200-200-200', 1)

    self.decoder_config = deepcopy(self.encoder_config)
    self.decoder_config.is_decoder = True
    self.decoder_config.add_cross_attention = True
    self.decoder = BertLMHeadModel(self.decoder_config)
def _load_google_checkpoint(self):
    logger.info('Loading Checkpoint from Google for Pre training')
    download_and_extract(self.google_checkpoint_location, './')
    checkpoint_dir = os.path.join('./', self.google_checkpoint_root)
    config_location = os.path.join(checkpoint_dir, 'bert_config.json')
    index_location = os.path.join(checkpoint_dir, 'bert_model.ckpt.index')
    logger.info(f'Config file: {config_location}. Index file: {index_location}')
    config = BertConfig.from_json_file(config_location)
    self.bert = BertForPreTraining.from_pretrained(index_location,
                                                   config=config, from_tf=True)
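For reference, the directory layout this loader assumes after download_and_extract, mirroring the standard Google BERT release (hedged; exact file names can differ per checkpoint):

# <google_checkpoint_root>/
#   bert_config.json
#   bert_model.ckpt.index
#   bert_model.ckpt.data-00000-of-00001
#   vocab.txt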
def from_pretrained(self, model_dir):
    self.encoder_config = BertConfig.from_pretrained(model_dir)
    self.tokenizer = BertTokenizer.from_pretrained(
        path.join(model_dir, 'tokenizer'), do_lower_case=args.do_lower_case)
    self.utt_encoder = BertForPreTraining.from_pretrained(
        path.join(model_dir, 'utt_encoder'))
    self.context_encoder = BertForSequenceClassification.from_pretrained(
        path.join(model_dir, 'context_encoder'))
    self.context_mlm_trans = BertPredictionHeadTransform(self.encoder_config)
    self.context_mlm_trans.load_state_dict(
        torch.load(path.join(model_dir, 'context_mlm_trans.pkl')))
    self.context_order_trans = SelfSorting(self.encoder_config.hidden_size)
    self.context_order_trans.load_state_dict(
        torch.load(path.join(model_dir, 'context_order_trans.pkl')))
    self.decoder_config = BertConfig.from_pretrained(model_dir)
    self.decoder = BertLMHeadModel.from_pretrained(path.join(model_dir, 'decoder'))
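A hedged sketch of the save-side counterpart implied by this loader, writing exactly the layout that from_pretrained above reads back (method name assumed):

def save_pretrained(self, model_dir):
    self.encoder_config.save_pretrained(model_dir)  # read back from the model_dir root above
    self.tokenizer.save_pretrained(path.join(model_dir, 'tokenizer'))
    self.utt_encoder.save_pretrained(path.join(model_dir, 'utt_encoder'))
    self.context_encoder.save_pretrained(path.join(model_dir, 'context_encoder'))
    torch.save(self.context_mlm_trans.state_dict(),
               path.join(model_dir, 'context_mlm_trans.pkl'))
    torch.save(self.context_order_trans.state_dict(),
               path.join(model_dir, 'context_order_trans.pkl'))
    self.decoder.save_pretrained(path.join(model_dir, 'decoder'))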
def test():
    bert_model_path = '../checkpoints/bert-base-chinese/'   # pytorch_model.bin
    bert_config_path = '../checkpoints/bert-base-chinese/'  # bert_config.json
    vocab_path = '../checkpoints/bert-base-chinese/vocab.txt'
    tokenizer = BertTokenizer.from_pretrained(vocab_path)
    # model = BertModel.from_pretrained(bert_model_path, config=bert_config_path)
    model = BertForPreTraining.from_pretrained(bert_model_path, config=bert_config_path)
    text_batch = ["哈哈哈", "嘿嘿嘿", "嘿嘿嘿", "嘿嘿嘿"]
    encoding = tokenizer(text_batch, return_tensors='pt', padding=True, truncation=True)
    input_ids = encoding['input_ids']
    print(input_ids)
    print(input_ids.shape)
    # On transformers<4 the model returns a tuple by default:
    # (prediction_logits, seq_relationship_logits)
    output1, output2 = model(input_ids)
    print(output1)
    print(output2)
    print(output1.shape)
    print(output2.shape)
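On transformers >= 4.x the same call returns a BertForPreTrainingOutput object instead of a tuple, so the tuple unpacking above breaks; a hedged version-proof variant:

outputs = model(input_ids, return_dict=True)
print(outputs.prediction_logits.shape)        # (batch, seq_len, vocab_size)
print(outputs.seq_relationship_logits.shape)  # (batch, 2)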
def __init__(self, pretrained_model, tokenizer_name_or_path: str, data_dir: str,
             batch_size: int, max_train_examples: int = None,
             max_eval_examples: int = None,
             train_strategy='train-all-lexical') -> None:
    super(LexicalTrainingModel, self).__init__()
    self.save_hyperparameters()
    if pretrained_model.startswith('google-checkpoint'):
        self._load_google_checkpoint()
    else:
        self.bert = BertForPreTraining.from_pretrained(pretrained_model)
    self.tokenizer = BertTokenizer.from_pretrained(tokenizer_name_or_path)
    self.__setup_lexical_for_training()
    self.train_dataset = None
    self.eval_dataset = None
    self.test_dataset = None
def main_worker(gpu, ngpus_per_node, args):
    global best_acc1
    args.gpu = gpu

    random.seed(args.seed)
    torch.manual_seed(args.seed)
    cudnn.deterministic = True

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                                world_size=args.world_size, rank=args.rank)

    # create model
    print("=> creating model 'bert'")
    model = BertForPreTraining.from_pretrained('bert-base-uncased', return_dict=True)

    if not torch.cuda.is_available():
        print('using CPU, this will be slow')
    elif args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            # args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
    else:
        model = torch.nn.DataParallel(model).cuda()

    # define loss function (criterion) and optimizer
    criterion = BertPretrainingCriterion(vocab_size)  # vocab_size is defined at module level in the source
    optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            if args.gpu is None:
                checkpoint = torch.load(args.resume)
            else:
                # Map model to be loaded to specified single gpu.
                loc = 'cuda:{}'.format(args.gpu)
                checkpoint = torch.load(args.resume, map_location=loc)
            args.start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_acc1']
            if args.gpu is not None:
                # best_acc1 may be from a checkpoint from a different GPU
                best_acc1 = best_acc1.to(args.gpu)
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True
    args.max_predictions_per_seq = 80

    # Data loading code
    traindir = os.path.join(args.data)
    epoch = 0
    training_steps = 0

    writer = None
    enable_tensorboard = args.rank <= 0
    if enable_tensorboard:
        if args.rank == -1:  # No DDP
            writer = SummaryWriter(comment='_bert_no_ddp_' + args.data)
        else:
            writer = SummaryWriter(comment='_bert_' + args.dist_backend + '_' +
                                   str(args.world_size) + 'GPUs_' + args.data)

    train_raw_start = time.time()
    while True:
        batch_time = AverageMeter('Time', ':6.3f')
        data_time = AverageMeter('Data', ':6.3f')
        example_speed = AverageMeter('Speed', ':6.3f')
        losses = AverageMeter('Loss', ':.4e')

        files = [os.path.join(traindir, f) for f in os.listdir(traindir)
                 if os.path.isfile(os.path.join(traindir, f)) and 'training' in f]
        files.sort()
        num_files = len(files)
        random.Random(args.seed + epoch).shuffle(files)
        f_start_id = 0

        if torch.distributed.is_initialized() and get_world_size() > num_files:
            remainder = get_world_size() % num_files
            data_file = files[(f_start_id * get_world_size() + get_rank() +
                               remainder * f_start_id) % num_files]
        else:
            data_file = files[(f_start_id * get_world_size() + get_rank()) % num_files]
        previous_file = data_file

        train_data = pretraining_dataset(data_file, args.max_predictions_per_seq)
        if args.distributed:
            train_sampler = torch.utils.data.distributed.DistributedSampler(
                train_data, shuffle=False)
        else:
            train_sampler = torch.utils.data.RandomSampler(train_data)
        train_dataloader = torch.utils.data.DataLoader(
            train_data, sampler=train_sampler, batch_size=args.batch_size,
            num_workers=4, pin_memory=True)

        pool = ProcessPoolExecutor(1)
        shared_file_list = {}
        for f_id in range(f_start_id + 1, len(files)):
            if get_world_size() > num_files:
                data_file = files[(f_id * get_world_size() + get_rank() +
                                   remainder * f_id) % num_files]
            else:
                data_file = files[(f_id * get_world_size() + get_rank()) % num_files]
            previous_file = data_file

            # prefetch the next shard while training on the current one
            dataset_future = pool.submit(create_pretraining_dataset, data_file,
                                         args.max_predictions_per_seq,
                                         shared_file_list, args)
            train_iter = train_dataloader
            end = time.time()
            progress = ProgressMeter(
                len(train_iter),
                [batch_time, data_time, example_speed, losses],
                prefix="Epoch: [{}]".format(epoch))

            for step, batch in enumerate(train_iter):
                training_steps += 1
                batch = [t.to(args.gpu) for t in batch]
                input_ids, segment_ids, input_mask, masked_lm_labels, next_sentence_labels = batch

                outputs = model(input_ids=input_ids, token_type_ids=segment_ids,
                                attention_mask=input_mask)
                prediction_scores = outputs.prediction_logits
                seq_relationship_score = outputs.seq_relationship_logits
                loss = criterion(prediction_scores, seq_relationship_score,
                                 masked_lm_labels, next_sentence_labels)
                losses.update(loss.item())

                # compute gradient and do SGD step
                # optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                for param in model.parameters():
                    param.grad = None

                # measure elapsed time
                elapsed_time = time.time() - end
                batch_time.update(elapsed_time)
                end = time.time()
                speed = len(batch[0]) / elapsed_time
                example_speed.update(speed)

                global global_steps
                global global_examples
                global_examples += len(batch[0])
                global_steps += 1

                if step % args.print_freq == 0:
                    progress.display(step)
                    if writer is not None:
                        writer.add_scalar('loss/step', loss.item(), global_steps)
                        writer.add_scalar('speed/step', speed, global_steps)

                if global_steps >= (args.max_step / abs(args.world_size)):
                    break

            if global_steps >= (args.max_step / abs(args.world_size)):
                break

            del train_dataloader
            train_dataloader, data_file = dataset_future.result(timeout=None)

        now = time.time()
        print('Global Steps: ' + str(global_steps))
        print('Total Examples: ' + str(global_examples))
        print('Train duration: ' + str(now - train_raw_start))
        print('Example/Sec: ' + str(global_examples / (now - train_raw_start)))

        epoch += 1
        if epoch >= args.epochs:
            break

    if writer is not None:
        writer.add_scalar('overall_speed/step',
                          global_examples / (now - train_raw_start), global_steps)
        writer.close()
def load_annotations(self, proposal_method, **kwargs):
    logger = logging.getLogger("vmr.trainer")
    logger.info("Preparing data from file {}, please wait...".format(self.anno_file))
    self.annos = []
    self.gts = []
    word2vec_cache_prefix = os.path.splitext(self.anno_file)[0]
    word2vec_cache_file = '{}_word2vec_{}.pkl'.format(word2vec_cache_prefix, self.word2vec)

    # Define word embedding function
    if os.path.exists(word2vec_cache_file):
        annos_original = None
        # Load word embeddings cache if it exists
        logger.info("Word2vec cache exists, load cache file.")
        with open(word2vec_cache_file, 'rb') as F:
            self.annos_query = pickle.load(F)

        def word_embedding(idx, sentence):
            assert self.annos_query[idx]['sentence'] == sentence, \
                'annotation file {} has been modified, cache file expired!'.format(self.anno_file)
            return self.annos_query[idx]['query'], self.annos_query[idx]['wordlen']
    else:
        annos_original = []
        # Compute word embeddings if there is no cache
        if self.word2vec == 'BERT':
            # Here we use the second-to-last hidden layer.
            # See 3.5 Pooling Strategy & Layer Choice in
            # https://mccormickml.com/2019/05/14/BERT-word-embeddings-tutorial/#3-extracting-embeddings
            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
            bert = BertForPreTraining.from_pretrained('bert-base-uncased', return_dict=True)
            bert.to('cuda')

            def word_embedding(idx, sentence):
                sentence_tokenized = tokenizer(sentence, return_tensors="pt")  # token_num = sentence_num + 2
                for key in sentence_tokenized:
                    sentence_tokenized[key] = sentence_tokenized[key].to('cuda')
                with torch.no_grad():
                    query = bert(**sentence_tokenized,
                                 output_hidden_states=True)['hidden_states'][-2] \
                        .squeeze_().to('cpu')  # (token_num, 768)
                query = query[1:-1]  # strip [CLS] and [SEP]
                return query, query.size(0)  # (sentence_len, 768) including punctuation
        elif self.word2vec == 'GloVe':
            def word_embedding(idx, sentence):
                word2vec = glove_embedding(sentence)
                return word2vec, word2vec.size(0)  # (sentence_len, 300) including punctuation
        else:
            raise NotImplementedError

    # Load annotations and generate ground truth for model proposal
    logger.info("loading annotations ...")
    with open(self.anno_file, 'r') as f:
        annos = json.load(f)
    for vid, anno in tqdm(annos.items()):
        duration = anno['duration'] if self.dataset_name != 'tacos' \
            else anno['num_frames'] / anno['fps']
        # Produce annotations
        for idx in range(len(anno['timestamps'])):
            timestamp = anno['timestamps'][idx]
            sentence = anno['sentences'][idx]
            if timestamp[0] < timestamp[1]:
                moment = torch.tensor([max(timestamp[0], 0), min(timestamp[1], duration)]) \
                    if self.dataset_name != 'tacos' else torch.tensor(
                        [max(timestamp[0] / anno['fps'], 0),
                         min(timestamp[1] / anno['fps'], duration)])
                query, wordlen = word_embedding(len(self.annos), sentence)
                self.avg_wordvec += query.mean(dim=0)
                if annos_original is not None:
                    annos_original.append({
                        'sentence': sentence,
                        'query': query,
                        'wordlen': wordlen,
                    })
                adjmat = torch.tensor(anno['dependency_parsing_graph'][idx]) if self.dep_graph else None
                if self.consti_mask:
                    constimask = torch.tensor(anno['constituency_parsing_mask'][idx],
                                              dtype=torch.float32)
                    # The original tree is ordered from root to leaf
                    layers = torch.linspace(constimask.size(0) - 1, 0, self.tree_depth).long()
                    constimask = constimask[layers, :, :]
                else:
                    constimask = None
                if self.dep_graph:
                    padding = query.size(0) - adjmat.size(0)
                    adjmat = torch.nn.functional.pad(adjmat, (0, padding, 0, padding),
                                                     "constant", 0)
                if wordlen >= self.max_num_words:
                    wordlen = self.max_num_words
                    query = query[:self.max_num_words]
                    adjmat = adjmat[:self.max_num_words, :self.max_num_words] if self.dep_graph else None
                elif self.fix_num_words:
                    padding = self.max_num_words - wordlen
                    query = torch.nn.functional.pad(query, (0, 0, 0, padding), "constant", 0)
                    # print('padded:', query.shape)
                    if self.dep_graph:
                        padding = self.max_num_words - adjmat.size(0)
                        adjmat = torch.nn.functional.pad(adjmat, (0, padding, 0, padding),
                                                         "constant", 0)
                self.annos.append({
                    'vid': vid,
                    'moment': moment,
                    'sentence': sentence,
                    'query': query,
                    'querymask': torch.ones(wordlen, dtype=torch.int32),
                    'adjmat': adjmat,
                    'constimask': constimask,
                    'wordlen': wordlen,
                    'duration': duration,
                })
                gt_dict = self.__generate_ground_truth__(moment, duration,
                                                         proposal_method, **kwargs)
                self.gts.append(gt_dict)

    self.avg_wordvec /= len(self.annos)
    if not os.path.exists(word2vec_cache_file):
        with open(word2vec_cache_file, 'wb') as F:
            word2vec_cache = [{
                'sentence': anno['sentence'],
                'query': anno['query'],
                'wordlen': anno['wordlen'],
            } for anno in annos_original]
            pickle.dump(word2vec_cache, F)

    # Load visual features if in_memory
    if self.in_memory:
        logger.info("Loading visual features from {}, please wait...".format(self.feat_file))
        self.feats, self.seglens = video2feats(self.feat_file, annos.keys(),
                                               self.num_segments, self.dataset_name,
                                               self.upsample)
    logger.info("Dataset prepared!")
# WORDS and train_loader are defined elsewhere in the source. The snippet was
# scrambled in extraction (the loop body appeared before the setup); it is
# reassembled here in a coherent order.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
config = BertConfig(vocab_size=len(WORDS) + 1)  # note: not passed to from_pretrained below
model = BertForPreTraining.from_pretrained('bert-base-chinese')
model = model.to(device)
# model = nn.DataParallel(model, device_ids=[0, 1])
optim = torch.optim.Adam(model.parameters(), lr=2e-5)
criterion = nn.CrossEntropyLoss()
NUM_EPOCHS = 5

for epoch in range(NUM_EPOCHS):
    pbar = tqdm(train_loader)
    losses = []
    for data_label in pbar:
        data = data_label[0]
        next_sentence_label = data_label[1].to(device).long()
        input_ids = data['input_ids'].to(device).long()
        token_type_ids = data['token_type_ids'].to(device).long()
        attention_mask = data['attention_mask'].to(device).long()
        labels = data['bert_label'].to(device).long()
        optim.zero_grad()
        outputs = model(input_ids=input_ids, token_type_ids=token_type_ids,
                        attention_mask=attention_mask, labels=labels,
                        next_sentence_label=next_sentence_label)
        loss = outputs['loss']
        # The flattened source does not show backward/step; they are assumed
        # here so the loop actually updates the model.
        loss.backward()
        optim.step()
        losses.append(loss.cpu().detach().numpy())
    # Mean loss over the epoch ('return loss' in the source suggests this
    # originally lived inside a function).
    loss = np.mean(losses)
    print('epoch {}: mean loss {:.4f}'.format(epoch, loss))
import json

import torch
from transformers import BertTokenizer, BertForPreTraining, BertForSequenceClassification
from tqdm import tqdm, trange
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler
from transformers.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
from transformers.optimization import AdamW, get_linear_schedule_with_warmup

max_length = 100
k = 10
device = "cpu"
pretrained_weights = '/data5/private/suyusheng/task_selecte/bert-base-uncased-128/'
tokenizer = BertTokenizer.from_pretrained(pretrained_weights, do_lower_case=True)
fine_tuned_weight = '/data5/private/suyusheng/task_selecte/output_finetune/pytorch_model.bin_1314'
model = BertForPreTraining.from_pretrained(pretrained_weights,
                                           output_hidden_states=True,
                                           return_dict=True)
model.load_state_dict(torch.load(fine_tuned_weight), strict=False)
model.to(device)

# out_CLS = torch.load("/data5/private/suyusheng/task_selecte/data/open_domain_preprocessed/opendomain_CLS.pt")
out_CLS = torch.load("/data5/private/suyusheng/task_selecte/data/open_domain_preprocessed/opendomain_CLS_res.pt")
out_CLS = out_CLS.to(device)

# with open("/data5/private/suyusheng/task_selecte/data/open_domain_preprocessed/opendomain.json") as f:
with open("/data5/private/suyusheng/task_selecte/data/open_domain_preprocessed/opendomain_res.json") as f:
    out_data = json.load(f)
with open("../data/restaurant/train.json") as f:
    data = json.load(f)

for index, d in enumerate(tqdm(data)):
    ...  # loop body truncated in the source
def __init__(self, args):
    super().__init__()
    self.args = args
    self.use_bert = args.bert
    self.pad_token_id = args.pad_token_id
    self.concat = args.concat

    if self.use_bert:
        # from transformers import AutoModel
        # self.bert = AutoModel.from_pretrained("bert-base-uncased", output_hidden_states=True)
        from transformers import BertConfig, BertForPreTraining
        if args.finetune:
            config = BertConfig.from_json_file('./bert/bert_tiny_finetune/bert_config.json')
            self.bert = BertForPreTraining.from_pretrained(
                './bert/bert_tiny_finetune/bert_model.ckpt.index',
                from_tf=True, config=config)
        else:
            config = BertConfig.from_json_file('./bert/bert_tiny/bert_config.json')
            self.bert = BertForPreTraining.from_pretrained(
                './bert/bert_tiny/bert_model.ckpt.index',
                from_tf=True, config=config)
        # keep the weights of the pre-trained encoder frozen
        for param in self.bert.base_model.parameters():
            param.requires_grad = False
        # bert-base-uncased has embedding dim = 768, tiny = 128
        if self.concat:
            args.embedding_dim = 256
        else:
            args.embedding_dim = 128
    else:
        # Initialize embedding layer (1)
        self.embedding = nn.Embedding(args.vocab_size, args.embedding_dim)
        # Initialize Context2Query (2)
        self.aligned_att = AlignedAttention(args.embedding_dim)

    rnn_cell = nn.LSTM if args.rnn_cell_type == 'lstm' else nn.GRU

    # Initialize passage encoder (3)
    self.passage_rnn = rnn_cell(
        args.embedding_dim * 2,
        args.hidden_dim,
        bidirectional=args.bidirectional,
        batch_first=True,
    )
    # Initialize question encoder (4)
    self.question_rnn = rnn_cell(
        args.embedding_dim,
        args.hidden_dim,
        bidirectional=args.bidirectional,
        batch_first=True,
    )
    self.dropout = nn.Dropout(self.args.dropout)

    # Adjust hidden dimension if bidirectional RNNs are used
    _hidden_dim = (args.hidden_dim * 2 if args.bidirectional else args.hidden_dim)

    # Initialize attention layer for question attentive sum (5)
    self.question_att = SpanAttention(_hidden_dim)
    # Initialize bilinear layer for start positions (6)
    self.start_output = BilinearOutput(_hidden_dim, _hidden_dim)
    # Initialize bilinear layer for end positions (7)
    self.end_output = BilinearOutput(_hidden_dim, _hidden_dim)
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir", default=None, type=str, required=True,
                        help="The input data dir. Should contain the files for bert pretraining.")
    parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
                        help="Path to pre-trained model")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model checkpoints will be written.")

    ## Other parameters
    # parser.add_argument("--config_name", default="", type=str,
    #                     help="Pretrained config name or path if not the same as model_name")
    # parser.add_argument("--tokenizer_name", default="", type=str,
    #                     help="Pretrained tokenizer name or path if not the same as model_name")
    parser.add_argument("--cache_dir", default="", type=str,
                        help="Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument("--max_seq_length", default=128, type=int,
                        help="The maximum total input sequence length after tokenization. Sequences longer "
                             "than this will be truncated, sequences shorter will be padded.")
    # parser.add_argument("--do_eval", action="store_true",
    #                     help="Whether to run eval on the dev set.")
    # parser.add_argument("--do_predict", action="store_true",
    #                     help="Whether to run predictions on the test set.")
    parser.add_argument("--do_lower_case", action="store_true",
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument("--gradient_accumulation_steps", type=int, default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay", default=0.0, type=float,
                        help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float,
                        help="Max gradient norm.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--max_steps", default=-1, type=int,
                        help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
    parser.add_argument("--warmup_steps", default=0, type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument("--logging_steps", type=int, default=1000,
                        help="Log every X updates steps.")
    parser.add_argument("--save_steps", type=int, default=50,
                        help="Save checkpoint every X updates steps.")
    # parser.add_argument("--eval_all_checkpoints", action="store_true",
    #                     help="Evaluate all checkpoints starting with the same prefix as model_name and ending with step number")
    parser.add_argument("--no_cuda", action="store_true",
                        help="Avoid using CUDA when available")
    parser.add_argument("--overwrite_output_dir", action="store_true",
                        help="Overwrite the content of the output directory")
    parser.add_argument("--overwrite_cache", action="store_true",
                        help="Overwrite the cached training and evaluation sets")
    parser.add_argument("--seed", type=int, default=42,
                        help="random seed for initialization")
    parser.add_argument("--fp16", action="store_true",
                        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
    parser.add_argument("--fp16_opt_level", type=str, default="O1",
                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. "
                             "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="For distributed training: local_rank")
    parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.")
    parser.add_argument("--server_port", type=str, default="", help="For distant debugging.")
    parser.add_argument("--yago_reference", action="store_true",
                        help="Use Reference of Yago types as additional inputs.")
    parser.add_argument("--start_task_id", type=int, default=0)
    parser.add_argument("--skip_steps", type=int, default=-1)
    parser.add_argument("--skip_global_steps", type=int, default=-1)
    parser.add_argument("--load_checkpoint", type=str, default="")
    args = parser.parse_args()

    if 'uncased' in args.model_name_or_path:
        args.do_lower_case = True
    else:
        args.do_lower_case = False

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and not args.overwrite_output_dir:
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome."
            .format(args.output_dir))

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)

    # Set seed
    set_seed(args)

    # Use cross entropy ignore index as padding label id so that only real label ids contribute to the loss later
    pad_token_label_id = CrossEntropyLoss().ignore_index

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()

    # args.model_type = args.model_type.lower()
    config_class, model_class, tokenizer_class = (BertConfig, BertForPreTraining, BertTokenizer)
    # config = BertConfig()
    # bertconfig is kept live here since `config = bertconfig` is used below
    bertconfig = BertConfig.from_pretrained(
        'bert-base-uncased', do_lower_case=args.do_lower_case,
        cache_dir='/work/smt2/qfeng/Project/huggingface/pretrain/base_uncased')
    # tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
    #                                             do_lower_case=args.do_lower_case,
    #                                             cache_dir=args.cache_dir if args.cache_dir else None)
    tokenizer = BertTokenizer.from_pretrained(
        'bert-base-uncased', do_lower_case=True,
        cache_dir='/work/smt2/qfeng/Project/huggingface/pretrain/base_uncased')
    # model = model_class.from_pretrained(args.model_name_or_path,
    #                                     from_tf=bool(".ckpt" in args.model_name_or_path),
    #                                     config=config,
    #                                     cache_dir=args.cache_dir if args.cache_dir else None)
    if args.load_checkpoint == "":
        if not args.yago_reference:
            config = bertconfig
            model = BertForPreTraining(config)
        else:
            config = YagoRefBertConfig.from_pretrained(
                'bert-base-uncased' if args.do_lower_case else 'bert-base-cased',
                reference_size=REFERENCE_SIZE,
                cache_dir='/work/smt2/qfeng/Project/huggingface/pretrain/base_{}'.format(
                    'uncased' if args.do_lower_case else 'cased'))
            model = YagoRefBertForPreTraining(config)
    else:
        if 'step' in args.load_checkpoint.split('/')[-1] and args.skip_global_steps is not None:
            assert args.load_checkpoint.endswith(str(args.skip_global_steps))
        if not args.yago_reference:
            config = BertConfig.from_pretrained(args.load_checkpoint)
            model = BertForPreTraining.from_pretrained(args.load_checkpoint)
        else:
            config = YagoRefBertConfig.from_pretrained(args.load_checkpoint)
            model = YagoRefBertForPreTraining.from_pretrained(args.load_checkpoint)

    if args.local_rank == 0:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()

    model.to(args.device)
    logger.info("Training/evaluation parameters %s", args)

    # Training
    # train_dataset = load_and_cache_examples(args, tokenizer, pad_token_label_id, mode="train")
    # TODO: needs a total rewrite
    # pickle_list = assign_pickles(args, args.start_task_id)
    # train_dataset = load_and_cache_examples(args, tokenizer, pickle_list)
    global_step, tr_loss = train(args, model=model, tokenizer=tokenizer,
                                 pad_token_label_id=pad_token_label_id)
    logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

    # Saving best-practices: if you use default names for the model, you can reload it using from_pretrained()
    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        # Create output directory if needed
        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(args.output_dir)
        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        model_to_save = model.module if hasattr(model, "module") else model  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)
        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))

    # Evaluation TODO: needs a rewrite (commented-out block truncated in the source)
    """
def __init__(self, metadata, timer, is_ZH, data_manager):
    super().__init__()
    self.timer = timer
    self.timer("bert-init")
    self.batch_per_train = 50
    self.batch_size_eval = 64
    self.max_seq_len = 301
    self.batch_size = 48
    self.weight_decay = 0
    self.learning_rate = 5e-5
    self.adam_epsilon = 1e-8
    self.max_grad_norm = 1.
    self.total_epoch = 100
    self.logging_step = -1
    self.warmup_steps = 0
    self.metadata = metadata
    self.num_class = self.metadata.get_output_size()

    self.bert_folder = extract_bert_model()
    bertConfig = BertConfig.from_json_file(self.bert_folder + '/config.json')
    self.model = BertClassification(None, bertConfig, self.num_class)
    self.bertTokenizer = BertTokenizer.from_pretrained(self.bert_folder)
    bertModel = BertForPreTraining.from_pretrained(
        self.bert_folder, num_labels=self.num_class, from_tf=(BERT_V == 0))
    self.model.bert = bertModel.bert
    del bertModel.cls
    self.model.to(torch.device("cuda"))

    self.data = data_manager
    self.data.add_pipeline(BertPipeline(is_ZH, metadata, self.bertTokenizer,
                                        max_length=self.max_seq_len))
    self.train_data_loader = None
    self.test_data_loader = None
    self.valid_data_loader = None
    self.done_training = False
    self.estimate_time_per_batch = None
    self.estimate_valid_time = None
    self.estimate_test_time = None

    # init optimizer and scheduler
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in self.model.named_parameters()
                       if not any(nd in n for nd in no_decay)],
            "weight_decay": self.weight_decay,
        },
        {
            "params": [p for n, p in self.model.named_parameters()
                       if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    self.optimizer = AdamW(optimizer_grouped_parameters, lr=self.learning_rate,
                           eps=self.adam_epsilon)
    self.scheduler = get_linear_schedule_with_warmup(
        self.optimizer, num_warmup_steps=self.warmup_steps,
        num_training_steps=self.total_epoch * self.batch_per_train)
    # first, we only train the classifier
    self.optimizer_only_classifier = optim.Adam(self.model.classifier.parameters(), 0.0005)
    self.place = 'cpu'
    self.timer("bert-init")
    print('[bert init] time cost: %.2f' % (self.timer.accumulation["bert-init"]))
log.write("tokenizer loaded with custom vocabulary of size %d \n" % len(tokenizer))

# Datasets
train_dataset = QuestMLMDataset(train_df, tokenizer, target_cols=TARGETS)
val_dataset = QuestMLMDataset(test_df, tokenizer, target_cols=TARGETS)

# Load Model
config = BertConfig.from_json_file(str(path_to_ckpt_config / "config.json"))
model = BertPretrain(config, len(TARGETS))
model = model.cuda()
log.write("model loaded")

# Token embeddings of new tokens
orig_bert = BertForPreTraining.from_pretrained("bert-base-cased")
orig_tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
state_dict = orig_bert.state_dict()
del state_dict["cls.predictions.decoder.weight"], \
    state_dict["cls.predictions.bias"], \
    state_dict["cls.predictions.decoder.bias"]
orig_embedding = state_dict["bert.embeddings.word_embeddings.weight"]
extra_tokens = list(tokenizer.vocab.keys())[len(orig_tokenizer.vocab):]
new_tokens_as_orig_indices = [[i] for i in range(len(orig_tokenizer.vocab))] + [
    orig_tokenizer.encode(t, add_special_tokens=False) for t in extra_tokens]
new_embedding = torch.zeros(len(new_tokens_as_orig_indices), orig_embedding.shape[-1])
new_embedding.normal_(mean=0.0, std=0.02)
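One common way to finish this initialisation (hedged; an assumption about the intent, not shown in the source) is to average each new token's original-wordpiece vectors, which is exactly what new_tokens_as_orig_indices enables, and then write the result back into the state dict:

for i, orig_indices in enumerate(new_tokens_as_orig_indices):
    if orig_indices:  # rows with no decomposition keep their random init
        new_embedding[i] = orig_embedding[orig_indices].mean(dim=0)
state_dict["bert.embeddings.word_embeddings.weight"] = new_embedding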
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if data_args.eval_data_file is None and training_args.do_eval:
        raise ValueError(
            "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
            "or remove the --do_eval argument."
        )

    if (
        os.path.exists(training_args.output_dir)
        and os.listdir(training_args.output_dir)
        and training_args.do_train
        and not training_args.overwrite_output_dir
    ):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    if model_args.config_name:
        config = AutoConfig.from_pretrained(model_args.config_name, cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        config = AutoConfig.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir)
    else:
        config = CONFIG_MAPPING[model_args.model_type]()
        logger.warning("You are instantiating a new config instance from scratch.")

    if model_args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported, but you can do it "
            "from another script, save it, and load it from here, using --tokenizer_name"
        )

    if model_args.model_name_or_path:
        model = BertForPreTraining.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
        )
    else:
        logger.info("Training new model from scratch")
        # BertForPreTraining has no from_config helper; instantiate from the config directly
        model = BertForPreTraining(config)

    if model_args.cls_model_name_or_path:
        cls_config = AutoConfig.from_pretrained(
            model_args.cls_model_name_or_path,
            num_labels=2,
            finetuning_task="cola",
            cache_dir=model_args.cache_dir,
        )
        cls_model = AutoModelForSequenceClassification.from_pretrained(
            model_args.cls_model_name_or_path,
            from_tf=bool(".ckpt" in model_args.cls_model_name_or_path),
            config=cls_config,
            cache_dir=model_args.cache_dir,
        )
        cls_model.resize_token_embeddings(len(tokenizer))
        mask_selector = MaskSelector(cls_model, training_args)

    model.resize_token_embeddings(len(tokenizer))

    if config.model_type in ["bert", "roberta", "distilbert", "camembert"] and not data_args.mlm:
        raise ValueError(
            "BERT and RoBERTa-like models do not have LM heads but masked LM heads. They must be run "
            "using the --mlm flag (masked language modeling)."
        )

    if data_args.block_size <= 0:
        # Our input block size will be the max possible for the model
        data_args.block_size = tokenizer.max_len
    else:
        data_args.block_size = min(data_args.block_size, tokenizer.max_len)

    # Get datasets
    train_dataset = (get_dataset(data_args, tokenizer=tokenizer, model_args=model_args,
                                 cache_dir=model_args.cache_dir)
                     if training_args.do_train else None)
    eval_dataset = (get_dataset(data_args, model_args=None, tokenizer=tokenizer, evaluate=True)
                    if training_args.do_eval else None)
    data_collator = DataCollatorForMixLM(
        tokenizer=tokenizer, mlm=data_args.mlm, mlm_probability=data_args.mlm_probability
    )

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        prediction_loss_only=True,
    )

    # Training
    if training_args.do_train:
        model_path = (
            model_args.model_name_or_path
            if model_args.model_name_or_path is not None and os.path.isdir(model_args.model_name_or_path)
            else None
        )
        trainer.train(model_path=model_path)
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")
        eval_output = trainer.evaluate()
        perplexity = math.exp(eval_output["eval_loss"])
        result = {"perplexity": perplexity}

        output_eval_file = os.path.join(training_args.output_dir, "eval_results_lm.txt")
        if trainer.is_world_master():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key in sorted(result.keys()):
                    logger.info("  %s = %s", key, str(result[key]))
                    writer.write("%s = %s\n" % (key, str(result[key])))

        results.update(result)

    return results
def train():
    logger.info('*' * 64)
    logger.info('token:%s' % current_time)
    logger.info('*' * 64)

    parser = ArgumentParser()
    parser.add_argument("--train_file", type=str, default="./my_test/data/student/part1.txt",
                        help="Path or url of the dataset. If empty download from S3.")
    parser.add_argument("--dataset_cache", type=str, default='./cache/',
                        help="Path or url of the dataset cache")
    parser.add_argument("--batch_size", type=int, default=2, help="Batch size for validation")
    parser.add_argument("--gradient_accumulation_steps", type=int, default=1,
                        help="Accumulate gradients on several steps")
    parser.add_argument("--lr", type=float, default=6.25e-4, help="Learning rate")
    # parser.add_argument("--train_precent", type=float, default=0.7, help="Batch size for validation")
    parser.add_argument("--n_epochs", type=int, default=1, help="Number of training epochs")
    parser.add_argument("--device", type=str,
                        default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    # parser.add_argument("--max_norm", type=float, default=1.0, help="Clipping gradient norm")
    parser.add_argument("--log_step", type=int, default=1,
                        help="Multiple-choice loss coefficient")
    parser.add_argument("--base_model", type=str, default="bert-base-uncased")
    parser.add_argument("--on_memory", action='store_true',
                        help="Whether to load train samples into memory or use disk")
    parser.add_argument("--max_seq_length", default=128, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--do_lower_case", action='store_true',
                        help="Whether to lower case the input text. True for uncased models, False for cased models.")
    args = parser.parse_args()
    logger.info(args)

    device = torch.device(args.device)
    tokenizer = BertTokenizer.from_pretrained(args.base_model)
    train_dataset = BERTDataset(args.train_file, tokenizer, seq_len=args.max_seq_length,
                                corpus_lines=None, on_memory=args.on_memory)
    train_data_loader = DataLoader(train_dataset, batch_size=args.batch_size)

    model = BertForPreTraining.from_pretrained(args.base_model)
    optimizer = optim.Adam(model.parameters(), lr=args.lr)

    steps = len(train_data_loader.dataset) // train_data_loader.batch_size
    steps = steps if steps > 0 else 1
    logger.info('steps:%d' % steps)

    lr_warmup = get_cosine_schedule_with_warmup(optimizer=optimizer,
                                                num_warmup_steps=1500,
                                                num_training_steps=steps * args.n_epochs)

    multi_gpu = False  # defined up front so `update` never hits a NameError
    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        gpu_num = torch.cuda.device_count()
        gpu_list = [int(i) for i in range(gpu_num)]
        model = DataParallel(model, device_ids=gpu_list)
        multi_gpu = True
    if torch.cuda.is_available():
        model.cuda()
        # model.to(device)
        # criterion.to(device)

    def update(engine, batch):
        model.train()
        batch = tuple(t.to(device) for t in batch)  # the source did not move the batch; assumed here
        # input_ids, input_mask, segment_ids, lm_label_ids, is_next = batch
        # Old-style forward signature being targeted:
        #   input_ids, attention_mask, token_type_ids, position_ids, head_mask,
        #   inputs_embeds, masked_lm_labels, next_sentence_label
        outputs = model(input_ids=batch[0],
                        attention_mask=batch[1],
                        token_type_ids=batch[2],  # was misrouted to position_ids in the source
                        masked_lm_labels=batch[3],
                        next_sentence_label=batch[4])
        loss = outputs[0]  # total pre-training loss
        if multi_gpu:
            loss = loss.mean()
        loss.backward()
        # step on the accumulated gradients, then reset them
        if engine.state.iteration % args.gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
            lr_warmup.step()
        return loss.cpu().item()

    trainer = Engine(update)

    # def inference(engine, batch):
    #     model.eval()
    #     with torch.no_grad():
    #         input_ids = batch[0].to(device)
    #         attention_mask = batch[1].to(device)
    #         labels = batch[2].to(device)
    #         output = model(input_ids=input_ids, attention_mask=attention_mask)
    #         predict = output.permute(1, 2, 0)
    #         trg = labels.permute(1, 0)
    #         loss = criterion(predict.to(device), trg.to(device))
    #         return predict, trg
    #
    # evaluator = Engine(inference)
    # metrics = {"nll": Loss(criterion, output_transform=lambda x: (x[0], x[1])),
    #            "accuracy": Accuracy(output_transform=lambda x: (x[0], x[1]))}
    # for name, metric in metrics.items():
    #     metric.attach(evaluator, name)
    #
    # @trainer.on(Events.EPOCH_COMPLETED)
    # def log_validation_results(trainer):
    #     evaluator.run(valid_data_loader)
    #     ms = evaluator.state.metrics
    #     logger.info("Validation Results - Epoch: [{}/{}] Avg accuracy: {:.6f} Avg loss: {:.6f}"
    #                 .format(trainer.state.epoch, trainer.state.max_epochs, ms['accuracy'], ms['nll']))
    #
    # '''======================early stopping =========================='''
    # def score_function(engine):
    #     val_loss = engine.state.metrics['nll']
    #     return -val_loss
    #
    # handler = EarlyStopping(patience=5, score_function=score_function, trainer=trainer)
    # evaluator.add_event_handler(Events.COMPLETED, handler)

    '''==================print information by iterator========================='''
    @trainer.on(Events.ITERATION_COMPLETED)
    def log_training_loss(trainer):
        if trainer.state.iteration % args.log_step == 0:
            logger.info("Epoch[{}/{}] Step[{}/{}] Loss: {:.6f}".format(
                trainer.state.epoch, trainer.state.max_epochs,
                trainer.state.iteration % steps, steps,
                trainer.state.output * args.gradient_accumulation_steps))

    '''================add check point========================'''
    checkpoint_handler = ModelCheckpoint(checkpoint_dir, 'checkpoint', n_saved=3)
    trainer.add_event_handler(
        Events.EPOCH_COMPLETED, checkpoint_handler,
        {'BertClassificationModel': getattr(model, 'module', model)})  # "getattr" takes care of distributed encapsulation

    '''==============run trainer============================='''
    trainer.run(train_data_loader, max_epochs=args.n_epochs)
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters ###############
    parser.add_argument("--data_dir", default=None, type=str, required=True,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")
    parser.add_argument("--pretrain_model", default='bert-base-uncased', type=str, required=True,
                        help="Pre-trained model")
    parser.add_argument("--num_labels_task", default=None, type=int, required=True,
                        help="num_labels_task")
    parser.add_argument("--max_seq_length", default=128, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--do_train", default=False, action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval", default=False, action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_lower_case", default=False, action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size", default=32, type=int,
                        help="Total batch size for training.")
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion", default=0.1, type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda", default=False, action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--fp16', default=False, action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale', type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                             "0 (default value): dynamic loss scaling.\n"
                             "Positive power of 2: static loss scaling value.\n")
    parser.add_argument("--weight_decay", default=0.0, type=float,
                        help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float,
                        help="Max gradient norm.")
    parser.add_argument('--fp16_opt_level', type=str, default='O1',
                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. "
                             "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument("--task", default=None, type=int, required=True,
                        help="Choose Task")
    ###############
    args = parser.parse_args()

    processors = Processor_pretrain
    # num_labels = args.num_labels_task

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {}, n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
                         .format(args.gradient_accumulation_steps))

    args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train:
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    os.makedirs(args.output_dir, exist_ok=True)

    # tokenizer = BertTokenizer.from_pretrained(args.ernie_model, do_lower_case=args.do_lower_case)
    tokenizer = BertTokenizer.from_pretrained(args.pretrain_model, do_lower_case=True)

    train_examples = None
    num_train_steps = None
    # aspect_list = None
    # sentiment_list = None
    processor = processors()
    # num_labels = num_labels
    # train_examples, aspect_list, sentiment_list = processor.get_train_examples(args.data_dir)
    train_examples = processor.get_train_examples(args.data_dir)

    # aspect_list / sentiment_list are only produced by the commented-out call
    # above; tasks 1 and 2 therefore require restoring it.
    if args.task == 1:
        num_labels = len(aspect_list)
    elif args.task == 2:
        num_labels = len(sentiment_list)
    elif args.task == 0:
        print("pretrain")
        num_labels = 0
    else:
        print("What's task?")
        exit()

    num_train_steps = int(
        len(train_examples) / args.train_batch_size /
        args.gradient_accumulation_steps * args.num_train_epochs)

    # Prepare model
    model = BertForPreTraining.from_pretrained(args.pretrain_model, return_dict=True)

    # Prepare optimizer
    t_total = num_train_steps
    if args.local_rank != -1:
        t_total = t_total // torch.distributed.get_world_size()
    model.to(device)

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    # no_decay = ['bias', 'LayerNorm.weight']
    no_grad = ['bert.encoder.layer.11.output.dense_ent',
               'bert.encoder.layer.11.output.LayerNorm_ent']
    param_optimizer = [(n, p) for n, p in param_optimizer
                       if not any(nd in n for nd in no_grad)]
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': args.weight_decay},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0},
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=int(t_total * 0.1),
                                                num_training_steps=t_total)

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank,
            find_unused_parameters=True)

    global_step = 0
    if args.do_train:
        # train_features = convert_examples_to_features(train_examples, aspect_list, sentiment_list,
        #                                               args.max_seq_length, tokenizer, args.task)
        train_features = convert_examples_to_features(
            train_examples, aspect_list=None, sentiment_list=None,
            max_seq_length=args.max_seq_length, tokenizer=tokenizer, task_n=args.task)

        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)

        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        # all_masked_lm_positions = torch.tensor([f.masked_lm_positions for f in train_features], dtype=torch.long)
        all_masked_lm_labels = torch.tensor([f.masked_lm_labels for f in train_features], dtype=torch.long)

        if args.task == 1:
            print("Executing the task 1")
        elif args.task == 2:
            all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
        elif args.task == 0:
            print("Executing the task 0")
        else:
            print("Wrong here2")

        if args.task == 1:
            train_data = TensorDataset(all_input_ids, all_input_mask, all_label_ids)
        elif args.task == 2:
            train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        elif args.task == 0:
            # train_data = TensorDataset(all_input_ids, all_input_mask, all_masked_lm_positions, all_masked_lm_labels)
            train_data = TensorDataset(all_input_ids, all_input_mask, all_masked_lm_labels)
        else:
            print("Wrong here1")

        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        output_loss_file = os.path.join(args.output_dir, "loss")
        loss_fout = open(output_loss_file, 'w')
        model.train()

        ##########Pre-Process#########
        ###############################
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) if i != 3 else t for i, t in enumerate(batch))
                if args.task == 1:
                    input_ids, input_mask, label_ids = batch
                elif args.task == 2:
                    input_ids, input_mask, segment_ids, label_ids = batch
                elif args.task == 0:
                    # input_ids, input_mask, masked_lm_positions, masked_lm_labels = batch
                    input_ids, input_mask, masked_lm_labels = batch
                else:
                    print("Wrong here3")

                # Note: output.loss is only populated when the heads receive
                # labels; some transformers versions require both labels and
                # next_sentence_label for BertForPreTraining.
                if args.task == 1:
                    # loss, logits, hidden_states, attentions
                    output = model(input_ids=input_ids, token_type_ids=None,
                                   attention_mask=input_mask, labels=None)
                    loss = output.loss
                elif args.task == 2:
                    # loss, logits, hidden_states, attentions
                    output = model(input_ids=input_ids, token_type_ids=segment_ids,
                                   attention_mask=input_mask, labels=None)
                    loss = output.loss
                elif args.task == 0:
                    # loss, logits, hidden_states, attentions
                    # output = model(input_ids=input_ids, attention_mask=input_mask,
                    #                position_ids=masked_lm_positions, labels=masked_lm_labels)
                    output = model(input_ids=input_ids, attention_mask=None,
                                   position_ids=None, masked_lm_labels=masked_lm_labels)
                    loss = output.loss
                else:
                    print("Wrong!!")

                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    # optimizer.backward(loss)
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()

                loss_fout.write("{}\n".format(loss.item()))
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    # modify learning rate with special warm up BERT uses
                    if args.fp16:
                        torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                                       args.max_grad_norm)
                    else:
                        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                    optimizer.step()
                    scheduler.step()
                    model.zero_grad()
                    global_step += 1

            model_to_save = model.module if hasattr(model, 'module') else model
            output_model_file = os.path.join(args.output_dir,
                                             "pytorch_model.bin_{}".format(global_step))
            torch.save(model_to_save.state_dict(), output_model_file)
    # (a commented-out evaluation block follows in the source but is truncated)
    '''
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--train_corpus", default=None, type=str, required=True, help="The input train corpus.")
    parser.add_argument("--bert_model", default=None, type=str, required=True,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                             "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
    parser.add_argument("--output_dir", default=None, type=str, required=True, help="The output directory where the model checkpoints will be written.")

    ## Other parameters
    parser.add_argument("--max_seq_length", default=128, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. "
                             "Sequences longer than this will be truncated, and sequences shorter than this will be padded.")
    parser.add_argument("--do_train", action='store_true', help="Whether to run training.")
    parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.")
    parser.add_argument("--learning_rate", default=3e-5, type=float, help="The initial learning rate for Adam.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
    parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available")
    parser.add_argument("--on_memory", action='store_true', help="Whether to load train samples into memory or use disk")
    parser.add_argument("--do_lower_case", action='store_true', help="Whether to lower case the input text. True for uncased models, False for cased models.")
    parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus")
    parser.add_argument('--seed', type=int, default=42, help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1, help="Number of update steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale', type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True. "
                             "0 (default value): dynamic loss scaling. Positive power of 2: static loss scaling value.")
    args = parser.parse_args()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(args.gradient_accumulation_steps))
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train:
        raise ValueError("Training is currently the only implemented execution option. Please set `do_train`.")

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    if not os.path.exists(args.output_dir) and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        os.makedirs(args.output_dir)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)

    num_train_optimization_steps = None
    if args.do_train:
        print("Loading Train Dataset", args.train_corpus)
        train_dataset = BERTDataset(args.train_corpus, tokenizer, seq_len=args.max_seq_length,
                                    corpus_lines=None, on_memory=args.on_memory)
        num_train_optimization_steps = int(
            len(train_dataset) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs)
        if args.local_rank != -1:
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()

    # Prepare model
    model = BertForPreTraining.from_pretrained(args.bert_model)
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    if args.do_train:
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
        if args.fp16:
            try:
                from apex.optimizers import FP16_Optimizer
                from apex.optimizers import FusedAdam
            except ImportError:
                raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
            optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0)
            if args.loss_scale == 0:
                optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
            else:
                optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
        else:
            optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
        # the training loop calls scheduler.step() in both fp16 and fp32 paths, so create it either way
        scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=num_train_optimization_steps)

    global_step = 0
    if args.do_train:
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_dataset))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_dataset)
        else:
            # TODO: check if this works with the current data generator from disk that relies on next(file)
            # (it doesn't return items back by index)
            train_sampler = DistributedSampler(train_dataset)
        train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, lm_label_ids, is_next = batch
                # use keyword arguments: passing these positionally would map segment_ids
                # onto attention_mask with current transformers signatures
                outputs = model(input_ids, attention_mask=input_mask, token_type_ids=segment_ids,
                                masked_lm_labels=lm_label_ids, next_sentence_label=is_next)
                loss = outputs[0]
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()
                    scheduler.step()  # Update learning rate schedule
                    optimizer.zero_grad()
                    global_step += 1

    # Save a trained model
    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        logger.info("***** Saving fine-tuned model *****")
        model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)
def __init__(self, scan_encoder_class=None, scan_encoder_args={}, bert_class=None, bert_args={},
             scan_decoder_class=None, scan_decoder_args={}, task_configs=[], vocab_args={},
             loss_weighting=None, optim_class="Adam", optim_args={}, scheduler_class=None,
             scheduler_args={}, pretrained_configs=[], cuda=True, devices=[0]):
    """Multi-task model: an optional scan encoder/decoder around a BERT backbone with per-task heads."""
    super().__init__(optim_class, optim_args, scheduler_class, scheduler_args,
                     pretrained_configs, cuda, devices)
    self.encodes_scans = scan_encoder_class is not None
    if self.encodes_scans:
        self.scan_encoder = getattr(modules, scan_encoder_class)(**scan_encoder_args)
        self.scan_encoder = nn.DataParallel(self.scan_encoder, device_ids=self.devices)

    if bert_class == "BertModelPreTrained":
        self.bert = BertModel.from_pretrained(**bert_args)
    elif bert_class == "BertForPretraining":
        self.bert = BertForPreTraining.from_pretrained(**bert_args)
    elif bert_class == "BertModel":
        bert_args["config"] = BertConfig.from_dict(bert_args["config"])
        self.bert = BertModel(**bert_args)
    else:
        self.bert = getattr(modules, bert_class)(**bert_args)
    self.bert = nn.DataParallel(self.bert, device_ids=self.devices)

    self.decodes_scans = scan_decoder_class is not None
    if self.decodes_scans:
        self.scan_decoder = getattr(modules, scan_decoder_class)(**scan_decoder_args)

    self.task_heads = {}
    self.task_inputs = {}
    for task_head_config in task_configs:
        task = task_head_config["task"]
        head_class = getattr(modules, task_head_config["class"])
        args = task_head_config["args"]
        self.task_inputs[task] = task_head_config.get("inputs", "pool")
        if "config" in args:
            # bert task heads take a config object for parameters, so convert from dict
            config = args["config"]
            args["config"] = namedtuple("Config", config.keys())(*config.values())
        if head_class is BertOnlyMLMHead:
            # the MLM head is tied to the backbone's input embedding matrix
            embs = self.bert.module.embeddings.word_embeddings.weight
            self.task_heads[task] = head_class(bert_model_embedding_weights=embs, **args)
        else:
            self.task_heads[task] = head_class(**args)
    self.task_heads = torch.nn.ModuleDict(self.task_heads)

    self.vocab = WordPieceVocab(**vocab_args)
    self._build_loss(loss_weighting)
    self._post_init()
def main():
    args = get_args()
    assert args.pregenerated_data.is_dir(), \
        "--pregenerated_data should point to the folder of files made by pregenerate_training_data.py!"

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    # not parallelizing across GPUs because of deadlocks
    n_gpu = 1 if torch.cuda.device_count() > 0 else 0
    logging.info(f'device: {device} n_gpu: {n_gpu} seed: {args.seed}')
    # `handle` is assumed to be an NVML device handle created earlier,
    # e.g. via nvidia_smi.nvmlDeviceGetHandleByIndex(0)
    res = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)
    logging.info(f'mem: {res.used / (1024**2)} (MiB) ({100 * (res.used / res.total):.3f}%)')

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if args.output_dir.is_dir() and list(args.output_dir.iterdir()):
        logging.warning(f"Output directory ({args.output_dir}) already exists and is not empty!")
    args.output_dir.mkdir(parents=True, exist_ok=True)

    # Count how many epochs of pregenerated data exist and how many samples each contains.
    samples_per_epoch = []
    for i in range(args.epochs):
        epoch_file = args.pregenerated_data / f"epoch_{i}.json"
        metrics_file = args.pregenerated_data / f"epoch_{i}_metrics.json"
        if epoch_file.is_file() and metrics_file.is_file():
            metrics = json.loads(metrics_file.read_text())
            samples_per_epoch.append(metrics['num_training_examples'])
        else:
            if i == 0:
                exit("No training data was found!")
            print(f"Warning! There are fewer epochs of pregenerated data ({i}) than training epochs ({args.epochs}).")
            print("This script will loop over the available data, but training diversity may be negatively impacted.")
            num_data_epochs = i
            break
    else:
        num_data_epochs = args.epochs

    total_train_examples = 0
    for i in range(args.epochs):
        # The modulo takes into account the fact that we may loop over limited epochs of data
        total_train_examples += samples_per_epoch[i % len(samples_per_epoch)]
    num_train_optimization_steps = total_train_examples // args.train_batch_size

    # Prepare model
    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
    model = BertForPreTraining.from_pretrained(args.bert_model)
    model.to(device)

    # Prepare optimizer
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, correct_bias=False)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.num_warmup_steps,
                                                num_training_steps=num_train_optimization_steps)

    global_step = 0
    logging.info("***** Running training *****")
    logging.info(f"  Num examples = {total_train_examples}")
    logging.info("  Batch size = %d", args.train_batch_size)
    logging.info("  Num steps = %d", num_train_optimization_steps)
    model.train()
    for epoch in range(args.epochs):
        # Cache the tokenized dataset on disk so repeated runs skip the preprocessing.
        tmp_fp = f'/media/data_1/darius/data/512epoch_{epoch}_dataset_255.pkl'
        if Path(tmp_fp).is_file():
            logging.info(f'Loading dataset from {tmp_fp}...')
            with open(tmp_fp, 'rb') as f:
                epoch_dataset = pickle.load(f)
        else:
            epoch_dataset = PregeneratedDataset(epoch=epoch, training_path=args.pregenerated_data,
                                                tokenizer=tokenizer, num_data_epochs=num_data_epochs,
                                                reduce_memory=args.reduce_memory)
            with open(tmp_fp, 'wb') as f:
                pickle.dump(epoch_dataset, f, protocol=4)
        train_sampler = RandomSampler(epoch_dataset)
        train_dataloader = DataLoader(epoch_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        with tqdm(total=len(train_dataloader), desc=f"Epoch {epoch}") as pbar:
            for _, (input_ids, input_mask, segment_ids, lm_label_ids, is_next) in enumerate(train_dataloader):
                input_ids = input_ids.to(device)
                input_mask = input_mask.to(device)
                segment_ids = segment_ids.to(device)
                lm_label_ids = lm_label_ids.to(device)
                is_next = is_next.to(device)
                outputs = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask,
                                labels=lm_label_ids, next_sentence_label=is_next)
                loss = outputs.loss
                loss.backward()
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                pbar.update(1)
                mean_loss = tr_loss / nb_tr_steps
                pbar.set_postfix_str(f"Loss: {mean_loss:.5f}")
                optimizer.step()
                optimizer.zero_grad()  # was commented out: without it, gradients accumulate across all steps
                scheduler.step()
                global_step += 1

    # Save a trained model
    logging.info("***** Saving fine-tuned model *****")
    model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
    output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
    output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
    torch.save(model_to_save.state_dict(), output_model_file)
    model_to_save.config.to_json_file(output_config_file)
    tokenizer.save_vocabulary(args.output_dir)
import torch
from transformers import BertTokenizer, BertForPreTraining

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForPreTraining.from_pretrained('bert-base-uncased')

# Generate pseudo-erroneous (noise) captions and store them in the img2info dictionary.
import json
import random
import pickle
import nltk

# word_tokenize
nltk.download('punkt')
# pos_tag
nltk.download('averaged_perceptron_tagger')
# wordnet
nltk.download('wordnet')
from nltk.corpus import wordnet as wn
from tqdm import tqdm


def build_img2info(json_obj, sim_value):
    # key: image id; value: (key, caption, noise caption)
    img2info = {}
    idx = 0
    for dic in tqdm(json_obj.values(), total=len(json_obj)):
        new_noise = []
        for caption in dic['captions']:
            noise_captions = []
            # tokenize and POS-tag the caption
            morph = nltk.word_tokenize(caption.lower())
            pos = nltk.pos_tag(morph)
import argparse

from transformers import (BertTokenizerFast, DataCollatorForNextSentencePrediction,
                          TextDatasetForNextSentencePrediction, BertForPreTraining, Trainer)

parser = argparse.ArgumentParser()
parser.add_argument("--corpus_eval")
parser.add_argument("--block_size", type=int)
parser.add_argument("--model_name_or_path")
parser.add_argument("--token_vocab", default='/home/ubuntu/lrz_share/data/token_vocab/bert/')
args = parser.parse_args()

tokenizer = BertTokenizerFast.from_pretrained(args.token_vocab)
model = BertForPreTraining.from_pretrained(args.model_name_or_path)

# dataset of sentence pairs for the next-sentence-prediction objective
data_eval = TextDatasetForNextSentencePrediction(
    tokenizer=tokenizer,
    file_path=args.corpus_eval,
    block_size=args.block_size,
)
# collator that additionally masks 15% of tokens for the MLM objective
data_collator = DataCollatorForNextSentencePrediction(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15, block_size=args.block_size)
trainer = Trainer(model=model, data_collator=data_collator,
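                  # the original snippet breaks off mid-call; the completion below is an
                  # assumed minimal continuation, not part of the source
                  eval_dataset=data_eval)

# With an eval dataset attached, the corpus can be scored directly:
metrics = trainer.evaluate()  # returns a dict that includes the evaluation loss
print(metrics)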
from torch.nn import CrossEntropyLoss
from torch.utils.data import DataLoader
from transformers import BertTokenizer, BertConfig, BertForPreTraining

from common import AverageMeter
from custom_metrics import LMAccuracy
from data_loader import Data_pretrain
from config import Config

if __name__ == '__main__':
    # training_path, file_id, tokenizer, data_name, reduce_memory=False
    tokenizer = BertTokenizer.from_pretrained('./bert_base_pretrain/vocab.txt')
    train_data_path = './data/processed_data0.json'
    txt = Data_pretrain(train_data_path, tokenizer)
    data_iter = DataLoader(txt, shuffle=True, batch_size=2)

    bert_config = BertConfig.from_pretrained(Config.config_path)
    # model = BertForPreTraining(config=bert_config)  # alternative: train from scratch
    model = BertForPreTraining.from_pretrained('./bert_base_pretrain/pytorch_model.bin', config=bert_config)
    model.to(Config.device)

    # Prepare optimizer: no weight decay for biases and LayerNorm parameters
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
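    # A minimal continuation (assumed; the original snippet ends above): build the
    # optimizer from the parameter groups, as the other scripts in this collection do.
    # `Config.learning_rate` is a hypothetical attribute of this project's Config.
    from transformers import AdamW  # torch.optim.AdamW on newer transformers versions

    optimizer = AdamW(optimizer_grouped_parameters, lr=Config.learning_rate, eps=1e-8)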
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--train_file", default="manual_description.txt", type=str, help="The input train corpus.")
    parser.add_argument("--bert_model", default="bert-base-uncased", type=str,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                             "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
    parser.add_argument("--output_dir", default="out", type=str, help="The output directory where the model checkpoints will be written.")

    ## Other parameters
    parser.add_argument("--max_seq_length", default=200, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. "
                             "Sequences longer than this will be truncated, and sequences shorter than this will be padded.")
    parser.add_argument("--do_train", action='store_true', help="Whether to run training.")
    parser.add_argument("--train_batch_size", default=16, type=int, help="Total batch size for training.")
    parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.")
    parser.add_argument("--learning_rate", default=3e-5, type=float, help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=4.0, type=float, help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion", default=0.1, type=float,
                        help="Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available")
    parser.add_argument("--on_memory", default=True, action='store_true', help="Whether to load train samples into memory or use disk")
    parser.add_argument("--do_lower_case", action='store_true', help="Whether to lower case the input text. True for uncased models, False for cased models.")
    parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus")
    parser.add_argument('--seed', type=int, default=42, help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1, help="Number of update steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale', type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True. "
                             "0 (default value): dynamic loss scaling. Positive power of 2: static loss scaling value.")
    args = parser.parse_args()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(args.gradient_accumulation_steps))
    args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps)

    if not args.do_train:
        raise ValueError("Training is the only implemented option here; please set `do_train`.")
    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    os.makedirs(args.output_dir, exist_ok=True)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)

    num_train_steps = None
    if args.do_train:
        print("Loading Train Dataset", args.train_file)
        train_dataset = BERTDataset(args.train_file, tokenizer, seq_len=args.max_seq_length,
                                    corpus_lines=None, on_memory=args.on_memory)
        num_train_steps = int(len(train_dataset) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs)

    # Prepare model
    model = BertForPreTraining.from_pretrained(args.bert_model, config=BertConfig.from_pretrained(args.bert_model))
    model.to(device)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)

    global_step = 0
    if args.do_train:
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_dataset))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)
        train_sampler = RandomSampler(train_dataset)
        train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
        model.train()
        for epoch in trange(1, int(args.num_train_epochs) + 1, desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            print("epoch=", epoch)
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration", position=0)):
                batch = tuple(item.to(device) for item in batch)
                input_ids, input_mask, segment_ids, lm_label_ids, is_next = batch
                prediction_scores, seq_relationship_score = model(
                    input_ids=input_ids, attention_mask=input_mask, token_type_ids=segment_ids)
                if lm_label_ids is not None and is_next is not None:
                    loss_fct = CrossEntropyLoss(ignore_index=-1)
                    # only the next-sentence objective is trained here; the MLM loss is intentionally unused
                    # masked_lm_loss = loss_fct(prediction_scores.view(-1, model.config.vocab_size), lm_label_ids.view(-1))
                    next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), is_next.view(-1))
                    total_loss = next_sentence_loss
                loss = total_loss
                if step % 200 == 0:
                    print(loss)
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                loss.backward()
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    # modify learning rate with special warm up BERT uses
                    lr_this_step = args.learning_rate * warmup_linear(global_step / num_train_steps, args.warmup_proportion)
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1
            if epoch % 5 == 0:
                # Save a trained model
                logger.info("***** Saving fine-tuned model *****")
                model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
                checkpoint_prefix = 'checkpoint' + str(epoch)
                output_dir = os.path.join(args.output_dir, '{}-{}'.format(checkpoint_prefix, global_step))
                if not os.path.exists(output_dir):
                    os.makedirs(output_dir)
                model_to_save.save_pretrained(output_dir)
                tokenizer.save_pretrained(output_dir)
                if args.do_train:
                    torch.save(args, os.path.join(output_dir, 'training_args.bin'))