def bert_fn():
    # Read train data and model
    train_dataset = pd.read_pickle(DATA['unbalanced_train'])
    train_lb_encoder = preprocessing.LabelEncoder()
    train_y = train_lb_encoder.fit_transform(train_dataset.label.values)
    xtrain, xvalid, ytrain, yvalid = train_test_split(
        train_dataset.preprocessed_tweets.values, train_y,
        stratify=train_y, random_state=42, test_size=.1, shuffle=True)

    # import BERT-base pretrained model
    bert = AutoModel.from_pretrained('bert-base-uncased')

    # Load the BERT tokenizer
    tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

    max_seq_len = 20

    # tokenize and encode sequences in the training set
    tokens_train = tokenizer.batch_encode_plus(xtrain.tolist(),
                                               max_length=max_seq_len,
                                               padding='max_length',
                                               truncation=True,
                                               return_token_type_ids=False)

    # tokenize and encode sequences in the validation set
    tokens_val = tokenizer.batch_encode_plus(xvalid.tolist(),
                                             max_length=max_seq_len,
                                             padding='max_length',
                                             truncation=True,
                                             return_token_type_ids=False)

    # for train set
    train_seq = torch.tensor(tokens_train['input_ids'])
    train_mask = torch.tensor(tokens_train['attention_mask'])
    train_y = torch.tensor(ytrain.tolist())

    # for validation set
    val_seq = torch.tensor(tokens_val['input_ids'])
    val_mask = torch.tensor(tokens_val['attention_mask'])
    val_y = torch.tensor(yvalid.tolist())

    # define a batch size
    batch_size = 32

    # wrap tensors
    train_data = TensorDataset(train_seq, train_mask, train_y)
    # sampler for sampling the data during training
    train_sampler = RandomSampler(train_data)
    # dataLoader for train set
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

    # wrap tensors
    val_data = TensorDataset(val_seq, val_mask, val_y)
    # sequential sampler for the validation set
    val_sampler = SequentialSampler(val_data)
    # dataLoader for validation set
    val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

    # freeze all the parameters of the pre-trained encoder
    for param in bert.parameters():
        param.requires_grad = False

    # pass the pre-trained BERT to our defined architecture
    model = BERT_Arch(bert)

    # define the optimizer
    optimizer = AdamW(model.parameters(), lr=1e-3)

    # compute the class weights (recent scikit-learn requires keyword arguments here)
    class_wts = compute_class_weight(class_weight='balanced', classes=np.unique(ytrain), y=ytrain)

    # convert class weights to tensor
    weights = torch.tensor(class_wts, dtype=torch.float)

    # loss function
    cross_entropy = nn.NLLLoss(weight=weights)

    # number of training epochs
    epochs = 10

    # set initial loss to infinite
    best_valid_loss = float('inf')

    # empty lists to store training and validation loss of each epoch
    train_losses = []
    valid_losses = []

    # for each epoch
    for epoch in range(epochs):
        print('\n Epoch {:} / {:}'.format(epoch + 1, epochs))

        # train model
        train_loss, _ = train(model, train_dataloader, cross_entropy, optimizer)

        # evaluate model on the validation set (not the training set)
        valid_loss, _ = evaluate(model, val_dataloader, cross_entropy, optimizer)

        # append training and validation loss
        train_losses.append(train_loss)
        valid_losses.append(valid_loss)

        print(f'\nTraining Loss: {train_loss:.3f}')
        print(f'Validation Loss: {valid_loss:.3f}')
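# NOTE: BERT_Arch is used above but not defined in this snippet. A minimal sketch of what such
# a frozen-encoder classification head could look like follows; the layer sizes, dropout rate,
# and two-class output are assumptions -- only the LogSoftmax output is implied by nn.NLLLoss.
import torch.nn as nn


class BERT_Arch(nn.Module):
    """Hypothetical classification head on top of a frozen BERT encoder."""

    def __init__(self, bert):
        super().__init__()
        self.bert = bert
        self.dropout = nn.Dropout(0.1)           # assumed dropout rate
        self.fc1 = nn.Linear(768, 512)           # 768 = BERT-base hidden size
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(512, 2)             # assumed two output classes
        self.log_softmax = nn.LogSoftmax(dim=1)  # log-probabilities for nn.NLLLoss

    def forward(self, sent_id, mask):
        # outputs[1] is the pooled [CLS] representation
        outputs = self.bert(sent_id, attention_mask=mask)
        x = self.fc1(outputs[1])
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        return self.log_softmax(x)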
def transformers(
    path_to_senteval: str,
    pretrained_model_name_or_path: str,
    output_filepath: Path = None,
    mean_pool: bool = False,
    cuda_device: int = -1,
    prototyping_config: bool = False,
    verbose: bool = False,
) -> None:
    """Evaluates a pre-trained model from the Transformers library against the SentEval benchmark."""
    from transformers import AutoModel, AutoTokenizer

    # SentEval prepare and batcher
    def prepare(params, samples):
        return

    @torch.no_grad()
    def batcher(params, batch):
        batch = _cleanup_batch(batch)
        # Re-tokenize the input text using the pre-trained tokenizer
        batch = [" ".join(tokens) for tokens in batch]
        # HACK (John): This will save us in the case of tokenizers with no default max_length.
        # Why does this happen? Open an issue on Transformers.
        max_length = params.tokenizer.max_length if hasattr(params.tokenizer, "max_length") else 512
        inputs = params.tokenizer.batch_encode_plus(
            batch, pad_to_max_length=True, max_length=max_length, return_tensors="pt")
        # Place all input tensors on the same device as the model
        inputs = {name: tensor.to(params.device) for name, tensor in inputs.items()}
        sequence_output, pooled_output = model(**inputs)[:2]

        # If mean_pool, we take the average of the token-level embeddings, accounting for pads.
        # Otherwise, we take the pooled output for this specific model, which is typically the
        # embedding of a special token, like [CLS] or <s>, that is prepended to the input during
        # tokenization.
        if mean_pool:
            embeddings = torch.sum(
                sequence_output * inputs["attention_mask"].unsqueeze(-1), dim=1
            ) / torch.clamp(torch.sum(inputs["attention_mask"], dim=1, keepdim=True), min=1e-9)
        else:
            embeddings = pooled_output
        embeddings = embeddings.cpu().numpy()

        return embeddings

    # Determine the torch device
    device = _get_device(cuda_device)

    # Load the Transformers tokenizer
    tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path)
    typer.secho(
        (f"{SUCCESS} Tokenizer '{pretrained_model_name_or_path}' from Transformers loaded"
         " successfully."),
        fg=typer.colors.GREEN,
        bold=True,
    )

    # Load the Transformers model
    model = AutoModel.from_pretrained(pretrained_model_name_or_path)
    model.to(device)
    model.eval()
    typer.secho(
        f'{SUCCESS} Model "{pretrained_model_name_or_path}" from Transformers loaded successfully.',
        fg=typer.colors.GREEN,
        bold=True,
    )

    # Performs a few setup steps and returns the SentEval params
    params_senteval = _setup_senteval(path_to_senteval, prototyping_config, verbose)
    params_senteval["tokenizer"] = tokenizer
    params_senteval["model"] = model
    params_senteval["device"] = device
    _run_senteval(params_senteval, path_to_senteval, batcher, prepare, output_filepath)

    return
# print(sample_list[0])
# print()
# print(tokenizer.decode(batch_input_ids[0].tolist()))
# print(batch_input_ids.size())
# print(batch_attention_mask.size())
# print(batch_token_type_ids.size())
# print(len(tok2char_span_list))
# print(batch_shaking_tag.size())


# # Model

# In[21]:

encoder = AutoModel.from_pretrained(model_path)


# In[22]:

fake_input = torch.zeros([batch_size, max_seq_len, encoder.config.hidden_size]).to(device)
shaking_type = hyper_parameters["shaking_type"]
ent_extractor = TPLinkerNER(encoder, len(tags), fake_input, shaking_type, visual_field)
ent_extractor = ent_extractor.to(device)


# In[23]:

metrics = Metrics(handshaking_tagger)
def __init__(self, config=None, *inputs, args, vocab_sets, tasks, save_directory=None, **kwargs): """ Relevant inputs should be provided using kwargs. This method is defined this way to match parent's and siblings' method signatures. If `save_directory` is None, will initialize a new model and numericalizer, otherwise, will load them from `save_directory` Inputs: args vocab_sets save_directory: The directory where numericalizer can be loaded from. """ super().__init__(PretrainedConfig()) # dummy PretrainedConfig self.args = args encoder_embeddings = args.pretrained_model config = AutoConfig.from_pretrained(encoder_embeddings, cache_dir=args.embeddings) args.dimension = config.hidden_size # tasks is not passed during initialization only in server mode # call this function after task is recognized if tasks: self.set_generation_output_options(tasks) self.src_lang, self.tgt_lang = adjust_language_code( config, args.pretrained_model, kwargs.get('src_lang', 'en'), kwargs.get('tgt_lang', 'en')) self.numericalizer = TransformerNumericalizer( encoder_embeddings, args, max_generative_vocab=args.max_generative_vocab, save_dir=save_directory, config=config, src_lang=self.src_lang, tgt_lang=self.tgt_lang, vocab_sets=vocab_sets, tasks=tasks, ) logger.info('Initializing encoder and decoder embeddings') if args.do_ned: if type(config) == BertConfig: if save_directory is not None: self.encoder_embeddings = BertModelForNER( config, args.num_db_types, args.db_unk_id) else: self.encoder_embeddings = BertModelForNER( config, args.num_db_types, args.db_unk_id).from_pretrained( encoder_embeddings, num_db_types=args.num_db_types, db_unk_id=args.db_unk_id, cache_dir=args.embeddings) elif type(config) == XLMRobertaConfig: if save_directory is not None: self.encoder_embeddings = XLMRobertaModelForNER( config, args.num_db_types, args.db_unk_id) else: self.encoder_embeddings = XLMRobertaModelForNER( config, args.num_db_types, args.db_unk_id).from_pretrained( encoder_embeddings, num_db_types=args.num_db_types, db_unk_id=args.db_unk_id, cache_dir=args.embeddings) else: raise ValueError( 'Model is not supported for using entity embeddings for NER' ) else: if save_directory is not None: self.encoder_embeddings = AutoModel.from_config(config) else: self.encoder_embeddings = AutoModel.from_pretrained( encoder_embeddings, config=config, cache_dir=args.embeddings) self.encoder_embeddings.resize_token_embeddings( self.numericalizer.num_tokens) logger.info(f'Vocabulary has {self.numericalizer.num_tokens} tokens') self.encoder = IdentityEncoder(self.numericalizer, args, config, self.encoder_embeddings) self.decoder = MQANDecoder(self.numericalizer, args)
def __build_model(self) -> None:
    """Init transformer model + tokenizer + classification head."""
    if self.hparams.transformer_type == 'roberta-long':
        self.transformer = RobertaLongForMaskedLM.from_pretrained(
            self.hparams.encoder_model,
            output_hidden_states=True,
            gradient_checkpointing=True)

    elif self.hparams.transformer_type == 'longformer':
        logger.warning('Turning ON gradient checkpointing...')
        self.transformer = AutoModel.from_pretrained(
            self.hparams.encoder_model,
            output_hidden_states=True,
            gradient_checkpointing=True,  # critical for training speed.
        )

    else:  # BERT
        self.transformer = AutoModel.from_pretrained(
            self.hparams.encoder_model,
            output_hidden_states=True,
        )

    logger.warning(f'model is {self.hparams.encoder_model}')

    # set the number of features our encoder model will return...
    self.encoder_features = 768

    # Tokenizer
    if self.hparams.transformer_type in ('longformer', 'roberta-long'):
        self.tokenizer = Tokenizer(
            pretrained_model=self.hparams.encoder_model,
            max_tokens=self.hparams.max_tokens_longformer)
        self.tokenizer.max_len = 4096
    else:
        self.tokenizer = Tokenizer(
            pretrained_model=self.hparams.encoder_model,
            max_tokens=512)

    # others:
    # 'emilyalsentzer/Bio_ClinicalBERT', 'simonlevine/biomed_roberta_base-4096-speedfix'

    # Classification head
    if self.hparams.single_label_encoding == 'default':
        self.classification_head = nn.Sequential(
            nn.Linear(self.encoder_features, self.encoder_features * 2),
            nn.Tanh(),
            nn.Linear(self.encoder_features * 2, self.encoder_features),
            nn.Tanh(),
            nn.Linear(self.encoder_features,
                      self.data.label_encoder.vocab_size),
        )

    elif self.hparams.single_label_encoding == 'graphical':
        logger.critical('Graphical embedding not yet implemented!')
def main(): parser = argparse.ArgumentParser() parser.add_argument("--model_name_or_path", type=str, help="Transformers' model name or path") parser.add_argument("--pooler", type=str, choices=['cls', 'cls_before_pooler', 'avg', 'avg_top2', 'avg_first_last'], default='cls', help="Which pooler to use") parser.add_argument("--mode", type=str, choices=['dev', 'test', 'fasttest'], default='test', help="What evaluation mode to use (dev: fast mode, dev results; test: full mode, test results); fasttest: fast mode, test results") parser.add_argument("--task_set", type=str, choices=['sts', 'transfer', 'full', 'na'], default='sts', help="What set of tasks to evaluate on. If not 'na', this will override '--tasks'") parser.add_argument("--tasks", type=str, nargs='+', default=['STS12', 'STS13', 'STS14', 'STS15', 'STS16', 'MR', 'CR', 'MPQA', 'SUBJ', 'SST2', 'TREC', 'MRPC', 'SICKRelatedness', 'STSBenchmark'], help="Tasks to evaluate on. If '--task_set' is specified, this will be overridden") args = parser.parse_args() # Load transformers' model checkpoint model = AutoModel.from_pretrained(args.model_name_or_path) tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path) device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") model = model.to(device) # Set up the tasks if args.task_set == 'sts': args.tasks = ['STS12', 'STS13', 'STS14', 'STS15', 'STS16', 'STSBenchmark', 'SICKRelatedness'] elif args.task_set == 'transfer': args.tasks = ['MR', 'CR', 'MPQA', 'SUBJ', 'SST2', 'TREC', 'MRPC'] elif args.task_set == 'full': args.tasks = ['STS12', 'STS13', 'STS14', 'STS15', 'STS16', 'STSBenchmark', 'SICKRelatedness'] args.tasks += ['MR', 'CR', 'MPQA', 'SUBJ', 'SST2', 'TREC', 'MRPC'] # Set params for SentEval if args.mode == 'dev' or args.mode == 'fasttest': # Fast mode params = {'task_path': PATH_TO_DATA, 'usepytorch': True, 'kfold': 5} params['classifier'] = {'nhid': 0, 'optim': 'rmsprop', 'batch_size': 128, 'tenacity': 3, 'epoch_size': 2} elif args.mode == 'test': # Full mode params = {'task_path': PATH_TO_DATA, 'usepytorch': True, 'kfold': 10} params['classifier'] = {'nhid': 0, 'optim': 'adam', 'batch_size': 64, 'tenacity': 5, 'epoch_size': 4} else: raise NotImplementedError # SentEval prepare and batcher def prepare(params, samples): return def batcher(params, batch, max_length=None): # Handle rare token encoding issues in the dataset if len(batch) >= 1 and len(batch[0]) >= 1 and isinstance(batch[0][0], bytes): batch = [[word.decode('utf-8') for word in s] for s in batch] sentences = [' '.join(s) for s in batch] # Tokenization if max_length is not None: batch = tokenizer.batch_encode_plus( sentences, return_tensors='pt', padding=True, max_length=max_length, truncation=True ) else: batch = tokenizer.batch_encode_plus( sentences, return_tensors='pt', padding=True, ) # Move to the correct device for k in batch: batch[k] = batch[k].to(device) # Get raw embeddings with torch.no_grad(): outputs = model(**batch, output_hidden_states=True, return_dict=True) last_hidden = outputs.last_hidden_state pooler_output = outputs.pooler_output hidden_states = outputs.hidden_states # Apply different poolers if args.pooler == 'cls': # There is a linear+activation layer after CLS representation return pooler_output.cpu() elif args.pooler == 'cls_before_pooler': return last_hidden[:, 0].cpu() elif args.pooler == "avg": return ((last_hidden * batch['attention_mask'].unsqueeze(-1)).sum(1) / batch['attention_mask'].sum(-1).unsqueeze(-1)).cpu() elif args.pooler == "avg_first_last": first_hidden = hidden_states[0] 
last_hidden = hidden_states[-1] pooled_result = ((first_hidden + last_hidden) / 2.0 * batch['attention_mask'].unsqueeze(-1)).sum(1) / batch['attention_mask'].sum(-1).unsqueeze(-1) return pooled_result.cpu() elif args.pooler == "avg_top2": second_last_hidden = hidden_states[-2] last_hidden = hidden_states[-1] pooled_result = ((last_hidden + second_last_hidden) / 2.0 * batch['attention_mask'].unsqueeze(-1)).sum(1) / batch['attention_mask'].sum(-1).unsqueeze(-1) return pooled_result.cpu() else: raise NotImplementedError results = {} for task in args.tasks: se = senteval.engine.SE(params, batcher, prepare) result = se.eval(task) results[task] = result # Print evaluation results if args.mode == 'dev': print("------ %s ------" % (args.mode)) task_names = [] scores = [] for task in ['STSBenchmark', 'SICKRelatedness']: task_names.append(task) if task in results: scores.append("%.2f" % (results[task]['dev']['spearman'][0] * 100)) else: scores.append("0.00") print_table(task_names, scores) task_names = [] scores = [] for task in ['MR', 'CR', 'SUBJ', 'MPQA', 'SST2', 'TREC', 'MRPC']: task_names.append(task) if task in results: scores.append("%.2f" % (results[task]['devacc'])) else: scores.append("0.00") task_names.append("Avg.") scores.append("%.2f" % (sum([float(score) for score in scores]) / len(scores))) print_table(task_names, scores) elif args.mode == 'test' or args.mode == 'fasttest': print("------ %s ------" % (args.mode)) task_names = [] scores = [] for task in ['STS12', 'STS13', 'STS14', 'STS15', 'STS16', 'STSBenchmark', 'SICKRelatedness']: task_names.append(task) if task in results: if task in ['STS12', 'STS13', 'STS14', 'STS15', 'STS16']: scores.append("%.2f" % (results[task]['all']['spearman']['all'] * 100)) else: scores.append("%.2f" % (results[task]['test']['spearman'].correlation * 100)) else: scores.append("0.00") task_names.append("Avg.") scores.append("%.2f" % (sum([float(score) for score in scores]) / len(scores))) print_table(task_names, scores) task_names = [] scores = [] for task in ['MR', 'CR', 'SUBJ', 'MPQA', 'SST2', 'TREC', 'MRPC']: task_names.append(task) if task in results: scores.append("%.2f" % (results[task]['devacc'])) else: scores.append("0.00") task_names.append("Avg.") scores.append("%.2f" % (sum([float(score) for score in scores]) / len(scores))) print_table(task_names, scores)
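# NOTE: the 'avg', 'avg_first_last', and 'avg_top2' poolers above all rely on the same
# mask-aware averaging. A standalone sketch of that operation (names are illustrative):
import torch


def masked_mean(hidden, attention_mask):
    """Average token embeddings over the sequence, ignoring padding positions.

    hidden:         (batch, seq_len, dim) token-level embeddings
    attention_mask: (batch, seq_len) with 1 for real tokens and 0 for padding
    """
    mask = attention_mask.unsqueeze(-1).type_as(hidden)  # (batch, seq_len, 1)
    summed = (hidden * mask).sum(dim=1)                  # (batch, dim)
    counts = mask.sum(dim=1).clamp(min=1e-9)             # avoid division by zero
    return summed / counts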
fpath = '/home/dpappas/bioasq_2021/BioASQ-task9bPhaseB-testset{}'.format(b)
ofpath = '/home/dpappas/bioasq_2021/batch{}_system_1_factoid.json'.format(b)

# model_name = "ktrapeznikov/biobert_v1.1_pubmed_squad_v2"
# my_model_path = '/home/dpappas/bioasq_factoid/snipBefAfter1_ktrapeznikov__biobert_v1.1_pubmed_squad_v2_MLP_100_9.pth.tar'
# hidden = 768

model_name = "ktrapeznikov/albert-xlarge-v2-squad-v2"
my_model_path = "/home/dpappas/bioasq_factoid/albert_ktrapeznikov__albert-xlarge-v2-squad-v2_MLP_100_42_5e-05.pth.tar"
hidden = 2048

d = json.load(open(fpath))

use_cuda = torch.cuda.is_available()
device = torch.device("cuda") if use_cuda else torch.device("cpu")

bert_tokenizer = AutoTokenizer.from_pretrained(model_name)
pprint(bert_tokenizer.special_tokens_map)
bert_model = AutoModel.from_pretrained(model_name).to(device)
bert_model.eval()

my_model = Ontop_Modeler(hidden, 100).to(device)
load_model_from_checkpoint(my_model_path)
my_model.eval()


def fix_phrase(phr):
    if len(phr) == 0:
        return ''
    while not phr[0].isalnum():
        phr = phr[1:]
    while not phr[-1].isalnum():
        phr = phr[:-1]
    return phr
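# NOTE: Ontop_Modeler and load_model_from_checkpoint are defined elsewhere in this project.
# A sketch of what the checkpoint-loading helper might do, assuming the checkpoint stores the
# weights under a 'state_dict' key (both the key name and the CPU map_location are assumptions):
def load_model_from_checkpoint(resume_from):
    global my_model
    checkpoint = torch.load(resume_from, map_location=lambda storage, loc: storage)
    my_model.load_state_dict(checkpoint['state_dict'], strict=True)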
def setUp(self) -> None:
    """Set up for the tests--load tokenizer."""
    self.test_tokenizer = AutoTokenizer.from_pretrained("allenai/biomed_roberta_base")
    self.model = AutoModel.from_pretrained("allenai/biomed_roberta_base")
    self.model.resize_token_embeddings(len(self.test_tokenizer))
    self.out_dir = 'tests/models/test_output'
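# NOTE: resize_token_embeddings only changes anything once the tokenizer's vocabulary has grown.
# A sketch of the usual pattern (the added special tokens below are purely illustrative):
from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("allenai/biomed_roberta_base")
model = AutoModel.from_pretrained("allenai/biomed_roberta_base")

# Register new special tokens, then grow the embedding matrix to match the new vocabulary size.
tokenizer.add_special_tokens({"additional_special_tokens": ["<ent>", "</ent>"]})
model.resize_token_embeddings(len(tokenizer))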
import os
import ray
import glob
import time
import torch
from transformers import AutoModel

import torchfly_dev

ray.init(memory=12 * 1024**3,
         object_store_memory=8 * 1024**3,
         redis_max_memory=8 * 1024**3)

model = AutoModel.from_pretrained("roberta-large")
device = torch.device("cuda")
model = model.cuda()

for i in range(100):
    start = time.time()
    obj = torchfly_dev.async_save(model.state_dict(), f"tmp.pth")
    time.sleep(4)
    end = time.time()
    print(f"Time takes: {end-start-4}s")

time.sleep(100)
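# NOTE: for comparison with the asynchronous save above, a plain synchronous torch.save baseline
# might look like this (a sketch; absolute timings depend on the disk and model size):
import time

import torch
from transformers import AutoModel

model = AutoModel.from_pretrained("roberta-large")

start = time.time()
torch.save(model.state_dict(), "tmp_sync.pth")  # blocks until the checkpoint is fully written
end = time.time()
print(f"Synchronous save took: {end - start:.2f}s")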
def get( model_name: str, make_copy: bool, override_weights_file: Optional[str] = None, override_weights_strip_prefix: Optional[str] = None, **kwargs, ) -> transformers.PreTrainedModel: """ Returns a transformer model from the cache. # Parameters model_name : `str` The name of the transformer, for example `"bert-base-cased"` make_copy : `bool` If this is `True`, return a copy of the model instead of the cached model itself. If you want to modify the parameters of the model, set this to `True`. If you want only part of the model, set this to `False`, but make sure to `copy.deepcopy()` the bits you are keeping. override_weights_file : `str`, optional If set, this specifies a file from which to load alternate weights that override the weights from huggingface. The file is expected to contain a PyTorch `state_dict`, created with `torch.save()`. override_weights_strip_prefix : `str`, optional If set, strip the given prefix from the state dict when loading it. """ global _model_cache spec = TransformerSpec(model_name, override_weights_file, override_weights_strip_prefix) transformer = _model_cache.get(spec, None) if transformer is None: if override_weights_file is not None: from allennlp.common.file_utils import cached_path import torch override_weights_file = cached_path(override_weights_file) override_weights = torch.load(override_weights_file) if override_weights_strip_prefix is not None: def strip_prefix(s): if s.startswith(override_weights_strip_prefix): return s[len(override_weights_strip_prefix):] else: return s valid_keys = { k for k in override_weights.keys() if k.startswith(override_weights_strip_prefix) } if len(valid_keys) > 0: logger.info("Loading %d tensors from %s", len(valid_keys), override_weights_file) else: raise ValueError( f"Specified prefix of '{override_weights_strip_prefix}' means no tensors " f"will be loaded from {override_weights_file}.") override_weights = { strip_prefix(k): override_weights[k] for k in valid_keys } transformer = AutoModel.from_pretrained( model_name, state_dict=override_weights, **kwargs, ) else: transformer = AutoModel.from_pretrained( model_name, **kwargs, ) _model_cache[spec] = transformer if make_copy: import copy return copy.deepcopy(transformer) else: return transformer
def __init__(self, \ doc_definition, sent_definition, pretrained, num_workers, num_epochs, dropout_sent = 0.0, dropout_doc = 0.0, use_sent_objective = True, concat_sent_scores = True, dataset_class = DatasetBertTC, scorer_class = ScorerXray, grad_max_norm = 1.0, loss_reduction = 'sum', batch_size = 5, lr = 1e-5, lr_ratio = 1.0, attention_query_dim = 100, max_length = 50, max_sent_count = 50, linebreak_bound = True, keep_ws = False, project_sent = False, project_size = 200, optimizer_params = None, dataloader_params = None, hyperparams = None, dataset_params = None, ): super(ModelBertTC, self).__init__( \ hyperparams = hyperparams, dataset_params = dataset_params, dataloader_params = dataloader_params, optimizer_params = optimizer_params, num_workers = num_workers, num_epochs = num_epochs, dataset_class = dataset_class, scorer_class = scorer_class ) self.pretrained = pretrained self.use_sent_objective = use_sent_objective self.concat_sent_scores = concat_sent_scores self.grad_max_norm = grad_max_norm self.loss_reduction = loss_reduction self.doc_definition = doc_definition self.sent_definition = sent_definition self.num_workers = num_workers self.batch_size = batch_size self.lr = lr self.lr_ratio = lr_ratio self.max_length = max_length self.max_sent_count = max_sent_count self.linebreak_bound = linebreak_bound self.keep_ws = keep_ws self.project_sent = project_sent self.project_size = project_size if self.concat_sent_scores: assert self.use_sent_objective self.bert = AutoModel.from_pretrained(self.pretrained) hidden_size = self.bert.config.hidden_size self.sent_attention = nn.ModuleDict(OrderedDict()) self.doc_output_layers = nn.ModuleDict(OrderedDict()) self.sent_ffnn = nn.ModuleDict(OrderedDict()) self.sent_classifiers = nn.ModuleDict(OrderedDict()) for k, label_set in doc_definition.items(): self.sent_classifiers[k] = SentClassifiers( \ input_dim = hidden_size, num_tags = 2, loss_reduction = self.loss_reduction, dropout = dropout_sent, sent_definition = sent_definition[k], ) if self.concat_sent_scores: n = len(sent_definition[k]) * 2 else: n = 0 if self.project_sent: self.sent_ffnn[k] = FeedForward( \ input_dim = hidden_size+n, num_layers = 1, hidden_dims = self.project_size, activations = get_activation('tanh'), dropout = 0) out_dim = self.project_size else: out_dim = hidden_size + n self.sent_attention[k] = Attention( \ input_dim = out_dim, dropout = dropout_doc, use_ffnn = True, activation = 'tanh', query_dim = attention_query_dim) self.doc_output_layers[k] = nn.Linear(out_dim, len(label_set)) self.get_summary()
def __init__(self, config, gpu_list, *args, **params):
    super(XLNetEncoder, self).__init__()

    self.bert = AutoModel.from_pretrained(config.get("model", "bert_path"))
    self.max_seq_len = config.getint("model", "max_seq_len")
import torch
from transformers import BertTokenizer
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertForSequenceClassification, AdamW, BertConfig, BertModel, AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')
model = AutoModel.from_pretrained('allenai/scibert_scivocab_uncased')
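# NOTE: a quick sanity check of the SciBERT model loaded above; this encodes one sentence and
# inspects the output shape (a minimal sketch, the example sentence is arbitrary):
sentence = "The transcription factor binds to the promoter region."
inputs = tokenizer(sentence, return_tensors='pt')

with torch.no_grad():
    outputs = model(**inputs)

# last_hidden_state has shape (batch_size, sequence_length, hidden_size)
print(outputs.last_hidden_state.shape)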
def __init__(self, pretrained_model_name_or_path):
    BertModule.__init__(self)
    lm_model = AutoModel.from_pretrained(pretrained_model_name_or_path)
    PreTrainedModel.__init__(self, config=lm_model.config)
    self.lm_model = lm_model
    self.type = type(lm_model)
def from_encoder_decoder_pretrained( cls, encoder_pretrained_model_name_or_path: str = None, decoder_pretrained_model_name_or_path: str = None, *model_args, **kwargs ) -> PreTrainedModel: r""" Instantiates an encoder and a decoder from one or two base classes of the library from pre-trained model checkpoints. The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated). To train the model, you need to first set it back in training mode with `model.train()`. Params: encoder_pretrained_model_name_or_path (:obj: `str`, `optional`, defaults to `None`): information necessary to initiate the encoder. Either: - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``. - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/encoder``. - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards. decoder_pretrained_model_name_or_path (:obj: `str`, `optional`, defaults to `None`): information necessary to initiate the decoder. Either: - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``. - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/decoder``. - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards. model_args: (`optional`) Sequence of positional arguments: All remaning positional arguments will be passed to the underlying model's ``__init__`` method kwargs: (`optional`) Remaining dictionary of keyword arguments. Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded: Examples:: >>> from transformers import EncoderDecoderModel >>> model = EncoderDecoderModel.from_encoder_decoder_pretrained('bert-base-uncased', 'bert-base-uncased') # initialize Bert2Bert """ kwargs_encoder = { argument[len("encoder_") :]: value for argument, value in kwargs.items() if argument.startswith("encoder_") } kwargs_decoder = { argument[len("decoder_") :]: value for argument, value in kwargs.items() if argument.startswith("decoder_") } # Load and initialize the encoder and decoder # The distinction between encoder and decoder at the model level is made # by the value of the flag `is_decoder` that we need to set correctly. 
encoder = kwargs_encoder.pop("model", None) if encoder is None: assert ( encoder_pretrained_model_name_or_path is not None ), "If `model` is not defined as an argument, a `encoder_pretrained_model_name_or_path` has to be defined" from .modeling_auto import AutoModel encoder = AutoModel.from_pretrained(encoder_pretrained_model_name_or_path, *model_args, **kwargs_encoder) encoder.config.is_decoder = False decoder = kwargs_decoder.pop("model", None) if decoder is None: assert ( decoder_pretrained_model_name_or_path is not None ), "If `decoder_model` is not defined as an argument, a `decoder_pretrained_model_name_or_path` has to be defined" from .modeling_auto import AutoModelForCausalLM if "config" not in kwargs_decoder: from transformers import AutoConfig decoder_config = AutoConfig.from_pretrained(decoder_pretrained_model_name_or_path) if decoder_config.is_decoder is False: logger.info( f"Initializing {decoder_pretrained_model_name_or_path} as a decoder model. Cross attention layers are added to {decoder_pretrained_model_name_or_path} and randomly initialized if {decoder_pretrained_model_name_or_path}'s architecture allows for cross attention layers." ) decoder_config.is_decoder = True kwargs_decoder["config"] = decoder_config if kwargs_decoder["config"].is_decoder is False: logger.warning( f"Decoder model {decoder_pretrained_model_name_or_path} is not initialized as a decoder. In order to initialize {decoder_pretrained_model_name_or_path} as a decoder, make sure that the attribute `is_decoder` of `decoder_config` passed to `.from_encoder_decoder_pretrained(...)` is set to `True` or do not pass a `decoder_config` to `.from_encoder_decoder_pretrained(...)`" ) decoder = AutoModelForCausalLM.from_pretrained(decoder_pretrained_model_name_or_path, **kwargs_decoder) return cls(encoder=encoder, decoder=decoder)
def __init__(self):
    super().__init__()
    # self.model = AutoModel.from_pretrained('allenai/longformer-base-4096')
    self.model = AutoModel.from_pretrained('roberta-base')
def create_model(encoder_name="bert-base-uncased", dec_vocabsize=None, abs_dec_vocabsize=None, dec_layers=6, dec_dim=640, dec_heads=8, dropout=0., maxlen=20, smoothing=0., numbeam=1, tensor2tree=None, abstensor2tree=None, abs_id=-100, entropycontrib=1.): if encoder_name != "bert-base-uncased": raise NotImplementedError( f"encoder '{encoder_name}' not supported yet.") pretrained = AutoModel.from_pretrained(encoder_name) encoder = pretrained class BertEncoderWrapper(torch.nn.Module): def __init__(self, model, dropout=0., **kw): super(BertEncoderWrapper, self).__init__(**kw) self.model = model self.proj = torch.nn.Linear(pretrained.config.hidden_size, dec_dim, bias=False) self.dropout = torch.nn.Dropout(dropout) def forward(self, input_ids, attention_mask=None): ret, _ = self.model(input_ids, attention_mask=attention_mask) if pretrained.config.hidden_size != dec_dim: ret = self.proj(ret) ret = self.dropout(ret) ret = (ret, None, None) return ret class DummyEncoder(torch.nn.Module): def __init__(self, dim, **kw): super(DummyEncoder, self).__init__(**kw) self.dim = dim def forward(self, input_ids, attention_mask=None): return torch.zeros(input_ids.size(0), 1, self.dim, device=input_ids.device) encoder = BertEncoderWrapper(encoder, dropout=dropout) decoder_config = BartConfig( d_model=dec_dim, pad_token_id=0, bos_token_id=1, vocab_size=abs_dec_vocabsize, decoder_attention_heads=dec_heads // 2, decoder_layers=dec_layers, dropout=dropout, attention_dropout=min(0.1, dropout / 2), decoder_ffn_dim=dec_dim * 4, encoder_attention_heads=dec_heads, encoder_layers=dec_layers, encoder_ffn_dim=dec_dim * 4, ) adv_decoder_config = BartConfig( d_model=dec_dim, pad_token_id=0, bos_token_id=1, vocab_size=dec_vocabsize, decoder_attention_heads=dec_heads // 2, decoder_layers=dec_layers, dropout=dropout, attention_dropout=min(0.1, dropout / 2), decoder_ffn_dim=dec_dim * 4, encoder_attention_heads=dec_heads, encoder_layers=dec_layers, encoder_ffn_dim=dec_dim * 4, ) decoder_lm_config = BartConfig( d_model=dec_dim, pad_token_id=0, bos_token_id=1, vocab_size=dec_vocabsize, decoder_attention_heads=dec_heads // 2, decoder_layers=dec_layers, dropout=dropout, attention_dropout=min(0.1, dropout / 2), decoder_ffn_dim=dec_dim * 4, encoder_attention_heads=dec_heads, encoder_layers=dec_layers, encoder_ffn_dim=dec_dim * 4, ) model = BartGenerator(decoder_config) model.model.encoder = encoder advmodel = BartGenerator(adv_decoder_config) advmodel.model.encoder = encoder decoder_lm = BartGenerator(decoder_lm_config) decoder_lm.model.encoder = DummyEncoder(dec_dim) orderless = {"op:and", "SW:concat"} trainmodel = GeneratorTrain(model, advmodel, smoothing=smoothing, tensor2tree=abstensor2tree, orderless=orderless, abs_id=abs_id, entropycontrib=entropycontrib) advtrainmodel = AdversaryTrain(advmodel, decoder_lm, smoothing=smoothing, tensor2tree=tensor2tree, orderless=orderless) testmodel = BartGeneratorTest(model, maxlen=maxlen, numbeam=numbeam, tensor2tree=abstensor2tree, orderless=orderless) return trainmodel, advtrainmodel, testmodel
        text2 = text[end:]
        word_id += 1

    # return wsd
    return pd.DataFrame([{'form': text, 'WSD': str(wsd)}])


if __name__ == "__main__":
    args = parser.parse_args()
    text = args.text
    multigpu = args.multigpu

    with open('Dict/processed_dictionary.json', 'rb') as f:
        urimal_dict = json.load(f)

    bert_model = AutoModel.from_pretrained("monologg/distilkobert")
    tokenizer = KoBertTokenizer.from_pretrained('monologg/kobert')
    model = BiEncoderModel(bert_model)
    model.to('cuda')

    model_list = os.listdir(f"checkpoint/{args.model_date}")
    model_fname = 'saved_checkpoint_fin'
    # model = torch.load(f"checkpoint/{args.model_date}/{model_fname}")
    model = torch.load(f"checkpoint/WSD_v2/{model_fname}", map_location='cuda')
    model.eval()

    batch_generator = BatchGenerator(tokenizer, 128)
    eval_df = text_process(text, urimal_dict)
    eval_ds = ContextDataset(eval_df)
    eval_dl = context_dataloader(eval_ds, batch_generator, 1)
def get_bert_embeddings(self, path, dynamic=True, static=False): """ SciBert embeddings computation using Transformes. It store and transforms the texts into SciBERT embeddings. The embeddings are stored in csv files. :param path: path to save the embeddings :param dynamic: boolean variable to compute the dynamic embeddings (By defalut: True). :param static: boolean variable to compute the static embeddings (By defalut: False). :returns: static embeddings if static=True """ self.__data_preparation() data_stat=[] bert = AutoModel.from_pretrained(self.model).embeddings bert=bert.to(self.device) for idx_batch, sequence in enumerate(self.data_dataloader,1): sequence=sequence.to(self.device) ids_tokens=np.where((self.indexed_tokens[idx_batch-1]!=102) &(self.indexed_tokens[idx_batch-1]!=103) &(self.indexed_tokens[idx_batch-1]!=101) )[0] tokens=np.array(self.tokenized_texts[idx_batch-1])[ids_tokens] index=[] index_num=[] for i in range(len(tokens)): if [idx for idx, x in enumerate(tokens[i]) if x=='#'] ==[]: index.append(i) else: index_num.append(i) bert_embeddings=bert(sequence)[0][:,ids_tokens].cpu().detach() embeddings=torch.tensor(np.zeros((bert_embeddings.shape[1]-len(index_num),bert_embeddings.shape[2]))) count=0 if index_num!=[]: for idx in range (len(ids_tokens)): if np.where(index_num==np.array([idx]))[0].size!=0: nums=bert_embeddings[0][idx]*bert_embeddings[0][idx-1] embeddings[idx-count-1]=nums.cpu().detach() count+=1 else: embeddings[idx-count]=bert_embeddings[0][idx].cpu().detach() else: embeddings=bert_embeddings[0] if static: for emb in embeddings: data_stat.append(emb) if dynamic: i=1 data_csv=[] labelstf= [] labelstf.append('Word') for n in range (self.neurons): labelstf.append('Neuron'+str(n+1)) for emb in embeddings: data_csv.append(np.hstack((self.words[idx_batch-1][i-1], emb))) i+=1 with open(path+self.file_names+'.csv', 'w', newline='') as csvfile: writer = csv.writer(csvfile) writer.writerow(labelstf) writer.writerows(data_csv) if static: wordar=np.vstack(data_stat) del data_stat meanBERT=np.mean(wordar, axis=0) stdBERT=np.std(wordar, axis=0) kurtosisBERT=kurtosis(wordar, axis=0) skewnessBERT=skew(wordar, axis=0) skewnessBERT=skew(wordar, axis=0) minBERT=np.min(wordar, axis=0) maxBERT=np.max(wordar, axis=0) statisticalMeasures=np.hstack((meanBERT, stdBERT, kurtosisBERT, skewnessBERT,minBERT, maxBERT)) del embeddings #del bert_embeddings del bert del self.data_dataloader del self.tokenized_texts del self.data return statisticalMeasures else: del embeddings #del bert_embeddings del bert del self.data_dataloader del self.tokenized_texts del self.data gc.collect()
def main(): parser = utils.ArgParser() parser.add_argument("dataset_name", type=str, help="dataset name") arguments.add_dataset_path_arg(parser) arguments.add_test_arg(parser) parser.add_argument("--metadata_name", type=str, default="text_data", help="change which metadata to load") parser.add_argument("--cuda", action="store_true", help="use cuda") parser.add_argument("--multi_gpu", action="store_true", help="use multiple gpus") parser.add_argument("--model_path", type=str, default=None, help="Cache path for transformers package.") parser.add_argument("--model_name", type=str, default="bert-base-uncased", help="Which model to use.") parser.add_argument("--model_source", type=str, default="transformers", help="Where to get the models from.") parser.add_argument("--layers", type=str, default="-2,-1", help="Read the features from these layers. Careful: Multiple layers must be specified like " "this: --layers=-2,-1 because of argparse handling minus as new argument.") parser.add_argument("--batch_size", type=int, default=1, help="Batch size.") parser.add_argument("--workers", type=int, default=0, help="Dataloader workers.") parser.add_argument("--add_name", type=str, default="", help="Add additional identifier to output files.") parser.add_argument("-f", "--force", action="store_true", help="Overwrite embedding if exists.") parser.add_argument("--encoder_only", action="store_true", help="Flag for hybrid models (BART: bilinear and unilinear) that return " "both encoder and decoder output, if the decoder output should be discarded.") parser.add_argument("--set_tokenizer", type=str, default="", help=f"Manually define the tokenizer instead of determining it from model name. " f"Options: {nntrainer.data_text.TextPreprocessing.values()}") parser.add_argument("--add_special_tokens", action="store_true", help=f"Set the tokenizer to add special tokens (like [CLS], [SEP] for BERT).") parser.add_argument("--token_stride", action="store_true", help=f"If set, too long texts will be strided over instead of cut to max.") parser.add_argument("--token_stride_factor", type=int, default=2, help=f"Default 2 means to stride half the window size. Set to 1 for non-overlapping windows.") parser.add_argument("--print_model", action="store_true", help=f"Print model and config") args = parser.parse_args() data_path = Path("data") dataset_path = data_path / args.dataset_name model_name = args.model_name token_stride = args.token_stride model_ident = f"{args.model_source}_{model_name.replace('/', '--')}_{args.layers}" full_ident = f"text_feat_{args.dataset_name}_meta_{args.metadata_name}_{model_ident}{args.add_name}" # setup paths text_features_path = dataset_path os.makedirs(text_features_path, exist_ok=True) lengths_file = text_features_path / f"{full_ident}_sentence_splits.json" data_file_only = f"{full_ident}.h5" data_file = text_features_path / data_file_only ''' if data_file.exists() and lengths_file.exists() and not args.force: print(f"{data_file} already exists. 
nothing to do.") return ''' # Load pretrained model print("*" * 20, f"Loading model {model_name} from {args.model_source}") if args.model_source == "transformers": tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=args.model_path) model: BertModel = AutoModel.from_pretrained(model_name, cache_dir=args.model_path) if args.print_model: print("*" * 40, "Model") print(f"{model}") print("*" * 40, "Config") print(model.config) # noinspection PyUnresolvedReferences max_text_len = model.config.max_position_embeddings model.eval() else: raise NotImplementedError(f"Model source unknown: {args.model_source}") if args.cuda: if args.multi_gpu: model = nn.DataParallel(model).cuda() else: model = model.cuda() print(f"Running model on device {next(model.parameters()).device}") print(f"Maximum input length {max_text_len}") # define preprocessor is_tp = False add_special_tokens = args.add_special_tokens if args.set_tokenizer != "": print(f"Set tokenizer via flag to {args.set_tokenizer}") preprocessor = get_text_preprocessor(args.set_tokenizer) elif model_name == "bert-base-uncased": # paper results preprocessor = get_text_preprocessor(nntrainer.data_text.TextPreprocessing.BERT_PAPER) elif model_name.startswith(TextModelConst.BERT) or model_name.startswith(TextModelConst.DISTILBERT): # new results bert-large-cased preprocessor = get_text_preprocessor(nntrainer.data_text.TextPreprocessing.BERT_NEW) elif model_name.startswith(TextModelConst.GPT2): # new results with gpt2 preprocessor = get_text_preprocessor(nntrainer.data_text.TextPreprocessing.GPT2) else: print(f"WARNING: no text preprocessing defined for model {model_name}, using default preprocessing which " f"does not add any special tokens.") preprocessor = get_text_preprocessor(nntrainer.data_text.TextPreprocessing.SIMPLE) # else: # raise NotImplementedError(f"No preprocessing defined for model {model_name}") # define feature layers to extract layer_list_int = [int(layer.strip()) for layer in args.layers.strip().split(",")] # load metadata meta_file = dataset_path / f"{args.metadata_name}.json" print(f"Loading meta file of {meta_file.stat().st_size // 1024 ** 2:.0f} MB") timer_start = timer() meta_dict = json.load(meta_file.open("rt", encoding="utf8")) print(f"Took {timer() - timer_start:.1f} seconds for {len(meta_dict)}.") text_dict: Dict[str, List[str]] = {} for key, meta in meta_dict.items(): text_dict[key] = [item for key, item in meta.items()] #text_dict[key] = [seg["text"] for seg in meta["segments"]] # get max number of words length total_words = 0 max_words = 0 for key, val in tqdm(text_dict.items(), desc="Compute total_words and max_words"): num_words = sum(len(text.split(" ")) for text in val) total_words += num_words max_words = max(num_words, max_words) print(f"Total {total_words} average {total_words / len(meta_dict):.2f} max {max_words}") # create dataset and loader print("*" * 20, "Loading and testing dataset.") dataset = TextConverterDataset(tokenizer, text_dict, preprocessor, max_text_len=max_text_len, token_stride=token_stride, add_special_tokens=add_special_tokens) dataloader = data.DataLoader(dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.workers, collate_fn=dataset.collate_fn) # print first datapoint for key, value in dataset[0].items(): print(f"{key}: {value}\n") if args.test: # print first datapoint for point in dataset: for key, value in dict(point).items(): print(f"{key}: {value}\n") print("Test, stopping here.") return # loop videos and encode features print("*" * 20, "Running the encoding.") 
print(f"Encoding text with model: {model_name}, layers: {layer_list_int}, " f"batch size: {args.batch_size}, workers: {args.workers}") temp_file = text_features_path / f"TEMP_{utils.get_timestamp_for_filename()}_{data_file_only}" data_h5 = h5py.File(temp_file, "w") lengths = {} total_feat_dim = None printed_warning = False pbar = tqdm(desc="compute text features", total=maths.ceil(len(dataset) / args.batch_size)) for i, batch in enumerate(dataloader): # type: TextDataBatchPoint if args.cuda: batch.to_cuda(non_blocking=True) batch_size = len(batch.key) total_max_seq_len = batch.tokens.shape[1] if total_max_seq_len <= max_text_len: # everything is fine # compute model output and read hidden states model_outputs = model(input_ids=batch.tokens, attention_mask=batch.mask, output_hidden_states=True) hidden_states = model_outputs["hidden_states"] # pbar.write(f"tokens {batch.tokens.shape[1]}") # pbar.write(f"outputs {list(state.shape[1] for state in hidden_states)}") # concatenate the features from the requested layers of the hidden state (-1 is the output layer) features = [] for layer_num in layer_list_int: layer_features = hidden_states[layer_num] features.append(layer_features.detach().cpu().numpy()) # concatenate features of individual hidden layers features = np.concatenate(features, axis=-1) # shape (batch_size, max_sent_len, num_layers * feat_dim) # pbar.write(f"features {features.shape}") else: print('Hoy') # if batch tokens is too long we need multiple steps depending on stride stride = max_text_len // args.token_stride_factor positions = list(range(0, total_max_seq_len - stride, stride)) all_model_outputs = [] pbar.write(f"Length {total_max_seq_len}! Split with window {max_text_len} stride {stride} " f"into {len(positions)} batches at positions {positions} ") for pos in positions: end_pos = pos + max_text_len these_tokens = batch.tokens[:, pos:end_pos] these_masks = batch.mask[:, pos:end_pos] these_model_outputs = model(input_ids=these_tokens, attention_mask=these_masks, output_hidden_states=True) these_hidden_states = these_model_outputs["hidden_states"] # pbar.write(f"tokens {these_tokens.shape[1]}") # pbar.write(f"outputs {list(state.shape[1] for state in these_hidden_states)}") # concatenate the features from the requested layers of the hidden state (-1 is the output layer) features = [] for layer_num in layer_list_int: layer_features = these_hidden_states[layer_num] if pos != 0: layer_features = layer_features[:, stride:] features.append(layer_features.detach().cpu().numpy()) # concatenate features of individual hidden layers features = np.concatenate(features, axis=-1) # shape (batch_size, max_sent_len, num_layers * feat_dim) # pbar.write(f"features {features.shape}") all_model_outputs.append(features) # concatenate outputs back together features = np.concatenate(all_model_outputs, axis=1) # compute total output size, need to know this for model architecture if total_feat_dim is None: total_feat_dim = features.shape[-1] # extract single datapoint information from the batch for batch_num in range(batch_size): key = batch.key[batch_num] length = batch.lengths[batch_num] # given length (number of tokens), cut off the padded tokens feature = features[batch_num, :length] # store sentence lengths so features can be mapped to sentences later sentence_lengths = batch.sentence_lengths[batch_num] if is_tp: sentence_lengths = [int(np.round(length / 4)) for length in sentence_lengths] # make sure correspondence between paragraph features and sentence lengths is still there if 
feature.shape[0] != sum(sentence_lengths) and not printed_warning: pbar.write("*" * 40) pbar.write(f"WARNING: Feature sequence length {feature.shape[0]} is not equal sum of the sentence " f"lengths: "f"{sum(sentence_lengths)}") pbar.write(f"{sentence_lengths}") pbar.write(f"It may be hard to get the correspondence between tokens and features back and the " f"correct hierarchical sentence structure back from these features..") printed_warning = True # write features data_h5[key] = feature lengths[key] = sentence_lengths pbar.update() pbar.close() data_h5.close() print(f"Wrote data to {temp_file}, moving to {data_file}") if data_file.is_file(): os.remove(data_file) time.sleep(0.1) shutil.move(temp_file, data_file) # write lengths file json.dump(lengths, lengths_file.open("wt", encoding="utf8")) print(f"Wrote sentence splits to {lengths_file}") print(f"Total feature dim of {len(layer_list_int)} is {total_feat_dim}")
def __init__(self, freeze_bert_params=True, dropout_prob=0.1, num_heads=3, base='bert-base-uncased'): print("BERTAttentionClasswiseWeighted Being Used!\n\n\n") super(BERTAttentionClasswiseWeighted, self).__init__() self.embeddings = AutoModel.from_pretrained( base) #, output_hidden_states = True) if freeze_bert_params: for param in self.embeddings.parameters(): param.requires_grad = False self.num_heads = num_heads self.dropout_common = nn.Dropout(dropout_prob) self.dropout1 = nn.Dropout(dropout_prob) self.dropout2 = nn.Dropout(dropout_prob) self.dropout3 = nn.Dropout(dropout_prob) self.dropout4 = nn.Dropout(dropout_prob) self.dropout5 = nn.Dropout(dropout_prob) self.dropout6 = nn.Dropout(dropout_prob) self.dropout7 = nn.Dropout(dropout_prob) self.fc1 = LinearBlock(768, 512) self.fc2 = LinearBlock(512, 512) self.fc_out1 = LinearBlock(512, 512) self.fc_out2 = LinearBlock(512, 512) self.fc_out3 = LinearBlock(512, 512) self.fc_out4 = LinearBlock(512, 512) self.fc_out5 = LinearBlock(512, 512) self.fc_out6 = LinearBlock(512, 512) self.fc_out7 = LinearBlock(512, 512) self.fc_out1_2 = LinearBlock(512, 256) self.fc_out2_2 = LinearBlock(512, 256) self.fc_out3_2 = LinearBlock(512, 256) self.fc_out4_2 = LinearBlock(512, 256) self.fc_out5_2 = LinearBlock(512, 256) self.fc_out6_2 = LinearBlock(512, 256) self.fc_out7_2 = LinearBlock(512, 256) self.attn1 = MultiHeadAttention(self.num_heads, self.num_heads * 256) self.attn2 = MultiHeadAttention(self.num_heads, self.num_heads * 256) self.attn3 = MultiHeadAttention(self.num_heads, self.num_heads * 256) self.attn4 = MultiHeadAttention(self.num_heads, self.num_heads * 256) self.attn5 = MultiHeadAttention(self.num_heads, self.num_heads * 256) self.attn6 = MultiHeadAttention(self.num_heads, self.num_heads * 256) self.attn7 = MultiHeadAttention(self.num_heads, self.num_heads * 256) # Penultimate layers # Variance prediction layers self.log_var1 = nn.Linear(self.num_heads * 256, 1) self.log_var2 = nn.Linear(self.num_heads * 256, 1) self.log_var3 = nn.Linear(self.num_heads * 256, 1) self.log_var4 = nn.Linear(self.num_heads * 256, 1) self.log_var5 = nn.Linear(self.num_heads * 256, 1) self.log_var6 = nn.Linear(self.num_heads * 256, 1) self.log_var7 = nn.Linear(self.num_heads * 256, 1) self.out1 = nn.Linear(self.num_heads * 256, 2) self.out2 = nn.Linear(self.num_heads * 256, 3) self.out3 = nn.Linear(self.num_heads * 256, 3) self.out4 = nn.Linear(self.num_heads * 256, 3) self.out5 = nn.Linear(self.num_heads * 256, 3) self.out6 = nn.Linear(self.num_heads * 256, 2) self.out7 = nn.Linear(self.num_heads * 256, 2)
import torch
from scipy.spatial.distance import cosine
from transformers import AutoModel, AutoTokenizer

# Import our models. The package will take care of downloading the models automatically
tokenizer = AutoTokenizer.from_pretrained("/data/project/learn_code/data/sup-simcse-bert-base-uncased")
model = AutoModel.from_pretrained("/data/project/learn_code/data/sup-simcse-bert-base-uncased")

# Tokenize input texts
texts = [
    "There's a kid on a skateboard.",
    "A kid is skateboarding.",
    "A kid is inside the house."
]
inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")

# Get the embeddings
with torch.no_grad():
    embeddings = model(**inputs, output_hidden_states=True, return_dict=True).pooler_output

# Calculate cosine similarities
# Cosine similarities are in [-1, 1]. Higher means more similar
cosine_sim_0_1 = 1 - cosine(embeddings[0], embeddings[1])
cosine_sim_0_2 = 1 - cosine(embeddings[0], embeddings[2])

print("Cosine similarity between \"%s\" and \"%s\" is: %.3f" % (texts[0], texts[1], cosine_sim_0_1))
print("Cosine similarity between \"%s\" and \"%s\" is: %.3f" % (texts[0], texts[2], cosine_sim_0_2))

# SimCSE has two core ideas:
# 1. The loss function pulls similar samples together and pushes dissimilar samples apart.
# 2. Dropout is used as the data augmentation.
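# NOTE: the comments above summarize the SimCSE training objective; a minimal sketch of the
# in-batch contrastive loss they describe (the temperature value is a common choice, not taken
# from this snippet):
import torch.nn.functional as F


def in_batch_contrastive_loss(z1, z2, temperature=0.05):
    """InfoNCE-style loss over two views of the same batch.

    z1, z2: (batch, dim) embeddings of the same sentences under two different dropout masks.
    Each z1[i] is pulled towards z2[i] and pushed away from every z2[j] with j != i.
    """
    z1 = F.normalize(z1, dim=-1)
    z2 = F.normalize(z2, dim=-1)
    sim = z1 @ z2.t() / temperature                # (batch, batch) cosine similarities
    labels = torch.arange(sim.size(0), device=sim.device)
    return F.cross_entropy(sim, labels)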
label_dict = {}
for index, possible_label in enumerate(possible_labels):
    label_dict[possible_label] = index
label_dict

race_sampled_train['label'] = race_sampled_train['label'].replace(label_dict)
race_sampled_val['label'] = race_sampled_val['label'].replace(label_dict)

torch.set_grad_enabled(True)

# Store the model we want to use
MODEL_NAME = "bert-base-cased"

# Creating the model and tokenizer
model = AutoModel.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Tokenize Data
encoded_train = tokenizer.batch_encode_plus(race_sampled_train['name'].apply(get_name_pair).values.tolist(),
                                            return_attention_mask=True,
                                            padding=True,
                                            return_tensors='pt')

encoded_val = tokenizer.batch_encode_plus(race_sampled_val['name'].apply(get_name_pair).values.tolist(),
                                          return_attention_mask=True,
                                          padding=True,
                                          return_tensors='pt')

input_ids_train = encoded_train['input_ids']
attention_masks_train = encoded_train['attention_mask']
def _load_transformer(self):
    pretrained_transformer = self.hparams.pretrained_transformer
    if not pretrained_transformer:
        raise Exception("no transformer identifier specified")
    return AutoModel.from_pretrained(pretrained_transformer)
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from torchvision import datasets
from transformers import AutoTokenizer, AutoModel
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim

tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")
model = AutoModel.from_pretrained("bert-base-multilingual-cased")


# We add 2 more to it, because we will use it to measure the sentences for
# the tokenizer, which has a starting and an end character additionally.
def longest_sentence_size(sentences):
    current_max = 0
    for sentence in sentences:
        if len(sentence) > current_max:
            current_max = len(sentence)
    return current_max + 2


# It wants the sentences as a list already split to words/punctuation
# in the first column.
def text_tokenizing(text_dataframe):
    input_ids = []
    attention_masks = []
    targets = []
    for_preparing = tokenizer("")
    start_symbol = for_preparing.input_ids[0]
from transformers import AutoTokenizer, AutoModel
from roberta_ast_label_pretrain import FinetuningRoberta, RobertaPretrain
from roberta_mask_pretrain import RobertaMaskPretrain, FinetuningMaskRoberta
import pandas as pd
import xarray as xa
from scipy import stats

if __name__ == '__main__':
    test_data = pd.read_pickle("data_cache/java_train_0.pkl")
    # model_all = RobertaPretrain.load_from_checkpoint(
    #     "pretrained_module/roberta_ast_label_pretrain_on_java_all-type_label/model.ckpt")
    # The tokenizer is needed below, so it must be loaded alongside the model.
    tokenizer = AutoTokenizer.from_pretrained("huggingface/CodeBERTa-small-v1", resume_download=True)
    model = AutoModel.from_pretrained("huggingface/CodeBERTa-small-v1", resume_download=True)

    encoded = tokenizer.encode_plus(test_data.docstring.array[0], test_data.code.array[0], return_tensors="pt")
    hiddens, first_output, attentions = model(**encoded, output_attentions=True)
    attn_array = xa.DataArray(
        data=[attn.detach().numpy() for attn in attentions],
        dims=["layer", "batch", "head", "query_pos", "key_pos"])
    tokens = tokenizer.convert_ids_to_tokens(encoded["input_ids"][0])

    df = attn_array.to_dataframe("attn")
    filter = df["attn"] > (1 / attn_array.sizes["key_pos"])
    df = df[filter]
    df["relative_pos"] = df.index.get_level_values("query_pos")
def main():
    # print("MODEL NAME, BATCH SIZE, AVG LATENCY (ms), AVG MEM USAGE (MiB)")

    # parser
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name', type=str)
    parser.add_argument('--num_inference', type=int)
    parser.add_argument('--batch_size', type=int)
    parser.add_argument('--gpu', action="store_true", default=False)
    args = parser.parse_args()

    model_name = args.model_name
    num_inference = args.num_inference
    batch_size = args.batch_size
    use_gpu = args.gpu and torch.cuda.is_available()

    # stores latency / memory usage values
    l_inference_latency = list()
    l_memory_capacity = list()

    # call corresponding DNN model...
    # TODO: ADD OTHER MODELS - RESNET50, ...
    # TODO: FIX NLP MODELS' SEQUENCE LENGTH
    if model_name == "resnet18":
        with torch.no_grad():
            model = models.resnet18(pretrained=True, progress=True)
            if use_gpu:
                model = model.cuda()
            # inference
            for i in range(num_inference):
                # input
                inputs = torch.zeros(batch_size, 3, 224, 224)
                if use_gpu:
                    inputs = inputs.to('cuda')
                start_time = time.time()
                _ = model(inputs)
                if use_gpu:
                    # only meaningful (and safe) when CUDA is actually in use
                    torch.cuda.synchronize()
                end_time = time.time()
                l_inference_latency.append(end_time - start_time)
                l_memory_capacity.append(torch.cuda.memory_allocated() if use_gpu else 0)
            str_avg_inf_time = sec_to_ms(average_90_percent(l_inference_latency))
            str_avg_mem_usage = bytes_to_mib(average_90_percent(l_memory_capacity))
            print(",".join(["RESNET18", str(batch_size), str_avg_inf_time, str_avg_mem_usage]))
    elif model_name == "wide_resnet101_2":
        with torch.no_grad():
            model = models.wide_resnet101_2(pretrained=True, progress=True)
            if use_gpu:
                model = model.cuda()
            # inference
            for i in range(num_inference):
                # input
                inputs = torch.zeros(batch_size, 3, 224, 224)
                if use_gpu:
                    inputs = inputs.to('cuda')
                start_time = time.time()
                _ = model(inputs)
                if use_gpu:
                    torch.cuda.synchronize()
                end_time = time.time()
                l_inference_latency.append(end_time - start_time)
                l_memory_capacity.append(torch.cuda.memory_allocated() if use_gpu else 0)
            str_avg_inf_time = sec_to_ms(average_90_percent(l_inference_latency))
            str_avg_mem_usage = bytes_to_mib(average_90_percent(l_memory_capacity))
            print(",".join(["WIDE-RESNET101-2", str(batch_size), str_avg_inf_time, str_avg_mem_usage]))
    elif model_name == "mobilenet":
        with torch.no_grad():
            model = models.mobilenet_v2(pretrained=True, progress=True)
            if use_gpu:
                model = model.cuda()
            # inference
            for i in range(num_inference):
                inputs = torch.zeros(batch_size, 3, 224, 224)
                if use_gpu:
                    inputs = inputs.to('cuda')
                start_time = time.time()
                _ = model(inputs)
                if use_gpu:
                    torch.cuda.synchronize()
                end_time = time.time()
                l_inference_latency.append(end_time - start_time)
                l_memory_capacity.append(torch.cuda.memory_allocated() if use_gpu else 0)
            str_avg_inf_time = sec_to_ms(average_90_percent(l_inference_latency))
            str_avg_mem_usage = bytes_to_mib(average_90_percent(l_memory_capacity))
            print(",".join(["MOBILENET_V2", str(batch_size), str_avg_inf_time, str_avg_mem_usage]))
    elif model_name == "bert":
        with torch.no_grad():
            tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
            model = AutoModel.from_pretrained("bert-base-uncased")
            if use_gpu:
                model = model.cuda()
            # inference
            for i in range(num_inference):
                # BERT maximum sequence length 512
                sample_text = "BERT" * int(512 / 4)
                texts = [sample_text] * batch_size
                inputs = tokenizer(texts, return_tensors="pt")
                if use_gpu:
                    inputs = inputs.to('cuda')
                start_time = time.time()
                _ = model(**inputs)
                if use_gpu:
                    torch.cuda.synchronize()
                end_time = time.time()
                l_inference_latency.append(end_time - start_time)
                l_memory_capacity.append(torch.cuda.memory_allocated() if use_gpu else 0)
            str_avg_inf_time = sec_to_ms(average_90_percent(l_inference_latency))
            str_avg_mem_usage = bytes_to_mib(average_90_percent(l_memory_capacity))
            print(",".join(["BERT-BASE-UNCASED", str(batch_size), str_avg_inf_time, str_avg_mem_usage]))
    elif model_name == "gpt2":
        with torch.no_grad():
            tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
            model = GPT2Model.from_pretrained("gpt2")
            if use_gpu:
                model = model.cuda()
            # inference
            for i in range(num_inference):
                # GPT2 maximum sequence length 1024
                sample_text = "GPT2" * int(1024 / 4)
                texts = [sample_text] * batch_size
                inputs = tokenizer(texts, return_tensors="pt")
                if use_gpu:
                    inputs = inputs.to('cuda')
                start_time = time.time()
                _ = model(**inputs)
                if use_gpu:
                    torch.cuda.synchronize()
                end_time = time.time()
                l_inference_latency.append(end_time - start_time)
                l_memory_capacity.append(torch.cuda.memory_allocated() if use_gpu else 0)
            str_avg_inf_time = sec_to_ms(average_90_percent(l_inference_latency))
            str_avg_mem_usage = bytes_to_mib(average_90_percent(l_memory_capacity))
            print(",".join(["GPT2", str(batch_size), str_avg_inf_time, str_avg_mem_usage]))
    elif model_name == "dlrm":
        print("Unimplemented model: DLRM")
        # TODO: MAKE IT WORK... PLEASE
        '''
        with torch.no_grad():
            model = DLRM_Net()
            if use_gpu:
                model = model.cuda()
            # inference
            for i in range(num_inference):
                inputs = ????
                if use_gpu:
                    inputs = inputs.to('cuda')
                start_time = time.time()
                _ = model(**inputs)
                torch.cuda.synchronize()
                end_time = time.time()
                l_inference_latency.append(end_time - start_time)
                l_memory_capacity.append(torch.cuda.memory_allocated())
            str_avg_inf_time = sec_to_ms(average_90_percent(l_inference_latency))
            str_avg_mem_usage = bytes_to_mib(average_90_percent(l_memory_capacity))
            print(",".join(["DLRM", str(batch_size), str_avg_inf_time, str_avg_mem_usage]))
        '''
    else:
        print("Unidentified model name: {}".format(model_name))
    return
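# Hedged sketch (not from the original file): the helpers used above
# (average_90_percent, sec_to_ms, bytes_to_mib) are defined elsewhere in the repository.
# The stand-ins below are plausible implementations that match how the results are joined
# into a CSV row, under the assumption that "average_90_percent" means "average of the
# fastest 90% of samples".
def average_90_percent(values):
    # Drop the slowest 10% of measurements (e.g. first-iteration warm-up) before averaging.
    kept = sorted(values)[: max(1, int(len(values) * 0.9))]
    return sum(kept) / len(kept)


def sec_to_ms(seconds):
    # Format a duration in seconds as a millisecond string for the CSV output.
    return "{:.3f}".format(seconds * 1000.0)


def bytes_to_mib(num_bytes):
    # Format a byte count as a MiB string for the CSV output.
    return "{:.2f}".format(num_bytes / (1024.0 * 1024.0))

# Example invocation (the script name is an assumption):
#   python benchmark_inference.py --model_name bert --num_inference 100 --batch_size 8 --gpu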
df = pd.read_sql_query("select distinct(title), yes_no from giant_jobs", conn)
# won't be needing this anymore
conn.close()
# print(df.head)  # -> everything looks good

# create the train, validation, and test sets
train_text, temp_text, train_labels, temp_labels = train_test_split(
    df['title'], df['yes_no'], test_size=0.3, stratify=df['yes_no'])
val_text, test_text, val_labels, test_labels = train_test_split(
    temp_text, temp_labels, test_size=0.3, stratify=temp_labels)
# print("test_text", type(test_text))  # Series

# import the bert model and tokenizer
bert = AutoModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

# lengths (in words) of the titles in the training set
set_len = [len(i.split()) for i in train_text]

# tokenize and encode sequences in the training set
tokens_train = tokenizer.batch_encode_plus(train_text.tolist(),
                                           max_length=25,
                                           padding='max_length',
                                           truncation=True)

# tokenize and encode sequences in the validation set
tokens_val = tokenizer.batch_encode_plus(val_text.tolist(),
                                         max_length=25,
                                         padding='max_length',
                                         truncation=True)
def load_embeddings(cachedir, encoder_emb_names, decoder_emb_names, max_generative_vocab=50000, logger=_logger): logger.info(f'Getting pretrained word vectors and pretrained models') encoder_emb_names = encoder_emb_names.split('+') decoder_emb_names = decoder_emb_names.split('+') all_vectors = {} encoder_vectors = [] decoder_vectors = [] numericalizer = None for emb_name in encoder_emb_names: if not emb_name: continue if _is_bert(emb_name) or _is_xlmr(emb_name): if numericalizer is not None: raise ValueError( 'Cannot specify multiple Transformer embeddings') config = AutoConfig.from_pretrained(emb_name, cache_dir=cachedir) config.output_hidden_states = True if _is_bert(emb_name): numericalizer = BertNumericalizer( emb_name, config=config, max_generative_vocab=max_generative_vocab, cache=cachedir) elif _is_xlmr(emb_name): numericalizer = XLMRobertaNumericalizer( emb_name, config=config, max_generative_vocab=max_generative_vocab, cache=cachedir) # load the tokenizer once to ensure all files are downloaded AutoTokenizer.from_pretrained(emb_name, cache_dir=cachedir) encoder_vectors.append( TransformerEmbedding( AutoModel.from_pretrained(emb_name, config=config, cache_dir=cachedir))) else: if numericalizer is not None: logger.warning( 'Combining BERT embeddings with other pretrained embeddings is unlikely to work' ) if emb_name in all_vectors: encoder_vectors.append(all_vectors[emb_name]) else: vec = _name_to_vector(emb_name, cachedir) all_vectors[emb_name] = vec encoder_vectors.append(vec) for emb_name in decoder_emb_names: if not emb_name: continue if _is_bert(emb_name) or _is_xlmr(emb_name): raise ValueError( 'Transformer embeddings cannot be specified in the decoder') if emb_name in all_vectors: decoder_vectors.append(all_vectors[emb_name]) else: vec = _name_to_vector(emb_name, cachedir) all_vectors[emb_name] = vec decoder_vectors.append(vec) if numericalizer is None: numericalizer = SimpleNumericalizer( max_generative_vocab=max_generative_vocab, pad_first=False) return numericalizer, encoder_vectors, decoder_vectors
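# Hedged usage sketch (argument values are illustrative assumptions, not taken from this module):
# numericalizer, encoder_vectors, decoder_vectors = load_embeddings(
#     cachedir='.embeddings',
#     encoder_emb_names='bert-base-uncased',
#     decoder_emb_names='glove+char',
#     max_generative_vocab=50000)
# The returned numericalizer is shared by encoder and decoder; Transformer embeddings are
# only accepted on the encoder side, as enforced above.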
def main():
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument(
        "--data_dir", default=None, type=str, required=True,
        help="The input data dir. Should contain the .jsonl files for MMIMDB.",
    )
    parser.add_argument(
        "--model_name_or_path", default=None, type=str, required=True,
        help="Path to pretrained model or model identifier from huggingface.co/models",
    )
    parser.add_argument(
        "--output_dir", default=None, type=str, required=True,
        help="The output directory where the model predictions and checkpoints will be written.",
    )

    # Other parameters
    parser.add_argument(
        "--config_name", default="", type=str,
        help="Pretrained config name or path if not the same as model_name")
    parser.add_argument(
        "--tokenizer_name", default="", type=str,
        help="Pretrained tokenizer name or path if not the same as model_name",
    )
    parser.add_argument(
        "--cache_dir", default=None, type=str,
        help="Where do you want to store the pre-trained models downloaded from s3",
    )
    parser.add_argument(
        "--max_seq_length", default=128, type=int,
        help="The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.",
    )
    parser.add_argument(
        "--num_image_embeds", default=1, type=int,
        help="Number of Image Embeddings from the Image Encoder")
    parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
    parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--evaluate_during_training", action="store_true",
        help="Run evaluation during training at each logging step.")
    parser.add_argument(
        "--do_lower_case", action="store_true",
        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument(
        "--gradient_accumulation_steps", type=int, default=1,
        help="Number of updates steps to accumulate before performing a backward/update pass.",
    )
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay", default=0.0, type=float,
                        help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float,
                        help="Max gradient norm.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--patience", default=5, type=int,
                        help="Patience for Early Stopping.")
    parser.add_argument(
        "--max_steps", default=-1, type=int,
        help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
    )
    parser.add_argument("--warmup_steps", default=0, type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument("--logging_steps", type=int, default=50,
                        help="Log every X updates steps.")
    parser.add_argument("--save_steps", type=int, default=50,
                        help="Save checkpoint every X updates steps.")
    parser.add_argument(
        "--eval_all_checkpoints", action="store_true",
        help="Evaluate all checkpoints starting with the same prefix as model_name and ending with step number",
    )
    parser.add_argument("--no_cuda", action="store_true",
                        help="Avoid using CUDA when available")
    parser.add_argument("--num_workers", type=int, default=8,
                        help="number of worker threads for dataloading")
    parser.add_argument("--overwrite_output_dir", action="store_true",
                        help="Overwrite the content of the output directory")
    parser.add_argument("--overwrite_cache", action="store_true",
                        help="Overwrite the cached training and evaluation sets")
    parser.add_argument("--seed", type=int, default=42,
                        help="random seed for initialization")
    parser.add_argument(
        "--fp16", action="store_true",
        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
    )
    parser.add_argument(
        "--fp16_opt_level", type=str, default="O1",
        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html",
    )
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="For distributed training: local_rank")
    parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.")
    parser.add_argument("--server_port", type=str, default="", help="For distant debugging.")
    args = parser.parse_args()

    if (os.path.exists(args.output_dir) and os.listdir(args.output_dir)
            and args.do_train and not args.overwrite_output_dir):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome."
            .format(args.output_dir))

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd

        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
    else:
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank,
        device,
        args.n_gpu,
        bool(args.local_rank != -1),
        args.fp16,
    )
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(args.local_rank):
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()

    # Set seed
    set_seed(args)

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()

    # Setup model
    labels = get_mmimdb_labels()
    num_labels = len(labels)
    transformer_config = AutoConfig.from_pretrained(
        args.config_name if args.config_name else args.model_name_or_path)
    tokenizer = AutoTokenizer.from_pretrained(
        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
        do_lower_case=args.do_lower_case,
        cache_dir=args.cache_dir,
    )
    transformer = AutoModel.from_pretrained(
        args.model_name_or_path, config=transformer_config, cache_dir=args.cache_dir)
    img_encoder = ImageEncoder(args)
    config = MMBTConfig(transformer_config, num_labels=num_labels)
    model = MMBTForClassification(config, transformer, img_encoder)

    if args.local_rank == 0:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()

    model.to(args.device)

    logger.info("Training/evaluation parameters %s", args)

    # Training
    if args.do_train:
        train_dataset = load_examples(args, tokenizer, evaluate=False)
        label_frequences = train_dataset.get_label_frequencies()
        label_frequences = [label_frequences[l] for l in labels]
        label_weights = (
            torch.tensor(label_frequences, device=args.device, dtype=torch.float)
            / len(train_dataset)) ** -1
        criterion = nn.BCEWithLogitsLoss(pos_weight=label_weights)
        global_step, tr_loss = train(args, train_dataset, model, tokenizer, criterion)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

    # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        model_to_save = (model.module if hasattr(model, "module") else model)  # Take care of distributed/parallel training
        torch.save(model_to_save.state_dict(), os.path.join(args.output_dir, WEIGHTS_NAME))
        tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))

        # Load a trained model and vocabulary that you have fine-tuned
        model = MMBTForClassification(config, transformer, img_encoder)
        model.load_state_dict(torch.load(os.path.join(args.output_dir, WEIGHTS_NAME)))
        tokenizer = AutoTokenizer.from_pretrained(args.output_dir)
        model.to(args.device)

    # Evaluation
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
            checkpoints = list(
                os.path.dirname(c)
                for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True)))
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
            prefix = checkpoint.split("/")[-1] if checkpoint.find("checkpoint") != -1 else ""
            model = MMBTForClassification(config, transformer, img_encoder)
            model.load_state_dict(torch.load(checkpoint))
            model.to(args.device)
            # Note: `criterion` is only defined in the training branch above, so evaluation
            # here assumes --do_train was also passed in the same run.
            result = evaluate(args, model, tokenizer, criterion, prefix=prefix)
            result = dict((k + "_{}".format(global_step), v) for k, v in result.items())
            results.update(result)

    return results
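# Hedged usage sketch: an illustrative command line for this MMBT fine-tuning entry point
# (the script name, data path, and flag values are assumptions, not taken from this file):
#
#   python run_mmimdb.py \
#       --data_dir /path/to/mmimdb \
#       --model_name_or_path bert-base-uncased \
#       --output_dir ./mmbt-mmimdb \
#       --do_train --do_eval --do_lower_case \
#       --num_image_embeds 3 --num_train_epochs 3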