def load(cls, pretrained_model_name_or_path, language=None, **kwargs):
    """
    Load a language model either by supplying

    * the name of a remote model on s3 ("albert-base" ...)
    * or a local path of a model trained via transformers ("some_dir/huggingface_model")
    * or a local path of a model trained via FARM ("some_dir/farm_model")

    :param pretrained_model_name_or_path: name or path of a model
    :param language: (Optional) Name of language the model was trained for (e.g. "german").
                     If not supplied, FARM will try to infer it from the model name.
    :return: Language Model
    """
    albert = cls()
    if "farm_lm_name" in kwargs:
        albert.name = kwargs["farm_lm_name"]
    else:
        albert.name = pretrained_model_name_or_path
    # We need to differentiate between loading model using FARM format and Pytorch-Transformers format
    farm_lm_config = Path(pretrained_model_name_or_path) / "language_model_config.json"
    if os.path.exists(farm_lm_config):
        # FARM style
        config = AlbertConfig.from_pretrained(farm_lm_config)
        farm_lm_model = Path(pretrained_model_name_or_path) / "language_model.bin"
        albert.model = AlbertModel.from_pretrained(farm_lm_model, config=config, **kwargs)
        albert.language = albert.model.config.language
    else:
        # Huggingface transformers style
        albert.model = AlbertModel.from_pretrained(str(pretrained_model_name_or_path), **kwargs)
        albert.language = cls._get_or_infer_language_from_name(language, pretrained_model_name_or_path)
    return albert
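# Usage sketch (added; not part of the original): `Albert` stands in for the FARM-style
# wrapper class that owns the load() method above, assuming it is decorated with
# @classmethod in the full source. Paths and names below are placeholders.
#
#   albert = Albert.load("albert-base-v2", language="english")  # Hugging Face model name
#   albert = Albert.load("some_dir/farm_model")                 # local FARM-format checkpoint
#   encoder = albert.model                                      # the underlying transformers AlbertModel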
def __init__(self, config, args):
    super().__init__(config)
    self.args = args

    if args.bert_model == "albert-base-v2":
        bert = AlbertModel.from_pretrained(args.bert_model)
    elif args.bert_model == "emilyalsentzer/Bio_ClinicalBERT":
        bert = AutoModel.from_pretrained(args.bert_model)
    elif args.bert_model == "bionlp/bluebert_pubmed_mimic_uncased_L-12_H-768_A-12":
        bert = AutoModel.from_pretrained(args.bert_model)
    elif args.bert_model == "bert-small-scratch":
        config = BertConfig.from_pretrained("google/bert_uncased_L-4_H-512_A-8")
        bert = BertModel(config)
    elif args.bert_model == "bert-base-scratch":
        config = BertConfig.from_pretrained("bert-base-uncased")
        bert = BertModel(config)
    else:
        bert = BertModel.from_pretrained(args.bert_model)  # bert-base-uncased, small, tiny

    self.txt_embeddings = bert.embeddings
    self.img_embeddings = ImageBertEmbeddings(args, self.txt_embeddings)

    if args.img_encoder == 'ViT':
        img_size = args.img_size
        patch_sz = 32 if img_size == 512 else 16
        self.img_encoder = Img_patch_embedding(image_size=img_size, patch_size=patch_sz, dim=2048)
    else:
        self.img_encoder = ImageEncoder_cnn(args)
        for p in self.img_encoder.parameters():
            p.requires_grad = False
        for c in list(self.img_encoder.children())[5:]:
            for p in c.parameters():
                p.requires_grad = True

    self.encoder = bert.encoder
    self.pooler = bert.pooler
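# Illustrative sketch (added; not part of the original): the partial-unfreezing pattern used
# for the CNN image encoder above, shown on a plain torchvision ResNet-50 as a stand-in for
# ImageEncoder_cnn. All parameters are frozen first, then every child module from index 5
# onward (the deeper residual blocks and the head) is made trainable again.
import torchvision

cnn = torchvision.models.resnet50()
for p in cnn.parameters():
    p.requires_grad = False          # freeze everything
for child in list(cnn.children())[5:]:
    for p in child.parameters():
        p.requires_grad = True       # unfreeze only the later blocks

n_trainable = sum(p.numel() for p in cnn.parameters() if p.requires_grad)
print("trainable parameters:", n_trainable)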
def __init__(self,
             vocab: Vocabulary,
             pretrained_model: str = None,
             requires_grad: bool = True,
             transformer_weights_model: str = None,
             num_labels: int = 2,
             predictions_file=None,
             layer_freeze_regexes: List[str] = None,
             regularizer: Optional[RegularizerApplicator] = None) -> None:
    super().__init__(vocab, regularizer)

    self._predictions = []
    self._pretrained_model = pretrained_model

    if 't5' in pretrained_model:
        self._padding_value = 1  # The index of the padding token
        if transformer_weights_model:
            # Override for RoBERTa only for now
            logging.info(f"Loading Transformer weights model from {transformer_weights_model}")
            transformer_model_loaded = load_archive(transformer_weights_model)
            self._transformer_model = transformer_model_loaded.model._transformer_model
        else:
            self._transformer_model = T5Model.from_pretrained(pretrained_model)
        self._dropout = torch.nn.Dropout(self._transformer_model.config.hidden_dropout_prob)
    elif 'roberta' in pretrained_model:
        self._padding_value = 1  # The index of the RoBERTa padding token
        if transformer_weights_model:
            # Override for RoBERTa only for now
            logging.info(f"Loading Transformer weights model from {transformer_weights_model}")
            transformer_model_loaded = load_archive(transformer_weights_model)
            self._transformer_model = transformer_model_loaded.model._transformer_model
        else:
            self._transformer_model = RobertaModel.from_pretrained(pretrained_model)
        self._dropout = torch.nn.Dropout(self._transformer_model.config.hidden_dropout_prob)
    elif 'xlnet' in pretrained_model:
        self._padding_value = 5  # The index of the XLNet padding token
        self._transformer_model = XLNetModel.from_pretrained(pretrained_model)
        self.sequence_summary = SequenceSummary(self._transformer_model.config)
    elif 'albert' in pretrained_model:
        self._transformer_model = AlbertModel.from_pretrained(pretrained_model)
        self._padding_value = 0  # The index of the BERT padding token
        self._dropout = torch.nn.Dropout(self._transformer_model.config.hidden_dropout_prob)
    elif 'bert' in pretrained_model:
        self._transformer_model = BertModel.from_pretrained(pretrained_model)
        self._padding_value = 0  # The index of the BERT padding token
        self._dropout = torch.nn.Dropout(self._transformer_model.config.hidden_dropout_prob)
    else:
        raise ValueError(f"Unsupported pretrained model: {pretrained_model}")

    for name, param in self._transformer_model.named_parameters():
        if layer_freeze_regexes and requires_grad:
            grad = not any(bool(re.search(r, name)) for r in layer_freeze_regexes)
        else:
            grad = requires_grad
        param.requires_grad = grad

    transformer_config = self._transformer_model.config
    transformer_config.num_labels = num_labels
    self._output_dim = self._transformer_model.config.hidden_size

    # unifying the classification layer across all models
    self._classifier = Linear(self._output_dim, num_labels)
    self._classifier.weight.data.normal_(mean=0.0, std=0.02)
    self._classifier.bias.data.zero_()

    self._accuracy = CategoricalAccuracy()
    self._loss = torch.nn.CrossEntropyLoss()
    self._debug = -1
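# Illustrative sketch (added; not part of the original): the regex-based freezing logic used
# above, applied directly to a BERT checkpoint. Any parameter whose name matches one of
# layer_freeze_regexes is frozen; everything else stays trainable. The regex list is only an
# example, not a recommended configuration.
import re
from transformers import BertModel

model = BertModel.from_pretrained("bert-base-uncased")
layer_freeze_regexes = ["embeddings", r"encoder\.layer\.0\."]
requires_grad = True

for name, param in model.named_parameters():
    if layer_freeze_regexes and requires_grad:
        param.requires_grad = not any(re.search(r, name) for r in layer_freeze_regexes)
    else:
        param.requires_grad = requires_grad

frozen = [n for n, p in model.named_parameters() if not p.requires_grad]
print("%d parameter tensors frozen" % len(frozen))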
def __init__(self,
             vocab: Vocabulary,
             pretrained_model: str = None,
             requires_grad: bool = True,
             probe_type: str = None,
             layer_freeze_regexes: List[str] = None,
             regularizer: Optional[RegularizerApplicator] = None) -> None:
    super().__init__(vocab, regularizer)

    self._pretrained_model = pretrained_model

    if 'roberta' in pretrained_model:
        self._padding_value = 1  # The index of the RoBERTa padding token
        self._transformer_model = RobertaModel.from_pretrained(pretrained_model)
        self._dropout = torch.nn.Dropout(self._transformer_model.config.hidden_dropout_prob)
    elif 'xlnet' in pretrained_model:
        self._padding_value = 5  # The index of the XLNet padding token
        self._transformer_model = XLNetModel.from_pretrained(pretrained_model)
        self.sequence_summary = SequenceSummary(self._transformer_model.config)
    elif 'albert' in pretrained_model:
        self._transformer_model = AlbertModel.from_pretrained(pretrained_model)
        self._padding_value = 0  # The index of the BERT padding token
        self._dropout = torch.nn.Dropout(self._transformer_model.config.hidden_dropout_prob)
    elif 'bert' in pretrained_model:
        self._transformer_model = BertModel.from_pretrained(pretrained_model)
        self._padding_value = 0  # The index of the BERT padding token
        self._dropout = torch.nn.Dropout(self._transformer_model.config.hidden_dropout_prob)
    else:
        raise ValueError(f"Unsupported pretrained model: {pretrained_model}")

    if probe_type == 'MLP':
        layer_freeze_regexes = ["embeddings", "encoder"]

    for name, param in self._transformer_model.named_parameters():
        if layer_freeze_regexes and requires_grad:
            grad = not any(bool(re.search(r, name)) for r in layer_freeze_regexes)
        else:
            grad = requires_grad
        param.requires_grad = grad

    transformer_config = self._transformer_model.config
    transformer_config.num_labels = 1
    self._output_dim = self._transformer_model.config.hidden_size

    # unifying the classification layer across all models
    self._classifier = Linear(self._output_dim, 1)
    self._classifier.weight.data.normal_(mean=0.0, std=0.02)
    self._classifier.bias.data.zero_()

    self._accuracy = CategoricalAccuracy()
    self._loss = torch.nn.CrossEntropyLoss()
    self._debug = 2
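# Illustrative sketch (added; not part of the original): the 'MLP' probe configuration above
# freezes the transformer body ("embeddings" and "encoder") and trains only a freshly
# initialized single-output linear scorer on top of the pooled representation. The model
# choice and variable names here are illustrative.
import torch
from torch.nn import Linear
from transformers import BertModel

model = BertModel.from_pretrained("bert-base-uncased")
for name, param in model.named_parameters():
    param.requires_grad = not ("embeddings" in name or "encoder" in name)

scorer = Linear(model.config.hidden_size, 1)
scorer.weight.data.normal_(mean=0.0, std=0.02)  # same init as the probe's classifier head
scorer.bias.data.zero_()

still_trainable = [n for n, p in model.named_parameters() if p.requires_grad]
print("still trainable in the body:", still_trainable)  # typically only the pooler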
def wsd(
        model_name='bert-base-uncased',  # ensemble-distil-1-albert-1 / albert-xxlarge-v2 / bert-base-uncased
        classifier_input='token-embedding-last-1-layers',  # token-embedding-last-layer / token-embedding-last-n-layers
        classifier_hidden_layers=[],
        reduce_options=True,
        freeze_base_model=True,
        max_len=512,
        batch_size=32,
        test=False,
        lr=5e-5,
        eps=1e-8,
        n_epochs=50,
        cls_token=False,  # If true, the cls token is used instead of the relevant-word token
        cache_embeddings=False,  # If true, the embeddings from the base model are saved to disk so that they only need to be computed once
        save_classifier=True  # If true, the classifier part of the network is saved after each epoch, and training is automatically resumed from this saved network if it exists
):
    train_path = "wsd_train.txt"
    test_path = "wsd_test_blind.txt"
    n_classes = 222
    device = 'cuda'

    import __main__ as main
    print("Script: " + os.path.basename(main.__file__))

    print("Loading base model %s..." % model_name)
    if model_name.startswith('ensemble-distil-'):
        last_n_distil = int(model_name.replace('ensemble-distil-', "")[0])
        last_n_albert = int(model_name[-1])
        from transformers import AlbertTokenizer
        from transformers.modeling_albert import AlbertModel
        base_model = AlbertModel.from_pretrained('albert-xxlarge-v2',
                                                 output_hidden_states=True,
                                                 output_attentions=False)
        tokenizer = AlbertTokenizer.from_pretrained('albert-xxlarge-v2')
        print("Ensemble model with DistilBert last %d layers and Albert last %d layers"
              % (last_n_distil, last_n_albert))
    elif model_name.startswith('distilbert'):
        tokenizer = DistilBertTokenizer.from_pretrained(model_name)
        base_model = DistilBertModel.from_pretrained(model_name,
                                                     num_labels=n_classes,
                                                     output_hidden_states=True,
                                                     output_attentions=False)
    elif model_name.startswith('bert'):
        from transformers import BertTokenizer, BertModel
        tokenizer = BertTokenizer.from_pretrained(model_name)
        base_model = BertModel.from_pretrained(model_name,
                                               num_labels=n_classes,
                                               output_hidden_states=True,
                                               output_attentions=False)
    elif model_name.startswith('albert'):
        from transformers import AlbertTokenizer
        from transformers.modeling_albert import AlbertModel
        tokenizer = AlbertTokenizer.from_pretrained(model_name)
        base_model = AlbertModel.from_pretrained(model_name,
                                                 output_hidden_states=True,
                                                 output_attentions=False)

    use_n_last_layers = 1
    if classifier_input == 'token-embedding-last-layer':
        use_n_last_layers = 1
    elif classifier_input.startswith('token-embedding-last-') and classifier_input.endswith('-layers'):
        use_n_last_layers = int(classifier_input.replace('token-embedding-last-', "").replace('-layers', ""))
    else:
        raise ValueError("Invalid classifier_input argument")
    print("Using the last %d layers" % use_n_last_layers)

    def tokenize(str):
        return tokenizer.tokenize(str)[:max_len - 2]

    SENSE = LabelField(is_target=True)
    LEMMA = LabelField()
    TOKEN_POS = LabelField(use_vocab=False)
    TEXT = Field(tokenize=tokenize,
                 pad_token=tokenizer.pad_token,
                 init_token=tokenizer.cls_token,
                 eos_token=tokenizer.sep_token)
    EXAMPLE_ID = LabelField(use_vocab=False)
    fields = [('sense', SENSE), ('lemma', LEMMA), ('token_pos', TOKEN_POS),
              ('text', TEXT), ('example_id', EXAMPLE_ID)]

    def read_data(corpus_file, fields, max_len=None):
        train_id_start = 0
        test_id_start = 76049  # let the ids for the test examples start after the training example indices
        if corpus_file == "wsd_test_blind.txt":
            print("Loading test data...")
            id_start = test_id_start
        else:
            print("Loading train/val data...")
            id_start = train_id_start

        with open(corpus_file, encoding='utf-8') as f:
            examples = []
            for i, line in enumerate(f):
                sense, lemma, word_position, text = line.split('\t')
                # We need to convert from the word position to the token position
                words = text.split()
                pre_word = " ".join(words[:int(word_position)])
                pre_word_tokenized = tokenizer.tokenize(pre_word)
                token_position = len(pre_word_tokenized) + 1  # taking into account the later addition of the start token
                example_id = id_start + i
                if max_len is None or token_position < max_len - 1:  # ignore examples where the relevant token is cut off due to max_len
                    if cls_token:
                        token_position = 0
                    examples.append(Example.fromlist(
                        [sense, lemma, token_position, text, example_id], fields))
                else:
                    print("Example %d is skipped because the relevant token was cut off (token pos = %d)"
                          % (example_id, token_position))
                    print(text)
        return Dataset(examples, fields)

    dataset = read_data(train_path, fields, max_len)
    random.seed(0)
    trn, vld = dataset.split(0.7, stratified=True, strata_field='sense')

    TEXT.build_vocab([])
    if model_name.startswith('albert') or model_name.startswith('ensemble-distil-'):
        class Mapping:
            def __init__(self, fn):
                self.fn = fn

            def __getitem__(self, item):
                return self.fn(item)

        TEXT.vocab.stoi = Mapping(tokenizer.sp_model.PieceToId)
        TEXT.vocab.itos = Mapping(tokenizer.sp_model.IdToPiece)
    else:
        TEXT.vocab.stoi = tokenizer.vocab
        TEXT.vocab.itos = list(tokenizer.vocab)
    SENSE.build_vocab(trn)
    LEMMA.build_vocab(trn)

    trn_iter = BucketIterator(trn,
                              device=device,
                              batch_size=batch_size,
                              sort_key=lambda x: len(x.text),
                              repeat=False,
                              train=True,
                              sort=True)
    vld_iter = BucketIterator(vld,
                              device=device,
                              batch_size=batch_size,
                              sort_key=lambda x: len(x.text),
                              repeat=False,
                              train=False,
                              sort=True)

    if freeze_base_model:
        for mat in base_model.parameters():
            mat.requires_grad = False  # Freeze the base model so that we only train the classifier on top

    if reduce_options:
        lemma_mask = defaultdict(lambda: torch.zeros(len(SENSE.vocab), device=device))
        for example in trn:
            lemma = LEMMA.vocab.stoi[example.lemma]
            sense = SENSE.vocab.stoi[example.sense]
            lemma_mask[lemma][sense] = 1
        lemma_mask = dict(lemma_mask)

        def mask(batch_logits, batch_lemmas):
            # Masks out the senses that do not belong to the specified lemma
            for batch_i in range(len(batch_logits)):
                lemma = batch_lemmas[batch_i].item()
                batch_logits[batch_i, :] *= lemma_mask[lemma]
            return batch_logits
    else:
        def mask(batch_logits, batch_lemmas):
            return batch_logits

    experiment_name = model_name + " " + (
        classifier_input if not model_name.startswith('ensemble-distil-') else ""
    ) + " " + str(classifier_hidden_layers) + " (" + (
        " cls_token" if cls_token else ""
    ) + (" reduce_options" if reduce_options else "") + (
        " freeze_base_model" if freeze_base_model else ""
    ) + " ) " + "max_len=" + str(max_len) + " batch_size=" + str(batch_size) + \
        " lr=" + str(lr) + " eps=" + str(eps) + (" cache_embeddings" if cache_embeddings else "")

    if model_name.startswith('ensemble-distil-'):
        model = WSDEnsembleModel(last_n_distil, last_n_albert, n_classes, mask, classifier_hidden_layers)
    else:
        model = WSDModel(base_model, n_classes, mask, use_n_last_layers,
                         model_name, classifier_hidden_layers, cache_embeddings)

    history = None
    # if save_classifier:
    #     if model.load_classifier(experiment_name):
    #         # Existing saved model loaded
    #         # Also load the corresponding training history
    #         history = read_dict_file("results/" + experiment_name + ".txt")

    model.cuda()

    print("Starting experiment " + experiment_name)

    if test:
        tst = read_data(test_path, fields, max_len=512)
        tst_iter = Iterator(tst,
                            device=device,
                            batch_size=batch_size,
                            sort=False,
                            sort_within_batch=False,
                            repeat=False,
                            train=False)
        batch_predictions = []
        for batch in tst_iter:
            print('.', end='')
            sys.stdout.flush()
            text = batch.text.t()
            with torch.no_grad():
                outputs = model(text,
                                token_positions=batch.token_pos,
                                lemmas=batch.lemma,
                                example_ids=batch.example_id)
                scores = outputs[-1]
            batch_predictions.append(scores.argmax(dim=1))
        batch_preds = torch.cat(batch_predictions, 0).tolist()
        predicted_senses = [SENSE.vocab.itos[pred] for pred in batch_preds]
        with open("test_predictions/" + experiment_name + ".txt", "w") as out:
            out.write("\n".join(predicted_senses))
    else:
        no_decay = ['bias', 'LayerNorm.weight']
        decay = 0.01
        optimizer_grouped_parameters = [{
            'params': [p for n, p in model.named_parameters()
                       if not any(nd in n for nd in no_decay)],
            'weight_decay': decay
        }, {
            'params': [p for n, p in model.named_parameters()
                       if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0
        }]
        optimizer = AdamW(optimizer_grouped_parameters, lr=lr, eps=eps)

        def save_results(history):
            with open("results/" + experiment_name + ".txt", "w") as out:
                out.write(str(history))
            if save_classifier:
                if len(history['val_acc']) < 2 or history['val_acc'][-1] > max(history['val_acc'][:-1]):
                    model.save_classifier(experiment_name, best=True)
                else:
                    model.save_classifier(experiment_name, best=False)

        train(model, optimizer, trn_iter, vld_iter, n_epochs, save_results, history)
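# Usage sketch (added; not part of the original): typical invocations of wsd() above. The
# hyperparameter values are illustrative, not tuned settings from the original experiments.
if __name__ == "__main__":
    # Train a frozen-ALBERT probe that classifies senses from the last 4 hidden layers
    wsd(model_name='albert-xxlarge-v2',
        classifier_input='token-embedding-last-4-layers',
        classifier_hidden_layers=[512],
        freeze_base_model=True,
        batch_size=16,
        n_epochs=10)

    # Produce predictions on the blind test set with a fine-tuned BERT base model
    wsd(model_name='bert-base-uncased',
        classifier_input='token-embedding-last-layer',
        freeze_base_model=False,
        test=True)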