def __init__(self, config: dict): """ SemEval Model using Transformers. Args: config: configuration parameters """ super().__init__() self.gradient_acc_steps = config.get("gradient_acc_steps") self.transformer = config.get("transformer") self.config = config self.data_loader = SemEvalDataloader(self.config) logger.info( f"Loaded {len(self.data_loader.train_generator)} fine-tuning samples." ) self.tokenizer = self.data_loader.tokenizer self.tokenizer.convert_tokens_to_ids("[E1]") self.tokenizer.convert_tokens_to_ids("[E2]") self.model = BertModel.from_pretrained( model_size=self.config.get("transformer"), force_download=False, pretrained_model_name_or_path=self.config.get("transformer"), task="classification", n_classes=self.data_loader.n_classes, ) self.model.resize_token_embeddings(len(self.tokenizer)) pretrained_mtb_model = self.config.get("pretrained_mtb_model", None) if pretrained_mtb_model and os.path.isfile(pretrained_mtb_model): self._load_pretrained_model(pretrained_mtb_model) self.train_on_gpu = torch.cuda.is_available() and config.get( "use_gpu", True) if self.train_on_gpu: self.model.cuda() self.criterion = CrossEntropyLoss(reduction="sum") self._start_epoch = 0 self._train_loss = [] self._train_acc = [] self._test_f1 = [] self._test_acc = [] self._best_test_f1 = 0 self.checkpoint_dir = os.path.join("models", "finetuning", "sem_eval", self.transformer) Path(self.checkpoint_dir).mkdir(parents=True, exist_ok=True) self._points_seen = 0
def __init__(self, config: dict): """ Matching the Blanks Model. Args: config: configuration parameters """ super().__init__() self.experiment_name = config.get("experiment_name") self.transformer = config.get("transformer") self.config = config self.data_loader = MTBPretrainDataLoader(self.config) self.train_len = len(self.data_loader.train_generator) logger.info("Loaded %d pre-training samples." % self.train_len) self.model = BertModel.from_pretrained( model_size=self.transformer, pretrained_model_name_or_path=self.transformer, force_download=False, ) self.tokenizer = self.data_loader.tokenizer self.model.resize_token_embeddings(len(self.tokenizer)) e1_id = self.tokenizer.convert_tokens_to_ids("[E1]") e2_id = self.tokenizer.convert_tokens_to_ids("[E2]") if e1_id == e2_id == 1: raise ValueError("e1_id == e2_id == 1") self.train_on_gpu = torch.cuda.is_available() and config.get( "use_gpu", True) if self.train_on_gpu: logger.info("Train on GPU") self.model.cuda() self.criterion = MTBLoss(lm_ignore_idx=self.tokenizer.pad_token_id, ) no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [ p for n, p in self.model.named_parameters() if not any(nd in n for nd in no_decay) ], "weight_decay": 0.01, }, { "params": [ p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay) ], "weight_decay": 0.0, }, ] self.optimizer = AdamW(optimizer_grouped_parameters, lr=self.config.get("lr")) ovr_steps = (self.config.get("epochs") * len(self.data_loader.train_generator) * self.config.get("max_size") * 2 / self.config.get("batch_size")) self.scheduler = get_linear_schedule_with_warmup( self.optimizer, ovr_steps // 10, ovr_steps) self._start_epoch = 0 self._best_mtb_bce = 50 self._train_loss = [] self._train_lm_acc = [] self._lm_acc = [] self._mtb_bce = [] self.checkpoint_dir = os.path.join("models", "MTB-pretraining", self.experiment_name, self.transformer) Path(self.checkpoint_dir).mkdir(parents=True, exist_ok=True) self._batch_points_seen = 0 self._points_seen = 0
def __init__(self, args=None, detect_entities=False):
    if args is None:
        self.args = load_pickle("args.pkl")
    else:
        self.args = args
    self.cuda = torch.cuda.is_available()

    # Optional spaCy pipeline for detecting entity spans in raw text.
    self.detect_entities = detect_entities
    if self.detect_entities:
        self.nlp = spacy.load("en_core_web_lg")
    else:
        self.nlp = None
    # NER labels that are considered when extracting entity pairs.
    self.entities_of_interest = [
        "PERSON",
        "NORP",
        "FAC",
        "ORG",
        "GPE",
        "LOC",
        "PRODUCT",
        "EVENT",
        "WORK_OF_ART",
        "LAW",
        "LANGUAGE",
        "PER",
    ]

    logger.info("Loading tokenizer and model...")
    from .train_funcs import load_state

    if self.args.model_no == 0:
        from model.bert import BertModel as Model

        model = self.args.model_size  # e.g. 'bert-base-uncased'
        model_name = "BERT"
        self.net = Model.from_pretrained(
            model,
            force_download=False,
            model_size=self.args.model_size,
            task="classification",
            n_classes_=self.args.num_classes,
        )
    elif self.args.model_no == 1:
        from model.albert.albert import AlbertModel as Model

        model = self.args.model_size  # e.g. 'albert-base-v2'
        model_name = "BERT"  # loads the tokenizer pickle saved under the BERT name
        self.net = Model.from_pretrained(
            model,
            force_download=False,
            model_size=self.args.model_size,
            task="classification",
            n_classes_=self.args.num_classes,
        )
    elif self.args.model_no == 2:  # BioBERT
        from model.bert import BertModel, BertConfig

        model = "bert-base-uncased"
        model_name = "BioBERT"
        config = BertConfig.from_pretrained(
            "./additional_models/biobert_v1.1_pubmed/bert_config.json"
        )
        self.net = BertModel.from_pretrained(
            pretrained_model_name_or_path="./additional_models/biobert_v1.1_pubmed/biobert_v1.1_pubmed.bin",
            config=config,
            force_download=False,
            model_size="bert-base-uncased",
            task="classification",
            n_classes_=self.args.num_classes,
        )

    self.tokenizer = load_pickle("%s_tokenizer.pkl" % model_name)
    self.net.resize_token_embeddings(len(self.tokenizer))
    if self.cuda:
        self.net.cuda()
    # Restore the fine-tuned weights from the saved checkpoint.
    start_epoch, best_pred, amp_checkpoint = load_state(
        self.net, None, None, self.args, load_best=False
    )
    logger.info("Done!")

    # Special-token ids used when building model inputs.
    self.e1_id = self.tokenizer.convert_tokens_to_ids("[E1]")
    self.e2_id = self.tokenizer.convert_tokens_to_ids("[E2]")
    self.pad_id = self.tokenizer.pad_token_id
    self.rm = load_pickle("relations.pkl")
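# A minimal, self-contained sketch of the entity-detection path above: run a
# spaCy pipeline over raw text and keep only entities whose labels appear in
# the interest list. It assumes an English spaCy model is installed (the code
# above uses "en_core_web_lg"; the smaller "en_core_web_sm" behaves the same
# for this illustration).
import spacy

entities_of_interest = {"PERSON", "ORG", "GPE", "LOC", "PRODUCT", "EVENT"}

nlp = spacy.load("en_core_web_sm")
doc = nlp("Barack Obama was born in Hawaii and later moved to Washington.")
# ent.label_ is the entity type predicted by spaCy's NER component.
candidates = [
    (ent.text, ent.label_) for ent in doc.ents if ent.label_ in entities_of_interest
]
print(candidates)  # e.g. [('Barack Obama', 'PERSON'), ('Hawaii', 'GPE'), ...]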
def __init__(self, args=None):
    if args is None:
        self.args = load_pickle("args.pkl")
    else:
        self.args = args
    self.cuda = torch.cuda.is_available()

    if self.args.model_no == 0:
        from model.bert import BertModel as Model
        from model.bert_tokenizer import BertTokenizer as Tokenizer

        model = self.args.model_size  # e.g. 'bert-base-uncased' or 'bert-large-uncased'
        model_name = "BERT"
        self.net = Model.from_pretrained(
            model,
            force_download=False,
            model_size=self.args.model_size,
            task="fewrel",
        )
    elif self.args.model_no == 1:
        from model.albert.albert import AlbertModel as Model
        from model.albert.albert_tokenizer import AlbertTokenizer as Tokenizer

        model = self.args.model_size  # e.g. 'albert-base-v2'
        model_name = "BERT"  # saves/loads the tokenizer pickle under the BERT name
        self.net = Model.from_pretrained(
            model,
            force_download=False,
            model_size=self.args.model_size,
            task="fewrel",
        )
    elif self.args.model_no == 2:  # BioBERT
        from model.bert import BertModel, BertConfig
        from model.bert_tokenizer import BertTokenizer as Tokenizer

        model = "bert-base-uncased"
        model_name = "BioBERT"
        config = BertConfig.from_pretrained(
            "./additional_models/biobert_v1.1_pubmed/bert_config.json"
        )
        self.net = BertModel.from_pretrained(
            pretrained_model_name_or_path="./additional_models/biobert_v1.1_pubmed/biobert_v1.1_pubmed.bin",
            config=config,
            force_download=False,
            model_size="bert-base-uncased",
            task="fewrel",
        )

    # Reuse a previously saved tokenizer if one exists; otherwise create it and
    # register the entity-marker and [BLANK] tokens.
    if os.path.isfile("./data/%s_tokenizer.pkl" % model_name):
        self.tokenizer = load_pickle("%s_tokenizer.pkl" % model_name)
        logger.info("Loaded tokenizer from saved file.")
    else:
        logger.info("Saved tokenizer not found, initializing new tokenizer...")
        if self.args.model_no == 2:
            self.tokenizer = Tokenizer(
                vocab_file="./additional_models/biobert_v1.1_pubmed/vocab.txt",
                do_lower_case=False,
            )
        else:
            self.tokenizer = Tokenizer.from_pretrained(model, do_lower_case=False)
        self.tokenizer.add_tokens(["[E1]", "[/E1]", "[E2]", "[/E2]", "[BLANK]"])
        save_as_pickle("%s_tokenizer.pkl" % model_name, self.tokenizer)
        logger.info(
            "Saved %s tokenizer at ./data/%s_tokenizer.pkl"
            % (model_name, model_name)
        )

    self.net.resize_token_embeddings(len(self.tokenizer))
    self.pad_id = self.tokenizer.pad_token_id
    if self.cuda:
        self.net.cuda()

    if self.args.use_pretrained_blanks == 1:
        logger.info(
            "Loading model pre-trained on blanks at ./data/test_checkpoint_%d.pth.tar..."
            % self.args.model_no
        )
        checkpoint_path = "./data/test_checkpoint_%d.pth.tar" % self.args.model_no
        checkpoint = torch.load(checkpoint_path)
        model_dict = self.net.state_dict()
        # Keep only the pre-trained weights whose names exist in the current model.
        pretrained_dict = {
            k: v
            for k, v in checkpoint["state_dict"].items()
            if k in model_dict.keys()
        }
        model_dict.update(pretrained_dict)
        self.net.load_state_dict(pretrained_dict, strict=False)
        del checkpoint, pretrained_dict, model_dict

    logger.info("Loading Fewrel dataloaders...")
    self.train_loader, _, self.train_length, _ = load_dataloaders(self.args)
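# A minimal, self-contained sketch of the partial checkpoint-loading pattern
# used above: keep only saved weights whose names exist in the current model
# and load them with strict=False, so layers absent from the checkpoint (e.g. a
# freshly initialized task head) keep their new initialization. The tiny
# modules below stand in for the real network and checkpoint (normally obtained
# via torch.load on a .pth.tar file).
import torch.nn as nn

net = nn.Sequential(nn.Linear(8, 8), nn.Linear(8, 2))  # current model
checkpoint = {"state_dict": nn.Sequential(nn.Linear(8, 8)).state_dict()}  # pre-trained weights

model_dict = net.state_dict()
# Keep only the weights whose names exist in the current model.
pretrained_dict = {
    k: v for k, v in checkpoint["state_dict"].items() if k in model_dict
}
# strict=False tolerates the parameters that the checkpoint does not provide.
net.load_state_dict(pretrained_dict, strict=False)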