def __init__(self, num_classes=1):
    super(NaiveSummarizer, self).__init__()
    self.tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
    self.tokenizer.padding_side = 'left'
    self.embedder = LSTM(self.tokenizer.vocab_size)
    self.lstm = nn.LSTM(128, 64, 1, batch_first=True, bidirectional=False)
    self.fc = nn.Linear(64, num_classes)
def get_tokenizer(model_type='BERT'):
    if model_type == 'distilBERT':
        tokenizer = DistilBertTokenizerFast.from_pretrained(
            'distilbert-base-uncased')
    elif model_type == 'BERT':
        tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
    elif model_type == 'alBERT':
        tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
    else:
        # Fail loudly instead of returning an undefined variable.
        raise ValueError(f'model_type not allowed: {model_type}')
    return tokenizer
def create_tokenizer(model_type: str) -> PreTrainedTokenizer:
    if model_type == "albert":
        return AlbertTokenizer.from_pretrained("albert-base-v2")
    elif model_type == "bert":
        return BertTokenizer.from_pretrained("bert-base-uncased")
    elif model_type == "electra":
        # The BERT tokenizer is reused for the ELECTRA branch.
        return BertTokenizer.from_pretrained("bert-base-uncased")
    else:
        raise ValueError(
            f"model_type={model_type} must be one of ['albert', 'bert', 'electra']"
        )
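# A minimal usage sketch for the factory above, assuming transformers and
# PyTorch are installed; the sample sentence is illustrative only.
from transformers import AlbertTokenizer, BertTokenizer, PreTrainedTokenizer

tokenizer = create_tokenizer("albert")
encoded = tokenizer("ALBERT shares parameters across its transformer layers.",
                    truncation=True, max_length=128, return_tensors="pt")
print(encoded["input_ids"].shape)  # e.g. torch.Size([1, seq_len])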
def main(args):
    with open(args.config) as fp:
        data = json.loads(fp.read())
    config = AlbertConfig(**data)
    model = AlbertForMaskedLM(config)
    model: AlbertForMaskedLM = load_tf_weights_in_albert(
        model, config, args.checkpoint)
    model.save_pretrained(args.output)
    tokenizer = AlbertTokenizer.from_pretrained(args.spiece, keep_accents=True)
    tokenizer.save_pretrained(args.output)
def getAlBertEmbeddings(self):
    model = AlbertModel.from_pretrained('albert-base-v2')
    tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
    model.eval()
    tokens_tensor, segments_tensors = self.getIndexs(tokenizer)
    with torch.no_grad():
        last_hidden_states = model(tokens_tensor,
                                   attention_mask=segments_tensors)
    # Use the [CLS] position of the last hidden state as the sentence feature.
    features = last_hidden_states[0][:, 0, :].numpy()
    features = np.reshape(features, features.shape[1])
    return features.tolist()
def init_model(cachedir='~/hashtag/', no_cuda=True):
    global tokenizer, model
    f_cachedir = os.path.expanduser(cachedir)
    bert_config = AlbertConfig.from_pretrained(f_cachedir)
    model = HashtagClassifier.from_pretrained(f_cachedir, config=bert_config)
    device = "cuda" if torch.cuda.is_available() and not no_cuda else "cpu"
    model.to(device)
    model.eval()
    tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
def get_bert(bert_model, bert_do_lower_case):
    # Avoid a hard dependency on BERT by only importing it if it's being used
    from transformers import AlbertTokenizer, BertModel

    model = BertModel.from_pretrained('huseinzol05/bert-base-bahasa-cased')
    tokenizer = AlbertTokenizer.from_pretrained(
        'huseinzol05/bert-base-bahasa-cased',
        unk_token='[UNK]',
        pad_token='[PAD]',
        do_lower_case=False,
    )
    return tokenizer, model
def download_albert_base():
    file = '../input/albert-base-v2'
    config = AlbertConfig.from_pretrained('albert-base-v2')
    config.save_pretrained(file)
    model = AlbertModel.from_pretrained('albert-base-v2')
    model.save_pretrained(file)
    tkn = AlbertTokenizer.from_pretrained('albert-base-v2')
    tkn.save_pretrained(file)
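# A minimal reload sketch: once the config, weights, and SentencePiece
# vocabulary are saved locally as above, from_pretrained() accepts the local
# directory, so the model can be loaded offline. The path below is the one
# assumed by download_albert_base().
from transformers import AlbertConfig, AlbertModel, AlbertTokenizer

local_dir = '../input/albert-base-v2'  # directory written by download_albert_base()
config = AlbertConfig.from_pretrained(local_dir)
model = AlbertModel.from_pretrained(local_dir, config=config)
tokenizer = AlbertTokenizer.from_pretrained(local_dir)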
def _test(_hparams):
    model = BERT.load_from_checkpoint(
        checkpoint_path=_hparams.weight_path,
        tags_csv=_hparams.cfg_path
    )
    print('model loaded.')
    model.eval()
    model.freeze()

    if _hparams.pretrained_model.startswith('distilbert'):
        tokenizer = DistilBertTokenizer.from_pretrained(_hparams.pretrained_model)
    elif _hparams.pretrained_model.startswith('bert'):
        tokenizer = BertTokenizer.from_pretrained(_hparams.pretrained_model)
    elif _hparams.pretrained_model.startswith('albert'):
        tokenizer = AlbertTokenizer.from_pretrained(_hparams.pretrained_model)
    else:
        raise ValueError('Unrecognized model name.')

    y_all, y_hat_all = [], []
    error_analysis_f = None
    if _hparams.error_analysis:
        error_analysis_f = open(MiningConfig.error_analysis_path %
                                (_hparams.name, _hparams.dataset), 'w')

    for input_ids, attention_mask, token_type_ids, y in model.test_dataloader():
        y_hat, attn = model(input_ids, attention_mask, token_type_ids)
        a, y_hat = torch.max(y_hat, dim=1)
        for i in range(input_ids.size(0)):
            y_single = y.cpu().numpy()[i]
            y_hat_single = y_hat.cpu().numpy()[i]
            text = tokenizer.decode(input_ids[i]). \
                replace('[CLS]', '').replace('[SEP]', '').replace('[PAD]', ''). \
                replace('\t', '').replace('\n', '').strip()
            y_all.append(y_single)
            y_hat_all.append(y_hat_single)
            if 'STANCE' not in _hparams.dataset:
                if _hparams.error_analysis:
                    if y_single == 0 and y_hat_single == 1:
                        error_analysis_f.write('FN' + '\t' + text + '\n')
                    if y_single == 1 and y_hat_single == 0:
                        error_analysis_f.write('FP' + '\t' + text + '\n')
            else:
                if _hparams.error_analysis:
                    if y_single != y_hat_single:
                        error_analysis_f.write(
                            '%s-->%s' % (LABEL_MAP['STANCE'][y_single],
                                         LABEL_MAP['STANCE'][y_hat_single])
                            + '\t' + text + '\n')

    if _hparams.error_analysis:
        error_analysis_f.close()

    test_acc = accuracy_score(y_all, y_hat_all)
    test_f1 = f1_score(y_all, y_hat_all, average='macro')
    print(test_acc, test_f1)
def _test_TFAlbert(self, size, large=False):
    from transformers import AlbertTokenizer, TFAlbertModel
    tokenizer = AlbertTokenizer.from_pretrained(size)
    model = TFAlbertModel.from_pretrained(size)
    input_dict = tokenizer("Hello, my dog is cute", return_tensors="tf")
    spec, input_dict = self.spec_and_pad(input_dict)
    outputs = ["last_hidden_state"]
    self.run_test(model, input_dict, input_signature=spec, outputs=outputs,
                  large=large)
def init_predict_tokenizer(self, tokenizer: PreTrainedTokenizer, ckpt: str) -> None:
    self.inf_session.tokenizer = tokenizer if tokenizer else \
        AlbertTokenizer.from_pretrained(self.inf_session.config.model.sp_model,
                                        max_len=self.inf_session.config.data_source.max_seq_length,
                                        truncation=True)
    self.inf_session.special_token_mask = [
        self.inf_session.tokenizer.unk_token_id,
        self.inf_session.tokenizer.sep_token_id,
        self.inf_session.tokenizer.pad_token_id,
        self.inf_session.tokenizer.cls_token_id
    ]
    logger.info(f'Predictions from model weights: {ckpt}')
def __init__(self, in_dim, hidden_dim, out_dim, num_heads, num_classes=2):
    super(BasicSummarizer, self).__init__()
    self.tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
    self.tokenizer.padding_side = 'left'
    self.embedder = LSTM(self.tokenizer.vocab_size)
    self.gat_classifier = GATClassifier(in_dim, hidden_dim, out_dim,
                                        num_heads, num_classes)
def add_transformers_vocab(vocab, tokenizer_name):
    """Add vocabulary from tokenizers in transformers for use with pre-tokenized data.

    These tokenizers have a convert_tokens_to_ids method, but this doesn't do
    anything special, so we can just use the standard indexers.
    """
    do_lower_case = "uncased" in tokenizer_name
    log.info('In add_transformers_vocab')
    log.info(tokenizer_name)
    if tokenizer_name.startswith("bert-") or 'rubert' in tokenizer_name \
            or '/bert-' in tokenizer_name:
        tokenizer = BertTokenizer.from_pretrained(tokenizer_name,
                                                  do_lower_case=do_lower_case)
    elif tokenizer_name.startswith("roberta-"):  # or 'roberta' in tokenizer_name:
        tokenizer = RobertaTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("albert-"):
        tokenizer = AlbertTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("xlnet-"):
        tokenizer = XLNetTokenizer.from_pretrained(tokenizer_name,
                                                   do_lower_case=do_lower_case)
    elif tokenizer_name.startswith("openai-gpt"):
        tokenizer = OpenAIGPTTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("gpt2") or 'gpt' in tokenizer_name:
        tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("transfo-xl-"):
        tokenizer = TransfoXLTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("xlm-roberta"):
        tokenizer = XLMRobertaTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("xlm-"):
        tokenizer = XLMTokenizer.from_pretrained(tokenizer_name)

    if (tokenizer_name.startswith("openai-gpt")
            or tokenizer_name.startswith("gpt2")
            or tokenizer_name.startswith("transfo-xl-")):
        tokenizer.add_special_tokens({
            "bos_token": "<start>",
            "sep_token": "<delim>",
            "cls_token": "<extract>"
        })
    # TODO: this is another place that can be simplified by the
    # "model-before-preprocess" reorganization; we can pass the tokenizer
    # created in the model here, see issue <TBD>
    vocab_size = len(tokenizer)  # do not use tokenizer.vocab_size, it does not include newly added tokens
    ordered_vocab = tokenizer.convert_ids_to_tokens(range(vocab_size))
    log.info("Added transformers vocab (%s): %d tokens",
             tokenizer_name, len(ordered_vocab))
    for word in ordered_vocab:
        vocab.add_token_to_namespace(
            word, input_module_tokenizer_name(tokenizer_name))
def test_tokenization_albert(self):
    # Given
    self.base_tokenizer = AlbertTokenizer.from_pretrained(
        'albert-base-v2', do_lower_case=True, cache_dir=self.test_dir)
    self.rust_tokenizer = PyAlbertTokenizer(
        get_from_cache(
            self.base_tokenizer.pretrained_vocab_files_map['vocab_file']['albert-base-v2']),
        do_lower_case=True,
        strip_accents=True)
    output_baseline = []
    for example in self.examples:
        output_baseline.append(
            self.base_tokenizer.encode_plus(
                example.text_a,
                add_special_tokens=True,
                return_overflowing_tokens=True,
                return_special_tokens_mask=True,
                max_length=128))

    # When
    # Note: the original sentence piece tokenizer strips trailing spaces
    output_rust = self.rust_tokenizer.encode_list(
        [example.text_a.strip() for example in self.examples],
        max_len=256,
        truncation_strategy='longest_first',
        stride=0)

    # Then
    for idx, (rust, baseline) in enumerate(zip(output_rust, output_baseline)):
        if rust.token_ids != baseline['input_ids']:
            if len(rust.token_ids) == len(baseline['input_ids']):
                if Counter(rust.token_ids) != Counter(baseline['input_ids']):
                    raise AssertionError(
                        f'Difference in tokenization for {self.rust_tokenizer.__class__}: \n '
                        f'Sentence a: {self.examples[idx].text_a} \n'
                        f'Sentence b: {self.examples[idx].text_b} \n'
                        f'Token mismatch: {self.get_token_diff(rust.token_ids, baseline["input_ids"])} \n'
                        f'Rust: {rust.token_ids} \n'
                        f'Python {baseline["input_ids"]}')
            else:
                raise AssertionError(
                    f'Difference in tokenization for {self.rust_tokenizer.__class__}: \n '
                    f'Sentence a: {self.examples[idx].text_a} \n'
                    f'Sentence b: {self.examples[idx].text_b} \n'
                    f'Token mismatch: {self.get_token_diff(rust.token_ids, baseline["input_ids"])} \n'
                    f'Rust: {rust.token_ids} \n'
                    f'Python {baseline["input_ids"]}')
        assert rust.special_tokens_mask == baseline['special_tokens_mask']
def __init__(self, batch_size, epoch_num, model_name, is_test):
    self.BATCH_SIZE = batch_size
    self.EPOCHS = epoch_num
    self.NUM_LABELS = 4
    self.model_name = model_name

    if self.model_name == "bert":
        self.model_version = 'bert-base-cased'
        self.tokenizer = BertTokenizer.from_pretrained(self.model_version)
        if is_test:
            self.model = BertForSequenceClassification.from_pretrained(
                model_name + "_model", num_labels=self.NUM_LABELS)
        else:
            self.model = BertForSequenceClassification.from_pretrained(
                self.model_version, num_labels=self.NUM_LABELS)
    elif self.model_name == "robert":
        self.model_version = 'roberta-base'
        self.tokenizer = RobertaTokenizer.from_pretrained(self.model_version)
        if is_test:
            self.model = RobertaForSequenceClassification.from_pretrained(
                model_name + "_model", num_labels=self.NUM_LABELS)
        else:
            self.model = RobertaForSequenceClassification.from_pretrained(
                self.model_version, num_labels=self.NUM_LABELS)
    elif self.model_name == "albert":
        self.model_version = 'albert-base-v2'
        self.tokenizer = AlbertTokenizer.from_pretrained(self.model_version)
        if is_test:
            self.model = AlbertForSequenceClassification.from_pretrained(
                model_name + "_model", num_labels=self.NUM_LABELS)
        else:
            self.model = AlbertForSequenceClassification.from_pretrained(
                self.model_version, num_labels=self.NUM_LABELS)

    if is_test:
        self.testset = FakeNewsDataset("test", tokenizer=self.tokenizer)
        self.testloader = DataLoader(self.testset,
                                     batch_size=self.BATCH_SIZE,
                                     collate_fn=create_mini_batch)
    else:
        self.trainset = FakeNewsDataset("train", tokenizer=self.tokenizer)
        self.trainloader = DataLoader(self.trainset,
                                      batch_size=self.BATCH_SIZE,
                                      collate_fn=create_mini_batch)

    self.model.train()
    self.optimizer = torch.optim.Adam(self.model.parameters(), lr=1e-5)
    self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    self.model.to(self.device)
def encode_text(cls, tlc_text: np.array, sequence_length):
    tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
    instances_tlc = tokenizer.batch_encode_plus(
        tlc_text,
        max_length=sequence_length,
        pad_to_max_length=True,
        return_attention_masks=True,
        return_token_type_ids=False)
    input_ids_tlc = torch.tensor(instances_tlc['input_ids'], dtype=torch.int32)
    attention_mask_tlc = torch.tensor(instances_tlc['attention_mask'],
                                      dtype=torch.int32)
    return input_ids_tlc, attention_mask_tlc
def __init__(self, path='src/Bert',
             model_type='huseinzol05/bert-base-bahasa-cased'):
    self.path = path
    self.model_type = model_type
    self.tokenizer = AlbertTokenizer.from_pretrained(self.path,
                                                     do_lower_case=True)
    self.model = BertForSequenceClassification.from_pretrained(
        self.path, num_labels=3)
    self.device = "cuda" if torch.cuda.is_available() else "cpu"
    # self.device = "cpu"
    self.model.to(self.device)
    self.model.eval()
def compute_input_ids_masks_albert(train_set, val_test_set_emb, MAX_LEN):
    from transformers import AlbertTokenizer

    tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2',
                                                padding_side='left')

    text_batch_train = train_set['review'].apply(
        dataset_preprocessing.clean_text).to_list()
    encoding = tokenizer(text_batch_train, return_tensors='pt', padding=True,
                         truncation=True, max_length=MAX_LEN)
    train_inputs = encoding['input_ids']
    train_masks = encoding['attention_mask']

    text_batch_val = val_test_set_emb['review'].apply(
        dataset_preprocessing.clean_text).to_list()
    encoding = tokenizer(text_batch_val, return_tensors='pt', padding=True,
                         truncation=True, max_length=MAX_LEN)
    val_test_inputs = encoding['input_ids']
    val_test_masks = encoding['attention_mask']

    return (train_inputs.numpy(), train_masks.numpy(),
            val_test_inputs.numpy(), val_test_masks.numpy())
def __init__(self, args):
    self.args = args
    self.tokenizer = AlbertTokenizer.from_pretrained(
        'huseinzol05/bert-base-bahasa-cased',
        unk_token='[UNK]',
        pad_token='[PAD]',
        do_lower_case=False)
    self.sep_token = '[SEP]'
    self.cls_token = '[CLS]'
    self.pad_token = '[PAD]'
    self.tgt_bos = '[CLS]'
    self.tgt_eos = '[SEP]'
    self.tgt_sent_split = '[CLS]'
    self.sep_vid = self.tokenizer.vocab[self.sep_token]
    self.cls_vid = self.tokenizer.vocab[self.cls_token]
    self.pad_vid = self.tokenizer.vocab[self.pad_token]
def __init__(self, in_dim, hidden_dim, out_dim, num_heads, num_classes=2):
    super(Summarizer, self).__init__()
    albert_base_configuration = AlbertConfig(
        hidden_size=256,
        num_attention_heads=4,
        intermediate_size=1024,
    )
    self.tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
    self.embedder = AlbertModel(albert_base_configuration)
    self.gat_classifier = GATClassifier(in_dim, hidden_dim, out_dim,
                                        num_heads, num_classes)
def __init__(self, model_name, max_length, device):
    super(TransformerRLN, self).__init__()
    self.max_length = max_length
    self.device = device
    if model_name == 'albert':
        self.tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
        self.encoder = AlbertModel.from_pretrained('albert-base-v2')
    elif model_name == 'bert':
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.encoder = BertModel.from_pretrained('bert-base-uncased')
    else:
        raise NotImplementedError
    self.to(self.device)
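# A minimal sketch of driving a tokenizer/encoder pair like the one built in
# TransformerRLN above. The helper name encode_batch and the mean-pooling
# choice are illustrative assumptions, and a transformers version with
# return_dict outputs (>= 4.x) is assumed.
import torch
from transformers import AlbertModel, AlbertTokenizer


def encode_batch(sentences, max_length=128, device='cpu'):
    tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
    encoder = AlbertModel.from_pretrained('albert-base-v2').to(device).eval()
    batch = tokenizer(sentences, padding=True, truncation=True,
                      max_length=max_length, return_tensors='pt').to(device)
    with torch.no_grad():
        hidden = encoder(**batch).last_hidden_state       # (B, T, H)
    mask = batch['attention_mask'].unsqueeze(-1).float()  # (B, T, 1)
    return (hidden * mask).sum(dim=1) / mask.sum(dim=1)   # mean-pooled (B, H)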
def read_train_inputs(train_file, delimiter, max_len, max_negatives,
                      num_dev_samples):
    f = open(train_file, 'r', encoding='utf8')
    unique_entity_map, all_samples = {}, []
    tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')

    for line in f.readlines():
        info = line.strip().split('\t')
        sentence, entity_id, canonical_name, negative_samples = \
            info[0], info[1], info[2], info[3].split(delimiter)
        # Pad or truncate the negatives to exactly max_negatives entries.
        if len(negative_samples) < max_negatives:
            negative_samples = negative_samples + [''] * (
                max_negatives - len(negative_samples))
        else:
            negative_samples = negative_samples[:max_negatives]

        token_info = tokenizer.encode_plus(sentence, max_length=max_len,
                                           pad_to_max_length=True)
        sentence_tokens = token_info['input_ids']
        sentence_attention_mask = np.array(token_info['attention_mask'])

        negative_tokens, negative_attention_masks = [], []
        for negative_sample in negative_samples:
            negative_token_info = tokenizer.encode_plus(negative_sample,
                                                        max_length=max_len,
                                                        pad_to_max_length=True)
            negative_tokens.append(negative_token_info['input_ids'])
            negative_attention_masks.append(
                np.array(negative_token_info['attention_mask']))

        train_sample = TrainSample(sentence, entity_id, negative_samples,
                                   sentence_tokens, negative_tokens,
                                   sentence_attention_mask,
                                   negative_attention_masks)
        all_samples.append(train_sample)

        if entity_id not in unique_entity_map:
            entity_token_info = tokenizer.encode_plus(canonical_name,
                                                      max_length=max_len,
                                                      pad_to_max_length=True)
            new_entity = EntityObj(entity_id, canonical_name,
                                   entity_token_info['input_ids'],
                                   np.array(entity_token_info['attention_mask']))
            unique_entity_map[entity_id] = new_entity
            new_entity.utterances.append(sentence_tokens)
            new_entity.masks.append(sentence_attention_mask)
        else:
            unique_entity_map[entity_id].utterances.append(sentence_tokens)
            unique_entity_map[entity_id].masks.append(sentence_attention_mask)

    random.shuffle(all_samples)
    train_samples = all_samples[:len(all_samples) - num_dev_samples]
    dev_samples = all_samples[len(all_samples) - num_dev_samples:]
    return train_samples, dev_samples, unique_entity_map
def __init__(self, data_dir, task, max_len, bert_name, bert_type, mode='train'):
    self.mode = mode
    if bert_type == 'bert':
        self.tokenizer = BertTokenizer.from_pretrained(bert_name)
    elif bert_type == 'albert':
        self.tokenizer = AlbertTokenizer.from_pretrained(bert_name)
    self.data = self.convert_data(max_len)
    self.num_class = 3
def __init__(self, model_name_or_path: str, max_seq_length: int = 128,
             do_lower_case: bool = True):
    super(ALBERT, self).__init__()
    self.config_keys = ['max_seq_length', 'do_lower_case']
    self.do_lower_case = do_lower_case

    if max_seq_length > 510:
        logging.warning("BERT only allows a max_seq_length of 510 (512 with "
                        "special tokens). Value will be set to 510")
        max_seq_length = 510
    self.max_seq_length = max_seq_length

    self.bert = AlbertModel.from_pretrained(model_name_or_path)
    self.tokenizer = AlbertTokenizer.from_pretrained(model_name_or_path,
                                                     do_lower_case=do_lower_case)
    self.cls_token_id = self.tokenizer.convert_tokens_to_ids(
        [self.tokenizer.cls_token])[0]
    self.sep_token_id = self.tokenizer.convert_tokens_to_ids(
        [self.tokenizer.sep_token])[0]
def train(args):
    wandb.init(config=args, project='CXR-BERT')
    set_seed(args.seed)

    # TODO: bert-base, small, tiny tokenizer
    if args.bert_model == "albert-base-v2":
        tokenizer = AlbertTokenizer.from_pretrained(
            args.bert_model, do_lower_case=True).tokenize
    elif args.bert_model == "emilyalsentzer/Bio_ClinicalBERT":
        # same vocabulary as the BERT-base-cased model
        tokenizer = AutoTokenizer.from_pretrained(args.bert_model).tokenize
    elif args.bert_model == "bionlp/bluebert_pubmed_mimic_uncased_L-12_H-768_A-12":
        tokenizer = AutoTokenizer.from_pretrained(args.bert_model).tokenize
    elif args.bert_model == "bert-small-scratch":
        tokenizer = BertTokenizer.from_pretrained(
            "google/bert_uncased_L-4_H-512_A-8", do_lower_case=True).tokenize
    elif args.bert_model == "bert-base-scratch":
        tokenizer = BertTokenizer.from_pretrained("bert-base-uncased",
                                                  do_lower_case=True).tokenize
    else:
        tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                                  do_lower_case=True).tokenize

    transforms = get_transforms(args)

    print("Load Train dataset", args.train_dataset)
    train_dataset = CXRDataset(args.train_dataset, tokenizer, transforms, args)

    print("Load Test dataset", args.test_dataset)
    test_dataset = CXRDataset(args.test_dataset, tokenizer, transforms, args) \
        if args.test_dataset is not None else None

    print("Create DataLoader")
    train_data_loader = DataLoader(train_dataset, batch_size=args.batch_size,
                                   num_workers=args.num_workers, shuffle=True)
    test_data_loader = DataLoader(test_dataset, batch_size=args.batch_size,
                                  num_workers=args.num_workers, shuffle=False) \
        if test_dataset is not None else None

    print("Creating BERT Trainer")
    trainer = CXRBERT_Trainer(args, train_dataloader=train_data_loader,
                              test_dataloader=test_data_loader)

    print("Training Start!")
    for epoch in range(args.epochs):
        trainer.train(epoch)
        trainer.save(epoch, args.output_path)
def test_tokenization_albert(self):
    # Given
    self.base_tokenizer = AlbertTokenizer.from_pretrained(
        'albert-base-v2', do_lower_case=True, cache_dir=self.test_dir)
    self.rust_tokenizer = PyAlbertTokenizer(
        get_from_cache(
            self.base_tokenizer.pretrained_vocab_files_map['vocab_file']['albert-base-v2']),
        do_lower_case=True,
        strip_accents=True)
    output_baseline = []
    for example in self.examples:
        output_baseline.append(
            self.base_tokenizer.encode_plus(
                example.text_a,
                add_special_tokens=True,
                return_overflowing_tokens=True,
                return_special_tokens_mask=True,
                max_length=128))

    # When
    # Note: the original sentence piece tokenizer strips trailing spaces
    output_rust = self.rust_tokenizer.encode_list(
        [example.text_a.strip() for example in self.examples],
        max_len=256,
        truncation_strategy='longest_first',
        stride=0)

    # Then
    for idx, (rust, baseline) in enumerate(zip(output_rust, output_baseline)):
        if rust.token_ids != baseline['input_ids']:
            for pos, (rust_id, baseline_id) in enumerate(
                    zip(rust.token_ids, baseline['input_ids'])):
                # This check is required as SentencePiece can also be ambiguous in very rare cases
                # (e.g. "eee" -> "e, ee" or "ee, e" have the same score)
                if rust_id != baseline_id:
                    if pos < len(baseline):
                        if (rust_id != baseline['input_ids'][pos + 1]) & \
                                (rust_id != baseline['input_ids'][pos - 1]):
                            raise AssertionError(
                                f'Difference in tokenization for {self.rust_tokenizer.__class__}: \n '
                                f'Sentence a: {self.examples[idx].text_a} \n'
                                f'Sentence b: {self.examples[idx].text_b} \n'
                                f'Token mismatch: {self.get_token_diff(rust.token_ids, baseline["input_ids"])} \n'
                                f'Rust: {rust.token_ids} \n'
                                f'Python {baseline["input_ids"]}')
        assert rust.special_tokens_mask == baseline['special_tokens_mask']
def __init__(
    self,
    model_name=Defaults.model_name,
    measure=Defaults.measure,
    gap=Defaults.gap,
    gap_mask=Defaults.gap_mask,
    gap_tune=Defaults.gap_tune,
    gap_mask_tune=Defaults.gap_mask_tune,
    min_token_length_normal=Defaults.min_token_length_normal,
    min_token_length_lead=Defaults.min_token_length_lead,
    min_token_length_followup=Defaults.min_token_length_followup,
    min_token_length_normal_tune=Defaults.min_token_length_normal_tune,
    min_token_length_lead_tune=Defaults.min_token_length_lead_tune,
    min_token_length_followup_tune=Defaults.min_token_length_followup_tune,
    device=Defaults.device,
    inference_batch_size=Defaults.inference_batch_size,
    inference_mask_evenly=Defaults.inference_mask_evenly,
    len_sent_allow_cut=Defaults.len_sent_allow_cut,
    p_mask=Defaults.p_mask,
    show_progress_bar=Defaults.show_progress_bar,
):
    """This class should not be instantiated directly: instead use BlancHelp or BlancTune"""
    self.model_name = model_name
    self.measure = measure
    self.gap = gap
    self.gap_mask = gap_mask
    self.gap_tune = gap_tune
    self.gap_mask_tune = gap_mask_tune
    self.min_token_length_normal = min_token_length_normal
    self.min_token_length_lead = min_token_length_lead
    self.min_token_length_followup = min_token_length_followup
    self.min_token_length_normal_tune = min_token_length_normal_tune
    self.min_token_length_lead_tune = min_token_length_lead_tune
    self.min_token_length_followup_tune = min_token_length_followup_tune
    self.device = device
    self.inference_batch_size = inference_batch_size
    self.inference_mask_evenly = inference_mask_evenly
    self.len_sent_allow_cut = len_sent_allow_cut
    self.p_mask = p_mask
    self.show_progress_bar = show_progress_bar
    # If the tuning values are not given (negative), fall back to the inference values:
    self.gap_tune = self.gap if self.gap_tune < 0 else self.gap_tune
    self.gap_mask_tune = self.gap_mask if self.gap_mask_tune < 0 else self.gap_mask_tune
    if self.model_name.lower().find('albert') >= 0:
        self.model_tokenizer = AlbertTokenizer.from_pretrained(model_name)
    else:
        self.model_tokenizer = BertTokenizer.from_pretrained(model_name)
def load_ds_from_cache(self) -> None:
    ds_metadata = utils.core_utils.load_json(self.ds_meta)
    self.dataset_conf['num_train_recs'], self.dataset_conf['num_val_recs'], \
        self.dataset_conf['num_test_recs'] = \
        ds_metadata["train_recs"], ds_metadata["val_recs"], ds_metadata["test_recs"]
    for k in self.dataset_conf[self.target_ds_structure].keys():
        self.dataset_conf[f'{k}_start_date'] = ds_metadata[f'{k}_start_date'] \
            if f'{k}_start_date' in ds_metadata.keys() else None
        self.dataset_conf[f'{k}_end_date'] = ds_metadata[f'{k}_end_date'] \
            if f'{k}_end_date' in ds_metadata.keys() else None
    self.dataset_conf['dsid'] = ds_metadata['dsid'] \
        if 'dsid' in ds_metadata.keys() else None
    self.dataset_conf['albert_tokenizer'] = \
        AlbertTokenizer.from_pretrained(self.config.model.sp_model,
                                        max_len=self.config.data_source.max_seq_length,
                                        truncation=True)
def _get_lm_model_tokenizer(self, lm_model="albert"):
    if getattr(self, "_lm_model_tokenizer", None) is not None:
        return self._lm_model_tokenizer
    if self._train_dl is not None and self._train_dl.dataset is not None:
        self._lm_model_tokenizer = self._train_dl.dataset.lm_model_tokenizer
    if lm_model == "albert":
        self._lm_model_tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
    else:
        raise NotImplementedError(
            f"{lm_model} lm model is not supported. Only albert is supported at this moment."
        )
    return self._lm_model_tokenizer
def __init__(self, model_name_or_path: str, max_seq_length: int = 128,
             do_lower_case: Optional[bool] = None, model_args: Dict = {},
             tokenizer_args: Dict = {}):
    super(ALBERT, self).__init__()
    self.config_keys = ['max_seq_length', 'do_lower_case']
    self.do_lower_case = do_lower_case

    if max_seq_length > 510:
        logging.warning("BERT only allows a max_seq_length of 510 (512 with "
                        "special tokens). Value will be set to 510")
        max_seq_length = 510
    self.max_seq_length = max_seq_length

    if self.do_lower_case is not None:
        tokenizer_args['do_lower_case'] = do_lower_case

    self.albert = AlbertModel.from_pretrained(model_name_or_path, **model_args)
    self.tokenizer = AlbertTokenizer.from_pretrained(model_name_or_path,
                                                     **tokenizer_args)