def get_model(self) -> Union[T5Model, T5EncoderModel]:
    if not self._decoder:
        if self._half_precision_model:
            model = T5EncoderModel.from_pretrained(
                self._model_directory, torch_dtype=torch.float16)
        else:
            model = T5EncoderModel.from_pretrained(self._model_directory)
    else:
        if self._half_precision_model:
            model = T5Model.from_pretrained(self._model_directory,
                                            torch_dtype=torch.float16)
        else:
            model = T5Model.from_pretrained(self._model_directory)
    return model
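# A hedged usage sketch for the encoder-only branch of get_model() above:
# load T5EncoderModel for embedding extraction, applying half precision only
# on CUDA (fp16 matmuls are generally unsupported on CPU). The model name
# "t5-small" is illustrative, not taken from the original.
import torch
from transformers import T5EncoderModel, T5Tokenizer

tokenizer = T5Tokenizer.from_pretrained("t5-small")
encoder = T5EncoderModel.from_pretrained("t5-small").eval()
if torch.cuda.is_available():
    encoder = encoder.half().cuda()

inputs = tokenizer("translate English to German: That is good.",
                   return_tensors="pt")
inputs = inputs.to(next(encoder.parameters()).device)
with torch.no_grad():
    embeddings = encoder(**inputs).last_hidden_state  # (1, seq_len, d_model)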
def __init__(self, dropout=0.5):
    super().__init__()
    drop = nn.Dropout(dropout)
    if use_t5:
        """
        Use t5_model.encoder as the encoder for this model.

        Note that unlike the custom transformer, you don't need to use an
        external input or positional embedding for the T5 transformer (i.e.
        don't define self.in_embed or self.pos_embed) since it already defines
        them internally. You may specify layer weights to freeze during
        finetuning by modifying the freeze_layers global variable.
        """
        ### Your code here ###
        self.t5_model = T5Model.from_pretrained(f't5-{use_t5}')
        self.t5_encoder = self.t5_model.encoder
        for i_layer, block in enumerate(self.t5_encoder.block):
            if i_layer in freeze_layers:
                for param in block.parameters():
                    param.requires_grad = False
    else:
        # Input embedding for custom transformer
        self.in_embed = nn.Sequential(
            nn.Embedding(in_vocab.n, n_hid, padding_idx=in_vocab.pad), drop)
        # Positional embedding for custom transformer;
        # the first position is used as a global vector
        self.pos_embed = nn.Embedding(1 + n_max_in, n_hid)
        self.transformer_layers = nn.ModuleList(
            TransformerBlock() for _ in range(n_layers))
    self.gcn = GCN(n_head=args.n_head, dropout=args.dropout)
    self.decoder = TreeDecoder()
    if not use_t5:
        self.apply(self.init_weight)
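# freeze_layers is a module-level global referenced above; as an illustrative
# (assumed) setting, this freezes the first four T5 encoder blocks during
# finetuning. Any container supporting `in` works here; the indices are examples.
freeze_layers = {0, 1, 2, 3}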
def convert_t5(args):
    logging.info('converting T5 model from Huggingface...')
    if not os.path.exists(args.dest_dir):
        os.mkdir(args.dest_dir)
    converted = {}
    # convert and save vocab
    convert_vocab(args, converted)
    # convert and save config
    gluon_cfg = convert_config(args, converted)
    # convert, (test), and save model
    hf_t5 = HF_T5.from_pretrained(args.model_name)
    gluon_t5 = Gluon_T5.from_cfg(gluon_cfg)
    gluon_t5 = convert_params(args, converted, hf_t5, gluon_t5)
    gluon_t5.hybridize()
    # test model if needed
    if args.test:
        test_conversion(args, hf_t5, gluon_t5)
    # rename with sha1sum
    rename(args, converted)
    logging.info('conversion completed.')
    logging.info('file statistics:')
    for item, new_path in converted.items():
        logging.info('filename: {}\tsize: {}\tsha1sum: {}'.format(
            os.path.basename(new_path), os.path.getsize(new_path),
            sha1sum(new_path)))
    return converted
def __init__(self,
             model_name_or_path: str,
             max_seq_length: int = 128,
             do_lower_case: Optional[bool] = None,
             task_identifier: str = 'stsb sentence1: ',
             model_args: Dict = {},
             tokenizer_args: Dict = {}):
    super(T5, self).__init__()
    self.config_keys = ['max_seq_length', 'do_lower_case', 'task_identifier']
    self.do_lower_case = do_lower_case

    if max_seq_length > 512:
        logging.warning(
            "T5 only allows a max_seq_length of 512. Value will be set to 512")
        max_seq_length = 512
    self.max_seq_length = max_seq_length

    if self.do_lower_case is not None:
        tokenizer_args['do_lower_case'] = do_lower_case

    self.t5model = T5Model.from_pretrained(model_name_or_path, **model_args)
    self.tokenizer = T5Tokenizer.from_pretrained(model_name_or_path,
                                                 **tokenizer_args)
    self.task_identifier = task_identifier
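# A hedged sketch of how this module presumably applies task_identifier before
# encoding; the helper name encode_texts is an assumption, not part of the original.
def encode_texts(self, texts):
    # prepend the task prefix (e.g. 'stsb sentence1: ') expected by T5
    texts = [self.task_identifier + (t.lower() if self.do_lower_case else t)
             for t in texts]
    return self.tokenizer(texts, padding=True, truncation=True,
                          max_length=self.max_seq_length, return_tensors='pt')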
def get_emb(inputs_list, model_name, max_length=512):
    if 't5' in model_name:
        tokenizer = T5Tokenizer.from_pretrained(TOKEN_DIR)
        model = T5Model.from_pretrained(MODEL_DIR)
        inputs = tokenizer.batch_encode_plus(inputs_list,
                                             max_length=max_length,
                                             pad_to_max_length=True,
                                             return_tensors="pt")
        outputs = model(input_ids=inputs['input_ids'],
                        decoder_input_ids=inputs['input_ids'])
        last_hidden_states = torch.mean(outputs[0], dim=1)
        return last_hidden_states.tolist()
    elif 'bert' in model_name:
        tokenizer = BertTokenizer.from_pretrained(
            'bert-base-multilingual-cased')
        model = TFBertModel.from_pretrained('bert-base-multilingual-cased')
        batch_encoding = tokenizer.batch_encode_plus(inputs_list,
                                                     max_length=max_length,
                                                     pad_to_max_length=True)
        # shape: (batch, sequence length, hidden state)
        outputs = model(tf.convert_to_tensor(batch_encoding['input_ids']))
        embeddings = tf.reduce_mean(outputs[0], 1)
        return embeddings.numpy().tolist()
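# Hypothetical usage, assuming TOKEN_DIR and MODEL_DIR point to a local T5
# checkpoint; returns one mean-pooled vector (as a list of floats) per input:
embeddings = get_emb(["first sentence", "second sentence"], model_name="t5-base")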
def __init__(self, config, x_embed):
    super().__init__()
    self.model = T5Model.from_pretrained(config.pretrained_weights)
    self.encoder_out_size = self.model.config.d_model  # 1024 for t5-large
def __init__(self):
    super().__init__()
    self.t5 = t5 = T5Model.from_pretrained('t5-small')
    self.out = nn.Linear(t5.config.d_model, t5.config.vocab_size)
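# A hedged forward sketch for the wrapper above (the method itself is an
# assumption, and it relies on transformers v4 return_dict outputs): project
# the decoder's hidden states through self.out to get vocabulary logits.
def forward(self, input_ids, decoder_input_ids):
    hidden = self.t5(input_ids=input_ids,
                     decoder_input_ids=decoder_input_ids).last_hidden_state
    return self.out(hidden)  # (batch, tgt_len, vocab_size)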
def prepare_model(self, condition_generation=False):
    if condition_generation:
        self.model = T5ForConditionalGeneration.from_pretrained('t5-base')
    else:
        t5_model = T5Model.from_pretrained('t5-base')
        self.model = GenerationModel(t5_model)
    self.load_checkpoint()
    self.model = self.model.cuda()
def get_model(self) -> Union[T5Model, T5EncoderModel]:
    if not self._decoder:
        model = T5EncoderModel.from_pretrained(self._model_directory)
    else:
        model = T5Model.from_pretrained(self._model_directory)

    # Compute in half precision, saving us half the memory
    if self._half_precision_model:
        model = model.half()

    return model
def __init__(self, model, num_steps, num_classes=2):
    super(T5Classifier, self).__init__()
    hidden_size = {
        "t5-small": 512,
        "t5-base": 768,
        "t5-large": 1024,
    }[model]
    self.model = T5Model.from_pretrained(model)
    self.tokenizer = T5Tokenizer.from_pretrained(model)
    self.num_steps = num_steps
    self.classifier = nn.Linear(hidden_size, num_classes)
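# A hedged forward sketch for T5Classifier (assumed, not in the original):
# run the T5 encoder only, mean-pool its states, then apply the linear head.
def forward(self, texts):
    enc = self.tokenizer(texts, padding=True, truncation=True,
                         return_tensors="pt")
    hidden = self.model.encoder(
        input_ids=enc["input_ids"],
        attention_mask=enc["attention_mask"]).last_hidden_state
    return self.classifier(hidden.mean(dim=1))  # (batch, num_classes)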
def main():
    tokenizer = T5Tokenizer.from_pretrained('t5-small')
    model = T5Model.from_pretrained('t5-small')
    input_ids = tokenizer.encode(
        "translate English to German: That is good.", return_tensors="pt")
    # T5Model requires decoder inputs; without them the forward pass errors out
    outputs = model(input_ids=input_ids, decoder_input_ids=input_ids)
    # Note: outputs[0] is the decoder's last hidden state of shape
    # (batch, seq, d_model), not vocabulary logits, so the argmax below does
    # not yield real token predictions; see the generation variant after this.
    scores = outputs[0]
    out_indices = torch.argmax(scores, dim=2)
    predicted_token = tokenizer.convert_ids_to_tokens(out_indices[0])
    print(predicted_token)
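# A hedged alternative using T5ForConditionalGeneration, whose LM head does
# produce vocabulary logits, so generate() yields an actual translation:
from transformers import T5ForConditionalGeneration, T5Tokenizer

tokenizer = T5Tokenizer.from_pretrained('t5-small')
lm = T5ForConditionalGeneration.from_pretrained('t5-small')
input_ids = tokenizer.encode(
    "translate English to German: That is good.", return_tensors="pt")
generated = lm.generate(input_ids, max_length=40)
print(tokenizer.decode(generated[0], skip_special_tokens=True))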
def prepare_model(self, condition_generation=False, template_decoding=False):
    print('condition_generation: ', condition_generation)
    if condition_generation:
        self.model = T5ForConditionalGeneration.from_pretrained('t5-base')
    else:
        t5_model = T5Model.from_pretrained('t5-base')
        if template_decoding:
            self.model = GenerationModel(t5_model, self.temp)
        else:
            self.model = GenerationModel(t5_model)
    self.lr = 1e-3
    self.model = self.model.cuda()
def __init__(self, config, x_embed):
    super().__init__()
    # pretrained_weights = "xlnet-base-cased"
    self.model = T5Model.from_pretrained(config.pretrained_weights)
    # if config.use_gpu:
    #     self.model = self.model.to(device=torch.device("cuda"))
    # if config.use_parallel:
    #     self.model = torch.nn.DataParallel(self.model)
    self.encoder_out_size = self.model.config.d_model  # 1024 for t5-large
def __init__(self, model_name_or_path: str, max_seq_length: int = 128,
             do_lower_case: bool = True):
    super(T5, self).__init__()
    self.config_keys = ['max_seq_length', 'do_lower_case']
    self.do_lower_case = do_lower_case

    if max_seq_length > 512:
        logging.warning(
            "T5 only allows a max_seq_length of 512. Value will be set to 512")
        max_seq_length = 512
    self.max_seq_length = max_seq_length

    self.enc_model = T5Model.from_pretrained(model_name_or_path)
    self.tokenizer = T5Tokenizer.from_pretrained(model_name_or_path,
                                                 do_lower_case=do_lower_case)
def __init__(self, **kwargs):
    """
    Initialize T5 embedder.

    :param model_directory: directory containing the model weights and the
        SentencePiece vocabulary (spiece.model)
    """
    super().__init__(**kwargs)
    self._model_directory = self._options["model_directory"]
    # Until we know whether we need the decoder, let's keep it here as an
    # undocumented option. Should the need arise, we can just split this class
    # into an encoder and a decoder subclass by setting one subclass to
    # _decoder=True and the other to _decoder=False.
    self._decoder = self._options.get("decoder", False)

    # make model
    self._model = T5Model.from_pretrained(self._model_directory)
    self._model = self._model.eval().to(self._device)
    self._model_fallback = None
    self._tokenizer = T5Tokenizer(
        str(Path(self._model_directory).joinpath("spiece.model")),
        do_lower_case=False,
    )
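# A hedged usage sketch for the embedder above (embed() is an assumed method,
# not shown in the original):
def embed(self, text: str):
    ids = self._tokenizer(text, return_tensors="pt").to(self._device)
    with torch.no_grad():
        # the encoder states are enough for embeddings; the decoder is unused here
        out = self._model.encoder(input_ids=ids["input_ids"],
                                  attention_mask=ids["attention_mask"])
    return out.last_hidden_state.squeeze(0)  # (seq_len, d_model)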
def test_model_from_pretrained(self):
    for model_name in T5_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
        model = T5Model.from_pretrained(model_name)
        self.assertIsNotNone(model)
options = parser.parse_args()

# make transforms using only the T5 tokenizer!
tokenizer = T5Tokenizer.from_pretrained('t5-base')

# load dataset
test_dataset = Task71Dataset("dev", tokenizer=tokenizer)

collator_fn = Task71aCollatorTest(device='cpu')
test_loader = DataLoader(test_dataset,
                         batch_size=options.batch_size,
                         drop_last=False,
                         shuffle=True,
                         collate_fn=collator_fn)

# create model
model = T5Model.from_pretrained('t5-base')
model = T5ClassificationHead(model.encoder,
                             model.config.hidden_size,
                             num_classes=2,
                             drop=0.2,
                             act='none')

if options.modelckpt is not None:
    state_dict = torch.load(options.modelckpt, map_location='cpu')
    model.load_state_dict(state_dict)

model.to(DEVICE)

create_submition_file(options.outfolder, model, test_loader, DEVICE)
# define senteval params
params_senteval = {'task_path': PATH_TO_DATA, 'usepytorch': True, 'kfold': 10}
params_senteval['classifier'] = {'nhid': 0, 'optim': 'adam', 'batch_size': 64,
                                 'tenacity': 5, 'epoch_size': 4}

# Set up logger
logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.DEBUG)

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--model', default='t5-large', help='model name or path')
    args = parser.parse_args()

    config = T5Config.from_pretrained(args.model)
    model = T5Model.from_pretrained(args.model, config=config)
    tokenizer = T5Tokenizer.from_pretrained(args.model)
    params_senteval['model'] = model.cuda().eval()
    params_senteval['tokenizer'] = tokenizer

    se = senteval.engine.SE(params_senteval, batcher, prepare)
    # transfer_tasks = ['STS12', 'STS13', 'STS14', 'STS15', 'STS16',
    #                   'MR', 'CR', 'MPQA', 'SUBJ', 'SST2', 'SST5', 'TREC',
    #                   'MRPC', 'SICKEntailment', 'SICKRelatedness',
    #                   'STSBenchmark', 'Length', 'WordContent', 'Depth',
    #                   'TopConstituents', 'BigramShift', 'Tense',
    #                   'SubjNumber', 'ObjNumber', 'OddManOut',
    #                   'CoordinationInversion', 'ImageCaptionRetrieval', 'SNLI']
    transfer_tasks = ['SNLI', 'ImageCaptionRetrieval']
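# A hedged sketch of the batcher passed to senteval.engine.SE above (SentEval
# hands batches over as lists of token lists; the mean-pooling choice here is
# an assumption, not taken from the original):
def batcher(params, batch):
    sentences = [' '.join(tokens) for tokens in batch]
    enc = params['tokenizer'](sentences, padding=True, truncation=True,
                              return_tensors='pt')
    with torch.no_grad():
        out = params['model'].encoder(
            input_ids=enc['input_ids'].cuda(),
            attention_mask=enc['attention_mask'].cuda())
    # mean-pool encoder states into one fixed-size vector per sentence
    return out.last_hidden_state.mean(dim=1).cpu().numpy()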
def test_model_from_pretrained(self):
    cache_dir = "/tmp/transformers_test/"
    for model_name in list(T5_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
        model = T5Model.from_pretrained(model_name, cache_dir=cache_dir)
        shutil.rmtree(cache_dir)
        self.assertIsNotNone(model)
def __init__(self, hparams):
    super().__init__()
    self.hparams = hparams

    if self.hparams.vocab_name == "custom":
        self.tokenizer = get_custom_vocab()
    else:
        self.tokenizer = T5Tokenizer.from_pretrained(self.hparams.vocab_name)

    if "small" in self.hparams.model_name.split('-'):
        self.size = "small"
    elif "base" in self.hparams.model_name.split('-'):
        self.size = "base"
    elif "large" in self.hparams.model_name.split('-'):
        self.size = "large"
    else:
        raise ValueError("Couldn't detect model size from model_name.")

    if self.hparams.model_name[:2] == "pt":
        logging.info("Initializing from PTT5 checkpoint...")
        config, state_dict = self.get_ptt5()
        if self.hparams.architecture in ("gen", "categoric_gen"):
            self.t5 = T5ForConditionalGeneration.from_pretrained(
                pretrained_model_name_or_path=None,
                config=config,
                state_dict=state_dict)
        else:
            self.t5 = T5Model.from_pretrained(
                pretrained_model_name_or_path=None,
                config=config,
                state_dict=state_dict)
    else:
        logging.info("Initializing from T5 checkpoint...")
        if self.hparams.architecture in ("gen", "categoric_gen"):
            self.t5 = T5ForConditionalGeneration.from_pretrained(
                self.hparams.model_name)
        else:
            self.t5 = T5Model.from_pretrained(self.hparams.model_name)

    D = self.t5.config.d_model

    if self.hparams.architecture == "mlp":
        # Replace T5 with a simple nonlinear input
        self.t5 = NONLinearInput(self.hparams.seq_len, D)

    if self.hparams.architecture not in ("gen", "categoric_gen"):
        if self.hparams.architecture == "categoric":
            assert self.hparams.nout != 1, \
                "Categoric mode with 1 nout doesn't work with CrossEntropyLoss"
            self.linear = nn.Linear(D, self.hparams.nout)
        else:
            self.linear = nn.Linear(D, 1)

    if self.hparams.architecture in ("categoric", "categoric_gen"):
        self.loss = nn.CrossEntropyLoss()
    else:
        self.loss = nn.MSELoss()

    self.pearson_calculator = PearsonCalculator()
    logging.info("Initialization done.")
def setup(use_t5, train_path='data/train.json', test_path='data/test.json',
          n_min_vocab=5, seed=0, test_split=0.1, do_eval=False):
    with open(train_path, 'r') as f:
        data = json.load(f)
    constants, n_max_nP = tokenize_and_separate_quants(data, n_min_vocab)

    np.random.seed(seed)
    np.random.shuffle(data)
    n_test = int(test_split * len(data))
    train_data, val_data = data[:-n_test], data[-n_test:]

    default_tokens = ['<pad>', '<unk>']
    operation_tokens = ['+', '-', '*', '/']
    if use_t5:
        from transformers import T5Tokenizer, T5Model
        # https://arxiv.org/pdf/1910.10683.pdf
        # https://huggingface.co/transformers/model_doc/t5.html
        # https://github.com/huggingface/transformers/blob/master/src/transformers/modeling_t5.py
        t5_tokenizer = T5Tokenizer.from_pretrained(f't5-{use_t5}')
        t5_model = T5Model.from_pretrained(f't5-{use_t5}')
        in_vocab = Vocabulary(
            [k for k, i in sorted(t5_tokenizer.get_vocab().items(),
                                  key=lambda ki: ki[1])],
            t5_tokenizer.pad_token, t5_tokenizer.unk_token)
    else:
        in_counts = Counter()
        for d in train_data:
            in_counts.update(d['in_tokens'])
        in_vocab = Vocabulary(
            [w for w, c in in_counts.items() if c >= n_min_vocab]
            + default_tokens)
        t5_model = None

    out_vocab = Vocabulary(operation_tokens + constants
                           + [(i,) for i in range(n_max_nP)] + default_tokens)
    out_vocab.constants = constants
    out_vocab.n_constants = len(constants)
    out_vocab.n_ops = len(operation_tokens)
    out_vocab.base_op = 0
    out_vocab.base_quant = out_vocab.base_constant = out_vocab.base_op + out_vocab.n_ops
    out_vocab.base_nP = out_vocab.base_constant + out_vocab.n_constants

    if do_eval:
        with open(test_path, 'r') as f:
            test_data = json.load(f)
        tokenize_and_separate_quants(test_data, n_min_vocab)
        if use_t5:
            for d in test_data:
                convert_word_to_bytepair_tokenization(d, t5_tokenizer)
        return test_data, in_vocab, out_vocab, n_max_nP, t5_model
    else:
        for d in itertools.chain(train_data, val_data):
            d['out_tokens'] = infix_to_prefix(d['out_tokens'])
            if use_t5:
                convert_word_to_bytepair_tokenization(d, t5_tokenizer)
            d['nP_candidates'] = candidates = {}
            nP = d['nP']
            for j, out_token in enumerate(d['out_tokens']):
                if out_token not in out_vocab.token2idx:
                    # Token is a quantity not in the vocab. Generally this
                    # happens in two cases:
                    if isinstance(out_token, tuple):
                        # 1. The equation contains two of the same numbers,
                        #    e.g. ['+', '8', '8'], and we don't know which
                        #    number comes first in the English sentence.
                        candidates[j] = np.array(out_token)
                    else:
                        # 2. The equation contains a number representing
                        #    English words such as 'nickel', 'dime', 'quarter',
                        #    'eight', etc. that was not common enough to be
                        #    parsed into a constant. There's not much we can
                        #    do here.
                        candidates[j] = np.arange(len(nP))
        train_data = [d for d in train_data if not d['is_quadratic']]
        return train_data, val_data, in_vocab, out_vocab, n_max_nP, t5_model
def test_model_from_pretrained(self):
    for model_name in list(T5_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
        model = T5Model.from_pretrained(model_name, cache_dir=CACHE_DIR)
        self.assertIsNotNone(model)
def _get_fallback_model(self) -> T5Model:
    """Returns the CPU model"""
    if not self._model_fallback:
        self._model_fallback = T5Model.from_pretrained(
            self._model_directory).eval()
    return self._model_fallback
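# A hedged sketch of how such a fallback is typically used (this wrapper method
# is an assumption, not shown in the original): retry on GPU out-of-memory
# errors with the lazily created CPU model.
def embed_with_fallback(self, input_ids):
    try:
        with torch.no_grad():
            return self._model(input_ids=input_ids.to(self._device),
                               decoder_input_ids=input_ids.to(self._device))
    except RuntimeError:
        # likely CUDA OOM; fall back to the CPU copy
        fallback = self._get_fallback_model()
        with torch.no_grad():
            return fallback(input_ids=input_ids.cpu(),
                            decoder_input_ids=input_ids.cpu())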
# CLS token will work as BOS token
# tokenizer.bos_token = tokenizer.cls_token
# SEP token will work as EOS token
# tokenizer.eos_token = tokenizer.sep_token

# load dataset
dataset = Task71Dataset("train", tokenizer=tokenizer)

collator_fn = Task71aCollatorFeatures(device='cpu')
loader = DataLoader(dataset,
                    batch_size=options.batch_size,
                    drop_last=False,
                    shuffle=True,
                    collate_fn=collator_fn)

# create model
encoder = T5Model.from_pretrained('t5-base')
# change config if you want
# encoder.config.output_hidden_states = True
model = T5ClassificationHead(encoder.encoder,
                             encoder.config.hidden_size,
                             num_classes=2,
                             drop=0.2)

if options.modelckpt is not None:
    state_dict = torch.load(options.modelckpt, map_location='cpu')
    model.load_state_dict(state_dict)

model.to(DEVICE)

res_dict = get_features(loader, model, DEVICE)

if not os.path.exists('./features_train/'):
    os.makedirs('./features_train')

pickle.dump(res_dict, open("./features_train/t5_features.pkl", "wb"))
import torch
import numpy as np
from torch.utils.data import DataLoader
from transformers import T5ForConditionalGeneration, T5Model, BertTokenizer
from transformers import AdamW
from tools import start_debugger_on_exception
from dataset import DataSetBert

start_debugger_on_exception()

train_dataset = DataSetBert(data_file='./data/data_train/train.csv')
val_dataset = DataSetBert(data_file='./data/data_train/val.csv')
test_dataset = DataSetBert(data_file='./data/data_train/test.csv')

device = torch.device('cuda:6')
train_dataloader = DataLoader(train_dataset, batch_size=11, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=11, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=11, shuffle=True)

# model = T5ForConditionalGeneration.from_pretrained('uer/t5-small-chinese-cluecorpussmall')
# model = AutoModelForSequenceClassification.from_pretrained('uer/t5-small-chinese-cluecorpussmall')
model = T5Model.from_pretrained('uer/t5-small-chinese-cluecorpussmall')

tokenizer = BertTokenizer.from_pretrained(
    'uer/t5-small-chinese-cluecorpussmall')
model.resize_token_embeddings(len(tokenizer))
model.config.n_positions = 1024
model.to(device)
model.train()
pad_token_id = model.config.pad_token_id

optimizer = AdamW(model.parameters(), lr=1e-5)
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [{
    'params': [