Example #1
    def get_model(self) -> Union[T5Model, T5EncoderModel]:

        if not self._decoder:
            if self._half_precision_model:
                model = T5EncoderModel.from_pretrained(
                    self._model_directory, torch_dtype=torch.float16)
            else:
                model = T5EncoderModel.from_pretrained(self._model_directory)
        else:
            if self._half_precision_model:
                model = T5Model.from_pretrained(self._model_directory,
                                                torch_dtype=torch.float16)
            else:
                model = T5Model.from_pretrained(self._model_directory)

        return model
Example #2
    def __init__(self, dropout=0.5):
        super().__init__()
        drop = nn.Dropout(dropout)

        if use_t5:
            """
            Use t5_model.encoder as the encoder for this model. Note that unlike the custom transformer, you don't
            need to define an external input or positional embedding for the T5 transformer
            (i.e. don't define self.in_embed or self.pos_embed), since it already defines them internally.

            You may specify layer weights to freeze during finetuning by modifying the freeze_layers global variable
            """
            ### Your code here ###
            self.t5_model = T5Model.from_pretrained(f't5-{use_t5}')
            self.t5_encoder = self.t5_model.encoder

            for i_layer, block in enumerate(self.t5_encoder.block):
                if i_layer in freeze_layers:
                    for param in block.parameters():
                        param.requires_grad = False
        else:
            # Input embedding for custom transformer
            self.in_embed = nn.Sequential(nn.Embedding(in_vocab.n, n_hid, padding_idx=in_vocab.pad), drop)
            # Positional embedding for custom transformer
            self.pos_embed = nn.Embedding(1 + n_max_in, n_hid)  # Use the first position as global vector
            self.transformer_layers = nn.ModuleList(TransformerBlock() for _ in range(n_layers))

        self.gcn = GCN(n_head=args.n_head, dropout=args.dropout)

        self.decoder = TreeDecoder()

        if not use_t5:
            self.apply(self.init_weight)
Example #3
def convert_t5(args):
    logging.info('converting T5 model from Huggingface...')
    if not os.path.exists(args.dest_dir):
        os.mkdir(args.dest_dir)
    converted = {}
    # convert and save vocab
    convert_vocab(args, converted)
    # convert and save config
    gluon_cfg = convert_config(args, converted)
    # convert, (test), and save model
    hf_t5 = HF_T5.from_pretrained(args.model_name)
    gluon_t5 = Gluon_T5.from_cfg(gluon_cfg)
    gluon_t5 = convert_params(args, converted, hf_t5, gluon_t5)
    gluon_t5.hybridize()
    # test model if needed
    if args.test:
        test_conversion(args, hf_t5, gluon_t5)
    # rename with sha1sum
    rename(args, converted)
    logging.info('conversion completed.')
    logging.info('file statistics:')
    for item, new_path in converted.items():
        logging.info('filename: {}\tsize: {}\tsha1sum: {}'.format(
            os.path.basename(new_path), os.path.getsize(new_path),
            sha1sum(new_path)))
    return converted
Example #4
    def __init__(self,
                 model_name_or_path: str,
                 max_seq_length: int = 128,
                 do_lower_case: Optional[bool] = None,
                 task_identifier: str = 'stsb sentence1: ',
                 model_args: Dict = {},
                 tokenizer_args: Dict = {}):
        super(T5, self).__init__()
        self.config_keys = [
            'max_seq_length', 'do_lower_case', 'task_identifier'
        ]
        self.do_lower_case = do_lower_case

        if max_seq_length > 512:
            logging.warning(
                "T5 only allows a max_seq_length of 512. Value will be set to 512"
            )
            max_seq_length = 512
        self.max_seq_length = max_seq_length

        if self.do_lower_case is not None:
            tokenizer_args['do_lower_case'] = do_lower_case

        self.t5model = T5Model.from_pretrained(model_name_or_path,
                                               **model_args)
        self.tokenizer = T5Tokenizer.from_pretrained(model_name_or_path,
                                                     **tokenizer_args)
        self.task_identifier = task_identifier
Example #5
def get_emb(inputs_list, model_name, max_length=512):
    if 't5' in model_name:
        tokenizer = T5Tokenizer.from_pretrained(TOKEN_DIR)
        model = T5Model.from_pretrained(MODEL_DIR)
        inputs = tokenizer.batch_encode_plus(inputs_list,
                                             max_length=max_length,
                                             pad_to_max_length=True,
                                             return_tensors="pt")
        outputs = model(input_ids=inputs['input_ids'],
                        decoder_input_ids=inputs['input_ids'])
        last_hidden_states = torch.mean(outputs[0], dim=1)
        return last_hidden_states.tolist()

    elif 'bert' in model_name:
        tokenizer = BertTokenizer.from_pretrained(
            'bert-base-multilingual-cased')
        model = TFBertModel.from_pretrained('bert-base-multilingual-cased')
        batch_encoding = tokenizer.batch_encode_plus(
            inputs_list,
            max_length=max_length,
            pad_to_max_length=True)

        # shape: (batch, sequence length, hidden state)
        outputs = model(tf.convert_to_tensor(batch_encoding['input_ids']))
        embeddings = tf.reduce_mean(outputs[0], 1)
        return embeddings.numpy().tolist()
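The T5 branch above has to feed decoder_input_ids just to obtain embeddings. As a minimal alternative sketch (assuming the public 't5-small' checkpoint rather than the snippet's MODEL_DIR/TOKEN_DIR), T5EncoderModel runs only the encoder stack, so no decoder inputs are needed:

import torch
from transformers import T5Tokenizer, T5EncoderModel

# Hypothetical checkpoint name, used here for illustration only
tokenizer = T5Tokenizer.from_pretrained("t5-small")
encoder = T5EncoderModel.from_pretrained("t5-small")

batch = tokenizer(["this is", "the second", "the third"],
                  padding=True, return_tensors="pt")
with torch.no_grad():
    hidden = encoder(**batch).last_hidden_state  # (batch, seq_len, d_model)
embeddings = hidden.mean(dim=1)                  # mean-pooled sentence embeddings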
Example #6
    def __init__(self, config, x_embed):
        super().__init__()

        self.model = T5Model.from_pretrained(config.pretrained_weights)
        self.encoder_out_size = self.model.config.d_model  # 1024 for t5-large

        return
Example #7
    def __init__(self):

        super().__init__()

        self.t5 = t5 = T5Model.from_pretrained('t5-small')

        self.out = nn.Linear(t5.config.to_dict()['d_model'],
                             t5.config.to_dict()['vocab_size'])
Example #8
 def prepare_model(self, condition_generation=False):
     if condition_generation:
         self.model = T5ForConditionalGeneration.from_pretrained('t5-base')
     else:
         t5_model = T5Model.from_pretrained('t5-base')
         self.model = GenerationModel(t5_model)
     self.load_checkpoint()
     self.model = self.model.cuda()
Example #9
 def get_model(self) -> Union[T5Model, T5EncoderModel]:
     if not self._decoder:
         model = T5EncoderModel.from_pretrained(self._model_directory)
     else:
         model = T5Model.from_pretrained(self._model_directory)
     # Compute in half precision, saving us half the memory
     if self._half_precision_model:
         model = model.half()
     return model
Example #10
 def __init__(self, model, num_steps, num_classes=2):
     super(T5Classifier, self).__init__()
     hidden_size = {
         "t5-small": 512,
         "t5-base": 768,
         "t5-large": 1024,
     }[model]
     self.model = T5Model.from_pretrained(model)
     self.tokenizer = T5Tokenizer.from_pretrained(model)
     self.num_steps = num_steps
     self.classifier = nn.Linear(hidden_size, num_classes)
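The hard-coded hidden sizes above match T5's published configurations; a minimal sketch (assuming a standard Hugging Face checkpoint) can instead read the width straight from the loaded model's config and skip the lookup table:

import torch.nn as nn
from transformers import T5Model

model = T5Model.from_pretrained("t5-small")
# config.d_model is 512 / 768 / 1024 for t5-small / t5-base / t5-large
classifier = nn.Linear(model.config.d_model, 2)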
Example #11
def main():
    tokenizer = T5Tokenizer.from_pretrained('t5-small')
    model = T5Model.from_pretrained('t5-small')

    input_ids = tokenizer.encode("translate English to German: That is good.",
                                 return_tensors="pt")
    # T5Model needs decoder inputs as well; reuse the encoder inputs here
    outputs = model(input_ids=input_ids, decoder_input_ids=input_ids)
    scores = outputs[0]

    out_indices = torch.argmax(scores, dim=2)
    predicted_token = tokenizer.convert_ids_to_tokens(out_indices[0])
    print(predicted_token)
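Note that T5Model has no language-modeling head: outputs[0] above holds decoder hidden states, not vocabulary logits, so the argmax does not give real token predictions. A minimal sketch of the same translation prompt with T5ForConditionalGeneration (same 't5-small' checkpoint), whose generate() performs proper autoregressive decoding:

from transformers import T5Tokenizer, T5ForConditionalGeneration

tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('t5-small')

input_ids = tokenizer.encode("translate English to German: That is good.",
                             return_tensors="pt")
# generate() decodes autoregressively from the LM head
generated = model.generate(input_ids)
print(tokenizer.decode(generated[0], skip_special_tokens=True))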
Example #12
 def prepare_model(self,
                   condition_generation=False,
                   template_decoding=False):
     print('condition_generation: ', condition_generation)
     if condition_generation:
         self.model = T5ForConditionalGeneration.from_pretrained('t5-base')
     else:
         t5_model = T5Model.from_pretrained('t5-base')
         if template_decoding:
             self.model = GenerationModel(t5_model, self.temp)
         else:
             self.model = GenerationModel(t5_model)
     self.lr = 1e-3
     self.model = self.model.cuda()
Example #13
    def __init__(self, config, x_embed):
        super().__init__()

        # pretrained_weights = "xlnet-base-cased"
        self.model = T5Model.from_pretrained(config.pretrained_weights)
        

        # if config.use_gpu:
        #   self.model = self.model.to(device=torch.device("cuda"))
        # if config.use_parallel:
        #   self.model = torch.nn.DataParallel(self.model)

        self.encoder_out_size = self.model.config.d_model  # 1024 for t5-large

        return
Example #14
    def __init__(self,
                 model_name_or_path: str,
                 max_seq_length: int = 128,
                 do_lower_case: bool = True):
        super(T5, self).__init__()
        self.config_keys = ['max_seq_length', 'do_lower_case']
        self.do_lower_case = do_lower_case

        if max_seq_length > 512:
            logging.warning(
                "T5 only allows a max_seq_length of 512. Value will be set to 512"
            )
            max_seq_length = 512
        self.max_seq_length = max_seq_length

        self.enc_model = T5Model.from_pretrained(model_name_or_path)
        self.tokenizer = T5Tokenizer.from_pretrained(
            model_name_or_path, do_lower_case=do_lower_case)
Example #15
    def __init__(self, **kwargs):
        """
        Initialize T5 embedder.

        :param model_directory: path to the directory containing the pretrained T5 model (and its spiece.model tokenizer file)
        """
        super().__init__(**kwargs)

        self._model_directory = self._options["model_directory"]
        # Until we know whether we need the decoder, let's keep it here as an undocumented option.
        # Should the need arise, we can just split this class into an encoder and a decoder subclass
        # by setting one subclass to _decoder=True and the other to _decoder=False.
        self._decoder = self._options.get("decoder", False)

        # make model
        self._model = T5Model.from_pretrained(self._model_directory)
        self._model = self._model.eval().to(self._device)
        self._model_fallback = None
        self._tokenizer = T5Tokenizer(
            str(Path(self._model_directory).joinpath("spiece.model")),
            do_lower_case=False,
        )
Example #16
 def test_model_from_pretrained(self):
     for model_name in T5_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
         model = T5Model.from_pretrained(model_name)
         self.assertIsNotNone(model)
Example #17
options = parser.parse_args()

# build transforms using the T5 tokenizer
tokenizer = T5Tokenizer.from_pretrained('t5-base')

# load dataset
test_dataset = Task71Dataset("dev", tokenizer=tokenizer)

collator_fn = Task71aCollatorTest(device='cpu')
test_loader = DataLoader(test_dataset,
                         batch_size=options.batch_size,
                         drop_last=False,
                         shuffle=True,
                         collate_fn=collator_fn)

# create model
model = T5Model.from_pretrained('t5-base')
model = T5ClassificationHead(model.encoder,
                             model.config.hidden_size,
                             num_classes=2,
                             drop=0.2,
                             act='none')

if options.modelckpt is not None:
    state_dict = torch.load(options.modelckpt, map_location='cpu')
    model.load_state_dict(state_dict)

model.to(DEVICE)

create_submition_file(options.outfolder, model, test_loader, DEVICE)
Example #18
# define senteval params
params_senteval = {'task_path': PATH_TO_DATA, 'usepytorch': True, 'kfold': 10}
params_senteval['classifier'] = {'nhid': 0, 'optim': 'adam', 'batch_size': 64,
                                 'tenacity': 5, 'epoch_size': 4}

# Set up logger
logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.DEBUG)

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--model', default='t5-large', help='model name or path')
    args = parser.parse_args()

    config = T5Config.from_pretrained(args.model)
    model = T5Model.from_pretrained(args.model, config=config)
    tokenizer = T5Tokenizer.from_pretrained(args.model)

    params_senteval['model'] = model.cuda().eval()
    params_senteval['tokenizer'] = tokenizer

    se = senteval.engine.SE(params_senteval, batcher, prepare)
    # transfer_tasks = ['STS12', 'STS13', 'STS14', 'STS15', 'STS16',
    #                   'MR', 'CR', 'MPQA', 'SUBJ', 'SST2', 'SST5', 'TREC', 'MRPC',
    #                   'SICKEntailment', 'SICKRelatedness', 'STSBenchmark',
    #                   'Length', 'WordContent', 'Depth', 'TopConstituents', 'BigramShift', 'Tense',
    #                   'SubjNumber', 'ObjNumber', 'OddManOut', 'CoordinationInversion', 'ImageCaptionRetrieval', 'SNLI'
    #                   ]
    transfer_tasks = ['SNLI', 'ImageCaptionRetrieval']
Example #19
 def test_model_from_pretrained(self):
     cache_dir = "/tmp/transformers_test/"
     for model_name in list(T5_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
         model = T5Model.from_pretrained(model_name, cache_dir=cache_dir)
         shutil.rmtree(cache_dir)
         self.assertIsNotNone(model)
Example #20
    def __init__(self, hparams):
        super().__init__()

        self.hparams = hparams
        if self.hparams.vocab_name == "custom":
            self.tokenizer = get_custom_vocab()
        else:
            self.tokenizer = T5Tokenizer.from_pretrained(
                self.hparams.vocab_name)

        if "small" in self.hparams.model_name.split('-'):
            self.size = "small"
        elif "base" in self.hparams.model_name.split('-'):
            self.size = "base"
        elif "large" in self.hparams.model_name.split('-'):
            self.size = "large"
        else:
            raise ValueError("Couldn't detect model size from model_name.")

        if self.hparams.model_name[:2] == "pt":
            logging.info("Initializing from PTT5 checkpoint...")
            config, state_dict = self.get_ptt5()
            if self.hparams.architecture == "gen" or self.hparams.architecture == "categoric_gen":
                self.t5 = T5ForConditionalGeneration.from_pretrained(
                    pretrained_model_name_or_path=None,
                    config=config,
                    state_dict=state_dict)
            else:
                self.t5 = T5Model.from_pretrained(
                    pretrained_model_name_or_path=None,
                    config=config,
                    state_dict=state_dict)
        else:
            logging.info("Initializing from T5 checkpoint...")
            if self.hparams.architecture == "gen" or self.hparams.architecture == "categoric_gen":
                self.t5 = T5ForConditionalGeneration.from_pretrained(
                    self.hparams.model_name)
            else:
                self.t5 = T5Model.from_pretrained(self.hparams.model_name)

        D = self.t5.config.d_model

        if self.hparams.architecture == "mlp":
            # Replace T5 with a simple nonlinear input
            self.t5 = NONLinearInput(self.hparams.seq_len, D)

        if self.hparams.architecture != "gen" and self.hparams.architecture != "categoric_gen":
            if self.hparams.architecture == "categoric":
                assert self.hparams.nout != 1, "Categoric mode with 1 nout doesn't work with CrossEntropyLoss"
                self.linear = nn.Linear(D, self.hparams.nout)
            else:
                self.linear = nn.Linear(D, 1)

        if self.hparams.architecture == "categoric" or self.hparams.architecture == "categoric_gen":
            self.loss = nn.CrossEntropyLoss()
        else:
            self.loss = nn.MSELoss()

        self.pearson_calculator = PearsonCalculator()

        logging.info("Initialization done.")
Example #21
def setup(use_t5, train_path='data/train.json', test_path='data/test.json', n_min_vocab=5, seed=0, test_split=0.1, do_eval=False):
    with open(train_path, 'r') as f:
        data = json.load(f)
    constants, n_max_nP = tokenize_and_separate_quants(data, n_min_vocab)

    np.random.seed(seed)
    np.random.shuffle(data)
    n_test = int(test_split * len(data))
    train_data, val_data = data[:-n_test], data[-n_test:]

    default_tokens = ['<pad>', '<unk>']
    operation_tokens = ['+', '-', '*', '/']
    if use_t5:
        from transformers import T5Tokenizer, T5Model
        # https://arxiv.org/pdf/1910.10683.pdf
        # https://huggingface.co/transformers/model_doc/t5.html
        # https://github.com/huggingface/transformers/blob/master/src/transformers/modeling_t5.py

        t5_tokenizer = T5Tokenizer.from_pretrained(f't5-{use_t5}')
        t5_model = T5Model.from_pretrained(f't5-{use_t5}')
        in_vocab = Vocabulary([k for k, i in sorted(t5_tokenizer.get_vocab().items(), key=lambda ki: ki[1])], t5_tokenizer.pad_token, t5_tokenizer.unk_token)
    else:
        in_counts = Counter()
        for d in train_data:
            in_counts.update(d['in_tokens'])
        in_vocab = Vocabulary([w for w, c in in_counts.items() if c >= n_min_vocab] + default_tokens)
        t5_model = None
    out_vocab = Vocabulary(operation_tokens + constants + [(i,) for i in range(n_max_nP)] + default_tokens)
    out_vocab.constants = constants
    out_vocab.n_constants = len(constants)
    out_vocab.n_ops = len(operation_tokens)
    out_vocab.base_op = 0
    out_vocab.base_quant = out_vocab.base_constant = out_vocab.base_op + out_vocab.n_ops
    out_vocab.base_nP = out_vocab.base_constant + out_vocab.n_constants

    if do_eval:
        with open(test_path, 'r') as f:
            test_data = json.load(f)
        tokenize_and_separate_quants(test_data, n_min_vocab)
        if use_t5:
            for d in test_data:
                convert_word_to_bytepair_tokenization(d, t5_tokenizer)
        return test_data, in_vocab, out_vocab, n_max_nP, t5_model
    else:
        for d in itertools.chain(train_data, val_data):
            d['out_tokens'] = infix_to_prefix(d['out_tokens'])
            if use_t5:
                convert_word_to_bytepair_tokenization(d, t5_tokenizer)
            d['nP_candidates'] = candidates = {}
            nP = d['nP']
            for j, out_token in enumerate(d['out_tokens']):
                if out_token not in out_vocab.token2idx:
                    # Token is a quantity not in the vocab. Generally this happens in two cases:
                    if isinstance(out_token, tuple):
                        # 1. The equation contains two of the same numbers, e.g. ['+', '8', '8'], and we
                        #    don't know which number comes first in the English sentence
                        candidates[j] = np.array(out_token)
                    else:
                        # 2. The equation contains a number which represents English words such as 'nickel',
                        #    'dime', 'quarter', 'eight', etc. which was not common enough to translate into a
                        #    constant during parsing. There's not much we can do here
                        candidates[j] = np.arange(len(nP))
        train_data = [d for d in train_data if not d['is_quadratic']]
        return train_data, val_data, in_vocab, out_vocab, n_max_nP, t5_model
Example #22
 def test_model_from_pretrained(self):
     for model_name in list(T5_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
         model = T5Model.from_pretrained(model_name, cache_dir=CACHE_DIR)
         self.assertIsNotNone(model)
Example #23
 def _get_fallback_model(self) -> T5Model:
     """ Returns the CPU model """
     if not self._model_fallback:
         self._model_fallback = T5Model.from_pretrained(
             self._model_directory).eval()
     return self._model_fallback
Example #24
# CLS token will work as BOS token
# tokenizer.bos_token = tokenizer.cls_token
# SEP token will work as EOS token
# tokenizer.eos_token = tokenizer.sep_token

# load dataset
dataset = Task71Dataset("train", tokenizer=tokenizer)

collator_fn = Task71aCollatorFeatures(device='cpu')
loader = DataLoader(dataset, batch_size=options.batch_size,
                    drop_last=False, shuffle=True,
                    collate_fn=collator_fn)


# create model
encoder = T5Model.from_pretrained('t5-base')

# change config if you want
# encoder.config.output_hidden_states = True
model = T5ClassificationHead(encoder.encoder, encoder.config.hidden_size,
                               num_classes=2, drop=0.2)
if options.modelckpt is not None:
    state_dict = torch.load(options.modelckpt,map_location='cpu')
    model.load_state_dict(state_dict)

model.to(DEVICE)

res_dict = get_features(loader, model, DEVICE)
if not os.path.exists('./features_train/'):
    os.makedirs('./features_train')
pickle.dump(res_dict, open("./features_train/t5_features.pkl", "wb"))
Example #25
import torch
from tools import start_debugger_on_exception
from dataset import DataSetBert
import numpy as np
start_debugger_on_exception()
train_dataset = DataSetBert(data_file='./data/data_train/train.csv')
val_dataset = DataSetBert(data_file='./data/data_train/val.csv')
test_dataset = DataSetBert(data_file='./data/data_train/test.csv')
from torch.utils.data import DataLoader
device = torch.device('cuda:6')
train_dataloader = DataLoader(train_dataset, batch_size=11, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=11, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=11, shuffle=True)
from transformers import T5Model, T5ForConditionalGeneration
model = T5ForConditionalGeneration.from_pretrained(
    'uer/t5-small-chinese-cluecorpussmall')
model = T5Model.from_pretrained('uer/t5-small-chinese-cluecorpussmall')
#model = AutoModelForSequenceClassification.from_pretrained('uer/t5-small-chinese-cluecorpussmall')
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained(
    'uer/t5-small-chinese-cluecorpussmall')
model.resize_token_embeddings(len(tokenizer))
model.config.n_positions = 1024
model.to(device)
model.train()
model.to(device)
pad_token_id = model.config.pad_token_id
from transformers import AdamW
optimizer = AdamW(model.parameters(), lr=1e-5)
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [{
    'params': [