Example #1
    def __init__(self,
                 state_dim,
                 act_dim,
                 hidden_size,
                 max_length=None,
                 max_ep_len=4096,
                 action_tanh=True,
                 **kwargs):
        super().__init__(state_dim, act_dim, max_length=max_length)

        self.hidden_size = hidden_size
        config = transformers.GPT2Config(
            vocab_size=1,  # doesn't matter -- we don't use the vocab
            n_embd=hidden_size,
            **kwargs)

        # note: the only difference between this GPT2Model and the default Huggingface version
        # is that the positional embeddings are removed (since we'll add those ourselves)
        self.transformer = GPT2Model(config)

        self.embed_timestep = nn.Embedding(max_ep_len, hidden_size)
        self.embed_return = torch.nn.Linear(1, hidden_size)
        self.embed_state = torch.nn.Linear(self.state_dim, hidden_size)
        self.embed_action = torch.nn.Linear(self.act_dim, hidden_size)

        self.embed_ln = nn.LayerNorm(hidden_size)

        # note: we don't predict states or returns for the paper
        self.predict_state = torch.nn.Linear(hidden_size, self.state_dim)
        self.predict_action = nn.Sequential(
            *([nn.Linear(hidden_size, self.act_dim)] +
              ([nn.Tanh()] if action_tanh else [])))
        self.predict_return = torch.nn.Linear(hidden_size, 1)
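
The constructor above only defines the embedding and prediction heads; the sketch below shows one plausible way they get wired together in a forward pass. It is a hypothetical reconstruction, not code from the source repository, and it assumes dataclass-style transformer outputs plus input tensors named states, actions, returns_to_go, and timesteps.

    def forward(self, states, actions, returns_to_go, timesteps):
        # states: (B, T, state_dim), actions: (B, T, act_dim),
        # returns_to_go: (B, T, 1), timesteps: (B, T) long tensor
        batch_size, seq_len = states.shape[0], states.shape[1]
        time_emb = self.embed_timestep(timesteps)
        state_emb = self.embed_state(states) + time_emb
        action_emb = self.embed_action(actions) + time_emb
        return_emb = self.embed_return(returns_to_go) + time_emb

        # interleave tokens as (R_1, s_1, a_1, R_2, s_2, a_2, ...) and normalize
        stacked = torch.stack((return_emb, state_emb, action_emb), dim=1)
        stacked = stacked.permute(0, 2, 1, 3).reshape(batch_size, 3 * seq_len, self.hidden_size)
        stacked = self.embed_ln(stacked)

        # run the position-embedding-free GPT-2 directly on the embeddings
        # (assumes return_dict-style outputs with a last_hidden_state attribute)
        hidden = self.transformer(inputs_embeds=stacked).last_hidden_state

        # un-interleave and predict actions from the state positions
        hidden = hidden.reshape(batch_size, seq_len, 3, self.hidden_size).permute(0, 2, 1, 3)
        return self.predict_action(hidden[:, 1])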
Example #2
def get_model(tokenizer, resume=False):
    if cfg('random_init'):
        # load randomly initialized model instead of pretrained
        model_config = transformers.GPT2Config()
        model = transformers.GPT2LMHeadModel(model_config)
    elif resume:
        # resume from previous best
        model = AutoModelForCausalLM.from_pretrained(
            cfg('out_path') + cfg('name'))
    else:
        # load pretrained model
        model = AutoModelForCausalLM.from_pretrained(cfg('model'))
    model.resize_token_embeddings(len(tokenizer))
    model = model.to(cfg('device'))
    return model
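
A minimal call site for get_model might look like this; the hypothetical cfg helper is assumed to resolve the same keys the function reads, and the tokenizer is assumed to match cfg('model').

tokenizer = transformers.AutoTokenizer.from_pretrained(cfg('model'))  # assumption: tokenizer matches cfg('model')
model = get_model(tokenizer)                 # pretrained or randomly initialized, depending on cfg('random_init')
# model = get_model(tokenizer, resume=True)  # resume from the checkpoint at cfg('out_path') + cfg('name')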
Example #3
def load_igpt(model_size, model_path, cluster_path, n_px):
    """ Load pretrained model and clusters """
    if model_size == "l":
        n_embd, n_head, n_layer = 1536, 16, 48
    elif model_size == "m":
        n_embd, n_head, n_layer = 1024, 8, 36
    elif model_size == "s":
        n_embd, n_head, n_layer = 512, 8, 24

    clusters = np.load(cluster_path)  # get color clusters

    vocab_size = len(clusters) + 1  # add one for start of sentence token
    config = transformers.GPT2Config(vocab_size=vocab_size, n_ctx=n_px*n_px, n_positions=n_px*n_px, n_embd=n_embd, n_layer=n_layer, n_head=n_head)
    model = ImageGPT2LMHeadModel.from_pretrained(model_path, from_tf=True, config=config)

    return model, torch.from_numpy(clusters)
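
A usage sketch for the loader, with hypothetical file locations; the checkpoint and the kmeans_centers.npy color clusters come from the public iGPT release.

model, clusters = load_igpt(
    model_size="s",
    model_path="igpt/models/s/model.ckpt-1000000.index",  # hypothetical path
    cluster_path="igpt/clusters/kmeans_centers.npy",       # hypothetical path
    n_px=32,  # public iGPT checkpoints operate on 32x32 inputs
)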
Example #4
    def __init__(self, model_name, model_size, models_dir, color_clusters_dir,
                 n_px, **parent_params):
        """

		Parameters
		----------
		model_name : str
			A name for this model, used for caching.
		model_size : str
			The size of iGPT used - "s" for small, "m" for medium, or "l" for large. The exact parameters are stored in
			`GPTExtractor.MODELS`.
		models_dir : str
			Path to directory with downloaded model. Make sure the params match the downloaded model.
		color_clusters_dir : str
			Path to directory with the downloaded color clusters.
		n_px : int
			The number of pixels used. All publicly available versions of iGPT are 32x32.
		parent_params
		"""
        super().__init__(model_name, **parent_params)

        self.n_px = n_px
        self.model_size = model_size

        color_clusters_file = "%s/kmeans_centers.npy" % color_clusters_dir
        self.clusters = np.load(color_clusters_file)  # get color clusters

        n_embd, n_head, n_layer = GPTExtractor.MODELS[
            model_size]  # set model hyperparameters

        self.vocab_size = len(
            self.clusters) + 1  # add one for start of sentence token

        self.config = transformers.GPT2Config(vocab_size=self.vocab_size,
                                              n_ctx=self.n_px * self.n_px,
                                              n_positions=self.n_px *
                                              self.n_px,
                                              n_embd=n_embd,
                                              n_layer=n_layer,
                                              n_head=n_head)
        self.model_path = "%s/%s/model.ckpt-1000000.index" % (models_dir,
                                                              model_size)
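
The constructor only prepares self.config and self.model_path; a later method presumably loads the checkpoint much as Example #3 does. A hypothetical sketch of that step:

    def load_model(self):
        # Hypothetical follow-up (mirrors Example #3): load the TF checkpoint with the stored config.
        return ImageGPT2LMHeadModel.from_pretrained(self.model_path, from_tf=True, config=self.config)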
Example #5
    def __init__(self,
                 vocab: nnlp.Vocab,
                 n_embd: int = 256,
                 n_layer: int = 4,
                 n_head: int = 4,
                 n_position: int = 128,
                 n_ctx: int = 128):
        super(GPT2Wrap, self).__init__()

        config = transformers.GPT2Config(vocab_size=len(vocab),
                                         n_embd=n_embd,
                                         n_layer=n_layer,
                                         n_head=n_head,
                                         n_positions=n_position,
                                         n_ctx=n_ctx,
                                         output_hidden_states=True)

        self.gpt2_model = transformers.GPT2LMHeadModel(config)
        self.vocab = vocab
        self.n_vocab = len(vocab)
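
Because the config sets output_hidden_states=True, the wrapped model returns per-layer hidden states alongside the logits. A hypothetical helper on the same class, assuming a transformers version with dataclass-style outputs:

    def hidden_states(self, input_ids):
        # Hypothetical helper (not in the source); assumes dataclass-style model outputs.
        outputs = self.gpt2_model(input_ids)
        # outputs.logits: (batch, seq_len, n_vocab); outputs.hidden_states:
        # embedding-layer output plus one tensor per block, each (batch, seq_len, n_embd)
        return outputs.hidden_states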
Example #6
def build_clf_model(vocab_size, params):
    # Create GPT2 language model configuration
    clf_config = tm.GPT2Config(vocab_size,
                               params["seqlen"],
                               params["n_ctx"],
                               params["embed"],
                               params["layers"],
                               params["heads"],
                               resid_pdrop=params["drop"],
                               embd_pdrop=params["drop"],
                               attn_pdrop=params["drop"])

    # Load pre-trained GPT2 without language model head
    clf_gpt2 = GPT2Classifier(clf_config)
    if params["finetune"]:
        ckpt = tf.train.Checkpoint(net=clf_gpt2)
        ckpt.restore(tf.train.latest_checkpoint(
            params["pretr"])).expect_partial()

    return clf_gpt2
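
Positional arguments make the mapping easy to misread; under the older GPT2Config signature these examples target (where n_ctx is still accepted), the call above is equivalent to this keyword form, using the same params keys:

clf_config = tm.GPT2Config(vocab_size=vocab_size,
                           n_positions=params["seqlen"],
                           n_ctx=params["n_ctx"],
                           n_embd=params["embed"],
                           n_layer=params["layers"],
                           n_head=params["heads"],
                           resid_pdrop=params["drop"],
                           embd_pdrop=params["drop"],
                           attn_pdrop=params["drop"])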
Example #7
def test(loadtype: LoadType, use_cuda: bool):
    cfg = transformers.GPT2Config()
    model = transformers.GPT2Model(cfg)
    model.eval()
    torch.set_grad_enabled(False)

    test_device = torch.device('cuda:0') if use_cuda else \
        torch.device('cpu:0')

    cfg = model.config
    # use 4 threads for computing
    turbo_transformers.set_num_threads(4)

    input_ids = torch.tensor(
        ([12166, 10699, 16752, 4454], [5342, 16471, 817, 16022]),
        dtype=torch.long)

    start_time = time.time()
    for _ in range(10):
        torch_res = model(input_ids)
    end_time = time.time()
    print("\ntorch time consum: {}".format(end_time - start_time))

    # there are three ways to load pretrained model.
    if loadtype is LoadType.PYTORCH:
        # 1, from a PyTorch model, which has loaded a pretrained model
        tt_model = turbo_transformers.GPT2Model.from_torch(model, test_device)
    else:
        raise ValueError("LoadType is not supported")

    start_time = time.time()
    for _ in range(10):
        res = tt_model(input_ids)  # sequence_output, pooled_output
    end_time = time.time()

    print("\nturbo time consum: {}".format(end_time - start_time))
    assert (numpy.max(
        numpy.abs(res[0].cpu().numpy() - torch_res[0].cpu().numpy())) < 0.1)
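
A trivial driver for the benchmark, assuming LoadType is the enum defined alongside this function:

if __name__ == "__main__":
    test(LoadType.PYTORCH, use_cuda=False)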
Example #8
    def __init__(self,
                 vocab: nnlp.Vocab,
                 n_embd: int = 256,
                 n_layer: int = 2,
                 n_head: int = 2,
                 n_position: int = 128,
                 n_ctx: int = 128,
                 unk_hard_loss: float = -1.0):
        super(BiGPT2LM, self).__init__()

        config = transformers.GPT2Config(vocab_size=len(vocab),
                                         n_embd=n_embd,
                                         n_layer=n_layer,
                                         n_head=n_head,
                                         n_positions=n_position,
                                         n_ctx=n_ctx,
                                         output_hidden_states=True)

        self.gpt2model_fwd = transformers.GPT2LMHeadModel(config)
        self.gpt2model_rev = transformers.GPT2LMHeadModel(config)

        self.vocab = vocab
        self.unk_hard_loss = unk_hard_loss
Example #9
def create_model(hparams, dictionary):
    # Config docs: https://huggingface.co/transformers/model_doc/gpt2.html#gpt2config
    model = transformers.GPT2LMHeadModel(
        transformers.GPT2Config(vocab_size=len(dictionary),
                                n_embd=hparams["embedding_dim"],
                                n_layer=hparams["n_layer"],
                                n_head=hparams["n_head"],
                                n_positions=hparams['max_seq_length'],
                                n_ctx=hparams['max_seq_length']))

    if hparams["load_checkpoint"]:
        model.load_state_dict(
            torch.load(hparams["load_checkpoint"],
                       map_location=lambda storage, location: storage))

    if hparams["use_multi_gpu"]:
        assert torch.cuda.device_count() > 1
        print("Using %d GPUs" % torch.cuda.device_count())
        model = torch.nn.DataParallel(model)

    optim = torch.optim.Adam(model.parameters(), lr=hparams["lr"])

    return model, optim
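
create_model reads several keys from hparams; a minimal dictionary covering all of them (values are illustrative only), used together with an existing dictionary object:

hparams = {
    "embedding_dim": 256,
    "n_layer": 4,
    "n_head": 4,
    "max_seq_length": 128,
    "load_checkpoint": None,   # falsy: skip loading a saved state_dict
    "use_multi_gpu": False,
    "lr": 1e-4,
}
model, optim = create_model(hparams, dictionary)  # `dictionary` supplies the vocabulary size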
Example #10
    def __init__(
        self,
        tokenizer_model,
        train_file,
        valid_file,
        test_file,
        from_pretrained=None,
        block_size=1024,
        # [Model config]
        # for small
        n_layer=12,
        n_head=12,
        n_embd=768,
        # for medium -> n_layer=24, n_head=16, n_embd=1024
        # for large  -> n_layer=36, n_head=20, n_embd=1280
        # for XL     -> n_layer=48, n_head=25, n_embd=1600
        # [DataLoader options]
        batch_size=2,
        prefetch_factor=10,
        num_workers=1,
        shuffle_buffer_size=1000,
        lr=1e-4,
        num_warmup_steps=0,
        num_training_steps=None,
    ):
        super().__init__()

        # Load tokenizer
        tokenizer = transformers.AutoTokenizer.from_pretrained(tokenizer_model)
        self._tokenizer = tokenizer

        # Load or initialize model
        if from_pretrained:
            config = transformers.GPT2Config.from_pretrained(from_pretrained)
            model = transformers.GPT2LMHeadModel.from_pretrained(
                from_pretrained)
        else:
            # Prepare model
            config = transformers.GPT2Config(
                vocab_size=len(tokenizer),
                tokenizer_class=tokenizer.__class__.__name__,
                bos_token_id=tokenizer.bos_token_id,
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id,
                sep_token_id=tokenizer.sep_token_id,
                cls_token_id=tokenizer.cls_token_id,
                unk_token_id=tokenizer.unk_token_id,
                #
                n_layer=n_layer,
                n_head=n_head,
                n_embd=n_embd)
            model = transformers.GPT2LMHeadModel(config)

        self.model = model
        self._config = config

        self._train_file = train_file
        self._valid_file = valid_file
        self._test_file = test_file
        self._batch_size = batch_size
        self._prefetch_factor = prefetch_factor
        self._num_workers = num_workers
        self._shuffle_buffer_size = shuffle_buffer_size
        self._lr = lr
        self._num_warmup_steps = num_warmup_steps
        self._num_training_steps = num_training_steps
Example #11
                                                  merges_path=MERGES_PATH,
                                                  data_path=DATA_PATH,
                                                  seq_len=seq_len)

split_ratio = [0.9, 0.1]
split_lens = [int(len(dataset) * split_ratio[0]), None]
split_lens[1] = len(dataset) - split_lens[0]

train_set, valid_set = torch.utils.data.random_split(dataset, split_lens)

print("Loading Model...")

config = transformers.GPT2Config(
    vocab_size=261,
    n_positions=seq_len,
    n_ctx=seq_len,
    n_embd=30,
    n_layer=3,
    n_head=3
)

model = transformers.GPT2LMHeadModel(config=config)

print("Training Model...")

writer = SummaryWriter()

training_args = transformers.TrainingArguments(
    output_dir="models/gpt2/",
    do_train=True,
    do_eval=True,
    evaluate_during_training=True,
Example #12
def main(
        tokenizer_model,
        save_model_dir,
        train_file,
        valid_file,
        seed=None,
        block_size=1024,
        # [Model config]
        # for small
        n_layer=12,
        n_head=12,
        n_embd=768,
        # for medium -> n_layer=24, n_head=16, n_embd=1024
        # for large  -> n_layer=36, n_head=20, n_embd=1280
        # for XL     -> n_layer=48, n_head=25, n_embd=1600
        # [DataLoader options]
        batch_size=2,
        prefetch_factor=10,
        num_workers=1,
        shuffle_buffer_size=1000,
        lr=1e-4,
        num_warmup_steps=0,
        num_training_steps=None,
        # options for trainer
        ds_config: str = None,
        **train_options):
    # Set seed
    if seed:
        pl.seed_everything(seed)
    # Load tokenizer
    tokenizer = transformers.AutoTokenizer.from_pretrained(tokenizer_model)

    # Prepare model
    config = transformers.GPT2Config(
        vocab_size=len(tokenizer),
        tokenizer_class=tokenizer.__class__.__name__,
        bos_token_id=tokenizer.bos_token_id,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
        sep_token_id=tokenizer.sep_token_id,
        cls_token_id=tokenizer.cls_token_id,
        unk_token_id=tokenizer.unk_token_id,
        #
        n_layer=n_layer,
        n_head=n_head,
        n_embd=n_embd)
    print(config)

    # Load data
    train_dataset = BlockDataset.from_file(
        block_size=config.n_ctx,
        tokenizer=tokenizer,
        filename=train_file,
    )
    valid_dataset = BlockDataset.from_file(
        block_size=config.n_ctx,
        tokenizer=tokenizer,
        filename=valid_file,
    )
    shuffled_train_dataset = torch.utils.data.BufferedShuffleDataset(
        train_dataset,
        buffer_size=shuffle_buffer_size,
    )

    # Build DataLoader
    train_loader = torch.utils.data.DataLoader(
        dataset=shuffled_train_dataset,
        batch_size=batch_size,
        collate_fn=BlockDataset.collate_fn,
        prefetch_factor=prefetch_factor,
        num_workers=num_workers,
    )
    valid_loader = torch.utils.data.DataLoader(
        dataset=valid_dataset,
        batch_size=batch_size,
        collate_fn=BlockDataset.collate_fn,
        prefetch_factor=prefetch_factor,
        num_workers=num_workers,
    )

    # Trainer
    print("Training options:", train_options)
    pl_model = PLModel(config=config,
                       lr=lr,
                       num_warmup_steps=num_warmup_steps,
                       num_training_steps=num_training_steps)

    # Setup callbacks
    callbacks = [
        pl.callbacks.LearningRateMonitor(),
    ]
    if "gpus" in train_options:
        callbacks.append(pl.callbacks.GPUStatsMonitor())

    # Setup plugins
    plugins = []
    if ds_config:
        plugins.append(DeepSpeedPlugin(config=ds_config))

    # Trainer
    trainer = pl.Trainer(
        **train_options,
        deterministic=True if seed else False,
        callbacks=callbacks,
    )
    trainer.fit(model=pl_model,
                train_dataloader=train_loader,
                val_dataloaders=valid_loader)
    pl_model.model.save_pretrained(save_model_dir)
    tokenizer.save_pretrained(save_model_dir)
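
A direct invocation of main might look like the following; the paths are hypothetical, and the extra keyword arguments are forwarded to pl.Trainer through **train_options:

main(
    tokenizer_model="gpt2",
    save_model_dir="models/gpt2-small",  # hypothetical output directory
    train_file="data/train.txt",         # hypothetical corpus files
    valid_file="data/valid.txt",
    seed=1234,
    gpus=1,        # forwarded to pl.Trainer via **train_options
    max_epochs=1,
)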