Example No. 1
def gpt2Tokenizer(*args, **kwargs):
    """
    Instantiate a byte-level BPE tokenizer for OpenAI GPT-2 from a pre-trained/customized vocab file.
    Peculiarities:
        - Byte-level BPE

    Args:
    pretrained_model_name_or_path: Path to pretrained model archive
                                   or one of pre-trained vocab configs below.
                                       * gpt2
    Keyword args:
    special_tokens: Special tokens in vocabulary that are not pretrained ([SEP], [CLS]...)
                    Default: None
    max_len: An artificial maximum length to truncate tokenized sequences to;
             Effective maximum length is always the minimum of this
             value (if specified) and the underlying GPT-2 model's
             sequence length.
             Default: None

    Example:
        >>> import torch
        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'gpt2Tokenizer', 'gpt2')

        >>> text = "Who was Jim Henson ?"
        >>> indexed_tokens = tokenizer.encode(text)
    """
    tokenizer = GPT2Tokenizer.from_pretrained(*args, **kwargs)
    return tokenizer
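For reference, a minimal standalone sketch of the same tokenizer usage without going through torch.hub; it assumes the pytorch_transformers package is installed and the pretrained 'gpt2' vocabulary files can be downloaded:

# Minimal usage sketch; assumes network access to fetch the pretrained 'gpt2' vocab files.
from pytorch_transformers import GPT2Tokenizer  # in newer releases: from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

text = "Who was Jim Henson ?"
tokens = tokenizer.tokenize(text)           # byte-level BPE tokens; leading spaces show up as '\u0120'
indexed_tokens = tokenizer.encode(text)     # token ids
decoded = tokenizer.decode(indexed_tokens)  # back to a string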
Example No. 2
    def test_full_tokenizer(self):
        """ Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt """
        vocab = [
            "l",
            "o",
            "w",
            "e",
            "r",
            "s",
            "t",
            "i",
            "d",
            "n",
            "lo",
            "low",
            "er",
            "low",
            "lowest",
            "newer",
            "wider",
            "<unk>",
        ]
        vocab_tokens = dict(zip(vocab, range(len(vocab))))
        merges = ["#version: 0.2", "l o", "lo w", "e r", ""]
        special_tokens_map = {"unk_token": "<unk>"}

        with TemporaryDirectory() as tmpdirname:
            vocab_file = os.path.join(tmpdirname,
                                      VOCAB_FILES_NAMES["vocab_file"])
            merges_file = os.path.join(tmpdirname,
                                       VOCAB_FILES_NAMES["merges_file"])
            with open(vocab_file, "w") as fp:
                fp.write(json.dumps(vocab_tokens))
            with open(merges_file, "w") as fp:
                fp.write("\n".join(merges))

            input_text = "lower newer"
            output_text = "lower<unk>newer"

            create_and_check_tokenizer_commons(self, input_text, output_text,
                                               GPT2Tokenizer, tmpdirname,
                                               **special_tokens_map)

            tokenizer = GPT2Tokenizer(vocab_file, merges_file,
                                      **special_tokens_map)
            text = "lower"
            bpe_tokens = ["low", "er"]
            tokens = tokenizer.tokenize(text)
            self.assertListEqual(tokens, bpe_tokens)

            input_tokens = tokens + [tokenizer.unk_token]
            input_bpe_tokens = [13, 12, 17]
            self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens),
                                 input_bpe_tokens)
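A note on the toy vocabulary above: "low" appears twice, and dict(zip(vocab, range(len(vocab)))) keeps the later index, which is what makes the expected ids [13, 12, 17] line up. A plain-Python sketch of that mapping:

# Sketch of how the expected ids [13, 12, 17] arise from the toy vocab above.
vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n",
         "lo", "low", "er", "low", "lowest", "newer", "wider", "<unk>"]
vocab_tokens = dict(zip(vocab, range(len(vocab))))
assert vocab_tokens["low"] == 13   # the duplicate "low" overwrites index 11 with 13
assert vocab_tokens["er"] == 12
assert vocab_tokens["<unk>"] == 17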
Example No. 3
    def test_full_tokenizer(self):
        tokenizer = GPT2Tokenizer(self.vocab_file, self.merges_file,
                                  **self.special_tokens_map)
        text = "lower"
        bpe_tokens = ["low", "er"]
        tokens = tokenizer.tokenize(text)
        self.assertListEqual(tokens, bpe_tokens)

        input_tokens = tokens + [tokenizer.unk_token]
        input_bpe_tokens = [13, 12, 17]
        self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens),
                             input_bpe_tokens)
Example No. 4
    def test_full_tokenizer(self):
        tokenizer = GPT2Tokenizer(self.vocab_file, self.merges_file,
                                  **self.special_tokens_map)
        text = "lower newer"
        bpe_tokens = ["\u0120low", "er", "\u0120", "n", "e", "w", "er"]
        tokens = tokenizer.tokenize(text)
        self.assertListEqual(tokens, bpe_tokens)

        input_tokens = tokens + [tokenizer.unk_token]
        input_bpe_tokens = [14, 15, 10, 9, 3, 2, 15, 19]
        self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens),
                             input_bpe_tokens)
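The "\u0120" prefix in the expected tokens above is GPT-2's byte-level marker for a preceding space: bytes outside the printable ranges kept by the byte-to-unicode table are remapped to code points starting at 256 in byte order, so the space byte 0x20 roughly ends up at code point 0x120 ('Ġ'). A quick check:

# Why a leading space appears as "\u0120" in GPT-2 tokens: bytes 0x00-0x20 are the
# first 33 bytes remapped above 255, so space (0x20) lands on 256 + 0x20 = 0x120.
assert chr(256 + 0x20) == "\u0120" == "Ġ"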
Example No. 5
    def get_tokenizer(self):
        return GPT2Tokenizer.from_pretrained(self.tmpdirname,
                                             **self.special_tokens_map)
Example No. 6
    def get_tokenizer(self, **kwargs):
        kwargs.update(self.special_tokens_map)
        return GPT2Tokenizer.from_pretrained(self.tmpdirname, **kwargs)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_name", type=str, help="Name of the pretrained model to load (e.g. gpt2).")
    parser.add_argument("--model_path", type=str, help="Path to a pretrained model checkpoint.")
    parser.add_argument("--do_train", action='store_true', help="Whether to run training.")
    parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.")
    parser.add_argument("--context_length", type=int, help="Context length (in tokens) to use when evaluating on the dev set.", default=30)
    parser.add_argument("--shuffle_pos", type=int, help="Shuffle words starting at a certain relative position.", default=30)
    parser.add_argument("--do_local_shuffle", action='store_true', help="Whether to run eval on the dev set with a local window of the context shuffled.")
    parser.add_argument("--do_global_shuffle", action='store_true', help="Whether to run eval on the dev set with the beginning of the context shuffled.")
    parser.add_argument("--word_order_context_length", type=int, help="Context length to use for the shuffled word-order eval.", default=None)
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")

    parser.add_argument('--data_dir', type=str, default='/home/xiongyi/dataxyz/repos/SemSynLSTM/word_language_model/data/wikitext-2/')
    parser.add_argument('--tokenized', action='store_true', help="Whether we have tokenized data ready.")
    parser.add_argument('--load_finetuned', action='store_true', help="Whether to load a finetuned model.")
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--num_train_epochs', type=int, default=3)
    parser.add_argument('--train_batch_size', type=int, default=2)
    parser.add_argument('--eval_batch_size', type=int, default=10000)
    parser.add_argument('--sequence_length', type=int, default=512)
    parser.add_argument('--max_grad_norm', type=int, default=1)
    parser.add_argument('--learning_rate', type=float, default=6.25e-5)
    parser.add_argument('--warmup_proportion', type=float, default=0.002)
    parser.add_argument('--lr_schedule', type=str, default='warmup_linear')
    parser.add_argument('--weight_decay', type=float, default=0.01)
    parser.add_argument('--lm_coef', type=float, default=0.9)
    parser.add_argument('--n_valid', type=int, default=374)

    args = parser.parse_args(['--output_dir', './fine_tuned_model','--do_eval', '--num_train_epochs', '1',\
                              '--model_name', 'gpt2', '--tokenized','--load_finetuned', '--context_length',\
                              '300','--shuffle_pos','200', '--do_local_shuffle'])
    #args = parser.parse_args()
    #args = parser.parse_args(['--output_dir', './tmp', '--do_eval', '--model_name', 'gpt2'])
    print(args)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    logger.warning("device: {}, n_gpu {}".format(device, n_gpu))

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    if args.load_finetuned:
        config = GPT2Config.from_pretrained('gpt2')
        model = GPT2LMHeadModel(config)
        model.load_state_dict(torch.load(os.path.join(args.output_dir, 'gpt2_1epoch.bin')))

        # tokenizer = OpenAIGPTTokenizer.from_pretrained(args.output_dir)
        model.to(device)
    elif args.model_name:
        model = GPT2LMHeadModel.from_pretrained(args.model_name)
        config = model.config

    wandb.watch(model)

    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

    # Compute the max input length for the Transformer
    # Todo: Where is this used?
    sequence_length = max(config.n_ctx, args.sequence_length)
    if not args.tokenized:
        data_dir = '../SemSynLSTM/word_language_model/data/wikitext-2/' if args.data_dir is None else args.data_dir
        corpus = TaggedGPT2Corpus(data_dir, tokenizer=tokenizer)
        torch.save(corpus.train[0], 'train_id.pt')
        torch.save(corpus.train[1], 'train_pos.pt')
        torch.save(corpus.valid[0], 'val_id.pt')
        torch.save(corpus.valid[1], 'val_pos.pt')
        train_set, val_set, test_set, dictionary, pos_dictionary = load_tokenize_and_batchify(tokenizer, corpus,
                                                                                              data_dir, sequence_length)
    else:
        train_id = torch.load('/home/xiongyi/dataxyz/data/corpora/wikitext-2/train_id.pt')
        train_pos = torch.load('/home/xiongyi/dataxyz/data/corpora/wikitext-2/train_pos.pt')
        train_set = (train_id, train_pos)
        val_set = torch.load('/home/xiongyi/dataxyz/data/corpora/wikitext-2/val_id.pt')[100000:110000]
        n_batch = len(train_set[0]) // sequence_length
        input_ids = train_set[0][: n_batch * sequence_length].reshape(n_batch, sequence_length)
        pos_ids = train_set[1][: n_batch * sequence_length].reshape(n_batch, sequence_length)
        all_inputs = (input_ids, pos_ids)
        train_set = tuple(all_inputs)
    #breakpoint()
    model.to(device)
    # Prepare inputs tensors and dataloaders

    train_data = TensorDataset(*train_set)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

    eval_data = TensorDataset(val_set)
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=len(val_set))

    # TODO: Load tokenizer and model
    # This loading functions also add new tokens and embeddings called `special tokens`
    # These new embeddings will be fine-tuned on the RocStories dataset
    #special_tokens = ['_start_', '_delimiter_']
    #special_tokens_ids = list(tokenizer.convert_tokens_to_ids(token) for token in special_tokens)

    # TODO: Add config



    # TODO: Load and encode the datasets

    logger.warning("Encoding dataset...")
    # Prepare optimizer
    if args.do_train:
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        # Apply weight decay to every parameter except biases and LayerNorm weights.
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
        num_train_optimization_steps = len(train_dataloader) * args.num_train_epochs
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=args.learning_rate,
                          #max_grad_norm=args.max_grad_norm,
                          weight_decay=args.weight_decay)
                          #t_total=num_train_optimization_steps)

    if args.do_train:
        train_results = {}
        nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
        model.train()
        for epoch in trange(int(args.num_train_epochs), desc="Epoch"):
            ###eval on eval set
            # model.eval()
            # nb_eval_steps, nb_eval_examples = 0, 0
            # log_probs_sum = 0
            # perp = 0
            # average_loss = 0
            # for batch in tqdm(eval_dataloader, desc="Evaluating"):
            #     batch = tuple(t.to(device) for t in batch)
            #     input_ids, input_pos_ids = batch
            #
            #     with torch.no_grad():
            #         loss = model(input_ids, labels=input_ids)[0].detach().cpu().numpy()
            #         perp_batch = np.exp(loss)
            #         perp += perp_batch
            #         average_loss += loss
            #     nb_eval_steps += 1
            # perp /= nb_eval_steps
            # average_loss /= nb_eval_steps
            # print('loss', average_loss,'perp ', perp, 'epoch ', epoch)
            # train_results[epoch]= (perp, average_loss)

            model.train()

            tr_loss = 0
            nb_tr_steps = 0
            tqdm_bar = tqdm(train_dataloader, desc="Training")
            for step, batch in enumerate(tqdm_bar):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_pos_ids = batch
                loss = model(input_ids, labels=input_ids)[0]
                #breakpoint()
                #loss = args.lm_coef * losses[0] + losses[1]
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()
                tr_loss += loss.item()
                exp_average_loss = loss.item() if exp_average_loss is None else 0.7 * exp_average_loss + 0.3 * loss.item()
                nb_tr_steps += 1
                tqdm_bar.desc = "Training loss: {:.2e} ".format(exp_average_loss)

    # Save a trained model
    if args.do_train:
        # Save a trained model, configuration and tokenizer
        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself

        # If we save using the predefined names, we can load using `from_pretrained`
        output_model_file = os.path.join(args.output_dir, args.model_name+'_epoch_' + str(args.num_train_epochs))
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)

        torch.save(model_to_save.state_dict(), output_model_file)
        model_to_save.config.to_json_file(output_config_file)
        #tokenizer.save_vocabulary(args.output_dir)

        # Load a trained model and vocabulary that you have fine-tuned

        print (train_results)

    if args.do_eval:
        model.eval()
        with torch.no_grad():
            nb_eval_steps, nb_eval_examples = 0, 0
            perp = 0
            loss = 0
            processed_tokens = 0
            for batch in tqdm(eval_dataloader, desc="Evaluating"):
                dat = batch[0]
                #breakpoint()
                perp_batch = 0
                # Score each token against the model's prediction given a sliding
                # window of the preceding `context_length` tokens.
                for i, token in enumerate(tqdm(dat)):
                    if i < args.context_length:
                        continue
                    if processed_tokens % 500 == 0 and processed_tokens:
                        print ('perp ', np.exp(loss/processed_tokens), 'processed_tokens ', processed_tokens )
                        logger.warning("'perp' = %s, 'processed_tokens' = %s", str(np.exp(loss/processed_tokens)), str(processed_tokens))
                        wandb.log({"Eval perp": str(np.exp(loss/processed_tokens)), "Processed tokens": str(processed_tokens)})
                    input_ids = dat[i-args.context_length:i].to(device).unsqueeze(0)
                    if args.do_local_shuffle:
                        copy = input_ids[0,-args.shuffle_pos-20 : -args.shuffle_pos]
                        rand_ids = torch.randperm(len(copy))
                        copy = copy[rand_ids]
                        #random.shuffle(copy)
                        #copy.reverse()
                        input_ids[0,-args.shuffle_pos-20 : -args.shuffle_pos] = copy
                    elif args.do_global_shuffle:
                        copy = input_ids[0,:args.shuffle_pos]
                        rand_ids = torch.randperm(len(copy))
                        copy = copy[rand_ids]
                        #random.shuffle(copy)
                        #copy.reverse()
                        input_ids[0,:args.shuffle_pos]= copy

                    logits = model(input_ids)[0][0,-1,:].detach().cpu().numpy()
                    #pred_id = np.argmax(logits)
                    #pred_token = tokenizer.convert_ids_to_tokens([pred_id])[0]
                    #print (input_sent + ' ' + pred_token)
                    logprob = logits[token.item()] - logsumexp(logits)
                    #perp_tok = np.exp(-logprob)
                    #print (tokenizer.convert_ids_to_tokens([token.item()]), 'perp_tok ', perp_tok)
                    loss += -logprob
                    processed_tokens += 1
                nb_eval_steps += 1
                print ('processed ', processed_tokens)
                loss /= processed_tokens
                perp = np.exp(loss)
                # perp_word = perp / 128
                print (perp)
            result = {'eval_perp': perp}
            logger.warning("***** Eval results *****")
            logger.warning("'eval_perp' = %s", str(result['eval_perp']))
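For clarity, the per-token loss in the eval loop above is the negative log-softmax probability of the observed token, computed from raw logits with log-sum-exp; perplexity is the exponential of the average loss over all processed tokens. A standalone sketch (assuming logsumexp comes from scipy.special, as the call in the loop suggests):

# Standalone sketch of the per-token perplexity computation used in the eval loop.
import numpy as np
from scipy.special import logsumexp

logits = np.array([2.0, 1.0, 0.5, -1.0])         # toy logits over a 4-token vocabulary
observed = 0                                     # id of the token that actually occurred
log_prob = logits[observed] - logsumexp(logits)  # log softmax probability of that token
loss = -log_prob                                 # its negative log-likelihood
perplexity = np.exp(loss)                        # exp of the average NLL (a single token here)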