Example #1
def get_tokenizer(tokenizer_name):
    log.info(f"\tLoading Tokenizer {tokenizer_name}")
    if tokenizer_name.startswith("bert-"):
        do_lower_case = tokenizer_name.endswith("uncased")
        tokenizer = BertTokenizer.from_pretrained(tokenizer_name, do_lower_case=do_lower_case)
    elif tokenizer_name.startswith("roberta-"):
        tokenizer = RobertaTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("xlnet-"):
        do_lower_case = tokenizer_name.endswith("uncased")
        tokenizer = XLNetTokenizer.from_pretrained(tokenizer_name, do_lower_case=do_lower_case)
    elif tokenizer_name.startswith("openai-gpt"):
        tokenizer = OpenAIGPTTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("gpt2"):
        tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("transfo-xl-"):
        # TransformerXL is trained on data pretokenized with MosesTokenizer
        tokenizer = MosesTokenizer()
    elif tokenizer_name.startswith("xlm-"):
        tokenizer = XLMTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name == "MosesTokenizer":
        tokenizer = MosesTokenizer()
    elif tokenizer_name == "SplitChars":
        tokenizer = SplitCharsTokenizer()
    elif tokenizer_name == "":
        tokenizer = SpaceTokenizer()
    else:
        tokenizer = None
    return tokenizer
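A minimal usage sketch for the dispatch above (the names are only illustrative; the tokenizer classes are assumed to be imported as in the surrounding module):

bert_tok = get_tokenizer("bert-base-uncased")    # BertTokenizer with do_lower_case=True
gpt2_tok = get_tokenizer("gpt2")                 # GPT2Tokenizer
space_tok = get_tokenizer("")                    # falls back to SpaceTokenizer
unknown_tok = get_tokenizer("not-a-known-name")  # returns None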
Example #2
def add_pytorch_transformers_vocab(vocab, tokenizer_name):
    """Add vocabulary from tokenizers in pytorch_transformers for use with pre-tokenized data.

    These tokenizers have a convert_tokens_to_ids method, but this doesn't do
    anything special, so we can just use the standard indexers.
    """
    do_lower_case = "uncased" in tokenizer_name

    if tokenizer_name.startswith("bert-"):
        tokenizer = BertTokenizer.from_pretrained(tokenizer_name,
                                                  do_lower_case=do_lower_case)
    elif tokenizer_name.startswith("roberta-"):
        tokenizer = RobertaTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("xlnet-"):
        tokenizer = XLNetTokenizer.from_pretrained(tokenizer_name,
                                                   do_lower_case=do_lower_case)
    elif tokenizer_name.startswith("openai-gpt"):
        tokenizer = OpenAIGPTTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("gpt2"):
        tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("transfo-xl-"):
        tokenizer = TransfoXLTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("xlm-"):
        tokenizer = XLMTokenizer.from_pretrained(tokenizer_name)

    if (tokenizer_name.startswith("openai-gpt")
            or tokenizer_name.startswith("gpt2")
            or tokenizer_name.startswith("transo-xl-")):
        tokenizer.add_special_tokens({
            "bos_token": "<start>",
            "sep_token": "<delim>",
            "cls_token": "<extract>"
        })
    # TODO: this is another place that can be simplified by the "model-before-preprocess" reorganization
    # we can pass tokenizer created in model here, see issue <TBD>

    vocab_size = len(tokenizer)
    # do not use tokenizer.vocab_size, it does not include newly added token
    if tokenizer_name.startswith("roberta-"):
        if tokenizer.convert_ids_to_tokens(vocab_size - 1) is None:
            vocab_size -= 1
        else:
            log.info("Time to delete vocab_size-1 in preprocess.py !!!")
    # due to a quirk in huggingface's file, the last token of RobertaTokenizer is None, remove
    # this when they fix the problem

    ordered_vocab = tokenizer.convert_ids_to_tokens(range(vocab_size))
    log.info("Added pytorch_transformers vocab (%s): %d tokens",
             tokenizer_name, len(ordered_vocab))
    for word in ordered_vocab:
        vocab.add_token_to_namespace(
            word, input_module_tokenizer_name(tokenizer_name))
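A hedged sketch of how this helper might be called; Vocabulary is the AllenNLP vocabulary class the function appears to target, and input_module_tokenizer_name is assumed to be defined elsewhere in the project:

from allennlp.data import Vocabulary

vocab = Vocabulary()
add_pytorch_transformers_vocab(vocab, "gpt2")
# the GPT-2 subword vocabulary now lives in the namespace returned by
# input_module_tokenizer_name("gpt2")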
Example #3
    def __init__(self,
                 chunck_size=64,
                 max_length=35,
                 device=torch.device('cuda:0')):
        super(GPTClient, self).__init__()
        self.chunck_size = chunck_size
        self.tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
        self.max_length = max_length
        # load the model
        self.model = OpenAIGPTModel.from_pretrained('openai-gpt')
        self.model.eval()
        self.device = device
        # move model to device
        self.model.to(self.device)
Example #4
    def __init__(self, opt, shared=None):
        super(TransformerAgent, self).__init__(opt, shared)

        args = AttrDict(
            opt)  # to keep most commands identical to the interact.py script
        self.args = args

        logging.basicConfig(level=logging.INFO)
        self.logger = logging.getLogger(__file__)
        self.logger.info(pformat(args))

        random.seed(args.seed)
        torch.random.manual_seed(args.seed)
        torch.cuda.manual_seed(args.seed)

        if shared is None:
            self.logger.info("Get pretrained model and tokenizer")
            if args.model_checkpoint == "":
                args.model_checkpoint = download_pretrained_model()
            if 'gpt2' in args.model_checkpoint:
                self.tokenizer = GPT2Tokenizer.from_pretrained(
                    args.model_checkpoint)
                model_class = GPT2DoubleHeadsModel if self.args.eval_type == "hits@1" else GPT2LMHeadModel
            else:
                self.tokenizer = OpenAIGPTTokenizer.from_pretrained(
                    args.model_checkpoint)
                model_class = OpenAIGPTDoubleHeadsModel if self.args.eval_type == "hits@1" else OpenAIGPTLMHeadModel

            self.model_checkpoint = model_class.from_pretrained(
                args.model_checkpoint)
            self.model_checkpoint.to(args.device)

            self.logger.info("Build BPE prefix dictionary")
            convai_dict = build_dict()
            assert len(convai_dict) == 19304
            self.prefix2words = self.get_prefix2words(convai_dict)
        else:
            self.model_checkpoint = shared['model']
            self.tokenizer = shared['tokenizer']
            self.prefix2words = shared['prefix2words']
        add_special_tokens_(self.model_checkpoint, self.tokenizer)
        self.special_tokens_ids = self.tokenizer.convert_tokens_to_ids(
            SPECIAL_TOKENS)

        self.persona = []
        self.history = []
        self.labels = []

        self.reset()
Example #5
    def test_special_tokens_checkpoint_behavior(self):
        toks = [
            OpenAIGPTTokenizer.from_pretrained('openai-gpt'),
            GPT2Tokenizer.from_pretrained('gpt2')
        ]
        for tok in toks:
            self.assertEqual(len(tok.added_tokens_encoder), 0)
            tok.add_special_tokens(ATTR_TO_SPECIAL_TOKEN)
            self.assertEqual(len(tok.added_tokens_encoder), 5)
            # Make sure we never split
            self.assertEqual(len(tok.tokenize("<bos> <speaker1>")), 2)
            ids = tok.convert_tokens_to_ids(SPECIAL_TOKENS)
            self.assertTrue(
                all([x > 0 for x in ids]),
                f'some tokens failed to tokenize {SPECIAL_TOKENS} -> {ids}')
            # Need to maintain indices through save. (this is also tested in pytorch-transformers)
            tok.save_pretrained(self.save_dir)
            tok_loaded = tok.from_pretrained(str(self.save_dir))
            ids2 = tok_loaded.convert_tokens_to_ids(SPECIAL_TOKENS)
            self.assertListEqual(ids, ids2)
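The constants referenced above are not shown in this snippet. A plausible definition, consistent with the assertions (five newly added tokens, and "<bos>" / "<speaker1>" surviving tokenization unsplit), would be:

SPECIAL_TOKENS = ["<bos>", "<eos>", "<speaker1>", "<speaker2>", "<pad>"]
ATTR_TO_SPECIAL_TOKEN = {
    'bos_token': '<bos>',
    'eos_token': '<eos>',
    'pad_token': '<pad>',
    'additional_special_tokens': ('<speaker1>', '<speaker2>')
}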
Example #6
import jieba
from torch.utils.data import Dataset, DataLoader
import torch
from pytorch_transformers import OpenAIGPTTokenizer, OpenAIGPTConfig


special_tokens = ['<bos>', '<del>', '<eos>', '<pad>']
tokenizer = OpenAIGPTTokenizer.from_pretrained(
    'openai-gpt', special_tokens=special_tokens)


def convert_tokens_to_ids(text):
    return tokenizer.convert_tokens_to_ids(text)


class MyData(Dataset):

    def __init__(self, texts, labels, is_train=True):
        self.texts = [jieba.lcut(t) for t in texts]
        self.labels = labels
        # other operations ...

    def __getitem__(self, item):
        token_id = convert_tokens_to_ids(self.texts[item])  # words -> token ids
        label = self.labels[item]
        return torch.LongTensor(token_id), torch.LongTensor([label])

    def __len__(self):
        return len(self.texts)
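A hypothetical usage sketch for the dataset above (texts and labels are made up; batch_size=1 sidesteps padding of variable-length sequences):

texts = ["今天天气很好", "我喜欢自然语言处理"]
labels = [0, 1]
dataset = MyData(texts, labels)
token_ids, label = dataset[0]  # LongTensor of token ids, LongTensor([label])
loader = DataLoader(dataset, batch_size=1, shuffle=True)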

Example #7
    def __init__(self):
        self.name = 'GPTLanguageModel'
        self.trainable_model = False
        self.GPT_tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
        self.model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt').eval()
Example #8
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name',
                        type=str,
                        default='openai-gpt',
                        help='pretrained model name')
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )
    parser.add_argument('--train_dataset', type=str, default='')
    parser.add_argument('--eval_dataset', type=str, default='')
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--num_train_epochs', type=int, default=3)
    parser.add_argument('--train_batch_size', type=int, default=8)
    parser.add_argument('--eval_batch_size', type=int, default=16)
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument('--max_grad_norm', type=int, default=1)
    parser.add_argument("--max_steps",
                        default=-1,
                        type=int,
                        help="If > 0: set total number of training \
                        steps to perform. Override num_train_epochs.")
    parser.add_argument('--gradient_accumulation_steps',
                        type=int,
                        default=1,
                        help="Number of updates steps to accumulate before\
                        performing a backward/update pass.")
    parser.add_argument('--learning_rate', type=float, default=6.25e-5)
    parser.add_argument("--warmup_steps",
                        default=0,
                        type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument('--lr_schedule', type=str, default='warmup_linear')
    parser.add_argument('--weight_decay', type=float, default=0.01)
    parser.add_argument('--lm_coef', type=float, default=0.9)
    parser.add_argument('--n_valid', type=int, default=374)

    parser.add_argument('--server_ip',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    parser.add_argument('--server_port',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    args = parser.parse_args()
    print(args)

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    logger.info("device: {}, n_gpu {}".format(device, n_gpu))

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # Load tokenizer and model
    # These loading functions also add new tokens and embeddings called `special tokens`
    # These new embeddings will be fine-tuned on the RocStories dataset
    special_tokens = ['_start_', '_delimiter_', '_classify_']
    tokenizer = OpenAIGPTTokenizer.from_pretrained(args.model_name)
    tokenizer.add_special_tokens({
        'cls_token': '<CLS>',
        'sep_token': '<SEP>',
        'pad_token': '<PAD>',
        'eos_token': '<EOS>'
    })
    model = OpenAIGPTDoubleHeadsModel.from_pretrained(args.model_name)
    model.resize_token_embeddings(len(tokenizer))
    special_tokens_ids = [
        tokenizer.convert_tokens_to_ids(special_token)
        for special_token in ['<PAD>', '<CLS>', '<SEP>', '<EOS>']
    ]
    model.to(device)

    # Load and encode the datasets
    if not args.train_dataset and not args.eval_dataset:
        roc_stories = cached_path(ROCSTORIES_URL)

    def tokenize_and_encode(obj):
        """ Tokenize and encode a nested object """
        if isinstance(obj, str):
            return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
        elif isinstance(obj, int):
            return obj
        return list(tokenize_and_encode(o) for o in obj)

    logger.info("Encoding dataset...")
    train_dataset = load_rocstories_dataset(args.train_dataset)
    eval_dataset = load_rocstories_dataset(args.eval_dataset)
    datasets = (train_dataset, eval_dataset)
    encoded_datasets = tokenize_and_encode(datasets)

    # Compute the max input length for the Transformer
    max_length = model.config.n_positions // 2 - 2
    input_length = max(len(story[:max_length]) + max(len(cont1[:max_length]), len(cont2[:max_length])) + 3  \
                           for dataset in encoded_datasets for story, cont1, cont2, _ in dataset)
    input_length = min(input_length, model.config.n_positions
                       )  # Max size of input for the pre-trained model

    # Prepare inputs tensors and dataloaders
    tensor_datasets = pre_process_datasets(encoded_datasets, input_length,
                                           max_length, *special_tokens_ids)
    train_tensor_dataset, eval_tensor_dataset = tensor_datasets[
        0], tensor_datasets[1]

    train_data = TensorDataset(*train_tensor_dataset)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    eval_data = TensorDataset(*eval_tensor_dataset)
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    # Prepare optimizer
    if args.do_train:
        if args.max_steps > 0:
            t_total = args.max_steps
            args.num_train_epochs = args.max_steps //\
                (len(train_dataloader) // args.gradient_accumulation_steps) + 1
        else:
            t_total = len(train_dataloader)\
                // args.gradient_accumulation_steps * args.num_train_epochs

        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            args.weight_decay
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        }]
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=args.learning_rate,
                          eps=args.adam_epsilon)
        scheduler = WarmupLinearSchedule(optimizer,
                                         warmup_steps=args.warmup_steps,
                                         t_total=t_total)

    if args.do_train:
        nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_steps = 0
            tqdm_bar = tqdm(train_dataloader, desc="Training")
            for step, batch in enumerate(tqdm_bar):
                batch = tuple(t.to(device) for t in batch)
                input_ids, mc_token_ids, lm_labels, mc_labels = batch
                losses = model(input_ids, mc_token_ids, lm_labels, mc_labels)
                loss = args.lm_coef * losses[0] + losses[1]
                loss.backward()
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
                tr_loss += loss.item()
                exp_average_loss = (
                    loss.item() if exp_average_loss is None
                    else 0.7 * exp_average_loss + 0.3 * loss.item())
                nb_tr_steps += 1
                tqdm_bar.desc = "Training loss: {:.2e} lr: {:.2e}".format(
                    exp_average_loss,
                    scheduler.get_lr()[0])

    # Save a trained model
    if args.do_train:
        # Save a trained model, configuration and tokenizer
        model_to_save = model.module if hasattr(
            model, 'module') else model  # Only save the model itself

        # If we save using the predefined names, we can load using `from_pretrained`
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)

        torch.save(model_to_save.state_dict(), output_model_file)
        model_to_save.config.to_json_file(output_config_file)
        tokenizer.save_vocabulary(args.output_dir)

        # Load a trained model and vocabulary that you have fine-tuned
        model = OpenAIGPTDoubleHeadsModel.from_pretrained(args.output_dir)
        tokenizer = OpenAIGPTTokenizer.from_pretrained(args.output_dir)
        model.to(device)

    if args.do_eval:
        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            batch = tuple(t.to(device) for t in batch)
            input_ids, mc_token_ids, lm_labels, mc_labels = batch
            with torch.no_grad():
                _, mc_loss, _, mc_logits = model(input_ids, mc_token_ids,
                                                 lm_labels, mc_labels)

            mc_logits = mc_logits.detach().cpu().numpy()
            mc_labels = mc_labels.to('cpu').numpy()
            tmp_eval_accuracy = accuracy(mc_logits, mc_labels)

            eval_loss += mc_loss.mean().item()
            eval_accuracy += tmp_eval_accuracy

            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1

        eval_loss = eval_loss / nb_eval_steps
        eval_accuracy = eval_accuracy / nb_eval_examples
        train_loss = tr_loss / nb_tr_steps if args.do_train else None
        result = {
            'eval_loss': eval_loss,
            'eval_accuracy': eval_accuracy,
            'train_loss': train_loss
        }

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
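The snippet ends inside main(); a sketch of the usual entry-point wiring (the script filename in the comment is hypothetical):

if __name__ == "__main__":
    main()
    # e.g. python run_openai_gpt.py --do_train --do_eval --output_dir ./gpt_finetuned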
Example #9
def test_gpt_embeddings():
    gpt_model: str = "openai-gpt"

    tokenizer = OpenAIGPTTokenizer.from_pretrained(gpt_model)
    model = OpenAIGPTModel.from_pretrained(
        pretrained_model_name_or_path=gpt_model, output_hidden_states=True)
    model.to(flair.device)
    model.eval()

    s: str = "Berlin and Munich have a lot of puppeteer to see ."

    with torch.no_grad():
        tokens = tokenizer.tokenize(s)

        indexed_tokens = tokenizer.convert_tokens_to_ids(tokens)
        tokens_tensor = torch.tensor([indexed_tokens])
        tokens_tensor = tokens_tensor.to(flair.device)

        hidden_states = model(tokens_tensor)[-1]

        first_layer = hidden_states[1][0]

    assert len(first_layer) == len(tokens)

    #     0             1           2            3          4         5         6        7       8       9        10        11         12
    #
    # 'berlin</w>', 'and</w>', 'munich</w>', 'have</w>', 'a</w>', 'lot</w>', 'of</w>', 'pupp', 'ete', 'er</w>', 'to</w>', 'see</w>', '.</w>'
    #     |             |           |            |          |         |         |         \      |      /          |         |          |
    #   Berlin         and        Munich        have        a        lot        of           puppeteer             to       see         .
    #
    #     0             1           2            3          4         5         6                7                  8        9          10

    def embed_sentence(
        sentence: str,
        pooling_operation,
        layers: str = "1",
        use_scalar_mix: bool = False,
    ) -> Sentence:
        embeddings = OpenAIGPTEmbeddings(
            model=gpt_model,
            layers=layers,
            pooling_operation=pooling_operation,
            use_scalar_mix=use_scalar_mix,
        )
        flair_sentence = Sentence(sentence)
        embeddings.embed(flair_sentence)

        return flair_sentence

    # First subword embedding
    sentence_first_subword = embed_sentence(sentence=s,
                                            pooling_operation="first")

    first_token_embedding_ref = first_layer[0].tolist()
    first_token_embedding_actual = sentence_first_subword.tokens[
        0].embedding.tolist()

    puppeteer_first_subword_embedding_ref = first_layer[7].tolist()
    puppeteer_first_subword_embedding_actual = sentence_first_subword.tokens[
        7].embedding.tolist()

    assert first_token_embedding_ref == first_token_embedding_actual
    assert (puppeteer_first_subword_embedding_ref ==
            puppeteer_first_subword_embedding_actual)

    # Last subword embedding
    sentence_last_subword = embed_sentence(sentence=s,
                                           pooling_operation="last")

    first_token_embedding_ref = first_layer[0].tolist()
    first_token_embedding_actual = sentence_last_subword.tokens[
        0].embedding.tolist()

    puppeteer_last_subword_embedding_ref = first_layer[9].tolist()
    puppeteer_last_subword_embedding_actual = sentence_last_subword.tokens[
        7].embedding.tolist()

    assert first_token_embedding_ref == first_token_embedding_actual
    assert (puppeteer_last_subword_embedding_ref ==
            puppeteer_last_subword_embedding_actual)

    # First and last subword embedding
    sentence_first_last_subword = embed_sentence(
        sentence=s, pooling_operation="first_last")

    first_token_embedding_ref = torch.cat([first_layer[0],
                                           first_layer[0]]).tolist()
    first_token_embedding_actual = sentence_first_last_subword.tokens[
        0].embedding.tolist()

    puppeteer_first_last_subword_embedding_ref = torch.cat(
        [first_layer[7], first_layer[9]]).tolist()
    puppeteer_first_last_subword_embedding_actual = sentence_first_last_subword.tokens[
        7].embedding.tolist()

    assert first_token_embedding_ref == first_token_embedding_actual
    assert (puppeteer_first_last_subword_embedding_ref ==
            puppeteer_first_last_subword_embedding_actual)

    # Mean of all subword embeddings
    sentence_mean_subword = embed_sentence(sentence=s,
                                           pooling_operation="mean")

    first_token_embedding_ref = calculate_mean_embedding([first_layer[0]
                                                          ]).tolist()
    first_token_embedding_actual = sentence_mean_subword.tokens[
        0].embedding.tolist()

    puppeteer_mean_subword_embedding_ref = calculate_mean_embedding(
        [first_layer[7], first_layer[8], first_layer[9]]).tolist()
    puppeteer_mean_subword_embedding_actual = sentence_mean_subword.tokens[
        7].embedding.tolist()

    assert first_token_embedding_ref == first_token_embedding_actual
    assert (puppeteer_mean_subword_embedding_ref ==
            puppeteer_mean_subword_embedding_actual)

    # Check embedding dimension when using multiple layers
    sentence_mult_layers = embed_sentence(sentence="Munich",
                                          pooling_operation="first",
                                          layers="1,2,3,4")

    ref_embedding_size = 4 * 768
    actual_embedding_size = len(sentence_mult_layers.tokens[0].embedding)

    assert ref_embedding_size == actual_embedding_size

    # Check embedding dimension when using multiple layers and scalar mix
    sentence_mult_layers_scalar_mix = embed_sentence(
        sentence="Berlin",
        pooling_operation="first",
        layers="1,2,3,4",
        use_scalar_mix=True,
    )

    ref_embedding_size = 1 * 768
    actual_embedding_size = len(
        sentence_mult_layers_scalar_mix.tokens[0].embedding)

    assert ref_embedding_size == actual_embedding_size
Example #10
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_name",
                        type=str,
                        default='gpt2-medium',
                        choices=["openai-gpt", "gpt2", "gpt2-medium"],
                        help='pretrained model name')
    parser.add_argument(
        "--model_dir",
        type=str,
        help="path to model's local checkpoint",
        default=
        "/home/ouardinik/PycharmProjects/hyperlex/text_generation_GPT/logs/openai-gpt_2019-08-02_10:14:02.631911"
    )
    parser.add_argument("--bin_filename",
                        type=str,
                        help="checkpoint's filename",
                        default="final_pytorch_model.bin")
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument("--length", type=int, default=-1)
    parser.add_argument('--unconditional',
                        action='store_true',
                        help='If true, unconditional generation.')
    parser.add_argument("--nsamples", type=int, default=1)
    parser.add_argument("--batch_size", type=int, default=-1)
    parser.add_argument("--temperature", type=float, default=1.0)
    parser.add_argument("--top_k", type=int, default=0)
    parser.add_argument("--run_parallel",
                        action='store_true',
                        help='whether to run on GPUs')

    args = parser.parse_args()

    assert os.path.exists(args.model_dir), "input model's path"

    if args.batch_size == -1:
        args.batch_size = 1
    assert args.nsamples % args.batch_size == 0

    # Load Transformer
    logger.info("Load a trained model and vocabulary that you have fine-tuned")
    # BPE tokenizer and model
    if args.model_name == "openai-gpt":
        tokenizer = OpenAIGPTTokenizer.from_pretrained(args.model_name)
        model = OpenAIGPTLMHeadModel.from_pretrained(args.model_name)
    elif args.model_name == "gpt2" or args.model_name == "gpt2-medium":
        tokenizer = GPT2Tokenizer.from_pretrained(args.model_name)
        model = GPT2LMHeadModel.from_pretrained(args.model_name)

    # Device
    device = torch.device("cuda" if (
        torch.cuda.is_available() and args.run_parallel) else "cpu")
    n_gpu = torch.cuda.device_count()
    # Load top layer
    top_layer_path = os.path.join(args.model_dir, args.bin_filename)
    if device.type == "cpu":
        top_layer = torch.load(top_layer_path, map_location="cpu")
    else:
        top_layer = torch.load(top_layer_path)

    model.to(device), top_layer.to(device)
    model.eval(), top_layer.eval()

    if args.length == -1:
        args.length = model.config.n_ctx // 2
    elif args.length > model.config.n_ctx:
        raise ValueError("Can't get samples longer than window size: %s" %
                         model.config.n_ctx)

    while True:
        context_tokens = []
        if not args.unconditional:
            raw_text = input("Model prompt >>> ")
            while not raw_text:
                print('Prompt should not be empty!')
                raw_text = input("Model prompt >>> ")
            context_tokens = tokenizer.encode(raw_text)
            generated = 0

            for _ in range(args.nsamples // args.batch_size):
                out = sample_sequence(top_layer=top_layer,
                                      model=model,
                                      length=args.length,
                                      context=context_tokens,
                                      start_token=None,
                                      batch_size=args.batch_size,
                                      temperature=args.temperature,
                                      top_k=args.top_k,
                                      device=device)
                out = out[:, len(context_tokens):].tolist()
                for i in range(args.batch_size):
                    generated += 1
                    text = tokenizer.decode(out[i])
                    print("=" * 40 + " SAMPLE " + str(generated) + " " +
                          "=" * 40)
                    print(text)
            print("=" * 80)
        else:
            generated = 0
            for _ in range(args.nsamples // args.batch_size):
                out = sample_sequence(
                    top_layer=top_layer,
                    model=model,
                    length=args.length,
                    context=None,
                    start_token=tokenizer.encoder['<|endoftext|>'],
                    batch_size=args.batch_size,
                    temperature=args.temperature,
                    top_k=args.top_k,
                    device=device)
                out = out[:, 1:].tolist()
                for i in range(args.batch_size):
                    generated += 1
                    text = tokenizer.decode(out[i])
                    print("=" * 40 + " SAMPLE " + str(generated) + " " +
                          "=" * 40)
                    print(text)
            print("=" * 80)
Example #11
# This file implements GPT as a service, based on a pretrained model from
# https://github.com/huggingface/pytorch-pretrained-BERT

import torch
from pytorch_transformers import OpenAIGPTTokenizer, OpenAIGPTModel, OpenAIGPTLMHeadModel

# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
import logging
#logging.basicConfig(level=logging.INFO)

# Load pre-trained model tokenizer (vocabulary)
tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')

# Tokenized input
text = "Who was Jim Henson ? Jim Henson was a puppeteer"
tokenized_text = tokenizer.tokenize(text)

# Convert token to vocabulary indices
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

# Convert inputs to PyTorch tensors
tokens_tensor = torch.tensor([indexed_tokens])

print(tokens_tensor.size())
# Load pre-trained model (weights)
model = OpenAIGPTModel.from_pretrained('openai-gpt')
model.eval()

cuda = torch.device('cuda:1')

# If you have a GPU, put everything on cuda
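# A sketch of the usual continuation (an assumption, mirroring the
# pytorch-pretrained-BERT quickstart; not part of the original snippet):
tokens_tensor = tokens_tensor.to(cuda)
model.to(cuda)

# Predict hidden states for each token
with torch.no_grad():
    hidden_states = model(tokens_tensor)[0]
print(hidden_states.size())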
Example #12
    return tensor_datasets


def tokenize_and_encode(obj):
    """ Tokenize and encode a nested object """
    if isinstance(obj, str):
        return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
    elif isinstance(obj, int):
        return obj
    return list(tokenize_and_encode(o) for o in obj)


# Load tokenizer and model
# These loading functions also add new tokens and embeddings called `special tokens`
special_tokens = ['_start_', '_delimiter_', '_classify_']
tokenizer = OpenAIGPTTokenizer.from_pretrained(args.model_name,
                                               special_tokens=special_tokens)
special_tokens_ids = list(
    tokenizer.convert_tokens_to_ids(token) for token in special_tokens)
model = OpenAIGPTDoubleHeadsModel.from_pretrained(
    args.model_name, num_special_tokens=len(special_tokens))
model.to(device)

# Load Snli dataset with pkl
path_train = args.data_path + "snli_1.0_train.txt"
path_test = args.data_path + "snli_1.0_test.txt"
path_dev = args.data_path + "snli_1.0_dev.txt"
pkl_path = 'data_in/snli.pkl'

if os.path.exists(pkl_path):
    with open(pkl_path, 'rb') as f:
        encoded_datasets = pickle.load(f)
Example #13
def load_pick(file_nm):
    with open(file_nm, 'rb') as f:
        label, df = pickle.load(f)
        print("load completed")
        return label, df

dev_label, dev_df = load_pick('data_in/dev.pkl')
test_label, test_df = load_pick('data_in/test.pkl')
train_label, train_df = load_pick('data_in/train.pkl')

# Load tokenizer and model
# These loading functions also add new tokens and embeddings called `special tokens`
# These new embeddings will be fine-tuned on the RocStories dataset
special_tokens = ['_start_', '_delimiter_', '_classify_']
tokenizer = OpenAIGPTTokenizer.from_pretrained(args.model_name, special_tokens=special_tokens)
special_tokens_ids = list(tokenizer.convert_tokens_to_ids(token) for token in special_tokens)
model = OpenAIGPTDoubleHeadsModel.from_pretrained(args.model_name, num_special_tokens=len(special_tokens))


special_tokens = ['<bos>', '<del>', '<eos>', '<pad>']
tokenizer = OpenAIGPTTokenizer.from_pretrained(args.model_name, special_tokens=special_tokens)  # load the OpenAI GPT tokenizer
tokenizer.add_tokens(special_tokens)

config = OpenAIGPTConfig.from_pretrained('openai-gpt')
config.num_labels = 3
config.vocab_size = len(tokenizer)
config.summary_type = 'last'

tokenizer.bos_token = '<bos>'
tokenizer.eos_token = '<eos>'
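A hedged sketch of packing a premise/hypothesis pair with the tokens added above (the sentences are made up; '<del>' separates the two segments):

premise, hypothesis = "a man is playing a guitar", "a man is making music"
tokens = (['<bos>'] + tokenizer.tokenize(premise)
          + ['<del>'] + tokenizer.tokenize(hypothesis) + ['<eos>'])
input_ids = tokenizer.convert_tokens_to_ids(tokens)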
Example #14
def test_gpt_embeddings():
    gpt_model = 'openai-gpt'
    tokenizer = OpenAIGPTTokenizer.from_pretrained(gpt_model)
    model = OpenAIGPTModel.from_pretrained(
        pretrained_model_name_or_path=gpt_model, output_hidden_states=True)
    model.to(flair.device)
    model.eval()
    s = 'Berlin and Munich have a lot of puppeteer to see .'
    with torch.no_grad():
        tokens = tokenizer.tokenize(s)
        indexed_tokens = tokenizer.convert_tokens_to_ids(tokens)
        tokens_tensor = torch.tensor([indexed_tokens])
        tokens_tensor = tokens_tensor.to(flair.device)
        hidden_states = model(tokens_tensor)[-1]
        first_layer = hidden_states[1][0]
    assert (len(first_layer) == len(tokens))

    def embed_sentence(sentence: str,
                       pooling_operation,
                       layers: str = '1',
                       use_scalar_mix: bool = False) -> Sentence:
        embeddings = OpenAIGPTEmbeddings(
            pretrained_model_name_or_path=gpt_model,
            layers=layers,
            pooling_operation=pooling_operation,
            use_scalar_mix=use_scalar_mix)
        flair_sentence = Sentence(sentence)
        embeddings.embed(flair_sentence)
        return flair_sentence

    sentence_first_subword = embed_sentence(sentence=s,
                                            pooling_operation='first')
    first_token_embedding_ref = first_layer[0].tolist()
    first_token_embedding_actual = sentence_first_subword.tokens[
        0].embedding.tolist()
    puppeteer_first_subword_embedding_ref = first_layer[7].tolist()
    puppeteer_first_subword_embedding_actual = sentence_first_subword.tokens[
        7].embedding.tolist()
    assert (first_token_embedding_ref == first_token_embedding_actual)
    assert (puppeteer_first_subword_embedding_ref ==
            puppeteer_first_subword_embedding_actual)
    sentence_last_subword = embed_sentence(sentence=s,
                                           pooling_operation='last')
    first_token_embedding_ref = first_layer[0].tolist()
    first_token_embedding_actual = sentence_last_subword.tokens[
        0].embedding.tolist()
    puppeteer_last_subword_embedding_ref = first_layer[9].tolist()
    puppeteer_last_subword_embedding_actual = sentence_last_subword.tokens[
        7].embedding.tolist()
    assert (first_token_embedding_ref == first_token_embedding_actual)
    assert (puppeteer_last_subword_embedding_ref ==
            puppeteer_last_subword_embedding_actual)
    sentence_first_last_subword = embed_sentence(
        sentence=s, pooling_operation='first_last')
    first_token_embedding_ref = torch.cat([first_layer[0],
                                           first_layer[0]]).tolist()
    first_token_embedding_actual = sentence_first_last_subword.tokens[
        0].embedding.tolist()
    puppeteer_first_last_subword_embedding_ref = torch.cat(
        [first_layer[7], first_layer[9]]).tolist()
    puppeteer_first_last_subword_embedding_actual = sentence_first_last_subword.tokens[
        7].embedding.tolist()
    assert (first_token_embedding_ref == first_token_embedding_actual)
    assert (puppeteer_first_last_subword_embedding_ref ==
            puppeteer_first_last_subword_embedding_actual)
    sentence_mean_subword = embed_sentence(sentence=s,
                                           pooling_operation='mean')
    first_token_embedding_ref = calculate_mean_embedding([first_layer[0]
                                                          ]).tolist()
    first_token_embedding_actual = sentence_mean_subword.tokens[
        0].embedding.tolist()
    puppeteer_mean_subword_embedding_ref = calculate_mean_embedding(
        [first_layer[7], first_layer[8], first_layer[9]]).tolist()
    puppeteer_mean_subword_embedding_actual = sentence_mean_subword.tokens[
        7].embedding.tolist()
    assert (first_token_embedding_ref == first_token_embedding_actual)
    assert (puppeteer_mean_subword_embedding_ref ==
            puppeteer_mean_subword_embedding_actual)
    sentence_mult_layers = embed_sentence(sentence='Munich',
                                          pooling_operation='first',
                                          layers='1,2,3,4')
    ref_embedding_size = (4 * 768)
    actual_embedding_size = len(sentence_mult_layers.tokens[0].embedding)
    assert (ref_embedding_size == actual_embedding_size)
    sentence_mult_layers_scalar_mix = embed_sentence(sentence='Berlin',
                                                     pooling_operation='first',
                                                     layers='1,2,3,4',
                                                     use_scalar_mix=True)
    ref_embedding_size = (1 * 768)
    actual_embedding_size = len(
        sentence_mult_layers_scalar_mix.tokens[0].embedding)
    assert (ref_embedding_size == actual_embedding_size)