Example #1
    def test_full_tokenizer(self):
        """ Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt """
        vocab = [
            "l", "o", "w", "e", "r", "s", "t", "i", "d", "n", "w</w>", "r</w>",
            "t</w>", "lo", "low", "er</w>", "low</w>", "lowest</w>",
            "newer</w>", "wider</w>"
        ]
        vocab_tokens = dict(zip(vocab, range(len(vocab))))
        merges = ["#version: 0.2", "l o", "lo w", "e r</w>", ""]
        with open("/tmp/openai_tokenizer_vocab_test.json", "w") as fp:
            json.dump(vocab_tokens, fp)
            vocab_file = fp.name
        with open("/tmp/openai_tokenizer_merges_test.txt", "w") as fp:
            fp.write("\n".join(merges))
            merges_file = fp.name

        tokenizer = OpenAIGPTTokenizer(vocab_file,
                                       merges_file,
                                       special_tokens=["<unk>"])
        os.remove(vocab_file)
        os.remove(merges_file)

        text = "lower"
        bpe_tokens = ["low", "er</w>"]
        tokens = tokenizer.tokenize(text)
        self.assertListEqual(tokens, bpe_tokens)

        input_tokens = tokens + ["<unk>"]
        input_bpe_tokens = [14, 15, 20]
        self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens),
                             input_bpe_tokens)
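
# A toy sketch (not the library's implementation) of how the merge rules above turn
# "lower" into ["low", "er</w>"]: append "</w>" to the last character, then greedily
# apply the highest-priority merge among adjacent symbol pairs until none remains.
def toy_bpe(word, merges):
    # rank each "a b" merge rule by its position in the merges list (earlier = higher priority)
    ranks = {tuple(m.split()): i for i, m in enumerate(merges)
             if m and not m.startswith("#")}
    symbols = list(word[:-1]) + [word[-1] + "</w>"]
    while len(symbols) > 1:
        rank, i = min((ranks.get(pair, float("inf")), idx)
                      for idx, pair in enumerate(zip(symbols, symbols[1:])))
        if rank == float("inf"):
            break  # no applicable merge rule left
        symbols[i:i + 2] = [symbols[i] + symbols[i + 1]]
    return symbols

# toy_bpe("lower", ["#version: 0.2", "l o", "lo w", "e r</w>", ""]) == ['low', 'er</w>']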
def openAIGPTTokenizer(*args, **kwargs):
    """
    Instantiate a BPE tokenizer for OpenAI GPT from a pre-trained/customized vocab file.
	Peculiarities:
        - lower case all inputs
        - uses SpaCy tokenizer ('en' model) and ftfy for pre-BPE tokenization if they are installed, fallback to BERT's BasicTokenizer if not.
        - argument special_tokens and function set_special_tokens:
            can be used to add additional symbols (ex: "__classify__") to a vocabulary.

    Args:
    pretrained_model_name_or_path: Path to pretrained model archive
                                   or one of pre-trained vocab configs below.
                                       * openai-gpt
    Keyword args:
	special_tokens: Special tokens in vocabulary that are not pretrained ([SEP], [CLS]...)
					Default: None
	max_len: An artificial maximum length to truncate tokenized sequences to;
        	 Effective maximum length is always the minimum of this
             value (if specified) and the underlying BERT model's
             sequence length.
			 Default: None

    Example:
		>>> import torch
        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'openAIGPTTokenizer', 'openai-gpt')
		
		>>> text = "Who was Jim Henson ? Jim Henson was a puppeteer"
        >>> tokenized_text = tokenizer.tokenize(text)
        >>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
        [763, 509, 4265, 2298, 945, 257, 4265, 2298, 945, 509, 246, 10148, 39041, 483]
    """
    tokenizer = OpenAIGPTTokenizer.from_pretrained(*args, **kwargs)
    return tokenizer
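
# A minimal usage sketch (not from the original snippet) for the `special_tokens` behaviour
# described in the docstring above; the printed id depends on the pretrained vocabulary size,
# so it is illustrative only.
tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt',
                                               special_tokens=['__classify__'])
print(tokenizer.convert_tokens_to_ids(['__classify__']))  # special tokens are appended after the base BPE vocab
# The same symbols can also be registered after loading with tokenizer.set_special_tokens([...]).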
    def test_tokenizer_from_pretrained(self):
        cache_dir = "/tmp/pytorch_pretrained_bert_test/"
        for model_name in list(PRETRAINED_VOCAB_ARCHIVE_MAP.keys())[:1]:
            tokenizer = OpenAIGPTTokenizer.from_pretrained(model_name,
                                                           cache_dir=cache_dir)
            shutil.rmtree(cache_dir)
            self.assertIsNotNone(tokenizer)
import argparse, json, logging, os, random
import numpy as np
import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from tqdm import tqdm, trange
from pytorch_pretrained_bert.modeling_openai import OpenAIGPTLMHeadModel, OpenAIGPTConfig
from pytorch_pretrained_bert.tokenization_openai import OpenAIGPTTokenizer
from pytorch_pretrained_bert.optimization_openai import OpenAIAdam
# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)  # used by main() below

# Load pre-trained model tokenizer (vocabulary)
tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')

with open('/scratch/cluster/agupta/recipes_elmo.json', 'r') as f:
    recipes_data = json.load(f)

train_data = []
val_data = []
test_data = []

for data in recipes_data:
    recipes_data[data]['para'] = []
    recipes_data[data]['targets'] = np.zeros(
        (len(recipes_data[data]['text']),
         len(recipes_data[data]['ingredient_list'])))

    for step_num in range(len(recipes_data[data]['text'])):
        recipes_data[data]['para'] += recipes_data[data]['text'][str(step_num)]

    for step_num in recipes_data[data]['ingredients']:
        for ing in recipes_data[data]['ingredients'][step_num]:
            recipes_data[data]['targets'][int(step_num)][ing] = 1
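
# A toy illustration (assumed schema, inferred from the loop above) of what this
# preprocessing produces for one recipe with two steps and three ingredients:
#   recipes_data[key]['text']            == {'0': ['mix', 'flour', 'and', 'water'], '1': ['bake']}
#   recipes_data[key]['ingredient_list'] == ['flour', 'water', 'yeast']
#   recipes_data[key]['ingredients']     == {'0': [0, 1], '1': []}
# would yield
#   recipes_data[key]['para']              == ['mix', 'flour', 'and', 'water', 'bake']
#   recipes_data[key]['targets'] (array)   == [[1., 1., 0.],   # step 0 uses flour and water
#                                              [0., 0., 0.]]   # step 1 uses no listed ingredient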
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name',
                        type=str,
                        default='openai-gpt',
                        help='pretrained model name')
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--output_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")
    parser.add_argument('--train_dataset',
                        type=str,
                        default='./train_recipes.json')
    parser.add_argument('--eval_dataset',
                        type=str,
                        default='./val_recipes.json')
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--num_train_epochs', type=int, default=10)
    parser.add_argument('--train_batch_size', type=int, default=2)
    parser.add_argument('--eval_batch_size', type=int, default=2)
    parser.add_argument('--max_grad_norm', type=int, default=1)
    parser.add_argument('--learning_rate', type=float, default=6.25e-6)
    parser.add_argument('--warmup_proportion', type=float, default=0.1)
    parser.add_argument('--lr_schedule', type=str, default='warmup_cosine')
    parser.add_argument('--weight_decay', type=float, default=0.01)
    parser.add_argument('--lm_coef', type=float, default=0.9)
    parser.add_argument('--n_valid', type=int, default=374)

    parser.add_argument('--server_ip',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    parser.add_argument('--server_port',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    args = parser.parse_args()
    print(args)

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    logger.info("device: {}, n_gpu {}".format(device, n_gpu))

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # Load tokenizer and model
    # These loading functions also add new tokens and embeddings, called `special tokens`
    # The new embeddings will be fine-tuned on the recipes dataset
    special_tokens = ['_start_', '_delimiter_', '_classify_']
    tokenizer = OpenAIGPTTokenizer.from_pretrained(
        args.model_name, special_tokens=special_tokens)
    special_tokens_ids = list(
        tokenizer.convert_tokens_to_ids(token) for token in special_tokens)
    config = OpenAIGPTConfig()
    # NOTE: the pretrained load below is commented out, so the model starts from a
    # randomly initialized OpenAIGPTLMHeadModel built from the default config
    #model = OpenAIGPTLMHeadModel.from_pretrained(args.model_name, num_special_tokens=len(special_tokens))
    model = OpenAIGPTLMHeadModel(config)
    model.set_num_special_tokens(len(special_tokens))
    model.to(device)

    # Load and encode the datasets
    '''
    if not args.train_dataset and not args.eval_dataset:
        roc_stories = cached_path(ROCSTORIES_URL)
    '''
    def tokenize_and_encode(obj):
        """ Tokenize and encode a nested object """
        if isinstance(obj, str):
            return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
        elif isinstance(obj, int):
            return obj
        return list(tokenize_and_encode(o) for o in obj)
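
    # e.g. tokenize_and_encode(("lower", 3)) -> [[<BPE ids for "lower">], 3]:
    # strings become lists of token ids, ints pass through unchanged, and
    # tuples/lists are mapped recursively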

    logger.info("Encoding dataset...")
    train_dataset = load_recipes_dataset(args.train_dataset)  # helper assumed to be defined elsewhere in this script

    # over-length training instances are removed below, after encoding

    print(train_dataset[0])
    eval_dataset = load_recipes_dataset(args.eval_dataset)
    print(len(eval_dataset))
    datasets = (train_dataset, eval_dataset)
    encoded_datasets = tokenize_and_encode(datasets)

    selected_train_data = []
    print(len(encoded_datasets[0]))
    for ins in encoded_datasets[0]:
        if len(ins) <= 510:
            selected_train_data.append(ins)

    encoded_datasets[0] = selected_train_data

    print(len(encoded_datasets[0]))

    # Compute the max input length for the Transformer
    max_length = model.config.n_positions - 2
    print(max_length)
    print(encoded_datasets[0][0])
    input_length = max(
        len(story[:max_length]) + 2 for dataset in encoded_datasets
        for story in dataset)
    input_length = min(input_length, model.config.n_positions
                       )  # Max size of input for the pre-trained model
    print(input_length)
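    # Illustrative note: each encoded recipe is truncated to max_length = n_positions - 2
    # (510 for GPT's 512 positions) and 2 is added back, presumably to leave room for the
    # special tokens that pre_process_datasets (a helper defined elsewhere) wraps around
    # each sequence, so input_length can never exceed model.config.n_positions.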
    # Prepare inputs tensors and dataloaders
    tensor_datasets = pre_process_datasets(encoded_datasets, input_length,
                                           max_length, *special_tokens_ids)
    train_tensor_dataset = tensor_datasets[0]
    eval_tensor_dataset = tensor_datasets[1]

    train_data = TensorDataset(*train_tensor_dataset)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    eval_data = TensorDataset(*eval_tensor_dataset)
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())

    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0},
    ]
    num_train_optimization_steps = len(
        train_data) * args.num_train_epochs // args.train_batch_size
    optimizer = OpenAIAdam(optimizer_grouped_parameters,
                           lr=args.learning_rate,
                           warmup=args.warmup_proportion,
                           max_grad_norm=args.max_grad_norm,
                           weight_decay=args.weight_decay,
                           t_total=num_train_optimization_steps)
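    # Note: OpenAIAdam treats `warmup` as a fraction of `t_total`, so warmup_proportion=0.1
    # ramps the learning rate up over the first 10% of optimization steps; args.lr_schedule
    # is parsed above but not passed here, so the optimizer's default schedule applies.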

    print(.002 * num_train_optimization_steps)

    total_loss = 0
    total_length = 0

    print(model.transformer.h)
    '''
    if args.do_train:
        nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
        model.eval()
        
        tr_loss = 0
        nb_tr_steps = 0
        tqdm_bar = tqdm(train_dataloader, desc="Pre LM training train data ppl")
        for step, batch in enumerate(tqdm_bar):
            #print(batch)
            batch = tuple(t.to(device) for t in batch)
            input_ids, mc_token_ids, lm_labels = batch
            loss = model(input_ids, lm_labels = lm_labels)
            lengths = mc_token_ids.to('cpu').numpy()
            #print(np.sum(lengths))
            total_loss+=loss.item()*np.sum(lengths)
            total_length+=np.sum(lengths)

    print(total_loss/total_length)

    total_loss = 0
    total_length = 0
    if args.do_train:
        nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
        model.eval()
    
        tr_loss = 0
        nb_tr_steps = 0
        tqdm_bar = tqdm(eval_dataloader, desc="Pre LM training val data ppl")
        for step, batch in enumerate(tqdm_bar):
            #print(batch)
            batch = tuple(t.to(device) for t in batch)
            input_ids, mc_token_ids, lm_labels = batch
            loss = model(input_ids, lm_labels = lm_labels)
            lengths = mc_token_ids.to('cpu').numpy()
            #print(np.sum(lengths))
            total_loss+=loss.item()*np.sum(lengths)
            total_length+=np.sum(lengths)

    print(total_loss/total_length)
    '''
    if args.do_train:
        print("=" * 80 + '\n')
        nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_steps = 0
            tqdm_bar = tqdm(train_dataloader, desc="Training")
            for step, batch in enumerate(tqdm_bar):
                #print(batch)
                batch = tuple(t.to(device) for t in batch)
                input_ids, mc_token_ids, lm_labels = batch
                loss = model(input_ids, lm_labels=lm_labels)

                loss.backward()
                optimizer.step()
                optimizer.zero_grad()  # clear gradients; otherwise they accumulate across steps
                tr_loss += loss.item()
                # exponential moving average of the loss for the progress bar
                exp_average_loss = (loss.item() if exp_average_loss is None
                                    else 0.7 * exp_average_loss + 0.3 * loss.item())
                nb_tr_steps += 1
                tqdm_bar.desc = "Training loss: {:.2e} lr: {:.2e}".format(
                    exp_average_loss,
                    optimizer.get_lr()[0])

            total_loss = 0
            total_length = 0
            if args.do_train:
                nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
                model.eval()

                tr_loss = 0
                nb_tr_steps = 0
                tqdm_bar = tqdm(train_dataloader,
                                desc="Post LM training train data ppl")
                for step, batch in enumerate(tqdm_bar):
                    #print(batch)
                    batch = tuple(t.to(device) for t in batch)
                    input_ids, mc_token_ids, lm_labels = batch
                    loss = model(input_ids, lm_labels=lm_labels)
                    lengths = mc_token_ids.to('cpu').numpy()
                    #print(np.sum(lengths))
                    total_loss += loss.item() * np.sum(lengths)
                    total_length += np.sum(lengths)

            print(total_loss / total_length)

            total_loss = 0
            total_length = 0
            if args.do_train:
                nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
                model.eval()

                tr_loss = 0
                nb_tr_steps = 0
                tqdm_bar = tqdm(eval_dataloader,
                                desc="Post LM training val data ppl")
                for step, batch in enumerate(tqdm_bar):
                    #print(batch)
                    batch = tuple(t.to(device) for t in batch)
                    input_ids, mc_token_ids, lm_labels = batch
                    loss = model(input_ids, lm_labels=lm_labels)
                    lengths = mc_token_ids.to('cpu').numpy()
                    #print(np.sum(lengths))
                    total_loss += loss.item() * np.sum(lengths)
                    total_length += np.sum(lengths)

            print(total_loss / total_length)

            model.train()  # restore training mode for the next epoch (the ppl passes above set eval mode)
            print("=" * 80 + '\n')
    # Save a trained model
    '''