Example #1
 def __init__(self):
     self.lm_model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt')
     self.lm_model.eval()
     self.cuda = torch.cuda.is_available()
     if self.cuda:
         self.lm_model = self.lm_model.cuda()
     self.tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
Example #2
def main():
  # 3 examples
  train_dataset = 'small brown fox jumps over the lazy dog\n' \
                  'small brown fox jumps over the lazy dog\n' \
                  'small brown fox jumps over the lazy dog\n'
  tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt',
                                                 special_tokens=[])
  tokenized = [tokenizer.tokenize(t) for t in train_dataset.strip().split('\n')]

  encoded = [tokenizer.convert_tokens_to_ids(t) for t in tokenized]  # 3x8
  dataset = TensorDataset(torch.tensor(encoded))
  sampler = SequentialSampler(dataset)
  dataloader = DataLoader(dataset, sampler=sampler, batch_size=1)
  model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt')

  optimizer = torch.optim.SGD(model.parameters(), lr=0.0001, momentum=0.9)

  batch = next(iter(dataloader))
  batch = batch[0]   # TensorDataset yields a one-element tuple of tensors, so unpack it
 
  for i in range(20):
    loss = model(input_ids=batch, lm_labels=batch)
    print(loss.detach().numpy())
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
Example #3
def perplexity_filtering(sentences_df, threshold=1000, sentence_col="sentence"):
    """
    Function used to filter sentences by perplexity

    ---

    **Arguments**\n
    `sentences_df` (DataFrame): DataFrame with sentences and which contains *sentence* column.\n
    `threshold` (int): Perplexity threshold used for filtering. Default value = 1000.\n
    `sentence_col` (String): Name of the sentence column in data frame. Default value = "sentence".

    ---

    **Returns**\n
    `sentences_df` (DataFrame): DataFrame filtered by perplexity.
    """

    # Load pre-trained model (weights)
    model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt')
    model.eval()
    # Load pre-trained model tokenizer (vocabulary)
    tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')

    def score(sentence):
        tokenize_input = tokenizer.tokenize(sentence)
        tensor_input = torch.tensor([tokenizer.convert_tokens_to_ids(tokenize_input)])
        loss = model(tensor_input, lm_labels=tensor_input)
        return math.exp(loss.item())

    original_columns = list(sentences_df)
    sentences_df['perplexity'] = sentences_df[sentence_col].apply(lambda x: score(x) if len(re.sub('[^0-9a-zA-Z ]', '', x)) > 0 else -1.0)
    return sentences_df[(sentences_df['perplexity'] <= threshold) & (sentences_df['perplexity'] != -1.0)][original_columns]
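
A minimal usage sketch for the filter above, assuming pandas plus the imports the function relies on (torch, math, re and the pytorch_pretrained_bert classes) are already in scope; the data frame here is made up for illustration:

import pandas as pd

df = pd.DataFrame({"sentence": ["the quick brown fox jumps over the lazy dog",
                                "colorless green ideas sleep furiously"]})
filtered = perplexity_filtering(df, threshold=1000)
print(filtered)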
Example #4
def main():
    global tokenizer, model

    train_dataset = 'the quick brown fox jumps over the lazy dog'
    tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
    tokenized = [tokenizer.tokenize(train_dataset)]

    # [[481, 2279, 2507, 8573, 11670, 715, 481, 8447, 2585]]
    encoded = [tokenizer.convert_tokens_to_ids(t) for t in tokenized]
    model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt')

    optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

    batch = torch.tensor(encoded)

    start_words = ['the']
    start_tokens = [tokenizer.convert_tokens_to_ids(w) for w in start_words]

    for i in range(20):
        loss = model(input_ids=batch, lm_labels=batch)
        perplexity = math.exp(loss.item())
        print('%5.2f -- %s' % (perplexity, decode(start_tokens)))

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
Example #5
def tokenize_and_encode_single_part(dataset):
    special_tokens = ['<BOA>', '<EOA>']
    tokenizer = OpenAIGPTTokenizer.from_pretrained(
        model_name, special_tokens=special_tokens)
    for i in range(len(dataset)):
        dataset[i] = tokenizer.convert_tokens_to_ids(
            tokenizer.tokenize(dataset[i]))
    return dataset
Example #6
 def __init__(self, host=HOST, port=9200, timeout=30, index=INDEX):
     self.model = GPTModel()
     self.tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
     self.es = Elasticsearch(host=host,
                             port=port,
                             timeout=timeout,
                             index=index)
     self.index = index
Example #7
 def construct_encoder(self):
     model = OpenAIGPTModel.from_pretrained(self.model_name)
     model.cuda()
     model = torch.nn.DataParallel(model)
     model.eval()
     tokenizer = OpenAIGPTTokenizer.from_pretrained(self.model_name)
     print("Model and tokenzier are constructed!")
     return model, tokenizer
Example #8
def get_tokenizer(tokenizer_name):
    if tokenizer_name == 'GPT-2':
        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    elif tokenizer_name == 'GPT':
        tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
    else:
        raise NotImplementedError(f'{tokenizer_name} -- No such tokenizer')

    return tokenizer
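
A short usage sketch for the helper above; the first call for each name downloads the corresponding vocabulary files:

gpt_tokenizer = get_tokenizer('GPT')
gpt2_tokenizer = get_tokenizer('GPT-2')
print(gpt_tokenizer.tokenize("Jim Henson was a puppeteer"))
print(gpt2_tokenizer.tokenize("Jim Henson was a puppeteer"))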
Example #9
def tokenize_and_encode(dataset):
    special_tokens = ['<BOA>', '<SEP>', '<EOA>']
    tokenizer = OpenAIGPTTokenizer.from_pretrained(model_name, special_tokens=special_tokens)
    for i in range(len(dataset)):
        dataset[i] = [tokenizer.convert_tokens_to_ids(tokenizer.tokenize(dataset[i][0])),
                      tokenizer.convert_tokens_to_ids(tokenizer.tokenize(dataset[i][1])),
                      tokenizer.convert_tokens_to_ids(tokenizer.tokenize(dataset[i][2]))]

    return dataset
Example #10
    def __init__(self, perplexity_threshold=137):
        ### Lang Model:
        # Load Language Model
        # Load pre-trained model (weights)
        self.model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt')
        self.model.eval()
        # Load pre-trained model tokenizer (vocabulary)
        self.tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')

        ### For clarity error:
        self.perplexity_threshold = perplexity_threshold
Example #11
    def _load_model(self):
        """ Helper function for loading model and tokenizer in one shot
        and assigning as class attributes

        """
        # Load tokenizer and model within `main` function
        ckpt = download_pretrained_model()
        print("Model location:", ckpt)

        self.tokenizer = OpenAIGPTTokenizer.from_pretrained(ckpt)
        self.model = OpenAIGPTLMHeadModel.from_pretrained(ckpt)
        print("Tokenizer and model loaded...")
Example #12
def sent_feat(text, feat_type):

    if feat_type == 'w2v':
        import gensim
        import numpy as np
        model = gensim.models.KeyedVectors.load_word2vec_format(
            '/scratch/shared/slow/yangl/w2v/GoogleNews-vectors-negative300.bin',
            binary=True)
        final_feats = []
        for word in (text.split(' ')):
            if (word != 'a') and (word in model.vocab):
                final_feats.append(model.get_vector(word))

        final_feats = np.asarray(final_feats)

    elif feat_type == 'openai':
        import json
        import torch
        from pytorch_pretrained_bert import OpenAIGPTTokenizer, OpenAIGPTModel, OpenAIGPTLMHeadModel
        import logging

        logging.basicConfig(level=logging.INFO)

        # Load pre-trained model tokenizer (vocabulary)
        tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')

        # Tokenized input
        #text = "Who was Jim Henson ? Jim Henson was a puppeteer"
        model = OpenAIGPTModel.from_pretrained('openai-gpt')
        model.eval()
        model.to('cuda')

        tokenized_text = tokenizer.tokenize(text)

        # Convert token to vocabulary indices
        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

        # Convert inputs to PyTorch tensors
        tokens_tensor = torch.tensor([indexed_tokens])

        # If you have a GPU, put everything on cuda
        tokens_tensor = tokens_tensor.to('cuda')

        # Predict hidden states features for each layer
        with torch.no_grad():
            hidden_states = model(tokens_tensor)
            final_feats = hidden_states[0].cpu().numpy()

    else:
        raise ValueError('Unrecognised FEAT_TYPE.')

    return final_feats
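
A minimal usage sketch; note that the 'openai' branch above moves the model and tensors to CUDA, so a GPU is assumed, and the 'w2v' branch assumes the hard-coded word2vec binary exists:

feats = sent_feat("a man is playing a guitar", feat_type='openai')
print(feats.shape)  # (number of BPE tokens, 768) hidden states from OpenAI GPT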
Example #13
def run():
    parser = ArgumentParser()
    parser.add_argument("--dataset_path", type=str, default="", help="Path or url of the dataset. If empty download from S3.")
    parser.add_argument("--dataset_cache", type=str, default='./dataset_cache', help="Path or url of the dataset cache")
    parser.add_argument("--model_checkpoint", type=str, default="", help="Path, url or short name of the model")
    parser.add_argument("--max_history", type=int, default=2, help="Number of previous utterances to keep in history")
    parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu", help="Device (cuda or cpu)")

    parser.add_argument("--no_sample", action='store_true', help="Set to use greedy decoding instead of sampling")
    parser.add_argument("--max_length", type=int, default=20, help="Maximum length of the output utterances")
    parser.add_argument("--min_length", type=int, default=1, help="Minimum length of the output utterances")
    parser.add_argument("--seed", type=int, default=42, help="Seed")
    parser.add_argument("--temperature", type=int, default=0.7, help="Sampling softmax temperature")
    parser.add_argument("--top_k", type=int, default=0, help="Filter top-k tokens before sampling (<=0: no filtering)")
    parser.add_argument("--top_p", type=float, default=0.9, help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)")
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__file__)
    logger.info(pformat(args))

    if args.model_checkpoint == "":
        args.model_checkpoint = download_pretrained_model()

    random.seed(args.seed)
    torch.random.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    logger.info("Get pretrained model and tokenizer")
    tokenizer = OpenAIGPTTokenizer.from_pretrained(args.model_checkpoint)
    model = OpenAIGPTLMHeadModel.from_pretrained(args.model_checkpoint)
    model.to(args.device)
    model.eval()

    logger.info("Sample a personality")
    personalities = get_dataset_personalities(tokenizer, args.dataset_path, args.dataset_cache)
    personality = random.choice(personalities)
    logger.info("Selected personality: %s", tokenizer.decode(chain(*personality)))

    history = []
    while True:
        raw_text = input(">>> ")
        while not raw_text:
            print('Prompt should not be empty!')
            raw_text = input(">>> ")
        history.append(tokenizer.encode(raw_text))
        with torch.no_grad():
            out_ids = sample_sequence(personality, history, tokenizer, model, args)
        history.append(out_ids)
        history = history[-(2*args.max_history+1):]
        out_text = tokenizer.decode(out_ids, skip_special_tokens=True)
        print(out_text)
Example #14
    def __init__(self, args):
        super().__init__()

        if args.gpt_model_dir is not None:
            # load OpenAI GPT model from file
            gpt_model_name = str(args.gpt_model_dir) + "/"
            dict_file = gpt_model_name
            print("loading Open AI GPT model from {}".format(gpt_model_name))
        else:
            # load GPT model from huggingface cache
            gpt_model_name = args.gpt_model_name
            dict_file = gpt_model_name

        # Load pre-trained model tokenizer (vocabulary)
        self.tokenizer = OpenAIGPTTokenizer.from_pretrained(dict_file)

        # GPT uses a different way to represent BPE than BERT. Namely, the
        # final suffixes are indicated with a </w> suffix, while pieces that must
        # be followed are written as is. In BERT the prefixes are written as is,
        # while the parts that must follow (not be followed!) have a '##' prefix.
        # There is no one-to-one conversion. But at least we may make pieces that
        # may form a full word look the same.
        # Note that we should be very careful now,
        # tokenizer.convert_tokens_to_ids won't work with our vocabulary.
        def convert_word(word):
            if word == OPENAI_UNK:
                return word
            if word == '\n</w>':
                # Redefine symbol EOS to improve visualization.
                return OPENAI_EOS
            return word[:-4] if word.endswith('</w>') else f'{word}##'

        _, gpt_vocab = zip(*sorted(self.tokenizer.decoder.items()))
        self.vocab = [convert_word(word) for word in gpt_vocab]
        self._init_inverse_vocab()

        # Get UNK symbol as it's written in the origin GPT vocab.
        unk_index = self.inverse_vocab[OPENAI_UNK]
        self.unk_symbol = self.tokenizer.decoder[unk_index]

        # Load pre-trained model (weights)
        self.gpt_model = OpenAIGPTLMHeadModel.from_pretrained(gpt_model_name)
        self.gpt_model.eval()
        print(self.gpt_model.config)

        # Sanity check.
        assert len(self.vocab) == self.gpt_model.config.vocab_size
        assert 0 == self.gpt_model.config.n_special

        self.eos_id = self.inverse_vocab[OPENAI_EOS]
        self.model_vocab = self.vocab
Example #15
def dummy_tokenize():
    from pytorch_pretrained_bert import OpenAIGPTTokenizer

    # OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
    import logging
    logging.basicConfig(level=logging.INFO)

    # Load pre-trained model tokenizer (vocabulary)
    tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')

    # Tokenized input
    text = "Who was Jim Henson ? Jim Henson was a puppeteer"
    tokenized_text = tokenizer.tokenize(text)
    return tokenized_text
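
Calling the helper returns the BPE token strings; a quick sketch of inspecting them (the exact splits depend on the pretrained merges):

tokens = dummy_tokenize()
print(tokens)  # lower-cased BPE pieces; word-final pieces carry a '</w>' suffix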
Example #16
    def __init__(self, opt, shared=None):
        super(TransformerAgent, self).__init__(opt, shared)

        args = AttrDict(
            opt)  # to keep most commands identical to the interact.py script
        self.args = args

        logging.basicConfig(level=logging.INFO)
        self.logger = logging.getLogger(__file__)
        self.logger.info(pformat(args))

        random.seed(args.seed)
        torch.random.manual_seed(args.seed)
        torch.cuda.manual_seed(args.seed)

        if shared is None:
            self.logger.info("Get pretrained model and tokenizer")
            if args.model_checkpoint == "":
                args.model_checkpoint = download_pretrained_model()

            self.tokenizer = OpenAIGPTTokenizer.from_pretrained(
                args.model_checkpoint)
            if self.args.eval_type == "hits@1":
                self.model_checkpoint = OpenAIGPTDoubleHeadsModel.from_pretrained(
                    args.model_checkpoint)
            else:
                self.model_checkpoint = OpenAIGPTLMHeadModel.from_pretrained(
                    args.model_checkpoint)
            self.model_checkpoint.to(args.device)
            self.model_checkpoint.eval()

            self.logger.info("Build BPE prefix dictionary")
            convai_dict = build_dict()
            assert len(convai_dict) == 19304
            self.prefix2words = self.get_prefix2words(convai_dict)
        else:
            self.model_checkpoint = shared['model']
            self.tokenizer = shared['tokenizer']
            self.prefix2words = shared['prefix2words']

        self.special_tokens_ids = self.tokenizer.convert_tokens_to_ids(
            SPECIAL_TOKENS)

        self.persona = []
        self.history = []
        self.labels = []

        self.reset()
Example #17
def stat(samples):
    tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
    title1_length = []
    title2_length = []
    description_length = []

    for sample in samples:
        title1 = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sample[0]))
        title2 = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sample[1]))
        description = tokenizer.convert_tokens_to_ids(
            tokenizer.tokenize(sample[2]))
        title1_length.append(len(title1))
        title2_length.append(len(title2))
        description_length.append(len(description))

    return title1_length, title2_length, description_length
Example #18
def load_gpt_tokenizer():
    """ Helper function for loading sub-word tokenizer
    
    Returns:
         Instance of pytorch_pretrained_bert.OpenAIGPTTokenizer tokenizer
    """
    model_name = 'openai-gpt'
    special_tokens = ['_start_', '_end_', '_pad_']
    tok = OpenAIGPTTokenizer.from_pretrained(model_name,
                                             special_tokens=special_tokens)
    # Explicitly set padding token to be word_id = 0
    tok.special_tokens['_pad_'] = 0
    tok.encoder['<unk>'] = len(tok)
    print('GPT tokenizer initialized...')

    return tok
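
A minimal usage sketch, assuming OpenAIGPTTokenizer has been imported from pytorch_pretrained_bert as the helper above requires:

tok = load_gpt_tokenizer()
token_ids = tok.convert_tokens_to_ids(tok.tokenize("hello world"))
print(token_ids)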
Example #19
def dump_gpt_index(splits):
    from pytorch_pretrained_bert import OpenAIGPTTokenizer
    tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
    #splits = ['train', 'val_seen', 'val_unseen', 'test']

    for split in splits:
        data = load_datasets(
            [split], encoder_type='lstm'
        )  # here we use lstm dataset to preprocess the data,
        indexed_tokens = []
        for item in data:
            for instr in item['instructions']:
                tokenized_text = tokenizer.tokenize(instr)
                tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
                indexed_tokens.append('_'.join([str(i) for i in tokens]))
        write_vocab(indexed_tokens, 'tasks/R2R/data/R2R_%s_gpt.txt' % split)
Example #20
def load(small=False):
    """
    Load OpenAI model and NLP model

    Requires running

    > python -m spacy download en_core_web_lg
    """
    # Load pretrained model and tokenizer
    global model, tokenizer, nlp
    model = OpenAIGPTLMHeadModel.from_pretrained("openai-gpt").eval()
    tokenizer = OpenAIGPTTokenizer.from_pretrained("openai-gpt")
    if small:
        nlp = spacy.load("en_core_web_sm")
    else:
        nlp = spacy.load("en_core_web_lg")
    return nlp
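
A usage sketch, assuming spacy and the pytorch_pretrained_bert classes are imported and the en_core_web_sm model has been downloaded:

nlp = load(small=True)
doc = nlp("Jim Henson was a puppeteer.")
print([token.text for token in doc])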
Example #21
def fetch_objects():
    bert = BertModel.from_pretrained(
        'bert-base-uncased').embeddings.position_embeddings.weight.data
    gpt = OpenAIGPTModel.from_pretrained(
        'openai-gpt').positions_embed.weight.data
    gpt2 = GPT2Model.from_pretrained('gpt2').wpe.weight.data
    bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    gpt_tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
    gpt2_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    return {
        'bert': bert,
        'gpt': gpt,
        'gpt2': gpt2
    }, {
        'bert': bert_tokenizer,
        'gpt': gpt_tokenizer,
        'gpt2': gpt2_tokenizer
    }
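
A quick sketch of inspecting what the function above returns; the shapes in the comment are what the stock pretrained checkpoints are expected to carry:

position_embeddings, tokenizers = fetch_objects()
for name, weights in position_embeddings.items():
    print(name, tuple(weights.shape))  # e.g. bert (512, 768), gpt (512, 768), gpt2 (1024, 768)
print(tokenizers['gpt'].tokenize("position embeddings"))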
Example #22
    def __init__(self):
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        checkpoint = download_pretrained_model()
        self.tokenizer = OpenAIGPTTokenizer.from_pretrained(checkpoint)
        self.model = OpenAIGPTLMHeadModel.from_pretrained(checkpoint)
        self.model.to(self.device)
        self.model.eval()

        with open(join(dirname(realpath(__file__)), "RoboyPersonality.txt"),
                  "r") as input_file:
            roboy_personality = input_file.read().split('\n')
        self.personality = []
        for p in roboy_personality:
            self.personality.append(self.tokenizer.encode(p))
        self.history = []
        self.fix_spaces = re.compile(r'\s*([?!.,]+(?:\s+[?!.,]+)*)\s*')
Example #23
def filtration(samples):
    tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
    result = []

    for sample in samples:
        total = 0
        zeros = 0
        title1 = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sample[0]))
        title2 = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sample[1]))
        description = tokenizer.convert_tokens_to_ids(
            tokenizer.tokenize(sample[2]))
        for id in title1 + title2 + description:
            if id == 0:
                zeros += 1
            total += 1
        if 1.0 * zeros / total < 0.1 and len(title1 + title2 +
                                             description) <= 60:
            result.append(sample)
    return result
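
A small usage sketch; as the indexing in the loop suggests, each sample is a (title1, title2, description) triple (toy data here):

samples = [("first title", "second title", "a short description"),
           ("another title", "yet another title", "a much longer description " * 20)]
kept = filtration(samples)
print(len(kept))  # the second sample exceeds the 60-token limit and is dropped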
Example #24
    def __init__(self, opt):
        super().__init__(opt)
        # initialize from vocab path
        cache_vocab_dir = os.path.join(opt['datapath'], 'models', 'gpt_models')
        self.special_tokens = [
            SpecialToken.talk_1_start, SpecialToken.talk_1_end,
            SpecialToken.persona_start, SpecialToken.persona_end,
            SpecialToken.no_fact, SpecialToken.start, SpecialToken.end,
            SpecialToken.slice_sym
        ]

        # add special token after the pre-trained bpe text
        self.tokenizer = OpenAIGPTTokenizer.from_pretrained(
            'openai-gpt',
            cache_dir=cache_vocab_dir,
            special_tokens=self.special_tokens)

        self.start_token = self.default_start
        self.end_token = self.default_end
        self.null_token = self.default_null

        # <unk> already in the dictionary
        self.start_idx = self.tokenizer.convert_tokens_to_ids(
            [SpecialToken.start])[0]
        # <end> is used to split a long text into different parts, which is necessary for us
        # to differentiate persona & history when the observation function is only called once
        self.end_idx = self.tokenizer.convert_tokens_to_ids([SpecialToken.end
                                                             ])[0]
        self.pad_idx = self.tokenizer.convert_tokens_to_ids(
            [SpecialToken.pad])[0]  # should be 0
        # update for default tokenizer vocabulary
        self.tok2ind.clear()
        self.ind2tok.clear()

        # set tok2ind for special tokens
        for special_token in self.special_tokens + [
                self.start_token, self.end_token, self.null_token
        ]:
            token_id = self.tokenizer.convert_tokens_to_ids([special_token])[0]
            self.tok2ind[special_token] = token_id
            self.ind2tok[token_id] = special_token
Example #25
def token_stat(samples):
    tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
    result = {}
    total_tokens = 0
    total_BOA_EOA = 0
    total_SEP = 0

    for sample in samples:
        title1 = tokenizer.tokenize(sample[0])
        title2 = tokenizer.tokenize(sample[1])
        description = tokenizer.tokenize(sample[2])
        all_tokens = title1 + title2 + description
        for token in all_tokens:
            if token not in result:
                result[token] = 0
            result[token] += 1
        total_tokens += len(all_tokens) + 4
        total_BOA_EOA += 1
        total_SEP += 2

    return result, total_tokens, total_BOA_EOA, total_SEP
Example #26
def get_GPT_embeddings(vocab, dim):
    _embeddings = np.zeros([len(vocab), dim])

    if "openai-gpt" not in OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP.keys():
        raise ValueError("Provided OpenAI GPT model is not available.")
    tokenizer = OpenAIGPTTokenizer.from_pretrained("openai-gpt")
    gpt_model = OpenAIGPTModel.from_pretrained("openai-gpt")

    with torch.no_grad():
        for word in vocab:
            subwords = tokenizer.tokenize(word)
            indexed_tokens = tokenizer.convert_tokens_to_ids(subwords)
            tokens_tensor = torch.tensor([indexed_tokens])
            tokens_tensor = tokens_tensor.to(flair.device)
            hidden_states = gpt_model(tokens_tensor)

            first_embedding = hidden_states[0][0]
            last_embedding = hidden_states[0][len(hidden_states[0]) - 1]
            final_embedding = torch.cat([first_embedding, last_embedding])

            _embeddings[vocab[word]] = final_embedding

    return _embeddings
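
A minimal sketch of calling the function above, assuming flair.device is the CPU (the states are copied straight into the numpy array) and that vocab maps words to row indices; OpenAI GPT hidden states are 768-dimensional and the function concatenates the first and last sub-word states, so dim is 1536 here:

vocab = {"puppet": 0, "theatre": 1}
embeddings = get_GPT_embeddings(vocab, dim=1536)
print(embeddings.shape)  # (2, 1536)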
Example #27
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name',
                        type=str,
                        default='openai-gpt',
                        help='pretrained model name')
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )
    parser.add_argument('--train_dataset', type=str, default='')
    parser.add_argument('--eval_dataset', type=str, default='')
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--num_train_epochs', type=int, default=3)
    parser.add_argument('--train_batch_size', type=int, default=8)
    parser.add_argument('--eval_batch_size', type=int, default=16)
    parser.add_argument('--max_grad_norm', type=int, default=1)
    parser.add_argument('--learning_rate', type=float, default=6.25e-5)
    parser.add_argument('--warmup_proportion', type=float, default=0.002)
    parser.add_argument('--lr_schedule', type=str, default='warmup_linear')
    parser.add_argument('--weight_decay', type=float, default=0.01)
    parser.add_argument('--lm_coef', type=float, default=0.9)
    parser.add_argument('--n_valid', type=int, default=374)

    parser.add_argument('--server_ip',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    parser.add_argument('--server_port',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    args = parser.parse_args()
    print(args)

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    logger.info("device: {}, n_gpu {}".format(device, n_gpu))

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # Load tokenizer and model
    # These loading functions also add new tokens and embeddings called `special tokens`
    # These new embeddings will be fine-tuned on the RocStories dataset
    special_tokens = ['_start_', '_delimiter_', '_classify_']
    tokenizer = OpenAIGPTTokenizer.from_pretrained(
        args.model_name, special_tokens=special_tokens)
    special_tokens_ids = list(
        tokenizer.convert_tokens_to_ids(token) for token in special_tokens)
    model = OpenAIGPTDoubleHeadsModel.from_pretrained(
        args.model_name, num_special_tokens=len(special_tokens))
    model.to(device)

    # Load and encode the datasets
    if not args.train_dataset and not args.eval_dataset:
        roc_stories = cached_path(ROCSTORIES_URL)

    def tokenize_and_encode(obj):
        """ Tokenize and encode a nested object """
        if isinstance(obj, str):
            return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
        elif isinstance(obj, int):
            return obj
        return list(tokenize_and_encode(o) for o in obj)

    logger.info("Encoding dataset...")
    train_dataset = load_rocstories_dataset(args.train_dataset)
    eval_dataset = load_rocstories_dataset(args.eval_dataset)
    datasets = (train_dataset, eval_dataset)
    encoded_datasets = tokenize_and_encode(datasets)

    # Compute the max input length for the Transformer
    max_length = model.config.n_positions // 2 - 2
    input_length = max(len(story[:max_length]) + max(len(cont1[:max_length]), len(cont2[:max_length])) + 3  \
                           for dataset in encoded_datasets for story, cont1, cont2, _ in dataset)
    input_length = min(input_length, model.config.n_positions
                       )  # Max size of input for the pre-trained model

    # Prepare inputs tensors and dataloaders
    tensor_datasets = pre_process_datasets(encoded_datasets, input_length,
                                           max_length, *special_tokens_ids)
    train_tensor_dataset, eval_tensor_dataset = tensor_datasets[
        0], tensor_datasets[1]

    train_data = TensorDataset(*train_tensor_dataset)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    eval_data = TensorDataset(*eval_tensor_dataset)
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    num_train_optimization_steps = len(
        train_data) * args.num_train_epochs // args.train_batch_size
    optimizer = OpenAIAdam(optimizer_grouped_parameters,
                           lr=args.learning_rate,
                           warmup=args.warmup_proportion,
                           max_grad_norm=args.max_grad_norm,
                           weight_decay=args.weight_decay,
                           t_total=num_train_optimization_steps)

    if args.do_train:
        nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_steps = 0
            tqdm_bar = tqdm(train_dataloader, desc="Training")
            for step, batch in enumerate(tqdm_bar):
                batch = tuple(t.to(device) for t in batch)
                input_ids, mc_token_ids, lm_labels, mc_labels = batch
                losses = model(input_ids, mc_token_ids, lm_labels, mc_labels)
                loss = args.lm_coef * losses[0] + losses[1]
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()
                tr_loss += loss.item()
                exp_average_loss = loss.item() if exp_average_loss is None else 0.7 * exp_average_loss + 0.3 * loss.item()
                nb_tr_steps += 1
                tqdm_bar.desc = "Training loss: {:.2e} lr: {:.2e}".format(
                    exp_average_loss,
                    optimizer.get_lr()[0])

    # Save a trained model
    if args.do_train:
        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
        output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
        config = model.config
        torch.save(model_to_save.state_dict(), output_model_file)

        # Load a trained model that you have fine-tuned
        model_state_dict = torch.load(output_model_file)
        model = OpenAIGPTDoubleHeadsModel(config)
        model.load_state_dict(model_state_dict)
        model.to(device)

    if args.do_eval:
        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            batch = tuple(t.to(device) for t in batch)
            input_ids, mc_token_ids, lm_labels, mc_labels = batch
            with torch.no_grad():
                _, mc_loss = model(input_ids, mc_token_ids, lm_labels,
                                   mc_labels)
                _, mc_logits = model(input_ids, mc_token_ids)

            mc_logits = mc_logits.detach().cpu().numpy()
            mc_labels = mc_labels.to('cpu').numpy()
            tmp_eval_accuracy = accuracy(mc_logits, mc_labels)

            eval_loss += mc_loss.mean().item()
            eval_accuracy += tmp_eval_accuracy

            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1

        eval_loss = eval_loss / nb_eval_steps
        eval_accuracy = eval_accuracy / nb_eval_examples
        train_loss = tr_loss / nb_tr_steps if args.do_train else None
        result = {
            'eval_loss': eval_loss,
            'eval_accuracy': eval_accuracy,
            'train_loss': train_loss
        }

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
Example #28
import json
from pytorch_pretrained_bert import cached_path
from pytorch_pretrained_bert import OpenAIGPTTokenizer
from keras_gpt_2 import load_trained_model_from_checkpoint, get_bpe_from_files, generate

tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
url = "s3://datasets.huggingface.co/personachat/personachat_self_original.json"

# Download and load JSON dataset
personachat_file = cached_path(url)
with open(personachat_file, "r", encoding="utf-8") as f:
    dataset = json.loads(f.read())

# with open('dataset.json', "w", encoding="utf-8") as f:
#     f.write(json.dumps(dataset))
dataset = dataset['train']
dataset = dataset[:1]
print('\n')
print(dataset[0]['utterances'][1])
print('\n')
print(dataset[0]['utterances'][2])


# Tokenize and encode the dataset using our loaded GPT tokenizer
def tokenize(obj):
    if isinstance(obj, str):
        return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
    if isinstance(obj, dict):
        return dict((n, tokenize(o)) for n, o in obj.items())
    return list(tokenize(o) for o in obj)
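
Continuing the script above, the recursive helper keeps the PersonaChat nesting intact while replacing every string with a list of token ids:

encoded = tokenize(dataset[0]['utterances'][1])
print(list(encoded.keys()))  # same keys as the raw utterance dict, values now hold token ids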
Example #29
def run():
    parser = ArgumentParser()
    parser.add_argument("--model_type", type=str, default="gpt", help="gpt or gpt2")
    parser.add_argument("--model_checkpoint", type=str, default="", help="Path, url or short name of the model")
    parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu", help="Device (cuda or cpu)")
    parser.add_argument("--filename", type=str, default="data/instances_dev.pkl", help="File to use for decoding")
    parser.add_argument("--no_sample", action='store_true', help="Set to use greedy decoding instead of sampling")
    parser.add_argument("--max_length", type=int, default=50, help="Maximum length of the output utterances")
    parser.add_argument("--min_length", type=int, default=1, help="Minimum length of the output utterances")
    parser.add_argument("--seed", type=int, default=42, help="Seed")
    parser.add_argument("--temperature", type=int, default=0.7, help="Sampling softmax temperature")
    parser.add_argument("--top_k", type=int, default=0, help="Filter top-k tokens before sampling (<=0: no filtering)")
    parser.add_argument("--top_p", type=float, default=0.9, help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)")
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__file__)
    logger.info(pformat(args))

    random.seed(args.seed)
    torch.random.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    logger.info("Get pretrained model and tokenizer")

    if args.model_type == 'gpt2':
        tokenizer = GPT2Tokenizer.from_pretrained(args.model_checkpoint)
        model = GPT2LMHeadModel.from_pretrained(args.model_checkpoint)
    else:
        tokenizer = OpenAIGPTTokenizer.from_pretrained(args.model_checkpoint)
        model = OpenAIGPTLMHeadModel.from_pretrained(args.model_checkpoint)

    model.to(args.device)
    model.eval()

    data = get_dataset_from_file(tokenizer, args.filename)
    final_output_dict = {
        "version": "squash-2.0",
        "data": [{
            "paragraphs": []
        }]
    }
    question_number = 0
    # For all the instances corresponding to one paragraph, the model input format is: paragraph + answer + question.
    # The paragraph is common across all the instances.
    # "past" can be used to reuse the precomputed hidden state of the paragraph in subsequent predictions.

    import copy
    
    prev_para_index = None
    past = None
    for inst in tqdm.tqdm(data):
        with torch.no_grad():
            current_para_index = inst['para_index']
            if current_para_index != prev_para_index:
                past = None
                current_inst = copy.deepcopy(inst)
                # only keep paragraph details in the instance to get its hidden states
                current_inst['question'] = []
                current_inst['answer'] = []
                instance, _ = build_input_from_segments(current_inst, tokenizer, with_eos=False)
                input_ids = torch.tensor(instance['input_ids'][:-2], device=args.device).unsqueeze(0)
                token_type_ids = torch.tensor(instance['token_type_ids'][:-2], device=args.device).unsqueeze(0)
                # the returned "past" holds the paragraph hidden states for reuse
                _, past = model(input_ids, token_type_ids=token_type_ids, past=past)
                prev_para_index = current_para_index
            output = sample_sequence(inst, tokenizer, model, args, past)

        original_paragraph = tokenizer.decode(output['paragraph'])
        generated_question = tokenizer.decode(output['question'], skip_special_tokens=True)
        original_answer = tokenizer.decode(output['answer'], skip_special_tokens=True)
        para_index = inst['para_index']

        # Output in a SQUAD-like format with questions clumped together under their parent paragraph
        if len(final_output_dict["data"][0]["paragraphs"]) > para_index:
            # verify whether the paragraph text is identical
            assert original_paragraph == final_output_dict["data"][0]["paragraphs"][para_index]['context']
            # append the question answer pair
            final_output_dict["data"][0]["paragraphs"][para_index]['qas'].append({
                'id': 'question_%d' % question_number,
                'question': generated_question,
                'answers': [{
                    'text': original_answer,
                    'answer_start': original_paragraph.index(original_answer)
                }],
                'class': output['class'],
                'algorithm': output['algorithm'],
                'is_impossible': False
            })
        else:
            # add a new question to the list of QA pairs
            final_output_dict['data'][0]['paragraphs'].append({
                'context': original_paragraph,
                'qas': [{
                    'id': 'question_%d' % question_number,
                    'question': generated_question,
                    'answers': [{
                        'text': original_answer,
                        'answer_start': original_paragraph.index(original_answer)
                    }],
                    'class': output['class'],
                    'algorithm': output['algorithm'],
                    'is_impossible': False
                }]
            })

        question_number += 1

    with open("squash/temp/generated_questions.json", "w") as f:
        f.write(json.dumps(final_output_dict))
Example #30
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir",
                        default='/hdd/user4/gpt_classification/dataset/ag_news',
                        type=str,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument('--model_name', type=str, default='openai-gpt',
                        help='pretrained model name')
    parser.add_argument("--task_name",
                        default='ag_news',
                        type=str,
                        help="The name of the task to train.")
    parser.add_argument("--output_dir",
                        default='/hdd/user4/gpt_classification/experiment/ag_news',
                        type=str,
                        help="The output directory where the model predictions and checkpoints will be written.")

    parser.add_argument("--max_grad_norm",
                        default=1)
    parser.add_argument('--weight_decay', type=float, default=0.0)

    ## Other parameters
    parser.add_argument("--cache_dir",
                        default='/hdd/user4/gpt_classification/pretrained',
                        type=str,
                        help="Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument("--max_seq_length",
                        default=128,
                        type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--do_train",
                        default=True,
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        default=True,
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--train_batch_size",
                        default=16,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=9.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion",
                        default=0.1,
                        type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument('--lr_schedule', type=str, default='warmup_linear')
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument('--overwrite_output_dir',
                        default=True,
                        action='store_true',
                        help="Overwrite the content of the output directory")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps',
                        type=int,
                        default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--fp16',
                        action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale',
                        type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                             "0 (default value): dynamic loss scaling.\n"
                             "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
    args = parser.parse_args()

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        # n_gpu = torch.cuda.device_count()
        n_gpu = 1
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    args.device = device

    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S',
                        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)

    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir) and args.do_train and not args.overwrite_output_dir:
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(args.output_dir)

    task_name = args.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name](args.data_dir)
    output_mode = output_modes[task_name]

    label_list = processor.get_labels()
    num_labels = len(label_list)

    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab

    special_tokens = ['_start_', '_delimiter_', '_classify_']
    tokenizer = OpenAIGPTTokenizer.from_pretrained(args.model_name, special_tokens=special_tokens)
    model = OpenAIGPTForClassification.from_pretrained(args.model_name,
                                                       num_special_tokens=len(special_tokens),
                                                       num_labels=num_labels)
    if args.local_rank == 0:
        torch.distributed.barrier()

    model.to(device)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model,
                                                          device_ids=[args.local_rank],
                                                          output_device=args.local_rank,
                                                          find_unused_parameters=True)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    global_step = 0
    tr_loss = 0

    if args.do_train:
        if args.local_rank in [-1, 0]:
            tb_writer = SummaryWriter()

        # Prepare data loader
        train_examples = processor.get_train_examples()
        cached_train_features_file = os.path.join(args.data_dir, 'train_{0}_{1}_{2}'.format(
            list(filter(None, args.model_name.split('/'))).pop(),
            str(args.max_seq_length),
            str(task_name)))
        try:
            with open(cached_train_features_file, "rb") as reader:
                train_features = pickle.load(reader)
        except:
            train_features = convert_examples_to_features(
                train_examples, label_list, args.max_seq_length, tokenizer, output_mode)
            if args.local_rank == -1 or torch.distributed.get_rank() == 0:
                logger.info("  Saving train features into cached file %s", cached_train_features_file)
                with open(cached_train_features_file, "wb") as writer:
                    pickle.dump(train_features, writer)

        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)

        if output_mode == "classification":
            all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
        elif output_mode == "regression":
            all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.float)

        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

        # Prepare optimizer
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
        num_train_optimization_steps = len(train_dataloader) * args.num_train_epochs
        optimizer = OpenAIAdam(optimizer_grouped_parameters,
                               lr=args.learning_rate,
                               warmup=args.warmup_proportion,
                               max_grad_norm=args.max_grad_norm,
                               weight_decay=args.weight_decay,
                               t_total=num_train_optimization_steps)

        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)

        model.train()
        for _ in range(int(args.num_train_epochs)):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, _, label_ids = batch

                # define a new function to compute loss values for both output_modes
                logits = model.forward(input_ids, input_mask)

                if output_mode == "classification":
                    loss_fct = CrossEntropyLoss()
                    loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))
                elif output_mode == "regression":
                    loss_fct = MSELoss()
                    loss = loss_fct(logits.view(-1), label_ids.view(-1))

                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1
                    if args.local_rank in [-1, 0]:
                        tb_writer.add_scalar('lr', optimizer.get_lr()[0], global_step)
                        tb_writer.add_scalar('loss', loss.item(), global_step)

        tb_writer.close()

    ### Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        # Save a trained model, configuration and tokenizer
        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself

        # If we save using the predefined names, we can load using `from_pretrained`
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)

        torch.save(model_to_save.state_dict(), output_model_file)
        model_to_save.config.to_json_file(output_config_file)
        tokenizer.save_vocabulary(args.output_dir)
        # Good practice: save your training arguments together with the trained model
        output_args_file = os.path.join(args.output_dir, 'training_args.bin')
        torch.save(args, output_args_file)

    # Load a trained model and vocabulary that you have fine-tuned
    model = OpenAIGPTForClassification.from_pretrained(args.output_dir,
                                                       num_labels=num_labels)

    model.to(device)

    ### Evaluation
    if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        eval_examples = processor.get_dev_examples()
        cached_eval_features_file = os.path.join(args.data_dir, 'dev_{0}_{1}_{2}'.format(
            list(filter(None, args.model_name.split('/'))).pop(),
            str(args.max_seq_length),
            str(task_name)))
        try:
            with open(cached_eval_features_file, "rb") as reader:
                eval_features = pickle.load(reader)
        except:
            eval_features = convert_examples_to_features(
                eval_examples, label_list, args.max_seq_length, tokenizer, output_mode)
            if args.local_rank == -1 or torch.distributed.get_rank() == 0:
                logger.info("  Saving eval features into cached file %s", cached_eval_features_file)
                with open(cached_eval_features_file, "wb") as writer:
                    pickle.dump(eval_features, writer)

        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)
        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)

        if output_mode == "classification":
            all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
        elif output_mode == "regression":
            all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.float)

        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        # Run prediction for full data
        if args.local_rank == -1:
            eval_sampler = SequentialSampler(eval_data)
        else:
            eval_sampler = DistributedSampler(eval_data)  # Note that this sampler samples randomly
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

        model.eval()
        eval_loss = 0
        nb_eval_steps = 0
        preds = []
        out_label_ids = None


        for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            label_ids = label_ids.to(device)

            with torch.no_grad():
                logits = model.forward(input_ids, input_mask)

            # create eval loss and other metric required by the task
            if output_mode == "classification":
                loss_fct = CrossEntropyLoss()
                tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))
            elif output_mode == "regression":
                loss_fct = MSELoss()
                tmp_eval_loss = loss_fct(logits.view(-1), label_ids.view(-1))

            eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1
            if len(preds) == 0:
                preds.append(logits.detach().cpu().numpy())
                out_label_ids = label_ids.detach().cpu().numpy()
            else:
                preds[0] = np.append(
                    preds[0], logits.detach().cpu().numpy(), axis=0)
                out_label_ids = np.append(
                    out_label_ids, label_ids.detach().cpu().numpy(), axis=0)

        eval_loss = eval_loss / nb_eval_steps
        preds = preds[0]
        if output_mode == "classification":
            output_odp = []
            for arr in preds:
                t = (-arr).argsort()[:5]
                output_odp.append(t.tolist())
            file_path = 'D:/바탕화면/(논문)multi-pretraining/NYT'
            with open('gpt_top5.pkl','wb') as f:
                pickle.dump(output_odp,f)


            preds = np.argmax(preds, axis=1)
        elif output_mode == "regression":
            preds = np.squeeze(preds)
        result = compute_metrics(task_name, preds, out_label_ids)
        print('preds:',preds,'label:',out_label_ids)

        loss = tr_loss / global_step if args.do_train else None

        result['eval_loss'] = eval_loss
        result['global_step'] = global_step
        result['loss'] = loss

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

        # hack for MNLI-MM
        if task_name == "mnli":
            task_name = "mnli-mm"
            processor = processors[task_name]()

            if os.path.exists(args.output_dir + '-MM') and os.listdir(args.output_dir + '-MM') and args.do_train:
                raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
            if not os.path.exists(args.output_dir + '-MM'):
                os.makedirs(args.output_dir + '-MM')

            eval_examples = processor.get_dev_examples(args.data_dir)
            eval_features = convert_examples_to_features(
                eval_examples, label_list, args.max_seq_length, tokenizer, output_mode)
            logger.info("***** Running evaluation *****")
            logger.info("  Num examples = %d", len(eval_examples))
            logger.info("  Batch size = %d", args.eval_batch_size)
            all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
            all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
            all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
            all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)

            eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
            # Run prediction for full data
            eval_sampler = SequentialSampler(eval_data)
            eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

            model.eval()
            eval_loss = 0
            nb_eval_steps = 0
            preds = []
            out_label_ids = None

            for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
                input_ids = input_ids.to(device)
                input_mask = input_mask.to(device)
                segment_ids = segment_ids.to(device)
                label_ids = label_ids.to(device)

                with torch.no_grad():
                    logits = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=None)

                loss_fct = CrossEntropyLoss()
                tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))

                eval_loss += tmp_eval_loss.mean().item()
                nb_eval_steps += 1
                if len(preds) == 0:
                    preds.append(logits.detach().cpu().numpy())
                    out_label_ids = label_ids.detach().cpu().numpy()
                else:
                    preds[0] = np.append(
                        preds[0], logits.detach().cpu().numpy(), axis=0)
                    out_label_ids = np.append(
                        out_label_ids, label_ids.detach().cpu().numpy(), axis=0)

            eval_loss = eval_loss / nb_eval_steps
            preds = preds[0]
            preds = np.argmax(preds, axis=1)
            result = compute_metrics(task_name, preds, out_label_ids)

            loss = tr_loss / global_step if args.do_train else None

            result['eval_loss'] = eval_loss
            result['global_step'] = global_step
            result['loss'] = loss

            output_eval_file = os.path.join(args.output_dir + '-MM', "eval_results.txt")
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key in sorted(result.keys()):
                    logger.info("  %s = %s", key, str(result[key]))
                    writer.write("%s = %s\n" % (key, str(result[key])))