Example #1
import json
import logging
import os
import sys

import datasets
from transformers import (BertConfig, BertTokenizerFast, DataCollatorForLanguageModeling,
                          GPT2Config, GPT2LMHeadModel, GPT2TokenizerFast)

# Setup logging
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    handlers=[logging.StreamHandler(sys.stdout)],
)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

if model_type == "gpt2":
    # The Hugging Face servers temporarily had problems serving the tokenizer, so we saved it for
    # offline use - replacing "gpt2_offline" with "gpt2" will download the latest version again.
    # The copy saved for offline use should be identical to the regular one.
    tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
    tokenizer.add_special_tokens({"pad_token": "[PAD]"})
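    # A hedged sketch (not part of the original snippet) of how the offline copy could be saved
    # and reloaded; the "gpt2_offline" directory name is taken from the comment above:
    # tokenizer.save_pretrained("gpt2_offline")
    # tokenizer = GPT2TokenizerFast.from_pretrained("gpt2_offline")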
    config = GPT2Config()
    model = GPT2LMHeadModel(config)
    model.resize_token_embeddings(len(tokenizer))
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    if train:
        train_dataset = datasets.Dataset.load_from_disk(os.path.join(data_dir, "lm_train"))
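        # A hedged sketch of how these pieces would typically be wired together with the
        # transformers Trainer (the TrainingArguments values here are assumptions, not from
        # the original code, and the Trainer/TrainingArguments imports would be needed):
        # training_args = TrainingArguments(output_dir="lm_out", num_train_epochs=1)
        # trainer = Trainer(model=model, args=training_args,
        #                   data_collator=data_collator, train_dataset=train_dataset)
        # trainer.train()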


elif model_type == "bert":
    with open(os.path.join(data_dir, "dataset_properties.json")) as f:
        dataset_properties = json.load(f)
    special_tokens = dataset_properties["special_tokens"]
    tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
    tokenizer.add_special_tokens({"additional_special_tokens": special_tokens})

    config = BertConfig()
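    # A hedged sketch of how this branch could continue, mirroring the GPT-2 branch above
    # (the snippet is truncated here; BertForMaskedLM and mlm=True are assumptions):
    # model = BertForMaskedLM(config)
    # model.resize_token_embeddings(len(tokenizer))
    # data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True)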
Example #2
    def __init__(
        self,
        model: str = None,
        config: Union[str, GPT2Config] = None,
        vocab_file: str = None,
        merges_file: str = None,
        cache_dir: str = "aitextgen",
        tf_gpt2: str = None,
        to_gpu: bool = False,
        to_fp16: bool = False,
        verbose: bool = False,
        torchscript: bool = False,
        ts_to_trace: bool = False,
        bos_token: str = None,
        eos_token: str = None,
        unk_token: str = None,
        **kwargs,
    ) -> None:

        if not verbose:
            for module in [
                    "transformers.file_utils",
                    "transformers.configuration_utils",
                    "transformers.tokenization_utils",
                    "filelock",
                    "transformers.modeling_gpt2",
            ]:
                logging.getLogger(module).setLevel(logging.WARN)
            logging.getLogger("transformers.modeling_utils").setLevel(
                logging.ERROR)

        if torchscript:
            assert model
            logger.info(f"Loading traced GPT-2 model from provided {model}.")
            if config is None:
                config = GPT2Config()
            self.torchscript = True
            self.model = GPT2LMHeadModel(config)

            # Copy the traced model's attributes onto a GPT2LMHeadModel instance
            # so it can inherit its functions
            pt_model = torch.jit.load(model)
            self.model.transformer = pt_model.transformer
            self.model.lm_head = pt_model.lm_head

        elif tf_gpt2:
            # Download + convert the TF weights if a PyTorch model has not been created
            if not os.path.isfile(
                    os.path.join(cache_dir, f"pytorch_model_{tf_gpt2}.bin")):
                assert tf_gpt2 in [
                    "124M",
                    "355M",
                    "774M",
                    "1558M",
                ], "Invalid TensorFlow GPT-2 model size."

                logger.info(
                    f"Downloading the {tf_gpt2} GPT-2 TensorFlow weights/config "
                    + "from Google's servers")

                download_gpt2(cache_dir, tf_gpt2)

                logger.info(
                    f"Converting the {tf_gpt2} GPT-2 TensorFlow weights to PyTorch."
                )

                config_path = os.path.join(cache_dir, tf_gpt2, "hparams.json")

                convert_gpt2_checkpoint_to_pytorch(
                    os.path.join(cache_dir, tf_gpt2),
                    config_path,
                    cache_dir,
                )

                os.rename(
                    os.path.join(cache_dir, "pytorch_model.bin"),
                    os.path.join(cache_dir, f"pytorch_model_{tf_gpt2}.bin"),
                )

                os.rename(
                    os.path.join(cache_dir, "config.json"),
                    os.path.join(cache_dir, f"config_{tf_gpt2}.json"),
                )

            logger.info(f"Loading {tf_gpt2} GPT-2 model from /{cache_dir}.")
            model = os.path.join(cache_dir, f"pytorch_model_{tf_gpt2}.bin")
            config = os.path.join(cache_dir, f"config_{tf_gpt2}.json")

            self.model = GPT2LMHeadModel.from_pretrained(model, config=config)

        elif model and os.path.exists(model):
            # A pytorch_model.bin (+ optional config/config.json) is provided
            logger.info(f"Loading GPT-2 model from provided {model}.")
            if config is None:
                config = GPT2Config()
            if ts_to_trace:
                config.torchscript = True
            self.model = GPT2LMHeadModel.from_pretrained(model, config=config)
        elif config:
            if ts_to_trace:
                config.torchscript = True
            # Manually construct a GPT-2 model from scratch
            logger.info("Constructing GPT-2 model from provided config.")
            self.model = AutoModelWithLMHead.from_config(config=config)
        else:
            # Download and cache model from Huggingface
            if os.path.isdir(cache_dir) and len(os.listdir(cache_dir)) > 0:
                logger.info(
                    f"Loading {model or 'gpt2'} model from /{cache_dir}.")
            else:
                logger.info(
                    f"Downloading {model or 'gpt2'} model to /{cache_dir}.")
            self.model = GPT2LMHeadModel.from_pretrained(
                model or "gpt2", cache_dir=cache_dir, torchscript=ts_to_trace)
            if model and "gpt2" not in model:
                logger.info(f"Using the tokenizer for {model}.")
                self.tokenizer = GPT2Tokenizer.from_pretrained(
                    model,
                    cache_dir=cache_dir,
                )

        if self.tokenizer is None:
            # Update tokenizer settings (if not set already)
            args = locals()
            custom_tokenizer = False
            for attr in [
                    "vocab_file",
                    "merges_file",
                    "bos_token",
                    "eos_token",
                    "unk_token",
            ]:
                if args[attr] is not None:
                    custom_tokenizer = True
                    setattr(self, attr, args[attr])

            if custom_tokenizer:
                logger.info("Using a custom tokenizer.")
            else:
                logger.info("Using the default GPT-2 Tokenizer.")

            self.tokenizer = GPT2Tokenizer(
                vocab_file=self.vocab_file,
                merges_file=self.merges_file,
                bos_token=self.bos_token,
                eos_token=self.eos_token,
                unk_token=self.unk_token,
                pad_token=self.pad_token,
            )

        if to_gpu:
            if to_fp16:
                self.to_fp16()
            self.to_gpu()
    def prepare_config_and_inputs(self):
        input_ids = ids_tensor([self.batch_size, self.seq_length],
                               self.vocab_size)

        input_mask = None
        if self.use_input_mask:
            input_mask = random_attention_mask(
                [self.batch_size, self.seq_length])

        token_type_ids = None
        if self.use_token_type_ids:
            token_type_ids = ids_tensor([self.batch_size, self.seq_length],
                                        self.type_vocab_size)

        mc_token_ids = None
        if self.use_mc_token_ids:
            mc_token_ids = ids_tensor([self.batch_size, self.num_choices],
                                      self.seq_length)

        sequence_labels = None
        token_labels = None
        choice_labels = None
        if self.use_labels:
            sequence_labels = ids_tensor([self.batch_size],
                                         self.type_sequence_label_size)
            token_labels = ids_tensor([self.batch_size, self.seq_length],
                                      self.num_labels)
            choice_labels = ids_tensor([self.batch_size], self.num_choices)

        config = GPT2Config(
            vocab_size=self.vocab_size,
            n_embd=self.hidden_size,
            n_layer=self.num_hidden_layers,
            n_head=self.num_attention_heads,
            # intermediate_size=self.intermediate_size,
            # hidden_act=self.hidden_act,
            # hidden_dropout_prob=self.hidden_dropout_prob,
            # attention_probs_dropout_prob=self.attention_probs_dropout_prob,
            n_positions=self.max_position_embeddings,
            # type_vocab_size=self.type_vocab_size,
            # initializer_range=self.initializer_range
            bos_token_id=self.bos_token_id,
            eos_token_id=self.eos_token_id,
            pad_token_id=self.pad_token_id,
            return_dict=True,
        )

        head_mask = ids_tensor(
            [self.num_hidden_layers, self.num_attention_heads], 2)

        return (
            config,
            input_ids,
            input_mask,
            head_mask,
            token_type_ids,
            mc_token_ids,
            sequence_labels,
            token_labels,
            choice_labels,
        )
    small_conv=vq_vae_small_conv,
    embedding_dim=vq_vae_embedding_dim,
    num_embeddings=vq_vae_num_embeddings,
    commitment_cost=vq_vae_commitment_cost,
    use_max_filters=vq_vae_use_max_filters,
    max_filters=vq_vae_max_filters,
)
vq_vae.load_state_dict(torch.load(vq_vae_model_path, map_location=device))
vq_vae.eval()
vq_vae.to(device)

# Create Model
configuration = GPT2Config(
    vocab_size=vocab_size,
    n_positions=max_seq_length,
    n_embd=embedding_size,
    n_layer=num_hidden_layers,
    n_head=num_attention_heads,
    resid_pdrop=resid_pdrop,
)
model = GPT2LMHeadModel(configuration)
model.eval()
model.load_state_dict(checkpoint["model_state_dict"])
model.to(device)
print(model)

with torch.no_grad():
    # Get most common pixel values to feed into generation script
    _, _, _, encodings = vq_vae(sample.to(device))
    encodings, counts = encodings.unique(return_counts=True)
    bg1, bg2 = encodings[counts.topk(k=2, largest=True).indices].cpu().numpy()
Example #5
def run_model():
    parser = argparse.ArgumentParser()
    parser.add_argument('--model-path', type=str, help='pretrained model path to local checkpoint')
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument("--nsamples", type=int, default=1)
    parser.add_argument("--batch_size", type=int, default=1)
    parser.add_argument("--length", type=int, default=-1)
    parser.add_argument("--temperature", type=int, default=0.95)
    parser.add_argument('--top_p', type=float, default=0.95)
    parser.add_argument('--top_k', type=int, default=100)
    parser.add_argument('--data-dir', type=str, default='data')
    parser.add_argument('--out-dir', type=str, default='out')

    parser.add_argument('--data_type', type=str, default='t1', choices=['t' + str(i) for i in range(9)], help="t: type")
    parser.add_argument('--model_type', type=str, default='cvae', choices=['cvae', 'ae_vae_fusion'])
    parser.add_argument('--dataset', type=str, default='wi', choices=['wp', 'wi'], help="Dataset to use for training")

    # use GPU
    parser.add_argument('--gpu', default=2, type=int)
    parser.add_argument('--no_gpu', action="store_true")

    parser.add_argument('--add_input', action="store_true")
    parser.add_argument('--add_attn', action="store_true")
    parser.add_argument('--add_softmax', action="store_true")
    parser.add_argument('--attn_proj_vary', action="store_true")

    parser.add_argument('--learn_prior', action="store_true")

    args = parser.parse_args('--model-path out/wi.1.proj_vary_cyc_cvae/model_0030000.pt '
                             '--add_input --learn_prior '.split())
    print(args)

    if args.model_type == 'cvae':
        args.learn_prior = True
    else:
        args.learn_prior = False

    # GPU
    if not torch.cuda.is_available(): args.no_gpu = True
    gpu = not args.no_gpu
    if gpu: torch.cuda.set_device(args.gpu)
    device = torch.device(args.gpu if gpu else "cpu")

    # randomness
    np.random.seed(args.seed)
    prng = np.random.RandomState()
    torch.random.manual_seed(args.seed)
    if gpu: torch.cuda.manual_seed(args.seed)

    if args.batch_size == -1:
        args.batch_size = 1
    assert args.nsamples % args.batch_size == 0

    # logging
    save_folder = args.model_path + '.eval/'
    os.makedirs(save_folder, exist_ok=True)
    importlib.reload(logging)
    logging.basicConfig(filename=os.path.join(save_folder, 'eval.log'),
                        level=logging.INFO, format='%(asctime)s--- %(message)s')
    logging.info('\n----------------------------------------------------------------------')

    print('Loading models...')
    cache_dir = os.path.join(args.out_dir, 'model_cache')
    os.makedirs(cache_dir, exist_ok=True)
    # Load pre-trained teacher tokenizer (vocabulary)
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2', cache_dir=cache_dir)
    tokenizer.max_len = int(1e12)
    gpt2_model = GPT2LMHeadModel.from_pretrained('gpt2', cache_dir=cache_dir)
    print('gpt2_params:', num_params(gpt2_model))  # gpt2: 124439808
    config = GPT2Config()

    # # add special tokens
    # special_tokens_dict = {
    #     'pad_token': '<|startoftext|>',
    #     'cls_token': '<|startofcond|>',
    #     'sep_token': '<|sepofcond|>',
    #     'mask_token': '<|endofcond|>'
    # }
    # num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
    # print('We have added', num_added_toks, 'special tokens')
    # # Notice: resize_token_embeddings expect to receive the full size of the new vocab
    # gpt2_model.resize_token_embeddings(len(tokenizer))
    # assert tokenizer.pad_token == '<|startoftext|>'

    VAE = VAEModel(config, add_input=args.add_input, add_attn=args.add_attn, add_softmax=args.add_softmax,
                   attn_proj_vary=args.attn_proj_vary, learn_prior=args.learn_prior)
    init_para_frompretrained(VAE.transformer, gpt2_model.transformer, share_para=True)
    init_para_frompretrained(VAE.encoder, gpt2_model.transformer, share_para=False)
    if args.learn_prior:
        init_para_frompretrained(VAE.encoder_prior, VAE.encoder, share_para=True)
        VAE.encoder_prior.averageSelfAttention.attention_weights = VAE.encoder.averageSelfAttention.attention_weights
    VAE.lm_head.weight = gpt2_model.lm_head.weight
    if VAE.add_softmax:
        VAE.lm_head_rep = Conv1D(*gpt2_model.lm_head.weight.size())
        # VAE.lm_head_rep = LM_head_rep(*gpt2_model.lm_head.weight.size()[::-1])
    print('VAE_params:', num_params(VAE))  # 286694400
    args.load = args.model_path
    if args.load:
        print('Loading model weights...')
        state = torch.load(os.path.join(args.load), map_location='cpu')
        if 'module' in list(state.keys())[0]:  # model_path is data parallel model with attr 'module'
            state_copy = copy.copy(state)
            keys = state_copy.keys()
            for k in keys:
                state[k.replace('module.', '')] = state.pop(k)
        VAE.load_state_dict(state)
        gc.collect()
    print('Model loaded.')

    print('Setup data...')
    seq_len = VAE.config.n_ctx
    test_loader = prepare_dataset(
        args.data_dir, args.dataset, tokenizer,
        1, seq_len, 1, seq_len, args.batch_size, seq_len,
        make_train=False, make_val=False, make_test=True, data_type=args.data_type
    )[0]
    print('Done.')

    VAE.eval() # be careful about VAE.eval() vs VAE.train()
    VAE.to(device)
    loss_fn = nn.CrossEntropyLoss(reduction='none')

    logging.info('\n----------------------------------------------------------------------')
    logging.info("Testing loop. batches: %d" % len(test_loader))

    endoftext = tokenizer.convert_tokens_to_ids("<|endoftext|>")
    startofcond = tokenizer.convert_tokens_to_ids("<|startofcond|>")
    endofcond = tokenizer.convert_tokens_to_ids("<|endofcond|>")

    n_samples = 0
    bleu4_sum = 0.0
    rouge_scores_values_sum = [0.0] * 9

    model_type = args.model_type

    # test_iter = iter(test_loader); x_mask, x_tokens, y_mask, y_tokens, input_tokens, target_tokens, mask = next(test_iter)
    with tqdm(total=len(test_loader)) as pbar:
        for i_test, (x_mask, x_tokens, y_mask, y_tokens, input_tokens, target_tokens, mask) in enumerate(test_loader):

            length = args.length
            if length == -1:
                length = VAE.config.n_ctx - x_tokens.size(1) - 1
            elif length > VAE.config.n_ctx - x_tokens.size(1) - 1:
                raise ValueError("Can't get samples longer than window size: %s" % VAE.config.n_ctx)

            eff_samples = []
            n, l = target_tokens.size()
            storys = [tokenizer.decode(target_tokens[i, :]) for i in range(n)]
            storys = [s[s.find("<|endoftext|>") + len("<|endoftext|>"):] for s in storys]
            storys_str = [s[:s.find("<|endoftext|>") + len("<|endoftext|>")] if "<|endoftext|>" in s else s for s in storys]

            for _ in range(args.nsamples // args.batch_size):
                # model, batch_size, temperature, top_k, top_p, eos_token, sample = VAE, args.batch_size, args.temperature, args.top_k, args.top_p, tokenizer.encoder['<|endoftext|>'], True
                out, _ = sample_sequence(
                    model=VAE,
                    tokenizer=tokenizer,
                    length=length,
                    batch_size=args.batch_size,
                    x_mask=x_mask,
                    x_tokens=x_tokens,
                    y_mask=y_mask,
                    y_tokens=y_tokens,
                    temperature=args.temperature,
                    top_k=args.top_k,
                    top_p=args.top_p,
                    device = device,
                    eos_token=tokenizer.encoder['<|endoftext|>'],
                    model_type=model_type
                )
                out = out.tolist()

                # extract story, check metrics
                for i in range(len(out)):
                    text = out[i]
                    text = text[text.index(endoftext) + 1:]

                    if endoftext in text:
                        idx = text.index(endoftext)
                        text = text[:idx]

                    text = tokenizer.decode(text).strip()

                    # score for one long text, higher than 0.075 usually means repetition
                    # rep_score = repeat_score(text.split(), ngram=[3, 4, 5, 6, 7, 8])
                    # if rep_score > 0.075:
                    #     # print(rep_score)
                    #     continue
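                    # A hedged sketch of such a heuristic (repeat_score is not defined in this
                    # snippet, so this is an assumption about what it computes): the fraction
                    # of repeated trigrams in the generated text, e.g.
                    # tokens = text.split()
                    # trigrams = list(zip(tokens, tokens[1:], tokens[2:]))
                    # rep_ratio = 1 - len(set(trigrams)) / max(len(trigrams), 1)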

                    try:
                        # check bleu
                        bleu4 = sentence_bleu([storys_str[i].split()], text, smoothing_function=SmoothingFunction().method7)

                        # check rouge
                        rouge = Rouge()
                        rouge_scores = rouge.get_scores(text, storys_str[i])
                        rouge_scores_values = [v for k in rouge_scores[0].keys() for v in rouge_scores[0][k].values()]

                        bleu4_sum += bleu4
                        rouge_scores_values_sum = [v1 + v2 for v1, v2 in zip(rouge_scores_values_sum, rouge_scores_values)]
                        n_samples += 1
                    except Exception:
                        bleu4 = 0.0
                        rouge_scores = [{'rouge-1': {'f': 0.0, 'p': 0.0, 'r': 0.0},
                                         'rouge-2': {'f': 0.0, 'p': 0.0, 'r': 0.0},
                                         'rouge-l': {'f': 0.0, 'p': 0.0, 'r': 0.0}}]

                    eff_samples.append((text, bleu4, rouge_scores))

                # write samples to file
                samples_file = open(save_folder + 'batch-' + '%04d' % i_test + '.txt', 'w', encoding='utf8')
                for i in range(len(eff_samples)):
                    samples_file.write("=" * 50 + " SAMPLE " + str(i) + " " + "=" * 50)
                    samples_file.write('\n' * 2)

                    samples_file.write("=" * 40 + " Outlines  " + "=" * 40)
                    samples_file.write('\n' * 2)
                    samples_file.write(tokenizer.decode(x_tokens[i, :][x_mask[i, :] == 1].tolist()))
                    samples_file.write('\n' * 2)
                    samples_file.write("=" * 40 + " Story " + "=" * 40)
                    samples_file.write('\n' * 2)
                    samples_file.write(storys_str[i])
                    samples_file.write('\n' * 2)

                    samples_file.write("=" * 40 + " Generated " + "=" * 40)
                    samples_file.write('\n' * 2)
                    samples_file.write(eff_samples[i][0])
                    samples_file.write('\n' * 4)
                    samples_file.flush()

                samples_file.close()
                logging.info('batch %04d finished.', i_test)
                pbar.update(1)

    print('Test complete with %05d samples.' % n_samples)
    logging.info("Test complete with %05d samples.", n_samples)

    bleu4 = round(bleu4_sum / n_samples, 3)
    rouge_scores_values = [round(r / n_samples, 3) for r in rouge_scores_values_sum]
    print(' bleu-4:', bleu4)
    print(' rouge :', rouge_scores_values)
    logging.info(' bleu-4: %f', bleu4)
    logging.info(' rouge : %s', str(rouge_scores_values))
from transformers import GPT2Config, GPT2Tokenizer, GPT2LMHeadModel, WEIGHTS_NAME, CONFIG_NAME

# parameters
model_size = "medium"

# The fine-tuned DialoGPT models published by Microsoft on Azure blob storage do not have a model config attached to them.
# A model config with vocab and merges is available in the /configs folder of https://github.com/microsoft/DialoGPT.git.
# You can download the /configs folder from that repository and run the following code.
# gpt2_config= {'small': GPT2Config.from_json_file('DialoGPT/configs/117M/config.json'),
#              'medium': GPT2Config.from_json_file('DialoGPT/configs/345M/config.json'),
#              'large': GPT2Config.from_json_file('DialoGPT/configs/762M/config.json')}

# Alternatively, the model config can be set manually.
# These are the default model configs for the GPT-2 small, medium, and large models.
gpt2_config = {
    'small': GPT2Config(),
    'medium': GPT2Config(n_ctx=1024, n_embd=1024, n_layer=24, n_head=16),
    'large': GPT2Config(n_ctx=1024, n_embd=1280, n_layer=36, n_head=20)
}

# Load the GPT-2 tokenizer. All three GPT-2 models (small, medium, large) use the same vocabulary.
# A tokenizer is constructed from two files, vocab.json and merges.txt.
# Both files are available in the configs/117M/, configs/345M/, and configs/762M/ folders.
# tokenizer = GPT2Tokenizer.from_pretrained('DialoGPT/configs/345M')

# Alternatively, the following line will automatically download the vocab.json and merges.txt files
# and create a tokenizer from them.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# create transformer model from the fine-tuned model weights
model = GPT2LMHeadModel(gpt2_config[model_size])
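# A hedged sketch of the next step (not shown in this snippet): load the fine-tuned DialoGPT
# weights into the model, as done in Examples #8 and #11 below. The "medium_ft.pkl" file name
# and strict=False are taken from those examples; torch would need to be imported as well.
# model.load_state_dict(torch.load("medium_ft.pkl"), strict=False)
# model.lm_head.weight.data = model.transformer.wte.weight.data  # tie the LM head to the token embeddings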
Example #7
    def __init__(
        self,
        model: str = None,
        config: Union[str, GPT2Config] = None,
        vocab_file: str = None,
        merges_file: str = None,
        tokenizer_file: str = None,
        schema_tokens: List[str] = None,
        schema_return: List[str] = None,
        cache_dir: str = "aitextgen",
        tf_gpt2: str = None,
        to_gpu: bool = False,
        to_fp16: bool = False,
        verbose: bool = False,
        gradient_checkpointing: bool = False,
        bos_token: str = None,
        eos_token: str = None,
        unk_token: str = None,
        **kwargs,
    ) -> None:

        if not verbose:
            for module in [
                    "transformers.file_utils",
                    "transformers.configuration_utils",
                    "transformers.tokenization_utils",
                    "filelock",
                    "transformers.modeling_gpt2",
            ]:
                logging.getLogger(module).setLevel(logging.WARN)
            logging.getLogger("transformers.modeling_utils").setLevel(
                logging.ERROR)

        if tf_gpt2:
            self.openai_tf_gpt2 = tf_gpt2

            # Download + convert the TF weights if a PyTorch model has not been created
            if not os.path.isfile(
                    os.path.join(cache_dir, f"pytorch_model_{tf_gpt2}.bin")):
                assert tf_gpt2 in [
                    "124M",
                    "355M",
                    "774M",
                    "1558M",
                ], "Invalid TensorFlow GPT-2 model size."

                logger.info(
                    f"Downloading the {tf_gpt2} GPT-2 TensorFlow weights/config "
                    + "from Google's servers")

                download_gpt2(cache_dir, tf_gpt2)

                logger.info(
                    f"Converting the {tf_gpt2} GPT-2 TensorFlow weights to PyTorch."
                )

                config_path = os.path.join(cache_dir, tf_gpt2, "hparams.json")

                convert_gpt2_checkpoint_to_pytorch(
                    os.path.join(cache_dir, tf_gpt2),
                    config_path,
                    cache_dir,
                )

                os.rename(
                    os.path.join(cache_dir, "pytorch_model.bin"),
                    os.path.join(cache_dir, f"pytorch_model_{tf_gpt2}.bin"),
                )

                os.rename(
                    os.path.join(cache_dir, "config.json"),
                    os.path.join(cache_dir, f"config_{tf_gpt2}.json"),
                )

            logger.info(f"Loading {tf_gpt2} GPT-2 model from /{cache_dir}.")
            model = os.path.join(cache_dir, f"pytorch_model_{tf_gpt2}.bin")
            config = os.path.join(cache_dir, f"config_{tf_gpt2}.json")

            self.model = GPT2LMHeadModel.from_pretrained(model, config=config)

        elif model and os.path.exists(model):
            # A pytorch_model.bin (+ optional config/config.json) is provided
            logger.info(f"Loading GPT-2 model from provided {model}.")
            if config is None:
                config = GPT2Config()
            self.model = GPT2LMHeadModel.from_pretrained(model, config=config)
        elif config:
            # Manually construct a GPT-2 model from scratch
            logger.info("Constructing GPT-2 model from provided config.")
            if isinstance(config, str):
                config = AutoConfig.from_pretrained(config)
            self.model = GPT2LMHeadModel(config=config)
        else:
            # Download and cache model from Huggingface
            if os.path.isdir(cache_dir) and len(os.listdir(cache_dir)) > 0:
                logger.info(
                    f"Loading {model or 'gpt2'} model from /{cache_dir}.")
            else:
                logger.info(
                    f"Downloading {model or 'gpt2'} model to /{cache_dir}.")
            self.model = GPT2LMHeadModel.from_pretrained(model or "gpt2",
                                                         cache_dir=cache_dir)
            if model and "gpt2" not in model:
                logger.info(f"Using the tokenizer for {model}.")
                self.tokenizer = GPT2TokenizerFast.from_pretrained(
                    model,
                    cache_dir=cache_dir,
                )

        if gradient_checkpointing or tf_gpt2 in ["355M", "774M", "1558M"]:
            logger.info("Gradient checkpointing enabled for model training.")
            setattr(self.model.config, "gradient_checkpointing", True)
            setattr(self.model.config, "use_cache", False)

        if schema_tokens:
            setattr(self.model.config, "schema_tokens", schema_tokens)

        if schema_return:
            setattr(self.model.config, "schema_return", schema_return)

        if self.tokenizer is None:
            # Update tokenizer settings (if not set already)
            args = locals()
            custom_tokenizer = False
            for attr in [
                    "vocab_file",
                    "merges_file",
                    "tokenizer_file",
                    "bos_token",
                    "eos_token",
                    "unk_token",
            ]:
                if args[attr] is not None:
                    custom_tokenizer = True
                    setattr(self, attr, args[attr])

            if custom_tokenizer:
                logger.info("Using a custom tokenizer.")
            else:
                logger.info("Using the default GPT-2 Tokenizer.")

            if tokenizer_file:
                # load the custom GPT-2 tokenizer from a serialized tokenizer
                self.tokenizer = GPT2TokenizerFast(
                    vocab_file=None,
                    merges_file=None,
                    tokenizer_file=tokenizer_file,
                    bos_token=self.bos_token,
                    eos_token=self.eos_token,
                    unk_token=self.unk_token,
                    pad_token=self.pad_token,
                )
            else:
                self.tokenizer = GPT2TokenizerFast(
                    vocab_file=self.vocab_file,
                    merges_file=self.merges_file,
                    bos_token=self.bos_token,
                    eos_token=self.eos_token,
                    unk_token=self.unk_token,
                    pad_token=self.pad_token,
                )

        self.tokenizer.padding_side = "left"

        if to_gpu:
            if to_fp16:
                logger.warning(
                    "Currently, FP16 text generation results in random output. "
                    "You may want to avoid using to_fp16 for the time being.")
                self.to_fp16()
            self.to_gpu()
Example #8

#---------------------------------------------------------------------------------------#

#---------------------------------------------------------------------------------------#
# Model Initialization/Load
#---------------------------------------------------------------------------------------#

import random

import numpy as np
import torch
from transformers import GPT2Config, GPT2Tokenizer, GPT2LMHeadModel

# Model Seed
seed = random.randrange(1, 100)
np.random.seed(seed)
torch.random.manual_seed(seed)
torch.cuda.manual_seed(seed)

# Model Configs:
gpt2_small_config = GPT2Config()
gpt2_medium_config = GPT2Config(n_ctx=1024, n_embd=1024, n_layer=24, n_head=16)
gpt2_large_config = GPT2Config(n_ctx=1024, n_embd=1280, n_layer=36, n_head=20)

model_size = "medium"
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Model Loads:

model = GPT2LMHeadModel(gpt2_medium_config)
model.load_state_dict(torch.load("medium_ft.pkl"), strict=False)

device = torch.device("cuda")
model = model.to(device)
model.lm_head.weight.data = model.transformer.wte.weight.data  # tie the LM head weights to the token embeddings
Example #9
        tokenizer = spm.SentencePieceProcessor()
        tokenizer.Load(args.tokenizer)  # load the trained SentencePiece model so bos_id()/eos_id() below are valid
        # here we assume that you have already added the special token from the dataset
        with open(args.tokenizer.replace(".model", ".vocab"), "r") as f:
            vocab_size = len(f.readlines())

        config = GPT2Config(
            vocab_size=vocab_size,
            n_positions=args.maxlen,
            n_ctx=args.maxlen,
            n_embd=args.n_embd,
            n_layer=args.n_layer,
            n_head=args.n_head,
            activation_function=args.activation_function,
            resid_pdrop=args.resid_pdrop,
            embd_pdrop=args.embd_pdrop,
            attn_pdrop=args.attn_pdrop,
            layer_norm_epsilon=args.layer_norm_epsilon,
            initializer_range=args.initializer_range,
            summary_type=args.summary_type,
            summary_use_proj=args.summary_use_proj,
            summary_activation=args.summary_activation,
            summary_proj_to_labels=args.summary_proj_to_labels,
            summary_first_dropout=args.summary_first_dropout,
            bos_token_id=tokenizer.bos_id(),
            eos_token_id=tokenizer.eos_id(),
        )
        
        model = GPT2LMHeadModel(config)
    else:
        print("🔋 Finetuning model from huggingface's transformers")
        tokenizer = GPT2Tokenizer.from_pretrained(args.model)
def main():
    # Create the argument parser.
    parser = argparse.ArgumentParser()
    parser.add_argument("--print-checkpoint-structure", action="store_true")
    parser.add_argument(
        "path_to_checkpoint",
        type=str,
        help="Path to the checkpoint file (.zip archive or direct .pt file)",
    )
    parser.add_argument(
        "--config_file",
        default="",
        type=str,
        help="An optional config json file describing the pre-trained model.",
    )
    args = parser.parse_args()

    # Use the directory containing the checkpoint as the output location.
    basename = os.path.dirname(args.path_to_checkpoint)

    # Load the checkpoint.
    # Supporting .zip archives is optional; it is kept for backward compatibility.
    print(
        f"Extracting PyTorch state dictionary from {args.path_to_checkpoint}")
    if args.path_to_checkpoint.endswith(".zip"):
        with zipfile.ZipFile(args.path_to_checkpoint, "r") as checkpoint:
            with checkpoint.open(
                    "release/mp_rank_00/model_optim_rng.pt") as pytorch_dict:
                input_state_dict = torch.load(pytorch_dict, map_location="cpu")
    else:
        input_state_dict = torch.load(args.path_to_checkpoint,
                                      map_location="cpu")

    # Read the config, or default to the model released by NVIDIA.
    if args.config_file == "":
        # Spell out all parameters in case the defaults change.
        config = GPT2Config(
            vocab_size=50257,
            n_positions=1024,
            n_ctx=1024,
            n_embd=1024,
            n_layer=24,
            n_head=16,
            n_inner=4096,
            activation_function="gelu",  # used to be "gelu_new" in earlier versions
            resid_pdrop=0.1,
            embd_pdrop=0.1,
            attn_pdrop=0.1,
            layer_norm_epsilon=1e-5,
            initializer_range=0.02,
            summary_type="cls_index",
            summary_use_proj=True,
            summary_activation=None,
            summary_proj_to_labels=True,
            summary_first_dropout=0.1,
            scale_attn_weights=True,
            use_cache=True,
            bos_token_id=50256,
            eos_token_id=50256,
        )
    else:
        config = GPT2Config.from_json_file(args.config_file)

    # Convert.
    print("Converting")
    output_state_dict = convert_megatron_checkpoint(args, input_state_dict,
                                                    config)

    # Print the structure of converted state dict.
    if args.print_checkpoint_structure:
        recursive_print(None, output_state_dict)

    # Store the config to file.
    output_config_file = os.path.join(basename, "config.json")
    output_config = config.to_dict()
    output_config["architectures"] = ["GPT2LMHeadModel"]
    output_config["model_type"] = "gpt2"
    print(f'Saving config to "{output_config_file}"')
    with open(output_config_file, "w") as f:
        json.dump(output_config, f)

    # Store the state_dict to file.
    output_checkpoint_file = os.path.join(basename, "pytorch_model.bin")
    print(f'Saving checkpoint to "{output_checkpoint_file}"')
    torch.save(output_state_dict, output_checkpoint_file)
Example #11
from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2Config
from sentence_transformers import SentenceTransformer
from annoy import AnnoyIndex

import pandas as pd
import random
import torch
import pickle
import time
import slack
import os
import re
import wget

# Stuff for nlg
gpt2_medium_config = GPT2Config(n_ctx=1024, n_embd=1024, n_layer=24, n_head=16)
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel(gpt2_medium_config)
model.load_state_dict(torch.load('/datafiles/medium_ft.pkl'), strict=False)
print('Tokenizer and model ready..')

# More stuff for nlg
eos = [tokenizer.encoder["<|endoftext|>"]]
num_words = 50
device = torch.device('cpu')
model.to(device)
model.lm_head.weight.data = model.transformer.wte.weight.data  # tie the LM head weights to the token embeddings

# Load indexes
bert_annoy = AnnoyIndex(768, 'angular')
bert_annoy.load('/datafiles/dim768-trees13.ann')
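# A hedged usage sketch (not part of the original snippet): embed a query with a
# SentenceTransformer model and retrieve its nearest neighbours from the Annoy index.
# The model name below is an assumption; any 768-dimensional sentence encoder would match the index.
# encoder = SentenceTransformer('bert-base-nli-mean-tokens')
# query_vector = encoder.encode(['hello there'])[0]
# neighbour_ids = bert_annoy.get_nns_by_vector(query_vector, 10)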