Example #1
    def __init__(
            self,
            config,
            class_labels,
            pretrained_model_path,
            dropout=0.1,
            freeze_pretrained_part=True,
            reinitialize=False,
            n_layers=6,
    ):
        super().__init__(config, class_labels)

        if reinitialize:
            logger.info('resetting model weights')
            config = GPT2Config.from_json_file(pretrained_model_path + '/config.json')
            config = config.to_dict()
            config['n_layer'] = n_layers
            config = GPT2Config.from_dict(config)
            self.gpt2 = GPT2Model(config)
        else:
            self.gpt2 = GPT2Model.from_pretrained(pretrained_model_path)

        self.dropout = torch.nn.Dropout(dropout)
        self.fc = torch.nn.Linear(self.gpt2.config.n_embd, self.output_dim)
        if freeze_pretrained_part:
            for param in self.gpt2.parameters():
                param.requires_grad = False
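
For orientation, a minimal self-contained sketch of the same frozen-backbone pattern; the 4-class head, toy input, and last-position pooling are illustrative assumptions, not from the original repo:

import torch
from transformers import GPT2Model

# Frozen GPT-2 backbone + trainable linear head (smoke test)
gpt2 = GPT2Model.from_pretrained("gpt2")
for param in gpt2.parameters():
    param.requires_grad = False

fc = torch.nn.Linear(gpt2.config.n_embd, 4)            # 4 classes, illustrative
input_ids = torch.randint(0, gpt2.config.vocab_size, (2, 16))
hidden = gpt2(input_ids).last_hidden_state             # [batch, seq_len, n_embd]
logits = fc(hidden[:, -1])                             # pool at the last position
print(logits.shape)                                    # torch.Size([2, 4])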
Example #2
    def __init__(self, model_path, generation_type, use_finetuned=True):
        self.model_path = model_path
        self.batch_size = int(args["--batch-size"])

        self.tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

        self.MAX_LEN = {
            GENERATION_TYPE_SMALL: 20,
            GENERATION_TYPE_LARGE: 500
        }[generation_type]
        logger.info(
            f"Using {generation_type} for decoding, MAX_LEN={self.MAX_LEN}")
        if use_finetuned:
            logger.info("Using a finetuned model")
            self.config = GPT2Config.from_pretrained(self.model_path)
            model = GPT2LMHeadModel.from_pretrained(self.model_path)
            with open(f"{self.model_path}/special_tokens_map.json", "r") as f:
                special_tokens = json.load(f)
            self.tokenizer.add_special_tokens(special_tokens)
        else:
            logger.info("NOT using a finetuned model")
            model = GPT2LMHeadModel(config=GPT2Config.from_pretrained(
                pretrained_model_name_or_path=self.model_path))
        self.model = model.cuda()
        self.model.eval()
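
A sketch of how a MAX_LEN like the one above typically feeds into decoding; the prompt and the public "gpt2" checkpoint are assumptions for illustration:

import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2").eval()

input_ids = tokenizer.encode("Hello, my name is", return_tensors="pt")
with torch.no_grad():
    output = model.generate(input_ids, max_length=20)  # 20 = the "small" MAX_LEN
print(tokenizer.decode(output[0], skip_special_tokens=True))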
Example #3
    def __init__(self,
                 max_output_length=25,
                 max_input_length=300,
                 device='cpu',
                 tokenizer_type='gpt2',
                 bpe_model="",
                 starter_model=None):
        if tokenizer_type == "gpt2":
            self.tokenizer = utils_tokenizer.GPT2Tokenizer()
            config = GPT2Config.from_pretrained("gpt2")

        elif tokenizer_type == "bpecap":
            self.tokenizer = utils_tokenizer.BPETokenizer(bpe_model)
            config = GPT2Config.from_dict({
                "finetuning_task": None,
                "initializer_range": 0.02,
                "layer_norm_epsilon": 1e-05,
                "n_ctx": 1024,
                "n_embd": 768,
                "n_head": 12,
                "n_layer": 12,
                "n_positions": 1024,
                "num_labels": 1,
                "resid_pdrop": 0.1,
                "use_bfloat16": False,
                "vocab_size": self.tokenizer.vocab_size
            })
        else:
            print("Tokenizer unrecognized. Should be gpt2 or bpecap.")
            exit()

        self.model = GPT2LMHeadModel(config)

        self.model.to(device)
        self.device = device
        if starter_model is not None:
            self.reload(starter_model)

        self.max_output_length = max_output_length
        self.max_input_length = max_input_length

        self.model.train()
        self.mode = "train"
Example #4
def build_model(args):
    if args.pretrained_path == '':
        config = GPT2Config.from_json_file(args.model_config)
        model = GPT2LMHeadModel(config)
        tokenizer = BertTokenizerFast(args.vocab)
        # XXX: must call this, or special tokens inside a string won't tokenize to a single token
        tokenizer.sanitize_special_tokens()
        info = None
    else:
        config = GPT2Config.from_pretrained(args.pretrained_path)
        model, info = GPT2LMHeadModel.from_pretrained(args.pretrained_path,
                                                      config=config,
                                                      output_loading_info=True)
        tokenizer = BertTokenizerFast.from_pretrained(args.pretrained_path)
    return model, tokenizer, info
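
With output_loading_info=True, transformers returns a loading-info dict alongside the model; a sketch of inspecting it (assuming args was parsed as in the example):

model, tokenizer, info = build_model(args)
if info is not None:
    # keys populated by transformers when output_loading_info=True
    print("missing keys:", info["missing_keys"])
    print("unexpected keys:", info["unexpected_keys"])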
Example #5
    def __init__(self, config, dataset):
        super(GPT2Seq, self).__init__(config, dataset)
        self.pretrained_model_path = config['pretrained_model_path']
        self.tokenizer = GPT2TokenizerFast.from_pretrained(
            self.pretrained_model_path, pad_token='[PAD]')

        self.configuration = GPT2Config.from_pretrained(
            self.pretrained_model_path, pad_token_id=self.padding_token_idx)

        self.model = GPT2LMHeadModel.from_pretrained(
            self.pretrained_model_path, config=self.configuration)
        self.model.resize_token_embeddings(len(self.tokenizer))

        if config['task_type'] == "summarization":
            self.task_text = "TL;DR:"
        elif config['task_type'] == "translation":
            self.task_text = "story:"
        elif config['task_type'] == "multi_dialog":
            self.task_text = "question:"
        else:
            raise NotImplementedError(
                "Only summarization, translation, and multi_dialog are supported.")

        self.loss = nn.CrossEntropyLoss(ignore_index=self.padding_token_idx,
                                        reduction='none')
Example #6
def main():

    config = GPT2Config(
        vocab_size=30000,
        n_positions=1024,
        n_ctx=1024,
        n_embd=2560,
        n_layer=32,
        n_head=32,
        n_inner=4*2560,
        activation_function="gelu_new",
        resid_pdrop=0.1,
        embd_pdrop=0.1,
        attn_pdrop=0.1,
        layer_norm_epsilon=1e-5,
        initializer_range=0.02,
        summary_type="cls_index",
        summary_use_proj=True,
        summary_activation=None,
        summary_proj_to_labels=True,
        summary_first_dropout=0.1,
        bos_token_id=30000,
        eos_token_id=30000,
        gradient_checkpointing=False,
    )

    print("initializing model")
    model = GPT2LMHeadModel(config)

    convert(
        model=model,
        m0_path="model-v1/80000/mp_rank_00_model_states.pt",
        m1_path="model-v1/80000/mp_rank_01_model_states.pt",
        save_path="model/CPM/",
    )
Example #7
def load_model(train_steps, num_warmup_steps):
    try:  # try to load a finetuned model saved locally.
        tokenizer = load_tokenizer()
        config = GPT2Config.from_pretrained(configs.model_path,
                                            return_dict=False)
        model = TFGPT2LMHeadModel.from_pretrained(configs.model_path,
                                                  return_dict=False)
        print("model loaded from local!")
    except Exception:
        tokenizer = BertTokenizer.from_pretrained(
            "mymusise/gpt2-medium-chinese")
        model = TFGPT2LMHeadModel.from_pretrained(
            "mymusise/gpt2-medium-chinese", return_dict=False)
        print("model loaded from remote!")

    loss = model.compute_loss
    optimizer = nlp.optimization.create_optimizer(
        5e-5, num_train_steps=train_steps, num_warmup_steps=num_warmup_steps)
    metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

    model.compile(
        optimizer=optimizer,
        loss=[loss, *[None] * model.config.n_layer],
        # metrics=[metric]
    )
    return model
Example #8
    def __init__(self, model_path):
        config = GPT2Config.from_pretrained(model_path)
        config.output_hidden_states = True
        config.output_attentions = True
        self.model = GPT2LMHeadModel.from_pretrained(model_path, config=config)
        self.model.eval()
        self.context = ''
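
A self-contained sketch of what the two config flags above buy at inference time, assuming the public "gpt2" checkpoint; with both enabled, the forward pass also returns per-layer hidden states and attention maps:

import torch
from transformers import GPT2Config, GPT2LMHeadModel, GPT2Tokenizer

config = GPT2Config.from_pretrained("gpt2")
config.output_hidden_states = True
config.output_attentions = True
model = GPT2LMHeadModel.from_pretrained("gpt2", config=config).eval()

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
input_ids = tokenizer.encode("The cat sat", return_tensors="pt")
with torch.no_grad():
    outputs = model(input_ids)
print(len(outputs.hidden_states))  # n_layer + 1 (embeddings + each block)
print(len(outputs.attentions))     # n_layer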
Example #9
    def load(cls, pretrained_model_name_or_path, language=None, **kwargs):
        """
        Load a pretrained model by supplying
        * the name of a remote model on s3 ("gpt2" ...)
        * OR a local path of a model trained via transformers ("some_dir/huggingface_model")
        * OR a local path of a model trained via FARM ("some_dir/farm_model")
        :param pretrained_model_name_or_path: The path of the saved pretrained model or its name.
        :type pretrained_model_name_or_path: str
        """

        gpt2 = cls()
        if "farm_lm_name" in kwargs:
            gpt2.name = kwargs["farm_lm_name"]
        else:
            gpt2.name = pretrained_model_name_or_path
        # We need to differentiate between loading model using FARM format and Pytorch-Transformers format
        farm_lm_config = Path(
            pretrained_model_name_or_path) / "language_model_config.json"
        if os.path.exists(farm_lm_config):
            # FARM style
            gpt2_config = GPT2Config.from_pretrained(farm_lm_config)
            farm_lm_model = Path(
                pretrained_model_name_or_path) / "language_model.bin"
            gpt2.model = GPT2Model.from_pretrained(farm_lm_model,
                                                   config=gpt2_config,
                                                   **kwargs)
            gpt2.language = gpt2.model.config.language
        else:
            # Pytorch-transformer Style
            gpt2.model = GPT2Model.from_pretrained(
                str(pretrained_model_name_or_path), **kwargs)
            gpt2.language = cls._get_or_infer_language_from_name(
                language, pretrained_model_name_or_path)
        return gpt2
Example #10
    def __init__(
            self,
            batch_size,
            epochs,
            t_total=100000,
            config_path="config/model_config.json",
            data_path="data/train.json",
            valid_examples=100,
            vocab_path="vocab/vocab.txt",
            max_length=1024,
            warm_up_steps=0,
            lr=1e-4,
    ):
        super(Net, self).__init__()
        self.batch_size = batch_size
        self.epochs = epochs
        self.t_total = t_total
        self.warm_up_steps = warm_up_steps
        self.lr = lr
        self.model_name = "bert_pretrained_model"
        self.config = GPT2Config.from_json_file(config_path)
        self.model = GPT2LMHeadModel(config=self.config)
        self.data = [json.loads(line.strip()) for line in open(data_path)]
        self.dataset_train = DS(self.data[:-valid_examples],
                                vocab_path=vocab_path,
                                max_length=max_length)
        self.dataset_valid = DS(self.data[-valid_examples:],
                                vocab_path=vocab_path,
                                max_length=max_length)
Example #11
    def __init__(self, config):
        medium_config = GPT2Config(n_embd=1024, n_layer=24, n_head=16)
        model = GPT2LMHeadModel(medium_config)

        print("Step 1/3: Downloading weights [823 MB]...")
        wget.download(
            "https://convaisharables.blob.core.windows.net/lsp/multiref/medium_ft.pkl",
            "/tmp/medium_ft.pkl",
        )

        print("Step 2/3: Loading weights...")
        weights = torch.load("/tmp/medium_ft.pkl")
        weights["lm_head.weight"] = weights["lm_head.decoder.weight"]
        weights.pop("lm_head.decoder.weight", None)

        print("Step 3/3: Loading a model...")
        model.load_state_dict(weights)

        device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"using device: {device}")
        model.to(device)
        model.eval()

        self.device = device
        self.model = model
        self.tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
        print("Model is ready!")
Example #12
def main():
    parser = argparse.ArgumentParser()

    ## Parameters (all optional; defaults are provided)
    parser.add_argument("--config_path",
                        default="../../models/gpt2/gpt2-config.json",
                        type=str,
                        required=False)
    parser.add_argument("--model_path",
                        default="../../models/gpt2/gpt2-pytorch_model.bin",
                        type=str,
                        required=False)
    parser.add_argument("--vocab_path",
                        default="../../models/gpt2/gpt2-vocab.json",
                        type=str,
                        required=False)
    parser.add_argument("--merges_path",
                        default="../../models/gpt2/gpt2-merges.txt",
                        type=str,
                        required=False)
    parser.add_argument(
        "--sentence",
        default="In this article, I am excited to take you through",
        type=str,
        required=False)
    args = parser.parse_args()

    config = GPT2Config.from_pretrained(args.config_path)
    model = GPT2LMHeadModel.from_pretrained(args.model_path, config=config)
    tokenizer = GPT2Tokenizer(args.vocab_path, args.merges_path)
    # logging.basicConfig(filename="default.txt", level=logging.DEBUG, filemode='w')
    # gpt2_generate_greedy(model, tokenizer, sentence=sys.argv[1])
    gpt2_generate_beam_search(model, tokenizer, sentence=args.sentence)
Example #13
def build_model_classifier(model_dir, device1, device2):
    config = GPT2Config.from_pretrained('gpt2')  # or 'gpt2-medium'
    config.summary_first_dropout = 0.2
    config.summary_type = "cls_index"

    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")#torch.load(tokenizer_dir)
    tokenizer.add_special_tokens({'cls_token': '[CLS]'})
    # device1 = torch.device("cuda:0")
    # device2 = torch.device("cuda:1")
    model_A, model_B = load_model(cfg, "small", tokenizer, device1, device2)

    # pdb.set_trace()
    print("model_clf device\n\n\n\n\n\n")
    print(model_A.device)
    print(model_B.device)
    print("here\n\n\n")
    which_to_train = ["A", "B", "TF"]
    model_clf = ModelClassifier(config=config, which_to_train=which_to_train, 
                                model_A=model_A, model_B=model_B,
                                tokenizer=tokenizer, device1=device1, device2=device2)
    
    model_clf.load_model(all_model_dir=model_dir)

    return model_clf
Example #14
    def __init__(self, start_index):
        super().__init__()

        config = GPT2Config(output_hidden_states=True)
        self.gpt2 = GPT2LMHeadModel.from_pretrained('gpt2', config=config)        
        self.vocab_size = self.gpt2.config.vocab_size
        self.start_index = start_index
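
A quick, illustrative check of why passing a freshly constructed GPT2Config to from_pretrained('gpt2') loads cleanly: the default config already matches the architecture of the 'gpt2' checkpoint.

from transformers import GPT2Config

config = GPT2Config(output_hidden_states=True)
print(config.n_layer, config.n_embd, config.n_head)  # 12 768 12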
Example #15
    def __init__(self, train_dataloader, val_dataloader=None):
        """
        Initialises Trainer by defining model and GPU

        Args:
        train_dataloader: torch.utils.data.DataLoader
            Dataloader to train model upon, obtained from Dataloader class
        val_dataloader: Optional torch.utils.data.DataLoader
            Dataloader to validate model upon, obtained from DataLoader class,
            not required if Trainer is only used for final training
        """

        # Create GPT2 Config
        config = GPT2Config.from_pretrained("gpt2")

        # Load language head model and input default config
        model = GPT2LMHeadModel.from_pretrained("gpt2", config=config)

        # Recreate tokenizer
        tokenizer = GPT2Tokenizer.from_pretrained('gpt2',
                                                  bos_token='<|startoftext|>',
                                                  eos_token='<|endoftext|>',
                                                  pad_token='<|pad|>')

        # Tell model we have added bos, eos, pad token
        model.resize_token_embeddings(len(tokenizer))

        # Tell pytorch to run this model on the GPU.
        device = torch.device("cuda")
        model.cuda()

        self.model = model
        self.device = device
        self.train_dataloader = train_dataloader
        self.val_dataloader = val_dataloader
Example #16
    def __init__(self, args, task):
        super().__init__(task.target_dictionary)

        if not has_hf:
            raise ImportError(
                '\n\nPlease install huggingface/transformers with:'
                '\n\n  pip install transformers'
                '\n\nOr to make local edits, install the submodule:'
                '\n\n  git submodule update --init '
                'fairseq/models/huggingface/transformers')

        config = GPT2Config(
            vocab_size=len(task.target_dictionary),
            n_positions=args.max_target_positions + 1,
            n_ctx=args.max_target_positions,
            n_embd=args.embed_dim,
            n_layer=args.num_layers,
            n_head=args.num_attention_heads,
            resid_pdrop=args.dropout,
            embd_pdrop=args.dropout,
            attn_pdrop=args.attention_dropout,
            layer_norm_epsilon=1e-6,
        )
        self.model = GPT2LMHeadModel(config)

        # set zero embedding for padding symbol
        self.pad_idx = task.target_dictionary.pad()
        self.model.transformer.wte.weight.data[self.pad_idx].zero_()
        self.model.transformer.wpe.weight.data[0].zero_()
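
A minimal sketch of the pad-embedding zeroing above; the tiny config and pad index are illustrative:

import torch
from transformers import GPT2Config, GPT2LMHeadModel

config = GPT2Config(vocab_size=1000, n_positions=128, n_embd=64, n_layer=2, n_head=2)
model = GPT2LMHeadModel(config)
pad_idx = 0  # illustrative padding index
model.transformer.wte.weight.data[pad_idx].zero_()
assert torch.all(model.transformer.wte.weight.data[pad_idx] == 0)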
Example #17
    def get_config(self,
                   gradient_checkpointing=False,
                   scale_attn_by_inverse_layer_idx=False,
                   reorder_and_upcast_attn=False):
        return GPT2Config(
            vocab_size=self.vocab_size,
            n_embd=self.hidden_size,
            n_layer=self.num_hidden_layers,
            n_head=self.num_attention_heads,
            n_inner=self.intermediate_size,
            activation_function=self.hidden_act,
            resid_pdrop=self.hidden_dropout_prob,
            attn_pdrop=self.attention_probs_dropout_prob,
            n_positions=self.max_position_embeddings,
            n_ctx=self.max_position_embeddings,
            type_vocab_size=self.type_vocab_size,
            initializer_range=self.initializer_range,
            use_cache=True,
            bos_token_id=self.bos_token_id,
            eos_token_id=self.eos_token_id,
            pad_token_id=self.pad_token_id,
            gradient_checkpointing=gradient_checkpointing,
            scale_attn_by_inverse_layer_idx=scale_attn_by_inverse_layer_idx,
            reorder_and_upcast_attn=reorder_and_upcast_attn,
        )
Example #18
    def prepare_config_and_inputs(self):
        input_ids = ids_tensor([self.batch_size, self.seq_length],
                               self.vocab_size)

        input_mask = None
        if self.use_input_mask:
            input_mask = ids_tensor([self.batch_size, self.seq_length],
                                    vocab_size=2)

        token_type_ids = None
        if self.use_token_type_ids:
            token_type_ids = ids_tensor([self.batch_size, self.seq_length],
                                        self.type_vocab_size)

        mc_token_ids = None
        if self.use_mc_token_ids:
            mc_token_ids = ids_tensor([self.batch_size, self.num_choices],
                                      self.seq_length)

        sequence_labels = None
        token_labels = None
        choice_labels = None
        if self.use_labels:
            sequence_labels = ids_tensor([self.batch_size],
                                         self.type_sequence_label_size)
            token_labels = ids_tensor([self.batch_size, self.seq_length],
                                      self.num_labels)
            choice_labels = ids_tensor([self.batch_size], self.num_choices)

        config = GPT2Config(
            vocab_size=self.vocab_size,
            n_embd=self.hidden_size,
            n_layer=self.num_hidden_layers,
            n_head=self.num_attention_heads,
            # intermediate_size=self.intermediate_size,
            # hidden_act=self.hidden_act,
            # hidden_dropout_prob=self.hidden_dropout_prob,
            # attention_probs_dropout_prob=self.attention_probs_dropout_prob,
            n_positions=self.max_position_embeddings,
            n_ctx=self.max_position_embeddings,
            # type_vocab_size=self.type_vocab_size,
            # initializer_range=self.initializer_range
            bos_token_id=self.bos_token_id,
            eos_token_id=self.eos_token_id,
        )

        head_mask = ids_tensor(
            [self.num_hidden_layers, self.num_attention_heads], 2)

        return (
            config,
            input_ids,
            input_mask,
            head_mask,
            token_type_ids,
            mc_token_ids,
            sequence_labels,
            token_labels,
            choice_labels,
        )
Example #19
    def __init__(self, args, task):
        try:
            from transformers import GPT2Config, GPT2LMHeadModel
        except ImportError:
            raise ImportError(
                "\n\nPlease install huggingface/transformers with:"
                "\n\n  pip install transformers")

        super().__init__(task.target_dictionary)

        config = GPT2Config(
            vocab_size=len(task.target_dictionary),
            n_positions=args.max_target_positions + 1,
            n_ctx=args.max_target_positions,
            n_embd=args.embed_dim,
            n_layer=args.num_layers,
            n_head=args.num_attention_heads,
            resid_pdrop=args.dropout,
            embd_pdrop=args.dropout,
            attn_pdrop=args.attention_dropout,
            layer_norm_epsilon=1e-6,
        )
        self.model = GPT2LMHeadModel(config)

        # set zero embedding for padding symbol
        self.pad_idx = task.target_dictionary.pad()
        self.model.transformer.wte.weight.data[self.pad_idx].zero_()
        self.model.transformer.wpe.weight.data[0].zero_()
Example #20
    def __init__(self, config, dataset):
        super(GPT2, self).__init__(config, dataset)

        self.pretrained_model_path = config['pretrained_model_path']
        self.tokenizer = GPT2Tokenizer.from_pretrained(
            self.pretrained_model_path,
            bos_token=dataset.sos_token,
            eos_token=dataset.eos_token,
            pad_token=dataset.padding_token)

        self.sos_token = self.tokenizer.bos_token
        self.eos_token = self.tokenizer.eos_token
        self.sos_token_idx = self.tokenizer.bos_token_id
        self.eos_token_idx = self.tokenizer.eos_token_id
        self.padding_token_idx = self.tokenizer.pad_token_id
        self.max_seq_length = config['max_seq_length']

        self.configuration = GPT2Config.from_pretrained(
            self.pretrained_model_path,
            bos_token_id=self.sos_token_idx,
            eos_token_id=self.eos_token_idx,
            pad_token_id=self.padding_token_idx)

        self.decoder = GPT2LMHeadModel.from_pretrained(
            self.pretrained_model_path, config=self.configuration)
        self.decoder.resize_token_embeddings(len(self.tokenizer))

        self.loss = nn.CrossEntropyLoss(ignore_index=self.padding_token_idx,
                                        reduction='none')
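
The same add-pad-token-then-resize pattern, sketched against the public 'gpt2' checkpoint (the '<|pad|>' string is an illustrative choice):

from transformers import GPT2LMHeadModel, GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2", pad_token="<|pad|>")
model = GPT2LMHeadModel.from_pretrained("gpt2")
model.resize_token_embeddings(len(tokenizer))  # 50257 + 1 new pad token
print(len(tokenizer), model.transformer.wte.weight.shape[0])  # 50258 50258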
Example #21
    def test_TFGPT2(self):
        if enable_full_transformer_test:
            from transformers import GPT2Config, TFGPT2Model, TFGPT2LMHeadModel, TFGPT2DoubleHeadsModel
            model_list = [
                TFGPT2Model, TFGPT2LMHeadModel, TFGPT2DoubleHeadsModel
            ]
        else:
            from transformers import GPT2Config, TFGPT2Model
            model_list = [TFGPT2Model]
        # pretrained_weights = 'gpt2'
        tokenizer_file = 'gpt2_gpt2.pickle'
        tokenizer = self._get_tokenzier(tokenizer_file)
        text, inputs, inputs_onnx = self._prepare_inputs(tokenizer)
        config = GPT2Config()
        for model_instance_ in model_list:
            keras.backend.clear_session()
            model = model_instance_(config)
            model._set_inputs(inputs)
            predictions_original = model(inputs)
            predictions = [predictions_original[0]] + list(
                v_.numpy() for v_ in predictions_original[1])
            onnx_model = mock_keras2onnx.convert_keras(model, model.name)
            self.assertTrue(
                run_onnx_runtime(onnx_model.graph.name,
                                 onnx_model,
                                 inputs_onnx,
                                 predictions,
                                 self.model_files,
                                 rtol=1.e-2,
                                 atol=1.e-4))
Example #22
    def build_model(self):
        """Build the GPT-2 generation model."""
        # Initialize a BERT tokenizer over the vocabulary
        self.tokenizer = BertTokenizer(vocab_file=self.args.vocab_path)
        # Size of the tokenizer's vocabulary
        self.vocab_size = len(self.tokenizer)

        self.pad_id = self.tokenizer.convert_tokens_to_ids(PAD)

        if self.args.pretrained_model:
            # A pretrained GPT-2 model was specified
            model = GPT2LMHeadModel.from_pretrained(self.args.pretrained_model)
        else:
            # No pretrained model given: initialize from a config file
            # (GPT2Config(path) would misread the path as vocab_size)
            model_config = GPT2Config.from_json_file(self.args.model_config)
            model = GPT2LMHeadModel(config=model_config)

        # Resize the GPT-2 embedding matrix to the tokenizer's vocabulary size
        model.resize_token_embeddings(self.vocab_size)

        print('model config:\n{}'.format(model.config.to_json_string()))

        return model, model.config.to_dict().get("n_ctx")
Example #23
def model_fn(model_dir):
    logger.info('Loading the model.')

    vocab_file_path = os.path.join(model_dir, 'vocab.json')
    merge_file_path = os.path.join(model_dir, 'merges.txt')
    model_file_path = os.path.join(model_dir, 'lyric_model.bin')

    tokenizer = MyTokenizer(vocab_file_path, merge_file_path)
    bos = tokenizer.convert_tokens_to_ids('<s>')
    eos = tokenizer.convert_tokens_to_ids('</s>')
    pad = tokenizer.convert_tokens_to_ids('<pad>')
    unk = tokenizer.convert_tokens_to_ids('<unk>')

    config = GPT2Config(vocab_size=52003,
                        resid_pdrop=0,
                        embd_pdrop=0,
                        attn_pdrop=0,
                        summary_first_dropout=0)

    model = GPT2LMHeadModel(config)

    model.load_state_dict(torch.load(model_file_path, map_location=device),
                          strict=False)
    model.to(device)

    return model, tokenizer
Example #24
def load_model(target_folder, config):
    # Parse parameters
    model_size = config.get('model', 'model_size')
    no_cuda = config.getboolean('model', 'no_cuda')

    logger.info("Loading the model...")
    device = torch.device(
        "cuda" if torch.cuda.is_available() and not no_cuda else "cpu")
    # Tokenizer
    tokenizer = GPT2Tokenizer(os.path.join(target_folder, 'vocab.json'),
                              os.path.join(target_folder, 'merges.txt'))
    # Config
    config = GPT2Config.from_json_file(
        os.path.join(target_folder, 'config.json'))
    # Weights
    state_dict_path = glob(os.path.join(target_folder, '*.pkl'))[0]
    state_dict = torch.load(state_dict_path, map_location=device)
    if model_size == 'small':
        for key in list(state_dict.keys()):
            state_dict[key.replace('module.', '')] = state_dict.pop(key)
    state_dict['lm_head.weight'] = state_dict['lm_head.decoder.weight']
    state_dict.pop("lm_head.decoder.weight", None)
    # Model
    model = GPT2LMHeadModel(config)
    model.load_state_dict(state_dict)
    model.to(device)
    model.eval()
    return model, tokenizer
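
The 'module.' stripping above is the standard fix for checkpoints saved from a torch.nn.DataParallel-wrapped model; a generic sketch (helper name hypothetical):

def strip_data_parallel_prefix(state_dict):
    # Drop the 'module.' prefix that DataParallel adds to every parameter name
    return {(key[len("module."):] if key.startswith("module.") else key): value
            for key, value in state_dict.items()}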
Example #25
    def __init__(self, hparams):
        super().__init__()

        self.hparams = hparams
        self.d = None
        self.tokenizer = None

        # hotfixes
        if 'unfreeze' not in hparams:
            self.hparams.unfreeze = False
        if 'lang' not in hparams:
            self.hparams.lang = 'nld'

        autofix_paths(self.hparams)

        # GPT with LM head and correct embedding size
        with open(Path('data') / self.hparams.lang / 'config.json') as f:
            cfg = json.load(f)

        if self.hparams.unfreeze:
            self.n_unfreeze = 0
            if self.hparams.resume_from_checkpoint is not None:
                print('Resuming from checkpoint: unfreezing all layers')
                self.n_unfreeze = None

        config = GPT2Config.from_pretrained(self.hparams.pretrained_path,
                                            **cfg)
        if self.hparams.unfreeze and self.n_unfreeze is not None:
            config.torchscript = True
        self.m = GPT2LMHeadModel.from_pretrained(self.hparams.pretrained_path,
                                                 config=config)

        # Resize vocab
        self.m.resize_token_embeddings(self.hparams.vocab_size)
Example #26
    def _create_model(self, precision):
        """Construct the model for benchmarking.

        Args:
            precision (Precision): precision of model and input data, such as float32, float16.
        """
        self._config = GPT2Config(n_embd=self._args.hidden_size,
                                  n_layer=self._args.num_hidden_layers,
                                  n_head=self._args.num_attention_heads)

        try:
            self._model = GPT2BenchmarkModel(self._config,
                                             self._args.num_classes)
            self._model = self._model.to(dtype=getattr(torch, precision.value))
            if self._gpu_available:
                self._model = self._model.cuda()
        except BaseException as e:
            logger.error(
                'Create model with specified precision failed - model: {}, precision: {}, message: {}.'
                .format(self._name, precision, str(e)))
            return False

        self._target = torch.LongTensor(self._args.batch_size).random_(
            self._args.num_classes)
        if self._gpu_available:
            self._target = self._target.cuda()

        return True
Example #27
    def __init__(self, config: Munch):
        r""" Init a new GPT2 synapse module.

            Args:
                config (:obj:`munch.Munch`, `required`): 
                    munched config class.
        """
        super(GPT2LMSynapse, self).__init__(config=config)
        if config is None:
            config = GPT2LMSynapse.build_config()

        # Build hugging face config.
        huggingface_config = GPT2Config(
            vocab_size=bittensor.__vocab_size__,
            n_embd=bittensor.__network_dim__,
            n_layer=config.synapse.n_layer,
            n_head=config.synapse.n_head,
            n_inner=config.synapse.n_inner,
            activation_function=config.synapse.activation_function,
            resid_pdrop=config.synapse.resid_pdrop,
            embd_pdrop=config.synapse.embd_pdrop,
            attn_pdrop=config.synapse.attn_pdrop,
            layer_norm_epsilon=config.synapse.layer_norm_epsilon,
            initializer_range=config.synapse.initializer_range,
            summary_type=config.synapse.summary_type,
            summary_use_proj=config.synapse.summary_use_proj,
            summary_activation=config.synapse.summary_activation,
            summary_proj_to_labels=config.synapse.summary_proj_to_labels,
            summary_first_dropout=config.synapse.summary_first_dropout,
        )

        # encoder_layer: encodes tokenized sequences to network dim.
        # [batch_size, sequence_len] -> [batch_size, sequence_len, bittensor.__network_dim__]
        self.transformer = GPT2Model(huggingface_config)

        # pooler_layer: pools the hidden units for use by the pkm dendrite rpc query.
        # [batch_size, bittensor.__network_dim__, sequence_len] -> [batch_size, bittensor.__network_dim__]
        self.pooler = GPT2Pooler(huggingface_config)

        # router: (PKM layer) queries network using pooled embeddings as context.
        # [batch_size, bittensor.__network_dim__] -> topk * [batch_size, bittensor.__network_dim__]
        self.router = PKMRouter(config, query_dim=bittensor.__network_dim__)

        # hidden_layer: transforms context and encoding to network_dim hidden units.
        # [batch_size, sequence_dim, 2 * bittensor.__network_dim__] -> [batch_size, sequence_len, bittensor.__network_dim__]
        self.hidden_layer = nn.Linear(bittensor.__network_dim__,
                                      bittensor.__network_dim__)

        # target_layer: maps from hidden layer to vocab dimension for each token. Used by MLM loss.
        # [batch_size, sequence_len, bittensor.__network_dim__] -> [batch_size, sequence_len, bittensor.__vocab_size__]
        self.target_layer = nn.Linear(bittensor.__network_dim__,
                                      bittensor.__vocab_size__,
                                      bias=False)

        # Loss function: MLM cross-entropy loss.
        # predicted: [batch_size, sequence_len, 1], targets: [batch_size, sequence_len, 1] -> [1]
        self.loss_fct = nn.CrossEntropyLoss()

        self.to(self.device)
Example #28
    def __init__(self):
        super().__init__()

        self.tokenizer = BertTokenizer(vocab_file=FLAGS.vocab_path)

        self.config = GPT2Config.from_json_file(FLAGS.model_config)

        self.model = GPT2LMHeadModel(config=self.config)
Example #29
def gpt2_model(freeze=True, configuration=None):
    if configuration is None:
        configuration = GPT2Config()
    model = TFGPT2Model.from_pretrained('gpt2', config=configuration)
    if freeze:
        for layer in model.layers:
            layer.trainable = False
    return model
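
A quick, illustrative check that the freeze took effect: a frozen backbone should report no trainable weights.

frozen = gpt2_model(freeze=True)
print(len(frozen.trainable_weights))  # expect 0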
Example #30
def convert_gpt2_checkpoint_to_pytorch(gpt2_checkpoint_path, full,
                                       gpt2_config_file,
                                       pytorch_dump_folder_path):
    #putting requirements here so users can see usage info before it errors out on missing modules
    from io import open
    from shutil import copyfile
    import logging
    logging.basicConfig(level=logging.INFO)
    from pathlib import Path
    import torch
    #WEIGHTS_NAME = "pytorch_model.bin"
    #CONFIG_NAME = "config.json"
    from transformers import (
        CONFIG_NAME,
        WEIGHTS_NAME,
        GPT2Config,
        GPT2Model,
        load_tf_weights_in_gpt2,
    )
    gpt2_checkpoint_path = Path(gpt2_checkpoint_path)
    print(gpt2_checkpoint_path.name)

    if pytorch_dump_folder_path == '':
        prefix = '32BIT-' if full else '16BIT-'
        pytorch_dump_folder_path = 'pytorch-' + prefix + gpt2_checkpoint_path.name
    pytorch_dump_folder_path = Path(pytorch_dump_folder_path)

    pytorch_dump_folder_path.mkdir(exist_ok=True)

    # Construct model
    if gpt2_config_file == "":
        # Default GPT2Config() doesn't seem to work here; use the hparams.json
        # file included with the TF checkpoint instead.
        # config = GPT2Config()
        gpt2_config_file = gpt2_checkpoint_path / 'hparams.json'

    config = GPT2Config.from_json_file(gpt2_config_file)
    model = GPT2Model(config)

    # Load weights from numpy
    load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path)
    if not full:
        model.half()

    # Save pytorch-model
    pytorch_weights_dump_path = pytorch_dump_folder_path / WEIGHTS_NAME
    pytorch_config_dump_path = pytorch_dump_folder_path / CONFIG_NAME
    print("Save PyTorch model to {}".format(str(pytorch_weights_dump_path)))

    torch.save(model.state_dict(), pytorch_weights_dump_path)

    print("Save configuration file to: " + str(pytorch_config_dump_path))
    with pytorch_config_dump_path.open("w", encoding="utf-8") as f:
        f.write(config.to_json_string())

    copyfile(gpt2_checkpoint_path / 'vocab.bpe',
             pytorch_dump_folder_path / 'merges.txt')
    copyfile(gpt2_checkpoint_path / 'encoder.json',
             pytorch_dump_folder_path / 'vocab.json')