Example #1
    def __init__(
            self,
            config,
            class_labels,
            pretrained_model_path,
            dropout=0.1,
            freeze_pretrained_part=True,
            reinitialize=False,
            n_layers=6,
    ):
        super().__init__(config, class_labels)

        if reinitialize:
            logger.info('resetting model weights')
            config = GPT2Config.from_json_file(pretrained_model_path + '/config.json')
            config = config.to_dict()
            config['n_layer'] = n_layers
            config = GPT2Config.from_dict(config)
            self.gpt2 = GPT2Model(config)
        else:
            self.gpt2 = GPT2Model.from_pretrained(pretrained_model_path)

        self.dropout = torch.nn.Dropout(dropout)
        self.fc = torch.nn.Linear(self.gpt2.config.n_embd, self.output_dim)
        if freeze_pretrained_part:
            for param in self.gpt2.parameters():
                param.requires_grad = False
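The snippet above only defines the constructor. A hedged sketch of the forward pass such a classification head might pair with (an illustration under assumptions, not part of the original project):

    # Hypothetical forward pass (not in the original snippet): pool the hidden
    # state of the final token and project it to the class logits.
    def forward(self, input_ids, attention_mask=None):
        hidden_states = self.gpt2(input_ids, attention_mask=attention_mask)[0]  # (batch, seq, n_embd)
        pooled = hidden_states[:, -1, :]              # last-token pooling (an assumption)
        return self.fc(self.dropout(pooled))          # (batch, output_dim)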
Example #2
def load_model(target_folder, config):
    # Parse parameters
    model_size = config.get('model', 'model_size')
    no_cuda = config.getboolean('model', 'no_cuda')

    logger.info("Loading the model...")
    device = torch.device(
        "cuda" if torch.cuda.is_available() and not no_cuda else "cpu")
    # Tokenizer
    tokenizer = GPT2Tokenizer(os.path.join(target_folder, 'vocab.json'),
                              os.path.join(target_folder, 'merges.txt'))
    # Config
    config = GPT2Config.from_json_file(
        os.path.join(target_folder, 'config.json'))
    # Weights
    state_dict_path = glob(os.path.join(target_folder, f'*.pkl'))[0]
    state_dict = torch.load(state_dict_path, map_location=device)
    if model_size == 'small':
        for key in list(state_dict.keys()):
            state_dict[key.replace('module.', '')] = state_dict.pop(key)
    state_dict['lm_head.weight'] = state_dict['lm_head.decoder.weight']
    state_dict.pop("lm_head.decoder.weight", None)
    # Model
    model = GPT2LMHeadModel(config)
    model.load_state_dict(state_dict)
    model.to(device)
    model.eval()
    return model, tokenizer
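A minimal usage sketch of the loader above, greedily decoding a single next token (the folder name and config file are hypothetical; `config` is assumed to be a configparser.ConfigParser whose [model] section provides model_size and no_cuda):

import configparser
import torch

cfg = configparser.ConfigParser()
cfg.read('model.cfg')                                    # hypothetical config file
model, tokenizer = load_model('dialogpt-medium', cfg)    # hypothetical checkpoint folder
device = next(model.parameters()).device
input_ids = torch.tensor([tokenizer.encode("How are you?")], device=device)
with torch.no_grad():
    logits = model(input_ids)[0]                         # (batch, seq_len, vocab_size)
next_id = int(logits[0, -1].argmax())
print(tokenizer.decode([next_id]))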
Example #3
 def __init__(
     self,
     batch_size,
     epochs,
     t_total=100000,
     config_path="config/model_config.json",
     data_path="data/train.json",
     valid_examples=100,
     vocab_path="vocab/vocab.txt",
     max_length=1024,
     warm_up_steps=0,
     lr=1e-4,
 ):
     super(Net, self).__init__()
     self.batch_size = batch_size
     self.epochs = epochs
     self.t_total = t_total
     self.warm_up_steps = warm_up_steps
     self.lr = lr
     self.model_name = "bert_pretrained_model"
     self.config = GPT2Config.from_json_file(config_path)
     self.model = GPT2LMHeadModel(config=self.config)
     self.data = [json.loads(line.strip()) for line in open(data_path)]
     self.dataset_train = DS(self.data[:-valid_examples],
                             vocab_path=vocab_path,
                             max_length=max_length)
     self.dataset_valid = DS(self.data[-valid_examples:],
                             vocab_path=vocab_path,
                             max_length=max_length)
Example #4
    def build_model(self):
        """创建GPT-2生成模型
        """
        # 使用bert tokenizer # 初始化tokenizer
        self.tokenizer = BertTokenizer(vocab_file=self.args.vocab_path)
        # temp = self.tokenizer.convert_tokens_to_ids('')
        # print(self.tokenizer.convert_ids_to_tokens(temp))
        # size of the tokenizer's vocabulary
        self.vocab_size = len(self.tokenizer)

        self.pad_id = self.tokenizer.convert_tokens_to_ids(PAD)

        if self.args.pretrained_model:
            # If a pretrained GPT-2 model is specified
            model = GPT2LMHeadModel.from_pretrained(self.args.pretrained_model)
        else:
            # Otherwise initialize the model from a config file
            model_config = GPT2Config.from_json_file(self.args.model_config)
            # Set the special token ids to 0
            if model_config.eos_token_id != 0:
                model_config.eos_token_id = 0
            if model_config.bos_token_id != 0:
                model_config.bos_token_id = 0
            if model_config.pad_token_id != 0:
                model_config.pad_token_id = 0
            model = GPT2LMHeadModel(config=model_config)

        # Resize the GPT-2 embeddings to match the tokenizer's vocabulary
        model.resize_token_embeddings(self.vocab_size)

        print('model config:\n{}'.format(model.config.to_json_string()))

        return model, model.config.to_dict().get("n_ctx")
Example #5
    def __init__(self):
        super().__init__()

        self.tokenizer = BertTokenizer(vocab_file=FLAGS.vocab_path)

        self.config = GPT2Config.from_json_file(FLAGS.model_config)

        self.model = GPT2LMHeadModel(config=self.config)
Example #6
def convert_gpt2_checkpoint_to_pytorch(gpt2_checkpoint_path, full,
                                       gpt2_config_file,
                                       pytorch_dump_folder_path):
    #putting requirements here so users can see usage info before it errors out on missing modules
    from io import open
    from shutil import copyfile
    import logging
    logging.basicConfig(level=logging.INFO)
    from pathlib import Path
    import torch
    #WEIGHTS_NAME = "pytorch_model.bin"
    #CONFIG_NAME = "config.json"
    from transformers import (
        CONFIG_NAME,
        WEIGHTS_NAME,
        GPT2Config,
        GPT2Model,
        load_tf_weights_in_gpt2,
    )
    gpt2_checkpoint_path = Path(gpt2_checkpoint_path)
    print(gpt2_checkpoint_path.name)

    if pytorch_dump_folder_path == '':
        prefix = '32BIT-' if full else '16BIT-'
        pytorch_dump_folder_path = 'pytorch-' + prefix + gpt2_checkpoint_path.name
    pytorch_dump_folder_path = Path(pytorch_dump_folder_path)

    pytorch_dump_folder_path.mkdir(exist_ok=True)

    # Construct model
    if gpt2_config_file == "":
        #This doesn't seem to work. We will use the hparams.json file that seems to be included in the checkpoint directory instead.
        #config = GPT2Config()
        gpt2_config_file = gpt2_checkpoint_path / 'hparams.json'

    config = GPT2Config.from_json_file(gpt2_config_file)
    model = GPT2Model(config)

    # Load weights from numpy
    load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path)
    if not full:
        model.half()

    # Save pytorch-model
    pytorch_weights_dump_path = pytorch_dump_folder_path / WEIGHTS_NAME
    pytorch_config_dump_path = pytorch_dump_folder_path / CONFIG_NAME
    print("Save PyTorch model to {}".format(str(pytorch_weights_dump_path)))

    torch.save(model.state_dict(), pytorch_weights_dump_path)

    print("Save configuration file to: " + str(pytorch_config_dump_path))
    with pytorch_config_dump_path.open("w", encoding="utf-8") as f:
        f.write(config.to_json_string())

    copyfile(gpt2_checkpoint_path / 'vocab.bpe',
             pytorch_dump_folder_path / 'merges.txt')
    copyfile(gpt2_checkpoint_path / 'encoder.json',
             pytorch_dump_folder_path / 'vocab.json')
Example #7
def create_model(pre_trained=False):
    if pre_trained:
        model = GPT2LMHeadModel.from_pretrained(config.MODEL_PATH)
    else:
        model_config = GPT2Config.from_json_file(config.CONFIG_JSON_FILE)
        model = GPT2LMHeadModel(config=model_config)
    # model.resize_token_embeddings(vocab_size)
    n_ctx = model.config.to_dict().get("n_ctx")
    return model, n_ctx
def main():
    # Set model training arguments
    args = set_args()

    # Set CUDA environment variables
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = args.device

    # Get the device used for training
    device = torch.device(
        "cuda" if torch.cuda.is_available() and int(args.device) >= 0 else "cpu"
    )

    # Set random seeds
    if args.seed:
        torch.manual_seed(args.seed)
        random.seed(args.seed)
        np.random.seed(args.seed)

    # Load the model config
    model_config = GPT2Config.from_json_file(args.config_path)

    if args.pretrained_model_path:
        model = GPT2LMHeadModel.from_pretrained(args.pretrained_model_path)
    else:
        # If no pretrained model is specified, initialize the model from the config
        model = GPT2LMHeadModel(config=model_config)

    # Instantiate the tokenizer
    tokenizer = BertTokenizer.from_pretrained(args.vocab_path, do_lower_case=True)

    # Treat [Space] as a single unit. For example, for "我爱[Space]中国。" the original tokenizer
    # produces ['我', '爱', '[', 'Space', ']', '中', '国', '。'];
    # after adding the special token the result is ['我', '爱', '[Space]', '中', '国', '。']
    tokenizer.add_tokens("[Space]", special_tokens=True)
    # Create the model output directory
    if not os.path.exists(args.output_dir):
        os.mkdir(args.output_dir)

    # Load the training and test data
    train_data = GPT2NewsTitleDataSet(
        tokenizer,
        args.max_len,
        args.title_max_len,
        args.data_dir,
        "train",
        args.train_file_path,
    )
    test_data = GPT2NewsTitleDataSet(
        tokenizer,
        args.max_len,
        args.title_max_len,
        args.data_dir,
        "test",
        args.test_file_path,
    )
    # Start training
    train(model, device, train_data, test_data, args)
Example #9
def load_pretrained_model(args):

    if args.pretrained_model:
        logger.info(f'loading pretrained model from {args.pretrained_model}')
        model = GPT2LMHeadModel.from_pretrained(args.pretrained_model)
    else:
        logger.info('init pretrained model...')
        config = GPT2Config.from_json_file(args.model_config)
        model = GPT2LMHeadModel(config)
    return model, model.config.to_dict().get("n_ctx")
def main():
    # Config
    config = InferenceConfig()
    gpt_config = GPT2Config.from_json_file(config.model_config_path)

    # torch related
    torch.set_grad_enabled(False)
    torch.manual_seed(config.random_seed)

    # Logger
    logger = logging.getLogger()
    logger.setLevel(logging.DEBUG)
    handler = StreamHandler(sys.stdout)
    handler.setFormatter(logging.Formatter("[%(asctime)s] %(message)s"))
    logger.addHandler(handler)

    # Text Utils
    logging.info(f"loading Tokenizer...")
    tokenizer = GPT2Tokenizer(config.tokenizer_vocab_path,
                              config.tokenizer_merge_path)

    # Forward Model
    logging.info(f"loading Forward Model...")
    forward_model = GPT2LMHeadModel(gpt_config)
    forward_model.load_state_dict(
        load_model_weight(gpt_config, config.forward_model_path))

    # Backward Model
    logging.info(f"loading Backward Model...")
    backward_model = GPT2LMHeadModel(gpt_config)
    backward_model.load_state_dict(
        load_model_weight(gpt_config, config.backward_model_path))

    # Example
    example_contexts = [
        "<|endoftext|>".join(["How are you doing?"]),
        "<|endoftext|>".join(["Does money buy happiness?"]),
        "<|endoftext|>".join([
            "Does money buy happiness?",
            "Depends how much money you spend on it .",
        ]),
        "<|endoftext|>".join([
            "Does money buy happiness?",
            "Depends how much money you spend on it .",
            "What is the best way to buy happiness ?",
        ]),
    ]
    inferencer = Inferencer(config, tokenizer, forward_model, backward_model)
    results = inferencer.run(example_contexts)

    for context, result in zip(example_contexts, results):
        logging.info(f"Example Context:{context}")
        for i, reply in enumerate(result):
            logging.info(f"Output Utterance Top-{i+1}: {reply}")
 def __init__(self):
     super(ConditionalGenerationModel, self).__init__()
     if args.base_model.endswith(".json"):
         # build from a config file
         model_config = GPT2Config.from_json_file(args.base_model)
         self.base_model = GPT2LMHeadModel(config=model_config)
     else:
         # load a pretrained model
         self.base_model = GPT2LMHeadModel.from_pretrained(args.base_model)
     self.base_model.resize_token_embeddings(len(args.tokenizer))
     self.config = self.base_model.config
Example #12
def main():
    # Initialize arguments
    args = set_args()
    # Select which GPUs to use for training
    os.environ["CUDA_VISIBLE_DEVICES"] = args.device
    args.cuda = not args.no_cuda

    # Use the GPU when the user requests it and it is available
    args.cuda = torch.cuda.is_available() and not args.no_cuda
    device = 'cuda:0' if args.cuda else 'cpu'
    args.device = device
    logger.info('using device:{}'.format(device))

    # Initialize the tokenizer
    tokenizer = BertTokenizerFast.from_pretrained(args.pretrained_model)
    args.sep_id = tokenizer.sep_token_id
    args.pad_id = tokenizer.pad_token_id
    args.cls_id = tokenizer.cls_token_id

    # Create the model output directory
    if not os.path.exists(args.save_model_path):
        os.mkdir(args.save_model_path)

    # Create the model
    if args.pretrained_model:  # load a pretrained model
        model = GPT2LMHeadModel.from_pretrained(args.pretrained_model)
    else:  # initialize from the config
        model_config = GPT2Config.from_json_file(args.model_config)
        model = GPT2LMHeadModel(config=model_config)
    model = model.to(device)
    logger.info('model config:\n{}'.format(model.config.to_json_string()))
    assert model.config.vocab_size == tokenizer.vocab_size

    # Train the model in parallel
    if args.cuda and torch.cuda.device_count() > 1:
        model = DataParallel(model).cuda()
        logger.info("use GPU {} to train".format(args.device))

    # Count the model parameters
    num_parameters = 0
    parameters = model.parameters()
    for parameter in parameters:
        num_parameters += parameter.numel()
    logger.info('number of model parameters: {}'.format(num_parameters))

    # Log the argument settings
    logger.info("args:{}".format(args))

    # Load the training and validation sets
    # ========= Loading Dataset ========= #
    train_dataset, validate_dataset = load_dataset(args)
    train(tokenizer, model, train_dataset, validate_dataset, args)
Example #13
    def __init__(self, args, pretrained, model_checkpoint, report_every,
                 ren, norm_fn, device, logdir=None):
        self.args = args
        self._ren = ren
        self._device = device

        self.tokenizer = GPT2Tokenizer.from_pretrained(
            model_checkpoint, do_lower_case=True)
        self.pad_id = self.tokenizer.eos_token_id
        self.use_segments = True

        self._config = GPT2Config.from_json_file(os.path.join(model_checkpoint, CONFIG_NAME))
        self._max_len = 256  # 512  # self._config.n_ctx

        self._model = GPT2LMHeadModel.from_pretrained(
            model_checkpoint).to(device) if pretrained else GPT2LMHeadModel(self._config).to(device)
        num_param, _, __ = _tally_parameters(self._model)
        logger.info("model paramerters: {}".format(num_param))

        if not os.path.exists("checkpoints"):
            os.mkdir("checkpoints")
        self.save_dir = os.path.join("checkpoints", args.save_dir)
        if not os.path.exists(self.save_dir):
            os.mkdir(self.save_dir)
        elif args.infer_from == "":
            if SYS != "Windows":
                raise Exception("path exists {}".format(self.save_dir))

        self._optimizer = None
        self.writer = SummaryWriter(logdir=logdir)
        self.report_every = report_every
        self.batch_step = 0
        self.training_step = 1
        self.gradient_accumulation_steps = args.gradient_accumulation_steps
        self.max_val_step = args.max_val_step

        self._dataset = {}
        self._data_loader = {}

        self._weights = None
        self._w_decay = None

        if norm_fn == 'linear':
            self._norm_fn = _linear_normalize
        elif norm_fn == 'softmax':
            self._norm_fn = _softmax_normalize

        if ren:
            assert norm_fn == 'linear'
Example #14
def build_model(args):
    if args.pretrained_path == '':
        config = GPT2Config.from_json_file(args.model_config)
        model = GPT2LMHeadModel(config)
        tokenizer = BertTokenizerFast(args.vocab)
        # XXX: required; otherwise special tokens embedded in a string are not tokenized as single tokens
        tokenizer.sanitize_special_tokens()
        info = None
    else:
        config = GPT2Config.from_pretrained(args.pretrained_path)
        model, info = GPT2LMHeadModel.from_pretrained(args.pretrained_path,
                                                      config=config,
                                                      output_loading_info=True)
        tokenizer = BertTokenizerFast.from_pretrained(args.pretrained_path)
    return model, tokenizer, info
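When loading from a pretrained path, `info` is the loading-info dict that from_pretrained returns with output_loading_info=True. A short, hedged usage sketch (the surrounding `args` object is assumed):

model, tokenizer, info = build_model(args)
if info is not None:
    # keys reported by transformers alongside the loaded weights
    print("missing keys:", info.get("missing_keys"))
    print("unexpected keys:", info.get("unexpected_keys"))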
Example #15
File: train.py  Project: msfenlei/bert4pl
    def __init__(self):
        super().__init__()

        self.config = GPT2Config.from_json_file(FLAGS.model_config)

        self.tokenizer = BertTokenizer(vocab_file=FLAGS.vocab_path)

        if FLAGS.train_mmi and FLAGS.use_pretrain:
            self.model = GPT2LMHeadModel.from_pretrained(FLAGS.mmi_model_path,
                                                         config=self.config)
        elif not FLAGS.train_mmi and FLAGS.use_pretrain:
            self.model = GPT2LMHeadModel.from_pretrained(
                FLAGS.dialogue_model_path, config=self.config)
        else:
            self.model = GPT2LMHeadModel(config=self.config)

        self.n_ctx = self.model.config.to_dict().get("n_ctx")
        self.pad_id = self.tokenizer.convert_tokens_to_ids("[PAD]")
def convert_gpt2_checkpoint_to_pytorch(gpt2_checkpoint_path, gpt2_config_file, pytorch_dump_folder_path):
    # Construct model
    if gpt2_config_file == "":
        config = GPT2Config()
    else:
        config = GPT2Config.from_json_file(gpt2_config_file)
    model = GPT2Model(config)

    # Load weights from numpy
    load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path)

    # Save pytorch-model
    pytorch_weights_dump_path = pytorch_dump_folder_path + "/" + WEIGHTS_NAME
    pytorch_config_dump_path = pytorch_dump_folder_path + "/" + CONFIG_NAME
    print("Save PyTorch model to {}".format(pytorch_weights_dump_path))
    torch.save(model.state_dict(), pytorch_weights_dump_path)
    print("Save configuration file to {}".format(pytorch_config_dump_path))
    with open(pytorch_config_dump_path, "w", encoding="utf-8") as f:
        f.write(config.to_json_string())
Example #17
def run_pplm_example(
    pretrained_model="gpt2-medium",    #预训练好的LM
    cond_text="",        #起始词
    uncond=False,
    num_samples=1,       #生成样本数
    bag_of_words=None,  #使用的BOW词典,主题
    discrim=None,      #是否使用判别式属性模型
    discrim_weights=None,
    discrim_meta=None,
    class_label=-1,
    length=100,      # generation length
    stepsize=0.02,
    temperature=1.0,
    top_k=10,
    sample=False,   # whether to sample
    num_iterations=3,
    grad_length=10000,
    horizon_length=1,
    window_length=0,
    decay=False,
    gamma=1.5,
    gm_scale=0.9,
    kl_scale=0.01,
    seed=0,
    no_cuda=False,
    colorama=False,     # colored highlighting
    repetition_penalty=1.0,   # penalty for repeated words
):
    # set Random seed
    torch.manual_seed(seed)
    np.random.seed(seed)

    # set the device
    device = "cuda" if torch.cuda.is_available() and not no_cuda else "cpu"
    # add the discriminative model's parameters
    if discrim == "generic":
        set_generic_model_params(discrim_weights, discrim_meta)

    if discrim is not None:
        pretrained_model = DISCRIMINATOR_MODELS_PARAMS[discrim]["pretrained_model"]
        print("discrim = {}, pretrained_model set to discriminator's = {}".format(discrim, pretrained_model))

    config = GPT2Config.from_json_file('./gpt2sanwen/config.json')
Example #18
def load_model(target_folder_name, config):
    # Parse parameters
    data_folder = config.get('model', 'data_folder')
    model_size = config.get('model', 'model_size')
    no_cuda = config.getboolean('model', 'no_cuda')

    logger.info(f"Loading model from {target_folder_name}...")
    print(1)
    device = torch.device(
        "cuda" if torch.cuda.is_available() and not no_cuda else "cpu")
    print(2)
    # Tokenizer
    target_folder = os.path.join(data_folder, target_folder_name)
    tokenizer = GPT2Tokenizer(os.path.join(target_folder, 'vocab.json'),
                              os.path.join(target_folder, 'merges.txt'))
    print(3)
    # Config
    config = GPT2Config.from_json_file(
        os.path.join(target_folder, 'config.json'))
    print(4)
    # Weights
    torch.cuda.set_device(0)
    state_dict_path = glob(os.path.join(target_folder, f'*.pkl'))[0]
    state_dict = torch.load(state_dict_path, map_location=device)
    print(5)
    if model_size == 'small':
        for key in list(state_dict.keys()):
            state_dict[key.replace('module.', '')] = state_dict.pop(key)
    state_dict['lm_head.weight'] = state_dict['lm_head.decoder.weight']
    state_dict.pop("lm_head.decoder.weight", None)
    # Model
    print(6)
    model = GPT2LMHeadModel(config)
    print(7)
    model.load_state_dict(state_dict)
    print(8)
    model.to(device)
    print(9)
    model.eval()
    print(10)
    return model, tokenizer
Example #19
def create_model(args, vocab_size):
    """
    :param args:
    :param vocab_size: vocabulary size
    :return:
    """
    if args.pretrained_model:  # if a pretrained GPT-2 model is specified (none by default)
        model = GPT2LMHeadModel.from_pretrained(args.pretrained_model)
    # elif os.path.exists(args.best_model): # if training was already done, continue from it
    #     model =  GPT2LMHeadModel.from_pretrained(args.best_model)
    elif os.path.exists(
            args.dialogue_model_output_path):  # if training was already done, continue from it
        model = GPT2LMHeadModel.from_pretrained(
            args.dialogue_model_output_path + "model_epoch10/")
    else:  # otherwise initialize the model from the config
        model_config = GPT2Config.from_json_file(args.model_config)
        model = GPT2LMHeadModel(config=model_config)
    # Resize the GPT-2 embeddings to match the tokenizer vocabulary
    model.resize_token_embeddings(vocab_size)
    logger.info('model config:\n{}'.format(model.config.to_json_string()))
    return model, model.config.to_dict().get("n_ctx")
Example #20
def main():
    # Set model training arguments
    args = set_args()

    # Set random seeds for reproducibility
    if args.seed:
        torch.manual_seed(args.seed)
        random.seed(args.seed)
        np.random.seed(args.seed)
    # Load the model config
    model_config = GPT2Config.from_json_file(args.config_path)

    # Instantiate GPT2LMHeadModel; here we train from scratch instead of loading a pretrained model.
    if args.pretrained_model_path:
        model = GPT2LMHeadModel.from_pretrained(args.pretrained_model_path)
    else:
        # If no pretrained model is specified, initialize the model from the config
        model = GPT2LMHeadModel(config=model_config)

    tokenizer = BertTokenizer.from_pretrained(args.vocab_path,
                                              do_lower_case=True)

    # Treat [Space] as a single unit. For example, for "我爱[Space]中国。" the original tokenizer
    # produces ['我', '爱', '[', 'Space', ']', '中', '国', '。'];
    # after adding the special token the result is ['我', '爱', '[Space]', '中', '国', '。']
    tokenizer.add_tokens("[Space]", special_tokens=True)

    if not os.path.exists(args.output_dir):
        os.mkdir(args.output_dir)

    # Load the training and test data
    train_data = GPT2NewsTitleDataSet(tokenizer, args.max_len,
                                      args.title_max_len, args.data_dir,
                                      "train", args.train_file_path)
    test_data = GPT2NewsTitleDataSet(tokenizer, args.max_len,
                                     args.title_max_len, args.data_dir, "test",
                                     args.test_file_path)
    # Start training
    train(model, train_data, test_data, args)
Example #21
def create_model(args, vocab_size):
    """
    :param args:
    :param vocab_size: vocabulary size
    :return:
    """
    print('Configuring model parameters')
    # model_config = GPT2Config.from_json_file('config/model_config_dialogue_small.json')
    print(vocab_size)
    print('Creating model')
    # model = TFGPT2LMHeadModel.from_pretrained('gpt2')
    if args.pretrained_model:  # if a pretrained GPT-2 model is specified
        model = TFGPT2LMHeadModel.from_pretrained(args.pretrained_model)
    else:  # otherwise initialize the model from the config
        print('Initializing model')
        model_config = GPT2Config.from_json_file(args.model_config)
        print('config:\n' + model_config.to_json_string())
        model = TFGPT2LMHeadModel(config=model_config)
        print('Model constructed')
        # Resize the GPT-2 embeddings to match the tokenizer vocabulary
    #model.resize_token_embeddings(vocab_size)

    # model = TFGPT2LMHeadModel.from_pretrained()  # instantiate the class
    return model, model.config.to_dict().get("n_ctx")
Example #22
import torch
import torch.nn.functional as F
from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2Config
from config import device_f, device_r, num_samples, MMI_temperature, top_k

torch.set_grad_enabled(False)

tokenizer = GPT2Tokenizer('medium/vocab.json', 'medium/merges.txt')

weights = torch.load('medium/medium_ft.pkl')
# fix misused key value
weights["lm_head.weight"] = weights["lm_head.decoder.weight"]
weights.pop("lm_head.decoder.weight", None)

cfg = GPT2Config.from_json_file('medium/config.json')
model: GPT2LMHeadModel = GPT2LMHeadModel(cfg)
model.load_state_dict(weights)
if device_f == 'cuda':
    model.half()
model.to(device_f)
model.eval()

weights = torch.load('medium/small_reverse.pkl')
# fix misused key value
weights["lm_head.weight"] = weights["lm_head.decoder.weight"]
weights.pop("lm_head.decoder.weight", None)

reverse_model: GPT2LMHeadModel = GPT2LMHeadModel(cfg)
reverse_model.load_state_dict(weights)
if device_r == 'cuda':
    reverse_model.half()
def main():
    # Create the argument parser.
    parser = argparse.ArgumentParser()
    parser.add_argument("--print-checkpoint-structure", action="store_true")
    parser.add_argument(
        "path_to_checkpoint",
        type=str,
        help="Path to the checkpoint file (.zip archive or direct .pt file)",
    )
    parser.add_argument(
        "--config_file",
        default="",
        type=str,
        help="An optional config json file describing the pre-trained model.",
    )
    args = parser.parse_args()

    # Extract the basename.
    basename = os.path.dirname(args.path_to_checkpoint)

    # Load the model.
    # the .zip is very optional, let's keep it for backward compatibility
    print(f"Extracting PyTorch state dictionary from {args.path_to_checkpoint}")
    if args.path_to_checkpoint.endswith(".zip"):
        with zipfile.ZipFile(args.path_to_checkpoint, "r") as checkpoint:
            with checkpoint.open("release/mp_rank_00/model_optim_rng.pt") as pytorch_dict:
                input_state_dict = torch.load(pytorch_dict, map_location="cpu")
    else:
        input_state_dict = torch.load(args.path_to_checkpoint, map_location="cpu")

    ds_args = input_state_dict.get("args", None)

    # Read the config, or default to the model released by NVIDIA.
    if args.config_file == "":

        if ds_args is not None:
            if ds_args.bias_gelu_fusion:
                activation_function = "gelu_fast"
            elif ds_args.openai_gelu:
                activation_function = "gelu_new"
            else:
                activation_function = "gelu"
        else:
            # in the very early days this used to be "gelu_new"
            activation_function = "gelu_new"

        # Spell out all parameters in case the defaults change.
        config = GPT2Config(
            vocab_size=50257,
            n_positions=1024,
            n_embd=1024,
            n_layer=24,
            n_head=16,
            n_inner=4096,
            activation_function=activation_function,
            resid_pdrop=0.1,
            embd_pdrop=0.1,
            attn_pdrop=0.1,
            layer_norm_epsilon=1e-5,
            initializer_range=0.02,
            summary_type="cls_index",
            summary_use_proj=True,
            summary_activation=None,
            summary_proj_to_labels=True,
            summary_first_dropout=0.1,
            scale_attn_weights=True,
            use_cache=True,
            bos_token_id=50256,
            eos_token_id=50256,
        )
    else:
        config = GPT2Config.from_json_file(args.config_file)

    config.architectures = ["GPT2LMHeadModel"]

    # Convert.
    print("Converting")
    output_state_dict = convert_megatron_checkpoint(args, input_state_dict, config)

    # Print the structure of converted state dict.
    if args.print_checkpoint_structure:
        recursive_print(None, output_state_dict)

    # Add tokenizer class info to config
    # see https://github.com/huggingface/transformers/issues/13906)
    if ds_args is not None:
        tokenizer_type = ds_args.tokenizer_type
        if tokenizer_type == "GPT2BPETokenizer":
            tokenizer_model_name = "gpt2"
        elif tokenizer_type == "PretrainedFromHF":
            tokenizer_model_name = ds_args.tokenizer_name_or_path
        else:
            raise ValueError(f"Unrecognized tokenizer_type {tokenizer_type}")
    else:
        tokenizer_model_name = "gpt2"

    tokenizer = AutoTokenizer.from_pretrained(tokenizer_model_name)
    tokenizer_class = type(tokenizer).__name__
    config.tokenizer_class = tokenizer_class

    # Store the config to file.
    print("Saving config")
    config.save_pretrained(basename)

    # Save tokenizer based on args
    print(f"Adding {tokenizer_class} tokenizer files")
    tokenizer.save_pretrained(basename)

    # Store the state_dict to file.
    output_checkpoint_file = os.path.join(basename, "pytorch_model.bin")
    print(f'Saving checkpoint to "{output_checkpoint_file}"')
    torch.save(output_state_dict, output_checkpoint_file)
Example #24
def main():
    # Create the argument parser.
    parser = argparse.ArgumentParser()
    parser.add_argument("--print-checkpoint-structure", action="store_true")
    parser.add_argument(
        "path_to_checkpoint",
        type=str,
        help="Path to the ZIP file containing the checkpoint",
    )
    parser.add_argument(
        "--config_file",
        default="",
        type=str,
        help="An optional config json file describing the pre-trained model.",
    )
    args = parser.parse_args()

    # Extract the basename.
    basename = os.path.dirname(args.path_to_checkpoint)

    # Load the model.
    print(
        f"Extracting PyTorch state dictionary from {args.path_to_checkpoint}")
    with zipfile.ZipFile(args.path_to_checkpoint, "r") as checkpoint:
        with checkpoint.open(
                "release/mp_rank_00/model_optim_rng.pt") as pytorch_dict:
            input_state_dict = torch.load(pytorch_dict, map_location="cpu")

    # Read the config, or default to the model released by NVIDIA.
    if args.config_file == "":
        # Spell out all parameters in case the defaults change.
        config = GPT2Config(
            vocab_size=50257,
            n_positions=1024,
            n_ctx=1024,
            n_embd=1024,
            n_layer=24,
            n_head=16,
            n_inner=4096,
            activation_function="gelu_new",
            resid_pdrop=0.1,
            embd_pdrop=0.1,
            attn_pdrop=0.1,
            layer_norm_epsilon=1e-5,
            initializer_range=0.02,
            summary_type="cls_index",
            summary_use_proj=True,
            summary_activation=None,
            summary_proj_to_labels=True,
            summary_first_dropout=0.1,
            scale_attn_weights=True,
            gradient_checkpointing=False,
            use_cache=True,
            bos_token_id=50256,
            eos_token_id=50256,
        )
    else:
        config = GPT2Config.from_json_file(args.config_file)

    # Convert.
    print("Converting")
    output_state_dict = convert_megatron_checkpoint(args, input_state_dict,
                                                    config)

    # Print the structure of converted state dict.
    if args.print_checkpoint_structure:
        recursive_print(None, output_state_dict)

    # Store the config to file.
    output_config_file = os.path.join(basename, "config.json")
    output_config = config.to_dict()
    output_config["architectures"] = ["GPT2LMHeadModel"]
    output_config["model_type"] = "gpt2"
    print(f'Saving config to "{output_config_file}"')
    with open(output_config_file, "w") as f:
        json.dump(output_config, f)

    # Store the state_dict to file.
    output_checkpoint_file = os.path.join(basename, "pytorch_model.bin")
    print(f'Saving checkpoint to "{output_checkpoint_file}"')
    torch.save(output_state_dict, output_checkpoint_file)
                 'distilgpt2-config.json')
}
GPT2_PRETRAINED_MODEL_ARCHIVE_MAP = {
    "distilgpt2":
    os.path.join(model_and_config_dir, 'MODEL_ARCHIVE_MAP',
                 'distilgpt2-pytorch_model.bin')
}

if __name__ == '__main__':
    benchmark_name = "Pereira2018-encoding"
    model_name = "distilgpt2"
    config_file = GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP[model_name]
    model_file = GPT2_PRETRAINED_MODEL_ARCHIVE_MAP[model_name]
    benchmark_tsk = benchmark_name

    config = GPT2Config.from_json_file(config_file)
    num_layers = config.n_layer
    config.output_hidden_states = True
    config.state_dict = None
    # from_pretrained is a classmethod that returns a new model; keep its result
    # instead of discarding it (the original call left the random weights in place)
    model = GPT2Model.from_pretrained(model_file, config=config)
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    model_identifier = config.weight_identifier
    # find model index in model_configs
    config_idx = int(
        np.argwhere([
            x['weight_identifier'] == config.weight_identifier
            for x in transformer_configurations
        ]))
    brainscore_config = transformer_configurations[config_idx]
    # - tokenizer_ctr: the importable class name of the model's tokenizer class
Example #26
# Print the model's basic arguments
print('args:\n' + args.__repr__())
# datapath='C:/Users/ubt/Desktop/train_max24.tok'
# f = open(datapath,encoding = 'utf-8')
# for lines in f:
#     print(len(lines))
#     print(lines)
os.environ["CUDA_VISIBLE_DEVICES"] = args.device  # 此处设置程序使用哪些显卡
print(args.model_config)

#if not args.pretrained_model:
#    model = transformers.modeling_gpt2.GPT2LMHeadModel(config=args.model_config)
#else:
#    model = transformers.modeling_gpt2.GPT2LMHeadModel(config=args.model_config)
# Load config file
config = GPT2Config.from_json_file(args.model_config)
model = GPT2LMHeadModel(config)
model.train()
model.cuda()

num_parameters = 0
parameters = model.parameters()
for parameter in parameters:
    num_parameters += parameter.numel()
print('number of parameters: {}'.format(num_parameters))

full_len = 0
for i in tqdm(range(args.num_pieces)):
    with open(args.tokenizer_data_path + 'tokenizer_train_{}.txt'.format(i),
              'r') as f:
        full_len += len([int(item) for item in f.read().strip().split()])
Example #27
File: interact.py  Project: 210010/botback
                                       classes_num=len(labels2idx)).cuda()
    checkpoint = T.load("Classifier/Model_Backup/model.pt")
    dialog_act_classifier.load_state_dict(checkpoint['model_state_dict'])
    dialog_act_classifier = dialog_act_classifier.eval()

# Load TTS model
with T.no_grad():
    text2speech = tts_class()

# LOAD DialoGPT Generator

with T.no_grad():
    tokenizer = GPT2Tokenizer.from_pretrained('Generator/DialoGPT/Configs/')
    weights = T.load('Generator/DialoGPT/Parameters/medium_ft.pkl')
    weights_reverse = T.load('Generator/DialoGPT/Parameters/small_reverse.pkl')
    cfg = GPT2Config.from_json_file('Generator/DialoGPT/Configs/config.json')
    model = GPT2LMHeadModel(cfg)
    model_reverse = GPT2LMHeadModel(cfg)

    # fix misused key value
    weights["lm_head.weight"] = weights["lm_head.decoder.weight"]
    weights.pop("lm_head.decoder.weight", None)
    weights_reverse["lm_head.weight"] = weights_reverse[
        "lm_head.decoder.weight"]
    weights_reverse.pop("lm_head.decoder.weight", None)

    model.load_state_dict(weights)
    model.to('cuda')
    model.eval()

    model_reverse.load_state_dict(weights_reverse)
Example #28
def main():
    # Initialize arguments
    args = set_args()

    # Select which GPUs to use for training
    os.environ["CUDA_VISIBLE_DEVICES"] = args.device

    args.cuda = not args.no_cuda

    if args.batch_size < 2048 and args.warmup_steps <= 4000:
        print('[Warning] The warmup steps may be not enough.\n' \
              '(sz_b, warmup) = (2048, 4000) is the official setting.\n' \
              'Using smaller batch w/o longer warmup may cause ' \
              'the warmup stage ends with only little data trained.')

    # Create the logger
    logger = create_logger(args)
    # Use the GPU when the user requests it and it is available
    args.cuda = torch.cuda.is_available() and not args.no_cuda
    device = 'cuda:0' if args.cuda else 'cpu'
    args.device = device
    logger.info('using device:{}'.format(device))

    # Initialize the tokenizer
    tokenizer = BertTokenizerFast(vocab_file=args.vocab_path,
                                  sep_token="[SEP]",
                                  pad_token="[PAD]",
                                  cls_token="[CLS]")
    args.sep_id = tokenizer.sep_token_id
    args.pad_id = tokenizer.pad_token_id
    args.cls_id = tokenizer.cls_token_id

    # Create the model output directory
    if not os.path.exists(args.save_model_path):
        os.mkdir(args.save_model_path)

    # Create the model
    if args.pretrained_model:  # load a pretrained model
        model = GPT2LMHeadModel.from_pretrained(args.pretrained_model)
    else:  # initialize from the config
        model_config = GPT2Config.from_json_file(args.model_config)
        model = GPT2LMHeadModel(config=model_config)
    model = model.to(device)
    logger.info('model config:\n{}'.format(model.config.to_json_string()))
    assert model.config.vocab_size == tokenizer.vocab_size

    # Train the model in parallel
    if args.cuda and torch.cuda.device_count() > 1:
        model = DataParallel(model).cuda()
        # model = BalancedDataParallel(args.gpu0_bsz, model, dim=0).cuda()
        logger.info("use GPU {} to train".format(args.device))

    # Count the model parameters
    num_parameters = 0
    parameters = model.parameters()
    for parameter in parameters:
        num_parameters += parameter.numel()
    logger.info('number of model parameters: {}'.format(num_parameters))

    # Log the argument settings
    logger.info("args:{}".format(args))

    # Load the training and validation sets
    # ========= Loading Dataset ========= #
    train_dataset, validate_dataset = load_dataset(logger, args)

    train(model, logger, train_dataset, validate_dataset, args)
Example #29
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--device",
                        default="0",
                        type=str,
                        required=False,
                        help="生成设备")
    parser.add_argument("--length",
                        default=-1,
                        type=int,
                        required=False,
                        help="生成长度")
    parser.add_argument("--batch_size",
                        default=1,
                        type=int,
                        required=False,
                        help="生成的batch size")
    parser.add_argument("--nsamples",
                        default=10,
                        type=int,
                        required=False,
                        help="生成几个样本")
    parser.add_argument("--temperature",
                        default=1,
                        type=float,
                        required=False,
                        help="生成温度")
    parser.add_argument("--topk",
                        default=8,
                        type=int,
                        required=False,
                        help="最高几选一")
    parser.add_argument("--topp",
                        default=0,
                        type=float,
                        required=False,
                        help="最高积累概率")
    parser.add_argument(
        "--model_config",
        default="config/model_config.json",
        type=str,
        required=False,
        help="模型参数",
    )
    parser.add_argument(
        "--tokenizer_path",
        default="vocab/vocab.txt",
        type=str,
        required=False,
        help="词表路径",
    )
    parser.add_argument(
        "--model_path",
        default="model/epoch=0-step=99.ckpt",
        type=str,
        required=False,
        help="模型路径",
    )
    parser.add_argument("--prefix",
                        default="我",
                        type=str,
                        required=False,
                        help="生成文章的开头")
    parser.add_argument("--no_wordpiece",
                        action="store_true",
                        help="不做word piece切词")
    parser.add_argument("--segment", action="store_true", help="中文以词为单位")
    parser.add_argument("--fast_pattern",
                        action="store_true",
                        help="采用更加快的方式生成文本")
    parser.add_argument("--save_samples", action="store_true", help="保存产生的样本")
    parser.add_argument("--save_samples_path",
                        default=".",
                        type=str,
                        required=False,
                        help="保存样本的路径")
    parser.add_argument("--repetition_penalty",
                        default=1.0,
                        type=float,
                        required=False)

    args = parser.parse_args()
    print("args:\n" + args.__repr__())

    os.environ["CUDA_VISIBLE_DEVICES"] = args.device  # 此处设置程序使用哪些显卡
    length = args.length
    batch_size = args.batch_size
    nsamples = args.nsamples
    temperature = args.temperature
    topk = args.topk
    topp = args.topp
    repetition_penalty = args.repetition_penalty

    device = "cuda" if torch.cuda.is_available() else "cpu"

    tokenizer = BertTokenizer(vocab_file=args.tokenizer_path)
    model_config = GPT2Config.from_json_file(args.model_config)
    model = GPT2LMHeadModel(config=model_config)
    state_dict = {
        key[6:]: value
        for key, value in torch.load(args.model_path, map_location="cpu")
        ["state_dict"].items()
    }
    model.load_state_dict(state_dict)
    model.to(device)
    model.eval()

    for i in range(10):
        raw_text = args.prefix
        encoded = tokenizer.encode_plus(raw_text)["input_ids"]
        out = sample_sequence(
            model,
            encoded,
            length=512,
            n_ctx=1024,
            tokenizer=tokenizer,
            temperature=temperature,
            top_k=topk,
            top_p=topp,
            repitition_penalty=repetition_penalty,
            device=device,
        )
        print(tokenizer.decode(out))
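The sample_sequence helper is not shown in this snippet. A minimal top-k sampling loop in the same spirit (a sketch under assumptions, not the project's actual implementation; repetition penalty is omitted):

import torch
import torch.nn.functional as F

def sample_sequence_sketch(model, context_ids, length, temperature=1.0, top_k=8, device="cpu"):
    # context_ids: list[int] of encoded prompt tokens
    generated = torch.tensor([context_ids], dtype=torch.long, device=device)
    with torch.no_grad():
        for _ in range(length):
            logits = model(generated)[0][0, -1, :] / max(temperature, 1e-5)
            if top_k > 0:
                kth_best = torch.topk(logits, top_k).values[-1]
                logits[logits < kth_best] = float("-inf")   # keep only the k most likely tokens
            probs = F.softmax(logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)
            generated = torch.cat([generated, next_token.unsqueeze(0)], dim=1)
    return generated[0].tolist()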
Example #30
    args.do_train = True
    args.train_data_file = './prepare_data_dict/text1.txt'
    args.save_steps = 1
    args.block_size = 16
    args.line_by_line = False
    args.train_batch_size = 2
    args.num_train_epochs = 3
    args.overwrite_output_dir = True

    #### tokenizer
    tokenizer = GPT2Tokenizer_inherit(
        vocab_file='./prepare_data_dict/vocab.json',
        merges_file='./prepare_data_dict/merge.txt')
    vocabsz = tokenizer.vocab_size
    #### config
    config = GPT2Config.from_json_file(
        './prepare_data_dict/gpt2-config-small.json')
    config.vocab_size = vocabsz
    config.n_layer = 12
    config.n_head = 12
    #config.output_hidden_states=True

    main(tokenizer, config)

# GPT-2/GPT and causal language modeling
# The following example fine-tunes GPT-2 on WikiText-2. We're using the raw WikiText-2 (no tokens were replaced before the tokenization). The loss here is that of causal language modeling.
#
# export TRAIN_FILE=/path/to/dataset/wiki.train.raw
# export TEST_FILE=/path/to/dataset/wiki.test.raw
#
# python run_language_modeling.py \
#     --output_dir=output \