Example #1
    def __init__(self,
                 classifier_config_dir,
                 device,
                 task_type,
                 n_clf_layers=6,
                 use_dm=True,
                 use_pm=True,
                 use_rt=True,
                 use_bio=False,
                 use_name=False,
                 use_network=False,
                 use_count=False):
        super(ConcatenatedClassifier, self).__init__()
        # load text model
        self.device = device
        self.task_type = task_type
        self.use_text = use_dm | use_pm | use_rt
        self.use_bio = use_bio
        self.use_name = use_name
        self.use_etc = use_network | use_count
        self.text_model = RobertaModel.from_pretrained(
            "vinai/bertweet-base",
            output_attentions=False,
            output_hidden_states=False)
        if self.use_name:
            self.charEmbedding = nn.Embedding(
                num_embeddings=302, embedding_dim=300,
                padding_idx=301)  # 302 = top-300 frequent chars + pad + unk
            self.conv3 = nn.Conv1d(in_channels=300,
                                   out_channels=256,
                                   kernel_size=3,
                                   padding=1)
            self.conv4 = nn.Conv1d(in_channels=300,
                                   out_channels=256,
                                   kernel_size=4,
                                   padding=1)
            self.conv5 = nn.Conv1d(in_channels=300,
                                   out_channels=256,
                                   kernel_size=5,
                                   padding=1)

        # load classifier for combining these features
        config = RobertaConfig()
        config = config.from_json_file(classifier_config_dir)
        config.num_hidden_layers = n_clf_layers
        config.num_attention_heads = n_clf_layers
        config.max_position_embeddings = 7
        if self.use_bio:
            config.max_position_embeddings += 2
        if self.use_name:
            config.max_position_embeddings += 4
        self.concat_model = RobertaModel(config)
        self.classifier = ClassifierLayer(use_count=use_count,
                                          use_network=use_network)
        return
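
A minimal usage sketch for this example (not part of the original): the config path, task_type value, and flag choices below are assumptions, and ConcatenatedClassifier is taken to be the class this __init__ belongs to.

import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# "classifier_config.json" is a hypothetical RobertaConfig-style JSON file.
model = ConcatenatedClassifier(classifier_config_dir="classifier_config.json",
                               device=device,
                               task_type="classification",
                               use_name=True).to(device)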
Example #2
    def __init__(self, args):
        super(Model, self).__init__()
        args.out_size = len(args.dense_features)
        self.dropout = nn.Dropout(args.hidden_dropout_prob)
        self.args = args

        # Build the BERT model and load the pretrained weights
        config = RobertaConfig.from_pretrained(args.pretrained_model_path)
        config.output_hidden_states = True
        args.hidden_size = config.hidden_size
        args.num_hidden_layers = config.num_hidden_layers
        self.text_layer = RobertaModel.from_pretrained(
            args.pretrained_model_path, config=config)
        self.text_linear = nn.Linear(
            args.text_dim + args.vocab_dim_v1 * len(args.text_features),
            args.hidden_size)
        logger.info("Load linear from %s",
                    os.path.join(args.pretrained_model_path, "linear.bin"))
        self.text_linear.load_state_dict(
            torch.load(os.path.join(args.pretrained_model_path, "linear.bin")))
        logger.info("Load embeddings from %s",
                    os.path.join(args.pretrained_model_path, "embeddings.bin"))
        self.text_embeddings = nn.Embedding.from_pretrained(torch.load(
            os.path.join(args.pretrained_model_path,
                         "embeddings.bin"))['weight'],
                                                            freeze=True)
        args.out_size += args.hidden_size * 2

        # Build the decoder model, randomly initialized
        config = RobertaConfig()
        config.num_hidden_layers = 4
        config.intermediate_size = 2048
        config.hidden_size = 512
        config.num_attention_heads = 16
        config.vocab_size = 5
        self.text_layer_1 = RobertaModel(config=config)
        self.text_layer_1.apply(self._init_weights)
        self.text_linear_1 = nn.Linear(args.text_dim_1 + args.hidden_size, 512)
        self.text_linear_1.apply(self._init_weights)
        self.norm = nn.BatchNorm1d(args.text_dim_1 + args.hidden_size)
        args.out_size += 1024

        # Build the classifier, randomly initialized
        self.classifier = ClassificationHead(args)
        self.classifier.apply(self._init_weights)
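
The __init__ above calls self._init_weights, which the example does not show; below is a minimal sketch of a typical implementation for the Model class (an assumption, not the original code).

    def _init_weights(self, module):
        # Sketch only: standard Transformer-style initialization; std=0.02 is an assumed value.
        if isinstance(module, (nn.Linear, nn.Embedding)):
            module.weight.data.normal_(mean=0.0, std=0.02)
        if isinstance(module, nn.Linear) and module.bias is not None:
            module.bias.data.zero_()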
Example #3
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--output_dir",
        default="saved_models",
        type=str,
        required=False,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )

    ## Other parameters
    parser.add_argument(
        "--eval_data_file",
        default=None,
        type=str,
        required=False,
        help=
        "An optional input evaluation data file to evaluate the perplexity on (a text file)."
    )

    parser.add_argument("--model_type",
                        default="roberta",
                        type=str,
                        help="The model architecture to be fine-tuned.")
    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        help="The model checkpoint for weights initialization.")

    parser.add_argument(
        "--mlm",
        action='store_true',
        help=
        "Train with masked-language modeling loss instead of language modeling."
    )
    parser.add_argument(
        "--mlm_probability",
        type=float,
        default=0.2,
        help="Ratio of tokens to mask for masked language modeling loss")

    parser.add_argument(
        "--config_name",
        default="roberta-base",
        type=str,
        help=
        "Optional pretrained config name or path if not the same as model_name_or_path"
    )
    parser.add_argument(
        "--tokenizer_name",
        default="",
        type=str,
        help=
        "Optional pretrained tokenizer name or path if not the same as model_name_or_path"
    )
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help=
        "Optional directory to store the pre-trained models downloaded from s3 (instread of the default one)"
    )
    parser.add_argument(
        "--block_size",
        default=128,
        type=int,
        help="Optional input sequence length after tokenization."
        "The training dataset will be truncated in block of this size for training."
        "Default to the model max input length for single sentence inputs (take into account special tokens)."
    )
    parser.add_argument(
        "--dfg_size",
        default=64,
        type=int,
        help="Optional input sequence length after tokenization."
        "The training dataset will be truncated in block of this size for training."
        "Default to the model max input length for single sentence inputs (take into account special tokens)."
    )
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--evaluate_during_training",
        action='store_true',
        help="Run evaluation during training at each logging step.")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")

    parser.add_argument("--per_gpu_train_batch_size",
                        default=64,
                        type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size",
                        default=64,
                        type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay",
                        default=0.01,
                        type=float,
                        help="Weight deay if we apply some.")
    parser.add_argument("--adam_epsilon",
                        default=1e-6,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm",
                        default=1.0,
                        type=float,
                        help="Max gradient norm.")
    parser.add_argument("--num_train_epochs",
                        default=1.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--max_steps",
        default=100000,
        type=int,
        help=
        "If > 0: set total number of training steps to perform. Override num_train_epochs."
    )
    parser.add_argument("--warmup_steps",
                        default=10000,
                        type=int,
                        help="Linear warmup over warmup_steps.")

    parser.add_argument('--logging_steps',
                        type=int,
                        default=50,
                        help="Log every X updates steps.")
    parser.add_argument('--save_steps',
                        type=int,
                        default=10000,
                        help="Save checkpoint every X updates steps.")
    parser.add_argument(
        '--save_total_limit',
        type=int,
        default=500,
        help=
        'Limit the total number of checkpoints; older checkpoints in output_dir are deleted. No deletion by default.'
    )
    parser.add_argument(
        "--eval_all_checkpoints",
        action='store_true',
        help=
        "Evaluate all checkpoints starting with the same prefix as model_name_or_path ending and ending with step number"
    )
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Avoid using CUDA when available")
    parser.add_argument('--overwrite_output_dir',
                        action='store_true',
                        help="Overwrite the content of the output directory")
    parser.add_argument(
        '--overwrite_cache',
        action='store_true',
        help="Overwrite the cached training and evaluation sets")
    parser.add_argument('--seed',
                        type=int,
                        default=123456,
                        help="random seed for initialization")

    parser.add_argument(
        '--fp16',
        action='store_true',
        help=
        "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit"
    )
    parser.add_argument(
        '--fp16_opt_level',
        type=str,
        default='O1',
        help=
        "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="For distributed training: local_rank")
    parser.add_argument('--server_ip',
                        type=str,
                        default='',
                        help="For distant debugging.")
    parser.add_argument('--server_port',
                        type=str,
                        default='',
                        help="For distant debugging.")

    parser.add_argument('--log_file', type=str, default='')
    parser.add_argument('--tensorboard_dir',
                        type=str,
                        default='saved_models/tensorboard_logs')
    parser.add_argument('--lang', type=str)
    parser.add_argument('--pretrain', type=str, default='')
    args = parser.parse_args()
    pool = None
    device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    args.n_gpu = torch.cuda.device_count()
    args.device = device

    # Configure logging
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank, device, args.n_gpu, bool(args.local_rank != -1),
        args.fp16)

    # Set the random seed
    set_seed(args)

    # Check for an existing checkpoint so pre-training can resume
    args.start_epoch = 0
    args.start_step = 0
    checkpoint_last = os.path.join(args.output_dir, 'checkpoint-last')
    if os.path.exists(checkpoint_last) and os.listdir(checkpoint_last):
        args.model_name_or_path = os.path.join(checkpoint_last,
                                               'pytorch_model.bin')
        args.config_name = os.path.join(checkpoint_last, 'config.json')
        step_file = os.path.join(checkpoint_last, 'step_file.txt')
        if os.path.exists(step_file):
            with open(step_file, encoding='utf-8') as stepf:
                args.start_step = int(stepf.readlines()[0].strip())

        logger.info("reload model from {}, resume from {} epoch".format(
            checkpoint_last, args.start_epoch))

    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
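    # MODEL_CLASSES is defined outside this snippet; a typical mapping (an
    # assumption, not shown here) would be
    # {'roberta': (RobertaConfig, RobertaForMaskedLM, RobertaTokenizer)}.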

    base_path = "../src/data"
    text_features = [
        [
            base_path + "/sequence_text_user_id_product_id.128d",
            'sequence_text_user_id_product_id', 128, True
        ],
        [
            base_path + "/sequence_text_user_id_ad_id.128d",
            'sequence_text_user_id_ad_id', 128, True
        ],
        [
            base_path + "/sequence_text_user_id_creative_id.128d",
            'sequence_text_user_id_creative_id', 128, True
        ],
        [
            base_path + "/sequence_text_user_id_advertiser_id.128d",
            'sequence_text_user_id_advertiser_id', 128, True
        ],
        [
            base_path + "/sequence_text_user_id_industry.128d",
            'sequence_text_user_id_industry', 128, True
        ],
        [
            base_path + "/sequence_text_user_id_product_category.128d",
            'sequence_text_user_id_product_category', 128, True
        ],
        [
            base_path + "/sequence_text_user_id_time.128d",
            'sequence_text_user_id_time', 128, True
        ],
        [
            base_path + "/sequence_text_user_id_click_times.128d",
            'sequence_text_user_id_click_times', 128, True
        ],
    ]

    # Load the training data
    train_df = pd.read_pickle(os.path.join(base_path, 'train_user.pkl'))
    test_df = pd.read_pickle(os.path.join(base_path, 'test_user.pkl'))
    dev_data = train_df.iloc[-10000:]
    train_data = pd.concat([train_df.iloc[:-10000], test_df])

    # Build the input-side vocabulary; keep at most 100k ids per field
    try:
        dic = pickle.load(
            open(os.path.join(args.output_dir, 'vocab.pkl'), 'rb'))
    except FileNotFoundError:
        dic = {}
        dic['pad'] = 0
        dic['mask'] = 1
        dic['unk'] = 2
        for feature in text_features:
            conter = Counter()
            for item in train_df[feature[1]].values:
                for word in str(item).split():
                    conter[(feature[1], word)] += 1
            most_common = conter.most_common(100000)
            cont = 0
            for x in most_common:
                if x[1] > 5:
                    dic[x[0]] = len(dic)
                    cont += 1
                    if cont < 10:
                        print(x[0], dic[x[0]])
            print(cont)

    # Load or re-create BERT
    if args.model_name_or_path is not None:
        config = config_class.from_pretrained(
            args.config_name if args.config_name else args.model_name_or_path,
            cache_dir=args.cache_dir if args.cache_dir else None)
        model = model_class.from_pretrained(
            args.model_name_or_path,
            from_tf=bool('.ckpt' in args.model_name_or_path),
            config=config,
            cache_dir=args.cache_dir if args.cache_dir else None)
        args.text_dim = config.hidden_size
    else:
        config = RobertaConfig()
        config.num_hidden_layers = 12
        config.hidden_size = 512
        config.intermediate_size = config.hidden_size * 4
        config.num_attention_heads = 16
        config.vocab_size = 5
        model = model_class(config)
        config.vocab_size_v1 = len(dic)
        config.vocab_dim_v1 = 64
        logger.info("%s", config)
    logger.info("Training/evaluation parameters %s", args)

    # Save the input-side vocabulary
    args.vocab_dic = dic
    pickle.dump(dic, open(os.path.join(args.output_dir, 'vocab.pkl'), 'wb'))

    # Load the word embeddings
    import gensim
    embedding_table = []
    for x in text_features:
        print(x)
        embedding_table.append(pickle.load(open(x[0], 'rb')))

    # Build the output-side vocabulary; keep at most 100k ids per field
    vocab = []
    for feature in text_features:
        conter = Counter()
        for item in train_data[feature[1]].values:
            for word in str(item).split():
                conter[word] += 1
        most_common = conter.most_common(100000)
        dic = {}
        for idx, x in enumerate(most_common):
            dic[x[0]] = idx + 1
        vocab.append(dic)

    # Set parameters
    args.vocab_size_v1 = config.vocab_size_v1
    args.vocab_dim_v1 = config.vocab_dim_v1
    args.vocab = vocab
    args.text_dim = sum([x[2] for x in text_features])
    args.text_features = text_features
    train_dataset = TextDataset(args, train_data, embedding_table)
    dev_dataset = TextDataset(args, dev_data, embedding_table)
    args.vocab_size = [len(x) + 1 for x in vocab]
    # Build the model
    model = Model(model, config, args)
    # If a checkpoint exists, load it
    if os.path.exists(checkpoint_last) and os.listdir(checkpoint_last):
        logger.info("Load model from %s",
                    os.path.join(checkpoint_last, "model.bin"))
        model.load_state_dict(
            torch.load(os.path.join(checkpoint_last, "model.bin")))

    # Train
    train(args, train_dataset, dev_dataset, model)
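
set_seed, called near the top of main(), is also defined outside this example; a minimal sketch, under the assumption that it seeds Python, NumPy, and PyTorch, follows.

import random
import numpy as np
import torch

def set_seed(args):
    # Seed every RNG the run touches; args.seed and args.n_gpu are set in main().
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)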
Example #4
def train(args):
    # Build the vocabulary object
    vocab = Vocab(args.vocab_file, 50000, args.train_data_path)

    # Keep the word-and-id dictionary on args
    args.vocab = vocab

    # Load the pretrained embeddings
    embs = load_pkl('E:/CodeSleepEatRepeat/data/58tech/data/word2vec.txt')

    # Build the MLM training data
    batches = batcher(args, embs)

    # load pretrained model
    if args.pre_trained_model:
        config = RobertaConfig.from_pretrained(args.pre_trained_model)
        model_roberta = TFRobertaModel.from_pretrained(args.pre_trained_model,
                                                       config=config)
    else:
        # Hugging Face Transformers model configuration
        config = RobertaConfig()
        config.num_hidden_layers = args.num_hidden_layers  # 12
        config.hidden_size = args.hidden_size  # 128
        config.intermediate_size = args.hidden_size * 4
        config.num_attention_heads = args.num_attention_heads  # 8
        config.vocab_size = args.vocab.word_size()

        model_roberta = TFRobertaModel(config)

    model = Model_Roberta(args, model_roberta)
    # model.summary()

    optimizer = tf.keras.optimizers.Nadam()
    loss_func = tf.keras.losses.SparseCategoricalCrossentropy()

    train_loss = tf.keras.metrics.Mean(name='train_loss')
    train_metric = tf.keras.metrics.SparseCategoricalAccuracy(
        name='train_accuracy')

    # checkpoint_dir = args.checkpoints_dir
    # ckpt = tf.train.Checkpoint(model=model)
    # ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_dir, max_to_keep=3)

    if args.checkpoints_dir:
        print("Creating the checkpoint manager")
        checkpoint_dir = args.checkpoints_dir
        ckpt = tf.train.Checkpoint(model=model)
        ckpt_manager = tf.train.CheckpointManager(ckpt,
                                                  checkpoint_dir,
                                                  max_to_keep=5)

        if ckpt_manager.latest_checkpoint:
            # ckpt.restore('./checkpoints/ckpt-53')
            ckpt.restore(ckpt_manager.latest_checkpoint)
            print("Restored from {}".format(ckpt_manager.latest_checkpoint))
        else:
            print("Initializing from scratch.")

    count = 0
    best_loss = 20
    for epoch in tf.range(1, args.epochs + 1):

        for batch in batches:
            # inputs, inputs_ids, attention_masks, labels = batch[0], batch[1], batch[2], batch[3]
            gradients, loss, predictions, labels = train_step(
                model, batch, loss_func, args)

            optimizer.apply_gradients(zip(gradients,
                                          model.trainable_variables))

            train_loss.update_state(loss)
            train_metric.update_state(labels, predictions)

            logs = 'Epoch={},Loss:{},Accuracy:{}'

            # print(predictions)
            # print('-'*20)
            # print(masks_labels)
            # print('*'*20)
            # print(tf.reduce_mean(loss))
            # print('='*20)
            # label = tf.argmax(predictions[0])
            # print(label)

            if count % 100 == 0 and count != 0:
                tf.print(
                    tf.strings.format(
                        logs,
                        (epoch, train_loss.result(), train_metric.result())))
                tf.print("")
                if count % 1000 == 0 and train_loss.result() < best_loss:
                    best_loss = train_loss.result()
                    ckpt_save_path = ckpt_manager.save()
                    print('*' * 20)
                    print('Saving checkpoint for epoch {} at {} ,best loss {}'.
                          format(epoch, ckpt_save_path, best_loss))
                    print('*' * 20)
            count += 1

        train_loss.reset_states()
        train_metric.reset_states()

    model.encoder.save_pretrained('./pretrained-roberta/')
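
train_step, used in the loop above, is not part of the example; the sketch below shows one plausible implementation, assuming each batch unpacks as (inputs, input_ids, attention_masks, labels) as the commented-out line in the loop suggests, and that Model_Roberta accepts ids and attention masks directly.

import tensorflow as tf

def train_step(model, batch, loss_func, args):
    # Hypothetical helper: returns the gradients that the caller applies, plus
    # the loss, predictions, and labels used to update the metrics.
    inputs, input_ids, attention_masks, labels = batch
    with tf.GradientTape() as tape:
        predictions = model(input_ids, attention_masks, training=True)
        loss = loss_func(labels, predictions)
    gradients = tape.gradient(loss, model.trainable_variables)
    return gradients, loss, predictions, labels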