def __init__(self, classifier_config_dir, device, task_type, n_clf_layers=6,
             use_dm=True, use_pm=True, use_rt=True, use_bio=False,
             use_name=False, use_network=False, use_count=False):
    super(ConcatenatedClassifier, self).__init__()
    # load text model
    self.device = device
    self.task_type = task_type
    self.use_text = use_dm | use_pm | use_rt
    self.use_bio = use_bio
    self.use_name = use_name
    self.use_etc = use_network | use_count
    self.text_model = RobertaModel.from_pretrained(
        "vinai/bertweet-base",
        output_attentions=False,
        output_hidden_states=False)
    if self.use_name:
        self.charEmbedding = nn.Embedding(
            num_embeddings=302, embedding_dim=300,
            padding_idx=301)  # 302 = 300 most frequent chars + pad + unk
        self.conv3 = nn.Conv1d(in_channels=300, out_channels=256, kernel_size=3, padding=1)
        self.conv4 = nn.Conv1d(in_channels=300, out_channels=256, kernel_size=4, padding=1)
        self.conv5 = nn.Conv1d(in_channels=300, out_channels=256, kernel_size=5, padding=1)
    # load classifier for combining these features
    config = RobertaConfig()
    config = config.from_json_file(classifier_config_dir)
    config.num_hidden_layers = n_clf_layers
    config.num_attention_heads = n_clf_layers
    config.max_position_embeddings = 7
    if self.use_bio:
        config.max_position_embeddings += 2
    if self.use_name:
        config.max_position_embeddings += 4
    self.concat_model = RobertaModel(config)
    self.classifier = ClassifierLayer(use_count=use_count, use_network=use_network)
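# A minimal, self-contained sketch of the pattern used above for the feature
# combiner: customise a RobertaConfig and build a randomly initialised
# RobertaModel from it. The hyperparameter values and tensor shapes below are
# illustrative assumptions, not taken from the original repository.
import torch
from transformers import RobertaConfig, RobertaModel

combiner_config = RobertaConfig(
    num_hidden_layers=6,            # small stack, as with n_clf_layers above
    num_attention_heads=6,
    hidden_size=768,
    intermediate_size=3072,
    max_position_embeddings=16,     # enough positions for a short feature sequence
                                    # (RoBERTa offsets position ids by pad_token_id + 1)
    vocab_size=10)                  # tiny vocab: inputs arrive as embeddings, not token ids
combiner = RobertaModel(combiner_config)

# Feed a sequence of pre-computed feature vectors through the combiner.
features = torch.randn(2, 7, combiner_config.hidden_size)   # (batch, n_features, hidden)
out = combiner(inputs_embeds=features).last_hidden_state    # (2, 7, hidden)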
def __init__(self, args):
    super(Model, self).__init__()
    args.out_size = len(args.dense_features)
    self.dropout = nn.Dropout(args.hidden_dropout_prob)
    self.args = args

    # Create the BERT model and load the pretrained weights
    config = RobertaConfig.from_pretrained(args.pretrained_model_path)
    config.output_hidden_states = True
    args.hidden_size = config.hidden_size
    args.num_hidden_layers = config.num_hidden_layers
    self.text_layer = RobertaModel.from_pretrained(
        args.pretrained_model_path, config=config)
    self.text_linear = nn.Linear(
        args.text_dim + args.vocab_dim_v1 * len(args.text_features),
        args.hidden_size)
    logger.info("Load linear from %s",
                os.path.join(args.pretrained_model_path, "linear.bin"))
    self.text_linear.load_state_dict(
        torch.load(os.path.join(args.pretrained_model_path, "linear.bin")))
    logger.info("Load embeddings from %s",
                os.path.join(args.pretrained_model_path, "embeddings.bin"))
    self.text_embeddings = nn.Embedding.from_pretrained(
        torch.load(os.path.join(args.pretrained_model_path, "embeddings.bin"))['weight'],
        freeze=True)
    args.out_size += args.hidden_size * 2

    # Create the decoder model, randomly initialized
    config = RobertaConfig()
    config.num_hidden_layers = 4
    config.intermediate_size = 2048
    config.hidden_size = 512
    config.num_attention_heads = 16
    config.vocab_size = 5
    self.text_layer_1 = RobertaModel(config=config)
    self.text_layer_1.apply(self._init_weights)
    self.text_linear_1 = nn.Linear(args.text_dim_1 + args.hidden_size, 512)
    self.text_linear_1.apply(self._init_weights)
    self.norm = nn.BatchNorm1d(args.text_dim_1 + args.hidden_size)
    args.out_size += 1024

    # Create the classifier, randomly initialized
    self.classifier = ClassificationHead(args)
    self.classifier.apply(self._init_weights)
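# Sketch of how the "linear.bin" / "embeddings.bin" files consumed above could
# be produced on the pretraining side. The directory name and layer shapes are
# illustrative assumptions; only the file names and save format follow the code
# above (plain state_dicts, with the embedding weight read back via ['weight']).
import os
import torch
import torch.nn as nn

pretrained_model_path = "saved_models/checkpoint-last"   # hypothetical directory
os.makedirs(pretrained_model_path, exist_ok=True)

text_linear = nn.Linear(1024 + 64 * 8, 768)   # e.g. text_dim + vocab_dim_v1 * len(text_features) -> hidden_size
text_embeddings = nn.Embedding(100000, 64)    # input-side id embedding table (size illustrative)

torch.save(text_linear.state_dict(), os.path.join(pretrained_model_path, "linear.bin"))
torch.save(text_embeddings.state_dict(), os.path.join(pretrained_model_path, "embeddings.bin"))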
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--output_dir", default="saved_models", type=str, required=False,
                        help="The output directory where the model predictions and checkpoints will be written.")

    ## Other parameters
    parser.add_argument("--eval_data_file", default=None, type=str, required=False,
                        help="An optional input evaluation data file to evaluate the perplexity on (a text file).")
    parser.add_argument("--model_type", default="roberta", type=str,
                        help="The model architecture to be fine-tuned.")
    parser.add_argument("--model_name_or_path", default=None, type=str,
                        help="The model checkpoint for weights initialization.")
    parser.add_argument("--mlm", action='store_true',
                        help="Train with masked-language modeling loss instead of language modeling.")
    parser.add_argument("--mlm_probability", type=float, default=0.2,
                        help="Ratio of tokens to mask for masked language modeling loss")
    parser.add_argument("--config_name", default="roberta-base", type=str,
                        help="Optional pretrained config name or path if not the same as model_name_or_path")
    parser.add_argument("--tokenizer_name", default="", type=str,
                        help="Optional pretrained tokenizer name or path if not the same as model_name_or_path")
    parser.add_argument("--cache_dir", default="", type=str,
                        help="Optional directory to store the pre-trained models downloaded from s3 (instead of the default one)")
    parser.add_argument("--block_size", default=128, type=int,
                        help="Optional input sequence length after tokenization. "
                             "The training dataset will be truncated into blocks of this size for training. "
                             "Defaults to the model max input length for single-sentence inputs (taking special tokens into account).")
    parser.add_argument("--dfg_size", default=64, type=int,
                        help="Optional input sequence length after tokenization. "
                             "The training dataset will be truncated into blocks of this size for training. "
                             "Defaults to the model max input length for single-sentence inputs (taking special tokens into account).")
    parser.add_argument("--do_train", action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval", action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--evaluate_during_training", action='store_true',
                        help="Run evaluation during training at each logging step.")
    parser.add_argument("--do_lower_case", action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--per_gpu_train_batch_size", default=64, type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size", default=64, type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of update steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay", default=0.01, type=float,
                        help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon", default=1e-6, type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float,
                        help="Max gradient norm.")
    parser.add_argument("--num_train_epochs", default=1.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--max_steps", default=100000, type=int,
                        help="If > 0: set total number of training steps to perform. Overrides num_train_epochs.")
) parser.add_argument("--warmup_steps", default=10000, type=int, help="Linear warmup over warmup_steps.") parser.add_argument('--logging_steps', type=int, default=50, help="Log every X updates steps.") parser.add_argument('--save_steps', type=int, default=10000, help="Save checkpoint every X updates steps.") parser.add_argument( '--save_total_limit', type=int, default=500, help= 'Limit the total amount of checkpoints, delete the older checkpoints in the output_dir, does not delete by default' ) parser.add_argument( "--eval_all_checkpoints", action='store_true', help= "Evaluate all checkpoints starting with the same prefix as model_name_or_path ending and ending with step number" ) parser.add_argument("--no_cuda", action='store_true', help="Avoid using CUDA when available") parser.add_argument('--overwrite_output_dir', action='store_true', help="Overwrite the content of the output directory") parser.add_argument( '--overwrite_cache', action='store_true', help="Overwrite the cached training and evaluation sets") parser.add_argument('--seed', type=int, default=123456, help="random seed for initialization") parser.add_argument( '--fp16', action='store_true', help= "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit" ) parser.add_argument( '--fp16_opt_level', type=str, default='O1', help= "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." "See details at https://nvidia.github.io/apex/amp.html") parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank") parser.add_argument('--server_ip', type=str, default='', help="For distant debugging.") parser.add_argument('--server_port', type=str, default='', help="For distant debugging.") parser.add_argument('--log_file', type=str, default='') parser.add_argument('--tensorboard_dir', type=str, default='saved_models/tensorboard_logs') parser.add_argument('--lang', type=str) parser.add_argument('--pretrain', type=str, default='') args = parser.parse_args() pool = None device = torch.device( "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") args.n_gpu = torch.cuda.device_count() args.device = device # 设置log信息 logging.basicConfig( format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt='%m/%d/%Y %H:%M:%S', level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN) logger.warning( "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16) # 设置随机种子 set_seed(args) # 判断是否有checkpoint,从而继续预训练 args.start_epoch = 0 args.start_step = 0 checkpoint_last = os.path.join(args.output_dir, 'checkpoint-last') if os.path.exists(checkpoint_last) and os.listdir(checkpoint_last): args.model_name_or_path = os.path.join(checkpoint_last, 'pytorch_model.bin') args.config_name = os.path.join(checkpoint_last, 'config.json') step_file = os.path.join(checkpoint_last, 'step_file.txt') if os.path.exists(step_file): with open(step_file, encoding='utf-8') as stepf: args.start_step = int(stepf.readlines()[0].strip()) logger.info("reload model from {}, resume from {} epoch".format( checkpoint_last, args.start_epoch)) config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type] base_path = "../src/data" text_features = [ [ base_path + "/sequence_text_user_id_product_id.128d", 'sequence_text_user_id_product_id', 128, True ], [ base_path + "/sequence_text_user_id_ad_id.128d", 'sequence_text_user_id_ad_id', 128, True ], [ base_path 
+ "/sequence_text_user_id_creative_id.128d", 'sequence_text_user_id_creative_id', 128, True ], [ base_path + "/sequence_text_user_id_advertiser_id.128d", 'sequence_text_user_id_advertiser_id', 128, True ], [ base_path + "/sequence_text_user_id_industry.128d", 'sequence_text_user_id_industry', 128, True ], [ base_path + "/sequence_text_user_id_product_category.128d", 'sequence_text_user_id_product_category', 128, True ], [ base_path + "/sequence_text_user_id_time.128d", 'sequence_text_user_id_time', 128, True ], [ base_path + "/sequence_text_user_id_click_times.128d", 'sequence_text_user_id_click_times', 128, True ], ] #读取训练数据 train_df = pd.read_pickle(os.path.join(base_path, 'train_user.pkl')) test_df = pd.read_pickle(os.path.join(base_path, 'test_user.pkl')) dev_data = train_df.iloc[-10000:] train_data = train_df.iloc[:-10000].append(test_df) #创建输入端的词表,每个域最多保留10w个id try: dic = pickle.load( open(os.path.join(args.output_dir, 'vocab.pkl'), 'rb')) except: dic = {} dic['pad'] = 0 dic['mask'] = 1 dic['unk'] = 2 for feature in text_features: conter = Counter() for item in train_df[feature[1]].values: for word in str(item).split(): try: conter[(feature[1], word)] += 1 except: conter[(feature[1], word)] = 1 most_common = conter.most_common(100000) cont = 0 for x in most_common: if x[1] > 5: dic[x[0]] = len(dic) cont += 1 if cont < 10: print(x[0], dic[x[0]]) print(cont) #读取或重新创建BERT if args.model_name_or_path is not None: config = config_class.from_pretrained( args.config_name if args.config_name else args.model_name_or_path, cache_dir=args.cache_dir if args.cache_dir else None) model = model_class.from_pretrained( args.model_name_or_path, from_tf=bool('.ckpt' in args.model_name_or_path), config=config, cache_dir=args.cache_dir if args.cache_dir else None) args.text_dim = config.hidden_size else: config = RobertaConfig() config.num_hidden_layers = 12 config.hidden_size = 512 config.intermediate_size = config.hidden_size * 4 config.num_attention_heads = 16 config.vocab_size = 5 model = model_class(config) config.vocab_size_v1 = len(dic) config.vocab_dim_v1 = 64 logger.info("%s", config) logger.info("Training/evaluation parameters %s", args) #保存输入端词表 args.vocab_dic = dic pickle.dump(dic, open(os.path.join(args.output_dir, 'vocab.pkl'), 'wb')) #读取word embedding import gensim embedding_table = [] for x in text_features: print(x) embedding_table.append(pickle.load(open(x[0], 'rb'))) #创建输出端词表,每个域最多保留10w个id vocab = [] for feature in text_features: conter = Counter() for item in train_data[feature[1]].values: for word in str(item).split(): try: conter[word] += 1 except: conter[word] = 1 most_common = conter.most_common(100000) dic = {} for idx, x in enumerate(most_common): dic[x[0]] = idx + 1 vocab.append(dic) #设置参数 args.vocab_size_v1 = config.vocab_size_v1 args.vocab_dim_v1 = config.vocab_dim_v1 args.vocab = vocab args.text_dim = sum([x[2] for x in text_features]) args.text_features = text_features train_dataset = TextDataset(args, train_data, embedding_table) dev_dataset = TextDataset(args, dev_data, embedding_table) args.vocab_size = [len(x) + 1 for x in vocab] #创建模型 model = Model(model, config, args) #如果有checkpoint,读取checkpoint if os.path.exists(checkpoint_last) and os.listdir(checkpoint_last): logger.info("Load model from %s", os.path.join(checkpoint_last, "model.bin")) model.load_state_dict( torch.load(os.path.join(checkpoint_last, "model.bin"))) #训练 train(args, train_dataset, dev_dataset, model)
def train(args):
    # Build the vocabulary object
    vocab = Vocab(args.vocab_file, 50000, args.train_data_path)
    # Keep the word-to-id dictionary on args
    args.vocab = vocab
    # Load the pretrained embeddings
    embs = load_pkl('E:/CodeSleepEatRepeat/data/58tech/data/word2vec.txt')
    # Build the MLM training batches
    batches = batcher(args, embs)

    # Load a pretrained model, or configure and build one from scratch
    if args.pre_trained_model:
        config = RobertaConfig.from_pretrained(args.pre_trained_model)
        model_roberta = TFRobertaModel.from_pretrained(args.pre_trained_model, config=config)
    else:
        # Hugging Face transformers model configuration
        config = RobertaConfig()
        config.num_hidden_layers = args.num_hidden_layers  # 12
        config.hidden_size = args.hidden_size  # 128
        config.intermediate_size = args.hidden_size * 4
        config.num_attention_heads = args.num_attention_heads  # 8
        config.vocab_size = args.vocab.word_size()
        model_roberta = TFRobertaModel(config)
    model = Model_Roberta(args, model_roberta)
    # model.summary()

    optimizer = tf.keras.optimizers.Nadam()
    loss_func = tf.keras.losses.SparseCategoricalCrossentropy()
    train_loss = tf.keras.metrics.Mean(name='train_loss')
    train_metric = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')

    if args.checkpoints_dir:
        print("Creating the checkpoint manager")
        checkpoint_dir = args.checkpoints_dir
        ckpt = tf.train.Checkpoint(model=model)
        ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_dir, max_to_keep=5)
        if ckpt_manager.latest_checkpoint:
            ckpt.restore(ckpt_manager.latest_checkpoint)
            print("Restored from {}".format(ckpt_manager.latest_checkpoint))
        else:
            print("Initializing from scratch.")

    count = 0
    best_loss = 20
    for epoch in tf.range(1, args.epochs + 1):
        for batch in batches:
            # batch = (inputs, inputs_ids, attention_masks, labels)
            gradients, loss, predictions, labels = train_step(model, batch, loss_func, args)
            optimizer.apply_gradients(zip(gradients, model.trainable_variables))
            train_loss.update_state(loss)
            train_metric.update_state(labels, predictions)
            logs = 'Epoch={},Loss:{},Accuracy:{}'
            if count % 100 == 0 and count != 0:
                tf.print(tf.strings.format(
                    logs, (epoch, train_loss.result(), train_metric.result())))
                tf.print("")
            if count % 1000 == 0 and train_loss.result() < best_loss:
                best_loss = train_loss.result()
                ckpt_save_path = ckpt_manager.save()
                print('*' * 20)
                print('Saving checkpoint for epoch {} at {}, best loss {}'.format(
                    epoch, ckpt_save_path, best_loss))
                print('*' * 20)
            count += 1
        train_loss.reset_states()
        train_metric.reset_states()
    model.encoder.save_pretrained('./pretrained-roberta/')
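# Sketch of reusing the MLM-pretrained encoder saved by train() above via
# save_pretrained(). Reloading with TFRobertaModel.from_pretrained is the
# standard transformers round-trip; the downstream head and the 2-class task
# below are illustrative assumptions, not part of the original code.
import tensorflow as tf
from transformers import TFRobertaModel

encoder = TFRobertaModel.from_pretrained('./pretrained-roberta/')

input_ids = tf.keras.Input(shape=(None,), dtype=tf.int32, name='input_ids')
attention_mask = tf.keras.Input(shape=(None,), dtype=tf.int32, name='attention_mask')
sequence_output = encoder(input_ids, attention_mask=attention_mask).last_hidden_state
cls_vector = sequence_output[:, 0, :]            # first-token representation
logits = tf.keras.layers.Dense(2)(cls_vector)    # hypothetical 2-class downstream task
downstream = tf.keras.Model([input_ids, attention_mask], logits)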