# --- Biaffine NER entry point ---
# Imports inferred from usage; BertConfig is assumed to come from the
# `transformers` package, and util/data_helper/train_helper/biaffine_ner
# are this project's local modules.
import argparse
import logging
import os
import random

import numpy as np
import torch
from transformers import BertConfig

import biaffine_ner
import data_helper
import train_helper
import util

logger = logging.getLogger(__name__)


def main():
    parser = argparse.ArgumentParser()
    # 1. Paths to the training and test data
    parser.add_argument("--data_dir", default='./data/cluener', type=str,
                        help="Path to data.")
    # 2. Paths to the pretrained model
    parser.add_argument("--vocab_file", default="data/pretrain/vocab.txt", type=str,
                        help="Init vocab to resume training from.")
    parser.add_argument("--config_path", default="data/pretrain/config.json", type=str,
                        help="Init config to resume training from.")
    parser.add_argument("--init_checkpoint", default="data/pretrain/pytorch_model.bin", type=str,
                        help="Init checkpoint to resume training from.")
    # 3. Checkpoint saving and loading
    parser.add_argument("--save_path", default="./check_points", type=str,
                        help="Path to save checkpoints.")
    parser.add_argument("--load_path", default=None, type=str,
                        help="Path to load checkpoints.")
    # Training and evaluation parameters.
    # NOTE: argparse's type=bool converts any non-empty string to True, so
    # these flags only behave as expected through their defaults; see the
    # str2bool sketch after this listing.
    parser.add_argument("--do_train", default=True, type=bool,
                        help="Whether to perform training.")
    parser.add_argument("--do_eval", default=True, type=bool,
                        help="Whether to perform evaluation on the dev data set.")
    parser.add_argument("--do_predict", default=False, type=bool,
                        help="Whether to perform prediction on the test data set.")
    parser.add_argument("--do_adv", default=True, type=bool,
                        help="Whether to use adversarial training.")
    parser.add_argument("--epochs", default=20, type=int,
                        help="Number of epochs for fine-tuning.")
    parser.add_argument("--train_batch_size", default=16, type=int,
                        help="Number of examples per batch for training.")
    parser.add_argument("--eval_batch_size", default=1, type=int,
                        help="Number of examples per batch for eval.")
    parser.add_argument("--max_seq_len", default=256, type=int,
                        help="Number of tokens in the longest sequence.")
    parser.add_argument("--learning_rate", default=1e-5, type=float,
                        help="Learning rate used to train with warmup.")
    parser.add_argument("--warmup_proportion", default=0.01, type=float,
                        help="Proportion of training to perform linear learning "
                             "rate warmup for. E.g., 0.1 = 10% of training.")
    parser.add_argument("--use_cuda", type=bool, default=True,
                        help="Whether to use CUDA.")
    parser.add_argument("--log_steps", type=int, default=20,
                        help="The step interval at which to print the loss.")
    parser.add_argument("--eval_step", type=int, default=200,
                        help="The step interval at which to run evaluation.")
    parser.add_argument('--seed', type=int, default=42,
                        help="Random seed for initialization.")
    args = parser.parse_args()

    if args.use_cuda:
        device = torch.device("cuda")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cpu")
        n_gpu = 0
    logger.info("device: {}, n_gpu: {}".format(device, n_gpu))

    # Seed every RNG so runs are reproducible.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    # Suffix the save directory when adversarial training is enabled.
    if not os.path.exists(args.save_path):
        os.mkdir(args.save_path)
    model_path_postfix = ''
    if args.do_adv:
        model_path_postfix += '_adv'
    args.save_path = os.path.join(args.save_path, model_path_postfix)
    if not os.path.exists(args.save_path):
        os.mkdir(args.save_path)

    # CLUENER label map; id 0 is reserved for the non-entity class NONE.
    # (The original listing mapped both NONE and address to 0, which makes
    # the id2label inversion collide, so the entity types are shifted to 1..10.)
    args.label2id = {
        'NONE': 0,
        'address': 1,
        'book': 2,
        'company': 3,
        'game': 4,
        'government': 5,
        'movie': 6,
        'name': 7,
        'organization': 8,
        'position': 9,
        'scene': 10,
    }
    args.id2label = {v: k for k, v in args.label2id.items()}

    bert_tokenizer = util.CNerTokenizer.from_pretrained(args.vocab_file)
    bert_config = BertConfig.from_pretrained(args.config_path,
                                             num_labels=len(args.label2id))

    # Load the data sets.
    train_dataset = None
    eval_dataset = None
    test_dataset = None
    if args.do_train:
        logger.info("loading train dataset")
        train_dataset = data_helper.NER_dataset(
            os.path.join(args.data_dir, 'train.json'),
            bert_tokenizer, args.max_seq_len, args.label2id)
    if args.do_eval:
        logger.info("loading eval dataset")
        eval_dataset = data_helper.NER_dataset(
            os.path.join(args.data_dir, 'dev.json'),
            bert_tokenizer, args.max_seq_len, args.label2id, shuffle=False)
    if args.do_predict:
        logger.info("loading test dataset")
        test_dataset = data_helper.NER_dataset(
            os.path.join(args.data_dir, 'test.json'),
            bert_tokenizer, args.max_seq_len, args.label2id, shuffle=False)

    if args.do_train:
        logging.info("Start training!")
        train_helper.train(bert_tokenizer, bert_config, args,
                           train_dataset, eval_dataset)
    if not args.do_train and args.do_eval:
        logging.info("Start evaluating!")
        biaffine_model = biaffine_ner.Biaffine_NER(bert_config)
        biaffine_model.load_state_dict(torch.load(args.load_path))
        logging.info("Checkpoint %s has been loaded!" % args.load_path)
        if args.use_cuda:
            biaffine_model.cuda()
        train_helper.evaluate(args, eval_dataset, biaffine_model)
    if args.do_predict:
        logging.info("Start predicting!")
        biaffine_model = biaffine_ner.Biaffine_NER(bert_config)
        biaffine_model.load_state_dict(torch.load(args.load_path))
        logging.info("Checkpoint %s has been loaded!" % args.load_path)
        if args.use_cuda:
            biaffine_model.cuda()
        predict_res = train_helper.predict(args, test_dataset, biaffine_model)
# --- Span + type (pipeline) NER entry point ---
# Imports inferred from usage; BertConfig/BertModel are assumed to come
# from `transformers`, and util/data_helper/train_helper/span_type are
# this project's local modules.
import argparse
import logging
import os
import random

import numpy as np
import torch
from transformers import BertConfig, BertModel

import data_helper
import span_type
import train_helper
import util

logger = logging.getLogger(__name__)


def main():
    parser = argparse.ArgumentParser()
    # 1. Paths to the training and test data
    parser.add_argument("--data_dir", default='./data/cluener', type=str,
                        help="Path to data.")
    # 2. Paths to the pretrained model
    parser.add_argument("--vocab_file", default="data/pretrain/vocab.txt", type=str,
                        help="Init vocab to resume training from.")
    parser.add_argument("--config_path", default="data/pretrain/config.json", type=str,
                        help="Init config to resume training from.")
    parser.add_argument("--init_checkpoint", default="data/pretrain/pytorch_model.bin", type=str,
                        help="Init checkpoint to resume training from.")
    # 3. Checkpoint saving and loading
    parser.add_argument("--save_path", default="./check_points", type=str,
                        help="Path to save checkpoints.")
    parser.add_argument("--load_path", default=None, type=str,
                        help="Path to load checkpoints.")
    # Training and evaluation parameters (same type=bool caveat as above).
    parser.add_argument("--do_train", default=True, type=bool,
                        help="Whether to perform training.")
    parser.add_argument("--do_eval", default=True, type=bool,
                        help="Whether to perform evaluation on the dev data set.")
    parser.add_argument("--do_predict", default=False, type=bool,
                        help="Whether to perform prediction on the test data set.")
    parser.add_argument("--do_adv", default=True, type=bool,
                        help="Whether to use adversarial training.")
    parser.add_argument("--epochs", default=20, type=int,
                        help="Number of epochs for fine-tuning.")
    parser.add_argument("--train_batch_size", default=16, type=int,
                        help="Number of examples per batch for training.")
    parser.add_argument("--eval_batch_size", default=1, type=int,
                        help="Number of examples per batch for eval.")
    parser.add_argument("--max_seq_len", default=256, type=int,
                        help="Number of tokens in the longest sequence.")
    parser.add_argument("--learning_rate", default=1e-5, type=float,
                        help="Learning rate used to train with warmup.")
    parser.add_argument("--warmup_proportion", default=0.01, type=float,
                        help="Proportion of training to perform linear learning "
                             "rate warmup for. E.g., 0.1 = 10% of training.")
    parser.add_argument("--use_cuda", type=bool, default=True,
                        help="Whether to use CUDA.")
    parser.add_argument("--log_steps", type=int, default=20,
                        help="The step interval at which to print the loss.")
    parser.add_argument("--eval_step", type=int, default=100,
                        help="The step interval at which to run evaluation.")
    parser.add_argument('--seed', type=int, default=42,
                        help="Random seed for initialization.")
    args = parser.parse_args()

    if args.use_cuda:
        device = torch.device("cuda")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cpu")
        n_gpu = 0
    logger.info("device: {}, n_gpu: {}".format(device, n_gpu))

    # Seed every RNG so runs are reproducible.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    # Suffix the save directory when adversarial training is enabled.
    if not os.path.exists(args.save_path):
        os.mkdir(args.save_path)
    model_path_postfix = ''
    if args.do_adv:
        model_path_postfix += '_adv'
    args.save_path = os.path.join(args.save_path, 'ner' + model_path_postfix)
    if not os.path.exists(args.save_path):
        os.mkdir(args.save_path)

    bert_tokenizer = util.CNerTokenizer.from_pretrained(args.vocab_file)
    bert_config = BertConfig.from_pretrained(args.config_path)
    # Entity-type map is read from the data directory by the tokenizer.
    type2id = bert_tokenizer.get_label(args.data_dir)
    args.type2id = type2id
    args.id2type = {v: k for k, v in type2id.items()}

    # Load the data.
    train_dataset = None
    if args.do_train:
        logger.info("loading train dataset")
        train_dataset = data_helper.NER_dataset(
            os.path.join(args.data_dir, 'train.json'),
            bert_tokenizer, args.max_seq_len, args.type2id)

    if args.do_train:
        logging.info("Start training!")
        train_helper.train(bert_tokenizer, bert_config, args, train_dataset)
    if not args.do_train and args.do_eval:
        logging.info("Start evaluating!")
        bert_model = BertModel.from_pretrained(args.init_checkpoint, config=bert_config)
        span_model = span_type.EntitySpan(bert_config)
        type_model = span_type.EntityType()
        # The checkpoint bundles the three sub-models in one state dict.
        state = torch.load(args.load_path)
        bert_model.load_state_dict(state['bert_state_dict'])
        span_model.load_state_dict(state['span_state_dict'])
        type_model.load_state_dict(state['type_state_dict'])
        logging.info("Checkpoint %s has been loaded!" % args.load_path)
        if args.use_cuda:
            bert_model.cuda()
            span_model.cuda()
            type_model.cuda()
        model_list = [bert_model, span_model, type_model]
        train_helper.evaluate(args, bert_tokenizer, model_list)
    if args.do_predict:
        logging.info("Start predicting!")
        bert_model = BertModel.from_pretrained(args.init_checkpoint, config=bert_config)
        span_model = span_type.EntitySpan(bert_config)
        type_model = span_type.EntityType()
        state = torch.load(args.load_path)
        bert_model.load_state_dict(state['bert_state_dict'])
        span_model.load_state_dict(state['span_state_dict'])
        type_model.load_state_dict(state['type_state_dict'])
        logging.info("Checkpoint %s has been loaded!" % args.load_path)
        if args.use_cuda:
            bert_model.cuda()
            span_model.cuda()
            type_model.cuda()
        model_list = [bert_model, span_model, type_model]
        predict_res = train_helper.predict(args, bert_tokenizer, model_list)
# --- BiLSTM-CRF NER entry point ---
# Imports inferred from usage; util/data_helper/train_helper/ner_model are
# this project's local modules.
import argparse
import logging
import os
import random

import numpy as np
import torch

import data_helper
import ner_model
import train_helper
import util

logger = logging.getLogger(__name__)


def main():
    parser = argparse.ArgumentParser()
    # 1. Paths to the training and test data
    parser.add_argument("--data_dir", default='./data/cluener', type=str,
                        help="Path to train/dev/test data.")
    # 2. Path to pretrained word embeddings
    # parser.add_argument("--pretrain_emb", default=None, type=str,
    #                     help="Path to pretrained word embeddings.")
    # 3. Checkpoint saving and loading
    parser.add_argument("--save_path", default="./check_points", type=str,
                        help="Path to save checkpoints.")
    parser.add_argument("--load_path", default=None, type=str,
                        help="Path to load checkpoints.")
    # 4. Model hyperparameters
    parser.add_argument('--embedding_size', default=128, type=int)
    parser.add_argument('--hidden_size', default=384, type=int)
    # Training and evaluation parameters (same type=bool caveat as above).
    parser.add_argument("--do_train", default=True, type=bool,
                        help="Whether to perform training.")
    parser.add_argument("--do_eval", default=True, type=bool,
                        help="Whether to perform evaluation on the dev data set.")
    parser.add_argument("--do_test", default=False, type=bool,
                        help="Whether to perform prediction on the test data set.")
    parser.add_argument("--do_adv", default=True, type=bool,
                        help="Whether to use adversarial training.")
    parser.add_argument("--epochs", default=50, type=int,
                        help="Number of epochs for training.")
    parser.add_argument("--train_batch_size", default=32, type=int,
                        help="Number of examples per batch for training.")
    parser.add_argument("--eval_batch_size", default=64, type=int,
                        help="Number of examples per batch for eval.")
    parser.add_argument("--max_seq_len", default=256, type=int,
                        help="Number of tokens in the longest sequence.")
    parser.add_argument("--learning_rate", default=1e-3, type=float,
                        help="Learning rate used to train with warmup.")
    parser.add_argument("--use_cuda", type=bool, default=True,
                        help="Whether to use CUDA.")
    parser.add_argument("--log_steps", type=int, default=20,
                        help="The step interval at which to print the loss.")
    parser.add_argument("--eval_step", type=int, default=200,
                        help="The step interval at which to run evaluation.")
    parser.add_argument('--seed', type=int, default=42,
                        help="Random seed for initialization.")
    args = parser.parse_args()

    if args.use_cuda:
        device = torch.device("cuda")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cpu")
        n_gpu = 0
    logger.info("device: {}, n_gpu: {}".format(device, n_gpu))

    # Seed every RNG so runs are reproducible.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    # Suffix the save directory when adversarial training is enabled.
    if not os.path.exists(args.save_path):
        os.mkdir(args.save_path)
    model_path_postfix = ''
    if args.do_adv:
        model_path_postfix += '_adv'
    args.save_path = os.path.join(args.save_path, 'ner' + model_path_postfix)
    if not os.path.exists(args.save_path):
        os.mkdir(args.save_path)

    # Build the vocabulary and label maps from the data directory.
    tokenizer = util.CNerTokenizer(args.data_dir)
    tokenizer.get_vocab()
    args.tokenizer = tokenizer
    args.label2id = tokenizer.get_label()
    args.id2label = {v: k for k, v in args.label2id.items()}
    args.num_labels = len(args.label2id)

    # Load the data sets.
    train_dataset = None
    eval_dataset = None
    test_dataset = None
    if args.do_train:
        logger.info("loading train dataset")
        train_dataset = data_helper.NER_dataset(
            os.path.join(args.data_dir, 'train.json'),
            args.tokenizer, args.max_seq_len, args.label2id)
    if args.do_eval:
        logger.info("loading eval dataset")
        eval_dataset = data_helper.NER_dataset(
            os.path.join(args.data_dir, 'dev.json'),
            args.tokenizer, args.max_seq_len, args.label2id, shuffle=False)
    if args.do_test:
        logger.info("loading test dataset")
        test_dataset = data_helper.NER_dataset(
            os.path.join(args.data_dir, 'test.json'),
            args.tokenizer, args.max_seq_len, args.label2id, shuffle=False)

    if args.do_train:
        logging.info("Start training!")
        train_helper.train(args, train_dataset, eval_dataset)
    if not args.do_train and args.do_eval:
        logging.info("Start evaluating!")
        model = ner_model.BilstmCrf(args)
        model.load_state_dict(torch.load(args.load_path))
        logging.info("Checkpoint %s has been loaded!" % args.load_path)
        if args.use_cuda:
            model.cuda()
        train_helper.evaluate(args, eval_dataset, model)
    if args.do_test:
        logging.info("Start predicting!")
        model = ner_model.BilstmCrf(args)
        model.load_state_dict(torch.load(args.load_path))
        logging.info("Checkpoint %s has been loaded!" % args.load_path)
        if args.use_cuda:
            model.cuda()
        predict_res = train_helper.predict(args, test_dataset, model)
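All three scripts call torch.load(args.load_path) with no map_location, which raises on a CPU-only machine when the checkpoint was saved from GPU tensors. A minimal sketch of a safer loader, under the hypothetical name load_for_eval:

import torch

def load_for_eval(model, load_path, use_cuda):
    # Remap GPU-saved tensors to CPU when CUDA is unavailable; with CUDA,
    # map_location=None keeps torch.load's default device placement.
    map_location = None if use_cuda else torch.device('cpu')
    model.load_state_dict(torch.load(load_path, map_location=map_location))
    return model.cuda() if use_cuda else model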