def train():
    # Load the training data
    train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower, FLAGS.zeros)
    # Load the dev and test sets
    dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros)
    test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)

    # Use the selected tagging scheme (IOB / IOBES):
    # I = inside, O = outside, B = begin | E = end, S = single
    update_tag_scheme(train_sentences, FLAGS.tag_schema)
    update_tag_scheme(test_sentences, FLAGS.tag_schema)
    update_tag_scheme(dev_sentences, FLAGS.tag_schema)

    # Count the frequency of each character and assign each one an id
    _c, char_to_id, id_to_char = char_mapping(train_sentences, FLAGS.lower)
    # Count the frequency of each entity tag and assign each one an id
    _t, tag_to_id, id_to_tag = tag_mapping(train_sentences, FLAGS.id_to_tag_path,
                                           FLAGS.tag_to_id_path)

    # Persist the mapping dictionaries to a pickle file
    with open(FLAGS.map_file, "wb") as f:
        pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f)

    # Prepare the data: turn sentences into index lists for the network.
    # train_data[0][0]: the sentence; train_data[0][1]: character ids;
    # train_data[0][2]: word-segmentation features (0 for a single-character
    # word, 1, 2, ..., 2, 3 for longer words); train_data[0][3]: per-character tags
    train_data = prepare_dataset(train_sentences, char_to_id, tag_to_id, FLAGS.lower)
    dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id, FLAGS.lower)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id, FLAGS.lower)

    # Split the data into batches (60 sentences each) behind an iterable
    train_manager = BatchManager(train_data, FLAGS.batch_size)
    dev_manager = BatchManager(dev_data, 100)
    test_manager = BatchManager(test_data, 100)

    config = config_model(char_to_id, tag_to_id)  # fill in the model configuration

    # Limit GPU memory usage
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, load_word2vec, config, id_to_char)
        saver = tf.train.Saver()  # used to save the model
        with tf.device("/cpu:0"):
            for i in range(100):
                for batch in train_manager.iter_batch(shuffle=True):
                    # Train the model one batch at a time; this is the entry
                    # point of training, so the whole network can be traced
                    # back from here
                    step, batch_loss = model.run_step(sess, True, batch)
                # Evaluate on the dev set and compute F1 every 5 epochs
                if (i + 1) % 5 == 0:
                    f1 = evaluate(sess, model, "dev", dev_manager, id_to_tag)
                    print("F1 on the dev set:", f1)
                # Save the model every 20 epochs
                if (i + 1) % 20 == 0:
                    saver.save(sess, save_path=FLAGS.ckpt_path)
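# The IOB -> IOBES conversion behind update_tag_scheme(), shown as a minimal
# sketch. This is the standard algorithm, not necessarily the exact helper
# used in these scripts; it assumes the input is already valid IOB.
def iob_to_iobes(tags):
    new_tags = []
    for i, tag in enumerate(tags):
        if tag == 'O':
            new_tags.append(tag)
        elif tag.startswith('B-'):
            # B stays B only if an I follows; otherwise the span is a single S
            if i + 1 < len(tags) and tags[i + 1].startswith('I-'):
                new_tags.append(tag)
            else:
                new_tags.append(tag.replace('B-', 'S-'))
        elif tag.startswith('I-'):
            # I stays I only if another I follows; otherwise it ends the span
            if i + 1 < len(tags) and tags[i + 1].startswith('I-'):
                new_tags.append(tag)
            else:
                new_tags.append(tag.replace('I-', 'E-'))
        else:
            raise ValueError('Invalid IOB tag: %s' % tag)
    return new_tags

# Example: ['B-PER', 'I-PER', 'O', 'B-LOC'] -> ['B-PER', 'E-PER', 'O', 'S-LOC']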
def test():
    # Load the configuration file
    config = load_config(FLAGS.config_file)
    # Set up logging
    log_path = os.path.join("log", FLAGS.test_log_file)
    logger = get_logger(log_path)
    # Configure the GPU
    tf_config = tf.ConfigProto()
    # Load the test set
    test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)
    # Read the mapping dictionaries
    with open(FLAGS.map_file, "rb") as f:
        char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)
    # Convert the test sentences into model inputs
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id, FLAGS.lower)
    # Build the batch manager
    test_manager = BatchManager(test_data, 20)
    with tf.Session(config=tf_config) as sess:
        logger.info("start testing...")
        start = time.time()
        # Restore the model from the saved checkpoint
        model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec,
                             config, id_to_char, logger)
        # Evaluate on the test batches
        evaluate(sess, model, "test", test_manager, id_to_tag, logger)
        logger.info("The best_f1 on test_dataset is {}".format(
            model.best_test_f1.eval()))
        logger.info('Time test for 10 batch is {} sec\n'.format(
            time.time() - start))
def post(self):
    """Parse multiple strings and return the associated entity for each
    token in each string."""
    args = self.parser.parse_args()
    ref_strings = args.get('strings')
    tokens = [[[token] for token in ref_string.split(" ")]
              for ref_string in ref_strings]
    data = prepare_dataset(tokens, current_app.word_to_id,
                           current_app.char_to_id, {},
                           current_app.model.parameters['lower'], True)
    tagged = []
    for index, datum in enumerate(data):
        model_inputs = create_input(datum, current_app.model.parameters, False)
        y_pred = np.array(current_app.inference[1](*model_inputs))[1:-1]
        tags = [current_app.model.id_to_tag[y_pred[i]]
                for i in range(len(y_pred))]
        tagged.append([
            Entity(term=term, entity=entity)
            for term, entity in zip(ref_strings[index].split(" "), tags)
        ])
    response = ParseBatchResponse(reference_strings=ref_strings, data=tagged)
    return response
def post(self):
    event = self.get_argument('event')
    lines = self.new_text_split(event)
    inputs = convert(prepare_dataset(lines, FLAGS.max_seq_len, tag_to_id,
                                     train=False))
    result = model.evaluate_lines(sess, inputs, id_to_tag)
    self.write(json.dumps(result, ensure_ascii=False))
def predict():
    """Run entity recognition over a data set."""
    config = load_config(FLAGS.config_file)
    logger = get_logger(FLAGS.log_file)
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True  # limit GPU memory
    # Restore the mapping dictionaries from the map_file written during training
    with open(FLAGS.map_file, "rb") as f:
        char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)
    test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id,
                                FLAGS.lower, train=False)
    test_manager = BatchManager(test_data, 1)
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, FLAGS.ckpt_path, config, id_to_char,
                             logger)
        logger.info("predict data......")
        ner_results = model.predict(sess, test_manager, id_to_tag)
        result_write_evaluate(ner_results, FLAGS.result_path, "test")
def post(self):
    """Parse a single string and return the associated entity for each
    token in the string."""
    args = self.parser.parse_args()
    ref_string = args.get('string')
    if ref_string is None or ref_string == "":
        # Hackish way, as reqparse can't catch an empty string
        abort(400, description='string is empty or not provided.')
    tokens = ref_string.split(" ")
    data = prepare_dataset([[[token] for token in tokens]],
                           current_app.word_to_id, current_app.char_to_id, {},
                           current_app.model.parameters['lower'], True)
    model_inputs = create_input(data[0], current_app.model.parameters, False)
    y_pred = np.array(current_app.inference[1](*model_inputs))[1:-1]
    tags = [current_app.model.id_to_tag[y_pred[i]] for i in range(len(y_pred))]
    response = ParseResponse(reference_string=ref_string,
                             data=[Entity(term=term, entity=entity)
                                   for term, entity in zip(tokens, tags)])
    return response
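# A hypothetical client call for the handler above; the /parse route and the
# port are illustrative assumptions, not taken from the source.
import requests

resp = requests.post('http://localhost:5000/parse',
                     json={'string': 'Barack Obama visited Paris'})
print(resp.json())  # one (term, entity) pair per whitespace-split token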
def main(_):
    if FLAGS.train:
        if FLAGS.clean:
            clean(FLAGS)
        train()
    else:
        # Evaluate the model on the test data
        with open(FLAGS.map_file, "rb") as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)
        log_path = os.path.join("log", FLAGS.log_file)
        config = load_config(FLAGS.config_file)
        logger = get_logger(log_path)
        tf_config = tf.ConfigProto(allow_soft_placement=True,
                                   log_device_placement=True)
        test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)
        test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id,
                                    FLAGS.lower)
        test_manager = BatchManager(test_data, 100)
        with tf.Session(config=tf_config) as sess:
            sess.run(tf.global_variables_initializer())
            model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec,
                                 config, id_to_char, logger)
            evaluate(sess, model, "test", test_manager, id_to_tag, logger)
def get_batch_data(self):
    """Build the batch managers for the training and dev sets: first map the
    sentences to per-sentence feature lists and gold-label lists using the
    mapping dictionaries, then wrap the results in batch managers that
    generate batch data."""
    if not os.path.isfile(FLAGS.train_dev_file):
        train_data = prepare_dataset(self.train_sentences, self.char_to_id,
                                     self.tag_to_id, FLAGS.lower)
        dev_data = prepare_dataset(self.dev_sentences, self.char_to_id,
                                   self.tag_to_id, FLAGS.lower)
        with open(FLAGS.train_dev_file, "wb") as f:
            pickle.dump([train_data, dev_data], f)
    else:
        with open(FLAGS.train_dev_file, "rb") as f:
            train_data, dev_data = pickle.load(f)
    print("%i / %i sentences in train / dev." % (len(train_data), len(dev_data)))
    self.train_batch_manager = BatchManager(train_data, int(FLAGS.batch_size))
    self.dev_batch_manager = BatchManager(dev_data, int(FLAGS.batch_size))
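# A hedged sketch of the BatchManager used throughout these scripts, assuming
# each datum is [tokens, char_ids, seg_ids, tag_ids] as produced by
# prepare_dataset above; the real class may differ in details.
import math
import random

class SimpleBatchManager:
    def __init__(self, data, batch_size):
        num_batch = int(math.ceil(len(data) / batch_size))
        # sort by sentence length so each batch needs minimal padding
        sorted_data = sorted(data, key=lambda x: len(x[0]))
        self.batch_data = [
            self._pad(sorted_data[i * batch_size:(i + 1) * batch_size])
            for i in range(num_batch)
        ]
        self.len_data = len(self.batch_data)

    @staticmethod
    def _pad(batch):
        # pad every feature list to the longest sentence in the batch (pad id 0)
        max_len = max(len(sentence[0]) for sentence in batch)
        padded = []
        for tokens, chars, segs, targets in batch:
            padding = [0] * (max_len - len(tokens))
            padded.append([tokens + padding, chars + padding,
                           segs + padding, targets + padding])
        # transpose: group each feature across the batch
        return [list(feature) for feature in zip(*padded)]

    def iter_batch(self, shuffle=False):
        if shuffle:
            random.shuffle(self.batch_data)
        for batch in self.batch_data:
            yield batch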
def main():
    # Seed everything for reproducibility
    os.environ['PYTHONHASHSEED'] = str(args.seed)
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

    train_sentences = load_sentences(args.train_file)
    dev_sentences = load_sentences(args.dev_file)
    test_sentences = load_sentences(args.test_file)

    update_tag_scheme(train_sentences, args.tag_schema)
    update_tag_scheme(test_sentences, args.tag_schema)
    update_tag_scheme(dev_sentences, args.tag_schema)

    with open(args.map_file, 'rb') as f:
        char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)

    train_data = prepare_dataset(train_sentences, char_to_id, tag_to_id)
    dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id)

    train_manager = BatchManager(train_data, args.batch_size, args.num_steps)
    dev_manager = BatchManager(dev_data, 100, args.num_steps)
    test_manager = BatchManager(test_data, 100, args.num_steps)

    if args.cuda >= 0:
        torch.cuda.manual_seed(args.seed)
        torch.cuda.manual_seed_all(args.seed)
        device = torch.device(args.cuda)
    else:
        device = torch.device('cpu')
    print("device: ", device)

    if args.train:
        train(id_to_char, id_to_tag, train_manager, dev_manager, device)
    f1, res_info = eval_model(id_to_char, id_to_tag, test_manager, device,
                              args.log_name)
    log_handler.info("\n resinfo {} \t F1: {}".format(res_info, f1))
def load_gramcnn():
    # Load parameters
    opts, parameters, model_name = load_object('main_params.pkl')

    # Prepare for GRAM-CNN
    lower = parameters['lower']
    zeros = parameters['zeros']
    tag_scheme = parameters['tag_scheme']

    word_to_id, char_to_id, tag_to_id, pt_to_id, dico_words, id_to_tag = reload_mappings(
        os.path.join(models_path, model_name, 'mappings.pkl'))

    if os.path.isfile(opts.test):
        test_sentences = loader.load_sentences(opts.test, lower, zeros)
        update_tag_scheme(test_sentences, tag_scheme)
        test_data, m3 = prepare_dataset(test_sentences, word_to_id, char_to_id,
                                        tag_to_id, pt_to_id, lower)
    max_seq_len = m3 if m3 > 200 else 200

    # Initialize the embedding matrix
    word_emb_weight = np.zeros((len(dico_words), parameters['word_dim']))
    n_words = len(dico_words)

    print ' [*] Loading GRAMCNN tensorflow model (3min)...'
    gramcnn = GRAMCNN(n_words, len(char_to_id), len(pt_to_id),
                      use_word=parameters['use_word'],
                      use_char=parameters['use_char'],
                      use_pts=parameters['pts'],
                      num_classes=len(tag_to_id),
                      word_emb=parameters['word_dim'],
                      drop_out=0,
                      word2vec=word_emb_weight,
                      feature_maps=parameters['num_kernels'],
                      kernels=parameters['kernels'],
                      hidden_size=parameters['word_lstm_dim'],
                      hidden_layers=parameters['hidden_layer'],
                      padding=parameters['padding'],
                      max_seq_len=max_seq_len)
    gramcnn.load(models_path, model_name)

    compilation = [opts, id_to_tag, word_to_id, char_to_id, tag_to_id,
                   pt_to_id, lower, max_seq_len]
    print ' [*] Finished loading.'
    return compilation, parameters, gramcnn
def test():
    make_path(FLAGS)
    config = load_config(FLAGS.config_file)
    with open(FLAGS.map_file, "rb") as f:
        char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)
    test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)
    update_tag_scheme(test_sentences, FLAGS.tag_schema)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id)
    test_manager = BatchManager(test_data, 100)
    log_path = os.path.join("log", FLAGS.log_file)
    logger = get_logger(log_path)
    os.environ["CUDA_VISIBLE_DEVICES"] = "3"
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.9)
    tf_config = tf.ConfigProto(gpu_options=gpu_options)
    tf_config.gpu_options.allow_growth = True
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec,
                             config, id_to_char, logger)
        evaluate(sess, model, "test", test_manager, id_to_tag, logger)
def evaluate_sentence(sentence):
    tokens = nltk.word_tokenize(sentence)
    test_sentences = [[[unicode(w), unicode('O')] for w in tokens]]
    if os.path.isfile(opts.test):
        test_data, m3 = prepare_dataset(test_sentences, word_to_id, char_to_id,
                                        tag_to_id, pt_to_id, lower)
    arr_results = evaluate(parameters, gramcnn, test_sentences, test_data,
                           id_to_tag, remove=False, max_seq_len=max_seq_len,
                           padding=parameters['padding'],
                           use_pts=parameters['pts'])
    return process_results(arr_results, sentence)
        list(itertools.chain.from_iterable(
            [[w[0] for w in s] for s in dev_sentences + test_sentences])
        ) if not parameters['all_emb'] else None
    )
else:
    dico_words, word_to_id, id_to_word = word_mapping(train_sentences, lower)
    dico_words_train = dico_words

# Create a dictionary and a mapping for words / POS tags / tags
dico_chars, char_to_id, id_to_char = char_mapping(train_sentences)
dico_tags, tag_to_id, id_to_tag = tag_mapping(train_sentences)

# Index data
train_buckets, train_stats, train_unique_words = prepare_dataset(
    train_sentences, word_to_id, char_to_id, tag_to_id,
    global_max_sentence_length, global_max_char_length, lower
)
dev_buckets, dev_stats, dev_unique_words = prepare_dataset(
    dev_sentences, word_to_id, char_to_id, tag_to_id,
    global_max_sentence_length, global_max_char_length, lower
)
test_buckets, test_stats, test_unique_words = prepare_dataset(
    test_sentences, word_to_id, char_to_id, tag_to_id,
    global_max_sentence_length, global_max_char_length, lower
)

print "%i / %i / %i sentences in train / dev / test." % (
    len(train_stats), len(dev_stats), len(test_stats))
def train():
    # load data sets
    train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower, FLAGS.zeros)
    dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros)
    test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)

    # Use selected tagging scheme (IOB / IOBES)
    update_tag_scheme(train_sentences, FLAGS.tag_schema)
    update_tag_scheme(test_sentences, FLAGS.tag_schema)

    # create maps if they do not exist
    if not os.path.isfile(FLAGS.map_file):
        # create dictionary for word
        if FLAGS.pre_emb:
            dico_chars_train = char_mapping(train_sentences, FLAGS.lower)[0]
            dico_chars, char_to_id, id_to_char = augment_with_pretrained(
                dico_chars_train.copy(),
                FLAGS.emb_file,
                list(itertools.chain.from_iterable(
                    [[w[0] for w in s] for s in test_sentences]))
            )
        else:
            _c, char_to_id, id_to_char = char_mapping(train_sentences, FLAGS.lower)
        # Create a dictionary and a mapping for tags
        _t, tag_to_id, id_to_tag = tag_mapping(train_sentences)
        with open(FLAGS.map_file, "wb") as f:
            pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f)
    else:
        with open(FLAGS.map_file, "rb") as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)

    # prepare data: get a collection of lists containing indices
    train_data = prepare_dataset(train_sentences, char_to_id, tag_to_id, FLAGS.lower)
    dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id, FLAGS.lower)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id, FLAGS.lower)
    print("%i / %i / %i sentences in train / dev / test." % (
        len(train_data), len(dev_data), len(test_data)))

    train_manager = BatchManager(train_data, FLAGS.batch_size)
    dev_manager = BatchManager(dev_data, 100)
    test_manager = BatchManager(test_data, 100)

    # make paths for logs and models if they do not exist
    make_path(FLAGS)
    if os.path.isfile(FLAGS.config_file):
        config = load_config(FLAGS.config_file)
    else:
        config = config_model(char_to_id, tag_to_id)
        save_config(config, FLAGS.config_file)
    make_path(FLAGS)

    log_path = os.path.join("log", FLAGS.log_file)
    logger = get_logger(log_path)
    print_config(config, logger)

    # limit GPU memory
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    steps_per_epoch = train_manager.len_data
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec,
                             config, id_to_char, logger)
        logger.info("start training")
        loss = []
        for i in range(100):
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                if step % FLAGS.steps_check == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, "
                                "NER loss:{:>9.6f}".format(
                                    iteration, step % steps_per_epoch,
                                    steps_per_epoch, np.mean(loss)))
                    loss = []
            best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger)
            if best:
                save_model(sess, model, FLAGS.ckpt_path, logger)
                evaluate(sess, model, "test", test_manager, id_to_tag, logger)
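# What augment_with_pretrained() does, as a minimal sketch (the real helper
# also rebuilds the id mappings): characters that appear in the pretrained
# embedding file, and in the supplied dev/test character list if one is given,
# are added to the training dictionary with frequency 0 so they receive a
# pretrained vector instead of falling back to <UNK>. Assumes one vector per
# line with the token as the first whitespace-separated field.
import codecs

def sketch_augment_with_pretrained(dico_train, emb_path, words=None):
    pretrained = set()
    with codecs.open(emb_path, 'r', 'utf-8') as f:
        for line in f:
            if line.strip():
                pretrained.add(line.rstrip().split()[0])
    candidates = pretrained if words is None else \
        [w for w in words if w in pretrained]
    for char in candidates:
        if char not in dico_train:
            dico_train[char] = 0
    return dico_train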
    binary=False)

# Data parameters
lower = parameters['lower']
zeros = parameters['zeros']
tag_scheme = parameters['tag_scheme']

if os.path.isfile(opts.test):
    test_sentences = loader.load_sentences(opts.test, lower, zeros)
    update_tag_scheme(test_sentences, tag_scheme)

word_to_id, char_to_id, tag_to_id, pt_to_id, dico_words, id_to_tag = reload_mappings(
    os.path.join(models_path, model_name, 'mappings.pkl'))

if os.path.isfile(opts.test):
    test_data, m3 = prepare_dataset(test_sentences, word_to_id, char_to_id,
                                    tag_to_id, pt_to_id, lower)

print "%i sentences in test." % (len(test_data))

n_epochs = 100   # number of epochs over the training set
freq_eval = 2000  # evaluate on dev every freq_eval steps
best_dev = -np.inf
best_test = -np.inf
count = 0
max_seq_len = m3 if m3 > 200 else 200

# Initialize the embedding matrix
word_emb_weight = np.zeros((len(dico_words), parameters['word_dim']))
n_words = len(dico_words)

gramcnn = GRAMCNN(
        parameters['pre_emb'],
        list(itertools.chain.from_iterable(
            [[w[0] for w in s] for s in dev_sentences + test_sentences])
        ) if not parameters['all_emb'] else None
    )
else:
    dico_words, word_to_id, id_to_word = word_mapping(train_sentences, lower)
    dico_words_train = dico_words

# Create a dictionary and a mapping for words / POS tags / tags
dico_chars, char_to_id, id_to_char = char_mapping(train_sentences)
dico_tags, tag_to_id, id_to_tag = tag_mapping(train_sentences)

# Index data
train_data = prepare_dataset(
    train_sentences, word_to_id, char_to_id, tag_to_id, lower
)
dev_data = prepare_dataset(
    dev_sentences, word_to_id, char_to_id, tag_to_id, lower
)
test_data = prepare_dataset(
    test_sentences, word_to_id, char_to_id, tag_to_id, lower
)

print "%i / %i / %i sentences in train / dev / test." % (
    len(train_data), len(dev_data), len(test_data))

if parameters['gaz_dim']:
    '''1: read from the gazetteers file with the format:
          <gazetteer <list of categories>>
       2: once the gazetteers are read, we create a one-hot-encoded gazetteer
          vector for every word in the sentence. The length of the vector is
          equal to the number of categories
def main():
    # load data sets
    global args
    args = parser.parse_args()
    pp.pprint(vars(args))

    use_cuda = cuda_model.ifUseCuda(args.gpu_id, args.multiGpu)
    # use_cuda = False

    # train_file = 'data/example.train'
    # dev_file = 'data/example.dev'
    test_file = 'data/example.test'
    # embedding_file = 'data/vec.txt'
    map_file = 'map.pkl'
    # config_file = 'config_file_pytorch'
    tag_file = 'tag.pkl'
    # embedding_easy_file = 'data/easy_embedding.npy'

    # train_sentences = load_sentences(train_file)
    # dev_sentences = load_sentences(dev_file)
    test_sentences = load_sentences(test_file)

    # update_tag_scheme(train_sentences, args.tag_schema)
    update_tag_scheme(test_sentences, args.tag_schema)
    # update_tag_scheme(dev_sentences, args.tag_schema)

    if not os.path.isfile(tag_file):
        print("Tag file {:s} Not found".format(tag_file))
        sys.exit(-1)
    else:
        with open(tag_file, 'rb') as t:
            tag_to_id, id_to_tag = pickle.load(t)

    if not os.path.isfile(map_file):
        print("Map file {:s} Not found".format(map_file))
        # (the commented-out code below rebuilt the dictionary from the
        # pretrained embeddings and saved it to map_file)
        # dico_chars_train = char_mapping(train_sentences)[0]
        # dico_chars, char_to_id, id_to_char = augment_with_pretrained(
        #     dico_chars_train.copy(), embedding_file,
        #     list(itertools.chain.from_iterable(
        #         [[w[0] for w in s] for s in test_sentences])))
        # _, tag_to_id, id_to_tag = tag_mapping(train_sentences)
        # with open(map_file, "wb") as f:
        #     pickle.dump([char_to_id, id_to_char], f)
    else:
        with open(map_file, "rb") as f:
            char_to_id, id_to_char = pickle.load(f)

    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id)
    print("{:d} sentences in test.".format(len(test_data)))
    test_manager = BatchManager(test_data, 1)

    save_places = dir_utils.save_places(args.eval)
    logger = get_logger(os.path.join(
        save_places.log_save_dir,
        'evaluation-{:d}.txt'.format(args.fileid)))
    config = config_model(char_to_id, tag_to_id, args)
    print_config(config, logger)
    logger.info("start evaluation")

    # Update: create the model and embedding
    model = NERModel.CNERPointer(char_dim=args.char_dim, seg_dim=args.seg_dim,
                                 hidden_dim=args.hidden_dim, max_length=15,
                                 output_classes=4, dropout=args.dropout,
                                 embedding_path=None, id_to_word=id_to_char,
                                 easy_load=None)
    print("Number of Params\t{:d}".format(
        sum([p.data.nelement() for p in model.parameters()])))

    # Update: this won't work!
    # model = cuda_model.convertModel2Cuda(model, gpu_id=args.gpu_id,
    #                                      multiGpu=args.multiGpu)
    if use_cuda:
        model = model.cuda()
    model.eval()

    if args.eval is not None:
        # if os.path.isfile(args.resume):
        ckpt_filename = os.path.join(
            save_places.model_save_dir,
            'checkpoint_{:04d}.pth.tar'.format(args.fileid))
        assert os.path.isfile(ckpt_filename), \
            'Error: no checkpoint directory found!'
        checkpoint = torch.load(ckpt_filename,
                                map_location=lambda storage, loc: storage)
        model.load_state_dict(checkpoint['state_dict'], strict=True)
        train_iou = checkpoint['IoU']
        print("=> loading checkpoint '{}', current iou: {:.04f}".format(
            ckpt_filename, train_iou))

    ner_results = evaluate(model, test_manager, id_to_tag, use_cuda, max_len=5)
    eval_lines = test_ner(ner_results, save_places.summary_save_dir)
    for line in eval_lines:
        logger.info(line)
    f1 = float(eval_lines[1].strip().split()[-1])
    return f1
def train():
    # load the data set and split it into train / test
    datasets = load_sentences(FLAGS.train_file, FLAGS.lower)
    random.shuffle(datasets)
    train_sentences = datasets[:14000]
    test_sentences = datasets[14000:]

    # Use selected tagging scheme (IOB / IOBES)
    update_tag_scheme(train_sentences, FLAGS.tag_schema)
    update_tag_scheme(test_sentences, FLAGS.tag_schema)

    # create maps if they do not exist
    if not os.path.isfile(FLAGS.map_file):
        # create dictionary for word
        char_to_id, _ = elmo_char_mapping(FLAGS.elmo_vocab)
        # Create a dictionary and a mapping for tags
        _t, tag_to_id, id_to_tag = tag_mapping(train_sentences)
        with open(FLAGS.map_file, "wb") as f:
            pickle.dump([char_to_id, tag_to_id, id_to_tag], f)
    else:
        with open(FLAGS.map_file, "rb") as f:
            char_to_id, tag_to_id, id_to_tag = pickle.load(f)

    # prepare data: get a collection of lists containing indices
    train_data = prepare_dataset(train_sentences, char_to_id, tag_to_id, FLAGS.lower)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id, FLAGS.lower)
    print("%i / %i sentences in train / dev." % (len(train_data), len(test_data)))

    elmo_batcher = get_batcher()
    train_manager = BatchManager(train_data, FLAGS.batch_size, elmo_batcher)
    test_manager = BatchManager(test_data, FLAGS.batch_size, elmo_batcher)

    # make paths for logs and models if they do not exist
    make_path(FLAGS)
    if os.path.isfile(FLAGS.config_file):
        config = load_config(FLAGS.config_file)
    else:
        config = config_model(tag_to_id)
        save_config(config, FLAGS.config_file)
    make_path(FLAGS)

    log_path = os.path.join("log", FLAGS.log_file)
    logger = get_logger(log_path)
    print_config(config, logger)

    # limit GPU memory
    tf_config = tf.ConfigProto(allow_soft_placement=True)
    tf_config.gpu_options.allow_growth = True
    steps_per_epoch = train_manager.len_data
    with tf.Session(config=tf_config) as sess:
        elmo_model = load_elmo()
        model = create_model(sess, Model, FLAGS.ckpt_path, elmo_model, config,
                             logger)
        logger.info("start training")
        loss = []
        for i in range(FLAGS.max_epoch):
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                if step % FLAGS.steps_check == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.info(
                        "iteration:{} step:{}/{}, NER loss:{:>9.6f}".format(
                            iteration, step % steps_per_epoch,
                            steps_per_epoch, np.mean(loss)))
                    loss = []
            best = evaluate(sess, model, "test", test_manager, id_to_tag, logger)
            # evaluate(sess, model, "dev", dev_manager, id_to_tag, logger)
            if best:
                save_model(sess, model, FLAGS.ckpt_path, logger)
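# A hedged guess at get_batcher(), assuming the bilm-tf package: its Batcher
# turns tokenized sentences into the ELMo character-id tensors the language
# model expects. The vocab path comes from FLAGS.elmo_vocab above; 50 is the
# conventional maximum number of characters per token.
from bilm import Batcher

def get_batcher():
    return Batcher(FLAGS.elmo_vocab, 50)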
     for k, v in x.items()}
    for x in [model.id_to_word, model.id_to_char, model.id_to_tag]]

logging.info("Reading test data from %s..." % opts.input)
lower = parameters['lower']
zeros = parameters['zeros']
test_sentences, len_mention = load_ner2line_sentences(opts.input, lower, zeros)
raw_sentences, _ = load_ner2line_sentences(opts.input, lower=False, zeros=False)

test_data = prepare_dataset(test_sentences, word_to_id, char_to_id, tag_to_id,
                            parameters['mode'], lower,
                            parameters['overlap_rate'],
                            parameters['negative_ratio'],
                            parameters['max_len'])
logging.info("%d sentences found in the test dataset" % len(test_data))
logging.info("%d mentions found in the test dataset" % len_mention)

t_time = time.time()
logging.info("Tagging...")
_, _, fb, _, _, preds, _ = model.eval(test_data)
f_output = codecs.open(opts.output, 'w', 'utf-8')
logging.info("Time used for tagging:%s" % time_used(t_time))
def get_ne(sentence):
    test_sentences = loader.load_test_sentence(sentence, lower, zeros)
    test_data = prepare_dataset(test_sentences, word_to_id, char_to_id, lower)
    return evaluate(parameters, f_eval, test_sentences, test_data, id_to_tag,
                    dico_tags)
def train():
    # load data sets
    # dimensions: num_sentence * len_sentence * 2
    train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower, FLAGS.zeros)
    dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros)
    test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)

    # Use selected tagging scheme (IOB / IOBES)
    update_tag_scheme(train_sentences, FLAGS.tag_schema)
    update_tag_scheme(test_sentences, FLAGS.tag_schema)

    # create maps if they do not exist
    if not os.path.isfile(FLAGS.map_file):
        # create dictionary for word
        if FLAGS.pre_emb:  # if pretrained embeddings are used
            # dico_chars_train dimension: number of distinct characters in the
            # training set * 2
            dico_chars_train = char_mapping(train_sentences, FLAGS.lower)[0]
            # extend dico_chars_train with characters from the test set
            dico_chars, char_to_id, id_to_char = augment_with_pretrained(
                dico_chars_train.copy(), FLAGS.emb_file,
                list(itertools.chain.from_iterable(
                    [[w[0] for w in s] for s in test_sentences])))
        else:
            _c, char_to_id, id_to_char = char_mapping(train_sentences, FLAGS.lower)
        # Create a dictionary and a mapping for tags
        _t, tag_to_id, id_to_tag = tag_mapping(train_sentences)
        with open(FLAGS.map_file, "wb") as f:  # create the map_file
            pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f)
    else:
        with open(FLAGS.map_file, "rb") as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)

    # prepare data: get a collection of lists containing indices
    # dimensions: NumSentence * 4 * LenSentence
    train_data = prepare_dataset(train_sentences, char_to_id, tag_to_id, FLAGS.lower)
    dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id, FLAGS.lower)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id, FLAGS.lower)
    print("%i / %i / %i sentences in train / dev / test." %
          (len(train_data), len(dev_data), len(test_data)))

    # batch_data dimensions: BatchNum * 4 * BatchSize * MaxLenSentence
    train_manager = BatchManager(train_data, FLAGS.batch_size)
    dev_manager = BatchManager(dev_data, 100)
    test_manager = BatchManager(test_data, 100)

    # make paths for logs and models if they do not exist
    make_path(FLAGS)
    if os.path.isfile(FLAGS.config_file):  # load config_file if it exists
        config = load_config(FLAGS.config_file)
    else:  # otherwise build the config and save it to file
        config = config_model(char_to_id, tag_to_id)
        save_config(config, FLAGS.config_file)
    make_path(FLAGS)

    log_path = os.path.join("log", FLAGS.log_file)
    logger = get_logger(log_path)
    print_config(config, logger)  # print the config to the log file

    # limit GPU memory
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True  # allocate memory on demand

    steps_per_epoch = train_manager.len_data  # ceil(NumSentence / BatchSize)
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec,
                             config, id_to_char, logger)
        logger.info("start training")
        loss = []
        for i in range(FLAGS.max_epoch):  # number of epochs
            # take one batch from batch_data at a time; shuffle=True shuffles
            # the order of the batches
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                if step % FLAGS.steps_check == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, "
                                "NER loss:{:>9.6f}".format(
                                    iteration, step % steps_per_epoch,
                                    steps_per_epoch, np.mean(loss)))
                    loss = []
            best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger)
            if best:
                save_model(sess, model, FLAGS.ckpt_path, logger)
            evaluate(sess, model, "test", test_manager, id_to_tag, logger)

        # View the TensorBoard graph by running the following code, then going
        # to the terminal and typing: tensorboard --logdir=tensorboard_logs
        merged = tf.summary.merge_all()
        if not os.path.exists('tensorboard_logs/'):
            os.makedirs('tensorboard_logs/')
        my_writer = tf.summary.FileWriter('tensorboard_logs/', sess.graph)
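# Note: tf.summary.merge_all() returns None unless at least one summary op was
# registered beforehand (e.g. tf.summary.scalar('batch_loss', model.loss)).
# A minimal sketch of streaming a per-batch summary through the writer created
# above; merged_summary, sess, feed_dict, and step are assumed to be whatever
# the training loop has in hand at that point.
def log_batch_summary(writer, merged_summary, sess, feed_dict, step):
    summary = sess.run(merged_summary, feed_dict=feed_dict)
    writer.add_summary(summary, global_step=step)
    writer.flush()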
            for s in test_sentences + dev_sentences + test_sentences]))
        if not parameters['all_emb'] else None)
else:
    dico_chars, char_to_id, id_to_char = char_mapping(train_sentences)
    dico_chars_train = dico_chars

dico_tags, tag_to_id, id_to_tag = tag_mapping(train_sentences)
dico_tags_train = dico_tags
list_prefix = read_list(opts.dictionary)

# Index data
label_cnt = len(dico_tags)
train_data = prepare_dataset(train_sentences, char_to_id, tag_to_id, lower,
                             list_prefix=list_prefix, label_cnt=label_cnt)
dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id, lower,
                           list_prefix=list_prefix, label_cnt=label_cnt)
test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id, lower,
                            list_prefix=list_prefix, label_cnt=label_cnt)
# Save the mappings to disk
print 'Saving the mappings to disk...'
# How does this work? Should there be a mapping on disk that is loaded every time?
model.save_mappings(id_to_word, id_to_char, id_to_tag)

# Build the model
f_train, f_eval = model.build(**parameters)

# Reload previous model values
if opts.reload:
    print 'Reloading previous model...'
    model.reload()

def get_ne(sentence):
    test_sentences = loader.load_test_sentence(sentence, lower, zeros)
    test_data = prepare_dataset(test_sentences, word_to_id, char_to_id, lower)
    return evaluate(parameters, f_eval, test_sentences, test_data, id_to_tag,
                    dico_tags)

if __name__ == '__main__':
    while True:
        sentence = raw_input("input >>> ")
        test_sentences = loader.load_test_sentence(sentence, lower, zeros)
        test_data = prepare_dataset(test_sentences, word_to_id, char_to_id, lower)
        print evaluate(parameters, f_eval, test_sentences, test_data,
                       id_to_tag, dico_tags)
def train_new():
    train_sent = load_sentences(FLAGS.filepath)
    update_tag_scheme(train_sent, FLAGS.tag_schema)

    if not os.path.isfile(FLAGS.map_file):
        _c, char_to_id, id_to_char = char_mapping(train_sent, FLAGS.lower)
        print("random embedding")
        # Create a dictionary and a mapping for tags
        _t, tag_to_id, id_to_tag = tag_mapping(train_sent)
        with open(FLAGS.map_file, "wb") as f:
            pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f)
    else:
        with open(FLAGS.map_file, "rb") as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)

    # Prepare the data: split it into training and validation sets
    np.random.seed(10)
    train_sent_ = np.array(train_sent)
    shuffle_indices = np.random.permutation(np.arange(len(train_sent)))
    sent_shuffled = train_sent_[shuffle_indices]
    dev_sample_index = -1 * int(FLAGS.dev_percentage * float(len(train_sent)))
    train_sent_new, dev_sent = (sent_shuffled[:dev_sample_index],
                                sent_shuffled[dev_sample_index:])

    train_data = prepare_dataset(train_sent_new, char_to_id, tag_to_id, FLAGS.lower)
    dev_data = prepare_dataset(dev_sent, char_to_id, tag_to_id, FLAGS.lower)
    print("%i / %i sentences in train / dev." % (len(train_data), len(dev_data)))

    train_manager = BatchManager(train_data, FLAGS.batch_size)
    dev_manager = BatchManager(dev_data, 100)

    make_path(FLAGS)
    if os.path.isfile(FLAGS.config_file):
        config = load_config(FLAGS.config_file)
    else:
        config = config_model(char_to_id, tag_to_id)
        save_config(config, FLAGS.config_file)
    make_path(FLAGS)

    log_path = FLAGS.log_file
    logger = get_logger(log_path)
    print_config(config, logger)

    # Let TensorFlow allocate GPU memory on demand
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    steps_per_epoch = train_manager.len_data
    with tf.Session(config=tf_config) as sess:
        # Live plots: training loss (top) and dev F1 (bottom)
        fig = plt.figure()
        ax = fig.add_subplot(211)
        ax2 = fig.add_subplot(212)
        plt.grid(True)
        plt.ion()

        model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec,
                             config, id_to_char, logger)
        logger.info("start training")
        loss = []
        for i in range(FLAGS.max_epoch):
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                if step % 20 == 0:
                    ax.scatter(step, np.mean(loss), c='b', marker='.')
                    plt.pause(0.001)
                if step % FLAGS.steps_check == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, "
                                "NER loss:{:>9.6f}".format(
                                    iteration, step % steps_per_epoch,
                                    steps_per_epoch, np.mean(loss)))
                    loss = []
            best, f1 = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger)
            ax2.scatter(i + 1, f1, c='b', marker='.')
            plt.pause(0.001)
            if best:
                save_model(sess, model, FLAGS.ckpt_path, logger, "best")
        parameters['pre_emb'],
        list(itertools.chain.from_iterable(
            [[w[0] for w in s] for s in dev_sentences + test_sentences])
        ) if not parameters['all_emb'] else None
    )
else:
    dico_words, word_to_id, id_to_word = word_mapping(train_sentences, lower)
    dico_words_train = dico_words

# Create a dictionary and a mapping for words / POS tags / tags
dico_chars, char_to_id, id_to_char = char_mapping(train_sentences)
dico_tags, tag_to_id, id_to_tag = tag_mapping(train_sentences)

# Index data
train_data = prepare_dataset(
    train_sentences, word_to_id, char_to_id, tag_to_id, lower
)
dev_data = prepare_dataset(
    dev_sentences, word_to_id, char_to_id, tag_to_id, lower
)
test_data = prepare_dataset(
    test_sentences, word_to_id, char_to_id, tag_to_id, lower
)

print "%i / %i / %i sentences in train / dev / test." % (
    len(train_data), len(dev_data), len(test_data))

# Save the mappings to disk
print 'Saving the mappings to disk...'
model.save_mappings(id_to_word, id_to_char, id_to_tag)
    )
else:
    dico_chars, char_to_id, id_to_char = char_mapping(train_sentences)
    dico_chars_train = dico_chars

dico_tags, tag_to_id, id_to_tag = tag_mapping(train_sentences)
# dico_pos, pos_to_id, id_to_pos = pos_mapping(train_sentences)
list_prefix = read_list(opts.dictionary)

# Index data
label_cnt = len(dico_tags)
# False: zero out the whole data['gaze'] one-hot vector; True: leave it as-is
train_data = prepare_dataset(
    train_sentences, word_to_id, char_to_id, tag_to_id, use_gaze, True,
    list_prefix=list_prefix, label_cnt=label_cnt, lower=lower, pos=pos
)
# print "train_data[0]['gaze']:", train_data[0]['gaze']
dev_data = prepare_dataset(
    dev_sentences, word_to_id, char_to_id, tag_to_id, use_gaze, True,
    list_prefix=list_prefix, label_cnt=label_cnt, lower=lower, pos=pos
)
test_data = prepare_dataset(
    test_sentences, word_to_id, char_to_id, tag_to_id, use_gaze, True,
    list_prefix=list_prefix, label_cnt=label_cnt, lower=lower, pos=pos
)

print "%i / %i / %i sentences in train / dev / test." % (
    len(train_data), len(dev_data), len(test_data))

# Save the mappings to disk
    sentences = []
    for line in codecs.open(path, 'r', 'utf8'):
        sentence = []
        line = line.rstrip()
        if line:
            word = line.split()
            for elem in word:
                sentence.append([elem])
            sentences.append(sentence)
    return sentences

test_sentences = load_sentences(opts.input)
test_data = prepare_dataset(test_sentences, None, parameters,
                            parameters['lower'], isTest=True)
f_output = codecs.open(opts.output, 'w', 'utf-8')
start = time.time()

def xmlformat(sentence, tags):
    assert len(sentence) == len(tags)
    res = []
    preTag = "drug"
    for i in range(len(tags)):
        if tags[i][0] == 'B':
            if len(preTag):
                res.append("</" + preTag + ">")
def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)
    processors = {"ner": NerProcessor}
    if not FLAGS.do_train and not FLAGS.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    # Load the BERT model configuration
    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    # Make sure the NER max_seq_length does not exceed BERT's 512-position limit
    if FLAGS.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, bert_config.max_position_embeddings))

    task_name = FLAGS.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))
    processor = processors[task_name]()

    # Labels: ["O", "B-DIS", "I-DIS", "X", "[CLS]", "[SEP]"]
    label_list = processor.get_labels()

    # Initialize vocabulary handling (word-to-id mapping, lower-casing, etc.)
    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)

    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:  # use_tpu defaults to False
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        model_dir=FLAGS.output_dir,
        # how often to save the model checkpoint (1000)
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,  # 1000
            num_shards=FLAGS.num_tpu_cores,  # 8
            per_host_input_for_training=is_per_host))

    train_examples = None
    num_train_steps = None
    num_warmup_steps = None
    # warmup_proportion is the fraction of steps used for warm-up: with 100
    # training steps and warmup_proportion=0.1, the first 10 steps warm up at
    # a reduced learning rate (lr = global_step / num_warmup_steps * init_lr),
    # after which the normal (or decayed) learning rate is used.

    ##################
    # Load the data: a nested list whose outer level holds sentences and whose
    # inner level holds each character with its corresponding tag
    train_sentences = load_sentences(os.path.join(FLAGS.data_dir, "ner.train"),
                                     FLAGS.lower, FLAGS.zeros)
    dev_sentences = load_sentences(os.path.join(FLAGS.data_dir, "ner.dev"),
                                   FLAGS.lower, FLAGS.zeros)
    test_sentences = load_sentences(os.path.join(FLAGS.data_dir, "ner.dev"),
                                    FLAGS.lower, FLAGS.zeros)

    # Use the selected tagging scheme; the default converts IOB to IOBES
    update_tag_scheme(train_sentences, FLAGS.tag_schema)
    update_tag_scheme(dev_sentences, FLAGS.tag_schema)
    update_tag_scheme(test_sentences, FLAGS.tag_schema)

    # create maps if they do not exist
    if not os.path.isfile(map_file):
        # create dictionary for word
        if FLAGS.pre_emb:  # use pre-trained embeddings
            dico_chars_train = char_mapping(train_sentences, FLAGS.lower)[0]
            # ensure that test-set characters unseen in training still get at
            # least a pretrained word embedding
            dico_chars, char_to_id, id_to_char = augment_with_pretrained(
                dico_chars_train.copy(), FLAGS.emb_file,
                # flatten the nested list
                list(itertools.chain.from_iterable(
                    [[w[0] for w in s] for s in test_sentences])))
        else:
            _c, char_to_id, id_to_char = char_mapping(train_sentences, FLAGS.lower)
        # Create a dictionary and a mapping for tags
        _t, tag_to_id, id_to_tag = tag_mapping(train_sentences)
        # Run mark_mapping
        _c, mark_to_id, id_to_mark = mark_mapping(train_sentences)
        entropy_dict = load_entropy_dict(FLAGS.entropy_dict)
        with open(map_file, "wb") as f:
            pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag,
                         mark_to_id, id_to_mark, entropy_dict], f)
    else:
        with open(map_file, "rb") as f:
            (char_to_id, id_to_char, tag_to_id, id_to_tag,
             mark_to_id, id_to_mark, entropy_dict) = pickle.load(f)

    # prepare data: get a collection of lists containing indices
    train_data = prepare_dataset(train_sentences, char_to_id, tag_to_id,
                                 mark_to_id, entropy_dict, FLAGS.lower)
    dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id,
                               mark_to_id, entropy_dict, FLAGS.lower)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id,
                                mark_to_id, entropy_dict, FLAGS.lower)
    ###############

    if FLAGS.do_train:
        # Each returned element is an InputExample object
        train_examples = processor.get_train_examples(FLAGS.data_dir, train_data)
        num_train_steps = int(len(train_examples) / FLAGS.train_batch_size *
                              FLAGS.num_train_epochs)
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

    model_fn = model_fn_builder(
        bert_config=bert_config,
        num_labels=len(label_list) + 1,
        # initialize fine-tuning from the pretrained BERT checkpoint
        init_checkpoint=FLAGS.init_checkpoint,
        learning_rate=FLAGS.learning_rate,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps,
        use_tpu=FLAGS.use_tpu,
        use_one_hot_embeddings=FLAGS.use_tpu)

    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        predict_batch_size=FLAGS.predict_batch_size)

    train_file = os.path.join(FLAGS.output_dir, "train.tf_record")
    filed_based_convert_examples_to_features(train_examples, label_list,
                                             FLAGS.max_seq_length, tokenizer,
                                             train_file)

    eval_examples = processor.get_dev_examples(FLAGS.data_dir)
    eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record")
    filed_based_convert_examples_to_features(eval_examples, label_list,
                                             FLAGS.max_seq_length, tokenizer,
                                             eval_file)

    token_path = os.path.join(FLAGS.output_dir, "token_test.txt")
    with open(FLAGS.output_dir + '/label2id.pkl', 'rb') as rf:
        label2id = pickle.load(rf)
        id2label = {value: key for key, value in label2id.items()}
    if os.path.exists(token_path):
        os.remove(token_path)

    predict_examples = processor.get_test_examples(FLAGS.data_dir)
    predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
    # batch_labels is organized per sentence, e.g. [[1, 2, 0, 0, 1, 2], [...]]
    batch_tokens, batch_labels = filed_based_convert_examples_to_features(
        predict_examples, label_list, FLAGS.max_seq_length, tokenizer,
        predict_file, mode="test")

    for actual_train_step in (list(range(1000, num_train_steps, 2000)) +
                              [num_train_steps]):
        if FLAGS.do_train:
            start = time.clock()
            tf.logging.info("start training time: %f", start)
            tf.logging.info("***** Running training *****")
            tf.logging.info("  Num examples = %d", len(train_examples))
            tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
            tf.logging.info("  Num steps = %d", actual_train_step)
            train_input_fn = file_based_input_fn_builder(
                input_file=train_file,
                seq_length=FLAGS.max_seq_length,
                is_training=True,
                drop_remainder=True)
            estimator.train(input_fn=train_input_fn,
                            max_steps=actual_train_step)
            end = time.clock()
            tf.logging.info("end training time: %f", end)
            tf.logging.info("training time: %f", end - start)

        if FLAGS.do_eval:
            start = time.clock()
            tf.logging.info("start evaluation time: %f", start)
            tf.logging.info("***** Running evaluation *****")
            tf.logging.info("  Num examples = %d", len(eval_examples))
            tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)
            eval_steps = None
            if FLAGS.use_tpu:
                eval_steps = int(len(eval_examples) / FLAGS.eval_batch_size)
            eval_drop_remainder = True if FLAGS.use_tpu else False
            eval_input_fn = file_based_input_fn_builder(
                input_file=eval_file,
                seq_length=FLAGS.max_seq_length,
                is_training=False,
                drop_remainder=eval_drop_remainder)
            result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)
            output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
            with open(output_eval_file, "w") as writer:
                tf.logging.info("***** Eval results *****")
                for key in sorted(result.keys()):
                    tf.logging.info("  %s = %s", key, str(result[key]))
                    writer.write("%s = %s\n" % (key, str(result[key])))
            end = time.clock()
            tf.logging.info("end evaluation time: %f", end)
            tf.logging.info("evaluation time: %f", end - start)

        if FLAGS.do_predict:
            start = time.clock()
            tf.logging.info("start predict time: %f", start)
            tf.logging.info("***** Running prediction *****")
            tf.logging.info("  Num examples = %d", len(predict_examples))
            tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)
            if FLAGS.use_tpu:
                # Warning: according to tpu_estimator.py, prediction on TPU is
                # an experimental feature and hence not supported here
                raise ValueError("Prediction in TPU not supported")
            predict_drop_remainder = True if FLAGS.use_tpu else False
            predict_input_fn = file_based_input_fn_builder(
                input_file=predict_file,
                seq_length=FLAGS.max_seq_length,
                is_training=False,
                drop_remainder=predict_drop_remainder)
            result = estimator.predict(input_fn=predict_input_fn)
            _result = []
            for prediction in result:
                _result += [prediction_id for prediction_id in prediction]
            output_predict_file = os.path.join(
                FLAGS.output_dir + "/label_test/",
                "label_test.txt-" + str(actual_train_step))
            Writer(output_predict_file, _result, batch_tokens, batch_labels,
                   id2label)
            end = time.clock()
            tf.logging.info("end predict time: %f", end)
            tf.logging.info("predict time: %f", end - start)
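# The warm-up rule described in the comment above, as a standalone sketch
# (BERT's own optimizer applies polynomial decay after warm-up; linear decay
# is used here for simplicity).
def warmup_linear_lr(init_lr, global_step, num_warmup_steps, num_train_steps):
    if global_step < num_warmup_steps:
        # lr = global_step / num_warmup_steps * init_lr
        return init_lr * global_step / max(1, num_warmup_steps)
    # afterwards decay linearly to zero over the remaining steps
    remaining = max(0, num_train_steps - global_step)
    return init_lr * remaining / max(1, num_train_steps - num_warmup_steps)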
def train():
    # Load the data sets
    train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower, FLAGS.zeros)
    dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros)
    test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)

    # Choose the tag scheme (IOB / IOBES):
    # I = inside, O = outside, B = begin | E = end, S = single
    update_tag_scheme(train_sentences, FLAGS.tag_schema)
    update_tag_scheme(test_sentences, FLAGS.tag_schema)
    update_tag_scheme(dev_sentences, FLAGS.tag_schema)

    # create maps if they do not exist:
    # char_to_id, id_to_char, tag_to_id, id_to_tag
    if not os.path.isfile(FLAGS.map_file):
        # create dictionary for word
        if FLAGS.pre_emb:
            dico_chars_train = char_mapping(train_sentences, FLAGS.lower)[0]
            dico_chars, char_to_id, id_to_char = augment_with_pretrained(
                dico_chars_train.copy(),
                FLAGS.emb_file,
                list(itertools.chain.from_iterable(
                    [[w[0] for w in s] for s in test_sentences]))
            )
        else:
            _c, char_to_id, id_to_char = char_mapping(train_sentences, FLAGS.lower)
        # Create a dictionary and a mapping for tags
        _t, tag_to_id, id_to_tag = tag_mapping(train_sentences,
                                               FLAGS.id_to_tag_path,
                                               FLAGS.tag_to_id_path)
        # with open('maps.txt', 'w', encoding='utf8') as f1:
        #     f1.writelines(str(char_to_id) + " " + id_to_char + " "
        #                   + str(tag_to_id) + " " + id_to_tag + '\n')
        with open(FLAGS.map_file, "wb") as f:
            pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f)
    else:
        with open(FLAGS.map_file, "rb") as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)

    # prepare data: get a collection of lists containing indices.
    # train_data[0][0]: the sentence;
    # train_data[0][1]: character ids;
    # train_data[0][2]: word-segmentation features (0 for a single-character
    #   word, 1, 2, ..., 2, 3 for longer words);
    # train_data[0][3]: the tag of each character
    train_data = prepare_dataset(train_sentences, char_to_id, tag_to_id, FLAGS.lower)
    dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id, FLAGS.lower)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id, FLAGS.lower)
    print("%i / %i / %i sentences in train / dev / test." % (
        len(train_data), len(dev_data), len(test_data)))

    # Split the data by batch size
    train_manager = BatchManager(train_data, FLAGS.batch_size)
    dev_manager = BatchManager(dev_data, 100)
    test_manager = BatchManager(test_data, 100)

    # make paths for logs and models if they do not exist
    make_path(FLAGS)
    if os.path.isfile(FLAGS.config_file):
        config = load_config(FLAGS.config_file)
    else:
        config = config_model(char_to_id, tag_to_id)
        save_config(config, FLAGS.config_file)
    make_path(FLAGS)

    log_path = os.path.join("log", FLAGS.log_file)
    logger = get_logger(log_path)
    print_config(config, logger)

    # limit GPU memory
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    steps_per_epoch = train_manager.len_data
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec,
                             config, id_to_char, logger)
        logger.info("start training")
        loss = []
        # tf.device("/cpu:0") pins the computation to a device (default GPU:0)
        with tf.device("/cpu:0"):
            for i in range(100):
                # Train batch by batch; this is the entry point of training,
                # so the whole network can be traced back from here
                for batch in train_manager.iter_batch(shuffle=True):
                    step, batch_loss = model.run_step(sess, True, batch)
                    loss.append(batch_loss)
                    # Log progress; iteration is the number of epochs so far
                    if step % FLAGS.steps_check == 0:
                        iteration = step // steps_per_epoch + 1
                        logger.info("iteration:{} step:{}/{}, "
                                    "NER loss:{:>9.6f}".format(
                                        iteration, step % steps_per_epoch,
                                        steps_per_epoch, np.mean(loss)))
                        loss = []
                # best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger)
                if i % 7 == 0:
                    save_model(sess, model, FLAGS.ckpt_path, logger)
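# The segmentation feature described in the comment above, sketched with
# jieba as the assumed word segmenter: a single-character word contributes 0,
# and a longer word contributes 1 (begin), 2 (inside, repeated), 3 (end).
import jieba

def get_seg_features(text):
    seg_feature = []
    for word in jieba.cut(text):
        if len(word) == 1:
            seg_feature.append(0)
        else:
            tmp = [2] * len(word)
            tmp[0] = 1
            tmp[-1] = 3
            seg_feature.extend(tmp)
    return seg_feature

# Example: get_seg_features(u'中国人民银行'), if segmented as
# ['中国', '人民', '银行'], yields [1, 3, 1, 3, 1, 3]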
def runModelInLoop(dropout, char_dim, char_lstm_dim, word_dim, word_lstm_dim):
    # Results file
    resultsPath = "/Users/Ehsan/Documents/Ehsan_General/HMQ/HMQ_Projects/DNR2/COLING-2016-Code/i2b2-2010/results/"
    for u_dropout in dropout:
      for v_char_dim in char_dim:
        for w_char_lstm_dim in char_lstm_dim:
          for x_word_dim in word_dim:
            for y_word_lstm_dim in word_lstm_dim:
              for dataset in datasets:
                print "+++++++++++++++"
                print u_dropout, v_char_dim, w_char_lstm_dim, x_word_dim, y_word_lstm_dim, dataset
                parameters['dropout'] = u_dropout
                parameters['char_dim'] = v_char_dim
                parameters['char_lstm_dim'] = w_char_lstm_dim
                parameters['word_dim'] = x_word_dim
                parameters['word_lstm_dim'] = y_word_lstm_dim

                # If the dataset is i2b2-2010, assign the predefined paths
                if dataset == "i2b2-2010":
                    opts.train = i2b2BasePath + "train.txt"
                    opts.dev = i2b2BasePath + "dev.txt"
                    opts.test = i2b2BasePath + "test.txt"
                    resultsFile = resultsPath + "i2b2_2010_Results.txt"

                # Initialize the model
                model = Model(parameters=parameters, models_path=models_path)
                print "Model location: %s" % model.model_path

                # Data parameters
                lower = parameters['lower']
                zeros = parameters['zeros']
                tag_scheme = parameters['tag_scheme']

                # Load sentences
                train_sentences = loader.load_sentences(opts.train, lower, zeros)
                dev_sentences = loader.load_sentences(opts.dev, lower, zeros)
                test_sentences = loader.load_sentences(opts.test, lower, zeros)

                # Use selected tagging scheme (IOB / IOBES)
                update_tag_scheme(train_sentences, tag_scheme)
                update_tag_scheme(dev_sentences, tag_scheme)
                update_tag_scheme(test_sentences, tag_scheme)

                # Create a dictionary / mapping of words.
                # If we use pretrained embeddings, we add them to the dictionary.
                if parameters['pre_emb']:
                    dico_words_train = word_mapping(train_sentences, lower)[0]
                    dico_words, word_to_id, id_to_word = augment_with_pretrained(
                        dico_words_train.copy(),
                        parameters['pre_emb'],
                        list(itertools.chain.from_iterable(
                            [[w[0] for w in s]
                             for s in dev_sentences + test_sentences])
                        ) if not parameters['all_emb'] else None
                    )
                else:
                    dico_words, word_to_id, id_to_word = word_mapping(
                        train_sentences, lower)
                    dico_words_train = dico_words

                # Create a dictionary and a mapping for words / POS tags / tags
                dico_chars, char_to_id, id_to_char = char_mapping(train_sentences)
                dico_tags, tag_to_id, id_to_tag = tag_mapping(train_sentences)

                print "Calling prepare_dataset:--"
                # Index data
                train_data = prepare_dataset(
                    train_sentences, word_to_id, char_to_id, tag_to_id, lower
                )
                dev_data = prepare_dataset(
                    dev_sentences, word_to_id, char_to_id, tag_to_id, lower
                )
                test_data = prepare_dataset(
                    test_sentences, word_to_id, char_to_id, tag_to_id, lower
                )
                print "%i / %i / %i sentences in train / dev / test." % (
                    len(train_data), len(dev_data), len(test_data))

                # Save the mappings to disk
                print 'Saving the mappings to disk...'
                model.save_mappings(id_to_word, id_to_char, id_to_tag)

                # Build the model
                f_train, f_eval = model.build(**parameters)

                # Reload previous model values
                if opts.reload:
                    print 'Reloading previous model...'
                    model.reload()

                # Train the network
                singletons = set([word_to_id[k] for k, v
                                  in dico_words_train.items() if v == 1])
                n_epochs = 2      # number of epochs over the training set
                freq_eval = 1000  # evaluate on dev every freq_eval steps
                best_dev = -np.inf
                best_test = -np.inf
                count = 0
                for epoch in xrange(n_epochs):
                    epoch_costs = []
                    print "Starting epoch %i..." % epoch
                    for i, index in enumerate(np.random.permutation(len(train_data))):
                        count += 1
                        input = create_input(train_data[index], parameters,
                                             True, singletons)
                        new_cost = f_train(*input)
                        epoch_costs.append(new_cost)
                        # if i % 50 == 0 and i > 0:
                        #     print "%i, cost average: %f" % (
                        #         i, np.mean(epoch_costs[-50:]))
                        if count % freq_eval == 0:
                            dev_score = evaluate(parameters, f_eval,
                                                 dev_sentences, dev_data,
                                                 id_to_tag, dico_tags)
                            test_score = evaluate(parameters, f_eval,
                                                  test_sentences, test_data,
                                                  id_to_tag, dico_tags)
                            print "Score on dev: %.5f" % dev_score
                            print "Score on test: %.5f" % test_score
                            if dev_score > best_dev:
                                best_dev = dev_score
                                print "New best score on dev: " + str(best_dev)
                                # print "Saving model to disk..."
                                # model.save()
                            if test_score > best_test:
                                best_test = test_score
                                print "New best score on test: " + str(best_test)
                    print "Epoch %i done. Average cost: %f" % (
                        epoch, np.mean(epoch_costs))

                # Write the best dev and test scores to the file
                del model
                with open(resultsFile, 'a') as f:
                    f.write("dropout: " + str(parameters['dropout'])
                            + " | char_dim: " + str(parameters['char_dim'])
                            + " | char_lstm_dim: " + str(parameters['char_lstm_dim'])
                            + " | word_dim: " + str(parameters['word_dim'])
                            + " | word_lstm_dim: " + str(parameters['word_lstm_dim'])
                            + " | Best Dev Score: " + str(best_dev)
                            + " | Best Test Score: " + str(best_test) + "\n")
    return
def train():
    # load data sets
    train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower, FLAGS.zeros)
    dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros)
    test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)

    # Use selected tagging scheme (IOB / IOBES)
    update_tag_scheme(train_sentences, FLAGS.tag_schema)
    update_tag_scheme(test_sentences, FLAGS.tag_schema)

    # create maps if they do not exist
    if not os.path.isfile(FLAGS.map_file):
        # create dictionary for word
        if FLAGS.pre_emb:
            dico_chars_train = char_mapping(train_sentences, FLAGS.lower)[0]
            dico_chars, char_to_id, id_to_char = augment_with_pretrained(
                dico_chars_train.copy(),
                FLAGS.emb_file,
                list(itertools.chain.from_iterable(
                    [[w[0] for w in s] for s in test_sentences])))
        else:
            _c, char_to_id, id_to_char = char_mapping(train_sentences, FLAGS.lower)
        # Create a dictionary and a mapping for tags
        _t, tag_to_id, id_to_tag = tag_mapping(train_sentences)
        with open(FLAGS.map_file, "wb") as f:
            pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f)
    else:
        with open(FLAGS.map_file, "rb") as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)

    # prepare data: get a collection of lists containing indices
    train_data = prepare_dataset(train_sentences, char_to_id, tag_to_id, FLAGS.lower)
    dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id, FLAGS.lower)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id, FLAGS.lower)
    print("%i / %i / %i sentences in train / dev / test." %
          (len(train_data), len(dev_data), len(test_data)))

    train_manager = BatchManager(train_data, FLAGS.batch_size)
    dev_manager = BatchManager(dev_data, 100)
    test_manager = BatchManager(test_data, 100)

    # make paths for logs and models if they do not exist
    make_path(FLAGS)
    if os.path.isfile(FLAGS.config_file):
        config = load_config(FLAGS.config_file)
    else:
        config = config_model(char_to_id, tag_to_id)
        save_config(config, FLAGS.config_file)
    make_path(FLAGS)

    log_path = os.path.join("log", FLAGS.log_file)
    logger = get_logger(log_path)
    print_config(config, logger)

    # limit GPU memory
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    steps_per_epoch = train_manager.len_data
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec,
                             config, id_to_char, logger)
        logger.info("start training")
        loss = []
        for i in range(100):
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                if step % FLAGS.steps_check == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, "
                                "NER loss:{:>9.6f}".format(
                                    iteration, step % steps_per_epoch,
                                    steps_per_epoch, np.mean(loss)))
                    loss = []
            # best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger)
            # if best:
            save_model(sess, model, FLAGS.ckpt_path, logger)
        morpho_tag_column_index=parameters['mt_ci'],
        joint_learning=True)
else:
    id_to_morpho_tag = {}
    morpho_tag_to_id = {}

if opts.overwrite_mappings:
    print 'Saving the mappings to disk...'
    model.save_mappings(id_to_word, id_to_char, id_to_tag, id_to_morpho_tag)

model.reload_mappings()

# Index data
train_buckets, train_stats, train_unique_words, train_data = prepare_dataset(
    train_sentences,
    word_to_id, char_to_id, tag_to_id, morpho_tag_to_id,
    lower,
    parameters['mt_d'], parameters['mt_t'], parameters['mt_ci'],
)
dev_buckets, dev_stats, dev_unique_words, dev_data = prepare_dataset(
    dev_sentences,
    word_to_id, char_to_id, tag_to_id, morpho_tag_to_id,
    lower,
    parameters['mt_d'], parameters['mt_t'], parameters['mt_ci'],
)
test_buckets, test_stats, test_unique_words, test_data = prepare_dataset(
    test_sentences,
    word_to_id, char_to_id, tag_to_id, morpho_tag_to_id,
    lower,
    parameters['mt_d'], parameters['mt_t'], parameters['mt_ci'],
)

if parameters['test_with_yuret'] or parameters['train_with_yuret']:
    # yuret train and test datasets
    yuret_train_buckets, yuret_train_stats, yuret_train_unique_words, yuret_train_data = prepare_dataset(
        yuret_train_sentences,
        word_to_id, char_to_id, tag_to_id, morpho_tag_to_id,
        lower,
        parameters['mt_d'], parameters['mt_t'], parameters['mt_ci'],