def get_voc(self):
    """Load the vocabulary."""
    word2id = read_dictionary(os.path.join('.', args.train_data, 'word2id.pkl'))
    print("word vocab size: {}".format(len(word2id)))
    return word2id
def getDicEmbed():
    word2id = read_dictionary(os.path.join('.', args.train_data, 'word2id.pkl'))
    if args.pretrain_embedding == 'random':
        embeddings = random_embedding(word2id, args.embedding_dim)
    else:
        embedding_path = 'pretrain_embedding.npy'
        embeddings = np.array(np.load(embedding_path), dtype='float32')
    return word2id, embeddings
def test101(**kwargs):
    import argparse
    from utils import str2bool
    from data import read_dictionary, tag2label
    print('test101', kwargs)
    ##
    parser = argparse.ArgumentParser(description='BiLSTM-CRF for Chinese NER task')
    parser.add_argument('--train_data', type=str, default='data_path', help='train data source')
    parser.add_argument('--demo_model', type=str, default='1521112368', help='model for test and demo')
    parser.add_argument('--batch_size', type=int, default=64, help='#sample of each minibatch')
    args = parser.parse_args([])
    ##
    word2id = read_dictionary(os.path.join('.', args.train_data, 'word2id.pkl'))
    client = BiLSTM_CRF_Client(args, tag2label, word2id)

    demo_sent = kwargs.get("demo_sent")
    demo_sent = list(demo_sent.strip())
    print('demo_sent', len(demo_sent))
    demo_data = [(demo_sent, ['O'] * len(demo_sent))]
    ret1 = client.demo_one(kwargs.get("server"), demo_data, verbose=False)
    print('result-1', ret1)

    from utils import get_entity
    PER, LOC, ORG = get_entity(ret1, demo_sent)
    print('PER: {}\nLOC: {}\nORG: {}'.format(PER, LOC, ORG))
def __init__(self, args):
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.per_process_gpu_memory_fraction = 0.2

    paths, model_path = get_paths(args)
    ckpt_file = tf.train.latest_checkpoint(model_path)
    paths['model_path'] = ckpt_file

    word2id = read_dictionary(os.path.join('.', args.train_data, 'word2id.pkl'))
    embeddings = random_embedding(word2id, args.embedding_dim)

    self.model = BiLSTM_CRF(args, embeddings, tag2label, word2id, paths, config=config)
    self.model.build_graph()
    self.saver = tf.train.Saver()
    self.sess = tf.Session(config=config)
    self.saver.restore(self.sess, ckpt_file)
def ner(sent):
    config = tf.ConfigProto()
    parser = argparse.ArgumentParser(description='BiLSTM-CRF for Chinese NER task')
    parser.add_argument('--train_data', type=str, default='data_path', help='train data source')
    parser.add_argument('--test_data', type=str, default='data_path', help='test data source')
    parser.add_argument('--batch_size', type=int, default=64, help='#sample of each minibatch')
    parser.add_argument('--epoch', type=int, default=10, help='#epoch of training')
    parser.add_argument('--hidden_dim', type=int, default=300, help='#dim of hidden state')
    parser.add_argument('--optimizer', type=str, default='Adam', help='Adam/Adadelta/Adagrad/RMSProp/Momentum/SGD')
    parser.add_argument('--CRF', type=str2bool, default=True, help='use CRF at the top layer. if False, use Softmax')
    parser.add_argument('--lr', type=float, default=0.001, help='learning rate')
    parser.add_argument('--clip', type=float, default=5.0, help='gradient clipping')
    parser.add_argument('--dropout', type=float, default=0.5, help='dropout keep_prob')
    parser.add_argument('--update_embedding', type=str2bool, default=True, help='update embedding during training')
    parser.add_argument('--pretrain_embedding', type=str, default='random', help='use pretrained char embedding or init it randomly')
    parser.add_argument('--embedding_dim', type=int, default=300, help='random init char embedding_dim')
    parser.add_argument('--shuffle', type=str2bool, default=False, help='shuffle training data before each epoch')
    parser.add_argument('--mode', type=str, default='demo', help='train/test/demo')
    parser.add_argument('--demo_model', type=str, default='1563773712', help='model for test and demo')
    args = parser.parse_args()

    ## get char embeddings
    word2id = read_dictionary(os.path.join('.', args.train_data, 'word2id.pkl'))
    if args.pretrain_embedding == 'random':
        embeddings = random_embedding(word2id, args.embedding_dim)
    else:
        embedding_path = 'pretrain_embedding.npy'
        embeddings = np.array(np.load(embedding_path), dtype='float32')

    ## paths setting
    paths = {}
    paths['summary_path'] = './'
    model_path = r'C:\Users\Houking\Desktop\web_api\ner\checkpoint'
    paths['model_path'] = os.path.join(model_path, "model")
    paths['result_path'] = './'
    paths['log_path'] = './'
    ckpt_file = tf.train.latest_checkpoint(model_path)
    print(ckpt_file)
    paths['model_path'] = ckpt_file

    model = BiLSTM_CRF(args, embeddings, tag2label, word2id, paths, config=config)
    model.build_graph()
    saver = tf.train.Saver()
    with tf.Session() as sess:
        saver.restore(sess, ckpt_file)
        while True:
            print('Please input your sentence:')
            demo_sent = input()
            if demo_sent == '' or demo_sent.isspace():
                print('See you next time!')
                break
            else:
                sent = list(demo_sent.strip())  # use the sentence just entered, not the stale argument
                data = [(sent, ['O'] * len(sent))]
                tag = model.demo_one(sess, data)
                PER, SEX, TIT, REA = get_entity(tag, sent)
                print('PER: {}\nSEX: {}\nTIT: {}\nREA: {}'.format(PER, SEX, TIT, REA))
parser.add_argument('--hidden_dim', type=int, default=300, help='#dim of hidden state')
parser.add_argument('--optimizer', type=str, default='Adam', help='Adam/Adadelta/Adagrad/RMSProp/Momentum/SGD')
parser.add_argument('--CRF', type=str2bool, default=True, help='use CRF at the top layer. if False, use Softmax')
parser.add_argument('--lr', type=float, default=0.001, help='learning rate')
parser.add_argument('--clip', type=float, default=5.0, help='gradient clipping')  # mitigates exploding gradients
parser.add_argument('--dropout', type=float, default=0.5, help='dropout keep_prob')
parser.add_argument('--update_embedding', type=str2bool, default=True, help='update embedding during training')
parser.add_argument('--pretrain_embedding', type=str, default='embedding_mat.npy', help='use pretrained char embedding or init it randomly')
parser.add_argument('--embedding_dim', type=int, default=200, help='random init char embedding_dim')
parser.add_argument('--shuffle', type=str2bool, default=True, help='shuffle training raw_data before each epoch')
parser.add_argument('--mode', type=str, default='demo', help='train/test/demo')
parser.add_argument('--demo_model', type=str, default='1586501733', help='model for test and demo')  # random_char_300, 1524919794
parser.add_argument('--embedding_dir', type=str, default='word2vector', help='embedding files dir')
args = parser.parse_args()

# get char embeddings
word2id = read_dictionary(os.path.join('.', args.data_dir, args.dictionary))
if args.pretrain_embedding == 'random':
    embeddings = random_embedding(word2id, args.embedding_dim)
else:
    embedding_path = os.path.join(os.path.curdir, args.embedding_dir, args.pretrain_embedding)
    embeddings = np.array(np.load(embedding_path), dtype='float32')

# read corpus and get training raw_data
if args.mode != 'demo':
    train_path = os.path.join('.', args.data_dir, args.train_data)
    test_path = os.path.join('.', args.data_dir, args.test_data)
    train_data = read_corpus(train_path)
    test_data = read_corpus(test_path)
    test_size = len(test_data)

# paths setting
parser.add_argument('--clip', type=float, default=5.0, help='gradient clipping')
parser.add_argument('--dropout', type=float, default=0.5, help='dropout keep_prob')
parser.add_argument('--update_embedding', type=str2bool, default=True, help='update embedding during training')
parser.add_argument('--pretrain_embedding', type=str, default='random', help='use pretrained char embedding or init it randomly')
parser.add_argument('--embedding_dim', type=int, default=300, help='random init char embedding_dim')
parser.add_argument('--shuffle', type=str2bool, default=True, help='shuffle training data before each epoch')
parser.add_argument('--mode', type=str, default='demo', help='train/test/demo')
parser.add_argument('--demo_model', type=str, default='154138625', help='model for test and demo')
args = parser.parse_args()


def file_name(file_dir):
    for root, dirs, files in os.walk(file_dir):
        return files


## get char embeddings
word2id = read_dictionary('../code/word2id.pkl')
if args.pretrain_embedding == 'random':
    embeddings = random_embedding(word2id, args.embedding_dim)
else:
    embedding_path = 'pretrain_embedding.npy'
    embeddings = np.array(np.load(embedding_path), dtype='float32')

## read corpus and get training data
if args.mode != 'demo':
    train_path = os.path.join('.', args.train_data)
    test_path = os.path.join('.', args.test_data)
    train_data = read_corpus(train_path)
    test_data = read_corpus(test_path)
    test_size = len(test_data)
if not args.use_pre_emb:
    # vocabulary build
    if not os.path.exists(os.path.join('data_path', args.dataset_name, 'word2id.pkl')):
        # The raw dataset is a txt file with one "char\ttag" pair per line:
        #   char1\ttag1\n
        #   char2\ttag2\n
        #   ...
        #   charN\ttagN\n
        # Each sentence (line1, line2, ...) is one such block, and consecutive
        # blocks are separated by a blank line (two consecutive newlines).
        # See the illustrative reader sketch below.
        vocab_build(os.path.join('data_path', args.dataset_name, 'word2id.pkl'),
                    os.path.join('data_path', args.dataset_name, train_file))
    # get word dictionary
    word2id = read_dictionary(os.path.join('data_path', args.dataset_name, 'word2id.pkl'))
    embeddings = random_embedding(word2id, args.embedding_dim)
    log_pre = 'not_use_pretrained_embeddings'
else:
    with open('data_path//DaGuang//dr_d_td_all.pkl', 'rb') as f:
        id2word = pickle.load(f)
        word2id = pickle.load(f)
        print('word2id length:', len(word2id))
        _ = pickle.load(f)
    embeddings_path = os.path.join('data_path', args.dataset_name, 'pretrain_embedding.npy')
    if not os.path.exists(embeddings_path):
        build_character_embeddings(args.pretrained_emb_path, embeddings_path)
    embeddings = np.array(np.load(embeddings_path), dtype='float32')
    log_pre = 'use_pretrained_embeddings'
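# Illustrative only (not part of the original source): a minimal reader for the
# corpus format described in the comment above -- one "char\ttag" pair per line,
# sentences separated by a blank line. The helper name and the example tags in
# the docstring are made up for demonstration.
def read_char_tag_corpus(path):
    """Yield (chars, tags) pairs from a file such as:

        中\tB-LOC
        国\tI-LOC
        很\tO
        大\tO
        <blank line>
        ...next sentence...
    """
    chars, tags = [], []
    with open(path, encoding='utf-8') as f:
        for line in f:
            line = line.rstrip('\n')
            if not line:              # blank line ends the current sentence
                if chars:
                    yield chars, tags
                    chars, tags = [], []
                continue
            char, tag = line.split('\t')
            chars.append(char)
            tags.append(tag)
    if chars:                         # last sentence without a trailing blank line
        yield chars, tags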
parser.add_argument('--optimizer', type=str, default='Adam', help='Adam/Adadelta/Adagrad/RMSProp/Momentum/SGD')
parser.add_argument('--CRF', type=str2bool, default=True, help='use CRF at the top layer. if False, use Softmax')
parser.add_argument('--lr', type=float, default=0.001, help='learning rate')
parser.add_argument('--clip', type=float, default=5.0, help='gradient clipping')
parser.add_argument('--dropout', type=float, default=0.5, help='dropout keep_prob')
parser.add_argument('--update_embedding', type=str2bool, default=True, help='update embedding during training')
parser.add_argument('--pretrain_embedding', type=str, default='random', help='use pretrained char embedding or init it randomly')
parser.add_argument('--embedding_dim', type=int, default=300, help='random init char embedding_dim')
parser.add_argument('--shuffle', type=str2bool, default=True, help='shuffle training data before each epoch')
parser.add_argument('--mode', type=str, default='demo', help='train/test/demo')
parser.add_argument('--demo_model', type=str, default='1521112368', help='model for test and demo')
args = parser.parse_args()  # parse command-line arguments

## get char embeddings
# vocabulary: char -> id (rare chars removed; digits, letters and other non-Chinese chars normalized)
word2id = read_dictionary(os.path.join('.', args.train_data, 'word2id.pkl'))
if args.pretrain_embedding == 'random':
    embeddings = random_embedding(word2id, args.embedding_dim)  # uniform random init of char embeddings
else:
    embedding_path = 'pretrain_embedding.npy'  # otherwise load pretrained embeddings
    embeddings = np.array(np.load(embedding_path), dtype='float32')

## read corpus and get training data
if args.mode != 'demo':
    # read train/test sets; each sentence is a char sequence paired with a label sequence
    train_path = os.path.join('.', args.train_data, 'train_data')
    test_path = os.path.join('.', args.test_data, 'test_data')
    train_data = read_corpus(train_path)
    test_data = read_corpus(test_path)
    test_size = len(test_data)
"Comma-separated list of hostname:port pairs") flags.DEFINE_string( "worker_hosts", "172.16.23.5:2226,172.16.23.5:2227,172.16.23.5:2228,172.16.23.5:2229", "Comma-separated list of hostname:port pairs") # flags.DEFINE_string("worker_hosts", # "172.16.23.5:2223,172.16.23.5:2224,172.16.23.5:2225,172.16.23.5:2226," # "172.16.23.11:2223,172.16.23.11:2224,172.16.23.11:2225,172.16.23.11:2226", # "Comma-separated list of hostname:port pairs") flags.DEFINE_string("job_name", None, "job name: worker or ps") FLAGS = flags.FLAGS # get word embeddings word2id = read_dictionary(os.path.join('./', FLAGS.word2id, 'word2id.pkl')) if FLAGS.pretrain_embedding == 'random': embeddings = random_embedding(word2id, FLAGS.embedding_dim) else: embedding_path = 'pretrain_embedding.npy' embeddings = np.array(np.load(embedding_path), dtype='float32') # read corpus and get training data if FLAGS.mode != 'demo': train_path = os.path.join('.', FLAGS.train_data_path, 'train_data') train_data_len = get_train_data_len(train_path) # test_path = os.path.join('.', FLAGS.test_data_path, 'test_data') # train_data = read_corpus(train_path) # test_data = read_corpus(test_path) # path setting
args = {
    'batch_size': 128,
    'epoch': 20,
    'hidden_dim': 300,
    'optimizer': 'Adam',
    'CRF': True,
    'lr': 0.001,
    'clip': 5.0,
    'dropout': 0.8,
    'update_embedding': True,
    'shuffle': True
}

## get char embeddings
# word2id = read_dictionary(os.path.join(os.environ['DMPPATH'], 'gz_case_address/data_path/word2id.pkl'))
word2id = read_dictionary("./gz_case_address/data_path/word2id.pkl")
embeddings = random_embedding(word2id, 300)

## paths setting
# output_path = os.path.join(os.environ['DMPPATH'], 'dmp/gongan/gz_case_address/mode_save')
output_path = os.path.join("./gz_case_address/mode_save")
# output_path = ('./mode_save')
if not os.path.exists(output_path):
    os.makedirs(output_path)
summary_path = os.path.join(output_path, "summaries")
if not os.path.exists(summary_path):
    os.makedirs(summary_path)
model_path = os.path.join(output_path, "checkpoints")
if not os.path.exists(model_path):
    os.makedirs(model_path)
                    type=str, default='demo', help='train/test/demo/text')
# parser.add_argument('--demo_model', type=str, default='1521112368', help='model for test and demo')
parser.add_argument('--demo_model', type=str, default='1550144205', help='model for test and demo')
parser.add_argument('--text_file', type=str, default='my.txt', help='text file for demo')
args = parser.parse_args()

## get char embeddings
word2id = read_dictionary('./data_path/word2id.pkl')
embeddings = random_embedding(word2id, 300)

output_path = './data_path_save/1577156952'
model_path = os.path.join(output_path, "checkpoints/")
ckpt_prefix = os.path.join(model_path, "model")
ckpt_file = tf.train.latest_checkpoint(model_path)

## paths setting
paths = {}
summary_path = os.path.join(output_path, "summaries")
paths['summary_path'] = summary_path
paths['model_path'] = ckpt_prefix
result_path = os.path.join(output_path, "results")
paths['result_path'] = result_path
log_path = os.path.join(result_path, "log.txt")
paths['log_path'] = log_path
                    default=True, help='shuffle training data before each epoch')
parser.add_argument('--mode', type=str, default='train', help='train/test/demo')
parser.add_argument('--demo_model', type=str, default='1536659706', help='model for test and demo')
args = parser.parse_args()  # in PyCharm, set these via Run - Edit Configurations - Script Parameters

## get char embeddings
word2id = read_dictionary(r"D:\data\100dim\word2id_100.pkl")  # pkl generated by data.py: mapping between words and word vectors
if args.pretrain_embedding == 'random':
    embeddings = random_embedding(word2id, args.embedding_dim)  # random len(word2id) x args.embedding_dim array
else:
    embedding_path = 'D:\\data\\100dim\\np_100.npy'  # binary file holding the array on disk
    embeddings = np.array(np.load(embedding_path), dtype='float32')  # load word vectors into memory

## read corpus and get training data
if args.mode != 'demo':
    train_path = os.path.join('.', args.train_data, 'train1.txt')  # training set file path
    test_path = os.path.join('.', args.test_data, 'test1.txt')  # test set file path
    train_data = read_corpus(train_path)  # read the training set with a custom helper; returns a list
    test_data = read_corpus(test_path)
    test_size = len(test_data)
def infer():
    # Load arguments
    args = parse_args()
    # args.batch_size = 1
    word2id = data.read_dictionary("data/pre_trained_word2id.pkl")
    embeddings = np.load("data/pre_trained_embeddings.npy")
    # word2id = data.read_dictionary("data/pre_trained_copy_mini_word2id.pkl")
    # embeddings = np.load("data/pre_trained_copy_mini_embeddings.npy")
    # word2id_output_mini = {}
    # for i, k in enumerate(word2id):
    #     word2id_output_mini[k] = i
    #     if i > 9100:
    #         break
    # word2id_output_mini["<S>"] = 1
    # word2id_output_mini["<E>"] = 2
    # word2id = word2id_output_mini
    word2id_output = word2id.copy()
    word_ori_size = len(word2id)
    # word_mini_size = len(word2id_output)
    # word_size = word_ori_size
    # word_size = word_mini_size
    word_size = 0
    tag_size = 0
    for k in tag2label:
        if tag2label[k] > tag_size:
            tag_size = tag2label[k]
        tag2label[k] += args.max_length
        if tag2label[k] > word_size:
            word_size = tag2label[k]
    # word2id_output.update(tag2label)
    word2id_output = tag2label
    word2id_output["<S>"] = word_size + 1
    word2id_output["<E>"] = word_size + 2
    word_size += 3
    tag_size += 3
    print("output size", word_size, tag_size)

    # # Dictionaries init
    # word2id = data.read_dictionary("data/pre_trained_word2id.pkl")
    # embeddings = np.load("data/pre_trained_embeddings.npy")
    # word2id_output = word2id.copy()
    # word_mini_size = len(word2id)
    # word_size = 0
    # for k in tag2label:
    #     tag2label[k] += word_mini_size
    #     if tag2label[k] > word_size:
    #         word_size = tag2label[k]
    # tag2label["<S>"] = word_size + 1
    # tag2label["<E>"] = word_size + 2
    # word_size += 3
    # word2id_output.update(tag2label)
    # # print(type(word2id), len(word2id))
    # # print(type(entity2id), len(entity2id))
    # # print(type(pos2id), len(pos2id))
    # # print(type(word2id_output), len(word2id_output))

    id2entity = {}
    for k in entity2id:
        id2entity[entity2id[k]] = k
    id2word = {}
    for k in word2id:
        id2word[word2id[k]] = k
    id2word_output = {}
    for k in word2id_output:
        id2word_output[word2id_output[k]] = k
    src_dict, trg_dict = id2word, id2word_output

    # Load data
    # data_train = data_load("data/train_pos.txt", data=data, word2id=word2id,
    #                        entity2id=entity2id, pos2id=pos2id,
    #                        word2id_output=word2id_output, event_args=event_args)
    data_train = data_load("data/ace_data/train.txt", data=data, word2id=word2id,
                           entity2id=entity2id, pos2id=pos2id,
                           word2id_output=word2id_output, event_args=event_args,
                           generate=True)
    data_dev = data_load("data/ace_data/dev.txt", data=data, word2id=word2id,
                         entity2id=entity2id, pos2id=pos2id,
                         word2id_output=word2id_output, event_args=event_args,
                         generate=True)
    data_test = data_load("data/ace_data/test.txt", data=data, word2id=word2id,
                          entity2id=entity2id, pos2id=pos2id,
                          word2id_output=word2id_output, event_args=event_args,
                          generate=True)
    # data_test = data_train

    print("=====Init scores")
    scores = generate_pr(word_dict=id2word_output)
    scores.append_label(data_test)

    # Inference
    net = model.net(
        args.embedding_dim,
        args.encoder_size,
        args.decoder_size,
        word_ori_size,
        word_size,
        tag_size,
        True,  # False,
        beam_size=args.beam_size,
        max_length=args.max_length,
        source_entity_dim=len(entity2id),
        source_pos_dim=len(pos2id),
        embedding_entity_dim=args.embedding_entity_dim,
        embedding_pos_dim=args.embedding_pos_dim,
        end_id=word2id_output["<E>"])

    # test_batch_generator = paddle.batch(
    #     paddle.reader.shuffle(paddle.dataset.wmt14.test(args.dict_size), buf_size=1000),
    #     batch_size=args.batch_size, drop_last=False)
    dev_batch_generator = paddle.batch(
        paddle.reader.buffered(data_dev, size=1000),
        batch_size=args.batch_size, drop_last=False)
    test_batch_generator = paddle.batch(
        paddle.reader.buffered(data_test, size=1000),
        batch_size=args.batch_size, drop_last=False)

    print("begin memory optimization ...")
    # fluid.memory_optimize(train_program)
    fluid.memory_optimize(framework.default_main_program())
    print("end memory optimization ...")

    place = core.CUDAPlace(0) if args.use_gpu else core.CPUPlace()
    exe = Executor(place)
    exe.run(framework.default_startup_program())
    # # exe = fluid.ParallelExecutor(use_cuda=args.use_gpu)
    # os.environ['CPU_NUM'] = "2"
    # exe = fluid.parallel_executor.ParallelExecutor(
    #     use_cuda=args.use_gpu, num_trainers=2,
    #     # loss_name=avg_cost.name,
    #     main_program=fluid.default_main_program())

    # LOAD Model
    model_path = os.path.join(args.save_dir, str(args.load_pass_num))
    fluid.io.load_persistables(executor=exe, dirname=model_path,
                               main_program=framework.default_main_program())
    print("==Model loaded", args.save_dir)

    translation_ids = net.translation_ids
    translation_scores = net.translation_scores
    feed_order = net.feeding_list
    feed_list = [
        framework.default_main_program().global_block().var(var_name)
        for var_name in feed_order
    ]
    # print(feed_list)
    feeder = fluid.DataFeeder(feed_list, place)
    scores.reset()

    for batch_id, _data in enumerate(test_batch_generator()):
        print("=====", batch_id, len(_data))
        # The value of batch_size may vary in the last batch
        batch_size = len(_data)

        # Setup initial ids and scores lod tensor
        # init_ids_data = np.array([0 for _ in range(batch_size)], dtype='int64')
        init_ids_data = np.array(
            [word2id_output["<S>"] for _ in range(batch_size)], dtype='int64')
        init_scores_data = np.array([1. for _ in range(batch_size)], dtype='float32')
        init_ids_data = init_ids_data.reshape((batch_size, 1))
        init_scores_data = init_scores_data.reshape((batch_size, 1))
        init_recursive_seq_lens = [1] * batch_size
        init_recursive_seq_lens = [init_recursive_seq_lens, init_recursive_seq_lens]
        init_ids = fluid.create_lod_tensor(init_ids_data, init_recursive_seq_lens, place)
        init_scores = fluid.create_lod_tensor(init_scores_data, init_recursive_seq_lens, place)
        # print(init_ids_data.shape)
        # print(init_recursive_seq_lens)
        # print(init_ids.lod())
        # print(init_scores.lod())

        # Feed dict for inference
        feed_dict = feeder.feed([x for x in _data])
        feed_dict['init_ids'] = init_ids
        feed_dict['init_scores'] = init_scores

        print("=====")
        fetch_outs = exe.run(
            framework.default_main_program(),
            feed=feed_dict,
            fetch_list=[translation_ids, translation_scores],
            # fetch_list=[translation_ids],
            return_numpy=False)
        # print(np.array(fetch_outs[0]))
        # print(np.array(fetch_outs[0]).shape)

        print("=====Update scores")
        scores.update(preds=fetch_outs[0],
                      labels=[_[-1] for _ in _data],
                      words_list=[_[0] for _ in _data],
                      for_generate=True)

        # Split the output words by lod levels
        end_id = word2id_output["<E>"]
        result = []
        paragraphs = []
        for ids in np.array(fetch_outs[0]):
            # print("##", ids.shape)
            # print("##", ids)
            new_ids = []
            new_words = []
            pre_id = -1
            for _id in ids:
                if _id == end_id or _id == pre_id:
                    break
                pre_id = _id
                new_ids.append(_id)
                if _id < args.max_length:
                    new_words.append(str(_id))
                else:
                    new_words.append(trg_dict[_id])
            result.append(new_ids)
            paragraphs.append(new_words)

        # lod_level_1 = fetch_outs[0].lod()[1]
        # token_array = np.array(fetch_outs[0])
        # result = []
        # for i in six.moves.xrange(len(lod_level_1) - 1):
        #     sentence_list = [
        #         trg_dict[token]
        #         for token in token_array[lod_level_1[i]:lod_level_1[i + 1]]
        #     ]
        #     sentence = " ".join(sentence_list[1:-1])
        #     result.append(sentence)
        # lod_level_0 = fetch_outs[0].lod()[0]
        # paragraphs = [
        #     result[lod_level_0[i]:lod_level_0[i + 1]]
        #     for i in six.moves.xrange(len(lod_level_0) - 1)
        # ]

        # target_sentence_list = [" ".join([trg_dict[__] for __ in _[-1]]) for _ in _data]
        target_sentence_list = []
        for item in _data:
            target_words = []
            for _id in item[-1]:
                if _id < args.max_length:
                    target_words.append(str(_id))
                else:
                    target_words.append(trg_dict[_id])
            target_sentence_list.append(" ".join(target_words))

        source_sentence_list = []
        source_entity_list = []
        for item in _data:
            target_words = []
            for _id in item[0]:
                target_words.append(src_dict[_id])
            source_sentence_list.append(target_words)
            entity_tag = []
            for _id in item[1]:
                entity_tag.append(id2entity[_id])
            source_entity_list.append(entity_tag)

        print("=====Print text")
        for paragraph, sentence, source, entities in \
                zip(paragraphs, target_sentence_list,
                    source_sentence_list, source_entity_list):
            print("-----")
            new_words = []
            indexes = range(len(source))
            for i, word, entity in zip(indexes, source, entities):
                new_words.append(word + "(" + str(i) + " " + entity + ")")
            print(" ".join(new_words))
            print("=Predict:", " ".join(paragraph[1:]))
            print("=Label:", sentence)

    scores.eval_show()
                    default=512, help='random init char embedding_dim')
parser.add_argument('--shuffle', type=str2bool, default=True, help='shuffle training data before each epoch')
parser.add_argument('--mode', type=str, default='demo', help='train/test/demo')
parser.add_argument('--demo_model', type=str, default='1521112368', help='model for test and demo')
args = parser.parse_args()

## get char embeddings
# word2id = read_dictionary(os.path.join('.', args.train_data, 'word2id.pkl'))
word2id = read_dictionary('word.pkl')
if args.pretrain_embedding == 'random':
    embeddings = random_embedding(word2id, args.embedding_dim)
    # embeddings = BERTEmbedding("chinese", sequence_length=50, task=kashgari.LABELING)
    print('embeddings')
    print(len(embeddings))
else:
    embedding_path = 'pretrain_embedding.npy'
    embeddings = np.array(np.load(embedding_path), dtype='float32')

## read corpus and get training data
if args.mode != 'demo':
    # train_path = os.path.join('.', args.train_data, 'train_data')
    # test_path = os.path.join('.', args.test_data, 'test_data')
    train_path = 'sample3.data'
    test_path = 'train.data'
    transition_params_proto = result.outputs['transition_params']
    # transition_params_shape = [transition_params_proto.tensor_shape.dim[i].size
    #                            for i in range(len(transition_params_proto.tensor_shape.dim))]
    # transition_params = numpy.array(transition_params_proto.float_val).reshape(transition_params_shape)
    transition_params = tf.contrib.util.make_ndarray(transition_params_proto)
    label_list = []
    for logit, seq_len in zip(logits, seq_len_list):
        viterbi, viterbi_score = viterbi_decode(logit[:seq_len], transition_params)
        label_list.append(viterbi)
    return label_list, seq_len_list


word2id = read_dictionary(os.path.join('.', 'data_path', 'word2id.pkl'))


def main(test_sent):
    start_time = time.time()
    channel = implementations.insecure_channel('192.168.1.210', 5075)
    stub = prediction_service_pb2.beta_create_PredictionService_stub(channel)
    test_sent = list(test_sent.strip())
    test_data = [(test_sent, ['O'] * len(test_sent))]
    label_list = []
    for seqs, labels in batch_yield(test_data, batch_size=64, vocab=word2id,
                                    tag2label=tag2label, shuffle=False):
                    help='shuffle training data before each epoch')
parser.add_argument('--mode', type=str, default='train', help='train/test/demo')
parser.add_argument('--demo_model', type=str, default='1521112368', help='model for test and demo')
args = parser.parse_args()

# Creating .pkl file
vocab_build(Path + '\\word2id.pkl', Path + '\\vocab.txt', 3)

# get char embeddings
word2id = read_dictionary(Path + '\\word2id.pkl')
if args.pretrain_embedding == 'random':
    embeddings = random_embedding(word2id, args.embedding_dim)
else:
    embedding_path = 'pretrain_embedding.npy'
    embeddings = np.array(np.load(embedding_path), dtype='float32')

## read corpus and get training data
if args.mode != 'demo':
    train_path = 'D:\\resource\\general_hypernym_extraction\\data\\train.txt'
    test_path = 'D:\\resource\\general_hypernym_extraction\\data\\valid.txt'
    train_data = read_corpus(train_path)
    test_data = read_corpus(test_path)
    test_size = len(test_data)

## paths setting
def train(data=data):
    # Load arguments
    args = parse_args()
    options = vars(args)
    print(json.dumps(options, ensure_ascii=False, indent=4))

    # if not conf.pre_train_word_embedding:
    #     word2id = data.read_dictionary("train_data/word2id.pkl")
    #     embeddings = data.random_embedding(word2id, conf.embedding_dim)
    # else:
    # Dictionaries init
    word2id = data.read_dictionary("train_data/pre_trained_word2id.pkl")
    # embeddings = np.load("train_data/pre_trained_embeddings.npy")
    # word2id = data.read_dictionary("train_data/pre_trained_mini_word2id.pkl")
    # embeddings = np.load("train_data/pre_trained_mini_embeddings.npy")
    # word2id = data.read_dictionary("train_data/pre_trained_copy_mini_word2id.pkl")
    # embeddings = np.load("train_data/pre_trained_copy_mini_embeddings.npy")
    # word2id_output_mini = {}
    # for i, k in enumerate(word2id):
    #     word2id_output_mini[k] = i
    #     if i > 9100:
    #         break
    # word2id_output_mini["<S>"] = 1
    # word2id_output_mini["<E>"] = 2
    # word2id = word2id_output_mini
    word2id_output = word2id.copy()
    word_ori_size = len(word2id)
    # word_mini_size = len(word2id_output)
    # word_size = word_ori_size
    # word_size = word_mini_size
    word_size = 0
    tag_size = 0
    for k in tag2label:
        if tag2label[k] > tag_size:
            tag_size = tag2label[k]
        tag2label[k] += args.max_length
        if tag2label[k] > word_size:
            word_size = tag2label[k]
    # word2id_output.update(tag2label)
    word2id_output = tag2label
    word2id_output["<S>"] = word_size + 1
    word2id_output["<E>"] = word_size + 2
    word_size += 3
    tag_size += 3
    print("output size", word_size, tag_size)
    # print(type(word2id), len(word2id))
    # print(type(entity2id), len(entity2id))
    # print(type(pos2id), len(pos2id))
    # print(type(word2id_output), len(word2id_output))

    # Load data
    data_train = data_load("train_data/ace_data/train.txt", data=data, word2id=word2id,
                           entity2id=entity2id, pos2id=pos2id,
                           word2id_output=word2id_output, event_args=event_args)
    data_dev = data_load("train_data/ace_data/dev.txt", data=data, word2id=word2id,
                         entity2id=entity2id, pos2id=pos2id,
                         word2id_output=word2id_output, event_args=event_args,
                         generate=True)
    data_test = data_load("train_data/ace_data/test.txt", data=data, word2id=word2id,
                          entity2id=entity2id, pos2id=pos2id,
                          word2id_output=word2id_output, event_args=event_args,
                          generate=True)

    if args.enable_ce:
        framework.default_startup_program().random_seed = 111

    # # Training process
    # net = model.net(
    #     args.embedding_dim, args.encoder_size, args.decoder_size,
    #     word_ori_size, word_size, tag_size, False,
    #     beam_size=args.beam_size, max_length=args.max_length,
    #     source_entity_dim=len(entity2id), source_pos_dim=len(pos2id),
    #     embedding_entity_dim=args.embedding_entity_dim,
    #     embedding_pos_dim=args.embedding_pos_dim,
    #     end_id=word2id_output["<E>"])
    # avg_cost = net.avg_cost
    # feed_order = net.feeding_list
    # # Test net
    # net_test = model.net(
    #     args.embedding_dim, args.encoder_size, args.decoder_size,
    #     word_mini_size, word_size, True,
    #     beam_size=args.beam_size, max_length=args.max_length,
    #     source_entity_dim=len(entity2id), source_pos_dim=len(pos2id),
    #     embedding_entity_dim=args.embedding_entity_dim,
    #     embedding_pos_dim=args.embedding_pos_dim,
    #     end_id=word2id_output["<E>"])
    #
    # # clone from default main program and use it as the validation program
    # main_program = fluid.default_main_program()
    # inference_program = fluid.default_main_program().clone(for_test=True)
    # optimizer = fluid.optimizer.Adam(
    #     learning_rate=args.learning_rate,
    #     regularization=fluid.regularizer.L2DecayRegularizer(regularization_coeff=1e-5))
    # optimizer.minimize(avg_cost, no_grad_set=net.no_grad_set)
    # print("begin memory optimization ...")
    # # fluid.memory_optimize(train_program)
    # fluid.memory_optimize(main_program)
    # print("end memory optimization ...")
    # loss = avg_cost

    train_program = fluid.Program()
    train_startup = fluid.Program()
    # if "CE_MODE_X" in os.environ:
    #     train_program.random_seed = 110
    #     train_startup.random_seed = 110
    with fluid.program_guard(train_program, train_startup):
        with fluid.unique_name.guard():
            # Training process
            net = model.net(args.embedding_dim,
                            args.encoder_size,
                            args.decoder_size,
                            word_ori_size,
                            word_size,
                            tag_size,
                            False,
                            beam_size=args.beam_size,
                            max_length=args.max_length,
                            source_entity_dim=len(entity2id),
                            source_pos_dim=len(pos2id),
                            embedding_entity_dim=args.embedding_entity_dim,
                            embedding_pos_dim=args.embedding_pos_dim,
                            end_id=word2id_output["<E>"])
            loss = net.avg_cost
            feed_order = net.feeding_list
            # gradient clipping
            fluid.clip.set_gradient_clip(
                clip=fluid.clip.GradientClipByValue(max=1.0, min=-1.0))
            optimizer = fluid.optimizer.Adam(
                learning_rate=args.learning_rate,
                regularization=fluid.regularizer.L2DecayRegularizer(
                    regularization_coeff=1e-5))
            # optimizer = fluid.optimizer.Adam(
            #     learning_rate=fluid.layers.exponential_decay(
            #         learning_rate=args.learning_rate,
            #         decay_steps=400, decay_rate=0.9, staircase=True))
            optimizer.minimize(loss)
            avg_cost = loss
            # print("begin memory optimization ...")
            # fluid.memory_optimize(train_program)
            # print("end memory optimization ...")

    test_program = fluid.Program()
    test_startup = fluid.Program()
    # if "CE_MODE_X" in os.environ:
    #     test_program.random_seed = 110
    #     test_startup.random_seed = 110
    with fluid.program_guard(test_program, test_startup):
        with fluid.unique_name.guard():
            # Test net
            net_test = model.net(
                args.embedding_dim,
                args.encoder_size,
                args.decoder_size,
                word_ori_size,
                word_size,
                tag_size,
                True,
                beam_size=args.beam_size,
                max_length=args.max_length,
                source_entity_dim=len(entity2id),
                source_pos_dim=len(pos2id),
                embedding_entity_dim=args.embedding_entity_dim,
                embedding_pos_dim=args.embedding_pos_dim,
                end_id=word2id_output["<E>"])
    test_program = test_program.clone(for_test=True)

    main_program = train_program
    inference_program = test_program

    # print(type(paddle.dataset.wmt14.train(args.dict_size)))
    # print(type(paddle.reader.shuffle(data_train, buf_size=1000)))
    # print(args.enable_ce)
    # for batch_id, data in enumerate(paddle.batch(
    #         paddle.reader.shuffle(paddle.dataset.wmt14.train(args.dict_size), buf_size=1000),
    #         batch_size=args.batch_size, drop_last=False)()):
    #     print(data)
    #     break

    # Disable shuffle for Continuous Evaluation only
    if not args.enable_ce:
        train_batch_generator = paddle.batch(
            paddle.reader.shuffle(data_train, buf_size=1000),
            batch_size=args.batch_size, drop_last=False)
    else:
        train_batch_generator = paddle.batch(
            data_train, batch_size=args.batch_size, drop_last=False)
    dev_batch_generator = paddle.batch(
        paddle.reader.buffered(data_dev, size=1000),
        batch_size=args.batch_size, drop_last=False)
    test_batch_generator = paddle.batch(
        paddle.reader.buffered(data_test, size=1000),
        batch_size=args.batch_size, drop_last=False)
    # print(type(train_batch_generator))

    # Init model
    if args.use_gpu:
        place = fluid.CUDAPlace(0)
        dev_count = fluid.core.get_cuda_device_count()
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    print("device count %d" % dev_count)
    # print("theoretical memory usage: ")
    # print(fluid.contrib.memory_usage(program=main_program, batch_size=args.batch_size))

    # print("=====Init Main program")
    # exe = Executor(place)
    # # Init para
    # exe.run(framework.default_startup_program())
    # # exe = fluid.ParallelExecutor(use_cuda=args.use_gpu)
    # # os.environ['CPU_NUM'] = "2"
    # # exe = fluid.parallel_executor.ParallelExecutor(
    # #     use_cuda=args.use_gpu, num_trainers=2,
    # #     loss_name=avg_cost.name,
    # #     main_program=fluid.default_main_program())
    exe = fluid.Executor(place)
    print("=====Init train program")
    exe.run(train_startup)
    print("=====Init test program")
    exe.run(test_startup)
    # print("=====Init train exe")
    # train_exe = fluid.ParallelExecutor(
    #     use_cuda=args.use_gpu, loss_name=loss.name, main_program=train_program)
    # print("=====Init test exe")
    # test_exe = fluid.ParallelExecutor(
    #     use_cuda=args.use_gpu, main_program=test_program, share_vars_from=train_exe)

    ## Set word emb
    # print("=====Set word embedding")
    # embeddings = embeddings.astype("float32")
    # word_emb_param = fluid.global_scope().find_var("emb").get_tensor()
    # word_emb_param.set(embeddings, place)

    print("=====Init Feeder")
    feed_list = [
        main_program.global_block().var(var_name) for var_name in feed_order
    ]
    feed_list_test = [
        inference_program.global_block().var(var_name)
        for var_name in net_test.feeding_list
    ]
    # print(feed_list)
    feeder = fluid.DataFeeder(feed_list, place)
    feeder_test = fluid.DataFeeder(feed_list_test, place)
    # return

    def validation(generater, test_scores):
        # Use test set as validation each pass
        test_scores.reset()
        total_loss = 0.0
        count = 0
        # val_feed_list = [
        #     inference_program.global_block().var(var_name)
        #     for var_name in net_test.feeding_list
        # ]
        # val_feeder = fluid.DataFeeder(val_feed_list, place)
        for batch_id, data in enumerate(generater()):
            # The value of batch_size may vary in the last batch
            batch_size = len(data)

            # Setup initial ids and scores lod tensor
            init_ids_data = np.array(
                [word2id_output["<S>"] for _ in range(batch_size)], dtype='int64')
            init_scores_data = np.array(
                [1. for _ in range(batch_size)], dtype='float32')
            init_ids_data = init_ids_data.reshape((batch_size, 1))
            init_scores_data = init_scores_data.reshape((batch_size, 1))
            init_recursive_seq_lens = [1] * batch_size
            init_recursive_seq_lens = [
                init_recursive_seq_lens, init_recursive_seq_lens
            ]
            init_ids = fluid.create_lod_tensor(init_ids_data,
                                               init_recursive_seq_lens, place)
            init_scores = fluid.create_lod_tensor(init_scores_data,
                                                  init_recursive_seq_lens, place)

            # Feed dict for inference
            # feed_dict = feeder.feed([[x[0]] for x in data])
            feed_dict = feeder_test.feed(data)
            feed_dict['init_ids'] = init_ids
            feed_dict['init_scores'] = init_scores

            val_fetch_outs = exe.run(
                inference_program,  # test_program(),
                feed=feed_dict,
                fetch_list=[net_test.translation_ids],
                return_numpy=False)
            # test_scores.update(preds=val_fetch_outs[0], labels=[_[-1] for _ in data])
            # print("=====Update scores")
            test_scores.update(preds=val_fetch_outs[0],
                               labels=[_[-1] for _ in data],
                               words_list=[_[0] for _ in data],
                               for_generate=True)
            # val_fetch_outs = exe.run(inference_program,
            #                          feed=val_feeder.feed(data),
            #                          fetch_list=[avg_cost, net.label],
            #                          return_numpy=False)
            # test_scores.update(preds=val_fetch_outs[1],
            #                    labels=[_[-1] for _ in data],
            #                    words_list=[_[0] for _ in data])
            total_loss += 1.0
            count += 1
            # if batch_id > 0:
            #     break
        values = test_scores.eval()
        test_scores.eval_show()
        return total_loss / count, values

    print("=====Init scores")
    id2word_output = {}
    for k in word2id_output:
        id2word_output[word2id_output[k]] = k
    scores_train = generate_pr(word_dict=id2word_output)
    scores_train.append_label(data_train)
    scores_test = generate_pr(word_dict=id2word_output)
    scores_test.append_label(data_test)
    scores_dev = generate_pr(word_dict=id2word_output)
    scores_dev.append_label(data_dev)

    max_tri_f1 = 0.0
    max_tri_pass = -1.0
    max_arg_f1 = 0.0
    max_arg_pass = -1.0

    print("=====Start training")
    for pass_id in range(1, args.pass_num + 1):
        scores_train.reset()
        pass_start_time = time.time()
        words_seen = 0
        for batch_id, _data in enumerate(train_batch_generator()):
            batch_size = len(_data)
            words_seen += len(_data) * 2
            # print(_data)
            # print(len(_data))
            # print(sum([len(_[0]) for _ in _data]))

            # # Setup initial ids and scores lod tensor
            # init_ids_data = np.array([0 for _ in range(batch_size)], dtype='int64')
            # init_scores_data = np.array([1. for _ in range(batch_size)], dtype='float32')
            # init_ids_data = init_ids_data.reshape((batch_size, 1))
            # init_scores_data = init_scores_data.reshape((batch_size, 1))
            # init_recursive_seq_lens = [1] * batch_size
            # init_recursive_seq_lens = [init_recursive_seq_lens, init_recursive_seq_lens]
            # init_ids = fluid.create_lod_tensor(init_ids_data, init_recursive_seq_lens, place)
            # init_scores = fluid.create_lod_tensor(init_scores_data, init_recursive_seq_lens, place)
            # # Feed dict for inference
            # # feed_dict = feeder.feed([[x[0]] for x in _data])
            # feed_dict = feeder.feed(_data)
            # feed_dict['init_ids'] = init_ids
            # feed_dict['init_scores'] = init_scores
            # avg_cost_train, preds = exe.run(
            #     framework.default_main_program(),  # test_program(),
            #     feed=feed_dict,
            #     fetch_list=[avg_cost, net.predict],
            #     return_numpy=False)
            avg_cost_train, preds = exe.run(
                main_program,  # train_program(),
                feed=feeder.feed(_data),
                fetch_list=[avg_cost, net.label],
                return_numpy=False)
            # print(np.array(labels).shape)
            # print(np.array(preds).tolist())
            # print([_[-1] for _ in _data])
            # print([_[0] for _ in _data])
            avg_cost_train = np.array(avg_cost_train)
            if batch_id % 10 == 0:
                print('pass_id=%d, batch_id=%d, train_loss: %f' %
                      (pass_id, batch_id, avg_cost_train))
            scores_train.update(preds=preds,
                                labels=[_[-1] for _ in _data],
                                words_list=[_[0] for _ in _data])
            # This is for continuous evaluation only
            # if args.enable_ce and batch_id >= 100:
            # if batch_id > 0:
            #     break
        scores_train.eval_show()
        pass_end_time = time.time()
        new_max_dev = False
parser.add_argument('--optimizer', type=str, default='Adam', help='Adam/Adadelta/Adagrad/RMSProp/Momentum/SGD')
parser.add_argument('--CRF', type=str2bool, default=True, help='use CRF at the top layer. if False, use Softmax')
parser.add_argument('--lr', type=float, default=0.001, help='learning rate')
parser.add_argument('--clip', type=float, default=5.0, help='gradient clipping')
parser.add_argument('--dropout', type=float, default=0.5, help='dropout keep_prob')
parser.add_argument('--update_embedding', type=str2bool, default=True, help='update embedding during training')
parser.add_argument('--pretrain_embedding', type=str, default='random', help='use pretrained char embedding or init it randomly')
parser.add_argument('--embedding_dim', type=int, default=300, help='random init char embedding_dim')
parser.add_argument('--shuffle', type=str2bool, default=True, help='shuffle training data before each epoch')
parser.add_argument('--mode', type=str, default='demo', help='train/test/demo')
parser.add_argument('--demo_model', type=str, default='1521112368', help='model for test and demo')
args = parser.parse_args()

## get char embeddings
word2id = read_dictionary(os.path.join('.', args.train_data, 'word2id.pkl'))
if args.pretrain_embedding == 'random':
    embeddings = random_embedding(word2id, args.embedding_dim)
else:
    embedding_path = 'pretrain_embedding.npy'
    embeddings = np.array(np.load(embedding_path), dtype='float32')

## read corpus and get training data
if args.mode != 'demo':
    train_path = os.path.join('.', args.train_data, 'train_data')
    test_path = os.path.join('.', args.test_data, 'test_data')
    train_data = read_corpus(train_path)
    test_data = read_corpus(test_path)
    test_size = len(test_data)
                    default=300, help='random init char embedding_dim')
parser.add_argument('--shuffle', type=str2bool, default=True, help='shuffle training data before each epoch')
parser.add_argument('--mode', type=str, default='demo', help='train/test/demo')
parser.add_argument('--demo_model', type=str, default='1521112368', help='model for test and demo')
args = parser.parse_args()

## load char vocabulary list
vocab_path = os.path.join('.', args.train_data, 'word2id.pkl')
word2id = read_dictionary(vocab_path)

# get char embeddings
if args.pretrain_embedding == 'random':
    embeddings = random_embedding(word2id, args.embedding_dim)
else:
    embedding_path = 'pretrain_embedding.npy'
    embeddings = np.array(np.load(embedding_path), dtype='float32')

## read corpus and get training data
if args.mode == 'demo':
    train_path = os.path.join('.', args.train_data, 'train_data')
    test_path = os.path.join('.', args.test_data, 'test_data')
    train_data = read_corpus(train_path)
    test_data = read_corpus(test_path)
    test_size = len(test_data)
## Session configuration. Entry point for training and testing the network:
## the main block controls training and model saving, as well as testing
## (loading and invoking a saved model).
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'  # default: 0
config = tf.ConfigProto()

## hyperparameters
embedding_dim = 128
tag2label = {"N": 0, "解剖部位": 1, "手术": 2, "药物": 3, "独立症状": 4, "症状描述": 5}

## get char embeddings
word2id = read_dictionary('./vocab.pkl')
embeddings = random_embedding(word2id, embedding_dim)
train_data = read_corpus('./c.txt')

## training model
if __name__ == '__main__':
    # positional args: embeddings, tag2label, vocab, batch_size, epoch, hidden_dim, CRF, update_embedding, shuffle
    model = BiLSTM_CRF(embeddings, tag2label, word2id, 4, 80, 128, False, True, True)
    model.build_graph()
    test_report = open('test_report.txt', 'w', encoding='utf-8')
    print("train data: {}".format(len(train_data)))
    model.test(test_report)
    # model.train(train=train_data)  # use test_data as the dev_data to check for overfitting
import numpy as np
import pandas as pd
import string
import random

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Embedding, Activation, Flatten, Dense
from keras.layers import Conv1D, MaxPooling1D, Dropout
from keras.models import Model

from data import read_dictionary, build_char_dict, prepare_data

# Each line is formatted as: "word {gender}"
word_list = read_dictionary('./wiktionary_nouns_with_gender.txt')

char2idx, idx2char = build_char_dict()
vocab_size = len(char2idx)

# One-hot embedding matrix: row 0 is the all-zero padding vector,
# row i (i >= 1) is the one-hot vector of the character with index i
# (see the hedged model sketch below for one way to consume it).
embedding_weights = []
embedding_weights.append(np.zeros(vocab_size))
for char, idx in char2idx.items():
    onehot = np.zeros(vocab_size)
    onehot[idx - 1] = 1
    embedding_weights.append(onehot)
embedding_weights = np.array(embedding_weights)
embedding_size = embedding_weights.shape[1]
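# Hedged sketch (not from the original source): one plausible way the one-hot
# embedding matrix built above could feed a small char-CNN gender classifier.
# `max_word_len` and `num_genders` are assumed placeholder values.
max_word_len = 20      # assumed maximum word length in characters
num_genders = 3        # assumed label count, e.g. der/die/das

inputs = Input(shape=(max_word_len,), dtype='int64')
x = Embedding(vocab_size + 1, embedding_size,
              weights=[embedding_weights], trainable=False)(inputs)
x = Conv1D(64, 3, activation='relu')(x)
x = MaxPooling1D(2)(x)
x = Flatten()(x)
x = Dropout(0.5)(x)
outputs = Dense(num_genders, activation='softmax')(x)

char_cnn = Model(inputs, outputs)
char_cnn.compile(optimizer='adam',
                 loss='sparse_categorical_crossentropy',
                 metrics=['accuracy'])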
def get_model(self):
    config = tf.ConfigProto()
    parser = argparse.ArgumentParser(description='BiLSTM-CRF for Chinese NER task')
    parser.add_argument('--train_data', type=str, default='data_path', help='train data source')
    parser.add_argument('--test_data', type=str, default='data_path', help='test data source')
    parser.add_argument('--batch_size', type=int, default=64, help='#sample of each minibatch')
    parser.add_argument('--epoch', type=int, default=40, help='#epoch of training')
    parser.add_argument('--hidden_dim', type=int, default=300, help='#dim of hidden state')
    parser.add_argument('--optimizer', type=str, default='Adam', help='Adam/Adadelta/Adagrad/RMSProp/Momentum/SGD')
    parser.add_argument('--CRF', type=str2bool, default=True, help='use CRF at the top layer. if False, use Softmax')
    parser.add_argument('--lr', type=float, default=0.001, help='learning rate')
    parser.add_argument('--clip', type=float, default=5.0, help='gradient clipping')
    parser.add_argument('--dropout', type=float, default=0.5, help='dropout keep_prob')
    parser.add_argument('--update_embedding', type=str2bool, default=True, help='update embedding during training')
    parser.add_argument('--pretrain_embedding', type=str, default='random', help='use pretrained char embedding or init it randomly')
    parser.add_argument('--embedding_dim', type=int, default=300, help='random init char embedding_dim')
    parser.add_argument('--shuffle', type=str2bool, default=True, help='shuffle training data before each epoch')
    parser.add_argument('--mode', type=str, default='demo', help='train/test/demo')
    parser.add_argument('--demo_model', type=str, default='1521112368', help='model for test and demo')
    args = parser.parse_args()

    ## get char embeddings
    word2id = read_dictionary(os.path.join('.', args.train_data, 'word2id.pkl'))
    if args.pretrain_embedding == 'random':
        embeddings = random_embedding(word2id, args.embedding_dim)
    else:
        embedding_path = 'pretrain_embedding.npy'
        embeddings = np.array(np.load(embedding_path), dtype='float32')

    paths = {}
    timestamp = str(int(time.time())) if args.mode == 'train' else args.demo_model
    output_path = os.path.join('.', args.train_data + "_save", timestamp)
    if not os.path.exists(output_path):
        os.makedirs(output_path)
    summary_path = os.path.join(output_path, "summaries")
    paths['summary_path'] = summary_path
    if not os.path.exists(summary_path):
        os.makedirs(summary_path)
    model_path = os.path.join(output_path, "checkpoints/")
    if not os.path.exists(model_path):
        os.makedirs(model_path)
    ckpt_prefix = os.path.join(model_path, "model")
    paths['model_path'] = ckpt_prefix
    result_path = os.path.join(output_path, "results")
    paths['result_path'] = result_path
    if not os.path.exists(result_path):
        os.makedirs(result_path)
    log_path = os.path.join(result_path, "log.txt")
    paths['log_path'] = log_path
    get_logger(log_path).info(str(args))

    ckpt_file = tf.train.latest_checkpoint(model_path)
    print(ckpt_file)
    paths['model_path'] = ckpt_file
    model = BiLSTM_CRF(args, embeddings, tag2label, word2id, paths, config=config)
    model.build_graph()
    saver = tf.train.Saver()
    return model, saver, ckpt_file
    ckpt_prefix = os.path.join(model_path, "model")
    paths['model_path'] = ckpt_prefix
    result_path = os.path.join(output_path, "results")
    paths['result_path'] = result_path
    if not os.path.exists(result_path):
        os.makedirs(result_path)
    log_path = os.path.join(result_path, "log.txt")
    paths['log_path'] = log_path
    get_logger(log_path).info(str(args))
    return paths


args = para_set()
paths = path_set()
config = tf.ConfigProto()

print("loading data")
train_data, test_data = auas_read_corpus("/home/jinsh/wiki_model/data/extraction.corpus_all.json")
print("{0} training data \n{1} test data".format(len(train_data), len(test_data)))

# always use random embedding
word2id = read_dictionary(word2id_path)
embeddings = random_embedding(word2id, args.embedding_dim)

model_path = root_dir + "key_word/best_model/checkpoints/"
ckpt_file = tf.train.latest_checkpoint(model_path)
# print(ckpt_file)
print(ckpt_file)
# exit
paths['model_path'] = ckpt_file
model = BiLSTM_CRF(args, embeddings, tag2label, word2id, paths, config=config)
model.build_graph()
# print("test data: {}".format(len(test_data)))
model.test(test_data)
    words_count_list = sorted(words_count_list, key=lambda x: x[0])
    word2id_mini = {}
    for i, item in enumerate(words_count_list):
        key = id2word[item[0]]
        word2id_mini[key] = i
    print(len(word2id_mini))
    with open(save_prefix + "word2id.pkl", 'wb') as fw:  # binary mode for pickle
        pickle.dump(word2id_mini, fw)


if __name__ == "__main__":
    import data
    from Constant import pos2id, entity2id, event_args, label2idx, tag2label

    # word2id = data.read_dictionary("data/pre_trained_word2id.pkl")
    # embeddings = np.load("data/pre_trained_embeddings.npy")
    word2id = data.read_dictionary("data/pre_trained_mini_word2id.pkl")
    embeddings = np.load("data/pre_trained_mini_embeddings.npy")
    word2id_output = word2id.copy()
    word_ori_size = len(word2id)
    # word_mini_size = len(word2id_output)
    # word_size = word_ori_size
    # word_size = word_mini_size
    word_size = 0
    for k in tag2label:
        tag2label[k] += args.max_length
        if tag2label[k] > word_size:
            word_size = tag2label[k]
    # word2id_output.update(tag2label)
    word2id_output = tag2label
    word2id_output["<S>"] = word_size + 1
lr_pl = float(conf.get('train_arg', 'lr_pl'))

# graph parameters
uniDocModel_wordEmbedSize = int(conf.get('graph_arg', 'uniDocModel_wordEmbedSize'))
uniDocModel_hiddenSize = int(conf.get('graph_arg', 'uniDocModel_hiddenSize'))
classModel_hiddenSize = int(conf.get('graph_arg', 'classModel_hiddenSize'))

# padding parameters
train_max_sent_len = int(conf.get('pad_arg', 'train_max_sent_len'))
train_max_sent_num = int(conf.get('pad_arg', 'train_max_sent_num'))
test_max_sent_len = int(conf.get('pad_arg', 'test_max_sent_len'))
test_max_sent_num = int(conf.get('pad_arg', 'test_max_sent_num'))

if __name__ == "__main__":
    mode = "train"
    if os.path.exists(os.path.join("data_path", 'word2id.pkl')):
        word2id = read_dictionary(os.path.join("data_path", 'word2id.pkl'))
    else:
        build_vocab_doc(os.path.join("data_path", 'word2id.pkl'), train_data_path)
        word2id = read_dictionary(os.path.join("data_path", 'word2id.pkl'))
    vocab_size = len(word2id)
    num_tags = len(tag2label)

    if mode == "train":
        timestamp = str(int(time.time()))
    else:
        timestamp = conf.get('path_arg', 'test_time')
    output_path = os.path.join(output_path, timestamp)
    if not os.path.exists(output_path):
        os.makedirs(output_path)
    model_path = os.path.join(output_path, conf.get('path_arg', 'model_path'))
    summary_path = os.path.join(output_path, conf.get('path_arg', 'summary_path'))
# -*- coding: utf-8 -*-
"""
Created on Wed Feb 27 14:47:03 2019

@author: Administrator
"""
# import tensorflow as tf
# import numpy as np
# import os, argparse, time, random
from BiLSTMmodel import bilstm_model
from data import read_corpus, read_dictionary, random_embedding
from config import config

## get char embeddings
word2id = read_dictionary('vocab')
## randomly initialize the embeddings
embeddings = random_embedding(word2id, config.embedding_size)

# TODO: note the model_path value -- it is an easy place to trip up!
paths = {'log_path': 'logger//', 'model_path': './model2/', 'result_path': 'result//'}
model = bilstm_model(embeddings, paths, word2id, config=config)
model.build_graph()

## train model on the whole training data
train_data = read_corpus('pku_training.utf8')
print("train data: {}".format(len(train_data)))
import logging
import os

import numpy as np

from data import read_corpus, read_dictionary
from model import BiLSTM_CRF, Config
from utils import NerCfgData

ner_cfg = NerCfgData()
label2id = ner_cfg.generate_tag_to_label()

logger = logging.getLogger(__name__)
current_dir = os.path.dirname(os.path.abspath(__file__))

## get char embeddings
word2id_pos2id = read_dictionary('word2id_pos2id_new.pkl')
word2id = word2id_pos2id['word2id']
pos2id = word2id_pos2id['pos2id']
word_embedding = np.array(np.load('word2vec.npy'), dtype=np.float32)
pos_embedding = np.array(np.load('pos2vec.npy'), dtype=np.float32)

config = Config(word2id, pos2id, label2id, batch_size=128, n_epochs=200, n_neurons=60)
config.word_embedding = word_embedding
config.pos_embedding = pos_embedding

## read corpus and get training data
                    type=str2bool, default=True, help='shuffle training data before each epoch')
parser.add_argument('--mode', type=str, default='demo', help='train/test/demo')
parser.add_argument('--demo_model', type=str, default='1521112368', help='model for test and demo')
parser.add_argument('--seq_length', type=int, default=20, help='Pretrain language model seq length')
args = parser.parse_args()

## get char embeddings
word2id = read_dictionary(os.path.join('.', args.train_data, 'word2id.pkl'))
if args.pretrain_embedding == 'random':
    embeddings = random_embedding(word2id, args.embedding_dim)
else:
    embedding_path = 'pretrain_embedding.npy'
    embeddings = np.array(np.load(embedding_path), dtype='float32')

## read corpus and get training data
if args.mode != 'demo':
    train_path = os.path.join('.', args.train_data, 'train_data')
    pre_train_path = os.path.join('.', args.train_data, 'resume_data')
    test_path = os.path.join('.', args.test_data, 'test_data')
    train_data = read_corpus(train_path)
    pre_train_data = read_pre_train_data(pre_train_path, args.seq_length)
    test_data = read_corpus(test_path)
def run(sentences):
    # session configuration
    os.environ['CUDA_VISIBLE_DEVICES'] = '0'  # use GPU 0
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'  # log level
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.per_process_gpu_memory_fraction = 0.2  # need ~700MB GPU memory

    # hyperparameters
    # create an argument parser and declare the expected arguments;
    # at runtime it is used to parse the command line
    parser = argparse.ArgumentParser(description='BiLSTM-CRF for Chinese NER task')
    parser.add_argument('--train_data', type=str, default='data_path', help='train data source')
    parser.add_argument('--test_data', type=str, default='data_path', help='test data source')
    # batch_size: number of samples drawn from the training set per SGD step;
    # one iteration = one forward pass + one backward pass over batch_size samples
    parser.add_argument('--batch_size', type=int, default=64, help='#sample of each minibatch')
    # epoch: one full pass (forward + backward) over all training samples
    # (e.g. 1000 training samples with batch_size=10 -> 100 iterations = 1 epoch)
    parser.add_argument('--epoch', type=int, default=40, help='#epoch of training')
    parser.add_argument('--hidden_dim', type=int, default=300, help='#dim of hidden state')  # hidden state dimension: 300
    parser.add_argument('--optimizer', type=str, default='Adam', help='Adam/Adadelta/Adagrad/RMSProp/Momentum/SGD')  # Adam optimizer
    parser.add_argument('--CRF', type=str2bool, default=True, help='use CRF at the top layer. if False, use Softmax')
    parser.add_argument('--lr', type=float, default=0.001, help='learning rate')
    parser.add_argument('--clip', type=float, default=5.0, help='gradient clipping')
    # dropout: temporarily drop network units with a given probability during training
    parser.add_argument('--dropout', type=float, default=0.5, help='dropout keep_prob')
    parser.add_argument('--update_embedding', type=str2bool, default=True, help='update embedding during training')
    parser.add_argument('--pretrain_embedding', type=str, default='random', help='use pretrained char embedding or init it randomly')
    parser.add_argument('--embedding_dim', type=int, default=300, help='random init char embedding_dim')
    parser.add_argument('--shuffle', type=str2bool, default=True, help='shuffle training data before each epoch')
    parser.add_argument('--mode', type=str, default='demo', help='train/test/demo')
    parser.add_argument('--demo_model', type=str, default='1559398699', help='model for test and demo')
    # parse the arguments
    args = parser.parse_args()

    # initialize the embedding matrix: load the vocabulary
    word2id = read_dictionary(os.path.join('.', args.train_data, 'word2id.pkl'))
    # random_embedding returns a len(vocab) x embedding_dim (3905 x 300) matrix
    # with values in [-0.25, 0.25] as the initial embeddings
    if args.pretrain_embedding == 'random':
        embeddings = random_embedding(word2id, args.embedding_dim)
    else:
        embedding_path = 'pretrain_embedding.npy'
        embeddings = np.array(np.load(embedding_path), dtype='float32')

    # read train and test sets
    if args.mode != 'demo':
        train_path = os.path.join('.', args.train_data, 'train_data')
        test_path = os.path.join('.', args.test_data, 'test_data')
        train_data = read_corpus(train_path)
        test_data = read_corpus(test_path)
        test_size = len(test_data)

    # paths setting
    paths = {}
    timestamp = str(int(time.time())) if args.mode == 'train' else args.demo_model
    output_path = os.path.join('.', args.train_data + "_save", timestamp)
    if not os.path.exists(output_path):
        os.makedirs(output_path)
    summary_path = os.path.join(output_path, "summaries")
    paths['summary_path'] = summary_path
    if not os.path.exists(summary_path):
        os.makedirs(summary_path)
    model_path = os.path.join(output_path, "checkpoints/")
    if not os.path.exists(model_path):
        os.makedirs(model_path)
    ckpt_prefix = os.path.join(model_path, "model")
    paths['model_path'] = ckpt_prefix
    result_path = os.path.join(output_path, "results")
    paths['result_path'] = result_path
    if not os.path.exists(result_path):
        os.makedirs(result_path)
    log_path = os.path.join(result_path, "log.txt")
    paths['log_path'] = log_path
    get_logger(log_path).info(str(args))  # write the arguments to the log file

    if args.mode == 'train':
        # train the model
        model = BiLSTM_CRF(args, embeddings, tag2label, word2id, paths, config=config)
        model.build_graph()
        model.train(train=train_data, dev=test_data)
    elif args.mode == 'test':
        # test the model
        ckpt_file = tf.train.latest_checkpoint(model_path)
        print(ckpt_file)
        paths['model_path'] = ckpt_file
        model = BiLSTM_CRF(args, embeddings, tag2label, word2id, paths, config=config)
        model.build_graph()
        print("test data: {}".format(test_size))
        model.test(test_data)
    elif args.mode == 'demo':
        # demo
        location = []
        ckpt_file = tf.train.latest_checkpoint(model_path)
        print("model path: ", ckpt_file)
        paths['model_path'] = ckpt_file  # set the model path
        model = BiLSTM_CRF(args, embeddings, tag2label, word2id, paths, config=config)
        model.build_graph()
        saver = tf.train.Saver()
        with tf.Session(config=config) as sess:
            saver.restore(sess, ckpt_file)
            for sentence in sentences:
                demo_sent = sentence
                demo_sent = list(demo_sent.strip())  # strip whitespace
                demo_data = [(demo_sent, ['O'] * len(demo_sent))]
                tag = model.demo_one(sess, demo_data)
                PER, LOC, ORG = get_entity(tag, demo_sent)  # recover entity strings from the tag sequence
                new_LOC = list(set(LOC))  # deduplicate
                loc = ' '.join(new_LOC)
                location.append(loc)
        return location
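# Hedged usage sketch (not part of the original source): run() takes a list of
# raw sentence strings and returns one space-joined string of deduplicated
# location entities per input sentence. The example sentences are made up.
if __name__ == '__main__':
    sample_sentences = ["我爱北京天安门", "今天上海和广州都下雨了"]
    locations = run(sample_sentences)
    for sent, loc in zip(sample_sentences, locations):
        print(sent, '->', loc)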