def __init__(self, path):
    with open(path, 'rb') as f:
        try:
            # Python 3: decode a Python 2 pickle with latin1
            self.train_set, self.test_set, self.dicts = pickle.load(f, encoding='latin1')
        except TypeError:
            # Python 2: pickle.load() takes no `encoding` argument
            f.seek(0)
            self.train_set, self.test_set, self.dicts = pickle.load(f)
    self.embeddings = random_embedding(self.dicts["words2idx"], 300)
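The random_embedding helper appears throughout these excerpts but is never defined in them. A minimal sketch, assuming (as a comment in a later snippet states) that it returns a len(vocab) x embedding_dim float32 matrix with entries drawn uniformly from [-0.25, 0.25]:

import numpy as np

def random_embedding(vocab, embedding_dim):
    # uniform init in [-0.25, 0.25], one row per vocabulary entry
    embedding_mat = np.random.uniform(-0.25, 0.25, (len(vocab), embedding_dim))
    return np.float32(embedding_mat)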
def getDicEmbed():
    word2id = read_dictionary(os.path.join('.', args.train_data, 'word2id.pkl'))
    if args.pretrain_embedding == 'random':
        embeddings = random_embedding(word2id, args.embedding_dim)
    else:
        embedding_path = 'pretrain_embedding.npy'
        embeddings = np.array(np.load(embedding_path), dtype='float32')
    return word2id, embeddings
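read_dictionary is likewise undefined in these excerpts; a comment in a later snippet describes word2id.pkl as a pickled word-to-id mapping, so a plausible sketch is simply:

import pickle

def read_dictionary(vocab_path):
    # load the pickled {word: id} mapping produced by the data-preparation step
    with open(vocab_path, 'rb') as fr:
        word2id = pickle.load(fr)
    return word2id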
def get_embedding(self):
    """Decide between pre-trained word vectors and random initialization.

    :return: the embedding matrix
    """
    fname = os.path.join('.', args.embeddingfile, args.embedding)
    if args.pretrain_embedding == 'random':
        embeddings = random_embedding(self.word2id, args.embedding_dim)
    else:
        word_vecs = load_bin_vec(fname, vocab=list(self.word2id), ksize=300)
        embeddings = get_W(word_vecs=word_vecs, vocab_ids_map=self.word2id,
                           k=300, is_rand=False)
    return embeddings
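load_bin_vec and get_W are not shown here; they appear to follow the word2vec-loading utilities popularized by the CNN-sentence codebase. A hypothetical sketch of get_W matching the call signature above (the fallback-to-random behavior is an assumption): copy each word's pre-trained vector into the row given by vocab_ids_map, using random values for out-of-vocabulary words.

import numpy as np

def get_W(word_vecs, vocab_ids_map, k=300, is_rand=False):
    # word_vecs: {word: np.ndarray of size k}; vocab_ids_map: {word: row index}
    W = np.zeros((len(vocab_ids_map), k), dtype='float32')
    for word, idx in vocab_ids_map.items():
        if not is_rand and word in word_vecs:
            W[idx] = word_vecs[word]
        else:
            W[idx] = np.random.uniform(-0.25, 0.25, k)
    return W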
def __init__(self, args):
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.per_process_gpu_memory_fraction = 0.2
    paths, model_path = get_paths(args)
    ckpt_file = tf.train.latest_checkpoint(model_path)
    paths['model_path'] = ckpt_file
    word2id = read_dictionary(os.path.join('.', args.train_data, 'word2id.pkl'))
    embeddings = random_embedding(word2id, args.embedding_dim)
    self.model = BiLSTM_CRF(args, embeddings, tag2label, word2id, paths, config=config)
    self.model.build_graph()
    self.saver = tf.train.Saver()
    self.sess = tf.Session(config=config)
    self.saver.restore(self.sess, ckpt_file)
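Assuming this constructor belongs to a predictor wrapper (class name hypothetical) and that BiLSTM_CRF.demo_one(sess, data) returns a tag sequence, as the demo snippets below suggest, usage would look like:

predictor = NERPredictor(args)  # hypothetical name for the wrapper class above
sent = list('demo sentence')
tags = predictor.model.demo_one(predictor.sess, [(sent, ['O'] * len(sent))])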
"""
Created on Wed Feb 27 14:47:03 2019

@author: Administrator
"""
#import tensorflow as tf
#import numpy as np
#import os, argparse, time, random
from BiLSTMmodel import bilstm_model
from data import read_corpus, read_dictionary, random_embedding
from config import config

## get char embeddings
word2id = read_dictionary('vocab')
## randomly initialize the embeddings
embeddings = random_embedding(word2id, config.embedding_size)
paths = {'log_path': 'logger//', 'model_path': './model2/', 'result_path': 'result//'}
# TODO note: model_path!! This one is a real pitfall!!
model = bilstm_model(embeddings, paths, word2id, config=config)
model.build_graph()

## train model on the whole training data
train_data = read_corpus('pku_training.utf8')
print("train data: {}".format(len(train_data)))
model.train(train_data=train_data)
def get_model(self):
    config = tf.ConfigProto()
    parser = argparse.ArgumentParser(description='BiLSTM-CRF for Chinese NER task')
    parser.add_argument('--train_data', type=str, default='data_path', help='train data source')
    parser.add_argument('--test_data', type=str, default='data_path', help='test data source')
    parser.add_argument('--batch_size', type=int, default=64, help='#sample of each minibatch')
    parser.add_argument('--epoch', type=int, default=40, help='#epoch of training')
    parser.add_argument('--hidden_dim', type=int, default=300, help='#dim of hidden state')
    parser.add_argument('--optimizer', type=str, default='Adam', help='Adam/Adadelta/Adagrad/RMSProp/Momentum/SGD')
    parser.add_argument('--CRF', type=str2bool, default=True, help='use CRF at the top layer. if False, use Softmax')
    parser.add_argument('--lr', type=float, default=0.001, help='learning rate')
    parser.add_argument('--clip', type=float, default=5.0, help='gradient clipping')
    parser.add_argument('--dropout', type=float, default=0.5, help='dropout keep_prob')
    parser.add_argument('--update_embedding', type=str2bool, default=True, help='update embedding during training')
    parser.add_argument('--pretrain_embedding', type=str, default='random', help='use pretrained char embedding or init it randomly')
    parser.add_argument('--embedding_dim', type=int, default=300, help='random init char embedding_dim')
    parser.add_argument('--shuffle', type=str2bool, default=True, help='shuffle training data before each epoch')
    parser.add_argument('--mode', type=str, default='demo', help='train/test/demo')
    parser.add_argument('--demo_model', type=str, default='1521112368', help='model for test and demo')
    args = parser.parse_args()

    ## get char embeddings
    word2id = read_dictionary(os.path.join('.', args.train_data, 'word2id.pkl'))
    if args.pretrain_embedding == 'random':
        embeddings = random_embedding(word2id, args.embedding_dim)
    else:
        embedding_path = 'pretrain_embedding.npy'
        embeddings = np.array(np.load(embedding_path), dtype='float32')

    ## paths setting
    paths = {}
    timestamp = str(int(time.time())) if args.mode == 'train' else args.demo_model
    output_path = os.path.join('.', args.train_data + "_save", timestamp)
    if not os.path.exists(output_path):
        os.makedirs(output_path)
    summary_path = os.path.join(output_path, "summaries")
    paths['summary_path'] = summary_path
    if not os.path.exists(summary_path):
        os.makedirs(summary_path)
    model_path = os.path.join(output_path, "checkpoints/")
    if not os.path.exists(model_path):
        os.makedirs(model_path)
    ckpt_prefix = os.path.join(model_path, "model")
    paths['model_path'] = ckpt_prefix
    result_path = os.path.join(output_path, "results")
    paths['result_path'] = result_path
    if not os.path.exists(result_path):
        os.makedirs(result_path)
    log_path = os.path.join(result_path, "log.txt")
    paths['log_path'] = log_path
    get_logger(log_path).info(str(args))

    ckpt_file = tf.train.latest_checkpoint(model_path)
    print(ckpt_file)
    paths['model_path'] = ckpt_file  # overwrite the save prefix with the latest checkpoint for restoring
    model = BiLSTM_CRF(args, embeddings, tag2label, word2id, paths, config=config)
    model.build_graph()
    saver = tf.train.Saver()
    return model, saver, ckpt_file
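Several of these argparse blocks rely on a str2bool converter that is never shown. A common definition (an assumption here, though this idiom is near-universal in forks of this codebase):

import argparse

def str2bool(v):
    # argparse treats bool('False') as True, so parse the string explicitly
    if v.lower() in ('yes', 'true', 't', 'y', '1'):
        return True
    if v.lower() in ('no', 'false', 'f', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError('Boolean value expected.')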
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'  # default: 0
config = tf.ConfigProto()

## hyperparameters
embedding_dim = 128
# labels: N (none), 解剖部位 (anatomical site), 手术 (surgery), 药物 (drug),
# 独立症状 (independent symptom), 症状描述 (symptom description)
tag2label = {"N": 0, "解剖部位": 1, "手术": 2, "药物": 3, "独立症状": 4, "症状描述": 5}

## get char embeddings
word2id = read_dictionary('./vocab.pkl')
embeddings = random_embedding(word2id, embedding_dim)
train_data = read_corpus('./c.txt')

## training model
# BiLSTM_CRF(embeddings, tag2label, vocab, batch_size, epoch, hidden_dim, CRF, update_embedding, shuffle)
if __name__ == '__main__':
    model = BiLSTM_CRF(embeddings, tag2label, word2id, 4, 80, 128, False, True, True)
    model.build_graph()
    test_report = open('test_report.txt', 'w', encoding='utf-8')
    print("train data: {}".format(len(train_data)))
    model.test(test_report)
    # model.train(train=train_data)
    # use test_data as the dev_data to see overfitting phenomena
def ner(sent):
    config = tf.ConfigProto()
    parser = argparse.ArgumentParser(description='BiLSTM-CRF for Chinese NER task')
    parser.add_argument('--train_data', type=str, default='data_path', help='train data source')
    parser.add_argument('--test_data', type=str, default='data_path', help='test data source')
    parser.add_argument('--batch_size', type=int, default=64, help='#sample of each minibatch')
    parser.add_argument('--epoch', type=int, default=10, help='#epoch of training')
    parser.add_argument('--hidden_dim', type=int, default=300, help='#dim of hidden state')
    parser.add_argument('--optimizer', type=str, default='Adam', help='Adam/Adadelta/Adagrad/RMSProp/Momentum/SGD')
    parser.add_argument('--CRF', type=str2bool, default=True, help='use CRF at the top layer. if False, use Softmax')
    parser.add_argument('--lr', type=float, default=0.001, help='learning rate')
    parser.add_argument('--clip', type=float, default=5.0, help='gradient clipping')
    parser.add_argument('--dropout', type=float, default=0.5, help='dropout keep_prob')
    parser.add_argument('--update_embedding', type=str2bool, default=True, help='update embedding during training')
    parser.add_argument('--pretrain_embedding', type=str, default='random', help='use pretrained char embedding or init it randomly')
    parser.add_argument('--embedding_dim', type=int, default=300, help='random init char embedding_dim')
    parser.add_argument('--shuffle', type=str2bool, default=False, help='shuffle training data before each epoch')
    parser.add_argument('--mode', type=str, default='demo', help='train/test/demo')
    parser.add_argument('--demo_model', type=str, default='1563773712', help='model for test and demo')
    args = parser.parse_args()

    ## get char embeddings
    word2id = read_dictionary(os.path.join('.', args.train_data, 'word2id.pkl'))
    if args.pretrain_embedding == 'random':
        embeddings = random_embedding(word2id, args.embedding_dim)
    else:
        embedding_path = 'pretrain_embedding.npy'
        embeddings = np.array(np.load(embedding_path), dtype='float32')

    ## paths setting
    paths = {}
    paths['summary_path'] = './'
    model_path = r'C:\Users\Houking\Desktop\web_api\ner\checkpoint'
    paths['model_path'] = os.path.join(model_path, "model")
    paths['result_path'] = './'
    paths['log_path'] = './'
    ckpt_file = tf.train.latest_checkpoint(model_path)
    print(ckpt_file)
    paths['model_path'] = ckpt_file

    model = BiLSTM_CRF(args, embeddings, tag2label, word2id, paths, config=config)
    model.build_graph()
    saver = tf.train.Saver()
    with tf.Session(config=config) as sess:
        saver.restore(sess, ckpt_file)
        while True:
            print('Please input your sentence:')
            demo_sent = input()
            if demo_sent == '' or demo_sent.isspace():
                print('See you next time!')
                break
            # use the sentence just read, not the stale `sent` parameter
            sent = list(demo_sent.strip())
            data = [(sent, ['O'] * len(sent))]
            tag = model.demo_one(sess, data)
            PER, SEX, TIT, REA = get_entity(tag, sent)
            print('PER: {}\nSEX: {}\nTIT: {}\nREA: {}'.format(PER, SEX, TIT, REA))
                    default='train', help='train/test/demo')
parser.add_argument('--demo_model', type=str, default='1547445161', help='model for test and demo')
args = parser.parse_args()
# 1551864803 is the model for the new data, 2019-03-06, downloadfile3-4
# 1552104107 is the result of training on train_data, 2019-03-09
# 1552660437 was trained and tested on train_merge and test_merge

## get char embeddings
word2id = read_dictionary(os.path.join('.', args.train_data, 'word2id.pkl'))
if args.pretrain_embedding == 'random':
    embeddings = random_embedding(word2id, args.embedding_dim)  # (3905, 300)
else:
    embedding_path = 'pretrain_embedding.npy'
    embeddings = np.array(np.load(embedding_path), dtype='float32')

## read corpus and get training data
if args.mode != 'train':
    # train_path = os.path.join('.', args.train_data, 'train_data')
    # test_path = os.path.join('.', args.test_data, 'test_data')
    train_path = os.path.join('.', args.train_data, 'processed_downloadfile3')
    test_path = os.path.join('.', args.test_data, 'processed_downloadfile4')
    train_data = read_corpus(train_path)  # list of (sentence, label) pairs
    test_data = read_corpus(test_path)
    test_size = len(test_data)  # number of sentences in the test set

## paths setting
parser.add_argument('--update_embedding', type=str2bool, default=True, help='update embedding during training')
parser.add_argument('--pretrain_embedding', type=str, default='random', help='use pretrained char embedding or init it randomly')
parser.add_argument('--embedding_dim', type=int, default=300, help='random init char embedding_dim')
parser.add_argument('--shuffle', type=str2bool, default=True, help='shuffle training data before each epoch')
parser.add_argument('--mode', type=str, default='train', help='train/test/demo/all/all_2')
parser.add_argument('--demo_model', type=str, default='1521112368', help='model for test and demo')
args = parser.parse_args()

## get char embeddings
if not os.path.exists(os.path.join('.', args.train_data, 'word2id.pkl')):
    vocab_build(os.path.join('.', args.train_data, 'word2id.pkl'),
                os.path.join('.', args.train_data, 'train_data'), 5)
word2id = read_dictionary(os.path.join('.', args.train_data, 'word2id.pkl'))
if args.pretrain_embedding == 'random':
    embeddings = random_embedding(word2id, args.embedding_dim,
                                  os.path.join('.', args.train_data, 'all_test'))
else:
    embedding_path = 'pretrain_embedding.npy'
    embeddings = np.array(np.load(embedding_path), dtype='float32')

## read corpus and get training data
if args.mode != 'demo':
    train_path = os.path.join('.', args.train_data, 'train_data')
    test_path = os.path.join('.', args.test_data, 'test_data')
    train_data = read_corpus(train_path)
    test_data = read_corpus(test_path)
    test_size = len(test_data)

## paths setting
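vocab_build(vocab_path, corpus_path, min_count) is also undefined in these excerpts; by its call signature it builds and pickles the word-to-id map, dropping words rarer than min_count. A hedged sketch under those assumptions (the <PAD>/<UNK> entries and the reuse of read_corpus are guesses):

import pickle

def vocab_build(vocab_path, corpus_path, min_count):
    # count word frequencies over the corpus, then assign ids to frequent words
    data = read_corpus(corpus_path)
    counts = {}
    for sent, _ in data:
        for word in sent:
            counts[word] = counts.get(word, 0) + 1
    word2id = {'<PAD>': 0}  # id 0 reserved for padding (an assumption)
    for word, c in counts.items():
        if c >= min_count:
            word2id[word] = len(word2id)
    word2id['<UNK>'] = len(word2id)  # unknown-word bucket (an assumption)
    with open(vocab_path, 'wb') as fw:
        pickle.dump(word2id, fw)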
"worker_hosts", "172.16.23.5:2226,172.16.23.5:2227,172.16.23.5:2228,172.16.23.5:2229", "Comma-separated list of hostname:port pairs") # flags.DEFINE_string("worker_hosts", # "172.16.23.5:2223,172.16.23.5:2224,172.16.23.5:2225,172.16.23.5:2226," # "172.16.23.11:2223,172.16.23.11:2224,172.16.23.11:2225,172.16.23.11:2226", # "Comma-separated list of hostname:port pairs") flags.DEFINE_string("job_name", None, "job name: worker or ps") FLAGS = flags.FLAGS # get word embeddings word2id = read_dictionary(os.path.join('./', FLAGS.word2id, 'word2id.pkl')) if FLAGS.pretrain_embedding == 'random': embeddings = random_embedding(word2id, FLAGS.embedding_dim) else: embedding_path = 'pretrain_embedding.npy' embeddings = np.array(np.load(embedding_path), dtype='float32') # read corpus and get training data if FLAGS.mode != 'demo': train_path = os.path.join('.', FLAGS.train_data_path, 'train_data') train_data_len = get_train_data_len(train_path) # test_path = os.path.join('.', FLAGS.test_data_path, 'test_data') # train_data = read_corpus(train_path) # test_data = read_corpus(test_path) # path setting paths = {} paths['train_data_source'] = './train_data/train_data'
    'batch_size': 128,
    'epoch': 20,
    'hidden_dim': 300,
    'optimizer': 'Adam',
    'CRF': True,
    'lr': 0.001,
    'clip': 5.0,
    'dropout': 0.8,
    'update_embedding': True,
    'shuffle': True
}

## get char embeddings
#word2id = read_dictionary(os.path.join(os.environ['DMPPATH'], 'gz_case_address/data_path/word2id.pkl'))
word2id = read_dictionary("./gz_case_address/data_path/word2id.pkl")
embeddings = random_embedding(word2id, 300)

## paths setting
#output_path = os.path.join(os.environ['DMPPATH'], 'dmp/gongan/gz_case_address/mode_save')
output_path = os.path.join("./gz_case_address/mode_save")
# output_path = ('./mode_save')
if not os.path.exists(output_path):
    os.makedirs(output_path)
summary_path = os.path.join(output_path, "summaries")
if not os.path.exists(summary_path):
    os.makedirs(summary_path)
model_path = os.path.join(output_path, "checkpoints")
if not os.path.exists(model_path):
    os.makedirs(model_path)
ckpt_prefix = os.path.join(model_path, "model")
                    default='train', help='train/test/demo')
parser.add_argument('--demo_model', type=str, default='1536659706', help='model for test and demo')
args = parser.parse_args()  # in PyCharm, set these via Run - Edit Configurations - Script Parameters

## get char embeddings
word2id = read_dictionary(r"D:\data\100dim\word2id_100.pkl")  # pkl generated by data.py: the word-to-vector correspondence
if args.pretrain_embedding == 'random':
    embeddings = random_embedding(word2id, args.embedding_dim)  # random len(word2id) x args.embedding_dim array
else:
    embedding_path = 'D:\\data\\100dim\\np_100.npy'  # the array in binary format on disk
    embeddings = np.array(np.load(embedding_path), dtype='float32')  # load the word vectors into memory

## read corpus and get training data
if args.mode != 'demo':
    train_path = os.path.join('.', args.train_data, 'train1.txt')  # build the training-set file path
    test_path = os.path.join('.', args.test_data, 'test1.txt')  # build the test-set file path
    train_data = read_corpus(train_path)  # read the training set; a user-defined function returning a list
    test_data = read_corpus(test_path)
    test_size = len(test_data)

## paths setting
paths = {}
                    default=2, help='#num of lstm layers')
args = parser.parse_args()

## Session configuration
os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu_id
# os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'  # default: 0
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
# config.gpu_options.per_process_gpu_memory_fraction = 0.2  # need ~700MB GPU memory

## get char embeddings
# word2id = read_dictionary(os.path.join('.', args.train_data, 'word2id.pkl'))
if args.pretrain_embedding == 'random':
    embeddings = random_embedding(args.vocab_size, args.embedding_dim)
else:
    embedding_path = './data/word2vec.npy'
    print("loading pretrain vector...")
    embeddings = np.array(np.load(embedding_path), dtype='float32')

## training model
if args.mode == 'train':
    print("loading training data...")
    train_path = os.path.join('./data', args.train_data)
    train_data = read_train_corpus(file_path=train_path, maxlen=args.max_len)
    print(args.data_augment)
    if args.data_augment:
        train_data = data_augmentation(train_data, maxlen=args.max_len)
    print("loading valid data...")
    valid_path = os.path.join('./data', args.valid_data)
parser.add_argument('--lr', type=float, default=0.001, help='learning rate')
parser.add_argument('--clip', type=float, default=5.0, help='gradient clipping')
parser.add_argument('--dropout', type=float, default=0.5, help='dropout keep_prob')
parser.add_argument('--update_embedding', type=str2bool, default=True, help='update embedding during training')
parser.add_argument('--pretrain_embedding', type=str, default='random', help='use pretrained char embedding or init it randomly')
parser.add_argument('--embedding_dim', type=int, default=300, help='random init char embedding_dim')
parser.add_argument('--shuffle', type=str2bool, default=True, help='shuffle training data before each epoch')
parser.add_argument('--mode', type=str, default='demo', help='train/test/demo')
parser.add_argument('--demo_model', type=str, default='1521112368', help='model for test and demo')
args = parser.parse_args()

## get char embeddings
word2id = read_dictionary(os.path.join('.', args.train_data, 'word2id.pkl'))
if args.pretrain_embedding == 'random':
    embeddings = random_embedding(word2id, args.embedding_dim)
else:
    embedding_path = 'pretrain_embedding.npy'
    embeddings = np.array(np.load(embedding_path), dtype='float32')

## read corpus and get training data
if args.mode != 'demo':
    train_path = os.path.join('.', args.train_data, 'train_data')
    test_path = os.path.join('.', args.test_data, 'test_data')
    train_data = read_corpus(train_path)
    test_data = read_corpus(test_path)
    test_size = len(test_data)

## paths setting
paths = {}
                    help='shuffle training data before each epoch')
parser.add_argument('--mode', type=str, default='demo', help='train/test/demo')
parser.add_argument('--demo_model', type=str, default='1521112368', help='model for test and demo')
parser.add_argument('--seq_length', type=int, default=20, help='Pretrain language model seq length')
args = parser.parse_args()

## get char embeddings
word2id = read_dictionary(os.path.join('.', args.train_data, 'word2id.pkl'))
if args.pretrain_embedding == 'random':
    embeddings = random_embedding(word2id, args.embedding_dim)
else:
    embedding_path = 'pretrain_embedding.npy'
    embeddings = np.array(np.load(embedding_path), dtype='float32')

## read corpus and get training data
if args.mode != 'demo':
    train_path = os.path.join('.', args.train_data, 'train_data')
    pre_train_path = os.path.join('.', args.train_data, 'resume_data')
    test_path = os.path.join('.', args.test_data, 'test_data')
    train_data = read_corpus(train_path)
    pre_train_data = read_pre_train_data(pre_train_path, args.seq_length)
    test_data = read_corpus(test_path)
    test_size = len(test_data)
def run(sentences):
    # session configuration
    os.environ['CUDA_VISIBLE_DEVICES'] = '0'  # use GPU 0
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'  # log level
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.per_process_gpu_memory_fraction = 0.2  # need ~700MB GPU memory

    # hyperparameters: build a parser, declare the expected arguments,
    # and let it process the command line at run time
    parser = argparse.ArgumentParser(description='BiLSTM-CRF for Chinese NER task')
    parser.add_argument('--train_data', type=str, default='data_path', help='train data source')
    parser.add_argument('--test_data', type=str, default='data_path', help='test data source')
    # batch: with SGD-style training each step draws batch_size samples from the training set;
    # one iteration trains on batch_size samples once (one forward pass + one backward pass)
    parser.add_argument('--batch_size', type=int, default=64, help='#sample of each minibatch')
    # epoch: one epoch is one forward and one backward pass over every training sample;
    # e.g. with 1000 training samples and batch_size=10, one full pass = 100 iterations = 1 epoch
    parser.add_argument('--epoch', type=int, default=40, help='#epoch of training')
    parser.add_argument('--hidden_dim', type=int, default=300, help='#dim of hidden state')  # hidden state: 300-dim
    parser.add_argument('--optimizer', type=str, default='Adam', help='Adam/Adadelta/Adagrad/RMSProp/Momentum/SGD')  # Adam optimizer
    parser.add_argument('--CRF', type=str2bool, default=True, help='use CRF at the top layer. if False, use Softmax')
    parser.add_argument('--lr', type=float, default=0.001, help='learning rate')
    parser.add_argument('--clip', type=float, default=5.0, help='gradient clipping')
    # dropout temporarily removes units from the network with a given probability during training
    parser.add_argument('--dropout', type=float, default=0.5, help='dropout keep_prob')
    parser.add_argument('--update_embedding', type=str2bool, default=True, help='update embedding during training')
    parser.add_argument('--pretrain_embedding', type=str, default='random', help='use pretrained char embedding or init it randomly')
    parser.add_argument('--embedding_dim', type=int, default=300, help='random init char embedding_dim')
    parser.add_argument('--shuffle', type=str2bool, default=True, help='shuffle training data before each epoch')
    parser.add_argument('--mode', type=str, default='demo', help='train/test/demo')
    parser.add_argument('--demo_model', type=str, default='1559398699', help='model for test and demo')
    args = parser.parse_args()

    # read the dictionary and initialize the embedding matrix;
    # random_embedding returns a len(vocab) x embedding_dim = 3905 x 300 matrix
    # with values between -0.25 and 0.25 as the initial embeddings
    word2id = read_dictionary(os.path.join('.', args.train_data, 'word2id.pkl'))
    if args.pretrain_embedding == 'random':
        embeddings = random_embedding(word2id, args.embedding_dim)
    else:
        embedding_path = 'pretrain_embedding.npy'
        embeddings = np.array(np.load(embedding_path), dtype='float32')

    # read the training and test sets
    if args.mode != 'demo':
        train_path = os.path.join('.', args.train_data, 'train_data')
        test_path = os.path.join('.', args.test_data, 'test_data')
        train_data = read_corpus(train_path)
        test_data = read_corpus(test_path)
        test_size = len(test_data)

    # paths setting
    paths = {}
    timestamp = str(int(time.time())) if args.mode == 'train' else args.demo_model
    output_path = os.path.join('.', args.train_data + "_save", timestamp)
    if not os.path.exists(output_path):
        os.makedirs(output_path)
    summary_path = os.path.join(output_path, "summaries")
    paths['summary_path'] = summary_path
    if not os.path.exists(summary_path):
        os.makedirs(summary_path)
    model_path = os.path.join(output_path, "checkpoints/")
    if not os.path.exists(model_path):
        os.makedirs(model_path)
    ckpt_prefix = os.path.join(model_path, "model")
    paths['model_path'] = ckpt_prefix
    result_path = os.path.join(output_path, "results")
    paths['result_path'] = result_path
    if not os.path.exists(result_path):
        os.makedirs(result_path)
    log_path = os.path.join(result_path, "log.txt")
    paths['log_path'] = log_path
    get_logger(log_path).info(str(args))  # write the arguments to the log file

    if args.mode == 'train':
        # train the model
        model = BiLSTM_CRF(args, embeddings, tag2label, word2id, paths, config=config)
        model.build_graph()
        model.train(train=train_data, dev=test_data)
    elif args.mode == 'test':
        # test the model
        ckpt_file = tf.train.latest_checkpoint(model_path)
        print(ckpt_file)
        paths['model_path'] = ckpt_file
        model = BiLSTM_CRF(args, embeddings, tag2label, word2id, paths, config=config)
        model.build_graph()
        print("test data: {}".format(test_size))
        model.test(test_data)
    elif args.mode == 'demo':
        location = []
        ckpt_file = tf.train.latest_checkpoint(model_path)
        print("model path: ", ckpt_file)
        paths['model_path'] = ckpt_file  # set the model path
        model = BiLSTM_CRF(args, embeddings, tag2label, word2id, paths, config=config)
        model.build_graph()
        saver = tf.train.Saver()
        with tf.Session(config=config) as sess:
            saver.restore(sess, ckpt_file)
            for sentence in sentences:
                demo_sent = list(sentence.strip())  # strip whitespace
                demo_data = [(demo_sent, ['O'] * len(demo_sent))]
                tag = model.demo_one(sess, demo_data)
                PER, LOC, ORG = get_entity(tag, demo_sent)  # map the tag sequence back to characters
                new_LOC = list(set(LOC))  # deduplicate
                loc = ' '.join(new_LOC)
                location.append(loc)
    return location
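get_entity is used above but not defined in these excerpts. A minimal sketch under the assumption that the model emits BIO tags (B-PER/I-PER, B-LOC/I-LOC, B-ORG/I-ORG) and that the helper collects the surface strings per entity type:

def get_entity(tag_seq, char_seq):
    # collect entity strings from parallel (tag, char) sequences
    def collect(prefix):
        entities, cur = [], ''
        for tag, ch in zip(tag_seq, char_seq):
            if tag == 'B-' + prefix:
                if cur:
                    entities.append(cur)
                cur = ch
            elif tag == 'I-' + prefix and cur:
                cur += ch
            else:
                if cur:
                    entities.append(cur)
                cur = ''
        if cur:
            entities.append(cur)
        return entities

    return collect('PER'), collect('LOC'), collect('ORG')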