# Prediction-time setup: resolve the hard-coded paths, load the character and
# label maps plus the optional embedding matrix, then build the BILSTM_CRF
# graph with is_training=False.
# The trailing "args.*" notes name the command-line options that these
# hard-coded values stand in for.
emb_path = os.path.join(dir, 'embedding/embedding.npy')  # args.char_emb
gpu_config = "/gpu:0"  # "/gpu:" + str(args.gpu)
model_path = os.path.join(dir, 'model', 'NER')  # args.model_path
predict_path = os.path.join(dir, 'data', 'candidate_predict')
predict_output_path = os.path.join(dir, 'data', 'rawdata', 'test.txt')  # args.output_path
num_steps = 200  # must be consistent with the value used for training

start_time = time.time()

# Vocabulary and tag inventories; their sizes dimension the model.
char2id, id2char = helper.loadMap("char2id")
label2id, id2label = helper.loadMap("label2id")
num_chars = len(id2char.keys())
num_classes = len(id2label.keys())

# emb_path is produced by os.path.join above, so it is never None and the
# else branch is currently not taken; it is kept for an optional emb_path.
if emb_path is not None:
    embedding_matrix = helper.getEmbedding(emb_path)
else:
    embedding_matrix = None

print("building model")
# allow_soft_placement lets TensorFlow fall back to another device when an op
# cannot be placed on the requested one.
config = tf.ConfigProto(allow_soft_placement=True)
with tf.Session(config=config) as sess:
    with tf.device(gpu_config):
        initializer = tf.random_uniform_initializer(-0.1, 0.1)
        with tf.variable_scope("model", reuse=None, initializer=initializer):
            model = BILSTM_CRF(
                num_chars=num_chars,
                num_classes=num_classes,
                num_steps=num_steps,
                embedding_matrix=embedding_matrix,
                is_training=False,
            )
# Training setup and run: read the CLI options, load the train/validation
# data and lookup maps, build the BILSTM_CRF graph with is_training=True and
# train it. train_path and save_path are defined outside this excerpt.
val_path = args.val_path
num_epochs = args.epoch
emb_path = args.char_emb
# The device is pinned to the CPU; the args.gpu based "/gpu:N" selection is
# disabled.
gpu_config = "/cpu:0"
num_steps = 200  # must be consistent with the value used at test time

start_time = time.time()

# print(...) with a single string argument prints the same text under the
# Python 2 print statement and the Python 3 print function.
print("preparing train and validation data")
X_train, y_train, X_val, y_val = helper.getTrain(
    train_path=train_path,
    val_path=val_path,
    seq_max_len=num_steps,
)

# Vocabulary and tag inventories; their sizes dimension the model.
char2id, id2char = helper.loadMap("char2id")
label2id, id2label = helper.loadMap("label2id")
num_chars = len(id2char.keys())
num_classes = len(id2label.keys())

# The pre-trained embedding is optional: without args.char_emb the model
# receives embedding_matrix=None.
if emb_path is not None:
    embedding_matrix = helper.getEmbedding(emb_path)
else:
    embedding_matrix = None

print("building model")
# allow_soft_placement lets TensorFlow fall back to another device when an op
# cannot be placed on the requested one.
config = tf.ConfigProto(allow_soft_placement=True)
with tf.Session(config=config) as sess:
    with tf.device(gpu_config):
        initializer = tf.random_uniform_initializer(-0.1, 0.1)
        with tf.variable_scope("model", reuse=None, initializer=initializer):
            model = BILSTM_CRF(
                num_chars=num_chars,
                num_classes=num_classes,
                num_steps=num_steps,
                num_epochs=num_epochs,
                embedding_matrix=embedding_matrix,
                is_training=True,
            )

        print("training model")
        # .run() relies on the default session installed by the enclosing
        # `with tf.Session(...)`.
        # NOTE(review): tf.initialize_all_variables() is deprecated in later
        # TF 1.x releases in favour of tf.global_variables_initializer();
        # kept unchanged for the TF version this script targets - confirm
        # before switching.
        tf.initialize_all_variables().run()
        model.train(sess, save_path, X_train, y_train, X_val, y_val)
embedding_path = os.path.join(dir, 'embedding.npy') save_path = os.path.join(dir, 'model') start_time = time.time() word2id, id2word = helper.loadMap("word2id.txt") label2id, id2label = helper.loadMap("label2id.txt") entitylabel2id, id2entitylabel = helper.loadMap("entitylabel2id.txt") num_words = len(id2word.keys()) num_classes = len(id2label.keys()) emb_dim = 128 batch_size = 128 print("preparing train and validation data") label, entity1label, entity2label, entity1, entity2, distance = helper.getTrainData( train_path=train_path) if embedding_path != None: embedding_matrix = helper.getEmbedding(embedding_path, emb_dim=emb_dim) else: embedding_matrix = None features = np.transpose(np.array([entity1label, entity2label, distance])) input_entity1_emb = np.zeros((len(entity1), emb_dim)) input_entity2_emb = np.zeros((len(entity2), emb_dim)) for i in range(len(entity1)): try: input_entity1_emb[i] = embedding_matrix[entity1[i]] input_entity2_emb[i] = embedding_matrix[entity2[i]] except Exception: continue inputs = np.concatenate([input_entity1_emb, input_entity2_emb, features], 1) X = inputs y = np.array(label) X_train, X_test, y_train, y_test = model_selection.train_test_split(
val_path=None, seq_max_len=n_steps, char2id_file=char2id_file, label2id_file=label2id_file) sh_index = np.arange(len(X_train)) np.random.shuffle(sh_index) X_train = X_train[sh_index] y_train = y_train[sh_index] char2id, id2char = helper.loadMap(char2id_file) label2id, id2label = helper.loadMap(label2id_file) num_chars = len(id2char.keys()) # vocabulary大小 num_classes = len(id2label.keys()) # 标注类别数 emb_path = None if emb_path != None: embedding_matrix = helper.getEmbedding(emb_path, char2id_file) # print len([_ for _ in np.sum(embedding_matrix,axis=1) if _ != 0]) np.savetxt(os.path.join(save_path, "embedding_matrix"), embedding_matrix) num_chars = embedding_matrix.shape[0] # vocabulary大小 else: embedding_matrix = None # char embedding if embedding_matrix is not None: embedding = tf.Variable(embedding_matrix, trainable=True, name="emb", dtype=tf.float32) else: embedding = tf.get_variable("emb", [num_chars, emb_dim]) inputs_emb = tf.nn.embedding_lookup(embedding, x1) # ??