Example #1
import os
import time

import tensorflow as tf
import helper                      # project-local helpers: loadMap, getEmbedding, ...
from BILSTM_CRF import BILSTM_CRF  # project model class; module name is assumed

emb_path = os.path.join(dir, 'embedding/embedding.npy')  # args.char_emb
gpu_config = "/gpu:0"  # +str(args.gpu)
model_path = os.path.join(dir, 'model', 'NER')  # args.model_path
predict_path = os.path.join(dir, 'data', 'candidate_predict')
predict_output_path = os.path.join(dir, 'data', 'rawdata',
                                   'test.txt')  # args.output_path
num_steps = 200  # must match the value used at training time

start_time = time.time()

char2id, id2char = helper.loadMap("char2id")
label2id, id2label = helper.loadMap("label2id")
num_chars = len(id2char.keys())
num_classes = len(id2label.keys())
if emb_path is not None:
    embedding_matrix = helper.getEmbedding(emb_path)
else:
    embedding_matrix = None

print("building model")
config = tf.ConfigProto(allow_soft_placement=True)
with tf.Session(config=config) as sess:
    with tf.device(gpu_config):
        initializer = tf.random_uniform_initializer(-0.1, 0.1)
        with tf.variable_scope("model", reuse=None, initializer=initializer):
            model = BILSTM_CRF(num_chars=num_chars,
                               num_classes=num_classes,
                               num_steps=num_steps,
                               embedding_matrix=embedding_matrix,
                               is_training=False)
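
The example stops right after the inference graph is built; a minimal sketch of the usual next step under TF 1.x, assuming model_path points at a checkpoint prefix written during training (the actual prediction call is project-specific and not shown here):

        # restore the trained weights into the freshly built inference graph
        saver = tf.train.Saver()
        saver.restore(sess, model_path)  # assumes model_path is the checkpoint prefix used by the trainer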
Example #2
val_path = args.val_path
num_epochs = args.epoch
emb_path = args.char_emb
# gpu_config = "/gpu:"+str(args.gpu)
gpu_config = "/cpu:0"
num_steps = 200  # must match the value used by the test/prediction script

start_time = time.time()
print "preparing train and validation data"
X_train, y_train, X_val, y_val = helper.getTrain(train_path=train_path,
                                                 val_path=val_path,
                                                 seq_max_len=num_steps)
char2id, id2char = helper.loadMap("char2id")
label2id, id2label = helper.loadMap("label2id")
num_chars = len(id2char.keys())
num_classes = len(id2label.keys())
if emb_path is not None:
    embedding_matrix = helper.getEmbedding(emb_path)
else:
    embedding_matrix = None

print "building model"
config = tf.ConfigProto(allow_soft_placement=True)
with tf.Session(config=config) as sess:
    with tf.device(gpu_config):
        initializer = tf.random_uniform_initializer(-0.1, 0.1)
        with tf.variable_scope("model", reuse=None, initializer=initializer):
            model = BILSTM_CRF(num_chars=num_chars,
                               num_classes=num_classes,
                               num_steps=num_steps,
                               num_epochs=num_epochs,
                               embedding_matrix=embedding_matrix,
                               is_training=True)

        print("training model")
        tf.global_variables_initializer().run()
        model.train(sess, save_path, X_train, y_train, X_val, y_val)
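
These snippets read their settings from an args object; a minimal argparse sketch matching the attributes accessed above and in Example #1 (val_path, epoch, char_emb, gpu, model_path, output_path) — the defaults and help strings are assumptions, not taken from the original:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--train_path", help="training data file")  # assumed counterpart of train_path
parser.add_argument("--val_path", help="validation data file")
parser.add_argument("--epoch", type=int, default=100, help="number of training epochs (default is assumed)")
parser.add_argument("--char_emb", default=None, help="path to a pre-trained char embedding .npy file")
parser.add_argument("--gpu", type=int, default=0, help="GPU index used to build '/gpu:N'")
parser.add_argument("--model_path", help="where the trained model is saved / loaded from")
parser.add_argument("--output_path", help="file that receives the prediction output")
args = parser.parse_args()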
Example #3
embedding_path = os.path.join(dir, 'embedding.npy')
save_path = os.path.join(dir, 'model')
start_time = time.time()

word2id, id2word = helper.loadMap("word2id.txt")
label2id, id2label = helper.loadMap("label2id.txt")
entitylabel2id, id2entitylabel = helper.loadMap("entitylabel2id.txt")
num_words = len(id2word.keys())
num_classes = len(id2label.keys())
emb_dim = 128
batch_size = 128
print("preparing train and validation data")
label, entity1label, entity2label, entity1, entity2, distance = helper.getTrainData(
    train_path=train_path)
if embedding_path is not None:
    embedding_matrix = helper.getEmbedding(embedding_path, emb_dim=emb_dim)
else:
    embedding_matrix = None
features = np.transpose(np.array([entity1label, entity2label, distance]))
input_entity1_emb = np.zeros((len(entity1), emb_dim))
input_entity2_emb = np.zeros((len(entity2), emb_dim))
for i in range(len(entity1)):
    try:
        input_entity1_emb[i] = embedding_matrix[entity1[i]]
        input_entity2_emb[i] = embedding_matrix[entity2[i]]
    except Exception:
        # leave the row as zeros if an entity id has no row in the embedding matrix
        continue
inputs = np.concatenate([input_entity1_emb, input_entity2_emb, features], 1)
X = inputs
y = np.array(label)
X_train, X_test, y_train, y_test = model_selection.train_test_split(
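
The split call above is cut off in the source; with scikit-learn's model_selection.train_test_split the completed call would typically look like the following (the test fraction and random seed are assumed values, not taken from the original):

# hypothetical completion of the truncated call above
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.2, random_state=42)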
Example #4
                                                 val_path=None,
                                                 seq_max_len=n_steps,
                                                 char2id_file=char2id_file,
                                                 label2id_file=label2id_file)
sh_index = np.arange(len(X_train))
np.random.shuffle(sh_index)
X_train = X_train[sh_index]
y_train = y_train[sh_index]

char2id, id2char = helper.loadMap(char2id_file)
label2id, id2label = helper.loadMap(label2id_file)
num_chars = len(id2char.keys())  # vocabulary size
num_classes = len(id2label.keys())  # number of label classes
emb_path = None
if emb_path is not None:
    embedding_matrix = helper.getEmbedding(emb_path, char2id_file)
    # print len([_ for _ in np.sum(embedding_matrix,axis=1) if _ != 0])
    np.savetxt(os.path.join(save_path, "embedding_matrix"), embedding_matrix)
    num_chars = embedding_matrix.shape[0]  # vocabulary size
else:
    embedding_matrix = None

# char embedding
if embedding_matrix is not None:
    embedding = tf.Variable(embedding_matrix,
                            trainable=True,
                            name="emb",
                            dtype=tf.float32)
else:
    embedding = tf.get_variable("emb", [num_chars, emb_dim])
inputs_emb = tf.nn.embedding_lookup(embedding, x1)  # gather the embedding row for each char id in x1
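
For reference, tf.nn.embedding_lookup simply gathers rows of the embedding matrix by id, so a [batch_size, num_steps] tensor of char ids comes back as [batch_size, num_steps, emb_dim]; a tiny self-contained TF 1.x check (all names here are illustrative, not from the project):

import numpy as np
import tensorflow as tf

demo_emb = tf.constant(np.arange(12, dtype=np.float32).reshape(4, 3))  # 4 ids, emb_dim = 3
demo_ids = tf.constant([[0, 2], [3, 3]])                                # [batch_size=2, num_steps=2]
demo_lookup = tf.nn.embedding_lookup(demo_emb, demo_ids)                # -> shape [2, 2, 3]
with tf.Session() as demo_sess:
    print(demo_sess.run(demo_lookup).shape)                             # (2, 2, 3)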