# Variant 1: the embedding table is split across FLAGS.num_shards variables
# pinned to the CPU. FLAGS, IdCNN and BiLSTM are defined elsewhere in the
# repository.
import numpy as np
import tensorflow as tf


class Model:
    def __init__(self, embeddingSize, distinctTagNum, c2vPath, numHidden):
        self.embeddingSize = embeddingSize
        self.distinctTagNum = distinctTagNum
        self.numHidden = numHidden
        num_shards = FLAGS.num_shards
        self.c2v = self.load_w2v(num_shards, c2vPath, FLAGS.embedding_size)
        self.words = []
        with tf.device("/cpu:0"):
            for i in range(0, num_shards):
                # Each shard is created with a random initializer and
                # trainable=False; the pretrained vectors in self.c2v are
                # presumably assigned into these variables elsewhere, since
                # they are not used as the initializer here.
                words_i = tf.get_variable(
                    name="words-%02d" % i,
                    initializer=tf.random_uniform(
                        self.c2v[i].shape, minval=-0.1, maxval=0.1),
                    trainable=False)
                self.words.append(words_i)
        layers = [  # iterated dilated CNN block parameters
            {'dilation': 1},
            {'dilation': 1},
            {'dilation': 2},
        ]
        if FLAGS.use_idcnn:
            self.model = IdCNN(layers, 3, FLAGS.num_hidden,
                               FLAGS.embedding_size, FLAGS.max_sentence_len,
                               FLAGS.num_tags)
        else:
            self.model = BiLSTM(FLAGS.num_hidden, FLAGS.max_sentence_len,
                                FLAGS.num_tags)
        self.trains_params = None
        self.inp = tf.placeholder(tf.int32,
                                  shape=[None, FLAGS.max_sentence_len],
                                  name="input_placeholder")

    def length(self, data):
        # Non-zero ids are real tokens; their count is the sequence length.
        used = tf.sign(tf.abs(data))
        length = tf.reduce_sum(used, axis=1)
        length = tf.cast(length, tf.int32)
        return length

    def inference(self, X, reuse=None, trainMode=True):
        word_vectors = tf.nn.embedding_lookup(self.words, X,
                                              partition_strategy="div")
        length = self.length(X)
        reuse = False if trainMode else True
        if FLAGS.use_idcnn:
            word_vectors = tf.expand_dims(word_vectors, 1)
            unary_scores = self.model.inference(word_vectors, reuse=reuse)
        else:
            unary_scores = self.model.inference(word_vectors, length,
                                                reuse=reuse)
        return unary_scores, length

    def loss(self, X, Y):
        P, sequence_length = self.inference(X)
        log_likelihood, self.transition_params = tf.contrib.crf.crf_log_likelihood(
            P, Y, sequence_length)
        loss = tf.reduce_mean(-log_likelihood)
        return loss

    def load_w2v(self, num_shards, path, expectDim):
        with open(path, "r") as fp:
            print("load data from:", path)
            line = next(fp).strip()
            ss = line.split(" ")
            total = int(ss[0])   # number of vectors in the file
            dim = int(ss[1])     # dimension of each vector
            assert dim == expectDim, "dim: %d, expectDim: %d" % (dim, expectDim)
            ws = []
            mv = np.zeros(dim, dtype=np.float64)
            second = -1
            for t, line in enumerate(fp):  # TODO: should read exactly `total` lines
                ss = line.strip().split(" ")
                assert len(ss) == (dim + 1)
                if ss[0] in ('<UNK>', '<unk>'):
                    second = t  # row index of the <UNK> vector in ws
                vals = [float(x) for x in ss[1:]]  # ss[0] is the word itself
                mv += vals  # accumulate per-dimension sums for the mean vector
                ws.append(vals)
                if len(ws) % 50000 == 0:
                    print("word2vec data loading:", len(ws))
            mv /= total  # per-dimension mean over all vectors
            # Pad with the mean vector (an extra <UNK>-like slot) so the
            # table splits evenly across the shards.
            while len(ws) % num_shards != 0:
                ws.append(mv)
            assert second != -1, "no <UNK> token found in: %s" % path
            if second != 1:
                ws[1], ws[second] = ws[second], ws[1]  # move <UNK> to row 1
            print("loading completed")
            # Split the 2-D table into num_shards equal slices (2-D -> 3-D).
            total = len(ws)
            range_size = total // num_shards
            ws = np.asarray(ws, dtype=np.float32)
            sub_ws = []
            for i in range(num_shards):
                begin_ = i * range_size
                ends_ = min((i + 1) * range_size, total)
                assert ends_ - begin_ == range_size
                sub_ws.append(ws[begin_:ends_])
            return np.array(sub_ws, dtype=np.float32)

    def test_unary_score(self):
        P, sequence_length = self.inference(self.inp,
                                            reuse=True,
                                            trainMode=False)
        return P, sequence_length
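# A minimal usage sketch for the sharded variant above, assuming the FLAGS
# values, IdCNN/BiLSTM and placeholder names from this file. The Adam
# optimizer, learning rate, and the assign-based weight loading are
# illustrative guesses, not the repository's actual training driver.
import tensorflow as tf


def train_step_sketch(model, inp_batch, tag_batch):
    # Targets placeholder; shape matches the input placeholder.
    Y = tf.placeholder(tf.int32, shape=[None, FLAGS.max_sentence_len])
    loss = model.loss(model.inp, Y)
    train_op = tf.train.AdamOptimizer(1e-3).minimize(loss)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        # Copy the pretrained, sharded table into the randomly-initialized
        # shard variables (one assign per shard).
        for var, shard in zip(model.words, model.c2v):
            sess.run(tf.assign(var, shard))
        _, batch_loss = sess.run([train_op, loss],
                                 feed_dict={model.inp: inp_batch,
                                            Y: tag_batch})
        return batch_loss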
# Variant 2: the whole word2vec matrix lives in a single TensorFlow
# variable, so a saved checkpoint carries the embeddings and prediction
# does not need to re-read the w2v file; only word indices are fed in.
class Model:
    def __init__(self, embeddingSize, distinctTagNum, c2vPath, numHidden):
        self.embeddingSize = embeddingSize
        self.distinctTagNum = distinctTagNum
        self.numHidden = numHidden
        self.c2v = self.load_w2v(c2vPath, FLAGS.embedding_size)  # c2vPath: word2vec file
        self.words = tf.Variable(self.c2v, name="words")
        layers = [  # iterated dilated CNN block parameters
            {'dilation': 1},
            {'dilation': 1},
            {'dilation': 2},
        ]
        if FLAGS.use_idcnn:
            self.model = IdCNN(layers, 3, FLAGS.num_hidden,  # 3: filter width
                               FLAGS.embedding_size, FLAGS.max_sentence_len,
                               FLAGS.num_tags)
        else:
            self.model = BiLSTM(FLAGS.num_hidden, FLAGS.max_sentence_len,
                                FLAGS.num_tags)
        self.trains_params = None
        self.inp = tf.placeholder(tf.int32,
                                  shape=[None, FLAGS.max_sentence_len],
                                  name="input_placeholder")

    def length(self, data):
        used = tf.sign(tf.abs(data))
        length = tf.reduce_sum(used, axis=1)
        length = tf.cast(length, tf.int32)
        return length

    def inference(self, X, reuse=None, trainMode=True):
        word_vectors = tf.nn.embedding_lookup(self.words, X)
        length = self.length(X)
        reuse = False if trainMode else True
        if FLAGS.use_idcnn:
            word_vectors = tf.expand_dims(word_vectors, 1)
            unary_scores = self.model.inference(word_vectors, reuse=reuse)
        else:
            unary_scores = self.model.inference(word_vectors, length,
                                                reuse=reuse)
        return unary_scores, length

    def loss(self, X, Y):
        P, sequence_length = self.inference(X)
        self.P = P
        self.realY = Y
        log_likelihood, self.transition_params = tf.contrib.crf.crf_log_likelihood(
            P, Y, sequence_length)
        self.log_likelihood = log_likelihood
        loss = tf.reduce_mean(-log_likelihood)
        return loss

    def load_w2v(self, path, expectDim):
        '''Load a word2vec text file into a float32 array.

        Row 0 is expected to be the all-zero padding vector, the appended
        last row is the per-dimension average of all vectors, and the
        <UNK> vector is swapped into row 1.
        '''
        with open(path, "r") as fp:
            print("load data from:", path)
            line = fp.readline().strip()
            ss = line.split(" ")
            total = int(ss[0])   # number of vectors in the file
            dim = int(ss[1])     # dimension of each vector
            assert dim == expectDim
            ws = []
            mv = [0 for i in range(dim)]  # running per-dimension sums
            second = -1
            for t in range(total):
                line = fp.readline().strip()
                ss = line.split(" ")
                assert len(ss) == (dim + 1)
                if ss[0] == '<UNK>':
                    second = t  # row index of the <UNK> vector in ws
                vals = []
                for i in range(1, dim + 1):  # ss[0] is the word itself
                    fv = float(ss[i])
                    mv[i - 1] += fv
                    vals.append(fv)
                ws.append(vals)
            for i in range(dim):
                mv[i] = mv[i] / total  # per-dimension mean over all vectors
            assert second != -1, "no <UNK> token found in: %s" % path
            ws.append(mv)  # append one more row (the mean vector); maybe useless
            if second != 1:
                ws[1], ws[second] = ws[second], ws[1]  # move <UNK> to row 1
        return np.asarray(ws, dtype=np.float32)

    def test_unary_score(self):
        '''Unary scores fed to the CRF layer at prediction time.'''
        P, sequence_length = self.inference(self.inp,
                                            reuse=True,
                                            trainMode=False)
        return P, sequence_length
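# Decoding sketch: how the test-time unary scores and the learned CRF
# transition matrix might be combined with Viterbi. It assumes the loss
# graph has already been built (so reuse=True works and
# model.transition_params exists); the session wiring is illustrative.
import tensorflow as tf


def decode_sketch(sess, model, inp_batch):
    unary_op, length_op = model.test_unary_score()
    scores, lengths = sess.run([unary_op, length_op],
                               feed_dict={model.inp: inp_batch})
    trans = sess.run(model.transition_params)  # learned transition scores
    results = []
    for score, n in zip(scores, lengths):
        # Run Viterbi over the first n (non-padding) positions only;
        # tf.contrib.crf.viterbi_decode works on numpy arrays.
        tags, _ = tf.contrib.crf.viterbi_decode(score[:n], trans)
        results.append(tags)
    return results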