Example #1
    def __init__(self, embeddingSize, distinctTagNum, c2vPath, numHidden):
        self.embeddingSize = embeddingSize
        self.distinctTagNum = distinctTagNum
        self.numHidden = numHidden
        self.c2v = self.load_w2v(c2vPath, FLAGS.embedding_size)
        self.words = tf.Variable(self.c2v, name="words")
        layers = [
            {
                'dilation': 1
            },
            {
                'dilation': 1
            },
            {
                'dilation': 2
            },
        ]
        if FLAGS.use_idcnn:
            self.model = IdCNN(layers, 3, FLAGS.num_hidden,
                               FLAGS.embedding_size, FLAGS.max_sentence_len,
                               FLAGS.num_tags)
        else:
            self.model = BiLSTM(FLAGS.num_hidden, FLAGS.max_sentence_len,
                                FLAGS.num_tags)
        self.trains_params = None
        self.inp = tf.placeholder(tf.int32,
                                  shape=[None, FLAGS.max_sentence_len],
                                  name="input_placeholder")
        pass
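
All of the snippets on this page read their hyper-parameters from a module-level FLAGS object defined elsewhere. A minimal sketch of the flag setup they assume, using the TF 1.x tf.app.flags API; the flag names come from the code above, but the default values here are made up:

import tensorflow as tf

flags = tf.app.flags
flags.DEFINE_integer("embedding_size", 100, "dimension of each word vector")
flags.DEFINE_integer("num_hidden", 100, "hidden units inside IdCNN/BiLSTM")
flags.DEFINE_integer("max_sentence_len", 80, "sentences are padded or cut to this length")
flags.DEFINE_integer("num_tags", 4, "size of the tag set")
flags.DEFINE_boolean("use_idcnn", True, "pick IdCNN over BiLSTM")
flags.DEFINE_integer("num_shards", 1, "embedding shards (Examples #2 and #5 only)")
FLAGS = flags.FLAGS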
Example #2
    def __init__(self, embeddingSize, distinctTagNum, c2vPath, numHidden):
        self.embeddingSize = embeddingSize
        self.distinctTagNum = distinctTagNum
        self.numHidden = numHidden
        num_shards = FLAGS.num_shards
        self.c2v = self.load_w2v(num_shards, c2vPath, FLAGS.embedding_size)
        self.words = []
        with tf.device("/gpu:0"):
            for i in range(0, num_shards):
                words_i = tf.get_variable(
                    name="words-%02d" % i,
                    initializer=tf.random_uniform(self.c2v[i].shape,
                                                  minval=-0.1, maxval=0.1),
                    trainable=False)
                self.words.append(words_i)
        layers = [
            {
                'dilation': 1
            },
            {
                'dilation': 1
            },
            {
                'dilation': 2
            },
        ]
        if FLAGS.use_idcnn:
            self.model = IdCNN(layers, 3, FLAGS.num_hidden, FLAGS.embedding_size,
                               FLAGS.max_sentence_len, FLAGS.num_tags)
        else:
            self.model = BiLSTM(
                FLAGS.num_hidden, FLAGS.max_sentence_len, FLAGS.num_tags)
        self.trains_params = None
        self.inp = tf.placeholder(tf.int32,
                                  shape=[None, FLAGS.max_sentence_len],
                                  name="input_placeholder")
        pass
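
This variant shards the embedding matrix across num_shards variables. A self-contained sketch, separate from the example, of how tf.nn.embedding_lookup consumes such a list: with partition_strategy="div" the shards are treated as contiguous row ranges of one large table, which matches the contiguous slicing that load_w2v performs in Example #5 below.

import numpy as np
import tensorflow as tf

# two shards of 4 rows each, standing in for "words-00" and "words-01"
shards = [tf.constant(np.arange(0, 12, dtype=np.float32).reshape(4, 3)),
          tf.constant(np.arange(12, 24, dtype=np.float32).reshape(4, 3))]
ids = tf.constant([0, 4, 7])  # with "div", rows 0-3 live in shard 0, rows 4-7 in shard 1
vecs = tf.nn.embedding_lookup(shards, ids, partition_strategy="div")
with tf.Session() as sess:
    print(sess.run(vecs))  # rows 0, 4 and 7 of the concatenated table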
Example #3
    def __init__(self, embeddingSize, distinctTagNum, c2vPath, numHidden):
        self.embeddingSize = embeddingSize
        self.distinctTagNum = distinctTagNum
        self.numHidden = numHidden
        # keep word2vec as a TensorFlow variable, so the prediction step
        # does not need to reload the whole w2v file; it only feeds the w2v indices
        self.c2v = self.load_w2v(c2vPath,
                                 FLAGS.embedding_size)  # word2vec path
        self.words = tf.Variable(self.c2v, name="words")
        layers = [  # the iterated dilated CNN's block parameters
            {
                'dilation': 1
            },
            {
                'dilation': 1
            },
            {
                'dilation': 2
            },
        ]
        if FLAGS.use_idcnn:
            self.model = IdCNN(layers, 3, FLAGS.num_hidden,
                               FLAGS.embedding_size, FLAGS.max_sentence_len,
                               FLAGS.num_tags)  # filter width is 3
        else:
            self.model = BiLSTM(FLAGS.num_hidden, FLAGS.max_sentence_len,
                                FLAGS.num_tags)
        self.trains_params = None
        self.inp = tf.placeholder(tf.int32,
                                  shape=[None, FLAGS.max_sentence_len],
                                  name="input_placeholder")
        pass
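
For reference, load_w2v (shown in the full examples below) parses a plain-text word2vec file: a header line "<vocab_size> <dim>" followed by one "<token> <v1> ... <v_dim>" line per word. A toy file in that layout; the values are made up and only the layout matters:

with open("toy_c2v.txt", "w") as fp:
    fp.write("3 4\n")                    # header: vocab_size=3, dim=4
    fp.write("<UNK> 0.1 0.2 0.3 0.4\n")
    fp.write("hello 0.5 0.6 0.7 0.8\n")
    fp.write("world 0.9 1.0 1.1 1.2\n")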
Example #4
    def __init__(self, embeddingSize, distinctTagNum, c2vPath, numHidden):
        self.embeddingSize = embeddingSize
        self.distinctTagNum = distinctTagNum
        self.numHidden = numHidden
        self.c2v = self.load_w2v(c2vPath, FLAGS.embedding_size)
        self.words = tf.Variable(self.c2v, name="words")
        layers = [
            {
                'dilation': 1
            },
            {
                'dilation': 1
            },
            {
                'dilation': 2
            },
        ]
        if FLAGS.use_idcnn:
            self.model = IdCNN(layers, 3, FLAGS.num_hidden, FLAGS.embedding_size,
                               FLAGS.max_sentence_len, FLAGS.num_tags)
        else:
            self.model = BiLSTM(
                FLAGS.num_hidden, FLAGS.max_sentence_len, FLAGS.num_tags)
        self.trains_params = None
        self.inp = tf.placeholder(tf.int32,
                                  shape=[None, FLAGS.max_sentence_len],
                                  name="input_placeholder")
        pass
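
The layers list configures three dilated-convolution sub-layers (dilations 1, 1, 2), the block pattern of an Iterated Dilated CNN. The IdCNN class itself is not shown in these examples; the sketch below only illustrates a single dilated convolution over embeddings, with made-up shapes, using the same expand_dims trick that inference() applies in the full examples:

import tensorflow as tf

emb = tf.placeholder(tf.float32, [None, 80, 100])        # [batch, time, embedding]
x = tf.expand_dims(emb, 1)                               # -> [batch, 1, time, embedding]
w = tf.get_variable("dilated_filter", [1, 3, 100, 100])  # height 1, width 3
y = tf.nn.atrous_conv2d(x, w, rate=2, padding="SAME")    # dilation 2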
Example #5
class Model:
    def __init__(self, embeddingSize, distinctTagNum, c2vPath, numHidden):
        self.embeddingSize = embeddingSize
        self.distinctTagNum = distinctTagNum
        self.numHidden = numHidden
        num_shards = FLAGS.num_shards
        self.c2v = self.load_w2v(num_shards, c2vPath, FLAGS.embedding_size)
        self.words = []
        with tf.device("/cpu:0"):
            for i in range(0, num_shards):
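                # note: tf.random_uniform uses only the *shape* of self.c2v[i];
                # the pre-trained values are not copied into the variable here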
                words_i = tf.get_variable(name="words-%02d" % i,
                                          initializer=tf.random_uniform(
                                              self.c2v[i].shape,
                                              minval=-0.1,
                                              maxval=0.1),
                                          trainable=False)
                self.words.append(words_i)
        layers = [
            {
                'dilation': 1
            },
            {
                'dilation': 1
            },
            {
                'dilation': 2
            },
        ]
        if FLAGS.use_idcnn:
            self.model = IdCNN(layers, 3, FLAGS.num_hidden,
                               FLAGS.embedding_size, FLAGS.max_sentence_len,
                               FLAGS.num_tags)
        else:
            self.model = BiLSTM(FLAGS.num_hidden, FLAGS.max_sentence_len,
                                FLAGS.num_tags)
        self.trains_params = None
        self.inp = tf.placeholder(tf.int32,
                                  shape=[None, FLAGS.max_sentence_len],
                                  name="input_placeholder")
        pass

    def length(self, data):
        used = tf.sign(tf.abs(data))
        length = tf.reduce_sum(used, axis=1)
        length = tf.cast(length, tf.int32)
        return length

    def inference(self, X, reuse=None, trainMode=True):
        word_vectors = tf.nn.embedding_lookup(self.words,
                                              X,
                                              partition_strategy="div")
        length = self.length(X)
        reuse = not trainMode  # note: this overrides the reuse argument passed in
        if FLAGS.use_idcnn:
            word_vectors = tf.expand_dims(word_vectors, 1)
            unary_scores = self.model.inference(word_vectors, reuse=reuse)
        else:
            unary_scores = self.model.inference(word_vectors,
                                                length,
                                                reuse=reuse)
        return unary_scores, length

    def loss(self, X, Y):
        P, sequence_length = self.inference(X)
        log_likelihood, self.transition_params = tf.contrib.crf.crf_log_likelihood(
            P, Y, sequence_length)
        loss = tf.reduce_mean(-log_likelihood)
        return loss

    def load_w2v(self, num_shards, path, expectDim):
        with open(path, "r") as fp:
            print("load data from:", path)
            line = next(fp).strip()
            ss = line.split(" ")
            total = int(ss[0])
            dim = int(ss[1])
            assert dim == expectDim, "dim: %d, expectDim: %d" % (dim, expectDim)
            ws = []
            mv = np.zeros(dim, dtype=np.float32)
            second = -1
            for t, line in enumerate(fp):  # one vector per remaining line
                line = line.strip()
                ss = line.split(" ")
                assert len(ss) == (dim + 1)
                if ss[0] in ('<UNK>', '<unk>'):
                    second = t  # remember which row holds the <UNK> vector
                vals = [float(s) for s in ss[1:]]
                mv += vals
                ws.append(vals)
                if len(ws) % 50000 == 0:
                    print("word2vec data loading:", len(ws))
            mv /= total
            assert second != -1
            # pad with the mean vector until the row count divides evenly into shards
            while len(ws) % num_shards != 0:
                ws.append(mv)
            if second != 1:
                # move the <UNK> vector to row 1
                ws[1], ws[second] = ws[second], ws[1]
        print("loading completed .....")
        print("splitting the 2-D array into per-shard slices")
        total = len(ws)
        range_size = total // num_shards
        ws = np.asarray(ws, dtype=np.float32)
        sub_ws = []
        for i in range(num_shards):
            begin_ = i * range_size
            ends_ = min((i + 1) * range_size, total)
            sub_ws.append(ws[begin_:ends_, ])
        return np.array(sub_ws, dtype=np.float32)

    def test_unary_score(self):
        P, sequence_length = self.inference(self.inp,
                                            reuse=True,
                                            trainMode=False)
        return P, sequence_length
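
A hypothetical wiring of this class into a training graph; the constructor arguments, the vector-file path, and the optimizer choice are assumptions rather than part of the original example:

model = Model(FLAGS.embedding_size, FLAGS.num_tags, "char_vec.txt", FLAGS.num_hidden)
X = tf.placeholder(tf.int32, [None, FLAGS.max_sentence_len])
Y = tf.placeholder(tf.int32, [None, FLAGS.max_sentence_len])
total_loss = model.loss(X, Y)  # also creates model.transition_params
train_op = tf.train.AdamOptimizer(1e-3).minimize(total_loss)
test_scores, test_lengths = model.test_unary_score()  # reuses the same weights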
Example #6
class Model:
    def __init__(self, embeddingSize, distinctTagNum, c2vPath, numHidden):
        self.embeddingSize = embeddingSize
        self.distinctTagNum = distinctTagNum
        self.numHidden = numHidden
        self.c2v = self.load_w2v(c2vPath, FLAGS.embedding_size)
        self.words = tf.Variable(self.c2v, name="words")
        layers = [
            {
                'dilation': 1
            },
            {
                'dilation': 1
            },
            {
                'dilation': 2
            },
        ]
        if FLAGS.use_idcnn:
            self.model = IdCNN(layers, 3, FLAGS.num_hidden,
                               FLAGS.embedding_size, FLAGS.max_sentence_len,
                               FLAGS.num_tags)
        else:
            self.model = BiLSTM(FLAGS.num_hidden, FLAGS.max_sentence_len,
                                FLAGS.num_tags)
        self.trains_params = None
        self.inp = tf.placeholder(tf.int32,
                                  shape=[None, FLAGS.max_sentence_len],
                                  name="input_placeholder")
        pass

    def length(self, data):
        used = tf.sign(tf.abs(data))
        length = tf.reduce_sum(used, axis=1)
        length = tf.cast(length, tf.int32)
        return length

    def inference(self, X, reuse=None, trainMode=True):
        word_vectors = tf.nn.embedding_lookup(self.words, X)
        length = self.length(X)
        reuse = not trainMode  # note: this overrides the reuse argument passed in
        if FLAGS.use_idcnn:
            word_vectors = tf.expand_dims(word_vectors, 1)
            unary_scores = self.model.inference(word_vectors, reuse=reuse)
        else:
            unary_scores = self.model.inference(word_vectors,
                                                length,
                                                reuse=reuse)
        return unary_scores, length

    def loss(self, X, Y):
        P, sequence_length = self.inference(X)
        log_likelihood, self.transition_params = tf.contrib.crf.crf_log_likelihood(
            P, Y, sequence_length)
        loss = tf.reduce_mean(-log_likelihood)
        return loss

    def load_w2v(self, path, expectDim):
        fp = open(path, "r")
        print("load data from:", path)
        line = fp.readline().strip()
        ss = line.split(" ")
        total = int(ss[0])
        dim = int(ss[1])
        assert (dim == expectDim)
        ws = []
        mv = [0 for i in range(dim)]
        second = -1
        for t in range(total):
            line = fp.readline().strip()
            ss = line.split(" ")
            assert (len(ss) == (dim + 1))
            if ss[0] == '<UNK>':
                second = t  # remember which row holds the <UNK> vector
            vals = []
            for i in range(1, dim + 1):
                fv = float(ss[i])
                mv[i - 1] += fv
                vals.append(fv)
            ws.append(vals)
        for i in range(dim):
            mv[i] = mv[i] / total
        assert (second != -1)
        # append one more token (the mean vector); maybe useless
        ws.append(mv)
        if second != 1:
            # move the <UNK> vector to row 1
            ws[1], ws[second] = ws[second], ws[1]
        fp.close()
        return np.asarray(ws, dtype=np.float32)

    def test_unary_score(self):
        P, sequence_length = self.inference(self.inp,
                                            reuse=True,
                                            trainMode=False)
        return P, sequence_length
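
length() relies on id 0 being the padding token: tf.sign(tf.abs(x)) maps every non-zero id to 1, so the row sum counts the real tokens. A quick standalone check:

import tensorflow as tf

batch = tf.constant([[5, 9, 2, 0, 0],
                     [7, 0, 0, 0, 0]])  # 0 = padding id
lengths = tf.cast(tf.reduce_sum(tf.sign(tf.abs(batch)), axis=1), tf.int32)
with tf.Session() as sess:
    print(sess.run(lengths))  # [3 1]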
Example #7
class Model:
    def __init__(self, embeddingSize, distinctTagNum, c2vPath, numHidden):
        self.embeddingSize = embeddingSize
        self.distinctTagNum = distinctTagNum
        self.numHidden = numHidden
        # keep word2vec as a TensorFlow variable, so the prediction step
        # does not need to reload the whole w2v file; it only feeds the w2v indices
        self.c2v = self.load_w2v(c2vPath,
                                 FLAGS.embedding_size)  # word2vec path
        self.words = tf.Variable(self.c2v, name="words")
        layers = [  # iterated dilated CNN's block parameter
            {
                'dilation': 1
            },
            {
                'dilation': 1
            },
            {
                'dilation': 2
            },
        ]
        if FLAGS.use_idcnn:
            self.model = IdCNN(layers, 3, FLAGS.num_hidden,
                               FLAGS.embedding_size, FLAGS.max_sentence_len,
                               FLAGS.num_tags)  # filter is 3x3
        else:
            self.model = BiLSTM(FLAGS.num_hidden, FLAGS.max_sentence_len,
                                FLAGS.num_tags)
        self.trains_params = None
        self.inp = tf.placeholder(tf.int32,
                                  shape=[None, FLAGS.max_sentence_len],
                                  name="input_placeholder")
        pass

    def length(self, data):
        used = tf.sign(tf.abs(data))
        length = tf.reduce_sum(used, axis=1)
        length = tf.cast(length, tf.int32)
        return length

    def inference(self, X, reuse=None, trainMode=True):
        word_vectors = tf.nn.embedding_lookup(self.words, X)
        length = self.length(X)
        reuse = not trainMode  # note: this overrides the reuse argument passed in
        if FLAGS.use_idcnn:
            word_vectors = tf.expand_dims(word_vectors, 1)
            unary_scores = self.model.inference(word_vectors, reuse=reuse)
        else:
            unary_scores = self.model.inference(word_vectors,
                                                length,
                                                reuse=reuse)
        return unary_scores, length

    def loss(self, X, Y):
        P, sequence_length = self.inference(X)
        self.P = P
        self.realY = Y
        log_likelihood, self.transition_params = tf.contrib.crf.crf_log_likelihood(
            P, Y, sequence_length)
        self.log_likelihood = log_likelihood
        loss = tf.reduce_mean(-log_likelihood)
        return loss

    def load_w2v(self, path, expectDim):
        '''
        returns the embedding matrix: the <UNK> vector is moved to row 1,
        and one extra row holding the per-dimension mean is appended at the end
        '''

        fp = open(path, "r")
        print("load data from:", path)
        line = fp.readline().strip()
        ss = line.split(" ")
        total = int(ss[0])  # how many word vector in the file
        dim = int(ss[1])  # each word-vector's dimension
        assert (dim == expectDim)
        ws = []
        mv = [0 for i in range(dim)]  # initial vector to zero vector
        second = -1
        for t in range(total):
            line = fp.readline().strip()
            ss = line.split(" ")
            assert (len(ss) == (dim + 1))
            if ss[0] == '<UNK>':
                second = t  # remember which row holds the <UNK> vector
            vals = []
            for i in range(1, dim + 1):  # skip the token itself; parse only the vector
                fv = float(ss[i])
                mv[i - 1] += fv
                vals.append(fv)
            ws.append(vals)
        for i in range(dim):
            mv[i] = mv[i] / total  # per-dimension mean over all words
        assert (second != -1)
        # append one more token (the mean vector); maybe useless
        ws.append(mv)
        if second != 1:
            # move the <UNK> vector to row 1
            ws[1], ws[second] = ws[second], ws[1]
        fp.close()
        return np.asarray(ws, dtype=np.float32)

    def test_unary_score(self):
        ''' unary score: for CRF'''
        P, sequence_length = self.inference(self.inp,
                                            reuse=True,
                                            trainMode=False)
        return P, sequence_length
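
At test time the unary scores from test_unary_score feed a Viterbi decode together with the CRF transition matrix. A sketch, assuming the loss() graph was already built (transition_params only exists after loss() runs) and that sentence_ids is a hypothetical padded matrix of word ids:

scores_op, lengths_op = model.test_unary_score()
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    scores, lengths, trans = sess.run(
        [scores_op, lengths_op, model.transition_params],
        feed_dict={model.inp: sentence_ids})
    for score, length in zip(scores, lengths):
        tags, _ = tf.contrib.crf.viterbi_decode(score[:length], trans)
        print(tags)  # best tag sequence for one sentence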
Example #8
class Model:
    def __init__(self, embeddingSize, distinctTagNum, c2vPath, numHidden):
        self.embeddingSize = embeddingSize
        self.distinctTagNum = distinctTagNum
        self.numHidden = numHidden
        self.c2v = self.load_w2v(c2vPath, FLAGS.embedding_size)
        self.words = tf.Variable(self.c2v, name="words")
        layers = [
            {
                'dilation': 1
            },
            {
                'dilation': 1
            },
            {
                'dilation': 2
            },
        ]
        if FLAGS.use_idcnn:
            self.model = IdCNN(layers, 3, FLAGS.num_hidden, FLAGS.embedding_size,
                               FLAGS.max_sentence_len, FLAGS.num_tags)
        else:
            self.model = BiLSTM(
                FLAGS.num_hidden, FLAGS.max_sentence_len, FLAGS.num_tags)
        self.trains_params = None
        self.inp = tf.placeholder(tf.int32,
                                  shape=[None, FLAGS.max_sentence_len],
                                  name="input_placeholder")
        pass

    def length(self, data):
        used = tf.sign(tf.abs(data))
        length = tf.reduce_sum(used, axis=1)
        length = tf.cast(length, tf.int32)
        return length

    def inference(self, X, reuse=None, trainMode=True):
        word_vectors = tf.nn.embedding_lookup(self.words, X)
        length = self.length(X)
        reuse = not trainMode  # note: this overrides the reuse argument passed in
        if FLAGS.use_idcnn:
            word_vectors = tf.expand_dims(word_vectors, 1)
            unary_scores = self.model.inference(word_vectors, reuse=reuse)
        else:
            unary_scores = self.model.inference(
                word_vectors, length, reuse=reuse)
        return unary_scores, length

    def loss(self, X, Y):
        P, sequence_length = self.inference(X)
        log_likelihood, self.transition_params = tf.contrib.crf.crf_log_likelihood(
            P, Y, sequence_length)
        loss = tf.reduce_mean(-log_likelihood)
        return loss

    def load_w2v(self, path, expectDim):
        fp = open(path, "r")
        print("load data from:", path)
        line = fp.readline().strip()
        ss = line.split(" ")
        total = int(ss[0])
        dim = int(ss[1])
        assert (dim == expectDim)
        ws = []
        mv = [0 for i in range(dim)]
        second = -1
        for t in range(total):
            line = fp.readline().strip()
            ss = line.split(" ")
            assert (len(ss) == (dim + 1))
            if ss[0] == '<UNK>':
                second = t  # remember which row holds the <UNK> vector
            vals = []
            for i in range(1, dim + 1):
                fv = float(ss[i])
                mv[i - 1] += fv
                vals.append(fv)
            ws.append(vals)
        for i in range(dim):
            mv[i] = mv[i] / total
        assert (second != -1)
        # append one more token (the mean vector); maybe useless
        ws.append(mv)
        if second != 1:
            # move the <UNK> vector to row 1
            ws[1], ws[second] = ws[second], ws[1]
        fp.close()
        return np.asarray(ws, dtype=np.float32)

    def test_unary_score(self):
        P, sequence_length = self.inference(self.inp,
                                            reuse=True,
                                            trainMode=False)
        return P, sequence_length