Example #1
    char_embeddings_index[chars] = coefs  # tail of the loading loop; see the sketch below

f.close()

print('Found %s char vectors.' % len(char_embeddings_index))

ce_words = []
for ch in char_embeddings_index:  # 'ch' avoids shadowing the built-in chr()
    ce_words.append(ch)
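
For context, the truncated lines at the top of this example are the tail of a standard text-format embedding loader. A minimal self-contained sketch of the full loop (the file name is a placeholder; the per-line format "char v1 v2 ..." matches the other examples):

import numpy as np

char_embeddings_index = {}
f = open('char_embeddings.txt', 'r')  # placeholder file name
for line in f:
    values = line.split()
    chars = values[0]                                # the character itself
    coefs = np.asarray(values[1:], dtype='float32')  # its embedding vector
    char_embeddings_index[chars] = coefs
f.close()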

"""
Create Word & Label Index
"""

# char = DI(train.words + ce_words)
char = DI()
char.load('char')
# word = DI([train.words, [we_words]])
word = DI()
word.load('word')
label = DI([train.labels])  # training and test label sets should be identical

print 'Found', word.cnt - 1, 'unique words.'
print 'Found', char.cnt - 1, 'unique chars.'
print 'Found', label.cnt - 1, 'unique labels.'

word.add([train.words])
print 'Found', word.cnt - 1, 'unique words.'

"""
Create word embedding matrix
"""
Example #2
File: Main.py Project: joshuakosasih/TA
    chars = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    char_embeddings_index[chars] = coefs

f.close()

print('Found %s char vectors.' % len(char_embeddings_index))

ce_words = []
for ch in char_embeddings_index:  # 'ch' avoids shadowing the built-in chr()
    ce_words.append(ch)
"""
Create Word & Label Index
"""

char = DI(train.words + ce_words)
char.save('char')
word = DI([train.words, [we_words]])
word.save('word')
label = DI([train.labels])  # training and test label sets should be identical

print 'Found', word.cnt - 1, 'unique words.'
print 'Found', char.cnt - 1, 'unique chars.'
print 'Found', label.cnt - 1, 'unique labels.'
"""
Create word embedding matrix
"""

# EMBEDDING_DIM is taken from the last coefficient vector read by the loop above
EMBEDDING_DIM = len(coefs)
Example #3
File: main.py Project: joshuakosasih/TA
from keras.layers import GRU
from keras.layers import Input
from keras.utils import plot_model
from keras.utils import to_categorical
from keras_contrib.layers import CRF
"""
Preparing file
"""

train = DL('id-ud-train')
test = DL('id-ud-test')
"""
Create Word & Label Index
"""

word = DI([train.words, test.words])
label = DI([train.labels])  # training and test label sets should be identical

print 'Found', word.cnt - 1, 'unique words.'
print 'Found', label.cnt - 1, 'unique labels.'
"""
Load pre-trained embedding
"""

embeddings_index = {}
WE_DIR = raw_input('Enter embedding file name: ')

print 'Loading', WE_DIR, '...'
f = open(WE_DIR, 'r')
for line in f:
Example #4
    def __init__(self):
        self.textinput = ''
        self.test = ''
        self.x_test = ''
        self.x_test_char = ''
        self.results = []
        self.data = {}
        self.json_data = {}

        self.char = DI()
        self.char.load('char')
        self.word = DI()
        self.word.load('word.ner')
        self.label = DI()
        self.label.load('label.ner')

        print 'Found', self.word.cnt - 1, 'unique words.'
        print 'Found', self.char.cnt - 1, 'unique chars.'
        print 'Found', self.label.cnt - 1, 'unique labels.'

        embedding_matrix = np.zeros(
            (len(self.word.index) + 1, int(self.EMBEDDING_DIM)))
        char_embedding_matrix = np.zeros(
            (len(self.char.index) + 1, int(self.CHAR_EMBEDDING_DIM)))
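        # Zero placeholders only; the real embedding weights are restored
        # below from the pickled per-layer .wgt files.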
        """
        Create keras word model
        """

        MAX_SEQUENCE_LENGTH = self.padsize
        embedding_layer = Embedding(len(self.word.index) + 1,
                                    self.EMBEDDING_DIM,
                                    weights=[embedding_matrix],
                                    input_length=MAX_SEQUENCE_LENGTH,
                                    mask_zero=self.mask)

        sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH, ), dtype='int32')

        embedded_sequences = embedding_layer(sequence_input)
        drop = 0.4
        dropout = Dropout(rate=drop)(embedded_sequences)
        """
        Create keras char model
        """
        def reshape_one(c):
            return K.reshape(c, (tf.shape(c)[0] * self.padsize,
                                 self.char_padsize, self.CHAR_EMBEDDING_DIM))

        def reshape_two(c):
            # use floor division so the leading dim stays an integer tensor
            if merge_m_c == 'concat':
                return K.reshape(c, (tf.shape(c)[0] // self.padsize,
                                     self.padsize,
                                     self.CHAR_EMBEDDING_DIM * 2))
            else:
                return K.reshape(c, (tf.shape(c)[0] // self.padsize,
                                     self.padsize, self.CHAR_EMBEDDING_DIM))
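
        # Shape flow through the char sub-model (b = batch size):
        #   embedded chars: (b, padsize, char_padsize, CHAR_EMBEDDING_DIM)
        #   reshape_one ->  (b * padsize, char_padsize, CHAR_EMBEDDING_DIM)
        #   char BiGRU  ->  (b * padsize, CHAR_EMBEDDING_DIM * 2) under 'concat'
        #   reshape_two ->  (b, padsize, CHAR_EMBEDDING_DIM * 2)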

        MAX_WORD_LENGTH = self.char_padsize

        # embeddingPrompt('char')
        embedding_layer_c = Embedding(len(self.char.index) + 1,
                                      self.CHAR_EMBEDDING_DIM,
                                      weights=[char_embedding_matrix],
                                      input_length=MAX_WORD_LENGTH,
                                      mask_zero=self.mask)

        sequence_input_c = Input(shape=(self.padsize, MAX_WORD_LENGTH),
                                 dtype='int32')

        embedded_sequences_c = embedding_layer_c(sequence_input_c)

        dropout_c = Dropout(rate=drop)(embedded_sequences_c)

        rone = Lambda(reshape_one)(dropout_c)
        merge_m = 'concat'
        merge_m_c = merge_m
        dropout_gru = 0.5
        rec_dropout = dropout_gru
        gru_karakter = Bidirectional(GRU(self.CHAR_EMBEDDING_DIM,
                                         return_sequences=False,
                                         dropout=dropout_gru,
                                         recurrent_dropout=rec_dropout),
                                     merge_mode=merge_m,
                                     weights=None)(rone)

        rtwo = Lambda(reshape_two)(gru_karakter)
        """
        Combine word + char model
        """
        merge_m = 'concat'
        gru_kata = Bidirectional(GRU(self.EMBEDDING_DIM * 2,
                                     return_sequences=True,
                                     dropout=dropout_gru,
                                     recurrent_dropout=rec_dropout),
                                 merge_mode=merge_m,
                                 weights=None)(rtwo)

        crf = CRF(len(self.label.index) + 1, learn_mode='marginal')(gru_kata)

        self.model = Model(inputs=[sequence_input, sequence_input_c],
                           outputs=[crf])

        optimizer = 'adagrad'
        loss = 'poisson'
        self.model.summary()
        self.model.compile(loss=loss, optimizer=optimizer, metrics=['acc'])

        self.w_name = '06-05_17:19_658'
        m_layers_len = len(self.model.layers)
        for i in range(m_layers_len):
            with open(self.w_name + "_" + str(i) + ".wgt", "rb") as fp:
                w = pickle.load(fp)
                self.model.layers[i].set_weights(w)
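
For reference, a minimal sketch of the save side that would produce these per-layer .wgt files; model and w_name are assumed to be in scope, and the naming scheme mirrors the loading loop above:

import pickle

# Write each layer's weights to "<w_name>_<i>.wgt".
for i, layer in enumerate(model.layers):
    with open(w_name + "_" + str(i) + ".wgt", "wb") as fp:
        pickle.dump(layer.get_weights(), fp)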
Example #5
import pickle
from DataProcessor import DataIndexer as DI

w_name = '05-17_22:39_736'

fp = open(w_name + "_1.wgt", "rb")  # pickled weights of model layer 1
fout = open(w_name + "-char.vec", "w")
w = pickle.load(fp)
w = w[0]  # an Embedding layer has a single weight array: one row per char id
idx = DI()
idx.load('char')
rev_index = {v: k for k, v in idx.index.items()}  # id -> char lookup

# Row 0 is the padding entry, so skip it; rows at or beyond idx.cnt are unused.
for i, row in enumerate(w):
    if 0 < i < idx.cnt:
        print i
        c = rev_index[i]
        try:
            c.decode('utf-8')  # export only chars that are valid UTF-8
            fout.write(c)
            for vec in row:
                fout.write(' ' + str(vec))
            fout.write('\n')
        except UnicodeError:
            print "char is not UTF-8"

fp.close()
fout.close()
Example #6
File: Main.py Project: joshuakosasih/TA
    def createModel(self, traindata, valdata, testdata, wordemb, charemb):
        self.train = DL(traindata)
        self.val = DL(valdata)
        self.test = DL(testdata)

        # Load pre-trained embedding
        embeddings_index, we_words = self.pretrainedEmbeddingLoader(wordemb)
        char_embeddings_index, ce_words = self.pretrainedEmbeddingLoader(
            charemb)

        # Create Word & Label Index
        self.char = DI(self.train.words + ce_words)
        self.word = DI([self.train.words, [we_words]])
        self.label = DI([self.train.labels])
        print 'Found', self.word.cnt - 1, 'unique words.'
        print 'Found', self.char.cnt - 1, 'unique chars.'
        print 'Found', self.label.cnt - 1, 'unique labels.'

        # Create word embedding matrix
        self.EMBEDDING_DIM = len(self.coefs)
        embedding_matrix = np.zeros(
            (len(self.word.index) + 1, int(self.EMBEDDING_DIM)))
        for wrd, i in self.word.index.items():
            embedding_vector = embeddings_index.get(wrd)
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector

        # Create char embedding matrix
        char_embedding_matrix = np.zeros(
            (len(self.char.index) + 1, int(self.EMBEDDING_DIM)))
        for chars, i in self.char.index.items():
            embedding_vector = char_embeddings_index.get(chars)
            if embedding_vector is not None:
                char_embedding_matrix[i] = embedding_vector

        trimlen = self.padsize
        self.train.trim(trimlen)
        self.test.trim(trimlen)
        self.val.trim(trimlen)

        self.x_train = DM(self.train.words, self.word.index)
        self.x_test = DM(self.test.words, self.word.index)
        self.x_val = DM(self.val.words, self.word.index)
        print "Number of OOV:", len(self.x_test.oov_index)
        print "OOV word occurences:", self.x_test.oov
        print "Number of OOV (val):", len(self.x_val.oov_index)
        print "OOV word occurences (val):", self.x_val.oov
        padsize = self.padsize
        self.x_train.pad(padsize)
        self.x_test.pad(padsize)
        self.x_val.pad(padsize)
        print('Padded until %s tokens.' % padsize)

        self.y_train = DM(self.train.labels, self.label.index)
        self.y_test = DM(self.test.labels, self.label.index)
        self.y_val = DM(self.val.labels, self.label.index)

        self.y_train.pad(padsize)
        self.y_test.pad(padsize)
        self.y_val.pad(padsize)
        self.y_encoded = to_categorical(self.y_train.padded)
        self.y_val_enc = to_categorical(self.y_val.padded)
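        # to_categorical one-hot encodes the padded label ids, e.g.
        #   to_categorical([[1, 2, 0]]) -> [[[0, 1, 0], [0, 0, 1], [1, 0, 0]]]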

        # Converting char text data to int using index
        self.x_test_char = self.convertCharText2Int(self.test)
        self.x_train_char = self.convertCharText2Int(self.train)
        self.x_val_char = self.convertCharText2Int(self.val)

        # Create keras word model
        MAX_SEQUENCE_LENGTH = self.padsize
        embedding_layer = Embedding(len(self.word.index) + 1,
                                    self.EMBEDDING_DIM,
                                    weights=[embedding_matrix],
                                    input_length=MAX_SEQUENCE_LENGTH,
                                    mask_zero=self.mask)

        sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH, ), dtype='int32')

        embedded_sequences = embedding_layer(sequence_input)
        drop = self.dropout_embedding
        dropout = Dropout(rate=drop)(embedded_sequences)

        # Create keras char model
        def reshape_one(c):
            return K.reshape(c, (tf.shape(c)[0] * self.padsize,
                                 self.char_padsize, self.CHAR_EMBEDDING_DIM))

        def reshape_two(c):
            # use floor division so the leading dim stays an integer tensor;
            # assumes the char embedding width equals self.CHAR_EMBEDDING_DIM
            if merge_m_c == 'concat':
                return K.reshape(c, (tf.shape(c)[0] // self.padsize,
                                     self.padsize,
                                     self.CHAR_EMBEDDING_DIM * 2))
            else:
                return K.reshape(c, (tf.shape(c)[0] // self.padsize,
                                     self.padsize, self.CHAR_EMBEDDING_DIM))

        MAX_WORD_LENGTH = self.char_padsize

        embedding_layer_c = Embedding(len(self.char.index) + 1,
                                      self.EMBEDDING_DIM,
                                      weights=[char_embedding_matrix],
                                      input_length=MAX_WORD_LENGTH,
                                      mask_zero=self.mask)

        sequence_input_c = Input(shape=(self.padsize, MAX_WORD_LENGTH),
                                 dtype='int32')
        embedded_sequences_c = embedding_layer_c(sequence_input_c)
        dropout_c = Dropout(rate=drop)(embedded_sequences_c)

        rone = Lambda(reshape_one)(dropout_c)
        merge_m = 'concat'
        merge_m_c = merge_m
        dropout_gru = self.dropout_gru
        rec_dropout = dropout_gru
        gru_karakter = Bidirectional(GRU(self.CHAR_EMBEDDING_DIM,
                                         return_sequences=False,
                                         dropout=dropout_gru,
                                         recurrent_dropout=rec_dropout),
                                     merge_mode=merge_m,
                                     weights=None)(rone)

        rtwo = Lambda(reshape_two)(gru_karakter)

        # Combine word + char model
        merge_m = 'concat'
        merge = Concatenate()([dropout, rtwo])
        gru_kata = Bidirectional(GRU(self.EMBEDDING_DIM * 3,
                                     return_sequences=True,
                                     dropout=dropout_gru,
                                     recurrent_dropout=rec_dropout),
                                 merge_mode=merge_m,
                                 weights=None)(merge)

        crf = CRF(len(self.label.index) + 1, learn_mode='marginal')(gru_kata)
        self.model = Model(inputs=[sequence_input, sequence_input_c],
                           outputs=[crf])

        optimizer = self.optimizer
        loss = self.loss
        self.model.summary()
        self.model.compile(loss=loss, optimizer=optimizer, metrics=['acc'])
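
A minimal sketch of how training might then be launched with the padded tensors built above (epochs and batch_size are placeholder values; x_train_char is assumed to be an int array shaped like the padded word input):

        self.model.fit([self.x_train.padded, self.x_train_char],
                       self.y_encoded,
                       validation_data=([self.x_val.padded, self.x_val_char],
                                        self.y_val_enc),
                       epochs=10,
                       batch_size=32)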
Example #7
from keras.utils import to_categorical
from keras import backend as K
import tensorflow as tf

from keras_contrib.layers import CRF
"""
Preparing file
"""

train = DL('id-ud-train')
test = DL('id-ud-test')
"""
Create Word & Label Index
"""

char = DI(train.words + test.words)
label = DI([train.labels])  # training and test label sets should be identical

print 'Found', char.cnt - 1, 'unique chars.'
print 'Found', label.cnt - 1, 'unique labels.'
"""
Load pre-trained embedding
"""

char_embeddings_index = {}
CE_DIR = raw_input('Enter embedding file name: ')

print 'Loading', CE_DIR, '...'
f = open(CE_DIR, 'r')
for line in f: