char_embeddings_index[chars] = coefs f.close() print('Found %s char vectors.' % len(char_embeddings_index)) ce_words = [] for chr in char_embeddings_index: ce_words.append(chr) """ Create Word & Label Index """ # char = DI(train.words + ce_words) char = DI() char.load('char') # word = DI([train.words, [we_words]]) word = DI() word.load('word') label = DI([train.labels]) # training label and testing label should be the same print 'Found', word.cnt - 1, 'unique words.' print 'Found', char.cnt - 1, 'unique chars.' print 'Found', label.cnt - 1, 'unique labels.' word.add([train.words]) print 'Found', word.cnt - 1, 'unique words.' """ Create word embedding matrix
chars = values[0] coefs = np.asarray(values[1:], dtype='float32') char_embeddings_index[chars] = coefs f.close() print('Found %s char vectors.' % len(char_embeddings_index)) ce_words = [] for chr in char_embeddings_index: ce_words.append(chr) """ Create Word & Label Index """ char = DI(train.words + ce_words) char.save('char') word = DI([train.words, [we_words]]) word.save('word') label = DI([train.labels ]) # training label and testing label should be the same print 'Found', word.cnt - 1, 'unique words.' print 'Found', char.cnt - 1, 'unique chars.' print 'Found', label.cnt - 1, 'unique labels.' """ Create word embedding matrix """ EMBEDDING_DIM = len(coefs)
from keras.layers import GRU
from keras.layers import Input
from keras.utils import plot_model
from keras.utils import to_categorical
from keras_contrib.layers import CRF
""" Preparing file """
# Build the train and test data holders from the names 'id-ud-train' and
# 'id-ud-test' (presumably dataset file stems resolved inside DL -- TODO
# confirm against the DL implementation).
train = DL('id-ud-train')
test = DL('id-ud-test')
""" Create Word & Label Index """
# The word index is built from the words of both splits; the label index is
# built from the training split only.
word = DI([train.words, test.words])
label = DI([train.labels ]) # training label and testing label should be the same
# `cnt - 1` is what gets reported as the number of unique entries
# (presumably `cnt` includes one reserved slot -- TODO confirm in DI).
print 'Found', word.cnt - 1, 'unique words.'
print 'Found', label.cnt - 1, 'unique labels.'
""" Load pre-trained embedding """
# Starts empty; presumably populated by the loop that begins below, whose
# body is outside this excerpt.
embeddings_index = {}
# The embedding file path is asked for interactively on stdin.
WE_DIR = raw_input('Enter embedding file name: ')
print 'Loading', WE_DIR, '...'
# NOTE(review): opened without a `with` block; the matching close is not
# visible in this excerpt.
f = open(WE_DIR, 'r')
for line in f:
def __init__(self):
    """Load the saved indexers, rebuild the network and restore its weights.

    The method reads ``self.EMBEDDING_DIM``, ``self.CHAR_EMBEDDING_DIM``,
    ``self.padsize``, ``self.char_padsize`` and ``self.mask``; none of them
    is assigned here, so they must already exist (e.g. as class attributes).

    Side effects: loads the 'char', 'word.ner' and 'label.ner' indexers,
    prints their sizes and the model summary, and reads one pickled weight
    file per model layer named ``<w_name>_<layer position>.wgt`` from the
    current working directory.
    """
    # Per-request state, filled in by other methods.
    self.textinput = ''
    self.test = ''
    self.x_test = ''
    self.x_test_char = ''
    self.results = []
    self.data = {}
    self.json_data = {}

    # Indexers previously saved to disk.
    self.char = DI()
    self.char.load('char')
    self.word = DI()
    self.word.load('word.ner')
    self.label = DI()
    self.label.load('label.ner')
    print('Found %s unique words.' % (self.word.cnt - 1))
    print('Found %s unique chars.' % (self.char.cnt - 1))
    print('Found %s unique labels.' % (self.label.cnt - 1))

    # Zero-filled placeholders: they only give the Embedding layers their
    # shape here, no pre-trained vectors are copied in by this method.
    embedding_matrix = np.zeros(
        (len(self.word.index) + 1, int(self.EMBEDDING_DIM)))
    char_embedding_matrix = np.zeros(
        (len(self.char.index) + 1, int(self.CHAR_EMBEDDING_DIM)))

    # Create keras word model
    MAX_SEQUENCE_LENGTH = self.padsize
    embedding_layer = Embedding(len(self.word.index) + 1,
                                self.EMBEDDING_DIM,
                                weights=[embedding_matrix],
                                input_length=MAX_SEQUENCE_LENGTH,
                                mask_zero=self.mask)
    sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH, ), dtype='int32')
    embedded_sequences = embedding_layer(sequence_input)
    drop = 0.4
    # NOTE(review): `dropout` (the word branch) is not consumed by any later
    # layer in this method; only the char branch reaches the GRU/CRF below.
    # Left untouched because the weight files are matched to layers by
    # position -- confirm this is the intended architecture.
    dropout = Dropout(rate=drop)(embedded_sequences)

    # Create keras char model
    def reshape_one(c):
        # Fold the word axis into the batch axis so the char GRU sees one
        # word (a sequence of char vectors) per sample.
        return K.reshape(c, (tf.shape(c)[0] * self.padsize,
                             self.char_padsize, self.CHAR_EMBEDDING_DIM))

    def reshape_two(c):
        # Undo reshape_one: regroup the per-word vectors into sentences.
        # Floor division keeps the batch size an integer tensor; a plain `/`
        # is only integer division under Python 2 semantics and yields a
        # float tensor (invalid as a shape) under true division.
        if merge_m_c == 'concat':
            # A 'concat' bidirectional layer doubles the feature width.
            return K.reshape(c, (tf.shape(c)[0] // self.padsize,
                                 self.padsize, self.CHAR_EMBEDDING_DIM * 2))
        else:
            return K.reshape(c, (tf.shape(c)[0] // self.padsize,
                                 self.padsize, self.CHAR_EMBEDDING_DIM))

    MAX_WORD_LENGTH = self.char_padsize
    # embeddingPrompt('char')
    embedding_layer_c = Embedding(len(self.char.index) + 1,
                                  self.CHAR_EMBEDDING_DIM,
                                  weights=[char_embedding_matrix],
                                  input_length=MAX_WORD_LENGTH,
                                  mask_zero=self.mask)
    sequence_input_c = Input(shape=(self.padsize, MAX_WORD_LENGTH, ),
                             dtype='int32')
    embedded_sequences_c = embedding_layer_c(sequence_input_c)
    dropout_c = Dropout(rate=drop)(embedded_sequences_c)
    rone = Lambda(reshape_one)(dropout_c)
    merge_m = 'concat'
    merge_m_c = merge_m  # read by reshape_two when the Lambda is applied
    dropout_gru = 0.5
    rec_dropout = dropout_gru
    gru_karakter = Bidirectional(GRU(self.CHAR_EMBEDDING_DIM,
                                     return_sequences=False,
                                     dropout=dropout_gru,
                                     recurrent_dropout=rec_dropout),
                                 merge_mode=merge_m,
                                 weights=None)(rone)
    rtwo = Lambda(reshape_two)(gru_karakter)

    # Combine word + char model
    merge_m = 'concat'
    gru_kata = Bidirectional(GRU(self.EMBEDDING_DIM * 2,
                                 return_sequences=True,
                                 dropout=dropout_gru,
                                 recurrent_dropout=rec_dropout),
                             merge_mode=merge_m,
                             weights=None)(rtwo)
    crf = CRF(len(self.label.index) + 1, learn_mode='marginal')(gru_kata)
    self.model = Model(inputs=[sequence_input, sequence_input_c],
                       outputs=[crf])
    optimizer = 'adagrad'
    loss = 'poisson'
    self.model.summary()
    self.model.compile(loss=loss, optimizer=optimizer, metrics=['acc'])

    # Restore the trained weights, one pickle file per layer, matched to the
    # layers purely by their position in `self.model.layers`.
    # NOTE(review): pickle.load can execute arbitrary code from the file;
    # only load weight files produced by this project.
    self.w_name = '06-05_17:19_658'
    for i, layer in enumerate(self.model.layers):
        with open(self.w_name + "_" + str(i) + ".wgt", "rb") as fp:
            layer.set_weights(pickle.load(fp))
import pickle

from DataProcessor import DataIndexer as DI

# Export rows of a pickled weight matrix to a text ".vec" file, one line per
# character of the saved 'char' index: "<char> <v1> <v2> ...".

# Common prefix of the pickled weight file and of the output file.
w_name = '05-17_22:39_736'

# NOTE(review): pickle.load can execute arbitrary code from the file; only
# use weight files produced by this project.
with open(w_name + "_1.wgt", "rb") as fp:
    # The pickle holds a sequence; its first element is the matrix exported
    # below (presumably the char embedding weights -- TODO confirm that
    # layer 1 is the char Embedding layer).
    w = pickle.load(fp)[0]

idx = DI()
idx.load('char')

# Invert the char -> position mapping once instead of scanning the value
# list for every row. `setdefault` keeps the first key seen for a position,
# matching what a `values.index(i)` lookup would return.
index_to_char = {}
for key, position in idx.index.items():
    index_to_char.setdefault(position, key)

with open(w_name + "-char.vec", "w") as fout:
    for i, char in enumerate(w):
        # Row 0 and rows at or beyond `idx.cnt` are not exported (row 0 is
        # presumably the padding slot -- TODO confirm).
        if i == 0 or i >= idx.cnt:
            continue
        print(i)
        c = index_to_char[i]
        try:
            # Validation only: the decoded value is discarded and the raw
            # key is what gets written.
            c.decode('utf-8')
        except UnicodeError:
            print("char is not UTF-8")
            continue
        fout.write(c + ''.join(' ' + str(vec) for vec in char) + '\n')
def createModel(self, traindata, valdata, testdata, wordemb, charemb):
    """Load the data, build the indexes and inputs, and compile the model.

    Args:
        traindata: argument handed to ``DL`` for the training split.
        valdata: argument handed to ``DL`` for the validation split.
        testdata: argument handed to ``DL`` for the test split.
        wordemb: argument handed to ``self.pretrainedEmbeddingLoader`` for
            the word embeddings.
        charemb: argument handed to ``self.pretrainedEmbeddingLoader`` for
            the char embeddings.

    Besides the arguments, the method reads ``self.padsize``,
    ``self.char_padsize``, ``self.mask``, ``self.CHAR_EMBEDDING_DIM``,
    ``self.dropout_embedding``, ``self.dropout_gru``, ``self.optimizer``,
    ``self.loss`` and ``self.coefs``.

    Returns nothing. Results are stored on ``self``: the data splits, the
    indexes, the padded inputs/targets (``x_*``, ``y_*``, ``x_*_char``),
    ``EMBEDDING_DIM`` and the compiled ``model``. Statistics and the model
    summary are printed.
    """
    self.train = DL(traindata)
    self.val = DL(valdata)
    self.test = DL(testdata)

    # Load pre-trained embedding
    embeddings_index, we_words = self.pretrainedEmbeddingLoader(wordemb)
    char_embeddings_index, ce_words = self.pretrainedEmbeddingLoader(
        charemb)

    # Create Word & Label Index
    self.char = DI(self.train.words + ce_words)
    self.word = DI([self.train.words, [we_words]])
    self.label = DI([self.train.labels])
    print('Found %s unique words.' % (self.word.cnt - 1))
    print('Found %s unique chars.' % (self.char.cnt - 1))
    print('Found %s unique labels.' % (self.label.cnt - 1))

    # Create word embedding matrix
    # NOTE(review): `self.coefs` is not assigned in this method; presumably
    # pretrainedEmbeddingLoader sets it, in which case it reflects the most
    # recent call (charemb) rather than wordemb -- TODO confirm.
    self.EMBEDDING_DIM = len(self.coefs)
    embedding_matrix = np.zeros(
        (len(self.word.index) + 1, int(self.EMBEDDING_DIM)))
    for wrd, i in self.word.index.items():
        embedding_vector = embeddings_index.get(wrd)
        # Entries without a pre-trained vector keep their all-zero row.
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    # Create char embedding matrix
    char_embedding_matrix = np.zeros(
        (len(self.char.index) + 1, int(self.EMBEDDING_DIM)))
    for chars, i in self.char.index.items():
        embedding_vector = char_embeddings_index.get(chars)
        if embedding_vector is not None:
            char_embedding_matrix[i] = embedding_vector

    # Trim every split to the padding size before indexing.
    trimlen = self.padsize
    self.train.trim(trimlen)
    self.test.trim(trimlen)
    self.val.trim(trimlen)
    self.x_train = DM(self.train.words, self.word.index)
    self.x_test = DM(self.test.words, self.word.index)
    self.x_val = DM(self.val.words, self.word.index)
    print('Number of OOV: %s' % len(self.x_test.oov_index))
    print('OOV word occurences: %s' % (self.x_test.oov,))
    print('Number of OOV (val): %s' % len(self.x_val.oov_index))
    print('OOV word occurences (val): %s' % (self.x_val.oov,))
    padsize = self.padsize
    self.x_train.pad(padsize)
    self.x_test.pad(padsize)
    self.x_val.pad(padsize)
    print('Padded until %s tokens.' % padsize)

    self.y_train = DM(self.train.labels, self.label.index)
    self.y_test = DM(self.test.labels, self.label.index)
    self.y_val = DM(self.val.labels, self.label.index)
    self.y_train.pad(padsize)
    self.y_test.pad(padsize)
    self.y_val.pad(padsize)
    # Only the train and validation targets are one-hot encoded here.
    self.y_encoded = to_categorical(self.y_train.padded)
    self.y_val_enc = to_categorical(self.y_val.padded)

    # Converting char text data to int using index
    self.x_test_char = self.convertCharText2Int(self.test)
    self.x_train_char = self.convertCharText2Int(self.train)
    self.x_val_char = self.convertCharText2Int(self.val)

    # Create keras word model
    MAX_SEQUENCE_LENGTH = self.padsize
    embedding_layer = Embedding(len(self.word.index) + 1,
                                self.EMBEDDING_DIM,
                                weights=[embedding_matrix],
                                input_length=MAX_SEQUENCE_LENGTH,
                                mask_zero=self.mask)
    sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH, ), dtype='int32')
    embedded_sequences = embedding_layer(sequence_input)
    drop = self.dropout_embedding
    dropout = Dropout(rate=drop)(embedded_sequences)

    # Create keras char model
    def reshape_one(c):
        # Fold the word axis into the batch axis so the char GRU sees one
        # word (a sequence of char vectors) per sample.
        return K.reshape(c, (tf.shape(c)[0] * self.padsize,
                             self.char_padsize, self.CHAR_EMBEDDING_DIM))

    def reshape_two(c):
        # Undo reshape_one: regroup the per-word vectors into sentences.
        # Floor division keeps the batch size an integer tensor; a plain `/`
        # is only integer division under Python 2 semantics and yields a
        # float tensor (invalid as a shape) under true division.
        if merge_m_c == 'concat':
            # A 'concat' bidirectional layer doubles the feature width.
            return K.reshape(c, (tf.shape(c)[0] // self.padsize,
                                 self.padsize, self.CHAR_EMBEDDING_DIM * 2))
        else:
            return K.reshape(c, (tf.shape(c)[0] // self.padsize,
                                 self.padsize, self.CHAR_EMBEDDING_DIM))

    MAX_WORD_LENGTH = self.char_padsize
    # NOTE(review): the char Embedding is sized with EMBEDDING_DIM while
    # reshape_one and the char GRU use CHAR_EMBEDDING_DIM; this only lines
    # up when both values are equal -- TODO confirm.
    embedding_layer_c = Embedding(len(self.char.index) + 1,
                                  self.EMBEDDING_DIM,
                                  weights=[char_embedding_matrix],
                                  input_length=MAX_WORD_LENGTH,
                                  mask_zero=self.mask)
    sequence_input_c = Input(shape=(self.padsize, MAX_WORD_LENGTH, ),
                             dtype='int32')
    embedded_sequences_c = embedding_layer_c(sequence_input_c)
    dropout_c = Dropout(rate=drop)(embedded_sequences_c)
    rone = Lambda(reshape_one)(dropout_c)
    merge_m = 'concat'
    merge_m_c = merge_m  # read by reshape_two when the Lambda is applied
    dropout_gru = self.dropout_gru
    rec_dropout = dropout_gru
    gru_karakter = Bidirectional(GRU(self.CHAR_EMBEDDING_DIM,
                                     return_sequences=False,
                                     dropout=dropout_gru,
                                     recurrent_dropout=rec_dropout),
                                 merge_mode=merge_m,
                                 weights=None)(rone)
    rtwo = Lambda(reshape_two)(gru_karakter)

    # Combine word + char model
    merge_m = 'concat'
    # Word features and per-word char features are joined along the last
    # axis before the sentence-level GRU.
    merge = Concatenate()([dropout, rtwo])
    gru_kata = Bidirectional(GRU(self.EMBEDDING_DIM * 3,
                                 return_sequences=True,
                                 dropout=dropout_gru,
                                 recurrent_dropout=rec_dropout),
                             merge_mode=merge_m,
                             weights=None)(merge)
    crf = CRF(len(self.label.index) + 1, learn_mode='marginal')(gru_kata)
    self.model = Model(inputs=[sequence_input, sequence_input_c],
                       outputs=[crf])
    optimizer = self.optimizer
    loss = self.loss
    self.model.summary()
    self.model.compile(loss=loss, optimizer=optimizer, metrics=['acc'])
from keras.utils import to_categorical
from keras import backend as K
import tensorflow as tf
from keras_contrib.layers import CRF
""" Preparing file """
# Build the train and test data holders from the names 'id-ud-train' and
# 'id-ud-test' (presumably dataset file stems resolved inside DL -- TODO
# confirm against the DL implementation).
train = DL('id-ud-train')
test = DL('id-ud-test')
""" Create Word & Label Index """
# The char index is built from the words of both splits joined into one
# sequence; the label index is built from the training split only.
char = DI(train.words + test.words)
label = DI([train.labels ]) # training label and testing label should be the same
# `cnt - 1` is what gets reported as the number of unique entries
# (presumably `cnt` includes one reserved slot -- TODO confirm in DI).
print 'Found', char.cnt - 1, 'unique chars.'
print 'Found', label.cnt - 1, 'unique labels.'
""" Load pre-trained embedding """
# Starts empty; presumably populated by the loop that begins below, whose
# body is outside this excerpt.
char_embeddings_index = {}
# The embedding file path is asked for interactively on stdin.
CE_DIR = raw_input('Enter embedding file name: ')
print 'Loading', CE_DIR, '...'
# NOTE(review): opened without a `with` block; the matching close is not
# visible in this excerpt.
f = open(CE_DIR, 'r')
for line in f: