def convertCharText2Int(dataload):
    """Convert char-level text data to padded integer matrices.

    For each sentence in ``dataload.words`` builds a char-index matrix via
    ``DM``, pads every token to the (module-global) ``char_padsize`` chars,
    then pads every sentence up to the module-global ``padsize`` tokens by
    prepending all-zero rows.

    Side effects: grows the module-global ``char_padsize`` when a longer
    token is seen, and prints the final padding sizes.

    Returns a list with one (padsize x char_padsize) matrix per sentence;
    entries that needed no token padding keep whatever type ``DM.padded``
    produced (not converted to an ndarray), matching the original behavior.
    """
    global char_padsize
    # Pass 1: map chars to ints and track the longest token seen so far.
    x_tmp1 = []
    for sent in dataload.words:
        x_map = DM(sent, char.index, False)
        if x_map.padsize > char_padsize:
            char_padsize = x_map.padsize
        x_tmp1.append(x_map)
    # Pass 2: pad every token to char_padsize chars.
    x_tmp2 = []
    for sent in x_tmp1:
        sent.pad(char_padsize)
        x_tmp2.append(sent.padded)
    print('Padded until %s chars.' % char_padsize)
    # A single all-zero "token" row used as sentence-level padding.
    zeroes = [0] * char_padsize
    x_char = []
    for sent in x_tmp2:
        pad = padsize - len(sent)
        if pad > 0:
            # One vstack with all the zero rows at once: O(rows) instead of
            # the original one-vstack-per-row loop, which was O(rows^2).
            padded_sent = np.vstack([zeroes] * pad + [sent])
        else:
            padded_sent = sent
        x_char.append(padded_sent)
    print('Padded until %s tokens.' % padsize)
    return x_char
def predict(self, text): self.textinput = text self.test = DP(text) self.x_test = DM(self.test.words, self.word.index) print "Number of OOV:", len(self.x_test.oov_index) print "OOV word occurences:", self.x_test.oov self.x_test.pad(self.padsize) print('Padded until %s tokens.' % self.padsize) self.x_test_char = self.convertCharText2Int(self.test) self.results = [] print "Computing..." print self.x_test.padded print self.x_test_char raw_results = self.model.predict( [np.array(self.x_test.padded), np.array(self.x_test_char)]) for raw_result in raw_results: result = [] for token in raw_result: value = np.argmax(token) result.append(value) self.results.append(result) temp = self.results[0] li = self.label.index keys = li.keys() values = li.values() self.results = [] start = False for token in temp: if token != 0: start = True if start: if token == 0: self.results.append('?') else: self.results.append(keys[values.index(token)]) print self.test.words[0] print self.results self.data = {'words': self.test.words[0], 'labels': self.results} self.json_data = json.dumps(self.data) return self.json_data
# NOTE(review): this chunk begins mid-loop -- the `for chars, i in
# char.index.items():` header appears to precede it outside this view
# (compare the complete loop elsewhere in the file); the first three
# statements below are that loop's body.
    embedding_vector = char_embeddings_index.get(chars)
    if embedding_vector is not None:
        char_embedding_matrix[i] = embedding_vector
    else:
        # Track chars with no pretrained vector; their rows stay zero.
        char_notfound.append(chars)
print('%s unique chars not found in embedding.' % len(char_notfound))
"""
Converting word text data to int using index
"""
# NOTE(review): Python 2 input() evaluates the typed expression -- unsafe
# on untrusted input; raw_input() + explicit conversion would be safer.
trimlen = input('Enter trimming length (default 63.5): ')
train.trim(trimlen)
test.trim(trimlen)
# val.trim(trimlen)
x_train = DM(train.words, word.index)
x_test = DM(test.words, word.index)
# x_val = DM(val.words, word.index)
print "Number of OOV:", len(x_test.oov_index)
print "OOV word occurences:", x_test.oov
# print "Number of OOV (val):", len(x_val.oov_index)
# print "OOV word occurences (val):", x_val.oov
# Pad both splits to the longer of the two sentence lengths.
padsize = max([x_train.padsize, x_test.padsize])
x_train.pad(padsize)
x_test.pad(padsize)
# x_val.pad(padsize)
print('Padded until %s tokens.' % padsize)
y_train = DM(train.labels, label.index)
y_test = DM(test.labels, label.index)
# y_val = DM(val.labels, label.index)
char_notfound = [] # list kata yang tidak terdapat dalam embedding char_embedding_matrix = np.zeros( (len(char.index) + 1, int(CHAR_EMBEDDING_DIM))) for chars, i in char.index.items(): embedding_vector = char_embeddings_index.get(chars) if embedding_vector is not None: char_embedding_matrix[i] = embedding_vector else: char_notfound.append(chars) print('%s unique chars not found in embedding.' % len(char_notfound)) """ Converting word text data to int using index """ x_train = DM(train.words, word.index) x_test = DM(test.words, word.index) padsize = max([x_train.padsize, x_test.padsize]) x_train.pad(padsize) print('Padded until %s tokens.' % padsize) y_train = DM(train.labels, label.index) y_test = DM(test.labels, label.index) y_train.pad(padsize) y_encoded = to_categorical(y_train.padded) """ Converting char text data to int using index """
def createModel(self, traindata, valdata, testdata, wordemb, charemb):
    """Load data and pretrained embeddings, convert text to padded int
    matrices, and build + compile the word/char bidirectional-GRU + CRF
    model into ``self.model``.

    Args:
        traindata/valdata/testdata: paths or sources accepted by ``DL``.
        wordemb/charemb: pretrained embedding sources accepted by
            ``self.pretrainedEmbeddingLoader``.
    """
    self.train = DL(traindata)
    self.val = DL(valdata)
    self.test = DL(testdata)
    # Load pre-trained embedding
    embeddings_index, we_words = self.pretrainedEmbeddingLoader(wordemb)
    char_embeddings_index, ce_words = self.pretrainedEmbeddingLoader(
        charemb)
    # Create Word & Label Index
    # NOTE(review): char uses list concatenation while word/label wrap
    # their inputs in an extra list -- presumably DI accepts both forms;
    # confirm against DI's definition.
    self.char = DI(self.train.words + ce_words)
    self.word = DI([self.train.words, [we_words]])
    self.label = DI([self.train.labels])
    print 'Found', self.word.cnt - 1, 'unique words.'
    print 'Found', self.char.cnt - 1, 'unique chars.'
    print 'Found', self.label.cnt - 1, 'unique labels.'
    # Create word embedding matrix; rows for OOV words stay all-zero.
    self.EMBEDDING_DIM = len(self.coefs)
    embedding_matrix = np.zeros(
        (len(self.word.index) + 1, int(self.EMBEDDING_DIM)))
    for wrd, i in self.word.index.items():
        embedding_vector = embeddings_index.get(wrd)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    # Create char embedding matrix
    # NOTE(review): sized with self.EMBEDDING_DIM, but the reshape lambdas
    # below use self.CHAR_EMBEDDING_DIM -- only consistent if the two dims
    # are equal; confirm.
    char_embedding_matrix = np.zeros(
        (len(self.char.index) + 1, int(self.EMBEDDING_DIM)))
    for chars, i in self.char.index.items():
        embedding_vector = char_embeddings_index.get(chars)
        if embedding_vector is not None:
            char_embedding_matrix[i] = embedding_vector
    # Trim all splits to a fixed sentence length, then convert to ints.
    trimlen = self.padsize
    self.train.trim(trimlen)
    self.test.trim(trimlen)
    self.val.trim(trimlen)
    self.x_train = DM(self.train.words, self.word.index)
    self.x_test = DM(self.test.words, self.word.index)
    self.x_val = DM(self.val.words, self.word.index)
    print "Number of OOV:", len(self.x_test.oov_index)
    print "OOV word occurences:", self.x_test.oov
    print "Number of OOV (val):", len(self.x_val.oov_index)
    print "OOV word occurences (val):", self.x_val.oov
    padsize = self.padsize
    self.x_train.pad(padsize)
    self.x_test.pad(padsize)
    self.x_val.pad(padsize)
    print('Padded until %s tokens.'
          % padsize)
    self.y_train = DM(self.train.labels, self.label.index)
    self.y_test = DM(self.test.labels, self.label.index)
    self.y_val = DM(self.val.labels, self.label.index)
    self.y_train.pad(padsize)
    self.y_test.pad(padsize)
    self.y_val.pad(padsize)
    # One-hot encode labels for training/validation.
    self.y_encoded = to_categorical(self.y_train.padded)
    self.y_val_enc = to_categorical(self.y_val.padded)
    # Converting char text data to int using index
    self.x_test_char = self.convertCharText2Int(self.test)
    self.x_train_char = self.convertCharText2Int(self.train)
    self.x_val_char = self.convertCharText2Int(self.val)
    # Create keras word model
    MAX_SEQUENCE_LENGTH = self.padsize
    embedding_layer = Embedding(len(self.word.index) + 1,
                                self.EMBEDDING_DIM,
                                weights=[embedding_matrix],
                                input_length=MAX_SEQUENCE_LENGTH,
                                mask_zero=self.mask)
    sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH, ), dtype='int32')
    embedded_sequences = embedding_layer(sequence_input)
    drop = self.dropout_embedding
    dropout = Dropout(rate=drop)(embedded_sequences)

    # Create keras char model
    def reshape_one(c):
        # Flatten (batch, padsize, char_padsize, dim) so each token's
        # char sequence becomes one GRU input row.
        return K.reshape(c, (tf.shape(c)[0] * self.padsize,
                             self.char_padsize,
                             self.CHAR_EMBEDDING_DIM))

    def reshape_two(c):
        # Restore the batch dimension after the char GRU; the bi-GRU's
        # 'concat' merge doubles the feature dimension.
        # NOTE(review): closes over merge_m_c, which is assigned below
        # before this lambda is invoked; also uses Python 2 `/` (floor
        # division on ints) -- confirm intended under Python 3.
        if merge_m_c == 'concat':
            return K.reshape(c, (tf.shape(c)[0] / self.padsize,
                                 self.padsize,
                                 self.CHAR_EMBEDDING_DIM * 2))
        else:
            return K.reshape(c, (tf.shape(c)[0] / self.padsize,
                                 self.padsize,
                                 self.CHAR_EMBEDDING_DIM))

    MAX_WORD_LENGTH = self.char_padsize
    embedding_layer_c = Embedding(len(self.char.index) + 1,
                                  self.EMBEDDING_DIM,
                                  weights=[char_embedding_matrix],
                                  input_length=MAX_WORD_LENGTH,
                                  mask_zero=self.mask)
    sequence_input_c = Input(shape=(
        self.padsize,
        MAX_WORD_LENGTH,
    ), dtype='int32')
    embedded_sequences_c = embedding_layer_c(sequence_input_c)
    dropout_c = Dropout(rate=drop)(embedded_sequences_c)
    rone = Lambda(reshape_one)(dropout_c)
    merge_m = 'concat'
    merge_m_c = merge_m
    dropout_gru = self.dropout_gru
    rec_dropout = dropout_gru
    # Char-level bi-GRU: one summary vector per token
    # (return_sequences=False).
    gru_karakter = Bidirectional(GRU(self.CHAR_EMBEDDING_DIM,
                                     return_sequences=False,
                                     dropout=dropout_gru,
                                     recurrent_dropout=rec_dropout),
                                 merge_mode=merge_m,
                                 weights=None)(rone)
    rtwo = Lambda(reshape_two)(gru_karakter)
    # Combine word + char model
    merge_m = 'concat'
    merge = Concatenate()([dropout, rtwo])
    # Word-level bi-GRU over concatenated word+char features, then a CRF
    # output layer in marginal mode.
    gru_kata = Bidirectional(GRU(self.EMBEDDING_DIM * 3,
                                 return_sequences=True,
                                 dropout=dropout_gru,
                                 recurrent_dropout=rec_dropout),
                             merge_mode=merge_m,
                             weights=None)(merge)
    crf = CRF(len(self.label.index) + 1, learn_mode='marginal')(gru_kata)
    self.model = Model(inputs=[sequence_input, sequence_input_c],
                       outputs=[crf])
    optimizer = self.optimizer
    loss = self.loss
    self.model.summary()
    self.model.compile(loss=loss, optimizer=optimizer, metrics=['acc'])
# NOTE(review): this chunk begins mid-loop -- the `for chars, i in
# char.index.items():` header appears to precede it outside this view;
# the first three statements below are that loop's body. It also ends
# mid-construction (`zeroes = []` is filled after this view).
    embedding_vector = char_embeddings_index.get(chars)
    if embedding_vector is not None:
        char_embedding_matrix[i] = embedding_vector
    else:
        # Track chars with no pretrained vector; their rows stay zero.
        char_notfound.append(chars)
print('%s unique chars not found in embedding.' % len(char_notfound))
"""
Converting text data to int using index
"""
padsize = 188  # NOTE(review): hard-coded sentence length -- confirm source
x_test_tmp1 = []
char_padsize = 0
# Map each sentence's chars to ints and track the longest token seen.
for sent in train.words:
    x_map = DM(sent, char.index)
    if x_map.padsize > char_padsize:
        char_padsize = x_map.padsize
    x_test_tmp1.append(x_map)
# x_test = DM(test.words[0], char.index)
# char_padsize = max([x_train.char_padsize, x_test.char_padsize])
# Pad every token to char_padsize chars.
x_test_tmp2 = []
for sent in x_test_tmp1:
    sent.pad(char_padsize)
    x_test_tmp2.append(sent.padded)
print('Padded until %s chars.' % char_padsize)
zeroes = []