def predict(sentence, model):
    """Run the NER model on one whitespace-tokenized sentence.

    Parameters
    ----------
    sentence : str
        Raw input sentence; tokens are obtained via ``str.split()``.
    model : keras.Model
        Trained model that accepts ``[tokens, casing, char]`` inputs.

    Returns
    -------
    list[tuple[str, str]]
        ``(word, predicted_label)`` pairs, one per input token.
    """
    # Wrap the sentence in the dataset format: a list of sentences, each a
    # list of [token, label] pairs. 'O\n' is a dummy gold label — labels are
    # ignored at prediction time but required by the preprocessing helpers.
    sen_list = [[[tok, 'O\n'] for tok in sentence.split()]]

    test = addCharInformatioin(sen_list)
    test_set = padding(
        createMatrices(test, word2Idx, label2Idx, case2Idx, char2Idx))
    # Batch lengths are not needed for a single sentence.
    test_batch, _ = createBatches(test_set)

    predLabels = []
    for tokens, casing, char, labels in test_batch:
        # Add the batch dimension the model expects.
        tokens = np.asarray([tokens])
        casing = np.asarray([casing])
        char = np.asarray([char])
        pred = model.predict([tokens, casing, char], verbose=False)[0]
        # argmax over the label axis -> one class index per token.
        predLabels.append(pred.argmax(axis=-1))

    # Pair each word with its predicted label. zip() replaces the original
    # manual index counter and cannot run past the shorter sequence.
    entity_labels = [
        (word, idx2Label[int(label_idx)])
        for word, label_idx in zip(sentence.split(), predLabels[-1])
    ]
    print("predLabels", entity_labels)
    return entity_labels
def test(model, test_set, idx2Label, package):
    """Evaluate the model on *test_set* and report precision/recall/F1."""
    batches, batch_lengths = createBatches(test_set)
    predicted, gold = tag_dataset(batches, model, package)
    precision, recall, f1 = compute_f1(predicted, gold, idx2Label)
    print("Test-Data: Prec: %.3f, Rec: %.3f, F1: %.3f" %
          (precision, recall, f1))
    return precision, recall, f1
def train(model, train_set, epochs, package):
    """Fit *model* on *train_set* for *epochs* passes and return it.

    The model architecture name on *package* selects which input tensors
    are fed to ``model.fit``.
    """
    train_batch, train_batch_len = createBatches(train_set)
    for epoch in range(epochs):
        print("Epoch %d/%d" % (epoch, epochs))
        progress = Progbar(len(train_batch_len))
        for step, batch in enumerate(
                iterate_minibatches(train_batch, train_batch_len)):
            labels, tokens, casing, char = batch
            # Choose the input set for this architecture; anything else
            # is (as in the original) silently skipped.
            if package.modelName == "LSTM_word":
                inputs = [tokens]
            elif package.modelName == "LSTM_word_char":
                inputs = [tokens, casing, char]
            else:
                inputs = None
            if inputs is not None:
                with tf.device('/gpu:0'):
                    model.fit(inputs, labels, verbose=0)
            progress.update(step)
        print(' ')
    return model
char2Idx = {"PADDING": 0, "UNKNOWN": 1} for c in " 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ.,-_()[]{}!?:;#'\"/\\%$`&=*+@^~|": char2Idx[c] = len(char2Idx) train_set = padding( createMatrices(trainSentences, word2Idx, label2Idx, case2Idx, char2Idx)) dev_set = padding( createMatrices(devSentences, word2Idx, label2Idx, case2Idx, char2Idx)) test_set = padding( createMatrices(testSentences, word2Idx, label2Idx, case2Idx, char2Idx)) idx2Label = {v: k for k, v in label2Idx.items()} np.save("models/idx2Label.npy", idx2Label) np.save("models/word2Idx.npy", word2Idx) train_batch, train_batch_len = createBatches(train_set) dev_batch, dev_batch_len = createBatches(dev_set) test_batch, test_batch_len = createBatches(test_set) words_input = Input(shape=(None, ), dtype='int32', name='words_input') words = Embedding(input_dim=wordEmbeddings.shape[0], output_dim=wordEmbeddings.shape[1], weights=[wordEmbeddings], trainable=False)(words_input) casing_input = Input(shape=(None, ), dtype='int32', name='casing_input') casing = Embedding(output_dim=caseEmbeddings.shape[1], input_dim=caseEmbeddings.shape[0], weights=[caseEmbeddings], trainable=False)(casing_input) character_input = Input(shape=( None,
# Casing-feature vocabulary; the identity matrix turns each case class into
# a fixed one-hot embedding row.
case2Idx = {
    'numeric': 0,
    'allLower': 1,
    'allUpper': 2,
    'initialUpper': 3,
    'other': 4,
    'mainly_numeric': 5,
    'contains_digit': 6,
    'PADDING_TOKEN': 7,
}
caseEmbeddings = np.identity(len(case2Idx), dtype='float32')

# Training split as padded index matrices, then batched by sentence length.
train_set = padding(
    createMatrices(trainSentences, word2Idx, label2Idx, case2Idx, char2Idx))
train_batch, train_batch_len = createBatches(train_set)

# Word input: frozen pretrained embeddings.
words_input = Input(shape=(None,), dtype='int32', name='words_input')
words = Embedding(input_dim=wordEmbeddings.shape[0],
                  output_dim=wordEmbeddings.shape[1],
                  weights=[wordEmbeddings],
                  trainable=False)(words_input)

# Casing input: frozen one-hot casing embeddings.
casing_input = Input(shape=(None,), dtype='int32', name='casing_input')
casing = Embedding(output_dim=caseEmbeddings.shape[1],
                   input_dim=caseEmbeddings.shape[0],
                   weights=[caseEmbeddings],
                   trainable=False)(casing_input)

# Character input: variable number of words, 52 chars per word.
character_input = Input(shape=(None, 52,), name='char_input')
def createBatches(self):
    """Batch the train/dev/test splits and cache the results on self.

    Delegates each split to the module-level ``createBatches`` function
    (this method shadows, but does not recurse into, that name).
    """
    for split in ("train", "dev", "test"):
        batch, batch_len = createBatches(getattr(self, split + "_set"))
        setattr(self, split + "_batch", batch)
        setattr(self, split + "_batch_len", batch_len)
wordEmbeddings = np.array(wordEmbeddings)

# Character vocabulary: 0 = padding, 1 = unknown; the remaining characters
# (including a handful of raw control / legacy windows-1252 bytes) each get
# the next free index.
char2Idx = {"PADDING": 0, "UNKNOWN": 1}
s = " 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ.,-_()[]{}!?:;#'\"/\\%$`&=*+@^~|<>"
s = s + '\t' + '\n' + '\x97' + '\x92' + '\x93' + '\x94' + '\xc2'
for ch in s:
    char2Idx[ch] = len(char2Idx)

# Convert each split to padded index matrices, then batch by length.
train_set = padding(
    createMatrices(trainSentences, word2Idx, label2Idx, case2Idx, char2Idx))
learn_set = padding(
    createMatrices(learnSentences, word2Idx, label2Idx, case2Idx, char2Idx))
test_set = padding(
    createMatrices(testSentences, word2Idx, label2Idx, case2Idx, char2Idx))

idx2Label = {v: k for k, v in label2Idx.items()}

train_batch, train_batch_len = createBatches(train_set)
learn_batch, learn_batch_len = createBatches(learn_set)
test_batch, test_batch_len = createBatches(test_set)

# modelPackage bundles the embeddings, index maps, and model/dataset names
# so downstream helpers receive a single configuration object.
modelPackage = ModelPackage(wordEmbeddings, caseEmbeddings, word2Idx,
                            label2Idx, char2Idx, modelName, datasetName)
print(modelPackage.modelName)
model = createModel(modelPackage)
modelPackage.model = model
# plot_model(model, to_file='model.png')

precision = 0
# NOTE(review): "recoil" is presumably a misspelling of "recall"; the name is
# kept because later (unseen) code may reference it — confirm before renaming.
recoil = 0