def predict(sentence, model):
    """Tag every whitespace-separated token of *sentence* with an entity label.

    The sentence is split on whitespace, run through the same preprocessing
    helpers used elsewhere in this file (``addCharInformatioin``,
    ``createMatrices``, ``padding``, ``createBatches``) and fed to
    ``model.predict``. The module-level lookup tables ``word2Idx``,
    ``label2Idx``, ``case2Idx``, ``char2Idx`` and ``idx2Label`` must already
    be populated.

    Args:
        sentence: Raw input text; tokens are obtained with ``str.split()``.
        model: Object exposing ``predict([tokens, casing, char], verbose=...)``
            whose first output row supports ``argmax(axis=-1)``.

    Returns:
        A list of ``(token, label)`` tuples, one per token, in input order.
        An empty or whitespace-only sentence yields ``[]``.
    """
    words_list = sentence.split()
    if not words_list:
        # Nothing to tag; avoid pushing an empty sample through the pipeline
        # (the original indexed predLabels[-1] and failed on such input).
        return []

    # Each token is paired with the dummy label 'O\n' because the matrix
    # builder expects a label column; presumably 'O\n' is a key of label2Idx
    # -- verify against createMatrices.
    sen_list = [[[word, 'O\n'] for word in words_list]]
    test = addCharInformatioin(sen_list)
    test_set = padding(
        createMatrices(test, word2Idx, label2Idx, case2Idx, char2Idx))
    test_batch, _ = createBatches(test_set)

    predLabels = []
    for tokens, casing, char, _labels in test_batch:
        # Wrap each feature in an outer list so the model sees a batch of one.
        tokens = np.asarray([tokens])
        casing = np.asarray([casing])
        char = np.asarray([char])
        scores = model.predict([tokens, casing, char], verbose=False)[0]
        # Highest-scoring class index for every token position.
        predLabels.append(scores.argmax(axis=-1))

    # Only the last prediction is reported, as in the original code.
    # NOTE(review): assumes it has one entry per token -- confirm.
    entity_labels = [
        (word, idx2Label[int(label_idx)])
        for word, label_idx in zip(words_list, predLabels[-1])
    ]
    print("predLabels", entity_labels)
    return entity_labels
vector = np.random.uniform(-0.25, 0.25, len(split) - 1) wordEmbeddings.append(vector) if split[0].lower() in words: vector = np.array([float(num) for num in split[1:]]) wordEmbeddings.append(vector) word2Idx[split[0]] = len(word2Idx) wordEmbeddings = np.array(wordEmbeddings) char2Idx = {"PADDING": 0, "UNKNOWN": 1} for c in " 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ.,-_()[]{}!?:;#'\"/\\%$`&=*+@^~|": char2Idx[c] = len(char2Idx) train_set = padding( createMatrices(trainSentences, word2Idx, label2Idx, case2Idx, char2Idx)) dev_set = padding( createMatrices(devSentences, word2Idx, label2Idx, case2Idx, char2Idx)) test_set = padding( createMatrices(testSentences, word2Idx, label2Idx, case2Idx, char2Idx)) idx2Label = {v: k for k, v in label2Idx.items()} np.save("models/idx2Label.npy", idx2Label) np.save("models/word2Idx.npy", word2Idx) train_batch, train_batch_len = createBatches(train_set) dev_batch, dev_batch_len = createBatches(dev_set) test_batch, test_batch_len = createBatches(test_set) words_input = Input(shape=(None, ), dtype='int32', name='words_input') words = Embedding(input_dim=wordEmbeddings.shape[0],
def embed(self):
    """Create word- and character-level embeddings.

    Reads ``self.trainSentences``, ``self.devSentences`` and
    ``self.testSentences`` (each sentence being an iterable of
    ``(token, chars, label)`` triples) and sets:

    * ``self.label2Idx`` / ``self.idx2Label`` -- label <-> index mappings.
    * ``self.caseEmbeddings`` -- float32 identity matrix, one row per casing
      class.
    * ``self.wordEmbeddings`` -- array built from
      ``embeddings/glove.6B.50d.txt``: a zero vector for ``PADDING_TOKEN``,
      a random vector for ``UNKNOWN_TOKEN``, then one row per embedding
      entry whose lower-cased word occurs in the data.
    * ``self.char2Idx`` -- character -> index mapping.
    * ``self.train_set`` / ``self.dev_set`` / ``self.test_set`` -- the result
      of ``padding(createMatrices(...))`` for each split.

    Raises:
        OSError: If the embeddings file cannot be opened.
    """
    # Collect every label and every lower-cased token across all splits.
    labelSet = set()
    words = set()
    for dataset in (self.trainSentences, self.devSentences,
                    self.testSentences):
        for sentence in dataset:
            for token, char, label in sentence:
                labelSet.add(label)
                words.add(token.lower())

    # Labels are numbered in set-iteration order.
    # NOTE(review): for string labels this order can differ between
    # interpreter runs (hash randomisation), so the mapping is not stable
    # across runs unless it is persisted alongside the model.
    self.label2Idx = {}
    for label in labelSet:
        self.label2Idx[label] = len(self.label2Idx)

    # Mapping for token casing classes.
    case2Idx = {
        'numeric': 0,
        'allLower': 1,
        'allUpper': 2,
        'initialUpper': 3,
        'other': 4,
        'mainly_numeric': 5,
        'contains_digit': 6,
        'PADDING_TOKEN': 7,
    }
    # One-hot casing vectors: row i is the embedding of casing class i.
    self.caseEmbeddings = np.identity(len(case2Idx), dtype='float32')

    # Read the GloVe word embeddings; the context manager guarantees the
    # file handle is released even if parsing a line fails.
    word2Idx = {}
    self.wordEmbeddings = []
    with open("embeddings/glove.6B.50d.txt", encoding="utf-8") as fEmbeddings:
        for line in fEmbeddings:
            split = line.strip().split(" ")
            word = split[0]  # first field is the word, the rest its vector

            if not word2Idx:
                # First line only: reserve index 0 for padding (zero vector)
                # and index 1 for unknown words (random vector). The vector
                # width is taken from this first line.
                word2Idx["PADDING_TOKEN"] = len(word2Idx)
                self.wordEmbeddings.append(np.zeros(len(split) - 1))

                word2Idx["UNKNOWN_TOKEN"] = len(word2Idx)
                self.wordEmbeddings.append(
                    np.random.uniform(-0.25, 0.25, len(split) - 1))

            # Keep only embeddings for words that occur in the data; the
            # match is case-insensitive but the original spelling is the key.
            if word.lower() in words:
                vector = np.array([float(num) for num in split[1:]])
                self.wordEmbeddings.append(vector)
                word2Idx[word] = len(word2Idx)

    self.wordEmbeddings = np.array(self.wordEmbeddings)

    # Dictionary of all supported characters; 0 and 1 are reserved.
    self.char2Idx = {"PADDING": 0, "UNKNOWN": 1}
    for c in " 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ.,-_()[]{}!?:;#'\"/\\%$`&=*+@^~|<>":
        self.char2Idx[c] = len(self.char2Idx)

    # Original author's note on the resulting format:
    # [[wordindices], [caseindices], [padded word indices], [label indices]]
    # -- not verifiable from this block; see createMatrices / padding.
    self.train_set = padding(
        createMatrices(self.trainSentences, word2Idx, self.label2Idx,
                       case2Idx, self.char2Idx))
    self.dev_set = padding(
        createMatrices(self.devSentences, word2Idx, self.label2Idx,
                       case2Idx, self.char2Idx))
    self.test_set = padding(
        createMatrices(self.testSentences, word2Idx, self.label2Idx,
                       case2Idx, self.char2Idx))

    # Inverse mapping used to turn predicted indices back into labels.
    self.idx2Label = {v: k for k, v in self.label2Idx.items()}
word2Idx["UNKNOWN_TOKEN"] = len(word2Idx) vector = np.random.uniform(-0.25, 0.25, len(split)-1) wordEmbeddings.append(vector) if split[0].lower() in words: vector = np.array([float(num) for num in split[1:]]) wordEmbeddings.append(vector) word2Idx[split[0]] = len(word2Idx) wordEmbeddings = np.array(wordEmbeddings) char2Idx = {"PADDING":0, "UNKNOWN":1} for c in " 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ.,-_()[]{}!?:;#'\"/\\%$`&=*+@^~|": char2Idx[c] = len(char2Idx) train_set = padding(createMatrices(trainSentences,word2Idx, label2Idx, case2Idx,char2Idx)) dev_set = padding(createMatrices(devSentences,word2Idx, label2Idx, case2Idx,char2Idx)) test_set = padding(createMatrices(testSentences, word2Idx, label2Idx, case2Idx,char2Idx)) idx2Label = {v: k for k, v in label2Idx.items()} np.save("models/idx2Label.npy",idx2Label) np.save("models/word2Idx.npy",word2Idx) train_batch,train_batch_len = createBatches(train_set) dev_batch,dev_batch_len = createBatches(dev_set) test_batch,test_batch_len = createBatches(test_set) words_input = Input(shape=(None,),dtype='int32',name='words_input') words = Embedding(input_dim=wordEmbeddings.shape[0], output_dim=wordEmbeddings.shape[1], weights=[wordEmbeddings], trainable=False)(words_input) casing_input = Input(shape=(None,), dtype='int32', name='casing_input')