def test(self, sess, token_ids):
    """Decode one sentence at a time (assumes module-level `import numpy as np`
    and the project's `data_utils` module)."""
    # Pad the source ids and seed the decoder with the GO / EOS symbols.
    token_ids = data_utils.padding(token_ids)
    target_ids = data_utils.padding([data_utils.GO_ID])
    y_ids = data_utils.padding([data_utils.EOS_ID])
    encoder_inputs, decoder_inputs, _ = data_utils.nextRandomBatch(
        [(token_ids, target_ids, y_ids)], batch_size=1)
    prediction = sess.run(self.prediction, feed_dict={
        self.encoder_inputs: encoder_inputs,
        self.decoder_inputs: decoder_inputs})
    # `prediction` is already a NumPy array here, so take the argmax directly;
    # the previous tf.arg_max(...).eval() built a throwaway op and failed
    # whenever no default session was registered.
    # prediction = tf.split(0, self.num_steps, prediction)
    # # This is a greedy decoder - outputs are just argmaxes of output_logits.
    # outputs = [int(np.argmax(predict)) for predict in prediction]
    # # If there is an EOS symbol in outputs, cut them at that point.
    # if data_utils.EOS_ID in outputs:
    #     outputs = outputs[:outputs.index(data_utils.EOS_ID)]
    return np.argmax(prediction, axis=1)
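
# Usage sketch (illustrative, not from the source): decoding one raw sentence
# with test() above. `model`, `vocab`, and `sentence_to_ids` are assumptions
# standing in for this repo's model object and preprocessing helpers.
#
#   with tf.Session() as sess:
#       model.saver.restore(sess, checkpoint_path)
#       token_ids = sentence_to_ids(raw_sentence, vocab)
#       output_ids = model.test(sess, token_ids)  # per-step argmax label ids
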
def predict(word_raw):
    """Tag one raw sentence and return a list of (token, label) pairs."""
    sentence = add_char_information([format_data(word_raw)])
    test_sent = padding(create_matrices(sentence, word2idx, label2idx, case2idx, char2idx))
    sent_batch, _ = create_batches(test_sent)
    tokens, casing, char, labels = sent_batch[0]
    # Add a leading batch dimension of 1 to each model input.
    tokens = np.asarray([tokens])
    casing = np.asarray([casing])
    char = np.asarray([char])
    pred = model.predict([tokens, casing, char], verbose=False)[0]
    # Greedy decoding: keep the highest-scoring label per token.
    pred = pred.argmax(axis=-1)
    pred_sent = list(zip(word_raw.split(), [idx2_label[i] for i in pred]))
    return pred_sent
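
# Usage sketch (illustrative): assumes the module-level objects predict() needs
# (model, word2idx, label2idx, case2idx, char2idx, idx2_label) are loaded; the
# sample sentence and the tags in the comment are made up.
if __name__ == "__main__":
    for token, tag in predict("Steve Jobs founded Apple"):
        print(token, tag)  # e.g. Steve B-PER / Jobs I-PER / founded O / Apple B-ORG
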
def prepare_dataset(sentences, char_to_id, tag_to_id, lower=False, train=True):
    """
    Prepare the dataset. Return a list of dictionaries, one per sentence, with:
        - string: the raw tokens
        - chars:  padded char indexes
        - segs:   padded segmentation features
        - tags:   padded tag indexes
        - seqlen: true (unpadded) sequence length
    """
    length = padding_length(sentences)
    none_index = tag_to_id["O"]

    def f(x):
        return x.lower() if lower else x

    data = []
    for s in sentences:
        string = [w[0] for w in s]
        chars = [char_to_id[f(w) if f(w) in char_to_id else '<UNK>']
                 for w in string]
        # Keep the true length before padding, capped at the padding length.
        sequence_length = min(len(chars), length)
        chars = padding(chars, length)
        segs = get_seg_features("".join(string))
        segs = padding(segs, length)
        if train:
            tags = [tag_to_id[w[-1]] for w in s]
        else:
            # At inference time, fill every position with the "O" tag.
            tags = [none_index for _ in chars]
        tags = padding(tags, length)
        data.append({
            "string": string,
            "chars": chars,
            "segs": segs,
            "tags": tags,
            "seqlen": sequence_length,
        })
    return data
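
# Usage sketch (illustrative): a toy call, assuming the character-per-token
# input this function appears to target (e.g. Chinese NER, hence the
# get_seg_features call). The vocab dicts are made-up stand-ins for the
# mappings built elsewhere in the repo.
toy_sentences = [[["北", "B-LOC"], ["京", "I-LOC"], ["好", "O"]]]
toy_char_to_id = {"<UNK>": 0, "北": 1, "京": 2, "好": 3}
toy_tag_to_id = {"O": 0, "B-LOC": 1, "I-LOC": 2}
data = prepare_dataset(toy_sentences, toy_char_to_id, toy_tag_to_id)
# data[0]["seqlen"] is 3; data[0]["chars"] / ["segs"] / ["tags"] are padded.
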
def embed(self):
    """Create word- and character-level embeddings."""
    label_set = set()
    words = {}

    # Collect the unique words and labels across all splits.
    for dataset in [self.train_sentences, self.dev_sentences, self.test_sentences]:
        for sentence in dataset:
            for token, char, label in sentence:  # token: word, char: list of chars, label: BIO label
                label_set.add(label)
                words[token.lower()] = True

    # Mapping for labels.
    self.label_to_idx = {}
    for label in label_set:
        self.label_to_idx[label] = len(self.label_to_idx)

    # Mapping for token casing, embedded with a one-hot (identity) matrix.
    case_to_idx = {'numeric': 0, 'allLower': 1, 'allUpper': 2, 'initialUpper': 3,
                   'other': 4, 'mainly_numeric': 5, 'contains_digit': 6, 'PADDING_TOKEN': 7}
    self.case_embeddings = np.identity(len(case_to_idx), dtype='float32')

    # Read pretrained GloVe word embeddings, keeping only words seen in the data.
    word_to_idx = {}
    self.word_embeddings = []
    with open("data/glove.50d.txt", encoding="utf-8") as f_embeddings:
        for line in f_embeddings:
            split = line.strip().split(" ")
            word = split[0]  # embedding word entry

            if len(word_to_idx) == 0:  # on the first line, add padding + unknown entries
                word_to_idx["PADDING_TOKEN"] = len(word_to_idx)
                vector = np.zeros(len(split) - 1)  # zero vector for 'PADDING' word
                self.word_embeddings.append(vector)

                word_to_idx["UNKNOWN_TOKEN"] = len(word_to_idx)
                vector = np.random.uniform(-0.25, 0.25, len(split) - 1)
                self.word_embeddings.append(vector)

            if word.lower() in words:
                vector = np.array([float(num) for num in split[1:]])
                self.word_embeddings.append(vector)  # word embedding vector
                word_to_idx[word] = len(word_to_idx)  # corresponding word index

    self.word_embeddings = np.array(self.word_embeddings)

    # Dictionary of all characters that can occur in the data.
    self.char_to_idx = {"PADDING": 0, "UNKNOWN": 1}
    for c in "’ỳ‘°fhXLẹủÀgÂếỒừHơý¼[êớ3BùỜnểỗPứỹAlâ+ÔẵÊ/.Ề-jÓ8CởVqĩẨk* " \
             "òĐỆd4áỏệrUỐỪ>ỮóÐ]ễụRũ²ằự&ZồÕeẶuẽ0wố6ŨẢDSữẩọưQyèO)K³bắvãàÚạ?MÝÁỔỄÙìmặ27ƠỞửÍƯờỉầịĂềổậJđIpõỵẬộ~ôiY–9" \
             "Ầð:FxG!a,5%(ísả…NWỨoỡTẫéú“ợEẻỲză\"ẤẠỷc;ấẳ1”ỰtỖỦ'":
        self.char_to_idx[c] = len(self.char_to_idx)

    # Persist the vocabularies so predict-time code can rebuild the same mappings.
    def write(file_name, data):
        with open(file_name, "w") as f:
            json.dump(data, f)

    write("data/word.json", word_to_idx)
    write("data/label2idx.json", self.label_to_idx)
    write("data/case2idx.json", case_to_idx)
    write("data/char2idx.json", self.char_to_idx)

    # Format: [[word indices], [case indices], [padded char indices], [label indices]]
    self.train_set = padding(
        create_matrices(self.train_sentences, word_to_idx, self.label_to_idx, case_to_idx, self.char_to_idx))
    self.dev_set = padding(
        create_matrices(self.dev_sentences, word_to_idx, self.label_to_idx, case_to_idx, self.char_to_idx))
    self.test_set = padding(
        create_matrices(self.test_sentences, word_to_idx, self.label_to_idx, case_to_idx, self.char_to_idx))

    self.idx_to_label = {v: k for k, v in self.label_to_idx.items()}
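
# Sketch (assumption, not from the source): the matrices built by embed() are
# typically fed into frozen Keras Embedding layers; the names and wiring here
# are illustrative only.
#
#   from keras.layers import Embedding
#   word_layer = Embedding(input_dim=self.word_embeddings.shape[0],
#                          output_dim=self.word_embeddings.shape[1],
#                          weights=[self.word_embeddings],
#                          trainable=False)
#   case_layer = Embedding(input_dim=self.case_embeddings.shape[0],
#                          output_dim=self.case_embeddings.shape[1],
#                          weights=[self.case_embeddings],
#                          trainable=False)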