def batch_generator(sentences, taggings, steps, training=True):
    global word_dict
    while True:
        for i in range(steps):
            batch_sentences = sentences[BATCH_SIZE * i:min(BATCH_SIZE * (i + 1), len(sentences))]
            batch_taggings = taggings[BATCH_SIZE * i:min(BATCH_SIZE * (i + 1), len(taggings))]
            # Word-only variant: max_word_len=1 with an empty char_dict,
            # so the character input is discarded.
            word_input, _ = get_batch_input(batch_sentences,
                                            1,
                                            word_dict,
                                            {},
                                            word_ignore_case=True,
                                            char_ignore_case=False)
            if not training:
                yield word_input, batch_taggings
                continue
            sentence_len = word_input.shape[1]
            for j in range(len(batch_taggings)):
                # Pad each tag sequence to the batch's sentence length,
                # then wrap each tag so labels have shape (batch, seq, 1).
                batch_taggings[j] = batch_taggings[j] + [0] * (sentence_len - len(batch_taggings[j]))
                batch_taggings[j] = [[tag] for tag in batch_taggings[j]]
            batch_taggings = numpy.asarray(batch_taggings)
            yield word_input, batch_taggings
        if not training:
            break
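A minimal sketch of how a generator like this could be plugged into training. Everything here is an assumption for illustration: the compiled single-input `model`, the `sentences_train`/`taggings_train` names, and the epoch count; only `batch_generator` and `BATCH_SIZE` come from the snippet above.

import math

train_steps = int(math.ceil(len(sentences_train) / float(BATCH_SIZE)))  # batches per pass
model.fit_generator(
    generator=batch_generator(sentences_train, taggings_train, train_steps),
    steps_per_epoch=train_steps,
    epochs=10,
)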
def train_batch_generator(batch_size=32, training=True):
    while True:
        sentences = []
        if training:  # use the training set
            data, onehot = sentences_train, onehot_train
        else:  # use the validation set
            data, onehot = sentences_dev, onehot_dev
        batch_ix = random.sample(range(len(data)), batch_size)
        for ix in batch_ix:
            text = data[ix]
            sentences.append(get_word_list_eng(text))
        onehot_labels = onehot[batch_ix, :]
        word_input, char_input = get_batch_input(
            sentences=sentences,
            max_word_len=max_word_len,
            word_dict=word_dict,
            char_dict=char_dict,
            word_ignore_case=True,
            char_ignore_case=False,
        )
        yield [word_input, char_input], onehot_labels
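Since this generator samples random batches rather than iterating the data in order, an epoch is just a fixed number of draws. A hedged usage sketch; the compiled `model` and the step counts are assumptions, not part of the source:

model.fit_generator(
    generator=train_batch_generator(batch_size=32, training=True),
    steps_per_epoch=100,   # random training batches per "epoch"
    validation_data=train_batch_generator(batch_size=32, training=False),
    validation_steps=20,   # random validation batches per epoch
    epochs=5,
)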
def batch_generator(self, sentences, taggings, steps, training=True):
    while True:
        for i in range(steps):
            batch_sentences = sentences[self.BATCH_SIZE * i:min(self.BATCH_SIZE * (i + 1), len(sentences))]
            batch_taggings = taggings[self.BATCH_SIZE * i:min(self.BATCH_SIZE * (i + 1), len(taggings))]
            word_input, char_input = get_batch_input(
                batch_sentences,
                self.max_word_len,
                self.word_dict,
                self.char_dict,
                word_ignore_case=True,
                char_ignore_case=False)
            if not training:
                yield [word_input, char_input], batch_taggings
                continue
            sentence_len = word_input.shape[1]
            for j in range(len(batch_taggings)):
                batch_taggings[j] = batch_taggings[j] + [0] * (sentence_len - len(batch_taggings[j]))
            batch_taggings = self.to_categorical_tensor(
                numpy.asarray(batch_taggings), len(self.TAGS))
            yield [word_input, char_input], batch_taggings
        if not training:
            break
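This variant calls a `to_categorical_tensor` helper that is not shown. A plausible sketch of what it might do, inferred only from the call site: one-hot encode a (batch, sequence) tag matrix into (batch, sequence, num_classes) without flattening it, which older versions of keras.utils.to_categorical could not do. The body below is an assumption, not the author's code.

import numpy

def to_categorical_tensor(self, x, num_classes):
    # x: integer tag ids with shape (batch, sequence)
    batch, seq = x.shape
    one_hot = numpy.eye(num_classes, dtype='float32')[x.reshape(-1)]
    return one_hot.reshape(batch, seq, num_classes)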
def batch_generator(sentences, taggings, steps, training=True):
    global word_dict, char_dict, max_word_len
    while True:
        for i in range(steps):
            batch_sentences = sentences[BATCH_SIZE * i:min(BATCH_SIZE * (i + 1), len(sentences))]
            output_sentences = copy.deepcopy(batch_sentences)
            batch_taggings = taggings[BATCH_SIZE * i:min(BATCH_SIZE * (i + 1), len(taggings))]
            word_input, char_input = get_batch_input(
                batch_sentences,
                max_word_len,
                word_dict,
                char_dict,
                word_ignore_case=True,
                char_ignore_case=False
            )
            if not training:
                # yield [word_input, char_input], batch_taggings  # original
                yield [word_input, char_input], batch_taggings, output_sentences  # also yield the raw text
                # yield [word_input], batch_taggings, output_sentences  # drop the character-level input
                continue
            sentence_len = word_input.shape[1]
            for j in range(len(batch_taggings)):
                batch_taggings[j] = batch_taggings[j] + [0] * (sentence_len - len(batch_taggings[j]))
                batch_taggings[j] = [[tag] for tag in batch_taggings[j]]
            batch_taggings = numpy.asarray(batch_taggings)
            yield [word_input, char_input], batch_taggings
            # yield [word_input, char_input], batch_taggings, output_sentences
        if not training:
            break
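The extra `output_sentences` element lets predictions be aligned with the raw tokens during evaluation, which is presumably why it was added. A sketch of that consumption loop; `model`, `TAGS_INV` (an id-to-tag mapping), `valid_steps`, `sentences_dev`, and `taggings_dev` are all hypothetical names:

for (word_input, char_input), gold, texts in batch_generator(
        sentences_dev, taggings_dev, valid_steps, training=False):
    pred = model.predict([word_input, char_input]).argmax(axis=-1)
    for tokens, tag_ids in zip(texts, pred):
        # strip the padding positions before mapping ids back to tags
        print(list(zip(tokens, [TAGS_INV[t] for t in tag_ids[:len(tokens)]])))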
def test_exceed_len(self):
    sentences = [
        ['All', 'work', 'and', 'no', 'play'],
        ['makes', 'Jack', 'a', 'dull', 'boy', '.'],
    ]
    word_embd_input, char_embd_input = get_batch_input(
        sentences,
        max_word_len=2,
        word_dict={},
        char_dict={},
    )
    self.assertEqual(word_embd_input.shape, (2, 6))
    self.assertEqual(char_embd_input.shape, (2, 6, 2))
def test_batch_generator(batch_size=1):
    # batch_size is accepted for symmetry with the batched variant but is
    # unused: each step yields the single fixed sentence `single_sentence`.
    index = 0
    while index < test_num:
        sentences = []
        index += 1
        sentences.append(get_word_list_eng(single_sentence))
        word_input, char_input = get_batch_input(
            sentences=sentences,
            max_word_len=max_word_len,
            word_dict=word_dict,
            char_dict=char_dict,
            word_ignore_case=True,
            char_ignore_case=False,
        )
        yield [word_input, char_input]
def test_batch_generator(batch_size=32):
    index = 0
    while index < test_num:
        sentences = []
        batch_ix = range(index, min(index + batch_size, test_num))
        index += batch_size
        for ix in batch_ix:
            text = sentences_test[ix]
            sentences.append(get_word_list_eng(text))
        word_input, char_input = get_batch_input(
            sentences=sentences,
            max_word_len=max_word_len,
            word_dict=word_dict,
            char_dict=char_dict,
            word_ignore_case=True,
            char_ignore_case=False,
        )
        yield [word_input, char_input]
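A sketch of driving inference with this generator; `model` is an assumption, and the step count must match the generator's own batching so the finite generator is drained exactly once:

import math

test_steps = int(math.ceil(test_num / 32.0))  # one step per yielded batch
predictions = model.predict_generator(
    generator=test_batch_generator(batch_size=32),
    steps=test_steps,
)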
def test_shape(self):
    sentences = [
        ['All', 'work', 'and', 'no', 'play'],
        ['makes', 'Jack', 'a', 'dull', 'boy', '.'],
    ]
    word_embd_input, char_embd_input = get_batch_input(
        sentences,
        max_word_len=5,
        word_dict={},
        char_dict={},
    )
    self.assertEqual(word_embd_input.shape, (2, 6))
    self.assertEqual(char_embd_input.shape, (2, 6, 5))
    for sentence_index in range(2):
        for word_index in range(6):
            if word_index < len(sentences[sentence_index]):
                self.assertEqual(
                    1,
                    word_embd_input[sentence_index, word_index],
                    (sentence_index, word_index),
                )
                for char_index in range(5):
                    if char_index < len(sentences[sentence_index][word_index]):
                        self.assertEqual(
                            1,
                            char_embd_input[sentence_index, word_index, char_index].tolist(),
                            (sentence_index, word_index),
                        )
                    else:
                        self.assertEqual(
                            0,
                            char_embd_input[sentence_index, word_index, char_index].tolist(),
                            (sentence_index, word_index),
                        )
            else:
                self.assertEqual(
                    0,
                    word_embd_input[sentence_index, word_index],
                    (sentence_index, word_index),
                )
def batch_generator(sentences, taggings, steps, training=True):
    global word_dict, char_dict, max_word_len
    while True:
        for i in range(steps):
            batch_sentences = sentences[BATCH_SIZE * i:min(BATCH_SIZE * (i + 1), len(sentences))]
            batch_taggings = taggings[BATCH_SIZE * i:min(BATCH_SIZE * (i + 1), len(taggings))]
            word_input, char_input = get_batch_input(
                batch_sentences,
                max_word_len,
                word_dict,
                char_dict,
                word_ignore_case=True,
                char_ignore_case=False
            )
            if not training:
                yield [word_input, char_input], batch_taggings
                continue
            sentence_len = word_input.shape[1]
            for j in range(len(batch_taggings)):
                batch_taggings[j] = batch_taggings[j] + [0] * (sentence_len - len(batch_taggings[j]))
            # One-hot encode the padded (batch, seq) tag matrix.
            batch_taggings = keras.utils.to_categorical(numpy.asarray(batch_taggings), len(TAGS))
            yield [word_input, char_input], batch_taggings
        if not training:
            break
def test_mapping(self):
    sentences = [
        ['All', 'work', 'and', 'no', 'play'],
        ['makes', 'Jack', 'a', 'dull', 'boy', '.'],
    ]
    word_embd_input, char_embd_input = get_batch_input(
        sentences,
        max_word_len=5,
        word_dict={'All': 2, 'Work': 0},
        char_dict={'a': 3},
        word_ignore_case=False,
        char_ignore_case=False,
    )
    self.assertEqual(word_embd_input.shape, (2, 6))
    self.assertEqual(char_embd_input.shape, (2, 6, 5))
    for sentence_index in range(2):
        for word_index in range(6):
            if word_index < len(sentences[sentence_index]):
                if sentences[sentence_index][word_index] == 'All':
                    self.assertEqual(
                        2,
                        word_embd_input[sentence_index, word_index],
                        (sentence_index, word_index),
                    )
                else:
                    self.assertEqual(
                        1,
                        word_embd_input[sentence_index, word_index],
                        (sentence_index, word_index),
                    )
                for char_index in range(5):
                    if char_index < len(sentences[sentence_index][word_index]):
                        if sentences[sentence_index][word_index][char_index] == 'a':
                            self.assertEqual(
                                3,
                                char_embd_input[sentence_index, word_index, char_index].tolist(),
                                (sentence_index, word_index),
                            )
                        else:
                            self.assertEqual(
                                1,
                                char_embd_input[sentence_index, word_index, char_index].tolist(),
                                (sentence_index, word_index),
                            )
                    else:
                        self.assertEqual(
                            0,
                            char_embd_input[sentence_index, word_index, char_index].tolist(),
                            (sentence_index, word_index),
                        )
            else:
                self.assertEqual(
                    0,
                    word_embd_input[sentence_index, word_index],
                    (sentence_index, word_index),
                )
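Taken together, the three tests pin down the contract of `get_batch_input`: sentences are padded to the longest sentence in the batch, words are padded or truncated to `max_word_len` (see `test_exceed_len`), index 0 marks padding, index 1 is the out-of-dictionary default, and dictionary hits keep their mapped index. A minimal behavioral sketch of just that contract, not the library's actual code:

import numpy

def get_batch_input_sketch(sentences, max_word_len, word_dict, char_dict,
                           word_ignore_case=False, char_ignore_case=False):
    # 0 = padding, 1 = unknown word/char, per the tests above.
    sentence_len = max(len(s) for s in sentences)
    word_input = numpy.zeros((len(sentences), sentence_len), dtype='int32')
    char_input = numpy.zeros((len(sentences), sentence_len, max_word_len), dtype='int32')
    for i, sentence in enumerate(sentences):
        for j, word in enumerate(sentence):
            key = word.lower() if word_ignore_case else word
            word_input[i, j] = word_dict.get(key, 1)
            for k, ch in enumerate(word[:max_word_len]):  # truncate long words
                ch_key = ch.lower() if char_ignore_case else ch
                char_input[i, j, k] = char_dict.get(ch_key, 1)
    return word_input, char_input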