Пример #1
0
def batch_generator(sentences, taggings, steps, training=True):
    """Cycle over the data in BATCH_SIZE-sized chunks and yield model input.

    In training mode, taggings are zero-padded to the batch's sentence
    length and wrapped to one-element lists per tag before being converted
    to an array; the generator loops forever. In evaluation mode the raw
    taggings are yielded unchanged and iteration stops after one pass.
    """
    global word_dict
    while True:
        for step in range(steps):
            start = BATCH_SIZE * step
            stop = BATCH_SIZE * (step + 1)
            batch_sentences = sentences[start:min(stop, len(sentences))]
            batch_taggings = taggings[start:min(stop, len(taggings))]
            # Only the word-level input is needed here: max_word_len is 1
            # and the char dict is empty, so the char input is discarded.
            word_input, _ = get_batch_input(batch_sentences,
                                            1,
                                            word_dict, {},
                                            word_ignore_case=True,
                                            char_ignore_case=False)
            if not training:
                yield word_input, batch_taggings
                continue
            seq_len = word_input.shape[1]
            padded = []
            for tags in batch_taggings:
                tags = tags + [0] * (seq_len - len(tags))
                padded.append([[tag] for tag in tags])
            yield word_input, numpy.asarray(padded)
        if not training:
            break
Пример #2
0
def train_batch_generator(batch_size=32, training=True):
    """Endlessly yield random batches of ([word, char] inputs, one-hot labels).

    Samples `batch_size` indices without replacement from the training set
    when `training` is True, otherwise from the validation (dev) set.
    """
    while True:
        # Pick the data source and its label matrix for this batch.
        if training:
            data, labels = sentences_train, onehot_train
        else:
            data, labels = sentences_dev, onehot_dev
        batch_ix = random.sample(range(len(data)), batch_size)
        sentences = [get_word_list_eng(data[ix]) for ix in batch_ix]
        onehot_labels = labels[batch_ix, :]

        word_input, char_input = get_batch_input(
            sentences=sentences,
            max_word_len=max_word_len,
            word_dict=word_dict,
            char_dict=char_dict,
            word_ignore_case=True,
            char_ignore_case=False,
        )
        yield [word_input, char_input], onehot_labels
Пример #3
0
 def batch_generator(self, sentences, taggings, steps, training=True):
     """Yield ([word_input, char_input], taggings) batches.

     Training mode zero-pads each tagging sequence to the batch's sentence
     length and one-hot encodes it via to_categorical_tensor, looping
     forever; evaluation mode yields the raw taggings and stops after a
     single pass over `steps` batches.
     """
     while True:
         for step in range(steps):
             start = self.BATCH_SIZE * step
             stop = self.BATCH_SIZE * (step + 1)
             batch_sentences = sentences[start:min(stop, len(sentences))]
             batch_taggings = taggings[start:min(stop, len(taggings))]
             word_input, char_input = get_batch_input(
                 batch_sentences,
                 self.max_word_len,
                 self.word_dict,
                 self.char_dict,
                 word_ignore_case=True,
                 char_ignore_case=False)
             if not training:
                 yield [word_input, char_input], batch_taggings
                 continue
             seq_len = word_input.shape[1]
             # Pad every tag sequence to the batch-wide sentence length.
             padded = [
                 tags + [0] * (seq_len - len(tags))
                 for tags in batch_taggings
             ]
             batch_taggings = self.to_categorical_tensor(
                 numpy.asarray(padded), len(self.TAGS))
             yield [word_input, char_input], batch_taggings
         if not training:
             break
Пример #4
0
def batch_generator(sentences, taggings, steps, training=True):
    """Yield batches of ([word_input, char_input], taggings) for the model.

    In evaluation mode (training=False) each yield additionally carries the
    untouched batch sentences so predictions can be mapped back to their
    source text, and the generator stops after one pass over the data.
    In training mode the taggings are zero-padded to the batch's sentence
    length, wrapped to shape (batch, seq_len, 1), and the loop runs forever.
    """
    global word_dict, char_dict, max_word_len
    while True:
        for i in range(steps):
            batch_sentences = sentences[BATCH_SIZE * i:min(BATCH_SIZE * (i + 1), len(sentences))]
            # Deep-copy so the yielded text stays intact even if
            # get_batch_input mutates its input — TODO confirm it does.
            output_sentences = copy.deepcopy(batch_sentences)
            batch_taggings = taggings[BATCH_SIZE * i:min(BATCH_SIZE * (i + 1), len(taggings))]
            word_input, char_input = get_batch_input(
                batch_sentences,
                max_word_len,
                word_dict,
                char_dict,
                word_ignore_case=True,
                char_ignore_case=False
            )
            if not training:
                # Also emit the original text alongside inputs and labels.
                yield [word_input, char_input], batch_taggings, output_sentences
                continue
            sentence_len = word_input.shape[1]
            for j in range(len(batch_taggings)):
                batch_taggings[j] = batch_taggings[j] + [0] * (sentence_len - len(batch_taggings[j]))
                batch_taggings[j] = [[tag] for tag in batch_taggings[j]]
            batch_taggings = numpy.asarray(batch_taggings)

            yield [word_input, char_input], batch_taggings
        if not training:
            break
Пример #5
0
 def test_exceed_len(self):
     """Char input width is capped at max_word_len even for longer words."""
     sentences = [
         ['All', 'work', 'and', 'no', 'play'],
         ['makes', 'Jack', 'a', 'dull', 'boy', '.'],
     ]
     word_input, char_input = get_batch_input(
         sentences,
         max_word_len=2,
         word_dict={},
         char_dict={},
     )
     # Two sentences, padded to the longer one's six words; chars cut at 2.
     self.assertEqual(word_input.shape, (2, 6))
     self.assertEqual(char_input.shape, (2, 6, 2))
 def test_batch_generator(batch_size=1):
     """Yield [word_input, char_input] once per test example.

     NOTE(review): `batch_size` is never read — each yielded batch holds a
     single copy of the module-level `single_sentence`; confirm intent.
     """
     for _ in range(test_num):
         sentences = [get_word_list_eng(single_sentence)]
         word_input, char_input = get_batch_input(
             sentences=sentences,
             max_word_len=max_word_len,
             word_dict=word_dict,
             char_dict=char_dict,
             word_ignore_case=True,
             char_ignore_case=False,
         )
         yield [word_input, char_input]
Пример #7
0
def test_batch_generator(batch_size=32):
    """Yield [word_input, char_input] over the test set, in order.

    Walks sentences_test front to back in chunks of `batch_size`; the last
    chunk may be smaller when test_num is not a multiple of batch_size.
    """
    for start in range(0, test_num, batch_size):
        stop = min(start + batch_size, test_num)
        sentences = [
            get_word_list_eng(sentences_test[ix]) for ix in range(start, stop)
        ]
        word_input, char_input = get_batch_input(
            sentences=sentences,
            max_word_len=max_word_len,
            word_dict=word_dict,
            char_dict=char_dict,
            word_ignore_case=True,
            char_ignore_case=False,
        )
        yield [word_input, char_input]
Пример #8
0
 def test_shape(self):
     """With empty dicts, known tokens map to 1 and padding slots to 0."""
     sentences = [
         ['All', 'work', 'and', 'no', 'play'],
         ['makes', 'Jack', 'a', 'dull', 'boy', '.'],
     ]
     word_input, char_input = get_batch_input(
         sentences,
         max_word_len=5,
         word_dict={},
         char_dict={},
     )
     self.assertEqual(word_input.shape, (2, 6))
     self.assertEqual(char_input.shape, (2, 6, 5))
     for s_ix, sentence in enumerate(sentences):
         for w_ix in range(6):
             ctx = (s_ix, w_ix)
             if w_ix >= len(sentence):
                 # Word-level padding position.
                 self.assertEqual(0, word_input[s_ix, w_ix], ctx)
                 continue
             self.assertEqual(1, word_input[s_ix, w_ix], ctx)
             word = sentence[w_ix]
             for c_ix in range(5):
                 # Real characters map to 1, char padding to 0.
                 expected = 1 if c_ix < len(word) else 0
                 self.assertEqual(
                     expected,
                     char_input[s_ix, w_ix, c_ix].tolist(),
                     ctx,
                 )
Пример #9
0
def batch_generator(sentences, taggings, steps, training=True):
    """Yield ([word_input, char_input], taggings) batches for the tagger.

    Training mode zero-pads each tag sequence to the batch's sentence
    length and one-hot encodes it over len(TAGS) classes, looping forever.
    Evaluation mode yields the raw taggings and stops after one pass.
    """
    global word_dict, char_dict, max_word_len
    while True:
        for step in range(steps):
            start = BATCH_SIZE * step
            stop = BATCH_SIZE * (step + 1)
            batch_sentences = sentences[start:min(stop, len(sentences))]
            batch_taggings = taggings[start:min(stop, len(taggings))]
            word_input, char_input = get_batch_input(
                batch_sentences,
                max_word_len,
                word_dict,
                char_dict,
                word_ignore_case=True,
                char_ignore_case=False
            )
            if not training:
                yield [word_input, char_input], batch_taggings
                continue
            seq_len = word_input.shape[1]
            padded = [
                tags + [0] * (seq_len - len(tags)) for tags in batch_taggings
            ]
            one_hot = keras.utils.to_categorical(numpy.asarray(padded), len(TAGS))
            yield [word_input, char_input], one_hot
        if not training:
            break
Пример #10
0
 def test_mapping(self):
     """Dict entries map to their ids; unknown tokens to 1; padding to 0.

     Case-sensitive lookup: only the exact word 'All' hits the word dict
     ('Work' != 'work'), and only the exact char 'a' hits the char dict.
     """
     sentences = [
         ['All', 'work', 'and', 'no', 'play'],
         ['makes', 'Jack', 'a', 'dull', 'boy', '.'],
     ]
     word_input, char_input = get_batch_input(
         sentences,
         max_word_len=5,
         word_dict={
             'All': 2,
             'Work': 0
         },
         char_dict={'a': 3},
         word_ignore_case=False,
         char_ignore_case=False,
     )
     self.assertEqual(word_input.shape, (2, 6))
     self.assertEqual(char_input.shape, (2, 6, 5))
     for s_ix, sentence in enumerate(sentences):
         for w_ix in range(6):
             ctx = (s_ix, w_ix)
             if w_ix >= len(sentence):
                 # Word-level padding slot.
                 self.assertEqual(0, word_input[s_ix, w_ix], ctx)
                 continue
             word = sentence[w_ix]
             # 'All' is in the dict with id 2; everything else is unknown.
             expected_word = 2 if word == 'All' else 1
             self.assertEqual(expected_word, word_input[s_ix, w_ix], ctx)
             for c_ix in range(5):
                 if c_ix >= len(word):
                     expected_char = 0  # char padding
                 elif word[c_ix] == 'a':
                     expected_char = 3  # dict hit
                 else:
                     expected_char = 1  # unknown char
                 self.assertEqual(
                     expected_char,
                     char_input[s_ix, w_ix, c_ix].tolist(),
                     ctx,
                 )