Example #1
def setUp(self):
    words = ['the', '.', chr(256) + 't', '<S>', '</S>', '<UNK>']
    (_, tmp) = tempfile.mkstemp()
    with open(tmp, 'w') as fout:
        fout.write('\n'.join(words))
    self.vocab = UnicodeCharsVocabulary(tmp, 5)
    self._tmp = tmp
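A standalone version of this fixture, for reference. It assumes UnicodeCharsVocabulary is importable from bilm.data (the bilm-tf layout) and that the vocabulary file lists one token per line, including <S>, </S> and <UNK>; both assumptions are read off the snippets on this page rather than from documentation.

# Minimal sketch; the import path is an assumption.
import tempfile

from bilm.data import UnicodeCharsVocabulary

words = ['the', '.', chr(256) + 't', '<S>', '</S>', '<UNK>']
_, vocab_path = tempfile.mkstemp()
with open(vocab_path, 'w') as fout:
    fout.write('\n'.join(words))               # one token per line

vocab = UnicodeCharsVocabulary(vocab_path, 5)  # max_word_length = 5
print(vocab.size)                              # number of tokens in the file
print(vocab.word_to_char_ids('the'))           # per-character ids for 'the'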
Example #2
def test_vocab_encode_chars_reverse(self):
    sentence = ' '.join(reversed(['th', 'thhhhh', chr(256) + 't']))
    vocab = UnicodeCharsVocabulary(self._tmp, 5)
    char_ids = vocab.encode_chars(sentence, reverse=True)
    expected = np.array(
        [[258, 256, 259, 260, 260], [258, 116, 104, 259, 260],
         [258, 116, 104, 104, 259], [258, 196, 128, 116, 259],
         [258, 257, 259, 260, 260]],
        dtype=np.int32)[::-1, :]
    self.assertTrue((char_ids == expected).all())
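Reading the expected arrays off these tests, the character-id convention appears to be: ids 0-255 are the raw UTF-8 bytes of the token (chr(256) encodes to the two bytes 196, 128), 256 and 257 stand in for the <S> and </S> words, 258 and 259 mark begin- and end-of-word, and 260 is padding. A small decoder sketch under that assumption:

# Hypothetical helper; the special-id layout below is inferred from the
# expected arrays in the tests, not taken from a documented API.
def decode_char_ids(row):
    body = [i for i in row if i not in (258, 259, 260)]  # strip bow/eow/pad
    if body == [256]:
        return '<S>'
    if body == [257]:
        return '</S>'
    return bytes(body).decode('utf-8')                   # raw UTF-8 bytes

print(decode_char_ids([258, 116, 104, 259, 260]))   # 'th'
print(decode_char_ids([258, 196, 128, 116, 259]))   # chr(256) + 't'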
Example #3
class TestUnicodeCharsVocabulary(unittest.TestCase):
    def setUp(self):
        words = ['the', '.', chr(256) + 't', '<S>', '</S>', '<UNK>']
        (_, tmp) = tempfile.mkstemp()
        with open(tmp, 'w') as fout:
            fout.write('\n'.join(words))
        self.vocab = UnicodeCharsVocabulary(tmp, 5)
        self._tmp = tmp

    def test_vocab_word_to_char_ids(self):
        char_ids = self.vocab.word_to_char_ids('th')
        expected = np.array([258, 116, 104, 259, 260], dtype=np.int32)
        self.assertTrue((char_ids == expected).all())

        char_ids = self.vocab.word_to_char_ids('thhhhh')
        expected = np.array([258, 116, 104, 104, 259])
        self.assertTrue((char_ids == expected).all())

        char_ids = self.vocab.word_to_char_ids(chr(256) + 't')
        expected = np.array([258, 196, 128, 116, 259], dtype=np.int32)
        self.assertTrue((char_ids == expected).all())

    def test_bos_eos(self):
        bos_ids = self.vocab.word_to_char_ids('<S>')
        self.assertTrue((bos_ids == self.vocab.bos_chars).all())

        bos_ids = self.vocab.word_char_ids[self.vocab.word_to_id('<S>')]
        self.assertTrue((bos_ids == self.vocab.bos_chars).all())

        eos_ids = self.vocab.word_to_char_ids('</S>')
        self.assertTrue((eos_ids == self.vocab.eos_chars).all())

        eos_ids = self.vocab.word_char_ids[self.vocab.word_to_id('</S>')]
        self.assertTrue((eos_ids == self.vocab.eos_chars).all())

    def test_vocab_encode_chars(self):
        sentence = ' '.join(['th', 'thhhhh', chr(256) + 't'])
        char_ids = self.vocab.encode_chars(sentence)
        expected = np.array(
            [[258, 256, 259, 260, 260], [258, 116, 104, 259, 260],
             [258, 116, 104, 104, 259], [258, 196, 128, 116, 259],
             [258, 257, 259, 260, 260]],
            dtype=np.int32)
        self.assertTrue((char_ids == expected).all())

    def test_vocab_encode_chars_reverse(self):
        sentence = ' '.join(reversed(['th', 'thhhhh', chr(256) + 't']))
        vocab = UnicodeCharsVocabulary(self._tmp, 5)
        char_ids = vocab.encode_chars(sentence, reverse=True)
        expected = np.array(
            [[258, 256, 259, 260, 260], [258, 116, 104, 259, 260],
             [258, 116, 104, 104, 259], [258, 196, 128, 116, 259],
             [258, 257, 259, 260, 260]],
            dtype=np.int32)[::-1, :]
        self.assertTrue((char_ids == expected).all())

    def tearDown(self):
        os.remove(self._tmp)
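The imports this test case relies on, inferred from the names it uses (the bilm.data module path is an assumption from the bilm-tf layout), plus the usual unittest entry point:

import os
import tempfile
import unittest

import numpy as np

from bilm.data import UnicodeCharsVocabulary  # assumed import path

if __name__ == '__main__':
    unittest.main()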
Example #4
def load_vocab(vocab_file, max_word_length=None):
    if max_word_length:
        return UnicodeCharsVocabulary(vocab_file,
                                      max_word_length,
                                      validate_file=True)
    else:
        return Vocabulary(vocab_file, validate_file=True)
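Usage sketch: passing max_word_length selects the character-aware vocabulary, while omitting it falls back to the word-level one. The file path and length below are placeholders.

char_vocab = load_vocab('vocab.txt', max_word_length=50)  # UnicodeCharsVocabulary
word_vocab = load_vocab('vocab.txt')                       # plain Vocabulary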
Example #5
def dump_bilm_embeddings(vocab_file, dataset_file, options_file, weight_file,
                         outfile):
    with open(options_file, 'r') as fin:
        options = json.load(fin)
    max_word_length = options['char_cnn']['max_characters_per_token']

    vocab = UnicodeCharsVocabulary(vocab_file, max_word_length)
    batcher = Batcher(vocab_file, max_word_length)

    ids_placeholder = tf.placeholder('int32',
                                     shape=(None, None, max_word_length))
    model = BidirectionalLanguageModel(options_file, weight_file)
    ops = model(ids_placeholder)

    config = tf.ConfigProto(allow_soft_placement=True)
    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())
        sentence_id = 0
        with open(dataset_file, 'r') as fin, h5py.File(outfile, 'w') as fout:
            for line in fin:
                sentence = line.strip().split()
                char_ids = batcher.batch_sentences([sentence])
                embeddings = sess.run(ops['lm_embeddings'],
                                      feed_dict={ids_placeholder: char_ids})
                ds = fout.create_dataset('{}'.format(sentence_id),
                                         embeddings.shape[1:],
                                         dtype='float32',
                                         data=embeddings[0, :, :, :])

                sentence_id += 1
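The resulting HDF5 file keys each sentence's embeddings by its 0-based line number, so a dump-and-read-back round trip looks roughly like this (all file names are placeholders):

import h5py

dump_bilm_embeddings('vocab.txt', 'dataset.txt', 'options.json',
                     'weights.hdf5', 'embeddings.hdf5')

with h5py.File('embeddings.hdf5', 'r') as fin:
    first_sentence = fin['0'][...]   # per-layer embeddings for line 0
print(first_sentence.shape)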
Example #6
    def _load_data(self, reverse, chars, bidirectional=False):
        if chars:
            vocab = UnicodeCharsVocabulary(self._tmp_vocab, 5)
        else:
            vocab = Vocabulary(self._tmp_vocab)

        if not bidirectional:
            data = LMDataset(self._tmp_train, vocab, reverse=reverse)
        else:
            data = BidirectionalLMDataset(self._tmp_train, vocab)

        return data
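A sketch of how the owning test case might call this helper, assuming its setUp creates the self._tmp_vocab and self._tmp_train fixture files (both attribute names come from the snippet; everything else is illustrative):

# Inside the owning TestCase: the flags pick between the dataset variants.
data = self._load_data(reverse=False, chars=True)                      # char-level LMDataset
data = self._load_data(reverse=True, chars=False)                      # reversed word-level LMDataset
data = self._load_data(reverse=False, chars=True, bidirectional=True)  # BidirectionalLMDataset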
Example #7
def dump_token_embeddings(vocab_file, options_file, weight_file, outfile):
    '''
    Given an input vocabulary file, dump all the token embeddings to the
    outfile.  The result can be used as the embedding_weight_file when
    constructing a BidirectionalLanguageModel.
    '''
    with open(options_file, 'r') as fin:
        options = json.load(fin)
    max_word_length = options['char_cnn']['max_characters_per_token']

    vocab = UnicodeCharsVocabulary(vocab_file, max_word_length)
    batcher = Batcher(vocab_file, max_word_length)

    ids_placeholder = tf.placeholder('int32',
                                     shape=(None, None, max_word_length))
    model = BidirectionalLanguageModel(options_file, weight_file)
    embedding_op = model(ids_placeholder)['token_embeddings']

    n_tokens = vocab.size
    embed_dim = int(embedding_op.shape[2])

    embeddings = np.zeros((n_tokens, embed_dim), dtype=DTYPE)

    config = tf.ConfigProto(allow_soft_placement=True)
    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())
        for k in range(n_tokens):
            token = vocab.id_to_word(k)  # look up the actual word for this id
            char_ids = batcher.batch_sentences([[token]
                                                ])[0, 1, :].reshape(1, 1, -1)
            embeddings[k, :] = sess.run(embedding_op,
                                        feed_dict={ids_placeholder: char_ids})

    with h5py.File(outfile, 'w') as fout:
        ds = fout.create_dataset('embedding',
                                 embeddings.shape,
                                 dtype='float32',
                                 data=embeddings)
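Per the docstring, the dump can then be reused as the embedding_weight_file when constructing a BidirectionalLanguageModel. A sketch with placeholder paths; the keyword argument name follows the docstring's wording and is otherwise an assumption:

dump_token_embeddings('vocab.txt', 'options.json', 'weights.hdf5',
                      'token_embeddings.hdf5')

# Reuse the dump so token embeddings need not be recomputed from characters.
model = BidirectionalLanguageModel(
    'options.json', 'weights.hdf5',
    embedding_weight_file='token_embeddings.hdf5')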