Exemplo n.º 1
0
 def __init__(self,
              word_embed_size=100,
              pos_embed_size=100,
              word_embed_file=None,
              word_preprocess=text.replace_number,
              word_unknown="UNKNOWN",
              embed_dtype='float32'):
     """Set up word and POS-tag embedding vocabularies over a CoNLL reader."""
     super().__init__(reader=ConllReader())
     # Remember whether a pretrained embedding file was supplied.
     self.use_pretrained = word_embed_file is not None
     # Settings shared by both embedding vocabularies.
     shared = dict(dtype=embed_dtype,
                   initializer=text.EmbeddingVocab.random_normal)
     word_vocab = text.EmbeddingVocab(word_unknown,
                                      file=word_embed_file,
                                      dim=word_embed_size,
                                      **shared)
     pos_vocab = text.EmbeddingVocab(dim=pos_embed_size, **shared)
     self.add_processor('word',
                        vocab=word_vocab,
                        preprocess=word_preprocess)
     self.add_processor('pos',
                        vocab=pos_vocab,
                        preprocess=lambda x: x.lower())
     self.label_map = text.Dict()
Exemplo n.º 2
0
    def test_embeddings_serialize_deserialize(self):
        """Pickle round-trip: the token dict always survives, but the
        embedding table is restored only when serialize_embeddings=True."""
        src = text.EmbeddingVocab(serialize_embeddings=False)
        for expected, token in enumerate(("Pierre", "Vinken"), start=1):
            self.assertEqual(src.add(token), expected)
        self.assertEqual(src.get_embeddings().shape, (3, 50))
        with tempfile.TemporaryFile() as f:
            pickle.dump(src, f)
            # Serializing must leave the source object untouched.
            self.assertEqual(src.get_embeddings().shape, (3, 50))
            f.seek(0)
            clone = pickle.load(f)
            self.assertIsNot(src, clone)
            self.assertEqual(src._dict, clone._dict)
            # Embeddings were not pickled, so the clone cannot provide them,
            # even after registering new tokens.
            self.assertRaises(RuntimeError, clone.get_embeddings)
            self.assertEqual(clone.add("nonexecutive"), 3)
            self.assertEqual(clone.add("director"), 4)
            self.assertRaises(RuntimeError, clone.get_embeddings)

        src = text.EmbeddingVocab(serialize_embeddings=True, dim=16)
        for expected, token in enumerate(("Pierre", "Vinken"), start=1):
            self.assertEqual(src.add(token), expected)
        self.assertEqual(src.get_embeddings().shape, (3, 16))
        with tempfile.TemporaryFile() as f:
            pickle.dump(src, f)
            self.assertEqual(src.get_embeddings().shape, (3, 16))
            f.seek(0)
            clone = pickle.load(f)
            self.assertIsNot(src, clone)
            self.assertEqual(src._dict, clone._dict)
            # This time the table came through and keeps growing with adds.
            self.assertEqual(clone.get_embeddings().shape, (3, 16))
            self.assertEqual(clone.add("nonexecutive"), 3)
            self.assertEqual(clone.add("director"), 4)
            self.assertEqual(clone.get_embeddings().shape, (5, 16))
Exemplo n.º 3
0
    def __init__(self,
                 word_embed_size=100,
                 postag_embed_size=50,
                 char_embed_size=10,
                 word_embed_file=None,
                 word_preprocess=text.lower,
                 word_unknown="<UNK>",
                 filter_coord=False,
                 format='tree'):
        """Configure readers and embedding vocabularies.

        ``format`` selects the corpus reader: 'tree' or 'genia'; any other
        value is rejected with a ValueError.
        """
        if format not in ('tree', 'genia'):
            raise ValueError("Invalid data format: {}".format(format))
        read_genia = format == 'genia'
        base_reader = GeniaReader() if read_genia else reader.TreeReader()
        super().__init__(reader=reader.ZipReader([
            base_reader,
            reader.CsvReader(delimiter=' '),
            reader.ContextualizedEmbeddingsReader(),
        ]))
        self.filter_coord = filter_coord
        self._updated = False
        self._postag_file = None
        self._cont_embed_file = None
        self._use_pretrained_embed = word_embed_file is not None
        self._read_genia = read_genia

        def _new_vocab(size, **extra):
            # Settings shared by the word, POS-tag and character vocabularies.
            return text.EmbeddingVocab(
                unknown=word_unknown,
                dim=size,
                initializer=text.EmbeddingVocab.random_normal,
                serialize_embeddings=True,
                **extra)

        word_vocab = _new_vocab(word_embed_size, file=word_embed_file)
        postag_vocab = _new_vocab(postag_embed_size)
        char_vocab = _new_vocab(char_embed_size)

        # Coordination keywords/separators are always in the word vocabulary.
        for token in CC_KEY + CC_SEP:
            word_vocab.add(token)
        self.char_pad_id = char_vocab.add(_CHAR_PAD)
        self.add_processor('word', word_vocab, preprocess=word_preprocess)
        self.add_processor('pos', postag_vocab, preprocess=False)
        self.add_processor('char', char_vocab, preprocess=False)
Exemplo n.º 4
0
 def __init__(self,
              word_embed_size=100,
              postag_embed_size=100,
              word_embed_file=None,
              word_preprocess=text.lower,
              word_unknown="<UNK>",
              input_file=None,
              min_frequency=2):
     """Build word, pretrained-word and POS vocabularies for CoNLL data.

     When ``input_file`` is given, the word vocabulary is fixed to tokens
     occurring at least ``min_frequency`` times in that file.
     """
     super().__init__(reader=ConllReader())
     self._fix_word = input_file is not None
     words = []
     if self._fix_word:
         counts = Counter(word_preprocess(token['form'])
                          for sentence in self._reader.read(input_file)
                          for token in sentence)
         # most_common() is sorted by descending count, so keeping every
         # entry at or above the threshold matches the original
         # break-on-first-rare loop, order included.
         words = [word for word, count in counts.most_common()
                  if count >= min_frequency]
     word_vocab = text.EmbeddingVocab.from_words(
         words, unknown=word_unknown, dim=word_embed_size,
         initializer=(text.EmbeddingVocab.random_normal
                      if word_embed_file is None else np.zeros),
         serialize_embeddings=True)
     pretrained_word_vocab = text.EmbeddingVocab(
         unknown=word_unknown, file=word_embed_file, dim=word_embed_size,
         initializer=np.zeros, serialize_embeddings=True)
     pretrained_word_vocab.add(word_preprocess("<ROOT>"))
     postag_vocab = text.EmbeddingVocab(
         unknown=word_unknown, dim=postag_embed_size,
         initializer=text.EmbeddingVocab.random_normal,
         serialize_embeddings=True)
     self.add_processor('word', word_vocab, preprocess=word_preprocess)
     self.add_processor('pre', pretrained_word_vocab,
                        preprocess=word_preprocess)
     self.add_processor('pos', postag_vocab, preprocess=False)
     self.rel_map = text.Dict()
Exemplo n.º 5
0
    def __init__(self,
                 word_embed_size=100,
                 postag_embed_size=50,
                 char_embed_size=10,
                 word_embed_file=None,
                 word_preprocess=text.lower,
                 word_unknown="<UNK>",
                 filter_coord=False,
                 format='tree'):
        """Initialize vocabularies; reader setup is delegated to init_reader()."""
        super().__init__(reader=None)
        self.init_reader(format)
        self.filter_coord = filter_coord
        self._mode = None
        self._updated = False
        self._postag_file = None
        self._cont_embed_file = None
        self._use_pretrained_embed = word_embed_file is not None

        def _embedding_vocab(size, **extra):
            # Settings shared by the word, POS-tag and character vocabularies.
            return text.EmbeddingVocab(
                unknown=word_unknown,
                dim=size,
                initializer=text.EmbeddingVocab.random_normal,
                serialize_embeddings=True,
                **extra)

        word_vocab = _embedding_vocab(word_embed_size, file=word_embed_file)
        postag_vocab = _embedding_vocab(postag_embed_size)
        char_vocab = _embedding_vocab(char_embed_size)

        # Coordination keywords/separators are always in the word vocabulary.
        for token in CC_KEY + CC_SEP:
            word_vocab.add(token)
        self.char_pad_id = char_vocab.add(_CHAR_PAD)
        self.add_processor('word', word_vocab, preprocess=word_preprocess)
        self.add_processor('pos', postag_vocab, preprocess=False)
        self.add_processor('char', char_vocab, preprocess=False)
Exemplo n.º 6
0
 def test_embeddings_from_file(self):
     """Vectors loaded from a whitespace-separated text file keep their
     values; the vocabulary ends up with one entry beyond the file's five."""
     rows = [
         "Pierre -0.00066023 -0.6566 0.27843 -0.14767",
         "Vinken 0.10204 -0.12792 -0.8443 -0.12181",
         "61 0.24968 -0.41242 0.1217 0.34527",
         "years -0.19181 -1.8823 -0.76746 0.099051",
         "old -0.52287 -0.31681 0.00059213 0.0074449",
     ]
     with tempfile.NamedTemporaryFile(mode='w') as f:
         f.write("\n".join(rows) + "\n")
         f.flush()
         vocab = text.EmbeddingVocab(file=f.name, dtype=np.float64)
         self.assertEqual(len(vocab), 6)
         table = vocab.get_embeddings()
         self.assertEqual(table.shape, (6, 4))
         # Row 1 corresponds to the second file entry, "Vinken".
         np.testing.assert_array_equal(
             table[1], [0.10204, -0.12792, -0.8443, -0.12181])
Exemplo n.º 7
0
    def test_embeddings(self):
        """get_embeddings() returns a cached array that is rebuilt, with its
        old rows preserved as a prefix, after new tokens are added."""
        vocab = text.EmbeddingVocab(dim=32)
        for expected, token in enumerate(("Pierre", "Vinken"), start=1):
            self.assertEqual(vocab.add(token), expected)
        self.assertEqual(vocab.get_embeddings().shape, (3, 32))
        for expected, token in enumerate(("nonexecutive", "director"), start=3):
            self.assertEqual(vocab.add(token), expected)
        self.assertEqual(vocab.get_embeddings().shape, (5, 32))
        # Repeated calls without intervening adds return the same object.
        np.testing.assert_array_equal(vocab.get_embeddings(),
                                      vocab.get_embeddings())
        self.assertIs(vocab.get_embeddings(), vocab.get_embeddings())

        before = vocab.get_embeddings()
        for expected, token in enumerate(("61", "years", "old"), start=5):
            self.assertEqual(vocab.add(token), expected)
        after = vocab.get_embeddings()
        self.assertEqual(after.shape, (8, 32))
        # Growth yields a fresh array whose prefix matches the old rows.
        self.assertIsNot(before, after)
        np.testing.assert_array_equal(before, after[:5])