Example #1
import torch
import torch.nn as nn
from typing import Iterator

# `Module`, `Encoding`, and the tensor type alias `TT` are assumed to
# come from the surrounding project.
class EmbeddingSum(Module):
    """A lookup table that stores embeddings of a fixed dictionary and size,
    combined with summing (CBOW).

    EmbeddingSum is an optimized variant of the Embedding class combined
    with summing (CBOW).  It is intended to be used over bags of features
    rather than single features.  Under the hood it relies on
    torch.nn.EmbeddingBag (see the PyTorch docs).

    >>> import torch
    >>> symset = set(['a', 'b', 'c'])
    >>> emb = EmbeddingSum(symset, emb_size=10)
    >>> emb.forward(['a', 'b']) #doctest: +ELLIPSIS
    tensor(...)
    >>> emb.forward(['a', 'b']).shape
    torch.Size([10])
    """
    def __init__(self, alphabet: set, emb_size: int):
        """Create a random embedding dictionary.

        Arguments:
        * alphabet: set of symbols to embed (characters, words, POS tags, ...)
        * emb_size: embedding size (each symbol is mapped to a vector
            of size emb_size)
        """
        self.emb_size = emb_size
        self.enc = Encoding(alphabet)
        self.emb = nn.EmbeddingBag(self.enc.class_num, emb_size, mode='sum')

    def forward(self, syms: Iterator) -> TT:
        """Embed the given bag (sequence) of symbols and compute the sum.

        Returns:
            Single vector, which is the sum of the embeddings of the given
            symbols.
        """
        ixs = []
        for sym in syms:
            try:
                ixs.append(self.enc.encode(sym))
            except KeyError:
                # Silently skip symbols outside the embedding alphabet.
                pass
        if len(ixs) > 0:
            ix_tensor = torch.LongTensor(ixs).view(1, -1)
            return self.emb(ix_tensor)[0]
        else:
            return torch.zeros(self.emb_size)

    def params(self):
        """The list of parameters of the embedding dictionary."""
        return [self.emb.weight]
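
For reference, the behaviour `EmbeddingSum` builds on can be reproduced with plain PyTorch: `nn.EmbeddingBag` with `mode='sum'` sums a whole bag of embeddings in a single call. A minimal standalone sketch (only `torch` is assumed):

import torch
import torch.nn as nn

bag = nn.EmbeddingBag(num_embeddings=3, embedding_dim=10, mode='sum')
ixs = torch.LongTensor([0, 1]).view(1, -1)   # a single bag holding symbols 0 and 1
summed = bag(ixs)[0]                         # shape: torch.Size([10])

# Equivalent to looking the embeddings up one by one and summing:
emb = nn.Embedding.from_pretrained(bag.weight)
assert torch.allclose(summed, emb(torch.LongTensor([0, 1])).sum(dim=0))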
Example #2
import torch
from typing import Dict, Iterator, Set

# `Module`, `Embedding`, `FFN`, `Encoding`, `ngrams`, and the aliases
# `DataSet`, `Name`, `Lang`, `TT` are assumed from the surrounding project.
class LangRec(Module):
    def __init__(self, data_set: DataSet, ngram_size: int, emb_size: int,
                 hid_size: int):
        """Initialize the language recognition module.

        Args:
            data_set: the dataset from which the set of input symbols
                and output classes (languages) can be extracted
            ngram_size: size of n-gram features (e.g., use 1 for unigrams,
                2 for bigrams, etc.)
            emb_size: size of the character embedding vectors
            hid_size: size of the hidden layer of the FFN used for scoring
        """
        # Keep the size of the ngrams
        self.ngram_size = ngram_size
        # Calculate the embedding alphabet and create the embedding sub-module
        feat_set = self.alphabet(data_set)
        self.register("emb", Embedding(feat_set, emb_size))
        # Encoding (mapping between langs and ints)
        lang_set = set(lang for (_, lang) in data_set)
        self.enc = Encoding(lang_set)
        # Scoring FFN sub-module
        self.register("ffn",
                      FFN(idim=emb_size, hdim=hid_size, odim=len(lang_set)))
        # Additional check to verify that all the registered
        # parameters actually require gradients.  This helps to
        # identify the "bug" in the embedding module.
        assert all(param.requires_grad for param in self.params())

    def preprocess(self, name: Name) -> Name:
        """Name preprocessing."""
        # Currently no preprocessing, but we could think of something
        # in the future.
        return name

    def features(self, name: Name) -> Iterator[str]:
        """Retrieve the list of features in the given name."""
        return ngrams(self.preprocess(name), self.ngram_size)

    def alphabet(self, data_set: DataSet) -> Set[str]:
        """Retrieve the embedding alphabet from the dataset.

        Retrieve the set of all features that we want to embed from
        the given dataset.
        """
        return set(feat for (name, _) in data_set
                   for feat in self.features(name))

    def encode(self, lang: Lang) -> int:
        """Encode the given language as an integer."""
        return self.enc.encode(lang)

    def forward(self, name: Name) -> TT:
        """The forward calculation of the name's language recognition model.

        Args:
            name: a person name

        Returns:
            score vector corresponding to the name, with its individual
            elements corresponding to the scores of different languages
        """
        embeddings = [self.emb.forward(feat) for feat in self.features(name)]
        cbow = sum(embeddings)
        scores = self.ffn.forward(cbow)
        return scores

    def classify(self, name: Name) -> Dict[Lang, float]:
        """Classify the given person name.

        Args:
            name: person name, sequence of characters

        Returns:
            the mapping from languages to their probabilities
            for the given name.
        """
        # We don't want PyTorch to calculate the gradients.
        with torch.no_grad():
            # The vector of scores for the given name
            scores = self.forward(name)
            # We map the vector of scores to the vector of probabilities.
            probs = torch.softmax(scores, dim=0)
            # Result dictionary
            res = {}
            # `ix` should be an index in the scores vector
            for ix in range(len(probs)):
                lang = self.enc.decode(ix)
                res[lang] = probs[ix].item()
            return res

    def classify_one(self, name: Name) -> Lang:
        """A simplified version of `classify` which returns the
        language with the highest probability."""
        prob_map = self.classify(name)
        # Pick the language with the largest probability.
        (lang, _prob) = max(prob_map.items(), key=lambda pair: pair[1])
        return lang
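
The ngrams helper used in `features` comes from the surrounding project and is not shown here. A minimal sketch of what such a helper typically looks like (a sliding window of width n over the name; hypothetical, the project's version may differ):

from typing import Iterator

def ngrams(seq: str, n: int) -> Iterator[str]:
    # Yield every contiguous substring of length n.
    for i in range(len(seq) - n + 1):
        yield seq[i:i + n]

# e.g. list(ngrams("Anna", 2)) == ['An', 'nn', 'na']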
Example #3
import unittest

# The `Encoding` class under test comes from the surrounding project.
class Test_Encoding(unittest.TestCase):
    def setUp(self):
        self.encoder = Encoding()

    def tearDown(self):
        self.encoder = None

    def test_single_character(self):
        decimal = self.encoder.encode_decimal("A")
        self.assertEqual(decimal, 16777217)
        hex_value = self.encoder.encode_hex("A")
        self.assertEqual(int(hex_value, 16), int('0x01000001', 16))

    def test_full_bundle(self):
        decimal = self.encoder.encode_decimal("FRED")
        self.assertEqual(decimal, 251792692)
        hex_value = self.encoder.encode_hex("FRED")
        self.assertEqual(int(hex_value, 16), int('0x0F020d34', 16))

    def test_non_alphanumerics(self):
        decimal = self.encoder.encode_decimal(" :^)")
        self.assertEqual(decimal, 79094888)
        hex_value = self.encoder.encode_hex(" :^)")
        self.assertEqual(int(hex_value, 16), int('0x04B6E468', 16))

    def test_foo(self):
        decimal = self.encoder.encode_decimal("foo")
        self.assertEqual(decimal, 124807030)

    def test_foo_with_space(self):
        decimal = self.encoder.encode_decimal(" foo")
        self.assertEqual(decimal, 250662636)

    def test_foot(self):
        decimal = self.encoder.encode_decimal("foot")
        self.assertEqual(decimal, 267939702)

    def test_BIRD(self):
        decimal = self.encoder.encode_decimal("BIRD")
        self.assertEqual(decimal, 251930706)

    def test_periods(self):
        decimal = self.encoder.encode_decimal("....")
        self.assertEqual(decimal, 15794160)

    def test_carets(self):
        decimal = self.encoder.encode_decimal("^^^^")
        self.assertEqual(decimal, 252706800)

    def test_Woot(self):
        decimal = self.encoder.encode_decimal("Woot")
        self.assertEqual(decimal, 266956663)

    def test_no(self):
        decimal = self.encoder.encode_decimal("no")
        self.assertEqual(decimal, 53490482)

    def test_email(self):
        decimal = self.encoder.encode_decimal("a@b.")
        self.assertEqual(decimal, 131107009)

    def test_my_email(self):
        decimal = self.encoder.encode_decimal("me@a")
        self.assertEqual(decimal, 263197451)

    # ----------- Part 2 ----------------------------

    def test_encode_array_tacocat(self):
        encoded = self.encoder.encode("tacocat")
        self.assertEqual(encoded, [267487694, 125043731])

    def test_decode_FRED(self):
        decoded = self.encoder.decode_decimal(251792692)
        self.assertEqual(decoded, "FRED")

    def test_decode_array_tacocat(self):
        decoded = self.encoder.decode([267487694, 125043731])
        self.assertEqual(decoded, "tacocat")

    def test_decode_array_never_odd(self):
        decoded = self.encoder.decode(
            [267657050, 233917524, 234374596, 250875466, 17830160])
        self.assertEqual(decoded, "never odd or even")

    def test_decode_array_lager(self):
        decoded = self.encoder.decode(
            [267394382, 167322264, 66212897, 200937635, 267422503])
        self.assertEqual(decoded, "lager, sir, is regal")

    def test_decode_array_go_hang(self):
        decoded = self.encoder.decode([
            200319795, 133178981, 234094669, 267441422, 78666124, 99619077,
            267653454, 133178165, 124794470
        ])
        self.assertEqual(decoded, "go hang a salami, I'm a lasagna hog")

    def test_decode_array_egad(self):
        decoded = self.encoder.decode([
            267389735, 82841860, 267651166, 250793668, 233835785, 267665210,
            99680277, 133170194, 124782119
        ])
        self.assertEqual(decoded, "egad, a base tone denotes a bad age")

    def test_bothways(self):
        self.assertEqual("bothways",
                         self.encoder.decode(self.encoder.encode("bothways")))
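
The Encoding class under test packs short chunks of characters (plus a length marker) into single integers; its exact byte layout is project-specific and not shown here. A generic sketch of the pack/unpack round-trip idea, using a plain layout that is deliberately not the scheme behind the constants above:

def pack(chunk: str) -> int:
    # Hypothetical layout: chunk length above bit 32, raw character
    # bytes below.  (The tested Encoding clearly differs: it packs
    # "A" to 0x01000001, whereas this scheme yields 0x100000041.)
    body = 0
    for ch in chunk:
        body = (body << 8) | ord(ch)
    return (len(chunk) << 32) | body

def unpack(value: int) -> str:
    length = value >> 32
    return ''.join(
        chr((value >> (8 * i)) & 0xFF) for i in reversed(range(length))
    )

assert unpack(pack("FRED")) == "FRED"   # the round trip mirrors test_bothways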
Example #4
# Same project assumptions as in Example #2 (Module, Embedding, FFN,
# Encoding, the DataSet/Name/Lang/TT aliases), plus the `char_set_in`
# helper sketched after this example.
class LangRec(Module):

    def __init__(self, data_set: DataSet, emb_size: int, hid_size: int):
        """Initialize the language recognition module.

        Args:
            data_set: the dataset from which the set of input symbols
                and output classes (languages) can be extracted
            emb_size: size of the character embedding vectors
            hid_size: size of the hidden layer of the FFN used for scoring
        """
        char_set = char_set_in(data_set)
        # Embedding
        self.register("emb", Embedding(char_set, emb_size))
        lang_set = set(lang for (name, lang) in data_set)
        lang_num = len(lang_set)
        # Encoding (mapping between langs and ints)
        self.enc = Encoding(lang_set)
        # FFN
        self.register("ffn", FFN(idim=emb_size, hdim=hid_size, odim=lang_num))

    def encode(self, lang: Lang) -> int:
        """Encode the given language as an integer."""
        return self.enc.encode(lang)

    def forward(self, name: Name) -> TT:
        """The forward calculation of the name's language recognition model.

        Args:
            name: a person name

        Returns:
            score vector corresponding to the name, with its individual
            elements corresponding to the scores of different languages
        """
        embeddings = [self.emb.forward(char) for char in name]
        cbow = sum(embeddings)
        scores = self.ffn.forward(cbow)
        return scores

    def classify(self, name: Name) -> Dict[Lang, float]:
        """Classify the given person name.

        Args:
            name: person name, sequence of characters

        Returns:
            the mapping from languages to their probabilities
            for the given name.
        """
        # We don't want PyTorch to calculate the gradients.
        with torch.no_grad():
            # The vector of scores for the given name
            scores = self.forward(name)
            # We map the vector of scores to the vector of probabilities.
            probs = torch.softmax(scores, dim=0)
            # Result dictionary
            res = {}
            # `ix` should be an index in the scores vector
            for ix in range(len(probs)):
                lang = self.enc.decode(ix)
                res[lang] = probs[ix].item()
            return res
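
The char_set_in helper used in `__init__` is assumed from the surrounding project. Given how it is used, a plausible sketch is simply the set of all characters occurring in the dataset's names (hypothetical; the project's own helper may differ):

from typing import Set

def char_set_in(data_set: DataSet) -> Set[str]:
    # Collect every character that occurs in any name of the dataset.
    return set(char for (name, _lang) in data_set for char in name)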
Example #5
# Same project assumptions as in Example #2, plus the EmbeddingSum class
# from Example #1 and a project `utils` module.
class LangRec(Module):
    def __init__(self, data_set: DataSet, ngram_size: int, emb_size: int,
                 hid_size: int):
        """Initialize the language recognition module.

        Args:
            data_set: the dataset from which the set of input symbols
                and output classes (languages) can be extracted
            ngram_size: size of n-gram features (e.g., use 1 for unigrams,
                2 for bigrams, etc.)
            emb_size: size of the character embedding vectors
            hid_size: size of the hidden layer of the FFN used for scoring
        """
        # Keep the size of the ngrams
        self.ngram_size = ngram_size
        # Calculate the embedding alphabet and create the embedding sub-module
        feat_set = self.alphabet(data_set)
        self.register("emb", EmbeddingSum(feat_set, emb_size))
        # Encoding (mapping between langs and ints)
        lang_set = set(lang for (_, lang) in data_set)
        self.enc = Encoding(lang_set)
        # Scoring FFN sub-module
        self.register("ffn",
                      FFN(idim=emb_size, hdim=hid_size, odim=len(lang_set)))
        # Additional check to verify that all the registered
        # parameters actually require gradients.
        assert all(param.requires_grad for param in self.params())

    def preprocess(self, name: Name) -> Name:
        """Name preprocessing."""
        # Currently no preprocessing, but we could think of something
        # in the future.
        return name

    def features(self, name: Name) -> Iterator[str]:
        """Retrieve the list of features in the given name."""
        return ngrams(self.preprocess(name), self.ngram_size)

    def alphabet(self, data_set: DataSet) -> Set[str]:
        """Retrieve the embedding alphabet from the dataset.

        Retrieve the set of all features that we want to embed from
        the given dataset.
        """
        return set(feat for (name, _) in data_set
                   for feat in self.features(name))

    def encode(self, lang: Lang) -> int:
        """Encode the given language as an integer."""
        return self.enc.encode(lang)

    def forward(self, names: Iterator[Name]) -> TT:
        """The forward calculation of the name's language recognition model.

        Args:
            names: a sequence of person names; calculating the scores for
                several names at the same time is faster thanks to better
                parallelization

        Returns:
            score matrix in which each row corresponds to a single name, with
            its individual elements corresponding to the scores of different
            languages
        """
        # TODO EX2 (a): the following lines need to be adapted to the EmbeddingSum,
        # which processes features in groups.  You will also need to make
        # trivial modifications in the code in two or three other places
        # (imports, initialization).
        # TODO EX2 (b): you can further try to modify the EmbeddingSum class so
        # that it works over batches of feature groups.
        embeddings = [
            # [self.emb.forward(feat) for feat in self.features(name)]
            self.emb.forward(self.features(name)) for name in names
        ]
        # cbow = utils.from_rows(map(sum, embeddings))
        cbow = utils.stack(embeddings)
        scores = self.ffn.forward(cbow)
        return scores

    def classify(self, name: Name) -> Dict[Lang, float]:
        """Classify the given person name.

        Args:
            name: person name, sequence of characters

        Returns:
            the mapping from languages to their probabilities
            for the given name.
        """
        # We don't want PyTorch to calculate the gradients.
        with torch.no_grad():
            # The vector of scores for the given name
            scores = self.forward([name])[0]
            # We map the vector of scores to the vector of probabilities.
            probs = torch.softmax(scores, dim=0)
            # Result dictionary
            res = {}
            # `ix` should be an index in the scores vector
            for ix in range(len(probs)):
                lang = self.enc.decode(ix)
                res[lang] = probs[ix].item()
            return res

    def classify_one(self, name: Name) -> Lang:
        """A simplified version of `classify` which returns the
        language with the highest probability."""
        prob_map = self.classify(name)
        # Pick the language with the largest probability.
        (lang, _prob) = max(prob_map.items(), key=lambda pair: pair[1])
        return lang
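
The utils.stack call in `forward` above is a project helper; since `forward` returns one score row per name, it presumably behaves like torch.stack over the per-name CBOW vectors. A short usage sketch of the batched interface (hypothetical: assumes a trained LangRec instance `lang_rec` and an `import torch`):

names = ["Kowalski", "Smith", "Nakamura"]
with torch.no_grad():
    scores = lang_rec.forward(names)       # shape: (len(names), number of languages)
    probs = torch.softmax(scores, dim=1)   # row-wise probability distributions
for name in names:
    print(name, lang_rec.classify_one(name))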