Example #1

import re

from bpemb import BPEmb


# VAESampler (and, in Example #2, MonoTextData) are assumed to come from
# the surrounding VAE project rather than from a published package.
class BPEmbVaeSampler(VAESampler):
    def __init__(self, lang, vs, dim, decode_from, params, cuda=False):
        self.bp = BPEmb(lang=lang, vs=vs, dim=dim)
        super().__init__(decode_from, params, cuda)

    def to_s(self, decoded):
        """Turn decoded BPE pieces into readable, capitalized strings."""
        out = []
        for item in decoded:
            # Join subword pieces; BPEmb marks word boundaries with '▁'.
            s = self.bp.decode(item).replace('▁', ' ').strip()
            if s:
                # Capitalize the first character, the pronoun "i", and
                # the first letter after sentence-ending punctuation.
                s = s[0].upper() + s[1:]
                s = re.sub(r'\bi\b', 'I', s)
                s = re.sub(r'[.!?]\s+(\w)',
                           lambda m: m.group()[:-1] + m.group()[-1].upper(),
                           s)
            out.append(s)
        return out

    def str2ids(self, s):
        """
        Encode string s with BPEmb. BPEmb has a fixed vocabulary size, but
        the model only has outputs for vocab items that are used in the
        training data, so this function replaces any BPEmb ids *not* in the
        training vocabulary with the model's "unknown" id.
        """
        encoded = self.bp.encode(s)
        ids = [self.vocab.word2id.get(item, self.vocab.unk_id)
               for item in encoded]
        return ids
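
As a quick, self-contained illustration of what to_s and str2ids do, the
sketch below applies the same post-processing steps to a plain string and
mimics the unknown-id fallback with a toy vocabulary (all strings and ids
are made up; no model or BPEmb download is involved):

import re

# Post-processing as in to_s: capitalize the first character, the
# pronoun "i", and the first letter after sentence-ending punctuation.
s = "the cat sat. i think it was happy! really."
s = s[0].upper() + s[1:]
s = re.sub(r'\bi\b', 'I', s)
s = re.sub(r'[.!?]\s+(\w)',
           lambda m: m.group()[:-1] + m.group()[-1].upper(), s)
print(s)  # -> The cat sat. I think it was happy! Really.

# Unknown-id fallback as in str2ids, with a toy stand-in for the model
# vocabulary: pieces absent from word2id map to unk_id.
word2id = {'▁the': 4, '▁cat': 17}
unk_id = 3
encoded = ['▁the', '▁cat', '▁zyzzyva']
ids = [word2id.get(item, unk_id) for item in encoded]
print(ids)  # -> [4, 17, 3]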
Example #2

class BPEmbVaeSampler(VAESampler):
    def __init__(self, lang, vs, dim, decode_from, params, cuda=False):
        self.bp = BPEmb(lang=lang, vs=vs, dim=dim, add_pad_emb=True)
        super().__init__(decode_from, params, cuda)

    def _load_train_data(self):
        # Dict subclass that maps any token missing from the BPEmb
        # vocabulary to index 0 instead of raising KeyError.
        class Defaulter(dict):
            def __missing__(self, item):
                return 0
        word2idx = Defaulter(
                **{item: self.bp.emb.vocab[item].index
                   for item in self.bp.emb.vocab})
        train_data = MonoTextData(self.params.train_data,
                                  label=False,
                                  vocab=word2idx)
        return train_data

    def to_s(self, decoded):
        """Turn decoded BPE pieces into readable, capitalized strings."""
        out = []
        for item in decoded:
            # Join subword pieces; BPEmb marks word boundaries with '▁'.
            s = self.bp.decode(item).replace('▁', ' ').strip()
            if s:
                # Capitalize the first character, the pronoun "i", and
                # the first letter after sentence-ending punctuation.
                s = s[0].upper() + s[1:]
                s = re.sub(r'\bi\b', 'I', s)
                s = re.sub(r'[.!?]\s+(\w)',
                           lambda m: m.group()[:-1] + m.group()[-1].upper(),
                           s)
            out.append(s)
        return out

    def str2ids(self, s):
        """
        Encode string s with BPEmb. BPEmb has a fixed vocabulary size, but
        the model only has outputs for vocab items that are used in the
        training data, so this function replaces any BPEmb ids *not* in the
        training vocabulary with the model's "unknown" id.
        """
        encoded = self.bp.encode(s)
        ids = [self.vocab.word2id.get(item, self.vocab.unk_id)
               for item in encoded]
        return ids
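
The Defaulter mapping from Example #2 can be exercised on its own; the
sketch below uses made-up tokens and indices. The commented-out construction
at the end shows roughly how the sampler itself might be instantiated, but
every value there (checkpoint path, vocabulary size, the fields on params)
is an assumption about the surrounding project, not part of the examples:

class Defaulter(dict):
    def __missing__(self, item):
        return 0

# Known tokens keep their indices; unseen tokens fall back to 0 rather
# than raising KeyError (tokens and indices here are made up).
word2idx = Defaulter(**{'▁the': 1, '▁cat': 2})
print(word2idx['▁the'], word2idx['▁dog'])  # -> 1 0

# Hypothetical construction of the sampler (not runnable as-is: it needs
# a trained VAE checkpoint and the project's params object; all values
# below are invented for illustration):
#
#   sampler = BPEmbVaeSampler(lang='en', vs=10000, dim=100,
#                             decode_from='checkpoints/vae.pt',
#                             params=params, cuda=False)
#   print(sampler.str2ids('the cat sat on the mat'))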