Exemplo n.º 1
0
    def __getitem__(self, index):
        """Build the input/target sequences for the example at ``index``.

        The stored token sequence is truncated to ``self.seq_len`` and used
        to derive four sequences, each of which is then vectorized:

        * the truncated tokens themselves (token swaps are applied to this
          one after vectorization),
        * those tokens shifted left by one with EOS appended,
        * the truncated tokens prefixed with SOS,
        * the SOS-prefixed tokens shifted left by one with EOS appended.

        Returns:
            A tuple of the four vectorized sequences in the order above,
            followed by the length of the first and the length of the third.
            When ``self.return_oov`` is truthy, the OOV map returned while
            vectorizing the first sequence is appended as a final element.

        Raises:
            NotImplementedError: if ``self.subword`` is truthy.
        """
        tokens = self.data[index][:self.seq_len]
        eos_tail = [self.vocab.EOS]

        x_out_tokens = tokens[1:] + eos_tail
        xhat_in_tokens = [self.vocab.SOS] + tokens
        xhat_out_tokens = xhat_in_tokens[1:] + eos_tail

        # Only the non-subword path is implemented.
        if self.subword:
            raise NotImplementedError

        x_in, oov_map = vectorize(tokens, self.vocab, self.oovs)
        x_out, _ = vectorize(x_out_tokens, self.vocab, self.oovs)
        xhat_in, _ = vectorize(xhat_in_tokens, self.vocab, self.oovs)
        xhat_out, _ = vectorize(xhat_out_tokens, self.vocab, self.oovs)

        # Noise (token swaps) is injected only after the OOV replacements,
        # so the swaps operate on the already-vectorized input.
        x_in = token_swaps(x_in, self.swaps)

        sample = (x_in, x_out, xhat_in, xhat_out, len(x_in), len(xhat_in))
        if not self.return_oov:
            return sample
        return sample + (oov_map,)
Exemplo n.º 2
0
    def dataitem(self, i):
        """Return the vectorized form of the ``i``-th data entry.

        The entry is tokenized with ``self.tokenize``, passed through
        ``self.add_special_tokens``, and the resulting token list is handed
        to ``vectorize`` together with ``self.vocab``.
        """
        # Tokenize first, then let the dataset add its special tokens.
        tokens = self.add_special_tokens(self.tokenize(self.data[i]))
        return vectorize(tokens, self.vocab)
Exemplo n.º 3
0
    def __getitem__(self, index):
        """Return ``(inputs, targets, length)`` for the sentence at ``index``.

        EOS is appended to the stored sentence, SOS is prepended when
        ``self.sos`` is truthy, and the result is cut to ``self.seq_len``
        items. ``inputs`` is that sequence without its last item and
        ``targets`` is the same sequence without its first item; both are
        vectorized before being returned. ``length`` is the number of input
        tokens, measured before vectorization.
        """
        with_eos = self.data[index] + [self.vocab.EOS]
        framed = ([self.vocab.SOS] + with_eos) if self.sos else with_eos

        # Truncation happens after the special tokens were added.
        clipped = framed[:self.seq_len]
        src_tokens, tgt_tokens = clipped[:-1], clipped[1:]

        length = len(src_tokens)

        if self.oovs > 0:
            # With an OOV budget, vectorize returns a pair; only the ids
            # are kept here.
            inputs_vec, _ = vectorize(src_tokens, self.vocab, self.oovs)
            targets_vec, _ = vectorize(tgt_tokens, self.vocab, self.oovs)
        else:
            inputs_vec = vectorize(src_tokens, self.vocab)
            targets_vec = vectorize(tgt_tokens, self.vocab)

        assert len(inputs_vec) == len(targets_vec)

        return inputs_vec, targets_vec, length
Exemplo n.º 4
0
 def read_sample(self, index):
     """Return the sample at ``index`` as tokens after a vectorize round-trip.

     The stored sequence is truncated to ``self.seq_len``, wrapped in
     SOS/EOS, vectorized with ``self.oovs``, and each resulting id is then
     looked up through ``self.vocab.id2tok.get``.
     """
     truncated = self.data[index][:self.seq_len]
     framed = [self.vocab.SOS] + truncated + [self.vocab.EOS]
     ids, _ = vectorize(framed, self.vocab, self.oovs)
     lookup = self.vocab.id2tok.get
     return [lookup(token_id) for token_id in ids]