Example #1
0
    def reverse(self, batch, limited=False):
        """Convert a batch of token indices back into strings.

        Args:
            batch: tensor of token indices, (batch, time) if
                ``self.batch_first`` else (time, batch).
            limited: unused; kept for backward compatibility with callers.

        Returns:
            list of str, one string per example.
        """
        if self.use_revtok:
            try:
                import revtok
            except ImportError:
                print("Please install revtok.")
                raise
        if not self.batch_first:
            batch = batch.t()
        with torch.cuda.device_of(batch):
            batch = batch.tolist()
        # denumericalize: indices -> token strings
        batch = [[self.vocab.itos[ind] for ind in ex] for ex in batch]

        def trim(s, t):
            # Keep tokens up to (but excluding) the first occurrence of t.
            sentence = []
            for w in s:
                if w == t:
                    break
                sentence.append(w)
            return sentence

        batch = [trim(ex, self.eos_token) for ex in batch]  # trim past first eos

        def filter_special(tok):
            return tok not in (self.init_token, self.pad_token)

        # Materialize as lists: a bare filter object is a single-use iterator
        # and would be silently exhausted after one pass.
        batch = [[tok for tok in ex if filter_special(tok)] for ex in batch]
        if self.use_revtok:
            return [revtok.detokenize(ex) for ex in batch]
        return [''.join(ex) for ex in batch]
    def reverse(self, batch):
        """Convert a batch of token indices back into space-joined strings.

        Args:
            batch: tensor of token indices, (batch, time) if
                ``self.batch_first`` else (time, batch).

        Returns:
            list of str, one string per example.
        """
        if self.use_revtok:
            try:
                import revtok
            except ImportError:
                print("Please install revtok.")
                raise
        if not self.batch_first:
            batch = batch.t()
        with torch.cuda.device_of(batch):
            batch = batch.tolist()
        # denumericalize: indices -> token strings
        batch = [[self.vocab.itos[ind] for ind in ex]
                 for ex in batch]

        def trim(s, t):
            # Keep tokens up to (but excluding) the first occurrence of t.
            sentence = []
            for w in s:
                if w == t:
                    break
                sentence.append(w)
            return sentence

        batch = [trim(ex, self.eos_token)
                 for ex in batch]  # trim past first eos

        def filter_special(tok):
            return tok not in (self.init_token, self.pad_token)

        # Materialize as lists: a bare filter object is a single-use iterator
        # and would be silently exhausted after one pass.
        batch = [[tok for tok in ex if filter_special(tok)] for ex in batch]
        if self.use_revtok:
            return [revtok.detokenize(ex) for ex in batch]
        return [' '.join(ex) for ex in batch]
def load_real_dataset(dataset_name):
    """Load the train/valid/test splits and pickled vocab for *dataset_name*.

    Returns:
        (train, valid, test, TEXT) where TEXT is the loaded field, augmented
        with ``detokenize``, ``denumericalize`` and ``fix_length``.
    """
    train_filename = "{}_train.txt".format(dataset_name)
    valid_filename = "{}_valid.txt".format(dataset_name)
    test_filename = "{}_test.txt".format(dataset_name)

    import random
    random.seed(42)
    print(train_filename, valid_filename, test_filename)

    TEXT = load(file_name=dataset_name + "_vocab.pkl",
                parent_path=DATASET_PATH)

    def _make_split(fname):
        # One LanguageModelingDataset per split, sharing the TEXT field.
        return LanguageModelingDataset(path=DATASET_PATH + fname,
                                       newline_eos=False,
                                       text_field=TEXT)

    trn = _make_split(train_filename)
    vld = _make_split(valid_filename)
    tst = _make_split(test_filename)

    import revtok

    def denumericalize(batch):
        """Map a tensor of indices to lists of tokens, trimmed of specials."""
        batch = [[TEXT.vocab.itos[ind] for ind in ex] for ex in batch.tolist()]

        def trim(seq, stop):
            # Keep tokens up to (but excluding) the first occurrence of stop.
            kept = []
            for tok in seq:
                if tok == stop:
                    break
                kept.append(tok)
            return kept

        batch = [trim(ex, TEXT.eos_token) for ex in batch]  # trim past first eos

        def keep(tok):
            return tok not in (TEXT.init_token, TEXT.pad_token)

        return [[tok for tok in ex if keep(tok)] for ex in batch]

    TEXT.detokenize = lambda B: [revtok.detokenize(l) for l in B]
    TEXT.denumericalize = denumericalize
    TEXT.fix_length = TEXT.max_length + 1

    lens = [len(x) for x in trn.text]

    print(
        'vocab size: {}\ntrain size: {}\n valid size: {}\n test size: {}\n '
        'min length: {}\n max length: {}\n mean train length: {:.2f}, loaded max length: {}'
        .format(len(TEXT.vocab), len(trn), len(vld), len(tst), np.min(lens),
                np.max(lens), np.mean(lens), TEXT.max_length))
    return trn, vld, tst, TEXT
Example #4
0
def detokenize(tokens):
    """Rebuild a string from annotated tokens via revtok.

    Skips tokens whose ``orig_id`` is None and collapses consecutive tokens
    sharing the same ``orig_id`` into a single ``orig`` surface form.
    """
    words = []
    for idx, tok in enumerate(tokens):
        if tok['orig_id'] is None:
            continue
        if idx and tok['orig_id'] == tokens[idx - 1]['orig_id']:
            continue
        words.append(tok['orig'])
    return revtok.detokenize(words)
Example #5
0
    def reverse(self, batch, src_data=None, att=None, dic_src=None):
        """Convert a batch of token indices back into strings, optionally
        replacing '<unk>' outputs with the most-attended source token.

        Args:
            batch: tensor of target-side token indices, (batch, time) if
                ``self.batch_first`` else (time, batch).
            src_data: source-side indices used to resolve '<unk>' tokens;
                only consulted when ``att`` is given.
            att: attention weights; ``att[j].max(0)`` picks the source
                position with the largest weight at target step j.
            dic_src: source-side field whose vocab maps the attended source
                index back to a token string.

        Returns:
            list of str, one per example.
        """
        if self.use_revtok:
            try:
                import revtok
            except ImportError:
                print("Please install revtok.")
                raise
        if not self.batch_first:
            batch = batch.t()
        with torch.cuda.device_of(batch):
            batch = batch.tolist()
        batch = [[self.vocab.itos[ind] for ind in ex] for ex in batch]  # denumericalize

        if att is not None:
            # Replace each '<unk>' with the source token receiving the
            # highest attention weight at the same time step.
            # NOTE(review): att and src_data are indexed by j only, so every
            # example i reuses the same attention row / source sequence —
            # presumably batch size 1 here; confirm with callers.
            for i in range(len(batch)):
                for j in range(len(batch[i])):
                    if batch[i][j] == '<unk>':
                        _, maxIndex = att[j].max(0)
                        batch[i][j] = dic_src.vocab.itos[src_data[maxIndex[0]]]

        def trim(s, t):
            # Keep tokens up to (excluding) the first occurrence of t; each
            # kept token gets two trailing spaces, which the final ''.join
            # relies on for word separation.
            sentence = []
            for w in s:
                if w == t:
                    break
                sentence.append(w+'  ')
            return sentence

        batch = [trim(ex, self.eos_token) for ex in batch]  # trim past frst eos

        def filter_special(tok):
            return tok not in (self.init_token, self.pad_token)

        # NOTE(review): trim already appended '  ' to every token, so tokens
        # no longer compare equal to init_token/pad_token verbatim — this
        # filter may be a no-op here; confirm intended behavior.
        batch = [filter(filter_special, ex) for ex in batch]
        if self.use_revtok:
            return [revtok.detokenize(ex) for ex in batch]
        return [''.join(ex) for ex in batch]
Example #6
0
 def detokenize(self, tokenized, field_name=None):
     """Join *tokenized* back into a plain string via revtok.

     ``field_name`` is accepted for interface compatibility but unused.
     """
     text = revtok.detokenize(tokenized)
     return text
Example #7
0
        self.inverse_doc_freqs = idf

    def _term_freqs(self, doc):
        counter = Counter(doc)
        for token in doc:
            counter[token] /= len(doc)
        return counter


if __name__ == '__main__':
    # Interactive smoke test: retrieve memories relevant to a typed query.
    import revtok
    from dataset import Dataset

    memory = KeyValueMemory(Dataset())

    banner = 'Interactive memory retrieval. {} to cancel\n'.format(
        colorize('Press CTRL + C', color='white'))
    print(banner)
    try:
        while True:
            tokens = revtok.tokenize(input('> ').strip())
            queries, responses, _ = memory.address(tokens)
            for q_key, r_val in zip(queries, responses):
                print('\nQ: {query}'.format(query=revtok.detokenize(q_key)))
                print('R: {response}'.format(response=revtok.detokenize(r_val)))
            print()
    except (KeyboardInterrupt, EOFError):
        print('\n\nShutting down')