Example #1
def load_penn(path,
              batch_size,
              max_size=1000000,
              min_freq=1,
              gpu=False,
              shuffle=True):
    train_data = load_lines(os.path.join(path, 'train.txt'))
    valid_data = load_lines(os.path.join(path, 'valid.txt'))
    test_data = load_lines(os.path.join(path, 'test.txt'))

    d = Dict(pad_token=u.PAD,
             eos_token=u.EOS,
             bos_token=u.BOS,
             max_size=max_size,
             min_freq=min_freq)
    d.fit(train_data, valid_data)

    train = PairedDataset(train_data, None, {'src': d}, batch_size, gpu=gpu)
    valid = PairedDataset(valid_data,
                          None, {'src': d},
                          batch_size,
                          gpu=gpu,
                          evaluation=True)
    test = PairedDataset(test_data,
                         None, {'src': d},
                         batch_size,
                         gpu=gpu,
                         evaluation=True)

    return train.sort_(), valid.sort_(), test.sort_()
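
A minimal usage sketch, assuming the directory passed as path contains the train.txt/valid.txt/test.txt files the loader reads; the path and batch size below are placeholders:

# hypothetical call; 'data/penn' and batch_size=20 are placeholders
train, valid, test = load_penn('data/penn', batch_size=20)
print(' * number of train batches. %d' % len(train))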
Example #2
def load_dataset(src,
                 trg,
                 batch_size,
                 max_size=100000,
                 min_freq=5,
                 gpu=False,
                 shuffle=True,
                 sort_key=default_sort_key,
                 **kwargs):
    """
    Wrapper function for dataset construction with sensible, overridable defaults
    """
    tweets_dict = Dict(pad_token='<pad>',
                       eos_token='<eos>',
                       bos_token='<bos>',
                       max_size=max_size,
                       min_freq=min_freq)
    labels_dict = Dict(sequential=False, force_unk=False)
    tweets_dict.fit(src)
    labels_dict.fit(trg)
    d = {'src': tweets_dict, 'trg': labels_dict}
    splits = PairedDataset(src, trg, d, batch_size,
                           gpu=gpu).splits(shuffle=shuffle,
                                           sort_key=sort_key,
                                           **kwargs)
    return splits
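
A minimal usage sketch, assuming src is a list of tokenized tweets and trg the matching labels; the toy data below is invented and min_freq is lowered so nothing gets pruned from the vocabulary:

# toy inputs; a real corpus loader would supply these
src = [['good', 'morning'], ['so', 'tired'], ['love', 'this']]
trg = ['pos', 'neg', 'pos']
splits = load_dataset(src, trg, batch_size=2, min_freq=1)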
Example #3
def load_split_data(path, batch_size, max_size, min_freq, max_len, device, processor):
    """
    Load a corpus that has already been split into 'train.txt', 'valid.txt', 'test.txt'
    """
    train = load_lines(os.path.join(path, 'train.txt'), max_len, processor)
    valid = load_lines(os.path.join(path, 'valid.txt'), max_len, processor)

    d = Dict(
        pad_token=u.PAD, eos_token=u.EOS, bos_token=u.BOS,
        max_size=max_size, min_freq=min_freq, force_unk=True
    ).fit(train, valid)

    # re-read the splits before building the datasets (in case load_lines
    # returns one-shot iterators that the fit() call above already consumed)
    train = load_lines(os.path.join(path, 'train.txt'), max_len, processor)
    valid = load_lines(os.path.join(path, 'valid.txt'), max_len, processor)
    test = load_lines(os.path.join(path, 'test.txt'), max_len, processor)
    train = PairedDataset(train, None, {'src': d}, batch_size, device=device)
    valid = PairedDataset(valid, None, {'src': d}, batch_size, device=device)
    test = PairedDataset(test, None, {'src': d}, batch_size, device=device)

    return train.sort_(), valid.sort_(), test.sort_()
Example #4
def load_penn(path, batch_size,
              max_size=1000000, min_freq=1, gpu=False, shuffle=True,
              sort_key=lambda pair: len(pair[0])):
    train_data = load_lines(os.path.join(path, 'train.txt'))
    train_labels = make_mock_labels(train_data)
    valid_data = load_lines(os.path.join(path, 'valid.txt'))
    valid_labels = make_mock_labels(valid_data)
    test_data = load_lines(os.path.join(path, 'test.txt'))
    test_labels = make_mock_labels(test_data)
    ldict = Dict(pad_token=u.PAD, eos_token=u.EOS, bos_token=u.BOS,
                 max_size=max_size, min_freq=min_freq)
    ldict.fit(train_data, valid_data)
    mock = Dict().fit(train_labels)
    d = {'src': ldict, 'trg': mock}
    train = PairedDataset(train_data, train_labels, d, batch_size,
                          gpu=gpu).sort_(sort_key=sort_key)
    valid = PairedDataset(valid_data, valid_labels, d, batch_size, gpu=gpu,
                          evaluation=True).sort_(sort_key=sort_key)
    test = PairedDataset(test_data, test_labels, d, batch_size, gpu=gpu,
                         evaluation=True).sort_(sort_key=sort_key)
    return train, valid, test
Example #5
def load_from_lines(
        path, batch_size, max_size=1000000, min_freq=5, gpu=False,
        shuffle=True, sort_key=lambda x: len(x[0]), **kwargs):
    lines = load_lines(path)
    ldict = Dict(pad_token=u.PAD, eos_token=u.EOS, bos_token=u.BOS,
                 max_size=max_size, min_freq=min_freq)
    ldict.fit(lines)
    mock_labels = make_mock_labels(lines)
    mock = Dict()
    mock.fit(mock_labels)
    d = {'src': ldict, 'trg': mock}
    splits = PairedDataset(lines, mock_labels, d, batch_size, gpu=gpu).splits(
        shuffle=shuffle, sort_key=sort_key, **kwargs)
    return splits
Example #6
def load_from_lines(path,
                    batch_size,
                    max_size=1000000,
                    min_freq=5,
                    gpu=False,
                    shuffle=True,
                    **kwargs):
    lines = load_lines(path)

    ldict = Dict(pad_token=u.PAD,
                 eos_token=u.EOS,
                 bos_token=u.BOS,
                 max_size=max_size,
                 min_freq=min_freq).fit(lines)

    return PairedDataset(lines, None, {
        'src': ldict
    }, batch_size, gpu=gpu).splits(shuffle=shuffle, **kwargs)
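
A minimal usage sketch; 'corpus.txt' stands in for a plain-text file with one line per example, and how many splits come back depends on the keyword arguments forwarded to .splits():

# hypothetical file; extra split arguments would travel through **kwargs
splits = load_from_lines('corpus.txt', batch_size=32)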
Example #7
def shingle_dataset(args, vocab_dict=None, focus_size=None, right_size=None):
    if focus_size:
        args.focus_size = focus_size
    if right_size:
        args.right_size = right_size

    # load the data:
    if args.task == 'sentences':
        dataset = list(
            SentenceCouples(args.input,
                            max_items=args.max_items,
                            tokenize=args.tokenize,
                            level=args.level))
        print(f'* loaded {len(dataset)} sentences')
    elif args.task == 'snippets':
        dataset = list(
            SnippetCouples(args.input,
                           focus_size=args.focus_size,
                           right_size=args.right_size,
                           max_items=args.max_items))
        print(f'* loaded {len(dataset)} snippets')
    else:
        raise ValueError("`Task` should be one of ('sentences', 'snippets')")

    # random shuffle:
    if args.shuffle:
        print('* shuffling batches...')
        random.seed(args.rnd_seed)
        random.shuffle(dataset)

    for c in dataset[:10]:
        print('\t'.join(' '.join(s[:10]) for s in c))

    if vocab_dict is None:
        vocab_dict = Dict(pad_token=u.PAD,
                          bos_token=u.BOS,
                          eos_token=u.EOS,
                          min_freq=args.min_item_freq,
                          sequential=True,
                          force_unk=True,
                          max_size=args.max_vocab_size)

    focus, right = zip(*dataset)
    del dataset
    if not vocab_dict.fitted:
        vocab_dict.fit(
            focus, right
        )  # sometimes inefficient? # do a partial fit in the triple store?

    train, valid = PairedDataset(src=(focus, ),
                                 trg=(right, ),
                                 d={
                                     'src': (vocab_dict, ),
                                     'trg': (vocab_dict, )
                                 },
                                 batch_size=args.batch_size,
                                 gpu=args.gpu,
                                 align_right=args.reverse,
                                 fitted=False).splits(sort_by='src',
                                                      dev=args.dev,
                                                      test=None,
                                                      sort=True)

    return train, valid, vocab_dict
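
A minimal usage sketch; shingle_dataset reads its configuration from an argparse-style namespace, so every field value below is a placeholder chosen only to cover the attributes the function touches:

from argparse import Namespace

# hypothetical configuration; adjust the values to the real CLI defaults
args = Namespace(task='sentences', input='corpus.txt', max_items=None,
                 tokenize=True, level='word', shuffle=True, rnd_seed=42,
                 min_item_freq=1, max_vocab_size=50000, batch_size=32,
                 gpu=False, reverse=False, dev=0.1,
                 focus_size=None, right_size=None)
train, valid, vocab_dict = shingle_dataset(args)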
Example #8
            dataset = PairedDataset.from_disk(f)
        dataset.set_batch_size(args.batch_size)
        dataset.set_gpu(args.gpu)
        train, valid = dataset.splits(sort_by='src', dev=args.dev, test=None)
        src_dict = dataset.dicts['src']
    else:
        src, trg = zip(*d.generate_set(size, vocab, args.min_len, args.max_len,
                                       sample_fn))
        src, trg = list(map(list, src)), list(map(list, trg))
        src_dict = Dict(pad_token=u.PAD, eos_token=u.EOS, bos_token=u.BOS)
        src_dict.fit(src, trg)
        train, valid = PairedDataset(src,
                                     trg, {
                                         'src': src_dict,
                                         'trg': src_dict
                                     },
                                     batch_size=args.batch_size,
                                     gpu=args.gpu).splits(dev=args.dev,
                                                          test=None,
                                                          sort_by='src')

    print(' * vocabulary size. %d' % len(src_dict))
    print(' * number of train batches. %d' % len(train))
    print(' * maximum batch size. %d' % args.batch_size)

    print('Building model...')

    model = EncoderDecoder((args.layers, args.dec_layers),
                           args.emb_dim,
                           args.hid_dim,
                           args.att_dim,
Example #9
    parser.add_argument('--pretrained', type=str, default='empty')

    # Logging
    parser.add_argument('--gen_src', default=None)
    parser.add_argument('--gen_tgt', default=None)
    parser.add_argument('--csv', type=str, default='empty')
    parser.add_argument('--logging', action='store_true')
    parser.add_argument('--visdom', action='store_true')
    args = parser.parse_args()

    src, trg = load_data(args.path, ('.answers', '.questions'))
    src_dict = Dict(pad_token=u.PAD, eos_token=u.EOS, bos_token=u.BOS,
                    max_size=args.max_size, min_freq=args.min_freq)
    src_dict.fit(src, trg)
    train, valid = PairedDataset(
        src, trg, {'src': src_dict, 'trg': src_dict},
        batch_size=args.batch_size, gpu=args.gpu
    ).splits(dev=args.dev, test=None, sort_key=lambda pair: len(pair[0]))

    print(' * vocabulary size. %d' % len(src_dict))
    print(' * number of train batches. %d' % len(train))
    print(' * maximum batch size. %d' % args.batch_size)

    print('Building model...')
    model = EncoderDecoder(
        # changed from (args.hid_dim, args.hid_dim) to args.hid_dim
        (args.layers, args.layers), args.emb_dim, args.hid_dim,
        args.att_dim, src_dict, att_type=args.att_type, dropout=args.dropout,
        bidi=args.bidi, cell=args.cell)

    # Load Glove Pretrained Embeddings
Example #10
    for target in args.targets:
        sample_fn = wrap_autoencode(getattr(d, target))
        src, trg = zip(*d.generate_set(
            args.train_len, args.vocab, args.min_len, args.max_len, sample_fn))
        src, trg = list(map(list, src)), list(map(list, trg))
        datasets[target] = {'src': src, 'trg': trg}

    src_dict = Dict(pad_token=u.PAD, eos_token=u.EOS, bos_token=u.BOS)
    src_dict.fit(*[data
                   for target in datasets
                   for data in datasets[target].values()])

    # iterate over a snapshot of the keys: the loop body deletes and re-inserts entries
    for target in list(datasets):
        train, valid = PairedDataset(
            datasets[target]['src'], datasets[target]['trg'],
            {'src': src_dict, 'trg': src_dict},
            batch_size=args.batch_size, gpu=args.gpu).splits(
                dev=args.dev, test=None,
                shuffle=True, sort_key=lambda pair: len(pair[0]))
        del datasets[target]
        src, trg = zip(*d.generate_set(
            int(args.train_len * 0.1), args.vocab, args.min_len, args.max_len,
            getattr(d, target)))
        src, trg = list(map(list, src)), list(map(list, trg))
        test = PairedDataset(src, trg, {'src': src_dict, 'trg': src_dict},
                             batch_size=args.batch_size, gpu=args.gpu)
        datasets[target] = {'train': train, 'valid': valid, 'test': test}

    print('Building model...')
    model = ForkableMultiTarget(
        (args.layers, args.layers), args.emb_dim, (args.hid_dim, args.hid_dim),
        args.att_dim, src_dict, att_type=args.att_type, dropout=args.dropout,