def prepare_s2b_dataset(data_dir,
                        data_dict,
                        max_src_vocab=16000,
                        max_tgt_vocab=300,
                        vocab_freq_cutoff=1):
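    """Read the raw s2b train/dev/test splits, build source and target
    vocabularies from the training set, and pickle the example lists
    and the vocabulary into data_dir."""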
    train_set = Dataset.from_raw_file(
        os.path.join(data_dir, data_dict['train']))
    dev_set = Dataset.from_raw_file(os.path.join(data_dir, data_dict['dev']))
    test_set = Dataset.from_raw_file(os.path.join(data_dir, data_dict['test']))

    # generate vocabulary
    src_vocab = VocabEntry.from_corpus([e.src for e in train_set],
                                       size=max_src_vocab,
                                       freq_cutoff=vocab_freq_cutoff)
    tgt_vocab = VocabEntry.from_corpus([e.tgt for e in train_set],
                                       size=max_tgt_vocab,
                                       freq_cutoff=vocab_freq_cutoff)

    vocab = Vocab(src=src_vocab, tgt=tgt_vocab)
    print('generated vocabulary %s' % repr(vocab), file=sys.stderr)

    print("sum info: train:{},dev:{},test:{}".format(
        len(train_set),
        len(dev_set),
        len(test_set),
    ))
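    # per-split details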
    detail(train_set)
    detail(dev_set)
    detail(test_set)

    train_file = os.path.join(data_dir, "train.bin")
    dev_file = os.path.join(data_dir, "dev.bin")
    test_file = os.path.join(data_dir, "test.bin")
    vocab_file = os.path.join(data_dir, "vocab.bin")

    with open(train_file, 'wb') as f:
        pickle.dump(train_set.examples, f)
    with open(dev_file, 'wb') as f:
        pickle.dump(dev_set.examples, f)
    with open(test_file, 'wb') as f:
        pickle.dump(test_set.examples, f)
    with open(vocab_file, 'wb') as f:
        pickle.dump(vocab, f)
    if 'debug' in data_dict:
        debug_set = Dataset.from_raw_file(
            os.path.join(data_dir, data_dict['debug']))
        debug_file = os.path.join(data_dir, "debug.bin")
        with open(debug_file, 'wb') as f:
            pickle.dump(debug_set.examples, f)


def prepare_ptb_to_distance(data_dir, data_dict):
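    """Read the PTB-format train/dev/test/debug splits and pickle each
    split's example list into data_dir."""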
    train_set = Dataset.from_raw_file(os.path.join(data_dir,
                                                   data_dict['train']),
                                      e_type='ptb')
    dev_set = Dataset.from_raw_file(os.path.join(data_dir, data_dict['dev']),
                                    e_type='ptb')
    test_set = Dataset.from_raw_file(os.path.join(data_dir, data_dict['test']),
                                     e_type='ptb')
    debug_set = Dataset.from_raw_file(os.path.join(data_dir,
                                                   data_dict['debug']),
                                      e_type='ptb')

    train_file = os.path.join(data_dir, "train.bin")
    dev_file = os.path.join(data_dir, "dev.bin")
    test_file = os.path.join(data_dir, "test.bin")
    debug_file = os.path.join(data_dir, "debug.bin")

    with open(train_file, 'wb') as f:
        pickle.dump(train_set.examples, f)
    with open(dev_file, 'wb') as f:
        pickle.dump(dev_set.examples, f)
    with open(test_file, 'wb') as f:
        pickle.dump(test_set.examples, f)
    with open(debug_file, 'wb') as f:
        pickle.dump(debug_set.examples, f)


def prepare_raw_data(data_dir, data_dict):
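    """Pickle every raw file listed in data_dict as <path>.bin next to
    the raw file."""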
    for val in data_dict.values():
        path = os.path.join(data_dir, val)
        data = Dataset.from_raw_file(path)
        out_file = path + ".bin"
        with open(out_file, 'wb') as f:
            pickle.dump(data.examples, f)
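

# Example usage (a sketch; the data directory and raw file names below are
# hypothetical, not taken from this repo):
#
#     prepare_s2b_dataset('data/s2b', {'train': 'train.txt',
#                                      'dev': 'dev.txt',
#                                      'test': 'test.txt'})
#
# This writes train.bin, dev.bin, test.bin, and vocab.bin under data/s2b.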