Exemplo n.º 1
0
def get_datasets(argv):
    """Load the train, dev and test datasets named on ``argv``.

    Each file is read with ``load_dataset`` in the order train, dev, test.
    Only the training call's second return value is kept; the second value
    from the dev and test calls is discarded.

    Per the original note, a dataset is laid out as
    1D: n_docs, 2D: n_utterances,
    3D: elem=(time, speaker_id, addressee_id, response1, ... , label).

    :param argv: object exposing ``train_data``, ``dev_data`` and
        ``test_data`` attributes, each passed to ``load_dataset`` as ``fn``
    :return: (train_dataset, dev_dataset, test_dataset, word_set)
    """
    say('\n\nLoad dataset...')

    train, vocab = load_dataset(fn=argv.train_data)
    # The second value from the held-out splits is intentionally ignored.
    dev, _unused = load_dataset(fn=argv.dev_data)
    test, _unused = load_dataset(fn=argv.test_data)

    return train, dev, test, vocab
Exemplo n.º 2
0
def create_samples(argv, train_dataset, dev_dataset, test_dataset):
    """Turn the three datasets into sample lists via ``get_samples``.

    The training split is converted with ``get_samples``' default ``test``
    flag; the dev and test splits are converted with ``test=True``. Splits
    are processed in the order train, dev, test.

    :param argv: object exposing ``n_prev_sents``, forwarded to every call
    :param train_dataset: threads of the training split
    :param dev_dataset: threads of the development split
    :param test_dataset: threads of the test split
    :return: (train_samples, dev_samples, test_samples);
        samples: 1D: n_samples; elem=Sample()
    """
    window = argv.n_prev_sents

    say('\n\nCreating samples...')

    train_samples = get_samples(threads=train_dataset, n_prev_sents=window)
    # Dev and test share the same settings, so build them in one ordered pass.
    dev_samples, test_samples = [
        get_samples(threads=held_out, n_prev_sents=window, test=True)
        for held_out in (dev_dataset, test_dataset)
    ]

    return train_samples, dev_samples, test_samples
Exemplo n.º 3
0
def main(argv):
    """Run the sample generator: load datasets, build samples, write output.

    The process first changes its working directory to the directory that
    contains ``argv.train_data``; everything afterwards (loading, sample
    creation, output) runs from there.

    :param argv: parsed arguments; ``train_data`` is read here and the whole
        object is forwarded to ``get_datasets`` and ``create_samples``
    """
    say('\nSAMPLE GENERATOR\n')

    # NOTE(review): the directory change happens before the datasets are
    # loaded, so any relative path on argv is presumably resolved against
    # the training file's directory -- confirm this is intended.
    os.chdir(Path(argv.train_data).parent)

    train_dataset, dev_dataset, test_dataset, word_dict = get_datasets(argv)
    train_samples, dev_samples, test_samples = create_samples(
        argv, train_dataset, dev_dataset, test_dataset)

    # Emit each split under its own name, in train/dev/test order.
    named_splits = (
        ('train-sample', train_samples),
        ('dev-sample', dev_samples),
        ('test-sample', test_samples),
    )
    for label, split in named_splits:
        output_samples(label, split)

    output_vocab(word_dict)
Exemplo n.º 4
0
def get_samples(threads, n_prev_sents, test=False):
    """Collect the samples of every thread into one flat list.

    :param threads: 1D: n_threads, 2D: n_sents,
        3D: (time, speaker_id, addressee_id, response, ..., label);
        may be None, in which case nothing is processed
    :param n_prev_sents: forwarded to ``get_one_thread_samples``; the agent
        limit passed alongside it is ``n_prev_sents + 1``
    :param test: forwarded unchanged to ``get_one_thread_samples``
    :return: samples: 1D: n_samples; elem=Sample(), or None when
        ``threads`` is None
    """
    # A missing split yields no sample list at all.
    if threads is None:
        return None

    say('\n\n\tTHREADS: {:>5}'.format(len(threads)))

    agent_limit = n_prev_sents + 1
    collected = []
    for one_thread in threads:
        per_thread = get_one_thread_samples(one_thread, agent_limit,
                                            n_prev_sents, test)
        collected.extend(per_thread)

    return collected