import os
from pathlib import Path


def get_datasets(argv):
    # dataset: 1D: n_docs, 2D: n_utterances, 3D: elem=(time, speaker_id, addressee_id, response1, ..., label)
    say('\n\nLoad dataset...')
    train_dataset, word_set = load_dataset(fn=argv.train_data)
    dev_dataset, _ = load_dataset(fn=argv.dev_data)
    test_dataset, _ = load_dataset(fn=argv.test_data)
    return train_dataset, dev_dataset, test_dataset, word_set
def create_samples(argv, train_dataset, dev_dataset, test_dataset):
    n_prev_sents = argv.n_prev_sents

    # samples: 1D: n_samples; elem=Sample()
    say('\n\nCreating samples...')
    train_samples = get_samples(threads=train_dataset, n_prev_sents=n_prev_sents)
    dev_samples = get_samples(threads=dev_dataset, n_prev_sents=n_prev_sents, test=True)
    test_samples = get_samples(threads=test_dataset, n_prev_sents=n_prev_sents, test=True)
    return train_samples, dev_samples, test_samples
def main(argv):
    say('\nSAMPLE GENERATOR\n')

    path = Path(argv.train_data)
    os.chdir(path.parent)

    train_dataset, dev_dataset, test_dataset, word_dict = get_datasets(argv)
    train_samples, dev_samples, test_samples = create_samples(
        argv, train_dataset, dev_dataset, test_dataset)

    # n_cands = len(train_samples[0].response)
    # n_prev_sents = argv.n_prev_sents

    output_samples('train-sample', train_samples)
    output_samples('dev-sample', dev_samples)
    output_samples('test-sample', test_samples)
    output_vocab(word_dict)
def get_samples(threads, n_prev_sents, test=False):
    """
    :param threads: 1D: n_threads, 2D: n_sents, 3D: (time, speaker_id, addressee_id, response, ..., label)
    :return: samples: 1D: n_samples; elem=Sample()
    """
    if threads is None:
        return None

    say('\n\n\tTHREADS: {:>5}'.format(len(threads)))

    samples = []
    max_n_agents = n_prev_sents + 1

    for thread in threads:
        samples += get_one_thread_samples(thread, max_n_agents, n_prev_sents, test)

    # sample_statistics(samples, max_n_agents)
    return samples
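
# A minimal entry-point sketch, not part of the original file: it only shows how
# main() could be invoked given the attributes the code above reads from argv
# (train_data, dev_data, test_data, n_prev_sents). The flag names and the
# default for --n_prev_sents are assumptions; the project's real argument
# parser may differ.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description='Sample generator')
    parser.add_argument('--train_data', required=True, help='path to the training threads')
    parser.add_argument('--dev_data', default=None, help='path to the development threads')
    parser.add_argument('--test_data', default=None, help='path to the test threads')
    parser.add_argument('--n_prev_sents', type=int, default=5,
                        help='number of previous utterances kept as context (assumed default)')
    main(parser.parse_args())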