def get_corpora(lang, num_train=500000, num_test=10000, distributed=False):
    """Build train and test data from a corpus with word boundaries.

    The corpus for ``lang`` is loaded with ``word_boundaries=True``, turned
    into a list of (phoneme, precedes_boundary) tuples by
    ``extract_boundaries``, and divided into train and test portions by
    ``corpora.train_test_split`` (called with ``mode='end'``).

    Side effect: the complete list of test boundary markers is written to
    ``<lang>_bounds.pkl`` with ``joblib.dump`` *before* its trailing entry
    is removed, so the file holds one more entry than the returned list.

    Args:
        lang: Corpus identifier passed to ``corpora.get_corpus``; also the
            prefix of the dumped file name.
        num_train: Forwarded to ``corpora.train_test_split``.
        num_test: Forwarded to ``corpora.train_test_split``.
        distributed: Forwarded unchanged to ``prepare``.

    Returns:
        ``((train_in, train_out), (test_in, test_out), test_bounds)``, where
        ``test_bounds`` has had its last entry dropped so that its length
        matches ``test_out``.
    """
    full_corpus = corpora.get_corpus(lang, word_boundaries=True)

    # A list of (phoneme, precedes_boundary) tuples.
    phones_and_boundaries = extract_boundaries(full_corpus)

    # Divide into train and test.
    train, test = corpora.train_test_split(
        phones_and_boundaries, num_train, num_test, mode='end')

    # Separate phones from boundary markers; training boundaries are unused.
    train_phones, _ = map(list, zip(*train))
    test_phones, test_bounds = map(list, zip(*test))

    # Persist the untrimmed test boundaries (the last one is removed below).
    joblib.dump(test_bounds, lang + '_bounds.pkl')

    # Construct targets and encode phonemes.
    train_in, train_out = prepare(train_phones, distributed)
    test_in, test_out = prepare(test_phones, distributed)

    # Remove the trailing bound to match test_out.
    del test_bounds[-1]

    # Internal sanity checks: inputs, targets and bounds must line up.
    assert len(train_in) == len(train_out)
    assert len(test_in) == len(test_out) == len(test_bounds)
    return (train_in, train_out), (test_in, test_out), test_bounds
def get_corpora(lang, kind, train_len, roc_len=100, bleu_len=100):
    """Carve a single utterance stream into train, ROC-test and BLEU-test sets.

    The stream comes from ``pcfg.toy2()`` (each item split on single spaces)
    when ``lang == 'toy2'``, and from ``corpora.get_corpus(lang, kind)``
    otherwise. The three sets are drawn consecutively from that one stream,
    so they do not overlap.

    Args:
        lang: Corpus identifier; ``'toy2'`` selects the PCFG toy corpus.
        kind: Forwarded to ``corpora.get_corpus`` (unused for ``'toy2'``).
        train_len: Number of leading utterances taken for training.
        roc_len: Number of utterances taken for the ROC test set.
        bleu_len: Number of utterances taken for the BLEU test set.

    Returns:
        A dict with keys ``'train'``, ``'roc_test'`` and ``'bleu_test'``,
        each mapping to a list of utterances. Both test sets only contain
        utterances whose length is strictly between 2 and 20; utterances
        skipped by that filter are discarded.

    Raises:
        StopIteration: If the stream runs out before every set is filled.
    """
    if lang != 'toy2':
        stream = corpora.get_corpus(lang, kind)
    else:
        import pcfg
        stream = (sentence.split(' ') for sentence in pcfg.toy2())

    def take(source, count):
        # Pull exactly `count` items, one `next` call at a time.
        taken = []
        for _ in range(count):
            taken.append(next(source))
        return taken

    def has_test_length(utt):
        return 2 < len(utt) < 20

    result = {}
    result['train'] = take(stream, train_len)

    # Each test set reads on from wherever the shared stream currently is.
    roc_candidates = filter(has_test_length, stream)
    result['roc_test'] = take(roc_candidates, roc_len)

    bleu_candidates = filter(has_test_length, stream)
    result['bleu_test'] = take(bleu_candidates, bleu_len)
    return result
def get_model(train_len=1000, lang='english', kind='word', **params):
    """Fit a new ``Numila`` model on the start of a corpus (testing helper).

    Args:
        train_len: Number of leading items drawn from the corpus.
        lang: Corpus identifier passed to ``corpora.get_corpus``.
        kind: Second argument passed to ``corpora.get_corpus``.
        **params: Keyword arguments forwarded to the ``Numila`` constructor.

    Returns:
        Whatever ``Numila.fit`` returns for the drawn training items.

    Raises:
        StopIteration: If the corpus yields fewer than ``train_len`` items.
    """
    # for testing
    learner = Numila(**params)
    stream = corpora.get_corpus(lang, kind)

    training_items = []
    for _ in range(train_len):
        training_items.append(next(stream))

    return learner.fit(training_items)