Exemplo n.º 1
0
def get_corpora(lang, num_train=500000, num_test=10000, distributed=False):
    """Build train/test phoneme data for `lang` plus the test boundary labels.

    The corpus is loaded with word boundaries, converted to
    (phoneme, precedes_boundary) tuples by `extract_boundaries`, split into
    train and test portions, and each portion's phonemes are passed through
    `prepare` to obtain inputs and targets.

    Side effect: the full list of test boundary labels (before the trailing
    element is dropped) is written to ``<lang>_bounds.pkl`` with
    `joblib.dump`, relative to the current working directory.

    Args:
        lang: Corpus identifier passed to `corpora.get_corpus`; also used as
            the prefix of the dumped boundaries file.
        num_train: Forwarded to `corpora.train_test_split`.
        num_test: Forwarded to `corpora.train_test_split`.
        distributed: Forwarded unchanged to `prepare`.

    Returns:
        ``((train_in, train_out), (test_in, test_out), test_bounds)`` where
        `test_bounds` has had its last element removed so that its length
        equals that of `test_out`.
    """
    full_corpus = corpora.get_corpus(lang, word_boundaries=True)

    # A list of (phoneme, precedes_boundary) tuples.
    phones_and_boundaries = extract_boundaries(full_corpus)

    # Divide into train and test.
    train, test = corpora.train_test_split(phones_and_boundaries,
                                           num_train, num_test, mode='end')

    # Separate phones from boundary markers.
    train_phones, _ = map(list, zip(*train))
    test_phones, test_bounds = map(list, zip(*test))

    # Persist the complete test boundary labels. A stray bare `return` used
    # to follow this call, which made the function always return None and
    # left everything below unreachable.
    joblib.dump(test_bounds, lang + '_bounds.pkl')

    # Construct targets and encode phonemes.
    train_in, train_out = prepare(train_phones, distributed)
    test_in, test_out = prepare(test_phones, distributed)

    # Remove the trailing bound to match test_out.
    del test_bounds[-1]
    assert len(train_in) == len(train_out)
    assert len(test_in) == len(test_out) == len(test_bounds)

    return (train_in, train_out), (test_in, test_out), test_bounds
Exemplo n.º 2
0
def get_corpora(lang, kind, train_len, roc_len=100, bleu_len=100):
    """Split one corpus stream into train, ROC-test and BLEU-test lists.

    All three lists are drawn, in that order, from the same underlying
    iterator, so they never share an utterance instance: the training set
    takes the first `train_len` utterances, and the two test sets take the
    following utterances whose length is strictly between 2 and 20.

    For ``lang == 'toy2'`` the utterances come from `pcfg.toy2()`, each
    sentence split on single spaces; otherwise they come from
    `corpora.get_corpus(lang, kind)`.

    Returns:
        A dict with keys ``'train'``, ``'roc_test'`` and ``'bleu_test'``.
    """
    def take(stream, count):
        # Pull exactly `count` items from the iterator, one at a time.
        return [next(stream) for _ in range(count)]

    def mid_length(stream):
        # Lazily keep only utterances with 2 < len < 20.
        return (utt for utt in stream if 2 < len(utt) < 20)

    if lang != 'toy2':
        corpus = corpora.get_corpus(lang, kind)
    else:
        import pcfg
        corpus = (s.split(' ') for s in pcfg.toy2())

    # Order matters: each call advances the shared `corpus` iterator.
    splits = {}
    splits['train'] = take(corpus, train_len)
    splits['roc_test'] = take(mid_length(corpus), roc_len)
    splits['bleu_test'] = take(mid_length(corpus), bleu_len)
    return splits
Exemplo n.º 3
0
def get_model(train_len=1000, lang='english', kind='word', **params):
    """Testing helper: fit a `Numila` model on the start of a corpus.

    A `Numila` instance is created from `**params`, then the first
    `train_len` utterances of `corpora.get_corpus(lang, kind)` are collected
    and handed to `model.fit`.

    Returns:
        Whatever `model.fit` returns.
    """
    # for testing
    model = Numila(**params)

    utterances = corpora.get_corpus(lang, kind)
    training_set = []
    for _ in range(train_len):
        training_set.append(next(utterances))

    return model.fit(training_set)