Пример #1
0
def test_train_text(prep):
    """Check that puddle keeps the text intact with and without train text.

    The first 10 utterances of `prep` serve as training text, the
    remaining ones are segmented twice: once with the explicit training
    text and once with ``nfolds=1``. Both results must have as many
    utterances as the input and, once spaces are removed, the same
    content as the input.

    """
    train_text, test_text = prep[:10], prep[10:]

    def strip_spaces(utterances):
        # concatenate all the utterances and drop every space
        return ''.join(utterances).replace(' ', '')

    # offline learning on train_text
    offline = list(puddle.segment(test_text, train_text=train_text))

    # online learning
    online = list(puddle.segment(test_text, nfolds=1))

    assert len(test_text) == len(offline) == len(online)
    assert (
        strip_spaces(test_text)
        == strip_spaces(offline)
        == strip_spaces(online))
Пример #2
0
def test_replicate(datadir):
    """Check that puddle reproduces the reference evaluation scores.

    Reads the first 100 lines of ``tagged.txt`` in `datadir`, derives the
    prepared and gold texts from them with a default `Separator`,
    segments the prepared text with ``nfolds=1`` and compares the
    evaluation against hard-coded expected scores (relative tolerance
    of 1e-3).

    """
    sep = Separator()

    # the context manager guarantees the file is closed once the lines
    # are read (the handle was previously left open)
    with codecs.open(
            os.path.join(datadir, 'tagged.txt'), 'r',
            encoding='utf8') as tagged:
        _tags = [utt for utt in tagged if utt][:100]  # 100 first lines only

    _prepared = prepare(_tags, separator=sep)
    _gold = gold(_tags, separator=sep)

    segmented = puddle.segment(_prepared, nfolds=1)
    score = evaluate(segmented, _gold)

    # we obtained that score from the dibs version in CDSWordSeg
    # (using wordseg.prepare and wordseg.evaluate in both cases)
    expected = {
        'type_fscore': 0.06369,
        'type_precision': 0.1075,
        'type_recall': 0.04525,
        'token_fscore': 0.06295,
        'token_precision': 0.2056,
        'token_recall': 0.03716,
        'boundary_all_fscore': 0.4605,
        'boundary_all_precision': 1.0,
        'boundary_all_recall': 0.2991,
        'boundary_noedge_fscore': 0.02806,
        'boundary_noedge_precision': 1.0,
        'boundary_noedge_recall': 0.01423
    }

    assert score == pytest.approx(expected, rel=1e-3)
Пример #3
0
def test_puddle(prep, window, nfolds, njobs):
    """Check that puddle only alters what `Separator().remove` strips.

    Segments `prep` with the given `window`, `nfolds` and `njobs`, then
    verifies the output has as many utterances as the input and that
    each output utterance equals its input counterpart once both went
    through `Separator().remove`.

    """
    segmented = list(
        puddle.segment(prep, window=window, nfolds=nfolds, njobs=njobs))
    strip = Separator().remove

    assert len(segmented) == len(prep)

    # line numbers in the failure message are 1-based
    for lineno, (got, ref) in enumerate(zip(segmented, prep), start=1):
        assert strip(got) == strip(ref), 'line {}: "{}" != "{}"'.format(
            lineno, strip(got), strip(ref))
Пример #4
0
def test_empty_line(prep):
    """Check that segmenting a text with an empty utterance is rejected.

    The input is built from `prep` with an empty string inserted in
    place of utterances 2 and 3; segmenting it must raise a ValueError
    whose message mentions 'utterance is empty'.

    """
    with pytest.raises(ValueError) as err:
        # consume the result, as the other tests do, so the error
        # surfaces even if the segmentation is evaluated lazily
        list(puddle.segment(prep[:2] + [''] + prep[4:]))

    # inspect the exception message itself: `err` is pytest's
    # ExceptionInfo wrapper, the raised exception is `err.value`
    assert 'utterance is empty' in str(err.value)
Пример #5
0
# NOTE: `stats` and `text` used below are not defined in this excerpt;
# they come from code located before it.

# display the computed statistics
# (dumped as 4-space indented JSON under a '* Statistics' header)
sys.stdout.write(
    '* Statistics\n\n' +
    json.dumps(stats, indent=4) + '\n')

# prepare the input for segmentation
# (materialized as a list: it is passed to every algorithm and to every
# evaluate() call below)
prepared = list(prepare(text))

# generate the gold text
# NOTE(review): this rebinds the name `gold` from the callable used on
# the right-hand side to the resulting list, so gold() cannot be called
# again after this line
gold = list(gold(text))

# segment the prepared text with different algorithms
segmented_baseline = baseline.segment(prepared, probability=0.2)
segmented_tp = tp.segment(prepared, threshold='relative')
segmented_puddle = puddle.segment(prepared, njobs=4, window=2)
segmented_dpseg = dpseg.segment(prepared, nfolds=1, args='--randseed 1')
segmented_ag = ag.segment(prepared, nruns=4, njobs=4, args='-n 100')

# we must provide a trained model to dibs (with stats on diphones)
# (the model is built from `text`, not from the prepared text)
model_dibs = dibs.CorpusSummary(text)
segmented_dibs = dibs.segment(prepared, model_dibs)

# evaluate them against the gold file
# (each call receives the gold list and the prepared text as `units`)
eval_baseline = evaluate(segmented_baseline, gold, units=prepared)
eval_tp = evaluate(segmented_tp, gold, units=prepared)
eval_puddle = evaluate(segmented_puddle, gold, units=prepared)
eval_dpseg = evaluate(segmented_dpseg, gold, units=prepared)
eval_ag = evaluate(segmented_ag, gold, units=prepared)
eval_dibs = evaluate(segmented_dibs, gold, units=prepared)