def test_train_text(prep):
    """Check that offline and online puddle runs keep the test text intact.

    The first 10 utterances of `prep` are used as training data and the
    remaining ones as test data. The test data is segmented twice: once
    with a model trained on the training data, once with `nfolds=1` and
    no training data. Both outputs must have one utterance per input
    utterance and, once spaces are dropped, the same characters as the
    input.
    """
    def _without_spaces(utterances):
        # concatenate all the utterances and drop every space
        return ''.join(utterances).replace(' ', '')

    learning_part, held_out = prep[:10], prep[10:]

    # offline learning on train_text
    offline = list(puddle.segment(held_out, train_text=learning_part))

    # online learning
    online = list(puddle.segment(held_out, nfolds=1))

    # same number of utterances in the input and in both outputs
    assert len(held_out) == len(offline) == len(online)

    # segmentation only moves spaces, it never alters other characters
    assert (
        _without_spaces(held_out)
        == _without_spaces(offline)
        == _without_spaces(online))
def test_replicate(datadir):
    """Check puddle scores on 'tagged.txt' against reference values.

    Reads the first 100 lines of `tagged.txt` found in `datadir`,
    derives the prepared and gold versions of that text, segments the
    prepared text with puddle (`nfolds=1`) and compares the evaluation
    scores to the expected ones with a relative tolerance of 1e-3.
    """
    sep = Separator()

    # the context manager guarantees the file is closed once the lines
    # are read (iterating a bare codecs.open() never closes the handle)
    with codecs.open(
            os.path.join(datadir, 'tagged.txt'), 'r',
            encoding='utf8') as tagged:
        # 100 first lines only
        _tags = [utt for utt in tagged if utt][:100]

    _prepared = prepare(_tags, separator=sep)
    _gold = gold(_tags, separator=sep)

    segmented = puddle.segment(_prepared, nfolds=1)
    score = evaluate(segmented, _gold)

    # we obtained that score from the dibs version in CDSWordSeg
    # (using wordseg.prepare and wordseg.evaluate in both cases)
    expected = {
        'type_fscore': 0.06369,
        'type_precision': 0.1075,
        'type_recall': 0.04525,
        'token_fscore': 0.06295,
        'token_precision': 0.2056,
        'token_recall': 0.03716,
        'boundary_all_fscore': 0.4605,
        'boundary_all_precision': 1.0,
        'boundary_all_recall': 0.2991,
        'boundary_noedge_fscore': 0.02806,
        'boundary_noedge_precision': 1.0,
        'boundary_noedge_recall': 0.01423
    }

    assert score == pytest.approx(expected, rel=1e-3)
def test_puddle(prep, window, nfolds, njobs):
    """Check that puddle only changes separators, never the content.

    Segments `prep` with the given `window`, `nfolds` and `njobs`, then
    verifies the output has as many utterances as the input and that
    each output utterance equals its input utterance once both went
    through `Separator().remove`.
    """
    segmented = list(
        puddle.segment(prep, window=window, nfolds=nfolds, njobs=njobs))
    strip = Separator().remove

    assert len(segmented) == len(prep)

    # lines are numbered from 1 in the failure message
    for lineno, (result, source) in enumerate(zip(segmented, prep), start=1):
        got, wanted = strip(result), strip(source)
        assert got == wanted, 'line {}: "{}" != "{}"'.format(
            lineno, got, wanted)
def test_empty_line(prep):
    """Check that puddle rejects a text containing an empty utterance.

    An empty string is spliced into `prep` (in place of items 2 and 3)
    and `puddle.segment` is expected to raise a ValueError whose message
    mentions 'utterance is empty'.
    """
    with pytest.raises(ValueError) as err:
        puddle.segment(prep[:2] + [''] + prep[4:])

    # the check must live outside the `with` block: statements placed
    # after the raising call inside the block are never executed. Inspect
    # the exception itself (err.value) rather than the ExceptionInfo
    # wrapper, whose string form depends on the pytest version.
    assert 'utterance is empty' in str(err.value)
# display the computed statistics sys.stdout.write( '* Statistics\n\n' + json.dumps(stats, indent=4) + '\n') # prepare the input for segmentation prepared = list(prepare(text)) # generate the gold text gold = list(gold(text)) # segment the prepared text with different algorithms segmented_baseline = baseline.segment(prepared, probability=0.2) segmented_tp = tp.segment(prepared, threshold='relative') segmented_puddle = puddle.segment(prepared, njobs=4, window=2) segmented_dpseg = dpseg.segment(prepared, nfolds=1, args='--randseed 1') segmented_ag = ag.segment(prepared, nruns=4, njobs=4, args='-n 100') # we must provide a trained model to dibs (with stats on diphones) model_dibs = dibs.CorpusSummary(text) segmented_dibs = dibs.segment(prepared, model_dibs) # evaluate them against the gold file eval_baseline = evaluate(segmented_baseline, gold, units=prepared) eval_tp = evaluate(segmented_tp, gold, units=prepared) eval_puddle = evaluate(segmented_puddle, gold, units=prepared) eval_dpseg = evaluate(segmented_dpseg, gold, units=prepared) eval_ag = evaluate(segmented_ag, gold, units=prepared) eval_dibs = evaluate(segmented_dibs, gold, units=prepared)