def test_replicate_cdswordseg(datadir):
    sep = Separator()

    _tags = [
        utt for utt in codecs.open(
            os.path.join(datadir, 'tagged.txt'), 'r', encoding='utf8')
        if utt]
    _prepared = prepare(_tags, separator=sep)
    _gold = gold(_tags, separator=sep)
    _train = _tags[:200]

    model = dibs.CorpusSummary(_train)
    segmented = dibs.segment(_prepared, model)
    score = evaluate(segmented, _gold)

    # we obtained that score from the dibs version in CDSWordSeg
    # (using wordseg.prepare and wordseg.evaluate in both cases). You
    # can replicate this result in CDSWordSeg using
    # ".../CDSwordSeg/algoComp/segment.py test/data/tagged.txt -a dibs"
    expected = {
        'type_fscore': 0.2359,
        'type_precision': 0.2084,
        'type_recall': 0.2719,
        'token_fscore': 0.239,
        'token_precision': 0.3243,
        'token_recall': 0.1892,
        'boundary_all_fscore': 0.6543,
        'boundary_all_precision': 0.8377,
        'boundary_all_recall': 0.5367,
        'boundary_noedge_fscore': 0.4804,
        'boundary_noedge_precision': 0.7161,
        'boundary_noedge_recall': 0.3614}

    assert score == pytest.approx(expected, rel=1e-3)
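# A minimal sketch (not part of the test suite) of how pytest.approx
# behaves on the score dicts compared above: each numeric value is
# checked against the given relative tolerance, so small rounding
# differences in the replicated scores do not fail the assertion.
import pytest

assert {'f': 0.2359} == pytest.approx({'f': 0.23592}, rel=1e-3)  # within tolerance
assert {'f': 0.2359} != pytest.approx({'f': 0.2400}, rel=1e-3)   # out of tolerance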
def test_replicate(datadir):
    sep = Separator()

    _tags = [
        utt for utt in codecs.open(
            os.path.join(datadir, 'tagged.txt'), 'r', encoding='utf8')
        if utt][:100]  # first 100 lines only
    _prepared = prepare(_tags, separator=sep)
    _gold = gold(_tags, separator=sep)

    segmented = puddle.segment(_prepared, nfolds=1)
    score = evaluate(segmented, _gold)

    # we obtained that score from the puddle version in CDSWordSeg
    # (using wordseg.prepare and wordseg.evaluate in both cases)
    expected = {
        'type_fscore': 0.06369,
        'type_precision': 0.1075,
        'type_recall': 0.04525,
        'token_fscore': 0.06295,
        'token_precision': 0.2056,
        'token_recall': 0.03716,
        'boundary_all_fscore': 0.4605,
        'boundary_all_precision': 1.0,
        'boundary_all_recall': 0.2991,
        'boundary_noedge_fscore': 0.02806,
        'boundary_noedge_precision': 1.0,
        'boundary_noedge_recall': 0.01423}

    assert score == pytest.approx(expected, rel=1e-3)
def test_replicate(datadir):
    sep = Separator()

    _tags = [
        utt for utt in codecs.open(
            os.path.join(datadir, 'tagged.txt'), 'r', encoding='utf8')
        if utt][:100]  # first 100 lines only
    _prepared = prepare(_tags, separator=sep)
    _gold = gold(_tags, separator=sep)

    segmented = tp.segment(_prepared)
    score = evaluate(segmented, _gold)

    # we obtained that score from the tp version in CDSWordSeg
    # (using wordseg.prepare and wordseg.evaluate in both cases)
    expected = {
        'type_fscore': 0.304,
        'type_precision': 0.2554,
        'type_recall': 0.3756,
        'token_fscore': 0.3994,
        'token_precision': 0.3674,
        'token_recall': 0.4375,
        'boundary_all_fscore': 0.7174,
        'boundary_all_precision': 0.6671,
        'boundary_all_recall': 0.776,
        'boundary_noedge_fscore': 0.6144,
        'boundary_noedge_precision': 0.557,
        'boundary_noedge_recall': 0.685}

    assert score == pytest.approx(expected, rel=1e-3)
def test_boundary_3():
    text = ['hell o']
    gold = ['h ello']
    score = evaluate(text, gold)
    expected = {
        'boundary_all_precision': 2.0 / 3.0,
        'boundary_all_recall': 2.0 / 3.0,
        'boundary_all_fscore': 2.0 / 3.0,
        'boundary_noedge_precision': 0,
        'boundary_noedge_recall': 0,
        'boundary_noedge_fscore': 0}
    for k, v in expected.items():
        assert score[k] == v, k
def test_boundary_1():
    text = ['hello']
    gold = ['hello']
    score = {k: v for k, v in evaluate(text, gold).items()
             if 'boundary' in k}
    expected = {
        'boundary_all_precision': 1.0,
        'boundary_all_recall': 1.0,
        'boundary_all_fscore': 1.0,
        'boundary_noedge_precision': None,
        'boundary_noedge_recall': None,
        'boundary_noedge_fscore': None}
    for k, v in expected.items():
        assert score[k] == v, k
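# A rough illustration (not the actual wordseg implementation) of the
# boundary computation behind the two tests above: edge boundaries
# always match, so 'hell o' vs 'h ello' scores 2/3 on the 'all'
# metrics, while the single inner boundaries (positions 4 vs 1)
# disagree, giving 0 on 'noedge'; a one-word utterance has no inner
# boundary at all, which is why its 'noedge' scores are None.
def boundaries(utterance):
    """Return the character positions of word boundaries, edges included."""
    positions, pos = {0}, 0
    for word in utterance.split():
        pos += len(word)
        positions.add(pos)
    return positions

assert boundaries('hell o') == {0, 4, 5}
assert boundaries('h ello') == {0, 1, 5}
assert boundaries('hello') == {0, 5}  # edges only, no inner boundary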
def test_replicate_cdswordseg(datadir):
    sep = Separator()

    # only the last 10 lines, for a fast test. We cannot take the
    # first 10 lines because they cause the dpseg_bugfix to correct a
    # fold (the implementation of that fix differs in CDS and wordseg,
    # so the results are not replicated exactly)
    _tags = [
        utt for utt in codecs.open(
            os.path.join(datadir, 'tagged.txt'), 'r', encoding='utf8')
        if utt][-10:]

    _prepared = prepare(_tags, separator=sep, unit='syllable')
    _gold = gold(_tags, separator=sep)

    uni_dmcmc_conf = [
        c for c in wordseg.utils.get_config_files('dpseg')
        if 'uni_dmcmc' in c][0]
    args = '--ngram 1 --a1 0 --b1 1 -C {}'.format(uni_dmcmc_conf)
    segmented = segment(_prepared, nfolds=5, njobs=4, args=args)
    score = evaluate(segmented, _gold)

    # we obtained those scores from the dpseg version in CDSWordSeg
    expected = {
        'type_fscore': 0.3768,
        'type_precision': 0.3939,
        'type_recall': 0.3611,
        'token_fscore': 0.3836,
        'token_precision': 0.4118,
        'token_recall': 0.359,
        'boundary_all_fscore': 0.7957,
        'boundary_all_precision': 0.8409,
        'boundary_all_recall': 0.7551,
        'boundary_noedge_fscore': 0.6415,
        'boundary_noedge_precision': 0.7083,
        'boundary_noedge_recall': 0.5862}

    assert score == pytest.approx(expected, rel=1e-3)
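# For reference, get_config_files (used in the test above) lists the
# example configuration files shipped with wordseg; the exact paths
# printed depend on the installation:
import wordseg.utils

for conf in wordseg.utils.get_config_files('dpseg'):
    print(conf)  # one of these paths contains 'uni_dmcmc'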
def _test_basic(text, gold, units, expected):
    pred = evaluate(text, gold, units=units)
    assert pred == pytest.approx(expected)
def test_ipa():
    text = ['juːviː mɔː kʊkɪz ']
    gold = ['juː viː mɔː kʊkɪz']
    evaluate(text, gold)
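# evaluate returns a plain dict of scores; a quick sanity check on the
# IPA example above (a sketch, assuming the same imports as the tests):
score = evaluate(['juːviː mɔː kʊkɪz '], ['juː viː mɔː kʊkɪz'])
print(sorted(score.keys()))  # the type_*, token_* and boundary_* keys used above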
def test_gold_on_gold():
    gold = ['the dog bites the dog']
    for v in evaluate(gold, gold).values():
        assert v == 1.0
# generate the gold text
gold = list(gold(text))

# segment the prepared text with different algorithms
segmented_baseline = baseline.segment(prepared, probability=0.2)
segmented_tp = tp.segment(prepared, threshold='relative')
segmented_puddle = puddle.segment(prepared, njobs=4, window=2)
segmented_dpseg = dpseg.segment(prepared, nfolds=1, args='--randseed 1')
segmented_ag = ag.segment(prepared, nruns=4, njobs=4, args='-n 100')

# we must provide a trained model to dibs (with stats on diphones)
model_dibs = dibs.CorpusSummary(text)
segmented_dibs = dibs.segment(prepared, model_dibs)

# evaluate them against the gold file
eval_baseline = evaluate(segmented_baseline, gold, units=prepared)
eval_tp = evaluate(segmented_tp, gold, units=prepared)
eval_puddle = evaluate(segmented_puddle, gold, units=prepared)
eval_dpseg = evaluate(segmented_dpseg, gold, units=prepared)
eval_ag = evaluate(segmented_ag, gold, units=prepared)
eval_dibs = evaluate(segmented_dibs, gold, units=prepared)

# a little function to display a score with 4-digit precision
def display(score):
    if score is None:
        return 'None'
    return '%.4g' % score
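# A possible way to use the display helper above: print a tab-separated
# summary table of all the evaluations (this loop assumes every
# evaluation dict shares the same score keys):
print('\t'.join(['score', 'baseline', 'tp', 'puddle', 'dpseg', 'ag', 'dibs']))
for metric in sorted(eval_baseline.keys()):
    scores = (eval_baseline[metric], eval_tp[metric], eval_puddle[metric],
              eval_dpseg[metric], eval_ag[metric], eval_dibs[metric])
    print('\t'.join([metric] + [display(s) for s in scores]))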