Example #1
from wordseg.prepare import prepare, gold
from wordseg.separator import Separator

def test_empty_lines():
    text = ['', '']
    assert len(list(prepare(text))) == 0
    assert len(list(gold(text))) == 0

    text = [
        'hh ax l ;esyll ow ;esyll ;eword', '',
        'hh ax l ;esyll ow ;esyll ;eword'
    ]
    assert len(list(prepare(text, separator=Separator(), unit='phone'))) == 2
    assert len(list(gold(text, separator=Separator()))) == 2
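
For reference, prepare() strips the ;esyll and ;eword tags and yields the raw phone sequence, while gold() joins the phones within each word, keeping one space between words. A minimal sketch, with input and output values following the wordseg documentation:

text = ['hh ax l ow ;esyll ;eword w er l d ;esyll ;eword']
print(list(prepare(text, separator=Separator(), unit='phone')))
# ['hh ax l ow w er l d']  (tags removed, one phone per token)
print(list(gold(text, separator=Separator())))
# ['hhaxlow werld']  (phones joined within words, words space-separated)
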
Example #2
import codecs
import os

import pytest

from wordseg.algos import dibs
from wordseg.evaluate import evaluate
from wordseg.prepare import prepare, gold
from wordseg.separator import Separator

def test_replicate_cdswordseg(datadir):
    sep = Separator()

    _tags = [utt for utt in codecs.open(
        os.path.join(datadir, 'tagged.txt'), 'r', encoding='utf8')
            if utt]
    _prepared = prepare(_tags, separator=sep)
    _gold = gold(_tags, separator=sep)
    _train = _tags[:200]

    model = dibs.CorpusSummary(_train)
    segmented = dibs.segment(_prepared, model)
    score = evaluate(segmented, _gold)

    # we obtained that score from the dibs version in CDSWordSeg
    # (using wordseg.prepare and wordseg.evaluate in both cases). You
    # can replicate this result in CDSWordSeg using
    # ".../CDSwordSeg/algoComp/segment.py test/data/tagged.txt -a dibs"
    expected = {
        'type_fscore': 0.2359,
        'type_precision': 0.2084,
        'type_recall': 0.2719,
        'token_fscore': 0.239,
        'token_precision': 0.3243,
        'token_recall': 0.1892,
        'boundary_all_fscore': 0.6543,
        'boundary_all_precision': 0.8377,
        'boundary_all_recall': 0.5367,
        'boundary_noedge_fscore': 0.4804,
        'boundary_noedge_precision': 0.7161,
        'boundary_noedge_recall': 0.3614}

    assert score == pytest.approx(expected, rel=1e-3)
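
evaluate() returns a dict with the same twelve metric names as keys, and pytest.approx compares dicts entry by entry, so the single assertion above checks every score at once with a 0.1% relative tolerance. A self-contained illustration (the numbers here are made up):

import pytest

expected = {'token_fscore': 0.239, 'token_recall': 0.1892}
observed = {'token_fscore': 0.2391, 'token_recall': 0.1893}
assert observed == pytest.approx(expected, rel=1e-3)  # both entries within 0.1%
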
Example #3
import codecs
import os

import pytest

from wordseg.algos import puddle
from wordseg.evaluate import evaluate
from wordseg.prepare import prepare, gold
from wordseg.separator import Separator

def test_replicate(datadir):
    sep = Separator()

    _tags = [
        utt for utt in codecs.open(
            os.path.join(datadir, 'tagged.txt'), 'r', encoding='utf8') if utt
    ][:100]  # first 100 lines only
    _prepared = prepare(_tags, separator=sep)
    _gold = gold(_tags, separator=sep)

    segmented = puddle.segment(_prepared, nfolds=1)
    score = evaluate(segmented, _gold)

    # we obtained that score from the puddle version in CDSWordSeg
    # (using wordseg.prepare and wordseg.evaluate in both cases)
    expected = {
        'type_fscore': 0.06369,
        'type_precision': 0.1075,
        'type_recall': 0.04525,
        'token_fscore': 0.06295,
        'token_precision': 0.2056,
        'token_recall': 0.03716,
        'boundary_all_fscore': 0.4605,
        'boundary_all_precision': 1.0,
        'boundary_all_recall': 0.2991,
        'boundary_noedge_fscore': 0.02806,
        'boundary_noedge_precision': 1.0,
        'boundary_noedge_recall': 0.01423
    }

    assert score == pytest.approx(expected, rel=1e-3)
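
Only nfolds is overridden above; the njobs and window parameters used with puddle in Example #6 can be combined in the same call. A sketch restricted to parameters that appear elsewhere in this listing:

segmented = puddle.segment(_prepared, nfolds=1, njobs=4, window=2)
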
Example #4
import codecs
import os

import pytest

from wordseg.algos import tp
from wordseg.evaluate import evaluate
from wordseg.prepare import prepare, gold
from wordseg.separator import Separator

def test_replicate(datadir):
    sep = Separator()

    _tags = [
        utt for utt in codecs.open(
            os.path.join(datadir, 'tagged.txt'), 'r', encoding='utf8') if utt
    ][:100]  # first 100 lines only
    _prepared = prepare(_tags, separator=sep)
    _gold = gold(_tags, separator=sep)

    segmented = tp.segment(_prepared)
    score = evaluate(segmented, _gold)

    # we obtained that score from the tp version in CDSWordSeg
    # (using wordseg.prepare and wordseg.evaluate in both cases)
    expected = {
        'type_fscore': 0.304,
        'type_precision': 0.2554,
        'type_recall': 0.3756,
        'token_fscore': 0.3994,
        'token_precision': 0.3674,
        'token_recall': 0.4375,
        'boundary_all_fscore': 0.7174,
        'boundary_all_precision': 0.6671,
        'boundary_all_recall': 0.776,
        'boundary_noedge_fscore': 0.6144,
        'boundary_noedge_precision': 0.557,
        'boundary_noedge_recall': 0.685
    }

    assert score == pytest.approx(expected, rel=1e-3)
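
The call above relies on tp's defaults. Per the wordseg documentation, the threshold mode ('relative' or 'absolute') and the dependency measure ('ftp', 'btp' or 'mi') can also be chosen explicitly; a sketch of the variants:

segmented_rel = tp.segment(_prepared, threshold='relative')  # boundary at local TP dips (default)
segmented_abs = tp.segment(_prepared, threshold='absolute')  # boundary below a global threshold
segmented_btp = tp.segment(_prepared, dependency='btp')      # backward transitional probabilities
segmented_mi = tp.segment(_prepared, dependency='mi')        # mutual information
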
Example #5
import codecs
import os

import pytest

import wordseg.utils
from wordseg.algos.dpseg import segment
from wordseg.evaluate import evaluate
from wordseg.prepare import prepare, gold
from wordseg.separator import Separator

def test_replicate_cdswordseg(datadir):
    sep = Separator()

    # only the last 10 lines, for a fast test. We cannot take the
    # first 10 lines because they cause the dpseg_bugfix to correct a
    # fold (the implementation of that fix differs between CDSWordSeg
    # and wordseg, so the results are not replicated exactly)
    _tags = [
        utt for utt in codecs.open(
            os.path.join(datadir, 'tagged.txt'), 'r', encoding='utf8') if utt
    ][-10:]

    _prepared = prepare(_tags, separator=sep, unit='syllable')
    _gold = gold(_tags, separator=sep)

    uni_dmcmc_conf = [
        c for c in wordseg.utils.get_config_files('dpseg') if 'uni_dmcmc' in c
    ][0]
    args = '--ngram 1 --a1 0 --b1 1 -C {}'.format(uni_dmcmc_conf)
    segmented = segment(_prepared, nfolds=5, njobs=4, args=args)
    score = evaluate(segmented, _gold)

    # we obtained those scores from the dpseg version in CDSWordSeg
    expected = {
        'type_fscore': 0.3768,
        'type_precision': 0.3939,
        'type_recall': 0.3611,
        'token_fscore': 0.3836,
        'token_precision': 0.4118,
        'token_recall': 0.359,
        'boundary_all_fscore': 0.7957,
        'boundary_all_precision': 0.8409,
        'boundary_all_recall': 0.7551,
        'boundary_noedge_fscore': 0.6415,
        'boundary_noedge_precision': 0.7083,
        'boundary_noedge_recall': 0.5862
    }

    assert score == pytest.approx(expected, rel=1e-3)
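
get_config_files() resolves the example configuration files shipped with wordseg, which is how the uni_dmcmc setup is located above. A quick way to list what is available (the exact file set depends on the installed wordseg version):

import wordseg.utils

for conf in wordseg.utils.get_config_files('dpseg'):
    print(conf)  # e.g. a path ending in uni_dmcmc.conf
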
Example #6
import json
import sys

from wordseg.algos import ag, baseline, dibs, dpseg, puddle, tp
from wordseg.evaluate import evaluate
from wordseg.prepare import prepare, gold
from wordseg.separator import Separator
from wordseg.statistics import CorpusStatistics

# 'text' is assumed to be the tagged input corpus (a list of utterances),
# loaded before this snippet starts.

# compute some statistics on the input text (text tokenized at phone
# and word levels)
separator = Separator(phone=' ', syllable=None, word=';eword')
stats = CorpusStatistics(text, separator).describe_all()

# display the computed statistics
sys.stdout.write(
    '* Statistics\n\n' +
    json.dumps(stats, indent=4) + '\n')

# prepare the input for segmentation
prepared = list(prepare(text))

# generate the gold text (note this rebinds the name 'gold' from the
# imported function to the resulting list; the function is not needed again)
gold = list(gold(text))

# segment the prepared text with different algorithms
segmented_baseline = baseline.segment(prepared, probability=0.2)
segmented_tp = tp.segment(prepared, threshold='relative')
segmented_puddle = puddle.segment(prepared, njobs=4, window=2)
segmented_dpseg = dpseg.segment(prepared, nfolds=1, args='--randseed 1')
segmented_ag = ag.segment(prepared, nruns=4, njobs=4, args='-n 100')

# we must provide a trained model to dibs (with stats on diphones)
model_dibs = dibs.CorpusSummary(text)
segmented_dibs = dibs.segment(prepared, model_dibs)

# evaluate them against the gold file
eval_baseline = evaluate(segmented_baseline, gold, units=prepared)
eval_tp = evaluate(segmented_tp, gold, units=prepared)
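
The snippet stops after the first two evaluations; the remaining segmentations would be scored the same way (a straightforward continuation of the pattern above, not part of the original listing):

eval_puddle = evaluate(segmented_puddle, gold, units=prepared)
eval_dpseg = evaluate(segmented_dpseg, gold, units=prepared)
eval_ag = evaluate(segmented_ag, gold, units=prepared)
eval_dibs = evaluate(segmented_dibs, gold, units=prepared)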