예제 #1
def test_prepare_bad_types():
    """prepare() must raise AttributeError on a dict or a list of int."""
    # a dict is not a valid input
    dict_input = {1: 1, 2: 2}
    with pytest.raises(AttributeError):
        list(prepare(dict_input))

    # neither is a list of integers, even with an explicit separator
    int_input = [1, 2]
    with pytest.raises(AttributeError):
        list(prepare(int_input, separator=Separator()))
예제 #2
def test_prepare_tolerant():
    """Check the effect of the `tolerant` flag of prepare().

    With tolerant=False the input must raise ValueError; with
    tolerant=True the output must equal the preparation of
    GOOD_UTTERANCES alone.
    """
    # NOTE(review): `utterances` is not defined in this snippet --
    # presumably a module-level list mixing valid and invalid
    # utterances; confirm against the full test module.

    # strict mode: the preparation must fail
    with pytest.raises(ValueError):
        list(prepare(utterances, tolerant=False))

    # tolerant mode: same result as preparing the good utterances only
    tolerant_output = list(prepare(utterances, tolerant=True))
    assert len(tolerant_output) == len(GOOD_UTTERANCES)
    reference = list(prepare(GOOD_UTTERANCES))
    assert tolerant_output == reference
예제 #3
def test_empty_lines():
    """Empty lines must be dropped by both prepare() and gold()."""
    # a text made only of empty lines yields no utterance at all
    text = ['', '']
    assert len(list(prepare(text))) == 0
    assert len(list(gold(text))) == 0

    # an empty line between two valid utterances is skipped, the two
    # valid utterances are kept
    text = [
        'hh ax l ;esyll ow ;esyll ;eword',
        '',
        'hh ax l ;esyll ow ;esyll ;eword',
    ]
    assert len(list(prepare(text, separator=Separator(), unit='phone'))) == 2
    assert len(list(gold(text, separator=Separator()))) == 2
예제 #4
def test_replicate_cdswordseg(datadir):
    """Check dibs scores on 'tagged.txt' match the CDSWordSeg reference.

    The model is trained on the first 200 utterances of the file and
    the whole file is segmented and evaluated against its gold version.
    """
    sep = Separator()

    # load the tagged utterances; the context manager closes the file
    # as soon as it has been read
    with codecs.open(
            os.path.join(datadir, 'tagged.txt'), 'r', encoding='utf8') as f:
        _tags = [utt for utt in f if utt]

    _prepared = prepare(_tags, separator=sep)
    _gold = gold(_tags, separator=sep)
    _train = _tags[:200]

    model = dibs.CorpusSummary(_train)
    segmented = dibs.segment(_prepared, model)
    score = evaluate(segmented, _gold)

    # we obtained that score from the dibs version in CDSWordSeg
    # (using wordseg.prepare and wordseg.evaluate in both cases). You
    # can replicate this result in CDSWordseg using
    # ".../CDSwordSeg/algoComp/segment.py test/data/tagged.txt -a dibs"
    expected = {
        'type_fscore': 0.2359,
        'type_precision': 0.2084,
        'type_recall': 0.2719,
        'token_fscore': 0.239,
        'token_precision': 0.3243,
        'token_recall': 0.1892,
        'boundary_all_fscore': 0.6543,
        'boundary_all_precision': 0.8377,
        'boundary_all_recall': 0.5367,
        'boundary_noedge_fscore': 0.4804,
        'boundary_noedge_precision': 0.7161,
        'boundary_noedge_recall': 0.3614,
    }

    assert score == pytest.approx(expected, rel=1e-3)
예제 #5
def test_replicate(datadir):
    """Check puddle scores on 'tagged.txt' match the reference values.

    Only the first 100 utterances of the file are segmented.
    """
    sep = Separator()

    # load the tagged utterances; the context manager closes the file
    # as soon as it has been read
    with codecs.open(
            os.path.join(datadir, 'tagged.txt'), 'r', encoding='utf8') as f:
        _tags = [utt for utt in f if utt][:100]  # 100 first lines only

    _prepared = prepare(_tags, separator=sep)
    _gold = gold(_tags, separator=sep)

    segmented = puddle.segment(_prepared, nfolds=1)
    score = evaluate(segmented, _gold)

    # we obtained that score from the dibs version in CDSWordSeg
    # (using wordseg.prepare and wordseg.evaluate in both cases)
    # NOTE(review): the comment above mentions 'dibs' although this
    # test runs puddle -- presumably a copy-paste leftover; confirm.
    expected = {
        'type_fscore': 0.06369,
        'type_precision': 0.1075,
        'type_recall': 0.04525,
        'token_fscore': 0.06295,
        'token_precision': 0.2056,
        'token_recall': 0.03716,
        'boundary_all_fscore': 0.4605,
        'boundary_all_precision': 1.0,
        'boundary_all_recall': 0.2991,
        'boundary_noedge_fscore': 0.02806,
        'boundary_noedge_precision': 1.0,
        'boundary_noedge_recall': 0.01423,
    }

    assert score == pytest.approx(expected, rel=1e-3)
예제 #6
def test_replicate(datadir):
    """Check tp scores on 'tagged.txt' match the reference values.

    Only the first 100 utterances of the file are segmented.
    """
    sep = Separator()

    # load the tagged utterances; the context manager closes the file
    # as soon as it has been read
    with codecs.open(
            os.path.join(datadir, 'tagged.txt'), 'r', encoding='utf8') as f:
        _tags = [utt for utt in f if utt][:100]  # 100 first lines only

    _prepared = prepare(_tags, separator=sep)
    _gold = gold(_tags, separator=sep)

    segmented = tp.segment(_prepared)
    score = evaluate(segmented, _gold)

    # we obtained that score from the dibs version in CDSWordSeg
    # (using wordseg.prepare and wordseg.evaluate in both cases)
    # NOTE(review): the comment above mentions 'dibs' although this
    # test runs tp -- presumably a copy-paste leftover; confirm.
    expected = {
        'type_fscore': 0.304,
        'type_precision': 0.2554,
        'type_recall': 0.3756,
        'token_fscore': 0.3994,
        'token_precision': 0.3674,
        'token_recall': 0.4375,
        'boundary_all_fscore': 0.7174,
        'boundary_all_precision': 0.6671,
        'boundary_all_recall': 0.776,
        'boundary_noedge_fscore': 0.6144,
        'boundary_noedge_precision': 0.557,
        'boundary_noedge_recall': 0.685,
    }

    assert score == pytest.approx(expected, rel=1e-3)
예제 #7
def test_replicate_cdswordseg(datadir):
    """Check dpseg scores on 'tagged.txt' match the CDSWordSeg reference.

    The segmentation runs at syllable level on the last 10 utterances of
    the file, with the 'uni_dmcmc' configuration file.
    """
    sep = Separator()

    # only the last 10 lines, for a fast test. We cannot take the 10
    # first lines because they cause the dpseg_bugfix to correct a
    # fold (the implementation of that fix differs in CDS and wordseg,
    # so the results are not replicated exactly)
    with codecs.open(
            os.path.join(datadir, 'tagged.txt'), 'r', encoding='utf8') as f:
        _tags = [utt for utt in f if utt][-10:]

    _prepared = prepare(_tags, separator=sep, unit='syllable')
    _gold = gold(_tags, separator=sep)

    # the configuration file is formatted into the '-C' option below, so
    # a single path is taken from the matching files.
    # NOTE(review): the trailing index was lost in this snippet; `[0]`
    # is presumably what the upstream test uses -- confirm.
    uni_dmcmc_conf = [
        c for c in wordseg.utils.get_config_files('dpseg') if 'uni_dmcmc' in c
    ][0]
    args = '--ngram 1 --a1 0 --b1 1 -C {}'.format(uni_dmcmc_conf)
    segmented = segment(_prepared, nfolds=5, njobs=4, args=args)
    score = evaluate(segmented, _gold)

    # we obtained those scores from the dpseg version in CDSWordSeg
    expected = {
        'type_fscore': 0.3768,
        'type_precision': 0.3939,
        'type_recall': 0.3611,
        'token_fscore': 0.3836,
        'token_precision': 0.4118,
        'token_recall': 0.359,
        'boundary_all_fscore': 0.7957,
        'boundary_all_precision': 0.8409,
        'boundary_all_recall': 0.7551,
        'boundary_noedge_fscore': 0.6415,
        'boundary_noedge_precision': 0.7083,
        'boundary_noedge_recall': 0.5862,
    }

    assert score == pytest.approx(expected, rel=1e-3)
예제 #8
파일: dibs.py 프로젝트: manelkhe/wordseg
def main():
    """Entry point of the 'wordseg-dibs' command"""
    # NOTE(review): this excerpt lost several continuation lines (call
    # arguments and closing parentheses). Each gap is flagged below; the
    # code itself is left untouched and must be restored from the
    # upstream file before it can run.
    streamin, streamout, _, log, args = utils.prepare_main(
    # NOTE(review): arguments of prepare_main() and its closing
    # parenthesis are missing here.

    # setup the separator from parsed arguments
    separator = Separator(phone=args.phone_separator,
    # NOTE(review): remaining Separator() arguments and the closing
    # parenthesis are missing here.

    # load test text as a list of utterances, ignore empty lines
    test_text = [line for line in streamin if line]
    log.info('loaded %s utterances as test data', len(test_text))

    # user provided a train text, ensure it is valid and that test_text does
    # not include word separators
    if args.train_file:
        if not os.path.isfile(args.train_file):
            raise ValueError(
                'train file specified but does not exist: {}'.format(
            # NOTE(review): the format() argument (presumably
            # args.train_file) and closing parentheses are missing here.

        # make sure test_text is in prepared form
        for n, line in enumerate(test_text):
            if separator.word in line:
                raise ValueError(
                    f'word separator found in test text (line {n+1})')

        # load train and test texts, ignore empty lines
        train_text = codecs.open(args.train_file, 'r', encoding='utf8')
        train_text = [line for line in train_text if line]
        log.info('loaded %s utterances as train data', len(train_text))
        # NOTE(review): an `else:` branch (no train file given) presumably
        # started here; as written, the train text loaded just above is
        # overwritten by test_text below -- confirm against upstream.
        log.info('using test data for training')
        # the presence of word separator in train utterance will be checked
        # during training
        train_text = test_text

        # remove the word separators for testing
        test_text = prepare(test_text)

    # train the model (learn diphone statistics)
    trained_model = CorpusSummary(train_text,
    # NOTE(review): remaining CorpusSummary() arguments and the closing
    # parenthesis are missing here.

    # segment the test text on the trained model
    segmented = segment(test_text,
    # NOTE(review): remaining segment() arguments (presumably including
    # trained_model) and the closing parenthesis are missing here.

    # output the segmented text
    streamout.write('\n'.join(segmented) + '\n')

    # save the computed diphones if required
    if args.diphones:
        log.info('saving %s diphones to %s', len(trained_model.diphones),
        # NOTE(review): the second log argument (presumably
        # args.diphones) and the closing parenthesis are missing here.

        # one line per diphone: its value followed by the two elements
        # of its key
        output = ('{} {} {}'.format(v, k[0], k[1])
                  for k, v in sorted(trained_model.diphones.items(),
        # NOTE(review): remaining sorted() arguments and the closing
        # parentheses are missing here.

        # NOTE(review): the file opened below is never explicitly closed.
        codecs.open(args.diphones, 'w',
                    encoding='utf8').write('\n'.join(output) + '\n')
예제 #9
def test_punctuation(utt):
    """prepare() rejects `utt` only when check_punctuation is enabled."""
    corpus = [utt]

    # with the punctuation check enabled the utterance must be refused
    with pytest.raises(ValueError):
        list(prepare(corpus, check_punctuation=True))

    # with the check disabled the same utterance goes through
    list(prepare(corpus, check_punctuation=False))
예제 #10
 def f(u):
     """Return the list prepared from p['raw'] at unit `u`."""
     raw_text = p['raw']
     prepared = prepare(raw_text, separator=Separator(), unit=u)
     return list(prepared)
예제 #11
# load the input text file as a list of lines; the context manager
# closes the file once it has been read
with open(sys.argv[1], 'r') as stream:
    text = stream.readlines()

# compute some statistics on the input text (text tokenized at phone
# and word levels)
separator = Separator(phone=' ', syllable=None, word=';eword')
stats = CorpusStatistics(text, separator).describe_all()

# display the computed statistics
sys.stdout.write(
    '* Statistics\n\n' +
    json.dumps(stats, indent=4) + '\n')

# prepare the input for segmentation
prepared = list(prepare(text))

# generate the gold text
# NOTE: this rebinds the name `gold` from the function to its result, so
# the gold() function can no longer be called after this line
gold = list(gold(text))

# segment the prepared text with different algorithms
segmented_baseline = baseline.segment(prepared, probability=0.2)
segmented_tp = tp.segment(prepared, threshold='relative')
segmented_puddle = puddle.segment(prepared, njobs=4, window=2)
segmented_dpseg = dpseg.segment(prepared, nfolds=1, args='--randseed 1')
segmented_ag = ag.segment(prepared, nruns=4, njobs=4, args='-n 100')

# we must provide a trained model to dibs (with stats on diphones)
model_dibs = dibs.CorpusSummary(text)
segmented_dibs = dibs.segment(prepared, model_dibs)