Пример #1
0
def test_levels():
    """Separator.levels() lists only the levels whose separator is defined."""
    full = Separator(phone='a', syllable='b', word='c')
    assert full.levels() == ['phone', 'syllable', 'word']

    no_word = Separator(phone='a', syllable='b', word=None)
    assert no_word.levels() == ['phone', 'syllable']

    phone_only = Separator(phone='a', syllable=None, word=None)
    assert phone_only.levels() == ['phone']
Пример #2
0
def test_split_vs_tokenize(text, expected, keep_boundaries):
    """tokenize() behaves like split() with the empty tokens filtered out."""
    sep = Separator(phone='p', syllable='s', word='w')

    split_result = list(sep.split(text, 'word', keep_boundaries=keep_boundaries))
    assert split_result == expected

    tokenized = list(sep.tokenize(text, 'word', keep_boundaries=keep_boundaries))
    assert tokenized == [token for token in expected if len(token)]
Пример #3
0
    def __init__(self):
        """Setup a word-level separator and per-category segmentation counters."""
        # tokens are separated on words only (no phone/syllable levels)
        self.separator = Separator(phone=None, syllable=None, word=' ')

        # one counter per segmentation outcome, keyed by word type
        for category in ('over', 'under', 'mis', 'correct'):
            setattr(self, category + '_segmentation',
                    collections.defaultdict(int))
Пример #4
0
def test_summary_perfect(gold):
    """Evaluating the gold text against itself puts every word in 'correct'."""
    d = summary(gold, gold)

    sep = Separator(phone=None, syllable=None, word=' ')
    nwords = 0
    for utt in gold:
        nwords += len(sep.tokenize(utt, level='word'))

    # no error category is populated
    assert not any(d[category] for category in ('under', 'over', 'mis'))

    # every word of the corpus is counted as correctly segmented
    assert sum(d['correct'].values()) == nwords
Пример #5
0
def test_describe3(tags):
    """Explicit and default separators give the same word token statistics."""
    explicit_sep = Separator(phone=' ', syllable=';esyll', word=';eword')
    stats_tags = CorpusStatistics(
        tags, separator=explicit_sep).describe_tokens('word')

    stats_gold = CorpusStatistics(
        tags, separator=Separator()).describe_tokens('word')

    assert pytest.approx(stats_tags) == stats_gold
Пример #6
0
def test_basic(prep, tags, type, threshold, pwb):
    """dibs.segment only inserts boundaries: the phone stream is preserved."""
    model = dibs.CorpusSummary(tags, separator=Separator())

    out = list(dibs.segment(
        prep, model, type=type, threshold=threshold, pwb=pwb))

    assert len(out) == len(prep)
    remove = Separator().remove
    for n, (segmented, raw) in enumerate(zip(out, prep)):
        left, right = remove(segmented), remove(raw)
        assert left == right, \
            'line {}: "{}" != "{}"'.format(n + 1, left, right)
Пример #7
0
def test_empty_lines():
    """Empty utterances are silently ignored by prepare() and gold()."""
    empty = ['', '']
    assert not list(prepare(empty))
    assert not list(gold(empty))

    utterance = 'hh ax l ;esyll ow ;esyll ;eword'
    text = [utterance, '', utterance]
    assert len(list(prepare(text, separator=Separator(), unit='phone'))) == 2
    assert len(list(gold(text, separator=Separator()))) == 2
Пример #8
0
def test_entropy(tags):
    """Segmentation entropy needs full tags, a word-only separator must fail."""
    word_only = Separator(phone=None, syllable=None, word=' ')
    stats = CorpusStatistics(UTTS, separator=word_only)
    with pytest.raises(KeyError):
        stats.normalized_segmentation_entropy()

    entropy = CorpusStatistics(
        tags, Separator()).normalized_segmentation_entropy()
    assert entropy == pytest.approx(0.06298494117721846)
Пример #9
0
def test_remove_level():
    """remove() strips all separators, or only the requested level."""
    sep = Separator(phone='p', syllable='s', word='w')
    assert sep.remove('..p.s.p.w') == '.....'

    # removing a single level leaves the other separators in place
    per_level = {
        'phone': '...s..w',
        'syllable': '..p..p.w',
        'word': '..p.s.p.'}
    for level, expected in per_level.items():
        assert sep.remove('..p.s.p.w', level=level) == expected

    sep = Separator(phone=';', syllable='_', word=' ')
    assert sep.remove('ab c', level='phone') == 'ab c'
Пример #10
0
def test_no_vowel(onsets, vowels):
    """A word without vowels raises, is skipped, or gets a filling vowel."""
    text = 's;i; a;j; l;j; a;l; a;j; '

    # strict mode: a vowel-less word is an error
    with pytest.raises(ValueError) as err:
        Syllabifier(
            onsets, vowels,
            separator=Separator(';', '_', ' ')).syllabify([text])
    assert 'no vowel in word' in str(err.value)

    # tolerant mode: the faulty utterance is dropped
    tolerant = Syllabifier(onsets, vowels, separator=Separator(';', '_', ' '))
    assert tolerant.syllabify([text], tolerant=True) == []

    # filling_vowel mode: a silent vowel makes every word syllabifiable
    filling = Syllabifier(
        onsets, vowels, separator=Separator(';', '_', ' '),
        filling_vowel=True)
    assert filling.syllabify([text]) == ['s;i;_ a;j;_ l;j;_ a;l;_ a;j;_ ']
Пример #11
0
def test_replicate(datadir):
    """Replicate the puddle score obtained with the CDSWordSeg pipeline."""
    sep = Separator()

    # keep only the 100 first non-empty utterances of the tagged corpus
    corpus = codecs.open(
        os.path.join(datadir, 'tagged.txt'), 'r', encoding='utf8')
    _tags = [utt for utt in corpus if utt][:100]

    _prepared = prepare(_tags, separator=sep)
    _gold = gold(_tags, separator=sep)

    score = evaluate(puddle.segment(_prepared, nfolds=1), _gold)

    # we obtained that score from the dibs version in CDSWordSeg
    # (using wordseg.prepare and wordseg.evaluate in both cases)
    expected = {
        'type_fscore': 0.06369,
        'type_precision': 0.1075,
        'type_recall': 0.04525,
        'token_fscore': 0.06295,
        'token_precision': 0.2056,
        'token_recall': 0.03716,
        'boundary_all_fscore': 0.4605,
        'boundary_all_precision': 1.0,
        'boundary_all_recall': 0.2991,
        'boundary_noedge_fscore': 0.02806,
        'boundary_noedge_precision': 1.0,
        'boundary_noedge_recall': 0.01423
    }

    assert score == pytest.approx(expected, rel=1e-3)
Пример #12
0
def test_replicate(datadir):
    """Replicate the tp score obtained with the CDSWordSeg pipeline."""
    sep = Separator()

    # keep only the 100 first non-empty utterances of the tagged corpus
    corpus = codecs.open(
        os.path.join(datadir, 'tagged.txt'), 'r', encoding='utf8')
    _tags = [utt for utt in corpus if utt][:100]

    _prepared = prepare(_tags, separator=sep)
    _gold = gold(_tags, separator=sep)

    score = evaluate(tp.segment(_prepared), _gold)

    # we obtained that score from the dibs version in CDSWordSeg
    # (using wordseg.prepare and wordseg.evaluate in both cases)
    expected = {
        'type_fscore': 0.304,
        'type_precision': 0.2554,
        'type_recall': 0.3756,
        'token_fscore': 0.3994,
        'token_precision': 0.3674,
        'token_recall': 0.4375,
        'boundary_all_fscore': 0.7174,
        'boundary_all_precision': 0.6671,
        'boundary_all_recall': 0.776,
        'boundary_noedge_fscore': 0.6144,
        'boundary_noedge_precision': 0.557,
        'boundary_noedge_recall': 0.685
    }

    assert score == pytest.approx(expected, rel=1e-3)
Пример #13
0
def test_prepare_bad_types():
    """prepare() fails on inputs that are not sequences of strings."""
    # a dict of ints is not a valid text
    with pytest.raises(AttributeError):
        list(prepare({1: 1, 2: 2}))

    # neither is a list of ints
    with pytest.raises(AttributeError):
        list(prepare([1, 2], separator=Separator()))
Пример #14
0
def gold(text, separator=Separator()):
    """Returns a gold text from a phonologized one

    The returned gold text is the ground-truth segmentation. It has
    phone and syllable separators removed and word separators replaced
    by a single space ' '. It is used to evaluate the output of
    segmentation algorithms.

    Parameters
    ----------
    text : sequence
        The input text to be prepared for segmentation. Each element
        of the sequence is assumed to be a single and complete
        utterance in valid phonological form.
    separator : Separator, optional
        Token separation in the `text`. A level whose separator is
        None is simply ignored.

    Returns
    -------
    gold_text : generator
        Gold utterances with separators removed and words separated by
        spaces. The returned text is the gold version, against which
        the algorithms are evaluated.

    """
    def _to_gold(line):
        # delete phone and syllable separators, replace word
        # boundaries by a single space. Guard each level against a
        # None separator: the original code only guarded the phone
        # level and str.replace(None, ...) raises TypeError.
        if separator.syllable:
            line = line.replace(separator.syllable, '')
        if separator.phone:
            line = line.replace(separator.phone, '')
        if separator.word:
            line = line.replace(separator.word, ' ')
        return line

    gold = (_to_gold(line) for line in text)

    # delete any duplicate, begin or end spaces. As for prepare, we
    # ignore empty lines.
    return (line for line in (utils.strip(line) for line in gold) if line)
Пример #15
0
def test_descibe2(tags):
    """describe_all() returns the expected statistics on the tags corpus."""
    stats = CorpusStatistics(
        tags, separator=Separator(
            phone=' ', syllable=';esyll', word=';eword')).describe_all()

    expected = {
        'corpus': {
            'entropy': 0.06298494117721846,
            'mattr': 0.7166666666666667,
            'nutts': 13,
            'nutts_single_word': 4},
        'phones': {
            'tokens': 121,
            'types': 28,
            'hapaxes': 5},
        'syllables': {
            'tokens': 49,
            'types': 31,
            'hapaxes': 24},
        'words': {
            'tokens': 34,
            'types': 24,
            'hapaxes': 19}}

    for section, values in expected.items():
        assert stats[section] == pytest.approx(values)
Пример #16
0
def test_puddle(prep, window, nfolds, njobs):
    """Puddle segmentation preserves the input phone stream."""
    out = list(puddle.segment(prep, window=window, nfolds=nfolds, njobs=njobs))
    assert len(out) == len(prep)

    clean = Separator().remove
    for n, (segmented, raw) in enumerate(zip(out, prep)):
        assert clean(segmented) == clean(raw), \
            'line {}: "{}" != "{}"'.format(n + 1, clean(segmented), clean(raw))
Пример #17
0
def test_cspanish_phones(onsets, vowels, strip):
    """Syllabify Spanish utterances while tracking phone separators."""
    separator = Separator(phone=';', syllable='_', word=' ')

    text = [
        'n;o; s;e; k;a;e; ',
        's;i; a;j; a;j; a;l; a;j; ',
        'es;t;a; a;j; l;a; t;a;t;a; e;9u; ',
        'm;i;r;a; es;t;a; x;u;g;a;n;9o; '
    ]

    # strip=True drops trailing separators, strip=False keeps them all
    stripped = [
        'n;o s;e k;a_e',
        's;i a;j a;j a;l a;j',
        'es_t;a a;j l;a t;a_t;a e_9u',
        'm;i_r;a es_t;a x;u_g;a;n_9o'
    ]
    unstripped = [
        'n;o;_ s;e;_ k;a;_e;_ ',
        's;i;_ a;j;_ a;j;_ a;l;_ a;j;_ ',
        'es;_t;a;_ a;j;_ l;a;_ t;a;_t;a;_ e;_9u;_ ',
        'm;i;_r;a;_ es;_t;a;_ x;u;_g;a;n;_9o;_ ']
    expected = stripped if strip else unstripped

    syllabifier = Syllabifier(onsets, vowels, separator=separator)
    assert syllabifier.syllabify(text, strip=strip) == expected
Пример #18
0
    def __init__(self,
                 onsets,
                 vowels,
                 separator=Separator(),
                 filling_vowel=False,
                 log=utils.null_logger()):
        """Initialize the syllabifier from onsets and vowels inventories.

        Parameters
        ----------
        onsets : list
            Valid syllable onsets, must be non-empty.
        vowels : list
            Valid vowels, must be non-empty.
        separator : Separator, optional
            Token separation in the texts to syllabify.
        filling_vowel : bool, optional
            When True, register an extra silent vowel (a character not
            present in the inventories) used to fill vowel-less words.
        log : logging.Logger, optional
            Where to send log messages.

        Raises
        ------
        ValueError
            If `onsets` or `vowels` is not a non-empty list.

        """
        # validate the inventories before storing any state
        if not isinstance(vowels, list) or not len(vowels):
            raise ValueError('unvalid or empty vowels list')
        if not isinstance(onsets, list) or not len(onsets):
            raise ValueError('unvalid or empty onsets list')

        self.onsets = onsets
        # copy the vowels: the silent filling vowel appended below must
        # not mutate the caller's list (the original code did)
        self.vowels = list(vowels)
        self.separator = separator
        self.log = log

        # concatenation of all chars in onsets and vowels (usefull to
        # detect any char during syllabification)
        self.symbols = set(''.join(vowels)).union(set(''.join(onsets)))

        # if defined, ensure the silent vowel is not already used
        if filling_vowel:
            # find a silent vowel (some char not already present in
            # the symbols)
            code = 1
            while six.unichr(code) in self.symbols:
                code += 1
            self.silent = six.unichr(code)
            self.symbols.add(self.silent)
            self.vowels.append(self.silent)
        else:
            self.silent = None
Пример #19
0
def main():
    """Entry point of the 'wordseg-baseline' command"""
    streamin, streamout, _, log, args = utils.prepare_main(
        name='wordseg-baseline',
        description=__doc__,
        add_arguments=_add_arguments)

    # seed the random generator (random.seed(None) seeds from entropy)
    if args.random:
        log.info('setup random seed to %s', args.random)
    random.seed(args.random)

    if not args.oracle:
        # random baseline segmentation
        segmented = segment(streamin, probability=args.probability, log=log)
    else:
        # load the oracle text
        if not os.path.isfile(args.oracle):
            raise ValueError('oracle file not found: {}'.format(args.oracle))
        oracle_text = list(codecs.open(args.oracle, 'r'))
        log.info('loaded %s utterances from oracle text', len(oracle_text))

        # init the oracle tokens separator
        oracle_separator = Separator(
            phone=args.phone_separator,
            syllable=args.syllable_separator,
            word=args.word_separator)

        segmented = segment_oracle(
            streamin, oracle_text, oracle_separator, args.level, log=log)

    streamout.write('\n'.join(segmented) + '\n')
Пример #20
0
def test_replicate_cdswordseg(datadir):
    """Replicate the dibs score obtained with the CDSWordSeg pipeline."""
    sep = Separator()

    # all non-empty utterances of the tagged corpus
    corpus = codecs.open(
        os.path.join(datadir, 'tagged.txt'), 'r', encoding='utf8')
    _tags = [utt for utt in corpus if utt]

    _prepared = prepare(_tags, separator=sep)
    _gold = gold(_tags, separator=sep)

    # train dibs on the 200 first utterances, segment the whole corpus
    model = dibs.CorpusSummary(_tags[:200])
    score = evaluate(dibs.segment(_prepared, model), _gold)

    # we obtained that score from the dibs version in CDSWordSeg
    # (using wordseg.prepare and wordseg.evaluate in both cases). You
    # can replicate this result in CDSWordseg using
    # ".../CDSwordSeg/algoComp/segment.py test/data/tagged.txt -a dibs"
    expected = {
        'type_fscore': 0.2359,
        'type_precision': 0.2084,
        'type_recall': 0.2719,
        'token_fscore': 0.239,
        'token_precision': 0.3243,
        'token_recall': 0.1892,
        'boundary_all_fscore': 0.6543,
        'boundary_all_precision': 0.8377,
        'boundary_all_recall': 0.5367,
        'boundary_noedge_fscore': 0.4804,
        'boundary_noedge_precision': 0.7161,
        'boundary_noedge_recall': 0.3614}

    assert score == pytest.approx(expected, rel=1e-3)
Пример #21
0
def main():
    """Entry point of the 'wordseg-stats' command"""

    # options description
    def add_arguments(parser):
        parser.add_argument(
            '--json',
            action='store_true',
            help='print the results in JSON format, else print in raw text')

    # command initialization
    streamin, streamout, separator, log, args = utils.prepare_main(
        name='wordseg-stats',
        description=__doc__,
        add_arguments=add_arguments,
        separator=Separator())

    # compute the statistics
    results = CorpusStatistics(streamin, separator, log=log).describe_all()

    # display the results either as a JSON string or in raw text
    if args.json:
        streamout.write(json.dumps(results, indent=4) + '\n')
    else:
        lines = (
            ' '.join((section, key, str(value)))
            for section, section_stats in results.items()
            for key, value in section_stats.items())
        streamout.write('\n'.join(lines) + '\n')
Пример #22
0
def test_remove_re():
    """Separators are interpreted as regular expressions by remove()."""
    literal = Separator('ab', None, None)
    assert literal.remove('ab') == ''
    assert literal.remove('aa') == 'aa'
    assert literal.remove('[ab]') == '[]'

    # a character class matches 'a' or 'b' individually
    charclass = Separator('[ab]', None, None)
    assert charclass.remove('ab') == ''
    assert charclass.remove('aa') == ''
    assert charclass.remove('[ab]') == '[]'

    # escaped regexes are rejected at construction time
    with pytest.raises(ValueError):
        Separator(r'\[ab\]', None, None)

    with pytest.raises(ValueError):
        Separator(re.escape('[ab]'), None, None)
Пример #23
0
def test_most_common():
    """The four most frequent word tokens in UTTS each occur twice."""
    word_only = Separator(phone=None, syllable=None, word=' ')
    stats = CorpusStatistics(UTTS, separator=word_only)

    top_freq = dict(stats.most_common_tokens('word', n=4))
    assert top_freq == {'i': 2, 'people': 2, 'she\'s': 2, 'like': 2}
Пример #24
0
def test_cspanish_default_separator(onsets, vowels, strip):
    """Syllabification with the default ';esyll'/';eword' separators."""
    text = ['m i r a ;eword']
    if strip:
        expected = ['m i;esyllr a']
    else:
        expected = ['m i ;esyllr a ;esyll;eword']

    syllabifier = Syllabifier(onsets, vowels, separator=Separator())
    assert syllabifier.syllabify(text, strip=strip) == expected
Пример #25
0
def test_tp(prep, threshold, dependency):
    """Check input and output are the same, once the separators are removed"""
    out = list(tp.segment(prep, threshold=threshold, dependency=dependency))
    assert len(out) == len(prep)

    clean = Separator().remove
    for n, (segmented, raw) in enumerate(zip(out, prep)):
        assert clean(segmented) == clean(raw), \
            'line {}: "{}" != "{}"'.format(n + 1, clean(segmented), clean(raw))
Пример #26
0
def test_remove_restore_phones(text):
    """Removing then restoring phone separators is the identity."""
    separator = Separator(phone=';', syllable='_', word=' ')
    syllabifier = Syllabifier(['foo'], ['bar'], separator=separator)

    clean, index = syllabifier._remove_phone_separators(text)
    assert re.search(separator.phone, clean) is None

    restored = syllabifier._restore_phone_separators(clean, index, strip=False)
    assert restored == text
Пример #27
0
def test_remove():
    """remove() deletes every separator occurrence from a string."""
    sep = Separator(phone='p', syllable='s', word='w')
    assert sep.remove('abc') == 'abc'
    assert sep.remove('wss p') == ' '

    sep = Separator(phone='_', word=';eword ')
    utterance = (
        'j_uː_;eword n_oʊ_;eword dʒ_ʌ_s_t_;eword '
        's_t_uː_p_ɪ_d_ɪ_ɾ_i_;eword ')
    assert sep.remove(utterance) == 'juːnoʊdʒʌststuːpɪdɪɾi'
Пример #28
0
def test_tokenize_full_nosyll():
    """tokenize() nesting depth follows the defined separator levels."""
    utt = 'j_uː_ n_oʊ_ dʒ_ʌ_s_t_ '

    # no syllable separator: each word is a flat list of phones
    sep = Separator(phone='_', syllable=None, word=' ')
    assert list(sep.tokenize(utt)) \
        == [['j', 'uː'], ['n', 'oʊ'], ['dʒ', 'ʌ', 's', 't']]

    # with a syllable separator: one extra nesting level per word
    sep = Separator(phone='_', syllable=';', word=' ')
    assert list(sep.tokenize(utt)) \
        == [[['j', 'uː']], [['n', 'oʊ']], [['dʒ', 'ʌ', 's', 't']]]

    # tokenize phones only
    flat = utt.replace(' ', '')
    sep = Separator(phone='_', syllable=None, word=None)
    assert list(sep.tokenize(flat)) \
        == ['j', 'uː', 'n', 'oʊ', 'dʒ', 'ʌ', 's', 't']
Пример #29
0
def test_phone_sep(level):
    """CorpusSummary counts the same units at phone or syllable level."""
    text = [
        'hh_ih_r_;eword ',
        'dh_eh_r_;eword w_iy_;eword g_ow_;eword ']

    # '_' marks either phones or syllables depending on the level
    separator = Separator(
        phone='_' if level == 'phone' else None,
        syllable='_' if level == 'syllable' else None,
        word=';eword ')

    model = dibs.CorpusSummary(text, separator=separator, level=level)
    assert model.summary == {'nlines': 2, 'nwords': 4, 'nphones': 10}
Пример #30
0
def test_remove_phones():
    """_remove_phone_separators returns clean text and a per-word index."""
    def clean_and_index(separator, text):
        # build a throwaway syllabifier just to call the private helper
        syllabifier = Syllabifier(['foo'], ['bar'], separator=separator)
        return syllabifier._remove_phone_separators(text)

    clean, index = clean_and_index(
        Separator(phone=' ', syllable=';esyll', word=';eword'),
        'a b ;ewordc ;eword')
    assert clean == 'ab;ewordc;eword'
    assert index == [[1, 1], [1]]

    clean, index = clean_and_index(
        Separator(phone=';', syllable='_', word=' '), 'a;b; c;')
    assert index == [[1, 1], [1]]
    assert clean == 'ab c'

    # no phone separator in the text: empty index, text unchanged
    clean, index = clean_and_index(
        Separator(phone=';', syllable='_', word=' '), 'ab c')
    assert index == []
    assert clean == 'ab c'