示例#1
0
def corpus_data_syllable_morpheme_srur():
    """Build parsed discourse data with phone, syllable, morpheme and word tiers.

    The fixture describes the single line "boxes are for packing" with a
    surface-phone tier ('sr'), an underlying transcription per word ('ur'),
    syllable groupings, morpheme glosses and orthographic words.  Entries
    are presumably (label, begin, end) tuples, or (begin, end) for the
    grouping tiers -- confirm against the tier classes.

    Returns:
        The result of ``BaseParser.parse_discourse('test_syllable_morpheme')``.
    """
    # Surface phones, one row per word.
    surface_phones = [
        ('b', 0, 0.1), ('aa', 0.1, 0.2), ('k', 0.2, 0.3),
        ('s', 0.3, 0.4), ('ah', 0.4, 0.5), ('s', 0.5, 0.6),
        ('er', 0.7, 0.8),
        ('f', 0.9, 1.0), ('er', 1.0, 1.1),
        ('p', 1.2, 1.3), ('ae', 1.3, 1.4), ('k', 1.4, 1.5), ('eng', 1.5, 1.6),
    ]
    # Underlying forms span the same intervals as the words below.
    underlying_forms = [
        ('b.aa.k.s-ah.z', 0, 0.6),
        ('aa.r', 0.7, 0.8),
        ('f.ao.r', 0.9, 1.1),
        ('p.ae.k-ih.ng', 1.2, 1.6),
    ]
    syllable_spans = [
        (0, 0.3), (0.3, 0.6),
        (0.7, 0.8),
        (0.9, 1.1),
        (1.2, 1.5), (1.5, 1.6),
    ]
    morpheme_glosses = [
        ('box-PL', 0, 0.6),
        ('are', 0.7, 0.8),
        ('for', 0.9, 1.1),
        ('pack-PROG', 1.2, 1.6),
    ]
    orthographic_words = [
        ('boxes', 0, 0.6),
        ('are', 0.7, 0.8),
        ('for', 0.9, 1.1),
        ('packing', 1.2, 1.6),
    ]
    line_spans = [(0, 1.6)]

    tiers = [
        SegmentTier('sr', 'phone', label=True),
        TranscriptionTier('ur', 'word'),
        GroupingTier('syllable', 'syllable'),
        MorphemeTier('morphemes', 'word'),
        OrthographyTier('word', 'word'),
        GroupingTier('line', 'line'),
    ]
    # Data sets are listed in the same order as the tiers they populate.
    tier_contents = (
        surface_phones,
        underlying_forms,
        syllable_spans,
        morpheme_glosses,
        orthographic_words,
        line_spans,
    )
    for tier, intervals in zip(tiers, tier_contents):
        tier.add(intervals)

    # Each annotation type maps to the type that contains it; 'line' is the top.
    structure = Hierarchy({
        'phone': 'syllable',
        'syllable': 'word',
        'word': 'line',
        'line': None,
    })
    return BaseParser(tiers, structure).parse_discourse('test_syllable_morpheme')
示例#2
0
def subannotation_data():
    """Build parsed discourse data that includes a subannotation tier.

    Three tiers are created: phones, words, and a 'stop_information' tier
    attached to the 'phone' type whose ``subannotation`` attribute is set to
    True before any data is added.  The stop tier holds burst / vot /
    closure / voicing_during_closure intervals.

    Returns:
        The result of ``BaseParser.parse_discourse('test_sub')``.
    """
    # Phones, one row per word ("cats are cute / dogs are too / i guess").
    phone_intervals = [
        ('k', 0.0, 0.1), ('ae', 0.1, 0.2), ('t', 0.2, 0.3), ('s', 0.3, 0.4),
        ('aa', 0.5, 0.6), ('r', 0.6, 0.7),
        ('k', 0.8, 0.9), ('u', 0.9, 1.0), ('t', 1.0, 1.1),
        ('d', 2.0, 2.1), ('aa', 2.1, 2.2), ('g', 2.2, 2.3), ('z', 2.3, 2.4),
        ('aa', 2.4, 2.5), ('r', 2.5, 2.6),
        ('t', 2.6, 2.7), ('uw', 2.7, 2.8),
        ('ay', 3.0, 3.1),
        ('g', 3.3, 3.4), ('eh', 3.4, 3.5), ('s', 3.5, 3.6),
    ]
    word_intervals = [
        ('cats', 0.0, 0.4), ('are', 0.5, 0.7), ('cute', 0.8, 1.1),
        ('dogs', 2.0, 2.4), ('are', 2.4, 2.6), ('too', 2.6, 2.8),
        ('i', 3.0, 3.1), ('guess', 3.3, 3.6),
    ]
    # Sub-phone detail intervals; each falls inside one of the phones above.
    stop_details = [
        ('burst', 0, 0.05), ('vot', 0.05, 0.1),
        ('closure', 0.2, 0.25), ('burst', 0.25, 0.26), ('vot', 0.26, 0.3),
        ('closure', 2.2, 2.25), ('burst', 2.25, 2.26), ('vot', 2.26, 2.3),
        ('voicing_during_closure', 2.2, 2.23),
        ('voicing_during_closure', 2.24, 2.25),
    ]

    phone_tier = SegmentTier('label', 'phone')
    word_tier = OrthographyTier('label', 'word')
    stop_tier = OrthographyTier('stop_information', 'phone')
    # Flag the tier as a subannotation before it receives any intervals.
    stop_tier.subannotation = True

    phone_tier.add(phone_intervals)
    word_tier.add(word_intervals)
    stop_tier.add(stop_details)

    structure = Hierarchy({'phone': 'word', 'word': None})
    tiers = [phone_tier, word_tier, stop_tier]
    return BaseParser(tiers, structure).parse_discourse('test_sub')
示例#3
0
def corpus_data_ur_sr():
    """Build parsed discourse data with surface and underlying transcriptions.

    Three tiers are created: surface phones ('sr'), orthographic words, and
    an underlying transcription ('ur') per word.  The surface forms differ
    from the underlying ones for some words (e.g. 'cats' surfaces as
    k-ae-s while its underlying form is 'k.ae.t.s').

    Returns:
        The result of ``BaseParser.parse_discourse('test_ursr')``.
    """
    levels = [SegmentTier('sr', 'phone'),
              OrthographyTier('word', 'word'),
              TranscriptionTier('ur', 'word')]
    # Surface phones, one row per word.
    srs = [('k', 0.0, 0.1), ('ae', 0.1, 0.2), ('s', 0.2, 0.4),
           ('aa', 0.5, 0.6), ('r', 0.6, 0.7),
           ('k', 0.8, 0.9), ('u', 0.9, 1.1),
           ('d', 2.0, 2.1), ('aa', 2.1, 2.2), ('g', 2.2, 2.25),
           ('ah', 2.25, 2.3), ('z', 2.3, 2.4),
           ('aa', 2.4, 2.5), ('r', 2.5, 2.6),
           ('t', 2.6, 2.7), ('uw', 2.7, 2.8),
           ('ay', 3.0, 3.1),
           ('g', 3.3, 3.4), ('eh', 3.4, 3.5), ('s', 3.5, 3.6)]
    words = [('cats', 0.0, 0.4), ('are', 0.5, 0.7), ('cute', 0.8, 1.1),
             ('dogs', 2.0, 2.4), ('are', 2.4, 2.6), ('too', 2.6, 2.8),
             ('i', 3.0, 3.1), ('guess', 3.3, 3.6)]
    # Each underlying form spans exactly the interval of its word in `words`.
    urs = [('k.ae.t.s', 0.0, 0.4), ('aa.r', 0.5, 0.7), ('k.y.uw.t', 0.8, 1.1),
           ('d.aa.g.z', 2.0, 2.4), ('aa.r', 2.4, 2.6), ('t.uw', 2.6, 2.8),
           ('ay', 3.0, 3.1), ('g.eh.s', 3.3, 3.6)]
    levels[0].add(srs)
    levels[1].add(words)
    levels[2].add(urs)

    # Phones are contained in words; words are the top-level type.
    hierarchy = Hierarchy({'phone': 'word', 'word': None})
    parser = BaseParser(levels, hierarchy)
    data = parser.parse_discourse('test_ursr')
    return data
示例#4
0
def corpus_data_timed():
    """Build parsed discourse data with timed phones, words and lines.

    The fixture covers three lines ("cats are cute", "dogs are too",
    "i guess"), each given as a grouping interval over its words.

    Returns:
        The result of ``BaseParser.parse_discourse('test_timed')``.
    """
    # Phones, one row per word.
    phone_intervals = [
        ('k', 0.0, 0.1), ('ae', 0.1, 0.2), ('t', 0.2, 0.3), ('s', 0.3, 0.4),
        ('aa', 0.5, 0.6), ('r', 0.6, 0.7),
        ('k', 0.8, 0.9), ('uw', 0.9, 1.0), ('t', 1.0, 1.1),
        ('d', 2.0, 2.1), ('aa', 2.1, 2.2), ('g', 2.2, 2.3), ('z', 2.3, 2.4),
        ('aa', 2.4, 2.5), ('r', 2.5, 2.6),
        ('t', 2.6, 2.7), ('uw', 2.7, 2.8),
        ('ay', 3.0, 3.1),
        ('g', 3.3, 3.4), ('eh', 3.4, 3.5), ('s', 3.5, 3.6),
    ]
    # Words, one row per line.
    word_intervals = [
        ('cats', 0.0, 0.4), ('are', 0.5, 0.7), ('cute', 0.8, 1.1),
        ('dogs', 2.0, 2.4), ('are', 2.4, 2.6), ('too', 2.6, 2.8),
        ('i', 3.0, 3.1), ('guess', 3.3, 3.6),
    ]
    line_spans = [(0.0, 1.1), (2.0, 2.8), (3.0, 3.6)]

    phone_tier = SegmentTier('label', 'phone')
    word_tier = OrthographyTier('label', 'word')
    line_tier = GroupingTier('line', 'line')

    # Load each tier with its data, in tier order.
    loading_plan = (
        (phone_tier, phone_intervals),
        (word_tier, word_intervals),
        (line_tier, line_spans),
    )
    for tier, intervals in loading_plan:
        tier.add(intervals)

    structure = Hierarchy({'phone': 'word', 'word': 'line', 'line': None})
    tiers = [phone_tier, word_tier, line_tier]
    return BaseParser(tiers, structure).parse_discourse('test_timed')