Exemplo n.º 1
0
def inspect_mfa(path):
    """
    Build an ``MfaParser`` preconfigured with a word tier and a phone tier.

    Parameters
    ----------
    path : str
        Full path to text file (accepted for API symmetry; this function
        does not read it)

    Returns
    -------
    MfaParser
        Parser set up with labelled 'words' and 'phones' tiers and a
        phone-under-word hierarchy
    """
    word_tier = OrthographyTier('words', 'word')
    phone_tier = OrthographyTier('phones', 'phone')

    # Both tiers are flagged as label tiers.
    for tier in (word_tier, phone_tier):
        tier.label = True

    phone_in_word = Hierarchy({'phone': 'word', 'word': None})
    return MfaParser([word_tier, phone_tier], phone_in_word)
Exemplo n.º 2
0
def corpus_data_untimed():
    """
    Build the parsed discourse data for the untimed test corpus.

    Three word-level tiers (transcription, spelling, morpheme) and one
    grouping tier for lines are filled with index-based annotations and
    run through a ``BaseParser``.

    Returns
    -------
    object
        The result of ``BaseParser.parse_discourse('test_untimed')``
    """
    transcription_tier = TextTranscriptionTier('transcription', 'word')
    spelling_tier = TextOrthographyTier('spelling', 'word')
    morpheme_tier = TextMorphemeTier('morpheme', 'word')
    line_tier = GroupingTier('line', 'line')

    # Each entry pairs a label with an integer position
    # (presumably the word index -- TODO confirm).
    transcription_tier.add([
        ('k.ae.t-s', 0), ('aa.r', 1), ('k.y.uw.t', 2),
        ('d.aa.g-z', 3), ('aa.r', 4), ('t.uw', 5),
        ('ay', 6), ('g.eh.s', 7),
    ])
    spelling_tier.add([
        ('cats', 0), ('are', 1), ('cute', 2),
        ('dogs', 3), ('are', 4), ('too', 5),
        ('i', 6), ('guess', 7),
    ])
    morpheme_tier.add([
        ('cat-PL', 0), ('are', 1), ('cute', 2),
        ('dog-PL', 3), ('are', 4), ('too', 5),
        ('i', 6), ('guess', 7),
    ])
    # Lines are given as pairs of positions
    # (presumably first/last word index -- TODO confirm).
    line_tier.add([(0, 2), (3, 5), (6, 7)])

    tiers = [transcription_tier, spelling_tier, morpheme_tier, line_tier]
    word_in_line = Hierarchy({'word': 'line', 'line': None})
    return BaseParser(tiers, word_in_line).parse_discourse('test_untimed')
Exemplo n.º 3
0
def corpus_data_syllable_morpheme_srur():
    """
    Build the parsed discourse data for the syllable/morpheme test corpus.

    Six tiers are created (surface segments, underlying transcriptions,
    syllables, morphemes, orthographic words and lines), each filled with
    its annotations, and parsed with a ``BaseParser`` under a
    phone > syllable > word > line hierarchy.

    Returns
    -------
    object
        The result of
        ``BaseParser.parse_discourse('test_syllable_morpheme')``
    """
    # Each tier is paired with the annotations that go into it.
    tiers_with_annotations = [
        (SegmentTier('sr', 'phone', label=True),
         [('b', 0, 0.1), ('aa', 0.1, 0.2), ('k', 0.2, 0.3), ('s', 0.3, 0.4),
          ('ah', 0.4, 0.5), ('s', 0.5, 0.6),
          ('er', 0.7, 0.8),
          ('f', 0.9, 1.0), ('er', 1.0, 1.1),
          ('p', 1.2, 1.3), ('ae', 1.3, 1.4), ('k', 1.4, 1.5),
          ('eng', 1.5, 1.6)]),
        (TranscriptionTier('ur', 'word'),
         [('b.aa.k.s-ah.z', 0, 0.6), ('aa.r', 0.7, 0.8),
          ('f.ao.r', 0.9, 1.1), ('p.ae.k-ih.ng', 1.2, 1.6)]),
        (GroupingTier('syllable', 'syllable'),
         [(0, 0.3), (0.3, 0.6), (0.7, 0.8), (0.9, 1.1),
          (1.2, 1.5), (1.5, 1.6)]),
        (MorphemeTier('morphemes', 'word'),
         [('box-PL', 0, 0.6), ('are', 0.7, 0.8),
          ('for', 0.9, 1.1), ('pack-PROG', 1.2, 1.6)]),
        (OrthographyTier('word', 'word'),
         [('boxes', 0, 0.6), ('are', 0.7, 0.8),
          ('for', 0.9, 1.1), ('packing', 1.2, 1.6)]),
        (GroupingTier('line', 'line'),
         [(0, 1.6)]),
    ]

    levels = []
    for tier, annotations in tiers_with_annotations:
        tier.add(annotations)
        levels.append(tier)

    nesting = Hierarchy({
        'phone': 'syllable',
        'syllable': 'word',
        'word': 'line',
        'line': None,
    })
    return BaseParser(levels, nesting).parse_discourse(
        'test_syllable_morpheme')
Exemplo n.º 4
0
def corpus_data_ur_sr():
    """
    Build the parsed discourse data for the UR/SR test corpus.

    Three tiers are filled: surface segments ('sr'), orthographic words
    ('word') and underlying transcriptions ('ur').  The word and ur tiers
    describe the same eight words, so their time spans match pairwise.

    Returns
    -------
    object
        The result of ``BaseParser.parse_discourse('test_ursr')``
    """
    levels = [SegmentTier('sr', 'phone'),
              OrthographyTier('word', 'word'),
              TranscriptionTier('ur', 'word')]
    srs = [('k', 0.0, 0.1), ('ae', 0.1, 0.2), ('s', 0.2, 0.4),
           ('aa', 0.5, 0.6), ('r', 0.6, 0.7),
           ('k', 0.8, 0.9), ('u', 0.9, 1.1),
           ('d', 2.0, 2.1), ('aa', 2.1, 2.2), ('g', 2.2, 2.25),
           ('ah', 2.25, 2.3), ('z', 2.3, 2.4),
           ('aa', 2.4, 2.5), ('r', 2.5, 2.6),
           ('t', 2.6, 2.7), ('uw', 2.7, 2.8),
           ('ay', 3.0, 3.1),
           ('g', 3.3, 3.4), ('eh', 3.4, 3.5), ('s', 3.5, 3.6)]
    words = [('cats', 0.0, 0.4), ('are', 0.5, 0.7), ('cute', 0.8, 1.1),
             ('dogs', 2.0, 2.4), ('are', 2.4, 2.6), ('too', 2.6, 2.8),
             ('i', 3.0, 3.1), ('guess', 3.3, 3.6)]
    # The 't.uw' entry previously began at .6, which did not line up with
    # its word 'too' (2.6-2.8); every ur span must match its word span.
    urs = [('k.ae.t.s', 0.0, 0.4), ('aa.r', 0.5, 0.7), ('k.y.uw.t', 0.8, 1.1),
           ('d.aa.g.z', 2.0, 2.4), ('aa.r', 2.4, 2.6), ('t.uw', 2.6, 2.8),
           ('ay', 3.0, 3.1), ('g.eh.s', 3.3, 3.6)]
    levels[0].add(srs)
    levels[1].add(words)
    levels[2].add(urs)

    hierarchy = Hierarchy({'phone': 'word', 'word': None})
    parser = BaseParser(levels, hierarchy)
    return parser.parse_discourse('test_ursr')
Exemplo n.º 5
0
def inspect_buckeye(word_path):
    """
    Build a ``BuckeyeParser`` preconfigured for the Buckeye corpus.

    Parameters
    ----------
    word_path : str
        Full path to text file (accepted for API symmetry; this function
        does not read it)

    Returns
    -------
    BuckeyeParser
        Parser set up with four word-level tiers, one phone segment tier
        and a phone-under-word hierarchy
    """
    word = OrthographyTier('word', 'word')
    transcription = OrthographyTier('transcription', 'word')
    surface_transcription = OrthographyTier('surface_transcription', 'word')
    category = OrthographyTier('category', 'word')
    phone = SegmentTier('phone', 'phone')

    # NOTE(review): setting ``trans_delimiter = ' '`` on the two
    # transcription tiers was disabled in the original code; left off here.

    # Surface transcription and category are not type-level properties
    # (presumably they vary per token -- TODO confirm).
    for tier in (surface_transcription, category):
        tier.type_property = False

    tiers = [word, transcription, surface_transcription, category, phone]
    phone_in_word = Hierarchy({'phone': 'word', 'word': None})
    return BuckeyeParser(tiers, phone_in_word)
Exemplo n.º 6
0
def corpus_data_timed():
    """
    Build the parsed discourse data for the timed test corpus.

    A phone segment tier, a word tier and a line grouping tier are filled
    with time-stamped annotations and parsed with a ``BaseParser`` under a
    phone > word > line hierarchy.

    Returns
    -------
    object
        The result of ``BaseParser.parse_discourse('test_timed')``
    """
    phone_tier = SegmentTier('label', 'phone')
    word_tier = OrthographyTier('label', 'word')
    line_tier = GroupingTier('line', 'line')

    # (label, begin, end) triples
    phone_tier.add([
        ('k', 0.0, 0.1), ('ae', 0.1, 0.2), ('t', 0.2, 0.3),
        ('s', 0.3, 0.4), ('aa', 0.5, 0.6), ('r', 0.6, 0.7),
        ('k', 0.8, 0.9), ('uw', 0.9, 1.0), ('t', 1.0, 1.1),
        ('d', 2.0, 2.1), ('aa', 2.1, 2.2), ('g', 2.2, 2.3),
        ('z', 2.3, 2.4), ('aa', 2.4, 2.5), ('r', 2.5, 2.6),
        ('t', 2.6, 2.7), ('uw', 2.7, 2.8), ('ay', 3.0, 3.1),
        ('g', 3.3, 3.4), ('eh', 3.4, 3.5), ('s', 3.5, 3.6),
    ])
    word_tier.add([
        ('cats', 0.0, 0.4), ('are', 0.5, 0.7), ('cute', 0.8, 1.1),
        ('dogs', 2.0, 2.4), ('are', 2.4, 2.6), ('too', 2.6, 2.8),
        ('i', 3.0, 3.1), ('guess', 3.3, 3.6),
    ])
    # (begin, end) pairs
    line_tier.add([(0.0, 1.1), (2.0, 2.8), (3.0, 3.6)])

    nesting = Hierarchy({'phone': 'word', 'word': 'line', 'line': None})
    timed_parser = BaseParser([phone_tier, word_tier, line_tier], nesting)
    return timed_parser.parse_discourse('test_timed')
Exemplo n.º 7
0
def inspect_maus(path):
    """
    Build a ``MausParser`` preconfigured with a word tier and a phone tier.

    The tier names are taken from ``MausParser.word_label`` and
    ``MausParser.phone_label``.

    Parameters
    ----------
    path : str
        Full path to text file (accepted for API symmetry; this function
        does not read it)

    Returns
    -------
    MausParser
        Parser set up with two labelled tiers and a phone-under-word
        hierarchy
    """
    word_tier = OrthographyTier(MausParser.word_label, 'word')
    phone_tier = OrthographyTier(MausParser.phone_label, 'phone')

    # Both tiers are flagged as label tiers.
    for tier in (word_tier, phone_tier):
        tier.label = True

    phone_in_word = Hierarchy({'phone': 'word', 'word': None})
    return MausParser([word_tier, phone_tier], phone_in_word)
Exemplo n.º 8
0
def inspect_partitur(path):
    """
    Build a ``PartiturParser`` preconfigured for BAS Partitur files.

    Parameters
    ----------
    path : str
        Full path to text file (accepted for API symmetry; this function
        does not read it)

    Returns
    -------
    PartiturParser
        Parser set up with word, transcription and phone tiers and a
        phone-under-word hierarchy
    """
    word_tier = OrthographyTier('word', 'word')
    transcription_tier = OrthographyTier('transcription', 'word')
    phone_tier = OrthographyTier('phones', 'phone')

    word_tier.label = True
    # The transcription is attached as a type property, not as a label.
    transcription_tier.label = False
    transcription_tier.type_property = True
    phone_tier.label = True

    tiers = [word_tier, transcription_tier, phone_tier]
    return PartiturParser(tiers, Hierarchy({'phone': 'word', 'word': None}))
Exemplo n.º 9
0
def inspect_fave(path):
    """
    Build a ``FaveParser`` preconfigured with a word tier and a phone tier.

    The tier names are taken from ``FaveParser.word_label`` and
    ``FaveParser.phone_label``.

    Parameters
    ----------
    path : str
        Full path to text file (accepted for API symmetry; this function
        does not read it)

    Returns
    -------
    FaveParser
        Parser set up with two labelled tiers and a phone-under-word
        hierarchy
    """
    word_tier = OrthographyTier(FaveParser.word_label, 'word')
    phone_tier = OrthographyTier(FaveParser.phone_label, 'phone')

    # Both tiers are flagged as label tiers.
    for tier in (word_tier, phone_tier):
        tier.label = True

    phone_in_word = Hierarchy({'phone': 'word', 'word': None})
    return FaveParser([word_tier, phone_tier], phone_in_word)
Exemplo n.º 10
0
def subannotation_data():
    """
    Build the parsed discourse data for the subannotation test corpus.

    Besides a phone segment tier and a word tier, a third tier
    ('stop_information', attached to phones) is flagged as a subannotation
    tier before any annotations are added.

    Returns
    -------
    object
        The result of ``BaseParser.parse_discourse('test_sub')``
    """
    phone_tier = SegmentTier('label', 'phone')
    word_tier = OrthographyTier('label', 'word')
    stop_tier = OrthographyTier('stop_information', 'phone')
    stop_tier.subannotation = True

    # (label, begin, end) triples
    phone_tier.add([
        ('k', 0.0, 0.1), ('ae', 0.1, 0.2), ('t', 0.2, 0.3), ('s', 0.3, 0.4),
        ('aa', 0.5, 0.6), ('r', 0.6, 0.7),
        ('k', 0.8, 0.9), ('u', 0.9, 1.0), ('t', 1.0, 1.1),
        ('d', 2.0, 2.1), ('aa', 2.1, 2.2), ('g', 2.2, 2.3), ('z', 2.3, 2.4),
        ('aa', 2.4, 2.5), ('r', 2.5, 2.6),
        ('t', 2.6, 2.7), ('uw', 2.7, 2.8),
        ('ay', 3.0, 3.1),
        ('g', 3.3, 3.4), ('eh', 3.4, 3.5), ('s', 3.5, 3.6),
    ])
    word_tier.add([
        ('cats', 0.0, 0.4), ('are', 0.5, 0.7), ('cute', 0.8, 1.1),
        ('dogs', 2.0, 2.4), ('are', 2.4, 2.6), ('too', 2.6, 2.8),
        ('i', 3.0, 3.1), ('guess', 3.3, 3.6),
    ])
    stop_tier.add([
        ('burst', 0, 0.05), ('vot', 0.05, 0.1), ('closure', 0.2, 0.25),
        ('burst', 0.25, 0.26), ('vot', 0.26, 0.3), ('closure', 2.2, 2.25),
        ('burst', 2.25, 2.26), ('vot', 2.26, 2.3),
        ('voicing_during_closure', 2.2, 2.23),
        ('voicing_during_closure', 2.24, 2.25),
    ])

    phone_in_word = Hierarchy({'phone': 'word', 'word': None})
    sub_parser = BaseParser([phone_tier, word_tier, stop_tier], phone_in_word)
    return sub_parser.parse_discourse('test_sub')
Exemplo n.º 11
0
 def __init__(self, annotation_types, stop_check=None, call_back=None):
     """
     Initialise the parser with a word-only hierarchy.

     Labels are requested (``make_label=True``) and transcriptions are
     not (``make_transcription=False``); ``stop_check`` and ``call_back``
     are forwarded unchanged to the parent initialiser.
     """
     word_only = Hierarchy({'word': None})
     parent_init = super(IlgParser, self).__init__
     parent_init(annotation_types, word_only,
                 make_transcription=False,
                 make_label=True,
                 stop_check=stop_check,
                 call_back=call_back)
Exemplo n.º 12
0
 def __init__(self, annotation_types, column_delimiter,
              stop_check=None, call_back=None):
     """
     Store the parser configuration on the instance.

     The four arguments are kept as attributes of the same name.  The
     hierarchy is fixed to a single 'word' level, labels are enabled and
     transcriptions are disabled.
     """
     # Caller-supplied configuration
     self.annotation_types = annotation_types
     self.column_delimiter = column_delimiter
     self.stop_check = stop_check
     self.call_back = call_back

     # Fixed settings for this parser
     self.hierarchy = Hierarchy({'word': None})
     self.make_transcription = False
     self.make_label = True
Exemplo n.º 13
0
def guess_tiers(tg):
    """
    Guess, for every non-empty tier of a TextGrid, whether it holds
    words or segments.

    Parameters
    ----------
    tg : TextGrid
        the textgrid object; each non-empty tier gets its ``maxTime``
        overwritten with ``tg.maxTime``

    Returns
    -------
    tier_guesses : dict
        maps each non-empty tier name to the linguistic type chosen for
        it (the word type or the phone type)
    hierarchy : `~polyglotdb.structure.Hierarchy`
        the hierarchy object, with the word type at the top and the phone
        type (when one was found) directly below it
    """
    # Pass 1: remember each non-empty tier's position and the value
    # returned by average_duration for it.
    durations = {}
    for position, tier in enumerate(tg.tiers):
        if len(tier) == 0:
            continue
        tier.maxTime = tg.maxTime
        durations[tier.name] = (position, average_duration(tier))

    # Pass 2: classify each tier by whichever probability is larger;
    # ties go to 'segment'.
    provisional = {}
    for name, (position, duration) in durations.items():
        word_p = word_probability(duration)
        phone_p = segment_probability(duration)
        kind = 'word' if word_p > phone_p else 'segment'
        provisional[name] = (kind, position)

    word_labels = [(name, position)
                   for name, (kind, position) in provisional.items()
                   if kind == 'word']
    phone_labels = [(name, position)
                    for name, (kind, position) in provisional.items()
                    if kind == 'segment']
    word_type = figure_linguistic_type(word_labels)
    phone_type = figure_linguistic_type(phone_labels)

    # A tier whose name contains 'word' is treated as a word tier even
    # if the duration-based guess said otherwise.
    tier_guesses = {
        name: (word_type
               if 'word' in name.lower() or kind == 'word'
               else phone_type)
        for name, (kind, _) in provisional.items()
    }

    levels = {word_type: None}
    if phone_type is not None:
        levels[phone_type] = word_type
    return tier_guesses, Hierarchy(levels)
Exemplo n.º 14
0
def inspect_timit(word_path):
    """
    Build a ``TimitParser`` preconfigured with a word tier and a phone tier.

    Parameters
    ----------
    word_path : str
        Full path to text file (accepted for API symmetry; this function
        does not read it)

    Returns
    -------
    TimitParser
        Parser set up with an orthography tier for words, a segment tier
        for phones and a phone-under-word hierarchy
    """
    word_tier = OrthographyTier('word', 'word')
    phone_tier = SegmentTier('phone', 'phone')
    phone_in_word = Hierarchy({'phone': 'word', 'word': None})
    return TimitParser([word_tier, phone_tier], phone_in_word)
Exemplo n.º 15
0
def inspect_labbcat(path):
    """
    Build a ``LabbCatParser`` preconfigured with a word tier and a phone
    tier.

    Parameters
    ----------
    path : str
        Full path to text file (accepted for API symmetry; this function
        does not read it)

    Returns
    -------
    LabbCatParser
        Parser set up with labelled 'transcrip' and 'segment' tiers and a
        phone-under-word hierarchy
    """
    word_tier = OrthographyTier('transcrip', 'word')
    phone_tier = OrthographyTier('segment', 'phone')

    # Both tiers are flagged as label tiers.
    for tier in (word_tier, phone_tier):
        tier.label = True

    phone_in_word = Hierarchy({'phone': 'word', 'word': None})
    return LabbCatParser([word_tier, phone_tier], phone_in_word)
Exemplo n.º 16
0
            tier_guesses[k] = ('word', v[0])
        else:
            tier_guesses[k] = ('segment', v[0])
    word_labels = [(k,v[1]) for k,v in tier_guesses.items() if v[0] == 'word']
    phone_labels = [(k,v[1]) for k,v in tier_guesses.items() if v[0] == 'segment']
    word_type = figure_linguistic_type(word_labels)
    phone_type = figure_linguistic_type(phone_labels)
    for k,v in tier_guesses.items():
        if v[0] == 'word':
            tier_guesses[k] = word_type
        else:
            tier_guesses[k] = phone_type
    h = {word_type: None}
    if phone_type is not None:
        h[phone_type] = word_type
    hierarchy = Hierarchy(h)
    return tier_guesses, hierarchy

def inspect_textgrid(path):
    """
    Generate a :class:`~polyglotdb.io.parsers.textgrid.TextgridParser` for a specified TextGrid file

    Parameters
    ----------
    path : str
        Full path to TextGrid file

    Returns
    -------
    :class:`~polyglotdb.io.parsers.textgrid.TextgridParser`
        Autodetected parser for the TextGrid file
Exemplo n.º 17
0
 def __init__(self, annotation_tiers, stop_check=None, call_back=None):
     """
     Initialise the parser with a word-only hierarchy.

     Transcriptions are not requested (``make_transcription=False``);
     ``stop_check`` and ``call_back`` are forwarded unchanged to the
     parent initialiser.
     """
     word_only = Hierarchy({'word': None})
     parent_init = super(OrthographyTextParser, self).__init__
     parent_init(annotation_tiers, word_only,
                 make_transcription=False,
                 stop_check=stop_check,
                 call_back=call_back)
Exemplo n.º 18
0
from polyglotdb.structure import Hierarchy

from ..types.parsing import *

from ..parsers import FaveParser

def inspect_fave(path):
    """
    Generate a :class:`~polyglotdb.io.parsers.fave.FaveParser`
    for a specified text file for parsing it as a FAVE text file

    Parameters
    ----------
    path : str
        Full path to text file (not read by this function)

    Returns
    -------
    :class:`~polyglotdb.io.parsers.fave.FaveParser`
        Parser set up with labelled 'word' and 'phone' tiers and a
        phone-under-word hierarchy
    """
    # The docstring opener used to be tab-indented while the body used
    # spaces, which made the function fail to compile; the whole block is
    # now indented with four spaces.
    annotation_types = [OrthographyTier('word', 'word'),
                        OrthographyTier('phone', 'phone')]

    # Both tiers are flagged as label tiers.
    annotation_types[0].label = True
    annotation_types[1].label = True
    hierarchy = Hierarchy({'phone': 'word', 'word': None})

    return FaveParser(annotation_types, hierarchy)