Пример #1
0
def test_ts():
    """Check that TranscriptionSystem refuses to load from unusable ids."""
    # The empty id is the one case expected to fail an assertion instead
    # of raising ValueError.
    with pytest.raises(AssertionError):
        TranscriptionSystem('')

    # NOTE(review): '_f1'..'_f3' look like deliberately broken fixture
    # systems and 'what' like a non-existent one -- confirm against the
    # test data directory.
    for bad_id in ('_f1', '_f2', '_f3', 'what'):
        with pytest.raises(ValueError):
            TranscriptionSystem(bad_id)
Пример #2
0
 def __init__(self, id_):
     """Load the sound class system ``id_`` from 'soundclasses/lingpy.tsv'.

     Nothing happens when the instance already carries a ``data``
     attribute, so an already initialized object is never reloaded.

     Raises AssertionError when ``id_`` is not in SOUNDCLASS_SYSTEMS.
     """
     if hasattr(self, 'data'):
         # Not a new instance: keep whatever was loaded before.
         return

     assert id_ in SOUNDCLASS_SYSTEMS
     raw, self.sounds, self.names = read_data(
         'soundclasses', 'lingpy.tsv', id_)
     # Only the first entry read for each key is kept.
     self.data = {key: entries[0] for key, entries in raw.items()}
     # All distinct class symbols that occur in the loaded system.
     self.classes = {entry['grapheme'] for entry in self.data.values()}
     self.system = TranscriptionSystem('bipa')
Пример #3
0
def _make_package(args):  # pragma: no cover
    """Prepare transcriptiondata from the transcription sources.

    For every source yielded by ``args.repos.iter_sources(type='td')`` a
    TSV file ``transcriptiondata/<NAME>.tsv`` is written that maps each
    source grapheme to the BIPA sound it resolves to ('<NA>' when the
    resolved sound is not valid). Afterwards 'soundclasses/lingpy.tsv' is
    rebuilt with one row per non-alias BIPA sound and one column per
    system in SOUNDCLASS_SYSTEMS.

    :param args: object providing ``repos`` (source iteration) and ``log``.
    """
    from lingpy.sequence.sound_classes import token2class
    from lingpy.data import Model

    columns = ['LATEX', 'FEATURES', 'SOUND', 'IMAGE', 'COUNT', 'NOTE']
    bipa = TranscriptionSystem('bipa')
    for src, rows in args.repos.iter_sources(type='td'):
        args.log.info('TranscriptionData {0} ...'.format(src['NAME']))
        uritemplate = URITemplate(
            src['URITEMPLATE']) if src['URITEMPLATE'] else None
        # The header is kept apart from the data rows so that it does not
        # leak into the statistics reported below.
        header = [
            'BIPA_GRAPHEME', 'CLTS_NAME', 'GENERATED', 'EXPLICIT', 'GRAPHEME',
            'URL'
        ] + columns
        out = []
        graphemes = set()
        for row in rows:
            if row['GRAPHEME'] in graphemes:
                args.log.warn('skipping duplicate grapheme: {0}'.format(
                    row['GRAPHEME']))
                continue
            graphemes.add(row['GRAPHEME'])
            # An explicit BIPA value in the source wins over the grapheme.
            if not row['BIPA']:
                bipa_sound = bipa[row['GRAPHEME']]
                explicit = ''
            else:
                bipa_sound = bipa[row['BIPA']]
                explicit = '+'
            generated = '+' if bipa_sound.generated else ''
            if is_valid_sound(bipa_sound, bipa):
                bipa_grapheme = bipa_sound.s
                bipa_name = bipa_sound.name
            else:
                bipa_grapheme, bipa_name = '<NA>', '<NA>'
            url = uritemplate.expand(
                **row) if uritemplate else row.get('URL', '')
            out.append([
                bipa_grapheme, bipa_name, generated, explicit, row['GRAPHEME'],
                url
            ] + [row.get(c, '') for c in columns])

        # Statistics over data rows only; an empty source reports 0%.
        total = len(out)
        found = sum(1 for o in out if o[0] != '<NA>')
        percent = 100.0 * found / total if total else 0.0
        args.log.info('... {0} of {1} graphemes found ({2:.0f}%)'.format(
            found, total, percent))
        with UnicodeWriter(pkg_path('transcriptiondata',
                                    '{0}.tsv'.format(src['NAME'])),
                           delimiter='\t') as writer:
            writer.writerows([header] + out)

    count = 0
    # Build each sound class model once instead of once per grapheme.
    models = [Model(cls) for cls in SOUNDCLASS_SYSTEMS]
    with UnicodeWriter(pkg_path('soundclasses', 'lingpy.tsv'),
                       delimiter='\t') as writer:
        writer.writerow(['CLTS_NAME', 'BIPA_GRAPHEME'] + SOUNDCLASS_SYSTEMS)
        for grapheme, sound in sorted(bipa.sounds.items()):
            if not sound.alias:
                writer.writerow([sound.name, grapheme] + [
                    token2class(grapheme, model) for model in models
                ])
                count += 1
    args.log.info('SoundClasses: {0} written to file.'.format(count))
Пример #4
0
 def __init__(self, id_):
     """Load the transcription data set ``id_`` from its TSV file.

     The file read is ``transcriptiondata/<id_>.tsv``. Nothing happens
     when the instance already carries a ``data`` attribute, so an
     already initialized object is never reloaded.
     """
     if hasattr(self, 'data'):
         # Not a new instance: keep whatever was loaded before.
         return

     # NOTE(review): 'URL' is listed twice; the duplicate is preserved
     # as-is -- confirm whether read_data needs it.
     columns = (
         'GRAPHEME', 'URL', 'BIPA_GRAPHEME', 'GENERATED', 'URL', 'LATEX',
         'FEATURES', 'SOUND', 'IMAGE', 'COUNT', 'NOTE', 'EXPLICIT',
     )
     filename = id_ + '.tsv'
     self.data, self.sounds, self.names = read_data(
         'transcriptiondata', filename, *columns)
     self.system = TranscriptionSystem('bipa')
Пример #5
0
def _make_app_data(args, test=False):
    """Collect all known sounds and write them to the app's ``data.js``.

    Sounds are gathered from every transcription data set, completed with
    the sounds of the BIPA transcription system, annotated with the
    sound class of each sound class system, and finally dumped as two
    JavaScript variables (``BIPA`` and ``normalize``).

    :param args: object providing ``repos`` and ``log``.
    :param test: when true, only the first transcription data set and the
        first sound class system are processed.
    """
    tts = TranscriptionSystem('bipa')

    def sound_to_dict(snd):
        # Base description plus one entry per feature in the sound's name.
        entry = {'name': snd.name, 'bipa': snd.s, 'type': snd.type}
        entry.update(
            (feature, getattr(snd, feature)) for feature in snd._name_order)
        return entry

    # retrieve all sounds in the datasets
    all_sounds = {}
    for td in args.repos.iter_transcriptiondata():
        for name in td.data:
            # Only keys containing a space are handled here.
            if ' ' not in name:
                continue
            snd = tts[name]
            glyph = snd.s
            assert '<?>' not in snd.s
            if snd.s not in all_sounds:
                all_sounds[glyph] = sound_to_dict(snd)
            # Graphemes of the data set share the dict of the BIPA glyph,
            # unless an entry for them exists already.
            for item in td.data[name]:
                all_sounds.setdefault(item['grapheme'], all_sounds[glyph])

            all_sounds[glyph][td.id] = td.data[name]
        if test:
            break

    # add sounds from transcription system
    for grapheme in tts:
        if grapheme in all_sounds:
            continue
        snd = tts[grapheme]
        if snd.type == 'marker':
            continue
        if snd.s in all_sounds:
            all_sounds[grapheme] = all_sounds[snd.s]
        else:
            all_sounds[grapheme] = sound_to_dict(snd)

    args.log.info('{0} unique graphemes loaded'.format(len(all_sounds)))

    for index, sc in enumerate(args.repos.iter_soundclass()):
        first_system = index == 0
        for key in all_sounds:
            try:
                all_sounds[key][sc.id] = [{'grapheme': sc[key]}]
            except KeyError:  # pragma: no cover
                pass
            if first_system and hasattr(key, 's'):
                all_sounds[key]['bipa'] = tts[key].s
        if test:
            break

    datafile = args.repos.app_path('data.js')
    with datafile.open('w', encoding='utf8') as handler:
        bipa_js = json.dumps(all_sounds, indent=2)
        handler.write('var BIPA = ' + bipa_js + ';\n')
        normalize_js = json.dumps(tts._normalize)
        handler.write('var normalize = ' + normalize_js + ';\n')
    args.log.info('{0} written'.format(datafile))
Пример #6
0
def features(args):
    """Print all (type, feature, value) triples of a transcription system.

    The system is selected by ``args.system``; sounds of type 'marker'
    and 'unknownsound' are left out. Missing feature values are shown as
    empty strings.
    """
    system = TranscriptionSystem(args.system)
    skipped_types = ['marker', 'unknownsound']
    triples = {
        (snd.type, feature, value or '')
        for snd in system.sounds.values()
        if snd.type not in skipped_types
        for feature, value in snd.featuredict.items()
    }
    table = Table('TYPE', 'FEATURE', 'VALUE')
    table.extend(sorted(triples))
    print(table.render(tablefmt='simple'))
Пример #7
0
def sounds(args):
    """Print one table row for each sound given in ``args.args``.

    Every argument is looked up in the transcription system named by
    ``args.system``. Unknown sounds are rendered with '?' placeholders.
    The table is rendered in the format given by ``args.format``.
    """
    tts = TranscriptionSystem(args.system)
    rows = []
    for raw in args.args:
        # Byte strings are decoded before the lookup.
        grapheme = raw if isinstance(raw, text_type) else raw.decode('utf8')
        sound = tts.get(grapheme)
        if sound.type == 'unknownsound':
            rows.append(['?', sound.source, '?', '?', '?'])
            continue
        rows.append([
            text_type(sound),
            sound.source or ' ',
            '1' if sound.generated else ' ',
            sound.grapheme if sound.alias else ' ',
            sound.name,
        ])
    tbl = Table(
        args.system.upper(), 'SOURCE', 'GENERATED', 'ALIAS', 'NAME',
        rows=rows)
    print(tbl.render(tablefmt=args.format, condensed=False))
Пример #8
0
def dstats(args):
    """Print, per transcription data set, how many names are valid BIPA.

    One row is printed for each data set with its id, the number of
    names that resolve to a valid BIPA sound, the total number of names
    and the ratio of the two. A final row holds the number of data sets
    and the mean of the ratios.

    Data sets without names, and a repository without data sets, are
    reported with a ratio of 0.0 instead of raising ZeroDivisionError.
    """
    table = [['id', 'valid', 'total', 'percent']]
    bipa = TranscriptionSystem('bipa')
    for td in args.repos.iter_transcriptiondata():
        flags = [
            1 if is_valid_sound(bipa[name], bipa) else 0 for name in td.names
        ]
        valid, total = sum(flags), len(flags)
        table.append([td.id, valid, total, valid / total if total else 0.0])

    # Summary row: computed before it is appended, so only the data set
    # rows enter the mean.
    datasets = len(table) - 1
    ratio_sum = sum(line[-1] for line in table[1:])
    table.append(
        [datasets, '', '', ratio_sum / datasets if datasets else 0.0])
    print(tabulate.tabulate(table, headers='firstrow'))
Пример #9
0
def table(args):
    """Print the given sounds as tables, grouped by sound type.

    The sounds in ``args.args`` are looked up in the transcription system
    named by ``args.system`` and optionally narrowed by ``args.filter``
    ('generated', 'unknown' or 'known'; any other value keeps all
    sounds). One table is printed per sound class that has entries,
    followed by a table of the unknown sounds, if any.
    """
    tts = TranscriptionSystem(args.system)
    selected = []
    for raw in args.args:
        # Byte strings are decoded before the lookup.
        grapheme = raw if isinstance(raw, text_type) else raw.decode('utf8')
        selected.append(tts.get(grapheme))

    predicates = {
        'generated': lambda s: s.generated,
        'unknown': lambda s: s.type == 'unknownsound',
        'known': lambda s: not s.generated and not s.type == 'unknownsound',
    }
    keep = predicates.get(args.filter)
    if keep is not None:
        selected = [s for s in selected if keep(s)]

    grouped = defaultdict(list)
    unknown_seen = 0
    for snd in selected:
        if snd.type == 'unknownsound':
            # Unknown sounds get a running number instead of a table row.
            unknown_seen += 1
            grouped['unknownsound'].append(
                [text_type(unknown_seen), snd.source or '', snd.grapheme])
        else:
            grouped[snd.type].append(snd.table)

    for cls in tts.sound_classes:
        if cls not in grouped:
            continue
        print('# {0}\n'.format(cls))
        headers = [column.upper() for column in tts.columns[cls]]
        tbl = Table(*headers, rows=grouped[cls])
        print(tbl.render(tablefmt=args.format, condensed=False))
        print('')

    if grouped['unknownsound']:
        print('# Unknown sounds\n')
        tbl = Table(
            'NUMBER', 'SOURCE', 'GRAPHEME', rows=grouped['unknownsound'])
        print(tbl.render(tablefmt=args.format, condensed=False))
Пример #10
0
class SoundClasses(TranscriptionBase):
    """
    Class for handling sound class models.
    """
    def __init__(self, id_):
        """Load the sound class system ``id_`` from 'soundclasses/lingpy.tsv'.

        Nothing happens when the instance already carries a ``data``
        attribute, so an already initialized object is never reloaded.

        Raises AssertionError when ``id_`` is not in SOUNDCLASS_SYSTEMS.
        """
        if hasattr(self, 'data'):
            # Not a new instance: keep whatever was loaded before.
            return

        assert id_ in SOUNDCLASS_SYSTEMS
        raw, self.sounds, self.names = read_data(
            'soundclasses', 'lingpy.tsv', id_)
        # Only the first entry read for each key is kept.
        self.data = {key: entries[0] for key, entries in raw.items()}
        # All distinct class symbols that occur in the loaded system.
        self.classes = {entry['grapheme'] for entry in self.data.values()}
        self.system = TranscriptionSystem('bipa')

    def resolve_sound(self, sound):
        """Return the sound class symbol for a sound.

        Notes
        -----
        ``sound`` may be a ``Sound`` or anything the BIPA system can look
        up. If the sound's name is not listed directly, a simpler sound is
        tried instead: diphthongs and clusters are resolved through their
        ``from_sound``; for other sounds the features of kind
        'laminality', 'ejection' and 'tone' are dropped from the name and
        the remaining name is shortened from the front, one word at a
        time, while it still has at least four words.

        Raises
        ------
        KeyError
            If no listed sound could be found.
        """
        if not isinstance(sound, Sound):
            sound = self.system[sound]

        if sound.name in self.data:
            return self.data[sound.name]['grapheme']

        if sound.type != 'unknownsound':
            if sound.type in ('diphthong', 'cluster'):
                return self.resolve_sound(sound.from_sound)

            ignored_features = ('laminality', 'ejection', 'tone')
            feature_of = self.system._feature_values
            words = [
                word for word in sound.name.split(' ')
                if feature_of.get(word, '') not in ignored_features
            ]
            while len(words) >= 4:
                candidate = self.system.get(' '.join(words))
                if candidate and candidate.name in self.data:
                    return self.resolve_sound(candidate)
                # Not listed: retry without the leading word.
                del words[0]

        raise KeyError(":sc:resolve_sound: No sound could be found.")
Пример #11
0
from lingpy import *
import json
from bxs import sampa
from unicodedata import normalize
from pyclts.transcriptionsystem import TranscriptionSystem
from lingpy.data.ipa.sampa import xs

# Build a mapping from IPA strings to SAMPA graphemes (``mapper``) and a
# profile table (``prof``) with one row per SAMPA grapheme that resolves
# to a known BIPA sound.
bipa = TranscriptionSystem('bipa')

data = csv2list('graphemes.tsv')

prof = [['Grapheme', 'IPA', 'BIPA', 'CLTS_Name']]
visited = set()

mapper = {}
for k, v in sampa.items():
    # Entries given as 'U+XXXX' code points are replaced, in place, by the
    # character they denote.
    if 'U+' in v['ipa']:
        _codepoint = int('0x' + v['ipa'][2:], 0)
        v['ipa'] = chr(_codepoint)

    # Register both the decomposed and the composed form of the IPA string.
    for _form in ('NFD', 'NFC'):
        mapper[normalize(_form, v['ipa'])] = v['grapheme']

    sound = bipa[v['ipa']]
    if sound.type not in ['unknownsound', 'marker']:
        mapper[sound.s] = v['grapheme']

        # Each grapheme contributes at most one row to the profile.
        if v['grapheme'] not in visited:
            _clts_name = sound.name.replace(' ', '_')
            prof.append([v['grapheme'], v['ipa'], sound.s, _clts_name])
            visited.add(v['grapheme'])
Пример #12
0
def dump(args, test=False):
    """Write 'sounds.tsv' and 'graphemes.tsv' into the repository data dir.

    Sounds are collected in four passes:

    1. all non-marker sounds of the BIPA transcription system,
    2. the names of every transcription data set that resolve to a valid
       BIPA sound,
    3. the sound class of every collected sound in each sound class
       system,
    4. the rendering of every collected sound in each transcription
       system other than BIPA.

    Each pass appends ``Grapheme`` records; the sound table keeps one
    entry per sound name.

    :param args: object providing ``repos`` and ``log``.
    :param test: when true, passes 2-4 stop after their first data set or
        system.
    """
    sounds = defaultdict(dict)
    data = []
    bipa = TranscriptionSystem('bipa')
    # start from assembling bipa-sounds
    # Sounds with a falsy ``alias`` sort first, so that an alias always
    # finds the entry of its sound already present (see the asserts).
    for grapheme, sound in sorted(bipa.sounds.items(),
                                  key=lambda p: p[1].alias
                                  if p[1].alias else False):
        if sound.type not in ['marker']:
            if sound.alias:
                assert sound.name in sounds
                sounds[sound.name]['aliases'].add(grapheme)
            else:
                assert sound.name not in sounds
                sounds[sound.name] = {
                    'grapheme': grapheme,
                    'unicode': sound.uname or '',
                    'generated': '',
                    'note': sound.note or '',
                    'type': sound.type,
                    'aliases': set(),
                    'normalized': '+' if sound.normalized else ''
                }
            data.append(
                Grapheme(grapheme, sound.name, '+', '', 'bipa', '0', '', '',
                         '', '', sound.note or ''))

    # add sounds systematically by their alias
    for td in args.repos.iter_transcriptiondata():
        for name in td.names:
            bipa_sound = bipa[name]
            # check for consistency of mapping here
            if not is_valid_sound(bipa_sound, bipa):
                continue

            # ``get`` is used on purpose: it does not create an empty
            # entry in the defaultdict for unseen names.
            sound = sounds.get(name)
            if not sound:
                sound = sounds[name] = {
                    'grapheme': bipa_sound.s,
                    'aliases': {bipa_sound.s},
                    'generated': '+',
                    'unicode': bipa_sound.uname or '',
                    'note': '',
                    'type': bipa_sound.type,
                    'alias': '+' if bipa_sound.alias else '',
                    'normalized': '+' if bipa_sound.normalized else ''
                }

            for item in td.data[name]:
                sound['aliases'].add(item['grapheme'])
                # add the values here
                data.append(
                    Grapheme(
                        item['grapheme'],
                        name,
                        item['explicit'],
                        '',  # sounds[name]['alias'],
                        td.id,
                        item.get('frequency', ''),
                        item.get('url', ''),
                        item.get('features', ''),
                        item.get('image', ''),
                        item.get('sound', ''),
                    ))
        if test:
            break

    # sound classes have a generative component, so we need to treat them
    # separately
    for sc in args.repos.iter_soundclass():
        for name in sounds:
            try:
                grapheme = sc[name]
                data.append(
                    Grapheme(
                        grapheme,
                        name,
                        '+' if name in sc.data else '',
                        '',
                        sc.id,
                    ))
            except KeyError:  # pragma: no cover
                # Format the message up front: passing the grapheme as a
                # second positional argument would make the sound name the
                # %-format string of the log record.
                args.log.debug('{0}: {1}'.format(
                    name, sounds[name]['grapheme']))
        if test:
            break

    # last run, check again for each of the remaining transcription systems,
    # whether we can translate the sound
    for ts in args.repos.iter_transcriptionsystem(exclude=['bipa']):
        for name in sounds:
            try:
                ts_sound = ts[name]
                if is_valid_sound(ts_sound, ts):
                    sounds[name]['aliases'].add(ts_sound.s)
                    data.append(
                        Grapheme(
                            ts_sound.s,
                            name,
                            '' if sounds[name]['generated'] else '+',
                            '',  # sounds[name]['alias'],
                            ts.id,
                        ))
            except ValueError:
                pass
            except TypeError:
                args.log.debug('{0}: {1}'.format(ts.id, name))
        if test:
            break

    with UnicodeWriter(args.repos.data_path('sounds.tsv'),
                       delimiter='\t') as writer:
        writer.writerow(
            ['NAME', 'TYPE', 'GRAPHEME', 'UNICODE', 'GENERATED', 'NOTE'])
        for k, v in sorted(sounds.items(), reverse=True):
            writer.writerow([
                k, v['type'], v['grapheme'], v['unicode'], v['generated'],
                v['note']
            ])

    with UnicodeWriter(args.repos.data_path('graphemes.tsv'),
                       delimiter='\t') as writer:
        writer.writerow([f.name for f in attr.fields(Grapheme)])
        for row in data:
            writer.writerow(attr.astuple(row))