Пример #1
0
def _make_package(args):  # pragma: no cover
    """Prepare transcriptiondata from the transcription sources.

    For every source of type ``td`` in the repos, resolve each grapheme
    against BIPA and write a ``<NAME>.tsv`` file into the
    ``transcriptiondata`` package directory; finally write the LingPy
    sound-class mapping to ``soundclasses/lingpy.tsv``.

    :param args: CLI args object providing ``repos`` and ``log``.
    """
    from lingpy.sequence.sound_classes import token2class
    from lingpy.data import Model

    columns = ['LATEX', 'FEATURES', 'SOUND', 'IMAGE', 'COUNT', 'NOTE']
    bipa = TranscriptionSystem('bipa')
    for src, rows in args.repos.iter_sources(type='td'):
        args.log.info('TranscriptionData {0} ...'.format(src['NAME']))
        uritemplate = URITemplate(
            src['URITEMPLATE']) if src['URITEMPLATE'] else None
        out = [[
            'BIPA_GRAPHEME', 'CLTS_NAME', 'GENERATED', 'EXPLICIT', 'GRAPHEME',
            'URL'
        ] + columns]
        graphemes = set()
        for row in rows:
            # Only the first occurrence of a grapheme is kept.
            if row['GRAPHEME'] in graphemes:
                args.log.warn('skipping duplicate grapheme: {0}'.format(
                    row['GRAPHEME']))
                continue
            graphemes.add(row['GRAPHEME'])
            if not row['BIPA']:
                # No explicit BIPA mapping: try to resolve the grapheme itself.
                bipa_sound = bipa[row['GRAPHEME']]
                explicit = ''
            else:
                bipa_sound = bipa[row['BIPA']]
                explicit = '+'
            generated = '+' if bipa_sound.generated else ''
            if is_valid_sound(bipa_sound, bipa):
                bipa_grapheme = bipa_sound.s
                bipa_name = bipa_sound.name
            else:
                bipa_grapheme, bipa_name = '<NA>', '<NA>'
            url = uritemplate.expand(
                **row) if uritemplate else row.get('URL', '')
            out.append([
                bipa_grapheme, bipa_name, generated, explicit, row['GRAPHEME'],
                url
            ] + [row.get(c, '') for c in columns])
        # Statistics must exclude the header row in out[0]; counting it
        # inflated both the found count and the total.
        data_rows = out[1:]
        found = sum(1 for o in data_rows if o[0] != '<NA>')
        total = len(data_rows)
        args.log.info('... {0} of {1} graphemes found ({2:.0f}%)'.format(
            found, total, (found / total * 100) if total else 0))
        with UnicodeWriter(pkg_path('transcriptiondata',
                                    '{0}.tsv'.format(src['NAME'])),
                           delimiter='\t') as writer:
            writer.writerows(out)

    count = 0
    with UnicodeWriter(pkg_path('soundclasses', 'lingpy.tsv'),
                       delimiter='\t') as writer:
        writer.writerow(['CLTS_NAME', 'BIPA_GRAPHEME'] + SOUNDCLASS_SYSTEMS)
        for grapheme, sound in sorted(bipa.sounds.items()):
            # Aliases would duplicate their canonical sound's row.
            if not sound.alias:
                writer.writerow([sound.name, grapheme] + [
                    token2class(grapheme, Model(cls))
                    for cls in SOUNDCLASS_SYSTEMS
                ])
                count += 1
    args.log.info('SoundClasses: {0} written to file.'.format(count))
Пример #2
0
def main():
    """Download the Creanza dataset and write its grapheme catalog as TSV."""
    catalog = {}

    # Stream the remote file line by line; rows are collected only after
    # the header line has been seen.
    collecting = False
    with urllib.request.urlopen(CREANZA_DATASET) as source:
        for raw in source:
            text = raw.decode('utf-8').strip()
            if collecting:
                cols = text.split('\t')
                catalog[cols[1]] = [cols[0], cols[2]]
            if text == HEADER_STR:
                # Switch into data-collection mode.
                collecting = True

    # Write the collected entries, skipping blacklisted segments.
    with open(pkg_path('sources', 'creanza.tsv').as_posix(), 'w') as sink:
        sink.write('GRAPHEME\tCOLUMN (ID)\tCOUNT\n')
        for key in sorted(catalog):
            if key not in BLACKLIST:
                sink.write('\t'.join([key] + catalog[key]))
                sink.write('\n')
Пример #3
0
def main():
    """Fetch all PBase inventories and write the grapheme catalog as TSV."""
    # There is no language catalog on the on-line PBASE interface;
    # while we could loop over index until we hit a 404 or a 500, this is too
    # complicated and not guaranteed to work if they reorganize their
    # interface. I manually checked for the last entry and I'm looping
    # over this range (the '+1' is to make it clear that last entry is
    # 629 and not 630)
    catalog = set()
    for lang_id in range(1, 629 + 1):
        while True:
            try:
                print('Fetching language #%i...' % lang_id)
                catalog.update(fetch_inventory(lang_id))
            except Exception:
                # Fix: the original format string was '#i' (no conversion
                # specifier), so this very line raised TypeError.
                print('WARNING: error fetching #%i, waiting a few seconds...' %
                      lang_id)
                time.sleep(30)
                continue

            break

    # output
    with open(pkg_path('sources', 'pbase.tsv').as_posix(), 'w') as handler:
        handler.write('GRAPHEME\n')
        for grapheme in sorted(catalog):
            handler.write('%s\n' % grapheme)
Пример #4
0
 def iter_transcriptionsystem(self, include_private=False, exclude=None):
     """Yield a TranscriptionSystem for each system directory in the package.

     Directories whose name starts with ``_`` are skipped unless
     *include_private* is true; names listed in *exclude* are always skipped.
     """
     skip = exclude or []
     entries = sorted(pkg_path('transcriptionsystems').iterdir(),
                      key=lambda p: p.name)
     for entry in entries:
         if not entry.is_dir():
             continue
         if entry.name.startswith('_') and not include_private:
             continue
         if entry.name in skip:
             continue
         yield TranscriptionSystem(entry.name)
Пример #5
0
def main():
    """Scrape Diachronica segment links and write them as a two-column TSV."""
    # Fetch the whole page at once; '.' in the pattern stands in for
    # characters that vary in the markup (spaces, '=', '/').
    # (The unused 'in_data' flag from the original was removed.)
    with urllib.request.urlopen(DATASET) as handler:
        data = handler.read().decode('utf-8')
    sounds = re.findall(r'<a.href="(.*?search.q=.*?)">(.*?)<.a>', data)
    print(len(sounds))

    # output: one grapheme plus its absolute URL per line
    with open(pkg_path('sources', 'diachronica.tsv').as_posix(),
              'w') as handler:
        handler.write('GRAPHEME\tURL\n')
        for url, segment in sounds:
            handler.write(segment + '\t' + PREFIX + url + '\n')
Пример #6
0
def main():
    """Fetch all LAPSyD inventories and write the segment catalog as TSV."""
    # fetch language codes
    print("Fetching language codes...")
    lang_codes = fetch_lang_codes()

    # fetch inventories
    catalog = {}
    inventories = set()
    for lang_code in lang_codes:
        # make sure we retry fetching if it fails (e.g., network failure,
        # hitting server limit, etc.)
        while True:
            try:
                print("Fetching %s..." % lang_code)
                time.sleep(1)
                inventory = fetch_inventory(lang_code)
                # update global catalogue
                for segment in inventory:
                    catalog[segment[0]] = [segment[1], segment[2]]
                # update global in-inventory count
                # Fix: the original iterated 'inventories' (itself), which
                # made this update a permanent no-op.
                inventories.update([segment[0] for segment in inventory])
            except Exception:
                print(
                    "WARNING: Error while fetching '%s', retrying in some seconds."
                    % lang_code)
                time.sleep(30)
                continue
            break

    # output
    with open(pkg_path('sources', 'lapsyd.tsv').as_posix(), 'w') as handler:
        handler.write('LAPSyD ID\tGRAPHEME\tLAPSyD DESCRIPTION\n')
        for segment_id in sorted(catalog):
            buf = [segment_id, catalog[segment_id][0], catalog[segment_id][1]]
            handler.write('\t'.join(buf))
            handler.write('\n')
Пример #7
0
import urllib.request
import re
from pyclts.util import pkg_path

# Scrape the APiCS Online parameter pages (131..307 are the segment
# parameters) and record each page's id, grapheme and feature description,
# as found in its <h2> heading.
URL = 'http://apics-online.info/parameters/{0}'
sounds = []
for i in range(131, 308):
    with urllib.request.urlopen(URL.format(i)) as handler:
        data = handler.read().decode('utf-8')
    print('Downloading parameter', i)
    # Raw string: '\s' in a plain literal is an invalid escape sequence
    # (DeprecationWarning, and a SyntaxWarning on newer Pythons).
    id_, sound, feature = re.findall(
        r'<h2>([1-3][0-9][0-9]) ([^\s]+) . ([^<]*)</h2>', data)[0]
    sounds += [[id_, sound, feature]]
with open(pkg_path('sources', 'apics.tsv'), 'w') as f:
    # The leading tab leaves the BIPA column empty in every data row.
    f.write('BIPA\tGRAPHEME\tURL\tFEATURES\n')
    for line in sounds:
        f.write(
            '\t{0[1]}\thttp://apics-online.info/parameters/{0[0]}\t{0[2]}\n'.
            format(line))
Пример #8
0
 def iter_transcriptiondata(self):
     """Yield a TranscriptionData for every file in the package data dir."""
     entries = sorted(pkg_path('transcriptiondata').iterdir(),
                      key=lambda p: p.name)
     for entry in entries:
         yield TranscriptionData(entry.stem)
Пример #9
0
    def __init__(self, id_):
        """
        Load a transcription system from the package data.

        :param id_: The name of a transcription system, i.e. a directory
            under the package's ``transcriptionsystems`` data directory.
        :raises ValueError: If ``id_`` does not name a known system
            directory, if the system data contains a duplicate grapheme or
            duplicate feature bundle, if a feature value is not listed in
            ``features.json``, or if an aliased sound has no non-aliased
            counterpart.
        """
        if hasattr(self, 'features'):
            # Only initialize, if this is really a new instance!
            # (Presumably instances are cached/reused by the class —
            # TODO confirm against the class's __new__ or factory.)
            return
        assert id_
        system = pkg_path('transcriptionsystems', id_)
        if not (system.exists() and system.is_dir()):
            raise ValueError('unknown system: {0}'.format(id_))

        # Shared table schema, re-pointed at this system's own metadata file.
        self.system = TableGroup.from_file(
            pkg_path('transcriptionsystems',
                     'transcription-system-metadata.json'))
        self.system._fname = system / 'metadata.json'

        # Maps per sound type, later also keyed by full featuresets (below).
        self.features = {'consonant': {}, 'vowel': {}, 'tone': {}}
        # dictionary for feature values, checks when writing elements from
        # write_order to make sure no output is doubled
        self._feature_values = {}

        # load the general features
        features = jsonlib.load(
            pkg_path('transcriptionsystems', 'features.json'))

        # Diacritic graphemes per sound type, mapping grapheme -> value.
        self.diacritics = dict(consonant={},
                               vowel={},
                               click={},
                               diphthong={},
                               tone={},
                               cluster={})
        for dia in itertable(self.system.tabledict['diacritics.tsv']):
            # Aliases and typographic variants do not define canonical
            # value -> grapheme mappings.
            if not dia['alias'] and not dia['typography']:
                self.features[dia['type']][dia['value']] = dia['grapheme']
            # assign feature values to the dictionary
            self._feature_values[dia['value']] = dia['feature']
            self.diacritics[dia['type']][dia['grapheme']] = dia['value']

        self.sound_classes = {}
        self.columns = {}  # the basic column structure, to allow for rendering
        self.sounds = {}  # Sounds by grapheme
        self._covered = {}
        # check for unresolved aliased sounds
        aliases = []
        for cls in [Consonant, Vowel, Tone, Marker]:  # noqa: F405
            type_ = cls.__name__.lower()
            self.sound_classes[type_] = cls
            # store information on column structure to allow for rendering of a
            # sound in this form, which will make it easier to insert it when
            # finding generated sounds
            self.columns[type_] = [
                c['name'].lower()
                for c in self.system.tabledict['{0}s.tsv'.format(
                    type_)].asdict()['tableSchema']['columns']
            ]
            # l is the 0-based row index; error messages add 2 to account
            # for the header row and 1-based line numbering.
            for l, item in enumerate(
                    itertable(
                        self.system.tabledict['{0}s.tsv'.format(type_)])):
                if item['grapheme'] in self.sounds:
                    raise ValueError(
                        'duplicate grapheme in {0}:{1}: {2}'.format(
                            type_ + 's.tsv', l + 2, item['grapheme']))
                sound = cls(ts=self, **item)
                # make sure this does not take too long
                for key, value in item.items():
                    # Register each previously unseen feature value and
                    # validate it against the general feature inventory.
                    if key not in {'grapheme', 'note', 'alias'} and \
                            value and value not in self._feature_values:
                        self._feature_values[value] = key
                        if type_ != 'marker' and value not in features[type_][
                                key]:
                            raise ValueError(
                                "Unrecognized features ({0}: {1}, line {2}))".
                                format(key, value, l + 2))

                self.sounds[item['grapheme']] = sound
                if not sound.alias:
                    # Non-alias sounds must have a unique featureset, which
                    # is also registered as a lookup key in self.features.
                    if sound.featureset in self.features:
                        raise ValueError(
                            'duplicate features in {0}:{1}: {2}'.format(
                                type_ + 's.tsv', l + 2, sound.name))
                    self.features[sound.featureset] = sound
                else:
                    aliases += [(l, sound.type, sound.featureset)]
        # check for consistency of aliases: if an alias has no counterpart, it
        # is orphaned and needs to be deleted or given an accepted non-aliased
        # sound
        if [x for x in aliases
                if x[2] not in self.features]:  # pragma: no cover
            error = ', '.join(
                text_type(x[0] + 2) + '/' + text_type(x[1]) for x in aliases
                if x[2] not in self.features)
            raise ValueError('Orphaned aliases in line(s) {0}'.format(error))

        # basic regular expression, used to match the basic sounds in the system.
        self._regex = None
        self._update_regex()

        # normalization data
        self._normalize = {
            norm(r['source']): norm(r['target'])
            for r in itertable(self.system.tabledict['normalize.tsv'])
        }