def _make_package(args):  # pragma: no cover
    """Prepare transcriptiondata from the transcription sources."""
    from lingpy.sequence.sound_classes import token2class
    from lingpy.data import Model

    # extra columns copied verbatim from each source row into the output
    columns = ['LATEX', 'FEATURES', 'SOUND', 'IMAGE', 'COUNT', 'NOTE']
    bipa = TranscriptionSystem('bipa')
    for src, rows in args.repos.iter_sources(type='td'):
        args.log.info('TranscriptionData {0} ...'.format(src['NAME']))
        uritemplate = URITemplate(
            src['URITEMPLATE']) if src['URITEMPLATE'] else None
        out = [[
            'BIPA_GRAPHEME', 'CLTS_NAME', 'GENERATED', 'EXPLICIT', 'GRAPHEME',
            'URL'
        ] + columns]
        graphemes = set()
        for row in rows:
            # first occurrence of a grapheme wins; duplicates are skipped
            if row['GRAPHEME'] in graphemes:
                args.log.warn('skipping duplicate grapheme: {0}'.format(
                    row['GRAPHEME']))
                continue
            graphemes.add(row['GRAPHEME'])
            if not row['BIPA']:
                # no explicit BIPA mapping given: try to parse the source
                # grapheme itself with the BIPA system
                bipa_sound = bipa[row['GRAPHEME']]
                explicit = ''
            else:
                bipa_sound = bipa[row['BIPA']]
                explicit = '+'
            generated = '+' if bipa_sound.generated else ''
            if is_valid_sound(bipa_sound, bipa):
                bipa_grapheme = bipa_sound.s
                bipa_name = bipa_sound.name
            else:
                # unparseable/invalid sounds are still written, marked <NA>
                bipa_grapheme, bipa_name = '<NA>', '<NA>'
            url = uritemplate.expand(
                **row) if uritemplate else row.get('URL', '')
            out.append([
                bipa_grapheme, bipa_name, generated, explicit,
                row['GRAPHEME'], url
            ] + [row.get(c, '') for c in columns])
        found = len([o for o in out if o[0] != '<NA>'])
        # NOTE(review): len(out) includes the header row, so the percentage
        # is computed against data rows + 1 — confirm whether intended.
        args.log.info('... {0} of {1} graphemes found ({2:.0f}%)'.format(
            found, len(out), found / len(out) * 100))
        with UnicodeWriter(pkg_path('transcriptiondata',
                                    '{0}.tsv'.format(src['NAME'])),
                           delimiter='\t') as writer:
            writer.writerows(out)

    # write the LingPy sound-class mapping for all non-alias BIPA sounds
    count = 0
    with UnicodeWriter(pkg_path('soundclasses', 'lingpy.tsv'),
                       delimiter='\t') as writer:
        writer.writerow(['CLTS_NAME', 'BIPA_GRAPHEME'] + SOUNDCLASS_SYSTEMS)
        for grapheme, sound in sorted(bipa.sounds.items()):
            if not sound.alias:
                writer.writerow([sound.name, grapheme] + [
                    token2class(grapheme, Model(cls))
                    for cls in SOUNDCLASS_SYSTEMS
                ])
                count += 1
    args.log.info('SoundClasses: {0} written to file.'.format(count))
def main():
    """Fetch the Creanza dataset and write its graphemes to sources/creanza.tsv.

    Collects a ``grapheme -> [column_id, count]`` mapping from the rows that
    follow ``HEADER_STR`` in the remote file, then writes all non-blacklisted
    graphemes as a TSV.
    """
    catalog = {}

    # iterate over all lines, collecting data from the appropriate ones
    in_data = False
    with urllib.request.urlopen(CREANZA_DATASET) as handler:
        for line in handler:
            # clean line
            line = line.decode('utf-8').strip()

            if in_data:
                fields = line.split('\t')
                # guard against blank/truncated lines (e.g. a trailing
                # newline at EOF), which would raise an IndexError
                if len(fields) >= 3:
                    catalog[fields[1]] = [fields[0], fields[2]]

            if line == HEADER_STR:
                # enter into collecting data mode
                in_data = True

    # output; write UTF-8 explicitly so IPA graphemes survive on platforms
    # whose locale encoding is not UTF-8
    with open(pkg_path('sources', 'creanza.tsv').as_posix(), 'w',
              encoding='utf-8') as handler:
        handler.write('GRAPHEME\tCOLUMN (ID)\tCOUNT\n')
        for segment in sorted(catalog):
            if segment not in BLACKLIST:
                buf = [segment, catalog[segment][0], catalog[segment][1]]
                handler.write('\t'.join(buf))
                handler.write('\n')
def main():
    """Fetch all PBASE inventories and write their graphemes to sources/pbase.tsv."""
    # There is no language catalog on the on-line PBASE interface;
    # while we could loop over index until we hit a 404 or a 500, this is too
    # complicated and not guaranteed to work if they reorganize their
    # interface. I manually checked for the last entry and I'm looping
    # over this range (the '+1' is to make it clear that last entry is
    # 629 and not 630)
    catalog = set()
    for lang_id in range(1, 629 + 1):
        # retry on failure (network errors, server throttling, etc.)
        while True:
            try:
                print('Fetching language #%i...' % lang_id)
                catalog.update(fetch_inventory(lang_id))
            except Exception:
                # BUG FIX: the original message used '#i' without the '%',
                # so the %-formatting itself raised a TypeError inside the
                # handler; '#%i' is what was intended. Also narrowed the
                # bare `except:` so KeyboardInterrupt can still abort.
                print('WARNING: error fetching #%i, waiting a few seconds...'
                      % lang_id)
                time.sleep(30)
                continue
            break

    # output
    with open(pkg_path('sources', 'pbase.tsv').as_posix(), 'w') as handler:
        handler.write('GRAPHEME\n')
        for grapheme in sorted(catalog):
            handler.write('%s\n' % grapheme)
def iter_transcriptionsystem(self, include_private=False, exclude=None):
    """Yield a ``TranscriptionSystem`` for each system directory, in name order.

    Directory names starting with ``_`` are considered private and skipped
    unless ``include_private`` is true; names listed in ``exclude`` are
    always skipped.
    """
    skipped = exclude or []
    entries = sorted(pkg_path('transcriptionsystems').iterdir(),
                     key=lambda entry: entry.name)
    for entry in entries:
        if not entry.is_dir():
            continue
        if entry.name.startswith('_') and not include_private:
            continue
        if entry.name in skipped:
            continue
        yield TranscriptionSystem(entry.name)
def main():
    """Scrape grapheme/URL pairs from the Index Diachronica pages and write
    them to sources/diachronica.tsv.

    The whole page is fetched in one read and graphemes are extracted with a
    single regex pass (the unused line-by-line ``in_data`` flag from an
    earlier version has been removed).
    """
    with urllib.request.urlopen(DATASET) as handler:
        data = handler.read().decode('utf-8')

    # '.' is used in the pattern where the page mixes separators,
    # so the match is deliberately loose
    sounds = re.findall('<a.href="(.*?search.q=.*?)">(.*?)<.a>', data)
    print(len(sounds))

    # output; write UTF-8 explicitly so IPA graphemes survive on platforms
    # whose locale encoding is not UTF-8
    with open(pkg_path('sources', 'diachronica.tsv').as_posix(), 'w',
              encoding='utf-8') as handler:
        handler.write('GRAPHEME\tURL\n')
        for url, segment in sounds:
            handler.write(segment + '\t' + PREFIX + url + '\n')
def main():
    """Fetch all LAPSyD inventories and write the segments to sources/lapsyd.tsv."""
    # fetch language codes
    print("Fetching language codes...")
    lang_codes = fetch_lang_codes()

    # fetch inventories
    catalog = {}
    inventories = set()
    for lang_code in lang_codes:
        # make sure we retry fetching if it fails (e.g., network failure,
        # hitting server limit, etc.)
        while True:
            try:
                print("Fetching %s..." % lang_code)
                time.sleep(1)
                inventory = fetch_inventory(lang_code)

                # update global catalogue
                for segment in inventory:
                    catalog[segment[0]] = [segment[1], segment[2]]

                # update global in-inventory count
                # BUG FIX: the original iterated over `inventories` itself
                # (always a no-op), not the freshly fetched `inventory`
                inventories.update([segment[0] for segment in inventory])
            except Exception:
                # narrowed from a bare `except:` so Ctrl-C can still abort
                print(
                    "WARNING: Error while fetching '%s', retrying in some seconds."
                    % lang_code)
                time.sleep(30)
                continue
            break

    # output
    with open(pkg_path('sources', 'lapsyd.tsv').as_posix(), 'w') as handler:
        handler.write('LAPSyD ID\tGRAPHEME\tLAPSyD DESCRIPTION\n')
        for segment_id in sorted(catalog):
            buf = [segment_id, catalog[segment_id][0], catalog[segment_id][1]]
            handler.write('\t'.join(buf))
            handler.write('\n')
import urllib.request
import re

from pyclts.util import pkg_path

# Per-parameter page template on the APiCS site; parameters 131-307 are
# the segment (sound) parameters scraped below.
URL = 'http://apics-online.info/parameters/{0}'

sounds = []
for i in range(131, 308):
    with urllib.request.urlopen(URL.format(i)) as handler:
        data = handler.read().decode('utf-8')
        print('Downloading parameter', i)
        # the page title has the shape '<h2>NNN <grapheme> - <feature></h2>';
        # only the first match per page is used
        id_, sound, feature = re.findall(
            '<h2>([1-3][0-9][0-9]) ([^\s]+) . ([^<]*)</h2>', data)[0]
        sounds += [[id_, sound, feature]]

with open(pkg_path('sources', 'apics.tsv'), 'w') as f:
    f.write('BIPA\tGRAPHEME\tURL\tFEATURES\n')
    for line in sounds:
        # NOTE(review): the BIPA column is written empty here — presumably
        # filled in by a later mapping step; confirm downstream consumers.
        f.write(
            '\t{0[1]}\thttp://apics-online.info/parameters/{0[0]}\t{0[2]}\n'.
            format(line))
def iter_transcriptiondata(self):
    """Yield a ``TranscriptionData`` instance for every file in the package's
    ``transcriptiondata`` directory, ordered by file name."""
    data_files = pkg_path('transcriptiondata').iterdir()
    for path in sorted(data_files, key=lambda item: item.name):
        yield TranscriptionData(path.stem)
def __init__(self, id_):
    """
    :param id_: The name of a transcription system; a directory of that name
        must exist under the package's `transcriptionsystems` path.
    """
    if hasattr(self, 'features'):
        # Only initialize, if this is really a new instance!
        return
    assert id_
    system = pkg_path('transcriptionsystems', id_)
    if not (system.exists() and system.is_dir()):
        raise ValueError('unknown system: {0}'.format(id_))

    # the CSVW table group describing the system's tables and metadata
    self.system = TableGroup.from_file(
        pkg_path('transcriptionsystems',
                 'transcription-system-metadata.json'))
    self.system._fname = system / 'metadata.json'

    # non-alias graphemes keyed by sound type and feature value / featureset
    self.features = {'consonant': {}, 'vowel': {}, 'tone': {}}
    # dictionary for feature values, checks when writing elements from
    # write_order to make sure no output is doubled
    self._feature_values = {}

    # load the general features
    features = jsonlib.load(
        pkg_path('transcriptionsystems', 'features.json'))

    self.diacritics = dict(
        consonant={}, vowel={}, click={}, diphthong={}, tone={}, cluster={})
    for dia in itertable(self.system.tabledict['diacritics.tsv']):
        if not dia['alias'] and not dia['typography']:
            self.features[dia['type']][dia['value']] = dia['grapheme']
        # assign feature values to the dictionary
        self._feature_values[dia['value']] = dia['feature']
        self.diacritics[dia['type']][dia['grapheme']] = dia['value']

    self.sound_classes = {}
    self.columns = {}  # the basic column structure, to allow for rendering
    self.sounds = {}  # Sounds by grapheme
    self._covered = {}
    # check for unresolved aliased sounds
    aliases = []
    for cls in [Consonant, Vowel, Tone, Marker]:  # noqa: F405
        type_ = cls.__name__.lower()
        self.sound_classes[type_] = cls
        # store information on column structure to allow for rendering of a
        # sound in this form, which will make it easier to insert it when
        # finding generated sounds
        self.columns[type_] = [
            c['name'].lower() for c in self.system.tabledict['{0}s.tsv'.format(
                type_)].asdict()['tableSchema']['columns']
        ]
        for l, item in enumerate(
                itertable(
                    self.system.tabledict['{0}s.tsv'.format(type_)])):
            # l + 2 below converts the 0-based data index to the 1-based
            # file line (header row accounts for the extra +1)
            if item['grapheme'] in self.sounds:
                raise ValueError(
                    'duplicate grapheme in {0}:{1}: {2}'.format(
                        type_ + 's.tsv', l + 2, item['grapheme']))
            sound = cls(ts=self, **item)
            # make sure this does not take too long
            for key, value in item.items():
                if key not in {'grapheme', 'note', 'alias'} and \
                        value and value not in self._feature_values:
                    self._feature_values[value] = key
                    if type_ != 'marker' and value not in features[type_][
                            key]:
                        raise ValueError(
                            "Unrecognized features ({0}: {1}, line {2}))".
                            format(key, value, l + 2))
            self.sounds[item['grapheme']] = sound
            if not sound.alias:
                if sound.featureset in self.features:
                    raise ValueError(
                        'duplicate features in {0}:{1}: {2}'.format(
                            type_ + 's.tsv', l + 2, sound.name))
                self.features[sound.featureset] = sound
            else:
                # remember aliases; each featureset must resolve to some
                # non-alias sound once all rows are read (checked below)
                aliases += [(l, sound.type, sound.featureset)]
    # check for consistency of aliases: if an alias has no counterpart, it
    # is orphaned and needs to be deleted or given an accepted non-aliased
    # sound
    if [x for x in aliases if x[2] not in self.features]:  # pragma: no cover
        error = ', '.join(
            text_type(x[0] + 2) + '/' + text_type(x[1]) for x in aliases
            if x[2] not in self.features)
        raise ValueError('Orphaned aliases in line(s) {0}'.format(error))

    # basic regular expression, used to match the basic sounds in the system.
    self._regex = None
    self._update_regex()

    # normalization data
    self._normalize = {
        norm(r['source']): norm(r['target'])
        for r in itertable(self.system.tabledict['normalize.tsv'])
    }