def test_ipa2tokens():
    seq = 'ˈtʲʰoɔːix_tərp͡f¹¹'
    assert len(ipa2tokens(seq)) != len(list(seq))

    seq = 'ʰto͡i'
    assert len(ipa2tokens(seq)) == 2

    seq = 'th o x t a'
    assert len(ipa2tokens(seq)) == len(seq.split(' '))

    seq = '# b l a #'
    assert len(ipa2tokens(seq)) == len(seq.split(' ')) - 2

    # now check with all possible data we have so far, but only on cases where
    # tokenization doesn't require the merge_vowels=False flag
    tokens = csv2list(test_data('test_tokenization.tsv'))
    for a, b in tokens:
        tks = ' '.join(ipa2tokens(a))
        # compare against the expected tokenization given in the test data
        assert tks == b

    # now test on a smaller set with unmerged vowels
    tokens = csv2list(test_data('test_tokenization_mv.tsv'))
    for a, b in tokens:
        tks = ' '.join(ipa2tokens(a, merge_vowels=False, merge_geminates=False))
        assert tks == b

    tokens = csv2list(test_data('test_tokenization_nasals.tsv'))
    for a, b in tokens:
        tks = ' '.join(
            ipa2tokens(a, merge_vowels=True, merge_geminates=True,
                       expand_nasals=True, semi_diacritics='h'))
        assert tks == b
def test_ipa2tokens():
    seq = 'ˈtʲʰoɔːix_tərp͡f¹¹'
    assert len(ipa2tokens(seq)) != len(list(seq))

    seq = 'ʰto͡i'
    assert len(ipa2tokens(seq)) == 2

    seq = 'th o x t a'
    assert len(ipa2tokens(seq)) == len(seq.split(' '))

    seq = '# b l a #'
    assert len(ipa2tokens(seq)) == len(seq.split(' ')) - 2

    # now check with all possible data we have so far
    tokens = csv2list(test_data('test_tokenization.csv'))
    for a, b in tokens:
        tks1 = ' '.join(ipa2tokens(a))
        tks2 = ' '.join(ipa2tokens(a, merge_vowels=False))
        # we check for two variants, since we don't know whether vowels are
        # merged or not in the test data
        assert tks1 == b or tks2 == b
def test_ipa2tokens():
    seq = 'ˈtʲʰoɔːix_tərp͡f¹¹'
    assert len(ipa2tokens(seq)) != len(list(seq))

    seq = 'ʰto͡i'
    assert len(ipa2tokens(seq)) == 2

    seq = 'th o x t a'
    assert len(ipa2tokens(seq)) == len(seq.split(' '))

    seq = '# b l a #'
    assert len(ipa2tokens(seq)) == len(seq.split(' ')) - 2

    # now check with all possible data we have so far, but only on cases where
    # tokenization doesn't require the merge_vowels=False flag
    tokens = csv2list(test_data('test_tokenization.tsv'))
    for a, b in tokens:
        tks = ' '.join(ipa2tokens(a))
        # compare against the expected tokenization given in the test data
        assert tks == b

    # now test on a smaller set with unmerged vowels
    tokens = csv2list(test_data('test_tokenization_mv.tsv'))
    for a, b in tokens:
        tks = ' '.join(ipa2tokens(a, merge_vowels=False, merge_geminates=False))
        assert tks == b

    tokens = csv2list(test_data('test_tokenization_nasals.tsv'))
    for a, b in tokens:
        tks = ' '.join(
            ipa2tokens(a, merge_vowels=True, merge_geminates=True,
                       expand_nasals=True, semi_diacritics='h'))
        print(tks)
        assert tks == b
def get_unihan():
    _unihan = lingpy.csv2list(_path('Unihan_Readings.txt'))
    unihan = defaultdict(dict)
    for line in _unihan:
        key = eval('"""' + line[0] + '"""')
        if line[1] == 'kHanyuPinyin':
            unihan[key]['pinyin'] = line[2].split(':')[1]
        else:
            unihan[key][line[1][1:].lower()] = line[2]
    return unihan
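A minimal usage sketch (assuming the Unihan_Readings.txt file and the helpers above are in place); it just iterates over the per-character reading dictionaries that get_unihan() returns:

# purely illustrative: print a handful of entries together with the
# readings collected for them from Unihan_Readings.txt
unihan = get_unihan()
for char, readings in list(unihan.items())[:5]:
    print(char, readings)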
def get_transformer(profile, exception=None):
    profile = lp.csv2list(cddb_path('profiles', profile), strip_lines=False)
    for i, line in enumerate(profile):
        profile[i] = [unicodedata.normalize('NFD', clpa.normalize(x))
                      for x in line]
    tokenizer = Tokenizer(profile, errors_replace=lambda x: "«{0}»".format(x))
    return lambda x, y: unicodedata.normalize(
        'NFC',
        tokenizer.transform(clpa.normalize(x), column=y, separator=' + '))
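A minimal usage sketch of the returned callable: it takes a form and the name of the profile column to map it to. The profile file name, the input form, and the 'IPA' column below are assumptions for illustration, not names taken from the snippet above:

# hypothetical profile name and column; the result joins the mapped
# segments with ' + ' as separator, e.g. 'p + au + 35'
transform = get_transformer('Beijing.tsv')
print(transform('pau35', 'IPA'))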
def get_ids():
    _ids = lingpy.csv2list(_path('ids.txt'))
    ids = {}
    for line in _ids:
        char = line[1]
        motivations = line[2:]
        for motivation in motivations:
            if '[' in motivation:
                motivation = motivation[:motivation.index('[')]
            ids[motivation] = char
    return ids
def inventories(ds):
    data = csv2list(ds.raw('inv.tsv'))
    header = data[0]
    invs = {l: [] for l in ds.languages}
    for i, line in enumerate(data[1:]):
        stype, sis, ipa, struc = line[1:5]
        if len(struc.split()) != len(ipa.split()):
            print(i + 2, 'warn', struc, ' | ', ipa)
        for l, n in zip(header[5:], line[5:]):
            if n:
                # a plain 'X' marks presence without a note
                note = '' if n == 'X' else n
                invs[l] += [[sis, ipa, struc, stype, note]]
    ds.write_inventories(invs)
def prepare(dataset):
    data = lingpy.csv2list(dataset.get_path('raw', 'data-starostin.tsv'),
                           strip_lines=False)
    header = [h.lower() for h in data[0]]
    out = {}
    idx = 1
    for line in data[1:]:
        char = line[0]
        coc = line[2]
        bijiang = line[1]
        note = line[3]
        dali = line[4]
        doc_url = line[5]
        lhc = line[7]
        gloss = line[8]
        jianchuan = line[12]
        kg = line[14]
        mch = line[16]
        pinyin = line[18]
        rad = line[20]
        shijing = line[21]
        if coc.strip():
            out[idx] = [
                char, pinyin, 'Old_Chinese', 'Classical Old Chinese', coc,
                rad, kg[:4], kg, gloss
            ]
            idx += 1
        if lhc.strip():
            out[idx] = [
                char, pinyin, 'Late_Han_Chinese', 'Eastern Han Chinese', lhc,
                rad, kg[:4], kg, gloss
            ]
            idx += 1
        if mch.strip():
            out[idx] = [
                char, pinyin, 'Middle_Chinese', 'Middle Chinese', mch,
                rad, kg[:4], kg, gloss
            ]
            idx += 1
    out[0] = [
        'character', 'pinyin', 'doculect', 'doculect_in_source', 'reading',
        'semantic_class', 'phonetic_class', 'karlgren_id', 'gloss'
    ]
    dataset.write_wordlist(lingpy.Wordlist(out, row='character'), 'characters')
def cmd_makecldf(self, args):
    wl = lingpy.Wordlist(self.raw_dir.joinpath("chinese.tsv").as_posix())
    maxcogid = 0
    args.writer.add_sources()
    args.writer.add_languages(id_factory=lambda l: l["Name"])
    args.writer.add_concepts(
        id_factory=lambda c: slug(c.label, lowercase=False))

    # store list of proto-form to cognate set
    p2c = {}
    for k in wl:
        for row in args.writer.add_lexemes(
                Language_ID=wl[k, "doculect"],
                Parameter_ID=slug(wl[k, "concept"], lowercase=False),
                Value=wl[k, "ipa"],
                Source="Hamed2006",
                Cognacy=wl[k, "COGID"],
        ):
            args.writer.add_cognate(
                lexeme=row,
                Cognateset_ID=wl[k, "cogid"],
                Source=["Hamed2006", "List2015"])
        maxcogid = max([maxcogid, int(wl[k, "cogid"])])
        p2c[wl[k, "concept"], wl[k, "proto"]] = wl[k, "cogid"]

    idx = max([k for k in wl]) + 1
    for line in lingpy.csv2list(
            self.raw_dir.joinpath("old_chinese.csv").as_posix()):
        for val in line[1].split(", "):
            cogid = p2c.get((line[0], val))
            if not cogid:
                maxcogid += 1
                cogid = p2c[line[0], val] = maxcogid
            for row in args.writer.add_lexemes(
                    Language_ID="OldChinese",
                    Parameter_ID=slug(line[0], lowercase=False),
                    Value=val,
                    Source="Hamed2006",
                    Cognacy=p2c.get(val, val),
            ):
                args.writer.add_cognate(
                    lexeme=row,
                    Cognateset_ID=cogid,
                    Source=["Hamed2006", "List2015"])
            idx += 1
def write_map(varieties, outfile):
    languages = lp.csv2list(varieties)
    colors = [
        "#a6cee3", "#1f78b4", "#b2df8a", "#33a02c", "#fb9a99", "#e31a1c",
        "#fdbf6f", "#ff7f00", "#cab2d6", "#6a3d9a", "#040404", "#F6E3CE",
        "#81F79F", "#8A0808", "#FA58F4", "#0489B1", "#088A08"
    ]
    points = []
    header = [x.strip() for x in languages[0]]
    nidx = header.index('NAME')
    latidx = header.index('LATITUDE')
    lonidx = header.index('LONGITUDE')
    pinidx = header.index('PINYIN')
    hanidx = header.index('HANZI')
    groupidx = header.index('SUBGROUP')
    famidx = header.index('FAMILY')

    groups = sorted(set([line[groupidx] for line in languages[1:]]))
    for line in languages[1:]:
        name = line[nidx]
        pinyin = line[pinidx]
        hanzi = line[hanidx]
        lat, lon = line[latidx], line[lonidx]
        group = line[groupidx]
        family = line[famidx]
        if lat.strip() and lat != '?':
            lat, lon = float(lat), float(lon)
            if lat > 400 or lon > 400:
                raise ValueError("Coords for {0} are wrong.".format(name))
            point = geojson.Point((lon, lat))
            feature = geojson.Feature(
                geometry=point,
                properties={
                    "Family": family,
                    "Variety": name,
                    "Pinyin": pinyin,
                    "Chinese": hanzi,
                    "Group": group,
                    "marker-color": colors[groups.index(group)]
                })
            points += [feature]
    with open(outfile, 'w') as f:
        f.write(json.dumps(geojson.FeatureCollection(points)))
def cldf(dataset, concepticon, **kw):
    wl = lp.Wordlist(dataset.raw.joinpath(DSET).as_posix())
    gcode = {x['NAME']: x['GLOTTOCODE'] for x in dataset.languages}
    ccode = {x.english: x.concepticon_id
             for x in dataset.conceptlist.concepts.values()}
    src = getEvoBibAsSource(SOURCE)
    src2 = getEvoBibAsSource('List2015d')

    with CldfDataset((
            'ID',
            'Language_ID',
            'Language_name',
            'Language_iso',
            'Parameter_ID',
            'Parameter_name',
            'Value',
            'Source',
            'Cognacy',
    ), dataset) as ds:
        ds.sources.add(src, src2)
        # store list of proto-form to cognate set
        p2c = {}
        for k in wl:
            ds.add_row([
                '{0}-{1}'.format(SOURCE, k),
                gcode[wl[k, 'doculect']],
                wl[k, 'doculect'],
                '',
                ccode[wl[k, 'concept']],
                wl[k, 'concept'],
                wl[k, 'ipa'],
                SOURCE,
                wl[k, 'COGID'],
            ])
            dataset.cognates += [[
                '{0}-{1}'.format(SOURCE, k),
                ds.name,
                wl[k, 'ipa'],
                '-'.join([slug(wl[k, 'concept']), str(wl[k, 'cogid'])]),
                '', 'expert', SOURCE, '', '', ''
            ]]
            p2c[wl[k, 'proto']] = wl[k, 'cogid']

        idx = max([k for k in wl]) + 1
        for line in lp.csv2list(
                dataset.raw.joinpath('old_chinese.csv').as_posix()):
            for val in line[1].split(', '):
                ds.add_row((
                    '{0}-{1}'.format(SOURCE, idx),
                    'sini1245',
                    'Old Chinese',
                    '',
                    ccode[line[0]],
                    line[0],
                    val,
                    SOURCE,
                    p2c.get(val, val),
                ))
                dataset.cognates += [[
                    '{0}-{1}'.format(SOURCE, idx),
                    ds.name,
                    val,
                    '-'.join([slug(line[0]), text_type(p2c.get(val, val))]),
                    '', 'expert', SOURCE, '', '', ''
                ]]
                idx += 1
def cmd_install(self, **kw):
    data = json.load(open(self.raw.posix('data.json')))
    wl = lingpy.Wordlist(self.raw.posix('sino-tibetan-raw.tsv'))
    profile = {
        l[0]: l[1] for l in lingpy.csv2list(self.raw.posix('profile.tsv'))
    }
    for idx, tokens in pb(wl.iter_rows('tokens'), desc='tokenize'):
        tks = []
        for token in tokens:
            tks += profile.get(
                token,
                profile.get(
                    token.split('/')[1] if '/' in token else token,
                    token)).split(' ')
        wl[idx, 'tokens'] = [x.strip() for x in tks
                             if x != 'NULL' and x.strip()]

    with self.cldf as ds:
        ds.add_sources(*self.raw.read_bib())
        ds.add_languages()
        for c in self.conceptlist.concepts.values():
            ds.add_concept(
                ID=c.concepticon_id,
                TBL_ID=c.attributes['huang_1992_1820'],
                Name=c.english,
                Coverage=c.attributes['coverage'],
                Concepticon_ID=c.concepticon_id)
        concept2id = {
            c.english: c.concepticon_id
            for c in self.conceptlist.concepts.values()
        }
        source_dict, langs_dict = {}, {}
        concept_dict = {}
        for l in self.languages:
            source_dict[l['Name']] = l['Source']
            langs_dict[l['Name']] = l['ID']
        wl.output(
            'tsv',
            filename=self.raw.posix('sino-tibetan-cleaned'),
            subset=True,
            rows={"ID": "not in " + str(data['blacklist'])})
        for k in pb(wl, desc='wl-to-cldf'):
            if wl[k, 'tokens']:
                row = ds.add_form_with_segments(
                    Language_ID=langs_dict.get(data['taxa'].get(
                        wl[k, 'doculect'], wl[k, 'doculect'])),
                    Parameter_ID=concept2id[wl[k, 'concept']],
                    Value=wl[k, 'entry_in_source'].strip()
                        or ''.join(wl[k, 'tokens']) or wl[k, 'ipa'],
                    Form=wl[k, 'ipa'] or wl[k, 'entry_in_source']
                        or ''.join(wl[k, 'tokens']),
                    Segments=wl[k, 'tokens'],
                    Source=source_dict.get(data['taxa'].get(
                        wl[k, 'doculect'], wl[k, 'doculect'])).split(','),
                    Comment=wl[k, 'note'],
                    Cognacy=wl[k, 'cogid'],
                    Loan=True if wl[k, 'borrowing'].strip() else False)
                cid = slug(wl[k, 'concept']) + '-' + '{0}'.format(wl[k, 'cogid'])
                ds.add_cognate(
                    lexeme=row,
                    Cognateset_ID=cid,
                    Source='Sagart2018',
                    Alignment='',
                    Alignment_Source='')
def download(dataset):
    print(HEADER)
    chars = load_characters()
    charset = []
    for k in chars:
        if chars[k, 'source'] == 'Baxter1992' or chars[k, 'source'] == 'Baxter2014':
            charset += [chars[k, 'character']]
    print('[Loaded Characters]')
    for char in charset:
        _tmp = [
            x[0] for x in lingpy.csv2list(
                dataset.get_path('raw', 'data-starostin.tsv'),
                strip_lines=True) if x
        ]
        if not _tmp:
            out = open(dataset.get_path('raw', 'data-starostin.tsv'), 'w')
            out.write('CHARACTER' + '\t' + '\t'.join(HEADER) + '\n')
            out.close()
        _tmp += [
            x[0] for x in lingpy.csv2list(
                dataset.get_path('raw', 'data-missing.tsv'))
        ]
        if char not in _tmp:
            new_url = URL + parse.quote(char)
            print('[LOADING] ' + char + ' ' + new_url)
            try:
                req = request.urlopen(new_url)
                data = req.read().decode('utf-8')
                found = False
                tmp = {}
                for f in FIELDS:
                    d = re.findall(
                        '<span class="fld">' + f +
                        ':</span>.*?<span class="unicode">(.*?)</span>',
                        data, re.DOTALL)
                    print(f, d)
                    if d:
                        tmp[f] = d[0]
                        found = True
                    else:
                        tmp[f] = ''
                for l in LINKS:
                    d = re.findall(
                        '<span class="fld">' + l +
                        ':</span>.*?<a href="(.*?)"',
                        data, re.DOTALL)
                    print(l, d)
                    if d:
                        tmp[l] = d[0]
                    else:
                        tmp[l] = ''
                if found:
                    print('Found character {0} reading: {1}'.format(
                        char, tmp['Modern .Beijing. reading']))
                    out = open(dataset.get_path('raw', 'data-starostin.tsv'), 'a')
                    out.write(char + '\t' + '\t'.join(
                        [tmp[h].strip().replace('\t', '') for h in HEADER]) + '\n')
                    out.close()
                else:
                    print('Problem, ', len(data))
                    out = open(dataset.get_path('raw', 'data-missing.tsv'), 'a')
                    out.write(char + '\n')
                    out.close()
            except urllib.error.HTTPError:
                print('[ERROR IN LOADING URL]')
# email : [email protected]
# created : 2014-03-11 15:50
# modified : 2014-03-11 15:50
"""
<++>
"""

__author__ = "Johann-Mattis List"
__date__ = "2014-03-11"

from lingpy import csv2list
import sqlite3

vals = csv2list('msa_taxa.csv')

conn = sqlite3.connect('../website_new/bdhl.de/data/data.sqlite3')
cursor = conn.cursor()
try:
    cursor.execute('drop table alignments;')
except:
    pass
cursor.execute(
    'create table alignments(id int, file text, dataset text, sequence text, '
    'pid int, seqnum int, uniques int, taxa text);')
for line in vals:
    cursor.execute(
        'insert into alignments values(?,?,?,?,?,?,?,?);',
        tuple(line))
relations = dict(
    broader='narrower',
    similar='similar',
    sameas='sameas',
    resultof='resultsin',
    produces='producedby',
    usedfor='requires',
    consistsof='',
    classof='instanceof',
    intransitiveof='transitiveof',
    baseof='hasform')
for k, v in list(relations.items()):
    if v and v != k:
        relations[v] = k

# load the concepticon to get the meta-data
_C = lingpy.csv2list('../concepticondata/concepticon.tsv')
C = {}
for line in _C[1:]:
    tmp = dict(zip([x.lower() for x in _C[0]], line))
    C[line[0]] = tmp

# G is assumed to be a directed graph (e.g. a networkx DiGraph) created
# earlier in the script
with open('../concepticondata/conceptrelations.tsv') as f:
    for line in f.readlines()[1:]:
        print(line.replace('\t', '-x-'))
        a, _a, b, c, _c = [x.strip() for x in line.split('\t')]
        if a and b and relations[b]:
            G.add_edge(a, c, relation=b)
            G.add_edge(c, a, relation=relations[b])
"ipa": 'e' }, 'ei': { "ipa": 'ei' }, 'k': { "ipa": 'k' }, 'p': { "ipa": 'p' } } out = segmentize('khaetphaeit', segments, debug=False, column='ipa') print(' '.join(out)) segments = { k[0]: { 'ipa': k[1], 'structure': k[2] } for k in csv2list(_path('chinese.tsv')) } for word in [ 'khap55', 'khuang5', 'kai', 'kiAng', 'thang', 'pfhang35', 'pfing44fu24', 'mao35tse35doŋ51' ]: print(' '.join(segmentize(word, segments, column='ipa'))) print(' '.join(segmentize(word, segments, column='structure'))) print(' ')