def test_syllabify():
    seq1 = "t i a o ¹ b u ² d a o"
    seq2 = "jabloko"
    seq3 = "jabəlko"
    seq4 = "j a b u - k o"

    assert_raises(ValueError, syllabify, seq1, output="test")
    assert syllabify(seq1, output="flat").count(rc('morpheme_separator')) == 2
    assert syllabify(seq2, output="breakpoints")[0] == (0, 2)
    assert syllabify(seq3, output="nested")[1] == list("bəl")
    assert syllabify(seq4, output="nested")[1] == list("bu-")
def test_syllabify(self):
    seq1 = "t i a o ¹ b u ² d a o"
    seq2 = "jabloko"
    seq3 = "jabəlko"
    seq4 = "j a b u - k o"
    seq5 = "ma⁵io"

    assert_raises(ValueError, syllabify, seq1, output="test")
    assert syllabify(seq1, output="flat").count(rc('morpheme_separator')) == 2
    assert syllabify(seq2, output="breakpoints")[0] == (0, 2)
    assert syllabify(seq3, output="nested")[1] == list("bəl")
    assert syllabify(seq4, output="nested")[1] == list("bu-")
    assert ''.join(syllabify(seq5, sep="+")).split('+')[-1] == 'io'
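A minimal sketch of the three `output` modes exercised by the tests above, run on the same space-separated sequences; the import path for syllabify is an assumption (lingpy ships it in lingpy.sequence.sound_classes).

from lingpy.sequence.sound_classes import syllabify

word = "t i a o ¹ b u ² d a o"
print(syllabify(word, output="flat"))              # flat sequence with separator symbols inserted
print(syllabify(word, output="nested"))            # one sub-list per syllable
print(syllabify("jabloko", output="breakpoints"))  # (start, end) index pairs per syllable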
def prepare(ds):
    rawdata = csv2list(ds.raw('Burling1967.csv'), strip_lines=False)
    concepts = []
    header = rawdata[0]
    crect = {}
    D = {
        0: [
            'doculect', 'concept', 'doculect_is', 'concept_number_is',
            'value_is', 'value', 'notes', 'notes_mm', 'cogid', 'segments'
        ]
    }
    idx = 1
    cogid = 1
    for i, line in enumerate(rawdata[1:]):
        tmp = dict(zip(header, line))
        concepts += [(tmp['number'], tmp['meaning'])]
        for lid, lang in sorted(ds.lid2lang.items()):
            cell = tmp[lid]
            if cell.strip() != '-' and cell.strip():
                ccell = crect.get(cell, cell)
                syls = ' + '.join(
                    [' '.join(x) for x in syllabify(ccell, output='nested')])
                D[idx] = [
                    lang, tmp['meaning'], lid, tmp['number'], cell, ccell,
                    tmp['notes'], tmp['MM notes'], cogid, syls
                ]
                idx += 1
        cogid += 1
    with open(ds.raw('concepts.tsv'), 'w') as f:
        f.write('NUMBER\tENGLISH\n')
        for a, b in concepts:
            f.write(a + '\t' + b + '\n')
    ds.write_wordlist(Wordlist(D))
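The dictionary `D` assembled by `prepare` follows lingpy's wordlist input convention: key 0 holds the column names and every other integer key holds one row. A stripped-down sketch with invented values (not taken from Burling1967.csv):

from lingpy import Wordlist
from lingpy.sequence.sound_classes import syllabify

D = {
    0: ['doculect', 'concept', 'value', 'segments'],
    1: ['Garo', 'hand', 'dʑak',
        ' + '.join(' '.join(s) for s in syllabify('dʑ a k', output='nested'))],
}
wl = Wordlist(D)
print(wl.height, wl.width)  # number of concepts, number of doculects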
def lexeme2syllables(lexeme):
    # Process the lexeme; the syllabification algorithm is stricter,
    # so we cannot have the option of keeping slashes, marks, etc
    phonemes = [
        phoneme.split("/")[1] if "/" in phoneme else phoneme
        for phoneme in lexeme.split()
    ]
    phonemes = [phoneme for phoneme in phonemes if phoneme not in ["+", "_"]]
    clean_lexeme = " ".join(phonemes)

    # Get the syllables
    syllables = syllabify(clean_lexeme, output="nested")

    return syllables
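A hypothetical call (the lexeme string is invented) illustrating the cleaning step: for slash-annotated segments only the part after the slash is kept, and "+" / "_" markers are dropped before syllabification.

# "ʦ/ts" keeps "ts", the "+" morpheme marker is removed, then the cleaned
# string is syllabified into nested lists.
print(lexeme2syllables("ʦ/ts a k + t a"))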
    'Burmese (Written)': 'Written_Burmese',
    'Burmese (Rangoon)': 'Rangoon',
    'Achang (Xiandao)': 'Xiandao',
    'Langsu (Luxi)': 'Maru'
}
D = {}
idx = 1
for line in tbl[1:]:
    ridx, reflex, gloss = line[:3]
    if line[-1].strip():
        tbl = str(int(line[-1].split('.')[0]))
        lang = line[6].strip()
        if lang in doculects and tbl in new_concepts:
            doc = doculects[lang]
            tokens = syllabify(ipa2tokens(
                reflex.replace(' ', '_'), merge_vowels=False,
                expand_nasals=True, semi_diacritics='shɕʑʃʒʐʂ'))
            ipa = ''.join(tokens)
            alm = []
            for t in tokens:
                alm += [sounds.get((t, doc), '_' + t if t in '()_' else t)]
            alm = ' '.join(alm)
            tokens = ' '.join(tokens)
            concept = new_concepts[tbl][0]
            if reflex.strip() == '*':
                pass
            else:
                D[idx] = [ridx, reflex, gloss, ipa,
for i, w in enumerate(wordlists):
    wl = Wordlist(w)
    colors = {}
    tmp = defaultdict(int)
    sylen = []
    clen = []
    for k in wl:
        dolgos = tokens2class(wl[k, 'tokens'], 'dolgo')
        for idx, t in zip(dolgos, wl[k, 'tokens']):
            if idx not in '+':
                tmp[idx] += 1
                colors[idx] = token2class(t, color)
                all_cols += [(k, colors[idx])]
                all_sounds[idx] += 1
                all_colors[idx] = colors[idx]
        sylen += [len(syllabify(' '.join(wl[k, 'tokens']), output='nested'))]
        clen += [len([x for x in dolgos if x not in '1V'])]
    print(w, sum(sylen) / len(sylen), sum(clen) / len(clen))
    ax = plt.subplot(gs[i])
    labels = [x for x, y in sorted(tmp.items(), key=lambda x: x[0])]
    ax.pie([y for x, y in sorted(tmp.items(), key=lambda x: x[0])],
           colors=[y for x, y in sorted(colors.items(), key=lambda x: x[0])],
           radius=0.95, frame=True, shadow=True)
    ax.set_autoscale_on(False)
    plt.ylim(-1, 1)
    plt.xlim(-1, 1)
    plt.title(w.split('_')[2].split('-')[0])
    plt.axis('off')
    ax.set_aspect('equal')
print('plotting')
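A small stand-alone sketch of the two per-word statistics gathered above: the number of syllables (via nested syllabification) and the number of Dolgopolsky classes that are neither vowels ('V') nor tones ('1'). The token string is invented; the import paths are assumptions based on where lingpy defines these helpers.

from lingpy.sequence.sound_classes import syllabify, tokens2class

tokens = "tʰ a ŋ ²¹ m a"
dolgos = tokens2class(tokens.split(), 'dolgo')
n_syllables = len(syllabify(tokens, output='nested'))
n_consonants = len([x for x in dolgos if x not in '1V'])
print(n_syllables, n_consonants)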
    'Burmese (Written)': 'Written_Burmese',
    'Burmese (Rangoon)': 'Rangoon',
    'Achang (Xiandao)': 'Xiandao',
    'Langsu (Luxi)': 'Maru'
}
D = {}
idx = 1
for line in tbl[1:]:
    ridx, reflex, gloss = line[:3]
    if line[-1].strip():
        tbl = str(int(line[-1].split('.')[0]))
        lang = line[6].strip()
        if lang in doculects and tbl in new_concepts:
            doc = doculects[lang]
            tokens = syllabify(ipa2tokens(
                reflex.replace(' ', '_'), merge_vowels=False,
                expand_nasals=True, semi_diacritics='shɕʑʃʒʐʂ'))
            ipa = ''.join(tokens)
            alm = []
            for t in tokens:
                alm += [sounds.get((t, doc), '†' + t if t in '()_' else t)]
            alm = ' '.join(alm)
            tokens = ' '.join(tokens)
            concept = new_concepts[tbl][0]
            if reflex.strip() == '*':
                pass
            else:
                D[idx] = [ridx, reflex, gloss, ipa,
def cmd_makecldf(self, args):
    with self.cldf_writer(args) as writer:
        # load data as wordlists, as we need to bring the already segmented
        # entries in line with clts
        wl = lingpy.Wordlist(
            self.raw_dir.joinpath("words.tsv").as_posix(),
            conf=self.raw_dir.joinpath("wordlist.rc").as_posix(),
        )
        wl.add_entries(
            "new_segments",
            "segments",
            lambda x: syllabify(
                self.tokenizer({}, "^" + "".join(x) + "$", column="IPA"),
                cldf=True),
        )
        writer.add_sources()
        # note: no way to easily replace this with the direct call to `add_concepts`
        # as we add the Chinese gloss via concept.attributes
        concept_lookup = {}
        for concept in self.conceptlists[0].concepts.values():
            idx = concept.id.split("-")[-1] + "_" + slug(concept.gloss)
            writer.add_concept(
                ID=idx,
                Name=concept.gloss,
                Chinese_Gloss=concept.attributes["chinese"],
                Number=concept.number,
                Concepticon_ID=concept.concepticon_id,
                Concepticon_Gloss=concept.concepticon_gloss,
            )
            concept_lookup[concept.number] = idx
        language_lookup = writer.add_languages(lookup_factory="Name")
        for k in pylexibank.progressbar(wl, desc="wl-to-cldf", total=len(wl)):
            if wl[k, "value"]:
                form = self.form_spec.clean(form=wl[k, "value"], item=None)
                writer.add_form_with_segments(
                    Language_ID=language_lookup[wl[k, "doculect"]],
                    Parameter_ID=concept_lookup[wl[k, "beida_id"]],
                    Value=wl[k, "value"],
                    Form=form,
                    Segments=wl[k, "new_segments"],
                    Source="Cihui",
                    Benzi=wl[k, "benzi"],
                )
        # We explicitly remove the ISO code column since the languages in
        # this dataset do not have an ISO code.
        writer.cldf["LanguageTable"].tableSchema.columns = [
            col for col in writer.cldf["LanguageTable"].tableSchema.columns
            if col.name != "ISO639P3code"
        ]
        language_table = writer.cldf["LanguageTable"]

    with self.cldf_writer(args, cldf_spec="structure", clean=False) as writer:
        writer.cldf.add_component(language_table)
        writer.objects["LanguageTable"] = self.languages
        inventories = self.raw_dir.read_csv(
            "inventories.tsv", normalize="NFC", delimiter="\t", dicts=True)
        writer.cldf.add_columns(
            "ParameterTable",
            {"name": "CLTS_BIPA", "datatype": "string"},
            {"name": "CLTS_Name", "datatype": "string"},
            {"name": "Lexibank_BIPA", "datatype": "string"},
            {"name": "Prosody", "datatype": "string"},
        )
        writer.cldf.add_columns(
            "ValueTable",
            {"name": "Context", "datatype": "string"})
        clts = CLTS(args.clts.dir)
        bipa = clts.transcriptionsystem_dict["bipa"]
        td = clts.transcriptiondata_dict["beidasinitic"]
        pids, visited = {}, set()
        for row in pylexibank.progressbar(inventories, desc="inventories"):
            if not row["Value"].startswith("(") and row["Value"] != "Ø":
                for s1, s2, p in zip(
                        row["Value"].split(),
                        row["Lexibank"].split(),
                        row["Prosody"].split()):
                    s1 = normalize("NFD", s1)
                    pidx = "-".join(
                        [str(hex(ord(s)))[2:].rjust(4, "0") for s in s1]) + "_" + p
                    if s1 not in td.grapheme_map:
                        args.log.warn("missing sound {0} / {1}".format(
                            s1, " ".join([str(hex(ord(x))) for x in s1])))
                    else:
                        sound = bipa[td.grapheme_map[s1]]
                        sound_name = (
                            sound.name
                            if sound.type not in ["unknown", "marker"]
                            else "")
                        if pidx not in visited:
                            visited.add(pidx)
                            writer.objects["ParameterTable"].append({
                                "ID": pidx,
                                "Name": s1,
                                "Description": sound_name,
                                "CLTS_BIPA": td.grapheme_map[s1],
                                "CLTS_Name": sound_name,
                                "Lexibank_BIPA": s2,
                                "Prosody": p,
                            })
                        writer.objects["ValueTable"].append({
                            "ID": row["Language_ID"] + "_" + pidx,
                            "Language_ID": row["Language_ID"],
                            "Parameter_ID": pidx,
                            "Value": s1,
                            "Context": p,
                            "Source": ["Cihui"],
                        })
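Worked example of the hex-based parameter ID built in the inventory loop above: each codepoint of the NFD-normalized grapheme becomes a zero-padded hexadecimal string, the pieces are joined with "-", and the prosodic context is appended. The grapheme "pʰ" with context "i" is an invented illustration.

from unicodedata import normalize

s1, p = normalize("NFD", "pʰ"), "i"
pidx = "-".join([str(hex(ord(s)))[2:].rjust(4, "0") for s in s1]) + "_" + p
print(pidx)  # 0070-02b0_i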