Example #1
# assumed imports for running this test standalone: lingpy's syllabifier,
# its settings handler (rc), and nose's assert_raises
from lingpy.sequence.syllabifier import syllabify
from lingpy.settings import rc
from nose.tools import assert_raises


def test_syllabify():
    seq1 = "t i a o ¹ b u ² d a o"
    seq2 = "jabloko"
    seq3 = "jabəlko"
    seq4 = "j a b u - k o"

    assert_raises(ValueError, syllabify, seq1, output="test")

    assert syllabify(seq1, output="flat").count(rc('morpheme_separator')) == 2
    assert syllabify(seq2, output="breakpoints")[0] == (0,2)
    assert syllabify(seq3, output="nested")[1] == list("bəl")
    assert syllabify(seq4, output="nested")[1] == list("bu-")
Example #2
    def test_syllabify(self):
        seq1 = "t i a o ¹ b u ² d a o"
        seq2 = "jabloko"
        seq3 = "jabəlko"
        seq4 = "j a b u - k o"
        seq5 = "ma⁵io"

        assert_raises(ValueError, syllabify, seq1, output="test")

        assert syllabify(seq1,
                         output="flat").count(rc('morpheme_separator')) == 2
        assert syllabify(seq2, output="breakpoints")[0] == (0, 2)
        assert syllabify(seq3, output="nested")[1] == list("bəl")
        assert syllabify(seq4, output="nested")[1] == list("bu-")
        assert ''.join(syllabify(seq5, sep="+")).split('+')[-1] == 'io'
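Taken together, the two tests above document the calling conventions of lingpy's syllabify. Below is a minimal sketch of the three output modes (assuming only that lingpy is installed; the exact syllable boundaries depend on lingpy's sonority model):

from lingpy.sequence.syllabifier import syllabify

word = "jabloko"

# "flat": one token list, with syllable separators inserted
print(syllabify(word, output="flat"))

# "breakpoints": (start, end) index pairs, e.g. (0, 2) for the first syllable
print(syllabify(word, output="breakpoints"))

# "nested": one sub-list of segments per syllable, e.g. [['j', 'a'], ...]
print(syllabify(word, output="nested"))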
Example #3
def prepare(ds):
    # `ds` is the dataset wrapper: ds.raw() resolves raw-file paths and
    # ds.lid2lang maps language ids to language names; csv2list, syllabify,
    # and Wordlist come from lingpy
    rawdata = csv2list(ds.raw('Burling1967.csv'), strip_lines=False)

    concepts = []
    header = rawdata[0]
    crect = {}
    D = {
        0: [
            'doculect', 'concept', 'doculect_is', 'concept_number_is',
            'value_is', 'value', 'notes', 'notes_mm', 'cogid', 'segments'
        ]
    }
    idx = 1
    cogid = 1
    # walk the rows: one concept per row, one doculect per column
    for i, line in enumerate(rawdata[1:]):
        tmp = dict(zip(header, line))
        concepts += [(tmp['number'], tmp['meaning'])]
        for lid, lang in sorted(ds.lid2lang.items()):
            cell = tmp[lid]
            if cell.strip() != '-' and cell.strip():
                # apply manual corrections (crect, here empty) and join the
                # syllabified form with ' + ' as syllable separator
                ccell = crect.get(cell, cell)
                syls = ' + '.join(
                    [' '.join(x) for x in syllabify(ccell, output='nested')])
                D[idx] = [
                    lang, tmp['meaning'], lid, tmp['number'], cell, ccell,
                    tmp['notes'], tmp['MM notes'], cogid, syls
                ]
                idx += 1
        cogid += 1
    with open(ds.raw('concepts.tsv'), 'w') as f:
        f.write('NUMBER\tENGLISH\n')
        for a, b in concepts:
            f.write(a + '\t' + b + '\n')
    ds.write_wordlist(Wordlist(D))
Example #4
def lexeme2syllables(lexeme):

    # Pre-process the lexeme: the syllabification algorithm is stricter than
    # the annotation format, so slashes, morpheme marks, etc. must be removed
    phonemes = [
        phoneme.split("/")[1] if "/" in phoneme else phoneme
        for phoneme in lexeme.split()
    ]
    phonemes = [phoneme for phoneme in phonemes if phoneme not in ["+", "_"]]
    clean_lexeme = " ".join(phonemes)

    # Get the syllables
    syllables = syllabify(clean_lexeme, output="nested")

    return syllables
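A hypothetical call illustrating lexeme2syllables (the input string is invented for illustration, and the exact split depends on lingpy's syllabifier):

# slash annotations keep only the right-hand side ("ə/a" -> "a"),
# and the "+" / "_" morpheme markers are dropped before syllabifying
syllables = lexeme2syllables("b ə/a l + k a")
print(syllables)  # nested lists, e.g. [['b', 'a', 'l'], ['k', 'a']]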
Example #5
# (snippet truncated above in the source: mapping from source language
# names to doculect identifiers)
doculects = {
    'Burmese (Written)': 'Written_Burmese',
    'Burmese (Rangoon)': 'Rangoon',
    'Achang (Xiandao)': 'Xiandao',
    'Langsu (Luxi)': 'Maru',
}
D = {}
idx = 1
for line in tbl[1:]:
    ridx, reflex, gloss = line[:3]
    if line[-1].strip():
        # `tbl` is rebound to the concept number here; the loop is
        # unaffected, as the slice `tbl[1:]` above was evaluated only once
        tbl = str(int(line[-1].split('.')[0]))
        lang = line[6].strip()
        if lang in doculects and tbl in new_concepts:
            doc = doculects[lang]
            tokens = syllabify(ipa2tokens(reflex.replace(' ', '_'), 
                    merge_vowels=False, expand_nasals=True,
                    semi_diacritics='shɕʑʃʒʐʂ'))
            ipa = ''.join(tokens)
            alm = []
            for t in tokens:
                # unmapped tokens are kept as-is; bracket/underscore
                # markers get a '†' prefix
                alm += [sounds.get((t, doc), '†' + t if t in '()_' else t)]
            alm = ' '.join(alm)
            tokens = ' '.join(tokens)
            concept = new_concepts[tbl][0]
            if reflex.strip() != '*':
                # the snippet breaks off here in the source; the row
                # presumably continues with values computed above
                # (tokens, alm, concept, doc)
                D[idx] = [ridx,
                          reflex,
                          gloss,
                          ipa]
Example #6
# (snippet truncated above in the source: `wordlists`, `gs`, `all_cols`,
# `all_sounds`, `all_colors`, and `color` are defined earlier, as are the
# imports of matplotlib.pyplot as plt, defaultdict, and lingpy)
for i, w in enumerate(wordlists):
    wl = Wordlist(w)
    colors = {}
    tmp = defaultdict(int)  # sound-class frequencies for the pie chart
    sylen = []  # syllable count per word
    clen = []  # count of non-vowel, non-tone segments per word
    for k in wl:
        dolgos = tokens2class(wl[k, 'tokens'], 'dolgo')
        for idx, t in zip(dolgos, wl[k, 'tokens']):
            if idx not in '+':
                tmp[idx] += 1
                colors[idx] = token2class(t, color)
                all_cols += [(k, colors[idx])]
                all_sounds[idx] += 1
                all_colors[idx] = colors[idx]
        sylen += [len(syllabify(' '.join(wl[k, 'tokens']), output='nested'))]
        clen += [len([x for x in dolgos if x not in '1V'])]
    print(w, sum(sylen) / len(sylen), sum(clen) / len(clen))
    ax = plt.subplot(gs[i])
    labels = [x for x, y in sorted(tmp.items(), key=lambda x: x[0])]
    ax.pie([y for x, y in sorted(tmp.items(), key=lambda x: x[0])],
            colors=[y for x, y in sorted(colors.items(), key=lambda x: x[0])],
            radius = 0.95, frame=True, shadow=True)
    ax.set_autoscale_on(False)
    plt.ylim(-1, 1)
    plt.xlim(-1, 1)
    plt.title(w.split('_')[2].split('-')[0])
    plt.axis('off')
    ax.set_aspect('equal')

print('plotting')
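As the snippet above exploits, the length of a "nested" syllabification is the syllable count of a tokenized word; a minimal check (assuming lingpy is installed):

from lingpy.sequence.syllabifier import syllabify

tokens = ['t', 'i', 'a', 'o', '¹']
print(len(syllabify(' '.join(tokens), output='nested')))  # number of syllables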
Example #7
    def cmd_makecldf(self, args):
        with self.cldf_writer(args) as writer:
            # load the data as a wordlist, as we need to bring the already
            # segmented entries in line with CLTS
            wl = lingpy.Wordlist(
                self.raw_dir.joinpath("words.tsv").as_posix(),
                conf=self.raw_dir.joinpath("wordlist.rc").as_posix(),
            )
            wl.add_entries(
                "new_segments",
                "segments",
                lambda x: syllabify(
                    self.tokenizer({}, "^" + "".join(x) + "$", column="IPA"),
                    cldf=True),
            )

            writer.add_sources()

            # note: no way to easily replace this with the direct call to `add_concepts`
            # as we add the Chinese gloss via concept.attributes
            concept_lookup = {}
            for concept in self.conceptlists[0].concepts.values():
                idx = concept.id.split("-")[-1] + "_" + slug(concept.gloss)
                writer.add_concept(
                    ID=idx,
                    Name=concept.gloss,
                    Chinese_Gloss=concept.attributes["chinese"],
                    Number=concept.number,
                    Concepticon_ID=concept.concepticon_id,
                    Concepticon_Gloss=concept.concepticon_gloss,
                )
                concept_lookup[concept.number] = idx

            language_lookup = writer.add_languages(lookup_factory="Name")

            for k in pylexibank.progressbar(wl,
                                            desc="wl-to-cldf",
                                            total=len(wl)):
                if wl[k, "value"]:
                    form = self.form_spec.clean(form=wl[k, "value"], item=None)

                    writer.add_form_with_segments(
                        Language_ID=language_lookup[wl[k, "doculect"]],
                        Parameter_ID=concept_lookup[wl[k, "beida_id"]],
                        Value=wl[k, "value"],
                        Form=form,
                        Segments=wl[k, "new_segments"],
                        Source="Cihui",
                        Benzi=wl[k, "benzi"],
                    )

            # We explicitly remove the ISO code column, since the languages in
            # this dataset do not have ISO codes.
            writer.cldf["LanguageTable"].tableSchema.columns = [
                col for col in writer.cldf["LanguageTable"].tableSchema.columns
                if col.name != "ISO639P3code"
            ]
            language_table = writer.cldf["LanguageTable"]

        with self.cldf_writer(args, cldf_spec="structure",
                              clean=False) as writer:
            writer.cldf.add_component(language_table)
            writer.objects["LanguageTable"] = self.languages
            inventories = self.raw_dir.read_csv("inventories.tsv",
                                                normalize="NFC",
                                                delimiter="\t",
                                                dicts=True)
            writer.cldf.add_columns(
                "ParameterTable",
                {
                    "name": "CLTS_BIPA",
                    "datatype": "string"
                },
                {
                    "name": "CLTS_Name",
                    "datatype": "string"
                },
                {
                    "name": "Lexibank_BIPA",
                    "datatype": "string"
                },
                {
                    "name": "Prosody",
                    "datatype": "string"
                },
            )
            writer.cldf.add_columns("ValueTable", {
                "name": "Context",
                "datatype": "string"
            })
            clts = CLTS(args.clts.dir)
            bipa = clts.transcriptionsystem_dict["bipa"]
            td = clts.transcriptiondata_dict["beidasinitic"]
            pids, visited = {}, set()
            for row in pylexibank.progressbar(inventories, desc="inventories"):
                if not row["Value"].startswith("(") and row["Value"] != "Ø":
                    for s1, s2, p in zip(row["Value"].split(),
                                         row["Lexibank"].split(),
                                         row["Prosody"].split()):
                        s1 = normalize("NFD", s1)
                        pidx = "-".join(
                            [str(hex(ord(s)))[2:].rjust(4, "0")
                             for s in s1]) + "_" + p

                        if s1 not in td.grapheme_map:
                            args.log.warn("missing sound {0} / {1}".format(
                                s1, " ".join([str(hex(ord(x))) for x in s1])))
                        else:
                            sound = bipa[td.grapheme_map[s1]]
                            sound_name = (sound.name if sound.type
                                          not in ["unknown", "marker"] else "")
                            if pidx not in visited:
                                visited.add(pidx)
                                writer.objects["ParameterTable"].append({
                                    "ID":
                                    pidx,
                                    "Name":
                                    s1,
                                    "Description":
                                    sound_name,
                                    "CLTS_BIPA":
                                    td.grapheme_map[s1],
                                    "CLTS_Name":
                                    sound_name,
                                    "Lexibank_BIPA":
                                    s2,
                                    "Prosody":
                                    p,
                                })
                            writer.objects["ValueTable"].append({
                                "ID":
                                row["Language_ID"] + "_" + pidx,
                                "Language_ID":
                                row["Language_ID"],
                                "Parameter_ID":
                                pidx,
                                "Value":
                                s1,
                                "Context":
                                p,
                                "Source": ["Cihui"],
                            })