Example #1
def test_rules(tokenizer_with_profile, tokenizer, testdata):
    assert tokenizer.rules('abc') == 'abc'
    assert tokenizer_with_profile.rules("aabchonn-ih") == "  ii-ii"
    assert Tokenizer(profile=testdata / 'profile_without_rules.prf').rules('aa') != \
        tokenizer_with_profile.rules('aa')
    rules = Rules((r'(a|á|e|é|i|í|o|ó|u|ú)(n)(\s)(a|á|e|é|i|í|o|ó|u|ú)', r'\1 \2 \4'))
    assert rules.apply('tan ab') == 'ta n ab'
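Rules apply ordered regular-expression substitutions after tokenization. A minimal standard-library sketch of the same effect (the pattern and expected result are taken from the test above; sequential re.sub application is an assumption about equivalent behaviour, not the library's implementation):

import re

rules = [(r'(a|á|e|é|i|í|o|ó|u|ú)(n)(\s)(a|á|e|é|i|í|o|ó|u|ú)', r'\1 \2 \4')]
s = 'tan ab'
for pattern, replacement in rules:
    s = re.sub(pattern, replacement, s)   # apply each rule in order
assert s == 'ta n ab'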
Example #2
def test_normalization():
    specs = [
        {'Grapheme': 'ä'},
        {'Grapheme': 'aa'},
        {'Grapheme': 'a'},
    ]
    prf = Profile(*specs, **{'form': 'NFD'})
    t = Tokenizer(profile=prf)
    # "aa" matches, because the "ä" is decomposed:
    assert t(unicodedata.normalize('NFD', 'aä')) == 'aa ' + REPLACEMENT_MARKER
    # A composed "ä" doesn't match anymore:
    assert t(unicodedata.normalize('NFC', 'aä')) == 'a ' + REPLACEMENT_MARKER
    prf = Profile(*specs, **{'form': 'NFC'})
    t = Tokenizer(profile=prf)
    # "aa" doesn't match here, this is typically the behaviour one wants:
    assert t(unicodedata.normalize('NFC', 'aä')) == 'a ä'
    assert t(unicodedata.normalize('NFD', 'aä')) == 'aa ' + REPLACEMENT_MARKER
Example #3
def tokenize(args):
    """
    Tokenize a string (passed as argument or read from stdin)

    segments [--profile=PATH/TO/PROFILE] tokenize [STRING]
    """
    if args.profile and not Path(args.profile).exists():  # pragma: no cover
        raise ParserError('--profile must be a path for an existing file')
    _write(args,
           Tokenizer(profile=args.profile)(_read(args), column=args.mapping))
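The subcommand is a thin wrapper around a Tokenizer call; a rough Python equivalent, with a hypothetical profile path and without the stdin/stdout plumbing:

from segments import Tokenizer

# Roughly `segments --profile=etc/profile.tsv tokenize "aabchonn-ih"`
# (the profile path and the input string are placeholders).
print(Tokenizer(profile='etc/profile.tsv')('aabchonn-ih'))

# Without --profile the default tokenizer just splits grapheme clusters.
print(Tokenizer()('aabchonn-ih'))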
Example #4
def test_errors(profile_path):
    t = Tokenizer(profile_path, errors_replace=lambda c: '<{0}>'.format(c))
    assert t('habe') == '<i> a b <e>'

    with pytest.raises(ValueError):
        t('habe', form='xyz')

    with pytest.raises(ValueError):
        t('habe', errors='strict')

    assert t('habe', errors='ignore') == 'a b'
Example #5
def test_profile():
    prf = Profile(
        {'Grapheme': 'bischen', 'Out': 'b i s ch e n'},
        {'Grapheme': 'sch', 'Out': 'sch'},
        {'Grapheme': 'n', 'Out': 'n'},
        {'Grapheme': 'a', 'Out': 'a'},
        {'Grapheme': 'e', 'Out': 'e'},
        {'Grapheme': 'n', 'Out': 'n'},
    )
    t = Tokenizer(profile=prf)
    assert t('bischen', column='Out') == 'b i s ch e n'
    assert t('naschen', column='Out') == 'n a sch e n'
    assert t('x', column='Out') == REPLACEMENT_MARKER

    prf = Profile(
        {'Grapheme': 'uu'},
        {'Grapheme': 'b'},
        {'Grapheme': 'o'},
    )
    t = Tokenizer(profile=prf)
    assert t('uubo uubo') == 'uu b o # uu b o'
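The second profile also illustrates that matching is greedy: the tokenizer always consumes the longest grapheme the profile allows (the same behaviour that lets 'bischen' win over 'sch' above). A small sketch, assuming that longest-match behaviour:

from segments import Profile, Tokenizer

prf = Profile({'Grapheme': 'u'}, {'Grapheme': 'uu'}, {'Grapheme': 'b'}, {'Grapheme': 'o'})
assert Tokenizer(profile=prf)('uubo') == 'uu b o'   # 'uu' wins over 'u'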
Example #6
    def tokenizer(self):
        """
        Datasets can provide support for segmentation (aka tokenization) in two ways:
        - by providing an orthography profile at etc/orthography.tsv or
        - by overwriting this method to return a custom tokenizer callable.

        :return: A callable to do segmentation.

        The expected signature of the callable is

            def t(item, string, **kw)

        where
        - `item` is a `dict` representing the complete CLDF FormTable row
        - `string` is the string to be segmented
        - `kw` may be used to pass any context info to the tokenizer, when called
          explicitly.
        """
        tokenizers = {
            k: Tokenizer(profile=p, errors_replace=lambda c: '<{0}>'.format(c))
            for k, p in self.orthography_profile_dict.items()
        }

        if tokenizers:

            def _tokenizer(item, string, **kw):
                """
                Adds `Profile` and `Graphemes` keys to `item`, returns `list` of segments.
                """
                kw.setdefault("column", "IPA")
                kw.setdefault("separator", " + ")
                profile = kw.pop('profile', None)
                if profile:
                    tokenizer = tokenizers[profile]
                    item['Profile'] = profile
                elif isinstance(item, dict) \
                        and 'Language_ID' in item \
                        and item['Language_ID'] in tokenizers:
                    tokenizer = tokenizers[item['Language_ID']]
                    item['Profile'] = item['Language_ID']
                else:
                    tokenizer = tokenizers[None]
                    item['Profile'] = 'default'
                form = self.form_for_segmentation(string)
                res = tokenizer(form, **kw).split()
                kw['column'] = Profile.GRAPHEME_COL
                item['Graphemes'] = tokenizer(form, **kw)
                return res

            return _tokenizer
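A hypothetical call of the returned callable, showing its side effects on the row dict; the dataset object `ds`, the language id, and the form are made up for illustration:

# `ds` is assumed to be a dataset object exposing the `tokenizer` method above.
segment = ds.tokenizer()
row = {'Language_ID': 'abcd1234', 'Form': 'aabchonn-ih'}
segments = segment(row, row['Form'])      # -> list of segments
# Side effect: `row` now also carries 'Profile' and 'Graphemes' keys.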
Example #7
    def tokenizer(self):
        """
        Datasets can provide support for segmentation (aka tokenization) in two ways:
        - by providing an orthography profile at etc/orthography.tsv or
        - by overwriting this method to return a custom tokenizer callable.

        :return: A callable to do segmentation.

        The expected signature of the callable is

            def t(item, string, **kw)

        where
        - `item` is a `dict` representing the complete CLDF FormTable row
        - `string` is the string to be segmented
        - `kw` may be used to pass any context info to the tokenizer, when called
          explicitly.
        """
        profile = self.dir / 'etc' / 'orthography.tsv'
        if profile.exists():
            profile = Profile.from_file(str(profile), form='NFC')
            default_spec = list(next(iter(profile.graphemes.values())).keys())
            for grapheme in ['^', '$']:
                if grapheme not in profile.graphemes:
                    profile.graphemes[grapheme] = {
                        k: None
                        for k in default_spec
                    }
            profile.tree = Tree(list(profile.graphemes.keys()))
            tokenizer = Tokenizer(profile=profile,
                                  errors_replace=lambda c: '<{0}>'.format(c))

            def _tokenizer(item, string, **kw):
                kw.setdefault("column", "IPA")
                kw.setdefault("separator", " + ")
                return tokenizer(
                    unicodedata.normalize('NFC', '^' + string + '$'),
                    **kw).split()

            return _tokenizer
Example #8
def main(args):
    # Initiate tokenizer and profile
    profile = Profile.from_file(args.profile)
    tokenizer = Tokenizer(profile=profile)

    # Open file and check items
    errors = []
    with open(args.wordlist) as handler:
        reader = csv.DictReader(handler, delimiter="\t")
        for count, row in enumerate(reader):
            segments = my_tokenizer(row[args.form], tokenizer)
            reference = row[args.segments]
            if segments != reference:
                errors.append([row["ID"], row[args.form], segments, reference])

            if args.l:
                if count > args.l:
                    break

    # Output
    print(tabulate(errors, headers=["ID", "Form", "Result", "Reference"]))
    print("Errors: %i/%i (%.2f%%)" % (len(errors), count + 1,
                                      (len(errors) / (count + 1)) * 100))
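`my_tokenizer` is defined elsewhere in the original script; a hypothetical stand-in, consistent with how it is called above (form string plus tokenizer in, segment string out for comparison with the reference column):

def my_tokenizer(form, tokenizer):
    # Hypothetical stand-in: the real helper may clean the form or pick a
    # specific profile column before tokenizing.
    return tokenizer(form)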
Example #9
def tokenizer():
    return Tokenizer()
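Without a profile, the Tokenizer falls back to Unicode grapheme-cluster segmentation; a quick sketch of the expected default behaviour:

from segments import Tokenizer

t = Tokenizer()
assert t('abcd') == 'a b c d'          # grapheme clusters, space-separated
assert t('ab cd') == 'a b # c d'       # words joined with the default " # "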
Example #10
def test_tokenize_with_profile_from_object():
    prf = Profile(dict(Grapheme='aa', mapping=['x', 'y']),
                  dict(Grapheme='b', mapping='z'))
    assert Tokenizer(profile=prf)('aab', column='mapping') == 'x y z'
Example #11
def tokenizer_with_profile(profile_path):
    return Tokenizer(profile_path)
Example #12
    def cmd_makecldf(self, args):
        data = self.raw_dir.read_csv("tukano.csv", dicts=True)
        args.writer.add_sources()

        # Get our own tokenizer from the orthography profile. Because of
        # multi-profile support the profile is looked up in a dict; with a
        # single profile, that dict has one item, keyed by `None`.
        tokenizer = Tokenizer(profile=self.orthography_profile_dict[None])

        def _re_tokenize(segmented):
            """ Generator of re-tokenized sequences.

            Used to re-tokenize alignments, which is needed due to changes
            in the orthography profile

            Args:
                segmented: list of strings

            Generates: tokenized segments
            """
            preserve_chars = {"(", ")", "-"}
            for seg in segmented:
                if seg in preserve_chars:
                    yield seg
                else:
                    normalized = self.form_for_segmentation(seg)
                    tokenized = tokenizer(normalized, column="IPA")
                    yield from tokenized.split(" ")

        concept_lookup = {}
        for concept in self.conceptlists[0].concepts.values():
            c_id = "{0}-{1}".format(
                concept.id.split("-")[-1], slug(concept.english))
            concept_lookup[concept.english] = c_id
            args.writer.add_concept(
                ID=c_id,
                Concepticon_ID=concept.concepticon_id,
                Concepticon_Gloss=concept.concepticon_gloss,
                Name=concept.english,
            )

        language_lookup = {}
        for language in self.languages:
            args.writer.add_language(ID=language["ID"],
                                     Glottocode=language["Glottocode"],
                                     Name=language["Name"])
            language_lookup[language["ID_in_raw"]] = language["ID"]

        # add data
        for row in pylexibank.progressbar(data):
            language_id = language_lookup[row["DOCULECT"]]
            c_id = concept_lookup[row["CONCEPT"]]

            # The alignments were corrected by hand; when they differ from
            # the segments, the correct notation is in the alignments.
            tokens = row["TOKENS"].split()
            alignment = row["ALIGNMENT"].split(" ")
            stripped_alignments = [
                s for s in alignment if s not in {"(", "-", ")"}
            ]
            if tokens != stripped_alignments:
                tokens = stripped_alignments

            lex = args.writer.add_form(
                Language_ID=language_id,
                Parameter_ID=c_id,
                Value=row["IPA"],
                # This is a workaround to re-tokenize tokens
                Form=".".join(tokens),
                Source=["Chacon2014"],
            )

            # add cognates -- make sure Cognateset_ID is global!
            args.writer.add_cognate(
                lexeme=lex,
                Cognateset_ID="{0}-{1}".format(c_id, row["COGID"]),
                Source=["Chacon2014"],
                Alignment=list(_re_tokenize(alignment)),
                Alignment_Method="expert",
                Alignment_Source="Chacon2014",
            )
Example #13
import csv

import pandas as pd
from segments import Tokenizer


def convert_tone(tokens):
    # NOTE: the original excerpt started mid-function; this head (signature,
    # outer loop, and initialisations) is reconstructed from context -- the
    # call `convert_tone(x)` below and the indentation of the surviving lines.
    result = []
    for segment in tokens.split():
        new_segment = ""
        tone = ""
        for c in segment:
            if c in tones:
                tone += tones[c]
            else:
                new_segment += c
        if len(new_segment) > 0: result.append(new_segment)
        if len(tone) > 0: result.append(tone)
    return " ".join(result)


# Load tones.csv as a lookup table
with open('op/tones.csv', mode='r') as infile:
    reader = csv.reader(infile)
    tones = {rows[0]: rows[1] for rows in reader}


# Heath2016 orthography profile
t = Tokenizer("op/Heath2016-profile.tsv")

# Dogon data to tokenize
df = pd.read_csv("data/dogon-wordlist-long.csv", index_col="ID")

# Tokenize
tokenizer = lambda x: t.transform(x, column="IPA")
tone_changer = lambda x: convert_tone(x)
df['TOKENS'] = pd.Series(df['COUNTERPART'].apply(tokenizer))
df['TOKENS_CHAO'] = pd.Series(df['TOKENS'].apply(tone_changer))
df['TOKENS'] = df['TOKENS'].str.strip()
df['TOKENS_CHAO'] = df['TOKENS_CHAO'].str.strip()

df.to_csv('data/dogon-wordlist-lingpy-format.csv')
# df.to_csv('final-wordlist-new-tone.tsv', sep="\t")
Example #14
#!/usr/bin/env python
"""Similarity code tentative cognates in a word list and align them"""

import sys
from pycldf.util import Path
import hashlib
import argparse

import lingpy
import lingpy.compare.partial

from pylexirumah import get_dataset
from segments import Profile, Tokenizer
from pyclts import TranscriptionSystem

tokenizer = Tokenizer()
bipa = TranscriptionSystem("bipa")


def sha1(path):
    return hashlib.sha1(str(path).encode('utf-8')).hexdigest()[:12]


def clean_segments(row):
    """Reduce the row's segments to not contain empty morphemes.

    This function removes all unknown sound segments (/0/) from the "Segments"
    list of the `row` dict it is passed, and removes empty morphemes by
    collapsing subsequent morpheme boundaries (_#◦+→←) into one. The `row` is
    modified in-place, the resulting cleaned segment list is returned.
    """
    # ... (the function body is not included in this excerpt) ...


# The excerpt is cut here and resumes mid-way through a `sounds` collection;
# only its final diphthong entries survive. The assignment and list opener
# below are reconstructed -- the original may define further sounds before
# these entries.
sounds = [
    "e͡i",
    "a͡i",
    "o͡i",
    "u͡i",
    "a͡e",
    "o͡e",
    "e͡o",
    "a͡o",
    "i͡u",
    "e͡u",
    "a͡u",
    "o͡u",
]
tokenizer = Tokenizer(
    Profile(*({"Grapheme": x, "mapping": x} for x in sounds)),
    errors_ignore=lambda c: c)

from pylexirumah import get_dataset, repository


def needleman_wunsch(x,
                     y,
                     lodict={},
                     gop=-2.5,
                     gep=-1.75,
                     local=False,
                     indel=''):
    """Needleman-Wunsch algorithm with affine gaps penalties.
Example #16
def test_characters():
    t = Tokenizer()
    assert t.characters(
        "ĉháɾã̌ctʼɛ↗ʐː| k͡p") == "c ̂ h a ́ ɾ a ̃ ̌ c t ʼ ɛ ↗ ʐ ː | # k ͡ p"
    assert t.characters('abc def', segment_separator='_',
                        separator='|') == 'a_b_c|d_e_f'
Example #17
def test_jipa(lang, testdata):
    tokenize = Tokenizer()
    assert tokenize(_read_data(testdata / (lang + '_input.txt')), ipa=True) ==\
        _read_data(testdata / (lang + '_output.txt'))
Example #18

def clean_for_lemmatization(word):
    word = remove_other_chars(word)
    word = replace_greek_word(word)
    word = replace_salus(word)
    word = replace_lacuna(word)
    word = replace_full_stop(word)
    word = replace_j(word)
    word = word.lower()
    return word


# Tokenize

tokenize_graphemes = Tokenizer(GRAPHEME_PROFILE)


def clean_and_tokenize(word, for_lemmatization=False):
    if for_lemmatization:
        word = clean_for_lemmatization(word)
    else:
        word = clean(word)
    graphemes = tokenize_graphemes(
        word, segment_separator=GRAPHEME_SEPARATOR, column="mapping"
    )
    if graphemes:
        return GRAPHEME_SEPARATOR.join([START_WORD, graphemes, END_WORD])
    return ""

Example #19
def recode(s):
    t = Tokenizer(profile=_orthography_profile(s))
    return t(s.replace('\n', NEWLINE),
             column='IPA',
             segment_separator='',
             separator=' ').replace(SPACE, ' ')
Example #20
def test_single_combining_character():
    assert Tokenizer()("ˈ", ipa=True) == "ˈ"
    assert Tokenizer()("ʲ", ipa=True) == "ʲ"
Example #21
    def cmd_install(self, **kw):
        # Read individual orthographic profiles, extract the corresponding
        # doculect ids (here, glottocodes), and build the appropriate
        # tokenizers
        profile_files = sorted(glob.glob(str(self.dir / "etc" / "*.prof")))
        doculect_codes = [
            os.path.splitext(os.path.basename(pf))[0] for pf in profile_files
        ]

        self.doc_tokenizers = {
            doculect: Tokenizer(
                profile=Profile.from_file(pf, form="NFC"),
                errors_replace=lambda c: "<{0}>".format(c),
            )
            for pf, doculect in zip(profile_files, doculect_codes)
        }

        # Cache the Concepticon IDs
        concepticon = {
            x.attributes["wold_id"]: x.concepticon_id
            for x in self.conceptlist.concepts.values()
        }

        # cache the field names for CLDF output
        fields = self.lexeme_class.fieldnames()

        # Write data to CLDF
        with self.cldf as ds:
            vocab_ids = [
                v["ID"] for v in self.original_cldf["contributions.csv"]
            ]

            # add sources
            self.add_sources(ds)

            # add languages and build map for choosing the right profile
            lang_map = {}
            for row in self.original_cldf["LanguageTable"]:
                gc, iso = row["Glottocode"], row["ISO639P3code"]
                if gc == "tzot1264":
                    gc, iso = "tzot1259", "tzo"
                if row["ID"] in vocab_ids:
                    ds.add_language(ID=row["ID"],
                                    Name=row["Name"],
                                    Glottocode=gc,
                                    ISO639P3code=iso)

                # Add to map only those which are receivers
                if int(row["ID"]) <= 41:
                    lang_map[row["ID"]] = gc

            # add parameters
            for row in self.original_cldf["ParameterTable"]:
                ds.add_concept(
                    ID=row["ID"],
                    Name=row.pop("Name"),
                    Concepticon_ID=concepticon.get(row["ID"]),
                )

            # Being explicit on what we are adding
            for row in self.original_cldf["FormTable"]:
                if row["Language_ID"] in vocab_ids:
                    # Copy the raw Form to Value, clean form, and tokenize
                    row["Value"] = row["Form"]
                    row["Form"] = self.clean_form(row["Form"])
                    row["Segments"] = self.tokenizer(
                        row["Form"], lang_map[row["Language_ID"]])

                    # Note: We count words marked as "probably borrowed" as loans.
                    row["Loan"] = float(row["BorrowedScore"]) > 0.6

                    ds.add_form_with_segments(
                        **{k: v
                           for k, v in row.items() if k in fields})
Example #22
import csv

import pandas as pd
from segments import Tokenizer


def convert_tone(tokens):
    # NOTE: the original excerpt started mid-function; this head is
    # reconstructed from context and may differ from the original source.
    result = []
    for segment in tokens.split():
        new_segment = ""
        tone = ""
        for c in segment:
            if c in tones:
                tone += tones[c]
            else:
                new_segment += c
        if len(new_segment) > 0: result.append(new_segment)
        if len(tone) > 0: result.append(tone)
    return " ".join(result)


# Load tones.csv as a lookup table
with open('op/tones.csv', mode='r') as infile:
    reader = csv.reader(infile)
    tones = {rows[0]: rows[1] for rows in reader}

# Heath2016 orthography profile
t = Tokenizer("op/Heath2016-profile.tsv")

# Dogon data to tokenize
df = pd.read_csv("data/dogon-wordlist-long.csv", index_col="ID")

# Tokenize
tokenizer = lambda x: t.transform(x, column="IPA")
tone_changer = lambda x: convert_tone(x)
df['TOKENS'] = pd.Series(df['COUNTERPART'].apply(tokenizer))
df['TOKENS_CHAO'] = pd.Series(df['TOKENS'].apply(tone_changer))
df['TOKENS'] = df['TOKENS'].str.strip()
df['TOKENS_CHAO'] = df['TOKENS_CHAO'].str.strip()

df.to_csv('data/dogon-wordlist-lingpy-format.csv')
# df.to_csv('final-wordlist-new-tone.tsv', sep="\t")
Example #23
def get_orthography(name):
    return Tokenizer(Profile.from_file(profile_path(name + '.tsv'),
                                       form='NFD'))