def test_rules(tokenizer_with_profile, tokenizer, testdata):
    assert tokenizer.rules('abc') == 'abc'
    assert tokenizer_with_profile.rules("aabchonn-ih") == " ii-ii"
    assert Tokenizer(profile=testdata / 'profile_without_rules.prf').rules('aa') != \
        tokenizer_with_profile.rules('aa')
    rules = Rules((r'(a|á|e|é|i|í|o|ó|u|ú)(n)(\s)(a|á|e|é|i|í|o|ó|u|ú)', r'\1 \2 \4'))
    assert rules.apply('tan ab') == 'ta n ab'
def test_normalization():
    specs = [
        {'Grapheme': 'ä'},
        {'Grapheme': 'aa'},
        {'Grapheme': 'a'},
    ]
    prf = Profile(*specs, **{'form': 'NFD'})
    t = Tokenizer(profile=prf)
    # "aa" matches, because the "ä" is decomposed:
    assert t(unicodedata.normalize('NFD', 'aä')) == 'aa ' + REPLACEMENT_MARKER
    # A composed "ä" doesn't match anymore:
    assert t(unicodedata.normalize('NFC', 'aä')) == 'a ' + REPLACEMENT_MARKER

    prf = Profile(*specs, **{'form': 'NFC'})
    t = Tokenizer(profile=prf)
    # "aa" doesn't match here; this is typically the behaviour one wants:
    assert t(unicodedata.normalize('NFC', 'aä')) == 'a ä'
    assert t(unicodedata.normalize('NFD', 'aä')) == 'aa ' + REPLACEMENT_MARKER
def tokenize(args):
    """
    Tokenize a string (passed as argument or read from stdin)

    segments [--profile=PATH/TO/PROFILE] tokenize [STRING]
    """
    if args.profile and not Path(args.profile).exists():  # pragma: no cover
        raise ParserError('--profile must be a path for an existing file')
    _write(args, Tokenizer(profile=args.profile)(_read(args), column=args.mapping))
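# The command above is a thin wrapper around the library API. A minimal sketch of
# the equivalent direct call (the profile path and the input string are
# illustrative placeholders, not part of the original module):
from segments import Tokenizer

print(Tokenizer(profile='PATH/TO/PROFILE')('aabchonn-ih'))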
def test_errors(profile_path):
    t = Tokenizer(profile_path, errors_replace=lambda c: '<{0}>'.format(c))
    # 'h' and 'e' are not in the profile, so they are passed to errors_replace:
    assert t('habe') == '<h> a b <e>'
    with pytest.raises(ValueError):
        t('habe', form='xyz')
    with pytest.raises(ValueError):
        t('habe', errors='strict')
    assert t('habe', errors='ignore') == 'a b'
def test_profile():
    prf = Profile(
        {'Grapheme': 'bischen', 'Out': 'b i s ch e n'},
        {'Grapheme': 'sch', 'Out': 'sch'},
        {'Grapheme': 'n', 'Out': 'n'},
        {'Grapheme': 'a', 'Out': 'a'},
        {'Grapheme': 'e', 'Out': 'e'},
        {'Grapheme': 'n', 'Out': 'n'},
    )
    t = Tokenizer(profile=prf)
    assert t('bischen', column='Out') == 'b i s ch e n'
    assert t('naschen', column='Out') == 'n a sch e n'
    assert t('x', column='Out') == REPLACEMENT_MARKER

    prf = Profile(
        {'Grapheme': 'uu'},
        {'Grapheme': 'b'},
        {'Grapheme': 'o'},
    )
    t = Tokenizer(profile=prf)
    assert t('uubo uubo') == 'uu b o # uu b o'
def tokenizer(self):
    """
    Datasets can provide support for segmentation (aka tokenization) in two ways:
    - by providing an orthography profile at etc/orthography.tsv or
    - by overwriting this method to return a custom tokenizer callable.

    :return: A callable to do segmentation.

    The expected signature of the callable is

        def t(item, string, **kw)

    where
    - `item` is a `dict` representing the complete CLDF FormTable row
    - `string` is the string to be segmented
    - `kw` may be used to pass any context info to the tokenizer, when called
      explicitly.
    """
    tokenizers = {
        k: Tokenizer(profile=p, errors_replace=lambda c: '<{0}>'.format(c))
        for k, p in self.orthography_profile_dict.items()
    }
    if tokenizers:
        def _tokenizer(item, string, **kw):
            """
            Adds `Profile` and `Graphemes` keys to `item`, returns `list` of segments.
            """
            kw.setdefault("column", "IPA")
            kw.setdefault("separator", " + ")
            profile = kw.pop('profile', None)
            if profile:
                tokenizer = tokenizers[profile]
                item['Profile'] = profile
            elif isinstance(item, dict) \
                    and 'Language_ID' in item \
                    and item['Language_ID'] in tokenizers:
                tokenizer = tokenizers[item['Language_ID']]
                item['Profile'] = item['Language_ID']
            else:
                tokenizer = tokenizers[None]
                item['Profile'] = 'default'
            form = self.form_for_segmentation(string)
            res = tokenizer(form, **kw).split()
            kw['column'] = Profile.GRAPHEME_COL
            item['Graphemes'] = tokenizer(form, **kw)
            return res

        return _tokenizer
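# Usage sketch for the callable returned above (the row contents and profile key
# are illustrative assumptions; `dataset` stands for a pylexibank Dataset whose
# orthography_profile_dict has a default profile keyed by `None`, and whose
# `tokenizer` is exposed as a cached property):
row = {'Language_ID': 'kala1399', 'Value': 'afu'}
segments = dataset.tokenizer(row, row['Value'])  # -> list of segment strings
# As a side effect, `row` now carries 'Profile' and 'Graphemes' keys.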
def tokenizer(self):
    """
    Datasets can provide support for segmentation (aka tokenization) in two ways:
    - by providing an orthography profile at etc/orthography.tsv or
    - by overwriting this method to return a custom tokenizer callable.

    :return: A callable to do segmentation.

    The expected signature of the callable is

        def t(item, string, **kw)

    where
    - `item` is a `dict` representing the complete CLDF FormTable row
    - `string` is the string to be segmented
    - `kw` may be used to pass any context info to the tokenizer, when called
      explicitly.
    """
    profile = self.dir / 'etc' / 'orthography.tsv'
    if profile.exists():
        profile = Profile.from_file(str(profile), form='NFC')
        default_spec = list(next(iter(profile.graphemes.values())).keys())
        for grapheme in ['^', '$']:
            if grapheme not in profile.graphemes:
                profile.graphemes[grapheme] = {k: None for k in default_spec}
        profile.tree = Tree(list(profile.graphemes.keys()))
        tokenizer = Tokenizer(profile=profile, errors_replace=lambda c: '<{0}>'.format(c))

        def _tokenizer(item, string, **kw):
            kw.setdefault("column", "IPA")
            kw.setdefault("separator", " + ")
            return tokenizer(
                unicodedata.normalize('NFC', '^' + string + '$'), **kw).split()

        return _tokenizer
def main(args):
    # Initiate tokenizer and profile
    profile = Profile.from_file(args.profile)
    tokenizer = Tokenizer(profile=profile)

    # Open file and check items
    errors = []
    with open(args.wordlist) as handler:
        reader = csv.DictReader(handler, delimiter="\t")
        for count, row in enumerate(reader):
            segments = my_tokenizer(row[args.form], tokenizer)
            reference = row[args.segments]
            if segments != reference:
                errors.append([row["ID"], row[args.form], segments, reference])

            if args.l:
                if count > args.l:
                    break

    # Output
    print(tabulate(errors, headers=["ID", "Form", "Result", "Reference"]))
    print("Errors: %i/%i (%.2f%%)" %
          (len(errors), count + 1, (len(errors) / (count + 1)) * 100))
def tokenizer():
    return Tokenizer()
def test_tokenize_with_profile_from_object():
    prf = Profile(
        dict(Grapheme='aa', mapping=['x', 'y']),
        dict(Grapheme='b', mapping='z'))
    assert Tokenizer(profile=prf)('aab', column='mapping') == 'x y z'
def tokenizer_with_profile(profile_path):
    return Tokenizer(profile_path)
def cmd_makecldf(self, args):
    data = self.raw_dir.read_csv("tukano.csv", dicts=True)
    args.writer.add_sources()

    # Get our own tokenizer from the orthography profile;
    # because of multi-profile support, the orthography profile dict
    # has a single item, keyed by `None`.
    tokenizer = Tokenizer(profile=self.orthography_profile_dict[None])

    def _re_tokenize(segmented):
        """
        Generator of re-tokenized sequences.

        Used to re-tokenize alignments, which is needed due to changes
        in the orthography profile.

        Args:
            segmented: list of strings

        Generates:
            tokenized segments
        """
        preserve_chars = {"(", ")", "-"}
        for seg in segmented:
            if seg in preserve_chars:
                yield seg
            else:
                normalized = self.form_for_segmentation(seg)
                tokenized = tokenizer(normalized, column="IPA")
                for seg in tokenized.split(" "):
                    yield seg

    concept_lookup = {}
    for concept in self.conceptlists[0].concepts.values():
        c_id = "{0}-{1}".format(concept.id.split("-")[-1], slug(concept.english))
        concept_lookup[concept.english] = c_id
        args.writer.add_concept(
            ID=c_id,
            Concepticon_ID=concept.concepticon_id,
            Concepticon_Gloss=concept.concepticon_gloss,
            Name=concept.english,
        )

    language_lookup = {}
    for language in self.languages:
        args.writer.add_language(
            ID=language["ID"],
            Glottocode=language["Glottocode"],
            Name=language["Name"])
        language_lookup[language["ID_in_raw"]] = language["ID"]

    # add data
    for row in pylexibank.progressbar(data):
        language_id = language_lookup[row["DOCULECT"]]
        c_id = concept_lookup[row["CONCEPT"]]

        # The alignments were corrected by hand; when they differ from
        # the segments, the correct notation is in the alignments.
        tokens = row["TOKENS"].split()
        alignment = row["ALIGNMENT"].split(" ")
        stripped_alignments = [s for s in alignment if s not in {"(", "-", ")"}]
        if tokens != stripped_alignments:
            tokens = stripped_alignments

        lex = args.writer.add_form(
            Language_ID=language_id,
            Parameter_ID=c_id,
            Value=row["IPA"],
            # This is a workaround to re-tokenize tokens
            Form=".".join(tokens),
            Source=["Chacon2014"],
        )

        # add cognates -- make sure Cognateset_ID is global!
        args.writer.add_cognate(
            lexeme=lex,
            Cognateset_ID="{0}-{1}".format(c_id, row["COGID"]),
            Source=["Chacon2014"],
            Alignment=list(_re_tokenize(alignment)),
            Alignment_Method="expert",
            Alignment_Source="Chacon2014",
        )
#!/usr/bin/env python

"""Similarity code tentative cognates in a word list and align them"""

import sys
from pycldf.util import Path
import hashlib
import argparse

import lingpy
import lingpy.compare.partial

from pylexirumah import get_dataset

from segments import Tokenizer
from pyclts import TranscriptionSystem

tokenizer = Tokenizer()
bipa = TranscriptionSystem("bipa")


def sha1(path):
    return hashlib.sha1(str(path).encode('utf-8')).hexdigest()[:12]


def clean_segments(row):
    """Reduce the row's segments to not contain empty morphemes.

    This function removes all unknown sound segments (/0/) from the "Segments"
    list of the `row` dict it is passed, and removes empty morphemes by
    collapsing subsequent morpheme boundaries (_#◦+→←) into one. The `row` is
    modified in-place, the resulting cleaned segment list is returned.
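# The snippet above is truncated before the body of clean_segments(). A minimal
# sketch of the behaviour its docstring describes (an assumption, not the
# original implementation): drop "0" segments and collapse runs of
# morpheme-boundary markers into a single boundary.
def clean_segments_sketch(row):
    boundaries = set("_#◦+→←")
    cleaned = []
    for segment in row["Segments"]:
        if segment == "0":
            continue  # unknown sound segment
        if segment in boundaries and cleaned and cleaned[-1] in boundaries:
            continue  # collapse adjacent morpheme boundaries
        cleaned.append(segment)
    row["Segments"] = cleaned  # the row is modified in place, per the docstring
    return cleaned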
"e͡i", "a͡i", "o͡i", "u͡i", "a͡e", "o͡e", "e͡o", "a͡o", "i͡u", "e͡u", "a͡u", "o͡u", ]) tokenizer = Tokenizer(Profile(*({ "Grapheme": x, "mapping": x } for x in sounds)), errors_ignore=lambda c: c) from pylexirumah import get_dataset, repository def needleman_wunsch(x, y, lodict={}, gop=-2.5, gep=-1.75, local=False, indel=''): """Needleman-Wunsch algorithm with affine gaps penalties.
def test_characters():
    t = Tokenizer()
    assert t.characters(
        "ĉháɾã̌ctʼɛ↗ʐː| k͡p") == "c ̂ h a ́ ɾ a ̃ ̌ c t ʼ ɛ ↗ ʐ ː | # k ͡ p"
    assert t.characters('abc def', segment_separator='_', separator='|') == 'a_b_c|d_e_f'
def test_jipa(lang, testdata):
    tokenize = Tokenizer()
    assert tokenize(_read_data(testdata / (lang + '_input.txt')), ipa=True) == \
        _read_data(testdata / (lang + '_output.txt'))
def clean_for_lemmatization(word):
    word = remove_other_chars(word)
    word = replace_greek_word(word)
    word = replace_salus(word)
    word = replace_lacuna(word)
    word = replace_full_stop(word)
    word = replace_j(word)
    word = word.lower()
    return word


# Tokenize
tokenize_graphemes = Tokenizer(GRAPHEME_PROFILE)


def clean_and_tokenize(word, for_lemmatization=False):
    if for_lemmatization:
        word = clean_for_lemmatization(word)
    else:
        word = clean(word)
    graphemes = tokenize_graphemes(
        word, segment_separator=GRAPHEME_SEPARATOR, column="mapping"
    )
    if graphemes:
        return GRAPHEME_SEPARATOR.join([START_WORD, graphemes, END_WORD])
    return ""
def recode(s):
    t = Tokenizer(profile=_orthography_profile(s))
    return t(
        s.replace('\n', NEWLINE),
        column='IPA',
        segment_separator='',
        separator=' ').replace(SPACE, ' ')
def test_single_combining_character():
    assert Tokenizer()("ˈ", ipa=True) == "ˈ"
    assert Tokenizer()("ʲ", ipa=True) == "ʲ"
def cmd_install(self, **kw):
    # Read individual orthographic profiles, extract the corresponding
    # doculect ids (here, glottocodes), and build the appropriate
    # tokenizers
    profile_files = sorted(glob.glob(str(self.dir / "etc" / "*.prof")))
    doculect_codes = [
        os.path.splitext(os.path.basename(pf))[0] for pf in profile_files
    ]
    self.doc_tokenizers = {
        doculect: Tokenizer(
            profile=Profile.from_file(pf, form="NFC"),
            errors_replace=lambda c: "<{0}>".format(c),
        )
        for pf, doculect in zip(profile_files, doculect_codes)
    }

    # Cache the Concepticon IDs
    concepticon = {
        x.attributes["wold_id"]: x.concepticon_id
        for x in self.conceptlist.concepts.values()
    }

    # cache the field names for CLDF output
    fields = self.lexeme_class.fieldnames()

    # Write data to CLDF
    with self.cldf as ds:
        vocab_ids = [v["ID"] for v in self.original_cldf["contributions.csv"]]

        # add sources
        self.add_sources(ds)

        # add languages and build map for choosing the right profile
        lang_map = {}
        for row in self.original_cldf["LanguageTable"]:
            gc, iso = row["Glottocode"], row["ISO639P3code"]
            if gc == "tzot1264":
                gc, iso = "tzot1259", "tzo"
            if row["ID"] in vocab_ids:
                ds.add_language(
                    ID=row["ID"], Name=row["Name"], Glottocode=gc, ISO639P3code=iso)
            # Add to map only those which are receivers
            if int(row["ID"]) <= 41:
                lang_map[row["ID"]] = gc

        # add parameters
        for row in self.original_cldf["ParameterTable"]:
            ds.add_concept(
                ID=row["ID"],
                Name=row.pop("Name"),
                Concepticon_ID=concepticon.get(row["ID"]),
            )

        # Being explicit on what we are adding
        for row in self.original_cldf["FormTable"]:
            if row["Language_ID"] in vocab_ids:
                # Copy the raw Form to Value, clean form, and tokenize
                row["Value"] = row["Form"]
                row["Form"] = self.clean_form(row["Form"])
                row["Segments"] = self.tokenizer(
                    row["Form"], lang_map[row["Language_ID"]])

                # Note: We count words marked as "probably borrowed" as loans.
                row["Loan"] = float(row["BorrowedScore"]) > 0.6

                ds.add_form_with_segments(
                    **{k: v for k, v in row.items() if k in fields})
            if c in tones:
                tone += tones[c]
            else:
                new_segment += c

        if len(new_segment) > 0:
            result.append(new_segment)
        if len(tone) > 0:
            result.append(tone)

    return " ".join(result)


# Load tones.csv as lookup table
with open('op/tones.csv', mode='r') as infile:
    reader = csv.reader(infile)
    tones = {rows[0]: rows[1] for rows in reader}

# Heath2016 orthography profile
t = Tokenizer("op/Heath2016-profile.tsv")

# Dogon data to tokenize
df = pd.read_csv("data/dogon-wordlist-long.csv", index_col="ID")

# Tokenize
tokenizer = lambda x: t.transform(x, column="IPA")
tone_changer = lambda x: convert_tone(x)
df['TOKENS'] = pd.Series(df['COUNTERPART'].apply(tokenizer))
df['TOKENS_CHAO'] = pd.Series(df['TOKENS'].apply(tone_changer))
df['TOKENS'] = df['TOKENS'].str.strip()
df['TOKENS_CHAO'] = df['TOKENS_CHAO'].str.strip()

df.to_csv('data/dogon-wordlist-lingpy-format.csv')
# df.to_csv('final-wordlist-new-tone.tsv', sep="\t")
def get_orthography(name):
    return Tokenizer(Profile.from_file(profile_path(name + '.tsv'), form='NFD'))