def get_transformer(profile, exception=None):
    profile = lp.csv2list(cddb_path('profiles', profile), strip_lines=False)
    for i, line in enumerate(profile):
        profile[i] = [unicodedata.normalize('NFD', clpa.normalize(x)) for x in line]
    tokenizer = Tokenizer(profile, errors_replace=lambda x: "«{0}»".format(x))
    return lambda x, y: unicodedata.normalize(
        'NFC', tokenizer.transform(clpa.normalize(x), column=y, separator=' + ')
    )
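# Usage sketch for get_transformer: the returned closure takes a string and a
# profile column name. The profile filename 'Hmong.tsv' and the 'IPA' column
# here are hypothetical placeholders, not part of the original code, and the
# call assumes the module's cddb_path/clpa/lp helpers are available.
to_ipa = get_transformer('Hmong.tsv')
print(to_ipa('aabchonn-ih', 'IPA'))  # NFC-normalized segments joined by ' + '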
def check(lang):
    tokenize = Tokenizer()
    with codecs.open(_test_path(lang + '_input.txt'), "r", "utf-8") as infile:
        input = infile.read()
    with codecs.open(_test_path(lang + '_output.txt'), "r", "utf-8") as goldfile:
        gold = goldfile.read()
    tools.assert_equal(tokenize(input, ipa=True), gold)
def tokenize(args):
    """
    Tokenize a string (passed as argument or read from stdin)

    segments [--profile=PATH/TO/PROFILE] tokenize [STRING]
    """
    if args.profile and not Path(args.profile).exists():  # pragma: no cover
        raise ParserError('--profile must be a path for an existing file')
    _write(args, Tokenizer(profile=args.profile)(_read(args), column=args.mapping))
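# The subcommand above is a thin wrapper around the library call; a rough
# equivalent in plain Python (the profile path and the input string are
# made-up examples, and the profile file is assumed to exist):
from segments.tokenizer import Tokenizer

print(Tokenizer()('aabchonn-ih'))                           # no profile: plain grapheme tokenization
print(Tokenizer(profile='orthography.tsv')('aabchonn-ih'))  # with an orthography profile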
def inventories(dataset):
    clpa = get_clpa()
    files = glob(dataset.get_path('raw', 'inventories.tsv'))
    dialects = []
    t = Tokenizer(dataset.get_path('raw', 'profile.prf'))
    sounds = defaultdict(lambda: defaultdict(set))
    transform = lambda x, y: unicodedata.normalize('NFC', t.transform(x, y))
    invs = {l: [] for l in dataset.languages}
    for f in files:
        data = csv2list(f)
        for i, line in enumerate(data):
            number, dialect, page, sound, value, *rest = line
            if not rest:
                rest = ['']
            cddb = transform(value, 'CDDB')
            src = transform(value, 'SOURCE')
            struct = ' '.join(list(t.transform(value, 'STRUCTURE')))
            invs[dialect] += [[
                src.replace(' ', ''), cddb, struct, ', '.join(rest)
            ]]
            if len(struct.split()) != len(cddb.split()):
                print(i + 1, 'warn', struct, ' | ', cddb)
    dataset.write_inventories(invs)
        'doculect', 'doculect_old', 'concept', 'langid', 'value', 'form', 'cog',
        'tokens',
    ]
}

# converter from languages.tsv to the rest
langs_ = {b: c for a, b, c in csv2list('languages.tsv', strip_lines=False)}

# load the tokenizer
tk = Tokenizer('profile.tsv')

for i, line in enumerate(rest1):
    concept = line[0]
    words = dict(zip(header2, line))
    cogs = dict(zip(header2, rest2[i]))
    for l1, l2 in zip(lang1, lang2):
        forms, cogids = words[l2].split(','), cogs[l2].split(',')
        if len(cogids) > len(forms):
            cogids += ['', '', '']
        for form, cog in zip(forms, cogids):
            if form.strip() != "?":
                tks = tk(form.replace(' ', '_').replace('?', ''), 'IPA')
                if form.strip():
                    D[idx] = [
                        langs_.get(l2, '???'),
                        l2,
                        concept,
                        l1,
                        words[l2],
    ('ɪ̪ ', 'ɪ̪'),
    ('ɛ̪ '[1:], 'ɛ̪ '[1:-1]),
    ('²̙ ¹'[1:-1], ''),
    ('****', ''),
    ('ɔ̪'[1], ''),
    ('??', ''),
    ('²̘+³¹', '²¹³'),
]
for s, t in st:
    if s:
        form = form.replace(s, t)
return form.replace(' ', '+')


tk = Tokenizer('../etc/orthography.tsv')
D = {}
idx = 1
header = []
errors = defaultdict(list)
dips = defaultdict(int)
for line in data:
    # '汉义' ("Chinese gloss") marks the header row of the source table
    if line[0].strip() == '汉义':
        header = line
    else:
        print(line[0])
        lines = line[0].split(' ')
        number, concept = lines[0], ' '.join(lines[1:])
        for doculect, form in zip(header[1:], line[1:]):
            if form.strip():
from lingpy import *
from segments.tokenizer import Tokenizer  # https://github.com/bambooforest/segments

# create a tokenizer object
t = Tokenizer('orthography.tsv')

# make a dictionary to be passed to lingpy afterwards
D = {0: ['doculect', 'concept', 'concept_id', 'glottolog', 'word_is',
         'segments_is', 'segments', 'cog']}

correct = {
    'pa31 #': "pa31",
    "ɲɕʰɔ4 #": "ɲɕʰɔ4",
}

# load csv file with help of the csv2list function in lingpy
csv = csv2list('output.csv', sep=',')

for i, (concept_id, concept, language, glottolog, words, cogid) in enumerate(csv):
    # only take the second element if there are more words
    ipa = words.split(' ~ ')[-1]
    ipa = correct.get(ipa, ipa)
    cognacy = '{0}-{1}'.format(concept_id, cogid)
    D[i + 1] = [language, concept, concept_id, glottolog, words, t(ipa),
                t(ipa, 'IPA'), cognacy]

wl = Wordlist(D)
wl.renumber('cog')  # adds column with name "cogid"
wl.output('tsv', filename='hm-{0}-{1}'.format(wl.height, wl.width),
          prettify=False, ignore='all')
with open('languages.tsv', 'w') as f:
# import relevant modules
from segments.tokenizer import Tokenizer

# load the tokenizer object
tk = Tokenizer('data/P_simple-profile.tsv')

# convert a string to test it
print(tk('čathashadhža'))
print(tk('čathashadhža', column='IPA'))


from lingpy import *
from segments.tokenizer import Tokenizer

wl = Wordlist('data/P_input-file.tsv')
op = Tokenizer('data/P_modified-profile.tsv')
wl.add_entries('tokens', 'ipa', op, column='IPA')
wl.output('tsv', filename='data/P_output-file2', ignore='all', prettify=False)

for idx, doculect, form, tokens in wl.iter_rows('doculect', 'ipa', 'tokens'):
    if form != tokens.replace(' ', ''):
        print('{0:10} {1:10} {2:15}'.format(doculect, form, tokens))

wl = Wordlist('data/P_input-file.tsv')
op = Tokenizer('data/P_modified-context-profile.tsv')
wl.add_entries('tokens', 'ipa', lambda x: op('^' + x + '$', column='IPA'))
wl.output('tsv', filename='data/P_output-file', ignore='all', prettify=False)

for idx, doculect, form, tokens in wl.iter_rows('doculect', 'ipa', 'tokens'):
    if form != tokens.replace(' ', ''):
        print('{0:10} {1:10} {2:15}'.format(doculect, form, tokens))

lex = LexStat('data/P_output-file.tsv')
lex.cluster(method='sca', threshold=0.45, ref='cogid')
lex.output('tsv',
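# For quick experiments a profile can also be built in memory instead of being
# read from a TSV file; a sketch using segments' Profile class (the graphemes
# and IPA values below are invented for illustration):
from segments import Profile, Tokenizer

prf = Profile(
    dict(Grapheme='th', IPA='tʰ'),  # multigraph mapped via an IPA column
    dict(Grapheme='a', IPA='a'),
)
tk = Tokenizer(profile=prf)
print(tk('tha', column='IPA'))  # expected: 'tʰ a'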
# library
######################
import sys
from lingpy import *
from segments.tokenizer import Tokenizer
from lingpy.compare.partial import Partial
######################
# Task  : load in word list, segment, detect cognates, output cognates
# usage : python3 congate_detect_pipline.py wordlist token_template output
######################

# Preprocess
## load in required data: word list csv and phonetic data as segmentation template
csv = csv2list(sys.argv[1], sep=',')
t = Tokenizer(sys.argv[2])

## set the template for lingpy
D = {
    0: [
        'doculect', 'concept', 'concept_id', 'glottolog', 'word_is',
        'segments_is', 'segments', 'cog'
    ]
}
correct = {'pa31 #': "pa31", "ɲɕʰɔ4 #": "ɲɕʰɔ4"}

# Process start
## tokenize, full cognate
for i, (concept_id, concept, language, glottolog, words, cogid) in enumerate(csv):
    # only take the second element if there are more words
class TokenizerTestCase(unittest.TestCase):
    """ Tests for tokenizer.py """

    maxDiff = None  # for printing large output

    def setUp(self):
        self.t = Tokenizer(_test_path('test.prf'))

    def test_errors(self):
        t = Tokenizer(_test_path('test.prf'), errors_replace=lambda c: '<{0}>'.format(c))
        self.assertEqual(t('habe'), '<i> a b <e>')

        with self.assertRaises(ValueError):
            t('habe', form='xyz')

        with self.assertRaises(ValueError):
            t('habe', errors='strict')

        self.assertEqual(t('habe', errors='ignore'), 'a b')

    def test_boundaries(self):
        self.assertEqual(self.t('aa aa', separator=' _ '), ' b _ b')

    def test_normalization(self):
        t = Tokenizer()
        s = 'n\u0303a'
        self.assertEqual(t(s), 'n\u0303 a')
        self.assertEqual(t('\xf1a'), 'n\u0303 a')
        self.assertEqual(t(s, form='NFC'), '\xf1 a')

    def test_ipa(self):
        t = Tokenizer()
        self.assertEqual(t('\u02b0ello', ipa=True), '\u02b0e l l o')

    def test_tokenize_with_profile(self):
        self.assertEqual(self.t('aa'), ' b')

    def test_tokenize_with_profile_from_object(self):
        prf = Profile(dict(Grapheme='aa', mapping='xy'), dict(Grapheme='b', mapping='z'))
        self.assertEqual(Tokenizer(profile=prf)('aab', column='mapping'), 'xy z')

    def test_tokenize_without_profile(self):
        self.assertEqual(Tokenizer()('aa', form='NFC'), 'a a')

    def test_printTree(self):
        stream = StringIO()
        self.t.op.tree.printTree(self.t.op.tree.root, stream=stream)
        stream.seek(0)
        self.assertIn('a* -- a*', stream.read().split('\n'))
        printMultigraphs(self.t.op.tree.root, '', '')
        printMultigraphs(self.t.op.tree.root, 'abcd', '')

    def test_characters(self):
        t = Tokenizer()
        result = t.characters("ĉháɾã̌ctʼɛ↗ʐː| k͡p")
        self.assertEqual(result, "c ̂ h a ́ ɾ a ̃ ̌ c t ʼ ɛ ↗ ʐ ː | # k ͡ p")

    def test_grapheme_clusters(self):
        t = Tokenizer()
        result = t.grapheme_clusters("ĉháɾã̌ctʼɛ↗ʐː| k͡p")
        self.assertEqual(result, "ĉ h á ɾ ã̌ c t ʼ ɛ ↗ ʐ ː | # k͡ p")

    def test_graphemes(self):
        t = Tokenizer()
        self.assertEqual(t.graphemes("aabchonn-ih"), "a a b c h o n n - i h")
        self.assertEqual(self.t.graphemes("aabchonn-ih"), "aa b ch on n - ih")

    def test_transform1(self):
        self.assertEqual(self.t.transform("aabchonn-ih"), "aa b ch on n - ih")
        with self.assertRaises(ValueError):
            Tokenizer().transform('abc')
        with self.assertRaises(ValueError):
            self.assertEqual(self.t.transform("aabchonn-ih", 'xx'), "aa b ch on n - ih")

    def test_transform2(self):
        result = self.t.transform("aabchonn-ih", "IPA")
        self.assertEqual(result, "aː b tʃ õ n í")

    def test_transform3(self):
        result = self.t.transform("aabchonn-ih", "XSAMPA")
        self.assertEqual(result, "a: b tS o~ n i_H")

    def test_rules(self):
        self.assertEqual(Tokenizer().rules('abc'), 'abc')
        result = self.t.rules("aabchonn-ih")
        self.assertEqual(result, " ii-ii")

    def test_transform_rules(self):
        result = self.t.transform_rules("aabchonn-ih")
        self.assertEqual(result, " b b ii - ii")

    def test_find_missing_characters(self):
        result = self.t.find_missing_characters("aa b ch on n - ih x y z")
        self.assertEqual(result, "aa b ch on n - ih \ufffd \ufffd \ufffd")
        t = Tokenizer(_test_path('test.prf'), errors_replace=lambda c: '?')
        result = t.find_missing_characters("aa b ch on n - ih x y z")
        self.assertEqual(result, "aa b ch on n - ih ? ? ?")
from lingpy import *
from segments.tokenizer import Tokenizer

wl = Wordlist('wordlist-750.tsv')
ops = {}
for k, lang, form, segments in iter_rows(wl, 'doculect', 'form', 'segments'):
    if lang not in ops:
        print('lang', lang)
        ops[lang] = Tokenizer(lang + '.orthography.tsv')
    wl[k, 'segments'] = ops[lang].transform(
        form.replace(' ', '_'), column='IPA', exception={"#": "#"})
wl.output('tsv', filename='wordlist-750-segmented', ignore='all')
    'y': 'ʸ',
    'ə': 'ᵊ',
    'ŭ': 'ᵘ̆',
    'z': 'ᶻ',
    'gy': 'ᵍʸ',
    'h': 'ʰ',
    'ŕ': 'ʳ́',
    'ĕ': 'ᵉ̆',
    'n': 'ⁿ'
}

tokenizers = {}
for file in glob.glob("data/ipa/cdial/*.txt"):
    lang = file.split('/')[-1].split('.')[0]
    tokenizers[lang] = Tokenizer(file)

with open('data/all.json', 'r') as fin:
    data = json.load(fin)

with open('cldf/cognates.csv', 'w') as fout, open('cldf/parameters.csv', 'w') as fout2, open('data/extensions_ia.csv', 'r') as fin:
    write = csv.writer(fout)
    write2 = csv.writer(fout2)
    read = csv.reader(fin)
    write.writerow(['Cognateset_ID', 'Language_ID', 'Form', 'Description', 'Source'])
    write2.writerow(['ID', 'Name', 'Concepticon_ID', 'Description'])
    for entry in data: