Example #1
    def test_find_missing_characters(self):
        result = self.t.find_missing_characters("aa b ch on n - ih x y z")
        self.assertEqual(result, "aa b ch on n - ih \ufffd \ufffd \ufffd")

        t = Tokenizer(_test_path('test.prf'), errors_replace=lambda c: '?')
        result = t.find_missing_characters("aa b ch on n - ih x y z")
        self.assertEqual(result, "aa b ch on n - ih ? ? ?")
Example #2
File: util.py Project: MacyL/cddb
def get_transformer(profile, exception=None):
    
    profile = lp.csv2list(cddb_path('profiles', profile), strip_lines=False)
    for i, line in enumerate(profile):
        profile[i] = [unicodedata.normalize('NFD', clpa.normalize(x)) for x in line]
    tokenizer = Tokenizer(profile, errors_replace=lambda x: "«{0}»".format(x))
    
    return lambda x, y: unicodedata.normalize(
            'NFC',
            tokenizer.transform(clpa.normalize(x), column=y, separator=' + ')
            )
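A hypothetical call to the helper above; the profile name and the column are placeholders for illustration, not actual files or columns from the cddb project.

# Hypothetical usage of get_transformer(); 'some_profile.tsv' and 'IPA' are placeholders.
transform = get_transformer('some_profile.tsv')
print(transform('tsha', 'IPA'))  # NFC-normalized output with segments joined by ' + '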
Example #3
 def check(lang):
     tokenize = Tokenizer()
     with codecs.open(_test_path(lang + '_input.txt'), "r", "utf-8") as infile:
         input = infile.read()
     with codecs.open(_test_path(lang + '_output.txt'), "r", "utf-8") as goldfile:
         gold = goldfile.read()
     tools.assert_equal(tokenize(input, ipa=True), gold)
Example #4
    def test_transform1(self):
        self.assertEqual(self.t.transform("aabchonn-ih"), "aa b ch on n - ih")

        with self.assertRaises(ValueError):
            Tokenizer().transform('abc')

        with self.assertRaises(ValueError):
            self.assertEqual(self.t.transform("aabchonn-ih", 'xx'), "aa b ch on n - ih")
Example #5
def tokenize(args):
    """
    Tokenize a string (passed as argument or read from stdin)

    segments [--profile=PATH/TO/PROFILE] tokenize [STRING]
    """
    if args.profile and not Path(args.profile).exists():  # pragma: no cover
        raise ParserError('--profile must be a path for an existing file')
    _write(args,
           Tokenizer(profile=args.profile)(_read(args), column=args.mapping))
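For reference, a rough in-Python equivalent of what this CLI subcommand does; the _read/_write helpers and the args object belong to the surrounding CLI module and are not reproduced here.

# Minimal sketch: the same tokenization done directly, without the CLI plumbing.
from segments.tokenizer import Tokenizer

tokenizer = Tokenizer(profile=None)  # or Tokenizer(profile='PATH/TO/PROFILE')
print(tokenizer('abcd'))             # -> 'a b c d' when no profile is given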
Example #6
    def test_errors(self):
        t = Tokenizer(_test_path('test.prf'), errors_replace=lambda c: '<{0}>'.format(c))
        self.assertEqual(t('habe'), '<i> a b <e>')

        with self.assertRaises(ValueError):
            t('habe', form='xyz')

        with self.assertRaises(ValueError):
            t('habe', errors='strict')

        self.assertEqual(t('habe', errors='ignore'), 'a b')
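Taken together, the calls above cover the error-handling options the Tokenizer accepts: a custom errors_replace callback, errors='strict' (which raises ValueError on graphemes missing from the profile), and errors='ignore' (which drops them); the default behaviour, visible in Examples #1 and #19, substitutes U+FFFD.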
Example #7
def inventories(dataset):
    clpa = get_clpa()
    files = glob(dataset.get_path('raw', 'inventories.tsv'))
    dialects = []

    t = Tokenizer(dataset.get_path('raw', 'profile.prf'))
    sounds = defaultdict(lambda: defaultdict(set))
    transform = lambda x, y: unicodedata.normalize('NFC', t.transform(x, y))
    invs = {l: [] for l in dataset.languages}
    for f in files:
        data = csv2list(f)
        for i, line in enumerate(data):
            number, dialect, page, sound, value, *rest = line
            if not rest: rest = ['']
            cddb = transform(value, 'CDDB')
            src = transform(value, 'SOURCE')
            struct = ' '.join(list(t.transform(value, 'STRUCTURE')))
            invs[dialect] += [[
                src.replace(' ', ''), cddb, struct, ', '.join(rest)
            ]]
            if len(struct.split()) != len(cddb.split()):
                print(i + 1, 'warn', struct, '   |   ', cddb)
    dataset.write_inventories(invs)
Example #8
        'doculect',
        'doculect_old',
        'concept',
        'langid',
        'value',
        'form',
        'cog',
        'tokens',
    ]
}

# converter from languages.tsv to the rest
langs_ = {b: c for a, b, c in csv2list('languages.tsv', strip_lines=False)}

# load the tokenizer
tk = Tokenizer('profile.tsv')

for i, line in enumerate(rest1):
    concept = line[0]
    words = dict(zip(header2, line))
    cogs = dict(zip(header2, rest2[i]))
    for l1, l2 in zip(lang1, lang2):
        forms, cogids = words[l2].split(','), cogs[l2].split(',')
        if len(cogids) > len(forms):
            cogids += ['', '', '']
        for form, cog in zip(forms, cogids):
            if form.strip() != "?":
                tks = tk(form.replace(' ', '_').replace('?', ''), 'IPA')
                if form.strip():
                    D[idx] = [
                        langs_.get(l2, '???'), l2, concept, l1, words[l2],
Example #9
            ('ɪ̪ ', 'ɪ̪'),
            ('ɛ̪ '[1:], 'ɛ̪ '[1:-1]),
            ('²̙ ¹'[1:-1], ''),
            ('****', ''),
            ('ɔ̪'[1], ''),
            ('??', ''),
            ('²̘+³¹','²¹³'),
            ]
    for s, t in st:
        if s:
            form = form.replace(s, t)


    return form.replace(' ', '+')

tk = Tokenizer('../etc/orthography.tsv')

D = {}
idx = 1
header = []
errors = defaultdict(list)
dips = defaultdict(int)
for line in data:
    if line[0].strip() == '汉义':  # '汉义' ("Chinese gloss") marks the header row
        header = line
    else:
        print(line[0])
        lines = line[0].split(' ')
        number, concept = lines[0], ' '.join(lines[1:])
        for doculect, form in zip(header[1:], line[1:]):
            if form.strip():
Example #10
from lingpy import *
from segments.tokenizer import Tokenizer # https://github.com/bambooforest/segments

# create a tokenizer object
t = Tokenizer('orthography.tsv')

# make a dictionary to be passed to lingpy afterwards
D = {0: ['doculect', 'concept', 'concept_id', 'glottolog', 'word_is',
    'segments_is', 'segments', 'cog']}

correct = {
        'pa31 #': "pa31",
        "ɲɕʰɔ4 #": "ɲɕʰɔ4",
        }

# load csv file with help of the csv2list function in lingpy
csv = csv2list('output.csv', sep=',')
for i, (concept_id, concept, language, glottolog, words, cogid) in enumerate(csv):

    # only take the second element if there are more words
    ipa = words.split(' ~ ')[-1]
    ipa = correct.get(ipa, ipa)
    cognacy = '{0}-{1}'.format(concept_id, cogid)
    D[i+1] = [language, concept, concept_id, glottolog, words, t(ipa), t(ipa, 'IPA'), cognacy]
wl = Wordlist(D)
wl.renumber('cog') # adds column with name "cogid"
wl.output('tsv', filename='hm-{0}-{1}'.format(wl.height, wl.width),
        prettify=False, ignore='all')

with open('languages.tsv', 'w') as f:
Example #11
# import relevant modules
from segments.tokenizer import Tokenizer

# load the tokenizer object
tk = Tokenizer('data/P_simple-profile.tsv')

# convert a string to test it
print(tk('čathashadhža'))
print(tk('čathashadhža', column='IPA'))
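# Note: the first call uses the profile's default output column, while
# column='IPA' maps each segmented grapheme to the profile's IPA column instead.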

from lingpy import *
from segments.tokenizer import Tokenizer
wl = Wordlist('data/P_input-file.tsv')
op = Tokenizer('data/P_modified-profile.tsv')
wl.add_entries('tokens', 'ipa', op, column='IPA')
wl.output('tsv', filename='data/P_output-file2', ignore='all', prettify=False)
for idx, doculect, form, tokens in wl.iter_rows('doculect', 'ipa', 'tokens'):
    if form != tokens.replace(' ', ''):
        print('{0:10} {1:10} {2:15}'.format(doculect, form, tokens))

wl = Wordlist('data/P_input-file.tsv')
op = Tokenizer('data/P_modified-context-profile.tsv')
wl.add_entries('tokens', 'ipa', lambda x: op('^' + x + '$', column='IPA'))
wl.output('tsv', filename='data/P_output-file', ignore='all', prettify=False)
for idx, doculect, form, tokens in wl.iter_rows('doculect', 'ipa', 'tokens'):
    if form != tokens.replace(' ', ''):
        print('{0:10} {1:10} {2:15}'.format(doculect, form, tokens))

lex = LexStat('data/P_output-file.tsv')
lex.cluster(method='sca', threshold=0.45, ref='cogid')
lex.output('tsv',
Example #12
 def test_characters(self):
     t = Tokenizer()
     result = t.characters("ĉháɾã̌ctʼɛ↗ʐː| k͡p")
     self.assertEqual(result, "c ̂ h a ́ ɾ a ̃ ̌ c t ʼ ɛ ↗ ʐ ː | # k ͡ p")
Example #13
 def test_tokenize_without_profile(self):
     self.assertEqual(Tokenizer()('aa', form='NFC'), 'a a')
Example #14
 def test_rules(self):
     self.assertEqual(Tokenizer().rules('abc'), 'abc')
     result = self.t.rules("aabchonn-ih")
     self.assertEqual(result, "  ii-ii")
Example #15
 def test_ipa(self):
     t = Tokenizer()
     self.assertEqual(t('\u02b0ello', ipa=True), '\u02b0e l l o')
Example #16
 def test_normalization(self):
     t = Tokenizer()
     s = 'n\u0303a'
     self.assertEqual(t(s), 'n\u0303 a')
     self.assertEqual(t('\xf1a'), 'n\u0303 a')
     self.assertEqual(t(s, form='NFC'), '\xf1 a')
Example #17
# library
######################
import sys
from lingpy import *
from segments.tokenizer import Tokenizer
from lingpy.compare.partial import Partial

######################
# Task : load in word list, segment, detect cognate, output cognate
# usage : python3 congate_detect_pipline.py wordlist token_template output
######################

# Preprocess
## load in required data: word list csv and phonetic data as segmentation template
csv = csv2list(sys.argv[1], sep=',')
t = Tokenizer(sys.argv[2])

## set the template for lingpy
D = {
    0: [
        'doculect', 'concept', 'concept_id', 'glottolog', 'word_is',
        'segments_is', 'segments', 'cog'
    ]
}

correct = {'pa31 #': "pa31", "ɲɕʰɔ4 #": "ɲɕʰɔ4"}
# Process start
## tokenize, full cognate
for i, (concept_id, concept, language, glottolog, words,
        cogid) in enumerate(csv):
    # only take the second element if there are more words
Example #18
 def setUp(self):
     self.t = Tokenizer(_test_path('test.prf'))
Example #19
class TokenizerTestCase(unittest.TestCase):
    """ Tests for tokenizer.py """
    maxDiff = None  # for printing large output

    def setUp(self):
        self.t = Tokenizer(_test_path('test.prf'))

    def test_errors(self):
        t = Tokenizer(_test_path('test.prf'), errors_replace=lambda c: '<{0}>'.format(c))
        self.assertEqual(t('habe'), '<i> a b <e>')

        with self.assertRaises(ValueError):
            t('habe', form='xyz')

        with self.assertRaises(ValueError):
            t('habe', errors='strict')

        self.assertEqual(t('habe', errors='ignore'), 'a b')

    def test_boundaries(self):
        self.assertEqual(self.t('aa aa', separator=' _ '), ' b _  b')

    def test_normalization(self):
        t = Tokenizer()
        s = 'n\u0303a'
        self.assertEqual(t(s), 'n\u0303 a')
        self.assertEqual(t('\xf1a'), 'n\u0303 a')
        self.assertEqual(t(s, form='NFC'), '\xf1 a')

    def test_ipa(self):
        t = Tokenizer()
        self.assertEqual(t('\u02b0ello', ipa=True), '\u02b0e l l o')

    def test_tokenize_with_profile(self):
        self.assertEqual(self.t('aa'), ' b')

    def test_tokenize_with_profile_from_object(self):
        prf = Profile(dict(Grapheme='aa', mapping='xy'), dict(Grapheme='b', mapping='z'))
        self.assertEqual(Tokenizer(profile=prf)('aab', column='mapping'), 'xy z')

    def test_tokenize_without_profile(self):
        self.assertEqual(Tokenizer()('aa', form='NFC'), 'a a')

    def test_printTree(self):
        stream = StringIO()
        self.t.op.tree.printTree(self.t.op.tree.root, stream=stream)
        stream.seek(0)
        self.assertIn('a* -- a*', stream.read().split('\n'))
        printMultigraphs(self.t.op.tree.root, '', '')
        printMultigraphs(self.t.op.tree.root, 'abcd', '')

    def test_characters(self):
        t = Tokenizer()
        result = t.characters("ĉháɾã̌ctʼɛ↗ʐː| k͡p")
        self.assertEqual(result, "c ̂ h a ́ ɾ a ̃ ̌ c t ʼ ɛ ↗ ʐ ː | # k ͡ p")

    def test_grapheme_clusters(self):
        t = Tokenizer()
        result = t.grapheme_clusters("ĉháɾã̌ctʼɛ↗ʐː| k͡p")
        self.assertEqual(result, "ĉ h á ɾ ã̌ c t ʼ ɛ ↗ ʐ ː | # k͡ p")

    def test_graphemes(self):
        t = Tokenizer()
        self.assertEqual(t.graphemes("aabchonn-ih"), "a a b c h o n n - i h")
        self.assertEqual(self.t.graphemes("aabchonn-ih"), "aa b ch on n - ih")

    def test_transform1(self):
        self.assertEqual(self.t.transform("aabchonn-ih"), "aa b ch on n - ih")

        with self.assertRaises(ValueError):
            Tokenizer().transform('abc')

        with self.assertRaises(ValueError):
            self.assertEqual(self.t.transform("aabchonn-ih", 'xx'), "aa b ch on n - ih")

    def test_transform2(self):
        result = self.t.transform("aabchonn-ih", "IPA")
        self.assertEqual(result, "aː b tʃ õ n í")

    def test_transform3(self):
        result = self.t.transform("aabchonn-ih", "XSAMPA")
        self.assertEqual(result, "a: b tS o~ n i_H")

    def test_rules(self):
        self.assertEqual(Tokenizer().rules('abc'), 'abc')
        result = self.t.rules("aabchonn-ih")
        self.assertEqual(result, "  ii-ii")

    def test_transform_rules(self):
        result = self.t.transform_rules("aabchonn-ih")
        self.assertEqual(result, " b b ii - ii")

    def test_find_missing_characters(self):
        result = self.t.find_missing_characters("aa b ch on n - ih x y z")
        self.assertEqual(result, "aa b ch on n - ih \ufffd \ufffd \ufffd")

        t = Tokenizer(_test_path('test.prf'), errors_replace=lambda c: '?')
        result = t.find_missing_characters("aa b ch on n - ih x y z")
        self.assertEqual(result, "aa b ch on n - ih ? ? ?")
Example #20
from lingpy import *
from segments.tokenizer import Tokenizer

wl = Wordlist('wordlist-750.tsv')
ops = {}
for k, lang, form, segments in iter_rows(wl, 'doculect', 'form', 'segments'):
    if lang not in ops:
        print('lang', lang)
        ops[lang] = Tokenizer(lang + '.orthography.tsv')
    wl[k, 'segments'] = ops[lang].transform(form.replace(' ', '_'),
                                            column='IPA',
                                            exception={"#": "#"})
wl.output('tsv', filename='wordlist-750-segmented', ignore='all')
Example #21
 def test_grapheme_clusters(self):
     t = Tokenizer()
     result = t.grapheme_clusters("ĉháɾã̌ctʼɛ↗ʐː| k͡p")
     self.assertEqual(result, "ĉ h á ɾ ã̌ c t ʼ ɛ ↗ ʐ ː | # k͡ p")
Example #22
 def test_tokenize_with_profile_from_object(self):
     prf = Profile(dict(Grapheme='aa', mapping='xy'), dict(Grapheme='b', mapping='z'))
     self.assertEqual(Tokenizer(profile=prf)('aab', column='mapping'), 'xy z')
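A variation on the object-based profile above, with made-up graphemes, sketching how a Profile can carry several mapping columns (as test.prf does with IPA and XSAMPA) and how the column argument selects between them; the expected outputs follow the pattern of the 'mapping' column test above.

from segments import Profile
from segments.tokenizer import Tokenizer

# Hypothetical profile with two alternative mapping columns.
prf = Profile(
    dict(Grapheme='ch', IPA='tʃ', XSAMPA='tS'),
    dict(Grapheme='a', IPA='a', XSAMPA='a'),
)
t = Tokenizer(profile=prf)
print(t('cha', column='IPA'))     # expected: 'tʃ a'
print(t('cha', column='XSAMPA'))  # expected: 'tS a'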
Example #23
    'y': 'ʸ',
    'ə': 'ᵊ',
    'ŭ': 'ᵘ̆',
    'z': 'ᶻ',
    'gy': 'ᵍʸ',
    'h': 'ʰ',
    'ŕ': 'ʳ́',
    'ĕ': 'ᵉ̆',
    'n': 'ⁿ'
}

tokenizers = {}

for file in glob.glob("data/ipa/cdial/*.txt"):
    lang = file.split('/')[-1].split('.')[0]
    tokenizers[lang] = Tokenizer(file)

with open('data/all.json', 'r') as fin:
    data = json.load(fin)

with open('cldf/cognates.csv',
          'w') as fout, open('cldf/parameters.csv',
                             'w') as fout2, open('data/extensions_ia.csv',
                                                 'r') as fin:
    write = csv.writer(fout)
    write2 = csv.writer(fout2)
    read = csv.reader(fin)
    write.writerow(
        ['Cognateset_ID', 'Language_ID', 'Form', 'Description', 'Source'])
    write2.writerow(['ID', 'Name', 'Concepticon_ID', 'Description'])
    for entry in data:
Example #24
 def test_graphemes(self):
     t = Tokenizer()
     self.assertEqual(t.graphemes("aabchonn-ih"), "a a b c h o n n - i h")
     self.assertEqual(self.t.graphemes("aabchonn-ih"), "aa b ch on n - ih")
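The last few test examples contrast the Tokenizer's three levels of segmentation; the sketch below (no profile loaded, comments reflect expectations based on the assertions above) runs them side by side.

from segments.tokenizer import Tokenizer

t = Tokenizer()
word = 'aabchonn-ih'
print(t.characters(word))         # one token per code point; spaces would show up as '#'
print(t.grapheme_clusters(word))  # Unicode grapheme clusters: combining marks stay attached
print(t.graphemes(word))          # without a profile this matches the plain clustering above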