Example #1
File: base.py Project: glottobank/clpa
    def __init__(self,
                 whitelist=None,
                 alias=None,
                 delete=None,
                 explicit=None,
                 patterns=None,
                 accents=None,
                 rules=None):
        self.whitelist = whitelist or load_whitelist()
        self.alias = alias or load_alias('alias.tsv')
        # default: strip tie bars (U+0361, U+035C) and the combining acute accent (U+0301)
        self.delete = delete or ['\u0361', '\u035c', '\u0301']
        self.explicit = explicit or load_alias('explicit.tsv')
        self.patterns = patterns or load_alias('patterns.tsv')
        self.accents = accents or "ˈˌ'"
        self.rules = rules or []
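A minimal usage sketch (an assumption, not shown in the snippet: the containing class is CLPA under pyclpa.base, as the get_clpa() calls in the later examples suggest). Every argument is optional; anything omitted falls back to the bundled data files:

    from pyclpa.base import CLPA  # assumed import path

    clpa = CLPA()                          # all defaults, loaded from the packaged TSV files
    clpa = CLPA(accents="ˈˌ'", rules=[])   # or override selected parameters explicitly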
Example #2
    def check(self, column='TOKENS', rules=False, clpa=None):
        clpa = clpa or get_clpa()

        if rules:
            rules = load_alias(rules)
            for val in self:
                tokens = [
                    rules[t] if t in rules else t for t in split(val[column])
                ]
                val[column] = join(tokens)

        sounds, errors = {}, Counter({'convertable': 0, 'non-convertable': 0})
        for item in self:
            new_tokens, sounds, errors = clpa.check_sequence(
                split(item[column]), sounds=sounds, errors=errors)
            idxs = [clpa.segment2clpa(t) for t in new_tokens]

            #    new_tokens.append(accent + sounds[token]['clpa'])
            #    idxs.append(sounds[token]['id'])
            item['CLPA_TOKENS'] = join(new_tokens)
            item['CLPA_IDS'] = join(idxs)

        return sounds, errors
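A rough usage sketch, assuming the class defining check iterates over row dicts keyed by column name; the Wordlist name and constructor below are illustrative, not taken from the snippet:

    wl = Wordlist('wordlist.tsv')               # hypothetical wordlist holding TOKENS per row
    sounds, errors = wl.check(column='TOKENS')  # writes CLPA_TOKENS and CLPA_IDS into each row
    print(errors['convertable'], errors['non-convertable'])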
Example #3
File: base.py Project: LinguList/clpa
    def __init__(self,
                 whitelist=None,
                 alias=None,
                 delete=None,
                 explicit=None,
                 patterns=None,
                 accents=None,
                 rules=None,
                 normalized=None):
        self.whitelist = whitelist or load_whitelist()
        self.alias = alias or load_alias('alias.tsv')
        self.delete = delete or ['\u0361', '\u035c', '\u0301']
        self.explicit = explicit or load_alias('explicit.tsv')
        self.patterns = patterns or load_alias('patterns.tsv')
        self.accents = accents or "ˈˌ'"
        self.rules = rules or []
        self.normalized = normalized or load_normalized('normalized.tsv')
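This variant (from the LinguList fork) matches Example #1 except for the added normalized parameter, whose default table is read from normalized.tsv, presumably a set of Unicode normalization replacements applied to incoming tokens.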
Example #4
    def test_find_token(self):
        from pyclpa.util import find_token, load_whitelist, load_alias

        wl = load_whitelist()
        patterns = load_alias('patterns.tsv')
        assert not find_token('t', {}, {}, {}, {}, [])
        assert find_token('t', wl, {}, {}, {}, []) == 't'
        assert find_token('th', wl, {'h': 'ʰ'}, {}, {}, []) == 'tʰ'
        assert find_token('th', wl, {}, {'th': 'x'}, {}, []) == 'x'
        with self.assertRaises(ValueError):
            find_token('th', wl, {}, {'th': 'X'}, {}, [])
        assert find_token('th', wl, {}, {}, patterns, []) == 'tʰ'
        assert find_token('th', wl, {}, {}, {}, ['h']) == 't'
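The assertions step through find_token's lookup stages: an empty whitelist yields nothing, a plain whitelist hit returns the token unchanged ('t'), an alias rewrites part of the token ('h' → 'ʰ', giving the whitelisted 'tʰ'), an explicit mapping replaces the whole token ('th' → 'x') and raises ValueError when the replacement ('X') is apparently not an acceptable segment, a pattern from patterns.tsv rewrites 'th' to 'tʰ', and the delete list strips 'h' before matching.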
Example #5
    def check(self, column='TOKENS', rules=False, clpa=None):
        clpa = clpa or get_clpa()

        if rules:
            rules = load_alias(rules)
            for val in self:
                tokens = [rules[t] if t in rules else t for t in split(val[column])]
                val[column] = join(tokens)

        sounds, errors = {}, Counter({'convertable': 0, 'non-convertable': 0})
        for item in self:
            new_tokens, sounds, errors = clpa.check_sequence(
                split(item[column]), sounds=sounds, errors=errors)
            idxs = [clpa.segment2clpa(t) for t in new_tokens]

            #    new_tokens.append(accent + sounds[token]['clpa'])
            #    idxs.append(sounds[token]['id'])
            item['CLPA_TOKENS'] = join(new_tokens)
            item['CLPA_IDS'] = join(idxs)

        return sounds, errors
Example #6
    def test_load_alias(self):
        from pyclpa.util import local_path, load_alias

        assert load_alias(local_path('alias.tsv'))['ɡ'] == 'g'
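The key and the value look alike but are different code points: the alias table maps the IPA symbol 'ɡ' (U+0261, LATIN SMALL LETTER SCRIPT G) to the plain ASCII letter 'g' (U+0067).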