def __init__(self, whitelist=None, alias=None, delete=None, explicit=None,
             patterns=None, accents=None, rules=None):
    """Set up the checker's lookup tables, loading packaged defaults for
    any argument left as ``None``.

    :param whitelist: whitelist of valid segments (default: packaged whitelist)
    :param alias: single-character alias mapping (default: ``alias.tsv``)
    :param delete: characters stripped from tokens (default: tie bars and
        combining acute accent)
    :param explicit: whole-token replacement mapping (default: ``explicit.tsv``)
    :param patterns: pattern replacement mapping (default: ``patterns.tsv``)
    :param accents: characters treated as stress/accent marks
    :param rules: extra replacement rules (default: empty list)
    """
    # Explicit `is None` checks so callers may pass empty containers or an
    # empty string (e.g. delete=[], accents='') without them being silently
    # replaced by the defaults, as the previous `x or default` idiom did.
    self.whitelist = load_whitelist() if whitelist is None else whitelist
    self.alias = load_alias('alias.tsv') if alias is None else alias
    # U+0361 / U+035C are tie bars, U+0301 is the combining acute accent.
    self.delete = ['\u0361', '\u035c', '\u0301'] if delete is None else delete
    self.explicit = load_alias('explicit.tsv') if explicit is None else explicit
    self.patterns = load_alias('patterns.tsv') if patterns is None else patterns
    self.accents = "ˈˌ'" if accents is None else accents
    self.rules = [] if rules is None else rules
def check(self, column='TOKENS', rules=False, clpa=None):
    """Check every token in *column* against the CLPA whitelist.

    If *rules* is a file name, its replacement rules are first applied to
    the tokens in *column* (mutating the rows in place). Each row then gets
    two new columns: ``CLPA_TOKENS`` (checked tokens) and ``CLPA_IDS``
    (the corresponding CLPA identifiers).

    :param column: name of the column holding the tokens to check.
    :param rules: optional file name of a replacement-rule table.
    :param clpa: CLPA instance to check against (default: ``get_clpa()``).
    :return: tuple ``(sounds, errors)`` accumulated over all rows.
    """
    clpa = clpa or get_clpa()
    if rules:
        # Keep the loaded table in its own name instead of rebinding the
        # `rules` parameter from str to dict, and use dict.get for lookup.
        replacements = load_alias(rules)
        for val in self:
            tokens = [replacements.get(t, t) for t in split(val[column])]
            val[column] = join(tokens)
    sounds, errors = {}, Counter({'convertable': 0, 'non-convertable': 0})
    for item in self:
        # check_sequence threads the accumulators through each row.
        new_tokens, sounds, errors = clpa.check_sequence(
            split(item[column]), sounds=sounds, errors=errors)
        idxs = [clpa.segment2clpa(t) for t in new_tokens]
        item['CLPA_TOKENS'] = join(new_tokens)
        item['CLPA_IDS'] = join(idxs)
    return sounds, errors
def __init__(self, whitelist=None, alias=None, delete=None, explicit=None,
             patterns=None, accents=None, rules=None, normalized=None):
    """Set up the checker's lookup tables, loading packaged defaults for
    any argument left as ``None``.

    :param whitelist: whitelist of valid segments (default: packaged whitelist)
    :param alias: single-character alias mapping (default: ``alias.tsv``)
    :param delete: characters stripped from tokens (default: tie bars and
        combining acute accent)
    :param explicit: whole-token replacement mapping (default: ``explicit.tsv``)
    :param patterns: pattern replacement mapping (default: ``patterns.tsv``)
    :param accents: characters treated as stress/accent marks
    :param rules: extra replacement rules (default: empty list)
    :param normalized: normalization mapping (default: ``normalized.tsv``)
    """
    # Explicit `is None` checks so callers may pass empty containers or an
    # empty string (e.g. delete=[], accents='') without them being silently
    # replaced by the defaults, as the previous `x or default` idiom did.
    self.whitelist = load_whitelist() if whitelist is None else whitelist
    self.alias = load_alias('alias.tsv') if alias is None else alias
    # U+0361 / U+035C are tie bars, U+0301 is the combining acute accent.
    self.delete = ['\u0361', '\u035c', '\u0301'] if delete is None else delete
    self.explicit = load_alias('explicit.tsv') if explicit is None else explicit
    self.patterns = load_alias('patterns.tsv') if patterns is None else patterns
    self.accents = "ˈˌ'" if accents is None else accents
    self.rules = [] if rules is None else rules
    self.normalized = (load_normalized('normalized.tsv')
                       if normalized is None else normalized)
def test_find_token(self):
    """Exercise find_token across all of its lookup tables."""
    from pyclpa.util import find_token, load_whitelist, load_alias
    whitelist = load_whitelist()
    pattern_map = load_alias('patterns.tsv')
    # Unknown token with empty lookup tables yields nothing.
    assert not find_token('t', {}, {}, {}, {}, [])
    # Whitelisted token is returned unchanged.
    assert find_token('t', whitelist, {}, {}, {}, []) == 't'
    # Single-character aliases rewrite parts of the token.
    assert find_token('th', whitelist, {'h': 'ʰ'}, {}, {}, []) == 'tʰ'
    # Explicit mappings replace the whole token...
    assert find_token('th', whitelist, {}, {'th': 'x'}, {}, []) == 'x'
    # ...but an explicit target outside the whitelist is an error.
    with self.assertRaises(ValueError):
        find_token('th', whitelist, {}, {'th': 'X'}, {}, [])
    # Pattern table performs the same rewrite as the alias above.
    assert find_token('th', whitelist, {}, {}, pattern_map, []) == 'tʰ'
    # Characters in the delete list are stripped from the token.
    assert find_token('th', whitelist, {}, {}, {}, ['h']) == 't'
def check(self, column='TOKENS', rules=False, clpa=None):
    """Validate the tokens in *column* against CLPA, writing results back.

    When *rules* names a rule file, its replacements are applied to the
    rows first. Each row receives ``CLPA_TOKENS`` and ``CLPA_IDS`` columns;
    the accumulated ``(sounds, errors)`` pair is returned.
    """
    if not clpa:
        clpa = get_clpa()
    if rules:
        mapping = load_alias(rules)
        for row in self:
            converted = []
            for token in split(row[column]):
                converted.append(mapping[token] if token in mapping else token)
            row[column] = join(converted)
    sounds = {}
    errors = Counter({'convertable': 0, 'non-convertable': 0})
    for row in self:
        checked, sounds, errors = clpa.check_sequence(
            split(row[column]), sounds=sounds, errors=errors)
        row['CLPA_TOKENS'] = join(checked)
        row['CLPA_IDS'] = join([clpa.segment2clpa(seg) for seg in checked])
    return sounds, errors
def test_load_alias(self):
    """The packaged alias table maps the IPA 'ɡ' to an ASCII 'g'."""
    from pyclpa.util import local_path, load_alias
    alias_table = load_alias(local_path('alias.tsv'))
    assert alias_table['ɡ'] == 'g'