def test_clean_string():
    seq1 = 'this (is an error)'
    seq2 = 'feature/vector'
    seq3 = 'ta:tata'
    seq4 = 'what (the) hack [this is]'
    seq5 = 't a t'
    _get_brackets('A')
    assert clean_string(seq1)[0] == 'th i s'
    assert clean_string(seq2)[1] == 'v e c t o r'
    assert clean_string(seq3)[0] == 't a: t a t a'
    assert clean_string(seq4)[0] == 'wh a t _ h a c k'
    assert clean_string(seq5, segmentized=True)[0] == 't a t'
    assert clean_string('a(a', ignore_brackets=False)[0] == 'a ( a'
    assert clean_string('a/a', split_entries=False)[0] == 'a / a'
    assert clean_string('aa', preparse=[('a', 'b')])[0] == 'bb'
    assert clean_string('bb', merge_geminates=False)[0] == 'b b'
    assert clean_string('bb', rules={"b": "cd"}, merge_geminates=False)[0] == "cd cd"
def test_sequence(sequence, **keywords):
    """
    Test a sequence for compatibility with CLPA and LingPy.
    """
    invalid = Counter()
    segment_count = Counter()
    lingpy_errors = set()
    clpa_errors = set()
    clpa_repl = defaultdict(set)
    general_errors = 0

    # clean the string at first, we only take the first item, ignore the rest
    try:
        segments = clean_string(sequence, **keywords)[0].split(' ')
        lingpy_analysis = [
            x if y != '0' else '?'
            for x, y in zip(segments, tokens2class(segments, 'dolgo'))]
        clpa_analysis, _sounds, _errors = clpa.check_sequence(segments)
        general_errors = len(
            ['?' for x in zip(lingpy_analysis, clpa_analysis) if '?' in x])
    except (ValueError, IndexError, AttributeError):
        invalid.update([sequence])
        segments, clpa_analysis = [], []

    if segments:
        for a, b, c in zip(segments, lingpy_analysis, clpa_analysis):
            if a[0] in clpa.accents:
                a = a[1:]
            if c[0] in clpa.accents:
                c = c[1:]
            segment_count.update([a])
            if b == '?':
                lingpy_errors.add(a)
            if c != a:
                if c == '?':
                    clpa_errors.add(a)
                else:
                    clpa_repl[a].add(c)

    return (
        segments,
        [clpa.segment2clpa(x) for x in clpa_analysis],
        invalid,
        segment_count,
        lingpy_errors,
        clpa_errors,
        clpa_repl,
        general_errors)
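
# A minimal usage sketch for test_sequence, assuming the pyclpa package that
# backs the `clpa` object is installed. The example string and the helper name
# `_example_test_sequence` are illustrative, not part of the library; the
# unpacking simply names the eight items of the returned tuple.
def _example_test_sequence():
    (segments, clpa_ids, invalid, segment_count,
     lingpy_errors, clpa_errors, clpa_repl, general_errors) = test_sequence(
        'tʰ ɔ x t ə r', segmentized=True)
    print('segments:', segments)
    print('sounds LingPy cannot classify:', lingpy_errors)
    print('sounds CLPA cannot classify:', clpa_errors)
    print('suggested CLPA replacements:', dict(clpa_repl))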
def simple_profile(wordlist, ref='ipa', semi_diacritics='hsʃ̢ɕʂʐʑʒw',
                   merge_vowels=False, brackets=None, splitters='/,;~',
                   merge_geminates=True, bad_word="<???>", bad_sound="<?>",
                   clts=None, unknown_sound="!{0}"):
    """
    Create an initial Orthography Profile using LingPy's clean_string procedure.

    Parameters
    ----------
    wordlist : ~lingpy.basic.wordlist.Wordlist
        A wordlist from which you want to derive an initial
        orthography profile.
    ref : str (default="ipa")
        The name of the reference column in which the words are stored.
    semi_diacritics : str
        Indicate characters which can occur both as "diacritics" (second part
        in a sound) or alone.
    merge_vowels : bool (default=False)
        Indicate whether consecutive vowels should be merged.
    brackets : str
        A string of bracket characters marking material that is retained as a
        whole word rather than segmented. Defaults to a pre-defined set of
        frequently occurring brackets.
    splitters : str
        The characters which force the automatic splitting of an entry.
    merge_geminates : bool (default=True)
        Indicate whether geminates should be merged into single segments.
    clts : dict (default=None)
        A dictionary(-like) object that converts a given source sound into a
        potential target sound, using the get()-method of the dictionary.
        Normally, we think of a CLTS instance here (that is: a cross-linguistic
        transcription system as defined in the pyclts package).
    bad_word : str (default="<???>")
        Indicate how words that could not be parsed should be handled. Note
        that both "bad_word" and "bad_sound" are format strings, so you can
        add formatting information here.
    bad_sound : str (default="<?>")
        Indicate how sounds that could not be converted to a sound class
        should be handled. Note that both "bad_word" and "bad_sound" are
        format strings, so you can add formatting information here.
    unknown_sound : str (default="!{0}")
        If a clts object is passed, use this string to indicate that sounds
        are classified as "unknown sound" in the CLTS framework.

    Returns
    -------
    profile : generator
        A generator of tuples (four items): the segment, its conversion (IPA
        or marker string), its frequency, and its unicode code points.
    """
    clts = clts or {}
    nulls = set()
    bad_words = set()
    brackets = brackets or "([{『(₍⁽«)]})』⁾₎"
    profile = defaultdict(int)
    words = [wordlist[idx, ref] for idx in wordlist]
    for word in pb(words, desc='iterating over words'):
        if isinstance(word, list):
            word = ' '.join(word)
        cleaned_string = clean_string(
            word,
            semi_diacritics=semi_diacritics,
            merge_vowels=merge_vowels,
            brackets=None,
            ignore_brackets=False,
            split_entries=False,
            preparse=None,
            rules=None,
            merge_geminates=merge_geminates)[0]

        # retain whole word if there are splitters in the word
        if [x for x in cleaned_string if x in brackets + splitters]:
            profile[word] += 1
            bad_words.add(word)
        else:
            for segment in cleaned_string.split(' '):
                profile[segment] += 1
            for segment in [x for x in word if x not in cleaned_string]:
                profile[segment] += 1
                nulls.add(segment)

    for s, f in pb(
            sorted(profile.items(), key=lambda x: x[1], reverse=True),
            desc='preparing profile'):
        sclass = token2class(s, 'dolgo')
        if s in bad_words:
            ipa = bad_word.format(s)
        elif sclass == '0' and s not in nulls:
            ipa = bad_sound.format(s)
        elif s in nulls:
            ipa = 'NULL'
        elif clts:
            sound = clts.get(s, False)
            if not sound:
                ipa = '!' + s
            else:
                ipa = text_type(sound)
        else:
            ipa = s
        yield s, ipa, text_type(f), codepoint(s)
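
# A minimal usage sketch for simple_profile. The file names "wordlist.tsv" and
# "orthography-profile.tsv" as well as the header row are assumptions for
# illustration, not part of the library.
def _example_simple_profile():
    from lingpy import Wordlist
    wordlist = Wordlist('wordlist.tsv')
    rows = ['\t'.join(['Grapheme', 'IPA', 'Frequency', 'Codepoints'])]
    for grapheme, ipa, frequency, codepoints in simple_profile(wordlist, ref='ipa'):
        rows.append('\t'.join([grapheme, ipa, frequency, codepoints]))
    with open('orthography-profile.tsv', 'w', encoding='utf8') as handle:
        handle.write('\n'.join(rows) + '\n')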
def context_profile(wordlist, ref='ipa', col="doculect",
                    semi_diacritics='hsʃ̢ɕʂʐʑʒw', merge_vowels=False,
                    brackets=None, splitters='/,;~', merge_geminates=True,
                    clts=False, bad_word="<???>", bad_sound="<?>",
                    unknown_sound="!{0}", examples=2, max_entries=100):
    """
    Create an advanced Orthography Profile with context and doculect information.

    Parameters
    ----------
    wordlist : ~lingpy.basic.wordlist.Wordlist
        A wordlist from which you want to derive an initial
        orthography profile.
    ref : str (default="ipa")
        The name of the reference column in which the words are stored.
    col : str (default="doculect")
        Indicate in which column the information on the language variety is
        stored.
    semi_diacritics : str
        Indicate characters which can occur both as "diacritics" (second part
        in a sound) or alone.
    merge_vowels : bool (default=False)
        Indicate whether consecutive vowels should be merged.
    brackets : str
        A string of bracket characters marking material that is retained as a
        whole word rather than segmented. Defaults to a pre-defined set of
        frequently occurring brackets.
    splitters : str
        The characters which force the automatic splitting of an entry.
    merge_geminates : bool (default=True)
        Indicate whether geminates should be merged into single segments.
    clts : dict (default=False)
        A dictionary(-like) object that converts a given source sound into a
        potential target sound, using the get()-method of the dictionary.
        Normally, we think of a CLTS instance here (that is: a cross-linguistic
        transcription system as defined in the pyclts package).
    bad_word : str (default="<???>")
        Indicate how words that could not be parsed should be handled. Note
        that both "bad_word" and "bad_sound" are format strings, so you can
        add formatting information here.
    bad_sound : str (default="<?>")
        Indicate how sounds that could not be converted to a sound class
        should be handled. Note that both "bad_word" and "bad_sound" are
        format strings, so you can add formatting information here.
    unknown_sound : str (default="!{0}")
        If a clts object is passed, use this string to indicate that sounds
        are classified as "unknown sound" in the CLTS framework.
    examples : int (default=2)
        Indicate the number of examples that should be printed out.
    max_entries : int (default=100)
        The maximal number of wordlist entries inspected per segment when
        computing frequencies, languages, and examples.

    Returns
    -------
    profile : generator
        A generator of tuples (six items): the segment (with context markers),
        its conversion (IPA or marker string), example words, the languages in
        which it occurs, its frequency, and its unicode code points.
""" clts_ = clts or {} nulls = set() bad_words = set() brackets = brackets or "([{『(₍⁽«)]})』⁾₎" profile = defaultdict(list) errors = set() for idx, word, language in pb(wordlist.iter_rows(ref, col), desc='iter words', total=len(wordlist)): log.info('processing {0}-{1}'.format(idx, word)) if isinstance(word, list): word = ' '.join(word) if word.strip(): try: cleaned_string = clean_string( word, semi_diacritics=semi_diacritics, merge_vowels=merge_vowels, brackets=None, ignore_brackets=False, split_entries=False, preparse=None, rules=None, merge_geminates=merge_geminates)[0].split(' ') # retain whole word if there are splitters in the word if [x for x in cleaned_string if x in brackets + splitters]: profile[word] += [(language, word)] bad_words.add(word) else: context_pre = ['^'] + (len(cleaned_string) - 1) * [''] context_post = (len(cleaned_string) - 1) * [''] + ['$'] for ctxA, ctxB, segment in zip(context_pre, context_post, cleaned_string): profile[ctxA + segment + ctxB] += [(language, word)] for segment in [ x for x in word if x not in ' '.join(cleaned_string) ]: profile[segment] += [(language, word)] nulls.add(segment) except: errors.add(idx) log.warn('problem parsing {0}'.format(word)) for s in '^$': yield s, 'NULL', '', '', '', '' for idx, (s, entries) in pb(enumerate( sorted(profile.items(), key=lambda x: len(x[1]), reverse=True)), desc='yielding entries', total=len(profile)): sclass = token2class(s.strip('^$'), 'dolgo') words, langs = [l[1] for l in entries ][:max_entries], [l[0] for l in entries][:max_entries] languages = ', '.join( sorted(set(langs), key=lambda x: langs.count(x), reverse=True)) frequency = str(len(langs)) codepoints = codepoint(s) examples_ = ', '.join( sorted(set(words), key=lambda x: words.count(x), reverse=True)[:examples]) if s in bad_words: ipa = bad_word.format(s) elif sclass == '0': ipa = bad_sound.format(s) elif s in nulls: ipa = 'NULL' elif clts_: sound = clts_.get(s.strip('^$'), False) if not sound: ipa = '!' + s.strip('^$') else: ipa = text_type(sound) else: ipa = s.strip('^$') yield s, ipa, examples_, languages, frequency, codepoints
def tokenizer(self):
    from lingpy.sequence.sound_classes import clean_string
    return lambda _, s, **kw: clean_string(s)
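
# A minimal usage sketch for the tokenizer defined above. It is a judgment
# call based only on the lambda's signature: the returned callable ignores its
# first argument and applies clean_string to the given string. The object name
# "dataset" is a placeholder for whatever class defines tokenizer().
def _example_tokenizer(dataset):
    tokenize = dataset.tokenizer()
    # clean_string returns a list of space-segmented variants
    return tokenize(None, 'tʰɔxtər')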