def my_tokenizer(form, prf):
    """Greedily tokenize a form using the orthography profile `prf`."""
    value = form.strip()
    for form in split_text(value, separators='/,~', strip=True):
        value = form.strip()
        # Mark the boundaries of the form and drop bracketed material.
        form = "^%s$" % form.replace(" ", "{} ")
        form = strip_brackets(form, brackets={'[': ']'})
        i = 0
        tokens = []
        while True:
            # Longest-match lookup in the profile.
            added = False
            for length in range(len(form[i:]), 0, -1):
                needle = form[i:i + length]
                if needle in prf:
                    tokens.append(prf[needle])
                    i += length
                    added = True
                    break
            if not added:
                if form[i] == ' ':
                    tokens.append("#")
                else:
                    # Flag unmatched characters for inspection.
                    tokens.append('<%s>' % form[i])
                i += 1
            if i == len(form):
                break
        # Remove NULLs.
        tokens = [token for token in tokens if token != "NULL"]
        # Note: only the first variant of the entry is tokenized.
        return ' '.join(tokens)
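# Usage sketch for my_tokenizer (an addition, not part of the original
# source). `profile` is a stand-in orthography-profile dict; the boundary
# markers '^'/'$' map to "NULL" so they are removed from the output.
from clldutils.text import split_text, strip_brackets

profile = {'^': 'NULL', '$': 'NULL', 'th': 'tʰ', 'a': 'a', 'n': 'n'}
assert my_tokenizer('than', profile) == 'tʰ a n'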
def cogids2cogid(wordlist, ref="cogids", cognates="cogid", morphemes="morphemes"):
    """Merge partial cognate sets (cogids) into one cognate ID per word."""
    C, M = {}, {}
    current = 1
    for concept in wordlist.rows:
        # Use the concept label as the base for morpheme glosses.
        base = split_text(strip_brackets(concept))[0].upper().replace(" ", "_")
        idxs = wordlist.get_list(row=concept, flat=True)
        cogids = defaultdict(list)
        for idx in idxs:
            M[idx] = [c for c in wordlist[idx, ref]]
            for cogid in basictypes.ints(wordlist[idx, ref]):
                cogids[cogid] += [idx]
        # Assign new cognate IDs, largest partial cognate set first.
        for i, (cogid, idxs) in enumerate(
                sorted(cogids.items(), key=lambda x: len(x[1]), reverse=True)):
            for idx in idxs:
                if idx not in C:
                    C[idx] = current
                    M[idx][M[idx].index(cogid)] = base
                else:
                    M[idx][M[idx].index(cogid)] = "_" + base.lower()
            current += 1
    wordlist.add_entries(cognates, C, lambda x: x)
    if morphemes:
        wordlist.add_entries(morphemes, M, lambda x: x)
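# Sketch exercising cogids2cogid without real data (an addition). The
# stand-in class mimics just the pieces of lingpy's Wordlist API that the
# function touches; its attribute and method names are assumptions.
from collections import defaultdict
from clldutils.text import split_text, strip_brackets
from lingpy import basictypes

class FakeWordlist:
    def __init__(self):
        self._data = {
            1: {'concept': 'hand', 'cogids': basictypes.ints('1 2')},
            2: {'concept': 'hand', 'cogids': basictypes.ints('1 3')},
        }
        self.rows = ['hand']

    def get_list(self, row=None, flat=True):
        return [i for i, r in sorted(self._data.items()) if r['concept'] == row]

    def __getitem__(self, key):
        idx, column = key
        return self._data[idx][column]

    def add_entries(self, name, source, function):
        for idx, value in source.items():
            self._data[idx][name] = function(value)

wl = FakeWordlist()
cogids2cogid(wl)
print(wl[1, 'cogid'], wl[1, 'morphemes'])  # -> 1 ['HAND', '_hand']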
def wikibooks():
    with open('wikibooks.txt') as f:
        data = f.readlines()
    out = []
    gsr = defaultdict(dict)
    for i, line in enumerate(data):
        line = strip_brackets(
            line.strip().replace('\t', ' '), brackets={'(': ')'})
        if line.startswith('*'):
            if not line[1] == ' ':
                line = line.replace('*', '* ')
            elms = line.split(' ')
            if elms and len(elms) > 1:
                # Second field holds the Schuessler/Karlgren identifiers.
                kgsc = elms[1].split('/')
                if len(kgsc) == 1:
                    schuessler = ''
                    karlgren = kgsc[0]
                elif len(kgsc) == 2:
                    karlgren = kgsc[1]
                    schuessler = kgsc[0]
                else:
                    print('[ERROR:schuessler/karlgren] {0}'.format(line))
                try:
                    char = elms[2].split('|')[-1][0]
                except IndexError:
                    print('[ERROR:character] {0}'.format(line))
                    char = ''
                # Middle Chinese readings, with trailing commas removed.
                mch = [x[:-1] if x.endswith(',') else x for x in elms[3:]]
                if len(karlgren) not in [4, 5, 6]:
                    print('[ERROR:karlgren] {0} {1}'.format(line, karlgren))
                elif not sinopy.is_chinese(char):
                    print('[ERROR:char] {0}'.format(line))
                elif char:
                    pinyin = sinopy.pinyin(char)
                    if '?' in pinyin or sinopy.is_chinese(pinyin):
                        pinyin = ''
                    out += [(char, pinyin, 'Old_Chinese', karlgren[:4],
                             karlgren, '', 'Karlgren1954')]
                    for reading in mch:
                        out += [(char, pinyin, 'Middle_Chinese', '',
                                 karlgren, reading, 'Wikibooks2016a')]
                        gsr[char][reading] = [pinyin, reading, karlgren]
    with open('karlgren.tsv', 'w') as f:
        f.write('ID\tCHARACTER\tPINYIN\tDOCULECT\tPHONETIC_CLASS\t'
                'KARLGREN_ID\tREADING\tSOURCE\n')
        for i, line in enumerate(out):
            f.write(str(i + 1) + '\t' + '\t'.join(line) + '\n')
    return gsr
def normalize_grapheme(text):
    """
    Apply simple, non-CLTS, normalization.
    """
    new_text = unicodedata.normalize("NFD", text)
    if new_text[0] == "(" and new_text[-1] == ")":
        new_text = new_text[1:-1]
    new_text = strip_brackets(new_text)
    if new_text:
        return new_text
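# Quick illustration of normalize_grapheme (an addition): brackets around
# the whole grapheme are unwrapped, bracketed substrings are stripped, and
# a form that ends up empty yields None.
assert normalize_grapheme('(a)') == 'a'
assert normalize_grapheme('a(b)') == 'a'
assert normalize_grapheme('()') is None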
def clean(self, form, item=None):
    """
    Called when a row is added to a CLDF dataset.

    :param form:
    :return: None to skip the form, or the cleaned form as string.
    """
    if form not in self.missing_data:
        if self.normalize_whitespace:
            form = re.sub(r'\s+', ' ', form.strip())
        for source, target in self.replacements:
            form = form.replace(source, target)
        if self.strip_inside_brackets:
            return text.strip_brackets(form, brackets=self.brackets)
        return form
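# Minimal sketch driving clean() (an addition): the attribute names mirror
# what the method expects on its host class and are assumptions about that
# class, not part of the original source.
import re
from clldutils import text

class FormSpecStub:
    missing_data = ('?', '-')
    normalize_whitespace = True
    replacements = [('_', ' ')]
    strip_inside_brackets = True
    brackets = None
    clean = clean  # reuse the function above as a method

assert FormSpecStub().clean('hand(arm)') == 'hand'
assert FormSpecStub().clean('?') is None  # missing data is skipped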
def clean_entry(string):
    """Simple replacements to enhance segmentation."""
    st = [(' (GEM)', ''), (' (NEP)', ''), (' LO$', ''), (' L$', ''),
          (' C$', ''), (' ING$', ''), (' TIB', ''), (' NEP', ''),
          ('?', ''), (':', 'ː'), ('.', ''), ('=', ''), (' ', '_'),
          ('_LF', ''), ('*', ''), ('^', ''), ('$', '')]
    # Map ASCII digits to superscript tone numbers.
    mapper = dict(zip('0123456789', '⁰¹²³⁴⁵⁶⁷⁸⁹'))
    mapper.update(dict(st))
    # Keep only the first variant of the entry.
    string = re.split(';|/|,|>', string)[0]
    string = '^' + string + '$'
    for s, t in st:
        string = string.replace(s, t)
    for s in [' ', '_', '-']:
        string = string.strip(s)
    string = ''.join([x for x in string if not is_chinese(x)])
    return strip_brackets(''.join([mapper.get(s, s) for s in string.strip()]))
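# A few examples of clean_entry's effect (an addition); they assume
# is_chinese comes from sinopy and strip_brackets from clldutils.text,
# as in the surrounding code.
assert clean_entry('kha:k TIB') == 'khaːk'   # source tag dropped, ':' -> 'ː'
assert clean_entry('na55') == 'na⁵⁵'          # digits become tone superscripts
assert clean_entry('tsho;other') == 'tsho'   # only the first variant is kept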
def clean_form(self, item, form):
    if form not in ["*", "---", ""]:
        return split_text(strip_brackets(form), ",;/")[0]
def clean_string(
        sequence,
        semi_diacritics='hsʃ̢ɕʂʐʑʒw',
        merge_vowels=False,
        segmentized=False,
        rules=None,
        ignore_brackets=True,
        brackets=None,
        split_entries=True,
        splitters='/,;~',
        preparse=None,
        merge_geminates=True,
        normalization_form="NFC"):
    """
    Function exhaustively checks how well a sequence is understood by LingPy.

    Parameters
    ----------
    semi_diacritics : str
        Indicate characters which can occur both as "diacritics" (second
        part in a sound) or alone.
    merge_vowels : bool (default=False)
        Indicate whether consecutive vowels should be merged.
    segmentized : bool (default=False)
        Indicate whether the input string is already segmentized or not. If
        set to True, items in brackets can no longer be ignored.
    rules : dict
        Replacement rules to be applied to a segmentized string.
    ignore_brackets : bool
        If set to True, ignore all content within a given bracket.
    brackets : dict
        A dictionary with opening brackets as key and closing brackets as
        values. Defaults to a pre-defined set of frequently occurring
        brackets.
    split_entries : bool (default=True)
        Indicate whether multiple entries (with a comma etc.) should be
        split into separate entries.
    splitters : str
        The characters which force the automatic splitting of an entry.
    preparse : list
        List of tuples, giving simple replacement patterns (source and
        target), which are applied before every processing starts.

    Returns
    -------
    cleaned_strings : list
        A list of cleaned strings which are segmented by space characters.
        If splitters are encountered, indicating that the entry contains
        two variants, the list will contain one for each element in a
        separate entry. If there are no splitters, the list has only size
        one.
    """
    sequence = unicodedata.normalize(normalization_form, sequence)
    rules = rules or {}
    preparse = preparse or []

    # Replace white space if not indicated otherwise.
    if segmentized:
        segment_list = [sequence.split(' ') if not isinstance(
            sequence, (list, tuple)) else sequence]
    else:
        for s, t in preparse:
            sequence = sequence.replace(s, t)
        segment_list = []
        if ignore_brackets:
            new_sequence = strip_brackets(sequence, brackets=brackets)
        else:
            new_sequence = sequence

        # Splitting needs to be done afterwards.
        if split_entries:
            new_sequences = split_text(
                new_sequence, splitters,
                brackets='' if not ignore_brackets else brackets)
        else:
            new_sequences = [new_sequence]

        for new_sequence in new_sequences:
            segments = ipa2tokens(
                re.sub(r'\s+', '_', new_sequence.strip()),
                semi_diacritics=semi_diacritics,
                merge_vowels=merge_vowels,
                merge_geminates=merge_geminates)
            segment_list += [segments]

    out = []
    for segments in segment_list:
        segments = [rules.get(s, s) for s in segments]
        out += [' '.join(segments)]
    return out
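# Usage sketch for clean_string (an addition); ipa2tokens is assumed to
# come from lingpy, strip_brackets/split_text from clldutils.text. The
# exact segmentation depends on lingpy's tokenizer.
from lingpy import ipa2tokens

print(clean_string('hand (arm), fist'))
# e.g. -> ['h a n d', 'f i s t']: brackets stripped, one entry per variant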
def test_strip_brackets():
    strings = ['arm((h)an[d])', '(hand)arm', 'a(hand)r(hand)m(hand)', 'arm⁽hand⁾']
    for string in strings:
        assert text.strip_brackets(string) == 'arm'
    assert text.strip_brackets('arm<hand>', brackets={"<": ">"}) == 'arm'
query = r""" select ?lexemeId ?lemma WHERE {{ ?lexemeId <http://purl.org/dc/terms/language> wd:Q1860; wikibase:lemma ?lemma. FILTER (regex(?lemma, '^{0}$')) }} """ findings = defaultdict(list) mappings, concepticon = get_mappings() visited = set() words = defaultdict(list) for w, vals in mappings['en'].items(): w2 = strip_brackets(w.lower().strip()) if w2 not in words: words[w2] = vals[0][0] with open('wikidata.tsv', 'r') as f: for line in f: if line.strip(): cid = line.split('\t')[0] visited.add(cid) print(len(words)) with open('fails.tsv', 'r') as f: for line in f: visited.add(line.strip()) current = ''
def name(self):
    m = re.search(
        r'tree\s+(?P<name>[^=]+)\s*=', strip_brackets(self, {'[': ']'}))
    if m:
        return m.group('name').strip()
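# Sketch of what name() extracts (an addition): it assumes `self` behaves
# like the raw Nexus tree line, with square-bracketed comments removed
# before matching.
import re
from clldutils.text import strip_brackets

line = "tree TREE1 = [&R] ((a,b),c);"
m = re.search(r'tree\s+(?P<name>[^=]+)\s*=', strip_brackets(line, {'[': ']'}))
assert m.group('name').strip() == 'TREE1'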
def clean_form(self, item, form):
    if form not in ["*", "---", "-"]:
        form = strip_brackets(split_text(form, separators=";,/")[0])
        return form.replace(" ", "_")
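# Example of the clean_form pattern shared by these datasets (an
# addition): take the first variant, drop bracketed material, and join
# multi-word forms with underscores.
from clldutils.text import split_text, strip_brackets

form = strip_brackets(split_text("open hand; fist (clenched)", separators=";,/")[0])
assert form.replace(" ", "_") == "open_hand"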