import multiprocessing
import pickle

import grams

# ALPHABET, generate_letter_typos and possible_typos are defined elsewhere
# in this module; this excerpt shows the candidate generator and the driver.


def generate_typos(word, distance, d=0):
    """Yield every candidate within `distance` edits of `word`, with its edit count."""
    yield (word, d)
    if d == distance:
        return
    # add letter (len(word) + 1 so a letter can also be appended at the end)
    for pos in range(len(word) + 1):
        for letter in ALPHABET:
            yield from generate_typos(word[:pos] + letter + word[pos:], distance, d + 1)
    # remove letter
    for pos in range(len(word)):
        yield from generate_typos(word[:pos] + word[pos + 1:], distance, d + 1)
    # change letter
    for pos in range(len(word)):
        for letter in generate_letter_typos(word[pos]):
            yield from generate_typos(word[:pos] + letter + word[pos + 1:], distance, d + 1)
    # swap adjacent letters
    for pos in range(len(word) - 1):
        yield from generate_typos(word[:pos] + word[pos + 1] + word[pos] + word[pos + 2:], distance, d + 1)


if __name__ == '__main__':
    grams1 = grams.load_grams('../1grams_cleaned', 1)
    pool = multiprocessing.Pool()
    data = pool.map(possible_typos, grams1[0])
    with open('typos.dat', 'wb') as f:
        pickle.dump(data, f)
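# Hedged usage sketch ('_demo_generate_typos' is introduced for illustration;
# it assumes the module-level ALPHABET and generate_letter_typos mentioned
# above are in scope):
def _demo_generate_typos():
    # Print every unique candidate within one edit of 'ab', together with the
    # number of edits used to reach it.
    for candidate, dist in sorted(set(generate_typos('ab', 1))):
        print(candidate, dist)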
# (Continuation of a generate_typos body; the imports -- grams,
# multiprocessing, sys -- and the function header are omitted in this excerpt.)
    # remove letter
    for pos in range(len(word)):
        yield from generate_typos(word[:pos] + word[pos + 1:], distance, d + 1)
    # change letter
    for pos in range(len(word)):
        for letter in ALPHABET:
            if letter != word[pos]:
                yield from generate_typos(word[:pos] + letter + word[pos + 1:], distance, d + 1)
    # swap adjacent letters
    for pos in range(len(word) - 1):
        yield from generate_typos(word[:pos] + word[pos + 1] + word[pos] + word[pos + 2:], distance, d + 1)


def fix_line(line):
    # Correct one input line using the globals loaded in the driver below.
    return fix_typos(line.strip(), dictionary, unigrams, bigrams)


if __name__ == '__main__':
    #generate_dictionary('../slownik_do_literowek.txt')
    dictionary = load_dictionary('../slownik_do_literowek.dat')
    unigrams = [grams.load_grams('../1grams_min_cleaned', 1)]
    bigrams = [grams.load_grams('../2grams_min_cleaned', 2)]
    # Append the total occurrence counts alongside the loaded tables.
    unigrams.append(sum(unigrams[0][3][0]))
    bigrams.append(sum(bigrams[0][3][0]))
    print('Loaded!')
    pool = multiprocessing.Pool()
    print('\n'.join(pool.map(fix_line, sys.stdin)))
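# Hedged usage sketch: the driver above reads lines from stdin and prints the
# corrected lines to stdout, so an assumed invocation (script name is
# illustrative only) would be:
#
#     $ python fix_typos.py < noisy_text.txt > corrected_text.txt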
# (Excerpt begins mid-loop; ALT, REVALT, trans_ab, count, word, typo, i, a
# and b come from the omitted context above. a and b appear to be the
# characters at position i of the dictionary word and of the typo.)
            if a in ALT and ALT[a] == b:
                break
            elif a in REVALT and REVALT[a] == b:
                break
            elif i + 1 < len(word) and (a, word[i + 1]) == (typo[i + 1], b):
                # word has "ab" where the typo has "ba": count a transposition.
                trans_ab[(a, b)] += count
                break
            else:
                break

grams1 = grams.load_grams('../1grams_cleaned', 1)

# Count how often each adjacent letter pair occurs in correctly spelled words,
# weighted by corpus frequency.
paired = defaultdict(lambda: 0)
for word in grams1[0]:
    if word not in DICTIONARY:
        continue
    count = grams.find_ngram([word], *grams1)
    for i in range(len(word)):
        if i + 1 < len(word):
            paired[(word[i], word[i + 1])] += count

# Convert raw swap counts into the share of occurrences that were swapped.
for (key, value) in trans_ab.items():
    trans_ab[key] = value / (value + paired[key])

for (key, value) in sorted(trans_ab.items(), key=lambda x: -x[1]):
    print(''.join(key) + ':', value)
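# Worked example of the score printed above (illustrative numbers, not taken
# from the data): if the pair ('a', 'b') was seen swapped 5 times in typos
# and occurs 95 times (frequency-weighted) in correctly spelled words, the
# printed value is 5 / (5 + 95) = 0.05, roughly the probability that this
# adjacent pair gets transposed.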
from collections import defaultdict
import grams
import random
import re
import sys

WORD = 1
TAG = 2

WORDS = {}                            # word -> count from the unigram table
WORDS2 = defaultdict(lambda: dict())  # first word -> {second word -> bigram count}
TAGS = defaultdict(lambda: list())    # morphological tag -> words carrying it

grams1 = grams.load_grams('../1grams_min_cleaned', 1)
grams2 = grams.load_grams('../2grams_min_cleaned', 2)

for idx in range(len(grams1[3][0])):
    WORDS[grams1[0][grams1[3][1][idx]]] = grams1[3][0][idx]
for idx in range(len(grams2[3][0])):
    WORDS2[grams2[0][grams2[3][1][idx]]][grams2[0][grams2[3][2][idx]]] = grams2[3][0][idx]

# Index corpus words by their Morfeusz tags, stripping punctuation from the
# surface form and skipping words absent from the unigram table.
regex = re.compile(r'[^\w ]', re.UNICODE | re.IGNORECASE)
with open('../morfeuszTagsAndBasesForNKJP.txt', 'r') as f:
    for line in f:
        word, base, *tags = line.strip().split()
        word = regex.sub('', word).strip()
        if word not in WORDS:
            continue
        for tag in tags:
            TAGS[tag].append(word)
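# Hedged sketch of how the TAGS index can be consumed ('_random_word_with_tag'
# is a name introduced here for illustration, not part of the original script):
def _random_word_with_tag(tag):
    # Uniformly random corpus word that Morfeusz assigned the given tag,
    # or None when no word carries it.
    candidates = TAGS.get(tag)
    return random.choice(candidates) if candidates else None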
#!/bin/env python3
import grams
import random
import sys

words, words_index, words_position, connections, connections_index = grams.load_grams(
    '../2grams', 2)


def upper_bound(val, arr, s=0, e=None, key=lambda x: x):
    # Binary search: first index in arr[s:e] whose key is strictly greater than val.
    if e is None:
        e = len(arr)
    while s < e:
        mid = (s + e) // 2
        if val >= key(arr[mid]):
            s = mid + 1
        else:
            e = mid
    return s


def choose_simple(i, s, e):
    # Pick a uniformly random connection slot from [s, e) and return the id
    # of the word it continues with; None when the range is empty.
    if s == e:
        return None
    cid = connections_index[random.randint(s, e - 1)]
    wid = connections[i + 1][cid]
    return wid
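# For the identity key, upper_bound behaves like the standard library's
# bisect.bisect_right; the key parameter is what makes it usable over the
# index arrays above. A minimal self-check ('_check_upper_bound' is a name
# introduced here for illustration):
def _check_upper_bound():
    import bisect
    arr = [1, 2, 2, 4, 7]
    for val in (0, 2, 3, 7, 9):
        assert upper_bound(val, arr) == bisect.bisect_right(arr, val)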
#!/bin/env python3
import grams
import itertools
import math
import random

WINDOW = 2
words, words_index, words_position, connections, connections_index = grams.load_grams(
    '../{}grams'.format(WINDOW), WINDOW)
cache = {}


def process_sentence(sentence, done=None):
    if done is None:
        done = []
    if not sentence:
        yield done
        return
    # Copy tokens up to the first gap marker (a token starting with '-');
    # a sentence without any marker is yielded unchanged.
    for i in range(len(sentence)):
        if sentence[i].startswith('-'):
            break
        done.append(sentence[i])
    else:
        yield done
        return
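# Usage sketch: for a sentence with no gap markers the generator yields the
# token list unchanged (the gap-filling branch continues past this excerpt):
#
#     >>> list(process_sentence(['ala', 'ma', 'kota']))
#     [['ala', 'ma', 'kota']]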
#!/bin/env python3
import grams
import random
import sys

words, words_index, words_position, connections, connections_index = grams.load_grams(
    '../2grams', 2)


def upper_bound(val, arr, s=0, e=None, key=lambda x: x):
    if e is None:
        e = len(arr)
    while s < e:
        mid = (s + e) // 2
        if val >= key(arr[mid]):
            s = mid + 1
        else:
            e = mid
    return s


def choose_simple(i, s, e):
    if s == e:
        return None
    cid = connections_index[random.randint(s, e - 1)]
    wid = connections[i + 1][cid]
    return wid


def choose_ranked(i, s, e):
    if s == e:
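# Hedged note: choose_simple samples a slot uniformly from [s, e), so a
# continuation occupying more slots in connections_index is proportionally
# more likely to be drawn; sampling is presumably frequency-weighted. A
# sketch of mapping the result back to a surface word ('_sample_next' is a
# name introduced for illustration):
#
#     def _sample_next(i, s, e):
#         wid = choose_simple(i, s, e)
#         return None if wid is None else words[wid]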
#!/bin/env python3
import grams
import itertools
import random

WINDOW = 2
words, words_index, words_position, connections, connections_index = grams.load_grams(
    '../{}grams'.format(WINDOW), WINDOW)


def upper_bound(val, arr, s=0, e=None, key=lambda x: x):
    if e is None:
        e = len(arr)
    while s < e:
        mid = (s + e) // 2
        if val >= key(arr[mid]):
            s = mid + 1
        else:
            e = mid
    return s


def find_ngram(ngram):
    s = 0
    e = len(connections_index)
    for i, word in enumerate(ngram):
        # Resolve the surface word to its id: words_index orders word ids by
        # their string, so the predecessor of the upper bound is the candidate.
        word_id = words_index[upper_bound(word, words_index, key=lambda idx: words[idx]) - 1]
        if words[word_id] != word:
            return 0
        word_position = words_position[word_id]
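# Hedged sketch of the word-id lookup step above in isolation, with toy data
# ('_demo_word_lookup' and the toy arrays are illustrative only):
#
#     def _demo_word_lookup():
#         toy_words = ['ala', 'kot', 'ma']   # id -> surface word
#         toy_index = [0, 1, 2]              # ids sorted by toy_words[id]
#         wid = toy_index[upper_bound('kot', toy_index,
#                                     key=lambda idx: toy_words[idx]) - 1]
#         assert toy_words[wid] == 'kot'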