def __init__(self):
    """Load n-gram statistics and build the fuzzy matcher, tokenizer and postprocessor."""
    unigram_holder = UnigramHolder()
    print("%i unigrams" % len(unigram_holder))
    bigram_holder = BigramHolder.load()
    print("%i bigrams" % len(bigram_holder))
    self.matcher = FuzzyMatcher(unigram_holder, bigram_holder, self.PENALTY)
    print("%i stumps" % len(self.matcher.stump_dict))
    self.tokenizer = Tokenizer()
    self.rule_based_postprocessor = RuleBasedPostprocessor()
def test_postprocessor(self):
    """Table-driven spot checks for RuleBasedPostprocessor.correct."""
    cases = [
        ("The cat eats fish.", "The cat eats fish."),
        ("The cat, who likes fish.", "The cat , who likes fish ."),
        ("Bla's 'statement': \"bli.\"", "Bla 's 'statement' : \" bli . \""),
        ('""', '" "'),
        ('bla "" bli', 'bla " " bli'),
        ("I ate 123 apples.", "I ate 1 2 3 apples."),
        ("I use character-based language models.",
         "I use character - based language models ."),
        ("bla (bli) blu", "bla ( bli ) blu"),
        ("()", "( )"),
        ("bla () bli", "bla ( ) bli"),
    ]
    for expected, raw in cases:
        self.assertEqual(expected, RuleBasedPostprocessor.correct(raw))
def correct(self, sequence: str) -> str:
    """Re-segment *sequence* into words via dynamic programming, then postprocess.

    All spaces are stripped first; solutions[i] holds the best candidate
    segmentation of sequence[:i + 1], or None when position i is unreachable.
    """
    sequence = sequence.replace(' ', '')
    # word_locations[i]: start positions of words that end at character i.
    word_locations = self.locate_words(sequence)
    solutions = []
    for i in range(len(sequence)):
        candidates = []
        beginnings = word_locations[i]
        if len(beginnings) == 0:
            # No known word ends here: fall back to the single character at i.
            beginnings = [i]
        for b in beginnings:
            if b == 0:
                # Word covers the whole prefix: start a fresh candidate.
                token = sequence[:(i + 1)]
                candidate = self._new_candidate(token)
                candidates.append(candidate)
            elif solutions[b - 1] is not None:
                # Extend the best segmentation that ends just before b.
                previous = solutions[b - 1]
                token = sequence[b:(i + 1)]
                candidate = self._expand_candidate(previous, token)
                candidates.append(candidate)
        solutions.append(self._pick_best_candidate(candidates))
    # NOTE(review): assumes solutions[-1] is not None — presumably guaranteed
    # because single characters always yield a candidate; confirm in locate_words.
    final_solution = solutions[-1]
    predicted = ' '.join(final_solution.tokens)
    if self.bigram_postprocessor is not None:
        predicted = self.bigram_postprocessor.correct(predicted)
    predicted = RuleBasedPostprocessor.correct(predicted)
    return predicted
class WordSegment:
    """Corrector that strips all whitespace and re-segments the text with the
    external `segment` function, restoring punctuation afterwards."""

    def __init__(self):
        # load() presumably initializes the segmentation model data — defined elsewhere.
        load()
        self.postprocessor = RuleBasedPostprocessor()

    def correct(self, sequence: str) -> str:
        # Remove every whitespace character, then let the segmenter re-insert spaces.
        compact = ''.join(sequence.split())
        segmented = ' '.join(segment(compact))
        restored = reinsert_punctuation(segmented, compact)
        return self.postprocessor.correct(restored)
class FuzzyGreedyCorrector:
    """Greedy tokenization corrector: at each alphabetic token it tries to
    merge with the next token or to split the token, keeping whichever
    variant is sufficiently more frequent under fuzzy n-gram lookups."""

    # Frequency advantage factor a merge/split must beat to be applied.
    PENALTY = 0.1

    def __init__(self):
        """Load n-gram statistics and build the matcher, tokenizer and postprocessor."""
        unigram_holder = UnigramHolder()
        print("%i unigrams" % len(unigram_holder))
        bigram_holder = BigramHolder.load()
        print("%i bigrams" % len(bigram_holder))
        self.matcher = FuzzyMatcher(unigram_holder, bigram_holder, self.PENALTY)
        print("%i stumps" % len(self.matcher.stump_dict))
        self.tokenizer = Tokenizer()
        self.rule_based_postprocessor = RuleBasedPostprocessor()

    def correct(self, sequence: str):
        """Return *sequence* with greedy merge/split fixes and rule-based cleanup."""
        texts = [token.text for token in self.tokenizer.tokenize(sequence)]
        pieces = []
        i = 0
        n = len(texts)
        while i < n:
            word = texts[i]
            # Non-alphabetic tokens (numbers, punctuation) pass through untouched.
            if not word.isalpha():
                pieces.append(word)
                i += 1
                continue
            # Merge attempt: fuse with the following alphabetic token when the
            # fused word clearly outweighs the bigram.
            if i + 1 < n and texts[i + 1].isalpha():
                _, bigram_frequency = self.matcher.fuzzy_bigram_frequency(
                    word, texts[i + 1])
                fused = word + texts[i + 1]
                _, fused_frequency = self.matcher.fuzzy_unigram_frequency(fused)
                if fused_frequency * self.PENALTY > bigram_frequency:
                    pieces.append(fused)
                    i += 2
                    continue
            # Split attempt: break the token when the best split clearly
            # outweighs the unsplit word.
            if len(word) > 1:
                _, unigram_frequency = self.matcher.fuzzy_unigram_frequency(word)
                split, _, split_frequency = self.matcher.best_fuzzy_split(
                    word, lower_bound=unigram_frequency)
                if split_frequency * self.PENALTY > unigram_frequency:
                    pieces.append(' '.join(split))
                    i += 1
                    continue
            pieces.append(word)
            i += 1
        return self.rule_based_postprocessor.correct(' '.join(pieces))
class BigramDynamicCorrector:
    """Segments a sequence (spaces removed) into the most probable word
    sequence under a bigram language model, via Viterbi-style dynamic
    programming over word end positions."""

    def __init__(self):
        self.model = BigramModel()
        self.rule_based_postprocessor = RuleBasedPostprocessor()

    def is_token(self, text) -> bool:
        # A candidate word must be a known unigram of the model.
        return self.model.unigrams.is_unigram(text)

    def locate_words(self, text: str) -> List[List[str]]:
        """Return, for each start position i, the known words (or single
        characters) beginning at i, up to MAX_WORD_LEN characters long."""
        located_words = [[] for _ in text]
        for i in range(len(text)):
            for j in range(i + 1, min(i + MAX_WORD_LEN, len(text)) + 1):
                word = text[i:j]
                # Single characters are always allowed so every position is reachable.
                if self.is_token(word) or len(word) == 1:
                    located_words[i].append(word)
        return located_words

    def correct(self, sequence: str) -> str:
        """Strip spaces from *sequence*, pick the highest-scoring segmentation
        and apply the rule-based postprocessor."""
        sequence = sequence.replace(' ', '')
        words_at_position = self.locate_words(sequence)
        # solutions[e]: best-scoring Solution per last word ending at index e.
        solutions = [{} for _ in sequence]
        for position in range(len(sequence)):
            words = words_at_position[position]
            for word in words:
                end_pos = position + len(word) - 1
                if position == 0:
                    # First word: score is its (smoothed) unigram log-probability.
                    p = self.model.get_unigram_probability(word) + EPSILON
                    solutions[end_pos][word] = Solution(word, word, np.log(p))
                else:
                    # Extend every solution ending right before this word,
                    # scored by the (smoothed) bigram log-probability.
                    for previous_word in solutions[position - 1]:
                        prefix_solution = solutions[position - 1][previous_word]
                        bigram = (prefix_solution.last_token, word)
                        p = self.model.get_probability(bigram) + EPSILON
                        score = prefix_solution.score + np.log(p)
                        # Keep only the best-scoring path per (end_pos, word).
                        if word not in solutions[end_pos] or score > solutions[
                                end_pos][word].score:
                            solutions[end_pos][word] = Solution(
                                prefix_solution.sequence + ' ' + word, word, score)
        # Fall back to the unsegmented input if no solution reaches the end.
        predicted = sequence
        best_score = -np.inf
        for last_word in solutions[-1]:
            solution = solutions[-1][last_word]
            if solution.score > best_score:
                predicted = solution.sequence
                best_score = solution.score
        predicted = self.rule_based_postprocessor.correct(predicted)
        return predicted
class LeftToRightCorrector:
    """Left-to-right greedy corrector: merges adjacent tokens or splits a
    token wherever the unigram/bigram counts favor it."""

    def __init__(self):
        """Load n-gram tables and set up tokenization and postprocessing."""
        self.unigrams = UnigramHolder()
        self.bigrams = BigramHolder.load()
        self.tokenizer = Tokenizer()
        self.postprocessor = RuleBasedPostprocessor()

    def try_merge(self, token: str, next: str) -> bool:
        """True when the fused word is seen more often than the token pair."""
        merged_count = self.unigrams.get(token + next)
        pair_count = self.bigrams.get((token, next))
        return merged_count > pair_count

    def best_split(self, token: str) -> str:
        """Return the most frequent two-way split of *token*, or *token* itself.

        Ties on bigram frequency are broken by the rarer half's unigram count.
        """
        best = token
        best_frequency = self.unigrams.get(token)
        best_unigram_frequency = best_frequency
        for cut in range(1, len(token)):
            left, right = token[:cut], token[cut:]
            pair_frequency = self.bigrams.get((left, right))
            part_frequency = min(self.unigrams.get(left), self.unigrams.get(right))
            improves = pair_frequency > best_frequency
            breaks_tie = (pair_frequency == best_frequency
                          and part_frequency > best_unigram_frequency)
            if improves or breaks_tie:
                best = left + ' ' + right
                best_frequency = pair_frequency
                best_unigram_frequency = part_frequency
        return best

    def correct(self, sequence: str) -> str:
        """Tokenize, greedily merge/split left to right, then postprocess."""
        texts = [token.text for token in self.tokenizer.tokenize(sequence)]
        pieces = []
        position = 0
        while position < len(texts):
            if position + 1 < len(texts) and self.try_merge(
                    texts[position], texts[position + 1]):
                pieces.append(texts[position] + texts[position + 1])
                position += 2
            else:
                pieces.append(self.best_split(texts[position]))
                position += 1
        return self.postprocessor.correct(' '.join(pieces))
def __init__(self):
    """Initialize the segmentation backend and the rule-based postprocessor."""
    # load() presumably initializes the segmentation model data — defined elsewhere.
    load()
    self.postprocessor = RuleBasedPostprocessor()
def __init__(self):
    """Set up tokenization, postprocessing and the n-gram frequency tables."""
    self.tokenizer = Tokenizer()
    self.postprocessor = RuleBasedPostprocessor()
    self.unigrams = UnigramHolder()
    self.bigrams = BigramHolder.load()
import project
from src.interactive.sequence_generator import interactive_sequence_generator
from src.postprocessing.rule_based import RuleBasedPostprocessor

if __name__ == "__main__":
    # Interactive loop: read sequences and echo the rule-based correction.
    for sequence in interactive_sequence_generator():
        print(RuleBasedPostprocessor.correct(sequence))
def test_combined_case(self):
    """Quotes nested inside parentheses are both closed up correctly."""
    corrected = RuleBasedPostprocessor.correct("bla ( \" bli \" ) blu")
    self.assertEqual("bla (\"bli\") blu", corrected)
def test_quotation_beginning(self):
    """A fully quoted word keeps its quotes attached after correction."""
    corrected = RuleBasedPostprocessor.correct("\" bla \"")
    self.assertEqual("\"bla\"", corrected)
def __init__(self):
    """Create the rule-based postprocessor and the bigram language model."""
    self.rule_based_postprocessor = RuleBasedPostprocessor()
    self.model = BigramModel()