Example #1
def count_unigrams(n_sequences: int):
    total_start = timestamp()

    tokenizer = Tokenizer()
    counts_delim = {}
    counts_no_delim = {}

    tokenization_time = 0

    for s_i, sequence in enumerate(Wikipedia.training_sequences(n_sequences)):
        start = timestamp()
        tokens = tokenizer.tokenize(sequence)
        tokens[0].delimiter_before = True
        tokenization_time += time_diff(start)
        for token in tokens:
            counts = counts_delim if token.delimiter_before else counts_no_delim
            if token.text not in counts:
                counts[token.text] = 1
            else:
                counts[token.text] += 1
        if (s_i + 1) % K10 == 0:
            print("%ik sequences, %.2f s total time, %.2f s tokenization" %
                  ((s_i + 1) / K, time_diff(total_start), tokenization_time))
        if (s_i + 1) % M == 0:
            print("saving...")
            dump_object(counts_delim, paths.UNIGRAM_DELIM_FREQUENCY_DICT)
            dump_object(counts_no_delim, paths.UNIGRAM_NO_DELIM_FREQUENCY_DICT)
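
Note: the snippet relies on project helpers that are not shown here (timestamp, time_diff, dump_object, paths, Wikipedia) and on the progress constants K, K10 and M. A minimal sketch of what the constants and timing helpers are assumed to look like; names and values are guesses, not the project's actual definitions:

import time

# Assumed progress constants (guesses; the real values live in the project).
K = 1_000         # report progress in units of 1k sequences
K10 = 10 * K      # print progress every 10k sequences
M = 1_000 * K     # save intermediate counts every 1M sequences


def timestamp() -> float:
    # Current wall-clock time in seconds.
    return time.time()


def time_diff(start: float) -> float:
    # Seconds elapsed since `start`.
    return time.time() - start
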
Example #2
def __init__(self):
    unigrams = UnigramHolder()
    print("%i unigrams" % len(unigrams))
    bigrams = BigramHolder.load()
    print("%i bigrams" % len(bigrams))
    self.matcher = FuzzyMatcher(unigrams, bigrams, self.PENALTY)
    print("%i stumps" % len(self.matcher.stump_dict))
    self.tokenizer = Tokenizer()
    self.rule_based_postprocessor = RuleBasedPostprocessor()
Example #3
def search_tokens():
    # number of Wikipedia training sequences to scan (second command-line argument)
    n = int(sys.argv[2])
    tokenizer = Tokenizer()
    for query in interactive_sequence_generator():
        if query.startswith(' '):
            query_token = Token(query[1:], True)
        else:
            query_token = Token(query, False)
        for sequence in Wikipedia.training_sequences(n):
            tokens = tokenizer.tokenize(sequence)
            if query_token in tokens:
                print(sequence)
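
Note: the membership test `query_token in tokens` only works if Token compares by value. The real Token class lives in the project; a minimal stand-in covering the two fields these examples access might look like this (an assumption, not the project's implementation):

from dataclasses import dataclass


# Hypothetical stand-in for the project's Token class. A dataclass gives
# field-wise equality, which the `in` check above depends on.
@dataclass
class Token:
    text: str                # token string without surrounding whitespace
    delimiter_before: bool   # True if a delimiter (e.g. a space) preceded it
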
Example #4
def count_bigrams(n_sequences: int):
    tokenizer = Tokenizer()
    holder = BigramHolder()
    for s_i, sequence in enumerate(Wikipedia.training_sequences(n_sequences)):
        tokens = tokenizer.tokenize(sequence)
        texts = [token.text for token in tokens]
        for i in range(len(tokens) - 1):
            bigram = texts[i:(i + 2)]
            holder.increment(bigram)
        if (s_i + 1) % K10 == 0:
            print("%ik sequences" % ((s_i + 1) / K))
        if (s_i + 1) % M == 0:
            print("saving...")
            holder.save()
    holder.save()
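
Note: BigramHolder is part of the project and is not shown in these examples. The snippets only rely on a small interface (increment, get, save, load, len); a rough sketch under that assumption, with a placeholder storage path:

import pickle
from collections import Counter
from typing import Iterable, Tuple

# Placeholder path; the project presumably stores this under its own paths module.
BIGRAM_PATH = "bigrams.pkl"


class BigramHolder:
    def __init__(self):
        self.counts = Counter()

    def increment(self, bigram: Iterable[str]):
        # Normalise to a tuple so list slices and tuples hit the same key.
        self.counts[tuple(bigram)] += 1

    def get(self, bigram: Tuple[str, str]) -> int:
        # Counter returns 0 for unseen bigrams.
        return self.counts[tuple(bigram)]

    def __len__(self):
        return len(self.counts)

    def save(self):
        with open(BIGRAM_PATH, "wb") as f:
            pickle.dump(self.counts, f)

    @staticmethod
    def load() -> "BigramHolder":
        holder = BigramHolder()
        with open(BIGRAM_PATH, "rb") as f:
            holder.counts = pickle.load(f)
        return holder
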
Example #5
class FuzzyGreedyCorrector:
    PENALTY = 0.1

    def __init__(self):
        unigrams = UnigramHolder()
        print("%i unigrams" % len(unigrams))
        bigrams = BigramHolder.load()
        print("%i bigrams" % len(bigrams))
        self.matcher = FuzzyMatcher(unigrams, bigrams, self.PENALTY)
        print("%i stumps" % len(self.matcher.stump_dict))
        self.tokenizer = Tokenizer()
        self.rule_based_postprocessor = RuleBasedPostprocessor()

    def correct(self, sequence: str):
        tokens = self.tokenizer.tokenize(sequence)
        texts = [token.text for token in tokens]
        predicted = ""
        t_i = 0
        while t_i < len(texts):
            if t_i > 0:
                predicted += ' '
            text = texts[t_i]
            if not text.isalpha():
                predicted += text
                t_i += 1
                continue
            # try merge:
            if t_i + 1 < len(texts) and texts[t_i + 1].isalpha():
                _, bigram_frequency = self.matcher.fuzzy_bigram_frequency(
                    text, texts[t_i + 1])
                merge = text + texts[t_i + 1]
                _, merge_frequency = self.matcher.fuzzy_unigram_frequency(
                    merge)
                if merge_frequency * self.PENALTY > bigram_frequency:
                    predicted += merge
                    t_i += 2
                    continue
            # try split:
            if len(text) > 1:
                _, unigram_frequency = self.matcher.fuzzy_unigram_frequency(
                    text)
                split, _, split_frequency = self.matcher.best_fuzzy_split(
                    text, lower_bound=unigram_frequency)
                if split_frequency * self.PENALTY > unigram_frequency:
                    predicted += ' '.join(split)
                    t_i += 1
                    continue
            predicted += text
            t_i += 1
        predicted = self.rule_based_postprocessor.correct(predicted)
        return predicted
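
A possible way to exercise the class, assuming the unigram and bigram frequency files have already been built by the counting scripts above; the input string is purely illustrative:

if __name__ == "__main__":
    corrector = FuzzyGreedyCorrector()
    # hypothetical noisy input with a spurious split and a missing space
    print(corrector.correct("to kenization withmissing spaces"))
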
Example #6
class LeftToRightCorrector:
    def __init__(self):
        self.unigrams = UnigramHolder()
        self.bigrams = BigramHolder.load()
        self.tokenizer = Tokenizer()
        self.postprocessor = RuleBasedPostprocessor()

    def try_merge(self, token: str, next: str) -> bool:
        return self.unigrams.get(token + next) > self.bigrams.get(
            (token, next))

    def best_split(self, token: str) -> str:
        # Return the most frequent two-way split of the token, or the token
        # itself if no split beats the unsplit unigram frequency.
        best = token
        best_frequency = self.unigrams.get(token)
        best_unigram_frequency = best_frequency
        for i in range(1, len(token)):
            left, right = token[:i], token[i:]
            frequency = self.bigrams.get((left, right))
            unigram_frequency = min(self.unigrams.get(left),
                                    self.unigrams.get(right))
            if frequency > best_frequency or (
                    frequency == best_frequency
                    and unigram_frequency > best_unigram_frequency):
                best = left + ' ' + right
                best_frequency = frequency
                best_unigram_frequency = unigram_frequency
        return best

    def correct(self, sequence: str) -> str:
        tokens = self.tokenizer.tokenize(sequence)
        texts = [token.text for token in tokens]
        predicted = ""
        t_i = 0
        while t_i < len(texts):
            if t_i > 0:
                predicted += ' '
            if t_i + 1 < len(texts) and self.try_merge(texts[t_i],
                                                       texts[t_i + 1]):
                predicted += texts[t_i] + texts[t_i + 1]
                t_i += 2
            else:
                predicted += self.best_split(texts[t_i])
                t_i += 1
        predicted = self.postprocessor.correct(predicted)
        return predicted
Example #7
def __init__(self):
    self.unigrams = UnigramHolder()
    self.bigrams = BigramHolder.load()
    self.tokenizer = Tokenizer()
    self.postprocessor = RuleBasedPostprocessor()
Example #8
def __init__(self, n: Optional[int]):
    self.tokenizer = Tokenizer()
    self.holder = UnigramHolder(n)
    self.bigrams = BigramHolder.load()
Example #9
class UnigramCorrector:
    def __init__(self, n: Optional[int]):
        self.tokenizer = Tokenizer()
        self.holder = UnigramHolder(n)
        self.bigrams = BigramHolder.load()

    def split_candidates(self, token: Token) -> List[CorrectionCandidate]:
        text = token.text
        candidates = []
        for i in range(1, len(text)):
            left = text[:i]
            right = text[i:]
            frequency = self.bigrams.get((left, right))
            if frequency > 0:
                tokens = [
                    Token(left, token.delimiter_before),
                    Token(right, True)
                ]
                candidates.append(
                    CorrectionCandidate(frequency, tokens, False, False))
        return candidates

    def merge_candidates(self, token: Token, previous_token: Optional[Token],
                         next_token: Optional[Token]):
        candidates = []
        if previous_token is not None:
            merged = previous_token.text + token.text
            frequency = self.holder.get(merged)
            if frequency > 0:
                candidates.append(
                    CorrectionCandidate(
                        frequency,
                        [Token(merged, previous_token.delimiter_before)],
                        consume_previous=True,
                        consume_next=False))
        if next_token is not None:
            merged = token.text + next_token.text
            frequency = self.holder.get(merged)
            if frequency > 0:
                candidates.append(
                    CorrectionCandidate(
                        frequency, [Token(merged, token.delimiter_before)],
                        consume_previous=False,
                        consume_next=True))
        return candidates

    def select_best_candidate(self, candidates: List[CorrectionCandidate]):
        best_score = -1
        best = None
        for candidate in candidates:
            if candidate.score > best_score:
                best_score = candidate.score
                best = candidate
        return best

    def repair_token(self, token: Token, next_token: Optional[Token],
                     predicted_tokens: List[Token]) -> int:
        candidates = [
            CorrectionCandidate(self.holder.get(token.text), [token], False,
                                False)
        ]
        candidates.extend(self.split_candidates(token))
        previous_token = predicted_tokens[-1] if len(
            predicted_tokens) > 0 else None
        candidates.extend(
            self.merge_candidates(token, previous_token, next_token))
        if len(candidates) > 0:
            best_candidate = self.select_best_candidate(candidates)
            if best_candidate.consume_previous:
                predicted_tokens.pop()
            predicted_tokens.extend(best_candidate.tokens)
            return 2 if best_candidate.consume_next else 1
        predicted_tokens.append(token)
        return 1

    def correct(self, sequence: str) -> str:
        tokens = self.tokenizer.tokenize(sequence)
        n_tokens = len(tokens)
        tokens.append(None)
        predicted_tokens = []
        t_i = 0
        while t_i < n_tokens:
            token = tokens[t_i]
            next_token = tokens[t_i + 1]
            t_i += self.repair_token(token, next_token, predicted_tokens)
        predicted = tokens2sequence(predicted_tokens)
        return predicted
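
Note: CorrectionCandidate is not shown in these examples. The corrector constructs it with a frequency score, a list of replacement tokens and two flags, and reads the fields score, tokens, consume_previous and consume_next; a plausible sketch under that assumption:

from dataclasses import dataclass


# Assumed shape of the candidate record used by UnigramCorrector: a frequency
# score, the replacement tokens, and flags marking whether the previous / next
# input token is consumed by applying this candidate.
@dataclass
class CorrectionCandidate:
    score: int
    tokens: list
    consume_previous: bool
    consume_next: bool
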
Example #10
import project
from src.ngram.tokenizer import Tokenizer
from src.interactive.sequence_generator import interactive_sequence_generator

if __name__ == "__main__":
    tokenizer = Tokenizer()
    for sequence in interactive_sequence_generator():
        tokens = tokenizer.tokenize(sequence)
        print(tokens)