Пример #1
0
    def learn_text(self, text, allow_new_words):
        """
        Count n-grams and add words to the auto-learn models.

        :param text: text to learn from
        :param allow_new_words: if False, tokens unknown to the
            persistent models are dropped before learning
        """
        if self.auto_learn_models:
            tokens, spans = pypredict.tokenize_text(text)

            # There are too many false positives with trailing
            # single quotes, remove them.
            # Do this here, because we still want "it's", etc. to
            # incrementally provide completions.
            for i, token in enumerate(tokens):
                if token.endswith("'"):
                    token = token[:-1]
                    if not token:  # shouldn't happen
                        token = "<unk>"
                    tokens[i] = token

            # if requested, drop unknown words
            if allow_new_words:
                token_sections = [tokens]
            else:
                token_sections = self._drop_new_words(tokens, spans,
                                                      self.persistent_models)
            models = self._model_cache.get_models(self.auto_learn_models)
            for model in models:
                # distinct loop variable; rebinding "tokens" here would
                # shadow the tokenized text above
                for section in token_sections:
                    model.learn_tokens(section)

            # lazy %-formatting: the message is only built when INFO
            # logging is actually enabled
            _logger.info("learn_text: tokens=%s", token_sections)

            # debug: save all learned text for later parameter optimization
            if config.log_learn:
                fn = os.path.join(config.user_dir, "learned_text.txt")
                with open(fn, "a") as f:
                    f.write(text + "\n")
Пример #2
0
 def learn_scratch_text(self, text):
     """ Count n-grams and add words to the scratch models. """
     token_list, _spans = pypredict.tokenize_text(text)
     # learn into every cached scratch model, allowing new words
     for scratch_model in self._model_cache.get_models(self.scratch_models):
         scratch_model.learn_tokens(token_list, True)
Пример #3
0
    def setUp(self):
        self._tmp_dir = tempfile.TemporaryDirectory(prefix="test_onboard_")
        self._dir = self._tmp_dir.name

        sample = "word1 word2 word3 word4 word5 word6"
        sample_tokens, _spans = pypredict.tokenize_text(sample)

        # Build one error-free model per n-gram order and remember
        # both the model object and its saved file contents.
        self._model_contents = []
        self._models = []
        for order in range(1, self.MAX_ORDER + 1):
            path = os.path.join(self._dir, "order{}.lm".format(order))
            model = (pypredict.UnigramModel() if order == 1
                     else pypredict.DynamicModel(order))
            model.learn_tokens(sample_tokens)
            model.save(path)

            with open(path, encoding="UTF-8") as f:
                contents = f.readlines()

            self._models.append(model)
            self._model_contents.append([path, contents])
Пример #4
0
    def setUp(self):
        self._tmp_dir = tempfile.TemporaryDirectory(prefix="test_onboard_")
        self._dir = self._tmp_dir.name

        learn_tokens, _spans = pypredict.tokenize_text(
            "word1 word2 word3 word4 word5 word6")

        # For every order 1..MAX_ORDER, train an error-free model,
        # save it, and keep (filename, lines) for later comparison.
        self._model_contents = []
        self._models = []
        for order in range(1, self.MAX_ORDER + 1):
            filename = os.path.join(self._dir, "order{}.lm".format(order))
            if order == 1:
                model = pypredict.UnigramModel()
            else:
                model = pypredict.DynamicModel(order)
            model.learn_tokens(learn_tokens)
            model.save(filename)

            with open(filename, encoding="UTF-8") as model_file:
                saved_lines = model_file.readlines()

            self._models.append(model)
            self._model_contents.append([filename, saved_lines])
Пример #5
0
 def learn_scratch_text(self, text):
     """ Count n-grams and add words to the scratch models. """
     words, _word_spans = pypredict.tokenize_text(text)
     scratch = self._model_cache.get_models(self.scratch_models)
     for m in scratch:
         # True: scratch models may add previously unseen words
         m.learn_tokens(words, True)
Пример #6
0
    def learn_text(self, text, allow_new_words):
        """
        Count n-grams and add words to the auto-learn models.

        :param text: text to learn from
        :param allow_new_words: forwarded to the models' learn_tokens();
            controls whether unseen words may be added
        """
        if self.auto_learn_models:
            tokens, spans = pypredict.tokenize_text(text)

            # Remove trailing single quote, too many false positives.
            # Do this here, because we still want "it's", etc. to
            # incrementally provide completions.
            for i, token in enumerate(tokens):
                if token.endswith("'"):
                    token = token[:-1]
                    if not token:  # shouldn't happen
                        token = "<unk>"
                    tokens[i] = token

            models = self._model_cache.get_models(self.auto_learn_models)
            for model in models:
                model.learn_tokens(tokens, allow_new_words)
            # lazy %-formatting: the message is only built when INFO
            # logging is actually enabled
            _logger.info("learn_text: tokens=%s", tokens[:10])

            # debug: save all learned text for later parameter optimization
            if config.log_learn:
                fn = os.path.join(config.user_dir, "learned_text.txt")
                with open(fn, "a") as f:
                    f.write(text + "\n")
Пример #7
0
 def tokenize_text(self, text):
     """
     Let the service find the words in text.

     :param text: string to tokenize
     :return: (tokens, spans) tuple from pypredict
     """
     # A constant "if 1:" toggle previously disabled a D-Bus round-trip
     # via self._call_method("tokenize_text", ([], []), text); the
     # unreachable branch is removed and pypredict is called directly.
     tokens, spans = pypredict.tokenize_text(text)
     return tokens, spans
Пример #8
0
 def tokenize_text(self, text):
     """
     Let the service find the words in text.

     :param text: string to tokenize
     :return: (tokens, spans) tuple from pypredict
     """
     # A constant "if 1:" toggle previously disabled a D-Bus round-trip
     # via self._call_method("tokenize_text", ([], []), text); the
     # unreachable branch is removed and pypredict is called directly.
     tokens, spans = pypredict.tokenize_text(text)
     return tokens, spans
Пример #9
0
 def tokenize_text(self, text):
     """
     Let the service find the words in text.
     """
     words, word_spans = pypredict.tokenize_text(text)
     return words, word_spans
Пример #10
0
 def tokenize_text(self, text):
     """
     Let the service find the words in text.
     """
     result = pypredict.tokenize_text(text)
     tokens, spans = result
     return tokens, spans