Example #1
import pytest

# HMMTokenizer is assumed to be importable from the package under test;
# `input_text` is expected to be a (parametrized) pytest fixture, and
# `pytest.helpers` is typically provided by the pytest-helpers-namespace plugin.
def test_segment(input_text):
    tokenizer = HMMTokenizer()
    tokenizer.load_model()

    result = tokenizer.segment(input_text)

    pytest.helpers.assert_token_equals(result, input_text)
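The same calls outside of a test, as a minimal sketch (the sample sentence is illustrative and a bundled default model is assumed):

tokenizer = HMMTokenizer()
tokenizer.load_model()  # load the bundled default HMM model
print(tokenizer.segment("我是中国人"))  # e.g. ['我', '是', '中国人']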
Example #2
# `tmpdir` is pytest's built-in temporary-directory fixture.
def test_persist(tmpdir):
    temp_path = tmpdir.mkdir("hmm")
    temp_path_str = str(temp_path)

    tokenizer = HMMTokenizer()
    tokenizer.train_one_line(["我", "是", "中国人"])
    tokenizer.train_one_line(["你", "打", "人"])
    tokenizer.do_train()
    tokenizer.persist_to_dir(temp_path_str)

    # The trained HMM model is expected to be persisted as three files.
    assert len(temp_path.listdir()) == 3
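A round-trip sketch built on the same calls, assuming (as in Example #3) that HMMTokenizer accepts a model directory in its constructor and that load_model reads from it:

import tempfile

model_dir = tempfile.mkdtemp()

tokenizer = HMMTokenizer()
tokenizer.train_one_line(["我", "是", "中国人"])
tokenizer.train_one_line(["你", "打", "人"])
tokenizer.do_train()
tokenizer.persist_to_dir(model_dir)

# Reload the persisted model and segment with it.
restored = HMMTokenizer(model_dir)
restored.load_model()
print(restored.segment("我是中国人"))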
Example #3
def init_hmm_tokenizer(self):
    if self.hmm_tokenizer is None:
        self.hmm_tokenizer = HMMTokenizer(self.model_dir)
        self.hmm_tokenizer.load_model()
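The guard makes initialization lazy and idempotent: the HMM model is read from disk only once, on the first call that needs it. A minimal sketch of how this is exercised (using the Tokenizer class from Example #5):

tokenizer = Tokenizer()            # no model loaded yet
tokenizer.init_hmm_tokenizer()     # loads the HMM model from tokenizer.model_dir
tokenizer.init_hmm_tokenizer()     # second call is a no-op
print(tokenizer.cut_by_HMM("我是中国人"))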
Example #4
class TokenizerLoader(object):
    Defaults = BaseDefaults

    factories = {
        'max_match_forward_tokenizer':
        lambda nlp, **cfg: MaxMatchForwardTokenizer(**cfg),
        'max_match_backward_tokenizer':
        lambda nlp, **cfg: MaxMatchBackwardTokenizer(**cfg),
        'max_match_bidirectional_tokenizer':
        lambda nlp, **cfg: MaxMatchBidirectionalTokenizer(**cfg),
        'dag_tokenizer':
        lambda nlp, **cfg: DAGTokenizer(**cfg),
        'hmm_tokenizer':
        lambda nlp, **cfg: HMMTokenizer(**cfg),
        'crf_tokenizer':
        lambda nlp, **cfg: CRFTokenizer(**cfg)
    }

    def __init__(self, meta=None, **kwargs):
        self.meta = {} if meta is None else meta
        self.tokenizers = {}

    def create_tokenizer(self, name, config=None):
        """Create a tokenizer component from a factory.

        name (unicode): Factory name to look up in `TokenizerLoader.factories`.
        config (dict): Configuration parameters to initialise the component.
        RETURNS: The created tokenizer component.
        """
        if name not in self.factories:
            raise KeyError(Errors.E002.format(name=name))
        factory = self.factories[name]
        return factory(self, **(config or {}))

    def add_tokenizer(self, component, name=None):
        # Reject anything that is not a tokenizer instance, e.g. a factory
        # name passed as a plain string instead of the created component.
        if not isinstance(component, BaseTokenizer):
            msg = Errors.E003.format(component=repr(component), name=name)
            if isinstance(component,
                          basestring_) and component in self.factories:
                msg += Errors.E004.format(component=component)
            raise ValueError(msg)
        if name is None:
            if hasattr(component, 'name'):
                name = component.name
            elif hasattr(component, '__name__'):
                name = component.__name__
            elif (hasattr(component, '__class__')
                  and hasattr(component.__class__, '__name__')):
                name = component.__class__.__name__
            else:
                name = repr(component)
        if name in self.tokenizers:
            raise ValueError(
                Errors.E007.format(name=name, opts=self.tokenizers.keys()))

        self.tokenizers[name] = component

    def from_disk(self, path, disable=tuple()):
        path = util.ensure_path(path)
        deserializers = OrderedDict()
        loader_name_to_tokenizer = defaultdict(list)
        loader_name_to_class = dict()
        loader_name_to_instance = dict()
        for name, tokenizer in self.tokenizers.items():
            if name in disable:
                continue

            # TODO: why does spaCy use this check?
            # if not hasattr(tokenizer, 'to_disk'):
            #     continue

            loader_class = tokenizer.get_loader()
            loader_name = loader_class.get_name()
            loader_name_to_tokenizer[loader_name].append(tokenizer)

            if name not in loader_name_to_class:
                loader_name_to_class[loader_name] = loader_class

        for loader_name, loader_class in loader_name_to_class.items():
            loader_config = self.meta.get('loader_config', {}).get(loader_name, {})
            loader_name_to_instance[loader_name] = loader_class.instance(
                **loader_config)

        for loader_name, tokenizers in loader_name_to_tokenizer.items():
            loader_instance = loader_name_to_instance[loader_name]

            # if hasattr(loader_instance, 'skip_load_from_disk'):
            #     continue

            # Each entry is a (deserializer, model_dir) pair; the default
            # arguments bind the current loader and tokenizer list into the
            # lambda so later loop iterations do not overwrite them.
            deserializers[loader_name] = (
                lambda p, loader_instance=loader_instance, tokenizers=tokenizers:
                loader_instance.from_disk(p, tokenizers),
                loader_instance.get_model_dir(),
            )

        exclude = {p: False for p in disable}
        util.from_disk(path, deserializers, exclude)
        return self

    def get_tokenizer(self):
        def assemble_max_match_bidirectional_tokenizer(forward_tokenizer,
                                                       backward_tokenizer):
            if forward_tokenizer and backward_tokenizer:
                bidirectional_tokenizer = MaxMatchBidirectionalTokenizer()
                bidirectional_tokenizer.forward_tokenizer = forward_tokenizer
                bidirectional_tokenizer.backward_tokenizer = backward_tokenizer

                return bidirectional_tokenizer

            return None

        forward_tokenizer = self.tokenizers.get('max_match_forward_tokenizer')
        backward_tokenizer = self.tokenizers.get('max_match_backward_tokenizer')

        tokenizer = Tokenizer()
        tokenizer.max_match_forward_tokenizer = forward_tokenizer
        tokenizer.max_match_backward_tokenizer = backward_tokenizer
        tokenizer.max_match_bidirectional_tokenizer = (
            assemble_max_match_bidirectional_tokenizer(forward_tokenizer,
                                                       backward_tokenizer))
        tokenizer.hmm_tokenizer = self.tokenizers.get('hmm_tokenizer')
        tokenizer.dag_tokenizer = self.tokenizers.get('dag_tokenizer')
        tokenizer.crf_tokenizer = self.tokenizers.get('crf_tokenizer')

        return tokenizer
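A minimal usage sketch for the loader (the model path and the chosen component names are illustrative; assembling the facade follows get_tokenizer above):

loader = TokenizerLoader(meta={'loader_config': {}})

# Build components from the registered factories and add them by name.
for name in ('max_match_forward_tokenizer', 'max_match_backward_tokenizer',
             'hmm_tokenizer', 'dag_tokenizer'):
    loader.add_tokenizer(loader.create_tokenizer(name), name=name)

# Restore every component's model from disk, then assemble the facade.
loader.from_disk('/path/to/model_dir')
tokenizer = loader.get_tokenizer()
print(tokenizer.cut("我是中国人"))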
Example #5
class Tokenizer(object):
    def __init__(self, model_dir=None):
        if model_dir is None:
            model_dir = default_model_dir

        self.model_dir = model_dir

        self.dag_tokenizer = None  # type: DAGTokenizer
        self.hmm_tokenizer = None  # type: HMMTokenizer
        self.max_match_forward_tokenizer = None  # type: MaxMatchForwardTokenizer
        self.max_match_backward_tokenizer = None  # type: MaxMatchBackwardTokenizer
        self.max_match_bidirectional_tokenizer = None  # type: MaxMatchBidirectionalTokenizer
        self.crf_tokenizer = None  # type: CRFTokenizer

    def init_dag_tokenizer(self):
        if self.dag_tokenizer is None:
            self.dag_tokenizer = DAGTokenizer(self.model_dir)
            self.dag_tokenizer.load_model()

    def cut_by_DAG(self, message):
        self.init_dag_tokenizer()
        return self.dag_tokenizer.segment(message)

    def init_hmm_tokenizer(self):
        if self.hmm_tokenizer is None:
            self.hmm_tokenizer = HMMTokenizer(self.model_dir)
            self.hmm_tokenizer.load_model()

    def cut_by_HMM(self, message):
        self.init_hmm_tokenizer()
        return self.hmm_tokenizer.segment(message)

    def cut_by_joint_model(self, message):
        solutions = [self.cut_by_DAG(message), self.cut_by_HMM(message)]
        merge_solutions = MergeSolutions()
        best_solution = merge_solutions.merge(solutions)

        return best_solution

    # By default, `cut` delegates to the DAG-based tokenizer.
    cut = cut_by_DAG

    def init_max_match_forward_tokenizer(self):
        if self.max_match_forward_tokenizer is None:
            self.max_match_forward_tokenizer = MaxMatchForwardTokenizer()
            self.max_match_forward_tokenizer.load_model()

    def cut_by_max_match_forward(self, message):
        self.init_max_match_forward_tokenizer()
        return self.max_match_forward_tokenizer.segment(message)

    def init_max_match_backward_tokenizer(self):
        if self.max_match_backward_tokenizer is None:
            self.max_match_backward_tokenizer = MaxMatchBackwardTokenizer()
            self.max_match_backward_tokenizer.load_model()

    def cut_by_max_match_backward(self, message):
        self.init_max_match_backward_tokenizer()
        return self.max_match_backward_tokenizer.segment(message)

    def init_max_match_bidirectional_tokenizer(self):
        if self.max_match_bidirectional_tokenizer is None:
            self.max_match_bidirectional_tokenizer = MaxMatchBidirectionalTokenizer()
            self.max_match_bidirectional_tokenizer.load_model()

    def cut_by_max_match_bidirectional(self, message):
        self.init_max_match_bidirectional_tokenizer()
        return self.max_match_bidirectional_tokenizer.segment(message)

    def init_crf_tokenizer(self):
        if self.crf_tokenizer is None:
            self.crf_tokenizer = CRFTokenizer()
            self.crf_tokenizer.load_model()

    def cut_by_CRF(self, message):
        self.init_crf_tokenizer()
        return self.crf_tokenizer.segment(message)

    def load_custom_dict(self, dict_file):
        # TODO: not implemented yet
        pass

    def add_word(self, word, freq=None):
        # TODO: not implemented yet
        pass

    def del_word(self, word):
        # TODO: not implemented yet
        pass

    def load_user_dict(self, dict_file):
        # Make sure the DAG tokenizer (and its dictionary) is loaded first.
        self.init_dag_tokenizer()
        return self.dag_tokenizer.dict_data.load_user_dict(dict_file)

    @property
    def mini_log_freq(self):
        # TODO: not implemented yet
        pass

    @property
    def average_log_freq(self):
        # TODO: not implemented yet
        pass
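A minimal usage sketch of the facade, assuming the default model directory bundled with the package (the input sentence is illustrative):

tokenizer = Tokenizer()

# Each cut_by_* method lazily loads the corresponding model on first use.
print(tokenizer.cut("我是中国人"))                 # DAG-based segmentation (the default `cut`)
print(tokenizer.cut_by_HMM("我是中国人"))          # HMM-based segmentation
print(tokenizer.cut_by_joint_model("我是中国人"))  # merged DAG + HMM solution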