예제 #1
0
파일: __init__.py 프로젝트: zcaspar/spaCy
    def from_bytes(self, data, **kwargs):
        """Load the tokenizer state from a byte string and return ``self``.

        The payload is dispatched through ``util.from_bytes`` to four
        handlers: ``cfg`` (JSON, applied via ``self._set_config``),
        ``pkuseg_features`` and ``pkuseg_weights`` (kept as raw bytes) and
        ``pkuseg_processors`` (decoded from msgpack). When both the features
        and the weights are non-empty they are written to a temporary
        directory, a pkuseg segmenter is built from that directory and
        stored on ``self.pkuseg_seg``; any processor settings found are then
        applied to it.

        data: The serialized state, handed to ``util.from_bytes``.
        **kwargs: Accepted but not read anywhere in this method.
        RETURNS: ``self``.
        RAISES: ImportError if model bytes are present but ``pkuseg``
            cannot be imported.
        """
        # Filled in by the handler callbacks while util.from_bytes runs.
        received = {"features": b"", "weights": b"", "processors": None}

        def apply_cfg(raw):
            return self._set_config(srsly.json_loads(raw))

        def keep_features(raw):
            received["features"] = raw

        def keep_weights(raw):
            received["weights"] = raw

        def keep_processors(raw):
            received["processors"] = srsly.msgpack_loads(raw)

        deserializers = OrderedDict((
            ("cfg", apply_cfg),
            ("pkuseg_features", keep_features),
            ("pkuseg_weights", keep_weights),
            ("pkuseg_processors", keep_processors),
        ))
        # NOTE(review): the empty list is presumably the set of excluded
        # keys -- confirm against util.from_bytes.
        util.from_bytes(data, deserializers, [])

        features_b = received["features"]
        weights_b = received["weights"]
        if not (features_b and weights_b):
            # No complete pkuseg model in the payload: nothing more to do.
            return self

        with tempfile.TemporaryDirectory() as tempdir:
            model_dir = Path(tempdir)
            # pkuseg is pointed at a directory, so the in-memory model
            # bytes are written out to files first.
            model_files = (
                ("features.pkl", features_b),
                ("weights.npz", weights_b),
            )
            for filename, blob in model_files:
                with open(model_dir / filename, "wb") as fileh:
                    fileh.write(blob)
            try:
                import pkuseg
            except ImportError:
                raise ImportError(
                    "pkuseg not installed. To use this model, " +
                    _PKUSEG_INSTALL_MSG)
            self.pkuseg_seg = pkuseg.pkuseg(str(model_dir))

        processors = received["processors"]
        if processors:
            user_dict, do_process, common_words, other_words = processors
            self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(user_dict)
            postprocesser = self.pkuseg_seg.postprocesser
            postprocesser.do_process = do_process
            postprocesser.common_words = set(common_words)
            postprocesser.other_words = set(other_words)

        return self
예제 #2
0
 def load_pkuseg_processors(path):
     """Restore pkuseg pre/post-processor settings from a msgpack file.

     ``self`` is taken from the enclosing scope. The file at ``path`` is
     expected to unpack into four values: ``user_dict``, ``do_process``,
     ``common_words`` and ``other_words``. They are applied to
     ``self.pkuseg_seg`` only when that segmenter is set.

     path: Location of the msgpack file, passed to ``srsly.read_msgpack``.
     RAISES: ImportError if ``pkuseg`` cannot be imported while
         ``self.use_pkuseg`` is set or a segmenter is already loaded.
     """
     try:
         import pkuseg
     except ImportError:
         # A loaded segmenter cannot be updated without the package. Raise
         # the install hint here instead of falling through to an
         # UnboundLocalError on the ``pkuseg`` name further down.
         if self.use_pkuseg or self.pkuseg_seg:
             raise ImportError(self._pkuseg_install_msg)
         return
     if not self.pkuseg_seg:
         return
     data = srsly.read_msgpack(path)
     user_dict, do_process, common_words, other_words = data
     self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(user_dict)
     postprocesser = self.pkuseg_seg.postprocesser
     postprocesser.do_process = do_process
     postprocesser.common_words = set(common_words)
     postprocesser.other_words = set(other_words)
예제 #3
0
 def pkuseg_update_user_dict(self, words, reset=False):
     """Insert words into the pkuseg segmenter's preprocesser.

     Does nothing when ``self.pkuseg_seg`` is falsy.

     words: Iterable of strings; each is stripped of surrounding
         whitespace before being inserted.
     reset: If true, replace the preprocesser with a fresh
         ``pkuseg.Preprocesser(None)`` before inserting.
     RAISES: ImportError if ``reset`` is requested, ``pkuseg`` cannot be
         imported and ``self.use_pkuseg`` is set. When ``use_pkuseg`` is
         not set the failed reset is skipped and the words are still
         inserted.
     """
     segmenter = self.pkuseg_seg
     if not segmenter:
         return

     if reset:
         try:
             import pkuseg
             segmenter.preprocesser = pkuseg.Preprocesser(None)
         except ImportError:
             if self.use_pkuseg:
                 raise ImportError(
                     "pkuseg not installed: unable to reset pkuseg "
                     "user dict. Please " + _PKUSEG_INSTALL_MSG
                 )

     for entry in words:
         # NOTE(review): the empty second argument is presumably the
         # word's tag -- confirm against the pkuseg Preprocesser API.
         segmenter.preprocesser.insert(entry.strip(), '')