def __init__(self, model_name="default", user_dict="default", postag=False):
    """Initialize the segmenter: resolve the model directory (downloading
    the model if needed), set up the user dictionary, and load the
    feature extractor and CRF model.

    Args:
        model_name: "default" for the bundled model, a name listed in
            ``config.available_models`` (downloaded into
            ``config.pkuseg_home`` on demand), or a path to a local
            model directory.
        user_dict: path to a user dictionary file, "default" for the
            bundled dictionary, or None to disable dictionary lookup.
        postag: if True, also download and load the POS-tagging model.
    """
    self.postag = postag

    # Resolve where the segmentation model lives.
    if model_name in ["default"]:
        # Bundled model shipped alongside this source file.
        config.modelDir = os.path.join(
            os.path.dirname(os.path.realpath(__file__)),
            "models",
            model_name,
        )
    elif model_name in config.available_models:
        # Known downloadable model: fetch it into pkuseg_home.
        config.modelDir = os.path.join(
            config.pkuseg_home,
            model_name,
        )
        download_model(
            config.model_urls[model_name],
            config.pkuseg_home,
            config.model_hash[model_name],
        )
    else:
        # Anything else is treated as a path to a local model directory.
        config.modelDir = model_name

    # Resolve the user dictionary (pre-processing) and the dictionaries
    # used by the post-processor.
    if user_dict is None:
        file_name = None
        other_names = None
    else:
        if user_dict not in config.available_models:
            file_name = user_dict
        else:
            file_name = None

        if model_name in config.models_with_dict:
            # This model ships its own dictionary; use it in addition to
            # the bundled default dictionary.
            other_name = os.path.join(
                config.pkuseg_home,
                model_name,
                model_name + "_dict.pkl",
            )
            default_name = os.path.join(
                os.path.dirname(os.path.realpath(__file__)),
                "dicts",
                "default.pkl",
            )
            other_names = [other_name, default_name]
        else:
            default_name = os.path.join(
                os.path.dirname(os.path.realpath(__file__)),
                "dicts",
                "default.pkl",
            )
            other_names = [default_name]

    self.preprocesser = Preprocesser(file_name)
    self.postprocesser = Postprocesser(None, other_names)

    self.feature_extractor = FeatureExtractor.load()
    self.model = Model.load()

    # Invert tag_to_idx so Viterbi output indices map back to tag strings.
    self.idx_to_tag = {
        idx: tag for tag, idx in self.feature_extractor.tag_to_idx.items()
    }

    self.n_feature = len(self.feature_extractor.feature_to_idx)
    self.n_tag = len(self.feature_extractor.tag_to_idx)

    if postag:
        # Fix: verify the POS-tagging model download against the postag
        # model's own hash. The original passed
        # config.model_hash[model_name] — the segmentation model's hash —
        # which is inconsistent with the url/hash pairing used above.
        download_model(
            config.model_urls["postag"],
            config.pkuseg_home,
            config.model_hash["postag"],
        )
        postag_dir = os.path.join(
            config.pkuseg_home,
            "postag",
        )
        self.tagger = Postag(postag_dir)
class pkuseg:
    """Chinese word segmenter backed by a CRF model, with optional user
    dictionaries and optional POS tagging."""

    def __init__(self, model_name="default", user_dict="default", postag=False):
        """Initialize the segmenter: resolve the model directory
        (downloading the model if needed), set up the user dictionary,
        and load the feature extractor and CRF model.

        Args:
            model_name: "default" for the bundled model, a name listed in
                ``config.available_models`` (downloaded into
                ``config.pkuseg_home`` on demand), or a path to a local
                model directory.
            user_dict: path to a user dictionary file, "default" for the
                bundled dictionary, or None to disable dictionary lookup.
            postag: if True, also download and load the POS-tagging model.
        """
        self.postag = postag

        # Resolve where the segmentation model lives.
        if model_name in ["default"]:
            # Bundled model shipped alongside this source file.
            config.modelDir = os.path.join(
                os.path.dirname(os.path.realpath(__file__)),
                "models",
                model_name,
            )
        elif model_name in config.available_models:
            # Known downloadable model: fetch it into pkuseg_home.
            config.modelDir = os.path.join(
                config.pkuseg_home,
                model_name,
            )
            download_model(
                config.model_urls[model_name],
                config.pkuseg_home,
                config.model_hash[model_name],
            )
        else:
            # Anything else is treated as a path to a local model directory.
            config.modelDir = model_name

        # Resolve the user dictionary (pre-processing) and the dictionaries
        # used by the post-processor.
        if user_dict is None:
            file_name = None
            other_names = None
        else:
            if user_dict not in config.available_models:
                file_name = user_dict
            else:
                file_name = None

            if model_name in config.models_with_dict:
                # This model ships its own dictionary; use it in addition
                # to the bundled default dictionary.
                other_name = os.path.join(
                    config.pkuseg_home,
                    model_name,
                    model_name + "_dict.pkl",
                )
                default_name = os.path.join(
                    os.path.dirname(os.path.realpath(__file__)),
                    "dicts",
                    "default.pkl",
                )
                other_names = [other_name, default_name]
            else:
                default_name = os.path.join(
                    os.path.dirname(os.path.realpath(__file__)),
                    "dicts",
                    "default.pkl",
                )
                other_names = [default_name]

        self.preprocesser = Preprocesser(file_name)
        self.postprocesser = Postprocesser(None, other_names)

        self.feature_extractor = FeatureExtractor.load()
        self.model = Model.load()

        # Invert tag_to_idx so Viterbi output indices map back to tags.
        self.idx_to_tag = {
            idx: tag for tag, idx in self.feature_extractor.tag_to_idx.items()
        }

        self.n_feature = len(self.feature_extractor.feature_to_idx)
        self.n_tag = len(self.feature_extractor.tag_to_idx)

        if postag:
            # Fix: verify the POS-tagging model download against the
            # postag model's own hash. The original passed
            # config.model_hash[model_name] — the segmentation model's
            # hash — which is inconsistent with the url/hash pairing
            # used above.
            download_model(
                config.model_urls["postag"],
                config.pkuseg_home,
                config.model_hash["postag"],
            )
            postag_dir = os.path.join(
                config.pkuseg_home,
                "postag",
            )
            self.tagger = Postag(postag_dir)

    def _cut(self, text):
        """Segment raw text directly with the CRF model (no user
        dictionary involvement).

        Args:
            text: a whitespace-free text fragment.

        Returns:
            List of word strings covering *text* in order.
        """
        examples = list(self.feature_extractor.normalize_text(text))
        length = len(examples)

        # Per-character feature index lists for the Viterbi decoder.
        all_feature = []  # type: List[List[int]]
        for idx in range(length):
            node_feature_idx = self.feature_extractor.get_node_features_idx(
                idx, examples
            )
            all_feature.append(node_feature_idx)

        _, tags = _inf.decodeViterbi_fast(all_feature, self.model)

        # Merge characters into words: a tag containing "B" starts a new
        # word; any other tag continues the current word.
        words = []
        current_word = None
        is_start = True
        for tag, char in zip(tags, text):
            if is_start:
                current_word = char
                is_start = False
            elif "B" in self.idx_to_tag[tag]:
                words.append(current_word)
                current_word = char
            else:
                current_word += char
        if current_word:
            words.append(current_word)

        return words

    def cut(self, txt):
        """Segment *txt* and return the result as a list.

        Args:
            txt: the input text; surrounding whitespace is ignored.

        Returns:
            A list of words, or a list of (word, tag) pairs when POS
            tagging is enabled.
        """
        txt = txt.strip()

        ret = []
        if not txt:
            return ret

        # Split on whitespace into independent fragments.
        imary = txt.split()

        # Segment each fragment.
        for w0 in imary:
            if not w0:
                continue

            # Split further around user-dictionary matches; entries that
            # are already known words (isw) are taken verbatim.
            lst, isword = self.preprocesser.solve(w0)
            for w, isw in zip(lst, isword):
                if isw:
                    ret.append(w)
                    continue

                output = self._cut(w)
                ret.extend(self.postprocesser(output))

        if self.postag:
            tags = self.tagger.tag(ret)
            ret = list(zip(ret, tags))

        return ret