def __init__(self, vocabs=base + '/data/nagisa_v001.dict',
             params=base + '/data/nagisa_v001.model',
             hp=base + '/data/nagisa_v001.hp',
             single_word_list=None):
    """Load vocabularies, hyper-parameters and the model; build the single-word pattern.

    Args:
        vocabs: Argument handed to ``utils.load_data``. The loaded object
            must unpack into five items: unigram-to-id, bigram-to-id,
            word-to-id, pos-to-id and word-to-postags mappings.
        params: Passed to ``model.Model`` together with the loaded
            hyper-parameters (presumably the trained parameter file --
            confirm against ``model.Model``).
        hp: Argument handed to ``utils.load_data`` to obtain the
            hyper-parameters.
        single_word_list: Optional collection of words that must be
            recognized as single words. Entries of length 1 or less are
            ignored. Words are treated as literal text, not as regular
            expressions.
    """
    # Load vocabulary files
    vocabs = utils.load_data(vocabs)
    (self._uni2id, self._bi2id, self._word2id,
     self._pos2id, self._word2postags) = vocabs
    self._id2pos = {idx: tag for tag, idx in self._pos2id.items()}
    self.id2pos = self._id2pos
    self.postags = list(self._pos2id)

    # Load a hyper-parameter file
    self._hp = utils.load_data(hp)

    # Construct a word segmentation model and a pos tagging model
    self._model = model.Model(self._hp, params)

    # If a word is included in the single_word_list,
    # it is recognized as a single word forcibly.
    self.pattern = None
    if single_word_list:
        words = [utils.preprocess(w) for w in single_word_list if len(w) > 1]
        # Regex alternation takes the first alternative that matches, so
        # longer words must come first or a shorter prefix would shadow them.
        words.sort(key=len, reverse=True)
        if words:
            # Escape each word so regex metacharacters ('+', '(', '.', ...)
            # are matched literally instead of breaking the pattern.
            self.pattern = re.compile('|'.join(re.escape(w) for w in words))
def __init__(self, vocabs=None, params=None, hp=None, single_word_list=None):
    """Load vocabularies, hyper-parameters and the model; build the single-word pattern.

    Args:
        vocabs: Argument handed to ``utils.load_data``; when None, the
            bundled ``/data/nagisa_v001.dict`` under ``base`` is used. The
            loaded object must unpack into five items: unigram-to-id,
            bigram-to-id, word-to-id, pos-to-id and word-to-postags
            mappings.
        params: Passed to ``model.Model`` together with the loaded
            hyper-parameters; when None, ``/data/nagisa_v001.model`` under
            ``base`` is used.
        hp: Argument handed to ``utils.load_data`` to obtain the
            hyper-parameters; when None, ``/data/nagisa_v001.hp`` under
            ``base`` is used.
        single_word_list: Optional collection of words that must be
            recognized as single words. Entries of length 1 or less are
            ignored. Words are treated as literal text, not as regular
            expressions.
    """
    if vocabs is None:
        vocabs = base + '/data/nagisa_v001.dict'
    if params is None:
        params = base + '/data/nagisa_v001.model'
    if hp is None:
        hp = base + '/data/nagisa_v001.hp'

    # Load vocabulary files
    vocabs = utils.load_data(vocabs)
    (self._uni2id, self._bi2id, self._word2id,
     self._pos2id, self._word2postags) = vocabs
    self._id2pos = {idx: tag for tag, idx in self._pos2id.items()}
    self.id2pos = self._id2pos
    self.postags = list(self._pos2id)

    # Load a hyper-parameter file
    self._hp = utils.load_data(hp)

    # Construct a word segmentation model and a pos tagging model
    self._model = model.Model(self._hp, params)

    # If a word is included in the single_word_list,
    # it is recognized as a single word forcibly.
    self.pattern = None
    if single_word_list:
        words = [utils.preprocess(w) for w in single_word_list if len(w) > 1]
        # Regex alternation takes the first alternative that matches, so
        # longer words go first. Sort on the real word length, before
        # escaping adds backslashes.
        words.sort(key=len, reverse=True)
        if words:
            # re.escape covers every regex metacharacter, not only the
            # parentheses, and avoids invalid '\(' string escapes.
            self.pattern = re.compile('|'.join(re.escape(w) for w in words))

    # If use_noun_heuristic is True, nouns are more likely to appear.
    self.use_noun_heuristic = u'名詞' in self._pos2id