예제 #1
0
    def __init__(self,
                 vocabs=base + '/data/nagisa_v001.dict',
                 params=base + '/data/nagisa_v001.model',
                 hp=base + '/data/nagisa_v001.hp',
                 single_word_list=None):
        """Load the vocabularies, hyper-parameters and model, and build the
        optional single-word pattern.

        Args:
            vocabs: Value handed to ``utils.load_data``; the loaded object
                must unpack into five items, stored as ``_uni2id``,
                ``_bi2id``, ``_word2id``, ``_pos2id`` and ``_word2postags``.
                Defaults to ``base + '/data/nagisa_v001.dict'``.
            params: Value handed to ``model.Model`` together with the loaded
                hyper-parameters. Defaults to
                ``base + '/data/nagisa_v001.model'``.
            hp: Value handed to ``utils.load_data`` to obtain the
                hyper-parameters. Defaults to
                ``base + '/data/nagisa_v001.hp'``.
            single_word_list: Optional iterable of words that should be
                treated as single words. Entries of length 1 or less are
                ignored. When at least one entry remains, ``self.pattern``
                is a compiled regex matching any of them literally;
                otherwise ``self.pattern`` is ``None``.
        """
        # Load vocabulary files
        vocabs = utils.load_data(vocabs)
        (self._uni2id, self._bi2id, self._word2id,
         self._pos2id, self._word2postags) = vocabs
        self._id2pos = {v: k for k, v in self._pos2id.items()}
        self.id2pos = self._id2pos
        self.postags = list(self._pos2id.keys())
        # Load a hyper-parameter file
        self._hp = utils.load_data(hp)
        # Construct a word segmentation model and a pos tagging model
        self._model = model.Model(self._hp, params)

        # If a word is included in the single_word_list,
        # it is recognized as a single word forcibly.
        self.pattern = None
        if single_word_list:
            words = [utils.preprocess(w) for w in single_word_list if len(w) > 1]
            if words:
                # Regex alternation tries alternatives left to right, so put
                # longer words first; otherwise a word that is a prefix of
                # another would always win over the longer one.
                words.sort(key=len, reverse=True)
                # Escape every word so that characters such as '(', '+' or
                # '.' are matched literally instead of being interpreted as
                # regex syntax (or making the pattern fail to compile).
                self.pattern = re.compile('|'.join(re.escape(w) for w in words))
예제 #2
0
    def __init__(self, vocabs=None, params=None, hp=None, single_word_list=None):
        """Load the vocabularies, hyper-parameters and model, and build the
        optional single-word pattern.

        Args:
            vocabs: Value handed to ``utils.load_data``; the loaded object
                must unpack into five items, stored as ``_uni2id``,
                ``_bi2id``, ``_word2id``, ``_pos2id`` and ``_word2postags``.
                ``None`` selects ``base + '/data/nagisa_v001.dict'``.
            params: Value handed to ``model.Model`` together with the loaded
                hyper-parameters. ``None`` selects
                ``base + '/data/nagisa_v001.model'``.
            hp: Value handed to ``utils.load_data`` to obtain the
                hyper-parameters. ``None`` selects
                ``base + '/data/nagisa_v001.hp'``.
            single_word_list: Optional iterable of words that should be
                treated as single words. Entries of length 1 or less are
                ignored. When at least one entry remains, ``self.pattern``
                is a compiled regex matching any of them literally;
                otherwise ``self.pattern`` is ``None``.
        """
        if vocabs is None:
            vocabs = base + '/data/nagisa_v001.dict'
        if params is None:
            params = base + '/data/nagisa_v001.model'
        if hp is None:
            hp = base + '/data/nagisa_v001.hp'

        # Load vocabulary files
        vocabs = utils.load_data(vocabs)
        (self._uni2id, self._bi2id, self._word2id,
         self._pos2id, self._word2postags) = vocabs
        self._id2pos = {v: k for k, v in self._pos2id.items()}
        self.id2pos = self._id2pos
        self.postags = list(self._pos2id.keys())
        # Load a hyper-parameter file
        self._hp = utils.load_data(hp)
        # Construct a word segmentation model and a pos tagging model
        self._model = model.Model(self._hp, params)

        # If a word is included in the single_word_list,
        # it is recognized as a single word forcibly.
        self.pattern = None
        if single_word_list:
            words = [utils.preprocess(w) for w in single_word_list if len(w) > 1]
            if words:
                # Regex alternation tries alternatives left to right, so put
                # longer words first; otherwise a word that is a prefix of
                # another would always win over the longer one.
                words.sort(key=len, reverse=True)
                # re.escape covers every regex metacharacter, not only the
                # parentheses, and avoids the invalid '\(' string escapes.
                self.pattern = re.compile('|'.join(re.escape(w) for w in words))

        # If use_noun_heuristic is True, nouns are more likely to appear.
        # The heuristic is enabled only when the noun tag exists in the
        # loaded POS-tag vocabulary.
        self.use_noun_heuristic = u'名詞' in self._pos2id