Example #1
    def __init__(self,
                 nouns,
                 noun_pos_features=None,
                 stems=None,
                 eomis=None,
                 extract_eomi=False,
                 extract_stem=False,
                 verbose=True):

        if not noun_pos_features:
            noun_pos_features = self._load_default_noun_pos_features()

        if not stems:
            stems = self._load_default_stems()

        if not eomis:
            eomis = self._load_default_eomis()

        self._nouns = nouns
        self._noun_pos_features = noun_pos_features
        self._stems = stems
        self._eomis = eomis
        self.verbose = verbose
        self.extract_eomi = extract_eomi
        self.extract_stem = extract_stem

        self._stem_surfaces = {
            l
            for stem in stems for l in _conjugate_stem(stem)
        }
        self.lrgraph = None
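
The constructor above expands every known stem into its conjugated surface forms and caches them in `self._stem_surfaces`. A minimal sketch of that flattening pattern, using a placeholder `conjugate_stem` that only stands in for the library's real `_conjugate_stem`:

# Sketch only: `conjugate_stem` is a placeholder, not the library's _conjugate_stem.
def conjugate_stem(stem):
    # A real implementation returns every conjugated surface form of the stem
    # (regular and irregular variants); this placeholder returns the stem unchanged.
    return {stem}

stems = {'먹', '가'}
stem_surfaces = {surface for stem in stems for surface in conjugate_stem(stem)}
print(stem_surfaces)  # {'먹', '가'} with the placeholder; larger with real conjugation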
Example #2
def main():
    testset = {
        '가', '감미롭', '곱', '구르', '그렇', '긋', '깨닫', '끄', '낫', '덥', '돌아오', '동그랗',
        '들', '벗', '사오', '삼가', '시퍼렇', '아깝', '아니꼽', '아니하', '아름답', '영원하', '오',
        '이', '이르', '좋', '주', '줍', '트', '파랗', '푸', '푸르', '하'
    }

    for stem in testset:
        print('{} -> {}'.format(stem, _conjugate_stem(stem)))
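
The example defines `main()` but never calls it; to run it as a script, the usual entry-point guard is needed (this assumes `_conjugate_stem` is defined or imported in the same module):

if __name__ == '__main__':
    main()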
Example #3
    def _transform_stem_as_surfaces(self):
        surfaces = set()
        for stem in self._stems:
            try:
                for l in _conjugate_stem(stem):
                    surfaces.add(l)
            except Exception as e:
                print('Exception stem = {}, {}'.format(stem, e))
                continue
        return surfaces
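
The surface set built here feeds later candidate generation (see `_candidates_from_stem_surfaces` in Example #4). A hypothetical sketch of how such a set can filter stem-like L parts of eojeols:

# Hypothetical values: the surfaces would normally come from _transform_stem_as_surfaces().
surfaces = {'가', '갔', '먹', '먹었'}
l_parts = ['갔', '하늘', '먹었']
stem_like = [l for l in l_parts if l in surfaces]
print(stem_like)  # ['갔', '먹었']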
Example #4
    def extract(self,
                condition=None,
                min_eomi_score=0.3,
                min_eomi_frequency=1,
                reset_lrgraph=True):

        # reset covered eojeol count and extracted eomis
        self._num_of_covered_eojeols = 0
        self._eomis = {}

        self._stem_surfaces = {
            l
            for stem in self._stems for l in _conjugate_stem(stem)
        }

        # base prediction
        candidates = self._candidates_from_stem_surfaces(condition)

        prediction_scores = self._batch_prediction(candidates, min_eomi_score,
                                                   self.min_num_of_features)

        eomi_surfaces = {
            eomi: score
            for eomi, score in prediction_scores.items()
            if (score[1] >= min_eomi_score)
        }

        if self.verbose:
            message = 'eomi lemmatization with {} candidates'.format(
                len(eomi_surfaces))
            self._print(message, replace=False, newline=True)

        self.lrgraph.reset_lrgraph()
        lemmas = self._eomi_lemmatize(eomi_surfaces)

        lemmas = {
            eomi: score
            for eomi, score in lemmas.items()
            if (score[0] >= min_eomi_frequency) and (
                score[1] >= min_eomi_score)
        }

        if self.logpath:
            with open(self.logpath + '_eomi_prediction_score.log',
                      'w',
                      encoding='utf-8') as f:
                f.write('eomi frequency score\n')

                for word, score in sorted(prediction_scores.items(),
                                          key=lambda x: -x[1][1]):
                    f.write('{} {} {}\n'.format(word, score[0], score[1]))

        if self.verbose:
            message = '{} eomis extracted with min frequency = {}, min score = {}'.format(
                len(lemmas), min_eomi_frequency, min_eomi_score)
            self._print(message, replace=False, newline=True)

        self._check_covered_eojeols(lemmas)  # TODO with lemma

        self._eomis = lemmas

        if reset_lrgraph:
            self.lrgraph.reset_lrgraph()

        del self._stem_surfaces

        lemmas_ = {
            eomi: EomiScore(score[0], score[1])
            for eomi, score in lemmas.items()
        }
        return lemmas_
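
A hedged usage sketch of the return value, assuming `EomiScore` is a namedtuple with `frequency` and `score` fields (as the construction `EomiScore(score[0], score[1])` above suggests) and that `extractor` is an instance of the class that owns `extract`:

# Hypothetical usage: list extracted eomis ordered by score.
eomis = extractor.extract(min_eomi_score=0.3, min_eomi_frequency=1)
for eomi, s in sorted(eomis.items(), key=lambda x: -x[1].score):
    print(eomi, s.frequency, s.score)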