示例#1
0
class koNounExtract(object):
    """Noun-extraction helper; only a skeleton is visible in this snippet."""

    def __init__(self):
        # Shared project utilities and Korean final-consonant checker.
        self.util = Util()
        self.kolastChar = koLastCharCheck();

        # Project root directory, resolved via the shared Util helper.
        self.rootDirPath = self.util.getRootPath("SmiToText.SmiToText")

    def dictGenerate(self, sentence):
        # NOTE(review): the body is empty, so `mecabDictLine` is undefined and
        # calling this raises NameError. This looks like a truncated copy of
        # mecabDictGenerate.dictGenerate — confirm the intended implementation.

        return mecabDictLine
示例#2
0
def extract_multi_noun(text, item_counter=0):
    """Extract multi-word noun candidates from *text* and score them.

    Each line of *text* is cleaned (e-mail addresses and URLs removed), run
    through ``extract_mecab_multi_noun``, and the surviving candidates are
    collected with their scores.

    :param text: raw input text; processed line by line.
    :param item_counter: passed through to ``extract_mecab_multi_noun``.
    :return: result of ``sorted_dict`` over ``{word: score}`` for all lines.
    """
    text = text.strip()

    # Characters that disqualify a candidate: punctuation, bare Korean jamo,
    # and the space character (duplicates from the original list removed).
    banned_chars = frozenset(
        "'`,\"|!@#$%^&*()-_=+<>.;: "
        "ㄱㄴㄲㅂㅃㅈㅉㄷㄸㅁㅇㄹㅎㅅㅆㅍㅊㅌㅋ"
        "ㅛㅕㅑㅐㅔㅗㅓㅏㅣㅠㅜㅡ"
    )

    # BUG FIX: initialize the result once, outside the loop. The original
    # reset this dict for every line, so the final return only reflected the
    # last line of the input.
    line_array_multi_noun_score = {}

    for line in text.split("\n"):
        line = line.strip()

        # Strip e-mail addresses (BUG FIX: dot before the TLD is now escaped)
        # and URLs before noun extraction.
        line = re.sub(r'[\w.-]+@[\w.-]+\.\w+', '', line)
        line = re.sub(
            r'(http|ftp|https)://([\w+?\.\w+])+([a-zA-Z0-9\~\!\@\#\$\%\^\&\*\(\)_\-\=\+\\\/\?\.\:\;\'\,]*)?',
            '', line)

        multi_noun_list, multi_noun_list_score = extract_mecab_multi_noun(
            line, item_counter=item_counter)

        for word in multi_noun_list:
            if Util().check_email(word):
                continue
            # Drop empty / single-character candidates, the literal '기자'
            # ("reporter"), and today's day-of-month followed by '일'.
            stripped = word.strip()
            if not stripped or len(stripped) == 1:
                continue
            if word == '기자' or word == str(date.today().day) + '일':
                continue
            # Reject candidates containing any banned character.
            if any(char in banned_chars for char in word):
                continue
            if check_en_stopword(word):
                continue
            line_array_multi_noun_score[word] = multi_noun_list_score[word]

    return sorted_dict(line_array_multi_noun_score)
示例#3
0
class mecabDictGenerate(object):
    """Generates mecab user-dictionary CSV lines for Korean words."""

    def __init__(self):
        self.util = Util()
        # Checker for whether the last Korean character has a final consonant.
        self.kolastChar = koLastCharCheck()

        # Project root directory, resolved via the shared Util helper.
        self.rootDirPath = self.util.getRootPath("SmiToText.SmiToText")

    def dictGenerate(self, word, posTag="NNG", kind=''):
        """Return one mecab user-dictionary CSV line for *word*.

        :param word: the surface form to register.
        :param posTag: part-of-speech tag column (default ``NNG``).
        :param kind: optional label prefixed (with ``-``) to the reading column.
        :return: CSV line such as ``포커스 인,,,,NNG,*,T,포커스 인,*,*,*,*``.
        """
        # 1 means the final character carries a batchim (final consonant),
        # which selects the T/F jongseong flag in the CSV line.
        isLastChar = self.kolastChar.lastKoTextCheck(word)
        if kind:
            kind = kind + '-'

        # The two original branches differed only in this flag.
        jongseong_flag = "T" if isLastChar == 1 else "F"
        return (word + ",,,," + posTag + ",*," + jongseong_flag + ","
                + kind + word + ",*,*,*,*")

    def mecab_dict_gen_from_file(self,
                                 input_filename,
                                 output_filename,
                                 posTag='NNG',
                                 kind=''):
        """Convert a word-per-line file into a mecab dictionary CSV file.

        BUG FIX: the original read with ``readline`` and ``break`` on the
        first empty (stripped) line, silently truncating files containing
        blank lines; it also leaked both file handles on error. Blank lines
        are now skipped and the files are managed by ``with``.
        """
        linenum = 0
        with open(input_filename, mode='r', encoding='utf-8') as read_file, \
                open(output_filename, mode='w', encoding='utf-8') as write_file:
            for line in read_file:
                word = line.strip()
                if not word:
                    continue
                linenum += 1

                mecabDictLine = self.dictGenerate(word, posTag=posTag, kind=kind)
                print(mecabDictLine)
                write_file.write(mecabDictLine + "\n")

        print("LINE NUMBER END : ", linenum)
示例#4
0
def remove_stopword(multi_noun, multi_noun_score, stop_word=None):
    """Drop stopwords and trivial tokens from a noun/score collection.

    A noun is kept unless it is a stopword, an integer-like string, or ends
    in '니다' / '이다'.

    :param multi_noun: iterable of candidate nouns.
    :param multi_noun_score: mapping of noun -> score.
    :param stop_word: stopword collection; falls back to ``all_stop_word``
        when empty or ``None``.
    :return: result of ``sorted_dict`` over the surviving ``{noun: score}``.
    """
    # BUG FIX: the original tested `len(stop_word) == 0 or stop_word == None`,
    # which raises TypeError for None (len is evaluated first). It also used a
    # mutable default argument; `None` is now the default sentinel.
    if not stop_word:
        stop_word = all_stop_word

    filtered_score = {}
    for noun in multi_noun:
        if noun in stop_word or Util().is_int(noun):
            continue
        if str(noun).endswith(('니다', '이다')):
            continue
        filtered_score[noun] = multi_noun_score[noun]

    return sorted_dict(filtered_score)
示例#5
0
class koSoySpacing(object):
    """Korean spacing corrector backed by a soyspacing CountSpace model."""

    def __init__(self):
        self.util = Util()

    def train(self, filename):
        """Load the pre-trained soyspacing model and print a spacing demo.

        NOTE: *filename* is currently unused; paths are built from the
        project root, and the actual training calls are left commented out.
        """
        # Correction options for the parameterized call below.
        opt_verbose = False
        opt_min_count = 10
        opt_force_abs = 0.3
        opt_nonspace = -0.3
        opt_space = 0.3

        model = CountSpace()

        root = self.util.getRootPath("SmiToText.SmiToText")
        sep = os.path.sep
        corpus_fname = root + sep + "data" + sep + "koDetokenizerData" + sep + "ko_law_common_space.txt"
        model_fname = root + sep + "kosoy-models" + sep + "soyspacing.model"

        ### Training (disabled; a pre-trained model is loaded instead)
        # model.train(corpus_fname)
        # model.save_model(model_fname, json_format=False)

        ## Load the pre-trained model
        model.load_model(model_fname, json_format=False)

        # Demo sentence (alternative samples kept from the original):
        #sent = '이건진짜좋은영화 라라랜드진짜좋은영화'
        # sent = '그일단그구성원인사람들과,,'
        sent = 'DAB는, 결정과 관련한 각 위원들의 모든 일당 수수료와 경비에 대한 청구금액이 완전하게 지급될 때 까지는, 결정문을 발급할 의무를 갖지 아니한다.'

        squeezed = sent.replace(" ", "")

        # Correction with explicit parameters.
        corrected_with_params, tags = model.correct(doc=squeezed,
                                                    verbose=opt_verbose,
                                                    force_abs_threshold=opt_force_abs,
                                                    nonspace_threshold=opt_nonspace,
                                                    space_threshold=opt_space,
                                                    min_count=opt_min_count)

        # Correction with the library's default parameters.
        corrected_default, tags = model.correct(squeezed)

        print(sent)
        print(corrected_with_params)
        print(corrected_default)
示例#6
0
def check_stopword(multi_noun, multi_noun_score, stop_word=None):
    """Filter multi-word nouns whose tokens or squeezed form are stopwords.

    A noun is kept only if none of its space-separated tokens is a stopword,
    its space-stripped form is not a stopword, it is not integer-like, and it
    does not end in '니다' / '이다'.

    :param multi_noun: iterable of candidate nouns.
    :param multi_noun_score: mapping of noun -> score.
    :param stop_word: stopword collection; falls back to ``all_stop_word``
        when empty or ``None``.
    :return: result of ``sorted_dict`` over the surviving ``{noun: score}``.
    """
    # BUG FIX: the original tested `len(stop_word) == 0 or stop_word == None`,
    # which raises TypeError for None (len is evaluated first). It also used a
    # mutable default argument; `None` is now the default sentinel.
    if not stop_word:
        stop_word = all_stop_word

    # Build the set once; the original rebuilt set(stop_word) per noun.
    stop_set = set(stop_word)

    check_multi_noun_score = {}
    for noun in multi_noun:
        # Equivalent to the original difference-length comparisons:
        # no token of `noun` is a stopword, and the squeezed form isn't either.
        if stop_set.intersection(noun.split()):
            continue
        if noun.replace(' ', '') in stop_set:
            continue
        if Util().is_int(noun) or str(noun).endswith(('니다', '이다')):
            continue
        check_multi_noun_score[noun] = multi_noun_score[noun]

    return sorted_dict(check_multi_noun_score)
示例#7
0
import argparse
import os
from nltk.tokenize import sent_tokenize

from SmiToText.tokenizer.nltk import nltkSentTokenizer
from SmiToText.util.util import Util
from SmiToText.tokenizer import mecab
''' 
Mecab 을 이용한 명사 추출기 
Pos tagger 이용

'''

util = Util()


def expect_nng_text(text):
    nng_list = []
    vv_list = []

    replace_char_list = [
        '[', ']', '\'', '\"', ')', '(', '「', '」', '-', '’', ':', '/', '”', '“',
        '〃', '~', '-', 'ㆍ', '○', '◇', '△', '〈', '〉', '·', '+', '…', '#', '='
    ]

    check_word_end = [',', '.', ':', ';', '!', '?', '\"', '\'']

    check_vv = [
        'EC',
        'VV',
        'X',
示例#8
0
 def __init__(self):
     """Initialize with the shared project Util helper."""
     self.util = Util()
示例#9
0
        else:
            features.append('eos')
        return features

    def sent2words(self, sent):
        """Return only the word of every (word, tag) pair in *sent*."""
        words = []
        for token, _tag in sent:
            words.append(token)
        return words

    def sent2tags(self, sent):
        """Return only the tag of every (word, tag) pair in *sent*."""
        tags = []
        for _token, label in sent:
            tags.append(label)
        return tags

    def sent2features(self, sent):
        """Build the feature dict of every position via self.word2features."""
        features = []
        for position, _ in enumerate(sent):
            features.append(self.word2features(sent, position))
        return features


if __name__ == '__main__':
    util = Util()

    rootDirPath = util.getRootPath('SmiToText.SmiToText')

    koSpaceCheck = koCrfSpacing()

    #### 학습
    koSpaceCheck.raw2corpus(
        rootDirPath + '/data/koDetokenizerData/ko_law_common_space.txt',
        rootDirPath + '/data/koDetokenizerData/ko_law_common_space.txt.copus')
    koSpaceCheck.raw2corpus(
        rootDirPath + '/data/koDetokenizerData/kospacing_Train.txt',
        rootDirPath + '/data/koDetokenizerData/kospacing_Train.txt.copus')
    # raw_train.txt에 뭔가 긴 글이 있음

    train_sents = koSpaceCheck.corpus2sent(
示例#10
0
    def __init__(self):
        """Set up shared helpers and resolve the project root path."""
        self.util = Util()
        # Checker for whether a Korean word ends with a final consonant.
        self.kolastChar = koLastCharCheck()

        # Project root directory, resolved via the shared Util helper.
        self.rootDirPath = self.util.getRootPath("SmiToText.SmiToText")