Example #1
def get_raw_sentence(path):

    with open(path, 'r', encoding='utf-8-sig') as f:
        sentences = f.readlines()

    sentences = list(map(make_raw_sentence, sentences))
    sentences = [sentence.strip() for sentence in sentences]
    return sentences
Example #2
def clean_korean_documents_simple_version(documents):
    # Text cleaning: strip HTML tags
    for i, document in enumerate(documents):
        document = BeautifulSoup(document, 'html.parser').text
        #print(document) #스토리가 진짜 너무 노잼
        documents[i] = document

    # Text cleaning: remove special characters
    for i, document in enumerate(documents):
        document = re.sub(r'[^ ㄱ-ㅣ가-힣]', '', document)  # keep only Hangul and spaces (regex)
        #print(document)
        documents[i] = document

    # Text cleaning: extract morphemes
    for i, document in enumerate(documents):
        eunjeon = Mecab()
        clean_words = []
        for word in eunjeon.morphs(document):
            clean_words.append(word)
        #print(clean_words) #['스토리', '진짜', '노잼']
        document = ' '.join(clean_words)
        #print(document) #스토리 진짜 노잼
        documents[i] = document

    return documents
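For reference, a minimal usage sketch of the function above. The imports and the sample review strings are assumptions added here, and the exact morphemes depend on the installed mecab-ko-dic dictionary.

from bs4 import BeautifulSoup
from eunjeon import Mecab
import re

# Hypothetical input: raw reviews that still contain HTML tags and punctuation
raw_reviews = [
    "<p>스토리가 진짜 너무 노잼!!</p>",
    "<div>배우 연기는 정말 좋았다 :)</div>",
]
cleaned = clean_korean_documents_simple_version(raw_reviews)
print(cleaned[0])  # e.g. '스토리 가 진짜 너무 노잼' (space-joined morphemes)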
Example #3
def predict_(request):
    if request.method == "GET":
        start = time.time()
        review = request.GET["review"]
        print("Get Review _________________________________")

        pkl = joblib.load('./restapi/reviewSentiment.pkl')
        model = FastText.load('./restapi/FastText_embedding.model')
        print("load Model __________________________________")

        okt = Mecab()
        review_text = re.sub("[^가-힣ㄱ-ㅎㅏ-ㅣ\\s]", "", review)
        word_review = okt.morphs(review_text)
        word_review = ' '.join(word_review)
        print("preprocessing _______________________")

        feature_vector = np.zeros((100), dtype=np.float32)
        num_words = 0
        index2word_set = set(model.wv.index2word)
        for w in word_review.split():
            if w in index2word_set:
                num_words += 1
                feature_vector = np.add(feature_vector, model.wv[w])
        feature_vector = np.divide(feature_vector, num_words)
        print("predict _____________________________________")

        result = pkl.predict([feature_vector])
        print("time :", time.time() - start)  # 현재시각 - 시작시간 = 실행 시간
        if result[0] == 1:
            return redirect('emotion/1')
        else:
            return redirect('emotion/0')
Example #4
class Chuncker:
    def __init__(self):

        self.tagger = Mecab()
        self.Bi_charcter_feature = []

    def get_feautre(self, query):
        self.Bi_charcter_feature = []

        TKs = self.tagger.morphs(query)

        for TK in TKs:
            if len(TK) > 1:
                for i in range(1, len(TK)):
                    self.Bi_charcter_feature.append(str(TK[i - 1:i + 1]))

        #print(self.Bi_charcter_feature)

    def get_chunk_score(self, paragraph):
        score = 0

        for ch_feat in self.Bi_charcter_feature:
            if paragraph.find(ch_feat) != -1:
                score += 1

        return 1 + score / len(self.Bi_charcter_feature)
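A short, hypothetical usage sketch of the Chuncker class above; the query and paragraph strings are made up. The score simply counts how many character bigrams from the query's tokens occur in each candidate paragraph.

# Hypothetical usage of Chuncker
chunker = Chuncker()
chunker.get_feautre("한국의 수도는 어디인가")  # build bi-character features from the query
paragraphs = [
    "서울은 대한민국의 수도이다.",
    "이 문단은 질문과 전혀 관련이 없다.",
]
scores = [chunker.get_chunk_score(p) for p in paragraphs]
print(paragraphs[scores.index(max(scores))])  # paragraph sharing the most bigrams with the query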
Example #5
def embed(data):
    mecab = Mecab()
    inputs = []
    labels = []
    for encode_raw in data['encode']:
        encode_raw = mecab.morphs(encode_raw)
        encode_raw = list(
            map(lambda x: encode_raw[x] if x < len(encode_raw) else '#',
                range(encode_length)))
        if (embed_type == 'onehot'):
            bucket = np.zeros(vector_size, dtype=float).copy()
            input = np.array(
                list(
                    map(
                        lambda x: onehot_vectorize(bucket, x)
                        if x in model.wv.index2word else np.zeros(vector_size,
                                                                  dtype=float),
                        encode_raw)))
        else:
            input = np.array(
                list(
                    map(
                        lambda x: model[x] if x in model.wv.index2word else np.
                        zeros(vector_size, dtype=float), encode_raw)))
        inputs.append(input.flatten())

    for decode_raw in data['decode']:
        label = np.zeros(label_size, dtype=float)
        np.put(label, decode_raw, 1)
        labels.append(label)
    return inputs, labels
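The embed() function above leans on several module-level names from the surrounding project (encode_length, vector_size, label_size, embed_type, model and onehot_vectorize). Assuming those are already configured, a hypothetical call might look like the sketch below; the sentences and labels are illustrative only.

# Hypothetical input: 'encode' holds raw sentences, 'decode' holds integer class labels
data = {
    'encode': ['피자 주문 할게', '음료는 뭘로 할까'],
    'decode': [1, 2],
}
inputs, labels = embed(data)
# inputs: one flattened embedding matrix per sentence; labels: one-hot vectors of length label_size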
Example #6
def make_keyword(sample_data):
    '''
    :param sample_data: DataFrame
    :return: tokenizes the Content column -> returns the top-10 keyword list (and the vocabulary DataFrame)
    '''
    # Define stopwords
    stopwords = ['어때','아하','어때요','니깐','니까','거든','을까','할까','거든요','많이','조금','습니당','습니다','입니다','니다','여러분','라도','만나','어디',
                 '이렇게','저렇게','은데','한데','응','아직','응응','그래','오키','든요','어떻게','왜','감사','고맙','죄송','랑','이랑','지만','하지만',
                 '화이팅','파이팅','습니다','슴당','아요','에요','예요','아용','가용','바로','그냥','수정','파일','보내','올려','이모티콘', '따로',
                 '다고', '구나', 'ㅠㅠㅠㅠ', 'ㅠㅠㅠ', '잖아', '그거', '부분', '어제', '내일', '오늘', '을까요', '괜찮', '으면', '해야',
                 'ㅇㅋ', '각자', '이건', '이거', '상관없', '사진', '께서', '드릴게요', '오후', '오전', '우선', '걸로', '이번', '해도', '할까요', '월요일',
                 '화요일', '수요일', '목요일', '금요일', '토요일', '일요일', '까지', '드려요', '너무', '해요', '네네', '오늘', '다음', '아서', '셔서', '올리',
                 '진짜', '오빠', '누나', '언니', '의', '가', '이', '은', '들', '는', '좀', '잘', '걍', '과', '도', '를', '으로', '자', '에',
                 '와', '한', '하다', '다', '고', '을', '하', '있', '게', '보', '없', '세요', '아요', '습니다', '이', '있', '하', '것', '들',
                 '그', '되', '수', '이', '보', '않', '없', '나', '사람', '주', '아니', '등', '같', '우리', '때', '년', '가', '한', '지', '어요',
                 '네요', '대하', '오', '말', '일', '그렇', '이나', '위하', '는데', '있', '하', '것', '들', '그', '되', '수', '이', '보', '않',
                 '없', '나', '사람', '주', '아니', '등', '같', '우리', '때', '년', '가', '한', '지', '대하', '오', '말', '일', '그렇', '위하',
                 '때문', '그것', '두', '말하', '알', '그러나', '받', '못하', '일', '그런', '또', '문제', '더', '사회', '많', '그리고', '좋', '크',
                 '따르', '중', '나오', '가지', '씨', '시키', '만들', '지금', '생각하', '그러', '속', '하나', '집', '살', '모르', '적', '월', '데',
                 '자신', '안', '어떤', '내', '내', '경우', '명', '생각', '시간', '그녀', '다시', '이런', '앞', '보이', '번', '나', '다른', '어떻',
                 '여자', '개', '전', '들', '사실', '이렇', '점', '싶', '말', '정도', '좀', '원', '잘', '통하', '소리', '놓', '그럼', '혹시', '니다',
                 '에서', '아침', '점심', '저녁', '해서', '어서', '감사', '수고', '저희', '근데', '일단', '나요', '부터', '합니다', '니까', '안녕', '입니다']
    file_extension_list=make_file_extension_list("resource\\filename_extension_list.txt")
    for extension in file_extension_list:
        x=extension.replace(".","")
        stopwords.append(x)

    # Tokenization and stopword removal
    tokenizer = Mecab()
    tokenized = []
    for sentence in sample_data['Content']:
        temp = tokenizer.morphs(sentence)  # tokenize
        temp = [word for word in temp if not word in stopwords]  # remove stopwords
        temp = [word for word in temp if len(word) > 1]
        tokenized.append(temp)

    # Convert the word frequency counts into a DataFrame
    vocab = FreqDist(np.hstack(tokenized))
    vocab = pd.DataFrame.from_dict(vocab, orient='index', columns=['count'])
    vocab = vocab.sort_values(by='count', ascending=False)
    vocab.reset_index(inplace=True)


    # Extract the 10 most frequently mentioned words
    dataf_10 = vocab.iloc[0:10]
    dataf_10 = dataf_10.reset_index(drop=True)
    data10_dic = dataf_10['index']
    data10_dic = pd.Series(data10_dic)
    data10_list = list(data10_dic)
    print("키워드 추출완료\n")
    return data10_list, vocab
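A hypothetical call of make_keyword(), assuming the make_file_extension_list() helper, its resource\filename_extension_list.txt file, and the module-level imports (pandas, numpy, nltk's FreqDist, eunjeon's Mecab) exist as in the original project; the chat messages are made up.

# Hypothetical usage of make_keyword()
import pandas as pd

sample_data = pd.DataFrame({'Content': ['내일 회의 자료 공유해 주세요',
                                        '회의 시간은 오후 세 시로 정했습니다']})
keywords, vocab = make_keyword(sample_data)
print(keywords)  # up to 10 most frequent tokens after stopword and length filtering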
Example #7
 def isKnown(self, text):
     if len(text) == 0: return True
     m = Mecab()
     # m = Mecab(dicpath='C:/mecab/mecab-ko-dic') # (사용불가능, 비활성)
     # m = Mecab(dicpath='/usr/local/lib/mecab/dic/mecab-ko-dic') # (사용불가능, 비활성)
     for i in m.morphs(text):
         if m.pos(
                 i
         )[0][1] == 'UNKNOWN':  # or maybe include when first letter is 'S' too?
             # print(i)
             # it is not RP
             return False
     return True
Example #8
def train_vector_model(str_buf):

    mecab = Mecab()
    pos1 = mecab.pos(''.join(str_buf))
    pos2 = ' '.join(list(map(lambda x: '\n'
                             if x[1] in ['SF'] else x[0], pos1))).split('\n')
    morphs = list(map(lambda x: mecab.morphs(x), pos2))
    print(str_buf)
    model = word2vec.Word2Vec(size=vector_size, window=2, min_count=1)
    model.build_vocab(morphs)
    model.train(morphs, epochs=model.epochs, total_examples=model.corpus_count)
    return model
Example #9
def train_vector_model(str_buf):
    mecab = Mecab()
    #mecab로 POS Tagging
    pos1 = mecab.pos(''.join(str_buf))
    #문장별로 list로 나눔 마침표등이 존재시 줄바꾸기 (문장이길경우)
    pos2 = ' '.join(list(map(lambda x: '\n'
                             if x[1] in ['SF'] else x[0], pos1))).split('\n')
    #단어구성을 위한 형태소단위 문장 쪼개기
    morphs = list(map(lambda x: mecab.morphs(x), pos2))
    model = word2vec.Word2Vec(size=vector_size, window=2, min_count=1)
    model.build_vocab(morphs)
    model.train(morphs, total_examples=model.corpus_count, epochs=model.iter)
    return model
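Both versions of train_vector_model() above also expect a module-level vector_size constant and use the gensim 3.x keyword arguments (size=, iter=). Under those assumptions, a hypothetical end-to-end call could look like this:

# Hypothetical call; vector_size is assumed to be defined at module level as in the original project
vector_size = 50
model = train_vector_model(['안녕하세요. 만나서 반갑습니다.', '피자를 주문하고 싶어요'])
print(model.wv.most_similar('피자'))  # nearest neighbours in the learned embedding space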
Example #10
def pos_tag(sentences):

    # Set up the eunjeon morphological analyzer (Mecab)
    tagger = Mecab()

    # Initialize the list of processed sentences
    sentences_pos = []

    # Iterate over all sentences
    for sentence in sentences:
        # Remove special characters
        sentence = re.sub(RE_FILTER, "", sentence)

        # Join the morpheme list returned by the analyzer with spaces
        sentence = " ".join(tagger.morphs(sentence))
        sentences_pos.append(sentence)

    return sentences_pos
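RE_FILTER is a module-level pattern in the original project and is not shown here. A minimal sketch of how pos_tag() might be driven, with an assumed punctuation filter:

import re
from eunjeon import Mecab

# Assumed definition of the module-level filter used by pos_tag()
RE_FILTER = re.compile("[.,!?\"':;~()]")

print(pos_tag(["가끔 궁금해.", "나도 가끔 궁금해!"]))
# e.g. ['가끔 궁금 해', '나 도 가끔 궁금 해'] (exact morphemes depend on the dictionary)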
Example #11
def inference_embed(data):
    mecab = Mecab()
    encode_raw = mecab.morphs(data)
    encode_raw = list(
        map(lambda x: encode_raw[x] if x < len(encode_raw) else '#',
            range(encode_length)))
    if (embed_type == 'onehot'):
        bucket = np.zeros(vector_size, dtype=float).copy()
        input = np.array(
            list(
                map(
                    lambda x: onehot_vectorize(bucket, x)
                    if x in model.wv.index2word else np.zeros(vector_size,
                                                              dtype=float),
                    encode_raw)))
    else:
        input = np.array(
            list(
                map(
                    lambda x: model[x] if x in model.wv.index2word else np.
                    zeros(vector_size, dtype=float), encode_raw)))
    return input
Example #12
stopwords = [
    '의', '가', '며', '들', '는', '됨', '좀', '걍', '과', '를', '을', '으로', '에', '와', '한',
    '하다', '개체', '으나', '관리', '번호', '면', '함', '쪽', '줄', '신고자',
    '혼종', '고양이', '묘', '발견', '추정', '생후', '개월', '남음', '믹스', '구조', '음', '고'
]

# #### 3) 토큰화 (tokenized) - 형태소 분석기 1. Mecab

# +
tokenizer = Mecab()

tokenized = []
for sentence in text_df['specialmark']:
    temp = []
    temp = tokenizer.morphs(sentence)  # 토큰화
    temp = [word for word in temp if not word in stopwords]  # 불용어 제거
    tokenized.append(temp)

print(tokenized[:30])
# -

new = pd.DataFrame(tokenized)
new

# #### 3) 토큰화 (tokenized) - 형태소 분석기 2. Okt

sample_data = text_df.copy()

tokenizer2 = Okt()
Example #13
    def __init__(self, inputPath, index=3, words=2, standard=0.3):

        # Extract compound words from a CSV file, a TXT file, or raw article text
        # Python version 3.6
        # Required packages: kss, eunjeon, pandas
        # A later migration from eunjeon to konlpy is planned

        # Installing mecab-ko-dic on Linux
        # wget -c https://bitbucket.org/eunjeon/mecab-ko-dic/downloads/최신버전-mecab-ko-dic.tar.gz
        # tar zxfv  최신버전-mecab-ko-dic.tar.gz
        # cd 최신버전-mecab-ko-dic
        # ./configure
        # make
        # make check
        # sudo make install
        # After these steps mecab-ko-dic is installed under /usr/local/lib/mecab/dic/mecab-ko-dic

        # Parameters
        # inputPath: path of the CSV or TXT file (a very long value is treated as the raw text itself)
        # outputPath: path of the text file to write
        # index: row number of the text to load from the CSV table
        # words: number of words forming a compound word (default: 2)
        # standard: minimum TR+PMI score a candidate must reach (provisional: 0.3)

        # inputPath가 길 경우 원문으로 인식
        if len(inputPath) > 50: self.data = inputPath
        else: self.data = self.extractText(inputPath)

        # 파일을 호출해서 행 번호(index)에 있는 값을 TXT에 저장
        txt = self.clean_str(self.readValue(self.data, index))

        # target = "문서전체내용"
        self.target = self.clean_str(txt)
        m = Mecab()
        # m = Mecab(dicpath='C:/mecab/mecab-ko-dic') # (사용불가능, 비활성)
        # m = Mecab(dicpath='/usr/local/lib/mecab/dic/mecab-ko-dic') # (사용불가능, 비활성)

        # wTotal = (int)문서 내 명사 갯수
        # fList = [["문장1명사1", "문장1명사2", ...], ["문장2명사1", "문장2명사2", ...], ...]
        # mList = ["문장1명사1", "문장1명사2", ..., "문장2명사1", "문장2명사2" ...] 중복 제거
        # lList = [["문장1형태소1", "문장1형태소2", ...], ["문장2형태소1", "문장2형태소2", ...], ...]
        self.wTotal = len(m.nouns(self.target))
        self.fList = self.nounExt(kss.split_sentences(self.target))
        self.mList = list(
            dict.fromkeys([item for sublist in self.fList
                           for item in sublist]))
        self.lList = []
        for i in kss.split_sentences(self.target):
            l = []
            for j in m.morphs(i):
                l.append(j)
            self.lList.append(l)

        # N그램 변수 (형태소의 갯수, 또는 문자수. 문자수 기반 N그램 현재 사용불가능)
        self.ngram = 8
        # 복합단어를 이룰 단어의 갯수
        self.nOfWords = words
        # 제동변수
        self.df = 0.85
        # 텍스트랭크 반복횟수 (임시:16)
        self.defIteration = 16

        # allCW = [["단어1", "단어2", ...], ["단어a", "단어b", ...], ...] 복합단어의 가능성이 있는 모든 명사 리스트의 리스트
        self.allCW = []
        for i in range(len(self.fList)):
            n = self.genCW(self.fList[i])
            for j in n:
                # 문서를 검색하는 방식
                # if self.complexSearch(j, self.target) > 1 and j not in self.allCW: # 띄어쓰기 경우의 수를 모두 검색 (사용가능, 비활성)
                if self.searchSpaceless(
                        j, self.target
                ) > 1 and j not in self.allCW:  # 본문 그대로 검색 (활성)
                    self.allCW.append(j)
        # 일부분 중복되는 복합단어를 탐지한 뒤 추가
        # self.allCW += self.detectRedundant(self.allCW) # (사용가능, 비활성)

        # trdic = {"단어1": TR1, "단어2": TR2, ...} (기존방식) (활성))
        self.trdic = self.calculateTROld(self.mList, self.fList,
                                         self.defIteration)

        # trdic = {"단어1": TR1, "단어2": TR2, ...} (N그램 방식) (사용가능, 비활성)
        # self.trdic = self.calculateTR(self.mList, self.lList, self.ngram, self.defIteration)

        # pmiList = [PMI1, PMI2, ...] allCW의 복합단어의 PMI 점수 리스트
        pmiList = []
        for i in self.allCW:
            pmiList.append(self.getPMI(i, self.wTotal, self.target))

        # trmpiList = [TRPMI1, TRPMI2, ...] 복합단어를 구성하는 TR의 기하평균 곱하기 복합단어의 PMI
        trpmiList = []
        for i in range(len(self.allCW)):
            k = self.allCW[i]
            key = 1
            for j in k:
                key *= self.trdic[j]
            key **= (1 / len(k))
            key *= pmiList[i]
            trpmiList.append(key)

        #gluedCW = ["복합단어1", "복합단어2", ...] allCW의 단어 구성 리스트를 합친 스트링 리스트
        gluedCW = []
        for i in self.allCW:
            gluedCW.append(''.join(i))

        # compDict = {"복합단어1": 1.11, "복합단어2": 2.22, ...}
        # 중복된 복합단어가 없는 경우
        if len(self.detectDuplicates(gluedCW)) == 0:
            self.compDict = dict(zip(gluedCW, trpmiList))
        # 중복된 복합단어가 있는 경우
        else:
            self.compDict = self.eliminateDuplicates(gluedCW, trpmiList)

        self.out = []
        for i in self.compDict.items():
            if i[1] > standard:
                self.out.append(i[0])
Example #14
from eunjeon import Mecab
from tensorflow.keras.preprocessing.text import text_to_word_sequence
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import tensorflow as tf

docs = [
    '먼저 텍스트의 각 단어를 나누어 토큰화합니다.', '텍스트의 단어로 토큰화해야 딥러닝에서 인식됩니다.',
    '토큰화한 결과는 딥러닝에서 사용할 수 있습니다.'
]

m = Mecab()
x = m.morphs('제가이렇게띄어쓰기를전혀하지않고글을썼다고하더라도글을이해할수있습니다.')
x = [' '.join(x), 'xxx', 'zzz', 'gasdgasdg']
x = np.array(x)
print(x)
print('[MeCab 형태소 분석기]')
print(m.morphs('제가이렇게띄어쓰기를전혀하지않고글을썼다고하더라도글을이해할수있습니다.'))
print('[Keras 문장 단어 분석기]')
print(text_to_word_sequence('제가이렇게띄어쓰기를전혀하지않고글을썼다고하더라도글을이해할수있습니다.'))
token = Tokenizer()
token.fit_on_texts(x)

print("[몇개의 단어]")
print(token.word_counts)  # 몇개의 단어가 나오는지
print("[몇개의 문장]")
print(token.document_count)  # 몇개의 문장이 나오는지
print("[각 단어들이 몇 개의 문장에 나오는가]")
print(token.word_docs)  # 각 단어들이 몇 개의 문장에 나오는가
print("[각 단어에 매겨진 인덱스 값]")
Example #15
    def __init__(self, inputText, inputCorpus=None):

        # Extract neologisms (newly coined words) from a source document
        # Python version 3.6
        # Required packages: eunjeon, pandas
        # A later migration from eunjeon to konlpy is planned

        # Installing mecab-ko-dic on Linux
        # wget -c https://bitbucket.org/eunjeon/mecab-ko-dic/downloads/최신버전-mecab-ko-dic.tar.gz
        # tar zxfv  최신버전-mecab-ko-dic.tar.gz
        # cd 최신버전-mecab-ko-dic
        # ./configure
        # make
        # make check
        # sudo make install
        # After these steps mecab-ko-dic is installed under /usr/local/lib/mecab/dic/mecab-ko-dic

        # Current limitation: the corpus input is not consumed
        # Need to implement Corpus input and put it together into one string, or find other way to search the corpus

        # doc = "원문전체 문자열"
        self.doc = self.clean_str(inputText)
        # corpus = 말뭉치. 데이터 형태 미정 (사용불가능, 비활성)
        if inputCorpus == None: self.corpus = ' ' + self.doc
        else: self.corpus = self.clean_str(' ' + inputCorpus)

        # wTotal = 말뭉치 총 어절 수
        self.wTotal = self.corpus.count(' ')

        l = self.doc.split(' ')
        self.eoList = [i for i in l if len(re.sub(r'[0-9]+', '-', i)) >= 3]

        # 괄호가 포함된 어절 출력
        missed = []
        for i in self.eoList:
            if i.count("(") > 0 and i.count(")") > 0:
                missed.append(i[i.find("(") + 1:i.find(")")])
                continue
            if i.count("(") > 0:
                missed.append(i.split("(", 1)[1])
            if i.count(")") > 0:
                missed.append(i[:-1])
        parenthesisless = [
            x for x in self.eoList if not '(' in x and not ')' in x
        ] + [x for x in self.eoList if '(' in x and ')' in x]
        parenthesisless += missed
        self.eoList = parenthesisless  # 괄호가 한 쪽만 포함된 어절을 모두 제거하고 괄호 속 어절을 포함

        ############################################################################################################################################
        # 없는부분
        # [LP, UM, RP] 형태가 가능한 모든 조합을 리스트로 구축
        self.posUMpairList = []
        for i in range(len(self.eoList)):
            for j in self.splitEojeol(self.eoList[i]):
                # RP가 알려진 단어로 이루어져있는지 확인(확인된 경우 KRP라고 부름) 후 등록
                # if self.isAfterNoun(j[2]) and len(j[1]) > 1:
                if self.isKnown(j[2]):
                    self.posUMpairList.append(j)

        # partialEoList: 모든 부분어절의 리스트: ["어절1부분1", "어절1부분2", ...] # (사용가능, 비활성)
        # self.partialEoList = []
        # for i in self.eoList:
        #     for j in self.eojeolPart(i):
        #         self.partialEoList.append(j)

############################################################################################################################################

# lplist: 모든 어절의 2자 이상의 LP부분 리스트: [["어절1LP1", "어절1LP2", ...], ["어절2LP1", "어절2LP2", ...], ...]
        self.lplist = []
        iter = self.eoList[:]
        iter = list(dict.fromkeys(iter))
        for i in iter:
            if len(i) > 1: self.lplist.append(self.genLP(i))

        # 명사로 추정되는 문자열 리스트 추출 -> extnouns
        self.extnouns = []
        for i in self.lplist:
            scores = []
            finalscore = 0
            chosen = ''
            for j in range(len(i)):
                # 현재는 단순히 말뭉치에 띄어쓰기+단어가 검색된 갯수만 찾지만 본래 어절의 좌측부분만 검색하도록 해야 함
                # 문제점1: 말뭉치는 클렌징이 되어있지 않음
                # 문제점2: 기존에 이미 발견된 명사를 제외한 말뭉치에서 검색해야 함
                scores.append(self.corpus.count(' ' + i[j]) / self.wTotal)
            for j in range(len(scores)):
                if j >= len(scores) - 1:
                    chosen = i[j]
                    finalscore = scores[j]
                    break
                # 예: 마스터투자운 -> 마스터투자운용 빈도수가 크게 차이가 안 날 경우 넘어가지만
                # 마스터투자운용 -> 마스터투자운용은 빈도수가 크게 차이가 나기 때문에 그 직전에 명사로 채택
                if scores[j] > scores[j + 1] * 1.1:
                    chosen = i[j]
                    finalscore = scores[j]
                    break
                finalscore = scores[j]
            # 빈도율이 2/어절수 이상인 경우 채택
            if finalscore >= 2 / self.wTotal: self.extnouns.append(chosen)
        self.extnouns = list(dict.fromkeys(self.extnouns))

        ############################################################################################################################################
        # 없는부분

        # 여기서 Mecab은 단일 글자가 어떠한 글자인지 판단하기 위해 사용
        m = Mecab()
        # m = Mecab(dicpath='C:/mecab/mecab-ko-dic') # (사용불가능, 비활성)
        # m = Mecab(dicpath='/usr/local/lib/mecab/dic/mecab-ko-dic') # (사용불가능, 비활성)
        # 한글이 아닌 문자가 갈라지는 경우 제외
        # 예: ['신한BN', 'P파리', '바자산운용으로부터'], ['', '320', '0억원에'] 등
        temp = self.posUMpairList[:]  # temp와 포인터가 같으면 곤란하기 때문에 새로 메모리 할당
        for i in self.posUMpairList:
            # LP가 빈 문자열이 아니고 LP의 마지막 글자와 UM의 첫 글자가 모두 한글이외 문자일 경우 후보에서 제거
            if len(i[0]) > 0 and m.pos(i[0][-1])[0][1][0] == 'S' and m.pos(
                    i[1][0])[0][1][0] == 'S':
                temp.remove(i)
                # RP가 빈 문자열이 아니고 UM의 마지막 글자와 RP의 첫 글자가 모두 한글이외 문자일 경우 후보에서 제거
            elif len(i[2]) > 0 and m.pos(i[1][-1])[0][1][0] == 'S' and m.pos(
                    i[2][0])[0][1][0] == 'S':
                temp.remove(i)
                # # UM에 괄호가 한 쪽만 포함된 경우 제거
            elif '(' in i[1] and ')' not in i[1]:
                temp.remove(i)
            elif ')' in i[1] and '(' not in i[1]:
                temp.remove(i)
        # 결과물은 LP+UM+KRP의 리스트
        self.posUMpairList = temp

        # candidates: 신조어 최종 후보 리스트
        self.candidates = []
        for i in self.posUMpairList:
            # KRP가 비어있는 경우: UM을 말뭉치에 대해 검색하여 2번 이상 등장할 경우 LP+UM 등록
            if i[2] == '' and self.corpus.count(i[1]) >= 2:
                self.candidates.append(i[0] + i[1])
            # KRP가 비어있지 않은 경우: UM+KRP[0](KRP의 첫 형태소)를 말뭉치에 대해 검색하여 2번 이상 등장할 경우 LP+UM 등록
            elif i[2] != '' and self.corpus.count(i[1] +
                                                  m.morphs(i[2])[0]) >= 2:
                self.candidates.append(i[0] + i[1])

        # 서로를 포함하는 어절 빈도수 기준으로 정리
        temp = []
        for i in range(len(self.candidates) - 1):
            if self.candidates[i] in self.candidates[i + 1]:
                if self.wordFreq(
                        self.candidates[i], self.corpus) > self.wordFreq(
                            self.candidates[i + 1], self.corpus) * 1.1:
                    temp.append(self.candidates[i])
            elif self.candidates[i - 1] in self.candidates[i]:
                if self.wordFreq(self.candidates[i - 1],
                                 self.corpus) * 0.9 < self.wordFreq(
                                     self.candidates[i], self.corpus):
                    temp.append(self.candidates[i])
            else:
                temp.append(self.candidates[i])
        if self.wordFreq(self.candidates[-2],
                         self.corpus) * 0.9 < self.wordFreq(
                             self.candidates[-1], self.corpus):
            temp.append(self.candidates[-1])
        self.candidates = temp
        self.candidates = list(dict.fromkeys(self.candidates))

        # 여기서 Mecab은 기존에 등록된 명사인지 아닌지 판단하기 위해 사용
        # 기존에 등록된 명사 제외
        temp = []
        for i in self.candidates:
            if len(m.pos(i)) > 1 or m.pos(i)[0][1][0] != 'N':
                temp.append(i)
        self.candidates = temp
Example #16
from eunjeon import Mecab

tagger = Mecab()

sentence = '아무문장이다.'
print(tagger.morphs(sentence))
print(tagger.nouns(sentence))

#아나콘다 환경
#python 3.6
#대소 비교가 가능한 숫자(뭐 ~km) 같은거 2차언 배열로 순위 매기기
#대소 비교 불가능한 건 0 으로 처리하고 1순위부터 시작해서 2차원 배열 생성

# '/usr/local/lib/mecab/dic/mecab-ko-dic'
Example #17
for i in range(documentCount):
    if i < 300:
        f = open("Finance/Finance%05d.txt" % i, 'r', -1, "utf-8")
    elif i < 600:
        f = open("Social/Social%05d.txt" % (i - 300), 'r', -1, "utf-8")
    else:
        f = open("Science/Science%05d.txt" % (i - 600), 'r', -1, "utf-8")
    data = f.read()
    f.close()

    data = data[139:]
    contentsText += data
    data = re.sub(
        "[-=+,#/\?:%$.@*\"※~&%!\\'|\(\)\[\]\<\>`\'\\\\n\\\\t{}◀▶▲☞“”ⓒ◇]", "",
        data)
    data = m.morphs(data)
    articleMemory.append(data)

# 특수 문자 & 기호 제거
token = re.sub(
    "[-=+,#/\?:%$.@*\"※~&%!\\'|\(\)\[\]\<\>`\'\\\\n\\\\t{}◀▶▲☞“”ⓒ◇]", "",
    contentsText)  # 특수 기호 제외
# 형태소 분리
temp = m.morphs(token)
# BagOfWords 적용
sequencesText = util.BagOfWords(temp)
# StopWord 적용
sequencesText = util.stopWord(sequencesText[0],
                              sequencesText[1],
                              deleteRate=0.02)
# TF-IDF 구하기
Example #18
# One Hot Vector를 통한 출력
from eunjeon import Mecab

ona_data = [['안녕', '만나서 반가워'], ['넌 누구니', ' 나는 AI봇이란다.'],
            ['피자 주문 할게', '음료도 주문해줘'], ['음료는 뭘로', '콜라로 해줘']]

mecab = Mecab()
train_Data = list(map(lambda x: mecab.morphs(' '.join(x)), ona_data))
# .morphs() 문장을 형태소 단위로 끊어준다.
import itertools

train_data = list(itertools.chain.from_iterable(train_Data))
print(list(train_data))

import numpy as np

bucket = np.zeros(len(train_data), dtype=float)
for word in train_data:
    bucket_temp = bucket.copy()
    np.put(bucket_temp, train_data.index(word), 1)
    #print(bucket_temp)

# Word to Vector (By Gensim)
# W2V를 통해 출력해보자
from gensim.models import word2vec

train_data = [train_data]
print(train_data)

model = word2vec.Word2Vec(size=50, window=2, min_count=1, iter=100)
model.build_vocab(train_data)
Example #19
class Rouge:
    DEFAULT_METRICS = {"rouge-n"}
    DEFAULT_N = 1
    STATS = ["f", "p", "r"]
    AVAILABLE_METRICS = {"rouge-n", "rouge-l", "rouge-w"}
    AVAILABLE_LENGTH_LIMIT_TYPES = {"words", "bytes"}
    REMOVE_CHAR_PATTERN = re.compile("[^A-Za-z0-9가-힣]")

    def __init__(
        self,
        metrics=None,
        max_n=None,
        limit_length=True,
        length_limit=1000,
        length_limit_type="words",
        apply_avg=True,
        apply_best=False,
        use_tokenizer=True,
        alpha=0.5,
        weight_factor=1.0,
    ):
        self.metrics = metrics[:] if metrics is not None else Rouge.DEFAULT_METRICS
        for m in self.metrics:
            if m not in Rouge.AVAILABLE_METRICS:
                raise ValueError("Unknown metric '{}'".format(m))

        self.max_n = max_n if "rouge-n" in self.metrics else None
        # Add all rouge-n metrics
        if self.max_n is not None:
            index_rouge_n = self.metrics.index("rouge-n")
            del self.metrics[index_rouge_n]
            self.metrics += [
                "rouge-{}".format(n) for n in range(1, self.max_n + 1)
            ]
        self.metrics = set(self.metrics)

        self.limit_length = limit_length
        if self.limit_length:
            if length_limit_type not in Rouge.AVAILABLE_LENGTH_LIMIT_TYPES:
                raise ValueError(
                    "Unknown length_limit_type '{}'".format(length_limit_type))

        self.length_limit = length_limit
        if self.length_limit == 0:
            self.limit_length = False
        self.length_limit_type = length_limit_type

        self.use_tokenizer = use_tokenizer
        if use_tokenizer:
            self.tokenizer = Mecab()

        self.apply_avg = apply_avg
        self.apply_best = apply_best
        self.alpha = alpha
        self.weight_factor = weight_factor
        if self.weight_factor <= 0:
            raise ValueError("ROUGE-W weight factor must greater than 0.")

    def tokenize_text(self, text):
        if self.use_tokenizer:
            return self.tokenizer.morphs(text)
        else:
            return text

    @staticmethod
    def split_into_sentences(text):
        return text.split("\n")

    @staticmethod
    def _get_ngrams(n, text):
        ngram_set = collections.defaultdict(int)
        max_index_ngram_start = len(text) - n
        for i in range(max_index_ngram_start + 1):
            ngram_set[tuple(text[i:i + n])] += 1
        return ngram_set

    @staticmethod
    def _split_into_words(sentences):
        return list(itertools.chain(*[_.split() for _ in sentences]))

    @staticmethod
    def _get_word_ngrams_and_length(n, sentences):
        assert len(sentences) > 0
        assert n > 0

        tokens = Rouge._split_into_words(sentences)
        return Rouge._get_ngrams(n, tokens), tokens, len(tokens) - (n - 1)

    @staticmethod
    def _get_unigrams(sentences):
        assert len(sentences) > 0

        tokens = Rouge._split_into_words(sentences)
        unigram_set = collections.defaultdict(int)
        for token in tokens:
            unigram_set[token] += 1
        return unigram_set, len(tokens)

    @staticmethod
    def _compute_p_r_f_score(
        evaluated_count,
        reference_count,
        overlapping_count,
        alpha=0.5,
        weight_factor=1.0,
    ):
        precision = 0.0 if evaluated_count == 0 else overlapping_count / float(
            evaluated_count)
        if weight_factor != 1.0:
            precision = precision**(1.0 / weight_factor)
        recall = 0.0 if reference_count == 0 else overlapping_count / float(
            reference_count)
        if weight_factor != 1.0:
            recall = recall**(1.0 / weight_factor)
        f1_score = Rouge._compute_f_score(precision, recall, alpha)
        return {"f": f1_score, "p": precision, "r": recall}

    @staticmethod
    def _compute_f_score(precision, recall, alpha=0.5):
        return (0.0 if (recall == 0.0 or precision == 0.0) else precision *
                recall / ((1 - alpha) * precision + alpha * recall))

    @staticmethod
    def _compute_ngrams(evaluated_sentences, reference_sentences, n):
        if len(evaluated_sentences) <= 0 or len(reference_sentences) <= 0:
            raise ValueError("Collections must contain at least 1 sentence.")

        evaluated_ngrams, _, evaluated_count = Rouge._get_word_ngrams_and_length(
            n, evaluated_sentences)
        reference_ngrams, _, reference_count = Rouge._get_word_ngrams_and_length(
            n, reference_sentences)

        # Gets the overlapping ngrams between evaluated and reference
        overlapping_ngrams = set(evaluated_ngrams.keys()).intersection(
            set(reference_ngrams.keys()))
        overlapping_count = 0
        for ngram in overlapping_ngrams:
            overlapping_count += min(evaluated_ngrams[ngram],
                                     reference_ngrams[ngram])

        return evaluated_count, reference_count, overlapping_count

    @staticmethod
    def _compute_ngrams_lcs(evaluated_sentences,
                            reference_sentences,
                            weight_factor=1.0):
        def _lcs(x, y):
            m = len(x)
            n = len(y)
            vals = collections.defaultdict(int)
            dirs = collections.defaultdict(int)

            for i in range(1, m + 1):
                for j in range(1, n + 1):
                    if x[i - 1] == y[j - 1]:
                        vals[i, j] = vals[i - 1, j - 1] + 1
                        dirs[i, j] = "|"
                    elif vals[i - 1, j] >= vals[i, j - 1]:
                        vals[i, j] = vals[i - 1, j]
                        dirs[i, j] = "^"
                    else:
                        vals[i, j] = vals[i, j - 1]
                        dirs[i, j] = "<"

            return vals, dirs

        def _wlcs(x, y, weight_factor):
            m = len(x)
            n = len(y)
            vals = collections.defaultdict(float)
            dirs = collections.defaultdict(int)
            lengths = collections.defaultdict(int)

            for i in range(1, m + 1):
                for j in range(1, n + 1):
                    if x[i - 1] == y[j - 1]:
                        length_tmp = lengths[i - 1, j - 1]
                        vals[i, j] = (vals[i - 1, j - 1] +
                                      (length_tmp + 1)**weight_factor -
                                      length_tmp**weight_factor)
                        dirs[i, j] = "|"
                        lengths[i, j] = length_tmp + 1
                    elif vals[i - 1, j] >= vals[i, j - 1]:
                        vals[i, j] = vals[i - 1, j]
                        dirs[i, j] = "^"
                        lengths[i, j] = 0
                    else:
                        vals[i, j] = vals[i, j - 1]
                        dirs[i, j] = "<"
                        lengths[i, j] = 0

            return vals, dirs

        def _mark_lcs(mask, dirs, m, n):
            while m != 0 and n != 0:
                if dirs[m, n] == "|":
                    m -= 1
                    n -= 1
                    mask[m] = 1
                elif dirs[m, n] == "^":
                    m -= 1
                elif dirs[m, n] == "<":
                    n -= 1
                else:
                    raise UnboundLocalError("Illegal move")

            return mask

        if len(evaluated_sentences) <= 0 or len(reference_sentences) <= 0:
            raise ValueError("Collections must contain at least 1 sentence.")

        evaluated_unigrams_dict, evaluated_count = Rouge._get_unigrams(
            evaluated_sentences)
        reference_unigrams_dict, reference_count = Rouge._get_unigrams(
            reference_sentences)

        # Has to use weight factor for WLCS
        use_WLCS = weight_factor != 1.0
        if use_WLCS:
            evaluated_count = evaluated_count**weight_factor
            reference_count = 0

        overlapping_count = 0.0
        for reference_sentence in reference_sentences:
            reference_sentence_tokens = reference_sentence.split()
            if use_WLCS:
                reference_count += len(
                    reference_sentence_tokens)**weight_factor
            hit_mask = [0 for _ in range(len(reference_sentence_tokens))]

            for evaluated_sentence in evaluated_sentences:
                evaluated_sentence_tokens = evaluated_sentence.split()

                if use_WLCS:
                    _, lcs_dirs = _wlcs(
                        reference_sentence_tokens,
                        evaluated_sentence_tokens,
                        weight_factor,
                    )
                else:
                    _, lcs_dirs = _lcs(reference_sentence_tokens,
                                       evaluated_sentence_tokens)
                _mark_lcs(
                    hit_mask,
                    lcs_dirs,
                    len(reference_sentence_tokens),
                    len(evaluated_sentence_tokens),
                )

            overlapping_count_length = 0
            for ref_token_id, val in enumerate(hit_mask):
                if val == 1:
                    token = reference_sentence_tokens[ref_token_id]
                    if evaluated_unigrams_dict[
                            token] > 0 and reference_unigrams_dict[token] > 0:
                        evaluated_unigrams_dict[token] -= 1
                        reference_unigrams_dict[token] -= 1

                        if use_WLCS:
                            overlapping_count_length += 1
                            if (ref_token_id + 1 < len(hit_mask)
                                    and hit_mask[ref_token_id + 1]
                                    == 0) or ref_token_id + 1 == len(hit_mask):
                                overlapping_count += overlapping_count_length**weight_factor
                                overlapping_count_length = 0
                        else:
                            overlapping_count += 1

        if use_WLCS:
            reference_count = reference_count**weight_factor

        return evaluated_count, reference_count, overlapping_count

    def get_scores(self, hypothesis, references):
        if isinstance(hypothesis, str):
            hypothesis, references = [hypothesis], [references]

        if type(hypothesis) != type(references):
            raise ValueError("'hyps' and 'refs' are not of the same type")

        if len(hypothesis) != len(references):
            raise ValueError("'hyps' and 'refs' do not have the same length")
        scores = {}
        has_rouge_n_metric = (len([
            metric
            for metric in self.metrics if metric.split("-")[-1].isdigit()
        ]) > 0)
        if has_rouge_n_metric:
            scores.update(self._get_scores_rouge_n(hypothesis, references))
            # scores = {**scores, **self._get_scores_rouge_n(hypothesis, references)}

        has_rouge_l_metric = (len([
            metric
            for metric in self.metrics if metric.split("-")[-1].lower() == "l"
        ]) > 0)
        if has_rouge_l_metric:
            scores.update(
                self._get_scores_rouge_l_or_w(hypothesis, references, False))
            # scores = {**scores, **self._get_scores_rouge_l_or_w(hypothesis, references, False)}

        has_rouge_w_metric = (len([
            metric
            for metric in self.metrics if metric.split("-")[-1].lower() == "w"
        ]) > 0)
        if has_rouge_w_metric:
            scores.update(
                self._get_scores_rouge_l_or_w(hypothesis, references, True))
            # scores = {**scores, **self._get_scores_rouge_l_or_w(hypothesis, references, True)}

        return scores

    def _get_scores_rouge_n(self, all_hypothesis, all_references):
        metrics = [
            metric for metric in self.metrics
            if metric.split("-")[-1].isdigit()
        ]

        if self.apply_avg or self.apply_best:
            scores = {
                metric: {stat: 0.0
                         for stat in Rouge.STATS}
                for metric in metrics
            }
        else:
            scores = {
                metric: [{stat: []
                          for stat in Rouge.STATS}
                         for _ in range(len(all_hypothesis))]
                for metric in metrics
            }

        for sample_id, (hypothesis, references) in enumerate(
                zip(all_hypothesis, all_references)):
            assert isinstance(hypothesis, str)
            has_multiple_references = False
            if isinstance(references, list):
                has_multiple_references = len(references) > 1
                if not has_multiple_references:
                    references = references[0]

            # Prepare hypothesis and reference(s)
            hypothesis = self._preprocess_summary_as_a_whole(hypothesis)
            references = ([
                self._preprocess_summary_as_a_whole(reference)
                for reference in references
            ] if has_multiple_references else
                          [self._preprocess_summary_as_a_whole(references)])

            # Compute scores
            for metric in metrics:
                suffix = metric.split("-")[-1]
                n = int(suffix)

                # Aggregate
                if self.apply_avg:
                    # average model
                    total_hypothesis_ngrams_count = 0
                    total_reference_ngrams_count = 0
                    total_ngrams_overlapping_count = 0

                    for reference in references:
                        (
                            hypothesis_count,
                            reference_count,
                            overlapping_ngrams,
                        ) = Rouge._compute_ngrams(hypothesis, reference, n)
                        total_hypothesis_ngrams_count += hypothesis_count
                        total_reference_ngrams_count += reference_count
                        total_ngrams_overlapping_count += overlapping_ngrams

                    score = Rouge._compute_p_r_f_score(
                        total_hypothesis_ngrams_count,
                        total_reference_ngrams_count,
                        total_ngrams_overlapping_count,
                        self.alpha,
                    )

                    for stat in Rouge.STATS:
                        scores[metric][stat] += score[stat]
                else:
                    # Best model
                    if self.apply_best:
                        best_current_score = None
                        for reference in references:
                            (
                                hypothesis_count,
                                reference_count,
                                overlapping_ngrams,
                            ) = Rouge._compute_ngrams(hypothesis, reference, n)
                            score = Rouge._compute_p_r_f_score(
                                hypothesis_count,
                                reference_count,
                                overlapping_ngrams,
                                self.alpha,
                            )
                            if best_current_score is None or score[
                                    "r"] > best_current_score["r"]:
                                best_current_score = score

                        for stat in Rouge.STATS:
                            scores[metric][stat] += best_current_score[stat]
                    # Keep all
                    else:
                        for reference in references:
                            (
                                hypothesis_count,
                                reference_count,
                                overlapping_ngrams,
                            ) = Rouge._compute_ngrams(hypothesis, reference, n)
                            score = Rouge._compute_p_r_f_score(
                                hypothesis_count,
                                reference_count,
                                overlapping_ngrams,
                                self.alpha,
                            )
                            for stat in Rouge.STATS:
                                scores[metric][sample_id][stat].append(
                                    score[stat])

        # Compute the final score with the average or the max
        if (self.apply_avg or self.apply_best) and len(all_hypothesis) > 1:
            for metric in metrics:
                for stat in Rouge.STATS:
                    scores[metric][stat] /= len(all_hypothesis)

        return scores

    def _get_scores_rouge_l_or_w(self,
                                 all_hypothesis,
                                 all_references,
                                 use_w=False):
        metric = "rouge-w" if use_w else "rouge-l"
        if self.apply_avg or self.apply_best:
            scores = {metric: {stat: 0.0 for stat in Rouge.STATS}}
        else:
            scores = {
                metric: [{stat: []
                          for stat in Rouge.STATS}
                         for _ in range(len(all_hypothesis))]
            }

        for sample_id, (hypothesis_sentences,
                        references_sentences) in enumerate(
                            zip(all_hypothesis, all_references)):
            assert isinstance(hypothesis_sentences, str)
            has_multiple_references = False
            if isinstance(references_sentences, list):
                has_multiple_references = len(references_sentences) > 1
                if not has_multiple_references:
                    references_sentences = references_sentences[0]

            # Prepare hypothesis and reference(s)
            hypothesis_sentences = self._preprocess_summary_per_sentence(
                hypothesis_sentences)
            references_sentences = ([
                self._preprocess_summary_per_sentence(reference)
                for reference in references_sentences
            ] if has_multiple_references else [
                self._preprocess_summary_per_sentence(references_sentences)
            ])

            # Compute scores
            # Aggregate
            if self.apply_avg:
                # average model
                total_hypothesis_ngrams_count = 0
                total_reference_ngrams_count = 0
                total_ngrams_overlapping_count = 0

                for reference_sentences in references_sentences:
                    (
                        hypothesis_count,
                        reference_count,
                        overlapping_ngrams,
                    ) = Rouge._compute_ngrams_lcs(
                        hypothesis_sentences,
                        reference_sentences,
                        self.weight_factor if use_w else 1.0,
                    )
                    total_hypothesis_ngrams_count += hypothesis_count
                    total_reference_ngrams_count += reference_count
                    total_ngrams_overlapping_count += overlapping_ngrams

                score = Rouge._compute_p_r_f_score(
                    total_hypothesis_ngrams_count,
                    total_reference_ngrams_count,
                    total_ngrams_overlapping_count,
                    self.alpha,
                    self.weight_factor if use_w else 1.0,
                )
                for stat in Rouge.STATS:
                    scores[metric][stat] += score[stat]
            else:
                # Best model
                if self.apply_best:
                    best_current_score = None
                    best_current_score_wlcs = None
                    for reference_sentences in references_sentences:
                        (
                            hypothesis_count,
                            reference_count,
                            overlapping_ngrams,
                        ) = Rouge._compute_ngrams_lcs(
                            hypothesis_sentences,
                            reference_sentences,
                            self.weight_factor if use_w else 1.0,
                        )
                        score = Rouge._compute_p_r_f_score(
                            hypothesis_count,
                            reference_count,
                            overlapping_ngrams,
                            self.alpha,
                            self.weight_factor if use_w else 1.0,
                        )

                        if use_w:
                            reference_count_for_score = reference_count**(
                                1.0 / self.weight_factor)
                            overlapping_ngrams_for_score = overlapping_ngrams
                            score_wlcs = (overlapping_ngrams_for_score /
                                          reference_count_for_score)**(
                                              1.0 / self.weight_factor)

                            if (best_current_score_wlcs is None
                                    or score_wlcs > best_current_score_wlcs):
                                best_current_score = score
                                best_current_score_wlcs = score_wlcs
                        else:
                            if best_current_score is None or score[
                                    "r"] > best_current_score["r"]:
                                best_current_score = score

                    for stat in Rouge.STATS:
                        scores[metric][stat] += best_current_score[stat]
                # Keep all
                else:
                    for reference_sentences in references_sentences:
                        (
                            hypothesis_count,
                            reference_count,
                            overlapping_ngrams,
                        ) = Rouge._compute_ngrams_lcs(
                            hypothesis_sentences,
                            reference_sentences,
                            self.weight_factor if use_w else 1.0,
                        )
                        score = Rouge._compute_p_r_f_score(
                            hypothesis_count,
                            reference_count,
                            overlapping_ngrams,
                            self.alpha,
                            self.weight_factor if use_w else 1.0,
                        )

                        for stat in Rouge.STATS:
                            scores[metric][sample_id][stat].append(score[stat])

        # Compute the final score with the average or the max
        if (self.apply_avg or self.apply_best) and len(all_hypothesis) > 1:
            for stat in Rouge.STATS:
                scores[metric][stat] /= len(all_hypothesis)

        return scores

    def _preprocess_summary_as_a_whole(self, summary):
        sentences = Rouge.split_into_sentences(summary)

        # Truncate
        if self.limit_length:
            # By words
            if self.length_limit_type == "words":
                summary = " ".join(sentences)
                all_tokens = summary.split()  # Counting as in the perl script
                summary = " ".join(all_tokens[:self.length_limit])

            # By bytes
            elif self.length_limit_type == "bytes":
                summary = ""
                current_len = 0
                for sentence in sentences:
                    sentence = sentence.strip()
                    sentence_len = len(sentence)

                    if current_len + sentence_len < self.length_limit:
                        if current_len != 0:
                            summary += " "
                        summary += sentence
                        current_len += sentence_len
                    else:
                        if current_len > 0:
                            summary += " "
                        summary += sentence[:self.length_limit - current_len]
                        break
        else:
            summary = " ".join(sentences)

        summary = Rouge.REMOVE_CHAR_PATTERN.sub(" ", summary.lower()).strip()

        tokens = self.tokenize_text(Rouge.REMOVE_CHAR_PATTERN.sub(
            " ", summary))
        preprocessed_summary = [" ".join(tokens)]

        return preprocessed_summary

    def _preprocess_summary_per_sentence(self, summary):
        sentences = Rouge.split_into_sentences(summary)

        # Truncate
        if self.limit_length:
            final_sentences = []
            current_len = 0
            # By words
            if self.length_limit_type == "words":
                for sentence in sentences:
                    tokens = sentence.strip().split()
                    tokens_len = len(tokens)
                    if current_len + tokens_len < self.length_limit:
                        sentence = " ".join(tokens)
                        final_sentences.append(sentence)
                        current_len += tokens_len
                    else:
                        sentence = " ".join(tokens[:self.length_limit -
                                                   current_len])
                        final_sentences.append(sentence)
                        break
            # By bytes
            elif self.length_limit_type == "bytes":
                for sentence in sentences:
                    sentence = sentence.strip()
                    sentence_len = len(sentence)
                    if current_len + sentence_len < self.length_limit:
                        final_sentences.append(sentence)
                        current_len += sentence_len
                    else:
                        sentence = sentence[:self.length_limit - current_len]
                        final_sentences.append(sentence)
                        break
            sentences = final_sentences

        final_sentences = []
        for sentence in sentences:
            sentence = Rouge.REMOVE_CHAR_PATTERN.sub(" ",
                                                     sentence.lower()).strip()

            tokens = self.tokenize_text(
                Rouge.REMOVE_CHAR_PATTERN.sub(" ", sentence))

            sentence = " ".join(tokens)

            final_sentences.append(sentence)

        return final_sentences
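A brief, hypothetical usage sketch of the Rouge class above, assuming the module-level imports it needs (collections, itertools, re and eunjeon's Mecab) are in place; the hypothesis and reference strings are made up.

# Hypothetical usage of the Korean ROUGE scorer defined above
rouge = Rouge(metrics=["rouge-n", "rouge-l"], max_n=2, limit_length=False, apply_avg=True)
hyp = "정부가 새로운 경제 정책을 발표했다"
ref = "정부는 오늘 새로운 경제 정책을 공식 발표했다"
scores = rouge.get_scores(hyp, ref)
print(scores["rouge-1"])  # {'f': ..., 'p': ..., 'r': ...}
print(scores["rouge-l"])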
Example #20
    return sample

# title_data = data[["label", "title"]]
gd = data.groupby('label').apply(sampling_func, n_sample=13000)
gd.index.names = ["temp_label", None]
gd = gd.reset_index(level=[0])
gd = gd.drop(["temp_label"], axis=1)
gd = shuffle(gd)

mecab_processed_data = gd.copy()
etri_processed_data = gd.copy()
soynlp_processed_data = gd.copy()
spm_processed_data = gd.copy()

mecab = Mecab()
mecab_processed_data["title"] = mecab_processed_data["title"].progress_apply(lambda x: " ".join(mecab.morphs(x)))

def concat_text_with_pos(sentence):
    tag = Mecab()
    pos = tag.pos(sentence)
    temp = []
    for p in pos:
        temp.append(p[0] + "/" + p[1])
    
    s = ' '.join(temp)
    return s
etri_processed_data["title"] = etri_processed_data["title"].progress_apply(concat_text_with_pos)


word_extractor = WordExtractor(
    min_frequency=100,
Example #21
from konlpy.tag import Kkma
kkma = Kkma()
print(kkma.morphs("열심히 코딩한 당신, 연휴에는 여행을 가봐요"))
# ['열심히', '코딩', '하', 'ㄴ', '당신', ',', '연휴', '에', '는', '여행', '을', '가보', '아요']
print(kkma.pos("열심히 코딩한 당신, 연휴에는 여행을 가봐요"))
# [('열심히','MAG'), ('코딩', 'NNG'), ('하', 'XSV'), ('ㄴ', 'ETD'), ('당신', 'NP'), (',', 'SP'), ('연휴', 'NNG'), ('에', 'JKM'), ('는', 'JX'), ('여행', 'NNG'), ('을', 'JKO'), ('가보', 'VV'), ('아요', 'EFN')]
print(kkma.nouns("열심히 코딩한 당신, 연휴에는 여행을 가봐요"))
# ['코딩', '당신', '연휴', '여행']
'''
Mecab, the fastest of the Korean morphological analyzers, is not included in the konlpy engine.
Below is an example of using mecab from Python via the eunjeon package.
'''
from eunjeon import Mecab  # KoNLPy style mecab wrapper
tagger = Mecab()
print(tagger.morphs("열심히 코딩한 당신, 연휴에는 여행을 가봐요"))
# ['열심히', '코딩', '한', '당신', ',', '연휴', '에', '는', '여행', '을', '가', '봐요']
print(tagger.pos("열심히 코딩한 당신, 연휴에는 여행을 가봐요"))
# [('열심히', 'MAG'), ('코딩', 'NNG'), ('한', 'XSA+ETM'), ('당신', 'NP'), (',', 'SC'), ('연휴', 'NNG'), ('에', 'JKB'), ('는', 'JX'), ('여행', 'NNG'), ('을', 'JKO'), ('가', 'VV'), ('봐요', 'EC+VX+EC')]
print(tagger.nouns("열심히 코딩한 당신, 연휴에는 여행을 가봐요"))
# ['코딩', '당신', '연휴', '여행']
'''
2) Normalization - https://wikidocs.net/21693
1. Rule-based merging of differently spelled variants of the same word
2. Unifying upper and lower case
3. Regular expressions (see the short sketch after this block)
'''
'''
3) Stemming and Lemmatization - https://wikidocs.net/21707
1. Lemmatization
2. Stemming
'''
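As a small illustration of point 3 in the list above (regular-expression clean-up), a short sketch that keeps only Hangul, Latin letters, digits and whitespace; the sample string is made up.

import re

text = "가격은 15,000원!! (배송비 포함) ㅋㅋ"
normalized = re.sub(r"[^가-힣ㄱ-ㅎㅏ-ㅣa-zA-Z0-9\s]", " ", text)
normalized = re.sub(r"\s+", " ", normalized).strip()
print(normalized)  # '가격은 15 000원 배송비 포함 ㅋㅋ'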
Example #22
# ## 4.1 Part-of-speech extraction

# In[14]:

data_pos = []
for sentence in movie_data['new_text']:
    data_pos.append(mecab.pos(sentence))
data_pos[:3]

# ## 4.2 Morphological analysis

# In[15]:

tokenized_data = []
for sentence in movie_data['new_text']:
    for text in mecab.morphs(sentence):
        tokenized_data.append(text)
tokenized_data[:10]

# * The 100 most frequent words are as follows.

# In[16]:

# Top 100 words by frequency

top_words = Counter(tokenized_data)

top_words.most_common(100)[:10]

# ## 4.3 Noun extraction
Example #23
from eunjeon import Mecab
import json
from collections import OrderedDict
import re

token = Mecab()

with open("output.txt", 'r',
          encoding='utf-8') as f:  # output.txt -> from crawling.py
    document = f.read()

list_morphs = token.morphs(document)

for k in range(0, len(list_morphs)):
    list_morphs[k] = re.sub(
        '[^a-zA-Zㄱ-ㅎㅏ-ㅣ가-힣]', '',
        list_morphs[k])  # Delete everything except English and Korean.

# Using the characteristics of the set to remove duplication.
list_clear = set(list_morphs)
list_clear = list(list_clear)

data = OrderedDict()

index = 1  # To index vocabulary
for i in range(0, len(list_clear)):
    if list_clear[i].strip(
    ) == '':  # Remove 'null-element' that is created during conversion from set to list.
        continue
    else:
        data[index] = list_clear[i].strip()
    if i < 1920:
        f = open("Finance/Finance%05d.txt" % i, 'r', -1, "utf-8")
    elif i < 3840:
        f = open("Social/Social%05d.txt" % (i - 1920), 'r', -1, "utf-8")
    else:
        f = open("Science/Science%05d.txt" % (i - 3840), 'r', -1, "utf-8")
    data = f.read()
    f.close()

    data = data[139:]

    data = re.sub(
        "[-=+,#/\?:%$.@*\"※~&%!\\'|\(\)\[\]\<\>`\'\\\\n\\\\t{}◀▶▲☞“”ⓒ◇]", "",
        data)
    data = data[:-117]
    data = m.morphs(data)

    data = util.stopWord(data, stopwoardList)

    articleMemory.append(data)

# 정답데이터 기사 불러오기
for i in range(resultDocumentCount):
    if i < 480:
        f = open("Finance/Finance%05d.txt" % (i + 1920), 'r', -1, "utf-8")
    elif i < 960:
        f = open("Social/Social%05d.txt" % (i + 1920 - 480), 'r', -1, "utf-8")
    else:
        f = open("Science/Science%05d.txt" % (i + 1920 - 960), 'r', -1,
                 "utf-8")
    data = f.read()
Example #25
from eunjeon import Mecab


# 단어와 2차원 X축의 값, Y축의 값을 입력받아 2차원 그래프를 그린다
def plot_2d_graph(vocabs, xs, ys):
    plt.figure(figsize=(8, 6))
    plt.scatter(xs, ys, marker='o')
    for i, v in enumerate(vocabs):
        plt.annotate(v, xy=(xs[i], ys[i]))


sentences = [['안녕', '만나서 반가워'], ['넌 누구니', ' 나는 AI봇이란다.'],
             ['피자 주문 할게', '음료도 주문해줘'], ['음료는 뭘로', '콜라로 해줘']]

mecab = Mecab()
sentences = list(map(lambda x: mecab.morphs(' '.join(x)), sentences))

# 문장을 이용하여 단어와 벡터를 생성한다.
model = Word2Vec(sentences, size=50, window=2, min_count=1, iter=100)

# 단어벡터를 구한다.
word_vectors = model.wv

vocabs = word_vectors.vocab.keys()
word_vectors_list = [word_vectors[v] for v in vocabs]

# 단어간 유사도를 확인하다
print(word_vectors.similarity(w1='피자', w2='음료'))

from sklearn.decomposition import PCA
pca = PCA(n_components=2)
Example #26
from eunjeon import Mecab

out_f = './data/prepro_ko_wiki.txt'
in_f = './data/ko_wiki.txt'

me = Mecab()
# cp949 에러 해결을 위해 encoding='utf-8' 추가
output = open(out_f, 'wt', encoding='utf-8')

with open(in_f, 'r', encoding='utf-8') as rf:
    lines = rf.readlines()
    i = 0

    for line in lines:
        temp_arr = me.morphs(line)
        line = bytes(' '.join(temp_arr), 'utf-8').decode('utf-8') + '\n'
        output.write(line)

        i = i + 1
        if i % 10000 == 0:
            print('Preprocessed ' + str(i) + ' articles')
    output.close()
    print('Preprocessing complete!')

Example #27
# konlpy : 설치하기
# http://konlpy.org/ko/v0.5.2/install/#id2

from eunjeon import Mecab

# Mecab 함수를 tagger라는 이름으로 사용
tagger = Mecab()
# 문장에서 명사만 분류
tagger.nouns("고양이가 냐 하고 울면 나는 녜 하고 울어야지")

# 빛 아래 유령
poem = """
흘러내린 머리카락이 흐린 호박빛 아래 빛난다.
유영하며.
저건가보다.
세월의 힘을 이겨낸 마지막 하나 남은 가로등.
미래의 색, 역겨운 청록색으로 창백하게 바뀔 마지막 가로등
난 유영한다. 차분하게 과거에 살면서 현재의 공기를 마신다.
가로등이 깜빡인다.
나도 깜빡여준다.
"""
# 문장을 형태소 단위로 끊어줌
tagger.morphs(poem)

# 문장을 형태소단위로 끊고, 형태소 마다 품사를 분석
# 이때, ('지우개', 'NNG')등의 형식을 분류되는데, NNG는 일반명사를 뜻
# 자세한 품사태그는 링크를 참고 : https://m.blog.naver.com/PostView.nhn?blogId=aul-_-&logNo=221557243190
print(tagger.pos(poem))
# print(tagger.nouns(poem))