Example #1
File: evaluate.py Project: KNU-NLPlab/VTT
def control(input_msg):
    tagger = Komoran()

    dataset = Dataset('nsmc/ratings.txt',
                      tagger,
                      max_length=MAX_LENGTH,
                      batch_size=BATCH_SIZE)

    Z_DIM = 40
    H_DIM = 300
    C_DIM = 2

    model = RNN_VAE(dataset.num_words,
                    H_DIM,
                    Z_DIM,
                    C_DIM,
                    freeze_embeddings=False,
                    gpu=USE_CUDA,
                    gpu_id=GPU_ID)

    test_data = torch.LongTensor(
        dataset.sentence2idxs(tagger.morphs(input_msg))).unsqueeze(1)

    model.load_state_dict(torch.load('models/vae_epoch_300_400.bin'))
    results = model.controlSentence(test_data, t=0.5)

    return (dataset.idxs2sentence(results[0], no_pad=True),
            dataset.idxs2sentence(results[1], no_pad=True))
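A hedged call sketch for control(); it assumes the module-level constants (MAX_LENGTH, BATCH_SIZE, USE_CUDA, GPU_ID), the nsmc/ratings.txt corpus and the trained checkpoint models/vae_epoch_300_400.bin are all available, and the input sentence is only illustrative:

positive, negative = control('영화가 생각보다 훨씬 재미있었다')
print('Positive :', positive)
print('Negative :', negative)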
Example #2
def main(base_path, pkl_lst):
    DataFrame = preprocess(base_path, pkl_lst)
    print('Spacing the document...')
    DataFrame = multicore_cpu(DataFrame,
                              spacing_doc,
                              n_cores=args.cpu_core,
                              spell=False)
    print('Spell checking...')
    checked_data = multicore_cpu(DataFrame,
                                 spell_check,
                                 n_cores=args.cpu_core,
                                 spell=True)
    checked_data.reset_index(drop=True, inplace=True)

    # tokenizing
    print('Tokenizing the document...')
    komoran = Komoran(userdic=args.token_dict)
    checked_data['tokenized_contents'] = checked_data['contents'].apply(
        lambda x: komoran.morphs(x))

    # filter documents
    checked_data['doc_length'] = checked_data['tokenized_contents'].apply(
        lambda x: len(x))
    final_data = checked_data.loc[checked_data['doc_length'] > args.token_cnt]
    final_data.reset_index(drop=True, inplace=True)

    # save the output data
    os.makedirs(args.save_path, exist_ok=True)
    with open(os.path.join(args.save_path, 'preprocessed_data.pickle'),
              'wb') as f:
        pickle.dump(final_data, f)
Example #3
def main(corpora, output):
    filelist = os.listdir(corpora)
    tagger_stan = Komoran()
    tagger_jeju = Komoran(userdic='userdic.txt')  # TODO: If not userdic

    for file in filelist:
        book = openpyxl.load_workbook(os.path.join(corpora, file))
        sheet = book["Sheet"]  # get_sheet_by_name() is deprecated in openpyxl

        tagged = (bool(sheet.cell(row=1, column=3).value)
                  and bool(sheet.cell(row=1, column=4).value))

        if not tagged:
            for sample in sheet.rows:
                index = sample[0].row
                try:
                    stan = sample[0].value
                    pos_stan = ' '.join(tagger_stan.morphs(stan))
                    jeju = sample[1].value
                    pos_jeju = ' '.join(tagger_jeju.morphs(jeju))
                except:
                    continue
                else:
                    sheet.cell(row=index, column=3).value = pos_stan
                    sheet.cell(row=index, column=4).value = pos_jeju

            book.save(os.path.join(corpora, file))

        filename = file[:file.find('.')]
        if not os.path.exists(output):
            os.makedirs(output)
        output_dir = os.path.join(output, filename +
                                  '.txt')  # Exception: Output Dir not Exists
        output_file = open(output_dir, 'w', encoding='utf-8')

        for sample in sheet.rows:
            try:
                line = '\t'.join([
                    s.value for s in sample[:5]
                ]) + '\n'  # Exception: s.value can be no string
            except TypeError:
                continue
            else:
                output_file.write(line)

        output_file.close()
        book.close()
Example #4
def samerank(db, emotion_dict):  # Writes a preprocessing file for NULL rows - handles ties between ranks 1 and 2

    komoran = Komoran()

    cursor = db.cursor()

    emotion = ['happy', 'enjoy', 'comfort', 'horror', 'angry', 'sad']

    sql = "SELECT DISTINCT title, artist, lyrics FROM musicl WHERE (DATE, ranking) IN (SELECT DATE, ranking FROM emoti_test WHERE rank1 IS NULL)"
    cursor.execute(sql)
    null_data = cursor.fetchall()
    null_data = pd.DataFrame(null_data, columns=['제목', '가수', '가사'])

    null_data_rating = pd.DataFrame(columns=['제목', '가수', '순위', '수치'])

    for title, singer, lyrics in null_data.values:

        # Split the lyrics into morphemes
        lyrics = lyrics.replace('\n', '')
        words_temp = komoran.morphs(lyrics)

        # Counters for the six emotion categories
        happy = 0
        enjoy = 0
        comfort = 0
        angry = 0
        horror = 0
        sad = 0

        # Sentiment analysis of the lyrics
        lyrics_emotion = pd.DataFrame(index=emotion)

        for word in words_temp:
            if word in emotion_dict['happy']: happy += 1
            if word in emotion_dict['enjoy']: enjoy += 1
            if word in emotion_dict['comfort']: comfort += 1
            if word in emotion_dict['angry']: angry += 1
            if word in emotion_dict['horror']: horror += 1
            if word in emotion_dict['sad']: sad += 1

        # Sort to find which emotions appeared most
        result_emotion = [happy, enjoy, comfort, angry, horror, sad]
        lyrics_emotion[0] = result_emotion
        rating = lyrics_emotion[0].sort_values(ascending=False).index
        value = lyrics_emotion[0].sort_values(ascending=False).values

        # DataFrame.append was removed in pandas 2.x; concat a one-row frame instead
        null_data_rating = pd.concat(
            [null_data_rating,
             pd.DataFrame([{
                 '제목': title.strip(),
                 '가수': singer,
                 '순위': list(rating),
                 '수치': list(value)
             }])],
            ignore_index=True)

    null_data_rating.to_excel('data/samepointSong.xlsx')
Example #5
File: analysis.py Project: stannam/530B
def kor_tokenizer(list_sentences):
    komoran = Komoran(max_heap_size=1024)
    list_output = []
    for sentence in list_sentences:
        sentence = re.sub(r"[^가-힣\s]", "", sentence)  # keep only Hangul syllables and whitespace
        tokenized_sentence = komoran.morphs(sentence)
        list_output.append(tokenized_sentence)
        with open('./result/tokens.pickle', 'wb') as f:
            pickle.dump(list_output, f, pickle.HIGHEST_PROTOCOL)
    return list_output
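A small usage sketch for kor_tokenizer; the sentences are illustrative and the ./result directory is assumed to exist for the pickle output:

sample_sentences = ["코모란은 자바로 구현된 형태소 분석기입니다.", "한국어 토큰화 예시 문장입니다."]
tokens = kor_tokenizer(sample_sentences)
print(tokens[0])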
Example #6
class Tagger:
    def __init__(self, mode: str = "nouns"):
        """
        konlpy POS tagger wrapper; mode selects nouns() or morphs()
        """
        self.tagger = Komoran()
        self.mode = mode  # "nouns" or "morphs"

    def __call__(self, *args, **kwargs) -> list:
        if self.mode == "nouns":
            return self.tagger.nouns(*args, **kwargs)
        elif self.mode == "morphs":
            return self.tagger.morphs(*args, **kwargs)
        raise ValueError(f"unsupported mode: {self.mode}")
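A minimal usage sketch for the Tagger wrapper above (the sample sentence is illustrative):

tagger = Tagger(mode="morphs")
print(tagger("아버지가 방에 들어갑니다."))  # list of morphemes
noun_tagger = Tagger()  # the default mode extracts nouns
print(noun_tagger("아버지가 방에 들어갑니다."))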
Example #7
class DataLoader:
    def __init__(self,
                 path="curse_detection/dataset/long.txt",
                 one_hot=False,
                 max_len=30):
        self.path = path
        self.one_hot = one_hot  # True: two-element one-hot label, False: single 0~1 label
        self.max_len = max_len

        self.komoran = Komoran()

    def get_data(self):
        x, y = self.load()
        x_train, x_test, y_train, y_test = self.split(x, y)
        return x_train, x_test, y_train, y_test

    @staticmethod
    def one_hot_encoding(y):
        # one-hot encode the integer labels into shape (N, 2)
        return np.eye(2)[y.astype("int8")]

    def load(self):
        with open(self.path, 'r', encoding='utf8') as f:
            data = f.read()
        data = data.split('\n')

        x, y = [], []
        for line in data:
            try:
                tmp = self.tokenize('|'.join(line.split('|')[:-1]))
            except UnicodeDecodeError:
                continue
            if len(tmp) > self.max_len:
                continue
            x.append(tmp)
            y.append(line.split('|')[-1].replace('"', ''))
        y = np.array(y, dtype=np.float32)
        if self.one_hot:
            y = self.one_hot_encoding(y)
        return x, y

    @staticmethod
    def split(x, y):
        # train test split
        x, y = shuffle(x, y)
        return train_test_split(x, y, test_size=0.1)

    def tokenize(self, text):
        return self.komoran.morphs(text)
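A hedged usage sketch, assuming the default dataset file exists at curse_detection/dataset/long.txt with '|'-separated lines as the class expects:

loader = DataLoader(one_hot=True, max_len=30)
x_train, x_test, y_train, y_test = loader.get_data()
print(len(x_train), len(x_test))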
Example #8
def vectorize(train, val, test):
    parser = Komoran()

    temp_train = []
    for doc in train:
        temp_train.append(parser.morphs(doc))
    result_train = [' '.join(tokens) for tokens in temp_train]

    temp_val = []
    for doc in val:
        temp_val.append(parser.morphs(doc.replace("[[문단]] ", "")))
    result_val = [' '.join(tokens) for tokens in temp_val]

    temp_test = []
    for doc in test:
        temp_test.append(parser.morphs(doc))
    result_test = [' '.join(tokens) for tokens in temp_test]

    vect = CountVectorizer()
    X_train = vect.fit_transform(result_train)
    X_val = vect.transform(result_val)
    X_test = vect.transform(result_test)

    return X_train, X_val, X_test
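An illustrative call with toy document lists; real usage would pass the project's train/validation/test splits:

train_docs = ["오늘 날씨가 정말 좋다", "영화가 너무 지루했다"]
val_docs = ["[[문단]] 주말에 본 영화는 재미있었다"]
test_docs = ["음식이 맛있고 서비스도 친절했다"]
X_train, X_val, X_test = vectorize(train_docs, val_docs, test_docs)
print(X_train.shape, X_val.shape, X_test.shape)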
Example #9
def hyeongtae(filename):
    tokenizer = Komoran()
    tok_comments = []
    f = open(filename + ".txt", 'r+', -1, "utf-8")

    comments = f.read().splitlines()
    f.close()
    g = open("tok" + filename + ".txt", 'w', -1, "utf-8")
    for com in comments:
        tok_com_l = tokenizer.morphs(com)
        # write the tokens joined by single spaces, one comment per line
        g.write(' '.join(tok_com_l) + '\n')
    g.close()
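An illustrative call, assuming a comments.txt file with one comment per line sits next to the script; the space-joined tokens are written to tokcomments.txt:

hyeongtae("comments")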
Example #10
File: Word2vec.py Project: fora22/DL_Study
def make_token(input_file, output_file):
    komoran = Komoran()
    token_txt_file = open(output_file, "w", encoding="utf-8")
    token_list = []  # avoid shadowing the built-in list type

    with open(input_file, 'r', encoding="utf-8") as f:
        text = f.readlines()
        num = 0

        for i in range(0, len(text)):
            sentence = text[i].strip()
            morphs = komoran.morphs(sentence)
            token_list.append(morphs)
            num += 1

        print(num)

        my_json_string = json.dumps(token_list, ensure_ascii=False)
        token_txt_file.write(my_json_string)

    token_txt_file.close()
Example #11
def run_komoran():
    komoran = Komoran()
    start_time = time.time()
    print('komoran 시작')
    komoran_morphs = komoran.morphs(news1)
    komoran_nouns = komoran.nouns(news1)
    komoran_pos = komoran.pos(news1)
    end_time = time.time()
    print('komoran 끝 - %s 초' % str(end_time - start_time))

    with open('komoran.txt', 'w', encoding='utf-8') as fstream:
        fstream.write('komoran time : %s s\n' % str(end_time - start_time))
        fstream.write('komoran_morphs\n')
        write_list(komoran_morphs, fstream)
        fstream.write('\n\n')

        fstream.write('komoran_nouns\n')
        write_list(komoran_nouns, fstream)
        fstream.write('\n\n')

        fstream.write('komoran_pos\n')
        write_pos(komoran_pos, fstream)
        fstream.write('\n')
Example #12
def translate(inputSentence):
    encText = urllib.parse.quote(inputSentence)
    data = "source=en&target=ko&text=" + encText
    url = "https://openapi.naver.com/v1/papago/n2mt"

    request = urllib.request.Request(url)
    request.add_header("X-Naver-Client-Id", client_id)
    request.add_header("X-Naver-Client-Secret", client_secret)
    response = urllib.request.urlopen(request, data=data.encode("utf-8"))
    rescode = response.getcode()
    if rescode == 200:
        response_body = response.read()
        # print(response_body.decode('utf-8'))
    else:
        print("Error Code:" + str(rescode))

    jsonObject = json.loads(response_body.decode('utf-8'))
    korText = jsonObject.get("message").get("result").get("translatedText")

    komoran = Komoran()

    eng_pos = getEngOrigin.get_eng_origin(inputSentence)
    return [komoran.morphs(korText), komoran.pos(korText)], eng_pos
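A hedged call sketch; it assumes client_id and client_secret hold valid Naver Papago API credentials and that getEngOrigin is importable from the project:

kor_result, eng_pos = translate("The weather is nice today.")
print(kor_result[0])  # Komoran morphemes of the translated Korean text
print(kor_result[1])  # Komoran POS tags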
Example #13
f = open('../NLP/sample_data/stopword_02.txt', 'rt', encoding='utf-8')  # Open the file with UTF-8 encoding
text = f.read()
stopword = text.split('\n') 

# Tokenization via POS tagging (tokens split by part of speech)
# from konlpy.tag import Kkma
# tokenizer = Kkma()
from konlpy.tag import Okt, Kkma, Komoran
okt = Okt()
kkma = Kkma()
komo = Komoran()

tag_data = []
for sentence in all_data['data']:
    temp_x = []
    temp_x = komo.morphs(sentence)
    temp_x = [word for word in temp_x if not word in stopword]
    tag_data.append(temp_x)

# Output for verification
# print('토큰화 된 샘플: ', tag_data[-10:-5])

### Before stopword removal and tokenization ###
#      label                        data
# 35      0                몇 비비가 있다 그랬나
# 36      0  내 친구는 뭐 컴활 이런 게 더 어렵다고 그랬나
# 37      0                  실추라고 뭐 그랬나
# 39      0                         그랬나
# 42      0          선훈이 오빠가 여동생 있다 그랬나

### After stopword removal and tokenization ###
Example #14
    def __init__(self, root, phase='train'):
        print("CustomDataset-> init")
        #count_vectorizer = make_vocab(root)
        self.root = root
        self.phase = phase
        self.labels = {}

        self.label_path = os.path.join(root, self.phase + '_hate.txt')
        with open(self.label_path, 'r',encoding="utf-8") as f:
            temp1 = []
            bias_list = []
            hate_list = []
            for line in f.readlines():
                v = line.strip().split('\t')
                w = v[1]
                # strip punctuation, emoji and stray jamo from the comment text
                for ch in '!.^♡@ㅎㅉ?ㅜㅠ~ㅋㅡㄷㄹㅇ,ㅈ♥ㅁㅊ;ㄴㆍ':
                    w = w.replace(ch, '')
                temp1.append(w)
                if phase != 'test':
                    bias_list.append(v[2])
                    hate_list.append(v[3])
        
        
        stopwords =['의','가','이','은','들','는','좀','잘',
                    '걍','과','도','를','으로','자','에','와','한','하다']
        
        comments_list = []  # comments split into morphemes

        okt = Okt()
        komoran = Komoran()
        tokenizer = RegexTokenizer()
        
        
        for sentence in temp1:
            temp_x = []
            #temp_x = komoran.morphs(sentence, stem=True)
            temp_x = komoran.morphs(sentence)
            #temp_x = tokenizer.tokenize(sentence)
            temp_x = [word for word in temp_x if word not in stopwords]
            comments_list.append(temp_x)  # comments tokenized into morphemes
      
        
        vocab = FreqDist(np.hstack(comments_list))  # word frequency distribution, sorted by count
        
        threshold = 2
        total_cnt = len(vocab)
        rare_cnt = 0
        total_freq = 0
        rare_freq = 0
        
        for key in vocab.keys():
            total_freq = total_freq + vocab[key] 
            if vocab[key] < threshold :
                rare_cnt = rare_cnt+1
                rare_freq = rare_freq + vocab[key]
                
        #         print('문장 집합(vocabulary)의 크기 :',total_cnt)
#         print('등장 빈도가 %s번 이하인 희귀 단어의 수: %s'%(threshold - 1, rare_cnt))
#         print("문장 집합에서 희귀 단어의 비율:", (rare_cnt / total_cnt)*100)
#         print("전체 등장 빈도에서 희귀 단어 등장 빈도 비율:", (rare_freq / total_freq)*100)
        
        vocab_size = total_cnt - rare_cnt + 2
        vocab = vocab.most_common(vocab_size) 

        word_to_index = {word[0]: index + 2 for index, word in enumerate(vocab)}
        word_to_index['pad'] = 0
        word_to_index['unk'] = 1  # reserve 0 for 'pad' and 1 for 'unk'
        encoded = []
        
        for line in comments_list: 
            temp = []
            for w in line: 
                try:
                    temp.append(word_to_index[w])
                except KeyError: 
                    temp.append(word_to_index['unk'])  # map OOV words to the 'unk' index
            encoded.append(temp)
        #print(encoded[0:5])
#         rint(encoded.size())
    
        #max_len = max(len(length) for length in encoded)
        max_len = 74  # fixed padding length
#         print("here")
#         print(a)
#         print("encoded")
#         print(len(encoded))
#         print('문장의최대 길이 : %d' % max_len)
#         print('문장의최소 최소 길이 : %d' % min(len(length) for length in encoded))
#         print('문장의 평균 길이 : %f' % (sum(map(len, encoded))/len(encoded)))
        
        for line in encoded:
            if len(line) < max_len:  # pad samples shorter than max_len
                line += [word_to_index['pad']] * (max_len - len(line))
                
        encoded = torch.LongTensor(encoded)
        #print(encoded[0:10])
        
        
     
        #encoded = pad_sequence(encoded,batch_first=True)
        #print(encoded.size)
        
#         print('패딩결과 최대 길이 : %d' % max(len(l) for l in encoded))
#         print('패딩결과의 최소 길이 : %d' % min(len(l) for l in encoded))
#         print('패딩결과의 평균 길이 : %f' % (sum(map(len, encoded))/len(encoded)))

        comments_vector = []
#         for comment in temp1:
#             comments_vector.append(count_vectorizer.transform([comment]).toarray()[0])
#         comments_vector = torch.FloatTensor(comments_vector)

        self.comments_vec = encoded  # integer-encoded and padded comments
        self.comments_list = temp1  # original cleaned sentences
        print(len(temp1))
Example #15
import pandas as pd
from konlpy.tag import Komoran
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import DBSCAN
from gensim.models import Word2Vec

data = pd.read_excel("C:/Users/leevi/Documents/카카오톡 받은 파일/롱패딩_온라인쇼핑몰후기_종합.xlsx")

dic = {feed_code: {'content': content, 'star': star}
       for feed_code, content, star in zip(data['feed_code'].values,
                                           data['content'].values,
                                           data['star'].values)}

komoran = Komoran()

cv_pos = []
cv_neg = []
for key in dic.keys():
    if type(dic[key]['content']) == str :
        dic[key]['morph'] = komoran.morphs(dic[key]['content'])
        if float(dic[key]['star']) >= 3:
            cv_pos.append(dic[key]['morph'])
        else:
            cv_neg.append(dic[key]['morph'])
    else:
        dic[key]['morph'] = None

model = Word2Vec(cv_pos,size=100, window=3,iter=10)

model.most_similar("정말")


Example #16
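# Note: this comparison snippet assumes mecab, okt, komoran, khaiii, tokenizer and args were initialized earlier in the original script.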
hannanum = Hannanum()
kkma = Kkma()

text = args.text

print("-"*5,"원본 텍스트", "-"*5)
print(text)

print("-"*5, "Mecab", "-"*5)
print(mecab.morphs(text))

print("-"*5, "Okt", "-"*5)
print(okt.morphs(text))

print("-"*5, "Komoran", "-"*5)
print(komoran.morphs(text))

print("-"*5, "Hannanum", "-"*5)
print(hannanum.morphs(text))

print("-"*5, "Kkma", "-"*5)
print(kkma.morphs(text))

print("-"*5, "Khaiii", "-"*5)
tokens = []
for word in khaiii.analyze(text):
    tokens.extend([str(m).split('/')[0] for m in word.morphs])
print(tokens)

print("-"*5, "bert-base-multilingual-cased", "-"*5)
print(tokenizer.tokenize(text))
Example #17
                Keys.PAGE_DOWN)

    komoran = Komoran()

    review_num = 0
    rank_num = 0
    hits = 0

    review_text = []
    review_rank = []
    good = 0
    bad = 0

    for item in driver.find_elements_by_class_name('UD7Dzf'):
        p_list = []
        morph_list = komoran.morphs(item.text)
        for m in morph_list:
            if m in kodict:
                p_list.append(kodict[m])
        print(p_list)
        train_data.append(p_list)
        review_num += 1

    print("-" * 80)

    for i in range(review_num):
        rank = driver.find_element_by_xpath(
            '//*[@id="fcxH9b"]/div[4]/c-wiz/div/div[2]/div/div[1]/div/div/div/div[2]/div/div['
            + str(i + 1) + ']/div/div[2]/div[1]/div[1]/div/span[1]/div/div')
        rank_string = rank.get_attribute("aria-label")
        #print(rank_string)
Example #18
File: show.py Project: nuls2he/crawling
komoran = Komoran()

kresult = []

for data in tresult:
    words = data[1]

    # Flag for whether analysis and processing finished without problems: True on success, False on failure
    state = True

    for word in words:
        try:
            print(komoran.pos(word))
            tag = komoran.pos(word)[0][1]  # POS tag of the first morpheme
            if tag == 'NNG' or tag == 'NNP':
                kresult.append([data[0], komoran.morphs(word)[0]])

                # Whether the word exists in the exception dictionary: True if present, False otherwise
                exist = False
                # Words in the exception dictionary are filtered out before INSERT
                for exc in excdic:
                    sql = 'SELECT INSTR(%s, %s)'
                    cursor.execute(sql, (word, exc[0]))

                    count = cursor.fetchone()
                    if count[0] != 0:
                        print(word + '은(는) 예외 사전에 존재하는 단어입니다.')
                        exist = True
                        break

                if exist:
Example #19
kor = Komoran(userdic='./user_dic.txt')

# Morphemes (particles, endings, etc.) to remove as stopwords
stopwords = ['의', '가', '이', '은', '들', '는', '과', '도', '를', '으로', '이', 'ㅋ',
             '자', '에', '와', '한', '하다', '을', '다', '에서', '하고', 'ㄴ', 'ㄹ',
             '아', '하', '있', '았', '것', '나', '라', '고', '지', '게', '어', '되', '보',
             '면', '거', '네', 'ㅁ', '었', '아서', '겠', '로', '만', 'ㅂ시다', 'ㄴ가',
             '는데', 'ㄴ다', '왜', '어서', '어요', 'ㅂ니다', '으면', '라고', 'ㄴ데', '요',
             '그렇', '부터', 'ㄴ다고', '처럼', '라는', '는지', '습니다', '이다', '죠', '네요',
             'ㅡ', '으니', 'ㄴ다는', 'ㄹ까', 'ㄴ지', '구나', '그리고', 'ㄴ다는데']

train_data_document = []
for sentence in X_train:
    temp_X = kor.morphs(sentence)  # tokenize
    # remove particles registered in the stopword list
    temp_X = [word for word in temp_X if word not in stopwords]
    train_data_document.append(temp_X)

test_data_document = []
for sentence in X_test:
    temp_X = kor.morphs(sentence)  # tokenize
    # remove particles registered in the stopword list
    temp_X = [word for word in temp_X if word not in stopwords]
    test_data_document.append(temp_X)

# Integer-encode the morpheme tokens
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_data_document)
Example #20
text = "아름답지만 다소 복잡하기도 한 한국어는 전세계에서 13번째로 많이 사용되는 언어입니다."
# Komoran morphological analysis
from konlpy.tag import Komoran
komoran = Komoran()  # don't rebind the class name
print(komoran.morphs(text))
print(komoran.nouns(text))
print(komoran.pos(text))
Example #21
from konlpy.tag import Komoran

komoran = Komoran()
print(komoran.morphs('이것은 이밀란이다. 아니 저것은 이별이 아닌가?'))

print(text.split(' '))


# In[6]:


# Komoran

from konlpy.tag import Komoran

# instantiate the tagger
komoran = Komoran()

# Tokenization: morphs

komoran_tokens = komoran.morphs(text)
print(komoran_tokens)


# In[9]:


# Hannanum

from konlpy.tag import Hannanum
hannanum = Hannanum()
hannanum_tokens = hannanum.morphs(text)
print(hannanum_tokens)


# In[10]:
Example #23
from konlpy.tag import Komoran
from tensorflow.keras import Model

from model import ClassificationModel, input_shape

if __name__ == "__main__":
    komoran = Komoran()

    model_parent = ClassificationModel()
    model = model_parent.build_model()
    embedding = model_parent.embedding

    model.load_weights("curse_detection/weights-short.h5")

    att_model = Model(inputs=[model.input], outputs=model.layers[10].output)

    while True:
        inp = input(':')
        inp, mask = embedding([komoran.morphs(inp)])
        out = model.predict((inp, mask)).squeeze(1)
        att = att_model.predict((inp, mask))[1].squeeze(2)
        print(att)
        print(out)
from konlpy.tag import Komoran

# Create a Komoran morphological analyzer object
komoran = Komoran()
text = "아버지가 방에 들어갑니다."

# Extract morphemes
morphs = komoran.morphs(text)
print(morphs)

# Extract morphemes with their POS tags
pos = komoran.pos(text)
print(pos)

# Extract nouns only
nouns = komoran.nouns(text)
print(nouns)
epochs = 50

model = keras.models.load_model('good_bad_' + str(epochs) + '_epochs.h5')

model.summary()

with open('prediction_input.txt', encoding='utf-8') as p_input:
    input_lines = p_input.readlines()
p_input.close()

prediction_input = []

for line in input_lines:
    p_list = []
    morph_list = komoran.morphs(line)
    for m in morph_list:
        if m in kodict:
            p_list.append(kodict[m])
    prediction_input.append(p_list)

prediction_input = keras.preprocessing.sequence.pad_sequences(prediction_input,
                                                              value=0,
                                                              padding='post',
                                                              maxlen=256)

prediction = model.predict_classes(prediction_input)

for i in range(len(prediction)):
    print(input_lines[i])
    if (prediction[i] == 1):
Example #26
def main():
    # def job():
    conn = pymysql.connect(host='192.168.0.61',
                           user='******',
                           password='******',
                           db='one_db',
                           charset='utf8mb4')
    cursor = conn.cursor()

    sql = 'SELECT ono, originaldata, siteno FROM test_original WHERE state = %s'
    cursor.execute(sql, 'N')

    original = cursor.fetchall()

    print('original data')
    print(original)

    # Filter out neologisms (newly coined words)
    sql = 'SELECT word FROM tb_newdic'
    cursor.execute(sql)

    newdic = cursor.fetchall()

    # print('신조어 사전')
    # print(newdic)

    # Fetch the exception-dictionary entries
    sql = 'SELECT word FROM tb_excdic'
    cursor.execute(sql)

    excdic = cursor.fetchall()
    print('예외 사전')
    print(excdic)

    originalList = []
    for data in original:
        dataList = list(data)

        for word in newdic:
            sql = 'SELECT INSTR(%s, %s)'
            cursor.execute(sql, (dataList[1], word[0]))

            count = cursor.fetchone()

            if count[0] != 0:
                print(dataList[1], '에서', word[0], '은(는) 신조어 사전에 존재하는 단어입니다.')
                dataList[1] = dataList[1].replace(word[0], '')

                sql = 'INSERT INTO test_keyword (ono, keyword, siteno) VALUES (%s, %s, %s)'
                cursor.execute(sql, (dataList[0], word[0], dataList[2]))
                conn.commit()

        for word in excdic:
            sql = 'SELECT INSTR(%s, %s)'
            cursor.execute(sql, (dataList[1], word[0]))

            count = cursor.fetchone()

            if count[0] != 0:
                print(dataList[1], '에서', word[0], '은(는) 예외 사전에 존재하는 단어입니다.')
                dataList[1] = dataList[1].replace(word[0], '')

        originalList.append(dataList)

    original = originalList

    # Analyze with the Twitter (Okt) tagger
    from konlpy.tag import Twitter
    twitter = Twitter()

    tresult = []

    for data in original:
        tresult.append([data[0], twitter.nouns(data[1]), data[2]])
        print(twitter.pos(data[1]))

    # Check the Twitter analysis result
    print('twitter result')
    print(tresult)

    # Analyze with Komoran
    from konlpy.tag import Komoran
    komoran = Komoran()

    kresult = []

    for data in tresult:
        words = data[1]

        # Flag for whether analysis and processing finished without problems: True on success, False on failure
        state = True

        for word in words:
            try:
                tag = komoran.pos(word)[0][1]  # POS tag of the first morpheme
                if tag == 'NNG' or tag == 'NNP':
                    kresult.append([data[0], komoran.morphs(word)[0]])

                    # Whether the word exists in the exception dictionary: True if present, False otherwise
                    exist = False
                    # Words in the exception dictionary are filtered out before INSERT
                    for exc in excdic:
                        sql = 'SELECT INSTR(%s, %s)'
                        cursor.execute(sql, (word, exc[0]))

                        count = cursor.fetchone()
                        if count[0] != 0:
                            print(word + '은(는) 사전의 ' + exc[0] + '와(과) 일치')
                            exist = True
                            break

                    if exist:
                        continue

                    # INSERT only NNG/NNP types into the DB
                    # rollback on exception, commit otherwise
                    sql = 'INSERT INTO test_keyword (ono, keyword, siteno) VALUES (%s, %s, %s)'

                    try:
                        if len(komoran.morphs(word)[0]) != 1:
                            cursor.execute(
                                sql,
                                (data[0], komoran.morphs(word)[0], data[2]))

                    except Exception as err:
                        state = False
                        print('ERROR : komoran result의 ' + str(data[0]) +
                              '번 글의 에서 insert 처리 중 오류 발생')
                        print(str(err))
                        conn.rollback()
                    else:
                        conn.commit()

            except Exception as err:
                state = False
                print('ERROR : komoran 키워드 분석 중 오류 발생')
                continue

        ssql = 'UPDATE test_original SET state = %s WHERE ono = %s'
        state = 'Y' if state else 'E'
        cursor.execute(ssql, (state, data[0]))

        conn.commit()

    # Check the Komoran analysis result
    print('komoran result')
    print(kresult)

    print('-----')
    print('끝')


# schedule.every().day.at("").do(job)
#
# while 1:
#     schedule.run_pending()
#     time.sleep(1)
Example #27
            break
    return percent_list

argc = sys.argv

komoran = Komoran()

f = open(argc[1], 'rt')
list_length = -(int(argc[2]))

stc_dic = {}
read_dic = {}

stc = input()
start = time.time()
stc = komoran.morphs(stc)

percent_list = []

for ch in stc:
    if(ch not in stc_dic):
        stc_dic[ch] = 1
    else:
        stc_dic[ch] += 1

while True:

    percent = 0

    read = f.readline()
    if not read:
Example #28
                C_DIM,
                freeze_embeddings=False,
                gpu=USE_CUDA,
                gpu_id=GPU_ID)

test_set = dataset.getTestData(100)

model.load_state_dict(torch.load('models/vae_epoch_300_400.bin'))
for test in test_set:
    results = model.controlSentence(test[0].unsqueeze(1), t=0.5)

    print('Original : ', dataset.idxs2sentence(test[0], no_pad=True))
    print('Positive : ', dataset.idxs2sentence(results[0], no_pad=True))
    print('Negative : ', dataset.idxs2sentence(results[1], no_pad=True))
    print()

tagger = Komoran()

while True:
    sentence = tagger.morphs(input())

    if len(sentence) == 0:
        break

    sentence = dataset.sentence2idxs(sentence).unsqueeze(dim=1)
    results = model.controlSentence(sentence, t=0.5)

    print('Positive : ', dataset.idxs2sentence(results[0], no_pad=True))
    print('Negative : ', dataset.idxs2sentence(results[1], no_pad=True))
    print()
Example #29

# Hannanum Class
from konlpy.tag import Hannanum
hannanum = Hannanum()
print(hannanum.analyze(u'롯데마트의 흑마늘 양념 치킨이 논란이 되고 있다.'))

#Kkma Class
from konlpy.tag import Kkma
kkma = Kkma()
print(kkma.morphs(u'공부를 하면할수록 모르는게 많다는 것을 알게 됩니다.'))

# Komoran Class
from konlpy.tag import Komoran
komoran = Komoran()
print(komoran.morphs(u'우왕 코모란도 오픈소스가 되었어요'))

# MeCab installation needed
from konlpy.tag import Mecab
mecab = Mecab(dicpath="C:\\mecab\\mecab-ko-dic")
print(mecab.morphs(u'영등포구청역에 있는 맛집 좀 알려주세요.'))

# Twitter Class
# from konlpy.tag import Twitter
# twitter = Twitter()
# print(twitter.morphs(u'단독입찰보다 복수입찰의 경우'))

from konlpy.tag import Okt
twitter = Okt()
print(twitter.morphs(u'단독입찰보다 복수입찰의 경우'))
Example #30
# Extract only common nouns (NNG)
tagged_text = kkma.pos(text)
[t[0] for t in tagged_text if t[1] == 'NNG']

# In[14]:

# To extract only nouns: kkma.nouns()
kkma.nouns(text)

# ### 3.3 Komoran

# In[3]:

from konlpy.tag import Komoran
komoran = Komoran(max_heap_size=1024)  # JVM heap memory size (MB) for the dictionary
print(komoran.morphs(text))  # morphological analysis only

# In[4]:

# POS tagging
print(komoran.pos(text))  #ntags=42

# In[5]:

# Extract only common nouns (NNG)
tagged_text = komoran.pos(text)
[t[0] for t in tagged_text if t[1] == 'NNG']

# In[6]:

# To extract only nouns: komoran.nouns()