Example #1
 def __init__(self, filepath, tagger=None):
     if tagger:
         self.tagger = tagger
     else:
         self.tagger = Twitter()
     self.filepath = filepath
     self.rgxSplitter = re.compile('([.!?:](?:["\']|(?![0-9])))')
Example #2
def tokenize_okt_noscreen(df):
    okt = Twitter()
    okt.add_dictionary(call_userword(), 'Noun')
    stopwords = load_wordset('./tokenizer/korean_stopword.txt')
    #stopwords = stopwords | load_wordset('./tokenizer/korean_screen.txt')
    stopwords = list(stopwords)
    df['content_token'] = df.progress_apply(
        lambda x: text_tokenize(x['content'], okt, stopwords), axis=1)
    df['title_token'] = df.progress_apply(
        lambda x: text_tokenize(x['title'], okt, stopwords), axis=1)
    return df
Example #3
    def __init__(self):
        self.reg_reporter = re.compile(r'[가-힣]+\s[가-힣]*기자')  # reporter bylines
        self.reg_email = re.compile(
            r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$')  # email addresses
        self.reg_eng = re.compile(r'[a-z]+')  # lowercase letters, used to strip emails; uppercase is kept
        self.reg_chi = re.compile("[\u4e00-\u9fff]+")  # Chinese characters
        self.reg_sc = re.compile(
            r"·|…|◆+|◇+|▶+|●+|▲+|“|”|‘|’|\"|\'|\(|\)|\W+")  # special characters
        self.reg_date = re.compile(
            r'\d+일|\d+월|\d+년|\d+시|\d+분|\(현지시간\)|\(현지시각\)|\d+')  # dates, times, numbers

        self.twitter_obj = Twitter()
        self.stopwords = []
        self.noun_list = []
Example #4
 def __init__(self, textIter, tagger=None):
     # Preprocessing 1 (add words to the morpheme dictionary) -> register words the morphological analyzer cannot recognize on its own
     with open('형태소 보완.txt') as f:
         a = f.read().split('\n')
     if tagger:
         self.tagger = tagger
     else:
         self.tagger = Twitter()
         self.tagger.add_dictionary(a, 'Noun')
     if type(textIter) == str:
         self.textIter = textIter.split('\n')
     else:
         self.textIter = textIter
     self.rgxSplitter = re.compile('([.!?:](?:["\']|(?![0-9])))')
Example #5
def run_twitter(news):
    twitter = Twitter()
    start_time = time.time()
    print('twitter 시작')
    #     twitter_morphs = twitter.morphs(news)
    twitter_nouns = twitter.nouns(news)
    #     twitter_pos = twitter.pos(news)
    end_time = time.time()
    #     print(twitter_pos)
    print('twitter 끝 - %s 초' % str(end_time - start_time))

    with open('twitter_noun.txt', 'w', encoding='utf-8') as fstream:
        #         fstream.write('twitter time : %s s\n' % str(end_time - start_time) )
        #         fstream.write('twitter_morphs\n')
        #         write_list(twitter_morphs, fstream)
        #         fstream.write('\n\n')
        #
        fstream.write('twitter_nouns\n')
        write_list(twitter_nouns, fstream)
        fstream.write('\n\n')
Example #6
def Tokenizer(data):

    import pandas as pd
    from ckonlpy.tag import Twitter

    twitter = Twitter()

    # add entries to the user dictionary
    txt = pd.read_csv('사용자 사전.txt', sep='\n')
    txt = txt['<사용자 사전>']
    for line in txt:
        twitter.add_dictionary(line, 'Noun')  # register each word, not the whole Series

    # get the data
    new_hashtags = data.hashtags.copy()

    # tokenization
    for i in range(len(new_hashtags)):
        new_hashtags[i] = ' '.join(new_hashtags[i])

    tokenized = []

    for sentence in new_hashtags:
        tokens = twitter.morphs(sentence)
        tokenized.append(tokens)

    # remove consecutive duplicates
    new_tokenized = []

    for x in range(len(tokenized)):
        temp = []

        for y in range(len(tokenized[x]) - 1):
            if tokenized[x][y] != tokenized[x][y + 1]:
                temp.append(tokenized[x][y])

        if tokenized[x]:
            temp.append(tokenized[x][-1])  # keep the last token, which the loop above never reaches

        new_tokenized.append(temp)

    return new_tokenized
Example #7
class RawTagger:
    def __init__(self, textIter, tagger=None):
        # Preprocessing 1 (add words to the morpheme dictionary) -> register words the morphological analyzer cannot recognize on its own
        with open('형태소 보완.txt') as f:
            a = f.read().split('\n')
        if tagger:
            self.tagger = tagger
        else:
            self.tagger = Twitter()
            self.tagger.add_dictionary(a, 'Noun')
        if type(textIter) == str:
            self.textIter = textIter.split('\n')
        else:
            self.textIter = textIter
        self.rgxSplitter = re.compile('([.!?:](?:["\']|(?![0-9])))')

    def __iter__(self):
        for line in self.textIter:
            ch = self.rgxSplitter.split(line)
            for s in map(lambda a, b: a + b, ch[::2], ch[1::2]):
                if not s: continue
                yield self.tagger.pos(s)
Example #8
class RawTaggerReader:
    def __init__(self, filepath, tagger=None):
        if tagger:
            self.tagger = tagger
        else:
            self.tagger = Twitter()
        self.filepath = filepath
        self.rgxSplitter = re.compile('([.!?:](?:["\']|(?![0-9])))')

    def __iter__(self):
        for line in open(self.filepath, encoding='utf-8'):
            ch = self.rgxSplitter.split(line)
            for s in map(lambda a, b: a + b, ch[::2], ch[1::2]):
                if not s: continue
                yield self.tagger.pos(s)
Example #9
def tokenize_okt_noscreen(df):
    okt = Twitter()
    okt.add_dictionary(call_userword(), 'Noun')
    stopwords = load_wordset('./tokenizer/korean_stopword.txt')
    #stopwords = stopwords | load_wordset('./tokenizer/korean_screen.txt')
    stopwords = list(stopwords)
    df['content_token'] = df.progress_apply(lambda x: [t[0] for t in okt.pos(
        x['content'], stem=True) if t[1] in ['Noun', 'Verb', 'Adjective'] and t[0] not in stopwords and len(t[0]) != 1], axis=1)
    df['title_token'] = df.progress_apply(lambda x: [t[0] for t in okt.pos(
        x['title'], stem=True) if t[1] in ['Noun', 'Verb', 'Adjective'] and t[0] not in stopwords and len(t[0]) != 1], axis=1)
    return df
Example #10
import time
import string
import datetime
import csv
from ckonlpy.tag import Twitter
from selenium import webdriver
from bs4 import BeautifulSoup

driver = webdriver.Chrome("c:/Users/yooat/Downloads/chromedriver/chromedriver")

driver.get('http://www.cheonan.go.kr/covid19/sub02_01.do')
time.sleep(1)
twitter = Twitter()
html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')
name = soup.find_all("dl",class_="item")
f1 = open('corona.txt','w+t')

for test in name:
    if  "14일이" in test.get_text():
        continue
    f1.write(test.get_text() + "\n")
f1.close();

f1 = open('corona.txt','r')
nowDate = datetime.datetime.now()
c = csv.writer(open(nowDate.strftime("result_" + "%Y-%m-%d_%H-%M-%S") + ".csv","w",encoding="cp949"))
for l in f1:
    c.writerow(twitter.nouns(l))
time.sleep(3)
Example #11
import urllib.request
from soynlp import DoublespaceLineCorpus
from soynlp.word import WordExtractor
from soynlp.tokenizer import MaxScoreTokenizer
from soynlp.tokenizer import LTokenizer
# -*- coding: utf-8 -*-
from ckonlpy.tag import Twitter

from pykospacing import spacing

# automatically restores word spacing
sent = "위 인수들을 사용할 때 고려해야 될점이있습니다. audio 데이터의 어떤 시점에 하나의 단어가 언급되고 있다면 그 단어는 잘려서 이상하게 인식될 것입니다. 이 harvard 데이터는 실험 목적으로 녹음된 것이기 때문에 초 단위로 잘라도 단어가 잘리지 않은 것 입니다."
new_sent = sent.replace(" ", '')
print(new_sent)
kospacing_sent = spacing(new_sent)
print(sent)
print(kospacing_sent)

# register specific words as nouns
twitter = Twitter()
#twitter.add_dictionary('띄어쓰기', 'Noun')
print(twitter.morphs(kospacing_sent))

Example #12
    def token(self, title, ccontent, creplies):
        memory = psutil.Process(os.getpid())

        T_OR_title = []
        T_title = []
        T_OR_ccontent = []
        T_ccontent = []
        T_OR_creplies = []
        T_creplies = []

        twitter = Okt()  # initialize so the Twitter morpheme dictionary can be used
        twitter.add_dictionary('백래시', 'Noun')
        twitter.add_dictionary('문재앙', 'Noun')

        #### tokenize the titles
        #print('1')
        for i in range(len(title)):

            a = twitter.pos(title[i])
            b = []
            #print('title[i]',i,title[i])
            for j in range(len(a)):
                if a[j][1] != 'Punctuation':  # drop tuples tagged 'Punctuation' (error workaround)
                    b.append(a[j])
                    #print('3',j)
            T_OR_title.append(b)
            T_title.append(twitter.morphs(title[i]))

            #### tokenize ccontent
            try:
                c = twitter.pos(str(ccontent[i]))
                d = []
                # print('ccontent[i]',i, ccontent[i])
                for w in range(len(c)):
                    if c[w][1] != 'Punctuation':  # drop tuples tagged 'Punctuation' (error workaround)
                        d.append(c[w])
                        #print('4',w)
                T_OR_ccontent.append(d)
                T_ccontent.append(twitter.morphs(str(ccontent[i])))

            except RuntimeError as e:
                T_OR_ccontent.append('')
                T_ccontent.append(twitter.morphs(''))

            ### tokenize the replies
            #print('creplies[i]',i,creplies[i])

            if type(creplies[i]) == str:  # tokenize a reply given as a single string
                a = [creplies[i]]  # wrap the string in a list
                e = twitter.pos(str(a))
                f = []
                for u in range(len(e)):
                    # keep everything except punctuation and Korean particles
                    if e[u][1] not in ('Punctuation', 'KoreanParticle'):
                        f.append(e[u])
                    #print('5',u)
                T_OR_creplies.append(f)
                T_OR_creplies.append(twitter.pos(str(a)))
                T_creplies.append(twitter.morphs(str(a)))

            else:
                temp = []
                temp2 = []

                x = []

                for n in range(len(creplies[i])):  ### replies returned as a list
                    h = twitter.pos(creplies[i][n])
                    #print('6',n)

                    for z in range(len(h)):
                        # keep everything except punctuation and Korean particles
                        if h[z][1] not in ('Punctuation', 'KoreanParticle'):
                            x.append(h[z])
                    # print('7',z)
                    # print('8',)
                    #print('h',z,h)

                    temp.append(x)
                    temp2.append(twitter.morphs(creplies[i][n]))

                T_OR_creplies.append(temp)
                T_creplies.append(temp2)

        return T_OR_title, T_title, T_OR_ccontent, T_ccontent, T_OR_creplies, T_creplies
Example #13
spelled_sent = spell_checker.check(sent)

hanspell_sent = spelled_sent.checked
print(hanspell_sent)

test_sent = spell_checker.check(sent2)
test2 = test_sent.checked
print(test2)

# !pip install konlpy
# !pip install customized_konlpy

from ckonlpy.tag import Twitter

twitter = Twitter()

twitter.morphs('은경이는 사무실로 갔습니다.')
twitter.add_dictionary('은경이', 'Noun')

print(twitter.morphs('은경이는 사무실로 갔습니다.'))

# Today's final exercise: take a Korean sentence that is not grammatically correct, run it through the spell checker,
# tokenize the corrected text, and arrange it into the training data and labels to be fed into an RNN.

test = "RNN신경망은 너무어려워서 하나도모르 겠습니다.\n 외않되는지매 일공부해 봐도모르 겠습니다.\n 살려주세 요."
gram_test = spell_checker.check(test)
rs_test = gram_test.checked
print(rs_test)

from tensorflow.keras.utils import to_categorical
Example #14
#%%
import numpy as np
import pandas as pd
from collections import Counter
from pathlib import Path
import argparse
from ckonlpy.tag import Twitter
from ckonlpy.tag import Postprocessor
twitter = Twitter()
import ast

# if an attr keyword appears in a review, assign that review to the corresponding attr
#%%
attr_df = pd.read_excel(Path(
    "G:/공유 드라이브/속성사전/Data/Electronics/RiceCooker/2. Seed D_전기밥솥_copy_jk_20201130.xlsx"
),
                        sheet_name='1B')
review_df = pd.read_csv(
    Path("G:/공유 드라이브/속성사전/Data/Electronics/RiceCooker/review_20201129.csv"))

review_list = [(body, rating)
               for body, rating in zip(review_df['body'], review_df['rating'])]

# %%
# dict used to collect reviews per attr
cooker_review = {}
for attr in attr_df['attrs']:
    cooker_review[attr] = []

# dict of keywords used as a first-pass filter per attr
Example #15
import math
import pickle
from tqdm import tqdm
from datetime import datetime
from pymongo import MongoClient
from collections import Counter

from konlpy.tag import Okt; okt = Okt()
from ckonlpy.tag import Twitter; spliter = Twitter()

def AddWord(add_word, pos):
    spliter.add_dictionary(add_word, pos)
    
def Tokenize_list(docs_list):
    # input must be a list
    tokenize_ = []
    for doc_ in docs_list:
        tokenize_.append(spliter.nouns(doc_))
        # tokenize_.append(spliter.morphs(doc_))  
    return tokenize_

def Tokenize_list_morphs(docs_list):
    # input must be a list
    tokenize_ = []
    for doc_ in docs_list:
        tokenize_.append(spliter.morphs(doc_))
    return tokenize_

  
def Tokenize_dict(docs_dict, keyname = 'content'):
Example #16
 def __init__(self):
     self.twitter = Twitter()
Example #17
class Social_analysis():

    non_bmp_map = dict.fromkeys(range(0x10000, sys.maxunicode + 1), 0xfffd)

    def __init__(self):
        self.twitter = Twitter()

    def pickle_to_table(self, filename):
        with open(filename, 'rb') as f:
            data = pickle.load(f)
        data = data[1:]
        for idx, i in enumerate(data):
            data[idx][2] = i[2].replace('#', ' ').translate(self.non_bmp_map)
            data[idx][3] = '/'.join(i[3])
            data[idx][4] = '/'.join(i[4])
        self.raw_data = np.array(data)

    def DB_to_table(self, DBname='intake', keyword='intake'):
        self.query = \
        """
        SELECT keyword, created_at, post_name, main_text, current_url FROM NaverBlogReview WHERE keyword = '{}'
        """.format(keyword)
        conn = pymssql.connect(
            "intakedb.c63elkxbiwfc.us-east-2.rds.amazonaws.com:1433", "gh",
            "ghintake", DBname)
        df = pdsql.read_sql_query(self.query, con=conn)
        # df['main_text'] = df.main_text.apply(lambda x: x.replace('#',' ').translate(self.non_bmp_map))
        # df['created_at'] = df.created_at.apply(lambda x: x.strftime("%Y-%m-%d %H:%M:%S"))
        conn.close()
        self.raw_data = df.as_matrix()

    # def hashtags_split(self, hashtags):
    #     hashtags_split = []
    #     for i in hashtags:
    #         hashtags_split.append(i.split('/'))
    #
    #     hashtags_list = []
    #
    #     for i in hashtags_split:
    #         temp = []
    #         for j in i:
    #             if self.isHangul(j):
    #                 t_hashtags = j.translate(self.non_bmp_map)
    #                 temp.append(t_hashtags)
    #         hashtags_list.append(temp)
    #     self.hashtags_list = hashtags_list
    #
    #     return hashtags_list

    def add_keyword_dic(self, keyword_list, tag='Noun'):
        for i in keyword_list:
            if type(i) == tuple:
                self.twitter.add_dictionary(i[0], i[1])
            else:
                self.twitter.add_dictionary(i, tag)

    def morph_pos(self, text_list, exception_list=['맛', '밥', '물', '몸']):

        morph_list = []
        noun_list = []
        adj_list = []
        verb_list = []

        for j in text_list:
            parsed = self.twitter.pos(j)
            temp = []
            n_temp = []
            adj_temp = []
            verb_temp = []

            for i in parsed:
                if self.isHangul(i[0]):
                    if ((len(i[0]) > 1) or (i[0] in exception_list)):
                        temp.append(i)
                        if i[1] == 'Noun':
                            n_temp.append(i[0])
                        elif i[1] == 'Verb':
                            verb_temp.append(i[0])
                        elif i[1] == 'Adjective':
                            adj_temp.append(i[0])
                    else:
                        print('{} 제외'.format(i[0]))
                else:
                    print('{} 한글이 아님.'.format(i[0]))

            morph_list.append(temp)
            noun_list.append(n_temp)
            adj_list.append(adj_temp)
            verb_list.append(verb_temp)

        nav_list = noun_list + adj_list + verb_list

        return morph_list, nav_list, noun_list, adj_list, verb_list

    def merge_list(self, tokenized_list):
        return [j for i in tokenized_list for j in i]

    def join_list(self, tokenized_list):
        joined_list = []
        for idx, i in enumerate(tokenized_list):
            joined_list.append(" ".join(i))
        return joined_list

    def split_list(self, untokenized_list):
        hashtag_splited = []
        for idx, i in enumerate(untokenized_list):
            hashtag_splited.append(i.split('/'))
        return hashtag_splited

    def word_substitute(self, dataset, sublist):
        dataset = copy.deepcopy(dataset)
        sub_book = dict()
        for i in sublist:
            for j in i['sub_words']:
                sub_book[j] = i['main']
        gc.collect()
        for n, i in enumerate(dataset):
            dataset[n] = [sub_book.get(item, item) for item in i]

        del sub_book
        gc.collect()

        return dataset

    def word_delete(self, dataset, del_list):
        dataset = copy.deepcopy(dataset)

        for n, line in enumerate(dataset):
            dataset[n] = [i for i in line if i not in del_list]

        return dataset

    def isHangul(self, text):
        encText = text
        hanCount = len(re.findall(u'[\u3130-\u318F\uAC00-\uD7A3]+', encText))
        return hanCount > 0
Example #18
from ckonlpy.tag import Twitter
from konlpy.tag import Hannanum, Kkma, Komoran, Okt
from eunjeon import Mecab

test_text = "확진자와 접촉자는 다중이용시설 이용을 삼가하고, 사회적 거리두기 운동에 동참하며, 진료소와 마스크 착용을 자제해주시기 바랍니다."

# Customized Konlpy
twitter = Twitter()
twitter.add_dictionary(["확진자", "접촉자", "다중이용시설", "사회적", "거리두기", "진료소"], "Noun")
twitter.add_dictionary(["드립니다", "하시기", "해주시고", "해주시기", "지켜주십시오"], "Verb")
print(f"Customized Konlpy : {twitter.nouns(test_text)}")

# Hannanum
hannanum = Hannanum()
print(f"Hannanum : {hannanum.nouns(test_text)}")

# Kkma
kkma = Kkma()
print(f"Kkma : {kkma.nouns(test_text)}")

# Komoran
komoran = Komoran()
print(f"Komoran : {komoran.nouns(test_text)}")

# Okt
okt = Okt()
print(f"Okt : {okt.nouns(test_text)}")

# Mecab
mecab = Mecab()
print(f"Mecab : {mecab.nouns(test_text)}")
Example #19
def naver():
    from selenium import webdriver
    import re
    from selenium.webdriver.common.keys import Keys
    import time
    cr_name = 'naver'
    # check the image save directory
    save_path = os.path.join(Main.img_path, cr_name)
    if os.path.isdir(save_path):
        print(cr_name + ' 이미지 경로 확인 완료')
    elif os.path.isdir(Main.img_path):
        os.mkdir(save_path)
    else:
        os.mkdir(Main.img_path)
        os.mkdir(save_path)

    text_save_path = os.path.join(Main.text_path, cr_name)
    if os.path.isdir(text_save_path):
        print(cr_name + ' 텍스트 경로 확인 완료')
    elif os.path.isdir(Main.text_path):
        os.mkdir(text_save_path)
    else:
        os.mkdir(Main.text_path)
        os.mkdir(text_save_path)

    # code that fetches the Naver headlines

    date = time.strftime('%Y%m%d', time.localtime(time.time()))
    date2 = time.strftime('%Y%m%d_%H%M', time.localtime(time.time()))

    result = []
    res = []

    # web driver setup
    chrome = chromedriver.generate_chrome(driver_path=Main.driver_path,
                                          headless=Main.headless,
                                          download_path=Main.DOWNLOAD_DIR)

    # connect to the web - open Naver images
    print("Naver 접속중")
    # driver = webdriver.Chrome(executable_path="./chromedriver.exe")
    # driver.implicitly_wait(30)

    url = 'https://news.naver.com/main/ranking/popularDay.nhn?rankingType=popular_day&date={}'.format(
        date)
    chrome.get(url)
    time.sleep(2)

    # scroll(3)
    for sun in range(4, 10):
        pr = chrome.find_elements_by_xpath(
            '//*[@id="wrap"]/table/tbody/tr/td[2]/div/div[{}]'.format(sun))
        for p in pr:
            result.append(p.find_elements_by_tag_name('a'))
        # print(result)

        for i, q in enumerate(result):
            for e in q:
                res.append(e.get_attribute('href'))
    http = list(set(res))
    len(http)
    https = []

    for idx in range(len(http)):
        if http[idx].find('popularDay') >= 0:
            continue
        else:
            https.append(http[idx])

    files = pd.DataFrame()

    for i in range(len(https)):
        res = requests.get(https[i])
        soup = BeautifulSoup(res.content, 'html.parser')
        body = soup.select('._article_body_contents')
        files = files.append(
            pd.DataFrame(
                {
                    'Title':
                    soup.find('div', attrs={
                        'class': 'article_info'
                    }).h3.text,
                    'Contents':
                    re.sub(
                        '   ', '',
                        re.sub(
                            '    ', '',
                            re.sub(
                                '\t', '',
                                cleanText(body[0].text)
                                [(cleanText(body[0].text)).find('{}') + 2:]))),
                    'link':
                    https[i]
                },
                index=[i]))

    text2 = files.Contents
    # save to a CSV text file
    files.to_csv(text_save_path + '/네이버종합뉴스_{}.csv'.format(date2),
                 index=False,
                 encoding='utf-8')

    # -------------------------------------

    # build the user dictionary
    from ckonlpy.tag import Twitter
    t = Twitter()
    t.add_dictionary(Main.sajun(), 'Noun')

    import nltk
    tokens_ko = []

    for i in range(len(text2)):
        tokens_ko.append(t.nouns(text2[i]))

    final = []
    for _, q in enumerate(tokens_ko):
        for i in range(len(q)):
            final.insert(-1, q[i])

    ko = nltk.Text(final, name="첫번째")
    data = ko.vocab().most_common(1000)

    data_1 = []
    for i in range(len(data)):
        for q in range(0, 1, 1):
            if len(data[i][0]) >= 2:
                data_1.append(data[i])

    from wordcloud import WordCloud
    import matplotlib.pyplot as plt

    import time
    date = time.strftime('%Y%m%d', time.localtime(time.time()))
    date2 = time.strftime('%Y%m%d_%H%M', time.localtime(time.time()))

    tmp_data = dict(data_1)

    wordcloud = WordCloud(font_path='/Library/Fonts/NanumMyeongjo.ttf',
                          background_color='white',
                          max_words=230).generate_from_frequencies(tmp_data)
    plt.figure(figsize=(10, 8))
    plt.imshow(wordcloud)
    plt.axis('off'), plt.xticks([]), plt.yticks([])
    plt.tight_layout()
    plt.subplots_adjust(left=0, bottom=0, right=1, top=1, hspace=0, wspace=0)
    plt.savefig(save_path + "/naver_{}.png".format(date),
                bbox_inches='tight',
                dpi=400,
                pad_inches=0)
Example #20
class PreprocessingText:
    def help(self):
        print("******PreprocessingText******")
        print("1) make_content_re(df['컬럼이름'](Series)) : 입력받은 열을 전처리 후 시리즈로 반환")
        print("2) add_noun_dict('list') : 명사 사전에 단어 추가")
        print("3) add_stopwords('list') : 불용어 사전에 단어 추가")
        print("4) tokenize(df['컬럼이름'](Series)) : 입력받은 열을 토큰화한 후 시리즈로 반환")
        print(
            "5) change_similar_words(토큰화된 문서(Series), 유의어 사전(dictionary)) : 유의어 사전을 기반으로 문서 내 유의어를 대표어로 변환하고, 변환된 문서를 시리즈로 반환한다."
        )
        print("*****************************")

    def __init__(self):
        self.reg_reporter = re.compile(r'[가-힣]+\s[가-힣]*기자')  # reporter bylines
        self.reg_email = re.compile(
            r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$')  # email addresses
        self.reg_eng = re.compile(r'[a-z]+')  # lowercase letters, used to strip emails; uppercase is kept
        self.reg_chi = re.compile("[\u4e00-\u9fff]+")  # Chinese characters
        self.reg_sc = re.compile(
            r"·|…|◆+|◇+|▶+|●+|▲+|“|”|‘|’|\"|\'|\(|\)|\W+")  # special characters
        self.reg_date = re.compile(
            r'\d+일|\d+월|\d+년|\d+시|\d+분|\(현지시간\)|\(현지시각\)|\d+')  # dates, times, numbers

        self.twitter_obj = Twitter()
        self.stopwords = []
        self.noun_list = []

    def preprocessing(self, doc):
        tmp = re.sub(self.reg_reporter, '', doc)
        tmp = re.sub(self.reg_email, '', tmp)
        tmp = re.sub(self.reg_eng, '', tmp)
        tmp = re.sub(self.reg_chi, '', tmp)
        tmp = re.sub(self.reg_sc, ' ', tmp)
        tmp = re.sub(self.reg_date, '', tmp)
        return tmp

    def make_content_re(self, data):
        pp_data = data.apply(self.preprocessing)
        return pp_data

    def add_noun_dict(self, noun_list):
        self.twitter_obj.add_dictionary(noun_list, 'Noun')
        self.noun_list.extend(noun_list)
        print("추가한 명사")
        print(noun_list)

    def add_stopwords(self, stopword_list):
        self.stopwords.extend(stopword_list)
        print("추가한 불용어")
        print(stopword_list)

    def change_similar_words(self, tokenized_docs, similar_words_dict):
        changed_docs = []
        for doc in tokenized_docs:
            changed_doc = []
            for word in doc:
                if word in similar_words_dict.keys():
                    changed_doc.append(similar_words_dict[word])
                else:
                    changed_doc.append(word)
            changed_docs.append(changed_doc)
        return changed_docs

    def tokenize(self, data):
        print('추가한 명사:', self.noun_list)
        print('불용어: ', self.stopwords)
        tokenized_doc = data.apply(lambda x: self.twitter_obj.nouns(x))
        tokenized_doc_without_stopwords = tokenized_doc.apply(
            lambda x:
            [item.lower() for item in x if item not in self.stopwords])
        tokenized_data = tokenized_doc_without_stopwords
        return pd.Series(tokenized_data)
Example #21
    idx2word = {idx: word for word, idx in word2idx.items()}

    # length of the longest sample
    story_max_len = np.max(story_len)
    question_max_len = np.max(question_len)

    return word2idx, idx2word, story_max_len, question_max_len


# %%

word2idx, idx2word, story_max_len, question_max_len = preprocess_data(
    train_data, test_data)
print(word2idx)

twitter = Twitter()

twitter.add_dictionary('은경이', 'Noun')
twitter.add_dictionary('경임이', 'Noun')
twitter.add_dictionary('수종이', 'Noun')

# print(twitter.morphs('은경이는 화장실로 이동했습니다.'))
# print(twitter.morphs('경임이는 정원으로 가버렸습니다.'))
# print(twitter.morphs('수종이는 복도로 뛰어갔습니다.'))
# print(twitter.morphs('필웅이는 부엌으로 복귀했습니다.'))
# print(twitter.morphs('수종이는 사무실로 갔습니다.'))
# print(twitter.morphs('은경이는 침실로 갔습니다.'))


def tokenize(sent):
    return twitter.morphs(sent)
Example #22
import string
import csv
from ckonlpy.tag import Twitter
twitter = Twitter()
f = open("Han.txt", mode = "r", encoding = "utf-8")
c = csv.writer(open("HanKeoRyeKonlPy.csv","w",encoding = "utf-8"))
for t in f:
    c.writerow(twitter.morphs(t))
    
Example #23
class Social_analysis():

    non_bmp_map = dict.fromkeys(range(0x10000, sys.maxunicode + 1), 0xfffd)

    def __init__(self):
        self.twitter = Twitter()

    def pickle_to_table(self, filename):
        with open(filename, 'rb') as f:
            data = pickle.load(f)
        data = data[1:]
        for idx, i in enumerate(data):
            data[idx][2] = i[2].replace('#', ' ').translate(self.non_bmp_map)
            data[idx][3] = '/'.join(i[3])
            data[idx][4] = '/'.join(i[4])
        self.raw_data = np.array(data)

    def hashtags_split(self, hashtags):
        hashtags_split = []
        for i in hashtags:
            hashtags_split.append(i.split('/'))

        hashtags_list = []

        for i in hashtags_split:
            temp = []
            for j in i:
                if self.isHangul(j):
                    t_hashtags = j.translate(self.non_bmp_map)
                    temp.append(t_hashtags)
            hashtags_list.append(temp)
        self.hashtags_list = hashtags_list

        return hashtags_list

    def add_keyword_dic(self, keyword_list, tag='Noun'):
        for i in keyword_list:
            if type(i) == tuple:
                self.twitter.add_dictionary(i[0], i[1])
            else:
                self.twitter.add_dictionary(i, tag)

    def morph_pos(self, text_list, exception_list=['맛', '밥', '물', '몸']):

        morph_list = []
        noun_list = []
        adj_list = []
        verb_list = []

        for j in text_list:
            parsed = self.twitter.pos(j)
            temp = []
            n_temp = []
            adj_temp = []
            verb_temp = []

            for i in parsed:
                if self.isHangul(i[0]):
                    if ((len(i[0]) > 1) or (i[0] in exception_list)):
                        temp.append(i)
                        if i[1] == 'Noun':
                            n_temp.append(i[0])
                        elif i[1] == 'Verb':
                            verb_temp.append(i[0])
                        elif i[1] == 'Adjective':
                            adj_temp.append(i[0])
                    else:
                        print('{} 제외'.format(i[0]))
                else:
                    print('{} 한글이 아님.'.format(i[0]))

            morph_list.append(temp)
            noun_list.append(n_temp)
            adj_list.append(adj_temp)
            verb_list.append(verb_temp)

        nav_list = noun_list + adj_list + verb_list

        return morph_list, nav_list, noun_list, adj_list, verb_list

    def merge_list(self, tokenized_list):
        return [j for i in tokenized_list for j in i]

    def join_list(self, tokenized_list):
        joined_list = []
        for idx, i in enumerate(tokenized_list):
            joined_list.append(" ".join(i))
        return joined_list

    def split_list(self, untokenized_list):
        hashtag_splited = []
        for idx, i in enumerate(untokenized_list):
            hashtag_splited.append(i.split('/'))
        return hashtag_splited

    def word_substitute(self, dataset, sublist):
        dataset = copy.deepcopy(dataset)
        sub_book = dict()
        for i in sublist:
            for j in i['sub_words']:
                sub_book[j] = i['main']
        gc.collect()
        for n, i in enumerate(dataset):
            dataset[n] = [sub_book.get(item, item) for item in i]

        del sub_book
        gc.collect()

        return dataset

    def word_delete(self, dataset, del_list):
        dataset = copy.deepcopy(dataset)

        for n, line in enumerate(dataset):
            dataset[n] = [i for i in line if i not in del_list]

        return dataset

    def isHangul(self, text):
        encText = text
        hanCount = len(re.findall(u'[\u3130-\u318F\uAC00-\uD7A3]+', encText))
        return hanCount > 0
Example #24
def kor_preprocessing(q, q3, df):
    data = df.copy().reset_index(drop=True)
    temp = []

    data = data.str.join('').str.replace(r"\n", "")
    data = data.str.replace(pat=r'[^\w]', repl=r'', regex=True)

    for i in range(len(data)):
        okt = Okt()
        new = okt.normalize(data[i])  # normalization

        new = only_hangle(new)
        new = emoticon_normalize(new,
                                 num_repeats=2)  # ㅋㅋㅋㅋㅋㅋ -> ㅋㅋ, ㅠㅠㅠㅠ -> ㅠㅠ

        data[i] = data[i].replace(" ", '')

        spacing = Spacing()
        new = spacing(data[i])  # Apply space preprocessing
        try:
            new = spell_checker.check(new).checked  # fix typos with the spell checker
        except:
            print(new)
        temp.append(new)

    data = pd.Series(temp)

    # add newly coined words to the dictionary
    token = Twitter()  # added
    adding_noun = [
        '식후감', '존맛', '개존맛', '꿀맛', '짱맛', '요기요', 'ㅈㅁㅌ', 'ㅃㄲ', '소확행', '민초', '치밥',
        '소맥', '넘사벽', '순삭', '빛삭', '광삭', '반반무', '반반무마니', '솔까말', '스압', '썸남', '썸녀',
        'jmt', 'jmtg', 'jmtgr', 'JMT', 'JMTG', 'JMTGR', '배불띠', '돈쭐', '쿨타임',
        '닥추', '강추', '유튜버', '홧팅', '팟팅', '단짠단짠', '단짠', '맵단', '맵달', '맛도리', '부조캐',
        '밍밍쓰', '노맛', '존노맛', '최애', '차애', '섭스', '서빗', '프레젠또', '존맛탱', '개존맛탱',
        '존맛탱구리', '킹맛', '댕맛', '뿌링클', '로제', '오레오', '로투스', '사장님', '싸장님', '사장뉨',
        '소소한', '프라프치노', ' 프라푸치노', '갓성비', '커엽', '굳잡', '굿잡', '굳굳', '이벵트', '이벵'
    ]

    for i in adding_noun:
        token.add_dictionary(i, 'Noun')  # add as nouns

    adding_verb = ['맛나', '마이쩡', '마이쪙', '마시땅', '마시쩡', '마시쪙']

    for i in adding_verb:
        token.add_dictionary(i, 'Noun')  # add verbs (tagged as 'Noun' here)

    token.add_dictionary('잘', 'Noun')  # add verb (tagged as 'Noun' here)

    token = Okt()
    # stopword list
    with open('stop.txt', 'rt', encoding='UTF8') as f:
        stopwords = f.read().replace('\n', ' ')
    stopwords = stopwords.split(' ')

    result = []
    for i in range(len(data)):
        review = data[i]
        temp = (token.morphs(review, norm=True, stem=True))

        stopwords_removed_sentence = [
            word for word in temp if not word in stopwords
        ]  # remove stopwords
        sentence = ''

        for s in stopwords_removed_sentence:
            sentence = sentence + ' ' + s
        result.append(sentence)
    q.put(result)
    q3.put(df)
Example #25
from itertools import repeat

import re
import requests
from fileIO import openJsonFile, closeJsonFile, saveError
from dbIO import readDB, insertDB

import nltk
from nltk.corpus import stopwords
from konlpy.tag import Okt
from ckonlpy.tag import Twitter, Postprocessor
from ckonlpy.utils import load_wordset, load_ngram

# nltk.download('punkt')
# nltk.download('stopwords')
okt = Okt()
twitter = Twitter()
stopwordsKR = load_wordset('cleansing_data/korean_stopwords.txt',
                           encoding='ANSI')
customStopwordsEN = load_wordset('cleansing_data/english_stopwords.txt',
                                 encoding='ANSI')
stopwordsEN = customStopwordsEN.union(set(stopwords.words('english')))
ngrams = load_ngram('cleansing_data/korean_ngram.txt')
userdicts = load_wordset('cleansing_data/korean_user_dict.txt')
twitter.add_dictionary(list(userdicts), 'Noun', force=True)


def getJobGroups():
    res = requests.get(
        'https://www.wanted.co.kr/wdlist/518?country=kr&job_sort=job.latest_order&years=-1&locations=all'
    )
    html = res.text
Example #26
def comm_date(comm_name, dates_array):
    for dates in dates_array:
        client = MongoClient('mongodb://*****:*****@')
        # ...
            text = re.sub('[\·\"\"\%\,\(\)\&]+', ' ', text)
            text = re.sub('[\n\xa0\r]+', ' ', text)

            # tokenization
            token = twitter.nouns(text)  # nouns only

            if token != []:
                tokened_texts.extend(token)

            print(dates, i, '/', len(idate_with_all))

        pickle_name = str(comm_name) + str(dates)
        with open(pickle_name, "wb") as fw:
            pickle.dump(tokened_texts, fw)
        print('저장완료')
Example #27
import pandas as pd
#from konlpy.tag import Okt
from ckonlpy.tag import Twitter
from collections import Counter
from pymongo import MongoClient
import os
import re

print(os.getcwd())

client = MongoClient('mongodb://13.125.221.134:9046')
db = client.mongodb
# get the collection object
# ilbe_coll = db['cleaned_ilbe']
coll = db['realnavernews']

twitter = Twitter()

keyword = '회담'

#cursor = coll.find({'cno' : {'$regex' : keyword}}).limit(1)

cursor = coll.find({}).sort([('_id', 1)])


def news_check():
    for text in cursor:
        yield text


gen = news_check()
Example #28
def twitter():
    cr_name = 'twitter'
    # check the image save directory
    save_path = os.path.join(Main.img_path, cr_name)
    if os.path.isdir(save_path):
        print(cr_name + ' 이미지 경로 확인 완료')
    elif os.path.isdir(Main.img_path):
        os.mkdir(save_path)
    else:
        os.mkdir(Main.img_path)
        os.mkdir(save_path)

    text_save_path = os.path.join(Main.text_path, cr_name)
    if os.path.isdir(text_save_path):
        print(cr_name + ' 텍스트 경로 확인 완료')
    elif os.path.isdir(Main.text_path):
        os.mkdir(text_save_path)
    else:
        os.mkdir(Main.text_path)
        os.mkdir(text_save_path)


    import time
    import nltk
    keyword = Main.text()

    # web driver setup
    chrome = chromedriver.generate_chrome(
        driver_path=Main.driver_path,
        headless=Main.headless,
        download_path=Main.DOWNLOAD_DIR)

    # connect to the web - open Naver images
    print("Twitter 접속중")
    # driver = webdriver.Chrome(executable_path="./chromedriver.exe")
    # driver.implicitly_wait(30)

    url = 'https://twitter.com/search?q={}&src=typed_query'.format(keyword)
    chrome.get(url)
    time.sleep(3)


    # text2 = chrome.find_elements_by_css_selector('#react-root > div > div > div > main > div > div > div > div > div > div:nth-child(2) > div')


    # for i in range(15):
    #     for q in range(3):
    #         body = chrome.find_element_by_css_selector('body')
    #         body.send_keys(Keys.PAGE_DOWN)
    #         time.sleep(1)
    #     for ttt in tqdm(text2):
    #         result.append(ttt.text)
    #     time.sleep(1)
    #
    #
    # result2 = []
    # for i in range(len(result)):
    #     if i % 2 == 0:
    #         result2.append(result[i])
    # print(len(result2))
    #
    # result3 = []
    # for i in range(len(result2)):
    #     result3.append(cleanText(result2[i]))

    body = chrome.find_element_by_css_selector('body')
    text2 = chrome.find_elements_by_css_selector('#react-root > div > div > div.css-1dbjc4n.r-18u37iz.r-13qz1uu.r-417010 > main > div > div > div > div > div > div:nth-child(2) > div > div > section > div')

    result = []  # collect the scraped tweet texts
    for i in range(10):
        for q in range(3):
            body.send_keys(Keys.PAGE_DOWN)
            time.sleep(1)
        for ttt in tqdm(text2):
            result.append(re.sub('\n', '', ttt.text))

    t = Twitter()
    t.add_dictionary(Main.sajun(), 'Noun')

    tokens_ko = []

    for i in range(len(result)):
        tokens_ko.append(t.nouns(result[i]))
    final = []
    for _, q in enumerate(tokens_ko):
        for i in range(len(q)):
            final.insert(-1, q[i])

    ko = nltk.Text(final, name="첫번째")
    data = ko.vocab().most_common(1000)
    date = time.strftime('%Y%m%d', time.localtime(time.time()))
    date2 = time.strftime('%Y%m%d_%H%M', time.localtime(time.time()))


    # save the collected posts to a text file
    file = open(text_save_path+'/twitter{}.txt'.format(date2), 'w', encoding='utf-8')

    for review in result:
        file.write(review + '\n')

    file.close()

    tmp_data = dict(data)

    wordcloud = WordCloud(font_path='/Library/Fonts/NanumMyeongjo.ttf',
                          background_color='white', max_words=230).generate_from_frequencies(tmp_data)
    plt.figure(figsize=(10, 8))
    plt.imshow(wordcloud)
    plt.axis('off'), plt.xticks([]), plt.yticks([])
    plt.tight_layout()
    plt.subplots_adjust(left=0, bottom=0, right=1, top=1, hspace=0, wspace=0)
    plt.savefig(save_path+"/twitter_{}.png".format(date), bbox_inches='tight', dpi=400, pad_inches=0)
Example #29
test_stories, test_questions, test_answers = read_data(TEST_FILE)

print('훈련용 스토리의 개수 :', len(train_stories))
print('훈련용 질문의 개수 :',len(train_questions))
print('훈련용 답변의 개수 :',len(train_answers))
print('테스트용 스토리의 개수 :',len(test_stories))
print('테스트용 질문의 개수 :',len(test_questions))
print('테스트용 답변의 개수 :',len(test_answers))

train_stories[3572]

train_questions[3572]

train_answers[3572]

twitter = Twitter()

print(twitter.morphs('은경이는 화장실로 이동했습니다.'))
print(twitter.morphs('경임이는 정원으로 가버렸습니다.'))
print(twitter.morphs('수종이는 복도로 뛰어갔습니다.'))
print(twitter.morphs('필웅이는 부엌으로 복귀했습니다.'))
print(twitter.morphs('수종이는 사무실로 갔습니다.'))
print(twitter.morphs('은경이는 침실로 갔습니다.'))

twitter.add_dictionary('은경이', 'Noun')
twitter.add_dictionary('경임이', 'Noun')
twitter.add_dictionary('수종이', 'Noun')

print(twitter.morphs('은경이는 화장실로 이동했습니다.'))
print(twitter.morphs('경임이는 정원으로 가버렸습니다.'))
print(twitter.morphs('수종이는 복도로 뛰어갔습니다.'))
Example #30
# morphological analysis
import os
import json
#from konlpy.tag import Okt
from ckonlpy.tag import Twitter

BASE_DIR = os.path.dirname(os.path.abspath(__file__))

file = open(os.path.join(BASE_DIR + '/t05/news1.txt'), 'r', encoding='UTF8')
text = file.read()
file.close()

#okt = Okt()
twitter = Twitter()
twitter.add_dictionary('K리그', 'Noun')

content = twitter.morphs(text)

num = 1
voca_dict = dict()
for word in content:
    voca_dict[num] = word
    num = num + 1

with open(os.path.join(BASE_DIR + '/t06', 'vocab.json'),
          'w+',
          encoding='UTF-8-sig') as json_file:
    json.dump(voca_dict, json_file, ensure_ascii=False)