Example #1
def tokenize_okt_noscreen(df):
    okt = Twitter()
    okt.add_dictionary(call_userword(), 'Noun')
    stopwords = load_wordset('./tokenizer/korean_stopword.txt')
    #stopwords = stopwords | load_wordset('./tokenizer/korean_screen.txt')
    stopwords = list(stopwords)
    df['content_token'] = df.progress_apply(lambda x: [t[0] for t in okt.pos(
        x['content'], stem=True) if t[1] in ['Noun', 'Verb', 'Adjective'] and t[0] not in stopwords and len(t[0]) != 1], axis=1)
    df['title_token'] = df.progress_apply(lambda x: [t[0] for t in okt.pos(
        x['title'], stem=True) if t[1] in ['Noun', 'Verb', 'Adjective'] and t[0] not in stopwords and len(t[0]) != 1], axis=1)
    return df
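The helpers call_userword and load_wordset above are project-specific. A minimal self-contained sketch of the same pattern (hypothetical dictionary words and stopwords, with the same stem=True call used in this example):

from ckonlpy.tag import Twitter

okt = Twitter()
okt.add_dictionary(['가상통화', '거리두기'], 'Noun')   # hypothetical user nouns
stopwords = {'기자', '무단전재'}                        # hypothetical stopword set

def simple_tokenize(text):
    # Keep nouns/verbs/adjectives, drop stopwords and single-character tokens
    return [w for w, t in okt.pos(text, stem=True)
            if t in ('Noun', 'Verb', 'Adjective') and w not in stopwords and len(w) > 1]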
Example #2
def tokenize_okt_noscreen(df):
    okt = Twitter()
    okt.add_dictionary(call_userword(), 'Noun')
    stopwords = load_wordset('./tokenizer/korean_stopword.txt')
    #stopwords = stopwords | load_wordset('./tokenizer/korean_screen.txt')
    stopwords = list(stopwords)
    df['content_token'] = df.progress_apply(
        lambda x: text_tokenize(x['content'], okt, stopwords), axis=1)
    df['title_token'] = df.progress_apply(
        lambda x: text_tokenize(x['title'], okt, stopwords), axis=1)
    return df
Example #3
def Tokenizer(data):

    import pandas as pd
    from ckonlpy.tag import Twitter

    twitter = Twitter()

    # Add user dictionary entries (one noun per line in the file)
    txt = pd.read_csv('사용자 사전.txt', sep='\n')
    txt = txt['<사용자 사전>']
    for line in txt:
        twitter.add_dictionary(line, 'Noun')

    # Load data
    new_hashtags = data.hashtags.copy()

    # Tokenization
    for i in range(len(new_hashtags)):
        new_hashtags[i] = ' '.join(new_hashtags[i])

    tokenized = []

    for sentence in new_hashtags:
        tokens = twitter.morphs(sentence)
        tokenized.append(tokens)

    # Remove consecutive duplicates
    new_tokenized = []

    for x in range(len(tokenized)):
        temp = []

        for y in range(len(tokenized[x])):
            # keep a token unless it repeats the previous one
            if y == 0 or tokenized[x][y] != tokenized[x][y - 1]:
                temp.append(tokenized[x][y])

        new_tokenized.append(temp)

    return new_tokenized
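The duplicate-removal step above can also be written with itertools.groupby, which collapses runs of identical adjacent tokens; a small equivalent sketch:

from itertools import groupby

def dedupe_consecutive(tokens):
    # Collapse runs of identical adjacent tokens into one occurrence
    return [tok for tok, _ in groupby(tokens)]

# dedupe_consecutive(['맛집', '맛집', '서울', '맛집']) -> ['맛집', '서울', '맛집']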
Example #4
class RawTagger:
    def __init__(self, textIter, tagger=None):
        # Preprocessing 1 (dictionary supplement): add words the tagger fails to recognize on its own
        with open('형태소 보완.txt') as f:
            a = f.read().split('\n')
        if tagger:
            self.tagger = tagger
        else:
            self.tagger = Twitter()
            self.tagger.add_dictionary(a, 'Noun')
        if type(textIter) == str:
            self.textIter = textIter.split('\n')
        else:
            self.textIter = textIter
        self.rgxSplitter = re.compile('([.!?:](?:["\']|(?![0-9])))')

    def __iter__(self):
        for line in self.textIter:
            ch = self.rgxSplitter.split(line)
            for s in map(lambda a, b: a + b, ch[::2], ch[1::2]):
                if not s: continue
                yield self.tagger.pos(s)
Example #5
# Morphological analysis
import os
import json
#from konlpy.tag import Okt
from ckonlpy.tag import Twitter

BASE_DIR = os.path.dirname(os.path.abspath(__file__))

file = open(os.path.join(BASE_DIR + '/t05/news1.txt'), 'r', encoding='UTF8')
text = file.read()
file.close()

#okt = Okt()
twitter = Twitter()
twitter.add_dictionary('K리그', 'Noun')

content = twitter.morphs(text)

num = 1
voca_dict = dict()
for word in content:
    voca_dict[num] = word
    num = num + 1

with open(os.path.join(BASE_DIR + '/t06', 'vocab.json'),
          'w+',
          encoding='UTF-8-sig') as json_file:
    json.dump(voca_dict, json_file, ensure_ascii=False)
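The manual counter loop above can also be written with enumerate; a minimal equivalent sketch:

# Map 1-based positions to tokens, same result as the num counter above
voca_dict = {i: word for i, word in enumerate(content, start=1)}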
Example #6
    def twitter(self):
        cr_name = 'twitter'
        # Check that the image save directory exists
        save_path = os.path.join(self.img_path, cr_name)
        if os.path.isdir(save_path):
            print(cr_name + ' 이미지 경로 확인 완료')
        elif os.path.isdir(self.img_path):
            os.mkdir(save_path)
        else:
            os.mkdir(self.img_path)
            os.mkdir(save_path)

        text_save_path = os.path.join(self.text_path, cr_name)
        if os.path.isdir(text_save_path):
            print(cr_name + ' 텍스트 경로 확인 완료')
        elif os.path.isdir(self.text_path):
            os.mkdir(text_save_path)
        else:
            os.mkdir(self.text_path)
            os.mkdir(text_save_path)
        keyword = self.scan_name

        # if self.platform == 'linux':
        #     print('System platform : Linux')
        #     self.driver_path = './static/lib/webDriver/chromedriver_lnx'
        #     from pyvirtualdisplay import Display
        #     self.display = Display(visible=0, size=(800, 600))
        #     self.display.start()
        # Browser setup
        if self.platform == 'linux':

            display = Display(visible=0, size=(1024, 768))
            display.start()

            options = Options()
            options.binary_location = "/usr/bin/google-chrome"

            # chrome_options = webdriver.ChromeOptions()
            options.headless = True
            options.add_argument('--headless')
            options.add_argument('--no-sandbox')
            options.add_argument('--disable-gpu')
            options.add_argument('--disable-dev-shm-usage')

            chrome = webdriver.Chrome(executable_path=self.driver_path,
                                      options=options)
        else:
            chrome = self.generate_chrome(driver_path=self.driver_path,
                                          headless=self.headless,
                                          download_path=self.DOWNLOAD_DIR)

        # Open the site - Twitter search
        print("Twitter 접속중")
        # driver = webdriver.Chrome(executable_path="./chromedriver.exe")
        # driver.implicitly_wait(30)

        url = 'https://twitter.com/search?q={}&src=typed_query'.format(keyword)
        chrome.get(url)
        chrome.implicitly_wait(30)

        body = chrome.find_element_by_css_selector('body')
        text2 = chrome.find_elements_by_css_selector(
            '#react-root > div > div > div.css-1dbjc4n.r-18u37iz.r-13qz1uu.r-417010 > main > div > div > div > div > div > div:nth-child(2) > div > div > section > div'
        )
        result = []

        for i in range(10):
            for q in range(3):
                body.send_keys(Keys.PAGE_DOWN)
                time.sleep(1)
            for ttt in text2:
                result.append(re.sub('\n', '', ttt.text))
        print(result)

        time.sleep(1)
        if self.platform == 'linux':
            chrome.close()
            display.stop()

        t = Twitter()

        t.add_dictionary(self.sajun(), 'Noun')
        print('단어사전 추출완료')
        tokens_ko = []

        for i in range(len(result)):
            tokens_ko.append(t.nouns(result[i]))

        final = []
        for _, q in enumerate(tokens_ko):
            for i in range(len(q)):
                final.insert(-1, q[i])
        print('형태소분석 완료!')
        ko = nltk.Text(final, name="첫번째")
        data = ko.vocab().most_common(1000)

        # Save the collected posts to a text file
        file = open(text_save_path + '/twitter{}.txt'.format(self.date),
                    'w',
                    encoding='utf-8')

        for review in result:
            file.write(review + '\n')

        file.close()

        tmp_data = dict(data)

        wordcloud = WordCloud(
            font_path=self.fontPath, background_color='white',
            max_words=230).generate_from_frequencies(tmp_data)
        plt.figure(figsize=(10, 8))
        plt.imshow(wordcloud)
        plt.axis('off'), plt.xticks([]), plt.yticks([])
        plt.tight_layout()
        plt.subplots_adjust(left=0,
                            bottom=0,
                            right=1,
                            top=1,
                            hspace=0,
                            wspace=0)
        plt.savefig(save_path + "/twitter_{}.png".format(self.date),
                    bbox_inches='tight',
                    dpi=400,
                    pad_inches=0)
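The frequency-counting and word-cloud part of this example is independent of the Selenium crawling; a minimal sketch of just that pipeline, with a hypothetical text list and font path:

import nltk
import matplotlib.pyplot as plt
from ckonlpy.tag import Twitter
from wordcloud import WordCloud

tagger = Twitter()
texts = ['분위기 좋은 카페 추천', '카페 분위기 최고']        # hypothetical crawled texts
nouns = [n for t in texts for n in tagger.nouns(t)]
freq = dict(nltk.Text(nouns).vocab().most_common(1000))

wc = WordCloud(font_path='NanumGothic.ttf',                  # hypothetical font path
               background_color='white',
               max_words=230).generate_from_frequencies(freq)
plt.imshow(wc)
plt.axis('off')
plt.savefig('twitter_wordcloud.png', bbox_inches='tight', dpi=400, pad_inches=0)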
Example #7
from konlpy.tag import Okt
from ckonlpy.tag import Twitter, Postprocessor
from ckonlpy.utils import load_wordset, load_ngram

# nltk.download('punkt')
# nltk.download('stopwords')
okt = Okt()
twitter = Twitter()
stopwordsKR = load_wordset('cleansing_data/korean_stopwords.txt',
                           encoding='ANSI')
customStopwordsEN = load_wordset('cleansing_data/english_stopwords.txt',
                                 encoding='ANSI')
stopwordsEN = customStopwordsEN.union(set(stopwords.words('english')))
ngrams = load_ngram('cleansing_data/korean_ngram.txt')
userdicts = load_wordset('cleansing_data/korean_user_dict.txt')
twitter.add_dictionary(list(userdicts), 'Noun', force=True)


def getJobGroups():
    res = requests.get(
        'https://www.wanted.co.kr/wdlist/518?country=kr&job_sort=job.latest_order&years=-1&locations=all'
    )
    html = res.text
    soup = BeautifulSoup(html, "html.parser")

    jobGroups = []
    for elements in soup.find("div",
                              class_="_2h5Qtv_8mK2LOH-yR3FTRs").find_all("li"):
        href = elements.find("a")["href"]
        span = elements.find("span")
        jobGroup = {
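Example #7 is cut off before the loaded stopwords and n-grams are used; in ckonlpy they are typically combined through a Postprocessor wrapped around the customized tagger. A sketch assuming the same file layout as above, with a hypothetical user noun and input sentence:

from ckonlpy.tag import Twitter, Postprocessor
from ckonlpy.utils import load_wordset, load_ngram

twitter = Twitter()
twitter.add_dictionary(['데이터분석'], 'Noun')                     # hypothetical user noun

stopwords = load_wordset('cleansing_data/korean_stopwords.txt')    # paths as in Example #7
ngrams = load_ngram('cleansing_data/korean_ngram.txt')

postprocessor = Postprocessor(base_tagger=twitter,
                              stopwords=stopwords,
                              passtags={'Noun'},
                              ngrams=ngrams)
print(postprocessor.pos('채용 공고에서 데이터분석 직무를 찾아보자'))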
Example #8
    def token(self, title, ccontent, creplies):
        memory = psutil.Process(os.getpid())

        T_OR_title = []
        T_title = []
        T_OR_ccontent = []
        T_ccontent = []
        T_OR_creplies = []
        T_creplies = []

        twitter = Okt()  # initialize the tagger so a custom dictionary can be used
        twitter.add_dictionary('백래시', 'Noun')
        twitter.add_dictionary('문재앙', 'Noun')

        #### Tokenize titles
        #print('1')
        for i in range(len(title)):

            a = twitter.pos(title[i])
            b = []
            #print('title[i]',i,title[i])
            for j in range(len(a)):
                if a[j][1] != 'Punctuation':  # drop tuples tagged 'Punctuation'
                    b.append(a[j])
                    #print('3',j)
            T_OR_title.append(b)
            T_title.append(twitter.morphs(title[i]))

            #### Tokenize ccontent
            try:
                c = twitter.pos(str(ccontent[i]))
                d = []
                # print('ccontent[i]',i, ccontent[i])
                for w in range(len(c)):
                    if c[w][1] != 'Punctuation':  # drop tuples tagged 'Punctuation'
                        d.append(c[w])
                        #print('4',w)
                T_OR_ccontent.append(d)
                T_ccontent.append(twitter.morphs(str(ccontent[i])))

            except RuntimeError as e:
                T_OR_ccontent.append('')
                T_ccontent.append(twitter.morphs(''))

            ### Tokenize replies
            #print('creplies[i]',i,creplies[i])

            if type(creplies[i]) == str:  # tokenize a reply given as a single string
                a = [creplies[i]]  # wrap the string in a list
                e = twitter.pos(str(a))
                f = []
                for u in range(len(e)):
                    # drop tokens tagged 'Punctuation' or 'KoreanParticle'
                    if e[u][1] not in ('Punctuation', 'KoreanParticle'):
                        f.append(e[u])
                    #print('5',u)
                T_OR_creplies.append(f)
                T_creplies.append(twitter.morphs(str(a)))

            else:
                temp = []
                temp2 = []

                for n in range(len(creplies[i])):  ### replies returned as a list
                    x = []  # reset per reply so tokens do not accumulate across replies
                    h = twitter.pos(creplies[i][n])
                    #print('6',n)

                    for z in range(len(h)):
                        # drop tokens tagged 'Punctuation' or 'KoreanParticle'
                        if h[z][1] not in ('Punctuation', 'KoreanParticle'):
                            x.append(h[z])
                    # print('7',z)
                    # print('8',)
                    #print('h',z,h)

                    temp.append(x)
                    temp2.append(twitter.morphs(creplies[i][n]))

                T_OR_creplies.append(temp)
                T_creplies.append(temp2)

        return T_OR_title, T_title, T_OR_ccontent, T_ccontent, T_OR_creplies, T_creplies
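The tag-filtering loops in Example #8 all implement the same idea: drop tokens whose POS tag is blacklisted. A compact helper sketch that captures that intent (tagger is any Okt/Twitter-style tagger with a pos method):

EXCLUDE_TAGS = {'Punctuation', 'KoreanParticle'}

def filter_pos(tagger, text):
    # Return (word, tag) pairs whose tag is not in the blacklist
    return [(w, t) for w, t in tagger.pos(str(text)) if t not in EXCLUDE_TAGS]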
Example #9
from ckonlpy.tag import Twitter
from konlpy.tag import Hannanum, Kkma, Komoran, Okt
from eunjeon import Mecab

test_text = "확진자와 접촉자는 다중이용시설 이용을 삼가하고, 사회적 거리두기 운동에 동참하며, 진료소와 마스크 착용을 자제해주시기 바랍니다."

# Customized Konlpy
twitter = Twitter()
twitter.add_dictionary(["확진자", "접촉자", "다중이용시설", "사회적", "거리두기", "진료소"], "Noun")
twitter.add_dictionary(["드립니다", "하시기", "해주시고", "해주시기", "지켜주십시오"], "Verb")
print(f"Customized Konlpy : {twitter.nouns(test_text)}")

# Hannanum
hannanum = Hannanum()
print(f"Hannanum : {hannanum.nouns(test_text)}")

# Kkma
kkma = Kkma()
print(f"Kkma : {kkma.nouns(test_text)}")

# Komoran
komoran = Komoran()
print(f"Komoran : {komoran.nouns(test_text)}")

# Okt
okt = Okt()
print(f"Okt : {okt.nouns(test_text)}")

# Mecab
mecab = Mecab()
print(f"Mecab : {mecab.nouns(test_text)}")
Example #10
with open('bitcoin_news.txt', 'r', encoding='utf8') as f:
    content = f.read()

# Remove unnecessary special characters and symbols from the text
filtered_content = content.replace('.', '').replace(',', '').replace(
    "'", "").replace('·', ' ').replace('=', '').replace('"', '')
filtered_content = re.sub(r'▶.*', '', filtered_content)
filtered_content = re.sub(r'[^\.\?\!\w\d\s]', '', filtered_content)
print(filtered_content)

from ckonlpy.tag import Twitter
twitter = Twitter()

# Register unrecognized words such as 가상통화 and 아시아경제 in the dictionary
twitter.add_dictionary('가상통화', 'Noun')
twitter.add_dictionary('아시아경제', 'Noun')
twitter.add_dictionary('한동안', 'Noun')
twitter.add_dictionary('블리클리', 'Noun')
twitter.add_dictionary('공동창립자', 'Noun')
twitter.add_dictionary('부크바', 'Noun')

# Morphological analysis
twitter_morphs = twitter.pos(filtered_content)

# Extract nouns only
Noun_words = []
for word, pos in twitter_morphs:
    if pos == 'Noun':
        Noun_words.append(word)
print(Noun_words)
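A frequency count is the usual next step after extracting Noun_words; a short sketch using collections.Counter, continuing from the list built above:

from collections import Counter

noun_counts = Counter(Noun_words)      # Noun_words built above
print(noun_counts.most_common(10))     # ten most frequent nouns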
Example #11
from konlpy.tag import Okt
from ckonlpy.tag import Twitter, Postprocessor
from ckonlpy.utils import load_replace_wordpair
replace = load_replace_wordpair('postprocess/replace.txt')

from ckonlpy.utils import load_ngram
ngrams = load_ngram('postprocess/ngrams.txt')

okt = Okt()
twitter = Twitter()

new_nouns = []

with open('preprocess/dictionary.txt', encoding='utf8') as fd:
    for line in fd:
        new_nouns.append(line.strip('\n'))

twitter.add_dictionary(new_nouns, 'Noun')

passtags = {'Noun'}

postprocessor = Postprocessor(
    base_tagger=twitter,
    stopwords=stopwords,
    #passwords = passwords,
    passtags=passtags,
    replace=replace,
    ngrams=ngrams)

token = []
nouns = []

Example #12
    # Register brand names as nouns

    brand_set = np.array(
        [re.sub('[^가-힣]', '', x) for x in product_DB.brandname.unique()])
    brand_set = pd.Series([x for x in brand_set if x != ''])

    # Add domain-specific terms
    brand_set = brand_set.append(pd.Series([
        '비씨데이션', '파데', '다크닝', '지속력', '밀착력', '피부톤', '커버력', '쿨톤', '웜톤', '결보정',
        '코끼임'
    ]),
                                 ignore_index=True)

    # Load the customized Twitter tokenizer and register the dictionary
    twitter = Twitter()
    twitter.add_dictionary(brand_set, 'Noun')

    # Load the original (unmodified) tokenizer
    ori_twitter = original_Twitter()
    #### emoji2text
    preprocessed_text = emoji2text(review_DB.text)
    #### uppercase & product mapping
    preprocessed_text = pd.Series(preprocessed_text).apply(lambda x: x.upper())

    bow = preprocessed_text.apply(lambda x: '◈'.join([
        token + '╹' + pos if pos in ['Noun'] else ori_twitter.
        pos(token, stem=True, norm=True)[0][0] + '╹' + ori_twitter.pos(
            token, stem=True, norm=True)[0][1] for token, pos in twitter.pos(x)
    ]))
    ### token normalizing
    p_normal = {
Example #13
class Social_analysis():

    non_bmp_map = dict.fromkeys(range(0x10000, sys.maxunicode + 1), 0xfffd)

    def __init__(self):
        self.twitter = Twitter()

    def DB_to_table(self, DBname='intake', keyword='intake'):
        import pymssql
        import pandas.io.sql as pdsql
        import pandas as pd
        self.query = \
        """
        SELECT user_id, created_at, main_text, hashtags, comments, likes, current_url FROM instaPost WHERE keyword = '{}'
        """.format(keyword)
        conn = pymssql.connect(
            "intakedb.c63elkxbiwfc.us-east-2.rds.amazonaws.com:1433", "gh",
            "ghintake", DBname)
        df = pdsql.read_sql_query(self.query, con=conn)
        # df['main_text'] = df.main_text.apply(lambda x: x.replace('#',' ').translate(self.non_bmp_map))
        # df['created_at'] = df.created_at.apply(lambda x: x.strftime("%Y-%m-%d %H:%M:%S"))
        conn.close()
        self.raw_data = df.as_matrix()

    def pickle_to_table(self, filename):
        with open(filename, 'rb') as f:
            data = pickle.load(f)
        data = data[1:]
        for idx, i in enumerate(data):
            data[idx][2] = i[2].replace('#', ' ').translate(self.non_bmp_map)
            data[idx][3] = '/'.join(i[3])
            data[idx][4] = '/'.join(i[4])
        self.raw_data = np.array(data)

    def hashtags_split(self, hashtags):
        hashtags_split = []
        for i in hashtags:
            hashtags_split.append(i.split('/'))

        hashtags_list = []

        for i in hashtags_split:
            temp = []
            for j in i:
                if self.isHangul(j):
                    t_hashtags = j.translate(self.non_bmp_map)
                    temp.append(t_hashtags)
            hashtags_list.append(temp)
        self.hashtags_list = hashtags_list

        return hashtags_list

    def add_keyword_dic(self, keyword_list, tag='Noun'):
        for i in keyword_list:
            if type(i) == tuple:
                self.twitter.add_dictionary(i[0], i[1])
            else:
                self.twitter.add_dictionary(i, tag)

    def morph_pos(self, text_list, exception_list=['맛', '밥', '물', '몸']):

        morph_list = []
        noun_list = []
        adj_list = []
        verb_list = []
        nav_list = []

        for j in text_list:
            parsed = self.twitter.pos(j)
            temp = []
            n_temp = []
            adj_temp = []
            verb_temp = []
            nav_temp = []

            for i in parsed:
                if self.isHangul(i[0]):
                    if ((len(i[0]) > 1) or (i[0] in exception_list)):
                        temp.append(i)
                        if i[1] == 'Noun':
                            n_temp.append(i[0])
                            nav_temp.append(i[0])
                        elif i[1] == 'Verb':
                            verb_temp.append(i[0])
                            nav_temp.append(i[0])
                        elif i[1] == 'Adjective':
                            adj_temp.append(i[0])
                            nav_temp.append(i[0])
                    else:
                        print('{} 제외'.format(i[0]))
                else:
                    print('{} 한글이 아님.'.format(i[0]))

            morph_list.append(temp)
            noun_list.append(n_temp)
            adj_list.append(adj_temp)
            verb_list.append(verb_temp)
            nav_list.append(nav_temp)

        return morph_list, nav_list, noun_list, adj_list, verb_list

    def pos_extractor(self, parsed, exception_list=['맛', '밥', '물', '몸']):

        noun_list = []
        adj_list = []
        verb_list = []
        nav_list = []
        for i in parsed:
            n_temp = []
            adj_temp = []
            verb_temp = []
            nav_temp = []
            if self.isHangul(i[0]):
                if ((len(i[0]) > 1) or (i[0] in exception_list)):
                    if i[1] == 'Noun':
                        n_temp.append(i[0])
                        nav_temp.append(i[0])
                    elif i[1] == 'Verb':
                        verb_temp.append(i[0])
                        nav_temp.append(i[0])
                    elif i[1] == 'Adjective':
                        adj_temp.append(i[0])
                        nav_temp.append(i[0])
                else:
                    print('{} 제외'.format(i[0]))
            else:
                print('{} 한글이 아님.'.format(i[0]))

            noun_list.append(n_temp)
            adj_list.append(adj_temp)
            verb_list.append(verb_temp)
            nav_list.append(nav_temp)

        return nav_list, noun_list, adj_list, verb_list

    def merge_list(self, tokenized_list):
        return [j for i in tokenized_list for j in i]

    def join_list(self, tokenized_list):
        joined_list = []
        for idx, i in enumerate(tokenized_list):
            joined_list.append(" ".join(i))
        return joined_list

    def split_list(self, untokenized_list):
        hashtag_splited = []
        for idx, i in enumerate(untokenized_list):
            hashtag_splited.append(i.split('/'))
        return hashtag_splited

    def join_underbar(self, morph_list):

        all_list = []
        post_list = []
        for i in morph_list:
            for j in i:
                post_list.append(j[0] + '_' + j[1])
            all_list.append([(' , ').join(post_list)])
            post_list = []
        all_list = np.array(all_list)

        return all_list

    def word_substitute(self, dataset, sublist):
        dataset = copy.deepcopy(dataset)
        sub_book = dict()
        for i in sublist:
            for j in i['sub_words']:
                sub_book[j] = i['main']
        gc.collect()
        for n, i in enumerate(dataset):
            dataset[n] = [sub_book.get(item, item) for item in i]

        del sub_book
        gc.collect()

        return dataset

    def word_delete(self, dataset, del_list):
        dataset = copy.deepcopy(dataset)

        for n, line in enumerate(dataset):
            dataset[n] = [i for i in line if i not in del_list]

        return dataset

    def isHangul(self, text):
        encText = text
        hanCount = len(re.findall(u'[\u3130-\u318F\uAC00-\uD7A3]+', encText))
        return hanCount > 0

    def convert_list(self, *tokenized_list):
        input_length = len(tokenized_list)
        lists = [[] for i in range(input_length)]

        for idx, li in enumerate(tokenized_list):
            for j in li:
                lists[idx].append(['/'.join(j)])

        converted_array = np.array(lists[0])
        for idx in range(input_length):
            try:
                converted_array = np.concatenate(
                    (converted_array, lists[idx + 1]), axis=1)
            except Exception as e:
                print(e, '끝')

        return converted_array

    def make_df(self, converted_array):
        df = pd.DataFrame(np.hstack(
            (self.raw_data[:, :3], converted_array, self.raw_data[:, 3:])),
                          index=None)
        return df

    # True if at least one keyword in the list appears in the text
    def word_check_or(self, text, keywords):
        return any(word in text for word in keywords)

    # True if every keyword in the list appears in the text
    def word_check_and(self, text, keywords):
        return all(word in text for word in keywords)

    def word_check(self,
                   method='and',
                   keywords=[],
                   df=None,
                   column_name=None,
                   filter_TF=True):
        if method == 'and':
            df['flag'] = df[column_name].apply(
                lambda x: self.word_check_and(x, keywords))
            return df[df.flag == filter_TF]

        if method == 'or':
            df['flag'] = df[column_name].apply(
                lambda x: self.word_check_or(x, keywords))
            return df[df.flag == filter_TF]
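A brief usage sketch for the keyword filters at the end of Example #13, assuming the Social_analysis class above is in scope together with its imports, and using a hypothetical DataFrame:

import pandas as pd

sa = Social_analysis()
df = pd.DataFrame({'joined': ['단백질 쉐이크 추천', '운동 후 간식 후기']})
# Keep rows containing at least one of the keywords
hits = sa.word_check(method='or', keywords=['단백질', '쉐이크'],
                     df=df, column_name='joined', filter_TF=True)
print(hits)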
Example #14
    def Daum(self):
        cr_name = 'daum'
        # Check that the image save directory exists
        save_path = os.path.join(self.img_path, cr_name)
        if os.path.isdir(save_path):
            print(cr_name + ' 이미지 경로 확인 완료')
        elif os.path.isdir(self.img_path):
            os.mkdir(save_path)
        else:
            os.mkdir(self.img_path)
            os.mkdir(save_path)

        text_save_path = os.path.join(self.text_path, cr_name)
        if os.path.isdir(text_save_path):
            print(cr_name + ' 텍스트 경로 확인 완료')
        elif os.path.isdir(self.text_path):
            os.mkdir(text_save_path)
        else:
            os.mkdir(self.text_path)
            os.mkdir(text_save_path)

        # Scrape the Daum news ranking headlines
        http = []
        print('Daum 접속 중')
        httz = 'https://media.daum.net/ranking/popular/?regDate={}'.format(
            self.date)
        res = requests.get(httz)
        soup = BeautifulSoup(res.content, 'html.parser')
        body = soup.select('#mArticle > div.rank_news > ul.list_news2')
        body = body[0].find_all('a')

        for i in range(len(body)):
            t = body[i].get('href')
            http.append(t)

        # Remove duplicate links
        http = list(set(http))

        files = pd.DataFrame()
        for i in range(len(http)):
            res = requests.get(http[i])
            soup = BeautifulSoup(res.content, 'html.parser')
            body = soup.select('.article_view')[0]

            files = files.append(
                pd.DataFrame(
                    {
                        'Title':
                        soup.find('div', attrs={
                            'class': 'head_view'
                        }).h3.text,
                        'Contents':
                        " ".join(p.get_text() for p in body.find_all('p')),
                        'link':
                        http[i]
                    },
                    index=[i]))
        text2 = files.Contents

        # Save the articles to a CSV file
        files.to_csv(text_save_path + '/다음뉴스종합_{}.csv'.format(self.date),
                     index=False,
                     encoding='utf-8')
        print('다음 텍스트 저장완료!')

        t = Twitter()
        t.add_dictionary(self.sajun(), 'Noun')
        print('형태소 사전 업로드 완료!!')

        tokens_ko = []

        for i in range(len(text2)):
            tokens_ko.append(t.nouns(text2[i]))

        final = []
        for _, q in enumerate(tokens_ko):
            for i in range(len(q)):
                final.insert(-1, q[i])

        ko = nltk.Text(final, name="첫번째")
        data = ko.vocab().most_common(1000)
        print('nltk 완료')

        # Daum news scrapes many pages, so single-character tokens are dropped here.
        # Add any words you need to the dictionary and re-run the tagging.
        data_1 = []
        for i in range(len(data)):
            if len(data[i][0]) >= 2:
                data_1.append(data[i])

        tmp_data = dict(data_1)
        print('wordcloud 실행')
        wordcloud = WordCloud(
            font_path=self.fontPath, background_color='white',
            max_words=230).generate_from_frequencies(tmp_data)
        print('wordcloud 실행!!!')
        plt.figure(figsize=(10, 8))
        plt.imshow(wordcloud)
        plt.axis('off'), plt.xticks([]), plt.yticks([])
        plt.tight_layout()
        plt.subplots_adjust(left=0,
                            bottom=0,
                            right=1,
                            top=1,
                            hspace=0,
                            wspace=0)
        plt.savefig(save_path + "/daum_{}.png".format(self.date),
                    bbox_inches='tight',
                    dpi=400,
                    pad_inches=0)
Example #15
    def Naver(self):
        cr_name = 'naver'
        # Check that the image save directory exists
        save_path = os.path.join(self.img_path, cr_name)
        if os.path.isdir(save_path):
            print(cr_name + ' 이미지 경로 확인 완료')
        elif os.path.isdir(self.img_path):
            os.mkdir(save_path)
        else:
            os.mkdir(self.img_path)
            os.mkdir(save_path)

        text_save_path = os.path.join(self.text_path, cr_name)
        if os.path.isdir(text_save_path):
            print(cr_name + ' 텍스트 경로 확인 완료')
        elif os.path.isdir(self.text_path):
            os.mkdir(text_save_path)
        else:
            os.mkdir(self.text_path)
            os.mkdir(text_save_path)

        # Fetch the Naver ranking headlines

        result = []
        res = []

        # Browser setup
        if self.platform == 'linux':

            display = Display(visible=0, size=(800, 600))
            display.start()

            options = Options()
            options.binary_location = "/usr/bin/google-chrome"

            # chrome_options = webdriver.ChromeOptions()
            options.headless = True
            options.add_argument('--headless')
            options.add_argument('--no-sandbox')
            options.add_argument('--disable-gpu')
            options.add_argument('--disable-dev-shm-usage')

            chrome = webdriver.Chrome(executable_path=self.driver_path,
                                      options=options)
        else:
            chrome = self.generate_chrome(driver_path=self.driver_path,
                                          headless=self.headless,
                                          download_path=self.DOWNLOAD_DIR)

        # Open the site - Naver news ranking
        print("Naver 접속중")
        # driver = webdriver.Chrome(executable_path="./chromedriver.exe")
        # driver.implicitly_wait(30)

        url = 'https://news.naver.com/main/ranking/popularDay.nhn?rankingType=popular_day&date={}'.format(
            self.date)
        chrome.get(url)
        chrome.implicitly_wait(30)

        # scroll(3)
        for sun in range(4, 10):
            pr = chrome.find_elements_by_xpath(
                '//*[@id="wrap"]/table/tbody/tr/td[2]/div/div[{}]'.format(sun))
            for p in pr:
                result.append(p.find_elements_by_tag_name('a'))
            # print(result)

            for i, q in enumerate(result):
                for e in q:
                    res.append(e.get_attribute('href'))
        http = list(set(res))
        len(http)
        https = []

        for idx in range(len(http)):
            if http[idx].find('popularDay') >= 0:
                continue
            else:
                https.append(http[idx])

        files = pd.DataFrame()

        if self.platform == 'linux':
            chrome.close()
            display.stop()

        for i in range(len(https)):
            res = requests.get(https[i])
            soup = BeautifulSoup(res.content, 'html.parser')
            body = soup.select('._article_body_contents')
            files = files.append(
                pd.DataFrame(
                    {
                        'Title':
                        soup.find('div', attrs={
                            'class': 'article_info'
                        }).h3.text,
                        'Contents':
                        re.sub(
                            '   ', '',
                            re.sub(
                                '    ', '',
                                re.sub(
                                    '\t', '',
                                    self.cleanText(body[0].text)[
                                        (self.cleanText(body[0].text)
                                         ).find('{}') + 2:]))),
                        'link':
                        https[i]
                    },
                    index=[i]))

        text2 = files.Contents
        # Save to a CSV file
        files.to_csv(text_save_path + '/네이버종합뉴스_{}.csv'.format(self.date),
                     index=False,
                     encoding='utf-8')

        # -------------------------------------

        # Build the custom dictionary
        t = Twitter()
        t.add_dictionary(self.sajun(), 'Noun')

        tokens_ko = []

        for i in range(len(text2)):
            tokens_ko.append(t.nouns(text2[i]))

        final = []
        for _, q in enumerate(tokens_ko):
            for i in range(len(q)):
                final.insert(-1, q[i])

        ko = nltk.Text(final, name="첫번째")
        data = ko.vocab().most_common(1000)

        data_1 = []
        for i in range(len(data)):
            if len(data[i][0]) >= 2:
                data_1.append(data[i])

        tmp_data = dict(data_1)

        wordcloud = WordCloud(
            font_path=self.fontPath, background_color='white',
            max_words=230).generate_from_frequencies(tmp_data)
        plt.figure(figsize=(10, 8))
        plt.imshow(wordcloud)
        plt.axis('off'), plt.xticks([]), plt.yticks([])
        plt.tight_layout()
        plt.subplots_adjust(left=0,
                            bottom=0,
                            right=1,
                            top=1,
                            hspace=0,
                            wspace=0)
        plt.savefig(save_path + "/naver_{}.png".format(self.date),
                    bbox_inches='tight',
                    dpi=400,
                    pad_inches=0)
Example #16
class PreprocessingText:
    def help(self):
        print("******PreprocessingText******")
        print("1) make_content_re(df['컬럼이름'](Series)) : 입력받은 열을 전처리 후 시리즈로 반환")
        print("2) add_noun_dict('list') : 명사 사전에 단어 추가")
        print("3) add_stopwords('list') : 불용어 사전에 단어 추가")
        print("4) tokenize(df['컬럼이름'](Series)) : 입력받은 열을 토큰화한 후 시리즈로 반환")
        print(
            "5) change_similar_words(토큰화된 문서(Series), 유의어 사전(dictionary)) : 유의어 사전을 기반으로 문서 내 유의어를 대표어로 변환하고, 변환된 문서를 시리즈로 반환한다."
        )
        print("*****************************")

    def __init__(self):
        self.reg_reporter = re.compile(r'[가-힣]+\s[가-힣]*기자')  # reporter bylines
        self.reg_email = re.compile(
            r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$')  # email addresses
        self.reg_eng = re.compile(r'[a-z]+')  # lowercase letters (for email removal); uppercase is kept
        self.reg_chi = re.compile(r"[\u4e00-\u9fff]+")  # Chinese characters
        self.reg_sc = re.compile(
            r"·|…|◆+|◇+|▶+|●+|▲+|“|”|‘|’|\"|\'|\(|\)|\W+")  # special characters
        self.reg_date = re.compile(
            r'\d+일|\d+월|\d+년|\d+시|\d+분|\(현지시간\)|\(현지시각\)|\d+')  # dates, times, numbers

        self.twitter_obj = Twitter()
        self.stopwords = []
        self.noun_list = []

    def preprocessing(self, doc):
        tmp = re.sub(self.reg_reporter, '', doc)
        tmp = re.sub(self.reg_email, '', tmp)
        tmp = re.sub(self.reg_eng, '', tmp)
        tmp = re.sub(self.reg_chi, '', tmp)
        tmp = re.sub(self.reg_sc, ' ', tmp)
        tmp = re.sub(self.reg_date, '', tmp)
        return tmp

    def make_content_re(self, data):
        pp_data = data.apply(self.preprocessing)
        return pp_data

    def add_noun_dict(self, noun_list):
        self.twitter_obj.add_dictionary(noun_list, 'Noun')
        self.noun_list.extend(noun_list)
        print("추가한 명사")
        print(noun_list)

    def add_stopwords(self, stopword_list):
        self.stopwords.extend(stopword_list)
        print("추가한 불용어")
        print(stopword_list)

    def change_similar_words(self, tokenized_docs, similar_words_dict):
        changed_docs = []
        for doc in tokenized_docs:
            changed_doc = []
            for word in doc:
                if word in similar_words_dict.keys():
                    changed_doc.append(similar_words_dict[word])
                else:
                    changed_doc.append(word)
            changed_docs.append(changed_doc)
        return changed_docs

    def tokenize(self, data):
        print('추가한 명사:', self.noun_list)
        print('불용어: ', self.stopwords)
        tokenized_doc = data.apply(lambda x: self.twitter_obj.nouns(x))
        tokenized_doc_without_stopwords = tokenized_doc.apply(
            lambda x:
            [item.lower() for item in x if item not in self.stopwords])
        tokenized_data = tokenized_doc_without_stopwords
        return pd.Series(tokenized_data)
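A short usage sketch for the PreprocessingText class above, assuming the class and its imports (re, pandas, ckonlpy) are in scope and using a hypothetical two-row Series:

import pandas as pd

pt = PreprocessingText()
docs = pd.Series(['김모 기자 abc@news.com 보도, 코스피가 3일 상승했다.',
                  '전문가들은 코스피 상승세가 이어질 것으로 봤다.'])
cleaned = pt.make_content_re(docs)        # regex-based cleanup
pt.add_noun_dict(['코스피'])              # register a domain noun
pt.add_stopwords(['상승'])                # register a stopword
tokens = pt.tokenize(cleaned)             # noun tokens per document
print(tokens)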
Example #17
def naver():
    from selenium import webdriver
    import re
    from selenium.webdriver.common.keys import Keys
    import time
    cr_name = 'naver'
    # Check that the image save directory exists
    save_path = os.path.join(Main.img_path, cr_name)
    if os.path.isdir(save_path):
        print(cr_name + ' 이미지 경로 확인 완료')
    elif os.path.isdir(Main.img_path):
        os.mkdir(save_path)
    else:
        os.mkdir(Main.img_path)
        os.mkdir(save_path)

    text_save_path = os.path.join(Main.text_path, cr_name)
    if os.path.isdir(text_save_path):
        print(cr_name + ' 텍스트 경로 확인 완료')
    elif os.path.isdir(Main.text_path):
        os.mkdir(text_save_path)
    else:
        os.mkdir(Main.text_path)
        os.mkdir(text_save_path)

    # Fetch the Naver ranking headlines

    date = time.strftime('%Y%m%d', time.localtime(time.time()))
    date2 = time.strftime('%Y%m%d_%H%M', time.localtime(time.time()))

    result = []
    res = []

    # Browser setup
    chrome = chromedriver.generate_chrome(driver_path=Main.driver_path,
                                          headless=Main.headless,
                                          download_path=Main.DOWNLOAD_DIR)

    # Open the site - Naver news ranking
    print("Naver 접속중")
    # driver = webdriver.Chrome(executable_path="./chromedriver.exe")
    # driver.implicitly_wait(30)

    url = 'https://news.naver.com/main/ranking/popularDay.nhn?rankingType=popular_day&date={}'.format(
        date)
    chrome.get(url)
    time.sleep(2)

    # scroll(3)
    for sun in range(4, 10):
        pr = chrome.find_elements_by_xpath(
            '//*[@id="wrap"]/table/tbody/tr/td[2]/div/div[{}]'.format(sun))
        for p in pr:
            result.append(p.find_elements_by_tag_name('a'))
        # print(result)

        for i, q in enumerate(result):
            for e in q:
                res.append(e.get_attribute('href'))
    http = list(set(res))
    len(http)
    https = []

    for idx in range(len(http)):
        if http[idx].find('popularDay') >= 0:
            continue
        else:
            https.append(http[idx])

    files = pd.DataFrame()

    for i in range(len(https)):
        res = requests.get(https[i])
        soup = BeautifulSoup(res.content, 'html.parser')
        body = soup.select('._article_body_contents')
        files = files.append(
            pd.DataFrame(
                {
                    'Title':
                    soup.find('div', attrs={
                        'class': 'article_info'
                    }).h3.text,
                    'Contents':
                    re.sub(
                        '   ', '',
                        re.sub(
                            '    ', '',
                            re.sub(
                                '\t', '',
                                cleanText(body[0].text)
                                [(cleanText(body[0].text)).find('{}') + 2:]))),
                    'link':
                    https[i]
                },
                index=[i]))

    text2 = files.Contents
    # Save to a CSV file
    files.to_csv(text_save_path + '/네이버종합뉴스_{}.csv'.format(date2),
                 index=False,
                 encoding='utf-8')

    # -------------------------------------

    # Build the custom dictionary
    from ckonlpy.tag import Twitter
    t = Twitter()
    t.add_dictionary(Main.sajun(), 'Noun')

    import nltk
    tokens_ko = []

    for i in range(len(text2)):
        tokens_ko.append(t.nouns(text2[i]))

    final = []
    for _, q in enumerate(tokens_ko):
        for i in range(len(q)):
            final.insert(-1, q[i])

    ko = nltk.Text(final, name="첫번째")
    data = ko.vocab().most_common(1000)

    data_1 = []
    for i in range(len(data)):
        if len(data[i][0]) >= 2:
            data_1.append(data[i])

    from wordcloud import WordCloud
    import matplotlib.pyplot as plt

    import time
    date = time.strftime('%Y%m%d', time.localtime(time.time()))
    date2 = time.strftime('%Y%m%d_%H%M', time.localtime(time.time()))

    tmp_data = dict(data_1)

    wordcloud = WordCloud(font_path='/Library/Fonts/NanumMyeongjo.ttf',
                          background_color='white',
                          max_words=230).generate_from_frequencies(tmp_data)
    plt.figure(figsize=(10, 8))
    plt.imshow(wordcloud)
    plt.axis('off'), plt.xticks([]), plt.yticks([])
    plt.tight_layout()
    plt.subplots_adjust(left=0, bottom=0, right=1, top=1, hspace=0, wspace=0)
    plt.savefig(save_path + "/naver_{}.png".format(date),
                bbox_inches='tight',
                dpi=400,
                pad_inches=0)
Example #18
    import matplotlib.pyplot as plt
    from ckonlpy.tag import Twitter
    from wordcloud import WordCloud

    from matplotlib import font_manager, rc
    from PIL import Image
    import numpy as np

    font_name = font_manager.FontProperties(
        fname="c:/Windows/Fonts/malgun.ttf").get_name()
    rc('font', family=font_name)

    tagger = Twitter()

    for add in add_noun:
        tagger.add_dictionary(add, 'Noun')

    text = open(keyword + '.txt', encoding='utf-8-sig').read()

    tokens = tagger.pos(text)

    wordlist = []
    for word in tokens:
        if word[1] in ['Noun']:
            if word[0] not in no_word:
                wordlist.append(word[0])

    words = nltk.Text(wordlist, name=keyword)
    top_words = words.vocab().most_common(1000)
    words_dic = dict(top_words)
Example #19
def kor_preprocessing(q, q3, df):
    data = df.copy().reset_index(drop=True)
    temp = []

    data = data.str.join('').str.replace(r"\n", "")
    data = data.str.replace(pat=r'[^\w]', repl=r'', regex=True)

    okt = Okt()
    for i in range(len(data)):
        new = okt.normalize(data[i])  # normalization

        new = only_hangle(new)
        new = emoticon_normalize(new,
                                 num_repeats=2)  # ㅋㅋㅋㅋㅋㅋ -> ㅋㅋ, ㅠㅠㅠㅠ -> ㅠㅠ

        data[i] = data[i].replace(" ", '')

        spacing = Spacing()
        new = spacing(data[i])  # Apply space preprocessing
        try:
            new = spell_checker.check(new).checked  # spelling correction
        except Exception:
            print(new)
        temp.append(new)

    data = pd.Series(temp)

    # Register slang and newly coined words in the dictionary
    token = Twitter()
    adding_noun = [
        '식후감', '존맛', '개존맛', '꿀맛', '짱맛', '요기요', 'ㅈㅁㅌ', 'ㅃㄲ', '소확행', '민초', '치밥',
        '소맥', '넘사벽', '순삭', '빛삭', '광삭', '반반무', '반반무마니', '솔까말', '스압', '썸남', '썸녀',
        'jmt', 'jmtg', 'jmtgr', 'JMT', 'JMTG', 'JMTGR', '배불띠', '돈쭐', '쿨타임',
        '닥추', '강추', '유튜버', '홧팅', '팟팅', '단짠단짠', '단짠', '맵단', '맵달', '맛도리', '부조캐',
        '밍밍쓰', '노맛', '존노맛', '최애', '차애', '섭스', '서빗', '프레젠또', '존맛탱', '개존맛탱',
        '존맛탱구리', '킹맛', '댕맛', '뿌링클', '로제', '오레오', '로투스', '사장님', '싸장님', '사장뉨',
        '소소한', '프라프치노', ' 프라푸치노', '갓성비', '커엽', '굳잡', '굿잡', '굳굳', '이벵트', '이벵'
    ]

    for i in adding_noun:
        token.add_dictionary(i, 'Noun')  # register as Noun

    adding_verb = ['맛나', '마이쩡', '마이쪙', '마시땅', '마시쩡', '마시쪙']

    for i in adding_verb:
        token.add_dictionary(i, 'Noun')  # also registered as Noun here

    token.add_dictionary('잘', 'Noun')

    token = Okt()
    # Stopword list
    with open('stop.txt', 'rt', encoding='UTF8') as f:
        stopwords = f.read().replace('\n', ' ')
    stopwords = stopwords.split(' ')

    result = []
    for i in range(len(data)):
        review = data[i]
        temp = (token.morphs(review, norm=True, stem=True))

        stopwords_removed_sentence = [
            word for word in temp if not word in stopwords
        ]  # remove stopwords
        sentence = ''

        for s in stopwords_removed_sentence:
            sentence = sentence + ' ' + s
        result.append(sentence)
    q.put(result)
    q3.put(df)
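Since add_dictionary also accepts a list of words (as in Example #9), the word-by-word loops above can be collapsed into single calls; a sketch assuming the same adding_noun and adding_verb lists:

from ckonlpy.tag import Twitter

token = Twitter()
token.add_dictionary(adding_noun, 'Noun')   # adding_noun as defined above
token.add_dictionary(adding_verb, 'Noun')   # registered as Noun, matching the original code
token.add_dictionary('잘', 'Noun')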
Example #20
def twitter():
    cr_name = 'twitter'
    # Check that the image save directory exists
    save_path = os.path.join(Main.img_path, cr_name)
    if os.path.isdir(save_path):
        print(cr_name + ' 이미지 경로 확인 완료')
    elif os.path.isdir(Main.img_path):
        os.mkdir(save_path)
    else:
        os.mkdir(Main.img_path)
        os.mkdir(save_path)

    text_save_path = os.path.join(Main.text_path, cr_name)
    if os.path.isdir(text_save_path):
        print(cr_name + ' 텍스트 경로 확인 완료')
    elif os.path.isdir(Main.text_path):
        os.mkdir(text_save_path)
    else:
        os.mkdir(Main.text_path)
        os.mkdir(text_save_path)


    import time
    import nltk
    keyword = Main.text()

    # Browser setup
    chrome = chromedriver.generate_chrome(
        driver_path=Main.driver_path,
        headless=Main.headless,
        download_path=Main.DOWNLOAD_DIR)

    # Open the site - Twitter search
    print("Twitter 접속중")
    # driver = webdriver.Chrome(executable_path="./chromedriver.exe")
    # driver.implicitly_wait(30)

    url = 'https://twitter.com/search?q={}&src=typed_query'.format(keyword)
    chrome.get(url)
    time.sleep(3)


    # text2 = chrome.find_elements_by_css_selector('#react-root > div > div > div > main > div > div > div > div > div > div:nth-child(2) > div')


    # for i in range(15):
    #     for q in range(3):
    #         body = chrome.find_element_by_css_selector('body')
    #         body.send_keys(Keys.PAGE_DOWN)
    #         time.sleep(1)
    #     for ttt in tqdm(text2):
    #         result.append(ttt.text)
    #     time.sleep(1)
    #
    #
    # result2 = []
    # for i in range(len(result)):
    #     if i % 2 == 0:
    #         result2.append(result[i])
    # print(len(result2))
    #
    # result3 = []
    # for i in range(len(result2)):
    #     result3.append(cleanText(result2[i]))

    body = chrome.find_element_by_css_selector('body')
    text2 = chrome.find_elements_by_css_selector('#react-root > div > div > div.css-1dbjc4n.r-18u37iz.r-13qz1uu.r-417010 > main > div > div > div > div > div > div:nth-child(2) > div > div > section > div')
    result = []

    for i in range(10):
        for q in range(3):
            body.send_keys(Keys.PAGE_DOWN)
            time.sleep(1)
        for ttt in tqdm(text2):
            result.append(re.sub('\n', '', ttt.text))

    t = Twitter()
    t.add_dictionary(Main.sajun(), 'Noun')

    tokens_ko = []

    for i in range(len(result)):
        tokens_ko.append(t.nouns(result[i]))
    final = []
    for _, q in enumerate(tokens_ko):
        for i in range(len(q)):
            final.insert(-1, q[i])

    ko = nltk.Text(final, name="첫번째")
    data = ko.vocab().most_common(1000)
    date = time.strftime('%Y%m%d', time.localtime(time.time()))
    date2 = time.strftime('%Y%m%d_%H%M', time.localtime(time.time()))


    # Save the collected posts to a text file
    file = open(text_save_path+'/twitter{}.txt'.format(date2), 'w', encoding='utf-8')

    for review in result:
        file.write(review + '\n')

    file.close()

    tmp_data = dict(data)

    wordcloud = WordCloud(font_path='/Library/Fonts/NanumMyeongjo.ttf',
                          background_color='white', max_words=230).generate_from_frequencies(tmp_data)
    plt.figure(figsize=(10, 8))
    plt.imshow(wordcloud)
    plt.axis('off'), plt.xticks([]), plt.yticks([])
    plt.tight_layout()
    plt.subplots_adjust(left=0, bottom=0, right=1, top=1, hspace=0, wspace=0)
    plt.savefig(save_path+"/twitter_{}.png".format(date), bbox_inches='tight', dpi=400, pad_inches=0)
Example #21
class Social_analysis():

    non_bmp_map = dict.fromkeys(range(0x10000, sys.maxunicode + 1), 0xfffd)

    def __init__(self):
        self.twitter = Twitter()

    def pickle_to_table(self, filename):
        with open(filename, 'rb') as f:
            data = pickle.load(f)
        data = data[1:]
        for idx, i in enumerate(data):
            data[idx][2] = i[2].replace('#', ' ').translate(self.non_bmp_map)
            data[idx][3] = '/'.join(i[3])
            data[idx][4] = '/'.join(i[4])
        self.raw_data = np.array(data)

    def hashtags_split(self, hashtags):
        hashtags_split = []
        for i in hashtags:
            hashtags_split.append(i.split('/'))

        hashtags_list = []

        for i in hashtags_split:
            temp = []
            for j in i:
                if self.isHangul(j):
                    t_hashtags = j.translate(self.non_bmp_map)
                    temp.append(t_hashtags)
            hashtags_list.append(temp)
        self.hashtags_list = hashtags_list

        return hashtags_list

    def add_keyword_dic(self, keyword_list, tag='Noun'):
        for i in keyword_list:
            if type(i) == tuple:
                self.twitter.add_dictionary(i[0], i[1])
            else:
                self.twitter.add_dictionary(i, tag)

    def morph_pos(self, text_list, exception_list=['맛', '밥', '물', '몸']):

        morph_list = []
        noun_list = []
        adj_list = []
        verb_list = []

        for j in text_list:
            parsed = self.twitter.pos(j)
            temp = []
            n_temp = []
            adj_temp = []
            verb_temp = []

            for i in parsed:
                if self.isHangul(i[0]):
                    if ((len(i[0]) > 1) or (i[0] in exception_list)):
                        temp.append(i)
                        if i[1] == 'Noun':
                            n_temp.append(i[0])
                        elif i[1] == 'Verb':
                            verb_temp.append(i[0])
                        elif i[1] == 'Adjective':
                            adj_temp.append(i[0])
                    else:
                        print('{} 제외'.format(i[0]))
                else:
                    print('{} 한글이 아님.'.format(i[0]))

            morph_list.append(temp)
            noun_list.append(n_temp)
            adj_list.append(adj_temp)
            verb_list.append(verb_temp)

        nav_list = noun_list + adj_list + verb_list

        return morph_list, nav_list, noun_list, adj_list, verb_list

    def merge_list(self, tokenized_list):
        return [j for i in tokenized_list for j in i]

    def join_list(self, tokenized_list):
        joined_list = []
        for idx, i in enumerate(tokenized_list):
            joined_list.append(" ".join(i))
        return joined_list

    def split_list(self, untokenized_list):
        hashtag_splited = []
        for idx, i in enumerate(untokenized_list):
            hashtag_splited.append(i.split('/'))
        return hashtag_splited

    def word_substitute(self, dataset, sublist):
        dataset = copy.deepcopy(dataset)
        sub_book = dict()
        for i in sublist:
            for j in i['sub_words']:
                sub_book[j] = i['main']
        gc.collect()
        for n, i in enumerate(dataset):
            dataset[n] = [sub_book.get(item, item) for item in i]

        del sub_book
        gc.collect()

        return dataset

    def word_delete(self, dataset, del_list):
        dataset = copy.deepcopy(dataset)

        for n, line in enumerate(dataset):
            dataset[n] = [i for i in line if i not in del_list]

        return dataset

    def isHangul(self, text):
        encText = text
        hanCount = len(re.findall(u'[\u3130-\u318F\uAC00-\uD7A3]+', encText))
        return hanCount > 0
Example #22
class Social_analysis():

    non_bmp_map = dict.fromkeys(range(0x10000, sys.maxunicode + 1), 0xfffd)

    def __init__(self):
        self.twitter = Twitter()

    def pickle_to_table(self, filename):
        with open(filename, 'rb') as f:
            data = pickle.load(f)
        data = data[1:]
        for idx, i in enumerate(data):
            data[idx][2] = i[2].replace('#', ' ').translate(self.non_bmp_map)
            data[idx][3] = '/'.join(i[3])
            data[idx][4] = '/'.join(i[4])
        self.raw_data = np.array(data)

    def DB_to_table(self, DBname='intake', keyword='intake'):
        self.query = \
        """
        SELECT keyword, created_at, post_name, main_text, current_url FROM NaverBlogReview WHERE keyword = '{}'
        """.format(keyword)
        conn = pymssql.connect(
            "intakedb.c63elkxbiwfc.us-east-2.rds.amazonaws.com:1433", "gh",
            "ghintake", DBname)
        df = pdsql.read_sql_query(self.query, con=conn)
        # df['main_text'] = df.main_text.apply(lambda x: x.replace('#',' ').translate(self.non_bmp_map))
        # df['created_at'] = df.created_at.apply(lambda x: x.strftime("%Y-%m-%d %H:%M:%S"))
        conn.close()
        self.raw_data = df.as_matrix()

    # def hashtags_split(self, hashtags):
    #     hashtags_split = []
    #     for i in hashtags:
    #         hashtags_split.append(i.split('/'))
    #
    #     hashtags_list = []
    #
    #     for i in hashtags_split:
    #         temp = []
    #         for j in i:
    #             if self.isHangul(j):
    #                 t_hashtags = j.translate(self.non_bmp_map)
    #                 temp.append(t_hashtags)
    #         hashtags_list.append(temp)
    #     self.hashtags_list = hashtags_list
    #
    #     return hashtags_list

    def add_keyword_dic(self, keyword_list, tag='Noun'):
        for i in keyword_list:
            if type(i) == tuple:
                self.twitter.add_dictionary(i[0], i[1])
            else:
                self.twitter.add_dictionary(i, tag)

    def morph_pos(self, text_list, exception_list=['맛', '밥', '물', '몸']):

        morph_list = []
        noun_list = []
        adj_list = []
        verb_list = []

        for j in text_list:
            parsed = self.twitter.pos(j)
            temp = []
            n_temp = []
            adj_temp = []
            verb_temp = []

            for i in parsed:
                if self.isHangul(i[0]):
                    if ((len(i[0]) > 1) or (i[0] in exception_list)):
                        temp.append(i)
                        if i[1] == 'Noun':
                            n_temp.append(i[0])
                        elif i[1] == 'Verb':
                            verb_temp.append(i[0])
                        elif i[1] == 'Adjective':
                            adj_temp.append(i[0])
                    else:
                        print('{} 제외'.format(i[0]))
                else:
                    print('{} 한글이 아님.'.format(i[0]))

            morph_list.append(temp)
            noun_list.append(n_temp)
            adj_list.append(adj_temp)
            verb_list.append(verb_temp)

        nav_list = noun_list + adj_list + verb_list

        return morph_list, nav_list, noun_list, adj_list, verb_list

    def merge_list(self, tokenized_list):
        return [j for i in tokenized_list for j in i]

    def join_list(self, tokenized_list):
        joined_list = []
        for idx, i in enumerate(tokenized_list):
            joined_list.append(" ".join(i))
        return joined_list

    def split_list(self, untokenized_list):
        hashtag_splited = []
        for idx, i in enumerate(untokenized_list):
            hashtag_splited.append(i.split('/'))
        return hashtag_splited

    def word_substitute(self, dataset, sublist):
        dataset = copy.deepcopy(dataset)
        sub_book = dict()
        for i in sublist:
            for j in i['sub_words']:
                sub_book[j] = i['main']
        gc.collect()
        for n, i in enumerate(dataset):
            dataset[n] = [sub_book.get(item, item) for item in i]

        del sub_book
        gc.collect()

        return dataset

    def word_delete(self, dataset, del_list):
        dataset = copy.deepcopy(dataset)

        for n, line in enumerate(dataset):
            dataset[n] = [i for i in line if i not in del_list]

        return dataset

    def isHangul(self, text):
        encText = text
        hanCount = len(re.findall(u'[\u3130-\u318F\uAC00-\uD7A3]+', encText))
        return hanCount > 0
Example #23
train_stories[3572]

train_questions[3572]

train_answers[3572]

twitter = Twitter()

print(twitter.morphs('은경이는 화장실로 이동했습니다.'))
print(twitter.morphs('경임이는 정원으로 가버렸습니다.'))
print(twitter.morphs('수종이는 복도로 뛰어갔습니다.'))
print(twitter.morphs('필웅이는 부엌으로 복귀했습니다.'))
print(twitter.morphs('수종이는 사무실로 갔습니다.'))
print(twitter.morphs('은경이는 침실로 갔습니다.'))

twitter.add_dictionary('은경이', 'Noun')
twitter.add_dictionary('경임이', 'Noun')
twitter.add_dictionary('수종이', 'Noun')

print(twitter.morphs('은경이는 화장실로 이동했습니다.'))
print(twitter.morphs('경임이는 정원으로 가버렸습니다.'))
print(twitter.morphs('수종이는 복도로 뛰어갔습니다.'))
print(twitter.morphs('필웅이는 부엌으로 복귀했습니다.'))
print(twitter.morphs('수종이는 사무실로 갔습니다.'))
print(twitter.morphs('은경이는 침실로 갔습니다.'))

def preprocess_data(train_data, test_data):
    counter = FreqDist()

    # merge a two-sentence story into a single sequence
    flatten = lambda data: reduce(lambda x, y: x + y, data)
from konlpy.tag import Komoran
from konlpy.tag import Okt
from ckonlpy.tag import Twitter
okt = Okt()
twitter = Twitter()

sentence = 'IBK기업은행 '
sentences = '소은지국민은행계좌로30만원이체해줘'
komoran = Komoran()

twitter.add_dictionary('이체해줘', 'Noun')
twitter.add_dictionary('KB 국민은행', 'Noun')

komoran = Komoran(userdic="C:/Users/ADMIN/Desktop/dic.txt")

print(twitter.pos(sentence, stem=True))
print(twitter.pos(sentences, stem=True))

print(komoran.pos(sentence))
print(komoran.pos(sentences))

arr = komoran.pos(sentence)
for word, tag in arr:
    if (tag == 'VV'): print("|||||||")
    print(word, tag)
    if (tag == 'JKO' or tag == 'JKB' or tag == 'JKS'): print("|||||||")

brr = komoran.pos(sentences)
for word, tag in brr:

    if (tag == 'VV' or tag == 'XSV'):