Example #1
def tokenize_okt_noscreen(df):
    okt = Twitter()
    okt.add_dictionary(call_userword(), 'Noun')
    stopwords = load_wordset('./tokenizer/korean_stopword.txt')
    #stopwords = stopwords | load_wordset('./tokenizer/korean_screen.txt')
    stopwords = set(stopwords)  # keep as a set for fast membership checks
    df['content_token'] = df.progress_apply(lambda x: [t[0] for t in okt.pos(
        x['content'], stem=True) if t[1] in ['Noun', 'Verb', 'Adjective'] and t[0] not in stopwords and len(t[0]) != 1], axis=1)
    df['title_token'] = df.progress_apply(lambda x: [t[0] for t in okt.pos(
        x['title'], stem=True) if t[1] in ['Noun', 'Verb', 'Adjective'] and t[0] not in stopwords and len(t[0]) != 1], axis=1)
    return df
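A minimal usage sketch for Example #1 (an assumption, not part of the original source): progress_apply requires calling tqdm.pandas() first, and call_userword / load_wordset are helpers from the example's own project.

import pandas as pd
from tqdm import tqdm

tqdm.pandas()  # enables DataFrame.progress_apply used inside tokenize_okt_noscreen

# hypothetical sample data; the real pipeline feeds crawled articles
sample = pd.DataFrame({
    'title': ['가상통화 시세 급등'],
    'content': ['가상통화 가격이 한동안 크게 올랐다.'],
})
sample = tokenize_okt_noscreen(sample)
print(sample[['title_token', 'content_token']])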
Example #2
class RawTaggerReader:
    def __init__(self, filepath, tagger=None):
        if tagger:
            self.tagger = tagger
        else:
            self.tagger = Twitter()
        self.filepath = filepath
        self.rgxSplitter = re.compile('([.!?:](?:["\']|(?![0-9])))')

    def __iter__(self):
        with open(self.filepath, encoding='utf-8') as f:
            for line in f:
                ch = self.rgxSplitter.split(line)
                # re-attach each sentence fragment to its trailing delimiter
                for s in map(lambda a, b: a + b, ch[::2], ch[1::2]):
                    if not s: continue
                    yield self.tagger.pos(s)
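A minimal usage sketch for Example #2, assuming a UTF-8 text file at a hypothetical path corpus.txt:

reader = RawTaggerReader('corpus.txt')
for tagged_sentence in reader:
    # each item is a list of (token, POS) tuples produced by the Twitter tagger
    print(tagged_sentence)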
Example #3
class RawTagger:
    def __init__(self, textIter, tagger=None):
        # Preprocessing 1: register words that the morphological analyzer fails to recognize on its own
        with open('형태소 보완.txt', encoding='utf-8') as f:
            user_nouns = f.read().split('\n')
        if tagger:
            self.tagger = tagger
        else:
            self.tagger = Twitter()
            self.tagger.add_dictionary(user_nouns, 'Noun')
        if type(textIter) == str:
            self.textIter = textIter.split('\n')
        else:
            self.textIter = textIter
        self.rgxSplitter = re.compile('([.!?:](?:["\']|(?![0-9])))')

    def __iter__(self):
        for line in self.textIter:
            ch = self.rgxSplitter.split(line)
            for s in map(lambda a, b: a + b, ch[::2], ch[1::2]):
                if not s: continue
                yield self.tagger.pos(s)
Example #4
class Social_analysis():

    non_bmp_map = dict.fromkeys(range(0x10000, sys.maxunicode + 1), 0xfffd)

    def __init__(self):
        self.twitter = Twitter()

    def pickle_to_table(self, filename):
        with open(filename, 'rb') as f:
            data = pickle.load(f)
        data = data[1:]
        for idx, i in enumerate(data):
            data[idx][2] = i[2].replace('#', ' ').translate(self.non_bmp_map)
            data[idx][3] = '/'.join(i[3])
            data[idx][4] = '/'.join(i[4])
        self.raw_data = np.array(data)

    def hashtags_split(self, hashtags):
        hashtags_split = []
        for i in hashtags:
            hashtags_split.append(i.split('/'))

        hashtags_list = []

        for i in hashtags_split:
            temp = []
            for j in i:
                if self.isHangul(j):
                    t_hashtags = j.translate(self.non_bmp_map)
                    temp.append(t_hashtags)
            hashtags_list.append(temp)
        self.hashtags_list = hashtags_list

        return hashtags_list

    def add_keyword_dic(self, keyword_list, tag='Noun'):
        for i in keyword_list:
            if type(i) == tuple:
                self.twitter.add_dictionary(i[0], i[1])
            else:
                self.twitter.add_dictionary(i, tag)

    def morph_pos(self, text_list, exception_list=['맛', '밥', '물', '몸']):

        morph_list = []
        noun_list = []
        adj_list = []
        verb_list = []

        for j in text_list:
            parsed = self.twitter.pos(j)
            temp = []
            n_temp = []
            adj_temp = []
            verb_temp = []

            for i in parsed:
                if self.isHangul(i[0]):
                    if ((len(i[0]) > 1) or (i[0] in exception_list)):
                        temp.append(i)
                        if i[1] == 'Noun':
                            n_temp.append(i[0])
                        elif i[1] == 'Verb':
                            verb_temp.append(i[0])
                        elif i[1] == 'Adjective':
                            adj_temp.append(i[0])
                    else:
                        print('{} 제외'.format(i[0]))
                else:
                    print('{} 한글이 아님.'.format(i[0]))

            morph_list.append(temp)
            noun_list.append(n_temp)
            adj_list.append(adj_temp)
            verb_list.append(verb_temp)

        nav_list = noun_list + adj_list + verb_list

        return morph_list, nav_list, noun_list, adj_list, verb_list

    def merge_list(self, tokenized_list):
        return [j for i in tokenized_list for j in i]

    def join_list(self, tokenized_list):
        joined_list = []
        for idx, i in enumerate(tokenized_list):
            joined_list.append(" ".join(i))
        return joined_list

    def split_list(self, untokenized_list):
        hashtag_splited = []
        for i in untokenized_list:
            hashtag_splited.append(i.split('/'))
        return hashtag_splited

    def word_substitute(self, dataset, sublist):
        dataset = copy.deepcopy(dataset)
        sub_book = dict()
        for i in sublist:
            for j in i['sub_words']:
                sub_book[j] = i['main']
        gc.collect()
        for n, i in enumerate(dataset):
            dataset[n] = [sub_book.get(item, item) for item in i]

        del sub_book
        gc.collect()

        return dataset

    def word_delete(self, dataset, del_list):
        dataset = copy.deepcopy(dataset)

        for n, line in enumerate(dataset):
            dataset[n] = [i for i in line if i not in del_list]

        return dataset

    def isHangul(self, text):
        encText = text
        hanCount = len(re.findall(u'[\u3130-\u318F\uAC00-\uD7A3]+', encText))
        return hanCount > 0
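A minimal usage sketch for the Social_analysis class in Example #4 (the sample strings are hypothetical; the class assumes sys, re, copy, gc, pickle, numpy and ckonlpy's Twitter are already imported): add_keyword_dic accepts plain strings or (word, tag) tuples, and morph_pos returns five parallel lists.

sa = Social_analysis()
sa.add_keyword_dic(['가상통화', ('비비크림', 'Noun')])
morph_list, nav_list, noun_list, adj_list, verb_list = sa.morph_pos(['맛이 아주 좋다'])
print(noun_list, adj_list, verb_list)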
Example #5
    def token(self, title, ccontent, creplies):
        memory = psutil.Process(os.getpid())

        T_OR_title = []
        T_title = []
        T_OR_ccontent = []
        T_ccontent = []
        T_OR_creplies = []
        T_creplies = []

        twitter = Okt()  # initialized to use the Twitter morpheme dictionary
        twitter.add_dictionary('백래시', 'Noun')
        twitter.add_dictionary('문재앙', 'Noun')

        #### Title tokenization
        #print('1')
        for i in range(len(title)):

            a = twitter.pos(title[i])
            b = []
            #print('title[i]',i,title[i])
            for j in range(len(a)):
                if a[j][1] != 'Punctuation':  # drop tuples tagged 'Punctuation'
                    b.append(a[j])
                    #print('3',j)
            T_OR_title.append(b)
            T_title.append(twitter.morphs(title[i]))

            #### ccontent tokenization
            try:
                c = twitter.pos(str(ccontent[i]))
                d = []
                # print('ccontent[i]',i, ccontent[i])
                for w in range(len(c)):
                    if c[w][1] != 'Punctuation':  # drop tuples tagged 'Punctuation'
                        d.append(c[w])
                        #print('4',w)
                T_OR_ccontent.append(d)
                T_ccontent.append(twitter.morphs(str(ccontent[i])))

            except RuntimeError as e:
                T_OR_ccontent.append('')
                T_ccontent.append(twitter.morphs(''))

            ### Reply tokenization
            #print('creplies[i]',i,creplies[i])

            if type(creplies[i]) == str:  # tokenize a reply given as a single string
                a = [creplies[i]]  # wrap the string in a list
                e = twitter.pos(str(a))
                f = []
                for u in range(len(e)):
                    # keep tokens that are neither punctuation nor Korean particles
                    if e[u][1] not in ('Punctuation', 'KoreanParticle'):
                        f.append(e[u])
                    #print('5',u)
                T_OR_creplies.append(f)
                T_creplies.append(twitter.morphs(str(a)))

            else:
                temp = []
                temp2 = []

                for n in range(len(creplies[i])):  ### replies that come back as a list
                    x = []  # reset per reply so tokens do not accumulate across replies
                    h = twitter.pos(creplies[i][n])
                    #print('6',n)

                    for z in range(len(h)):
                        # keep tokens that are neither punctuation nor Korean particles
                        if h[z][1] not in ('Punctuation', 'KoreanParticle'):
                            x.append(h[z])
                    # print('7',z)
                    # print('8',)
                    #print('h',z,h)

                    temp.append(x)
                    temp2.append(twitter.morphs(creplies[i][n]))

                T_OR_creplies.append(temp)
                T_creplies.append(temp2)

        return T_OR_title, T_title, T_OR_ccontent, T_ccontent, T_OR_creplies, T_creplies
Example #6
class Social_analysis():

    non_bmp_map = dict.fromkeys(range(0x10000, sys.maxunicode + 1), 0xfffd)

    def __init__(self):
        self.twitter = Twitter()

    def pickle_to_table(self, filename):
        with open(filename, 'rb') as f:
            data = pickle.load(f)
        data = data[1:]
        for idx, i in enumerate(data):
            data[idx][2] = i[2].replace('#', ' ').translate(self.non_bmp_map)
            data[idx][3] = '/'.join(i[3])
            data[idx][4] = '/'.join(i[4])
        self.raw_data = np.array(data)

    def DB_to_table(self, DBname='intake', keyword='intake'):
        self.query = \
        """
        SELECT keyword, created_at, post_name, main_text, current_url FROM NaverBlogReview WHERE keyword = '{}'
        """.format(keyword)
        conn = pymssql.connect(
            "intakedb.c63elkxbiwfc.us-east-2.rds.amazonaws.com:1433", "gh",
            "ghintake", DBname)
        df = pdsql.read_sql_query(self.query, con=conn)
        # df['main_text'] = df.main_text.apply(lambda x: x.replace('#',' ').translate(self.non_bmp_map))
        # df['created_at'] = df.created_at.apply(lambda x: x.strftime("%Y-%m-%d %H:%M:%S"))
        conn.close()
        self.raw_data = df.to_numpy()  # df.as_matrix() was removed from pandas; to_numpy() is the current equivalent

    # def hashtags_split(self, hashtags):
    #     hashtags_split = []
    #     for i in hashtags:
    #         hashtags_split.append(i.split('/'))
    #
    #     hashtags_list = []
    #
    #     for i in hashtags_split:
    #         temp = []
    #         for j in i:
    #             if self.isHangul(j):
    #                 t_hashtags = j.translate(self.non_bmp_map)
    #                 temp.append(t_hashtags)
    #         hashtags_list.append(temp)
    #     self.hashtags_list = hashtags_list
    #
    #     return hashtags_list

    def add_keyword_dic(self, keyword_list, tag='Noun'):
        for i in keyword_list:
            if type(i) == tuple:
                self.twitter.add_dictionary(i[0], i[1])
            else:
                self.twitter.add_dictionary(i, tag)

    def morph_pos(self, text_list, exception_list=['맛', '밥', '물', '몸']):

        morph_list = []
        noun_list = []
        adj_list = []
        verb_list = []

        for j in text_list:
            parsed = self.twitter.pos(j)
            temp = []
            n_temp = []
            adj_temp = []
            verb_temp = []

            for i in parsed:
                if self.isHangul(i[0]):
                    if ((len(i[0]) > 1) or (i[0] in exception_list)):
                        temp.append(i)
                        if i[1] == 'Noun':
                            n_temp.append(i[0])
                        elif i[1] == 'Verb':
                            verb_temp.append(i[0])
                        elif i[1] == 'Adjective':
                            adj_temp.append(i[0])
                    else:
                        print('{} 제외'.format(i[0]))
                else:
                    print('{} 한글이 아님.'.format(i[0]))

            morph_list.append(temp)
            noun_list.append(n_temp)
            adj_list.append(adj_temp)
            verb_list.append(verb_temp)

        nav_list = noun_list + adj_list + verb_list

        return morph_list, nav_list, noun_list, adj_list, verb_list

    def merge_list(self, tokenized_list):
        return [j for i in tokenized_list for j in i]

    def join_list(self, tokenized_list):
        joined_list = []
        for idx, i in enumerate(tokenized_list):
            joined_list.append(" ".join(i))
        return joined_list

    def split_list(self, untokenized_list):
        hashtag_splited = []
        for i in untokenized_list:
            hashtag_splited.append(i.split('/'))
        return hashtag_splited

    def word_substitute(self, dataset, sublist):
        dataset = copy.deepcopy(dataset)
        sub_book = dict()
        for i in sublist:
            for j in i['sub_words']:
                sub_book[j] = i['main']
        gc.collect()
        for n, i in enumerate(dataset):
            dataset[n] = [sub_book.get(item, item) for item in i]

        del sub_book
        gc.collect()

        return dataset

    def word_delete(self, dataset, del_list):
        dataset = copy.deepcopy(dataset)

        for n, line in enumerate(dataset):
            dataset[n] = [i for i in line if i not in del_list]

        return dataset

    def isHangul(self, text):
        encText = text
        hanCount = len(re.findall(u'[\u3130-\u318F\uAC00-\uD7A3]+', encText))
        return hanCount > 0
Example #7
    from matplotlib import font_manager, rc
    from PIL import Image
    import numpy as np

    font_name = font_manager.FontProperties(
        fname="c:/Windows/Fonts/malgun.ttf").get_name()
    rc('font', family=font_name)

    tagger = Twitter()

    for add in add_noun:
        tagger.add_dictionary(add, 'Noun')

    text = open(keyword + '.txt', encoding='utf-8-sig').read()

    tokens = tagger.pos(text)

    wordlist = []
    for word in tokens:
        if word[1] in ['Noun']:
            if word[0] not in no_word:
                wordlist.append(word[0])

    words = nltk.Text(wordlist, name=keyword)
    top_words = words.vocab().most_common(1000)
    words_dic = dict(top_words)

    mask = np.array(Image.open("shape.png"))
    wordcloud = WordCloud(
        font_path='c:/Windows/Fonts/malgun.ttf',
        mask=mask,
Example #8
filtered_content = re.sub(r'[^\.\?\!\w\d\s]', '', filtered_content)
print(filtered_content)

from ckonlpy.tag import Twitter
twitter = Twitter()

# Register unrecognized words such as 가상통화 and 아시아경제 in the user dictionary
twitter.add_dictionary('가상통화', 'Noun')
twitter.add_dictionary('아시아경제', 'Noun')
twitter.add_dictionary('한동안', 'Noun')
twitter.add_dictionary('블리클리', 'Noun')
twitter.add_dictionary('공동창립자', 'Noun')
twitter.add_dictionary('부크바', 'Noun')

# Morphological analysis
twitter_morphs = twitter.pos(filtered_content)

# Extract only the nouns
Noun_words = []
for word, pos in twitter_morphs:
    if pos == 'Noun':
        Noun_words.append(word)
print(Noun_words)

# Build a separate stopword list
# Remove words unrelated to the article body: the outlet name 아시아경제, the reporter's name, and 기자
# Remove words that are frequent but unrelated to the main content: 못, 것, 수, 까지
stopwords = ['아시아경제', '김철현', '기자', '못', '것', '수', '까지']

# Keep only the unique nouns
unique_noun_words = set(Noun_words)
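A small follow-on sketch (not part of the original snippet) that applies the stopword list above and counts noun frequencies with collections.Counter:

from collections import Counter

# drop stopwords, then count the remaining nouns
final_nouns = [w for w in Noun_words if w not in stopwords]
word_counts = Counter(final_nouns)
print(word_counts.most_common(10))  # ten most frequent nouns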
Example #9
    # Load the Twitter tokenizer and register the brand dictionary
    twitter = Twitter()
    twitter.add_dictionary(brand_set, 'Noun')

    # Load the original (unmodified) tokenizer
    ori_twitter = original_Twitter()
    #### emoji2text
    preprocessed_text = emoji2text(review_DB.text)
    #### uppercase & product mapping
    preprocessed_text = pd.Series(preprocessed_text).apply(lambda x: x.upper())

    # Keep Noun tokens from the customized tagger as-is; re-tag everything else with the
    # stock tagger (stem + normalize), then join token╹POS pairs with ◈.
    bow = preprocessed_text.apply(lambda x: '◈'.join([
        token + '╹' + pos if pos in ['Noun'] else ori_twitter.
        pos(token, stem=True, norm=True)[0][0] + '╹' + ori_twitter.pos(
            token, stem=True, norm=True)[0][1] for token, pos in twitter.pos(x)
    ]))
    ### token normalizing
    p_normal = {
        'BB╹Alpha': '비비크림╹Noun',
        'CC╹Alpha': '씨씨크림╹Noun',
        '비비╹Noun': '비비크림╹Noun',
        '씨씨╹Noun': '씨씨크림╹Noun',
        '파데╹Noun': '파운데이션╹Noun',
        '쟂빛╹Noun': '잿빛╹Noun',
        '비씨╹Noun': '비씨데이션╹Noun'
    }
    bow = bow.apply(lambda x: '◈'.join([
        p_normal[token] if token in p_normal.keys() else token
        for token in x.split('◈')
    ]))
Example #10
class Social_analysis():

    non_bmp_map = dict.fromkeys(range(0x10000, sys.maxunicode + 1), 0xfffd)

    def __init__(self):
        self.twitter = Twitter()

    def DB_to_table(self, DBname='intake', keyword='intake'):
        import pymssql
        import pandas.io.sql as pdsql
        import pandas as pd
        self.query = \
        """
        SELECT user_id, created_at, main_text, hashtags, comments, likes, current_url FROM instaPost WHERE keyword = '{}'
        """.format(keyword)
        conn = pymssql.connect(
            "intakedb.c63elkxbiwfc.us-east-2.rds.amazonaws.com:1433", "gh",
            "ghintake", DBname)
        df = pdsql.read_sql_query(self.query, con=conn)
        # df['main_text'] = df.main_text.apply(lambda x: x.replace('#',' ').translate(self.non_bmp_map))
        # df['created_at'] = df.created_at.apply(lambda x: x.strftime("%Y-%m-%d %H:%M:%S"))
        conn.close()
        self.raw_data = df.to_numpy()  # df.as_matrix() was removed from pandas; to_numpy() is the current equivalent

    def pickle_to_table(self, filename):
        with open(filename, 'rb') as f:
            data = pickle.load(f)
        data = data[1:]
        for idx, i in enumerate(data):
            data[idx][2] = i[2].replace('#', ' ').translate(self.non_bmp_map)
            data[idx][3] = '/'.join(i[3])
            data[idx][4] = '/'.join(i[4])
        self.raw_data = np.array(data)

    def hashtags_split(self, hashtags):
        hashtags_split = []
        for i in hashtags:
            hashtags_split.append(i.split('/'))

        hashtags_list = []

        for i in hashtags_split:
            temp = []
            for j in i:
                if self.isHangul(j):
                    t_hashtags = j.translate(self.non_bmp_map)
                    temp.append(t_hashtags)
            hashtags_list.append(temp)
        self.hashtags_list = hashtags_list

        return hashtags_list

    def add_keyword_dic(self, keyword_list, tag='Noun'):
        for i in keyword_list:
            if type(i) == tuple:
                self.twitter.add_dictionary(i[0], i[1])
            else:
                self.twitter.add_dictionary(i, tag)

    def morph_pos(self, text_list, exception_list=['맛', '밥', '물', '몸']):

        morph_list = []
        noun_list = []
        adj_list = []
        verb_list = []
        nav_list = []

        for j in text_list:
            parsed = self.twitter.pos(j)
            temp = []
            n_temp = []
            adj_temp = []
            verb_temp = []
            nav_temp = []

            for i in parsed:
                if self.isHangul(i[0]):
                    if ((len(i[0]) > 1) or (i[0] in exception_list)):
                        temp.append(i)
                        if i[1] == 'Noun':
                            n_temp.append(i[0])
                            nav_temp.append(i[0])
                        elif i[1] == 'Verb':
                            verb_temp.append(i[0])
                            nav_temp.append(i[0])
                        elif i[1] == 'Adjective':
                            adj_temp.append(i[0])
                            nav_temp.append(i[0])
                    else:
                        print('{} 제외'.format(i[0]))
                else:
                    print('{} 한글이 아님.'.format(i[0]))

            morph_list.append(temp)
            noun_list.append(n_temp)
            adj_list.append(adj_temp)
            verb_list.append(verb_temp)
            nav_list.append(nav_temp)

        return morph_list, nav_list, noun_list, adj_list, verb_list

    def pos_extractor(self, parsed, exception_list=['맛', '밥', '물', '몸']):

        noun_list = []
        adj_list = []
        verb_list = []
        nav_list = []
        for i in parsed:
            n_temp = []
            adj_temp = []
            verb_temp = []
            nav_temp = []
            if self.isHangul(i[0]):
                if ((len(i[0]) > 1) or (i[0] in exception_list)):
                    if i[1] == 'Noun':
                        n_temp.append(i[0])
                        nav_temp.append(i[0])
                    elif i[1] == 'Verb':
                        verb_temp.append(i[0])
                        nav_temp.append(i[0])
                    elif i[1] == 'Adjective':
                        adj_temp.append(i[0])
                        nav_temp.append(i[0])
                else:
                    print('{} 제외'.format(i[0]))
            else:
                print('{} 한글이 아님.'.format(i[0]))

            noun_list.append(n_temp)
            adj_list.append(adj_temp)
            verb_list.append(verb_temp)
            nav_list.append(nav_temp)

        return nav_list, noun_list, adj_list, verb_list

    def merge_list(self, tokenized_list):
        return [j for i in tokenized_list for j in i]

    def join_list(self, tokenized_list):
        joined_list = []
        for idx, i in enumerate(tokenized_list):
            joined_list.append(" ".join(i))
        return joined_list

    def split_list(self, untokenized_list):
        hashtag_splited = []
        for i in untokenized_list:
            hashtag_splited.append(i.split('/'))
        return hashtag_splited

    def join_underbar(self, morph_list):

        all_list = []
        post_list = []
        for i in morph_list:
            for j in i:
                post_list.append(j[0] + '_' + j[1])
            all_list.append([(' , ').join(post_list)])
            post_list = []
        all_list = np.array(all_list)

        return all_list

    def word_substitute(self, dataset, sublist):
        dataset = copy.deepcopy(dataset)
        sub_book = dict()
        for i in sublist:
            for j in i['sub_words']:
                sub_book[j] = i['main']
        gc.collect()
        for n, i in enumerate(dataset):
            dataset[n] = [sub_book.get(item, item) for item in i]

        del sub_book
        gc.collect()

        return dataset

    def word_delete(self, dataset, del_list):
        dataset = copy.deepcopy(dataset)

        for n, line in enumerate(dataset):
            dataset[n] = [i for i in line if i not in del_list]

        return dataset

    def isHangul(self, text):
        encText = text
        hanCount = len(re.findall(u'[\u3130-\u318F\uAC00-\uD7A3]+', encText))
        return hanCount > 0

    def convert_list(self, *tokenized_list):
        input_length = len(tokenized_list)
        lists = [[] for i in range(input_length)]

        for idx, li in enumerate(tokenized_list):
            for j in li:
                lists[idx].append(['/'.join(j)])

        converted_array = np.array(lists[0])
        for extra in lists[1:]:
            converted_array = np.concatenate(
                (converted_array, np.array(extra)), axis=1)

        return converted_array

    def make_df(self, converted_array):
        df = pd.DataFrame(np.hstack(
            (self.raw_data[:, :3], converted_array, self.raw_data[:, 3:])),
                          index=None)
        return df

    # True if at least one of the keywords appears in the text
    def word_check_or(self, text, keywords):
        return any(word in text for word in keywords)

    # True if every keyword in the list appears in the text
    def word_check_and(self, text, keywords):
        return all(word in text for word in keywords)

    def word_check(self,
                   method='and',
                   keywords=[],
                   df=None,
                   column_name=None,
                   filter_TF=True):
        if method == 'and':
            df['flag'] = df[column_name].apply(
                lambda x: self.word_check_and(x, keywords))
            return df[df.flag == filter_TF]

        if method == 'or':
            df['flag'] = df[column_name].apply(
                lambda x: self.word_check_or(x, keywords))
            return df[df.flag == filter_TF]
from konlpy.tag import Komoran
from konlpy.tag import Okt
from ckonlpy.tag import Twitter
okt = Okt()
twitter = Twitter()

sentence = 'IBK기업은행 '
sentences = '소은지국민은행계좌로30만원이체해줘'
komoran = Komoran()

twitter.add_dictionary('이체해줘', 'Noun')
twitter.add_dictionary('KB 국민은행', 'Noun')

komoran = Komoran(userdic="C:/Users/ADMIN/Desktop/dic.txt")

print(twitter.pos(sentence, stem=True))
print(twitter.pos(sentences, stem=True))

print(komoran.pos(sentence))
print(komoran.pos(sentences))

arr = komoran.pos(sentence)
for word, tag in arr:
    if (tag == 'VV'): print("|||||||")
    print(word, tag)
    if (tag == 'JKO' or tag == 'JKB' or tag == 'JKS'): print("|||||||")

brr = komoran.pos(sentences)
for word, tag in brr:

    if (tag == 'VV' or tag == 'XSV'):