예제 #1
0
def tokenizer_test():
    """Smoke-test soynlp's RegexTokenizer, LTokenizer and MaxScoreTokenizer.

    Each tokenizer is run on a fixed sample string and its output is compared
    with a hard-coded expected token list.  A success line is printed once
    every comparison has passed.

    Raises:
        ValueError: On the first mismatch; the message shows the call that
            was made and the tokens it actually produced.
    """
    from soynlp.tokenizer import LTokenizer
    from soynlp.tokenizer import MaxScoreTokenizer
    from soynlp.tokenizer import RegexTokenizer

    def _expect(actual, expected, template):
        # Fail loudly with the actual tokens substituted into the template.
        if actual != expected:
            raise ValueError(template.format(actual))

    regex_tokenizer = RegexTokenizer()
    _expect(
        regex_tokenizer.tokenize('아라랄랄111이히힝ㅇㅇㅠㅠ우유우유ab!'),
        ['아라랄랄', '111', '이히힝', 'ㅇㅇ', 'ㅠㅠ', '우유우유', 'ab', '!'],
        "regex_tokenizer.tokenize('아라랄랄111이히힝ㅇㅇㅠㅠ우유우유ab!') == {}",
    )

    ltokenizer = LTokenizer({'데이터': 0.4, '데이': 0.35, '데이터센터': 0.38})
    _expect(
        ltokenizer.tokenize('데이터는 데이터센터의 데이데이'),
        ['데이터', '는', '데이터', '센터의', '데이', '데이'],
        "ltokenizer.tokenize('데이터는 데이터센터의 데이데이') == {}",
    )
    _expect(
        ltokenizer.tokenize('데이터는 데이터센터의 데이데이', tolerance=0.05),
        ['데이터', '는', '데이터센터', '의', '데이', '데이'],
        "ltokenizer.tokenize('데이터는 데이터센터의 데이데이', tolerance=0.05) == {}",
    )

    maxscore_tokenizer = MaxScoreTokenizer({'데이터': 0.4, '데이': 0.35, '데이터센터': 0.38})
    _expect(
        maxscore_tokenizer.tokenize('데이터는 데이터센터의 데이데이'),
        ['데이터', '는', '데이터', '센터의', '데이', '데이'],
        "maxscore_tokenizer.tokenize('데이터는 데이터센터의 데이데이') == {}",
    )

    print('all tokenizer tests have been successed\n\n')
예제 #2
0
def clean_csv(dataset_file_dir, merged_file_save_path, ignore_list):
    """Merge every CSV in a directory into one tokenized, filtered CSV.

    Every row of every ``*.csv`` file directly inside ``dataset_file_dir`` is
    converted with ``str(row)``, tokenized with ``RegexTokenizer`` and
    filtered through ``is_valid_word``.  One output row is written per input
    row, containing the tokens that survived the filter.

    Args:
        dataset_file_dir: Directory whose ``.csv`` files are read.
        merged_file_save_path: Path of the CSV file to (over)write, UTF-8.
        ignore_list: Passed through unchanged to ``is_valid_word``.
    """
    sentence_list = []
    for filepath in os.listdir(dataset_file_dir):
        if not filepath.endswith(".csv"):
            continue
        entire_path = os.path.join(dataset_file_dir, filepath)
        # NOTE(review): input is read with the platform default encoding while
        # the output is written as UTF-8 -- confirm the source files' encoding.
        with open(entire_path, newline="") as word_file:
            sentence_list.extend(csv.reader(word_file))

    # Tokenize everything before touching the output file, so a failure here
    # does not leave a truncated result behind.
    tokenizer = RegexTokenizer()
    tokenized_sentence_list = [
        [
            elem for elem in tokenizer.tokenize(str(sentence))
            if is_valid_word(elem, ignore_list)
        ]
        for sentence in sentence_list
    ]

    # The context manager closes the file even if a write raises.
    with open(merged_file_save_path, 'w', encoding='utf-8',
              newline='') as merged_file:
        csv.writer(merged_file).writerows(tokenized_sentence_list)
예제 #3
0
    def convert_to_vector_list(self, ignore_list, model_length, sentence):
        """Turn a sentence into a fixed-length array of word vectors.

        The sentence is tokenized with ``RegexTokenizer``, filtered through
        ``csv_reader.is_valid_word``, and each remaining token is looked up in
        the keyed vectors loaded from ``self.key_vector_path``.  The resulting
        sequence of vectors is repeated cyclically until it holds at least
        ``model_length`` entries and is then cut to exactly that length.

        Args:
            ignore_list: Passed through unchanged to
                ``csv_reader.is_valid_word``.
            model_length: Number of vectors the returned array must contain.
            sentence: Input text; converted with ``str()`` before tokenizing.

        Returns:
            ``np.array`` built from the padded/truncated list of vectors.

        Raises:
            ValueError: If no token survives filtering while
                ``model_length`` is positive (the sequence cannot be padded).
        """
        tokenizer = RegexTokenizer()
        tokenized_sentence = tokenizer.tokenize(str(sentence))
        print(self.key_vector_path)
        kv = KeyedVectors.load(self.key_vector_path, mmap='r')
        clean_sentence = [
            elem for elem in tokenized_sentence
            if csv_reader.is_valid_word(elem, ignore_list)
        ]

        # NOTE(review): the fallback length 100 is hard-coded; presumably it
        # matches the dimensionality of the stored vectors -- confirm.
        unknown_vector = [1] * 100

        vector = []
        for elem in clean_sentence:
            try:
                array = kv[elem]
            except KeyError:
                # Token is not in the vocabulary: use the placeholder vector.
                array = unknown_vector
            vector.append(array)

        if not vector and model_length > 0:
            # Repeating an empty list never reaches model_length; the padding
            # loop below would spin forever, so fail explicitly instead.
            raise ValueError(
                "no valid tokens found in sentence; cannot build a vector "
                "list of length {}".format(model_length))

        vector_list = []
        while len(vector_list) < model_length:
            vector_list += vector

        if len(vector_list) > model_length:
            vector_list = vector_list[:model_length]
        return np.array(vector_list)
예제 #4
0
class RegexTokenizerKorean(SpecialTokenizer):
    """Callable adapter around soynlp's ``RegexTokenizer``.

    Calling an instance tokenizes its first positional argument; any further
    positional or keyword arguments are accepted but not used.
    """

    def __init__(self):
        # The import is local to __init__, so soynlp is loaded only when an
        # instance is created.
        from soynlp.tokenizer import RegexTokenizer

        # NOTE(review): OUT_TYPE is presumably read by SpecialTokenizer or the
        # surrounding framework -- confirm how it is consumed.
        self.OUT_TYPE = [list, str]
        self.inst = RegexTokenizer()

    def __call__(self, *args, **kwargs):
        text = args[0]
        return self.inst.tokenize(text)
예제 #5
0
def tokenizer_test():
    """Check three soynlp tokenizers against hard-coded expected tokens.

    Runs RegexTokenizer, LTokenizer (with and without ``tolerance``) and
    MaxScoreTokenizer on fixed sample strings, then prints a success line.

    Raises:
        ValueError: As soon as one tokenizer output differs from the expected
            list; the message contains the call and the actual tokens.
    """
    from soynlp.tokenizer import LTokenizer
    from soynlp.tokenizer import MaxScoreTokenizer
    from soynlp.tokenizer import RegexTokenizer

    sample = '데이터는 데이터센터의 데이데이'
    split_expected = ['데이터', '는', '데이터', '센터의', '데이', '데이']

    # --- RegexTokenizer ---
    regex_tokenizer = RegexTokenizer()
    regex_tokens = regex_tokenizer.tokenize('아라랄랄111이히힝ㅇㅇㅠㅠ우유우유ab!')
    if regex_tokens != ['아라랄랄', '111', '이히힝', 'ㅇㅇ', 'ㅠㅠ', '우유우유', 'ab', '!']:
        message = "regex_tokenizer.tokenize('아라랄랄111이히힝ㅇㅇㅠㅠ우유우유ab!') == {}"
        raise ValueError(message.format(regex_tokens))

    # --- LTokenizer, default tolerance ---
    ltokenizer = LTokenizer({'데이터': 0.4, '데이': 0.35, '데이터센터': 0.38})
    l_tokens = ltokenizer.tokenize(sample)
    if l_tokens != split_expected:
        message = "ltokenizer.tokenize('데이터는 데이터센터의 데이데이') == {}"
        raise ValueError(message.format(l_tokens))

    # --- LTokenizer, tolerance=0.05 ---
    l_tokens_tolerant = ltokenizer.tokenize(sample, tolerance=0.05)
    if l_tokens_tolerant != ['데이터', '는', '데이터센터', '의', '데이', '데이']:
        message = "ltokenizer.tokenize('데이터는 데이터센터의 데이데이', tolerance=0.05) == {}"
        raise ValueError(message.format(l_tokens_tolerant))

    # --- MaxScoreTokenizer ---
    maxscore_tokenizer = MaxScoreTokenizer(
        {'데이터': 0.4, '데이': 0.35, '데이터센터': 0.38})
    maxscore_tokens = maxscore_tokenizer.tokenize(sample)
    if maxscore_tokens != split_expected:
        message = "maxscore_tokenizer.tokenize('데이터는 데이터센터의 데이데이') == {}"
        raise ValueError(message.format(maxscore_tokens))

    print('all tokenizer tests have been successed\n')
예제 #6
0
    def prepare_corpus(self, ignore_list, model_length, corpus_path):
        """Build training inputs and one-hot labels from a labelled CSV corpus.

        Each CSV row supplies a sentence in column 0 and a label in column 1.
        The sentence is tokenized, filtered through
        ``csv_reader.is_valid_word`` and mapped to vectors with
        ``get_vector``.  Rows that yield no vectors are skipped; the others
        are repeated cyclically and cut to exactly ``model_length`` vectors.

        Args:
            ignore_list: Passed through unchanged to
                ``csv_reader.is_valid_word``.
            model_length: Number of vectors per sample.
            corpus_path: Path of the CSV corpus file.

        Returns:
            Tuple ``(train_input, train_label)`` of two parallel lists of
            ``np.array`` objects.
        """
        tokenizer = RegexTokenizer()
        myw2v = w2v.word2vec(self.model_path)
        myw2v.load_keyvector(self.key_vector_path)

        samples = []
        targets = []
        with open(corpus_path, newline='') as corpus_file:
            for row in csv.reader(corpus_file):
                sentence, label = row[0], row[1]

                # '1' marks uncensored data; anything else is censored data.
                one_hot = [1, 0] if label == '1' else [0, 1]

                words = [
                    token for token in tokenizer.tokenize(str(sentence))
                    if csv_reader.is_valid_word(token, ignore_list)
                ]
                vector = [myw2v.get_vector(word) for word in words]
                print("length: " + str(len(vector)))

                if not vector:
                    continue

                # Cycle the sentence's vectors until the target length is
                # reached, then drop the overshoot.
                padded = []
                while len(padded) < model_length:
                    padded.extend(vector)
                padded = padded[:model_length]

                samples.append(np.array(padded))
                targets.append(np.array(one_hot))

        return (samples, targets)
예제 #7
0
def word2vec(user_file='./review_01_0005_72378155.txt'):
    """Train a Word2Vec model on a review file and answer similarity queries.

    Lines of ``user_file`` are read (trailing whitespace/newline stripped,
    lines containing the ``-----------------`` separator skipped) until the
    file ends or more than 5000 lines have been collected.  The lines are
    tokenized with ``RegexTokenizer`` and used to train a skip-gram model.
    The function then loops on ``input()``, printing the five most similar
    words for each query, until an empty line is entered.

    Args:
        user_file: Path of the UTF-8 text file to train on.
    """
    tokenizer = RegexTokenizer()
    sents = []

    # The context manager closes the file, and iterating it stops at EOF
    # instead of appending empty strings until the line limit is reached.
    with open(user_file, 'r', encoding='UTF-8', newline='') as file:
        for line in file:
            line = re.sub(r'\s*\n', '', line)

            if "-----------------" not in line:
                sents.append(line)
            if len(sents) > 5000:
                break

    tokenized_contents = [
        tokenizer.tokenize(sent, flatten=True) for sent in sents
    ]

    # NOTE(review): `size`/`iter` and `model.most_similar` are the keyword and
    # method names this code was written against -- confirm they match the
    # installed gensim version.
    embedding_model = Word2Vec(tokenized_contents,
                               size=100,
                               window=5,
                               min_count=2,
                               workers=4,
                               iter=100,
                               sg=1)
    while True:
        print("User input : ")
        user_input = input()
        # Compare by value; identity (`is`) against a literal is unreliable.
        if user_input == "":
            break
        try:
            result = embedding_model.most_similar(positive=[user_input],
                                                  topn=5)
            for elem in result:
                print(elem)
        except Exception:
            # Best-effort interactive loop: any lookup failure (e.g. a word
            # outside the vocabulary) is reported and the loop continues.
            print("ERROR : 결과가 없습니다.")
예제 #8
0
from soynlp.tokenizer import RegexTokenizer
import konlpy

# Instantiate the two tokenizers being compared: konlpy's Mecab tagger and
# soynlp's RegexTokenizer.
tok = konlpy.tag.Mecab()
tokenizer = RegexTokenizer()

# Print each tokenizer's output for a short sample sentence.
print(tok.morphs('동일하게 테스트 중입니다'))
print(tokenizer.tokenize('테스트 중이다'))
def Tokenize(data):
    """Tokenize every string in *data* and re-join its tokens with spaces.

    Args:
        data: Iterable of strings.

    Returns:
        List with one space-joined token string per input element.
    """
    splitter = RegexTokenizer()
    return [' '.join(splitter.tokenize(text)) for text in data]
import sqlite3
import token_word_judge
import os
import sys
import json
# First command-line argument: the user name.
user_name = sys.argv[1]

# Second command-line argument: the word(s) entered by the user.
user_input_word = sys.argv[2]

# input_number = input('여기에 숫자를 입력하세요 : ')
# Create the RegexTokenizer object.
tokenizer = RegexTokenizer()

# Tokenize the user's input.
new_token_list = tokenizer.tokenize(user_input_word)

# Also produce an English version of the token list.
# NOTE(review): presumably google_translator returns the tokens together with
# their English translations -- confirm against token_word_judge.
final_token_list = token_word_judge.google_translator(new_token_list)

# 0.1초 이거만하면
# print(new_token_list)

# final_token_list = []

# # 명사만 추출하는 code
# for new_token in new_token_list:
#     # str인 것은 모두 영어이거나, 한글 단독 명사다.
#     if type(token_word_judge.token_judge_en_lower_ko_noun(new_token)) == str:
#         final_token_list.append(token_word_judge.token_judge_en_lower_ko_noun(new_token))
#     # list 인 것은 명사가 여러개인 것이 list로 묶인다.
예제 #11
0
# In[40]:

# Wrap the collected reviews in a single-column DataFrame.
df = pd.DataFrame(review_list, columns=['review'])

# In[41]:

from soynlp.tokenizer import RegexTokenizer, LTokenizer, MaxScoreTokenizer

tokenizer = RegexTokenizer()
# Bare expression: has an effect only as notebook cell output.
tokenizer

# In[42]:

# Tokenize every review and store the token lists in a new column.
parsed_list = []
for i in df['review']:
    temp = tokenizer.tokenize(i)
    parsed_list.append(temp)

df['review_parsed'] = parsed_list
#print(df)

# In[43]:

# Tokens that are dropped from every tokenized review.
STOP_WORDS = ['.', '(', ')', '!', '[', ']', '▣', '※']

# In[44]:


def remove_stopwords(tokens):
    """Return the tokens of *tokens* that are not listed in STOP_WORDS.

    The input is not modified; order and duplicates are preserved.
    """
    kept = []
    for token in tokens:
        if token in STOP_WORDS:
            continue
        kept.append(token)
    return kept
def db_sentence_2_token_list(database_history_all_users_data_list):
    """Expand history rows into one output row per title token.

    Each input row is unpacked as ``(url, title, visit_count, user_count)``.
    The title is split with ``RegexTokenizer`` and every token is classified
    with ``kor_or_eng_judge``:

    * ``'en'`` -- one row is emitted whose first element is the value
      returned by ``token_judge_en_lower_ko_noun(token)``.
    * ``'ko'`` -- ``token_judge_en_lower_ko_noun(token)`` is treated as a
      sequence and one row is emitted per element (none if it is empty).
    * anything else -- the token is dropped.

    Example of the output shape::

        [['computer', 'https://www.youtube.com/', 36, 3],
         ['youtube', 'https://www.youtube.com/', 36, 3],
         ['채널', 'https://www.youtube.com/', 36, 3],
         ['확인', 'https://www.youtube.com/', 36, 3]]

    Args:
        database_history_all_users_data_list: Iterable of 4-item rows
            ``(url, title, visit_count, user_count)``.

    Returns:
        List of ``[token, url, visit_count, user_count]`` lists.
    """
    tokenizer = RegexTokenizer()

    result = []
    # Read the database rows one at a time.
    for line in database_history_all_users_data_list:
        url, title, visit_count, user_count = line

        # Split the title text into words; an empty title yields no tokens
        # and therefore contributes nothing.
        for word in tokenizer.tokenize(title):
            judgement = kor_or_eng_judge(word)

            if judgement == 'en':
                # English token: store the converted form as a single row.
                result.append([
                    token_judge_en_lower_ko_noun(word), url, visit_count,
                    user_count
                ])
            elif judgement == 'ko':
                # Korean token: the helper is called once and each element of
                # its result becomes its own row (zero, one or many).
                for token_noun in token_judge_en_lower_ko_noun(word):
                    result.append(
                        [token_noun, url, visit_count, user_count])
            # Any other judgement (neither English nor Korean) is skipped.

    return result
예제 #13
0
from soynlp.tokenizer import RegexTokenizer
from soynlp.noun import LRNounExtractor
# import pandas as pd
# import numpy as np
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import re

# Sample review text used to try out the tokenizer.
content = '밥도둑 지코바 양념치킨~~ 한번 우동 사리를 추가 해 먹어봤어요 강추드려요!!!! 지코바는 마무리로 치밥인거 알죠?'

# Tokenize the sample with soynlp's RegexTokenizer and print the tokens.
tokenizer = RegexTokenizer()
tokened_content = tokenizer.tokenize(content)
print(tokened_content)

# def preprocessing(text):
#     text = re.sub('\\\\n', ' ', text)
#     return text

# sentences = care['content'].apply(preprocessing)
# sentences = preprocessing(content)
# tokens = tokenizer.tokenize(content)
# print(tokens)
# print(sentences)

# Font properties built from a NanumBarunGothic TTF file at a fixed system
# path, size 9.
fontpath = '/usr/share/fonts/truetype/nanum/NanumBarunGothic.ttf'
font = fm.FontProperties(fname=fontpath, size=9)

stopwords_kr = [
    '하지만', '그리고', '그런데', '저는', '제가', '그럼', '이런', '저런', '합니다', '많은', '많이', '정말',
    '너무'
예제 #14
0
# Step 3. Tokenization.  Reference: https://linguistech.tistory.com/13
# NOTE(review): the path '.blogreview.json' has no separator after the dot;
# confirm whether './blogreview.json' was intended.
blog = pd.read_json('.blogreview.json')
# print(df.count())
# print(df['title'])

from soynlp.tokenizer import RegexTokenizer, LTokenizer, MaxScoreTokenizer

tokenizer = RegexTokenizer()

## Tokenization test ##
# Tokenize the title and description of one sample row.
sample_index = 5
sample_title = blog['title'][sample_index]
sample_description = blog['description'][sample_index]
# print(sample_title)
# print(sample_description)
tokened_title = tokenizer.tokenize(sample_title)
tokened_description = tokenizer.tokenize(sample_description)

# print(tokened_description)


##개행문자 제거##
def preprocessing(text):
    """Replace escaped newlines in *text* with a single space.

    The pattern matches the two-character sequence "backslash followed by
    the letter n"; real newline characters are left untouched.
    """
    return re.sub('\\\\n', ' ', text)


## Strip escaped newlines ##
# Run preprocessing over every title and every description.
title_sentences = blog['title'].apply(preprocessing)
description_sentences = blog['description'].apply(preprocessing)
# print(title_sentences)
예제 #15
0
def review_cr(urll):
    """Scrape the reviews on a page and return its 20 most common tokens.

    The page is opened in Chrome through Selenium, scrolled to the bottom and
    given ten seconds to load.  The text of every ``div`` with class
    ``_czm8crp`` inside the ``div`` with id ``reviews`` is tokenized with
    soynlp's ``RegexTokenizer``; a small set of punctuation stop words is
    removed and the remaining tokens are counted.

    The collected review texts and the final counts are also printed.

    Args:
        urll: URL of the page to scrape.

    Returns:
        List of ``(token, count)`` tuples for the 20 most frequent tokens, as
        produced by ``Counter.most_common(20)``.  The list is empty when the
        page has no ``reviews`` container or no usable review text.
    """
    import time
    from collections import Counter

    from bs4 import BeautifulSoup
    from selenium import webdriver
    from soynlp.tokenizer import RegexTokenizer

    stop_words = {'.', '(', ')', '!', '[', ']', '▣', '※'}

    driver = webdriver.Chrome(
        'C:/Users/multicampus/PycharmProjects/airbnb_bot/chromedriver')
    try:
        driver.implicitly_wait(3)
        driver.get(urll)
        driver.execute_script(
            "window.scrollTo(0, document.body.scrollHeight);")
        # Give lazily loaded reviews time to appear after scrolling.
        time.sleep(10)
        page_source = driver.page_source
    finally:
        # Always shut the browser down; otherwise every call leaves a Chrome
        # process running.
        driver.quit()

    soup = BeautifulSoup(page_source, 'html.parser')

    # find() returns None when the container is missing; treat that as
    # "no reviews" instead of failing with AttributeError.
    container = soup.find('div', {'id': 'reviews'})
    if container is None:
        review_nodes = []
    else:
        review_nodes = container.findAll('div', {'class': '_czm8crp'})

    # `.string` is None for nodes without a single text child; such nodes
    # cannot be tokenized and are skipped.
    review_list = [
        node.string for node in review_nodes if node.string is not None
    ]
    print(review_list)

    tokenizer = RegexTokenizer()
    tokens = [
        token
        for review in review_list
        for token in tokenizer.tokenize(review)
        if token not in stop_words
    ]

    counter = Counter(tokens).most_common(20)

    print(counter)
    return counter