예제 #1
0
def clean_csv(dataset_file_dir, merged_file_save_path, ignore_list):
    """Merge every CSV in a directory into one tokenized, filtered CSV.

    Every row of every ``*.csv`` file directly inside ``dataset_file_dir`` is
    converted with ``str()``, tokenized with ``RegexTokenizer`` and filtered
    through ``is_valid_word``. The surviving tokens of each input row become
    one row of the output file.

    Args:
        dataset_file_dir: Directory scanned (non-recursively) for ``.csv``
            files.
        merged_file_save_path: Path of the merged CSV; it is overwritten and
            written as UTF-8.
        ignore_list: Forwarded unchanged to ``is_valid_word`` for each token.
    """
    # Stage 1: read all input rows before the output is opened, so an output
    # path that lives inside ``dataset_file_dir`` is not truncated first.
    rows = []
    for filename in os.listdir(dataset_file_dir):
        if not filename.endswith(".csv"):
            continue
        entire_path = os.path.join(dataset_file_dir, filename)
        with open(entire_path, newline="") as word_file:
            rows.extend(csv.reader(word_file))

    # Stage 2: tokenize the string form of each row (a list of cells) and
    # keep only the tokens accepted by is_valid_word.
    tokenizer = RegexTokenizer()
    cleaned_rows = [
        [
            token for token in tokenizer.tokenize(str(row))
            if is_valid_word(token, ignore_list)
        ]
        for row in rows
    ]

    # Stage 3: the context manager closes the file even if a write fails.
    with open(merged_file_save_path, 'w', encoding='utf-8',
              newline='') as merged_file:
        csv.writer(merged_file).writerows(cleaned_rows)
예제 #2
0
    def convert_to_vector_list(self, ignore_list, model_length, sentence):
        """Convert a sentence into exactly ``model_length`` word vectors.

        The sentence is tokenized with ``RegexTokenizer``, filtered through
        ``csv_reader.is_valid_word`` and each remaining token is looked up in
        the key vectors loaded from ``self.key_vector_path``. The resulting
        vector sequence is repeated and then truncated so that the output has
        ``model_length`` entries.

        Args:
            ignore_list: Forwarded to ``csv_reader.is_valid_word``.
            model_length: Number of vectors in the returned array.
            sentence: Input text; it is passed through ``str()`` first.

        Returns:
            ``np.array`` built from the list of ``model_length`` vectors.

        Raises:
            ValueError: If no token survives filtering while
                ``model_length`` is positive (there is nothing to repeat).
        """
        tokenizer = RegexTokenizer()
        tokenized_sentence = tokenizer.tokenize(str(sentence))
        print(self.key_vector_path)
        kv = KeyedVectors.load(self.key_vector_path, mmap='r')
        clean_sentence = [
            elem for elem in tokenized_sentence
            if csv_reader.is_valid_word(elem, ignore_list)
        ]

        vector = []
        for elem in clean_sentence:
            try:
                array = kv[elem]
            except KeyError:
                # Token missing from the key vectors: use an all-ones
                # placeholder.
                # NOTE(review): 100 presumably equals the embedding size of
                # the loaded vectors - confirm.
                array = [1] * 100
            vector.append(array)

        # With nothing to repeat, the fill loop below could never reach
        # model_length and would spin forever.
        if not vector and model_length > 0:
            raise ValueError(
                "sentence has no valid tokens to convert into vectors")

        vector_list = []
        while len(vector_list) < model_length:
            vector_list += vector

        return np.array(vector_list[:model_length])
예제 #3
0
    def prepare_corpus(self, ignore_list, model_length, corpus_path):
        """Build training inputs and one-hot labels from a labelled CSV corpus.

        Each CSV row supplies a sentence in column 0 and a label in column 1.
        A label of ``'1'`` becomes ``[1, 0]``; any other value becomes
        ``[0, 1]``. The sentence is tokenized, filtered with
        ``csv_reader.is_valid_word`` and mapped to vectors with the loaded
        word2vec wrapper. Rows that yield no vectors are skipped; otherwise
        the vector sequence is repeated and cut to ``model_length`` entries.

        Args:
            ignore_list: Forwarded to ``csv_reader.is_valid_word``.
            model_length: Number of vectors per sample.
            corpus_path: Path of the CSV corpus file.

        Returns:
            Tuple ``(train_input, train_label)`` of two parallel lists of
            ``np.array`` objects.
        """
        regex_tokenizer = RegexTokenizer()
        samples = []
        targets = []
        embedder = w2v.word2vec(self.model_path)
        embedder.load_keyvector(self.key_vector_path)

        with open(corpus_path, newline='') as corpus_file:
            for row in csv.reader(corpus_file):
                sentence, raw_label = row[0], row[1]

                # '1' marks uncensored data; everything else is censored.
                one_hot = [1, 0] if raw_label == '1' else [0, 1]

                word_vectors = [
                    embedder.get_vector(token)
                    for token in regex_tokenizer.tokenize(str(sentence))
                    if csv_reader.is_valid_word(token, ignore_list)
                ]
                print("length: " + str(len(word_vectors)))

                if not word_vectors:
                    continue

                # Ceiling division gives the number of whole repetitions
                # needed to reach model_length; the slice drops the excess.
                repeats = -(-model_length // len(word_vectors))
                fixed_length = (word_vectors * repeats)[:model_length]

                samples.append(np.array(fixed_length))
                targets.append(np.array(one_hot))

        return (samples, targets)
예제 #4
0
def word2vec(user_file='./review_01_0005_72378155.txt'):
    """Train a skip-gram Word2Vec model on a text file and query it.

    Lines of ``user_file`` are read with trailing whitespace and newline
    removed. Lines containing the ``-----------------`` separator are
    dropped, and reading stops once more than 5000 sentences have been
    collected or the file ends. After training, the function repeatedly reads
    a word from standard input and prints its five most similar words. An
    empty input ends the loop.

    Args:
        user_file: Path of the UTF-8 text file used as the training corpus.
    """
    tokenizer = RegexTokenizer()
    sents = []

    # Iterating the file stops at EOF; the context manager closes the handle.
    with open(user_file, 'r', encoding='UTF-8', newline='') as file:
        for line in file:
            line = re.sub(r'\s*\n', '', line)

            if "-----------------" not in line:
                sents.append(line)
            if len(sents) > 5000:
                break

    tokenized_contents = [
        tokenizer.tokenize(sent, flatten=True) for sent in sents
    ]

    embedding_model = Word2Vec(tokenized_contents,
                               size=100,
                               window=5,
                               min_count=2,
                               workers=4,
                               iter=100,
                               sg=1)
    while True:
        print("User input : ")
        user_input = input()
        # Compare by value: identity with a string literal is
        # implementation-dependent.
        if user_input == "":
            break

        try:
            result = embedding_model.most_similar(positive=[user_input],
                                                  topn=5)
        except Exception:
            # Best-effort interactive loop: report the failure and keep
            # prompting.
            print("ERROR : 결과가 없습니다.")
        else:
            for elem in result:
                print(elem)
예제 #5
0
def tokenizer_test():
    """Check soynlp's three tokenizers against fixed expected outputs.

    Raises:
        ValueError: If any tokenizer returns something other than the
            expected token list; the message includes the actual output.
    """
    from soynlp.tokenizer import LTokenizer
    from soynlp.tokenizer import MaxScoreTokenizer
    from soynlp.tokenizer import RegexTokenizer

    def _expect(run, expected, message_template):
        # Re-run the tokenizer for the error message, as the checks always
        # have.
        if run() == expected:
            return
        raise ValueError(message_template.format(run()))

    sample = '데이터는 데이터센터의 데이데이'
    split_tokens = ['데이터', '는', '데이터', '센터의', '데이', '데이']

    regex_tokenizer = RegexTokenizer()
    _expect(
        lambda: regex_tokenizer.tokenize('아라랄랄111이히힝ㅇㅇㅠㅠ우유우유ab!'),
        ['아라랄랄', '111', '이히힝', 'ㅇㅇ', 'ㅠㅠ', '우유우유', 'ab', '!'],
        "regex_tokenizer.tokenize('아라랄랄111이히힝ㅇㅇㅠㅠ우유우유ab!') == {}",
    )

    ltokenizer = LTokenizer({'데이터': 0.4, '데이': 0.35, '데이터센터': 0.38})
    _expect(
        lambda: ltokenizer.tokenize(sample),
        split_tokens,
        "ltokenizer.tokenize('데이터는 데이터센터의 데이데이') == {}",
    )
    _expect(
        lambda: ltokenizer.tokenize(sample, tolerance=0.05),
        ['데이터', '는', '데이터센터', '의', '데이', '데이'],
        "ltokenizer.tokenize('데이터는 데이터센터의 데이데이', tolerance=0.05) == {}",
    )

    maxscore_tokenizer = MaxScoreTokenizer({
        '데이터': 0.4,
        '데이': 0.35,
        '데이터센터': 0.38
    })
    _expect(
        lambda: maxscore_tokenizer.tokenize(sample),
        split_tokens,
        "maxscore_tokenizer.tokenize('데이터는 데이터센터의 데이데이') == {}",
    )

    print('all tokenizer tests have been successed\n')
예제 #6
0
from soynlp.tokenizer import RegexTokenizer
import konlpy

# Build one tokenizer of each kind when the module is executed.
tok = konlpy.tag.Mecab()
tokenizer = RegexTokenizer()

# Print what each tokenizer produces for a sample sentence.
_mecab_morphs = tok.morphs('동일하게 테스트 중입니다')
print(_mecab_morphs)
_regex_tokens = tokenizer.tokenize('테스트 중이다')
print(_regex_tokens)
def Tokenize(data):
    """Tokenize each string in ``data`` and re-join its tokens with spaces.

    Args:
        data: Iterable of strings.

    Returns:
        List with one space-joined token string per input item.
    """
    regex_tokenizer = RegexTokenizer()
    return [' '.join(regex_tokenizer.tokenize(text)) for text in data]
예제 #8
0
def detail(m_no, current_movie_title):
    """Render the movie detail page and refresh its review word cloud.

    The movie, its joined ``test`` row and its board posts are read from the
    ``movie`` MySQL database. When the joined row exists, review link texts
    are scraped from Naver Movie review pages 1-199, nouns are extracted with
    soynlp and a word cloud is written to
    ``IT_Bank_Movie/static/wordcloud/<current_movie_title>.png``. If no
    review text is scraped, the word cloud step is skipped and the page is
    still rendered.

    Args:
        m_no: Value matched against ``board.m_no``.
        current_movie_title: Title matched against
            ``current_movie.current_movie_title``; also used as the word
            cloud file name.

    Returns:
        The result of rendering ``movie_detail.html`` with ``wordInfo``,
        ``board`` and ``movieInfo``.
    """
    conn = pymysql.connect(host='127.0.0.1',
                           user='******',
                           password='******',
                           db='movie',
                           charset='utf8mb4',
                           cursorclass=pymysql.cursors.DictCursor)
    try:
        with conn.cursor() as cursor:
            sql = 'select * from current_movie c inner join test t on c.current_movie_title = t.title where current_movie_title = %s;'
            cursor.execute(sql, (current_movie_title,))
            result = cursor.fetchone()  # only a single row is needed

            sql = 'select * from current_movie where current_movie_title = %s;'
            cursor.execute(sql, (current_movie_title,))
            result1 = cursor.fetchone()  # only a single row is needed

            sql = 'select * from board where m_no= %s;'
            cursor.execute(sql, (m_no,))
            board = cursor.fetchall()
    finally:
        conn.close()

    if result is not None:
        tmrvl = []
        movie_code = result['codem']

        for page in range(1, 200):
            url = "https://movie.naver.com/movie/bi/mi/review.nhn?code=" + str(
                movie_code) + "&page=" + str(page)
            # The context manager releases the HTTP connection once the page
            # has been parsed.
            with urllib.request.urlopen(url) as response:
                soup = BeautifulSoup(response, 'html.parser')

            for anchor in soup.select('ul.rvw_list_area li a'):
                # NOTE(review): ``anchor.string`` is None for anchors with
                # nested markup, which yields the literal text 'None'.
                tmrvl.append(tuple([str(anchor.string)]))

        # An empty scrape would give a DataFrame without column 0, so the
        # word cloud is only built when at least one review was collected.
        if tmrvl:
            df = pd.DataFrame(tmrvl)

            def preprocessing(text):
                # Replace the two-character sequence backslash + 'n' with a
                # space.
                text = re.sub('\\\\n', ' ', text)
                return text

            stopwords_kr = [
                '하지만', '그리고', '그런데', '저는', '제가', '그럼', '이런', '저런', '합니다', '많은',
                '많이', '정말', '너무', '[', ']', '것으로', '했습니다', '했다'
            ]

            sentences = df[0].apply(preprocessing)

            # Extract nouns with soynlp.
            noun_extractor = LRNounExtractor(verbose=True)
            noun_extractor.train(sentences)
            nouns = noun_extractor.extract()

            # Use the image file as the word cloud mask.
            img = Image.open('IT_Bank_Movie/static/img/cloud.png')
            img_array = np.array(img)

            wordcloud = WordCloud(
                font_path='/Library/Fonts/NanumBarunGothic.ttf',
                stopwords=stopwords_kr,
                background_color='white',
                mask=img_array,
                width=800,
                height=600).generate(' '.join(nouns))

            # pyplot keeps every figure alive until it is closed; close it so
            # repeated requests do not accumulate figures.
            fig = plt.figure(figsize=(15, 10))
            try:
                plt.imshow(wordcloud)
                plt.axis("off")
            finally:
                plt.close(fig)

            url1 = ("IT_Bank_Movie/static/wordcloud/" + current_movie_title +
                    ".png")
            wordcloud.to_file(url1)

    return render_template('movie_detail.html',
                           wordInfo=result,
                           board=board,
                           movieInfo=result1)
예제 #9
0
 def __init__(self):
     """Create the wrapped ``RegexTokenizer`` and record the output types."""
     # Function-scope import: soynlp is loaded when an instance is built.
     from soynlp.tokenizer import RegexTokenizer
     regex_tokenizer = RegexTokenizer()
     self.inst = regex_tokenizer
     self.OUT_TYPE = [list, str]
예제 #10
0
    def __init__(self, root, phase='train'):
        """Load ``<phase>_hate.txt`` and encode its comments as index rows.

        The file is read as tab-separated lines and column 1 is taken as the
        comment text. Listed punctuation, symbols and single Hangul jamo are
        stripped, the text is split into morphemes with Komoran, stopwords
        are removed, and each morpheme is mapped to an integer index. Every
        row is padded or truncated to a fixed length of 74 and the rows are
        stored as a ``torch.LongTensor``.

        Args:
            root: Directory containing ``<phase>_hate.txt``.
            phase: File name prefix. For any value other than ``'test'``,
                columns 2 and 3 of each line are read as well.

        Attributes set:
            root, phase, labels (empty dict), label_path,
            comments_vec (encoded tensor), comments_list (cleaned comments).
        """
        print("CustomDataset-> init")
        self.root = root
        self.phase = phase
        self.labels = {}

        self.label_path = os.path.join(root, self.phase + '_hate.txt')

        # Every character listed here is deleted from each comment in a
        # single pass.
        noise_table = str.maketrans(
            '', '', '!.^♡@ㅎㅉ?ㅜㅠ~ㅋㅡㄷㄹㅇ,ㅈ♥ㅁㅊ;ㄴㆍ')

        temp1 = []
        # NOTE(review): bias_list / hate_list are collected but not stored by
        # the code visible here - confirm whether they should become
        # attributes.
        bias_list = []
        hate_list = []
        with open(self.label_path, 'r', encoding="utf-8") as f:
            for line in f:
                v = line.strip().split('\t')
                temp1.append(v[1].translate(noise_table))
                if phase != 'test':
                    bias_list.append(v[2])
                    hate_list.append(v[3])

        stopwords = {'의', '가', '이', '은', '들', '는', '좀', '잘',
                     '걍', '과', '도', '를', '으로', '자', '에', '와', '한', '하다'}

        # Split each comment into morphemes and drop the stopwords.
        komoran = Komoran()
        comments_list = [
            [word for word in komoran.morphs(sentence)
             if word not in stopwords]
            for sentence in temp1
        ]

        # Frequency table over every morpheme in the corpus.
        vocab = FreqDist(np.hstack(comments_list))

        threshold = 2
        total_cnt = len(vocab)
        # Number of distinct words seen fewer than ``threshold`` times.
        rare_cnt = sum(1 for freq in vocab.values() if freq < threshold)

        vocab_size = total_cnt - rare_cnt + 2
        vocab = vocab.most_common(vocab_size)

        # Word indices start at 2.
        word_to_index = {
            word: index + 2 for index, (word, _) in enumerate(vocab)
        }
        # NOTE(review): 'pad' and 'unk' share index 0 while index 1 is left
        # unused - confirm this is intended before changing it.
        word_to_index['pad'] = 0
        word_to_index['unk'] = 0

        # Words missing from the vocabulary fall back to the 'unk' index.
        unk_index = word_to_index['unk']
        encoded = [
            [word_to_index.get(w, unk_index) for w in line]
            for line in comments_list
        ]

        max_len = 74  # fixed row length of the encoded tensor
        pad_index = word_to_index['pad']
        # Pad short rows and cut long ones so every row has max_len entries;
        # torch.LongTensor cannot be built from rows of differing length.
        encoded = [
            line[:max_len] + [pad_index] * (max_len - len(line))
            for line in encoded
        ]

        encoded = torch.LongTensor(encoded)

        self.comments_vec = encoded  # index-encoded and padded comments
        self.comments_list = temp1  # cleaned comment strings
        print(len(temp1))
def db_sentence_2_token_list(database_history_all_users_data_list):
    """Expand each history row into one output row per title token.

    Every title is split with ``RegexTokenizer`` and each token is classified
    by ``kor_or_eng_judge``:

    * ``'en'``: the value returned by ``token_judge_en_lower_ko_noun`` is
      used as the token (presumably the lower-cased word - confirm against
      the helper).
    * ``'ko'``: ``token_judge_en_lower_ko_noun`` returns a sequence
      (presumably the extracted nouns); one output row is added per element,
      and none when it is empty.
    * anything else (for example ``0``): the token is skipped.

    Args:
        database_history_all_users_data_list: Iterable of 4-item rows
            ``(url, title, visit_count, user_count)``.

    Returns:
        List of ``[token, url, visit_count, user_count]`` lists, e.g.
        ``['youtube', 'https://www.youtube.com/', 36, 3]``. Rows whose title
        yields no tokens contribute nothing.
    """
    tokenizer = RegexTokenizer()

    result = []
    for url, title, visit_count, user_count in (
            database_history_all_users_data_list):
        for word in tokenizer.tokenize(title):
            judgement = kor_or_eng_judge(word)
            if judgement == 'en':
                result.append([
                    token_judge_en_lower_ko_noun(word), url, visit_count,
                    user_count
                ])
            elif judgement == 'ko':
                # Call the helper once per word; iterating its result covers
                # the empty, single and multiple cases alike.
                for token_noun in token_judge_en_lower_ko_noun(word):
                    result.append([token_noun, url, visit_count, user_count])

    return result
예제 #12
0
def review_cr(
        urll,
        chromedriver_path='C:/Users/multicampus/PycharmProjects/airbnb_bot/chromedriver'):
    """Scrape review texts from a page and return the 20 most common tokens.

    The page is loaded in Chrome through Selenium and scrolled to the bottom;
    after a 10 second wait its HTML is parsed. Review texts are taken from
    ``div._czm8crp`` elements inside ``div#reviews``, tokenized with soynlp's
    ``RegexTokenizer`` and counted after the stop symbols are removed.

    Args:
        urll: URL of the page to scrape.
        chromedriver_path: Location of the chromedriver executable; defaults
            to the path that was previously hard-coded.

    Returns:
        List of up to 20 ``(token, count)`` tuples, most frequent first.
        The list is empty when the page has no ``div#reviews`` container.
    """
    import time
    from collections import Counter

    from bs4 import BeautifulSoup
    from selenium import webdriver
    from soynlp.tokenizer import RegexTokenizer

    driver = webdriver.Chrome(chromedriver_path)
    try:
        driver.implicitly_wait(3)
        driver.get(urll)
        driver.execute_script(
            "window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(10)  # wait for the page to finish loading after the scroll
        page_source = driver.page_source
    finally:
        # Always shut the browser down so no Chrome process is left behind.
        driver.quit()

    soup = BeautifulSoup(page_source, 'html.parser')

    reviews_container = soup.find('div', {'id': 'reviews'})
    if reviews_container is None:
        review_list = []
    else:
        review_list = [
            review.string
            for review in reviews_container.findAll('div',
                                                    {'class': '_czm8crp'})
        ]
    print(review_list)

    tokenizer = RegexTokenizer()
    stop_words = {'.', '(', ')', '!', '[', ']', '▣', '※'}

    # ``.string`` is None for elements with nested markup; those entries have
    # no text to tokenize and are skipped.
    token_counts = Counter(
        token
        for text in review_list if text is not None
        for token in tokenizer.tokenize(text)
        if token not in stop_words)

    counter = token_counts.most_common(20)

    print(counter)
    return counter