Example #1
def movieword(code):
    df1 = movie_start.Getdata([code], 20)
    noun_extractor = LRNounExtractor(verbose=True)
    noun_extractor.train(df1['text'])
    nouns = noun_extractor.extract()
    movie_wordcloud.displayWordCloud(str(code), ' '.join(nouns))
    return "ok"
Example #2
def noun_extractor_test(corpus_path):
    from soynlp import DoublespaceLineCorpus
    from soynlp.noun import LRNounExtractor
    from soynlp.noun import NewsNounExtractor
    corpus = DoublespaceLineCorpus(corpus_path, num_doc=1000)

    # LRNounExtractor
    print('LRNounExtractor test\n{}'.format('-' * 40))
    noun_extractor = LRNounExtractor()
    noun_scores = noun_extractor.train_extract(corpus)

    print('{}\n{} words are extracted\ntop 20 frequency * score'.format(
        '-' * 30, len(noun_scores)))
    topwords = sorted(
        noun_scores,
        key=lambda x: -noun_scores[x].score * noun_scores[x].frequency)[:20]
    for word in topwords:
        print('word = {}, score = {}'.format(word, noun_scores[word].score))

    # NewsNounExtractor
    print('\nNewsNounExtractor test\n{}'.format('-' * 40))
    newsnoun_extractor = NewsNounExtractor()
    newsnoun_scores = newsnoun_extractor.train_extract(corpus)

    print('\n{}\n{} words are extracted\ntop 20 frequency * score'.format(
        '-' * 30, len(newsnoun_scores)))
    topwords = sorted(newsnoun_scores,
                      key=lambda x: -newsnoun_scores[x].score *
                      newsnoun_scores[x].frequency)[:20]
    for word in topwords:
        print('word = {}, score = {}'.format(word,
                                             newsnoun_scores[word].score))
    print('noun extractor test has been done\n\n')
Example #3
def noun_extractor_test(corpus_path):
    from soynlp import DoublespaceLineCorpus
    from soynlp.noun import LRNounExtractor
    from soynlp.noun import NewsNounExtractor
    corpus = DoublespaceLineCorpus(corpus_path, num_doc=1000)
    
    # LRNounExtractor
    print('LRNounExtractor test\n{}'.format('-'*40))
    noun_extractor = LRNounExtractor()
    noun_scores = noun_extractor.train_extract(corpus)

    print('{}\n{} words are extracted\ntop 20 frequency * score'.format('-'*30, len(noun_scores)))
    topwords = sorted(noun_scores, key=lambda x: -noun_scores[x].score * noun_scores[x].frequency)[:20]
    for word in topwords:
        print('word = {}, score = {}'.format(word, noun_scores[word].score))

    # NewsNounExtractor
    print('\nNewsNounExtractor test\n{}'.format('-'*40))
    newsnoun_extractor = NewsNounExtractor()
    newsnoun_scores = newsnoun_extractor.train_extract(corpus)

    print('\n{}\n{} words are extracted\ntop 20 frequency * score'.format('-'*30, len(newsnoun_scores)))
    topwords = sorted(newsnoun_scores, key=lambda x: -newsnoun_scores[x].score * newsnoun_scores[x].frequency)[:20]
    for word in topwords:
        print('word = {}, score = {}'.format(word, newsnoun_scores[word].score))
    print('noun extractor test has been done\n\n')
Example #4
def movieword(code):
    df1 = movie_start.Getdata([code])
    # Extract only the nouns; write this code later
    noun_extractor = LRNounExtractor(verbose=True)
    noun_extractor.train(df1['text'])
    nouns = noun_extractor.extract()
    # Join the nouns and render them as a word cloud
    movie_wordcloud.displayWordCloud(str(code), ' '.join(nouns))
    return "ok"
Example #5
def train():

    normed_path = path['norm']

    noun_src_path = path['noun']['src']
    noun_lrgraph_path = path['noun']['lrgraph']
    noun_trained_path = path['noun']['train']['pkl']
    noun_readable_path = path['noun']['train']['readable']
    noun_result_path = path['noun']['result']

    corpus = DoublespaceLineCorpus(normed_path, iter_sent=True)

    noun_extractor = LRNounExtractor(verbose=False, min_num_of_features=1)
    nouns = noun_extractor.train_extract(corpus, minimum_noun_score=0.5)

    word_freq = noun_extractor._wordset_l_counter
    lrgraph = noun_extractor.lrgraph
    words = noun_extractor.words

    trained_data = {}
    trained_data['lrgraph'] = lrgraph
    trained_data['words'] = words
    trained_data['word_freq'] = word_freq

    with open(noun_src_path, 'wb') as f:
        pickle.dump(trained_data, f)

    with open(noun_lrgraph_path, 'w', encoding='utf8') as f:
        json.dump(lrgraph, f, ensure_ascii=False, indent=4)

    params = {}
    for noun, noun_score in nouns.items():
        params[noun] = {
            'frequency': noun_score.frequency,
            'score': noun_score.score,
            'known_r_ratio': noun_score.known_r_ratio
        }

    with open(noun_trained_path, 'wb') as f:
        pickle.dump(params, f)

    with open(noun_readable_path, 'w', encoding='utf8') as f:
        json.dump(sorted(params.items()), f, ensure_ascii=False, indent=4)

    with open(noun_result_path, 'w', encoding='utf8') as f:
        json.dump(sorted(params), f, ensure_ascii=False, indent=4)

    update_user_dict()
    update(forced=True)
Example #6
def tag_counting(law_event_type):
    prec = pd.read_csv('law_list_detail.csv', encoding='utf-8')

    noun_extractor = LRNounExtractor(verbose=True)
    noun_extractor.train(prec[prec['law_event_type'] == law_event_type]
                         ['law_content'].astype('str').apply(preprocessing))
    nouns = noun_extractor.extract()

    count = Counter(nouns)
    # print(count)

    tag_count = []
    stopwords = make_stopword()
    # print(stopwords)

    for n, c in count.most_common(200):
        if n not in stopwords:
            # c is a NounScore value, so c[0] is the noun's frequency
            dics = {'tag': n, 'count': c[0]}
            tag_count.append(dics)

        if len(tag_count) == 20:
            break

    # print(tag_count)

    for tag in tag_count:
        print("{:<14}".format(tag['tag']), end='\t')
        print("{}".format(tag['count']))

    df = pd.DataFrame.from_dict(tag_count, orient='columns')
    df.set_index(df['tag'], inplace=True)
    # print(df)

    # Set the plot style
    plt.style.use('ggplot')

    ax1 = df.plot(kind='bar',
                  figsize=(20, 10),
                  width=0.7,
                  stacked=False,
                  legend=None)

    ax1.set_ylim(0, 60000)
    ax1.set_xlabel('단어', size=20)
    ax1.set_ylabel('빈도수', size=20)

    plt.title('사건 종류별 특정 단어 빈도수(형사)', size=20)

    plt.show()
Example #7
def train_extractor(begin_d=None,
                    end_d=None,
                    sections: list = None,
                    base_dir='./out',
                    tokenizer=None):
    _, sentences, corpus_class = make_corpus(begin_d=begin_d,
                                             end_d=end_d,
                                             sections=sections,
                                             base_dir=base_dir)
    # nouns = get_noun_words(begin_d='20201101', end_d='20201130')

    noun_extractor = LRNounExtractor()
    nouns = noun_extractor.train_extract(sentences)  # list of str like
    noun_score = dict([(key, val.score) for key, val in nouns.items()])
    if tokenizer is None:
        tokenize = lambda x: x.strip().split()
    elif tokenizer == 'max_score_tokenizer':
        tokenize = MaxScoreTokenizer(noun_score)
    elif tokenizer == 'ltokenizer':
        tokenize = LTokenizer(noun_score)
    else:
        raise NotImplementedError

    if sections is not None and len(sections) >= 1:
        min_tf = 10
        min_df = 2
    else:
        min_tf = 20
        min_df = 2

    keyword_extractor = CorpusbasedKeywordExtractor(
        min_tf=min_tf,
        min_df=min_df,
        # tokenize=lambda x: x.strip().split(),
        tokenize=tokenize,
        verbose=True)
    # docs: list of str like
    keyword_extractor.train(sentences)
    return keyword_extractor, nouns, corpus_class
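
For context on the tokenizer branch in this example: soynlp's LTokenizer and MaxScoreTokenizer both take a {word: score} mapping and use it to decide where to split each eojeol, which is why the noun scores are converted to a plain dict first. A minimal usage sketch follows; the words and scores in it are invented purely for illustration.

# Minimal sketch, assuming only that soynlp is installed; the toy scores are made up.
from soynlp.tokenizer import LTokenizer

toy_scores = {'데이터': 0.8, '분석': 0.7}  # hypothetical noun scores
toy_tokenizer = LTokenizer(scores=toy_scores)
print(toy_tokenizer.tokenize('데이터분석을 시작합니다'))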
Example #8
def get_keyword(characters):
    df = pd.read_csv("./MbtiApp/keyword/roles.csv")
    stopwords = pd.read_csv("./MbtiApp/keyword/stopwords.csv")["stopwords"]
    sentences = df.iloc[:, 2]
    sentences = list(sentences) + list(characters["feature_total"])
    # Extract nouns
    noun_extractor = LRNounExtractor()
    nouns = noun_extractor.train_extract(sentences)
    nouns = sorted(nouns, key=lambda x: len(x), reverse=True)

    # Remove stopwords
    for sw in stopwords:
        if sw in nouns:
            nouns.remove(sw)

    personal = []
    for i, row in characters.iterrows():
        noun_sen = ""
        for noun in nouns:
            if noun in row["feature_total"]:
                noun_sen = noun_sen + " #" + noun
        personal.append(noun_sen)
    characters["personal"] = personal
    return characters
Example #9
    def __init__(self, pre_trained=True, analyzer='Hannanum'):
        self.pre_trained = pre_trained

        if analyzer == 'Hannanum':
            self.analyzer = tag.Hannanum()
        elif analyzer == 'Kkma':
            self.analyzer = tag.Kkma()
        elif analyzer == 'Komoran':
            self.analyzer = tag.Komoran()
        elif analyzer == 'Mecab':
            self.analyzer = tag.Mecab()
        elif analyzer == 'Okt':
            self.analyzer = tag.Okt()
        else:
            if not pre_trained:
                pass
            else:
                print('Enter a valid KoNLPy analyzer name.\n\tavailable: Hannanum, Kkma, Komoran, Mecab, Okt')

        self.WordExtractor = WordExtractor(min_frequency=0)
        self.noun_extractor = LRNounExtractor(verbose=False)
        self.word_score = {}
Example #10
                          width=width,
                          height=height).generate(data)
    wordcloud.to_file(os.path.join(currdir, "wc" + num + ".png"))
    #plt.figure(figsize = (15 , 10))
    #plt.imshow(wordcloud)
    #plt.axis("off")
    #plt.show()


# In[51]:

from soynlp.noun import LRNounExtractor

# In[52]:

noun_extractor = LRNounExtractor(verbose=True)
noun_extractor.train(sentences1)
nouns1 = noun_extractor.extract()
noun_extractor.train(sentences2)
nouns2 = noun_extractor.extract()
noun_extractor.train(sentences3)
nouns3 = noun_extractor.extract()
noun_extractor.train(sentences4)
nouns4 = noun_extractor.extract()
noun_extractor.train(sentences5)
nouns5 = noun_extractor.extract()
noun_extractor.train(sentences6)
nouns6 = noun_extractor.extract()
noun_extractor.train(sentences7)
nouns7 = noun_extractor.extract()
noun_extractor.train(sentences8)
nouns8 = noun_extractor.extract()
Example #11
    plt.axis("off")
    plt.show()


df = pd.read_csv('foo1.csv', engine='python', encoding='utf-8')
tokenizer = RegexTokenizer()
stopwords_kr = [
    '하지만', '그리고', '그런데', '저는', '제가', '그럼', '이런', '저런', '합니다', '많은', '많이', '정말',
    '너무', '[', ']', '것으로', '했습니다', '했다'
]

sentences = df['본문'].apply(preprocessing)
displayWordCloud(' '.join(sentences))

# Extract nouns with soynlp
noun_extractor = LRNounExtractor(verbose=True)
noun_extractor.train(sentences)
nouns = noun_extractor.extract()
displayWordCloud(' '.join(nouns))

# Render the word cloud on top of an image file
img = Image.open('cloud.png')
img_array = np.array(img)

wordcloud = WordCloud(font_path='/Library/Fonts/NanumBarunGothic.ttf',
                      stopwords=stopwords_kr,
                      background_color='white',
                      mask=img_array,
                      width=800,
                      height=600).generate(' '.join(nouns))
plt.figure(figsize=(15, 10))
Example #12
law_categoriesMin1 = law_categoriesMin['law_content'].astype('str').apply(
    preprocessing)
# print(law_categoriesMin1.head())
# displayWordCloud(' '.join(law_categoriesMin1))

# law_categoriesSe = prec[prec['law_event_type'] == "세무"]
# law_categoriesSe1 = law_categoriesSe['law_content'].astype('str').apply(preprocessing)
# displayWordCloud(' '.join(law_categoriesSe1))

# law_categoriesH = prec[prec['law_event_type'] == "일반행정"]
# law_categoriesH1 = law_categoriesH['law_content'].astype('str').apply(preprocessing)
# displayWordCloud(' '.join(law_categoriesH1))

# law_categoriesT = prec[prec['law_event_type'] == "특허"]
# law_categoriesT1 = law_categoriesT['law_content'].astype('str').apply(preprocessing)
# # print(law_categoriesT1)
# displayWordCloud(' '.join(law_categoriesT1))

# law_categoriesP = prec[prec['law_event_type'] == "형사"]
# law_categoriesP1 = law_categoriesP['law_content'].astype('str').apply(preprocessing)
# displayWordCloud(' '.join(law_categoriesP1))

noun_extractor = LRNounExtractor(verbose=True)
noun_extractor.train(law_categoriesMin1)
# Extract nouns only
nouns = noun_extractor.extract()
# print(type(nouns))
# print(nouns)
displayWordCloud(' '.join(nouns))

# displayWordCloud(' '.join(law_categoriesGa1))
Example #13
def detail(m_no, current_movie_title):

    conn = pymysql.connect(host='127.0.0.1',
                           user='******',
                           password='******',
                           db='movie',
                           charset='utf8mb4',
                           cursorclass=pymysql.cursors.DictCursor)
    try:
        with conn.cursor() as cursor:
            sql = 'select * from current_movie c inner join test t on c.current_movie_title = t.title where current_movie_title = %s;'
            cursor.execute(sql, (current_movie_title,))
            result = cursor.fetchone()  # fetch a single row

            sql = 'select * from current_movie where current_movie_title = %s;'
            cursor.execute(sql, (current_movie_title,))
            result1 = cursor.fetchone()  # fetch a single row

            sql = 'select * from board where m_no= %s;'
            cursor.execute(sql, (m_no,))
            board = cursor.fetchall()
    finally:
        conn.close()
    if result is not None:
        tmrvl = []
        movieName = result['codem']

        for page in range(1, 200):
            url = "https://movie.naver.com/movie/bi/mi/review.nhn?code=" + str(
                movieName) + "&page=" + str(page)
            response = urllib.request.urlopen(url)

            soup = BeautifulSoup(response, 'html.parser')
            table = soup.select('ul.rvw_list_area li a')
            for result3 in table:
                mrv = str(result3.string)
                tmrv = tuple([mrv])
                tmrvl.append(tmrv)
                #tmrv1=str(tmrv)
                #f.write(tmrv1)
        df = pd.DataFrame(tmrvl)

        def preprocessing(text):
            # Remove newline characters
            text = re.sub('\\\\n', ' ', text)
            return text

        tokenizer = RegexTokenizer()
        stopwords_kr = [
            '하지만', '그리고', '그런데', '저는', '제가', '그럼', '이런', '저런', '합니다', '많은',
            '많이', '정말', '너무', '[', ']', '것으로', '했습니다', '했다'
        ]

        sentences = df[0].apply(preprocessing)

        # Extract nouns with soynlp
        noun_extractor = LRNounExtractor(verbose=True)
        noun_extractor.train(sentences)
        nouns = noun_extractor.extract()

        # Render the word cloud on top of an image file
        img = Image.open('IT_Bank_Movie/static/img/cloud.png')
        img_array = np.array(img)

        wordcloud = WordCloud(font_path='/Library/Fonts/NanumBarunGothic.ttf',
                              stopwords=stopwords_kr,
                              background_color='white',
                              mask=img_array,
                              width=800,
                              height=600).generate(' '.join(nouns))
        plt.figure(figsize=(15, 10))
        plt.imshow(wordcloud)
        plt.axis("off")
        #plt.show()
        url1 = "IT_Bank_Movie/static/wordcloud/" + current_movie_title + ".png"
        wordcloud.to_file(url1)

    return render_template('movie_detail.html',
                           wordInfo=result,
                           board=board,
                           movieInfo=result1)
Example #14
fontpath = '/usr/share/fonts/truetype/nanum/NanumBarunGothic.ttf'
font = fm.FontProperties(fname=fontpath, size=9)

stopwords_kr = [
    '하지만', '그리고', '그런데', '저는', '제가', '그럼', '이런', '저런', '합니다', '많은', '많이', '정말',
    '너무'
]


def displayWordCloud(data=None,
                     backgroundcolor='white',
                     width=800,
                     height=600):
    wordcloud = WordCloud(font_path=fontpath,
                          stopwords=stopwords_kr,
                          background_color=backgroundcolor,
                          width=width,
                          height=height).generate(data)
    plt.figure(figsize=(15, 10))
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()


# noun_extractor = LRNounExtractor(verbose=True)
noun_extractor = LRNounExtractor()
nouns = noun_extractor.train_extract(content)
# nouns = noun_extractor.extract()

print(nouns)
# displayWordCloud(' '.join(nouns))
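
All of the examples above follow the same core pattern: build or load a sentence corpus, train an LRNounExtractor over it, and then rank or filter the extracted nouns by their scores before tokenizing or visualizing. A condensed, self-contained sketch of that pattern is below; the corpus path ./corpus.txt and the 0.5 score threshold are assumptions for illustration only.

# Condensed sketch of the shared pattern; ./corpus.txt is a hypothetical
# double-space-delimited corpus file and 0.5 is an assumed score threshold.
from soynlp import DoublespaceLineCorpus
from soynlp.noun import LRNounExtractor

corpus = DoublespaceLineCorpus('./corpus.txt', iter_sent=True)

noun_extractor = LRNounExtractor(verbose=False)
noun_scores = noun_extractor.train_extract(corpus)  # {noun: NounScore(frequency, score, known_r_ratio)}

top_nouns = sorted(
    (w for w, ns in noun_scores.items() if ns.score >= 0.5),
    key=lambda w: -noun_scores[w].frequency * noun_scores[w].score)[:20]
print(top_nouns)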