Python fileids 예제들, konlpy.corpus.kobill.fileids Python 예제들

예제 #1

0

파일 보기

# NOTE(review): nltk, Kkma, kolaw and kobill are used below but not imported in
# the visible part of this snippet — presumably imported earlier; confirm.
nltk.download("punkt") # Data needed for punctuation handling. (The download is only required once.)
from nltk.corpus import brown, gutenberg
from nltk.tokenize import sent_tokenize # Sentence-level tokenization
from nltk.tokenize import word_tokenize, TweetTokenizer, regexp_tokenize # regexp_tokenize tokenizes with a user-supplied regular expression.
from nltk.corpus import stopwords # Stopword lists
# brown corpus : more than 30 years old, but well balanced, so it is used like a textbook corpus. It is a tagged corpus (segmented and annotated with parts of speech).
# gutenberg corpus : a corpus of novels.
nltk.download() # Opens a GUI window for downloading corpora. nltk.download("brown") downloads without the GUI. (The download is only required once.)


# ------------------------------------------------- Trying out KoNLPy ---------------------------------------------------------------------
ma = Kkma()
print(ma.pos("오늘은 불금입니다.")) # Smoke test of the tagger

print(kolaw.fileids()) # Per the original author: contains a single txt file.
print(kobill.fileids()) # Per the original author: 10 txt files related to bills.

c = kolaw.open(kolaw.fileids()[0]).read() # Open the first file through its file handle and read it all
print(len(c)) # Number of characters (18884 in the original author's run).
print(len(c.split())) # Whitespace-separated word count, duplicates included (4178 in the original run; full corpora usually offer 1M-10M words).
print(len(c.splitlines()))  # Number of lines
d = kobill.open(kobill.fileids()[0]).read()
print(d.splitlines()[:2]) # Print only the first two lines
# -------------------------------------------------------------------------------------------------------------------------------------------



# ------------------------------- Trying out NLTK corpora (brown, gutenberg corpus) ----------------------------------------
print(len(brown.fileids()))
a = brown.open(brown.fileids()[0]).read()
# Character count, whitespace-separated word count, line count and the first three lines.
print(len(a), len(a.split()), len(a.splitlines()), a.splitlines()[:3])

예제 #2

0

파일 보기

    return text


# Test calls (predict_pos_neg is defined outside the visible part of this snippet)
predict_pos_neg("올해 최고의 영화! 세 번 넘게 봐도 질리지가 않네요.")
predict_pos_neg("심심해")
predict_pos_neg("기뻐!! 신나!!")
predict_pos_neg("슬퍼 죽겠다")
predict_pos_neg("우울해")
# http://blog.naver.com/PostView.nhn?blogId=2feelus&logNo=220384206922&redirect=Dlog&widgetTypeCall=true
# [Source] "Data mining on Korean text and similarity analysis with word2vec" by IDEO (code customized from that post)
# 1. Read
#!/usr/bin/env python
# -- coding: utf-8 --
from konlpy.corpus import kobill  # Docs from pokr.kr/bill
files_ko = kobill.fileids()  # Get file ids

# Original note: news.txt was scraped from portal news via http://boilerpipe-web.appspot.com/
# and must already be saved in the kobill directory under konlpy's corpus folder:
# /Library/Python/2.7/site-packages/konlpy/data/corpus/kobill
# NOTE(review): the note talks about news.txt but the code opens 'ratings_train.txt' —
# that file must likewise exist in the kobill directory; confirm which is intended.
doc_ko = kobill.open('ratings_train.txt').read()

# 2. Tokenize (extract meaningful words)
from konlpy.tag import Okt
import os
import json

# 학습시간이 오래 걸리므로 파일로 저장하여 처리 한다.
if os.path.isfile('tokens_ko_morphs.txt'):
    with open('tokens_ko_morphs.txt', encoding='UTF8') as f:
        tokens_ko_morphs = json.load(f)

예제 #3

0

파일 보기

파일: SimpleWord2Vec2.py 프로젝트: bluepoet/Word2VecExercise

from gensim.models import word2vec
from gensim.models.keyedvectors import KeyedVectors
from konlpy.corpus import kobill
from konlpy.tag import Twitter

t = Twitter()
fields_ko = kobill.fileids()
docs_ko = kobill.open('1809890.txt').read()
tokens_ko = t.morphs(docs_ko)
print(isinstance(tokens_ko, list))
print(tokens_ko)

# Word2Vec expects an iterable of sentences, each sentence being a list of
# tokens. Passing the flat token list made gensim treat every token *string*
# as a sentence and iterate it character by character, which is why only
# single characters ended up in the saved vocabulary. Wrapping the list makes
# the whole document one sentence, so whole morphemes become vocabulary items.
embedding = word2vec.Word2Vec([tokens_ko],
                              size=5,
                              window=1,
                              negative=3,
                              min_count=1)

embedding.wv.save_word2vec_format('my.sample', binary=False)

model = KeyedVectors.load_word2vec_format('my.sample',
                                          binary=False,
                                          encoding='utf-8')

# The vocabulary now holds morphemes, so the single syllable '육' may no longer
# be present. Fall back to the first token, which is guaranteed to be in the
# vocabulary because min_count=1 keeps every token.
query = '육' if '육' in model else tokens_ko[0]
print(model.most_similar(query))

예제 #4

0

파일 보기

import importlib
from konlpy.corpus import kobill
# Read every document of the kobill corpus.
docs_ko = [kobill.open(i).read() for i in kobill.fileids()]

from konlpy.tag import Mecab
t = Mecab(dicpath="C:/mecab/mecab-ko-dic")


def pos(d):
    """Return the tokens of document ``d`` as "surface/TAG" strings.

    ``t.pos`` yields (surface, tag) pairs, which is what ``'/'.join`` needs.
    The previous ``t.morphs`` call yields bare strings, so the join inserted a
    slash between every character of each morpheme instead.
    """
    return ['/'.join(p) for p in t.pos(d)]


texts_ko = [pos(doc) for doc in docs_ko]

from gensim.models import word2vec
# One tokenized document per training sentence.
wv_model_ko = word2vec.Word2Vec(texts_ko)
wv_model_ko.init_sims(replace=True)
wv_model_ko.save('ko_word2vec_e.model')

# Query with the tagged tokens of the phrase so they match the vocabulary format.
print(wv_model_ko.most_similar(pos('기가지니')))

예제 #5

0

파일 보기

파일: word_embedding_test.py 프로젝트: nlplab908/word2vec_test_kor

#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Switch the interpreter's default string encoding to UTF-8 (original note: euc-kr -> utf-8).
# NOTE(review): reload(sys) / sys.setdefaultencoding is a Python 2-only idiom.
import sys
reload(sys)

sys.setdefaultencoding('utf-8')

print ("load")
# Load every document of the kobill corpus
from konlpy.corpus import kobill
#docs_ko =kobill.open('kobill/news.txt').read()
docs_ko = [kobill.open(i).read() for i in kobill.fileids()]
print ("tokenize")

# Tokenize
from konlpy.tag import Twitter; t = Twitter()
print ("tokenize1")
# Turn a document into "surface/TAG" strings, with stemming and normalization enabled.
pos = lambda d:['/'.join(p) for p in t.pos(d,stem=True,norm=True)]
print ('tokenize2')
texts_ko = [pos(doc) for doc in docs_ko]
#texts_ko = pos(docs_ko)
print ("train")
import time
# Timestamp taken before training; it is not used again in the visible part of the snippet.
now_time = time.time()
# Train: one tokenized document per sentence
from gensim.models import word2vec
wv_model_ko = word2vec.Word2Vec(texts_ko,workers=16,negative=10,window=7,size=300)
wv_model_ko.init_sims(replace=True)

예제 #6

0

파일 보기

파일: 03-30.py 프로젝트: jang-yean-chul/Python_study

# NOTE(review): `txt` is not defined in the visible part of this snippet —
# presumably assigned earlier in the file; confirm.
from konlpy.tag import Hannanum
h = Hannanum()
h.nouns(txt)


from konlpy.tag import Twitter
t = Twitter()
t.nouns(txt)
t.pos(txt)


import nltk

from konlpy.corpus import kobill

file_ko = kobill.fileids()

# The corpus directory has to be located first:
# Windows Explorer > search for "kobill", e.g.
#C:\Users\stu\Anaconda3\Lib\site-packages\konlpy\data\corpus\kobill
# Open the sample file (President Moon Jae-in's inaugural address), use "Save as",
# and store it UTF-8 encoded inside that kobill directory.

doc_ko = kobill.open("문재인대통령취임사.txt").read()
doc_ko

# Extract the nouns of the speech.
from konlpy.tag import Twitter
t = Twitter()
tokens_ko = t.nouns(doc_ko)
tokens_ko

# Wrap the noun list in an nltk.Text.
ko = nltk.Text(tokens_ko)

예제 #7

0

파일 보기

파일: ko_nlpy.py 프로젝트: arkainoh/nlpu

def load_ko_stopwords(filename):
  """Read a UTF-8 stopword file and return its lines as a set.

  Each line has its trailing whitespace (including the newline) removed;
  leading whitespace is kept, and a blank line contributes the empty string.
  """
  with open(filename, "r", encoding="utf-8") as stopword_file:
    return {entry.rstrip() for entry in stopword_file}

def tokenize(txt):
  """Split ``txt`` into Hangul-only morphemes with stopwords removed.

  Morphemes come from Komoran; every run of characters outside the Hangul
  syllable range is stripped from each morpheme, and morphemes that end up
  empty or that appear in "ko_stopwords.txt" are dropped.
  """
  morphemes = Komoran().morphs(txt)
  non_hangul = re.compile('[^\uac00-\ud7a3]+')
  stopwords = load_ko_stopwords("ko_stopwords.txt")

  kept = []
  for morpheme in morphemes:
    cleaned = non_hangul.sub('', morpheme)
    if cleaned and cleaned not in stopwords:
      kept.append(cleaned)
  return kept

# main
# One token list per document: every kobill file first, then every kolaw file.
tokens = [tokenize(kobill.open(fid).read()) for fid in kobill.fileids()]
tokens += [tokenize(kolaw.open(fid).read()) for fid in kolaw.fileids()]

# Keyword arguments forwarded to Word2Vec below.
config = dict(
  min_count=2,
  size=100,
  sg=1,
  batch_words=10000,
  iter=20,
  workers=multiprocessing.cpu_count(),
)

embedding_model = Word2Vec(tokens, **config)

# Show the 50 entries closest to the tokens of the query phrase.
print(embedding_model.most_similar(positive=tokenize('육아휴직'), topn=50))

예제 #8

0

파일 보기

파일: Konlpy Jak code modified by mads but not working.py 프로젝트: aiedward/JARDIS

import multiprocessing
from gensim.models.word2vec import LineSentence
from gensim.models import Word2Vec
from os import walk
from gensim.models.word2vec import Text8Corpus
from datetime import datetime
import time
import os

import logging

import konlpy

# Read one Korean document from the kobill corpus
from konlpy.corpus import kobill
files_ko = kobill.fileids()
doc_ko = kobill.open('1809890.txt').read()

# Tokenize the document into morphemes
from konlpy.tag import Twitter
t = Twitter()
tokens_ko = t.morphs(doc_ko)

# Wrap the tokens in a named nltk.Text
import nltk
ko = nltk.Text(tokens_ko, name='대한민국 국회 의안 제 1809890호')


def train_and_test(root_train, root_test, output_name, params):

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',

예제 #9

0

파일 보기

파일: __init__.py 프로젝트: jushik91/190713_1

import pandas as pd
import matplotlib.pyplot as plt
from konlpy.corpus import kolaw
# List the kolaw corpus files and preview the first 40 characters of the constitution text.
print(kolaw.fileids())
c = kolaw.open('constitution.txt').read()
print(c[:40])

from konlpy.corpus import kobill
kobill.fileids()

# Preview the first 40 characters of one kobill document.
d = kobill.open('1809890.txt').read()
print(d[:40])

from konlpy.tag import *

# Instantiate the taggers to compare (Mecab is left disabled).
hannanum = Hannanum()
kkma = Kkma()
komoran = Komoran()
##mecab = Mecab()
okt = Okt()

# Extract nouns from the same 40-character excerpt with each tagger.
hannanum.nouns(c[:40])

kkma.nouns(c[:40])

# Per the original author, komoran raises an error on blank lines, so they are removed first.
komoran.nouns("\n".join([s for s in c[:40].split("\n") if s]))

##mecab.nouns(c[:40])

okt.nouns(c[:40])

예제 #10

0

파일 보기

def word_cloud(book_name):
    """Render a word cloud for one book and return it as a base64 PNG string.

    The book text is looked up by name in ``static/books_all.csv`` (columns
    ``name`` and ``text``). Nouns are extracted with konlpy's Twitter tagger,
    words listed in ``static/project_stopwords.txt`` (space-separated) are
    dropped, and the 150 most frequent remaining nouns are drawn.

    Args:
        book_name: Value matched against the ``name`` column of the CSV.

    Returns:
        The rendered PNG image, base64-encoded, as a ``str``.

    Raises:
        IndexError: If no row of the CSV has the given name.
    """
    # !pip install wordcloud
    import base64
    import io
    import platform

    import matplotlib.pyplot as plt
    import nltk
    # Imported locally like every other dependency; the function previously
    # used `pd` without importing it.
    import pandas as pd
    from konlpy.tag import Twitter
    from matplotlib import font_manager, rc
    from wordcloud import WordCloud

    # Korean font setup for matplotlib, per operating system.
    path = "static/AppleGothic.ttf"  # Windows users must make sure this path is valid
    system = platform.system()
    if system == 'Darwin':
        rc('font', family='AppleGothic')
    elif system == 'Windows':
        font_name = font_manager.FontProperties(fname=path).get_name()
        rc('font', family=font_name)
    else:
        print('Unknown system... sorry~~~~')

    # Look up the requested book's text.
    books_all = pd.read_csv('static/books_all.csv')
    matches = books_all[books_all['name'] == book_name]
    if matches.empty:
        # Same exception type that .iloc[0] raised, but with a usable message.
        raise IndexError(
            f"no book named {book_name!r} in static/books_all.csv")
    doc_ko = matches.iloc[0]['text']
    tokens_ko = Twitter().nouns(doc_ko)

    # A set makes the per-token stopword check O(1).
    with open('static/project_stopwords.txt', 'r', encoding='utf-8') as f:
        stop_words = set(f.read().split(' '))

    ko = nltk.Text([word for word in tokens_ko if word not in stop_words])
    data = ko.vocab().most_common(150)

    # for win : font_path='c:/Windows/Fonts/malgun.ttf'
    wordcloud = WordCloud(
        font_path=path,
        relative_scaling=0.2,
        background_color='white',
    ).generate_from_frequencies(dict(data))

    img = io.BytesIO()
    fig = plt.figure(figsize=(12, 8))
    try:
        plt.imshow(wordcloud)
        plt.axis("off")
        plt.savefig(img, format='png')
    finally:
        # pyplot keeps every figure alive until it is closed; without this,
        # each call leaks one figure.
        plt.close(fig)

    return base64.b64encode(img.getvalue()).decode()

예제 #11

0

파일 보기

from konlpy.corpus import kolaw, kobill
# List the files available in both corpora.
print(kolaw.fileids(), kobill.fileids())

# Full text of the constitution (first 40 characters)
c = kolaw.open('constitution.txt').read()
print(c[:40])

print('--------------------------------------------------------')
# A National Assembly bill from the kobill corpus (first 300 characters).
# The original comment called it a "stenographic record", but the other snippets on this page describe kobill as bill texts.
k = kobill.open('1809899.txt').read()
print(k[:300])

예제 #12

0

파일 보기

from konlpy.corpus import kolaw
import pandas as pd
from konlpy.tag import *
from konlpy.corpus import kobill
from nltk import Text
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# List the kolaw corpus files.
print(kolaw.fileids())

# Preview the first 40 characters of the constitution text.
c = kolaw.open('constitution.txt').read()
print(c[:40])

# List the kobill corpus files.
print(kobill.fileids())

# Preview the first 40 characters of one kobill document.
d = kobill.open('1809890.txt').read()
print(d[:40])


# Instantiate the taggers to compare.
hannanum = Hannanum()
kkma = Kkma()
komoran = Komoran()
# NOTE(review): the original comment here said "Japanese"; Mecab is left disabled — confirm why.
#mecab = Mecab()
okt = Okt()

# Extract nouns from the same 40-character excerpt with each tagger.
hannanum.nouns(c[:40])
kkma.nouns(c[:40])
# Per the original author, komoran raises an error on blank lines, so they are removed first.
komoran.nouns("\n".join([s for s in c[:40].split("\n") if s]))
#mecab.nouns(c[:40])
okt.nouns(c[:40])

예제 #13

0

파일 보기

#!/usr/bin/env python
# -*- coding: utf-8 -*-

# 1. Read the document
from konlpy.corpus import kobill    # Docs from pokr.kr/bill
files_ko = kobill.fileids()         # Get file ids
doc_ko = kobill.open('test.txt').read()
# Original note: news.txt was scraped from portal news via
# http://boilerpipe-web.appspot.com/ and must already be saved in the kobill
# directory under konlpy's corpus folder:
# /Library/Python/2.7/site-packages/konlpy/data/corpus/kobill
# NOTE(review): the note talks about news.txt but the code opens 'test.txt' —
# confirm which file is intended.

# 2. Tokenize (extract meaningful words)
from konlpy.tag import Twitter
t = Twitter()
tokens_ko = t.morphs(doc_ko)

# 3. Build a token wrapper (nltk.Text) so the tokens can be processed further
import nltk
ko = nltk.Text(tokens_ko, name='뉴스')

# 4. Inspect overall and per-token statistics
print(len(ko.tokens))       # returns number of tokens (document length)
print(len(set(ko.tokens)))  # returns number of unique tokens
ko.vocab()                  # returns frequency distribution



#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys
reload(sys)

sys.setdefaultencoding('utf-8')