Example #1
 def setUp(self):
     nltk.download("gutenberg")
     self.docs = []
     for fid in gutenberg.fileids():
         f = gutenberg.open(fid)
         self.docs.append(f.read())
         f.close()
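 # A hypothetical companion test, not in the original snippet: a minimal sketch
 # assuming the TestCase only needs to check that every Gutenberg file was read
 # into self.docs as a non-empty string.
 def test_docs_loaded(self):
     self.assertEqual(len(self.docs), len(gutenberg.fileids()))
     for doc in self.docs:
         self.assertGreater(len(doc), 0)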
Example #2
c = kolaw.open(kolaw.fileids()[0]).read()  # open the first file of the corpus through its file pointer and read it
print(len(c))  # the text contains 18,884 characters
print(len(c.split()))  # count the space-separated word units (eojeol): 4,178 here, counted by a naive whitespace split so duplicates are included; a full-scale corpus usually provides on the order of 1-10 million eojeol
print(len(c.splitlines()))  # count how many lines (newlines) the text contains
d = kobill.open(kobill.fileids()[0]).read()
print(d.splitlines()[:2])  # print only the first two lines
# -------------------------------------------------------------------------------------------------------------------------------------------



# ------------------------------- Using the NLTK corpora (brown, gutenberg) ----------------------------------------
print(len(brown.fileids()))
a = brown.open(brown.fileids()[0]).read()
print(len(a), len(a.split()), len(a.splitlines()), a.splitlines()[:3])

b = gutenberg.open(gutenberg.fileids()[0]).read()
print(len(b), len(b.split()), len(b.splitlines()), b.splitlines()[:3])
# ------------------------------------------------------------------------------------------------------------------------



# ------------------------------------------- Tokenizing -------------------------------------------------------
s = sent_tokenize(b)  # officially supports about ten languages, but not Korean, Japanese or Chinese; splits sentences at punctuation
print(len(s), len(b.splitlines()))
print(s[:3], b.splitlines()[:3])
print(sent_tokenize("Hello world, Hello world! Hello........?"))
print(sent_tokenize("집에 가고?싶다....."))  # a punctuation mark followed by a space is treated as a sentence boundary; without the space the text stays a single sentence, so the space after the punctuation matters
print(word_tokenize(d))  # split into word-level tokens, which is not the same as splitting on whitespace
print(word_tokenize("10분만 버티자 :) "))  # ':)' is an important way to express emotion in short texts such as tweets, so it should not be broken apart
# NLTK therefore provides TweetTokenizer; unlike word_tokenize it is a class, so create an instance before tokenizing.
print(TweetTokenizer().tokenize("10분만 버티자 :)"))  # ':)' is now kept as a single token
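# A minimal sketch (not in the original snippet) comparing the two tokenizers on
# a tweet-like string; strip_handles and reduce_len are optional TweetTokenizer
# arguments that drop @mentions and shorten repeated letters.
tweet = "@user 10분만 버티자 :) soooo tired"
print(word_tokenize(tweet))  # ':)' gets split into ':' and ')'
print(TweetTokenizer(strip_handles=True, reduce_len=True).tokenize(tweet))  # ':)' survives, '@user' is dropped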
from nltk import sent_tokenize, word_tokenize
#stopwords
from nltk.corpus import stopwords
#K Fold Cross Validation
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
#Import svm model
from sklearn.svm import SVC

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

#import 3 Gutenberg Books
from nltk.corpus import gutenberg
files_en = gutenberg.fileids()  # Get file ids
emma_en = gutenberg.open('austen-emma.txt').read()
sense_en = gutenberg.open('austen-sense.txt').read()
brown_en = gutenberg.open('chesterton-brown.txt').read()


def tokenize(data):
    tokenized_word = word_tokenize(data)
    tokenized_word = [w for w in tokenized_word if w.isalpha()]
    tokenized_word = [w.lower() for w in tokenized_word]
    stop_words = stopwords.words('english')
    tokenized_word = [
        w for w in tokenized_word if not w in stop_words and len(w) >= 3
    ]

    tokenized_word_list = []
    start = 0
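# The tokenize() function above is cut off in this snippet; a minimal sketch
# (an assumption, not the original author's code) of the same filtering steps
# applied to a single sentence:
demo = word_tokenize("Emma Woodhouse, handsome, clever, and rich.")
demo = [w.lower() for w in demo if w.isalpha()]  # keep alphabetic tokens, lower-case them
demo = [w for w in demo if w not in stopwords.words('english') and len(w) >= 3]
print(demo)  # ['emma', 'woodhouse', 'handsome', 'clever', 'rich']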
Example #4
from nltk.corpus import gutenberg
import matplotlib.pyplot as plt
# % matplotlib inline

bible = gutenberg.open('bible-kjv.txt')
bible = bible.readlines()
res = bible[:5]
print(res)

# sentence = "I love coding on python, because it gives me and enormous ability to use the Data processing!"

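# matplotlib is imported above but never used in the snippet; a minimal sketch
# (an assumption about the intended plot) showing the distribution of words per
# line over the lines just read from bible-kjv.txt:
lengths = [len(line.split()) for line in bible[:500]]
plt.hist(lengths, bins=20)
plt.xlabel("words per line")
plt.ylabel("number of lines")
plt.show()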
Example #5
File: preprocess.py  Project: yitzikc/nesta
    tfidf.fit(docs)
    lower_idf = np.percentile(tfidf.idf_, lower_idf_limit)
    upper_idf = np.percentile(tfidf.idf_, upper_idf_limit)
    # Pick out the vocab to be dropped
    drop_vocab = set(
        term for term, idx in tfidf.vocabulary_.items()
        if tfidf.idf_[idx] < lower_idf or tfidf.idf_[idx] >= upper_idf)
    # Filter the documents
    new_docs = []
    for doc in documents:
        _new_doc = []
        for sent in doc:
            _new_sent = [w for w in sent if w not in drop_vocab]
            if len(_new_sent) == 0:
                continue
            _new_doc.append(_new_sent)
        new_docs.append(_new_doc)
    return new_docs
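# A self-contained sketch (assumptions: toy documents, scikit-learn and numpy
# importable as below) of the idf-percentile filtering used above: terms whose
# idf falls outside the chosen percentile band are collected into drop_vocab.
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

_toy_docs = ["the cat sat", "the dog sat", "the cat ran", "an unusual word"]
_vec = TfidfVectorizer().fit(_toy_docs)
_low = np.percentile(_vec.idf_, 10)
_high = np.percentile(_vec.idf_, 90)
print({t for t, i in _vec.vocabulary_.items()
       if _vec.idf_[i] < _low or _vec.idf_[i] >= _high})  # very common and very rare terms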


if __name__ == '__main__':
    nltk.download("gutenberg")
    from nltk.corpus import gutenberg
    docs = []
    for fid in gutenberg.fileids():
        f = gutenberg.open(fid)
        docs.append(f.read())
        f.close()
    docs = [tokenize_document(d) for d in docs]
    docs = filter_by_idf(docs, 10, 90)
Example #6
# coding=UTF-8
#import nltk
#nltk.download()

import csv
import pandas as pd
from nltk.corpus import gutenberg   # Docs from project gutenberg.org
#from scrapy.item import Field
files_en = gutenberg.fileids()      # Get file ids
doc_en = gutenberg.open('C:\\Python27\\NLPK\\pg158.txt').read()


from nltk import regexp_tokenize
pattern = r'''(?x) (?:[A-Z]\.)+ | \w+(?:-\w+)* | \$?\d+(?:\.\d+)?%? | \.\.\. | [][.,;"'?():-_`]'''
tokens_en = regexp_tokenize(doc_en, pattern)


import nltk
en = nltk.Text(tokens_en)
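# A minimal sketch (not in the original snippet) of what the pattern matches:
# abbreviations, hyphenated words, currency/percentages, ellipses and single
# punctuation marks, following the NLTK book example.
print(regexp_tokenize("That U.S.A. poster-print costs $12.40...", pattern))
# expected: ['That', 'U.S.A.', 'poster-print', 'costs', '$12.40', '...']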



# CSV Field 
# 

import csv
import nltk
import os.path
import sys

### read csv
Example #7
def get_gutenberg():
    return gutenberg.open(gutenberg.fileids()[0]).read()
Example #8
"""
BA_04192018
Text Mining

@author: Justin
"""

import nltk

nltk.download()
# the stopwords collection gathers the words that are usually excluded in text mining
# NLTK lemmatization is based on WordNet
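# A minimal sketch (not in the original) of the two points above, assuming the
# 'stopwords' and 'wordnet' resources were fetched by nltk.download():
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
print(stopwords.words('english')[:10])           # common words usually excluded
print(WordNetLemmatizer().lemmatize('corpora'))  # WordNet maps 'corpora' to 'corpus'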

from nltk.corpus import gutenberg
ids = gutenberg.fileids()  # these are old works, so there are no licensing issues

text = gutenberg.open(ids[0]).read()  # start the analysis with Emma (the first file id)

nltk.download('punkt')
from nltk import word_tokenize
tokens = word_tokenize(text)
tokens[:100]

en = nltk.Text(tokens)
#tokens = en.tokens  # note: passing a raw string into nltk.Text would split it into individual characters, so pass the token list
dic = en.vocab()
en.plot(50)

lower_tokens = [x.lower() for x in tokens]  # lower-case every token
en_lw = nltk.Text(lower_tokens)
dic_lw = en_lw.vocab()
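# en.vocab() returns an nltk.FreqDist; a minimal sketch (not in the original)
# of inspecting it, showing how lower-casing merges counts such as 'The'/'the':
print(dic.most_common(10))
print(dic_lw.most_common(10))
print(dic['The'] + dic['the'], dic_lw['the'])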
from nltk.corpus import gutenberg  # Docs from project gutenberg.org

files_en = gutenberg.fileids()  # Get file ids
doc_en = gutenberg.open('austen-emma.txt').read()

from nltk import regexp_tokenize
pattern = r'''(?x) (?:[A-Z]\.)+ | \w+(?:-\w+)* | \$?\d+(?:\.\d+)?%? | \.\.\. | [][.,;"'?():-_`]'''
tokens_en = regexp_tokenize(doc_en, pattern)
#nltk.download('gutenberg')

import nltk
en = nltk.Text(tokens_en)

print(len(en.tokens))  # returns number of tokens (document length)
print(len(set(en.tokens)))  # returns number of unique tokens
en.vocab()

#en.plot(50)
print(doc_en.count('Emma'))
print(tokens_en.count('Emma'))
print(en.count('Emma'))  # Counts occurrences

#en.dispersion_plot(['Emma', 'Frank', 'Jane'])

#en.concordance('Emma', lines=5)

# Find similar words;
#en.similar('Emma')
#en.similar('Frank')

#en.collocations()
Example #10
# http://www.lucypark.kr/courses/2015-ba/text-mining.html


from nltk.corpus import gutenberg
from nltk import regexp_tokenize
import nltk


# download the corpora (only needed the first time)
nltk.download('gutenberg')
nltk.download('maxent_treebank_pos_tagger')
nltk.download("reuters")

# View
files_en = gutenberg.fileids()
doc_en = gutenberg.open('austen-emma.txt').read()

# Tokenize
pattern = r'''(?x) (?:[A-Z]\.)+ | \w+(?:-\w+)* | \$?\d+(?:\.\d+)?%? | \.\.\. | [][.,;"'?():-_`]'''
tokens_en = regexp_tokenize(doc_en, pattern)
en = nltk.Text(tokens_en)

print(len(en.tokens))
print(len(set(en.tokens)))

en.vocab()
en.plot(50)


# Count
en.count('Emma')
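# The snippet downloads a POS tagger but never uses it; a hedged sketch of the
# presumable next step (recent NLTK versions use the averaged perceptron tagger
# by default, so that resource is downloaded here as an assumption):
nltk.download('averaged_perceptron_tagger')
print(nltk.pos_tag(en.tokens[:20]))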
Example #11
import pyLDAvis
import pyLDAvis.gensim
from sklearn import mixture
from copy import deepcopy
from sklearn.metrics.cluster import adjusted_rand_score
from gensim import corpora, models
from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer
import re    # used below for re.sub
import nltk  # used below for nltk.download
nltk.download('gutenberg')
from nltk.corpus import gutenberg 

files_en = gutenberg.fileids()
selected_titles = ['3623-8.txt','19528-8.txt','24681-8.txt','29444-8.txt','milton-paradise.txt']
# Open and read the text of the five selected books
text_1 = gutenberg.open('3623-8.txt').read()
text_2 = gutenberg.open('19528-8.txt').read()
text_3 = gutenberg.open('24681-8.txt').read()
text_4 = gutenberg.open('29444-8.txt').read()
text_5 = gutenberg.open('milton-paradise.txt').read()

# keep only alphabetic characters (digits and punctuation are replaced with spaces)
removeNum1 = re.sub('[^a-zA-Z]',' ', text_1 )
removeNum2 = re.sub('[^a-zA-Z]',' ', text_2 )
removeNum3 = re.sub('[^a-zA-Z]',' ', text_3 )
removeNum4 = re.sub('[^a-zA-Z]',' ', text_4 )
removeNum5 = re.sub('[^a-zA-Z]',' ', text_5 )

#Tokenizing data
from nltk import regexp_tokenize
pattern = r'''(?x) (?:[A-Z]\.)+ | \w+(?:[-]\w+)* | \$?\d+(?:\.\d+)?%?| \.\.\. | [][.,;"'?():-_`]'''
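# The example stops after defining the pattern; a hedged sketch (an assumption,
# not the original author's code) of how the imported gensim/pyLDAvis pieces are
# typically wired together once each cleaned text has been tokenized:
texts = [regexp_tokenize(t.lower(), pattern)
         for t in (removeNum1, removeNum2, removeNum3, removeNum4, removeNum5)]
dictionary = corpora.Dictionary(texts)
bow_corpus = [dictionary.doc2bow(t) for t in texts]
lda = models.LdaModel(bow_corpus, id2word=dictionary, num_topics=5)
vis = pyLDAvis.gensim.prepare(lda, bow_corpus, dictionary)  # interactive topic view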
Example #12
from nltk.corpus import gutenberg  # gutenberg.open is used below
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# In[115]:

text_1 = gutenberg.open('10985.txt').read()  # The Infant System by Samuel Wilderspin
text_2 = gutenberg.open('42547.txt').read()  # The Art and Practice of Silver Printing by Abney and Robinson
text_3 = gutenberg.open('10773.txt').read()  # Ancient and Modern Physics by Thomas Edgar Willson
text_4 = gutenberg.open('51397.txt').read()  # People Soup by Alan Arkin
text_5 = gutenberg.open('17699.txt').read()  # The Evolution of Love by Emil Lucka
text_6 = gutenberg.open('29420.txt').read()  # American Rural Highways by T. R. Agg
text_7 = gutenberg.open('389.txt').read()  # The Great God Pan by Arthur Machen

# In[116]:


class Preprocess(