from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer  # added: needed below
from sklearn import metrics
import numpy as np  # added
from twenty_newsgroups import load_20newsgroups  # added

categories = ['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']

# twenty_train = fetch_20newsgroups(subset='train', categories=categories)
twenty_train = load_20newsgroups(data_home='./', subset='train', categories=categories)  # added

# tokenizing text with scikit-learn
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)

# tf-idf can be computed as follows:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
# print(type(X_train_tfidf))

# train classifier
clf = MultinomialNB().fit(X_train_tfidf, twenty_train.target)
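# --- Hedged addition (not in the original script): a minimal evaluation sketch,
# assuming the local `twenty_newsgroups` loader also exposes a 'test' subset.
# The transformers fitted on the training data are reused, never refitted.
twenty_test = load_20newsgroups(data_home='./', subset='test', categories=categories)
X_test_counts = count_vect.transform(twenty_test.data)      # reuse the fitted vocabulary
X_test_tfidf = tfidf_transformer.transform(X_test_counts)   # reuse the fitted idf weights
predicted = clf.predict(X_test_tfidf)
print('Accuracy: %.3f' % np.mean(predicted == twenty_test.target))
print(metrics.classification_report(twenty_test.target, predicted,
                                    target_names=twenty_test.target_names))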
from twenty_newsgroups import load_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
import numpy as np
# We use a Logistic Regression classifier rather than Naive Bayes, because the
# latter cannot handle the negative feature values that commonly appear after
# dimension reduction.
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import TruncatedSVD
import matplotlib.pyplot as plt
from wordcloud import WordCloud

categories = ['alt.atheism', 'comp.sys.mac.hardware', 'rec.sport.baseball', 'sci.med']
train_data = load_20newsgroups(data_home='./', subset='train', categories=categories)
test_data = load_20newsgroups(data_home='./', subset='test', categories=categories)
# print(len(train_data.data))  # 2249
# print(len(test_data.data))   # 1497

'''
# list out all the category names in the dataset
print(train_data.target_names)  # ['alt.atheism', 'comp.sys.mac.hardware', 'rec.sport.baseball', 'sci.med']
print(train_data.target_names[train_data.target[2000]])  # sci.med
print(train_data.data[2000])  # content
print(train_data.target_names[0])  # alt.atheism
'''
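# --- Hedged addition (not in the original script): a minimal sketch of the
# TruncatedSVD + LogisticRegression workflow the imports and the comment above
# point to. The component count (100) and max_iter are assumed, untuned values.
count_vect = CountVectorizer()
tfidf = TfidfTransformer()
X_train = tfidf.fit_transform(count_vect.fit_transform(train_data.data))
X_test = tfidf.transform(count_vect.transform(test_data.data))

svd = TruncatedSVD(n_components=100, random_state=42)  # LSA-style dimension reduction
X_train_svd = svd.fit_transform(X_train)
X_test_svd = svd.transform(X_test)

lr = LogisticRegression(max_iter=1000).fit(X_train_svd, train_data.target)
predicted = lr.predict(X_test_svd)
print('Accuracy after SVD: %.3f' % metrics.accuracy_score(test_data.target, predicted))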
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from sklearn.cluster import KMeans
import numpy as np
from twenty_newsgroups import load_20newsgroups
from wordcloud import WordCloud
import matplotlib.pyplot as plt

text = ('To introduce the fundamental concepts as well as practical applications of contemporary '
        'Artificial Intelligence incorporating knowledge discovery and data mining social '
        'network intelligence and intelligent agents and advanced Information Technology in the '
        'context of Web empowered social computing systems environments and activities To '
        'discuss the techniques and issues central to the development of social computing and Web '
        'intelligence systems')

dataset = load_20newsgroups(data_home='./', subset='all', categories=None)
labels = dataset.target
k = 6
print("k = ", k)

# For detailed usage of class WordCloud(), refer to the following link:
# https://amueller.github.io/word_cloud/generated/wordcloud.WordCloud.html#wordcloud.WordCloud
wordcloud = WordCloud().generate(text)
# Output the generated file to the current folder
wordcloud.to_file('wordcloud.png')

# For detailed usage of matplotlib.pyplot, refer to the following links:
# Matplotlib: https://matplotlib.org/index.html
# matplotlib.pyplot.imshow: https://matplotlib.org/api/_as_gen/matplotlib.pyplot.imshow.html
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()
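# --- Hedged addition (not in the original script): a minimal k-means sketch
# using the otherwise unused TfidfVectorizer, KMeans, `labels`, and `k` above.
# max_features, stop_words, and n_init are assumed values, not tuned choices.
vectorizer = TfidfVectorizer(max_features=10000, stop_words='english')
X = vectorizer.fit_transform(dataset.data)
km = KMeans(n_clusters=k, n_init=10, random_state=42).fit(X)
# compare the clustering against the true newsgroup labels
print('Homogeneity: %.3f' % metrics.homogeneity_score(labels, km.labels_))
print('Completeness: %.3f' % metrics.completeness_score(labels, km.labels_))
print('Adjusted Rand Index: %.3f' % metrics.adjusted_rand_score(labels, km.labels_))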
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.linear_model import SGDClassifier
import numpy as np
from twenty_newsgroups import load_20newsgroups
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import TruncatedSVD

# all_dataset = load_20newsgroups(data_home='./test', subset='all')
# training_dataset = load_20newsgroups(data_home='./test', subset='train')
# test_dataset = load_20newsgroups(data_home='./WI2020_Data', subset='test')

#### word cloud
categories = ['happy', 'sad']
for category in categories:
    # `categories` must be passed as a list here
    train_dataset1 = load_20newsgroups(data_home='./test', subset='train', categories=[category])
    string = str(train_dataset1.data)  # render the list of documents as one string
    # print(string)
    wordcloud = WordCloud().generate(string)
    # wordcloud.to_file(category + ".png")
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()

# dataset = load_20newsgroups(data_home='./WI2020_Data', subset='all', categories=['sci.med'])
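# --- Hedged addition (not in the original script): a sketch of the Pipeline the
# unused Pipeline/LogisticRegression/TruncatedSVD imports suggest. It assumes
# the local './test' data also has a 'test' subset for the same two categories;
# n_components and max_iter are assumed, untuned values.
train_ds = load_20newsgroups(data_home='./test', subset='train', categories=categories)
test_ds = load_20newsgroups(data_home='./test', subset='test', categories=categories)
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('svd', TruncatedSVD(n_components=100, random_state=42)),
    ('clf', LogisticRegression(max_iter=1000)),
])
text_clf.fit(train_ds.data, train_ds.target)
print('Pipeline accuracy: %.3f' %
      metrics.accuracy_score(test_ds.target, text_clf.predict(test_ds.data)))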
from twenty_newsgroups import load_20newsgroups

# The reference for the 20 Newsgroups data set can be found here:
# http://qwone.com/~jason/20Newsgroups/
newsgroups_data = load_20newsgroups(data_home='./', subset='all')

# list out all the category names in the dataset
print('newsgroups_data.target_names:')
print(newsgroups_data.target_names)
print('')

print('Size of newsgroups_data.data: %d' % len(newsgroups_data.data))
for i in range(3):
    print('Doc Number %d' % i)
    print('Target Index: %d' % newsgroups_data.target[i])
    print('Doc Type: %s' % newsgroups_data.target_names[newsgroups_data.target[i]])
    print(newsgroups_data.data[i])
    print('')
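# --- Hedged addition (not in the original script): a quick per-category
# document count with collections.Counter, as a sanity check on class balance.
from collections import Counter

counts = Counter(newsgroups_data.target)
for idx, name in enumerate(newsgroups_data.target_names):
    print('%-30s %d' % (name, counts[idx]))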
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.linear_model import SGDClassifier
import numpy as np
from twenty_newsgroups import load_20newsgroups
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import TruncatedSVD

## question A
## load the training data and all the data
all_dataset = load_20newsgroups(data_home='./WI2020_Data', subset='all')
training_dataset = load_20newsgroups(data_home='./WI2020_Data', subset='train')
test_dataset = load_20newsgroups(data_home='./WI2020_Data', subset='test')
# print(training_dataset.target)

# list out all the category names in the dataset
print('all_dataset.target_names:', all_dataset.target_names)
print('training_dataset.target_names:', training_dataset.target_names)
print('')
# print('Size of dataset.data: %d' % len(all_dataset.data))
# print('Size of training_dataset.data: %d' % len(training_dataset.data))
print('')

# for i in range(3):
#     print('Doc Number %d' % i)
#     print('Target Index: %d' % all_dataset.target[i])  ## ??
#     print('Doc Type: %s' % all_dataset.target_names[all_dataset.target[i]])
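# --- Hedged addition (not in the original script): a minimal train/evaluate
# sketch for the loaded splits using the MultinomialNB and Pipeline imports
# above. TfidfTransformer is an assumption; the original imports only
# CountVectorizer.
from sklearn.feature_extraction.text import TfidfTransformer

nb_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])
nb_clf.fit(training_dataset.data, training_dataset.target)
predicted = nb_clf.predict(test_dataset.data)
print('Naive Bayes accuracy: %.3f' % metrics.accuracy_score(test_dataset.target, predicted))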