from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
import numpy as np

# added
from twenty_newsgroups import load_20newsgroups
# added

# Local import: the import header at the top of this script does not bring
# these two extractors into scope, so without it the script stops with a
# NameError at the first CountVectorizer() call below.
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Newsgroups to train on.
categories = [
    'alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian'
]
# twenty_train = fetch_20newsgroups(subset='train', categories=categories)

# added
# Load the 'train' subset for the categories above, with data_home='./'.
twenty_train = load_20newsgroups(data_home='./',
                                 subset='train',
                                 categories=categories)
# added

# Tokenize the documents and build the term-count matrix.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)

# Re-weight the raw counts with tf-idf.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
# print(type(X_train_tfidf))

# Fit a multinomial Naive Bayes classifier on the tf-idf features.
clf = MultinomialNB().fit(X_train_tfidf, twenty_train.target)
from twenty_newsgroups import load_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
import numpy as np
from sklearn.linear_model import LogisticRegression  # We use a Logistic Regression classifier rather than Naïve Bayes, because the latter cannot handle the negative feature values that are commonly seen after dimensionality reduction.
from sklearn.decomposition import TruncatedSVD
import matplotlib.pyplot as pt
from wordcloud import WordCloud

# The four newsgroups used for both the training and the test subset.
newsgroup_categories = [
    'alt.atheism', 'comp.sys.mac.hardware',
    'rec.sport.baseball', 'sci.med'
]

# Each call gets its own copy of the category list, exactly as when the list
# was written out twice.
train_data = load_20newsgroups(data_home='./',
                               subset='train',
                               categories=list(newsgroup_categories))
test_data = load_20newsgroups(data_home='./',
                              subset='test',
                              categories=list(newsgroup_categories))
# print(len(train_data.data))  # 2249
# print(len(test_data.data))  # 1497

# Disabled inspection code. It used to sit inside a ''' string that was never
# closed, which turned the rest of the file into an unterminated string
# literal; it is kept here as ordinary comments instead.
# # list out all the category names in the dataset
# print(train_data.target_names)  # ['alt.atheism', 'comp.sys.mac.hardware', 'rec.sport.baseball', 'sci.med']
# print(train_data.target_names[train_data.target[2000]])  # sci.med
# print(train_data.data[2000])  # content
# print(train_data.target_names[0])  # alt.atheism
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from sklearn.cluster import KMeans
import numpy as np
from twenty_newsgroups import load_20newsgroups
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Sample passage that the word cloud is generated from.
text = 'To introduce the fundamental concepts as well as practical applications of contemporary \
        Artificial Intelligence incorporating knowledge discovery and data mining social \
        network intelligence and intelligent agents and advanced Information Technology in the \
        context of Web empowered social computing systems environments and activities To \
        discuss the techniques and issues central to the development of social computing and Web \
        intelligence systems'

# Load every subset and every category, and keep the labels at hand.
dataset = load_20newsgroups(data_home='./', subset='all', categories=None)
labels = dataset.target
k = 6
print("k = ", k)

# WordCloud API reference:
# https://amueller.github.io/word_cloud/generated/wordcloud.WordCloud.html#wordcloud.WordCloud
# generate() fills the instance in place and returns it, so the instance can
# be created first and populated in a second step.
wordcloud = WordCloud()
wordcloud.generate(text)
# Save the rendered cloud as an image in the current folder.
wordcloud.to_file('wordcloud.png')

# Display the cloud without axes.
# Matplotlib: https://matplotlib.org/index.html
# matplotlib.pyplot.imshow: https://matplotlib.org/api/_as_gen/matplotlib.pyplot.imshow.html
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()
# Example no. 4
# 0
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.linear_model import SGDClassifier
import numpy as np
from twenty_newsgroups import load_20newsgroups
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import TruncatedSVD

# all_dataset = load_20newsgroups(data_home='./test', subset='all')
# training_dataset = load_20newsgroups(data_home='./test', subset='train')
# test_dataset = load_20newsgroups(data_home='./WI2020_Data', subset='test')

#### word cloud
# Draw one word cloud per category from that category's training documents.
categories = ['happy', 'sad']
for category in categories:
    ## categories must be a list in here
    train_dataset1 = load_20newsgroups(data_home='./test',
                                       subset='train',
                                       categories=[category])
    # Join the documents into one text. Stringifying the list itself would
    # feed WordCloud the list's repr (brackets, quotes and escaped "\n"
    # sequences) rather than the document text.
    # NOTE(review): assumes train_dataset1.data is a sequence of str -- confirm.
    string = '\n'.join(train_dataset1.data)
    # print(string)
    wordcloud = WordCloud().generate(string)
    # wordcloud.to_file(category + ".png")
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()
# dataset = load_20newsgroups(data_home='./WI2020_Data', subset='all', categories=['sci.med'])
from twenty_newsgroups import load_20newsgroups

# Background on the 20 Newsgroups data set:
# http://qwone.com/~jason/20Newsgroups/
newsgroups_data = load_20newsgroups(data_home='./', subset='all')

# Show every category name present in the loaded data.
print('newsgroups_data.target_names:')
print(newsgroups_data.target_names)
print()

print('Size of newsgroups_data.data: %d' % len(newsgroups_data.data))

# Print the first three documents together with their target index and
# category name.
for doc_index in range(3):
    print('Doc Number %d' % doc_index)
    target_index = newsgroups_data.target[doc_index]
    print('Target Index: %d' % target_index)
    print('Doc Type: %s' % newsgroups_data.target_names[target_index])
    print(newsgroups_data.data[doc_index])
    print()
# Example no. 6
# 0
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.linear_model import SGDClassifier
import numpy as np
from twenty_newsgroups import load_20newsgroups
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import TruncatedSVD

## question A
## Load the 'all', 'train' and 'test' subsets, in that order, with
## data_home='./WI2020_Data'.
all_dataset, training_dataset, test_dataset = (
    load_20newsgroups(data_home='./WI2020_Data', subset=subset_name)
    for subset_name in ('all', 'train', 'test')
)
# print(training_dataset.target)

# Print the category names of the 'all' and 'train' subsets.
print('all_dataset.target_names:', all_dataset.target_names)
print('training_dataset.target_names:', training_dataset.target_names)
print()

# print('Size of dataset.data: %d' % len(all_dataset.data))
# print('Size of training_dataset.data: %d' % len(training_dataset.data))
print()
# for i in range(3):
#     print('Doc Number %d' % i)
#     print('Target Index: %d' % all_dataset.target[i]) ## ??
#     print('Doc Type: %s' % all_dataset.target_names[all_dataset.target[i]])