from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer  # added: needed below
from sklearn import metrics
import numpy as np  # added
from twenty_newsgroups import load_20newsgroups  # added

categories = ['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']

# twenty_train = fetch_20newsgroups(subset='train', categories=categories)
twenty_train = load_20newsgroups(data_home='./', subset='train', categories=categories)  # added

# tokenizing text with scikit-learn
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)

# tf-idf can be computed as follows:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
# print(type(X_train_tfidf))

# train classifier
clf = MultinomialNB().fit(X_train_tfidf, twenty_train.target)
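# --- Hedged addition (not in the original script): a minimal evaluation sketch,
# assuming the local `twenty_newsgroups` loader also exposes a 'test' subset.
# The transformers fitted on the training data are reused, never refitted.
twenty_test = load_20newsgroups(data_home='./', subset='test', categories=categories)
X_test_counts = count_vect.transform(twenty_test.data)      # reuse the fitted vocabulary
X_test_tfidf = tfidf_transformer.transform(X_test_counts)   # reuse the fitted idf weights
predicted = clf.predict(X_test_tfidf)
print('Accuracy: %.3f' % np.mean(predicted == twenty_test.target))
print(metrics.classification_report(twenty_test.target, predicted,
                                    target_names=twenty_test.target_names))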
from twenty_newsgroups import load_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
import numpy as np
# We use a Logistic Regression classifier rather than Naive Bayes, because the
# latter cannot handle the negative feature values that commonly appear after
# dimension reduction.
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import TruncatedSVD
import matplotlib.pyplot as plt
from wordcloud import WordCloud

categories = ['alt.atheism', 'comp.sys.mac.hardware', 'rec.sport.baseball', 'sci.med']
train_data = load_20newsgroups(data_home='./', subset='train', categories=categories)
test_data = load_20newsgroups(data_home='./', subset='test', categories=categories)
# print(len(train_data.data))  # 2249
# print(len(test_data.data))   # 1497

'''
# list out all the category names in the dataset
print(train_data.target_names)  # ['alt.atheism', 'comp.sys.mac.hardware', 'rec.sport.baseball', 'sci.med']
print(train_data.target_names[train_data.target[2000]])  # sci.med
print(train_data.data[2000])  # content
print(train_data.target_names[0])  # alt.atheism
'''
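# --- Hedged addition (not in the original script): a minimal sketch of the
# TruncatedSVD + LogisticRegression workflow the imports and the comment above
# point to. The component count (100) and max_iter are assumed, untuned values.
count_vect = CountVectorizer()
tfidf = TfidfTransformer()
X_train = tfidf.fit_transform(count_vect.fit_transform(train_data.data))
X_test = tfidf.transform(count_vect.transform(test_data.data))

svd = TruncatedSVD(n_components=100, random_state=42)  # LSA-style dimension reduction
X_train_svd = svd.fit_transform(X_train)
X_test_svd = svd.transform(X_test)

lr = LogisticRegression(max_iter=1000).fit(X_train_svd, train_data.target)
predicted = lr.predict(X_test_svd)
print('Accuracy after SVD: %.3f' % metrics.accuracy_score(test_data.target, predicted))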
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from sklearn.cluster import KMeans
import numpy as np
from twenty_newsgroups import load_20newsgroups
from wordcloud import WordCloud
import matplotlib.pyplot as plt

text = ('To introduce the fundamental concepts as well as practical applications of contemporary '
        'Artificial Intelligence incorporating knowledge discovery and data mining social '
        'network intelligence and intelligent agents and advanced Information Technology in the '
        'context of Web empowered social computing systems environments and activities To '
        'discuss the techniques and issues central to the development of social computing and Web '
        'intelligence systems')

dataset = load_20newsgroups(data_home='./', subset='all', categories=None)
labels = dataset.target
k = 6
print("k = ", k)

# For detailed usage of class WordCloud(), refer to the following link:
# https://amueller.github.io/word_cloud/generated/wordcloud.WordCloud.html#wordcloud.WordCloud
wordcloud = WordCloud().generate(text)
# Output the generated file to the current folder
wordcloud.to_file('wordcloud.png')

# For detailed usage of matplotlib.pyplot, refer to the following links:
# Matplotlib: https://matplotlib.org/index.html
# matplotlib.pyplot.imshow: https://matplotlib.org/api/_as_gen/matplotlib.pyplot.imshow.html
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()
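# --- Hedged addition (not in the original script): a minimal k-means sketch
# using the otherwise unused TfidfVectorizer, KMeans, `labels`, and `k` above.
# max_features, stop_words, and n_init are assumed values, not tuned choices.
vectorizer = TfidfVectorizer(max_features=10000, stop_words='english')
X = vectorizer.fit_transform(dataset.data)
km = KMeans(n_clusters=k, n_init=10, random_state=42).fit(X)
# compare the clustering against the true newsgroup labels
print('Homogeneity: %.3f' % metrics.homogeneity_score(labels, km.labels_))
print('Completeness: %.3f' % metrics.completeness_score(labels, km.labels_))
print('Adjusted Rand Index: %.3f' % metrics.adjusted_rand_score(labels, km.labels_))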
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.linear_model import SGDClassifier
import numpy as np
from twenty_newsgroups import load_20newsgroups
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import TruncatedSVD

# all_dataset = load_20newsgroups(data_home='./test', subset='all')
# training_dataset = load_20newsgroups(data_home='./test', subset='train')
# test_dataset = load_20newsgroups(data_home='./WI2020_Data', subset='test')

#### word cloud
categories = ['happy', 'sad']
for category in categories:
    # `categories` must be passed as a list here
    train_dataset1 = load_20newsgroups(data_home='./test', subset='train', categories=[category])
    string = str(train_dataset1.data)  # render the list of documents as one string
    # print(string)
    wordcloud = WordCloud().generate(string)
    # wordcloud.to_file(category + ".png")
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()

# dataset = load_20newsgroups(data_home='./WI2020_Data', subset='all', categories=['sci.med'])
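# --- Hedged addition (not in the original script): a sketch of the Pipeline the
# unused Pipeline/LogisticRegression/TruncatedSVD imports suggest. It assumes
# the local './test' data also has a 'test' subset for the same two categories;
# n_components and max_iter are assumed, untuned values.
train_ds = load_20newsgroups(data_home='./test', subset='train', categories=categories)
test_ds = load_20newsgroups(data_home='./test', subset='test', categories=categories)
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('svd', TruncatedSVD(n_components=100, random_state=42)),
    ('clf', LogisticRegression(max_iter=1000)),
])
text_clf.fit(train_ds.data, train_ds.target)
print('Pipeline accuracy: %.3f' %
      metrics.accuracy_score(test_ds.target, text_clf.predict(test_ds.data)))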
from twenty_newsgroups import load_20newsgroups

# The reference for the 20 Newsgroups data set can be found here:
# http://qwone.com/~jason/20Newsgroups/
newsgroups_data = load_20newsgroups(data_home='./', subset='all')

# list out all the category names in the dataset
print('newsgroups_data.target_names:')
print(newsgroups_data.target_names)
print('')

print('Size of newsgroups_data.data: %d' % len(newsgroups_data.data))
for i in range(3):
    print('Doc Number %d' % i)
    print('Target Index: %d' % newsgroups_data.target[i])
    print('Doc Type: %s' % newsgroups_data.target_names[newsgroups_data.target[i]])
    print(newsgroups_data.data[i])
    print('')
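# --- Hedged addition (not in the original script): a quick per-category
# document count with collections.Counter, as a sanity check on class balance.
from collections import Counter

counts = Counter(newsgroups_data.target)
for idx, name in enumerate(newsgroups_data.target_names):
    print('%-30s %d' % (name, counts[idx]))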
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.linear_model import SGDClassifier
import numpy as np
from twenty_newsgroups import load_20newsgroups
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import TruncatedSVD

## question A
## load the training data and all the data
all_dataset = load_20newsgroups(data_home='./WI2020_Data', subset='all')
training_dataset = load_20newsgroups(data_home='./WI2020_Data', subset='train')
test_dataset = load_20newsgroups(data_home='./WI2020_Data', subset='test')
# print(training_dataset.target)

# list out all the category names in the dataset
print('all_dataset.target_names:', all_dataset.target_names)
print('training_dataset.target_names:', training_dataset.target_names)
print('')
# print('Size of dataset.data: %d' % len(all_dataset.data))
# print('Size of training_dataset.data: %d' % len(training_dataset.data))
print('')

# for i in range(3):
#     print('Doc Number %d' % i)
#     print('Target Index: %d' % all_dataset.target[i])  ## ??
#     print('Doc Type: %s' % all_dataset.target_names[all_dataset.target[i]])
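# --- Hedged addition (not in the original script): a minimal train/evaluate
# sketch for the loaded splits using the MultinomialNB and Pipeline imports
# above. TfidfTransformer is an assumption; the original imports only
# CountVectorizer.
from sklearn.feature_extraction.text import TfidfTransformer

nb_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])
nb_clf.fit(training_dataset.data, training_dataset.target)
predicted = nb_clf.predict(test_dataset.data)
print('Naive Bayes accuracy: %.3f' % metrics.accuracy_score(test_dataset.target, predicted))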