def classifyLR(train, test):
    # Fit a logistic regression on the training features, predict on the test
    # features, and report statistics plus an ROC curve.
    # `classifier`, `twenty_train`, `twenty_test` and `hlp` are module-level
    # objects (see the sketch after this function).
    classifier.fit(train, twenty_train.target)
    predicted = classifier.predict(test)
    predicted_probs = classifier.predict_proba(test)
    hlp.getStats(twenty_test.target, predicted)
    hlp.plot_roc(twenty_test.target, predicted_probs[:, 1], 'Logistic Regression')
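# The classification snippets in this file rely on module-level objects defined
# elsewhere in the project. A minimal sketch of how they might be prepared,
# assuming the 20 Newsgroups dataset and a plain logistic regression; the exact
# category subset, solver and feature pipeline are assumptions, not from the source.
from sklearn.datasets import fetch_20newsgroups
from sklearn.linear_model import LogisticRegression
import helper as hlp  # project helper module providing getStats and plot_roc

twenty_train = fetch_20newsgroups(subset='train', shuffle=True, random_state=42)
twenty_test = fetch_20newsgroups(subset='test', shuffle=True, random_state=42)
classifier = LogisticRegression()

# train_matrix and test_matrix (hypothetical names) stand in for the reduced
# feature matrices produced earlier in the pipeline:
# classifyLR(train_matrix, test_matrix)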
def classifyWithSVC(valC):
    # Linear-kernel SVM with probability estimates, evaluated on both the
    # LSI (SVD) and NMF feature matrices for min_df values of 2 and 5.
    clf = svm.SVC(C=valC, probability=True, kernel='linear', random_state=42)
    svdListTrain = td.getsvdListTrain()
    nmfListTrain = td.getnmfListTrain()
    svdListTest = td.getsvdListTest()
    nmfListTest = td.getnmfListTest()
    for min_df in [2, 5]:
        print ".......... With min_df = ", min_df, "..........."
        if min_df == 2:
            svd_matrix_train = svdListTrain[0]
            nmf_matrix_train = nmfListTrain[0]
            svd_matrix_test = svdListTest[0]
            nmf_matrix_test = nmfListTest[0]
        else:
            svd_matrix_train = svdListTrain[1]
            nmf_matrix_train = nmfListTrain[1]
            svd_matrix_test = svdListTest[1]
            nmf_matrix_test = nmfListTest[1]

        print "With SVD"
        clf.fit(svd_matrix_train, twenty_train.target)
        predicted = clf.predict(svd_matrix_test)
        probabilities = clf.predict_proba(svd_matrix_test)
        hlp.getStats(twenty_test.target, predicted)
        hlp.plot_roc(twenty_test.target, probabilities[:, 1], 'SVM')

        print "With NMF"
        clf.fit(nmf_matrix_train, twenty_train.target)
        predicted = clf.predict(nmf_matrix_test)
        probabilitiesnmf = clf.predict_proba(nmf_matrix_test)
        hlp.getStats(twenty_test.target, predicted)
        hlp.plot_roc(twenty_test.target, probabilitiesnmf[:, 1], 'SVM')
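# Usage sketch (assumption, not from the source): the penalty values below are
# illustrative only, chosen to contrast soft- and hard-margin behaviour.
for c in [0.001, 1, 1000]:
    classifyWithSVC(c)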
def computeWithScaling():
    # Compare k-means on the reduced features after different non-linear
    # transformations: unit-variance scaling, a log transform, and both orders
    # of combining the two.
    print "-------------- Scaled SVD ----------------"
    svd_old = svd_matrix[:, 0:100]
    scaled_svd_matrix = preprocessing.scale(svd_old, with_mean=False)
    kmeans = hlp.getKmeans(20)
    svd_new = hlp.getSVD(2)
    svd_matrix_new = svd_new.fit_transform(scaled_svd_matrix)
    kmeans.fit(scaled_svd_matrix)
    hlp.plot20Clusters(svd_matrix_new, kmeans, "clusters_svd_scaled_20classes.png")
    hlp.getStats(labels, kmeans.labels_)

    print "-------------- Scaled NMF ----------------"
    nmf_old = hlp.getNMF(10)
    nmf_matrix = nmf_old.fit_transform(tfidf_matrix)
    scaled_nmf_matrix = preprocessing.scale(nmf_matrix, with_mean=False)
    kmeans = hlp.getKmeans(20)
    nmf_new = hlp.getNMF(2)
    nmf_matrix_new = nmf_new.fit_transform(scaled_nmf_matrix)
    kmeans.fit(scaled_nmf_matrix)
    hlp.plot20Clusters(nmf_matrix_new, kmeans, "clusters_nmf_scaled_20classes.png")
    hlp.getStats(labels, kmeans.labels_)

    print "-------------- Logarithmic NMF ----------------"
    nmf_matrix = nmf_old.fit_transform(tfidf_matrix)
    log_matrix = np.log(nmf_matrix + 1)
    nmf_new = hlp.getNMF(2)
    nmf_matrix_new = nmf_new.fit_transform(log_matrix)
    kmeans.fit(log_matrix)
    hlp.plot20Clusters(nmf_matrix_new, kmeans, "clusters_nmf_log_20classes.png")
    hlp.getStats(labels, kmeans.labels_)

    print "-------------- Log scaled NMF ----------------"
    nmf_matrix = nmf_old.fit_transform(tfidf_matrix)
    log_matrix = np.log(nmf_matrix + 1)
    nmf_matrix_scaled = preprocessing.scale(log_matrix, with_mean=False)
    nmf_new = hlp.getNMF(2)
    nmf_matrix_new = nmf_new.fit_transform(nmf_matrix_scaled)
    kmeans.fit(nmf_matrix_scaled)
    hlp.plot20Clusters(nmf_matrix_new, kmeans, "clusters_nmf_log_scaled_20classes.png")
    hlp.getStats(labels, kmeans.labels_)

    print "-------------- Scaled log NMF ----------------"
    nmf_matrix = nmf_old.fit_transform(tfidf_matrix)
    scaled_matrix = preprocessing.scale(nmf_matrix, with_mean=False)
    log_scaled_nmf = np.log(scaled_matrix + 1)
    nmf_new = hlp.getNMF(2)
    nmf_matrix_new = nmf_new.fit_transform(log_scaled_nmf)
    kmeans.fit(log_scaled_nmf)
    hlp.plot20Clusters(nmf_matrix_new, kmeans, "clusters_nmf_scaled_log_20classes.png")
    hlp.getStats(labels, kmeans.labels_)
def classify(train, test, obj):
    # Wrap the given base estimator in one-vs-one and one-vs-rest multiclass
    # schemes and report test-set statistics for both.
    classifier_ovo = OneVsOneClassifier(obj)
    classifier_ovr = OneVsRestClassifier(obj)
    classifier_ovo.fit(train, twenty_train.target)
    classifier_ovr.fit(train, twenty_train.target)
    print "Testing"
    predicted_ovo = classifier_ovo.predict(test)
    predicted_ovr = classifier_ovr.predict(test)
    print "One vs One"
    hlp.getStats(twenty_test.target, predicted_ovo)
    print "One vs Rest"
    hlp.getStats(twenty_test.target, predicted_ovr)
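# Usage sketch (assumption, not from the source): any scikit-learn estimator can
# be passed as the base classifier, e.g. a linear SVM or Gaussian naive Bayes;
# train_matrix and test_matrix are hypothetical names for the reduced feature
# matrices produced earlier.
from sklearn import svm
from sklearn.naive_bayes import GaussianNB

classify(train_matrix, test_matrix, svm.LinearSVC(random_state=42))
classify(train_matrix, test_matrix, GaussianNB())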
def api_stats():
    # Flask endpoint: read an SQLite database path from the query string, load
    # the `measurement` table, and return summary statistics as JSON.
    database = os.path.expanduser(flask.request.args.get('database'))
    if not os.path.isfile(database):
        return flask.abort(404)
    with sqlite3.connect(database) as conn:
        data = pandas.read_sql_query('select * from measurement;', conn)
    stats = helper.getStats(data)
    return flask.jsonify(stats)
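# Usage sketch (assumption, not from the source): the Flask app object, the route
# path and the database filename below are hypothetical, chosen only to show how
# api_stats could be wired up and exercised through Flask's test client.
app = flask.Flask(__name__)
app.add_url_rule('/api/stats', 'api_stats', api_stats)

with app.test_client() as client:
    resp = client.get('/api/stats', query_string={'database': '~/data/measurements.db'})
    print(resp.status_code)  # 404 unless the hypothetical database file exists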
def compute4a():
    # Convert the high-dimensional reduced features to 2-D for plotting, while
    # clustering in the original reduced space.
    print "........ With LSI ........"
    svd_old = svd_matrix[:, 0:100]
    kmeans = hlp.getKmeans(20)
    svd_new = hlp.getSVD(2)
    svd_matrix_new = svd_new.fit_transform(svd_old)
    kmeans.fit(svd_old)
    hlp.plot20Clusters(svd_matrix_new, kmeans, "clusters_2d_svd_best_20classes.png")
    hlp.getStats(labels, kmeans.labels_)

    print "........ With NMF ........"
    nmf = hlp.getNMF(10)
    nmf_matrix = nmf.fit_transform(tfidf_matrix)
    nmf_new = hlp.getNMF(2)
    nmf_matrix_new = nmf_new.fit_transform(nmf_matrix)
    kmeans.fit(nmf_matrix)
    hlp.plot20Clusters(nmf_matrix_new, kmeans, "clusters_2d_nmf_best_20classes.png")
    hlp.getStats(labels, kmeans.labels_)
def getDataPerYear():
    # Flask endpoint: correlate a demographic attribute with a profiler variable
    # for the requested census year and return the underlying data as CSV.
    year = request.args.get('year', default='1970', type=str)
    filename = "Data" + year + ".csv"
    df = helper.getDataFrameBasedOnYear(filename)

    attr1 = request.args.get('attr', default='Sex', type=str)
    if attr1 == 'Immigrant':
        attr1 = "Native"
    attr1 += "_Ratio"
    attr2 = request.args.get('profiler', default='PerCapitaIncome', type=str)

    corr, pval = helper.getStats(df, attr1, attr2)
    if pval < 0.05:
        print("P-value: " + str(pval) + ". STATISTICALLY SIGNIFICANT.")
    else:
        print("P-value: " + str(pval) + ". STATISTICALLY INSIGNIFICANT.")
    helper.writeToFile("stats.txt", corr, pval)
    getStats()  # module-level helper, presumably defined elsewhere in this file
    return df.to_csv()
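# A minimal sketch of what helper.getStats might compute here (assumption: a
# Pearson correlation between the two columns via scipy). The source only shows
# that it returns a (correlation, p-value) pair; the helper name is hypothetical.
from scipy.stats import pearsonr

def getStats_corr_sketch(df, attr1, attr2):
    # Drop rows where either column is missing before correlating.
    sub = df[[attr1, attr2]].dropna()
    corr, pval = pearsonr(sub[attr1], sub[attr2])
    return corr, pval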
nmfListTest = td.getnmfListTest()
classifier = MultinomialNB()
for min_df in [2, 5]:
    print "With min_df = ", min_df
    if min_df == 2:
        nmf_matrix_train = nmfListTrain[0]
        nmf_matrix_test = nmfListTest[0]
        tfidf_matrix_train = tfidfListTrain[0]
        tfidf_matrix_test = tfidfListTest[0]
    else:
        nmf_matrix_train = nmfListTrain[1]
        nmf_matrix_test = nmfListTest[1]
        tfidf_matrix_train = tfidfListTrain[1]
        tfidf_matrix_test = tfidfListTest[1]

    # MultinomialNB requires non-negative features, so the TF-IDF matrix is used
    # directly here rather than the LSI/SVD projection, which can be negative.
    print ".......... With TF-IDF ........."
    classifier.fit(tfidf_matrix_train, twenty_train.target)
    predicted = classifier.predict(tfidf_matrix_test)
    probabilities = classifier.predict_proba(tfidf_matrix_test)
    hlp.getStats(twenty_test.target, predicted)
    hlp.plot_roc(twenty_test.target, probabilities[:, 1], 'MultinomialNB')

    print ".......... With NMF .........."
    classifier.fit(nmf_matrix_train, twenty_train.target)
    predicted = classifier.predict(nmf_matrix_test)
    probabilities = classifier.predict_proba(nmf_matrix_test)
    hlp.getStats(twenty_test.target, predicted)
    hlp.plot_roc(twenty_test.target, probabilities[:, 1], 'MultinomialNB')
import helper as hlp
import task1 as t1

# Two-class clustering of the raw TF-IDF matrix with k-means (k=2).
dataset = hlp.fetch_data()
hlp.classify_into_two_class(dataset)
labels = hlp.fetch_labels(dataset)

tfidf_matrix = t1.getTFIDF_matrix(dataset, 3)
km = hlp.getKmeans(2)
km.fit(tfidf_matrix)
hlp.getStats(labels, km.labels_)
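# A minimal sketch of what hlp.getStats might look like for the clustering
# snippets (assumption based on the five values unpacked from it in getBestR /
# computeBestR: homogeneity, completeness, V-measure, adjusted Rand and adjusted
# mutual information, all available in sklearn.metrics).
from sklearn import metrics

def getStats_clustering_sketch(true_labels, predicted_labels):
    homo = metrics.homogeneity_score(true_labels, predicted_labels)
    comp = metrics.completeness_score(true_labels, predicted_labels)
    vscore = metrics.v_measure_score(true_labels, predicted_labels)
    adjscore = metrics.adjusted_rand_score(true_labels, predicted_labels)
    infoscore = metrics.adjusted_mutual_info_score(true_labels, predicted_labels)
    print("Homogeneity: %0.4f" % homo)
    print("Completeness: %0.4f" % comp)
    print("V-measure: %0.4f" % vscore)
    print("Adjusted Rand score: %0.4f" % adjscore)
    print("Adjusted mutual information: %0.4f" % infoscore)
    return homo, comp, vscore, adjscore, infoscore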
import helper as hlp
import task1 as t1
from sklearn.cluster import KMeans
import numpy as np
import matplotlib.pyplot as plt

dataset = hlp.fetch_data()
hlp.classify_into_two_class(dataset)
labels = hlp.fetch_labels(dataset)
tfidf_matrix = t1.getTFIDF_matrix(dataset, 3)
kmeans = hlp.getKmeans(2)

# Cluster on the LSI (truncated SVD) representation and plot the 2-D view.
svd = hlp.getSVD(3)
svd_matrix = svd.fit_transform(tfidf_matrix)
kmeans.fit(svd_matrix)
hlp.plotClusters(svd_matrix, kmeans, "clusters_2d_svd_best.png")
hlp.getStats(labels, kmeans.labels_)

# Repeat with the NMF representation.
nmf = hlp.getNMF(10)
nmf_matrix = nmf.fit_transform(tfidf_matrix)
kmeans.fit(nmf_matrix)
hlp.plotClusters(nmf_matrix, kmeans, "clusters_2d_nmf_best.png")
hlp.getStats(labels, kmeans.labels_)
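# A minimal sketch of what hlp.plotClusters might do (assumption: scatter the
# first two components of the reduced matrix, coloured by cluster assignment,
# and save the figure under the given filename).
import matplotlib.pyplot as plt

def plotClusters_sketch(matrix, kmeans_model, filename):
    plt.figure()
    plt.scatter(matrix[:, 0], matrix[:, 1], c=kmeans_model.labels_, s=10, cmap='viridis')
    plt.xlabel('Component 1')
    plt.ylabel('Component 2')
    plt.savefig(filename)
    plt.close()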
def getBestR(tfidf_matrix, num):
    # Sweep the reduction rank r for both LSI (truncated SVD) and NMF, cluster
    # with k-means, and record five clustering metrics for each r.
    rank_list = [1, 2, 3, 5, 10, 20, 50, 100, 300]
    homo_list_svd = []
    comp_list_svd = []
    vscore_list_svd = []
    adjscore_list_svd = []
    infoscore_list_svd = []
    homo_list_nmf = []
    comp_list_nmf = []
    vscore_list_nmf = []
    adjscore_list_nmf = []
    infoscore_list_nmf = []

    for r in rank_list:
        print "................. For r=", r, "......................\n"
        svd = hlp.getSVD(r)
        svd_matrix = svd.fit_transform(tfidf_matrix)
        nmf = hlp.getNMF(r)
        nmf_matrix = nmf.fit_transform(tfidf_matrix)
        km = hlp.getKmeans(num)

        print "*******With LSI********"
        km.fit(svd_matrix)
        homo, comp, vscore, adjscore, infoscore = hlp.getStats(labels, km.labels_)
        homo_list_svd.append(homo)
        comp_list_svd.append(comp)
        vscore_list_svd.append(vscore)
        adjscore_list_svd.append(adjscore)
        infoscore_list_svd.append(infoscore)
        print ""

        print "*******With NMF********"
        km.fit(nmf_matrix)
        homo, comp, vscore, adjscore, infoscore = hlp.getStats(labels, km.labels_)
        homo_list_nmf.append(homo)
        comp_list_nmf.append(comp)
        vscore_list_nmf.append(vscore)
        adjscore_list_nmf.append(adjscore)
        infoscore_list_nmf.append(infoscore)

    # Plot each metric against the rank for both decompositions.
    print "*******With LSI********"
    plt.plot(rank_list, homo_list_svd)
    plt.ylabel('Homogeneity Score')
    plt.show()
    plt.plot(rank_list, comp_list_svd)
    plt.ylabel('Completeness Score')
    plt.show()
    plt.plot(rank_list, vscore_list_svd)
    plt.ylabel('V-measure Score')
    plt.show()
    plt.plot(rank_list, adjscore_list_svd)
    plt.ylabel('Adjusted Rand Score')
    plt.show()
    plt.plot(rank_list, infoscore_list_svd)
    plt.ylabel('Adjusted Mutual Info Score')
    plt.show()

    print "*******With NMF********"
    plt.plot(rank_list, homo_list_nmf)
    plt.ylabel('Homogeneity Score')
    plt.show()
    plt.plot(rank_list, comp_list_nmf)
    plt.ylabel('Completeness Score')
    plt.show()
    plt.plot(rank_list, vscore_list_nmf)
    plt.ylabel('V-measure Score')
    plt.show()
    plt.plot(rank_list, adjscore_list_nmf)
    plt.ylabel('Adjusted Rand Score')
    plt.show()
    plt.plot(rank_list, infoscore_list_nmf)
    plt.ylabel('Adjusted Mutual Info Score')
    plt.show()
def computeBestR():
    # Same rank sweep as getBestR, but for the 20-class setting. rank_list, the
    # metric accumulator lists, svd_matrix, tfidf_matrix and labels are expected
    # to be defined at module level.
    for r in rank_list:
        print "................. For r=", r, "......................\n"
        svd = svd_matrix[:, 0:r]
        nmf = hlp.getNMF(r)
        nmf_matrix = nmf.fit_transform(tfidf_matrix)
        km = hlp.getKmeans(20)

        print "*******With LSI********"
        km.fit(svd)
        homo, comp, vscore, adjscore, infoscore = hlp.getStats(labels, km.labels_)
        homo_list_svd.append(homo)
        comp_list_svd.append(comp)
        vscore_list_svd.append(vscore)
        adjscore_list_svd.append(adjscore)
        infoscore_list_svd.append(infoscore)
        print ""

        print "*******With NMF********"
        km.fit(nmf_matrix)
        homo, comp, vscore, adjscore, infoscore = hlp.getStats(labels, km.labels_)
        homo_list_nmf.append(homo)
        comp_list_nmf.append(comp)
        vscore_list_nmf.append(vscore)
        adjscore_list_nmf.append(adjscore)
        infoscore_list_nmf.append(infoscore)

    print ".............With LSI............."
    plt.plot(rank_list, homo_list_svd)
    plt.ylabel('Homogeneity Score')
    plt.show()
    plt.plot(rank_list, comp_list_svd)
    plt.ylabel('Completeness Score')
    plt.show()
    plt.plot(rank_list, vscore_list_svd)
    plt.ylabel('V-measure Score')
    plt.show()
    plt.plot(rank_list, adjscore_list_svd)
    plt.ylabel('Adjusted Rand Score')
    plt.show()
    plt.plot(rank_list, infoscore_list_svd)
    plt.ylabel('Adjusted Mutual Info Score')
    plt.show()

    print "............With NMF............."
    plt.plot(rank_list, homo_list_nmf)
    plt.ylabel('Homogeneity Score')
    plt.show()
    plt.plot(rank_list, comp_list_nmf)
    plt.ylabel('Completeness Score')
    plt.show()
    plt.plot(rank_list, vscore_list_nmf)
    plt.ylabel('V-measure Score')
    plt.show()
    plt.plot(rank_list, adjscore_list_nmf)
    plt.ylabel('Adjusted Rand Score')
    plt.show()
    plt.plot(rank_list, infoscore_list_nmf)
    plt.ylabel('Adjusted Mutual Info Score')
    plt.show()
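# The repeated plot/ylabel/show blocks above could be collapsed into a small
# helper; a sketch (assumption, not from the source), where metric_lists maps
# each metric name to its list of scores per rank:
def plot_metric_curves(rank_list, metric_lists):
    for name, values in metric_lists.items():
        plt.plot(rank_list, values)
        plt.xlabel('Number of components r')
        plt.ylabel(name)
        plt.show()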