def getSVDChart(sample_size=28000, n_components=5, limit=5000):
    """Chart a random sample of the training set after truncated SVD.

    Loads the training data through ``DataSet``, draws a random subset of
    rows without replacement, vectorizes them with ``CountVectorizer``,
    reduces the count matrix with ``TruncatedSVD`` and hands the reduced
    matrix and labels to ``showChart``.

    Args:
        sample_size: Maximum number of rows to draw. Clamped to the number
            of available rows so that small datasets do not crash.
        n_components: Number of SVD components to keep.
        limit: Passed through to ``showChart`` as its final positional
            argument.
    """
    data_set = DataSet()
    # The third returned value is not needed for the chart.
    data, label, _ = data_set.get_train_data_set()

    # random.sample raises ValueError when asked for more items than the
    # population holds, so never request more rows than exist.
    row_count = len(data)
    indexs = random.sample(range(row_count), min(sample_size, row_count))
    # NOTE(review): assumes data/label accept a list of indices
    # (e.g. NumPy arrays) -- confirm against DataSet.
    data = data[indexs]
    label = label[indexs]

    count_vect = CountVectorizer()
    X_train_counts = count_vect.fit_transform(data)

    # random_state pins the SVD itself; the row sample above is still drawn
    # from the unseeded global RNG, so the chart differs between runs.
    truncatedSVD = TruncatedSVD(
        n_components=n_components, n_iter=7, random_state=42
    )
    truncatedSVD = truncatedSVD.fit(X_train_counts)
    X_r = truncatedSVD.transform(X_train_counts)

    # NOTE(review): the title says "PCA" although the reduction is
    # TruncatedSVD; left unchanged because it is user-visible output.
    showChart(X_r, label, "PCA metric graph", len(X_r), limit)
def getChiChart(numComp=5, sample_size=10000, limit=3000):
    """Chart a random sample of the training set after chi-squared selection.

    Loads the training data through ``DataSet``, draws a random subset of
    rows without replacement, vectorizes them with ``CountVectorizer``,
    keeps the ``numComp`` best features according to ``chi2`` and hands the
    selected columns and labels to ``showChart``. The shape and values of
    the per-feature chi-squared scores are printed along the way.

    Args:
        numComp: Number of features ``SelectKBest`` keeps (its ``k``).
        sample_size: Maximum number of rows to draw. Clamped to the number
            of available rows so that small datasets do not crash.
        limit: Passed to ``showChart`` as its ``limit`` keyword argument.
    """
    data_set = DataSet()
    # The third returned value is not needed for the chart.
    data, label, _ = data_set.get_train_data_set()

    # random.sample raises ValueError when asked for more items than the
    # population holds, so never request more rows than exist.
    row_count = len(data)
    indexs = random.sample(range(row_count), min(sample_size, row_count))
    # NOTE(review): assumes data/label accept a list of indices
    # (e.g. NumPy arrays) -- confirm against DataSet.
    data = data[indexs]
    label = label[indexs]

    count_vect = CountVectorizer()
    X_train_counts = count_vect.fit_transform(data)

    # Feature selection is supervised: the scores depend on the labels.
    chi2_model = SelectKBest(chi2, k=numComp)
    chi2_model = chi2_model.fit(X_train_counts, label)
    X_new = chi2_model.transform(X_train_counts)

    print(chi2_model.scores_.shape)
    print(chi2_model.scores_)

    showChart(
        x=X_new,
        y=label,
        title="Chi squared",
        range_=X_new.shape[0],
        limit=limit,
    )
import random

import matplotlib.pyplot as plt
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline

from conf_lib import plot_confusion_matrix
from data import DataSet

# Load the training set; the third returned value is kept as class_names.
data_set = DataSet()
data, label, class_names = data_set.get_train_data_set()

# Work on a random subset of at most 50000 rows. random.sample raises
# ValueError when asked for more items than the population holds, so the
# request is clamped to the number of rows actually available.
indexs = random.sample(range(len(data)), min(50000, len(data)))
# NOTE(review): assumes data/label accept a list of indices
# (e.g. NumPy arrays) -- confirm against DataSet.
data = data[indexs]
label = label[indexs]

# Hold out 33% of the sampled rows; random_state fixes the split for a
# given sample, but the sample itself comes from the unseeded global RNG.
X_train, X_test, y_train, y_test = train_test_split(
    data, label, test_size=0.33, random_state=42
)

# Pipeline: token counts -> 10-component truncated SVD -> Gaussian naive
# Bayes, fitted on the training split.
est = [
    ('count_vect', CountVectorizer()),
    ('tr', TruncatedSVD(n_components=10, n_iter=100, random_state=42)),
    ('clf_NB', GaussianNB()),
]
pipeline_NB = Pipeline(est)
pipeline_NB = pipeline_NB.fit(X_train, y_train)