import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB


def compare_classifiers(file):
    if "banknote" in file:
        used_features = ["Variance", "Skewness", "Curtosis", "Entropy"]
        output_feature = "Class"
        orig = pd.read_csv("banknote_s_orig.csv")
        pert = pd.read_csv("banknote_s_pert.csv")
    elif "diabetes" in file:
        used_features = [
            "Pregnancies", "Glucose", "BloodPressure", "SkinThickness",
            "Insulin", "BMI", "DiabetesPedigreeFunction", "Age"
        ]
        output_feature = "Outcome"
        orig = pd.read_csv("diabetes_orig.csv")
        pert = pd.read_csv("diabetes_pert.csv")
    elif "bank" in file:
        used_features = [
            "Age", "Job", "Marital", "Education", "Default", "Balance",
            "Housing", "Loan", "Contact", "Day", "Month", "Duration",
            "Campaign", "Pdays", "Previous", "Poutcome"
        ]
        output_feature = "Y"
        orig = pd.read_csv("bank_labeled_orig.csv")
        pert = pd.read_csv("bank_labeled_pert.csv")

    # Scale features; note the scaler is fit on the full dataset before the
    # train/test split, so some information leaks into the test set.
    sc = StandardScaler()
    orig_scaled = orig.copy()
    pert_scaled = pert.copy()
    orig_scaled[used_features] = sc.fit_transform(orig_scaled[used_features])
    pert_scaled[used_features] = sc.fit_transform(pert_scaled[used_features])

    y_orig = orig_scaled[output_feature]
    x_orig = orig_scaled.drop([output_feature], axis=1)[used_features]
    y_pert = pert_scaled[output_feature]
    x_pert = pert_scaled.drop([output_feature], axis=1)[used_features]

    x_train_orig, x_test_orig, y_train_orig, y_test_orig = train_test_split(
        x_orig, y_orig, test_size=0.33, random_state=42)
    x_train_pert, x_test_pert, y_train_pert, y_test_pert = train_test_split(
        x_pert, y_pert, test_size=0.33, random_state=42)

    models = [
        DecisionTreeClassifier(), KNeighborsClassifier(), SVC(),
        GaussianProcessClassifier(), RandomForestClassifier(),
        MLPClassifier(), AdaBoostClassifier(),
        QuadraticDiscriminantAnalysis(), GaussianNB()
    ]

    fig = plt.figure()
    for idx, model in enumerate(models):
        model.fit(x_train_orig, y_train_orig)
        y_pred_orig = model.predict(x_test_orig)
        tn_orig, fp_orig, fn_orig, tp_orig = confusion_matrix(
            y_test_orig, y_pred_orig).ravel()

        model.fit(x_train_pert, y_train_pert)
        y_pred_pert = model.predict(x_test_pert)
        tn_pert, fp_pert, fn_pert, tp_pert = confusion_matrix(
            y_test_pert, y_pred_pert).ravel()

        accuracy_orig = (tp_orig + tn_orig) / (tp_orig + tn_orig + fp_orig + fn_orig) * 100
        accuracy_pert = (tp_pert + tn_pert) / (tp_pert + tn_pert + fp_pert + fn_pert) * 100
        precision_orig = tp_orig / (tp_orig + fp_orig) * 100
        precision_pert = tp_pert / (tp_pert + fp_pert) * 100
        recall_orig = tp_orig / (tp_orig + fn_orig) * 100
        recall_pert = tp_pert / (tp_pert + fn_pert) * 100

        print(("\n{}:\n Accuracy original | masked = {:5.2f} | {:5.2f}\n" +
               " Precision original | masked = {:5.2f} | {:5.2f}\n" +
               " Recall original | masked = {:5.2f} | {:5.2f}").format(
                   type(model).__name__, accuracy_orig, accuracy_pert,
                   precision_orig, precision_pert, recall_orig, recall_pert))

        plot_data = [[accuracy_orig, precision_orig, recall_orig],
                     [accuracy_pert, precision_pert, recall_pert]]
        X = np.arange(3)
        ax = fig.add_subplot(3, 3, idx + 1)
        ax.bar(X + 0.00, plot_data[0], color='b', width=0.25, label="Original data")
        ax.bar(X + 0.25, plot_data[1], color='g', width=0.25, label="Masked data")
        ax.set_title(type(model).__name__)
        ax.set_xticks(X)
        ax.set_xticklabels(["Accuracy", "Precision", "Recall"])

    plt.tight_layout()
    plt.legend()
    plt.show()
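# A minimal usage sketch of compare_classifiers (an illustration, not part of
# the original script): the argument only needs to contain one of the dataset
# keywords, and the paired *_orig.csv / *_pert.csv files must be on disk.
if __name__ == "__main__":
    compare_classifiers("banknote_s_orig.csv")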
import numpy as np
from matplotlib import pyplot as plt
from sklearn.metrics import accuracy_score, log_loss
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF

# Generate data
train_size = 50
rng = np.random.RandomState(0)
X = rng.uniform(0, 5, 100)[:, np.newaxis]
y = np.array(X[:, 0] > 2.5, dtype=int)

# Specify Gaussian Processes with fixed and optimized hyperparameters
gp_fix = GaussianProcessClassifier(kernel=1.0 * RBF(length_scale=1.0),
                                   optimizer=None)
gp_fix.fit(X[:train_size], y[:train_size])

gp_opt = GaussianProcessClassifier(kernel=1.0 * RBF(length_scale=1.0))
gp_opt.fit(X[:train_size], y[:train_size])

print("Log Marginal Likelihood (initial): %.3f"
      % gp_fix.log_marginal_likelihood(gp_fix.kernel_.theta))
print("Log Marginal Likelihood (optimized): %.3f"
      % gp_opt.log_marginal_likelihood(gp_opt.kernel_.theta))

print("Accuracy: %.3f (initial) %.3f (optimized)" % (
    accuracy_score(y[:train_size], gp_fix.predict(X[:train_size])),
    accuracy_score(y[:train_size], gp_opt.predict(X[:train_size])),
))
# report log-loss analogously to the accuracy above
print("Log-loss: %.3f (initial) %.3f (optimized)" % (
    log_loss(y[:train_size], gp_fix.predict_proba(X[:train_size])[:, 1]),
    log_loss(y[:train_size], gp_opt.predict_proba(X[:train_size])[:, 1]),
))
# `C` and `kernel` are defined earlier in the script (not shown here).
classifiers = {
    'L2 logistic (Multinomial)': LogisticRegression(C=C, penalty='l2',
                                                    solver='saga',
                                                    multi_class='multinomial',
                                                    max_iter=10000),
    'L2 logistic (OvR)': LogisticRegression(C=C, penalty='l2', solver='saga',
                                            multi_class='ovr', max_iter=10000),
    'Linear SVC': SVC(kernel='linear', C=C, probability=True, random_state=0),
    'GPC': GaussianProcessClassifier(kernel)
}

n_classifiers = len(classifiers)

plt.figure(figsize=(3 * 2, n_classifiers * 2))
plt.subplots_adjust(bottom=.2, top=.95)

xx = np.linspace(3, 9, 100)
yy = np.linspace(1, 5, 100).T
xx, yy = np.meshgrid(xx, yy)
Xfull = np.c_[xx.ravel(), yy.ravel()]

for index, (name, classifier) in enumerate(classifiers.items()):
    classifier.fit(X, y)
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF

# import some data to play with
iris = datasets.load_iris()
X = iris.data[:, :2]  # we only take the first two features
y = np.array(iris.target, dtype=int)

h = .02  # step size in the mesh

kernel = 1.0 * RBF([1.0])
gpc_rbf_isotropic = GaussianProcessClassifier(kernel=kernel).fit(X, y)
kernel = 1.0 * RBF([1.0, 1.0])
gpc_rbf_anisotropic = GaussianProcessClassifier(kernel=kernel).fit(X, y)

# create a mesh to plot in
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

titles = ["Isotropic RBF", "Anisotropic RBF"]
plt.figure(figsize=(10, 5))
for i, clf in enumerate((gpc_rbf_isotropic, gpc_rbf_anisotropic)):
    # Plot the predicted probabilities. For that, we will assign a color to
    # each point in the mesh [x_min, x_max]x[y_min, y_max].
    plt.subplot(1, 2, i + 1)
### WRITE YOUR CODE HERE
# If you get stuck, uncomment the line above to load a correction in this
# cell (then you can execute this code).

from sklearn.gaussian_process import GaussianProcessClassifier

spam_GP = GaussianProcessClassifier()
print(spam_GP.fit(Xtrain.toarray(), ytrain))
print("Score:", spam_GP.score(Xtest.toarray(), ytest))
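# A minimal sketch (not from the original notebook): GaussianProcessClassifier
# scales cubically with the number of samples, so subsampling the (assumed
# sparse) Xtrain keeps both the dense conversion and the fit tractable.
import numpy as np

rng = np.random.RandomState(0)
subset = rng.choice(Xtrain.shape[0], size=min(500, Xtrain.shape[0]),
                    replace=False)
spam_GP_small = GaussianProcessClassifier()
spam_GP_small.fit(Xtrain[subset].toarray(), np.asarray(ytrain)[subset])
print("Subsampled score:", spam_GP_small.score(Xtest.toarray(), ytest))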
def test_predict_consistent(kernel):
    # Check binary predict decision has also predicted probability above 0.5.
    gpc = GaussianProcessClassifier(kernel=kernel).fit(X, y)
    assert_array_equal(gpc.predict(X),
                       gpc.predict_proba(X)[:, 1] >= 0.5)
def test_lml_precomputed(kernel):
    # Test that lml of optimized kernel is stored correctly.
    gpc = GaussianProcessClassifier(kernel=kernel).fit(X, y)
    assert_almost_equal(gpc.log_marginal_likelihood(gpc.kernel_.theta),
                        gpc.log_marginal_likelihood(), 7)
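# The tests above assume module-level fixtures. A hypothetical minimal
# scaffolding (mirroring, not copying, scikit-learn's own test setup):
import numpy as np
from numpy.testing import assert_array_equal, assert_almost_equal
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF, ConstantKernel

rng = np.random.RandomState(0)
X = rng.uniform(0, 5, (30, 1))
y = (X[:, 0] > 2.5).astype(int)
kernels = [RBF(length_scale=1.0), ConstantKernel(1.0) * RBF(length_scale=1.0)]

for kernel in kernels:
    test_predict_consistent(kernel)
    test_lml_precomputed(kernel)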
def fun_classify(inputFile, groupsSel, FeatSelect, Nfeats, scaleFeats=1):
    """
    AllStatsMean, AllStatsSTD = fun_classify(inputFile, groupsSel, FeatSelect, Nfeats)

    inputFile: the .csv file containing feature tables
    groupsSel: the selected groups to classify. The full set is
        ["S", "F", "Z", "N", "O"], but ["S", "F", "Z"] are of most interest
        for the article (ictal, inter-ictal and normal EEG)
    FeatSelect: feature selection method: PCA, RFE, fisher or none
    Nfeats: number of selected features

    Returns:
    AllStatsMean: mean performance values
    AllStatsSTD: standard deviation of performance values
    plus a concatenated per-realization DataFrame.

    Relies on module-level globals: realizations, K_folds and Nmodes.
    """
    # read input features
    dfFeats = pd.read_csv(inputFile, sep=',', header=0)
    # only selected groups
    dfFeats = dfFeats[dfFeats["Group"].isin(groupsSel)]
    if "decTaime" in dfFeats:
        x = dfFeats.iloc[:, 2:]  # ignores decomposition method execution time
    else:
        x = dfFeats.iloc[:, 1:]
    y = dfFeats.iloc[:, 0].values

    if scaleFeats:  # scale feats?
        x = StandardScaler().fit_transform(x)

    # Feature selection
    if x.shape[1] > Nfeats:
        # RFE
        if FeatSelect == "RFE":
            rfeModel = SVC(kernel="linear", C=0.025, probability=True, gamma='scale')
            rfeSelect = RFE(rfeModel, n_features_to_select=Nfeats)
            rfe_fit = rfeSelect.fit(x, y)
            x = x[:, rfe_fit.support_]
        if FeatSelect == "PCA":
            pca = PCA(n_components=Nfeats)
            x = pca.fit_transform(x)
        if FeatSelect == "fisher":
            fisherScore = fisher_score.fisher_score(x, y)
            idx = fisher_score.feature_ranking(fisherScore)
            x = x[:, idx[:Nfeats]]

    names = ["KNN", "Linear SVM", "RBF SVM", "GPC", "MLP"]
    classifiers = [
        KNeighborsClassifier(3),
        SVC(kernel="linear", C=0.025, probability=True, gamma='scale'),
        SVC(probability=True, gamma='scale'),
        GaussianProcessClassifier(1.0 * RBF(1.0)),
        MLPClassifier(alpha=1, max_iter=200)
    ]

    # initialize performance variables
    AllStats = {}
    AllStatsMean = {}
    AllStatsSTD = {}
    for name in names:
        AllStats[name] = {
            "Accuracy": np.zeros([realizations, K_folds]),
            "SensitivityMean": np.zeros([realizations, K_folds]),
            "SpecificityMean": np.zeros([realizations, K_folds]),
            "AUC_Mean": np.zeros([realizations, K_folds]),
            "SensitivityIctal": np.zeros([realizations, K_folds]),
            "SpecificityIctal": np.zeros([realizations, K_folds]),
            "AUC_Ictal": np.zeros([realizations, K_folds]),
            "TTtimes": np.zeros([realizations, K_folds])
        }
        AllStatsMean[name] = {
            "Accuracy": 0., "SensitivityMean": 0., "SpecificityMean": 0.,
            "AUC_Mean": 0., "SensitivityIctal": 0., "SpecificityIctal": 0.,
            "AUC_Ictal": 0., "TTtimes": 0.
        }
        AllStatsSTD[name] = {
            "Accuracy": 0., "SensitivityMean": 0., "SpecificityMean": 0.,
            "AUC_Mean": 0., "SensitivityIctal": 0., "SpecificityIctal": 0.,
            "AUC_Ictal": 0., "TTtimes": 0.
        }

    # for each realization
    for i in range(realizations):
        skf = StratifiedKFold(n_splits=K_folds, shuffle=True)  # K-fold validation
        for tupTemp, ki in zip(skf.split(x, y), range(K_folds)):
            train_idx, test_idx = tupTemp[0], tupTemp[1]
            X_train, X_test = x[train_idx], x[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]

            for name, clf in zip(names, classifiers):  # for each classifier
                tic = time.time()  # check training/testing time of each classifier
                # Fit model and predict
                modelFit = clf.fit(X_train, y_train)
                yPredicted = modelFit.predict(X_test)
                probsTest = modelFit.predict_proba(X_test)
                toc = time.time()

                # AUC - ictal class as positive
                if len(np.unique(y)) > 2:
                    AUCs = roc_auc_score(
                        LabelBinarizer().fit_transform(y_test),
                        probsTest, average=None)
                else:
                    AUCs = roc_auc_score(y_test, probsTest[:, 1], average=None)

                # Sensitivity and specificity
                cMatrix = confusion_matrix(y_test, yPredicted)
                FP = cMatrix.sum(axis=0) - np.diag(cMatrix)
                FN = cMatrix.sum(axis=1) - np.diag(cMatrix)
                TP = np.diag(cMatrix)
                TN = cMatrix.sum() - (FP + FN + TP)
                # Sensitivity
                TPR = TP / (TP + FN)
                # Specificity or true negative rate
                TNR = TN / (TN + FP)

                # fill performance variable
                AllStats[name]["Accuracy"][i, ki] = accuracy_score(y_test, yPredicted)
                AllStats[name]["SensitivityMean"][i, ki] = np.mean(TPR)
                AllStats[name]["SpecificityMean"][i, ki] = np.mean(TNR)
                AllStats[name]["SensitivityIctal"][i, ki] = TPR[0]
                AllStats[name]["SpecificityIctal"][i, ki] = TNR[0]
                AllStats[name]["AUC_Mean"][i, ki] = np.mean(AUCs)
                AllStats[name]["TTtimes"][i, ki] = toc - tic
                if len(np.unique(y)) > 2:
                    AllStats[name]["AUC_Ictal"][i, ki] = AUCs[0]

    AllStatsDF = [0] * len(names)
    for idx, name in enumerate(names):
        for istat in AllStats[name].keys():
            AllStats[name][istat] = np.mean(AllStats[name][istat], axis=1)
            AllStatsMean[name][istat] = np.mean(AllStats[name][istat])
            AllStatsSTD[name][istat] = np.std(AllStats[name][istat])
        AllStatsDF[idx] = pd.DataFrame.from_dict(AllStats[name])
        AllStatsDF[idx]["Nmodes"] = Nmodes
        AllStatsDF[idx]["Classifier"] = name

    return (pd.DataFrame.from_dict(AllStatsMean),
            pd.DataFrame.from_dict(AllStatsSTD),
            pd.concat(AllStatsDF))
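# A minimal call sketch for fun_classify; the file name and the module-level
# globals (realizations, K_folds, Nmodes) are illustrative assumptions.
realizations, K_folds, Nmodes = 10, 5, 5
meanDF, stdDF, allDF = fun_classify("features_Nmodes5.csv",
                                    groupsSel=["S", "F", "Z"],
                                    FeatSelect="PCA", Nfeats=10)
print(meanDF)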
models = [
    # MultinomialNB(alpha=.01),          # very fast, ~0.7
    LogisticRegression(multi_class="auto", solver="lbfgs"),  # fast ~10m and very good ~0.865
    # QuadraticDiscriminantAnalysis(),   # fast ~5m and very good ~0.81
    # DecisionTreeClassifier(),          # fast enough
    # RandomForestClassifier(),          # fast enough
    # GaussianNB(),                      # very fast, performance around 0.58
]

non_models = [
    # xgb.XGBClassifier(objective="multi:softprob"),  # too slow, probably around 10h, and okayish ~0.759
    KNeighborsClassifier(100),  # works but very slow (an hour+); needs ~100 neighbours because there are 20 classes
    SVC(kernel="linear", C=0.025, probability=True),  # too slow to work
    SVC(gamma=2, C=1, probability=True),              # too slow to work but very good ~0.83
    GaussianProcessClassifier(1.0 * RBF(1.0)),        # too slow to work
    MLPClassifier(alpha=1, max_iter=1000),            # fast enough ~80m and really good ~0.81
    AdaBoostClassifier()                              # probably takes 1-2h, also very poor performance
]
non_names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", "Gaussian Process",
             "Neural Net", "AdaBoost"]  # "xgboost",
names = [
    # "Multinomial Naive Bayes",
    "Logistic Regression",
    # "QDA",
    # "Decision Tree",
    # "Random Forest",
    # "Gaussian Naive Bayes"
]


def load_data(dataset='20newsgroups', true_ratio=0.5):
def run_all_classifiers(X_train, X_test, y_train, y_test,
                        print_output_scores_to_csv=False,
                        output_scores_csv_file_suffix='',
                        print_only_table=False):
    """
    Run a fixed list of classifiers; the list was generated by running the
    commented-out all_estimators() code below.

    Args:
        X_train, X_test, y_train, y_test: The train and test datasets.
        print_output_scores_to_csv: If True, the Precision, Recall, F1-Score
            and Support for both classes will be printed to a file named with
            the current date and time.
        output_scores_csv_file_suffix: Suffix added to the csv file just
            before the .csv extension, normally describing the run that is
            being performed.

    Returns:
        dataset: Returns output scores dataset.
    """
    assert isinstance(X_train, pd.core.frame.DataFrame)
    assert isinstance(X_test, pd.core.frame.DataFrame)
    assert isinstance(y_train, pd.core.frame.Series)
    assert isinstance(y_test, pd.core.frame.Series)
    assert isinstance(print_output_scores_to_csv, bool)
    assert isinstance(output_scores_csv_file_suffix, object)

    import time

    # https://stackoverflow.com/questions/42160313/how-to-list-all-classification-regression-clustering-algorithms-in-scikit-learn
    #from sklearn.utils.testing import all_estimators
    #estimators = all_estimators()
    #for name, class_ in estimators:
    #    log_print(name)

    from sklearn.calibration import CalibratedClassifierCV
    from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
    from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
    from sklearn.ensemble import AdaBoostClassifier
    from sklearn.ensemble import BaggingClassifier
    from sklearn.ensemble import ExtraTreesClassifier
    from sklearn.ensemble import GradientBoostingClassifier
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.gaussian_process import GaussianProcessClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.linear_model import LogisticRegressionCV
    from sklearn.linear_model import SGDClassifier
    from sklearn.mixture import BayesianGaussianMixture
    from sklearn.mixture import GaussianMixture
    # DPGMM, GMM and VBGMM were deprecated in scikit-learn 0.18 and removed
    # in 0.20; keep these imports only on old versions.
    #from sklearn.mixture import DPGMM
    #from sklearn.mixture import GMM
    #from sklearn.mixture import VBGMM
    from sklearn.naive_bayes import BernoulliNB
    from sklearn.naive_bayes import GaussianNB
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.neural_network import MLPClassifier
    from sklearn.semi_supervised import LabelPropagation
    from sklearn.semi_supervised import LabelSpreading
    from sklearn.svm import SVC
    from sklearn.tree import DecisionTreeClassifier
    #from xgboost import XGBClassifier

    models = []
    models.append(('AdaBoostClassifier', AdaBoostClassifier()))
    models.append(('BaggingClassifier', BaggingClassifier()))
    models.append(('BayesianGaussianMixture', BayesianGaussianMixture()))
    models.append(('BernoulliNB', BernoulliNB()))
    models.append(('CalibratedClassifierCV', CalibratedClassifierCV()))
    #models.append(('DPGMM', DPGMM()))  # removed in scikit-learn 0.20
    models.append(('DecisionTreeClassifier', DecisionTreeClassifier(random_state=SEED)))
    models.append(('ExtraTreesClassifier', ExtraTreesClassifier(random_state=SEED)))
    #models.append(('GMM', GMM()))  # removed in scikit-learn 0.20
    models.append(('GaussianMixture', GaussianMixture()))
    models.append(('GaussianNB', GaussianNB()))
    models.append(('GaussianProcessClassifier', GaussianProcessClassifier()))
    models.append(('GradientBoostingClassifier', GradientBoostingClassifier()))
    models.append(('KNeighborsClassifier', KNeighborsClassifier()))
    models.append(('LabelPropagation', LabelPropagation()))
    models.append(('LabelSpreading', LabelSpreading()))
    models.append(('LinearDiscriminantAnalysis', LinearDiscriminantAnalysis()))
    models.append(('LogisticRegression', LogisticRegression()))
    models.append(('LogisticRegressionCV', LogisticRegressionCV()))
    models.append(('MLPClassifier', MLPClassifier()))
    #models.append(('MultinomialNB', MultinomialNB()))
    #models.append(('NuSVC', NuSVC()))
    models.append(('QuadraticDiscriminantAnalysis', QuadraticDiscriminantAnalysis()))
    models.append(('RandomForestClassifier', RandomForestClassifier(random_state=SEED)))
    models.append(('SGDClassifier', SGDClassifier()))
    models.append(('SVC', SVC()))
    #models.append(('VBGMM', VBGMM()))  # removed in scikit-learn 0.20
    #models.append(('XGBClassifier', XGBClassifier()))

    output_scores_df = fit_predict_plot(X_train, X_test, y_train, y_test,
                                        models, print_only_table)

    if print_output_scores_to_csv:
        output_scores_df.to_csv(time.strftime(
            'output_scores' + str(output_scores_csv_file_suffix) + '.csv'))

    return output_scores_df


def run_all_classifiers(X_train, X_test, y_train, y_test, print_details=True):
    """
    Run all classifiers of sklearn.

    Note: this second definition shadows the one above if both live in the
    same module.

    Args:
        X_train, X_test, y_train, y_test: The train and test datasets.
        print_details: if True, print details of all models and save a csv
            table; if False, print only a table with a summary of the models.

    Returns:
        dataset: Returns output scores dataset.
    """
    assert isinstance(X_train, pd.core.frame.DataFrame)
    assert isinstance(X_test, pd.core.frame.DataFrame)
    assert isinstance(y_train, pd.core.frame.Series)
    assert isinstance(y_test, pd.core.frame.Series)
    assert isinstance(print_details, bool)

    log_method_execution_time(log_funcname())

    from sklearn.utils.testing import all_estimators
    import sklearn.metrics
    import time
    from src.util.acq_util import RANDOM_SEED

    # https://stackoverflow.com/questions/42160313/how-to-list-all-classification-regression-clustering-algorithms-in-scikit-learn
    #from xgboost import XGBClassifier
    #models.append(('XGBClassifier', XGBClassifier()))

    models = all_estimators(type_filter='classifier')
    output_scores_dataset = pd.DataFrame(
        index=['Precision 0', 'Recall 0', 'F1-Score 0', 'Support 0',
               'Precision 1', 'Recall 1', 'F1-Score 1', 'Support 1'],
        columns=list(zip(*models))[0])

    for name, model in models:
        if print_details is True:
            print('------------------------------------------------------------------------------')
            print(name)
            print('------------------------------------------------------------------------------')

        if (name == 'MultinomialNB' or name == 'NuSVC'
                or name == 'RadiusNeighborsClassifier'
                or name == 'GaussianProcessClassifier'):
            continue

        model = model()
        if 'random_state' in model.get_params():
            model.random_state = SEED

        # Fit the model.
        model.fit(X_train, y_train)

        # Measure accuracy.
        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)
        output_scores_dataset = class_compute_accuracy(
            y_train, y_train_pred, output_scores_dataset,
            ['Accuracy on the train set', name], print_details)
        output_scores_dataset = class_compute_accuracy(
            y_test, y_test_pred, output_scores_dataset,
            ['Accuracy on the test set', name], print_details)

        # Plot the confusion matrix.
        output_scores_dataset = class_compute_plot_confusion_matrix(
            y_test, y_test_pred, output_scores_dataset, name, print_details)

        # Show the classification report.
        if print_details is True:
            print(sklearn.metrics.classification_report(y_test, y_test_pred))

        # Print scores to the output dataset.
        output_scores_dataset = class_compute_recall_precision_f1(
            y_test, y_test_pred, output_scores_dataset, name)

    # idxmax with axis=1 finds the column with the greatest value on each row.
    output_scores_dataset['Max Value'] = output_scores_dataset.apply(max, axis=1)
    #output_scores_dataset['Max Classifier'] = output_scores_dataset.idxmax(axis=1)

    if print_details is True:
        output_scores_dataset.to_csv('output_scores' + '.csv')

    return output_scores_dataset


def train_test_split_for_classification(dataset, label, test_size,
                                        random_state=SEED):
    """
    Select X and y, considering that y has been renamed to label.
    """
    from sklearn.model_selection import train_test_split

    assert isinstance(dataset, pd.core.frame.DataFrame)
    assert isinstance(test_size, float)
    assert isinstance(random_state, int)

    X = dataset.loc[:, dataset.columns != label]
    y = dataset[label]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y)
    log_print('X_train: {}'.format(X_train.shape))
    log_print('y_train: {}'.format(y_train.shape))
    log_print('X_test: {}'.format(X_test.shape))
    log_print('y_test: {}'.format(y_test.shape))
    return X_train, X_test, y_train, y_test
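# A hypothetical end-to-end call chain for the helpers above, assuming a
# DataFrame df with a binary 'target' column and SEED defined at module level
# (fit_predict_plot and the log_* helpers come from the surrounding project).
SEED = 42
X_train, X_test, y_train, y_test = train_test_split_for_classification(
    df, label='target', test_size=0.25)
scores = run_all_classifiers(X_train, X_test, y_train, y_test,
                             print_details=False)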
import pika
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF

import analyzer

# warnings.filterwarnings("ignore")

df = pd.read_csv('data.csv', header=0)
y = df['Genre']
X = df[[
    'Duration', 'Tempo', 'Strength', 'Contrast', 'Fore_Diff', 'Fore_Position'
]]

sc = preprocessing.MinMaxScaler(feature_range=(-1, 1), copy=False)
X = sc.fit_transform(X)

classifier = GaussianProcessClassifier(1.0 * RBF(1.0))
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.67, random_state=27, shuffle=True)
classifier.fit(X_train, y_train)


def detect(data, channel):
    try:
        response = {
            'id': data['id'],
            'status': 'processing',
            'result': {},
import pandas as pd
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import accuracy_score

data = pd.read_csv('finance_data.csv',
                   index_col=['Ticker', 'Fiscal Year', 'Fiscal Period'])
print(data.columns)

Y = data.loc[:, 'pos_neg']
X = data.drop(columns=['pos_neg', 'shifted_chg', 'report_date'])
X = scale(X.values)
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=.2,
                                                    shuffle=False)

h = .02  # step size in the mesh

kernel = 1.0 * RBF(1.0)
gpc = GaussianProcessClassifier(kernel=kernel)
gpc.fit(X_train, y_train)
Z = gpc.predict(X_test)
acc = accuracy_score(y_test, Z)
print(acc)
print(y_test[0:10])
print(Z[0:10])
def gaussian_process_classifier(self):
    model = OneVsRestClassifier(GaussianProcessClassifier()).fit(
        self.train_texts_tfidf, self.train_labels)
    self.save_model(model, self.gpc_filename)
    return model
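# save_model is project-specific and not shown; a minimal sketch using joblib
# (an assumption, not the project's actual helper). These would live on the
# same class as gaussian_process_classifier above.
import joblib

def save_model(self, model, filename):
    joblib.dump(model, filename)  # persist the fitted estimator to disk

def load_model(self, filename):
    return joblib.load(filename)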
    kernelfun = multiplier * gpkernels.RationalQuadratic(
        length_scale, alpha=alpha,
        length_scale_bounds=(length_scale_lb, length_scale_ub),
        alpha_bounds=(alpha_lb, alpha_ub))
else:
    print('It should not have reached here!')
    kernelfun = 1.0 * gpkernels.RBF(1.0)

# RBF, Matern, ConstantKernel, WhiteKernel, RationalQuadratic
# length_scale=1.0, length_scale_bounds=(1e-05, 100000.0), nu=1.5
# length_scale=1.0, alpha=1.0, length_scale_bounds=(1e-05, 100000.0), alpha_bounds=(1e-05, 100000.0)

#device = torch.device("cpu")

## TODO: Define a model
#print('We instantiate the model')
model = GaussianProcessClassifier(kernelfun)

## TODO: Train the model
#print('We fit the model')
model.fit(train_x, train_y)
print('score-training {}'.format(model.score(train_x, train_y)))

## --- End of your code --- ##

# Save the trained model
joblib.dump(model, os.path.join(args.model_dir, "model.joblib"))
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import accuracy_score

classifiers = [
    KNeighborsClassifier(),
    SVC(),
    GaussianProcessClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    MLPClassifier(),
    AdaBoostClassifier(),
    GaussianNB(),
    QuadraticDiscriminantAnalysis()
]

# [height, weight, shoe_size]
X = [[181, 80, 44], [177, 70, 43], [160, 60, 38], [154, 54, 37],
     [166, 65, 40], [190, 90, 47], [175, 64, 39], [177, 70, 40],
     [159, 55, 37], [171, 75, 42], [181, 85, 43]]
Y = [
X_reduced = PCA(n_components=3).fit_transform(X_)
ax.scatter(X_reduced[:, 0], X_reduced[:, 1], X_reduced[:, 2],
           c=Y_.ravel(), cmap=plt.cm.Set1, edgecolor='k', s=40)
ax.set_title("First three PCA directions")
ax.set_xlabel("1st eigenvector")
ax.w_xaxis.set_ticklabels([])
ax.set_ylabel("2nd eigenvector")
ax.w_yaxis.set_ticklabels([])
ax.set_zlabel("3rd eigenvector")
ax.w_zaxis.set_ticklabels([])
plt.show()

from sklearn.gaussian_process.kernels import RBF

kernel = 1.0 * RBF(1.0)
gpc = GaussianProcessClassifier(kernel=kernel, multi_class='one_vs_one',
                                random_state=0).fit(X_, Y_)
# let's see how good our fit on the train set is
print(gpc.score(X_, Y_))

# create the TF neural net
# some hyperparameters
training_epochs = 200
n_neurons_in_h1 = 10
n_neurons_in_h2 = 10
learning_rate = 0.01
dkl_loss_rate = 0.1
n_features = len(X[0])
def evaluateIndividualClassifiers(x, y, train_size_pct):
    """
    evaluateIndividualClassifiers

    x : The features of the dataset to be used for predictions
    y : The target class for each row in "x"
    train_size_pct : {float in the range (0.0, 1.0)} the percentage of the
        dataset that should be used for training
    """
    max_depth_x2 = MAX_DEPTH * 2
    max_iter_x2 = MAX_ITER * 2
    max_iter_x10 = MAX_ITER * 10
    n_neighbors_x2 = N_NEIGHBORS * 2
    n_neighbors_d2 = N_NEIGHBORS // 2

    rf = RandomForestClassifier(max_depth=MAX_DEPTH, criterion='entropy', random_state=SEED)
    rf_x2 = RandomForestClassifier(max_depth=max_depth_x2, criterion='entropy', random_state=SEED)
    et = ExtraTreesClassifier(max_depth=MAX_DEPTH, criterion='entropy', random_state=SEED)
    dectree = DecisionTreeClassifier(max_depth=MAX_DEPTH, random_state=SEED)
    knn = KNeighborsClassifier(n_neighbors=N_NEIGHBORS)
    knn_x2 = KNeighborsClassifier(n_neighbors=n_neighbors_x2)
    knn_d2 = KNeighborsClassifier(n_neighbors=n_neighbors_d2)
    mlpnn = MLPClassifier(max_iter=MAX_ITER)
    mlpnnE = MLPClassifier(max_iter=MAX_ITER, early_stopping=True)
    mlpnn_x2 = MLPClassifier(max_iter=max_iter_x2)
    mlpnnE_x2 = MLPClassifier(max_iter=max_iter_x2, early_stopping=True)
    XGB1 = XGBClassifier()
    GNB1 = GaussianNB()
    dumm = DummyClassifier()
    knb = neighbors.KNeighborsClassifier()
    LR1 = LogisticRegression(max_iter=max_iter_x10)
    SVC1 = SVC(max_iter=max_iter_x10)
    ovr1 = SGDClassifier(max_iter=max_iter_x2)
    ada1 = AdaBoostClassifier()
    gpc1 = GaussianProcessClassifier()
    GBclass1 = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
                                          max_depth=1, random_state=0)
    histgclass = HistGradientBoostingClassifier(max_iter=max_iter_x2)
    bagclass = BaggingClassifier(KNeighborsClassifier(), max_samples=0.5, max_features=0.5)
    ridge1 = RidgeClassifier(max_iter=max_iter_x10)
    #Mnb = MultinomialNB()
    SVC2 = NuSVC(max_iter=max_iter_x10)
    linear1 = LinearSVC(max_iter=max_iter_x10)

    classifier_mapping = {
        f'1-RandomForest-{MAX_DEPTH}': rf,
        f'2-RandomForest-{max_depth_x2}': rf_x2,
        f'3-ExtraTrees-{MAX_DEPTH}': et,
        f'4-DecisionTree-{MAX_DEPTH}': dectree,
        f'5-KNeighbors case1-{N_NEIGHBORS}': knn,
        f'5-KNeighbors case2-{n_neighbors_x2}': knn_x2,
        f'5-KNeighbors case3-{n_neighbors_d2}': knn_d2,
        f'6-MLP case1-{MAX_ITER}': mlpnn,
        f'6-MLP case2-{MAX_ITER}-early': mlpnnE,
        f'6-MLP case3-{max_iter_x2}': mlpnn_x2,
        f'6-MLP case4-{max_iter_x2}-early': mlpnnE_x2,
        '7-XGB-': XGB1,
        '8-GNB-': GNB1,
        '9-dumm-': dumm,
        '10-knb-': knb,
        '11-LR1-': LR1,
        '12-SVC1-': SVC1,
        '13-ovr-': ovr1,
        '14-ada-': ada1,
        '15-gpc': gpc1,
        '16-GBclass': GBclass1,
        '17-histgclas': histgclass,
        '18-bagclas': bagclass,
        '19-ridge': ridge1,
        '20-SVC2': SVC2,
        '21-linear SVC': linear1,
    }

    for model_name, model in classifier_mapping.items():
        train_test_model(model_name, model, x, y, train_size_pct)
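# The constants and the train_test_model helper live elsewhere in the project;
# a sketch of the module-level configuration the function above assumes
# (values are illustrative, not the project's actual settings).
MAX_DEPTH = 8
MAX_ITER = 300
N_NEIGHBORS = 5
SEED = 42

# evaluateIndividualClassifiers(x, y, train_size_pct=0.8)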
"ElasticNet": linear_model.ElasticNet(random_state=0), "Lars": linear_model.Lars(n_nonzero_coefs=1), "LassoLars": linear_model.LassoLars(alpha=.1), "Omp": linear_model.OrthogonalMatchingPursuit(n_nonzero_coefs=1), "BayesianRidge":linear_model.BayesianRidge(), "ARDRegression":linear_model.ARDRegression(), "LogisitcRegression":linear_model.LogisticRegression(), "SGDClassifier":linear_model.SGDClassifier(), "Perceptron": linear_model.Perceptron(), "PassiveAggressiveClassifier": linear_model.PassiveAggressiveClassifier(), "Theil-Sen": linear_model.TheilSenRegressor(random_state=42), "RANSAC": linear_model.RANSACRegressor(random_state=42), "Huber": linear_model.HuberRegressor(), "SVC linear": SVC(kernel="linear", C=0.025), "SVC": SVC(gamma=2, C=1, probability=True), "GuassianProcess":GaussianProcessClassifier(1.0 * RBF(1.0)), "DecisionTree":DecisionTreeClassifier(max_depth=5), "RandomForest":RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1), "NeutraNet":MLPClassifier(alpha=1), "ADABoost":AdaBoostClassifier(), "GaussianNB":GaussianNB(), "QDA":QuadraticDiscriminantAnalysis() } best_model_names = {} for model_name in classifiers.keys(): try: model = classifiers[model_name] scores = cross_val_score(model, data, data_label, cv=5, verbose=1, scoring='accuracy') score = scores.mean() if score > .8:
def test_lml_improving(kernel):
    # Test that hyperparameter-tuning improves log-marginal likelihood.
    gpc = GaussianProcessClassifier(kernel=kernel).fit(X, y)
    assert_greater(gpc.log_marginal_likelihood(gpc.kernel_.theta),
                   gpc.log_marginal_likelihood(kernel.theta))
def return_model(mode, **kwargs):
    if mode == 'logistic':
        solver = kwargs.get('solver', 'liblinear')
        n_jobs = kwargs.get('n_jobs', None)
        max_iter = kwargs.get('max_iter', 5000)
        model = LogisticRegression(solver=solver, n_jobs=n_jobs,
                                   max_iter=max_iter, random_state=666)
    elif mode == 'Tree':
        model = DecisionTreeClassifier(random_state=666)
    elif mode == 'RandomForest':
        n_estimators = kwargs.get('n_estimators', 50)
        model = RandomForestClassifier(n_estimators=n_estimators, random_state=666)
    elif mode == 'GB':
        n_estimators = kwargs.get('n_estimators', 50)
        model = GradientBoostingClassifier(n_estimators=n_estimators, random_state=666)
    elif mode == 'AdaBoost':
        n_estimators = kwargs.get('n_estimators', 50)
        model = AdaBoostClassifier(n_estimators=n_estimators, random_state=666)
    elif mode == 'SVC':
        kernel = kwargs.get('kernel', 'rbf')
        model = SVC(kernel=kernel, random_state=666)
    elif mode == 'LinearSVC':
        model = LinearSVC(loss='hinge', random_state=666)
    elif mode == 'GP':
        model = GaussianProcessClassifier(random_state=666)
    elif mode == 'KNN':
        n_neighbors = kwargs.get('n_neighbors', 5)
        model = KNeighborsClassifier(n_neighbors=n_neighbors)
    elif mode == 'NB':
        model = MultinomialNB()
    elif mode == 'linear':
        model = LinearRegression()  # LinearRegression takes no random_state
    elif mode == 'ridge':
        alpha = kwargs.get('alpha', 1.0)
        model = Ridge(alpha=alpha, random_state=666)
    elif 'conv' in mode:
        tf.reset_default_graph()
        address = kwargs.get('address', 'weights/conv')
        hidden_units = kwargs.get('hidden_layer_sizes', [20])
        activation = kwargs.get('activation', 'relu')
        weight_decay = kwargs.get('weight_decay', 1e-4)
        learning_rate = kwargs.get('learning_rate', 0.001)
        max_iter = kwargs.get('max_iter', 1000)
        early_stopping = kwargs.get('early_stopping', 10)
        warm_start = kwargs.get('warm_start', False)
        batch_size = kwargs.get('batch_size', 256)
        kernel_sizes = kwargs.get('kernel_sizes', [5])
        strides = kwargs.get('strides', [5])
        channels = kwargs.get('channels', [1])
        validation_fraction = kwargs.get('validation_fraction', 0.)
        global_averaging = kwargs.get('global_averaging', 0.)
        optimizer = kwargs.get('optimizer', 'sgd')
        if mode == 'conv':
            model = CShapNN(mode='classification', batch_size=batch_size,
                            max_epochs=max_iter, learning_rate=learning_rate,
                            weight_decay=weight_decay,
                            validation_fraction=validation_fraction,
                            early_stopping=early_stopping,
                            optimizer=optimizer, warm_start=warm_start,
                            address=address, hidden_units=hidden_units,
                            strides=strides, global_averaging=global_averaging,
                            kernel_sizes=kernel_sizes, channels=channels,
                            random_seed=666)
        elif mode == 'conv_reg':
            model = CShapNN(mode='regression', batch_size=batch_size,
                            max_epochs=max_iter, learning_rate=learning_rate,
                            weight_decay=weight_decay,
                            validation_fraction=validation_fraction,
                            early_stopping=early_stopping,
                            optimizer=optimizer, warm_start=warm_start,
                            address=address, hidden_units=hidden_units,
                            strides=strides, global_averaging=global_averaging,
                            kernel_sizes=kernel_sizes, channels=channels,
                            random_seed=666)
    elif 'NN' in mode:
        solver = kwargs.get('solver', 'adam')
        hidden_layer_sizes = kwargs.get('hidden_layer_sizes', (20,))
        if isinstance(hidden_layer_sizes, list):
            hidden_layer_sizes = list(hidden_layer_sizes)
        activation = kwargs.get('activation', 'relu')
        learning_rate_init = kwargs.get('learning_rate', 0.001)
        max_iter = kwargs.get('max_iter', 5000)
        early_stopping = kwargs.get('early_stopping', False)
        warm_start = kwargs.get('warm_start', False)
        if mode == 'NN':
            model = MLPClassifier(solver=solver,
                                  hidden_layer_sizes=hidden_layer_sizes,
                                  activation=activation,
                                  learning_rate_init=learning_rate_init,
                                  warm_start=warm_start, max_iter=max_iter,
                                  early_stopping=early_stopping)
        if mode == 'NN_reg':
            model = MLPRegressor(solver=solver,
                                 hidden_layer_sizes=hidden_layer_sizes,
                                 activation=activation,
                                 learning_rate_init=learning_rate_init,
                                 warm_start=warm_start, max_iter=max_iter,
                                 early_stopping=early_stopping)
    else:
        raise ValueError("Invalid mode!")
    return model
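# A minimal usage sketch of the return_model factory (data is illustrative).
import numpy as np

X_demo = np.random.RandomState(0).randn(40, 3)
y_demo = (X_demo[:, 0] > 0).astype(int)

gp_model = return_model('GP')  # GaussianProcessClassifier(random_state=666)
gp_model.fit(X_demo, y_demo)
print(gp_model.score(X_demo, y_demo))

knn_model = return_model('KNN', n_neighbors=3)  # kwargs override the defaults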
# %%
print('label - feature split')
topcols = [
    'pageinstanceid', 'referringpageinstanceid', 'pagesequenceinattribution',
    'pagesequenceinsession'
]
# X = df[topcols]
# df['pageinstanceid'] = df['pageinstanceid'].apply(str)
# df['referringpageinstanceid'] = df['referringpageinstanceid'].apply(str)
# X_2h = pd.get_dummies(df[topcols])
# X_1h = df.drop(columns='iscustomer').values  #<-------

clfs_names = [
    (KNeighborsClassifier(4), 'K-NN 4'),
    (GaussianProcessClassifier(1.0 * RBF(1.0)), 'GaussP'),
    (DecisionTreeClassifier(), 'DeciT'),
    (RandomForestClassifier(n_estimators=300), 'RF3'),
    (MLPClassifier(alpha=1), 'Neu-N'),
    (AdaBoostClassifier(), 'AdaBoo'),
    (GaussianNB(), 'NaiveBayes'),
    (QuadraticDiscriminantAnalysis(), 'QDA'),
    (SVC(gamma=2, C=1), 'RBF-SVM'),
    # (SVC(kernel="linear", C=0.025), 'L-SVM')
]

# %%
X
X.shape

print(10 * '#', 'ORIG', 10 * '#')
print('test - train split')
tt_split = train_test_split(X, y, test_size=.25)
from sklearn import ensemble
from sklearn import svm
from sklearn import decomposition
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis, LinearDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF

_slow_or_bad_pipelines = {
    'KNeighborsClassifier':
        Pipeline([('clf', KNeighborsClassifier(2))]),
    'GaussianProcessClassifier':
        Pipeline([('clf', GaussianProcessClassifier(1.0 * RBF(1.0)))]),
    'GaussianNB':
        Pipeline([('clf', GaussianNB())]),
    'QuadraticDiscriminantAnalysis':
        Pipeline([('clf', QuadraticDiscriminantAnalysis())]),
    'ExtraTreeClassifier':
        Pipeline([('clf', ExtraTreeClassifier())]),
}

#decomp = preprocessing.MaxAbsScaler()
#decomp = decomposition.PCA(n_components=100)
decomp = decomposition.TruncatedSVD(n_components=100)
#decomp = decomposition.NMF(n_components=250, random_state=1)  # alpha=.1, l1_ratio=.5
#decomp = decomposition.LatentDirichletAllocation(n_components=400, learning_method='batch')

classify_pre_pipeline = Pipeline([
x_test_original = testset[["bone_length", "rotting_flesh", "hair_length",
                           "has_soul"]]
x_test_hair_soul = testset[["bone_length", "rotting_flesh", "hair_length",
                            "has_soul", "hair_soul"]]

# creating a dictionary to hold classifier objects
clfs = {}
#clfs['lr'] = {'clf': linear_model.LogisticRegression(), 'name': 'LogisticRegression'}
#clfs['rf'] = {'clf': ensemble.RandomForestClassifier(n_estimators=750, n_jobs=-1), 'name': 'RandomForest'}
#clfs['knn'] = {'clf': neighbors.KNeighborsClassifier(n_neighbors=4), 'name': 'kNearestNeighbors'}
#clfs['svc'] = {'clf': svm.SVC(kernel='linear'), 'name': 'SupportVectorClassifier'}

# some of the classifiers
clfs['tr'] = {'clf': DecisionTreeClassifier(), 'name': 'DecisionTree'}
clfs['nusvc'] = {'clf': NuSVC(gamma='scale'), 'name': 'NuSVC'}
clfs['linearsvc'] = {'clf': LinearSVC(), 'name': 'LinearSVC'}
clfs['SGD'] = {'clf': SGDClassifier(max_iter=1000, tol=1e-3), 'name': 'SGDClassifier'}
clfs['GPC'] = {'clf': GaussianProcessClassifier(), 'name': 'GaussianProcess'}
clfs['nb'] = {'clf': GaussianNB(), 'name': 'GaussianNaiveBayes'}
clfs['bag'] = {'clf': BaggingClassifier(KNeighborsClassifier(), max_samples=0.5,
                                        max_features=0.5),
               'name': 'BaggingClassifier'}
clfs['gbc'] = {'clf': GradientBoostingClassifier(), 'name': 'GradientBoostingClassifier'}
#clfs['mlp'] = {'clf': neural_network.MLPClassifier(hidden_layer_sizes=(100, 100, 100), alpha=1e-5, solver='lbfgs', max_iter=500), 'name': 'MultilayerPerceptron'}

# creating parameters for searching
parameters = {'solver': ['lbfgs'], 'max_iter': [1500],
              'alpha': 10.0 ** -np.arange(1, 7),
              'hidden_layer_sizes': np.arange(5, 12)}
clfs['mlpgrid'] = {'clf': GridSearchCV(MLPClassifier(), parameters, cv=3, iid=True),
                   'name': 'MLP with GridSearch'}
parameters = {'kernel': ['linear', 'sigmoid', 'poly', 'rbf'],
              'gamma': np.linspace(0.0, 2.0, num=21),
              'C': np.linspace(0.5, 1.5, num=11)}
clfs['svcgrid'] = {'clf': GridSearchCV(SVC(), parameters, cv=3, iid=True),
                   'name': 'SVC with GridSearch'}
parameters = {'n_estimators': np.arange(64, 1024, step=64)}
clfs['rfgrid'] = {'clf': GridSearchCV(RandomForestClassifier(), parameters, cv=3, iid=True),
                  'name': 'Random Forest with GridSearch'}
plt.title('2-class Logistic Regression\n Probabilistic Decision Boundary')
plt.scatter(X[:, 0][y == 1], X[:, 1][y == 1], label="versicolor",
            color="red", edgecolors=(0, 0, 0))
plt.scatter(X[:, 0][y == 2], X[:, 1][y == 2], label="virginica",
            color="blue", edgecolors=(0, 0, 0))
plt.scatter(X[:, 0][y == 0], X[:, 1][y == 0], label="setosa",
            color="green", edgecolors=(0, 0, 0))
plt.xlabel("Sepal Length")
plt.ylabel("Petal Length")
plt.xlim(4, 8)
plt.ylim(0.5, 7.5)
plt.legend()
plt.savefig("2-class-logr-prob.pdf")

# gaussian process decision map
xx, yy = np.mgrid[4:8:0.05, 0.5:7.5:0.05]
kernel = 1.0 * RBF([1.0, 1.0])  # anisotropic RBF
m_gpc = GaussianProcessClassifier(kernel=kernel).fit(Xt, yt)
Z = m_gpc.predict_proba(np.vstack((xx.ravel(), yy.ravel())).T)[:, 1]
Z = Z.reshape(xx.shape)
image = plt.imshow(Z.T, interpolation='nearest',
                   extent=(4, 8, 0.5, 7.5), aspect='auto',
                   origin='lower', cmap=plt.cm.RdBu)
plt.scatter(X[:, 0][y == 1], X[:, 1][y == 1], label="versicolor",
            color="red", edgecolors=(0, 0, 0))
plt.scatter(X[:, 0][y == 2], X[:, 1][y == 2], label="virginica",
            color="blue", edgecolors=(0, 0, 0))
plt.scatter(X[:, 0][y == 0], X[:, 1][y == 0], label="setosa",
            color="green", edgecolors=(0, 0, 0))
plt.colorbar(image)
plt.title("2-class RBF Gaussian Process Classifier\n Decision Map")
plt.xlabel("Sepal Length")
plt.ylabel("Petal Length")
plt.xlim(4, 8)
plt.ylim(0.5, 7.5)
plt.legend()
def main(argv):
    # parse data
    parsed = parse_args(argv)
    if parsed.output_directory is not None:
        parsed.output_directory += '/' if (not parsed.output_directory.endswith('/')) else ''
        if os.path.exists(parsed.output_directory):
            shutil.rmtree(parsed.output_directory)
        os.makedirs(parsed.output_directory)

    [gene_id, sample_id, expr_tr, label_tr] = parse_data(parsed.input_expr, 1, 2)
    label_unique = np.unique(label_tr)
    label_count = np.array([len(np.where(label_tr == l)[0]) for l in label_unique])
    print("Training set dimension:", expr_tr.shape[0], "samples x",
          expr_tr.shape[1], "features")
    print("True labels", label_unique, "| Counts", label_count)

    time_start = time.perf_counter()  # time.clock() was removed in Python 3.8

    ##### Random Forest #####
    if parsed.learning_algorithm.lower() == 'random_forest':
        from sklearn.ensemble import RandomForestClassifier
        if parsed.cross_valid:
            ## sklearn model selection
            from sklearn.model_selection import GridSearchCV
            rf = RandomForestClassifier()
            hyperparams = {'n_estimators': [250, 500, 1000],
                           'criterion': ['gini', 'entropy'],
                           'class_weight': [None, 'balanced']}
            clf = GridSearchCV(rf, hyperparams, cv=parsed.cross_valid, n_jobs=4)
            clf.fit(expr_tr, label_tr)
            params = parse_cv_result(clf)
        else:
            params = {'n_estimators': 1000, 'criterion': 'gini',
                      'class_weight': None}
        ## train the model
        clf = RandomForestClassifier(n_estimators=params['n_estimators'],
                                     criterion=params['criterion'],
                                     class_weight=params['class_weight'],
                                     oob_score=True, n_jobs=4, verbose=False)
        clf.fit(expr_tr, label_tr)
        label_pred = clf.predict(expr_tr)
        accuracy_pred = clf.score(expr_tr, label_tr)
        ## save the model
        if parsed.output_directory is not None:
            joblib.dump(clf, parsed.output_directory +
                        parsed.learning_algorithm.lower() + '_model.pkl')
        ## sort genes by importance
        num_most_important_gene = 25
        gene_score = clf.feature_importances_
        gene_index = gene_score.argsort()[-num_most_important_gene:][::-1]
        num_most_important_gene = min(num_most_important_gene, len(gene_score))

    ##### C-SVM #####
    elif parsed.learning_algorithm.lower() == 'svm':
        from sklearn.svm import SVC
        if parsed.cross_valid:
            ## sklearn model selection
            svm = SVC()
            from sklearn.model_selection import RandomizedSearchCV
            import scipy.stats as ss
            hyperparams = {'C': ss.expon(scale=10),  # randomized parameters
                           'kernel': ['rbf'],
                           # 'kernel': ['rbf', 'linear', 'poly', 'sigmoid'],
                           'class_weight': [None]}
            clf = RandomizedSearchCV(svm, hyperparams, n_iter=500,
                                     cv=parsed.cross_valid, n_jobs=4)
            clf.fit(expr_tr, label_tr)
            params = parse_cv_result(clf)
        else:
            params = {'C': 1.2, 'kernel': 'rbf', 'class_weight': None}
        ## train the model
        clf = SVC(C=params['C'], kernel=params['kernel'],
                  class_weight=params['class_weight'],
                  probability=True, verbose=False)
        clf.fit(expr_tr, label_tr)
        label_pred = clf.predict(expr_tr)
        accuracy_pred = clf.score(expr_tr, label_tr)
        ## save the model
        if parsed.output_directory is not None:
            joblib.dump(clf, parsed.output_directory +
                        parsed.learning_algorithm.lower() + '_model.pkl')

    ##### Nu-SVM #####
    elif parsed.learning_algorithm.lower() == 'nu_svm':
        from sklearn.svm import NuSVC
        if parsed.cross_valid:
            ## sklearn model selection
            svm = NuSVC()
            from sklearn.model_selection import RandomizedSearchCV
            import scipy.stats as ss
            hyperparams = {'nu': ss.expon(scale=10),  # randomized parameters
                           'kernel': ['rbf'],
                           # 'kernel': ['rbf', 'linear', 'poly', 'sigmoid'],
                           'class_weight': [None]}
            clf = RandomizedSearchCV(svm, hyperparams, n_iter=500,
                                     cv=parsed.cross_valid, n_jobs=4)
            clf.fit(expr_tr, label_tr)
            params = parse_cv_result(clf)
        else:
            params = {'nu': 0.82, 'kernel': 'rbf', 'class_weight': 'balanced'}
        ## train the model
        clf = NuSVC(nu=params['nu'], kernel=params['kernel'],
                    class_weight=params['class_weight'],
                    probability=True, verbose=False)
        clf.fit(expr_tr, label_tr)
        label_pred = clf.predict(expr_tr)
        accuracy_pred = clf.score(expr_tr, label_tr)
        ## save the model
        if parsed.output_directory is not None:
            joblib.dump(clf, parsed.output_directory +
                        parsed.learning_algorithm.lower() + '_model.pkl')

    ##### SVR #####
    elif parsed.learning_algorithm.lower() == 'svr':
        from sklearn.svm import SVR
        if parsed.cross_valid:
            ## sklearn model selection
            svr = SVR()
            from sklearn.model_selection import RandomizedSearchCV
            import scipy.stats as ss
            hyperparams = {'C': ss.expon(scale=10),  # randomized parameters
                           'kernel': ['rbf'],
                           # 'kernel': ['rbf', 'linear', 'poly', 'sigmoid']
                           }
            clf = RandomizedSearchCV(svr, hyperparams, n_iter=500,
                                     cv=parsed.cross_valid, n_jobs=4)
            clf.fit(expr_tr, convert_labels(label_tr))
            params = parse_cv_result(clf)
        else:
            params = {'C': 1.1, 'kernel': 'rbf'}
        ## train the model
        clf = SVR(C=params['C'], kernel=params['kernel'], verbose=False)
        clf.fit(expr_tr, convert_labels(label_tr))
        label_pred = clf.predict(expr_tr)
        # coefficient of determination R^2 of the prediction
        accuracy_pred = clf.score(expr_tr, convert_labels(label_tr))
        ## save the model
        if parsed.output_directory is not None:
            joblib.dump(clf, parsed.output_directory +
                        parsed.learning_algorithm.lower() + '_model.pkl')

    ##### Neural Network #####
    elif parsed.learning_algorithm.lower() == 'neural_net':
        from sklearn.linear_model import LogisticRegression
        from sklearn.neural_network import BernoulliRBM
        from sklearn.pipeline import Pipeline
        # train the model
        logistic = LogisticRegression(C=10)
        rbm = BernoulliRBM(n_components=256, learning_rate=.001,
                           n_iter=100, verbose=False)
        clf = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)])
        clf.fit(expr_tr, label_tr)
        if parsed.output_directory is not None:
            joblib.dump(clf, parsed.output_directory +
                        parsed.learning_algorithm.lower() + '_model.pkl')

    ##### Naive Bayes #####
    elif parsed.learning_algorithm.lower() == 'naive_bayes':
        from sklearn.naive_bayes import GaussianNB
        clf = GaussianNB()
        clf.fit(expr_tr, label_tr)
        label_pred = clf.predict(expr_tr)
        accuracy_pred = clf.score(expr_tr, label_tr)
        ## save the model
        if parsed.output_directory is not None:
            joblib.dump(clf, parsed.output_directory +
                        parsed.learning_algorithm.lower() + '_model.pkl')

    ##### Gradient Boosting #####
    elif parsed.learning_algorithm.lower() == 'grad_boosting':
        from sklearn.ensemble import GradientBoostingClassifier
        # ## convert to two class
        # label_tr = [1 if x == 'P' or x == 'C' else 0 for x in label_tr]
        if parsed.cross_valid:
            ## sklearn model selection
            from sklearn.model_selection import GridSearchCV
            gb = GradientBoostingClassifier()
            hyperparams = {'learning_rate': [.01, .0075, .005, .001, .0005],
                           'max_depth': [3],
                           'subsample': [1, .8, .5],
                           'n_estimators': [1000]}
            clf = GridSearchCV(gb, hyperparams, cv=parsed.cross_valid, n_jobs=4)
            clf.fit(expr_tr, label_tr)
            params = parse_cv_result(clf)
        else:
            params = {'learning_rate': .0025, 'max_depth': 3,
                      'subsample': .8, 'n_estimators': 1000}
        ## train the model
        clf = GradientBoostingClassifier(learning_rate=params['learning_rate'],
                                         n_estimators=params['n_estimators'],
                                         max_depth=params['max_depth'],
                                         subsample=params['subsample'],
                                         verbose=False)
        clf.fit(expr_tr, label_tr)
        label_pred = clf.predict(expr_tr)
        accuracy_pred = clf.score(expr_tr, label_tr)
        ## save the model
        if parsed.output_directory is not None:
            joblib.dump(clf, parsed.output_directory +
                        parsed.learning_algorithm.lower() + '_model.pkl')

    ##### AdaBoost #####
    elif parsed.learning_algorithm.lower() == "adaboost":
        from sklearn.ensemble import AdaBoostClassifier
        from sklearn.tree import DecisionTreeClassifier
        if parsed.cross_valid:
            ## sklearn model selection
            from sklearn.model_selection import GridSearchCV
            ab = AdaBoostClassifier(DecisionTreeClassifier(max_depth=3))
            hyperparams = {'learning_rate': [.01, .0075, .005, .001, .0005],
                           'n_estimators': [1000]}
            clf = GridSearchCV(ab, hyperparams, cv=parsed.cross_valid, n_jobs=4)
            clf.fit(expr_tr, label_tr)
            params = parse_cv_result(clf)
        else:
            params = {'learning_rate': .0025, 'n_estimators': 1000}
        ## train the model
        clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=3),
                                 learning_rate=params['learning_rate'],
                                 n_estimators=params['n_estimators'])
        clf.fit(expr_tr, label_tr)
        label_pred = clf.predict(expr_tr)
        accuracy_pred = clf.score(expr_tr, label_tr)
        ## save the model
        if parsed.output_directory is not None:
            joblib.dump(clf, parsed.output_directory +
                        parsed.learning_algorithm.lower() + '_model.pkl')

    ##### Gaussian Process #####
    elif parsed.learning_algorithm.lower() == 'gauss_process':
        from sklearn.gaussian_process import GaussianProcessClassifier
        from sklearn.gaussian_process.kernels import RBF
        if parsed.cross_valid:
            ## sklearn model selection
            from sklearn.model_selection import GridSearchCV
            gb = GaussianProcessClassifier()
            hyperparams = {}
            clf = GridSearchCV(gb, hyperparams, cv=parsed.cross_valid, n_jobs=4)
            clf.fit(expr_tr, label_tr)
            params = parse_cv_result(clf)
        else:
            params = {}
        ## train the model
        clf = GaussianProcessClassifier(kernel=1.0 * RBF(length_scale=1.0),
                                        optimizer="fmin_l_bfgs_b")
        clf.fit(expr_tr, label_tr)
        label_pred = clf.predict(expr_tr)
        ## save the model
        if parsed.output_directory is not None:
            joblib.dump(clf, parsed.output_directory +
                        parsed.learning_algorithm.lower() + '_model.pkl')

    else:
        sys.exit('Improper learning algorithm option given.')

    ## print timer messages
    time_end = time.perf_counter()
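# A brief sketch of reloading one of the persisted models; the path is
# hypothetical and follows the '<output_directory><algorithm>_model.pkl'
# pattern used by the script above.
import joblib

clf = joblib.load('out/gauss_process_model.pkl')
print(clf.predict(expr_tr[:5]))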
    log = open(logfilename, 'a')
    log.write('{0},{1},{2},{3},{4},{5},{6},{7},{8},{9}\n'.format(
        packetlevel, neibour, component, i, trainauc, teatauc,
        traintpr, testtpr, trainfpr, testfpr))
    log.close()
    return clf


warnings.filterwarnings("ignore")
pool = Pool(10)
ClassfiedList = {
    "Nearest Neighbors": KNeighborsClassifier(3),
    "SVMLinear": SVC(kernel="linear", C=0.025),
    "SVMrbf": SVC(gamma=2, C=1),
    "Gaussian": GaussianProcessClassifier(1.0 * RBF(1.0), warm_start=True),
    "DT": DecisionTreeClassifier(max_depth=5, random_state=0),
    "RF": RandomForestClassifier(max_depth=5, n_estimators=10, random_state=0),
    "GBRT": GradientBoostingClassifier(random_state=0),
    "NeuralNet": MLPClassifier(alpha=1, random_state=0),
    "Ada": AdaBoostClassifier(),
    "NB": GaussianNB(),
    "xgb": xgb.XGBClassifier(n_estimators=125, max_depth=3, learning_rate=0.05),
    "QDA": QuadraticDiscriminantAnalysis()
}
manifoldlist = {
    'LLE': manifold.LocallyLinearEmbedding(n_neighbors=30, n_components=2,
import matplotlib.pyplot as plt
import numpy as np
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF, DotProduct

xx, yy = np.meshgrid(np.linspace(-3, 3, 50), np.linspace(-3, 3, 50))
rng = np.random.RandomState(0)
X = rng.randn(200, 2)
Y = np.logical_xor(X[:, 0] > 0, X[:, 1] > 0)

# fit the model
plt.figure(figsize=(10, 5))
kernels = [1.0 * RBF(length_scale=1.0), 1.0 * DotProduct(sigma_0=1.0)**2]
for i, kernel in enumerate(kernels):
    clf = GaussianProcessClassifier(kernel=kernel, warm_start=True).fit(X, Y)

    # plot the decision function for each datapoint on the grid
    Z = clf.predict_proba(np.vstack((xx.ravel(), yy.ravel())).T)[:, 1]
    Z = Z.reshape(xx.shape)

    plt.subplot(1, 2, i + 1)
    image = plt.imshow(Z, interpolation='nearest',
                       extent=(xx.min(), xx.max(), yy.min(), yy.max()),
                       aspect='auto', origin='lower', cmap=plt.cm.PuOr_r)
    # Z holds probabilities, so the decision boundary sits at 0.5
    contours = plt.contour(xx, yy, Z, levels=[0.5], linewidths=2,
                           linestyles='--')
    plt.scatter(X[:, 0], X[:, 1], s=30, c=Y, cmap=plt.cm.Paired)
    plt.xticks(())
df = clean_dataset(df)

# keep in x the variables and in y the validation target
x = df.iloc[:, 1:].values
y = df.iloc[:, 0].values
# rename them
data_input = x
data_output = y

# set parameters for the k-fold validation
kf = KFold(n_splits=5, shuffle=True)

# set parameters for the classifiers
rf_class = RandomForestClassifier(n_estimators=10)
log_class = LogisticRegression()
svm_class = svm.SVC()
nn_class = KNeighborsClassifier(n_neighbors=3)
svc_class = SVC(kernel="linear", C=0.025)
gausian_class = GaussianProcessClassifier(1.0 * RBF(1.0))
dtc_class = DecisionTreeClassifier(max_depth=5)
mpl_class = MLPClassifier(alpha=1)
abc_class = AdaBoostClassifier()
bnb_class = GaussianNB()

accu = []  # here we will keep all the accuracies of each classifier

print("Random Forests: ")
print(cross_val_score(rf_class, data_input, data_output, scoring='accuracy', cv=10))
accuracy1 = cross_val_score(rf_class, data_input, data_output,
                            scoring='accuracy', cv=10).mean() * 100
accu.append(accuracy1)
print("Accuracy of Random Forests is: ", accuracy1)

print("\n\nsvm-linear: ")
def classifier_example(table, mf, classcol='more_red'):
    """
    code from
    https://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html
    """
    h = .02  # step size in the mesh

    names = [
        "Nearest Neighbors", "Linear SVM", "RBF SVM", "Gaussian Process",
        "Decision Tree", "Random Forest", "Neural Net", "AdaBoost",
        "Naive Bayes", "QDA"
    ]
    classifiers = [
        KNeighborsClassifier(3),
        SVC(kernel="linear", C=0.025),
        SVC(gamma=2, C=1),
        GaussianProcessClassifier(1.0 * RBF(1.0)),
        DecisionTreeClassifier(max_depth=5),
        RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
        MLPClassifier(alpha=1, max_iter=1000),
        AdaBoostClassifier(),
        GaussianNB(),
        QuadraticDiscriminantAnalysis()
    ]

    # Perform Aitchison PCA
    ordination = apca(table.T.astype(float))
    X = ordination.samples.values
    y = mf[classcol].values
    #rng = np.random.RandomState(2)
    #X += 2 * rng.uniform(size=X.shape)
    linearly_separable = (X, y)
    datasets = [linearly_separable]

    figure = plt.figure(figsize=(27, 3))
    i = 1
    # iterate over datasets
    for ds_cnt, ds in enumerate(datasets):
        # preprocess dataset, split into training and test part
        X, y = ds
        X = StandardScaler().fit_transform(X)
        X_train, X_test, y_train, y_test = \
            train_test_split(X, y, test_size=.5, random_state=42)

        x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
        y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
        xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                             np.arange(y_min, y_max, h))

        # just plot the dataset first
        cm = plt.cm.RdBu
        cm_bright = ListedColormap(['#FF0000', '#0000FF'])
        ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
        if ds_cnt == 0:
            ax.set_title("Input data")
        # Plot the training points
        ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright,
                   edgecolors='k')
        # Plot the testing points
        ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright,
                   alpha=0.6, edgecolors='k')
        ax.set_xlim(xx.min(), xx.max())
        ax.set_ylim(yy.min(), yy.max())
        ax.set_xticks(())
        ax.set_yticks(())
        i += 1

        # iterate over classifiers
        for name, clf in zip(names, classifiers):
            ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
            clf.fit(X_train, y_train)
            score = clf.score(X_test, y_test)

            # Plot the decision boundary. For that, we will assign a color to
            # each point in the mesh [x_min, x_max]x[y_min, y_max].
            if hasattr(clf, "decision_function"):
                Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
            else:
                Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]

            # Put the result into a color plot
            Z = Z.reshape(xx.shape)
            ax.contourf(xx, yy, Z, cmap=cm, alpha=.8)

            # Plot the training points
            ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train,
                       cmap=cm_bright, edgecolors='k')
            # Plot the testing points
            ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test,
                       cmap=cm_bright, edgecolors='k', alpha=0.6)

            ax.set_xlim(xx.min(), xx.max())
            ax.set_ylim(yy.min(), yy.max())
            ax.set_xticks(())
            ax.set_yticks(())
            if ds_cnt == 0:
                ax.set_title(name, fontsize=18)
            ax.text(xx.max() - .3, yy.min() + .3, ('%.2f' % score).lstrip('0'),
                    size=18, horizontalalignment='right')
            i += 1

    plt.tight_layout()
    return ax
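# A minimal call sketch for classifier_example: `table` is a compositional
# feature table and `mf` per-sample metadata with a boolean 'more_red' column
# (both hypothetical; apca comes from the surrounding module).
ax = classifier_example(table, mf, classcol='more_red')
plt.show()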
def main():
    # Check for the correct number of arguments
    if len(sys.argv) != 3:
        print('usage: ./troll_identifier.py [TRAIN DATASET] [TEST/DEV DATASET]')
        sys.exit()

    # set up dataset
    data_train = pd.read_csv(sys.argv[1])
    data_test = pd.read_csv(sys.argv[2])
    print('train:\n{}\n'.format(sys.argv[1]))
    print('test:\n{}\n'.format(sys.argv[2]))

    if 'small' in sys.argv[1]:
        size = 'small'
    elif 'medium' in sys.argv[1]:
        size = 'medium'
    else:
        size = 'large'

    x_train = data_train.drop(
        [data_train.columns[0], data_train.columns[1], data_train.columns[-1]],
        axis=1).apply(pd.to_numeric, errors='ignore')
    y_train = pd.Series(data_train.iloc[:, -1])
    x_test = data_test.drop(
        [data_test.columns[0], data_test.columns[1], data_test.columns[-1]],
        axis=1).apply(pd.to_numeric, errors='ignore')
    y_test = pd.Series(data_test.iloc[:, -1])

    # type = int(input('type: [1: supervised, 2: semi-supervised, 3: unsupervised] '))
    # if type == 1:
    parameter = None
    # input() returns a string in Python 3, so cast before comparing to ints
    method = int(input('select a method: {}: '.format(methods)))
    if method == 1:
        classifier = int(input('select a classifier: {}: '.format(classifiers)))
        if classifier == 1:
            parameter = int(input('criterion: [1: gini, 2: entropy] '))
            if parameter == 1:
                model = DecisionTreeClassifier(criterion='gini')
                parameter = 'gini'
            elif parameter == 2:
                model = DecisionTreeClassifier(criterion='entropy')
                parameter = 'entropy'
            else:
                print('no criterion chosen')
                sys.exit()
        elif classifier == 2:
            model = ExtraTreeClassifier()
        elif classifier == 3:
            model = ExtraTreesClassifier()
        elif classifier == 4:
            parameter = int(input('n: [1: 1, 2: 3, 3: 5] '))
            if parameter == 1:
                model = KNeighborsClassifier(n_neighbors=1)
                parameter = '1'
            elif parameter == 2:
                model = KNeighborsClassifier(n_neighbors=3)
                parameter = '3'
            elif parameter == 3:
                model = KNeighborsClassifier(n_neighbors=5)
                parameter = '5'
            else:
                print('no n chosen')
                sys.exit()
        elif classifier == 5:
            parameter = int(input(
                'version: [1: gaussian, 2: bernoulli, 3: multinomial, 4: complement] '))
            if parameter == 1:
                model = GaussianNB()
                parameter = 'gaussian'
            elif parameter == 2:
                model = BernoulliNB()
                parameter = 'bernoulli'
            elif parameter == 3:
                model = MultinomialNB()
                parameter = 'multinomial'
            elif parameter == 4:
                model = ComplementNB()
                parameter = 'complement'
            else:
                print('no version chosen')
                sys.exit()
        elif classifier == 6:
            model = RadiusNeighborsClassifier(radius=1.0)
        elif classifier == 7:
            model = RandomForestClassifier(n_estimators=50, random_state=1)
        elif classifier == 8:
            model = LinearSVC(multi_class='crammer_singer')  # multi_class='ovr'
        elif classifier == 9:
            model = GradientBoostingClassifier()
        elif classifier == 10:
            model = GaussianProcessClassifier(multi_class='one_vs_one')
        elif classifier == 11:
            model = SGDClassifier()
        elif classifier == 12:
            model = PassiveAggressiveClassifier()
        elif classifier == 13:
            model = NearestCentroid()
        elif classifier == 14:
            model = Perceptron(tol=1e-3, random_state=0)
        elif classifier == 15:
            model = MLPClassifier()
        elif classifier == 16:
            model = AdaBoostClassifier(n_estimators=50)
        elif classifier == 17:
            parameter = int(input(
                'strategy: [1: stratified, 2: most frequent, 3: prior, 4: uniform, 5: constant] '))
            if parameter == 1:
                model = DummyClassifier(strategy='stratified')
                parameter = 'stratified'
            elif parameter == 2:
                model = DummyClassifier(strategy='most_frequent')
                parameter = 'most frequent'
            elif parameter == 3:
                model = DummyClassifier(strategy='prior')
                parameter = 'prior'
            elif parameter == 4:
                model = DummyClassifier(strategy='uniform')
                parameter = 'uniform'
            elif parameter == 5:
                model = DummyClassifier(strategy='constant')
                parameter = 'constant'
            else:
                print('no strategy selected')
                sys.exit()
        else:
            print('no classifier chosen')
            sys.exit()

        import time
        # Start timer; time.clock() was removed in Python 3.8
        start = time.perf_counter()

        # train the model using the training set and check its score
        model.fit(x_train, y_train)
        model.score(x_train, y_train)

        # predict output
        predictions = pd.Series(model.predict(x_test))
        report = classification_report(
            y_test, predictions,
            target_names=['RightTroll', 'LeftTroll', 'Other'])
        confusion = confusion_matrix(
            y_test, predictions, labels=['RightTroll', 'LeftTroll', 'Other'])

        if parameter is not None:
            filename = '{},{},{},{}.txt'.format(size, methods[method],
                                                classifiers[classifier],
                                                parameter)
        else:
            filename = '{},{},{}.txt'.format(size, methods[method],
                                             classifiers[classifier])

        # stop the timer
        end = time.perf_counter()
        time = str(end - start)

        with open(filename, 'w') as output:
            output.write('method:\n{}\n\n'.format(methods[method]))
            output.write('classifier:\n{}\n\n'.format(classifiers[classifier]))
            output.write('accuracy:\n{:.2f}%\n\n'.format(
                100 * accuracy_score(y_test, predictions)))
            output.write('report:\n{}\n\n'.format(report))
            output.write('confusion:\n{}\n\n'.format(confusion))
            output.write('time:\n{}s\n\n'.format(time))
            output.write('data:\n{:10}\t{:10}\t{:10}\n'.format(
                'actual', 'predict', 'match?'))
            for i in range(len(predictions)):
                # the "actual" column is the test label
                output.write('{:10}\t{:10}\t{:10}\n'.format(
                    y_test[i], predictions[i], y_test[i] == predictions[i]))

        print('\nmethod:\n{}\n'.format(methods[method]))
        print('classifier:\n{}\n'.format(classifiers[classifier]))
        print('accuracy:\n{:.2f}%\n'.format(
            100 * accuracy_score(y_test, predictions)))
        print('report:\n{}\n'.format(report))
        print('confusion:\n{}\n'.format(confusion))
        print('time: {}s\n'.format(time))

    elif method == 2:
        # transform into binary classification problem
        # y_train = y_train.apply(lambda x: 0 if x == 'Other' else 1)
        # y_test = y_test.apply(lambda x: 0 if x == 'Other' else 1)

        # transform string labels into integers
        le = LabelEncoder()
        le.fit(y_train)
        # print(le.transform(['LeftTroll', 'Other', 'Other', 'RightTroll']))
        # print(le.inverse_transform([0, 1, 2, 1]))
        print(le.classes_)
        y_train = le.transform(y_train)
        y_test = le.transform(y_test)

        regressor = int(input('select a regressor: {}: '.format(regressors)))
        if regressor == 1:
            print(method, regressor)
            model = LinearDiscriminantAnalysis()
        elif regressor == 2:
            print(method, regressor)
            model = LogisticRegression(solver='lbfgs',
                                       multi_class='multinomial')  # 'newton-cg'
        elif regressor == 3:
            print(method, regressor)
            model = RidgeClassifier()
        elif regressor == 4:
            print(method, regressor)
            model = QuadraticDiscriminantAnalysis()
        elif regressor == 5:
            model = OneVsRestClassifier(LinearRegression())
        elif regressor == 6:
            model = OneVsRestClassifier(DecisionTreeRegressor())
        elif regressor == 7:
            print(method, regressor)
            model = OneVsRestClassifier(Lasso(alpha=0.1))
        elif regressor == 8:
            print(method, regressor)
            model = OneVsRestClassifier(MultiTaskLasso(alpha=0.1))
        elif regressor == 9:
            print(method, regressor)
            model = OneVsRestClassifier(ElasticNet(random_state=0))
        elif regressor == 10:
            print(method, regressor)
            model = OneVsRestClassifier(MultiTaskElasticNet(random_state=0))
        elif regressor == 11:
            print(method, regressor)
            model = OneVsRestClassifier(Lars(n_nonzero_coefs=1))
        elif regressor == 12:
            print(method, regressor)
            model = OneVsRestClassifier(LassoLars(alpha=.1))
        elif regressor == 13:
            print(method, regressor)
            model = OneVsRestClassifier(OrthogonalMatchingPursuit())
        elif regressor == 14:
            print(method, regressor)
            model = OneVsRestClassifier(BayesianRidge())
        elif regressor == 15:
            print(method, regressor)
            model = OneVsRestClassifier(ARDRegression())
        elif regressor == 16:
            print(method, regressor)
            model = OneVsRestClassifier(TheilSenRegressor(random_state=0))
        elif regressor == 17:
            print(method, regressor)
            model = OneVsRestClassifier(HuberRegressor())
        elif regressor == 18:
            print(method, regressor)
            model = OneVsRestClassifier(RANSACRegressor(random_state=0))
        else:
            print('no regressor chosen')
            sys.exit()

        import time
        # Start timer; time.clock() was removed in Python 3.8
        start = time.perf_counter()

        # train the model using the training set and check its score
        model.fit(x_train, y_train)
        model.score(x_train, y_train)
        # y_train = le.inverse_transform(y_train)
        # y_test = le.inverse_transform(y_test)
        # print('coefficient:', model.coef_)
        # print('intercept:', model.intercept_)

        # predict output
        predictions = pd.Series(model.predict(x_test))

        if parameter is not None:
            filename = '{},{},{},{}.txt'.format(size, methods[method],
                                                regressors[regressor],
                                                parameter)
        else:
            filename = '{},{},{}.txt'.format(size, methods[method],
                                             regressors[regressor])

        # stop the timer
        end = time.perf_counter()
        time = str(end - start)

        with open(filename, 'w') as output:
            output.write('method:\n{}\n\n'.format(methods[method]))
            output.write('regressor:\n{}\n\n'.format(regressors[regressor]))
            output.write('accuracy:\n{:.2f}%\n\n'.format(
                100 * accuracy_score(y_test, predictions)))
            output.write('time:\n{}s\n\n'.format(time))
            output.write('data:\n{:10}\t{:10}\t{:10}\n'.format(
                'actual', 'predict', 'match?'))
            for i in range(len(predictions)):
                output.write('{:10}\t{:10}\t{:10}\n'.format(
                    y_test[i], predictions[i], y_test[i] == predictions[i]))

        print('\nmethod:\n{}\n'.format(methods[method]))
        print('regressor:\n{}\n'.format(regressors[regressor]))
        print('accuracy:\n{:.2f}%\n'.format(
            100 * accuracy_score(y_test, predictions)))
        print('time: {}s\n'.format(time))

    else:
        print('no method chosen')
        sys.exit()