def parameter_search():
    """Print accuracy and timing of a linear SVC for C = 10, 20, ..., 90.

    For every candidate value the module-level ``X_yeast`` / ``Y_yeast`` data
    is re-split (65% train), a linear-kernel ``SVC`` is fitted on the training
    part and scored on the held-out part.  One table row is printed per
    candidate with the accuracy in percent, the fit time, the C value and the
    time spent predicting and scoring.
    """
    row = "{0:25}{1:15}{2:15}{3:15}{4:15}"
    print(
        row.format("\nClassifier", "Accuracy(%)", "Runtime(s)", "i * scalar",
                   "Predict(s)"))

    scalar = 10
    for step in range(1, 10):
        c_value = step * scalar
        X_train, X_test, Y_train, Y_test = train_test_split(
            X_yeast, Y_yeast, train_size=0.65)

        classifier = SVC(kernel='linear')
        classifier.C = c_value

        # Time the fit on its own.
        fit_begin = time.time()
        classifier.fit(X_train, Y_train)
        fit_seconds = time.time() - fit_begin

        # The prediction timing also covers the accuracy computation.
        predict_begin = time.time()
        accuracy = accuracy_score(Y_test, classifier.predict(X_test))
        predict_seconds = time.time() - predict_begin

        print(
            row.format('Test value=' + str(c_value),
                       "%.2f" % (accuracy * 100),
                       "%.2f" % fit_seconds,
                       str(c_value),
                       "%.2f" % predict_seconds))
def trainauc(self, train, trainlabel, seed, Cmin, Cmax, numC, rmin, rmax, numr, degree=3, method='roc_auc', rad_stat=2):
    """Grid-search C and gamma with 10-fold cross-validation.

    Args:
        train: training samples handed to ``cross_val_score``.
        trainlabel: labels for ``train``.
        seed: value passed to ``SVC`` as its ``kernel`` argument.
        Cmin, Cmax, numC: base-2 exponent range and number of C candidates.
        rmin, rmax, numr: base-2 exponent range and number of gamma candidates.
        degree: value assigned to the estimator's ``degree``.
        method: ``scoring`` argument for ``cross_val_score``.
        rad_stat: value assigned to the estimator's ``random_state``.

    Returns:
        DataFrame with a ``gamma_range`` column plus one column per C value
        (keyed by that value) holding the mean CV score for every gamma.
    """
    c_values = np.logspace(Cmin, Cmax, num=numC, base=2, endpoint=True)
    gamma_values = np.logspace(rmin, rmax, num=numr, base=2, endpoint=True)
    estimator = SVC(kernel=seed)
    results = DataFrame({'gamma_range': gamma_values})

    for finished, c_value in enumerate(c_values, start=1):
        column = []
        for gamma_value in gamma_values:
            # The same estimator object is reconfigured for every grid cell.
            estimator.C = c_value
            estimator.gamma = gamma_value
            estimator.degree = degree
            estimator.random_state = rad_stat
            fold_scores = cross_val_score(
                estimator, train, trainlabel, scoring=method, cv=10, n_jobs=-1)
            column.append(np.mean(fold_scores))
        # Progress report: average score over all gammas for this C.
        print(np.mean(column))
        print("%r cycle finished, %r left" % (finished, numC - finished))
        results[c_value] = column
    return results
def __tune_on_kfolds(self, k_folds, sample_vector, targets, c_val):
    """Estimate the accuracy of a linear SVC with manual k-fold validation.

    ``sample_vector`` and ``targets`` are cut into ``k_folds`` consecutive
    pieces with ``np.array_split``.  Each piece serves once as the test fold
    while the classifier is fitted on the concatenation of the others.

    Returns:
        Number of correct test-fold predictions summed over all folds,
        divided by ``len(sample_vector)``.
    """
    print("Tuning on K-Folds...")
    classifier = SVC(kernel='linear', random_state=1)
    classifier.C = c_val

    sample_folds = np.array_split(sample_vector, k_folds)
    target_folds = np.array_split(targets, k_folds)

    hits = 0
    for held_out in range(k_folds):
        print("Fold " + str(held_out + 1))

        # Everything except the held-out fold forms the training set.
        train_samples = np.concatenate(
            sample_folds[:held_out] + sample_folds[held_out + 1:], axis=0)
        train_targets = np.concatenate(
            target_folds[:held_out] + target_folds[held_out + 1:], axis=0)

        classifier.fit(train_samples, train_targets)
        predicted = classifier.predict(sample_folds[held_out])
        expected = target_folds[held_out]

        hits += sum(1 for position in range(len(predicted))
                    if predicted[position] == expected[position])

    return hits / len(sample_vector)
def select_param_rbf(X, y, kf, metric="accuracy"):
    """
    Sweeps different settings for the hyperparameters of an RBF-kernel SVM,
    calculating the k-fold CV performance for each setting, then selecting the
    hyperparameters that 'maximize' the average k-fold CV performance.

    Parameters
    --------------------
        X      -- numpy array of shape (n,d), feature vectors
                    n = number of examples
                    d = number of features
        y      -- numpy array of shape (n,), binary labels {1,-1}
        kf     -- model_selection.KFold or model_selection.StratifiedKFold
        metric -- string, option used to select performance measure

    Returns
    --------------------
        C      -- float, optimal parameter value for an RBF-kernel SVM
        gamma  -- float, optimal parameter value for an RBF-kernel SVM
    """

    # Single parenthesised argument: prints identically on Python 2 and 3.
    print('RBF SVM Hyperparameter Selection based on ' + str(metric) + ':')

    ### ========== TODO : START ========== ###
    # part 3b: create grid, then select optimal hyperparameters using cross-validation
    C_range = 10.0 ** np.arange(-3, 3)
    gamma_range = np.logspace(-9, 3, 13)

    # grid[i, j] = average CV performance for C_range[i], gamma_range[j]
    grid = np.zeros((len(C_range), len(gamma_range)))
    for i, curr_c in enumerate(C_range):
        for j, gamma in enumerate(gamma_range):
            perf_total = 0
            for train_index, test_index in kf.split(X, y):
                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y[train_index], y[test_index]
                model = SVC(kernel='rbf', gamma=gamma)
                model.C = curr_c
                model.fit(X_train, y_train)
                # Scores (not hard labels) are handed to the metric helper.
                predictions = model.decision_function(X_test)
                perf_total += performance(y_test, predictions, metric)
            grid[i, j] = perf_total / kf.n_splits

    # argmax works on the flattened grid; unravel_index maps it back to
    # (row, column) with integer indices, independent of how `/` divides.
    c_index, gamma_index = np.unravel_index(np.argmax(grid), grid.shape)
    return C_range[c_index], gamma_range[gamma_index]
def pickParams(svc: "SVC", gamma, c) -> "SVC":
    """
    Aids gridsearch in choosing parameters for SVM.

    The estimator is modified in place according to its ``kernel``:

    * ``'linear'`` -- only ``C`` is set.
    * ``'poly'``   -- ``C`` is set and ``gamma`` is used as the polynomial
      ``degree``.  (The previous code stored it in an attribute named ``D``,
      which the estimator never reads, so the value had no effect.)
    * ``'rbf'``    -- ``C`` and ``gamma`` are set.

    Any other kernel leaves the estimator untouched.

    :param svc: SVM
    :param gamma: second grid parameter (degree for 'poly', gamma for 'rbf')
    :param c: regularisation parameter C
    :return: the same, tuned SVM instance
    """
    kernel = svc.kernel
    if kernel in ('linear', 'poly', 'rbf'):
        svc.C = c
    if kernel == 'poly':
        svc.degree = gamma
    elif kernel == 'rbf':
        svc.gamma = gamma
    return svc
def get_model(PARAMS):
    '''Get model according to parameters.

    Builds an ``SVC`` and applies the hyperparameters found in ``PARAMS``
    (a mapping with a ``get`` method).  Recognised keys are ``C``,
    ``kernel``, ``degree``, ``gamma`` and ``coef0``.  The historical
    misspelling ``keral`` is still accepted as a fallback for ``kernel`` so
    existing search-space files keep working.

    A key that is absent (or maps to ``None``) leaves the estimator's default
    for that parameter in place instead of overwriting it with ``None``.
    '''
    model = SVC()

    # The old code assigned to ``model.keral``, an attribute the estimator
    # never reads, so the requested kernel was silently ignored.
    kernel = PARAMS.get('kernel') or PARAMS.get('keral')
    if kernel is not None:
        model.kernel = kernel

    for name in ('C', 'degree', 'gamma', 'coef0'):
        value = PARAMS.get(name)
        if value is not None:
            setattr(model, name, value)

    return model
def new_pipe(mod):
    """Build the classification pipeline for modality ``mod``.

    The pipeline selects the columns listed in ``index_map[mod]``, applies a
    ``SimpleMaskerPipeline(.2)``, keeps the 500 best features via
    ``SelectKBest`` and ends in a linear, probability-enabled ``SVC`` whose
    ``C`` comes from ``params_map[mod]['C']``.
    """
    classifier = SVC()
    classifier.kernel = 'linear'
    classifier.C = params_map[mod]['C']
    classifier.probability = True

    steps = [
        ('columns', ColumnSelector(index_map[mod])),
        ('whitematter', SimpleMaskerPipeline(.2)),
        ('anova', SelectKBest(k=500)),
        ('svc', classifier),
    ]
    return Pipeline(steps)
def trainSVC(self, train, trainlabel, seed, Cmin, Cmax, numC, rmin, rmax, numr, degree=3):
    """Grid-search C and gamma using the ``training_manCV`` scorer.

    Args:
        train: training samples handed to ``cross_val_score``.
        trainlabel: labels for ``train``.
        seed: value passed to ``SVC`` as its ``kernel`` argument.
        Cmin, Cmax, numC: base-2 exponent range and number of C candidates.
        rmin, rmax, numr: base-2 exponent range and number of gamma candidates.
        degree: value assigned to the estimator's ``degree``.

    Returns:
        DataFrame with a ``gamma_range`` column plus one column per C value.
        Each cell is the mean of the last 10 entries of
        ``training_manCV.secret_cm`` after the 10-fold run for that cell, not
        the value returned by ``cross_val_score``.
    """
    c_values = np.logspace(Cmin, Cmax, num=numC, base=2, endpoint=True)
    gamma_values = np.logspace(rmin, rmax, num=numr, base=2, endpoint=True)
    estimator = SVC(kernel=seed)
    results = DataFrame({'gamma_range': gamma_values})

    for finished, c_value in enumerate(c_values, start=1):
        column = []
        for gamma_value in gamma_values:
            # Reset the class-level accumulators before every grid cell;
            # presumably the scorer appends to them -- TODO confirm.
            training_manCV.secret_cm = []
            training_manCV.secret_score = []

            estimator.C = c_value
            estimator.gamma = gamma_value
            estimator.degree = degree
            this_scores = cross_val_score(
                estimator, train, trainlabel,
                scoring=training_manCV().metric_scores, cv=10, n_jobs=-1)

            collected = DataFrame({'cm': training_manCV.secret_cm})
            column.append(np.mean(collected['cm'].tail(10)))
        # Progress report based on the scores of the last grid cell.
        print(np.mean(this_scores))
        print("%r cycle finished, %r left" % (finished, numC - finished))
        results[c_value] = column
    return results
def trainauc(self, train, trainlabel, seed, Cmin, Cmax, numC, rmin, rmax, numr, degree=3, method='roc_auc', rad_stat=2):
    """Score every (C, gamma) pair with 10-fold cross-validation.

    Args:
        train: training samples handed to ``cross_val_score``.
        trainlabel: labels for ``train``.
        seed: value passed to ``SVC`` as its ``kernel`` argument.
        Cmin, Cmax, numC: base-2 exponent range and number of C candidates.
        rmin, rmax, numr: base-2 exponent range and number of gamma candidates.
        degree: value assigned to the estimator's ``degree``.
        method: ``scoring`` argument for ``cross_val_score``.
        rad_stat: value assigned to the estimator's ``random_state``.

    Returns:
        DataFrame whose ``gamma_range`` column lists the gamma candidates and
        whose remaining columns (one per C, keyed by the C value) hold the
        mean cross-validation score for each gamma.
    """
    candidate_cs = np.logspace(Cmin, Cmax, num=numC, base=2, endpoint=True)
    candidate_gammas = np.logspace(rmin, rmax, num=numr, base=2, endpoint=True)
    model = SVC(kernel=seed)
    table = DataFrame({'gamma_range': candidate_gammas})

    done = 0
    for c_candidate in candidate_cs:
        done += 1
        per_gamma = []
        for gamma_candidate in candidate_gammas:
            # One shared estimator is reconfigured for each grid cell.
            model.C = c_candidate
            model.gamma = gamma_candidate
            model.degree = degree
            model.random_state = rad_stat
            cv_scores = cross_val_score(
                model, train, trainlabel, scoring=method, cv=10, n_jobs=-1)
            per_gamma.append(np.mean(cv_scores))
        # Progress report: average score across all gammas for this C.
        print(np.mean(per_gamma))
        print("%r cycle finished, %r left" % (done, numC - done))
        table[c_candidate] = per_gamma
    return table
def to_super(self):
    """Return a plain ``SVC`` carrying this object's parameters and fit state.

    A fresh ``SVC`` is created (with ``kernel="linear"`` when this object's
    kernel is linear, default-constructed otherwise) and the attributes listed
    below are copied onto it one by one, in order.  ``coef_`` is not copied.
    """
    plain = SVC(kernel="linear") if self.kernel == "linear" else SVC()

    copied_attributes = (
        'C', '_dual_coef_', '_gamma', '_impl', '_intercept_', '_sparse',
        'cache_size', 'class_weight', 'class_weight_', 'classes_', 'coef0',
        'decision_function_shape', 'degree', 'dual_coef_', 'epsilon',
        'fit_status_', 'gamma', 'intercept_', 'kernel', 'max_iter',
        'n_support_', 'nu', 'probA_', 'probB_', 'probability',
        'random_state', 'shape_fit_', 'shrinking', 'support_',
        'support_vectors_', 'tol', 'verbose',
    )
    for attribute in copied_attributes:
        setattr(plain, attribute, getattr(self, attribute))
    return plain
def construct_SVM(config, regression=False):
    """Construct a SVM classifier or regressor from a config dictionary.

    Args:
        config (dict): Dictionary of the required config settings.  When
            ``config['classifiers']`` is ``'RankedSVM'`` no other key is
            read; otherwise ``max_iter``, ``random_seed``, ``SVMKernel``,
            ``SVMC``, ``SVMdegree``, ``SVMcoef0`` and ``SVMgamma`` are used.
        regression (bool): Build an ``SVMR`` regressor instead of an ``SVC``
            classifier.

    Returns:
        The configured estimator: a default-constructed ``RankedSVM`` when
        requested, else an ``SVMR`` (``regression=True``) or an ``SVC`` with
        balanced class weights and probability estimates enabled.
    """
    # A ranked SVM replaces the regular estimator entirely, so decide this
    # first rather than configuring an SVC/SVMR that is then discarded.
    if config['classifiers'] == 'RankedSVM':
        return RankedSVM()

    max_iter = config['max_iter']
    if regression:
        clf = SVMR(max_iter=max_iter, random_state=config['random_seed'])
    else:
        clf = SVC(class_weight='balanced', probability=True,
                  max_iter=max_iter, random_state=config['random_seed'])

    clf.kernel = str(config['SVMKernel'])
    clf.C = config['SVMC']
    clf.degree = config['SVMdegree']
    clf.coef0 = config['SVMcoef0']
    clf.gamma = config['SVMgamma']
    return clf
def trainSVC(self, train, trainlabel, seed, Cmin, Cmax, numC, rmin, rmax, numr, degree=3):
    """Evaluate every (C, gamma) pair with the ``training_manCV`` scorer.

    Args:
        train: training samples handed to ``cross_val_score``.
        trainlabel: labels for ``train``.
        seed: value passed to ``SVC`` as its ``kernel`` argument.
        Cmin, Cmax, numC: base-2 exponent range and number of C candidates.
        rmin, rmax, numr: base-2 exponent range and number of gamma candidates.
        degree: value assigned to the estimator's ``degree``.

    Returns:
        DataFrame whose ``gamma_range`` column lists the gamma candidates and
        whose remaining columns (one per C, keyed by the C value) hold the
        mean of the last 10 values found in ``training_manCV.secret_cm``
        after the 10-fold run for that cell.
    """
    candidate_cs = np.logspace(Cmin, Cmax, num=numC, base=2, endpoint=True)
    candidate_gammas = np.logspace(rmin, rmax, num=numr, base=2, endpoint=True)
    model = SVC(kernel=seed)
    table = DataFrame({'gamma_range': candidate_gammas})

    done = 0
    for c_candidate in candidate_cs:
        done += 1
        per_gamma = []
        for gamma_candidate in candidate_gammas:
            # Class-level accumulators are cleared before each grid cell;
            # presumably the scorer fills them -- TODO confirm.
            training_manCV.secret_cm = []
            training_manCV.secret_score = []

            model.C = c_candidate
            model.gamma = gamma_candidate
            model.degree = degree
            this_scores = cross_val_score(
                model, train, trainlabel,
                scoring=training_manCV().metric_scores, cv=10, n_jobs=-1)

            recorded = DataFrame({'cm': training_manCV.secret_cm})
            per_gamma.append(np.mean(recorded['cm'].tail(10)))
        # Progress report based on the scores of the last grid cell.
        print(np.mean(this_scores))
        print("%r cycle finished, %r left" % (done, numC - done))
        table[c_candidate] = per_gamma
    return table
def iaml01cw2_q2_5():
    """Tune C of an RBF SVC on a class-balanced subset and plot the result.

    The first 1000 samples of each of the 10 classes are taken from the
    module-level ``Xtrn_nm`` / ``Ytrn``.  For 10 log-spaced C values between
    1e-2 and 1e3 the mean 3-fold cross-validated accuracy is computed,
    plotted against C on a logarithmic x axis, and the best result printed.

    Returns:
        The C value with the highest mean accuracy (``0.`` if no candidate
        scores above zero).
    """
    Xsmall = []
    Ysmall = []
    counter = np.zeros(10)  # samples kept so far, per class label
    for sample, label in zip(Xtrn_nm, Ytrn):
        if counter[label] < 1000:
            Xsmall.append(sample)
            Ysmall.append(label)
            counter[label] += 1
    Xsmall = np.array(Xsmall)
    Ysmall = np.array(Ysmall)

    Cs = np.logspace(-2, 3, 10)
    svm = SVC(kernel='rbf', gamma='auto')
    means = []
    maxx = 0.
    Cmax = 0.
    for C in Cs:
        svm.C = C
        this_scores = cross_val_score(svm, Xsmall, Ysmall, cv=3,
                                      scoring='accuracy')
        m = this_scores.mean()
        means.append(m)
        # Strict comparison keeps the first C on ties.
        if m > maxx:
            maxx = m
            Cmax = C

    plt.plot(Cs, means)
    plt.xlabel('(log-scale) Regularisation parameter')
    plt.ylabel('Mean accuracies')
    # Scale the axes that were just drawn on; ``plt.axes()`` would add a new
    # Axes on current Matplotlib and leave the plotted one linear.
    plt.xscale('log')
    plt.title('Plot of mean accuracy for 10 evenly log-spaced values of C')
    print('Max mean accuracy = {} for C = {}'.format(round(maxx, 5),
                                                     round(Cmax, 4)))
    plt.show()
    return Cmax
# Bag-of-words features for both splits, plus L2-normalised copies of them.
train_features = bag.get_features(train_sentences)
test_features = bag.get_features(test_sentences)
scaled_train_sentences, scaled_test_sentences = normalize_data(
    train_features, test_features, type='l2')

# testing
print(scaled_train_sentences[:1])
print(scaled_test_sentences[:1])

# ### 6. Train a SVM with linear kernel that classifies spam/non-spam messages. Use parameter C of value 1.
#
# Calculate the `accuracy` and `F1-score` for the testing data.

svm_model = SVC(C=1, kernel='linear')

# train - nonscaled data
svm_model.fit(train_features, train_labels)

# predict - nonscaled data
predictions = svm_model.predict(test_features)
print(accuracy_score(test_labels, predictions))
print(f1_score(test_labels, predictions))

# Fresh, identically configured estimator for the scaled-data run.
svm_model = SVC(C=1, kernel='linear')

# train - scaled data
joblib.dump(gs, "grid_cv_rbm.pkl", compress=3) else: # 直接设置参数训练 bow = BoWFeature() bow.patch_num=10000 bow.patch_size=(20,20) bow.learning_rate=0.001 bow.n_components=512 bow.n_iter=100 bow.sample_num = 1000 bow.fit(x_train) svm = SVC(kernel='linear', probability = True, random_state=42) svm.C = 1000 #lr = LogisticRegression() #lr.C = 100 best = Pipeline([('bow', bow),('svm',svm)]) best.fit(x_train, y_train) print "*********************Save*******************************" joblib.dump(best, "classifier_rbm.pkl", compress=3) print "*********************Test*******************************" y_test_pre = best.predict(x_test) cm = confusion_matrix(y_test, y_test_pre) from map_confusion import plot_conf plot_conf(cm, range(le.classes_.size), 'RSDataset.png') from sklearn.metrics import classification_report
##C,gamma values to test #values = [[0.001, 0.1], # [0.01, 0.2], # [1, 0.5], # [10, 2], # [100, 0.02], # [100, 1]] # ##retrieve res from different exec xval_acc_mean = np.zeros((len(values, ))) xval_acc_std = np.zeros((len(values, ))) # #cross-valid for i, value_C in enumerate(values): #set the C value clf.C = value_C #create a vector to store accuracy results xval_acc = np.zeros((splitter.get_n_splits())) k = 0 #split data and labels into train and test for tr_idx, ts_idx in splitter.split(x_tr): x_tr_xval = x_tr[tr_idx, :] y_tr_xval = y_tr[tr_idx] x_ts_xval = x_tr[ts_idx, :] y_ts_xval = y_tr[ts_idx] #train a model clf.fit(x_tr_xval, y_tr_xval) #test the trained model yc = clf.predict(x_ts_xval) xval_acc[k] = np.mean(yc == y_ts_xval)
# One-class SVM. Its `nu` argument corresponds to the 'v' parameter.
clf = svm.OneClassSVM(nu=0.2, kernel='rbf', cache_size=200,
                      max_iter=1000000)

# Train the one-class SVM on the X_train_filtered matrix.
clf.fit(X_train_filtered)
print("OneClass-SVM trained .... ")

# Evaluate the one-class SVM on X_test.
y_predicted_OneClass = clf.predict(X_test)
print("OneClass-SVM tested .... ")
F1, R, P = clf_performance(y_outliers_test, y_predicted_OneClass, on_loop=0)

###############################################################################
# Outlier SVM Classification
###############################################################################
# Regular two-class SVM trained on the outlier labels.
clc = SVC(kernel='rbf')

# Use this only for tuning C of the SVM (from the experiments the best value
# of C is 100):
#C_tuned = CTune_SVM(clc, csr_matrix(X_train_bin), y_outliers_train, n_folds = 5, inf_pow = -4, sup_pow = 4)
clc.C = 100
print("The selected value of C for the OutliersSVM is: ", str(clc.C))

# Train the SVM on X_train.
clc.fit(X_train, y_outliers_train)
print("Outliers-SVM trained .... ")

# Predict the labels of X_test.
y_predicted_outliers = clc.predict(X_test)
print("Outliers-SVM test .... ")
F1, R, P = clf_performance(y_outliers_test, y_predicted_outliers, on_loop=0)
def loop_test(document_topic, X_text):
    """Benchmark one-class and two-class SVMs over a grid of settings.

    For every combination of feature count (10, 25, 50, 100), document
    representation (binary, Nfrequency, tf-idf, hadamard) and kernel (linear,
    poly, rbf, sigmoid), a ``OneClassSVM`` and an ``SVC`` with ``C = 100`` are
    trained and evaluated.  One CSV row per model is written to
    ``<address><document_topic>.txt``.

    Args:
        document_topic: name used for the results file (without extension).
        X_text: raw documents handed to ``CountVectorizer.fit_transform``.
    """
    address = '/home/juan-laptop/Dropbox/AIRO/Neural Networks/Results/'
    n_features = np.array([10, 25, 50, 100])
    representations = np.array(['binary', 'Nfrequency', 'tf-idf', 'hadamard'])
    kernel = np.array(['linear', 'poly', 'rbf', 'sigmoid'])

    def result_row(model_name, count, representation, kernel_name, metrics):
        """Format one CSV line: model, settings, then the metric values."""
        fields = (model_name, count, representation, kernel_name)
        return ','.join(str(field) for field in fields + tuple(metrics)) + '\n'

    # The context manager closes the results file even if a fit fails.
    with open(address + document_topic + '.txt', 'w') as textfile:
        textfile.write(
            'SVM,n_features,representation,kernel,F1,P,R,Acc,FPR,TP,TN,FP,FN\n')

        for i in n_features:
            print('Evualating n_features : ' + str(i))
            vectorizer = CountVectorizer(min_df=1, analyzer='word',
                                         stop_words='english')
            X_text_vec = vectorizer.fit_transform(X_text)
            # Transform a sparse matrix into a full matrix
            X_text_vec = X_text_vec.todense()
            X = reduce_features(X_text_vec, n_features=i)

            # NOTE(review): X is re-bound by each representation, so later
            # representations operate on the output of earlier ones --
            # confirm this chaining is intended.
            for j in representations:
                print('Document representation : ' + j)
                if j == 'binary':
                    binarize_set(X)
                elif j == 'Nfrequency':
                    # Must be the boolean False: the string 'False' is truthy
                    # and would enable IDF weighting.
                    transformer = TfidfTransformer(use_idf=False)
                    X = transformer.fit_transform(X)
                    X = X.todense()
                elif j == 'tf-idf':
                    transformer = TfidfTransformer(use_idf=True)
                    X = transformer.fit_transform(X)
                    X = X.todense()
                elif j == 'hadamard':
                    X = hadamard_product(X)

                (X_train, X_train_filtered, X_test, y_outliers_train,
                 y_outliers_test) = dataset_split(X, 5, on_loop=1)

                for k in kernel:
                    print('Kernel : ' + k)

                    clf = svm.OneClassSVM(kernel=str(k), max_iter=1000000,
                                          cache_size=200, nu=0.2)
                    clf.fit(X_train_filtered)
                    y_predicted_OneClass = clf.predict(X_test)
                    # Metrics come back as F1, P, R, Acc, FPR, TP, TN, FP, FN.
                    metrics = clf_performance(
                        y_outliers_test, y_predicted_OneClass, on_loop=1)
                    textfile.write(result_row('OneClass', i, j, k, metrics))

                    clc = SVC(kernel=str(k))
                    clc.C = 100
                    clc.fit(X_train, y_outliers_train)
                    y_predicted_outliers = clc.predict(X_test)
                    metrics = clf_performance(
                        y_outliers_test, y_predicted_outliers, on_loop=1)
                    textfile.write(result_row('Outliers', i, j, k, metrics))

    print('finish!!')
'''return accuracy value''' return correct_predictions / len(sample_vector) if __name__ == '__main__': # data_formatter.save_files_as_numpy("rt-polarity.pos.txt", "rt-polarity.neg.txt") nlp = NLP() thresh_val_range = [[5, 2000], [25, 1000], [50, 500]] c_val_range = [0.0001, 1, 1000] # nlp.train_on_grid_search(thresh_val_range, c_val_range) '''predicting results on test set''' bag_of_words = nlp.generate_bag_of_words(nlp.vocabulary, nlp.vocab_lower_thresh, nlp.vocab_upper_thresh) training_vector = data_formatter.vectorize(nlp.x_train, bag_of_words) testing_vector = data_formatter.vectorize(nlp.x_test, bag_of_words) svm = SVC(kernel='linear', random_state=1) svm.C = nlp.c_val print("Training svm...") svm.fit(training_vector, nlp.y_train) print("Predicting model...") test_predictions = svm.predict(testing_vector) correct_predictions = 0 for i, _ in enumerate(test_predictions): if test_predictions[i] == nlp.y_test[i]: correct_predictions += 1 print("Final Accuracy: %" + str(correct_predictions / len(test_predictions) * 100))
# TF-IDF features for the newsgroup texts.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(newsgroup.data)
y = newsgroup.target

# Candidate C values: 10^-5 ... 10^5, searched with shuffled 5-fold CV.
C = np.power(10.0, np.arange(-5, 6))
grid = {'C': C}
k_folder = KFold(X.shape[0], n_folds=5, shuffle=True, random_state=241)
clf = SVC(kernel='linear', random_state=241)
grid_search = GridSearchCV(clf, grid, scoring='accuracy', cv=k_folder)
grid_search.fit(X, y)

optimal_parameters = {}
# max() returns the first entry with the highest mean validation score.
best_entry = max(grid_search.grid_scores_,
                 key=lambda entry: entry.mean_validation_score)
max_score = best_entry.mean_validation_score
optimal_c = best_entry.parameters['C']

# Refit on all data with the selected C.
clf.C = optimal_c
clf.fit(X, y)

# Pair every stored coefficient with its word and rank by absolute weight.
feature_mappings = vectorizer.get_feature_names()
result = {
    'words': [feature_mappings[i] for i in clf.coef_.indices],
    'values': [abs(weight) for weight in clf.coef_.data],
}
coef = DataFrame(data=result)
coef = coef.sort_values(by='values', ascending=False)

# The ten heaviest words, alphabetically sorted and space-separated.
words = coef.head(10)['words'].values.tolist()
output = " ".join(sorted(words))
coursera.output("svm_text_analyze.txt", output)
#4.1 GridSearchCV
# Hyperparameter grid
# NOTE(review): 'degree' is listed for the linear kernel; confirm whether
# 'poly' was intended.
tuned_parameters = [{
    'kernel': ['rbf'],
    'gamma': [1e-3, 1e-4]
}, {
    'kernel': ['linear'],
    'degree': [3, 5, 7, 9]
}]

clf = SVC()
# 100 C candidates from 1e1 to 1e10.  The former fourth positional argument
# (1000) was interpreted as ``endpoint`` and only acted as a truthy flag.
C_s = np.logspace(1, 10, 100)

scores = list()
scores_std = list()
for C in C_s:
    # Cross-validation
    clf.C = C
    this_scores = cross_val_score(clf, X_stand, Y, cv=5)
    scores.append(np.mean(this_scores))
    scores_std.append(np.std(this_scores))

# Select hyperparameters with GridSearchCV.  (The previous ``clfg.c = C``
# assignment was dropped: it set an attribute the search never reads.)
clfg = GridSearchCV(SVC(), tuned_parameters, cv=5)
clfg.fit(X_stand, Y)

print("Best parameters set found on development set:")
print()
print(clfg.best_params_)
print()
print("Grid scores on development set:")
print()
means = clfg.cv_results_['mean_test_score']
poly.degree=i for j in coef_list : print k poly.coef0=j score1=cross_val_score(poly,Xtrain,ytrain,cv=5) score.append(np.mean(score1)) if k==0 : i1=i j1=j if k>0 and np.mean(score1)>score[k-1] : i1=i j1=j print(i,j,np.mean(score1)) k=k+1''' for i in c_list: poly.C = i score1 = cross_val_score(poly, Xtrain, ytrain, cv=5) score.append(np.mean(score1)) if k == 0: i1 = i if k > 0 and np.mean(score1) > score[k - 1]: i1 = i print(k, i, ':', np.mean(score1)) '''0 (2, 0, 0.054740548179391016) 1 (2, 1, 0.1493362573941846) 2 (2, 2, 0.18009567595229553) 3