def mlPreddict(label):
    """Predict the class of one text sample with an SVM trained on ``classification.csv``.

    The CSV is read from the current working directory.  Its ``Names`` column
    is turned into bag-of-words counts and its ``Classes`` column supplies the
    targets.  Only the SVM determines the result, so the KNN, ridge, naive
    Bayes and decision-tree models that used to be fitted and scored here
    (with every score thrown away) are no longer trained.

    Args:
        label: The raw text to classify.

    Returns:
        The predicted class rendered as a string.  The value is also printed.
    """
    df = pd.read_csv('classification.csv')
    cv = CountVectorizer()
    x = cv.fit_transform(df['Names'])
    y = df.Classes

    # Same split ratio and seed as before, so the SVM is fitted on exactly
    # the same training rows and its predictions are unchanged.
    x_train, _x_test, y_train, _y_test = train_test_split(
        x, y, test_size=0.33, random_state=12)

    svm_c = svm.SVC()
    svm_c.fit(x_train, y_train)

    vect = cv.transform([label]).toarray()
    prediction = svm_c.predict(vect)
    predict = ''.join(map(str, prediction))
    print(predict)
    return predict
def fit_ridge(l2_reg, train_random_features, y_train, test_random_features, y_test):
    """Fit a ridge classifier and report its accuracy on both splits.

    Args:
        l2_reg: Regularisation strength passed to ``RidgeClassifier(alpha=...)``.
        train_random_features: Feature matrix used for fitting.
        y_train: Training labels; flattened with ``ravel()`` for fitting.
        test_random_features: Feature matrix of the held-out split.
        y_test: Labels of the held-out split.

    Returns:
        ``(train_accuracy, test_accuracy)``; both values are also printed.
    """
    model = RidgeClassifier(alpha=l2_reg)
    model.fit(train_random_features, y_train.ravel())

    accuracies = (
        model.score(train_random_features, y_train),
        model.score(test_random_features, y_test),
    )
    print("Train accuracy:", accuracies[0])
    print("Test accuracy:", accuracies[1])
    return accuracies
def fit_logistic_regression(self, X=None, y=None):
    """Fit a ``RidgeClassifier`` on the given data and return it.

    Despite the name, the estimator fitted here is a ridge classifier.

    Previously both arguments were overwritten unconditionally by
    module-level data, so whatever the caller passed was ignored.  They are
    now honoured; the module-level data is only the fallback.

    Args:
        X: Feature matrix.  Defaults to the module-level ``review_train_logged``.
        y: Integer labels.  Defaults to the ``has_reviewed`` column of the
            module-level ``review_train`` frame, cast to ``int``.

    Returns:
        The fitted ``RidgeClassifier``.
    """
    if X is None:
        X = review_train_logged
    if y is None:
        y = review_train['has_reviewed'].to_numpy().astype(int)

    # Author's note from the original code: training accuracy was about 0.68.
    model = RidgeClassifier().fit(X, y)
    return model
def scikit_ridgec_test(size):
    """Fit and score a ridge classifier on a synthetic problem with ``size`` features.

    Generates 1000 samples (2 informative and 2 redundant features), holds out
    25% of them, then fits and scores the classifier.  The score is computed
    but not returned; the function returns ``None``.

    Args:
        size: Number of features in the generated dataset.
    """
    features, targets = datasets.make_classification(
        n_samples=1000, n_features=size, n_informative=2, n_redundant=2)
    parts = train_test_split(features, targets, test_size=0.25, random_state=42)
    train_X, test_X, train_y, test_y = parts

    classifier = RidgeClassifier(random_state=42)
    classifier.fit(train_X, train_y)
    classifier.score(test_X, test_y)
def main():
    """Compare plain and bagged ridge classifiers over a grid of lambdas.

    Reads ``hw2_lssvm_all.dat`` from the working directory, uses the first
    400 lines for training and the rest for testing, and prints the smallest
    in-sample and out-of-sample 0/1 error together with the lambda that
    achieved it, first for ``RidgeClassifier`` and then for a 250-member
    ``BaggingClassifier`` built on it.
    """
    # load data
    file_lssvm = "hw2_lssvm_all.dat"
    size_train = 400
    with open(file_lssvm, 'r') as fr:
        list_line = fr.readlines()
    x_train, y_train = load_xy(list_line[:size_train])
    x_test, y_test = load_xy(list_line[size_train:])
    list_lambda = [0.05, 0.5, 5, 50, 500]

    def errors(model):
        """Fit ``model`` and return its (Ein, Eout) error rates."""
        model.fit(x_train, y_train)
        return 1 - model.score(x_train, y_train), 1 - model.score(x_test, y_test)

    def report(title, ein, eout):
        """Print the minimum errors and the lambdas that achieved them."""
        print(title)
        print("argminEin = {}, minEin_lambda = {}".format(
            min(ein), list_lambda[ein.index(min(ein))]))
        print("argminEout = {}, minEout_lambda = {}".format(
            min(eout), list_lambda[eout.index(min(eout))]))

    # Q9, Q10: run regression
    ein = []
    eout = []
    for lam in list_lambda:
        Ein, Eout = errors(RidgeClassifier(alpha=lam))
        ein.append(Ein)
        eout.append(Eout)
    report("Ridge Classifier:", ein, eout)
    print("==========")

    # Q11, Q12: Bagging
    ein = []
    eout = []
    num_iter = 250
    for lam in list_lambda:
        rcf = RidgeClassifier(alpha=lam)
        # The wrapped estimator is passed positionally: the keyword was
        # renamed from `base_estimator` to `estimator` in scikit-learn 1.2 and
        # the old spelling was removed in 1.4, but it is the first positional
        # parameter in every version.
        bcf = BaggingClassifier(rcf,
                                n_estimators=num_iter,
                                n_jobs=-1,
                                random_state=0)
        Ein, Eout = errors(bcf)
        ein.append(Ein)
        eout.append(Eout)
    report("Bagging Ridge Classifier:", ein, eout)
    return
def train_linear_classificator(challenge, new=False):
    """Train a ridge classifier that separates spam from non-spam ideas.

    Args:
        challenge: Name used to build the CSV paths.  The special value
            ``"all"`` reads every file in the training-data directory
            (only when ``new`` is false).
        new: When true, features are recomputed from the raw idea CSV via
            ``spamFilter``; otherwise previously stored feature lists are read.

    Returns:
        ``(clf, clf.coef_)`` -- the fitted classifier and its coefficients.
        The training accuracy is printed.
    """
    if new:
        # Rebuild the feature table from the raw ideas of this challenge.
        unigram_tagger, st = spamFilter.prepare_tagger()
        idealist = list(
            importDataHelper.readcsvdata(variables.ideadbpath + challenge + '.csv'))
        featurelist = {}
        for idea in idealist:
            idea['TRIGGERED'] = []
            idea['PREDICTION'] = "Ham"
            idea, ideafeatures = spamFilter.classify_and_get_idea(idea, unigram_tagger, st)
            # Label: 1 for ideas marked unusable or spam, 0 otherwise.
            if "unusable" in idea["STATUS"] or 'spam' in idea.get("SPAM", ""):
                ideafeatures["Spam"] = 1
            else:
                ideafeatures["Spam"] = 0
            # Append this idea's values to the per-feature columns.
            for key in ideafeatures.keys():
                featurelist[key] = featurelist.get(key, [])
                featurelist[key].append(ideafeatures[key])
    else:
        if challenge == "all":
            idealist = []
            for file in listdir(variables.linclasstrainingsdatapath):
                if isfile(join(variables.linclasstrainingsdatapath, file)):
                    idealist += list(importDataHelper.readcsvdata(join(variables.linclasstrainingsdatapath, file)))
        else:
            idealist = list(importDataHelper.readcsvdata(variables.linclasstrainingsdatapath + challenge + ".csv"))
        featurelist = {}
        # Each stored value is a stringified list such as "[1, 0, 1]".
        # NOTE(review): only the first row (idealist[0]) is parsed, so in the
        # "all" case the rows gathered from the other files are never used --
        # confirm whether they were meant to be merged.
        for key in idealist[0].keys():
            featurelist[key] = [int(x) for x in idealist[0][key].replace('[', '').replace(']', '').split(',')]
    # NOTE(review): the source line had lost its indentation; the statements
    # below are assumed to run for both branches -- confirm.
    testdata = pd.DataFrame(featurelist)
    X = testdata.drop('Spam', axis=1)
    y = testdata['Spam']
    importDataHelper.writecsvfiledict(variables.linclasstrainingsdatapath + challenge + ".csv", featurelist.keys(), featurelist)
    clf = RidgeClassifier().fit(X, y)
    print(clf.score(X, y))
    return clf, clf.coef_
def train_linear_classifier(featurelist):
    """Fit a ridge classifier on a feature table with a ``Spam`` target column.

    Args:
        featurelist: Mapping accepted by ``pd.DataFrame``; must contain a
            ``'Spam'`` column, which is used as the target.

    Returns:
        ``(clf, clf.coef_)``.  The training accuracy is printed.
    """
    frame = pd.DataFrame(featurelist)
    predictors = frame.drop('Spam', axis=1)
    target = frame['Spam']

    model = RidgeClassifier()
    model.fit(predictors, target)
    print(model.score(predictors, target))
    return model, model.coef_
def ridgeClass(features, response):
    """Print the hold-out accuracy of a default ridge classifier.

    The data is split 60/40 without a fixed seed, so the printed accuracy
    varies between calls.

    Args:
        features: Feature matrix.
        response: Target labels aligned with ``features``.
    """
    from sklearn.linear_model import RidgeClassifier

    parts = train_test_split(features, response, test_size=0.4)
    train_X, test_X, train_y, test_y = parts

    model = RidgeClassifier().fit(train_X, train_y)
    holdout_accuracy = model.score(test_X, test_y)
    print(holdout_accuracy)
def ridge_class_model(X_train2, X_validate2, X_test2, y_train2, y_validate2, y_test2):
    '''Fit a ridge classifier on the train split and print its accuracy on
    the train, validate and test splits.

    The "Test" line previously re-printed the train accuracy because it
    scored ``X_train2``/``y_train2``; it now scores ``X_test2``/``y_test2``.
    '''
    # create the model object
    clf2 = RidgeClassifier(random_state=123)

    # fit to train only
    clf2.fit(X_train2, y_train2)

    # score() returns the mean accuracy on the given data and labels
    print('Accuracy of Ridge Classifier Model on Train: \n',
          round(clf2.score(X_train2, y_train2), 4))
    print('Accuracy of Ridge Classifier Model on Validate: \n',
          round(clf2.score(X_validate2, y_validate2), 4))
    print('Accuracy of Ridge Classifier Model on Test: \n',
          round(clf2.score(X_test2, y_test2), 4))
def train(train_data, train_target, test_data, test_target, alphas):
    """Score intercept-free ridge classifiers for every alpha.

    For each alpha a model is fitted on the first 80% of the training data
    and scored on the remaining 20% (validation score); afterwards a second
    model per alpha is fitted on all training data and scored on the test
    data.  Warnings raised while fitting or scoring are recorded rather than
    shown.

    Returns:
        ``(val_scores, test_scores, caught_warnings, train_time)`` where
        ``train_time`` is the wall-clock duration in seconds.
    """
    start_time = time.time()

    def holdout_score(alpha):
        # Fit on the leading 80% and validate on the trailing 20%.
        cut = int(len(train_data) * 0.8)
        model = RidgeClassifier(alpha=alpha, fit_intercept=False)
        model.fit(train_data[:cut], train_target[:cut])
        return model.score(train_data[cut:], train_target[cut:])

    def test_score(alpha):
        # Refit on the complete training set before scoring on test data.
        model = RidgeClassifier(alpha=alpha, fit_intercept=False)
        model.fit(train_data, train_target)
        return model.score(test_data, test_target)

    with warnings.catch_warnings(record=True) as caught_warnings:
        warnings.simplefilter("always")
        val_scores = [holdout_score(alpha) for alpha in alphas]
        test_scores = [test_score(alpha) for alpha in alphas]

    train_time = time.time() - start_time
    return val_scores, test_scores, caught_warnings, train_time
class RidgeModel(ccobra.CCobraModel):
    """CCOBRA model for the "moral" domain backed by a ridge classifier."""

    def __init__(self, name='Ridge', k=1):
        """Create the model.

        Args:
            name: Model name handed to ``CCobraModel``.
            k: Accepted for interface compatibility; not used.
        """
        super(RidgeModel, self).__init__(name, ["moral"], ["single-choice"])
        self.clf = RidgeClassifier(alpha=7)
        self.n_epochs = 1

    def pre_train(self, dataset):
        """Build the training arrays from ``dataset`` and fit the classifier.

        ``dataset`` is iterated as subjects, each a sequence of task dicts
        with ``'item'`` and ``'response'`` entries.  Each dict gets a
        ``'task'`` entry added in place.
        """
        x = []
        y = []
        for subj_train_data in dataset:
            for seq_train_data in subj_train_data:
                seq_train_data['task'] = seq_train_data['item'].task
                inp = create_input(seq_train_data)
                target = float(output_mppng[seq_train_data['response'][0][0]])
                x.append(inp)
                y.append(target)

        self.train_x = np.array(x)
        self.train_y = np.array(y)
        self.train_network(self.train_x, self.train_y, self.n_epochs, verbose=True)

    def train_network(self, train_x, train_y, n_epochs, verbose=False):
        """Shuffle the data and (re)fit the classifier ``n_epochs`` times.

        The loop previously ran ``self.n_epochs`` times regardless of the
        ``n_epochs`` argument; the argument is now respected.  ``verbose`` is
        accepted but progress is always printed, as before.
        """
        print('Starting training...')
        for _epoch in range(n_epochs):
            # Shuffle the training data
            perm_idxs = np.random.permutation(np.arange(len(train_x)))
            train_x = train_x[perm_idxs]
            train_y = train_y[perm_idxs]

            self.clf.fit(train_x, train_y)

            print('Mean accuracy:')
            print(self.clf.score(train_x, train_y))

    def predict(self, item, **kwargs):
        """Return the predicted response for ``item``.

        The extra keyword arguments are forwarded to ``create_input`` under
        the ``'aux'`` key.  The prediction is also stored on
        ``self.prediction``.
        """
        # Named `features` rather than `input` to avoid shadowing the builtin.
        features = {'task': item.task}
        features['aux'] = kwargs
        x = np.array(create_input(features)).reshape(1, -1)
        output = self.clf.predict(x)

        self.prediction = output_mppngREV[output[0]]
        return self.prediction
def train(train_data, train_target, test_data, test_target, alphas):
    """Collect validation and test accuracy of ridge classifiers per alpha.

    Validation: fit on the first 80% of the training data, score on the
    last 20%.  Test: fit on the full training data, score on the test data.
    All warnings emitted during fitting and scoring are recorded.

    Returns:
        ``(val_scores, test_scores, caught_warnings, train_time)``;
        ``train_time`` is elapsed wall-clock seconds.
    """
    start_time = time.time()

    def fit_and_score(alpha, fit_X, fit_y, eval_X, eval_y):
        model = RidgeClassifier(alpha=alpha)
        model.fit(fit_X, fit_y)
        return model.score(eval_X, eval_y)

    test_scores = []
    with warnings.catch_warnings(record=True) as caught_warnings:
        warnings.simplefilter("always")

        val_scores = []
        for alpha in alphas:
            # validation score
            boundary = int(len(train_data) * 0.8)
            val_scores.append(fit_and_score(
                alpha,
                train_data[:boundary], train_target[:boundary],
                train_data[boundary:], train_target[boundary:]))

        for alpha in alphas:
            test_scores.append(fit_and_score(
                alpha, train_data, train_target, test_data, test_target))

    train_time = time.time() - start_time
    return val_scores, test_scores, caught_warnings, train_time
def linear_ridge(M, labels, seed, split=0.8):
    """Train a ridge classifier on ``M`` and report AUC on train/validation/test.

    Fixes relative to the previous version: the input matrix is read from the
    ``M`` parameter (an undefined name ``M_str`` was used before), and the
    ``split`` argument is forwarded to the splitter instead of a hard-coded
    ``0.8``.

    Inputs:
        M : matrix m*n where each row is a different example and the columns
            are the features
        labels : vector m*1 where each row is the class of the matching row
            of M
        seed : random seed used for the train/validation/test split
        split : number between 0 and 1; share of the data used for training
            versus testing. Default: 0.8

    Outputs:
        roc_auc_train : AUC score on the train set
        roc_auc_val : AUC score on the validation set
        roc_auc_test : AUC score on the test set
    """
    M_float = preprocessing_dataset.preprocessing_nan_normalization(M)
    (M_train_val, M_val, M_test,
     labels_train_val, labels_val, labels_test) = preprocessing_dataset.split_train_val_test(
        M_float, seed, labels, nb_val=3, split=split)

    X_train = M_train_val
    X_test = M_test
    # The classifier expects a flat label vector.
    Y_train = np.reshape(labels_train_val, (labels_train_val.shape[0], ))

    # Replace missing values with the column mean learnt on the train set.
    # NOTE(review): `Imputer` was removed in scikit-learn 0.22 in favour of
    # `sklearn.impute.SimpleImputer`; switching needs a file-level import.
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    imp = imp.fit(X_train)

    # Impute our data, then train
    X_train_imp = imp.transform(X_train)
    clf = RidgeClassifier()
    clf = clf.fit(X_train_imp, Y_train)

    # Impute the held-out sets with the statistics learnt on the train set.
    X_test_imp = imp.transform(X_test)
    X_val_imp = imp.transform(M_val)

    # Compute the AUC from the signed distances to the decision boundary.
    pred_train = clf.decision_function(X_train_imp)
    pred = clf.decision_function(X_test_imp)
    pred_val = clf.decision_function(X_val_imp)

    fpr, tpr, _ = roc_curve(Y_train, pred_train)
    roc_auc_train = auc(fpr, tpr)
    fpr, tpr, _ = roc_curve(labels_val, pred_val)
    roc_auc_val = auc(fpr, tpr)
    fpr, tpr, _ = roc_curve(labels_test, pred)
    roc_auc_test = auc(fpr, tpr)

    print(
        'linear ridge: train set: %0.5f, validation: %0.5f, test set: %0.5f'
        % (roc_auc_train, roc_auc_val, roc_auc_test))
    return roc_auc_train, roc_auc_val, roc_auc_test
def run_candidate(genome, X_train, X_test, y_train, y_test):
    """Score one genome: ridge accuracy on the state entries it selects.

    Every train and test case is run through ``run_case`` in parallel.  From
    each returned state the ``'sp'`` entry is indexed with ``genome`` to form
    the feature vector.  A ridge classifier is fitted on the training
    features and scored on the test features.

    Args:
        genome: Integer indices selecting entries of each state's ``'sp'``.
        X_train, y_train: Training cases and labels, zipped pairwise.
        X_test, y_test: Test cases and labels, zipped pairwise.

    Returns:
        Mean accuracy of the classifier on the test cases.
    """
    clf = RidgeClassifier(alpha=1.0)
    # `np.int` was removed in NumPy 1.24; it was an alias of builtin `int`.
    selection = np.asarray(genome, dtype=int)

    para_return = Parallel(n_jobs=-1)(
        delayed(run_case)(train_case_X, train_case_y)
        for train_case_X, train_case_y in zip(X_train, y_train))

    states = []
    trains = []
    for X, y in para_return:
        states.append(X)
        trains.append(y)
    trains = np.asarray(trains, dtype=np.float64)

    X_train_pc = [np.asarray(s['sp'])[selection] for s in states]
    print("RESERVOIR:", X_train_pc)
    clf.fit(X_train_pc, trains)

    # TESTING
    states = []
    test_labels = []
    test_para_return = Parallel(n_jobs=-1)(
        delayed(run_case)(X, y) for X, y in zip(X_test, y_test))
    for X, y in test_para_return:
        states.append(X)
        test_labels.append(y)
    test_labels = np.asarray(test_labels, dtype=np.float64)

    X_test_pc = [np.asarray(s['sp'])[selection] for s in states]
    return clf.score(X_test_pc, test_labels)
def ml_predict(label):
    """Predict the class of ``label`` with a ridge classifier trained on ``classification.csv``.

    The CSV is read from the current working directory; the ``Names`` column
    is vectorised into bag-of-words counts and ``Classes`` is the target.

    Fixes relative to the previous version: the ``label`` argument is now the
    text that gets classified (a hard-coded sample ``'Sara'`` was used
    before), and the four extra models whose scores were computed, sorted and
    then discarded are no longer trained.

    Args:
        label: The raw text to classify.

    Returns:
        The predicted class rendered as a string.
    """
    df = pd.read_csv('classification.csv')
    cv = CountVectorizer()
    x = cv.fit_transform(df['Names'])
    y = df.Classes

    # Same split ratio and seed as before, so the ridge model is fitted on
    # exactly the same training rows.
    x_train, _x_test, y_train, _y_test = train_test_split(
        x, y, test_size=0.33, random_state=12)

    r_c = RidgeClassifier()
    r_c.fit(x_train, y_train)

    vect = cv.transform([label]).toarray()
    predict_w = r_c.predict(vect)
    prdct = ''.join(map(str, predict_w))
    return prdct
def Classification_Test(N=400, eta=0.35, gamma=0.05, tau=400, bits=np.inf, num_waves=1000, test_size=0.1,
                        preload=False, write=False, mask=0.1, activate='mg', beta=1.0, power=7, t=1, theta=0.2):
    """Classify waveforms from the final state of a delay reservoir.

    Args:
        N: number of virtual high_nodes
        gamma: input gain
        eta: oscillation strength
        tau: loop delay length
        bits: bit precision
        num_waves: forwarded to ``make_training_testing_set``
        test_size: forwarded to ``make_training_testing_set`` as ``test_percent``
        preload: preload time-series data
        write: save created time-series data
        mask: amplitude of mask values
        activate: activation function to be used (sin**2,tanh,mg)
        beta: driver gain
        t: timestep used to solve diffeq
        power: exponent for MG equation
        theta: distance between virtual high_nodes

    Returns:
        ``[accuracy, mean_margin]``: the ridge model's accuracy on the test
        data and the mean of ``margin(clf, X_test)``.
    """
    X_train, X_test, y_train, y_test = make_training_testing_set(num_waves=num_waves, test_percent=test_size,
                                                                 preload=preload, write=write)

    clf = RidgeClassifier(alpha=0)

    # Random binary mask with entries +/- `mask`, one per virtual node.
    m = np.array([random.choice([-mask, mask]) for i in range(N)])

    # Instantiate Reservoir, feed in training and prediction data sets
    r1 = DelayReservoir(N=N, eta=eta, gamma=gamma, theta=theta, beta=beta, tau=tau, power=power)

    def final_states(data):
        """Return the last column of the reservoir response for each sample."""
        return np.array([
            np.array(r1.calculate(data[idx], m, bits, t, activate))[:, -1]
            for idx in tqdm(range(len(data)))
        ])

    # The two splits are converted separately.  Stacking them into a single
    # array is ragged whenever their sizes differ, which NumPy >= 1.24
    # rejects with a ValueError.
    train_states = final_states(X_train)
    test_states = final_states(X_test)

    clf.fit(train_states, y_train)

    # NOTE(review): margin() receives the raw X_test rather than the
    # reservoir states the classifier was trained on -- confirm intent.
    return [clf.score(test_states, y_test), np.mean(margin(clf, X_test))]
print('Features shape: {}'.format(conv_features['train'].shape)) print('Determining binarization threshold...') best_score = 0 best_threshold = 0 for threshold in np.arange(0, 3, 0.2): clf = RidgeClassifier(alpha=1.0, fit_intercept=False) # clf = SGDClassifier(loss='squared_loss', penalty='elasticnet', alpha=1.0, l1_ratio=0.5, fit_intercept=False) # clf = ElasticNet(alpha=0.0001, l1_ratio=l1_ratio, fit_intercept=False, selection='random') features_train = threshold_binarize(conv_features['train'], threshold) features_test = threshold_binarize(conv_features['test'], threshold) clf.fit(features_train, labels_train) score = clf.score(features_test, labels_test) # score = clf_score(clf, features_test, labels_test) print('Threshold', threshold, 'Score', score) if score > best_score: best_score = score best_threshold = threshold print( 'Finished threshold determination. Best threshold: {}. Best Score: {}.' .format(best_threshold, best_score)) features_train = threshold_binarize(conv_features['train'], best_threshold) features_test = threshold_binarize(conv_features['test'],
plt.legend(loc="lower right") plt.show() #RidgeClassifier Algorithm from sklearn.linear_model import RidgeClassifier RC = RidgeClassifier() RC = RC.fit(X_train, y_train) RC #accuracy of RidgeClassifier Algorithm y_pred1 = RC.predict(X_test) print('Accuracy score= {:.2f}'.format(RC.score(X_test, y_test))) #ROC curve of RidgeClassifier Algorithm from sklearn.metrics import roc_curve, auc import matplotlib.pyplot as plt fpr, tpr, thresholds = roc_curve(y_test, y_pred1) roc_auc = auc(fpr, tpr) plt.figure() plt.plot(fpr, tpr, color='darkorange',
# Evaluate a ridge classifier over the stored train/test permutations.
# Each row of the index file is one permutation of the sample indices; the
# first 70% of a row is the training set and the rest the test set.
# np.savetxt(base_dir+"classifications/train_test_split.csv", idxs, delimiter=",")
idxs = np.loadtxt(base_dir + "classifications/train_test_split.csv",
                  delimiter=",").astype(int)

cnt = 0
for idx in idxs:
    ntrain = int(np.round(0.7 * ndata))
    idx_train = idx[0:ntrain]
    idx_test = idx[ntrain:]

    # use 'class2' here to train machine on two classes
    # only, aurora and non-aurora (instead of 'class6')
    X_train = features[idx_train, :]
    y_train = df["class6"][idx_train]
    X_test = features[idx_test, :]
    y_test = df["class6"][idx_test]

    # `normalize=False` was dropped: it equalled the default and the
    # parameter was removed in scikit-learn 1.2, where passing it raises
    # TypeError.
    clf = RidgeClassifier(random_state=10 * cnt, alpha=alpha)
    clf.fit(X_train, y_train)
    avgscore[cnt] = clf.score(X_test, y_test)
    cnt += 1

print("==========================")
print("follow are testing results")
print("==========================")
print("average accuracy: ", np.mean(avgscore))
print("standard deviation of accuracy: ", np.std(avgscore))

# The confusion matrix below describes the last split only.
y_pred = clf.predict(X_test)
mat = confusion_matrix(y_test, y_pred)
print("confusion matrix is:")
print(mat)
def general_model(df, training_data):
    """Predict gender from the last three characters of each cleaned name.

    A ridge classifier is trained on ``training_data`` using one-hot
    encodings of the characters at positions -3, -2 and -1, then applied to
    ``df['cleaned_name']``.

    Fix relative to the previous version: training and prediction now share
    one fixed column layout (``'?'`` plus ``a``-``z`` for each of the three
    positions).  Before, the training matrix contained only the characters
    that happened to occur, while the prediction matrix appended the missing
    ones at the end, so column count and order could differ between the two.

    Args:
        df: Frame with a ``cleaned_name`` column.  A ``model_prediction``
            column (``'F'``/``'M'``) is added in place.
        training_data: Mapping of name -> gender, where ``'F'`` marks the
            positive class and anything else the negative class.

    Returns:
        ``df`` with the added ``model_prediction`` column.
    """
    positions = [-3, -2, -1]
    alphabet = ['?'] + [chr(i) for i in range(97, 123)]

    def suffix_features(names):
        """One-hot encode the trailing characters over the fixed alphabet."""
        blocks = []
        for pos in positions:
            block = pd.get_dummies(names.str[pos])
            for letter in alphabet:
                if letter not in block.columns:
                    block[letter] = 0
            # Selecting by `alphabet` fixes the column order and drops any
            # character the model has no column for.
            blocks.append(block[alphabet].astype(int))
        return pd.concat(blocks, axis=1)

    ml_names, ml_genders = [], []
    for k, v in training_data.items():
        # Collapse every run of characters outside a-z into a single '?'.
        ml_names.append(re.sub('[^a-z]+', '?', k))
        ml_genders.append(v)

    training_df = pd.DataFrame()
    training_df['name'] = ml_names
    training_df['gender'] = ml_genders

    modelX = suffix_features(training_df['name'])
    modelY = training_df['gender'].apply(lambda g: 1 if g == 'F' else 0)

    X_train, X_test, y_train, y_test = train_test_split(modelX,
                                                        modelY,
                                                        test_size=0.2,
                                                        random_state=42)

    model = RidgeClassifier(fit_intercept=False, solver='lsqr')
    model.fit(X_train, y_train)

    print("Model Scores (Train/Test):")
    print(model.score(X_train, y_train))
    print(model.score(X_test, y_test))

    # Predict using model
    actualX = suffix_features(df['cleaned_name'])
    predicted_values = model.predict(actualX)
    df['model_prediction'] = ['F' if value else 'M' for value in predicted_values]
    return df
plt.savefig('gmm_images/{}_h1_cder_n_{}.png'.format(n, num_dgms))
plt.close()

# ---------------------------------------------------------------------------
# CDER features
# ---------------------------------------------------------------------------
X_train_features_1 = get_all_features(X_train, ellipses, f_ellipse)
X_test_features_1 = get_all_features(X_test, ellipses, f_ellipse)

# ---------------------------------------------------------------------------
# Ridge classification on the two feature groups stacked column-wise.
# The timer covers stacking, fitting and scoring.
# ---------------------------------------------------------------------------
t0 = time.time()
train_parts = (X_train_features_0, X_train_features_1)
test_parts = (X_test_features_0, X_test_features_1)
X_train_features = np.column_stack(train_parts)
X_test_features = np.column_stack(test_parts)

ridge_model = RidgeClassifier()
ridge_model = ridge_model.fit(X_train_features, F_train)
score_train.append(ridge_model.score(X_train_features, F_train))
score_test.append(ridge_model.score(X_test_features, F_test))
t1 = time.time()

print('Ridge Classification: {}'.format(t1 - t0))
# Running mean and standard deviation over all scores collected so far.
print(np.mean(score_train), np.std(score_train))
print(np.mean(score_test), np.std(score_test))
def train_fold(train_ind, test_ind, val_ind, graph_feat, features, y, y_data, params, subject_IDs):
    """Evaluate one fold with a linear (ridge) baseline and with GCNs.

        train_ind       : indices of the training samples
        test_ind        : indices of the test samples
        val_ind         : indices of the validation samples
        graph_feat      : population graph computed from phenotypic measures,
                          num_subjects x num_subjects
        features        : feature vectors, num_subjects x num_features
        y               : ground truth labels (num_subjects x 1)
        y_data          : ground truth labels, different representation
                          (num_subjects x 2)
        params          : dictionary of GCN parameters
        subject_IDs     : list of subject IDs

    returns:

        test_acc    : number of correctly classified test samples using GCNs
        test_auc    : area under curve over the test samples using GCNs
        lin_acc     : number of correctly classified test samples using the
                      linear classifier
        lin_auc     : area under curve over the test samples using the
                      linear classifier
        fold_size   : number of test samples
    """
    print(len(train_ind))

    # Optionally restrict the labelled samples to a subset of the training set.
    labeled_ind = Reader.site_percentage(train_ind, params['num_training'], subject_IDs)

    # Feature selection / dimensionality reduction.
    x_data = Reader.feature_selection(features, y, labeled_ind, params['num_features'])

    fold_size = len(test_ind)

    # Square matrix of pairwise correlation distances between subjects.
    pairwise = distance.squareform(distance.pdist(x_data, metric='correlation'))
    bandwidth = np.mean(pairwise)
    # Gaussian kernel on the distances, weighted by the phenotypic graph.
    affinity = np.exp(-pairwise**2 / (2 * bandwidth**2))
    population_graph = graph_feat * affinity

    # Linear classifier baseline.
    test_x = x_data[test_ind, :]
    ridge = RidgeClassifier()
    ridge.fit(x_data[train_ind, :], y[train_ind].ravel())
    lin_acc = ridge.score(test_x, y[test_ind].ravel())
    lin_auc = sklearn.metrics.roc_auc_score(y[test_ind] - 1, ridge.decision_function(test_x))
    print("Linear Accuracy: " + str(lin_acc))

    # Classification with GCNs.
    test_acc, test_auc = Train.run_training(population_graph,
                                            sparse.coo_matrix(x_data).tolil(),
                                            y_data, train_ind, val_ind,
                                            test_ind, params)
    print(test_acc)

    # Report counts of correctly classified samples instead of fractions.
    n_test = len(test_ind)
    gcn_correct = int(round(test_acc * n_test))
    lin_correct = int(round(lin_acc * n_test))
    return gcn_correct, test_auc, lin_correct, lin_auc, fold_size
def train_fold(train_ind, test_ind, val_ind, graph_feat, graph_feat2, features, y, y_data, idx, lr, params,
               subject_IDs, pathToSave, i):
    """Evaluate one fold with a ridge baseline and a two-graph GCN, saving the results.

        train_ind       : indices of the training samples
        test_ind        : indices of the test samples
        val_ind         : indices of the validation samples
        graph_feat      : first population graph (marked "Gender" in the code),
                          num_subjects x num_subjects
        graph_feat2     : second population graph (marked "Age" in the code)
        features        : feature vectors, num_subjects x num_features
        y               : ground truth labels (num_subjects x 1)
        y_data          : ground truth labels, different representation
                          (num_subjects x 2)
        idx, lr, i      : forwarded unchanged to ``Train.run_training``
        params          : dictionary of GCN parameters
        subject_IDs     : list of subject IDs (not used in this function)
        pathToSave      : output directory prefix; results are written to
                          ``pathToSave + 'excel/'``

    Side effects: writes a ``.mat`` file and an ``.xlsx`` file, and appends
    the per-fold frame to the module-level ``prediction`` list.

    Fixes relative to the previous version:
      * ``test_acc`` was multiplied by ``len(test_ind)`` twice before being
        returned; it is now converted to a count once, like ``lin_acc``.
      * the Excel file is written through a context manager, because
        ``ExcelWriter.save()`` no longer exists in pandas 2.0.

    returns:

        test_acc    : number of correctly classified test samples using GCNs
        test_auc    : area under curve over the test samples using GCNs
        lin_acc     : number of correctly classified test samples using the
                      linear classifier
        lin_auc     : area under curve over the test samples using the
                      linear classifier
        fold_size   : number of test samples, wrapped as ``[[n]]`` (kept from
                      the previous implementation)
    """
    tf.reset_default_graph()
    tf.app.flags._global_parser = argparse.ArgumentParser()

    print(len(train_ind))

    # Use the complete training set as labelled data.
    # NOTE(review): lower-case `reader` here, `Reader` on the next call --
    # confirm both names are defined in this module.
    labeled_ind = reader.site_percentage(train_ind,1.0)

    # feature selection/dimensionality reduction step
    x_data = Reader.feature_selection(features, y, labeled_ind, params['num_features'])

    fold_size = len(test_ind)

    # Square matrix of pairwise correlation distances, turned into a
    # Gaussian affinity.
    distv = distance.pdist(x_data, metric='correlation')
    dist = distance.squareform(distv)
    sigma = np.mean(dist)
    sparse_graph = np.exp(- dist ** 2 / (2 * sigma ** 2))

    # Stored under the key 'folds' in the .mat file below.
    num_nodes = 662
    final_graph = graph_feat * sparse_graph  # Gender
    final_graph2 = graph_feat2 * sparse_graph  # Age

    # Linear classifier
    clf = RidgeClassifier()
    clf.fit(x_data[train_ind, :], y[train_ind].ravel())
    lin_acc = clf.score(x_data[test_ind, :], y[test_ind].ravel())
    pred = clf.decision_function(x_data[test_ind, :])
    lin_auc = sklearn.metrics.roc_auc_score(y[test_ind] - 1, pred)
    print("Linear Accuracy: " + str(lin_acc))

    # Classification with GCNs
    test_acc, test_auc, weights = Train.run_training(final_graph, final_graph2,
                                                     sparse.coo_matrix(x_data).tolil(), y_data,
                                                     train_ind, val_ind, test_ind,
                                                     idx, lr, params, pathToSave, i)

    weights_0 = weights[0]
    weights_1 = weights[1]
    # Single-element reductions, kept so the saved values are NumPy scalars
    # as before.  Accuracies are still fractions at this point.
    scores_lin_ = np.sum([lin_acc])
    scores_auc_lin_ = np.mean([lin_auc])
    scores_acc_ = np.sum([test_acc])
    scores_auc_ = np.mean([test_auc])

    if not os.path.exists(pathToSave + 'excel/'):
        os.makedirs(pathToSave + 'excel/')
    pathToSave2 = pathToSave + 'excel/'

    result_name = 'ABIDE_classification.mat'
    # NOTE(review): `trial` and `prediction` are not defined in this
    # function; presumably module-level names -- confirm.
    sio.savemat(pathToSave2 + str(trial) + result_name,
                {'lin': scores_lin_, 'lin_auc': scores_auc_lin_,
                 'acc': scores_acc_, 'auc': scores_auc_, 'folds': num_nodes,
                 'weights_0': weights_0, 'weights_1': weights_1})

    df = pd.DataFrame({'scores_acc': [scores_acc_],
                       'scores_auc': [scores_auc_],
                       'scores_lin': [scores_lin_],
                       'scores_auc_lin': [scores_auc_lin_],
                       'weights_0': weights_0,
                       'weights_1': weights_1})
    prediction.append(df)

    # The context manager saves and closes the workbook on exit.
    with pd.ExcelWriter(pathToSave2 + str(test_ind[0]) + '.xlsx', engine='xlsxwriter') as writer_n:
        df.to_excel(writer_n, sheet_name='Sheet1')

    # return number of correctly classified samples instead of percentage
    test_acc = int(round(test_acc * len(test_ind)))
    lin_acc = int(round(lin_acc * len(test_ind)))

    # NOTE(review): the previous code wrapped fold_size in a list twice; the
    # nested shape is preserved so existing callers see the same value.
    fold_size = [[fold_size]]

    return test_acc, test_auc, lin_acc, lin_auc, fold_size
# print(accuracy_score(y_val, ypred))
# print(confusion_matrix(y_val, ypred))

# # Linreg Using sklearn
# clf = LinearRegression()
# clf.fit(X, y_train)
# print(clf.score(X_val[selected_features], y_val))
# print(accuracy_score(y_val, clf.predict(X_val[selected_features])))
# print(confusion_matrix(y_val , clf.predict(X_val[selected_features])))

# # Polynomial features ?
# from sklearn.preprocessing import PolynomialFeatures
# polynomial_features= PolynomialFeatures(degree=3)
# Xp = polynomial_features.fit_transform(X)
# model = sm.OLS(y_train,Xp)
# results = model.fit()
# print(results.summary())

#%% Feature Selection - Ridge Regression
# Recursive feature elimination: drop one feature per step until
# `n_features` remain, ranking with a class-balanced ridge classifier.
n_features = 5
estimator = RidgeClassifier(class_weight='balanced')
selector = RFE(estimator, n_features_to_select=n_features, step=1, verbose=0)
selector = selector.fit(X_train, y_train)
kept_positions = np.where(selector.support_)[0]
selected_features = np.take(features, kept_positions)

# Refit on the selected columns only and evaluate on the validation split.
clf = RidgeClassifier(class_weight='balanced')
clf.fit(X_train[selected_features], y_train)

X_val_selected = X_val[selected_features]
y_val_pred = clf.predict(X_val_selected)
print(clf.score(X_val_selected, y_val))
print(balanced_accuracy_score(y_val, y_val_pred))
print(confusion_matrix(y_val, y_val_pred))
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import RidgeClassifier
import matplotlib.pyplot as plt

X, y = load_breast_cancer(return_X_y=True)

# Keep a single feature (column 2) as an (n_samples, 1) matrix.
X = X[:, 2].reshape(-1, 1)

# Train on everything except the last 30 samples; test on those 30.
X_train, y_train = X[:-30], y[:-30]
X_test, y_test = X[-30:], y[-30:]

clf = RidgeClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

print(y_test)
train_score = clf.score(X_train, y_train)
test_score = clf.score(X_test, y_test)
print("Train Set score: {}".format(train_score))
print("Test Set score: {}".format(test_score))
X_train_features_Z1_tent, X_train_features_H1_tent, X_train_features_S1_tent, X_train_features_V1_tent)) X_test_features = np.column_stack( (X_test_features_R0_tent, X_test_features_G0_tent, X_test_features_B0_tent, X_test_features_X0_tent, X_test_features_Y0_tent, X_test_features_Z0_tent, X_test_features_H0_tent, X_test_features_S0_tent, X_test_features_V0_tent, X_test_features_R1_tent, X_test_features_G1_tent, X_test_features_B1_tent, X_test_features_X1_tent, X_test_features_Y1_tent, X_test_features_Z1_tent, X_test_features_H1_tent, X_test_features_S1_tent, X_test_features_V1_tent)) ### Ridge Model ridge_model = RidgeClassifier().fit(X_train_features, y_train) tent_train_accuracy_ridge[k] = ridge_model.score(X_train_features, y_train) tent_test_accuracy_ridge[k] = ridge_model.score(X_test_features, y_test) ### SVM Model c = 5 svm_model = SVC(kernel='rbf', C=c).fit(X_train_features, y_train) tent_train_accuracy_svm[k] = svm_model.score(X_train_features, y_train) tent_test_accuracy_svm[k] = svm_model.score(X_test_features, y_test) ### CDER Adaptive X_train_features_R0_cder, X_test_features_R0_cder = adaptive_features( R0_train_sample, R0_test_sample, "cder", y_train) X_train_features_G0_cder, X_test_features_G0_cder = adaptive_features( G0_train_sample, G0_test_sample, "cder", y_train) X_train_features_B0_cder, X_test_features_B0_cder = adaptive_features( B0_train_sample, B0_test_sample, "cder", y_train)
print(precision_score(y_test,log_predict,average='binary'))

#%%Ridge regression
from sklearn.linear_model import RidgeClassifier

# Sweep the regularisation strength alpha (larger values shrink the
# coefficients more) and record train and test accuracy for each value.
alpharange = np.arange(start=0.05,stop=1.0,step=0.05)
ridge_trainS = []
ridge_testS = []

for a in alpharange:
    ridge = RidgeClassifier(alpha=a).fit(X_train, y_train)
    train_accuracy = ridge.score(X_train, y_train)
    test_accuracy = ridge.score(X_test, y_test)
    ridge_trainS.append(train_accuracy)
    ridge_testS.append(test_accuracy)

for curve, curve_label in ((ridge_trainS, "Training Accuracy"),
                           (ridge_testS, "Test Accuracy")):
    plt.plot(alpharange, curve, label=curve_label)
plt.title("Ridge Scores")
plt.ylabel("Accuracy")
plt.xlabel("Alpha")
plt.grid()
plt.legend()

#%% KNN Classifier
# Candidate neighbour counts 1..10.
neighbors = np.arange(1, 11)
knn_trainS = []
return cs if __name__ == '__main__': # Add LDA component to auto-sklearn. autosklearn.pipeline.components.feature_preprocessing.add_preprocessor( AutoBinning) # Create dataset. from sklearn.datasets import load_breast_cancer from sklearn.model_selection import train_test_split X, y = load_breast_cancer(return_X_y=True) X_train, X_test, y_train, y_test = train_test_split(X, y) from sklearn.linear_model import RidgeClassifier clf = RidgeClassifier().fit(X_train, y_train) print(clf.score(X_test, y_test)) print('binning') pp = AutobinningTransform(binning_method='xgb') X_train = pp.fit_transform(X_train, y_train) X_test = pp.transform(X_test) clf = RidgeClassifier().fit(X_train, y_train) print(clf.score(X_test, y_test)) print('=' * 100) # Configuration space. cs = AutoBinning.get_hyperparameter_search_space() print(cs) # Fit the model using LDA as preprocessor. clf = autosklearn.classification.AutoSklearnClassifier( include_preprocessors=['AutoBinning'], )
# %% feature sets
# `prefeatures` drops the target, identifiers and raw purchase columns;
# `postfeatures` additionally drops the listed demographic columns.
prefeatures = data.drop(columns=['loyal','subsequent_purchases','insert_num','distance_missing','purchase_make','purchase_model','purchase_make_cat'])
postfeatures = prefeatures.drop(columns=['gender_missing','income_missing','customer_age','distance_binned'])
labels = data['loyal']

# %%
# 60% train, then the remaining 40% is split evenly into test and validation.
features = postfeatures
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.4, random_state=42)
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

# %%
model = RidgeClassifier()
model.fit(X_train, y_train)
predictions = model.predict(X_test)
# Prints validation accuracy followed by test accuracy.
print(model.score(X_val, y_val),metrics.accuracy_score(y_test, predictions))

# %%
model = RandomForestClassifier(n_estimators=50,max_depth=10,class_weight='balanced')
model.fit(X_train, y_train)
predictions = model.predict(X_test)
print(model.score(X_val, y_val),metrics.accuracy_score(y_test, predictions))

df = pd.DataFrame()
df['feat'] = X_train.columns
df['score'] = model.feature_importances_
print(df.sort_values(by='score',ascending=False))

# scikit-learn metrics take (y_true, y_pred).  The arguments used to be
# passed the other way round, which transposed the confusion matrix and
# swapped precision with recall in the report.
print(metrics.confusion_matrix(y_test, predictions))
print(metrics.classification_report(y_test, predictions))

# %%
effect = []
f = np.frompyfunc(mysub, 1, 1)

# Column holding the value to predict; it must not be used as a feature.
TargetCol = 1

# Defining the numeric feature columns explicitly
NumCols = [0, 2, 5, 6, 7, 9]

# Every remaining column is treated as a categorical feature.  The target
# column is excluded: it used to fall into this list (it is not in NumCols),
# which put the label itself among the features.
StrCols = [i for i in range(data.shape[1])
           if i not in NumCols and i != TargetCol]

# Encode the data: `f` is applied element-wise to the numeric columns and
# the label encoder to each categorical COLUMN (axis 0).  It was previously
# applied along axis 1, i.e. to each row, so a code only reflected the rank
# of a value among the other fields of the same row.
EncData = np.hstack([
    f(data[:, NumCols]),
    np.apply_along_axis(le.fit_transform, 0, data[:, StrCols])
])

# Converting the encoded data to numpy float data type
EncData = np.array(EncData, dtype=np.float64)

# Picking the predicted variable from the train data
PredVariable = data[:, TargetCol].astype(int)

# Replace every NaN with the mean of its column.
col_mean = np.nanmean(EncData, axis=0)
inds = np.where(np.isnan(EncData))
# inds[1] holds the column index of each NaN, so take() aligns the means.
EncData[inds] = np.take(col_mean, inds[1])

# Splitting the data to train and test
X_train, X_test, y_train, y_test = train_test_split(EncData,
                                                    PredVariable,
                                                    test_size=0.33)

# Defining and applying the ridge classifier to the data
clf = RidgeClassifier().fit(X_train, y_train)
clf.score(X_train, y_train)

# Print the classification report of the predictions
print(classification_report(y_test, clf.predict(X_test)))