def RBM_SVM(trainfeatures, testfeatures, trainlabels, testlabels):
    """Train a BernoulliRBM + sigmoid-kernel SVC pipeline and return its test error.

    Features are min-max scaled with a scaler fitted on the training
    features only; the same fitted scaler is applied to the test features.

    Args:
        trainfeatures: Training feature matrix.
        testfeatures: Test feature matrix.
        trainlabels: Labels for ``trainfeatures``.
        testlabels: Ground-truth labels for ``testfeatures``.

    Returns:
        float: Fraction of test samples whose prediction differs from
        ``testlabels``.
    """
    print("train RBM+SVM model")

    # 0-1 scaling, fitted on the training data only.
    min_max_scaler = preprocessing.MinMaxScaler()
    trainfeatures_fs = min_max_scaler.fit_transform(trainfeatures)
    testfeatures_fs = min_max_scaler.transform(testfeatures)

    # SVM parameters
    clf = svm.SVC(C=5.0, kernel='sigmoid', degree=3, gamma=0.5, coef0=10.0,
                  shrinking=True, probability=False, tol=0.001,
                  cache_size=200, class_weight=None, verbose=False,
                  max_iter=-1, random_state=None)

    # RBM parameters. More components tend to give better prediction
    # performance, but larger fitting time.
    rbm = BernoulliRBM(random_state=0, verbose=True)
    rbm.learning_rate = 0.06
    rbm.n_iter = 20
    rbm.n_components = 400

    # Machine learning pipeline
    classifier = Pipeline(steps=[('rbm', rbm), ('svm', clf)])
    classifier.fit(trainfeatures_fs, trainlabels)

    results = classifier.predict(testfeatures_fs).ravel()
    # Flatten the labels as well: comparing an (n, 1) label column with the
    # (n,) predictions would broadcast to an (n, n) matrix and corrupt the
    # error count.
    expected = np.asarray(testlabels).ravel()
    n_samples = len(expected)
    testerror = float(n_samples - np.sum(expected == results)) / float(n_samples)
    return testerror
def train_nn(data, expected_values):
    """Fit an RBM -> logistic-regression pipeline and return a predictor.

    The inputs are first passed through ``preprocess_data``; the result is
    split 80/20 into train and test parts, the pipeline is fitted on the
    train part and a classification report for the test part is logged.

    Args:
        data: Raw samples, forwarded to ``preprocess_data``.
        expected_values: Targets, forwarded to ``preprocess_data``.

    Returns:
        A callable ``f(x)`` that runs the fitted pipeline's ``predict`` on
        ``x``, casts the result to float and passes it through
        ``wrap_threshold_distribtuion``.
    """
    data, expected_values = preprocess_data(
        data, expected_values, remove_high_rr=False)

    logger.info("Starting feature reduction.")
    # NOTE(review): the first entry of `data` is dropped here while
    # `expected_values` is used unsliced -- confirm both stay aligned.
    features = np.asarray(data[1:], 'float64')
    logger.info("Done with feature reduction.")

    targets = expected_values
    X_train, X_test, Y_train, Y_test = train_test_split(
        features, targets, test_size=0.2, random_state=0)

    logger.info("Starting NeuralNetwork training.")
    feature_extractor = BernoulliRBM(
        n_components=100, learning_rate=0.06, n_iter=20,
        random_state=0, verbose=True)
    logistic = linear_model.LogisticRegression(C=1.0)
    clf = Pipeline(steps=[('rbm', feature_extractor), ('logistic', logistic)])
    clf.fit(X_train, Y_train)

    # Evaluation
    # TODO: Make unified evaluation
    report = metrics.classification_report(Y_test, clf.predict(X_test))
    logger.info("Logistic regression using RBM features:\n%s\n" % report)
    logger.info("Done with NeuralNetwork training.")

    return lambda x: wrap_threshold_distribtuion(
        np.array(clf.predict(x)).astype(float))
def rbm():
    """Train an RBM -> logistic-regression pipeline and report its predictions.

    Data comes from ``train_test_data(is_feature=False)``. Every sample is
    flattened to a 1-D feature vector before fitting/predicting, and the
    predictions are handed to ``result_analysis`` under the label
    'BernoulliRBM'.
    """
    X_train, Y_train, X_test, Y_test = train_test_data(is_feature=False)

    # More components tend to give better prediction performance, but
    # larger fitting time.
    extractor = BernoulliRBM(n_components=100, learning_rate=0.06, n_iter=10,
                             random_state=0, verbose=True)
    logistic = linear_model.LogisticRegression(
        solver='newton-cg', tol=1, C=50)
    rbm_features_classifier = Pipeline(
        steps=[('rbm', extractor), ('logistic', logistic)])

    # Training RBM-Logistic Pipeline on flattened samples
    flat_train = X_train.reshape(X_train.shape[0], -1)
    rbm_features_classifier.fit(flat_train, Y_train)

    flat_test = X_test.reshape(X_test.shape[0], -1)
    Y_pred = rbm_features_classifier.predict(flat_test)

    result_analysis(Y_pred, Y_test, 'BernoulliRBM')
def Logistic():
    """Compare logistic regression on RBM features against raw features.

    Reads the module-level ``data_train``, ``target_train``, ``data_test``
    and ``target_test``. For both models a classification report is printed
    and a confusion-matrix plot is saved ('confusion_matrix3.jpg' for the
    RBM pipeline, 'confusion_matrix4.jpg' for the plain classifier).
    """
    logistic = linear_model.LogisticRegression()
    rbm = BernoulliRBM(random_state=0, verbose=True)
    classifier = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)])

    # RBM parameters obtained after cross-validation
    rbm.learning_rate = 0.01
    rbm.n_iter = 121
    rbm.n_components = 700
    logistic.C = 1.0

    # Training RBM-Logistic Pipeline
    classifier.fit(data_train, target_train)

    # Training Logistic regression
    logistic_classifier = linear_model.LogisticRegression(C=1.0)
    logistic_classifier.fit(data_train, target_train)

    print("printing_results")

    # Predict once per model and reuse the result for both the report and
    # the confusion matrix instead of running the model twice.
    rbm_predictions = classifier.predict(data_test)
    print("Logistic regression using RBM features:\n%s\n" % (
        metrics.classification_report(target_test, rbm_predictions)))
    cm3 = confusion_matrix(target_test, rbm_predictions)
    plt.matshow(cm3)
    plt.title('Confusion Matrix Logistic Regression with RBM Features')
    plt.colorbar()
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.savefig('confusion_matrix3.jpg')

    raw_predictions = logistic_classifier.predict(data_test)
    print("Logistic regression using raw pixel features:\n%s\n" % (
        metrics.classification_report(target_test, raw_predictions)))
    cm4 = confusion_matrix(target_test, raw_predictions)
    plt.matshow(cm4)
    plt.title('Confusion Matrix Logistic Regression')
    plt.colorbar()
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.savefig('confusion_matrix4.jpg')
def SGD():
    """Compare a hinge-loss SGD classifier on RBM features against raw features.

    Reads the module-level ``data_train``, ``target_train``, ``data_test``
    and ``target_test``. For both models a classification report is printed
    and a confusion-matrix plot is saved ('confusion_matrix1.jpg' for the
    RBM pipeline, 'confusion_matrix2.jpg' for the plain classifier).
    """
    # The local is deliberately not called `SGD`, so it does not shadow this
    # function. The original also set `SGD.C = 1`; SGDClassifier has no `C`
    # hyper-parameter, so that assignment was dropped.
    sgd = linear_model.SGDClassifier(loss='hinge', penalty='l2',
                                     random_state=42, n_jobs=-1,
                                     alpha=0.0001, epsilon=0.001)
    rbm = BernoulliRBM(random_state=0, verbose=True)
    classifier = Pipeline(steps=[('rbm', rbm), ('SGD', sgd)])

    # RBM parameters obtained after cross-validation
    rbm.learning_rate = 0.01
    rbm.n_iter = 15
    rbm.n_components = 50

    # Training SGD
    SGD_classifier = linear_model.SGDClassifier(loss='hinge', penalty='l2',
                                                random_state=42, n_jobs=-1,
                                                alpha=0.0001, epsilon=0.001)
    SGD_classifier.fit(data_train, target_train)

    # Training RBM-SGD Pipeline
    classifier.fit(data_train, target_train)

    print("printing_results")

    # Predict once per model and reuse the result for both the report and
    # the confusion matrix.
    rbm_predictions = classifier.predict(data_test)
    print("SGD using RBM features:\n%s\n" % (
        metrics.classification_report(target_test, rbm_predictions)))
    cm = confusion_matrix(target_test, rbm_predictions)
    plt.matshow(cm)
    plt.title('Confusion Matrix SVM with SDG with RBM Features')
    plt.colorbar()
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.savefig('confusion_matrix1.jpg')

    raw_predictions = SGD_classifier.predict(data_test)
    print("SGD using raw pixel features:\n%s\n" % (
        metrics.classification_report(target_test, raw_predictions)))
    cm1 = confusion_matrix(target_test, raw_predictions)
    plt.matshow(cm1)
    plt.title('Confusion Matrix SVM with SDG Raw Features')
    plt.colorbar()
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.savefig('confusion_matrix2.jpg')
def restrictedBoltzmannMachine(trainData, trainLabels, testData):
    """Fit an RBM -> logistic-regression pipeline and predict labels for testData.

    Args:
        trainData: Training feature matrix.
        trainLabels: Labels for ``trainData``.
        testData: Feature matrix to classify.

    Returns:
        The labels predicted by the fitted pipeline for ``testData``.
    """
    logistic = linear_model.LogisticRegression(
        solver='lbfgs', max_iter=10000, multi_class='multinomial')
    rbm = BernoulliRBM(random_state=0, batch_size=2000, verbose=True)
    rbm_features_classifier = Pipeline(
        steps=[('rbm', rbm), ('logistic', logistic)])

    # Training
    # Hyper-parameters. These were set by cross-validation, using a
    # GridSearchCV. Here we are not performing cross-validation to save time.
    rbm.learning_rate = 0.06
    rbm.n_iter = 20
    # More components tend to give better prediction performance, but
    # larger fitting time.
    rbm.n_components = 100
    logistic.C = 6000

    # Training RBM-Logistic Pipeline
    rbm_features_classifier.fit(trainData, trainLabels)
    labels = rbm_features_classifier.predict(testData)
    # The original was followed by a dangling ''' that opened an
    # unterminated string literal; it has been removed.
    return labels
def build_classifier(clf_name):
    """Build the estimator registered under ``clf_name``.

    Args:
        clf_name: One of "svm", "knn", "rbm" (the historical misspelling
            "rmb" is still accepted) or "tsne".

    Returns:
        tuple: ``(clf, parameters)`` where ``clf`` is the estimator, or
        ``None`` for an unknown name, and ``parameters`` is a (currently
        always empty) parameter-grid dict.
    """
    clf = None
    parameters = {}

    if clf_name == "svm":
        clf = svm.SVC(kernel='linear', C=10)
    elif clf_name == "knn":
        clf = neighbors.KNeighborsClassifier(
            n_neighbors=5, weights='uniform', algorithm='brute',
            leaf_size=30, metric='cosine', metric_params=None)
    elif clf_name in ("rbm", "rmb"):
        # "rmb" was the only key that reached this branch originally; keep
        # it so existing callers continue to work.
        logistic = linear_model.LogisticRegression()
        rbm = BernoulliRBM(random_state=0, verbose=True)
        rbm.learning_rate = 0.01
        rbm.n_iter = 20
        rbm.n_components = 100
        logistic.C = 6000
        clf = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)])
    elif clf_name == "tsne":
        clf = TSNE(n_components=2, init='random', metric='cosine')

    return clf, parameters
def runRBM(arr, clsfr):
    """Train the classifiers for one hyper-parameter vector and score one of them.

    Relies on module-level state: ``sigtr``, ``sigtest``, ``train_input``,
    ``solnFile``, ``ams``, ``logfile``, ``file_dir``, ``nEvents`` and
    ``solutionFile``.

    Args:
        arr: Sequence of five scaled hyper-parameters:
            ``arr[0] * 10``   -> RBM iterations,
            ``arr[1]``        -> RBM learning rate,
            ``arr[2] * 1000`` -> ``C`` of the logistic stage of the pipeline,
            ``arr[3] * 100``  -> ``C`` of the stand-alone logistic regression,
            ``arr[4] * 100``  -> number of RBM components.
        clsfr: ``0`` scores the RBM pipeline, ``1`` the stand-alone logistic
            regression, anything else a second, freshly fitted logistic
            regression.

    Returns:
        float: The AMS score multiplied by -1.0.
    """
    global file_dir, nEvents, solutionFile

    iters = int(arr[0] * 10)
    lrn_rate = arr[1]
    logistic_c_val = arr[2] * 1000.0
    logistic_c_val2 = arr[3] * 100.0
    n_comp = int(arr[4] * 100)
    # The original built this name from `log_c_val`, `log_c_val2` and
    # `learn_rate`, none of which are defined in this function.
    filename = ('rbm_iter' + str(iters)
                + '_logc' + str(logistic_c_val)
                + '_logcc' + str(logistic_c_val2)
                + '_lrn' + str(lrn_rate)
                + '_nc' + str(n_comp))

    logistic = linear_model.LogisticRegression()
    rbm = BernoulliRBM(random_state=0, verbose=True)
    classifier = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)])

    # Training
    rbm.learning_rate = lrn_rate
    rbm.n_iter = iters
    # More components tend to give better prediction performance, but
    # larger fitting time.
    rbm.n_components = n_comp
    logistic.C = logistic_c_val

    train_x = sigtr[train_input].values
    train_y = sigtr['Label'].values

    # Training RBM-Logistic Pipeline
    classifier.fit(train_x, train_y)

    # Training Logistic regression
    logistic_classifier = linear_model.LogisticRegression(C=logistic_c_val2)
    logistic_classifier.fit(train_x, train_y)

    # Evaluation
    test_x = sigtest[train_input].values
    event_ids = sigtest['EventId'].values
    if clsfr == 0:
        clsnn_pred = classifier.predict(test_x)
        solnFile('clsnn_' + filename, clsnn_pred, event_ids)
        # NOTE(review): the solution is written under 'clsnn_' + filename but
        # scored from filename + '.out' -- confirm which file solnFile emits.
        ams_score = ams.AMS_metric(
            solutionFile, file_dir + filename + '.out', nEvents)
        print(ams_score)
        logfile.write(filename + ': ' + str(ams_score) + '\n')
    elif clsfr == 1:
        log_cls_pred = logistic_classifier.predict(test_x)
        solnFile('lognn_' + filename, log_cls_pred, event_ids)
        ams_score = ams.AMS_metric(
            solutionFile, file_dir + 'lognn_' + filename + '.out', nEvents)
        print(ams_score)
        logfile.write('lognn ' + filename + ': ' + str(ams_score) + '\n')
    else:
        logistic_classifier_tx = linear_model.LogisticRegression(
            C=logistic_c_val2)
        # The transformed output of the original fit_transform call was
        # discarded, so a plain fit is sufficient.
        logistic_classifier_tx.fit(train_x, train_y)
        log_cls_tx_pred = logistic_classifier_tx.predict(test_x)
        solnFile('lognntx_' + filename, log_cls_tx_pred, event_ids)
        # NOTE(review): same path mismatch as in the `clsfr == 0` branch.
        ams_score = ams.AMS_metric(
            solutionFile, file_dir + filename + '.out', nEvents)
        print(ams_score)
        logfile.write('lognntx ' + filename + ': ' + str(ams_score) + '\n')

    return -1.0 * float(ams_score)
def train_rbm(X, n_components=100, n_iter=10):
    """Rescale X column-wise and fit a BernoulliRBM on it.

    Args:
        X: Array of samples; it is cast to float64 before scaling.
        n_components: Number of RBM components.
        n_iter: Number of RBM training iterations.

    Returns:
        The fitted ``BernoulliRBM``.
    """
    samples = X.astype(np.float64)
    col_min = np.min(samples, 0)
    col_max = np.max(samples, 0)
    # scale to [0..1]
    # NOTE(review): the divisor is max + 0.0001, not (max - min); the result
    # only stays within [0, 1] for non-negative columns -- confirm inputs.
    samples = (samples - col_min) / (col_max + 0.0001)

    rbm = BernoulliRBM(n_components=n_components, learning_rate=0.06,
                       n_iter=n_iter, random_state=0, verbose=True)
    rbm.fit(samples)
    return rbm
def brbm_rf(Xtr, ytr, Xte=None, yte=None):
    """Run ``simple_classification`` with an RBM -> random-forest pipeline.

    Args:
        Xtr: Training features.
        ytr: Training labels.
        Xte: Optional test features.
        yte: Optional test labels.

    Returns:
        Whatever ``simple_classification`` returns for the pipeline.
    """
    feature_extractor = BernoulliRBM(
        n_components=100, learning_rate=0.025, n_iter=250, random_state=0)
    forest = ensemble.RandomForestClassifier(n_jobs=-1, n_estimators=100)
    model = Pipeline(
        steps=[('rbm', feature_extractor), ('randomforest', forest)])
    return simple_classification(model, Xtr, ytr, Xte, yte)
def rbm_dbn_train_and_predict(train_set_x, train_set_y, test_set_x, test_set_y):
    """Fit an RBM -> DBN pipeline and return its predictions for test_set_x.

    ``test_set_y`` is accepted for signature parity with the sibling
    ``*_train_and_predict`` helpers but is not used.
    """
    feature_extractor = BernoulliRBM(
        n_components=100, learning_rate=0.06, n_iter=20,
        random_state=0, verbose=True)
    dbn = DBN(epochs=200, learn_rates=0.01)

    classifier = Pipeline(steps=[('rbm', feature_extractor), ('dbn', dbn)])
    classifier.fit(train_set_x, train_set_y)
    return classifier.predict(test_set_x)
def rbm_logistic_train_and_predict(train_set_x, train_set_y, test_set_x, test_set_y):
    """Fit an RBM -> logistic-regression pipeline and predict test_set_x.

    ``test_set_y`` is accepted for signature parity with the sibling
    ``*_train_and_predict`` helpers but is not used.
    """
    feature_extractor = BernoulliRBM(
        n_components=100, learning_rate=0.06, n_iter=20,
        random_state=0, verbose=True)
    logistic = linear_model.LogisticRegression(C=6000)

    classifier = Pipeline(
        steps=[('rbm', feature_extractor), ('logistic', logistic)])
    classifier.fit(train_set_x, train_set_y)
    return classifier.predict(test_set_x)
def rbm_knn_train_and_predict(train_set_x, train_set_y, test_set_x, test_set_y):
    """Fit an RBM -> 5-nearest-neighbours pipeline and predict test_set_x.

    ``test_set_y`` is accepted for signature parity with the sibling
    ``*_train_and_predict`` helpers but is not used.
    """
    feature_extractor = BernoulliRBM(
        n_components=100, learning_rate=0.06, n_iter=20,
        random_state=0, verbose=True)
    knn = KNeighborsClassifier(n_neighbors=5)

    classifier = Pipeline(steps=[('rbm', feature_extractor), ('knn', knn)])
    classifier.fit(train_set_x, train_set_y)
    return classifier.predict(test_set_x)
def train_model():
    """Train an RBM -> logistic-regression OCR classifier and record its accuracy.

    Fills the module-level ``ocr_map`` with label -> character entries
    (labels start at 1). For every label the samples come from
    ``get_data(label)``: the first 900 go to the training set and the rest
    to the test set. The confusion matrix and accuracy are printed, and the
    accuracy is written to 'ocr_results.txt'.
    """
    global ocr_map
    a = [
        '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D',
        'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R',
        'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z'
    ]
    for label, char in enumerate(a, 1):
        ocr_map[label] = char

    X_train = []
    Y_train = []
    X_test = []
    Y_test = []

    logistic = linear_model.LogisticRegression()
    rbm = BernoulliRBM(random_state=0, verbose=True)
    classifier = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)])

    for label in range(1, len(a) + 1):
        samples = get_data(label)
        print(len(samples))
        # The first 900 samples of every class are used for training.
        for idx in range(0, 900):
            X_train.append(samples[idx]['text'])
            Y_train.append(samples[idx]['label'])
        for idx in range(900, len(samples)):
            X_test.append(samples[idx]['text'])
            Y_test.append(samples[idx]['label'])

    # 0-1 scaling
    # NOTE(review): train and test are scaled with their own statistics and
    # the divisor is max + 0.0001 rather than (max - min) -- confirm intent.
    X_train = (X_train - np.min(X_train, 0)) / (np.max(X_train, 0) + 0.0001)
    X_test = (X_test - np.min(X_test, 0)) / (np.max(X_test, 0) + 0.0001)
    print("%s %s" % (X_train.shape, X_test.shape))

    rbm.learning_rate = 0.06
    rbm.n_iter = 20
    rbm.n_components = 100
    classifier.fit(X_train, Y_train)

    answers = classifier.predict(X_test)
    print(confusion_matrix(Y_test, answers))
    score_data = accuracy_score(Y_test, answers)
    print(score_data)

    # `with` guarantees the result file is closed even if the write fails.
    with open("ocr_results.txt", 'w') as f:
        f.write(str(score_data))
def run_auto():
    """Fit a 2000-component BernoulliRBM on the 'gender/male' data set.

    The data is cast to float32 and divided by 256 before fitting. The first
    12 learned components are reshaped to 100x100 and passed to
    ``smartshow``.

    Returns:
        The fitted ``BernoulliRBM``.
    """
    X = load_data('gender/male').astype(np.float32) / 256

    rbm = BernoulliRBM(n_components=2000, learning_rate=0.06, n_iter=20,
                       random_state=0, verbose=True)
    rbm.fit(X)

    cimgs = []
    for comp in rbm.components_:
        cimgs.append(comp.reshape(100, 100))
    smartshow(cimgs[:12])
    return rbm
def useNeuralNetwork(self):
    """Install an (unfitted) RBM -> logistic-regression pipeline as self.classifier."""
    # Neural net unit with hand-picked parameters.
    # TODO: grid search for params
    rbm = BernoulliRBM(n_components=50, learning_rate=0.06, n_iter=20,
                       random_state=0, verbose=True)
    # Logistic regression unit
    logistic = linear_model.LogisticRegression()

    # Make classifier a pipeline
    self.classifier = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)])
def train_rbm(data, n_hidden, n_iter=10, learning_rate=0.1):
    """Fit a BernoulliRBM with ``n_hidden`` components on ``data``.

    Args:
        data: Training samples.
        n_hidden: Number of RBM components.
        n_iter: Number of training iterations.
        learning_rate: RBM learning rate.

    Returns:
        The fitted ``BernoulliRBM``.
    """
    model = BernoulliRBM(n_components=n_hidden, n_iter=n_iter,
                         learning_rate=learning_rate)
    model.fit(data)
    return model
def RBM_train(data, target):
    """Train RBM + SVM.

    Splits ``data``/``target`` 67/33 into train and test parts, fits an
    RBM -> SVC pipeline on the train part and passes the test predictions
    together with the true test labels to ``get_cost``.
    """
    train_data, test_data, train_labels, test_labels = train_test_split(
        data, target, test_size=0.33, random_state=42)

    feature_extractor = BernoulliRBM(
        n_components=100, learning_rate=0.06, n_iter=40)
    svm_data = svm.SVC(gamma=0.001)
    classifier = Pipeline(
        steps=[('rbm', feature_extractor), ('svm', svm_data)])

    classifier.fit(train_data, train_labels)
    predicted = classifier.predict(test_data)
    get_cost(predicted, test_labels)
def getNeuralModel(self, X, Y):
    """Fit and return an L2-penalised logistic-regression classifier.

    Despite the name, no RBM stage is involved: the RBM -> logistic pipeline
    was already commented out in the original code. The RBM and the second
    logistic-regression object that were built but never used have been
    removed.

    Args:
        X: Training features.
        Y: Training labels.

    Returns:
        The fitted ``LogisticRegression`` instance.
    """
    classifier = linear_model.LogisticRegression(penalty='l2', tol=.0001)
    classifier.fit(X, Y)
    return classifier
def train_with_svm(self):
    """Fit an RBM -> LinearSVC pipeline on self.X / self.Y and persist it.

    The fitted pipeline is stored on ``self.classifier`` and dumped to
    'rbm.pkl' with joblib.
    """
    # More components tend to give better prediction performance, but
    # larger fitting time.
    feature_extractor = BernoulliRBM(
        n_components=100, learning_rate=0.05, n_iter=30,
        random_state=0, verbose=False)
    svc = LinearSVC(C=10.0, class_weight='balanced', max_iter=100)

    model = Pipeline(steps=[('rbm', feature_extractor), ('svm', svc)])
    model.fit(self.X, self.Y)

    self.classifier = model
    joblib.dump(model, "rbm.pkl")
def train_with_logistic(self):
    """Fit an RBM -> logistic-regression pipeline on self.X / self.Y and persist it.

    The fitted pipeline is stored on ``self.classifier`` and dumped to
    'rbm-logistic.pkl' with joblib.
    """
    # More components tend to give better prediction performance, but
    # larger fitting time.
    feature_extractor = BernoulliRBM(
        n_components=30, learning_rate=0.05, n_iter=30,
        random_state=0, verbose=False)
    logistic = linear_model.LogisticRegression(C=100)

    model = Pipeline(
        steps=[('rbm', feature_extractor), ('logistic', logistic)])
    model.fit(self.X, self.Y)

    self.classifier = model
    joblib.dump(model, "rbm-logistic.pkl")
def neural_net():
    """Train and evaluate an RBM -> logistic-regression pipeline on the digits data.

    The data set is augmented with ``nudge_dataset``, rescaled column-wise,
    and split 80/20. A classification report for the test part and the
    predictions for the first five test samples are printed.
    """
    digits = datasets.load_digits()
    X = np.asarray(digits.data, 'float32')
    sidelength = int(np.sqrt(X.shape[1]))
    X, Y = nudge_dataset(X, digits.target, dimen=(sidelength, sidelength))

    # Scale the data to be between zero and 1 at all pixels:
    X = (X - np.min(X, axis=0)) / (np.max(X, axis=0) + 0.0001)

    # Split the data set into a training and testing set:
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.2, random_state=0)

    # Models we will use
    logistic = linear_model.LogisticRegression()
    rbm = BernoulliRBM(random_state=0, verbose=True)

    # The classifier
    classifier = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)])

    # Training
    # Hyper-parameters. These were set by cross-validation, using a
    # GridSearchCV. Here we are not performing cross-validation to save time.
    rbm.learning_rate = 0.06
    rbm.n_iter = 20
    # More components tend to give better prediction performance, but
    # larger fitting time.
    rbm.n_components = 100
    logistic.C = 6000.0

    # Training RBM-Logistic Pipeline
    classifier.fit(X_train, Y_train)

    # Evaluation
    # print() calls with a single pre-formatted argument behave the same on
    # Python 2 and Python 3.
    print("")
    print("Logistic regression using RBM features:\n%s\n" % (
        metrics.classification_report(Y_test, classifier.predict(X_test))))

    # Predict a few individual cases:
    print("%s %s" % (classifier.predict(X_test[:5, :]), Y_test[:5]))
def neural_net():
    """Train and evaluate an RBM -> logistic-regression pipeline on the digits data.

    The data set is augmented with ``nudge_dataset``, rescaled column-wise,
    and split 80/20. A classification report for the test part and the
    predictions for the first five test samples are printed.
    """
    digits = datasets.load_digits()
    X = np.asarray(digits.data, 'float32')
    sidelength = int(np.sqrt(X.shape[1]))
    X, Y = nudge_dataset(X, digits.target, dimen=(sidelength, sidelength))

    # Scale the data to be between zero and 1 at all pixels:
    X = (X - np.min(X, axis=0)) / (np.max(X, axis=0) + 0.0001)

    # Split the data set into a training and testing set:
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.2, random_state=0)

    # Models we will use
    logistic = linear_model.LogisticRegression()
    rbm = BernoulliRBM(random_state=0, verbose=True)

    # The classifier
    classifier = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)])

    # Training
    # Hyper-parameters. These were set by cross-validation, using a
    # GridSearchCV. Here we are not performing cross-validation to save time.
    rbm.learning_rate = 0.06
    rbm.n_iter = 20
    # More components tend to give better prediction performance, but
    # larger fitting time.
    rbm.n_components = 100
    logistic.C = 6000.0

    # Training RBM-Logistic Pipeline
    classifier.fit(X_train, Y_train)

    # Evaluation
    # print() calls with a single pre-formatted argument behave the same on
    # Python 2 and Python 3.
    print("")
    print("Logistic regression using RBM features:\n%s\n" % (
        metrics.classification_report(
            Y_test,
            classifier.predict(X_test))))

    # Predict a few individual cases:
    print("%s %s" % (classifier.predict(X_test[:5, :]), Y_test[:5]))
def train(cls) -> str:
    """ Returns classification results

    Fits an RBM -> logistic-regression pipeline and a stand-alone logistic
    regression on the data from ``RestrictedBoltzmann.load_data()``, stores
    both via ``RestrictedBoltzmann.store_model`` (as "rbm_features" and
    "raw_pixel"), and returns the two classification reports as one string.
    """
    X_train, X_test, Y_train, Y_test = RestrictedBoltzmann.load_data()

    # Hyper-parameters. These were set by cross-validation, using a
    # GridSearchCV. Here we are not performing cross-validation to save time.
    # More components tend to give better prediction performance, but
    # larger fitting time.
    feature_extractor = BernoulliRBM(
        n_components=100, learning_rate=0.06, n_iter=10,
        random_state=0, verbose=True)
    logistic = linear_model.LogisticRegression(
        solver='newton-cg', tol=1, C=6000)
    rbm_features_classifier = Pipeline(
        steps=[('rbm', feature_extractor), ('logistic', logistic)])

    # Training RBM-Logistic Pipeline
    rbm_features_classifier.fit(X_train, Y_train)

    # Training the Logistic regression classifier directly on the pixel
    raw_pixel_classifier = clone(logistic)
    raw_pixel_classifier.C = 100.
    raw_pixel_classifier.fit(X_train, Y_train)

    RestrictedBoltzmann.store_model("rbm_features", rbm_features_classifier)
    RestrictedBoltzmann.store_model("raw_pixel", raw_pixel_classifier)

    # Evaluation
    rbm_report = metrics.classification_report(
        Y_test, rbm_features_classifier.predict(X_test))
    report1 = "Logistic regression using RBM features:\n%s\n" % (rbm_report)

    raw_report = metrics.classification_report(
        Y_test, raw_pixel_classifier.predict(X_test))
    report2 = "Logistic regression using raw pixel features:\n%s\n" % (
        raw_report)

    return f"{report1} \n\n {report2}"
def estimate_n_components():
    """Score BernoulliRBMs of increasing size on the 'gender/male' data set.

    For every candidate component count an RBM is fitted on the data (cast
    to float32 and divided by 256) and the mean of ``score_samples`` over the
    same data is recorded. The scores are plotted against the component
    counts.

    Returns:
        tuple: ``(n_comp_list, scores)``.
    """
    X = load_data('gender/male')
    X = X.astype(np.float32) / 256

    n_comp_list = [100, 200, 300, 400, 500, 600,
                   700, 800, 900, 1000, 1100, 1200]
    scores = []
    for n_comps in n_comp_list:
        rbm = BernoulliRBM(random_state=0, verbose=True)
        rbm.learning_rate = 0.06
        rbm.n_iter = 50
        # Use the candidate size; the original hard-coded 100 here, so every
        # iteration trained the very same model.
        rbm.n_components = n_comps
        rbm.fit(X)
        scores.append(rbm.score_samples(X).mean())

    plt.figure()
    plt.plot(n_comp_list, scores)
    plt.show()
    return n_comp_list, scores
def build_model_rbm():
    """Return the ('rbm', ...), ('lr', ...) step list for an RBM pipeline.

    NumPy's global random generator is re-seeded with 12 and the RBM's
    ``random_state`` is drawn from it, so repeated calls yield the same
    configuration.

    Returns:
        list: ``[('rbm', BernoulliRBM), ('lr', LogisticRegression)]``.
    """
    np.random.seed(12)
    rbm = BernoulliRBM(random_state=np.random.randint(1, 100),
                       verbose=0, learning_rate=0.0001)
    lr = LogisticRegression(C=10.0)
    return [('rbm', rbm), ('lr', lr)]
def SGD():
    """Compare a hinge-loss SGD classifier on RBM features against raw features.

    Reads the module-level ``data_train``, ``target_train``, ``data_test``
    and ``target_test``. For both models a classification report is printed
    and a confusion-matrix plot is saved ('confusion_matrix1.jpg' for the
    RBM pipeline, 'confusion_matrix2.jpg' for the plain classifier).
    """
    # The local is deliberately not called `SGD`, so it does not shadow this
    # function. The original also set `SGD.C = 1`; SGDClassifier has no `C`
    # hyper-parameter, so that assignment was dropped.
    sgd = linear_model.SGDClassifier(loss='hinge', penalty='l2',
                                     random_state=42, n_jobs=-1,
                                     alpha=0.0001, epsilon=0.001)
    rbm = BernoulliRBM(random_state=0, verbose=True)
    classifier = Pipeline(steps=[('rbm', rbm), ('SGD', sgd)])

    # RBM parameters obtained after cross-validation
    rbm.learning_rate = 0.01
    rbm.n_iter = 15
    rbm.n_components = 50

    # Training SGD
    SGD_classifier = linear_model.SGDClassifier(loss='hinge', penalty='l2',
                                                random_state=42, n_jobs=-1,
                                                alpha=0.0001, epsilon=0.001)
    SGD_classifier.fit(data_train, target_train)

    # Training RBM-SGD Pipeline
    classifier.fit(data_train, target_train)

    print("printing_results")

    # Predict once per model and reuse the result for both the report and
    # the confusion matrix.
    rbm_predictions = classifier.predict(data_test)
    print("SGD using RBM features:\n%s\n" % (
        metrics.classification_report(target_test, rbm_predictions)))
    cm = confusion_matrix(target_test, rbm_predictions)
    plt.matshow(cm)
    plt.title('Confusion Matrix SVM with SDG with RBM Features')
    plt.colorbar()
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.savefig('confusion_matrix1.jpg')

    raw_predictions = SGD_classifier.predict(data_test)
    print("SGD using raw pixel features:\n%s\n" % (
        metrics.classification_report(target_test, raw_predictions)))
    cm1 = confusion_matrix(target_test, raw_predictions)
    plt.matshow(cm1)
    plt.title('Confusion Matrix SVM with SDG Raw Features')
    plt.colorbar()
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.savefig('confusion_matrix2.jpg')
def Train():
    """ Train Function

    Loads the pickled train/test split, fits an RBM -> logistic-regression
    pipeline on it, prints a classification report for the test part and
    appends the pickled pipeline to 'clf.pkl'.

    If 'X_train.pkl' does not exist, the split is first generated from
    'X.pkl' / 'Y.pkl' and written to the four split files.
    """
    if not os.path.exists('X_train.pkl'):
        print("generate data and split to train test set.")
        with open('X.pkl') as Xf:
            X = cPickle.load(Xf)
        with open('Y.pkl') as Yf:
            Y = cPickle.load(Yf)
        X_train, X_test, Y_train, Y_test = train_test_split(
            X, Y, test_size=0.2, random_state=0)
        # Persist the split: the original generated it but never saved it,
        # so the load below failed on the missing files.
        split_files = (('X_train.pkl', X_train), ('X_test.pkl', X_test),
                       ('Y_train.pkl', Y_train), ('Y_test.pkl', Y_test))
        for split_path, split_obj in split_files:
            with open(split_path, 'w') as split_f:
                cPickle.dump(split_obj, split_f)

    print("load data from pickled files..")
    with open('X_train.pkl') as x_train_f:
        X_train = cPickle.load(x_train_f)
    with open('X_test.pkl') as x_test_f:
        X_test = cPickle.load(x_test_f)
    with open('Y_train.pkl') as y_train_f:
        Y_train = cPickle.load(y_train_f)
    with open('Y_test.pkl') as y_test_f:
        Y_test = cPickle.load(y_test_f)
    print("Load Data success!")

    logistic = linear_model.LogisticRegression()
    rbm = BernoulliRBM(random_state=0, verbose=True)
    rbm.learning_rate = 0.06
    rbm.n_iter = 300
    rbm.n_components = 1000
    logistic.C = 6000.0

    clf = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)])
    clf.fit(X_train, Y_train)
    print("fit complete..")
    print("Logistic regression using RBM features:\n%s\n" % (
        metrics.classification_report(
            Y_test,
            clf.predict(X_test))))

    # NOTE(review): 'a+' appends a new pickle on every run, so a plain
    # cPickle.load() of this file returns the oldest model -- confirm that
    # appending (rather than overwriting) is intended.
    with open('clf.pkl', 'a+') as clf_f:
        cPickle.dump(clf, clf_f)
def ParaTun2(X_dev, Y_dev):
    """Grid-search an RBM -> one-vs-rest LinearSVC pipeline on the dev set.

    The grid holds a single value (``C = 10``) and is evaluated with 2-fold
    cross-validation. The best score and best parameters are printed.

    Args:
        X_dev: Development features.
        Y_dev: Development labels.

    Returns:
        The fitted ``GridSearchCV`` object.
    """
    rbm = BernoulliRBM(random_state=0, verbose=True)
    steps = [('rbm', rbm),
             ('classifier', OneVsRestClassifier(LinearSVC()))]
    rbm.learning_rate = 0.005
    rbm.n_iter = 200
    rbm.n_components = 100

    pipeline = Pipeline(steps)
    params = {'classifier__estimator__C': [10]}
    predictor = GridSearchCV(pipeline, params, cv=2, n_jobs=1)

    # Single-argument print() calls behave identically on Python 2 and 3.
    print('2')
    result = predictor.fit(X_dev, Y_dev)
    print(result.best_score_)
    print(result.best_params_)
    return predictor
def RBM():
    """Fit a BernoulliRBM on '../data/smaller.dta' and print what it learned.

    The file is parsed as space-delimited numbers; the first three columns
    are the features and the fourth is kept as ``Y`` (passed to ``fit`` but
    not otherwise used here). The transformed training data, the model and
    its parameters are printed.
    """
    filename = "../data/smaller.dta"
    # `with` closes the file once parsed; the original left the handle open.
    with open(filename, 'rt') as raw_data:
        data = np.loadtxt(raw_data, delimiter=" ")

    X = data[:, :3]
    Y = data[:, 3]
    print(X)
    print(Y)

    print("training on RBM")
    rbm = BernoulliRBM(random_state=0, verbose=True)
    rbm.learning_rate = 0.06
    rbm.n_iter = 20
    rbm.n_components = 100
    rbm.fit(X, Y)

    predictions = rbm.transform(X)
    params = rbm.get_params()
    print("predictions = ", predictions)
    print("rbm = ", rbm)
    print("params = ", params)
def _get_classification_pipeline(self):
    """Builds and returns the classification Pipeline for this classifier

    :return: A Pipeline with an "rbm" step followed by a "logistic" step
    """
    feature_extractor = BernoulliRBM(
        n_components=100, learning_rate=0.01, n_iter=10)
    logistic_regression = linear_model.LogisticRegression(C=10000)
    return Pipeline(steps=[
        ("rbm", feature_extractor),
        ("logistic", logistic_regression),
    ])
def main():
    """Train, calibrate and apply an ExtraTrees -> RBM -> logistic pipeline.

    'train.csv' is split into a training part and a 5% validation part; the
    training part is split once more to obtain a held-out set for scoring.
    The pipeline is fitted, scored with log loss, calibrated (isotonic,
    prefit) on the validation part, scored again, and finally used to write
    predictions for 'test.csv' to 'submission.csv'.
    """
    X, Y = load_csv_file('train.csv')
    estimators = 1000
    test_size = 0.05
    X_train, X_valid, Y_train, Y_valid = train_test_split(
        X, Y, test_size=test_size, random_state=0)
    X_train_real, X_test_real, Y_train_real, Y_test_real = train_test_split(
        X_train, Y_train, test_size=test_size, random_state=42)
    log.info('Loaded training file')
    X_test, _ = load_csv_file('test.csv', cut_end=False)
    log.info('Loaded test file')

    # Classifier Setup
    logistic = linear_model.LogisticRegression()
    rbm = BernoulliRBM(random_state=0, verbose=True)
    tree_clf = ExtraTreesClassifier(n_estimators=estimators, n_jobs=-1,
                                    random_state=0, max_depth=None)
    rbm.learning_rate = 0.06
    rbm.n_iter = 20
    rbm.n_components = 500
    logistic.C = 6000.0

    pipeline = make_pipeline(tree_clf, rbm, logistic)
    clf = pipeline

    # The original omitted the `%` operator here, which is a SyntaxError.
    log.info('Fitting Boltzman with %s'
             % str([name for name, _ in pipeline.steps]))
    clf.fit(X_train_real, Y_train_real)
    clf_probs = clf.predict_proba(X_test_real)
    score = log_loss(Y_test_real, clf_probs)
    log.info('Log Loss score un-trained = %f' % score)

    # Calibrate Classifier using ground truth in X,Y_valid
    sig_clf = CalibratedClassifierCV(clf, method="isotonic", cv="prefit")
    log.info('Fitting CalibratedClassifierCV')
    sig_clf.fit(X_valid, Y_valid)
    sig_clf_probs = sig_clf.predict_proba(X_test_real)
    sig_score = log_loss(Y_test_real, sig_clf_probs)
    log.info('Log loss score trained = %f' % sig_score)

    # Ok lets predict the test data with our funky new classifier
    sig_submission_probs = sig_clf.predict_proba(X_test)
    write_out_submission(sig_submission_probs, 'submission.csv')
def LogRegWithRBMFeatures(x_train, y_train, x_cv, y_cv):
    """
    Logistic regression using RBM features
    http://scikit-learn.org/stable/auto_examples/plot_rbm_logistic_classification.html

    Fits the pipeline on ``x_train`` / ``y_train`` and returns it.
    ``x_cv`` and ``y_cv`` are accepted but not used.
    """
    feature_extractor = BernoulliRBM(
        n_components=5000, learning_rate=0.06, n_iter=20)
    logistic = linear_model.LogisticRegression()

    classifier = Pipeline(
        steps=[('rbm', feature_extractor), ('logistic', logistic)])
    classifier.fit(x_train, y_train)
    return classifier
def RBM(X_train, X_test, y_train, y_test):
    """Fit an RBM -> MLP pipeline and print its test accuracy and confusion matrix.

    Args:
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels.
        y_test: Test labels.
    """
    feature_extractor = BernoulliRBM(
        n_components=100, learning_rate=0.06, n_iter=10,
        random_state=0, verbose=True)
    nn = MLPClassifier(solver='adam', alpha=1e-5,
                       hidden_layer_sizes=(50, 25, 1), random_state=1)
    rbm_features_classifier = Pipeline(
        steps=[('rbm', feature_extractor), ('nn', nn)])

    rbm_features_classifier.fit(X_train, y_train)
    prediction = rbm_features_classifier.predict(X_test)

    accuracy_percent = 100 * accuracy_score(y_test, prediction)
    print(accuracy_percent)
    print(confusion_matrix(y_test, prediction))
def train_new(path):
    """Fit a 16-component BernoulliRBM on the pixels of one image thumbnail.

    The image 'images/<path>' is opened and reduced with ``get_thumbnail``;
    every pixel tuple becomes one float sample, and the samples are rescaled
    column-wise before fitting.

    Args:
        path: File name of the image inside the 'images' directory.

    Returns:
        The ``components_`` array of the fitted RBM.
    """
    thumbnail = get_thumbnail(Image.open('images/{0}'.format(path)))
    vectors = [[float(val) for val in pixel_tuple]
               for pixel_tuple in thumbnail.getdata()]

    X = np.asarray(vectors, 'float32')
    X = (X - np.min(X, 0)) / (np.max(X, 0) + 0.0001)

    rbm = BernoulliRBM(random_state=1, verbose=True)
    rbm.learning_rate = 0.09
    rbm.n_iter = 1
    rbm.n_components = 16
    rbm.batch_size = 2
    return rbm.fit(X).components_
def train(image_matrix, images):
    """Fit an RBM per row of ``image_matrix`` and publish the result in ``model``.

    The module-level ``model`` is replaced by a dict with the learned
    ``'matrix'`` and the given ``'images'``.

    Args:
        image_matrix: Array-like of image rows; converted to float32.
        images: Stored unchanged under ``model['images']``.
    """
    global model

    X = np.asarray(image_matrix, 'float32')
    Y = np.array(X.shape)  # kept from the original; not used below
    X = (X - np.min(X, 0)) / (np.max(X, 0) + 0.0001)

    rbm = BernoulliRBM(random_state=1, verbose=True)
    rbm.learning_rate = 0.09
    rbm.n_iter = 1
    rbm.n_components = 16
    rbm.batch_size = 2

    y_new = np.zeros(X.shape)
    # NOTE(review): each row is fitted on its own and the whole components_
    # array is assigned into a single row of y_new -- confirm the intended
    # shapes.
    for i, row in enumerate(X):
        x_new = rbm.fit(row)
        y_new[i] = x_new.components_

    model = {'matrix': y_new, 'images': images}
def Logistic():
    """Compare logistic regression on RBM features against raw features.

    Reads the module-level ``data_train``, ``target_train``, ``data_test``
    and ``target_test``. For both models a classification report is printed
    and a confusion-matrix plot is saved ('confusion_matrix3.jpg' for the
    RBM pipeline, 'confusion_matrix4.jpg' for the plain classifier).
    """
    logistic = linear_model.LogisticRegression()
    rbm = BernoulliRBM(random_state=0, verbose=True)
    classifier = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)])

    # RBM parameters obtained after cross-validation
    rbm.learning_rate = 0.01
    rbm.n_iter = 121
    rbm.n_components = 700
    logistic.C = 1.0

    # Training RBM-Logistic Pipeline
    classifier.fit(data_train, target_train)

    # Training Logistic regression
    logistic_classifier = linear_model.LogisticRegression(C=1.0)
    logistic_classifier.fit(data_train, target_train)

    print("printing_results")

    # Predict once per model and reuse the result for both the report and
    # the confusion matrix instead of running the model twice.
    rbm_predictions = classifier.predict(data_test)
    print("Logistic regression using RBM features:\n%s\n" % (
        metrics.classification_report(target_test, rbm_predictions)))
    cm3 = confusion_matrix(target_test, rbm_predictions)
    plt.matshow(cm3)
    plt.title('Confusion Matrix Logistic Regression with RBM Features')
    plt.colorbar()
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.savefig('confusion_matrix3.jpg')

    raw_predictions = logistic_classifier.predict(data_test)
    print("Logistic regression using raw pixel features:\n%s\n" % (
        metrics.classification_report(target_test, raw_predictions)))
    cm4 = confusion_matrix(target_test, raw_predictions)
    plt.matshow(cm4)
    plt.title('Confusion Matrix Logistic Regression')
    plt.colorbar()
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.savefig('confusion_matrix4.jpg')
def rbm(X,Y):
    """Compare logistic regression on RBM features against raw features.

    ``X`` / ``Y`` are split 80/20; both an RBM -> logistic pipeline and a
    stand-alone logistic regression are fitted on the train part and a
    classification report for each is printed for the test part.
    """
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.2, random_state=0)

    # Hyper-parameters. These were set by cross-validation, using a
    # GridSearchCV. Here we are not performing cross-validation to save time.
    # More components tend to give better prediction performance, but
    # larger fitting time.
    feature_extractor = BernoulliRBM(
        n_components=100, learning_rate=0.06, n_iter=1000,
        random_state=0, verbose=True)
    logistic = linear_model.LogisticRegression(C=6000.0)
    classifier = Pipeline(
        steps=[('rbm', feature_extractor), ('logistic', logistic)])

    # Training RBM-Logistic Pipeline
    classifier.fit(X_train, Y_train)

    # Training Logistic regression
    logistic_classifier = linear_model.LogisticRegression(C=100.0)
    logistic_classifier.fit(X_train, Y_train)

    # Evaluation
    print()
    rbm_report = metrics.classification_report(
        Y_test, classifier.predict(X_test))
    print("Logistic regression using RBM features:\n%s\n" % (rbm_report))

    raw_report = metrics.classification_report(
        Y_test, logistic_classifier.predict(X_test))
    print("Logistic regression using raw pixel features:\n%s\n" % (
        raw_report))
def train_deep_boltzman(self):
    """Fit an RBM -> logistic-regression pipeline on ``self.X`` / ``self.Y``.

    The fitted pipeline is stored on ``self.classifier`` and also written to
    ``two-layerRbm-logistic.pkl`` with ``joblib.dump``.
    """
    rbm1 = BernoulliRBM(random_state=0, verbose=False)
    logistic = linear_model.LogisticRegression(class_weight='balanced')
    classifier = Pipeline(steps=[('rbm', rbm1), ('logistic', logistic)])

    # Candidate grid for a hyper-parameter search. The search itself is
    # disabled below; the values assigned afterwards are used instead.
    # More components tend to give better prediction performance, but larger
    # fitting time.
    params = {
        "rbm__learning_rate": [0.1, 0.03, 0.01],
        "rbm__n_iter": [20, 40, 80],
        "rbm__n_components": [50, 75, 100],
        "logistic__C": [1.0, 10.0, 100.0]}
    # gs = grid_search.GridSearchCV(classifier, params)
    # gs.fit(self.X, self.Y)

    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print("grid search done, training pipelined classifier")

    rbm1.n_components = 100
    rbm1.n_iter = 40
    rbm1.learning_rate = 0.01
    logistic.C = 10.0

    classifier.fit(self.X, self.Y)
    self.classifier = classifier
    joblib.dump(classifier, "two-layerRbm-logistic.pkl")
def get_price_signal(stock, data):
    """Predict the direction of the next daily price move for ``stock``.

    Uses a logistic-regression classifier on top of a BernoulliRBM. The model
    is fitted on the last 50 daily prices: each bar's price change is the
    feature, and whether the following bar rose is the label. The most recent
    price change is then classified.

    Args:
        stock: Asset identifier, passed through to ``data.history``.
        data: Object whose ``history(stock, 'price', bar_count=...,
            frequency=...)`` returns a pandas price series.

    Returns:
        1 for a predicted rise (buy signal), 0 for a predicted fall (sell
        signal), or ``None`` when fewer than 30 training samples are
        available or fitting/prediction fails.
    """
    price = data.history(stock, 'price', bar_count=50, frequency='1d')
    price = price.ffill()  # same as fillna(method='ffill'), not deprecated

    # 1 where the price increased from the prior bar, 0 otherwise.
    returns = np.diff(np.asarray(price, dtype=float))
    changes = (returns > 0).astype(int)

    lag = 1
    X_data = returns[:-lag].reshape(-1, 1)  # prior change, one feature column
    Y_data = changes[lag:]                  # following change, 1-D labels

    # There needs to be enough data points to make a good model.
    if len(Y_data) < 30:
        return None

    logistic = linear_model.LogisticRegression()
    rbm = BernoulliRBM(random_state=0, verbose=False)
    rbm.learning_rate = 0.017
    rbm.n_iter = 30
    rbm.n_components = 150
    logistic.C = 6000.0
    classifier = skp.Pipeline(steps=[('rbm', rbm), ('logistic', logistic)])

    try:
        classifier.fit(X_data, Y_data)
        # predict() needs a 2-D (n_samples, n_features) array.
        prediction = classifier.predict(returns[-lag:].reshape(-1, 1))
    except Exception:
        # Best effort: a failed fit (e.g. NaNs in the window) means no
        # signal. KeyboardInterrupt/SystemExit are no longer swallowed.
        return None
    return prediction[-1]
print(percentage)

# Model number 2
# Per-feature 0-1 scaling. The denominator is the feature range
# (max - min); dividing by max alone leaves values outside [0, 1] whenever a
# feature has a negative minimum. The small constant guards against division
# by zero for constant features.
bigMatrixTrain = (bigMatrixTrain - np.min(bigMatrixTrain, 0)) / (
    np.max(bigMatrixTrain, 0) - np.min(bigMatrixTrain, 0) + 0.0001)

# Divide dataset for cross validation purposes
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    bigMatrixTrain, y, test_size=0.4, random_state=0)  # fix this

# Models we will use
logistic = linear_model.LogisticRegression()
rbm = BernoulliRBM(random_state=0, verbose=True)

classifier = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)])
rbm.learning_rate = 0.06
rbm.n_iter = 20
# More components tend to give better prediction performance, but larger
# fitting time
rbm.n_components = 300
logistic.C = 6000.0

# Training RBM-Logistic Pipeline
classifier.fit(X_train, y_train)

# Predict once and reuse the result for both reports.
y_pred = classifier.predict(X_test)

print()
print("Logistic regression using RBM features:\n%s\n" % (
    metrics.classification_report(y_test, y_pred)))
print("Logistic regression using RBM features:\n%s\n" % (
    confusion_matrix(y_test, y_pred)))
# Y = mnist.target
# X = (X - np.min(X, 0)) / (np.max(X, 0) + 0.0001)

# Load the digits
X, Y = utils.load_data()
print(X.shape)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y,
                                                    test_size=0.2,
                                                    random_state=0)

# Models we will use
rbm_layer_1 = BernoulliRBM(random_state=0, verbose=True)
rbm_layer_2 = BernoulliRBM(random_state=0, verbose=True)
# For comparison with RBM + logistic regression
logistic = linear_model.LogisticRegression()

###############################################################################
# Training of the first RBM
rbm_layer_1.learning_rate = 0.01
rbm_layer_1.n_iter = 50
rbm_layer_1.n_components = 300

# Training RBM
print("Debut training RBM1")
print(X_train.shape)
# time.clock() was removed in Python 3.8; perf_counter() is the replacement
# for measuring elapsed time.
t0 = time.perf_counter()
rbm_layer_1.fit(X_train)
print(time.perf_counter() - t0)

# Build a training set for the second layer by sampling the hidden
# variables of the first RBM
n_sample_second_layer_training = 3*int(X.shape[0])
H1_train = np.zeros(shape=(n_sample_second_layer_training, rbm_layer_1.n_components))
comp = 0
# NOTE(review): only the first statement of this loop body is visible here.
while (comp < n_sample_second_layer_training):
    rng = check_random_state(rbm_layer_1.random_state)
#print pred pred["Actual"] = y_test # Create and fit the three models print "Hit Rates:" models = [("Linear", linear_model.LinearRegression()), ("LR", LogisticRegression()), ("KNN", neighbors.KNeighborsClassifier(n_neighbors=2)), ("SVM", SVC(C=1)), ("RF", RandomForestClassifier(n_estimators=1))] for m in models: fit_model(m[0], m[1], X_train, y_train, X_test, pred) logistic = LogisticRegression() rbm = BernoulliRBM(random_state=0, verbose=True) classifier = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)]) rbm.learning_rate = .06 rbm.n_iter = 20 rbm.n_components = 100 logistic.C = 6000 classifier.fit(X_train, y_train) logistic_classifier = LogisticRegression(C=100.0) logistic_classifier.fit(X_train, y_train) score = classifier.score(X_train, y_train) print score text_file.write('Neural Network : ' + str(score) + '\n') # 100 Days text_file.write('100 Days Prediction Accuracies\n') snpret = create_lagged_series("NDAQ", training_date, datetime.datetime(2015, 8, 6),
# To apply a classifier on this data, we need to flatten the image, to
# turn the data in a (samples, feature) matrix:
n_samples = len(line_images)
data = line_images.reshape((n_samples, -1))

is_rmb = False
# Create a classifier
# classifier = svm.SVC()
# classifier = neural_network.MLPClassifier()
# classifier = RandomForestClassifier()

logistic = linear_model.LogisticRegression()
rbm = BernoulliRBM(random_state=0, verbose=True)
classifier = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)])
rbm.learning_rate = 0.00000001
rbm.n_iter = 30
rbm.n_components = 50
logistic.C = 6000.0  # regularization - smaller means more
is_rmb = True

# Slice bounds must be integers: `n_samples / 2` is a float on Python 3 and
# raises TypeError, so floor division is used (same result on Python 2).

# We learn the lines on the first half of the lines
classifier.fit(data[:n_samples // 2], line_labels[:n_samples // 2])

# Now predict the value of the digit on the second half:
expected = line_labels[n_samples // 2:]
predicted = classifier.predict(data[n_samples // 2:])

print("Classification report for classifier %s:\n%s\n"
      % (classifier, metrics.classification_report(expected, predicted)))
print("Confusion matrix:\n%s" % metrics.confusion_matrix(expected, predicted))
# Input files, relative to data_path (Windows-style separators).
features_file = "combined_data\\RBMsentence_training.txt"
labels_file = "polarity_sentences_kaggle\\training.txt"

# The label of each sentence is the first character of its line.
with open(data_path + labels_file, 'r', encoding="utf8") as f:
    sentences = [x for x in f.readlines()]
labels = [x[0] for x in sentences]
labels = np.array(labels)
labels = labels.astype(float)
print(labels)

# Numeric feature matrix loaded from text; the labels are appended as the
# last column of rbm_data.
features = np.loadtxt(data_path + features_file, dtype=float)
rbm_data = np.c_[features, labels].astype(float)

# RBM hyper-parameters are assigned as attributes after construction.
RBM = BernoulliRBM(random_state=0, verbose=True)
RBM.n_components = 20
RBM.learning_rate = 0.05
RBM.n_iter = 20

# NOTE(review): the MLPClassifier call below is cut off in this excerpt.
MLP = MLPClassifier(activation='relu', alpha=1e-05, batch_size=10, beta_1=0.9, beta_2=0.999, early_stopping=False, epsilon=1e-08, hidden_layer_sizes=(100, 50), learning_rate='adaptive', learning_rate_init=0.01, max_iter=200, momentum=0.01, nesterovs_momentum=True,
def RBMtest01():
    """Demonstrate RBM feature extraction on the scikit-learn digits set.

    Uses an RBM for non-linear feature extraction; compared with running
    logistic regression directly, RBM features can improve classification
    accuracy. The function augments the 8x8 digits with one-pixel shifts,
    scales every feature to [0, 1], fits a standalone RBM (printing debug
    shapes), then fits an RBM -> logistic pipeline and a plain logistic
    regression, prints both classification reports and plots the learned
    RBM components.
    """
    import numpy as np
    import matplotlib.pyplot as plt

    from scipy.ndimage import convolve
    from sklearn import linear_model, datasets, metrics
    try:
        from sklearn.model_selection import train_test_split
    except ImportError:  # scikit-learn < 0.18
        from sklearn.cross_validation import train_test_split
    from sklearn.neural_network import BernoulliRBM
    from sklearn.pipeline import Pipeline

    def nudge_dataset(X, Y):
        """Return a dataset 5x larger: the originals plus four 1px shifts."""
        direction_vectors = [
            [[0, 1, 0], [0, 0, 0], [0, 0, 0]],
            [[0, 0, 0], [1, 0, 0], [0, 0, 0]],
            [[0, 0, 0], [0, 0, 1], [0, 0, 0]],
            [[0, 0, 0], [0, 0, 0], [0, 1, 0]]
        ]

        def shift(x, w):
            return convolve(x.reshape((8, 8)), mode='constant',
                            weights=w).ravel()

        X = np.concatenate([X] + [np.apply_along_axis(shift, 1, X, vector)
                                  for vector in direction_vectors])
        Y = np.concatenate([Y for _ in range(5)], axis=0)
        return X, Y

    digits = datasets.load_digits()
    X = np.asarray(digits.data, 'float32')  # data type conversion to array
    X, Y = nudge_dataset(X, digits.target)  # 5x as many samples as before
    # print(np.max(X, 0))
    # print(np.min(X, 0))
    # 0-1 scaling, each feature normalised separately. The denominator is
    # the feature range; the original `- -` typo added the minimum instead.
    X = (X - np.min(X, 0)) / (np.max(X, 0) - np.min(X, 0) + 0.0001)
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.2, random_state=0)
    print(set(Y_train))

    # Build the models. The pipeline runs fit/transform on each step in
    # turn; the RBM's transform yields latent representations of the data.
    logistic = linear_model.LogisticRegression()
    rbm = BernoulliRBM(random_state=0, verbose=True)
    classifier = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)])

    # Training. These parameters were chosen by cross-validation
    # (GridSearchCV).
    rbm.learning_rate = 0.06
    rbm.n_iter = 20
    rbm.n_components = 100  # the RBM learns 100 features
    logistic.C = 6000

    # The RBM is trained unsupervised: it learns N representative vectors
    # from X_train, and X_train is then projected onto them to obtain new
    # N-dimensional features (similar in spirit to PCA).
    # rbm.fit(X_train, Y_train)
    rbm.fit(X_train)
    predicted_Y = rbm.transform(X_train)
    print(rbm.components_)  # a 100 x 64 matrix
    print(len(rbm.components_))
    print(len(rbm.components_[0]))
    print(predicted_Y)
    print(len(predicted_Y))
    print(len(predicted_Y[0]))
    print(len(X_train))
    print(len(X_train[0]))

    # Training RBM-Logistic Pipeline. The input is still the per-feature
    # normalised X_train; Y_train holds the labels 0-9.
    print("Start Training RBM-Logistic Pipeline")
    classifier.fit(X_train, Y_train)

    # Training Logistic regression
    logistic_classifier = linear_model.LogisticRegression(C=100.0)
    logistic_classifier.fit(X_train, Y_train)

    # Evaluation
    print("Logistic regression using RBM features: \n%s\n"
          % (metrics.classification_report(Y_test,
                                           classifier.predict(X_test))))
    print("Logistic regression using raw features: \n%s\n"
          % (metrics.classification_report(
              Y_test, logistic_classifier.predict(X_test))))

    # Plotting: each of the 100 components is a 64-dimensional vector,
    # reshaped to 8x8 for display.
    plt.figure(figsize=(4.2, 4))
    for i, comp in enumerate(rbm.components_):
        plt.subplot(10, 10, i + 1)
        plt.imshow(comp.reshape(8, 8), cmap=plt.cm.gray_r)
        plt.xticks(())
        plt.yticks(())
    plt.suptitle('100 components extracted by RBM', fontsize=16)
    plt.subplots_adjust(left=0.08, bottom=0.02, right=0.92, top=0.85,
                        wspace=0.23)

    plt.show()
from scipy.ndimage import convolve
from sklearn import linear_model, datasets, metrics
from sklearn.cross_validation import train_test_split
from sklearn.neural_network import BernoulliRBM
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from NeuralNetwork import NeuralNetwork

# Keep the 20 features ranked best by the ANOVA F-test, then hold out 30%
# of the samples for testing.
selector = SelectKBest(f_classif, k=20)
X_new = selector.fit_transform(data, label)
X_train, X_test, Y_train, Y_test = train_test_split(
    X_new, label, test_size=0.3, random_state=0)

# RBM feature extractor followed by logistic regression.
# More components tend to give better prediction performance, but larger
# fitting time.
logistic = linear_model.LogisticRegression(C=8000.0)
rbm = BernoulliRBM(n_components=100, learning_rate=0.1, n_iter=20,
                   random_state=0, verbose=True)
classifier = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)])

# Training RBM-Logistic Pipeline
classifier.fit(X_train, Y_train)

# Plain logistic regression, to be trained on the selected features
logistic_classifier = linear_model.LogisticRegression(C=200.0)
# The first n_labels columns of each set hold the label scores (presumably
# one-hot -- confirm); the remaining columns are the features.
X_train = np_train_set[:, n_labels:]
Y_train = np.argmax(np_train_set[:, :n_labels], axis=1)
X_test = np_test_set[:, n_labels:]
Y_test = np.argmax(np_test_set[:, :n_labels], axis=1)

###############################################################################
# Models and training
# Hyper-parameters come from learning_rate, training_epochs, batch_size and
# n_hidden. More components tend to give better prediction performance, but
# larger fitting time.
logistic = linear_model.LogisticRegression(C=1000.0)
rbm = BernoulliRBM(n_components=n_hidden, learning_rate=learning_rate,
                   batch_size=batch_size, n_iter=training_epochs,
                   random_state=0, verbose=True)
classifier = Pipeline(steps=[("rbm", rbm), ("logistic", logistic)])

# Training RBM-Logistic Pipeline (X_train is np_train_set[:, n_labels:])
classifier.fit(X_train, Y_train)

# Training Logistic regression
logistic_classifier = linear_model.LogisticRegression(C=100.0)
logistic_classifier.fit(X_train, Y_train)

###############################################################################
# Models we will use. The pipeline feeds RBM features into an SVC; the
# logistic estimator is configured but is not part of the pipeline.
logistic = linear_model.LogisticRegression(C=6000.0)
# More components tend to give better prediction performance, but larger
# fitting time.
rbm = BernoulliRBM(n_components=100, learning_rate=0.06, n_iter=20,
                   random_state=0, verbose=True)
svc = sklearn.svm.SVC()

# classifier = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)])
classifier = Pipeline(steps=[('rbm', rbm), ('svc', svc)])

###############################################################################
# Training

# Training RBM-SVC Pipeline
classifier.fit(X_train, Y_train)

# Training Logistic regression on the raw features
logistic_classifier = linear_model.LogisticRegression(C=100.0)
logistic_classifier.fit(X_train, Y_train)

###############################################################################
# Evaluation
# Search 10 evenly spaced learning rates in [0.04, 0.05] for the pipeline
# step named 'rbm_layer_1'.
parameters = {'rbm_layer_1__learning_rate': np.linspace(0.04, 0.05, num=10)}
gridSearch = grid_search.GridSearchCV(classifier, parameters)

# Training RBM-Logistic Pipeline
print("Performing grid search...")
print("pipeline:", [name for name, _ in classifier.steps])
print("parameters:")
pprint(parameters)
print(gridSearch.fit(X_train, Y_train))
print("Best score: %0.3f" % gridSearch.best_score_)
print("Best parameters set:")
best_parameters = gridSearch.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

rbm_layer_1.learning_rate = 0.048888888888888891
rbm_layer_1.n_iter = 25
rbm_layer_1.n_components = 100

print("Debut training RBM1")
print(X_train.shape)
# time.clock() was removed in Python 3.8; perf_counter() is the replacement
# for measuring elapsed time.
t0 = time.perf_counter()
rbm_layer_1.fit(X_train)
print(time.perf_counter() - t0)

# Build a training set for the second layer by sampling the hidden
# variables of the first RBM
n_sample_second_layer_training = int(X.shape[0])
H1_train = np.zeros(shape=(n_sample_second_layer_training, rbm_layer_1.n_components))
H1_label_train = np.zeros(shape = (n_sample_second_layer_training, 1))
comp = 0
def classifier(train_num, use_profile=False):
    """Train an L1 logistic regression on extracted features and report it.

    Features and labels come from ``feature_extractor(train_num,
    use_profile)``. 20% of the samples are held out (``random_state=0``).
    The function prints a classification report, the fitted coefficients,
    the accuracy, the mean squared error, and the share of positives among
    the test samples with the highest log-probability for class column 1.

    Args:
        train_num: Passed through to ``feature_extractor``.
        use_profile: Passed through to ``feature_extractor``.

    Returns:
        Accuracy of the logistic regression on the held-out test split.
    """
    X, Y = feature_extractor(train_num, use_profile)
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.2, random_state=0)

    logistic_classifier = linear_model.LogisticRegression(C=100.0,
                                                          penalty='l1')
    logistic_classifier.fit(X_train, Y_train)
    bestLRclf = logistic_classifier

    # Predict once and reuse the result for every metric below.
    predictions = bestLRclf.predict(X_test)
    accuracy = metrics.accuracy_score(Y_test, predictions)

    print("Logistic regression using raw features:\n%s\n" % (
        metrics.classification_report(Y_test, predictions)))
    # Single-argument, %-formatted prints give the same output on Python 2
    # and Python 3.
    print(bestLRclf.coef_)
    print("logistic_classifier accuracy %s" % accuracy)
    print("logistic_regression mean_squared_error %s"
          % metrics.mean_squared_error(Y_test, predictions))

    # Rank test samples by log-probability of class column 1 and count the
    # true positives among the top 426. The cutoff is capped at the test
    # set size so smaller splits no longer raise IndexError.
    log_proba = bestLRclf.predict_log_proba(X_test)
    sorted_index = np.argsort(log_proba[:, 1])
    top_k = min(426, len(sorted_index))
    correct_count = 0
    for index in sorted_index[::-1][:top_k]:
        if Y_test[index] == 1:
            correct_count += 1
    correct_percentage = correct_count / float(top_k) if top_k else 0.0
    print("correct_percentage %s" % correct_percentage)

    return accuracy
[nn_clf1.score(x_train1,y_train1), nn_clf1.score(x_test1,y_test1)] # Commented out IPython magic to ensure Python compatibility. print("MLPClassifier:\n%s\n"\ # %(classification_report(y_test1, nn_clf1.predict(x_test1)))) x_train1_scaled=\ (x_train1-numpy.min(x_train1,0))/(numpy.max(x_train1,0)+0.0001) x_test1_scaled=\ (x_test1-numpy.min(x_test1,0))/(numpy.max(x_test1,0)+0.0001) logistic=LogisticRegression(solver='liblinear',multi_class='ovr', max_iter=50,tol=0.0001,C=5000.0) brbm=BernoulliRBM(random_state=0,verbose=False) brbm.learning_rate,brbm.n_iter,brbm.n_components=0.05,50,64 nn_clf2=Pipeline(steps=[('brbm',brbm),('logistic',logistic)]) nn_clf2.fit(x_train1_scaled,y_train1) # Commented out IPython magic to ensure Python compatibility. print("Logistic regression using BRBM features:\n%s\n"\ # %(classification_report(y_test1, nn_clf2.predict(x_test1_scaled)))) """# Classification 2""" n_total=42000; n_labeled=38000 X2=numpy.copy(x_train2.reshape(-1,784)[:n_total]) y2=numpy.copy(y_train2[:n_total]).astype('int64') y2[n_labeled:]=-1 lp_model=label_propagation\
#plt.show()

# Pre-train networks using Restricted Boltzmann Machines.

# First layer: scale every feature to [0, 1], then prepend a column of ones.
min_max_scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
bigMatrix = min_max_scaler.fit_transform(bigMatrix)
vectorOfOnes = np.ones((bigMatrix.shape[0], 1))
bigMatrix = np.hstack((vectorOfOnes, bigMatrix))

# Divide train Matrix and Test Matrix (for which I don't have labels)
trainMatrixReduced = bigMatrix[someOtherNumbers, :]
testMatrixReduced = bigMatrix[testIndexes, :]

# Fit the first RBM on all rows, then push the data through its weights to
# obtain the hidden-layer activations.
RBM1 = BernoulliRBM(n_components=700, learning_rate=0.04, n_iter=20,
                    verbose=True)
RBM1.fit(bigMatrix)

ThetaHiddenOne = RBM1.components_.T
bigMatrix = sigmoid(bigMatrix.dot(ThetaHiddenOne))
vectorOfOnes = np.ones((bigMatrix.shape[0], 1))
bigMatrix = np.hstack((vectorOfOnes, bigMatrix))

# Second layer
RBM2 = BernoulliRBM(learning_rate=0.03, n_iter=20, verbose=True)