def __init__(self, methodname='linear regression', trainingpart=0.9):
    """
    Class initializer.
    :param methodname: model type, one of 'linear regression', 'svc', 'neural network'
    :param trainingpart: fraction of the data used for training, in the open interval (0, 1); default 0.9
    """
    if trainingpart <= 0 or trainingpart >= 1:
        raise ValueError("trainingpart must lie in the open interval (0, 1)")
    # choose the model
    if methodname == 'linear regression':
        self.model = LinearRegression()
    elif methodname == 'svc':
        self.model = SVC()
        print("Warning: your y data must be integer class labels!")
    elif methodname == 'neural network':
        self.model = MLPClassifier()
        print("Warning: your y data must be integer class labels!")
    else:
        methodname = 'linear regression'
        self.model = LinearRegression()
    # remaining attributes
    self.trainingpart = trainingpart
    self.methodname = methodname
    self.X = None
    self.y = None
    self.train_X = None
    self.test_X = None
    self.train_y = None
    self.test_y = None
class BCISignal(): def __init__(self, fs, bands, ch_names, states_labels, indexes): self.states_labels = states_labels self.bands = bands self.prefilter = FilterSequence([ButterFilter((0.5, 45), fs, len(ch_names))]) self.csp_pools = [SpatialDecompositionPool(ch_names, fs, bands, 'csp', indexes) for _label in states_labels] self.csp_transformer = None self.var_detector = InstantaneousVarianceFilter(len(bands)*len(indexes)*len(states_labels), n_taps=fs//2) self.classifier = MLPClassifier(hidden_layer_sizes=(), early_stopping=True, verbose=True) #self.classifier = RandomForestClassifier(max_depth=3, min_samples_leaf=100) def fit(self, X, y=None): X = self.prefilter.apply(X) for csp_pool, label in zip(self.csp_pools, self.states_labels): csp_pool.fit(X, y == label) self.csp_transformer = FilterStack([pool.get_filter_stack() for pool in self.csp_pools]) X = self.csp_transformer.apply(X) X = self.var_detector.apply(X) self.classifier.fit(X, y) print('Fit accuracy {}'.format(sum(self.classifier.predict(X) == y)/len(y))) def apply(self, chunk: np.ndarray): chunk = self.prefilter.apply(chunk) chunk = self.csp_transformer.apply(chunk) chunk = self.var_detector.apply(chunk) predicted_labels = self.classifier.predict(chunk) return predicted_labels
def init_Q(): # make some dummy training set board = init_board() board_vec = board2vec(board) X = np.array([board_vec]) y = [(BOARD_SIZE-1)**2] board_vec = np.invert(board_vec) X = np.append(X,np.array([board_vec]),axis=0) y.append(0) edges = get_potential_moves(board) # all the edges, since the board is empty for edge in edges: i = edge2ind(edge) board_vec[i] = False X = np.append(X,np.array([board_vec]),axis=0) y.append(check_surrounding_squares(board,edge,0)) board_vec[i] = True Q = MLPClassifier(warm_start=True, hidden_layer_sizes=(BOARD_SIZE,10*BOARD_SIZE,BOARD_SIZE), tol = 1e-10, ) # Q = DecisionTreeRegressor() # shf = range(len(y)) # for j in xrange(100): # random.shuffle(shf) # Xshf = [X[i] for i in shf] # yshf = [y[i] for i in shf] triedy = range((BOARD_SIZE-1)**2+1) Q.partial_fit(np.repeat(X,100,axis=0),np.repeat(y,100,axis=0),classes=triedy) print(Q.predict(X)) return(Q)
def main():
    enc = OneHotEncoder(n_values=[7, 7, 7, 7, 7, 7])
    conn = sqlite3.connect('server.db')
    cursor = conn.cursor()
    all_ = pandas.read_sql_query(
        'SELECT layers.burger, labels.output, layers.layer0, layers.layer1, layers.layer2, layers.layer3, layers.layer4, layers.layer5 '
        'FROM layers,labels WHERE layers.burger = labels.burger',
        conn, index_col='burger')
    X = all_.drop(['output'], axis=1)
    y = all_['output']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)
    clf = MLPClassifier(solver='adam', activation='relu', verbose=False, max_iter=10000, tol=1e-9, random_state=1)
    X_train_categoricals = X_train[column_names]
    tX_train_categoricals = enc.fit_transform(X_train_categoricals)
    clf.fit(tX_train_categoricals, y_train.as_matrix().astype(int))
    X_test_categoricals = X_test[column_names]
    # reuse the encoder fitted on the training data; re-fitting on the test set
    # is bad practice and can change the encoding
    tX_test_categoricals = enc.transform(X_test_categoricals)
    prediction = clf.predict(tX_test_categoricals)
    print(classification_report(y_test, prediction))
    print_eval(y_test, prediction)
def test_gradient(): # Test gradient. # This makes sure that the activation functions and their derivatives # are correct. The numerical and analytical computation of the gradient # should be close. for n_labels in [2, 3]: n_samples = 5 n_features = 10 X = np.random.random((n_samples, n_features)) y = 1 + np.mod(np.arange(n_samples) + 1, n_labels) Y = LabelBinarizer().fit_transform(y) for activation in ACTIVATION_TYPES: mlp = MLPClassifier(activation=activation, hidden_layer_sizes=10, solver='lbfgs', alpha=1e-5, learning_rate_init=0.2, max_iter=1, random_state=1) mlp.fit(X, y) theta = np.hstack([l.ravel() for l in mlp.coefs_ + mlp.intercepts_]) layer_units = ([X.shape[1]] + [mlp.hidden_layer_sizes] + [mlp.n_outputs_]) activations = [] deltas = [] coef_grads = [] intercept_grads = [] activations.append(X) for i in range(mlp.n_layers_ - 1): activations.append(np.empty((X.shape[0], layer_units[i + 1]))) deltas.append(np.empty((X.shape[0], layer_units[i + 1]))) fan_in = layer_units[i] fan_out = layer_units[i + 1] coef_grads.append(np.empty((fan_in, fan_out))) intercept_grads.append(np.empty(fan_out)) # analytically compute the gradients def loss_grad_fun(t): return mlp._loss_grad_lbfgs(t, X, Y, activations, deltas, coef_grads, intercept_grads) [value, grad] = loss_grad_fun(theta) numgrad = np.zeros(np.size(theta)) n = np.size(theta, 0) E = np.eye(n) epsilon = 1e-5 # numerically compute the gradients for i in range(n): dtheta = E[:, i] * epsilon numgrad[i] = ((loss_grad_fun(theta + dtheta)[0] - loss_grad_fun(theta - dtheta)[0]) / (epsilon * 2.0)) assert_almost_equal(numgrad, grad)
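# The test above checks scikit-learn's analytic gradient against a central-difference
# approximation. Below is a minimal, self-contained sketch of that checking idea on a
# toy quadratic loss; loss_and_grad and numerical_grad are illustrative helpers written
# for this sketch, not part of scikit-learn.
import numpy as np

def loss_and_grad(theta):
    # toy loss f(theta) = 0.5 * ||theta||^2, whose analytic gradient is theta itself
    return 0.5 * np.dot(theta, theta), theta.copy()

def numerical_grad(f, theta, epsilon=1e-5):
    # central differences: (f(theta + e_i*eps) - f(theta - e_i*eps)) / (2*eps)
    grad = np.zeros_like(theta)
    for i in range(theta.size):
        step = np.zeros_like(theta)
        step[i] = epsilon
        grad[i] = (f(theta + step)[0] - f(theta - step)[0]) / (2.0 * epsilon)
    return grad

theta = np.random.RandomState(0).randn(4)
_, analytic = loss_and_grad(theta)
numeric = numerical_grad(loss_and_grad, theta)
assert np.allclose(analytic, numeric, atol=1e-6)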
def mlp_cv_architecture(X,Y): kfold = KFold(X.shape[0], n_folds = 10) architectures = ( (500,2), (400,2), (400,100,2), (400,200,2), (400,100,50,2), (400,200,50,2) ) res_dict = {} for architecture in architectures: mlp = MLPClassifier( algorithm = 'sgd', learning_rate = 'adaptive', hidden_layer_sizes = architecture, random_state = 1) train_times = [] train_accuracy = [] test_accuracy = [] for train, test in kfold: t_tr = time.time() mlp.fit( X[train], Y[train] ) train_times.append( time.time() - t_tr ) acc_train = np.sum( np.equal( mlp.predict( X[train]), Y[train] ) ) / float(X[train].shape[0]) acc_test = np.sum( np.equal( mlp.predict( X[test]), Y[test] ) ) / float(X[test].shape[0]) train_accuracy.append( acc_train ) test_accuracy.append( acc_test ) res_dict[str(architecture)] = (np.mean(train_accuracy), np.std(train_accuracy), np.mean(test_accuracy), np.std(test_accuracy), np.mean(train_times), np.std(train_times)) with open('./../results/res_nncv_architecture.pkl', 'w') as f: pickle.dump(res_dict,f)
def train(): utl.print_title('Getting data...') X, Tc, X_test, Tc_test = dpp.getdata_arnold() #X, Tc, X_test, Tc_test = dpp.getdata_mnist() utl.print_title('Preparing data...') X, X_test = dpp.scale_data(X, X_test) T = dpp.one_hot_encode(Tc) T_test = dpp.one_hot_encode(Tc_test) utl.print_title('Sanity checks...') print('Shape X:', X.shape) print('Shape Tc:', Tc.shape) print('Shape T:', T.shape) print('Shape X_test:', X_test.shape) print('Shape Tc_test:', Tc_test.shape) print('Shape T_test:', T_test.shape) utl.print_title('Training the network...') classifier = MLPClassifier(solver='adam', learning_rate_init=1e-3, hidden_layer_sizes=(100), verbose=True, max_iter=200) classifier.fit(X, T) train_score, Pc = get_results(classifier, X, T) test_score, Pc_test = get_results(classifier, X_test, T_test) utl.print_title('Results:') print('Classification counts train (target): ', np.bincount(Tc.reshape(-1))) print('Classification counts train (prediction): ', np.bincount(Pc)) print('\nClassification counts test (target): ', np.bincount(Tc_test.reshape(-1))) print('Classification counts test (prediction): ', np.bincount(Pc_test)) print('\nTrain score: ', train_score) print('Test score: ', test_score)
def create(self):
    csvPath = self.sourceCsvFile
    dataset = np.loadtxt(
        csvPath, dtype='int', delimiter=",", skiprows=1,
        converters={
            4: convertCell,
            5: convertCell,
            6: convertCell,
            7: convertCell,
            8: convertCell,
            9: convertCell,
            10: convertCell,
            11: convertCell,
            12: convertCell,
            13: convertCell,
            14: convertCell,
            15: convertCell
        })
    non_cat_data = dataset[:, [0, 1, 2]]
    cat_data = dataset[:, [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]]
    output_data = dataset[:, 3]
    enc = preprocessing.OneHotEncoder()
    enc.fit(cat_data)
    cat_out = enc.transform(cat_data).toarray()
    # use the one-hot encoded categorical features (cat_out), not the raw category codes
    merge_data = np.concatenate((non_cat_data, cat_out), axis=1)
    d(merge_data[0])
    clf = MLPClassifier(algorithm='l-bfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)
    #clf = tree.DecisionTreeClassifier()
    clf = clf.fit(merge_data, output_data)
    s = pickle.dumps(clf)
    dtFileName = "%s\\save.pkl" % self.outDir
    # pickles must be written in binary mode
    dtFile = open(dtFileName, 'wb')
    print dtFileName
    dtFile.write(s)
    dtFile.close()
    choicesFile = open("%s\\choices.pkl" % self.outDir, 'wb')
    s = pickle.dumps(choiceArr)
    choicesFile.write(s)
    choicesFile.close()
    sample_inputs = []
    for i in range(100):
        sample_inputs.append(merge_data[i * 500])
    file = open("%s\\sampleInputs.pkl" % self.outDir, 'wb')
    file.write(pickle.dumps(sample_inputs))
    file.close()
    file = open("%s\\def.txt" % self.outDir, 'w')
    file.write("input file: %s\n" % self.sourceCsvFile)
    file.close()
    print dataset[722]
    print merge_data[722]
    print output_data[722]
    print clf.predict(sample_inputs)
class NeuralLearner(Learner.Learner): def __init__(self, FeatureMask): super(NeuralLearner, self).__init__(FeatureMask) self.expected = FeatureMask.LabelsForAllPoints #self.model = MLPClassifier(algorithm='sgd', hidden_layer_sizes=(64,32)) self.model = MLPClassifier(algorithm = 'sgd', learning_rate = 'constant', momentum = .9, nesterovs_momentum = True, learning_rate_init = 0.2) def FitAndPredict(self, mask): return self.Predict(self.Fit(mask)) def SetupInputActivations(self, FeatureMask): arr = np.hstack([FeatureMask.ForceStd.reshape(-1,1), FeatureMask.ForceMinMax.reshape(-1,1), FeatureMask.CannyFilter.reshape(-1,1)]) expected = FeatureMask.LabelsForAllPoints return arr, expected def Fit(self, mask): arr, expected = self.SetupInputActivations(mask) self.model.fit(arr, expected) def Predict(self, mask): arr, expected = self.SetupInputActivations(mask) return self.model.predict(arr).reshape(-1,1)
def train_on_source(X,Y): print "Start Learning Net on source" clf = MLPClassifier( algorithm = 'l-bfgs', alpha = 1e-5, hidden_layer_sizes = (500,2), random_state = 1, warm_start = 1, max_iter = 400) clf.fit(X,Y) #new_loss = 0 #old_loss = 10000 #for step in range(200): # clf.fit(X,Y) # new_loss = clf.loss_ # # stop training, if improvement is small # improvement = abs(new_loss - old_loss) # print "Step:", step, "Loss:", new_loss, "Improvement:", improvement # if improvement < 1.e-5: # print "Training converged!" # break # old_loss = new_loss print "Pretrained CLF on Source with num_iter:", clf.n_iter_ return clf
def test_partial_fit_classes_error(): # Tests that passing different classes to partial_fit raises an error X = [[3, 2]] y = [0] clf = MLPClassifier(solver='sgd') clf.partial_fit(X, y, classes=[0, 1]) assert_raises(ValueError, clf.partial_fit, X, y, classes=[1, 2])
def train(classes, y_samples, feature_dict, classes_dict):
    # Uses the scikit-learn dev version (0.18 at the time) that introduced MLPClassifier
    from sklearn.neural_network import MLPClassifier
    clf = MLPClassifier(algorithm='l-bfgs', alpha=1e-5, hidden_layer_sizes=(50, 25), random_state=1, verbose=True)
    clf.fit(y_samples, classes)
    return clf
def test_adaptive_learning_rate(): X = [[3, 2], [1, 6]] y = [1, 0] clf = MLPClassifier(tol=0.5, max_iter=3000, solver='sgd', learning_rate='adaptive') clf.fit(X, y) assert_greater(clf.max_iter, clf.n_iter_) assert_greater(1e-6, clf._optimizer.learning_rate)
def mlp_train(self,x_train,y_train): scaler = StandardScaler() scaler.fit(x_train) x_train = scaler.transform(x_train) clf = MLPClassifier(max_iter=500,alpha=1e-5,hidden_layer_sizes=(40,100,80),warm_start=True,random_state=0) clf.fit(x_train,y_train) return clf
def test_tolerance(): # Test tolerance. # It should force the solver to exit the loop when it converges. X = [[3, 2], [1, 6]] y = [1, 0] clf = MLPClassifier(tol=0.5, max_iter=3000, solver='sgd') clf.fit(X, y) assert_greater(clf.max_iter, clf.n_iter_)
def main(): iris = datasets.load_iris() X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target) classifier = MLPClassifier(max_iter=1000) classifier.fit(X_train, y_train) s = classifier.score(X_test, y_test) print(s)
def fitMLPs(trainIndexes, datasets): classifiers = [] for (x,y) in datasets: cl = MLPClassifier(algorithm='l-bfgs', alpha=1e-4, hidden_layer_sizes=(76, 30), random_state=1, momentum=0.8) data, target = listToData(trainIndexes, x, y) cl.fit(data, target) classifiers.append(cl) return classifiers
def fit_and_score_ann(x_train, y_train, x_test, y_test, config): ann = MLPClassifier(solver=config.ann.solver, max_iter=Configuration.ANN_MAX_ITERATIONS, alpha=config.ann.alpha, hidden_layer_sizes=(config.ann.hidden_neurons,), learning_rate='adaptive') ann.fit(x_train, y_train) return ann.score(x_test, y_test)
def mlpTest(self): mlp = MLPClassifier(hidden_layer_sizes=(100, 100), max_iter=1000, alpha=1e-4, solver ='sgd', verbose=10, tol=1e-4, random_state=1) mlp.fit(self.X_train,self.Y_train) predicted = mlp.predict(self.X_test) print("Classification report for classifier %s:\n%s\n" % (mlp, metrics.classification_report(self.Y_test, predicted))) print("Confusion matrix:\n%s" % metrics.confusion_matrix(self.Y_test, predicted))
def do_mlp(x_train, x_test, y_train, y_test): clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes = (10, 4), random_state = 1) clf.fit(x_train, y_train) y_pred = clf.predict(x_test) print(classification_report(y_test, y_pred))
def do_mlp(x_train, x_test, y_train, y_test): #mlp clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1) clf.fit(x_train, y_train) y_pred = clf.predict(x_test) do_metrics(y_test,y_pred)
def main(): np.random.seed(RANDOM_STATE) pd.set_option('display.width', 0) pd.set_option('display.max_rows', None) pd.set_option('display.max_columns', None) data = pd.read_csv('data/train.csv') #test_data = pd.read_csv('data/test.csv') records = [] #n = 42000*0.8 n = 10000 X, y = extract_data(data, n) activation = 'tanh' param_dict = {'batch_size': [100, 200], 'momentum': [0.9, 0.99 ], 'learning_rate_init':[0.001, 0.01, 0.1]} #param_dict = {'batch_size': [200], 'momentum': [0.9], 'learning_rate_init':[0.1]} for param in ParameterGrid(param_dict): nn = MLPClassifier(algorithm='sgd', tol=float('-inf'), warm_start = True, max_iter=1, hidden_layer_sizes = [200], random_state=RANDOM_STATE) #nn_params = {'algorithm': 'sgd', 'tol': float nn_params = nn.get_params() nn_params.update(param) nn.set_params(**nn_params) #nn = MLPClassifier(**nn_params) time_limits = list(range(1, 60, 60)) try: evaluation_list = trainer_by_time(X, y, time_limits, nn) except: evaluation_list = [{}] for i in range(len(evaluation_list)): evaluation = evaluation_list[i] record = {} record['n'] = n record['time limit'] = time_limits[i] record.update(evaluation) record.update(param) records.append(record) df = pd.DataFrame(records) cols = list(df.columns) keys = evaluation_list[0].keys() cols = [item for item in cols if item not in keys] cols += keys df = df.reindex(columns=cols) now = datetime.datetime.now() result_file = open('result.txt', 'a') print(now,file=result_file) print(df) print(df,file=result_file)
def MLP_classifier(train_x, train_y): clf = MLPClassifier(activation='relu', algorithm='adam', alpha=0.0001, batch_size='auto', beta_1=0.9, beta_2=0.999, early_stopping=True, epsilon=1e-08, hidden_layer_sizes=([50,50]), learning_rate='constant', learning_rate_init=0.01, max_iter=3000, momentum=0.9, nesterovs_momentum=True, power_t=0.5, random_state=0, shuffle=True, validation_fraction=0.1, verbose=False, warm_start=False) clf.fit(train_x, train_y) return clf
def test_early_stopping_stratified(): # Make sure data splitting for early stopping is stratified X = [[1, 2], [2, 3], [3, 4], [4, 5]] y = [0, 0, 0, 1] mlp = MLPClassifier(early_stopping=True) with pytest.raises( ValueError, match='The least populated class in y has only 1 member'): mlp.fit(X, y)
def do_mlp(x_train, x_test, y_train, y_test):
    # Build a small neural network
    clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    print(classification_report(y_test, y_pred))
    print(metrics.confusion_matrix(y_test, y_pred))
def neural_network_voting_systemLogistic(): import pydotplus a,b,c,d,e,f = traing_test_data_set(); iterations = [75, 60, 90, 95, 95]; voting_pred = list(); for i in range(0, len(d[0])): voting_pred.append([]); import random for feature_number in range(1, 6): print("Feature Number : " + str(feature_number)); train_data, train_label = a[feature_number - 1], b[feature_number - 1]; test_data, test_label = c[feature_number - 1], d[feature_number - 1]; # use feature scaling for rbf kernel # from sklearn.preprocessing import StandardScaler # scaler = StandardScaler(); # scaler.fit(train_data); # train_data = scaler.transform(train_data); # test_data = scaler.transform(test_data); #rnd = list(zip(train_data,train_label)); #random.shuffle(rnd); #train_data, train_label = zip(*rnd) from sklearn.preprocessing import StandardScaler scaler = StandardScaler(); scaler.fit(train_data); train_data = scaler.transform(train_data); test_data = scaler.transform(test_data); from sklearn.neural_network import MLPClassifier clf = MLPClassifier(alpha=1, hidden_layer_sizes=(15,), random_state=1, activation='logistic',max_iter =1000,early_stopping=False) clf.fit(train_data, train_label) tot = len(test_label); cnt = 0; print(clf.n_iter_); for i in range(0, len(test_data)): voting_pred[i].append(clf.predict([test_data[i]])[0]); tot = len(test_label); cnt = 0; prediction = list(); for i in range(0, len(test_data)): prediction.append(most_common(voting_pred[i])); if prediction[i] != test_label[i]: print(str(i) + " " + str(prediction[i]) + " " + str(test_label[i])); cnt += 1; from sklearn.metrics import accuracy_score from sklearn.metrics import precision_score from sklearn.metrics import f1_score print("Complete for Voting system :"); print("Total test set size : " + str(len(test_label))); print("Correct prediction : " + str(tot - cnt)); print("Incorrect Prediction : " + str(cnt)); print("Accuracy : " + str(accuracy_score(test_label, prediction) * 100.0)) print("Precision : " + str(precision_score(test_label, prediction, average='weighted') * 100.0)) print("F1 Score : " + str(f1_score(test_label, prediction, average='weighted') * 100.0)) print("Error Rate : " + str(cnt / tot * 100.0)); print("---------------------------------------\n");
def test_bool_and(self):
    x = ((0, 0), (1, 1), (1, 0), (0, 1))
    y = (0, 1, 0, 0)
    mlp = MLPClassifier(hidden_layer_sizes=(), activation='logistic',
                        max_iter=2, alpha=1e-4, algorithm='l-bfgs',
                        verbose=False, tol=1e-4, random_state=1,
                        learning_rate_init=.1)
    mlp.fit(x, y)
    # predict() expects a 2-D array of samples, so wrap each point in a list
    assert mlp.predict([(0, 0)]) == 0
    assert mlp.predict([(0, 1)]) == 0
    assert mlp.predict([(1, 0)]) == 0
    assert mlp.predict([(1, 1)]) == 1
def Neural_network(self, X_train, Y_train, X_test, Y_test):
    from sklearn import metrics
    from sklearn.neural_network import MLPClassifier
    model = MLPClassifier()
    model.fit(X_train, Y_train)
    expected = Y_test
    predicted = model.predict(X_test)
    fpr, tpr, thres = metrics.roc_curve(expected, predicted)
    print(metrics.classification_report(expected, predicted))
    # print(metrics.confusion_matrix(expected, predicted))
    print(metrics.auc(fpr, tpr))
def test_bool_onehot(self):
    X = [x for x in itertools.combinations_with_replacement([True, False], 9)]
    y = [True if sum(a) == 1 else False for a in X]
    X_r = repeat_data(X)
    y_r = repeat_data(y)
    mlp = MLPClassifier(hidden_layer_sizes=(2,), activation='logistic',
                        max_iter=10000, alpha=1e-4, algorithm='l-bfgs',
                        verbose=False, tol=1e-4, random_state=1,
                        learning_rate_init=.1)
    mlp.fit(X_r, y_r)
    assert (mlp.score(X, y) > 0.9)
    for x in X:
        # predict() expects a 2-D array of samples, so wrap the single sample
        self.assertEqual(mlp.predict([x])[0], (sum(x) == 1))
class AnnClassifier(AbstractClassifier): def __init__(self, features, target, solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(15,), random_state=1): self.solver = solver self.alpha = alpha self.hidden_layer_sizes = hidden_layer_sizes self.random_state = random_state super(AnnClassifier, self).__init__(features, target) def __fit(self, features): self.clf = MLPClassifier(solver=self.solver, alpha=self.alpha, hidden_layer_sizes=self.hidden_layer_sizes, random_state=self.random_state) self.clf.fit(features, self.target)
x = pd.get_dummies(cf)
categorical = ['token', 'next', 'previous']
x = x.to_numpy()
#x = pd.get_dummies(df[columns=categorical])
#print(x)
"""
print(type(y))
print(df)
print("0th token = ", token_exp[0])
print("rest")
print(df.shape)
print(df['class'])
print("Type of dummy is : ", type(dummy))"""
print(type(y))
X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size=0.25, random_state=40)
classifier = MLPClassifier(activation="logistic", solver='sgd', alpha=0.1, hidden_layer_sizes=(5, 15))
classification = classifier.fit(X_train, Y_train)
# Save the model as a pickle in a file
joblib.dump(classifier, 'NeuralNet.pkl')
# Load the model from the file
nn_from_joblib = joblib.load('NeuralNet.pkl')
Y_pred = nn_from_joblib.predict(X_test)
confusion = confusion_matrix(Y_test, Y_pred)
print("confusion matrix : \n", confusion)
accuracy = accuracy_score(Y_test, Y_pred) * 100
print("System accuracy = ", accuracy)
c = precision_score(Y_test, Y_pred, average='macro') * 100
print("Precision of the system = ", c)
d = recall_score(Y_test, Y_pred, average='micro') * 100
print("Recall of the system = ", d)
df_train, df_test, X_train, Y_train, X_test, Y_test = get_train_test( df_glass, y_col_glass, x_cols_glass, train_test_ratio) #%% CREATE DICTIONARY OF VARIOUS CLASSIFIERS TO TRY dict_classifiers = { "Logistic Regression": LogisticRegression(), "Nearest Neighbors": KNeighborsClassifier(), "Linear SVM": SVC(), "Gradient Boosting Classifier": GradientBoostingClassifier(n_estimators=1000), "Decision Tree": tree.DecisionTreeClassifier(), "Random Forest": RandomForestClassifier(n_estimators=1000), "Neural Net": MLPClassifier(alpha=1), "Naive Bayes": GaussianNB(), "AdaBoost": AdaBoostClassifier(), "QDA": QuadraticDiscriminantAnalysis(), "Gaussian Process": GaussianProcessClassifier() } #%% BATCH CLASSIFIER def batch_classify(X_train, Y_train, X_test, Y_test, no_classifiers=11, verbose=True):
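    # The original body of batch_classify is cut off here; the loop below is a plausible
    # sketch, not the original code. Timing with time.time() and the returned dictionary
    # layout are assumptions. It fits and scores each classifier from dict_classifiers
    # defined above.
    import time
    results = {}
    for name, classifier in list(dict_classifiers.items())[:no_classifiers]:
        t_start = time.time()
        classifier.fit(X_train, Y_train)              # train the model
        train_time = time.time() - t_start
        results[name] = {
            'train_score': classifier.score(X_train, Y_train),
            'test_score': classifier.score(X_test, Y_test),
            'train_time': train_time,
        }
        if verbose:
            print("trained {} in {:.2f} s".format(name, train_time))
    return results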
def mplc(x_train, x_test, y_train, y_test, solver): clf = MLPClassifier(solver = solver) clf.fit(x_train, y_train.values.ravel()) predict(clf, x_train, x_test, y_train, y_test)
## print training accuracy
print('train acc: ', accuracy_score(rsvc.predict(X_train), y_train))
## print test accuracy
print('test acc: ', accuracy_score(rsvc.predict(X_test), y_test))
# Isn't that just amazing accuracy?
# ## Basic Neural Network
# You should never do neural networks in sklearn. Use Keras (which we will teach you later in this class), Tensorflow, PyTorch, etc. However, in an effort to keep this homework somewhat cohesive, let us proceed.
# Basic neural networks proceed in layers. Each layer has a certain number of nodes, representing how expressive that layer can be. Below is a sample network, with an input layer, one hidden (middle) layer of 50 neurons, and finally the output layer.
# In[ ]:
nn = MLPClassifier(hidden_layer_sizes=(50, ), solver='adam', verbose=1)
## fit the nn on the training split (fitting on all of X would leak the test set)
nn.fit(X_train, y_train)
print('Basic Neural Network Accuracy')
## print training accuracy
print('train acc: ', accuracy_score(nn.predict(X_train), y_train))
## print test accuracy
print('test acc: ', accuracy_score(nn.predict(X_test), y_test))
# Fiddle around with the hidden layers. Change the number of neurons, add more layers, experiment. You should be able to hit 98% accuracy (one possible deeper configuration is sketched after this cell).
# Neural networks are optimized with a technique called gradient descent (a neural net is just one big function - so we can take the gradient with respect to all its parameters, then just go opposite the gradient to try and find the minimum). This is why it requires many iterations to converge.
# ## Turning In
# Convert this notebook to a PDF (file -> download as -> pdf via latex) and submit to Gradescope.
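# As suggested above, one way to experiment is a wider, deeper hidden-layer configuration.
# The layer sizes below are illustrative assumptions, not the assignment's answer, and the
# snippet reuses X_train/y_train/X_test/y_test from the cells above.
nn_deeper = MLPClassifier(hidden_layer_sizes=(200, 100), solver='adam', max_iter=300, verbose=1)
nn_deeper.fit(X_train, y_train)
print('deeper net train acc: ', accuracy_score(nn_deeper.predict(X_train), y_train))
print('deeper net test acc: ', accuracy_score(nn_deeper.predict(X_test), y_test))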
classifier1 = neighbors.KNeighborsClassifier(weights='distance') classifier1.fit(trainX, trainY) prediction_clf1 = classifier1.predict(testX) print(prediction_clf1) print(metrics.accuracy_score(testY, prediction_clf1)) classifier2 = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 6), random_state=1, learning_rate='invscaling', max_iter=200) classifier2.fit(trainX, trainY) prediction_clf2 = classifier2.predict(testX) print(prediction_clf2) print(metrics.accuracy_score(testY, prediction_clf2)) #print(trainX)
from builtins import range # Note: you may need to update your version of future # sudo pip install -U future import numpy as np from util import getKaggleMNIST from datetime import datetime from sklearn.pipeline import Pipeline from sklearn.preprocessing import StandardScaler from sklearn.neural_network import MLPClassifier # get the data: https://www.kaggle.com/c/digit-recognizer Xtrain, Ytrain, Xtest, Ytest = getKaggleMNIST() # scale first pipeline = Pipeline([ # ('scaler', StandardScaler()), ('mlp', MLPClassifier(hidden_layer_sizes=(500,), activation='tanh')), ]) t0 = datetime.now() pipeline.fit(Xtrain, Ytrain) print("train duration:", datetime.now() - t0) t0 = datetime.now() print("train score:", pipeline.score(Xtrain, Ytrain), "duration:", datetime.now() - t0) t0 = datetime.now() print("test score:", pipeline.score(Xtest, Ytest), "duration:", datetime.now() - t0)
cross = 10
test_size = (1 / cross)
X_train, X_test, y_train, y_test = train_test_split(
    datanew, target, stratify=target, test_size=test_size)
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
predrf = rf.predict_proba(X_test)
#print("rf: ", rf.score(X_test, y_test))
knn = KNeighborsClassifier(n_neighbors=10)
knn.fit(X_train, y_train)
predknn = knn.predict_proba(X_test)
#print("knn: ", knn.score(X_test, y_test))
mlp = MLPClassifier(hidden_layer_sizes=(50, 25, 10))
mlp.fit(X_train, y_train)
predmlp = mlp.predict_proba(X_test)
#print("mlp : ", mlp.score(X_test, y_test))
y_pred = []
for i in range(len(predrf)):
    l1 = predrf[i]
    l2 = predknn[i]
    l3 = predmlp[i]
    n1 = np.array(l1)
    n2 = np.array(l2)
    n3 = np.array(l3)
    pr = n1 + n2 + n3
    # soft voting: sum the per-class probabilities from the three models and
    # predict the class with the largest combined score
    y_pred.append(rf.classes_[np.argmax(pr)])
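# The loop above implements soft voting by hand. scikit-learn's VotingClassifier expresses
# the same idea directly; a sketch reusing the train/test split from above, with base-model
# settings mirroring those used earlier.
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

voting = VotingClassifier(
    estimators=[('rf', RandomForestClassifier()),
                ('knn', KNeighborsClassifier(n_neighbors=10)),
                ('mlp', MLPClassifier(hidden_layer_sizes=(50, 25, 10)))],
    voting='soft')  # average the predicted class probabilities
voting.fit(X_train, y_train)
print("voting: ", voting.score(X_test, y_test))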
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_pred_rf = rf.predict_proba(X_test)[:, 1]
fpr_rf, tpr_rf, _ = roc_curve(y_test, y_pred_rf)
precision_rf, recall_rf, _ = precision_recall_curve(y_test, y_pred_rf)
roc_auc_rf = auc(fpr_rf, tpr_rf)

dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)
y_pred_dt = dt.predict_proba(X_test)[:, 1]
fpr_dt, tpr_dt, _ = roc_curve(y_test, y_pred_dt)
precision_dt, recall_dt, _ = precision_recall_curve(y_test, y_pred_dt)
roc_auc_dt = auc(fpr_dt, tpr_dt)

mlp = MLPClassifier()
mlp.fit(X_train, y_train)
y_pred_mlp = mlp.predict_proba(X_test)[:, 1]
fpr_mlp, tpr_mlp, _ = roc_curve(y_test, y_pred_mlp)
precision_mlp, recall_mlp, _ = precision_recall_curve(y_test, y_pred_mlp)
roc_auc_mlp = auc(fpr_mlp, tpr_mlp)

# use a distinct name for the fitted model so the imported svm module is not shadowed
svc = svm.SVC(probability=True)
svc.fit(X_train, y_train)
y_pred_svm = svc.predict_proba(X_test)[:, 1]
fpr_svm, tpr_svm, _ = roc_curve(y_test, y_pred_svm)
precision_svm, recall_svm, _ = precision_recall_curve(y_test, y_pred_svm)
roc_auc_svm = auc(fpr_svm, tpr_svm)

sgd = SGDClassifier(loss='log')
sgd.fit(X_train, y_train)
from sklearn.ensemble import RandomForestClassifier from sklearn.neural_network import MLPClassifier X = df.headline y = df.label cv = ShuffleSplit(n_splits=20, test_size=0.2) models = [ MultinomialNB(), BernoulliNB(), LogisticRegression(), SGDClassifier(), LinearSVC(), RandomForestClassifier(), MLPClassifier() ] sm = SMOTE() # Init a dictionary for storing results of each run for each model results = { model.__class__.__name__: { 'accuracy': [], 'f1_score': [], 'confusion_matrix': [] } for model in models } for train_index, test_index in cv.split(X):
) base.loc[base.age < 0, 'age'] = 40.92 previsores = base.iloc[:, 1:4].values classe = base.iloc[:, 4].values from sklearn.impute import SimpleImputer imputer = SimpleImputer(missing_values=np.nan, strategy='mean') imputer = imputer.fit(previsores[:, 1:4]) previsores[:, 1:4] = imputer.transform(previsores[:, 1:4]) from sklearn.preprocessing import StandardScaler scaler = StandardScaler() previsores = scaler.fit_transform(previsores) from sklearn.model_selection import train_test_split previsores_treinameto, previsores_teste, classe_treinamento, classe_teste = train_test_split( previsores, classe, test_size=0.25, random_state=0) from sklearn.neural_network import MLPClassifier classificador = MLPClassifier(verbose=True, max_iter=1000) classificador.fit(previsores_treinameto, classe_treinamento) previsoes = classificador.predict(previsores_teste) from sklearn.metrics import confusion_matrix, accuracy_score precisao = accuracy_score(classe_teste, previsoes) matriz = confusion_matrix(classe_teste, previsoes) from collections import Counter Counter(classe_teste)
predicted_class_name = ['prediction']
df = pd.read_csv(r"C:\Python27\heart2.csv")
x = df[feature_column_names].values
y = df[predicted_class_name].values
split_test_size = .2
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=split_test_size, random_state=42)
mlp = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(500,), random_state=42)
mlp.fit(x_train, y_train.ravel())
expected = y_test
predicted_mlp = mlp.predict(x_test)
accuracy_mlp = mlp.score(x_test, y_test)
print(accuracy_mlp)
print(metrics.classification_report(expected, predicted_mlp))
print(metrics.confusion_matrix(expected, predicted_mlp))
cm_mlp = metrics.confusion_matrix(expected, predicted_mlp)
cm_mlp_list = cm_mlp.tolist()
cm_mlp_list[0].insert(0, 'Real True')
cm_mlp_list[1].insert(0, 'Real False')
print(tabulate(cm_mlp_list, headers=['Real/Pred', 'Pred True', 'Pred False']))
imgToDigit.convert_to_bw(thr)
digits = imgToDigit.split()
imgToDigit.to_32_32()
X, Y = imgToDigit.featureExtract_new()

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier

# Rescale the data so every feature column lies in a comparable range and the
# prediction is not dominated by dimensions with very large values
ma = X.max(0)    # column-wise maximum
ma[ma == 0] = 1  # many columns are all zero; avoid division by zero
mi = X.min(0)
X = X - mi
X = X / ma
clf = MLPClassifier(solver='lbfgs', hidden_layer_sizes=(200, 4), alpha=1e-5,
                    random_state=1, max_iter=40000)  # maximum number of iterations
clf.fit(X, Y)  # train the model
import pickle
with open('handWriting.bin', 'wb') as f:
    rs = pickle.dumps(clf)
    f.write(rs)

# %%
from PIL import Image
import pickle
import numpy as np
from ImageDigit import ImageDigit
with open('handWriting.bin', 'rb') as f:
    clf = pickle.load(f)
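# The manual min/max scaling above maps each feature column to [0, 1]. scikit-learn's
# MinMaxScaler does the equivalent and can be pickled alongside the classifier so the
# same scaling is applied at prediction time. A sketch, assuming X is the raw feature
# matrix returned by featureExtract_new:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)  # per-column (x - min) / (max - min)
# at prediction time: clf.predict(scaler.transform(new_features))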
from sklearn import datasets, svm, metrics from sklearn.naive_bayes import GaussianNB from sklearn.neural_network import MLPClassifier from sklearn.neighbors import KNeighborsClassifier digits = datasets.load_digits() n_samples = len(digits.images) data = digits.images.reshape((n_samples, -1)) #flattens the image NBclass = GaussianNB() print("Running NB...") NBclass.fit(data[:int(n_samples*2/3)], digits.target[:int(n_samples*2/3)]) NBpred = NBclass.predict(data[int(n_samples/3):]) MLPclass = MLPClassifier(alpha=1, hidden_layer_sizes=(25, 15), random_state=1) print("Running MLP...") MLPclass.fit(data[:int(n_samples*2/3)], digits.target[:int(n_samples*2/3)]) MLPpred = MLPclass.predict(data[int(n_samples/3):]) SVCclass = svm.SVC(gamma=1) print("Running SVC...") SVCclass.fit(data[:int(n_samples*2/3)], digits.target[:int(n_samples*2/3)]) SVCpred = SVCclass.predict(data[int(n_samples/3):]) KNEIclass = KNeighborsClassifier(3) print("Running KNEI...") KNEIclass.fit(data[:int(n_samples*2/3)], digits.target[:int(n_samples*2/3)]) KNEIpred = KNEIclass.predict(data[int(n_samples/3):]) print("Calculating means..."); predicted = []
""" import pandas as pd import numpy as np train = pd.read_csv( "/Users/jaynanda/Desktop/Assignments/660/Project/Numeric Data/art_foreign_numeric.csv" ) feature = pd.DataFrame(train['Genre']) train = train.drop('Genre', axis=1) from sklearn.cross_validation import train_test_split X_train, X_test, y_train, y_test = train_test_split(train, feature, test_size=0.30) from sklearn.neural_network import MLPClassifier clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1) clf.fit(X_train, y_train) res = clf.predict(X_test) from sklearn.metrics import accuracy_score print(accuracy_score(res, y_test))
matrix = np.array(data, dtype=int) # For the model selection part, we will not work on all the data set, this could take too much time. training_length = 40200 training_set = matrix[:training_length, 1:] / 255.0 labels_training = matrix[:training_length, 0] validation_length = 19800 validation_set = matrix[training_length:training_length + validation_length, 1:] / 255.0 labels_validation = matrix[training_length:training_length + validation_length, 0] X, y = training_set, labels_training # Now, since our data set is ready, we can find the best random weights #We manually evaluate the accuracy and zero-one-loss after 10 iterations for both the training and the validation set for seed in [1, 26, 42, 67, 123]: mlp = MLPClassifier(hidden_layer_sizes=(100), alpha=0.05, max_iter=200, random_state=seed) mlp.fit(X, y) labels_training_pred = mlp.predict(X) labels_validation_pred = mlp.predict(validation_set) print("Seed: {}, accuracy training: {}, accuracy validation: {}".format( seed, accuracy_score(y, labels_training_pred), accuracy_score(labels_validation, labels_validation_pred)))
import pickle import pandas as pd import matplotlib.pyplot as plt from sklearn.neural_network import MLPClassifier from sklearn.model_selection import cross_validate from load_data import DATA nn = MLPClassifier( hidden_layer_sizes=(144,), activation="relu", max_iter=1000, early_stopping=True, learning_rate_init=0.01, random_state=0, ) # raw data print("Running NN on raw data...") X, y = DATA["credit"] cv_results = cross_validate( nn, X, y, scoring=["accuracy", "balanced_accuracy", "precision", "recall", "f1", "roc_auc"], cv=10, return_train_score=True, ) res_mean = {key: [] for key in cv_results} res_std = {key: [] for key in cv_results} for key in cv_results: res_mean[key].append(cv_results[key].mean())
def Cross_Validation(X, y):
    # cross_val_score with scoring='accuracy' already returns accuracies in [0, 1];
    # report their mean directly rather than sqrt(abs(...)), which only makes sense
    # for squared-error scores
    model1 = MultinomialNB()
    scores1 = cross_validation.cross_val_score(model1, X, y, cv=5, scoring='accuracy')
    print("Naive Bayes with 5-fold cross-validation accuracy:", np.mean(scores1))
    model2 = GradientBoostingClassifier()
    scores2 = cross_validation.cross_val_score(model2, X, y, cv=5, scoring='accuracy')
    print("Gradient Boosting with 5-fold cross-validation accuracy:", np.mean(scores2))
    model3 = SVC()
    scores3 = cross_validation.cross_val_score(model3, X, y, cv=5, scoring='accuracy')
    print("SVC with 5-fold cross-validation accuracy:", np.mean(scores3))
    model4 = LogisticRegression()
    scores4 = cross_validation.cross_val_score(model4, X, y, cv=5, scoring='accuracy')
    print("Logistic Regression with 5-fold cross-validation accuracy:", np.mean(scores4))
    model5 = KNeighborsClassifier(n_neighbors=2)
    scores5 = cross_validation.cross_val_score(model5, X, y, cv=5, scoring='accuracy')
    print("K-Neighbors Classifier with 5-fold cross-validation accuracy:", np.mean(scores5))
    model6 = ExtraTreesClassifier()
    scores6 = cross_validation.cross_val_score(model6, X, y, cv=5, scoring='accuracy')
    print("Extra-Trees Classifier with 5-fold cross-validation accuracy:", np.mean(scores6))
    model7 = DecisionTreeClassifier()
    scores7 = cross_validation.cross_val_score(model7, X, y, cv=5, scoring='accuracy')
    print("Decision Tree Classifier with 5-fold cross-validation accuracy:", np.mean(scores7))
    model8 = MLPClassifier(solver='adam', alpha=0.01, hidden_layer_sizes=(10, 10))
    scores8 = cross_validation.cross_val_score(model8, X, y, cv=5, scoring='accuracy')
    print("MLP classifier with 5-fold cross-validation accuracy:", np.mean(scores8))
keras.layers.Dense(60, activation=tf.nn.relu), keras.layers.Dense(1, activation=tf.nn.sigmoid), ]) model.summary() model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy']) model.fit(X_train, y_train, epochs=10, batch_size=32) test_loss, test_acc = model.evaluate(X_test, y_test) print('Test accuracy:', test_acc) print('Test loss:', test_loss) from sklearn.svm import SVC from sklearn import metrics svc=SVC() #Default hyperparameters svc.fit(X_train,y_train) y_pred=svc.predict(X_test) print('Accuracy Score:') print(metrics.accuracy_score(y_test,y_pred)) clf = MLPClassifier(hidden_layer_sizes=(100,100,100), max_iter=50, alpha=0.0001, solver='sgd', verbose=10, random_state=21,tol=0.000000001) clf.fit(X_train, y_train) y_pred = clf.predict(X_test) print(accuracy_score(y_test, y_pred)) cm = confusion_matrix(y_test, y_pred) cm sns.heatmap(cm, center=True) plt.show()
titles_ser = pd.Series(clustering_tree[cluster]["child_titles"], dtype=str) cluster_label_enc = title_encoding.index( titles_ser.value_counts().index[0]) # Matrix cluster_matrix = data_tfidf_matrix[clustering_tree[cluster] ["child_indices"]] output_column = np.array([[cluster_label_enc] * cluster_matrix.shape[0] ]).reshape(-1, 1) cluster_matrix = np.concatenate((cluster_matrix, output_column), axis=1) np.random.shuffle(cluster_matrix) training_data_list.append(cluster_matrix[:int(cfg.train_test_frac * cluster_matrix.shape[0]), :]) training_data_matrix = np.concatenate( [matrix for matrix in training_data_list], axis=0) np.random.shuffle(training_data_matrix) X_train, y_train = training_data_matrix[:, :-1], training_data_matrix[:, -1] scaler = StandardScaler() X_train = scaler.fit_transform(X_train) print("Fitting model") print(X_train.shape) mlp = MLPClassifier( hidden_layer_sizes=(X_train.shape[1], int((2 / 3) * X_train.shape[1]), len(data_pipeline.label_encoder.classes_)), max_iter=1000, verbose=True) mlp.fit(X_train, y_train) dump(mlp, cfg.binary_path + "MLPClassifier_model.joblib")
X['Created'] = X['Created'].map(gettime) X['dFollowers'] = (X['Followers at Posting'].diff( periods=-3)) / (X['Created'].diff(periods=-3)) X['Sentiment'] = X['Description'].map(getsent) X['Punctuation'] = X['Description'].str.count('!!!|ebron|rving|urry|iannis|arden|Why') \ + 2*X['Description'].str.count('@|#|ames') X['Description'] = X['Description'].str.len() X['dTime'] = X['Created'].diff(periods=-3) # Splits the data into training and testing sets, and resolves NaNs X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1) X_train = X_train.fillna(X_train.mean()) y_train = y_train.fillna(y_train.mean()) X_test = X_test.fillna(X_test.mean()) y_test = y_test.fillna(y_test.mean()) # Scales the feature set for MLP sensitivity scaler = StandardScaler() scaler.fit(X_train) X_train = scaler.transform(X_train) X_test = scaler.transform(X_test) # Creates the Classifier and fits it to training data mlp = MLPClassifier(hidden_layer_sizes=(100, 100, 100, 100, 100), max_iter=10000, learning_rate='adaptive') mlp.fit(X_train, y_train.values.ravel()) # Pickles MLP for use with MLP_Creator.py pickle.dump(mlp, open('MLP_EC', 'wb'))
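# The fitted MLP is pickled above for reuse in MLP_Creator.py. Loading it later looks like
# the sketch below. Note that new samples must be scaled with the same StandardScaler used
# for training, so in practice the scaler should be persisted too; the original script only
# pickles the MLP, so that step is an assumption here.
import pickle

with open('MLP_EC', 'rb') as f:
    mlp_loaded = pickle.load(f)
# scaled_features = scaler.transform(new_features)
# predictions = mlp_loaded.predict(scaled_features)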
predicted_test_knn = knn_clf.predict(test_data) # Train SVM classifier svc_clf = svm.SVC(gamma='auto', kernel='rbf', decision_function_shape='ovo', max_iter=-1, probability=False, random_state=None, shrinking=True, tol=0.001, verbose=False).fit(train_data, train_labels) predicted_test_svc = svc_clf.predict(test_data) nn_clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(15, ), random_state=1).fit(train_data, train_labels) predicted_test_nn = nn_clf.predict(test_data) fpr_rf, recall_rf = metrics_cal(test_labels, predicted_test_rf) fpr_dt, recall_dt = metrics_cal(test_labels, predicted_test_dt) fpr_knn, recall_knn = metrics_cal(test_labels, predicted_test_knn) fpr_svc, recall_svc = metrics_cal(test_labels, predicted_test_svc) fpr_nn, recall_nn = metrics_cal(test_labels, predicted_test_nn) print('Detection rate | False alarm rate ') print(recall_dt, fpr_dt) print(recall_rf, fpr_rf) print(recall_knn, fpr_knn) print(recall_svc, fpr_svc) print(recall_nn, fpr_nn)
def classBonus(filename):
    ''' This function performs the bonus experiment, exploring all the classifiers

    Parameters
       filename : string, the name of the npz file from Task 2
    '''
    decisionTreeResult = []
    AdaBoostResult = []
    MLPResult = []
    RandomForResult = []
    # load data
    data = np.load(filename)
    data = data['arr_0']
    # getting y value
    X = data[:, :-1]
    y = data[:, -1]
    # splitting data into test and training 20%,80%
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=66)
    maxDep = range(1, 16)
    # Random Forest performance may be different for each train
    for depth in maxDep:
        print("Depth: " + str(depth))
        model = RandomForestClassifier(max_depth=depth, n_estimators=10)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        C = confusion_matrix(y_test, y_pred)
        print(C)
        output = ["RandomForestClassifier"] + [depth] + [
            accuracy(C)
        ] + recall(C) + precision(C) + np.ravel(C).tolist()
        RandomForResult.append(output)
    aList = [1, 0.8, 0.6, 0.4, 0.2, 0.1, 0.05, 0.025, 0.01]
    # MLP performance may be different for each train
    for alpha in aList:
        print("Alpha: " + str(alpha))
        # use the alpha value from the loop rather than a fixed 0.05
        model = MLPClassifier(alpha=alpha)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        C = confusion_matrix(y_test, y_pred)
        print(C)
        output = ["MLPClassifier"] + [alpha] + [
            accuracy(C)
        ] + recall(C) + precision(C) + np.ravel(C).tolist()
        MLPResult.append(output)
    learnRate = [0.1, 0.25, 0.5, 0.75, 1.0, 1.25, 1.5, 1.75, 2.0]
    # AdaBoost
    for rate in learnRate:
        print("learnRate: " + str(rate))
        model = AdaBoostClassifier(learning_rate=rate)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        C = confusion_matrix(y_test, y_pred)
        print(C)
        output = ["AdaBoost"] + [rate] + [
            accuracy(C)
        ] + recall(C) + precision(C) + np.ravel(C).tolist()
        AdaBoostResult.append(output)
    maxFeatList = ['log2', 'sqrt', None]
    # Decision Tree
    for feat in maxFeatList:
        print("max Feats: " + str(feat))
        model = DecisionTreeClassifier(random_state=66, max_features=feat)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        C = confusion_matrix(y_test, y_pred)
        print(C)
        output = ["Decision Tree"] + [feat] + [
            accuracy(C)
        ] + recall(C) + precision(C) + np.ravel(C).tolist()
        decisionTreeResult.append(output)
    bestAccuracy = -1
    result = [decisionTreeResult, AdaBoostResult, MLPResult, RandomForResult]
    with open('a1_bonus.csv', 'w', newline='') as csvFile:
        csvWriter = csv.writer(csvFile, delimiter=',')
        for r in result:
            csvWriter.writerows(r)
y = to_categorical(y) # scaling of features to fit range 0-1 x = MinMaxScaler().fit_transform(x) # shuffle and split into training and test x, x_test, y, y_test = train_test_split(x, y, test_size=0.25, shuffle=True, random_state=42) # fit multi-layer perceptron classifier mlp = MLPClassifier(hidden_layer_sizes=(64, 48, 10), activation='relu', solver='adam', max_iter=250, random_state=42, verbose=True) mlp = mlp.fit(x, y) # output mean mlp accuracy on test data mlp_accuracy = mlp.score(x_test, y_test) print('sklearn', mlp_accuracy) # calculcate confusion matrix for predicted labels label_pred = np.argmax(mlp.predict(x_test), axis=1) label_true = np.argmax(y_test, axis=1) cf_matrix = confusion_matrix(label_true, label_pred) print(cf_matrix) # construct multi-layer keras network
def main(num):
    import pandas as pd
    col_names = [
        'DNS', 'TCP', 'HTTP', 'BROWSER', 'IGMPv3', 'SSDP', 'NBSS', 'NBNS', 'SMB',
        'LANMAN', 'IRC', 'SSL', 'SSLv2', 'SSLv3', 'TLSv1', 'SMTP', 'SMTP|IMF',
        'VICP', 'HTTP/XML', 'ICMP', 'Packets Sent', 'Packets Received',
        'Bytes Sent', 'Bytes Received', 'Country', 'Label'
    ]
    file = pd.read_csv("final_features.csv", )
    #feature_cols = [ 'TCP', 'HTTP', 'SSL', 'Country']
    num = 25
    feature_cols = [
        'DNS', 'TCP', 'HTTP', 'ICMP', 'Packets Sent', 'Packets Received',
        'Bytes Sent', 'Bytes Received', 'Country'
    ]
    X = file[feature_cols]
    Y = file.Label
    from sklearn.model_selection import train_test_split
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=num)
    # Neural network (MLP) classifier
    from sklearn.neural_network import MLPClassifier
    model = MLPClassifier(hidden_layer_sizes=(100, 4), random_state=num)
    model.fit(X_train, Y_train)
    Y_pred = model.predict(X_test)
    from sklearn.metrics import classification_report, confusion_matrix
    matrix = confusion_matrix(Y_test, Y_pred)
    print(matrix)
    print(classification_report(Y_test, Y_pred))
    from sklearn import metrics
    print("Accuracy:", metrics.accuracy_score(Y_test, Y_pred))
    print("Precision:", metrics.precision_score(Y_test, Y_pred))
    print("Recall:", metrics.recall_score(Y_test, Y_pred))
    import numpy as np
    import matplotlib.pyplot as plt
    import seaborn as sns
    class_names = [0, 1]
    fig, ax = plt.subplots()
    tick_marks = np.arange(len(class_names))
    plt.xticks(tick_marks, class_names)
    plt.yticks(tick_marks, class_names)
    cnf_matrix = confusion_matrix(Y_test, Y_pred)
    sns.heatmap(pd.DataFrame(cnf_matrix), annot=True, cmap="YlGnBu", fmt='g')
    plt.tight_layout()
    plt.title('Confusion matrix', y=1.1)
    plt.ylabel('Actual label')
    plt.xlabel('Predicted label')
    plt.text(0.5, 257.44, 'Predicted label')
# Split the data, define an MLP classifier, and load a previously trained model
# split training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=3)
# Create MLP classifier and define hyperparameters
model = MLPClassifier(verbose=1, learning_rate_init=0.5,
                      hidden_layer_sizes=(344, 172,),
                      batch_size=500, learning_rate='adaptive',
                      activation='relu', solver='sgd', max_iter=200)
#
# model = OneVsRestClassifier(MLPClassifier(verbose=1, learning_rate_init=0.01, hidden_layer_sizes=(256, 256, 256), batch_size=200,
#                             learning_rate='adaptive', activation='sigmoid', solver='sgd', max_iter=500))
#
# model = TPOTClassifier(generations=5, population_size=50, verbosity=3)
#
# # Fit the classifier to the data
# model.fit(X_train, y_train)
model = joblib.load('saved_model_3.pkl')
import re import tqdm import jieba import json import chardet ban_word = open('ban_word.txt', encoding='utf8').read().split('\n') classifiers = [ KNeighborsClassifier(), SVC(kernel="linear", C=0.025), SVC(gamma=2, C=1), GaussianProcessClassifier(1.0 * RBF(1.0)), DecisionTreeClassifier(max_depth=5), RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1), MLPClassifier(alpha=1, max_iter=1000), AdaBoostClassifier(), GaussianNB(), QuadraticDiscriminantAnalysis()] def cal_tfidf(data): res1 = ' '.join(jieba.lcut(data)) corpus = [res1] vector = TfidfVectorizer(stop_words=ban_word) try: tfidf = vector.fit_transform(corpus) except: return 0 return res1
for i in range(7, 11):
    img1 = np.column_stack((img1, gimg(i)))
img = 1 - np.row_stack((img, img1))
plt.imshow(img)
plt.show()

# select the training data size, using 75% of the source
train_size = int(X.shape[0] * .75)

# perform principal component analysis and keep 50 features
pca = PCA(n_components=50)
training_data = pca.fit_transform(X[:train_size], y[:train_size])

# create a simple neural network and train it
clf = MLPClassifier(solver='lbfgs', alpha=1e-5, activation='relu',
                    max_iter=3000, hidden_layer_sizes=(30, ), random_state=1)
clf.fit(training_data, y[:train_size].ravel())

# get predictions for the data not used to fit the classifier
predicted = clf.predict(pca.transform(X[train_size:]))
actual = y[train_size:]
print(metrics.classification_report(actual, predicted))
print(metrics.confusion_matrix(actual, predicted))

joblib.dump(pca, '../trained/sklearn_pca.pkl')
joblib.dump(clf, '../trained/sklearn_neural_network.pkl')
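# The two-step PCA + MLP flow above can also be expressed as a single Pipeline, so the
# PCA projection is applied automatically at predict time and only one object has to be
# persisted. A sketch with the same hyperparameters, reusing X, y, and train_size from above:
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.neural_network import MLPClassifier

pca_mlp = Pipeline([
    ('pca', PCA(n_components=50)),
    ('mlp', MLPClassifier(solver='lbfgs', alpha=1e-5, activation='relu',
                          max_iter=3000, hidden_layer_sizes=(30, ), random_state=1)),
])
pca_mlp.fit(X[:train_size], y[:train_size].ravel())
predicted = pca_mlp.predict(X[train_size:])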
def _build_estimator(Y_train, method, cv, cv_scoring, cv_n_folds, **options): if cv: #from sklearn.cross_validation import StratifiedKFold #cv_obj = StratifiedKFold(n_splits=cv_n_folds, shuffle=False) cv_obj = cv_n_folds # temporary hack (due to piclking issues otherwise, this needs to be fixed) else: cv_obj = None _rename_main_thread() if method == 'LinearSVC': from sklearn.svm import LinearSVC if cv is None: cmod = LinearSVC(**options) else: try: from freediscovery_extra import make_linearsvc_cv_model except ImportError: raise OptionalDependencyMissing('freediscovery_extra') cmod = make_linearsvc_cv_model(cv_obj, cv_scoring, **options) elif method == 'LogisticRegression': from sklearn.linear_model import LogisticRegression if cv is None: cmod = LogisticRegression(**options) else: try: from freediscovery_extra import make_logregr_cv_model except ImportError: raise OptionalDependencyMissing('freediscovery_extra') cmod = make_logregr_cv_model(cv_obj, cv_scoring, **options) elif method == 'NearestCentroid': cmod = NearestCentroidRanker() elif method == 'NearestNeighbor': cmod = NearestNeighborRanker() elif method == 'xgboost': try: import xgboost as xgb except ImportError: raise OptionalDependencyMissing('xgboost') if cv is None: try: from freediscovery_extra import make_xgboost_model except ImportError: raise OptionalDependencyMissing('freediscovery_extra') cmod = make_xgboost_model(cv_obj, cv_scoring, **options) else: try: from freediscovery_extra import make_xgboost_cv_model except ImportError: raise OptionalDependencyMissing('freediscovery_extra') cmod = make_xgboost_cv_model(cv, cv_obj, cv_scoring, **options) elif method == 'MLPClassifier': if cv is not None: raise NotImplementedFD('CV not supported with MLPClassifier') from sklearn.neural_network import MLPClassifier cmod = MLPClassifier(solver='adam', hidden_layer_sizes=10, max_iter=200, activation='identity', verbose=0) else: raise WrongParameter('Method {} not implemented!'.format(method)) return cmod
def Baselin_predict(mask): df_pred_x = df_test.drop(["label", 0], 1) df_pred_x = df_pred_x.loc[:, mask] x_pred = np.array(df_pred_x) CU_X, Y = feature_selection(mask) # rbfsvm = svm.SVC() # lsvm = svm.LinearSVC() mlp = MLPClassifier(hidden_layer_sizes=(100, 100, 100, 100), max_iter=2000) skf = StratifiedKFold(n_splits=4, shuffle=True, random_state=0) fold_accuracy = [] scaler = StandardScaler() tfidf = TfidfTransformer(norm=None) dense = Data_Utils.DenseTransformer() for train, test in skf.split(CU_X, Y): # train split CU_train_data = CU_X[train] train_labels = Y[train] # test split CU_eval_data = CU_X[test] eval_labels = Y[test] # tf-idf tfidf.fit(CU_train_data) CU_train_data = dense.transform(tfidf.transform(CU_train_data)) CU_eval_data = dense.transform(tfidf.transform(CU_eval_data)) # standardization scaler.fit(CU_train_data) CU_train_data = scaler.transform(CU_train_data) CU_eval_data = scaler.transform(CU_eval_data) # normalization CU_train_data = normalize(CU_train_data) CU_eval_data = normalize(CU_eval_data) train_data = CU_train_data eval_data = CU_eval_data # evaluation # rbfsvm.fit(train_data, train_labels) # lsvm.fit(train_data, train_labels) mlp.fit(train_data, train_labels) # rbfsvm_acc = rbfsvm.score(eval_data, eval_labels) # lsvm_acc = lsvm.score(eval_data, eval_labels) mlp_acc = mlp.score(eval_data, eval_labels) fold_accuracy.append(mlp_acc) print("accuracy Measure", np.mean(fold_accuracy)) CU_pred_data = dense.transform(tfidf.transform(x_pred)) CU_pred_data = scaler.transform(CU_pred_data) CU_pred_data = normalize(CU_pred_data) pred = [mlp.predict(i.reshape(1, -1))[0] for i in CU_pred_data] df_test["pred"] = pred df_out = df_test[[0, "pred"]] df_res = df_out.sort_values(by=[0]) df_res.to_csv("AdversarialTestResults.txt", header=None, index=None, sep=' ')
def class33(X_train, X_test, y_train, y_test, i, X_1k, y_1k):
    ''' This function performs experiment 3.3

    Parameters:
       X_train: NumPy array, with the selected training features
       X_test: NumPy array, with the selected testing features
       y_train: NumPy array, with the selected training classes
       y_test: NumPy array, with the selected testing classes
       i: int, the index of the supposed best classifier (from task 3.1)
       X_1k: numPy array, just 1K rows of X_train (from task 3.2)
       y_1k: numPy array, just 1K rows of y_train (from task 3.2)
    '''
    kList = [5, 10, 20, 30, 40, 50]
    csvResult = []
    pval1 = []
    pval32 = []
    # find the p-values of the best k features for the 1K and 32K training sets;
    # use a separate loop variable so the classifier index `i` is not clobbered
    for k in kList:
        selector = SelectKBest(f_classif, k=k)
        X_new = selector.fit_transform(X_1k, y_1k)
        pp = sorted(selector.pvalues_)
        pval1.append(pp[:k])
    print(pval1)
    for k in kList:
        selector = SelectKBest(f_classif, k=k)
        X_new = selector.fit_transform(X_train, y_train)
        pp = sorted(selector.pvalues_)
        pval32.append(pp[:k])
        csvResult.append([k] + pp)
    print(pval32)
    # 1K and 32K training sets with the best 5 features
    selector = SelectKBest(f_classif, k=5)
    X_train1k = selector.fit_transform(X_1k, y_1k)
    X_test1k = selector.transform(X_test)
    print(X_train1k)
    selector = SelectKBest(f_classif, k=5)
    X_train32k = selector.fit_transform(X_train, y_train)
    X_test32k = selector.transform(X_test)
    print(X_train32k)
    if i == 1:
        model = LinearSVC(max_iter=10000)
    elif i == 2:
        model = SVC(max_iter=10000, gamma=2)
    elif i == 3:
        model = RandomForestClassifier(max_depth=5, n_estimators=10)
    elif i == 4:
        model = MLPClassifier(alpha=0.05)
    else:
        model = AdaBoostClassifier()
    accuracies = []
    model.fit(X_train1k, y_1k)
    y_predict1k = model.predict(X_test1k)
    accuracies.append(
        accuracy(confusion_matrix(y_test, y_predict1k, labels=[0, 1, 2, 3])))
    model.fit(X_train32k, y_train)
    y_predict32k = model.predict(X_test32k)
    accuracies.append(
        accuracy(confusion_matrix(y_test, y_predict32k, labels=[0, 1, 2, 3])))
    csvResult.append(accuracies)
    with open("a1_3.3.csv", "w", newline="") as csvFile:
        csvWriter = csv.writer(csvFile)
        csvWriter.writerows(csvResult)