def test_training_continuation(self):
    digits_2class = load_digits(2)
    digits_5class = load_digits(5)

    X_2class = digits_2class['data']
    y_2class = digits_2class['target']

    X_5class = digits_5class['data']
    y_5class = digits_5class['target']

    dtrain_2class = xgb.DMatrix(X_2class, label=y_2class)
    dtrain_5class = xgb.DMatrix(X_5class, label=y_5class)

    gbdt_01 = xgb.train(self.xgb_params_01, dtrain_2class, num_boost_round=10)
    ntrees_01 = len(gbdt_01.get_dump())
    assert ntrees_01 == 10

    gbdt_02 = xgb.train(self.xgb_params_01, dtrain_2class, num_boost_round=0)
    gbdt_02.save_model('xgb_tc.model')

    gbdt_02a = xgb.train(self.xgb_params_01, dtrain_2class,
                         num_boost_round=10, xgb_model=gbdt_02)
    gbdt_02b = xgb.train(self.xgb_params_01, dtrain_2class,
                         num_boost_round=10, xgb_model="xgb_tc.model")
    ntrees_02a = len(gbdt_02a.get_dump())
    ntrees_02b = len(gbdt_02b.get_dump())
    assert ntrees_02a == 10
    assert ntrees_02b == 10

    assert mean_squared_error(y_2class, gbdt_01.predict(dtrain_2class)) == \
        mean_squared_error(y_2class, gbdt_02a.predict(dtrain_2class))
    assert mean_squared_error(y_2class, gbdt_01.predict(dtrain_2class)) == \
        mean_squared_error(y_2class, gbdt_02b.predict(dtrain_2class))

    gbdt_03 = xgb.train(self.xgb_params_01, dtrain_2class, num_boost_round=3)
    gbdt_03.save_model('xgb_tc.model')

    gbdt_03a = xgb.train(self.xgb_params_01, dtrain_2class,
                         num_boost_round=7, xgb_model=gbdt_03)
    gbdt_03b = xgb.train(self.xgb_params_01, dtrain_2class,
                         num_boost_round=7, xgb_model="xgb_tc.model")
    ntrees_03a = len(gbdt_03a.get_dump())
    ntrees_03b = len(gbdt_03b.get_dump())
    assert ntrees_03a == 10
    assert ntrees_03b == 10

    assert mean_squared_error(y_2class, gbdt_03a.predict(dtrain_2class)) == \
        mean_squared_error(y_2class, gbdt_03b.predict(dtrain_2class))

    gbdt_04 = xgb.train(self.xgb_params_02, dtrain_2class, num_boost_round=3)
    assert gbdt_04.best_ntree_limit == (gbdt_04.best_iteration + 1) * self.num_parallel_tree
    assert mean_squared_error(y_2class, gbdt_04.predict(dtrain_2class)) == \
        mean_squared_error(y_2class, gbdt_04.predict(dtrain_2class,
                                                     ntree_limit=gbdt_04.best_ntree_limit))

    gbdt_04 = xgb.train(self.xgb_params_02, dtrain_2class,
                        num_boost_round=7, xgb_model=gbdt_04)
    assert gbdt_04.best_ntree_limit == (gbdt_04.best_iteration + 1) * self.num_parallel_tree
    assert mean_squared_error(y_2class, gbdt_04.predict(dtrain_2class)) == \
        mean_squared_error(y_2class, gbdt_04.predict(dtrain_2class,
                                                     ntree_limit=gbdt_04.best_ntree_limit))

    gbdt_05 = xgb.train(self.xgb_params_03, dtrain_5class, num_boost_round=7)
    assert gbdt_05.best_ntree_limit == (gbdt_05.best_iteration + 1) * self.num_parallel_tree
    gbdt_05 = xgb.train(self.xgb_params_03, dtrain_5class,
                        num_boost_round=3, xgb_model=gbdt_05)
    assert gbdt_05.best_ntree_limit == (gbdt_05.best_iteration + 1) * self.num_parallel_tree

    assert not np.any(gbdt_05.predict(dtrain_5class) !=
                      gbdt_05.predict(dtrain_5class,
                                      ntree_limit=gbdt_05.best_ntree_limit))
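# A minimal standalone sketch of the training-continuation pattern exercised
# above. Hedged: assumes the same era of the xgboost/sklearn APIs that the
# test itself uses (xgb.train with the xgb_model parameter, load_digits(2));
# the parameter values are illustrative only.
import xgboost as xgb
from sklearn.datasets import load_digits

digits = load_digits(2)
dtrain = xgb.DMatrix(digits['data'], label=digits['target'])
params = {'objective': 'binary:logistic', 'eta': 0.3, 'max_depth': 3}

booster = xgb.train(params, dtrain, num_boost_round=3)   # first 3 rounds
booster = xgb.train(params, dtrain, num_boost_round=7,
                    xgb_model=booster)                   # continue for 7 more
assert len(booster.get_dump()) == 10                     # 3 + 7 trees in total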
def load_data(digits=[]):
    """
    Loads data from sklearn's digits dataset
    (http://scikit-learn.org/stable/datasets/) and performs preprocessing.
    ----
    Note that the digits dataset has:
        d = 64   (dimensionality)
        m = ~180 (number of instances per class)
        z = 10   (number of classes)

    digits: An np array of the digits you want to train on.
            The digits must be in the range [0, 9].

    Output: Returns the train/test data, digits and targets after preprocessing.
    """
    # Loads the data and the targets, respectively.
    # Note they are indexed the same way, so digits_data[n] corresponds
    # to digits_labels[n] for any n.
    digits_data = pd.DataFrame(datasets.load_digits().data)
    digits_labels = pd.Series(datasets.load_digits().target)

    # If the digits to train on are not specified, pick randomly
    if len(digits) == 0:
        r_digits = list(range(0, 10))  # list() so random.shuffle works on Python 3
        random.shuffle(r_digits)
        # The first 7 shuffled digits (70% of the classes) go to training
        training_digits = set(r_digits[:7])
        testing_digits = set(r_digits[7:])
    else:
        # If they specify digits outside of the range, throw
        if max(digits) > 9 or min(digits) < 0:
            raise ValueError('The dataset only has digits 0-9. The parameter '
                             'passed to load_data had a digit outside of that range')
        if len(digits) >= 10:
            raise ValueError('The dataset only has digits 0-9. You said to train '
                             'on all of them, leaving no testing data')
        all_digits = set(range(10))
        training_digits = set(digits)
        testing_digits = all_digits - training_digits

    # Training data
    raw_train_labels = digits_labels[digits_labels.isin(training_digits)]
    training_data = digits_data.loc[raw_train_labels.index]
    # Maps the labels to 0...n
    training_labels = pd.DataFrame(
        preprocessing.LabelEncoder().fit_transform(raw_train_labels))

    # Testing data
    raw_test_labels = digits_labels[digits_labels.isin(testing_digits)]
    testing_data = digits_data.loc[raw_test_labels.index]
    # Maps the labels to 0...n
    testing_labels = pd.DataFrame(
        preprocessing.LabelEncoder().fit_transform(raw_test_labels))

    processed = collections.namedtuple(
        'processed', ['training_data', 'training_labels', 'testing_data',
                      'testing_labels', 'training_digits', 'testing_digits'])
    return processed(training_data, training_labels, testing_data,
                     testing_labels, training_digits, testing_digits)
def test_load_digits():
    digits = load_digits()
    assert_equal(digits.data.shape, (1797, 64))
    assert_equal(numpy.unique(digits.target).size, 10)

    # test return_X_y option
    X_y_tuple = load_digits(return_X_y=True)
    bunch = load_digits()
    assert_true(isinstance(X_y_tuple, tuple))
    assert_array_equal(X_y_tuple[0], bunch.data)
    assert_array_equal(X_y_tuple[1], bunch.target)
def run(sc): iris = datasets.load_iris() digits = [ datasets.load_digits(), datasets.load_digits()] def learn(x): clf = svm.SVC(gamma=0.001, C=100.) clf.fit(x.data[:-1], x.target[:-1] ) return clf.predict(x.data[-1]) return sc.parallelize(digits).map(learn).collect()
def main(): # http://scikit-learn.org/stable/tutorial/basic/tutorial.html#loading-an-example-dataset # "A dataset is a dictionary-like object that holds all the data and some # metadata about the data. This data is stored in the .data member, which # is a n_samples, n_features array. In the case of supervised problem, one # or more response variables are stored in the .target member." # Toy datasets iris = datasets.load_iris() # The iris dataset (classification) digits = datasets.load_digits() # The digits dataset (classification) #boston = datasets.load_boston() # The boston house-prices dataset (regression) #diabetes = datasets.load_diabetes() # The diabetes dataset (regression) #linnerud = datasets.load_linnerud() # The linnerud dataset (multivariate regression) print(iris.feature_names) print(iris.data) print(iris.target_names) print(iris.target) print(digits.images[0]) print(digits.target_names) print(digits.target) plt.imshow(digits.images[0], cmap='gray', interpolation='nearest') plt.show()
def pytest_funcarg__digits(request): digits = datasets.load_digits() n_samples = len(digits.images) data = digits.images.reshape((n_samples, -1)) ds = Dataset(data, digits.target).scale() ds.test_size = 0.5 return ds.train_test_split()
def handwritingClassTest():
    # Load the data
    digits = datasets.load_digits()
    totalNum = len(digits.data)
    # Use 80% of the samples for training and the remaining 20% for testing
    trainNum = int(0.8 * totalNum)
    trainX = digits.data[0:trainNum]
    trainY = digits.target[0:trainNum]
    testX = digits.data[trainNum:]
    testY = digits.target[trainNum:]

    errorCount = 0
    testExampleNum = len(testX)
    for i in range(testExampleNum):
        # The true label of this test sample
        trueLabel = testY[i]
        classifierResult = classify0(testX[i, :], trainX, trainY, 5)
        print("\nThe classifier came back with: %d, the real answer is: %d"
              % (classifierResult, trueLabel))
        if trueLabel != classifierResult:
            errorCount += 1

    print("\nThe total number of errors is: %d" % errorCount)
    print("\nthe total error rate is: %f" % (errorCount / float(testExampleNum)))
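# classify0 used above is not defined in this snippet (it follows the
# "Machine Learning in Action" kNN examples). A plausible minimal sketch,
# assuming a plain Euclidean-distance k-nearest-neighbors majority vote:
import numpy as np

def classify0(inX, dataSet, labels, k):
    # Euclidean distance from inX to every training sample
    dists = np.sqrt(((dataSet - inX) ** 2).sum(axis=1))
    # labels of the k nearest neighbors
    nearest = np.asarray(labels)[np.argsort(dists)[:k]]
    # majority vote among those neighbors
    values, counts = np.unique(nearest, return_counts=True)
    return values[np.argmax(counts)]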
def test_predict_probability(self): dataset = datasets.load_digits() x_train, x_test, y_train, y_test = train_test_split( dataset.data, dataset.target, train_size=0.7 ) x_train_before = x_train.copy() x_test_before = x_test.copy() y_train_before = y_train.copy() number_of_classes = len(np.unique(dataset.target)) pnnet = algorithms.PNN(verbose=False, std=10) pnnet.train(x_train, y_train) result = pnnet.predict_prob(x_test) n_test_inputs = x_test.shape[0] self.assertEqual(result.shape, (n_test_inputs, number_of_classes)) total_classes_prob = np.round(result.sum(axis=1), 10) np.testing.assert_array_equal( total_classes_prob, np.ones(n_test_inputs) ) old_result = result.copy() # Test problem with variable links np.testing.assert_array_equal(x_train, x_train_before) np.testing.assert_array_equal(x_test, x_test_before) np.testing.assert_array_equal(y_train, y_train_before) x_train[:, :] = 0 result = pnnet.predict_prob(x_test) total_classes_prob = np.round(result.sum(axis=1), 10) np.testing.assert_array_almost_equal(result, old_result)
def test_check_accuracy_on_digits(): # Non regression test to make sure that any further refactoring / optim # of the NB models do not harm the performance on a slightly non-linearly # separable dataset digits = load_digits() X, y = digits.data, digits.target binary_3v8 = np.logical_or(digits.target == 3, digits.target == 8) X_3v8, y_3v8 = X[binary_3v8], y[binary_3v8] # Multinomial NB scores = cross_val_score(MultinomialNB(alpha=10), X, y, cv=10) assert_greater(scores.mean(), 0.86) scores = cross_val_score(MultinomialNB(alpha=10), X_3v8, y_3v8, cv=10) assert_greater(scores.mean(), 0.94) # Bernoulli NB scores = cross_val_score(BernoulliNB(alpha=10), X > 4, y, cv=10) assert_greater(scores.mean(), 0.83) scores = cross_val_score(BernoulliNB(alpha=10), X_3v8 > 4, y_3v8, cv=10) assert_greater(scores.mean(), 0.92) # Gaussian NB scores = cross_val_score(GaussianNB(), X, y, cv=10) assert_greater(scores.mean(), 0.77) scores = cross_val_score(GaussianNB(), X_3v8, y_3v8, cv=10) assert_greater(scores.mean(), 0.86)
def test_multiclass_prediction_early_stopping(self): X, y = load_digits(10, True) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) params = { 'objective': 'multiclass', 'metric': 'multi_logloss', 'num_class': 10, 'verbose': -1 } lgb_train = lgb.Dataset(X_train, y_train, params=params) lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train, params=params) evals_result = {} gbm = lgb.train(params, lgb_train, num_boost_round=50, valid_sets=lgb_eval, verbose_eval=False, evals_result=evals_result) pred_parameter = {"pred_early_stop": True, "pred_early_stop_freq": 5, "pred_early_stop_margin": 1.5} ret = multi_logloss(y_test, gbm.predict(X_test, pred_parameter=pred_parameter)) self.assertLess(ret, 0.8) self.assertGreater(ret, 0.5) # loss will be higher than when evaluating the full model pred_parameter = {"pred_early_stop": True, "pred_early_stop_freq": 5, "pred_early_stop_margin": 5.5} ret = multi_logloss(y_test, gbm.predict(X_test, pred_parameter=pred_parameter)) self.assertLess(ret, 0.2)
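# multi_logloss used above is a helper that is not shown. A minimal sketch of
# one plausible definition (mean multi-class log loss; assumes y_pred is an
# (n_samples, n_classes) probability matrix, as gbm.predict returns here):
import numpy as np

def multi_logloss(y_true, y_pred, eps=1e-15):
    y_true = np.asarray(y_true, dtype=int)
    y_pred = np.clip(np.asarray(y_pred), eps, 1 - eps)
    # pick each sample's predicted probability for its true class
    return -np.mean(np.log(y_pred[np.arange(len(y_true)), y_true]))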
def test_feature_importances(): tm._skip_if_no_sklearn() from sklearn.datasets import load_digits digits = load_digits(2) y = digits['target'] X = digits['data'] xgb_model = xgb.XGBClassifier(seed=0).fit(X, y) exp = np.array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.00833333, 0., 0., 0., 0., 0., 0., 0., 0., 0.025, 0.14166667, 0., 0., 0., 0., 0., 0., 0.00833333, 0.25833333, 0., 0., 0., 0., 0.03333334, 0.03333334, 0., 0.32499999, 0., 0., 0., 0., 0.05, 0.06666667, 0., 0., 0., 0., 0., 0., 0., 0.04166667, 0., 0., 0., 0., 0., 0., 0., 0.00833333, 0., 0., 0., 0., 0.], dtype=np.float32) np.testing.assert_almost_equal(xgb_model.feature_importances_, exp) # numeric columns import pandas as pd y = pd.Series(digits['target']) X = pd.DataFrame(digits['data']) xgb_model = xgb.XGBClassifier(seed=0).fit(X, y) np.testing.assert_almost_equal(xgb_model.feature_importances_, exp) # string columns, the feature order must be kept chars = list('abcdefghijklmnopqrstuvwxyz') X.columns = ["".join(random.sample(chars, 5)) for x in range(64)] xgb_model = xgb.XGBClassifier(seed=0).fit(X, y) np.testing.assert_almost_equal(xgb_model.feature_importances_, exp)
def test_load_digits(): digits = load_digits() assert_equal(digits.data.shape, (1797, 64)) assert_equal(numpy.unique(digits.target).size, 10) # test return_X_y option check_return_X_y(digits, partial(load_digits))
def sklearn_digits( classifier=None ): #estim = hyperopt_estimator( classifier=any_classifier('hai'), algo=tpe.suggest ) if classifier is None: classifier = any_classifier('any') estim = hyperopt_estimator( classifier=classifier ) digits = load_digits() X = digits.data y = digits.target test_size = 50 np.random.seed(0) indices = np.random.permutation(len(X)) X_train = X[ indices[:-test_size]] y_train = y[ indices[:-test_size]] X_test = X[ indices[-test_size:]] y_test = y[ indices[-test_size:]] estim.fit( X_train, y_train ) pred = estim.predict( X_test ) print( pred ) print ( y_test ) print( score( pred, y_test ) ) print( estim.best_model() )
def test_unsorted_indices(): # test that the result with sorted and unsorted indices in csr is the same # we use a subset of digits as iris, blobs or make_classification didn't # show the problem digits = load_digits() X, y = digits.data[:50], digits.target[:50] X_test = sparse.csr_matrix(digits.data[50:100]) X_sparse = sparse.csr_matrix(X) coef_dense = svm.SVC(kernel='linear', probability=True, random_state=0).fit(X, y).coef_ sparse_svc = svm.SVC(kernel='linear', probability=True, random_state=0).fit(X_sparse, y) coef_sorted = sparse_svc.coef_ # make sure dense and sparse SVM give the same result assert_array_almost_equal(coef_dense, coef_sorted.toarray()) X_sparse_unsorted = X_sparse[np.arange(X.shape[0])] X_test_unsorted = X_test[np.arange(X_test.shape[0])] # make sure we scramble the indices assert_false(X_sparse_unsorted.has_sorted_indices) assert_false(X_test_unsorted.has_sorted_indices) unsorted_svc = svm.SVC(kernel='linear', probability=True, random_state=0).fit(X_sparse_unsorted, y) coef_unsorted = unsorted_svc.coef_ # make sure unsorted indices give same result assert_array_almost_equal(coef_unsorted.toarray(), coef_sorted.toarray()) assert_array_almost_equal(sparse_svc.predict_proba(X_test_unsorted), sparse_svc.predict_proba(X_test))
def test_digits():
    from sklearn.cross_validation import train_test_split
    from sklearn.datasets import load_digits
    from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
    from sklearn.preprocessing import LabelBinarizer

    digits = load_digits()
    X = digits.data
    y = digits.target  # labels
    X /= X.max()  # normalize to [0, 1]

    nn = NeuralNetwork([64, 100, 10], 'logistic')  # 8x8 input, 10 outputs
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,
                                                        random_state=0)
    labels_train = LabelBinarizer().fit_transform(y_train)  # convert labels to one-hot vectors
    labels_test = LabelBinarizer().fit_transform(y_test)

    nn.fit(X_train, labels_train, epochs=100)
    predictions = []
    for i in range(X_test.shape[0]):
        o = nn.predict(X_test[i])
        predictions.append(np.argmax(o))

    print(confusion_matrix(y_test, predictions))
    print(classification_report(y_test, predictions))
    print('accuracy at %0.3f' % accuracy_score(y_test, predictions))
def main():
    # parameters to cross-validate over
    parameters = {
        'l2': np.logspace(-5, 0, num=6),
    }

    # load digits data in, make a binary decision problem out of it
    data = load_digits()
    X = Array2Dict().fit_transform(data.data)
    y = 2 * (data.target >= 5) - 1

    i = int(0.8 * len(X))
    X_train, X_test = X[:i], X[i:]
    y_train, y_test = y[:i], y[i:]

    # do the actual learning
    gs = GridSearchCV(
        VW_Classifier(loss='logistic', moniker='example_sklearn',
                      passes=10, silent=True, learning_rate=10),
        param_grid=parameters,
        score_func=f1_score,
        cv=StratifiedKFold(y_train),
    ).fit(X_train, y_train)

    # print out results from cross-validation
    estimator = gs.best_estimator_
    score = gs.best_score_
    print('Achieved a F1 score of %f using l2 == %f during cross-validation'
          % (score, estimator.l2))

    # print confusion matrix on test data
    y_est = estimator.fit(X_train, y_train).predict(X_test)
    print('Confusion Matrix:')
    print(confusion_matrix(y_test, y_est))
def test_plot_learning_curve():
    from sklearn.naive_bayes import GaussianNB
    from sklearn.svm import SVC
    from sklearn.datasets import load_digits

    digits = load_digits()
    X, y = digits.data, digits.target

    title = "Learning Curves (Naive Bayes)"
    # Cross validation with 100 iterations to get smoother mean test and train
    # score curves, each time with 20% of the data randomly selected as a
    # validation set.
    cv = cross_validation.ShuffleSplit(digits.data.shape[0], n_iter=100,
                                       test_size=0.2, random_state=0)
    estimator = GaussianNB()
    plot_learning_curve(estimator, title, X, y, ylim=(0.7, 1.01), cv=cv, n_jobs=4)

    title = r"Learning Curves (SVM, RBF kernel, $\gamma=0.001$)"  # raw string so \g is not treated as an escape
    # SVC is more expensive, so we use a lower number of CV iterations:
    cv = cross_validation.ShuffleSplit(digits.data.shape[0], n_iter=10,
                                       test_size=0.2, random_state=0)
    estimator = SVC(gamma=0.001)
    plot_learning_curve(estimator, title, X, y, (0.7, 1.01), cv=cv, n_jobs=4)

    plt.show()
def test_classification(): from sklearn.datasets import load_digits from sklearn.cross_validation import KFold from sklearn.metrics import normalized_mutual_info_score digits = load_digits() X, y = digits.data, digits.target folds = 3 cv = KFold(y.shape[0], folds) total = 0.0 oo_score_bag = [] for tr, te in cv: mlp = MLPClassifier(use_dropout=True, n_hidden=200, lr=1.) print(mlp) mlp.fit(X[tr], y[tr], max_epochs=100, staged_sample=X[te]) t = normalized_mutual_info_score(mlp.predict(X[te]), y[te]) print("Fold training accuracy: %f" % t) total += t this_score = [] for i in mlp.oo_score: this_score.append(normalized_mutual_info_score(i, y[te])) oo_score_bag.append(this_score) from matplotlib import pyplot as plt plt.plot(oo_score_bag[0]) plt.show() print("training accuracy: %f" % (total / float(folds)))
def test_KMeans_scores(self):
    digits = datasets.load_digits()
    df = pdml.ModelFrame(digits)

    scaled = pp.scale(digits.data)
    df.data = df.data.pp.scale()
    self.assert_numpy_array_almost_equal(df.data.values, scaled)

    clf1 = cluster.KMeans(init='k-means++', n_clusters=10,
                          n_init=10, random_state=self.random_state)
    clf2 = df.cluster.KMeans(init='k-means++', n_clusters=10,
                             n_init=10, random_state=self.random_state)
    clf1.fit(scaled)
    df.fit_predict(clf2)

    expected = m.homogeneity_score(digits.target, clf1.labels_)
    self.assertEqual(df.metrics.homogeneity_score(), expected)

    expected = m.completeness_score(digits.target, clf1.labels_)
    self.assertEqual(df.metrics.completeness_score(), expected)

    expected = m.v_measure_score(digits.target, clf1.labels_)
    self.assertEqual(df.metrics.v_measure_score(), expected)

    expected = m.adjusted_rand_score(digits.target, clf1.labels_)
    self.assertEqual(df.metrics.adjusted_rand_score(), expected)

    expected = m.silhouette_score(scaled, clf1.labels_, metric='euclidean',
                                  sample_size=300,
                                  random_state=self.random_state)
    result = df.metrics.silhouette_score(metric='euclidean', sample_size=300,
                                         random_state=self.random_state)
    self.assertAlmostEqual(result, expected)
def test_training_continuation(self):
    digits = load_digits(2)
    X = digits['data']
    y = digits['target']
    dtrain = xgb.DMatrix(X, label=y)

    gbdt_01 = xgb.train(self.xgb_params, dtrain, num_boost_round=10)
    ntrees_01 = len(gbdt_01.get_dump())
    assert ntrees_01 == 10

    gbdt_02 = xgb.train(self.xgb_params, dtrain, num_boost_round=0)
    gbdt_02.save_model('xgb_tc.model')

    gbdt_02a = xgb.train(self.xgb_params, dtrain, num_boost_round=10,
                         xgb_model=gbdt_02)
    gbdt_02b = xgb.train(self.xgb_params, dtrain, num_boost_round=10,
                         xgb_model="xgb_tc.model")
    ntrees_02a = len(gbdt_02a.get_dump())
    ntrees_02b = len(gbdt_02b.get_dump())
    assert ntrees_02a == 10
    assert ntrees_02b == 10

    assert mean_squared_error(y, gbdt_01.predict(dtrain)) == \
        mean_squared_error(y, gbdt_02a.predict(dtrain))
    assert mean_squared_error(y, gbdt_01.predict(dtrain)) == \
        mean_squared_error(y, gbdt_02b.predict(dtrain))

    gbdt_03 = xgb.train(self.xgb_params, dtrain, num_boost_round=3)
    gbdt_03.save_model('xgb_tc.model')

    gbdt_03a = xgb.train(self.xgb_params, dtrain, num_boost_round=7,
                         xgb_model=gbdt_03)
    gbdt_03b = xgb.train(self.xgb_params, dtrain, num_boost_round=7,
                         xgb_model="xgb_tc.model")
    ntrees_03a = len(gbdt_03a.get_dump())
    ntrees_03b = len(gbdt_03b.get_dump())
    assert ntrees_03a == 10
    assert ntrees_03b == 10

    assert mean_squared_error(y, gbdt_03a.predict(dtrain)) == \
        mean_squared_error(y, gbdt_03b.predict(dtrain))
def test_sklearn_nfolds_cv(): tm._skip_if_no_sklearn() from sklearn.datasets import load_digits from sklearn.model_selection import StratifiedKFold digits = load_digits(3) X = digits['data'] y = digits['target'] dm = xgb.DMatrix(X, label=y) params = { 'max_depth': 2, 'eta': 1, 'silent': 1, 'objective': 'multi:softprob', 'num_class': 3 } seed = 2016 nfolds = 5 skf = StratifiedKFold(n_splits=nfolds, shuffle=True, random_state=seed) cv1 = xgb.cv(params, dm, num_boost_round=10, nfold=nfolds, seed=seed) cv2 = xgb.cv(params, dm, num_boost_round=10, nfold=nfolds, folds=skf, seed=seed) cv3 = xgb.cv(params, dm, num_boost_round=10, nfold=nfolds, stratified=True, seed=seed) assert cv1.shape[0] == cv2.shape[0] and cv2.shape[0] == cv3.shape[0] assert cv2.iloc[-1, 0] == cv3.iloc[-1, 0]
def test_feature_importances():
    tm._skip_if_no_sklearn()
    from sklearn.datasets import load_digits
    digits = load_digits(2)
    y = digits['target']
    X = digits['data']
    xgb_model = xgb.XGBClassifier(seed=0).fit(X, y)

    exp = np.array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.00833333,
                    0., 0., 0., 0., 0., 0., 0., 0., 0.025, 0.14166667, 0.,
                    0., 0., 0., 0., 0., 0.00833333, 0.25833333, 0., 0., 0.,
                    0., 0.03333334, 0.03333334, 0., 0.32499999, 0., 0., 0.,
                    0., 0.05, 0.06666667, 0., 0., 0., 0., 0., 0., 0.,
                    0.04166667, 0., 0., 0., 0., 0., 0., 0., 0.00833333, 0.,
                    0., 0., 0., 0.], dtype=np.float32)

    np.testing.assert_almost_equal(xgb_model.feature_importances_, exp)

    # numeric columns
    import pandas as pd
    y = pd.Series(digits['target'])
    X = pd.DataFrame(digits['data'])
    xgb_model = xgb.XGBClassifier(seed=0).fit(X, y)
    np.testing.assert_almost_equal(xgb_model.feature_importances_, exp)
def main(): digits = load_digits() X = digits.data y = digits.target mds = MDS() X_mds = mds.fit_transform(X) plot_embedding(X_mds, y)
def main(): data = datasets.load_digits() X = normalize(data.data) y = data.target # One-hot encoding of nominal y-values y = to_categorical(y) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, seed=1) # Perceptron clf = Perceptron(n_iterations=5000, learning_rate=0.001, loss=CrossEntropy, activation_function=Sigmoid) clf.fit(X_train, y_train) y_pred = np.argmax(clf.predict(X_test), axis=1) y_test = np.argmax(y_test, axis=1) accuracy = accuracy_score(y_test, y_pred) print ("Accuracy:", accuracy) # Reduce dimension to two using PCA and plot the results Plot().plot_in_2d(X_test, y_pred, title="Perceptron", accuracy=accuracy, legend_labels=np.unique(y))
def main():
    # load digits data in, one-against-all over the 10 classes
    data = load_digits()
    X = Array2Dict().fit_transform(data.data)
    y = data.target + 1

    i = int(0.8 * len(X))
    X_train, X_test = X[:i], X[i:]
    y_train, y_test = y[:i], y[i:]

    # do the actual learning
    m = VW_Classifier(loss='logistic', moniker='example_sklearn',
                      passes=10, silent=True, learning_rate=10,
                      raw=True, oaa=10)
    m.fit(X_train, y_train)

    # print predicted class probabilities on test data
    y_est = m.predict_proba(X_test)
    lines = y_est

    probs = []
    for i, line in enumerate(lines):
        line = line.split()
        labels, vs = zip(*[[float(x) for x in l.split(':')] for l in line[:]])
        probs__ = sigmoid(asarray(vs))
        probs_ = probs__ / probs__.sum()
        probs.append(probs_)
    probs = np.asarray(probs)
    print(probs)
def test_backprop():
    # loading data
    digits = load_digits()
    X = digits['data']
    y = digits['target']

    # dividing into training, validation, and test sets
    nsamples = X.shape[0]
    end_train_idx = int(0.5 * nsamples)
    end_val_idx = int(0.7 * nsamples)
    perm = np.random.permutation(nsamples)
    Xtrain = X[perm[:end_train_idx]]
    Xval = X[perm[end_train_idx:end_val_idx]]
    Xtest = X[perm[end_val_idx:]]
    ytrain = y[perm[:end_train_idx]]
    yval = y[perm[end_train_idx:end_val_idx]]
    ytest = y[perm[end_val_idx:]]

    # data normalization
    mean = Xtrain.mean(0)
    std = Xtrain.std(0)
    std[std == 0] = 1
    Xtrain = (Xtrain - mean) / std
    Xval = (Xval - mean) / std
    Xtest = (Xtest - mean) / std

    # net params
    input_size = Xtrain.shape[1]
    hidden_size = 30
    output_size = np.unique(y).size
    net = Sigmoidal2LayerMLP_WithSoftmax(input_size, hidden_size, output_size,
                                         bias_init=0.0, lr=0.0001,
                                         momen_decay=0.0, l2=0.1)

    x = Xtrain[0]
    yi = ytrain[0]  # label paired with Xtrain[0] (y[0] would not match after the permutation)
    net.forward(x)
    loss = net.backward(yi)
    Wih_grad = net.Wih_grad.copy()
    Who_grad = net.Who_grad.copy()
    hb_grad = net.hb_grad.copy()
    ob_grad = net.ob_grad.copy()

    # numerical gradient check via central finite differences
    e = 1e-6
    for i in range(net.Wih.shape[0]):
        for h in range(net.Wih.shape[1]):
            net.Wih[i, h] += e
            net.forward(x)
            loss1 = net.loss(yi)
            net.Wih[i, h] -= 2 * e
            net.forward(x)
            loss2 = net.loss(yi)
            print('estimated grad W%d_%d = %.4f' % (i, h, (loss1 - loss2) / (2 * e)))
            print('backprop grad = %.4f' % Wih_grad[i, h])
            net.Wih[i, h] += e
def code(): digits = datasets.load_digits() X_digits = digits.data y_digits = digits.target """X_folds = np.array_split(X_digits, 3) y_folds = np.array_split(y_digits, 3)""" svc = svm.SVC(C=1, kernel='linear') """scores = list() for k in range(3): # We use 'list' to copy, in order to 'pop' later on X_train = list(X_folds) X_test = X_train.pop(k) X_train = np.concatenate(X_train) y_train = list(y_folds) y_test = y_train.pop(k) y_train = np.concatenate(y_train) scores.append(svc.fit(X_train, y_train).score(X_test, y_test)) print(scores)""" k_fold = cross_validation.KFold(len(X_digits), n_folds=3) """for train_indices, test_indices in k_fold: print('Train: %s | test: %s' % (train_indices, test_indices))""" #print([svc.fit(X_digits[train], y_digits[train]).score(X_digits[test], y_digits[test]) for train, test in k_fold]) print(cross_validation.cross_val_score(svc, X_digits, y_digits, cv=k_fold,n_jobs=-1))
def test_tutorial(self): """ Verifies we can do what sklearn does here: http://scikit-learn.org/stable/tutorial/basic/tutorial.html """ digits = datasets.load_digits() digits_data = digits.data # for now, we need a column vector rather than an array digits_target = digits.target p = Pipeline() # load data from a numpy dataset stage_data = NumpyRead(digits_data) stage_target = NumpyRead(digits_target) # train/test split stage_split_data = SplitTrainTest(2, test_size=1, random_state=0) # build a classifier stage_clf = wrap_and_make_instance(SVC, gamma=0.001, C=100.) # output to a csv stage_csv = CSVWrite(self._tmp_files('out.csv')) node_data, node_target, node_split, node_clf, node_csv = map( p.add, [ stage_data, stage_target, stage_split_data, stage_clf, stage_csv]) # connect the pipeline stages together node_data['output'] > node_split['input0'] node_target['output'] > node_split['input1'] node_split['train0'] > node_clf['X_train'] node_split['train1'] > node_clf['y_train'] node_split['test0'] > node_clf['X_test'] node_clf['y_pred'] > node_csv['input'] self.run_pipeline(p) result = self._tmp_files.csv_read('out.csv', True) # making sure we get the same result as sklearn clf = SVC(gamma=0.001, C=100.) # The tutorial just splits using array slicing, but we need to make # sure that both UPSG and sklearn are splitting the same way, so we # do something more sophisticated train_X, test_X, train_y, test_y = train_test_split( digits_data, digits_target, test_size=1, random_state=0) clf.fit(train_X, np.ravel(train_y)) control = clf.predict(test_X)[0] self.assertAlmostEqual(result, control) # model persistance s = pickle.dumps(stage_clf) stage_clf2 = pickle.loads(s) self.assertEqual(stage_clf.get_params(), stage_clf2.get_params())
def ModelSelectionTest01():
    from sklearn import datasets, svm
    import numpy as np
    digits = datasets.load_digits()
    X_digits = digits.data
    Y_digits = digits.target
    svc = svm.SVC(C=1, kernel='linear')
    score = svc.fit(X_digits[:-100], Y_digits[:-100]).score(X_digits[-100:], Y_digits[-100:])
    # print(score)

    X_folds = np.array_split(X_digits, 3)
    Y_folds = np.array_split(Y_digits, 3)
    # print(len(X_folds[0]))
    scores = list()
    for k in range(3):
        X_train = list(X_folds)  # X_folds is a list with 3 elements
        X_test = X_train.pop(k)  # the test fold is the k-th element of the train list
        X_train = np.concatenate(X_train)  # this leaves X_train without X_test
        # print(len(X_train))
        Y_train = list(Y_folds)
        Y_test = Y_train.pop(k)
        Y_train = np.concatenate(Y_train)
        scores.append(svc.fit(X_train, Y_train).score(X_test, Y_test))
    # print(scores)

    from sklearn import cross_validation
    k_fold = cross_validation.KFold(n=6, n_folds=3)
    for train_indices, test_indices in k_fold:
        print(train_indices, test_indices)

    k_fold = cross_validation.KFold(len(X_digits), n_folds=3)
    scores = [svc.fit(X_digits[train], Y_digits[train]).score(X_digits[test], Y_digits[test])
              for train, test in k_fold]
    # print(scores)

    scores = cross_validation.cross_val_score(svc, X_digits, Y_digits, cv=k_fold, n_jobs=1)
    # print(scores)

    from sklearn.grid_search import GridSearchCV
    gammas = np.logspace(-6, -1, 10)
    clf = GridSearchCV(estimator=svc, param_grid=dict(gamma=gammas), n_jobs=1)
    clf.fit(X_digits[:1000], Y_digits[:1000])
    print(clf.best_score_)
    print(clf.best_estimator_.gamma)

    from sklearn import linear_model, datasets
    lasso = linear_model.LassoCV()  # unlike plain Lasso, LassoCV picks alpha by cross-validation
    diabetes = datasets.load_diabetes()
    X_diabetes = diabetes.data
    Y_diabetes = diabetes.target
    lasso.fit(X_diabetes, Y_diabetes)
    print(lasso.alpha_)
def test_constraint_removal(): digits = load_digits() X, y = digits.data, digits.target y = 2 * (y % 2) - 1 # even vs odd as +1 vs -1 X = X / 16. pbl = BinarySVMModel(n_features=X.shape[1]) clf_no_removal = OneSlackSSVM(model=pbl, max_iter=500, verbose=1, C=10, inactive_window=0, tol=0.01) clf_no_removal.fit(X, y) clf = OneSlackSSVM(model=pbl, max_iter=500, verbose=1, C=10, tol=0.01, inactive_threshold=1e-8) clf.fit(X, y) # results are mostly equal # if we decrease tol, they will get more similar assert_less(np.mean(clf.predict(X) != clf_no_removal.predict(X)), 0.02) # without removal, have as many constraints as iterations # +1 for true y constraint assert_equal(len(clf_no_removal.objective_curve_) + 1, len(clf_no_removal.constraints_)) # with removal, there are less constraints than iterations assert_less(len(clf.constraints_), len(clf.objective_curve_))
from sklearn.neural_network import MLPClassifier
# scikit-learn tools and bundled datasets
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from TrabalhoIA import NFoldsTrainTest
import seaborn as sea
from matplotlib import pyplot as plt
import pandas as pd

# Datasets for the assignment
bases = {
    "iris": datasets.load_iris(),
    "digits": datasets.load_digits(),
    "wine": datasets.load_wine(),
    "breast cancer": datasets.load_breast_cancer()
}

# Classifiers for running the experiment
classificadores = {
    "ZeroR": (ZeroR, {"discretizar": False}),
    "OneR": (OneR, {"discretizar": True}),
    "OneR Probabilistico": (OneRProb, {"discretizar": True}),
    '''
    X, y = data
    Cs = np.logspace(-2, 2)
    zeros = []
    for C in Cs:
        clf = LinearSVC(C=C, penalty='l1', dual=False)
        clf.fit(X, y)
        ### count the number of zero coefficients ###
        num = 0
        for row in clf.coef_:
            for ele in row:
                if abs(ele) < 1e-5:
                    num += 1
        zeros.append(num)
    ##### plot
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    ax.plot(Cs, zeros)
    ax.set_xlabel("C")
    ax.set_xscale("log")
    ax.set_ylabel("zeros in coef")
    ax.set_title("Sparsity In SVM")
    plt.show()


if __name__ == '__main__':
    # test_SelectFromModel()              # call test_SelectFromModel
    # data = load_diabetes()              # load a dataset for a regression problem
    # test_Lasso(data.data, data.target)  # call test_Lasso
    data = load_digits()                  # load a dataset for a classification problem
    test_LinearSVC(data.data, data.target)  # call test_LinearSVC
                j = k
                error_j = error_k
        # if we entered this loop for the first time, select alpha j randomly
        else:
            j = i
            while j == i:
                j = int(random.uniform(0, self.n))
            error_j = self._cal_ei(j)
        return j, error_j


if __name__ == '__main__':
    # Load the handwritten-digit dataset that ships with sklearn
    digits = load_digits()
    features = digits.data
    targets = (digits.target > 4).astype(int)
    targets[targets == 0] = -1

    # Randomly shuffle the data
    shuffle_indices = np.random.permutation(features.shape[0])
    features = features[shuffle_indices]
    targets = targets[shuffle_indices]

    # Split into training and test sets
    train_count = int(len(features) * 0.8)
    train_xs, train_ys = features[:train_count], targets[:train_count]
    test_xs, test_ys = features[train_count:], targets[train_count:]

    kernel_type = 'gauss'
# -*- coding: utf-8 -*-
"""
Created on Fri Oct  5 14:35:29 2018

@author: rtake
"""

import matplotlib.pyplot as plt
from sklearn import datasets
import numpy as np

# Load the digits data
digits = datasets.load_digits()

# Display the images in a 2-row by 5-column grid
for label, img in zip(digits.target[:10], digits.images[:10]):
    plt.subplot(2, 5, label + 1)  # place in the grid
    plt.axis('off')  # hide the axes
    plt.imshow(img, cmap=plt.cm.gray_r, interpolation='nearest')  # show the image
    plt.title('Digit:{0}'.format(label))  # label each image as "Digit:N"

plt.show()

#%% Build a classifier
# Find the entries for 3 and 8
flag_3_8 = (digits.target == 3) + (digits.target == 8)

# Fetch the 3 and 8 data
images = digits.images[flag_3_8]
labels = digits.target[flag_3_8]
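# The snippet above stops right after selecting the 3/8 subset. A minimal
# sketch of one way the classifier section could continue; the decision-tree
# choice and the 50/50 split are assumptions, not the original author's code.
from sklearn import datasets, tree

digits = datasets.load_digits()
flag_3_8 = (digits.target == 3) + (digits.target == 8)
images = digits.images[flag_3_8]
labels = digits.target[flag_3_8]

n_samples = len(images)
data = images.reshape((n_samples, -1))  # flatten each 8x8 image to 64 features
classifier = tree.DecisionTreeClassifier()
classifier.fit(data[:n_samples // 2], labels[:n_samples // 2])
predicted = classifier.predict(data[n_samples // 2:])
print('accuracy:', (predicted == labels[n_samples // 2:]).mean())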
self.X = (self.X - self.data_mean) / (self.data_std + EPS) else: data_mean = X.mean(axis=0) data_std = X.std(axis=0, ddof=1) X = (X - data_mean) / data_std return X, data_mean, data_std ''' DEMO CODE ''' if __name__ == "__main__": from sklearn import datasets import time data = datasets.load_digits().data som = SOM(DATA=data, alpha_max=0.05, num_units=100, height=10, width=10) #som.train_batch(100) #start = time.time() #som.train_stoch_theano(10) som.train_batch_theano(num_epoch=100) #som.train_stoch(10) #clusters = som.ins_unit_assign #print clusters #stop = time.time() # print som.unit_saliency #som_plot_scatter(som.W, som.X, som.activations) #som_plot_outlier_scatter(som.W, som.X, som.unit_saliency, som.inst_saliency, som.activations)
# coding: utf-8

# In[27]:

from sklearn.datasets import load_digits
from matplotlib import pyplot as plt
get_ipython().magic(u'matplotlib inline')

# In[37]:

digits = load_digits()
print(digits.data.shape)
plt.imshow(digits.images[8])
plt.show()
plt.imshow(digits.images[7])
plt.show()
print(digits.images[8])
print(digits.images[7])

# In[ ]:
def test_digits_type_hints(): data = data_df_from_bunch(load_digits()) data_clean = clean( data, type_hints={"x{}".format(i): 'continuous' for i in range(64)}) assert data_clean.shape[1] == 65
cutdex = np.argmax(harmonics > spectral_cutoff) zeroindex = int(spectra[0].size / 2) linspectra = [] cutspectra = [] newspectra = [] for spectrum in spectra: print(len(spectrum[zeroindex:])) newspectra.append(spectrum[zeroindex:]) linspectra.append(spectrum[zeroindex:lindex]) cutspectra.append(spectrum[lindex:cutdex]) # for j in range(15): # plt.semilogy(harmonics[zeroindex:],newspectra[j]) # plt.show() digits = datasets.load_digits(return_X_y=0) def analysis(X, Y, tuned_parameters, test_fraction=0.8): # Use a support vector machine estimator with a grid search for hyperparameters # split data into training and testing sets train_data, test_data, train_target, test_target = train_test_split( X, Y, test_size=test_fraction) scores = ['precision', 'recall'] for score in scores: print("# Tuning hyper-parameters for %s" % score) print() classifier = GridSearchCV(SVC(),
from time import time

import numpy as np

from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.datasets import load_digits
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split  # some documents still show the old cross_validation module, but it no longer exists as of version 0.18
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
import pylab as pl

########################################################################################################################
##################################### PART A ###########################################################################
########################################################################################################################
np.random.seed(42)  # random seeding is performed

digits = load_digits()  # the whole data set with the labels and other information is extracted
data = scale(digits.data)  # the data is scaled with the use of z-score
n_samples, n_features = data.shape  # the no. of samples and no. of features are determined with the help of shape
n_digits = len(np.unique(digits.target))  # the number of labels is determined with the aid of the unique function
labels = digits.target  # get the ground-truth labels

print(labels)  # the labels are printed on the screen
print(digits.keys())  # this command will provide you the key elements in this dataset
print(digits.DESCR)  # to get the descriptive information about this dataset

pl.gray()  # turns an image into gray scale
pl.matshow(digits.images[0])
pl.show()
from sklearn.externals.six.moves import cStringIO as StringIO from sklearn.metrics import roc_auc_score from sklearn.neural_network import MLPClassifier from sklearn.neural_network import MLPRegressor from sklearn.preprocessing import LabelBinarizer from sklearn.preprocessing import StandardScaler, MinMaxScaler from scipy.sparse import csr_matrix from sklearn.utils.testing import (assert_raises, assert_greater, assert_equal, assert_false, ignore_warnings) from sklearn.utils.testing import assert_raise_message np.seterr(all='warn') ACTIVATION_TYPES = ["identity", "logistic", "tanh", "relu"] digits_dataset_multi = load_digits(n_class=3) X_digits_multi = MinMaxScaler().fit_transform(digits_dataset_multi.data[:200]) y_digits_multi = digits_dataset_multi.target[:200] digits_dataset_binary = load_digits(n_class=2) X_digits_binary = MinMaxScaler().fit_transform( digits_dataset_binary.data[:200]) y_digits_binary = digits_dataset_binary.target[:200] classification_datasets = [(X_digits_multi, y_digits_multi), (X_digits_binary, y_digits_binary)] boston = load_boston()
return data, label X_datasets, Y_datasets = [], [] # load data # X_coil20, y_coil20 = download_coil20(dataset_type='processed') # with open("./data/data_coil20.txt", "wb") as data: # pickle.dump((X_coil20,y_coil20), data) with open("./data/data_coil20.txt", "rb") as data: X_coil20, y_coil20 = pickle.load(data) X_coil20 = X_coil20.reshape((X_coil20.shape[0], -1)) X_datasets.append(X_coil20) Y_datasets.append(y_coil20) X_digits, y_digits = datasets.load_digits(n_class=10, return_X_y=True) X_datasets.append(X_digits) Y_datasets.append(y_digits) dft = pd.read_csv('./data/fashion-mnist_test.csv', dtype=int) # read test data X_fashion = dft.drop('label', axis=1) y_fashion = dft['label'] X_datasets.append(X_fashion) Y_datasets.append(y_fashion) # Set up algorithms methods = OrderedDict() methods['umap'] = umap.UMAP() methods['t-SNE'] = manifold.TSNE(n_components=2, init='pca', random_state=0) fig = plt.figure(figsize=(15, 8))
""" import numpy as np import matplotlib.pyplot as plt from sklearn import linear_model, decomposition, datasets from sklearn.pipeline import Pipeline from sklearn.model_selection import GridSearchCV """ pipeline 打包步骤: 1. pca降维 2. logistic 线性回归 """ digits = datasets.load_digits() # (1797, 64) X_digits = digits.data y_digits = digits.target def fill_pipeline(): plt.figure(1) plt.clf() # plt.axes([.2, .2, .7, .7]) m1_pca = decomposition.PCA() """ fit 与否 只影响 explained_variance_ 生成和展示 对最终 gscv 的 best n_components 选择没影响 (猜测:因为gscv自己最终也要fit)
SVC (support vector classifier) to improve the classification scores.
"""
print(__doc__)

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_digits
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

# #############################################################################
# Import some data to play with
X, y = load_digits(return_X_y=True)
# Throw away data, to be in the curse-of-dimensionality setting
X = X[:200]
y = y[:200]
n_samples = len(y)
X = X.reshape((n_samples, -1))
# add 200 non-informative features
X = np.hstack((X, 2 * np.random.random((n_samples, 200))))

# #############################################################################
# Create a feature-selection transform and an instance of SVM that we
# combine together to have a full-blown estimator
transform = SelectPercentile(chi2)

clf = Pipeline([('anova', transform), ('svc', SVC(gamma="auto"))])
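# The upstream scikit-learn "SVM-Anova" example continues by cross-validating
# the pipeline over a range of selection percentiles; a sketch of that loop,
# reusing clf, X, y, np and plt from the snippet above (the percentile grid
# follows the upstream example and may differ from the original here):
score_means, score_stds = [], []
percentiles = (1, 3, 6, 10, 15, 20, 30, 40, 60, 80, 100)
for percentile in percentiles:
    clf.set_params(anova__percentile=percentile)
    this_scores = cross_val_score(clf, X, y, cv=5)
    score_means.append(this_scores.mean())
    score_stds.append(this_scores.std())

plt.errorbar(percentiles, score_means, np.array(score_stds))
plt.xlabel('Percentile')
plt.ylabel('Accuracy Score')
plt.show()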
anim.save(None, writer=matplotlib_utils.SimpleMovieWriter(0.001)) # # Logistic regression # Your assignment is to implement the logistic regression # # Plan: # * Use a shared variable for weights # * Use a matrix placeholder for `X` # # We shall train on a two-class MNIST dataset # * please note that target `y` are `{0,1}` and not `{-1,1}` as in some formulae # In[30]: from sklearn.datasets import load_digits mnist = load_digits(2) X, y = mnist.data, mnist.target print("y [shape - %s]:" % (str(y.shape)), y[:10]) print("X [shape - %s]:" % (str(X.shape))) # In[31]: print('X:\n', X[:3, :10]) print('y:\n', y[:10]) plt.imshow(X[0].reshape([8, 8])) # It's your turn now! # Just a small reminder of the relevant math: #
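# A minimal plain-NumPy sketch of the logistic-regression assignment described
# above. The original expects a shared weight variable and an X placeholder in
# a framework such as Theano/TensorFlow; this NumPy version is an assumption
# that only demonstrates the same math, reusing X and y from the cell above.
import numpy as np

Xn = X / X.max()           # scale pixels to [0, 1] to keep the sigmoid stable
w = np.zeros(Xn.shape[1])  # the "shared" weight variable
b = 0.0

def predict_proba(data):
    return 1.0 / (1.0 + np.exp(-(data.dot(w) + b)))  # sigmoid(Xw + b)

lr = 0.1
for step in range(1000):
    p = predict_proba(Xn)
    w -= lr * Xn.T.dot(p - y) / len(y)  # gradient of the mean log-loss w.r.t. w
    b -= lr * (p - y).mean()            # ... and w.r.t. b

print('train accuracy:', ((predict_proba(Xn) > 0.5) == y).mean())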
from sklearn import datasets, metrics
from scipy import sparse
from sklearn.cross_validation import train_test_split
# Choose one of the datasets available in scikit-learn
from sklearn import svm
from sklearn.datasets import load_boston, load_digits
import numpy as np
# import warnings
# warnings.filterwarnings("ignore", category=DeprecationWarning)

C = 1.0

# Get the data and response variables of the chosen dataset: digits
digits = load_digits()  # load dataset
a = digits.data
b = digits.target

# Split the data into 20% testing and 80% training (any other ratio also works)
a_train, a_test, b_train, b_test = train_test_split(a, b, test_size=0.2)

# Apply SVC with a linear kernel
model = svm.SVC(kernel='linear')
model.fit(a_train, b_train)
b_pred = model.predict(a_test)
print("Accuracy for linear kernel in SVC " + str(metrics.accuracy_score(b_test, b_pred)))

# Apply SVC with an RBF kernel
model = svm.SVC(kernel='rbf')
model.fit(a_train, b_train)
b_pred = model.predict(a_test)
print("Accuracy for rbf kernel in SVC " + str(metrics.accuracy_score(b_test, b_pred)))
self.estimators.append(weak_clf) def predict(self, X): num_samples = X.shape[0] y_hat = np.zeros((num_samples, )) # iterate each weak clf for clf in self.estimators: _y_hat = np.full((num_samples, ), clf.cls) _y_hat[X[:, clf.feature_index] <= clf.feature_value] *= -1 y_hat += clf.alpha * _y_hat return np.sign(y_hat) if __name__ == '__main__': data = load_digits() digit1 = 1 digit2 = 8 idx = np.append( np.where(data.target == digit1)[0], np.where(data.target == digit2)[0]) y = data.target[idx] y[y == digit1] = -1 y[y == digit2] = 1 X = data.data[idx] X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.7) clf = Adaboost(n_estimators=5) clf.fit(X_train, y_train) y_pred = clf.predict(X_test)
from sklearn import datasets from sklearn import decomposition import matplotlib.pyplot as plt import numpy as np import seaborn from mpl_toolkits.mplot3d import Axes3D #matplotlib notebook mnist = datasets.load_digits() X = mnist.data y = mnist.target pca = decomposition.PCA(n_components=3) new_X = pca.fit_transform(X) fig = plt.figure() ax = fig.gca(projection='3d') ax.scatter(new_X[:, 0], new_X[:, 1], new_X[:, 2], c=y) plt.show()
# -*- coding: utf-8 -*-
"""
Created on Fri Mar  5 21:13:45 2021

@author: GANY
"""

from sklearn import datasets  # import the datasets module from the scikit-learn library

iris = datasets.load_iris()  # load the iris dataset into a variable named iris
digits = datasets.load_digits()  # load the digits dataset into the digits variable

print(digits.data)  # prints the features that can be used to classify the digit samples
digits.target  # the ground-truth labels associated with each sample
digits.images[0]  # the data is a 2D array of shape (n_samples, n_features), although the original data may have a different shape
columns.append("X" + str(i)) columns.append("target") i = 0 tmp = [] for m in matrixList: tmpColumns = np.asarray(m).reshape(-1) addTarget = np.append(tmpColumns, [i]) i = i + 1 tmp.append(addTarget) return pd.DataFrame(tmp, columns=columns) # In[27]: digits = load_digits() #training database from SKLearn tri = prepareTestSet( 'handwrittendigits/mag3_EE6653C8_digits_2017-12-08_14_18_28___0444df099e184398993a95d0f40446fe.csv' ) # In[28]: collectedFiles = [ f for f in listdir('handwrittendigits/') if isfile(join('handwrittendigits/', f)) ] file = collectedFiles.pop(0) trainingData = prepareTestSet('handwrittendigits/' + file) i = 0 for file in reversed(collectedFiles): print('\n' + file + '\n')
""" Train neuronal network to learn handwritten digits from the MNIST database. The MNIST database is composed of 8x8 images of handwritten digits. """ from matplotlib import pylab as plt from sklearn import datasets # Load MNIST database images, targets = datasets.load_digits(return_X_y=True) plt.imshow(images[0].reshape(8, 8), cmap=plt.cm.gray_r, interpolation="nearest") plt.show()
""" Module used to test the plot functions """ import matplotlib import numpy as np import pandas as pd import xgboost as xgb from sklearn import datasets from sklearn.model_selection import train_test_split from hipe4ml import analysis_utils, plot_utils from hipe4ml.model_handler import ModelHandler # data preparation DIGITS_DATA = datasets.load_digits(n_class=2) DIGITS = pd.DataFrame(DIGITS_DATA.data[:, 0:10]) # pylint: disable=E1101 Y_DIGITS = DIGITS_DATA.target # pylint: disable=E1101 SIG_DF = DIGITS[Y_DIGITS == 1] BKG_DF = DIGITS[Y_DIGITS == 0] TRAIN_SET, TEST_SET, Y_TRAIN, Y_TEST = train_test_split(DIGITS, Y_DIGITS, test_size=0.5, random_state=42) DATA = [TRAIN_SET, Y_TRAIN, TEST_SET, Y_TEST] # -------------------------------------------- # training and testing INPUT_MODEL = xgb.XGBClassifier() MODEL = ModelHandler(INPUT_MODEL) MODEL.train_test_model(DATA)
import numpy as np
import matplotlib.pylab as plt
import sklearn.datasets as skdata
import sklearn

numeros = skdata.load_digits()
target = numeros['target']
imagenes = numeros['images']
n_imagenes = len(target)
data = imagenes.reshape((n_imagenes, -1))

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

scaler = StandardScaler()
x_train, x_test, y_train, y_test = train_test_split(data, target, train_size=0.7)


def distancia(x_train, y_train):
    numero = 1
    dd = y_train == numero
    cov = np.cov(x_train[dd].T)
    valores, vectores = np.linalg.eig(cov)
    valores = np.real(valores)
    vectores = np.real(vectores)
    ii = np.argsort(-valores)
    valores = valores[ii]
    vectores = vectores[:, ii]
    # Minimum-distance prediction
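# distancia above is cut off right at the prediction step. A sketch of one
# plausible completion: classify a sample by its reconstruction error in each
# class's principal subspace. This completion is an assumption, not the
# original code; it reuses x_train and y_train from the snippet above.
def clasificar(x, x_train, y_train, n_comp=10):
    errores = []
    for numero in range(10):
        clase = x_train[y_train == numero]
        media = clase.mean(axis=0)
        valores, vectores = np.linalg.eig(np.cov(clase.T))
        ii = np.argsort(-np.real(valores))
        base = np.real(vectores)[:, ii[:n_comp]]  # top principal directions
        centrado = x - media
        recon = base @ (base.T @ centrado)        # projection onto the subspace
        errores.append(np.linalg.norm(centrado - recon))
    return int(np.argmin(errores))  # class whose subspace fits best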
self.X_train['km_clust'] = y_labels_train self.X_test['km_clust'] = y_labels_test elif output == 'replace': self.X_train = y_labels_train[:, np.newaxis] self.X_test = y_labels_test[:, np.newaxis] else: raise ValueError('output should be either add or replace') return self # Prediction using only K-means' clusters: find a number of clusters equal to # the number of classes, replace the dataset by their assignation to cluster. # This is not so bad (accuracy 78%) but way less powerful than SVMs for example. # This is an approximation of the nearest-centroid classifier cl = ClusteringModel(load_digits()).clusterize(output='add').classify() print('X data:\n', cl.X_train) print('y data:\n', cl.y_train) # Prediction wthout using K-means' clusters: just use a SVMs on the digits # dataset. This gives 99.07% accuracy. ClusteringModel(load_digits()).classify() # Prediction using K-means clusters and another classifier: add to the dataset their # cluster of assignation. It does not seem to improve the model: this gives 98.8% # accuracy. Causes may be: # - the classifier already captured all the information and adding a new feature # confuses him
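# A standalone sketch of the "clusters as an extra feature" idea described in
# the comments above. ClusteringModel itself is only partially shown; the SVC
# settings and the choice of 10 clusters here are assumptions matching the text.
import numpy as np
from sklearn.cluster import KMeans
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

X, y = load_digits(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

km = KMeans(n_clusters=10, random_state=0).fit(X_train)
# append each sample's cluster assignment as one extra column
X_train_aug = np.column_stack([X_train, km.predict(X_train)])
X_test_aug = np.column_stack([X_test, km.predict(X_test)])

clf = SVC(gamma=0.001).fit(X_train_aug, y_train)
print('accuracy with cluster feature:', clf.score(X_test_aug, y_test))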
from time import time import numpy as np from scipy import ndimage from matplotlib import pyplot as plt from sklearn import manifold, datasets digits = datasets.load_digits(n_class=10) X = digits.data y = digits.target n_samples, n_features = X.shape np.random.seed(0) def nudge_images(X, y): # Having a larger dataset shows more clearly the behavior of the # methods, but we multiply the size of the dataset only by 2, as the # cost of the hierarchical clustering methods are strongly # super-linear in n_samples shift = lambda x: ndimage.shift( x.reshape((8, 8)), .3 * np.random.normal(size=2), mode='constant', ).ravel() X = np.concatenate([X, np.apply_along_axis(shift, 1, X)]) Y = np.concatenate([y, y], axis=0) return X, Y
def test_inference_batch(): model = train_model() X, _ = datasets.load_digits(return_X_y=True) predictions = model.predict(X[0:100]) assert np.all(predictions < 10)
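# train_model used by this test (and by test_inference_sample below) is a
# fixture that is not shown. A plausible minimal sketch; the estimator choice
# is an assumption, since any classifier fit on the digits data would do:
from sklearn import datasets
from sklearn.linear_model import LogisticRegression

def train_model():
    X, y = datasets.load_digits(return_X_y=True)
    return LogisticRegression(max_iter=1000).fit(X, y)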
def fit_model_to_initial_dataset(dataset_name, classifier, model_name, is_resized, is_grayscale, pca=None): if (dataset_name == 'mnist' or dataset_name == 'fashion_mnist'): digits = datasets.load_digits() if (is_resized): image_shape = (10, 10) else: image_shape = (28, 28) n_samples = len(digits.images) new_images = np.zeros((n_samples, ) + image_shape) #for i in range(len(digits.images)): # new_images[i] = cv2.resize(digits.images[i], dsize=image_shape, interpolation=cv2.INTER_CUBIC) # data_images = digits.images.reshape((n_samples, -1)) data_images = new_images.reshape((n_samples, -1)) d_X, t_X, d_Y, t_Y = train_test_split(data_images, digits.target) elif (dataset_name == 'cifar-10'): train_X, train_Y = load_svhn(is_grayscale) print(train_X.shape, train_Y.shape) n_samples = train_X.shape[0] train_X_new = train_X.reshape((n_samples, -1)) print(train_X_new.shape) d_X, t_X, d_Y, t_Y = train_test_split(train_X_new, train_Y) elif (dataset_name == 'sentiment_analysis'): #data_X, data_Y = np.empty((2, cv_X.shape[1])), np.empty((2, )) #index = 0 #for i in range(cv_Y.shape[0]): # if(cv_Y[i] == 0): # data_X[index] = cv_X[i] # data_Y[index] = cv_Y[i] # index = index + 1 # break #if(index == 3): # break #for i in range(cv_Y.shape[0]): # if(cv_Y[i] == 1): # data_X[index] = cv_X[i] # data_Y[index] = cv_Y[i] # index = index + 1 # break #if(index == 5): # break #data_Y = data_Y.astype('int') #_, _, cv_X, cv_Y, _, _ = load_sentiment_dataset() #data_X = np.random.rand(2, cv_X.shape[1]) data_X = np.random.rand(2, 768) data_Y = np.array([0, 1]) classifier.fit(data_X, data_Y) return classifier if (model_name == 'svm' or model_name == 'knn' or model_name == 'naive_bayes'): if (pca != None): data = pca.fit_transform(d_X) #print(pca.explained_variance_ratio_) print(data.shape) #a = input() data = data[:2500] d_Y = d_Y[:2500] else: data = d_X[:2500] d_Y = d_Y[:2500] classifier.fit(data, d_Y) #elif(model_name == 'knn'): # classifier.fit(d_X, d_Y) elif (model_name == "dt" or model_name == "lr"): d_X = d_X[:2500] d_Y = d_Y[:2500] classifier.train_model(d_X, d_Y, t_X, t_Y) ##print("The size of the Initial dataset on which Model A is trained is: ", d_X.shape) #print("Fitted model A to the initial dataset") return classifier
__author__ = 'Xialei'
__date__ = '2018/4/22 16:10'

from time import time
import numpy as np
import matplotlib.pyplot as plt

from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.datasets import load_digits
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale

np.random.seed(42)  # random seed, used to initialize the cluster centers

digits = load_digits()  # load the data
data = scale(digits.data)  # standardize the data (z-score)

n_samples, n_features = data.shape  # 1797, 64
n_digits = len(np.unique(digits.target))  # 10
labels = digits.target  # ground-truth labels of the data

sample_size = 300

print("n_digits: %d, \t n_samples %d, \t n_features %d"
      % (n_digits, n_samples, n_features))

print(79 * '_')
print('% 9s' % 'init'
      '    time  inertia    homo   compl  v-meas     ARI AMI  silhouette')
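# The header printed above announces a metrics table, but the benchmark body
# is not included in the snippet. A sketch in the spirit of the classic
# scikit-learn k-means example, reusing time, metrics, KMeans, data, labels,
# n_digits and sample_size from above (the exact helper is an assumption):
def bench_k_means(estimator, name, data):
    t0 = time()
    estimator.fit(data)
    print('% 9s   %.2fs   %i   %.3f   %.3f   %.3f   %.3f   %.3f   %.3f'
          % (name, time() - t0, estimator.inertia_,
             metrics.homogeneity_score(labels, estimator.labels_),
             metrics.completeness_score(labels, estimator.labels_),
             metrics.v_measure_score(labels, estimator.labels_),
             metrics.adjusted_rand_score(labels, estimator.labels_),
             metrics.adjusted_mutual_info_score(labels, estimator.labels_),
             metrics.silhouette_score(data, estimator.labels_,
                                      metric='euclidean',
                                      sample_size=sample_size)))

bench_k_means(KMeans(init='k-means++', n_clusters=n_digits, n_init=10),
              name='k-means++', data=data)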
def test_inference_sample(): model = train_model() X, _ = datasets.load_digits(return_X_y=True) prediction = model.predict(X[0:1])[0] assert prediction == 0
if callback: callback(G, S, iter) if compute_err: logging.info("Violations of optimization objective: %d/%d " % ( int(np.sum(np.diff(obj) > 0)), len(obj))) return G, S if 0: # toy example for matrix completion rnds = np.random.RandomState(0) from sklearn import datasets R12 = datasets.load_digits(7).data obj_types = [0, 1, 2] obj_type2rank = {0: 100, 1: 200, 2:10} R12b = np.random.rand(R12.shape[0], R12.shape[1]) R12b[R12b < 0.3] = 0 # mask R12 grid = np.indices(R12.shape) idx = list(zip(grid[0].ravel(), grid[1].ravel())) rnds.shuffle(idx) idxi, idxj = list(zip(*idx[:int(0.9 * R12.size)])) mR12 = np.ma.array(R12) mR12[idxi, idxj] = np.ma.masked print("R12 mask: %d" % mR12.mask.sum())
import base64
import io
import sys

import matplotlib.pyplot as plt
import numpy as np
import sklearn
from sklearn import datasets
import kmapper as km

try:
    from scipy.misc import imsave, toimage
except ImportError as e:
    print(
        "imsave requires you to install pillow. Run `pip install pillow` and then try again."
    )
    sys.exit()

# Load digits data
data, labels = datasets.load_digits().data, datasets.load_digits().target

# Create images for a custom tooltip array
tooltip_s = []
for image_data in data:
    output = io.BytesIO()
    img = toimage(image_data.reshape(
        (8, 8)))  # Data was a flat row of 64 "pixels".
    img.save(output, format="PNG")
    contents = output.getvalue()
    img_encoded = base64.b64encode(contents)
    img_tag = """<img src="data:image/png;base64,{}">""".format(
        img_encoded.decode('utf-8'))
    tooltip_s.append(img_tag)
    output.close()