    def test_training_continuation(self):
        digits_2class = load_digits(2)
        digits_5class = load_digits(5)

        X_2class = digits_2class['data']
        y_2class = digits_2class['target']

        X_5class = digits_5class['data']
        y_5class = digits_5class['target']

        dtrain_2class = xgb.DMatrix(X_2class, label=y_2class)
        dtrain_5class = xgb.DMatrix(X_5class, label=y_5class)

        gbdt_01 = xgb.train(self.xgb_params_01, dtrain_2class, num_boost_round=10)
        ntrees_01 = len(gbdt_01.get_dump())
        assert ntrees_01 == 10

        gbdt_02 = xgb.train(self.xgb_params_01, dtrain_2class, num_boost_round=0)
        gbdt_02.save_model('xgb_tc.model')

        gbdt_02a = xgb.train(self.xgb_params_01, dtrain_2class, num_boost_round=10, xgb_model=gbdt_02)
        gbdt_02b = xgb.train(self.xgb_params_01, dtrain_2class, num_boost_round=10, xgb_model="xgb_tc.model")
        ntrees_02a = len(gbdt_02a.get_dump())
        ntrees_02b = len(gbdt_02b.get_dump())
        assert ntrees_02a == 10
        assert ntrees_02b == 10
        assert mean_squared_error(y_2class, gbdt_01.predict(dtrain_2class)) == \
               mean_squared_error(y_2class, gbdt_02a.predict(dtrain_2class))
        assert mean_squared_error(y_2class, gbdt_01.predict(dtrain_2class)) == \
               mean_squared_error(y_2class, gbdt_02b.predict(dtrain_2class))

        gbdt_03 = xgb.train(self.xgb_params_01, dtrain_2class, num_boost_round=3)
        gbdt_03.save_model('xgb_tc.model')

        gbdt_03a = xgb.train(self.xgb_params_01, dtrain_2class, num_boost_round=7, xgb_model=gbdt_03)
        gbdt_03b = xgb.train(self.xgb_params_01, dtrain_2class, num_boost_round=7, xgb_model="xgb_tc.model")
        ntrees_03a = len(gbdt_03a.get_dump())
        ntrees_03b = len(gbdt_03b.get_dump())
        assert ntrees_03a == 10
        assert ntrees_03b == 10
        assert mean_squared_error(y_2class, gbdt_03a.predict(dtrain_2class)) == \
               mean_squared_error(y_2class, gbdt_03b.predict(dtrain_2class))

        gbdt_04 = xgb.train(self.xgb_params_02, dtrain_2class, num_boost_round=3)
        assert gbdt_04.best_ntree_limit == (gbdt_04.best_iteration + 1) * self.num_parallel_tree
        assert mean_squared_error(y_2class, gbdt_04.predict(dtrain_2class)) == \
               mean_squared_error(y_2class, gbdt_04.predict(dtrain_2class, ntree_limit=gbdt_04.best_ntree_limit))

        gbdt_04 = xgb.train(self.xgb_params_02, dtrain_2class, num_boost_round=7, xgb_model=gbdt_04)
        assert gbdt_04.best_ntree_limit == (gbdt_04.best_iteration + 1) * self.num_parallel_tree
        assert mean_squared_error(y_2class, gbdt_04.predict(dtrain_2class)) == \
               mean_squared_error(y_2class, gbdt_04.predict(dtrain_2class, ntree_limit=gbdt_04.best_ntree_limit))

        gbdt_05 = xgb.train(self.xgb_params_03, dtrain_5class, num_boost_round=7)
        assert gbdt_05.best_ntree_limit == (gbdt_05.best_iteration + 1) * self.num_parallel_tree
        gbdt_05 = xgb.train(self.xgb_params_03, dtrain_5class, num_boost_round=3, xgb_model=gbdt_05)
        assert gbdt_05.best_ntree_limit == (gbdt_05.best_iteration + 1) * self.num_parallel_tree
        assert np.all(gbdt_05.predict(dtrain_5class) ==
                      gbdt_05.predict(dtrain_5class, ntree_limit=gbdt_05.best_ntree_limit))
Example #2
def load_data(digits=[]):
    """
        Loads data from sklearn's digits dataset
        (http://scikit-learn.org/stable/datasets/)
        and performs preprocessing.
        ----
        Note that the digits dataset has:
        d = 64   (dimensionality)
        m = ~180 (number of instances per class)
        z = 10   (number of classes)
        digits: An np array which has the digits you want to train on. The
                digits must be in the range of [0,9].
        Output: Returns the train/test, digits and targets data after
                performing preprocessing.
    """

    #Loads the data and the targets, resp.
    #Note they should be indexed the same way. So digits_data[n] corresponds
    #to digits_labels[n] for any n.
    digits_data = pd.DataFrame(datasets.load_digits().data)
    digits_labels = pd.Series(datasets.load_digits().target)

    #If the digits to train on are not specified, pick randomly
    if len(digits) == 0:
        r_digits = list(range(0, 10))
        random.shuffle(r_digits)
        #0-6 is 70% of the data
        training_digits = set()
        testing_digits = set()
        for a in range(0,7):
            training_digits.add(r_digits[a])
        for a in range(7,10):
            testing_digits.add(r_digits[a])
    else:
        #If they specify digits outside of the range, throw
        if max(digits) > 9 or min(digits) < 0:
            raise ValueError('The dataset only has digits 0-9. The parameter passed to load_data had a digit outside of that range')
        #If they specify every digit, there is nothing left to test on
        if len(digits) >= 10:
            raise ValueError('The dataset only has digits 0-9. You said to train on all of them, leaving no testing data')

        all_digits = set([0,1,2,3,4,5,6,7,8,9])
        training_digits = set(digits)
        testing_digits = all_digits - training_digits

    #Training data
    raw_train_labels = digits_labels[digits_labels.isin(training_digits)]
    training_data = digits_data.loc[raw_train_labels.index]
    #Maps the labels to 0...n
    training_labels = pd.DataFrame(preprocessing.LabelEncoder().fit_transform(raw_train_labels))

    #Testing data
    raw_test_labels = digits_labels[digits_labels.isin(testing_digits)]
    testing_data = digits_data.loc[raw_test_labels.index]
    #Maps the labels to 0...n
    testing_labels = pd.DataFrame(preprocessing.LabelEncoder().fit_transform(raw_test_labels))

    processed = collections.namedtuple('processed', ['training_data', 'training_labels', 'testing_data','testing_labels', 'training_digits', 'testing_digits'])
    return processed(training_data,training_labels,testing_data,testing_labels,training_digits,testing_digits)
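A short usage sketch (not part of the original; the variable name split and the chosen digits are arbitrary) showing how the helper above is meant to be called:

# Hypothetical usage: train on digits 3 and 8, test on the remaining classes.
split = load_data(digits=[3, 8])
print(split.training_data.shape, split.testing_data.shape)
print(sorted(split.training_digits), sorted(split.testing_digits))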
Example #3
def test_load_digits():
    digits = load_digits()
    assert_equal(digits.data.shape, (1797, 64))
    assert_equal(numpy.unique(digits.target).size, 10)

    # test return_X_y option
    X_y_tuple = load_digits(return_X_y=True)
    bunch = load_digits()
    assert_true(isinstance(X_y_tuple, tuple))
    assert_array_equal(X_y_tuple[0], bunch.data)
    assert_array_equal(X_y_tuple[1], bunch.target)
def run(sc):
	iris = datasets.load_iris()
	digits = [ datasets.load_digits(), datasets.load_digits()]


	def learn(x):
		clf = svm.SVC(gamma=0.001, C=100.)
		clf.fit(x.data[:-1], x.target[:-1] )
		return clf.predict(x.data[-1])

	return sc.parallelize(digits).map(learn).collect()
Example #5
def main():

    # http://scikit-learn.org/stable/tutorial/basic/tutorial.html#loading-an-example-dataset
    # "A dataset is a dictionary-like object that holds all the data and some
    # metadata about the data. This data is stored in the .data member, which
    # is a n_samples, n_features array. In the case of supervised problem, one
    # or more response variables are stored in the .target member."

    # Toy datasets

    iris = datasets.load_iris()         # The iris dataset (classification)
    digits = datasets.load_digits()     # The digits dataset (classification)

    #boston = datasets.load_boston()     # The boston house-prices dataset (regression)
    #diabetes = datasets.load_diabetes() # The diabetes dataset (regression)
    #linnerud = datasets.load_linnerud() # The linnerud dataset (multivariate regression)

    print(iris.feature_names)
    print(iris.data)
    print(iris.target_names)
    print(iris.target)

    print(digits.images[0])
    print(digits.target_names)
    print(digits.target)

    plt.imshow(digits.images[0], cmap='gray', interpolation='nearest')
    plt.show()
Example #6
def pytest_funcarg__digits(request):
    digits = datasets.load_digits()
    n_samples = len(digits.images)
    data = digits.images.reshape((n_samples, -1))
    ds = Dataset(data, digits.target).scale()
    ds.test_size = 0.5
    return ds.train_test_split()
Example #7
def handwritingClassTest():
    # load the data
    digits = datasets.load_digits()
    totalNum = len(digits.data)
    # use 80% of the samples for training and the remaining 20% for testing
    trainNum = int(0.8 * totalNum)
    trainX = digits.data[0 : trainNum]
    trainY = digits.target[0 : trainNum]
    
    testX = digits.data[trainNum:]
    testY = digits.target[trainNum:]
    
    errorCount = 0
    testExampleNum = len( testX )
    for i in range( testExampleNum ):
        # the true label of this test sample in the test set
        trueLabel = testY[i]
        classifierResult = classify0( testX[ i, : ], trainX, trainY, 5 )
        print "\nThe classifier came back with: %d, the real answer is: %d"\
            % ( classifierResult, trueLabel )
        if trueLabel != classifierResult:
            errorCount += 1
        else:
            pass
    print "\nThe total number of errors is: %d" % errorCount
    print "\nthe total error rate is: %f" % ( 
        errorCount / float( testExampleNum) 
        )
Example #8
File: test_pnn.py  Project: Neocher/neupy
    def test_predict_probability(self):
        dataset = datasets.load_digits()
        x_train, x_test, y_train, y_test = train_test_split(
            dataset.data, dataset.target, train_size=0.7
        )

        x_train_before = x_train.copy()
        x_test_before = x_test.copy()
        y_train_before = y_train.copy()

        number_of_classes = len(np.unique(dataset.target))

        pnnet = algorithms.PNN(verbose=False, std=10)
        pnnet.train(x_train, y_train)
        result = pnnet.predict_prob(x_test)

        n_test_inputs = x_test.shape[0]
        self.assertEqual(result.shape, (n_test_inputs, number_of_classes))

        total_classes_prob = np.round(result.sum(axis=1), 10)
        np.testing.assert_array_equal(
            total_classes_prob,
            np.ones(n_test_inputs)
        )
        old_result = result.copy()

        # Test problem with variable links
        np.testing.assert_array_equal(x_train, x_train_before)
        np.testing.assert_array_equal(x_test, x_test_before)
        np.testing.assert_array_equal(y_train, y_train_before)

        x_train[:, :] = 0
        result = pnnet.predict_prob(x_test)
        total_classes_prob = np.round(result.sum(axis=1), 10)
        np.testing.assert_array_almost_equal(result, old_result)
Example #9
def test_check_accuracy_on_digits():
    # Non regression test to make sure that any further refactoring / optim
    # of the NB models do not harm the performance on a slightly non-linearly
    # separable dataset
    digits = load_digits()
    X, y = digits.data, digits.target
    binary_3v8 = np.logical_or(digits.target == 3, digits.target == 8)
    X_3v8, y_3v8 = X[binary_3v8], y[binary_3v8]

    # Multinomial NB
    scores = cross_val_score(MultinomialNB(alpha=10), X, y, cv=10)
    assert_greater(scores.mean(), 0.86)

    scores = cross_val_score(MultinomialNB(alpha=10), X_3v8, y_3v8, cv=10)
    assert_greater(scores.mean(), 0.94)

    # Bernoulli NB
    scores = cross_val_score(BernoulliNB(alpha=10), X > 4, y, cv=10)
    assert_greater(scores.mean(), 0.83)

    scores = cross_val_score(BernoulliNB(alpha=10), X_3v8 > 4, y_3v8, cv=10)
    assert_greater(scores.mean(), 0.92)

    # Gaussian NB
    scores = cross_val_score(GaussianNB(), X, y, cv=10)
    assert_greater(scores.mean(), 0.77)

    scores = cross_val_score(GaussianNB(), X_3v8, y_3v8, cv=10)
    assert_greater(scores.mean(), 0.86)
Example #10
    def test_multiclass_prediction_early_stopping(self):
        X, y = load_digits(10, True)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
        params = {
            'objective': 'multiclass',
            'metric': 'multi_logloss',
            'num_class': 10,
            'verbose': -1
        }
        lgb_train = lgb.Dataset(X_train, y_train, params=params)
        lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train, params=params)
        evals_result = {}
        gbm = lgb.train(params, lgb_train,
                        num_boost_round=50,
                        valid_sets=lgb_eval,
                        verbose_eval=False,
                        evals_result=evals_result)

        pred_parameter = {"pred_early_stop": True, "pred_early_stop_freq": 5, "pred_early_stop_margin": 1.5}
        ret = multi_logloss(y_test, gbm.predict(X_test, pred_parameter=pred_parameter))
        self.assertLess(ret, 0.8)
        self.assertGreater(ret, 0.5)  # loss will be higher than when evaluating the full model

        pred_parameter = {"pred_early_stop": True, "pred_early_stop_freq": 5, "pred_early_stop_margin": 5.5}
        ret = multi_logloss(y_test, gbm.predict(X_test, pred_parameter=pred_parameter))
        self.assertLess(ret, 0.2)
def test_feature_importances():
    tm._skip_if_no_sklearn()
    from sklearn.datasets import load_digits

    digits = load_digits(2)
    y = digits['target']
    X = digits['data']
    xgb_model = xgb.XGBClassifier(seed=0).fit(X, y)

    exp = np.array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.00833333, 0.,
                    0., 0., 0., 0., 0., 0., 0., 0.025, 0.14166667, 0., 0., 0.,
                    0., 0., 0., 0.00833333, 0.25833333, 0., 0., 0., 0.,
                    0.03333334, 0.03333334, 0., 0.32499999, 0., 0., 0., 0.,
                    0.05, 0.06666667, 0., 0., 0., 0., 0., 0., 0., 0.04166667,
                    0., 0., 0., 0., 0., 0., 0., 0.00833333, 0., 0., 0., 0.,
                    0.], dtype=np.float32)

    np.testing.assert_almost_equal(xgb_model.feature_importances_, exp)

    # numeric columns
    import pandas as pd
    y = pd.Series(digits['target'])
    X = pd.DataFrame(digits['data'])
    xgb_model = xgb.XGBClassifier(seed=0).fit(X, y)
    np.testing.assert_almost_equal(xgb_model.feature_importances_, exp)

    # string columns, the feature order must be kept
    chars = list('abcdefghijklmnopqrstuvwxyz')
    X.columns = ["".join(random.sample(chars, 5)) for x in range(64)]

    xgb_model = xgb.XGBClassifier(seed=0).fit(X, y)
    np.testing.assert_almost_equal(xgb_model.feature_importances_, exp)
Example #12
def test_load_digits():
    digits = load_digits()
    assert_equal(digits.data.shape, (1797, 64))
    assert_equal(numpy.unique(digits.target).size, 10)

    # test return_X_y option
    check_return_X_y(digits, partial(load_digits))
def sklearn_digits( classifier=None ):
  #estim = hyperopt_estimator( classifier=any_classifier('hai'), algo=tpe.suggest )
  if classifier is None:
    classifier = any_classifier('any')
  estim = hyperopt_estimator( classifier=classifier )

  digits = load_digits()
  X = digits.data
  y = digits.target

  test_size = 50
  np.random.seed(0)
  indices = np.random.permutation(len(X))
  X_train = X[ indices[:-test_size]]
  y_train = y[ indices[:-test_size]]
  X_test = X[ indices[-test_size:]]
  y_test = y[ indices[-test_size:]]

  estim.fit( X_train, y_train )

  pred = estim.predict( X_test )
  print( pred )
  print ( y_test )

  print( score( pred, y_test ) ) 
  
  print( estim.best_model() )
Example #14
def test_unsorted_indices():
    # test that the result with sorted and unsorted indices in csr is the same
    # we use a subset of digits as iris, blobs or make_classification didn't
    # show the problem
    digits = load_digits()
    X, y = digits.data[:50], digits.target[:50]
    X_test = sparse.csr_matrix(digits.data[50:100])

    X_sparse = sparse.csr_matrix(X)
    coef_dense = svm.SVC(kernel='linear', probability=True,
                         random_state=0).fit(X, y).coef_
    sparse_svc = svm.SVC(kernel='linear', probability=True,
                         random_state=0).fit(X_sparse, y)
    coef_sorted = sparse_svc.coef_
    # make sure dense and sparse SVM give the same result
    assert_array_almost_equal(coef_dense, coef_sorted.toarray())

    X_sparse_unsorted = X_sparse[np.arange(X.shape[0])]
    X_test_unsorted = X_test[np.arange(X_test.shape[0])]

    # make sure we scramble the indices
    assert_false(X_sparse_unsorted.has_sorted_indices)
    assert_false(X_test_unsorted.has_sorted_indices)

    unsorted_svc = svm.SVC(kernel='linear', probability=True,
                           random_state=0).fit(X_sparse_unsorted, y)
    coef_unsorted = unsorted_svc.coef_
    # make sure unsorted indices give same result
    assert_array_almost_equal(coef_unsorted.toarray(), coef_sorted.toarray())
    assert_array_almost_equal(sparse_svc.predict_proba(X_test_unsorted),
                              sparse_svc.predict_proba(X_test))
def test_digits():
    
    from sklearn.cross_validation import train_test_split 
    from sklearn.datasets import load_digits
    from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
    from sklearn.preprocessing import LabelBinarizer
    
    digits = load_digits()
    X = digits.data
    y = digits.target   #labels
    X /= X.max()        #norm

    nn = NeuralNetwork([64,100,10],'logistic')  #8x8 input, 10 output
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
    labels_train = LabelBinarizer().fit_transform(y_train)  # convert class numbers to one-hot vectors
    labels_test = LabelBinarizer().fit_transform(y_test)

    nn.fit(X_train,labels_train,epochs=100)
    predictions = []
    for i in range(X_test.shape[0]) :
        o = nn.predict(X_test[i])
        predictions.append(np.argmax(o))
    print confusion_matrix(y_test,predictions)
    print classification_report(y_test,predictions)
    print 'accuracy at %0.3f'%accuracy_score(y_test,predictions)
def main():
    # parameters to cross-validate over
    parameters = {
        'l2': np.logspace(-5, 0, num=6),
    }

    # load the digits data in, make a binary decision problem out of it
    data = load_digits()

    X = Array2Dict().fit_transform(data.data)
    y = 2 * (data.target >= 5) - 1

    i = int(0.8 * len(X))
    X_train, X_test = X[:i], X[i:]
    y_train, y_test = y[:i], y[i:]

    # do the actual learning
    gs = GridSearchCV(
        VW_Classifier(loss='logistic', moniker='example_sklearn',
                      passes=10, silent=True, learning_rate=10),
        param_grid=parameters,
        score_func=f1_score,
        cv=StratifiedKFold(y_train),
    ).fit(X_train, y_train)

    # print out results from cross-validation
    estimator = gs.best_estimator_
    score = gs.best_score_
    print 'Achieved a F1 score of %f using l2 == %f during cross-validation' % (score, estimator.l2)

    # print confusion matrix on test data
    y_est = estimator.fit(X_train, y_train).predict(X_test)
    print 'Confusion Matrix:'
    print confusion_matrix(y_test, y_est)
Example #17
def test_plot_learning_curve():
    from sklearn.naive_bayes import GaussianNB
    from sklearn.svm import SVC
    from sklearn.datasets import load_digits

    digits = load_digits()
    X, y = digits.data, digits.target

    title = "Learning Curves (Naive Bayes)"
    # Cross validation with 100 iterations to get smoother mean test and train
    # score curves, each time with 20% data randomly selected as a validation
    # set.
    cv = cross_validation.ShuffleSplit(digits.data.shape[0], n_iter=100,
                                       test_size=0.2, random_state=0)

    estimator = GaussianNB()
    plot_learning_curve(estimator, title, X, y, ylim=(0.7, 1.01), cv=cv,
                        n_jobs=4)

    title = "Learning Curves (SVM, RBF kernel, $\gamma=0.001$)"
    # SVC is more expensive so we do a lower number of CV iterations:
    cv = cross_validation.ShuffleSplit(digits.data.shape[0], n_iter=10,
                                       test_size=0.2, random_state=0)
    estimator = SVC(gamma=0.001)
    plot_learning_curve(estimator, title, X, y, (0.7, 1.01), cv=cv, n_jobs=4)

    plt.show()
Example #18
File: mlp.py  Project: JakeMick/sk-mlp
def test_classification():
    from sklearn.datasets import load_digits
    from sklearn.cross_validation import KFold
    from sklearn.metrics import normalized_mutual_info_score
    digits = load_digits()
    X, y = digits.data, digits.target
    folds = 3
    cv = KFold(y.shape[0], folds)
    total = 0.0
    oo_score_bag = []
    for tr, te in cv:
        mlp = MLPClassifier(use_dropout=True, n_hidden=200, lr=1.)
        print(mlp)
        mlp.fit(X[tr], y[tr], max_epochs=100, staged_sample=X[te])
        t = normalized_mutual_info_score(mlp.predict(X[te]), y[te])
        print("Fold training accuracy: %f" % t)
        total += t
        this_score = []
        for i in mlp.oo_score:
            this_score.append(normalized_mutual_info_score(i, y[te]))
        oo_score_bag.append(this_score)
    from matplotlib import pyplot as plt
    plt.plot(oo_score_bag[0])
    plt.show()

    print("training accuracy: %f" % (total / float(folds)))
Example #19
    def test_KMeans_scores(self):
        digits = datasets.load_digits()
        df = pdml.ModelFrame(digits)

        scaled = pp.scale(digits.data)
        df.data = df.data.pp.scale()
        self.assert_numpy_array_almost_equal(df.data.values, scaled)

        clf1 = cluster.KMeans(init='k-means++', n_clusters=10,
                              n_init=10, random_state=self.random_state)
        clf2 = df.cluster.KMeans(init='k-means++', n_clusters=10,
                                 n_init=10, random_state=self.random_state)
        clf1.fit(scaled)
        df.fit_predict(clf2)

        expected = m.homogeneity_score(digits.target, clf1.labels_)
        self.assertEqual(df.metrics.homogeneity_score(), expected)

        expected = m.completeness_score(digits.target, clf1.labels_)
        self.assertEqual(df.metrics.completeness_score(), expected)

        expected = m.v_measure_score(digits.target, clf1.labels_)
        self.assertEqual(df.metrics.v_measure_score(), expected)

        expected = m.adjusted_rand_score(digits.target, clf1.labels_)
        self.assertEqual(df.metrics.adjusted_rand_score(), expected)

        expected = m.homogeneity_score(digits.target, clf1.labels_)
        self.assertEqual(df.metrics.homogeneity_score(), expected)

        expected = m.silhouette_score(scaled, clf1.labels_, metric='euclidean',
                                      sample_size=300, random_state=self.random_state)
        result = df.metrics.silhouette_score(metric='euclidean', sample_size=300,
                                             random_state=self.random_state)
        self.assertAlmostEqual(result, expected)
    def test_training_continuation(self):
        digits = load_digits(2)
        X = digits['data']
        y = digits['target']

        dtrain = xgb.DMatrix(X,label=y)

        gbdt_01 = xgb.train(self.xgb_params, dtrain, num_boost_round=10)
        ntrees_01 = len(gbdt_01.get_dump())
        assert ntrees_01 == 10

        gbdt_02 = xgb.train(self.xgb_params, dtrain, num_boost_round=0)
        gbdt_02.save_model('xgb_tc.model')

        gbdt_02a = xgb.train(self.xgb_params, dtrain, num_boost_round=10, xgb_model=gbdt_02)
        gbdt_02b = xgb.train(self.xgb_params, dtrain, num_boost_round=10, xgb_model="xgb_tc.model")
        ntrees_02a = len(gbdt_02a.get_dump())
        ntrees_02b = len(gbdt_02b.get_dump())
        assert ntrees_02a == 10
        assert ntrees_02b == 10
        assert mean_squared_error(y, gbdt_01.predict(dtrain)) == mean_squared_error(y, gbdt_02a.predict(dtrain))
        assert mean_squared_error(y, gbdt_01.predict(dtrain)) == mean_squared_error(y, gbdt_02b.predict(dtrain))

        gbdt_03 = xgb.train(self.xgb_params, dtrain, num_boost_round=3)
        gbdt_03.save_model('xgb_tc.model')

        gbdt_03a = xgb.train(self.xgb_params, dtrain, num_boost_round=7, xgb_model=gbdt_03)
        gbdt_03b = xgb.train(self.xgb_params, dtrain, num_boost_round=7, xgb_model="xgb_tc.model")
        ntrees_03a = len(gbdt_03a.get_dump())
        ntrees_03b = len(gbdt_03b.get_dump())
        assert ntrees_03a == 10
        assert ntrees_03b == 10
        assert mean_squared_error(y, gbdt_03a.predict(dtrain)) == mean_squared_error(y, gbdt_03b.predict(dtrain))
def test_sklearn_nfolds_cv():
    tm._skip_if_no_sklearn()
    from sklearn.datasets import load_digits
    from sklearn.model_selection import StratifiedKFold

    digits = load_digits(3)
    X = digits['data']
    y = digits['target']
    dm = xgb.DMatrix(X, label=y)

    params = {
        'max_depth': 2,
        'eta': 1,
        'silent': 1,
        'objective':
        'multi:softprob',
        'num_class': 3
    }

    seed = 2016
    nfolds = 5
    skf = StratifiedKFold(n_splits=nfolds, shuffle=True, random_state=seed)

    cv1 = xgb.cv(params, dm, num_boost_round=10, nfold=nfolds, seed=seed)
    cv2 = xgb.cv(params, dm, num_boost_round=10, nfold=nfolds, folds=skf, seed=seed)
    cv3 = xgb.cv(params, dm, num_boost_round=10, nfold=nfolds, stratified=True, seed=seed)
    assert cv1.shape[0] == cv2.shape[0] and cv2.shape[0] == cv3.shape[0]
    assert cv2.iloc[-1, 0] == cv3.iloc[-1, 0]
def test_feature_importances():
    tm._skip_if_no_sklearn()
    from sklearn.datasets import load_digits

    digits = load_digits(2)
    y = digits['target']
    X = digits['data']
    xgb_model = xgb.XGBClassifier(seed=0).fit(X, y)

    exp = np.array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.00833333, 0.,
                    0., 0., 0., 0., 0., 0., 0., 0.025, 0.14166667, 0., 0., 0.,
                    0., 0., 0., 0.00833333, 0.25833333, 0., 0., 0., 0.,
                    0.03333334, 0.03333334, 0., 0.32499999, 0., 0., 0., 0.,
                    0.05, 0.06666667, 0., 0., 0., 0., 0., 0., 0., 0.04166667,
                    0., 0., 0., 0., 0., 0., 0., 0.00833333, 0., 0., 0., 0.,
                    0.], dtype=np.float32)

    np.testing.assert_almost_equal(xgb_model.feature_importances_, exp)

    # numeric columns
    import pandas as pd
    y = pd.Series(digits['target'])
    X = pd.DataFrame(digits['data'])
    xgb_model = xgb.XGBClassifier(seed=0).fit(X, y)
    np.testing.assert_almost_equal(xgb_model.feature_importances_, exp)

    xgb_model = xgb.XGBClassifier(seed=0).fit(X, y)
    np.testing.assert_almost_equal(xgb_model.feature_importances_, exp)
def main():
    digits = load_digits()
    X = digits.data
    y = digits.target
    mds = MDS()
    X_mds = mds.fit_transform(X)
    plot_embedding(X_mds, y)
Example #24
def main():
    data = datasets.load_digits()
    X = normalize(data.data)
    y = data.target

    # One-hot encoding of nominal y-values
    y = to_categorical(y)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, seed=1)

    # Perceptron
    clf = Perceptron(n_iterations=5000,
        learning_rate=0.001, 
        loss=CrossEntropy,
        activation_function=Sigmoid)
    clf.fit(X_train, y_train)

    y_pred = np.argmax(clf.predict(X_test), axis=1)
    y_test = np.argmax(y_test, axis=1)

    accuracy = accuracy_score(y_test, y_pred)

    print ("Accuracy:", accuracy)

    # Reduce dimension to two using PCA and plot the results
    Plot().plot_in_2d(X_test, y_pred, title="Perceptron", accuracy=accuracy, legend_labels=np.unique(y))
def main():
    # load the digits data in; shift the labels to 1..10 for VW's one-against-all
    data = load_digits()

    X = Array2Dict().fit_transform(data.data)
    y = data.target  + 1

    i = int(0.8 * len(X))
    X_train, X_test = X[:i], X[i:]
    y_train, y_test = y[:i], y[i:]

    # do the actual learning
    m =  VW_Classifier(loss='logistic', moniker='example_sklearn', passes=10, silent=True, learning_rate=10, raw=True, oaa = 10)
    m.fit(X_train, y_train)
    # print confusion matrix on test data
    y_est = m.predict_proba(X_test)
    lines = y_est
    #print y_est
    probs = []
    for i, line in enumerate(lines):
      line = line.split()
      labels, vs = zip(*[[float(x) for x in l.split(':')] for l in line[:]])
      probs__ = sigmoid(asarray(vs))
      probs_ = probs__/probs__.sum()
      probs.append(probs_)

    probs = np.asarray(probs)
    print probs
Example #26
def test_backprop():
    # loading data
    digits = load_digits()
    X = digits['data']
    y = digits['target']

    # dividing in training, validation, and test set
    nsamples = X.shape[0]
    end_train_idx = int(0.5 * nsamples)
    end_val_idx = int(0.7 * nsamples)
    perm = np.random.permutation(nsamples)
    Xtrain = X[perm[:end_train_idx]]
    Xval = X[perm[end_train_idx:end_val_idx]]
    Xtest = X[perm[end_val_idx:]]
    ytrain = y[perm[:end_train_idx]]
    yval = y[perm[end_train_idx:end_val_idx]]
    ytest = y[perm[end_val_idx:]]

    # data normalization
    mean = Xtrain.mean(0)
    std = Xtrain.std(0)
    std[std == 0] = 1
    Xtrain = (Xtrain - mean) / std
    Xval = (Xval - mean) / std
    Xtest = (Xtest - mean) / std

    # net params
    input_size = Xtrain.shape[1]
    hidden_size = 30
    output_size = np.unique(y).size

    net = Sigmoidal2LayerMLP_WithSoftmax(input_size,
                                         hidden_size,
                                         output_size,
                                         bias_init=0.0,
                                         lr=0.0001,
                                         momen_decay=0.0,
                                         l2=0.1)

    x = Xtrain[0]
    yi = y[0]
    net.forward(x)
    loss = net.backward(yi)
    Wih_grad = net.Wih_grad.copy()
    Who_grad = net.Who_grad.copy()
    hb_grad = net.hb_grad.copy()
    ob_grad = net.ob_grad.copy()
    e = 1e-6
    for i in xrange(net.Wih.shape[0]):
        for h in xrange(net.Wih.shape[1]):
            net.Wih[i, h] += e
            net.forward(x)
            loss1 = net.loss(yi)
            net.Wih[i, h] -= 2 * e
            net.forward(x)
            loss2 = net.loss(yi)
            print 'estimated grad W%d_%d = %.4f' % (i, h,
                                                    (loss1 - loss2) / (2 * e))
            print 'backprop grad = %.4f' % Wih_grad[i, h]
            net.Wih[i, h] += e
def code():
    digits = datasets.load_digits()
    X_digits = digits.data
    y_digits = digits.target
    """X_folds = np.array_split(X_digits, 3)
    y_folds = np.array_split(y_digits, 3)"""
    svc = svm.SVC(C=1, kernel='linear')
    """scores = list()


    for k in range(3):
        # We use 'list' to copy, in order to 'pop' later on
        X_train = list(X_folds)
        X_test = X_train.pop(k)
        X_train = np.concatenate(X_train)
        y_train = list(y_folds)
        y_test  = y_train.pop(k)
        y_train = np.concatenate(y_train)
        scores.append(svc.fit(X_train, y_train).score(X_test, y_test))

    print(scores)"""

    k_fold = cross_validation.KFold(len(X_digits), n_folds=3)
    """for train_indices, test_indices in k_fold:
        print('Train: %s | test: %s' % (train_indices, test_indices))"""


    #print([svc.fit(X_digits[train], y_digits[train]).score(X_digits[test], y_digits[test]) for train, test in k_fold])

    print(cross_validation.cross_val_score(svc, X_digits, y_digits, cv=k_fold,n_jobs=-1))
Example #28
    def test_tutorial(self):
        """

        Verifies we can do what sklearn does here:
        http://scikit-learn.org/stable/tutorial/basic/tutorial.html

        """
        digits = datasets.load_digits()
        digits_data = digits.data
        # for now, we need a column vector rather than an array
        digits_target = digits.target

        p = Pipeline()

        # load data from a numpy dataset
        stage_data = NumpyRead(digits_data)
        stage_target = NumpyRead(digits_target)

        # train/test split
        stage_split_data = SplitTrainTest(2, test_size=1, random_state=0)

        # build a classifier
        stage_clf = wrap_and_make_instance(SVC, gamma=0.001, C=100.)

        # output to a csv
        stage_csv = CSVWrite(self._tmp_files('out.csv'))

        node_data, node_target, node_split, node_clf, node_csv = map(
            p.add, [
                stage_data, stage_target, stage_split_data, stage_clf,
                stage_csv])

        # connect the pipeline stages together
        node_data['output'] > node_split['input0']
        node_target['output'] > node_split['input1']
        node_split['train0'] > node_clf['X_train']
        node_split['train1'] > node_clf['y_train']
        node_split['test0'] > node_clf['X_test']
        node_clf['y_pred'] > node_csv['input']

        self.run_pipeline(p)
        
        result = self._tmp_files.csv_read('out.csv', True)

        # making sure we get the same result as sklearn
        clf = SVC(gamma=0.001, C=100.)
        # The tutorial just splits using array slicing, but we need to make
        #   sure that both UPSG and sklearn are splitting the same way, so we
        #   do something more sophisticated
        train_X, test_X, train_y, test_y = train_test_split(
            digits_data, digits_target, test_size=1, random_state=0)
        clf.fit(train_X, np.ravel(train_y))
        control = clf.predict(test_X)[0]

        self.assertAlmostEqual(result, control)

        # model persistance
        s = pickle.dumps(stage_clf)
        stage_clf2 = pickle.loads(s)
        self.assertEqual(stage_clf.get_params(), stage_clf2.get_params())
def ModelSelectionTest01():
	from sklearn import datasets, svm
	import numpy as np
	digits = datasets.load_digits()
	X_digits = digits.data
	Y_digits = digits.target
	svc = svm.SVC(C = 1, kernel = 'linear')
	score = svc.fit(X_digits[:-100], Y_digits[:-100]).score(X_digits[-100:], Y_digits[-100:])

	#print score

	X_folds = np.array_split(X_digits, 3)
	Y_folds = np.array_split(Y_digits, 3)

	#print len(X_folds[0])

	scores = list()

	for k in range(3):
		X_train = list(X_folds) # X_folds is a list with 3 elements
		X_test = X_train.pop(k) # the test fold is the k-th element
		X_train = np.concatenate(X_train) # X_train is the remaining folds, with X_test removed
		#print len(X_train)
		Y_train = list(Y_folds)
		Y_test = Y_train.pop(k)
		Y_train = np.concatenate(Y_train)

		scores.append(svc.fit(X_train, Y_train).score(X_test, Y_test))

	#print scores


	from sklearn import cross_validation
	k_fold = cross_validation.KFold(n = 6, n_folds = 3)
	for train_indices, test_indices in k_fold:
		print train_indices, test_indices

	k_fold = cross_validation.KFold(len(X_digits), n_folds = 3)
	scores = [svc.fit(X_digits[train], Y_digits[train]).score(X_digits[test], Y_digits[test]) for train , test in k_fold]

	#print scores

	scores = cross_validation.cross_val_score(svc, X_digits, Y_digits, cv = k_fold, n_jobs = 1)
	#print scores

	from sklearn.grid_search import GridSearchCV
	gammas = np.logspace(-6, -1, 10)
	clf = GridSearchCV(estimator = svc, param_grid = dict(gamma = gammas), n_jobs = 1)
	clf.fit(X_digits[:1000], Y_digits[:1000])
	print clf.best_score_
	print clf.best_estimator_.gamma

	from sklearn import linear_model, datasets
	lasso = linear_model.LassoCV()    # what is the difference between LassoCV and plain Lasso here?
	diabetes = datasets.load_diabetes()
	X_diabetes = diabetes.data
	Y_diabetes = diabetes.target
	lasso.fit(X_diabetes, Y_diabetes)

	print lasso.alpha_
Example #30
def test_constraint_removal():
    digits = load_digits()
    X, y = digits.data, digits.target
    y = 2 * (y % 2) - 1  # even vs odd as +1 vs -1
    X = X / 16.
    pbl = BinarySVMModel(n_features=X.shape[1])
    clf_no_removal = OneSlackSSVM(model=pbl, max_iter=500, verbose=1, C=10,
                                  inactive_window=0, tol=0.01)
    clf_no_removal.fit(X, y)
    clf = OneSlackSSVM(model=pbl, max_iter=500, verbose=1, C=10, tol=0.01,
                       inactive_threshold=1e-8)
    clf.fit(X, y)

    # results are mostly equal
    # if we decrease tol, they will get more similar
    assert_less(np.mean(clf.predict(X) != clf_no_removal.predict(X)), 0.02)

    # without removal, have as many constraints as iterations
    # +1 for true y constraint
    assert_equal(len(clf_no_removal.objective_curve_) + 1,
                 len(clf_no_removal.constraints_))

    # with removal, there are less constraints than iterations
    assert_less(len(clf.constraints_),
                len(clf.objective_curve_))
from sklearn.neural_network import MLPClassifier

" Ferramentas e bases de dados do scikit-learn "
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

from TrabalhoIA import NFoldsTrainTest
import seaborn as sea
from matplotlib import pyplot as plt
import pandas as pd

# Datasets used in this assignment
bases = {
    "iris": datasets.load_iris(),
    "digits": datasets.load_digits(),
    "wine": datasets.load_wine(),
    "breast cancer": datasets.load_breast_cancer()
}

# Classifiers used in the experiment
classificadores = {
    "ZeroR": (ZeroR, {
        "discretizar": False
    }),
    "OneR": (OneR, {
        "discretizar": True
    }),
    "OneR Probabilistico": (OneRProb, {
        "discretizar": True
    }),
    '''
    X, y = data
    Cs = np.logspace(-2, 2)
    zeros = []
    for C in Cs:
        clf = LinearSVC(C=C, penalty='l1', dual=False)
        clf.fit(X, y)
        ### count the number of zero coefficients ###
        num = 0
        for row in clf.coef_:
            for ele in row:
                if abs(ele) < 1e-5: num += 1
        zeros.append(num)
    ##### plotting
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    ax.plot(Cs, zeros)
    ax.set_xlabel("C")
    ax.set_xscale("log")
    ax.set_ylabel("zeros in coef")
    ax.set_title("Sparsity In SVM")
    plt.show()


if __name__ == '__main__':
    # test_SelectFromModel()  # call test_SelectFromModel
    # data=load_diabetes() # generate a dataset for a regression problem
    # test_Lasso(data.data,data.target) # call test_Lasso
    data = load_digits()  # generate a dataset for a classification problem
    test_LinearSVC(data.data, data.target)  # call test_LinearSVC
Example #33
                    j = k
                    error_j = error_k
        # if came in this loop first time, we select alpha j randomly
        else:
            j = i
            while j == i:
                j = int(random.uniform(0, self.n))
            error_j = self._cal_ei(j)

        return j, error_j


if __name__ == '__main__':

    # load the handwritten-digits dataset that ships with sklearn
    digits = load_digits()
    features = digits.data
    targets = (digits.target > 4).astype(int)
    targets[targets == 0] = -1

    # randomly shuffle the data
    shuffle_indices = np.random.permutation(features.shape[0])
    features = features[shuffle_indices]
    targets = targets[shuffle_indices]

    # split into training and test sets
    train_count = int(len(features) * 0.8)
    train_xs, train_ys = features[:train_count], targets[:train_count]
    test_xs, test_ys = features[train_count:], targets[train_count:]

    kernel_type = 'gauss'
Example #34
# -*- coding: utf-8 -*-
"""
Created on Fri Oct  5 14:35:29 2018

@author: rtake
"""

import matplotlib.pyplot as plt
from sklearn import datasets
import numpy as np

# load the digits data
digits = datasets.load_digits()

# display the images in a 2x5 grid
for label, img in zip(digits.target[:10], digits.images[:10]):
    plt.subplot(2, 5, label + 1)  # place each image in the grid
    plt.axis('off')  # hide the axes
    plt.imshow(img, cmap=plt.cm.gray_r, interpolation='nearest')  # show the image
    plt.title('Digit:{0}'.format(label))  # label each image as "Digit:N"

plt.show()

#%% build a classifier

# find the samples for digits 3 and 8
flag_3_8 = (digits.target == 3) + (digits.target == 8)

# extract the data for 3 and 8
images = digits.images[flag_3_8]
labels = digits.target[flag_3_8]
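The excerpt stops before any classifier is actually built; as a hedged continuation (the SVC, the 80/20 split and random_state below are assumptions, not the original author's choices), the 3-vs-8 data selected above could be used like this:

from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# flatten the 8x8 images into 64-dimensional feature vectors
flat_images = images.reshape((images.shape[0], -1))
X_train, X_test, y_train, y_test = train_test_split(
    flat_images, labels, test_size=0.2, random_state=0)

clf = svm.SVC(gamma=0.001)
clf.fit(X_train, y_train)
print('3-vs-8 test accuracy:', accuracy_score(y_test, clf.predict(X_test)))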
Example #35
File: som.py  Project: mvasighi/RSOM
            self.X = (self.X - self.data_mean) / (self.data_std + EPS)
        else:
            data_mean = X.mean(axis=0)
            data_std = X.std(axis=0, ddof=1)
            X = (X - data_mean) / data_std
            return X, data_mean, data_std


'''
DEMO CODE
'''
if __name__ == "__main__":
    from sklearn import datasets
    import time

    data = datasets.load_digits().data

    som = SOM(DATA=data, alpha_max=0.05, num_units=100, height=10, width=10)
    #som.train_batch(100)
    #start = time.time()
    #som.train_stoch_theano(10)
    som.train_batch_theano(num_epoch=100)
    #som.train_stoch(10)
    #clusters = som.ins_unit_assign
    #print clusters
    #stop = time.time()
    #
    print som.unit_saliency

    #som_plot_scatter(som.W, som.X, som.activations)
    #som_plot_outlier_scatter(som.W, som.X, som.unit_saliency, som.inst_saliency, som.activations)
Example #36
# coding: utf-8

# In[27]:

from sklearn.datasets import load_digits
from matplotlib import pyplot as plt
get_ipython().magic(u'matplotlib inline')

# In[37]:

digits = load_digits()
print(digits.data.shape)
plt.imshow(digits.images[8])
plt.show()
plt.imshow(digits.images[7])
plt.show()
print(digits.images[8])
print(digits.images[7])

# In[ ]:
Example #37
def test_digits_type_hints():
    data = data_df_from_bunch(load_digits())
    data_clean = clean(
        data, type_hints={"x{}".format(i): 'continuous'
                          for i in range(64)})
    assert data_clean.shape[1] == 65
cutdex = np.argmax(harmonics > spectral_cutoff)
zeroindex = int(spectra[0].size / 2)
linspectra = []
cutspectra = []
newspectra = []
for spectrum in spectra:
    print(len(spectrum[zeroindex:]))
    newspectra.append(spectrum[zeroindex:])
    linspectra.append(spectrum[zeroindex:lindex])
    cutspectra.append(spectrum[lindex:cutdex])

# for j in range(15):
#     plt.semilogy(harmonics[zeroindex:],newspectra[j])
# plt.show()

digits = datasets.load_digits(return_X_y=0)


def analysis(X, Y, tuned_parameters, test_fraction=0.8):
    # Use a support vector machine estimator with a grid search for hyperparameters
    # split data into training and testing sets
    train_data, test_data, train_target, test_target = train_test_split(
        X, Y, test_size=test_fraction)

    scores = ['precision', 'recall']

    for score in scores:
        print("# Tuning hyper-parameters for %s" % score)
        print()

        classifier = GridSearchCV(SVC(),
from time import time
import numpy as np
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split  # the old cross_validation module was deprecated in 0.18 and later removed in favour of model_selection
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_digits
import pylab as pl

########################################################################################################################
##################################### PART A ###########################################################################
########################################################################################################################
np.random.seed(42)  # random seeding is performed
digits = load_digits(
)  # the whole data set with the labels and other information are extracted
data = scale(digits.data)  # the data is scaled with the use of z-score
n_samples, n_features = data.shape  # the no. of samples and no. of features are determined with the help of shape
n_digits = len(
    np.unique(digits.target)
)  # the number of labels are determined with the aid of unique formula
labels = digits.target  # get the ground-truth labels into the labels

print(labels)  # the labels are printed on the screen
print(digits.keys()
      )  # this command will provide you the key elements in this dataset
print(digits.DESCR)  # to get the descriptive information about this dataset

pl.gray()  # turns an image into gray scale
pl.matshow(digits.images[0])
pl.show()
Example #40
from sklearn.externals.six.moves import cStringIO as StringIO
from sklearn.metrics import roc_auc_score
from sklearn.neural_network import MLPClassifier
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from scipy.sparse import csr_matrix
from sklearn.utils.testing import (assert_raises, assert_greater, assert_equal,
                                   assert_false, ignore_warnings)
from sklearn.utils.testing import assert_raise_message

np.seterr(all='warn')

ACTIVATION_TYPES = ["identity", "logistic", "tanh", "relu"]

digits_dataset_multi = load_digits(n_class=3)

X_digits_multi = MinMaxScaler().fit_transform(digits_dataset_multi.data[:200])
y_digits_multi = digits_dataset_multi.target[:200]

digits_dataset_binary = load_digits(n_class=2)

X_digits_binary = MinMaxScaler().fit_transform(
    digits_dataset_binary.data[:200])
y_digits_binary = digits_dataset_binary.target[:200]

classification_datasets = [(X_digits_multi, y_digits_multi),
                           (X_digits_binary, y_digits_binary)]

boston = load_boston()
Example #41
    return data, label


X_datasets, Y_datasets = [], []
# load data
# X_coil20, y_coil20 = download_coil20(dataset_type='processed')
# with open("./data/data_coil20.txt", "wb") as data:
#     pickle.dump((X_coil20,y_coil20), data)

with open("./data/data_coil20.txt", "rb") as data:
    X_coil20, y_coil20 = pickle.load(data)
X_coil20 = X_coil20.reshape((X_coil20.shape[0], -1))
X_datasets.append(X_coil20)
Y_datasets.append(y_coil20)

X_digits, y_digits = datasets.load_digits(n_class=10, return_X_y=True)
X_datasets.append(X_digits)
Y_datasets.append(y_digits)

dft = pd.read_csv('./data/fashion-mnist_test.csv', dtype=int)  # read test data
X_fashion = dft.drop('label', axis=1)
y_fashion = dft['label']
X_datasets.append(X_fashion)
Y_datasets.append(y_fashion)

# Set up algorithms
methods = OrderedDict()
methods['umap'] = umap.UMAP()
methods['t-SNE'] = manifold.TSNE(n_components=2, init='pca', random_state=0)

fig = plt.figure(figsize=(15, 8))
Example #42
"""

import numpy as np
import matplotlib.pyplot as plt

from sklearn import linear_model, decomposition, datasets
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
"""
Pipeline steps:
1. PCA for dimensionality reduction
2. logistic regression as the classifier
"""

digits = datasets.load_digits()  # (1797, 64)
X_digits = digits.data
y_digits = digits.target


def fill_pipeline():
    plt.figure(1)
    plt.clf()
    # plt.axes([.2, .2, .7, .7])

    m1_pca = decomposition.PCA()
    """
    Whether fit is called here only affects whether explained_variance_ is
    computed and displayed; it has no effect on the grid search's choice of
    best n_components (presumably because GridSearchCV re-fits internally anyway).
    """
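
The rest of fill_pipeline is not shown; below is a minimal sketch of the pipeline the docstring describes (the helper name pipeline_sketch, the n_components grid and max_iter are assumptions), reusing the imports and X_digits/y_digits defined above:

def pipeline_sketch():
    pca = decomposition.PCA()
    logistic = linear_model.LogisticRegression(max_iter=1000)
    pipe = Pipeline(steps=[('pca', pca), ('logistic', logistic)])
    # grid-search the number of PCA components kept before the classifier
    param_grid = {'pca__n_components': [5, 20, 30, 40, 50, 64]}
    search = GridSearchCV(pipe, param_grid, cv=5)
    search.fit(X_digits, y_digits)
    print('best n_components:', search.best_params_['pca__n_components'])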
Example #43
SVC (support vector classifier) to improve the classification scores.
"""
print(__doc__)

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_digits
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC


# #############################################################################
# Import some data to play with
X, y = load_digits(return_X_y=True)
# Throw away data, to be in the curse of dimension settings
X = X[:200]
y = y[:200]
n_samples = len(y)
X = X.reshape((n_samples, -1))
# add 200 non-informative features
X = np.hstack((X, 2 * np.random.random((n_samples, 200))))

# #############################################################################
# Create a feature-selection transform and an instance of SVM that we
# combine together to have an full-blown estimator

transform = SelectPercentile(chi2)

clf = Pipeline([('anova', transform), ('svc', SVC(gamma="auto"))])
    anim.save(None, writer=matplotlib_utils.SimpleMovieWriter(0.001))

# # Logistic regression
# Your assignment is to implement the logistic regression
#
# Plan:
# * Use a shared variable for weights
# * Use a matrix placeholder for `X`
#
# We shall train on a two-class MNIST dataset
# * please note that target `y` are `{0,1}` and not `{-1,1}` as in some formulae

# In[30]:

from sklearn.datasets import load_digits
mnist = load_digits(2)

X, y = mnist.data, mnist.target

print("y [shape - %s]:" % (str(y.shape)), y[:10])
print("X [shape - %s]:" % (str(X.shape)))

# In[31]:

print('X:\n', X[:3, :10])
print('y:\n', y[:10])
plt.imshow(X[0].reshape([8, 8]))

# It's your turn now!
# Just a small reminder of the relevant math:
#
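
The math reminder is cut off in this excerpt. Below is only a small NumPy sketch of the idea it hints at (plain gradient descent on the binary cross-entropy, with an ad-hoc rescaling of X for numerical stability), not the assignment's intended shared-variable/placeholder solution:

import numpy as np

Xn = X / X.max()                               # rescale pixels to [0, 1] for stability
w, b, lr = np.zeros(Xn.shape[1]), 0.0, 0.1
for step in range(1000):
    p = 1.0 / (1.0 + np.exp(-(Xn @ w + b)))    # sigmoid(X @ w + b), predicted P(y=1|x)
    grad_w = Xn.T @ (p - y) / len(y)           # gradient of the cross-entropy loss w.r.t. w
    grad_b = np.mean(p - y)                    # gradient w.r.t. the bias
    w -= lr * grad_w
    b -= lr * grad_b
print('training accuracy:', np.mean((p > 0.5) == y))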
Example #45
from sklearn import datasets, metrics
from scipy import sparse
from sklearn.cross_validation import train_test_split
#Choose one of the dataset using the datasets features in the scikit-learn
from sklearn import svm
from sklearn.datasets import load_boston, load_digits
import numpy as np
#import warnings
#warnings.filterwarnings("ignore", category=DeprecationWarning)

C = 1.0
#getting the data and response of the dataset
#choosing
digits = load_digits()  #load dataset
a = digits.data
b = digits.target
#According to your dataset, split the data to 20% testing data, 80% training data(you can also use any other number)
a_train, a_test, b_train, b_test = train_test_split(a, b, test_size=0.2)
#Apply SVC with Linear kernel
model = svm.SVC(kernel='linear')
model.fit(a_train, b_train)
b_pred = model.predict(a_test)
print("Accuracy for linear kernel in SVC " +
      str(metrics.accuracy_score(b_test, b_pred)))
#Apply SVC with RBF kernel
model = svm.SVC(kernel='rbf')
model.fit(a_train, b_train)
b_pred = model.predict(a_test)
print("Accuracy for rbf kernel in SVC " +
      str(metrics.accuracy_score(b_test, b_pred)))
Example #46
            self.estimators.append(weak_clf)

    def predict(self, X):
        num_samples = X.shape[0]
        y_hat = np.zeros((num_samples, ))
        # iterate each weak clf
        for clf in self.estimators:
            _y_hat = np.full((num_samples, ), clf.cls)
            _y_hat[X[:, clf.feature_index] <= clf.feature_value] *= -1
            y_hat += clf.alpha * _y_hat

        return np.sign(y_hat)


if __name__ == '__main__':
    data = load_digits()
    digit1 = 1
    digit2 = 8
    idx = np.append(
        np.where(data.target == digit1)[0],
        np.where(data.target == digit2)[0])
    y = data.target[idx]
    y[y == digit1] = -1
    y[y == digit2] = 1
    X = data.data[idx]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.7)

    clf = Adaboost(n_estimators=5)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
Example #47
from sklearn import datasets
from sklearn import decomposition
import matplotlib.pyplot as plt
import numpy as np
import seaborn
from mpl_toolkits.mplot3d import Axes3D

#matplotlib notebook

mnist = datasets.load_digits()
X = mnist.data
y = mnist.target
pca = decomposition.PCA(n_components=3)
new_X = pca.fit_transform(X)

fig = plt.figure()
ax = fig.gca(projection='3d')
ax.scatter(new_X[:, 0], new_X[:, 1], new_X[:, 2], c=y)
plt.show()
Example #48
# -*- coding: utf-8 -*-
"""
Created on Fri Mar  5 21:13:45 2021

@author: GANY
"""
from sklearn import datasets  # import the datasets module from the scikit-learn library
iris = datasets.load_iris()  # load the iris dataset into a variable named iris
digits = datasets.load_digits()  # load the digits dataset into the digits variable

print(
    digits.data
)  # gives access to the features that can be used to classify the digit samples and prints them to the console

digits.target  # gives the ground-truth labels associated with the data

digits.images[
    0]  # the data is a 2D array of shape (n_samples, n_features), even though the original data may have had a different shape
Example #49
        columns.append("X" + str(i))
    columns.append("target")

    i = 0
    tmp = []
    for m in matrixList:
        tmpColumns = np.asarray(m).reshape(-1)
        addTarget = np.append(tmpColumns, [i])
        i = i + 1
        tmp.append(addTarget)
    return pd.DataFrame(tmp, columns=columns)


# In[27]:

digits = load_digits()  #training database from SKLearn
tri = prepareTestSet(
    'handwrittendigits/mag3_EE6653C8_digits_2017-12-08_14_18_28___0444df099e184398993a95d0f40446fe.csv'
)

# In[28]:

collectedFiles = [
    f for f in listdir('handwrittendigits/')
    if isfile(join('handwrittendigits/', f))
]
file = collectedFiles.pop(0)
trainingData = prepareTestSet('handwrittendigits/' + file)
i = 0
for file in reversed(collectedFiles):
    print('\n' + file + '\n')
Example #50
File: mnist.py  Project: RMeli/pynn
"""
Train a neural network to learn handwritten digits from scikit-learn's digits dataset.

The dataset is composed of 8x8 images of handwritten digits.
"""

from matplotlib import pylab as plt
from sklearn import datasets

# Load MNIST database
images, targets = datasets.load_digits(return_X_y=True)

plt.imshow(images[0].reshape(8, 8),
           cmap=plt.cm.gray_r,
           interpolation="nearest")
plt.show()
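
The pynn training code itself is not part of this excerpt; as a stand-in sketch (using scikit-learn's MLPClassifier with an arbitrary layer size and split, not the pynn API), the loaded images could be fit like this:

from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier

X_train, X_test, y_train, y_test = train_test_split(
    images, targets, test_size=0.2, random_state=0)

net = MLPClassifier(hidden_layer_sizes=(50,), max_iter=1000, random_state=0)
net.fit(X_train, y_train)
print('test accuracy:', net.score(X_test, y_test))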
Example #51
"""
Module used to test the plot functions
"""

import matplotlib
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn import datasets
from sklearn.model_selection import train_test_split

from hipe4ml import analysis_utils, plot_utils
from hipe4ml.model_handler import ModelHandler

# data preparation
DIGITS_DATA = datasets.load_digits(n_class=2)
DIGITS = pd.DataFrame(DIGITS_DATA.data[:, 0:10])  # pylint: disable=E1101
Y_DIGITS = DIGITS_DATA.target  # pylint: disable=E1101
SIG_DF = DIGITS[Y_DIGITS == 1]
BKG_DF = DIGITS[Y_DIGITS == 0]
TRAIN_SET, TEST_SET, Y_TRAIN, Y_TEST = train_test_split(DIGITS,
                                                        Y_DIGITS,
                                                        test_size=0.5,
                                                        random_state=42)
DATA = [TRAIN_SET, Y_TRAIN, TEST_SET, Y_TEST]
# --------------------------------------------

# training and testing
INPUT_MODEL = xgb.XGBClassifier()
MODEL = ModelHandler(INPUT_MODEL)
MODEL.train_test_model(DATA)
import numpy as np
import matplotlib.pylab as plt
import sklearn.datasets as skdata
import sklearn

numeros = skdata.load_digits()
target = numeros['target']
imagenes = numeros['images']
n_imagenes = len(target)

data = imagenes.reshape((n_imagenes, -1))
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
scaler = StandardScaler()
x_train, x_test, y_train, y_test = train_test_split(data,
                                                    target,
                                                    train_size=0.7)


def distancia(x_train, y_train):
    numero = 1
    dd = y_train == numero
    cov = np.cov(x_train[dd].T)
    valores, vectores = np.linalg.eig(cov)
    valores = np.real(valores)
    vectores = np.real(vectores)
    ii = np.argsort(-valores)
    valores = valores[ii]
    vectores = vectores[:, ii]

    # minimum-distance prediction
Example #53
            self.X_train['km_clust'] = y_labels_train
            self.X_test['km_clust'] = y_labels_test
        elif output == 'replace':
            self.X_train = y_labels_train[:, np.newaxis]
            self.X_test = y_labels_test[:, np.newaxis]
        else:
            raise ValueError('output should be either add or replace')
        return self


# Prediction using only K-means' clusters: find a number of clusters equal to
# the number of classes, replace the dataset by their assignation to cluster.
# This is not so bad (accuracy 78%) but way less powerful than SVMs for example.
# This is an approximation of the nearest-centroid classifier

cl = ClusteringModel(load_digits()).clusterize(output='add').classify()

print('X data:\n', cl.X_train)
print('y data:\n', cl.y_train)

# Prediction wthout using K-means' clusters: just use a SVMs on the digits
# dataset. This gives 99.07% accuracy.

ClusteringModel(load_digits()).classify()

# Prediction using K-means clusters and another classifier: add to the dataset their
# cluster of assignation. It does not seem to improve the model: this gives 98.8%
# accuracy. Causes may be:

# - the classifier already captured all the information and adding a new feature
# confuses him
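
The ClusteringModel class is only partially shown above; here is a self-contained sketch of the same idea (cluster ids appended as an extra feature before an SVM; the hyperparameters are assumptions):

import numpy as np
from sklearn.cluster import KMeans
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

X, y = load_digits(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

km = KMeans(n_clusters=10, random_state=0).fit(X_train)
X_train_aug = np.column_stack([X_train, km.labels_])          # output='add'
X_test_aug = np.column_stack([X_test, km.predict(X_test)])

svc = SVC().fit(X_train_aug, y_train)
print('SVC with cluster feature:', svc.score(X_test_aug, y_test))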
Example #54
from time import time

import numpy as np
from scipy import ndimage
from matplotlib import pyplot as plt

from sklearn import manifold, datasets

digits = datasets.load_digits(n_class=10)
X = digits.data
y = digits.target
n_samples, n_features = X.shape

np.random.seed(0)


def nudge_images(X, y):
    # Having a larger dataset shows more clearly the behavior of the
    # methods, but we multiply the size of the dataset only by 2, as the
    # cost of the hierarchical clustering methods are strongly
    # super-linear in n_samples
    shift = lambda x: ndimage.shift(
        x.reshape((8, 8)),
        .3 * np.random.normal(size=2),
        mode='constant',
    ).ravel()
    X = np.concatenate([X, np.apply_along_axis(shift, 1, X)])
    Y = np.concatenate([y, y], axis=0)
    return X, Y

Example #55
def test_inference_batch():
    model = train_model()
    X, _ = datasets.load_digits(return_X_y=True)
    predictions = model.predict(X[0:100])
    assert np.all(predictions < 10)
Example #56
def fit_model_to_initial_dataset(dataset_name,
                                 classifier,
                                 model_name,
                                 is_resized,
                                 is_grayscale,
                                 pca=None):
    if (dataset_name == 'mnist' or dataset_name == 'fashion_mnist'):
        digits = datasets.load_digits()
        if (is_resized):
            image_shape = (10, 10)
        else:
            image_shape = (28, 28)
        n_samples = len(digits.images)
        new_images = np.zeros((n_samples, ) + image_shape)
        #for i in range(len(digits.images)):
        #    new_images[i] = cv2.resize(digits.images[i], dsize=image_shape, interpolation=cv2.INTER_CUBIC)
        # data_images = digits.images.reshape((n_samples, -1))
        data_images = new_images.reshape((n_samples, -1))
        d_X, t_X, d_Y, t_Y = train_test_split(data_images, digits.target)

    elif (dataset_name == 'cifar-10'):
        train_X, train_Y = load_svhn(is_grayscale)
        print(train_X.shape, train_Y.shape)
        n_samples = train_X.shape[0]
        train_X_new = train_X.reshape((n_samples, -1))
        print(train_X_new.shape)
        d_X, t_X, d_Y, t_Y = train_test_split(train_X_new, train_Y)

    elif (dataset_name == 'sentiment_analysis'):
        #data_X, data_Y = np.empty((2, cv_X.shape[1])), np.empty((2, ))
        #index = 0
        #for i in range(cv_Y.shape[0]):
        #    if(cv_Y[i] == 0):
        #        data_X[index] = cv_X[i]
        #        data_Y[index] = cv_Y[i]
        #        index = index + 1
        #        break
        #if(index == 3):
        #    break
        #for i in range(cv_Y.shape[0]):
        #    if(cv_Y[i] == 1):
        #        data_X[index] = cv_X[i]
        #        data_Y[index] = cv_Y[i]
        #        index = index + 1
        #        break
        #if(index == 5):
        #    break
        #data_Y = data_Y.astype('int')
        #_, _, cv_X, cv_Y, _, _ = load_sentiment_dataset()
        #data_X = np.random.rand(2, cv_X.shape[1])
        data_X = np.random.rand(2, 768)
        data_Y = np.array([0, 1])
        classifier.fit(data_X, data_Y)
        return classifier

    if (model_name == 'svm' or model_name == 'knn'
            or model_name == 'naive_bayes'):
        if (pca != None):
            data = pca.fit_transform(d_X)
            #print(pca.explained_variance_ratio_)
            print(data.shape)
            #a = input()
            data = data[:2500]
            d_Y = d_Y[:2500]
        else:
            data = d_X[:2500]
            d_Y = d_Y[:2500]
        classifier.fit(data, d_Y)
    #elif(model_name == 'knn'):
    #    classifier.fit(d_X, d_Y)
    elif (model_name == "dt" or model_name == "lr"):
        d_X = d_X[:2500]
        d_Y = d_Y[:2500]
        classifier.train_model(d_X, d_Y, t_X, t_Y)
    ##print("The size of the Initial dataset on which Model A is trained is: ", d_X.shape)
    #print("Fitted model A to the initial dataset")
    return classifier
Example #57
__author__ = 'Xialei'
__date__ = '2018/4/22 16:10'

from time import time
import numpy as np
import matplotlib.pyplot as plt

from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.datasets import load_digits
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale

np.random.seed(42)  # random seed used to initialize the cluster centers

digits = load_digits()  # load the data
data = scale(digits.data)  # center and scale the data (z-score)

n_samples, n_features = data.shape  # 1797, 64
n_digits = len(np.unique(digits.target))  # 10
labels = digits.target  # ground-truth labels

sample_size = 300

print("n_digits: %d, \t n_samples %d, \t n_features %d" %
      (n_digits, n_samples, n_features))

print(79 * '_')
print('% 9s' % 'init'
      '    time  inertia    homo   compl  v-meas     ARI AMI  silhouette')
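
The snippet ends with the header line; the benchmark rows it announces come from a helper along the lines of the scikit-learn K-means example this code follows. A hedged reconstruction (not necessarily the original file's exact code):

def bench_k_means(estimator, name, data):
    t0 = time()
    estimator.fit(data)
    print('% 9s   %.2fs    %i   %.3f   %.3f   %.3f   %.3f   %.3f    %.3f'
          % (name, time() - t0, estimator.inertia_,
             metrics.homogeneity_score(labels, estimator.labels_),
             metrics.completeness_score(labels, estimator.labels_),
             metrics.v_measure_score(labels, estimator.labels_),
             metrics.adjusted_rand_score(labels, estimator.labels_),
             metrics.adjusted_mutual_info_score(labels, estimator.labels_),
             metrics.silhouette_score(data, estimator.labels_,
                                      metric='euclidean',
                                      sample_size=sample_size)))

bench_k_means(KMeans(init='k-means++', n_clusters=n_digits, n_init=10),
              name='k-means++', data=data)
bench_k_means(KMeans(init='random', n_clusters=n_digits, n_init=10),
              name='random', data=data)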
Example #58
def test_inference_sample():
    model = train_model()
    X, _ = datasets.load_digits(return_X_y=True)
    prediction = model.predict(X[0:1])[0]
    assert prediction == 0
Example #59
        if callback:
            callback(G, S, iter)

    if compute_err:
        logging.info("Violations of optimization objective: %d/%d " % (
            int(np.sum(np.diff(obj) > 0)), len(obj)))
    return G, S


if 0:
    # toy example for matrix completion
    rnds = np.random.RandomState(0)
    from sklearn import datasets

    R12 = datasets.load_digits(7).data
    obj_types = [0, 1, 2]
    obj_type2rank = {0: 100, 1: 200, 2:10}

    R12b = np.random.rand(R12.shape[0], R12.shape[1])
    R12b[R12b < 0.3] = 0

    # mask R12
    grid = np.indices(R12.shape)
    idx = list(zip(grid[0].ravel(), grid[1].ravel()))
    rnds.shuffle(idx)
    idxi, idxj = list(zip(*idx[:int(0.9 * R12.size)]))
    mR12 = np.ma.array(R12)
    mR12[idxi, idxj] = np.ma.masked
    print("R12 mask: %d" % mR12.mask.sum())
Example #60
import io
import sys
import base64

import matplotlib.pyplot as plt
import numpy as np
import sklearn
from sklearn import datasets
import kmapper as km

try:
    from scipy.misc import imsave, toimage
except ImportError as e:
    print(
        "imsave requires you to install pillow. Run `pip install pillow` and then try again."
    )
    sys.exit()

# Load digits data
data, labels = datasets.load_digits().data, datasets.load_digits().target

# Create images for a custom tooltip array
tooltip_s = []
for image_data in data:
    output = io.BytesIO()
    img = toimage(image_data.reshape(
        (8, 8)))  # Data was a flat row of 64 "pixels".
    img.save(output, format="PNG")
    contents = output.getvalue()
    img_encoded = base64.b64encode(contents)
    img_tag = """<img src="data:image/png;base64,{}">""".format(
        img_encoded.decode('utf-8'))
    tooltip_s.append(img_tag)
    output.close()