Example #1
# Helpers like assert_equal/assert_true come from older scikit-learn test utilities
# (sklearn.utils.testing); recent scikit-learn uses plain asserts instead.
def test_load_breast_cancer():
    res = load_breast_cancer()
    assert_equal(res.data.shape, (569, 30))
    assert_equal(res.target.size, 569)
    assert_equal(res.target_names.size, 2)
    assert_true(res.DESCR)

    # test return_X_y option
    X_y_tuple = load_breast_cancer(return_X_y=True)
    bunch = load_breast_cancer()
    assert_true(isinstance(X_y_tuple, tuple))
    assert_array_equal(X_y_tuple[0], bunch.data)
    assert_array_equal(X_y_tuple[1], bunch.target)
Example #2
import pandas as pd
from sklearn.datasets import load_breast_cancer
# `shuffle_dataframe` is a helper from the source package, not shown here.


def load_breast_cancer_df(include_tgt=True, tgt_name="target", shuffle=False):
    """Loads the breast cancer dataset into a dataframe with the
    target set as the "target" feature or whatever name
    is specified in ``tgt_name``.

    Parameters
    ----------

    include_tgt : bool, optional (default=True)
        Whether to include the target

    tgt_name : str, optional (default="target")
        The name of the target feature

    shuffle : bool, optional (default=False)
        Whether to shuffle the rows


    Returns
    -------

    X : pd.DataFrame, shape=(n_samples, n_features)
        The loaded dataset, with the target appended as an extra
        column when ``include_tgt`` is True
    """
    bc = load_breast_cancer()
    X = pd.DataFrame.from_records(data=bc.data, columns=bc.feature_names)

    if include_tgt:
        X[tgt_name] = bc.target

    return X if not shuffle else shuffle_dataframe(X)
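# A minimal usage sketch (my addition), assuming pandas and scikit-learn are
# installed; "diagnosis" is an arbitrary illustrative target name:
df = load_breast_cancer_df(tgt_name="diagnosis")
print(df.shape)                        # (569, 31): 30 features plus the target
print(df["diagnosis"].value_counts())  # 357 benign (1) vs. 212 malignant (0)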
Example #3
def test_RFECV():
    import xgboost as xgb
    from sklearn.datasets import load_boston  # removed in scikit-learn 1.2; this test targets older versions
    from sklearn.datasets import load_breast_cancer
    from sklearn.datasets import load_iris
    from sklearn.feature_selection import RFECV

    # Regression
    X, y = load_boston(return_X_y=True)
    # a regressor (not XGBClassifier) is required for the continuous Boston targets
    bst = xgb.XGBRegressor(booster='gblinear', learning_rate=0.1,
                           n_estimators=10, n_jobs=1,
                           objective='reg:squarederror',
                           random_state=0, verbosity=0)
    rfecv = RFECV(
        estimator=bst, step=1, cv=3, scoring='neg_mean_squared_error')
    rfecv.fit(X, y)

    # Binary classification
    X, y = load_breast_cancer(return_X_y=True)
    bst = xgb.XGBClassifier(booster='gblinear', learning_rate=0.1,
                            n_estimators=10, n_jobs=1,
                            objective='binary:logistic',
                            random_state=0, verbosity=0)
    rfecv = RFECV(estimator=bst, step=1, cv=3, scoring='roc_auc')
    rfecv.fit(X, y)

    # Multi-class classification
    X, y = load_iris(return_X_y=True)
    bst = xgb.XGBClassifier(base_score=0.4, booster='gblinear',
                            learning_rate=0.1,
                            n_estimators=10, n_jobs=1,
                            objective='multi:softprob',
                            random_state=0, reg_alpha=0.001, reg_lambda=0.01,
                            scale_pos_weight=0.5, verbosity=0)
    rfecv = RFECV(estimator=bst, step=1, cv=3, scoring='neg_log_loss')
    rfecv.fit(X, y)
Example #4
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier
# `validate_model` is a project-specific helper (grid search plus held-out scoring), defined elsewhere.


def main():
    dataset = datasets.load_breast_cancer()
    features = dataset.data
    labels = dataset.target
    
    train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size=0.3,
        stratify=labels)
    
    parameter_set = {'loss': ('hinge', 'squared_hinge'), 'C': [1, 10, 100, 1000, 5, 50, 500, 5000]}

    model = LinearSVC()
    grid_scores, best_score, best_params, test_score = validate_model(model=model, parameter_set=parameter_set,
        train_data=[train_features, train_labels], test_data=[test_features, test_labels])

    print(grid_scores)
    print('SVM best score: {}'.format(best_score))
    print('SVM best params : {}'.format(best_params))
    print('SVM test score : {}'.format(test_score))

    parameter_set = {'activation': ['identity', 'logistic', 'tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'batch_size': [16, 32, 64, 128],}

    model = MLPClassifier()

    grid_scores, best_score, best_params, test_score = validate_model(model=model, parameter_set=parameter_set,
        train_data=[train_features, train_labels], test_data=[test_features, test_labels])

    print(grid_scores)
    print('MLP best score: {}'.format(best_score))
    print('MLP best params : {}'.format(best_params))
    print('MLP test score : {}'.format(test_score))
Example #5
 def test_early_stopping(self):
     X, y = load_breast_cancer(return_X_y=True)
     params = {
         'objective': 'binary',
         'metric': 'binary_logloss',
         'verbose': -1
     }
     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
     lgb_train = lgb.Dataset(X_train, y_train)
     lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
     valid_set_name = 'valid_set'
     # no early stopping
     gbm = lgb.train(params, lgb_train,
                     num_boost_round=10,
                     valid_sets=lgb_eval,
                     valid_names=valid_set_name,
                     verbose_eval=False,
                     early_stopping_rounds=5)
     self.assertEqual(gbm.best_iteration, 10)
     self.assertIn(valid_set_name, gbm.best_score)
     self.assertIn('binary_logloss', gbm.best_score[valid_set_name])
     # early stopping occurs
     gbm = lgb.train(params, lgb_train,
                     valid_sets=lgb_eval,
                     valid_names=valid_set_name,
                     verbose_eval=False,
                     early_stopping_rounds=5)
     self.assertLessEqual(gbm.best_iteration, 100)
     self.assertIn(valid_set_name, gbm.best_score)
     self.assertIn('binary_logloss', gbm.best_score[valid_set_name])
Example #6
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.tree import DecisionTreeClassifier as SKDT
# ClassificationTree is assumed to come from the interpret package's glassbox module
from interpret.glassbox import ClassificationTree


def test_dt():
    cancer = load_breast_cancer()
    X, y = cancer.data, cancer.target
    feature_names = cancer.feature_names

    sk_dt = SKDT(random_state=1, max_depth=3)
    our_dt = ClassificationTree(feature_names=feature_names, random_state=1)

    sk_dt.fit(X, y)
    our_dt.fit(X, y)

    sk_pred = sk_dt.predict_proba(X)
    our_pred = our_dt.predict_proba(X)
    assert np.allclose(sk_pred, our_pred)

    sk_pred = sk_dt.predict(X)
    our_pred = our_dt.predict(X)
    assert np.allclose(sk_pred, our_pred)

    # With labels
    local_expl = our_dt.explain_local(X, y)
    local_viz = local_expl.visualize(0)
    assert local_viz is not None

    # Without labels
    local_expl = our_dt.explain_local(X)
    local_viz = local_expl.visualize(0)
    assert local_viz is not None

    global_expl = our_dt.explain_global()
    global_viz = global_expl.visualize()
    assert global_viz is not None
Example #7
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
# `SVM`, `utils`, and the upper-case constants (LEARNING_RATE, BATCH_SIZE, NUM_CLASSES)
# are project-specific and defined elsewhere in the source repository.


def main(arguments):
    # load the features of the dataset
    features = datasets.load_breast_cancer().data

    # standardize the features
    features = StandardScaler().fit_transform(features)

    # get the number of features
    num_features = features.shape[1]

    # load the corresponding labels for the features
    labels = datasets.load_breast_cancer().target

    # transform the labels to {-1, +1}
    labels[labels == 0] = -1

    # split the dataset to 70/30 partition: 70% train, 30% test
    train_features, test_features, train_labels, test_labels = train_test_split(features, labels,
                                                                                test_size=0.3, stratify=labels)

    train_size = train_features.shape[0]
    test_size = test_features.shape[0]

    # slice the dataset as per the batch size
    train_features = train_features[:train_size - (train_size % BATCH_SIZE)]
    train_labels = train_labels[:train_size - (train_size % BATCH_SIZE)]
    test_features = test_features[:test_size - (test_size % BATCH_SIZE)]
    test_labels = test_labels[:test_size - (test_size % BATCH_SIZE)]

    # instantiate the SVM class
    model = SVM(alpha=LEARNING_RATE, batch_size=BATCH_SIZE, svm_c=arguments.svm_c, num_classes=NUM_CLASSES,
                num_features=num_features)

    # train the instantiated model
    model.train(epochs=arguments.num_epochs, log_path=arguments.log_path, train_data=[train_features, train_labels],
                train_size=train_features.shape[0], validation_data=[test_features, test_labels],
                validation_size=test_features.shape[0], result_path=arguments.result_path)

    test_conf, test_accuracy = utils.plot_confusion_matrix(phase='testing', path=arguments.result_path,
                                                           class_names=['benign', 'malignant'])

    print('True negatives : {}'.format(test_conf[0][0]))
    print('False negatives : {}'.format(test_conf[1][0]))
    print('True positives : {}'.format(test_conf[1][1]))
    print('False positives : {}'.format(test_conf[0][1]))
    print('Testing accuracy : {}'.format(test_accuracy))
Example #8
 def test_binary(self):
     X, y = load_breast_cancer(return_X_y=True)
     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
     gbm = lgb.LGBMClassifier(n_estimators=50, silent=True)
     gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=5, verbose=False)
     ret = log_loss(y_test, gbm.predict_proba(X_test))
     self.assertLess(ret, 0.15)
     self.assertAlmostEqual(ret, gbm.evals_result_['valid_0']['binary_logloss'][gbm.best_iteration_ - 1], places=5)
Example #9
 def setUp(self):
     self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(*load_breast_cancer(return_X_y=True), test_size=0.1, random_state=1)
     self.train_data = lgb.Dataset(self.X_train, self.y_train)
     self.params = {
         "objective": "binary",
         "verbose": -1,
         "num_leaves": 3
     }
Example #10
 def load_binary_data(self, shuffled=True):
     samples = load_breast_cancer()
     if shuffled:
         # the same random_state produces the same permutation, so X and y stay aligned
         self.X = shuffle(samples.data, random_state=self.SEED)
         self.y = shuffle(samples.target, random_state=self.SEED)
     else:
         self.X, self.y = samples.data, samples.target
     self.n_features = len(self.X[0])
Example #11
 def test_binary(self):
     X_y = load_breast_cancer(return_X_y=True)
     params = {
         'objective': 'binary',
         'metric': 'binary_logloss'
     }
     evals_result, ret = template.test_template(params, X_y, log_loss)
     self.assertLess(ret, 0.15)
     self.assertAlmostEqual(min(evals_result['eval']['binary_logloss']), ret, places=5)
Example #12
import numpy as np
import xgboost as xgb
from sklearn import datasets, metrics
from sklearn.preprocessing import scale

num_rounds = 10  # module-level constant in the original source; value assumed here


def train_breast_cancer(param_in):
    data = datasets.load_breast_cancer()
    X = scale(data.data)
    dtrain = xgb.DMatrix(X, label=data.target)
    param = {'objective': 'binary:logistic'}
    param.update(param_in)
    bst = xgb.train(param, dtrain, num_rounds)
    xgb_pred = bst.predict(dtrain)
    xgb_score = metrics.accuracy_score(data.target, np.round(xgb_pred))
    assert xgb_score >= 0.8
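# Usage sketch (illustrative parameter sets, not from the original test):
train_breast_cancer({'max_depth': 2})
train_breast_cancer({'booster': 'gblinear'})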
Example #13
 def test_issues_161_and_189(self):
     """
     ensure DataManager(data).data == data
     """
     X, y = load_breast_cancer(return_X_y=True)
     X, y = X[15:40], y[15:40]
     model = KNeighborsClassifier(weights='distance', p=2, n_neighbors=10).fit(X, y)
     skater_model = InMemoryModel(model.predict_proba, examples=X, probability=True)
     assert skater_model.probability is True
     assert skater_model.model_type == StaticTypes.model_types.classifier
Example #14
def test_load_breast_cancer():
    res = load_breast_cancer()
    assert_equal(res.data.shape, (569, 30))
    assert_equal(res.target.size, 569)
    assert_equal(res.target_names.size, 2)
    assert_true(res.DESCR)
    assert_true(os.path.exists(res.filename))

    # test return_X_y option
    check_return_X_y(res, partial(load_breast_cancer))
Example #15
    def test_chunked_dataset(self):
        X_train, X_test, y_train, y_test = train_test_split(*load_breast_cancer(return_X_y=True), test_size=0.1, random_state=2)

        chunk_size = X_train.shape[0] // 10 + 1
        X_train = [X_train[i * chunk_size:(i + 1) * chunk_size, :] for i in range(X_train.shape[0] // chunk_size + 1)]
        X_test = [X_test[i * chunk_size:(i + 1) * chunk_size, :] for i in range(X_test.shape[0] // chunk_size + 1)]

        train_data = lgb.Dataset(X_train, label=y_train, params={"bin_construct_sample_cnt": 100})
        valid_data = train_data.create_valid(X_test, label=y_test, params={"bin_construct_sample_cnt": 100})

        train_data.construct()
        valid_data.construct()
Example #16
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
# `MLP` and the upper-case constants are project-specific, defined elsewhere in the source repository.


def main(arguments):
    # load the features of the dataset
    features = datasets.load_breast_cancer().data

    # standardize the features
    features = StandardScaler().fit_transform(features)

    # get the number of features
    num_features = features.shape[1]

    # load the labels for the features
    labels = datasets.load_breast_cancer().target

    train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size=0.30,
                                                                                stratify=labels)

    model = MLP(alpha=LEARNING_RATE, batch_size=BATCH_SIZE, node_size=NUM_NODES, num_classes=NUM_CLASSES,
                num_features=num_features)

    model.train(num_epochs=arguments.num_epochs, log_path=arguments.log_path, train_data=[train_features, train_labels],
                train_size=train_features.shape[0], test_data=[test_features, test_labels],
                test_size=test_features.shape[0], result_path=arguments.result_path)
Example #17
import numpy as np


def load_cancer_data():
    # clinical measurements of breast cancer tumors, for classification
    from sklearn.datasets import load_breast_cancer
    cancer = load_breast_cancer()  # cancer is a Bunch, a dict-like container

    # str.format() fills "replacement fields" surrounded by {}
    print("cancer.keys(): \n{}".format(cancer.keys()))
    print("shape of cancer data: {}".format(cancer.data.shape))
    print("sample counts per class:\n{}".format(
        {n:v for n, v in zip(cancer.target_names, np.bincount(cancer.target))}
    )) # bincount counts number of occurrences of each value in array of ints
    print("Feature names:\n{}".format(cancer.feature_names))
Example #18
from sklearn.datasets import (load_iris, load_digits, load_breast_cancer,
                              load_diabetes, fetch_openml)


def load_datasets():
    iris = load_iris()
    iris_X, iris_y = iris['data'], iris['target']
    digits = load_digits()
    digits_X, digits_y = digits['data'], digits['target']
    breast_cancer = load_breast_cancer()
    breast_cancer_X, breast_cancer_y = breast_cancer['data'], breast_cancer['target']
    diabetes = load_diabetes()
    diabetes_X, diabetes_y = diabetes['data'], diabetes['target']
    # fetch_mldata was removed from scikit-learn; fetch_openml is its replacement
    mnist = fetch_openml('mnist_784', version=1, as_frame=False, data_home='datasets/')
    mnist_X, mnist_y = mnist['data'], mnist['target']
    datasets = {
        'iris': ("Iris Plants Dataset", iris_X, iris_y),
        'digits': ("UCI ML hand-written digits dataset", digits_X, digits_y),
        'breast_cancer': ("Breast Cancer Wisconsin (Diagnostic) Dataset", breast_cancer_X, breast_cancer_y),
        'mnist': ("The MNIST database of handwritten digits", mnist_X, mnist_y),
    }
    return datasets
Example #19
import xgboost as xgb
from sklearn.datasets import load_breast_cancer


def train_cancer(param_in, comparison_tree_method):
    data = load_breast_cancer()
    dtrain = xgb.DMatrix(data.data, label=data.target)
    param = {}
    param['objective'] = 'binary:logistic'
    param.update(param_in)
    res_tmp = {}
    res = {}
    num_rounds = 10
    xgb.train(param, dtrain, num_rounds, [(dtrain, 'train')], evals_result=res_tmp)
    res[param['tree_method']] = res_tmp['train']['error']
    param["tree_method"] = comparison_tree_method
    xgb.train(param, dtrain, num_rounds, [(dtrain, 'train')], evals_result=res_tmp)
    res[comparison_tree_method] = res_tmp['train']['error']
    return res
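# Usage sketch (illustrative tree methods, not from the original test):
res = train_cancer({'tree_method': 'hist'}, 'approx')
print(res)  # per-round training error for each tree method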
Example #20
    def test(self):
        X_train, X_test, y_train, y_test = train_test_split(*load_breast_cancer(return_X_y=True), test_size=0.1, random_state=2)
        train_data = lgb.Dataset(X_train, label=y_train)
        valid_data = train_data.create_valid(X_test, label=y_test)

        params = {
            "objective": "binary",
            "metric": "auc",
            "min_data": 10,
            "num_leaves": 15,
            "verbose": -1,
            "num_threads": 1,
            "max_bin": 255
        }
        bst = lgb.Booster(params, train_data)
        bst.add_valid(valid_data, "valid_1")

        for i in range(30):
            bst.update()
            if i % 10 == 0:
                print(bst.eval_train(), bst.eval_valid())
        bst.save_model("model.txt")
        pred_from_matr = bst.predict(X_test)
        with tempfile.NamedTemporaryFile() as f:
            tname = f.name
        with open(tname, "w+b") as f:
            dump_svmlight_file(X_test, y_test, f)
        pred_from_file = bst.predict(tname)
        os.remove(tname)
        self.assertEqual(len(pred_from_matr), len(pred_from_file))
        for preds in zip(pred_from_matr, pred_from_file):
            self.assertAlmostEqual(*preds, places=15)

        # check saved model persistence
        bst = lgb.Booster(params, model_file="model.txt")
        pred_from_model_file = bst.predict(X_test)
        self.assertEqual(len(pred_from_matr), len(pred_from_model_file))
        for preds in zip(pred_from_matr, pred_from_model_file):
            # we need to check the consistency of model file here, so test for exact equal
            self.assertEqual(*preds)

        # check early stopping is working. Make it stop very early, so the scores should be very close to zero
        pred_parameter = {"pred_early_stop": True, "pred_early_stop_freq": 5, "pred_early_stop_margin": 1.5}
        pred_early_stopping = bst.predict(X_test, **pred_parameter)
        self.assertEqual(len(pred_from_matr), len(pred_early_stopping))
        for preds in zip(pred_early_stopping, pred_from_matr):
            # scores likely to be different, but prediction should still be the same
            self.assertEqual(preds[0] > 0, preds[1] > 0)
Example #21
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
# `NearestNeighbor` is a project-specific class, defined elsewhere in the source repository.


def main():
    dataset = datasets.load_breast_cancer()

    features = dataset.data
    labels = dataset.target

    num_features = features.shape[1]

    features = StandardScaler().fit_transform(features)

    train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size=0.3,
                                                                                stratify=labels)

    model = NearestNeighbor(train_features, train_labels, num_features)

    model.predict(test_features, test_labels, result_path='./results/nearest_neighbor/')
Example #22
    def test_contribs(self):
        X, y = load_breast_cancer(return_X_y=True)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
        params = {
            'objective': 'binary',
            'metric': 'binary_logloss',
            'verbose': -1,
        }
        lgb_train = lgb.Dataset(X_train, y_train)
        lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
        evals_result = {}
        gbm = lgb.train(params, lgb_train,
                        num_boost_round=20,
                        valid_sets=lgb_eval,
                        verbose_eval=False,
                        evals_result=evals_result)

        # raw scores should equal the row-sums of the per-feature contributions
        self.assertLess(np.linalg.norm(gbm.predict(X_test, raw_score=True) -
                                       np.sum(gbm.predict(X_test, pred_contrib=True), axis=1)), 1e-4)
Example #23
    def test_plot_metrics(self):
        X_train, X_test, y_train, y_test = train_test_split(*load_breast_cancer(return_X_y=True), test_size=0.1, random_state=1)
        train_data = lgb.Dataset(X_train, y_train)
        test_data = lgb.Dataset(X_test, y_test, reference=train_data)

        params = {
            "objective": "binary",
            "metric": {"binary_logloss", "binary_error"},
            "verbose": -1,
            "num_leaves": 3
        }

        evals_result0 = {}
        gbm0 = lgb.train(params, train_data,
                         valid_sets=[train_data, test_data],
                         valid_names=['v1', 'v2'],
                         num_boost_round=10,
                         evals_result=evals_result0,
                         verbose_eval=False)
        ax0 = lgb.plot_metric(evals_result0)
        self.assertIsInstance(ax0, matplotlib.axes.Axes)
        self.assertEqual(ax0.get_title(), 'Metric during training')
        self.assertEqual(ax0.get_xlabel(), 'Iterations')
        self.assertIn(ax0.get_ylabel(), {'binary_logloss', 'binary_error'})
        ax0 = lgb.plot_metric(evals_result0, metric='binary_error')
        ax0 = lgb.plot_metric(evals_result0, metric='binary_logloss', dataset_names=['v2'])

        evals_result1 = {}
        gbm1 = lgb.train(params, train_data,
                         num_boost_round=10,
                         evals_result=evals_result1,
                         verbose_eval=False)
        self.assertRaises(ValueError, lgb.plot_metric, evals_result1)

        gbm2 = lgb.LGBMClassifier(n_estimators=10, num_leaves=3, silent=True)
        gbm2.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)
        ax2 = lgb.plot_metric(gbm2, title=None, xlabel=None, ylabel=None)
        self.assertIsInstance(ax2, matplotlib.axes.Axes)
        self.assertEqual(ax2.get_title(), '')
        self.assertEqual(ax2.get_xlabel(), '')
        self.assertEqual(ax2.get_ylabel(), '')
Example #24
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
# `GruSvm` and the upper-case constants (BATCH_SIZE, CELL_SIZE, DROPOUT_RATE, ...) are project-specific.


def main():
    dataset = datasets.load_breast_cancer()

    features = dataset.data

    features = StandardScaler().fit_transform(features)

    num_features = features.shape[1]

    labels = dataset.target

    labels[labels == 0] = -1

    train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size=0.3,
                                                                                stratify=labels)

    train_size = train_features.shape[0]
    test_size = test_features.shape[0]

    # slice the dataset to be exact as per the batch size
    # e.g. train_size = 1898322, batch_size = 256
    # [:1898322-(1898322%256)] = [:1898240]
    # 1898322 // 256 = 7415; 7415 * 256 = 1898240
    train_features = train_features[:train_size - (train_size % BATCH_SIZE)]
    train_labels = train_labels[:train_size - (train_size % BATCH_SIZE)]

    # modify the size of the dataset to be passed on model.train()
    train_size = train_features.shape[0]

    # slice the dataset to be exact as per the batch size
    test_features = test_features[:test_size - (test_size % BATCH_SIZE)]
    test_labels = test_labels[:test_size - (test_size % BATCH_SIZE)]

    test_size = test_features.shape[0]

    model = GruSvm(alpha=LEARNING_RATE, batch_size=BATCH_SIZE, cell_size=CELL_SIZE, dropout_rate=DROPOUT_RATE,
                   num_classes=NUM_CLASSES, sequence_length=num_features, svm_c=SVM_C)

    model.train(checkpoint_path='./checkpoint_path/gru_svm/', log_path='./log_path/gru_svm/', model_name='gru_svm',
                epochs=3000, train_data=[train_features, train_labels], train_size=train_size,
                validation_data=[test_features, test_labels], validation_size=test_size, result_path='./results')
Example #25
 def test_binary(self):
     X, y = load_breast_cancer(return_X_y=True)
     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
     params = {
         'objective': 'binary',
         'metric': 'binary_logloss',
         'verbose': -1,
         'num_iteration': 50  # test num_iteration in dict here
     }
     lgb_train = lgb.Dataset(X_train, y_train)
     lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
     evals_result = {}
     gbm = lgb.train(params, lgb_train,
                     num_boost_round=20,
                     valid_sets=lgb_eval,
                     verbose_eval=False,
                     evals_result=evals_result)
     ret = log_loss(y_test, gbm.predict(X_test))
     self.assertLess(ret, 0.15)
     self.assertEqual(len(evals_result['valid_0']['binary_logloss']), 50)
     self.assertAlmostEqual(evals_result['valid_0']['binary_logloss'][-1], ret, places=5)
Example #26
    def test_plot_importance(self):
        X_train, _, y_train, _ = train_test_split(*load_breast_cancer(return_X_y=True), test_size=0.1, random_state=1)
        train_data = lgb.Dataset(X_train, y_train)

        params = {
            "objective": "binary",
            "verbose": -1,
            "num_leaves": 3
        }
        gbm0 = lgb.train(params, train_data, num_boost_round=10)
        ax0 = lgb.plot_importance(gbm0)
        self.assertIsInstance(ax0, matplotlib.axes.Axes)
        self.assertEqual(ax0.get_title(), 'Feature importance')
        self.assertEqual(ax0.get_xlabel(), 'Feature importance')
        self.assertEqual(ax0.get_ylabel(), 'Features')
        self.assertLessEqual(len(ax0.patches), 30)

        gbm1 = lgb.LGBMClassifier(n_estimators=10, num_leaves=3, silent=True)
        gbm1.fit(X_train, y_train)

        ax1 = lgb.plot_importance(gbm1, color='r', title='t', xlabel='x', ylabel='y')
        self.assertIsInstance(ax1, matplotlib.axes.Axes)
        self.assertEqual(ax1.get_title(), 't')
        self.assertEqual(ax1.get_xlabel(), 'x')
        self.assertEqual(ax1.get_ylabel(), 'y')
        self.assertLessEqual(len(ax1.patches), 30)
        for patch in ax1.patches:
            self.assertTupleEqual(patch.get_facecolor(), (1., 0, 0, 1.))  # red

        ax2 = lgb.plot_importance(gbm0, color=['r', 'y', 'g', 'b'],
                                  title=None, xlabel=None, ylabel=None)
        self.assertIsInstance(ax2, matplotlib.axes.Axes)
        self.assertEqual(ax2.get_title(), '')
        self.assertEqual(ax2.get_xlabel(), '')
        self.assertEqual(ax2.get_ylabel(), '')
        self.assertLessEqual(len(ax2.patches), 30)
        self.assertTupleEqual(ax2.patches[0].get_facecolor(), (1., 0, 0, 1.))  # r
        self.assertTupleEqual(ax2.patches[1].get_facecolor(), (.75, .75, 0, 1.))  # y
        self.assertTupleEqual(ax2.patches[2].get_facecolor(), (0, .5, 0, 1.))  # g
        self.assertTupleEqual(ax2.patches[3].get_facecolor(), (0, 0, 1., 1.))  # b
Example #27
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
# `LinearRegression` here is a project-specific TensorFlow model, not sklearn's.


def main():
    dataset = datasets.load_breast_cancer()

    features = dataset.data

    features = StandardScaler().fit_transform(features)

    num_features = features.shape[1]

    labels = dataset.target

    train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size=0.3,
                                                                                stratify=labels)

    train_size = train_features.shape[0]
    test_size = test_features.shape[0]

    # slice the dataset to be exact as per the batch size
    # e.g. train_size = 1898322, batch_size = 256
    # [:1898322-(1898322%256)] = [:1898240]
    # 1898322 // 256 = 7415; 7415 * 256 = 1898240
    train_features = train_features[:train_size - (train_size % BATCH_SIZE)]
    train_labels = train_labels[:train_size - (train_size % BATCH_SIZE)]

    # modify the size of the dataset to be passed on model.train()
    train_size = train_features.shape[0]

    # slice the dataset to be exact as per the batch size
    test_features = test_features[:test_size - (test_size % BATCH_SIZE)]
    test_labels = test_labels[:test_size - (test_size % BATCH_SIZE)]

    test_size = test_features.shape[0]

    model = LinearRegression(alpha=LEARNING_RATE, batch_size=BATCH_SIZE, num_classes=NUM_CLASSES,
                             sequence_length=num_features)

    model.train(epochs=3000, log_path='./log_path/linear_regression/', train_data=[train_features, train_labels],
                train_size=train_size, validation_data=[test_features, test_labels], validation_size=test_size,
                result_path='./results/linear_regression/')
Example #28
    def test(self):
        X_train, X_test, y_train, y_test = train_test_split(*load_breast_cancer(return_X_y=True), test_size=0.1, random_state=1)
        train_data = lgb.Dataset(X_train, max_bin=255, label=y_train)
        valid_data = train_data.create_valid(X_test, label=y_test)

        params = {
            "objective": "binary",
            "metric": "auc",
            "min_data": 1,
            "num_leaves": 15,
            "verbose": -1
        }
        bst = lgb.Booster(params, train_data)
        bst.add_valid(valid_data, "valid_1")

        for i in range(30):
            bst.update()
            if i % 10 == 0:
                print(bst.eval_train(), bst.eval_valid())
        bst.save_model("model.txt")
        pred_from_matr = bst.predict(X_test)
        with tempfile.NamedTemporaryFile() as f:
            tname = f.name
        with open(tname, "w+b") as f:
            np.savetxt(f, X_test, delimiter=',')
        pred_from_file = bst.predict(tname)
        os.remove(tname)
        self.assertEqual(len(pred_from_matr), len(pred_from_file))
        for preds in zip(pred_from_matr, pred_from_file):
            self.assertAlmostEqual(*preds, places=15)
        # check saved model persistence
        bst = lgb.Booster(params, model_file="model.txt")
        pred_from_model_file = bst.predict(X_test)
        self.assertEqual(len(pred_from_matr), len(pred_from_model_file))
        for preds in zip(pred_from_matr, pred_from_model_file):
            self.assertEqual(*preds)
        # check pmml
        os.system('python ../../pmml/pmml.py model.txt')
Example #29
def test_XGBClassifier_resume():
    from sklearn.datasets import load_breast_cancer
    from sklearn.metrics import log_loss

    with TemporaryDirectory() as tempdir:
        model1_path = os.path.join(tempdir, 'test_XGBClassifier.model')
        model1_booster_path = os.path.join(tempdir, 'test_XGBClassifier.booster')

        X, Y = load_breast_cancer(return_X_y=True)

        model1 = xgb.XGBClassifier(learning_rate=0.3, seed=0, n_estimators=8)
        model1.fit(X, Y)

        pred1 = model1.predict(X)
        log_loss1 = log_loss(Y, pred1)  # (y_true, y_pred) argument order

        # file name of stored xgb model
        model1.save_model(model1_path)
        model2 = xgb.XGBClassifier(learning_rate=0.3, seed=0, n_estimators=8)
        model2.fit(X, Y, xgb_model=model1_path)

        pred2 = model2.predict(X)
        log_loss2 = log_loss(Y, pred2)

        assert np.any(pred1 != pred2)
        assert log_loss1 > log_loss2

        # file name of 'Booster' instance Xgb model
        model1.get_booster().save_model(model1_booster_path)
        model2 = xgb.XGBClassifier(learning_rate=0.3, seed=0, n_estimators=8)
        model2.fit(X, Y, xgb_model=model1_booster_path)

        pred2 = model2.predict(X)
        log_loss2 = log_loss(Y, pred2)

        assert np.any(pred1 != pred2)
        assert log_loss1 > log_loss2
Example #30
import numpy as np
from sklearn import datasets, naive_bayes
from sklearn.model_selection import cross_val_score


def to_file(answer, num):
    with open('bayes_answer' + str(num) + '.txt', 'w') as fout:
        fout.write(str(answer))


def get_ME(X, Y):
    # mean cross-validated accuracy for each naive Bayes variant
    # (sklearn.cross_validation was removed; model_selection replaces it)
    return (cross_val_score(naive_bayes.BernoulliNB(), X, Y).mean(),
            cross_val_score(naive_bayes.MultinomialNB(), X, Y).mean(),
            cross_val_score(naive_bayes.GaussianNB(), X, Y).mean())


def terminate(X, Y):
    m1, m2, m3 = get_ME(X, Y)
    print('BernoulliNB: {0}\n'.format(m1), 'MultinomialNB: {0}\n'.format(m2),
          'GaussianNB: {0}'.format(m3))
    return max(m1, m2, m3)


if __name__ == '__main__':
    pack = [datasets.load_digits(), datasets.load_breast_cancer()]
    X, Y, M, N = pack[0].data, pack[0].target, pack[1].data, pack[1].target
    to_file(terminate(M, N), 1)
    to_file(terminate(X, Y), 2)
    to_file('3 4', 3)
Example #31
def experiments(config_file):
    args = get_args_parser().parse_args(['@' + config_file])

    # Set seed
    np.random.seed(int(args.seed))

    # Construct output directory
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    outdir = args.outdir + timestamp + '/'

    # Create results directory
    outdir_path = Path(outdir)
    if not outdir_path.is_dir():
        os.makedirs(outdir)

    # Logging
    logfile = outdir + 'log.txt'
    log(logfile, "Directory " + outdir + " created.")

    # Set dataset
    n_samples = 1500
    noisy_circles = make_circles(n_samples=n_samples, factor=.5, noise=.05)

    # Set plot settings
    plt.figure(figsize=(7 * 2 + 6, 12.5))
    plt.subplots_adjust(left=.02,
                        right=.98,
                        bottom=.001,
                        top=.96,
                        wspace=.05,
                        hspace=.01)
    plt.style.use('dark_background')
    plot_num = 1

    datasets = ((3, load_iris(return_X_y=True), "Iris"),
                (2, load_breast_cancer(return_X_y=True), "Breast Cancer"),
                (2, noisy_circles, "Noisy Circles"))

    # Traverse datasets
    # High-level abstraction is from https://scikit-learn.org/stable/modules/clustering.html
    for i, (n_clusters, dataset, dataset_name) in enumerate(datasets):
        X, y = dataset

        # Normalization of features for easier parameter selection
        X = StandardScaler().fit_transform(X)

        connectivity = kneighbors_graph(X, n_neighbors=10, include_self=False)
        # connectivity = 0.5 * (connectivity + connectivity.T)  # Make connectivity symmetric

        average_linkage = cluster.AgglomerativeClustering(
            linkage="average",
            affinity="cityblock",
            n_clusters=n_clusters,
            connectivity=connectivity)

        ward_linkage = cluster.AgglomerativeClustering(linkage="ward",
                                                       n_clusters=n_clusters)

        complete_linkage = cluster.AgglomerativeClustering(
            linkage="complete", n_clusters=n_clusters)

        single_linkage = cluster.AgglomerativeClustering(linkage="single",
                                                         n_clusters=n_clusters)

        k_means = cluster.KMeans(n_clusters=n_clusters)

        gaussian_mixture = mixture.GaussianMixture(n_components=n_clusters,
                                                   covariance_type='full')

        # Set techniques
        techniques = (
            ('Agglomerative Avg', average_linkage),
            ('Agglomerative Single', single_linkage),
            ('Agglomerative Complete', complete_linkage),
            ('Agglomerative Ward', ward_linkage),
            ('kMeans', k_means),
            ('GaussianMixture', gaussian_mixture),
        )

        for name, technique in techniques:
            log(logfile, dataset_name + ", " + name)

            time_start = time.time()

            # Catch warnings related to kneighbors_graph
            with warnings.catch_warnings():
                warnings.filterwarnings(
                    "ignore",
                    message="the number of connected components of the " +
                    "connectivity matrix is [0-9]{1,2}" +
                    " > 1. Completing it to avoid stopping the tree early.",
                    category=UserWarning)
                warnings.filterwarnings(
                    "ignore",
                    message="Graph is not fully connected, spectral embedding"
                    + " may not work as expected.",
                    category=UserWarning)
                technique.fit(X)

            time_stop = time.time()

            # Predictions
            if hasattr(technique, 'labels_'):
                y_pred = technique.labels_.astype(int)  # np.int was removed from NumPy
            else:
                y_pred = technique.predict(X)

            # Entropy metric
            true_cluster_labels = [
                y[get_cluster_indices(c, y_pred)] for c in range(n_clusters)
            ]
            overall_entropy = get_overall_entropy(true_cluster_labels,
                                                  y.shape[0])

            # F-Score metric
            f1_score = metrics.f1_score(y, y_pred, average='weighted')

            log(logfile,
                "\tOverall entropy: " + str(round(overall_entropy, 3)))
            log(logfile, "\tF1 Score: " + str(round(f1_score, 3)))

            # Plotting
            plt.subplot(len(datasets), len(techniques), plot_num)
            if i == 0:
                plt.title("{}".format(name), size=15)

            colors = np.array(list(islice(cycle(['#377eb8', '#ff7f00', '#4daf4a']),
                                          int(max(y_pred) + 1))))
            colors = np.append(colors, ["#000000"])  # add black for outliers (if any)
            plt.scatter(X[:, 0],
                        X[:, 1],
                        s=10,
                        color=colors[y_pred],
                        alpha=0.60)

            plt.xlim(-2.5, 2.5)
            plt.ylim(-2.5, 2.5)
            plt.xticks(())
            plt.yticks(())

            plt.text(.15,
                     .01, ('%.2fs' % (time_stop - time_start)).lstrip('0'),
                     transform=plt.gca().transAxes,
                     size=15,
                     horizontalalignment='right')

            plt.text(.99,
                     .07, ('%.2f' % (overall_entropy)).lstrip('0'),
                     transform=plt.gca().transAxes,
                     size=15,
                     horizontalalignment='right')
            plt.text(.99,
                     .01, ('%.2f' % (f1_score)).lstrip('0'),
                     transform=plt.gca().transAxes,
                     size=15,
                     horizontalalignment='right')

            plot_num += 1

    # Plotting
    plt.savefig(outdir + 'plot.png', bbox_inches='tight')
Example #32
import numpy as np
from sklearn import datasets


def entropy(y):
    unique, counts = np.unique(y, return_counts=True)
    ps = counts / len(y)
    return -np.sum([p * np.log2(p) for p in ps if p > 0])


class Node:
    def __init__(self,
                 feature=None,
                 threshold=None,
                 left=None,
                 right=None,
                 *,
                 value=None):
        self.feature = feature
        self.threshold = threshold
        self.left = left
        self.right = right
        self.value = value

    def is_leaf_node(self):
        return self.value is not None


# `DecisionTree` is the user's own class (built on Node and entropy above); its definition is truncated here.
data = datasets.load_breast_cancer(return_X_y=True)  # (X, y) tuple
data = np.column_stack((data[0], data[1]))  # stack into one (n_samples, n_features + 1) array

decisionTree = DecisionTree(data, max_depth=10)
decisionTree.fit()
decisionTree.predict()
Example #33
import numpy as np
from sklearn import datasets


def sigmoid(soma):
    return 1 / (1 + np.exp(-soma))


def sigmoidDerivada(sig):
    return sig * (1 - sig)


base = datasets.load_breast_cancer()  # load the dataset
entradas = base.data  # dataset features used as inputs
valoresSaida = base.target  # dataset targets used as outputs
saidas = np.empty([569, 1], dtype=int)  # create an empty column array
for i in range(569):
    saidas[i] = valoresSaida[i]  # fill it with the dataset targets

# 30 input attributes x 5 hidden-layer neurons
pesos0 = 2 * np.random.random((30, 5)) - 1
# 5 hidden-layer weights x 1 output neuron (binary classes)
pesos1 = 2 * np.random.random((5, 1)) - 1

epocas = 10000
taxaAprendizagem = 0.3
momento = 1
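# The snippet ends before the training loop. A minimal sketch (my addition) of the
# two-layer backprop loop these definitions set up; the momentum/learning-rate
# update follows the usual recipe, and the variable names below are my own:
for epoch in range(epocas):
    # forward pass
    camadaOculta = sigmoid(np.dot(entradas, pesos0))
    camadaSaida = sigmoid(np.dot(camadaOculta, pesos1))

    # backpropagate the output error
    erro = saidas - camadaSaida
    deltaSaida = erro * sigmoidDerivada(camadaSaida)
    deltaOculta = deltaSaida.dot(pesos1.T) * sigmoidDerivada(camadaOculta)

    # momentum-weighted gradient updates
    pesos1 = pesos1 * momento + camadaOculta.T.dot(deltaSaida) * taxaAprendizagem
    pesos0 = pesos0 * momento + entradas.T.dot(deltaOculta) * taxaAprendizagem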
Example #34
from tensorflow.keras import regularizers  #This allows using whichever regularizer we want (l1,l2,l1_l2)
from tensorflow.keras.utils import to_categorical  #This allows using categorical cross entropy as the cost function
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split as splitter
from sklearn.datasets import load_breast_cancer
import pickle
import os

# %%
"""Load breast cancer dataset"""

np.random.seed(0)  # fix the seed so random numbers are reproducible

cancer = load_breast_cancer()  #Download breast cancer dataset

inputs = cancer.data  #Feature matrix of 569 rows (samples) and 30 columns (parameters)
outputs = cancer.target  #Label array of 569 rows (0 for benign and 1 for malignant)
labels = cancer.feature_names[0:30]

print('The content of the breast cancer dataset is:')  # print information about the dataset
print(labels)
print('-------------------------')
print("inputs =  " + str(inputs.shape))
print("outputs =  " + str(outputs.shape))
print("labels =  " + str(labels.shape))

x = inputs  #Reassign the Feature and Label matrices to other variables
y = outputs
Example #35
import argparse

from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

ap = argparse.ArgumentParser()  # parser construction elided in the original snippet
# argument used to choose which classification model to run
ap.add_argument("-m", "--model", type=str, default="knn",
                help="type of python machine learning model to use")
args = vars(ap.parse_args())

# dictionary of the available models
models = {
    "knn": KNeighborsClassifier(n_neighbors=1),
    "naive_bayes": GaussianNB(),
    "logit": LogisticRegression(solver="lbfgs", multi_class="auto"),
    "svm": SVC(kernel="rbf", gamma="auto"),
    "decision_tree": DecisionTreeClassifier(),
    "pct": Perceptron(),
    "random_forest": RandomForestClassifier(n_estimators=100),
    "mlp": MLPClassifier()
    }

# load the breast cancer dataset and split it: 25% test, 75% train
print("[INFO] loading data...")
dataset = load_breast_cancer()
(trainX, testX, trainY, testY) = train_test_split(dataset.data, dataset.target, random_state=3, test_size=0.25)

# train using the model chosen by the user
print("[INFO] using '{}' model".format(args["model"]))
model = models[args['model']]
model.fit(trainX,trainY)

# evaluate the model on each row of the test set
print("[INFO] evaluating...")
predictions = model.predict(testX)
print(classification_report(testY, predictions, target_names=dataset.target_names))
Example #36
import numpy as np
from sklearn import datasets
from qn2 import five_number_summary  # qn2 is a local module from the original exercise
from sklearn.preprocessing import MinMaxScaler


def normalize_minmax(data):
    # only 2D arrays are supported (ndim is the idiomatic check)
    if data.ndim != 2:
        return None
    # column-wise min-max scaling; equivalent to MinMaxScaler().fit_transform(data)
    ls = []
    for i in range(data.shape[1]):
        col = data[:, i]
        col = (col - np.min(col)) / (np.max(col) - np.min(col))
        ls.append(col)
    return np.array(ls).transpose()


bunchobject = datasets.load_breast_cancer()
cols = [1, 7]
some_columns = bunchobject.data[:, cols]
snorm = normalize_minmax(some_columns)
#print("normalized",five_number_summary(snorm))
Example #37
"""
Created on Sat Sep 19 19:47:14 2020

@author: https://www.codigofluente.com.br/aula-04-instalando-o-pandas/
"""

from __future__ import division, print_function
import skfuzzy as fuzz
from sklearn.datasets import load_breast_cancer
import numpy as np
from sklearn.model_selection import train_test_split

colors = ['b', 'orange', 'g', 'r', 'c', 'm', 'y', 'k', 'Brown', 'ForestGreen']

# load the breast cancer dataset (the variable keeps the name `iris` from the tutorial this was adapted from)
iris = load_breast_cancer()

n_testes_uteis = 0
numeroDeTestes = 100
desempenho = 0
for cont in range(numeroDeTestes):

    ncenters = 2

    #random train/test split (50% each)
    X_treino, X_teste, y_treino, y_teste = train_test_split(iris.data,
                                                            iris.target,
                                                            test_size=0.5)

    #alternative: train/test split by index parity (50% each)
    #X_treino, X_teste, y_treino, y_teste = iris.data[0:][::2], iris.data[1:][::2], iris.target[0:][::2], iris.target[1:][::2]
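    # The loop body is truncated here. A sketch (my addition) of the fuzzy c-means
    # step it sets up, using the documented skfuzzy API; m=2, error=0.005 and
    # maxiter=1000 are typical values, not taken from the original:
    cntr, u, _, _, _, _, _ = fuzz.cluster.cmeans(
        X_treino.T, ncenters, 2, error=0.005, maxiter=1000)
    u_teste, _, _, _, _, _ = fuzz.cluster.cmeans_predict(
        X_teste.T, cntr, 2, error=0.005, maxiter=1000)
    previsoes = u_teste.argmax(axis=0)
    # cluster ids are arbitrary, so score the better of the two labelings
    acerto = max((previsoes == y_teste).mean(), (previsoes != y_teste).mean())
    desempenho += acerto
    n_testes_uteis += 1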
"""
Created on 2021/1/5 11:09

@author: Irvinfaith

@email: [email protected]
"""
from core.base_neural_network import NeuralNetwork
from core.optimizer import *

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# load the sample data
data_loader = load_breast_cancer()
data = data_loader['data']
# min-max normalize the features
mms = MinMaxScaler()
data = mms.fit_transform(data)
# split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(data,
                                                    data_loader['target'],
                                                    test_size=0.3,
                                                    random_state=101)

# input layer (30 features)
nn = NeuralNetwork(30, True)
# add fully connected layers, specifying neuron count and activation for each
nn.add_dense_layer(64, "sigmoid")
nn.add_dense_layer(32, "sigmoid")
Example #39
from sklearn.datasets import load_breast_cancer
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor

cancer = load_breast_cancer()

x = cancer.data
y = cancer.target

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size = 0.2, shuffle = True, random_state = 1)


# pick one of the candidate models (RandomForestClassifier is uncommented so the script runs)
#model = KNeighborsClassifier()
#model = SVC()
#model = LinearSVC()
#model = KNeighborsRegressor()
model = RandomForestClassifier()
#model = RandomForestRegressor()


#3. run

model.fit(x_train, y_train)

#4. evaluate and predict

y_predict = model.predict(x_test)
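# A natural continuation (my addition): score the predictions with the
# already-imported accuracy_score.
print('accuracy:', accuracy_score(y_test, y_predict))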
Example #40
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
from collections import defaultdict
import numpy as np
from scipy.stats import spearmanr
from scipy.cluster import hierarchy

from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance

import pandas as pd

data = load_breast_cancer()
data.target[[10, 50, 85]]  # fancy-indexing demo from the sklearn docs; the result is unused

X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

clf = DecisionTreeClassifier(random_state=0)
path = clf.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas, impurities = path.ccp_alphas, path.impurities

fig, ax = plt.subplots()
ax.plot(ccp_alphas[:-1], impurities[:-1], marker='o', drawstyle="steps-post")
ax.set_xlabel("effective alpha")
ax.set_ylabel("total impurity of leaves")
ax.set_title("Total Impurity vs effective alpha for training set")
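# The sklearn pruning example typically continues by fitting one tree per alpha;
# a condensed sketch (my addition):
clfs = [DecisionTreeClassifier(random_state=0, ccp_alpha=a).fit(X_train, y_train)
        for a in ccp_alphas]
test_scores = [c.score(X_test, y_test) for c in clfs]
print("best alpha:", ccp_alphas[int(np.argmax(test_scores))])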
Example #41
import pickle

import pandas as pd
from sklearn.datasets import load_iris, load_digits, load_breast_cancer


def download_datasets():

    i = 0

    i += 1
    print("%d of 10" % i)
    print("Iris\n")
    iris = load_iris()
    X1 = iris.data
    Y1 = iris.target

    i += 1
    print("%d of 10" % i)
    print("Digits\n")
    digits = load_digits()
    X2 = digits.data
    Y2 = digits.target

    i += 1
    print("%d of 10" % i)
    print("Breast cancer\n")
    breast_cancer = load_breast_cancer()
    X3 = breast_cancer.data
    Y3 = breast_cancer.target

    i += 1
    print("%d of 10" % i)
    print("Sensorless drive diagnosis\n")
    Sensorless_data = pd.read_csv(
        'https://archive.ics.uci.edu/ml/machine-learning-databases/00325/Sensorless_drive_diagnosis.txt',
        sep=' ',
        header=None)
    X4 = Sensorless_data.values[:, 0:48]
    Y4 = Sensorless_data.values[:, 48]

    i += 1
    print("%d of 10" % i)
    print("Banknote authentication\n")
    banknote_data = pd.read_csv(
        'https://archive.ics.uci.edu/ml/machine-learning-databases/00267/data_banknote_authentication.txt',
        sep=',',
        header=None)
    X5 = banknote_data.values[:, 0:4]
    Y5 = banknote_data.values[:, 4]

    i += 1
    print("%d of 10" % i)
    print("Balance\n")
    balance_data = pd.read_csv(
        'https://archive.ics.uci.edu/ml/machine-learning-databases/balance-scale/balance-scale.data',
        sep=',',
        header=None)
    X6 = balance_data.values[:, 1:5]
    Y6 = balance_data.values[:, 0]

    i += 1
    print("%d of 10" % i)
    print("Wifi localization\n")
    wifi_data = pd.read_csv(
        'https://archive.ics.uci.edu/ml/machine-learning-databases/00422/wifi_localization.txt',
        sep='\s+',
        header=None)
    X7 = wifi_data.values[:, 0:7]
    Y7 = wifi_data.values[:, 7]

    i += 1
    print("%d of 10" % i)
    print("CMC\n")
    cmc_data = pd.read_csv(
        'https://archive.ics.uci.edu/ml/machine-learning-databases/cmc/cmc.data',
        sep=',',
        header=None)
    X8 = cmc_data.values[:, 0:9]
    Y8 = cmc_data.values[:, 9]

    i += 1
    print("%d of 10" % i)
    print("Yeast\n")
    yeast_data = pd.read_csv(
        'https://archive.ics.uci.edu/ml/machine-learning-databases/yeast/yeast.data',
        sep='\s+',
        header=None)
    X9 = yeast_data.values[:, 1:9]
    Y9 = yeast_data.values[:, 9]

    i += 1
    print("%d of 10" % i)
    print("Abalone\n")
    abalone_data = pd.read_csv(
        'https://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data',
        sep=',',
        header=None)
    X10 = abalone_data.values[:, 0:8]
    # encode the sex column (M/F/I) as integers; a fresh loop variable avoids
    # clobbering the "%d of 10" counter `i` used above
    for row in X10:
        if row[0] == 'M':
            row[0] = 1
        elif row[0] == 'F':
            row[0] = 2
        else:
            row[0] = 3
    Y10 = abalone_data.values[:, 8].astype(int)

    print("Saving the datasets for future use.")
    pickle.dump((X1, X2, X3, X4, X5, X6, X7, X8, X9, X10),
                open('datasets_x.pkl', 'wb'))
    pickle.dump((Y1, Y2, Y3, Y4, Y5, Y6, Y7, Y8, Y9, Y10),
                open('datasets_y.pkl', 'wb'))
Example #42
# -*- coding: utf-8 -*-

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier

cancer = load_breast_cancer()

X_train, X_test, y_train, y_test = train_test_split(\
     cancer.data, cancer.target,
     stratify=cancer.target, random_state=0)

clf = GradientBoostingClassifier(random_state=0, 
                                 max_depth=3, 
                                 learning_rate=0.01, 
                                 subsample=0.2)
clf.fit(X_train, y_train)

print("훈련 세트 정확도: {:.3f}".format(clf.score(X_train, y_train)))
print("테스트 세트 정확도: {:.3f}".format(clf.score(X_test, y_test)))

import os
import errno  # needed for the errno.EEXIST check below
# pip install joblib
from joblib import dump

try:
    if not(os.path.isdir("../../save")):
        os.makedirs(os.path.join("../../save"))
except OSError as e:
    if e.errno != errno.EEXIST:
        print("Failed to create directory!!!!!")
Example #43
def plot_agglomerative_dendograms(config_file):
    args = get_args_parser().parse_args(['@' + config_file])

    # Set seed
    np.random.seed(int(args.seed))

    # Construct output directory
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    outdir = args.outdir + timestamp + '/'

    # Create results directory
    outdir_path = Path(outdir)
    if not outdir_path.is_dir():
        os.makedirs(outdir)

    # Set dataset
    n_samples = 1500
    noisy_circles = make_circles(n_samples=n_samples, factor=.5, noise=.05)

    plt.figure(figsize=(2 * 10 + 2, 18.5))
    plt.subplots_adjust(left=.02,
                        right=.98,
                        bottom=.001,
                        top=.96,
                        wspace=.05,
                        hspace=.01)
    plt.style.use('dark_background')
    plot_num = 1

    datasets = ((3, load_iris(return_X_y=True)),
                (2, load_breast_cancer(return_X_y=True)), (2, noisy_circles))

    for i, (n_clusters, dataset) in enumerate(datasets):
        X, y = dataset

        # Normalization of features for easier parameter selection
        X = StandardScaler().fit_transform(X)

        connectivity = kneighbors_graph(X, n_neighbors=10, include_self=False)
        connectivity = 0.5 * (connectivity + connectivity.T)  # make connectivity symmetric

        # Setting distance_threshold=0 ensures we compute the full tree.
        # Source: https://scikit-learn.org/stable/auto_examples/cluster/plot_agglomerative_dendrogram.html
        average_linkage = cluster.AgglomerativeClustering(
            linkage="average",
            affinity="cityblock",
            distance_threshold=0,
            n_clusters=None,
            connectivity=connectivity)

        ward_linkage = cluster.AgglomerativeClustering(linkage="ward",
                                                       distance_threshold=0,
                                                       n_clusters=None)

        complete_linkage = cluster.AgglomerativeClustering(
            linkage="complete", distance_threshold=0, n_clusters=None)

        single_linkage = cluster.AgglomerativeClustering(linkage="single",
                                                         distance_threshold=0,
                                                         n_clusters=None)

        techniques = (('Agglomerative Avg', average_linkage),
                      ('Agglomerative Single', single_linkage),
                      ('Agglomerative Complete', complete_linkage),
                      ('Agglomerative Ward', ward_linkage))

        for name, technique in techniques:
            # Catch warnings related to kneighbors_graph
            with warnings.catch_warnings():
                warnings.filterwarnings(
                    "ignore",
                    message="the number of connected components of the " +
                    "connectivity matrix is [0-9]{1,2}" +
                    " > 1. Completing it to avoid stopping the tree early.",
                    category=UserWarning)
                warnings.filterwarnings(
                    "ignore",
                    message="Graph is not fully connected, spectral embedding"
                    + " may not work as expected.",
                    category=UserWarning)
                model = technique.fit(X)

            plt.subplot(len(datasets), len(techniques), plot_num)
            if i == 0:
                plt.title("{}".format(name), size=18)

            plt.xlim(-2.5, 2.5)
            plt.ylim(-2.5, 2.5)
            plt.xticks(())
            plt.yticks(())
            plot_dendrogram(model,
                            truncate_mode='level',
                            p=n_clusters,
                            no_labels=True)
            plot_num += 1

    # Plotting
    plt.savefig(outdir + 'agglomerative_dendrograms.png', bbox_inches='tight')
Example #44
from xgboost import XGBRegressor, plot_importance, XGBClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, accuracy_score

import matplotlib.pyplot as plt

dataset = load_breast_cancer()

# x = dataset.data
# y = dataset.target

x, y = load_breast_cancer(return_X_y=True)

print(x.shape)
print(y.shape)

x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    train_size=0.96,
                                                    shuffle=True,
                                                    random_state=16)

model = XGBClassifier(n_estimators=300, learning_rate=0.1)

model.fit(x_train,
          y_train,
          verbose=True,
          eval_metric=["logloss", "rmse", "mae"],
          eval_set=[(x_train, y_train), (x_test, y_test)],
          early_stopping_rounds=20)
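# A short follow-up (my addition) to inspect the early-stopping run; best_iteration
# and evals_result() are part of the XGBoost sklearn wrapper:
print("best iteration:", model.best_iteration)
results = model.evals_result()
print("final eval logloss:", results["validation_1"]["logloss"][-1])
print("test accuracy:", accuracy_score(y_test, model.predict(x_test)))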
Example #45
 def setUp(self):
     data = load_breast_cancer()
     self.Xtr, self.Xte, self.Ytr, self.Yte = train_test_split(
         data.data, data.target, shuffle=True, train_size=50)
Example #46
# (x, y and the train/validation/test index splits are defined earlier in the original, truncated script)
x_test, y_test = x[test_idx, :], y[test_idx]

print('\nTraining')
print(x_train.shape , y_train.shape)

print('\nValidating')
print(x_val.shape , y_val.shape)

print('\nTesting')
print(x_test.shape , y_test.shape)



print('\n\t\t---------- Breast cancer datasets --------')

import sklearn.datasets as skdata  # alias inferred from the usage below

breast_cancer_data = skdata.load_breast_cancer()

#Loading data

feature_names = breast_cancer_data.feature_names
x1 = breast_cancer_data.data
y1 = breast_cancer_data.target


print('\n',x1.shape)
print('\n',y1.shape)

for name , value in zip(feature_names , x1[0,...]) :
    print('\n{} : {}'.format(name , value))
print('\nLabel: {}'.format(y1[0]))
Example #47
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import StandardScaler


def breast_cancer_dataset():
    X, y = load_breast_cancer(return_X_y=True)
    X = StandardScaler().fit_transform(X)
    return X, y
Example #48
# Import Libraries
from sklearn.datasets import load_breast_cancer
from sklearn.impute import SimpleImputer
import numpy as np
#----------------------------------------------------

#load breast cancer data

BreastData = load_breast_cancer()

#X Data
X = BreastData.data

#y Data
y = BreastData.target

#----------------------------------------------------
# Cleaning data
'''
impute.SimpleImputer(missing_values=nan, strategy='mean', fill_value=None, verbose=0, copy=True)
'''

ImputedModule = SimpleImputer(missing_values=np.nan, strategy='mean')
ImputedX = ImputedModule.fit(X)
X = ImputedX.transform(X)

#X Data
print('X Data is \n', X[:10])

#y Data
print('y Data is \n', y[:10])
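# Note (my addition): load_breast_cancer() contains no missing values, so the
# imputation above is a no-op demonstration of the SimpleImputer fit/transform API.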
Example #49
    def setUp(self):

        super(ConfounderRemovalTests, self).setUp()
        self.X, self.y = load_breast_cancer(return_X_y=True)
        self.X_train = self.X[:100]
        self.y_train = self.y[:100]
        self.shuffle_split = ShuffleSplit(test_size=0.2,
                                          n_splits=1,
                                          random_state=15)
        settings = OutputSettings(project_folder=self.tmp_folder_path)
        self.pipe = Hyperpipe(
            "confounder_pipe",
            outer_cv=self.shuffle_split,
            inner_cv=KFold(n_splits=3, shuffle=True, random_state=15),
            metrics=["accuracy"],
            best_config_metric="accuracy",
            output_settings=settings,
        )
        self.pipe += PipelineElement("StandardScaler")
        self.cr = PipelineElement("ConfounderRemoval")
        self.pipe += self.cr
        self.pipe += PipelineElement("SVC")
        self.random_confounders = np.random.randn(self.X.shape[0], 1)

        # do confounder removal by hand
        self.multiple_confounders = np.random.randn(self.X.shape[0], 2) * 10
        ols_confounder = sm.add_constant(self.multiple_confounders)
        self.X_transformed = np.empty(self.X.shape)
        for i in range(self.X.shape[1]):
            # fit
            model = sm.OLS(endog=np.squeeze(self.X[:, i]),
                           exog=ols_confounder).fit()
            # transform
            self.X_transformed[:, i] = np.asarray(
                np.squeeze(self.X[:, i]) -
                np.matmul(ols_confounder, np.squeeze(model.params)))

        # prepare caching
        self.X_train_transformed = np.empty(self.X_train.shape)
        self.confounder_train = self.multiple_confounders[:100]
        ols_confounder_train = sm.add_constant(self.confounder_train)
        for i in range(self.X_train.shape[1]):
            # fit
            model = sm.OLS(endog=np.squeeze(self.X_train[:, i]),
                           exog=ols_confounder_train).fit()
            # transform
            self.X_train_transformed[:, i] = np.asarray(
                np.squeeze(self.X_train[:, i]) -
                np.matmul(ols_confounder_train, np.squeeze(model.params)))

        # prepare confounder removal with standardization of covariates
        scaled_covs = list()
        # standardize covariates
        for cov in self.multiple_confounders.T:
            scaler = StandardScaler()
            scaled_covs.append(
                scaler.fit_transform(cov.reshape(-1, 1)).squeeze())
        scaled_covs = np.asarray(scaled_covs).T
        scaled_covs = sm.add_constant(scaled_covs)
        self.X_transformed_standardized = np.empty(self.X.shape)
        for i in range(self.X.shape[1]):
            # fit
            model = sm.OLS(endog=np.squeeze(self.X[:, i]),
                           exog=scaled_covs).fit()
            # transform
            self.X_transformed_standardized[:, i] = np.asarray(
                np.squeeze(self.X[:, i]) -
                np.matmul(scaled_covs, np.squeeze(model.params)))

        # prepare statistical testing of confounder removal
        # Generate samples from four independent, normally distributed random
        # variables (mean 0, std. dev. 1).
        x = norm.rvs(size=(4, 300))

        # desired covariance matrix
        r = np.array([
            [1, 0.9, 0.9, 0.9],
            [0.9, 1, 0.9, 0.9],
            [0.9, 0.9, 1, 0.9],
            [0.9, 0.9, 0.9, 1],
        ])
        c = cholesky(r, lower=True)

        # convert the data to correlated random variables
        self.z = np.dot(c, x).T
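        # Sanity check (illustrative addition, not in the original test): with
        # 300 samples, the empirical correlation matrix of self.z should be
        # close to the target matrix r defined above.
        assert np.allclose(np.corrcoef(self.z, rowvar=False), r, atol=0.1)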
Example #50
from xgboost import XGBClassifier, XGBRegressor
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score


#1. Data

# dataset = load_boston()
# x = dataset.data
# y = dataset.target

x, y = load_breast_cancer(return_X_y=True)

x_train, x_test, y_train, y_test = train_test_split(
    x, 
    y, 
    train_size=0.8, 
    random_state=77,
    shuffle=True
)


#2. Model
# model = XGBRegressor(n_estimators=1000, learning_rate=0.1)

# Without n_estimators specified,
# what is the default? -> it runs 100 rounds: n_estimators defaults to 100
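#3. Feature selection (illustrative sketch, not in the original snippet): what
# the unused SelectFromModel import suggests. Fit a default XGBClassifier,
# then keep only the features whose importance is at or above the median.
model = XGBClassifier(n_estimators=100, learning_rate=0.1)
model.fit(x_train, y_train)
print('baseline accuracy:', accuracy_score(y_test, model.predict(x_test)))

selection = SelectFromModel(model, threshold='median', prefit=True)
x_train_sel = selection.transform(x_train)
x_test_sel = selection.transform(x_test)
print('features kept:', x_train_sel.shape[1])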
Example #51
def gen_dataset_func_eager():
    data_raw = load_breast_cancer(as_frame=True)
    dataset_df = data_raw["data"]
    dataset_df["target"] = data_raw["target"]
    dataset = from_pandas(dataset_df)
    return dataset
Example #52
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

from sklearn.datasets import load_breast_cancer  # built-in dataset that ships with scikit-learn

# Widen pandas/numpy console output so rows aren't truncated
pd.set_option('display.width', 400)
pd.set_option('display.max_columns', 10)
np.set_printoptions(linewidth=400)

#['data', 'target', 'target_names', 'DESCR', 'feature_names', 'filename']
my_data = load_breast_cancer()  # returns a Bunch: dict-like access to data, target, feature_names, etc.

my_df_var = pd.DataFrame(data=my_data['data'],
                         columns=my_data['feature_names'])

#!!!!!!! Scale the data !!!!!!!!!!!!

#StandardScaler fitted once so that all of the data is scaled in the same manner
my_scalar = StandardScaler().fit(my_df_var)

my_df_scaled = pd.DataFrame(data=my_scalar.transform(my_df_var),
                            columns=my_data['feature_names'])  #Scaled data

#!!!!! Perform the PCA !!!!!!!!!!!!
my_pca = PCA(n_components=2).fit(my_df_scaled)  # keep the first two principal components
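#!!!!! Project and inspect (illustrative addition) !!!!!
# Transform the scaled data into the 2-component space and check how much
# variance the first two PCs retain; transform() and
# explained_variance_ratio_ are standard sklearn PCA members.
my_pca_coords = my_pca.transform(my_df_scaled)
print('Projected shape:', my_pca_coords.shape)
print('Explained variance ratio:', my_pca.explained_variance_ratio_)

plt.scatter(my_pca_coords[:, 0], my_pca_coords[:, 1], c=my_data['target'])
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.show()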
Example #53
def load_data() -> pd.DataFrame:
    data = load_breast_cancer()
    df = pd.DataFrame(data["data"], columns=data["feature_names"])
    df["target"] = data["target"]
    return df
Example #54
def main(config):
    # Select the compute device
    if config.gpu_id < 0:
        print("Device: CPU")
        device = torch.device('cpu')
    else:
        print("Device:", torch.cuda.get_device_name(0))
        device = torch.device('cuda:%d' % config.gpu_id)

    # Load the breast cancer data
    cancer_data = load_breast_cancer()
    df = pd.DataFrame(cancer_data.data, columns=cancer_data.feature_names)
    df['class'] = cancer_data.target
    data = torch.from_numpy(df.values).float()
    x = data[:, :30]
    y = data[:, -1:]

    # Shuffle and split into train / validation / test sets
    ratios = [.6, .2, .2]
    train_cnt = int(x.size(0) * ratios[0])
    valid_cnt = int(x.size(0) * ratios[1])
    test_cnt = x.size(0) - train_cnt - valid_cnt
    cnts = [train_cnt, valid_cnt, test_cnt]
    indices = torch.randperm(x.size(0))
    x = torch.index_select(x, dim=0, index=indices).to(device)
    y = torch.index_select(y, dim=0, index=indices).to(device)
    x = x.split(cnts, dim=0)
    y = y.split(cnts, dim=0)

    # Wrap the splits in torch Dataset / DataLoader objects
    train_loader = DataLoader(dataset=CustomDataset(x[0], y[0]),
                              batch_size=config.batch_size,
                              shuffle=True)
    valid_loader = DataLoader(dataset=CustomDataset(x[1], y[1]),
                              batch_size=config.batch_size,
                              shuffle=False)
    test_loader = DataLoader(dataset=CustomDataset(x[2], y[2]),
                             batch_size=config.batch_size,
                             shuffle=False)
    print("Train %d / Valid %d / Test %d samples." % (
        len(train_loader.dataset),
        len(valid_loader.dataset),
        len(test_loader.dataset),
    ))

    # Declare the model and define its architecture
    model = CancerClassifier(x[0].size(-1), y[0].size(-1)).to(device)
    optimizer = optim.Adam(model.parameters())

    # Run training
    trainer = Trainer(model, optimizer, train_loader, valid_loader)
    trainer.train(config)

    # Loss history
    plot_from = 2
    plt.figure(figsize=(20, 10))
    plt.grid(True)
    plt.title("Train / Valid Loss History")
    plt.plot(
        range(plot_from, len(trainer.train_history)),
        trainer.train_history[plot_from:],
        range(plot_from, len(trainer.valid_history)),
        trainer.valid_history[plot_from:],
    )
    plt.yscale('log')
    plt.show()

    # Evaluate
    test_loss = 0
    y_hat = []
    model.eval()
    with torch.no_grad():
        for x_i, y_i in test_loader:
            y_hat_i = model(x_i)
            loss = F.binary_cross_entropy(y_hat_i, y_i)
            test_loss += float(loss)  # Gradient is already detached.
            y_hat += [y_hat_i]
    test_loss = test_loss / len(test_loader)
    y_hat = torch.cat(y_hat, dim=0)
    print("Test loss: %.4e" % test_loss)
    correct_cnt = (y[2] == (y_hat > .5)).sum()
    total_cnt = float(y[2].size(0))
    print('Test Accuracy: %.4f' % (correct_cnt / total_cnt))
Example #55
    - wandb
depend:
    requirements:
        - scikit-learn
assert:
    - :wandb:runs_len: 1
    - :wandb:runs[0][exitcode]: 0
    - :yea:exit: 0
    - :wandb:runs[0][summary][feature_importances][_type]: table-file
    - :wandb:runs[0][summary][feature_importances][ncols]: 2
    - :wandb:runs[0][summary][feature_importances][nrows]: 30
"""
from sklearn import datasets
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import train_test_split
import wandb

wandb.init(project="my-scikit-integration")

wbcd = wisconsin_breast_cancer_data = datasets.load_breast_cancer()

X_train, X_test, y_train, y_test = train_test_split(wbcd.data,
                                                    wbcd.target,
                                                    test_size=0.2)
labels = wbcd.target_names

model = ElasticNet()
model.fit(X_train, y_train)

wandb.sklearn.plot_feature_importances(model)
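# Illustrative addition (not in the original script): close the run explicitly
# so the logged table is flushed before the process exits.
wandb.finish()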
Example #56
import sklearn.datasets as data
import matplotlib.pyplot as plt

dx, dy = data.load_breast_cancer(return_X_y=True)

# Note: with a 2D x argument, plt.plot draws one line per feature column
# (30 lines here), all against the same target vector.
plt.figure()
plt.plot(dx, dy)
plt.show()
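# A more readable alternative (illustrative addition): scattering a single
# feature against the class label is usually clearer than plotting the whole
# feature matrix at once. Feature 0 of this dataset is "mean radius".
plt.figure()
plt.scatter(dx[:, 0], dy, s=10)
plt.xlabel('mean radius (feature 0)')
plt.ylabel('class label')
plt.show()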
Example #57
X_C2, y_C2 = make_classification(n_samples=100, n_features=2,
                                 n_redundant=0, n_informative=2,
                                 n_clusters_per_class=1, flip_y=0.1,  # 10% chance of flipping the correct label poses a challenge to the classifier
                                 class_sep=0.5, random_state=0)
plt.scatter(X_C2[:, 0], X_C2[:, 1], c=y_C2, marker='o', s=50, cmap=cmap_bold)

# A more difficult synthetic dataset for binary classification, with non-linearly-separable classes.
from sklearn.datasets import make_blobs
X_D2, y_D2 = make_blobs(n_samples=100, n_features=2, centers=8, cluster_std=1.3, random_state=4) # 100 samples grouped into 8 clusters.
y_D2 %= 2  # map the 8 cluster labels onto 2 classes
plt.figure()
plt.title("(4) Sample binary classification problem with non-linearly-separable classes")
plt.scatter(X_D2[:, 0], X_D2[:, 1], c=y_D2, marker='o', s=50, cmap=cmap_bold)

# Breast cancer dataset for classification
cancer = load_breast_cancer()
X_cancer, y_cancer = load_breast_cancer(return_X_y=True)

# Communities and Crime dataset
X_crime, y_crime = load_crime_dataset()
# Target value to predict: per capita violent crime rate.

"""   ====K-Nearest Neighbors====    """

# Classification
from adspy_shared_utilities import plot_two_class_knn
X_train, X_test, y_train, y_test = train_test_split(X_C2, y_C2, random_state=0)

# Figures 5-7
plot_two_class_knn(X_train, y_train, 1, "uniform", X_test, y_test)  # Overfits: a too-complex model with too much variance.
plot_two_class_knn(X_train, y_train, 3, "uniform", X_test, y_test)  # Captures the general trend; lower training accuracy, but better generalization.

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import datasets
import matplotlib.pyplot as plt

from logistic_regression import LogisticRegression

def accuracy(y_true, y_pred):
    accuracy = np.sum(y_true == y_pred) / len(y_true)
    return accuracy

bc = datasets.load_breast_cancer()
X, y = bc.data, bc.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1234)

regressor = LogisticRegression(learning_rate=0.0001, n_iters=1000)
regressor.fit(X_train, y_train)
predictions = regressor.predict(X_test)

print("LR classification accuracy:", accuracy(y_test, predictions))
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report

cancer_data = load_breast_cancer()
X = cancer_data.data
Y = cancer_data.target
print('Input Data size :', X.shape)
print('Output Data size :', Y.shape)
print('Label names :', cancer_data.target_names)

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=42)
clf = SVC(kernel='linear', C=1.0, random_state=42)
clf.fit(X_train, Y_train)

accuracy = clf.score(X_test, Y_test)
print(f'The accuracy is: {accuracy*100:.1f}%')
pred = clf.predict(X_test)
print(
    classification_report(Y_test, pred, target_names=cancer_data.target_names))

def Breast_cancer(training_size, test_size, n, PLOT_DATA):
    class_labels = [r'A', r'B']
    data, target = datasets.load_breast_cancer(return_X_y=True)
    sample_train, sample_test, label_train, label_test = train_test_split(data, target, test_size=0.3, random_state=12)

    # Now we standardize to a Gaussian around 0 with unit variance
    std_scale = StandardScaler().fit(sample_train)
    sample_train = std_scale.transform(sample_train)
    sample_test = std_scale.transform(sample_test)

    # Now reduce number of features to number of qubits
    pca = PCA(n_components=n).fit(sample_train)
    sample_train = pca.transform(sample_train)
    sample_test = pca.transform(sample_test)

    # Scale to the range (-1,+1)
    samples = np.append(sample_train, sample_test, axis=0)
    minmax_scale = MinMaxScaler((-1, 1)).fit(samples)
    sample_train = minmax_scale.transform(sample_train)
    sample_test = minmax_scale.transform(sample_test)

    # Pick the first training_size samples of each class
    training_input = {key: (sample_train[label_train == k, :])[:training_size] for k, key in enumerate(class_labels)}
    test_input = {key: (sample_train[label_train == k, :])[training_size:(
        training_size+test_size)] for k, key in enumerate(class_labels)}

    if PLOT_DATA:
        for k in range(0, 2):
            plt.scatter(sample_train[label_train == k, 0][:training_size],
                        sample_train[label_train == k, 1][:training_size])

        plt.title("PCA dim. reduced Breast cancer dataset")
        plt.show()

    return sample_train, training_input, test_input, class_labels
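# Hypothetical usage of the helper above (argument values are illustrative):
# reduce to n=2 features so each sample fits a 2-qubit feature map.
sample_train, training_input, test_input, class_labels = Breast_cancer(
    training_size=20, test_size=10, n=2, PLOT_DATA=True)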