def test_load_breast_cancer():
    res = load_breast_cancer()
    assert_equal(res.data.shape, (569, 30))
    assert_equal(res.target.size, 569)
    assert_equal(res.target_names.size, 2)
    assert_true(res.DESCR)

    # test return_X_y option
    X_y_tuple = load_breast_cancer(return_X_y=True)
    bunch = load_breast_cancer()
    assert_true(isinstance(X_y_tuple, tuple))
    assert_array_equal(X_y_tuple[0], bunch.data)
    assert_array_equal(X_y_tuple[1], bunch.target)
def load_breast_cancer_df(include_tgt=True, tgt_name="target", shuffle=False):
    """Loads the breast cancer dataset into a dataframe with the
    target set as the "target" feature or whatever name is specified
    in ``tgt_name``.

    Parameters
    ----------
    include_tgt : bool, optional (default=True)
        Whether to include the target

    tgt_name : str, optional (default="target")
        The name of the target feature

    shuffle : bool, optional (default=False)
        Whether to shuffle the rows

    Returns
    -------
    X : pd.DataFrame, shape=(n_samples, n_features)
        The loaded dataset
    """
    bc = load_breast_cancer()
    X = pd.DataFrame.from_records(data=bc.data, columns=bc.feature_names)

    if include_tgt:
        X[tgt_name] = bc.target

    return X if not shuffle else shuffle_dataframe(X)
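# A minimal usage sketch for load_breast_cancer_df above (hypothetical call;
# assumes pandas as pd and shuffle_dataframe are importable in this module):
df = load_breast_cancer_df(include_tgt=True, tgt_name="diagnosis")
print(df.shape)                        # (569, 31): 30 features plus the target column
print(df["diagnosis"].value_counts())  # 357 benign (1) vs 212 malignant (0)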
def test_RFECV():
    from sklearn.datasets import load_boston
    from sklearn.datasets import load_breast_cancer
    from sklearn.datasets import load_iris
    from sklearn.feature_selection import RFECV

    # Regression (a regressor is required for the continuous boston target)
    X, y = load_boston(return_X_y=True)
    bst = xgb.XGBRegressor(booster='gblinear', learning_rate=0.1,
                           n_estimators=10, n_jobs=1,
                           objective='reg:squarederror', random_state=0,
                           verbosity=0)
    rfecv = RFECV(estimator=bst, step=1, cv=3, scoring='neg_mean_squared_error')
    rfecv.fit(X, y)

    # Binary classification
    X, y = load_breast_cancer(return_X_y=True)
    bst = xgb.XGBClassifier(booster='gblinear', learning_rate=0.1,
                            n_estimators=10, n_jobs=1,
                            objective='binary:logistic', random_state=0,
                            verbosity=0)
    rfecv = RFECV(estimator=bst, step=1, cv=3, scoring='roc_auc')
    rfecv.fit(X, y)

    # Multi-class classification
    X, y = load_iris(return_X_y=True)
    bst = xgb.XGBClassifier(base_score=0.4, booster='gblinear',
                            learning_rate=0.1, n_estimators=10, n_jobs=1,
                            objective='multi:softprob', random_state=0,
                            reg_alpha=0.001, reg_lambda=0.01,
                            scale_pos_weight=0.5, verbosity=0)
    rfecv = RFECV(estimator=bst, step=1, cv=3, scoring='neg_log_loss')
    rfecv.fit(X, y)
def main():
    dataset = datasets.load_breast_cancer()
    features = dataset.data
    labels = dataset.target

    train_features, test_features, train_labels, test_labels = train_test_split(
        features, labels, test_size=0.3, stratify=labels)

    parameter_set = {'loss': ('hinge', 'squared_hinge'),
                     'C': [1, 10, 100, 1000, 5, 50, 500, 5000]}
    model = LinearSVC()
    grid_scores, best_score, best_params, test_score = validate_model(
        model=model, parameter_set=parameter_set,
        train_data=[train_features, train_labels],
        test_data=[test_features, test_labels])
    print(grid_scores)
    print('SVM best score: {}'.format(best_score))
    print('SVM best params : {}'.format(best_params))
    print('SVM test score : {}'.format(test_score))

    parameter_set = {'activation': ['identity', 'logistic', 'tanh', 'relu'],
                     'solver': ['sgd', 'adam'],
                     'batch_size': [16, 32, 64, 128]}
    model = MLPClassifier()
    grid_scores, best_score, best_params, test_score = validate_model(
        model=model, parameter_set=parameter_set,
        train_data=[train_features, train_labels],
        test_data=[test_features, test_labels])
    print(grid_scores)
    print('MLP best score: {}'.format(best_score))
    print('MLP best params : {}'.format(best_params))
    print('MLP test score : {}'.format(test_score))
def test_early_stopping(self):
    X, y = load_breast_cancer(True)
    params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'verbose': -1
    }
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1,
                                                        random_state=42)
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
    valid_set_name = 'valid_set'

    # early stopping is not triggered (training ends after only 10 rounds)
    gbm = lgb.train(params, lgb_train,
                    num_boost_round=10,
                    valid_sets=lgb_eval,
                    valid_names=valid_set_name,
                    verbose_eval=False,
                    early_stopping_rounds=5)
    self.assertEqual(gbm.best_iteration, 10)
    self.assertIn(valid_set_name, gbm.best_score)
    self.assertIn('binary_logloss', gbm.best_score[valid_set_name])

    # early stopping occurs (default num_boost_round is 100)
    gbm = lgb.train(params, lgb_train,
                    valid_sets=lgb_eval,
                    valid_names=valid_set_name,
                    verbose_eval=False,
                    early_stopping_rounds=5)
    self.assertLessEqual(gbm.best_iteration, 100)
    self.assertIn(valid_set_name, gbm.best_score)
    self.assertIn('binary_logloss', gbm.best_score[valid_set_name])
def test_dt():
    cancer = load_breast_cancer()
    X, y = cancer.data, cancer.target
    feature_names = cancer.feature_names

    sk_dt = SKDT(random_state=1, max_depth=3)
    our_dt = ClassificationTree(feature_names=feature_names, random_state=1)
    sk_dt.fit(X, y)
    our_dt.fit(X, y)

    sk_pred = sk_dt.predict_proba(X)
    our_pred = our_dt.predict_proba(X)
    assert np.allclose(sk_pred, our_pred)

    sk_pred = sk_dt.predict(X)
    our_pred = our_dt.predict(X)
    assert np.allclose(sk_pred, our_pred)

    # With labels
    local_expl = our_dt.explain_local(X, y)
    local_viz = local_expl.visualize(0)
    assert local_viz is not None

    # Without labels
    local_expl = our_dt.explain_local(X)
    local_viz = local_expl.visualize(0)
    assert local_viz is not None

    global_expl = our_dt.explain_global()
    global_viz = global_expl.visualize()
    assert global_viz is not None
def main(arguments):
    # load the features of the dataset
    features = datasets.load_breast_cancer().data
    # standardize the features
    features = StandardScaler().fit_transform(features)
    # get the number of features
    num_features = features.shape[1]
    # load the corresponding labels for the features
    labels = datasets.load_breast_cancer().target
    # transform the labels to {-1, +1}
    labels[labels == 0] = -1

    # split the dataset to 70/30 partition: 70% train, 30% test
    train_features, test_features, train_labels, test_labels = train_test_split(
        features, labels, test_size=0.3, stratify=labels)
    train_size = train_features.shape[0]
    test_size = test_features.shape[0]

    # slice the dataset as per the batch size
    train_features = train_features[:train_size - (train_size % BATCH_SIZE)]
    train_labels = train_labels[:train_size - (train_size % BATCH_SIZE)]
    test_features = test_features[:test_size - (test_size % BATCH_SIZE)]
    test_labels = test_labels[:test_size - (test_size % BATCH_SIZE)]

    # instantiate the SVM class
    model = SVM(alpha=LEARNING_RATE, batch_size=BATCH_SIZE,
                svm_c=arguments.svm_c, num_classes=NUM_CLASSES,
                num_features=num_features)

    # train the instantiated model
    model.train(epochs=arguments.num_epochs, log_path=arguments.log_path,
                train_data=[train_features, train_labels],
                train_size=train_features.shape[0],
                validation_data=[test_features, test_labels],
                validation_size=test_features.shape[0],
                result_path=arguments.result_path)

    test_conf, test_accuracy = utils.plot_confusion_matrix(
        phase='testing', path=arguments.result_path,
        class_names=['benign', 'malignant'])

    print('True negatives : {}'.format(test_conf[0][0]))
    print('False negatives : {}'.format(test_conf[1][0]))
    print('True positives : {}'.format(test_conf[1][1]))
    print('False positives : {}'.format(test_conf[0][1]))
    print('Testing accuracy : {}'.format(test_accuracy))
def test_binary(self):
    X, y = load_breast_cancer(True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1,
                                                        random_state=42)
    gbm = lgb.LGBMClassifier(n_estimators=50, silent=True)
    gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)],
            early_stopping_rounds=5, verbose=False)
    ret = log_loss(y_test, gbm.predict_proba(X_test))
    self.assertLess(ret, 0.15)
    self.assertAlmostEqual(
        ret,
        gbm.evals_result_['valid_0']['binary_logloss'][gbm.best_iteration_ - 1],
        places=5)
def setUp(self):
    self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
        *load_breast_cancer(True), test_size=0.1, random_state=1)
    self.train_data = lgb.Dataset(self.X_train, self.y_train)
    self.params = {
        "objective": "binary",
        "verbose": -1,
        "num_leaves": 3
    }
def load_binary_data(self, shuffled=True):
    samples = load_breast_cancer()
    if shuffled:
        # the same random_state yields the same permutation,
        # so X and y stay aligned after the two separate shuffles
        self.X = shuffle(samples.data, random_state=self.SEED)
        self.y = shuffle(samples.target, random_state=self.SEED)
    else:
        self.X, self.y = samples.data, samples.target
    self.n_features = len(self.X[0])
def test_binary(self):
    X_y = load_breast_cancer(True)
    params = {
        'objective': 'binary',
        'metric': 'binary_logloss'
    }
    evals_result, ret = template.test_template(params, X_y, log_loss)
    self.assertLess(ret, 0.15)
    self.assertAlmostEqual(min(evals_result['eval']['binary_logloss']), ret,
                           places=5)
def train_breast_cancer(param_in):
    data = datasets.load_breast_cancer()
    X = scale(data.data)
    dtrain = xgb.DMatrix(X, label=data.target)
    param = {'objective': 'binary:logistic'}
    param.update(param_in)
    # num_rounds is assumed to be a module-level constant defined outside this snippet
    bst = xgb.train(param, dtrain, num_rounds)
    xgb_pred = bst.predict(dtrain)
    xgb_score = metrics.accuracy_score(data.target, np.round(xgb_pred))
    assert xgb_score >= 0.8
def test_issues_161_and_189(self):
    """
    ensure DataManager(data).data == data
    """
    X, y = load_breast_cancer(True)
    X, y = X[15:40], y[15:40]
    model = KNeighborsClassifier(weights='distance', p=2, n_neighbors=10).fit(X, y)
    skater_model = InMemoryModel(model.predict_proba, examples=X, probability=True)
    assert skater_model.probability is True
    assert skater_model.model_type == StaticTypes.model_types.classifier
def test_load_breast_cancer():
    res = load_breast_cancer()
    assert_equal(res.data.shape, (569, 30))
    assert_equal(res.target.size, 569)
    assert_equal(res.target_names.size, 2)
    assert_true(res.DESCR)
    assert_true(os.path.exists(res.filename))

    # test return_X_y option
    check_return_X_y(res, partial(load_breast_cancer))
def test_chunked_dataset(self):
    X_train, X_test, y_train, y_test = train_test_split(
        *load_breast_cancer(True), test_size=0.1, random_state=2)
    chunk_size = X_train.shape[0] // 10 + 1
    X_train = [X_train[i * chunk_size:(i + 1) * chunk_size, :]
               for i in range(X_train.shape[0] // chunk_size + 1)]
    X_test = [X_test[i * chunk_size:(i + 1) * chunk_size, :]
              for i in range(X_test.shape[0] // chunk_size + 1)]
    train_data = lgb.Dataset(X_train, label=y_train,
                             params={"bin_construct_sample_cnt": 100})
    valid_data = train_data.create_valid(X_test, label=y_test,
                                         params={"bin_construct_sample_cnt": 100})
    train_data.construct()
    valid_data.construct()
def main(arguments):
    # load the features of the dataset
    features = datasets.load_breast_cancer().data
    # standardize the features
    features = StandardScaler().fit_transform(features)
    # get the number of features
    num_features = features.shape[1]
    # load the labels for the features
    labels = datasets.load_breast_cancer().target

    train_features, test_features, train_labels, test_labels = train_test_split(
        features, labels, test_size=0.30, stratify=labels)

    model = MLP(alpha=LEARNING_RATE, batch_size=BATCH_SIZE, node_size=NUM_NODES,
                num_classes=NUM_CLASSES, num_features=num_features)

    model.train(num_epochs=arguments.num_epochs, log_path=arguments.log_path,
                train_data=[train_features, train_labels],
                train_size=train_features.shape[0],
                test_data=[test_features, test_labels],
                test_size=test_features.shape[0],
                result_path=arguments.result_path)
def load_cancer_data():
    # clinical measurements of breast cancer tumors, for classification
    from sklearn.datasets import load_breast_cancer
    cancer = load_breast_cancer()  # cancer behaves like a dict

    # format strings contain "replacement fields" surrounded by {}
    print("cancer.keys(): \n{}".format(cancer.keys()))
    print("shape of cancer data: {}".format(cancer.data.shape))
    # bincount counts the number of occurrences of each value in an array of ints
    print("sample counts per class:\n{}".format(
        {n: v for n, v in zip(cancer.target_names, np.bincount(cancer.target))}))
    print("Feature names:\n{}".format(cancer.feature_names))
def load_datasets():
    iris = load_iris()
    iris_X, iris_y = iris['data'], iris['target']
    digits = load_digits()
    digits_X, digits_y = digits['data'], digits['target']
    breast_cancer = load_breast_cancer()
    breast_cancer_X, breast_cancer_y = breast_cancer['data'], breast_cancer['target']
    diabetes = load_diabetes()
    diabetes_X, diabetes_y = diabetes['data'], diabetes['target']
    # note: fetch_mldata has been removed from scikit-learn;
    # fetch_openml('mnist_784') is the modern replacement
    mnist = fetch_mldata('MNIST original', data_home='datasets/')
    mnist_X, mnist_y = mnist['data'], mnist['target']

    datasets = {
        'iris': ("Iris Plants Dataset", iris_X, iris_y),
        'digits': ("UCI ML hand-written digits dataset", digits_X, digits_y),
        'breast_cancer': ("Breast Cancer Wisconsin (Diagnostic) Dataset",
                          breast_cancer_X, breast_cancer_y),
        'mnist': ("The MNIST database of handwritten digits", mnist_X, mnist_y),
        # 'diabetes': ("Diabetes dataset", diabetes_X, diabetes_y),
    }
    return datasets
def train_cancer(param_in, comparison_tree_method):
    data = load_breast_cancer()
    dtrain = xgb.DMatrix(data.data, label=data.target)
    param = {}
    param['objective'] = 'binary:logistic'
    param.update(param_in)
    res_tmp = {}
    res = {}
    num_rounds = 10
    xgb.train(param, dtrain, num_rounds, [(dtrain, 'train')], evals_result=res_tmp)
    res[param['tree_method']] = res_tmp['train']['error']
    param["tree_method"] = comparison_tree_method
    xgb.train(param, dtrain, num_rounds, [(dtrain, 'train')], evals_result=res_tmp)
    res[comparison_tree_method] = res_tmp['train']['error']
    return res
def test(self):
    X_train, X_test, y_train, y_test = train_test_split(
        *load_breast_cancer(True), test_size=0.1, random_state=2)
    train_data = lgb.Dataset(X_train, label=y_train)
    valid_data = train_data.create_valid(X_test, label=y_test)
    params = {
        "objective": "binary",
        "metric": "auc",
        "min_data": 10,
        "num_leaves": 15,
        "verbose": -1,
        "num_threads": 1,
        "max_bin": 255
    }
    bst = lgb.Booster(params, train_data)
    bst.add_valid(valid_data, "valid_1")
    for i in range(30):
        bst.update()
        if i % 10 == 0:
            print(bst.eval_train(), bst.eval_valid())
    bst.save_model("model.txt")
    pred_from_matr = bst.predict(X_test)
    with tempfile.NamedTemporaryFile() as f:
        tname = f.name
    with open(tname, "w+b") as f:
        dump_svmlight_file(X_test, y_test, f)
    pred_from_file = bst.predict(tname)
    os.remove(tname)
    self.assertEqual(len(pred_from_matr), len(pred_from_file))
    for preds in zip(pred_from_matr, pred_from_file):
        self.assertAlmostEqual(*preds, places=15)
    # check saved model persistence; we need to verify the consistency of the
    # model file here, so test for exact equality
    bst = lgb.Booster(params, model_file="model.txt")
    pred_from_model_file = bst.predict(X_test)
    self.assertEqual(len(pred_from_matr), len(pred_from_model_file))
    for preds in zip(pred_from_matr, pred_from_model_file):
        self.assertEqual(*preds)
    # check early stopping is working; make it stop very early, so the scores
    # should be very close to zero
    pred_parameter = {"pred_early_stop": True, "pred_early_stop_freq": 5,
                      "pred_early_stop_margin": 1.5}
    pred_early_stopping = bst.predict(X_test, **pred_parameter)
    self.assertEqual(len(pred_from_matr), len(pred_early_stopping))
    for preds in zip(pred_early_stopping, pred_from_matr):
        # scores are likely to differ, but the predicted class should be the same
        self.assertEqual(preds[0] > 0, preds[1] > 0)
def main():
    dataset = datasets.load_breast_cancer()
    features = dataset.data
    labels = dataset.target
    num_features = features.shape[1]

    features = StandardScaler().fit_transform(features)

    train_features, test_features, train_labels, test_labels = train_test_split(
        features, labels, test_size=0.3, stratify=labels)

    model = NearestNeighbor(train_features, train_labels, num_features)
    model.predict(test_features, test_labels,
                  result_path='./results/nearest_neighbor/')
def test_contribs(self):
    X, y = load_breast_cancer(True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1,
                                                        random_state=42)
    params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'verbose': -1,
    }
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
    evals_result = {}
    gbm = lgb.train(params, lgb_train,
                    num_boost_round=20,
                    valid_sets=lgb_eval,
                    verbose_eval=False,
                    evals_result=evals_result)
    self.assertLess(
        np.linalg.norm(gbm.predict(X_test, raw_score=True) -
                       np.sum(gbm.predict(X_test, pred_contrib=True), axis=1)),
        1e-4)
def test_plot_metrics(self):
    X_train, X_test, y_train, y_test = train_test_split(
        *load_breast_cancer(True), test_size=0.1, random_state=1)
    train_data = lgb.Dataset(X_train, y_train)
    test_data = lgb.Dataset(X_test, y_test, reference=train_data)
    params = {
        "objective": "binary",
        "metric": {"binary_logloss", "binary_error"},
        "verbose": -1,
        "num_leaves": 3
    }

    evals_result0 = {}
    gbm0 = lgb.train(params, train_data,
                     valid_sets=[train_data, test_data],
                     valid_names=['v1', 'v2'],
                     num_boost_round=10,
                     evals_result=evals_result0,
                     verbose_eval=False)
    ax0 = lgb.plot_metric(evals_result0)
    self.assertIsInstance(ax0, matplotlib.axes.Axes)
    self.assertEqual(ax0.get_title(), 'Metric during training')
    self.assertEqual(ax0.get_xlabel(), 'Iterations')
    self.assertIn(ax0.get_ylabel(), {'binary_logloss', 'binary_error'})
    ax0 = lgb.plot_metric(evals_result0, metric='binary_error')
    ax0 = lgb.plot_metric(evals_result0, metric='binary_logloss',
                          dataset_names=['v2'])

    evals_result1 = {}
    gbm1 = lgb.train(params, train_data,
                     num_boost_round=10,
                     evals_result=evals_result1,
                     verbose_eval=False)
    self.assertRaises(ValueError, lgb.plot_metric, evals_result1)

    gbm2 = lgb.LGBMClassifier(n_estimators=10, num_leaves=3, silent=True)
    gbm2.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)
    ax2 = lgb.plot_metric(gbm2, title=None, xlabel=None, ylabel=None)
    self.assertIsInstance(ax2, matplotlib.axes.Axes)
    self.assertEqual(ax2.get_title(), '')
    self.assertEqual(ax2.get_xlabel(), '')
    self.assertEqual(ax2.get_ylabel(), '')
def main():
    dataset = datasets.load_breast_cancer()
    features = dataset.data
    features = StandardScaler().fit_transform(features)
    num_features = features.shape[1]
    labels = dataset.target
    labels[labels == 0] = -1

    train_features, test_features, train_labels, test_labels = train_test_split(
        features, labels, test_size=0.3, stratify=labels)
    train_size = train_features.shape[0]
    test_size = test_features.shape[0]

    # slice the dataset to be exact as per the batch size
    # e.g. train_size = 1898322, batch_size = 256
    # [:1898322-(1898322%256)] = [:1898240]
    # 1898322 // 256 = 7415; 7415 * 256 = 1898240
    train_features = train_features[:train_size - (train_size % BATCH_SIZE)]
    train_labels = train_labels[:train_size - (train_size % BATCH_SIZE)]

    # modify the size of the dataset to be passed on model.train()
    train_size = train_features.shape[0]

    # slice the dataset to be exact as per the batch size
    test_features = test_features[:test_size - (test_size % BATCH_SIZE)]
    test_labels = test_labels[:test_size - (test_size % BATCH_SIZE)]
    test_size = test_features.shape[0]

    model = GruSvm(alpha=LEARNING_RATE, batch_size=BATCH_SIZE,
                   cell_size=CELL_SIZE, dropout_rate=DROPOUT_RATE,
                   num_classes=NUM_CLASSES, sequence_length=num_features,
                   svm_c=SVM_C)

    model.train(checkpoint_path='./checkpoint_path/gru_svm/',
                log_path='./log_path/gru_svm/', model_name='gru_svm',
                epochs=3000, train_data=[train_features, train_labels],
                train_size=train_size,
                validation_data=[test_features, test_labels],
                validation_size=test_size, result_path='./results')
def test_binary(self):
    X, y = load_breast_cancer(True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1,
                                                        random_state=42)
    params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'verbose': -1,
        'num_iteration': 50  # test num_iteration in dict here
    }
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
    evals_result = {}
    gbm = lgb.train(params, lgb_train,
                    num_boost_round=20,
                    valid_sets=lgb_eval,
                    verbose_eval=False,
                    evals_result=evals_result)
    ret = log_loss(y_test, gbm.predict(X_test))
    self.assertLess(ret, 0.15)
    self.assertEqual(len(evals_result['valid_0']['binary_logloss']), 50)
    self.assertAlmostEqual(evals_result['valid_0']['binary_logloss'][-1], ret,
                           places=5)
def test_plot_importance(self):
    X_train, _, y_train, _ = train_test_split(*load_breast_cancer(True),
                                              test_size=0.1, random_state=1)
    train_data = lgb.Dataset(X_train, y_train)
    params = {
        "objective": "binary",
        "verbose": -1,
        "num_leaves": 3
    }
    gbm0 = lgb.train(params, train_data, num_boost_round=10)
    ax0 = lgb.plot_importance(gbm0)
    self.assertIsInstance(ax0, matplotlib.axes.Axes)
    self.assertEqual(ax0.get_title(), 'Feature importance')
    self.assertEqual(ax0.get_xlabel(), 'Feature importance')
    self.assertEqual(ax0.get_ylabel(), 'Features')
    self.assertLessEqual(len(ax0.patches), 30)

    gbm1 = lgb.LGBMClassifier(n_estimators=10, num_leaves=3, silent=True)
    gbm1.fit(X_train, y_train)
    ax1 = lgb.plot_importance(gbm1, color='r', title='t', xlabel='x', ylabel='y')
    self.assertIsInstance(ax1, matplotlib.axes.Axes)
    self.assertEqual(ax1.get_title(), 't')
    self.assertEqual(ax1.get_xlabel(), 'x')
    self.assertEqual(ax1.get_ylabel(), 'y')
    self.assertLessEqual(len(ax1.patches), 30)
    for patch in ax1.patches:
        self.assertTupleEqual(patch.get_facecolor(), (1., 0, 0, 1.))  # red

    ax2 = lgb.plot_importance(gbm0, color=['r', 'y', 'g', 'b'],
                              title=None, xlabel=None, ylabel=None)
    self.assertIsInstance(ax2, matplotlib.axes.Axes)
    self.assertEqual(ax2.get_title(), '')
    self.assertEqual(ax2.get_xlabel(), '')
    self.assertEqual(ax2.get_ylabel(), '')
    self.assertLessEqual(len(ax2.patches), 30)
    self.assertTupleEqual(ax2.patches[0].get_facecolor(), (1., 0, 0, 1.))  # r
    self.assertTupleEqual(ax2.patches[1].get_facecolor(), (.75, .75, 0, 1.))  # y
    self.assertTupleEqual(ax2.patches[2].get_facecolor(), (0, .5, 0, 1.))  # g
    self.assertTupleEqual(ax2.patches[3].get_facecolor(), (0, 0, 1., 1.))  # b
def main():
    dataset = datasets.load_breast_cancer()
    features = dataset.data
    features = StandardScaler().fit_transform(features)
    num_features = features.shape[1]
    labels = dataset.target

    train_features, test_features, train_labels, test_labels = train_test_split(
        features, labels, test_size=0.3, stratify=labels)
    train_size = train_features.shape[0]
    test_size = test_features.shape[0]

    # slice the dataset to be exact as per the batch size
    # e.g. train_size = 1898322, batch_size = 256
    # [:1898322-(1898322%256)] = [:1898240]
    # 1898322 // 256 = 7415; 7415 * 256 = 1898240
    train_features = train_features[:train_size - (train_size % BATCH_SIZE)]
    train_labels = train_labels[:train_size - (train_size % BATCH_SIZE)]

    # modify the size of the dataset to be passed on model.train()
    train_size = train_features.shape[0]

    # slice the dataset to be exact as per the batch size
    test_features = test_features[:test_size - (test_size % BATCH_SIZE)]
    test_labels = test_labels[:test_size - (test_size % BATCH_SIZE)]
    test_size = test_features.shape[0]

    model = LinearRegression(alpha=LEARNING_RATE, batch_size=BATCH_SIZE,
                             num_classes=NUM_CLASSES,
                             sequence_length=num_features)

    model.train(epochs=3000, log_path='./log_path/linear_regression/',
                train_data=[train_features, train_labels],
                train_size=train_size,
                validation_data=[test_features, test_labels],
                validation_size=test_size,
                result_path='./results/linear_regression/')
def test(self):
    X_train, X_test, y_train, y_test = train_test_split(
        *load_breast_cancer(True), test_size=0.1, random_state=1)
    train_data = lgb.Dataset(X_train, max_bin=255, label=y_train)
    valid_data = train_data.create_valid(X_test, label=y_test)
    params = {
        "objective": "binary",
        "metric": "auc",
        "min_data": 1,
        "num_leaves": 15,
        "verbose": -1
    }
    bst = lgb.Booster(params, train_data)
    bst.add_valid(valid_data, "valid_1")
    for i in range(30):
        bst.update()
        if i % 10 == 0:
            print(bst.eval_train(), bst.eval_valid())
    bst.save_model("model.txt")
    pred_from_matr = bst.predict(X_test)
    with tempfile.NamedTemporaryFile() as f:
        tname = f.name
    with open(tname, "w+b") as f:
        np.savetxt(f, X_test, delimiter=',')
    pred_from_file = bst.predict(tname)
    os.remove(tname)
    self.assertEqual(len(pred_from_matr), len(pred_from_file))
    for preds in zip(pred_from_matr, pred_from_file):
        self.assertAlmostEqual(*preds, places=15)
    # check saved model persistence
    bst = lgb.Booster(params, model_file="model.txt")
    pred_from_model_file = bst.predict(X_test)
    self.assertEqual(len(pred_from_matr), len(pred_from_model_file))
    for preds in zip(pred_from_matr, pred_from_model_file):
        self.assertEqual(*preds)
    # check pmml
    os.system('python ../../pmml/pmml.py model.txt')
def test_XGBClassifier_resume():
    from sklearn.datasets import load_breast_cancer
    from sklearn.metrics import log_loss

    with TemporaryDirectory() as tempdir:
        model1_path = os.path.join(tempdir, 'test_XGBClassifier.model')
        model1_booster_path = os.path.join(tempdir, 'test_XGBClassifier.booster')

        X, Y = load_breast_cancer(return_X_y=True)

        model1 = xgb.XGBClassifier(learning_rate=0.3, seed=0, n_estimators=8)
        model1.fit(X, Y)

        pred1 = model1.predict(X)
        # log_loss expects (y_true, y_pred)
        log_loss1 = log_loss(Y, pred1)

        # file name of stored xgb model
        model1.save_model(model1_path)
        model2 = xgb.XGBClassifier(learning_rate=0.3, seed=0, n_estimators=8)
        model2.fit(X, Y, xgb_model=model1_path)

        pred2 = model2.predict(X)
        log_loss2 = log_loss(Y, pred2)

        assert np.any(pred1 != pred2)
        assert log_loss1 > log_loss2

        # file name of 'Booster' instance Xgb model
        model1.get_booster().save_model(model1_booster_path)
        model2 = xgb.XGBClassifier(learning_rate=0.3, seed=0, n_estimators=8)
        model2.fit(X, Y, xgb_model=model1_booster_path)

        pred2 = model2.predict(X)
        log_loss2 = log_loss(Y, pred2)

        assert np.any(pred1 != pred2)
        assert log_loss1 > log_loss2
import numpy as np
# note: in modern scikit-learn, cross_val_score lives in sklearn.model_selection;
# the imports below match the old API this snippet was written against
from sklearn import cross_validation, datasets, naive_bayes


def to_file(answer, num):
    with open('bayes_answer' + str(num) + '.txt', 'w') as fout:
        fout.write(str(answer))


def get_ME(X, Y):
    return (cross_validation.cross_val_score(
                naive_bayes.BernoulliNB(), X, Y).mean(),
            cross_validation.cross_val_score(
                naive_bayes.MultinomialNB(), X, Y).mean(),
            cross_validation.cross_val_score(
                naive_bayes.GaussianNB(), X, Y).mean())


def terminate(X, Y):
    m1, m2, m3 = get_ME(X, Y)
    print('BernoulliNB: {0}\n'.format(m1),
          'MultinomialNB: {0}\n'.format(m2),
          'GaussianNB: {0}'.format(m3))
    return max(m1, m2, m3)


if __name__ == '__main__':
    pack = [datasets.load_digits(), datasets.load_breast_cancer()]
    X, Y, M, N = pack[0].data, pack[0].target, pack[1].data, pack[1].target
    to_file(terminate(M, N), 1)
    to_file(terminate(X, Y), 2)
    to_file('3 4', 3)
def experiments(config_file):
    args = get_args_parser().parse_args(['@' + config_file])

    # Set seed
    np.random.seed(int(args.seed))

    # Construct output directory
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    outdir = args.outdir + timestamp + '/'

    # Create results directory
    outdir_path = Path(outdir)
    if not outdir_path.is_dir():
        os.makedirs(outdir)

    # Logging
    logfile = outdir + 'log.txt'
    log(logfile, "Directory " + outdir + " created.")

    # Set dataset
    n_samples = 1500
    noisy_circles = make_circles(n_samples=n_samples, factor=.5, noise=.05)

    # Set plot settings
    plt.figure(figsize=(7 * 2 + 6, 12.5))
    plt.subplots_adjust(left=.02, right=.98, bottom=.001, top=.96,
                        wspace=.05, hspace=.01)
    plt.style.use('dark_background')
    plot_num = 1

    datasets = ((3, load_iris(return_X_y=True), "Iris"),
                (2, load_breast_cancer(return_X_y=True), "Breast Cancer"),
                (2, noisy_circles, "Noisy Circles"))

    # Traverse datasets
    # High-level abstraction is from https://scikit-learn.org/stable/modules/clustering.html
    for i, (n_clusters, dataset, dataset_name) in enumerate(datasets):
        X, y = dataset

        # Normalization of features for easier parameter selection
        X = StandardScaler().fit_transform(X)

        connectivity = kneighbors_graph(X, n_neighbors=10, include_self=False)
        # connectivity = 0.5 * (connectivity + connectivity.T)  # Make connectivity symmetric

        average_linkage = cluster.AgglomerativeClustering(
            linkage="average", affinity="cityblock",
            n_clusters=n_clusters, connectivity=connectivity)
        ward_linkage = cluster.AgglomerativeClustering(linkage="ward",
                                                       n_clusters=n_clusters)
        complete_linkage = cluster.AgglomerativeClustering(linkage="complete",
                                                           n_clusters=n_clusters)
        single_linkage = cluster.AgglomerativeClustering(linkage="single",
                                                         n_clusters=n_clusters)
        k_means = cluster.KMeans(n_clusters=n_clusters)
        gaussian_mixture = mixture.GaussianMixture(n_components=n_clusters,
                                                   covariance_type='full')

        # Set techniques
        techniques = (
            ('Agglomerative Avg', average_linkage),
            ('Agglomerative Single', single_linkage),
            ('Agglomerative Complete', complete_linkage),
            ('Agglomerative Ward', ward_linkage),
            ('kMeans', k_means),
            ('GaussianMixture', gaussian_mixture),
        )

        for name, technique in techniques:
            log(logfile, dataset_name + ", " + name)
            time_start = time.time()

            # Catch warnings related to kneighbors_graph
            with warnings.catch_warnings():
                warnings.filterwarnings(
                    "ignore",
                    message="the number of connected components of the " +
                    "connectivity matrix is [0-9]{1,2}" +
                    " > 1. Completing it to avoid stopping the tree early.",
                    category=UserWarning)
                warnings.filterwarnings(
                    "ignore",
                    message="Graph is not fully connected, spectral embedding" +
                    " may not work as expected.",
                    category=UserWarning)
                technique.fit(X)
            time_stop = time.time()

            # Predictions
            if hasattr(technique, 'labels_'):
                y_pred = technique.labels_.astype(int)  # np.int is removed in recent NumPy
            else:
                y_pred = technique.predict(X)

            # Entropy metric
            true_cluster_labels = [
                y[get_cluster_indices(c, y_pred)] for c in range(n_clusters)
            ]
            overall_entropy = get_overall_entropy(true_cluster_labels, y.shape[0])

            # F-Score metric
            f1_score = metrics.f1_score(y, y_pred, average='weighted')

            log(logfile, "\tOverall entropy: " + str(round(overall_entropy, 3)))
            log(logfile, "\tF1 Score: " + str(round(f1_score, 3)))

            # Plotting
            plt.subplot(len(datasets), len(techniques), plot_num)
            if i == 0:
                plt.title("{}".format(name), size=15)
            colors = np.array(list(islice(cycle(['#377eb8', '#ff7f00', '#4daf4a']),
                                          int(max(y_pred) + 1))))
            colors = np.append(colors, ["#000000"])  # Add black color for outliers (if any)
            plt.scatter(X[:, 0], X[:, 1], s=10, color=colors[y_pred], alpha=0.60)
            plt.xlim(-2.5, 2.5)
            plt.ylim(-2.5, 2.5)
            plt.xticks(())
            plt.yticks(())
            plt.text(.15, .01, ('%.2fs' % (time_stop - time_start)).lstrip('0'),
                     transform=plt.gca().transAxes, size=15,
                     horizontalalignment='right')
            plt.text(.99, .07, ('%.2f' % (overall_entropy)).lstrip('0'),
                     transform=plt.gca().transAxes, size=15,
                     horizontalalignment='right')
            plt.text(.99, .01, ('%.2f' % (f1_score)).lstrip('0'),
                     transform=plt.gca().transAxes, size=15,
                     horizontalalignment='right')
            plot_num += 1

    # Plotting
    plt.savefig(outdir + 'plot.png', bbox_inches='tight')
def entropy(y):
    unique, counts = np.unique(y, return_counts=True)
    ps = counts / len(y)
    return -np.sum([p * np.log2(p) for p in ps if p > 0])


class Node:
    def __init__(self, feature=None, threshold=None, left=None, right=None, *,
                 value=None):
        self.feature = feature
        self.threshold = threshold
        self.left = left
        self.right = right
        self.value = value

    def is_leaf_node(self):
        return self.value is not None


# DecisionTree is assumed to be defined elsewhere in this module
data = datasets.load_breast_cancer(return_X_y=True)  # dataset as a tuple
data = np.column_stack((data[0], data[1]))  # convert the dataset into an M x N array
decisionTree = DecisionTree(data, max_depth=10)
decisionTree.fit()
decisionTree.predict()
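# A quick sanity check for the entropy helper above: a perfectly balanced
# binary label vector has entropy 1 bit, while a single-class vector has
# entropy 0 (printed as -0.0 due to the leading minus sign).
import numpy as np
print(entropy(np.array([0, 0, 1, 1])))  # 1.0
print(entropy(np.array([1, 1, 1, 1])))  # -0.0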
# # Remember this above ^
import numpy as np
from sklearn import datasets


def sigmoid(soma):
    return 1 / (1 + np.exp(-soma))


def sigmoidDerivada(sig):
    return sig * (1 - sig)


base = datasets.load_breast_cancer()  # load the dataset
entradas = base.data  # take the dataset features as inputs
valoresSaida = base.target  # take the dataset targets as outputs
saidas = np.empty([569, 1], dtype=int)  # create an empty array

for i in range(569):
    saidas[i] = valoresSaida[i]  # fill the empty array with values from the dataset

pesos0 = 2 * np.random.random(
    (30, 5)) - 1  # 30 (input attributes), 5 (neurons in the hidden layer)
pesos1 = 2 * np.random.random(
    (5, 1)) - 1  # 5 (weights from the hidden layer), 1 (output neuron - classes)

epocas = 10000
taxaAprendizagem = 0.3
momento = 1
from tensorflow.keras import regularizers  # allows using whichever regularizer we want (l1, l2, l1_l2)
from tensorflow.keras.utils import to_categorical  # allows using categorical cross entropy as the cost function
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split as splitter
from sklearn.datasets import load_breast_cancer
import pickle
import os

# %%
"""Load breast cancer dataset"""

np.random.seed(0)  # create the same seed for random numbers every time

cancer = load_breast_cancer()  # download the breast cancer dataset

inputs = cancer.data  # feature matrix of 569 rows (samples) and 30 columns (parameters)
outputs = cancer.target  # label array of 569 rows (0 for malignant and 1 for benign)
labels = cancer.feature_names[0:30]

# print information about the dataset
print('The content of the breast cancer dataset is:')
print(labels)
print('-------------------------')
print("inputs = " + str(inputs.shape))
print("outputs = " + str(outputs.shape))
print("labels = " + str(labels.shape))

x = inputs  # reassign the feature and label matrices to other variables
y = outputs
# argument used to select which classification model to use
ap.add_argument("-m", "--model", type=str, default="knn",
                help="type of python machine learning model to use")
args = vars(ap.parse_args())

# dictionary of the models that can be used
models = {
    "knn": KNeighborsClassifier(n_neighbors=1),
    "naive_bayes": GaussianNB(),
    "logit": LogisticRegression(solver="lbfgs", multi_class="auto"),
    "svm": SVC(kernel="rbf", gamma="auto"),
    "decision_tree": DecisionTreeClassifier(),
    "pct": Perceptron(),
    "random_forest": RandomForestClassifier(n_estimators=100),
    "mlp": MLPClassifier()
}

# load the breast cancer dataset and split it for training and testing;
# in the example below, 25% of the data is used for testing and 75% for training
print("[INFO] loading data...")
dataset = load_breast_cancer()
(trainX, testX, trainY, testY) = train_test_split(dataset.data, dataset.target,
                                                  random_state=3, test_size=0.25)

# train using the model chosen by the user
print("[INFO] using '{}' model".format(args["model"]))
model = models[args['model']]
model.fit(trainX, trainY)

# evaluate the model on every row of the test data
print("[INFO] evaluating...")
predictions = model.predict(testX)
print(classification_report(testY, predictions,
                            target_names=dataset.target_names))
from qn2 import five_number_summary
from sklearn.preprocessing import MinMaxScaler


def normalize_minmax(data):
    # check that the input is a 2D array
    ndim = len(data.shape)
    if ndim != 2:
        return None
    else:
        ls = []
        # equivalent using sklearn:
        # scaler = MinMaxScaler()
        # scaler.fit(data)
        # return scaler.transform(data)
        for i in range(data.shape[1]):
            col = data[:, i]
            col = (col - np.min(col)) / (np.max(col) - np.min(col))
            ls.append(col)
        return np.array(ls).transpose()


bunchobject = datasets.load_breast_cancer()
cols = [1, 7]
some_columns = bunchobject.data[:, cols]
snorm = normalize_minmax(some_columns)
# print("normalized", five_number_summary(snorm))
# can use ndim to check for dimensions
""" Created on Sat Sep 19 19:47:14 2020 @author: https://www.codigofluente.com.br/aula-04-instalando-o-pandas/ """ from __future__ import division, print_function import skfuzzy as fuzz from sklearn.datasets import load_breast_cancer import numpy as np from sklearn.model_selection import train_test_split colors = ['b', 'orange', 'g', 'r', 'c', 'm', 'y', 'k', 'Brown', 'ForestGreen'] #Carrega o iris dataset em iris iris = load_breast_cancer() n_testes_uteis = 0 numeroDeTestes = 100 desempenho = 0 for cont in range(numeroDeTestes): ncenters = 2 #Divisão entre treino e teste aleatória (50% para cada) X_treino, X_teste, y_treino, y_teste = train_test_split(iris.data, iris.target, test_size=0.5) #Divisão entre treino e teste usando paridade do indice (50% para cada) #X_treino, X_teste, y_treino, y_teste = iris.data[0:][::2], iris.data[1:][::2], iris.target[0:][::2], iris.target[1:][::2]
""" Created on 2021/1/5 11:09 @author: Irvinfaith @email: [email protected] """ from core.base_neural_network import NeuralNetwork from core.optimizer import * from sklearn.datasets import load_breast_cancer from sklearn.model_selection import train_test_split from sklearn.preprocessing import MinMaxScaler # 导入样例数据 data_loader = load_breast_cancer() data = data_loader['data'] # 进行归一化 mms = MinMaxScaler() data = mms.fit_transform(data) # 拆分训练测试集 X_train, X_test, y_train, y_test = train_test_split(data, data_loader['target'], test_size=0.3, random_state=101) # 输入层 nn = NeuralNetwork(30, True) # 添加全连接层,并定义神经元个数以及该层的激活函数 nn.add_dense_layer(64, "sigmoid") nn.add_dense_layer(32, "sigmoid")
from sklearn.datasets import load_breast_cancer
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor

# note: the variable is named boston, but it holds the breast cancer data
boston = load_breast_cancer()
x = boston.data
y = boston.target

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, shuffle=True, random_state=1)

# exactly one model must be uncommented before fitting
model = KNeighborsClassifier()
# model = SVC()
# model = LinearSVC()
# model = KNeighborsRegressor()
# model = RandomForestClassifier()
# model = RandomForestRegressor()

# 3. run
model.fit(x_train, y_train)

# 4. evaluate / predict
y_predict = model.predict(x_test)
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
from collections import defaultdict
import numpy as np
from scipy.stats import spearmanr
from scipy.cluster import hierarchy
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance
import pandas as pd

data = load_breast_cancer()
data.target[[10, 50, 85]]
X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

clf = DecisionTreeClassifier(random_state=0)
path = clf.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas, impurities = path.ccp_alphas, path.impurities

fig, ax = plt.subplots()
ax.plot(ccp_alphas[:-1], impurities[:-1], marker='o', drawstyle="steps-post")
ax.set_xlabel("effective alpha")
ax.set_ylabel("total impurity of leaves")
ax.set_title("Total Impurity vs effective alpha for training set")
def download_datasets():
    i = 0

    i += 1
    print("%d of 10" % i)
    print("Iris\n")
    iris = load_iris()
    X1 = iris.data
    Y1 = iris.target

    i += 1
    print("%d of 10" % i)
    print("Digits\n")
    digits = load_digits()
    X2 = digits.data
    Y2 = digits.target

    i += 1
    print("%d of 10" % i)
    print("Breast cancer\n")
    breast_cancer = load_breast_cancer()
    X3 = breast_cancer.data
    Y3 = breast_cancer.target

    i += 1
    print("%d of 10" % i)
    print("Sensorless drive diagnosis\n")
    Sensorless_data = pd.read_csv(
        'https://archive.ics.uci.edu/ml/machine-learning-databases/00325/Sensorless_drive_diagnosis.txt',
        sep=' ', header=None)
    X4 = Sensorless_data.values[:, 0:48]
    Y4 = Sensorless_data.values[:, 48]

    i += 1
    print("%d of 10" % i)
    print("Banknote authentication\n")
    banknote_data = pd.read_csv(
        'https://archive.ics.uci.edu/ml/machine-learning-databases/00267/data_banknote_authentication.txt',
        sep=',', header=None)
    X5 = banknote_data.values[:, 0:4]
    Y5 = banknote_data.values[:, 4]

    i += 1
    print("%d of 10" % i)
    print("Balance\n")
    balance_data = pd.read_csv(
        'https://archive.ics.uci.edu/ml/machine-learning-databases/balance-scale/balance-scale.data',
        sep=',', header=None)
    X6 = balance_data.values[:, 1:5]
    Y6 = balance_data.values[:, 0]

    i += 1
    print("%d of 10" % i)
    print("Wifi localization\n")
    wifi_data = pd.read_csv(
        'https://archive.ics.uci.edu/ml/machine-learning-databases/00422/wifi_localization.txt',
        sep=r'\s+', header=None)
    X7 = wifi_data.values[:, 0:7]
    Y7 = wifi_data.values[:, 7]

    i += 1
    print("%d of 10" % i)
    print("CMC\n")
    cmc_data = pd.read_csv(
        'https://archive.ics.uci.edu/ml/machine-learning-databases/cmc/cmc.data',
        sep=',', header=None)
    X8 = cmc_data.values[:, 0:9]
    Y8 = cmc_data.values[:, 9]

    i += 1
    print("%d of 10" % i)
    print("Yeast\n")
    yeast_data = pd.read_csv(
        'https://archive.ics.uci.edu/ml/machine-learning-databases/yeast/yeast.data',
        sep=r'\s+', header=None)
    X9 = yeast_data.values[:, 1:9]
    Y9 = yeast_data.values[:, 9]

    i += 1
    print("%d of 10" % i)
    print("Abalone\n")
    abalone_data = pd.read_csv(
        'https://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data',
        sep=',', header=None)
    X10 = abalone_data.values[:, 0:8]
    # encode the sex column numerically (renamed the loop variable so it does
    # not shadow the counter i)
    for row in X10:
        if row[0] == 'M':
            row[0] = 1
        elif row[0] == 'F':
            row[0] = 2
        else:
            row[0] = 3
    Y10 = abalone_data.values[:, 8].astype(int)

    print("Saving the datasets for future use.")
    pickle.dump((X1, X2, X3, X4, X5, X6, X7, X8, X9, X10),
                open('datasets_x.pkl', 'wb'))
    pickle.dump((Y1, Y2, Y3, Y4, Y5, Y6, Y7, Y8, Y9, Y10),
                open('datasets_y.pkl', 'wb'))
# -*- coding: utf-8 -*-
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier

cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, stratify=cancer.target, random_state=0)

clf = GradientBoostingClassifier(random_state=0, max_depth=3,
                                 learning_rate=0.01, subsample=0.2)
clf.fit(X_train, y_train)

print("Training set accuracy: {:.3f}".format(clf.score(X_train, y_train)))
print("Test set accuracy: {:.3f}".format(clf.score(X_test, y_test)))

import os
import errno
# pip install joblib
from joblib import dump

try:
    if not os.path.isdir("../../save"):
        os.makedirs(os.path.join("../../save"))
except OSError as e:
    if e.errno != errno.EEXIST:
        print("Failed to create directory!!!!!")
def plot_agglomerative_dendograms(config_file):
    args = get_args_parser().parse_args(['@' + config_file])

    # Set seed
    np.random.seed(int(args.seed))

    # Construct output directory
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    outdir = args.outdir + timestamp + '/'

    # Create results directory
    outdir_path = Path(outdir)
    if not outdir_path.is_dir():
        os.makedirs(outdir)

    # Set dataset
    n_samples = 1500
    noisy_circles = make_circles(n_samples=n_samples, factor=.5, noise=.05)

    plt.figure(figsize=(2 * 10 + 2, 18.5))
    plt.subplots_adjust(left=.02, right=.98, bottom=.001, top=.96,
                        wspace=.05, hspace=.01)
    plt.style.use('dark_background')
    plot_num = 1

    datasets = ((3, load_iris(return_X_y=True)),
                (2, load_breast_cancer(return_X_y=True)),
                (2, noisy_circles))

    for i, (n_clusters, dataset) in enumerate(datasets):
        X, y = dataset

        # Normalization of features for easier parameter selection
        X = StandardScaler().fit_transform(X)

        connectivity = kneighbors_graph(X, n_neighbors=10, include_self=False)
        connectivity = 0.5 * (connectivity + connectivity.T)  # Make connectivity symmetric

        # Setting distance_threshold=0 ensures we compute the full tree.
        # Source: https://scikit-learn.org/stable/auto_examples/cluster/plot_agglomerative_dendrogram.html
        average_linkage = cluster.AgglomerativeClustering(
            linkage="average", affinity="cityblock", distance_threshold=0,
            n_clusters=None, connectivity=connectivity)
        ward_linkage = cluster.AgglomerativeClustering(
            linkage="ward", distance_threshold=0, n_clusters=None)
        complete_linkage = cluster.AgglomerativeClustering(
            linkage="complete", distance_threshold=0, n_clusters=None)
        single_linkage = cluster.AgglomerativeClustering(
            linkage="single", distance_threshold=0, n_clusters=None)

        techniques = (('Agglomerative Avg', average_linkage),
                      ('Agglomerative Single', single_linkage),
                      ('Agglomerative Complete', complete_linkage),
                      ('Agglomerative Ward', ward_linkage))

        for name, technique in techniques:
            # Catch warnings related to kneighbors_graph
            with warnings.catch_warnings():
                warnings.filterwarnings(
                    "ignore",
                    message="the number of connected components of the " +
                    "connectivity matrix is [0-9]{1,2}" +
                    " > 1. Completing it to avoid stopping the tree early.",
                    category=UserWarning)
                warnings.filterwarnings(
                    "ignore",
                    message="Graph is not fully connected, spectral embedding" +
                    " may not work as expected.",
                    category=UserWarning)
                model = technique.fit(X)

            plt.subplot(len(datasets), len(techniques), plot_num)
            if i == 0:
                plt.title("{}".format(name), size=18)
            plt.xlim(-2.5, 2.5)
            plt.ylim(-2.5, 2.5)
            plt.xticks(())
            plt.yticks(())
            plot_dendrogram(model, truncate_mode='level', p=n_clusters,
                            no_labels=True)
            plot_num += 1

    # Plotting
    plt.savefig(outdir + 'agglomerative_dendrograms.png', bbox_inches='tight')
from xgboost import XGBRegressor, plot_importance, XGBClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, accuracy_score
import matplotlib.pyplot as plt

dataset = load_breast_cancer()
# x = dataset.data
# y = dataset.target
x, y = load_breast_cancer(return_X_y=True)
print(x.shape)
print(y.shape)

# shuffle must be a boolean, not the string 'True'
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.96,
                                                    shuffle=True,
                                                    random_state=16)

model = XGBClassifier(n_estimators=300, learning_rate=0.1)

model.fit(x_train, y_train, verbose=True,
          eval_metric=["logloss", "rmse", "mae"],
          eval_set=[(x_train, y_train), (x_test, y_test)],
          early_stopping_rounds=20)
def setUp(self):
    data = load_breast_cancer()
    self.Xtr, self.Xte, self.Ytr, self.Yte = train_test_split(
        data.data, data.target, shuffle=True, train_size=50)
x_test, y_test = x[test_idx, :], y[test_idx]

print('\nTraining')
print(x_train.shape, y_train.shape)
print('\nValidating')
print(x_val.shape, y_val.shape)
print('\nTesting')
print(x_test.shape, y_test.shape)

print('\n\t\t---------- Breast cancer datasets --------')
breast_cancer_data = skdata.load_breast_cancer()  # Loading data
feature_names = breast_cancer_data.feature_names
x1 = breast_cancer_data.data
y1 = breast_cancer_data.target
print('\n', x1.shape)
print('\n', y1.shape)

for name, value in zip(feature_names, x1[0, ...]):
    print('\n{} : {}'.format(name, value))

print('\nLabel: {}'.format(y1[0]))
def breast_cancer_dataset():
    X, y = load_breast_cancer(return_X_y=True)
    X = StandardScaler().fit_transform(X)
    return X, y
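# A hypothetical usage sketch for the fixture above: after StandardScaler,
# every feature column should have (near-)zero mean and unit variance.
X, y = breast_cancer_dataset()
print(X.mean(axis=0).round(6))  # ~0 for each of the 30 columns
print(X.std(axis=0).round(6))   # ~1 for each of the 30 columns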
# Import Libraries
from sklearn.datasets import load_breast_cancer
from sklearn.impute import SimpleImputer
import numpy as np
#----------------------------------------------------

# load breast cancer data
BreastData = load_breast_cancer()

# X Data
X = BreastData.data

# y Data
y = BreastData.target
#----------------------------------------------------

# Cleaning data
'''
impute.SimpleImputer(missing_values=nan, strategy='mean', fill_value=None,
                     verbose=0, copy=True)
'''
ImputedModule = SimpleImputer(missing_values=np.nan, strategy='mean')
ImputedX = ImputedModule.fit(X)
X = ImputedX.transform(X)

# X Data
print('X Data is \n', X[:10])

# y Data
print('y Data is \n', y[:10])
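# Note: the breast cancer matrix contains no NaNs, so the imputer above is a
# no-op on X. A tiny illustration with actual missing values (hypothetical
# demo data, not part of the original snippet):
import numpy as np
from sklearn.impute import SimpleImputer

demo = np.array([[1.0, np.nan], [3.0, 4.0], [np.nan, 8.0]])
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
print(imp.fit_transform(demo))  # NaNs replaced by column means: [[1,6],[3,4],[2,8]]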
def setUp(self):
    super(ConfounderRemovalTests, self).setUp()

    self.X, self.y = load_breast_cancer(True)
    self.X_train = self.X[:100]
    self.y_train = self.y[:100]
    self.shuffle_split = ShuffleSplit(test_size=0.2, n_splits=1, random_state=15)
    settings = OutputSettings(project_folder=self.tmp_folder_path)

    self.pipe = Hyperpipe(
        "confounder_pipe",
        outer_cv=self.shuffle_split,
        inner_cv=KFold(n_splits=3, random_state=15),
        metrics=["accuracy"],
        best_config_metric="accuracy",
        output_settings=settings,
    )
    self.pipe += PipelineElement("StandardScaler")
    self.cr = PipelineElement("ConfounderRemoval")
    self.pipe += self.cr
    self.pipe += PipelineElement("SVC")
    self.random_confounders = np.random.randn(self.X.shape[0], 1)

    # do confounder removal by hand
    self.multiple_confounders = np.random.randn(self.X.shape[0], 2) * 10
    ols_confounder = sm.add_constant(self.multiple_confounders)
    self.X_transformed = np.empty(self.X.shape)
    for i in range(self.X.shape[1]):
        # fit
        model = sm.OLS(endog=np.squeeze(self.X[:, i]), exog=ols_confounder).fit()
        # transform
        self.X_transformed[:, i] = np.asarray(
            np.squeeze(self.X[:, i]) -
            np.matmul(ols_confounder, np.squeeze(model.params)))

    # prepare caching
    self.X_train_transformed = np.empty(self.X_train.shape)
    self.confounder_train = self.multiple_confounders[:100]
    ols_confounder_train = sm.add_constant(self.confounder_train)
    for i in range(self.X_train.shape[1]):
        # fit
        model = sm.OLS(endog=np.squeeze(self.X_train[:, i]),
                       exog=ols_confounder_train).fit()
        # transform
        self.X_train_transformed[:, i] = np.asarray(
            np.squeeze(self.X_train[:, i]) -
            np.matmul(ols_confounder_train, np.squeeze(model.params)))

    # prepare confounder removal with standardization of covariates
    scaled_covs = list()
    # standardize covariates
    for cov in self.multiple_confounders.T:
        scaler = StandardScaler()
        scaled_covs.append(scaler.fit_transform(cov.reshape(-1, 1)).squeeze())
    scaled_covs = np.asarray(scaled_covs).T
    scaled_covs = sm.add_constant(scaled_covs)
    self.X_transformed_standardized = np.empty(self.X.shape)
    for i in range(self.X.shape[1]):
        # fit
        model = sm.OLS(endog=np.squeeze(self.X[:, i]), exog=scaled_covs).fit()
        # transform
        self.X_transformed_standardized[:, i] = np.asarray(
            np.squeeze(self.X[:, i]) -
            np.matmul(scaled_covs, np.squeeze(model.params)))

    # prepare statistical testing of confounder removal
    # Generate samples from four independent normally distributed random
    # variables (with mean 0 and std. dev. 1).
    x = norm.rvs(size=(4, 300))
    # desired covariance matrix
    r = np.array([
        [1, 0.9, 0.9, 0.9],
        [0.9, 1, 0.9, 0.9],
        [0.9, 0.9, 1, 0.9],
        [0.9, 0.9, 0.9, 1],
    ])
    c = cholesky(r, lower=True)
    # convert the data to correlated random variables
    self.z = np.dot(c, x).T
from xgboost import XGBClassifier, XGBRegressor
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score

# 1. data
# dataset = load_boston()
# x = dataset.data
# y = dataset.target
x, y = load_breast_cancer(return_X_y=True)

x_train, x_test, y_train, y_test = train_test_split(
    x, y, train_size=0.8, random_state=77, shuffle=True
)

# 2. model
# model = XGBRegressor(n_estimators=1000, learning_rate=0.1)
# without n_estimators, what is the default? -> it runs 100 times:
# n_estimators defaults to 100
def gen_dataset_func_eager():
    data_raw = load_breast_cancer(as_frame=True)
    dataset_df = data_raw["data"]
    dataset_df["target"] = data_raw["target"]
    dataset = from_pandas(dataset_df)
    return dataset
import numpy as np  # needed below for np.set_printoptions
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.datasets import load_breast_cancer  # built-in datasets that come with scikit-learn (sklearn)

# This config fixes the truncated console writes. Makes things a lot easier
pd.set_option('display.width', 400)
pd.set_option('display.max_columns', 10)
np.set_printoptions(linewidth=400)

# ['data', 'target', 'target_names', 'DESCR', 'feature_names', 'filename']
# The datatype is a little odd; it works like a dictionary, which can then be
# used to get the data out that we need
my_data = load_breast_cancer()
my_df_var = pd.DataFrame(data=my_data['data'], columns=my_data['feature_names'])

# !!!!!!! Scale the data !!!!!!!!!!!!
# Standard scaler fitted to ensure that all data is scaled in the same manner
my_scalar = StandardScaler().fit(my_df_var)
my_df_scaled = pd.DataFrame(data=my_scalar.transform(my_df_var),
                            columns=my_data['feature_names'])  # scaled data

# !!!!! Perform the PCA !!!!!!!!!!!!
my_pca = PCA(n_components=2).fit(my_df_scaled)  # keep the first two PCs
def load_data() -> pd.DataFrame:
    data = load_breast_cancer()
    df = pd.DataFrame(data["data"], columns=data["feature_names"])
    df["target"] = data["target"]
    return df
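# A minimal usage sketch for load_data() above (assumes pandas as pd and
# load_breast_cancer are imported as elsewhere in this file):
df = load_data()
print(df.shape)                      # (569, 31): 30 feature columns plus "target"
print(df["target"].value_counts())  # 357 benign (1) vs 212 malignant (0)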
def main(config):
    # select the compute device
    if config.gpu_id < 0:
        print("Device: CPU")
        device = torch.device('cpu')
    else:
        print("Device:", torch.cuda.get_device_name(0))
        device = torch.device('cuda:%d' % config.gpu_id)

    # load the breast cancer data
    cancer_data = load_breast_cancer()
    df = pd.DataFrame(cancer_data.data, columns=cancer_data.feature_names)
    df['class'] = cancer_data.target

    data = torch.from_numpy(df.values).float()
    x = data[:, :30]
    y = data[:, -1:]

    # shuffle and split into train / validation / test sets
    ratios = [.6, .2, .2]
    train_cnt = int(x.size(0) * ratios[0])
    valid_cnt = int(x.size(0) * ratios[1])
    test_cnt = x.size(0) - train_cnt - valid_cnt
    cnts = [train_cnt, valid_cnt, test_cnt]

    indices = torch.randperm(x.size(0))
    x = torch.index_select(x, dim=0, index=indices).to(device)
    y = torch.index_select(y, dim=0, index=indices).to(device)
    x = x.split(cnts, dim=0)
    y = y.split(cnts, dim=0)

    # wrap the data with torch Dataset and DataLoader objects
    train_loader = DataLoader(dataset=CustomDataset(x[0], y[0]),
                              batch_size=config.batch_size, shuffle=True)
    valid_loader = DataLoader(dataset=CustomDataset(x[1], y[1]),
                              batch_size=config.batch_size, shuffle=False)
    test_loader = DataLoader(dataset=CustomDataset(x[2], y[2]),
                             batch_size=config.batch_size, shuffle=False)

    print("Train %d / Valid %d / Test %d samples." % (
        len(train_loader.dataset),
        len(valid_loader.dataset),
        len(test_loader.dataset),
    ))

    # declare the model and decide its structure
    model = CancerClassifier(x[0].size(-1), y[0].size(-1)).to(device)
    optimizer = optim.Adam(model.parameters())

    # run training
    trainer = Trainer(model, optimizer, train_loader, valid_loader)
    trainer.train(config)

    # Loss history
    plot_from = 2
    plt.figure(figsize=(20, 10))
    plt.grid(True)
    plt.title("Train / Valid Loss History")
    plt.plot(
        range(plot_from, len(trainer.train_history)),
        trainer.train_history[plot_from:],
        range(plot_from, len(trainer.valid_history)),
        trainer.valid_history[plot_from:],
    )
    plt.yscale('log')
    plt.show()

    # Evaluate
    test_loss = 0
    y_hat = []
    model.eval()
    with torch.no_grad():
        for x_i, y_i in test_loader:
            y_hat_i = model(x_i)
            loss = F.binary_cross_entropy(y_hat_i, y_i)
            test_loss += float(loss)  # gradient is already detached
            y_hat += [y_hat_i]

    test_loss = test_loss / len(test_loader)
    y_hat = torch.cat(y_hat, dim=0)
    print("Test loss: %.4e" % test_loss)

    correct_cnt = (y[2] == (y_hat > .5)).sum()
    total_cnt = float(y[2].size(0))
    print('Test Accuracy: %.4f' % (correct_cnt / total_cnt))
- wandb
depend:
  requirements:
    - scikit-learn
assert:
  - :wandb:runs_len: 1
  - :wandb:runs[0][exitcode]: 0
  - :yea:exit: 0
  - :wandb:runs[0][summary][feature_importances][_type]: table-file
  - :wandb:runs[0][summary][feature_importances][ncols]: 2
  - :wandb:runs[0][summary][feature_importances][nrows]: 30
"""
from sklearn import datasets
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import train_test_split

import wandb

wandb.init("my-scikit-integration")

wbcd = wisconsin_breast_cancer_data = datasets.load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(wbcd.data, wbcd.target,
                                                    test_size=0.2)
labels = wbcd.target_names

model = ElasticNet()
model.fit(X_train, y_train)

wandb.sklearn.plot_feature_importances(model)
import sklearn as skt
import numpy as np
import sklearn.datasets as data
import matplotlib.pyplot as plt

dx, dy = data.load_breast_cancer(return_X_y=True)

# plt.plot draws one line per feature column (30 lines) against the binary
# target; a per-feature scatter is usually more readable for this data
plt.figure()
plt.plot(dx, dy)
plt.show()
X_C2, y_C2 = make_classification(n_samples=100, n_features=2,  # 100 n_samples
                                 n_redundant=0, n_informative=2,
                                 n_clusters_per_class=1,
                                 flip_y=0.1,  # 10% chance of flipping the correct label; poses a challenge to the classifier
                                 class_sep=0.5, random_state=0)
plt.scatter(X_C2[:, 0], X_C2[:, 1], c=y_C2, marker='o', s=50, cmap=cmap_bold)

# More difficult synthetic data for binary classification with non-linearly-separable classes.
from sklearn.datasets import make_blobs
X_D2, y_D2 = make_blobs(n_samples=100, n_features=2, centers=8,
                        cluster_std=1.3, random_state=4)  # 100 samples grouped into 8 clusters
y_D2 %= 2  # To make the cluster blobs binary
plt.figure()
plt.title("(4) Sample binary classification problem with non-linearly-separable classes")
plt.scatter(X_D2[:, 0], X_D2[:, 1], c=y_D2, marker='o', s=50, cmap=cmap_bold)

# Breast cancer dataset for classification
cancer = load_breast_cancer()
X_cancer, y_cancer = load_breast_cancer(return_X_y=True)

# Communities and Crime dataset
X_crime, y_crime = load_crime_dataset()  # Target value to predict: per capita violent crime rate.

"""
====K-Nearest Neighbors====
"""
# Classification
from adspy_shared_utilities import plot_two_class_knn

X_train, X_test, y_train, y_test = train_test_split(X_C2, y_C2, random_state=0)

# Figures 5-7
plot_two_class_knn(X_train, y_train, 1, "uniform", X_test, y_test)  # Overfitting for complex model b/c too much variance.
plot_two_class_knn(X_train, y_train, 3, "uniform", X_test, y_test)  # General trend more properly captured. Less accuracy in training set,
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import datasets
import matplotlib.pyplot as plt

from logistic_regression import LogisticRegression


def accuracy(y_true, y_pred):
    accuracy = np.sum(y_true == y_pred) / len(y_true)
    return accuracy


bc = datasets.load_breast_cancer()
X, y = bc.data, bc.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=1234)

regressor = LogisticRegression(learning_rate=0.0001, n_iters=1000)
regressor.fit(X_train, y_train)
predictions = regressor.predict(X_test)

print("LR classification accuracy:", accuracy(y_test, predictions))
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report

cancer_data = load_breast_cancer()
X = cancer_data.data
Y = cancer_data.target

print('Input Data size :', X.shape)
print('Output Data size :', Y.shape)
print('Label names :', cancer_data.target_names)

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=42)

clf = SVC(kernel='linear', C=1.0, random_state=42)
clf.fit(X_train, Y_train)

accuracy = clf.score(X_test, Y_test)
print(f'The accuracy is: {accuracy*100:.1f}%')

pred = clf.predict(X_test)
print(classification_report(Y_test, pred,
                            target_names=cancer_data.target_names))
def Breast_cancer(training_size, test_size, n, PLOT_DATA):
    class_labels = [r'A', r'B']
    data, target = datasets.load_breast_cancer(True)
    sample_train, sample_test, label_train, label_test = train_test_split(
        data, target, test_size=0.3, random_state=12)

    # Now we standardize for a Gaussian around 0 with unit variance
    std_scale = StandardScaler().fit(sample_train)
    sample_train = std_scale.transform(sample_train)
    sample_test = std_scale.transform(sample_test)

    # Now reduce the number of features to the number of qubits
    pca = PCA(n_components=n).fit(sample_train)
    sample_train = pca.transform(sample_train)
    sample_test = pca.transform(sample_test)

    # Scale to the range (-1, +1)
    samples = np.append(sample_train, sample_test, axis=0)
    minmax_scale = MinMaxScaler((-1, 1)).fit(samples)
    sample_train = minmax_scale.transform(sample_train)
    sample_test = minmax_scale.transform(sample_test)

    # Pick training size number of samples from each distro
    training_input = {key: (sample_train[label_train == k, :])[:training_size]
                      for k, key in enumerate(class_labels)}
    test_input = {key: (sample_train[label_train == k, :])[training_size:(
        training_size + test_size)] for k, key in enumerate(class_labels)}

    if PLOT_DATA:
        for k in range(0, 2):
            plt.scatter(sample_train[label_train == k, 0][:training_size],
                        sample_train[label_train == k, 1][:training_size])
        plt.title("PCA dim. reduced Breast cancer dataset")
        plt.show()

    return sample_train, training_input, test_input, class_labels