def test_fetch_covtype_true_shuffle():
    hold1 = fetch_covtype(download_if_missing=True, shuffle=True)
    hold2 = fetch_covtype(download_if_missing=False, shuffle=False)
    data1, data2 = hold1['data'], hold2['data']
    target1, target2 = hold1['target'], hold2['target']
    assert_false(np.array_equal(data1, data2))
    assert_false(np.array_equal(target1, target2))
def covtype_binary(dataset_dir: Path) -> bool:
    """
    Cover type dataset from the UCI machine learning repository:
    https://archive.ics.uci.edu/ml/datasets/covertype

    The original target has 7 unique class labels (1 to 7 inclusive); here it
    is binarized to (y > 3), giving a binary classification task.

    covtype_binary X train dataset (100000, 54)
    covtype_binary y train dataset (100000,)
    covtype_binary X test dataset  (100000, 54)
    covtype_binary y test dataset  (100000,)
    """
    dataset_name = 'covtype_binary'
    os.makedirs(dataset_dir, exist_ok=True)

    nrows_train, nrows_test = 100000, 100000
    logging.info(f'Started loading {dataset_name}')
    X, y = fetch_covtype(return_X_y=True)  # pylint: disable=unexpected-keyword-arg
    logging.info(f'{dataset_name} is loaded, started parsing...')

    y = (y > 3).astype(int)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=77, train_size=nrows_train,
        test_size=nrows_test, shuffle=False)
    for data, name in zip((X_train, X_test, y_train, y_test),
                          ('x_train', 'x_test', 'y_train', 'y_test')):
        filename = f'{dataset_name}_{name}.npy'
        np.save(os.path.join(dataset_dir, filename), data)
    logging.info(f'dataset {dataset_name} is ready.')
    return True
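# A minimal usage sketch for covtype_binary above; the './data' path is
# illustrative, and this assumes numpy, os, and pathlib.Path are already
# imported as in the function itself.
if covtype_binary(Path('./data')):
    X_train = np.load(os.path.join('./data', 'covtype_binary_x_train.npy'))
    y_train = np.load(os.path.join('./data', 'covtype_binary_y_train.npy'))
    print(X_train.shape, y_train.shape)  # expected: (100000, 54) (100000,)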
def testCovtype():
    from sklearn.datasets import fetch_covtype
    covtype = fetch_covtype()
    total = len(covtype.data)

    onePercent = int(total * 0.01)
    baseMap = map(zipToMap, zip(covtype.data[:onePercent],
                                covtype.target[:onePercent]))
    onePercentDataFrame = pd.DataFrame(baseMap)

    init = time.time()
    clusters = minasOffline(onePercentDataFrame)
    print(f'minasOffline(testCovtype) => {len(clusters)}, {time.time() - init} seconds')
    print(len(clusters))

    fivePercent = int(total * 0.05)
    fivePercentZip = zip(covtype.data[onePercent + 1:fivePercent],
                         map(str, covtype.target[onePercent + 1:fivePercent]))
    inputStream = (Example(item=i, label=t) for i, t in fivePercentZip)
    init = time.time()
    for o in metaMinas(minasOnline(inputStream, clusters)):
        print(o)
    print(f'metaMinas(minasOnline(testCovtype)) {time.time() - init} seconds')
def generate_covertype():
    covtype = fetch_covtype()
    X = np.array(covtype["data"], dtype=float)
    y = np.array(covtype["target"]) == 2  # very easy binary task

    clf = xgb.XGBClassifier(objective="reg:logistic", nthread=4,
                            tree_method="hist", max_depth=4,
                            learning_rate=0.5, n_estimators=10)
    model = clf.fit(X, y)
    at = addtree_from_xgb_model(model)
    at.base_score = 0.0

    err = sum(y != model.predict(X)) / len(y)
    mae = mean_absolute_error(model.predict(X[:1000], output_margin=True),
                              at.predict(X[:1000]))
    print(f"easy covtype: error rate {err}")
    print(f"easy covtype: mae model difference {mae}")

    # edge case test: place a feature value exactly on a split boundary
    _, feat_id, split_value = at[0].get_split(0)
    Xt = [X[12]]
    Xt[0][feat_id] = split_value
    print("edge case diff: ",
          model.predict(Xt, output_margin=True) - at.predict(Xt))

    at.write("tests/models/xgb-covtype-easy.json")
def forest_dataload():
    from sklearn.datasets import fetch_covtype
    import numpy as np

    forest = fetch_covtype()
    Data = forest['data']
    label = forest['target']
    return Data, label
def load_data(dtype=np.float32, order='C'):
    ######################################################################
    ## Load dataset
    print("Loading dataset...")
    data = fetch_covtype(download_if_missing=True, shuffle=True,
                         random_state=opts.random_seed)
    X, y = data['data'], data['target']
    X = np.asarray(X, dtype=dtype)

    if order.lower() == 'f':
        X = np.asfortranarray(X)

    # class 1 vs. all others.
    y[np.where(y != 1)] = -1

    ######################################################################
    ## Create train-test split (as [Joachims, 2006])
    logger.info("Creating train-test split...")
    n_train = 522911
    X_train = X[:n_train]
    y_train = y[:n_train]
    X_test = X[n_train:]
    y_test = y[n_train:]

    ######################################################################
    ## Standardize first 10 features (the numerical ones)
    mean = X_train.mean(axis=0)
    std = X_train.std(axis=0)
    mean[10:] = 0.0
    std[10:] = 1.0
    X_train = (X_train - mean) / std
    X_test = (X_test - mean) / std
    return X_train, X_test, y_train, y_test
def setUpClass(cls):
    # setupLog()
    with open('logging.conf.yaml', 'r') as f:
        config = yaml.load(f, Loader=yaml.FullLoader)
        logging.config.dictConfig(config)
    dataset = fetch_covtype()
    cls.dataset = dataset
    total = len(dataset.data)
    print('sizeof dataset', sizeof_fmt(dataset.data.nbytes), 'len', total)
    print('dataset', dataset.data[0], dataset.target[0])

    zipToMap = lambda x: {'item': x[0], 'label': str(x[1])}

    onePercent = int(total * 0.01)
    baseMap = map(zipToMap, zip(dataset.data[:onePercent],
                                dataset.target[:onePercent]))
    cls.onPercentDataFrame = pd.DataFrame(baseMap)

    fivePercent = int(total * 0.05)
    fivePercentZip = zip(dataset.data[onePercent + 1:fivePercent],
                         map(str, dataset.target[onePercent + 1:fivePercent]))
    cls.fivePercentDataIterator = list(fivePercentZip)

    tenPercent = int(total * 0.10)
    baseMap = map(zipToMap, zip(dataset.data[:tenPercent],
                                dataset.target[:tenPercent]))
    cls.tenPercentDataFrame = pd.DataFrame(baseMap)

    cls.allDataIterator = list(zip(dataset.data, map(str, dataset.target)))
def load_data():
    # Load dataset
    print("Loading dataset...")
    data = fetch_covtype(download_if_missing=True, shuffle=True,
                         random_state=RANDOM_STATE)
    X = check_array(data["data"], dtype=np.float32, order="C")
    y = (data["target"] != 1).astype(int)

    # Create train-test split (as [Joachims, 2006])
    print("Creating train-test split...")
    n_train = 522911
    X_train = X[:n_train]
    y_train = y[:n_train]
    X_test = X[n_train:]
    y_test = y[n_train:]

    # Standardize first 10 features (the numerical ones)
    mean = X_train.mean(axis=0)
    std = X_train.std(axis=0)
    mean[10:] = 0.0
    std[10:] = 1.0
    X_train = (X_train - mean) / std
    X_test = (X_test - mean) / std
    return X_train, X_test, y_train, y_test
def load_data(dtype=np.float32, order='C', random_state=13):
    """Load the data, then cache and memmap the train/test split"""
    ######################################################################
    # Load dataset
    print("Loading dataset...")
    data = fetch_covtype(download_if_missing=True, shuffle=True,
                         random_state=random_state)
    X = check_array(data['data'], dtype=dtype, order=order)
    y = (data['target'] != 1).astype(int)

    # Create train-test split (as [Joachims, 2006])
    print("Creating train-test split...")
    n_train = 522911
    X_train = X[:n_train]
    y_train = y[:n_train]
    X_test = X[n_train:]
    y_test = y[n_train:]

    # Standardize first 10 features (the numerical ones)
    mean = X_train.mean(axis=0)
    std = X_train.std(axis=0)
    mean[10:] = 0.0
    std[10:] = 1.0
    X_train = (X_train - mean) / std
    X_test = (X_test - mean) / std
    return X_train, X_test, y_train, y_test
def get_cover_type(num_rows=None):
    data = datasets.fetch_covtype()
    data = data.data
    if num_rows is not None:
        data = data[0:num_rows]
    return data
def load_data(dtype=np.float32, order='F'):
    ######################################################################
    ## Load dataset
    print("Loading dataset...")
    data = fetch_covtype(download_if_missing=True, shuffle=True,
                         random_state=opts.random_seed)
    X, y = data['data'], data['target']
    if order.lower() == 'f':
        X = np.asfortranarray(X)

    # class 1 vs. all others.
    y[np.where(y != 1)] = -1

    ######################################################################
    ## Create train-test split (as [Joachims, 2006])
    logger.info("Creating train-test split...")
    n_train = 522911
    X_train = X[:n_train]
    y_train = y[:n_train]
    X_test = X[n_train:]
    y_test = y[n_train:]

    ######################################################################
    ## Standardize first 10 features (the numerical ones)
    mean = X_train.mean(axis=0)
    std = X_train.std(axis=0)
    mean[10:] = 0.0
    std[10:] = 1.0
    X_train = (X_train - mean) / std
    X_test = (X_test - mean) / std
    return X_train, X_test, y_train, y_test
def make_forest_cover_data():
    forest_cover = fetch_covtype()
    X, y = forest_cover.data, forest_cover.target
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, random_state=0)
    cols = [
        'Cover_Type', 'Elevation', 'Aspect', 'Slope',
        'Horizontal_Distance_To_Hydrology', 'Vertical_Distance_To_Hydrology',
        'Horizontal_Distance_To_Roadways', 'Hillshade_9am', 'Hillshade_Noon',
        'Hillshade_3pm', 'Horizontal_Distance_To_Fire_Points',
        'Wilderness_Area1', 'Wilderness_Area2', 'Wilderness_Area3',
        'Wilderness_Area4',
    ] + [f'Soil_Type{i}' for i in range(1, 41)]
    train_df = pd.DataFrame(np.hstack([y_train.reshape(-1, 1), X_train]),
                            columns=cols)
    test_df = pd.DataFrame(np.hstack([y_test.reshape(-1, 1), X_test]),
                           columns=cols)
    return train_df, test_df
def forestcover(random_state=1):
    data = fetch_covtype()
    x = data.data
    x = MinMaxScaler().fit_transform(x)
    y = data.target
    # Class 2 is treated as normal (+1), class 4 as anomalous (-1);
    # all other classes are dropped.
    y = np.array([1 if l == 2 else -1 if l == 4 else 0 for l in y])
    normal = x[np.where(y == 1)]
    anomalies = x[np.where(y == -1)]
    x = np.concatenate((normal, anomalies), axis=0)
    y = np.concatenate(([1] * len(normal), [-1] * len(anomalies)), axis=0)
    x, y = shuffle(x, y, random_state=random_state)

    # Split each class in half: first half goes to train, second to test.
    normal = x[np.where(y == 1)]
    test_normal = normal[int(len(normal) / 2):]
    normal = normal[:int(len(normal) / 2)]
    anomalies = x[np.where(y == -1)]
    test_anomalies = anomalies[int(len(anomalies) / 2):]
    anomalies = anomalies[:int(len(anomalies) / 2)]

    x_train = np.concatenate((normal, anomalies), axis=0)
    y_train = np.concatenate(([1] * len(normal), [-1] * len(anomalies)), axis=0)
    x_train, y_train = shuffle(x_train, y_train, random_state=1)
    x_test = np.concatenate((test_normal, test_anomalies), axis=0)
    y_test = np.concatenate(([1] * len(test_normal), [-1] * len(test_anomalies)), axis=0)
    x_test, y_test = shuffle(x_test, y_test, random_state=1)
    return x_train, y_train, x_test, y_test
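# A quick sanity check for the forestcover split above (assumes the function
# and numpy are in scope); labels are +1 for normal and -1 for anomalies.
x_train, y_train, x_test, y_test = forestcover(random_state=1)
print(x_train.shape, x_test.shape)
print('train anomaly ratio:', np.mean(y_train == -1))
print('test anomaly ratio:', np.mean(y_test == -1))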
def main():
    covtype = fetch_covtype()

    # Get the data and target
    data = covtype.data
    # Drop part of the data to speed up processing, which did not finish on my PC
    data = data[:len(data) - 575000]
    target = covtype.target
    target = target[:len(target) - 575000]
    classes = set(target)

    # Build train and test sets
    x_train, x_test, y_train, y_test = train_test_split(data, target,
                                                        test_size=0.2)
    # Build a list of (sample, value) tuples
    test_values = [(x_test[index], value)
                   for index, value in enumerate(y_test)]

    start_time = time.time()
    # Build one-vs-one classifiers
    o_vs_o_classifiers = generateOvOClassifier(classes, x_train, y_train)
    print("OvO classifier done")
    # Predict with the one-vs-one classifiers
    predictOVO(test_values, o_vs_o_classifiers)
    print("OvO execution time: %s seconds" % (time.time() - start_time))
    print()

    start_time = time.time()
    # Build one-vs-rest classifiers
    ovrclassifier = generateOvRClassifier(classes, x_train, y_train)
    print("OvR classifier done")
    # Predict with the one-vs-rest classifiers
    predictOVR(test_values, ovrclassifier)
    print("OvR execution time: %s seconds" % (time.time() - start_time))
    print()

    start_time = time.time()
    # Build a random forest classifier
    forestclassifier = RandomForestClassifier(n_estimators=10).fit(
        x_train, y_train)
    print("Forest classifier done")
    # Predict with the forest classifier
    predictForest(test_values, forestclassifier)
    print("Forest execution time: %s seconds" % (time.time() - start_time))
    print()

    start_time = time.time()
    # Build an SVM classifier
    SVMclassifier = svm.SVC(gamma='scale', decision_function_shape='ovo',
                            probability=True).fit(x_train, y_train)
    print("SVM classifier done")
    # Predict with the SVM classifier
    predictSVM(test_values, SVMclassifier)
    print("SVM execution time: %s seconds" % (time.time() - start_time))
def create_covtype():
    covtype_data = datasets.fetch_covtype()
    print(covtype_data.__dict__)
    data = data_class.Data()
    data.x = covtype_data.data
    data.y = covtype_data.target
    # Pass the populated object along with the output path.
    helper_functions.save_object(data, "data_sets/covtype/raw_data.pkl")
def fetch_data():
    from sklearn.datasets import fetch_covtype
    import fcntl

    # Serialize the download across processes with an advisory file lock.
    with open("sklearn_download.lock", mode="ab") as f:
        fcntl.lockf(f, fcntl.LOCK_EX)
        data = fetch_covtype()
        fcntl.lockf(f, fcntl.LOCK_UN)
    return data
def test_xgboost_covtype(n_gpus):
    import xgboost as xgb
    import numpy as np
    from sklearn.datasets import fetch_covtype
    from sklearn.model_selection import train_test_split
    import time

    # Fetch dataset using sklearn
    cov = fetch_covtype()
    X = cov.data
    y = cov.target

    # Create 0.75/0.25 train/test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, train_size=0.75, random_state=42)

    # Specify sufficient boosting iterations to reach a minimum
    num_round = 10

    # Leave most parameters as default
    param = {
        'objective': 'multi:softmax',  # Specify multiclass classification
        'num_class': 8,                # Number of possible output classes
        'tree_method': 'gpu_hist',     # Use GPU accelerated algorithm
    }
    if n_gpus is not None:
        param['n_gpus'] = n_gpus

    # Convert input data from numpy to XGBoost format
    dtrain = xgb.DMatrix(X_train, label=y_train, nthread=-1)
    dtest = xgb.DMatrix(X_test, label=y_test, nthread=-1)

    gpu_res = {}  # Store accuracy result
    tmp = time.time()
    # Train model
    xgb.train(param, dtrain, num_round, evals=[(dtest, 'test')],
              evals_result=gpu_res)
    print("GPU Training Time: %s seconds" % (str(time.time() - tmp)))

    # TODO: https://github.com/dmlc/xgboost/issues/4518
    dtrain = xgb.DMatrix(X_train, label=y_train, nthread=-1)
    dtest = xgb.DMatrix(X_test, label=y_test, nthread=-1)

    # Repeat for CPU algorithm
    tmp = time.time()
    param['tree_method'] = 'hist'
    cpu_res = {}
    xgb.train(param, dtrain, num_round, evals=[(dtest, 'test')],
              evals_result=cpu_res)
    print("CPU Training Time: %s seconds" % (str(time.time() - tmp)))
def get_data(random_state, test_size=0.2):
    data = fetch_covtype(download_if_missing=True, shuffle=True,
                         random_state=random_state)
    X = data['data']
    y = data['target']
    n_train = int((1 - test_size) * X.shape[0])
    return X[:n_train], y[:n_train], X[n_train:], y[n_train:]
def get_covtype(num_rows=None):
    data = datasets.fetch_covtype()
    X = data.data
    y = data.target
    if num_rows is not None:
        X = X[0:num_rows]
        y = y[0:num_rows]
    return X, y
def closure(mu):
    ds = fetch_covtype()
    X, _, y, _ = train_test_split(ds["data"], ds["target"], random_state=42,
                                  test_size=0.9, stratify=ds["target"])
    y = y - 1
    return preprocess_and_noise({"data": X, "target": y}, mu=mu)
def getdata_covtype():
    from sklearn.datasets import fetch_covtype

    data = fetch_covtype()
    X = data['data']
    # Shift labels from 1..7 to 0..6, then one-hot encode them.
    y_ = data['target'] - 1
    y = np.zeros((len(y_), 7))
    y[np.arange(len(y_)), y_] = 1
    return X, y
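# An illustrative check of the one-hot encoding produced by getdata_covtype
# above: each row of y should contain exactly one 1 across the 7 classes.
X, y = getdata_covtype()
assert y.shape == (X.shape[0], 7)
assert (y.sum(axis=1) == 1).all()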
def prepare_covtype(dataset_folder, nrows):  # pylint: disable=unused-argument
    X, y = fetch_covtype(return_X_y=True)  # pylint: disable=unexpected-keyword-arg
    if nrows is not None:
        X = X[0:nrows]
        y = y[0:nrows]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=77, test_size=0.2)
    return Data(X_train, X_test, y_train, y_test,
                LearningTask.MULTICLASS_CLASSIFICATION)
def get(self, **kwargs) -> Tuple[np.ndarray, np.ndarray]:  # requires typing.Tuple
    try:
        print('Fetching CovType dataset...')
        data = fetch_covtype(download_if_missing=True, shuffle=True)
        X, y = data['data'], data['target']
        return X, y
    except Exception as exc:
        raise RuntimeError('Failed to fetch the CovType dataset') from exc
def raw_frame():
    dataset = datasets.fetch_covtype()
    feature_names = [f"feature_{ix}" for ix in range(dataset.data.shape[1])]
    df = pd.DataFrame(data=dataset.data, columns=feature_names)
    # This is a multiclass dataset, but we want to treat it as a binary one.
    # We'll just try to detect class 2, since that one is the most common.
    df["target"] = dataset.target == 2
    return df
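# Example use of raw_frame above (assumes its imports are in scope); target
# is a boolean "is class 2" column, so its mean gives the positive-class
# ratio, which is roughly 0.49 for covtype.
df = raw_frame()
print(df.shape)
print('positive ratio:', df['target'].mean())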
def load_covtype_dataset(subset=0.1, test_size=0.2, random_state=None):
    '''Load the covtype dataset and split it into training/test sets.'''
    print('\nDataset used: \t\tForest covertypes from UCI '
          '({:.1%} random subset)'.format(subset))
    X, y = datasets.fetch_covtype(return_X_y=True)
    y = make_binary_classification_target(y, 7, verbose=True)
    X, y = imbalance_random_subset(X, y, size=subset,
                                   random_state=random_state)
    X_train, X_test, y_train, y_test = imbalance_train_test_split(
        X, y, test_size=test_size, random_state=random_state)
    return X_train, X_test, y_train, y_test
def dataSetGenCovtype(log):
    from sklearn.datasets import fetch_covtype

    covtype = fetch_covtype()
    # covtype.data.shape == (581012, 54)
    log.info(f'Dataset len {covtype.data.shape}')
    allData = list()
    for data, target in zip(covtype.data, covtype.target):
        data = [float(i) for i in data]
        allData.append((data, str(target)))
    return allData
def test_select_data(self):
    """
    Tests the select_data function in algo_runner.py
    :return: None
    """
    # Case: fewer than 10000 rows
    iris = load_iris()
    data = pd.DataFrame(iris['data'])
    target = pd.DataFrame(iris['target'])
    pct_examples_1, pct_examples_2, pct_examples_3 = \
        algo_runner.select_data(data, target)
    self.assertTrue(np.isclose(pct_examples_1, 0.05, rtol=0.01, atol=0.01))
    self.assertTrue(np.isclose(pct_examples_2, 0.10, rtol=0.01, atol=0.01))
    self.assertTrue(np.isclose(pct_examples_3, 0.15, rtol=0.01, atol=0.01))

    # Case: more than 10000 but fewer than 100000 rows
    (x_train, y_train), (x_test, y_test) = mnist.load_data()
    num_pixels = x_train.shape[1] * x_train.shape[2]
    x_train = x_train.reshape(x_train.shape[0], num_pixels).astype('float32')
    x_train = x_train / 255
    y_train = np_utils.to_categorical(y_train)
    x_train = pd.DataFrame(x_train)
    y_train = pd.DataFrame(y_train)
    pct_examples_1, pct_examples_2, pct_examples_3 = \
        algo_runner.select_data(x_train, y_train)
    self.assertTrue(np.isclose(pct_examples_1, 0.02, rtol=0.001, atol=0.001))
    self.assertTrue(np.isclose(pct_examples_2, 0.04, rtol=0.001, atol=0.001))
    self.assertTrue(np.isclose(pct_examples_3, 0.06, rtol=0.001, atol=0.001))

    # Case: more than 100000 rows
    covtype = fetch_covtype()
    data = pd.DataFrame(covtype['data'])
    target = pd.DataFrame(covtype['target'])
    pct_examples_1, pct_examples_2, pct_examples_3 = \
        algo_runner.select_data(data, target)
    self.assertTrue(np.isclose(pct_examples_1, 0.01, rtol=0.001, atol=0.001))
    self.assertTrue(np.isclose(pct_examples_2, 0.02, rtol=0.001, atol=0.001))
    self.assertTrue(np.isclose(pct_examples_3, 0.03, rtol=0.001, atol=0.001))
    return None
def load_cover_type(random_state=None, dtype=np.float32, order='C'):
    """Load cover type data

    Parameters
    ----------
    random_state : int, np.random.RandomState or None, optional (default=None)
        The random state used to shuffle the data if needed.

    dtype : np.dtype, optional (default=np.float32)
        The type for the data to be returned.

    order : 'C', 'F' or None, optional (default='C')
        Whether an array will be forced to be fortran or c-style. When order
        is None (default), then if copy=False, nothing is ensured about the
        memory layout of the output array; otherwise (copy=True) the memory
        layout of the returned array is kept as close as possible to the
        original array.

    Returns
    -------
    X_train : ndarray, shape (n_train_samples, n_features)
    y_train : ndarray, shape (n_train_samples,)
    X_test : ndarray, shape (n_test_samples, n_features)
    y_test : ndarray, shape (n_test_samples,)
    """
    ######################################################################
    # Load dataset
    print("Loading dataset...")
    data = fetch_covtype(download_if_missing=True, shuffle=True,
                         random_state=random_state)
    X = check_array(data['data'], dtype=dtype, order=order)
    y = (data['target'] != 1).astype(int)

    # Create train-test split (as [Joachims, 2006])
    print("Creating train-test split...")
    n_train = 522911
    X_train = X[:n_train]
    y_train = y[:n_train]
    X_test = X[n_train:]
    y_test = y[n_train:]

    # Standardize first 10 features (the numerical ones)
    mean = X_train.mean(axis=0)
    std = X_train.std(axis=0)
    mean[10:] = 0.0
    std[10:] = 1.0
    X_train = (X_train - mean) / std
    X_test = (X_test - mean) / std
    return X_train, y_train, X_test, y_test
def run_in_cluster():
    dataset = datasets.fetch_covtype(data_home=tempfile.mkdtemp())
    X, y = dataset.data, dataset.target - 1
    training_size = 400000
    max_depth = 10
    clf = DecisionTreeClassifier(max_depth=max_depth)
    start = time.time()
    clf.fit(X[:training_size], y[:training_size])
    end = time.time()
    y_pred = clf.predict(X[training_size:])
    accuracy = metrics.accuracy_score(y[training_size:], y_pred)
    return end - start, accuracy
def get_cd_data(num_samples=500):
    # Load the forest cover type data from the scikit-learn package
    cov = fetch_covtype()
    all_data = np.array(cov.data)
    all_targets = np.array(cov.target)

    # Set class pairings as described in the multiview clustering paper
    view1_classes = [1, 2, 3]
    view2_classes = [4, 5, 6]

    # Create lists to hold data and labels for each of the classes across
    # 2 different views
    labels = [num for num in range(len(view1_classes))
              for _ in range(num_samples)]
    labels = np.array(labels)
    view1_data = list()
    view2_data = list()

    # Randomly sample num_samples items from each selected class in view 1
    for class_num in view1_classes:
        class_data = all_data[(all_targets == class_num)]
        indices = np.random.choice(class_data.shape[0], num_samples)
        view1_data.append(class_data[indices])
    view1_data = np.concatenate(view1_data)

    # Construct view 2 by applying a nonlinear transformation
    # to data from view 1 comprised of a linear transformation
    # and a logistic nonlinearity
    t_mat = np.random.random((view1_data.shape[1], 50))
    noise = 0.005 - 0.01 * np.random.random((view1_data.shape[1], 50))
    t_mat *= noise
    transformed = view1_data @ t_mat
    view2_data = scp.special.expit(transformed)

    # Shuffle and normalize vectors
    shuffled_inds = np.random.permutation(num_samples * len(view1_classes))
    view1_data = np.vstack(view1_data)
    view2_data = np.vstack(view2_data)
    view1_data = view1_data[shuffled_inds]
    view2_data = view2_data[shuffled_inds]
    magnitudes1 = np.linalg.norm(view1_data, axis=0)
    magnitudes2 = np.linalg.norm(view2_data, axis=0)
    magnitudes1[magnitudes1 == 0] = 1
    magnitudes2[magnitudes2 == 0] = 1
    magnitudes1 = magnitudes1.reshape((1, -1))
    magnitudes2 = magnitudes2.reshape((1, -1))
    view1_data /= magnitudes1
    view2_data /= magnitudes2
    labels = labels[shuffled_inds]
    return [view1_data, view2_data], labels
def load_classification_data():
    dataset = fetch_covtype(data_home="data")
    data = np.hstack([dataset.data, dataset.target.reshape(-1, 1)])[:10000, :]
    col_names = [f"feature_{i}" for i in range(data.shape[-1])]
    col_names[-1] = "target"
    data = pd.DataFrame(data, columns=col_names)
    data["feature_0_cat"] = pd.qcut(data["feature_0"], q=4)
    data["feature_0_cat"] = ("feature_0_" +
                             data.feature_0_cat.cat.codes.astype(str))
    test_idx = data.sample(int(0.2 * len(data)), random_state=42).index
    test = data[data.index.isin(test_idx)]
    train = data[~data.index.isin(test_idx)]
    return (train, test, ["target"])
def get_covertype(train_test_ratio):
    covertype = datasets.fetch_covtype()
    x = covertype.data
    y = convert_to_1_hot(covertype.target - 1, 7)
    cutoff = int(x.shape[0] * train_test_ratio)
    x = x.astype(np.float32)
    y = y.astype(np.float32)
    x_train = x[0:cutoff, :]
    x_test = x[cutoff:, :]
    y_train = y[0:cutoff]
    y_test = y[cutoff:]
    return x_train, y_train, x_test, y_test
def fun():
    import xgboost as xgb
    import numpy as np
    from sklearn.datasets import fetch_covtype
    from sklearn.model_selection import train_test_split
    import time

    # Fetch dataset using sklearn
    cov = fetch_covtype()
    X = cov.data
    y = cov.target

    # Create 0.75/0.25 train/test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, train_size=0.75, random_state=42)

    # Specify sufficient boosting iterations to reach a minimum
    num_round = 10

    # Leave most parameters as default
    param = {'objective': 'multi:softmax',  # Specify multiclass classification
             'num_class': 8,                # Number of possible output classes
             'tree_method': 'gpu_hist'      # Use GPU accelerated algorithm
             }

    # Convert input data from numpy to XGBoost format
    dtrain = xgb.DMatrix(X_train, label=y_train, nthread=-1)
    dtest = xgb.DMatrix(X_test, label=y_test, nthread=-1)

    gpu_res = {}  # Store accuracy result
    tmp = time.time()
    # Train model
    xgb.train(param, dtrain, num_round, evals=[(dtest, 'test')],
              evals_result=gpu_res)
    print("GPU Training Time: %s seconds" % (str(time.time() - tmp)))

    # Repeat for CPU algorithm
    tmp = time.time()
    param['tree_method'] = 'hist'
    cpu_res = {}
    xgb.train(param, dtrain, num_round, evals=[(dtest, 'test')],
              evals_result=cpu_res)
    print("CPU Training Time: %s seconds" % (str(time.time() - tmp)))
# Note: GradientBoostingClassifierCV is not part of released scikit-learn;
# this snippet appears to target an experimental branch that provides it.
from sklearn.ensemble import GradientBoostingClassifierCV
from sklearn.datasets import fetch_covtype

gbccv = GradientBoostingClassifierCV(n_jobs=8, random_state=42)
covtype = fetch_covtype()
X, y = covtype.data[::2], covtype.target[::2]
gbccv.fit(X, y)
def load_data(ds_name):
    data_dir = path.dirname(__file__) + "/../data/"
    global _img_data
    if ds_name == "digits":
        ds = load_digits()
        x_train = ds.data
        y_train = ds.target
    elif ds_name == "iris":
        ds = load_iris()
        x_train = ds.data
        y_train = ds.target
    elif ds_name == "diabetes":
        ds = load_diabetes()
        x_train = ds.data
        y_train = ds.target > 140
    elif ds_name == "covtype":
        ds = fetch_covtype(download_if_missing=True)
        x_train = ds.data
        y_train = ds.target
    elif ds_name == "cf10":
        with open(data_dir + "data_batch_1", "rb") as f:
            ds = pickle.load(f)
        x_train = ds['data']
        y_train = np.array(ds['labels'])
    elif ds_name == "cf100":
        with open(data_dir + "train", "rb") as f:
            ds = pickle.load(f)
        x_train = ds['data']
        y_train = np.array(ds['fine_labels'])
    elif ds_name == "cd10_test":
        with open(data_dir + "test_batch", "rb") as f:
            ds = pickle.load(f)
        x_train = ds['data']
        y_train = np.array(ds['labels'])
    elif ds_name == "cf100_test":
        with open(data_dir + "test", "rb") as f:
            ds = pickle.load(f)
        x_train = ds['data']
        y_train = np.array(ds['fine_labels'])
    elif ds_name == "inet":
        if _img_data is None:
            with open("/ssd/imagenet-subset.pickle", "rb") as f:
                _img_data = pickle.load(f)
        return _img_data['x'][0:10000], _img_data['Y'][0:10000]
    elif ds_name == "inet_test":
        if _img_data is None:
            with open("/ssd/imagenet-subset.pickle", "rb") as f:
                _img_data = pickle.load(f)
        return _img_data['x'][10000:], _img_data['Y'][10000:]
    elif ds_name == "kdd":
        data = np.load(data_dir + "data.npy")
        x_train = data[:, :-1]
        y_train = data[:, -1]
    elif ds_name == "poker":
        # fetch_mldata was removed from modern scikit-learn releases.
        data = sklearn.datasets.fetch_mldata("poker")
        x_train = data.data
        y_train = data.target
    elif ds_name == "pamap":
        data = np.load(data_dir + "pamap.npz")
        x_train = data['x']
        y_train = data['y']
    else:
        assert False, "Unrecognized data set name %s" % ds_name
    return x_train, y_train
X = dataset.data
y = dataset.target

if dataset_name == 'shuttle':
    dataset = fetch_openml('shuttle')
    X = dataset.data
    y = dataset.target
    # we remove data with label 4
    # normal data are then those of class 1
    s = (y != 4)
    X = X[s, :]
    y = y[s]
    y = (y != 1).astype(int)

if dataset_name == 'forestcover':
    dataset = fetch_covtype()
    X = dataset.data
    y = dataset.target
    # normal data are those with attribute 2
    # abnormal those with attribute 4
    s = (y == 2) + (y == 4)
    X = X[s, :]
    y = y[s]
    y = (y != 2).astype(int)

print('vectorizing data')

if dataset_name == 'SF':
    lb = LabelBinarizer()
    x1 = lb.fit_transform(X[:, 1].astype(str))
    X = np.c_[X[:, :1], x1, X[:, 2:]]
import xgboost as xgb
import numpy as np
from sklearn.datasets import fetch_covtype
from sklearn.model_selection import train_test_split
import time

# Fetch dataset using sklearn
cov = fetch_covtype()
X = cov.data
y = cov.target

# Create 0.75/0.25 train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,
                                                    train_size=0.75,
                                                    random_state=42)

# Specify sufficient boosting iterations to reach a minimum
num_round = 3000

# Leave most parameters as default
param = {'objective': 'multi:softmax',  # Specify multiclass classification
         'num_class': 8,                # Number of possible output classes
         'tree_method': 'gpu_hist'      # Use GPU accelerated algorithm
         }

# Convert input data from numpy to XGBoost format
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

gpu_res = {}  # Store accuracy result
tmp = time.time()
# Train model
xgb.train(param, dtrain, num_round, evals=[(dtest, 'test')],
          evals_result=gpu_res)
print("GPU Training Time: %s seconds" % (str(time.time() - tmp)))
def fetch(*args, **kwargs):
    return fetch_covtype(*args, download_if_missing=False, **kwargs)
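# Usage sketch for the fetch wrapper above: since download_if_missing is
# pinned to False, fetch_covtype raises an IOError/OSError unless the
# dataset is already cached locally.
data = fetch(shuffle=True, random_state=0)
print(data.data.shape)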
y = dataset.target

if dat == 'shuttle':
    dataset = fetch_mldata('shuttle')
    X = dataset.data
    y = dataset.target
    sh(X, y)
    # we remove data with label 4
    # normal data are then those of class 1
    s = (y != 4)
    X = X[s, :]
    y = y[s]
    y = (y != 1).astype(int)

if dat == 'forestcover':
    dataset = fetch_covtype(shuffle=True)
    X = dataset.data
    y = dataset.target
    # normal data are those with attribute 2
    # abnormal those with attribute 4
    s = (y == 2) + (y == 4)
    X = X[s, :]
    y = y[s]
    y = (y != 2).astype(int)

if dat == 'SF':
    lb = LabelBinarizer()
    lb.fit(X[:, 1])
    x1 = lb.transform(X[:, 1])
    X = np.c_[X[:, :1], x1, X[:, 2:]]
    y = (y != 'normal.').astype(int)
import time

import numpy as np
from sklearn.datasets import fetch_covtype
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

from ivalice.regression import RFRegressor

data = fetch_covtype(download_if_missing=True, shuffle=True, random_state=0)
X, y = data.data, data.target

n_samples = 10000
mask = y <= 2
Xb = X[mask][:n_samples]
yb = y[mask][:n_samples]

Xb_tr, Xb_te, yb_tr, yb_te = train_test_split(Xb, yb, train_size=0.75,
                                              test_size=0.2, random_state=0)

rf = RandomForestRegressor(n_estimators=100, max_depth=3, max_features=0.6)
start = time.time()
rf.fit(Xb_tr, yb_tr)
print("RandomForestRegressor")
print(time.time() - start, "seconds")
y_pred = rf.predict(Xb_te)
print(mean_squared_error(yb_te, y_pred))
if dat == 'shuttle':
    dataset = fetch_mldata('shuttle')
    X = dataset.data
    y = dataset.target
    X, y = sh(X, y, random_state=random_state)
    # we remove data with label 4
    # normal data are then those of class 1
    s = (y != 4)
    X = X[s, :]
    y = y[s]
    y = (y != 1).astype(int)
    print('----- ')

if dat == 'forestcover':
    dataset = fetch_covtype(shuffle=True, random_state=random_state)
    X = dataset.data
    y = dataset.target
    # normal data are those with attribute 2
    # abnormal those with attribute 4
    s = (y == 2) + (y == 4)
    X = X[s, :]
    y = y[s]
    y = (y != 2).astype(int)
    print_outlier_ratio(y)

print('--- Vectorizing data...')

if dat == 'SF':
    lb = LabelBinarizer()
    x1 = lb.fit_transform(X[:, 1].astype(str))