def train_test_sampling(self, validation_split: float = 0.1) -> dict:
    """
    Data sampling into train & test data

    :param validation_split: float
        Amount of training data to validate quality during training

    :return dict:
        Train and test split for both target and predictors
    """
    #if self.stratification:
    #    _stratification: np.array = self.df[self.target].values
    #else:
    #    _stratification = None
    _x_train, _x_test, _y_train, _y_test = train_test_split(self.df[self.features],
                                                            self.df[self.target],
                                                            test_size=self.test_size,
                                                            train_size=self.train_size,
                                                            random_state=self.seed,
                                                            shuffle=self.random_sample,
                                                            #stratify=_stratification
                                                            )
    if validation_split > 0:
        _x_train_, _x_val, _y_train_, _y_val = train_test_split(_x_train,
                                                                _y_train,
                                                                test_size=validation_split,
                                                                train_size=1 - validation_split,
                                                                random_state=self.seed,
                                                                shuffle=self.random_sample)
    else:
        _x_train_ = _x_train
        del _x_train
        _x_val = None
        _y_train_ = _y_train
        del _y_train
        _y_val = None
        return dict(x_train=_x_train_.compute(),
                    x_test=_x_test.compute(),
                    y_train=_y_train_.compute(),
                    y_test=_y_test.compute(),
                    x_val=_x_val,
                    y_val=_y_val)
    return dict(x_train=_x_train_.compute(),
                x_test=_x_test.compute(),
                y_train=_y_train_.compute(),
                y_test=_y_test.compute(),
                x_val=_x_val.compute(),
                y_val=_y_val.compute())
def process_data(df, pca_level):
    data_x = feature_engine(df)
    data_y = df['meter_reading']

    PP_Pipeline = Pipeline([
        ('Imputer', SimpleImputer(missing_values=np.nan, strategy='mean')),
        ('Scaler', preprocessing.MinMaxScaler()),
        ('PCA', PCA(n_components=pca_level)),
    ])

    x_train, x_test, y_train, y_test = train_test_split(data_x, data_y, test_size=0.3, random_state=4)

    x_train_pp = PP_Pipeline.fit_transform(x_train)
    x_test_pp = PP_Pipeline.transform(x_test)

    # PipelineFile = open("PipelineFile", "wb")
    # pickle.dump(PP_Pipeline, PipelineFile)
    # PipelineFile.close()

    print('\n')
    print('Completed Preprocessing and Dimensionality Reduction')
    print('\n')

    return x_train_pp, x_test_pp, y_train, y_test
def main():
    start = time.time()

    initialize(interface='ib0')
    client = Client()

    iris = datasets.load_iris()
    X = iris.data
    y = iris.target

    X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.05)
    D_test = xgb.DMatrix(X_test, label=Y_test)

    params = {
        'eta': 0.3,
        'max_depth': 3,
        'objective': 'multi:softprob',
        'num_class': 3
    }

    bst = dxgb.train(client, params, da.asarray(X_train), da.asarray(Y_train),
                     num_boost_round=10)

    preds = bst.predict(D_test)
    best_preds = np.asarray([np.argmax(line) for line in preds])

    print("Precision = {}".format(precision_score(Y_test, best_preds, average='macro')))
    print("Recall = {}".format(recall_score(Y_test, best_preds, average='macro')))
    print("Accuracy = {}".format(accuracy_score(Y_test, best_preds)))

    elapsed = (time.time() - start)
    print(f"Elapsed time: {elapsed}")
def setup_class(self):
    setup_dask(self)

    print("Loading datasets...")
    df_train = dd.from_pandas(dsutils.load_adult().head(1000), npartitions=2)
    self.y = df_train.pop(14)
    self.X = df_train

    conf = deeptable.ModelConfig(metrics=['AUC'],
                                 apply_gbm_features=False,
                                 auto_categorize=False,
                                 auto_discrete=False)
    self.dt = deeptable.DeepTable(config=conf)

    self.X_train, self.X_eval, self.y_train, self.y_test = \
        train_test_split(self.X, self.y, test_size=0.2, random_state=42)

    self.oof_proba, self.eval_proba, self.test_proba = \
        self.dt.fit_cross_validation(self.X_train, self.y_train, self.X_eval,
                                     num_folds=3, epochs=1, n_jobs=1)
def objective(trial):
    iris = load_iris()
    X, y = iris.data, iris.target
    X, y = da.from_array(X, chunks=len(X) // 5), da.from_array(y, chunks=len(y) // 5)

    solver = trial.suggest_categorical('solver', ['admm', 'gradient_descent', 'proximal_grad'])
    C = trial.suggest_uniform('C', 0.0, 1.0)

    if solver == 'admm' or solver == 'proximal_grad':
        penalty = trial.suggest_categorical('penalty', ['l1', 'l2', 'elastic_net'])
    else:
        # 'penalty' parameter isn't relevant for this solver,
        # so we always specify 'l2' as the dummy value.
        penalty = 'l2'

    classifier = LogisticRegression(max_iter=200, solver=solver, C=C, penalty=penalty)

    X_train, X_test, y_train, y_test = train_test_split(X, y)
    classifier.fit(X_train, y_train)
    score = classifier.score(X_test, y_test)
    return score
def train_test_split(*data, shuffle=True, random_state=None, stratify=None, **kwargs):
    if DaskToolBox.exist_dask_dataframe(*data):
        if len(data) > 1:
            data = [DaskToolBox.make_divisions_known(DaskToolBox.to_dask_frame_or_series(x))
                    for x in data]
            head = data[0]
            for i in range(1, len(data)):
                if data[i].divisions != head.divisions:
                    logger.info(f'repartition {i} from {data[i].divisions} to {head.divisions}')
                    data[i] = data[i].repartition(divisions=head.divisions)
        result = dm_sel.train_test_split(*data, shuffle=shuffle, random_state=random_state, **kwargs)
        result = [x.clear_divisions() for x in result]
    else:
        result = sk_sel.train_test_split(*data, shuffle=shuffle, random_state=random_state,
                                         stratify=stratify, **kwargs)

    return result
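# Minimal usage sketch (an illustrative assumption, not part of the wrapper above):
# when X and y are dask DataFrames/Series with unknown or mismatched divisions, the
# wrapper first aligns their partitions and only then delegates to dask_ml's splitter,
# so corresponding X/y rows stay paired across partitions.
import pandas as pd
import dask.dataframe as dd

pdf = pd.DataFrame({'f1': range(100), 'y': [i % 2 for i in range(100)]})
ddf = dd.from_pandas(pdf, npartitions=4)
X, y = ddf[['f1']], ddf['y']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)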
def split_dataset(self, dataset, random_state):
    """
    Split dataset into train and test data subsets,
    currently using CV-fold index for randomness.
    Plan to refactor with dask_ml KFold
    """
    hpo_log.info('> train-test split')
    label_column = self.hpo_config.label_column

    train, test = train_test_split(dataset, random_state=random_state)

    # build X [ features ], y [ labels ] for the train and test subsets
    y_train = train[label_column]
    X_train = train.drop(label_column, axis=1)

    y_test = test[label_column]
    X_test = test.drop(label_column, axis=1)

    # force execution
    X_train, y_train, X_test, y_test = persist_across_workers(
        self.client, [X_train, y_train, X_test, y_test],
        workers=self.client.has_what().keys())

    # wait!
    wait([X_train, y_train, X_test, y_test])

    return (X_train.astype(self.hpo_config.dataset_dtype),
            X_test.astype(self.hpo_config.dataset_dtype),
            y_train.astype(self.hpo_config.dataset_dtype),
            y_test.astype(self.hpo_config.dataset_dtype))
def objective(trial):
    iris = load_iris()
    X, y = iris.data, iris.target
    X, y = da.from_array(X, chunks=len(X) // 5), da.from_array(y, chunks=len(y) // 5)

    solver = trial.suggest_categorical("solver", ["admm", "gradient_descent", "proximal_grad"])
    C = trial.suggest_float("C", 0.0, 1.0)

    if solver == "admm" or solver == "proximal_grad":
        penalty = trial.suggest_categorical("penalty", ["l1", "l2", "elastic_net"])
    else:
        # 'penalty' parameter isn't relevant for this solver,
        # so we always specify 'l2' as the dummy value.
        penalty = "l2"

    classifier = LogisticRegression(max_iter=200, solver=solver, C=C, penalty=penalty)

    X_train, X_valid, y_train, y_valid = train_test_split(X, y)
    classifier.fit(X_train, y_train)
    score = classifier.score(X_valid, y_valid)
    return score
def split_data(self, dataset, y_label, train_size=.8, random_state=0, shuffle=True):
    with PerfTimer(self, 'split_timer'):
        # unable to shuffle -- no dask_cudf sampler implemented
        train, test = train_test_split(dataset, random_state=random_state)

        X_train, y_train = train.drop(y_label, axis=1).astype('float32'), train[y_label].astype('int32')
        X_test, y_test = test.drop(y_label, axis=1).astype('float32'), test[y_label].astype('int32')

        if 'multi-GPU' in self.compute_type:
            with PerfTimer(self, 'persist_timer'):
                workers = self.client.has_what().keys()
                X_train, X_test, y_train, y_test = persist_across_workers(
                    self.client, [X_train, X_test, y_train, y_test], workers=workers)
                wait([X_train, X_test, y_train, y_test])

    return X_train, X_test, y_train, y_test
def split_dataset(self, dataset, random_state):
    """
    Split dataset into train and test data subsets,
    currently using CV-fold index for randomness.
    Plan to refactor with dask_ml KFold
    """
    hpo_log.info("> train-test split")
    label_column = self.hpo_config.label_column

    train, test = train_test_split(dataset, random_state=random_state)

    # build X [ features ], y [ labels ] for the train and test subsets
    y_train = train[label_column]
    X_train = train.drop(label_column, axis=1)

    y_test = test[label_column]
    X_test = test.drop(label_column, axis=1)

    # persist
    X_train = X_train.persist()
    y_train = y_train.persist()
    wait([X_train, y_train])

    return (
        X_train.astype(self.hpo_config.dataset_dtype),
        X_test.astype(self.hpo_config.dataset_dtype),
        y_train.astype(self.hpo_config.dataset_dtype),
        y_test.astype(self.hpo_config.dataset_dtype),
    )
def main():
    # client = Client("tcp://127.0.0.1:64958")
    client = Client(processes=False, threads_per_worker=2, n_workers=1, memory_limit='4GB')
    print(client)

    rs = RandomSearcher(get_space_num_cat_pipeline_complex, optimize_direction=OptimizeDirection.Maximize)
    hk = HyperGBM(rs, task='classification', reward_metric='accuracy',
                  cache_dir=f'{test_output_dir}/hypergbm_cache',
                  callbacks=[SummaryCallback(),
                             FileLoggingCallback(rs, output_dir=f'{test_output_dir}/hyn_logs')])

    df = dsutils.load_bank_by_dask()
    df = df.drop(['id'], axis=1)  # drop() is not in-place on dask DataFrames, so keep the result
    df['y'] = dm_pre.LabelEncoder().fit_transform(df['y'])
    # df = df.sample(frac=0.1)

    # object_columns = [i for i, v in df.dtypes.items() if v == 'object']
    # for c in object_columns:
    #     df[c] = df[c].astype('category')
    # df = df.categorize(object_columns)

    X_train, X_test = train_test_split(df, test_size=0.8, random_state=42)
    y_train = X_train.pop('y')
    y_test = X_test.pop('y')

    hk.search(X_train, y_train, X_test, y_test, max_trails=50)

    print('-' * 30)
    best_trial = hk.get_best_trail()
    print(f'best_train:{best_trial}')

    estimator = hk.final_train(best_trial.space_sample, X_train, y_train)
    score = estimator.predict(X_test)
    result = estimator.evaluate(X_test, y_test, metrics=['accuracy', 'auc', 'logloss'])
    print(f'final result:{result}')
def main(client):
    m = 100000
    n = 100
    X, y = make_regression(n_samples=m, n_features=n, chunks=200, random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    dtrain = DaskDMatrix(client, X_train, y_train)
    dtest = DaskDMatrix(client, X_test, y_test)

    output = xgb.dask.train(
        client,
        {
            "verbosity": 1,
            "tree_method": "hist",
            "objective": "reg:squarederror",
            "eval_metric": "rmse",
            "max_depth": 6,
            "learning_rate": 1.0,
        },
        dtrain,
        num_boost_round=1000,
        evals=[(dtrain, "train"), (dtest, "test")],
        callbacks=[CustomEarlyStopping(validation_set="test",
                                       target_metric="rmse",
                                       maximize=False,
                                       seed=0)],
    )
def process_data(X, y=None, test_size=0.2):
    if y is None:
        km = dask_ml.cluster.KMeans(n_clusters=10, init_max_iter=100)
        km.fit(X.flatten().reshape(-1, 1))
        y = km.labels_

    y_uniqs = np.unique(y[:, 0])

    len_ = X.shape[0]
    X = prepare_dataset(X)
    shape_ = list(X.shape[1:])

    if test_size != 0:
        samples = list()
        samples_labels = list()
        print('Preparing samples ...')
        for _ in range(2):
            for y_uniq in y_uniqs:
                sample = list()
                label = list()
                for xa, ya in zip(chunks(X, 10), chunks(y[:, 0], 10)):
                    try:
                        sample.append([xa[ya == y_uniq][random.randint(0, len(xa[ya == y_uniq]) - 1)]])
                        label.append(y_uniq)
                        if len(sample) >= len(y_uniqs):
                            break
                    except:
                        pass
                samples += sample
                samples_labels += label
        samples = da.vstack(samples)
        samples_labels = da.vstack(samples_labels)

    if test_size == 0:
        print('Training dataset shape x: ', X.shape)
        print('Training dataset shape y: ', y.shape)
        train_dataset = Dataset(X, y)
        return train_dataset
    else:
        X_train, X_test, y_train, y_test = train_test_split(X.flatten().reshape(len_, -1), y,
                                                            test_size=test_size,
                                                            random_state=config.seeds)
        X_train = X_train.reshape([X_train.shape[0]] + shape_)
        X_test = X_test.reshape([X_test.shape[0]] + shape_)

        print('Training dataset shape: ', X_train.shape)
        print('Validation dataset shape: ', X_test.shape)

        train_dataset = Dataset(X_train, y_train)
        test_dataset = Dataset(X_test, y_test)

        train_dataset.samples = samples
        train_dataset.samples_labels = samples_labels
        print('Sample dataset shape: ', train_dataset.samples.shape)
        return train_dataset, test_dataset
def _preprocess(self, df: "dask.DataFrame", inferencing: bool) -> Tuple["dask.DataFrame", "dask.DataFrame"]:
    df = df.loc[:, df.columns != "index"]

    # remove nulls and/or NaNs scalably with dask
    print("step1: drop nulls from rows")
    df = df.dropna(subset=["nullable_feature"])

    print("step2: creating new_col and updating feature_1")
    df["new_col"] = (df["feature_1"] - 2 * df["feature_2"] + df["feature_3"]) / 3.
    df["feature_1"] = 2. * df["feature_1"] + 0.1

    # TODO: this doesn't work with more than 1 parquet file
    # df['mean_by_fruit'] = df.groupby('fruit')['feature_1'].transform('mean')

    print("step3: one-hot encoding fruit")
    df = df.astype({"fruit": "category"})
    df = df.categorize()
    df = df.persist()  # persist() is not in-place; keep the persisted collection

    if inferencing:
        assert self.column_transformer is not None
        df_fruits = self.column_transformer.transform(df)
    else:
        assert self.column_transformer is None
        self.column_transformer = ColumnTransformer([("one-hot", OneHotEncoder(sparse=False), ["fruit"])])
        df_fruits = self.column_transformer.fit_transform(df)

    df_data = df.loc[:, (df.columns != "label") & (df.columns != "fruit")]
    df_data = dd.concat([df_data, df_fruits], axis=1)
    assert df_data.isnull().sum().sum().compute() == 0, "There are nulls or NaNs in the data!"

    if inferencing:
        print("step4: standardize inference dataset")
        assert self.scaler is not None
        df_data_inference = self.scaler.transform(df_data)
        return df_data_inference, None
    else:
        print("step4: standardize train dataset")
        df_labels = df.loc[:, df.columns == "label"]
        df_data_train, df_data_test, df_label_train, df_label_test = train_test_split(df_data, df_labels)
        df_data_train = df_data_train.persist()

        assert self.scaler is None
        self.scaler = StandardScaler()  # this just turns col values to z-scores
        df_data_train = self.scaler.fit_transform(df_data_train)
        df_data_test = self.scaler.transform(df_data_test)

        df_train = dd.concat([df_data_train, df_label_train], axis=1)
        df_test = dd.concat([df_data_test, df_label_test], axis=1)
        return df_train, df_test
def split(full_set):
    training_dd = dd.read_csv(full_set, assume_missing=True)
    y = training_dd.y
    X = training_dd.drop('y', axis=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, test_size=0.2)
    return (X_train, X_test, y_train, y_test)
def prepare_dataset(X, y):
    scaler = StandardScaler()
    X.compute_chunk_sizes()

    X_train, X_test, y_train, y_test = train_test_split(X.rechunk({1: X.shape[1]}), y, test_size=0.25)
    del X
    del y

    X_test, X_valid, y_test, y_valid = train_test_split(X_test, y_test, test_size=0.5)

    y_train = scaler.fit_transform(y_train.compute().reshape(-1, 1))
    y_test = scaler.transform(y_test.compute().reshape(-1, 1))
    y_valid = scaler.transform(y_valid.compute().reshape(-1, 1))

    return X_train, X_test, X_valid, y_train, y_test, y_valid
def split_and_write_data(df: pd.DataFrame, *, seed=42) -> None:
    df = df.drop(['ID', 'lat', 'lon', 'year'], axis=1)

    X_train, X_test, y_train, y_test = train_test_split(df.drop(['n2o', 'gwp'], axis=1),
                                                        df[['n2o', 'gwp']],
                                                        shuffle=True,
                                                        train_size=0.8,
                                                        random_state=seed)
    y_train_n2o, y_train_gwp = y_train['n2o'], y_train['gwp']
    #y_test_n2o, y_test_gwp = y_test['n2o'], y_test['gwp']

    gzip_args = {'method': 'gzip', 'compresslevel': 1}
    X_train.to_csv(DEST / "x_train.csv.gz", compression=gzip_args)
    X_test.to_csv(DEST / "x_test.csv.gz", compression=gzip_args)
    y_train.to_csv(DEST / "y_train.csv.gz", compression=gzip_args)
    y_test.to_csv(DEST / "y_test.csv.gz", compression=gzip_args)
    y_train_n2o.to_csv(DEST / "y_train_n2o.csv.gz", compression=gzip_args)
    y_train_gwp.to_csv(DEST / "y_train_gwp.csv.gz", compression=gzip_args)
def process_data(X, y=None, test_size=0.20, dummies=False):
    if y is None:
        y = da.ones(X.shape[0])

    y_uniqs = np.unique(y)
    len_ = X.shape[0]
    X = prepare_dataset(X)

    if dummies:
        y = dd.get_dummies(y)

    shape_ = list(X.shape[1:])

    samples = list()
    for _ in range(10):
        for y_uniq in y_uniqs:
            sample = list()
            for xa, ya in zip(chunks(X, 10), chunks(y, 10)):
                try:
                    sample.append([xa[ya == y_uniq][random.randint(0, len(xa[ya == y_uniq]) - 1)]])
                    if len(sample) >= 500:
                        break
                except:
                    pass
            samples += sample
    samples = da.vstack(samples)

    X_train, X_test, y_train, y_test = train_test_split(X.flatten().reshape(len_, -1), y,
                                                        test_size=test_size,
                                                        random_state=4891)
    X_train = X_train.reshape([X_train.shape[0]] + shape_)
    X_test = X_test.reshape([X_test.shape[0]] + shape_)

    print('Training dataset shape: ', X_train.shape)
    print('Validation dataset shape: ', X_test.shape)

    train_dataset = Dataset(X_train, y_train)
    test_dataset = Dataset(X_test, y_test)

    train_dataset.samples = samples
    print('Sample dataset shape: ', train_dataset.samples.shape)

    return train_dataset, test_dataset
def train(seed, epochs, n_gpus, dataset):
    with LocalCUDACluster(n_workers=n_gpus, threads_per_worker=4) as cluster:
        with Client(cluster) as client:
            # Fetch dataset using sklearn
            if dataset == 'boston':
                dataset = load_boston()
                param = {}
            elif dataset == 'covertype':
                dataset = fetch_covtype()
                param = {
                    'objective': 'multi:softmax',
                    'num_class': 8
                    # 'single_precision_histogram': True
                }

            param['verbosity'] = 2
            param['tree_method'] = 'gpu_hist'

            # Rechunking is required for the covertype dataset
            X = da.from_array(dataset.data, chunks=1000)
            y = da.from_array(dataset.target, chunks=1000)

            # Create 0.75/0.25 train/test split
            X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                                test_size=0.25,
                                                                train_size=0.75,
                                                                random_state=0)

            dtrain = DaskDMatrix(client, X_train, y_train)
            dtest = DaskDMatrix(client, X_test, y_test)

            random_seed(seed, param)

            gpu_runtime = time.time()
            model_training_results = xgb.dask.train(client, param, dtrain,
                                                    num_boost_round=epochs,
                                                    evals=[(dtest, 'test')])
            print(model_training_results)
            print(f'GPU Run Time: {str(time.time() - gpu_runtime)} seconds')
def train_test_split(*data, shuffle=True, random_state=None, **kwargs):
    if exist_dask_dataframe(*data):
        if len(data) > 1:
            data = [make_divisions_known(to_dask_type(x)) for x in data]
            head = data[0]
            for i in range(1, len(data)):
                if data[i].divisions != head.divisions:
                    print('-' * 10, f'repartition {i} from {data[i].divisions} to {head.divisions}')
                    data[i] = data[i].repartition(divisions=head.divisions)
        result = dm_sel.train_test_split(*data, shuffle=shuffle, random_state=random_state, **kwargs)
    else:
        result = sk_sel.train_test_split(*data, shuffle=shuffle, random_state=random_state, **kwargs)

    return result
def searchBestForest(params, client):
    c = client
    print(c)

    data = getDataForTraining(getData())
    data.to_csv('../ignore/dataPrepared.csv')
    data = dd.read_csv('../ignore/dataPrepared.csv').drop(columns='Unnamed: 0')

    X_train, X_test, y_train, y_test = train_test_split(data.drop(columns='price'),
                                                        data.price,
                                                        test_size=0.2)
    # materialize the dask collections in memory for the scikit-learn estimator
    X_train, X_test, y_train, y_test = [ele.compute() for ele in [X_train, X_test, y_train, y_test]]

    with joblib.parallel_backend('dask'):
        model = RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=10,
                                      max_features='auto', max_leaf_nodes=None,
                                      min_impurity_decrease=0.0, min_impurity_split=None,
                                      min_samples_leaf=1, min_samples_split=2,
                                      min_weight_fraction_leaf=0.0, n_estimators=200,
                                      n_jobs=None, oob_score=False, random_state=None,
                                      verbose=0, warm_start=False)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        bestMod = {'model': model, 'R2_score': r2_score(y_test, y_pred)}
        contador = 1
        print(bestMod)

        for estimators in params['n_estimators']:
            for features in params['max_features']:
                for dep in params['max_depth']:
                    for samples in params['min_samples_split']:
                        for samplesL in params['min_samples_leaf']:
                            for boot in params['bootstrap']:
                                model = RandomForestRegressor(
                                    bootstrap=boot, criterion='mse', max_depth=dep,
                                    max_features=features, max_leaf_nodes=None,
                                    min_impurity_decrease=0.0, min_impurity_split=None,
                                    min_samples_leaf=samplesL, min_samples_split=samples,
                                    min_weight_fraction_leaf=0.0, n_estimators=estimators,
                                    n_jobs=None, oob_score=False, random_state=None,
                                    verbose=0, warm_start=False)
                                model.fit(X_train, y_train)
                                y_pred = model.predict(X_test)
                                r2 = r2_score(y_test, y_pred)
                                if r2 > bestMod['R2_score']:
                                    bestMod = {'model': model, 'R2_score': r2}
                                print(bestMod)
                                del model
x_data1 = da.array(X_data1)
X_data2 = np.load('D:/GAT/Sound/2next2b+.npy')
x_data2 = da.array(X_data2)
X_data3 = np.load('D:/GAT/Sound/3next2b+.npy')
x_data3 = da.array(X_data3)

x_data = da.concatenate([x_data1, x_data2, x_data3], axis=-1)
print(x_data.shape)

y_data = pd.read_csv('D:/GAT/subm/train_answer.csv', index_col=0)
y_labels = y_data.columns.values
y_data = y_data.values
Y_data = y_data

# #Preprocessing----------------------------------------------------------------------------------------------------------
x_train, xtest, y_train, ytest = train_test_split(x_data, Y_data, train_size=0.8, random_state=42)
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, train_size=0.8, random_state=42)

# kf = KFold(n_splits=4)
# for train_index, test_index in kf.split(x_data):
#     x_train, x_test = x_data[train_index], x_data[test_index]
#     y_train, y_test = y_data[train_index], y_data[test_index]

batch_size = 32

# train_generator = ImageDataGenerator(horizontal_flip=True, width_shift_range=0.1)
# train_Iterator = train_generator.flow(x_train, y_train, batch_size=batch_size)
#
# valid_generator = ImageDataGenerator()
# valid_Iterator = valid_generator.flow(x_test, y_test, batch_size=8)

input = (x_train.shape[1], x_train.shape[2], x_train.shape[3])
#!/usr/bin/env python
# coding: utf-8

# In[ ]:

# https://www.kaggle.com/puneetgrover/speed-up-your-algorithms-dask
# dask_kaggle_Regression

# In[1]:

from dask_ml.datasets import make_regression
import dask.dataframe as dd

X, y = make_regression(n_samples=1_000_000, chunks=50000)

# In[2]:

df = dd.from_dask_array(X)
df.head()

# In[3]:

from dask_ml.model_selection import train_test_split, GridSearchCV

# train_test_split returns (X_train, X_test, y_train, y_test), in that order
xtr, xval, ytr, yval = train_test_split(X, y)

# In[ ]:
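# Hypothetical follow-up cell (an assumption, not part of the original notebook):
# the GridSearchCV imported above could tune a regressor on the training split.
# The Ridge estimator and alpha grid below are illustrative choices only.
from sklearn.linear_model import Ridge

grid = GridSearchCV(Ridge(), {'alpha': [0.1, 1.0, 10.0]}, cv=3)
grid.fit(xtr, ytr)  # dask-ml's GridSearchCV accepts dask arrays as well as NumPy arrays
print(grid.best_params_, grid.best_score_)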
    # (tail of the to_dask_array helper used below: it converts delayed partitions
    #  into one concatenated dask array)
    results = compute('float64', *shapes)  # trigger computation to find shape
    dtype, shapes = results[0], results[1:]
    chunks = [da.from_delayed(part.values, shape, dtype)
              for part, shape in zip(partitions, shapes)]
    return da.concatenate(chunks, axis=0)


# Test-train split
from dask_ml.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(to_dask_array(X),
                                                    to_dask_array(y),
                                                    random_state=99)

###################################################################################
# Fitting the Logistic Regression Classifier
from dask_ml.linear_model import LogisticRegression

lr = LogisticRegression()
with ProgressBar():
    lr.fit(X_train, y_train)

print('Logistic Regression Score : ', lr.score(X_test, y_test).compute())

##### OUTPUT --------> Logistic Regression Score : 0.70025
def run():
    client = Client()

    from dask_ml.datasets import make_classification

    df = dd.read_csv("isHealth.csv", assume_missing=True, sample=640000000, blocksize="10MB")
    df = df.fillna(0)
    for column in df.columns:
        if '.' in column:
            df = df.drop(column, axis=1)
    # for column in droppedColumns:
    #     df = df.drop(column, axis=1)

    y = df['acquired']
    X = df.drop('acquired', axis=1)

    from dask_ml.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.1)
    # X_train, X_train2, y_train, y_train2 = train_test_split(X_train, y_train)

    x_test_tickers = X_test['ticker'].values.compute()
    x_test_dates = X_test['date'].values.compute()
    print(x_test_tickers[0])
    np.savetxt("x_test_tickers.csv", [x_test_tickers, x_test_dates], delimiter=",", fmt='%s')
    np.savetxt("x_test_dates.csv", x_test_dates, delimiter=",", fmt='%s')
    print("GOOD")

    for column in X_train.columns:
        if 'ticker' in column or 'date' in column:
            X_train = X_train.drop(column, axis=1)
            X_test = X_test.drop(column, axis=1)

    X_train = X_train.to_dask_array()
    X_test = X_test.values.compute()
    y_train = y_train.to_dask_array()
    y_test = y_test.values.compute()
    np.savetxt("y_test.csv", y_test, delimiter=",")

    from dask_ml.wrappers import Incremental
    from sklearn.linear_model import SGDClassifier
    from sklearn.neural_network import MLPClassifier
    from dask_ml.wrappers import ParallelPostFit

    est = MLPClassifier(solver='adam', activation='relu', random_state=0)
    inc = Incremental(est, scoring='neg_log_loss')

    print("WORKING")
    for _ in range(10):
        inc.partial_fit(X_train, y_train, classes=[0, 1])
    print("FITTED")

    np.savetxt("predictions.csv", inc.predict_proba(X_test))
    print('Score:', inc.score(X_test, y_test))

    # model = MLPClassifier(solver='sgd', hidden_layer_sizes=(10, 2), random_state=1)
    params = {'alpha': np.logspace(-2, 1, num=1000)}

    from dask_ml.model_selection import IncrementalSearchCV
    search = IncrementalSearchCV(est, params, n_initial_parameters=100, patience=20, max_iter=100)
    search.fit(X_train, y_train, classes=[0, 1])
    print(search)
    print("SCORE")
    print("FITTED")
    np.savetxt("predictions.csv", inc.predict_proba(X_test))
    print('Score:', inc.score(X_test, y_test))
#del Y_vector, outputset
gc.collect()

'''
#a = np.argmax(dummy_y, axis=1)
from dask_ml.model_selection import train_test_split
from dask_ml.preprocessing import DummyEncoder

encoder = DummyEncoder()
yyy = encoder.fit_transform(Y_cat)

dummy_y = dummy_y.values
'''

#Splitting dataset
from dask_ml.model_selection import train_test_split

#X_train, X_test, Y_train, Y_test = train_test_split(X, dummy_y, test_size=0.3, random_state=42)
X_train, X_test, Y_train, Y_test = train_test_split(dask_X, dask_dummy_y, random_state=42)

#del X, dummy_y, dask_X
gc.collect()

# Number of categories
y_catagories = len(Y_test[0])
#number of rows
outcome_size = len(Y_test)

'''
#Feature scaling: don't need to feature scale because all data is binary
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
#print a few lines
print("\n Dataframe: ")
print(df.head())

#Get target variable
dt = dd.from_array(data['target'])
dt.columns = ["target"]

#print target classes example
print("\n Target: ")
print(dt.head())

# train and test split
from dask_ml.model_selection import train_test_split
train, test, train_labels, test_labels = train_test_split(df, dt, random_state=123)

#xgboost
from dask_ml.xgboost import XGBClassifier
est = XGBClassifier()

#fit model
model = est.fit(train, train_labels)

#which features contribute most
import pandas as pd
featureimp = pd.DataFrame(model.feature_importances_)
featureimp.columns = ['classifier_feature_importance']
featureimp["variable"] = data['feature_names']

print("\n\n === Xgboost Classifier Feature Importance: === ")
# In[8]:

print("Scaler")
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

# In[7]:

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y["p.ERK"],
                                                    test_size=0.33,
                                                    random_state=101,
                                                    shuffle=True)

# In[9]:

print("train base model")
import joblib

model = SGDRegressor(verbose=2)
with joblib.parallel_backend('dask'):
    model.fit(X_train.compute(), y_train.compute())

# In[10]:
def train_model(context: MLClientCtx,
                dataset: DataItem,
                model_pkg_class: str,
                label_column: str = "label",
                train_validation_size: float = 0.75,
                sample: float = 1.0,
                models_dest: str = "models",
                test_set_key: str = "test_set",
                plots_dest: str = "plots",
                dask_key: str = "dask_key",
                dask_persist: bool = False,
                scheduler_key: str = '',
                file_ext: str = "parquet",
                random_state: int = 42) -> None:
    """
    Train a sklearn classifier with Dask

    :param context:                 Function context.
    :param dataset:                 Raw data file.
    :param model_pkg_class:         Model to train, e.g., "sklearn.ensemble.RandomForestClassifier",
                                    or json model config.
    :param label_column:            (label) Ground-truth y labels.
    :param train_validation_size:   (0.75) Train validation set proportion out of the full dataset.
    :param sample:                  (1.0) Select sample from dataset (n-rows/% of total), randomize rows as default.
    :param models_dest:             (models) Models subfolder on artifact path.
    :param test_set_key:            (test_set) Mlrun db key of held out data in artifact store.
    :param plots_dest:              (plots) Plot subfolder on artifact path.
    :param dask_key:                (dask key) Key of dataframe in dask client "datasets" attribute.
    :param dask_persist:            (False) Should the data be persisted (through the `client.persist`)
    :param scheduler_key:           (scheduler) Dask scheduler configuration, json also logged as an artifact.
    :param file_ext:                (parquet) format for test_set_key hold out data
    :param random_state:            (42) sklearn seed
    """
    if scheduler_key:
        client = Client(scheduler_key)
    else:
        client = Client()

    context.logger.info("Read Data")
    df = dataset.as_df(df_module=dd)

    context.logger.info("Prep Data")
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    df = df.select_dtypes(include=numerics)

    if df.isna().any().any().compute() == True:
        raise Exception('NA values found')

    df_header = df.columns

    df = df.sample(frac=sample).reset_index(drop=True)
    encoder = LabelEncoder()
    encoder = encoder.fit(df[label_column])
    X = df.drop(label_column, axis=1).to_dask_array(lengths=True)
    y = encoder.transform(df[label_column])

    classes = df[label_column].drop_duplicates()  # no unique values in dask
    classes = [str(i) for i in classes]

    context.logger.info("Split and Train")
    X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y,
                                                                        train_size=train_validation_size,
                                                                        random_state=random_state)

    scaler = StandardScaler()
    scaler = scaler.fit(X_train)
    X_train_transformed = scaler.transform(X_train)
    X_test_transformed = scaler.transform(X_test)

    model_config = gen_sklearn_model(model_pkg_class, context.parameters.items())
    model_config["FIT"].update({"X": X_train_transformed, "y": y_train})

    ClassifierClass = create_class(model_config["META"]["class"])
    model = ClassifierClass(**model_config["CLASS"])

    with joblib.parallel_backend("dask"):
        model = model.fit(**model_config["FIT"])

    artifact_path = context.artifact_subpath(models_dest)
    plots_path = context.artifact_subpath(models_dest, plots_dest)

    context.logger.info("Evaluate")
    extra_data_dict = {}
    for report in (ROCAUC, ClassificationReport, ConfusionMatrix):
        report_name = str(report.__name__)
        plt.cla()
        plt.clf()
        plt.close()

        viz = report(model, classes=classes, per_class=True, is_fitted=True)
        viz.fit(X_train_transformed, y_train)            # Fit the training data to the visualizer
        viz.score(X_test_transformed, y_test.compute())  # Evaluate the model on the test data

        plot = context.log_artifact(PlotArtifact(report_name, body=viz.fig, title=report_name),
                                    db_key=False)
        extra_data_dict[str(report)] = plot

        if report_name == 'ROCAUC':
            context.log_results({"micro": viz.roc_auc.get("micro"),
                                 "macro": viz.roc_auc.get("macro")})
        elif report_name == 'ClassificationReport':
            for score_name in viz.scores_:
                for score_class in viz.scores_[score_name]:
                    context.log_results({score_name + "-" + score_class:
                                         viz.scores_[score_name].get(score_class)})

    viz = FeatureImportances(model, classes=classes, per_class=True, is_fitted=True,
                             labels=df_header.delete(df_header.get_loc(label_column)))
    viz.fit(X_train_transformed, y_train)
    viz.score(X_test_transformed, y_test)

    plot = context.log_artifact(PlotArtifact("FeatureImportances", body=viz.fig,
                                             title="FeatureImportances"),
                                db_key=False)
    extra_data_dict[str("FeatureImportances")] = plot

    plt.cla()
    plt.clf()
    plt.close()

    context.logger.info("Log artifacts")
    artifact_path = context.artifact_subpath(models_dest)
    plots_path = context.artifact_subpath(models_dest, plots_dest)

    context.set_label('class', model_pkg_class)

    context.log_model("model", body=dumps(model),
                      artifact_path=artifact_path,
                      model_file="model.pkl",
                      extra_data=extra_data_dict,
                      metrics=context.results,
                      labels={"class": model_pkg_class})

    context.log_artifact("standard_scaler",
                         body=dumps(scaler),
                         artifact_path=artifact_path,
                         model_file="scaler.gz",
                         label="standard_scaler")

    context.log_artifact("label_encoder",
                         body=dumps(encoder),
                         artifact_path=artifact_path,
                         model_file="encoder.gz",
                         label="label_encoder")

    df_to_save = delayed(np.column_stack)((X_test, y_test)).compute()
    context.log_dataset(test_set_key,
                        df=pd.DataFrame(df_to_save, columns=df_header),  # improve log dataset ability
                        format=file_ext,
                        index=False,
                        labels={"data-type": "held-out"},
                        artifact_path=context.artifact_subpath('data'))

    context.logger.info("Done!")
client

# In[4]:

from dask_ml.datasets import make_regression

X, y = make_regression(n_samples=4000000, n_features=32, chunks=1000,
                       n_informative=10, random_state=101)

# In[5]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

# In[6]:

params = {
    'objective': 'reg:squarederror',
    'n_estimators': 100000,
    'max_depth': 4,
    'eta': 0.01,
    'subsample': 0.5,
    'min_child_weight': 0.5
}

bst = dask_xgboost.train(client, params, X_train, y_train, num_boost_round=100)

# In[7]:
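# Hypothetical next cell (an assumption, not in the original notebook): the deprecated
# dask-xgboost package also exposes a distributed predict helper that takes the client,
# the trained booster and a dask collection, and returns lazy predictions.
y_pred = dask_xgboost.predict(client, bst, X_test)
print(y_pred.compute()[:10])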