def baseline_custom_NN(train, train_class, test):
    parameters = {
        "batch_size": [10, 100],
        "node_per_layer": [1000, 500],
        "layer_count": [1, 2],
        "learning_rate": [1000, 500],
        "epoch": [15]
    }
    grid = GridSearchCV(CustomNNClassifier(), parameters, refit=True, cv=3,
                        verbose=5, return_train_score=True)
    with parallel_backend('threading'):
        grid.fit(train, train_class)
    best_parameters = grid.best_params_
    filename = "Results\\baseline_custom_NN_performance"
    write_baselines(filename, "Custom Neural Network", grid)
    print("Best score for best parameters:")
    print(grid.best_score_)
    print(grid.best_params_)
    pred = grid.predict(test)
    return pred
def run(self, num_features=0, run_mode='regular', stratified_cv=True,
        n_jobs=1, print_freq=5, features_to_keep_indices=None):
    # define a dictionary to initialize the SpFtSel kernel
    sp_params = dict()
    sp_params['num_features'] = num_features
    sp_params['run_mode'] = run_mode
    sp_params['stratified_cv'] = stratified_cv
    sp_params['n_jobs'] = n_jobs
    sp_params['print_freq'] = print_freq
    sp_params['features_to_keep_indices'] = features_to_keep_indices
    # *** for advanced users ***
    # two gain types are available: bb (Barzilai & Borwein) or mon (monotone)
    sp_params['gain_type'] = 'bb'
    if run_mode == 'extended':
        sp_params['cv_folds'] = 5
        sp_params['iter_max'] = 200
        sp_params['stall_limit'] = 50
        sp_params['num_grad_avg'] = 10
        sp_params['cv_reps_grad'] = 1
        sp_params['cv_reps_eval'] = 5
        sp_params['num_gain_smoothing'] = 1
    elif run_mode == 'regular':
        sp_params['cv_folds'] = 5
        sp_params['iter_max'] = 100
        sp_params['stall_limit'] = 25
        sp_params['num_grad_avg'] = 2
        sp_params['cv_reps_grad'] = 1
        sp_params['cv_reps_eval'] = 2
        sp_params['num_gain_smoothing'] = 2
    else:
        raise ValueError('Error: Unknown run mode')

    kernel = SpFtSelKernel(sp_params)
    kernel.set_inputs(x=self._x, y=self._y, wrapper=self._wrapper,
                      scoring=self._scoring)
    kernel.shuffle_data()
    kernel.init_parameters()
    kernel.gen_cv_task()
    with parallel_backend('multiprocessing'):
        kernel.run_kernel()
    self.results = kernel.parse_results()
    return self
def clustering(self, examples):
    """
    After the execution of the clustering algorithm, each micro-cluster
    is represented by four components (N, LS, SS and T).
    """
    logging.info('clustering {}, {}'.format(len(examples), examples[0]))
    assert len(examples) > 0
    n_samples = len(examples)
    n_clusters = min(self.k, int(n_samples / (3 * self.representationThr)))
    assert n_samples >= n_clusters
    df = pandas.DataFrame(data=[ex.item for ex in examples])
    kmeans = KMeans(n_clusters=n_clusters)
    if self.daskEnableKmeans:
        logging.info('clustering with dask kmeans')
        with joblib.parallel_backend('dask'):
            kmeans.fit(df)
    else:
        kmeans.fit(df)
    clusters = []
    for centroid in kmeans.cluster_centers_:
        c = Cluster()
        c.center = centroid
        clusters.append(c)
    # assign each example to its closest cluster
    for ex in examples:
        nearCl, dist = self.closestCluster(ex.item, clusters)
        nearCl.addExample(ex)
    return clusters
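# The docstring above refers to the classic micro-cluster summary
# (N, LS, SS, T): example count, linear sum, squared sum, and a timestamp.
# A minimal sketch of how a cluster might maintain those components as
# examples are added -- the class and attribute names here are assumptions
# for illustration, not the actual Cluster API of this codebase.
import time
import numpy as np

class MicroClusterSketch:
    def __init__(self, dim):
        self.N = 0               # number of examples absorbed
        self.LS = np.zeros(dim)  # linear sum of the examples
        self.SS = np.zeros(dim)  # sum of the squared components
        self.T = time.time()     # timestamp of the last update

    def add_example(self, item):
        item = np.asarray(item, dtype=float)
        self.N += 1
        self.LS += item
        self.SS += item ** 2
        self.T = time.time()

    @property
    def center(self):
        return self.LS / self.N

    @property
    def radius(self):
        # standard micro-cluster radius: sqrt(mean(SS/N - (LS/N)^2))
        variance = self.SS / self.N - (self.LS / self.N) ** 2
        return float(np.sqrt(np.clip(variance, 0, None).mean()))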
def baseline_custom_neural_network():
    parameters = {
        "batch_size": [10],
        "node_per_layer": [4],
        "layer_count": [2, 1],
        "learning_rate": [0.4, 0.2]
    }
    inputs, outputs = build_XOR_dataset()
    test_input = np.array([[0, 0], [1, 1], [1, 0], [0, 1]])
    grid = GridSearchCV(CustomNNClassifier(), parameters, refit=True, cv=3,
                        verbose=5, return_train_score=True)
    with parallel_backend('threading'):
        grid.fit(inputs, outputs)
    best_parameters = grid.best_params_
    pred = grid.predict(test_input)
    print("Best score for best parameters:")
    print(grid.best_score_)
    print(grid.best_params_)
    print(pred)
def baseline_logistic_regression(train, train_class, test, original=False):
    """Baseline classifier using logistic regression with grid search."""
    parameters = {'penalty': ['l2'], 'C': np.logspace(-3, 0, 20)}
    grid = GridSearchCV(LogisticRegression(), parameters, refit=True, cv=3,
                        verbose=5, return_train_score=True)
    with parallel_backend('threading'):
        grid.fit(train, train_class)
    best_parameters = grid.best_params_
    pred = grid.predict(test)
    filename = "Results\\baseline_logistic_regression_performance"
    if original:
        filename += "_original"
    write_baselines(filename, "Logistic Regression", grid)
    print("Best score for best parameters:")
    print(grid.best_score_)
    print(grid.best_params_)
    return pred
def kfold_cv(self):
    """
    K-fold cross-validator.

    Returns: fitted values and test values to be used for model optimization.
    """
    with joblib.parallel_backend("dask"):
        self.xgb_est = XGBClassifier(
            max_depth=5,
            subsample=0.7,
            scale_pos_weight=2,
            num_class=1,
            learning_rate=0.05,
        )
        cv = KFold(n_splits=8, random_state=24, shuffle=True)
        for train_index, test_index in cv.split(self.X):
            X_train, X_test, y_train, y_test = (
                self.X[train_index],
                self.X[test_index],
                self.y[train_index],
                self.y[test_index],
            )
            self.xgb_est.fit(X_train, y_train)
            y_pred = self.xgb_est.predict(X_test)
            self.predictions.append(y_pred)
            self.ypred_iterations.append(y_pred)
            self.ytest_iterations.append(y_test)
            self.predicted_probability_iterations.append(
                self.xgb_est.predict_proba(X_test))
def add_classification(dataframe_path,
                       classifier_path: RandomForestClassifier, emotion: str):
    client = Client(processes=False)
    print(client)
    with parallel_backend("dask"):
        PATIENT_DIRS = [
            x for x in glob.glob(os.path.join(dataframe_path, "*cropped"))
            if "hdfs" in os.listdir(x)
        ]
        for patient_dir in tqdm(PATIENT_DIRS):
            try:
                curr_df = dd.read_hdf(
                    os.path.join(patient_dir, "hdfs", "au.hdf"), "/data")
                # curr_df = curr_df[curr_df[" success"] == 1]
                curr_df = curr_df.compute()
                if (len(curr_df) and "annotated" in curr_df.columns
                        and "frame" in curr_df.columns):
                    kwargs = {
                        "{0}_predicted".format(emotion):
                            lambda x: predict(x, classifier_path.predict),
                        "{0}_predicted_proba".format(emotion):
                            lambda x: [
                                n[1] for n in
                                predict(x, classifier_path.predict_proba)
                            ],
                    }
                    imp_columns = [
                        'patient', 'success', 'frame', 'timestamp',
                        'annotated', 'confidence', 'session', 'vid', 'datetime'
                    ]
                    emotion_df = curr_df[imp_columns]
                    emotion_df = emotion_df.assign(**kwargs)
                    # create a name for the new dataframe and store it at
                    # out_fullpath; the original call passed
                    # os.path.join(patient), but `patient` is undefined here,
                    # so a path derived from patient_dir is assumed as a
                    # placeholder
                    out_fullpath = os.path.join(
                        patient_dir, "hdfs",
                        "{0}_predicted.hdf".format(emotion))
                    # emotion_df is a pandas DataFrame after .compute(), so
                    # pandas' to_hdf applies (no dask scheduler kwarg)
                    emotion_df.to_hdf(out_fullpath, "/data", format="table")
                else:
                    print(patient_dir + " HAS A PROBLEM")
            except (AttributeError, ValueError, KeyError) as e:
                print(e)
def basic(scheduler_address, backends):
    ESTIMATORS = {
        'RandomForest': RandomForestClassifier(n_estimators=100),
        'ExtraTreesClassifier': ExtraTreesClassifier(n_estimators=100)
    }
    X_train, X_test, y_train, y_test = load_data()
    print_data(X_train, y_train, X_test, y_test)
    BACKENDS = build_backends(backends, scheduler_address, X_train, y_train)

    print("Training Classifiers")
    print("====================")
    error, train_time, test_time = {}, {}, {}
    for est_name, estimator in sorted(ESTIMATORS.items()):
        for backend, backend_kwargs in BACKENDS:
            print("Training %s with %s backend... " % (est_name, backend),
                  end="")
            estimator_params = estimator.get_params()
            # seed every *random_state parameter the estimator exposes
            estimator.set_params(**{
                p: RANDOM_STATE
                for p in estimator_params if p.endswith("random_state")
            })
            if "n_jobs" in estimator_params:
                estimator.set_params(n_jobs=-1)
            # key for the results
            name = '%s, %s' % (est_name, backend)
            with parallel_backend(backend, **backend_kwargs):
                time_start = time()
                estimator.fit(X_train, y_train)
                train_time[name] = time() - time_start
                time_start = time()
                y_pred = estimator.predict(X_test)
                test_time[name] = time() - time_start
            error[name] = zero_one_loss(y_test, y_pred)
            print("done")

    print()
    print("Classification performance:")
    print("===========================")
    print("%s %s %s %s" % ("Classifier  ", "train-time", "test-time",
                           "error-rate"))
    print("-" * 44)
    for name in sorted(error, key=error.get):
        print("%s %s %s %s" % (name,
                               ("%.4fs" % train_time[name]),
                               ("%.4fs" % test_time[name]),
                               ("%.4f" % error[name])))
    print()
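# `build_backends` above is project-specific and not shown here. For context,
# it presumably returns (backend_name, backend_kwargs) pairs that feed
# directly into joblib's parallel_backend. A hedged sketch under that
# assumption:
from dask.distributed import Client

def build_backends_sketch(backends, scheduler_address, X_train, y_train):
    pairs = []
    for backend in backends:
        if backend == 'dask':
            # instantiating the Client registers the scheduler for the
            # 'dask' backend; scatter pre-ships the training data to workers
            Client(scheduler_address)
            pairs.append(('dask', {'scatter': [X_train, y_train]}))
        else:
            # 'threading', 'loky', 'multiprocessing' need no extra kwargs
            pairs.append((backend, {}))
    return pairs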
def clustering(self, examples: typing.List[Vector],
               label: str = None) -> ClusterList:
    n_clusters = min(
        self.CONSTS.k,
        int(len(examples) / (3 * self.CONSTS.representationThr)))
    kmeans = KMeans(n_clusters=n_clusters)
    with joblib.parallel_backend('dask'):
        kmeans.fit(examples)
    return [
        Cluster(center=centroid, label=label)
        for centroid in kmeans.cluster_centers_
    ]
def run(self, num_features=0, run_mode='regular'):
    # define a dictionary to initialize the SPFSR engine
    sp_params = dict()
    sp_params['num_features'] = num_features
    # how many cores to use for parallel processing during cross-validation;
    # this value is passed directly to cross_val_score()
    sp_params['n_jobs'] = 1
    # two gain types are available: bb (Barzilai & Borwein) or mon (monotone)
    sp_params['gain_type'] = 'bb'
    if run_mode == 'extended':
        sp_params['iter_max'] = 200
        sp_params['stall_limit'] = 50
        sp_params['num_grad_avg'] = 10
        sp_params['cv_reps_grad'] = 1
        sp_params['cv_reps_eval'] = 5
        sp_params['num_gain_smoothing'] = 1
    elif run_mode == 'regular':
        sp_params['iter_max'] = 100
        sp_params['stall_limit'] = 25
        sp_params['num_grad_avg'] = 2
        sp_params['cv_reps_grad'] = 1
        sp_params['cv_reps_eval'] = 2
        sp_params['num_gain_smoothing'] = 2
    else:
        raise ValueError('Error: Unknown SPFSR run mode.')

    # set other algorithm parameters
    sp_params['print_freq'] = 5  # how often to print iteration results
    sp_params['cv_folds'] = 5
    sp_params['scoring_metric'] = scorer.accuracy_scorer
    sp_params['stratified_cv'] = True
    sp_params['maximize_score'] = True
    # two performance evaluation methods are available: cv or resub
    sp_params['perf_eval_method'] = 'cv'

    kernel = SpfsrKernel(sp_params)
    kernel.set_inputs(x=self.x, y=self.y, wrapper=self.wrapper)
    kernel.shuffle_data()
    kernel.init_parameters()
    kernel.gen_cv_task()
    with parallel_backend('multiprocessing'):
        kernel.run_spfsr()
    self.results = kernel.parse_results()
    return self
def bestKNN(X, y, Xt, yt):
    clf = GridSearchCV(
        KNeighborsClassifier(),
        {
            'n_neighbors': [5, 8, 13],
            'metric': ['euclidean', 'hamming', 'dice', 'jaccard']
        },
        scoring='accuracy',
        cv=5)
    with parallel_backend('threading', n_jobs=24):
        clf.fit(X, y)
    results = clf.cv_results_
    print(results)
    acc = clf.score(Xt, yt)
    print(acc)
    return results
def nested(scheduler_address, backends, classifier_n_jobs=-1):
    X_train, X_test, y_train, y_test = load_data()
    print_data(X_train, y_train, X_test, y_test)
    BACKENDS = build_backends(backends, scheduler_address, X_train, y_train)
    n_jobs_grid = [-1, 1]
    error, train_time = {}, {}
    for backend, backend_kwargs in BACKENDS:
        for n_jobs_outer in n_jobs_grid:
            for n_jobs_inner in n_jobs_grid:
                clf = RandomForestClassifier(random_state=RANDOM_STATE,
                                             n_estimators=10,
                                             n_jobs=classifier_n_jobs)
                param_grid = {
                    'max_features': [4, 8, 12],
                    'min_samples_split': [2, 5],
                }
                gs = GridSearchCV(clf, param_grid, cv=5,
                                  n_jobs=n_jobs_inner, verbose=2)
                name = '%s,%s,%s' % (backend, n_jobs_outer, n_jobs_inner)
                print("Training with {}...".format(name), end="")
                with parallel_backend(backend, **backend_kwargs):
                    time_start = time()
                    cv_gs = cross_validate(gs, X=X_train, y=y_train, cv=5,
                                           return_train_score=True,
                                           n_jobs=n_jobs_outer)
                    train_time[name] = time() - time_start
                error[name] = cv_gs['test_score'].mean()
                print("done")
                df = pd.DataFrame(cv_gs)
                df.to_csv("{}.csv".format(name))

    print("{:<25} | {}".format("Backend", "Train Time"))
    print("-" * 44)
    for name in sorted(error, key=error.get):
        print("{:<25} | {}".format(name, train_time[name]))
def _calculate_vif(self):
    # `lib` and `variables` are assumed to be defined in the enclosing scope
    # (e.g. module-level configuration and the current column selection)
    if lib == 'sklearn':
        vif = [
            self.viffunc(self.X.iloc[:, variables], ix)
            for ix in range(self.X.iloc[:, variables].shape[1])
        ]
    elif lib == 'statsmodels':
        with parallel_backend('threading', n_jobs=self.n_jobs):
            vif = Parallel()(
                delayed(self.viffunc)(self.X.iloc[:, variables].values, ix)
                for ix in range(self.X.iloc[:, variables].shape[1]))
    else:
        vif = [
            self.viffunc(self.X.iloc[:, variables], ix)
            for ix in range(self.X.iloc[:, variables].shape[1])
        ]
    return vif
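# For context: in the statsmodels branch above, `viffunc` is presumably a
# thin wrapper around statsmodels' variance_inflation_factor, which scores
# one column of the design matrix against all the others. A minimal,
# standalone sketch of that pattern (the function and argument names here
# are illustrative, not the class's actual API):
import numpy as np
from joblib import Parallel, delayed, parallel_backend
from statsmodels.stats.outliers_influence import variance_inflation_factor

def vif_per_column(X, n_jobs=2):
    """Return one VIF per column of the 2-D array X, computed in threads."""
    X = np.asarray(X, dtype=float)
    with parallel_backend('threading', n_jobs=n_jobs):
        return Parallel()(
            delayed(variance_inflation_factor)(X, ix)
            for ix in range(X.shape[1]))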
def train():
    disc = VGGNet()
    cp = SaveBestParam(dirname='best')
    early_stop = StopRestore(patience=10)
    score = Score_ConfusionMatrix(scoring="accuracy", lower_is_better=False)
    pt = PrintLog(keys_ignored="confusion_matrix")
    net = NeuralNetClassifier(disc,
                              max_epochs=100,
                              lr=0.01,
                              device='cuda',
                              callbacks=[('best', cp), ('early', early_stop)],
                              iterator_train__shuffle=True,
                              iterator_valid__shuffle=False)
    net.set_params(callbacks__valid_acc=score)
    net.set_params(callbacks__print_log=pt)

    param_dist = {
        'lr': [0.05, 0.01, 0.005],
    }
    search = RandomizedSearchCV(net,
                                param_dist,
                                cv=StratifiedKFold(n_splits=3),
                                n_iter=3,
                                verbose=10,
                                scoring='accuracy')
    X, y = load_data()
    Client("127.0.0.1:8786")  # connect to an already-running Dask scheduler
    with joblib.parallel_backend('dask'):
        search.fit(X, y)
    with open('result.pkl', 'wb') as f:
        pickle.dump(search, f)
def baseline_SVC(train, train_class, test, original=False):
    """Baseline classifier using SVC."""
    parameters = {
        'C': np.logspace(-3, -1, 5),
        'gamma': [1, 0.1],
        'kernel': ['linear']
    }
    grid = GridSearchCV(SVC(), parameters, refit=True, cv=3, verbose=5,
                        return_train_score=True)
    with parallel_backend('threading'):
        grid.fit(train, train_class)
    best_parameters = grid.best_params_
    pred = grid.predict(test)
    filename = "Results\\baseline_SVC_performance"
    if original:
        filename += "_original"
    write_baselines(filename, "SVC", grid)
    print("Best score for best parameters:")
    print(grid.best_score_)
    print(grid.best_params_)
    return pred
def tune_parameters_RL(X, estimator, non_negative=0, distributed=0,
                       scheduler_host="", coeff_penalty_range=(0.0001, 1, 10),
                       fit_params={}, scoring_function=None,
                       random_state=None):
    """
    Parameters tuner.

    It tunes the parameters of a representation learning estimator using
    3-split Monte Carlo sampling cross-validation.

    Parameters
    ----------
    X: array-like, shape=(n_samples, n_features)
        The matrix to decompose and analyse.
    estimator: RepresentationLearning class
        The estimator you want to use to analyse the matrix.
    non_negative: boolean, optional
    distributed: int, optional
        If 0 the parameter search is executed in parallel on the machine
        the script is launched on. If 1 it is executed sequentially. If 2
        it is distributed over multiple machines connected through dask;
        in that case scheduler_host must also be specified.
    scheduler_host: string, optional
        If distributed=2 it is necessary to specify the scheduler of the
        dask network. The string must be "ip_address:port", for example
        "10.251.61.226:8786".
    coeff_penalty_range: float tuple, optional (low, high, number)
        The interval in which to tune the coefficient penalty and the
        number of values to try.
    fit_params: dictionary, optional
        The parameters to pass to the fitting procedure during GridSearch.
    scoring_function: callable or None, default=None
        A scorer callable object / function with signature
        scorer(estimator, X, y=None). If None, the score method of the
        estimator is used.
    random_state: int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number
        generator; if RandomState instance, random_state is the random
        number generator; if None, the random number generator is the
        RandomState instance used by `np.random`.

    Returns
    -------
    GridSearchCV
        The resulting GridSearch.
    """
    # ------------------ parameters control ------------------------------- #
    X = check_array(X)
    random_state = check_random_state(random_state)
    _check_range(coeff_penalty_range)
    if estimator is None:
        logging.error("passed estimator was None")
        raise ValueError("passed estimator was None")
    _check_estimator(estimator)
    estimator.non_negativity = non_negative
    if distributed == 2:
        if not scheduler_host:
            logging.error("Distributed execution requires a scheduler "
                          "specification. Changing the type to parallel.")
            distributed = 1
        else:
            distributed = _check_scheduler(scheduler_host)

    ss = MonteCarloBootstrap(n_splits=3, test_size=0.1,
                             random_state=random_state)
    params = _get_params_coeff(estimator, coeff_penalty_range,
                               representation_learning=1)
    jobs = 1 if distributed == 1 else cpu_count()
    gscv = GridSearchCV(estimator, params, cv=ss, n_jobs=jobs,
                        fit_params=fit_params, iid=True, refit=True,
                        scoring=scoring_function, verbose=1)
    if distributed == 2:
        register_parallel_backend('distributed', DistributedBackend)
        with parallel_backend('distributed', scheduler_host=scheduler_host):
            gscv.fit(X)
    else:
        gscv.fit(X)
    return gscv
def tune_parameters_DL(X, estimator=None, analysis=3, non_negative="none",
                       distributed=0, scheduler_host="", range_k=None,
                       dict_penalty_range=(0.0001, 1, 10),
                       coeff_penalty_range=(0.0001, 1, 10), fit_params={},
                       scoring_function=None, random_state=None):
    """
    Parameters tuner.

    It tunes the parameters of a dictionary learning estimator using
    3-split Monte Carlo sampling cross-validation.

    Parameters
    ----------
    X: array-like, shape=(n_samples, n_features)
        The matrix to decompose and analyse.
    estimator: DictionaryLearning class, optional
        The estimator you want to use to analyse the matrix. If None, only
        the search for the best number of atoms is performed.
    analysis: int, optional
        The type of tuning you want to perform.
        - 0: tune the number of atoms and the dictionary penalty together,
          then the coefficient penalty
        - 1: tune only the penalties and take the number of atoms as
          specified in the estimator
        - 2: tune only the number of atoms
        - 3: tune everything together, number of atoms and penalties
    non_negative: string, optional
        If "none" no non-negativity is imposed on the decomposition, if
        "coeff" non-negativity is imposed only on the coefficients, and if
        "both" non-negativity is imposed on both decomposition matrices.
    distributed: int, optional
        If 0 the parameter search is executed in parallel on the machine
        the script is launched on. If 1 it is executed sequentially. If 2
        it is distributed over multiple machines connected through dask;
        in that case scheduler_host must also be specified.
    scheduler_host: string, optional
        If distributed=2 it is necessary to specify the scheduler of the
        dask network. The string must be "ip_address:port", for example
        "10.251.61.226:8786".
    range_k: int or list, optional
        The maximum number of atoms to try when searching for the right k,
        or the list of possible values to try. If None, range_k is
        computed as int(min(p, 0.75 * n) / 2).
    dict_penalty_range: float tuple, optional (low, high, number)
        The interval in which to tune the dictionary penalty and the
        number of values to try.
    coeff_penalty_range: float tuple, optional (low, high, number)
        The interval in which to tune the coefficient penalty and the
        number of values to try.
    fit_params: dictionary, optional
        The parameters to pass to the fitting procedure during GridSearch.
    scoring_function: callable or None, default=None
        A scorer callable object / function with signature
        scorer(estimator, X, y=None). If None, the score method of the
        estimator is used.
    random_state: int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number
        generator; if RandomState instance, random_state is the random
        number generator; if None, the random number generator is the
        RandomState instance used by `np.random`.

    Returns
    -------
    GridSearchCV
        The resulting GridSearch.
    """
    # ------------------ parameters control ------------------------------- #
    X = check_array(X)
    random_state = check_random_state(random_state)
    _check_range(dict_penalty_range)
    _check_range(coeff_penalty_range)
    _check_non_negativity(non_negative, X)
    if estimator is None:
        analysis = 2
    else:
        _check_estimator(estimator)
        if estimator.non_negativity == "none":
            estimator.non_negativity = non_negative

    n, p = X.shape
    if range_k is None:
        # generally the optimal number of atoms k is low
        range_k = int(min(p, 0.75 * n) / 2)
    if (analysis in [0, 1, 3] and
            (dict_penalty_range is None or coeff_penalty_range is None)):
        logging.error("The range cannot be None")
        raise ValueError("The range cannot be None")
    if distributed == 2:
        if not scheduler_host:
            logging.error("Distributed execution requires a scheduler "
                          "specification. Changing the type to parallel.")
            distributed = 1
        else:
            distributed = _check_scheduler(scheduler_host)

    ss = MonteCarloBootstrap(n_splits=3, test_size=0.1,
                             random_state=random_state)
    jobs = 1 if distributed == 1 else cpu_count()

    # find first the parameters of the dictionary, then the coefficients
    if analysis == 0:
        params = _get_params_dict(estimator,
                                  dict_penalty_range=dict_penalty_range)
        if type(range_k) is int:
            params['k'] = list(range(2, range_k))
        else:
            params['k'] = range_k
        gscv = GridSearchCV(estimator, params, cv=ss, n_jobs=jobs,
                            scoring=scoring_function, iid=True, refit=True,
                            verbose=1)
        if distributed == 2:
            register_parallel_backend('distributed', DistributedBackend)
            with parallel_backend('distributed',
                                  scheduler_host=scheduler_host):
                gscv.fit(X)
        else:
            gscv.fit(X)
        estimator = gscv.best_estimator_
        params = _get_params_coeff(estimator, coeff_penalty_range)
    # find only the penalties together
    elif analysis == 1:
        params = _get_params(estimator, dict_penalty_range,
                             coeff_penalty_range)
    # find only the number of atoms
    elif analysis == 2:
        if type(range_k) is int:
            params = {'k': list(range(2, range_k))}
        else:
            params = {'k': range_k}
    # find everything together
    elif analysis == 3:
        params = _get_params(estimator, dict_penalty_range,
                             coeff_penalty_range)
        if type(range_k) is int:
            params['k'] = list(range(2, range_k))
        else:
            params['k'] = range_k
    else:
        logging.error("Unknown type of research, please try with another "
                      "setting")
        raise ValueError("Unknown type of research, please try with another "
                         "setting")

    gscv = GridSearchCV(estimator, params, cv=ss, fit_params=fit_params,
                        n_jobs=jobs, iid=True, scoring=scoring_function,
                        refit=True, verbose=1)
    if distributed == 2:
        register_parallel_backend('distributed', DistributedBackend)
        with parallel_backend('distributed', scheduler_host=scheduler_host):
            gscv.fit(X)
    else:
        gscv.fit(X)
    return gscv
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVC
import numpy as np
from dask.distributed import Client
# scikit-learn used to bundle joblib; on those versions import it from
# sklearn.externals instead of using joblib directly
from sklearn.externals import joblib

digits = load_digits()

param_space = {
    'C': np.logspace(-6, 6, 13),
    'gamma': np.logspace(-8, 8, 17),
    'tol': np.logspace(-4, -1, 4),
    'class_weight': [None, 'balanced'],
}

model = SVC(kernel='rbf')
search = RandomizedSearchCV(model, param_space, cv=3, n_iter=50, verbose=10)

client = Client()
with joblib.parallel_backend('dask'):
    search.fit(digits.data, digits.target)
# some parameters to test in parallel
param_space = {
    'C': np.logspace(-6, 6, 20),
    'gamma': np.logspace(-6, 1, 20)
}
svc_rbf = SVC(kernel='rbf', shrinking=False)
# `c` is presumably the ipyparallel client: one joblib job per engine
search = GridSearchCV(svc_rbf, param_space, return_train_score=True,
                      n_jobs=len(c))
with parallel_backend('ipyparallel'):
    search.fit(X_train, y_train)

results = pd.DataFrame(search.cv_results_)
results.to_csv(os.path.join(FILE_DIR, 'scores_rbf_digits.csv'))

scores = search.cv_results_['mean_test_score'].reshape(
    len(param_space['C']), len(param_space['gamma']))

plt.figure()
plt.imshow(scores, interpolation='nearest', cmap=plt.cm.hot)
plt.xlabel('gamma')
plt.ylabel('C')
plt.colorbar()
plt.xticks(np.arange(len(param_space['gamma'])),
           ["%.2E" % x for x in param_space['gamma']],
           fontsize=8, rotation=45)
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.externals.joblib import parallel_backend
import numpy as np

import data

# explicitly set the random seed to help parallelization
np.random.seed(0)

# use naive oversampling for the grid search
x_train_rs, x_test_rs, y_train_rs, y_test_rs = data.parse_data_random_oversample(
    './creditcard.csv')

params = {
    'hidden_layer_sizes': [(100, 50, 25), (75, 35, 15), (50, 25, 12)],
    'activation': ['logistic', 'tanh'],
    'solver': ['sgd', 'adam'],
    'alpha': 10. ** -np.arange(3, 6),
    'learning_rate': ['invscaling', 'adaptive'],
    'max_iter': [1000, 1500, 2000]
}

# perform grid search with 5-fold cross-validation, using all available cores
mlp_model = GridSearchCV(MLPClassifier(), params, n_jobs=-1, cv=5, verbose=2)
with parallel_backend('threading'):
    mlp_model.fit(x_train_rs, y_train_rs.ravel())

# output the best model params and score
print('best score: {0:.6f}'.format(mlp_model.best_score_))
print('best params: ')
print(mlp_model.best_params_)
import time

import dask.datasets as ds
from dask_ml.linear_model import LogisticRegression
from dask_glm.datasets import make_classification

X, y = make_classification(n_samples=1000)

t = time.time()
lr = LogisticRegression()
lr.fit(X, y)
lr.predict(X)
lr.predict_proba(X)
print('\nTime dask_ml: ' + str(time.time() - t))

# Parallelize scikit-learn directly
from dask.distributed import Client
from sklearn.externals.joblib import parallel_backend

client = Client('localhost:8786')  # connect to a Dask cluster
print(client)

with parallel_backend('dask', scatter=[X, y]):
    # your normal scikit-learn code here
    t = time.time()
    lr = LogisticRegression()
    lr.fit(X, y)
    lr.predict(X)
    lr.predict_proba(X)
    print('\nTime dask_ml distributed: ' + str(time.time() - t))
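# Note: as written, both timings above exercise dask_ml's LogisticRegression,
# since that is the class imported at the top. For an apples-to-apples
# "parallelize scikit-learn directly" comparison, the second block would
# presumably use sklearn's estimator instead -- a minimal sketch, assuming
# the data fits in memory:
from sklearn.linear_model import LogisticRegression as SkLogisticRegression
from sklearn.externals.joblib import parallel_backend

X_np, y_np = X.compute(), y.compute()  # materialize for scikit-learn
with parallel_backend('dask', scatter=[X_np, y_np]):
    sk_lr = SkLogisticRegression(max_iter=1000)
    sk_lr.fit(X_np, y_np)  # joblib calls inside fit are routed to dask
    sk_lr.predict(X_np)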
def use_dask_xgb(out_q, emotion, df: dd.DataFrame):
    # NOTE: despite the name, the XGBClassifier variant of this function has
    # been retired; a RandomForestClassifier is trained below.
    # keep only the raw feature columns plus the annotation label
    data_columns = [
        x for x in df.columns
        if 'predicted' not in x and 'patient' not in x
        and 'session' not in x and 'vid' not in x
    ]
    df = df[data_columns]
    data = df[df['annotated'] != "N/A"]
    data = data[data['annotated'] != ""]

    # balance the classes by downsampling the non-emotion rows
    emote_data = data[data['annotated'] == emotion]
    non_emote_data = data[data['annotated'] != emotion]
    non_emote_data = non_emote_data.sample(frac=len(emote_data) /
                                           len(non_emote_data))
    data = dd.concat([emote_data, non_emote_data], interleave_partitions=True)
    labels = (data['annotated'] == emotion)
    del data['annotated']

    print("PERSISTING DATA")
    data = data.compute()
    labels = labels.compute()
    X_train, X_test, y_train, y_test = train_test_split(data, labels)

    scoring = ['precision', 'recall']
    print("TRAINING")
    classifier = RandomForestClassifier(n_estimators=100)
    with parallel_backend('dask'):
        scores = cross_validate(classifier, X_train, y_train,
                                scoring=scoring, cv=5,
                                return_train_score=True)
    out_q.put("Scores for emotion {0} \n".format(emotion))
    out_q.put("Cross val train precision for classifier {0}:\n{1}\n".format(
        classifier, scores['train_precision'].mean()))
    out_q.put("Cross val train recall for classifier {0}:\n{1}\n".format(
        classifier, scores['train_recall'].mean()))
    out_q.put("Cross val test precision for classifier {0}:\n{1}\n".format(
        classifier, scores['test_precision'].mean()))
    out_q.put("Cross val test recall for classifier {0}:\n{1}\n".format(
        classifier, scores['test_recall'].mean()))

    print("PREDICTING")
    expected = y_test
    with parallel_backend('dask'):
        classifier.fit(X_train, y_train)
        predicted = classifier.predict(X_test)
    out_q.put("Classification report for classifier %s:\n%s\n" %
              (classifier, metrics.classification_report(expected, predicted)))
    out_q.put("Confusion matrix:\n%s\n" %
              metrics.confusion_matrix(expected, predicted))
    pickle.dump(
        classifier,
        open('{0}_trained_RandomForest_with_pose.pkl'.format(emotion), 'wb'))
def run_task(seed, task_id, estimator_name, n_iter, n_jobs, n_folds_inner_cv,
             profile, joblib_tmp_dir, run_tmp_dir):
    # retrieve dataset / task
    task = openml.tasks.get_task(task_id)
    num_features = task.get_X_and_y()[0].shape[1]
    indices = task.get_dataset().get_features_by_type('nominal',
                                                      [task.target_name])

    # retrieve classifier
    classifierfactory = openmlstudy14.pipeline.EstimatorFactory(
        n_folds_inner_cv, n_iter, n_jobs)
    estimator = classifierfactory.get_flow_mapping()[estimator_name](
        indices, num_features=num_features)

    print('Running task with ID %d.' % task_id)
    print('Arguments: random search iterations: %d, inner CV folds %d, '
          'n parallel jobs: %d, seed %d' % (n_iter, n_folds_inner_cv,
                                            n_jobs, seed))
    print('Model: %s' % str(estimator))
    flow = openml.flows.sklearn_to_flow(estimator)
    flow.tags.append('study_14')

    import time
    start_time = time.time()

    # TODO generate a flow first
    if profile is None:
        import warnings
        with warnings.catch_warnings():
            warnings.filterwarnings(
                'ignore', module=r'sklearn\.externals\.joblib\.parallel')
            run = openml.runs.run_flow_on_task(task, flow, seed=seed)
    else:
        print('Using ipython parallel with scheduler file %s' % profile)
        # wait until the ipyparallel controller has written its engine file
        for i in range(1000):
            profile_file = os.path.join(os.path.expanduser('~'), '.ipython',
                                        'profile_%s' % profile, 'security',
                                        'ipcontroller-engine.json')
            try:
                with open(profile_file) as fh:
                    scheduler_information = yaml.load(fh)
                break
            except FileNotFoundError:
                print('scheduler file %s not found. sleeping ... zzz'
                      % profile_file)
                time.sleep(1)
                continue
        c = Client(profile=profile)
        bview = c.load_balanced_view()
        register_parallel_backend(
            'ipyparallel',
            lambda: NPCachingIpyParallelBackend(view=bview,
                                                tmp_dir=joblib_tmp_dir))
        with parallel_backend('ipyparallel'):
            run = openml.runs.run_flow_on_task(task, flow, seed=seed)

    end_time = time.time()
    run.tags.append('study_14')

    tmp_dir = os.path.join(run_tmp_dir, '%s_%s' % (str(task_id),
                                                   estimator_name))
    print(tmp_dir)
    try:
        os.makedirs(tmp_dir)
    except Exception as e:
        print(e)
    run_xml = run._create_description_xml()
    predictions_arff = arff.dumps(run._generate_arff_dict())
    with open(tmp_dir + '/run.xml', 'w') as f:
        f.write(run_xml)
    with open(tmp_dir + '/predictions.arff', 'w') as f:
        f.write(predictions_arff)
    run_prime = run.publish()
    print('READTHIS', estimator_name, task_id, run_prime.run_id,
          end_time - start_time)
    return run
parser.add_argument('--classifier', default='SVC',
                    choices=classifier_choices,
                    help='Classifier used by the model')
parser.add_argument('--train', default=10000,
                    help='Number of training samples to use')
parser.add_argument('--valid', default=1000,
                    help='Number of validation samples to use')
args = parser.parse_args()
logging.info(f'{args}')

Model = getattr(models, args.model)
Classifier = getattr(classifiers, args.classifier)

X_train, Y_train = read_data(get_dataset('train'), sample_n=args.train)
X_valid, Y_valid = read_data(get_dataset('valid'), sample_n=args.valid)

model = Model(classifier=Classifier, steps=[args.feature_model],
              memory='data/feature_cache')
with joblib.parallel_backend('threading', n_jobs=4):
    model.fit(X_train, Y_train)
    score = model.score(X_valid, Y_valid)

logging.info('')
logging.info(f'Overall F1: {score:.4f}')
logging.info('')
save_model(model)
end="") estimator_params = estimator.get_params() estimator.set_params( **{ p: RANDOM_STATE for p in estimator_params if p.endswith("random_state") }) if "n_jobs" in estimator_params: estimator.set_params(n_jobs=-1) # Key for the results name = "%s, %s" % (est_name, backend) with parallel_backend(backend, **backend_kwargs): time_start = time() estimator.fit(X_train, y_train) train_time[name] = time() - time_start time_start = time() y_pred = estimator.predict(X_test) test_time[name] = time() - time_start error[name] = zero_one_loss(y_test, y_pred) print("done") print() print("Classification performance:") print("===========================")
import distributed.joblib  # registers the dask.distributed joblib backend

# Scikit-learn bundles joblib, so you need to import from
# `sklearn.externals.joblib` instead of `joblib` directly.
# This is not true for every packaging (e.g. Fedora/Debian);
# in that case, use the following instead:
# from joblib import parallel_backend
from sklearn.externals.joblib import parallel_backend

from sklearn.datasets import load_digits
from sklearn.grid_search import RandomizedSearchCV
from sklearn.svm import SVC
import numpy as np

# load the digits data
digits = load_digits()

# set up the parameters to be explored
param_space = {
    'C': np.logspace(-6, 6, 13),
    'gamma': np.logspace(-8, 8, 17),
    'tol': np.logspace(-4, -1, 4),
    'class_weight': [None, 'balanced'],
}

# create the model
model = SVC(kernel='rbf')
search = RandomizedSearchCV(model, param_space, cv=3, n_iter=50, verbose=10)

# use sklearn's parallel_backend with the dask.distributed scheduler
with parallel_backend('dask.distributed', scheduler_host='localhost:8888'):
    search.fit(digits.data, digits.target)
def test_sklearn():
    from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
    from sklearn.linear_model import SGDClassifier, LogisticRegressionCV
    from sklearn.model_selection import GridSearchCV
    from sklearn.pipeline import Pipeline
    from sklearn.svm import SVC
    from sklearn.externals import joblib
    from sklearn.datasets import make_classification, load_digits, fetch_20newsgroups
    from dask_ml.wrappers import ParallelPostFit

    categories = [
        'alt.atheism',
        'talk.religion.misc',
    ]
    print("Loading 20 newsgroups dataset for categories:")
    print(categories)
    data = fetch_20newsgroups(subset='train', categories=categories)
    print("%d documents" % len(data.filenames))
    print("%d categories" % len(data.target_names))
    print()

    pipeline = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', SGDClassifier(max_iter=1000)),
    ])
    parameters = {
        'vect__max_df': (0.5, 0.75, 1.0),
        # 'vect__max_features': (None, 5000, 10000, 50000),
        'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
        # 'tfidf__use_idf': (True, False),
        # 'tfidf__norm': ('l1', 'l2'),
        # 'clf__alpha': (0.00001, 0.000001),
        # 'clf__penalty': ('l2', 'elasticnet'),
        # 'clf__n_iter': (10, 50, 80),
    }
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1,
                               cv=3, refit=False, iid=False)
    # fit once with the default joblib backend, once with dask
    grid_search.fit(data.data, data.target)
    with joblib.parallel_backend('dask'):
        grid_search.fit(data.data, data.target)

    X, y = load_digits(return_X_y=True)
    svc = ParallelPostFit(SVC(random_state=0, gamma='scale'))
    param_grid = {
        # use estimator__param instead of param
        'estimator__C': [0.01, 1.0, 10],
    }
    grid_search = GridSearchCV(svc, param_grid, iid=False, cv=3)
    grid_search.fit(X, y)
    big_X = da.concatenate(
        [da.from_array(X, chunks=X.shape) for _ in range(10)])
    predicted = grid_search.predict(big_X)

    X_train, y_train = make_classification(n_features=2, n_redundant=0,
                                           n_informative=2, random_state=1,
                                           n_clusters_per_class=1,
                                           n_samples=1000)
    N = 100
    X_large = da.concatenate(
        [da.from_array(X_train, chunks=X_train.shape) for _ in range(N)])
    y_large = da.concatenate(
        [da.from_array(y_train, chunks=y_train.shape) for _ in range(N)])
    clf = ParallelPostFit(LogisticRegressionCV(cv=3))
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_large)
    clf.score(X_large, y_large)
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from dask_ml.model_selection import GridSearchCV
from dask.distributed import Client
from sklearn.externals import joblib


def simple_nn(hidden_neurons):
    model = Sequential()
    model.add(Dense(hidden_neurons, activation='relu', input_dim=30))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='rmsprop',
                  metrics=['accuracy'])
    return model


param_grid = {'hidden_neurons': [100, 200, 300]}

if __name__ == '__main__':
    client = Client()
    cv = GridSearchCV(KerasClassifier(build_fn=simple_nn, epochs=100),
                      param_grid)
    X, y = load_breast_cancer(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    with joblib.parallel_backend("dask", scatter=[X_train, y_train]):
        cv.fit(X_train, y_train)
    print(f'Best Accuracy for {cv.best_score_:.4} using {cv.best_params_}')
from dask.distributed import Client, progress, wait

client = Client('149.165.148.24:8786')
print(client)

X, y = Xdata / 255., ydata
X_train, X_test = X[:60000], X[60000:]
y_train, y_test = y[:60000], y[60000:]

mlp = MLPClassifier(hidden_layer_sizes=(100, 10), max_iter=10, solver='sgd',
                    verbose=10, random_state=1)

from sklearn.externals import joblib
with joblib.parallel_backend('dask', scheduler_host='149.165.148.24:8786'):
    get_ipython().run_line_magic('time', 'mlp.fit(X_train, y_train)')

print("Training set score: %f" % mlp.score(X_train, y_train))
print("Test set score: %f" % mlp.score(X_test, y_test))

# ### With 100 iterations
mlp = MLPClassifier(hidden_layer_sizes=(100, 10), max_iter=100, solver='sgd',
                    verbose=10, random_state=1)

with joblib.parallel_backend('dask', scheduler_host='149.165.148.24:8786'):
    get_ipython().run_line_magic('time', 'mlp.fit(X_train, y_train)')
# Instead of creating a client directly, you can form your own cluster and
# then build a client on top of it.
from dask.distributed import Client, LocalCluster

cluster = LocalCluster()
client = Client(cluster)
# This allows you to control your cluster's properties.

# The next step is to use the dask joblib backend. You need to import
# parallel_backend from sklearn's bundled joblib, as shown below.
import dask_ml.joblib
from sklearn.externals.joblib import parallel_backend

with parallel_backend('dask'):
    # your normal scikit-learn code here
    from sklearn.ensemble import RandomForestClassifier
    model = RandomForestClassifier()

# I also want to parallelize custom workflows; this can be combined with the
# scikit-learn pipeline. Say I have a function that does some processing on
# the data, and I want to parallelize that processing using dask -- see the
# sketch below.
def process(data):
    return something
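# A minimal sketch of parallelizing the hypothetical `process` function over
# many inputs with joblib's dask backend. `items` and the doubling body are
# stand-ins for whatever the real workload is; the backend resolves to the
# LocalCluster client created above.
from sklearn.externals.joblib import Parallel, delayed, parallel_backend

def process_sketch(data):
    # placeholder work: whatever `process` actually does goes here
    return data * 2

items = range(100)
with parallel_backend('dask'):
    # each delayed call becomes a task scheduled on the dask cluster
    results = Parallel(verbose=5)(delayed(process_sketch)(d) for d in items)
print(results[:5])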