def test_sparse(self):
    params = {
        'exec_path': path_to_exec,
        'num_iterations': 1000,
        'verbose': False,
        'min_data_in_leaf': 1,
        'learning_rate': 0.1,
        'num_leaves': 5
    }

    clfs = [
        [sps.csr_matrix(X), Y, 'classification', GBMClassifier(**params)],
        [sps.csr_matrix(Xreg), Yreg, 'regression', GBMRegressor(**params)],
    ]

    for x, y, name, clf in clfs:
        clf.fit(x, y)
        if name == 'classification':
            score = metrics.accuracy_score(y, clf.predict(x))
            assert score > 0.9
        else:
            score = metrics.mean_squared_error(y, clf.predict(x))
            assert score < 1.
def test_early_stopping(self):
    cv_params = dict(test_size=test_size, random_state=seed)
    xtr, xte, ytr, yte = model_selection.train_test_split(X, Y, **cv_params)
    # split the regression data for the regression cases
    xtr_reg, xte_reg, ytr_reg, yte_reg = model_selection.train_test_split(
        Xreg, Yreg, **cv_params)

    params = dict(exec_path=path_to_exec, num_iterations=10000,
                  min_data_in_leaf=3, learning_rate=0.01,
                  num_leaves=2, early_stopping_round=2)

    clfs = [
        [xtr_reg, ytr_reg, xte_reg, yte_reg, 'regression',
         GBMRegressor(boosting_type='gbdt', **params)],
        [xtr_reg, ytr_reg, xte_reg, yte_reg, 'regression',
         GBMRegressor(boosting_type='dart', **params)],
        [xtr, ytr, xte, yte, 'classification',
         GBMClassifier(boosting_type='gbdt', **params)],
        [xtr, ytr, xte, yte, 'classification',
         GBMClassifier(boosting_type='dart', **params)],
    ]

    for x_train, y_train, x_test, y_test, name, clf in clfs:
        clf.fit(x_train, y_train, test_data=[(x_test, y_test)])
        if name == 'regression':
            score = metrics.mean_squared_error(y_test, clf.predict(x_test))
            assert score < 1. and clf.best_round < clf.param['num_iterations']
        else:
            score = metrics.accuracy_score(y_test, clf.predict(x_test))
            assert score > 0.7 and clf.best_round < clf.param['num_iterations']
class LightGBM(BaseAlgo):

    default_params = {'exec_path': 'lightgbm', 'num_threads': 4}

    def __init__(self, params):
        self.params = self.default_params.copy()
        self.params.update(params)

    def fit(self, X_train, y_train, X_eval=None, y_eval=None, seed=42,
            feature_names=None, eval_func=None, **kwa):
        params = self.params.copy()
        params['bagging_seed'] = seed
        params['feature_fraction_seed'] = seed + 3

        self.model = GBMClassifier(**params)

        if X_eval is None:
            self.model.fit(X_train, y_train)
        else:
            self.model.fit(X_train, y_train, test_data=[(X_eval, y_eval)])

    def predict(self, X):
        return self.model.predict(X)

    def predict_proba(self, X):
        return self.model.predict(X)
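# Minimal usage sketch for the LightGBM wrapper above. Assumptions: pylightgbm
# is installed, the default exec_path 'lightgbm' resolves to the LightGBM
# binary, and BaseAlgo imposes no extra requirements on fit(). The data here
# is synthetic and purely illustrative.
import numpy as np
from sklearn import datasets, model_selection

X, Y = datasets.make_classification(n_samples=500, n_features=20, random_state=42)
x_train, x_eval, y_train, y_eval = model_selection.train_test_split(
    X, Y, test_size=0.2, random_state=42)

model = LightGBM({'num_iterations': 100, 'learning_rate': 0.1})
model.fit(x_train, y_train, X_eval=x_eval, y_eval=y_eval, seed=42)
preds = model.predict(x_eval)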
def test_pickle(self):
    params = {'exec_path': path_to_exec, 'verbose': False}

    clfs = [
        [X, Y, GBMClassifier(**params)],
        [Xreg, Yreg, GBMRegressor(**params)],
    ]

    for x, y, clf in clfs:
        clf.fit(x, y)

        with open("clf_gbm.pkl", "wb") as f:
            pickle.dump(clf, f)
        with open("clf_gbm.pkl", "rb") as f:
            clf2 = pickle.load(f)

        assert np.allclose(clf.predict(x), clf2.predict(x))
def test_simple_fit(self):
    params = dict(exec_path=path_to_exec, num_iterations=100,
                  min_data_in_leaf=1, learning_rate=0.1,
                  num_leaves=5, max_depth=10)

    clfs = [
        [Xreg, Yreg, 'regression', GBMRegressor(boosting_type='gbdt', **params)],
        [Xreg, Yreg, 'regression', GBMRegressor(boosting_type='dart', **params)],
        [X, Y, 'classification', GBMClassifier(boosting_type='gbdt', **params)],
        [X, Y, 'classification', GBMClassifier(boosting_type='dart', **params)],
    ]

    for x, y, name, clf in clfs:
        clf.fit(x, y, init_scores=np.zeros(x.shape[0]))
        if name == 'regression':
            score = metrics.mean_squared_error(y, clf.predict(x))
            assert score < 1.
        else:
            score = metrics.accuracy_score(y, clf.predict(x))
            assert score > 0.9
def test_multiclass(self):
    clf = GBMClassifier(exec_path=path_to_exec,
                        min_data_in_leaf=1,
                        learning_rate=0.1,
                        num_leaves=5,
                        num_class=n_classes,
                        metric='multi_logloss',
                        application='multiclass',
                        num_iterations=100)

    clf.fit(Xmulti, Ymulti.argmax(-1))
    clf.fit(Xmulti, Ymulti.argmax(-1), test_data=[(Xmulti, Ymulti.argmax(-1))])

    score = metrics.accuracy_score(Ymulti.argmax(-1), clf.predict(Xmulti))
    assert score > 0.8
# Parameters
seed = 1337
path_to_exec = "~/Documents/apps/LightGBM/lightgbm"
np.random.seed(seed)  # for reproducibility

X, Y = datasets.make_classification(n_samples=1000, n_features=100,
                                    n_classes=2, random_state=seed)

# 'exec_path' is the path to lightgbm executable
gbm = GBMClassifier(exec_path=path_to_exec,
                    num_iterations=1000, learning_rate=0.05,
                    min_data_in_leaf=1, num_leaves=5,
                    metric='binary_logloss', verbose=True)

param_grid = {
    'learning_rate': [0.1, 0.04],
    'min_data_in_leaf': [1, 10],
    'bagging_fraction': [0.5, 0.9]
}
scorer = metrics.make_scorer(metrics.accuracy_score, greater_is_better=True)
clf = model_selection.GridSearchCV(gbm, param_grid, scoring=scorer, cv=2)
clf.fit(X, Y)
seed = 1337
path_to_exec = "~/Documents/apps/LightGBM/lightgbm"
np.random.seed(seed)  # for reproducibility

X, Y = datasets.make_classification(n_samples=1000, n_features=100,
                                    n_classes=2, random_state=seed)

# 'exec_path' is the path to lightgbm executable
gbm = GBMClassifier(exec_path=path_to_exec,
                    num_iterations=100, learning_rate=0.075,
                    min_data_in_leaf=1, bagging_freq=10,
                    metric='binary_error', early_stopping_round=10)

param_grid = {
    'learning_rate': [0.1, 0.04],
    'min_data_in_leaf': [1, 10],
    'bagging_fraction': [0.5, 0.9]
}
scorer = metrics.make_scorer(metrics.accuracy_score, greater_is_better=True)
clf = model_selection.GridSearchCV(gbm, param_grid, scoring=scorer, cv=2)
clf.fit(X, Y)
# -*- coding: utf-8 -*-
"""
@author: Ardalan MEHRANI <*****@*****.**>
@brief:
"""
import pickle
import numpy as np
from sklearn import datasets, metrics, model_selection
from pylightgbm.models import GBMClassifier

# Parameters
path_to_exec = "~/Documents/apps/LightGBM/lightgbm"
X, Y = datasets.make_classification(n_samples=1000, n_features=100,
                                    random_state=1337)

# 'exec_path' is the path to lightgbm executable
clf = GBMClassifier(exec_path=path_to_exec, verbose=False)
clf.fit(X, Y)
y_pred = clf.predict(X)
print("Accuracy: ", metrics.accuracy_score(Y, y_pred))

# The sklearn API models are picklable
print("Pickling sklearn API models")
pickle.dump(clf, open("clf_gbm.pkl", "wb"))
clf2 = pickle.load(open("clf_gbm.pkl", "rb"))
print(np.allclose(clf.predict(X), clf2.predict(X)))
# Parameters
seed = 1337
nfolds = 5
path_to_exec = "~/Documents/apps/LightGBM/lightgbm"
np.random.seed(seed)  # for reproducibility

X, Y = datasets.make_classification(n_samples=1000, n_features=500,
                                    random_state=seed)

# shuffle=True so that random_state actually takes effect
skf = model_selection.StratifiedKFold(n_splits=nfolds, shuffle=True,
                                      random_state=seed)

clf = GBMClassifier(exec_path=path_to_exec,
                    num_iterations=1000, min_data_in_leaf=1,
                    num_leaves=10, metric='binary_error',
                    learning_rate=0.1, early_stopping_round=10,
                    verbose=False)

best_rounds = []
scores = []

for i, (train_idx, valid_idx) in enumerate(skf.split(X, Y)):
    x_train, y_train = X[train_idx, :], Y[train_idx]
    x_valid, y_valid = X[valid_idx, :], Y[valid_idx]

    clf.fit(x_train, y_train, test_data=[(x_valid, y_valid)])
    best_round = clf.best_round

    score = metrics.accuracy_score(y_valid, clf.predict(x_valid))
    best_rounds.append(best_round)
    scores.append(score)
    print("Fold {}: accuracy {:.4f}, best round {}".format(i, score, best_round))

print("Mean accuracy: {:.4f}".format(np.mean(scores)))
result_preb = pd.DataFrame({'tel': valid['tel']})

# for step in [6]:
for step in [2]:
    print('---------------------', step)
    # train_X, test_X, train_Y, test_Y = train_test_split(index_data, index_lable, test_size=0.25, random_state=step)

    texec = u"E:\\code\\Debug\\lightgbm.exe"

    x_train, x_test, y_train, y_test = model_selection.train_test_split(
        index_data, index_lable, test_size=0.3)

    gbm = GBMClassifier(exec_path=texec,
                        metric='binary_error,auc',
                        early_stopping_round=10,
                        bagging_freq=10)

    param_grid = {'learning_rate': [0.1, 0.04], 'bagging_fraction': [0.5, 0.9]}
    scorer = metrics.make_scorer(metrics.accuracy_score, greater_is_better=True)
    clf = model_selection.GridSearchCV(gbm, param_grid, scoring=scorer, cv=2)
    clf.fit(x_train, y_train)

    print("Best score: ", clf.best_score_)
    print("Best params: ", clf.best_params_)
"""
@author: Ardalan MEHRANI <*****@*****.**>
@brief:
"""
import numpy as np
from sklearn import datasets, metrics, model_selection
from pylightgbm.models import GBMClassifier

# Parameters
seed = 1337
n_classes = 10
path_to_exec = "~/Documents/apps/LightGBM/lightgbm"
np.random.seed(seed)  # for reproducibility

X, Y = datasets.load_digits(return_X_y=True, n_class=n_classes)
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    X, Y, test_size=0.2, random_state=seed)

# 'exec_path' is the path to lightgbm executable
clf = GBMClassifier(exec_path=path_to_exec,
                    num_class=n_classes, metric='multi_logloss',
                    application='multiclass', num_iterations=1000,
                    min_data_in_leaf=1, num_leaves=5,
                    early_stopping_round=20)

clf.fit(x_train, y_train, test_data=[(x_test, y_test)])

y_prob = clf.predict_proba(x_test)
y_pred = y_prob.argmax(-1)

print("Log loss: ", metrics.log_loss(y_test, y_prob))
print("Accuracy: ", metrics.accuracy_score(y_test, y_pred))
print("Best round: ", clf.best_round)
import logging


def get_model_obj(modelType, n_clusters=None, dataframe=None, **kwargs):
    # assumption: callers pass the raw data as `dataframe` for the AR/SARIMAX
    # and structured-clustering branches (it was referenced but never bound
    # in the original function)
    if modelType == 'knn':
        from sklearn.neighbors import KNeighborsClassifier
        # 6 seems to give the best trade-off between accuracy and precision
        knn = KNeighborsClassifier(n_neighbors=6, **kwargs)
        return knn
    elif modelType == 'gaussianNB':
        from sklearn.naive_bayes import GaussianNB
        gnb = GaussianNB(**kwargs)
        return gnb
    elif modelType == 'multinomialNB':
        from sklearn.naive_bayes import MultinomialNB
        # TODO: figure out how to configure binomial distribution
        mnb = MultinomialNB(**kwargs)
        return mnb
    elif modelType == 'bernoulliNB':
        from sklearn.naive_bayes import BernoulliNB
        bnb = BernoulliNB(**kwargs)
        return bnb
    elif modelType == 'randomForest':
        from sklearn.ensemble import RandomForestClassifier
        rfc = RandomForestClassifier(random_state=234, **kwargs)
        return rfc
    elif modelType == 'svm':
        from sklearn.svm import SVC
        svc = SVC(random_state=0, probability=True, **kwargs)
        return svc
    elif modelType == 'LinearRegression':
        from sklearn import linear_model
        l_reg = linear_model.LinearRegression(**kwargs)
        return l_reg
    elif modelType == 'RidgeRegression':
        from sklearn.linear_model import Ridge
        if not kwargs:
            kwargs = {'alpha': 0.5}
        ridge_reg = Ridge(**kwargs)
        return ridge_reg
    elif modelType == 'RidgeRegressionCV':
        from sklearn import linear_model
        if not kwargs:
            kwargs = {'alphas': [0.1, 1.0, 10.0]}
        ridge_cv_reg = linear_model.RidgeCV(**kwargs)
        return ridge_cv_reg
    elif modelType == 'LassoRegression':
        from sklearn import linear_model
        if not kwargs:
            kwargs = {'alpha': 0.1}
        lasso_reg = linear_model.Lasso(**kwargs)
        return lasso_reg
    elif modelType == 'ElasticNetRegression':
        from sklearn import linear_model
        if not kwargs:
            kwargs = {'alpha': 0.1, 'l1_ratio': 0.7}
        enet_reg = linear_model.ElasticNet(**kwargs)
        return enet_reg
    elif modelType == 'LogisticRegression':
        from sklearn.linear_model import LogisticRegression
        log_reg = LogisticRegression(random_state=123, **kwargs)
        return log_reg
    elif modelType == 'RANSACRegression':
        from sklearn.linear_model import LinearRegression, RANSACRegressor
        ransac_model = RANSACRegressor(LinearRegression())
        return ransac_model
    elif modelType == 'kde':
        from sklearn.neighbors import KernelDensity
        kde = KernelDensity(kernel='gaussian', bandwidth=0.2, **kwargs)
        return kde
    elif modelType == 'AR':
        import statsmodels.api as sm
        # fit an AR model and forecast
        ar_fitted = sm.tsa.AR(dataframe).fit(maxlag=9, method='mle',
                                             disp=-1, **kwargs)
        # ts_forecast = ar_fitted.predict(start='2008', end='2050')
        return ar_fitted
    elif modelType == 'SARIMAX':
        import statsmodels.api as sm
        # 'riders' is the target column from the original context
        mod = sm.tsa.statespace.SARIMAX(dataframe.riders, trend='n',
                                        order=(0, 1, 0),
                                        seasonal_order=(1, 1, 1, 12),
                                        **kwargs)
        return mod
    elif modelType == 'sgd':
        # Online classifiers: http://scikit-learn.org/stable/auto_examples/linear_model/plot_sgd_comparison.html
        from sklearn.linear_model import SGDClassifier
        sgd = SGDClassifier(**kwargs)
        return sgd
    elif modelType == 'perceptron':
        from sklearn.linear_model import Perceptron
        perceptron = Perceptron(**kwargs)
        return perceptron
    elif modelType == 'xgboost':
        import xgboost as xgb
        xgbm = xgb.XGBClassifier(**kwargs)
        return xgbm
    elif modelType == 'baseNN':
        from keras.models import Sequential
        from keras.layers import Dense
        # create model; layer and compile settings are passed as dicts in kwargs
        model = Sequential()
        assert kwargs.get('inputParams', None)
        assert kwargs.get('outputParams', None)
        model.add(Dense(**kwargs['inputParams']))
        model.add(Dense(**kwargs['outputParams']))
        if kwargs.get('compileParams'):
            # Compile model
            # e.g. loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy']
            model.compile(**kwargs['compileParams'])
        return model
    elif modelType == 'lightGBMRegression':
        from pylightgbm.models import GBMRegressor
        # exec_path is not set here; pylightgbm falls back to the
        # LIGHTGBM_EXEC environment variable
        lgbm_lreg = GBMRegressor(num_iterations=100, early_stopping_round=10,
                                 num_leaves=10, min_data_in_leaf=10)
        return lgbm_lreg
    elif modelType == 'lightGBMBinaryClass':
        from pylightgbm.models import GBMClassifier
        lgbm_bc = GBMClassifier(metric='binary_error', min_data_in_leaf=1)
        return lgbm_bc
    # Clustering models
    elif modelType == 'KMeans':
        assert n_clusters, "Number of clusters argument mandatory"
        from sklearn.cluster import KMeans
        # seed of 10 for reproducibility.
        clusterer = KMeans(n_clusters=n_clusters, random_state=10)
        return clusterer
    elif modelType == 'dbscan':
        if not n_clusters:
            logging.warning("Number of clusters irrelevant for cluster type: %s",
                            modelType)
        from sklearn.cluster import DBSCAN
        clusterer = DBSCAN(eps=0.5)
        return clusterer
    elif modelType == 'affinity_prop':
        if not n_clusters:
            logging.warning("Number of clusters irrelevant for cluster type: %s",
                            modelType)
        from sklearn.cluster import AffinityPropagation
        clusterer = AffinityPropagation(damping=.9, preference=-200)
        return clusterer
    elif modelType == 'spectral':
        assert n_clusters, "Number of clusters argument mandatory"
        from sklearn.cluster import SpectralClustering
        clusterer = SpectralClustering(n_clusters=n_clusters,
                                       eigen_solver='arpack',
                                       affinity="nearest_neighbors")
        return clusterer
    elif modelType == 'birch':
        if not n_clusters:
            logging.warning("Number of clusters irrelevant for cluster type: %s",
                            modelType)
        from sklearn.cluster import Birch
        clusterer = Birch(n_clusters=2)
        return clusterer
    elif modelType == 'agglomerativeCluster':
        from sklearn.cluster import AgglomerativeClustering
        from sklearn.neighbors import kneighbors_graph
        # connectivity matrix for structured Ward
        connectivity = kneighbors_graph(dataframe, n_neighbors=10,
                                        include_self=False)
        # make connectivity symmetric
        connectivity = 0.5 * (connectivity + connectivity.T)
        clusterer = AgglomerativeClustering(n_clusters=n_clusters,
                                            linkage='ward',
                                            connectivity=connectivity)
        return clusterer
    elif modelType == 'meanShift':
        from sklearn import cluster
        # estimate bandwidth for mean shift
        bandwidth = cluster.estimate_bandwidth(dataframe, quantile=0.3)
        clusterer = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
        return clusterer
    elif modelType == 'gmm':
        from sklearn import mixture
        gmm = mixture.GaussianMixture(n_components=5, covariance_type='full')
        return gmm
    elif modelType == 'dgmm':
        from sklearn import mixture
        dgmm = mixture.BayesianGaussianMixture(n_components=5,
                                               covariance_type='full')
        return dgmm
    else:
        raise ValueError('Unknown model type: see utils.py for available')
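# Quick usage sketch for the factory above, assuming scikit-learn is
# available; the model names are the ones handled in get_model_obj and the
# data is synthetic, purely for illustration.
from sklearn import datasets

X, y = datasets.make_classification(n_samples=200, n_features=10, random_state=0)

rfc = get_model_obj('randomForest', n_estimators=50)
rfc.fit(X, y)
print("random forest train accuracy:", rfc.score(X, y))

km = get_model_obj('KMeans', n_clusters=3)
km.fit(X)
print("k-means inertia:", km.inertia_)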
def get_classifiers(names):
    classifiers = []
    for name in names:
        if name == 'LogisticRegression':
            clf = LogisticRegression(penalty='l1', C=0.007,
                                     random_state=CONFIG['RANDOM_SEED'])
        elif name == 'XGBClassifier':
            clf = XGBClassifier(base_score=0.5,
                                colsample_bylevel=1,
                                colsample_bytree=0.9,
                                gamma=0.7,
                                learning_rate=0.1,
                                max_delta_step=0,
                                max_depth=6,
                                min_child_weight=9.0,
                                missing=None,
                                n_estimators=1500,
                                nthread=-1,
                                objective='binary:logistic',
                                reg_alpha=0,
                                reg_lambda=1,
                                # scale_pos_weight=1,
                                seed=CONFIG['RANDOM_SEED'],
                                silent=True,
                                subsample=0.9)
        elif name == 'ExtraTreesClassifier':
            clf = ExtraTreesClassifier(n_estimators=50,
                                       max_depth=None,
                                       min_samples_split=10,
                                       min_samples_leaf=5,
                                       max_features='auto',
                                       n_jobs=-1,
                                       random_state=CONFIG['RANDOM_SEED'])
        elif name == 'GBMClassifier':
            clf = GBMClassifier(exec_path="~/LightGBM/lightgbm",
                                config="",
                                application='binary',
                                num_iterations=10,
                                learning_rate=0.1,
                                num_leaves=127,
                                tree_learner="serial",
                                num_threads=-1,
                                min_data_in_leaf=100,
                                metric='binary_logloss',
                                is_training_metric=False,
                                feature_fraction=1.,
                                feature_fraction_seed=2,
                                bagging_fraction=1.,
                                bagging_freq=0,
                                bagging_seed=3,
                                metric_freq=1,
                                early_stopping_round=0,
                                max_bin=255,
                                is_unbalance=False,
                                num_class=1,
                                boosting_type='gbdt',
                                min_sum_hessian_in_leaf=10,
                                drop_rate=0.01,
                                drop_seed=4,
                                max_depth=-1,
                                lambda_l1=0.,
                                lambda_l2=0.,
                                min_gain_to_split=0.,
                                verbose=False,
                                model=None)
        else:
            raise ValueError('Unknown classifier name.')
        classifiers.append(clf)
    return classifiers
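# Hypothetical usage of get_classifiers: a CONFIG dict with a RANDOM_SEED key
# and X_train/y_train, X_test/y_test splits are assumed to exist in the
# calling scope; none of these names come from the original snippet.
CONFIG = {'RANDOM_SEED': 42}  # assumed global read inside get_classifiers

clf = get_classifiers(['ExtraTreesClassifier'])[0]
clf.fit(X_train, y_train)
print("accuracy:", clf.score(X_test, y_test))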
# Parameters
seed = 1337
path_to_exec = "~/Documents/apps/LightGBM/lightgbm"
np.random.seed(seed)  # for reproducibility

X, Y = datasets.make_classification(n_samples=1000, n_features=100,
                                    n_classes=2, random_state=seed)
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    X, Y, test_size=0.2, random_state=seed)

params = {'exec_path': path_to_exec,
          'num_iterations': 1000, 'learning_rate': 0.01,
          'min_data_in_leaf': 1, 'num_leaves': 5,
          'metric': 'binary_error', 'verbose': False,
          'early_stopping_round': 20}

clfs = [
    ['gbdt', GBMClassifier(boosting_type='gbdt', **params)],
    ['dart', GBMClassifier(boosting_type='dart', drop_rate=0.02,
                           drop_seed=4, **params)],
]

for boosting_type, clf in clfs:
    clf.fit(x_train, y_train, test_data=[(x_test, y_test)])
    y_prob = clf.predict_proba(x_test)
    y_pred = y_prob.argmax(-1)

    print("booster {} loss: {}, accuracy: {}, best round: {}".format(
        boosting_type,
        metrics.log_loss(y_test, y_prob),
        metrics.accuracy_score(y_test, y_pred),
        clf.best_round))
bst1 = xgb.train(params, dtrain, params['n'])

# ------------------------------------------------------------------
params = {
    'exec_path': path_to_exec,
    'num_iterations': 108,
    'learning_rate': 0.079,
    'num_leaves': 13,
    'metric': 'binary_error',
    'min_sum_hessian_in_leaf': 1,
    'bagging_fraction': 0.642,
    'bagging_freq': 1,
    'verbose': 0
}
bst2 = GBMClassifier(boosting_type='gbdt', **params)
bst2.fit(X_train, y_train)

# ------------------------------------------------------------------
params_est = {
    'n_estimators': 300,
    'loss': 'exponential',
    'learning_rate': 0.08,
    'subsample': 0.6910000000000001,
    'min_samples_leaf': 340,
    'max_features': 53,
    'random_state': 1
}
bst3 = GradientBoostingClassifier(**params_est)
bst3.fit(X_train, y_train)

# ------------------------------------------------------------------
from keras.callbacks import Callback as keras_clb
# Parameters
seed = 1337
test_size = 0.2
path_to_exec = "~/Documents/apps/LightGBM/lightgbm"
np.random.seed(seed)  # for reproducibility

X, Y = datasets.make_classification(n_samples=1000, n_features=100,
                                    random_state=seed)
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    X, Y, test_size=test_size, random_state=seed)

# 'exec_path' is the path to lightgbm executable
clf = GBMClassifier(exec_path=path_to_exec,
                    num_iterations=1000, learning_rate=0.01,
                    min_data_in_leaf=1, num_leaves=5,
                    metric='binary_error', early_stopping_round=20)

clf.fit(x_train, y_train, test_data=[(x_test, y_test)])

y_prob = clf.predict_proba(x_test)
y_pred = y_prob.argmax(-1)

print("Log loss: ", metrics.log_loss(y_test, y_prob))
print("Accuracy: ", metrics.accuracy_score(y_test, y_pred))
print("Best round: ", clf.best_round)
""" import numpy as np from sklearn import datasets, metrics, model_selection from pylightgbm.models import GBMClassifier # params seed = 1337 np.random.seed(seed) # for reproducibility X, Y = datasets.make_classification(n_samples=1000, n_features=100, random_state=seed) x_train, x_test, y_train, y_test = model_selection.train_test_split( X, Y, test_size=0.2, random_state=seed) # 'exec_path' is the path to lightgbm executable clf = GBMClassifier(exec_path="~/Documents/apps/LightGBM/lightgbm", num_iterations=100, learning_rate=0.1, min_data_in_leaf=1, metric='binary_error', early_stopping_round=10) clf.fit(x_train, y_train, test_data=[(x_test, y_test)]) y_prob = clf.predict_proba(x_test) y_pred = y_prob.argmax(-1) print("Log loss: ", metrics.log_loss(y_test, y_prob)) print("Accuracy: ", metrics.accuracy_score(y_test, y_pred))
X, Y = datasets.make_classification(n_samples=1000, n_features=100,
                                    n_classes=n_classes,
                                    random_state=seed)
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    X, Y, test_size=test_size, random_state=seed)

params = {'exec_path': path_to_exec,
          'num_iterations': 1000, 'learning_rate': 0.01,
          'early_stopping_round': 20, 'min_data_in_leaf': 1,
          'num_leaves': 5, 'verbose': False}

clf_binary = GBMClassifier(application='binary',
                           metric='binary_error', **params)
clf_multiclass = GBMClassifier(application='multiclass',
                               num_class=n_classes,
                               metric='multi_error', **params)

for clf in [clf_binary, clf_multiclass]:
    clf.fit(x_train, y_train, test_data=[(x_test, y_test)])
    y_prob = clf.predict_proba(x_test)
    y_pred = y_prob.argmax(-1)

    print("Log loss: ", metrics.log_loss(y_test, y_prob))
    print("Accuracy: ", metrics.accuracy_score(y_test, y_pred))
def test_grid_search(self):
    param_grid = {
        'learning_rate': [0.01, 0.1, 1],
        'num_leaves': [2, 5, 50],
        'min_data_in_leaf': [1, 10, 100],
        'bagging_fraction': [0.1, 1]
    }
    params = {
        'exec_path': path_to_exec,
        'num_threads': 2,
        'num_iterations': 100,
        'learning_rate': 0.1,
        'min_data_in_leaf': 1,
        'num_leaves': 10,
        'bagging_freq': 2,
        'verbose': False
    }

    clfs = [
        [Xreg, Yreg, 'regression',
         GBMRegressor(boosting_type='gbdt', metric='l2', **params)],
        [Xreg, Yreg, 'regression',
         GBMRegressor(boosting_type='dart', metric='l2', **params)],
        [X, Y, 'classification',
         GBMClassifier(boosting_type='gbdt', metric='binary_logloss', **params)],
        [X, Y, 'classification',
         GBMClassifier(boosting_type='dart', metric='binary_logloss', **params)],
    ]

    for x, y, name, clf in clfs:
        if name == 'regression':
            scorer = metrics.make_scorer(metrics.mean_squared_error,
                                         greater_is_better=False)
            grid = model_selection.GridSearchCV(clf, param_grid,
                                                scoring=scorer, cv=2,
                                                refit=True)
            grid.fit(x, y)
            score = metrics.mean_squared_error(y, grid.predict(x))
            print(score)
            assert score < 2000
        else:
            scorer = metrics.make_scorer(metrics.accuracy_score,
                                         greater_is_better=True)
            grid = model_selection.GridSearchCV(clf, param_grid,
                                                scoring=scorer, cv=2,
                                                refit=True)
            grid.fit(x, y)
            score = metrics.accuracy_score(y, grid.predict(x))
            print(score)
            assert score > .9
# Parameters
seed = 1337
np.random.seed(seed)  # for reproducibility

X, Y = datasets.make_classification(n_samples=1000, n_features=100,
                                    n_classes=2, random_state=seed)

# 'exec_path' is the path to lightgbm executable
gbm = GBMClassifier(exec_path="~/Documents/apps/LightGBM/lightgbm",
                    num_iterations=100, learning_rate=0.075,
                    min_data_in_leaf=1, bagging_freq=10,
                    metric='binary_error', early_stopping_round=10)

param_grid = {
    'learning_rate': [0.1, 0.04],
    'min_data_in_leaf': [1, 10],
    'bagging_fraction': [0.5, 0.9]
}
scorer = metrics.make_scorer(metrics.accuracy_score, greater_is_better=True)
clf = model_selection.GridSearchCV(gbm, param_grid, scoring=scorer, cv=2)
clf.fit(X, Y)
X_train, X_test, y_train, y_test = train_test_split(x_data, y_data,
                                                    test_size=0.10,
                                                    random_state=_seed)

cl = GBMClassifier(exec_path=exec_path,
                   boosting_type='gbdt',  # gbdt | dart | goss
                   learning_rate=LEARNING_RATE,
                   num_leaves=64,
                   min_data_in_leaf=1,
                   min_sum_hessian_in_leaf=1e-4,
                   num_iterations=5000,
                   num_threads=4,
                   early_stopping_round=EARLY_STOPPING,
                   drop_rate=0.0001,
                   max_depth=6,
                   lambda_l1=0.,
                   lambda_l2=0.,
                   max_bin=63,
                   feature_fraction=1.0,
                   # bagging_fraction=0.5,
                   # bagging_freq=3,
                   verbose=True)

cl.fit(X_train, y_train, test_data=[(X_test, y_test)])
# </editor-fold>

# <editor-fold desc="Submission generation">