def build_model(model_type, num_targets=1):
    """Create a scikit-learn regressor for *model_type*.

    Parameters
    ----------
    model_type : str
        One of the keys of the factory table below
        (e.g. 'linear_regression', 'svm', 'constant_mean', ...).
    num_targets : int, default 1
        Number of output columns in the dataset; when > 1 the base
        regressor is wrapped in a MultiOutputRegressor so a separate
        model is fitted per target.

    Returns
    -------
    An unfitted scikit-learn estimator.

    Raises
    ------
    ValueError
        If *model_type* is not recognised.
    """
    # Factories (not instances) so only the requested model is constructed.
    factories = {
        'linear_regression': lambda: linear_model.SGDRegressor(),
        'random_forests': lambda: ensemble.RandomForestRegressor(),
        'gradient_boosting': lambda: ensemble.GradientBoostingRegressor(),
        'extra_trees': lambda: ensemble.ExtraTreesRegressor(),
        'bagging': lambda: ensemble.BaggingRegressor(),
        'adaboost': lambda: ensemble.AdaBoostRegressor(),
        'neural_network': lambda: neural_network.MLPRegressor(),
        'svm': lambda: svm.SVR(verbose=1),
        # Pass ``strategy`` by keyword: it is keyword-only in modern
        # scikit-learn, so positional use raises a TypeError there.
        'constant_mean': lambda: dummy.DummyRegressor(strategy='mean'),
        'constant_median': lambda: dummy.DummyRegressor(strategy='median'),
        'constant_zero': lambda: dummy.DummyRegressor(strategy='constant', constant=0),
    }
    try:
        base = factories[model_type]()
    except KeyError:
        # Same exception type/message as before; suppress the KeyError context.
        raise ValueError('invalid model type: {}'.format(model_type)) from None
    # multiple outputs in the dataset => fit a separate regressor to each
    if num_targets > 1:
        return multioutput.MultiOutputRegressor(base)
    return base
def deserialize_gradient_boosting_regressor(model_dict):
    """Rebuild a fitted GradientBoostingRegressor from its JSON-style dict.

    Inverse of the matching serializer: reconstructs the tree ensemble,
    the init estimator and the loss object by writing the fitted
    attributes back onto a freshly constructed model.

    NOTE(review): relies on sklearn private internals (``_gb_losses``,
    direct ``__dict__`` assignment) — version-sensitive; verify against
    the installed scikit-learn release.
    """
    # Recreate an unfitted model with the original hyper-parameters.
    model = GradientBoostingRegressor(**model_dict['params'])
    # Rebuild every decision tree, then restore the original 2-D layout.
    trees = [deserialize_decision_tree_regressor(tree) for tree in model_dict['estimators_']]
    model.estimators_ = np.array(trees).reshape(model_dict['estimators_shape'])
    # Restore the initial (constant) estimator when it was a DummyRegressor.
    if 'init_' in model_dict and model_dict['init_']['meta'] == 'dummy':
        model.init_ = dummy.DummyRegressor()
        # Overwrite the instance state wholesale, then drop the serializer's
        # bookkeeping key so it does not leak into the estimator.
        model.init_.__dict__ = model_dict['init_']
        model.init_.__dict__.pop('meta')
    # Fitted bookkeeping attributes.
    model.train_score_ = np.array(model_dict['train_score_'])
    model.max_features_ = model_dict['max_features_']
    model.n_features_ = model_dict['n_features_']
    # Map the serialized loss name back to its sklearn loss object
    # (1 = single output); unknown names leave loss_ unset.
    if model_dict['loss_'] == 'ls':
        model.loss_ = _gb_losses.LeastSquaresError(1)
    elif model_dict['loss_'] == 'lad':
        model.loss_ = _gb_losses.LeastAbsoluteError(1)
    elif model_dict['loss_'] == 'huber':
        model.loss_ = _gb_losses.HuberLossFunction(1)
    elif model_dict['loss_'] == 'quantile':
        model.loss_ = _gb_losses.QuantileLossFunction(1)
    # Optional priors on the init estimator.
    if 'priors' in model_dict:
        model.init_.priors = np.array(model_dict['priors'])
    return model
def build_sklearn(self, model_id, model_params):
    """Method that builds models implemented in sklearn"""
    # Lookup table: model identifier -> sklearn estimator class.
    catalogue = {
        'sklearn_LogisticRegressionCV': linear_model.LogisticRegressionCV,
        'sklearn_LogisticRegression': linear_model.LogisticRegression,
        'sklearn_MLPClassifier': neural_network.MLPClassifier,
        'sklearn_GaussianNB': naive_bayes.GaussianNB,
        'sklearn_MultinomialNB': naive_bayes.MultinomialNB,
        'sklearn_BernoulliNB': naive_bayes.BernoulliNB,
        'sklearn_RandomForestClassifier': ensemble.RandomForestClassifier,
        'sklearn_SVC': svm.SVC,
        'sklearn_AdaBoostClassifier': ensemble.AdaBoostClassifier,
        'sklearn_SGDClassifier': linear_model.SGDClassifier,
        'sklearn_PassiveAggressiveClassifier': linear_model.PassiveAggressiveClassifier,
        'sklearn_RidgeClassifier': linear_model.RidgeClassifier,
        'sklearn_DummyClassifier': dummy.DummyClassifier,
        'sklearn_KNeighborsClassifier': neighbors.KNeighborsClassifier,
        'sklearn_DecisionTreeClassifier': tree.DecisionTreeClassifier,
        'sklearn_LinearRegression': linear_model.LinearRegression,
        'sklearn_LassoCV': linear_model.LassoCV,
        'sklearn_RidgeCV': linear_model.RidgeCV,
        'sklearn_Ridge': linear_model.Ridge,
        'sklearn_DummyRegressor': dummy.DummyRegressor,
        'sklearn_RandomForestRegressor': ensemble.RandomForestRegressor,
        'sklearn_GradientBoostingRegressor': ensemble.GradientBoostingRegressor,
        'sklearn_MLPRegressor': neural_network.MLPRegressor,
        'sklearn_KNeighborsRegressor': neighbors.KNeighborsRegressor,
        'sklearn_SVR': svm.SVR,
        'sklearn_SGDRegressor': linear_model.SGDRegressor,
        'sklearn_DecisionTreeRegressor': tree.DecisionTreeRegressor,
    }
    constructor = catalogue.get(model_id)
    if constructor is None:
        # Unknown identifier: same fall-through result as the original chain.
        return None
    return constructor(**model_params)
def medDumb(X_train=None, X_test=None, y_train=None, y_test=None):
    """Median baseline regressor.

    Called with no (or empty) training data it returns the model's
    display name; otherwise it fits a median DummyRegressor and returns
    ``(rmse, predictions)`` on the test set.

    Parameters default to ``None`` instead of the original mutable
    ``[]`` defaults (shared-state pitfall); empty sequences still take
    the "return the name" branch, so behavior is unchanged.
    """
    if X_train is None or len(X_train) == 0:
        return "Médiane (naïf)"  # string name shown by the caller
    else:
        dum = dummy.DummyRegressor(strategy='median')
        dum.fit(X_train, y_train)
        y_pred_dum = dum.predict(X_test)
        # Root-mean-squared error plus the raw predictions.
        return np.sqrt(metrics.mean_squared_error(y_test, y_pred_dum)), y_pred_dum
def regBaseline(data):
    """Score constant-prediction baselines on *data*.

    Fits a mean and a median DummyRegressor on a fixed train/test split
    and returns a dict mapping strategy name -> R^2 test score, together
    with the full target vector.
    """
    X_all, y_all, attribute_names = data.get_data(
        target=data.default_target_attribute, return_attribute_names=True)
    X_fit, X_eval, y_fit, y_eval = train_test_split(X_all, y_all, random_state=0)

    def _score(strategy):
        # Fit the constant predictor, then evaluate on the held-out split.
        baseline = dummy.DummyRegressor(strategy=strategy)
        baseline.fit(X_fit, y_fit)
        return baseline.score(X_eval, y_eval)

    baseDict = {strategy: _score(strategy) for strategy in ['mean', 'median']}
    return baseDict, y_all
def __pred_randomly(self, X_train, y_train, X_test):
    """Baseline prediction that ignores the features entirely.

    The real feature matrices are replaced by constant single-column
    placeholders, so the dummy model can only exploit the label
    distribution (classifier for 'c'-type datasets, regressor otherwise).
    """
    placeholder_train = [[0] for _ in X_train]
    placeholder_test = [[0] for _ in X_test]
    if self.dataset.type == 'c':
        estimator = dummy.DummyClassifier(strategy=self.dummy_strategy)
    else:
        estimator = dummy.DummyRegressor()
    estimator.fit(placeholder_train, y_train)
    return estimator.predict(placeholder_test)
def zero_cost_model(self, X, y, add_to_model=False):
    """Build the featureless baseline model (zero acquisition cost).

    Returns ``(model, cost, features)`` where *cost* is 0 and *features*
    is empty; optionally records the model at the head of the instance's
    bookkeeping lists so it is always considered first.

    Raises
    ------
    TypeError
        If the wrapped base model is neither a classifier nor a regressor.
    """
    # ``strategy`` is keyword-only in modern scikit-learn; passing it
    # positionally (as before) raises a TypeError there.
    if self.base_model._estimator_type == 'classifier':
        model = dummy.DummyClassifier(strategy="prior")
    elif self.base_model._estimator_type == 'regressor':
        model = dummy.DummyRegressor(strategy="mean")
    else:
        raise TypeError("sklearn Classifier or Regressor required!")
    cost = 0
    features = []
    # Fit on the empty feature selection so predict() works downstream.
    model.fit(self.selectfeats(X, features), y)
    if add_to_model:
        self.model_costs.insert(0, cost)
        self.model_features.insert(0, features)
        self.models.insert(0, model)
    return (model, cost, features)
def fit_baseline(self, x, y):
    """Fit the baseline for the MetaEstimator.

    Depending on the loss function, determine the optimal constant
    predictor based on the training outputs.

    Parameters
    ----------
    x : array-like of training inputs (ignored by the dummy models).
    y : array-like of training outputs.
    """
    # Determine if regression or classification problem: many distinct
    # outputs => regression, few => classification.
    if self.method_type is None:
        is_above = len(np.unique(y, axis=0)) > self.cutoff_categorical
        self.method_type = ('classif', 'regr')[is_above]
    # Fit a Dummy (constant) estimator
    if self.method_type == 'regr':
        self.fitted = dummy.DummyRegressor().fit(x, y)
    else:
        self.fitted = dummy.DummyClassifier().fit(x, y)
        # Reuse the estimator fitted above instead of fitting a second,
        # identical DummyClassifier just to read its classes_ (same data,
        # same deterministic result — the extra fit was pure waste).
        self.classes = self.fitted.classes_
# Script preamble: imports plus global run configuration for the
# regressor/classifier comparison driven by ModelComparer.
from sklearn import preprocessing, dummy, svm
from sklearn import linear_model, neighbors, ensemble

# Progress markers: matplotlib and the project modules are slow to import.
print("PyPlot...", end='', flush=True)
import matplotlib.pyplot as plt
print("ALJI code...", end='', flush=True)
from Framing import getFrame, getEmpathCols
from ModelComparer import ModelComparer
print("Done!")
''' RUNNING OPTIONS FOR MODELS '''
visualizeCGI = True
folds = 5  # cross-validation fold count

# copy=False scales in place to avoid duplicating the feature matrix.
scaler = preprocessing.StandardScaler(copy=False)
# scaler = preprocessing.MinMaxScaler(copy=False)

# Candidate regressors, from a constant-mean baseline up to ensembles.
regressors = [
    dummy.DummyRegressor(),
    svm.LinearSVR(tol=1),
    svm.SVR(kernel='rbf', gamma='scale'),
    linear_model.Ridge(),
    linear_model.Lasso(),
    linear_model.ElasticNet(),
    ensemble.RandomForestRegressor(),
    ensemble.GradientBoostingRegressor(),
    ensemble.AdaBoostRegressor()
]
# Candidate classifiers.
# NOTE(review): this list continues beyond the visible chunk.
classifiers = [
    dummy.DummyClassifier(),
    svm.LinearSVC(tol=1),
    svm.SVC(kernel='rbf', gamma='scale'),
    neighbors.KNeighborsClassifier(),
def fabrication_modele_feature_delay(input_X_Train, d_features, n_feature_cible, isRidge=False, isLasso=False, input_X_test=None):
    """Fit models predicting the single feature *n_feature_cible*.

    Preprocesses *input_X_Train* (one-hot categoricals + standardised
    numericals), then fits a LinearRegression and a mean-baseline
    DummyRegressor (plus optional RidgeCV / LassoCV), timing each step.

    Returns ``(F_Model_Optimisation, F_Model)`` — the fitted
    preprocessors and a dict of {model name: {'Model', 'Temps'}} —
    or ``(None, None)`` when the target feature is missing.
    """
    name = 'fabrication_modele_feature_delay'
    data = input_X_Train
    # Make sure the target feature is actually present.
    if n_feature_cible not in data.columns:
        log_info('!!!! ERREUR dans {} : feature {} non presente'.format(
            name, n_feature_cible))
        return None, None
    # strptime(strftime(...)) round-trip truncates "now" to whole seconds.
    tstamp1 = datetime.strptime(datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                                '%Y-%m-%d %H:%M:%S')
    # Useful column lists.
    l_numerical = d_features['l_numerical']
    l_categoriel = d_features['l_categoriel']
    # Start of preparation phase.
    log_info('-- DEBUT de {} : {} - {}'.format(name, 'Preparation', tstamp1))
    ## 1 - Fit the categorical encoder.
    encoder = OneHotEncoder(sparse=True)
    #encoder.fit(data[l_categoriel])
    #### HACK: input_X_test is appended so OneHotEncoder never meets
    ## categories it has not seen before at transform time !!!!
    #### NOTE(review): this likely adds a lot of time.
    ###tmp_data = input_X_Train.copy()
    tmp_data = input_X_Train
    tmp_data = tmp_data.append(input_X_test)
    encoder.fit(tmp_data[l_categoriel])
    X_data = data[(l_numerical + l_categoriel)]
    Y_data = data[n_feature_cible]
    ## 3 - General modelling preparation.
    ## 3_1 - Standardise the numerical columns.
    scaler = StandardScaler()
    #### Fit
    scaler.fit(X_data[l_numerical])
    #### Transform
    X_data_numerical = sparse.csr_matrix(scaler.transform(X_data[l_numerical]))
    ## 3_2 - Encode the categorical columns.
    X_data_categoriel = encoder.transform(X_data[l_categoriel])
    ## 3_3 - Assemble the optimised (sparse) design matrix.
    #print('X_train_numerical.shape = ', X_train_numerical.shape)
    #print('X_train_categoriel.shape = ', X_train_categoriel.shape)
    Opt_X_data = sparse.hstack((X_data_numerical, X_data_categoriel))
    #Opt_X_test = sparse.hstack(X_test_numerical, X_test_categoriel)
    tstamp2 = datetime.strptime(datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                                '%Y-%m-%d %H:%M:%S')
    log_info('-- FIN de {} : {} - {} -> {}\n'.format(name, 'Preparation',
                                                     tstamp2, tstamp2 - tstamp1))
    ## 4 - Modelling.
    ## 4_1 - Linear modelling.
    log_info('-- DEBUT de {} : {} - {}'.format(name, 'Modélisation', tstamp2))
    tstamp_lr1 = datetime.strptime(
        datetime.now().strftime('%Y-%m-%d %H:%M:%S'), '%Y-%m-%d %H:%M:%S')
    lr = LinearRegression()
    lr.fit(Opt_X_data, Y_data)
    tstamp_lr2 = datetime.strptime(
        datetime.now().strftime('%Y-%m-%d %H:%M:%S'), '%Y-%m-%d %H:%M:%S')
    ## Naive (mean-baseline) regression.
    dum = dummy.DummyRegressor(strategy='mean')
    tstamp_dum1 = datetime.strptime(
        datetime.now().strftime('%Y-%m-%d %H:%M:%S'), '%Y-%m-%d %H:%M:%S')
    dum.fit(Opt_X_data, Y_data)
    tstamp_dum2 = datetime.strptime(
        datetime.now().strftime('%Y-%m-%d %H:%M:%S'), '%Y-%m-%d %H:%M:%S')
    tstamp3 = datetime.strptime(datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                                '%Y-%m-%d %H:%M:%S')
    log_info('-- FIN de {} : {} - {} -> {}\n'.format(name, 'Modélisation',
                                                     tstamp3, tstamp3 - tstamp2))
    ## 5 - Package the results (model + fit duration per entry).
    #N_Data = {'X_train': X_train, 'Y_train': Y_train,
    #          'X_test': X_test, 'Y_test': Y_test}
    F_Model_Optimisation = {'OneHotEncoder': encoder, 'StandardScaler': scaler}
    F_Model = {
        'LinearRegression': {
            'Model': lr,
            'Temps': tstamp_lr2 - tstamp_lr1
        }
    }
    F_Model['Naive'] = {'Model': dum, 'Temps': tstamp_dum2 - tstamp_dum1}
    if isRidge:
        # Optional ridge regression with built-in CV.
        ridge = RidgeCV(fit_intercept=False, cv=3)
        tstamp_ridge1 = datetime.strptime(
            datetime.now().strftime('%Y-%m-%d %H:%M:%S'), '%Y-%m-%d %H:%M:%S')
        ridge.fit(Opt_X_data, Y_data)
        tstamp_ridge2 = datetime.strptime(
            datetime.now().strftime('%Y-%m-%d %H:%M:%S'), '%Y-%m-%d %H:%M:%S')
        F_Model['RidgeCV'] = {
            'Model': ridge,
            'Temps': tstamp_ridge2 - tstamp_ridge1
        }
    if isLasso:
        # Optional lasso regression with built-in CV.
        lasso = LassoCV(fit_intercept=False, cv=3)
        tstamp_lasso1 = datetime.strptime(
            datetime.now().strftime('%Y-%m-%d %H:%M:%S'), '%Y-%m-%d %H:%M:%S')
        lasso.fit(Opt_X_data, Y_data)
        tstamp_lasso2 = datetime.strptime(
            datetime.now().strftime('%Y-%m-%d %H:%M:%S'), '%Y-%m-%d %H:%M:%S')
        F_Model['LassoCV'] = {
            'Model': lasso,
            'Temps': tstamp_lasso2 - tstamp_lasso1
        }
    return F_Model_Optimisation, F_Model
def _tune_classifiers(self, test_size, classifiers, min_pca, search_n_jobs):
    """Grid-search tune classifiers for this element's output feature.

    Populates ``self.tune_params`` with per-classifier results and
    ``self.estimator`` with the best (optionally calibrated) model.

    Parameters
    ----------
    test_size : fraction held out when no explicit testing subset exists.
    classifiers : dict of {classifier name: {param name: search space}}.
    min_pca : minimum feature count before a PCA step is inserted, or None.
    search_n_jobs : NOTE(review): accepted but unused — the grid search
        hard-codes n_jobs=2; confirm intent.

    Raises
    ------
    KeyError
        If no training data is available.
    """
    print('Tuning classifiers for output-feature "{0}" for path "{1}".'.format(
        self.output_feature.feature_name, self.path))
    if not self.subset_has_data('training'):
        print(' -> Missing training data...')
        raise KeyError('Missing training data')
    self.tune_params = {}
    self.tune_params['classifiers'] = {}
    if self.subset_has_data('testing'):
        # Use the pre-defined testing subset as the held-out set.
        X_tr = self.X
        y_tr = self.y
        X_te = self.data['testing']['X']
        y_te = self.data['testing']['y']
    else:
        # Create training and testing partition (validation
        # split is handled by the 'StratifiedKFold' object)
        X_tr, X_te, y_tr, y_te = skms.train_test_split(
            self.X,
            self.y,
            stratify=self.y,
            shuffle=True,
            random_state=0,
            test_size=test_size)
    print(' -> Train size: ({0}); Test size: ({1}) Number of classes ({2}).'.format(
        X_tr.shape[0], X_te.shape[0], len(self.classes)))
    if len(self.classes) > 1:
        for classifier_name, classifier_value in classifiers.items():
            print(' -> Tuning "{0}"'.format(classifier_name))
            # Search grid
            dict_grid = {}
            self.tune_params['classifiers'][classifier_name] = {
                'results': {},
                'parameters': {},
                'best_estimator': None
            }
            # Add standard scaller
            steps = [
                ('std', skp.StandardScaler())
            ]
            # Insert PCA only when there are more samples than features
            # and enough features to be worth reducing.
            if min_pca is not None and X_tr.shape[0] > X_tr.shape[1] and X_tr.shape[1] > min_pca:
                steps.append(
                    ('pca', skd.PCA(
                        random_state=0)))
                dict_grid['pca__n_components'] = HierarchyElement.get_pca_nb_components(
                    min_pca, X_tr.shape[1], 3)
            if classifier_name == 'SGDClassifier':
                steps.append(
                    (classifier_name, sklm.SGDClassifier(
                        shuffle=True,
                        random_state=0,
                        max_iter=1000,
                        penalty='l2',
                        loss='log',
                        class_weight='balanced',
                        n_jobs=2)))
            elif classifier_name == 'RandomForestClassifier':
                steps.append(
                    (classifier_name, skle.RandomForestClassifier(
                        random_state=0,
                        max_depth=None,
                        class_weight='balanced',
                        n_jobs=2)))
            # Create a pipeline for the work to be done
            pipe = skpl.Pipeline(steps)
            for param_name, param_value in classifier_value.items():
                # Add the search space to the grid
                dict_grid['{0}__{1}'.format(
                    classifier_name, param_name)] = param_value
            # create the k-fold object
            kfold = skms.StratifiedKFold(
                n_splits=5, random_state=0, shuffle=True)
            search = skms.GridSearchCV(
                estimator=pipe,
                param_grid=dict_grid,
                scoring='f1_weighted',
                refit=True,
                cv=kfold,
                n_jobs=2)
            # capture start time
            start_time = ti.time()
            search.fit(
                X=X_tr,
                y=y_tr)
            elapsed_time = dt.timedelta(
                seconds=int(round(ti.time() - start_time)))
            # capture elapsed time
            self.tune_params['classifiers'][classifier_name]['fit_time'] = elapsed_time.total_seconds()
            # capture all tuning parameters
            self.tune_params['classifiers'][classifier_name]['parameters'].update(search.best_params_)
            # keep the best estimator
            self.tune_params['classifiers'][classifier_name]['best_estimator'] = search.best_estimator_
            # capture the scores
            self.tune_params['classifiers'][classifier_name]['results'] = {
                'validation': search.best_score_,
                'test': search.score(
                    X=X_te,
                    y=y_te)
            }
            print(' -> Best validation score: {0:.4%}'.format(
                self.tune_params['classifiers'][classifier_name]['results']['validation']))
            print(' -> Test score: {0:.4%}'.format(
                self.tune_params['classifiers'][classifier_name]['results']['test']))
            print(
                ' -> Tuning time: {0} ({1}s)'.format(elapsed_time, elapsed_time.total_seconds()))
    else:
        # Degenerate single-class case: a constant predictor is exact,
        # so skip tuning and record perfect scores.
        classifier_name = 'DummyRegressor'
        print(' -> Tuning "{0}"'.format(classifier_name))
        self.tune_params['classifiers'][classifier_name] = {}
        estimator = skpl.Pipeline(
            steps=[
                (classifier_name, sky.DummyRegressor(
                    strategy='constant',
                    constant=0))
            ])
        estimator.fit(
            X=X_tr,
            y=y_tr)
        self.tune_params['classifiers'][classifier_name]['results'] = {
            'validation': 1.0,
            'test': 1.0
        }
        self.tune_params['classifiers'][classifier_name]['fit_time'] = 0
        self.tune_params['classifiers'][classifier_name]['parameters'] = {}
        self.tune_params['classifiers'][classifier_name]['best_estimator'] = estimator
    # find the model with the best results.
    all_classifiers = list(self.tune_params['classifiers'].keys())
    all_results = [self.tune_params['classifiers'][classifier]
                   ['results']['test'] for classifier in all_classifiers]
    best_estimator_index = np.argmax(all_results)
    best_estimator_name = all_classifiers[best_estimator_index]
    best_estimator = self.tune_params['classifiers'][best_estimator_name]['best_estimator']
    # We need to check whether the best estimator implements the 'predict_proba' method.
    # If it does, we can 1) calibrate the estimator and 2) compute the optimized thresholds.
    if ch.MulticlassClassifierOptimizer.optimizable_model(best_estimator):
        print(' -> Optimizing "{0}"'.format(classifier_name))
        # Create a calibrated estimator
        optimized_estimator = ch.MulticlassClassifierOptimizer(
            model=best_estimator,
            classes=self.classes,
            scoring_function=ch.BinaryClassifierHelper.f1_score)
        self.estimator = optimized_estimator.fit(
            X=X_tr,
            y=y_tr)
        self.tune_params['classifiers'][best_estimator_name]['results']['train_optimized'] = optimized_estimator.score(
            X=X_tr,
            y=y_tr)
        self.tune_params['classifiers'][best_estimator_name]['results']['test_optimized'] = optimized_estimator.score(
            X=X_te,
            y=y_te)
    else:
        self.estimator = self.tune_params['classifiers'][best_estimator_name]['best_estimator']
    self.tune_params['best_score'] = self.tune_params['classifiers'][best_estimator_name]['results']['test']
    self.tune_params['best_classifier'] = best_estimator_name
    print(' -> Best classifier is "{0}" with a test score of {1:.4%}'.format(
        best_estimator_name, self.tune_params['best_score']))
    if 'test_optimized' in self.tune_params['classifiers'][best_estimator_name]['results']:
        print(' -> Optimized test score: {0:.4%}'.format(
            self.tune_params['classifiers'][best_estimator_name]['results']['test_optimized']))
def __init__(self):
    """Set up the naive mean-prediction model wrapper."""
    # Display metadata first, then the underlying estimator.
    self._name = 'Naive Mean'
    self._color = 'purple'
    self._regressor_name = 'mean'
    # DummyRegressor's default strategy predicts the training mean.
    self._regressor = dummy.DummyRegressor()
    # Base-class initialisation runs once the attributes are in place.
    Model.__init__(self)
def main():
    """Wine-quality regression demo: scale, visualise, tune a kernel
    ridge model with grid-search CV, and compare against dummy baselines."""
    # Read raw data.
    # https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality
    raw_data = pd.read_csv('winequality-white.csv', sep=';')
    print('raw_data :\n', raw_data.head())
    # Extract data from dataset.
    x = raw_data[raw_data.columns[:-1]].values  # Dataset: variables.
    y = raw_data['quality'].values  # Dataset: labels.
    print('x :\n', x[:5])
    print('y :\n', y[:5])
    # Scale data to reduce weights.
    # https://openclassrooms.com/fr/courses/4444646-entrainez-un-modele-predictif-lineaire/4507801-reduisez-l-amplitude-des-poids-affectes-a-vos-variables
    # https://openclassrooms.com/fr/courses/4297211-evaluez-les-performances-dun-modele-de-machine-learning/4308246-tp-selectionnez-le-nombre-de-voisins-dans-un-knn
    std_scale = preprocessing.StandardScaler().fit(x)
    x_scaled = std_scale.transform(x)
    # Targets are min-max scaled to [0, 1] (reshape: sklearn wants 2-D).
    std_scale = preprocessing.MinMaxScaler().fit(y.reshape(-1, 1))
    y_scaled = std_scale.transform(y.reshape(-1, 1)).ravel()
    # Histogram every feature before and after scaling.
    for var, lbl in zip([x, x_scaled], ['not scaled', 'scaled']):
        fig, all_axis = plt.subplots(3, 4)
        for feat_idx in range(var.shape[1]):
            # variable alone.
            axis = all_axis.ravel()[feat_idx]
            axis.hist(var[:, feat_idx], bins=50)
            axis.set_title(raw_data.columns[feat_idx]+' - '+lbl, fontsize=14)
            # variable superimposed with others.
            last_axis = all_axis.ravel()[11]
            last_axis.hist(var[:, feat_idx], bins=50)
            last_axis.set_title('whole dataset - '+lbl, fontsize=14)
        plt.subplots_adjust(left=0.1, bottom=0.1, right=0.9, top=0.9,
                            wspace=0.3, hspace=0.3)
        plt.show()  # Show variable magnitude before / after scaling.
    # Split data set into training set and testing set.
    # https://openclassrooms.com/fr/courses/4011851-initiez-vous-au-machine-learning/4020631-exploitez-votre-jeu-de-donnees
    x_train, x_test, y_train, y_test = train_test_split(x_scaled, y_scaled,
                                                        test_size=0.3)
    # Fix hyper-parameters to test.
    param_grid = {
        'gamma': np.logspace(-2, 2, 6),  # gamma coefficient between 10^-2 and 10^2.
        'alpha': np.logspace(-2, 2, 6),  # alpha coefficient between 10^-2 and 10^2.
    }
    # Choose a score to optimize: r2 (coefficient of determination: regression score).
    score = 'r2'
    # Kernel ridge regressor: use cross validation to find the best hyper-parameters.
    clf = GridSearchCV(
        kernel_ridge.KernelRidge(kernel='rbf'),  # Kernel ridge regressor.
        param_grid,  # hyper-parameters to test.
        cv=5,  # number of folds to test in cross validation.
        scoring=score  # score to optimize.
    )
    # Optimize best regressor on training set.
    clf.fit(x_train, y_train)
    # Print hyper-parameters.
    print("\nBest hyper-parameters on the training set:")
    print(clf.best_params_)
    # Print performances.
    print("\nCross validation results:")
    for mean, std, params in zip(clf.cv_results_['mean_test_score'],
                                 clf.cv_results_['std_test_score'],
                                 clf.cv_results_['params']):
        print("{} = {:.3f} (+/-{:.03f}) for {}".format(score, mean, std*2, params))
    # Print scores.
    # https://openclassrooms.com/fr/courses/4297211-evaluez-les-performances-dun-modele-de-machine-learning/4308276-evaluez-un-algorithme-de-regression
    y_pred = clf.predict(x_train)
    print("\nBest regressor score on training set: {:.3f}".format(r2_score(y_train, y_pred)))
    y_pred = clf.predict(x_test)
    print("\nBest regressor score on testing set: {:.3f}".format(r2_score(y_test, y_pred)))
    # Compare with baseline dummy regressor: keep the best of the three
    # constant strategies by test-set R^2 (quantile uses the 25th percentile).
    best_dclf, best_dclf_score = None, -float('inf')
    for s in ['mean', 'median', 'quantile']:
        dclf = dummy.DummyRegressor(strategy=s, quantile=0.25)
        dclf.fit(x_train, y_train)
        dclf_score = r2_score(y_test, dclf.predict(x_test))
        if dclf_score > best_dclf_score:
            best_dclf, best_dclf_score = dclf, dclf_score
    y_pred = best_dclf.predict(x_train)
    print("\nBest dummy regressor score on training set: {:.3f}".format(r2_score(y_train, y_pred)))
    y_pred = best_dclf.predict(x_test)
    print("\nBest dummy regressor score on testing set: {:.3f}".format(r2_score(y_test, y_pred)))
# -*- coding: utf-8 -*-
"""Quick demo: fit a median DummyRegressor on synthetic data.

Created on Thu Mar 5 14:12:14 2020

@author: 766810
"""
from sklearn.datasets import make_regression, make_classification
from sklearn import dummy

# A synthetic regression problem (default make_regression settings).
X, y = make_regression()

# Constant-output baseline: always predicts the training median.
baseline = dummy.DummyRegressor(strategy='median')
baseline.fit(X, y)
print(baseline.predict(X)[:5])
# NOTE(review): notebook fragment — lr_gs, scaler, X_train/X_test,
# y_test, meta_X, calculate_regression_metrics, data_type_options and
# input_option are defined in earlier cells not visible here.
lr_best = lr_gs.best_estimator_
y_pred_lr = lr_best.predict(scaler.transform(X_test))
print(calculate_regression_metrics(y_test, y_pred_lr))

# Write the prediction of GLM model
meta_X["predictions"] = y_pred_lr
meta_X["labels"] = y_test
# Keep only the identifying columns plus predictions/labels for export.
rev_output_df = meta_X.iloc[:, [0, 2, 4, 5]].copy()
rev_output_df.to_csv("../results/GLM_" + data_type_options[input_option] +
                     "_supervised_test_predictions.csv", index=False)

# +
# Dummy mean regressor and median regressor baselines for comparison.
strategy = 'mean'
model = dummy.DummyRegressor(strategy=strategy)
model.fit(X_train, y_train)
y_pred_mean = model.predict(X_test)
calculate_regression_metrics(y_test, y_pred_mean)

strategy = 'median'
model = dummy.DummyRegressor(strategy=strategy)
model.fit(X_train, y_train)
y_pred_median = model.predict(X_test)
calculate_regression_metrics(y_test, y_pred_median)

# +
##Get results for SARS-COV-2
#big_X_test = pd.read_csv("../data/COVID-19/sars_cov_2_additional_drug_viral_interactions_to_predict_with_LS_v2.csv",header='infer',sep=",")
#total_length = len(big_X_test.columns)
#X_test = big_X_test.iloc[:,range(8,total_length)]
from sklearn.datasets import make_regression, make_classification
from sklearn import dummy

# Baseline with the default strategy (predict the training mean).
features, targets = make_regression()
estimator = dummy.DummyRegressor()
estimator.fit(features, targets)
print(estimator.predict(features)[:5])

# Same idea, but predicting the training median.
features, targets = make_regression()
estimator = dummy.DummyRegressor(strategy="median")
estimator.fit(features, targets)
print(estimator.predict(features)[:5])

# Median baseline applied to classification labels (still a regressor,
# so it simply outputs the median class label).
features, targets = make_classification()
estimator = dummy.DummyRegressor(strategy="median")
estimator.fit(features, targets)
print(estimator.predict(features)[:5])
# NOTE(review): Python 2 fragment, truncated at the start — the opening
# of this call (presumably train_test_split) sits outside the chunk.
                                                    test_size=0.33, random_state=42)

# reweight outliers: sample weights grow with distance from the mean
# (in standard-deviation units), so large moves count more in scoring.
weighter_scale = preprocessing.StandardScaler().fit(true_train)
train_weight_outliers = 5.0 * np.abs(
    weighter_scale.transform(true_train)) + 1
cv_weight_outliers = 5.0 * np.abs(weighter_scale.transform(true_cv)) + 1
test_weight_outliers = 5.0 * np.abs(
    weighter_scale.transform(test_data['log_ret'].values)) + 1

from sklearn.metrics import mean_squared_error
scorer = mean_squared_error

# Constant-mean baseline for comparison with the real models below.
from sklearn import dummy
dumreg = dummy.DummyRegressor()
dumreg.fit(stock_train, true_train)
dumguess = dumreg.predict(sctest_arr)
plot_error(test_data['log_ret'].values, dumguess,
           'Dumb regression test set', scorer)
plt.savefig('logregcorr_Dummy.png')

# Ordinary least squares on the same features.
clf = sklearn.linear_model.LinearRegression()
clf.fit(stock_train, true_train)
stock_data['predictchange'] = clf.predict(sctrain_arr)
plot_error(test_data['log_ret'].values, clf.predict(sctest_arr),
           'linear regression test set', scorer)
plt.savefig('logregcorr_linear.png')

print "Decision Tree"
from sklearn import tree
if best_i == None or \ ( score - scores[best_i] > n_sigma * min(stderr, stderrs[best_i]) ): best_i = i return best_i ############# """ The simplest model: Median of *all* previous appraisals This is also an elementary test of the pipeline setup and predictive interval wrapper. """ # (The selected feature here doesn't matter, use special 'dummy') pl = Pipeline([('ColumnSelector', ColumnSelectTransformer('dummy')), ('Regressor', dummy.DummyRegressor(strategy='median'))]) r = PredictiveIntervalRegressor(pl, n_resamplings=n_bootstrap, save_models=False, max_residuals=None) r.fit([[0]] * len(df_global_train), df_global_train['LogAvg'].values) global_median_model = {category: r for category in categories} ############ """ Next-to-simplest model: Medians by category """ category_median_model = {} for category in categories: pl = Pipeline([('ColumnSelector', ColumnSelectTransformer('dummy')),
from sklearn.model_selection import train_test_split
from sklearn import preprocessing, dummy, metrics
import pandas as pd
import numpy as np
from helpers import root_mean_squared_log_error

# Load the merged training table.
frame = pd.read_csv('data/train_merged.csv')

# Target vector: trip duration.
target = frame['trip_duration'].values

# Drop the target and identifier columns from the feature table.
del frame['trip_duration'], frame["id"], frame["trip_duration_in_minutes"]
features = frame.values

# Impute missing cells with their column means.
column_means = np.nanmean(features, axis=0)
missing = np.where(np.isnan(features))
features[missing] = np.take(column_means, missing[1])

# Standardise the feature matrix.
features = preprocessing.scale(features)

train_X, test_X, train_y, test_y = train_test_split(features, target, train_size=0.8)

# Constant-mean baseline regressor.
baseline = dummy.DummyRegressor()
baseline.fit(train_X, train_y)
print("RMSLE =", root_mean_squared_log_error(test_y, baseline.predict(test_X)))
# - auto2.plot.scatter(x='horsepower', y='city_mpg') # very simple lr = linear_model.LinearRegression() lr.fit(auto2[['horsepower']], auto2.city_mpg) lr.score(auto2[['horsepower']], auto2.city_mpg) ax = auto2.plot.scatter(x='horsepower', y='city_mpg') xs = np.arange(40, 200) ax.plot(xs, xs * lr.coef_ + lr.intercept_) # Let's use all of the columns # Baseline model - default strategy is to always predict mean dm = dummy.DummyRegressor() dm.fit(auto_X, auto_y) dm.score(auto_X, auto_y) # Score is R2 score - coefficient of determintation # Usually between 0-1 - .92 amount that answer is explained by features # 1 - 100% of answer is explained by features lr = linear_model.LinearRegression() lr.fit(auto_X, auto_y) lr.score(auto_X, auto_y) pd.Series(lr.coef_, auto_X.columns) lr.intercept_ # ## Lab Data
def fabrication_model_general(data, d_features, isRidge=False, isLasso=False):
    """Fit general ARR_DELAY models on *data*.

    Preprocesses the training split (one-hot categoricals +
    standardised numericals), fits a LinearRegression and a mean
    baseline (plus optional RidgeCV / LassoCV), timing each step.

    Returns ``(N_Data, N_Model_Optimisation, N_Model)`` — the raw
    splits, the fitted preprocessors, and {model name: {'Model',
    'Temps'}}.
    """
    name = 'fabrication_model_general'
    # strptime(strftime(...)) round-trip truncates "now" to whole seconds.
    tstamp1 = datetime.strptime(datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                                '%Y-%m-%d %H:%M:%S')
    # Useful column lists.
    l_numerical = d_features['l_numerical']
    l_categoriel = d_features['l_categoriel']
    # Start of preparation phase.
    log_info('-- DEBUT de {} : {} - {}'.format(name, 'Preparation', tstamp1))
    ## 1 - Fit the categorical encoder.
    encoder = OneHotEncoder(sparse=True)
    encoder.fit(data[l_categoriel])
    ## Keep ALL columns in the returned splits (not just model inputs).
    #X_data = data[(l_numerical + l_categoriel)]
    X_data = data
    Y_data = data['ARR_DELAY']
    ## 2 - Train/test split.
    X_train, X_test, Y_train, Y_test = train_test_split(X_data,
                                                        Y_data,
                                                        test_size=0.2,
                                                        random_state=0)
    ## Model inputs restricted to the declared feature columns.
    X_train_bis = X_train[(l_numerical + l_categoriel)]
    ## 3 - General modelling preparation.
    ## 3_1 - Standardise the numerical columns.
    scaler = StandardScaler()
    #### Fit
    #scaler.fit(X_train[l_numerical])
    scaler.fit(X_train_bis[l_numerical])
    #### Transform
    #X_train_numerical = sparse.csr_matrix(scaler.transform(X_train[l_numerical]))
    X_train_numerical = sparse.csr_matrix(
        scaler.transform(X_train_bis[l_numerical]))
    ## 3_2 - Encode the categorical columns.
    #X_train_categoriel = encoder.transform(X_train[l_categoriel])
    X_train_categoriel = encoder.transform(X_train_bis[l_categoriel])
    ## 3_3 - Assemble the optimised (sparse) design matrix.
    #print('X_train_numerical.shape = ', X_train_numerical.shape)
    #print('X_train_categoriel.shape = ', X_train_categoriel.shape)
    Opt_X_train = sparse.hstack((X_train_numerical, X_train_categoriel))
    #Opt_X_test = sparse.hstack(X_test_numerical, X_test_categoriel)
    tstamp2 = datetime.strptime(datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                                '%Y-%m-%d %H:%M:%S')
    log_info('-- FIN de {} : {} - {} -> {}\n'.format(name, 'Preparation',
                                                     tstamp2, tstamp2 - tstamp1))
    ## 4 - Modelling.
    ## 4_1 - Linear modelling.
    log_info('-- DEBUT de {} : {} - {}'.format(name, 'Modélisation', tstamp2))
    ## LINEAR REGRESSION
    tstamp_lr1 = datetime.strptime(
        datetime.now().strftime('%Y-%m-%d %H:%M:%S'), '%Y-%m-%d %H:%M:%S')
    lr = LinearRegression()
    lr.fit(Opt_X_train, Y_train)
    tstamp_lr2 = datetime.strptime(
        datetime.now().strftime('%Y-%m-%d %H:%M:%S'), '%Y-%m-%d %H:%M:%S')
    ## NAIVE VERSION
    ## Naive (mean-baseline) regression.
    tstamp_dum1 = datetime.strptime(
        datetime.now().strftime('%Y-%m-%d %H:%M:%S'), '%Y-%m-%d %H:%M:%S')
    dum = dummy.DummyRegressor(strategy='mean')
    dum.fit(Opt_X_train, Y_train)
    tstamp_dum2 = datetime.strptime(
        datetime.now().strftime('%Y-%m-%d %H:%M:%S'), '%Y-%m-%d %H:%M:%S')
    tstamp3 = datetime.strptime(datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                                '%Y-%m-%d %H:%M:%S')
    log_info('-- FIN de {} : {} - {} -> {}\n'.format(name, 'Modélisation',
                                                     tstamp3, tstamp3 - tstamp2))
    ## 5 - Package the results (model + fit duration per entry).
    N_Data = {
        'X_train': X_train,
        'Y_train': Y_train,
        'X_test': X_test,
        'Y_test': Y_test
    }
    N_Model_Optimisation = {'OneHotEncoder': encoder, 'StandardScaler': scaler}
    N_Model = {
        'LinearRegression': {
            'Model': lr,
            'Temps': tstamp_lr2 - tstamp_lr1
        }
    }
    N_Model['Naive'] = {'Model': dum, 'Temps': tstamp_dum2 - tstamp_dum1}
    if isRidge:
        # Optional ridge regression with built-in CV.
        ridge = RidgeCV(fit_intercept=False, cv=3)
        tstamp_ridge1 = datetime.strptime(
            datetime.now().strftime('%Y-%m-%d %H:%M:%S'), '%Y-%m-%d %H:%M:%S')
        ridge.fit(Opt_X_train, Y_train)
        tstamp_ridge2 = datetime.strptime(
            datetime.now().strftime('%Y-%m-%d %H:%M:%S'), '%Y-%m-%d %H:%M:%S')
        N_Model['RidgeCV'] = {
            'Model': ridge,
            'Temps': tstamp_ridge2 - tstamp_ridge1
        }
    if isLasso:
        # Optional lasso regression with built-in CV.
        tstamp_lasso1 = datetime.strptime(
            datetime.now().strftime('%Y-%m-%d %H:%M:%S'), '%Y-%m-%d %H:%M:%S')
        lasso = LassoCV(fit_intercept=False, cv=3)
        lasso.fit(Opt_X_train, Y_train)
        tstamp_lasso2 = datetime.strptime(
            datetime.now().strftime('%Y-%m-%d %H:%M:%S'), '%Y-%m-%d %H:%M:%S')
        N_Model['LassoCV'] = {
            'Model': lasso,
            'Temps': tstamp_lasso2 - tstamp_lasso1
        }
    return N_Data, N_Model_Optimisation, N_Model
# NOTE(review): fragment — X_train/X_test/y_train/y_test and the dum/lRR
# toggles come from earlier code; the GridSearchCV call is truncated at
# the end of this chunk.
std_scale = preprocessing.StandardScaler().fit(X_train)
X_train_std = std_scale.transform(X_train)
X_test_std = std_scale.transform(X_test)

## Gram matrix:
kmatrix = []
subtitles = []

## Training
score = 'neg_mean_squared_error'

## DummyClassifier
# (Actually a DummyRegressor used as the RMSE baseline.)
if (dum):
    print("\n===== Dummy Classifier (Baseline 1) =====")
    rgs_dum = dummy.DummyRegressor(strategy='mean')
    print("Training...")
    rgs_dum.fit(X_train_std, y_train)
    print("Prediction:")
    y_test_pred = rgs_dum.predict(X_test_std)
    rmse_dum = np.sqrt(metrics.mean_squared_error(y_test, y_test_pred))
    print("\tRMSE = %0.3f" % rmse_dum)

## Linear Ridge Regressor
if (lRR):
    print("\n===== Linear Ridge Regressor =====")
    # Grid over regularisation strengths 10^-3 .. 10^3.
    param_grid = {'alpha': np.logspace(-3, 3, 7)}
    rgs_lrr = model_selection.GridSearchCV(linear_model.Ridge(),
                                           param_grid=param_grid,