def test_fit_regressor():
    """Check that ``Regressor.fit`` stores the train columns and the fit flag."""
    df_train = pd.read_csv("data_for_tests/clean_train.csv")
    # ``read_csv(squeeze=True)`` was deprecated in pandas 1.4 and removed in
    # 2.0; squeezing the loaded frame gives the same Series on every version.
    y_train = pd.read_csv("data_for_tests/clean_target.csv").squeeze("columns")

    regressor = Regressor()
    regressor.fit(df_train, y_train)

    assert np.all(regressor._Regressor__col == df_train.columns)
    assert regressor._Regressor__fitOK
def test_predict_regressor():
    """Check ``Regressor.predict`` before fitting, on ``None`` and after fitting."""
    df_train = pd.read_csv("data_for_tests/clean_train.csv")
    # ``read_csv(squeeze=True)`` was deprecated in pandas 1.4 and removed in
    # 2.0; squeezing the loaded frame gives the same Series on every version.
    y_train = pd.read_csv("data_for_tests/clean_target.csv").squeeze("columns")

    regressor = Regressor()
    # Predicting with a model that has not been fitted must be rejected.
    with pytest.raises(ValueError):
        regressor.predict(df_train)

    regressor.fit(df_train, y_train)
    # A missing dataset must be rejected even once the model is fitted.
    with pytest.raises(ValueError):
        regressor.predict(None)
    assert len(regressor.predict(df_train)) > 0
def test_init_regressor():
    """Check the private state of a freshly constructed ``Regressor``."""
    reg = Regressor()

    # Default configuration.
    assert reg._Regressor__strategy == "LightGBM"
    assert reg._Regressor__regress_params == {}
    # An underlying estimator is built right away ...
    assert reg._Regressor__regressor
    # ... but nothing has been fitted yet.
    assert not reg._Regressor__col
    assert not reg._Regressor__fitOK
def test_feature_importances_regressor():
    """Check ``Regressor.feature_importances`` for several strategies."""
    regressor = Regressor()
    # Importances are not available before the model has been fitted.
    with pytest.raises(ValueError):
        regressor.feature_importances()

    df_train = pd.read_csv("data_for_tests/clean_train.csv")
    # ``read_csv(squeeze=True)`` was deprecated in pandas 1.4 and removed in
    # 2.0; squeezing the loaded frame gives the same Series on every version.
    y_train = pd.read_csv("data_for_tests/clean_target.csv").squeeze("columns")

    strategies = ("LightGBM", "Linear", "RandomForest", "AdaBoost", "Bagging")
    for strategy in strategies:
        regressor.set_params(strategy=strategy)
        regressor.fit(df_train, y_train)
        importance = regressor.feature_importances()
        # The strategy name is the failure message so a broken one is obvious.
        assert importance != {}, strategy
def test_set_regressor():
    """Check that the private regressor setter rejects an unknown strategy."""
    reg = Regressor()
    # The name-mangled private method is called directly on purpose.
    with pytest.raises(ValueError):
        reg._Regressor__set_regressor("wrong_strategy")
def test_set_params_regressor():
    """Check that ``Regressor.set_params`` updates the strategy and warns on bad keys."""
    reg = Regressor()

    # Same sequence as before, including switching back to "RandomForest".
    strategy_sequence = [
        "LightGBM",
        "RandomForest",
        "ExtraTrees",
        "RandomForest",
        "Tree",
        "AdaBoost",
        "Linear",
        "Bagging",
    ]
    for name in strategy_sequence:
        reg.set_params(strategy=name)
        assert reg._Regressor__strategy == name

    # An unknown parameter name must trigger exactly one UserWarning.
    with pytest.warns(UserWarning) as record:
        reg.set_params(wrong_strategy="wrong_strategy")
    assert len(record) == 1
def test_get_params_regressor():
    """Check that ``Regressor.get_params`` reports the default configuration."""
    reg = Regressor()
    expected = {"strategy": "LightGBM"}

    assert reg.get_params() == expected
    # No estimator-specific parameters are stored by default.
    assert not reg._Regressor__regress_params
def test_get_estimator_regressor():
    """Check that the default ``Regressor`` wraps an ``LGBMRegressor``."""
    regressor = Regressor()
    estimator = regressor.get_estimator()
    # Compare against the class itself; building a throwaway instance only
    # to take its type is unnecessary.
    assert isinstance(estimator, LGBMRegressor)
def test_score_regressor():
    """Check ``Regressor.score`` before fitting, on ``None`` inputs and after fitting."""
    df_train = pd.read_csv("data_for_tests/clean_train.csv")
    # ``read_csv(squeeze=True)`` was deprecated in pandas 1.4 and removed in
    # 2.0; squeezing the loaded frame gives the same Series on every version.
    y_train = pd.read_csv("data_for_tests/clean_target.csv").squeeze("columns")

    regressor = Regressor(strategy="Linear")
    # Scoring a model that has not been fitted must be rejected.
    with pytest.raises(ValueError):
        regressor.score(df_train, y_train)

    regressor.fit(df_train, y_train)
    # Either input missing must be rejected even once the model is fitted.
    with pytest.raises(ValueError):
        regressor.score(None, y_train)
    with pytest.raises(ValueError):
        regressor.score(df_train, None)
    assert regressor.score(df_train, y_train) > 0
def fit_predict(self, params, df):
    """Fit the whole pipeline on the train set and predict on the test set.

    The task is chosen from the dtype of ``df['target']``: ``'int'`` selects
    the classification components, ``'float'`` the regression ones. The
    pipeline chains the NA encoder, the categorical encoder, an optional
    feature selector, optional stacking layers (in sorted name order) and
    the final estimator. After fitting, the feature importances of the
    final estimator are saved as a .png file under ``self.to_path``; a
    failure at that step only emits a warning. The fitted pipeline is
    stored in ``self.pp``.

    Parameters
    ----------
    params : dict or None
        Hyper-parameters dictionary for the whole pipeline. ``None`` runs
        the default configuration.

        - The keys must respect the following syntax : "enc__param".

            - "enc" = "ne" for na encoder
            - "enc" = "ce" for categorical encoder
            - "enc" = "fs" for feature selector [OPTIONAL]
            - "enc" = "stck"+str(i) to add layer i of meta-features
              [OPTIONAL]
            - "enc" = "est" for the final estimator

            - "param" : a correct associated parameter for each step.
              Ex: "max_depth" for "enc"="est", ...

        - The values are those of the parameters.
          Ex: 4 for key = "est__max_depth", ...

    df : dict
        Dataset dictionary. Must contain keys and values:

        - "train": pandas DataFrame for the train set.
        - "test" : pandas DataFrame for the test set.
        - "target" : encoded pandas Series for the target on train set
          (with dtype='float' for a regression or dtype='int' for a
          classification). Indexes should match the train set.

    Returns
    -------
    numpy.ndarray or None
        - Classification: the values of column ``'1'`` of the
          ``predict_proba`` output. The columns are hard-coded to
          ``['0', '1']``, so only two-column probability outputs work.
        - Regression: the values predicted on the test set.
        - ``None`` when the test set has no rows (a warning is emitted).

    Raises
    ------
    ValueError
        If ``self.to_path`` is None, if the task cannot be determined from
        the target dtype, if the pipeline rejects ``params``, or if fitting
        or predicting fails. The underlying exception is chained as the
        cause.
    """
    if self.to_path is None:
        raise ValueError("You must specify a path to save your model "
                         "and your predictions")

    ne = NA_encoder()
    ce = Categorical_encoder()

    ##########################################
    #    Automatically checking the task
    ##########################################

    target = df['target']
    is_classification = target.dtype == 'int'

    if is_classification:
        est = Classifier()
        selector_cls, stacking_cls = Clf_feature_selector, StackingClassifier
    elif target.dtype == 'float':
        est = Regressor()
        selector_cls, stacking_cls = Reg_feature_selector, StackingRegressor
    else:
        raise ValueError("Impossible to determine the task. "
                         "Please check that your target is encoded.")

    param_names = list(params) if params is not None else []

    # Feature selection if specified
    fs = None
    if any(name.startswith("fs__") for name in param_names):
        fs = selector_cls()

    # Stacking if specified: one layer per distinct "stck<i>" prefix
    stacking_layers = {
        name.split("__")[0]: stacking_cls()
        for name in param_names
        if name.startswith("stck")
    }

    ##########################################
    #          Creating the Pipeline
    ##########################################

    # Do we need to cache transformers?
    cache = bool(stacking_layers)
    if params is not None:
        if ("ce__strategy" in params
                and params["ce__strategy"] == "entity_embedding"):
            cache = True
        if (fs is not None
                and "fs__strategy" in params
                and params["fs__strategy"] != "variance"):
            cache = True

    steps = [("ne", ne), ("ce", ce)]
    if fs is not None:
        steps.append(("fs", fs))
    steps.extend((name, stacking_layers[name])
                 for name in sorted(stacking_layers))
    steps.append(("est", est))

    pp = Pipeline(steps, memory=self.to_path) if cache else Pipeline(steps)

    ##########################################
    #          Fitting the Pipeline
    ##########################################

    start_time = time.time()

    if params is None:
        # No params : default configuration
        print("")
        print('> No parameters set. Default configuration is tested')
    else:
        try:
            pp = pp.set_params(**params)
        except Exception as err:
            raise ValueError("Pipeline cannot be set with these parameters."
                             " Check the name of your stages.") from err

    try:
        if self.verbose:
            print("")
            print("fitting the pipeline ...")

        pp.fit(df['train'], df['target'])

        if self.verbose:
            print("CPU time: %s seconds" % (time.time() - start_time))

        # Best effort: the directory may already exist.
        try:
            os.mkdir(self.to_path)
        except OSError:
            pass

        # Feature importances (best effort, a failure must not abort the run)
        try:
            importance = est.feature_importances()
            self.__save_feature_importances(
                importance,
                self.to_path + "/" + est.get_params()["strategy"]
                + "_feature_importance.png")

            if self.verbose:
                self.__plot_feature_importances(importance, 10)
                print("")
                print("> Feature importances dumped into directory : "
                      + self.to_path)
        except Exception:
            warnings.warn("Unable to get feature importances !")

    except Exception as err:
        raise ValueError("Pipeline cannot be fitted") from err

    ##########################################
    #               Predicting
    ##########################################

    if df["test"].shape[0] == 0:
        warnings.warn("You have no test dataset. Cannot predict !")
        self.pp = pp
        return None

    start_time = time.time()

    if is_classification:
        # Column '1' is returned; the column names are hard-coded, so the
        # estimator must output exactly two probability columns.
        output_column = '1'
        try:
            if self.verbose:
                print("")
                print("predicting ...")

            pred = pd.DataFrame(pp.predict_proba(df['test']),
                                columns=['0', '1'],
                                index=df['test'].index)
        except Exception as err:
            raise ValueError("Can not predict") from err
    else:
        # Regression: the task check above leaves no other possibility.
        output_column = target.name + "_predicted"
        pred = pd.DataFrame([],
                            columns=[output_column],
                            index=df['test'].index)
        try:
            if self.verbose:
                print("")
                print("predicting...")

            pred[output_column] = pp.predict(df['test'])
        except Exception as err:
            raise ValueError("Can not predict") from err

    if self.verbose:
        print("CPU time: %s seconds" % (time.time() - start_time))

    self.pp = pp

    # Return the column that exists for the detected task; indexing '1'
    # unconditionally raised KeyError on the regression path.
    return pred[output_column].values