def test_transform_encoder():
    """Check ``Categorical_encoder.transform`` with the default strategy.

    Transforming before fitting must raise ``ValueError``; after fitting,
    the encoded frame must expose the same columns as the input frame.
    """
    frame = pd.read_csv("data_for_tests/train.csv")
    enc = Categorical_encoder()

    # An unfitted encoder must refuse to transform.
    with pytest.raises(ValueError):
        enc.transform(frame)

    enc.fit(frame, frame["Survived"])
    encoded = enc.transform(frame)

    same_columns = frame.columns == encoded.columns
    assert same_columns.all()
def test_init_encoder():
    """Check the default state of a freshly built ``Categorical_encoder``."""
    enc = Categorical_encoder()

    # Public defaults.
    assert enc.strategy == "label_encoding"
    assert not enc.verbose

    # Private (name-mangled) containers start out empty.
    empty_containers = (
        ("Lcat", []),
        ("Lnum", []),
        ("Enc", {}),
        ("K", {}),
    )
    for suffix, expected in empty_containers:
        assert getattr(enc, "_Categorical_encoder__" + suffix) == expected

    # Private flags/values start out falsy.
    for suffix in ("weights", "fitOK"):
        assert not getattr(enc, "_Categorical_encoder__" + suffix)
def test_fit_encoder():
    """Check ``Categorical_encoder.fit`` for invalid and valid strategies.

    An unknown strategy must raise ``ValueError``; every supported strategy
    must leave the encoder flagged as fitted.
    """
    data = pd.read_csv("data_for_tests/train.csv")
    enc = Categorical_encoder(strategy="wrong_strategy")

    with pytest.raises(ValueError):
        enc.fit(data, data["Survived"])

    supported = (
        "label_encoding",
        "dummification",
        "random_projection",
        "entity_embedding",
    )
    for strategy in supported:
        enc.set_params(strategy=strategy)
        enc.fit(data, data["Survived"])
        assert enc._Categorical_encoder__fitOK
def test_set_params_encoder():
    """Check ``Categorical_encoder.set_params`` on valid and invalid names."""
    enc = Categorical_encoder()

    # Each supported strategy must be stored as given.
    for strategy in ("label_encoding", "dummification",
                     "random_projection", "entity_embedding"):
        enc.set_params(strategy=strategy)
        assert enc.strategy == strategy

    # The verbose flag can be switched on and back off.
    for flag in (True, False):
        enc.set_params(verbose=flag)
        assert bool(enc.verbose) is flag

    # Setting a private attribute is expected to emit exactly one warning.
    with pytest.warns(UserWarning) as caught:
        enc.set_params(_Categorical_encoder__Lcat=[])
    assert len(caught) == 1
def test_get_params_encoder():
    """Check that ``get_params`` returns the default public parameters."""
    encoder = Categorical_encoder()
    # Named ``expected_params`` rather than ``dict`` so the builtin is not
    # shadowed inside the test.
    expected_params = {'strategy': "label_encoding", 'verbose': False}
    assert encoder.get_params() == expected_params
def fit_predict(self, params, df):
    """Fit the whole pipeline on the train set and predict on the test set.

    The pipeline is built from a NA encoder, a categorical encoder, an
    optional feature selector, optional stacking layers and a final
    estimator. After fitting, the feature importances of the final
    estimator are saved as a .png file in ``self.to_path`` when they can
    be computed (a warning is emitted otherwise).

    Parameters
    ----------
    params : dict or None
        Hyper-parameters dictionary for the whole pipeline. ``None`` fits
        the default configuration.

        - The keys must respect the following syntax : "enc__param".

            - "enc" = "ne" for na encoder
            - "enc" = "ce" for categorical encoder
            - "enc" = "fs" for feature selector [OPTIONAL]
            - "enc" = "stck"+str(i) to add layer n°i of meta-features
              [OPTIONAL]
            - "enc" = "est" for the final estimator

            - "param" : a correct associated parameter for each step.
              Ex: "max_depth" for "enc"="est", ...

        - The values are those of the parameters.
          Ex: 4 for key = "est__max_depth", ...

    df : dict
        Dataset dictionary. Must contain keys and values:

        - "train": pandas DataFrame for the train set.
        - "test" : pandas DataFrame for the test set.
        - "target" : encoded pandas Serie for the target on train set
          (with dtype='float' for a regression or dtype='int' for a
          classification). Indexes should match the train set.

    Returns
    -------
    numpy.ndarray or None
        - Classification: values of the column labelled '1' of the
          predicted probabilities (the prediction frame is built with
          exactly two columns, '0' and '1').
        - Regression: predicted values for the test set.
        - ``None`` when the test set is empty (a warning is emitted).

    Raises
    ------
    ValueError
        If ``self.to_path`` is None, if the task cannot be inferred from
        the target dtype, if the pipeline rejects ``params``, if fitting
        fails or if predicting fails.
    """
    if self.to_path is None:
        raise ValueError("You must specify a path to save your model "
                         "and your predictions")

    ne = NA_encoder()
    ce = Categorical_encoder()

    ##########################################
    #    Automatically checking the task
    ##########################################

    target = df['target']

    if target.dtype == 'int':
        is_classification = True
        est = Classifier()
        make_fs, make_stck = Clf_feature_selector, StackingClassifier
    elif target.dtype == 'float':
        is_classification = False
        est = Regressor()
        make_fs, make_stck = Reg_feature_selector, StackingRegressor
    else:
        raise ValueError("Impossible to determine the task. "
                         "Please check that your target is encoded.")

    # Optional steps are enabled by the mere presence of a matching key.
    fs = None
    STCK = {}
    if params is not None:
        if any(p.startswith("fs__") for p in params):
            fs = make_fs()
        for p in params:
            if p.startswith("stck"):
                # One stacking layer per distinct "stck<i>" prefix.
                STCK[p.split("__")[0]] = make_stck()

    ##########################################
    #    Creating the Pipeline
    ##########################################

    # Transformers are cached when an expensive step is configured:
    # entity embedding, a non-variance feature selection, or stacking.
    cache = bool(STCK)
    if params is not None:
        if params.get("ce__strategy") == "entity_embedding":
            cache = True
        if (fs is not None and "fs__strategy" in params
                and params["fs__strategy"] != "variance"):
            cache = True

    pipe = [("ne", ne), ("ce", ce)]
    if fs is not None:
        pipe.append(("fs", fs))
    # Stacking layers are appended in sorted name order.
    pipe.extend((name, STCK[name]) for name in sorted(STCK))
    pipe.append(("est", est))

    pp = Pipeline(pipe, memory=self.to_path) if cache else Pipeline(pipe)

    ##########################################
    #    Fitting the Pipeline
    ##########################################

    start_time = time.time()

    if params is None:
        print("")
        print('> No parameters set. Default configuration is tested')
    else:
        try:
            pp = pp.set_params(**params)
        except Exception:
            raise ValueError("Pipeline cannot be set with these parameters."
                             " Check the name of your stages.")

    try:
        if self.verbose:
            print("")
            print("fitting the pipeline ...")

        pp.fit(df['train'], df['target'])

        if self.verbose:
            print("CPU time: %s seconds" % (time.time() - start_time))

        try:
            os.mkdir(self.to_path)
        except OSError:
            # Best effort: the directory usually exists already.
            pass

        # Feature importances (best effort, never fatal on their own).
        try:
            importance = est.feature_importances()
            self.__save_feature_importances(
                importance,
                self.to_path + "/"
                + est.get_params()["strategy"]
                + "_feature_importance.png")

            if self.verbose:
                self.__plot_feature_importances(importance, 10)
                print("")
                print("> Feature importances dumped into directory : "
                      + self.to_path)
        except Exception:
            warnings.warn("Unable to get feature importances !")

    except Exception:
        raise ValueError("Pipeline cannot be fitted")

    ##########################################
    #    Predicting
    ##########################################

    test = df["test"]

    if test.shape[0] == 0:
        warnings.warn("You have no test dataset. Cannot predict !")
        self.pp = pp
        # Nothing to predict: return explicitly instead of touching an
        # unbound prediction frame.
        return None

    start_time = time.time()

    if is_classification:
        try:
            if self.verbose:
                print("")
                print("predicting ...")

            pred = pd.DataFrame(pp.predict_proba(test),
                                columns=['0', '1'],
                                index=test.index)
        except Exception:
            raise ValueError("Can not predict")

        result_column = '1'
    else:
        result_column = target.name + "_predicted"
        pred = pd.DataFrame([],
                            columns=[result_column],
                            index=test.index)
        try:
            if self.verbose:
                print("")
                print("predicting...")

            pred[result_column] = pp.predict(test)
        except Exception:
            raise ValueError("Can not predict")

    if self.verbose:
        print("CPU time: %s seconds" % (time.time() - start_time))

    self.pp = pp

    # Read the column that was actually produced for the task; a regression
    # frame has no '1' column.
    return pred[result_column].values
def test_transform_encoder():
    """Check ``Categorical_encoder.transform`` for every strategy.

    Transforming before fitting must raise ``ValueError``. With the default
    strategy the columns are preserved; with the other strategies the
    result must be a pandas DataFrame.
    """
    df = pd.read_csv("data_for_tests/train.csv")
    encoder = Categorical_encoder()

    # An unfitted encoder must refuse to transform.
    with pytest.raises(ValueError):
        encoder.transform(df)

    encoder.fit(df, df["Survived"])
    df_encoded = encoder.transform(df)
    assert (df.columns == df_encoded.columns).all()

    encoder.set_params(strategy="dummification")
    encoder.fit(df, df["Survived"])
    df_encoded = encoder.transform(df)
    # ``pd.SparseDataFrame`` was removed in pandas 1.0 and, where it exists,
    # is a DataFrame subclass; isinstance accepts both the dense and the
    # legacy sparse result without referencing the removed name.
    assert isinstance(df_encoded, pd.DataFrame)

    # These strategies must return a plain (exact-type) DataFrame.
    for strategy in ("random_projection", "entity_embedding"):
        encoder.set_params(strategy=strategy)
        encoder.fit(df, df["Survived"])
        df_encoded = encoder.transform(df)
        assert type(df_encoded) is pd.DataFrame