Exemplo n.º 1
0
def test_fit_regressor():
    """Check that ``Regressor.fit`` stores the train columns and the fit flag."""
    df_train = pd.read_csv("data_for_tests/clean_train.csv")
    # ``read_csv(squeeze=True)`` was deprecated in pandas 1.4 and removed in
    # 2.0; squeezing the single-column frame afterwards is the documented
    # replacement and also works on older pandas versions.
    y_train = pd.read_csv("data_for_tests/clean_target.csv").squeeze("columns")
    regressor = Regressor()
    regressor.fit(df_train, y_train)
    # Private attributes are read through their name-mangled form.
    assert np.all(regressor._Regressor__col == df_train.columns)
    assert regressor._Regressor__fitOK
Exemplo n.º 2
0
def test_predict_regressor():
    """Check ``Regressor.predict`` before fit, on bad input and after fit."""
    df_train = pd.read_csv("data_for_tests/clean_train.csv")
    # ``read_csv(squeeze=True)`` was deprecated in pandas 1.4 and removed in
    # 2.0; ``.squeeze("columns")`` is the documented replacement.
    y_train = pd.read_csv("data_for_tests/clean_target.csv").squeeze("columns")
    regressor = Regressor()
    # Predicting with an unfitted regressor must raise.
    with pytest.raises(ValueError):
        regressor.predict(df_train)
    regressor.fit(df_train, y_train)
    # Once fitted, a missing dataset must still raise.
    with pytest.raises(ValueError):
        regressor.predict(None)
    assert len(regressor.predict(df_train)) > 0
Exemplo n.º 3
0
def test_init_regressor():
    """Check the state of a ``Regressor`` built with default arguments."""
    reg = Regressor()
    # Default configuration: LightGBM strategy with no extra parameters.
    assert reg._Regressor__strategy == "LightGBM"
    assert reg._Regressor__regress_params == {}
    # An underlying estimator object must already be available ...
    assert reg._Regressor__regressor
    # ... while nothing related to fitting has been recorded yet.
    assert not reg._Regressor__col
    assert not reg._Regressor__fitOK
Exemplo n.º 4
0
def test_feature_importances_regressor():
    """Check ``Regressor.feature_importances`` for several strategies."""
    regressor = Regressor()
    # Asking for importances before any fit must raise.
    with pytest.raises(ValueError):
        regressor.feature_importances()
    df_train = pd.read_csv("data_for_tests/clean_train.csv")
    # ``read_csv(squeeze=True)`` was deprecated in pandas 1.4 and removed in
    # 2.0; ``.squeeze("columns")`` is the documented replacement.
    y_train = pd.read_csv("data_for_tests/clean_target.csv").squeeze("columns")
    # The same instance is re-configured and re-fitted for each strategy,
    # in the same order as the original hand-unrolled checks.
    strategies = ("LightGBM", "Linear", "RandomForest", "AdaBoost", "Bagging")
    for strategy in strategies:
        regressor.set_params(strategy=strategy)
        regressor.fit(df_train, y_train)
        importance = regressor.feature_importances()
        # The message identifies which strategy produced no importances.
        assert importance != {}, strategy
Exemplo n.º 5
0
def test_set_regressor():
    """Check that the private strategy setter rejects an unknown strategy."""
    reg = Regressor()
    # The private method is reached through its name-mangled attribute.
    set_strategy = reg._Regressor__set_regressor
    with pytest.raises(ValueError):
        set_strategy("wrong_strategy")
Exemplo n.º 6
0
def test_set_params_regressor():
    """Check ``Regressor.set_params`` for valid strategies and a bad key."""
    reg = Regressor()
    # Strategies are applied one after the other on the same instance;
    # "RandomForest" appears twice on purpose to keep the original sequence.
    strategy_sequence = (
        "LightGBM",
        "RandomForest",
        "ExtraTrees",
        "RandomForest",
        "Tree",
        "AdaBoost",
        "Linear",
        "Bagging",
    )
    for name in strategy_sequence:
        reg.set_params(strategy=name)
        assert reg._Regressor__strategy == name
    # An unknown parameter name must emit exactly one UserWarning.
    with pytest.warns(UserWarning) as caught:
        reg.set_params(wrong_strategy="wrong_strategy")
    assert len(caught) == 1
Exemplo n.º 7
0
def test_get_params_regressor():
    """Check what ``Regressor.get_params`` returns for a default instance."""
    reg = Regressor()
    reported = reg.get_params()
    # Only the strategy is reported when no extra parameter was given.
    expected = {"strategy": "LightGBM"}
    assert reported == expected
    assert not reg._Regressor__regress_params
Exemplo n.º 8
0
def test_get_estimator_regressor():
    """Check that a default ``Regressor`` exposes an ``LGBMRegressor``."""
    regressor = Regressor()
    estimator = regressor.get_estimator()
    # Compare against the class directly; building a throwaway
    # ``LGBMRegressor()`` just to call ``type()`` on it is unnecessary.
    assert isinstance(estimator, LGBMRegressor)
Exemplo n.º 9
0
def test_score_regressor():
    """Check ``Regressor.score`` before fit, on bad input and after fit."""
    df_train = pd.read_csv("data_for_tests/clean_train.csv")
    # ``read_csv(squeeze=True)`` was deprecated in pandas 1.4 and removed in
    # 2.0; ``.squeeze("columns")`` is the documented replacement.
    y_train = pd.read_csv("data_for_tests/clean_target.csv").squeeze("columns")
    regressor = Regressor(strategy="Linear")
    # Scoring with an unfitted regressor must raise.
    with pytest.raises(ValueError):
        regressor.score(df_train, y_train)
    regressor.fit(df_train, y_train)
    # Once fitted, a missing dataset or a missing target must still raise.
    with pytest.raises(ValueError):
        regressor.score(None, y_train)
    with pytest.raises(ValueError):
        regressor.score(df_train, None)
    assert regressor.score(df_train, y_train) > 0
Exemplo n.º 10
0
    def fit_predict(self, params, df):
        """Fit the whole pipeline on the train set and predict on the test set.

        The task is inferred from the dtype of ``df['target']``: ``'int'``
        selects a classification pipeline, ``'float'`` a regression pipeline.
        After fitting, the feature importances of the final estimator are
        saved as a .png file in ``self.to_path`` (best effort: a failure only
        emits a warning). The fitted pipeline is stored in ``self.pp``.

        Parameters
        ----------
        params : dict or None
            Hyper-parameters dictionary for the whole pipeline. ``None``
            fits the default configuration.

            - The keys must respect the following syntax : "enc__param".

                - "enc" = "ne" for na encoder
                - "enc" = "ce" for categorical encoder
                - "enc" = "fs" for feature selector [OPTIONAL]
                - "enc" = "stck"+str(i) to add layer number i of
                  meta-features [OPTIONAL]
                - "enc" = "est" for the final estimator
                - "param" : a correct associated parameter for each step.
                  Ex: "max_depth" for "enc"="est", ...

            - The values are those of the parameters.
              Ex: 4 for key = "est__max_depth", ...

        df : dict
            Dataset dictionary. Must contain keys and values:

            - "train": pandas DataFrame for the train set.
            - "test" : pandas DataFrame for the test set.
            - "target" : encoded pandas Series for the target on train set
              (with dtype='float' for a regression or dtype='int' for a
              classification). Indexes should match the train set.

        Returns
        -------
        numpy.ndarray or None
            - Classification: the values of column ``'1'`` of the
              ``predict_proba`` output (its second column).
            - Regression: the predicted target values.
            - ``None`` when the test set has no rows (a warning is emitted).

        Raises
        ------
        ValueError
            If ``self.to_path`` is None, if the task cannot be determined
            from the target dtype, if the pipeline rejects ``params``, or if
            fitting or predicting fails.
        """

        if self.to_path is None:
            raise ValueError("You must specify a path to save your model "
                             "and your predictions")

        ne = NA_encoder()
        ce = Categorical_encoder()

        ##########################################
        #    Automatically checking the task
        ##########################################

        target_dtype = df['target'].dtype
        is_classification = (target_dtype == 'int')
        is_regression = (target_dtype == 'float')

        if is_classification:
            est = Classifier()
            selector_cls = Clf_feature_selector
            stacker_cls = StackingClassifier
        elif is_regression:
            est = Regressor()
            selector_cls = Reg_feature_selector
            stacker_cls = StackingRegressor
        else:
            raise ValueError("Impossible to determine the task. "
                             "Please check that your target is encoded.")

        # Optional stages are enabled by the mere presence of a matching
        # key prefix in ``params``.
        param_names = [] if params is None else list(params)

        # Feature selection if specified
        fs = None
        if any(p.startswith("fs__") for p in param_names):
            fs = selector_cls()

        # Stacking if specified: one stacker per distinct layer name
        layer_names = set(p.split("__")[0]
                          for p in param_names if p.startswith("stck"))
        STCK = dict((name, stacker_cls()) for name in layer_names)

        ##########################################
        #          Creating the Pipeline
        ##########################################

        # Do we need to cache transformers? Yes when entity embeddings, a
        # non-variance feature selector or any stacking layer is used.
        cache = len(STCK) != 0
        if params is not None:
            if params.get("ce__strategy") == "entity_embedding":
                cache = True
            if (fs is not None
                    and "fs__strategy" in params
                    and params["fs__strategy"] != "variance"):
                cache = True

        pipe = [("ne", ne), ("ce", ce)]
        if fs is not None:
            pipe.append(("fs", fs))
        # NOTE: layers are ordered lexicographically by name, so "stck10"
        # would come before "stck2".
        for stck in sorted(STCK):
            pipe.append((stck, STCK[stck]))
        pipe.append(("est", est))

        if cache:
            pp = Pipeline(pipe, memory=self.to_path)
        else:
            pp = Pipeline(pipe)

        ##########################################
        #          Fitting the Pipeline
        ##########################################

        start_time = time.time()

        if params is None:
            # No params : default configuration
            print("")
            print('> No parameters set. Default configuration is tested')
        else:
            try:
                pp = pp.set_params(**params)
            except Exception:
                raise ValueError("Pipeline cannot be set with these parameters."
                                 " Check the name of your stages.")

        if self.verbose:
            print("")
            print("fitting the pipeline ...")

        # Only the fit itself is guarded, so unrelated failures are not
        # misreported as fitting errors.
        try:
            pp.fit(df['train'], df['target'])
        except Exception:
            raise ValueError("Pipeline cannot be fitted")

        if self.verbose:
            print("CPU time: %s seconds" % (time.time() - start_time))

        # Best effort: the directory may already exist; any other problem
        # surfaces below as a feature-importance warning.
        try:
            os.mkdir(self.to_path)
        except OSError:
            pass

        # Feature importances (best effort: failure only warns)
        try:
            importance = est.feature_importances()
            self.__save_feature_importances(importance,
                                            self.to_path
                                            + "/"
                                            + est.get_params()["strategy"]
                                            + "_feature_importance.png")

            if self.verbose:
                self.__plot_feature_importances(importance, 10)
                print("")
                print("> Feature importances dumped into directory : "
                      + self.to_path)

        except Exception:
            warnings.warn("Unable to get feature importances !")

        ##########################################
        #               Predicting
        ##########################################

        if df["test"].shape[0] == 0:
            warnings.warn("You have no test dataset. Cannot predict !")
            # The fitted pipeline is still kept; there is nothing to return.
            self.pp = pp
            return None

        start_time = time.time()

        if is_classification:

            try:
                if self.verbose:
                    print("")
                    print("predicting ...")

                # Exactly two probability columns are expected; any other
                # shape fails here and is reported as "Can not predict".
                pred = pd.DataFrame(pp.predict_proba(df['test']),
                                    columns=['0', '1'],
                                    index=df['test'].index)

            except Exception:
                raise ValueError("Can not predict")

            predictions = pred['1'].values

        else:

            pred_col = df['target'].name + "_predicted"
            pred = pd.DataFrame([],
                                columns=[pred_col],
                                index=df['test'].index)

            try:
                if self.verbose:
                    print("")
                    print("predicting...")

                pred[pred_col] = pp.predict(df['test'])

            except Exception:
                raise ValueError("Can not predict")

            # The regression frame has no '1' column: return the
            # predicted target column instead.
            predictions = pred[pred_col].values

        if self.verbose:
            print("CPU time: %s seconds" % (time.time() - start_time))

        self.pp = pp
        return predictions