예제 #1
0
def plot_transfer_graph_fitted(path, name, analysis_folder):
    """Plot a ridge-regularised polynomial fit of the estimates of each run.

    Loads the pickled results object stored at
    ``<path>/0_results/<analysis_folder>/<name>/<name>_classifier.pyobj``,
    splits ``results.estimates`` into 6 consecutive chunks of equal length
    and, for every chunk, fits a ridge regression on a Vandermonde
    (polynomial) basis of the sample position. All fitted curves are drawn
    on a single new matplotlib axes.

    Parameters
    ----------
    path : str
        Root directory that contains the ``0_results`` folder.
    name : str
        Name of the result sub-folder; also the prefix of the pickle file.
    analysis_folder : str
        Name of the analysis folder inside ``0_results``.

    Returns
    -------
    None
        The curves are drawn on a newly created figure.
    """
    from sklearn.linear_model import Ridge

    result_path = os.path.join(path, '0_results', analysis_folder, name,
                               name + '_classifier.pyobj')

    # Pickle data is binary: read it in 'rb' mode and always close the file.
    # NOTE(review): pickle.load can execute arbitrary code; only load
    # result files that come from a trusted source.
    with open(result_path, 'rb') as result_file:
        results = pickle.load(result_file)

    runs = 6

    values = results.estimates
    # Floor division keeps the slice bounds integral on Python 2 and 3.
    run_length = len(values) // runs

    ridge = Ridge()

    f = plt.figure()
    a = f.add_subplot(111)
    for i in range(runs):

        v = values[i * run_length:(i + 1) * run_length]
        yy = v.copy()

        xx = np.linspace(0, len(v), len(v))

        # The model has to be fitted before predicting; retry with a
        # lower-order basis if the solver fails on the 12-column one.
        try:
            design = np.vander(xx, 12)
            ridge.fit(design, yy)
        except np.linalg.LinAlgError:
            design = np.vander(xx, 9)
            ridge.fit(design, yy)
        y_fit = ridge.predict(design)

        a.plot(y_fit)
예제 #2
0
def plot_transfer_graph_fitted(path, name, analysis_folder):
    """Draw, for each of 6 runs, a ridge polynomial fit of the estimates.

    The pickled results object is read from
    ``<path>/0_results/<analysis_folder>/<name>/<name>_classifier.pyobj``.
    Its ``estimates`` attribute is cut into 6 equally long consecutive
    chunks; each chunk is regressed (ridge) on a Vandermonde basis of the
    sample position and the fitted curve is added to one shared axes.

    Parameters
    ----------
    path : str
        Root directory that contains the ``0_results`` folder.
    name : str
        Name of the result sub-folder; also the prefix of the pickle file.
    analysis_folder : str
        Name of the analysis folder inside ``0_results``.

    Returns
    -------
    None
        The curves are drawn on a newly created figure.
    """
    from sklearn.linear_model import Ridge

    result_path = os.path.join(path,
                               '0_results',
                               analysis_folder,
                               name,
                               name + '_classifier.pyobj')

    # Binary mode is required for pickles on Python 3, and the context
    # manager guarantees the handle is released.
    # NOTE(review): pickle.load can execute arbitrary code; only load
    # result files that come from a trusted source.
    with open(result_path, 'rb') as result_file:
        results = pickle.load(result_file)

    runs = 6

    values = results.estimates
    # Integer division: a float here would be rejected as a slice bound.
    run_length = len(values) // runs

    ridge = Ridge()

    f = plt.figure()
    a = f.add_subplot(111)
    for i in range(runs):

        v = values[i * run_length:(i + 1) * run_length]
        yy = v.copy()

        xx = np.linspace(0, len(v), len(v))

        # Fit first, then predict on the same design matrix. Fall back to
        # a 9-column basis when the 12-column one cannot be solved.
        try:
            design = np.vander(xx, 12)
            ridge.fit(design, yy)
        except np.linalg.LinAlgError:
            design = np.vander(xx, 9)
            ridge.fit(design, yy)
        y_fit = ridge.predict(design)

        a.plot(y_fit)
예제 #3
0
class Regressor():

    """
    Wraps scikitlearn regressors.


    Parameters
    ----------

    strategy : string, default = "LightGBM" (if installed else "XGBoost")
        The choice for the regressor.
        Available strategies = "LightGBM" (if installed), "XGBoost", "RandomForest", "ExtraTrees", "Tree", "Bagging", "AdaBoost" or "Linear".

    **params : parameters of the corresponding regressor.
        Examples : n_estimators, max_depth...

    """

    def __init__(self, **params):

        if "strategy" in params:
            self.__strategy = params["strategy"]
        elif lgbm_installed:
            self.__strategy = "LightGBM"
        else:
            self.__strategy = "XGBoost"

        # Parameters explicitly set by the user (reported by get_params).
        self.__regress_params = {}

        self.__regressor = None
        self.__set_regressor(self.__strategy)
        # Column labels of the training frame, filled in by fit().
        self.__col = None

        self.set_params(**params)
        self.__fitOK = False

    def get_params(self, deep=True):
        """
        Returns the strategy and the parameters explicitly set by the user.

        Parameters
        ----------
        deep : bool, default = True
            Unused; presumably kept for scikit-learn API compatibility.

        Returns
        -------
        dict
            ``{"strategy": ...}`` updated with the user-set parameters.
        """

        params = {"strategy": self.__strategy}
        params.update(self.__regress_params)

        return params

    def __apply_param(self, key, value):
        """Set ``key`` on the wrapped regressor if it accepts it.

        Emits a warning and returns False when the wrapped regressor does
        not list ``key`` in its ``get_params()``; otherwise sets the
        attribute and returns True.
        """

        if key not in self.__regressor.get_params():
            warnings.warn("Invalid parameter for regressor "+str(self.__strategy)+". Parameter IGNORED. Check the list of available parameters with `regressor.get_params().keys()`")
            return False

        setattr(self.__regressor, key, value)
        return True

    def set_params(self, **params):
        """
        Sets the strategy and/or parameters of the wrapped regressor.

        Any call marks the object as not fitted. Parameters the wrapped
        regressor does not know are ignored with a warning.

        Parameters
        ----------
        **params :
            ``strategy`` and/or parameters of the corresponding regressor.
        """

        self.__fitOK = False

        if 'strategy' in params:
            self.__set_regressor(params['strategy'])

            # Re-apply the previously stored parameters to the new
            # regressor. They must be validated against the *regressor's*
            # parameters: checking self.get_params() always succeeds
            # because it contains the stored parameters themselves.
            for k, v in self.__regress_params.items():
                self.__apply_param(k, v)

        for k, v in params.items():
            if k == "strategy":
                continue
            if self.__apply_param(k, v):
                self.__regress_params[k] = v

    def __set_regressor(self, strategy):
        """Instantiate the regressor matching ``strategy`` with its defaults.

        Raises ValueError for an unknown strategy. "LightGBM" falls back to
        "XGBoost" (with a warning) when lightgbm is not installed.
        """

        # NOTE(review): some keyword arguments below (criterion='mse',
        # presort, normalize, base_estimator, nthread, seed) look tied to
        # older scikit-learn / xgboost / lightgbm releases - confirm
        # against the pinned versions.

        self.__strategy = strategy

        if strategy == "LightGBM" and not lgbm_installed:
            warnings.warn("Package lightgbm is not installed. Model LightGBM will be replaced by XGBoost")
            strategy = self.__strategy = "XGBoost"

        if strategy == 'RandomForest':
            self.__regressor = RandomForestRegressor(
                n_estimators=400, max_depth=10, max_features='sqrt',
                bootstrap=True, n_jobs=-1, random_state=0)

        elif strategy == 'XGBoost':
            self.__regressor = XGBRegressor(
                n_estimators=500, max_depth=6, learning_rate=0.05,
                colsample_bytree=0.8, colsample_bylevel=1., subsample=0.9,
                nthread=-1, seed=0)

        elif strategy == "LightGBM":
            self.__regressor = LGBMRegressor(
                n_estimators=500, learning_rate=0.05, colsample_bytree=0.8,
                subsample=0.9, nthread=-1, seed=0)

        elif strategy == 'ExtraTrees':
            self.__regressor = ExtraTreesRegressor(
                n_estimators=400, max_depth=10, max_features='sqrt',
                bootstrap=True, n_jobs=-1, random_state=0)

        elif strategy == 'Tree':
            self.__regressor = DecisionTreeRegressor(
                criterion='mse', splitter='best', max_depth=None,
                min_samples_split=2, min_samples_leaf=1,
                min_weight_fraction_leaf=0.0, max_features=None,
                random_state=0, max_leaf_nodes=None, presort=False)

        elif strategy == "Bagging":
            self.__regressor = BaggingRegressor(
                base_estimator=None, n_estimators=500, max_samples=.9,
                max_features=.85, bootstrap=False, bootstrap_features=False,
                n_jobs=-1, random_state=0)

        elif strategy == "AdaBoost":
            self.__regressor = AdaBoostRegressor(
                base_estimator=None, n_estimators=400, learning_rate=.05,
                random_state=0)

        elif strategy == "Linear":
            self.__regressor = Ridge(
                alpha=1.0, fit_intercept=True, normalize=False, copy_X=True,
                max_iter=None, tol=0.001, solver='auto', random_state=0)

        else:
            raise ValueError("Strategy invalid. Please choose between 'LightGBM' (if installed), 'XGBoost', 'RandomForest', 'ExtraTrees', 'Tree', 'Bagging', 'AdaBoost' or 'Linear'")

    @staticmethod
    def __check_dataframe(df, arg_name):
        """Raise ValueError unless ``df`` is a pandas DataFrame."""

        # isinstance also accepts DataFrame subclasses and does not depend
        # on pd.SparseDataFrame, which current pandas no longer provides.
        if not isinstance(df, pd.DataFrame):
            raise ValueError(arg_name + " must be a DataFrame")

    @staticmethod
    def __check_series(y, arg_name):
        """Raise ValueError unless ``y`` is a pandas Series."""

        if not isinstance(y, pd.Series):
            raise ValueError(arg_name + " must be a Series")

    def __fitted_method(self, method_name):
        """Return the regressor's bound ``method_name`` once fitted.

        Raises AttributeError if the wrapped regressor has no such
        attribute, ValueError if it is not callable or if fit() has not
        been called yet (checked in that order).
        """

        method = getattr(self.__regressor, method_name)
        if not callable(method):
            raise ValueError(method_name + " attribute is not callable")

        if not self.__fitOK:
            raise ValueError("You must call the fit function before !")

        return method

    def fit(self, df_train, y_train):

        '''

        Fits Regressor.

        Parameters
        ----------

        df_train : pandas dataframe of shape = (n_train, n_features)
        The train dataset with numerical features.

        y_train : pandas series of shape = (n_train, )
        The target for regression tasks.


        Returns
        -------
        self

        Raises
        ------
        ValueError
            If df_train is not a DataFrame or y_train is not a Series.

        '''

        ### sanity checks
        self.__check_dataframe(df_train, "df_train")
        self.__check_series(y_train, "y_train")

        self.__regressor.fit(df_train.values, y_train)
        self.__col = df_train.columns
        self.__fitOK = True

        return self

    def __bagging_importances(self, estimator):
        """Average, per column, the non-zero importances over all bags."""

        per_bag = []

        for sub_estimator, feature_idx in zip(estimator.estimators_,
                                              estimator.estimators_features_):

            try:
                scores = sub_estimator.feature_importances_  # XGB, RF, ET, Tree and AdaBoost
            except AttributeError:
                scores = np.abs(sub_estimator.coef_)  # Linear

            # Map the bag-local feature positions back to column labels.
            per_bag.append({self.__col[c]: scores[j]
                            for j, c in enumerate(feature_idx)})

        importance = {}

        for col in self.__col:
            # A real list is needed: on Python 3 filter() is lazy and
            # np.mean cannot consume it.
            non_zero = [bag[col] for bag in per_bag
                        if col in bag and bag[col] != 0]
            # No non-zero score at all means the column never mattered;
            # report 0.0 rather than the NaN of an empty mean.
            importance[col] = np.mean(non_zero) if non_zero else 0.0

        return importance

    def feature_importances(self):
        """
        Computes feature importances. Regressor must be fitted before.

        Returns
        -------
        dict
            Importance (value) for each training column (key). Empty for
            a strategy that is not handled here.

        Raises
        ------
        ValueError
            If fit() has not been called.
        """

        if not self.__fitOK:
            raise ValueError("You must call the fit function before !")

        strategy = self.get_params()["strategy"]
        estimator = self.get_estimator()

        if strategy == "Linear":
            scores = np.abs(estimator.coef_)

        elif strategy in ("LightGBM", "XGBoost", "RandomForest", "ExtraTrees", "Tree"):
            scores = estimator.feature_importances_

        elif strategy == "AdaBoost":
            weights = estimator.estimator_weights_
            norm = weights.sum()

            try:
                scores = sum(w * est.feature_importances_ for w, est in zip(weights, estimator.estimators_)) / norm  # XGB, RF, ET, Tree and AdaBoost
            except AttributeError:
                scores = sum(w * np.abs(est.coef_) for w, est in zip(weights, estimator.estimators_)) / norm  # Linear

        elif strategy == "Bagging":
            return self.__bagging_importances(estimator)

        else:
            return {}

        return {col: scores[i] for i, col in enumerate(self.__col)}

    def predict(self, df):

        '''

        Predicts the target.

        Parameters
        ----------

        df : pandas dataframe of shape = (n, n_features)
        The dataset with numerical features.


        Returns
        -------
        y : array of shape = (n, )
        The target to be predicted.

        Raises
        ------
        ValueError
            If the regressor is not fitted or df is not a DataFrame.

        '''

        predict = self.__fitted_method("predict")

        ### sanity checks
        self.__check_dataframe(df, "df")

        return predict(df.values)

    def transform(self, df):

        '''

        Transforms df.

        Parameters
        ----------

        df : pandas dataframe of shape = (n, n_features)
        The dataset with numerical features.


        Returns
        -------
        df_transform : result of the wrapped regressor's ``transform``
        called on ``df.values``.

        Raises
        ------
        AttributeError
            If the wrapped regressor has no ``transform`` attribute.
        ValueError
            If the regressor is not fitted or df is not a DataFrame.

        '''

        transform = self.__fitted_method("transform")

        ### sanity checks
        self.__check_dataframe(df, "df")

        return transform(df.values)

    def score(self, df, y, sample_weight=None):

        '''

        Returns the score of the wrapped regressor on (df, y).

        Parameters
        ----------

        df : pandas dataframe of shape = (n, n_features)
            The dataset with numerical features.

        y : pandas series of shape = (n,)
            The target values.

        sample_weight : optional, default = None
            Forwarded unchanged to the wrapped regressor's ``score``.

        Returns
        -------
        score : float
        Value returned by the wrapped regressor's ``score``
        (presumably R^2 of self.predict(df) wrt. y).

        Raises
        ------
        ValueError
            If the regressor is not fitted, df is not a DataFrame or y is
            not a Series.

        '''

        score = self.__fitted_method("score")

        ### sanity checks
        self.__check_dataframe(df, "df")
        self.__check_series(y, "y")

        return score(df.values, y, sample_weight)

    def get_estimator(self):
        """Returns a shallow copy of the wrapped regressor."""

        return copy(self.__regressor)
예제 #4
0
class Regressor():
    """Wrap scikitlearn regressors.

    Parameters
    ----------
    strategy : str, default = "LightGBM"
        The choice for the regressor.
        Available strategies = {"LightGBM", "RandomForest", "ExtraTrees",
        "Tree", "Bagging", "AdaBoost" or "Linear"}

    **params : default = None
        Parameters of the corresponding regressor.
        Examples : n_estimators, max_depth...

    """

    def __init__(self, **params):
        """Init Regressor object where user can pass a strategy."""
        self.__strategy = params.get("strategy", "LightGBM")

        # Parameters explicitly set by the user (reported by get_params).
        self.__regress_params = {}

        self.__regressor = None
        self.__set_regressor(self.__strategy)
        # Column labels of the training frame, filled in by fit().
        self.__col = None

        self.set_params(**params)
        self.__fitOK = False

    def get_params(self, deep=True):
        """Get parameters of Regressor object.

        Parameters
        ----------
        deep : bool, default = True
            Unused; presumably kept for scikit-learn API compatibility.

        Returns
        -------
        dict
            ``{"strategy": ...}`` updated with the user-set parameters.

        """
        params = {"strategy": self.__strategy}
        params.update(self.__regress_params)

        return params

    def __apply_param(self, key, value):
        """Set ``key`` on the wrapped regressor if it accepts it.

        Emit a warning and return False when the wrapped regressor does
        not list ``key`` in its ``get_params()``; otherwise set the
        attribute and return True.
        """
        if key not in self.__regressor.get_params():
            warnings.warn("Invalid parameter for regressor "
                          + str(self.__strategy)
                          + ". Parameter IGNORED. Check the list of "
                          "available parameters with "
                          "`regressor.get_params().keys()`")
            return False

        setattr(self.__regressor, key, value)
        return True

    def set_params(self, **params):
        """Set parameters of Regressor object.

        Any call marks the object as not fitted. Parameters the wrapped
        regressor does not know are ignored with a warning.

        Parameters
        ----------
        **params :
            ``strategy`` and/or parameters of the corresponding regressor.

        """
        self.__fitOK = False

        if 'strategy' in params:
            self.__set_regressor(params['strategy'])

            # Re-apply the previously stored parameters to the new
            # regressor. They must be validated against the *regressor's*
            # parameters: checking self.get_params() always succeeds
            # because it contains the stored parameters themselves.
            for k, v in self.__regress_params.items():
                self.__apply_param(k, v)

        for k, v in params.items():
            if k == "strategy":
                continue
            if self.__apply_param(k, v):
                self.__regress_params[k] = v

    def __set_regressor(self, strategy):
        """Set strategy of a regressor object.

        Raise ValueError for an unknown strategy.
        """
        # NOTE(review): some keyword arguments below (criterion='mse',
        # presort, normalize, base_estimator, nthread, seed) look tied to
        # older scikit-learn / lightgbm releases - confirm against the
        # pinned versions.
        self.__strategy = strategy

        if strategy == 'RandomForest':
            self.__regressor = RandomForestRegressor(
                n_estimators=400, max_depth=10, max_features='sqrt',
                bootstrap=True, n_jobs=-1, random_state=0)

        elif strategy == "LightGBM":
            self.__regressor = LGBMRegressor(
                n_estimators=500, learning_rate=0.05,
                colsample_bytree=0.8, subsample=0.9, nthread=-1, seed=0)

        elif strategy == 'ExtraTrees':
            self.__regressor = ExtraTreesRegressor(
                n_estimators=400, max_depth=10, max_features='sqrt',
                bootstrap=True, n_jobs=-1, random_state=0)

        elif strategy == 'Tree':
            self.__regressor = DecisionTreeRegressor(
                criterion='mse', splitter='best', max_depth=None,
                min_samples_split=2, min_samples_leaf=1,
                min_weight_fraction_leaf=0.0, max_features=None,
                random_state=0, max_leaf_nodes=None, presort=False)

        elif strategy == "Bagging":
            self.__regressor = BaggingRegressor(
                base_estimator=None, n_estimators=500, max_samples=.9,
                max_features=.85, bootstrap=False, bootstrap_features=False,
                n_jobs=-1, random_state=0)

        elif strategy == "AdaBoost":
            self.__regressor = AdaBoostRegressor(
                base_estimator=None, n_estimators=400, learning_rate=.05,
                random_state=0)

        elif strategy == "Linear":
            self.__regressor = Ridge(
                alpha=1.0, fit_intercept=True, normalize=False, copy_X=True,
                max_iter=None, tol=0.001, solver='auto', random_state=0)

        else:
            raise ValueError(
                "Strategy invalid. Please choose between 'LightGBM'"
                ", 'RandomForest', 'ExtraTrees', "
                "'Tree', 'Bagging', 'AdaBoost' or 'Linear'")

    @staticmethod
    def __check_dataframe(df, arg_name):
        """Raise ValueError unless ``df`` is a pandas DataFrame."""
        # isinstance also accepts DataFrame subclasses and does not depend
        # on pd.SparseDataFrame, which current pandas no longer provides.
        if not isinstance(df, pd.DataFrame):
            raise ValueError(arg_name + " must be a DataFrame")

    @staticmethod
    def __check_series(y, arg_name):
        """Raise ValueError unless ``y`` is a pandas Series."""
        if not isinstance(y, pd.Series):
            raise ValueError(arg_name + " must be a Series")

    def __fitted_method(self, method_name):
        """Return the regressor's bound ``method_name`` once fitted.

        Raise AttributeError if the wrapped regressor has no such
        attribute, ValueError if it is not callable or if fit() has not
        been called yet (checked in that order).
        """
        method = getattr(self.__regressor, method_name)
        if not callable(method):
            raise ValueError(method_name + " attribute is not callable")

        if not self.__fitOK:
            raise ValueError("You must call the fit function before !")

        return method

    def fit(self, df_train, y_train):
        """Fits Regressor.

        Parameters
        ----------
        df_train : pandas dataframe of shape = (n_train, n_features)
            The train dataset with numerical features.

        y_train : pandas series of shape = (n_train, )
            The target for regression tasks.

        Returns
        -------
        object
            self

        Raises
        ------
        ValueError
            If df_train is not a DataFrame or y_train is not a Series.

        """
        # sanity checks
        self.__check_dataframe(df_train, "df_train")
        self.__check_series(y_train, "y_train")

        self.__regressor.fit(df_train.values, y_train)
        self.__col = df_train.columns
        self.__fitOK = True

        return self

    def __bagging_importances(self, estimator):
        """Average, per column, the non-zero importances over all bags."""
        per_bag = []

        for sub_estimator, feature_idx in zip(estimator.estimators_,
                                              estimator.estimators_features_):
            try:
                # LGB, RF, ET, Tree and AdaBoost
                scores = sub_estimator.feature_importances_
            except AttributeError:
                scores = np.abs(sub_estimator.coef_)  # Linear

            # Map the bag-local feature positions back to column labels.
            per_bag.append({self.__col[c]: scores[j]
                            for j, c in enumerate(feature_idx)})

        importance = {}

        for col in self.__col:
            non_zero = [bag[col] for bag in per_bag
                        if col in bag and bag[col] != 0]
            # No non-zero score at all means the column never mattered;
            # report 0.0 rather than the NaN of an empty mean.
            importance[col] = np.mean(non_zero) if non_zero else 0.0

        return importance

    def feature_importances(self):
        """Computes feature importances.

        Regressor must be fitted before.

        Returns
        -------
        dict
            Dictionary containing a measure of feature importance (value)
            for each feature (key). Empty for a strategy that is not
            handled here.

        Raises
        ------
        ValueError
            If fit() has not been called.

        """
        if not self.__fitOK:
            raise ValueError("You must call the fit function before !")

        strategy = self.get_params()["strategy"]
        estimator = self.get_estimator()

        if strategy == "Linear":
            scores = np.abs(estimator.coef_)

        elif strategy in ("LightGBM", "RandomForest", "ExtraTrees", "Tree"):
            scores = estimator.feature_importances_

        elif strategy == "AdaBoost":
            weights = estimator.estimator_weights_
            members = estimator.estimators_
            norm = weights.sum()

            try:
                # LGB, RF, ET, Tree and AdaBoost
                scores = sum(w * est.feature_importances_
                             for w, est in zip(weights, members)) / norm
            except AttributeError:
                # Linear
                scores = sum(w * np.abs(est.coef_)
                             for w, est in zip(weights, members)) / norm

        elif strategy == "Bagging":
            return self.__bagging_importances(estimator)

        else:
            return {}

        return {col: scores[i] for i, col in enumerate(self.__col)}

    def predict(self, df):
        """Predicts the target.

        Parameters
        ----------
        df : pandas dataframe of shape = (n, n_features)
            The dataset with numerical features.

        Returns
        -------
        array of shape = (n, )
            The target to be predicted.

        Raises
        ------
        ValueError
            If the regressor is not fitted or df is not a DataFrame.

        """
        predict = self.__fitted_method("predict")

        # sanity checks
        self.__check_dataframe(df, "df")

        return predict(df.values)

    def transform(self, df):
        """Transform dataframe df.

        Parameters
        ----------
        df : pandas dataframe of shape = (n, n_features)
            The dataset with numerical features.

        Returns
        -------
        object
            Result of the wrapped regressor's ``transform`` called on
            ``df.values``.

        Raises
        ------
        AttributeError
            If the wrapped regressor has no ``transform`` attribute.
        ValueError
            If the regressor is not fitted or df is not a DataFrame.

        """
        transform = self.__fitted_method("transform")

        # sanity checks
        self.__check_dataframe(df, "df")

        return transform(df.values)

    def score(self, df, y, sample_weight=None):
        """Return the score of the wrapped regressor on (df, y).

        Parameters
        ----------
        df : pandas dataframe of shape = (n, n_features)
            The dataset with numerical features.

        y : pandas series of shape = (n,)
            The target values.

        sample_weight : optional, default = None
            Forwarded unchanged to the wrapped regressor's ``score``.

        Returns
        -------
        float
            Value returned by the wrapped regressor's ``score``
            (presumably R^2 of self.predict(df) wrt. y).

        Raises
        ------
        ValueError
            If the regressor is not fitted, df is not a DataFrame or y is
            not a Series.

        """
        score = self.__fitted_method("score")

        # sanity checks
        self.__check_dataframe(df, "df")
        self.__check_series(y, "y")

        return score(df.values, y, sample_weight)

    def get_estimator(self):
        """Return a shallow copy of the wrapped regressor."""
        return copy(self.__regressor)
예제 #5
0
def plot_transfer_graph_prob_fitted(path, name, analysis_folder):
    """Plot per-class fitted and smoothed probability curves for 12 runs.

    Loads the pickled results object stored at
    ``<path>/0_results/<analysis_folder>/<name>/<name>_classifier.pyobj``
    and splits its per-sample probabilities into 12 consecutive runs of
    equal length. The first half of the runs is labelled ``_pre`` and the
    second half ``_post``. For every run and every predicted class, the
    probability of the samples predicted as that class (0 elsewhere) is
    fitted with a ridge regression on a Vandermonde basis (drawn on the
    first figure) and smoothed with a univariate spline (drawn on the
    second figure), on a 3x2 grid of subplots.

    Parameters
    ----------
    path : str
        Root directory that contains the ``0_results`` folder.
    name : str
        Name of the result sub-folder; also the prefix of the pickle file.
    analysis_folder : str
        Name of the analysis folder inside ``0_results``.

    Returns
    -------
    None
        The curves are drawn on two newly created figures.
    """
    from sklearn.linear_model import Ridge
    from scipy.interpolate import UnivariateSpline

    result_path = os.path.join(path,
                               '0_results',
                               analysis_folder,
                               name,
                               name + '_classifier.pyobj')

    # Pickle data is binary: read it in 'rb' mode and always close the file.
    # NOTE(review): pickle.load can execute arbitrary code; only load
    # result files that come from a trusted source.
    with open(result_path, 'rb') as result_file:
        results = pickle.load(result_file)

    runs = 12

    probabilities = results.probabilities
    # Each entry p is read as p[0] (used as an index) and p[1] (indexed by
    # p[0]) - presumably (predicted class, per-class probabilities).
    prob = np.array([p[1][p[0]] for p in probabilities])
    pred = np.array([p[0] for p in probabilities])
    lab = np.unique(results.predictions)
    # Floor division keeps the slice bounds integral on Python 2 and 3.
    run_length = len(prob) // runs

    ridge = Ridge()

    f = plt.figure(figsize=(11, 8))
    f2 = plt.figure(figsize=(11, 8))

    # NOTE(review): these collections are filled but not read again in the
    # visible code - confirm whether they should be returned.
    data_sm = dict()
    data_or = dict()
    for c in lab:
        data_sm[c] = []
        data_or[c] = []

    classes = np.unique(pred)

    # Create each subplot once and reuse it for later runs instead of
    # relying on add_subplot handing back an already existing axes.
    axes_fit = {}
    axes_spline = {}

    for i in range(runs):
        if i < runs // 2:
            aggregate = 1
            suffix = '_pre'
        else:
            aggregate = 2
            suffix = '_post'

        run = slice(i * run_length, (i + 1) * run_length)

        for c in classes:
            position = (c * 2) + aggregate
            if position not in axes_fit:
                axes_fit[position] = f.add_subplot(3, 2, position)
                axes_spline[position] = f2.add_subplot(3, 2, position)
            a = axes_fit[position]
            a2 = axes_spline[position]
            a.set_title(lab[c] + suffix)

            # Keep the probability only where class c was predicted.
            v = prob[run] * (pred[run] == c)
            v[-1] = 0
            yy = v.copy()

            xx = np.linspace(0, len(v), len(v))
            s = UnivariateSpline(xx, yy, s=5)
            ys = s(xx)

            # The model has to be fitted before predicting; retry with a
            # different basis size if the solver fails on the first one.
            try:
                design = np.vander(xx, 7)
                ridge.fit(design, yy)
            except np.linalg.LinAlgError:
                design = np.vander(xx, 9)
                ridge.fit(design, yy)
            y_fit = ridge.predict(design)

            data_sm[lab[c]].append(ys)
            data_or[lab[c]].append(v)

            a.plot(y_fit)
            a2.plot(ys)

            a.set_ybound(upper=1.1, lower=-0.1)
            a2.set_ybound(upper=1.1, lower=-0.1)
예제 #6
0
class Regressor(object):

    """Wraps scikit regressors behind a single fit/predict interface.

    The wrapped estimator is selected by name (see ``__init__``). Optionally
    several copies of the estimator are fitted on reshuffled data with
    different seeds and their predictions are averaged ("bagged" fitting).
    """

    def __init__(self, modelname='Linear', num_bagged_est=None, random_state=None, **kwargs):
        """Construct a regressor

        Parameters
        ----------
        modelname : str, model name to be used as regressor
            Available models:
            - "XGBoost",
            - "LightGBM",
            - "Keras",
            - "RandomForest",
            - "ExtraTrees",
            - "Tree",
            - "Bagging",
            - "AdaBoost"
            - "Linear"
        num_bagged_est: int or None
            Number of estimators to be averaged after bagged fitting.
            If None then bagged fitting is not performed.
        random_state:  int or None, optional, default=None
            If int, random_state is the seed used by the random number generator;
            If None, the random number generator is the one used by the models.
        **kwargs : default = None
            Parameters of the corresponding regressor.
            Examples : n_estimators, max_depth, ...

        Raises
        ------
        ValueError
            If scikit-learn or the requested optional backend is not
            installed, if the model name is unknown, or if num_bagged_est /
            random_state is neither None nor an integer.
        """
        if not _IS_SKLEARN_INSTALLED:
            raise ValueError('Scikit-learn is required for this module')

        # Validate everything up front so that a failed construction never
        # leaves a half-initialised object behind.
        self.__check_backend_installed(modelname)
        self.__validate_optional_int(num_bagged_est, "num_bagged_est")
        self.__validate_optional_int(random_state, "random_state")

        self.__modelname = modelname
        self.__num_bagged_est = num_bagged_est
        self.__random_state = random_state

        self.__regressor = None
        self.__set_regressor(modelname)
        self.set_params(**kwargs)
        # Forward the seed to the wrapped estimator when it supports one.
        self.set_params(random_state=random_state)

        self.__fitOK = False
        self.__bagged_est = None

    @staticmethod
    def __check_backend_installed(modelname):
        """Raise ValueError if modelname needs an optional package that is missing."""
        if modelname == "XGBoost" and not _IS_XGBOOST_INSTALLED:
            raise ValueError('Package XGBoost is not installed.')
        elif modelname == "LightGBM" and not _IS_LIGHTGBM_INSTALLED:
            raise ValueError('Package LightGBM is not installed.')
        elif modelname == "Keras" and not _IS_KERAS_INSTALLED:
            raise ValueError('Package Keras is not installed.')

    @staticmethod
    def __validate_optional_int(value, name):
        """Raise ValueError unless value is None or a (non-bool) integer."""
        if value is None:
            return
        # bool is a subclass of int but was never accepted here.
        if isinstance(value, bool) or not isinstance(value, int):
            raise ValueError(name + " must be either None or an integer.")

    def get_params(self, deep=True):
        """Return the wrapper settings merged with the wrapped estimator's params.

        Parameters
        ----------
        deep : bool, default=True
            Accepted for scikit-learn API compatibility; not used.

        Returns
        -------
        dict
            "modelname", "num_bagged_est", "random_state" plus every
            parameter reported by the wrapped estimator's get_params().
        """
        params = {"modelname": self.__modelname,
                  "num_bagged_est": self.__num_bagged_est,
                  "random_state": self.__random_state}
        params.update(self.__regressor.get_params())

        return params

    def set_params(self, **params):
        """Update wrapper and/or wrapped-estimator parameters.

        Any call invalidates a previous fit. The keys "modelname",
        "num_bagged_est", "random_state" and (for Keras) "build_fn" are
        handled by the wrapper; everything else is forwarded to the wrapped
        estimator's set_params().

        Raises
        ------
        ValueError
            If a requested backend is missing, the model name is unknown, or
            num_bagged_est / random_state is neither None nor an integer.
        """
        self.__fitOK = False
        self.__bagged_est = None

        if 'modelname' in params:
            modelname = params.pop('modelname')
            # Check availability *before* instantiating the backend, otherwise
            # a missing package surfaces as a NameError instead.
            self.__check_backend_installed(modelname)
            self.__set_regressor(modelname)

        if 'num_bagged_est' in params:
            num_bagged_est = params.pop('num_bagged_est')
            # Validate before storing so an invalid value is never kept.
            self.__validate_optional_int(num_bagged_est, "num_bagged_est")
            self.__num_bagged_est = num_bagged_est

        if 'random_state' in params:
            random_state = params['random_state']
            self.__validate_optional_int(random_state, "random_state")
            self.__random_state = random_state
            # Only forward the seed to estimators that actually accept it.
            if 'random_state' not in self.__regressor.get_params():
                del params['random_state']

        if 'build_fn' in params and self.get_estimator_name == 'Keras':
            setattr(self.__regressor, 'build_fn', params.pop('build_fn'))

        self.__regressor.set_params(**params)

    def __set_regressor(self, modelname):
        """Instantiate the wrapped estimator that corresponds to modelname."""
        self.__modelname = modelname

        if modelname == 'XGBoost':
            self.__regressor = XGBRegressor()

        elif modelname == "LightGBM":
            self.__regressor = LGBMRegressor()

        elif modelname == "Keras":
            self.__regressor = KerasRegressor(build_fn=Sequential())

        elif modelname == 'RandomForest':
            self.__regressor = RandomForestRegressor()

        elif modelname == 'ExtraTrees':
            self.__regressor = ExtraTreesRegressor()

        elif modelname == 'Tree':
            self.__regressor = DecisionTreeRegressor()

        elif modelname == "Bagging":
            self.__regressor = BaggingRegressor()

        elif modelname == "AdaBoost":
            self.__regressor = AdaBoostRegressor()

        elif modelname == "Linear":
            self.__regressor = Ridge()

        else:
            raise ValueError(
                "Model name invalid. Please choose between LightGBM " +
                "(if installed), XGBoost(if installed), Keras(if installed)," +
                "RandomForest, ExtraTrees, Tree, Bagging, AdaBoost or Linear")

    def fit(self, X, y, **kwargs):
        """Fit model. In case num_bagged_est is not None then additionally
        performing a type of bagging ensemble - ensemble from the same models,
        but with different seed values/reshuffled data which aims to decrease
        variance of the predictions.

        Parameters
        ----------
        X : array-like or sparse matrix of shape = [n_samples, n_features]
            Input feature matrix used for training.
        y : array-like of shape = [n_samples, ]
            The numerical encoded target for regression tasks.
        **kwargs : default = None
            Additional fitting arguments accepted by model. Not tested.

        Returns
        -------
        object
            self
        """
        y = self.__process_target(y)

        if self.__num_bagged_est is None:
            self.__regressor.fit(X, y, **kwargs)

        else:
            if not hasattr(self.__regressor, 'random_state'):
                warnings.warn("The regressor " + str(self.__modelname) +
                              " has no random_state attribute and only random " +
                              " shuffling will be used.")

            # NOTE: in bagged mode only the copies below are fitted; the
            # wrapped estimator itself (used by score/transform/
            # feature_importances_) is left untouched.
            self.__bagged_est = []
            for i in range(self.__num_bagged_est):
                # With no base seed there is nothing to offset; let the
                # shuffle and the estimator draw from the global RNG instead
                # of failing on ``None + i``.
                if self.__random_state is None:
                    seed = None
                else:
                    seed = self.__random_state + i
                X_shuff, y_shuff = shuffle(X, y, random_state=seed)
                # Every bagged member must be an independent estimator,
                # otherwise all list entries alias one refitted object.
                est = self.get_estimator_copy()
                if hasattr(est, 'random_state'):
                    est.set_params(random_state=seed)
                est.fit(X_shuff, y_shuff, **kwargs)
                self.__bagged_est.append(est)

        self.__fitOK = True

        return self

    def predict(self, X):

        """Predicts the target.

        Parameters
        ----------
        X : array-like or sparse matrix of shape = [n_samples, n_features]

        Returns
        -------
        array of shape = [n_samples, ]
            The target to be predicted. With bagged fitting this is the mean
            of the predictions of all bagged estimators.

        Raises
        ------
        ValueError
            If the model has not been fitted or predict is not callable.
        """
        if not callable(getattr(self.__regressor, "predict")):
            raise ValueError("predict attribute is not callable")

        if not self.__fitOK:
            raise ValueError("You must call the fit function before !")

        if self.__num_bagged_est is None:
            return self.__regressor.predict(X)

        bagged_pred = np.zeros(X.shape[0])
        for est in self.__bagged_est:
            bagged_pred += est.predict(X) / self.__num_bagged_est

        return bagged_pred

    def transform(self, X):

        """Transforms X.
        Parameters
        ----------
        X : array-like or sparse matrix of shape = [n_samples, n_features]

        Returns
        -------
        array-like or sparse matrix of shape = [n_samples, n_features]
            The transformed X.

        Raises
        ------
        AttributeError
            If the wrapped estimator has no transform attribute.
        ValueError
            If the model has not been fitted or transform is not callable.
        """
        if not callable(getattr(self.__regressor, "transform")):
            raise ValueError("transform attribute is not callable")

        if not self.__fitOK:
            raise ValueError("You must call the fit function before !")

        return self.__regressor.transform(X)

    def score(self, X, y, sample_weight=None):

        """Returns the coefficient of determination R^2 of the prediction.

        Parameters
        ----------
        X : array-like or sparse matrix of shape = [n_samples, n_features]
            Input feature matrix used for training and cv.
        y : array-like of shape = [n_samples, ]
            The numerical encoded target for regression tasks.
        sample_weight : array-like or None, default=None
            Forwarded to the wrapped estimator's score().

        Returns
        -------
        float
            Whatever the wrapped estimator's score() returns
            (R^2 for scikit-learn regressors).

        Raises
        ------
        ValueError
            If the model has not been fitted or score is not callable.
        """
        if not callable(getattr(self.__regressor, "score")):
            raise ValueError("score attribute is not callable")

        if not self.__fitOK:
            raise ValueError("You must call the fit function before !")

        return self.__regressor.score(X, y, sample_weight)

    def cross_val_predict(self, X, y, cv=None, scoring=None, **kwargs):

        """Performing cross validation hold out predictions for stacking.
        Parameters
        ----------
        X : array of shape = [n_samples, n_features]
            Input feature matrix used for training and cv. Must support
            indexing with an array of row indices.
        y : array-like of shape = [n_samples, ]
            The numerical encoded target for regression tasks.
        cv : int, cross-validation generator or an iterable, optional
            Determines the cross-validation splitting strategy; the value is
            passed through check_cv(cv, y, classifier=False).
        scoring : callable, default: None
            Currently unused; kept for interface compatibility.
        **kwargs : default = None
            Additional fitting arguments accepted by model. Not tested.
        Returns
        -------
        array of shape = [n_samples, ]
            The hold out target
        """
        y = self.__process_target(y)

        y_pred = np.zeros(X.shape[0])

        cv = check_cv(cv, y, classifier=False)
        n_splits = cv.get_n_splits(X, y)

        print("Starting hold out prediction with {} splits.".format(n_splits))
        for train_index, cv_index in cv.split(X, y):
            # Fit a fresh copy per fold so the wrapped estimator (and the
            # other folds) are not affected by this fold's training.
            est = self.get_estimator_copy()
            est.fit(X[train_index], y[train_index], **kwargs)
            y_pred[cv_index] = est.predict(X[cv_index])

        return y_pred

    def cross_validate(self, X, y, cv=None, scoring=None, **kwargs):
        """Performing a cross validation method.

        Parameters
        ----------
        X : array-like or sparse matrix of shape = [n_samples, n_features]
            Input feature matrix used for training.
        y : array-like of shape = [n_samples, ]
            The numerical encoded target for regression tasks.
        cv : int, cross-validation generator or an iterable, optional
            Determines the cross-validation splitting strategy.
        scoring :
            For scikit learn models:
                string, callable, list/tuple, dict or None, default: None
                A single string or a callable to evaluate the predictions on the test set.
                None, the estimator's default scorer (if available) is used.
            For LightGBM:
                callable or None, optional (default=None)
                Customized evaluation function, passed as ``feval``.
            For XGBoost:
                callable or None, optional (default=None)
                Customized evaluation function, passed as ``feval``.
        **kwargs : default = None
            Additional arguments forwarded to lgb.cv / xgb.cv.
            Ignored for scikit-learn models.

        Returns
        -------
        object
            The result of lgb.cv, xgb.cv or sklearn's cross_validate,
            depending on the wrapped model.
        """
        y = self.__process_target(y)

        if self.get_estimator_name == 'LightGBM':
            params = self.__regressor.get_params()
            data = lgb.Dataset(X, label=y)
            cv = check_cv(cv, y, classifier=False)
            ret = lgb.cv(params, data, feval=scoring, folds=cv, **kwargs)

        elif self.get_estimator_name == 'XGBoost':
            params = self.__regressor.get_xgb_params()
            data = xgb.DMatrix(X, label=y)
            cv = check_cv(cv, y, classifier=False)
            ret = xgb.cv(params, data, feval=scoring, folds=cv, **kwargs)

        else:
            ret = cross_validate(self.__regressor, X, y, cv=cv, scoring=scoring)

        return ret

    def __process_target(self, y):
        """Convert the target to a float numpy array."""
        return np.array(y, dtype='float')

    def get_estimator(self):
        """Return the wrapped estimator object itself (not a copy)."""
        return self.__regressor

    def get_estimator_copy(self):
        """Return a copy of the wrapped estimator made with make_copy."""
        return make_copy(self.__regressor)

    @property
    def feature_importances_(self):
        """Feature importances of the wrapped estimator.

        Raises
        ------
        ValueError
            If the model has not been fitted, or the wrapped estimator does
            not expose feature_importances_.
        """
        if not self.__fitOK:
            raise ValueError("You must call the fit function before !")

        if hasattr(self.__regressor, 'feature_importances_'):
            return self.__regressor.feature_importances_

        raise ValueError('The regressor ' + self.get_estimator_name +
                         ' does not have feature_importances_ attribute.')

    @property
    def get_estimator_name(self):
        """Name of the currently selected model (a property, not a method)."""
        return self.__modelname
예제 #7
0
def plot_transfer_graph_prob_fitted(path, name, analysis_folder):
    """Plot fitted per-run probability curves from pickled classifier results.

    Loads ``<path>/0_results/<analysis_folder>/<name>/<name>_classifier.pyobj``,
    splits the per-sample probabilities into 12 equally long runs and, for
    every run and every predicted class, draws

    * a ridge-regression polynomial fit on the first figure, and
    * a smoothing-spline fit on the second figure.

    Runs 0-5 are drawn in the left subplot column (title suffix ``_pre``),
    runs 6-11 in the right one (``_post``).

    Parameters
    ----------
    path : str
        Base directory that contains the ``0_results`` folder.
    name : str
        Result name; used both as sub-folder and as file-name prefix.
    analysis_folder : str
        Analysis sub-folder inside ``0_results``.

    Notes
    -----
    Each entry ``p`` of ``results.probabilities`` is indexed as ``p[0]`` and
    ``p[1][p[0]]`` - presumably (predicted class index, per-class
    probabilities); confirm against the code that writes the results.
    The function returns nothing; the smoothed and raw series are collected
    per label but not used further here.
    """
    from sklearn.linear_model import Ridge
    from scipy.interpolate import UnivariateSpline

    result_path = os.path.join(path, '0_results', analysis_folder, name,
                               name + '_classifier.pyobj')
    # NOTE(review): pickle.load can execute arbitrary code - only use on
    # result files produced by a trusted run.
    # Binary mode is required by pickle on Python 3; the context manager
    # closes the handle, which the previous bare open() never did.
    with open(result_path, 'rb') as result_file:
        results = pickle.load(result_file)

    runs = 12

    probabilities = results.probabilities
    prob = np.array([p[1][p[0]] for p in probabilities])
    pred = np.array([p[0] for p in probabilities])
    lab = np.unique(results.predictions)
    # Floor division keeps the slice bounds integral on Python 3 as well.
    run_length = len(prob) // runs

    ridge = Ridge()

    f = plt.figure(figsize=(11, 8))
    f2 = plt.figure(figsize=(11, 8))
    data_sm = dict((c, []) for c in lab)
    data_or = dict((c, []) for c in lab)
    classes = np.unique(pred)

    for i in range(runs):
        # First half of the runs -> left column, second half -> right column.
        if i < 6:
            aggregate = 1
            suffix = '_pre'
        else:
            aggregate = 2
            suffix = '_post'

        start = i * run_length
        stop = (i + 1) * run_length

        for c in classes:
            a = f.add_subplot(3, 2, (c * 2) + aggregate)
            a2 = f2.add_subplot(3, 2, (c * 2) + aggregate)
            a.set_title(lab[c] + suffix)

            # Keep the probability only where class c was predicted.
            v = prob[start:stop] * (pred[start:stop] == c)
            v[len(v) - 1] = 0
            yy = v.copy()

            xx = np.linspace(0, len(v), len(v))
            s = UnivariateSpline(xx, yy, s=5)
            ys = s(xx)

            # The estimator has to be fitted before predict(); Ridge exposes
            # fit(X, y) for that, not transform(X, y).
            try:
                design = np.vander(xx, 7)
                ridge.fit(design, yy)
                y_fit = ridge.predict(design)
            except np.linalg.LinAlgError:
                # Fall back to a different polynomial basis if the solver fails.
                design = np.vander(xx, 9)
                ridge.fit(design, yy)
                y_fit = ridge.predict(design)

            data_sm[lab[c]].append(ys)
            data_or[lab[c]].append(v)

            a.plot(y_fit)
            a2.plot(ys)

            a.set_ybound(upper=1.1, lower=-0.1)
            a2.set_ybound(upper=1.1, lower=-0.1)