Example #1
    def fit(self, data, **kwargs):
        """
        Fit the classifier to given training data.
        :param data: instance of DataManager
        :return: self
        """
        metric = kwargs.get('metric', accuracy_score)

        # TODO: automated feature engineering
        # A raw DataFrame is first run through the data-preprocessing pipeline.
        if isinstance(data, pd.DataFrame):
            self.pre_pipeline = DP_Pipeline(None)
            data = self.pre_pipeline.execute(data, phase='train')

        # Check the task type: {binary, multiclass}
        task_type = type_of_target(data.train_y)
        if task_type in [
                'multiclass-multioutput', 'continuous',
                'continuous-multioutput', 'unknown'
        ]:
            raise ValueError("UNSUPPORTED TASK TYPE: %s!" % task_type)
        self.task_type = task_type
        kwargs['task_type'] = task_type

        # Options for multiclass averaging (set but not used in this snippet).
        average = 'weighted'

        metric = get_metric(metric)
        kwargs['metric'] = metric

        super().fit(data, **kwargs)

        return self
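
As context for the task-type guard above: assuming `type_of_target` is `sklearn.utils.multiclass.type_of_target` (which the usage suggests), a minimal self-contained sketch of the values it returns:

    # Minimal sketch of the task-type guard; assumes type_of_target is
    # sklearn.utils.multiclass.type_of_target.
    from sklearn.utils.multiclass import type_of_target

    print(type_of_target([0, 1, 1, 0]))      # 'binary'     -> accepted
    print(type_of_target([0, 1, 2, 2]))      # 'multiclass' -> accepted
    print(type_of_target([0.5, 1.2, 3.3]))   # 'continuous' -> rejected with ValueError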
Example #2
    def fit(self, data, **kwargs):
        """
        Fit the regressor to given training data.
        :param data: instance of DataManager
        :return: self
        """
        metric = kwargs.get('metric', mean_squared_error)

        # TODO: automated feature engineering
        # A raw DataFrame is first run through the data-preprocessing pipeline;
        # stratified splitting is disabled for regression targets.
        if isinstance(data, pd.DataFrame):
            self.pre_pipeline = DP_Pipeline(None)
            data = self.pre_pipeline.execute(data, phase='train', stratify=False)
        # Check the task type: {continuous}
        task_type = type_of_target(data.train_y)
        if task_type != 'continuous':
            raise ValueError("UNSUPPORTED TASK TYPE: %s!" % task_type)
        self.task_type = task_type
        kwargs['task_type'] = task_type

        metric = get_metric(metric)
        kwargs['metric'] = metric

        super().fit(data, **kwargs)

        return self
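
The default metric here is mean squared error; assuming it is scikit-learn's `mean_squared_error`, its behavior as a plain callable looks like this:

    # Assumes mean_squared_error is sklearn.metrics.mean_squared_error.
    from sklearn.metrics import mean_squared_error

    y_true = [3.0, 2.5]
    y_pred = [2.8, 2.7]
    print(mean_squared_error(y_true, y_pred))  # 0.04 = ((0.2)**2 + (0.2)**2) / 2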
Example #3
    def fit(self, data, **kwargs):
        """Fit the classifier to given training data.

        Parameters
        ----------

        data : instance of DataManager or DataFrame

        metric : callable, optional (default='autosklearn.metrics.accuracy_score').

        feat_type : list, optional (default=None)
            List of str of length `X.shape[1]` describing the attribute types.
            Possible types are `Categorical` and `Numerical`. `Categorical`
            attributes are automatically one-hot encoded. The values used for
            a categorical attribute must be integers, obtained for example
            with `sklearn.preprocessing.LabelEncoder
            <http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html>`_.

        dataset_name : str, optional (default=None)
            Used for nicer output. If None, a name is derived from the MD5
            hash of the dataset.

        Returns
        -------
        self

        """

        metric = kwargs.get('metric', accuracy_score)
        # feat_type = None if 'feat_type' not in kwargs else kwargs['feat_type']
        # dataset_name = None if 'dataset_name' not in kwargs else kwargs['dataset_name']
        # # The number of evaluations.
        # runcount = None if 'runcount' not in kwargs else kwargs['runcount']

        # TODO: automated feature engineering
        # A raw DataFrame is first run through the data-preprocessing pipeline.
        if isinstance(data, pd.DataFrame):
            self.pre_pipeline = DP_Pipeline(None)
            data = self.pre_pipeline.execute(data, phase='train')

        # Check the task type: {binary, multiclass}
        task_type = type_of_target(data.train_y)
        if task_type in [
                'multiclass-multioutput', 'continuous',
                'continuous-multioutput', 'unknown'
        ]:
            raise ValueError("UNSUPPORTED TASK TYPE: %s!" % task_type)
        self.task_type = task_type
        kwargs['task_type'] = task_type

        # Options for multiclass averaging (set but not used in this snippet).
        average = 'weighted'

        metric = get_metric(metric)
        kwargs['metric'] = metric

        super().fit(data, **kwargs)

        return self
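
The docstring above points to `sklearn.preprocessing.LabelEncoder` for turning categorical values into the integers this interface expects; a self-contained sketch:

    # Self-contained sketch of the LabelEncoder usage the docstring refers to.
    from sklearn.preprocessing import LabelEncoder

    enc = LabelEncoder()
    codes = enc.fit_transform(['red', 'blue', 'red', 'green'])
    print(codes)         # [2 0 2 1] -- classes sorted: blue=0, green=1, red=2
    print(enc.classes_)  # ['blue' 'green' 'red']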
Example #4
    def _get_performance(self, train_x, train_y, valid_x, valid_y, model):
        """Fit `model` on the training split and score it on the validation split."""
        metric = get_metric(self.metricstr)
        model.fit(train_x, train_y)
        if self.metricstr == 'auc':
            # AUC is computed from scores for the positive class, not hard labels.
            pred = model.predict_proba(valid_x)[:, 1]
        else:
            pred = model.predict(valid_x)

        # Note the argument order: the project's metric is called as
        # metric(predictions, ground_truth).
        return metric(pred, valid_y)
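
The `predict_proba(...)[:, 1]` branch exists because ranking metrics such as AUC need scores for the positive class rather than hard labels. A runnable sketch in plain scikit-learn (the names below are illustrative, not from the source project):

    # Illustrative only: shows why AUC is computed from predict_proba scores.
    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import roc_auc_score
    from sklearn.model_selection import train_test_split

    X, y = make_classification(n_samples=200, random_state=0)
    X_tr, X_va, y_tr, y_va = train_test_split(X, y, random_state=0)
    model = LogisticRegression().fit(X_tr, y_tr)
    scores = model.predict_proba(X_va)[:, 1]   # P(class == 1) for each sample
    print(roc_auc_score(y_va, scores))         # AUC over the validation scores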
Example #5
    def __init__(self,
                 max_iter,
                 metrics,
                 stratify,
                 model=LogisticRegression(multi_class='auto',
                                          solver='liblinear')):
        self.max_iter = max_iter
        self.model = model
        self.metricstr = metrics
        self.stratify = stratify
        self.metric = get_metric(metrics)
        self.feature_sets = None
        self.feature_cols = dict()
        self.train_data = None
        self.valid_data = None
        self.numerical_features = None
        self.init_length = None
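
One caveat worth noting: the `model=LogisticRegression(...)` default above is evaluated once, at function-definition time, so every instance constructed without an explicit `model` shares the same estimator object. A minimal demonstration of that Python behavior (`make` is a toy stand-in, not a function from the source project):

    # Demonstrates Python's evaluate-once default arguments.
    from sklearn.linear_model import LogisticRegression

    def make(model=LogisticRegression()):
        return model

    a, b = make(), make()
    print(a is b)  # True: both calls return the very same estimator instance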
Example #6
    def fit(self, data, **kwargs):
        """Fit the regressor to given training data.

        Parameters
        ----------

        data : instance of DataManager.

        metric : callable, optional (default='autosklearn.metrics.mean_squared_error').

        dataset_name : str, optional (default=None)
            Used for nicer output. If None, a name is derived from the MD5
            hash of the dataset.

        Returns
        -------
        self

        """
        # feat_type = None if 'feat_type' not in kwargs else kwargs['feat_type']
        # dataset_name = None if 'dataset_name' not in kwargs else kwargs['dataset_name']
        # # The number of evaluations.
        # runcount = None if 'runcount' not in kwargs else kwargs['runcount']

        # TODO: automated feature engineering
        # A raw DataFrame is first run through the data-preprocessing pipeline;
        # stratified splitting is disabled for regression targets.
        if isinstance(data, pd.DataFrame):
            self.pre_pipeline = DP_Pipeline(None)
            data = self.pre_pipeline.execute(data, phase='train', stratify=False)
        # Check the task type: {continuous}
        task_type = type_of_target(data.train_y)
        if task_type != 'continuous':
            raise ValueError("UNSUPPORTED TASK TYPE: %s!" % task_type)
        self.task_type = task_type
        kwargs['task_type'] = task_type

        metric = kwargs.get('metric', mean_squared_error)
        metric = get_metric(metric)
        kwargs['metric'] = metric

        super().fit(data, **kwargs)

        return self
Example #7
    def _selection_process(self, dm):
        generated_train_data, generated_valid_data, generated_test_data = \
            self.solver.transform()
        # The transform results are immediately overwritten with features
        # cached on disk, which looks like an experiment-specific shortcut.
        features = np.load("features_27.npz")
        generated_train_data, generated_valid_data, generated_test_data = (
            features["train"], features["valid"], None)
        feature_num = dm.train_X.shape[1]

        if feature_num < 20:
            dm.train_X = generated_train_data
            dm.val_X = generated_valid_data
            dm.test_X = generated_test_data
        else:
            print("start selection process...............")
            selector = RandomForestSelector()
            selector.fit(dm.train_X, dm.train_y)

            lr = LogisticRegression()

            best_perf = get_metric(self.metrics, generated_train_data,
                                   dm.train_y, generated_valid_data, dm.val_y,
                                   lr)
            best_k = 0
            for percentile in range(1, 10):
                k = int((percentile / 10.0) * feature_num)
                selected_train_data = selector.transform(dm.train_X, k)
                selected_valid_data = selector.transform(dm.val_X, k)

                perf = get_metric(metrics=self.metrics,
                                  x_train=np.hstack((generated_train_data,
                                                     selected_train_data)),
                                  y_train=dm.train_y,
                                  x_valid=np.hstack((generated_valid_data,
                                                     selected_valid_data)),
                                  y_valid=dm.val_y,
                                  model=lr)
                if perf <= best_perf:
                    break
                else:
                    print("selected percentile:",
                          percentile,
                          "perf:",
                          perf,
                          flush=True)
                    best_perf = perf
                    best_k = k
            if best_k != 0:
                dm.train_X = np.hstack((generated_train_data,
                                        selector.transform(dm.train_X,
                                                           best_k)))
                dm.val_X = np.hstack((generated_valid_data,
                                      selector.transform(dm.val_X, best_k)))
                if dm.test_X is not None:
                    dm.test_X = np.hstack(
                        (generated_test_data,
                         selector.transform(dm.test_X, best_k)))

            else:
                dm.train_X = generated_train_data
                dm.val_X = generated_valid_data
                dm.test_X = generated_test_data
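
The combination step relies on `np.hstack` to join the generated and selected feature blocks column-wise; for 2-D arrays it concatenates along axis 1, so the sample counts must match:

    # Sketch of the column-wise feature concatenation used above.
    import numpy as np

    generated = np.ones((4, 3))    # 4 samples, 3 generated features
    selected = np.zeros((4, 2))    # 4 samples, 2 selected original features
    combined = np.hstack((generated, selected))
    print(combined.shape)          # (4, 5)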