Пример #1
0
class OnlineTpotClassifer(base.Classifier):
    """Classifier that buffers a fixed number of samples, then fits TPOT once.

    While fewer than ``n_training_samples`` observations have been seen the
    model only stores them; ``predict_proba_one`` then returns a uniform
    distribution over ``classes`` and ``predict_one`` returns ``classes[0]``.
    Once the buffer is full a ``TPOTClassifier`` is fitted on it, the buffer
    is released and later calls to ``learn_one`` are no-ops.
    """

    def __init__(self, n_training_samples, classes: list, max_time_mins: int = 15):
        """Store the configuration; no estimator exists until the buffer fills.

        Args:
            n_training_samples: Number of buffered samples that triggers the fit.
            classes: Class labels used to key the probability outputs.
            max_time_mins: Forwarded to ``TPOTClassifier(max_time_mins=...)``.
        """
        self.n_training_samples = n_training_samples
        self.max_time_mins = max_time_mins
        self.training_samples_x = []
        self.training_samples_y = []
        self.estimator = None
        self.classes = classes

    def learn_one(self, x: dict, y: base.typing.ClfTarget, **kwargs) -> base.Classifier:
        """Buffer one sample and fit TPOT as soon as enough samples are stored."""
        if self.estimator is not None:
            # Already fitted: further single samples are ignored.
            return self
        self.training_samples_x.append(x)
        self.training_samples_y.append(y)
        if len(self.training_samples_x) >= self.n_training_samples:
            x_train = np.stack([dict2numpy(i) for i in self.training_samples_x])
            self.estimator = TPOTClassifier(max_time_mins=self.max_time_mins)
            self.estimator.fit(x_train, self.training_samples_y)
            # The buffers are no longer needed once the estimator exists.
            self.training_samples_y = []
            self.training_samples_x = []
        return self

    def predict_proba_one(self, x: dict) -> typing.Dict[base.typing.ClfTarget, float]:
        """Return class probabilities for one sample (uniform before the fit)."""
        if self.estimator is not None:
            y_pred = self.estimator.predict_proba([list(x.values())])[0]
            # NOTE(review): assumes self.classes is ordered like the
            # estimator's probability columns - confirm against callers.
            return {self.classes[i]: p for i, p in enumerate(y_pred)}
        else:
            return {c: 1 / len(self.classes) for c in self.classes}

    def predict_proba_many(self, X):
        """Return a DataFrame of class probabilities, one column per class.

        A 2-D probability matrix with named columns needs ``pd.DataFrame``;
        ``pd.Series`` does not accept a ``columns`` argument.
        """
        return pd.DataFrame(self.estimator.predict_proba(X), columns=self.classes)

    @property
    def _multiclass(self):
        return True

    def learn_many(self, X, y):
        """Forward a mini-batch to the estimator's ``partial_fit``.

        NOTE(review): requires an already fitted estimator that exposes
        ``partial_fit``; confirm that TPOTClassifier provides it.
        """
        self.estimator.partial_fit(X=X.values, y=y.values, classes=self.classes)
        return self

    def predict_one(self, x):
        """Return the predicted label, or ``classes[0]`` before the fit."""
        if self.estimator is not None:
            y_pred = self.estimator.predict([list(x.values())])[0]
            return y_pred
        else:
            return self.classes[0]

    def predict_many(self, X):
        """Return the predicted labels for a batch as a ``pd.Series``."""
        return pd.Series(self.estimator.predict(X))
Пример #2
0
def build_classifier(data, feature_pipeline, generations, population_size,
                     name):
    """Fit TPOT behind a feature pipeline and store the resulting artefacts.

    The feature pipeline is fitted on ``X``, TPOT searches a restricted
    configuration on the transformed data, and the combined pipeline plus the
    predictions on the training data are stored under ``name`` through
    ``store_pkl`` and ``store_csv``.
    """
    X, y = data
    Xt = feature_pipeline.fit_transform(X).astype(float)
    categories = pandas.unique(y)

    config = filter_config(make_tpot_pmml_config(classifier_config_dict))
    excluded_estimators = (
        # Does not support nesting - see http://mantis.dmg.org/view.php?id=208
        "sklearn.naive_bayes.GaussianNB",
        "sklearn.neighbors.KNeighborsClassifier",
        # Does not support classifier.predict_proba(Xt)
        "sklearn.svm.LinearSVC",
        "sklearn.tree.DecisionTreeClassifier",
    )
    for estimator_name in excluded_estimators:
        del config[estimator_name]

    classifier = TPOTClassifier(
        generations=generations,
        population_size=population_size,
        random_state=13,
        config_dict=config,
        verbosity=2,
    )
    classifier.fit(Xt, y)

    # Chain the feature steps with the steps TPOT selected.
    combined_steps = feature_pipeline.steps + classifier.fitted_pipeline_.steps
    pipeline = make_pmml_pipeline(
        Pipeline(steps=combined_steps),
        active_fields=X.columns.values,
        target_fields=[y.name],
    )
    print(repr(pipeline))
    store_pkl(pipeline, name)

    result = DataFrame(classifier.predict(Xt), columns=[y.name])
    if len(categories) > 0:
        # NOTE(review): columns are labelled in order of first appearance in
        # y; confirm this matches the estimator's class order.
        probabilities = DataFrame(
            classifier.predict_proba(Xt),
            columns=["probability(" + str(category) + ")"
                     for category in categories],
        )
        result = pandas.concat([result, probabilities], axis=1)
    store_csv(result, name)
Пример #3
0
class TpotEstimator(BaseEstimator):
    """Estimator wrapper that ordinal-encodes selected columns and runs TPOT.

    A ``TPOTRegressor`` is used when ``task == 'regression'``, otherwise a
    ``TPOTClassifier``; ``kwargs`` are forwarded to the TPOT constructor.
    """

    def __init__(self, task, **kwargs):
        super(TpotEstimator, self).__init__(task)
        if task == 'regression':
            self.tpot = TPOTRegressor(**kwargs)
        else:
            self.tpot = TPOTClassifier(**kwargs)
        self.name = 'tpot'
        self.label_encoder = None
        self.obj_cols = None

    def _encoded_copy(self, X):
        """Return a copy of ``X`` with ``obj_cols`` run through the fitted encoder."""
        # Work on a copy: overwriting the caller's columns would encode the
        # same frame twice if it is passed to predict and predict_proba.
        X = X.copy()
        X[self.obj_cols] = self.label_encoder.transform(X[self.obj_cols])
        return X

    def train(self, X, y, X_test):
        """Fit the encoder on the selected columns of ``X`` and then fit TPOT.

        ``X_test`` is accepted for interface compatibility and is not used.
        The caller's ``X`` is left unmodified.
        """
        self.obj_cols = column_object_category_bool(X)
        self.label_encoder = SafeOrdinalEncoder()
        X = X.copy()
        X[self.obj_cols] = self.label_encoder.fit_transform(X[self.obj_cols])
        self.tpot.fit(X, y)

    def predict_proba(self, X):
        """Return TPOT's class probabilities for ``X`` (``X`` is not modified)."""
        proba = self.tpot.predict_proba(self._encoded_copy(X))
        print(f'proba.shape:{proba.shape}')
        return proba

    def predict(self, X):
        """Return TPOT's predictions for ``X`` (``X`` is not modified)."""
        return self.tpot.predict(self._encoded_copy(X))
Пример #4
0
def build_classifier(data, name):
    """Run a tiny TPOT search on ``data`` and store the pipeline and predictions.

    The fitted pipeline is wrapped with ``make_pmml_pipeline`` and stored as
    ``name + ".pkl"``; the predictions on the training data (plus one
    probability column per category) are stored as ``name + ".csv"``.
    """
    X, y = data
    categories = pandas.unique(y)

    config = make_tpot_pmml_config(classifier_config_dict)
    del config["sklearn.neighbors.KNeighborsClassifier"]

    classifier = TPOTClassifier(
        generations=1,
        population_size=3,
        random_state=13,
        config_dict=config,
        verbosity=2,
    )
    classifier.fit(X, y)

    pipeline = make_pmml_pipeline(
        classifier.fitted_pipeline_,
        active_fields=X.columns.values,
        target_fields=[y.name],
    )
    print(repr(pipeline))
    store_pkl(pipeline, name + ".pkl")

    result = DataFrame(classifier.predict(X), columns=[y.name])
    if len(categories) > 0:
        # NOTE(review): columns are labelled in order of first appearance in
        # y; confirm this matches the estimator's class order.
        probabilities = DataFrame(
            classifier.predict_proba(X),
            columns=["probability(" + str(category) + ")"
                     for category in categories],
        )
        result = pandas.concat([result, probabilities], axis=1)
    store_csv(result, name + ".csv")
Пример #5
0
    def run_AutoTpot(self):
        """Fit TPOT on the training data, persist it, time prediction, visualize.

        The best pipeline is exported as Python code into ``args.save_dir``
        and the fitted TPOT object is dumped into ``args.checkpoint_dir``.
        """
        tpot_model = TPOTClassifier(generations=1, verbosity=2, config_dict='TPOT sparse')
        tpot_model.fit(self.train, self.y_train)

        # Export the best pipeline as a standalone scikit-learn script.
        tpot_model.export(os.path.join(self.args.save_dir, 'tpot-sportswear.py'))

        print('The best pipeline discovered through auto-tpot is {}'.format(tpot_model.fitted_pipeline_))

        print('Saving the best model discovered through TPOT.')
        joblib.dump(tpot_model, os.path.join(self.args.checkpoint_dir, 'auto-tpot.pickle'))

        # Timed section: label prediction, probability prediction and rounding.
        started_at = timeit.default_timer()
        raw_predictions = tpot_model.predict(self.test)
        # Probabilities are computed inside the timed section but not used.
        _probabilities = tpot_model.predict_proba(self.test)
        # Binary class values: round each prediction to 0 or 1.
        predictions = [round(value) for value in raw_predictions]
        finished_at = timeit.default_timer()

        elapsed = finished_at - started_at
        print('Time per prediction : {}'.format(elapsed / self.test.shape[0]))

        self.visualize(predictions, tpot_model)
Пример #6
0
def test_predict_proba2():
    """Assert that the TPOT predict_proba function returns a numpy matrix filled with probabilities (float)"""

    tpot_obj = TPOTClassifier()
    pipeline_string = (
        'DecisionTreeClassifier(input_matrix, DecisionTreeClassifier__criterion=gini'
        ', DecisionTreeClassifier__max_depth=8,DecisionTreeClassifier__min_samples_leaf=5,'
        'DecisionTreeClassifier__min_samples_split=5)')
    tpot_obj._optimized_pipeline = creator.Individual.from_string(
        pipeline_string, tpot_obj._pset)
    tpot_obj._fitted_pipeline = tpot_obj._toolbox.compile(
        expr=tpot_obj._optimized_pipeline)
    tpot_obj._fitted_pipeline.fit(training_features, training_classes)

    result = tpot_obj.predict_proba(testing_features)

    rows = result.shape[0]
    columns = result.shape[1]

    # Let float_range raise directly: the test still fails on a bad entry,
    # but the traceback now shows the real exception instead of a bare
    # `assert False`.
    for i in range(rows):
        for j in range(columns):
            float_range(result[i][j])
Пример #7
0
class TPOTClassifierModel(Model):
    """``Model`` wrapper around a ``TPOTClassifier``.

    Saving replaces the wrapped TPOT object with its fitted sklearn pipeline,
    so a saved (and reloaded) instance wraps a plain pipeline.
    """

    def __init__(
        self,
        name: str,
        model_params: Dict[str, Any],
    ) -> None:
        super().__init__(name, model_params)
        self._model = TPOTClassifier(**model_params)

    def _force_fit(self, X: np.ndarray, y: np.ndarray) -> None:
        """Fit whatever is currently wrapped and expose its ``classes_``."""
        self._model.fit(X, y)
        # TODO: DESlib calls check_if_fitted(model, 'classes_'), so this
        #       wrapper has to behave like a general sklearn model.
        #
        #       If more classifiers are added, a ClassifierModel base class
        #       is probably needed to keep this consistent.
        self.classes_ = self._model.fitted_pipeline_.classes_

    def fit(self, X: np.ndarray, y: np.ndarray) -> None:
        """Run the TPOT search; refuse if only the saved pipeline is wrapped."""
        if isinstance(self._model, TPOTClassifier):
            self._force_fit(X, y)
            return
        raise RuntimeError(
            'Due to TPOT being unpickelable, saving this'
            ' means only the actual sklearn.Pipeline'
            ' was saved. Calling fit will fit this pipeline'
            ' rather than the TPOT algorithm. If this is'
            ' desired behaviour, please use `_force_fit`'
            ' instead')

    def save(self, path: str) -> None:
        """Pickle this wrapper to ``path``, keeping only the fitted pipeline."""
        # The TPOT object is swapped for its fitted pipeline before pickling;
        # the swap stays in effect on this instance afterwards.
        if isinstance(self._model, TPOTClassifier):
            self._model = self._model.fitted_pipeline_

        with open(path, 'wb') as handle:
            pickle.dump(self, handle)

    @classmethod
    def load(cls, path: str):
        """Unpickle a saved wrapper from ``path``.

        Uses ``pickle.load``; only load files from a trusted source.
        """
        with open(path, 'rb') as handle:
            restored = pickle.load(handle)
        return cast(TPOTClassifierModel, restored)

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Delegate to the wrapped model's ``predict``."""
        return self._model.predict(X)

    def predict_proba(self, X: np.ndarray) -> np.ndarray:
        """Delegate to the wrapped model's ``predict_proba``."""
        return self._model.predict_proba(X)
Пример #8
0
def test_predict_proba():
    """Assert that the TPOT predict_proba function returns a numpy matrix of shape (num_testing_rows, num_testing_classes)"""

    tpot_obj = TPOTClassifier()
    # Install a fixed decision-tree pipeline instead of running the search.
    pipeline_string = (
        'DecisionTreeClassifier(input_matrix, DecisionTreeClassifier__criterion=gini'
        ', DecisionTreeClassifier__max_depth=8,DecisionTreeClassifier__min_samples_leaf=5,'
        'DecisionTreeClassifier__min_samples_split=5)'
    )
    individual = creator.Individual.from_string(pipeline_string, tpot_obj._pset)
    tpot_obj._optimized_pipeline = individual
    tpot_obj._fitted_pipeline = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline)
    tpot_obj._fitted_pipeline.fit(training_features, training_classes)

    probabilities = tpot_obj.predict_proba(testing_features)

    # The expected column count is taken as the largest test label plus one.
    num_labels = np.amax(testing_classes) + 1
    expected_shape = (testing_features.shape[0], num_labels)
    assert probabilities.shape == expected_shape
Пример #9
0
def test_predict_proba():
    """Assert that the TPOT predict_proba function returns a numpy matrix of shape (num_testing_rows, num_testing_classes)"""

    tpot_obj = TPOTClassifier()
    # Install a fixed decision-tree pipeline instead of running the search.
    pipeline_string = (
        'DecisionTreeClassifier(input_matrix, DecisionTreeClassifier__criterion=gini'
        ', DecisionTreeClassifier__max_depth=8,DecisionTreeClassifier__min_samples_leaf=5,'
        'DecisionTreeClassifier__min_samples_split=5)'
    )
    individual = creator.Individual.from_string(pipeline_string, tpot_obj._pset)
    tpot_obj._optimized_pipeline = individual
    tpot_obj._fitted_pipeline = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline)
    tpot_obj._fitted_pipeline.fit(training_features, training_classes)

    probabilities = tpot_obj.predict_proba(testing_features)

    # The expected column count is taken as the largest test label plus one.
    num_labels = np.amax(testing_classes) + 1
    expected_shape = (testing_features.shape[0], num_labels)
    assert probabilities.shape == expected_shape
Пример #10
0
class TPOTModelParam(ModelParamObject):
    """Model-parameter object whose model is a ``TPOTClassifier``."""

    def __init__(self, **kwargs):
        """Build the TPOT classifier from ``kwargs`` and reset the state flags."""
        ModelParamObject.__init__(self)
        self.model = TPOTClassifier(**kwargs)
        self.to_grid_search = False
        self.fitted = False

    def optimize(self, X, y):
        """
        Run the standard TPOT optimization on ``X``/``y`` and mark this
        object as optimized.
        """
        # NOTE(review): `fitted` is not updated here, only `optimized`;
        # confirm that is intended.
        print("Performing TPOT genetic optimization.")
        self.model.fit(X, y)
        self.optimized = True

    def predict_proba(self, X):
        """Return the wrapped model's class probabilities for ``X``."""
        probabilities = self.model.predict_proba(X)
        return probabilities
Пример #11
0
def main():
    """Train TPOT on prepared features plus t-SNE columns and write predictions.

    All file locations and time limits come from environment variables:
    PREPARED_TRAINING, PREPARED_VALIDATING, PREPARED_TESTING, STORING,
    PREDICTING, TIME_LIMIT_ALL and TIME_LIMIT_PART.
    """
    df_train = pd.read_csv(os.getenv('PREPARED_TRAINING'))
    df_valid = pd.read_csv(os.getenv('PREPARED_VALIDATING'))
    df_test = pd.read_csv(os.getenv('PREPARED_TESTING'))

    # The last training column is the target; every other column is a feature.
    feature_cols = list(df_train.columns[:-1])
    target_col = df_train.columns[-1]

    X_train, y_train = df_train[feature_cols].values, df_train[target_col].values
    X_valid, y_valid = df_valid[feature_cols].values, df_valid[target_col].values
    X_test = df_test[feature_cols].values

    prefix = os.getenv('STORING')
    tsne_data = np.load(os.path.join(prefix, 'tsne_2d_5p.npz'))
    tsne_train = tsne_data['train']
    tsne_valid = tsne_data['valid']
    tsne_test = tsne_data['test']

    # Append the t-SNE columns to the original features.
    X_train_concat = np.concatenate([X_train, tsne_train], axis=1)
    X_valid_concat = np.concatenate([X_valid, tsne_valid], axis=1)
    X_test_concat = np.concatenate([X_test, tsne_test], axis=1)

    total_minutes = int(os.getenv('TIME_LIMIT_ALL', '1440'))
    per_pipeline_minutes = int(os.getenv('TIME_LIMIT_PART', '5'))
    tpot = TPOTClassifier(
        max_time_mins=total_minutes,
        max_eval_time_mins=per_pipeline_minutes,
        population_size=100,
        scoring='log_loss',
        cv=3,
        verbosity=2,
        random_state=67,
    )
    tpot.fit(X_train_concat, y_train)

    loss = tpot.score(X_valid_concat, y_valid)
    print(loss)
    tpot.export(os.path.join(prefix, 'tpot_pipeline.py'))

    # Column 1 of predict_proba is written out as the probability.
    p_test = tpot.predict_proba(X_test_concat)
    df_pred = pd.DataFrame({'id': df_test['id'], 'probability': p_test[:, 1]})
    csv_path = os.getenv('PREDICTING')
    df_pred.to_csv(csv_path, columns=('id', 'probability'), index=None)
    print('Saved: {}'.format(csv_path))
Пример #12
0
class TPOTBaselineModel(Model):
    """``Model`` wrapper around a ``TPOTClassifier`` used as a baseline.

    Saving replaces the wrapped TPOT object with its fitted sklearn pipeline,
    so a saved (and reloaded) instance wraps a plain pipeline.
    """

    def __init__(
        self,
        name: str,
        model_params: Dict[str, Any],
    ) -> None:
        super().__init__(name, model_params)
        self._model = TPOTClassifier(**model_params)

    def _force_fit(self, X: np.ndarray, y: np.ndarray) -> None:
        """Fit whatever is currently wrapped, TPOT or a plain pipeline."""
        self._model.fit(X, y)

    def fit(self, X: np.ndarray, y: np.ndarray) -> None:
        """Run the TPOT search; refuse if only the saved pipeline is wrapped."""
        if isinstance(self._model, TPOTClassifier):
            self._force_fit(X, y)
            return
        raise RuntimeError(
            'Due to TPOT being unpickelable, saving this'
            ' means only the actual sklearn.Pipeline'
            ' was saved. Calling fit will fit this pipeline'
            ' rather than the TPOT algorithm. If this is'
            ' desired behaviour, please use `_force_fit`'
            ' instead')

    def save(self, path: str) -> None:
        """Pickle this wrapper to ``path``, keeping only the fitted pipeline."""
        # The TPOT object is swapped for its fitted pipeline before pickling;
        # the swap stays in effect on this instance afterwards.
        if isinstance(self._model, TPOTClassifier):
            self._model = self._model.fitted_pipeline_

        with open(path, 'wb') as handle:
            pickle.dump(self, handle)

    @classmethod
    def load(cls, path: str):
        """Unpickle a saved wrapper from ``path``.

        Uses ``pickle.load``; only load files from a trusted source.
        """
        with open(path, 'rb') as handle:
            restored = pickle.load(handle)
        return cast(TPOTBaselineModel, restored)

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Delegate to the wrapped model's ``predict``."""
        return self._model.predict(X)

    def predict_proba(self, X: np.ndarray) -> np.ndarray:
        """Delegate to the wrapped model's ``predict_proba``."""
        # TODO: May cause issues if SVG or SVM model is best
        return self._model.predict_proba(X)
Пример #13
0
def process_tpot(X_train, X_test, y_train, df_types, m_type, seed, *args):
    """Train a TPOT model on the training split and predict on the test split.

    Args:
        X_train, X_test, y_train: Objects exposing ``.values``; the underlying
            arrays are what is passed to TPOT.
        df_types: Not used by this function.
        m_type: ``'classification'`` selects ``TPOTClassifier``; any other
            value selects ``TPOTRegressor``.
        seed: Passed to TPOT as ``random_state``.
        *args: Ignored.

    Returns:
        ``predict_proba`` output for classification, ``predict`` output
        otherwise.

    Raises:
        SystemExit: If fitting runs longer than
            ``TIME_PER_TASK + GRACE_PERIOD`` seconds.
    """

    from tpot import TPOTClassifier
    from tpot import TPOTRegressor
    from ..config import classifier_config_dict

    # Register Timer
    def handler(signum, frame):
        raise SystemExit('Time limit exceeded, sending system exit...')

    signal.signal(signal.SIGALRM, handler)

    # default cv is 5
    if m_type == 'classification':
        automl = TPOTClassifier(generations=100,
                                population_size=100,
                                config_dict=classifier_config_dict,
                                verbosity=3,
                                max_time_mins=int(10800/60),
                                scoring='f1_weighted',
                                n_jobs=N_CORES,
                                random_state=seed)
    else:
        automl = TPOTRegressor(generations=100,
                               population_size=100,
                               verbosity=3,
                               max_time_mins=int(10800/60),
                               scoring='neg_mean_squared_error',
                               n_jobs=N_CORES,
                               random_state=seed)

    # Set timer
    # for long running processes TPOT sometimes does not end even with generations
    signal.alarm(TIME_PER_TASK+GRACE_PERIOD)
    try:
        automl.fit(X_train.values, y_train.values)
    finally:
        # Always disarm the alarm: if fit raises, a pending SIGALRM would
        # otherwise fire later in unrelated code and exit the process.
        signal.alarm(0)

    return (automl.predict_proba(X_test.values) if m_type == 'classification' else
            automl.predict(X_test.values))
Пример #14
0
def StackerTPOT(X_trainval, y_trainval, X_test, y_test):
    """Fit TPOT on the train/validation data and return its test Brier score.

    TPOT is scored with ``neg_brier_score`` under repeated stratified 10-fold
    cross-validation, and the best pipeline is exported to
    ``tpot_best_model.py``.

    Returns:
        ``brier_score_loss`` of the predicted probability of label ``1``
        against ``y_test``.

    Raises:
        ValueError: If label ``1`` is not among the fitted pipeline's classes.
    """
    from sklearn.model_selection import RepeatedStratifiedKFold
    from tpot import TPOTClassifier

    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    model = TPOTClassifier(generations=5,
                           population_size=50,
                           cv=cv,
                           scoring='neg_brier_score',
                           verbosity=2,
                           random_state=1,
                           n_jobs=-1)
    model.fit(X_trainval, y_trainval)

    model.export('tpot_best_model.py')

    # Now calculate the brier score of the best model on the test data
    y_prob = model.predict_proba(X_test)

    # predict_proba columns follow classes_. With pos_label=1 the score must
    # use the column of label 1, not unconditionally column 0.
    fitted_classes = list(model.fitted_pipeline_.classes_)
    positive_column = fitted_classes.index(1)

    return brier_score_loss(y_test, y_prob[:, positive_column], pos_label=1)
Пример #15
0
def test_predict_proba2():
    """Assert that the TPOT predict_proba function returns a numpy matrix filled with probabilities (float)"""

    tpot_obj = TPOTClassifier()
    pipeline_string= ('DecisionTreeClassifier(input_matrix, DecisionTreeClassifier__criterion=gini'
    ', DecisionTreeClassifier__max_depth=8,DecisionTreeClassifier__min_samples_leaf=5,'
    'DecisionTreeClassifier__min_samples_split=5)')
    tpot_obj._optimized_pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset)
    tpot_obj._fitted_pipeline = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline)
    tpot_obj._fitted_pipeline.fit(training_features, training_classes)

    result = tpot_obj.predict_proba(testing_features)

    rows = result.shape[0]
    columns = result.shape[1]

    # Let float_range raise directly: the test still fails on a bad entry,
    # but the traceback now shows the real exception instead of a bare
    # `assert False`.
    for i in range(rows):
        for j in range(columns):
            float_range(result[i][j])
Пример #16
0
import uuid


if __name__ == '__main__':
    # Fit TPOT on input/train.csv and write a gzip-compressed submission for
    # input/test.csv under out/submissions/.
    multiprocessing.set_start_method('forkserver')

    import pandas as pd
    from sklearn import metrics
    from tpot import TPOTClassifier

    from kirgsn import reducing

    path_input_extended = 'input'

    tpot = TPOTClassifier(generations=10, population_size=40, verbosity=2,
                          scoring='roc_auc', cv=5, #max_time_mins=60*3,
                          random_state=1990, n_jobs=-1,
                          periodic_checkpoint_folder='out')
    # "-1" in the CSV files is read as a missing value.
    train = pd.read_csv(join(path_input_extended, 'train.csv'), na_values="-1")
    test = pd.read_csv(join(path_input_extended, 'test.csv'), na_values="-1")

    cols = [c for c in train.columns if c not in ['id', 'target']]
    tpot.fit(train[cols], train['target'])
    tpot.export('out/tpotted.py')
    test['target'] = tpot.predict_proba(test[cols])[:, 1]

    # The file name carries one run id. The original template had two
    # placeholders for a single argument and raised IndexError before
    # anything was written.
    run_id = str(uuid.uuid4()).split(sep='-')[0]
    test[['id', 'target']].to_csv(
        'out/submissions/tpot_{}.csv.gz'.format(run_id),
        index=False,
        float_format='%.5f',
        compression='gzip')
Пример #17
0
class TpotBaseline(object):
    """Baseline that target-encodes, imputes, fits TPOT and writes a submission.

    Call ``data_prepare``, ``model_fit`` and ``model_predict`` in that order.
    """

    def __init__(self, *, input_path, output_path, output_file_name):
        """Store the paths; all data and model attributes start as ``None``."""
        self.__input_path = input_path
        self.__output_path = output_path
        self.__output_file_name = output_file_name

        self.__train, self.__test = [None for _ in range(2)]
        self.__sample_submission = None
        self.__train_feature, self.__train_label = [None for _ in range(2)]
        self.__test_feature = None
        self.__categorical_index = None
        self.__numeric_index = None
        self.__encoder = None
        self.__imputer = None

        self.__clf = None

    def data_prepare(self):
        """Load the CSV files and turn every feature into an imputed numeric column."""
        self.__train = pd.read_csv(
            os.path.join(self.__input_path, "train_feature_df.csv"))
        self.__test = pd.read_csv(
            os.path.join(self.__input_path, "test_feature_df.csv"))
        self.__sample_submission = pd.read_csv(
            os.path.join(self.__input_path, "sample_submission_one.csv"))

        self.__train_label = self.__train["TARGET"]
        self.__train_feature = self.__train.drop("TARGET", axis=1)
        # Explicit copy: the frame is modified in place below, and writing
        # into a column selection of another frame is ambiguous in pandas
        # (SettingWithCopyWarning).
        self.__test_feature = self.__test[self.__train_feature.columns].copy()

        # Categorical features: fill missing values, then encode as numbers.
        self.__categorical_index = np.where(
            self.__train_feature.dtypes == "object")[0]
        self.__numeric_index = np.where(
            self.__train_feature.dtypes != "object")[0]
        categorical = self.__categorical_index
        numeric = self.__numeric_index

        self.__train_feature.iloc[:, categorical] = (
            self.__train_feature.iloc[:, categorical].fillna("missing"))
        self.__test_feature.iloc[:, categorical] = (
            self.__test_feature.iloc[:, categorical].fillna("missing"))

        # The encoder is fitted on the training data only.
        self.__encoder = ce.TargetEncoder()
        self.__encoder.fit(
            self.__train_feature.iloc[:, categorical], self.__train_label)
        self.__train_feature.iloc[:, categorical] = self.__encoder.transform(
            self.__train_feature.iloc[:, categorical])
        self.__test_feature.iloc[:, categorical] = self.__encoder.transform(
            self.__test_feature.iloc[:, categorical])

        # Numeric features: impute missing values with the training median.
        self.__imputer = Imputer(strategy="median")
        self.__imputer.fit(self.__train_feature.iloc[:, numeric])
        self.__train_feature.iloc[:, numeric] = self.__imputer.transform(
            self.__train_feature.iloc[:, numeric])
        self.__test_feature.iloc[:, numeric] = self.__imputer.transform(
            self.__test_feature.iloc[:, numeric])

    def model_fit(self):
        """Fit a ROC-AUC-scored ``TPOTClassifier`` on the prepared training data."""
        self.__clf = TPOTClassifier(scoring="roc_auc", n_jobs=-1, verbosity=2)
        self.__clf.fit(self.__train_feature.values, self.__train_label.values)

    def model_predict(self):
        """Write the submission CSV and export the best pipeline as Python code."""
        # Column 1 of predict_proba is written as the TARGET probability.
        self.__sample_submission["TARGET"] = self.__clf.predict_proba(
            self.__test_feature.values)[:, 1]
        self.__sample_submission.to_csv(os.path.join(self.__output_path,
                                                     self.__output_file_name),
                                        index=False)
        self.__clf.export(os.path.join(self.__output_path, "tpot_baseline.py"))
Пример #18
0
def autoframe(
    task,
    metalearning,
    prepb,
    feat_type,
    resultsfile,
    X_train,
    y_train,
    X_test,
    y_test,
    dataset,
    framework,
    foldn,
    ncore,
    timeforjob,
    dirt,
    meta,
    fitmetrics,
    outputdir,
    target,
):
    """Train one AutoML framework on a dataset, then save and report results.

    ``framework`` selects ``'autosklearn'`` (via ``autoclf``/``autoreg``) or
    ``'tpot'``. ``task`` values ``"bt"`` and ``"bre"`` take the classifier
    path and produce class probabilities; ``"it"`` takes the regressor path
    and uses an empty probability list. Predictions are passed to
    ``save_prob``, ``metric`` and ``get_run_info``.

    NOTE(review): any other ``framework``/``task`` combination leaves the
    model unset and fails with a NameError further down; confirm whether
    callers can pass other values.
    """
    shape = [X_train.shape, y_train.shape, X_test.shape, y_test.shape]
    start = time.time()
    if framework == 'autosklearn':
        if task == "bt" or task == "bre":
            automl = autoclf(
                metalearning,
                framework,
                feat_type,
                timeforjob,
                foldn,
                ncore,
                X_train,
                y_train,
                fitmetrics,
            )
            y_pred_prob = automl.predict_proba(X_test)
        elif task == "it":
            automl = autoreg(
                metalearning,
                framework,
                feat_type,
                timeforjob,
                foldn,
                ncore,
                X_train,
                y_train,
                fitmetrics,
            )
            y_pred_prob = []
        y_pred = automl.predict(X_test)

        ###################################################################
    elif framework == 'tpot':
        if task == "bt" or task == "bre":
            tpot = TPOTClassifier(max_time_mins=int(timeforjob / 60),
                                  max_eval_time_mins=float(timeforjob / 60),
                                  n_jobs=ncore,
                                  verbosity=2)
            tpot.fit(X_train, y_train)
            y_pred_prob = tpot.predict_proba(X_test)
        elif task == "it":
            tpot = TPOTRegressor(generations=5,
                                 population_size=50,
                                 verbosity=2)
            # The regressor has to be fitted before predict()/score() below.
            tpot.fit(X_train, y_train)
            y_pred_prob = []
        automl = tpot

        y_pred = tpot.predict(X_test)
        print(tpot.score(X_test, y_test))

    end = time.time()
    timespend = float(end - start)
    ###################################################################
    ###################################################################
    save_prob(timeforjob, dataset, resultsfile, foldn, y_pred, y_pred_prob,
              outputdir)
    metrics = metric(task, y_test, y_pred, y_pred_prob)
    print(dataset)
    get_run_info(
        metalearning,
        automl,
        dataset,
        shape,
        timeforjob,
        ncore,
        foldn,
        framework,
        resultsfile,
        fitmetrics,
        metrics,
        timespend,
        prepb,
        outputdir,
        target,
    )
                          scoring="roc_auc",
                          cv=5,
                          n_jobs=-1,
                          verbosity=2,
                          random_state=0)
tpot_cls.fit(x_train, y_train)

# List the steps of the best pipeline, numbered from 1.
print('\nBest pipeline steps:', end='\n')
best_steps = tpot_cls.fitted_pipeline_.steps
for idx, (name, transform) in enumerate(best_steps, start=1):
    print(f'{idx}. {transform}')

# predict_proba returns a 2-D array; column 1 is used as the score for label 1.
y_pred_prob = tpot_cls.predict_proba(x_test)

# AUC of the best TPOT pipeline on the test split.
tpot_roc_auc_score = roc_auc_score(y_test, y_pred_prob[:, 1])
print(f'\nAUC score for TPOT Best Model: {tpot_roc_auc_score:.4f}')

# Export the best pipeline as Python code.
tpot_cls.export("tpot_best_model.py")

# Per-feature variance of the training data, rounded to 3 decimals.
print('\n', x_train.var().round(3))

# Work on copies so the log transformation of the high-variance feature
# leaves the original frames untouched.
x_train_normed = x_train.copy()
x_test_normed = x_test.copy()

# Log normalization
Пример #20
0
def automl(market):
    """Run a small TPOT/XGBoost search for one market and persist the result.

    Reads ``data/<market>.csv``, drops rows whose label ``y`` is 0, fits a
    ``TPOTClassifier`` restricted to a single XGBoost configuration, exports
    the best pipeline as Python code under ``export/`` and as jsonpickle JSON
    under ``clfs/``, then reloads the JSON and prints its predictions as a
    round-trip check.

    Every print uses the single-argument call form, which prints the same
    text under Python 2 and Python 3.

    NOTE(review): ``StratifiedKFold(y, n_folds=...)`` looks like the legacy
    ``sklearn.cross_validation`` signature - confirm the targeted version.
    """
    print("evaluating market  {}".format(market))
    dt = datetime.now()
    #rng = sri.KISS(123958, 34987243, 3495825239, 2398172431)
    rng = sri.KISS(dt.microsecond)

    csv = pd.read_csv('data/' + market + '.csv')
    print(csv.shape)
    #print csv.columns

    # Dropping rows with label 0
    # Doing binary logistic regression here
    # in F_AD there are only 181 0 labels
    # but there could be more.
    csv = csv[(csv.y != 0)]

    y = csv['y']
    dates = csv['date']
    # lookback 255. 254 to 0. Exclude 0.
    X = csv.loc[:, '254':'1']  # Accuracy: 0.5172

    #seed = 342
    #seed = 3165278097
    seed = next(rng)
    print("seed: {}".format(seed))

    cv = StratifiedKFold(y, n_folds=10, shuffle=True, random_state=seed)

    # time: 6252.53945184
    tpot_config = {
        'xgboost.sklearn.XGBClassifier': {
            #'gamma': [0,0.5,1.0],
            #'subsample': [0.4,0.6,0.8,1.0],
            #'colsample_bylevel': [0.5,0.75,1.0],
            'max_depth': [1],  # [1,2,3]
            'learning_rate': [1],  # [1,0.1,0.01]
            #'silent': [1.0],
            'nthread': [-1],
            #'n_estimators': [50,75,100,125,150,200,250,300,350,400,450,500,550,600,650,700,750,800,850,900,950,1000]}
            'n_estimators': [100]
        }
    }

    # default: gen=5, pop=20
    # target: gen=10, pop=100
    pipeline_optimizer = TPOTClassifier(
        generations=2,
        population_size=20,
        cv=cv,
        n_jobs=-1,
        random_state=seed,
        verbosity=3,
        periodic_checkpoint_folder='checkpoints',
        config_dict=tpot_config)

    start_time = timeit.default_timer()
    pipeline_optimizer.fit(X, y)
    elapsed = timeit.default_timer() - start_time
    print("time: {}".format(elapsed))

    # pseudo test
    X_test = csv.loc[:9, '254':'1']  # Accuracy: 0.5172
    print(X_test.shape)

    # Performance on test set. Might (probably will) differ from best pipeline score.
    #print X_test
    joblib.dump(X_test, "xtest.txt")
    print(pipeline_optimizer.predict(X_test))
    print(pipeline_optimizer.predict_proba(X_test))

    # Write out best pipeline as python code
    t = time.localtime()
    timestamp = time.strftime('%m%d%Y_%H%M%S', t)
    pipeline_optimizer.export('export/tpot_exported_pipeline_' + market + '_' +
                              timestamp + '_' + str(seed) + '.py')

    # Serialize Pipe as JSON. Because dump string does not ASCII encode bytearrays.
    clfname = 'clfs/pipe_' + market + '_' + timestamp + '_' + str(
        seed) + '.json'
    frozen = jsonpickle.encode(pipeline_optimizer.fitted_pipeline_)
    # Context managers close the file even if the write or read raises.
    with open(clfname, 'w') as f:
        f.write(frozen)

    with open(clfname, 'r') as f:
        frozen = f.read()

    thawed = jsonpickle.decode(frozen)
    #thawed = jsonpickle.decode(jsonstring)
    print("*** debug")

    print(thawed.predict(X_test))
    print(thawed.predict_proba(X_test))
Пример #21
0
# Load the pickled list of datasets; only the first entry is used.
# (pickle.load: only read files from a trusted source.)
with open("./classification_tables.pkl", 'rb') as fp:
    datasets = pickle.load(fp)

data = datasets[0]['data']

# Every column but the last is a feature; the last column is the label.
X = data.iloc[:, :-1].to_numpy(dtype=int)
y = data.iloc[:, -1].to_numpy(dtype=int)

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7)

tpot = TPOTClassifier(
    generations=20,
    population_size=20,
    n_jobs=8,
    verbosity=2,
    scoring='balanced_accuracy',
)
tpot.fit(X_train, y_train)

print(tpot.score(X_test, y_test))

# Export the best pipeline and echo the generated file.
output_fname = f"pipeline_{datasets[0]['assay']}.py"
tpot.export(output_fname)

print("## EXPORTED FILE: ##")
with open(output_fname, 'r') as fp:
    print(fp.read())
print("## END EXPORTED FILE ##")

# Predictions for the `pfhxs` sample defined elsewhere in the script.
print("## PFHXS PREDICTED PROBABILITY:")
print(tpot.predict_proba(pfhxs))
print("## PFHXS PREDICTED CLASS:")
print(tpot.predict(pfhxs))
Пример #22
0
print('y_valid', y_valid)

# Fit TPOT (scored with ROC AUC), export the best pipeline and time the run.
t1 = time.time()
print('TPOT...!')
tpot = TPOTClassifier(
    max_time_mins=60 * 10,
    population_size=100,
    scoring='roc_auc',
    cv=3,
    verbosity=2,
    random_state=67, n_jobs= -1)
tpot.fit(x_t, y_train)
tpot.export('./tpot_pipeline.py')
# tpot.score() uses the configured scoring function (roc_auc), so the value
# is labelled as such rather than as accuracy.
print('roc_auc score is {}'.format(tpot.score(x_v, y_valid)))

probab = tpot.predict_proba(x_v)
probab = probab[:,1]
print('AUC Score is {}'.format(roc_auc_score(y_valid,probab)))
t2 = time.time()
print('Total time taken by TPOT:', int(t2-t1))



# Re-index features, labels and predictions by AGREEMENTID for inspection.
# (set_index already returns the re-indexed frame; the original repeated the
# same call in place, which changed nothing.)
check_x = x_v.set_index(X_valid['AGREEMENTID'])

check_y = pd.DataFrame(y_valid).set_index(X_valid['AGREEMENTID'])

check_pred = pd.DataFrame(tpot.predict(x_v)).set_index(X_valid['AGREEMENTID'])
Пример #23
0
    # rf_y_pred = RF.predict(Xtest)



    # Hyper-parameter search restricted to RandomForestClassifier.
    tpot_classifier = TPOTClassifier(generations=5, population_size=24, offspring_size=12,
                                     verbosity=2, early_stop=12,
                                     config_dict={'sklearn.ensemble.RandomForestClassifier': parameters},
                                     cv=4, scoring='balanced_accuracy')

    # Fit the model.
    tpot_classifier.fit(Xtrain, ytrain)

    # Predict labels and the probability in column 1 of predict_proba.
    rf_y_pred = tpot_classifier.predict(Xtest)
    rf_y_prob = [probs[1] for probs in tpot_classifier.predict_proba(Xtest)]

    # Validation and performance metrics.
    print('RF')
    print(confusion_matrix(ytest, rf_y_pred))
    print('kappa', cohen_kappa_score(ytest, rf_y_pred))
    # report is (precision, recall, f-score, support), support-weighted.
    report = precision_recall_fscore_support(ytest, rf_y_pred, average='weighted')
    # AUC is computed from the probabilities; hard labels collapse the ROC
    # curve to a single operating point.
    auc_test_RF[fold] = roc_auc_score(ytest, rf_y_prob, average='weighted')
    kappa_test_RF[fold] = cohen_kappa_score(ytest, rf_y_pred)
    f1_test_RF[fold] = report[2]
    # Support-weighted recall equals accuracy; report[0] is precision.
    acc_test_RF[fold] = report[1]

    # Compute area under the curve
    fpr, tpr, _ = roc_curve(ytest, rf_y_prob)
    roc_auc = auc(fpr, tpr)
Пример #24
0
# Fit TPOT and report the elapsed process time since `debut`.
tpotC.fit(X_train, y_train)
fin = process_time()

print("Elapsed time in seconds : ", fin - debut)

X_test = pd.read_csv(
    '/home/bench/notebooks/data/TomWilliams/x_test_IoT_Botnet.csv')
predictions = tpotC.predict(X_test)

y_test = pd.read_csv(
    '/home/bench/notebooks/data/TomWilliams/y_test_IoT_Botnet.csv')

y_pred = pd.DataFrame(data=predictions)
y_pred.to_csv("y_pred_IoT_Botnet_Tpot.csv")

probs = tpotC.predict_proba(X_test)

# Column 1 of predict_proba is used as the score.
probs = probs[:, 1]

# The score gets its own name: assigning it to `auc` would shadow the auc()
# function called below and make that call fail with a TypeError.
auc_score = roc_auc_score(y_test, probs)
print('AUC: %.2f' % auc_score)

fpr, tpr, _ = roc_curve(y_test, probs)
roc_auc = auc(fpr, tpr)

plt.figure()
lw = 2
plt.plot(fpr,
         tpr,
         color='darkorange',
         lw=lw,
Пример #25
0
from tpot import TPOTClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
import numpy as np

# Load the iris data set, cast features and targets to float64, and split
# it 75% / 25% into training and test parts.
iris = load_iris()
features = iris.data.astype(np.float64)
labels = iris.target.astype(np.float64)
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, train_size=0.75, test_size=0.25)

# Show the container type of every split.
for split in (X_train, X_test, y_train, y_test):
    print(type(split))

# Run the TPOT pipeline search on the training split.
tpot = TPOTClassifier(generations=5, population_size=50, verbosity=2)
tpot.fit(X_train, y_train)

# Print predictions, class probabilities and the score on the test split.
for report in (tpot.predict, tpot.predict_proba):
    print(report(X_test))
print(tpot.score(X_test, y_test))
Пример #26
0
class BotClassifier:
    """Thin wrapper around ``TPOTClassifier``.

    When ``is_feature_importances`` is true, the TPOT search is restricted to
    the tree-based classifiers listed in ``FEATURE_IMPORTANCE_CONFIG`` and to
    the ``"Classifier"`` template; otherwise TPOT's defaults are used.
    """

    # Search space used when feature importances are requested.
    FEATURE_IMPORTANCE_CONFIG = {

        # Classifiers
        'sklearn.tree.DecisionTreeClassifier': {
            'criterion': ["gini", "entropy"],
            'max_depth': range(1, 11),
            'min_samples_split': range(2, 21),
            'min_samples_leaf': range(1, 21)
        },

        'sklearn.ensemble.ExtraTreesClassifier': {
            'n_estimators': [100],
            'criterion': ["gini", "entropy"],
            'max_features': np.arange(0.05, 1.01, 0.05),
            'min_samples_split': range(2, 21),
            'min_samples_leaf': range(1, 21),
            'bootstrap': [True, False]
        },

        'sklearn.ensemble.RandomForestClassifier': {
            'n_estimators': [100],
            'criterion': ["gini", "entropy"],
            'max_features': np.arange(0.05, 1.01, 0.05),
            'min_samples_split': range(2, 21),
            'min_samples_leaf': range(1, 21),
            'bootstrap': [True, False]
        },

        'sklearn.ensemble.GradientBoostingClassifier': {
            'n_estimators': [100],
            'learning_rate': [1e-3, 1e-2, 1e-1, 0.5, 1.],
            'max_depth': range(1, 11),
            'min_samples_split': range(2, 21),
            'min_samples_leaf': range(1, 21),
            'subsample': np.arange(0.05, 1.01, 0.05),
            'max_features': np.arange(0.05, 1.01, 0.05)
        },

        'xgboost.XGBClassifier': {
            'n_estimators': [100],
            'max_depth': range(1, 11),
            'learning_rate': [1e-3, 1e-2, 1e-1, 0.5, 1.],
            'subsample': np.arange(0.05, 1.01, 0.05),
            'min_child_weight': range(1, 21),
            'nthread': [1]
        },
    }

    # Pipeline template used together with FEATURE_IMPORTANCE_CONFIG.
    FEATURE_IMPORTANCE_TEMPLATE = "Classifier"

    def __init__(
        self,
        number_of_generations: int = 3,
        population_size: int = 10,
        scoring: str = "accuracy",  # "accuracy", "f1", "precision", "recall", "roc_auc"
        cv: Union[int, List[Tuple[List[int], List[int]]]] = 5,
        verbosity: int = 0,  # 0, 1, 2, 3
        number_of_jobs: int = -1,  # -1 = number of cores
        is_feature_importances: bool = False,
    ) -> None:
        """Create the underlying ``TPOTClassifier``.

        Args:
            number_of_generations: forwarded as ``generations``.
            population_size: forwarded as ``population_size``.
            scoring: forwarded as ``scoring``.
            cv: forwarded as ``cv``.
            verbosity: forwarded as ``verbosity``.
            number_of_jobs: forwarded as ``n_jobs``.
            is_feature_importances: when true, pass the class-level config
                dict and template to TPOT; when false, pass ``None`` for both.
        """
        config = self.FEATURE_IMPORTANCE_CONFIG if is_feature_importances else None
        template = self.FEATURE_IMPORTANCE_TEMPLATE if is_feature_importances else None

        self.classifier = TPOTClassifier(
            generations=number_of_generations,
            population_size=population_size,
            scoring=scoring,
            cv=cv,
            verbosity=verbosity,
            n_jobs=number_of_jobs,
            config_dict=config,
            template=template,
        )

        self.is_feature_importances = is_feature_importances
        # Column labels of the training features; filled in by fit().
        self.feature_names = None

    def fit(self, features: pd.DataFrame, classes: pd.DataFrame) -> None:
        """Remember the feature column labels and fit the TPOT classifier."""
        self.feature_names = features.columns
        self.classifier.fit(features, classes)

    def predict(self, features: pd.DataFrame) -> np.ndarray:
        """Return the wrapped classifier's predictions for ``features``."""
        return self.classifier.predict(features)

    def predict_proba(self, features: pd.DataFrame) -> np.ndarray:
        """Return the wrapped classifier's ``predict_proba`` output."""
        return self.classifier.predict_proba(features)

    def score(self, testing_features: pd.DataFrame, testing_classes: pd.DataFrame) -> float:
        """Return the wrapped classifier's ``score`` on the given test data."""
        return self.classifier.score(testing_features, testing_classes)

    def export(self, output_file_name: str) -> None:
        """Ask the wrapped classifier to export itself to ``output_file_name``."""
        self.classifier.export(output_file_name)

    def scores(self, testing_features: pd.DataFrame, testing_classes: pd.DataFrame) -> Dict[str, float]:
        """Compute accuracy, precision, recall, F1 and ROC AUC on test data.

        The metric functions are called with their default arguments, and the
        ROC AUC is computed from column 1 of ``predict_proba``.
        NOTE(review): this presumably targets binary labels -- confirm.

        Returns:
            Dict with the keys "accuracy", "precision", "recall", "f1" and
            "roc_auc".
        """
        classifier_predictions = self.predict(testing_features)
        classifier_prob_predictions = self.predict_proba(testing_features)[:, 1]

        return {
            "accuracy": accuracy_score(testing_classes, classifier_predictions),
            "precision": precision_score(testing_classes, classifier_predictions),
            "recall": recall_score(testing_classes, classifier_predictions),
            "f1": f1_score(testing_classes, classifier_predictions),
            "roc_auc": roc_auc_score(testing_classes, classifier_prob_predictions),
        }

    def get_fitted_pipeline(self) -> Pipeline:
        """Return the wrapped classifier's ``fitted_pipeline_`` attribute."""
        return self.classifier.fitted_pipeline_

    def get_pareto_front_fitted_pipelines(self) -> Dict[str, Pipeline]:
        """Return ``pareto_front_fitted_pipelines_``, or ``{}`` if unavailable."""
        try:
            return self.classifier.pareto_front_fitted_pipelines_
        except Exception as e:
            print(f"cannot get pareto_front_fitted_pipelines_\n{e}")
            return {}

    def get_evaluated_individuals(self) -> Dict[str, Dict[str, Union[int, float, Tuple[str, ...]]]]:
        """Return ``evaluated_individuals_``, or ``{}`` if unavailable."""
        try:
            return self.classifier.evaluated_individuals_
        except Exception as e:
            print(f"cannot get evaluated_individuals_\n{e}")
            return {}

    def get_feature_importances(self) -> Dict[str, float]:
        """Map each training feature name to the final step's importance.

        The lookup is attempted regardless of ``is_feature_importances``. If
        anything fails (not fitted, or the last pipeline step has no
        ``feature_importances_``), the error is printed and an empty dict is
        returned, matching the other getters and the declared return type.

        NOTE(review): names are paired positionally with the importances of
        the last pipeline step; if earlier steps change the feature set the
        pairing may not be meaningful -- confirm for non-template searches.
        """
        try:
            final_step = self.get_fitted_pipeline()[-1]
            importances = final_step.feature_importances_
            return dict(zip(self.feature_names, importances))
        except Exception as e:
            print(f"cannot get feature importances\n{e}")
            return {}
Пример #27
0
# with joblib.parallel_backend("dask"):
tpot.fit(X_train, y_train)

# The returned score is not stored or printed here.
tpot.score(X_test, y_test)

# Winning pipelines
print(tpot.fitted_pipeline_)

# copy file
# tpot.export('tpot_mnist_pipeline.py')

# Get predictions
y_predict = tpot.predict(X_test)

# Probability of malignant tissue produced by the model
# (column 1 of predict_proba)
y_prob = [probs[1] for probs in tpot.predict_proba(X_test)]

# Accuracy on test set. The round() builtin works whether accuracy_score
# returns a plain Python float or a numpy scalar; the .round() method exists
# only on the latter.
print("Test accuracy: %s\n" % round(accuracy_score(y_test, y_predict), 2))

# Confusion matrix test set
conf_mat = pd.DataFrame(confusion_matrix(y_test, y_predict),
                        columns=['Predicted NO', 'Predicted YES'],
                        index=['Actual NO', 'Actual YES'])
print(conf_mat)

# Compute area under the curve
fpr, tpr, _ = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)

# only use if you can visualise
Пример #28
0
        # Auto_ml
        START_EXPERIMENT = time.time()

        # TPOT search scored by ROC AUC, with the time budget converted from
        # TIME_LIMIT to whole minutes.
        automl = TPOTClassifier(
            max_time_mins=(TIME_LIMIT // 60),
            scoring='roc_auc',
            verbosity=1,
            random_state=RANDOM_SEED,
        )
        automl.fit(
            X_train,
            y_train,
        )
        y_test_predict = automl.predict(X_test)
        try:
            predictions = automl.predict_proba(X_test)
        except RuntimeError:
            # No probabilities available: fall back to the hard labels and use
            # them directly as scores. They are 1-D, so indexing them with
            # [:, 1] would raise IndexError.
            # NOTE(review): some TPOT versions may raise AttributeError rather
            # than RuntimeError when predict_proba is unavailable -- confirm.
            predictions = y_test_predict
            y_test_predict_proba = predictions
        else:
            # Score column 1 of the probability matrix.
            y_test_predict_proba = predictions[:, 1]

        print('AUC: ', roc_auc_score(y_test, y_test_predict_proba))

        END_EXPERIMENT = time.time()

        #preds = pd.DataFrame(predictions)
        #preds['Y'] = y_test.reset_index(drop=True)
        #preds.to_csv(f'./result/predicts/{DATASET_NAME}_{MODEL_NAME}_predict_proba_exp_{EXPERIMENT}.csv', index=False,)
        metrics.append({
            'AUC':
Пример #29
0
# Keep only rows whose 'date' compares greater than '2009', save them, and
# split them into train/test sets by date.
float_tab = float_tab[float_tab['date'] > '2009']
float_tab.to_csv(os.path.join(path_output, 'data.csv'),
                 index=False,
                 encoding='utf8')
X_train, X_test, y_train, y_test = split_by_date(float_tab, 'date')

# A single configured optimizer; the extra default-constructed instance that
# was immediately overwritten has been dropped.
pipeline_optimizer = TPOTClassifier(generations=10,
                                    population_size=20,
                                    cv=5,
                                    random_state=42,
                                    verbosity=2)

pipeline_optimizer.fit(X_train, y_train)
print(pipeline_optimizer.score(X_test, y_test))
pipeline_optimizer.export('tpot_exported_pipeline_batiment_detail.py')

# We want to know whether the 150 highest-ranked rows are classified well:
proba = pipeline_optimizer.predict_proba(X_test)
proba_etre_insalubre = [x[1] for x in proba]
proba = pd.Series(proba_etre_insalubre)

# True for rows whose score ranks within the top 150 (ties share the min rank).
prediction = proba.rank(ascending=False, method='min') < 150 + 1
y_test.loc[prediction.values].value_counts(normalize=True)

# Alternative view: class distribution of y_test among rows predicted as 1,
# then among rows predicted as 0. predict() is run once and reused.
predicted_labels = pipeline_optimizer.predict(X_test)
autre_approche = predicted_labels == 1
y_test.loc[autre_approche].value_counts(normalize=True)

autre_approche = predicted_labels == 0
y_test.loc[autre_approche].value_counts(normalize=True)
X_train.info()

# In[17]:

# TPOT search scored by ROC AUC, using the 'TPOT light' configuration.
tpot_options = dict(
    generations=10,
    population_size=20,
    verbosity=2,
    scoring='roc_auc',
    random_state=42,
    disable_update_check=True,
    config_dict='TPOT light',
)
tpot = TPOTClassifier(**tpot_options)
tpot.fit(X_train, y_train)

# In[18]:

# ROC AUC on the test split, computed from column 1 of predict_proba.
positive_scores = tpot.predict_proba(X_test)[:, 1]
tpot_auc_score = roc_auc_score(y_test, positive_scores)
print(f'\nAUC score: {tpot_auc_score:.4f}')

# In[19]:

# List the steps of the best pipeline, numbered from 1.
print('\nBest pipeline steps:')
for idx, step in enumerate(tpot.fitted_pipeline_.steps, start=1):
    # Each step is a (name, transform) pair; only the transform is shown.
    print(f'{idx}. {step[1]}')

# In[20]:

tpot.fitted_pipeline_

# In[21]:
Пример #31
0
            model_specific_preprocessing=True)
        X_test = lr.preprocess(
            X_test,
            is_train=False,
            vect_max_features=hyperparams['vectorizer_dict_size'],
            model_specific_preprocessing=True)

        t = Thread(target=sleep, args=(hours * 60 * 60, ))
        t.start()

        # optimize roc_auc metric
        clf = TPOTClassifier(scoring='roc_auc',
                             random_state=0,
                             verbosity=2,
                             config_dict=classifier_config_dict,
                             population_size=20)
        clf.fit(X_train, y_train)
        y_pred = clf.predict_proba(X_test)[:, 1]

        with open(f'tpot_pred/{data_name}.pickle', 'wb') as f:
            pickle.dump(y_pred, f)

        # get prediction
        auc = roc_auc_score(y_test.values, y_pred)
        res[data_name] = auc
        print(data_name, auc)
        print('#' * 100)
    print(res)
    with open(f"tpot_pred/result.pickle", 'wb') as f:
        pickle.dump(res, f)