Example #1
def test_predict():
    """Assert that the TPOT predict function raises a ValueError when no optimized pipeline exists"""

    tpot_obj = TPOTClassifier()

    try:
        tpot_obj.predict(testing_features)
        assert False  # Should be unreachable
    except ValueError:
        pass
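
The same assertion reads more compactly with pytest's context manager; a minimal sketch, assuming pytest is available and that TPOTClassifier and testing_features are in scope as above:

import pytest

def test_predict_raises():
    """ValueError is expected when predict is called before any pipeline is fitted."""
    tpot_obj = TPOTClassifier()
    with pytest.raises(ValueError):
        tpot_obj.predict(testing_features)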
Example #2
class OnlineTpotClassifer(base.Classifier):

    def __init__(self, n_training_samples, classes: list, max_time_mins: int = 15):
        self.n_training_samples = n_training_samples
        self.max_time_mins = max_time_mins
        self.training_samples_x = []
        self.training_samples_y = []
        self.estimator = None
        self.classes = classes

    def learn_one(self, x: dict, y: base.typing.ClfTarget, **kwargs) -> base.Classifier:
        if self.estimator is None:
            self.training_samples_x.append(x)
            self.training_samples_y.append(y)
        if len(self.training_samples_x) >= self.n_training_samples and self.estimator is None:
            x_train = np.stack([dict2numpy(i) for i in self.training_samples_x])
            self.estimator = TPOTClassifier(max_time_mins=self.max_time_mins)
            self.estimator.fit(x_train, self.training_samples_y)
            self.training_samples_y = []
            self.training_samples_x = []
        return self

    def predict_proba_one(self, x: dict) -> typing.Dict[base.typing.ClfTarget, float]:
        if self.estimator is not None:
            y_pred = self.estimator.predict_proba([list(x.values())])[0]
            return {self.classes[i]: p for i, p in enumerate(y_pred)}
        else:
            return {c: 1 / len(self.classes) for c in self.classes}

    def predict_proba_many(self, X):
        # predict_proba returns a 2-D array (one column per class), so a
        # DataFrame is needed here; pd.Series has no `columns` argument.
        return pd.DataFrame(self.estimator.predict_proba(X), columns=self.classes)

    @property
    def _multiclass(self):
        return True

    def learn_many(self, X, y):
        self.estimator.partial_fit(X=X.values, y=y.values, classes=self.classes)
        return self

    def predict_one(self, x):
        if self.estimator is not None:
            y_pred = self.estimator.predict([list(x.values())])[0]
            return y_pred
        else:
            return self.classes[0]

    def predict_many(self, X):
        return pd.Series(self.estimator.predict(X))
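
A hedged usage sketch for the wrapper above, assuming river's stream.iter_pandas helper and a pandas (X, y) pair already in scope; the names and parameter values are illustrative:

from river import stream

model = OnlineTpotClassifer(n_training_samples=100, classes=[0, 1], max_time_mins=1)
for x, y in stream.iter_pandas(X, y):  # X: DataFrame, y: Series (assumed in scope)
    y_pred = model.predict_one(x)      # uniform guess until TPOT has been fitted
    model = model.learn_one(x, y)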
Example #3
    def run_AutoTpot(self):
        # Running the AutoTpot pipeline
        automl = TPOTClassifier(generations=1, verbosity=2, config_dict='TPOT sparse')
        automl.fit(self.train, self.y_train)

        # TPOT produces ready-to-run, standalone Python code for the best-performing model,
        # in the form of a scikit-learn pipeline.
        # Export the best model
        automl.export(os.path.join(self.args.save_dir, 'tpot-sportswear.py'))

        print('The best pipeline discovered through auto-tpot is {}'.format(automl.fitted_pipeline_))

        print('Saving the best model discovered through TPOT.')
        # Dump the fitted TPOT object to disk
        joblib.dump(automl, os.path.join(self.args.checkpoint_dir, 'auto-tpot.pickle'))

        # Calculating time per prediction
        # Start time ******************************************************************************
        start = timeit.default_timer()

        # Predicting label, confidence probability on the test data set
        predictions = automl.predict(self.test)
        predictions_prob = automl.predict_proba(self.test)

        # Binary class values : rounding them to 0 or 1
        predictions = [round(value) for value in predictions]

        end = timeit.default_timer()
        # End Time ******************************************************************************
        print('Time per prediction : {}'.format((end - start) / self.test.shape[0]))

        self.visualize(predictions, automl)
Example #4
class AutomlInstance:
    def __init__(self, openML_id, scoring_function, memory_path=None, max_time=None):
        self.y_class_dict = None
        self.X_train, self.X_test, self.y_train, self.y_test = self.get_dataset(openML_id)
        if memory_path is not None:
            if Path(memory_path).is_file():
                self.tpot = TPOTClassifier(memory=memory_path, warm_start=True,
                                           scoring=scoring_function, verbosity=3)
            else:
                self.tpot = TPOTClassifier(memory=memory_path, max_time_mins=max_time,
                                           scoring=scoring_function, verbosity=3)
        else:
            self.tpot = TPOTClassifier(max_time_mins=max_time,
                                       scoring=scoring_function, verbosity=3)
        self.tpot.fit(self.X_train, self.y_train)

    def predict(self, X):
        return self.tpot.predict(X)

    def get_segments(self) -> List[Segment]:
        segments = []
        for model in self.tpot.evaluated_individuals_:
            try:
                classifier = self.tpot._toolbox.compile(creator.Individual.from_string(model, self.tpot._pset))
                classifier.fit(self.X_train, self.y_train)
                y_pred = classifier.predict(self.X_test)
                segments.append(Segment(y_ground=self.y_test, y_pred=y_pred))
            except (ValueError, RuntimeError):
                print("One classifier could not be evaluated.")
        return segments

    def get_dataset(self, openMl_id, test_size=0.2):
        X, y = openml.fetch_openml(data_id=openMl_id, return_X_y=True)
        self.dataset_categories = openml.fetch_openml(data_id=openMl_id).categories
        openml_data = openml.fetch_openml(data_id=openMl_id, return_X_y=False)
        self.feature_names_X = openml_data.feature_names
        imp = Imputer()
        self.target_categories = numpy.unique(y)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
        imp.fit(X_train)
        X_train = imp.transform(X_train)
        # Reuse the train-fitted imputer; refitting on the test split leaks test statistics.
        X_test = imp.transform(X_test)
        y_train = self._y_string_2_int(y_train)
        y_test = self._y_string_2_int(y_test)
        return X_train, X_test, y_train, y_test

    def _y_string_2_int(self, y: numpy.ndarray):
        if self.y_class_dict is None:
            self._create_class_dict(y)
        transdict = {label: code for code, label in self.y_class_dict.items()}
        return numpy.array([transdict[val] for val in y])

    def _create_class_dict(self, y: numpy.ndarray):
        unique_values = numpy.unique(y)
        self.y_class_dict = {code: label for code, label in enumerate(unique_values.tolist())}
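
The constructor above leans on a documented TPOT flag: with warm_start=True, a later fit() call reuses the population evolved by the previous one instead of restarting the search. A minimal sketch, with illustrative parameter values and X_train/y_train assumed in scope:

from tpot import TPOTClassifier

tpot = TPOTClassifier(generations=2, population_size=10, warm_start=True)
tpot.fit(X_train, y_train)  # initial evolutionary search
tpot.fit(X_train, y_train)  # resumes from the previously evolved population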
Example #5
def run_tpot(zeros, ones):
    all_data, y = make_all_data(zeros, ones)
    X_train, X_test, y_train, y_test = train_test_split(all_data,
                                                        y,
                                                        test_size=.1)
    pca = PCA(n_components=15)
    X_train = pca.fit_transform(X_train)
    # transform only: the PCA must be fitted on the training split alone
    X_test = pca.transform(X_test)

    # if not os.path.exists('tpot_checkpoint'):
    # os.mkdir('tpot_checkpoint')

    tpot = TPOTClassifier(
        n_jobs=-1,
        generations=50,
        verbosity=3,
        scoring='f1',
        # subsample=.5,
        # periodic_checkpoint_folder='tpot_checkpoint',
        max_eval_time_mins=30,
        memory='auto')

    tpot.fit(X_train, y_train)
    tpot.export('tpot_ecog_pipeline.py')
    results = tpot.predict(X_test)
    with open('tpot_metrics.txt', 'w') as out_file:
        out_file.write(sklearn.metrics.classification_report(y_test, results))
Example #6
def run_tpot(X, y, target_ft, time_budget=30, include_preprocessors=None, n_jobs=1):

    print(n_jobs)
    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, random_state=1)
    
    if include_preprocessors:
        pipeline_optimizer = TPOTClassifier(max_time_mins=time_budget // 60,
                                            generations=None,
                                            use_dask=False,
                                            #template="Selector-Transformer-Classifier",
                                            n_jobs=n_jobs)
    else:
        pipeline_optimizer = TPOTClassifier(max_time_mins=time_budget // 60,
                                            generations=None,
                                            use_dask=False,
                                            template='Classifier',
                                            n_jobs=n_jobs)
    
    pipeline_optimizer.fit(X_train, y_train)
    y_hat = pipeline_optimizer.predict(X_test)
    acc = sklearn.metrics.accuracy_score(y_test, y_hat)
    f1_s = sklearn.metrics.f1_score(y_test, y_hat, average='weighted')
    metrs = []
    metrs.append("Accuracy score - " + str(acc))
    metrs.append("F1 score - " + str(f1_s))
    res = ["","","","",f1_s,acc,"",pipeline_optimizer.export()]
    
    
    return str(metrs),res
Example #7
def build_classifier(data, name):
    X, y = data
    categories = pandas.unique(y)
    config = make_tpot_pmml_config(classifier_config_dict)
    del config["sklearn.neighbors.KNeighborsClassifier"]
    classifier = TPOTClassifier(generations=1,
                                population_size=3,
                                random_state=13,
                                config_dict=config,
                                verbosity=2)
    classifier.fit(X, y)
    pipeline = make_pmml_pipeline(classifier.fitted_pipeline_,
                                  active_fields=X.columns.values,
                                  target_fields=[y.name])
    print(repr(pipeline))
    store_pkl(pipeline, name + ".pkl")
    result = DataFrame(classifier.predict(X), columns=[y.name])
    if (len(categories) > 0):
        probabilities = DataFrame(classifier.predict_proba(X),
                                  columns=[
                                      "probability(" + str(category) + ")"
                                      for category in categories
                                  ])
        result = pandas.concat([result, probabilities], axis=1)
    store_csv(result, name + ".csv")
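
The config pruning above is the standard way to shrink TPOT's search space: copy the stock configuration dict and drop operators before fitting. A minimal sketch, using the same import path as the final example in this list:

from tpot import TPOTClassifier
from tpot.config.classifier import classifier_config_dict

config = dict(classifier_config_dict)  # copy before mutating the stock config
config.pop('sklearn.neighbors.KNeighborsClassifier', None)
classifier = TPOTClassifier(generations=1, population_size=3,
                            random_state=13, config_dict=config, verbosity=2)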
Example #8
def build_classifier(data, feature_pipeline, generations, population_size,
                     name):
    X, y = data
    Xt = feature_pipeline.fit_transform(X)
    Xt = Xt.astype(float)
    categories = pandas.unique(y)
    config = make_tpot_pmml_config(classifier_config_dict)
    config = filter_config(config)
    del config[
        "sklearn.naive_bayes.GaussianNB"]  # Does not support nesting - see http://mantis.dmg.org/view.php?id=208
    del config["sklearn.neighbors.KNeighborsClassifier"]
    del config[
        "sklearn.svm.LinearSVC"]  # Does not support classifier.predict_proba(Xt)
    del config["sklearn.tree.DecisionTreeClassifier"]
    classifier = TPOTClassifier(generations=generations,
                                population_size=population_size,
                                random_state=13,
                                config_dict=config,
                                verbosity=2)
    classifier.fit(Xt, y)
    pipeline = make_pmml_pipeline(Pipeline(steps=feature_pipeline.steps +
                                           classifier.fitted_pipeline_.steps),
                                  active_fields=X.columns.values,
                                  target_fields=[y.name])
    print(repr(pipeline))
    store_pkl(pipeline, name)
    result = DataFrame(classifier.predict(Xt), columns=[y.name])
    if (len(categories) > 0):
        probabilities = DataFrame(classifier.predict_proba(Xt),
                                  columns=[
                                      "probability(" + str(category) + ")"
                                      for category in categories
                                  ])
        result = pandas.concat([result, probabilities], axis=1)
    store_csv(result, name)
Example #9
def train(X,
          Y,
          test_size=0.2,
          auto_ml=False,
          use_best_classifier=False,
          classifier_name=None):
    trained_classifiers = []
    x_train, x_test, y_train, y_test = model_selection.train_test_split(
        X, Y, test_size=test_size)
    #x_scaler = StandardScaler()
    #x_train = x_scaler.fit_transform(x_train)
    #x_test = x_scaler.transform(x_test)

    if auto_ml:
        classifier = TPOTClassifier(generations=6, verbosity=2)
        classifier.fit(x_train, y_train)
    elif use_best_classifier and classifier_name:
        cls = copy.deepcopy(classifiers)
        estimator = cls[classifier_name].pop("estimator")
        classifier = estimator(**cls[classifier_name])
        classifier.fit(x_train, y_train)
    else:
        classifier = tree.DecisionTreeClassifier(max_depth=3,
                                                 criterion="entropy")
        #classifier = LogisticRegression(C=15, dual=False)
        classifier.fit(x_train, y_train)

    predicted = classifier.predict(x_test)
    print("Classification report for classifier %s:\n%s\n" %
          (classifier.__class__,
           metrics.classification_report(y_test, predicted)))
    print("Confusion matrix:\n%s" %
          metrics.confusion_matrix(y_test, predicted))
    trained_classifiers.append(classifier)
    return trained_classifiers
Example #10
class TpotEstimator(BaseEstimator):
    def __init__(self, task, **kwargs):
        super(TpotEstimator, self).__init__(task)
        if task == 'regression':
            self.tpot = TPOTRegressor(**kwargs)
        else:
            self.tpot = TPOTClassifier(**kwargs)
        self.name = 'tpot'
        self.label_encoder = None
        self.obj_cols = None

    def train(self, X, y, X_test):
        self.obj_cols = column_object_category_bool(X)
        self.label_encoder = SafeOrdinalEncoder()
        X[self.obj_cols] = self.label_encoder.fit_transform(X[self.obj_cols])
        self.tpot.fit(X, y)

    def predict_proba(self, X):
        X[self.obj_cols] = self.label_encoder.transform(X[self.obj_cols])
        proba = self.tpot.predict_proba(X)
        print(f'proba.shape:{proba.shape}')
        return proba

    def predict(self, X):
        X[self.obj_cols] = self.label_encoder.transform(X[self.obj_cols])
        return self.tpot.predict(X)
Example #11
def cli(erv_data):
    # import the ERV expression data as a Pandas dataframe
    df = pd.read_csv(erv_data)
    class_codes = dict(enumerate(
        df['class'].astype("category").cat.categories))
    df["class"] = df["class"].astype("category").cat.codes

    # create the test and training data
    X_train, X_test, y_train, y_test = train_test_split(df.values[:, 2:],
                                                        df.values[:, 1],
                                                        train_size=0.75,
                                                        test_size=0.25)

    # convert them all to floats
    X_train, X_test, y_train, y_test = X_train.astype(float), X_test.astype(
        float), y_train.astype(float), y_test.astype(float)

    # create a pipeline
    pipeline_optimizer = TPOTClassifier(cv=2, verbosity=2, n_jobs=-1)
    pipeline_optimizer.fit(X_train, y_train)
    pipeline_optimizer.export('tpot_exported_pipeline.py')

    print(f"Validation Accuracy: {pipeline_optimizer.score(X_test, y_test)}")
    cm = ConfusionMatrix([class_codes[y] for y in y_test], [
        class_codes[y] for y in
        [pipeline_optimizer.predict(x.reshape(1, -1))[0] for x in X_test]
    ])
    cm.save_html("report")
Example #12
def TPOT(df, task, timelife):
    df_new = copy.copy(df)
    pd.options.mode.chained_assignment = None
    #if isinstance(df_new, pd.DataFrame):
    df_new = fill_and_to_category(df_new)
    X, y, _ = return_X_y(df_new)
    #if not isinstance(df_new, pd.DataFrame):
    #X = fill_and_to_category(X)

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.3,
                                                        random_state=1)
    le = LabelEncoder()
    if task == 'classification':
        model = TPOTClassifier(generations=timelife,
                               cv=5,
                               max_time_mins=1,
                               random_state=1,
                               verbosity=2,
                               n_jobs=1)
        model.fit(X_train, y_train)

        # Fit the encoder once on all labels so y_test and the predictions
        # share a single mapping; refitting on each array could scramble it.
        le.fit(y)
        y_test = le.transform(y_test)
        y_pred = le.transform(model.predict(X_test))

        pipelines = get_stat(model)

        if len(np.unique(y)) > 2:
            return accuracy_score(y_test, y_pred), f1_score(
                y_test, y_pred, average='weighted'), pipelines
        else:
            return accuracy_score(y_test, y_pred), f1_score(y_test,
                                                            y_pred), pipelines
    else:
        model = TPOTRegressor(generations=timelife,
                              cv=5,
                              max_time_mins=1,
                              random_state=1,
                              verbosity=2,
                              n_jobs=1)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        pipelines = get_stat(model)
        return np.sqrt(mean_squared_error(y_test,
                                          y_pred)), r2_score(y_test,
                                                             y_pred), pipelines
Example #13
def clfWithTpot(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.1)
    my_tpot = TPOTClassifier(generations=10, verbosity=2)
    my_tpot.fit(np.array(X_train), np.array(y_train))
    print(my_tpot.score(np.array(X_test), np.array(y_test)))
    my_tpot.export('exported_pipeline.py')
    predictions = my_tpot.predict(np.array(X_test))
    print(confusion_matrix(y_test, predictions))
Example #14
def test_predict_2():
    """Assert that the TPOT predict function returns a numpy matrix of shape (num_testing_rows,)"""

    tpot_obj = TPOTClassifier()
    tpot_obj._optimized_pipeline = creator.Individual.\
        from_string('DecisionTreeClassifier(input_matrix)', tpot_obj._pset)
    tpot_obj._fitted_pipeline = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline)
    tpot_obj._fitted_pipeline.fit(training_features, training_classes)

    result = tpot_obj.predict(testing_features)

    assert result.shape == (testing_features.shape[0],)
Example #15
def run_tpot(X, y, target_ft, time_budget=30, include_preprocessors=None):
    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
        X, y, random_state=1)
    pipeline_optimizer = TPOTClassifier(max_time_mins=time_budget / 60,
                                        generations=None)
    pipeline_optimizer.fit(X_train, y_train)
    y_hat = pipeline_optimizer.predict(X_test)
    metrs = []
    metrs.append("Accuracy score - " +
                 str(sklearn.metrics.accuracy_score(y_test, y_hat)))
    metrs.append("F1 score - " +
                 str(sklearn.metrics.f1_score(y_test, y_hat, average='macro')))
    return str(metrs)
Example #16
def test_predict_2():
    """Assert that the TPOT predict function returns a numpy matrix of shape (num_testing_rows,)"""

    tpot_obj = TPOTClassifier()
    tpot_obj._optimized_pipeline = creator.Individual.\
        from_string('DecisionTreeClassifier(input_matrix)', tpot_obj._pset)
    tpot_obj._fitted_pipeline = tpot_obj._toolbox.compile(
        expr=tpot_obj._optimized_pipeline)
    tpot_obj._fitted_pipeline.fit(training_features, training_classes)

    result = tpot_obj.predict(testing_features)

    assert result.shape == (testing_features.shape[0], )
Example #17
def TPOT_Classifier():
    tpot = TPOTClassifier(
        verbosity=2,
        max_time_mins=390,
        population_size=40,
    )
    tpot.fit(x_train, y_train)
    tpot.export('tpot_assignment_pipeline.py')
    TPOT_predict = tpot.predict(x_test)
    score = tpot.score(x_test, y_test)
    print(score)
    print(y_test)
    print(TPOT_predict)
    return score
Example #18
def test_predict_2():
    """Assert that the TPOT predict function returns a numpy matrix of shape (num_testing_rows,)"""

    tpot_obj = TPOTClassifier()
    pipeline_string = ('DecisionTreeClassifier(input_matrix, DecisionTreeClassifier__criterion=gini'
                       ', DecisionTreeClassifier__max_depth=8,DecisionTreeClassifier__min_samples_leaf=5,'
                       'DecisionTreeClassifier__min_samples_split=5)')
    tpot_obj._optimized_pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset)
    tpot_obj._fitted_pipeline = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline)
    tpot_obj._fitted_pipeline.fit(training_features, training_classes)

    result = tpot_obj.predict(testing_features)

    assert result.shape == (testing_features.shape[0],)
Example #19
class TPOTClassifierModel(Model):
    def __init__(
        self,
        name: str,
        model_params: Dict[str, Any],
    ) -> None:
        super().__init__(name, model_params)
        self._model = TPOTClassifier(**model_params)

    def _force_fit(self, X: np.ndarray, y: np.ndarray) -> None:
        self._model.fit(X, y)
        # TODO: This is required by DESlib which is kind of annoying.
        #       They call check_if_fitted(model, 'classes_'), meaning these have
        #       to act more like general sklearn models
        #
        #       If using more classifiers, the creation of a ClassifierModel
        #       base class is probably required to ensure consistency
        self.classes_ = self._model.fitted_pipeline_.classes_

    def fit(self, X: np.ndarray, y: np.ndarray) -> None:
        if not isinstance(self._model, TPOTClassifier):
            raise RuntimeError(
                'Due to TPOT being unpicklable, saving this'
                ' means only the actual sklearn.Pipeline'
                ' was saved. Calling fit will fit this pipeline'
                ' rather than the TPOT algorithm. If this is'
                ' desired behaviour, please use `_force_fit` instead')
        self._force_fit(X, y)

    def save(self, path: str) -> None:

        # See comment above class
        if isinstance(self._model, TPOTClassifier):
            self._model = self._model.fitted_pipeline_

        with open(path, 'wb') as file:
            pickle.dump(self, file)

    @classmethod
    def load(cls, path: str):
        with open(path, 'rb') as file:
            model = pickle.load(file)
            return cast(TPOTClassifierModel, model)

    def predict(self, X: np.ndarray) -> np.ndarray:
        return self._model.predict(X)

    def predict_proba(self, X: np.ndarray) -> np.ndarray:
        return self._model.predict_proba(X)
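
A hedged round-trip sketch for the save/load behaviour documented above; the constructor parameters and file path are illustrative, and X/y are assumed in scope:

model = TPOTClassifierModel('tpot', {'generations': 5, 'population_size': 20})
model.fit(X, y)                 # fits TPOT itself
model.save('tpot_model.pkl')    # swaps in the fitted sklearn pipeline, then pickles
restored = TPOTClassifierModel.load('tpot_model.pkl')
restored.predict(X)             # fine; restored.fit(X, y) would raise RuntimeError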
Example #20
def Classifier(x, y):
    x_train = x
    y_train = y
    tpot = TPOTClassifier(
        verbosity=2,
        max_time_mins=10,
        population_size=50,
    )
    tpot.fit(x_train, y_train)
    tpot.export('tpot_pipeline.py')
    TPOT_predict = tpot.predict(x_test)
    score = tpot.score(x_test, y_test)
    #print(score)
    #print(y_test)
    #print(TPOT_predict)
    return score
Example #21
def tpot_fit_pred(X_train, y_train, X_test, id_test, name_dataset):
    tp = TPOTClassifier(verbosity=3)
    start_time = timer(None)
    tp.fit(X_train, y_train)
    tp.export('tpot_pipeline_dont_overfit.py')
    time = timer(start_time)
    preds = tp.predict(X_test)

    time_out = open(name_dataset + '_' + 'tpot', "w")
    time_out.write(time)
    time_out.close()

    submission = pd.DataFrame({"id": id_test, "target": preds})

    submission.to_csv(name_dataset + '_' + 'tpot' + '_submission.csv',
                      index=False)
Example #22
class TPOTBaselineModel(Model):
    def __init__(
        self,
        name: str,
        model_params: Dict[str, Any],
    ) -> None:
        super().__init__(name, model_params)
        self._model = TPOTClassifier(**model_params)

    def _force_fit(self, X: np.ndarray, y: np.ndarray) -> None:
        self._model.fit(X, y)

    def fit(self, X: np.ndarray, y: np.ndarray) -> None:
        if not isinstance(self._model, TPOTClassifier):
            raise RuntimeError(
                'Due to TPOT being unpicklable, saving this'
                ' means only the actual sklearn.Pipeline'
                ' was saved. Calling fit will fit this pipeline'
                ' rather than the TPOT algorithm. If this is'
                ' desired behaviour, please use `_force_fit` instead')
        self._force_fit(X, y)

    def save(self, path: str) -> None:
        # See comment above class
        if isinstance(self._model, TPOTClassifier):
            self._model = self._model.fitted_pipeline_

        with open(path, 'wb') as file:
            pickle.dump(self, file)

    @classmethod
    def load(cls, path: str):
        with open(path, 'rb') as file:
            model = pickle.load(file)
            return cast(TPOTBaselineModel, model)

    def predict(self, X: np.ndarray) -> np.ndarray:
        return self._model.predict(X)

    def predict_proba(self, X: np.ndarray) -> np.ndarray:
        # TODO: May cause issues if an SVC/SVM model is best (predict_proba may be unavailable)
        return self._model.predict_proba(X)
Example #23
def tpot_fit_pred(X_train, y_train, X_test, id_test, name_dataset):
    tp = TPOTClassifier(generations=5,
                        population_size=20,
                        random_state=42,
                        verbosity=2)
    start_time = timer(None)
    tp.fit(X_train, y_train)
    tp.export('tpot_pipeline_' + name_dataset + '.py')
    time = timer(start_time)
    preds = tp.predict(X_test)

    time_out = open("time_files/" + name_dataset + '_' + 'tpot', "w")
    time_out.write(time)
    time_out.close()

    submission = pd.DataFrame({"id": id_test, "target": preds})

    submission.to_csv("submit_files/" + name_dataset + '_' +
                      'tpot_submission' + '.csv',
                      index=False)
Example #24
def process_tpot(X_train, X_test, y_train, df_types, m_type, seed, *args):
    """Function that trains and tests data using tpot"""

    from tpot import TPOTClassifier
    from tpot import TPOTRegressor
    from ..config import classifier_config_dict

    # Register Timer
    def handler(signum, frame):
        raise SystemExit('Time limit exceeded, sending system exit...')

    signal.signal(signal.SIGALRM, handler)

    # default cv is 5
    if m_type == 'classification':
        automl = TPOTClassifier(generations=100,
                                population_size=100,
                                config_dict=classifier_config_dict,
                                verbosity=3,
                                max_time_mins=int(10800/60),
                                scoring='f1_weighted',
                                n_jobs=N_CORES,
                                random_state=seed)
    else:
        automl = TPOTRegressor(generations=100, 
                               population_size=100,
                               verbosity=3,
                               max_time_mins=int(10800/60),
                               scoring='neg_mean_squared_error',
                               n_jobs=N_CORES,
                               random_state=seed)

    # Set timer
    # for long running processes TPOT sometimes does not end even with generations
    signal.alarm(TIME_PER_TASK+GRACE_PERIOD)
    automl.fit(X_train.values, y_train.values)
    signal.alarm(0)

    return (automl.predict_proba(X_test.values) if m_type == 'classification' else 
            automl.predict(X_test.values))
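
The timer registration above follows the usual POSIX SIGALRM pattern; a minimal standalone sketch, where long_running_fit is a hypothetical stand-in for automl.fit and the alarm value is illustrative:

import signal

def handler(signum, frame):
    raise SystemExit('Time limit exceeded, sending system exit...')

signal.signal(signal.SIGALRM, handler)
signal.alarm(60)            # raise via the handler after 60 seconds
try:
    long_running_fit()      # hypothetical stand-in for automl.fit(...)
finally:
    signal.alarm(0)         # always cancel the pending alarm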
Example #25
def main():

  # set up the path to the data sets and the data we're going to experiment
  # with
  base_path = '/scratch/ditzler/Git/ClassificationDatasets/csv/'
  data_setz = [#'bank',
    'blood',
    'breast-cancer-wisc-diag',
    'breast-cancer-wisc-prog',
    'breast-cancer-wisc',
    'breast-cancer',
    'congressional-voting',
    'conn-bench-sonar-mines-rocks',
    'credit-approval',
    'cylinder-bands',
    'echocardiogram',
    #'fertility',
    'haberman-survival',
    'heart-hungarian',
    'hepatitis',
    'ionosphere',
    'mammographic',
    'molec-biol-promoter',
    'musk-1',
    'oocytes_merluccius_nucleus_4d',
    'oocytes_trisopterus_nucleus_2f',
    'ozone',
    'parkinsons',
    'pima',
    #'pittsburg-bridges-T-OR-D';
    'planning',
    'ringnorm',
    #'spambase',
    'spectf_train',
    'statlog-australian-credit',
    'statlog-german-credit',
    'statlog-heart',
    'titanic',
    #'twonorm',
    'vertebral-column-2clases']

  # n_splitz is like the number of CV folds (they're bootstraps here); then set up
  # some variables to save the results to.
  n_splitz = 10
  errors = np.zeros((len(data_setz),))
  fms = np.zeros((len(data_setz),))
  times = np.zeros((len(data_setz),))
  m = 0

  for n in range(n_splitz):
    print('Split ' + str(n) + ' of ' + str(n_splitz))
    for i in range(len(data_setz)):
      print('    ' + data_setz[i])
      df = pd.read_csv(base_path + data_setz[i] + '.csv', sep=',')
      data = df.to_numpy()  # df.as_matrix() was removed in pandas 1.0
      X = data[:, :-1]
      y = data[:, -1]
      X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75, test_size=0.25, random_state=m)
      m += 1
      
      ts = time.time()
      tpot = TPOTClassifier(generations=10, population_size=25, verbosity=1)
      tpot.fit(X_train, y_train)
      times[i] += (time.time() - ts)

      errors[i] += (1-tpot.score(X_test, y_test))
      yhat = tpot.predict(X_test)
      fms[i] += f1_score(y_test, yhat, average='macro')
  
  errors /= n_splitz
  fms /= n_splitz
  times /= n_splitz

  df = pd.DataFrame({'errors': errors, 'fms': fms, 'times': times})
  df.to_csv(path_or_buf='tpot-results2.csv', sep=',')

  return None 
Example #26
probab = tpot.predict_proba(x_v)
probab = probab[:,1]
print('AUC Score is {}'.format(roc_auc_score(y_valid,probab)))
t2 = time.time()
print('Total time taken by TPOT:', int(t2-t1))



check_x = x_v.set_index(X_valid['AGREEMENTID'])

check_y = pd.DataFrame(y_valid).set_index(X_valid['AGREEMENTID'])

check_pred = pd.DataFrame(tpot.predict(x_v)).set_index(X_valid['AGREEMENTID'])

check_probab = pd.DataFrame(tpot.predict_proba(x_v)).set_index(X_valid['AGREEMENTID'])

# new_y = check_y.reset_index().groupby(['AGREEMENTID'])['FORECLOSURE'].agg({'y':np.mean})
new_y = check_y.reset_index().groupby(['AGREEMENTID'])['FORECLOSURE'].agg(lambda x: stats.mode(x)[0][0])

# new_pred = check_pred.reset_index().groupby(['AGREEMENTID'])[0].agg({'y':stats.mode(axis = None)})
new_pred = check_pred.reset_index().groupby(['AGREEMENTID'])[0].agg(lambda x: stats.mode(x)[0][0])

new_probab = check_probab.reset_index().groupby(['AGREEMENTID'])[1].mean()

print('new_accuracy is {}'.format(np.mean(new_pred==new_y)))

print('new roc auc is {}'.format(roc_auc_score(new_y,new_probab)))
Example #27
from pandas import read_csv
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import make_scorer
from tpot import TPOTClassifier

X_train = read_csv('input/aps_failure_training_set.csv',na_values='na')
X_test = read_csv('input/aps_failure_test_set.csv',na_values='na')

# deal with missing values and constant features and normalize
X_train, X_test, y_train, y_test = preprocess_data(X_train, X_test)
print(f'Data loaded: {len(X_train)} training observations, {len(X_test)} testing observations')

X_train, y_train = balance_data(X_train, y_train, n_samples = 2500)
print(f'Balanced training data ({2500/1000}/1): {len(X_train)} training observations, {len(X_test)} testing observations')

# A custom scorer function is created in order to reflect on the different cost of misclassification (fn > fp)
def scania_scorer(y_true,y_pred):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()  
    total_cost = 10*fp + 500*fn
    return total_cost

custom_scania_scorer = make_scorer(scania_scorer, greater_is_better=False)

tpot = TPOTClassifier(generations=100, population_size=100, verbosity=3, random_state=42, use_dask=True, n_jobs=-1, memory='auto', early_stop=10, scoring=custom_scania_scorer)
tpot.fit(X_train, y_train)
y_pred = tpot.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print("Total cost: " + str(scania_scorer(y_test, y_pred)))
print(tpot.score(X_test, y_test))
tpot.export('tpot_scania_pipeline.py')
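
A tiny worked example of the asymmetric cost encoded by scania_scorer above: a false positive costs 10, a false negative costs 500, so missed failures dominate the score (toy arrays, illustrative only):

import numpy as np

y_true = np.array([0, 0, 1, 1])
y_pred = np.array([0, 1, 0, 1])        # one false positive, one false negative
print(scania_scorer(y_true, y_pred))   # 10*1 + 500*1 = 510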
Example #28
# # Without one-hot encoding:
# train_data['Sex']=train_data['Sex'].replace(['male','female'],[0,1])
# test_data['Sex']=test_data['Sex'].replace(['male','female'],[0,1])
# train_data['Embarked']=train_data['Embarked'].replace(['S','Q','C'],[0,1,2])
# test_data['Embarked']=test_data['Embarked'].replace(['S','Q','C'],[0,1,2])

# Define the feature columns
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
# Define the training and test data
train_features = train_data[features]
train_y = train_data['Survived']
test_features = test_data[features]

# One-hot encode the features
dv = DictVectorizer(sparse=False)
train_dv = dv.fit_transform(train_features.to_dict(orient='records'))
test_dv = dv.transform(test_features.to_dict(orient='records'))

# # Train the model (CART)
TP = TPOTClassifier()
TP.fit(train_dv, train_y)
pred = TP.predict(test_dv)

# for n,p in zip(test_data['Name'],pred):
#     print('{} is {}'.format(n,p))

# Get the decision-tree accuracy (on the training set)
acc_decision_tree = round(TP.score(train_dv, train_y), 6)
print('score accuracy is %.4lf' % acc_decision_tree)
Example #29
        START_EXPERIMENT = time.time()

        automl = TPOTClassifier(
            max_time_mins=(TIME_LIMIT // 60),
            scoring='roc_auc',
            verbosity=1,
            random_state=RANDOM_SEED,
        )
        automl.fit(
            X_train,
            y_train,
        )
        try:
            predictions = automl.predict_proba(X_test)
            y_test_predict_proba = predictions[:, 1]
        except RuntimeError:
            # Some fitted pipelines expose no predict_proba; fall back to
            # hard predictions, which are already 1-D.
            predictions = automl.predict(X_test)
            y_test_predict_proba = predictions
        y_test_predict = automl.predict(X_test)

        print('AUC: ', roc_auc_score(y_test, y_test_predict_proba))

        END_EXPERIMENT = time.time()

        #preds = pd.DataFrame(predictions)
        #preds['Y'] = y_test.reset_index(drop=True)
        #preds.to_csv(f'./result/predicts/{DATASET_NAME}_{MODEL_NAME}_predict_proba_exp_{EXPERIMENT}.csv', index=False,)

        metrics.append({
            'AUC':
            round(roc_auc_score(y_test, y_test_predict_proba), 4),
            'log_loss':
Example #30
def main():

    # Import Data
    train = pd.read_csv(
        "../input/train.csv",
        dtype={"Age": np.float64},
    )
    test = pd.read_csv(
        "../input/test.csv",
        dtype={"Age": np.float64},
    )

    # Check to see if there are any null training set values
    print '********Checking NaNs for Training Data********'
    print 'Fare NaNs:', len(train[pd.isnull(train['Fare'])])
    print 'Class NaNs:', len(train[pd.isnull(train['Pclass'])])
    print 'Age NaNs:', len(train[pd.isnull(train['Age'])])
    print 'Sibling NaNs:', len(train[pd.isnull(train['SibSp'])])
    print 'Parent/Child NaNs:', len(train[pd.isnull(train['Parch'])])
    print 'Sex NaNs:', len(train[pd.isnull(train['Sex'])])
    print 'Embarked NaNs:', len(train[pd.isnull(train['Embarked'])])
    print '*******************************************'

    pt.plot_distribution(train, var='Age', target='Survived', row='Sex')
    pt.plot_distribution(train, var='Fare', target='Survived', row='Sex')
    raw_input()

    # Replacing on fine grouping
    print 'Replacing Age NaNs with categorical means for Class, Sex, Siblings, Parent/Child'
    train['Age'] = train.groupby(
        ['Pclass', 'Sex', 'SibSp',
         'Parch'])['Age'].transform(lambda x: x.fillna(x.mean()))

    # Replacing on less granular grouping
    print 'Checking Age NaNs after first replacement:', len(train[pd.isnull(
        train['Age'])])
    train['Age'] = train.groupby(
        ['Pclass', 'Sex'])['Age'].transform(lambda x: x.fillna(x.mean()))

    print 'Checking Age NaNs after replacement:', len(train[pd.isnull(
        train['Age'])])
    train['Age'] = train['Age'].astype(int)

    train['Embarked'].fillna('S', inplace=True)
    print 'Checking Embarked NaNs after replacement:', len(train[pd.isnull(
        train['Embarked'])])

    seaborn_hist(train, 'Age', 'Age Distributions Before Replacement')
    seaborn_hist(train, 'Age', 'Age Distributions After Replacement')
    """ Use train average information to replace NaNs in test set """
    # Check to see if there are any null testing set values
    print '********Checking NaNs for Test Data********'
    print 'Fare NaNs:', len(test[pd.isnull(test['Fare'])])
    print 'Class NaNs:', len(test[pd.isnull(test['Pclass'])])
    print 'Age NaNs:', len(test[pd.isnull(test['Age'])])
    print 'Sibling NaNs:', len(test[pd.isnull(test['SibSp'])])
    print 'Parent/Child NaNs:', len(test[pd.isnull(test['Parch'])])
    print 'Sex NaNs:', len(test[pd.isnull(test['Sex'])])
    print 'Embarked NaNs:', len(test[pd.isnull(test['Embarked'])])
    print '*******************************************'

    # Replacing on fine grouping
    print 'Replacing Age NaNs with categorical means for Class, Sex, Siblings, Parent/Child'
    test['Age'] = train.groupby(
        ['Pclass', 'Sex', 'SibSp',
         'Parch'])['Age'].transform(lambda x: x.fillna(x.mean()))
    print 'Checking Age NaNs after first replacement:', len(test[pd.isnull(
        test['Age'])])
    test['Age'] = test['Age'].astype(int)

    # Replacing on fine grouping
    print 'Replacing Fare NaNs with categorical means for Class, Sex, Siblings, Parent/Child'
    test['Fare'] = train.groupby(
        ['Pclass', 'Sex', 'SibSp',
         'Parch'])['Fare'].transform(lambda x: x.fillna(x.mean()))
    print 'Checking Fare NaNs after first replacement:', len(test[pd.isnull(
        test['Fare'])])

    # Create a Family Size column
    train['Family_Size'] = train['SibSp'] + train['Parch']

    # Creating Titles column in DataFrame
    titles = sorted(set([x for x in train.Name.map(lambda x: get_title(x))]))
    print 'List of titles in data'
    print len(titles), ':', titles
    train['Title'] = train['Name'].map(lambda x: get_title(x))
    train['Title'] = train.apply(replace_titles, axis=1)

    print '*******************************************'

    # Determine the number of cabins reserved per person
    # print train['Deck'].unique()
    # raw_input('press enter...')
    # train['Cabin_Length'] = train['Cabin'].str.split(' ').str.len()
    # train['Cabin_Length'].fillna(0,inplace=True)

    column_vals = [
        'Sex', 'Fare', 'Age', 'Pclass', 'Family_Size', 'Title', 'Embarked'
    ]
    mean_analysis(train, column_vals)

    print '*******************************************'
    """ Creating deck from cabin, age label bands, fare label bands, titles from names, and applying LabelEncoder to categorical variables """
    # Convert Categorical Variables to Numerical
    le_age = LabelEncoder()
    le_fare = LabelEncoder()
    le_title = LabelEncoder()
    le_embarked = LabelEncoder()
    le_deck = LabelEncoder()
    le_sex = LabelEncoder()
    le_fam = LabelEncoder()

    train['Deck'] = train['Cabin'].str[0]
    train['Deck'].fillna('Z', inplace=True)

    age_labels = [
        'Band_1', 'Band_2', 'Band_3', 'Band_4', 'Band_5', 'Band_6', 'Band_7',
        'Band_8', 'Band_9', 'Band_10'
    ]
    train['AgeBand'] = pd.cut(train['Age'], bins=10, labels=age_labels)
    train['AgeBand'] = le_age.fit_transform(train['AgeBand'])

    fare_labels = [
        'Band_1', 'Band_2', 'Band_3', 'Band_4', 'Band_5', 'Band_6', 'Band_7',
        'Band_8', 'Band_9', 'Band_10'
    ]
    train['FareBand'] = pd.cut(train['Fare'], bins=10, labels=fare_labels)
    train['FareBand'] = le_fare.fit_transform(train['FareBand'])

    fam_size_labels = ['Band_1', 'Band_2', 'Band_3']
    train['FamilySizeBand'] = pd.cut(train['Family_Size'],
                                     bins=3,
                                     labels=fam_size_labels)
    train['FamilySizeBand'] = le_fam.fit_transform(train['FamilySizeBand'])

    train['Title'] = le_title.fit_transform(train['Title'])

    train['Embarked'] = le_embarked.fit_transform(train['Embarked'])

    train['Deck'] = le_deck.fit_transform(train['Deck'])

    train['Sex'] = le_sex.fit_transform(train['Sex'])

    train.drop([
        'PassengerId', 'Name', 'Age', 'SibSp', 'Parch', 'Ticket', 'FareBand',
        'Cabin', 'Family_Size'
    ],
               inplace=True,
               axis=1)
    train.drop(['AgeBand', 'Deck', 'Title'], inplace=True, axis=1)
    # survived = train['Survived']
    # train.drop(['Survived'],inplace=True,axis=1)
    # scaler = preprocessing.StandardScaler().fit(train)
    # train = pd.DataFrame(scaler.transform(train))
    # train['Survived'] = survived
    # del survived
    print train.head()
    raw_input()

    colormap = plt.cm.viridis
    plt.figure(figsize=(12, 12))
    plt.title('Pearson Correlation of Features', y=1.05, size=15)
    sns.heatmap(train.corr(),
                linewidths=0.1,
                vmax=1.0,
                square=True,
                cmap=colormap,
                linecolor='white',
                annot=True)
    # pt.plot_correlation_map(train)
    plt.show(block=False)

    print train.corr()['Survived']
    raw_input()

    # """ Cleaning Test Set """
    test['Deck'] = test['Cabin'].str[0]
    test['Deck'].fillna('Z', inplace=True)
    test['Family_Size'] = test['SibSp'] + test['Parch']

    test['AgeBand'] = pd.cut(test['Age'], bins=10, labels=age_labels)
    test['AgeBand'] = le_age.transform(test['AgeBand'])

    test['FareBand'] = pd.cut(test['Fare'], bins=10, labels=fare_labels)
    test['FareBand'] = le_fare.transform(test['FareBand'])

    test['FamilySizeBand'] = pd.cut(test['Family_Size'],
                                    bins=3,
                                    labels=fam_size_labels)
    test['FamilySizeBand'] = le_fam.transform(test['FamilySizeBand'])

    test['Title'] = test['Name'].map(lambda x: get_title(x))
    test['Title'] = test.apply(replace_titles, axis=1)
    test['Title'] = le_title.transform(test['Title'])

    test['Embarked'] = le_embarked.transform(test['Embarked'])

    test['Deck'] = le_deck.transform(test['Deck'])

    test['Sex'] = le_sex.transform(test['Sex'])

    results = pd.DataFrame(columns=['PassengerId', 'Survived'])
    results['PassengerId'] = test['PassengerId']
    test.drop([
        'PassengerId', 'Name', 'Age', 'SibSp', 'Parch', 'Ticket', 'FareBand',
        'Cabin', 'Family_Size'
    ],
              inplace=True,
              axis=1)
    test.drop(['AgeBand', 'Deck', 'Title'], inplace=True, axis=1)
    # test = pd.DataFrame(scaler.transform(test))
    """ Model Training """
    train.rename(columns={'Survived': 'class'}, inplace=True)
    X_train = train.drop(['class'], axis=1)
    y_train = train['class']
    print y_train.head()
    raw_input()

    # Cross Validation
    train_data, test_data, train_target, test_target = train_test_split(
        X_train, y_train, test_size=0.25, random_state=0)

    pipeline_optimizer = TPOTClassifier(generations=10,
                                        population_size=25,
                                        random_state=42,
                                        cv=5,
                                        verbosity=2,
                                        n_jobs=3,
                                        scoring='f1')
    pipeline_optimizer.fit(train_data, train_target)
    print pipeline_optimizer.score(test_data, test_target)
    pipeline_optimizer.export('Titanic_TPOT_Classifier.py')
    results['Survived'] = pipeline_optimizer.predict(test)

    # Trying a bunch of different classifiers
    # classifiers = [LinearSVC(), KNeighborsClassifier(), DecisionTreeClassifier(max_depth=5), RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1), MLPClassifier(alpha=1), GaussianNB(), SVC(gamma=2, C=1), AdaBoostClassifier(), XGBClassifier()]

    # print 'Classifier score:', clf.score(test_data,test_target.values.ravel())

    # # """ Model Analysis """
    # # # cv = ShuffleSplit(n_splits=100, test_size=0.2, random_state=0)
    # # # title = "Learning Curves (XGBoost)"
    # # # plot_learning_curve(clf, title, X_train, y_train.values, ylim=(0.7, 1.01), cv=cv, n_jobs=4)
    # # # y_pred = clf.predict(X_train)
    # # # print 'done in %0.3fs' % (time() - t0)
    # # # print confusion_matrix(y_train,y_pred)

    # # # print '*******************************************'
    # # # print classification_report(y_train,y_pred)
    # # # print 0.5 * (precision_recall_fscore_support(y_train,y_pred)[2][0] + precision_recall_fscore_support(y_train,y_pred)[2][1])
    # # # print '*******************************************'

    # # # print max(evaluation(classifiers,train_data,train_target,test_data,test_target)[:][1])
    # # _ , f_scores = evaluation(classifiers,train_data,train_target,test_data,test_target)
    # # print f_scores.index(max(f_scores))
    # # model = classifiers[-1]#f_scores.index(max(f_scores))]
    # # print model
    # # rfecv = RFECV( estimator = model , step = 1 , cv = StratifiedKFold( 2 ) , scoring = 'accuracy' )
    # # rfecv.fit( train_data , train_target )

    # # print rfecv.score( train_data , train_target ) , rfecv.score( test_data , test_target )
    # # print "Optimal number of features : %d" % rfecv.n_features_
    # # input()

    # # # Plot number of features VS. cross-validation scores
    # # plt.figure()
    # # plt.xlabel( "Number of features selected" )
    # # plt.ylabel( "Cross validation score (nb of correct classifications)" )
    # # plt.plot( range( 1 , len( rfecv.grid_scores_ ) + 1 ) , rfecv.grid_scores_ )
    # # plt.show(block=False)

    # # print X_train.head()
    # # X_train = pd.DataFrame(rfecv.transform(X_train))
    # # print rfecv.ranking_
    # # print X_train.head()
    # # input()

    # # # """ Model Predicting """
    # # print 'Fitting model to full training set...'
    # # model.fit(X_train,y_train)

    # # X_test = test[['Pclass','Sex','Embarked','Title','Deck','AgeBand','FareBand','FamilySizeBand']]
    # # X_test = pd.DataFrame(rfecv.transform(pd.DataFrame(scaler.transform(X_test))))
    # # results['Survived'] = model.predict(X_test)
    """ Ensemble Stacking """
    # results['Survived'] = ensemble_stacking(train,test)

    results.to_csv('Titanic_Results.csv', sep=',', index=False)

    plt.show(block=False)
    raw_input('Press [enter] to close.')
Example #31
X_train_index, X_test_index, y_train_string, y_test_string = train_test_split(
    list(range(nImg)), image_order[:, 0], test_size=0.2)
X_train = data_concat[X_train_index, :]
X_test = data_concat[X_test_index, :]
y_train = Label(y_train_string)
y_test = Label(y_test_string)
"""
-------------- TPOT does its magic -------------------------------------
"""
from tpot import TPOTClassifier
clf = TPOTClassifier(verbosity=2, n_jobs=-1, config_dict='TPOT light')
clf.fit(X_train, y_train)

print(clf.score(X_test, y_test))
predictions = clf.predict(X_test)
print(confusion_matrix(y_test, predictions))
score1 = clf.score(X_test, y_test)
confusion_matrix1 = confusion_matrix(y_test, predictions)

print('Order of which element that was the most of=',
      Counter(y_test_string).keys())
print('How many that were of each unique element',
      Counter(y_test_string).values())

print(
    '-------------------------------------PERSON 2-------------------------------------------------------'
)
"""
-------------- Data is imported, concated and normalized--------------------
"""
Example #32
def evaluate_tpot(dataset, task_type, run_id, time_limit, seed=1, use_fe=True):
    n_job = args.n_job
    # Construct the ML model.
    if not use_fe:
        from mindware.utils.tpot_config import classifier_config_dict
        config = classifier_config_dict

    _task_type = MULTICLASS_CLS if task_type == 'cls' else REGRESSION
    if task_type == 'cls':
        if space_type == 'large':
            from tpot.config.classifier import classifier_config_dict
        elif space_type == 'small':
            from tpot.config.classifier_small import classifier_config_dict
        else:
            from tpot.config.classifier_extremely_small import classifier_config_dict
        config_dict = classifier_config_dict
    else:
        if space_type == 'large':
            from tpot.config.regressor import regressor_config_dict
        elif space_type == 'small':
            from tpot.config.regressor_small import regressor_config_dict
        else:
            from tpot.config.regressor_extremely_small import regressor_config_dict
        config_dict = regressor_config_dict

    if task_type == 'cls':
        automl = TPOTClassifier(config_dict=config_dict,
                                generations=10000,
                                population_size=20,
                                verbosity=2,
                                n_jobs=n_job,
                                cv=0.2,
                                scoring='balanced_accuracy',
                                max_eval_time_mins=max_eval_time,
                                max_time_mins=int(time_limit / 60),
                                random_state=seed)
        raw_data, test_raw_data = load_train_test_data(dataset,
                                                       task_type=_task_type)
        X_train, y_train = raw_data.data
        X_test, y_test = test_raw_data.data
        X_train, y_train = X_train.astype('float64'), y_train.astype('int')
        X_test, y_test = X_test.astype('float64'), y_test.astype('int')
    else:
        automl = TPOTRegressor(config_dict=config_dict,
                               generations=10000,
                               population_size=20,
                               verbosity=2,
                               n_jobs=n_job,
                               cv=0.2,
                               scoring='neg_mean_squared_error',
                               max_eval_time_mins=max_eval_time,
                               max_time_mins=int(time_limit / 60),
                               random_state=seed)
        raw_data, test_raw_data = load_train_test_data(dataset,
                                                       task_type=_task_type)
        X_train, y_train = raw_data.data
        X_test, y_test = test_raw_data.data
        X_train, y_train = X_train.astype('float64'), y_train.astype('float64')
        X_test, y_test = X_test.astype('float64'), y_test.astype('float64')

    start_time = time.time()
    automl.fit(X_train, y_train)
    y_hat = automl.predict(X_test)
    pareto_front = automl._pareto_front

    if task_type == 'cls':
        score_func = balanced_accuracy_score
    else:
        score_func = mean_squared_error

    valid_score = max([
        pareto_front.keys[x].wvalues[1] for x in range(len(pareto_front.keys))
    ])
    test_score = score_func(y_test, y_hat)
    print('Run ID         : %d' % run_id)
    print('Dataset        : %s' % dataset)
    print('Val/Test score : %f - %f' % (valid_score, test_score))
    scores = automl.scores
    times = automl.times
    _space_type = '%s_' % space_type if space_type != 'large' else ''
    save_path = save_dir + '%s%s_tpot_%s_false_%d_1_%d.pkl' % (
        _space_type, task_type, dataset, time_limit, run_id)
    with open(save_path, 'wb') as f:
        pickle.dump(
            [dataset, valid_score, test_score, times, scores, start_time], f)