Пример #1
0
def test_fit2():
    """Check that fit() yields an optimized pipeline when config_dict is 'TPOT light'."""
    settings = dict(
        random_state=42,
        population_size=1,
        offspring_size=2,
        generations=1,
        verbosity=0,
        config_dict='TPOT light',
    )
    tpot_obj = TPOTClassifier(**settings)
    tpot_obj.fit(training_features, training_classes)

    # After fitting, the best pipeline and the start timestamp must be set.
    assert isinstance(tpot_obj._optimized_pipeline, creator.Individual)
    assert tpot_obj._start_datetime is not None
Пример #2
0
class TpotEstimator(BaseEstimator):
    """Wrap a TPOT regressor or classifier behind the project's estimator base class.

    The columns reported by ``column_object_category_bool`` are encoded with
    ``SafeOrdinalEncoder`` before the data is handed to TPOT.
    """

    def __init__(self, task, **kwargs):
        """Create the underlying TPOT model.

        Args:
            task: ``'regression'`` selects ``TPOTRegressor``; any other value
                selects ``TPOTClassifier``.
            **kwargs: Forwarded unchanged to the TPOT constructor.
        """
        super(TpotEstimator, self).__init__(task)
        if task == 'regression':
            self.tpot = TPOTRegressor(**kwargs)
        else:
            self.tpot = TPOTClassifier(**kwargs)
        self.name = 'tpot'
        self.label_encoder = None
        self.obj_cols = None

    def _encode(self, X):
        """Return a copy of ``X`` with the fitted encoder applied to ``obj_cols``."""
        # Work on a copy: encoding the caller's frame in place would make a
        # second predict()/predict_proba() call encode already-encoded values.
        X = X.copy()
        X[self.obj_cols] = self.label_encoder.transform(X[self.obj_cols])
        return X

    def train(self, X, y, X_test):
        """Fit the encoder and the TPOT model on ``X`` / ``y``.

        ``X_test`` is accepted but not used by this implementation.
        The caller's ``X`` is left unmodified.
        """
        self.obj_cols = column_object_category_bool(X)
        self.label_encoder = SafeOrdinalEncoder()
        X = X.copy()
        X[self.obj_cols] = self.label_encoder.fit_transform(X[self.obj_cols])
        self.tpot.fit(X, y)

    def predict_proba(self, X):
        """Return TPOT's class probabilities for ``X`` (input is not modified)."""
        proba = self.tpot.predict_proba(self._encode(X))
        print(f'proba.shape:{proba.shape}')
        return proba

    def predict(self, X):
        """Return TPOT's predictions for ``X`` (input is not modified)."""
        return self.tpot.predict(self._encode(X))
Пример #3
0
class AutomlInstance:
    """Fit a TPOT classifier on an OpenML dataset and expose its evaluated pipelines."""

    def __init__(self, openML_id, scoring_function, memory_path = None, max_time=None):
        """Download the dataset, configure TPOT and run the search.

        Args:
            openML_id: OpenML data id passed to ``fetch_openml``.
            scoring_function: Forwarded to ``TPOTClassifier(scoring=...)``.
            memory_path: Optional value for TPOT's ``memory`` argument. When it
                points to an existing file, TPOT is created with
                ``warm_start=True`` and without a time limit.
            max_time: Forwarded as ``max_time_mins`` (except in the warm-start
                branch described above).
        """
        self.y_class_dict = None
        self.X_train, self.X_test, self.y_train, self.y_test = self.get_dataset(openML_id)
        if memory_path is not None:
            if Path(memory_path).is_file():
                self.tpot = TPOTClassifier(memory=memory_path,warm_start=True,scoring=scoring_function,verbosity=3)
            else:
                self.tpot = TPOTClassifier(memory=memory_path,max_time_mins=max_time, scoring=scoring_function,verbosity=3)
        else:
            self.tpot = TPOTClassifier(max_time_mins=max_time, scoring=scoring_function,verbosity=3)
        self.tpot.fit(self.X_train,self.y_train)

    def predict(self, X):
        """Return the fitted TPOT model's predictions for ``X``."""
        return self.tpot.predict(X)

    def get_segments(self) -> "List[Segment]":
        """Re-fit every evaluated pipeline and collect its test-set predictions.

        Returns:
            One ``Segment(y_ground, y_pred)`` per pipeline that could be
            compiled, fitted and used for prediction. Pipelines raising
            ``ValueError`` or ``RuntimeError`` are reported and skipped.
        """
        segments = []
        for model in self.tpot.evaluated_individuals_:
            try:
                classifier = self.tpot._toolbox.compile(creator.Individual.from_string(model, self.tpot._pset))
                classifier.fit(self.X_train,self.y_train)
                y_pred = classifier.predict(self.X_test)
                segments.append(Segment(y_ground=self.y_test,y_pred=y_pred))
            except (ValueError, RuntimeError):
                print("One classifier could not be evaluated.")
        return segments

    def get_dataset(self, openMl_id, test_size=0.2):
        """Fetch, split, impute and label-encode the OpenML dataset.

        Side effects: sets ``dataset_categories``, ``feature_names_X``,
        ``target_categories`` and (if still unset) ``y_class_dict``.

        Returns:
            ``(X_train, X_test, y_train, y_test)`` with imputed features and
            integer-encoded targets.
        """
        # Single download; the original fetched three times and read the
        # categories from the hard-coded data_id=31 instead of openMl_id.
        openml_data = openml.fetch_openml(data_id=openMl_id, return_X_y=False)
        X, y = openml_data.data, openml_data.target
        self.dataset_categories = openml_data.categories
        self.feature_names_X = openml_data.feature_names
        self.target_categories = numpy.unique(y)
        # Build the class mapping from the full target so that a class that
        # only appears in the test split cannot raise KeyError later.
        if self.y_class_dict is None:
            self._create_class_dict(y)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
        # Fit the imputer on the training split only and reuse it for the test
        # split, so test statistics never leak into preprocessing.
        imputer = Imputer().fit(X_train)
        X_train = imputer.transform(X_train)
        X_test = imputer.transform(X_test)
        y_train = self._y_string_2_int(y_train)
        y_test = self._y_string_2_int(y_test)
        return X_train, X_test, y_train, y_test

    def _y_string_2_int(self, y: numpy.ndarray):
        """Map each label in ``y`` to its integer code from ``y_class_dict``.

        The mapping is created from ``y`` on first use if it does not exist yet.
        """
        if self.y_class_dict is None:
            self._create_class_dict(y)
        label_to_code = {label: code for code, label in self.y_class_dict.items()}
        return numpy.array([label_to_code[val] for val in y])

    def _create_class_dict(self, y:numpy.ndarray):
        """Store ``{code: label}`` for the sorted unique labels of ``y``."""
        self.y_class_dict = dict(enumerate(numpy.unique(y).tolist()))
Пример #4
0
    def test_dask_matches(self):
        """Fit the same search with and without dask and check that results agree."""
        with dask.config.set(scheduler='single-threaded'):
            for n_jobs in [-1]:
                X, y = make_classification(random_state=0)
                # Both estimators share every setting except ``use_dask``.
                shared = dict(
                    generations=2,
                    population_size=5,
                    cv=3,
                    random_state=0,
                    n_jobs=n_jobs,
                )
                a = TPOTClassifier(use_dask=False, **shared)
                b = TPOTClassifier(use_dask=True, **shared)
                # The dask-backed estimator is fitted first.
                b.fit(X, y)
                a.fit(X, y)

                self.assertEqual(a.score(X, y), b.score(X, y))
                self.assertEqual(
                    a.pareto_front_fitted_pipelines_.keys(),
                    b.pareto_front_fitted_pipelines_.keys(),
                )
                self.assertEqual(a.evaluated_individuals_, b.evaluated_individuals_)
Пример #5
0
def build_classifier(data, name):
    """Fit a TPOT classifier and store it as a PMML pipeline plus a CSV of its outputs.

    Args:
        data: ``(X, y)`` pair; ``X.columns`` and ``y.name`` are read, so
            pandas objects are expected.
        name: Base name; ``name + ".pkl"`` and ``name + ".csv"`` are written
            through ``store_pkl`` / ``store_csv``.
    """
    X, y = data
    categories = pandas.unique(y)

    # KNeighborsClassifier is removed from the search space.
    config = make_tpot_pmml_config(classifier_config_dict)
    del config["sklearn.neighbors.KNeighborsClassifier"]

    tpot_settings = {
        "generations": 1,
        "population_size": 3,
        "random_state": 13,
        "config_dict": config,
        "verbosity": 2,
    }
    classifier = TPOTClassifier(**tpot_settings)
    classifier.fit(X, y)

    pipeline = make_pmml_pipeline(
        classifier.fitted_pipeline_,
        active_fields=X.columns.values,
        target_fields=[y.name],
    )
    print(repr(pipeline))
    store_pkl(pipeline, name + ".pkl")

    result = DataFrame(classifier.predict(X), columns=[y.name])
    if len(categories) > 0:
        proba_columns = ["probability(" + str(category) + ")" for category in categories]
        probabilities = DataFrame(classifier.predict_proba(X), columns=proba_columns)
        result = pandas.concat([result, probabilities], axis=1)
    store_csv(result, name + ".csv")
Пример #6
0
def run_tpot(zeros, ones):
    """Run a TPOT search on PCA-reduced data and write the pipeline and metrics.

    Args:
        zeros, ones: Passed to ``make_all_data`` to build the feature matrix
            and labels.

    Side effects: exports the best pipeline to ``tpot_ecog_pipeline.py`` and
    writes a classification report to ``tpot_metrics.txt``.
    """
    all_data, y = make_all_data(zeros, ones)
    X_train, X_test, y_train, y_test = train_test_split(all_data,
                                                        y,
                                                        test_size=.1)
    pca = PCA(n_components=15)
    X_train = pca.fit_transform(X_train)
    # Project the test set with the components learned on the training set;
    # re-fitting on the test set would put it in a different feature space.
    X_test = pca.transform(X_test)

    tpot = TPOTClassifier(
        n_jobs=-1,
        generations=50,
        verbosity=3,
        scoring='f1',
        max_eval_time_mins=30,
        memory='auto')

    tpot.fit(X_train, y_train)
    tpot.export('tpot_ecog_pipeline.py')
    results = tpot.predict(X_test)
    # Context manager guarantees the report is flushed and the handle closed.
    with open('tpot_metrics.txt', 'w') as out_file:
        out_file.write(sklearn.metrics.classification_report(y_test, results))
def main(**kwargs) -> None:
    """Featurize the labelled data and run a TPOT hyper-parameter search.

    Keyword arguments whose names match a ``TPOTClassifier`` constructor
    parameter are passed to TPOT; all remaining ones go to ``Featurizer``.

    Side effects: when ``periodic_checkpoint_folder`` is given, the best
    pipeline is exported to ``best_pipeline.py`` inside it; when
    ``verbosity > 0``, progress and train/test scores are printed.
    """
    # Divide kwargs between `Featurizer` and `TPOTClassifier` kwargs.
    # `inspect.getargspec` was removed in Python 3.11; `inspect.signature`
    # is the supported way to list constructor parameters. It is evaluated
    # once instead of once per key.
    tpot_params = inspect.signature(TPOTClassifier).parameters
    tpot_kwargs = {}
    for k in list(kwargs.keys()):
        if k in tpot_params:
            tpot_kwargs[k] = kwargs.pop(k)
    verbose = 'verbosity' in tpot_kwargs and tpot_kwargs['verbosity'] > 0

    # Load all data into memory.
    paths = [os.path.join(LABELS_PATH, fname) for fname in os.listdir(LABELS_PATH)]
    X_raw, y = load_data(paths)
    # Assign back instead of calling fillna(inplace=True) on a column
    # accessor, which may operate on a temporary copy.
    X_raw['title'] = X_raw['title'].fillna('')
    X_raw['channel_title'] = X_raw['channel_title'].fillna('')

    # Split data into train and test sets.
    X_train, X_test, y_train, y_test = train_test_split(X_raw, y,
        random_state=SEED, train_size=TRAIN_SIZE, test_size=1-TRAIN_SIZE, shuffle=True)

    # KLUDGE: preprocesses text deterministically (i.e. NOT part of the TPOT
    # hyperparameter optimization pipeline).
    featurizer = Featurizer(**kwargs)
    featurizer.fit(X_train)
    X_train = featurizer.transform(X_train)
    if verbose:
        print(f'Beginning hyper-parameter search with training data shape: {X_train.shape}.')

    tpot = TPOTClassifier(**tpot_kwargs)
    tpot.fit(X_train, y_train)
    if 'periodic_checkpoint_folder' in tpot_kwargs:
        tpot.export(os.path.join(tpot_kwargs['periodic_checkpoint_folder'], 'best_pipeline.py'))
    if verbose:
        X_test = featurizer.transform(X_test)
        print(f'Train set score: {tpot.score(X_train, y_train).round(4)}')
        print(f'Test set score: {tpot.score(X_test, y_test).round(4)}')
    return None
Пример #8
0
def main():
    """Train TPOT on the CSV features joined with precomputed t-SNE features.

    Reads ``data/train_data.csv``, ``data/valid_data.csv`` and
    ``data/tsne_2d_5p.npz``, prints the validation score and exports the best
    pipeline to ``tpot_pipeline.py``.
    """
    df_train = pd.read_csv('data/train_data.csv')
    df_valid = pd.read_csv('data/valid_data.csv')

    # The last column is the target; every other column is a feature.
    target_col = df_train.columns[-1]
    feature_cols = list(df_train.columns[:-1])

    X_train, y_train = df_train[feature_cols].values, df_train[target_col].values
    X_valid, y_valid = df_valid[feature_cols].values, df_valid[target_col].values

    tsne_data = np.load('data/tsne_2d_5p.npz')

    # Append the t-SNE columns to the raw features.
    X_train_concat = np.concatenate([X_train, tsne_data['X_train']], axis=1)
    X_valid_concat = np.concatenate([X_valid, tsne_data['X_valid']], axis=1)

    search_settings = dict(
        max_time_mins=60 * 24,
        population_size=100,
        scoring='log_loss',
        num_cv_folds=3,
        verbosity=2,
        random_state=67,
    )
    tpot = TPOTClassifier(**search_settings)
    tpot.fit(X_train_concat, y_train)
    print(tpot.score(X_valid_concat, y_valid))
    tpot.export('tpot_pipeline.py')
Пример #9
0
def test_fit():
    """Check that fit() leaves an optimized pipeline on the TPOT object."""
    settings = dict(
        random_state=42,
        population_size=1,
        offspring_size=2,
        generations=1,
        verbosity=0,
    )
    tpot_obj = TPOTClassifier(**settings)
    tpot_obj.fit(training_features, training_classes)

    # After fitting, the best pipeline and the start timestamp must be set.
    assert isinstance(tpot_obj._optimized_pipeline, creator.Individual)
    assert tpot_obj._start_datetime is not None
Пример #10
0
def classificar():
    """
    Load the dataframe from the spreadsheet, convert text variables to
    numbers, fill empty cells with -1, then run a TPOT search.

    Prints the encoded labels and the hold-out score, and exports the best
    pipeline to ``tpot_classif_pipeline.py``.
    """
    dataframe = pd.read_excel('data/data.xlsx')

    # Encode the multi-level variable (first column) as integer labels.
    classe_label = LabelEncoder().fit_transform(dataframe.iloc[:, 0])
    print(classe_label)

    # Encode the two-level variable as 0/1.
    flag_codes = {'N': 0, 'S': 1}
    dataframe['DEST AUTOPECAS'] = [
        flag_codes[flag] for flag in dataframe['DEST AUTOPECAS']
    ]

    # Fill empty cells with a default value.
    dataframe.fillna(-1, inplace=True)

    features = dataframe[LABELS].values
    targets = np.array(classe_label)
    X_train, X_test, y_train, y_test = train_test_split(features, targets, test_size=0.3)

    tpot = TPOTClassifier(generations=5, population_size=50, verbosity=3)
    tpot.fit(X_train, y_train)
    print(tpot.score(X_test, y_test))
    tpot.export('tpot_classif_pipeline.py')
Пример #11
0
def get_tpot_word_pipeline(train_sequences,
                           dev_sequences,
                           train_targets,
                           dev_targets,
                           time_constraint=1,
                           num_cpu=1,
                           max_features=1000):
    """Fit a TF-IDF feature pipeline and a TPOT classifier on train + dev data.

    Args:
        train_sequences, dev_sequences: Text collections exposing ``.tolist()``.
        train_targets, dev_targets: Label collections exposing ``.tolist()``.
        time_constraint: Accepted for interface compatibility; not used here.
        num_cpu: Accepted for interface compatibility; not used here.
        max_features: Vocabulary cap for the ``TfidfVectorizer``.

    Returns:
        ``(feature_pipeline, fitted_tpot_classifier)``.
    """
    vectorizer = TfidfVectorizer(ngram_range=(1, 3), max_features=max_features)
    features = [('word', vectorizer)]
    clf = TPOTClassifier(generations=5,
                         population_size=50,
                         verbosity=2,
                         random_state=42)

    auml_pip = pipeline.Pipeline([('union',
                                   FeatureUnion(transformer_list=features)),
                                  ('scale', Normalizer())])

    # Train and dev splits are merged into one training set.
    sequence_space = train_sequences.tolist() + dev_sequences.tolist()

    X_train = auml_pip.fit_transform(sequence_space)
    Y_train = np.array(train_targets.tolist() + dev_targets.tolist())

    # toarray() gives a plain ndarray; todense() returns np.matrix, which
    # recent scikit-learn versions refuse as estimator input.
    clf.fit(X_train.toarray(), Y_train)
    return (auml_pip, clf)
Пример #12
0
class TPot(Model):
    """``Model`` implementation that runs a TPOT search and exports the result."""

    def __init__(self):
        print("Starting t pot!")

    def fit(self, X, y, title=None):
        """Run the TPOT search and export the best pipeline.

        Args:
            X, y: For this case, the complete datasets.
            title: Inserted (via ``str``) into the exported file name
                ``./automl/tpot_exported_pipeline_<title>_.py``.
        """
        self.pipeline_optimizer = TPOTClassifier(
            generations=5,
            cv=5,
            random_state=42,
            verbosity=3,
            n_jobs=8,
            max_eval_time_mins=1,
            scoring='f1',
            subsample=0.5
        )
        self.pipeline_optimizer.fit(X, y)

        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs("./automl", exist_ok=True)

        self.pipeline_optimizer.export('./automl/tpot_exported_pipeline_' + str(title) + '_.py')

    def predict(self, X):
        """Not implemented: returns ``None`` for any input."""
        # NOTE(review): probably meant to delegate to
        # self.pipeline_optimizer.predict(X) - confirm against Model's contract.
        pass
Пример #13
0
    def run_AutoTpot(self):
        """Fit TPOT on ``self.train``, persist the result and time test predictions.

        Exports the best pipeline to ``<save_dir>/tpot-sportswear.py``, dumps
        the fitted TPOT object to ``<checkpoint_dir>/auto-tpot.pickle``,
        prints the average time per prediction and calls ``self.visualize``.
        """
        automl = TPOTClassifier(generations=1, verbosity=2, config_dict='TPOT sparse')
        automl.fit(self.train, self.y_train)

        # TPOT writes standalone Python code for the best pipeline it found.
        export_path = os.path.join(self.args.save_dir, 'tpot-sportswear.py')
        automl.export(export_path)

        print('The best pipeline discovered through auto-tpot is {}'.format(automl.fitted_pipeline_))

        print('Saving the best model discovered through TPOT.')
        checkpoint_path = os.path.join(self.args.checkpoint_dir, 'auto-tpot.pickle')
        joblib.dump(automl, checkpoint_path)

        # Timed section: label and probability prediction plus rounding.
        started = timeit.default_timer()

        predictions = automl.predict(self.test)
        predictions_prob = automl.predict_proba(self.test)

        # Binary class values: round them to 0 or 1.
        predictions = list(map(round, predictions))

        finished = timeit.default_timer()
        elapsed = finished - started
        print('Time per prediction : {}'.format(elapsed / self.test.shape[0]))

        self.visualize(predictions, automl)
Пример #14
0
def build_classifier(data, feature_pipeline, generations, population_size,
                     name):
    """Fit TPOT on transformed features and store the combined PMML pipeline.

    Args:
        data: ``(X, y)`` pair; ``X.columns`` and ``y.name`` are read.
        feature_pipeline: Pipeline fitted on ``X``; its steps are prepended to
            the steps of TPOT's fitted pipeline.
        generations, population_size: Forwarded to ``TPOTClassifier``.
        name: Passed to ``store_pkl`` and ``store_csv``.
    """
    X, y = data
    Xt = feature_pipeline.fit_transform(X).astype(float)
    categories = pandas.unique(y)

    config = filter_config(make_tpot_pmml_config(classifier_config_dict))
    excluded_estimators = (
        # Does not support nesting - see http://mantis.dmg.org/view.php?id=208
        "sklearn.naive_bayes.GaussianNB",
        "sklearn.neighbors.KNeighborsClassifier",
        # Does not support classifier.predict_proba(Xt)
        "sklearn.svm.LinearSVC",
        "sklearn.tree.DecisionTreeClassifier",
    )
    for estimator_key in excluded_estimators:
        del config[estimator_key]

    classifier = TPOTClassifier(
        generations=generations,
        population_size=population_size,
        random_state=13,
        config_dict=config,
        verbosity=2,
    )
    classifier.fit(Xt, y)

    combined_steps = feature_pipeline.steps + classifier.fitted_pipeline_.steps
    pipeline = make_pmml_pipeline(
        Pipeline(steps=combined_steps),
        active_fields=X.columns.values,
        target_fields=[y.name],
    )
    print(repr(pipeline))
    store_pkl(pipeline, name)

    result = DataFrame(classifier.predict(Xt), columns=[y.name])
    if len(categories) > 0:
        proba_columns = ["probability(" + str(category) + ")" for category in categories]
        probabilities = DataFrame(classifier.predict_proba(Xt), columns=proba_columns)
        result = pandas.concat([result, probabilities], axis=1)
    store_csv(result, name)
Пример #15
0
def cli(erv_data):
    """Train TPOT on the ERV expression data and write a confusion-matrix report.

    Args:
        erv_data: Path (or buffer) readable by ``pd.read_csv``. Column index 1
            is used as the target and columns from index 2 on as features.

    Side effects: exports ``tpot_exported_pipeline.py``, prints the validation
    accuracy and saves the confusion matrix via ``cm.save_html("report")``.
    """
    # Import the ERV expression data as a pandas dataframe.
    df = pd.read_csv(erv_data)
    # Convert to categorical once; keep code -> label for the report.
    class_categorical = df['class'].astype("category")
    class_codes = dict(enumerate(class_categorical.cat.categories))
    df["class"] = class_categorical.cat.codes

    # Create the test and training data.
    X_train, X_test, y_train, y_test = train_test_split(df.values[:, 2:],
                                                        df.values[:, 1],
                                                        train_size=0.75,
                                                        test_size=0.25)

    # Convert them all to floats.
    X_train, X_test = X_train.astype(float), X_test.astype(float)
    y_train, y_test = y_train.astype(float), y_test.astype(float)

    # Create and fit the pipeline optimizer.
    pipeline_optimizer = TPOTClassifier(cv=2, verbosity=2, n_jobs=-1)
    pipeline_optimizer.fit(X_train, y_train)
    pipeline_optimizer.export('tpot_exported_pipeline.py')

    print(f"Validation Accuracy: {pipeline_optimizer.score(X_test, y_test)}")
    # One batch prediction instead of one predict() call per test row.
    predicted = pipeline_optimizer.predict(X_test)
    cm = ConfusionMatrix([class_codes[y] for y in y_test],
                         [class_codes[y] for y in predicted])
    cm.save_html("report")
Пример #16
0
def runTPOT(X, y, metric, algo):
    """Run a short TPOT search and score it on a 25% hold-out split.

    Args:
        X, y: Features and targets.
        metric: Accepted for interface compatibility; not used here.
        algo: ``"Classifier"`` or ``"Regressor"``.

    Returns:
        ``(fitted_optimizer, holdout_score, number_of_evaluated_individuals)``.

    Raises:
        ValueError: If ``algo`` is neither ``"Classifier"`` nor ``"Regressor"``.
            (``ValueError`` is an ``Exception`` subclass, so callers catching
            ``Exception`` keep working.)
    """
    # Fail fast: reject an unknown problem type before doing any work.
    if algo not in ("Classifier", "Regressor"):
        raise ValueError('Incorrect Problem Type')

    # NOTE(review): the result was never used; the call is kept in case
    # aml_config() has side effects - confirm and remove if not.
    aml_config_dict = aml_config()

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        train_size=0.75,
                                                        test_size=0.25)

    shared = dict(generations=1, population_size=5, verbosity=2, warm_start=True)
    if algo == "Classifier":
        pipeline_optimizer = TPOTClassifier(**shared)
    else:

        def aml_reg_scorer(y_true, y_pred):
            # make_scorer passes (y_true, y_pred) in that order; RMSE itself
            # is symmetric, so only the names were misleading before.
            return sqrt(mean_squared_error(y_true, y_pred))

        aml_custom_scorer = make_scorer(aml_reg_scorer,
                                        greater_is_better=False)
        pipeline_optimizer = TPOTRegressor(scoring=aml_custom_scorer, **shared)

    pipeline_optimizer.fit(X_train, y_train)
    # Score once and reuse the value for both the printout and the return.
    holdout_score = pipeline_optimizer.score(X_test, y_test)
    print(holdout_score)
    return pipeline_optimizer, holdout_score, len(
        pipeline_optimizer.evaluated_individuals_)
Пример #17
0
def run_tpot(X,y, target_ft,time_budget=30, include_preprocessors=None, n_jobs=1 ):
    """Run a time-boxed TPOT search and report accuracy and weighted F1.

    Args:
        X, y: Features and targets.
        target_ft: Accepted for interface compatibility; not used here.
        time_budget: Search budget, divided by 60 to obtain TPOT's
            ``max_time_mins`` (presumably seconds - confirm with callers).
        include_preprocessors: When falsy, the search is restricted with
            ``template='Classifier'``.
        n_jobs: Forwarded to ``TPOTClassifier``.

    Returns:
        ``(str(metric_descriptions), result_row)`` where ``result_row`` holds
        the F1 score, the accuracy and the value returned by ``export()``.
    """
    print(n_jobs)
    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, random_state=1)

    # True division: `time_budget // 60` truncated any budget below 60
    # (including the default 30) to 0 minutes.
    tpot_kwargs = dict(max_time_mins=time_budget / 60,
                       generations=None,
                       use_dask=False,
                       n_jobs=n_jobs)
    if not include_preprocessors:
        tpot_kwargs['template'] = 'Classifier'
    pipeline_optimizer = TPOTClassifier(**tpot_kwargs)

    pipeline_optimizer.fit(X_train, y_train)
    y_hat = pipeline_optimizer.predict(X_test)
    acc = sklearn.metrics.accuracy_score(y_test, y_hat)
    f1_s = sklearn.metrics.f1_score(y_test, y_hat, average='weighted')
    metrs = [
        "Accuracy score - " + str(acc),
        "F1 score - " + str(f1_s),
    ]
    res = ["","","","",f1_s,acc,"",pipeline_optimizer.export()]

    return str(metrs),res
Пример #18
0
def tpotClassifier(train_data, target_value):
    """Fit a default TPOT classifier to predict one column of ``train_data``.

    Args:
        train_data: Table indexable by column name that supports
            ``drop(..., axis=1)`` (presumably a pandas DataFrame).
        target_value: Name of the target column.

    Returns:
        ``(fitted_classifier, holdout_score)``. The best pipeline is also
        exported to ``my_pipeline.py``.
    """
    classifier = TPOTClassifier()
    # Remove the target from the features; leaving it in lets the model read
    # the label directly and makes the hold-out score meaningless.
    features = train_data.drop(target_value, axis=1)
    X_train, X_test, y_train, y_test = train_test_split(
        features, train_data[target_value], train_size=0.75, test_size=0.25)
    classifier.fit(X_train, y_train)
    score: float = classifier.score(X_test, y_test)
    classifier.export('my_pipeline.py')
    return classifier, score
Пример #19
0
def main(X_train, y_train):
    """Run a TPOT search on the training data and return the best fitted pipeline."""
    search_settings = {
        'generations': 5,
        'population_size': 50,
        'verbosity': 2,
        'random_state': 42,
    }
    optimizer = TPOTClassifier(**search_settings)
    optimizer.fit(X_train, y_train)
    return optimizer.fitted_pipeline_
Пример #20
0
def test_fit():
    """Check that fit() leaves an optimized pipeline on the TPOT object."""
    settings = dict(random_state=42, population_size=1, generations=1, verbosity=0)
    tpot_obj = TPOTClassifier(**settings)
    tpot_obj.fit(training_features, training_classes)

    # Best pipeline, generation counter and start timestamp after fitting.
    assert isinstance(tpot_obj._optimized_pipeline, creator.Individual)
    assert tpot_obj._gp_generation == 0
    assert tpot_obj._start_datetime is not None
Пример #21
0
def tpot_evaluation(total_runtime, train_features, train_labels):
    """Fit a TPOT classifier with a custom config and return it.

    ``total_runtime`` is divided by 60 to obtain TPOT's ``max_time_mins``.
    """
    runtime_minutes = total_runtime / 60
    search_settings = {
        'max_time_mins': runtime_minutes,
        'scoring': 'balanced_accuracy',
        'config_dict': classifier_config_dict_custom,
    }
    clf = TPOTClassifier(**search_settings)
    clf.fit(train_features, train_labels)
    return clf
Пример #22
0
def tpot_generation(X_train, y_train, X_test, y_test):
    """Run a TPOT search, print the test score and export the best pipeline.

    The pipeline is written to ``tpot_quiniela_pipeline.py``.
    """
    search_settings = dict(generations=10, population_size=20, verbosity=2, n_jobs=4)
    optimizer = TPOTClassifier(**search_settings)
    optimizer.fit(X_train, y_train)
    test_score = optimizer.score(X_test, y_test)
    print(test_score)
    optimizer.export('tpot_quiniela_pipeline.py')
Пример #23
0
def clfWithTpot(X, y):
    """Fit TPOT on a 90/10 split, print score and confusion matrix, export the pipeline.

    The best pipeline is written to ``exported_pipeline.py``.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.1)

    # Convert each split to an array once and reuse it.
    train_features, train_targets = np.array(X_train), np.array(y_train)
    test_features, test_targets = np.array(X_test), np.array(y_test)

    my_tpot = TPOTClassifier(generations=10, verbosity=2)
    my_tpot.fit(train_features, train_targets)
    print(my_tpot.score(test_features, test_targets))
    my_tpot.export('exported_pipeline.py')

    predictions = my_tpot.predict(test_features)
    print(confusion_matrix(y_test, predictions))
Пример #24
0
def geneticModel(a,b):
  """Fit TPOT on a 75/25 split of ``a`` (features) and ``b`` (targets).

  Prints the hold-out score and returns the fitted ``TPOTClassifier``.
  """
  X_train, X_test, y_train, y_test = train_test_split(a, b,
  train_size=0.75, test_size=0.25)

  tpot = TPOTClassifier(generations=12, population_size=100, verbosity=2)
  tpot.fit(X_train,y_train)
  # Score the fitted model; the original referenced an undefined
  # `classifier` name here and raised NameError after the whole search.
  print(tpot.score(X_test, y_test))
  return tpot
Пример #25
0
def main():
    """
    Uses tpot (Tree-based Pipeline Optimization Tool) an Automated Machine Learning tool
    to find and output the best machine learning model for the given dataset.

    See https://github.com/EpistasisLab/tpot

    Prints the hold-out score and outputs the results to automodel.py
    """
    titanic = pd.read_csv('../data/titanic.csv')
    titanic.rename(columns={'Survived': 'class'}, inplace=True)

    for category in ['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']:
        print("Number of levels in category '{0}': \b {1:2.2f} ".format(
            category, titanic[category].unique().size))

    # Encode values
    titanic['Sex'] = titanic['Sex'].map({'male': 0, 'female': 1})
    titanic['Embarked'] = titanic['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})

    # Fill na
    titanic = titanic.fillna(-999)

    # Binarize the Cabin column: one indicator column per distinct value.
    from sklearn.preprocessing import MultiLabelBinarizer
    mlb = MultiLabelBinarizer()
    CabinTrans = mlb.fit_transform([{str(val)}
                                    for val in titanic['Cabin'].values])

    # Drop unused columns
    titanic_new = titanic.drop(
        ['PassengerId', 'Name', 'Ticket', 'Cabin', 'class'], axis=1)

    # Create numpy arrays
    titanic_new = np.hstack((titanic_new.values, CabinTrans))
    titanic_class = titanic['class'].values

    # Train test split (on row indices, stratified by class)
    # https://www.kdnuggets.com/2020/07/easy-guide-data-preprocessing-python.html
    # https://stackoverflow.com/questions/55525195/do-i-have-to-do-one-hot-encoding-separately-for-train-and-test-dataset
    training_indices, validation_indices = train_test_split(
        titanic.index, stratify=titanic_class, train_size=0.75, test_size=0.25)

    # Train model
    tpot = TPOTClassifier(verbosity=2,
                          max_time_mins=2,
                          max_eval_time_mins=0.04,
                          population_size=40)
    tpot.fit(titanic_new[training_indices], titanic_class[training_indices])

    # Score: print the result, which was previously computed and discarded.
    holdout_score = tpot.score(titanic_new[validation_indices],
                               titanic.loc[validation_indices, 'class'].values)
    print(holdout_score)

    # Export
    tpot.export('automodel.py')
Пример #26
0
def test_invaild_dataset_warning():
    """Assert that the TPOT fit function raises a ValueError when dataset is not in right format"""
    tpot_obj = TPOTClassifier(random_state=42, population_size=1, offspring_size=2, generations=1, verbosity=0)
    # Common mistake: classes passed as a (1, n) row instead of a 1-D vector.
    bad_training_classes = training_classes.reshape((1, len(training_classes)))
    try:
        tpot_obj.fit(training_features, bad_training_classes)
    except ValueError:
        pass
    else:
        # Explicit raise rather than `assert False`, which is stripped under -O.
        raise AssertionError("fit() accepted a mis-shaped class array")
Пример #27
0
def test_invaild_dataset_warning():
    """Assert that the TPOT fit function raises a ValueError when dataset is not in right format"""
    tpot_obj = TPOTClassifier(random_state=42, population_size=1, offspring_size=2, generations=1, verbosity=0)
    # Common mistake: classes passed as a (1, n) row instead of a 1-D vector.
    bad_training_classes = training_classes.reshape((1, len(training_classes)))
    try:
        tpot_obj.fit(training_features, bad_training_classes)
    except ValueError:
        pass
    else:
        # Explicit raise rather than `assert False`, which is stripped under -O.
        raise AssertionError("fit() accepted a mis-shaped class array")
Пример #28
0
 def _optimizeModel(self, X, y, model_path, config):
     """Return the pipeline stored at ``model_path``, running TPOT first if needed.

     If ``model_path`` does not exist, a ``TPOTClassifier`` using
     ``config`` as its ``config_dict`` is fitted on ``X`` / ``y`` and its
     fitted pipeline is pickled to ``model_path``. Otherwise the pickled
     pipeline is loaded and returned without training.
     """
     if not os.path.exists(model_path):
         optimizer = TPOTClassifier(verbosity=2, config_dict=config)
         optimizer.fit(X, y)
         pipeline = optimizer.fitted_pipeline_
         # Context manager closes (and flushes) the file deterministically.
         with open(model_path, 'wb') as model_file:
             pickle.dump(pipeline, model_file)
     else:
         # NOTE: pickle.load executes arbitrary code from the file; only
         # load model files that come from a trusted location.
         with open(model_path, 'rb') as model_file:
             pipeline = pickle.load(model_file)
     return pipeline
Пример #29
0
def generate_tpot_model(self, train_df, test_df,  generations, population, use_dask=True):
    """Fit a TPOT classifier on a 75/25 split and return the fitted model.

    Args:
        train_df: Passed as the first array to ``train_test_split`` and
            therefore used as the feature table.
        test_df: Passed as the second array to ``train_test_split`` and
            therefore used as the targets.
        generations, population: Forwarded as ``generations`` and
            ``population_size``.
        use_dask: Forwarded to ``TPOTClassifier``.

    Returns:
        The fitted ``TPOTClassifier`` (previously nothing was returned, so
        the result of the search was lost).
    """
    # NOTE(review): confirm these keyword names against dask.distributed's
    # Client / LocalCluster signature.
    client = Client(workers=6, threads=2)

    # train_test_split returns (X_train, X_test, y_train, y_test); the
    # original unpacked it as (X_train, y_train, X_test, y_test) and so
    # fitted the model on (X_train, X_test).
    X_train, X_test, y_train, y_test = train_test_split(train_df, test_df, random_state=42, test_size=0.25)

    tp = TPOTClassifier(generations=generations, population_size=population, use_dask=use_dask, verbosity=2, n_jobs=-1)

    tp.fit(X_train, y_train)
    return tp
Пример #30
0
def tpot_train(project,
               X,
               y,
               export_file,
               prediction_type,
               train_size=0.75,
               max_time_mins=1,
               max_eval_time_mins=0.04,
               population_size=40,
               scoring_func=None,
               n_jobs=1):
    """Train a TPOT classifier or regressor, report its hold-out score and export it.

    Args:
        project: Accepted for interface compatibility; not used here.
        X, y: Features and targets.
        export_file: Path handed to ``tpot.export``.
        prediction_type: ``"classification"`` selects ``TPOTClassifier``;
            any other value selects ``TPOTRegressor`` (with ``warm_start=True``).
        train_size: Fraction of the data used for training.
        max_time_mins, max_eval_time_mins, population_size, n_jobs:
            Forwarded to the TPOT constructor.
        scoring_func: Forwarded as ``scoring``.

    Returns:
        The fitted TPOT object.
    """
    print(
        "==========train / test split for training size {}".format(train_size))
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        train_size=train_size)
    print(X_train.shape, y_train.shape)

    print("==========Start training the model...")
    print("==========max_time_mins: {}".format(max_time_mins))
    print("==========max_eval_time_mins: {}".format(max_eval_time_mins))
    print("==========population_size: {}".format(population_size))
    print("==========n_jobs: {}".format(n_jobs))

    # prediction type:
    # - regression
    # - classification
    if (prediction_type == "classification"):
        tpot = TPOTClassifier(verbosity=2,
                              max_time_mins=max_time_mins,
                              max_eval_time_mins=max_eval_time_mins,
                              population_size=population_size,
                              scoring=scoring_func,
                              n_jobs=n_jobs)
    else:
        tpot = TPOTRegressor(verbosity=2,
                             max_time_mins=max_time_mins,
                             max_eval_time_mins=max_eval_time_mins,
                             population_size=population_size,
                             scoring=scoring_func,
                             n_jobs=n_jobs,
                             warm_start=True)

    tpot.fit(X_train, y_train)

    # Scoring stays best-effort, but only ordinary errors are swallowed
    # (a bare `except:` also traps KeyboardInterrupt/SystemExit) and the
    # cause is reported instead of being hidden.
    try:
        holdout_score = tpot.score(X_test, y_test)
        print("==========holdout set score is {}".format(holdout_score))
    except Exception as exc:
        print("==========Unexpected error when score holdout set")
        print("==========error: {!r}".format(exc))

    print("==========export tpot to {}".format(export_file))
    tpot.export(export_file)

    return tpot
Пример #31
0
def run_automl_tpot(x_train, y_train, gener=3, verb=2, popul=8, cv=None):
    """Fit a ROC-AUC-scored TPOT classifier on ``x_train.values`` and return it.

    Args:
        x_train: Object exposing ``.values`` (e.g. a DataFrame).
        y_train: Training targets.
        gener, verb, popul, cv: Forwarded as ``generations``, ``verbosity``,
            ``population_size`` and ``cv``.
    """
    search_settings = {
        'generations': gener,
        'verbosity': verb,
        'population_size': popul,
        'scoring': 'roc_auc',
        'cv': cv,
        'random_state': 23,
    }
    clf = TPOTClassifier(**search_settings)
    feature_matrix = x_train.values
    clf.fit(feature_matrix, y_train)
    return clf
Пример #32
0
def do_tpot(generations=5, population_size=10,X='',y=''):
    """Fit TPOT on an 80/20 split, print the test score, export and return the model.

    The best pipeline is written to ``tpot_pipeline.py``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.80, test_size=0.20)

    search_settings = dict(
        generations=generations,
        population_size=population_size,
        verbosity=2,
        cv=3,
    )
    tpot = TPOTClassifier(**search_settings)
    tpot.fit(X_train, y_train)

    test_score = tpot.score(X_test, y_test)
    print(test_score)
    tpot.export('tpot_pipeline.py')
    return tpot
Пример #33
0
def test_log_file_verbose_3():
    """ Set verbosity as 3. Assert log_file parameter to generate log file. """
    file_name = "progress_verbosity_3.log"
    # The context manager closes the handle, so buffered log output is
    # flushed to disk before the file size is inspected.
    with open(file_name, "w") as tracking_progress_file:
        tpot_obj = TPOTClassifier(population_size=10,
                                  generations=10,
                                  verbosity=3,
                                  log_file=tracking_progress_file)
        tpot_obj.fit(X, y)
    assert_equal(os.path.getsize(file_name) > 0, True)
Пример #34
0
def test_imputer_in_export():
    """Check that the exported code contains an imputation step when fit() saw NaNs."""
    settings = dict(
        random_state=42,
        population_size=1,
        offspring_size=2,
        generations=1,
        verbosity=0,
        config_dict='TPOT light',
    )
    tpot_obj = TPOTClassifier(**settings)

    # Inject a single missing value into a copy of the training features.
    features_with_nan = np.copy(training_features)
    features_with_nan[0, 0] = float('nan')

    tpot_obj.fit(features_with_nan, training_target)

    # use fixed pipeline since the random.seed() performs differently in python 2.* and 3.*
    pipeline_args = ', '.join([
        'input_matrix',
        'KNeighborsClassifier__n_neighbors=10',
        'KNeighborsClassifier__p=1',
        'KNeighborsClassifier__weights=uniform',
    ])
    pipeline_string = 'KNeighborsClassifier(' + pipeline_args + ')'
    tpot_obj._optimized_pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset)

    export_code = export_pipeline(
        tpot_obj._optimized_pipeline,
        tpot_obj.operators,
        tpot_obj._pset,
        tpot_obj._imputed,
    )

    expected_code = """import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import Imputer

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \\
            train_test_split(features, tpot_data['target'].values, random_state=None)

imputer = Imputer(strategy="median")
imputer.fit(training_features)
training_features = imputer.transform(training_features)
testing_features = imputer.transform(testing_features)

exported_pipeline = KNeighborsClassifier(n_neighbors=10, p=1, weights="uniform")

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
"""

    assert_equal(export_code, expected_code)
Пример #35
0
def test_warm_start():
    """Assert that the TPOT warm_start flag stores the pop and pareto_front from the first run."""
    tpot_obj = TPOTClassifier(random_state=42, population_size=1, offspring_size=2, generations=1, verbosity=0, warm_start=True)
    tpot_obj.fit(training_features, training_classes)

    # Identity comparison: `!= None` would dispatch to the objects' own
    # comparison operators instead of simply checking for the None singleton.
    assert tpot_obj._pop is not None
    assert tpot_obj._pareto_front is not None

    first_pop = tpot_obj._pop

    # Refit with a different seed; with warm_start the population from the
    # first run is expected to be carried over.
    tpot_obj.random_state = 21
    tpot_obj.fit(training_features, training_classes)

    assert tpot_obj._pop == first_pop
Пример #36
0
def generate_model(generations, train_X, train_y):
    """Fit a TPOTClassifier and export the resulting pipeline to a Python file.

    Args:
        generations: Number of generations handed to ``TPOTClassifier``; it is
            also embedded in the export file name.
        train_X: Training features passed to ``fit``.
        train_y: Training targets passed to ``fit``.

    The pipeline is written to ``tpot_model<generations>.py`` in the current
    working directory.
    """
    tpot_generator = TPOTClassifier(generations=generations, verbosity=2)
    tpot_generator.fit(train_X, train_y)
    # str() is required: concatenating an int to a str raises TypeError.
    tpot_generator.export('tpot_model' + str(generations) + '.py')
Пример #37
0
def main():
    """Benchmark TPOTClassifier on a fixed list of CSV data sets.

    For each of ``n_splitz`` rounds and each data set, the CSV file is read
    from ``base_path``, the last column is used as the label, the data is split
    75/25 into train/test, and a ``TPOTClassifier`` is fitted. Fit time, test
    error (``1 - score``) and macro-averaged F1 are accumulated per data set,
    averaged over the rounds, and written to ``tpot-results2.csv``.

    Returns:
        None
    """
    # set up the path to the data sets and the data we are going to experiment
    # with
    base_path = '/scratch/ditzler/Git/ClassificationDatasets/csv/'
    data_setz = [#'bank',
        'blood',
        'breast-cancer-wisc-diag',
        'breast-cancer-wisc-prog',
        'breast-cancer-wisc',
        'breast-cancer',
        'congressional-voting',
        'conn-bench-sonar-mines-rocks',
        'credit-approval',
        'cylinder-bands',
        'echocardiogram',
        #'fertility',
        'haberman-survival',
        'heart-hungarian',
        'hepatitis',
        'ionosphere',
        'mammographic',
        'molec-biol-promoter',
        'musk-1',
        'oocytes_merluccius_nucleus_4d',
        'oocytes_trisopterus_nucleus_2f',
        'ozone',
        'parkinsons',
        'pima',
        #'pittsburg-bridges-T-OR-D';
        'planning',
        'ringnorm',
        #'spambase',
        'spectf_train',
        'statlog-australian-credit',
        'statlog-german-credit',
        'statlog-heart',
        'titanic',
        #'twonorm',
        'vertebral-column-2clases']

    # n_splitz is the number of repeated random train/test splits; the arrays
    # below accumulate per-data-set results across those splits.
    n_splitz = 10
    errors = np.zeros((len(data_setz),))
    fms = np.zeros((len(data_setz),))
    times = np.zeros((len(data_setz),))
    # m seeds train_test_split and is bumped on every (split, data set) pair so
    # that each split is different but reproducible.
    m = 0

    for n in range(n_splitz):
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print('Spilt ' + str(n) + ' of ' + str(n_splitz))
        for i, name in enumerate(data_setz):
            print('    ' + name)
            df = pd.read_csv(base_path + name + '.csv', sep=',')
            # DataFrame.as_matrix() was removed in pandas 1.0; .values works
            # on both old and new pandas.
            data = df.values
            X = data[:, :-1]
            y = data[:, -1]
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, train_size=0.75, test_size=0.25, random_state=m)
            m += 1

            # Only the TPOT construction and fit are timed.
            ts = time.time()
            tpot = TPOTClassifier(generations=10, population_size=25, verbosity=1)
            tpot.fit(X_train, y_train)
            times[i] += (time.time() - ts)

            errors[i] += (1 - tpot.score(X_test, y_test))
            yhat = tpot.predict(X_test)
            fms[i] += f1_score(y_test, yhat, average='macro')

    # Turn the accumulated sums into per-data-set averages.
    errors /= n_splitz
    fms /= n_splitz
    times /= n_splitz

    df = pd.DataFrame({'errors': errors, 'fms': fms, 'times': times})
    df.to_csv(path_or_buf='tpot-results2.csv', sep=',')

    return None
Пример #38
0
from tpot import TPOTClassifier
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split

# Load scikit-learn's bundled digits data set.
digits = load_digits()

# Keep 75% of the samples for the search and hold out 25% for scoring.
X_train, X_test, y_train, y_test = train_test_split(
    digits.data,
    digits.target,
    train_size=0.75,
    test_size=0.25,
)

# Configure and fit the TPOT classifier on the training split.
tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2)
tpot.fit(X_train, y_train)

# Print the score on the held-out split, then export the pipeline as Python code.
print(tpot.score(X_test, y_test))
tpot.export('tpot_mnist_pipeline.py')