예제 #1
0
    def run_example(self):
        """Fit a TPOT regressor on the churn data and score it on the test file.

        Reads ``./data/churn-train.csv`` and ``./data/churn-test.csv``,
        integer-encodes every object-dtype column, runs the TPOT search,
        exports the best pipeline to ``tpot_iris_pipeline.py`` and prints the
        test score.

        Returns:
            The value of ``tpot.score`` on the test features/target; the
            search is configured with ``neg_mean_absolute_error`` scoring.
        """
        target_col = 'churn_probability'
        train = pd.read_csv("./data/churn-train.csv")
        test = pd.read_csv("./data/churn-test.csv")

        # Columns are classified as categorical from the training frame only.
        categorical_cols = train.columns[train.dtypes == object].tolist()
        for col in categorical_cols:
            # One encoder per column, fitted on the values of BOTH files, so
            # a given category maps to the same integer in train and test.
            # Re-fitting on the test file alone (as before) produced codes
            # that did not line up with the ones the model was trained on.
            encoder = LabelEncoder().fit(pd.concat([train[col], test[col]]))
            train[col] = encoder.transform(train[col])
            test[col] = encoder.transform(test[col])

        X_train = train.drop(columns=[target_col]).to_numpy()
        y_train = train[target_col].to_numpy()
        X_test = test.drop(columns=[target_col]).to_numpy()
        y_test = test[target_col].to_numpy()

        tpot = TPOTRegressor(generations=5,
                             population_size=50,
                             verbosity=2,
                             random_state=42,
                             scoring='neg_mean_absolute_error',
                             cv=5)
        tpot.fit(X_train, y_train)

        # Score once and reuse the value for both the print and the return.
        test_score = tpot.score(X_test, y_test)
        print(test_score)
        tpot.export('tpot_iris_pipeline.py')

        return test_score
예제 #2
0
def test_score_3():
    """Check that TPOTRegressor.score reproduces a known value for a fixed pipeline."""

    def isclose(a, b, rel_tol=1e-09, abs_tol=0.0):
        # Tolerance-based float comparison,
        # see http://stackoverflow.com/questions/5595425/
        return abs(a - b) <= max(rel_tol * max(abs(a), abs(b)), abs_tol)

    expected_score = 12.3727966005  # Assumes use of mse
    regressor = TPOTRegressor(scoring='neg_mean_squared_error')

    # The pipeline is an ExtraTreesRegressor stacked on a
    # GradientBoostingRegressor; the expression is assembled from its inner
    # and outer halves.
    inner_expr = (
        "GradientBoostingRegressor(input_matrix, GradientBoostingRegressor__alpha=0.8,"
        "GradientBoostingRegressor__learning_rate=0.1,GradientBoostingRegressor__loss=huber,"
        "GradientBoostingRegressor__max_depth=5, GradientBoostingRegressor__max_features=0.5,"
        "GradientBoostingRegressor__min_samples_leaf=5, GradientBoostingRegressor__min_samples_split=5,"
        "GradientBoostingRegressor__n_estimators=100, GradientBoostingRegressor__subsample=0.25)"
    )
    outer_params = (
        "ExtraTreesRegressor__bootstrap=True, ExtraTreesRegressor__max_features=0.5,"
        "ExtraTreesRegressor__min_samples_leaf=5, ExtraTreesRegressor__min_samples_split=5, "
        "ExtraTreesRegressor__n_estimators=100"
    )
    pipeline_string = "ExtraTreesRegressor(" + inner_expr + "," + outer_params + ")"

    # Install the hand-built individual as the optimized pipeline and fit it.
    individual = creator.Individual.from_string(pipeline_string, regressor._pset)
    regressor._optimized_pipeline = individual
    regressor._fitted_pipeline = regressor._toolbox.compile(
        expr=regressor._optimized_pipeline)
    regressor._fitted_pipeline.fit(training_features_r, training_classes_r)

    # Score through the public TPOT API and compare with the reference value.
    observed_score = regressor.score(testing_features_r, testing_classes_r)
    assert isclose(expected_score, observed_score)
예제 #3
0
파일: tests.py 프로젝트: val922/tpot
def test_score_3():
    """Verify that TPOTRegressor.score yields the expected value for a fixed pipeline."""
    reference = 12.3727966005  # Assumes use of mse

    # Fragments of the pipeline expression, joined without separators.
    fragments = [
        "ExtraTreesRegressor(",
        "GradientBoostingRegressor(input_matrix, GradientBoostingRegressor__alpha=0.8,",
        "GradientBoostingRegressor__learning_rate=0.1,GradientBoostingRegressor__loss=huber,",
        "GradientBoostingRegressor__max_depth=5, GradientBoostingRegressor__max_features=0.5,",
        "GradientBoostingRegressor__min_samples_leaf=5, GradientBoostingRegressor__min_samples_split=5,",
        "GradientBoostingRegressor__n_estimators=100, GradientBoostingRegressor__subsample=0.25),",
        "ExtraTreesRegressor__bootstrap=True, ExtraTreesRegressor__max_features=0.5,",
        "ExtraTreesRegressor__min_samples_leaf=5, ExtraTreesRegressor__min_samples_split=5, ",
        "ExtraTreesRegressor__n_estimators=100)",
    ]
    pipeline_string = "".join(fragments)

    model = TPOTRegressor(scoring='neg_mean_squared_error')
    model._optimized_pipeline = creator.Individual.from_string(
        pipeline_string, model._pset)
    compiled = model._toolbox.compile(expr=model._optimized_pipeline)
    model._fitted_pipeline = compiled
    model._fitted_pipeline.fit(training_features_r, training_classes_r)

    # Ask TPOT for the score on the held-out regression data.
    measured = model.score(testing_features_r, testing_classes_r)

    # Relative/absolute tolerance check,
    # see http://stackoverflow.com/questions/5595425/
    def isclose(a, b, rel_tol=1e-09, abs_tol=0.0):
        tolerance = max(rel_tol * max(abs(a), abs(b)), abs_tol)
        return abs(a - b) <= tolerance

    assert isclose(reference, measured)
예제 #4
0
def tpot_test(conf):
    """Run a time-boxed TPOT regression search that predicts the next row's 'DR'.

    Loads the configuration ``conf``, loads price data, adds features, fits a
    ``TPOTRegressor`` on the first 80% of the rows, prints its score on the
    remaining 20% and exports the best pipeline to ``./tpot_out.py``.

    Args:
        conf: Configuration identifier handed to ``p.load_config``.
    """
    from tpot import TPOTRegressor
    from sklearn.model_selection import train_test_split
    from sklearn.model_selection import TimeSeriesSplit

    p.load_config(conf)
    ds = dl.load_price_data()
    ds = add_features(ds)

    # Features of row t are paired with 'DR' of row t+1; the last row has no
    # following value, so it is dropped from both X and y.
    X = ds[p.feature_list][:-1]
    y = ds['DR'].shift(-1)[:-1]

    # Split Train and Test.
    # shuffle=False keeps the row order, so the test set is the trailing 20%
    # and the TimeSeriesSplit folds below operate on ordered data. The default
    # shuffled split mixed later rows into the training set.
    # NOTE(review): assumes ds rows are in chronological order — confirm.
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        train_size=0.8,
                                                        test_size=0.2,
                                                        shuffle=False)

    tpot = TPOTRegressor(n_jobs=-1,
                         verbosity=2,
                         max_time_mins=60,
                         cv=TimeSeriesSplit(n_splits=3))

    tpot.fit(X_train, y_train)
    print(tpot.score(X_test, y_test))
    tpot.export('./tpot_out.py')
예제 #5
0
def train_tpot(name, X, y, gen, cores):
    """Run a TPOT regression search on a 75/25 split and save the result.

    The run is labelled ``gen_<gen><name>_<yymmdd>``; the exported pipeline
    script and the pickled fitted pipeline are written under
    ``trained_models/`` using that label.

    Args:
        name: Text appended to the generation count in the run label.
        X: Feature matrix.
        y: Target values; the training part must support ``reshape``.
        gen: Number of TPOT generations.
        cores: Value passed to TPOT as ``n_jobs``.
    """
    test_name = 'gen_' + str(gen) + name + '_' + time.strftime('%y%m%d')
    print('Training with TPOT .... ', test_name)

    started_at = time.time()
    parts = train_test_split(X, y, train_size=0.75, test_size=0.25)
    X_train, X_test, y_train, y_test = parts

    search = TPOTRegressor(generations=gen,
                           population_size=50,
                           verbosity=2,
                           n_jobs=cores)
    # Flatten the training target to one dimension before fitting.
    flat_target = y_train.reshape(-1, )
    search.fit(X_train, flat_target)

    print(search.score(X_test, y_test))
    finished_at = time.time()
    print('Time to train...:', finished_at - started_at)

    print('Saving the model ...')
    output_stem = 'trained_models/' + test_name
    search.export(output_stem + '.py')
    joblib.dump(search.fitted_pipeline_, output_stem + '.pk1')
    print(test_name, ' saved ... ')
예제 #6
0
def auto_ml(X_train, X_test, y_train, y_test):
    """Search for a regression pipeline with TPOT, print its test score and export it.

    Checkpoints are written to ``tpot_checkpoint/`` during the search and the
    best pipeline is exported to ``tpot_pipeline.py``.
    """
    search_settings = {
        'generations': 30,
        'population_size': 200,
        'verbosity': 2,
        'periodic_checkpoint_folder': "tpot_checkpoint/",
    }
    optimizer = TPOTRegressor(**search_settings)
    optimizer.fit(X_train, y_train)

    holdout_score = optimizer.score(X_test, y_test)
    print(holdout_score)

    optimizer.export('tpot_pipeline.py')
예제 #7
0
def tpotRegressor(train_data, target_value):
    """Fit a default TPOTRegressor on a 75/25 split of ``train_data``.

    Args:
        train_data: Table holding both the features and the target column
            (presumably a pandas DataFrame; it must support ``drop`` and
            column indexing).
        target_value: Name of the target column.

    Returns:
        Tuple ``(regressor, score)``: the fitted ``TPOTRegressor`` and its
        score on the held-out 25%. The best pipeline is also exported to
        ``my_pipeline.py``.
    """
    # The target column must not be part of the features: leaving it in
    # (as before) lets the model read the answer directly from its input.
    features = train_data.drop(target_value, axis=1)
    target = train_data[target_value]

    regressor = TPOTRegressor()
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, train_size=0.75, test_size=0.25)
    regressor.fit(X_train, y_train)
    score = regressor.score(X_test, y_test)
    regressor.export('my_pipeline.py')
    return regressor, score
예제 #8
0
 def fit_single_output(row):
     """Fit a 'TPOT light' regressor of X against ``row`` and return the fitted pipeline.

     ``X``, ``generations`` and ``population_size`` come from the enclosing
     scope. The score on the same data used for fitting is printed.
     """
     settings = dict(generations=generations,
                     population_size=population_size,
                     verbosity=2,
                     n_jobs=1,
                     config_dict='TPOT light')
     tpot = TPOTRegressor(**settings)
     fitted = tpot.fit(X, row)
     best_pipeline = fitted.fitted_pipeline_
     print(tpot.score(X, row))
     return best_pipeline
def go_tpot():
    """Run a short TPOT regression search and export a timestamped pipeline.

    Uses the module-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``.
    The test score is printed and the best pipeline is written to
    ``../models/tpot_pipeline_<YYYY.mm.dd_HHMMSS>.py``.
    """
    from tpot import TPOTRegressor
    import datetime

    # Error metrics are exposed as negated scorers ('neg_' prefix) so that a
    # larger value is always better; the bare 'mean_absolute_error' name is
    # the old spelling and is not a valid scorer in current releases.
    tpot = TPOTRegressor(generations=5,
                         population_size=20,
                         verbosity=3,
                         scoring='neg_mean_absolute_error')
    tpot.fit(X_train, y_train)
    print(tpot.score(X_test, y_test))

    timestamp = datetime.datetime.now().strftime('%Y.%m.%d_%H%M%S')
    tpot.export('../models/tpot_pipeline_' + timestamp + '.py')
예제 #10
0
def regression():
    """Run TPOT on the Boston housing data and export the best pipeline.

    The data is split 75/25 with a fixed seed, the test score is printed and
    the pipeline is written to ``tpot_boston_pipeline.py``.
    """
    seed = 42
    boston = load_boston()
    split = train_test_split(boston.data,
                             boston.target,
                             train_size=0.75,
                             test_size=0.25,
                             random_state=seed)
    X_train, X_test, y_train, y_test = split

    optimizer = TPOTRegressor(generations=5,
                              population_size=50,
                              verbosity=2,
                              random_state=seed)
    optimizer.fit(X_train, y_train)

    test_score = optimizer.score(X_test, y_test)
    print(test_score)
    optimizer.export('tpot_boston_pipeline.py')
예제 #11
0
def test_sample_weight_func():
    """Assert that the TPOTRegressor score function outputs a known score for a fixed pipeline with sample weights.

    Three things are checked: two unweighted cross-validation runs with the
    same seed agree, a weighted run differs from them, and the score of the
    pipeline fitted with sample weights matches a stored reference value.
    """

    tpot_obj = TPOTRegressor(scoring='neg_mean_squared_error')

    # Reify pipeline with known score

    pipeline_string = ("ExtraTreesRegressor("
        "GradientBoostingRegressor(input_matrix, GradientBoostingRegressor__alpha=0.8,"
        "GradientBoostingRegressor__learning_rate=0.1,GradientBoostingRegressor__loss=huber,"
        "GradientBoostingRegressor__max_depth=5, GradientBoostingRegressor__max_features=0.5,"
        "GradientBoostingRegressor__min_samples_leaf=5, GradientBoostingRegressor__min_samples_split=5,"
        "GradientBoostingRegressor__n_estimators=100, GradientBoostingRegressor__subsample=0.25),"
        "ExtraTreesRegressor__bootstrap=True, ExtraTreesRegressor__max_features=0.5,"
        "ExtraTreesRegressor__min_samples_leaf=5, ExtraTreesRegressor__min_samples_split=5, "
        "ExtraTreesRegressor__n_estimators=100)")
    tpot_obj._optimized_pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset)
    tpot_obj._fitted_pipeline = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline)
    tpot_obj._fitted_pipeline.fit(training_features_r, training_classes_r)

    # The pipeline is compiled a second time from the same string, replacing
    # the instance that was just fitted above.
    tpot_obj._optimized_pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset)
    tpot_obj._fitted_pipeline = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline)

    # make up a sample weight: 1, 2, ..., n (one weight per training sample)
    training_classes_r_weight = np.array(range(1, len(training_classes_r)+1))
    # The result is used below both as cross_val_score's fit_params and as
    # keyword arguments to the pipeline's fit().
    training_classes_r_weight_dict = set_sample_weight(tpot_obj._fitted_pipeline.steps, training_classes_r_weight)

    # NumPy's global RNG is reseeded before every run so that the runs are
    # comparable with each other.
    np.random.seed(42)
    cv_score1 = cross_val_score(tpot_obj._fitted_pipeline, training_features_r, training_classes_r, cv=3, scoring='neg_mean_squared_error')

    np.random.seed(42)
    cv_score2 = cross_val_score(tpot_obj._fitted_pipeline, training_features_r, training_classes_r, cv=3, scoring='neg_mean_squared_error')

    np.random.seed(42)
    cv_score_weight = cross_val_score(tpot_obj._fitted_pipeline, training_features_r, training_classes_r, cv=3, scoring='neg_mean_squared_error', fit_params=training_classes_r_weight_dict)

    np.random.seed(42)
    tpot_obj._fitted_pipeline.fit(training_features_r, training_classes_r, **training_classes_r_weight_dict)
    # Get score from TPOT
    known_score = 12.643383517 # Assumes use of mse
    score = tpot_obj.score(testing_features_r, testing_classes_r)

    # Tolerance-based float comparison, see
    # http://stackoverflow.com/questions/5595425/
    def isclose(a, b, rel_tol=1e-09, abs_tol=0.0):
        return abs(a - b) <= max(rel_tol * max(abs(a), abs(b)), abs_tol)
    # Same seed, no weights: the two runs must agree.
    assert np.allclose(cv_score1, cv_score2)
    # Passing the sample weights must change the cross-validation scores.
    assert not np.allclose(cv_score1, cv_score_weight)
    assert isclose(known_score, score)
예제 #12
0
    def regression(self, timeMax=60):
        """Run a TPOT regression search scored by RMSE and save its artefacts.

        Data comes from ``self.dataFunction``; the TPOT configuration is
        ``regressor_config`` updated with ``self.model``. After fitting, the
        exported pipeline, the evaluated-individuals history
        (``performance_history.pkl``) and the fitted Pareto-front pipelines
        (``PARETO.pkl``) are written under ``self.savePath``.

        Args:
            timeMax: Value passed to TPOT as ``max_time_mins``.
        """
        def rmse_scorer(y_true, y_pred):
            # NOTE(review): the `squared` keyword is deprecated in recent
            # scikit-learn releases — confirm against the pinned version.
            return mean_squared_error(y_true, y_pred, squared=False)

        # greater_is_better=False: the scorer reports the negated RMSE, which
        # is why the test score is negated again further down.
        my_custom_scorer = make_scorer(rmse_scorer, greater_is_better=False)

        print(f"Starting regression with {self.modelName}")
        X_train, X_test, y_train, y_test = self.dataFunction(
            preprocessed=self.preprocessed,
            specifics="TPOT",
            trainSize=self.trainSize,
            nDataPoints=self.nDataPoints)

        # Change dict for prediction model (shallow copy, so the top-level
        # keys of regressor_config itself are left untouched).
        config_copy = regressor_config.copy()
        config_copy.update(self.model)

        # TPOT automated feature engineering
        start_time = time.time()
        tpot = TPOTRegressor(generations=self.generations,
                             population_size=self.popSize,
                             verbosity=2,
                             config_dict=config_copy,
                             max_time_mins=timeMax,
                             max_eval_time_mins=30,
                             cv=4,
                             scoring=my_custom_scorer)

        tpot.fit(X_train, y_train)
        # Whole minutes spent fitting.
        total_time = int((time.time() - start_time) // 60)
        print(tpot.evaluated_individuals_)
        print(f"Time: {total_time}")

        # Prediction score, truncated to an integer because it is also used
        # in the export file name. The metric is RMSE (see rmse_scorer), so
        # the message says RMSE rather than the previous, incorrect "MSE".
        predictionScore = int(-tpot.score(X_test, y_test))
        print(f"Final RMSE prediction score: {predictionScore}")

        # Export model
        tpot.export(
            f'{self.savePath}/time{total_time}_score{predictionScore}_trainSize{self.trainSize}_PIPE.py'
        )
        # Export History
        with open(f'{self.savePath}/performance_history.pkl', "wb") as handle:
            pickle.dump(tpot.evaluated_individuals_, handle)
        # Export pareto front
        with open(f'{self.savePath}/PARETO.pkl', "wb") as handle:
            pickle.dump(tpot.pareto_front_fitted_pipelines_, handle)
예제 #13
0
def model_selection_and_HPO(dataframe, target="job_performance", test_size=0.25, r_seed=123):
    """Run model selection and hyperparameter optimisation with TPOT.

    Pass in the dataframe that has already gone through feature selection.
    The TPOT search is stochastic; ``r_seed`` seeds both the train/test split
    and the search itself. The selected pipeline is written to
    ``tpot_exported_pipeline.py`` so it can be imported later, and the
    runtime and test score are printed.

    Args:
        dataframe: Table holding the features and the target column.
        target: Name of the target column.
        test_size: Fraction of rows held out for testing.
        r_seed: Random seed for the split and for TPOT.
    """
    # The package is named `tpot` (lowercase); the former `import TPOT`
    # referred to a module that does not exist. The unused classification
    # metric imports were dropped as well.
    import timeit
    from sklearn.model_selection import train_test_split
    from tpot import TPOTRegressor

    # train test split; the target is flattened to 1-D once, up front
    X_train, X_test, y_train, y_test = train_test_split(
        dataframe.loc[:, dataframe.columns != target].values,
        dataframe[target].values.ravel(),
        test_size=test_size,
        random_state=r_seed)

    # model selection and hyperparameter optimization with TPOT Regressor
    tpot_regressor = TPOTRegressor(generations=20,
                                   population_size=50,
                                   cv=10,
                                   random_state=r_seed,
                                   verbosity=2,
                                   memory='auto')

    # The reported runtime covers the search plus one prediction pass over
    # the test set, as before; the predictions themselves are not used.
    start_time = timeit.default_timer()
    tpot_regressor.fit(X_train, y_train)
    tpot_regressor.predict(X_test)
    end_time = timeit.default_timer()

    print(f"Total runtime for the Employee dataset: {end_time-start_time}s")
    print("TPOT Score: {}".format(tpot_regressor.score(X_test, y_test)))

    tpot_regressor.export('tpot_exported_pipeline.py')
예제 #14
0
    def fit(self):
        """Fit a TPOT regressor on a split of ``self.X``/``self.y`` and time the fit.

        Returns:
            Tuple ``(predict_score, cost_time)``: the score on the held-out
            part and the ``timedelta`` spent inside ``fit``.
        """
        features_fit, features_eval, target_fit, target_eval = train_test_split(
            self.X, self.y, train_size=self.train_size, random_state=0)

        # NOTE(review): population_size is fed from self.generation as well;
        # confirm this is intended and not meant to be a separate setting.
        optimizer = TPOTRegressor(
            generations=self.generation,
            population_size=self.generation,
            verbosity=3,
            warm_start=True,
            config_dict=self.config_dict,
        )

        started = datetime.datetime.now()
        optimizer.fit(features_fit, target_fit)
        finished = datetime.datetime.now()

        return optimizer.score(features_eval, target_eval), finished - started
def functionRegression(sparkDF, listOfFeatures, label):
    """Fit a TPOT regressor on a Spark DataFrame and print its test score.

    The Spark DataFrame is persisted, converted to pandas, split 80/20 and
    fed to a time-boxed ``TPOTRegressor`` search.

    Args:
        sparkDF: Spark DataFrame holding the features and the label column.
        listOfFeatures: Names of the columns to use as features. When it is
            ``None`` or empty, every column except ``label`` is used.
        label: Name of the target column.

    Raises:
        ValueError: If ``listOfFeatures`` is non-empty but none of its names
            is a (non-label) column of the data.
    """
    sparkDF.persist(pyspark.StorageLevel.MEMORY_AND_DISK)
    df = sparkDF.toPandas()

    if listOfFeatures is not None and len(listOfFeatures) > 0:
        # Actually apply the requested feature list. The earlier
        # `df.columns.intersection(listOfFeatures)` call discarded its
        # result, so every column was used regardless of the argument.
        wanted = set(listOfFeatures)
        feature_cols = [c for c in df.columns if c in wanted and c != label]
        if not feature_cols:
            raise ValueError(
                "none of the requested features is a column of the data")
        X = df[feature_cols].values
    else:
        X = df.drop(label, axis=1).values
    y = df[label].values

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        random_state=1,
                                                        test_size=0.2)
    tpotModel = TPOTRegressor(verbosity=3,
                              generations=10,
                              max_time_mins=15,
                              n_jobs=-1,
                              random_state=25,
                              population_size=15)
    tpotModel.fit(X_train, y_train)
    print(tpotModel.score(X_test, y_test))
예제 #16
0
파일: tests.py 프로젝트: rhiever/tpot
def test_score_3():
    """Check that TPOTRegressor.score returns the stored value for a fixed pipeline."""

    # Tolerance-based float comparison,
    # see http://stackoverflow.com/questions/5595425/
    def isclose(a, b, rel_tol=1e-09, abs_tol=0.0):
        allowed = max(rel_tol * max(abs(a), abs(b)), abs_tol)
        return abs(a - b) <= allowed

    expected = 8.9673743407873712  # Assumes use of mse
    pipeline_expr = 'ExtraTreesRegressor(GradientBoostingRegressor(input_matrix, 100.0, 0.11), 0.17999999999999999)'

    regressor = TPOTRegressor(scoring='neg_mean_squared_error')
    # Silent progress bar.
    regressor._pbar = tqdm(total=1, disable=True)

    # Install the hand-written individual as the optimized pipeline and fit it.
    individual = creator.Individual.from_string(pipeline_expr, regressor._pset)
    regressor._optimized_pipeline = individual
    regressor._fitted_pipeline = regressor._toolbox.compile(
        expr=regressor._optimized_pipeline)
    regressor._fitted_pipeline.fit(training_features_r, training_classes_r)

    # Score through TPOT and compare with the reference.
    actual = regressor.score(testing_features_r, testing_classes_r)
    assert isclose(expected, actual)
예제 #17
0
def TPOTRegressor(ATM):
    """Fit a TPOT regressor from ``ATM`` inputs/props and report the results.

    Reads ``X`` and ``y`` from ``ATM.inputs`` and the search settings
    (``generations``, ``population_size``, ``verbosity``, ``random_state``)
    from ``ATM.props``. Reports a ``stats`` entry with the score and a
    ``log`` entry with the exported pipeline, then saves the exported
    pipeline as ``model.tpot``.

    Args:
        ATM: Object exposing ``inputs``, ``props``, ``report`` and ``save``.
    """
    # This function has the same name as the TPOT estimator class, so inside
    # it the bare name refers to the function itself; calling it with keyword
    # arguments recursed into this function and failed. Import the class
    # locally under an alias instead.
    from tpot import TPOTRegressor as _TPOTEstimator

    X = ATM.inputs["X"]
    y = ATM.inputs["y"]
    tpot = _TPOTEstimator(generations=ATM.props["generations"],
                          population_size=ATM.props["population_size"],
                          verbosity=ATM.props["verbosity"],
                          random_state=ATM.props["random_state"])
    tpot.fit(X, y)

    # The original scored on `payload.X_test` / `y_test`, neither of which is
    # defined in this function.
    # NOTE(review): assumes held-out data, when supplied, arrives under the
    # "X_test" / "y_test" input keys — confirm against the caller. Without
    # them the score falls back to the training data.
    try:
        X_eval, y_eval = ATM.inputs["X_test"], ATM.inputs["y_test"]
    except KeyError:
        X_eval, y_eval = X, y

    # Export once and reuse the result for both the log and the saved model.
    exported = tpot.export()
    ATM.report({
        'name': "stats",
        'stats': {
            'score': tpot.score(X_eval, y_eval)
        }
    })
    ATM.report({
        'name': "log",
        'payload': {
            'model': exported
        }
    })
    ATM.save("model.tpot", exported)
예제 #18
0
파일: tests.py 프로젝트: sepehrmn/tpot
def test_score_3():
    """Confirm that TPOTRegressor.score matches a stored value for a fixed pipeline."""
    target_score = 8.9673743407873712  # Assumes use of mse

    estimator = TPOTRegressor(scoring='mean_squared_error')
    # Silent progress bar.
    estimator._pbar = tqdm(total=1, disable=True)

    # Build the individual from its string form, compile it and fit it.
    expression = 'ExtraTreesRegressor(GradientBoostingRegressor(input_matrix, 100.0, 0.11), 0.17999999999999999)'
    estimator._optimized_pipeline = creator.Individual.from_string(
        expression, estimator._pset)
    compiled_pipeline = estimator._toolbox.compile(
        expr=estimator._optimized_pipeline)
    estimator._fitted_pipeline = compiled_pipeline
    estimator._fitted_pipeline.fit(training_features_r, training_classes_r)

    # Score via the public API.
    result = estimator.score(testing_features_r, testing_classes_r)

    # Tolerance-based float comparison,
    # see http://stackoverflow.com/questions/5595425/
    def isclose(a, b, rel_tol=1e-09, abs_tol=0.0):
        difference = abs(a - b)
        return difference <= max(rel_tol * max(abs(a), abs(b)), abs_tol)

    assert isclose(target_score, result)
예제 #19
0
def show_data(dataset_train, classifier_name, params):
    """Show the dataset in Streamlit, run a TPOT regression search and save the model.

    The first column of ``dataset_train`` is the target and the remaining
    columns are the features. The data is split 75/25 with a fixed seed; the
    best pipeline is exported to ``tpot_boston_pipeline.py``, the absolute
    test score is displayed, and the fitted pipeline is pickled to
    ``filename``.

    Args:
        dataset_train: Table exposing ``.values`` (target in column 0).
        classifier_name: Label shown in the UI for the model being trained.
        params: Mapping holding the generation count under the key
            ``'2.1 Tune parameter: Generation (Epoch)'``.
    """
    st.write("Training dataset:", dataset_train)
    X = dataset_train.values[:, 1:]
    y = dataset_train.values[:, 0]
    st.write('Shape of dataset:', X.shape, '=> ', X.shape[0], 'rows and ',
             X.shape[1], 'columns of dataset')
    st.write(f'Classifier = {classifier_name}',
             '=> model to train the dataset')

    generation = params['2.1 Tune parameter: Generation (Epoch)']
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        train_size=0.75,
                                                        test_size=0.25,
                                                        random_state=42)

    tpot = TPOTRegressor(generations=generation,
                         population_size=50,
                         verbosity=2,
                         random_state=42)
    tpot.fit(X_train, y_train)

    tpot.export('tpot_boston_pipeline.py')
    # abs() because the score is displayed as a positive error value.
    MSE = abs(tpot.score(X_test, y_test))
    # Built-in round() works for plain floats as well as NumPy scalars;
    # `.round` exists only on the latter.
    st.write("MSE (Mean Squared Error):", round(MSE, 2))

    # Save the fitted pipeline (not the TPOT object itself) to disk, see
    # https://github.com/EpistasisLab/tpot/issues/11#issuecomment-341421022
    # `filename` is not defined in this function; it is expected to exist at
    # module level. The `with` block closes the file handle, which the
    # previous bare open() call never did.
    with open(filename, 'wb') as model_file:
        pickle.dump(tpot.fitted_pipeline_, model_file)
예제 #20
0
def ensemble_tpot(city, state, target, horizon, lookback):
    """Fit a TPOT regressor on lagged cluster data and plot its forecast.

    Loads the cluster definition for ``state``, builds lagged features for
    the cluster containing ``city``, fits TPOT on the first 70% of the rows
    against the target shifted ``horizon - 1`` rows ahead, exports the best
    pipeline to ``tpot_<city>_pipeline.py``, prints the score on the
    remaining rows and shows the prediction plot.

    Args:
        city: City identifier used to select the cluster and name the outputs.
        state: State code used in the clusters file name.
        target: Name of the target column in the lagged data.
        horizon: Forecast horizon; the target is shifted ``horizon - 1`` rows.
        lookback: Number of lags passed to ``build_lagged_features``.

    Returns:
        Whatever ``plot_prediction`` returns.
    """
    with open('../analysis/clusters_{}.pkl'.format(state), 'rb') as fp:
        clusters = pickle.load(fp)
        data, group = get_cluster_data(city,
                                       clusters=clusters,
                                       data_types=DATA_TYPES,
                                       cols=PREDICTORS)

    casos_est_columns = ['casos_est_{}'.format(i) for i in group]
    casos_columns = ['casos_{}'.format(i) for i in group]

    data = data.drop(casos_columns, axis=1)
    data_lag = build_lagged_features(data, lookback)
    # dropna() returns a new frame; the result used to be thrown away, so
    # the rows containing NaN were never actually removed.
    data_lag = data_lag.dropna()

    X_data = data_lag.drop(casos_est_columns, axis=1)
    # Only the feature halves of the split are used; the targets are rebuilt
    # below from the shifted series. shuffle=False keeps the row order.
    X_train, X_test, _, _ = train_test_split(X_data,
                                             data_lag[target],
                                             train_size=0.7,
                                             test_size=0.3,
                                             shuffle=False)

    # Shift the target `lead` rows ahead and cut the trailing rows that have
    # no shifted value. With horizon == 1 the lead is 0 and `[:-0]` would
    # select nothing at all, so the slice is skipped in that case.
    lead = horizon - 1
    shifted_target = data_lag[target].shift(-lead)
    tgt_full = shifted_target[:-lead] if lead else shifted_target
    tgt = tgt_full[:len(X_train)]
    tgtt = tgt_full[len(X_train):]

    model = TPOTRegressor(generations=20,
                          population_size=100,
                          verbosity=2,
                          n_jobs=32)
    model.fit(X_train, target=tgt)
    model.export('tpot_{}_pipeline.py'.format(city))
    print(model.score(X_test[:len(tgtt)], tgtt))

    pred = plot_prediction(X_data[:len(tgt_full)], tgt_full, model,
                           'Out_of_Sample_{}_{}'.format(horizon,
                                                        city), horizon)
    plt.show()
    return pred
예제 #21
0
def TPOTAutoMLRegressor(data, settings):
    """Run a default TPOT regression search on ``data`` and record its score.

    The features and target are fetched with ``data.get_data`` for the
    dataset's default target attribute, NaNs are replaced via
    ``np.nan_to_num``, the data is split with ``train_test_split`` defaults,
    and the held-out score is registered as ``'TPOTAutoML'`` on ``settings``.

    Args:
        data: Dataset object exposing ``get_data`` and
            ``default_target_attribute``.
        settings: Object exposing ``addAlgorithm(name, score)``.
    """
    regressor = TPOTRegressor()
    # The attribute names returned as the third value are not needed here.
    X, y, _ = data.get_data(target=data.default_target_attribute,
                            return_attribute_names=True)

    X = np.nan_to_num(X)
    y = np.nan_to_num(y)

    X_train, X_test, y_train, y_test = train_test_split(X, y)
    regressor.fit(X_train, y_train)
    score = regressor.score(X_test, y_test)

    settings.addAlgorithm('TPOTAutoML', score)
예제 #22
0
class runmodel:
    """Holds one TPOT classifier and one TPOT regressor with fixed settings.

    Both estimators are created once in ``__init__`` and reused by every
    call to ``regressor`` / ``classifier``.
    """

    def __init__(self):
        self.tpotclassifier = TPOTClassifier(generations=5, verbosity=2,
                                             population_size=20, random_state=7)
        self.tpotregressor = TPOTRegressor(generations=5, verbosity=2,
                                           population_size=20, random_state=7)

    @staticmethod
    def _fit_and_score(estimator, dataframe, target):
        """Fit ``estimator`` on a 75/25 split of ``dataframe``; return the test score."""
        x = dataframe.drop(target, axis=1)
        # Select the target as a 1-D Series. The previous `dataframe[[target]]`
        # produced a one-column DataFrame, i.e. a 2-D target.
        y = dataframe[target]
        X_train, X_test, y_train, y_test = train_test_split(
            x, y, train_size=0.75, test_size=0.25)
        estimator.fit(X_train, y_train)
        return estimator.score(X_test, y_test)

    def regressor(self, dataframe, target):
        """Fit the TPOT regressor on ``dataframe`` and return its held-out score.

        Args:
            dataframe: Table holding the features and the target column.
            target: Name of the target column.
        """
        return self._fit_and_score(self.tpotregressor, dataframe, target)

    def classifier(self, dataframe, target):
        """Fit the TPOT classifier on ``dataframe`` and return its held-out score.

        Args:
            dataframe: Table holding the features and the target column.
            target: Name of the target column.
        """
        return self._fit_and_score(self.tpotclassifier, dataframe, target)
예제 #23
0
    def callback(self, channel, method, properties, body):
        """Handle one queue message: fit TPOT on its data and publish the result.

        The message is decoded and acknowledged under ``self.lock``, then a
        'TPOT light' regressor is fitted. On success the tuple
        ``(fitted_pipeline, score, folds_index, symbol)`` is published to the
        ``tpot_pipelines`` routing key; if fitting fails, a tuple of four
        ``None`` values is published instead.

        Args:
            channel: Channel used for acknowledging and publishing.
            method: Delivery metadata; ``delivery_tag`` is used for the ack.
            properties: Message properties (unused).
            body: Encoded payload understood by ``decode_data``.
        """
        with self.lock:
            (symbol, X_train, X_test, y_train, y_test,
             folds_index) = decode_data(body)
            # The message is acknowledged before the (long) fit starts.
            channel.basic_ack(delivery_tag=method.delivery_tag)
        logger.info("data received %s %d", symbol, folds_index)
        tpot = TPOTRegressor(memory='auto',
                             generations=100,
                             population_size=100,
                             n_jobs=-1,
                             max_time_mins=20,
                             max_eval_time_mins=20,
                             config_dict='TPOT light')
        try:
            tpot.fit(X_train, y_train)
        except Exception as e:
            logger.error(e)
            # Publish an all-None result so the consumer is still notified.
            data = (None, None, None, None)
            with self.lock:
                channel.basic_publish(exchange='',
                                      routing_key='tpot_pipelines',
                                      body=encode_data(data))
            return

        score = tpot.score(X_test, y_test)
        logger.info("sending result of %s %s", symbol, folds_index)
        try:
            data = (tpot.fitted_pipeline_, score, folds_index, symbol)
            with self.lock:
                channel.basic_publish(exchange='',
                                      routing_key='tpot_pipelines',
                                      body=encode_data(data))
        except Exception:
            # Log with traceback instead of dropping into pdb, which would
            # block an unattended worker forever.
            logger.exception("failed to publish result of %s %s", symbol,
                             folds_index)
from sklearn.externals import joblib
from sklearn.metrics import r2_score

from time import time

# Use only every n_skip-th row: testing on a smaller data set.
n_skip = 100
row_stride = slice(None, None, n_skip)
features = pd.read_csv('pmap_raw_16features.csv').iloc[row_stride]
labels = pd.read_csv('pmap_raw_labels_and_errors.csv')['Flux'].iloc[row_stride]

# Split the row positions 80/20 into a training part and a validation part.
idx = np.arange(labels.size)
training_indices, validation_indices = train_test_split(idx, test_size=0.20)

train_features = features.iloc[training_indices].values
train_labels = labels.iloc[training_indices].values
validation_features = features.iloc[validation_indices].values
validation_labels = labels.iloc[validation_indices].values

# Let TPOT's genetic programming search for the model and hyperparameters.
tpot = TPOTRegressor(generations=10, verbosity=2, n_jobs=-1)

start = time()
tpot.fit(train_features, train_labels)
elapsed_minutes = (time() - start) / 60
print('Full TPOT regressor operation took {:.1f} minutes'.format(
    elapsed_minutes))

# Score the best pipeline on the validation part.
validation_score = tpot.score(validation_features, validation_labels)
print('Best pipeline test accuracy: {:.3f}'.format(validation_score))

# Export the generated code.
tpot.export('spitzer_calibration_tpot_best_pipeline.py')
예제 #25
0
        'Os', 'Ir', 'Pt', 'Au', 'Hg'
    ]
    tmlist.append('O')  # we need O
    tmlist.append('H')  # we need H
    tmset = set(tmlist)
    badset = elementset - tmset
    for badel in badset:
        data = data.loc[data[badel] == 0]

# Hold out 10% of the rows for testing (fixed seed).
traindata, testdata = train_test_split(data, test_size=0.1, random_state=1)

# Run one independent TPOT search per output column, using the element
# columns as features.
outcols = ['V_min', 'V_max', 'pH_min', 'pH_max', 'area', 'energy_per_atom']
tpot_settings = {
    'generations': 20,
    'population_size': 100,
    'verbosity': 2,
    'n_jobs': 1,
}
for output in outcols:
    trainX, trainy = traindata[elementlist], traindata[output]
    testX, testy = testdata[elementlist], testdata[output]

    # A fresh optimizer is created for every output.
    pipeline_optimizer = TPOTRegressor(**tpot_settings)
    pipeline_optimizer.fit(trainX, trainy)

    # Scored with TPOT's scoring function (no custom scoring is configured).
    testscore = pipeline_optimizer.score(testX, testy)

    print('{} Test Score: {}'.format(output, testscore))
예제 #26
0

# Run settings.
random_state = 1618
label = 'age'
n_gen = 500
n_pop = 500

# Training and test tables; the `label` column is the target.
brainage_train_data = pd.read_csv('BrainAGE_train.csv')
brainage_test_data = pd.read_csv('BrainAGE_test.csv')

Xdatatrain = brainage_train_data.drop(label, axis='columns')
Ydatatrain = brainage_train_data[label]
Xdatatest = brainage_test_data.drop(label, axis='columns')
Ydatatest = brainage_test_data[label]

# TPOT search restricted to Selector -> Transformer -> Regressor pipelines,
# scored by r2 with a 5-split TimeSeriesSplit.
search_options = dict(
    generations=n_gen,
    population_size=n_pop,
    verbosity=2,
    config_dict=regressor_config_dict,
    scoring='r2',
    random_state=random_state,
    cv=TimeSeriesSplit(n_splits=5),
    template='Selector-Transformer-Regressor',
)
tpot = TPOTRegressor(**search_options)
tpot.fit(Xdatatrain.values, Ydatatrain.values)

test_r2 = tpot.score(Xdatatest.values, Ydatatest.values)
print(test_r2)
tpot.export('tpot_brainAGE_pipeline.py')
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tpot import TPOTRegressor

# Load both files; only `train` is used for fitting and scoring below.
train = pd.read_csv('input/train.csv')
test = pd.read_csv('input/test.csv')

# Integer-encode every object-dtype column of `train`. The encoder is fitted
# on the values of both files and applied to `train` only.
for c in train.columns:
    if train[c].dtype != 'object':
        continue
    lbl = LabelEncoder()
    combined_values = list(train[c].values) + list(test[c].values)
    lbl.fit(combined_values)
    train[c] = lbl.transform(list(train[c].values))

# 75/25 split of the training file; column 'y' is the target.
feature_frame = train.drop('y', axis=1)
target_series = train['y']
X_train, X_test, y_train, y_test = train_test_split(
    feature_frame, target_series, train_size=0.75, test_size=0.25)

pipeline_optimizer = TPOTRegressor(
    generations=10,
    population_size=100,
    cv=5,
    random_state=42,
    verbosity=2,
    warm_start=True,
)
pipeline_optimizer.fit(X_train, y_train)

holdout_score = pipeline_optimizer.score(X_test, y_test)
print(holdout_score)
pipeline_optimizer.export('tpot_exported_pipeline_overnight.py')
# Separate the target from the features; pop() removes 'progression' from
# df in place, so X (an alias of df) holds the features only.
y = df.pop('progression')
X = df

# Split training and test data.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Specify the model. The LinearRegression instance that used to be assigned
# here was overwritten on the next line without ever being used, so it was
# removed.
regr = TPOTRegressor(generations=5, population_size=50, verbosity=2, n_jobs=-1)

# Train the model using the training set.
regr.fit(X_train, y_train)

# Report the scores; they were previously computed and silently discarded.
# TPOTRegressor.score uses TPOT's scoring function (negated mean squared
# error by default), not the R^2 / explained-variance value a plain
# scikit-learn regressor reports.
print('score (all data):', regr.score(X, y))
print('score (train):', regr.score(X_train, y_train))
print('score (test):', regr.score(X_test, y_test))

# Generate predictions, append them to a COPY of the test features and write
# the table to Excel. Copying keeps X_test itself unchanged (and avoids
# pandas' SettingWithCopyWarning).
y_pred = regr.predict(X_test)
results = X_test.copy()
results['progression'] = y_test
results['pred_progression'] = y_pred
results.to_excel(r'diabetes.xls', header=True, index=True)
def model_dev(train_set, matchups, spreads):
    """Fit a TPOT regression pipeline on game-stat differentials and print picks.

    The function standardizes eight differential columns of ``train_set``,
    searches for a regression pipeline with TPOT, exports the best pipeline to
    ``NFL_ML_TPOT_Regressor.py``, predicts a value for every row of
    ``matchups`` and compares the predictions with ``spreads``.

    Args:
        train_set: DataFrame holding the feature columns listed below and a
            ``result_spread`` (or already renamed ``class``) target column.
            It is modified in place: ``result_spread`` is renamed to ``class``.
        matchups: DataFrame of upcoming games. It is modified in place: the
            ``week``, ``home_team`` and ``away_team`` columns are dropped.
            The remaining columns are passed to the scaler fitted on the
            training features, so they presumably must match those eight
            columns - verify against the caller.
        spreads: Values compared element-wise against the predictions.

    Returns:
        None. The test score, ``spreads``, the predictions and the bet vector
        are printed.
    """
    feature_columns = [
        'rush_attempt_diff', 'turn_diff', 'yards_diff', 'third_diff',
        'sack_diff', 'sack_ydiff', 'poss_diff', 'p_attempt_diff',
    ]
    # Copy so that scaling never touches the caller's frame.
    X = train_set[feature_columns].copy()

    # The target column is used under the name 'class'. The rename is done in
    # place (callers see it) and is a no-op when the column is already renamed.
    train_set.rename(columns={'result_spread': 'class'}, inplace=True)
    y = train_set['class']

    # Standardize the features; the same fitted scaler is reused for matchups.
    scaler = preprocessing.StandardScaler().fit(X)
    X = scaler.transform(X)

    # Drops only this function's reference; the caller's frame stays alive.
    del train_set

    # Split out training and testing data sets.
    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        X, y, test_size=0.25, random_state=0)

    pipeline_optimizer = TPOTRegressor(generations=5, population_size=10,
                                       random_state=42, cv=5, verbosity=2,
                                       n_jobs=3)
    pipeline_optimizer.fit(X_train, y_train)
    # Single-argument print(...) is valid on both Python 2 and Python 3.
    print(pipeline_optimizer.score(X_test, y_test))
    pipeline_optimizer.export('NFL_ML_TPOT_Regressor.py')

    # Remove the 'week', 'home_team' and 'away_team' columns from matchups as
    # they are not used by the model (in place, visible to the caller).
    matchups.drop(['week', 'home_team', 'away_team'], axis=1, inplace=True)

    predicted_spreads = pd.DataFrame(
        pipeline_optimizer.predict(scaler.transform(matchups)),
        columns=['results'])
    # 0 where the prediction exceeds the given spread, 1 otherwise.
    bet_vector = np.array(np.where(predicted_spreads > spreads, 0, 1))
    print(spreads)
    print(predicted_spreads)
    print(bet_vector)
예제 #30
0
파일: maintrain.py 프로젝트: nicrie/tbd
# Fit the ensemble (built earlier, outside this excerpt) and round its test
# predictions to whole numbers.
ensemble2.fit(X_train2, y_train2)
predvot2 = ensemble2.predict(X_test2).round(0)
# NOTE(review): `mse` is presumably a mean-squared-error function; if so, the
# square root printed below is the RMSE - confirm against the import.
MSE6 = mse(y_test2, predvot2)
print("Average error on new number of hospitalizations per day:",
      round(MSE6**0.5, 0))
print(MSE6)
print('OK')

# Run a seeded TPOT search on the same split, print its test score and export
# the resulting pipeline.
print("TPOTRegressor")
tpot = TPOTRegressor(generations=50,
                     population_size=50,
                     verbosity=2,
                     random_state=42)
tpot.fit(X_train2, y_train2)
print(tpot.score(X_test2, y_test2))
tpot.export('tpot_covid_pipeline.py')

# Neural-network variant: reshape the 2-D feature tables to
# (rows, columns, 1) so that each column becomes one step of a length-`columns`
# sequence with a single value per step.
print("Neural Network")
X_trainNN = X_train2.values.reshape(X_train2.shape[0], X_train2.shape[1], 1)
y_trainNN = y_train2.values
X_testNN = X_test2.values.reshape(X_test2.shape[0], X_test2.shape[1], 1)
y_testNN = y_test2.values
NNmodel = Sequential()
#NNmodel.add(layers.Dense(215, input_shape=(X_trainNN.shape[0], X_trainNN.shape[1])))
# First LSTM layer returns the full sequence so a second LSTM can be stacked
# on top; input_shape is the per-sample shape (columns, 1).
NNmodel.add(
    layers.LSTM(units=22,
                activation='tanh',
                return_sequences=True,
                input_shape=X_trainNN.shape[1:]))
# Second LSTM layer returns only its final output.
NNmodel.add(layers.LSTM(units=10, activation='tanh', return_sequences=False))
예제 #31
0
# Load the raw records from the pickled extract.
df = data_extract_e('e_20190609_15.pkl')

# Engineer features and extract queues, then fit the department/queue label
# encoders on that frame and apply them to it.
df = extract_queues(feature_eng(df))
dept_encoder, queue_encoder = fit_labels(df)
df = feature_transform(df, queue_encoder, dept_encoder)

# Select features/target and hold out 20% of the rows with a fixed seed.
x, y = data_filter(df)
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=2468)

# TPOT AutoML search restricted to the project's XGBoost configuration.
tpot = TPOTRegressor(
    n_jobs=-1,
    verbosity=1,
    config_dict=xgb_config.xgb_config_dict,
)
tpot = tpot.fit(x_train, y_train)

# Training-set metrics.
y_pred = tpot.predict(x_train)
train_r2 = r2_score(y_train, y_pred)
print('XGB TPOT training R2 score: ', train_r2)
train_score = tpot.score(x_train, y_train)
print('XGB TPOT training negative MSE: ', train_score)

# Test-set metrics.
y_pred = tpot.predict(x_test)
test_r2 = r2_score(y_test, y_pred)
print('XGB TPOT test R2 score: ', test_r2)
test_score = tpot.score(x_test, y_test)
print('XGB TPOT test negative MSE: ', test_score)

# Persist the selected pipeline.
tpot.export('xgb_tpot.py')
예제 #32
0
    def do_analysis(**kwargs):
        """
        Fit a TPOTRegressor on a train/test split of a loaded dataset, export
        the resulting pipeline and write a JSON record of the run next to it.

        Keyword Arguments:
            dataloader (callable): a callable which returns an
                sklearn Bunch-like object exposing ``.data`` and ``.target``
            export_filename_prefix (str): must be specified
            export_dirpath (str): default: dirpath(__file__) / 'pipelines'
            export_filename (str): default: export_filename_prefix + '_.py'
            export_filepath (str): default: export_dirpath / export_filename
            train_size (float): default: 0.75
            test_size (float): default: 0.25
            generations (int): default: 5
            population_size (int): default: 20
            verbosity (int): default: 2

        Returns:
            OrderedDict: dict of parameters, plus the test ``score`` and the
            paths of the exported pipeline and of the JSON record

        Raises:
            KeyError: if ``export_filename_prefix`` or ``dataloader`` is
                not given
        """
        data = odict()
        export_filename_prefix = kwargs.pop('export_filename_prefix')
        export_dirpath = kwargs.pop('export_dirpath',
                                join(dirname(__file__), 'pipelines'))
        export_filename = kwargs.pop('export_filename',
                                     "%s_.py" % export_filename_prefix)
        export_filepath = kwargs.pop('export_filepath',
                                    join(export_dirpath, export_filename))
        data['export_filepath'] = export_filepath
        _export_dirpath = dirname(export_filepath)
        # dirname() is '' when export_filepath is a bare filename, and
        # os.makedirs('') raises; only create a directory when one is named.
        if _export_dirpath and not os.path.exists(_export_dirpath):
            os.makedirs(_export_dirpath)

        dataloader = kwargs['dataloader']
        # Record a readable name for the loader rather than the callable itself.
        data['dataloader'] = getattr(dataloader, '__qualname__',
                                     getattr(dataloader, '__name__',
                                             str(dataloader)))
        databunch = dataloader()

        tts_kwargs = odict()
        tts_kwargs['train_size'] = kwargs.pop('train_size', 0.75)
        tts_kwargs['test_size'] = kwargs.pop('test_size', 0.25)
        data.update(tts_kwargs)
        X_train, X_test, y_train, y_test = train_test_split(
            databunch.data, databunch.target, **tts_kwargs)

        regressor_kwargs = odict()
        regressor_kwargs['generations'] = kwargs.pop('generations', 5)
        regressor_kwargs['population_size'] = kwargs.pop('population_size', 20)
        regressor_kwargs['verbosity'] = kwargs.pop('verbosity', 2)
        data.update(regressor_kwargs)
        tpot = TPOTRegressor(**regressor_kwargs)

        # Log the run parameters before the (potentially long) search starts.
        log.info(TPOTAnalysis._to_json_str(data))
        tpot.fit(X_train, y_train)
        data['score'] = tpot.score(X_test, y_test)
        log.info(('score', data['score']))

        tpot.export(export_filepath)

        # The JSON record is serialized before its own path is added to
        # `data`, so the file on disk does not contain
        # 'export_filepath_datajson'.
        json_str = TPOTAnalysis._to_json_str(data)
        log.info(json_str)
        data['export_filepath_datajson'] = export_filepath + '.json'
        with open(data['export_filepath_datajson'], 'w') as f:
            f.write(json_str)

        return data
        print(f"Failed setting training data: {e}")
        return
    return mm_training.training_df, mm_training.feature_column_list, mm_training.target_column_list


# Look-back windows (in minutes) handed to the feature builder.
feature_minutes_list = [1, 3, 5, 8, 11, 14, 18, 22, 30, 60, 120, 1440]
features_df, feature_cols, target_col_list = features(feature_minutes_list)

# Discard the final 14 rows.
features_df = features_df[:-14]

# Negative row offsets counted from the end of the frame.
# NOTE(review): the factors 1440 and 60 suggest one row per minute - confirm.
days_training = 400 * -1440
hours_test = 120 * -60

# Training window ends 14 rows before the test window starts (look-ahead
# adjustment); the test window is the last `-hours_test` rows.
train_rows = features_df[days_training:hours_test - 14]
test_rows = features_df[hours_test:]

target_col = target_col_list[0]
X_train = train_rows[feature_cols]
y_train = train_rows[target_col]
X_test = test_rows[feature_cols]
y_test = test_rows[target_col]

tpot = TPOTRegressor(generations=5, population_size=10, verbosity=2, n_jobs=-1)
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))

# The window sizes are embedded in the exported file name.
export_name = f'tpot_{days_training/-1440}days_train_{hours_test/-60}hour_test_pipeline.py'
tpot.export(export_name)
# Rows after the first len(train) rows of the combined frame are the test set.
# drop() returns a new frame, so the slice of `combi` is never modified in place.
test = combi[train.shape[0]:].drop('Item_Outlet_Sales', axis=1)

## removing id variables
tpot_train = train.drop(['Outlet_Identifier', 'Item_Type', 'Item_Identifier'], axis=1)
tpot_test = test.drop(['Outlet_Identifier', 'Item_Type', 'Item_Identifier'], axis=1)
# Split the target off the training features.
target = tpot_train.pop('Item_Outlet_Sales')

# finally building model using tpot library
from tpot import TPOTRegressor

X_train, X_test, y_train, y_test = train_test_split(tpot_train, target, train_size=0.75, test_size=0.25)

tpot = TPOTRegressor(generations=5, population_size=50, verbosity=2)
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))

tpot.export(data+'tpot_boston_pipeline.py')

## predicting using tpot optimised pipeline
tpot_pred = tpot.predict(tpot_test)

# Build the submission frame. The predictions get a fresh 0..n-1 index while
# `test` keeps its row labels from `combi`, so the identifier columns are
# attached by position (.values) rather than by index label; label-based
# assignment would misalign the rows whenever the two indexes differ.
sub1 = pd.DataFrame(tpot_pred, columns=['Item_Outlet_Sales'])
sub1['Item_Identifier'] = test['Item_Identifier'].values
sub1['Outlet_Identifier'] = test['Outlet_Identifier'].values
sub1 = sub1[['Item_Identifier', 'Outlet_Identifier', 'Item_Outlet_Sales']]
sub1.to_csv('tpot.csv', index=False)