def test_fit_and_predict(self):
        """Train an AutoML ensemble on a benchmark CSV dataset and print the
        test-set logloss for the ensemble and for every individual model.

        NOTE(review): relies on the private ``automl._models`` attribute and
        the legacy ``verbose`` constructor flag -- confirm both still exist in
        the installed mljar-supervised version.
        """
        seed = 1706 + 1
        for dataset_id in [31]:  # 720 # 31,44,737
            # Dataset CSVs are expected under ./tests/data/data/<id>.csv,
            # with the label in a column named "target".
            df = pd.read_csv("./tests/data/data/{0}.csv".format(dataset_id))
            x_cols = [c for c in df.columns if c != "target"]
            X = df[x_cols]
            y = df["target"]

            # Fixed-seed 70/30 split for reproducibility.
            X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
                X, y, test_size=0.3, random_state=seed)
            automl = AutoML(
                total_time_limit=60 * 6000,
                algorithms=["LightGBM", "RF", "NN", "CatBoost", "Xgboost"],
                start_random_models=10,
                hill_climbing_steps=3,
                top_models_to_improve=3,
                train_ensemble=True,
                verbose=True,
            )
            automl.fit(X_train, y_train)

            response = automl.predict(X_test)
            # Compute the logloss on test dataset
            ll = log_loss(y_test, response)
            print("(*) Dataset id {} logloss {}".format(dataset_id, ll))

            # Also score each fitted model individually (private attribute).
            for i, m in enumerate(automl._models):
                response = m.predict(X_test)
                ll = log_loss(y_test, response)
                print("{}) Dataset id {} logloss {}".format(i, dataset_id, ll))
Exemplo n.º 2
0
def run(dataset, config):
    """AutoML-benchmark entry point: train mljar-supervised AutoML on
    ``dataset`` within ``config.max_runtime_seconds`` and return a
    ``result(...)`` record with predictions and timing.

    NOTE(review): assumes ``dataset.columns`` yields (name, dtype) pairs and
    ``dataset.train``/``dataset.test`` expose ``.X``/``.y`` arrays -- confirm
    against the benchmark harness that supplies these objects.
    """
    log.info("\n**** mljar-supervised ****\n")

    # Rebuild typed DataFrames from the raw arrays plus column metadata.
    column_names, _ = zip(*dataset.columns)
    column_types = dict(dataset.columns)
    X_train = pd.DataFrame(dataset.train.X,
                           columns=column_names).astype(column_types,
                                                        copy=False)
    X_test = pd.DataFrame(dataset.test.X,
                          columns=column_names).astype(column_types,
                                                       copy=False)

    y_train = dataset.train.y.flatten()
    y_test = dataset.test.y.flatten()

    # Map the benchmark's problem types onto mljar ml_task names.
    problem_mapping = dict(
        binary="binary_classification",
        multiclass="multiclass_classification",
        regression="regression",
    )
    is_classification = config.type == "classification"
    ml_task = problem_mapping.get(
        dataset.problem_type
    )  # if None the AutoML will guess about the ML task
    results_path = output_subdir("results", config)
    # Keys prefixed with "_" are benchmark-internal flags, not AutoML params.
    training_params = {
        k: v
        for k, v in config.framework_params.items() if not k.startswith("_")
    }

    automl = AutoML(results_path=results_path,
                    total_time_limit=config.max_runtime_seconds,
                    seed=config.seed,
                    ml_task=ml_task,
                    **training_params)

    with Timer() as training:
        automl.fit(X_train, y_train)

    preds = automl.predict(X_test)

    predictions, probabilities = None, None
    if is_classification:
        # Last column holds the label; remaining columns are per-class
        # probabilities.
        predictions = preds["label"].values
        probabilities = preds[preds.columns[:-1]].values
    else:
        predictions = preds["prediction"].values

    # clean the results
    if not config.framework_params.get("_save_artifacts", False):
        shutil.rmtree(results_path, ignore_errors=True)

    return result(
        output_file=config.output_predictions_file,
        predictions=predictions,
        truth=y_test,
        probabilities=probabilities,
        models_count=len(automl._models),
        training_duration=training.duration,
    )
    def test_fit_and_predict(self):
        """Smoke-test regression fit/predict on the housing dataset."""
        random_state = 1709

        frame = pd.read_csv(
            "./tests/data/housing_regression_missing_values_missing_target.csv"
        )
        print(frame.columns)
        features = [column for column in frame.columns if column != "MEDV"]
        X = frame[features]
        y = frame["MEDV"]

        X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
            X, y, test_size=0.3, random_state=random_state)

        # Minimal single-algorithm setup so the test finishes quickly.
        automl = AutoML(
            total_time_limit=10,
            algorithms=["Xgboost"],  # ["LightGBM", "RF", "NN", "CatBoost", "Xgboost"],
            start_random_models=1,
            hill_climbing_steps=0,
            top_models_to_improve=0,
            train_ensemble=True,
            verbose=True,
        )
        automl.fit(X_train, y_train)

        response = automl.predict(X_test)  # ["p_1"]
        print("Response", response)
Exemplo n.º 4
0
    def run(self):
        """Execute an mljar-supervised search described by
        ``self.specification``, persist the four best models, and invoke the
        registered found-model callback for each of them.

        Returns a status dict carrying the search id and system name.
        """
        from supervised.automl import AutoML

        dataset = Dataset(self.specification['input'])

        # Drop rows with missing values before preprocessing.
        dataframe = dataset.get_dataframe().dropna()
        X = self.specification['problem']['predictors']
        y = self.specification['problem']['targets'][0]

        stimulus, preprocessor = preprocess(dataframe, self.specification)

        # Optional time budgets in the spec map onto AutoML limit params.
        if self.specification.get('timeBoundSearch'):
            self.system_params['total_time_limit'] = self.specification[
                'timeBoundSearch']

        if self.specification.get('timeBoundRun'):
            self.system_params['learner_time_limit'] = self.specification[
                'timeBoundRun']

        automl = AutoML(**self.system_params)

        # mljar seems kind of fragile?
        # Normalize column labels to stripped strings before fitting.
        stimulus = pandas.DataFrame(stimulus)
        stimulus.columns = [str(i).strip() for i in stimulus.columns]

        automl.fit(stimulus, dataframe[y])

        # Keep only the 4 best models by final loss (private attribute).
        for model_mljar in sorted(automl._models,
                                  key=lambda m: m.get_final_loss())[:4]:
            model = ModelSklearn(
                model_mljar,
                system='mljar-supervised',
                search_id=self.search_id,
                predictors=X,
                targets=[y],
                preprocess=preprocessor,
                task=self.specification['problem']['taskType'])

            model.save()

            # Notify the solver interface that a model was found.
            from tworaven_apps.solver_interfaces.tasks import FOUND_MODEL_CALLBACKS
            FOUND_MODEL_CALLBACKS[self.callback_found](
                model, **(self.callback_arguments or {}))

        return {
            KEY_SUCCESS: True,
            KEY_MESSAGE: 'search complete',
            KEY_DATA: {
                'search_id': self.search_id,
                'system': 'mljar-supervised'
            }
        }
Exemplo n.º 5
0
 def test_fit_optimize_auc(self):
     """The leaderboard must report AUC as the optimized metric and every
     trained model must beat random chance (AUC > 0.5)."""
     automl = AutoML(
         total_time_limit=5,
         algorithms=["Xgboost"],
         start_random_models=2,
         hill_climbing_steps=0,
         optimize_metric="auc",
         seed=16,
     )
     automl.fit(self.X, self.y)
     ldb = automl.get_leaderboard()
     self.assertEqual(ldb["metric_type"][0], "auc")
     # Count of rows above 0.5 must equal the total row count.
     self.assertEqual(np.sum(ldb["metric_value"] > 0.5),
                      ldb.shape[0])  # all better than 0.5 AUC
Exemplo n.º 6
0
    def test_predict_labels(self):
        """Both class labels ('A' and 'B') must appear among predictions."""
        frame = pd.read_csv(
            'tests/data/adult_missing_values_missing_target_500rows.csv')
        X = frame[frame.columns[:-1]]
        y = frame[frame.columns[-1]]

        automl = AutoML(total_time_limit=15,
                        algorithms=["Xgboost"],
                        start_random_models=5,
                        hill_climbing_steps=0,
                        train_ensemble=True)
        automl.fit(X, y)

        predicted = automl.predict(X)
        observed_labels = np.unique(predicted['label'])
        self.assertTrue('A' in observed_labels)
        self.assertTrue('B' in observed_labels)
Exemplo n.º 7
0
def train_titanic(train_data):
    """Fit AutoML on a Titanic CSV and print the hold-out accuracy.

    `train_data` is a path (or buffer) readable by ``pd.read_csv``.
    The first two columns are skipped as non-features -- presumably id and
    target columns; TODO confirm against the CSV layout.
    """
    data = pd.read_csv(train_data)

    features = data.columns[2:]
    target = 'Survived'

    # Random 75/25 split (no fixed seed, so results vary run to run).
    X_train, X_test, y_train, y_test = train_test_split(
        data[features], data[target], test_size=0.25)

    automl = AutoML(results_path="AutoML_titanic")
    automl.fit(X_train, y_train)

    predictions = automl.predict(X_test)
    print(f"Accuracy: {accuracy_score(y_test, predictions) * 100.0:.2f}%")
Exemplo n.º 8
0
 def test_predict_labels(self):
     """The leaderboard must contain one row per trained model and expose
     the expected metadata columns."""
     automl = AutoML(
         total_time_limit=15,
         algorithms=["Xgboost"],
         start_random_models=5,
         hill_climbing_steps=0,
         train_ensemble=True,
         seed=15,
     )
     automl.fit(self.X, self.y)
     ldb = automl.get_leaderboard()
     # One leaderboard row per fitted model (private attribute).
     self.assertEqual(ldb.shape[0], len(automl._models))
     for col in [
             "uid", "model_type", "metric_type", "metric_value",
             "train_time"
     ]:
         self.assertTrue(col in ldb.columns)
Exemplo n.º 9
0
 def test_reproduce_fit(self):
     """Two identically-seeded training runs must produce (almost)
     identical logloss -- verifies determinism with ``seed=12``."""
     metric = Metric({"name": "logloss"})
     losses = []
     for i in range(2):
         automl = AutoML(
             total_time_limit=
             10000,  # the time limit should be big enough to not interrupt the training
             algorithms=["Xgboost"],
             start_random_models=2,
             hill_climbing_steps=1,
             train_ensemble=True,
             verbose=True,
             seed=12,
         )
         automl.fit(self.X, self.y)
         y_predicted = automl.predict(self.X)["p_1"]
         loss = metric(self.y, y_predicted)
         losses += [loss]
     # Compare to 4 decimal places to tolerate float round-off.
     assert_almost_equal(losses[0], losses[1], decimal=4)
Exemplo n.º 10
0
def train_digits():
    """Train AutoML (Perform mode) on sklearn's digits dataset, print the
    test accuracy, and visualize predictions via ``plot_digits``."""
    digits = load_digits()
    # Stratified 75/25 split with a fixed seed for reproducibility.
    X_train, X_test, y_train, y_test = train_test_split(pd.DataFrame(
        digits.data),
                                                        digits.target,
                                                        stratify=digits.target,
                                                        test_size=0.25,
                                                        random_state=123)

    # train models with AutoML
    automl = AutoML(mode="Perform", results_path="AutoML_digits")
    automl.fit(X_train, y_train)

    # compute predictions (predict_all includes per-class probabilities)
    predictions = automl.predict_all(X_test)
    print(predictions.head())
    print("Test accuracy:",
          accuracy_score(y_test, predictions["label"].astype(int)))

    plot_digits(X_test, predictions)
    def test_fit_and_predict(self):
        """Run fit/predict over several benchmark datasets, compute test
        logloss and F1, and append the scores to ./result.txt."""

        for dataset_id in [3, 24, 31, 38, 44, 179, 737, 720]:
            df = pd.read_csv("./tests/data/{0}.csv".format(dataset_id))
            x_cols = [c for c in df.columns if c != "target"]
            X = df[x_cols]
            y = df["target"]

            for repeat in range(1):

                # Different split seed per repeat (1706 + repeat).
                X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
                    X, y, test_size=0.3, random_state=1706 + repeat)
                automl = AutoML(
                    # NOTE(review): value is 60 seconds although the original
                    # comment claimed a 1h limit -- confirm the intent.
                    total_time_limit=60 * 1,
                    algorithms=[
                        "Xgboost"
                    ],  # ["LightGBM", "CatBoost", "Xgboost", "RF", "NN"],
                    start_random_models=3,
                    hill_climbing_steps=1,
                    top_models_to_improve=1,
                    train_ensemble=True,
                    verbose=True,
                )
                automl.fit(X_train, y_train)

                # Positive-class probabilities and hard labels.
                response = automl.predict(X_test)["p_1"]
                labels = automl.predict(X_test)["label"]

                # Compute the logloss on test dataset
                ll = log_loss(y_test, response)
                f1 = f1_score(y_test, labels)
                print("iter: {}) id:{} logloss:{} f1:{} time:{}".format(
                    repeat, dataset_id, ll, f1, automl._fit_time))
                # Append scores for offline aggregation across runs.
                with open("./result.txt", "a") as f_result:
                    f_result.write("{} {} {} {} {}\n".format(
                        repeat, dataset_id, ll, f1, automl._fit_time))
Exemplo n.º 12
0
import pandas as pd
import numpy as np
from supervised.automl import AutoML
import os

from sklearn.metrics import accuracy_score

# Titanic binary-classification demo: fit on train.csv, then score a test
# file that still carries the 'Survived' ground-truth column.
df = pd.read_csv("tests/data/Titanic/train.csv")

# First two columns are skipped as non-features (presumably id/target --
# TODO confirm against the CSV layout).
X = df[df.columns[2:]]
y = df["Survived"]

automl = AutoML(mode="Explain")
automl.fit(X, y)
pred = automl.predict(X)

# Train accuracy is optimistic -- the model has already seen these rows.
print("Train accuracy", accuracy_score(y, pred))
test = pd.read_csv("tests/data/Titanic/test_with_Survived.csv")
pred = automl.predict(test)
print("Test accuracy", accuracy_score(test["Survived"], pred))
Exemplo n.º 13
0
import numpy as np
from supervised.automl import AutoML
from sklearn.datasets import load_digits
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Digits multiclass demo with a deliberately tiny search budget (10s,
# one random model, no hill climbing).
# NOTE(review): `pd` is not imported in this snippet -- it relies on a
# `pandas as pd` import earlier in this file.
digits = load_digits()
X = pd.DataFrame(digits.data)
y = digits.target

# Stratified split keeps class proportions in both partitions.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    stratify=y,
                                                    test_size=0.25)

automl = AutoML(
    #        results_path="AutoML_1",
    total_time_limit=10,
    start_random_models=1,
    hill_climbing_steps=0,
    top_models_to_improve=0,
    train_ensemble=True,
)

automl.fit(X_train, y_train)
predictions = automl.predict(X_test)

print(predictions.head())
print("Test accuracy:", accuracy_score(y_test,
                                       predictions["label"].astype(int)))
Exemplo n.º 14
0
# Housing regression demo on a CSV that contains missing values
# (including missing targets); the target column is MEDV.
df = pd.read_csv(
    "./tests/data/housing_regression_missing_values_missing_target.csv")
x_cols = [c for c in df.columns if c != "MEDV"]
X = df[x_cols]
y = df["MEDV"]

print("y", y[:10])

print(X.shape)

automl = AutoML(
    #results_path="AutoML_43",
    #total_time_limit=100,
    #algorithms=["Linear"],
    # "Decision Tree",
    # ,
    # "Extra Trees"
    #],
    #explain_level=0,
    #tuning_mode="Normal"
    mode='Explain',
    #train_ensemble = True
)
#automl.set_advanced(start_random_models=1)
automl.fit(X, y)

# In-sample predictions appended as a new column for quick inspection.
df["predictions"] = automl.predict(X)
print("Predictions")
print(df[["MEDV", "predictions"]].head())
import numpy as np
import pandas as pd
from supervised.automl import AutoML
from sklearn.model_selection import train_test_split
import os
from sklearn.metrics import log_loss
import warnings

# warnings.filterwarnings("error", category=RuntimeWarning) #pd.core.common.SettingWithCopyWarning)

# Adult-income demo in Compete mode, scored with logloss on a hold-out.
df = pd.read_csv(
    "https://raw.githubusercontent.com/pplonski/datasets-for-start/master/adult/data.csv",
    skipinitialspace=True,
)

# Last column is the label ("income"); everything else is a feature.
X = df[df.columns[:-1]]
y = df["income"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

automl = AutoML(results_path="AutoML_7", mode="Compete")
automl.fit(X_train, y_train)

# predict_all returns per-class probability columns plus the label.
predictions = automl.predict_all(X_test)

print(predictions.head())
print(predictions.tail())
print(X_test.shape, predictions.shape)
print("LogLoss", log_loss(y_test, predictions["prediction_>50K"]))
# Escalate SettingWithCopyWarning to an error so chained-assignment bugs
# surface immediately.  NOTE(review): newer pandas exposes this class as
# pd.errors.SettingWithCopyWarning -- confirm against the pinned version.
warnings.filterwarnings(
    "error",
    category=pd.core.common.SettingWithCopyWarning)  # message="*ndarray*")

# Iris demo (Perform mode) on a CSV with missing values/targets.
# df = pd.read_csv("tests/data/iris_classes_missing_values_missing_target.csv")
df = pd.read_csv("tests/data/iris_missing_values_missing_target.csv")
X = df[["feature_1", "feature_2", "feature_3", "feature_4"]]
y = df["class"]

automl = AutoML(

    # results_path="AutoML_41",
    # algorithms=["CatBoost"],
    #algorithms=["Neural Network"],
    #    "Linear",
    #    "Xgboost",
    #    "Random Forest"
    # ],
    #total_time_limit=100,
    #tuning_mode="Normal",
    #explain_level=0,
    mode="Perform")
# automl.set_advanced(start_random_models=1)
automl.fit(X, y)

# In-sample predictions for a quick sanity check.
predictions = automl.predict(X)

print(predictions.head())
print(predictions.tail())

print(X.shape)
Exemplo n.º 17
0
automl = AutoML(mode="Explain")
automl.fit(X, y)
pred = automl.predict(X)

print("Train accuracy", accuracy_score(y, pred))
test = pd.read_csv("tests/data/Titanic/test_with_Survived.csv")
pred = automl.predict(test)
print("Test accuracy", accuracy_score(test["Survived"], pred))
'''

import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from supervised import AutoML

# Titanic demo restricted to a single Decision Tree algorithm; scores on
# a test CSV that still carries the 'Survived' ground truth.
train = pd.read_csv("https://raw.githubusercontent.com/pplonski/datasets-for-start/master/Titanic/train.csv")
print(train.head())

# First two columns are skipped as non-features.
X = train[train.columns[2:]]
y = train["Survived"]

#automl = AutoML(mode="Compete") # default mode is Explain
automl = AutoML(algorithms=["Decision Tree"]) # default mode is Explain

automl.fit(X, y)

test = pd.read_csv("https://raw.githubusercontent.com/pplonski/datasets-for-start/master/Titanic/test_with_Survived.csv")
predictions = automl.predict(test)
print(predictions)
print(f"Accuracy: {accuracy_score(test['Survived'], predictions)*100.0:.2f}%" )
import os

import warnings

# warnings.filterwarnings("error", category=RuntimeWarning) #pd.core.common.SettingWithCopyWarning)

# Adult-income demo using all AutoML defaults (every tuning option below
# is commented out, so Explain mode applies).
df = pd.read_csv(
    "https://raw.githubusercontent.com/pplonski/datasets-for-start/master/adult/data.csv",
    skipinitialspace=True,
)

# Last column is the label ("income"); everything else is a feature.
X = df[df.columns[:-1]]
y = df["income"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

automl = AutoML(
    # results_path="AutoML_11",
    # algorithms=["Xgboost"],
    # total_time_limit=200,
    # explain_level=0
    # mode="Perform"
)
automl.fit(X_train, y_train)

predictions = automl.predict(X_test)

print(predictions.head())
print(predictions.tail())
print(X_test.shape, predictions.shape)
Exemplo n.º 19
0
from scipy.integrate import quad, dblquad
#from pybayes.pdfs import CPdf
from scipy.special import gamma
from arch import arch_model
import pyflux as pf
import sys
import investpy
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
import feats
from supervised.automl import AutoML
#Main.include("try3.jl")
#pos=int(input("enter stock pos: "))
#lb = int(input("enter no of returns: "))
#k=pd.read_pickle('/home/sahil/projdir/dailydata.pkl')
#k1=k[k.Symbol==k.Symbol.unique()[pos]].iloc[-lb:]
# Set up a 10-minute Compete-mode search and download Indian market data.
automl = AutoML(mode='Compete', total_time_limit=600)
# NOTE(review): this quote-search result is immediately overwritten by the
# historical-data call below -- presumably leftover exploration code.
k1 = investpy.search_quotes(text='Kotak NIFTY ETF',
                            products=['etfs'],
                            countries=['India'],
                            n_results=2)[0]

# Daily ETF history, dd/mm/yyyy date format per investpy's API.
k1 = investpy.get_etf_historical_data('KOTAKNIFTY',
                                      country='India',
                                      from_date='01/01/2010',
                                      to_date='20/03/2021')

#k1=investpy.search_quotes(text='AARTIIND',products=['stocks'],countries=['India'],n_results=2)[0].retrieve_historical_data(from_date='01/01/2019',to_date='07/12/2020')
k2 = investpy.get_index_historical_data(index='India VIX',
                                        country='India',
                                        from_date='01/01/2010',
Exemplo n.º 20
0
import pandas as pd
from supervised.automl import AutoML
import os

# Adult-income demo with an explicit algorithm list, a 160s budget, and
# explanations disabled (explain_level=0).
df = pd.read_csv(
    "https://raw.githubusercontent.com/pplonski/datasets-for-start/master/adult/data.csv",
    skipinitialspace=True,
)

# Last column is the label ("income"); everything else is a feature.
X = df[df.columns[:-1]]
y = df["income"]

automl = AutoML(
    #results_path="AutoML_73",
    algorithms=["Linear", "Xgboost", "LightGBM", "Extra Trees"],
    total_time_limit=160,
    explain_level=0)
#automl.set_advanced(start_random_models=3)

automl.fit(X, y)
# In-sample predictions for a quick sanity check.
predictions = automl.predict(X)

print(predictions.head())
Exemplo n.º 21
0
    def test_fit_and_predict(self):
        """Fit, predict, then round-trip the model through JSON and check
        that the restored model predicts equally well."""
        logloss = Metric({"name": "logloss"})

        automl = AutoML(total_time_limit=10,
                        algorithms=["Xgboost"],
                        start_random_models=5,
                        hill_climbing_steps=0)
        automl.fit(self.X, self.y)

        predicted = automl.predict(self.X)
        self.assertTrue(predicted is not None)
        self.assertTrue(logloss(self.y, predicted) < 0.5)

        # Serialize and restore into a fresh AutoML instance.
        restored = AutoML()
        restored.from_json(automl.to_json())

        predicted_again = restored.predict(self.X)
        self.assertTrue(predicted_again is not None)
        self.assertTrue(logloss(self.y, predicted_again) < 0.5)
import pandas as pd
from supervised.automl import AutoML
import os

# Adult-income demo with a 1000s budget; predicts back on the training
# frame.  X/y are printed before and after fit to eyeball that fitting
# does not mutate the inputs.
df = pd.read_csv(
    "https://raw.githubusercontent.com/pplonski/datasets-for-start/master/adult/data.csv",
    skipinitialspace=True,
)

# Last column is the label ("income"); everything else is a feature.
X = df[df.columns[:-1]]
y = df["income"]

automl = AutoML(total_time_limit=1000)
# results_path = "AutoML_8",
# ,
# start_random_models=1,
# hill_climbing_steps=0,
# top_models_to_improve=3,
# train_ensemble=True)

print(X)
print(y)

automl.fit(X, y)
print(X)
print(y)

predictions = automl.predict(X)
print(predictions.head())
print("Train accuracy", accuracy_score(y, pred))
test = pd.read_csv("tests/data/Titanic/test_with_Survived.csv")
pred = automl.predict(test)
print("Test accuracy", accuracy_score(test["Survived"], pred))
"""

import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from supervised import AutoML

# Titanic demo with a 120s budget (Explain mode by default); scores on a
# test CSV that still carries the 'Survived' ground truth.
train = pd.read_csv(
    "https://raw.githubusercontent.com/pplonski/datasets-for-start/master/Titanic/train.csv"
)
print(train.head())

# First two columns are skipped as non-features.
X = train[train.columns[2:]]
y = train["Survived"]

# automl = AutoML(mode="Compete") # default mode is Explain
automl = AutoML(total_time_limit=120)  # default mode is Explain

automl.fit(X, y)

test = pd.read_csv(
    "https://raw.githubusercontent.com/pplonski/datasets-for-start/master/Titanic/test_with_Survived.csv"
)
predictions = automl.predict(test)
print(predictions)
print(f"Accuracy: {accuracy_score(test['Survived'], predictions)*100.0:.2f}%")
Exemplo n.º 24
0
# warnings.filterwarnings("error", category=RuntimeWarning) #pd.core.common.SettingWithCopyWarning)

# Adult-income demo: LightGBM-only Compete run optimizing AUC, with
# golden features and feature selection disabled.
df = pd.read_csv(
    "https://raw.githubusercontent.com/pplonski/datasets-for-start/master/adult/data.csv",
    skipinitialspace=True,
)

# Last column is the label ("income"); everything else is a feature.
X = df[df.columns[:-1]]
y = df["income"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

automl = AutoML(
    algorithms=["LightGBM"],
    mode="Compete",
    explain_level=0,
    train_ensemble=True,
    golden_features=False,
    features_selection=False,
    eval_metric="auc",
)
automl.fit(X_train, y_train)

# predict_all returns per-class probability columns plus the label.
predictions = automl.predict_all(X_test)

print(predictions.head())
print(predictions.tail())
print(X_test.shape, predictions.shape)
print("LogLoss", log_loss(y_test, predictions["prediction_>50K"]))
Exemplo n.º 25
0
    skipinitialspace=True,
)

# Adult-income demo: Random Forest with 2-fold stratified CV validation
# and an aggressive tuning schedule via set_advanced().
# Last column is the label ("income"); everything else is a feature.
X = df[df.columns[:-1]]
y = df["income"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

automl = AutoML(
    #results_path="AutoML_30",
    algorithms=["Random Forest"],
    total_time_limit=20,
    explain_level=0,
    # validation={"validation_type": "split"},
    mode="Explain",
    # validation={"validation_type": "split"}
    validation={
        "validation_type": "kfold",
        "k_folds": 2,
        "shuffle": True,
        "stratify": True,
    },
    golden_features=True,
    # NOTE(review): other snippets in this file spell this parameter
    # "features_selection" -- confirm which name the installed version uses.
    feature_selection=True)
automl.set_advanced(start_random_models=20,
                    hill_climbing_steps=10,
                    top_models_to_improve=3)
automl.fit(X_train, y_train)

predictions = automl.predict(X_test)

print(predictions.head())
Exemplo n.º 26
0
from sklearn.metrics import accuracy_score, mean_absolute_percentage_error
from supervised.automl import AutoML  # mljar-supervised
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, RandomForestRegressor
from scipy.stats import ttest_ind


def doctypes(l):
    """Collapse the 'PSA' and 'MSA' document types into a single
    'PSA/MSA' bucket; every other label passes through unchanged."""
    return 'PSA/MSA' if l in ('PSA', 'MSA') else l


# Legal-tracker demo: compute the turnaround time (days between contract
# origination and first legal review) and clean up helper columns.
automl = AutoML(mode='Explain', total_time_limit=680)
df = pd.read_csv('/home/sahil/Downloads/LegalTrackerDated1.csv')
#df = df[df['Deal Created date'].notna()]
# Keep only rows where the legal-review date is present.
df = df[df['Date of Initial Review by Legal'].notna()]
#df['Deal Created date']=df['Deal Created date'].apply(lambda x:parser.parse(x))
# Parse date strings into datetimes.  NOTE(review): `parser` is presumably
# dateutil.parser imported elsewhere in this file -- confirm.
df['Date of Origination'] = df['Date of Origination'].apply(
    lambda x: parser.parse(x))
df['Date of Initial Review by Legal'] = df[
    'Date of Initial Review by Legal'].apply(lambda x: parser.parse(x))
df['initreview'] = df['Date of Initial Review by Legal']
# Turnaround time in whole days.
df['Turnaround Time'] = (df.initreview -
                         df['Date of Origination']).apply(lambda x: x.days)
# Drop rows without a page count and those marked 'RFP' (not contracts).
df = df[df['No. of pages '].notna()]
df['pages'] = df['No. of pages ']
df = df[df.pages != 'RFP']
df['qornot'] = df['Whether Q Template \n(Yes / No)']
Exemplo n.º 27
0
#     keep_cross_validation_fold_assignment=True)
#
# automl.train(y="Hall_of_Fame", x=['At_bats', 'Runs'], training_frame=data, fold_column='TWORAVENS_FOLD_COLUMN')
#
# best_model = h2o.get_model(automl.leaderboard.as_data_frame()['model_id'][0])
#
# print(best_model.cross_validation_fold_assignment())
#

from supervised.automl import AutoML
import pandas as pd

# Baseball Hall-of-Fame demo: drop the ambiguous class 2, fit AutoML on
# two features, then re-train/predict with the first underlying model.
dataframe = pd.read_csv(data_path)
dataframe = dataframe[dataframe['Hall_of_Fame'] != 2]

automl_mljar = AutoML(total_time_limit=30)
automl_mljar.fit(dataframe[['Runs', 'At_bats']], dataframe['Hall_of_Fame'])

# First fitted model, accessed via a private attribute.
mljar_model = automl_mljar._models[0]

# Re-train the single model directly with mljar's internal data format.
mljar_model.train({
    "train": {
        "X": dataframe[['Runs', 'At_bats']],
        "y": dataframe['Hall_of_Fame']
    }
})
mljar_model.predict(dataframe[['Runs', 'At_bats']])

# import mlbox.model.classification
# import mlbox.model.regression
#
Exemplo n.º 28
0
                    max_depth = max_depth, min_child_weight=1, missing=None, n_estimators=n_estimators,
                    n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
                    reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
                    silent=True, subsample=1)
    cf = 'features'
elif option == 'AutoML':
    from supervised.automl import AutoML
    '''
    All the time parameters are in seconds.
    '''
    total_time_limit = st.slider('total_time_limit',60,600,120)
    top_models_to_improve = st.slider('top_models_to_improve',1,10,4)
    learner_time_limit = st.slider('learner_time_limit',10,120,60)
    hill_climbing_steps = st.slider('hill_climbing_steps',1,10,3)
    model = AutoML(total_time_limit=total_time_limit,top_models_to_improve=top_models_to_improve,
                    learner_time_limit=learner_time_limit,algorithms=["Xgboost", "RF", "LightGBM"],
                    start_random_models=10, hill_climbing_steps=hill_climbing_steps)
    

'''
### Run the model
Now that everything is properly set-up you can run the model.
'''
# running models
if st.button('Run model'):
    if processed:
        with st.spinner(text='Training models...'):
            my_bar = st.progress(0)
            my_bar.progress(1)

            if option == 'AutoML':
Exemplo n.º 29
0
import pandas as pd

# scikit learn utilites
from sklearn.datasets import load_digits
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# mljar-supervised package
from supervised.automl import AutoML

# Load the data and make a stratified 75/25 split (no fixed seed).
digits = load_digits()
X_train, X_test, y_train, y_test = train_test_split(pd.DataFrame(digits.data),
                                                    digits.target,
                                                    stratify=digits.target,
                                                    test_size=0.25)

# train models in Perform mode (balanced speed/accuracy preset)
automl = AutoML(mode="Perform")
automl.fit(X_train, y_train)

# compute the accuracy on test data
predictions = automl.predict(X_test)
print(predictions.head())
print("Test accuracy:", accuracy_score(y_test,
                                       predictions["label"].astype(int)))
Exemplo n.º 30
0
import pandas as pd
import numpy as np
from supervised.automl import AutoML

# Iris demo: single Linear model with a 1s per-model limit on a CSV with
# missing values/targets.
# df = pd.read_csv("tests/data/iris_classes_missing_values_missing_target.csv")
df = pd.read_csv("tests/data/iris_missing_values_missing_target.csv")
X = df[["feature_1", "feature_2", "feature_3", "feature_4"]]
y = df["class"]

automl = AutoML(
    # results_path="AutoML_100",
    algorithms=[
        "Linear",
        # "Xgboost",
        # "Random Forest"
    ],
    model_time_limit=1,
    tuning_mode="Normal",
)
automl.set_advanced(start_random_models=1)
automl.fit(X, y)

# In-sample predictions for a quick sanity check.
predictions = automl.predict(X)

print(predictions.head())
print(predictions.tail())