def test_fit_and_predict(self):
    """Fit AutoML on a benchmark CSV and print per-model test log-loss.

    The dataset file is expected to contain a "target" column; every other
    column is treated as a feature.
    """
    seed = 1706 + 1
    # Other candidate dataset ids tried previously: 720, 31, 44, 737.
    for dataset_id in [31]:  # 720 # 31,44,737
        df = pd.read_csv("./tests/data/data/{0}.csv".format(dataset_id))
        x_cols = [c for c in df.columns if c != "target"]
        X = df[x_cols]
        y = df["target"]
        X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
            X, y, test_size=0.3, random_state=seed)
        # NOTE(review): total_time_limit is 60*6000 seconds (100 hours) —
        # presumably meant for long offline experiments; confirm before CI use.
        automl = AutoML(
            total_time_limit=60 * 6000,
            algorithms=["LightGBM", "RF", "NN", "CatBoost", "Xgboost"],
            start_random_models=10,
            hill_climbing_steps=3,
            top_models_to_improve=3,
            train_ensemble=True,
            verbose=True,
        )
        automl.fit(X_train, y_train)
        response = automl.predict(X_test)
        # Compute the logloss on test dataset
        ll = log_loss(y_test, response)
        print("(*) Dataset id {} logloss {}".format(dataset_id, ll))
        # Also report the log-loss of each individual trained model.
        for i, m in enumerate(automl._models):
            response = m.predict(X_test)
            ll = log_loss(y_test, response)
            print("{}) Dataset id {} logloss {}".format(i, dataset_id, ll))
def run(dataset, config):
    """automlbenchmark framework hook: train mljar-supervised AutoML on
    `dataset` within `config.max_runtime_seconds` and return a `result(...)`
    in the harness format.

    NOTE(review): relies on harness helpers (`output_subdir`, `result`,
    `Timer`, `log`) being importable in this module — confirm at integration.
    """
    log.info("\n**** mljar-supervised ****\n")

    column_names, _ = zip(*dataset.columns)
    column_types = dict(dataset.columns)
    X_train = pd.DataFrame(dataset.train.X, columns=column_names).astype(
        column_types, copy=False)
    X_test = pd.DataFrame(dataset.test.X, columns=column_names).astype(
        column_types, copy=False)
    y_train = dataset.train.y.flatten()
    y_test = dataset.test.y.flatten()

    # Map benchmark problem types onto AutoML's ml_task names.
    problem_mapping = dict(
        binary="binary_classification",
        multiclass="multiclass_classification",
        regression="regression",
    )
    is_classification = config.type == "classification"
    ml_task = problem_mapping.get(
        dataset.problem_type
    )  # if None the AutoML will guess about the ML task
    results_path = output_subdir("results", config)
    # Params prefixed with "_" are benchmark-internal flags (e.g.
    # "_save_artifacts") and are not forwarded to AutoML.
    training_params = {
        k: v for k, v in config.framework_params.items() if not k.startswith("_")
    }

    automl = AutoML(results_path=results_path,
                    total_time_limit=config.max_runtime_seconds,
                    seed=config.seed,
                    ml_task=ml_task,
                    **training_params)

    with Timer() as training:
        automl.fit(X_train, y_train)

    preds = automl.predict(X_test)

    predictions, probabilities = None, None
    if is_classification:
        # The "label" column holds predicted classes; the remaining columns
        # (all but the last) hold the per-class probabilities.
        predictions = preds["label"].values
        probabilities = preds[preds.columns[:-1]].values
    else:
        predictions = preds["prediction"].values

    # clean the results
    if not config.framework_params.get("_save_artifacts", False):
        shutil.rmtree(results_path, ignore_errors=True)

    return result(
        output_file=config.output_predictions_file,
        predictions=predictions,
        truth=y_test,
        probabilities=probabilities,
        models_count=len(automl._models),
        training_duration=training.duration,
    )
def test_fit_and_predict(self):
    """Smoke-test regression fit/predict on the housing dataset with
    missing values and missing targets."""
    seed = 1709
    data = pd.read_csv(
        "./tests/data/housing_regression_missing_values_missing_target.csv"
    )
    print(data.columns)
    feature_columns = [column for column in data.columns if column != "MEDV"]
    X = data[feature_columns]
    y = data["MEDV"]
    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
        X, y, test_size=0.3, random_state=seed)
    automl = AutoML(
        total_time_limit=10,
        algorithms=["Xgboost"
                    ],  # ["LightGBM", "RF", "NN", "CatBoost", "Xgboost"],
        start_random_models=1,
        hill_climbing_steps=0,
        top_models_to_improve=0,
        train_ensemble=True,
        verbose=True,
    )
    automl.fit(X_train, y_train)
    response = automl.predict(X_test)  # ["p_1"]
    print("Response", response)
def run(self):
    """Search entry point for the mljar-supervised solver backend.

    Fits AutoML on the preprocessed problem data, wraps the best (up to
    four) discovered models as ModelSklearn, saves each one and reports it
    through the registered "found model" callback.
    """
    from supervised.automl import AutoML

    dataset = Dataset(self.specification['input'])
    dataframe = dataset.get_dataframe().dropna()

    X = self.specification['problem']['predictors']
    y = self.specification['problem']['targets'][0]

    stimulus, preprocessor = preprocess(dataframe, self.specification)

    # Map the specification's time bounds onto AutoML's limit parameters.
    if self.specification.get('timeBoundSearch'):
        self.system_params['total_time_limit'] = self.specification[
            'timeBoundSearch']
    if self.specification.get('timeBoundRun'):
        self.system_params['learner_time_limit'] = self.specification[
            'timeBoundRun']

    automl = AutoML(**self.system_params)

    # mljar seems kind of fragile?
    # Coerce to a DataFrame with plain string column names before fitting.
    stimulus = pandas.DataFrame(stimulus)
    stimulus.columns = [str(i).strip() for i in stimulus.columns]

    automl.fit(stimulus, dataframe[y])

    # Keep only the four best models, ranked by final validation loss.
    for model_mljar in sorted(automl._models,
                              key=lambda m: m.get_final_loss())[:4]:
        model = ModelSklearn(
            model_mljar,
            system='mljar-supervised',
            search_id=self.search_id,
            predictors=X,
            targets=[y],
            preprocess=preprocessor,
            task=self.specification['problem']['taskType'])
        model.save()

        from tworaven_apps.solver_interfaces.tasks import FOUND_MODEL_CALLBACKS
        FOUND_MODEL_CALLBACKS[self.callback_found](
            model, **(self.callback_arguments or {}))

    return {
        KEY_SUCCESS: True,
        KEY_MESSAGE: 'search complete',
        KEY_DATA: {
            'search_id': self.search_id,
            'system': 'mljar-supervised'
        }
    }
def test_fit_optimize_auc(self):
    """AUC-optimized run: leaderboard metric must be 'auc' and every
    trained model must beat a random classifier."""
    automl = AutoML(
        total_time_limit=5,
        algorithms=["Xgboost"],
        start_random_models=2,
        hill_climbing_steps=0,
        optimize_metric="auc",
        seed=16,
    )
    automl.fit(self.X, self.y)
    leaderboard = automl.get_leaderboard()
    self.assertEqual(leaderboard["metric_type"][0], "auc")
    # every entry should have AUC > 0.5 (better than random)
    self.assertEqual(np.sum(leaderboard["metric_value"] > 0.5),
                     leaderboard.shape[0])
def test_predict_labels(self):
    """Predicted labels must contain both classes present in the data."""
    # 3.csv')
    frame = pd.read_csv(
        'tests/data/adult_missing_values_missing_target_500rows.csv')
    features = frame[frame.columns[:-1]]
    target = frame[frame.columns[-1]]
    automl = AutoML(total_time_limit=15,
                    algorithms=["Xgboost"],
                    start_random_models=5,
                    hill_climbing_steps=0,
                    train_ensemble=True)
    automl.fit(features, target)
    predicted = automl.predict(features)
    observed_labels = np.unique(predicted['label'])
    self.assertTrue('A' in observed_labels)
    self.assertTrue('B' in observed_labels)
def train_titanic(train_data):
    """Fit AutoML on the Titanic training CSV and print hold-out accuracy."""
    train_df = pd.read_csv(train_data)
    # test_df = pd.read_csv(test_data)
    # feature_cols = train_df.drop(['Survived', 'PassengerId', 'Name'], axis=1).columns
    # NOTE(review): columns[2:] skips PassengerId/Survived but, unlike the
    # commented drop-list above, keeps Name — confirm which is intended.
    feature_cols = train_df.columns[2:]
    target_cols = 'Survived'
    features_train, features_test, labels_train, labels_test = train_test_split(
        train_df[feature_cols],
        train_df[target_cols],
        test_size=0.25,
    )
    automl = AutoML(results_path="AutoML_titanic")
    automl.fit(features_train, labels_train)
    predictions = automl.predict(features_test)
    print(f"Accuracy: {accuracy_score(labels_test, predictions) * 100.0:.2f}%")
def test_predict_labels(self):
    """Leaderboard should list one row per trained model and contain the
    expected metadata columns."""
    automl = AutoML(
        total_time_limit=15,
        algorithms=["Xgboost"],
        start_random_models=5,
        hill_climbing_steps=0,
        train_ensemble=True,
        seed=15,
    )
    automl.fit(self.X, self.y)
    leaderboard = automl.get_leaderboard()
    # one leaderboard entry per trained model
    self.assertEqual(leaderboard.shape[0], len(automl._models))
    expected = ("uid", "model_type", "metric_type", "metric_value",
                "train_time")
    for column in expected:
        self.assertTrue(column in leaderboard.columns)
def test_reproduce_fit(self):
    """Two training runs with the same seed must produce (nearly) identical
    log-loss — checks AutoML's reproducibility."""
    metric = Metric({"name": "logloss"})
    losses = []
    for i in range(2):
        automl = AutoML(
            total_time_limit=
            10000,  # the time limit should be big enough to not interrupt the training
            algorithms=["Xgboost"],
            start_random_models=2,
            hill_climbing_steps=1,
            train_ensemble=True,
            verbose=True,
            seed=12,
        )
        automl.fit(self.X, self.y)
        y_predicted = automl.predict(self.X)["p_1"]
        loss = metric(self.y, y_predicted)
        losses += [loss]
    # Both runs should agree to 4 decimal places.
    assert_almost_equal(losses[0], losses[1], decimal=4)
def train_digits():
    """Train a Perform-mode AutoML on sklearn digits, print hold-out
    accuracy and plot the predictions."""
    digits = load_digits()
    features_train, features_test, labels_train, labels_test = train_test_split(
        pd.DataFrame(digits.data),
        digits.target,
        stratify=digits.target,
        test_size=0.25,
        random_state=123,
    )

    # train models with AutoML
    automl = AutoML(mode="Perform", results_path="AutoML_digits")
    automl.fit(features_train, labels_train)

    # compute
    predictions = automl.predict_all(features_test)
    print(predictions.head())
    print("Test accuracy:",
          accuracy_score(labels_test, predictions["label"].astype(int)))
    plot_digits(features_test, predictions)
def test_fit_and_predict(self):
    """Benchmark loop: fit AutoML on several binary-classification CSVs and
    append log-loss / F1 / fit-time per run to ./result.txt."""
    for dataset_id in [3, 24, 31, 38, 44, 179, 737, 720]:
        df = pd.read_csv("./tests/data/{0}.csv".format(dataset_id))
        x_cols = [c for c in df.columns if c != "target"]
        X = df[x_cols]
        y = df["target"]
        for repeat in range(1):
            X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
                X, y, test_size=0.3, random_state=1706 + repeat)
            automl = AutoML(
                total_time_limit=60 * 1,  # 60 seconds (NOTE(review): old comment claimed "1h")
                algorithms=[
                    "Xgboost"
                ],  # ["LightGBM", "CatBoost", "Xgboost", "RF", "NN"],
                start_random_models=3,
                hill_climbing_steps=1,
                top_models_to_improve=1,
                train_ensemble=True,
                verbose=True,
            )
            automl.fit(X_train, y_train)
            response = automl.predict(X_test)["p_1"]
            labels = automl.predict(X_test)["label"]
            # Compute the logloss on test dataset
            ll = log_loss(y_test, response)
            f1 = f1_score(y_test, labels)
            print("iter: {}) id:{} logloss:{} f1:{} time:{}".format(
                repeat, dataset_id, ll, f1, automl._fit_time))
            # Append the run's metrics so results accumulate across invocations.
            with open("./result.txt", "a") as f_result:
                f_result.write("{} {} {} {} {}\n".format(
                    repeat, dataset_id, ll, f1, automl._fit_time))
import pandas as pd
import numpy as np
from supervised.automl import AutoML
import os
from sklearn.metrics import accuracy_score

# Train an Explain-mode AutoML on the Titanic training set and report
# accuracy on both the training data and a labeled test file.
train_df = pd.read_csv("tests/data/Titanic/train.csv")
features = train_df[train_df.columns[2:]]
target = train_df["Survived"]

automl = AutoML(mode="Explain")
automl.fit(features, target)

train_predictions = automl.predict(features)
print("Train accuracy", accuracy_score(target, train_predictions))

test_df = pd.read_csv("tests/data/Titanic/test_with_Survived.csv")
test_predictions = automl.predict(test_df)
print("Test accuracy", accuracy_score(test_df["Survived"], test_predictions))
import numpy as np
import pandas as pd  # FIX: was missing, yet pd.DataFrame is used below (NameError)
from supervised.automl import AutoML
from sklearn.datasets import load_digits
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Small, fast AutoML run on the sklearn digits dataset: one random model,
# no tuning, plus an ensemble; report hold-out accuracy.
digits = load_digits()
X = pd.DataFrame(digits.data)
y = digits.target
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y,
                                                    test_size=0.25)

automl = AutoML(
    # results_path="AutoML_1",
    total_time_limit=10,
    start_random_models=1,
    hill_climbing_steps=0,
    top_models_to_improve=0,
    train_ensemble=True,
)
automl.fit(X_train, y_train)

predictions = automl.predict(X_test)
print(predictions.head())
print("Test accuracy:",
      accuracy_score(y_test, predictions["label"].astype(int)))
# Regression example: predict MEDV on the housing dataset (the CSV
# deliberately contains missing values and missing targets).
df = pd.read_csv(
    "./tests/data/housing_regression_missing_values_missing_target.csv")
feature_columns = [column for column in df.columns if column != "MEDV"]
X = df[feature_columns]
y = df["MEDV"]
print("y", y[:10])
print(X.shape)

automl = AutoML(
    # results_path="AutoML_43",
    # total_time_limit=100,
    # algorithms=["Linear", "Decision Tree", "Extra Trees"],
    # explain_level=0,
    # tuning_mode="Normal",
    mode='Explain',
    # train_ensemble=True
)
# automl.set_advanced(start_random_models=1)
automl.fit(X, y)

df["predictions"] = automl.predict(X)
print("Predictions")
print(df[["MEDV", "predictions"]].head())
import numpy as np
import pandas as pd
from supervised.automl import AutoML
from sklearn.model_selection import train_test_split
import os
from sklearn.metrics import log_loss
import warnings

# warnings.filterwarnings("error", category=RuntimeWarning)  # pd.core.common.SettingWithCopyWarning

# Binary classification on the UCI Adult income dataset in Compete mode;
# report the test log-loss of the ">50K" probability column.
df = pd.read_csv(
    "https://raw.githubusercontent.com/pplonski/datasets-for-start/master/adult/data.csv",
    skipinitialspace=True,
)
X = df[df.columns[:-1]]
y = df["income"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

automl = AutoML(results_path="AutoML_7", mode="Compete")
automl.fit(X_train, y_train)

predictions = automl.predict_all(X_test)
print(predictions.head())
print(predictions.tail())
print(X_test.shape, predictions.shape)
print("LogLoss", log_loss(y_test, predictions["prediction_>50K"]))
# Escalate pandas SettingWithCopyWarning to an error so chained-assignment
# bugs fail loudly during this run.
warnings.filterwarnings(
    "error", category=pd.core.common.SettingWithCopyWarning)  # message="*ndarray*")

# df = pd.read_csv("tests/data/iris_classes_missing_values_missing_target.csv")
df = pd.read_csv("tests/data/iris_missing_values_missing_target.csv")
X = df[["feature_1", "feature_2", "feature_3", "feature_4"]]
y = df["class"]

automl = AutoML(
    # results_path="AutoML_41",
    # algorithms=["CatBoost", "Neural Network", "Linear", "Xgboost", "Random Forest"],
    # total_time_limit=100,
    # tuning_mode="Normal",
    # explain_level=0,
    mode="Perform")
# automl.set_advanced(start_random_models=1)
automl.fit(X, y)

predictions = automl.predict(X)
print(predictions.head())
print(predictions.tail())
print(X.shape)
automl = AutoML(mode="Explain") automl.fit(X, y) pred = automl.predict(X) print("Train accuracy", accuracy_score(y, pred)) test = pd.read_csv("tests/data/Titanic/test_with_Survived.csv") pred = automl.predict(test) print("Test accuracy", accuracy_score(test["Survived"], pred)) ''' import pandas as pd import numpy as np from sklearn.metrics import accuracy_score from supervised import AutoML train = pd.read_csv("https://raw.githubusercontent.com/pplonski/datasets-for-start/master/Titanic/train.csv") print(train.head()) X = train[train.columns[2:]] y = train["Survived"] #automl = AutoML(mode="Compete") # default mode is Explain automl = AutoML(algorithms=["Decision Tree"]) # default mode is Explain automl.fit(X, y) test = pd.read_csv("https://raw.githubusercontent.com/pplonski/datasets-for-start/master/Titanic/test_with_Survived.csv") predictions = automl.predict(test) print(predictions) print(f"Accuracy: {accuracy_score(test['Survived'], predictions)*100.0:.2f}%" )
import os
import warnings

# warnings.filterwarnings("error", category=RuntimeWarning)  # pd.core.common.SettingWithCopyWarning

# Adult income dataset with all AutoML defaults — every constructor kwarg
# below is deliberately commented out.
df = pd.read_csv(
    "https://raw.githubusercontent.com/pplonski/datasets-for-start/master/adult/data.csv",
    skipinitialspace=True,
)
X = df[df.columns[:-1]]
y = df["income"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

automl = AutoML(
    # results_path="AutoML_11",
    # algorithms=["Xgboost"],
    # total_time_limit=200,
    # explain_level=0
    # mode="Perform"
)
automl.fit(X_train, y_train)

predictions = automl.predict(X_test)
print(predictions.head())
print(predictions.tail())
print(X_test.shape, predictions.shape)
from scipy.integrate import quad, dblquad #from pybayes.pdfs import CPdf from scipy.special import gamma from arch import arch_model import pyflux as pf import sys import investpy from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier import feats from supervised.automl import AutoML #Main.include("try3.jl") #pos=int(input("enter stock pos: ")) #lb = int(input("enter no of returns: ")) #k=pd.read_pickle('/home/sahil/projdir/dailydata.pkl') #k1=k[k.Symbol==k.Symbol.unique()[pos]].iloc[-lb:] automl = AutoML(mode='Compete', total_time_limit=600) k1 = investpy.search_quotes(text='Kotak NIFTY ETF', products=['etfs'], countries=['India'], n_results=2)[0] k1 = investpy.get_etf_historical_data('KOTAKNIFTY', country='India', from_date='01/01/2010', to_date='20/03/2021') #k1=investpy.search_quotes(text='AARTIIND',products=['stocks'],countries=['India'],n_results=2)[0].retrieve_historical_data(from_date='01/01/2019',to_date='07/12/2020') k2 = investpy.get_index_historical_data(index='India VIX', country='India', from_date='01/01/2010',
import pandas as pd
from supervised.automl import AutoML
import os

# Adult income dataset: compare four algorithm families under a shared
# 160-second budget, with explanations disabled.
df = pd.read_csv(
    "https://raw.githubusercontent.com/pplonski/datasets-for-start/master/adult/data.csv",
    skipinitialspace=True,
)
X = df[df.columns[:-1]]
y = df["income"]

automl = AutoML(
    # results_path="AutoML_73",
    algorithms=["Linear", "Xgboost", "LightGBM", "Extra Trees"],
    total_time_limit=160,
    explain_level=0)
# automl.set_advanced(start_random_models=3)
automl.fit(X, y)

predictions = automl.predict(X)
print(predictions.head())
def test_fit_and_predict(self):
    """Round trip: fit, predict, serialize to JSON, restore into a fresh
    AutoML instance and verify it predicts equally well."""
    metric = Metric({"name": "logloss"})
    automl = AutoML(total_time_limit=10,
                    algorithms=["Xgboost"],
                    start_random_models=5,
                    hill_climbing_steps=0)
    automl.fit(self.X, self.y)
    predicted = automl.predict(self.X)
    self.assertTrue(predicted is not None)
    loss = metric(self.y, predicted)
    self.assertTrue(loss < 0.5)

    # Rebuild a second instance from the serialized parameters.
    params = automl.to_json()
    restored = AutoML()
    restored.from_json(params)
    predicted_again = restored.predict(self.X)
    self.assertTrue(predicted_again is not None)
    restored_loss = metric(self.y, predicted_again)
    self.assertTrue(restored_loss < 0.5)
import pandas as pd
from supervised.automl import AutoML
import os

df = pd.read_csv(
    "https://raw.githubusercontent.com/pplonski/datasets-for-start/master/adult/data.csv",
    skipinitialspace=True,
)
X = df[df.columns[:-1]]
y = df["income"]

automl = AutoML(total_time_limit=1000)
# results_path="AutoML_8",
# start_random_models=1,
# hill_climbing_steps=0,
# top_models_to_improve=3,
# train_ensemble=True

# Print X/y before and after fit() to eyeball that AutoML does not mutate
# the caller's frames.
print(X)
print(y)
automl.fit(X, y)
print(X)
print(y)

predictions = automl.predict(X)
print(predictions.head())
print("Train accuracy", accuracy_score(y, pred)) test = pd.read_csv("tests/data/Titanic/test_with_Survived.csv") pred = automl.predict(test) print("Test accuracy", accuracy_score(test["Survived"], pred)) """ import pandas as pd import numpy as np from sklearn.metrics import accuracy_score from supervised import AutoML train = pd.read_csv( "https://raw.githubusercontent.com/pplonski/datasets-for-start/master/Titanic/train.csv" ) print(train.head()) X = train[train.columns[2:]] y = train["Survived"] # automl = AutoML(mode="Compete") # default mode is Explain automl = AutoML(total_time_limit=120) # default mode is Explain automl.fit(X, y) test = pd.read_csv( "https://raw.githubusercontent.com/pplonski/datasets-for-start/master/Titanic/test_with_Survived.csv" ) predictions = automl.predict(test) print(predictions) print(f"Accuracy: {accuracy_score(test['Survived'], predictions)*100.0:.2f}%")
# warnings.filterwarnings("error", category=RuntimeWarning)  # pd.core.common.SettingWithCopyWarning

# LightGBM-only Compete run on the Adult income dataset, optimizing AUC,
# with golden features and feature selection turned off.
df = pd.read_csv(
    "https://raw.githubusercontent.com/pplonski/datasets-for-start/master/adult/data.csv",
    skipinitialspace=True,
)
X = df[df.columns[:-1]]
y = df["income"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

automl = AutoML(
    algorithms=["LightGBM"],
    mode="Compete",
    explain_level=0,
    train_ensemble=True,
    golden_features=False,
    features_selection=False,
    eval_metric="auc",
)
automl.fit(X_train, y_train)

predictions = automl.predict_all(X_test)
print(predictions.head())
print(predictions.tail())
print(X_test.shape, predictions.shape)
print("LogLoss", log_loss(y_test, predictions["prediction_>50K"]))
skipinitialspace=True, ) X = df[df.columns[:-1]] y = df["income"] X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) automl = AutoML( #results_path="AutoML_30", algorithms=["Random Forest"], total_time_limit=20, explain_level=0, # validation={"validation_type": "split"}, mode="Explain", # validation={"validation_type": "split"} validation={ "validation_type": "kfold", "k_folds": 2, "shuffle": True, "stratify": True, }, golden_features=True, feature_selection=True) automl.set_advanced(start_random_models=20, hill_climbing_steps=10, top_models_to_improve=3) automl.fit(X_train, y_train) predictions = automl.predict(X_test) print(predictions.head())
from sklearn.metrics import accuracy_score, mean_absolute_percentage_error
from supervised.automl import AutoML  # mljar-supervised
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, RandomForestRegressor
from scipy.stats import ttest_ind


def doctypes(l):
    """Collapse the 'PSA' and 'MSA' document types into one 'PSA/MSA' bucket."""
    if l == 'PSA' or l == 'MSA':
        return 'PSA/MSA'
    else:
        return l


automl = AutoML(mode='Explain', total_time_limit=680)

# Load the legal-tracker export and keep only rows that have actually been
# reviewed by Legal.
df = pd.read_csv('/home/sahil/Downloads/LegalTrackerDated1.csv')
#df = df[df['Deal Created date'].notna()]
df = df[df['Date of Initial Review by Legal'].notna()]
#df['Deal Created date']=df['Deal Created date'].apply(lambda x:parser.parse(x))
# Parse the date columns (NOTE(review): `parser` — presumably dateutil's —
# is imported elsewhere in this file; confirm).
df['Date of Origination'] = df['Date of Origination'].apply(
    lambda x: parser.parse(x))
df['Date of Initial Review by Legal'] = df[
    'Date of Initial Review by Legal'].apply(lambda x: parser.parse(x))
df['initreview'] = df['Date of Initial Review by Legal']
# Turnaround time in whole days between origination and first legal review.
df['Turnaround Time'] = (df.initreview -
                         df['Date of Origination']).apply(lambda x: x.days)
# Column names below (trailing space, embedded newline) match the raw CSV
# headers exactly — do not "clean" them.
df = df[df['No. of pages '].notna()]
df['pages'] = df['No. of pages ']
df = df[df.pages != 'RFP']
df['qornot'] = df['Whether Q Template \n(Yes / No)']
# keep_cross_validation_fold_assignment=True) # # automl.train(y="Hall_of_Fame", x=['At_bats', 'Runs'], training_frame=data, fold_column='TWORAVENS_FOLD_COLUMN') # # best_model = h2o.get_model(automl.leaderboard.as_data_frame()['model_id'][0]) # # print(best_model.cross_validation_fold_assignment()) # from supervised.automl import AutoML import pandas as pd dataframe = pd.read_csv(data_path) dataframe = dataframe[dataframe['Hall_of_Fame'] != 2] automl_mljar = AutoML(total_time_limit=30) automl_mljar.fit(dataframe[['Runs', 'At_bats']], dataframe['Hall_of_Fame']) mljar_model = automl_mljar._models[0] mljar_model.train({ "train": { "X": dataframe[['Runs', 'At_bats']], "y": dataframe['Hall_of_Fame'] } }) mljar_model.predict(dataframe[['Runs', 'At_bats']]) # import mlbox.model.classification # import mlbox.model.regression #
max_depth = max_depth, min_child_weight=1, missing=None, n_estimators=n_estimators, n_jobs=1, nthread=None, objective='binary:logistic', random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None, silent=True, subsample=1) cf = 'features' elif option == 'AutoML': from supervised.automl import AutoML ''' All the time parameters are in seconds. ''' total_time_limit = st.slider('total_time_limit',60,600,120) top_models_to_improve = st.slider('top_models_to_improve',1,10,4) learner_time_limit = st.slider('learner_time_limit',10,120,60) hill_climbing_steps = st.slider('hill_climbing_steps',1,10,3) model = AutoML(total_time_limit=total_time_limit,top_models_to_improve=top_models_to_improve, learner_time_limit=learner_time_limit,algorithms=["Xgboost", "RF", "LightGBM"], start_random_models=10, hill_climbing_steps=hill_climbing_steps) ''' ### Run the model Now that everything is properly set-up you can run the model. ''' # running models if st.button('Run model'): if processed: with st.spinner(text='Training models...'): my_bar = st.progress(0) my_bar.progress(1) if option == 'AutoML':
import pandas as pd

# scikit-learn utilities
from sklearn.datasets import load_digits
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# mljar-supervised package
from supervised.automl import AutoML

# Load the data
digits = load_digits()
features_train, features_test, labels_train, labels_test = train_test_split(
    pd.DataFrame(digits.data),
    digits.target,
    stratify=digits.target,
    test_size=0.25)

# train models
automl = AutoML(mode="Perform")
automl.fit(features_train, labels_train)

# compute the accuracy on test data
predictions = automl.predict(features_test)
print(predictions.head())
print("Test accuracy:",
      accuracy_score(labels_test, predictions["label"].astype(int)))
import pandas as pd
import numpy as np
from supervised.automl import AutoML

# df = pd.read_csv("tests/data/iris_classes_missing_values_missing_target.csv")
df = pd.read_csv("tests/data/iris_missing_values_missing_target.csv")
X = df[["feature_1", "feature_2", "feature_3", "feature_4"]]
y = df["class"]

# Linear-only run with a one-second per-model budget and a single random model.
automl = AutoML(
    # results_path="AutoML_100",
    algorithms=[
        "Linear",
        # "Xgboost",
        # "Random Forest"
    ],
    model_time_limit=1,
    tuning_mode="Normal",
)
automl.set_advanced(start_random_models=1)
automl.fit(X, y)

predictions = automl.predict(X)
print(predictions.head())
print(predictions.tail())