def final_auto_snapshot():
    from kale.utils import pod_utils as _kale_pod_utils
    _kale_pod_utils.snapshot_pipeline_step(
        "T", "final_auto_snapshot", "/path/to/nb", before=False)


def loaddata(rok_workspace_aidays01_2rlcyd0k8_url: str):
    import os
    import shutil
    from kale.utils import pod_utils
    from kale.marshal import resource_save as _kale_resource_save
    from kale.marshal import resource_load as _kale_resource_load

    _kale_data_directory = "/home/jovyan/examples/titanic-ml-dataset/.titanic_dataset_ml.ipynb.kale.marshal.dir"

    if not os.path.isdir(_kale_data_directory):
        os.makedirs(_kale_data_directory, exist_ok=True)

    pod_utils.snapshot_pipeline_step(
        "titanic-ml-fylgn",
        "loaddata",
        "/home/jovyan/examples/titanic-ml-dataset/titanic_dataset_ml.ipynb")

    import numpy as np
    import pandas as pd
    import seaborn as sns
    from matplotlib import pyplot as plt
    from matplotlib import style

    from sklearn import linear_model
    from sklearn.linear_model import LogisticRegression
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.linear_model import Perceptron
    from sklearn.linear_model import SGDClassifier
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.svm import SVC
    from sklearn.naive_bayes import GaussianNB

    path = "data/"
    PREDICTION_LABEL = 'Survived'

    test_df = pd.read_csv(path + "test.csv")
    train_df = pd.read_csv(path + "train.csv")

    # -----------------------DATA SAVING START---------------------------------
    if "test_df" in locals():
        _kale_resource_save(
            test_df, os.path.join(_kale_data_directory, "test_df"))
    else:
        print("_kale_resource_save: `test_df` not found.")
    if "PREDICTION_LABEL" in locals():
        _kale_resource_save(
            PREDICTION_LABEL,
            os.path.join(_kale_data_directory, "PREDICTION_LABEL"))
    else:
        print("_kale_resource_save: `PREDICTION_LABEL` not found.")
    if "train_df" in locals():
        _kale_resource_save(
            train_df, os.path.join(_kale_data_directory, "train_df"))
    else:
        print("_kale_resource_save: `train_df` not found.")
    # -----------------------DATA SAVING END-----------------------------------


def test():
    from kale.utils import pod_utils as _kale_pod_utils
    _kale_pod_utils.snapshot_pipeline_step(
        "T", "test", "/path/to/nb", before=True)
    _kale_pod_utils.snapshot_pipeline_step(
        "T", "test", "/path/to/nb", before=False)


def test():
    import os
    import shutil
    from kale.utils import pod_utils as _kale_pod_utils
    from kale.marshal import resource_save as _kale_resource_save
    from kale.marshal import resource_load as _kale_resource_load

    _kale_data_directory = "/path"

    if not os.path.isdir(_kale_data_directory):
        os.makedirs(_kale_data_directory, exist_ok=True)

    _kale_pod_utils.snapshot_pipeline_step("T", "test", "/path/to/nb")


def final_auto_snapshot(rok_workspace_aidays01_2rlcyd0k8_url: str):
    import os
    import shutil
    from kale.utils import pod_utils
    from kale.marshal import resource_save as _kale_resource_save
    from kale.marshal import resource_load as _kale_resource_load

    _kale_data_directory = "/home/jovyan/examples/titanic-ml-dataset/.titanic_dataset_ml.ipynb.kale.marshal.dir"

    if not os.path.isdir(_kale_data_directory):
        os.makedirs(_kale_data_directory, exist_ok=True)

    pod_utils.snapshot_pipeline_step(
        "titanic-ml-fylgn",
        "final_auto_snapshot",
        "/home/jovyan/examples/titanic-ml-dataset/titanic_dataset_ml.ipynb")


def test():
    from kale.utils import mlmd_utils as _kale_mlmd_utils
    _kale_mlmd_utils.init_metadata()

    from kale.utils import pod_utils as _kale_pod_utils
    _kale_mlmd_utils.call("link_input_rok_artifacts")
    _kale_pod_utils.snapshot_pipeline_step(
        "T", "test", "/path/to/nb", before=True)

    _rok_snapshot_task = _kale_pod_utils.snapshot_pipeline_step(
        "T", "test", "/path/to/nb", before=False)
    _kale_mlmd_utils.call("submit_output_rok_artifact", _rok_snapshot_task)

    _kale_mlmd_utils.call("mark_execution_complete")
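

# A short, hedged sketch of the marshalling contract the generated steps
# below rely on: kale.marshal's resource_save(obj, path) persists an object
# under a basename inside the marshal directory (possibly adding a
# format-specific extension), and resource_load(path) restores it. That
# extension is why the generated steps look files up by basename before
# loading. The directory and variable names here are illustrative, not part
# of the generated pipeline.
def _marshal_roundtrip_sketch():
    import os
    from kale.marshal import resource_save as _kale_resource_save
    from kale.marshal import resource_load as _kale_resource_load

    _kale_data_directory = "/tmp/example.kale.marshal.dir"  # hypothetical
    os.makedirs(_kale_data_directory, exist_ok=True)

    value = {"answer": 42}  # any serializable object
    _kale_resource_save(value, os.path.join(_kale_data_directory, "value"))

    # look the file up by basename, mirroring the generated loading code
    matches = [f for f in os.listdir(_kale_data_directory)
               if os.path.splitext(f)[0] == "value"]
    restored = _kale_resource_load(
        os.path.join(_kale_data_directory, matches[0]))
    assert restored == value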


def results(rok_workspace_aidays01_2rlcyd0k8_url: str):
    import os
    import shutil
    from kale.utils import pod_utils
    from kale.marshal import resource_save as _kale_resource_save
    from kale.marshal import resource_load as _kale_resource_load

    _kale_data_directory = "/home/jovyan/examples/titanic-ml-dataset/.titanic_dataset_ml.ipynb.kale.marshal.dir"

    if not os.path.isdir(_kale_data_directory):
        os.makedirs(_kale_data_directory, exist_ok=True)

    pod_utils.snapshot_pipeline_step(
        "titanic-ml-fylgn",
        "results",
        "/home/jovyan/examples/titanic-ml-dataset/titanic_dataset_ml.ipynb")

    # -----------------------DATA LOADING START--------------------------------
    _kale_directory_file_names = [
        os.path.splitext(f)[0]
        for f in os.listdir(_kale_data_directory)
        if os.path.isfile(os.path.join(_kale_data_directory, f))
    ]

    if "acc_log" not in _kale_directory_file_names:
        raise ValueError("acc_log" + " does not exist in directory")
    _kale_load_file_name = [
        f for f in os.listdir(_kale_data_directory)
        if os.path.isfile(os.path.join(_kale_data_directory, f)) and
        os.path.splitext(f)[0] == "acc_log"
    ]
    if len(_kale_load_file_name) > 1:
        raise ValueError("Found multiple files with name " +
                         "acc_log" + ": " + str(_kale_load_file_name))
    _kale_load_file_name = _kale_load_file_name[0]
    acc_log = _kale_resource_load(
        os.path.join(_kale_data_directory, _kale_load_file_name))

    if "acc_random_forest" not in _kale_directory_file_names:
        raise ValueError("acc_random_forest" + " does not exist in directory")
    _kale_load_file_name = [
        f for f in os.listdir(_kale_data_directory)
        if os.path.isfile(os.path.join(_kale_data_directory, f)) and
        os.path.splitext(f)[0] == "acc_random_forest"
    ]
    if len(_kale_load_file_name) > 1:
        raise ValueError("Found multiple files with name " +
                         "acc_random_forest" + ": " +
                         str(_kale_load_file_name))
    _kale_load_file_name = _kale_load_file_name[0]
    acc_random_forest = _kale_resource_load(
        os.path.join(_kale_data_directory, _kale_load_file_name))

    if "acc_decision_tree" not in _kale_directory_file_names:
        raise ValueError("acc_decision_tree" + " does not exist in directory")
    _kale_load_file_name = [
        f for f in os.listdir(_kale_data_directory)
        if os.path.isfile(os.path.join(_kale_data_directory, f)) and
        os.path.splitext(f)[0] == "acc_decision_tree"
    ]
    if len(_kale_load_file_name) > 1:
        raise ValueError("Found multiple files with name " +
                         "acc_decision_tree" + ": " +
                         str(_kale_load_file_name))
    _kale_load_file_name = _kale_load_file_name[0]
    acc_decision_tree = _kale_resource_load(
        os.path.join(_kale_data_directory, _kale_load_file_name))

    if "acc_gaussian" not in _kale_directory_file_names:
        raise ValueError("acc_gaussian" + " does not exist in directory")
    _kale_load_file_name = [
        f for f in os.listdir(_kale_data_directory)
        if os.path.isfile(os.path.join(_kale_data_directory, f)) and
        os.path.splitext(f)[0] == "acc_gaussian"
    ]
    if len(_kale_load_file_name) > 1:
        raise ValueError("Found multiple files with name " +
                         "acc_gaussian" + ": " + str(_kale_load_file_name))
    _kale_load_file_name = _kale_load_file_name[0]
    acc_gaussian = _kale_resource_load(
        os.path.join(_kale_data_directory, _kale_load_file_name))

    if "acc_linear_svc" not in _kale_directory_file_names:
        raise ValueError("acc_linear_svc" + " does not exist in directory")
    _kale_load_file_name = [
        f for f in os.listdir(_kale_data_directory)
        if os.path.isfile(os.path.join(_kale_data_directory, f)) and
        os.path.splitext(f)[0] == "acc_linear_svc"
    ]
    if len(_kale_load_file_name) > 1:
        raise ValueError("Found multiple files with name " +
                         "acc_linear_svc" + ": " + str(_kale_load_file_name))
    _kale_load_file_name = _kale_load_file_name[0]
    acc_linear_svc = _kale_resource_load(
        os.path.join(_kale_data_directory, _kale_load_file_name))
    # -----------------------DATA LOADING END----------------------------------

    import numpy as np
    import pandas as pd
    import seaborn as sns
    from matplotlib import pyplot as plt
    from matplotlib import style

    from sklearn import linear_model
    from sklearn.linear_model import LogisticRegression
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.linear_model import Perceptron
    from sklearn.linear_model import SGDClassifier
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.svm import SVC
    from sklearn.naive_bayes import GaussianNB

    results = pd.DataFrame({
        'Model': [
            'Support Vector Machines', 'Logistic Regression', 'Random Forest',
            'Naive Bayes', 'Decision Tree'
        ],
        'Score': [
            acc_linear_svc, acc_log, acc_random_forest, acc_gaussian,
            acc_decision_tree
        ]
    })
    result_df = results.sort_values(by='Score', ascending=False)
    result_df = result_df.set_index('Score')
    print(result_df)


def randomforest(rok_workspace_aidays01_2rlcyd0k8_url: str):
    import os
    import shutil
    from kale.utils import pod_utils
    from kale.marshal import resource_save as _kale_resource_save
    from kale.marshal import resource_load as _kale_resource_load

    _kale_data_directory = "/home/jovyan/examples/titanic-ml-dataset/.titanic_dataset_ml.ipynb.kale.marshal.dir"

    if not os.path.isdir(_kale_data_directory):
        os.makedirs(_kale_data_directory, exist_ok=True)

    pod_utils.snapshot_pipeline_step(
        "titanic-ml-fylgn",
        "randomforest",
        "/home/jovyan/examples/titanic-ml-dataset/titanic_dataset_ml.ipynb")

    # -----------------------DATA LOADING START--------------------------------
    _kale_directory_file_names = [
        os.path.splitext(f)[0]
        for f in os.listdir(_kale_data_directory)
        if os.path.isfile(os.path.join(_kale_data_directory, f))
    ]

    if "train_labels" not in _kale_directory_file_names:
        raise ValueError("train_labels" + " does not exist in directory")
    _kale_load_file_name = [
        f for f in os.listdir(_kale_data_directory)
        if os.path.isfile(os.path.join(_kale_data_directory, f)) and
        os.path.splitext(f)[0] == "train_labels"
    ]
    if len(_kale_load_file_name) > 1:
        raise ValueError("Found multiple files with name " +
                         "train_labels" + ": " + str(_kale_load_file_name))
    _kale_load_file_name = _kale_load_file_name[0]
    train_labels = _kale_resource_load(
        os.path.join(_kale_data_directory, _kale_load_file_name))

    if "train_df" not in _kale_directory_file_names:
        raise ValueError("train_df" + " does not exist in directory")
    _kale_load_file_name = [
        f for f in os.listdir(_kale_data_directory)
        if os.path.isfile(os.path.join(_kale_data_directory, f)) and
        os.path.splitext(f)[0] == "train_df"
    ]
    if len(_kale_load_file_name) > 1:
        raise ValueError("Found multiple files with name " +
                         "train_df" + ": " + str(_kale_load_file_name))
    _kale_load_file_name = _kale_load_file_name[0]
    train_df = _kale_resource_load(
        os.path.join(_kale_data_directory, _kale_load_file_name))
    # -----------------------DATA LOADING END----------------------------------

    import numpy as np
    import pandas as pd
    import seaborn as sns
    from matplotlib import pyplot as plt
    from matplotlib import style

    from sklearn import linear_model
    from sklearn.linear_model import LogisticRegression
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.linear_model import Perceptron
    from sklearn.linear_model import SGDClassifier
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.svm import SVC
    from sklearn.naive_bayes import GaussianNB

    random_forest = RandomForestClassifier(n_estimators=100)
    random_forest.fit(train_df, train_labels)
    acc_random_forest = round(
        random_forest.score(train_df, train_labels) * 100, 2)

    # -----------------------DATA SAVING START---------------------------------
    if "acc_random_forest" in locals():
        _kale_resource_save(
            acc_random_forest,
            os.path.join(_kale_data_directory, "acc_random_forest"))
    else:
        print("_kale_resource_save: `acc_random_forest` not found.")
    # -----------------------DATA SAVING END-----------------------------------
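

# The results step loads acc_log, acc_decision_tree, acc_gaussian and
# acc_linear_svc as well, which are produced by sibling classifier steps not
# shown in this section. A minimal sketch of what one such step might look
# like, mirroring randomforest (the step name and body are assumptions, not
# generated code):
def decisiontree_sketch(rok_workspace_aidays01_2rlcyd0k8_url: str):
    import os
    from kale.marshal import resource_save as _kale_resource_save
    from kale.marshal import resource_load as _kale_resource_load
    from sklearn.tree import DecisionTreeClassifier

    _kale_data_directory = "/home/jovyan/examples/titanic-ml-dataset/.titanic_dataset_ml.ipynb.kale.marshal.dir"

    def _load(name):
        # look the file up by basename, as the generated loading code does
        matches = [f for f in os.listdir(_kale_data_directory)
                   if os.path.splitext(f)[0] == name]
        return _kale_resource_load(
            os.path.join(_kale_data_directory, matches[0]))

    train_df = _load("train_df")
    train_labels = _load("train_labels")

    decision_tree = DecisionTreeClassifier()
    decision_tree.fit(train_df, train_labels)
    acc_decision_tree = round(
        decision_tree.score(train_df, train_labels) * 100, 2)
    _kale_resource_save(
        acc_decision_tree,
        os.path.join(_kale_data_directory, "acc_decision_tree"))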


def datapreprocessing(rok_workspace_aidays01_2rlcyd0k8_url: str):
    import os
    import shutil
    from kale.utils import pod_utils
    from kale.marshal import resource_save as _kale_resource_save
    from kale.marshal import resource_load as _kale_resource_load

    _kale_data_directory = "/home/jovyan/examples/titanic-ml-dataset/.titanic_dataset_ml.ipynb.kale.marshal.dir"

    if not os.path.isdir(_kale_data_directory):
        os.makedirs(_kale_data_directory, exist_ok=True)

    pod_utils.snapshot_pipeline_step(
        "titanic-ml-fylgn",
        "datapreprocessing",
        "/home/jovyan/examples/titanic-ml-dataset/titanic_dataset_ml.ipynb")

    # -----------------------DATA LOADING START--------------------------------
    _kale_directory_file_names = [
        os.path.splitext(f)[0]
        for f in os.listdir(_kale_data_directory)
        if os.path.isfile(os.path.join(_kale_data_directory, f))
    ]

    if "test_df" not in _kale_directory_file_names:
        raise ValueError("test_df" + " does not exist in directory")
    _kale_load_file_name = [
        f for f in os.listdir(_kale_data_directory)
        if os.path.isfile(os.path.join(_kale_data_directory, f)) and
        os.path.splitext(f)[0] == "test_df"
    ]
    if len(_kale_load_file_name) > 1:
        raise ValueError("Found multiple files with name " +
                         "test_df" + ": " + str(_kale_load_file_name))
    _kale_load_file_name = _kale_load_file_name[0]
    test_df = _kale_resource_load(
        os.path.join(_kale_data_directory, _kale_load_file_name))

    if "train_df" not in _kale_directory_file_names:
        raise ValueError("train_df" + " does not exist in directory")
    _kale_load_file_name = [
        f for f in os.listdir(_kale_data_directory)
        if os.path.isfile(os.path.join(_kale_data_directory, f)) and
        os.path.splitext(f)[0] == "train_df"
    ]
    if len(_kale_load_file_name) > 1:
        raise ValueError("Found multiple files with name " +
                         "train_df" + ": " + str(_kale_load_file_name))
    _kale_load_file_name = _kale_load_file_name[0]
    train_df = _kale_resource_load(
        os.path.join(_kale_data_directory, _kale_load_file_name))
    # -----------------------DATA LOADING END----------------------------------

    import numpy as np
    import pandas as pd
    import seaborn as sns
    from matplotlib import pyplot as plt
    from matplotlib import style

    from sklearn import linear_model
    from sklearn.linear_model import LogisticRegression
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.linear_model import Perceptron
    from sklearn.linear_model import SGDClassifier
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.svm import SVC
    from sklearn.naive_bayes import GaussianNB

    data = [train_df, test_df]
    for dataset in data:
        dataset['relatives'] = dataset['SibSp'] + dataset['Parch']
        dataset.loc[dataset['relatives'] > 0, 'not_alone'] = 0
        dataset.loc[dataset['relatives'] == 0, 'not_alone'] = 1
        dataset['not_alone'] = dataset['not_alone'].astype(int)
    train_df['not_alone'].value_counts()

    # PassengerId does not contribute to a person's survival probability
    train_df = train_df.drop(['PassengerId'], axis=1)

    import re
    deck = {"A": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F": 6, "G": 7, "U": 8}
    data = [train_df, test_df]
    for dataset in data:
        dataset['Cabin'] = dataset['Cabin'].fillna("U0")
        dataset['Deck'] = dataset['Cabin'].map(
            lambda x: re.compile("([a-zA-Z]+)").search(x).group())
        dataset['Deck'] = dataset['Deck'].map(deck)
        dataset['Deck'] = dataset['Deck'].fillna(0)
        dataset['Deck'] = dataset['Deck'].astype(int)

    # we can now drop the Cabin feature
    train_df = train_df.drop(['Cabin'], axis=1)
    test_df = test_df.drop(['Cabin'], axis=1)

    data = [train_df, test_df]
    for dataset in data:
        mean = train_df["Age"].mean()
        std = test_df["Age"].std()
        is_null = dataset["Age"].isnull().sum()
        # draw is_null random ages within one standard deviation of the mean
        rand_age = np.random.randint(mean - std, mean + std, size=is_null)
        # fill NaN values in the Age column with the generated values
        age_slice = dataset["Age"].copy()
        age_slice[np.isnan(age_slice)] = rand_age
        dataset["Age"] = age_slice
        dataset["Age"] = dataset["Age"].astype(int)
    train_df["Age"].isnull().sum()

    train_df['Embarked'].describe()

    # fill Embarked with the most common value
    common_value = 'S'
    data = [train_df, test_df]
    for dataset in data:
        dataset['Embarked'] = dataset['Embarked'].fillna(common_value)
    train_df.info()

    # -----------------------DATA SAVING START---------------------------------
    if "test_df" in locals():
        _kale_resource_save(
            test_df, os.path.join(_kale_data_directory, "test_df"))
    else:
        print("_kale_resource_save: `test_df` not found.")
    if "train_df" in locals():
        _kale_resource_save(
            train_df, os.path.join(_kale_data_directory, "train_df"))
    else:
        print("_kale_resource_save: `train_df` not found.")
    # -----------------------DATA SAVING END-----------------------------------


def featureengineering(rok_workspace_aidays01_2rlcyd0k8_url: str):
    import os
    import shutil
    from kale.utils import pod_utils
    from kale.marshal import resource_save as _kale_resource_save
    from kale.marshal import resource_load as _kale_resource_load

    _kale_data_directory = "/home/jovyan/examples/titanic-ml-dataset/.titanic_dataset_ml.ipynb.kale.marshal.dir"

    if not os.path.isdir(_kale_data_directory):
        os.makedirs(_kale_data_directory, exist_ok=True)

    pod_utils.snapshot_pipeline_step(
        "titanic-ml-fylgn",
        "featureengineering",
        "/home/jovyan/examples/titanic-ml-dataset/titanic_dataset_ml.ipynb")

    # -----------------------DATA LOADING START--------------------------------
    _kale_directory_file_names = [
        os.path.splitext(f)[0]
        for f in os.listdir(_kale_data_directory)
        if os.path.isfile(os.path.join(_kale_data_directory, f))
    ]

    if "PREDICTION_LABEL" not in _kale_directory_file_names:
        raise ValueError("PREDICTION_LABEL" + " does not exist in directory")
    _kale_load_file_name = [
        f for f in os.listdir(_kale_data_directory)
        if os.path.isfile(os.path.join(_kale_data_directory, f)) and
        os.path.splitext(f)[0] == "PREDICTION_LABEL"
    ]
    if len(_kale_load_file_name) > 1:
        raise ValueError("Found multiple files with name " +
                         "PREDICTION_LABEL" + ": " +
                         str(_kale_load_file_name))
    _kale_load_file_name = _kale_load_file_name[0]
    PREDICTION_LABEL = _kale_resource_load(
        os.path.join(_kale_data_directory, _kale_load_file_name))

    if "test_df" not in _kale_directory_file_names:
        raise ValueError("test_df" + " does not exist in directory")
    _kale_load_file_name = [
        f for f in os.listdir(_kale_data_directory)
        if os.path.isfile(os.path.join(_kale_data_directory, f)) and
        os.path.splitext(f)[0] == "test_df"
    ]
    if len(_kale_load_file_name) > 1:
        raise ValueError("Found multiple files with name " +
                         "test_df" + ": " + str(_kale_load_file_name))
    _kale_load_file_name = _kale_load_file_name[0]
    test_df = _kale_resource_load(
        os.path.join(_kale_data_directory, _kale_load_file_name))

    if "train_df" not in _kale_directory_file_names:
        raise ValueError("train_df" + " does not exist in directory")
    _kale_load_file_name = [
        f for f in os.listdir(_kale_data_directory)
        if os.path.isfile(os.path.join(_kale_data_directory, f)) and
        os.path.splitext(f)[0] == "train_df"
    ]
    if len(_kale_load_file_name) > 1:
        raise ValueError("Found multiple files with name " +
                         "train_df" + ": " + str(_kale_load_file_name))
    _kale_load_file_name = _kale_load_file_name[0]
    train_df = _kale_resource_load(
        os.path.join(_kale_data_directory, _kale_load_file_name))
    # -----------------------DATA LOADING END----------------------------------

    import numpy as np
    import pandas as pd
    import seaborn as sns
    from matplotlib import pyplot as plt
    from matplotlib import style

    from sklearn import linear_model
    from sklearn.linear_model import LogisticRegression
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.linear_model import Perceptron
    from sklearn.linear_model import SGDClassifier
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.svm import SVC
    from sklearn.naive_bayes import GaussianNB

    data = [train_df, test_df]
    for dataset in data:
        dataset['Fare'] = dataset['Fare'].fillna(0)
        dataset['Fare'] = dataset['Fare'].astype(int)

    data = [train_df, test_df]
    titles = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
    for dataset in data:
        # extract titles
        dataset['Title'] = dataset.Name.str.extract(r' ([A-Za-z]+)\.',
                                                    expand=False)
        # replace uncommon titles with Rare
        dataset['Title'] = dataset['Title'].replace([
            'Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev',
            'Sir', 'Jonkheer', 'Dona'
        ], 'Rare')
        dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss')
        dataset['Title'] = dataset['Title'].replace('Ms', 'Miss')
        dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')
        # convert titles into numbers
        dataset['Title'] = dataset['Title'].map(titles)
        # fill NaN with 0 to be safe
        dataset['Title'] = dataset['Title'].fillna(0)
    train_df = train_df.drop(['Name'], axis=1)
    test_df = test_df.drop(['Name'], axis=1)

    genders = {"male": 0, "female": 1}
    data = [train_df, test_df]
    for dataset in data:
        dataset['Sex'] = dataset['Sex'].map(genders)

    train_df = train_df.drop(['Ticket'], axis=1)
    test_df = test_df.drop(['Ticket'], axis=1)

    ports = {"S": 0, "C": 1, "Q": 2}
    data = [train_df, test_df]
    for dataset in data:
        dataset['Embarked'] = dataset['Embarked'].map(ports)

    data = [train_df, test_df]
    for dataset in data:
        dataset['Age'] = dataset['Age'].astype(int)
        dataset.loc[dataset['Age'] <= 11, 'Age'] = 0
        dataset.loc[(dataset['Age'] > 11) & (dataset['Age'] <= 18), 'Age'] = 1
        dataset.loc[(dataset['Age'] > 18) & (dataset['Age'] <= 22), 'Age'] = 2
        dataset.loc[(dataset['Age'] > 22) & (dataset['Age'] <= 27), 'Age'] = 3
        dataset.loc[(dataset['Age'] > 27) & (dataset['Age'] <= 33), 'Age'] = 4
        dataset.loc[(dataset['Age'] > 33) & (dataset['Age'] <= 40), 'Age'] = 5
        dataset.loc[(dataset['Age'] > 40) & (dataset['Age'] <= 66), 'Age'] = 6
        # ages above 66 fall into the same top bin
        dataset.loc[dataset['Age'] > 66, 'Age'] = 6
    # let's see how it's distributed
    train_df['Age'].value_counts()

    data = [train_df, test_df]
    for dataset in data:
        dataset.loc[dataset['Fare'] <= 7.91, 'Fare'] = 0
        dataset.loc[(dataset['Fare'] > 7.91) &
                    (dataset['Fare'] <= 14.454), 'Fare'] = 1
        dataset.loc[(dataset['Fare'] > 14.454) &
                    (dataset['Fare'] <= 31), 'Fare'] = 2
        dataset.loc[(dataset['Fare'] > 31) &
                    (dataset['Fare'] <= 99), 'Fare'] = 3
        dataset.loc[(dataset['Fare'] > 99) &
                    (dataset['Fare'] <= 250), 'Fare'] = 4
        dataset.loc[dataset['Fare'] > 250, 'Fare'] = 5
        dataset['Fare'] = dataset['Fare'].astype(int)

    data = [train_df, test_df]
    for dataset in data:
        dataset['Age_Class'] = dataset['Age'] * dataset['Pclass']
    for dataset in data:
        dataset['Fare_Per_Person'] = (dataset['Fare'] /
                                      (dataset['relatives'] + 1))
        dataset['Fare_Per_Person'] = dataset['Fare_Per_Person'].astype(int)

    # Let's take a last look at the training set before we start training
    # the models.
    train_df.head(10)

    train_labels = train_df[PREDICTION_LABEL]
    train_df = train_df.drop(PREDICTION_LABEL, axis=1)

    # -----------------------DATA SAVING START---------------------------------
    if "train_labels" in locals():
        _kale_resource_save(
            train_labels, os.path.join(_kale_data_directory, "train_labels"))
    else:
        print("_kale_resource_save: `train_labels` not found.")
    if "train_df" in locals():
        _kale_resource_save(
            train_df, os.path.join(_kale_data_directory, "train_df"))
    else:
        print("_kale_resource_save: `train_df` not found.")
    # -----------------------DATA SAVING END-----------------------------------
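

# The save/load dependencies above imply a linear step order: loaddata
# persists the raw frames, datapreprocessing and featureengineering transform
# them, randomforest consumes train_df/train_labels, and results aggregates
# the accuracies. A hypothetical local driver under that assumption (the
# workspace URL is a placeholder, and outside a Kubeflow pod the
# snapshot_pipeline_step calls would need to be stubbed out):
if __name__ == "__main__":
    _url = "http://rok.example/workspace"  # placeholder, not a real endpoint
    loaddata(_url)
    datapreprocessing(_url)
    featureengineering(_url)
    randomforest(_url)
    # ...sibling classifier steps would run here to produce acc_log,
    # acc_decision_tree, acc_gaussian and acc_linear_svc...
    results(_url)
    final_auto_snapshot(_url)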