lb = LabelBinarizer() feature_numeric = lb.fit_transform(df[[feature]]) else: feature_numeric = lb.transform(df[[feature]]) col_names = map(lambda x: feature + "_" + str(x).strip(), list(lb.classes_)) if lb.classes_.shape[0] == 2: col_names = col_names[:1] feature_df = pd.DataFrame(feature_numeric, columns=col_names, index=df.index) df = df.join(feature_df) return [lb, df] orig = pd.read_csv_sync(DATA_PATH + 'adult_with_colnames.csv', index_col=0) [train, test] = cross_validation.train_test_split_sync(orig, test_size=0.3, random_state=501) [lb, train] = oneHotEncoding(None, "workclass", train) cols = [col for col in train.columns if "workclass_" in col] [lb2, train] = oneHotEncoding(None, "sex", train) cols = [col for col in train.columns if "sex_" in col] train = train.drop(["workclass", "sex"], axis=1) new_cols = [ col for col in train.columns if "workclass_" in col or "sex_" in col ] logreg = linear_model.LogisticRegression(C=10) features = ['capital-gain', 'capital-loss', 'age'] + new_cols
from modeldb.sklearn_native import SyncableMetrics

ROOT_DIR = '../../../../server/'
DATA_PATH = '../../../../data/'

name = "test1"
author = "author"
description = "kaggle-iris-script"

# Creating a new project
syncer_obj = Syncer(
    NewOrExistingProject(name, author, description),
    NewOrExistingExperiment("expName", "expDesc"),
    NewExperimentRun("iris test"))

# Cleaning up data first.
iris_data = pd.read_csv_sync(DATA_PATH + 'iris-data.csv', na_values=['NA'])

# Rewrite the two inconsistent class labels to their canonical spelling.
iris_data.loc[iris_data['class'] == 'versicolor', 'class'] = 'Iris-versicolor'
iris_data.loc[iris_data['class'] == 'Iris-setossa', 'class'] = 'Iris-setosa'

# This line drops any 'Iris-setosa' rows with a sepal width less than 2.5 cm
iris_data = iris_data.loc[(iris_data['class'] != 'Iris-setosa') |
                          (iris_data['sepal_width_cm'] >= 2.5)]
iris_data.loc[iris_data['class'] == 'Iris-setosa', 'sepal_width_cm'].hist()

# Scale 'Iris-versicolor' sepal lengths below 1.0 by a factor of 100.
# NOTE(review): presumably these rows were recorded in metres rather than
# centimetres -- confirm against the dataset notes.
# The mask is built once; the original also evaluated the same selection as a
# bare expression and threw the result away, which had no effect in a script.
short_versicolor = ((iris_data['class'] == 'Iris-versicolor') &
                    (iris_data['sepal_length_cm'] < 1.0))
iris_data.loc[short_versicolor, 'sepal_length_cm'] *= 100.0
def _fit_and_record_log_loss(model, x_train, y_train, x_test, y_test):
    """Fit *model*, predict probabilities on *x_test* and record log loss.

    The metric is handed to ``SyncableMetrics.compute_metrics`` with the same
    arguments for every model so the recorded losses are comparable.
    """
    model.fit_sync(x_train, y_train)
    y_preds = model.predict_proba_sync(x_test)
    SyncableMetrics.compute_metrics(model, log_loss, y_test, y_preds, x_test,
                                    "", "", eps=1e-15, normalize=True)


def run_otto_workflow():
    """Run the Kaggle Otto sample workflow and sync it.

    Reads ``otto-train.csv`` from ``DATA_PATH``, drops the ``id`` and
    ``target`` columns (the label-encoded ``target`` becomes ``y``), splits
    80/20 with a fixed ``random_state``, then fits and scores two models:
    a bagged random forest and an isotonic-calibrated random forest.

    Returns:
        A tuple ``(syncer_obj, x_train, x_test)``.
    """
    name = "test1"
    author = "author"
    description = "kaggle-otto-script"

    # Creating a new project
    syncer_obj = Syncer(
        NewOrExistingProject(name, author, description),
        NewOrExistingExperiment("expName", "expDesc"),
        NewExperimentRun("otto test"))

    # Import Data
    # Note: This dataset is not included in the repo because of Kaggle
    # restrictions.
    # It can be downloaded from
    # https://www.kaggle.com/c/otto-group-product-classification-challenge/data
    X = pd.read_csv_sync(DATA_PATH + 'otto-train.csv')
    syncer_obj.add_tag(X, "original otto csv data")
    X = X.drop_sync('id', axis=1)
    syncer_obj.add_tag(X, "dropped id column")

    # Extract target
    # Encode it to make it manageable by ML algo
    y = X.target.values
    y = LabelEncoder().fit_transform_sync(y)

    # Remove target from train, else it's too easy ...
    X = X.drop_sync('target', axis=1)
    syncer_obj.add_tag(X, "data with dropped id and target columns")

    # Split Train / Test
    x_train, x_test, y_train, y_test = cross_validation.train_test_split_sync(
        X, y, test_size=0.20, random_state=36)
    syncer_obj.add_tag(x_test, "testing data")
    syncer_obj.add_tag(x_train, "training data")

    # First, we train and apply a Random Forest WITHOUT calibration.
    # We use a BaggingClassifier to make 5 predictions and average them,
    # because that is what CalibratedClassifierCV does behind the scenes, and
    # we want to compare things fairly, i.e. be sure that averaging several
    # models is not what explains a performance difference between
    # no calibration and calibration.
    clf = RandomForestClassifier(n_estimators=50, n_jobs=-1)
    clfbag = BaggingClassifier(clf, n_estimators=5)
    _fit_and_record_log_loss(clfbag, x_train, y_train, x_test, y_test)

    # Now, we train and apply a Random Forest WITH calibration.
    # In our case, 'isotonic' worked better than the default 'sigmoid'.
    # This is not always the case; depending on the data you have to test
    # both possibilities.
    clf = RandomForestClassifier(n_estimators=50, n_jobs=-1)
    calibrated_clf = CalibratedClassifierCV(clf, method='isotonic', cv=5)
    _fit_and_record_log_loss(calibrated_clf, x_train, y_train, x_test, y_test)

    # NOTE: this message is printed unconditionally; the two recorded losses
    # are not compared here.
    print(" ")
    # The adjacent literals are joined with no separator, so the space after
    # "improved" must be part of the first literal.
    print("Conclusion : in our case, calibration improved "
          "performance a lot ! (reduced loss)")

    syncer_obj.sync()
    return syncer_obj, x_train, x_test
''' Source: http://archive.ics.uci.edu/ml/datasets/default+of+credit+card+clients '''

# modeldb start
name = "simple sample"
author = "srinidhi"
description = "simple LR for credit default prediction"
# Build the three syncer arguments in the same order they were originally
# evaluated, then hand them over.
project_config = NewOrExistingProject(name, author, description)
experiment_config = DefaultExperiment()
run_config = NewExperimentRun("credit test")
syncer_obj = Syncer(project_config, experiment_config, run_config)
# modeldb end

# modeldb start
df = pd.read_csv_sync(DATA_PATH + 'credit-default.csv', skiprows=[0])
# modeldb end

# Pull the label column out before narrowing the frame to the features.
target = df['default payment next month']
feature_columns = ["LIMIT_BAL", "SEX", "EDUCATION", "MARRIAGE", "AGE"]
df = df[feature_columns]

# 70/30 train/test split of features and labels.
split_parts = cross_validation.train_test_split_sync(
    df, target, test_size=0.3)
x_train, x_test, y_train, y_test = split_parts

lr = linear_model.LogisticRegression(C=2)

# modeldb start
lr.fit_sync(x_train, y_train)
# modeldb end

# modeldb start
ROOT_DIR = '../../../../server/'
DATA_PATH = '../../../../data/'

name = "test1"
author = "author"
description = "kaggle-titanic-script"

# Creating a new project
project_config = NewOrExistingProject(name, author, description)
experiment_config = NewOrExistingExperiment("expName", "expDesc")
run_config = NewExperimentRun("titanic test")
syncer_obj = Syncer(project_config, experiment_config, run_config)

# Read the training set csv file.
# Note: This dataset is not included in the repo because of Kaggle
# restrictions.
# It can be downloaded from https://www.kaggle.com/c/titanic/data
titanic = pd.read_csv_sync(DATA_PATH + 'titanic-train.csv')

# =====================Preprocessing the data=====================
# Fill the missing value in "Age" with the column median.
median_age = titanic["Age"].median()
titanic["Age"] = titanic["Age"].fillna(median_age)

# Converting the Sex Column to numeric value.
# Each label is rewritten in place, in the listed order.
for sex_label, sex_code in (("male", 0), ("female", 1)):
    titanic.loc[titanic["Sex"] == sex_label, "Sex"] = sex_code

# Converting the Embarked Column: missing ports become "S" first, then each
# port letter is rewritten in place to its numeric code.
titanic["Embarked"] = titanic["Embarked"].fillna("S")
for port_label, port_code in (("S", 0), ("C", 1), ("Q", 2)):
    titanic.loc[titanic["Embarked"] == port_label, "Embarked"] = port_code

# NOTE: .loc commands don't create a new dataframe id
from modeldb.sklearn_native.ModelDbSyncer import * from modeldb.sklearn_native import SyncableMetrics DATA_PATH = '../../../../data/' # Pipelining: This chains a PCA and logistic regression, and uses the UCI # Census Adult dataset. name = "pipeline census" author = "srinidhi" description = "census data" syncer_obj = Syncer( NewOrExistingProject(name, author, description), DefaultExperiment(), NewExperimentRun("Abc")) df = pd.read_csv_sync(DATA_PATH + 'adult.data.csv') new_df = pd.DataFrame() df.columns = ['age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', 'income_level'] le = preprocessing.LabelEncoder() # Assigning 0.0 to represent incomes <=50K, and 1.0 to represent incomes >50K df['income_level'] = df['income_level'].str.strip() df['income_level'] = df['income_level'].replace(['<=50K'], [0.0]) df['income_level'] = df['income_level'].replace(['>50K'], [1.0]) # calling labelEncoder on any columns that are object types for coltype, colname in zip(df.dtypes, df.columns):