def test_logLoss(self):
    self.assertAlmostEqual(metrics.log_loss([1, 1, 0, 0], [1, 1, 0, 0]), 0)
    self.assertAlmostEqual(metrics.log_loss([1, 1, 0, 0], [1, 1, 1, 0]), np.inf)
    self.assertAlmostEqual(
        metrics.log_loss([1, 1, 1, 0, 0, 0],
                         [0.5, 0.1, 0.01, 0.9, 0.75, 0.001]),
        1.881797068998267)
    self.assertAlmostEqual(metrics.log_loss(1, 0.5), -np.log(0.5))
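# The assertions above pin down the expected behaviour of metrics.log_loss: a mean
# negative log-likelihood with no probability clipping (hence np.inf when a label-0
# sample is given a predicted probability of 1). Below is a minimal sketch of a
# function consistent with those tests, assuming binary {0, 1} labels and
# probabilities of the positive class; it is an illustration, not the project's
# actual implementation.
import numpy as np

def log_loss_sketch(y_true, y_pred):
    """Mean negative log-likelihood for binary labels, without clipping."""
    y_true = np.atleast_1d(np.asarray(y_true, dtype=float))
    y_pred = np.atleast_1d(np.asarray(y_pred, dtype=float))
    with np.errstate(divide="ignore"):
        # pick -log(p) for positives and -log(1 - p) for negatives
        losses = -np.where(y_true == 1, np.log(y_pred), np.log(1. - y_pred))
    return np.mean(losses)

# Sanity check against the third assertion:
# -(log(0.5) + log(0.1) + log(0.01) + log(0.1) + log(0.25) + log(0.999)) / 6
# = 11.2908 / 6 = 1.8818 (to 4 d.p.)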
def fit(self, X, y, cv=None, **fit_params):
    self._set_params(**fit_params)
    # shuffle the training data before building the folds
    indices = np.arange(X.shape[0])
    np.random.shuffle(indices)
    X = X[indices, :]
    y = y[indices]
    if cv is None:
        cv = KFold(y.size, k=5)
    clf = self.classifier
    score_list = []
    y_list = []
    # collect out-of-fold scores for calibration
    for train_index, test_index in cv:
        print train_index.shape, test_index.shape
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        clf.fit(X_train, y_train)
        score = clf.predict_proba(X_test)[:, 1].reshape(-1, 1)
        score_list.append(score)
        y_list.append(y_test)
    yy = np.concatenate(y_list)
    scores = np.concatenate(score_list)
    # fit the Platt scaling parameters on the out-of-fold scores
    self.a, self.b = fit_platt_logreg(scores, yy)
    # calibrated probabilities via the sigmoid 1 / (1 + exp(-(a * s + b)))
    calibrated = 1. / (1. + np.exp(-(self.a * scores + self.b)))
    print("Optimistic Log-loss: {0:f}".format(log_loss(yy, calibrated)))
    # refit the underlying classifier on the full training set
    self.classifier.fit(X, y)
    return self
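# fit_platt_logreg is not shown in this snippet. A minimal sketch of what such a
# helper could look like is given below: a one-feature logistic regression on the
# out-of-fold scores (Platt scaling), returning the slope `a` and intercept `b`
# used in the sigmoid 1 / (1 + exp(-(a * s + b))). The helper name and the use of
# scikit-learn's LogisticRegression are assumptions, not the author's code.
import numpy as np
from sklearn.linear_model import LogisticRegression

def fit_platt_logreg(scores, y):
    """Fit Platt-scaling parameters (a, b) mapping raw scores to probabilities."""
    scores = np.asarray(scores, dtype=float).reshape(-1, 1)
    y = np.asarray(y).ravel()
    lr = LogisticRegression(C=1e6)  # weak regularisation, close to plain maximum likelihood
    lr.fit(scores, y)
    return float(lr.coef_.ravel()[0]), float(lr.intercept_[0])

# usage: a, b = fit_platt_logreg(scores, yy)
#        probs = 1. / (1. + np.exp(-(a * scores + b)))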
def create_prediction_file(data_version, fold=99, is_test=False):
    # test predictions can be saved directly to the test predictions directory
    if is_test:
        data_file = os.path.join(parent_dir, "P" + str(data_version) + "/") + "P" + str(data_version) + "_test.csv"
        out_file = os.path.join(parent_dir, "Predictions/tests/") + "P" + str(data_version) + "_test.csv"
    # individual fold predictions go to a temp directory named fold_predictions so
    # they can later be merged into a single file for the second layer
    else:
        data_file = os.path.join(parent_dir, "P" + str(data_version) + "/") + "P" + str(data_version) + "_Fold_" + str(fold) + "_valid.csv"
        out_file = os.path.join(parent_dir, "Predictions/fold_predictions/") + "P" + str(data_version) + "_Fold_" + str(fold) + "_eval.csv"

    # the raw prediction file generated by VW
    text_file_name = os.path.join(parent_dir, "Predictions/tmp/") + "temp_prediction.txt"
    model_version = "VW_P" + str(data_version)

    # need to match the ID field with the predictions
    with open(out_file, "wb") as prediction_csv:
        prediction_csv.write("raw\n")
        for line in open(text_file_name):
            row = line.strip().split(" ")
            prediction_csv.write("%s\n" % row[0])

    if is_test:
        id_df = pd.read_csv(data_file)[["ID"]]
        preds_df = pd.read_csv(out_file)
        assert id_df.shape[0] == preds_df.shape[0], \
            "data file and prediction file have differing numbers of rows..."
        id_df[model_version] = np.array(preds_df["raw"])
        id_df.to_csv(out_file, index=False)
    else:
        id_df = pd.read_csv(data_file)[["ID", "target"]]
        id_df["Fold"] = np.repeat(fold, id_df.shape[0])
        preds_df = pd.read_csv(out_file)
        assert id_df.shape[0] == preds_df.shape[0], \
            "data file and prediction file have differing numbers of rows..."
        id_df[model_version] = np.array(preds_df["raw"])
        id_df.to_csv(out_file, index=False)
        ll = log_loss(np.array(id_df["target"]), np.array(id_df[model_version]))
        print "******************************************************************************"
        print "***** data version : {0} | fold : {1} | fold sample: {2} | log loss {3} ******".format(
            data_version, fold, id_df.shape[0], np.round(ll, 7))
        print "******************************************************************************"

    os.remove(text_file_name)
validationSet = train[idx]

rf = RandomForestClassifier(n_estimators=2000,
                            criterion="entropy",
                            max_depth=50,
                            max_features=0.8,
                            min_samples_split=3,
                            bootstrap=False,
                            oob_score=False,
                            random_state=112,
                            verbose=0,
                            n_jobs=-1)
rf.fit(trainingSet[feature_names], np.array(trainingSet["target"]))
preds = rf.predict_proba(validationSet[feature_names])[:, 1]
ll = log_loss(np.array(validationSet["target"]), preds)
print "# Data_version : {0} | Fold : {1} | log_loss : {2}".format(i + 1, j + 1, ll)

df = pd.DataFrame({"Fold": np.repeat((j + 1), validationSet.shape[0]),
                   "ID": validationSet["ID"],
                   "ground_truth": validationSet["target"],
                   model_version: preds})
tmp_name = "P" + str(data_version) + "_Fold_" + str(fold) + "_valid.csv"
tmp_file = train_prediction_path + "tmp/" + tmp_name
df.to_csv(tmp_file, index=False)
eval_matrix = eval_matrix.append(df, ignore_index=True)
del rf, trainingSet, validationSet, ll, df

# generate test meta features: train on all training instances
rf = RandomForestClassifier(n_estimators=2000,
                            criterion="entropy",
                            max_depth=50,
                            max_features=0.8,
bst = XGBClassifier(max_depth=8,
                    learning_rate=0.01,
                    n_estimators=2100,
                    subsample=0.9,
                    colsample_bytree=0.45,
                    objective="binary:logistic",
                    silent=False,
                    min_child_weight=1,
                    nthread=-1)
bst.fit(X_train, y_train,
        eval_metric="logloss",
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        verbose=200)
preds = bst.predict_proba(X_valid)[:, 1]
ll = log_loss(validationSet["target"], preds)
df = pd.DataFrame({"ID": validationSet["ID"], pred_name: preds})
eval_matrix = eval_matrix.append(df, ignore_index=True)
print "fold : {} | logloss: {}".format(i + 1, ll)
del trainingSet, validationSet, bst, preds, ll, X_train, X_valid, y_train, y_valid
gc.collect()

# refit on all training data to produce the test-set predictions
X_train = train[feature_names].copy()
y_train = np.array(train["target"].copy())
bst = XGBClassifier(max_depth=8,
                    learning_rate=0.01,
                    n_estimators=2100,
                    subsample=0.9,
                    colsample_bytree=0.45,
                    objective="binary:logistic",
                    silent=False,
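# The snippet above is the body of an out-of-fold (stacking) loop: each fold's
# validation predictions become meta-features for a second-layer model, and the
# classifier is then refit on the full training set for the test-set meta-feature.
# A minimal sketch of such a loop is given below; StratifiedKFold, the function
# name, and the pred_name default are assumptions, not the original script's setup.
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier

def make_oof_meta_features(train, feature_names, pred_name="xgb_oof", n_splits=5):
    """Collect out-of-fold predictions and a model refit on all training rows."""
    fold_frames = []
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=112)
    for i, (tr_idx, va_idx) in enumerate(skf.split(train[feature_names], train["target"])):
        trainingSet = train.iloc[tr_idx]
        validationSet = train.iloc[va_idx]
        bst = XGBClassifier(max_depth=8, learning_rate=0.01, n_estimators=2100,
                            subsample=0.9, colsample_bytree=0.45,
                            objective="binary:logistic")
        bst.fit(trainingSet[feature_names], trainingSet["target"])
        preds = bst.predict_proba(validationSet[feature_names])[:, 1]
        fold_frames.append(pd.DataFrame({"ID": validationSet["ID"], pred_name: preds}))
    # out-of-fold predictions for the second layer
    eval_matrix = pd.concat(fold_frames, ignore_index=True)
    # refit on all training rows to score the test set
    bst = XGBClassifier(max_depth=8, learning_rate=0.01, n_estimators=2100,
                        subsample=0.9, colsample_bytree=0.45,
                        objective="binary:logistic")
    bst.fit(train[feature_names], train["target"])
    return eval_matrix, bst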