def submission(model, norm, feat_selection, inputation, new_features, subm_name):
    """Train `model`, predict on the test set, and write a submission CSV
    (one prediction column per target month) plus a JSON metadata sidecar.

    model:          estimator exposing get_model_name(); forwarded to Evaluator.
    norm:           if truthy, use DAO's normalized data; otherwise raw numeric columns.
    feat_selection: None, or a callable df -> list of feature column names to keep.
    inputation:     missing-value strategy label forwarded to DAO
                    (sic — project-wide spelling of "imputation"; kept for compatibility).
    new_features:   engineered-feature names forwarded to DAO.
    subm_name:      base file name (no extension) for the .csv / .json outputs.
    """
    dao = DAO(new_features=new_features)
    if norm:
        train = dao.get_normalized_data(dataset="train", inputation=inputation,
                                        max_na_count_columns=0.05)
        # NOTE(review): test uses max_na_count_columns=1 while train uses 0.05 —
        # presumably so that no test columns are dropped; confirm intent.
        test = dao.get_normalized_data(dataset="test", inputation=inputation,
                                       max_na_count_columns=1)
        print(len(test))
    else:
        train = dao.get_data(cols_type="numeric", dataset="train",
                             max_na_count_columns=0.05)
        test = dao.get_data(cols_type="numeric", dataset="test",
                            max_na_count_columns=0.05)
    # Presumably the test index holds the parcel ids — verify against DAO.
    test_ids = test.index.tolist()
    if feat_selection is None:
        feat_selection_name = ""
    else:
        feat_selection_name = feat_selection.__name__
        # Restrict both frames to the selected columns; train keeps the target.
        columns = feat_selection(train)
        train_columns = columns + [TARGET]
        train = train[train_columns]
        test = test[columns]
    ev = Evaluator(model=model)
    pred = ev.run(train, test, abs_target=False)
    pred = pd.Series(pred).round(10)
    # The same prediction vector is reused for every requested month column.
    subm = pd.DataFrame()
    subm["ParcelId"] = test_ids
    subm["201610"] = pred
    subm["201611"] = pred
    subm["201612"] = pred
    subm["201710"] = pred
    subm["201711"] = pred
    subm["201712"] = pred
    # NOTE(review): string concatenation assumes get_submission_dir() ends with
    # a path separator — confirm.
    subm_path = PathManager().get_submission_dir() + subm_name + ".csv"
    subm.to_csv(subm_path, index=False)
    subm_metadata = PathManager().get_submission_dir() + subm_name + ".json"
    with open(subm_metadata, 'w') as file:
        submission_dict = {}
        submission_dict["submission_name"] = subm_name
        submission_dict["norm"] = norm
        submission_dict["feat_selection"] = feat_selection_name
        submission_dict["model"] = model.get_model_name()
        submission_dict["inputation"] = inputation
        # Written empty here; presumably filled in after external scoring — confirm.
        submission_dict["score"] = ""
        json.dump(submission_dict, file)
class H2ODeepLearning(H2OMlBase):
    """Thin wrapper exposing H2O's deep-learning estimator through H2OMlBase."""

    def __init__(self, epochs=4):
        # variable_importances=True so feature importances can be queried later.
        self.model = h2o.estimators.H2ODeepLearningEstimator(
            variable_importances=True, epochs=epochs)
        self.model_name = "H2ODeepLearning"
        # Base init runs last — it presumably reads self.model / self.model_name
        # set above, so keep this statement order (confirm in H2OMlBase).
        H2OMlBase.__init__(self)


class H2ODeepWater(H2OMlBase):
    """Thin wrapper exposing H2O's DeepWater estimator through H2OMlBase."""

    def __init__(self):
        self.model = h2o.estimators.H2ODeepWaterEstimator()
        self.model_name = "H2ODeepWater"
        # Same ordering constraint as H2ODeepLearning.__init__ above.
        H2OMlBase.__init__(self)


if __name__ == "__main__":
    # Smoke test: fit a gradient-boosting model on the 2016 training file,
    # predict back on the training frame, and print predictions and R².
    model = H2OGradientBoosting()
    dao = DAO(train_file_name="train_complete_2016.csv")
    df_train = dao.get_normalized_data(max_na_count_columns=0.5)
    df_train = df_train.dropna()
    model.train(df_train, "logerror")
    pred = model.predict(df_train)
    print(pred)
    r2 = model.r2()
    print(r2)
# NOTE(review): this chunk starts mid-function — the header of
# select_by_corr_thresh(df) (named by the __main__ call below) and the setup
# of use_df_corr (presumably a feature-correlation matrix), good_cols
# (candidate column names) and corr_threshold are outside the visible chunk.
# Indentation of the fragment below is reconstructed from a collapsed line.
    # The target column must never be offered as a feature.
    good_cols.remove("logerror")
    picked_cols = []
    # Greedy de-correlation: walk each candidate row; drop any other candidate
    # whose pairwise correlation with it exceeds corr_threshold, unless that
    # other candidate was already kept ("picked") on an earlier iteration.
    for index, row in use_df_corr.loc[good_cols][good_cols].iterrows():
        # print(index)
        use_row = row[row.index != index]  # exclude the self-correlation entry
        high_correlateds = use_row[use_row > corr_threshold].index.tolist()
        for high_correlated in high_correlateds:
            if high_correlated in good_cols and not high_correlated in picked_cols:
                good_cols.remove(high_correlated)
        # NOTE(review): reconstructed at outer-loop level (mark each row picked
        # once per iteration) — confirm against the original formatting.
        picked_cols.append(index)
    return good_cols


if __name__ == "__main__":
    # List every engineered-feature csv available, then run the selector on
    # the normalized 2016 training data with the knn longitude/latitude
    # feature merged in.
    new_features_list = listdir(PathManager().get_new_features_dir())
    new_features_list = [[new_features.replace(".csv", "")]
                        for new_features in new_features_list]
    print("new_features_list:", new_features_list)
    dao = DAO(train_file_name="train_complete_2016.csv",
              new_features=["knn-longitude-latitude"])
    df = dao.get_normalized_data(max_na_count_columns=0.05)
    df = df.dropna()
    print(select_by_corr_thresh(df))
    print(df.columns.tolist())
    #good_cols: ['longitude--latitude', 'bedroomcnt', 'structuretaxvaluedollarcnt', 'yearbuilt']