예제 #1
0
def submission(model, norm, feat_selection, inputation, new_features, subm_name):
    """Fit ``model``, predict the test set and write a submission to disk.

    Two files are written into the directory returned by
    ``PathManager().get_submission_dir()``: ``<subm_name>.csv`` holding the
    predictions and ``<subm_name>.json`` holding the run settings.

    :param model: model handed to ``Evaluator``; must expose ``get_model_name()``.
    :param norm: when truthy, data is loaded with ``DAO.get_normalized_data``;
        otherwise numeric columns are loaded with ``DAO.get_data``.
    :param feat_selection: ``None`` or a callable that receives the training
        frame and returns a list of column names to keep.
    :param inputation: forwarded to ``DAO.get_normalized_data`` (only used when
        ``norm`` is truthy) and recorded in the metadata file.
    :param new_features: forwarded to the ``DAO`` constructor.
    :param subm_name: base file name (without extension) of both output files.
    """
    dao = DAO(new_features=new_features)

    if not norm:
        train = dao.get_data(cols_type="numeric", dataset="train", max_na_count_columns=0.05)
        test = dao.get_data(cols_type="numeric", dataset="test", max_na_count_columns=0.05)
    else:
        train = dao.get_normalized_data(dataset="train", inputation=inputation, max_na_count_columns=0.05)
        test = dao.get_normalized_data(dataset="test", inputation=inputation, max_na_count_columns=1)
        print(len(test))

    # Remember the row identifiers before any column filtering takes place.
    test_ids = test.index.tolist()

    feat_selection_name = ""
    if feat_selection is not None:
        feat_selection_name = feat_selection.__name__
        selected = feat_selection(train)
        # The training frame keeps the target next to the selected features.
        train = train[selected + [TARGET]]
        test = test[selected]

    evaluator = Evaluator(model=model)
    rounded = pd.Series(evaluator.run(train, test, abs_target=False)).round(10)

    # The same prediction is written to every one of the six output columns.
    subm = pd.DataFrame()
    subm["ParcelId"] = test_ids
    for column in ("201610", "201611", "201612", "201710", "201711", "201712"):
        subm[column] = rounded

    csv_path = PathManager().get_submission_dir() + subm_name + ".csv"
    subm.to_csv(csv_path, index=False)

    metadata_path = PathManager().get_submission_dir() + subm_name + ".json"
    with open(metadata_path, 'w') as metadata_file:
        # "score" is written as an empty placeholder.
        json.dump(
            {
                "submission_name": subm_name,
                "norm": norm,
                "feat_selection": feat_selection_name,
                "model": model.get_model_name(),
                "inputation": inputation,
                "score": "",
            },
            metadata_file,
        )
예제 #2
0

class H2ODeepLearning(H2OMlBase):
    """``H2OMlBase`` model backed by ``h2o.estimators.H2ODeepLearningEstimator``."""

    def __init__(self, epochs=4):
        """Build the estimator with variable importances enabled.

        :param epochs: forwarded to the estimator's ``epochs`` argument.
        """
        estimator = h2o.estimators.H2ODeepLearningEstimator(
            epochs=epochs,
            variable_importances=True,
        )
        self.model = estimator
        self.model_name = "H2ODeepLearning"
        # The base initializer runs last, after both attributes are set.
        H2OMlBase.__init__(self)


class H2ODeepWater(H2OMlBase):
    """``H2OMlBase`` model backed by ``h2o.estimators.H2ODeepWaterEstimator``."""

    def __init__(self):
        """Build the estimator with its default arguments."""
        estimator = h2o.estimators.H2ODeepWaterEstimator()
        self.model = estimator
        self.model_name = "H2ODeepWater"
        # The base initializer runs last, after both attributes are set.
        H2OMlBase.__init__(self)


if __name__ == "__main__":
    # Manual check: train a gradient boosting model on the normalized
    # training data, then print its predictions and its r2 value.
    gbm = H2OGradientBoosting()

    train_dao = DAO(train_file_name="train_complete_2016.csv")
    # Rows that still contain missing values are discarded before training.
    training_frame = train_dao.get_normalized_data(max_na_count_columns=0.5).dropna()
    gbm.train(training_frame, "logerror")

    print(gbm.predict(training_frame))

    print(gbm.r2())
예제 #3
0
    good_cols.remove("logerror")
    picked_cols = []

    for index, row in use_df_corr.loc[good_cols][good_cols].iterrows():
        # print(index)
        use_row = row[row.index != index]
        high_correlateds = use_row[use_row > corr_threshold].index.tolist()
        for high_correlated in high_correlateds:
            if high_correlated in good_cols and not high_correlated in picked_cols:
                good_cols.remove(high_correlated)

        picked_cols.append(index)

    return good_cols


if __name__ == "__main__":
    # List the available new-feature files, each wrapped in its own
    # single-element list with the ".csv" suffix removed.
    feature_files = listdir(PathManager().get_new_features_dir())
    new_features_list = [[file_name.replace(".csv", "")] for file_name in feature_files]
    print("new_features_list:", new_features_list)

    corr_dao = DAO(
        train_file_name="train_complete_2016.csv",
        new_features=["knn-longitude-latitude"],
    )
    # Rows with missing values are dropped before the correlation filter runs.
    frame = corr_dao.get_normalized_data(max_na_count_columns=0.05).dropna()

    print(select_by_corr_thresh(frame))
    print(frame.columns.tolist())

#good_cols: ['longitude--latitude', 'bedroomcnt', 'structuretaxvaluedollarcnt', 'yearbuilt']