Example #1
def build_submission(model, test_data_path, baseline_path):
    print("> Building kaggle submission...")
    # Load and transform the test set, then build its feature matrix.
    df_test = load_data(test_data_path)
    df_test = transform_data(df_test)
    X_test, _ = extract_features(df_test)

    # Baseline predictions, kept around as a possible fallback.
    df_baseline = load_data(baseline_path)

    # TODO: for NaNs, fall back to the "baseline" predictor?
    y_predicted = model.predict(X_test)
    print(y_predicted)
    print("----------------")
Example #2
def load_and_save():
    # Load each raw corpus and persist the preprocessed dataset to disk.
    qq_data = load_data("QQ", "pre_data", "cn", "data/QQ/NewsQQ.txt",
                        "data/QQ/LabelQQ.txt")
    qq_data.load_dataset()
    qq_data.save_dataset()

    sina_data = load_data("Sina", "pre_data", "cn", "data/Sina/NewsSina.txt",
                          "data/Sina/LabelSina.txt")
    sina_data.load_dataset()
    sina_data.save_dataset()

    reddit_data = load_data("Reddit", "pre_data", "en", "data/Reddit", "")
    reddit_data.load_dataset()
    reddit_data.save_dataset()
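The three blocks above differ only in their arguments, so the same work reads more compactly as a loop over a configuration list. A sketch, assuming load_data keeps the (name, out_dir, lang, news_path, label_path) signature used above:

def load_and_save_all():
    datasets = [
        ("QQ", "pre_data", "cn", "data/QQ/NewsQQ.txt", "data/QQ/LabelQQ.txt"),
        ("Sina", "pre_data", "cn", "data/Sina/NewsSina.txt", "data/Sina/LabelSina.txt"),
        ("Reddit", "pre_data", "en", "data/Reddit", ""),
    ]
    for args in datasets:
        data = load_data(*args)
        data.load_dataset()
        data.save_dataset()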
Example #3
def predict_for_csv(model, test_data_path):
    """
    Load a CSV test set, preprocess it, build the feature matrix, and
    return the model's predictions for every row.

    :return: the predictions produced by predict_for_items(model, X_test)
    """
    print("> Predicting for csv file...")
    df_test = load_data(test_data_path)
    df_test = preprocess_data(df_test)
    X_test, _ = extract_features(df_test)
    print("Predicting set: %s" % str(X_test.shape))
    print("----------------")
    return predict_for_items(model, X_test)
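For context, a hedged usage sketch: assuming the module also exposes the setup_paths() helper seen in the later examples, predict_for_csv would typically be called with a trained model and the path to the test CSV (the "test.csv" file name here is hypothetical):

paths = setup_paths()
test_path = paths.get("raw_data_path") + "test.csv"
predictions = predict_for_csv(model, test_path)  # model: a trained estimator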
Example #4
def main():
    paths = setup_paths()
    raw_data_path = paths.get("raw_data_path") + "train.csv"

    # Get the raw training data from /datasets/.
    df = load_data(raw_data_path)

    # Clean and transform the raw data into the correct types.
    df = preprocess_data(df)

    # Build the feature set used for modeling.
    X, y, features = extract_features(df)
    X_train, X_eval, y_train, y_eval = split_train_test(X, y)

    model = train_model(X_train, y_train, features)
    evaluate_model(model, X_eval, y_eval)
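split_train_test is not shown in these examples; a plausible minimal implementation (an assumption, not the original code) is a thin wrapper around scikit-learn's train_test_split, which returns the four values unpacked above:

from sklearn.model_selection import train_test_split

def split_train_test(X, y, test_size=0.2, seed=42):
    # Hold out a fixed fraction of the rows for evaluation; fixing the
    # random seed keeps the split reproducible across runs.
    return train_test_split(X, y, test_size=test_size, random_state=seed)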
Example #5
def main():
    paths = setup_paths()
    raw_data_path = paths.get("raw_data_path") + "train.csv"

    df = load_data(raw_data_path)
    explore_data(df)
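explore_data is likewise not defined here; a minimal sketch of what such a helper usually does (an assumption), given that load_data returns a pandas DataFrame:

def explore_data(df):
    # Read-only inspection: shape, column types, summary statistics,
    # and per-column missing-value counts.
    print("Shape:", df.shape)
    print(df.dtypes)
    print(df.describe(include="all"))
    print(df.isna().sum())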