def build_submission(model, test_data_path, baseline_path):
    """Score the Kaggle test set with *model* and return the predictions.

    :param model: a fitted estimator exposing ``predict(X)``.
    :param test_data_path: path to the raw test data file.
    :param baseline_path: path to the baseline predictions file.
    :return: the predictions produced by ``model.predict`` on the test features.
    """
    print("> Building kaggle submission...")
    df_test = load_data(test_data_path)
    # NOTE(review): this uses transform_data while predict_for_csv uses
    # preprocess_data — confirm the difference is intentional.
    df_test = transform_data(df_test)
    X_test, _ = extract_features(df_test)
    # Baseline predictions, loaded but not yet consumed.
    # TODO: for NaNs, we just use the "baseline" predictor?
    df_baseline = load_data(baseline_path)
    y_predicted = model.predict(X_test)
    print(y_predicted)
    print("----------------")
    # Return the predictions so the caller can actually assemble the
    # submission file (the original computed them and discarded them).
    return y_predicted
def load_and_save():
    """Load each configured corpus and persist it to its preprocessed form."""
    # (name, output_dir, language, news_path, label_path) for every corpus.
    dataset_specs = [
        ("QQ", "pre_data", "cn", "data/QQ/NewsQQ.txt", "data/QQ/LabelQQ.txt"),
        ("Sina", "pre_data", "cn", "data/Sina/NewsSina.txt", "data/Sina/LabelSina.txt"),
        ("Reddit", "pre_data", "en", "data/Reddit", ""),
    ]
    for spec in dataset_specs:
        dataset = load_data(*spec)
        dataset.load_dataset()
        dataset.save_dataset()
def predict_for_csv(model, test_data_path):
    """Load and preprocess the file at *test_data_path*, then predict on it.

    :param model: a fitted estimator accepted by ``predict_for_items``.
    :param test_data_path: path to the raw data file to score.
    :return: the result of ``predict_for_items`` on the extracted features.
    """
    print("> Predicting for csv file...")
    frame = load_data(test_data_path)
    frame = preprocess_data(frame)
    feature_matrix, _ = extract_features(frame)
    print("Predicting set: %s" % str(feature_matrix.shape))
    print("----------------")
    return predict_for_items(model, feature_matrix)
def main():
    """Run the end-to-end training pipeline: load, clean, featurize, train, evaluate."""
    paths = setup_paths()
    # Raw training data lives under the configured /datasets/ directory.
    train_csv_path = paths.get("raw_data_path") + "train.csv"
    dataframe = load_data(train_csv_path)
    # Clean/transform the raw columns into the expected types.
    dataframe = preprocess_data(dataframe)
    # Build the feature matrix, target vector, and list of feature names.
    X, y, features = extract_features(dataframe)
    X_train, X_eval, y_train, y_eval = split_train_test(X, y)
    model = train_model(X_train, y_train, features)
    evaluate_model(model, X_eval, y_eval)
def main():
    """Load the raw training data and run exploratory analysis on it.

    NOTE(review): another ``main`` is defined earlier in this file; the last
    definition wins at import time — confirm the duplication is intentional
    (e.g. two scripts pasted together).
    """
    paths = setup_paths()
    train_csv_path = paths.get("raw_data_path") + "train.csv"
    df_raw = load_data(train_csv_path)
    explore_data(df_raw)