def do_submission():
    """Extract word/char features for train and test, fit a ridge model,
    and write a submission file via predict_and_sub()."""
    train_df, test_df = load_dataset()
    texts_train = train_df["tweet"]
    labels = get_labels(train_df)
    texts_test = test_df["tweet"]
    submission_ids = get_test_ids(test_df)
    feature_kinds = ["wordcount", "char"]
    features_train, features_test = get_extracted_features(
        feature_kinds, texts_train, texts_test
    )
    print("n_samples: %d, n_features: %d" % features_train.shape)
    predict_and_sub(
        features_train, labels.values, features_test, submission_ids, predict_ridge
    )
def train():
    """Run 3-fold cross-validation on an 80% training split and report
    the average RMSE and the elapsed wall-clock time."""
    train_df, _ = load_dataset()
    texts = train_df["tweet"]
    labels = get_labels(train_df)
    n = len(labels)
    # Hold out 20% of the rows; only the 80% split is cross-validated.
    split_X, _, split_y, _ = train_test_split(
        texts[:n], labels[:n], test_size=0.2, random_state=1
    )
    start = time()
    feature_kinds = ["wordcount", "char"]
    avg_rmse = do_cross_val(split_X, split_y, feature_kinds, nfolds=3)
    print("Average RMSE %.6f" % avg_rmse)
    elapsed = time() - start
    print("training time: %fs" % elapsed)
def train_model():
    """Grid-search hyperparameters for the ridge pipeline on a training split.

    Loads the dataset, holds out 20% of the training rows, and runs
    do_gridsearch() on the remaining 80% with an RMSE scorer.
    """
    train, _ = load_dataset()
    train_X = train["tweet"]
    train_Y = get_labels(train)
    n_samples = len(train_Y)
    X_train, _, y_train, _ = train_test_split(
        train_X[:n_samples], train_Y[:n_samples], test_size=0.2, random_state=1
    )
    # greater_is_better=False because grid search maximizes the score,
    # so RMSE (a loss) is negated by make_scorer.
    scorer = make_scorer(rmse_score, greater_is_better=False)
    # NOTE(review): several alternative pipelines (elasticnet, three-predictor
    # variants, other ridge configs) were previously tried here; the plain
    # ridge model is the one kept.
    pipeline, parameters = get_ridge_model()
    do_gridsearch(X_train, y_train, pipeline, parameters, scorer)
def train_final():
    """Fit TF-IDF features on the combined train+test text, grid-search a
    ridge model on a training split, predict the test set, and save a
    submission file.

    Two vectorizers are combined: word n-grams (1-3) and character
    n-grams (1-7), each capped at 5000 features, horizontally stacked
    into one sparse matrix.
    """
    train, test = load_dataset()
    train_X = train["tweet"]
    train_Y = get_labels(train)
    test_X = test["tweet"]
    # Word-level TF-IDF (idf disabled, so effectively L2-normalized TF).
    tfidf1 = TfidfVectorizer(
        max_df=0.6,
        min_df=0.0000003,
        stop_words="english",
        strip_accents="unicode",
        token_pattern=r"\w{1,}",  # raw string: "\w" is an invalid escape otherwise
        max_features=5000,
        norm="l2",
        use_idf=False,
        smooth_idf=False,
        ngram_range=(1, 3),
    )
    # Character-level TF-IDF over 1..7-grams.
    tfidf2 = TfidfVectorizer(
        max_df=0.6,
        analyzer="char",
        min_df=0.00001,
        stop_words="english",
        strip_accents="unicode",
        norm="l2",
        max_features=5000,
        ngram_range=(1, 7),
        smooth_idf=False,
        use_idf=False,
    )
    # Fit the vocabularies on train+test text together so the test set's
    # tokens are represented in the feature space.
    tfidf1.fit(np.hstack((train_X, test_X)))
    tfidf2.fit(np.hstack((train_X, test_X)))
    train_X1 = tfidf1.transform(train_X)
    train_X2 = tfidf2.transform(train_X)
    train_X = hstack([train_X1, train_X2]).tocsr()
    n_samples = len(train_Y)
    X_train, _, y_train, _ = train_test_split(
        train_X[:n_samples], train_Y[:n_samples], test_size=0.2, random_state=1
    )
    # greater_is_better=False: grid search maximizes, so RMSE is negated.
    scorer = make_scorer(rmse_score, greater_is_better=False)
    pipeline, parameters = get_advanced_ridge2()
    best_estimator = do_gridsearch(
        X_train, y_train, pipeline, parameters, n_jobs=5, verbose=1, scoring=scorer
    )
    # Predict the test data with the best estimator found.
    test_1 = tfidf1.transform(test_X)
    test_2 = tfidf2.transform(test_X)
    test_d = hstack([test_1, test_2])
    final_preds = best_estimator.predict(test_d)
    save_prediction_subs(test["id"], final_preds)