def rf_model(train, target, test, text_train_tfidf, text_test_tfidf):
    """Build a random-forest submission from engineered + text-derived features.

    Combines hand-crafted features (via ``fe``), a word2vec representation of
    Title+Body text (via ``w2v``), and two stacked metafeatures (via ``mf``),
    then fits a RandomForestClassifier and writes the predictions to disk.

    Parameters
    ----------
    train, test : DataFrame-like with at least "Title", "BodyMarkdown" and
        (for *test*) "PostId" columns — assumed pandas; TODO confirm.
    target : training labels.
    text_train_tfidf, text_test_tfidf : tf-idf matrices of the post text,
        consumed only by the linear-model metafeature.

    Returns
    -------
    The prediction vector produced by ``make_predictions`` (also saved via
    ``io.save_result``).
    """
    # Concatenate title and body into a single text field per post.
    text_train = train["Title"].values + ". " + train["BodyMarkdown"].values
    text_test = test["Title"].values + ". " + test["BodyMarkdown"].values

    print("Creating word2vec model...")
    w2v.make_word2vec_model(text_train, text_test)
    wv_train, wv_test = w2v.word2vec_features(text_train, text_test, load=False)

    # Tabular feature pipeline: raw extraction -> target-aware category
    # encoding -> final transforms.
    X_train, X_test = fe.extract_features(train), fe.extract_features(test)
    X_train, X_test = fe.categories_to_counters(X_train, X_test, target)
    X_train, X_test = fe.transform_features(X_train, X_test)

    print("Creating linear model metafeature...")
    X_train["LinearModelText"], X_test["LinearModelText"] = mf.linear_model_as_feature(
        text_train_tfidf, target, text_test_tfidf, load=False
    )

    print("Creating word2vec model metafeature...")
    X_train["w2vModelRFText"], X_test["w2vModelRFText"] = mf.w2v_model_as_feature(
        wv_train, target, wv_test, load=False, model_to_train="rf"
    )

    # Scale with statistics learned on the training split only.
    scaler = sklearn.preprocessing.StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Hyperparameters look pre-tuned; random_state pins reproducibility.
    model = sklearn.ensemble.RandomForestClassifier(
        criterion="entropy",
        max_depth=14,
        n_estimators=2000,
        min_samples_leaf=4,
        min_samples_split=16,
        n_jobs=4,
        random_state=1234,
    )

    result = make_predictions(model, X_train, target, X_test)
    io.save_result(test["PostId"], result)
    return result
def linear_model(train, target, test, text_train_tfidf, text_test_tfidf):
    """Build a logistic-regression submission from tabular + word2vec + tf-idf features.

    Loads pre-computed word2vec feature matrices from disk, stacks them with
    the engineered tabular features (scaled) and the sparse tf-idf text
    matrices, fits L2-regularised logistic regression and writes predictions.

    Parameters
    ----------
    train, test : feature-source frames; *test* must carry a "PostId" column.
    target : training labels.
    text_train_tfidf, text_test_tfidf : sparse tf-idf matrices.

    Returns
    -------
    The prediction vector produced by ``make_predictions``.
    """
    # Tabular pipeline mirrors rf_model: extract -> encode -> transform.
    X_train, X_test = fe.extract_features(train), fe.extract_features(test)
    X_train, X_test = fe.categories_to_counters(X_train, X_test, target)
    X_train, X_test = fe.transform_features(X_train, X_test)

    # NOTE(review): paths have no .npy suffix — presumably these files were
    # written with a raw file handle rather than np.save; verify they load.
    feature_train = np.load("w2v/word2vec_feature_train")
    feature_test = np.load("w2v/word2vec_feature_test")

    # Dense part: tabular features side-by-side with word2vec vectors.
    X_train = np.column_stack((X_train.values, feature_train))
    X_test = np.column_stack((X_test.values, feature_test))

    scaler = sklearn.preprocessing.StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Prepend the sparse tf-idf block; keep CSR so the estimator is happy.
    X_train = scipy.sparse.hstack((text_train_tfidf, X_train), format="csr")
    X_test = scipy.sparse.hstack((text_test_tfidf, X_test), format="csr")

    model = sklearn.linear_model.LogisticRegression(C=0.7, penalty="l2")

    result = make_predictions(model, X_train, target, X_test)
    io.save_result(test["PostId"], result)
    return result
def main():
    """Entry point: load data, preprocess, ensemble saved models, save result.

    NOTE(review): this file defines ``main`` more than once; in Python the
    later definition wins, so confirm which variant is intended to run.
    """
    train, test, target, test_index = io.load_data()
    train, test, target = fe.preprocess_data(train, test, target, preprocess_type=1)

    # Tuning runs kept for reference; disabled in production.
    #tuning.parametr_tuning(train, target, param_grid={})
    #tuning.ensemble_tuning(train, test, target, load_list=["xgb1"])

    # Blend the pre-trained models identified by these tags.
    result = make_predictions(load_list=["xgb1", "xgb2", "ext1"])
    io.save_result(test_index, result)
def main():
    """Entry point: fit a single random forest on preprocessed data and save
    its positive-class probabilities.

    NOTE(review): this file defines ``main`` more than once; in Python the
    later definition wins, so confirm which variant is intended to run.
    """
    train, test, target, test_index = io.load_data()
    train, test, target = fe.preprocess_data(train, test, target)

    # Tuning runs kept for reference; disabled in production.
    #tuning.tune_xgboost(train, target, load_list=[])
    #tuning.parametr_tuning(train, target, param_grid={})
    #tuning.ensemble_tuning(train, target, load_list=[])

    model = sklearn.ensemble.RandomForestClassifier(
        n_estimators=2000,
        max_depth=8,
        criterion="entropy",
        bootstrap=False,
        min_samples_leaf=4,
        min_samples_split=2,
        random_state=1234,
    )
    model.fit(train, target)
    # Column 1 of predict_proba is the positive-class probability.
    result = model.predict_proba(test)[:, 1]

    # Disabled alternative: ensemble of saved models.
    # result = make_predictions(train, target, test, load_list=["rf_entropy", "xgb"])

    io.save_result(test_index, result)
# NOTE(review): this line is a whitespace-mangled paste. It begins with the
# keyword-argument tail of a function call/definition whose opening `def` is
# not visible in this file view (apparently a neural-net member of a
# three-model rank-averaged ensemble: 0.21/0.47/0.32 blend of pred1-3, with
# per-fold pickling under final_models/nn/), followed by top-level driver
# code and a triple-quoted, effectively commented-out submission block.
# The fragment cannot be safely reconstructed without the missing head —
# left byte-identical; recover the original formatting from version control.
update_momentum=0.9, eval_size=0.01, verbose=0, max_epochs=100, use_label_encoder=True) model3.fit(train, target) pickle.dump(model3, open("final_models/nn/nn_n_"+str(i)+".pkl", "wb")) else: model3 = pickle.load(open("final_models/nn/nn_n_"+str(i)+".pkl", "rb")) pred3 += model3.predict_proba(test)[:, 1] pred3 /= 10 if ranking: pred1 = scipy.stats.rankdata(pred1) pred2 = scipy.stats.rankdata(pred2) pred3 = scipy.stats.rankdata(pred3) result = 0.21*pred1 + 0.47*pred2 + 0.32*pred3 return result train, test, target, test_index = io.load_data() train, test, target = fe.preprocess_data(train, test, target) tuning.parametr_tuning(train, target, param_grid={"alpha": [0.01]}) #tuning.ensemble_tuning(train, target, ranking=True, load_list=["linear", "xgb"]) """ result = make_predictions(train, target, test, ranking=True, load_list=["linear", "xgb", "nn"]) io.save_result(test_index, result) """
# NOTE(review): this line is a whitespace-mangled paste. It opens with the
# tail of a `linear_model` definition (duplicating the ending of the full
# version earlier in this file), then a `make_dirs` helper that creates the
# "input"/"output"/"w2v"/"metafeatures" directories, then top-level driver
# code that blends rf_model and linear_model predictions 0.7/0.3 and saves
# them. Because the dangling tail, the helper and the script share one
# physical line, they cannot be split or restyled independently — left
# byte-identical; recover the original formatting from version control.
X_test = scaler.transform(X_test) X_train = scipy.sparse.hstack((text_train_tfidf, X_train), format="csr") X_test = scipy.sparse.hstack((text_test_tfidf, X_test), format="csr") model = sklearn.linear_model.LogisticRegression(C=0.7, penalty="l2") result = make_predictions(model, X_train, target, X_test) io.save_result(test["PostId"], result) return result def make_dirs(dir_names): for name in dir_names: if not os.path.exists(name): os.makedirs(name) dir_names = ["input", "output", "w2v", "metafeatures"] make_dirs(dir_names) train, target, test = io.load_data() text_train_tfidf, text_test_tfidf = get_tfidf(train, test) preds1 = rf_model(train, target, test, text_train_tfidf, text_test_tfidf) preds2 = linear_model(train, target, test, text_train_tfidf, text_test_tfidf) result = 0.7*preds1 + 0.3*preds2 io.save_result(test["PostId"], result)