def rf_model(train, target, test, text_train_tfidf, text_test_tfidf):
    """Predict with a random forest over scaled features and two metafeatures.

    The feature table produced by the ``fe`` helpers is extended with two
    extra columns, ``"LinearModelText"`` and ``"w2vModelRFText"``, obtained
    from ``mf.linear_model_as_feature`` and ``mf.w2v_model_as_feature``.
    The table is then standardized and handed to ``make_predictions``
    together with a ``RandomForestClassifier``.

    Args:
        train: Training table; indexed by the columns ``"Title"`` and
            ``"BodyMarkdown"`` (presumably a pandas DataFrame -- confirm
            against the caller).
        target: Training labels, forwarded unchanged to the helpers.
        test: Test table with the same text columns plus ``"PostId"``.
        text_train_tfidf: Train text matrix passed to the linear-model
            metafeature helper.
        text_test_tfidf: Test text matrix passed to the same helper.

    Returns:
        Whatever ``make_predictions`` returns for the test rows. The same
        value is also passed to ``io.save_result`` along with
        ``test["PostId"]``.
    """

    def _title_and_body(frame):
        # One text value per row: title, then ". ", then the markdown body.
        return frame["Title"].values + ". " + frame["BodyMarkdown"].values

    raw_text_train = _title_and_body(train)
    raw_text_test = _title_and_body(test)

    print("Creating word2vec model...")
    w2v.make_word2vec_model(raw_text_train, raw_text_test)
    # NOTE(review): load=False presumably forces recomputation instead of
    # reading cached output -- confirm in the w2v/mf helpers.
    vectors_train, vectors_test = w2v.word2vec_features(
        raw_text_train, raw_text_test, load=False
    )

    # Feature pipeline from the fe helpers, train first and then test.
    feats_train = fe.extract_features(train)
    feats_test = fe.extract_features(test)
    feats_train, feats_test = fe.categories_to_counters(
        feats_train, feats_test, target
    )
    feats_train, feats_test = fe.transform_features(feats_train, feats_test)

    print("Creating linear model metafeature...")
    linear_meta_train, linear_meta_test = mf.linear_model_as_feature(
        text_train_tfidf, target, text_test_tfidf, load=False
    )
    feats_train["LinearModelText"] = linear_meta_train
    feats_test["LinearModelText"] = linear_meta_test

    print("Creating word2vec model metafeature...")
    w2v_meta_train, w2v_meta_test = mf.w2v_model_as_feature(
        vectors_train, target, vectors_test, load=False, model_to_train="rf"
    )
    feats_train["w2vModelRFText"] = w2v_meta_train
    feats_test["w2vModelRFText"] = w2v_meta_test

    # Scaling statistics come from the training rows only and are reused
    # for the test rows.
    standardizer = sklearn.preprocessing.StandardScaler()
    scaled_train = standardizer.fit_transform(feats_train)
    scaled_test = standardizer.transform(feats_test)

    forest_params = {
        "criterion": "entropy",
        "max_depth": 14,
        "n_estimators": 2000,
        "min_samples_leaf": 4,
        "min_samples_split": 16,
        "n_jobs": 4,
        "random_state": 1234,
    }
    forest = sklearn.ensemble.RandomForestClassifier(**forest_params)

    predictions = make_predictions(forest, scaled_train, target, scaled_test)
    io.save_result(test["PostId"], predictions)
    return predictions
def linear_model(train, target, test, text_train_tfidf, text_test_tfidf):
    """Predict with logistic regression over text and scaled dense features.

    The feature table produced by the ``fe`` helpers is widened with arrays
    loaded from ``"w2v/word2vec_feature_train"`` and
    ``"w2v/word2vec_feature_test"``, then standardized. The result is
    appended to the right of the supplied text matrices as CSR sparse
    matrices and passed to ``make_predictions`` with a
    ``LogisticRegression`` model.

    Args:
        train: Training table handed to ``fe.extract_features``.
        target: Training labels, forwarded unchanged to the helpers.
        test: Test table; also indexed by ``"PostId"`` when saving.
        text_train_tfidf: Train text matrix, placed first in the stacked
            training matrix.
        text_test_tfidf: Test text matrix, placed first in the stacked
            test matrix.

    Returns:
        Whatever ``make_predictions`` returns for the test rows. The same
        value is also passed to ``io.save_result`` along with
        ``test["PostId"]``.
    """
    # Feature pipeline from the fe helpers, train first and then test.
    dense_train = fe.extract_features(train)
    dense_test = fe.extract_features(test)
    dense_train, dense_test = fe.categories_to_counters(
        dense_train, dense_test, target
    )
    dense_train, dense_test = fe.transform_features(dense_train, dense_test)

    # Both paths are relative, so the files must be reachable from the
    # current working directory.
    w2v_column_train = np.load("w2v/word2vec_feature_train")
    w2v_column_test = np.load("w2v/word2vec_feature_test")

    # .values is read here, so the fe output is presumably a pandas
    # DataFrame -- confirm against fe.transform_features.
    dense_train = np.column_stack((dense_train.values, w2v_column_train))
    dense_test = np.column_stack((dense_test.values, w2v_column_test))

    # Only the dense columns are standardized; the text matrices are
    # stacked in unscaled.
    standardizer = sklearn.preprocessing.StandardScaler()
    dense_train = standardizer.fit_transform(dense_train)
    dense_test = standardizer.transform(dense_test)

    design_train = scipy.sparse.hstack(
        (text_train_tfidf, dense_train), format="csr"
    )
    design_test = scipy.sparse.hstack(
        (text_test_tfidf, dense_test), format="csr"
    )

    classifier = sklearn.linear_model.LogisticRegression(C=0.7, penalty="l2")
    predictions = make_predictions(classifier, design_train, target, design_test)
    io.save_result(test["PostId"], predictions)
    return predictions