Пример #1
0
def rf_model(train, target, test, text_train_tfidf, text_test_tfidf):
    """Fit a RandomForest on engineered, word2vec and stacked metafeatures.

    Trains on `train`/`target`, predicts probabilities for `test`, writes the
    submission via io.save_result, and returns the prediction array.
    """
    # Raw document text for the word2vec pipeline: title then body.
    full_text_train = train["Title"].values + ". " + train["BodyMarkdown"].values
    full_text_test = test["Title"].values + ". " + test["BodyMarkdown"].values

    print("Creating word2vec model...")
    w2v.make_word2vec_model(full_text_train, full_text_test)
    wv_train, wv_test = w2v.word2vec_features(full_text_train, full_text_test, load=False)

    # Hand-crafted features, then target-based counters and transforms.
    features_train = fe.extract_features(train)
    features_test = fe.extract_features(test)
    features_train, features_test = fe.categories_to_counters(features_train, features_test, target)
    features_train, features_test = fe.transform_features(features_train, features_test)

    # Stacked metafeatures: predictions of simpler models added as columns.
    print("Creating linear model metafeature...")
    features_train["LinearModelText"], features_test["LinearModelText"] = mf.linear_model_as_feature(
        text_train_tfidf, target, text_test_tfidf, load=False)
    print("Creating word2vec model metafeature...")
    features_train["w2vModelRFText"], features_test["w2vModelRFText"] = mf.w2v_model_as_feature(
        wv_train, target, wv_test, load=False, model_to_train="rf")

    # Standardize columns; the scaler is fit on train only.
    scaler = sklearn.preprocessing.StandardScaler()
    features_train = scaler.fit_transform(features_train)
    features_test = scaler.transform(features_test)

    model = sklearn.ensemble.RandomForestClassifier(criterion="entropy", max_depth=14, n_estimators=2000,
                                                    min_samples_leaf=4, min_samples_split=16, n_jobs=4, random_state=1234)

    result = make_predictions(model, features_train, target, features_test)
    io.save_result(test["PostId"], result)

    return result
Пример #2
0
def linear_model(train, target, test, text_train_tfidf, text_test_tfidf):
    """Logistic regression over sparse tf-idf text plus scaled dense features.

    Loads precomputed word2vec document vectors from disk, stacks them with
    engineered features, scales, concatenates with the tf-idf matrices, fits
    the model, saves the submission, and returns the predictions.
    """
    # Engineered dense features for both splits.
    dense_train = fe.extract_features(train)
    dense_test = fe.extract_features(test)
    dense_train, dense_test = fe.categories_to_counters(dense_train, dense_test, target)
    dense_train, dense_test = fe.transform_features(dense_train, dense_test)

    # Word2vec document vectors precomputed by an earlier pipeline stage.
    w2v_train = np.load("w2v/word2vec_feature_train")
    w2v_test = np.load("w2v/word2vec_feature_test")

    # Append the word2vec columns to the dense feature matrix.
    dense_train = np.column_stack((dense_train.values, w2v_train))
    dense_test = np.column_stack((dense_test.values, w2v_test))

    # Scale the dense block; scaler statistics come from train only.
    scaler = sklearn.preprocessing.StandardScaler()
    dense_train = scaler.fit_transform(dense_train)
    dense_test = scaler.transform(dense_test)

    # Sparse tf-idf first, scaled dense block second.
    full_train = scipy.sparse.hstack((text_train_tfidf, dense_train), format="csr")
    full_test = scipy.sparse.hstack((text_test_tfidf, dense_test), format="csr")

    model = sklearn.linear_model.LogisticRegression(C=0.7, penalty="l2")

    result = make_predictions(model, full_train, target, full_test)
    io.save_result(test["PostId"], result)

    return result
Пример #3
0
def main():
    """Load and preprocess the data, blend saved models, write predictions."""
    train, test, target, test_index = io.load_data()
    train, test, target = fe.preprocess_data(train, test, target, preprocess_type=1)

    # Parameter/ensemble search helpers — enabled manually when tuning.
    #tuning.parametr_tuning(train, target, param_grid={})
    #tuning.ensemble_tuning(train, test, target, load_list=["xgb1"])

    predictions = make_predictions(load_list=["xgb1", "xgb2", "ext1"])
    io.save_result(test_index, predictions)
Пример #4
0
def main():
    """Fit a RandomForest on preprocessed data and save test probabilities."""
    train, test, target, test_index = io.load_data()
    train, test, target = fe.preprocess_data(train, test, target)

    # Tuning helpers — run manually when searching for parameters.
    #tuning.tune_xgboost(train, target, load_list=[])
    #tuning.parametr_tuning(train, target, param_grid={})
    #tuning.ensemble_tuning(train, target, load_list=[])

    clf = sklearn.ensemble.RandomForestClassifier(n_estimators=2000, max_depth=8, criterion="entropy", bootstrap=False,
                                                  min_samples_leaf=4, min_samples_split=2, random_state=1234)
    clf.fit(train, target)
    result = clf.predict_proba(test)[:, 1]

    # Disabled alternative: blend previously saved models instead.
    """
    result = make_predictions(train, target, test, load_list=["rf_entropy", "xgb"])
    """
    io.save_result(test_index, result)
Пример #5
0
                               update_momentum=0.9, eval_size=0.01, verbose=0,
                               max_epochs=100, use_label_encoder=True)

            # Train this bagging round's neural net and cache it to disk.
            model3.fit(train, target)
            pickle.dump(model3, open("final_models/nn/nn_n_"+str(i)+".pkl", "wb"))
        else:
            # Reuse a previously trained and pickled net instead of refitting.
            model3 = pickle.load(open("final_models/nn/nn_n_"+str(i)+".pkl", "rb"))
        # Accumulate positive-class probabilities across rounds.
        pred3 += model3.predict_proba(test)[:, 1]
    # Average the accumulated predictions — assumes 10 loop rounds; TODO confirm.
    pred3 /= 10


    # Optionally rank-transform each model's scores so the blend combines
    # rankings rather than raw probabilities.
    if ranking:
        pred1 = scipy.stats.rankdata(pred1)
        pred2 = scipy.stats.rankdata(pred2)
        pred3 = scipy.stats.rankdata(pred3)

    # Fixed-weight blend of the three models' predictions.
    result = 0.21*pred1 + 0.47*pred2 + 0.32*pred3

    return result


# Load and preprocess the competition data.
train, test, target, test_index = io.load_data()
train, test, target = fe.preprocess_data(train, test, target)

# Grid-search over a single alpha value.
tuning.parametr_tuning(train, target, param_grid={"alpha": [0.01]})
#tuning.ensemble_tuning(train, target, ranking=True, load_list=["linear", "xgb"])

# Disabled: blend the saved models and write the submission file.
"""
result = make_predictions(train, target, test, ranking=True, load_list=["linear", "xgb", "nn"])
io.save_result(test_index, result)
"""
Пример #6
0
    X_test = scaler.transform(X_test)


    # Concatenate sparse tf-idf text features with the scaled dense block.
    X_train = scipy.sparse.hstack((text_train_tfidf, X_train), format="csr")
    X_test = scipy.sparse.hstack((text_test_tfidf, X_test), format="csr")

    model = sklearn.linear_model.LogisticRegression(C=0.7, penalty="l2")

    # Fit/predict via the shared helper, then persist the submission.
    result = make_predictions(model, X_train, target, X_test)
    io.save_result(test["PostId"], result)

    return result

def make_dirs(dir_names):
    """Create every directory in `dir_names` (recursively) if missing.

    Args:
        dir_names: iterable of directory path strings.
    """
    for name in dir_names:
        # exist_ok=True avoids the check-then-create race of the previous
        # os.path.exists() guard and is a no-op for existing directories.
        os.makedirs(name, exist_ok=True)

# Working directories required by the pipeline.
dir_names = ["input", "output", "w2v", "metafeatures"]
make_dirs(dir_names)

# Load data and build the shared tf-idf text matrices.
train, target, test = io.load_data()
text_train_tfidf, text_test_tfidf = get_tfidf(train, test)
# Two base models trained on the same inputs.
preds1 = rf_model(train, target, test, text_train_tfidf,
                  text_test_tfidf)
preds2 = linear_model(train, target, test, text_train_tfidf,
                      text_test_tfidf)

# Fixed-weight blend (forest-heavy), then write the submission.
result = 0.7*preds1 + 0.3*preds2
io.save_result(test["PostId"], result)