Exemplo n.º 1
0
def cor_of_features(features, scores):
    """Correlate every feature column with the scores.

    Args:
        features:
            The matrix of feature values, shape (M, N), where M is the number
            of samples; features[i][j] is the value of feature j for sample i.
            The first ten columns must follow this order:
            ['1gram', '2gram', '3gram', '4gram', 'lengthratio', 'lsagrade',
             'vec', 'fluency', 'np', 'vp'].
            Any columns beyond the tenth are ignored; fewer than ten columns
            makes the column lookup fail with an IndexError.
        scores:
            The scores of the M samples. They are handed to pearson_cor
            unchanged as its first argument (historically documented as a
            matrix of shape (M, 1) -- TODO confirm what pearson_cor accepts).

    Returns:
        A dict whose keys are the feature names listed above and whose values
        are the results of pearson_cor rounded to 4 decimal places.
        The dict is also printed to stdout before being returned.
    """
    feature_names = (
        '1gram', '2gram', '3gram', '4gram', 'lengthratio', 'lsagrade',
        'vec', 'fluency', 'np', 'vp',
    )
    features_arr = np.asarray(features)
    # Column `col` of the matrix holds the values of feature `name`.
    cors = {
        name: round(pearson_cor(scores, features_arr[:, col]), 4)
        for col, name in enumerate(feature_names)
    }
    print(cors)
    return cors
Exemplo n.º 2
0
    # selectionKBest.fit(X_rg_scaled, y_rg)
    # print(selectionKBest.scores_)
    # X_rg_selected = selectionKBest.transform(X_rg_scaled)
    # print(X_rg_selected.shape)
    rfr, test_scores = random_forest(X_rg_scaled, y_rg, cv=cv)
    print(rfr)
    print("test scores:", test_scores)

    X_train, X_test, y_train, y_test = train_test_split(X_rg_scaled,
                                                        y_rg,
                                                        test_size=0.2,
                                                        random_state=42)
    rfr.fit(X_train, y_train)
    print(rfr.feature_importances_)
    print("number of features:", rfr.n_features_)
    print("pearson:", pearson_cor(y_test, rfr.predict(X_test)))

    # print("s:", s, "n:", n, "test_scores", test_scores)
    #         if test_scores[-1] > max_score:
    #             best_feature_num = n
    #             max_score = max(max_score, test_scores[-1])
    #             best_selection = s
    # print("best selection:", best_selection)
    # print("selected feature number:", best_feature_num, "max test scores:", max_score)

    # classification model
    # X_clf, y = extract_data(conn, course="202英语二", features=features3)
    # scaler = MinMaxScaler()
    # X_clf_scaled = scaler.fit_transform(X_clf)
    # y_clf = [str(data) for data in y]
    # svc = classification(X_clf_scaled, y_clf, cv=cv)
Exemplo n.º 3
0
from feature import extract_features
import numpy as np
import math
from correlation import pearson_cor


def _bleu_from_features(feature_row):
    """Return the BLEU-style score computed from one feature row.

    Entries 0-3 of the row are used as the 1- to 4-gram precisions and
    entry 4 as the candidate/reference length ratio.
    """
    ratio = feature_row[4]  # candidate length / reference length
    if ratio == 0:
        return 0

    # Brevity penalty: 1 for ratios above 1, exp(1 - 1/ratio) otherwise.
    if ratio > 1:
        brevity_penalty = 1
    else:
        brevity_penalty = math.exp(1 - 1 / ratio)

    # A zero precision cannot be passed to log(); the score is 0 in that case.
    if feature_row[0] * feature_row[1] * feature_row[2] * feature_row[3] == 0:
        return 0

    # Geometric mean of the four precisions, computed in log space.
    log_precision_sum = sum(math.log(p) for p in feature_row[:4])
    return brevity_penalty * math.exp(log_precision_sum * 0.25)


feature_list, score_list = extract_features()
new_scores = np.array(score_list) / 2.0  # scores are halved before comparison
predict_scores = [_bleu_from_features(row) for row in feature_list]

print(new_scores)
print(predict_scores)
print(pearson_cor(new_scores, predict_scores))
Exemplo n.º 4
0
                        y_rg,
                        ylim=(0.0, 0.8),
                        cv=cv,
                        n_jobs=4,
                        scoring=score_func)
    plt.show()

    # X_train, X_test, y_train, y_test = train_test_split(X_rg_scaled, y_rg, test_size=0.33, random_state=42)
    X_train = X_rg_scaled[100:]
    y_train = y_rg[100:]
    X_test = X_rg_scaled[:100]
    y_test = y_rg[:100]
    best_svr.fit(X_train, y_train)

    y_predict = best_svr.predict(X_test)
    print(pearson_cor(y_test, y_predict))

    # conn = pymysql.connect(host="127.0.0.1",
    #                        database='essaydata',
    #                        port=3306,
    #                        user='******',
    #                        password='',
    #                        charset='utf8')
    # cur = conn.cursor()

    # sql = "SELECT textid FROM features WHERE 1gram=%s AND 2gram=%s AND 3gram=%s AND 4gram=%s"
    # sql1 = "SELECT textid, text FROM detection WHERE textid=%s"
    # for index in range(len(y_test)):
    #     if abs(y_test[index] - y_predict[index]) > 0.3:
    #
    #         print("target:", y_test[index], "predict:", y_predict[index])