def cor_of_features(features, scores):
    """Compute the Pearson correlation between each feature column and the scores.

    Args:
        features: Matrix of feature values with shape (M, N), where M is the
            number of samples and N is the number of features.
            features[i][j] is the value of feature j for sample i. Columns
            are expected in this order: '1gram', '2gram', '3gram', '4gram',
            'lengthratio', 'lsagrade', 'vec', 'fluency', 'np', 'vp'.
            If the matrix has fewer columns than names (e.g. only the first
            8 features), only the available columns are evaluated.
        scores: Scores of the M samples (documented by the original author
            as shape (M, 1)); passed unchanged to ``pearson_cor``.

    Returns:
        A dict mapping each evaluated feature name to its Pearson
        correlation with ``scores``, rounded to 4 decimal places.

    Raises:
        IndexError: If ``features`` is not at least two-dimensional.
    """
    feature_names = (
        '1gram', '2gram', '3gram', '4gram', 'lengthratio',
        'lsagrade', 'vec', 'fluency', 'np', 'vp',
    )
    features_arr = np.asarray(features)
    # Only look at columns that actually exist, so a matrix holding a prefix
    # of the feature set does not fail with an out-of-bounds column index.
    n_columns = features_arr.shape[1]

    cors = {}
    for index, name in enumerate(feature_names[:n_columns]):
        cors[name] = round(pearson_cor(scores, features_arr[:, index]), 4)

    print(cors)
    return cors
# Univariate feature selection (currently disabled):
# selectionKBest.fit(X_rg_scaled, y_rg)
# print(selectionKBest.scores_)
# X_rg_selected = selectionKBest.transform(X_rg_scaled)
# print(X_rg_selected.shape)

# Build the regressor through the project helper `random_forest`, which
# returns the estimator together with its test scores.
rfr, test_scores = random_forest(X_rg_scaled, y_rg, cv=cv)
print(rfr)
print("test scores:", test_scores)

# Refit on a fixed 80/20 split (random_state=42 keeps the split reproducible)
# and report the correlation between held-out targets and predictions.
X_train, X_test, y_train, y_test = train_test_split(
    X_rg_scaled, y_rg, test_size=0.2, random_state=42
)
rfr.fit(X_train, y_train)
print(rfr.feature_importances_)

# NOTE(review): assuming `rfr` is a scikit-learn forest estimator.
# scikit-learn deprecated `n_features_` in 1.0 and removed it in 1.2 in
# favour of `n_features_in_`; prefer the new attribute and only fall back
# to the old one on releases that predate it.
if hasattr(rfr, "n_features_in_"):
    n_features = rfr.n_features_in_
else:
    n_features = rfr.n_features_
print("number of features:", n_features)
print("pearson:", pearson_cor(y_test, rfr.predict(X_test)))

# Disabled bookkeeping for a search over selection strategies / feature counts:
# print("s:", s, "n:", n, "test_scores", test_scores)
# if test_scores[-1] > max_score:
#     best_feature_num = n
#     max_score = max(max_score, test_scores[-1])
#     best_selection = s
# print("best selection:", best_selection)
# print("selected feature number:", best_feature_num, "max test scores:", max_score)

# classification model (disabled)
# X_clf, y = extract_data(conn, course="202英语二", features=features3)
# scaler = MinMaxScaler()
# X_clf_scaled = scaler.fit_transform(X_clf)
# y_clf = [str(data) for data in y]
# svc = classification(X_clf_scaled, y_clf, cv=cv)
import math

import numpy as np

from feature import extract_features
from correlation import pearson_cor


def _bleu_from_features(row):
    """Return the BLEU-style score for one feature row.

    ``row[:4]`` are used as the four n-gram values (presumably the 1- to
    4-gram precisions) and ``row[4]`` as the length ratio
    (candidate length / reference length). The result is the integer 0 when
    the length ratio is zero or when any of the four n-gram values is zero.
    """
    ratio = row[4]  # candidate length / reference length
    if ratio == 0:
        return 0

    # Brevity penalty: candidates longer than the reference are not
    # penalised; shorter ones are scaled down exponentially.
    if ratio > 1:
        penalty = 1
    else:
        penalty = math.exp(1 - 1 / ratio)

    # A zero n-gram value would make the logarithm undefined, so the score
    # collapses to 0 in that case.
    if row[0] * row[1] * row[2] * row[3] == 0:
        return 0

    log_total = sum(math.log(value) for value in row[:4])
    return penalty * math.exp(log_total * 0.25)


feature_list, score_list = extract_features()

# The reference scores are halved before being compared with the BLEU values.
new_scores = np.array(score_list) / 2.0
predict_scores = [_bleu_from_features(row) for row in feature_list]

print(new_scores)
print(predict_scores)
print(pearson_cor(new_scores, predict_scores))
y_rg, ylim=(0.0, 0.8), cv=cv, n_jobs=4, scoring=score_func) plt.show() # X_train, X_test, y_train, y_test = train_test_split(X_rg_scaled, y_rg, test_size=0.33, random_state=42) X_train = X_rg_scaled[100:] y_train = y_rg[100:] X_test = X_rg_scaled[:100] y_test = y_rg[:100] best_svr.fit(X_train, y_train) y_predict = best_svr.predict(X_test) print(pearson_cor(y_test, y_predict)) # conn = pymysql.connect(host="127.0.0.1", # database='essaydata', # port=3306, # user='******', # password='', # charset='utf8') # cur = conn.cursor() # sql = "SELECT textid FROM features WHERE 1gram=%s AND 2gram=%s AND 3gram=%s AND 4gram=%s" # sql1 = "SELECT textid, text FROM detection WHERE textid=%s" # for index in range(len(y_test)): # if abs(y_test[index] - y_predict[index]) > 0.3: # # print("target:", y_test[index], "predict:", y_predict[index])