def calculateMIC(dataFileArray, data_mark=None, neadNorm=False):
    """Compute the MIC (maximal information coefficient) between every
    feature column in the given data files and the students' final
    scores, then print (markdown-friendly) the features whose MIC is
    above the mean MIC, sorted descending.

    :param dataFileArray: iterable of data file names to process
    :param data_mark: data directory mark; defaults to DATA_MARK
    :param neadNorm: when True, normalize scores and feature columns
        before scoring (kwarg name kept for caller compatibility)
    """
    # Resolve the default once, instead of re-testing it on every
    # iteration as the original did.
    if data_mark is None:
        data_mark = DATA_MARK
    mic_map = {}
    for dataFileName in dataFileArray:
        _fileName = os.path.join(data_mark, dataFileName)
        student_data, headerArray = load_data_from_file(_fileName)
        _score_map = get_final_score_map()
        # Ground-truth score per student; column 0 is the student id.
        _score_array = [_score_map[_student_record[0]]
                        for _student_record in student_data]
        featureCount = len(headerArray) - 1
        if neadNorm:
            _score_array = normizeDataSet(_score_array)
        # NOTE: this computes MIC, not the Pearson coefficient the old
        # comment claimed.  A single MINE instance is reused; its state
        # is reset by every compute_score call.
        m = MINE()
        for index in range(1, featureCount + 1):
            dataArray = getOneColumn(student_data, index)
            if neadNorm:
                dataArray = normizeDataSet(dataArray)
            m.compute_score(dataArray, _score_array)
            mic_map[headerArray[index]] = m.mic()
    # Report only the features above the mean MIC, highest first.
    sorted_list = sorted(mic_map.items(), key=lambda i: i[1], reverse=True)
    threhold = np.mean(list(mic_map.values()))
    for header, value in sorted_list:
        if value > threhold:
            print(header, value)
def calculateMean(data_mark=None, target_features=("saveInterval", "score")):
    """Print, for each target feature, the mean value inside every
    grade bucket (students grouped by their fold-split final score).

    :param data_mark: data directory mark; defaults to DATA_MARK
    :param target_features: header names of the features to report;
        defaults to the previously hard-coded ("saveInterval", "score"),
        so existing callers are unaffected
    """
    if data_mark is None:
        data_mark = DATA_MARK
    _fileName = os.path.join(data_mark, "concatfeature")
    student_data, headerArray = load_data_from_file(_fileName)
    _score_map = get_final_score_map()
    # Ground-truth (fold) score per student; column 0 is the id.
    _score_array = [_score_map[_rec[0]] for _rec in student_data]
    featureCount = len(headerArray) - 1
    for index in range(1, featureCount + 1):
        if headerArray[index] in target_features:
            dataArray = getOneColumn(student_data, index)
            # Bucket the feature values by grade.
            value_map = {}
            for _score_index, _score in enumerate(_score_array):
                value_map.setdefault(_score, []).append(dataArray[_score_index])
            print(headerArray[index])
            for _i in range(SCORE_FOLD):
                print(_i, "%.2f" % np.array(value_map[_i]).mean())
def scoreAll():
    """Print a markdown table of in-sample linear-regression R^2 for
    every (algorithm, target, exam) single-feature data file.
    """
    examId_List = ["e1", "e2", "e3", "e4"]
    algorithm_List = ["EQ", "Watwin"]
    target_List = ["score", "finalscore"]
    print('|数据名称|r^2|')
    print('|-|-|')
    model = LinearRegression()
    # The id -> final-score map does not depend on the file being
    # scored, so fetch it once instead of once per file as before.
    final_score_map = get_final_score_map()
    for al in algorithm_List:
        for target in target_List:
            for eid in examId_List:
                dataFileName = eid + "-" + al + "-" + target
                student_data = getStudentData(al + "//" + dataFileName)
                scoreArray = [final_score_map[str(_line[0])]
                              for _line in student_data]
                scoreArray = np.array(scoreArray).reshape(len(scoreArray), 1)
                watwinArray = getOneColumn(student_data, 1)
                watwinArray = np.array(watwinArray).reshape(len(watwinArray), 1)
                # Fit and score on the same data: in-sample R^2.
                score = model.fit(watwinArray, scoreArray).score(
                    watwinArray, scoreArray)
                print("|", dataFileName, "|", score, "|")
def usLRtoPredictWithExpDef():
    """Fit a one-feature linear regression against expert-defined grades
    and print the in-sample accuracy (tolerance 0.5) as a markdown table.
    """
    examId_List = ["e1", "e2", "e3", "e4"]
    algorithm_List = ["EQ", "Watwin"]
    target_List = ["finalscore"]
    scoreMap = getGradeMap()
    print('|数据名称|预测准确率|')
    print('|-|-|')
    for al in algorithm_List:
        for eid in examId_List:
            for target in target_List:
                dataFileName = eid + "-" + al + "-" + target
                student_data = getStudentData(al + "//" + dataFileName)
                # Column 2 holds the raw grade label; map it through the
                # expert-defined grade map.  (The original also fetched
                # get_final_score_map() here and pre-assigned an empty
                # scoreArray — both were dead code and are removed.)
                scoreArray = getOneColumn(student_data, 2)
                for index in range(len(scoreArray)):
                    scoreArray[index] = scoreMap[scoreArray[index]]
                scoreArray = np.array(scoreArray).reshape(len(scoreArray), 1)
                watwinArray = getOneColumn(student_data, 1)
                watwinArray = np.array(watwinArray).reshape(len(watwinArray), 1)
                _lr = LinearRegression(fit_intercept=True)
                _lr.fit(watwinArray, scoreArray)
                y_predicted = _lr.predict(watwinArray)
                print("|", dataFileName, "|",
                      getprecisionWithTorlerate(y_predicted, scoreArray, 0.5),
                      "|")
def load_routine_data(needHeader=False):
    """Load the routine feature matrix plus each student's final score.

    Relies on surrounding configuration names (exam_mark, data_name,
    featureSize, needSplitScore, spliteK, target_col, header) that are
    defined outside this function — presumably module-level; confirm
    against the enclosing file.

    :param needHeader: when True, also return the selected header names
    :return: (data_matrix, scoreArray) or
             (data_matrix, scoreArray, the_header)
    """
    datPath = os.path.join(DATA_ROOT_PATH, exam_mark, data_name)
    id_array = []
    data_matrix = []
    # Context manager guarantees the file handle is closed; the
    # original opened the file and never closed it.
    with open(datPath, "r") as infile:
        for _line in infile:
            _linedata = _line.rstrip("\n").split(",")
            # Column 0 is the user id; columns 1..featureSize are floats.
            id_array.append(_linedata[0])
            data_matrix.append(
                [float(_linedata[i]) for i in range(1, featureSize + 1)])
    scoreMap = get_final_score_map(None)
    scoreArray = [scoreMap[_id] for _id in id_array]
    if needSplitScore:
        scoreArray = split_score_to_k_fold(scoreArray, spliteK)
    # Keep only the configured target feature columns.
    data_matrix = np.array(data_matrix)[:, target_col]
    the_header = np.array(header)[target_col]
    if needHeader:
        return data_matrix, scoreArray, the_header
    return data_matrix, scoreArray
def useLRtoPredictScore(targetFileName, exam_mark=None, needNorm=True):
    """Fit a multi-feature linear regression on all feature columns and
    print in-sample precision (tolerances 0.5 / 1.5 / 2.5), the Spearman
    correlation and the R^2 against the final scores.

    :param targetFileName: data file name relative to exam_mark
    :param exam_mark: data directory mark; defaults to DATA_MARK
    :param needNorm: when True, normalize the feature matrix first
    """
    if exam_mark is None:
        exam_mark = DATA_MARK
    relative_path = os.path.join(exam_mark, targetFileName)
    student_data, headerArray = load_data_from_file(relative_path)
    score_lookup = get_final_score_map()
    # Ground-truth score per student; column 0 is the student id.
    _score_array = [score_lookup[record[0]] for record in student_data]
    feature_indices = list(range(1, len(headerArray)))
    featureMatrix = getSerevalColumn(student_data, feature_indices)
    if needNorm:
        featureMatrix = normizeMatrix(featureMatrix)
    _lr = LinearRegression(fit_intercept=True)
    _lr.fit(featureMatrix, _score_array)
    y_predicted = _lr.predict(featureMatrix)
    print()
    print(getprecisionWithTorlerate(y_predicted, _score_array, 0.5),
          getprecisionWithTorlerate(y_predicted, _score_array, 1.5),
          getprecisionWithTorlerate(y_predicted, _score_array, 2.5),
          spearmanr(y_predicted, _score_array),
          r2_score(_score_array, y_predicted))
def getData(data_name):
    """Return labelled data for *data_name*.

    :return: (data matrix, header, score array) where the score array is
        aligned row-for-row with the matrix via the id in column 0.
    """
    score_lookup = get_final_score_map()
    data_matrix, data_header = getStudentDataWithHeader(data_name)
    # Row id (column 0) -> final score, one entry per matrix row.
    scores = [score_lookup[row[0]] for row in data_matrix]
    return data_matrix, data_header, scores
def drawDataWithScorePic(dataFileName, needNorminize=False, out_mark=None):
    """Draw a scatter plot of every feature against the final score and
    save each figure under OUT_ROOT_PATH/DATA_MARK/scatterWithScore/.

    :param dataFileName: data file (relative to DATA_MARK) to plot
    :param needNorminize: when True, normalize both the scores and each
        feature column before plotting.  (The old docstring documented a
        nonexistent parameter "needNormize".)
    :param out_mark: optional suffix appended to the saved file name
    """
    _fileName = os.path.join(DATA_MARK, dataFileName)
    student_data, headerArray = load_data_from_file(_fileName)
    _score_map = get_final_score_map(None)
    # Ground-truth score per student; column 0 is the student id.
    _score_array = [_score_map[_rec[0]] for _rec in student_data]
    if needNorminize:
        _score_array = normizeDataSet(_score_array)
    # One scatter plot per feature column.
    for colomnIndex in range(1, len(headerArray)):
        data = getOneColumn(student_data, colomnIndex)
        if needNorminize:
            data = normizeDataSet(dataSetA=data)
        plot.scatter(_score_array, data, s=2)
        title = headerArray[colomnIndex] + "-score"
        if needNorminize:
            title += "-nominized"
        plot.title(title)
        plot.xlabel("score")
        plot.ylabel(headerArray[colomnIndex])
        parentPath = OUT_ROOT_PATH + "/" + DATA_MARK + "/scatterWithScore/"
        checkThenMkdirs(parentPath)
        if out_mark is not None:
            title += "-" + out_mark
        plot.savefig(parentPath + title)
        # Clear the figure so plots do not accumulate across features.
        plot.clf()
def calculateT(data_mark=None):
    """For every feature, run pairwise t-tests between grade buckets and
    print the bucket pairs whose means differ significantly (p <= 0.05).
    Levene's test selects the equal/unequal-variance t-test variant.

    :param data_mark: data directory mark; defaults to DATA_MARK
    """
    if data_mark is None:
        data_mark = DATA_MARK
    file_path = os.path.join(data_mark, "concatfeature")
    student_data, headerArray = load_data_from_file(file_path)
    score_lookup = get_final_score_map()
    # Grade per student; column 0 of each record is the student id.
    grades = [score_lookup[rec[0]] for rec in student_data]
    for index in range(1, len(headerArray)):
        column = getOneColumn(student_data, index)
        # Split the feature values into one bucket per grade.
        buckets = {}
        for pos, grade in enumerate(grades):
            buckets.setdefault(grade, []).append(column[pos])
        print(headerArray[index])
        for _i in range(SCORE_FOLD):
            for _j in range(_i + 1, SCORE_FOLD):
                a, b = buckets[_i], buckets[_j]
                # Levene's test decides the equal-variance assumption.
                _, p = levene(a, b)
                t_value, p_value = ttest_ind(a, b, equal_var=(p > 0.05))
                if p_value <= 0.05:
                    print(_i, _j)
def usePearsonrCalAll():
    """Print a markdown table of the Pearson correlation (and p-value)
    between the single feature column and the final score for every
    (algorithm, target, exam) data file.
    """
    examId_List = ["e1", "e2", "e3", "e4"]
    algorithm_List = ["EQ", "Watwin"]
    target_List = ["score", "finalscore"]
    print('|数据名称|相关系数|p|')
    print('|-|-|-|')
    # The score map is file-independent: fetch it once instead of once
    # per file as the original did.
    final_score_map = get_final_score_map()
    for al in algorithm_List:
        for target in target_List:
            for eid in examId_List:
                dataFileName = eid + "-" + al + "-" + target
                student_data = getStudentData(al + "//" + dataFileName)
                scoreArray = [final_score_map[str(_line[0])]
                              for _line in student_data]
                watwinArray = getOneColumn(student_data, 1)
                c, p = pearsonr(scoreArray, watwinArray)
                print("|", dataFileName, "|", c, "|", p, "|")
def calculateF(data_mark=None):
    """One-way ANOVA per feature across the grade buckets; print the
    features whose bucket means differ significantly (p <= 0.05).
    Features failing Levene's homogeneity-of-variance test are skipped
    silently, as in the original.

    :param data_mark: data directory mark; defaults to DATA_MARK
    """
    if data_mark is None:
        data_mark = DATA_MARK
    file_path = os.path.join(data_mark, "concatfeature")
    student_data, headerArray = load_data_from_file(file_path)
    score_lookup = get_final_score_map()
    # Grade per student; column 0 of each record is the student id.
    grades = [score_lookup[rec[0]] for rec in student_data]
    for index in range(1, len(headerArray)):
        column = getOneColumn(student_data, index)
        # Group the feature values by grade.
        buckets = {}
        for pos, grade in enumerate(grades):
            buckets.setdefault(grade, []).append(column[pos])
        groups = list(buckets.values())
        _, p = levene(*groups)
        if p <= 0.05:
            # Variances not homogeneous: ANOVA assumptions violated.
            continue
        f, p = f_oneway(*groups)
        if p <= 0.05:
            print(headerArray[index], f, p)
def calculateSpearman(dataFileName, data_mark=None, neadNorm=False):
    """Print the Spearman rank correlation and its p-value between every
    feature column and the students' final scores.

    :param dataFileName: data file (relative to data_mark) to analyse
    :param data_mark: data directory mark; defaults to DATA_MARK
    :param neadNorm: normalize scores and features first (kwarg name
        kept for caller compatibility)
    """
    if data_mark is None:
        data_mark = DATA_MARK
    _fileName = os.path.join(data_mark, dataFileName)
    student_data, headerArray = load_data_from_file(_fileName)
    _score_map = get_final_score_map()
    # Ground-truth score per student; column 0 is the student id.
    _score_array = [_score_map[_rec[0]] for _rec in student_data]
    featureCount = len(headerArray) - 1
    if neadNorm:
        _score_array = normizeDataSet(_score_array)
    for index in range(1, featureCount + 1):
        dataArray = getOneColumn(student_data, index)
        if neadNorm:
            dataArray = normizeDataSet(dataArray)
        # spearmanr returns (correlation, pvalue); the original bound
        # the correlation to a local misleadingly named "pValue".
        correlation, p = spearmanr(dataArray, _score_array)
        print(headerArray[index], correlation, p)
def usLRtoPredict():
    """One-feature linear regression per data file; print the in-sample
    precision at tolerances 0.5 / 1.5 / 2.5, the R^2 and the Spearman
    correlation, one markdown row per file.
    """
    examId_List = ["e1", "e2", "e3", "e4"]
    algorithm_List = ["EQ", "Watwin"]
    target_List = ["finalscore"]
    print('|数据名称|预测|5分|10分|')
    # NOTE(review): the separator row does not match the 4-column header
    # and the data rows print 6 fields — the markdown table is ragged.
    # Left unchanged here because it is runtime output.
    print('|-|-|-|')
    # File-independent lookup: fetch once instead of once per file.
    final_score_map = get_final_score_map()
    for al in algorithm_List:
        for eid in examId_List:
            for target in target_List:
                dataFileName = eid + "-" + al + "-" + target
                student_data = getStudentData(al + "//" + dataFileName)
                scoreArray = [final_score_map[str(_line[0])]
                              for _line in student_data]
                scoreArray = np.array(scoreArray).reshape(len(scoreArray), 1)
                watwinArray = getOneColumn(student_data, 1)
                watwinArray = np.array(watwinArray).reshape(len(watwinArray), 1)
                _lr = LinearRegression(fit_intercept=True)
                _lr.fit(watwinArray, scoreArray)
                y_predicted = _lr.predict(watwinArray)
                print("|", dataFileName, "|",
                      getprecisionWithTorlerate(y_predicted, scoreArray, 0.5), "|",
                      getprecisionWithTorlerate(y_predicted, scoreArray, 1.5), "|",
                      getprecisionWithTorlerate(y_predicted, scoreArray, 2.5), "|",
                      r2_score(scoreArray, y_predicted), "|",
                      spearmanr(y_predicted, scoreArray))
def usLRtoPredictWithKFold():
    """10-fold cross-validated one-feature linear regression per data
    file; print the mean fold accuracy (tolerance 0.5), mean precision
    within 1.5 and 2.5, and the mean R^2, one markdown row per file.
    """
    examId_List = ["e1", "e2", "e3", "e4"]
    algorithm_List = ["EQ", "Watwin"]
    target_List = ["finalscore"]
    print('|数据名称|预测|5分|10分|')
    print('|-|-|-|')
    # File-independent lookup: fetch once instead of once per file.
    final_score_map = get_final_score_map()
    for al in algorithm_List:
        for eid in examId_List:
            for target in target_List:
                dataFileName = eid + "-" + al + "-" + target
                student_data = getStudentData(al + "//" + dataFileName)
                scoreArray = [final_score_map[str(_line[0])]
                              for _line in student_data]
                scoreArray = np.array(scoreArray).reshape(len(scoreArray), 1)
                watwinArray = getOneColumn(student_data, 1)
                watwinArray = np.array(watwinArray).reshape(len(watwinArray), 1)
                kf = KFold(n_splits=10, shuffle=True)
                accurate_array, within_5_array = [], []
                within_10_array, r_2_array = [], []
                for train_idx, test_idx in kf.split(watwinArray):
                    # Numpy fancy indexing replaces the per-index
                    # append loops of the original.
                    X_train, X_test = watwinArray[train_idx], watwinArray[test_idx]
                    y_train, y_test = scoreArray[train_idx], scoreArray[test_idx]
                    _lr = LinearRegression(fit_intercept=True)
                    _lr.fit(X_train, y_train)
                    y_predicted = _lr.predict(X_test)
                    # Argument order unified to (predicted, truth); the
                    # original mixed both orders across the three calls.
                    accurate_array.append(
                        getprecisionWithTorlerate(y_predicted, y_test, 0.5))
                    within_5_array.append(
                        getprecisionWithTorlerate(y_predicted, y_test, 1.5))
                    within_10_array.append(
                        getprecisionWithTorlerate(y_predicted, y_test, 2.5))
                    r_2_array.append(r2_score(y_test, y_predicted))
                print("|", dataFileName, "|",
                      np.array(accurate_array).mean(), "|",
                      np.array(within_5_array).mean(), "|",
                      np.array(within_10_array).mean(), "|",
                      np.array(r_2_array).mean())