def useLRtoPredictScore(targetFileName, exam_mark=None, needNorm=True):
    """Fit a linear regression from the features in *targetFileName* to the
    final scores and print its quality on the training data: precision under
    three tolerances (0.5 / 1.5 / 2.5), Spearman correlation, and R^2.

    :param targetFileName: feature file name under the exam directory
    :param exam_mark: exam directory; defaults to DATA_MARK when None
    :param needNorm: normalize the feature matrix before fitting
    """
    if exam_mark is None:
        exam_mark = DATA_MARK
    _file_Relative_Path = os.path.join(exam_mark, targetFileName)
    student_data, headerArray = load_data_from_file(_file_Relative_Path)
    _score_map = get_final_score_map()
    # column 0 of each record is the student id; map it to the final score
    _score_array = [_score_map[_record[0]] for _record in student_data]
    # every column except the id column is a feature
    targetFeatureIndexArray = list(range(1, len(headerArray)))
    featureMatrix = getSerevalColumn(student_data, targetFeatureIndexArray)
    if needNorm:
        featureMatrix = normizeMatrix(featureMatrix)
    _lr = LinearRegression(fit_intercept=True)
    _lr.fit(featureMatrix, _score_array)
    # NOTE: quality is measured on the training set itself (no hold-out split)
    y_predicted = _lr.predict(featureMatrix)
    print()
    print(getprecisionWithTorlerate(y_predicted, _score_array, 0.5),
          getprecisionWithTorlerate(y_predicted, _score_array, 1.5),
          getprecisionWithTorlerate(y_predicted, _score_array, 2.5),
          spearmanr(y_predicted, _score_array),
          r2_score(_score_array, y_predicted))
def calculateMIC(dataFileArray, data_mark=None, neadNorm=False):
    """Compute the Maximal Information Coefficient (MIC) between every
    feature in the given files and the final score, then print the features
    whose MIC exceeds the mean MIC, sorted descending.

    :param dataFileArray: feature file names under *data_mark*
    :param data_mark: exam directory; defaults to DATA_MARK when None
    :param neadNorm: normalize features and scores before computing MIC
    """
    # hoisted out of the loop (the original re-checked it per file)
    if data_mark is None:
        data_mark = DATA_MARK
    mic_map = {}
    for dataFileName in dataFileArray:
        _fileName = os.path.join(data_mark, dataFileName)
        student_data, headerArray = load_data_from_file(_fileName)
        _score_map = get_final_score_map()
        _score_array = [_score_map[_record[0]] for _record in student_data]
        featureCount = len(headerArray) - 1
        if neadNorm:
            _score_array = normizeDataSet(_score_array)
        # compute MIC per feature column (column 0 is the student id)
        m = MINE()
        for index in range(1, featureCount + 1):
            dataArray = getOneColumn(student_data, index)
            if neadNorm:
                dataArray = normizeDataSet(dataArray)
            m.compute_score(dataArray, _score_array)
            mic_map[headerArray[index]] = m.mic()
    sorted_list = sorted(mic_map.items(), key=lambda item: item[1], reverse=True)
    # only report features whose MIC is above the average
    threshold = np.mean(list(mic_map.values()))
    for header, value in sorted_list:
        if value > threshold:
            print(header, value)
def calculateMean(data_mark=None):
    """For the features "saveInterval" and "score", print the mean feature
    value per final-score grade (0 .. SCORE_FOLD-1).

    :param data_mark: exam directory; defaults to DATA_MARK when None
    """
    if data_mark is None:
        data_mark = DATA_MARK
    _fileName = os.path.join(data_mark, "concatfeature")
    student_data, headerArray = load_data_from_file(_fileName)
    _score_map = get_final_score_map()
    _score_array = [_score_map[_record[0]] for _record in student_data]
    featureCount = len(headerArray) - 1
    for index in range(1, featureCount + 1):
        if headerArray[index] not in ("saveInterval", "score"):
            continue
        dataArray = getOneColumn(student_data, index)
        # group the feature values by final-score grade
        value_map = {}
        for _score_index, _score in enumerate(_score_array):
            value_map.setdefault(_score, []).append(dataArray[_score_index])
        print(headerArray[index])
        for _i in range(SCORE_FOLD):
            # guard: a grade may have no students (original raised KeyError)
            if _i in value_map:
                print(_i, "%.2f" % np.array(value_map[_i]).mean())
def checkCodeTime():
    """Scan the "codeCount-15-t2" file of each exam and print the ids of
    students whose fourth column is at most 300."""
    for exam in ("exam1", "exam2", "exam3", "exam4"):
        file_path = os.path.join(exam, "codeCount-15-t2")
        records, _header = load_data_from_file(file_path)
        print(exam)
        flagged_ids = [record[0] for record in records if record[3] <= 300]
        for student_id in flagged_ids:
            print(student_id)
        print()
def concatFeature(file_name_array, feature_name_array, mark):
    """Join the selected features from several feature files into a single
    "concatfeature" file (one row per student), keeping only students that
    have a value for every requested feature.

    :param file_name_array: feature file names under *mark*
    :param feature_name_array: names of the features to keep, in output order
    :param mark: exam directory
    """
    data_map = {}
    feature_type_map = {}
    for file_name in file_name_array:
        target_source = os.path.join(mark, file_name)
        student_data, headerarray, headerTypeArray = load_data_from_file(
            target_source, needType=True)
        for index, featureName in enumerate(headerarray):
            if featureName not in feature_name_array:
                continue
            feature_type_map[featureName] = headerTypeArray[index]
            for student_record in student_data:
                uid = int(student_record[0])
                if uid not in data_map:
                    data_map[uid] = {}
                data_map[uid][featureName] = student_record[index]
    # BUG FIX: the original called sorted(records) and discarded the result,
    # so rows were written in dict order; sort by uid explicitly.
    records = sorted(data_map.items())
    parentPath = DATA_ROOT_PATH + "/" + mark + "/"
    checkThenMkdirs(parentPath)
    output_filePath = parentPath + "concatfeature"
    # "with" guarantees the file is closed even if a write fails
    with open(output_filePath, "w") as output_file:
        # header row: uId,<feature names...>
        output_file.write("uId")
        for featureName in feature_name_array:
            output_file.write(",")
            output_file.write(featureName)
        output_file.write("\n")
        # type row: String,<feature types...>
        output_file.write("String")
        for featureName in feature_name_array:
            output_file.write(",")
            output_file.write(feature_type_map[featureName])
        output_file.write("\n")
        # only students with every requested feature are emitted
        for uid, valueMap in records:
            if len(valueMap) == len(feature_name_array):
                output_file.write(str(uid))
                for featureName in feature_name_array:
                    output_file.write(",")
                    output_file.write(str(valueMap[featureName]))
                output_file.write("\n")
def drawDataDistribution(dataFileName, data_mark=None):
    """Draw a bar-chart histogram with _BOX_COUNT equal-width bins for every
    feature column of *dataFileName*, saving each figure under
    OUT_ROOT_PATH/<data_mark>/distribution/.

    :param dataFileName: feature file name under *data_mark*
    :param data_mark: exam directory; defaults to DATA_MARK when None
    """
    if data_mark is None:
        data_mark = DATA_MARK
    _fileName = os.path.join(data_mark, dataFileName)
    student_data, headerArray = load_data_from_file(_fileName)
    featureCount = len(headerArray) - 1
    for colomnIndex in range(1, featureCount + 1):
        data = getOneColumn(student_data, colomnIndex)
        # renamed: the original shadowed the max()/min() builtins
        _max, _min = getMaxAndMin(data)
        boxWidth = (_max - _min) / _BOX_COUNT
        x_tags = []
        rightBorders = []
        _left = _right = _min
        for _index in range(_BOX_COUNT):
            _left = _right
            _right += boxWidth
            rightBorders.append(_right)
            x_tags.append("[%.2f,%.2f)" % (_left, _right))
        x_counts = [0] * _BOX_COUNT
        for _value in data:
            for _index, _border in enumerate(rightBorders):
                if _value <= _border:
                    x_counts[_index] += 1
                    break
        # values missed by the float bin borders are counted in the last bin
        unTagCount = len(data) - sum(x_counts)
        x_counts[_BOX_COUNT - 1] += unTagCount
        xIndex = range(_BOX_COUNT)
        plot.bar(xIndex, x_counts)
        plot.xticks(xIndex, x_tags, rotation=10, fontsize=8)
        # annotate each bar with its count
        for _a, _b in zip(xIndex, x_counts):
            plot.text(_a, _b + 0.05, str(_b), ha='center', va='bottom')
        title = headerArray[colomnIndex]
        plot.title(title)
        parentPath = OUT_ROOT_PATH + "/" + data_mark + "/distribution/"
        checkThenMkdirs(parentPath)
        plot.savefig(parentPath + title)
        plot.clf()
def drawDataWithScorePic(dataFileName, needNorminize=False, out_mark=None):
    """Draw a scatter plot of every feature against the final score and save
    each figure under OUT_ROOT_PATH/<DATA_MARK>/scatterWithScore/.

    :param dataFileName: feature file name under DATA_MARK
    :param needNorminize: normalize both the scores and the feature values
    :param out_mark: optional suffix appended to the output file name
    """
    _fileName = os.path.join(DATA_MARK, dataFileName)
    student_data, headerArray = load_data_from_file(_fileName)
    _score_map = get_final_score_map(None)
    _score_array = [_score_map[_record[0]] for _record in student_data]
    if needNorminize:
        _score_array = normizeDataSet(_score_array)
    # iterate over every feature column (column 0 is the student id)
    for colomnIndex in range(1, len(headerArray)):
        data = getOneColumn(student_data, colomnIndex)
        if needNorminize:
            data = normizeDataSet(dataSetA=data)
        plot.scatter(_score_array, data, s=2)
        title = headerArray[colomnIndex] + "-score"
        if needNorminize:
            title += "-nominized"
        plot.title(title)
        plot.xlabel("score")
        plot.ylabel(headerArray[colomnIndex])
        parentPath = OUT_ROOT_PATH + "/" + DATA_MARK + "/scatterWithScore/"
        checkThenMkdirs(parentPath)
        if out_mark is not None:
            title += "-" + out_mark
        plot.savefig(parentPath + title)
        plot.clf()
def calculateT(data_mark=None):
    """For every feature, run pairwise t-tests between score grades and print
    the grade pairs whose means differ significantly (p <= 0.05).

    Levene's test is used first to decide whether equal variances may be
    assumed (Student's t) or not (Welch's t).

    :param data_mark: exam directory; defaults to DATA_MARK when None
    """
    if data_mark is None:
        data_mark = DATA_MARK
    _fileName = os.path.join(data_mark, "concatfeature")
    student_data, headerArray = load_data_from_file(_fileName)
    _score_map = get_final_score_map()
    _score_array = [_score_map[_record[0]] for _record in student_data]
    featureCount = len(headerArray) - 1
    for index in range(1, featureCount + 1):
        dataArray = getOneColumn(student_data, index)
        # group the feature values by final-score grade
        value_map = {}
        for _score_index, _score in enumerate(_score_array):
            value_map.setdefault(_score, []).append(dataArray[_score_index])
        print(headerArray[index])
        for _i in range(SCORE_FOLD):
            for _j in range(_i + 1, SCORE_FOLD):
                # guard: a grade may have no students (original raised KeyError)
                if _i not in value_map or _j not in value_map:
                    continue
                a = value_map[_i]
                b = value_map[_j]
                _l, p = levene(a, b)
                # Levene p <= 0.05 -> variances differ -> Welch's t-test
                t_value, p_value = ttest_ind(a, b, equal_var=(p > 0.05))
                if p_value <= 0.05:
                    print(_i, _j)
def calculateF(data_mark=None):
    """One-way ANOVA per feature across the score grades; print the features
    whose group means differ significantly (p <= 0.05).

    Levene's test runs first: when variances are not homogeneous the ANOVA
    is skipped, since its equal-variance assumption would be violated.

    :param data_mark: exam directory; defaults to DATA_MARK when None
    """
    if data_mark is None:
        data_mark = DATA_MARK
    _fileName = os.path.join(data_mark, "concatfeature")
    student_data, headerArray = load_data_from_file(_fileName)
    _score_map = get_final_score_map()
    _score_array = [_score_map[_record[0]] for _record in student_data]
    featureCount = len(headerArray) - 1
    for index in range(1, featureCount + 1):
        dataArray = getOneColumn(student_data, index)
        # group the feature values by final-score grade
        value_map = {}
        for _score_index, _score in enumerate(_score_array):
            value_map.setdefault(_score, []).append(dataArray[_score_index])
        groups = list(value_map.values())
        _l, p = levene(*groups)
        if p > 0.05:
            f, p = f_oneway(*groups)
            if p <= 0.05:
                print(headerArray[index], f, p)
def calculateSpearman(dataFileName, data_mark=None, neadNorm=False):
    """Print the Spearman rank correlation (and its p-value) between every
    feature in *dataFileName* and the final score."""
    data_mark = DATA_MARK if data_mark is None else data_mark
    source_path = os.path.join(data_mark, dataFileName)
    records, headers = load_data_from_file(source_path)
    score_lookup = get_final_score_map()
    scores = [score_lookup[record[0]] for record in records]
    if neadNorm:
        scores = normizeDataSet(scores)
    # correlate each feature column (column 0 is the student id) with scores
    for column in range(1, len(headers)):
        column_values = getOneColumn(records, column)
        if neadNorm:
            column_values = normizeDataSet(column_values)
        rho, p_value = spearmanr(column_values, scores)
        print(headers[column], rho, p_value)
def concatAllFeature(file_name_array, mark):
    """Join every feature (except the ignored ones) from several feature
    files into a single "concatfeature" file, one row per student.

    Missing feature values are filled with NULL_OCCUPY.

    :param file_name_array: feature file names under *mark*
    :param mark: exam directory
    """
    data_map = {}
    feature_name_array = []
    feature_type_map = {}
    ignore_feature_array = [
        "finalTestScore", "buildCount", "useDebug", "longDeleteCount",
        "hasBuildError", "debugCount", "pasteCount", "totalLength"
    ]
    feature_count = 0
    for file_name in file_name_array:
        target_source = os.path.join(mark, file_name)
        student_data, headerarray, headerTypeArray = load_data_from_file(
            target_source, needType=True)
        # collect every non-ignored feature name from this file's header
        for _header_index in range(1, len(headerarray)):
            if headerarray[_header_index] not in ignore_feature_array:
                feature_name_array.append(headerarray[_header_index])
        for index, featureName in enumerate(headerarray):
            if featureName not in feature_name_array:
                continue
            feature_count += 1
            feature_type_map[featureName] = headerTypeArray[index]
            for student_record in student_data:
                uid = int(student_record[0])
                if uid not in data_map:
                    data_map[uid] = {}
                    # new student: pre-fill every known feature with the
                    # null marker so later joins always find a value
                    for _ocuppy_featureName in feature_name_array:
                        data_map[uid][_ocuppy_featureName] = NULL_OCCUPY
                data_map[uid][featureName] = student_record[index]
            # NOTE(review): this back-fill only sets the *current*
            # featureName for students with missing features — kept as in
            # the original, but verify the intended behavior.
            for uid in data_map:
                if len(data_map[uid]) < feature_count:
                    data_map[uid][featureName] = NULL_OCCUPY
    # BUG FIX: the original called sorted(records) and discarded the result,
    # so rows were written in dict order; sort by uid explicitly.
    records = sorted(data_map.items())
    parentPath = DATA_ROOT_PATH + "/" + mark + "/"
    checkThenMkdirs(parentPath)
    output_filePath = parentPath + "concatfeature"
    # "with" guarantees the file is closed even if a write fails
    with open(output_filePath, "w") as output_file:
        # header row: uId,<feature names...>
        output_file.write("uId")
        for featureName in feature_name_array:
            output_file.write(",")
            output_file.write(featureName)
        output_file.write("\n")
        # type row: String,<feature types...>
        output_file.write("String")
        for featureName in feature_name_array:
            output_file.write(",")
            output_file.write(feature_type_map[featureName])
        output_file.write("\n")
        # only students with a value for every feature are emitted
        for uid, valueMap in records:
            if len(valueMap) == len(feature_name_array):
                output_file.write(str(uid))
                for featureName in feature_name_array:
                    output_file.write(",")
                    output_file.write(str(valueMap[featureName]))
                output_file.write("\n")