def calculateMIC(dataFileArray, data_mark=None, neadNorm=False):
    """Print the features whose MIC against the score is above the average MIC.

    For every file in ``dataFileArray`` each column after the first is scored
    with ``MINE`` against the values that ``get_final_score_map()`` holds for
    the first column of each record. Results are pooled by header name across
    all files, so a header that occurs in several files keeps the value of
    the last file processed. Entries whose MIC is strictly greater than the
    mean of all pooled values are printed in descending MIC order.

    :param dataFileArray: iterable of file names, each joined onto ``data_mark``.
    :param data_mark: path prefix of the files; ``DATA_MARK`` when ``None``.
    :param neadNorm: when truthy, the scores and every feature column are
        passed through ``normizeDataSet`` before scoring.
    :return: None; the selected ``header, value`` pairs are printed.
    """
    if data_mark is None:
        data_mark = DATA_MARK

    mic_by_header = {}
    for file_name in dataFileArray:
        records, headers = load_data_from_file(
            os.path.join(data_mark, file_name))

        final_scores = get_final_score_map()
        # Column 0 of every record is the key into the score map.
        scores = [final_scores[record[0]] for record in records]
        if neadNorm:
            scores = normizeDataSet(scores)

        # Compute the MIC of every feature column against the score.
        miner = MINE()
        for col in range(1, len(headers)):
            column = getOneColumn(records, col)
            if neadNorm:
                column = normizeDataSet(column)
            miner.compute_score(column, scores)
            mic_by_header[headers[col]] = miner.mic()

    ranked = sorted(mic_by_header.items(),
                    key=lambda entry: entry[1],
                    reverse=True)
    threshold = np.mean(list(mic_by_header.values()))
    for header, mic_value in ranked:
        if mic_value > threshold:
            print(header, mic_value)
Пример #2
0
def calculateMean(data_mark=None):
    """Print the per-score-class mean of the ``saveInterval`` and ``score`` columns.

    Loads ``<data_mark>/concatfeature``, looks up each record's score class in
    ``get_final_score_map()`` (keyed by column 0), groups the values of the
    two selected columns by that class and prints the mean of every class
    ``0 .. SCORE_FOLD - 1`` with two decimals.

    :param data_mark: path prefix of the data file; ``DATA_MARK`` when ``None``.
    :return: None; results are printed.
    :raises KeyError: if some class in ``range(SCORE_FOLD)`` has no record.
    """
    if data_mark is None:
        data_mark = DATA_MARK
    records, headers = load_data_from_file(
        os.path.join(data_mark, "concatfeature"))

    final_scores = get_final_score_map()
    scores = [final_scores[record[0]] for record in records]

    for col in range(1, len(headers)):
        if headers[col] not in ["saveInterval", "score"]:
            continue

        column = getOneColumn(records, col)

        # Split the column values by score class.
        grouped = {}
        for position, score in enumerate(scores):
            grouped.setdefault(score, []).append(column[position])

        print(headers[col])
        for fold in range(SCORE_FOLD):
            print(fold, "%.2f" % np.array(grouped[fold]).mean())
Пример #3
0
def scoreAll():
    """Print a markdown table with the regression score of every data set.

    Data sets are named ``<exam>-<algorithm>-<target>`` and loaded through
    ``getStudentData("<algorithm>//<name>")``. For each one a
    ``LinearRegression`` is fitted with column 1 as the single feature and
    the ``get_final_score_map()`` value of column 0 as the target; the value
    returned by ``score`` on the same data is printed under the ``r^2``
    heading.

    :return: None; the table is printed.
    """
    exam_ids = ["e1", "e2", "e3", "e4"]
    algorithms = ["EQ", "Watwin"]
    targets = ["score", "finalscore"]

    print('|数据名称|r^2|')
    print('|-|-|')
    regressor = LinearRegression()

    for algorithm in algorithms:
        for target in targets:
            for exam_id in exam_ids:
                data_name = "-".join((exam_id, algorithm, target))
                records = getStudentData(algorithm + "//" + data_name)

                final_scores = get_final_score_map()
                labels = [final_scores[str(row[0])] for row in records]
                # Column vectors: one sample per row, one value per sample.
                y = np.array(labels).reshape(len(labels), 1)

                feature = getOneColumn(records, 1)
                x = np.array(feature).reshape(len(feature), 1)

                fitted = regressor.fit(x, y)
                print("|", data_name, "|", fitted.score(x, y), "|")
Пример #4
0
def usLRtoPredictWithExpDef():
    """Print a markdown table of regression precision against mapped grades.

    For every ``<exam>-<algorithm>-finalscore`` data set, column 2 is mapped
    through ``getGradeMap()`` to form the target and column 1 is the single
    feature. A ``LinearRegression`` is fitted and evaluated on the same data,
    and ``getprecisionWithTorlerate(..., 0.5)`` is printed per data set.

    :return: None; the table is printed.
    """
    exam_ids = ["e1", "e2", "e3", "e4"]
    algorithms = ["EQ", "Watwin"]
    targets = ["finalscore"]

    grade_of = getGradeMap()
    print('|数据名称|预测准确率|')
    print('|-|-|')

    for algorithm in algorithms:
        for exam_id in exam_ids:
            for target in targets:
                data_name = "-".join((exam_id, algorithm, target))
                records = getStudentData(algorithm + "//" + data_name)

                # The returned map is not used here; the call is kept so that
                # any side effects it may have are preserved.
                get_final_score_map()

                # Replace every raw value of column 2 by its mapped grade,
                # in place.
                grades = getOneColumn(records, 2)
                for position, raw in enumerate(grades):
                    grades[position] = grade_of[raw]
                y = np.array(grades).reshape(len(grades), 1)

                feature = getOneColumn(records, 1)
                x = np.array(feature).reshape(len(feature), 1)

                regressor = LinearRegression(fit_intercept=True)
                regressor.fit(x, y)
                predicted = regressor.predict(x)

                precision = getprecisionWithTorlerate(predicted, y, 0.5)
                print("|", data_name, "|", precision, "|")
Пример #5
0
def load_routine_data(needHeader=False):
    """Load the routine feature file and the matching score list.

    Reads ``DATA_ROOT_PATH/exam_mark/data_name`` as comma-separated text.
    The first field of every line is the user id and fields
    ``1 .. featureSize`` are parsed as floats. Scores are looked up per user
    id in ``get_final_score_map(None)`` and, when ``needSplitScore`` is
    truthy, passed through ``split_score_to_k_fold(scores, spliteK)``.
    Finally only the columns selected by ``target_col`` are kept.

    ``exam_mark``, ``data_name``, ``featureSize``, ``needSplitScore``,
    ``spliteK``, ``target_col`` and ``header`` are read from the enclosing
    scope.

    :param needHeader: when truthy, also return the selected header entries.
    :return: ``(data_matrix, scoreArray)`` or, with ``needHeader``,
        ``(data_matrix, scoreArray, the_header)``.
    :raises KeyError: if a user id from the file is missing in the score map.
    :raises IndexError: if a line has fewer than ``featureSize + 1`` fields.
    :raises ValueError: if a feature field cannot be parsed as a float.
    """
    datPath = os.path.join(DATA_ROOT_PATH, exam_mark, data_name)

    id_array = []
    data_matrix = []

    # The context manager closes the file even when a line fails to parse;
    # the previous version never closed the handle.
    with open(datPath, "r") as infile:
        for _line in infile:
            _linedata = _line.rstrip("\n").split(",")

            id_array.append(_linedata[0])
            data_matrix.append(
                [float(_linedata[i]) for i in range(1, featureSize + 1)])

    scoreMap = get_final_score_map(None)
    scoreArray = [scoreMap[_id] for _id in id_array]

    if needSplitScore:
        scoreArray = split_score_to_k_fold(scoreArray, spliteK)

    # Keep only the selected feature columns and their header entries.
    data_matrix = np.array(data_matrix)[:, target_col]
    the_header = np.array(header)[target_col]

    if needHeader:
        return data_matrix, scoreArray, the_header
    return data_matrix, scoreArray
def useLRtoPredictScore(targetFileName, exam_mark=None, needNorm=True):
    """Fit a linear regression on all feature columns and print its metrics.

    Loads ``<exam_mark>/<targetFileName>``, uses every column after the first
    as a feature and the ``get_final_score_map()`` value of column 0 as the
    target. The model is fitted and evaluated on the same data. After an
    empty line, one line is printed with
    ``getprecisionWithTorlerate`` at tolerances 0.5, 1.5 and 2.5, the
    ``spearmanr`` result and the ``r2_score``.

    :param targetFileName: file name joined onto ``exam_mark``.
    :param exam_mark: path prefix of the file; ``DATA_MARK`` when ``None``.
    :param needNorm: when truthy, the feature matrix is passed through
        ``normizeMatrix`` before fitting.
    :return: None; metrics are printed.
    """
    if exam_mark is None:
        exam_mark = DATA_MARK

    records, headers = load_data_from_file(
        os.path.join(exam_mark, targetFileName))

    final_scores = get_final_score_map()
    scores = [final_scores[record[0]] for record in records]

    feature_columns = list(range(1, len(headers)))
    features = getSerevalColumn(records, feature_columns)
    if needNorm:
        features = normizeMatrix(features)

    regressor = LinearRegression(fit_intercept=True)
    regressor.fit(features, scores)
    predicted = regressor.predict(features)

    print()

    within_half = getprecisionWithTorlerate(predicted, scores, 0.5)
    within_one_half = getprecisionWithTorlerate(predicted, scores, 1.5)
    within_two_half = getprecisionWithTorlerate(predicted, scores, 2.5)
    rank_correlation = spearmanr(predicted, scores)
    determination = r2_score(scores, predicted)
    print(within_half, within_one_half, within_two_half,
          rank_correlation, determination)
def getData(data_name):
    """Return the labelled data: matrix, header and one score per row.

    :param data_name: name passed on to ``getStudentDataWithHeader``.
    :return: ``(matrix, header, scores)`` where ``scores[i]`` is the
        ``get_final_score_map()`` entry for ``matrix[i][0]``.
    :raises KeyError: if a row's first value is missing in the score map.
    """
    score_by_uid = get_final_score_map()
    matrix, column_names = getStudentDataWithHeader(data_name)

    # The first value of every row is the key into the score map.
    scores = [score_by_uid[matrix[row][0]] for row in range(len(matrix))]

    return matrix, column_names, scores
Пример #8
0
def drawDataWithScorePic(dataFileName, needNorminize=False, out_mark=None):
    """Save one scatter plot of score versus feature for every feature column.

    Loads ``<DATA_MARK>/<dataFileName>`` and, for each column after the
    first, plots the column values (y) against the
    ``get_final_score_map(None)`` value of column 0 (x). Figures are saved
    under ``OUT_ROOT_PATH/DATA_MARK/scatterWithScore/`` with the plot title
    as file name, suffixed by ``-<out_mark>`` when ``out_mark`` is given.

    :param dataFileName: file name joined onto ``DATA_MARK``.
    :param needNorminize: when truthy, scores and feature values are passed
        through ``normizeDataSet`` and the title gets a ``-nominized`` suffix.
    :param out_mark: optional suffix appended to the saved file name only.
    :return: None; figures are written to disk.
    """
    records, headers = load_data_from_file(
        os.path.join(DATA_MARK, dataFileName))

    final_scores = get_final_score_map(None)
    scores = [final_scores[record[0]] for record in records]
    if needNorminize:
        scores = normizeDataSet(scores)

    # Walk over every feature column.
    for col in range(1, len(headers)):
        values = getOneColumn(records, col)
        if needNorminize:
            values = normizeDataSet(dataSetA=values)

        feature_name = headers[col]
        title = feature_name + "-score"
        if needNorminize:
            title += "-nominized"

        plot.scatter(scores, values, s=2)
        plot.title(title)
        plot.xlabel("score")
        plot.ylabel(feature_name)

        out_dir = OUT_ROOT_PATH + "/" + DATA_MARK + "/scatterWithScore/"
        checkThenMkdirs(out_dir)
        # out_mark only affects the file name, not the title drawn above.
        file_stem = title if out_mark is None else title + "-" + out_mark
        plot.savefig(out_dir + file_stem)
        plot.clf()
Пример #9
0
def calculateT(data_mark=None):
    """Print the score-class pairs whose feature means differ significantly.

    Loads ``<data_mark>/concatfeature`` and groups every feature column by
    the ``get_final_score_map()`` class of column 0. For each pair of classes
    in ``range(SCORE_FOLD)`` a Levene test picks the t-test variant: unequal
    variances when its p-value is <= 0.05, equal variances otherwise. Pairs
    with a t-test p-value <= 0.05 are printed below the feature's header.

    :param data_mark: path prefix of the data file; ``DATA_MARK`` when ``None``.
    :return: None; results are printed.
    :raises KeyError: if a class in ``range(SCORE_FOLD)`` has no record.
    """
    if data_mark is None:
        data_mark = DATA_MARK
    records, headers = load_data_from_file(
        os.path.join(data_mark, "concatfeature"))

    final_scores = get_final_score_map()
    scores = [final_scores[record[0]] for record in records]

    for col in range(1, len(headers)):
        column = getOneColumn(records, col)

        # Split the column values by score class.
        grouped = {}
        for position, score in enumerate(scores):
            grouped.setdefault(score, []).append(column[position])

        print(headers[col])
        for first in range(SCORE_FOLD):
            for second in range(first + 1, SCORE_FOLD):
                group_a = grouped[first]
                group_b = grouped[second]

                _, levene_p = levene(group_a, group_b)
                variances_differ = levene_p <= 0.05
                _, p_value = ttest_ind(group_a, group_b,
                                       equal_var=not variances_differ)

                if p_value <= 0.05:
                    print(first, second)
Пример #10
0
def usePearsonrCalAll():
    """Print a markdown table of Pearson correlations for every data set.

    Data sets are named ``<exam>-<algorithm>-<target>`` and loaded through
    ``getStudentData("<algorithm>//<name>")``. For each one ``pearsonr`` is
    computed between the ``get_final_score_map()`` value of column 0 and
    column 1, and the coefficient and p-value are printed.

    :return: None; the table is printed.
    """
    exam_ids = ["e1", "e2", "e3", "e4"]
    algorithms = ["EQ", "Watwin"]
    targets = ["score", "finalscore"]

    print('|数据名称|相关系数|p|')
    print('|-|-|-|')

    for algorithm in algorithms:
        for target in targets:
            for exam_id in exam_ids:
                data_name = "-".join((exam_id, algorithm, target))
                records = getStudentData(algorithm + "//" + data_name)

                final_scores = get_final_score_map()
                scores = [final_scores[str(row[0])] for row in records]

                feature = getOneColumn(records, 1)
                coefficient, p_value = pearsonr(scores, feature)
                print("|", data_name, "|", coefficient, "|", p_value, "|")
Пример #11
0
def calculateF(data_mark=None):
    """Print the features whose means differ significantly across score classes.

    Loads ``<data_mark>/concatfeature`` and groups every feature column by
    the ``get_final_score_map()`` class of column 0. Features whose Levene
    test p-value is <= 0.05 are skipped silently. For the rest a one-way
    ANOVA (``f_oneway``) is run over all groups, and the header, F statistic
    and p-value are printed when that p-value is <= 0.05.

    :param data_mark: path prefix of the data file; ``DATA_MARK`` when ``None``.
    :return: None; results are printed.
    """
    if data_mark is None:
        data_mark = DATA_MARK
    records, headers = load_data_from_file(
        os.path.join(data_mark, "concatfeature"))

    final_scores = get_final_score_map()
    scores = [final_scores[record[0]] for record in records]

    for col in range(1, len(headers)):
        column = getOneColumn(records, col)

        # Split the column values by score class.
        grouped = {}
        for position, score in enumerate(scores):
            grouped.setdefault(score, []).append(column[position])

        samples = list(grouped.values())

        _, levene_p = levene(*samples)
        if levene_p <= 0.05:
            # Levene's test rejected equal variances; skip this feature.
            continue

        f_statistic, anova_p = f_oneway(*samples)
        if anova_p <= 0.05:
            print(headers[col], f_statistic, anova_p)
Пример #12
0
def calculateSpearman(dataFileName, data_mark=None, neadNorm=False):
    """Print the Spearman correlation of every feature column with the score.

    Loads ``<data_mark>/<dataFileName>`` and, for each column after the
    first, prints the header followed by the two values returned by
    ``spearmanr(column, scores)``. ``scores`` holds the
    ``get_final_score_map()`` value of column 0 of every record.

    :param dataFileName: file name joined onto ``data_mark``.
    :param data_mark: path prefix of the file; ``DATA_MARK`` when ``None``.
    :param neadNorm: when truthy, scores and each column are passed through
        ``normizeDataSet`` first.
    :return: None; results are printed.
    """
    if data_mark is None:
        data_mark = DATA_MARK
    records, headers = load_data_from_file(
        os.path.join(data_mark, dataFileName))

    final_scores = get_final_score_map()
    scores = [final_scores[record[0]] for record in records]
    if neadNorm:
        scores = normizeDataSet(scores)

    for col in range(1, len(headers)):
        column = getOneColumn(records, col)
        if neadNorm:
            column = normizeDataSet(column)
        correlation, p_value = spearmanr(column, scores)
        print(headers[col], correlation, p_value)
Пример #13
0
def usLRtoPredict():
    """Print a markdown table of linear-regression metrics per data set.

    For every ``<exam>-<algorithm>-finalscore`` data set, column 1 is the
    single feature and the ``get_final_score_map()`` value of column 0 is the
    target. A ``LinearRegression`` is fitted and evaluated on the same data.
    Each row holds ``getprecisionWithTorlerate`` at tolerances 0.5, 1.5 and
    2.5, the ``r2_score`` and the ``spearmanr`` result.

    The header and delimiter rows now have one cell per printed value and
    every row is closed with ``|``. Previously the header had 4 cells, the
    delimiter row 3 and the data rows 6, which does not form a valid
    markdown table.

    :return: None; the table is printed.
    """
    examId_List = ["e1", "e2", "e3", "e4"]
    algorithm_List = ["EQ", "Watwin"]
    target_List = ["finalscore"]

    print('|数据名称|预测|5分|10分|r^2|spearman|')
    print('|-|-|-|-|-|-|')

    for al in algorithm_List:
        for eid in examId_List:
            for target in target_List:
                dataFileName = eid + "-" + al + "-" + target
                student_data = getStudentData(al + "//" + dataFileName)

                final_score_map = get_final_score_map()
                scoreArray = [
                    final_score_map[str(_line[0])] for _line in student_data
                ]
                # Column vectors: one sample per row.
                scoreArray = np.array(scoreArray).reshape(len(scoreArray), 1)

                watwinArray = getOneColumn(student_data, 1)
                watwinArray = np.array(watwinArray).reshape(
                    len(watwinArray), 1)

                _lr = LinearRegression(fit_intercept=True)
                _lr.fit(watwinArray, scoreArray)
                y_predicted = _lr.predict(watwinArray)

                print("|", dataFileName, "|",
                      getprecisionWithTorlerate(y_predicted, scoreArray, 0.5),
                      "|",
                      getprecisionWithTorlerate(y_predicted, scoreArray, 1.5),
                      "|",
                      getprecisionWithTorlerate(y_predicted, scoreArray, 2.5),
                      "|", r2_score(scoreArray, y_predicted),
                      "|", spearmanr(y_predicted, scoreArray), "|")
Пример #14
0
def usLRtoPredictWithKFold():
    """Print a markdown table of 10-fold cross-validated regression metrics.

    For every ``<exam>-<algorithm>-finalscore`` data set, column 1 is the
    single feature and the ``get_final_score_map()`` value of column 0 is the
    target. The data is split with ``KFold(n_splits=10, shuffle=True)``; a
    fresh ``LinearRegression`` is fitted on each training split and evaluated
    on the held-out split. The row shows the mean over folds of
    ``getprecisionWithTorlerate`` at tolerances 0.5, 1.5 and 2.5 and of
    ``r2_score``.

    Changes from the previous version:

    * The header and delimiter rows now have one cell per printed value and
      every row is closed with ``|``; previously they had 4, 3 and 5 cells,
      which does not form a valid markdown table.
    * ``getprecisionWithTorlerate`` is always called as
      ``(y_predicted, y_test, tolerance)``, matching every other caller in
      this file; the 1.5 and 2.5 calls used to pass the arguments swapped.
    * Splits are taken by indexing the arrays with the index arrays from
      ``KFold.split`` instead of copying element by element.

    No ``random_state`` is set, so results vary between runs.

    :return: None; the table is printed.
    """
    examId_List = ["e1", "e2", "e3", "e4"]
    algorithm_List = ["EQ", "Watwin"]
    target_List = ["finalscore"]

    print('|数据名称|预测|5分|10分|r^2|')
    print('|-|-|-|-|-|')

    for al in algorithm_List:
        for eid in examId_List:
            for target in target_List:
                dataFileName = eid + "-" + al + "-" + target
                student_data = getStudentData(al + "//" + dataFileName)

                final_score_map = get_final_score_map()
                scoreArray = [
                    final_score_map[str(_line[0])] for _line in student_data
                ]
                # Column vectors: one sample per row.
                scoreArray = np.array(scoreArray).reshape(len(scoreArray), 1)

                watwinArray = getOneColumn(student_data, 1)
                watwinArray = np.array(watwinArray).reshape(
                    len(watwinArray), 1)

                kf = KFold(n_splits=10, shuffle=True)
                accurate_array = []
                within_5_array = []
                within_10_array = []
                r_2_array = []

                for train_index_array, test_index_array in kf.split(
                        watwinArray):
                    X_train = watwinArray[train_index_array]
                    y_train = scoreArray[train_index_array]
                    X_test = watwinArray[test_index_array]
                    y_test = scoreArray[test_index_array]

                    _lr = LinearRegression(fit_intercept=True)
                    _lr.fit(X_train, y_train)
                    y_predicted = _lr.predict(X_test)

                    accurate_array.append(
                        getprecisionWithTorlerate(y_predicted, y_test, 0.5))
                    within_5_array.append(
                        getprecisionWithTorlerate(y_predicted, y_test, 1.5))
                    within_10_array.append(
                        getprecisionWithTorlerate(y_predicted, y_test, 2.5))
                    r_2_array.append(r2_score(y_test, y_predicted))

                print("|", dataFileName, "|",
                      np.array(accurate_array).mean(), "|",
                      np.array(within_5_array).mean(), "|",
                      np.array(within_10_array).mean(), "|",
                      np.array(r_2_array).mean(), "|")