import os

import numpy as np
import matplotlib.pyplot as plot
from minepy import MINE
from scipy.stats import spearmanr, levene, ttest_ind, f_oneway
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Project-local helpers and constants (load_data_from_file, get_final_score_map,
# getOneColumn, getSerevalColumn, normizeMatrix, normizeDataSet, getMaxAndMin,
# getprecisionWithTorlerate, checkThenMkdirs, DATA_MARK, DATA_ROOT_PATH,
# OUT_ROOT_PATH, SCORE_FOLD, _BOX_COUNT, NULL_OCCUPY) are assumed to be
# defined elsewhere in this project.


def useLRtoPredictScore(targetFileName, exam_mark=None, needNorm=True):
    if exam_mark is None:
        exam_mark = DATA_MARK

    _file_Relative_Path = os.path.join(exam_mark, targetFileName)
    student_data, headerArray = load_data_from_file(_file_Relative_Path)

    _score_map = get_final_score_map()
    _score_array = []
    for _student_record in student_data:
        _score_array.append(_score_map[_student_record[0]])

    # Use every column except the uid (column 0) as features.
    targetFeatureIndexArray = list(range(1, len(headerArray)))
    featureMatrix = getSerevalColumn(student_data, targetFeatureIndexArray)

    if needNorm:
        featureMatrix = normizeMatrix(featureMatrix)

    _lr = LinearRegression(fit_intercept=True)
    _lr.fit(featureMatrix, _score_array)
    y_predicted = _lr.predict(featureMatrix)

    # Report precision at tolerances 0.5/1.5/2.5, the Spearman rank
    # correlation, and R^2 between predicted and actual scores (all
    # measured on the training data itself).
    print(getprecisionWithTorlerate(y_predicted, _score_array, 0.5),
          getprecisionWithTorlerate(y_predicted, _score_array, 1.5),
          getprecisionWithTorlerate(y_predicted, _score_array, 2.5),
          spearmanr(y_predicted, _score_array),
          r2_score(_score_array, y_predicted))
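
# Usage sketch, not from the source: assumes an "exam1" data folder containing
# the "concatfeature" file produced by concatFeature/concatAllFeature below.
# useLRtoPredictScore("concatfeature", exam_mark="exam1", needNorm=True)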
def calculateMIC(dataFileArray, data_mark=None, neadNorm=False):
    if data_mark is None:
        data_mark = DATA_MARK

    mic_map = {}
    for dataFileName in dataFileArray:
        _fileName = os.path.join(data_mark, dataFileName)
        student_data, headerArray = load_data_from_file(_fileName)

        _score_map = get_final_score_map()
        _score_array = []
        for _student_record in student_data:
            _score_array.append(_score_map[_student_record[0]])

        featureCount = len(headerArray) - 1

        if neadNorm:
            _score_array = normizeDataSet(_score_array)

        # Compute the maximal information coefficient (MIC) between each
        # feature column and the final score.
        m = MINE()
        for index in range(1, featureCount + 1):
            dataArray = getOneColumn(student_data, index)
            if neadNorm:
                dataArray = normizeDataSet(dataArray)
            m.compute_score(dataArray, _score_array)
            mic_map[headerArray[index]] = m.mic()

    # Print only the features whose MIC exceeds the mean MIC, in
    # descending order.
    sorted_list = sorted(mic_map.items(), key=lambda i: i[1], reverse=True)
    threshold = np.mean(list(mic_map.values()))
    for header, value in sorted_list:
        if value > threshold:
            print(header, value)
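
# Usage sketch, not from the source: rank feature relevance by MIC over one
# or more feature files (folder and file names are illustrative).
# calculateMIC(["concatfeature"], data_mark="exam1", neadNorm=True)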
Example #3
def calculateMean(data_mark=None):
    if data_mark is None:
        data_mark = DATA_MARK
    _fileName = os.path.join(data_mark, "concatfeature")
    student_data, headerArray = load_data_from_file(_fileName)

    _score_map = get_final_score_map()
    _score_array = []
    for _student_record in student_data:
        _score_array.append(_score_map[_student_record[0]])

    featureCount = len(headerArray) - 1

    for index in range(1, featureCount + 1):
        if headerArray[index] in ["saveInterval", "score"]:
            dataArray = getOneColumn(student_data, index)
            value_map = {}
            # Group values by score grade.
            for _score_index, _score in enumerate(_score_array):
                if _score not in value_map:
                    value_map[_score] = []
                value_map[_score].append(dataArray[_score_index])

            print(headerArray[index])
            for _i in range(SCORE_FOLD):
                print(_i, "%.2f" % np.array(value_map[_i]).mean())
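
# Usage sketch, not from the source: print the per-grade means of the
# "saveInterval" and "score" columns from the concatenated feature file.
# calculateMean(data_mark="exam1")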
Example #4
def checkCodeTime():
    for data_mark in ["exam1", "exam2", "exam3", "exam4"]:
        _fileName = os.path.join(data_mark, "codeCount-15-t2")
        student_data, headerArray = load_data_from_file(_fileName)

        print(data_mark)
        for _line in student_data:
            # Flag students whose value in column 3 is suspiciously low
            # (no more than 300).
            if _line[3] <= 300:
                print(_line[0])

        print()
Example #5
def concatFeature(file_name_array, feature_name_array, mark):
    data_map = {}

    feature_type_map = {}

    for file_name in file_name_array:
        target_source = os.path.join(mark, file_name)
        student_data, headerarray, headerTypeArray = load_data_from_file(
            target_source, needType=True)

        for index, featureName in enumerate(headerarray):
            if featureName in feature_name_array:
                feature_type_map[featureName] = headerTypeArray[index]

                for student_record in student_data:
                    uid = int(student_record[0])
                    if uid not in data_map:
                        data_map[uid] = {}

                    data_map[uid][featureName] = student_record[index]

    # Sort the merged records by uid.
    records = sorted(data_map.items())

    parentPath = DATA_ROOT_PATH + "/" + mark + "/"
    checkThenMkdirs(parentPath)
    output_filePath = parentPath + "concatfeature"

    output_file = open(output_filePath, "w")

    output_file.write("uId")
    for featureName in feature_name_array:
        output_file.write(",")
        output_file.write(featureName)
    output_file.write("\n")

    output_file.write("String")
    for featureName in feature_name_array:
        output_file.write(",")
        output_file.write(feature_type_map[featureName])
    output_file.write("\n")

    # Only write students for whom every requested feature is present.
    for uid, valueMap in records:
        if len(valueMap) == len(feature_name_array):
            output_file.write(str(uid))
            for featureName in feature_name_array:
                output_file.write(",")
                output_file.write(str(valueMap[featureName]))
            output_file.write("\n")

    output_file.close()
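
# Usage sketch, not from the source: merge selected columns from several
# feature files into one "concatfeature" file (the input file and feature
# names below are illustrative).
# concatFeature(["codeCount-15-t2"], ["saveInterval", "score"], "exam1")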
Example #6
def drawDataDistribution(dataFileName, data_mark=None):
    if data_mark is None:
        data_mark = DATA_MARK
    _fileName = os.path.join(data_mark, dataFileName)
    student_data, headerArray = load_data_from_file(_fileName)

    featureCount = len(headerArray) - 1
    for colomnIndex in range(1, featureCount + 1):
        data = getOneColumn(student_data, colomnIndex)
        _max, _min = getMaxAndMin(data)
        boxWidth = (_max - _min) / _BOX_COUNT

        # Build _BOX_COUNT equal-width bins covering [min, max).
        x_tags = []
        rightBorders = []
        _left = _right = _min
        for _index in range(0, _BOX_COUNT):
            _left = _right
            _right += boxWidth
            rightBorders.append(_right)
            x_tags.append("[%.2f,%.2f)" % (_left, _right))

        x_counts = [0] * _BOX_COUNT

        for _value in data:
            for _index, _border in enumerate(rightBorders):
                if _value <= _border:
                    x_counts[_index] += 1
                    break

        # Values missed above (floating-point edge cases at the top border)
        # fall into the last bin.
        unTagCount = len(data)
        for _value in x_counts:
            unTagCount -= _value
        x_counts[_BOX_COUNT - 1] += unTagCount

        xIndex = range(_BOX_COUNT)
        plot.bar(xIndex, x_counts)
        plot.xticks(xIndex, x_tags, rotation=10, fontsize=8)
        for _a, _b in zip(xIndex, x_counts):
            plot.text(_a, _b + 0.05, str(_b), ha='center', va='bottom')

        title = headerArray[colomnIndex]
        plot.title(title)
        parentPath = OUT_ROOT_PATH + "/" + data_mark + "/distribution/"
        checkThenMkdirs(parentPath)
        plot.savefig(parentPath + title)
        plot.clf()
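
# Usage sketch, not from the source: save one histogram per feature column
# under OUT_ROOT_PATH/<data_mark>/distribution/.
# drawDataDistribution("concatfeature", data_mark="exam1")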
Example #7
def drawDataWithScorePic(dataFileName, needNorminize=False, out_mark=None):
    '''
    Draw a scatter plot of each feature against the final score.
    :param needNorminize: normalize both the scores and the feature values
    :param out_mark: optional suffix appended to the output file name
    :return:
    '''
    _fileName = os.path.join(DATA_MARK, dataFileName)

    student_data, headerArray = load_data_from_file(_fileName)

    _score_map = get_final_score_map(None)

    _score_array = []
    for _student_record in student_data:
        _score_array.append(_score_map[_student_record[0]])

    if needNorminize:
        _score_array = normizeDataSet(_score_array)

    # Iterate over every feature column.
    for colomnIndex in range(1, len(headerArray)):
        data = getOneColumn(student_data, colomnIndex)

        if needNorminize:
            data = normizeDataSet(dataSetA=data)

        plot.scatter(_score_array, data, s=2)
        title = headerArray[colomnIndex] + "-score"
        if needNorminize:
            title += "-nominized"
        plot.title(title)
        plot.xlabel("score")
        plot.ylabel(headerArray[colomnIndex])

        parentPath = OUT_ROOT_PATH + "/" + DATA_MARK + "/scatterWithScore/"
        checkThenMkdirs(parentPath)
        if out_mark is not None:
            title += "-" + out_mark
        plot.savefig(parentPath + title)
        plot.clf()
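
# Usage sketch, not from the source: save one feature-vs-score scatter plot
# per feature column ("norm" is an illustrative output suffix).
# drawDataWithScorePic("concatfeature", needNorminize=True, out_mark="norm")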
Example #8
def calculateT(data_mark=None):
    """
    判断两个群体的平均数 是否存在显著的差异
    :param data_mark:
    :return:
    """
    if data_mark is None:
        data_mark = DATA_MARK
    _fileName = os.path.join(data_mark, "concatfeature")
    student_data, headerArray = load_data_from_file(_fileName)

    _score_map = get_final_score_map()
    _score_array = []
    for _student_record in student_data:
        _score_array.append(_score_map[_student_record[0]])

    featureCount = headerArray.__len__() - 1

    for index in range(1, featureCount + 1):
        dataArray = getOneColumn(student_data, index)
        value_map = {}
        # Group values by score grade.
        for _score_index, _score in enumerate(_score_array):
            if _score not in value_map:
                value_map[_score] = []
            value_map[_score].append(dataArray[_score_index])

        print(headerArray[index])
        for _i in range(SCORE_FOLD):
            for _j in range(_i + 1, SCORE_FOLD):
                a = value_map[_i]
                b = value_map[_j]
                # Levene's test decides whether to assume equal variances:
                # if the variances differ (p <= 0.05), fall back to Welch's
                # t-test.
                l, p = levene(a, b)
                if p <= 0.05:
                    t_value, p_value = ttest_ind(a, b, equal_var=False)
                else:
                    t_value, p_value = ttest_ind(a, b, equal_var=True)

                # Report the grade pairs whose means differ significantly.
                if p_value <= 0.05:
                    print(_i, _j)
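
# Usage sketch, not from the source: list, per feature, the grade pairs
# whose means differ significantly.
# calculateT(data_mark="exam1")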
Example #9
def calculateF(data_mark=None):
    """
    One-way ANOVA: test whether the feature means differ significantly
    across the score grades (Levene's test first checks the
    equal-variance assumption).
    :param data_mark:
    :return:
    """
    if data_mark is None:
        data_mark = DATA_MARK
    _fileName = os.path.join(data_mark, "concatfeature")
    student_data, headerArray = load_data_from_file(_fileName)

    _score_map = get_final_score_map()
    _score_array = []
    for _student_record in student_data:
        _score_array.append(_score_map[_student_record[0]])

    featureCount = len(headerArray) - 1

    for index in range(1, featureCount + 1):
        dataArray = getOneColumn(student_data, index)
        value_map = {}
        # Group values by score grade.
        for _score_index, _score in enumerate(_score_array):
            if _score not in value_map:
                value_map[_score] = []
            value_map[_score].append(dataArray[_score_index])

        dataArray = []
        for _score in value_map:
            dataArray.append(value_map[_score])

        # Skip features that fail Levene's equal-variance test; report the
        # rest only when the ANOVA F-test is significant.
        l, p = levene(*dataArray)
        if p > 0.05:
            f, p = f_oneway(*dataArray)
            if p <= 0.05:
                print(headerArray[index], f, p)
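
# Usage sketch, not from the source: print the features whose means differ
# significantly across the score grades.
# calculateF(data_mark="exam1")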
Example #10
def calculateSpearman(dataFileName, data_mark=None, neadNorm=False):
    if data_mark is None:
        data_mark = DATA_MARK
    _fileName = os.path.join(data_mark, dataFileName)
    student_data, headerArray = load_data_from_file(_fileName)

    _score_map = get_final_score_map()
    _score_array = []
    for _student_record in student_data:
        _score_array.append(_score_map[_student_record[0]])

    featureCount = len(headerArray) - 1

    if neadNorm:
        _score_array = normizeDataSet(_score_array)

    # Print the Spearman rank correlation (and its p-value) between each
    # feature and the final score.
    for index in range(1, featureCount + 1):
        dataArray = getOneColumn(student_data, index)
        if neadNorm:
            dataArray = normizeDataSet(dataArray)
        rho, p = spearmanr(dataArray, _score_array)
        print(headerArray[index], rho, p)
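
# Usage sketch, not from the source: print each feature's Spearman
# correlation with the final score.
# calculateSpearman("concatfeature", data_mark="exam1", neadNorm=False)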
Example #11
def concatAllFeature(file_name_array, mark):
    data_map = {}

    feature_name_array = []
    feature_type_map = {}
    ignore_feature_array = [
        "finalTestScore", "buildCount", "useDebug", "longDeleteCount",
        "hasBuildError", "debugCount", "pasteCount", "totalLength"
    ]
    feature_count = 0
    for file_name in file_name_array:
        target_source = os.path.join(mark, file_name)
        student_data, headerarray, headerTypeArray = load_data_from_file(
            target_source, needType=True)

        # Collect this file's feature names, skipping ignored and
        # already-seen ones.
        for _header_index in range(1, len(headerarray)):
            _feature = headerarray[_header_index]
            if _feature not in ignore_feature_array and _feature not in feature_name_array:
                feature_name_array.append(_feature)

        for index, featureName in enumerate(headerarray):
            if featureName in feature_name_array:
                feature_count += 1
                feature_type_map[featureName] = headerTypeArray[index]
                for student_record in student_data:
                    uid = int(student_record[0])
                    if uid not in data_map:
                        data_map[uid] = {}
                        for _ocuppy_featureName in feature_name_array:
                            data_map[uid][_ocuppy_featureName] = NULL_OCCUPY
                    data_map[uid][featureName] = student_record[index]

                # Backfill the NULL placeholder for students this file did
                # not cover.
                for uid in data_map:
                    if len(data_map[uid]) < feature_count:
                        data_map[uid][featureName] = NULL_OCCUPY

    # Sort the merged records by uid.
    records = sorted(data_map.items())

    parentPath = DATA_ROOT_PATH + "/" + mark + "/"
    checkThenMkdirs(parentPath)
    output_filePath = parentPath + "concatfeature"

    output_file = open(output_filePath, "w")

    output_file.write("uId")
    for featureName in feature_name_array:
        output_file.write(",")
        output_file.write(featureName)
    output_file.write("\n")

    output_file.write("String")
    for featureName in feature_name_array:
        output_file.write(",")
        output_file.write(feature_type_map[featureName])
    output_file.write("\n")

    # Only write students for whom every feature is present.
    for uid, valueMap in records:
        if len(valueMap) == len(feature_name_array):
            output_file.write(str(uid))
            for featureName in feature_name_array:
                output_file.write(",")
                output_file.write(str(valueMap[featureName]))
            output_file.write("\n")

    output_file.close()
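
# Usage sketch, not from the source: concatenate every non-ignored feature
# from several files, padding missing values with NULL_OCCUPY (the input
# file names below are illustrative).
# concatAllFeature(["codeCount-15-t2"], "exam1")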