Example #1
def anova_all_way():
    """
    接口请求参数:{
        "table_name": "" # str,数据库表名
        "X": ["x1", "x2"], # list,自变量
        "Y": ["y"], # list,因变量
        "alpha": "0.05", # str,置信区间百分比
        "table_direction": "", str,表格方向,水平方向为h,竖直方向为v
        "analysis_options": ["normal", "variances", "multiple"]
    }
    :return:
    """
    log.info('anova_all_way_test_init...')
    request_data = init_route()
    try:
        table_name = request_data['table_name']
        X = request_data['X']
        Y = request_data['Y']
        alpha = float(request_data['alpha'])
        table_direction = request_data['table_direction']
        analysis_options = request_data.get("analysis_options", [])
    except Exception as e:
        log.info(e)
        raise e
    assert isinstance(X, list) and isinstance(Y, list)
    # fetch the data from the database
    data = exec_sql(table_name, X, Y)
    log.info("Input data size: {}".format(len(data)))
    try:
        if table_direction == "v":
            data[Y[0]] = data[Y[0]].astype("float16")
            # every_level_data_index = [d for d in data[X[0]].unique()]
            # every_level_data = [data[data[X[0]] == d][Y[0]].astype("float16") for d in data[X[0]].unique()]
        elif table_direction == "h":
            # every_level_data_index = X
            # every_level_data = [data[l].astype("float16") for l in X]
            data, X, Y = transform_h_table_data_to_v(data, X)
        else:
            raise ValueError("table direction must be h or v")
        res = []
        # between-subjects factors
        res.append(level_info(data, X))
        # descriptive statistics
        res.append(anova_all_way_describe_info(data, X, Y))
        if "normal" in analysis_options:
            res.append(normal_test_all(data, X, alpha=alpha))
        if "variances" in analysis_options:
            res.append(transform_table_data_to_html(levene_test_all(data, X, alpha=alpha)))
        # multi-way (factorial) ANOVA
        res.append(transform_table_data_to_html(anova_analysis_multivariate(data, X, Y)))
        # TODO: multiple comparisons, to be added later
        response_data = {"res": res,
                         "code": "200",
                         "msg": "ok!"}
        return jsonify(response_data)
    except Exception as e:
        log.error(e)
        # raise e
        return jsonify({"data": "", "code": "500", "msg": e.args[0]})
Example #2
def t_two_pair():
    """
    接口请求参数:{
        "table_name": "" # str,数据库表名
        "X": ["x1", "x2"], # list,自变量,当表格方向为h时表示多个变量名,为v时表示分类变量字段
        "Y": ["y"], # list,因变量,当表格方向为v是使用
        "alpha": "0.05", # str,置信区间百分比
        "table_direction": "", str,表格方向,水平方向为h,竖直方向为v
        "analysis_options": ["normal", "pearsonr"]
    }
    :return:
    """
    log.info('t_two_pair_init...')
    request_data = init_route()
    try:
        table_name = request_data['table_name']
        X = request_data['X']
        Y = request_data['Y']
        table_direction = request_data['table_direction']
        alpha = float(request_data['alpha'])
        analysis_options = request_data.get("analysis_options", [])
    except Exception as e:
        log.info(e)
        raise e
    assert isinstance(X, list) and isinstance(Y, list)
    # fetch the data from the database
    data = exec_sql(table_name, X, Y)
    log.info("Input data size: {}".format(len(data)))
    try:
        if table_direction == "v":
            every_level_data_index = [d for d in data[X[0]].unique()]
            every_level_data = [data[data[X[0]] == d][Y[0]].astype("float16") for d in data[X[0]].unique()]
        elif table_direction == "h":
            every_level_data_index = X
            every_level_data = [data[l].astype("float16") for l in X]
            data, X, Y = transform_h_table_data_to_v(data, X)
        else:
            raise ValueError("table direction must be h or v")
        if len(every_level_data_index) != 2:
            raise ValueError("the independent variable must have exactly 2 levels")
        res = []
        # descriptive statistics
        res.append(transform_table_data_to_html(t_two_paired_describe_info(data, X, Y)))
        if "pearsonr" in analysis_options:
            res.append(transform_table_data_to_html(
                pearsonr_test(*every_level_data, index=every_level_data_index, alpha=alpha)))
        if "normal" in analysis_options:
            res.append(transform_table_data_to_html(normal_test(every_level_data_index, every_level_data, alpha)))
        res.append(transform_table_data_to_html(
            t_two_pair_analysis(*every_level_data, index=every_level_data_index, alpha=alpha), col0="配对差值"))
        response_data = {"res": res,
                         "code": "200",
                         "msg": "ok!"}
        return jsonify(response_data)
    except Exception as e:
        log.error(e)
        # raise e
        return jsonify({"data": "", "code": "500", "msg": e.args[0]})
Example #3
def nonparametric_two_independent():
    """
    接口请求参数:{
        "table_name": "" # str,数据库表名
        "X": ["x1", "x2"], # list,自变量,当表格方向为h时表示多个变量名,为v时表示分类变量字段
        "Y": ["y"], # list,因变量,当表格方向为v是使用
        "table_direction": "", str,表格方向,水平方向为h,竖直方向为v
    }
    :return:
    """
    log.info('nonparametric_two_independent_get_results_init...')
    request_data = init_route()
    try:
        table_name = request_data['table_name']
        X = request_data['X']
        Y = request_data['Y']
        table_direction = request_data['table_direction']
    except Exception as e:
        log.info(e)
        raise e
    assert isinstance(X, list) and isinstance(Y, list)
    # fetch the data from the database
    data = exec_sql(table_name, X, Y)
    log.info("Input data size: {}".format(len(data)))

    try:
        if table_direction == "v":
            every_level_data_index = [d for d in data[X[0]].unique()]
            # every_level_data = [data[data[X[0]] == d][Y[0]].astype("float16") for d in data[X[0]].unique()]
            data, X = transform_v_table_data_to_h(data, X, Y)
        elif table_direction == "h":
            every_level_data_index = X
            every_level_data = [data[l].astype("float16") for l in X]
            # data, X, Y = transform_h_table_data_to_v(data, X)  # data is already horizontal, no transform needed here
        else:
            raise ValueError("table direction must be h or v")
        if len(every_level_data_index) != 2:
            raise ValueError("the independent variable must have exactly 2 levels")

        # descriptive statistics
        res = []
        data_info = transform_table_data_to_html(Mann_Whitney_U_describe(data, X))
        res.append(data_info)

        # Mann-Whitney U test
        Mann_Whitney_U_res = transform_table_data_to_html(Mann_Whitney_U_test(data, X))
        res.append(Mann_Whitney_U_res)
        response_data = {"res": res,
                         "data_info": data_info,
                         "code": "200",
                         "msg": "ok!"}
        return jsonify(response_data)
    except Exception as e:
        log.error(e)
        raise e
Example #4
def results_nonparametric_two_independent():
    """
    接口请求参数:{
        "table_name": "" # str,数据库表名
        "X": ["x1", "x2"], # list,自变量,当表格方向为h时表示多个变量名,为v时表示分类变量字段
        # "table_direction": "h", str,表格方向,水平方向为h,竖直方向为v
    }
    :return:
    """
    log.info('nonparametric_two_pair_get_results_init...')
    request_data = init_route()
    try:
        table_name = request_data['table_name']
        X = request_data['X']
        # Y = request_data['Y']
        # table_direction = request_data['table_direction']
        # alpha = float(request_data['alpha'])
    except Exception as e:
        log.info(e)
        raise e
    # assert isinstance([X, Y], list)
    assert isinstance(X, list)
    if len(X) > 2:
        raise ValueError("only one or two columns of data are supported")
    # fetch the data from the database
    # data = exec_sql(table_name, X, Y)
    data = exec_sql(table_name, X)
    log.info("Input data size: {}".format(len(data)))

    try:

        # descriptive statistics
        res = []
        data_info = transform_table_data_to_html(Wilcoxon_describe(data, X))
        res.append(data_info)
        log.info("描述性统计分析完成")

        # Wilcoxon signed-rank test
        Wilcoxon_res = transform_table_data_to_html(Wilcoxon_test(data, X))
        res.append(Wilcoxon_res)
        log.info("Wilcoxon 符号秩检验完成")

        response_data = {"res": res,
                         "data_info": data_info,
                         "code": "200",
                         "msg": "ok!"}
        return jsonify(response_data)
    except Exception as e:
        log.error(e)
        raise e
Example #5
 def loadings(self, data):
     """
     Output the factor loading matrix before rotation.
     """
     data = data.astype(float)
     factor_num = self.component
     # solve for the factor loading matrix:
     # build a diagonal matrix from the square roots of the first factor_num
     # eigenvalues; it is used to scale the eigenvectors into loadings
     eigvalue = self.var_contri(data)['Eigvalue']
     duijiao = list(np.array(np.sqrt(eigvalue[:factor_num]), dtype=float))
     eigmat = np.diag(duijiao)
     zaihe = np.dot(self.eigvector[:factor_num].T, eigmat)
     self.zaihe = zaihe
     col = ['Factor' + str(i) for i in range(1, factor_num + 1)]
     # negate every column except the second, fixing the sign convention of the factors
     zaihe = -pd.DataFrame(zaihe, columns=col)
     zaihe.iloc[:, 1] = -zaihe.iloc[:, 1]
     self.col = col
     zaihe.index = data.columns
     self.zaihe = zaihe
     self.zaihe = format_data_col(self.zaihe)
     col = self.zaihe.columns.values.tolist()
     row = self.zaihe.index.values.tolist()
     res = self.zaihe.values.tolist()
     return transform_table_data_to_html({
         'title': "旋转前因子载荷",
         'col': col,
         'row': row,
         'data': res
     })
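In matrix form, the unrotated loadings assembled above are the leading eigenvectors scaled by the square roots of their eigenvalues (the extra sign flips only fix the direction of each factor):

$$\Lambda = V_k \, \mathrm{diag}\big(\sqrt{\lambda_1}, \dots, \sqrt{\lambda_k}\big), \qquad k = \texttt{factor\_num},$$

where the columns of $V_k$ are the first $k$ eigenvectors returned via var_contri and $\lambda_1 \ge \dots \ge \lambda_k$ are the matching eigenvalues.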
Example #6
 def varimax_rotation(self, data):
     """
     Apply varimax (maximum-variance) orthogonal rotation to the factor
     loading matrix and return the rotated loadings.
     """
     data = data.astype(float)
     zaihe = self.load(data)
     m, n = zaihe.shape
     R = np.eye(n)
     d = 0
     for i in range(self.q):
         d_init = d
         Lambda = np.dot(zaihe, R)
         w, a, wa = np.linalg.svd(
             np.dot(
                 zaihe.T,
                 np.asarray(Lambda)**3 - (self.gamma / m) * np.dot(
                     Lambda, np.diag(np.diag(np.dot(Lambda.T, Lambda))))))
         R = np.dot(w, wa)
         d = np.sum(a)
         if d_init != 0 and d / d_init < 1 + self.tol:
             break
     orthogonal = np.dot(zaihe, R)
     self.orthogonal = orthogonal
     after = pd.DataFrame(orthogonal, index=data.columns, columns=self.col)
     after = format_data_col(after)
     col = after.columns.values.tolist()
     row = after.index.values.tolist()
     res = after.values.tolist()
     return transform_table_data_to_html({
         'title': "旋转后因子载荷",
         'col': col,
         'row': row,
         'data': res
     })
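The loop above is the standard SVD-based varimax iteration: with $Z$ the unrotated loadings ($m$ variables by $k$ factors), it searches for the orthogonal rotation $R$ that maximizes the generalized varimax criterion

$$V(\Lambda) = \sum_{j=1}^{k} \left[ \frac{1}{m} \sum_{i=1}^{m} \lambda_{ij}^{4} - \frac{\gamma}{m^{2}} \Big( \sum_{i=1}^{m} \lambda_{ij}^{2} \Big)^{2} \right], \qquad \Lambda = Z R,$$

where $\gamma = 1$ gives classical varimax. Judging from how they are used here, self.q caps the iteration count and self.tol is the relative convergence tolerance.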
Example #7
def results_describe():
    """
    接口请求参数:{
        "table_name": "" # str,数据库表名
        "X": ["x1"], # list,自变量,行
    }
    :return:
    """
    log.info('describe_get_results_init...')
    request_data = init_route()
    try:
        table_name = request_data['table_name']
        X = request_data['X']
    except Exception as e:
        log.info(e)
        raise e
    assert isinstance(X, list)
    # fetch the data from the database
    data = exec_sql(table_name, X)
    log.info("Input data size: {}".format(len(data)))

    try:
        describe_result = transform_table_data_to_html(description(data, X), col0='指标名称')
        log.info("调用描述性统计函数成功")
        response_data = {"res": describe_result,
                         "code": "200",
                         "msg": "ok!"}
        return jsonify(response_data)
    except Exception as e:
        log.error(e)
        raise e
Example #8
def results_nonparametric_multi_independent():
    """
    接口请求参数:{
        "table_name": "" # str,数据库表名
        "X": ["x1", "x2"], # list,自变量,当表格方向为h时表示多个变量名,为v时表示分类变量字段
        "Y": ["y"], # list,因变量,当表格方向为v是使用
        "table_direction": "", str,表格方向,水平方向为h,竖直方向为v
    }
    :return:
    """
    log.info('nonparametric_multi_independent_get_results_init...')
    request_data = init_route()
    try:
        table_name = request_data['table_name']
        X = request_data['X']
        Y = request_data['Y']
        table_direction = request_data['table_direction']
        # alpha = float(request_data['alpha'])
    except Exception as e:
        log.info(e)
        raise e
    assert isinstance(X, list) and isinstance(Y, list)
    # fetch the data from the database
    data = exec_sql(table_name, X, Y)
    log.info("Input data size: {}".format(len(data)))

    try:
        if table_direction == "v":
            every_level_data_index = [d for d in data[X[0]].unique()]
            # every_level_data = [data[data[X[0]] == d][Y[0]].astype("float16") for d in data[X[0]].unique()]
            data, X = transform_v_table_data_to_h(data, X, Y)
        elif table_direction == "h":
            every_level_data_index = X
            every_level_data = [data[l].astype("float16") for l in X]
            # data, X, Y = transform_h_table_data_to_v(data, X)
        else:
            raise ValueError("table direction must be h or v")
        if len(every_level_data_index) < 2:
            raise ValueError("the nonparametric test for multiple independent samples requires at least 2 levels of the independent variable")

        # descriptive statistics
        res = []
        data_info = transform_table_data_to_html(Kruskal_Wallis_H_describe(data, X))
        res.append(data_info)
        log.info("描述性统计分析完成")

        # Kruskal-Wallis H test
        Kruskal_Wallis_H_res = Kruskal_Wallis_H_test(data, X)
        res.append(Kruskal_Wallis_H_res)
        log.info("Kruskal-Wallis H test complete")

        response_data = {"res": res,
                         "data_info": data_info,
                         "code": "200",
                         "msg": "ok!"}
        return jsonify(response_data)
    except Exception as e:
        log.error(e)
        raise e
Example #9
def t_single():
    """
    接口请求参数:{
        "table_name": "" # str,数据库表名
        "X": ["value"], # list,自变量
        "alpha": "0.05", # str,置信区间百分比
        "mean": "0", # str,样本均值
        "analysis_options": ["normal"]
    }
    :return:
    """
    log.info('t_single_test_init...')
    request_data = init_route()
    try:
        table_name = request_data['table_name']
        alpha = float(request_data['alpha'])
        X = request_data['X']
        data_mean = float(request_data['mean'])
        analysis_options = request_data.get("analysis_options", [])
    except Exception as e:
        log.info(e)
        raise e
    # fetch the data from the database
    data = exec_sql(table_name, X)
    log.info("Input data size: {}".format(len(data)))
    try:
        res = []
        data[X[0]] = data[X[0]].astype("float16")
        data_info = transform_table_data_to_html(t_single_describe_info(data, X))
        res.append(data_info)
        # normality test
        if "normal" in analysis_options:
            normal_res = transform_table_data_to_html(normal_test([X[0]], [data[X[0]]], alpha=alpha))
            res.append(normal_res)
        # one-sample t-test results
        t_single_res = transform_table_data_to_html(
            t_single_analysis(data[X[0]].astype("float16"), data_mean, X, alpha=alpha), col0="检验值={}".format(data_mean))
        res.append(t_single_res)
        response_data = {"res": res,
                         "code": "200",
                         "msg": "ok!"}
        return jsonify(response_data)
    except Exception as e:
        log.error(e)
        # raise e
        return jsonify({"data": "error", "code": "500", "msg": e.args[0]})
Example #10
 def score(self, data):
     """
     Compute the factor scores.
     """
     data = data.astype(float)
     # standardize first when requested, otherwise score the raw data;
     # both branches of the original applied identical scoring afterwards
     if self.standardize:
         data_scale = FA.standardization(self, data)
     else:
         data_scale = data
     F = pd.DataFrame(np.dot(data_scale, self.coefficient.T))
     F.columns = ['ScoreF' + str(i) for i in range(1, self.component + 1)]
     F = format_data_col(F)
     return transform_table_data_to_html({
         'title': "因子得分",
         'col': F.columns.values.tolist(),
         'row': F.index.values.tolist(),
         'data': F.values.tolist()
     })
Example #11
def correlation_matrix(x):
    x = x.astype(float)
    da = format_data_col(x.corr())
    col = da.columns.values.tolist()
    row = da.index.values.tolist()
    res = da.values.tolist()
    return transform_table_data_to_html({
        'title': "相关性矩阵",
        'col': col,
        'row': row,
        'data': res
    })
Example #12
 def score_coef(self, data):
     """
     Compute the factor score coefficient matrix.
     """
     data = data.astype(float)
     # corr is the correlation matrix of the original variables
     corr = np.corrcoef(data, rowvar=0)
     A = self.varimax_rota(data)
     coefficient = pd.DataFrame(np.dot(np.array(A).T,
                                       np.mat(corr).T),
                                columns=data.columns,
                                index=self.col)
     self.coefficient = coefficient
     defen = coefficient.T
     defen = format_data_col(defen)
     col = defen.columns.values.tolist()
     row = defen.index.values.tolist()
     res = defen.values.tolist()
     return transform_table_data_to_html({
         'title': "因子得分系数矩阵",
         'col': col,
         'row': row,
         'data': res
     })
Example #13
def cross_chi2(index, columns):
    # cross tabulation with margins, for display
    cross_result = pd.crosstab(index=index, columns=columns, margins=True)
    # the table fed to the tests must not contain the margin totals (changed 8/25)
    cr_re = pd.crosstab(index=index, columns=columns, margins=False)

    # run the chi-square test once per divergence statistic
    methods = [
        "pearson", "log-likelihood", "freeman-tukey", "mod-log-likelihood",
        "neyman", "cressie-read"
    ]
    chi_res = []
    for lam in methods:
        chi2_stat, p_value, dof, _ = chi2_contingency(cr_re, correction=True, lambda_=lam)
        chi_res.append(["{:.4f}".format(chi2_stat), "{:.4f}".format(p_value), dof])

    cross_index = cross_result.index.tolist()
    cross_index[-1] = '总计'
    cross_columns = cross_result.columns.tolist()
    cross_columns[-1] = '总计'

    cross_value = cross_result.values.tolist()
    # expected frequencies (margins excluded), with totals appended for display
    exp = pd.DataFrame(expected_freq(cr_re))
    exp = sum_data(exp)
    expect = format_data_col(exp).values.tolist()

    r1 = {
        'title': "交叉表",
        'row': cross_index,
        'col': cross_columns,
        'data': cross_value
    }
    r1 = transform_table_data_to_html(r1)

    r2 = {
        'title': "期望频数表",
        'row': cross_index,
        'col': cross_columns,
        'data': expect
    }
    r2 = transform_table_data_to_html(r2)
    r3 = {
        'title': "卡方检验",
        'row': methods,
        'col': ['值', '显著性', '自由度'],
        'data': chi_res
    }
    r3 = transform_table_data_to_html(r3)
    return [r1, r2, r3]
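cross_chi2 expects index and columns to be anything pd.crosstab accepts, typically two categorical columns of one DataFrame. A minimal sketch with made-up data:

import pandas as pd

# hypothetical input: two categorical columns of equal length
df = pd.DataFrame({
    "gender": ["m", "m", "f", "f", "m", "f"],
    "choice": ["a", "b", "a", "b", "b", "a"],
})
r1, r2, r3 = cross_chi2(df["gender"], df["choice"])
# r1: crosstab with totals, r2: expected frequencies, r3: the six chi-square variants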
Example #14
def PCA(x, components=None):
    # x is a DataFrame holding only the feature columns; components is the
    # user-specified number of principal components (None = choose automatically)
    x = x.astype(float)
    result = []
    if components is None:
        components = int(x.size / len(x))  # defaults to the number of columns; revisit how a user-specified count is received
    # standardization
    average = np.mean(x, axis=0)
    sigma = np.std(x, axis=0, ddof=1)
    r, c = np.shape(x)
    mu = np.tile(average, (r, 1))  # repeat the column means over r rows, see https://www.cnblogs.com/elitphil/p/11824539.html
    data_standardized = (x - mu) / sigma

    cov_matrix = np.cov(data_standardized.T)  # covariance matrix
    EigenValue, EigenVector = np.linalg.eig(cov_matrix)  # eigenvalues and eigenvectors

    index = np.argsort(-EigenValue)  # indices sorting the eigenvalues in descending order
    selected_Vector = EigenVector.T[index[:components]]  # eigenvectors of the selected components
    Score = np.dot(data_standardized, selected_Vector.T)  # principal component scores
    EigenValue_sorted = EigenValue[index]  # eigenvalues in descending order
    # eigenvalue contributions and contribution rates (output as a table)
    EigenValue_contribution = pd.DataFrame(EigenValue_sorted,
                                           columns=['EigenValue'])
    EigenValue_contribution['Proportion'] = EigenValue_contribution[
        'EigenValue'] / EigenValue_contribution['EigenValue'].sum()
    EigenValue_contribution['Cumulative'] = EigenValue_contribution[
        'Proportion'].cumsum()
    # scree plot and scree plot with variance contribution rates (output as an image)
    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.set_size_inches(6.8, 3)
    fig.subplots_adjust(wspace=0.5)  # adjust the spacing

    ax1.plot(range(1,
                   len(EigenValue_contribution) + 1),
             EigenValue_contribution['EigenValue'], 'o-')
    ax1.set_title('Scree Plot')
    ax1.set_xlabel('Principal Components')
    ax1.set_ylabel('Eigenvalue')
    ax1.grid()

    ax2.plot(range(1,
                   len(EigenValue_contribution) + 1),
             EigenValue_contribution['Proportion'], 'o-')
    ax2.plot(range(1,
                   len(EigenValue_contribution) + 1),
             EigenValue_contribution['Cumulative'], 'bo-.')
    ax2.set_title('Variance Explained')
    ax2.set_xlabel('Principal Components')
    ax2.set_ylabel('Proportion')
    ax2.grid()
    plt.show()
    # the corresponding eigenvectors
    vector_index = ['prin%d' % (i + 1) for i in range(len(selected_Vector))]
    vector_columns = x.columns.values.tolist()
    principal_vector = pd.DataFrame(selected_Vector,
                                    index=vector_index,
                                    columns=vector_columns).T
    # principal component loadings (component matrix), output as a table
    principal_component_load = pd.DataFrame()
    for i in range(len(selected_Vector)):
        principal_component_load['z%d' % (i + 1)] = np.sqrt(
            EigenValue_contribution['EigenValue'][i]) * principal_vector[
                'prin%d' % (i + 1)]
    # principal component scores (component score coefficient matrix)
    principal_scores = pd.DataFrame()
    for i in range(len(selected_Vector)):
        principal_scores['prin%d_score' % (i + 1)] = Score[:, i]
    EigenValue_sorted_selected = EigenValue_sorted[:len(selected_Vector)]
    chengji = EigenValue_sorted_selected * principal_scores
    principal_scores['scores'] = chengji.sum(axis=1)
    principal_scores = principal_scores.sort_values(by='scores',
                                                    ascending=False)

    Eig_contri = EigenValue_contribution
    Eig_contri['EigenValue'] = Eig_contri['EigenValue'].apply(
        lambda x: format(x, '.4f'))
    Eig_contri['Proportion'] = Eig_contri['Proportion'].apply(
        lambda x: format(x, '.2%'))
    Eig_contri['Cumulative'] = Eig_contri['Cumulative'].apply(
        lambda x: format(x, '.2%'))
    result.append({
        'title': "总方差解释",
        'col': ['特征值', '特征值方差贡献率', '累计方差贡献率'],
        'data': Eig_contri.values.tolist()
    })
    result.append({
        "title": "碎石图",
        "base64": "{}".format(plot_and_output_base64_png(plt))
    })
    prin_com_load = format_data_col(principal_component_load)
    col = prin_com_load.columns.values.tolist()
    row = prin_com_load.index.values.tolist()
    res = prin_com_load.values.tolist()
    result.append(
        transform_table_data_to_html({
            'title': "主成分载荷",
            'col': col,
            'row': row,
            'data': res
        }))

    prin_scores = format_data_col(principal_scores)
    col = prin_scores.columns.values.tolist()
    row = prin_scores.index.values.tolist()
    res = prin_scores.values.tolist()
    result.append(
        transform_table_data_to_html({
            'title': "主成分得分系数矩阵",
            'col': col,
            'row': row,
            'data': res
        }))

    return result
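A quick way to exercise PCA; the column names and data below are made up, and with components=None the function keeps one component per column:

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
df = pd.DataFrame(rng.normal(size=(50, 3)), columns=["x1", "x2", "x3"])
tables = PCA(df, components=2)
# tables: variance-explained table, scree plot (base64), loadings, and score matrix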
Example #15
def apriori():
    """
    接口请求参数:{
        "table_name": "apriori_test",  # str,数据库表名
        "X": ["x0", "x1", "x2", "x3", "x4", "x5"],  # list,自变量
        "alg": "fpgrowth',  # str,关联规则算法选择["apriori", "fpgrowth"] ==》【默认值:fpgrowth】
        "dataconvert": True,  # bool,是否需要数据转换 ==》【默认值:True】
        "minSupport": "0.05",  # str,最小支持度 ==》【默认值:"0.05"】
        "max_len": "2",  # 频繁项集最大长度 ==》【默认值:None】
        "metrics": "confidence",  # 关联规则评价指标["support", "confidence", "lift", "leverage", "conviction"] ==》【默认值:confidence】
        "min_threshold": "0.8",  # 关联规则评价指标最小值 ==》【默认值:"0.8"】
    }
    :return:
    """
    log.info('Apriori_init...')
    request_data = init_route()
    try:
        from mlxtend.preprocessing import TransactionEncoder
        from mlxtend.frequent_patterns import apriori
        from mlxtend.frequent_patterns import fpgrowth
        from mlxtend.frequent_patterns import association_rules
    except ImportError:
        raise ImportError("cannot import mlxtend")
    try:
        table_name = request_data['table_name']
        X = request_data['X']
        alg = request_data['alg']
        dataconvert = request_data['dataconvert']
        min_support = float(request_data['minSupport'])
        max_len = request_data.get('max_len')
        max_len = int(max_len) if max_len is not None else None  # default: None, per the docstring
        metrics = request_data['metrics']
        min_threshold = float(request_data['min_threshold'])
    except Exception as e:
        log.info(e)
        raise e
    try:
        table_data = exec_sql(table_name, X)
        table_data.fillna("", inplace=True)
        data = table_data.values.tolist()
        if dataconvert:
            trans = TransactionEncoder()
            data = trans.fit(data).transform(data)
            data = pd.DataFrame(data, columns=trans.columns_)
            log.info("data columns:{}".format(data.columns.values))
            if "" in data.columns:
                data.drop(columns="", axis=1, inplace=True)
        if alg == "apriori":
            frequent_itemsets = apriori(data, min_support=min_support, max_len=max_len, use_colnames=True)
        elif alg == "fpgrowth":
            frequent_itemsets = fpgrowth(data, min_support=min_support, max_len=max_len, use_colnames=True)
        else:
            raise ValueError("input Association rules:{} is not support".format(alg))
        rules = association_rules(frequent_itemsets, metric=metrics, min_threshold=min_threshold)
        rules = rules.replace([np.inf, -np.inf], "")
        rules = format_dataframe(rules, {"lift": ".4f", "leverage": ".4f"})
        res = [
            transform_table_data_to_html({
                "title": "频繁项集结果",
                "row": frequent_itemsets.index.tolist(),
                "col": frequent_itemsets.columns.tolist(),
                "data": frequent_itemsets.values.tolist(),
            }),
            transform_table_data_to_html({
                "title": "关联规则结果",
                "row": rules.index.tolist(),
                "col": rules.columns.tolist(),
                "data": rules.values.tolist(),
            })
        ]
        response_data = {"res": res,
                         "code": "200",
                         "msg": "ok!"}
        return jsonify(response_data)
    except Exception as e:
        log.exception(e)
        return jsonify({"code": "500", "res": "", "msg": "{}".format(e.args)})
Example #16
def cross_chis(index, columns, fenceng):
    chi_res = []
    expect = []

    # layered (multi-level) cross tabulation, with margins for display
    cross_result = pd.crosstab(index=index, columns=columns, margins=True)
    cross_index = cross_result.index.tolist()
    cross_index[-1] = '总计'
    cross_columns = cross_result.columns.tolist()
    cross_columns[-1] = '总计'
    cross_value = cross_result.values.tolist()

    # cross-tab analysis: the table fed to the tests must not contain the margin totals
    cr_re = pd.crosstab(index=index, columns=columns, margins=False)
    first_index = np.unique(index[0])
    methods = [
        "pearson", "log-likelihood", "freeman-tukey", "mod-log-likelihood",
        "neyman", "cressie-read"
    ]
    # run every divergence statistic within each level of the first (layer) variable
    for i in first_index:
        sub_table = cr_re.loc[i, :]
        for lam in methods:
            chi2_stat, p_value, dof, _ = chi2_contingency(sub_table, correction=True, lambda_=lam)
            chi_res.append(["{:.4f}".format(chi2_stat), "{:.4f}".format(p_value), dof])
        # the expected frequencies do not depend on lambda_, so compute them once per level
        expect.extend(expected_freq(sub_table).tolist())
    expect = pd.DataFrame(expect)
    expect = sum_data(expect)
    expect = format_data_col(expect).values.tolist()
    # row = ["pearson","log-likelihood","freeman-tukey","mod-log-likelihood","neyman","cressie-read"]*len(first_index)
    row = []

    method = [
        "pearson", "log-likelihood", "freeman-tukey", "mod-log-likelihood",
        "neyman", "cressie-read"
    ]
    for uindex in first_index:
        for m in method:
            row.append(fenceng[0] + '_' + uindex + ':' + m)

    r1 = {
        'title': "交叉表",
        'row': ["/".join(["{}".format(d) for d in c]) for c in cross_index],
        'col': cross_columns,
        'data': cross_value
    }
    r1 = transform_table_data_to_html(r1)

    r2 = {
        'title': "期望频数表",
        'row': ["/".join(["{}".format(d) for d in c]) for c in cross_index],
        'col': cross_columns[1:],
        'data': expect
    }
    r2 = transform_table_data_to_html(r2)
    r3 = {
        'title': "卡方检验",
        'row': row,
        'col': ['值', '显著性', '自由度'],
        'data': chi_res
    }
    r3 = transform_table_data_to_html(r3)
    return [r1, r2, r3]
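cross_chis takes index as a list of two Series, the first being the layering variable whose name is passed in fenceng; a sketch with made-up data:

import pandas as pd

# chi-square tests are run separately within each level of "region"
df = pd.DataFrame({
    "region": ["n", "n", "n", "n", "s", "s", "s", "s"],
    "gender": ["m", "m", "f", "f", "m", "m", "f", "f"],
    "choice": ["a", "b", "a", "b", "a", "b", "a", "b"],
})
r1, r2, r3 = cross_chis([df["region"], df["gender"]], df["choice"], ["region"])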
Example #17
def anova_one_way():
    """
    接口请求参数:{
        "table_name_ori": "" # str,数据库表名-数据预处理之前的数据
        "table_name": "" # str,数据库表名-数据处理之后的数据
        "X": ["x1", "x2"], # list,自变量
        "Y": ["y"], # list,因变量
        "alpha": "0.05", # str,置信区间百分比
        "table_direction": "", str,表格方向,水平方向为h,竖直方向为v
        "analysis_options": ["normal", "variances", "multiple"]
    }
    :return:
    """
    log.info('anova_one_way_init...')
    request_data = init_route()
    try:
        table_name = request_data['table_name']
        X = request_data['X']
        Y = request_data['Y']
        alpha = float(request_data['alpha'])
        table_direction = request_data['table_direction']
        analysis_options = request_data.get("analysis_options", [])
    except Exception as e:
        log.info(e)
        raise e
    assert isinstance(X, list) and isinstance(Y, list)
    # fetch the data from the database
    data = exec_sql(table_name, X, Y)
    log.info("Input data size: {}".format(len(data)))
    try:
        if table_direction == "v":
            data[Y[0]] = data[Y[0]].astype("float16")
            every_level_data_index = [d for d in data[X[0]].unique()]
            every_level_data = [data[data[X[0]] == d][Y[0]].astype("float16") for d in data[X[0]].unique()]
        elif table_direction == "h":
            every_level_data_index = X
            every_level_data = [data[l].astype("float16") for l in X]
            data, X, Y = transform_h_table_data_to_v(data, X)
        else:
            raise ValueError("table direction must be h or v")
        res = []
        # descriptive statistics
        data_info = transform_table_data_to_html(anova_one_way_describe_info(data, X, Y, alpha=alpha))
        res.append(data_info)
        # normality test
        if "normal" in analysis_options:
            normal_res = transform_table_data_to_html(normal_test(every_level_data_index, every_level_data, alpha),
                                                      col0="因子水平")
            res.append(normal_res)
        # homogeneity-of-variance test
        if "variances" in analysis_options:
            equal_variances_res = transform_table_data_to_html(levene_test(*every_level_data, alpha=alpha))
            res.append(equal_variances_res)
        # analysis of variance
        anova_res = transform_table_data_to_html(anova_analysis(data, X[0], Y[0], alpha=alpha))
        res.append(anova_res)
        # multiple comparisons
        if "multiple" in analysis_options:
            multiple_res = multiple_test(data, X, Y, alpha=alpha)
            res.append(multiple_res)
        response_data = {"res": res,
                         "code": "200",
                         "msg": "ok!"}
        return jsonify(response_data)
    except Exception as e:
        log.error(e)
        # raise e
        return jsonify({"data": "", "code": "500", "msg": e.args[0]})
Example #18
def result_one_sample_chi():
    """
    接口请求参数:{
        "table_name": "" # str,数据库表名
        "X": ["x1", "x2"], # list,检测变量
        "E": ["e1","e2"], # list,期望频率变量
        "input_e": [2,3,4], #用户具体输入的期望频率
        "button_type": ["select","input","null"] #str 按钮的类型
    }
    :return:
    """
    log.info('result_one_sample_chi_get_results_init...')
    request_data = init_route()
    try:
        table_name = request_data['table_name']
        X = request_data['X']
        E = request_data['E']
        input_e = request_data['input_e']
        button_type = request_data['button_type']

    except Exception as e:
        log.info(e)
        raise e
    assert isinstance(X, list)
    results = []
    try:
        # observed frequencies are always read from X
        te = exec_sql(table_name, X)
        te = te.astype(float)
        test = [te[i] for i in X]
        # expected frequencies depend on the button type
        if button_type[0] == 'null':
            expect = None  # no expected frequencies: test against a uniform distribution
        elif button_type[0] == 'select':
            ex = exec_sql(table_name, E)
            ex = ex.astype(float)
            expect = [ex[j] for j in E]
        elif button_type[0] == 'input':
            expect = pd.DataFrame(input_e).astype(float).values.tolist()
        else:
            raise ValueError("button_type must be select, input or null")
        log.info("Input data size: {}".format(len(test)))

        # a single column is tested along axis 0, several columns along axis 1
        axis = 0 if te.shape[1] == 1 else 1
        observed = te[X[0]] if (button_type[0] == 'null' and te.shape[1] == 1) else test
        statistic, pvalue = stats.power_divergence(observed, expect, axis=axis)

        d = pd.DataFrame([statistic, pvalue]).T
        d = d.astype(float)
        d = format_data_col(d)
        results = transform_table_data_to_html({
            'title': '单样本卡方检验',
            'col': ['卡方', '显著性'],
            'row': X,
            'data': d.values.tolist()
        })
        log.info("One-sample chi-square analysis complete")

        response_data = {"code": "200",
                         "msg": "ok!",
                         "res": results}
        return jsonify(response_data)
    except Exception as e:
        log.error(e)
        raise e
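The three button_type modes map onto three payload shapes; for instance (table and column names are made up):

# no expected frequencies: x1 is tested against a uniform distribution
payload_null = {"table_name": "chi_demo", "X": ["x1"], "E": [],
                "input_e": [], "button_type": ["null"]}

# expected frequencies read from another column of the same table
payload_select = {"table_name": "chi_demo", "X": ["x1"], "E": ["e1"],
                  "input_e": [], "button_type": ["select"]}

# expected frequencies typed in by the user
payload_input = {"table_name": "chi_demo", "X": ["x1"], "E": [],
                 "input_e": [10, 20, 30], "button_type": ["input"]}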