def anova_all_way():
    """Run a multi-factor ANOVA for the current request and return the report as JSON.

    Request parameters (obtained through ``init_route()``)::

        {
            "table_name": "",        # str, database table name
            "X": ["x1", "x2"],       # list, independent variables
            "Y": ["y"],              # list, dependent variable
            "alpha": "0.05",         # str, confidence-interval percentage (parsed with float())
            "table_direction": "",   # str, table layout: "h" horizontal, "v" vertical
            "analysis_options": ["normal", "variances", "multiple"]
        }

    The "multiple" option is accepted but not acted upon yet (multiple
    comparison is still a TODO in this endpoint).

    :return: ``jsonify`` response. On success ``{"res": [...], "code": "200",
        "msg": "ok!"}``; if the analysis fails ``{"data": "", "code": "500",
        "msg": <error>}``.
    :raises TypeError: if ``X`` or ``Y`` is not a list.
    :raises Exception: any error raised while reading the request parameters
        is logged and re-raised.
    """
    log.info('anova_all_way_test_init...')
    request_data = init_route()
    try:
        table_name = request_data['table_name']
        X = request_data['X']
        Y = request_data['Y']
        alpha = float(request_data['alpha'])
        table_direction = request_data['table_direction']
        analysis_options = request_data.get("analysis_options", [])
    except Exception as e:
        log.info(e)
        raise
    # The previous `assert isinstance([X, Y], list)` tested a freshly built
    # list and could never fail; check the two values themselves.
    if not isinstance(X, list) or not isinstance(Y, list):
        raise TypeError("X and Y must both be lists")
    # Fetch the data from the database.
    data = exec_sql(table_name, X, Y)
    log.info("输入数据大小:{}".format(len(data)))
    try:
        if table_direction == "v":
            # NOTE(review): float16 keeps roughly 3 significant digits and
            # overflows above 65504 - confirm this precision is intended.
            data[Y[0]] = data[Y[0]].astype("float16")
        elif table_direction == "h":
            data, X, Y = transform_h_table_data_to_v(data, X)
        else:
            raise ValueError("table direction must be h or v")
        res = []
        # Between-subject factors.
        res.append(level_info(data, X))
        # Descriptive statistics.
        res.append(anova_all_way_describe_info(data, X, Y))
        if "normal" in analysis_options:
            res.append(normal_test_all(data, X, alpha=alpha))
        if "variances" in analysis_options:
            res.append(transform_table_data_to_html(levene_test_all(data, X, alpha=alpha)))
        # Multi-factor analysis of variance.
        res.append(transform_table_data_to_html(anova_analysis_multivariate(data, X, Y)))
        # TODO: multiple comparison to be added later.
        response_data = {"res": res, "code": "200", "msg": "ok!"}
        return jsonify(response_data)
    except Exception as e:
        log.error(e)
        # An exception raised without arguments has empty `args`; fall back to
        # str(e) so the handler itself cannot fail with IndexError.
        msg = e.args[0] if e.args else str(e)
        return jsonify({"data": "", "code": "500", "msg": msg})
def t_two_pair():
    """Run a paired-samples t test for the current request and return the report as JSON.

    Request parameters (obtained through ``init_route()``)::

        {
            "table_name": "",        # str, database table name
            "X": ["x1", "x2"],       # list; for direction "h" the variable names,
                                     # for "v" the categorical (grouping) field
            "Y": ["y"],              # list, dependent variable, used when direction is "v"
            "alpha": "0.05",         # str, confidence-interval percentage (parsed with float())
            "table_direction": "",   # str, table layout: "h" horizontal, "v" vertical
            "analysis_options": ["normal", "pearsonr"]
        }

    :return: ``jsonify`` response. On success ``{"res": [...], "code": "200",
        "msg": "ok!"}``; if the analysis fails ``{"data": "", "code": "500",
        "msg": <error>}``.
    :raises TypeError: if ``X`` or ``Y`` is not a list.
    :raises Exception: any error raised while reading the request parameters
        is logged and re-raised.
    """
    log.info('t_two_pair_init...')
    request_data = init_route()
    try:
        table_name = request_data['table_name']
        X = request_data['X']
        Y = request_data['Y']
        table_direction = request_data['table_direction']
        alpha = float(request_data['alpha'])
        analysis_options = request_data.get("analysis_options", [])
    except Exception as e:
        log.info(e)
        raise
    # The previous `assert isinstance([X, Y], list)` could never fail.
    if not isinstance(X, list) or not isinstance(Y, list):
        raise TypeError("X and Y must both be lists")
    # Fetch the data from the database.
    data = exec_sql(table_name, X, Y)
    log.info("输入数据大小:{}".format(len(data)))
    try:
        if table_direction == "v":
            levels = data[X[0]].unique()
            every_level_data_index = list(levels)
            # NOTE(review): float16 is very low precision - confirm intended.
            every_level_data = [
                data[data[X[0]] == level][Y[0]].astype("float16") for level in levels
            ]
        elif table_direction == "h":
            every_level_data_index = X
            every_level_data = [data[name].astype("float16") for name in X]
            data, X, Y = transform_h_table_data_to_v(data, X)
        else:
            raise ValueError("table direction must be h or v")
        # A paired test needs exactly two samples. The old `> 2` check let
        # zero or one level through to the analysis calls below.
        if len(every_level_data_index) != 2:
            raise ValueError("自变量的水平必须是2个")
        res = []
        # Descriptive statistics.
        res.append(transform_table_data_to_html(t_two_paired_describe_info(data, X, Y)))
        if "pearsonr" in analysis_options:
            res.append(transform_table_data_to_html(
                pearsonr_test(*every_level_data, index=every_level_data_index, alpha=alpha)))
        if "normal" in analysis_options:
            res.append(transform_table_data_to_html(
                normal_test(every_level_data_index, every_level_data, alpha)))
        res.append(transform_table_data_to_html(
            t_two_pair_analysis(*every_level_data, index=every_level_data_index, alpha=alpha),
            col0="配对差值"))
        response_data = {"res": res, "code": "200", "msg": "ok!"}
        return jsonify(response_data)
    except Exception as e:
        log.error(e)
        # Exceptions without arguments have empty `args`; avoid IndexError here.
        msg = e.args[0] if e.args else str(e)
        return jsonify({"data": "", "code": "500", "msg": msg})
def nonparametric_two_independent():
    """Run the Mann-Whitney U test (two independent samples) and return the report as JSON.

    Request parameters (obtained through ``init_route()``)::

        {
            "table_name": "",        # str, database table name
            "X": ["x1", "x2"],       # list; for direction "h" the variable names,
                                     # for "v" the categorical (grouping) field
            "Y": ["y"],              # list, dependent variable, used when direction is "v"
            "table_direction": "",   # str, table layout: "h" horizontal, "v" vertical
        }

    :return: ``jsonify`` response ``{"res": [...], "data_info": ..., "code": "200",
        "msg": "ok!"}``.
    :raises TypeError: if ``X`` or ``Y`` is not a list.
    :raises ValueError: if the direction is neither "h" nor "v", or the number
        of levels is not exactly two.
    :raises Exception: every other error is logged and re-raised.
    """
    log.info('nonparametric_two_independent_get_results_init...')
    request_data = init_route()
    try:
        table_name = request_data['table_name']
        X = request_data['X']
        Y = request_data['Y']
        table_direction = request_data['table_direction']
    except Exception as e:
        log.info(e)
        raise
    # The previous `assert isinstance([X, Y], list)` could never fail.
    if not isinstance(X, list) or not isinstance(Y, list):
        raise TypeError("X and Y must both be lists")
    # Fetch the data from the database.
    data = exec_sql(table_name, X, Y)
    log.info("输入数据大小:{}".format(len(data)))
    try:
        if table_direction == "v":
            every_level_data_index = list(data[X[0]].unique())
            data, X = transform_v_table_data_to_h(data, X, Y)
        elif table_direction == "h":
            every_level_data_index = X
            # The converted columns are not used afterwards; the conversion is
            # kept because astype raises here if a column cannot be cast to float.
            for column in X:
                data[column].astype("float16")
            # Horizontal data is already in the layout the test functions take.
        else:
            raise ValueError("table direction must be h or v")
        # The message requires exactly two levels; the old `> 2` check also
        # accepted zero or one.
        if len(every_level_data_index) != 2:
            raise ValueError("自变量的水平必须是2个")
        res = []
        # Descriptive statistics.
        data_info = transform_table_data_to_html(Mann_Whitney_U_describe(data, X))
        res.append(data_info)
        # Mann-Whitney U test.
        Mann_Whitney_U_res = transform_table_data_to_html(Mann_Whitney_U_test(data, X))
        res.append(Mann_Whitney_U_res)
        response_data = {"res": res, "data_info": data_info, "code": "200", "msg": "ok!"}
        return jsonify(response_data)
    except Exception as e:
        log.error(e)
        raise
def results_nonparametric_two_independent():
    """Run the Wilcoxon signed-rank test and return the report as JSON.

    NOTE(review): the function name says "two_independent" while the log
    message and the calls (``Wilcoxon_describe`` / ``Wilcoxon_test``) refer to
    the paired test - confirm the route naming.

    Request parameters (obtained through ``init_route()``)::

        {
            "table_name": "",        # str, database table name
            "X": ["x1", "x2"],       # list of one or two column names
        }

    :return: ``jsonify`` response ``{"res": [...], "data_info": ..., "code": "200",
        "msg": "ok!"}``.
    :raises TypeError: if ``X`` is not a list.
    :raises ValueError: if more than two columns are requested.
    :raises Exception: every other error is logged and re-raised.
    """
    log.info('nonparametric_two_pair_get_results_init...')
    request_data = init_route()
    try:
        table_name = request_data['table_name']
        X = request_data['X']
    except Exception as e:
        log.info(e)
        raise
    # The previous `assert isinstance([X], list)` tested a freshly built list
    # and could never fail; check X itself.
    if not isinstance(X, list):
        raise TypeError("X must be a list")
    if len(X) > 2:
        raise ValueError("只支持一列数据或两列数据")
    # Fetch the data from the database.
    data = exec_sql(table_name, X)
    log.info("输入数据大小:{}".format(len(data)))
    try:
        res = []
        # Descriptive statistics.
        data_info = transform_table_data_to_html(Wilcoxon_describe(data, X))
        res.append(data_info)
        log.info("描述性统计分析完成")
        # Wilcoxon signed-rank test.
        Wilcoxon_res = transform_table_data_to_html(Wilcoxon_test(data, X))
        res.append(Wilcoxon_res)
        log.info("Wilcoxon 符号秩检验完成")
        response_data = {"res": res, "data_info": data_info, "code": "200", "msg": "ok!"}
        return jsonify(response_data)
    except Exception as e:
        log.error(e)
        raise
def loadings(self, data):
    """Return the factor loading matrix before rotation as an HTML table.

    Reads ``self.component`` (number of factors) and ``self.eigvector``, and
    calls ``self.var_contri(data)`` for the eigenvalues (key ``'Eigvalue'``).
    Stores the factor column names in ``self.col`` and the formatted loading
    table in ``self.zaihe``.

    :param data: table of observed variables; cast to float before use. Its
        columns become the row labels of the loading table.
    :return: result of ``transform_table_data_to_html`` for the table titled
        "旋转前因子载荷".
    """
    data = data.astype(float)
    factor_num = self.component
    # Loadings = eigenvectors scaled by the square roots of the leading
    # `factor_num` eigenvalues (placed on a diagonal matrix).
    eigvalue = self.var_contri(data)['Eigvalue']
    root_eigvalues = list(np.array(np.sqrt(eigvalue[:factor_num]), dtype=float))
    eigmat = np.diag(root_eigvalues)
    zaihe = np.dot(self.eigvector[:factor_num].T, eigmat)
    col = ['Factor' + str(i) for i in range(1, factor_num + 1)]
    # The whole matrix is negated and the second factor is then flipped back.
    # NOTE(review): presumably a sign convention to match a reference output -
    # confirm why only the second column is treated differently.
    zaihe = -pd.DataFrame(zaihe, columns=col)
    if factor_num > 1:
        # With a single factor there is no second column; indexing it
        # unconditionally raised IndexError.
        zaihe.iloc[:, 1] = -zaihe.iloc[:, 1]
    self.col = col
    zaihe.index = data.columns
    self.zaihe = format_data_col(zaihe)
    return transform_table_data_to_html({
        'title': "旋转前因子载荷",
        'col': self.zaihe.columns.values.tolist(),
        'row': self.zaihe.index.values.tolist(),
        'data': self.zaihe.values.tolist()
    })
def varimax_rotation(self, data):
    """Rotate the factor loadings with a maximum-variance orthogonal rotation.

    The loadings come from ``self.load(data)``. The rotation is refined for at
    most ``self.q`` iterations using ``self.gamma``, and stops early once the
    sum of singular values grows by a factor smaller than ``1 + self.tol``.
    The rotated array is stored in ``self.orthogonal``.

    :param data: table of observed variables; cast to float before use. Its
        columns become the row labels of the result (column labels come from
        ``self.col``).
    :return: result of ``transform_table_data_to_html`` for the table titled
        "旋转后因子载荷".
    """
    data = data.astype(float)
    loading = self.load(data)
    n_rows, n_factors = loading.shape
    rotation = np.eye(n_factors)
    criterion = 0
    for _ in range(self.q):
        previous = criterion
        rotated = np.dot(loading, rotation)
        diag_part = np.diag(np.diag(np.dot(rotated.T, rotated)))
        target = np.asarray(rotated)**3 - (self.gamma / n_rows) * np.dot(rotated, diag_part)
        left, singular_values, right = np.linalg.svd(np.dot(loading.T, target))
        rotation = np.dot(left, right)
        criterion = np.sum(singular_values)
        if previous != 0 and criterion / previous < 1 + self.tol:
            break
    rotated_loading = np.dot(loading, rotation)
    self.orthogonal = rotated_loading
    table = pd.DataFrame(rotated_loading, index=data.columns, columns=self.col)
    table = format_data_col(table)
    payload = {
        'title': "旋转后因子载荷",
        'col': table.columns.values.tolist(),
        'row': table.index.values.tolist(),
        'data': table.values.tolist()
    }
    return transform_table_data_to_html(payload)
def results_describe():
    """Compute descriptive statistics for the requested columns and return them as JSON.

    Request parameters (obtained through ``init_route()``)::

        {
            "table_name": "",   # str, database table name
            "X": ["x1"],        # list of column names to describe
        }

    :return: ``jsonify`` response ``{"res": <html table>, "code": "200", "msg": "ok!"}``.
    :raises TypeError: if ``X`` is not a list.
    :raises Exception: every other error is logged and re-raised.
    """
    log.info('describe_get_results_init...')
    request_data = init_route()
    try:
        table_name = request_data['table_name']
        X = request_data['X']
    except Exception as e:
        log.info(e)
        raise
    # The previous `assert isinstance([X], list)` tested a freshly built list
    # and could never fail; check X itself.
    if not isinstance(X, list):
        raise TypeError("X must be a list")
    # Fetch the data from the database.
    data = exec_sql(table_name, X)
    log.info("输入数据大小:{}".format(len(data)))
    try:
        describe_result = transform_table_data_to_html(description(data, X), col0='指标名称')
        log.info("调用描述性统计函数成功")
        response_data = {"res": describe_result, "code": "200", "msg": "ok!"}
        return jsonify(response_data)
    except Exception as e:
        log.error(e)
        raise
def results_nonparametric_multi_independent():
    """Run the Kruskal-Wallis H test (several independent samples) and return the report as JSON.

    Request parameters (obtained through ``init_route()``)::

        {
            "table_name": "",        # str, database table name
            "X": ["x1", "x2"],       # list; for direction "h" the variable names,
                                     # for "v" the categorical (grouping) field
            "Y": ["y"],              # list, dependent variable, used when direction is "v"
            "table_direction": "",   # str, table layout: "h" horizontal, "v" vertical
        }

    :return: ``jsonify`` response ``{"res": [...], "data_info": ..., "code": "200",
        "msg": "ok!"}``. The test result is appended as returned by
        ``Kruskal_Wallis_H_test`` (it is not passed through
        ``transform_table_data_to_html`` here).
    :raises TypeError: if ``X`` or ``Y`` is not a list.
    :raises ValueError: if the direction is neither "h" nor "v", or fewer than
        two levels are present.
    :raises Exception: every other error is logged and re-raised.
    """
    log.info('nonparametric_multi_independent_get_results_init...')
    request_data = init_route()
    try:
        table_name = request_data['table_name']
        X = request_data['X']
        Y = request_data['Y']
        table_direction = request_data['table_direction']
    except Exception as e:
        log.info(e)
        raise
    # The previous `assert isinstance([X, Y], list)` could never fail.
    if not isinstance(X, list) or not isinstance(Y, list):
        raise TypeError("X and Y must both be lists")
    # Fetch the data from the database.
    data = exec_sql(table_name, X, Y)
    log.info("输入数据大小:{}".format(len(data)))
    try:
        if table_direction == "v":
            every_level_data_index = list(data[X[0]].unique())
            data, X = transform_v_table_data_to_h(data, X, Y)
        elif table_direction == "h":
            every_level_data_index = X
            # The converted columns are not used afterwards; the conversion is
            # kept because astype raises here if a column cannot be cast to float.
            for column in X:
                data[column].astype("float16")
        else:
            raise ValueError("table direction must be h or v")
        if len(every_level_data_index) < 2:
            raise ValueError("多个独立样本非参数检验,自变量的水平至少是2个")
        res = []
        # Descriptive statistics.
        data_info = transform_table_data_to_html(Kruskal_Wallis_H_describe(data, X))
        res.append(data_info)
        log.info("描述性统计分析完成")
        # Kruskal-Wallis H test.
        Kruskal_Wallis_H_res = Kruskal_Wallis_H_test(data, X)
        res.append(Kruskal_Wallis_H_res)
        log.info("Kruska-Wallis H 检验完成")
        response_data = {"res": res, "data_info": data_info, "code": "200", "msg": "ok!"}
        return jsonify(response_data)
    except Exception as e:
        log.error(e)
        raise
def t_single():
    """Run a one-sample t test for the current request and return the report as JSON.

    Request parameters (obtained through ``init_route()``)::

        {
            "table_name": "",        # str, database table name
            "X": ["value"],          # list; the first entry is the tested column
            "alpha": "0.05",         # str, confidence-interval percentage (parsed with float())
            "mean": "0",             # str, value the sample mean is tested against
            "analysis_options": ["normal"]
        }

    :return: ``jsonify`` response. On success ``{"res": [...], "code": "200",
        "msg": "ok!"}``; if the analysis fails ``{"data": "error", "code": "500",
        "msg": <error>}``.
    :raises Exception: any error raised while reading the request parameters
        is logged and re-raised.
    """
    log.info('t_single_test_init...')
    request_data = init_route()
    try:
        table_name = request_data['table_name']
        alpha = float(request_data['alpha'])
        X = request_data['X']
        data_mean = float(request_data['mean'])
        analysis_options = request_data.get("analysis_options", [])
    except Exception as e:
        log.info(e)
        raise
    # Fetch the data from the database.
    data = exec_sql(table_name, X)
    log.info("输入数据大小:{}".format(len(data)))
    try:
        res = []
        # NOTE(review): float16 is very low precision - confirm intended.
        data[X[0]] = data[X[0]].astype("float16")
        data_info = transform_table_data_to_html(t_single_describe_info(data, X))
        res.append(data_info)
        # Normality test.
        if "normal" in analysis_options:
            normal_res = transform_table_data_to_html(
                normal_test([X[0]], [data[X[0]]], alpha=alpha))
            res.append(normal_res)
        # One-sample t test result.
        t_single_res = transform_table_data_to_html(
            t_single_analysis(data[X[0]].astype("float16"), data_mean, X, alpha=alpha),
            col0="检验值={}".format(data_mean))
        res.append(t_single_res)
        response_data = {"res": res, "code": "200", "msg": "ok!"}
        return jsonify(response_data)
    except Exception as e:
        log.error(e)
        # An exception raised without arguments has empty `args`; fall back to
        # str(e) so the handler itself cannot fail with IndexError.
        msg = e.args[0] if e.args else str(e)
        return jsonify({"data": "error", "code": "500", "msg": msg})
def score(self, data):
    """Compute the factor scores and return them as an HTML table.

    The scores are the (optionally standardized) data multiplied by the
    transpose of ``self.coefficient``. Standardization is applied via
    ``FA.standardization`` when ``self.standardize`` equals ``True`` and is
    skipped when it equals ``False``; for any other value nothing is computed
    and ``None`` is returned.

    :param data: table of observed variables; cast to float before use.
    :return: result of ``transform_table_data_to_html`` for the table titled
        "因子得分", or ``None`` (see above).
    """
    data = data.astype(float)
    # Equality comparisons (not truthiness) are kept on purpose so that the
    # set of accepted `standardize` values stays exactly as before.
    if self.standardize == True:  # noqa: E712
        prepared = FA.standardization(self, data)
    elif self.standardize == False:  # noqa: E712
        prepared = data
    else:
        return None
    scores = pd.DataFrame(np.dot(prepared, self.coefficient.T))
    scores.columns = ['ScoreF' + str(k) for k in range(1, self.component + 1)]
    scores = format_data_col(scores)
    payload = {
        'title': "因子得分",
        'col': scores.columns.values.tolist(),
        'row': scores.index.values.tolist(),
        'data': scores.values.tolist()
    }
    return transform_table_data_to_html(payload)
def correlation_matrix(x):
    """Return the pairwise correlation matrix of ``x`` as an HTML table.

    :param x: table of variables; cast to float, then ``corr()`` is applied
        and the result is formatted with ``format_data_col``.
    :return: result of ``transform_table_data_to_html`` for the table titled
        "相关性矩阵".
    """
    numeric = x.astype(float)
    formatted = format_data_col(numeric.corr())
    payload = {
        'title': "相关性矩阵",
        'col': formatted.columns.values.tolist(),
        'row': formatted.index.values.tolist(),
        'data': formatted.values.tolist()
    }
    return transform_table_data_to_html(payload)
def score_coef(self, data):
    """Compute the factor score coefficient matrix and return it as an HTML table.

    Multiplies the transposed result of ``self.varimax_rota(data)`` by the
    transposed correlation matrix of ``data`` and stores the product
    (factors x variables, labelled with ``self.col`` and ``data.columns``) in
    ``self.coefficient``. The returned table is the transpose of that matrix.

    NOTE(review): the regression estimate of factor-score coefficients
    normally uses the *inverse* of the correlation matrix; this code uses its
    transpose. The arithmetic is left as it was - confirm which is intended.

    :param data: table of observed variables; cast to float before use.
    :return: result of ``transform_table_data_to_html`` for the table titled
        "因子得分系数矩阵".
    """
    data = data.astype(float)
    # Correlation matrix of the original variables (columns are variables).
    corr = np.corrcoef(data, rowvar=0)
    A = self.varimax_rota(data)
    # `np.mat` was removed in NumPy 2.0; a plain ndarray transpose yields the
    # same numbers without relying on the matrix class.
    product = np.dot(np.array(A).T, corr.T)
    coefficient = pd.DataFrame(product, columns=data.columns, index=self.col)
    self.coefficient = coefficient
    defen = format_data_col(coefficient.T)
    return transform_table_data_to_html({
        'title': "因子得分系数矩阵",
        'col': defen.columns.values.tolist(),
        'row': defen.index.values.tolist(),
        'data': defen.values.tolist()
    })
def cross_chi2(index, columns):
    """Build a cross table, its expected frequencies and chi-square tests.

    ``chi2_contingency`` is run once per power-divergence statistic on the
    cross table without margins (the margin row/column must not be fed to the
    test). The displayed cross table does include the margins, relabelled
    '总计'.

    :param index: values passed to ``pd.crosstab`` as ``index``.
    :param columns: values passed to ``pd.crosstab`` as ``columns``.
    :return: list of three HTML tables: cross table, expected frequencies,
        chi-square test results.
    """
    statistic_names = [
        "pearson", "log-likelihood", "freeman-tukey", "mod-log-likelihood",
        "neyman", "cressie-read"
    ]
    with_margins = pd.crosstab(index=index, columns=columns, margins=True)
    # Table handed to the tests: no summary row/column.
    observed = pd.crosstab(index=index, columns=columns, margins=False)

    test_rows = []
    for statistic_name in statistic_names:
        chi2_value, p_value, dof, _expected = chi2_contingency(
            observed, correction=True, lambda_=statistic_name)
        test_rows.append(
            ["{:.4f}".format(chi2_value), "{:.4f}".format(p_value), dof])

    row_labels = with_margins.index.tolist()
    row_labels[-1] = '总计'
    column_labels = with_margins.columns.tolist()
    column_labels[-1] = '总计'
    counts = with_margins.values.tolist()

    expected_table = sum_data(pd.DataFrame(expected_freq(observed)))
    expected_values = format_data_col(expected_table).values.tolist()

    cross_html = transform_table_data_to_html({
        'title': "交叉表",
        'row': row_labels,
        'col': column_labels[0:],
        'data': counts
    })
    expected_html = transform_table_data_to_html({
        'title': "期望频数表",
        'row': row_labels,
        'col': column_labels,
        'data': expected_values
    })
    test_html = transform_table_data_to_html({
        'title': "卡方检验",
        'row': list(statistic_names),
        'col': ['值', '显著性', '自由度'],
        'data': test_rows
    })
    return [cross_html, expected_html, test_html]
def PCA(x, components=None):
    """Run a principal component analysis and return the report entries.

    The data is standardized (mean 0, sample standard deviation 1 per
    column), the eigen-decomposition of its covariance matrix is taken, and
    the components are ordered by decreasing eigenvalue.

    :param x: DataFrame containing only the feature variables; cast to float.
    :param components: number of principal components requested by the user;
        defaults to the number of columns of ``x``.
    :return: list with four entries, in order: a dict for the "总方差解释"
        table (eigenvalue, proportion, cumulative proportion), a dict holding
        the scree plot as base64 PNG, the component loading table and the
        component score table (the last two passed through
        ``transform_table_data_to_html``).
    """
    x = x.astype(float)
    result = []
    if components is None:
        # One component per column; shape[1] also works for an empty table,
        # where size / len(x) divided by zero.
        components = x.shape[1]

    # --- standardization ---
    average = np.mean(x, axis=0)
    sigma = np.std(x, axis=0, ddof=1)
    r, c = np.shape(x)
    mu = np.tile(average, (r, 1))  # the column means repeated on every row
    data_standardized = (x - mu) / sigma

    cov_matrix = np.cov(data_standardized.T)  # covariance matrix
    EigenValue, EigenVector = np.linalg.eig(cov_matrix)
    index = np.argsort(-EigenValue)  # positions sorted by decreasing eigenvalue
    # Eigenvectors belonging to the `components` largest eigenvalues.
    selected_Vector = EigenVector.T[index[:components]]
    Score = np.dot(data_standardized, selected_Vector.T)  # component scores
    EigenValue_sorted = EigenValue[index]

    # --- eigenvalue contribution table ---
    EigenValue_contribution = pd.DataFrame(EigenValue_sorted, columns=['EigenValue'])
    EigenValue_contribution['Proportion'] = (
        EigenValue_contribution['EigenValue'] / EigenValue_contribution['EigenValue'].sum())
    EigenValue_contribution['Cumulative'] = EigenValue_contribution['Proportion'].cumsum()

    # --- scree plot and variance-explained plot ---
    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.set_size_inches(6.8, 3)
    fig.subplots_adjust(wspace=0.5)
    positions = range(1, len(EigenValue_contribution) + 1)
    ax1.plot(positions, EigenValue_contribution['EigenValue'], 'o-')
    ax1.set_title('Scree Plot')
    ax1.set_xlabel('Principal Components')
    ax1.set_ylabel('Eigenvalue')
    ax1.grid()
    ax2.plot(positions, EigenValue_contribution['Proportion'], 'o-')
    ax2.plot(positions, EigenValue_contribution['Cumulative'], 'bo-.')
    ax2.set_title('Variance Explained')
    ax2.set_xlabel('Principal Components')
    ax2.set_ylabel('Proportion')
    ax2.grid()
    # No plt.show() here: the figure is exported below, and showing it first
    # blocks on interactive backends and can leave nothing to export once the
    # window is closed.

    # --- eigenvectors of the selected components ---
    vector_index = ['prin%d' % (i + 1) for i in range(len(selected_Vector))]
    vector_columns = x.columns.values.tolist()
    principal_vector = pd.DataFrame(
        selected_Vector, index=vector_index, columns=vector_columns).T

    # --- component loadings: sqrt(eigenvalue) * eigenvector ---
    principal_component_load = pd.DataFrame()
    for i in range(len(selected_Vector)):
        principal_component_load['z%d' % (i + 1)] = (
            np.sqrt(EigenValue_contribution['EigenValue'][i])
            * principal_vector['prin%d' % (i + 1)])

    # --- component scores plus an eigenvalue-weighted total, sorted by total ---
    principal_scores = pd.DataFrame()
    for i in range(len(selected_Vector)):
        principal_scores['prin%d_score' % (i + 1)] = Score[:, i]
    EigenValue_sorted_selected = EigenValue_sorted[:len(selected_Vector)]
    chengji = EigenValue_sorted_selected * principal_scores
    principal_scores['scores'] = chengji.sum(axis=1)
    principal_scores = principal_scores.sort_values(by='scores', ascending=False)

    # Format the contribution table for display (this rewrites the columns of
    # EigenValue_contribution as strings, so it must happen after all
    # numeric use above).
    Eig_contri = EigenValue_contribution
    Eig_contri['EigenValue'] = Eig_contri['EigenValue'].apply(
        lambda v: format(v, '.4f'))
    Eig_contri['Proportion'] = Eig_contri['Proportion'].apply(
        lambda v: format(v, '.2%'))
    Eig_contri['Cumulative'] = Eig_contri['Cumulative'].apply(
        lambda v: format(v, '.2%'))
    result.append({
        'title': "总方差解释",
        'col': ['特征值', '特征值方差贡献率', '累计方差贡献率'],
        'data': Eig_contri.values.tolist()
    })
    result.append({
        "title": "碎石图",
        "base64": "{}".format(plot_and_output_base64_png(plt))
    })

    prin_com_load = format_data_col(principal_component_load)
    result.append(
        transform_table_data_to_html({
            'title': "主成分载荷",
            'col': prin_com_load.columns.values.tolist(),
            'row': prin_com_load.index.values.tolist(),
            'data': prin_com_load.values.tolist()
        }))

    prin_scores = format_data_col(principal_scores)
    result.append(
        transform_table_data_to_html({
            'title': "主成分得分系数矩阵",
            'col': prin_scores.columns.values.tolist(),
            'row': prin_scores.index.values.tolist(),
            'data': prin_scores.values.tolist()
        }))
    return result
def apriori():
    """Mine frequent itemsets and association rules for the current request.

    Request parameters (obtained through ``init_route()``)::

        {
            "table_name": "apriori_test",   # str, database table name (required)
            "X": ["x0", "x1", "x2"],        # list of columns (required)
            "alg": "fpgrowth",              # str, "apriori" or "fpgrowth"; default "fpgrowth"
            "dataconvert": True,            # bool, encode rows as transactions; default True
            "minSupport": "0.05",           # str, minimum support; default "0.05"
            "max_len": "2",                 # max itemset length; default None (no limit)
            "metrics": "confidence",        # rule metric: "support", "confidence", "lift",
                                            # "leverage" or "conviction"; default "confidence"
            "min_threshold": "0.8",         # str, minimum metric value; default "0.8"
        }

    :return: ``jsonify`` response. On success ``{"res": [itemsets table,
        rules table], "code": "200", "msg": "ok!"}``; if the analysis fails
        ``{"code": "500", "res": "", "msg": <error args>}``.
    :raises ImportError: if mlxtend cannot be imported.
    :raises Exception: any error raised while reading the request parameters
        is logged and re-raised.
    """
    log.info('Apriori_init...')
    request_data = init_route()
    try:
        from mlxtend.preprocessing import TransactionEncoder
        from mlxtend.frequent_patterns import apriori
        from mlxtend.frequent_patterns import fpgrowth
        from mlxtend.frequent_patterns import association_rules
    except ImportError as e:
        # Only import failures are translated; a bare `except:` also caught
        # KeyboardInterrupt/SystemExit.
        raise ImportError("cannot import mlxtend") from e
    try:
        table_name = request_data['table_name']
        X = request_data['X']
        # Optional parameters fall back to the defaults documented above;
        # previously a missing key raised and max_len=None crashed int().
        alg = request_data.get('alg', 'fpgrowth')
        dataconvert = request_data.get('dataconvert', True)
        min_support = float(request_data.get('minSupport', "0.05"))
        raw_max_len = request_data.get('max_len')
        max_len = int(raw_max_len) if raw_max_len is not None else None
        metrics = request_data.get('metrics', 'confidence')
        min_threshold = float(request_data.get('min_threshold', "0.8"))
    except Exception as e:
        log.info(e)
        raise
    try:
        table_data = exec_sql(table_name, X)
        table_data.fillna("", inplace=True)
        if dataconvert:
            # Each row is one transaction; encode to a boolean item table.
            transactions = table_data.values.tolist()
            trans = TransactionEncoder()
            encoded = trans.fit(transactions).transform(transactions)
            data = pd.DataFrame(encoded, columns=trans.columns_)
        else:
            # Use the table as fetched. Before, `data` was a plain list on
            # this path and the `data.columns` access below always failed.
            data = table_data
        log.info("data columns:{}".format(data.columns.values))
        if "" in data.columns:
            # The empty string stands for missing cells (see fillna above),
            # not for a real item.
            data = data.drop(columns="")
        if alg == "apriori":
            frequent_itemsets = apriori(
                data, min_support=min_support, max_len=max_len, use_colnames=True)
        elif alg == "fpgrowth":
            frequent_itemsets = fpgrowth(
                data, min_support=min_support, max_len=max_len, use_colnames=True)
        else:
            raise ValueError("input Association rules:{} is not support".format(alg))
        rules = association_rules(frequent_itemsets, metric=metrics, min_threshold=min_threshold)
        # Infinite metric values are blanked before rendering.
        rules = rules.replace([np.inf, -np.inf], "")
        rules = format_dataframe(rules, {"lift": ".4f", "leverage": ".4f"})
        res = [
            transform_table_data_to_html({
                "title": "频繁项集结果",
                "row": frequent_itemsets.index.tolist(),
                "col": frequent_itemsets.columns.tolist(),
                "data": frequent_itemsets.values.tolist(),
            }),
            transform_table_data_to_html({
                "title": "关联规则结果",
                "row": rules.index.tolist(),
                "col": rules.columns.tolist(),
                "data": rules.values.tolist(),
            })
        ]
        response_data = {"res": res, "code": "200", "msg": "ok!"}
        return jsonify(response_data)
    except Exception as e:
        log.exception(e)
        return jsonify({"code": "500", "res": "", "msg": "{}".format(e.args)})
def cross_chis(index, columns, fenceng):
    """Build a layered cross table with per-layer chi-square tests.

    The cross table is built over all of ``index`` (the first entry being the
    layer variable). For every distinct value of ``index[0]`` the
    corresponding slice of the margin-free cross table is tested with each
    power-divergence statistic; the expected frequencies of the Pearson run
    are collected for the expected-frequency table.

    :param index: sequence of row variables for ``pd.crosstab``; ``index[0]``
        is the layer variable.
    :param columns: column variable(s) for ``pd.crosstab``.
    :param fenceng: sequence whose first entry is the layer variable's name,
        used as prefix of the test row labels.
    :return: list of three HTML tables: cross table, expected frequencies,
        chi-square test results.
    """
    methods = [
        "pearson", "log-likelihood", "freeman-tukey", "mod-log-likelihood",
        "neyman", "cressie-read"
    ]

    def _row_label(entry):
        # Index entries are tuples (one value per level) except the margin
        # label, which is the plain string '总计'. Joining that string
        # character by character produced '总/计'.
        if isinstance(entry, str):
            return entry
        return "/".join("{}".format(part) for part in entry)

    # Layered cross table (with margins, for display).
    cross_result = pd.crosstab(index=index, columns=columns, margins=True)
    corss_index = cross_result.index.tolist()
    corss_index[-1] = '总计'
    corss_columns = cross_result.columns.tolist()
    corss_columns[-1] = '总计'
    corss_value = cross_result.values.tolist()

    # Table handed to the tests: no summary row/column.
    cr_re = pd.crosstab(index=index, columns=columns, margins=False)
    first_index = np.unique(index[0])

    chi_res = []
    expect = []
    row = []
    for level in first_index:
        layer_table = cr_re.loc[level, :]
        for method in methods:
            statistic, p_value, dof, expected = chi2_contingency(
                layer_table, correction=True, lambda_=method)
            chi_res.append(
                ["{:.4f}".format(statistic), "{:.4f}".format(p_value), dof])
            # format() instead of `+` so numeric layer values do not raise
            # TypeError; for string values the label is unchanged.
            row.append("{}_{}:{}".format(fenceng[0], level, method))
            if method == "pearson":
                expect.extend(expected)

    expect = pd.DataFrame(expect)
    expect = sum_data(expect)
    expect = format_data_col(expect).values.tolist()

    r1 = transform_table_data_to_html({
        'title': "交叉表",
        'row': [_row_label(c) for c in corss_index],
        'col': corss_columns,
        'data': corss_value
    })
    # NOTE(review): the expected-frequency table drops the first column label
    # (corss_columns[1:]) - kept as found; confirm against the shape that
    # sum_data returns.
    r2 = transform_table_data_to_html({
        'title': "期望频数表",
        'row': [_row_label(c) for c in corss_index],
        'col': corss_columns[1:],
        'data': expect
    })
    r3 = transform_table_data_to_html({
        'title': "卡方检验",
        'row': row,
        'col': ['值', '显著性', '自由度'],
        'data': chi_res
    })
    return [r1, r2, r3]
def anova_one_way():
    """Run a one-way ANOVA for the current request and return the report as JSON.

    Request parameters (obtained through ``init_route()``)::

        {
            "table_name_ori": "",    # str, table before preprocessing (not read here)
            "table_name": "",        # str, table after preprocessing
            "X": ["x1", "x2"],       # list, independent variables
            "Y": ["y"],              # list, dependent variable
            "alpha": "0.05",         # str, confidence-interval percentage (parsed with float())
            "table_direction": "",   # str, table layout: "h" horizontal, "v" vertical
            "analysis_options": ["normal", "variances", "multiple"]
        }

    :return: ``jsonify`` response. On success ``{"res": [...], "code": "200",
        "msg": "ok!"}``; if the analysis fails ``{"data": "", "code": "500",
        "msg": <error>}``.
    :raises TypeError: if ``X`` or ``Y`` is not a list.
    :raises Exception: any error raised while reading the request parameters
        is logged and re-raised.
    """
    log.info('anova_one_way_init...')
    request_data = init_route()
    try:
        table_name = request_data['table_name']
        X = request_data['X']
        Y = request_data['Y']
        alpha = float(request_data['alpha'])
        table_direction = request_data['table_direction']
        analysis_options = request_data.get("analysis_options", [])
    except Exception as e:
        log.info(e)
        raise
    # The previous `assert isinstance([X, Y], list)` tested a freshly built
    # list and could never fail; check the two values themselves.
    if not isinstance(X, list) or not isinstance(Y, list):
        raise TypeError("X and Y must both be lists")
    # Fetch the data from the database.
    data = exec_sql(table_name, X, Y)
    log.info("输入数据大小:{}".format(len(data)))
    try:
        if table_direction == "v":
            # NOTE(review): float16 is very low precision - confirm intended.
            data[Y[0]] = data[Y[0]].astype("float16")
            levels = data[X[0]].unique()
            every_level_data_index = list(levels)
            every_level_data = [
                data[data[X[0]] == level][Y[0]].astype("float16") for level in levels
            ]
        elif table_direction == "h":
            every_level_data_index = X
            every_level_data = [data[name].astype("float16") for name in X]
            data, X, Y = transform_h_table_data_to_v(data, X)
        else:
            raise ValueError("table direction must be h or v")
        res = []
        # Descriptive statistics.
        data_info = transform_table_data_to_html(
            anova_one_way_describe_info(data, X, Y, alpha=alpha))
        res.append(data_info)
        # Normality test.
        if "normal" in analysis_options:
            normal_res = transform_table_data_to_html(
                normal_test(every_level_data_index, every_level_data, alpha),
                col0="因子水平")
            res.append(normal_res)
        # Homogeneity-of-variance test.
        if "variances" in analysis_options:
            equal_variances_res = transform_table_data_to_html(
                levene_test(*every_level_data, alpha=alpha))
            res.append(equal_variances_res)
        # Analysis of variance.
        anova_res = transform_table_data_to_html(
            anova_analysis(data, X[0], Y[0], alpha=alpha))
        res.append(anova_res)
        # Multiple comparison.
        if "multiple" in analysis_options:
            multiple_res = multiple_test(data, X, Y, alpha=alpha)
            res.append(multiple_res)
        response_data = {"res": res, "code": "200", "msg": "ok!"}
        return jsonify(response_data)
    except Exception as e:
        log.error(e)
        # An exception raised without arguments has empty `args`; fall back to
        # str(e) so the handler itself cannot fail with IndexError.
        msg = e.args[0] if e.args else str(e)
        return jsonify({"data": "", "code": "500", "msg": msg})
def _one_sample_chi_table(statistic, pvalue, row):
    """Render chi-square statistics and p-values as the one-sample chi-square HTML table."""
    d = pd.DataFrame([statistic, pvalue]).T
    d = d.astype(float)
    d = format_data_col(d)
    return transform_table_data_to_html({
        'title': '单样本卡方检验',
        'col': ['卡方', '显著性'],
        'row': row,
        'data': d.values.tolist()
    })


def result_one_sample_chi():
    """Run a one-sample chi-square test per requested column and return the result as JSON.

    Request parameters (obtained through ``init_route()``)::

        {
            "table_name": "",                        # str, database table name
            "X": ["x1", "x2"],                       # list, tested columns
            "E": ["e1", "e2"],                       # list, columns with expected frequencies
            "input_e": [2, 3, 4],                    # expected frequencies typed in by the user
            "button_type": ["select" | "input" | "null"]  # first entry picks the mode
        }

    Modes: "null" tests without expected frequencies, "select" takes them from
    the ``E`` columns, "input" takes them from ``input_e``. For any other
    mode, or when no column is returned, ``res`` is an empty list.

    :return: ``jsonify`` response ``{"code": "200", "msg": "ok!", "res": <html table>}``.
    :raises TypeError: if ``X`` is not a list.
    :raises Exception: every other error is logged and re-raised.
    """
    log.info('result_one_sample_chi_get_results_init...')
    request_data = init_route()
    try:
        table_name = request_data['table_name']
        X = request_data['X']
        E = request_data['E']
        input_e = request_data['input_e']
        button_type = request_data['button_type']
    except Exception as e:
        log.info(e)
        raise
    # The previous `assert isinstance([X], list)` tested a freshly built list
    # and could never fail; check X itself.
    if not isinstance(X, list):
        raise TypeError("X must be a list")
    results = []
    try:
        mode = button_type[0]
        if mode == 'null':
            da = exec_sql(table_name, X)
            da = da.astype(float)
            data = [da[i] for i in X]
            log.info("输入数据大小:{}".format(len(data)))
            if da.shape[1] == 1:
                statistic, pvalue = stats.power_divergence(da[X[0]], axis=0)
                results = _one_sample_chi_table(statistic, pvalue, X)
            elif da.shape[1] > 1:
                # One row per variable, so the test runs along axis 1.
                statistic, pvalue = stats.power_divergence(data, axis=1)
                results = _one_sample_chi_table(statistic, pvalue, X)
            log.info("无期望频率情况分析完成")
        elif mode == 'select':
            te = exec_sql(table_name, X)
            te = te.astype(float)
            test = [te[i] for i in X]
            ex = exec_sql(table_name, E)
            ex = ex.astype(float)
            expect = [ex[j] for j in E]
            log.info("输入数据大小:{}".format(len(test)))
            if te.shape[1] >= 1:
                # `test`/`expect` hold one row per variable, so the test runs
                # along axis 1 for any number of columns. The single-column
                # case used axis=0, which produced one statistic per
                # observation instead of one test for the column.
                statistic, pvalue = stats.power_divergence(test, expect, axis=1)
                results = _one_sample_chi_table(statistic, pvalue, X)
            log.info("有期望频率情况分析完成")
        elif mode == 'input':
            te = exec_sql(table_name, X)
            te = te.astype(float)
            test = [te[i] for i in X]
            expect = pd.DataFrame(input_e)
            expect = expect.astype(float)
            expect = expect.values.tolist()
            log.info("输入数据大小:{}".format(len(test)))
            if te.shape[1] == 1:
                # Flatten the typed-in frequencies into a single row matching
                # the single observed row. Before, a column-shaped `expect`
                # was broadcast against the observed row with axis=0, which
                # did not yield one test for the column.
                flat_expect = [value for values in expect for value in values]
                statistic, pvalue = stats.power_divergence(test, [flat_expect], axis=1)
                results = _one_sample_chi_table(statistic, pvalue, X)
            elif te.shape[1] > 1:
                # NOTE(review): `expect` is passed as built from input_e; this
                # presumably expects one list of frequencies per variable -
                # confirm the shape the front end sends.
                statistic, pvalue = stats.power_divergence(test, expect, axis=1)
                results = _one_sample_chi_table(statistic, pvalue, X)
            log.info("用户输入的期望频率情况分析完成")
        response_data = {"code": "200", "msg": "ok!", "res": results}
        return jsonify(response_data)
    except Exception as e:
        log.error(e)
        raise