Exemplo n.º 1
0
    def run(self, df, x, y, extra_args=None):
        """Analyse the relationship between two columns of ``df`` via ``core``.

        Both columns are re-indexed from zero. A column that ``isCategory``
        rejects is binned with ``pd.cut`` and a warning is recorded in the
        returned message dict. The result label used below suggests this is
        a chi-square analysis; the actual statistic is whatever ``core``
        computes with the externally defined ``method``.

        Args:
            df: Object indexable by ``x`` and ``y``; each lookup must return
                something offering ``reset_index`` (presumably a DataFrame
                yielding Series -- confirm against callers).
            x: Key of the column that is bound to ``tsy``.
            y: Key of the column that is bound to ``tsx``.
            extra_args: Optional dict of options. Only ``'bins'`` is read;
                when the dict or the key is missing (or ``None``) 10 is used.

        Returns:
            On a validation failure, a dict with ``'result'`` (an empty
            DataFrame) and ``'msg'``. On success, a 2-tuple: a dict with
            ``'tables'``, ``'conf'`` and ``'msg'``, followed by a list with
            one dict holding the raw result frame and its label.
            NOTE(review): the two return shapes differ; kept as-is because
            callers may depend on it.
        """
        # Build the options per call instead of sharing one mutable default.
        if extra_args is None:
            extra_args = {}

        # NOTE(review): ``x`` feeds ``tsy`` and ``y`` feeds ``tsx``; this
        # mapping is preserved from the original -- confirm it is intended.
        tsy = df[x].reset_index(drop=True)
        tsx = df[y].reset_index(drop=True)

        msg = {}

        xl = len(tsx)
        yl = len(tsy)
        if xl != yl:
            msg['error'] = '输入的tsx的长度为:%s 不等于输入的tsy的长度: %s !\n ' % (xl, yl)
            return {'result': pd.DataFrame(), 'msg': msg}

        # Fall back to 10 bins when the caller's dict omits the key; a None
        # here would break both pd.cut and the %d formatting below.
        bins = extra_args.get('bins')
        if bins is None:
            bins = 10
        self.bins = bins

        if not (isSeries(tsy) and isSeries(tsx)):
            msg['error'] = 'tsx或者tsy不是 pandas Series 数据类型!\n'
            return {'result': pd.DataFrame(), 'msg': msg}

        # Collect the warnings and join once; each entry ends with '\n'.
        notes = []
        if not isCategory(tsy):
            tsy = pd.cut(tsy, bins=self.bins)
            notes.append(
                '列tsy不是定类(category)数据, 将强制通过bins:%d为转化为定类型数据\n' % self.bins)

        if not isCategory(tsx):
            tsx = pd.cut(tsx, bins=self.bins)
            notes.append(
                '列tsx不是定类(category)数据, 将强制通过bins:%d为转化为定类型数据\n' % self.bins)

        if notes:
            msg['warning'] = ''.join(notes)

        # ``method`` is not defined in this block; it is resolved from the
        # enclosing scope.
        dfres, msg1 = core(tsx, tsy, method)

        # Keys reported by ``core`` override the ones collected here.
        msg = {**msg, **msg1}

        payload = {
            'tables': [{
                'table_json': dfres.T.reset_index().to_json(orient='index'),
                'table_html': dfres.to_html(),
                'table_info': '卡方检验分析结果',
                'chart': ['heatmap', 'line', 'bar'],
            }],
            'conf': self.get_info(),
            'msg': msg,
        }
        frames = [{'table_df': dfres, 'label': '卡方检验分析结果'}]
        return payload, frames
Exemplo n.º 2
0
    def run(self, tsx, tsy, bins=10):
        """Cross-tabulate ``tsx`` against ``tsy`` and run the test done by ``crosstab``.

        Both inputs are re-indexed from zero. An input that ``isCategory``
        rejects is binned with ``pd.cut`` and a warning is both logged and
        recorded in the returned message dict.

        Args:
            tsx: First series; must offer ``reset_index`` and pass ``isSeries``.
            tsy: Second series; same requirements, same length as ``tsx``.
            bins: ``bins`` argument handed to ``pd.cut`` for non-category
                inputs; also stored on ``self.bins``.

        Returns:
            On a validation failure, a dict with ``'result'`` (an empty
            DataFrame) and ``'msg'``. On success, a dict with ``'result'``,
            ``'table'`` and ``'msg'``, where the first two are the values
            returned by ``crosstab(..., prop='col', test='chi-square')``
            (presumably researchpy's ``crosstab`` -- confirm the import).
        """
        tsy = tsy.reset_index(drop=True)
        tsx = tsx.reset_index(drop=True)

        msg = {}

        xl = len(tsx)
        yl = len(tsy)
        if xl != yl:
            logging.error(
                'the length of input X:%s is not equal the length of Y: %s ! ',
                xl, yl)
            msg['error'] = '输入的tsx的长度为:%s 不等于输入的tsy的长度: %s ! ' % (xl, yl)
            return {'result': pd.DataFrame(), 'msg': msg}

        self.bins = bins
        if not (isSeries(tsy) and isSeries(tsx)):
            logging.error('X or y data are not a pandas Series type!')
            msg['error'] = 'tsx或者tsy不是 pandas Series 数据类型!'
            return {'result': pd.DataFrame(), 'msg': msg}

        # Collect the warnings and join once; they are concatenated without
        # a separator, as before.
        notes = []
        if not isCategory(tsy):
            tsy = pd.cut(tsy, bins=bins)
            logging.warning(
                'the Series tsy is not category type, will be convert to category type by bins of %d',
                bins)
            notes.append('列tsy不是定类(category)数据, 将强制通过bins:%d为转化为定类型数据' % bins)

        if not isCategory(tsx):
            tsx = pd.cut(tsx, bins=bins)
            logging.warning(
                'the Series tsx is not category type, will be convert to category type by bins of %d',
                bins)
            notes.append('列tsx不是定类(category)数据, 将强制通过bins:%d为转化为定类型数据' % bins)

        if notes:
            msg['warning'] = ''.join(notes)

        table, results = crosstab(tsx, tsy, prop='col', test='chi-square')

        return {'result': results, 'table': table, 'msg': msg}
    def run(self, dfx, tsy):
        """Summarise the numeric columns of ``dfx`` per group of ``tsy``.

        For every numeric column the per-group ``mean±std`` (rounded to two
        decimals) is reported, joined with the first row of the table that
        ``anova_lm`` returns for an ``ols`` fit of that column on ``tsy``
        (presumably statsmodels -- confirm the imports).

        Args:
            dfx: Frame of candidate columns; columns that ``ParseDFtypes``
                does not report as numeric are ignored with a warning.
            tsy: Grouping series; must pass ``isSeries`` and ``isCategory``
                and have as many rows as ``dfx``.

        Returns:
            Dict with ``'result'`` (the summary frame, or an empty DataFrame
            when validation fails) and ``'msg'`` (``'error'`` / ``'warning'``
            entries, possibly empty).
        """
        tsy = tsy.reset_index(drop=True)
        dfx = dfx.reset_index(drop=True)

        msg = {}

        n_rows, n_labels = len(dfx), len(tsy)
        if n_rows != n_labels:
            logging.error(
                'the length of input X:%s is not equal the length of Y: %s ! '
                % (n_rows, n_labels))
            msg['error'] = '输入的dfx的长度为:%s 不等于输入的tsy的长度: %s  ' % (n_rows, n_labels)
            return {'result': pd.DataFrame(), 'msg': msg}

        if not (isSeries(tsy) and isCategory(tsy)):
            logging.error(
                'input tsy is not a pandas Series or not a category data!')
            msg['error'] = '输入的tsy不是定类型数据或者Series类型'
            return {'result': pd.DataFrame(), 'msg': msg}

        x_numer_cols, x_cate_cols = ParseDFtypes(dfx)

        if x_numer_cols == []:
            logging.error(
                'All input dfx are no numeric columns, Please check your input dfx data!'
            )
            msg['error'] = '输入的dfx所有的列都不是数值型数据,请检查输入数据'
            return {'result': pd.DataFrame(), 'msg': msg}

        if x_cate_cols != []:
            logging.warning(
                'input dfx has non-numeric columns: %s, will ignore these columns!'
                % x_cate_cols)
            msg['warning'] = '输入的dfx包含了非数值型的列: %s, 将会被自动忽略!' % x_cate_cols

        group_key = tsy.name

        dfu = dfx[x_numer_cols].join(tsy)
        grouped = dfu.groupby(group_key)
        # Transposed so that rows are the numeric columns and columns the groups.
        means = grouped.mean().T
        stds = grouped.std().T

        def mean_pm_std(col):
            # ``col`` holds one group's means; pair each with the matching std.
            spread = stds[col.name]
            cells = [
                '%s±%s' % (round(col.loc[key], 2), round(spread.loc[key], 2))
                for key in col.index
            ]
            return pd.Series(cells, index=col.index)

        summary = means.apply(mean_pm_std)

        anova_rows = []
        for col in x_numer_cols:
            fitted = ols('%s ~ %s' % (col, group_key), dfu).fit()
            table = anova_lm(fitted)
            table.columns = ['自由度', '平方和', '均方和', 'F-值', 'p-值']
            # Only the first row of the table is kept, relabelled with the column name.
            anova_rows.append(table.iloc[0].to_frame(name=col).T)

        res = summary.join(pd.concat(anova_rows))
        res['p-值'] = res['p-值'].apply(lambda value: format(value, '.5f'))

        return {'result': res.round(5), 'msg': msg}
Exemplo n.º 4
0
    def run(self, dfx, tsy):
        """Summarise the numeric columns of ``dfx`` per group of ``tsy`` with a ``levene`` test.

        For every numeric column the per-group ``mean±std`` (rounded to two
        decimals) is reported, joined with the two values returned by
        ``levene`` over that column's per-group samples (presumably
        ``scipy.stats.levene`` -- confirm the import).

        Args:
            dfx: Frame of candidate columns; columns that ``ParseDFtypes``
                does not report as numeric are ignored with a warning.
            tsy: Grouping series; must pass ``isSeries`` and ``isCategory``
                and have as many rows as ``dfx``.

        Returns:
            Dict with ``'result'`` (the summary frame, or an empty DataFrame
            when validation fails) and ``'msg'`` (``'error'`` / ``'warning'``
            entries, possibly empty).
        """
        tsy = tsy.reset_index(drop=True)
        dfx = dfx.reset_index(drop=True)

        msg = {}

        def fail(text):
            # Log the problem, record it, and build the empty-result payload.
            logging.error(text)
            msg['error'] = text
            return {'result': pd.DataFrame(), 'msg': msg}

        n_rows, n_labels = len(dfx), len(tsy)
        if n_rows != n_labels:
            return fail(
                'the length of input X:%s is not equal the length of Y: %s ! '
                % (n_rows, n_labels))

        if not (isSeries(tsy) and isCategory(tsy)):
            return fail(
                'input tsy is not a pandas Series or not a category data!')

        x_numer_cols, x_cate_cols = ParseDFtypes(dfx)

        if x_numer_cols == []:
            return fail(
                'All input dfx are no numeric columns, Please check your input dfx data!'
            )

        if x_cate_cols != []:
            ignored = (
                'input dfx has non-numeric columns: %s, will ignore these columns!'
                % x_cate_cols)
            logging.warning(ignored)
            msg['warning'] = ignored

        group_key = tsy.name

        dfu = dfx[x_numer_cols].join(tsy)
        grouped = dfu.groupby(group_key)
        # Transposed so that rows are the numeric columns and columns the groups.
        means = grouped.mean().T
        stds = grouped.std().T

        def mean_pm_std(col):
            # ``col`` holds one group's means; pair each with the matching std.
            spread = stds[col.name]
            cells = [
                '%s±%s' % (round(col.loc[key], 2), round(spread.loc[key], 2))
                for key in col.index
            ]
            return pd.Series(cells, index=col.index)

        summary = means.apply(mean_pm_std)

        labels = ['F-值', 'p-值']
        # The distinct group values do not change between columns.
        levels = tsy.unique()
        test_rows = []
        for col in x_numer_cols:
            samples = [dfu[dfu[group_key] == level][col] for level in levels]
            stat, p_value = levene(*samples)
            test_rows.append(
                pd.DataFrame([stat, p_value], index=labels, columns=[col]).T)

        res = summary.join(pd.concat(test_rows))

        return {'result': res, 'msg': msg}
Exemplo n.º 5
0
    def run(self, dfx, tsy):
        """Summarise the numeric columns of ``dfx`` for the two groups of ``tsy``.

        For every numeric column the per-group ``mean±std`` (rounded to two
        decimals) is reported, joined with the two values returned by
        ``ttest_ind`` for the samples of the two groups (presumably
        ``scipy.stats.ttest_ind`` -- confirm the import).

        Args:
            dfx: Frame of candidate columns; columns that ``ParseDFtypes``
                does not report as numeric are ignored with a warning.
            tsy: Grouping series; must pass ``isSeries`` and ``isCategory``
                and contain exactly two distinct values.

        Returns:
            Dict with ``'result'`` (the summary frame rounded to five
            decimals, or an empty DataFrame when validation fails) and
            ``'msg'`` (``'error'`` / ``'warning'`` entries, possibly empty).
        """
        tsy = tsy.reset_index(drop=True)
        dfx = dfx.reset_index(drop=True)

        msg = {}

        if not (isSeries(tsy) and isCategory(tsy)):
            logging.error('input tsy is not a pandas Series or not a category data!')
            msg['error'] = '输入的tsy不是定类型数据或者Series类型'
            return {'result': pd.DataFrame(), 'msg': msg}

        # The distinct group values are needed for the check and for the split.
        levels = tsy.unique()
        if len(levels) != 2:
            msg['error'] = '输入的tsy不能被分成2组,请确保值tsy中的数unique后元素个数为2,目前的元素为%s' % levels
            return {'result': pd.DataFrame(), 'msg': msg}

        x_numer_cols, x_cate_cols = ParseDFtypes(dfx)

        if x_numer_cols == []:
            logging.error('All input dfx are no numeric columns, Please check your input dfx data!')
            msg['error'] = 'dfx输入的每列都不是数值型数据,请检查输入数据'
            return {'result': pd.DataFrame(), 'msg': msg}

        if x_cate_cols != []:
            logging.warning('input dfx has non-numeric columns: %s, will ignore these columns!' % x_cate_cols)
            msg['warning'] = '输入的dfx包含了非数值型的列: %s, 将会被自动忽略!' % x_cate_cols

        group_key = tsy.name

        dfu = dfx[x_numer_cols].join(tsy)
        grouped = dfu.groupby(group_key)
        # Transposed so that rows are the numeric columns and columns the groups.
        means = grouped.mean().T
        stds = grouped.std().T

        def mean_pm_std(col):
            # ``col`` holds one group's means; pair each with the matching std.
            spread = stds[col.name]
            cells = [
                '%s±%s' % (round(col.loc[key], 2), round(spread.loc[key], 2))
                for key in col.index
            ]
            return pd.Series(cells, index=col.index)

        summary = means.apply(mean_pm_std)

        labels = ['t-值', 'p-值']
        first_level = levels[0]
        second_level = levels[1]
        membership = dfu[group_key]

        test_rows = []
        for col in x_numer_cols:
            first_sample = dfu[membership == first_level][col]
            second_sample = dfu[membership == second_level][col]
            t_stat, p_value = ttest_ind(first_sample, second_sample)
            test_rows.append(
                pd.DataFrame([t_stat, p_value], index=labels, columns=[col]).T)

        res = summary.join(pd.concat(test_rows))

        return {'result': res.round(5), 'msg': msg}