def statistic_derivation(table, group_by=None, **params):
    """Validate the parameters and run ``_statistic_derivation`` on ``table``.

    Args:
        table: Input table forwarded to ``_statistic_derivation``.
        group_by: Optional grouping specification. When it is not ``None``
            the computation is delegated to ``_function_by_group``.
        **params: Options for ``_statistic_derivation``; missing entries are
            filled in by ``get_default_from_parameters_if_required``.

    Returns:
        Whatever ``_statistic_derivation`` (ungrouped) or
        ``_function_by_group`` (grouped) returns.
    """
    check_required_parameters(_statistic_derivation, params, ['table'])
    params = get_default_from_parameters_if_required(
        params, _statistic_derivation)

    # Presumably range checks: percentile amounts between 0 and 100, trimmed
    # mean amounts from 0 up to (but below) 0.5 -- confirm against the
    # validation helpers.
    validate(
        all_elements_from_to(params, 0, 100, 'percentile_amounts'),
        all_elements_from_under(params, 0, 0.5, 'trimmed_mean_amounts'),
    )

    if group_by is None:
        return _statistic_derivation(table, **params)
    return _function_by_group(
        _statistic_derivation, table, group_by=group_by, **params)
def statistic_summary(table, group_by=None, **params):
    """Validate the parameters and build a statistics summary of ``table``.

    Args:
        table: Input DataFrame. ``params['input_cols']`` must name columns
            of it.
        group_by: Optional grouping specification. When it is not ``None``
            the computation is delegated to ``_function_by_group2``.
        **params: Options for ``_statistic_summary``; missing entries are
            filled in by ``get_default_from_parameters_if_required``.

    Returns:
        The grouped result of ``_function_by_group2``, or the result dict of
        ``_statistic_summary`` with its ``'out_table'`` entry wrapped in a
        DataFrame that carries the output column names.
    """
    check_required_parameters(_statistic_summary, params, ['table'])
    params = get_default_from_parameters_if_required(
        params, _statistic_summary)

    # Presumably range checks on the amount lists -- confirm against the
    # validation helpers.
    validate(
        all_elements_from_to(params, 0, 100, 'percentile_amounts'),
        all_elements_from_under(params, 0, 0.5, 'trimmed_mean_amounts'),
    )

    # Positional indices of the input columns inside the table.
    params['column_indices'] = [
        table.columns.get_loc(name) for name in params['input_cols']
    ]

    # Output header: 'nrow' is reported under the name 'num_of_row'.
    header = ['column_name'] + [
        'num_of_row' if stat == 'nrow' else stat
        for stat in params['statistics'].copy()
    ]

    # 'percentile' and 'trimmed_mean' are replaced by one column per
    # distinct requested amount, appended at the end of the header.
    if 'percentile' in header:
        header.remove('percentile')
        percentile_amounts = params['percentile_amounts']
        if percentile_amounts is not None:
            header.extend(
                'percentile_{}'.format(_amounts_colname(amount))
                for amount in _unique_list(percentile_amounts))
    if 'trimmed_mean' in header:
        header.remove('trimmed_mean')
        trimmed_amounts = params['trimmed_mean_amounts']
        if trimmed_amounts is not None:
            header.extend(
                'trimmed_mean_{}'.format(_amounts_colname(amount))
                for amount in _unique_list(trimmed_amounts))

    # 'mode' is always moved to the last position.
    if 'mode' in header:
        header.remove('mode')
        header.append('mode')

    if group_by is None:
        # The ungrouped implementation is called without 'workers'.
        params.pop('workers', None)
        summary = _statistic_summary(table.values, **params)
        summary['out_table'] = pd.DataFrame(
            summary['out_table'], columns=header)
        return summary

    return _function_by_group2(
        _statistic_summary, table, columns=header, group_by=group_by,
        **params)
def statistic_summary(table, group_by=None, **params):
    """Validate the parameters and build a (possibly grouped) summary table.

    Without ``group_by`` the call is forwarded to
    ``_statistic_summary_original``. With ``group_by`` the requested
    statistics are split in two sets: those listed in ``groupby_stats`` are
    computed by ``_statistic_summary_groupby``; the remaining ones are
    computed through ``_function_by_group2`` with
    ``_statistic_summary_list``. Both partial results are joined side by
    side and reordered to follow the order of ``params['statistics']``.

    Args:
        table: Input DataFrame. It is not modified; when the grouping
            columns contain nulls a shallow copy receives the helper
            columns instead.
        group_by: ``None`` or a list of grouping column names.
        **params: Summary options (``input_cols``, ``statistics``,
            ``percentile_amounts``, ``trimmed_mean_amounts``, ...); missing
            entries are filled in by
            ``get_default_from_parameters_if_required``.

    Returns:
        The result of ``_statistic_summary_original`` when ``group_by`` is
        ``None``; otherwise ``{'out_table': DataFrame}`` with the grouping
        columns, ``column_name`` and one column per statistic.
    """
    check_required_parameters(_statistic_summary_list, params, ['table'])
    params = get_default_from_parameters_if_required(
        params, _statistic_summary_original)

    # Presumably range checks on the amount lists -- confirm against the
    # validation helpers.
    validate(
        all_elements_from_to(params, 0, 100, 'percentile_amounts'),
        all_elements_from_under(params, 0, 0.5, 'trimmed_mean_amounts'),
    )

    if group_by is None:
        return _statistic_summary_original(table, **params)

    if pd.isnull(table[group_by]).values.any():
        # Null group keys are replaced by the '\u0003' marker in helper
        # columns whose names end with the same marker; the marker is
        # turned back into NaN / stripped from the names before returning.
        marked_group_by = [str(name) + '\u0003' for name in group_by]
        # Shallow copy: the helper columns must not leak into the caller's
        # DataFrame. Adding columns to the copy leaves the original as is.
        table = table.copy(deep=False)
        table[marked_group_by] = table[group_by].fillna('\u0003')
        group_by = marked_group_by

    key_cols = group_by + ['column_name']

    # Statistics handled by _statistic_summary_groupby; everything else
    # goes through _function_by_group2.
    groupby_stats = (
        'max', 'min', 'range', 'sum', 'avg', 'variance', 'stddev', 'nrow',
        'num_of_value', 'null_count', 'median',
    )
    params1 = {'input_cols': params['input_cols'], 'statistics': []}
    params2 = {'statistics': []}
    for stat in params['statistics']:
        if stat in groupby_stats:
            params1['statistics'].append(stat)
        else:
            params2['statistics'].append(stat)

    # Only these options are forwarded to the second computation.
    for key in ('percentile', 'trimmed_mean',
                'percentile_amounts', 'trimmed_mean_amounts'):
        if key in params:
            params2[key] = params[key]

    if params1['statistics']:
        result1 = _statistic_summary_groupby(
            table, group_by=group_by, **params1)
        # Index by the key columns so that concat aligns rows on them.
        result1.index = result1[key_cols]
    else:
        result1 = None

    if params2['statistics']:
        params2['input_cols'] = params['input_cols']
        params2['column_indices'] = [
            table.columns.get_loc(name) for name in params2['input_cols']
        ]
        # No 'nrow' -> 'num_of_row' renaming is needed here: 'nrow' is
        # always routed to params1 above.
        columns = _expand_summary_columns(params2)
        result2 = _function_by_group2(
            _statistic_summary_list, table, columns=columns,
            group_by=group_by, **params2)['out_table']
        result2.index = result2[key_cols]
        if result1 is not None:
            # Keep the key columns only once (from result1).
            result2 = result2[
                [col for col in result2.columns if col not in key_cols]]
            # TODO: pass sort=False to pd.concat after upgrading pandas.
            result = pd.concat(
                [result2, result1], axis=1).reset_index(drop=True)
        else:
            result = result2
    else:
        # Build the (group, column_name) skeleton that result1 is attached
        # to.
        groups = table[group_by].drop_duplicates().values
        rows = [
            list(group) + [col]
            for group in groups
            for col in params['input_cols']
        ]
        result2 = pd.DataFrame(rows, columns=key_cols)
        result2.index = result2[key_cols]
        if result1 is None:
            # No statistics requested at all: return the bare skeleton
            # instead of failing on ``None[...]``.
            result = result2.reset_index(drop=True)
        else:
            result1 = result1[
                [col for col in result1.columns if col not in key_cols]]
            # TODO: pass sort=False to pd.concat after upgrading pandas.
            result = pd.concat(
                [result2, result1], axis=1).reset_index(drop=True)

    # Reorder the columns to follow the requested statistics order.
    result = result[group_by + _expand_summary_columns(params)]

    if '\u0003' in result.values:
        result = result.replace('\u0003', np.nan)
        result.columns = [_remove_unicode(col) for col in result.columns]
    return {'out_table': result}


def _expand_summary_columns(stat_params):
    """Return the output column names for ``stat_params['statistics']``.

    Starts with ``'column_name'`` followed by the statistics in order.
    ``'percentile'`` and ``'trimmed_mean'`` are removed and replaced by one
    column per distinct amount, appended at the end; ``'mode'`` is moved to
    the last position.
    """
    columns = ['column_name'] + list(stat_params['statistics'])
    for stat, amounts_key in (('percentile', 'percentile_amounts'),
                              ('trimmed_mean', 'trimmed_mean_amounts')):
        if stat in columns:
            columns.remove(stat)
            amounts = stat_params[amounts_key]
            if amounts is not None:
                columns.extend(
                    '{}_{}'.format(stat, _amounts_colname(amount))
                    for amount in _unique_list(amounts))
    if 'mode' in columns:
        columns.remove('mode')
        columns.append('mode')
    return columns