Example #1
0
def get_anova_boxplot_data(project_id, dataset_id, df, independent_variables_names, dependent_variables_names, NUM_GROUPS_CUTOFF=15):
    anova_result = {}
    considered_independent_variable_name = independent_variables_names[0]
    considered_dependent_variable_name = dependent_variables_names[0]

    # Only return boxplot data if the number of groups is at most NUM_GROUPS_CUTOFF
    num_groups = len(get_unique(df[considered_independent_variable_name]))
    if num_groups > NUM_GROUPS_CUTOFF:
        return None


    val_box_spec = {
        'grouped_field': { 'name': considered_independent_variable_name },
        'boxed_field': { 'name': considered_dependent_variable_name }
    }

    viz_data = get_val_box_data(df, val_box_spec)

    result = {
        'project_id': project_id,
        'dataset_id': dataset_id,
        'spec': val_box_spec,
        'meta': {
            'labels': {
                'x': considered_independent_variable_name,
                'y': considered_dependent_variable_name
            },
        },
        'data': viz_data
    }

    return result
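Example #1's box-plot payload is assembled by get_val_box_data, which is not shown here. As a rough, standalone sketch of the kind of per-group summary such a payload needs (five-number statistics per group), using plain pandas and made-up column names:

import pandas as pd

# Toy data with hypothetical column names; the real function reads these from the dataset.
df = pd.DataFrame({
    'group': ['a', 'a', 'a', 'b', 'b', 'b'],
    'value': [1.0, 2.0, 3.0, 10.0, 12.0, 14.0],
})

# Per-group five-number summary, the core of box plot data.
box_stats = df.groupby('group')['value'].describe()[['min', '25%', '50%', '75%', 'max']]
print(box_stats)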
Example #2
0
def compute_single_field_property_nontype(field_name, field_values, field_type, general_type, df=None, temporal_fields=[]):
    temporal = (len(temporal_fields) > 0)

    field_values_no_na = field_values.dropna(how='any')
    all_null = (len(field_values_no_na) == 0)
    num_na = len(field_values) - len(field_values_no_na)
    is_unique = detect_unique_list(field_values_no_na) if not temporal else get_temporal_uniqueness(field_name, field_type, general_type, df, temporal_fields)

    is_id = detect_id(field_name, field_type, is_unique)

    stats, contiguous, scale, viz_data, normality, unique_values = [ None ]*6

    if not all_null:
        stats = calculate_field_stats(field_type, general_type, field_values)
        contiguous = get_contiguity(field_name, field_values, field_values_no_na, field_type, general_type)
        scale = get_scale(field_name, field_values, field_type, general_type, contiguous)
        viz_data = get_field_distribution_viz_data(field_name, field_values, field_type, general_type, scale, is_id, contiguous)
        normality = get_normality(field_name, field_values, field_type, general_type, scale)

        unique_values = [ e for e in get_unique(field_values_no_na) if not pd.isnull(e) ] if (scale in [ Scale.NOMINAL.value, Scale.ORDINAL.value ] and not is_unique) else None

    return {
        'scale': scale,  # Recompute if contiguous
        'contiguous': contiguous,
        'viz_data': viz_data,
        'is_id': is_id,
        'stats': stats,
        'num_na': num_na,
        'normality': normality,
        'is_unique': is_unique,
        'unique_values': unique_values,
        'manual': {}
    }
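Several of the per-field properties above reduce to one-line pandas operations; the project-specific helpers (detect_unique_list, get_unique, calculate_field_stats, and so on) are not shown. A minimal standalone sketch of the null-count and uniqueness checks:

import pandas as pd

field_values = pd.Series([1, 2, 2, None, 4])

field_values_no_na = field_values.dropna()
num_na = len(field_values) - len(field_values_no_na)  # 1 missing value
all_null = field_values_no_na.empty                    # False
is_unique = field_values_no_na.is_unique               # False, the value 2 repeats

print(num_na, all_null, is_unique)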
Example #3
0
def run_aggregation_from_spec(spec, project_id, config={}, conditionals=[]):
    aggregation_variables_names = spec.get('aggregationVariablesNames')
    dataset_id = spec.get('datasetId')
    dependent_variable_name = spec.get('dependentVariableName')
    weight_variable_name = config.get('weightVariableName')
    if not (dataset_id and aggregation_variables_names):
        return 'Not passed required parameters', 400

    num_variables = len(aggregation_variables_names)

    all_field_properties = db_access.get_field_properties(project_id, dataset_id)
    aggregation_variables = [ next((fp for fp in all_field_properties if fp['name'] == n), None) for n in aggregation_variables_names ]
    dependent_variable = next((fp for fp in all_field_properties if fp['name'] == dependent_variable_name), None)

    subset_variables = list(aggregation_variables_names)  # copy to avoid mutating the spec's list below
    if dependent_variable_name and dependent_variable_name != 'count':
        subset_variables += [ dependent_variable_name ]
    if weight_variable_name and weight_variable_name != 'UNIFORM':
        subset_variables += [ weight_variable_name ]
    subset_variables = get_unique(subset_variables, preserve_order=True)

    df = get_data(project_id=project_id, dataset_id=dataset_id)
    df_conditioned = get_conditioned_data(project_id, dataset_id, df, conditionals)
    df_subset = df_conditioned[ subset_variables ]
    df_ready = df_subset.dropna(how='all')  # Drop rows that are null in every subset column

    result = {}
    if num_variables == 1:
        result['one_dimensional_contingency_table'] = create_one_dimensional_contingency_table(df_ready, aggregation_variables[0], dependent_variable, config=config)
    elif num_variables == 2:
        result['two_dimensional_contingency_table'] = create_contingency_table(df_ready, aggregation_variables, dependent_variable, config=config)

    return result, 200
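The data-preparation step in the middle of run_aggregation_from_spec (subset the relevant columns, then drop rows that are empty across all of them) is plain pandas; a standalone sketch with made-up column names:

import pandas as pd

df_conditioned = pd.DataFrame({
    'region': ['north', 'south', None, 'east'],
    'sales':  [10.0,    None,    None, 7.0],
    'other':  [1, 2, 3, 4],
})

subset_variables = ['region', 'sales']
df_subset = df_conditioned[subset_variables]
df_ready = df_subset.dropna(how='all')  # keeps rows with at least one non-null subset value

print(df_ready)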
Example #4
0
def compute_single_field_property_nontype(field_name, field_values, field_type, general_type, df=None, temporal_fields=[]):
    temporal = (len(temporal_fields) > 0)

    field_values_no_na = field_values.dropna(how='any')
    all_null = (len(field_values_no_na) == 0)
    num_na = len(field_values) - len(field_values_no_na)
    is_unique = detect_unique_list(field_values_no_na) if not temporal else get_temporal_uniqueness(field_name, field_type, general_type, df, temporal_fields)

    is_id = detect_id(field_name, field_type, is_unique)

    stats, contiguous, scale, viz_data, normality, unique_values = [ None ]*6

    if not all_null:
        stats = calculate_field_stats(field_type, general_type, field_values)
        contiguous = get_contiguity(field_name, field_values, field_values_no_na, field_type, general_type)
        scale = get_scale(field_name, field_values, field_type, general_type, contiguous)
        viz_data = get_field_distribution_viz_data(field_name, field_values, field_type, general_type, scale, is_id, contiguous)
        normality = get_normality(field_name, field_values, field_type, general_type, scale)

        unique_values = [ e for e in get_unique(field_values_no_na) if not pd.isnull(e) ] if (scale in [ Scale.NOMINAL.value, Scale.ORDINAL.value ] and not is_unique) else None

    return {
        'scale': scale,  # Recompute if contiguous
        'contiguous': contiguous,
        'viz_data': viz_data,
        'is_id': is_id,
        'stats': stats,
        'num_na': num_na,
        'normality': normality,
        'is_unique': is_unique,
        'unique_values': unique_values,
        'manual': {}
    }
Example #5
0
def get_temporal_uniqueness(field_name,
                            field_type,
                            general_type,
                            df,
                            temporal_fields,
                            MAX_TIMES_TO_SAMPLE=5):
    is_unique_by_time = False

    if temporal_fields and (df is not None) and (
            general_type == GDT.C.value or field_type == DT.INTEGER.value):
        uniqueness_by_time_fields = []
        for temporal_field in temporal_fields:
            temporal_field_name = temporal_field['name']
            df_column_subset = df[[field_name, temporal_field_name]]
            unique_times = get_unique(df_column_subset[temporal_field_name])
            final_df = df
            if len(unique_times) > MAX_TIMES_TO_SAMPLE:
                unique_times = unique_times[:MAX_TIMES_TO_SAMPLE]
                final_df = df_column_subset[
                    df_column_subset[temporal_field_name].isin(unique_times)]

            unique_by_time_field = all(
                final_df.groupby([temporal_field_name])[field_name].apply(detect_unique_list)
            )
            uniqueness_by_time_fields.append(unique_by_time_field)
        is_unique_by_time = any(uniqueness_by_time_fields)
    return is_unique_by_time
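The heart of the check is a per-timestamp groupby: the field counts as unique over time if its values never repeat within any sampled timestamp of some temporal column. A standalone pandas sketch of that idea, using nunique/size in place of the project's detect_unique_list helper:

import pandas as pd

df = pd.DataFrame({
    'sensor_id': [1, 2, 3, 1, 2, 3],
    'timestamp': ['t0', 't0', 't0', 't1', 't1', 't1'],
})

# Within each timestamp, is every sensor_id distinct?
per_time = df.groupby('timestamp')['sensor_id']
is_unique_by_time = bool((per_time.nunique() == per_time.size()).all())
print(is_unique_by_time)  # True: ids repeat across time, but never within one timestamp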
Example #6
0
def create_contingency_table(df, aggregation_variables, dep_variable, config={}):
    results_dict = {}
    formatted_results_dict = {}
    unique_indep_values = []
    bin_data = {}
    aggregation_mean = False

    for i, variable in enumerate(aggregation_variables):
        binningConfigKey = 'binningConfigX' if (i == 0) else 'binningConfigY'
        name = variable['name']
        general_type = variable['general_type']
        scale = variable['scale'] 

        if scale in [ Scale.NOMINAL.value, Scale.ORDINAL.value ]:
            unique_indep_values.append(get_unique(df[name], True))

        elif scale in [ Scale.CONTINUOUS.value ]:
            values = df[name].dropna(how='any')
            (binning_edges, bin_names) = get_binning_edges_and_names(values, config[binningConfigKey])
            num_bins = len(binning_edges) - 1
            unique_indep_values.append(bin_names)
            bin_data[name] = {
                'num_bins': num_bins,
                'binning_edges': binning_edges,
                'bin_names': bin_names
            }            

    if dep_variable:
        (results_dict, aggregation_mean) = create_contingency_table_with_dependent_variable(df, aggregation_variables, dep_variable, unique_indep_values, config=config, bin_data=bin_data)
    else:
        results_dict = create_contingency_table_with_no_dependent_variable(df, aggregation_variables, unique_indep_values, config=config, bin_data=bin_data)

    if not aggregation_mean:
        formatted_results_dict["column_headers"] = unique_indep_values[0] + ['Row Totals']
        column_totals = np.zeros(len(unique_indep_values[0]) + 1)
    else:
        formatted_results_dict['column_headers'] = unique_indep_values[0]

    formatted_results_dict["row_headers"] = unique_indep_values[1]
    formatted_results_dict["rows"] = []

    for row in unique_indep_values[1]:
        values = [ results_dict[row][col] for col in unique_indep_values[0] ]

        if not aggregation_mean:
            values.append(sum(values))
            column_totals += values

        formatted_results_dict["rows"].append({
            "field": row,
            "values": values
        })

    if not aggregation_mean:
        formatted_results_dict['column_totals'] = list(column_totals)
    return formatted_results_dict
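For orientation, the pure counting case of this table (no dependent variable, no custom binning) is close to a pandas crosstab with margins; this is a standalone sketch, not the project's own create_contingency_table_* helpers:

import pandas as pd

df = pd.DataFrame({
    'color': ['red', 'red', 'blue', 'blue', 'blue'],
    'shape': ['circle', 'square', 'circle', 'circle', 'square'],
})

# Two-dimensional contingency table of counts, with row and column totals.
table = pd.crosstab(df['shape'], df['color'], margins=True, margins_name='Totals')
print(table)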
Example #7
0
def create_contingency_table(df, aggregation_variables, dep_variable, config={}):
    results_dict = {}
    formatted_results_dict = {}
    unique_indep_values = []
    bin_data = {}
    aggregation_mean = False

    for i, variable in enumerate(aggregation_variables):
        binningConfigKey = 'binningConfigX' if (i == 0) else 'binningConfigY'
        name = variable['name']
        general_type = variable['general_type']
        scale = variable['scale'] 

        if scale in [ Scale.NOMINAL.value, Scale.ORDINAL.value ]:
            unique_indep_values.append(get_unique(df[name], True))

        elif scale in [ Scale.CONTINUOUS.value ]:
            values = df[name].dropna(how='any')
            (binning_edges, bin_names) = get_binning_edges_and_names(values, config[binningConfigKey])
            num_bins = len(binning_edges) - 1
            unique_indep_values.append(bin_names)
            bin_data[name] = {
                'num_bins': num_bins,
                'binning_edges': binning_edges,
                'bin_names': bin_names
            }            

    if dep_variable:
        (results_dict, aggregation_mean) = create_contingency_table_with_dependent_variable(df, aggregation_variables, dep_variable, unique_indep_values, config=config, bin_data=bin_data)
    else:
        results_dict = create_contingency_table_with_no_dependent_variable(df, aggregation_variables, unique_indep_values, config=config, bin_data=bin_data)

    if not aggregation_mean:
        formatted_results_dict["column_headers"] = unique_indep_values[0] + ['Row Totals']
        column_totals = np.zeros(len(unique_indep_values[0]) + 1)
    else:
        formatted_results_dict['column_headers'] = unique_indep_values[0]

    formatted_results_dict["row_headers"] = unique_indep_values[1]
    formatted_results_dict["rows"] = []

    for row in unique_indep_values[1]:
        values = [ results_dict[row][col] for col in unique_indep_values[0] ]

        if not aggregation_mean:
            values.append(sum(values))
            column_totals += values

        formatted_results_dict["rows"].append({
            "field": row,
            "values": values
        })

    if not aggregation_mean:
        formatted_results_dict['column_totals'] = list(column_totals)
    return formatted_results_dict
Example #8
def get_pairwise_comparison_data(df, independent_variables_names, dependent_variables_names, significance_cutoff=0.05, NUM_GROUPS_CUTOFF=15):
    '''
        df - data to compare
        independent_variables_names - list of names; the first (categorical) one defines the groups
        dependent_variables_names - list of names; the first (numerical) one is compared across groups
        significance_cutoff - alpha used for Tukey's HSD and the p-value column
        NUM_GROUPS_CUTOFF - skip the comparison if the grouping variable has more groups than this
    '''
    considered_independent_variable_name = independent_variables_names[0]
    considered_dependent_variable_name = dependent_variables_names[0]

    # Only return pairwise comparison data if the number of groups is at most NUM_GROUPS_CUTOFF
    num_groups = len(get_unique(df[considered_independent_variable_name]))
    if num_groups > NUM_GROUPS_CUTOFF:
        return None

    hsd_result = pairwise_tukeyhsd(df[considered_dependent_variable_name], df[considered_independent_variable_name], alpha=significance_cutoff)
    hsd_raw_data = hsd_result.summary().data[1:]
    st_range = np.abs(hsd_result.meandiffs) / hsd_result.std_pairs
    p_values = psturng(st_range, len(hsd_result.groupsunique), hsd_result.df_total)

    hsd_headers = [
        'Group 1',
        'Group 2',
        'Group Mean Difference (2 - 1)',
        'Lower Bound',
        'Upper Bound',
        'p-value',
        'Distinct (p < %s)' % significance_cutoff
    ]
    hsd_data = []
    for i in range(0, len(hsd_raw_data)):
        if isinstance(p_values, float):
            p_value = p_values
        else:
            p_value = p_values[i] if i < len(p_values) else None
        hsd_data_row = [
            hsd_raw_data[i][0],
            hsd_raw_data[i][1],
            hsd_result.meandiffs[i],
            hsd_result.confint[i][0],
            hsd_result.confint[i][1],
            p_value,
            ( 'True' if (p_value is not None and p_value <= significance_cutoff) else 'False' )
        ]
        hsd_data.append(hsd_data_row)

    return {
        'column_headers': hsd_headers,
        'rows': hsd_data
    }
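The statsmodels calls used above can be exercised on their own. A small runnable sketch with made-up groups, showing where meandiffs, confint, std_pairs, and the psturng-derived p-values come from:

import numpy as np
from statsmodels.stats.libqsturng import psturng
from statsmodels.stats.multicomp import pairwise_tukeyhsd

values = np.array([1.0, 1.1, 0.9, 3.0, 3.2, 2.9, 5.1, 4.9, 5.0])
groups = np.array(['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'c'])

hsd_result = pairwise_tukeyhsd(values, groups, alpha=0.05)

# Studentized range statistic and its p-values, as computed in the function above.
st_range = np.abs(hsd_result.meandiffs) / hsd_result.std_pairs
p_values = psturng(st_range, len(hsd_result.groupsunique), hsd_result.df_total)

print(hsd_result.summary())
print(p_values)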
Example #9
0
def create_one_dimensional_contingency_table(df, aggregation_variable, dep_variable, config={}):
    results_dict = {}
    formatted_results_dict = {}
    unique_indep_values = []

    aggregation_mean = False

    general_type = aggregation_variable['general_type']
    scale = aggregation_variable['scale']    
    name = aggregation_variable['name']
    bin_data = {}

    if scale in [ Scale.ORDINAL.value, Scale.NOMINAL.value ]:
        unique_indep_values = get_unique(df[name], True)

    elif scale in [ Scale.CONTINUOUS.value ]:
        values = df[name].dropna(how='any')
        (binning_edges, bin_names) = get_binning_edges_and_names(values, config.get('binningConfigX'))  # TODO Update binning function
        num_bins = len(binning_edges) - 1
        bin_data = {
            'num_bins': num_bins,
            'binning_edges': binning_edges,
            'bin_names': bin_names
        }
        unique_indep_values = bin_names

    if dep_variable:
        (results_dict, aggregation_mean) = create_one_dimensional_contingency_table_with_dependent_variable(df, aggregation_variable, dep_variable, unique_indep_values, config=config, bin_data=bin_data)
    else:
        results_dict = create_one_dimensional_contingency_table_with_no_dependent_variable(df, aggregation_variable, unique_indep_values, config=config, bin_data=bin_data)

    formatted_results_dict["column_headers"] = ["VARIABLE", "AGGREGATION"]
    formatted_results_dict["row_headers"] = unique_indep_values
    formatted_results_dict["rows"] = []

    if not aggregation_mean:
        formatted_results_dict['column_total'] = 0

    for var in unique_indep_values:
        value = results_dict[var]

        if not aggregation_mean:
            formatted_results_dict['column_total'] += value

        formatted_results_dict["rows"].append({
            "field": var,
            "value": value
        })

    return formatted_results_dict
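For the counting case with a nominal variable, the formatted rows above are essentially a value_counts plus a grand total; a standalone pandas sketch:

import pandas as pd

df = pd.DataFrame({'color': ['red', 'red', 'blue', 'green', 'blue']})

counts = df['color'].value_counts()
rows = [{'field': name, 'value': int(count)} for name, count in counts.items()]
column_total = int(counts.sum())

print(rows)          # [{'field': 'red', 'value': 2}, {'field': 'blue', 'value': 2}, ...]
print(column_total)  # 5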
Example #10
def ttest(df, fields, indep, dep):
    # Only the first independent and dependent field names are considered
    dep_field_name = dep[0]
    indep_field_name = indep[0]
    unique_indep_values = get_unique(df[indep_field_name])

    subsets = {}
    for v in unique_indep_values:
        subsets[v] = np.array(df[df[indep_field_name] == v][dep_field_name])

    result = {}
    for (x, y) in combinations(unique_indep_values, 2):
        (statistic, pvalue) = ttest_ind(subsets[x], subsets[y])
        result[str([x, y])] = {'statistic': statistic, 'pvalue': pvalue}

    return result
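The same pairwise pattern can be run directly on top of scipy and itertools; a self-contained sketch with toy data:

from itertools import combinations

import pandas as pd
from scipy.stats import ttest_ind

df = pd.DataFrame({
    'group': ['a'] * 4 + ['b'] * 4 + ['c'] * 4,
    'score': [1.0, 1.2, 0.8, 1.1, 2.0, 2.1, 1.9, 2.2, 3.0, 2.8, 3.1, 3.2],
})

subsets = {g: df.loc[df['group'] == g, 'score'].to_numpy() for g in df['group'].unique()}

# Independent two-sample t-test for every pair of groups.
for x, y in combinations(subsets, 2):
    statistic, pvalue = ttest_ind(subsets[x], subsets[y])
    print((x, y), statistic, pvalue)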
Example #11
0
def get_temporal_uniqueness(field_name, field_type, general_type, df, temporal_fields, MAX_TIMES_TO_SAMPLE=5):
    is_unique_by_time = False

    if temporal_fields and (df is not None) and (general_type == GDT.C.value or field_type == DT.INTEGER.value):
        uniqueness_by_time_fields = []
        for temporal_field in temporal_fields:
            temporal_field_name = temporal_field['name']
            df_column_subset = df[[field_name, temporal_field_name]]
            unique_times = get_unique(df_column_subset[temporal_field_name])
            final_df = df
            if len(unique_times) > MAX_TIMES_TO_SAMPLE:
                unique_times = unique_times[:MAX_TIMES_TO_SAMPLE]
                final_df = df_column_subset[df_column_subset[temporal_field_name].isin(unique_times)]

            unique_by_time_field = all(final_df.groupby([temporal_field_name])[field_name].apply(detect_unique_list))
            uniqueness_by_time_fields.append(unique_by_time_field)
        is_unique_by_time = any(uniqueness_by_time_fields)
    return is_unique_by_time
Example #12
0
def return_data_list_categorical(data_column, variable_name):
    '''
    helper function to return visualization data in the right format for categorical variables
    data_column: represents the array of data
    variable_name: represents the name of the variable that is being visualized
    '''
    unique_elements = get_unique(data_column)

    count_dict = {}
    data_array = []

    data_array.append([variable_name, 'count'])

    for ele in data_column:
        if count_dict.get(ele):
            count_dict[ele] += 1
        else:
            count_dict[ele] = 1

    for name in unique_elements:
        data_array.append([name, count_dict[name]])

    return data_array
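collections.Counter gives the same counts in one step; a small sketch of the equivalent output format, with a made-up variable name:

from collections import Counter

data_column = ['cat', 'dog', 'cat', 'bird', 'dog', 'cat']
variable_name = 'species'  # hypothetical name, for illustration only

counts = Counter(data_column)
data_array = [[variable_name, 'count']] + [[name, count] for name, count in counts.items()]
print(data_array)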
Example #13
0
def return_data_list_categorical(data_column, variable_name):
    '''
    helper function to return visualization data in the right format for categorical variables
    data_column: represents the array of data
    variable_name: represents the name of the variable that is being visualized
    '''
    unique_elements = get_unique(data_column)

    count_dict = {}
    data_array = []

    data_array.append([variable_name, 'count'])

    for ele in data_column:
        if count_dict.get(ele):
            count_dict[ele] += 1
        else:
            count_dict[ele] = 1

    for name in unique_elements:
        data_array.append([name, count_dict[name]])

    return data_array