예제 #1
0
def test_6():
    """Test for oversight: Attribution With Hidden Negative.

    Checks the proportion of the sum of reading score for each
    race/ethnicity group and expects the hidden-negative suggestion.
    Dataset used : https://www.kaggle.com/spscientist/students-performance-in-exams
    """
    data = pandas.read_csv(
        'data/data_for_test_aspects/student_performance_updated_to_create_attribution_with_hidden_negative_oversight.csv'
    )

    output = aspects.group_by(data, ['race/ethnicity'],
                              enums.SummaryOperators.PROPORTION_OF_SUM)

    proportions = aspects.crop_other_columns(
        output['table'], ['race/ethnicity', 'reading score'])
    suggestions = output['suggestions']

    print(proportions)

    expected_result_table = """  race/ethnicity  reading score
0        group A       0.083434
1        group B       0.185493
2        group C       0.316920
3        group D       0.265955
4        group E       0.148198"""

    expected_suggestions = "[{'suggestion': 'There exists negative values among the values on which proportion is being applied', 'oversight': <Oversights.ATTRIBUTION_WITH_HIDDEN_NEGATIVES: 11>, 'is_row_level_suggestion': True, 'confidence_score': 1, 'row_list': [{'row': 14, 'confidence_score': 1}]}]"

    assert expected_result_table == proportions.to_string()
    assert str(suggestions) == expected_suggestions
예제 #2
0
def test_4():
    """ Test for summary operator = PROPORTION_OF_COUNT
    Proportion of count of gender for each race/ethnicity
    Dataset used : https://www.kaggle.com/spscientist/students-performance-in-exams
    Args:
    Returns:
    """
    table = pandas.read_csv(
        'data/data_for_test_aspects/student_performance.csv')

    result = aspects.group_by(table, ['race/ethnicity'],
                              enums.SummaryOperators.PROPORTION_OF_COUNT)

    result_table = result['table']
    result_table = aspects.crop_other_columns(result_table,
                                              ['race/ethnicity', 'gender'])

    result_suggestions = result['suggestions']

    # Sum of proportion column should be (close to) 1.0.
    # Compare after rounding to 5 decimal places instead of exact float
    # equality, which can fail on floating point summation error; this is
    # consistent with the equivalent check in test_5.
    assert (float(format(result_table['gender'].sum(), '.5f')) == 1)

    print(result_table)

    expected_result_table = """  race/ethnicity  gender
0        group A   0.089
1        group B   0.190
2        group C   0.319
3        group D   0.262
4        group E   0.140"""

    expected_suggestions = "[]"

    assert (expected_result_table == result_table.to_string())
    assert (str(result_suggestions) == expected_suggestions)
예제 #3
0
def test_5():
    """Test for summary operator = PROPORTION_OF_SUM.

    Checks the proportion of the sum of reading score for each
    race/ethnicity group; no suggestions are expected.
    Dataset used : https://www.kaggle.com/spscientist/students-performance-in-exams
    """
    data = pandas.read_csv(
        'data/data_for_test_aspects/student_performance.csv')

    output = aspects.group_by(data, ['race/ethnicity'],
                              enums.SummaryOperators.PROPORTION_OF_SUM)

    proportions = aspects.crop_other_columns(
        output['table'], ['race/ethnicity', 'reading score'])
    suggestions = output['suggestions']

    # The proportion column should sum to (approximately) 1.0.
    assert float(format(proportions['reading score'].sum(), '.5f')) == 1

    print(proportions)

    expected_result_table = """  race/ethnicity  reading score
0        group A       0.083216
1        group B       0.185011
2        group C       0.318698
3        group D       0.265263
4        group E       0.147812"""

    expected_suggestions = "[]"

    assert expected_result_table == proportions.to_string()
    assert str(suggestions) == expected_suggestions
def _convert_to_set(table, columns):
    """ Converts the given columns of the table into a set of tuples.

    First the other columns are dropped, then each remaining row is
    inserted into a set as a tuple ordered as in `columns`.

    Args:
        table - Type-pandas.dataframe
            The table that is to be converted into a python set
        columns - Type-list of str
            Names of the columns to keep, in the order they should
            appear inside each tuple

    Returns:
        The function will return a set containing the required columns
        of the table as tuples.
    """
    table = aspects.crop_other_columns(table, columns)
    # Selecting table[columns] fixes the tuple ordering; itertuples works
    # for any index, whereas the previous positional .loc loop assumed a
    # clean 0..n-1 RangeIndex.
    return set(table[columns].itertuples(index=False, name=None))
예제 #5
0
def topk_results(table, metric, dimensions, is_asc, k, **kwargs):
    """Implements the top-k intent.

    Applies the optional date-range and slice filters, optionally groups
    by dimensions (when a summary operator is passed), sorts by `metric`
    and keeps the first k rows. Optional arguments arrive via kwargs and
    are skipped when None.

    Args:
        table: Type-pandas.dataframe
            It has the contents of the csv file
        metric: Type-string
            Column used for sorting; when grouping is done the summary
            operator is applied on this column. It may contain strings
            if the count operator is being applied.
        dimensions: Type-list of str
            Columns we want. In query:'top 5 batsman according to runs',
            dimension is 'batsman'. When summary_operator is not None,
            we group by dimensions.
        is_asc: Type-Bool
            Sort order, True for ascending, False for descending
        k: Type-int
            Number of entries to keep; k = -1 means keep all entries
        date_range: Type-tuple
            Tuple of start_date and end_date
        date_column_name: Type-str
            Name of the column which contains dates
        day_first: Type-str
            Whether the day occurs before the month in the dates of the
            date column. Example - '29-02-19', here day_first is true
        slices: Type-List of tuples
            (column_name, filter, value) conditions used to keep rows;
            filter is a Filters enum member, ex. Filters.IN
        summary_operator: Type-summary_operators enum members
            Applied on `metric` after grouping by dimensions,
            ex. SummaryOperators.MAX, SummaryOperators.SUM

    Note-summary_operator is always applied on metric column passed,
         and only when grouping is done

    Returns:
        A tuple (results, suggestions).
        results: Type - pandas dataframe, the results of the top-k intent
        suggestions: Type - List of strings, list of suggestions
    """
    date_column_name = kwargs.get('date_column_name', 'date')
    date_range = kwargs.get('date_range', None)
    day_first = kwargs.get('day_first', False)
    slices = kwargs.get('slices', None)
    summary_operator = kwargs.get('summary_operator', None)

    table = aspects.apply_date_range(table, date_range, date_column_name,
                                     day_first)
    table = aspects.slice_table(table, slices)

    # Columns that must survive the crop.
    keep_columns = [] if dimensions is None else dimensions.copy()
    keep_columns.append(metric)
    table = aspects.crop_other_columns(table, keep_columns)

    grouped = aspects.group_by(table, dimensions, summary_operator)
    table = grouped['table']
    suggestions = grouped['suggestions']

    # A stable sort ('mergesort') preserves the incoming order of rows
    # that share the same metric value.
    table = table.sort_values(by=[metric], ascending=is_asc, kind='mergesort')

    # Renumber the rows; drop=True discards the old index instead of
    # inserting it as an 'index' column.
    table = table.reset_index(drop=True)

    # Keep only the top-k rows (k == -1 means keep everything).
    if k != -1:
        table = table.head(k)

    return (table, suggestions)
def _time_compare_results(table, metric, time_compare_column,
                          date_range1, date_range2, day_first,
                          summary_operator, **kwargs):
    """Returns the results of the time-compare intent.

    Rows falling into each of the two date ranges are extracted, labelled
    with a human-readable "start - end" tag in the comparison column,
    combined, and then grouped by the dimensions plus the comparison
    column with the given summary operator.

    Args:
        table: Type-pandas.dataframe
            It has the contents of the csv file
        metric: Type-string
            Column the summary operator is applied on. It may contain
            strings if the count operator is being applied.
        dimensions: Type-list of str (kwargs)
            Columns we want; we group by dimensions.
        slices: Type-List of tuples (kwargs)
            (column_name, filter, value) conditions used to keep rows;
            filter is a Filters enum member, ex. Filters.IN
        time_compare_column: Type-string
            The column name by which we will do the comparison.
        date_range1: Type-tuple of start_date and end_date
            First date range for the comparison.
        date_range2: Type-tuple of start_date and end_date
            Second date range for the comparison.
        day_first: Type-str
            Whether the day occurs before the month in the dates of the
            date column. Example - '29-02-19', here day_first is true
        summary_operator: Type-summary_operators enum members
            Applied on `metric` after grouping,
            ex. SummaryOperators.MAX, SummaryOperators.SUM

    Returns:
        A tuple (results, suggestions).
        results: Type - pandas dataframe, the time-compare results
        suggestions: Type - List of strings, list of suggestions
    """
    slices = kwargs.get('slices', None)
    dimensions = kwargs.get('dimensions', None)

    table = aspects.slice_table(table, slices)

    keep_columns = [] if dimensions is None else dimensions.copy()
    keep_columns.append(time_compare_column)
    keep_columns.append(metric)
    cropped = aspects.crop_other_columns(table, keep_columns)

    # Extract the rows of each date range and tag them with a readable
    # "start - end" label so the two periods stay distinguishable after
    # the tables are combined.
    part1 = aspects.apply_date_range(cropped,
                                     date_range1,
                                     time_compare_column,
                                     day_first)
    part1[time_compare_column] = date_range1[0] + " - " + date_range1[1]

    part2 = aspects.apply_date_range(cropped,
                                     date_range2,
                                     time_compare_column,
                                     day_first)
    part2[time_compare_column] = date_range2[0] + " - " + date_range2[1]

    # Combine both periods into a single table and renumber the rows.
    combined = pandas.concat([part2, part1])
    combined = combined.reset_index(drop=True)

    group_columns = [] if dimensions is None else dimensions.copy()
    group_columns.append(time_compare_column)

    grouped = aspects.group_by(combined, group_columns,
                               summary_operator)

    return (grouped['table'], grouped['suggestions'])
예제 #7
0
def benchmark_set_too_different(table, metric, all_metric,
                                slice_compare_column, slice1, summary_operator,
                                **kwargs):
    """This function can give the benchmark set too different oversight.

    It compares the rows of the slice `slice1` against benchmark rows
    built from the whole table (relabelled '*') and, for every metric not
    named in the query, records the rows on which `slice1` also looks
    different from the benchmark.

    Args:
        table: Type-pandas.dataframe
            It has the contents of the csv file
        metric: Type-string
            It is the name of the column according to which we will do grouping,
            summary operator is applied on metric. Metric could a column
            containing strings, if we are applying count operator on it.
        dimensions: Type-list of str
            It is the name of column we want.
            In query:'compare batsman A and B according to total_runs',
             dimension is 'batsman'. we group by dimensions.
        all_metric: Type-list of str
            It contains list of all metrics
        slice_compare_column: Type-list of string
            first element denotes the column name by which we will do comparision.
            rest elements will the value belongs to that column by which we
            will compare the slices.
        summary_operator: Type-summary_operators enum members
            It denotes the summary operator, after grouping by dimensions.
            ex. SummaryOperators.MAX, SummaryOperators.SUM

    Note-summary_operator is always applied on metric column passed,
         and only when grouping is done

    Returns:
        return a list of dictionary in which every dictionary represent
               a oversight for some column, or None when no oversight
               was detected.

    """

    # Optional arguments; the downstream helpers skip them when None.
    date_column_name = kwargs.get('date_column_name', 'date')
    date_range = kwargs.get('date_range', None)
    day_first = kwargs.get('day_first', '%Y-%m-%d')

    slices = kwargs.get('slices', None)

    dimensions = kwargs.get('dimensions', None)

    table = aspects.apply_date_range(table, date_range, date_column_name,
                                     day_first)

    table = aspects.slice_table(table, slices)

    # Keep the dimensions, the comparison column and ALL metrics -- the
    # extra metrics are exactly what this oversight inspects.
    required_columns = []
    if dimensions is not None:
        required_columns = dimensions.copy()
    required_columns.append(slice_compare_column)
    required_columns = required_columns + all_metric

    table = aspects.crop_other_columns(table, required_columns)
    """ required_table_for_one is a table which has data for the 
        single variable given by the user by which we shall do 
        comparision"""
    required_table_for_one = aspects.slice_table(
        table, [(slice_compare_column, Filters.EQUAL_TO, slice1)])
    """ required_table_for_all is a table which has all the data which was in
        the initial table but with the comparision column as '*' because we
        have to compare single value with the whole table"""
    required_table_for_all = table.copy()
    required_table_for_all[slice_compare_column] = '*'

    # NOTE(review): reset_index() here keeps the old index as an extra
    # 'index' column (sibling functions use drop=True); the column is
    # discarded later by the columns_order reordering, but confirm this
    # is intentional.
    updated_table = pandas.concat(
        [required_table_for_one, required_table_for_all])
    updated_table = updated_table.reset_index()

    grouping_columns = []
    if dimensions is not None:
        grouping_columns = dimensions.copy()
    grouping_columns.append(slice_compare_column)

    result_table = aspects.group_by(updated_table, grouping_columns,
                                    summary_operator)['table']

    # Metrics that were not part of the user's original query.
    other_metrics = all_metric.copy()
    other_metrics.remove(metric)

    # Target column layout after reordering:
    # [dimensions...][slice_compare_column][metric][other metrics...]
    columns_order = grouping_columns.copy()
    columns_order.append(metric)
    columns_order = columns_order + other_metrics
    """ We have done the reordering of columns so that all the dimension
        should appear first followed by the metric given by the user and
        and then rest of the columns."""
    result_table = result_table[columns_order]

    num_rows = result_table.shape[0]
    num_columns = result_table.shape[1]
    dimensions_len = 0
    if dimensions is not None:
        dimensions_len = len(dimensions)

    result_table_matrix = result_table.values.tolist()

    suggestion = []

    # We have to iterate through all metric which was not involved
    # in the computation initially.
    # Column indices: 0..dimensions_len-1 are dimensions, dimensions_len
    # is the comparison column, dimensions_len + 1 is the user's metric,
    # so the extra metrics start at dimensions_len + 2.
    for column_i in range(dimensions_len + 2, num_columns):
        # it can store the index of row on whcih the oversight might appear.
        column_i_suggestion_list = []
        row_i = 0
        while row_i < num_rows:
            # Rows sharing the same dimension values are expected to come
            # in adjacent pairs (a slice1 row and a '*' benchmark row) --
            # presumably guaranteed by group_by's sorted output; TODO
            # confirm. A row without a partner is flagged directly.
            # Reported row numbers are 1-indexed.
            if row_i == num_rows - 1 or result_table_matrix[
                    row_i][:dimensions_len] != result_table_matrix[
                        row_i + 1][:dimensions_len]:
                column_i_suggestion_list.append({
                    'row': row_i + 1,
                    'confidence_score': 100
                })
            else:
                # A pair is flagged only when _calculate_relation reaches
                # the threshold for BOTH the extra metric (column_i) and
                # the user's metric (dimensions_len + 1); either check
                # falling below the threshold skips the pair.
                if _calculate_relation(
                        result_table_matrix[row_i][column_i],
                        result_table_matrix[row_i + 1]
                    [column_i]) < constants.BSTD_DISIMILARITY_THRESHOLD:
                    row_i = row_i + 1
                elif _calculate_relation(
                        result_table_matrix[row_i][dimensions_len + 1],
                        result_table_matrix[row_i + 1][dimensions_len + 1]
                ) < constants.BSTD_DISIMILARITY_THRESHOLD:
                    row_i = row_i + 1
                else:
                    # Both rows of the pair are reported.
                    column_i_suggestion_list.append({
                        'row': row_i + 1,
                        'confidence_score': 100
                    })
                    column_i_suggestion_list.append({
                        'row': row_i + 2,
                        'confidence_score': 100
                    })
                    row_i = row_i + 1
            row_i = row_i + 1

        if len(column_i_suggestion_list) > 0:
            """ for every suggestion we form a dictionary and 
                append it to the list of dictionary."""
            suggestion_i = {}
            suggestion_i[
                'suggestion'] = slice1 + ' looks different from others on ' + metric + '. You might also want to look at ' + columns_order[
                    column_i] + ' since ' + slice1 + ' also looks different on this.'
            suggestion_i['oversight'] = Oversights.BENCHMARK_SET_TOO_DIFFERENT
            suggestion_i['is_row_level_suggestion'] = True
            suggestion_i['row_list'] = column_i_suggestion_list
            suggestion.append(suggestion_i)

    if len(suggestion) == 0:
        return None
    else:
        return suggestion
예제 #8
0
def list_index_in_topk(table, metric, dimensions, is_asc, k, **kwargs):
    """Marks which rows of the original table survive the top-k intent.

    Applies the same date-range / slice / crop / sort / head pipeline as
    the top-k intent, but with index resetting suppressed so surviving
    rows keep their original positions. 0-indexing is followed.

    Args: (same as top-k; no summary_operator, since when grouping is
           done the insert_as_column option would not be given)
        table: Type-pandas.dataframe
            It has the contents of the csv file
        metric: Type-string
            Column used for sorting. It may contain strings if the count
            operator is being applied.
        dimensions: Type-list of str
            Columns we want. In query:'top 5 batsman according to runs',
            dimension is 'batsman'.
        is_asc: Type-Bool
            Sort order, True for ascending, False for descending
        k: Type-int
            Number of entries to keep; k = -1 means keep all entries
        date_range: Type-tuple
            Tuple of start_date and end_date
        date_column_name: Type-str
            Name of the column which contains dates
        date_format: Type-str
            Format string for datetime.strptime, see
            https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior
        slices: Type-List of tuples
            (column_name, filter, value) conditions used to keep rows;
            filter is a Filters enum member, ex. Filters.IN

    Returns:
        List of True/False where list[i] denotes whether the ith row of
        the original table is in the top-k.
    """
    date_column_name = kwargs.get('date_column_name', 'date')
    date_range = kwargs.get('date_range', None)
    date_format = kwargs.get('date_format', '%Y-%m-%d')
    slices = kwargs.get('slices', None)

    # One flag per original row, all initially False.
    in_topk = [False] * table.shape[0]

    # reset_index=False keeps the original row labels through filtering
    # so they can be mapped back into in_topk at the end.
    table = aspects.apply_date_range(table,
                                     date_range,
                                     date_column_name,
                                     date_format,
                                     reset_index=False)
    table = aspects.slice_table(table, slices, reset_index=False)

    keep_columns = [] if dimensions is None else dimensions.copy()
    keep_columns.append(metric)
    table = aspects.crop_other_columns(table, keep_columns)

    # Stable sort so rows with equal metric keep their relative order.
    table = table.sort_values(by=[metric], ascending=is_asc, kind='mergesort')

    if k != -1:
        table = table.head(k)

    # The surviving index labels are the original row positions.
    for original_row in table.index:
        in_topk[original_row] = True

    return in_topk
예제 #9
0
def _slice_compare_results_for_all(table, metric, slice_compare_column, slice1,
                                   slice2, summary_operator, **kwargs):
    """Implements the slice-compare intent against the whole table.

    After the optional date-range and slice filters, a copy of the table
    relabelled 'ALL' in the comparison column is combined with the rows
    matching `slice1`, and the combined rows are grouped by the
    dimensions plus the comparison column.

    Args:
        table: Type-pandas.dataframe
            It has the contents of the csv file
        metric: Type-string
            Column the summary operator is applied on. It may contain
            strings if the count operator is being applied.
        dimensions: Type-list of str (kwargs)
            Columns we want; we group by dimensions.
        date_range: Type-tuple (kwargs)
            Tuple of start_date and end_date
        date_column_name: Type-str (kwargs)
            Name of the column which contains dates
        day_first: Type-str (kwargs)
            Format string for datetime.strptime, see
            https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior
        slices: Type-List of tuples (kwargs)
            (column_name, filter, value) conditions used to keep rows;
            filter is a Filters enum member, ex. Filters.IN
        slice_compare_column: Type-list of string
            first element denotes the column name by which we will do
            comparison, rest are the values of that column to compare.
        slice1: value of slice_compare_column selecting the compared slice.
        slice2: accepted for interface parity with sibling intents; not
            used by this variant (the benchmark here is the whole table).
        summary_operator: Type-summary_operators enum members
            Applied on `metric` after grouping,
            ex. SummaryOperators.MAX, SummaryOperators.SUM

    Returns:
        The result of aspects.group_by applied on the combined table.
    """
    date_column_name = kwargs.get('date_column_name', 'date')
    date_range = kwargs.get('date_range', None)
    day_first = kwargs.get('day_first', '%Y-%m-%d')
    slices = kwargs.get('slices', None)
    dimensions = kwargs.get('dimensions', None)

    table = aspects.apply_date_range(table, date_range, date_column_name,
                                     day_first)
    table = aspects.slice_table(table, slices)

    # Columns that must survive the crop.
    keep_columns = [] if dimensions is None else dimensions.copy()
    keep_columns.append(slice_compare_column)
    keep_columns.append(metric)
    table = aspects.crop_other_columns(table, keep_columns)

    # Rows belonging to the slice the user wants to compare.
    slice1_rows = aspects.slice_table(
        table, [(slice_compare_column, Filters.EQUAL_TO, slice1)])

    # Every row, relabelled 'ALL', acts as the benchmark set.
    benchmark_rows = table.copy()
    benchmark_rows[slice_compare_column] = 'ALL'

    combined = pandas.concat([benchmark_rows, slice1_rows])
    combined = combined.reset_index(drop=True)

    # Columns on which the grouping is performed.
    group_columns = [] if dimensions is None else dimensions.copy()
    group_columns.append(slice_compare_column)

    return aspects.group_by(combined, group_columns, summary_operator)
예제 #10
0
def trend(table, metric, granularity, summary_operator, **kwargs):
    """This function will implement the trend intent

    Groups by time, then applies summary operator on the column-name=metric to
    compress the rows having similar date according to time granularity,
    after applying slicing and date range. If some the optional args are
    None(not passed), it is assumed that we don't have to apply them.
    Only slices and date range parameters(range, column_name, format)
    is optional, so it is passed using kwargs.

    Args:
        table: Type-pandas.dataframe
            It has the contents of the csv file
        metric: Type-string
            It is the name of the column according to which we sort, and in the
            case when grouping has to be done, summary operator is applied
            on metric. Metric could be a column containing strings,
            if we are applying count operator on it.
        date_range: Type-tuple
            Tuple of start_date and end_date, if it is null,
            represents that we don't need to crop
        date_column_name: Type-str
            It is the name of column which contains date
        date_format: Type-str
            It is required by datetime.strp_time to parse the date in the format
            Format Codes-
https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior
        slices: Type-dictionary (will be changed)
            contains the key as column name and value as
            instance we want to slice
        summary_operator: Type-summary_operators enum member
            It denotes the summary operator
        granularity: Type-granularities enum member
            It denotes the granularity we need to apply to the dates.

    Returns:
        The function will return the `table(a pandas dataframe object)`
        after applying the intent on the given
        `table(a pandas dataframe object)`

    """
    date_column_name = kwargs.get('date_column_name', 'date')
    date_range = kwargs.get('date_range', None)
    # NOTE(review): 'yyyy-mm-dd' is not a valid strptime format string
    # ('%Y-%m-%d' would be), yet it is passed to strptime below; this
    # only works if callers always pass date_format explicitly -- confirm.
    date_format = kwargs.get('date_format', 'yyyy-mm-dd')

    slices = kwargs.get('slices', None)

    table = aspects.apply_date_range(table, date_range, date_column_name,
                                     date_format)

    table = aspects.slice_table(table, slices)

    # collecting the colums not to be removed
    required_columns = [date_column_name, metric]

    table = aspects.crop_other_columns(table, required_columns)

    # Snap every date down to the requested granularity so that rows in
    # the same period share an identical date string before grouping.
    num_rows = table.shape[0]
    for row in range(num_rows):
        row_date = datetime.datetime.strptime(table.loc[row, date_column_name],
                                              date_format)
        row_date = aspects.granular_time(row_date, granularity)
        table.loc[row, date_column_name] = row_date.strftime(date_format)

    # NOTE(review): sibling intents in this file index aspects.group_by's
    # result with ['table']/['suggestions']; here the result is sorted
    # directly, which assumes a dataframe is returned -- confirm which
    # version of aspects this block targets.
    table = aspects.group_by(table, [date_column_name], summary_operator)

    # Present the trend in chronological order of the granular dates.
    table = table.sort_values(by=[date_column_name])

    return table
예제 #11
0
def show(table, **kwargs):
    """Implements the show intent.

    Removes tuples outside the given date range, then applies slicing,
    cropping and finally the group-by operation. Only `table` is
    mandatory; everything else arrives through kwargs and is skipped
    when None. When summary_operator is not None, grouping is done by
    dimensions with the operator applied on metric.

    Args:
        table: Type-pandas.dataframe
            It has the contents of the csv file
        metric: Type-string
            Optional; column the summary operator is applied on when
            grouping. It may contain strings if the count operator is
            being applied.
        dimensions: Type-list of str
            Columns we want. In query:'show all batsman', dimension is
            'batsman'.
        date_range: Type-tuple
            Tuple of start_date and end_date
        date_column_name: Type-str
            Name of the column which contains dates
        date_format: Type-str
            Format string for datetime.strptime, see
            https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior
        slices: Type-dictionary (will be changed)
            contains the key as column name and value as instance we
            want to slice
        summary_operator: Type-summary_operators enum members
            Applied on `metric` after grouping by dimensions,
            ex. SummaryOperators.MAX, SummaryOperators.SUM

    Returns:
        The result of aspects.group_by applied on the prepared table.
    """
    date_column_name = kwargs.get('date_column_name', 'date')
    date_range = kwargs.get('date_range', None)
    date_format = kwargs.get('date_format', 'yyyy-mm-dd')
    slices = kwargs.get('slices', None)
    summary_operator = kwargs.get('summary_operator', None)
    metric = kwargs.get('metric', None)
    dimensions = kwargs.get('dimensions', None)

    table = aspects.apply_date_range(table, date_range, date_column_name,
                                     date_format)
    table = aspects.slice_table(table, slices)

    # Columns that must survive the crop; metric is optional in show.
    keep_columns = [] if dimensions is None else dimensions.copy()
    if metric is not None:
        keep_columns.append(metric)
    table = aspects.crop_other_columns(table, keep_columns)

    return aspects.group_by(table, dimensions, summary_operator)
예제 #12
0
def topk_results(table, metric, dimensions, is_asc, k, **kwargs):
    """This function will implement the top-k intent

    Rows are filtered by the optional date range and slices, optionally
    grouped by the dimensions (applying the summary operator on the
    metric), then ranked by the metric and truncated to the first k rows.

    Args:
        table: Type-pandas.dataframe
            It has the contents of the csv file
        metric: Type-string
            Column used for ranking; when grouping is done the summary
            operator is applied on it. It may hold strings if the count
            operator is being applied.
        dimensions: Type-list of str
            Columns to group by, ex. in 'top 5 batsman according to runs'
            the dimension is 'batsman'. Grouping happens only when
            summary_operator is not None.
        is_asc: Type-Bool
            Sort order - True for ascending, False for descending
        k: Type-int
            Number of rows to keep; k = -1 means keep all rows
        date_range: Type-tuple
            Tuple of start_date and end_date
        date_column_name: Type-str
            Name of the column which contains the date
        date_format: Type-str
            strptime-style format codes used to parse the dates,
            https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior
        slices: Type-List of tuples
            Each tuple (column_name, filter, value) is a condition a row
            must satisfy to be kept; filter is a Filters enum member,
            ex. Filters.IN
        summary_operator: Type-summary_operators enum members
            Operator applied on the metric after grouping,
            ex. SummaryOperators.MAX, SummaryOperators.SUM

    Note-summary_operator is always applied on metric column passed,
         and only when grouping is done

    Returns:
        The resulting table (a pandas dataframe object) after applying
        the top-k intent on the given table.
    """
    date_column_name = kwargs.get('date_column_name', 'date')
    date_range = kwargs.get('date_range', None)
    date_format = kwargs.get('date_format', '%Y-%m-%d')
    slices = kwargs.get('slices', None)
    summary_operator = kwargs.get('summary_operator', None)

    # Drop rows outside the requested date window, then apply row filters.
    table = aspects.apply_date_range(table, date_range, date_column_name,
                                     date_format)
    table = aspects.slice_table(table, slices)

    # Keep only the grouping columns plus the metric being ranked.
    keep_columns = [] if dimensions is None else dimensions.copy()
    keep_columns.append(metric)
    table = aspects.crop_other_columns(table, keep_columns)

    table = aspects.group_by(table, dimensions, summary_operator)

    # Rank by the metric and renumber the index from zero
    # (drop=True discards the old index instead of adding it as a column).
    table = table.sort_values(by=[metric], ascending=is_asc)
    table = table.reset_index(drop=True)

    # k == -1 is the sentinel for "return every row".
    if k == -1:
        return table
    return table.head(k)
def _weighted_mean_results(table, metric, weight_col, **kwargs):
    """ This function implements the weighted mean intent

    Formula used for calculating weighted mean -

    metric column = {a1, a2, ... ai}
    weight column = {w1, w2, ... wi}

    weighted mean = _sum(ai * wi) / _sum(wi)

    In case _sum(wi) is 0, NaN is returned.

    When a list of dimensions is also passed, the weighted mean is
    calculated for each grouping.

    Args :
        table: Type-pandas.dataframe
            It has the contents of the table in sheets
        metric : Type-string
            The column whose mean is to be found
        weight_col : Type-string
            The column whose values are used as weights for the metric
        dimensions: Type-list of str
            Columns according to which groups are formed; when passed,
            the weighted mean is calculated per group
        date_range: Type-tuple
            Tuple of start_date and end_date
        date_column_name: Type-str
            It is the name of column which contains date
        date_format: Type-str
            It is required to parse the dates; Format Codes -
            https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior
        slices: Type-List of tuples
            Tuple represents the conditon to keep the row.
            (column_name, filter, value)
            column_name - is the value of the column that the
            condition is applied upon.
            filter - Filters enum members, ex. Filters.IN

    Returns : Type-pandas.dataframe, the result table of the intent
        containing a column with the weighted mean
    """
    date_column_name = kwargs.get('date_column_name', 'date')
    date_range = kwargs.get('date_range', None)
    date_format = kwargs.get('date_format', 'yyyy-mm-dd')

    slices = kwargs.get('slices', None)

    dimensions = kwargs.get('dimensions', None)

    table = aspects.apply_date_range(table, date_range, date_column_name,
                                     date_format)

    table = aspects.slice_table(table, slices)

    weighted_mean_col = 'mean of ' + metric + ' weighted by ' + weight_col

    if dimensions is None:
        # Single overall weighted mean: sum(ai * wi) / sum(wi)
        table[weighted_mean_col] = table[metric] * table[weight_col]
        weighted_mean = table[weighted_mean_col].sum() / table[weight_col].sum()

        # Insert the weighted mean into a fresh single-cell result table
        # (local was previously misspelled 'tbale').
        result_table = pandas.DataFrame([(weighted_mean)],
                                        columns=[weighted_mean_col])

        return result_table
    else:
        table = aspects.crop_other_columns(table, dimensions + [metric, weight_col])

        # New column holding the product of the metric and its weight;
        # summing it per group gives the numerator sum(ai * wi), while the
        # summed weight column gives the denominator sum(wi).
        table[weighted_mean_col] = table[metric] * table[weight_col]
        table = table.groupby(dimensions).sum()

        table[weighted_mean_col] /= table[weight_col]

        table = table.reset_index()

        # The table now holds the weighted mean for each group; drop the
        # intermediate metric/weight columns.
        table = aspects.crop_other_columns(table, dimensions + [weighted_mean_col])

        return table
예제 #14
0
def _correlation_results(table, metric1, metric2, **kwargs):
    """ This functions implements the correlation intent

    Uses the pandas corr() function to find the correlation coefficient.

    If grouping is not done the result will be a single number - the
        correlation coefficient.
    If grouping is done the result will be a table - the correlation
        coeffiecient for each group.

    There are 3 types of correlations:
        Standard correlation coefficient
        Kendall Tau correlation coefficient
        Spearman rank correlation

    Here the standard `Pearson correlation coefficient` is used.

    Args :
        table: Type-pandas.dataframe
            It has the contents of the table in sheets
        metric1, metric2 : Type-strings
            The columns between which the correlation is to be found
        dimensions: Type-list of str
            It the list of columns according to which groups are formed
            If these are passed we calculate the correlation for each group
        date_range: Type-tuple
            Tuple of start_date and end_date
        date_column_name: Type-str
            It is the name of column which contains date
        date_format: Type-str
            It is required by datetime.strp_time to parse the date in the format
            Format Codes -https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior
        slices: Type-List of tuples
            Tuple represents the conditon to keep the row.
            (column_name, filter, value)
            column_name - is the value of the column that the
            condition is applied upon.
            filter - Filters enum members, ex. Filters.IN

    Returns : Type-pandas.dataframe
        Without dimensions - a single-cell table holding the coefficient;
        with dimensions - one row per group with its coefficient.
    """
    date_column_name = kwargs.get('date_column_name', 'date')
    date_range = kwargs.get('date_range', None)
    # NOTE(review): default looks like a display pattern rather than a
    # strptime format code ('%Y-%m-%d') — confirm what apply_date_range
    # actually expects.
    date_format = kwargs.get('date_format', 'yyyy-mm-dd')

    slices = kwargs.get('slices', None)

    dimensions = kwargs.get('dimensions',None)

    table = aspects.apply_date_range(table, date_range,date_column_name, date_format)

    table = aspects.slice_table(table, slices)

    correlation_col = 'correlation between "' + metric1 + '" , "' + metric2 + '"'

    if dimensions is None:
        # No grouping: a single Pearson coefficient between the two columns.
        correlation = table[metric1].corr(table[metric2])
        result_table = pandas.DataFrame([(correlation)], columns=[correlation_col])
        return result_table
    else:
        table = aspects.crop_other_columns(table, dimensions + [metric1, metric2])
        # corr() on the grouped frame yields, per group, the 2x2 correlation
        # matrix stacked long-wise (one row per metric).
        table = table.groupby(dimensions).corr().reset_index()
        table = aspects.crop_other_columns(table, dimensions + [metric1])
        table = table.rename(columns={metric1:correlation_col})
        # Each group now has two values in the metric1 column: corr(m1, m1),
        # which is 1.0, and corr(m1, m2). The group-wise min keeps
        # corr(m1, m2), since a correlation coefficient never exceeds 1.
        table = table.groupby(dimensions).min()
        table = table.reset_index()

        return table
예제 #15
0
def show(table, **kwargs):
    """This function will implement the show intent

    Firstly removes the tuples that do not lie in the given date range.
    Then applies slicing and groupby operations.
    The argument 'table' is not optional , hence passed as it is.
    Rest of the arguments are optional , hence passed in kwargs
    If the summary_operator is not None , it groups by dimensions.
    If some of the optional args are None (not passed),
    it is assumed that we don't have to apply them.

    Also, if summary operator is applied, the name of metric column is
    renamed to "<summary operator> of metric".

    Args:
        table: Type-pandas.dataframe
            It has the contents of the csv file
        metric: Type-string
            It is the name of the column on which
            summary operator is applied in case of grouping. Metric could a column
            containing strings, if we are applying count operator on it.
        dimensions: Type-list of str
            It is the name of column we want.
            In query:'show all batsman', dimension is 'batsman'.
            When summary_operator is not None, we group by dimensions.
        date_range: Type-tuple
            Tuple of start_date and end_date
        date_column_name: Type-str
            It is the name of column which contains date
        day_first: Type-bool
            Denotes whether the day occurs before the month in the
            dates in the date column
            Example - '29-02-19', here day_first is true
        slices: Type-List of tuples
            Tuple represents the conditon to keep the row.
            (column_name, filter, value)
            column_name - is the value of the column that the
            condition is applied upon.
            filter - Filters enum members, ex. Filters.IN
        summary_operator: Type-summary_operators enum members
            It denotes the summary operator, after grouping by dimensions.
            ex. SummaryOperators.MAX, SummaryOperators.SUM

    Note-summary_operator is always applied on metric column passed,
         and only when grouping is done

    Returns:
        The function will return both suggestions and the results in a tuple.
        (results, suggestions)

        results: Type - pandas dataframe, The results of the intended show

        suggestions: Type - List of dictionaries(suggestion structure), List of
            suggestions.
    """

    date_column_name = kwargs.get('date_column_name', 'date')
    date_range = kwargs.get('date_range', None)
    day_first = kwargs.get('day_first', False)

    slices = kwargs.get('slices', None)

    summary_operator = kwargs.get('summary_operator', None)

    metric = kwargs.get('metric', None)

    dimensions = kwargs.get('dimensions', None)

    table = aspects.apply_date_range(table, date_range, date_column_name,
                                     day_first)

    table = aspects.slice_table(table, slices)

    # collecting the colums not to be removed
    required_columns = []
    if dimensions is not None:
        required_columns = dimensions.copy()
    # metric is optional in show
    if metric is not None:
        required_columns.append(metric)

    table = aspects.crop_other_columns(table, required_columns)

    # When there is no dimension to group by but a summary_operator exists ,
    # we assume that user wants to apply the summary operator on whole data
    if ((dimensions is None) and (summary_operator is not None)):
        # We add temporary column of 'Summary Operator' by which we can groupby
        # to obtain the final result
        _add_temporary_column_of_summary_operator(table, summary_operator)

        dimensions = []

        # To groupby 'Summary Operator' column inserted
        dimensions.append('Summary Operator')

        after_group_by = aspects.group_by(table, dimensions, summary_operator)

        table = after_group_by['table']

        suggestions = []

        if len(after_group_by['suggestions']) > 0:
            suggestions.extend(after_group_by['suggestions'])

        # Droping the 'Summary Operator' column which was inserted above
        table = table.drop(columns=['Summary Operator'])

        table = aspects.update_metric_column_name(table, summary_operator,
                                                  metric)

        return (table, suggestions)

    after_group_by = aspects.group_by(table, dimensions, summary_operator)

    table = after_group_by['table']

    suggestions = []

    different_weight_suggestion = weighted_mean_with_different_weights.\
                                  weighted_mean_with_different_weights(table, metric)

    if different_weight_suggestion is not None:
        suggestions.append(different_weight_suggestion)

    if len(after_group_by['suggestions']) > 0:
        suggestions.extend(after_group_by['suggestions'])

    # Rank the collected oversight suggestions in the show-intent order.
    order = oversights_order.ORDER_IN_SHOW
    suggestions = rank_oversights.rank_oversights(suggestions, order)

    if summary_operator is not None:
        table = aspects.update_metric_column_name(table, summary_operator,
                                                  metric)

    # The unreachable duplicate ranking/return that followed this return
    # has been removed.
    return (table, suggestions)
예제 #16
0
def top_down_error(table, metric, all_dimensions, slice_compare_column, slice1,
                   slice2, summary_operator, **kwargs):
    """This function will implement the top down error debaising

    Args:
        table: Type-pandas.dataframe
            It has the contents of the csv file
        metric: Type-string
            It is the name of the column according to which we will do
            grouping, summary operator is applied on metric. Metric
            could a column containing strings, if we are applying count
            operator on it.
        dimensions: Type-list of str or None
            It is the name of column we want.
            In query:'compare batsman A and B according to total_runs',
             dimension is 'batsman'. we group by dimensions.
        all_dimensions: Type-list of str
            It contains list of all dimensions
        slice_compare_column: Type-string
            name of the slice-compare column.
        slice1: Type-string
            the first value of comparision
        slice2: Type-string
            the second value of comparision
        date_range: Type-tuple
            Tuple of start_date and end_date
        date_column_name: Type-str
            It is the name of column which contains date
        day_first: Type-str
            Format codes used to parse the dates (despite the name it
            defaults to a strptime pattern '%Y-%m-%d'), Format Codes
            https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior
        slices: Type-List of tuples
            Tuple represents the conditon to keep the row.
            (column_name, filter, value)
            column_name - is the value of the column that the
            condition is applied upon.
            filter - Filters enum members, ex. Filters.IN
        summary_operator: Type-summary_operators enum members
            It denotes the summary operator, after grouping by dimensions.
            ex. SummaryOperators.MAX, SummaryOperators.SUM

    Note-summary_operator is always applied on metric column passed,
         and only when grouping is done

    Returns:
        return a list of dictionary where each dictionary represent
               a debiasing suggestion according to the new column.
    """

    date_column_name = kwargs.get('date_column_name', 'date')
    date_range = kwargs.get('date_range', None)
    day_first = kwargs.get('day_first', '%Y-%m-%d')

    slices = kwargs.get('slices', None)

    dimensions = kwargs.get('dimensions', None)

    table = aspects.apply_date_range(table, date_range, date_column_name,
                                     day_first)

    # Keep only the rows belonging to the two slices under comparison,
    # on top of any user-supplied slice conditions.
    slice_list = []
    if slices is not None:
        slice_list = slices.copy()
    slice_list.append((slice_compare_column, Filters.IN, [slice1, slice2]))

    table = aspects.slice_table(table, slice_list)

    # removing all metric column except the one by which we do
    # group_by operation
    required_columns = all_dimensions.copy()
    required_columns.append(metric)
    table = aspects.crop_other_columns(table, required_columns)

    # operational_dimensions contain list of all dimension except
    # slice_compare_column
    operational_dimensions = all_dimensions.copy()
    operational_dimensions.remove(slice_compare_column)

    required_columns = []
    if dimensions is not None:
        required_columns = dimensions.copy()
    required_columns.append(slice_compare_column)
    required_columns.append(metric)

    query_table = aspects.crop_other_columns(table, required_columns)

    grouping_columns = []
    if dimensions is not None:
        grouping_columns = dimensions.copy()
    grouping_columns.append(slice_compare_column)

    # result_table is the result table requested by user.
    result_table = aspects.group_by(query_table, grouping_columns,
                                    summary_operator)
    # suggestions store the list of debiasing for this oversight.
    suggestions = []

    dimension_list = []
    if dimensions is not None:
        dimension_list = dimensions.copy()

    dimensions_len = len(dimension_list)

    for column in operational_dimensions:
        # we try to find the debiasing for every column which
        # is not in the grouping list initially
        if column not in dimension_list:
            new_grouping_columns = dimension_list.copy()
            new_grouping_columns.append(column)
            new_grouping_columns.append(slice_compare_column)

            new_required_columns = new_grouping_columns.copy()
            new_required_columns.append(metric)
            new_cropped_table = aspects.crop_other_columns(
                table, new_required_columns)

            # result table after adding the new column in the grouping list.
            new_result_table = aspects.group_by(new_cropped_table,
                                                new_grouping_columns,
                                                summary_operator)

            # it will return the debiasing suggestion after comparing the
            # initial result table and new result table.
            new_suggestion = _check_top_down_error(result_table,
                                                   new_result_table, column,
                                                   slice1, dimensions_len)
            # PEP 8: compare to None with 'is not', not '!='
            if new_suggestion is not None:
                suggestions.append(new_suggestion)
    return suggestions
예제 #17
0
def slice_compare(table, metric, dimensions, slices, slice_compare_column, 
                                            summary_operator, **kwargs):
    """This function will implement the slice-compare intent

    Also removes the tuples that do not lie in the given date range.
    The arguments 'table, metric,dimension,slices, slices_compare_column, 
    summary_operator' are not optional, so they are passed as it is,
    'date_range' will be passed in kwargs.
    If some the optional args are None(not passed),
    it is assumed that we don't have to apply them.

    Args:
        table: Type-pandas.dataframe
            It has the contents of the csv file
        metric: Type-string
            It is the name of the column according to which we have group to be done,
            summary operator is applied on metric. Metric could a column
            containing strings, if we are applying count operator on it.
        dimensions: Type-list of str
            It is the name of column we want.
            In query:'top 5 batsman according to runs', dimension is 'batsman'.
            When summary_operator is not None, we group by dimensions.
        date_range: Type-tuple
            Tuple of start_date and end_date
        date_column_name: Type-str
            It is the name of column which contains date
        date_format: Type-str
            It is required to parse the date in the format, Format Codes
            https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior
        slices: Type-dictionary (will be changed)
            contains the key as column name and value as instance we want
            to slice
        slice_compare_column: Type-list of string
            first element denotes the column name by which we will do comparision.
            rest elements will the value belongs to that column by which we
            will compare the slices.
        summary_operator: Type-summary_operators enum members
            It denotes the summary operator, after grouping by dimensions.
            ex. SummaryOperators.MAX, SummaryOperators.SUM

    Note-summary_operator is always applied on metric column passed,
         and only when grouping is done

    Returns:
        The function will return the `table(a pandas dataframe object)`
        after applying the intent on the
        given `table(a pandas dataframe object)``

    """

    date_column_name = kwargs.get('date_column_name', 'date')
    date_range = kwargs.get('date_range', None)
    date_format = kwargs.get('date_format', 'yyyy-mm-dd')

    table = aspects.apply_date_range(table, date_range,
                                     date_column_name, date_format)

    table = aspects.slice_table(table, slices)

    # collecting the colums not to be removed
    required_columns = []
    if dimensions is not None:
        required_columns = dimensions.copy()
    required_columns.append(metric)

    table = aspects.crop_other_columns(table, required_columns)

    # slice_compare_column should be the last element of the group
    # so that groupby will show them together for every grouping.
    # Work on a copy so the caller's 'dimensions' list is not mutated.
    grouping_columns = dimensions.copy()
    grouping_columns.remove(slice_compare_column[0])
    grouping_columns.append(slice_compare_column[0])
    table = aspects.group_by(table, grouping_columns, summary_operator)

    return table
def _slice_compare_results(table, metric, slice_compare_column, slice1, slice2,
                           summary_operator, **kwargs):
    """This function will implement the slice-compare intent

    Also removes the tuples that do not lie in the given date range.
    The arguments 'table, metric,dimension,slices_compare_column,
    summary_operator' are not optional, so they are passed as it is,
    'date_range','slices' will be passed in kwargs.
    If some the optional args are None(not passed),
    it is assumed that we don't have to apply them.

    Args:
        table: Type-pandas.dataframe
            It has the contents of the csv file
        metric: Type-string
            It is the name of the column according to which grouping will be done.
            summary operator is applied on metric. Metric could a column
            containing strings, if we are applying count operator on it.
        dimensions: Type-list of str
            It is the name of column we want.
            'compare batsman A and B according to total_runs'
             dimension is 'batsman'. we group by dimensions.
        date_range: Type-tuple
            Tuple of start_date and end_date
        date_column_name: Type-str
            It is the name of column which contains date
        day_first: Type-bool
            Denotes whether the day occurs before the month in the dates
            in the date column
        slices: Type-List of tuples
            Tuple represents the conditon to keep the row.
            (column_name, filter, value)
            column_name - is the value of the column that the
            condition is applied upon.
            filter - Filters enum members, ex. Filters.IN
        slice_compare_column: Type-string
            the column name by which we will do the comparision.
        summary_operator: Type-summary_operators enum members
            It denotes the summary operator, after grouping by dimensions.
            ex. SummaryOperators.MAX, SummaryOperators.SUM

    Note-summary_operator is always applied on metric column passed,
         and only when grouping is done

    Returns:
        The function will return both suggestions and the results in a tuple.
        (results, suggestions)
        results: Type - pandas dataframe, The results of the intended slice-compare
        suggestions: Type - List of strings, List of suggestions.

    """

    date_column_name = kwargs.get('date_column_name', 'date')
    date_range = kwargs.get('date_range', None)
    day_first = kwargs.get('day_first', False)

    slices = kwargs.get('slices', None)

    dimensions = kwargs.get('dimensions', None)

    table = aspects.apply_date_range(table, date_range, date_column_name,
                                     day_first)

    # Restrict rows to the two slices under comparison. Building a new
    # list (instead of append) avoids mutating the caller's 'slices'.
    compare_condition = (slice_compare_column, Filters.IN, [slice1, slice2])
    if slices is None:
        slices = [compare_condition]
    else:
        slices = slices + [compare_condition]
    table = aspects.slice_table(table, slices)

    # collecting the colums not to be removed
    required_columns = []
    if dimensions is not None:
        required_columns = dimensions.copy()
    required_columns.append(slice_compare_column)
    required_columns.append(metric)

    table = aspects.crop_other_columns(table, required_columns)

    # slice_compare_column should be the last element of the group
    # so that groupby will show them together for every grouping
    grouping_columns = []
    if dimensions is not None:
        grouping_columns = dimensions.copy()
    grouping_columns.append(slice_compare_column)

    after_group_by = aspects.group_by(table, grouping_columns,
                                      summary_operator)

    result_table = after_group_by['table']

    suggestions = after_group_by['suggestions']

    return (result_table, suggestions)