def test_6():
    """ Test for oversight : Attribution With Hidden Negative

    Proportion of sum of reading score for each race/ethnicity.
    The dataset is doctored so that it contains a negative reading
    score, which should trigger the oversight suggestion below.

    Dataset used :
    https://www.kaggle.com/spscientist/students-performance-in-exams

    Args:

    Returns:

    """
    table = pandas.read_csv(
        'data/data_for_test_aspects/student_performance_updated_to_create_attribution_with_hidden_negative_oversight.csv'
    )
    result = aspects.group_by(table, ['race/ethnicity'],
                              enums.SummaryOperators.PROPORTION_OF_SUM)
    result_table = result['table']
    result_table = aspects.crop_other_columns(
        result_table, ['race/ethnicity', 'reading score'])
    result_suggestions = result['suggestions']

    print(result_table)

    # Expected output is compared as the exact string produced by
    # pandas.DataFrame.to_string() (right-aligned columns).
    expected_result_table = """   race/ethnicity  reading score
0         group A       0.083434
1         group B       0.185493
2         group C       0.316920
3         group D       0.265955
4         group E       0.148198"""

    # The suggestion list is compared via its str() representation,
    # including the enum repr of the oversight member.
    expected_suggestions = "[{'suggestion': 'There exists negative values among the values on which proportion is being applied', 'oversight': <Oversights.ATTRIBUTION_WITH_HIDDEN_NEGATIVES: 11>, 'is_row_level_suggestion': True, 'confidence_score': 1, 'row_list': [{'row': 14, 'confidence_score': 1}]}]"

    assert (expected_result_table == result_table.to_string())
    assert (str(result_suggestions) == expected_suggestions)
def test_4():
    """ Test for summary operator = PROPORTION_OF_COUNT

    Proportion of count of gender for each race/ethnicity.

    Dataset used :
    https://www.kaggle.com/spscientist/students-performance-in-exams

    Args:

    Returns:

    """
    table = pandas.read_csv(
        'data/data_for_test_aspects/student_performance.csv')
    result = aspects.group_by(table, ['race/ethnicity'],
                              enums.SummaryOperators.PROPORTION_OF_COUNT)
    result_table = result['table']
    result_table = aspects.crop_other_columns(result_table,
                                              ['race/ethnicity', 'gender'])
    result_suggestions = result['suggestions']

    # Sum of proportion column should be(close to) 1.0
    # NOTE(review): exact float equality works here because the counts
    # divide evenly; confirm this stays robust if the dataset changes.
    assert (result_table['gender'].sum() == 1.0)

    print(result_table)

    # Expected output is compared as the exact string produced by
    # pandas.DataFrame.to_string().
    expected_result_table = """   race/ethnicity  gender
0         group A   0.089
1         group B   0.190
2         group C   0.319
3         group D   0.262
4         group E   0.140"""

    expected_suggestions = "[]"

    assert (expected_result_table == result_table.to_string())
    assert (str(result_suggestions) == expected_suggestions)
def test_5():
    """ Test for summary operator = PROPORTION_OF_SUM

    Proportion of sum of reading score for each race/ethnicity.

    Dataset used :
    https://www.kaggle.com/spscientist/students-performance-in-exams

    Args:

    Returns:

    """
    table = pandas.read_csv(
        'data/data_for_test_aspects/student_performance.csv')
    result = aspects.group_by(table, ['race/ethnicity'],
                              enums.SummaryOperators.PROPORTION_OF_SUM)
    result_table = result['table']
    result_table = aspects.crop_other_columns(
        result_table, ['race/ethnicity', 'reading score'])
    result_suggestions = result['suggestions']

    # Sum of proportion column should be(close to) 1.0
    # Rounded to 5 decimal places before comparing, to absorb float error.
    assert (float(format(result_table['reading score'].sum(), '.5f')) == 1)

    print(result_table)

    # Expected output is compared as the exact string produced by
    # pandas.DataFrame.to_string().
    expected_result_table = """   race/ethnicity  reading score
0         group A       0.083216
1         group B       0.185011
2         group C       0.318698
3         group D       0.265263
4         group E       0.147812"""

    expected_suggestions = "[]"

    assert (expected_result_table == result_table.to_string())
    assert (str(result_suggestions) == expected_suggestions)
def _convert_to_set(table, columns):
    """ Converts the given columns of the table into a set of tuples.

    The columns not listed in `columns` are dropped first; each remaining
    row is then turned into a tuple (values ordered as in `columns`) and
    collected into a set.

    Args:
        table - Type-pandas.dataframe
            The table that is to be converted into a python set
        columns - Type-list of str
            Names of the columns to keep

    Returns:
        A set containing the required columns of the table as tuples.
    """
    cropped = aspects.crop_other_columns(table, columns)
    # One tuple per row, built by reading each requested column by label.
    return {
        tuple(cropped.loc[row_index, column] for column in columns)
        for row_index in range(cropped.shape[0])
    }
def topk_results(table, metric, dimensions, is_asc, k, **kwargs):
    """This function will implement the top-k intent

    Sorts tuples in the order of metric, after applying slice and
    group-by operations, and after removing tuples outside the given
    date range. The arguments 'table, metric, dimensions, is_asc, k'
    are mandatory; the optional ones ('date_range', 'slices', ...) come
    through kwargs and are skipped when absent. When summary_operator
    is not None, rows are grouped by dimensions first.

    Args:
        table: Type-pandas.dataframe
            It has the contents of the csv file
        metric: Type-string
            Column used for sorting; when grouping is done the summary
            operator is applied on it. May contain strings if the
            operator is a count.
        dimensions: Type-list of str
            Columns wanted in the output; grouping keys when
            summary_operator is not None. In 'top 5 batsman according
            to runs', the dimension is 'batsman'.
        is_asc: Type-Bool
            Sort order, True for ascending, False for descending
        k: Type-int
            Number of entries to take; k = -1 means take all entries
        date_range: Type-tuple
            Tuple of start_date and end_date
        date_column_name: Type-str
            Name of the column which contains the date
        day_first: Type-str
            Whether the day occurs before the month in the date column,
            e.g. '29-02-19' has day_first true
        slices: Type-List of tuples
            Each tuple (column_name, filter, value) is a condition a row
            must satisfy; filter is a Filters enum member, ex. Filters.IN
        summary_operator: Type-summary_operators enum members
            Summary operator applied on metric after grouping by
            dimensions, ex. SummaryOperators.MAX, SummaryOperators.SUM

    Returns:
        The function will return both suggestions and the results in a
        tuple - (results, suggestions).
        results: Type - pandas dataframe, the results of the intended top-k
        suggestions: Type - List of strings, list of suggestions.
    """
    date_column_name = kwargs.get('date_column_name', 'date')
    date_range = kwargs.get('date_range', None)
    day_first = kwargs.get('day_first', False)
    slices = kwargs.get('slices', None)
    summary_operator = kwargs.get('summary_operator', None)

    # Row filters first: date window, then slice conditions.
    filtered = aspects.apply_date_range(table, date_range,
                                        date_column_name, day_first)
    filtered = aspects.slice_table(filtered, slices)

    # Keep only the grouping columns plus the metric itself.
    columns_to_keep = list(dimensions) if dimensions is not None else []
    columns_to_keep.append(metric)
    filtered = aspects.crop_other_columns(filtered, columns_to_keep)

    grouped = aspects.group_by(filtered, dimensions, summary_operator)
    result = grouped['table']
    suggestions = grouped['suggestions']

    # 'mergesort' is stable, so rows with equal metric values keep
    # their relative order.
    result = result.sort_values(by=[metric], ascending=is_asc,
                                kind='mergesort')
    # drop=True discards the old index instead of adding it as a column.
    result = result.reset_index(drop=True)

    # k == -1 is the "return everything" convention.
    if k != -1:
        result = result.head(k)

    return (result, suggestions)
def _time_compare_results(table, metric, time_compare_column, date_range1,
                          date_range2, day_first, summary_operator, **kwargs):
    """ This function returns the results for the time-compare intent.

    The table is sliced, cropped to the needed columns, and then split
    into two copies - one per date range. In each copy the date column
    is overwritten with a "start - end" label so that grouping by that
    column compares the two windows side by side.

    Args:
        table: Type-pandas.dataframe
            It has the contents of the csv file
        metric: Type-string
            Column on which the summary operator is applied after
            grouping. May contain strings if the operator is a count.
        dimensions: Type-list of str
            Extra grouping columns, passed via kwargs.
        slices: Type-List of tuples
            Each tuple (column_name, filter, value) is a condition a row
            must satisfy; filter is a Filters enum member, ex. Filters.IN
        time_compare_column: Type-string
            The date column by which the comparison is done.
        date_range1: Type-tuple of start_date and end_date
            First date range of the comparison
        date_range2: Type-tuple of start_date and end_date
            Second date range of the comparison
        day_first: Type-str
            Whether the day occurs before the month in the date column,
            e.g. '29-02-19' has day_first true
        summary_operator: Type-summary_operators enum members
            Summary operator applied on metric after grouping, ex.
            SummaryOperators.MAX, SummaryOperators.SUM

    Returns:
        The function will return both suggestions and the results in a
        tuple - (results, suggestions).
        results: Type - pandas dataframe, results of the intended compare
        suggestions: Type - List of strings, list of suggestions.
    """
    slices = kwargs.get('slices', None)
    dimensions = kwargs.get('dimensions', None)

    table = aspects.slice_table(table, slices)

    columns_to_keep = list(dimensions) if dimensions is not None else []
    columns_to_keep.append(time_compare_column)
    columns_to_keep.append(metric)
    required_table = aspects.crop_other_columns(table, columns_to_keep)

    def _window(date_range):
        # Rows inside date_range, with the date column replaced by a
        # "start - end" label so both windows can be grouped together.
        window = aspects.apply_date_range(required_table, date_range,
                                          time_compare_column, day_first)
        window[time_compare_column] = date_range[0] + " - " + date_range[1]
        return window

    # Second window placed first, matching the original row ordering.
    combined = pandas.concat([_window(date_range2), _window(date_range1)])
    combined = combined.reset_index(drop=True)

    grouping_columns = list(dimensions) if dimensions is not None else []
    grouping_columns.append(time_compare_column)

    grouped = aspects.group_by(combined, grouping_columns, summary_operator)
    return (grouped['table'], grouped['suggestions'])
def benchmark_set_too_different(table, metric, all_metric,
                                slice_compare_column, slice1,
                                summary_operator, **kwargs):
    """This function can give the benchmark set too different oversight.

    Every row of slice1 is compared against the matching '*' row (the
    whole-table benchmark) produced by the same grouping. For each
    metric that was NOT the one the user asked about, rows where both
    that metric AND the user's metric look too dissimilar are collected
    into a row-level suggestion.

    Args:
        table: Type-pandas.dataframe
            It has the contents of the csv file
        metric: Type-string
            Column the user asked about; the summary operator is applied
            on it. May contain strings if the operator is a count.
        dimensions: Type-list of str
            Grouping columns, passed via kwargs.
        all_metric: Type-list of str
            It contains list of all metrics
        slice_compare_column: Type-list of string
            first element denotes the column name by which we will do
            comparision. rest elements will the value belongs to that
            column by which we will compare the slices.
        summary_operator: Type-summary_operators enum members
            Summary operator applied on metric after grouping, ex.
            SummaryOperators.MAX, SummaryOperators.SUM

    Returns:
        A list of dictionaries, each representing an oversight for some
        column, or None when no oversight is detected.
    """
    date_column_name = kwargs.get('date_column_name', 'date')
    date_range = kwargs.get('date_range', None)
    # NOTE(review): elsewhere day_first is documented as a bool; here the
    # default is a strptime pattern - confirm what apply_date_range expects.
    day_first = kwargs.get('day_first', '%Y-%m-%d')
    slices = kwargs.get('slices', None)
    dimensions = kwargs.get('dimensions', None)

    table = aspects.apply_date_range(table, date_range,
                                     date_column_name, day_first)
    table = aspects.slice_table(table, slices)

    # Keep dimensions, the comparison column, and every metric.
    required_columns = []
    if dimensions is not None:
        required_columns = dimensions.copy()
    required_columns.append(slice_compare_column)
    required_columns = required_columns + all_metric
    table = aspects.crop_other_columns(table, required_columns)

    """ required_table_for_one is a table which has data for the single
    variable given by the user by which we shall do comparision"""
    required_table_for_one = aspects.slice_table(
        table, [(slice_compare_column, Filters.EQUAL_TO, slice1)])

    """ required_table_for_all is a table which has all the data which
    was in the initial table but with the comparision column as '*'
    because we have to compare single value with the whole table"""
    required_table_for_all = table.copy()
    required_table_for_all[slice_compare_column] = '*'

    updated_table = pandas.concat(
        [required_table_for_one, required_table_for_all])
    # NOTE(review): reset_index() without drop=True inserts an 'index'
    # column; it is excluded later by the columns_order selection, but the
    # group_by still sees it - confirm this is intended.
    updated_table = updated_table.reset_index()

    grouping_columns = []
    if dimensions is not None:
        grouping_columns = dimensions.copy()
    grouping_columns.append(slice_compare_column)

    result_table = aspects.group_by(updated_table, grouping_columns,
                                    summary_operator)['table']

    other_metrics = all_metric.copy()
    other_metrics.remove(metric)
    columns_order = grouping_columns.copy()
    columns_order.append(metric)
    columns_order = columns_order + other_metrics
    """ We have done the reordering of columns so that all the dimension
    should appear first followed by the metric given by the user and
    and then rest of the columns."""
    # Layout after this: [dims..., slice_compare_column, metric, others...]
    result_table = result_table[columns_order]

    num_rows = result_table.shape[0]
    num_columns = result_table.shape[1]
    dimensions_len = 0
    if dimensions is not None:
        dimensions_len = len(dimensions)

    result_table_matrix = result_table.values.tolist()
    suggestion = []

    # We have to iterate through all metric which was not involved
    # in the computation initially. Column dimensions_len is the
    # comparison column, dimensions_len + 1 is the user's metric, so the
    # "other" metrics start at dimensions_len + 2.
    for column_i in range(dimensions_len + 2, num_columns):
        # it can store the index of row on whcih the oversight might appear.
        column_i_suggestion_list = []
        row_i = 0
        while row_i < num_rows:
            # Rows are expected in (slice1 row, '*' row) pairs sharing the
            # same dimension prefix. An unpaired row (last row, or the next
            # row belongs to another group) is flagged directly.
            # NOTE(review): flagging lone rows with confidence 100 looks
            # intentional (missing benchmark partner) - confirm.
            if row_i == num_rows - 1 or result_table_matrix[
                    row_i][:dimensions_len] != result_table_matrix[
                        row_i + 1][:dimensions_len]:
                column_i_suggestion_list.append({
                    'row': row_i + 1,
                    'confidence_score': 100
                })
            else:
                # _calculate_relation is presumably a dissimilarity
                # measure between the paired values - TODO confirm.
                if _calculate_relation(
                        result_table_matrix[row_i][column_i],
                        result_table_matrix[row_i + 1]
                        [column_i]) < constants.BSTD_DISIMILARITY_THRESHOLD:
                    # Similar on this other metric: skip the pair.
                    row_i = row_i + 1
                elif _calculate_relation(
                        result_table_matrix[row_i][dimensions_len + 1],
                        result_table_matrix[row_i + 1][dimensions_len + 1]
                ) < constants.BSTD_DISIMILARITY_THRESHOLD:
                    # Similar on the user's metric: skip the pair.
                    row_i = row_i + 1
                else:
                    # Dissimilar on both metrics: flag both rows of the pair.
                    column_i_suggestion_list.append({
                        'row': row_i + 1,
                        'confidence_score': 100
                    })
                    column_i_suggestion_list.append({
                        'row': row_i + 2,
                        'confidence_score': 100
                    })
                    row_i = row_i + 1
            row_i = row_i + 1
        if len(column_i_suggestion_list) > 0:
            """ for every suggestion we form a dictionary and append
            it to the list of dictionary."""
            suggestion_i = {}
            suggestion_i[
                'suggestion'] = slice1 + ' looks different from others on ' + metric + '. You might also want to look at ' + columns_order[
                    column_i] + ' since ' + slice1 + ' also looks different on this.'
            suggestion_i['oversight'] = Oversights.BENCHMARK_SET_TOO_DIFFERENT
            suggestion_i['is_row_level_suggestion'] = True
            suggestion_i['row_list'] = column_i_suggestion_list
            suggestion.append(suggestion_i)

    if len(suggestion) == 0:
        return None
    else:
        return suggestion
def list_index_in_topk(table, metric, dimensions, is_asc, k, **kwargs):
    """ Returns, for each row of the original table, whether it appears
    in the top-k after all the operations. 0 indexing is followed.

    Args: (same as top-k; summary_operator is not accepted because when
    grouping is done the insert-as-column option would not be given)
        table: Type-pandas.dataframe
            It has the contents of the csv file
        metric: Type-string
            Column used for sorting. May contain strings if the operator
            is a count.
        dimensions: Type-list of str
            Columns wanted in the output.
        is_asc: Type-Bool
            Sort order, True for ascending, False for descending
        k: Type-int
            Number of entries to take; k = -1 means take all entries
        date_range: Type-tuple
            Tuple of start_date and end_date
        date_column_name: Type-str
            Name of the column which contains the date
        date_format: Type-str
            Format codes used by datetime.strptime, see
            https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior
        slices: Type-List of tuples
            Each tuple (column_name, filter, value) is a condition a row
            must satisfy; filter is a Filters enum member, ex. Filters.IN

    Returns :
        List of True/False, list[i] denotes whether the ith row is in
        the top-k or not
    """
    date_column_name = kwargs.get('date_column_name', 'date')
    date_range = kwargs.get('date_range', None)
    date_format = kwargs.get('date_format', '%Y-%m-%d')
    slices = kwargs.get('slices', None)

    # One flag per row of the ORIGINAL table; flipped to True below for
    # every row that survives into the top-k.
    in_topk = [False] * table.shape[0]

    # reset_index=False keeps the original row labels, so the surviving
    # index values still point into in_topk.
    table = aspects.apply_date_range(table, date_range, date_column_name,
                                     date_format, reset_index=False)
    table = aspects.slice_table(table, slices, reset_index=False)

    columns_to_keep = [] if dimensions is None else dimensions.copy()
    columns_to_keep.append(metric)
    table = aspects.crop_other_columns(table, columns_to_keep)

    # Stable sort keeps the original order among equal metric values.
    table = table.sort_values(by=[metric], ascending=is_asc,
                              kind='mergesort')

    # k == -1 is the "take everything" convention.
    if k != -1:
        table = table.head(k)

    for original_row in table.index:
        in_topk[original_row] = True

    return in_topk
def _slice_compare_results_for_all(table, metric, slice_compare_column,
                                   slice1, slice2, summary_operator,
                                   **kwargs):
    """This function will implement the slice-compare intent against ALL.

    Removes tuples outside the given date range, applies the slices, and
    compares the rows matching slice1 against a copy of the whole table
    whose comparison column is overwritten with the label 'ALL'.
    Note: the slice2 argument is accepted for interface compatibility
    but is not used by this "compare with all" variant.

    Args:
        table: Type-pandas.dataframe
            It has the contents of the csv file
        metric: Type-string
            Column on which the summary operator is applied after
            grouping. May contain strings if the operator is a count.
        dimensions: Type-list of str
            Extra grouping columns, passed via kwargs.
        date_range: Type-tuple
            Tuple of start_date and end_date
        date_column_name: Type-str
            Name of the column which contains the date
        day_first: Type-str
            Format codes used by datetime.strptime, see
            https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior
        slices: Type-List of tuples
            Each tuple (column_name, filter, value) is a condition a row
            must satisfy; filter is a Filters enum member, ex. Filters.IN
        slice_compare_column: Type-list of string
            first element denotes the column name by which we will do
            comparision. rest elements will the value belongs to that
            column by which we will compare the slices.
        summary_operator: Type-summary_operators enum members
            Summary operator applied on metric after grouping, ex.
            SummaryOperators.MAX, SummaryOperators.SUM

    Returns:
        The table (a pandas dataframe object) obtained after applying
        the intent on the given table.
    """
    date_column_name = kwargs.get('date_column_name', 'date')
    date_range = kwargs.get('date_range', None)
    day_first = kwargs.get('day_first', '%Y-%m-%d')
    slices = kwargs.get('slices', None)
    dimensions = kwargs.get('dimensions', None)

    table = aspects.apply_date_range(table, date_range,
                                     date_column_name, day_first)
    table = aspects.slice_table(table, slices)

    # Keep dimensions, the comparison column and the metric.
    columns_to_keep = list(dimensions) if dimensions is not None else []
    columns_to_keep.append(slice_compare_column)
    columns_to_keep.append(metric)
    table = aspects.crop_other_columns(table, columns_to_keep)

    # Rows belonging to the user's slice.
    slice1_rows = aspects.slice_table(
        table, [(slice_compare_column, Filters.EQUAL_TO, slice1)])

    # Whole-table benchmark, relabelled 'ALL' in the comparison column.
    benchmark_rows = table.copy()
    benchmark_rows[slice_compare_column] = 'ALL'

    combined = pandas.concat([benchmark_rows, slice1_rows])
    combined = combined.reset_index(drop=True)

    # collecting the colums on which we shall do grouping
    grouping_columns = list(dimensions) if dimensions is not None else []
    grouping_columns.append(slice_compare_column)

    return aspects.group_by(combined, grouping_columns, summary_operator)
def trend(table, metric, granularity, summary_operator, **kwargs):
    """This function will implement the trend intent

    Groups by time: every date is snapped down to the requested
    granularity so that rows in the same time bucket become identical,
    then the summary operator is applied on the column metric to
    compress them - after applying slicing and the date range. Slices
    and the date-range parameters (range, column name, format) are
    optional and passed through kwargs; when absent they are skipped.

    Args:
        table: Type-pandas.dataframe
            It has the contents of the csv file
        metric: Type-string
            Column on which the summary operator is applied. May contain
            strings if the operator is a count.
        date_range: Type-tuple
            Tuple of start_date and end_date; None means no cropping
        date_column_name: Type-str
            It is the name of column which contains date
        date_format: Type-str
            Format codes used by datetime.strptime, see
            https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior
        slices: Type-dictionary (will be changed)
            contains the key as column name and value as
            instance we want to slice
        summary_operator: Type-summary_operators enum member
            It denotes the summary operator
        granularity: Type-granularities enum member
            It denotes the granularity we need to apply to the dates.

    Returns:
        The table (a pandas dataframe object) obtained after applying
        the intent on the given table.
    """
    date_column_name = kwargs.get('date_column_name', 'date')
    date_range = kwargs.get('date_range', None)
    # BUG FIX: the default used to be 'yyyy-mm-dd', which is not a valid
    # strptime pattern and made datetime.strptime below raise ValueError
    # for every row. '%Y-%m-%d' matches the default used by the other
    # intents in this module.
    date_format = kwargs.get('date_format', '%Y-%m-%d')
    slices = kwargs.get('slices', None)

    table = aspects.apply_date_range(table, date_range,
                                     date_column_name, date_format)
    table = aspects.slice_table(table, slices)

    # collecting the colums not to be removed
    required_columns = [date_column_name, metric]
    table = aspects.crop_other_columns(table, required_columns)

    # Snap each date down to the requested granularity so that rows in
    # the same bucket share an identical date string and can be grouped.
    num_rows = table.shape[0]
    for row in range(num_rows):
        row_date = datetime.datetime.strptime(
            table.loc[row, date_column_name], date_format)
        row_date = aspects.granular_time(row_date, granularity)
        table.loc[row, date_column_name] = row_date.strftime(date_format)

    # NOTE(review): other functions in this repo treat group_by's return
    # value as a dict with 'table'/'suggestions'; here it is sorted
    # directly - confirm which aspects version this module pairs with.
    table = aspects.group_by(table, [date_column_name], summary_operator)

    table = table.sort_values(by=[date_column_name])

    return table
def show(table, **kwargs):
    """This function will implement the show intent

    Applies, in order: the date-range filter, row slicing, column
    cropping and finally a group-by (the group-by only compresses rows
    when summary_operator is not None). Only 'table' is mandatory;
    everything else arrives through kwargs and is skipped when absent.

    Args:
        table: Type-pandas.dataframe
            It has the contents of the csv file
        metric: Type-string
            Column on which the summary operator is applied when
            grouping is done. May contain strings if the operator is a
            count.
        dimensions: Type-list of str
            Columns wanted in the output; grouping keys when
            summary_operator is not None. In 'show all batsman', the
            dimension is 'batsman'.
        date_range: Type-tuple
            Tuple of start_date and end_date
        date_column_name: Type-str
            Name of the column which contains the date
        date_format: Type-str
            Format codes used by datetime.strptime, see
            https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior
        slices: Type-dictionary (will be changed)
            contains the key as column name and value as
            instance we want to slice
        summary_operator: Type-summary_operators enum members
            Summary operator applied on metric after grouping by
            dimensions, ex. SummaryOperators.MAX, SummaryOperators.SUM

    Returns:
        The table (a pandas dataframe object) obtained after applying
        the intent on the given table.
    """
    date_column_name = kwargs.get('date_column_name', 'date')
    date_range = kwargs.get('date_range', None)
    date_format = kwargs.get('date_format', 'yyyy-mm-dd')
    slices = kwargs.get('slices', None)
    summary_operator = kwargs.get('summary_operator', None)
    metric = kwargs.get('metric', None)
    dimensions = kwargs.get('dimensions', None)

    table = aspects.apply_date_range(table, date_range,
                                     date_column_name, date_format)
    table = aspects.slice_table(table, slices)

    # Columns that survive the crop: the requested dimensions plus the
    # metric, both of which are optional for 'show'.
    columns_to_keep = [] if dimensions is None else dimensions.copy()
    if metric is not None:
        columns_to_keep.append(metric)
    table = aspects.crop_other_columns(table, columns_to_keep)

    return aspects.group_by(table, dimensions, summary_operator)
def topk_results(table, metric, dimensions, is_asc, k, **kwargs):
    """This function will implement the top-k intent

    Sorts tuples in the order of metric, after applying slice and
    group-by operations, and after removing tuples outside the given
    date range. 'table, metric, dimensions, is_asc, k' are mandatory;
    the optional arguments ('date_range', 'slices', ...) come through
    kwargs and are skipped when absent. When summary_operator is not
    None, rows are grouped by dimensions first.

    Args:
        table: Type-pandas.dataframe
            It has the contents of the csv file
        metric: Type-string
            Column used for sorting; when grouping is done the summary
            operator is applied on it. May contain strings if the
            operator is a count.
        dimensions: Type-list of str
            Columns wanted in the output; grouping keys when
            summary_operator is not None. In 'top 5 batsman according
            to runs', the dimension is 'batsman'.
        is_asc: Type-Bool
            Sort order, True for ascending, False for descending
        k: Type-int
            Number of entries to take; k = -1 means take all entries
        date_range: Type-tuple
            Tuple of start_date and end_date
        date_column_name: Type-str
            Name of the column which contains the date
        date_format: Type-str
            Format codes used by datetime.strptime, see
            https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior
        slices: Type-List of tuples
            Each tuple (column_name, filter, value) is a condition a row
            must satisfy; filter is a Filters enum member, ex. Filters.IN
        summary_operator: Type-summary_operators enum members
            Summary operator applied on metric after grouping by
            dimensions, ex. SummaryOperators.MAX, SummaryOperators.SUM

    Returns:
        The table (a pandas dataframe object) obtained after applying
        the intent on the given table.
    """
    date_column_name = kwargs.get('date_column_name', 'date')
    date_range = kwargs.get('date_range', None)
    date_format = kwargs.get('date_format', '%Y-%m-%d')
    slices = kwargs.get('slices', None)
    summary_operator = kwargs.get('summary_operator', None)

    table = aspects.apply_date_range(table, date_range,
                                     date_column_name, date_format)
    table = aspects.slice_table(table, slices)

    # Keep only the requested dimensions plus the metric.
    columns_to_keep = [] if dimensions is None else dimensions.copy()
    columns_to_keep.append(metric)
    table = aspects.crop_other_columns(table, columns_to_keep)

    table = aspects.group_by(table, dimensions, summary_operator)

    table = table.sort_values(by=[metric], ascending=is_asc)
    # drop=True discards the old index instead of adding it as a column.
    table = table.reset_index(drop=True)

    # k == -1 is the "return everything" convention.
    return table if k == -1 else table.head(k)
def _weighted_mean_results(table, metric, weight_col, **kwargs):
    """ This functions implements the weighted mean intent

    Formula used for calculating weighted mean -
        metric column = {a1, a2, ... ai}
        weight column = {w1, w2, ... wi}
        weighted mean = sum(ai * wi) / sum(wi)
    If sum(wi) is 0, NaN is returned. When a list of dimensions is
    passed, the weighted mean is calculated for each grouping.

    Args :
        table: Type-pandas.dataframe
            It has the contents of the table in sheets
        metric : Type-string
            The column whose mean is to be found
        weight_col : Type-strings
            The weight column which is multiplied as weights to the
            metric column
        dimensions: Type-list of str
            Columns according to which groups are formed; when passed,
            the weighted mean is calculated per group
        date_range: Type-tuple
            Tuple of start_date and end_date
        date_column_name: Type-str
            Name of the column which contains the date
        date_format: Type-str
            Format codes used by datetime.strptime, see
            https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior
        slices: Type-List of tuples
            Each tuple (column_name, filter, value) is a condition a row
            must satisfy; filter is a Filters enum member, ex. Filters.IN

    Returns :
        Type-pandas.dataframe, the result table of the intent with a
        column containing the weighted mean
    """
    date_column_name = kwargs.get('date_column_name', 'date')
    date_range = kwargs.get('date_range', None)
    date_format = kwargs.get('date_format', 'yyyy-mm-dd')
    slices = kwargs.get('slices', None)
    dimensions = kwargs.get('dimensions', None)

    table = aspects.apply_date_range(table, date_range,
                                     date_column_name, date_format)
    table = aspects.slice_table(table, slices)

    weighted_mean_col = 'mean of ' + metric + ' weighted by ' + weight_col

    if dimensions is None:
        # Ungrouped case: one overall weighted mean, returned as a
        # single-cell result table.
        table[weighted_mean_col] = table[metric] * table[weight_col]
        overall_mean = (table[weighted_mean_col].sum()
                        / table[weight_col].sum())
        return pandas.DataFrame([overall_mean], columns=[weighted_mean_col])

    # Grouped case: sum metric*weight and weight per group, then divide.
    table = aspects.crop_other_columns(table,
                                       dimensions + [metric, weight_col])
    table[weighted_mean_col] = table[metric] * table[weight_col]
    table = table.groupby(dimensions).sum()
    table[weighted_mean_col] /= table[weight_col]
    table = table.reset_index()
    # Keep only the grouping columns and the weighted-mean column.
    table = aspects.crop_other_columns(table,
                                       dimensions + [weighted_mean_col])
    return table
def _correlation_results(table, metric1, metric2, **kwargs):
    """Implements the correlation intent.

    Relies on the pandas corr() function, i.e. the standard
    `Pearson correlation coefficient` (pandas also offers Kendall Tau
    and Spearman rank, which are not used here).
    Without grouping the result is a single number - the correlation
    coefficient. With grouping the result is a table holding the
    correlation coefficient of each group.

    Args:
        table: Type-pandas.dataframe
            It has the contents of the table in sheets
        metric1, metric2: Type-strings
            The columns between which the correlation is to be found
        dimensions: Type-list of str
            The list of columns according to which groups are formed.
            If passed, the correlation is calculated for each group.
        date_range: Type-tuple
            Tuple of start_date and end_date
        date_column_name: Type-str
            Name of the column which contains the date
        date_format: Type-str
            Format used to parse the dates in the date column
        slices: Type-list of tuples
            Each tuple is a condition to keep a row -
            (column_name, Filters enum member, value)

    Returns:
        Type-pandas.dataframe holding the correlation column.
    """
    date_range = kwargs.get('date_range', None)
    date_column_name = kwargs.get('date_column_name', 'date')
    date_format = kwargs.get('date_format', 'yyyy-mm-dd')
    dimensions = kwargs.get('dimensions', None)
    slices = kwargs.get('slices', None)

    table = aspects.apply_date_range(table, date_range,
                                     date_column_name, date_format)
    table = aspects.slice_table(table, slices)

    correlation_col = f'correlation between "{metric1}" , "{metric2}"'

    if dimensions is None:
        # Single coefficient over the whole table.
        coefficient = table[metric1].corr(table[metric2])
        return pandas.DataFrame([(coefficient)], columns=[correlation_col])

    # Per-group correlation: corr() on a groupby yields a correlation
    # matrix per group; keep the metric1 column and collapse each
    # group's matrix rows with min() to extract the off-diagonal value.
    cropped = aspects.crop_other_columns(table,
                                         dimensions + [metric1, metric2])
    grouped = cropped.groupby(dimensions).corr().reset_index()
    grouped = aspects.crop_other_columns(grouped, dimensions + [metric1])
    grouped = grouped.rename(columns={metric1: correlation_col})
    grouped = grouped.groupby(dimensions).min()
    return grouped.reset_index()
def show(table, **kwargs):
    """Implements the show intent.

    Firstly removes the tuples that do not lie in the given date range,
    then applies slicing and groupby operations. If optional args are
    None (not passed), they are not applied. When a summary operator is
    applied, the metric column is renamed to
    "<summary operator> of metric".

    Args:
        table: Type-pandas.dataframe
            It has the contents of the csv file
        metric: Type-string
            Name of the column on which the summary operator is applied
            in case of grouping. Metric could be a column containing
            strings if the count operator is applied on it.
        dimensions: Type-list of str
            Name of the columns we want. In query 'show all batsman',
            dimension is 'batsman'. When summary_operator is not None,
            we group by dimensions.
        date_range: Type-tuple
            Tuple of start_date and end_date
        date_column_name: Type-str
            Name of the column which contains the date
        day_first: Type-bool
            Whether the day occurs before the month in the dates of the
            date column, e.g. '29-02-19' has day_first True
        slices: Type-list of tuples
            Each tuple is a condition to keep a row -
            (column_name, Filters enum member, value)
        summary_operator: Type-summary_operators enum member
            The summary operator applied after grouping by dimensions,
            e.g. SummaryOperators.MAX, SummaryOperators.SUM. It is
            always applied on the metric column, only when grouping.

    Returns:
        (results, suggestions) tuple
        results: Type-pandas.dataframe, results of the intended show
        suggestions: Type-list of dicts (suggestion structure)
    """
    date_column_name = kwargs.get('date_column_name', 'date')
    date_range = kwargs.get('date_range', None)
    day_first = kwargs.get('day_first', False)
    slices = kwargs.get('slices', None)
    summary_operator = kwargs.get('summary_operator', None)
    metric = kwargs.get('metric', None)
    dimensions = kwargs.get('dimensions', None)

    table = aspects.apply_date_range(table, date_range,
                                     date_column_name, day_first)
    table = aspects.slice_table(table, slices)

    # collecting the columns not to be removed
    required_columns = []
    if dimensions is not None:
        required_columns = dimensions.copy()
    # metric is optional in show
    if metric is not None:
        required_columns.append(metric)
    table = aspects.crop_other_columns(table, required_columns)

    # When there is no dimension to group by but a summary_operator
    # exists, we assume the user wants to apply the summary operator on
    # the whole data. A temporary 'Summary Operator' column gives us a
    # single group to aggregate over.
    if (dimensions is None) and (summary_operator is not None):
        _add_temporary_column_of_summary_operator(table, summary_operator)
        dimensions = ['Summary Operator']

        after_group_by = aspects.group_by(table, dimensions,
                                          summary_operator)
        table = after_group_by['table']

        suggestions = []
        if len(after_group_by['suggestions']) > 0:
            suggestions.extend(after_group_by['suggestions'])

        # Dropping the 'Summary Operator' column inserted above
        table = table.drop(columns=['Summary Operator'])
        table = aspects.update_metric_column_name(table, summary_operator,
                                                  metric)
        return (table, suggestions)

    after_group_by = aspects.group_by(table, dimensions, summary_operator)
    table = after_group_by['table']

    suggestions = []
    different_weight_suggestion = weighted_mean_with_different_weights.\
        weighted_mean_with_different_weights(table, metric)
    if different_weight_suggestion is not None:
        suggestions.append(different_weight_suggestion)
    if len(after_group_by['suggestions']) > 0:
        suggestions.extend(after_group_by['suggestions'])

    # Rank the collected oversights exactly once (the original ranked
    # them a second, redundant time on the summary_operator-is-None
    # path).
    order = oversights_order.ORDER_IN_SHOW
    suggestions = rank_oversights.rank_oversights(suggestions, order)

    if summary_operator is not None:
        table = aspects.update_metric_column_name(table, summary_operator,
                                                  metric)
    return (table, suggestions)
def top_down_error(table, metric, all_dimensions, slice_compare_column,
                   slice1, slice2, summary_operator, **kwargs):
    """Implements the top-down-error debiasing.

    For every dimension not already in the grouping list, regroups with
    that dimension added and compares the new result against the user's
    result via _check_top_down_error, collecting any suggestions.

    Args:
        table: Type-pandas.dataframe
            It has the contents of the csv file
        metric: Type-string
            Name of the column the summary operator is applied on.
            Metric could be a column containing strings if the count
            operator is applied on it.
        all_dimensions: Type-list of str
            List of all dimensions of the table
        slice_compare_column: Type-string
            Name of the slice-compare column
        slice1: Type-string
            The first value of comparison
        slice2: Type-string
            The second value of comparison
        summary_operator: Type-summary_operators enum member
            The summary operator applied after grouping by dimensions,
            e.g. SummaryOperators.MAX, SummaryOperators.SUM
        dimensions: Type-list of str or None
            Name of the columns we want; we group by dimensions
        date_range: Type-tuple
            Tuple of start_date and end_date
        date_column_name: Type-str
            Name of the column which contains the date
        day_first: Type-str
            Date format passed to the date-range filter
        slices: Type-list of tuples
            Each tuple is a condition to keep a row -
            (column_name, Filters enum member, value)

    Returns:
        A list of dictionaries, each representing a debiasing
        suggestion according to the newly added column.
    """
    date_column_name = kwargs.get('date_column_name', 'date')
    date_range = kwargs.get('date_range', None)
    day_first = kwargs.get('day_first', '%Y-%m-%d')
    slices = kwargs.get('slices', None)
    dimensions = kwargs.get('dimensions', None)

    table = aspects.apply_date_range(table, date_range,
                                     date_column_name, day_first)

    # Restrict the table to the two slices being compared, on top of
    # any user-supplied slicing conditions.
    slice_list = []
    if slices is not None:
        slice_list = slices.copy()
    slice_list.append((slice_compare_column, Filters.IN, [slice1, slice2]))
    table = aspects.slice_table(table, slice_list)

    # removing all metric columns except the one by which we do the
    # group_by operation
    required_columns = all_dimensions.copy()
    required_columns.append(metric)
    table = aspects.crop_other_columns(table, required_columns)

    # operational_dimensions contains every dimension except
    # slice_compare_column
    operational_dimensions = all_dimensions.copy()
    operational_dimensions.remove(slice_compare_column)

    required_columns = []
    if dimensions is not None:
        required_columns = dimensions.copy()
    required_columns.append(slice_compare_column)
    required_columns.append(metric)
    query_table = aspects.crop_other_columns(table, required_columns)

    grouping_columns = []
    if dimensions is not None:
        grouping_columns = dimensions.copy()
    grouping_columns.append(slice_compare_column)

    # result_table is the result table requested by the user.
    result_table = aspects.group_by(query_table, grouping_columns,
                                    summary_operator)

    # suggestions stores the list of debiasings for this oversight.
    suggestions = []

    dimension_list = []
    if dimensions is not None:
        dimension_list = dimensions.copy()
    dimensions_len = len(dimension_list)

    for column in operational_dimensions:
        # we try to find the debiasing for every column which is not in
        # the grouping list initially
        if column not in dimension_list:
            new_grouping_columns = dimension_list.copy()
            new_grouping_columns.append(column)
            new_grouping_columns.append(slice_compare_column)

            new_required_columns = new_grouping_columns.copy()
            new_required_columns.append(metric)
            new_cropped_table = aspects.crop_other_columns(
                table, new_required_columns)

            # result table after adding the new column to the grouping
            # list
            new_result_table = aspects.group_by(new_cropped_table,
                                                new_grouping_columns,
                                                summary_operator)

            # returns the debiasing suggestion after comparing the
            # initial result table and the new result table
            new_suggestion = _check_top_down_error(result_table,
                                                   new_result_table,
                                                   column, slice1,
                                                   dimensions_len)
            # identity comparison with None is the Python idiom
            # (original used `!= None`)
            if new_suggestion is not None:
                suggestions.append(new_suggestion)

    return suggestions
def slice_compare(table, metric, dimensions, slices, slice_compare_column,
                  summary_operator, **kwargs):
    """Implements the slice-compare intent.

    Also removes the tuples that do not lie in the given date range.
    The arguments 'table, metric, dimensions, slices,
    slice_compare_column, summary_operator' are not optional; the rest
    are passed in kwargs and skipped when None.

    Args:
        table: Type-pandas.dataframe
            It has the contents of the csv file
        metric: Type-string
            Name of the column the summary operator is applied on.
            Metric could be a column containing strings if the count
            operator is applied on it.
        dimensions: Type-list of str
            Name of the columns we want; when summary_operator is not
            None, we group by dimensions. NOT mutated by this function.
        slices: Type-dictionary (will be changed)
            Contains the key as column name and value as the instance
            we want to slice
        slice_compare_column: Type-list of string
            First element is the column name by which we compare; the
            rest are the values of that column being compared.
        summary_operator: Type-summary_operators enum member
            The summary operator applied after grouping by dimensions,
            e.g. SummaryOperators.MAX, SummaryOperators.SUM
        date_range: Type-tuple
            Tuple of start_date and end_date
        date_column_name: Type-str
            Name of the column which contains the date
        date_format: Type-str
            Format used to parse the dates in the date column

    Returns:
        The table (pandas dataframe object) after applying the intent
        on the given table.
    """
    date_column_name = kwargs.get('date_column_name', 'date')
    date_range = kwargs.get('date_range', None)
    date_format = kwargs.get('date_format', 'yyyy-mm-dd')

    table = aspects.apply_date_range(table, date_range,
                                     date_column_name, date_format)
    table = aspects.slice_table(table, slices)

    # collecting the columns not to be removed
    required_columns = []
    if dimensions is not None:
        required_columns = dimensions.copy()
    required_columns.append(metric)
    table = aspects.crop_other_columns(table, required_columns)

    # slice_compare_column should be the last element of the grouping
    # so that groupby shows the compared slices together for every
    # grouping. Work on a local copy - the original mutated the
    # caller's `dimensions` list in place.
    grouping_columns = dimensions.copy()
    grouping_columns.remove(slice_compare_column[0])
    grouping_columns.append(slice_compare_column[0])

    table = aspects.group_by(table, grouping_columns, summary_operator)
    return table
def _slice_compare_results(table, metric, slice_compare_column, slice1,
                           slice2, summary_operator, **kwargs):
    """Implements the slice-compare intent.

    Also removes the tuples that do not lie in the given date range.
    The arguments 'table, metric, slice_compare_column,
    summary_operator' are not optional; 'date_range' and 'slices' are
    passed in kwargs and skipped when None.

    Args:
        table: Type-pandas.dataframe
            It has the contents of the csv file
        metric: Type-string
            Name of the column the summary operator is applied on.
            Metric could be a column containing strings if the count
            operator is applied on it.
        slice_compare_column: Type-string
            Name of the column by which the comparison is done
        slice1: Type-string
            The first value of comparison
        slice2: Type-string
            The second value of comparison
        summary_operator: Type-summary_operators enum member
            The summary operator applied after grouping by dimensions,
            e.g. SummaryOperators.MAX, SummaryOperators.SUM
        dimensions: Type-list of str
            Name of the columns we want; we group by dimensions
        date_range: Type-tuple
            Tuple of start_date and end_date
        date_column_name: Type-str
            Name of the column which contains the date
        day_first: Type-str
            Whether the day occurs before the month in the dates of the
            date column
        slices: Type-list of tuples
            Each tuple is a condition to keep a row -
            (column_name, Filters enum member, value).
            NOT mutated by this function.

    Returns:
        (results, suggestions) tuple
        results: Type-pandas.dataframe, results of the intended
            slice-compare
        suggestions: Type-list, list of suggestions
    """
    date_column_name = kwargs.get('date_column_name', 'date')
    date_range = kwargs.get('date_range', None)
    day_first = kwargs.get('day_first', False)
    slices = kwargs.get('slices', None)
    dimensions = kwargs.get('dimensions', None)

    table = aspects.apply_date_range(table, date_range,
                                     date_column_name, day_first)

    # Restrict to the two compared slices. Build a NEW list rather than
    # appending to the caller's `slices` (the original mutated it in
    # place and compared with `== None` instead of `is None`).
    comparison_slice = (slice_compare_column, Filters.IN, [slice1, slice2])
    if slices is None:
        slices = [comparison_slice]
    else:
        slices = slices + [comparison_slice]
    table = aspects.slice_table(table, slices)

    # collecting the columns not to be removed
    required_columns = []
    if dimensions is not None:
        required_columns = dimensions.copy()
    required_columns.append(slice_compare_column)
    required_columns.append(metric)
    table = aspects.crop_other_columns(table, required_columns)

    # slice_compare_column should be the last element of the grouping
    # so that groupby shows the compared slices together for every
    # grouping
    grouping_columns = []
    if dimensions is not None:
        grouping_columns = dimensions.copy()
    grouping_columns.append(slice_compare_column)

    after_group_by = aspects.group_by(table, grouping_columns,
                                      summary_operator)
    result_table = after_group_by['table']
    suggestions = after_group_by['suggestions']

    return (result_table, suggestions)