# Imports assumed by this file: wiggum (wg) is the library under test, models
# is the app's local helper module (its import path may differ), and the Flask
# route below additionally relies on flask's request helpers.
import os
import json

import numpy as np
import pandas as pd
from flask import jsonify, redirect, request, url_for

import wiggum as wg
import models


def test_annotation_reporting():
    # read some data into a labeled dataframe so we have something to work with
    labeled_df = wg.LabeledDataFrame('data/iristest')

    # get some trend objects so we can populate our results table
    rankobj = wg.Mean_Rank_Trend()
    linreg_obj = wg.All_Linear_Trend()
    labeled_df.get_subgroup_trends_1lev([rankobj, linreg_obj])

    # get a sample from the results
    labeled_df.result_df.sample(10)

    # try to annotate a column of the results dataframe
    labeled_df.annotate(13, 'Comment', 'Reverse')

    # first test to see if the new column was created
    assert 'Comment' in labeled_df.result_df

    # now test that our comment was added at the correct spot and matches what we expect
    assert labeled_df.result_df.iloc[13]['Comment'] == 'Reverse'

    # try to add a comment to a different row
    labeled_df.annotate(9, 'testCol', 'Test')

    # check to make sure the new column exists
    assert 'testCol' in labeled_df.result_df

    # test to make sure the new comment was added and is correct
    assert labeled_df.result_df.iloc[9]['testCol'] == 'Test'

    # try to change an existing comment
    labeled_df.annotate(13, 'Comment', 'Positive')
    assert labeled_df.result_df.iloc[13]['Comment'] == 'Positive'

    # test our filtered annotations
    labeled_df.filter_annotate(feat1='petal length',
                               subgroup=['Iris-setosa'],
                               annotate_col='Test',
                               comment='1')
    assert labeled_df.result_df.iloc[3]['Test'] == '1'
    assert labeled_df.result_df.iloc[6]['Test'] == '1'

    # delete a comment
    labeled_df.delete_annotate(13, 'Comment')

    # test to make sure the comment was deleted
    assert labeled_df.result_df.iloc[13]['Comment'] == ''

    ''' Start testing the reports '''

    # create a dataframe holding the rows and columns we want to pull for the report
    report_df = labeled_df.result_df.iloc[[0, 1, 2, 3, 4, 5, 6],
                                          [0, 1, 2, 3, 4, 5, 6]]

    # create a dataframe using our report table function and check they match
    report_df2 = labeled_df.get_report_table([0, 1, 2, 3, 4, 5, 6],
                                             [0, 1, 2, 3, 4, 5, 6])
    assert report_df.equals(report_df2)

    # add some simulated distance values to the table
    distance = [0.6438666996, 0.6256534913, 0.4857791439, 0.2121011069,
                0.0105583417, 0.0948039601, 0.148629899, 0.0660374135,
                0.1931507183, 0.2514102163, 0.2121011069, 0.2935684781,
                0.3011883759, 0.1814805029, 0.6256534913]
    labeled_df.result_df['distance'] = distance

    # test the threshold function
    assert (len(labeled_df.result_df[labeled_df.result_df['distance'] > .5])
            == labeled_df.count_values_above_thres('distance', .5))
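
# The threshold test above relies on count_values_above_thres. A minimal
# sketch of an equivalent computation in plain pandas (our own hypothetical
# helper, not the wiggum implementation), useful as a cross-check:
def count_values_above_thres_sketch(result_df, col, thres):
    # count how many rows of `col` strictly exceed the threshold
    return int((result_df[col] > thres).sum())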

def test_model():
    labeled_df_setup = wg.LabeledDataFrame('data/iris.csv')
    meta = (
        '[{"name":"sepal length","var_type":"continuous","role":["independent","dependent"],"isCount":"N","weighting_var":"N/A"},'
        '{"name":"sepal width","var_type":"continuous","role":["dependent"],"isCount":"N","weighting_var":"N/A"},'
        '{"name":"petal length","var_type":"continuous","role":["independent"],"isCount":"N","weighting_var":"N/A"},'
        '{"name":"petal width","var_type":"continuous","role":["independent"],"isCount":"N","weighting_var":"N/A"},'
        '{"name":"class","var_type":"categorical","role":["splitby"],"isCount":"N","weighting_var":"N/A"}]'
    )

    # test updateMetaData
    labeled_df_setup = models.updateMetaData(labeled_df_setup, meta)
    assert len(labeled_df_setup.meta_df) == 5

    # test checkSameMetadata
    checkResult = models.checkSameMetadata(labeled_df_setup, meta)
    assert checkResult

    # test getDistanceHeatmapDict
    corr_obj = wg.All_Pearson()
    assert corr_obj.is_computable(labeled_df_setup)
    labeled_df_setup.get_subgroup_trends_1lev([corr_obj])
    labeled_df_setup.add_distance()
    distance_heatmap_dict = models.getDistanceHeatmapDict(
        labeled_df_setup, labeled_df_setup.result_df)
    assert len(distance_heatmap_dict) == 3

    # test getRankTrendDetail
    labeled_df_wage2 = wg.LabeledDataFrame(
        'data/wages_gender_rank_time_regression2')
    rankobj = wg.Mean_Rank_Trend()
    labeled_df_wage2.get_subgroup_trends_1lev([rankobj])
    dependent = 'pay'
    independent = 'gender'
    splitby = 'department'
    rank_trend_detail, rank_trend_count = models.getRankTrendDetail(
        labeled_df_wage2, dependent, independent, splitby)
    assert not rank_trend_detail.empty
    assert not rank_trend_count.empty

    # test getMetaDict
    result_dict = models.getMetaDict(labeled_df_wage2)
    assert len(result_dict) == 6
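
# The hand-escaped JSON string above is easy to get wrong. A sketch of the
# same metadata built from Python structures and serialized with json.dumps,
# which produces an equivalent JSON array (modulo whitespace); the helper
# name is our own:
def build_iris_meta_sketch():
    fields = [
        ('sepal length', 'continuous', ['independent', 'dependent']),
        ('sepal width', 'continuous', ['dependent']),
        ('petal length', 'continuous', ['independent']),
        ('petal width', 'continuous', ['independent']),
        ('class', 'categorical', ['splitby']),
    ]
    return json.dumps([
        {'name': n, 'var_type': t, 'role': r,
         'isCount': 'N', 'weighting_var': 'N/A'}
        for n, t, r in fields
    ])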

def test_custom_trends():
    labeled_df = wg.LabeledDataFrame(
        'data/ldf_state_hit_rate_min_cols_COCTFLILMDMAMOMTNENCOHRISCTXVTWAWI')
    rankobj = wg.Mean_Rank_Trend()
    linregobj = wg.Linear_Trend()
    linregobj.get_trend_vars(labeled_df)

    # The trend objects above will compute all pairs of the given types, but
    # what if we want to define custom trends? We can do that by subclassing
    # existing trend types. We override only the get_trend_vars() function for
    # now, but the other methods can also be overridden, or a totally new
    # trend can be added as long as it is compatible. (A sketch of these
    # classes follows this function.)
    min_lin_reg_obj = min_lin_reg()
    min_lin_reg_obj.get_trend_vars()

    # Component-wise:
    # we can also use the components of trends to construct custom trends
    medianrankobj = custom_Median_Rank_Trend()
    labeled_df.get_subgroup_trends_1lev([medianrankobj, rankobj])
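
# min_lin_reg and custom_Median_Rank_Trend are the custom trends used above.
# A minimal sketch of what they might look like, assuming the attribute
# conventions seen elsewhere in these tests (regression_vars / set_vars); the
# column names and the my_stat hook are hypothetical, not confirmed wiggum API:
class min_lin_reg(wg.Linear_Trend):
    def get_trend_vars(self):
        # fix a single regression pair instead of enumerating all pairs
        self.regression_vars = [('hit_rate', 'search_rate')]  # placeholder columns
        self.set_vars = True
        return self.set_vars


class custom_Median_Rank_Trend(wg.Mean_Rank_Trend):
    # component-wise customization: reuse the rank-trend machinery but
    # aggregate with the median instead of the mean (hypothetical hook)
    my_stat = 'median'
    name = 'median_rank_trend'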

def test_classification_trends():
    dataset = 'data/multi_decision_admisions/'
    labeled_df = wg.LabeledDataFrame(dataset)

    acc_trend = wg.Binary_Accuracy_Trend()
    tpr_trend = wg.Binary_TPR_Trend()
    ppv_trend = wg.Binary_PPV_Trend()
    tnr_trend = wg.Binary_TNR_Trend()
    fdr_trend = wg.Binary_FDR_Trend()
    fnr_trend = wg.Binary_FNR_Trend()
    err_trend = wg.Binary_Error_Trend()
    f1_trend = wg.Binary_F1_Trend()

    trend_list = [acc_trend, tpr_trend, ppv_trend, tnr_trend,
                  fdr_trend, f1_trend, fnr_trend, err_trend]

    # every trend in the list should be computable on this data
    assert all(trend.is_computable(labeled_df) for trend in trend_list)

    labeled_df.get_subgroup_trends_1lev(trend_list)

    labeled_df.get_SP_rows(thresh=.2)
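
# The Binary_*_Trend objects compare standard confusion-matrix rates across
# subgroups. For reference, a sketch of those rates computed directly from
# binary label/prediction columns (our own helper, not the wiggum API;
# assumes each denominator is nonzero):
def confusion_rates_sketch(y_true, y_pred):
    y_true = np.asarray(y_true, dtype=bool)
    y_pred = np.asarray(y_pred, dtype=bool)
    tp = np.sum(y_true & y_pred)
    tn = np.sum(~y_true & ~y_pred)
    fp = np.sum(~y_true & y_pred)
    fn = np.sum(y_true & ~y_pred)
    return {
        'accuracy': (tp + tn) / len(y_true),
        'tpr': tp / (tp + fn),            # true positive rate (recall)
        'ppv': tp / (tp + fp),            # positive predictive value (precision)
        'tnr': tn / (tn + fp),            # true negative rate
        'fdr': fp / (fp + tp),            # false discovery rate
        'fnr': fn / (fn + tp),            # false negative rate
        'err': (fp + fn) / len(y_true),   # error rate
        'f1': 2 * tp / (2 * tp + fp + fn),
    }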

def main():
    if request.method == 'POST':
        action = request.form['action']

        global labeled_df_setup
        # store filter parameters
        global filter_flag
        global filter_object
        # store project name
        global project_name

        if action == 'folder_open':
            # initialize filter flag and filter object
            filter_flag = False
            filter_object = {}

            folder = request.form['folder']
            # set folder name to project name
            project_name = folder
            folder = 'data/' + folder

            labeled_df_setup = wg.LabeledDataFrame(folder)

            result_dict = models.getMetaDict(labeled_df_setup)
            result_dict['possible_roles'] = wg.possible_roles
            result_dict['trend_types'] = list(wg.all_trend_types.keys())

            trend_type_list = pd.unique(labeled_df_setup.result_df['trend_type'])
            result_dict['trend_type_list'] = list(trend_type_list)

            # get trend display names
            result_dict['trend_display_names'] = [
                v().display_name for k, v in wg.all_trend_types.items()
            ]
            return jsonify(result_dict)

        # index.html 'Open' button clicked for a data file
        if action == 'open':
            # initialize filter flag and filter object
            filter_flag = False
            filter_object = {}
            # initialize project name
            project_name = ""

            file = request.files.get('file')
            df = pd.read_csv(file)

            # construct the csv data to fit d3.csv's format
            global csv_data
            csv_data = df.to_dict(orient='records')
            csv_data = json.dumps(csv_data, indent=2)

            labeled_df_setup = wg.LabeledDataFrame(df)
            labeled_df_setup.infer_var_types()

            # get var_types for the dropdown
            var_types = labeled_df_setup.meta_df['var_type'].tolist()

            # get a sample of the data
            sample_list = labeled_df_setup.get_data_sample()

            # get trend display names
            trend_display_names = [
                v().display_name for k, v in wg.all_trend_types.items()
            ]
            return jsonify({
                'var_types': var_types,
                'samples': sample_list,
                'possible_roles': wg.possible_roles,
                'trend_types': list(wg.all_trend_types.keys()),
                'trend_display_names': trend_display_names
            })

        # index.html 'Save' button clicked
        if action == 'save':
            meta = request.form['metaList']
            labeled_df_setup = models.updateMetaData(labeled_df_setup, meta)

            # store the metadata as csv
            project_name = request.form['projectName']
            directory = 'data/' + project_name
            labeled_df_setup.to_csvs(directory)
            return 'Saved'

        # index.html 'Compute Quantiles' button clicked
        if action == 'quantiles':
            meta = request.form['metaList']
            labeled_df_setup = models.updateMetaData(labeled_df_setup, meta)

            # drop empty entries so an empty form field is treated as no selection
            checked_vars = [v for v in request.form['checked_vars'].split(',') if v]

            if checked_vars:
                user_cutoffs = request.form['user_cutoffs']
                if user_cutoffs != '':
                    # extract quantiles from user input
                    cutoffs = [float(s) for s in user_cutoffs.split(',')]
                    cutoffs.extend([1])
                    cutoffs.insert(0, 0)
                    labels = [
                        str(np.round(a * 100, 2)) + 'to' +
                        str(np.round(b * 100, 2)) + '%'
                        for a, b in zip(cutoffs[:-1], cutoffs[1:])
                    ]
                    quantiles_dict = dict(zip(labels, cutoffs[1:]))
                    labeled_df_setup.add_quantile(checked_vars, quantiles_dict)
                else:
                    labeled_df_setup.add_quantile(checked_vars)

            result_dict = models.getMetaDict(labeled_df_setup)
            result_dict['possible_roles'] = wg.possible_roles
            return jsonify(result_dict)

        # index.html 'Clustering' button clicked
        if action == 'clustering':
            meta = request.form['metaList']
            labeled_df_setup = models.updateMetaData(labeled_df_setup, meta)

            qual_thresh = float(request.form['qual_thresh'])
            labeled_df_setup.add_all_dpgmm(qual_thresh=qual_thresh)

            result_dict = models.getMetaDict(labeled_df_setup)
            result_dict['possible_roles'] = wg.possible_roles
            return jsonify(result_dict)

        # index.html 'Add Intersection' button clicked
        if action == 'intersection':
            meta = request.form['metaList']
            labeled_df_setup = models.updateMetaData(labeled_df_setup, meta)

            # drop empty entries so an empty form field is treated as no selection
            checked_vars = [v for v in request.form['intersection_vars'].split(',') if v]

            if checked_vars:
                tuple_lens = request.form['tuple_lens'].strip()
                if tuple_lens != '':
                    tuple_lens = [int(t) for t in tuple_lens.split(',')]
                    labeled_df_setup.add_intersectional(checked_vars, tuple_lens)
                else:
                    labeled_df_setup.add_intersectional(checked_vars)

            result_dict = models.getMetaDict(labeled_df_setup)
            result_dict['possible_roles'] = wg.possible_roles
            return jsonify(result_dict)

        # visualize.html 'Save' button clicked
        if action == 'save_trends':
            # store the metadata and trends as csv
            project_name = request.form['projectName']
            directory = 'data/' + project_name
            labeled_df_setup.save_all(directory)
            return 'Saved'

        # index.html 'Visualize' button clicked
        if action == 'visualize':
            meta = request.form['metaList']
            checkResult = models.checkSameMetadata(labeled_df_setup, meta)

            # if the user-input metadata differs from the saved metadata,
            # discard the old result_df
            if not labeled_df_setup.result_df.empty and not checkResult:
                labeled_df_setup.result_df = pd.DataFrame()

            labeled_df_setup = models.updateMetaData(labeled_df_setup, meta)

            global trend_list
            # initialize trend list
            trend_list = []
            miss_trends_flg = False

            global filter_trend_list
            filter_trend_list = []

            # redirect flag for checking whether the page is reloaded or
            # redirected: on redirect the flag is set to True; on reload it
            # is set to False
            global redirect_flag
            redirect_flag = True

            user_trends = request.form['trend_types']
            user_trends = user_trends.split(",")

            # check if the selected trend types differ from result_df
            if not labeled_df_setup.result_df.empty:
                # result table is not empty, so extract trend types from it
                trend_list_result_df = [
                    trend.name for trend in labeled_df_setup.trend_list
                ]

                # drop the trend types already present in the result table
                new_user_trends = list(set(user_trends) - set(trend_list_result_df))

                # check for trend types in the result table that the user
                # no longer selected
                miss_trends = list(set(trend_list_result_df) - set(user_trends))
                if len(miss_trends) > 0:
                    filter_trend_list = list(
                        set(trend_list_result_df) - set(miss_trends))
                    miss_trends_flg = True
            else:
                new_user_trends = user_trends

            # check the user trend list
            if len(new_user_trends) > 0:
                trend_list = [
                    wg.all_trend_types[trend]() for trend in new_user_trends
                ]

                # check which trends are computable
                trend_computability = [
                    t.is_computable(labeled_df_setup) for t in trend_list
                ]

                # no trends can be computed
                if sum(trend_computability) == 0:
                    return 'no_computable_trend'

                # drop any specific trends that cannot be computed
                if sum(trend_computability) < len(new_user_trends):
                    trend_list = [
                        t for t, c in zip(trend_list, trend_computability) if c
                    ]

            if miss_trends_flg:
                return 'miss_old_trend_type'

            return redirect(url_for("visualize"))

        # initial load of the visualize.html page
        if action == 'page_load':
            # check whether the page is a reload; if so, skip trend computation
            if redirect_flag:
                # if redirected, set redirect_flag to False
                redirect_flag = False

                # if filter trends exist, do the filtering
                if len(filter_trend_list) > 0:
                    labeled_df_setup.get_trend_rows(
                        trend_type=filter_trend_list, inplace=True)

                # check trend list
                if len(trend_list) > 0:
                    labeled_df_setup.get_subgroup_trends_1lev(trend_list)

                if labeled_df_setup.result_df.empty:
                    return 'no_result'

                # add distances
                labeled_df_setup.add_distance()

            # generate distance heatmaps
            distance_heatmap_dict = models.getDistanceHeatmapDict(
                labeled_df_setup, labeled_df_setup.result_df)

            # extract overview legend types
            overview_legend_types = models.getOverviewLegendType(
                distance_heatmap_dict)

            df = labeled_df_setup.df.to_dict(orient='records')
            df = json.dumps(df, indent=2)

            default_threshold = wg.trend_quality_sp

            # add trend display names
            result_df_temp = labeled_df_setup.result_df.copy()
            result_df = models.replaceTrendDisplayName(result_df_temp)

            return jsonify(distance_heatmap_dict=distance_heatmap_dict,
                           result_df=result_df.to_json(orient='records'),
                           df=df,
                           default_threshold=default_threshold,
                           project_name=project_name,
                           overview_legend_types=overview_legend_types)

        # visualize.html rank trend cell clicked
        if action == 'detail_ranktrend':
            independent = request.form['independent']
            dependent = request.form['dependent']
            group_feat = request.form['group_feat']

            rank_trend_detail, rank_trend_count = models.getRankTrendDetail(
                labeled_df_setup, dependent, independent, group_feat)

            # convert row labels to strings to avoid jsonify errors,
            # e.g. department: 1
            rank_trend_count = rank_trend_count.rename(columns=lambda x: str(x))

            return jsonify(
                rank_trend_detail=rank_trend_detail.reset_index().to_dict(
                    orient='records'),
                rank_trend_count=rank_trend_count.reset_index().to_dict(
                    orient='records'))

        # visualize.html 'Filter' button clicked
        if action == 'filter':
            # Decoder is the app's custom JSON decoder (defined elsewhere)
            filter_object = json.loads(request.form['filter_object'], cls=Decoder)

            filter_result = labeled_df_setup.get_trend_rows(
                independent=filter_object['independent'],
                dependent=filter_object['dependent'],
                group_feat=filter_object['group_feat'],
                subgroup=filter_object['subgroup'],
                trend_type=filter_object['trend_type'])

            # generate distance heatmaps
            distance_heatmap_dict = models.getDistanceHeatmapDict(
                labeled_df_setup, filter_result)

            # extract overview legend types
            overview_legend_types = models.getOverviewLegendType(
                distance_heatmap_dict)

            df = labeled_df_setup.df.to_dict(orient='records')
            df = json.dumps(df, indent=2)

            # set filter flag
            filter_flag = True

            # add trend display names
            filter_result = models.replaceTrendDisplayName(filter_result)

            return jsonify(distance_heatmap_dict=distance_heatmap_dict,
                           result_df=filter_result.to_json(orient='records'),
                           df=df,
                           overview_legend_types=overview_legend_types)

        # visualize.html 'Reset' button clicked
        if action == 'reset':
            # generate distance heatmaps
            distance_heatmap_dict = models.getDistanceHeatmapDict(
                labeled_df_setup, labeled_df_setup.result_df)

            # extract overview legend types
            overview_legend_types = models.getOverviewLegendType(
                distance_heatmap_dict)

            df = labeled_df_setup.df.to_dict(orient='records')
            df = json.dumps(df, indent=2)

            # set filter flag to False and clear the filter object
            filter_flag = False
            filter_object.clear()

            # add trend display names
            result_df_temp = labeled_df_setup.result_df.copy()
            result_df = models.replaceTrendDisplayName(result_df_temp)

            return jsonify(distance_heatmap_dict=distance_heatmap_dict,
                           result_df=result_df.to_json(orient='records'),
                           df=df,
                           overview_legend_types=overview_legend_types)

        # visualize.html 'Detect' button clicked
        if action == 'detect':
            distance_threshold = float(request.form['distance_threshold'])
            sg_strength_threshold = float(request.form['sg_strength_threshold'])
            agg_strength_threshold = float(request.form['agg_strength_threshold'])

            filter_object = json.loads(request.form['filter_object'], cls=Decoder)

            trend_filter = filter_object['trend_type']
            if not trend_filter:
                # default to detecting over all trend types in result_df
                trend_filter = list(
                    pd.unique(labeled_df_setup.result_df['trend_type']))

            sp_filter = {
                'name': 'SP',
                'distance': distance_threshold,
                'agg_trend_strength': agg_strength_threshold,
                'subgroup_trend_strength': sg_strength_threshold,
                'trend_type': trend_filter
            }

            # check if the filter flag is set
            if filter_flag:
                # filtered: pass the filter parameters through
                detect_result = labeled_df_setup.get_SP_rows(
                    sp_filter,
                    independent=filter_object['independent'],
                    dependent=filter_object['dependent'],
                    group_feat=filter_object['group_feat'],
                    subgroup=filter_object['subgroup'],
                    trend_type=filter_object['trend_type'],
                    replace=True)
            else:
                # not filtered
                detect_result = labeled_df_setup.get_SP_rows(sp_filter,
                                                             replace=True)

            # generate distance heatmaps
            distance_heatmap_dict = models.getDistanceHeatmapDict(
                labeled_df_setup, detect_result)

            # extract overview legend types
            overview_legend_types = models.getOverviewLegendType(
                distance_heatmap_dict)

            df = labeled_df_setup.df.to_dict(orient='records')
            df = json.dumps(df, indent=2)

            # add trend display names
            detect_result = models.replaceTrendDisplayName(detect_result)

            return jsonify(distance_heatmap_dict=distance_heatmap_dict,
                           result_df=detect_result.to_json(orient='records'),
                           df=df,
                           overview_legend_types=overview_legend_types)

        # visualize.html 'Rank' button clicked
        if action == 'rank':
            agg_type = request.form['agg_type']
            score_col = request.form['score_col']
            view_score = agg_type + '_view_' + score_col

            # if the view score column does not exist yet, add it
            if view_score not in labeled_df_setup.result_df.columns:
                labeled_df_setup.add_view_score(score_col,
                                                agg_type=agg_type,
                                                colored=False)

            rank_result = labeled_df_setup.rank_occurences_by_view(
                view_score, score_col)

            # if filter_flag is set, filter the rank result
            if filter_flag:
                rank_result = labeled_df_setup.get_trend_rows(
                    independent=filter_object['independent'],
                    dependent=filter_object['dependent'],
                    group_feat=filter_object['group_feat'],
                    subgroup=filter_object['subgroup'],
                    trend_type=filter_object['trend_type'])

            # generate distance heatmaps
            distance_heatmap_dict = models.getDistanceHeatmapDict(
                labeled_df_setup, rank_result)

            # extract overview legend types
            overview_legend_types = models.getOverviewLegendType(
                distance_heatmap_dict)

            df = labeled_df_setup.df.to_dict(orient='records')
            df = json.dumps(df, indent=2)

            # add trend display names
            rank_result = models.replaceTrendDisplayName(rank_result)

            return jsonify(distance_heatmap_dict=distance_heatmap_dict,
                           result_df=rank_result.to_json(orient='records'),
                           df=df,
                           overview_legend_types=overview_legend_types)
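
# The 'quantiles' branch above turns user cutoffs into labeled bins. A worked
# sketch of that label construction (standalone, mirroring the logic above):
# the input '0.25,0.75' becomes cutoffs [0, 0.25, 0.75, 1] and labels
# ['0.0to25.0%', '25.0to75.0%', '75.0to100.0%'].
def quantile_labels_sketch(user_cutoffs='0.25,0.75'):
    cutoffs = [float(s) for s in user_cutoffs.split(',')]
    cutoffs = [0] + cutoffs + [1]
    labels = [str(np.round(a * 100, 2)) + 'to' + str(np.round(b * 100, 2)) + '%'
              for a, b in zip(cutoffs[:-1], cutoffs[1:])]
    # map each label to the upper cutoff of its bin
    return dict(zip(labels, cutoffs[1:]))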

def test_basic_load_df_wages():
    # We'll first load in some data; this has both regression and rate type
    # trends. We will load it two ways and check that the structure is the same.
    labeled_df_file = wg.LabeledDataFrame(
        'data/wages_gender_rank_time_regression2/df.csv')
    labeled_df_dir = wg.LabeledDataFrame(
        'data/wages_gender_rank_time_regression2')

    assert np.product(labeled_df_file.df.columns == labeled_df_dir.df.columns)
    assert labeled_df_file.df.shape == labeled_df_dir.df.shape

    compare_df = labeled_df_file.df == labeled_df_dir.df
    assert np.product(compare_df.sum() == len(labeled_df_file.df))

    # Next, we can infer the variable types and assign the roles, then check
    # that those match what was read from the saved copy
    labeled_df_file.infer_var_types()

    roles = {
        'department': ['independent', 'splitby'],
        'year': ['independent'],
        'pay': ['dependent'],
        'gender': ['independent', 'splitby']
    }
    var_types = {'gender': 'categorical'}
    labeled_df_file.set_counts(
        {var: False for var in labeled_df_file.df.columns})
    labeled_df_file.set_roles(roles)
    labeled_df_file.set_var_types(var_types)

    assert np.product(
        labeled_df_file.meta_df.columns == labeled_df_dir.meta_df.columns)
    assert labeled_df_file.meta_df.shape == labeled_df_dir.meta_df.shape

    compare_meta_df = labeled_df_file.meta_df.dropna(
        axis=1) == labeled_df_dir.meta_df.dropna(axis=1)
    assert np.product(compare_meta_df.sum() == len(labeled_df_dir.meta_df))

    # Now that we've set this up, we can also save these configurations to
    # load them in directly in the future
    assert labeled_df_file.to_csvs('data/wages_test')

    # Confirm that all the files were written correctly
    assert sorted(os.listdir('data/wages_test/')) == [
        'df.csv', 'meta.csv', 'result_df.csv'
    ]

    # to_csvs writes the three DataFrames each out to their own .csv file in
    # that directory. If the directory exists it will overwrite without
    # warning; if not, it also creates the directory.
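    # A minimal sketch of what to_csvs presumably does (our own illustration,
    # not the library implementation):
    def to_csvs_sketch(ldf, directory):
        os.makedirs(directory, exist_ok=True)  # create the directory if needed
        # each DataFrame goes to its own csv; existing files are overwritten silently
        ldf.df.to_csv(os.path.join(directory, 'df.csv'))
        ldf.meta_df.to_csv(os.path.join(directory, 'meta.csv'))
        ldf.result_df.to_csv(os.path.join(directory, 'result_df.csv'))
        return True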

    # Now we can load the data back
    labeled_df = wg.LabeledDataFrame('data/wages_test')
    labeled_df.meta_df

    # And confirm that this is the same as what was written.
    # First confirm the column headings are the same
    assert np.product(
        labeled_df.meta_df.columns == labeled_df_dir.meta_df.columns)

    # Then confirm the shape is the same
    assert labeled_df.meta_df.shape == labeled_df_dir.meta_df.shape

    # Then check that the non-NaN values are all the same; combined with the
    # above, the NaNs must be in the same locations, but np.nan == np.nan
    # evaluates to False, so those cells are dropped first
    compare_meta_df = labeled_df.meta_df.dropna(
        axis=1) == labeled_df_dir.meta_df.dropna(axis=1)
    assert np.product(compare_meta_df.sum() == len(labeled_df_dir.meta_df))

    assert np.product(labeled_df.df.columns == labeled_df_dir.df.columns)
    assert labeled_df.df.shape == labeled_df_dir.df.shape

    compare_df = labeled_df.df.dropna(axis=1) == labeled_df_dir.df.dropna(axis=1)
    assert np.product(compare_df.sum() == len(labeled_df_dir.df))

    # Add an intersectional column
    intersect_cols = ['gender', 'department']
    labeled_df.add_intersectional(intersect_cols)

    # Now check that that worked correctly
    intersectional_col_name = '_'.join(intersect_cols)
    intersectional_correct = lambda row: row[intersectional_col_name] == '_'.join(
        [row[icol] for icol in intersect_cols])
    icol_correct = labeled_df.df.apply(intersectional_correct, axis=1)
    assert np.product(icol_correct)

    # Add quantiles of pay, then check that every row's pay falls at or below
    # its bin's upper limit
    labeled_df.add_quantile(['pay'])
    q_limits = np.quantile(labeled_df.df['pay'], [.25, .75, 1])
    limits = {n: q for n, q in zip(['low', 'mid', 'high'], q_limits)}
    for q, df in labeled_df.df.groupby('payquantiles'):
        assert np.product(df['pay'] <= limits[q])

    assert labeled_df.get_vars_per_type('categorical') == [
        'department', 'gender', 'gender_department', 'payquantiles'
    ]
    assert labeled_df.meta_df.loc['gender_department', 'dtype'] == 'object'
    assert labeled_df.meta_df.loc['gender_department', 'var_type'] == 'categorical'
    assert labeled_df.meta_df.loc['gender_department', 'role'] == 'splitby'
    assert labeled_df.meta_df.loc['gender_department', 'isCount'] == False

    # Check the utility functions
    assert labeled_df.get_vars_per_role('splitby') == [
        'department', 'gender', 'gender_department', 'payquantiles'
    ]
    assert labeled_df.get_vars_per_role('independent') == [
        'year', 'department', 'gender'
    ]
    assert labeled_df.get_vars_per_role('dependent') == ['pay']

    assert labeled_df.get_data_sample() == [
        'Max: 51.04 Min: 13.52', 'Max: 50.0 Min: 0.0',
        'Support, Sales, Management, R&D', 'F, M',
        'F_Support, M_Support, M_Sales, F_Sales, M_Management',
        'mid, low, high'
    ]

    assert labeled_df.get_vars_per_type('categorical') == [
        'department', 'gender', 'gender_department', 'payquantiles'
    ]
    assert labeled_df.get_vars_per_type('continuous') == ['pay', 'year']

    assert labeled_df.get_vars_per_roletype('independent', 'continuous') == ['year']
    assert labeled_df.get_vars_per_roletype('independent', 'categorical') == [
        'department', 'gender'
    ]

    # Using Trends
    #
    # Trend objects define their name, how to compute the trend, and how to
    # choose which variables to use; an extension will allow var lists to be
    # passed in to reduce which ones are computed
    corrobj = wg.All_Pearson()
    corrobj.get_trend_vars(labeled_df)
    assert corrobj.regression_vars == [('year', 'pay')]
    assert len(corrobj.var_weight_list) == len(corrobj.regression_vars)
    assert corrobj.set_vars == True

    rankobj = wg.Mean_Rank_Trend()
    assert rankobj.get_trend_vars(labeled_df)
    assert rankobj.target == ['pay']
    assert rankobj.trendgroup == ['department', 'gender']
    assert rankobj.set_vars == True
    assert len(rankobj.var_weight_list) == len(rankobj.target)

    linreg_obj = wg.All_Linear_Trend()
    linreg_obj.get_trend_vars(labeled_df)
    assert linreg_obj.regression_vars == [('year', 'pay')]
    assert len(linreg_obj.var_weight_list) == len(linreg_obj.regression_vars)
    assert linreg_obj.set_vars == True

    # Computing Trends on a LabeledDataFrame
    #
    # There are two ways: we can use the default settings and pass the names
    # of the trend types, or pass trend objects
    labeled_df.get_subgroup_trends_1lev(['pearson_corr'])
    assert np.product(labeled_df.result_df.columns == [
        'independent', 'dependent', 'group_feat', 'subgroup', 'agg_trend',
        'agg_trend_strength', 'subgroup_trend', 'subgroup_trend_strength',
        'trend_type', 'comparison_type'
    ])

    # there are 10 fixed columns and the number of rows for this trend is
    # computed below
    num_reg_pairs = 1
    num_depts = 4
    num_genders = 2
    num_quantiles = 3
    num_dept_genders = num_genders * num_depts
    num_pearson = num_reg_pairs * (num_depts + num_genders +
                                   num_dept_genders + num_quantiles)
    assert labeled_df.result_df.shape == (num_pearson, 10)

    # Now we can use a list of objects to apply multiple trends
    labeled_df.get_subgroup_trends_1lev([rankobj, linreg_obj])
    num_lin = num_pearson
    num_gender_indep = num_depts + num_dept_genders + num_quantiles
    num_dept_indep = num_genders + num_dept_genders + num_quantiles
    num_rank = num_gender_indep + num_dept_indep
    total_rows_agg_sg = num_pearson + num_lin + num_rank
    assert labeled_df.result_df.shape == (total_rows_agg_sg, 10)

    # We can see what types of trends were computed from result_df
    assert np.product(
        pd.unique(labeled_df.result_df['trend_type']) ==
        ['pearson_corr', 'rank_trend', 'lin_reg'])
    assert pd.unique(
        labeled_df.result_df['comparison_type']) == ['aggregate-subgroup']

    # We can also add trends that are structured for pairwise comparisons
    labeled_df.get_pairwise_trends_1lev([rankobj, linreg_obj])

    # Again, check the infrastructure by checking that the number of rows is correct
    num_dept_pairs = np.sum(list(range(num_depts)))
    num_gender_pairs = np.sum(list(range(num_genders)))
    num_dept_genders_pairs = np.sum(list(range(num_dept_genders)))
    num_quantile_pairs = np.sum(list(range(num_quantiles)))
    gender_indep_pairwise_rows = (num_dept_pairs + num_dept_genders_pairs +
                                  num_quantile_pairs)
    dept_indep_pairwise_rows = (num_gender_pairs + num_dept_genders_pairs +
                                num_quantile_pairs)
    lin_reg_pairwise_rows = (num_dept_pairs + num_gender_pairs +
                             num_dept_genders_pairs + num_quantile_pairs)
    rank_pairwise_rows = gender_indep_pairwise_rows + dept_indep_pairwise_rows
    total_rows = total_rows_agg_sg + lin_reg_pairwise_rows + rank_pairwise_rows
    assert labeled_df.result_df.shape == (total_rows, 13)

    assert list(pd.unique(labeled_df.result_df['comparison_type'])) == [
        'aggregate-subgroup', 'pairwise'
    ]

    # The object also stores the trend objects that have been applied; they
    # can be used as a mapping to get the distance functions that are
    # appropriate for each trend (see the sketch below)
    labeled_df.trend_list

    labeled_df.add_distance(row_wise=True)  # ('subgroup_trend', 'subgroup_trend2')
    assert labeled_df.result_df.shape == (total_rows, 14)

    # Each trend object has a trend_precompute dictionary as a property that
    # stores the intermediate values (tables of the weighted rates for ranks
    # and correlation matrices for pearson correlation; TODO: what do we need
    # for linreg). These can be used in visualization.
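    # A minimal sketch of the row-wise distance mapping described above (our
    # own illustration; the per-trend get_distance callable is hypothetical,
    # not a confirmed wiggum API):
    def add_distance_sketch(ldf):
        # map each trend name to its trend object's distance function,
        # then apply the matching function to every result row
        dist_funcs = {t.name: t.get_distance for t in ldf.trend_list}
        return ldf.result_df.apply(
            lambda row: dist_funcs[row['trend_type']](row), axis=1)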

    # Saving with trends
    assert labeled_df.save_all('data/wages_test_all')
    assert sorted(os.listdir('data/wages_test_all/')) == [
        'df.csv', 'meta.csv', 'result_df.csv', 'trends.json'
    ]

    labeled_df_tl = wg.LabeledDataFrame('data/wages_test_all')

    # That save function calls the save function tested above, so we only
    # need to test that the trend list loaded correctly
    labeled_df.trend_list[0].trend_precompute
    labeled_df_tl.trend_list[0].trend_precompute

    # Filtering
    #
    # Test each filter variable, one at a time, then several pairs
    year_df = labeled_df.get_trend_rows(independent='year')
    pay_df = labeled_df.get_trend_rows(dependent='pay')
    dept_df = labeled_df.get_trend_rows(group_feat='department')
    mgmt_df = labeled_df.get_trend_rows(subgroup='Management')
    sales_df = labeled_df.get_trend_rows(subgroup2='Sales')
    linreg_df = labeled_df.get_trend_rows(trend_type='lin_reg')
    pair_df = labeled_df.get_trend_rows(comparison_type='pairwise')

    # TODO: manually verify these counts
    assert len(year_df) == 72
    assert len(pay_df) == 169
    assert len(dept_df) == 24
    assert len(mgmt_df) == 12
    assert len(sales_df) == 4
    assert len(linreg_df) == 55
    assert len(pair_df) == lin_reg_pairwise_rows + rank_pairwise_rows

    # Now test two conditions, and passing a list to a condition
    y_sm_df = labeled_df.get_trend_rows(independent='year',
                                        subgroup=['Management', 'Sales'])
    pay_rank = labeled_df.get_trend_rows(dependent='pay',
                                         trend_type='rank_trend')

    # We can also filter based on SP detections with get_SP_rows
    labeled_df.get_SP_rows(thresh=.2)
    assert labeled_df.result_df.shape == (total_rows, 15)

    # Detection
    #
    # Detection via get_SP_rows happens in two steps:
    # 1. label the rows
    # 2. filter by that column to return
    #
    # Labeling the rows can happen in a number of ways too; the detection
    # accepts several forms of input, so custom detections can be built in
    # many ways. When filter_thresh is a dictionary, the filtering happens by
    # taking the intersection of the thresholds provided over each row. Some
    # defaults are also built in, accessible by string. (A sketch of the
    # two-step logic follows this function.)
    labeled_df.get_SP_rows('default_qual_sp')
    assert labeled_df.result_df.shape == (total_rows, 16)

    # Basic type checks on detections; TODO: accuracy on detections
    assert labeled_df.result_df['SP_thresh0.2'].dtype == bool
    assert labeled_df.result_df['default_qual_sp'].dtype == bool

    labeled_df.get_SP_rows('SP')
    assert labeled_df.result_df.shape == (total_rows, 17)
    assert labeled_df.result_df['SP'].dtype == bool

    # We can also define our own detection filters, using any available column
    rank_only_qual = {
        'name': 'rank_only_qual_sp',
        'distance': .2,
        'agg_trend_strength': .05,
        'subgroup_trend_strength': .05,
        'trend_type': 'rank_trend'
    }
    labeled_df.get_SP_rows(rank_only_qual, replace=True)
    assert labeled_df.result_df.shape == (total_rows, 18)

    # Ranking
    labeled_df.rank_occurences_by_view(ascending=False)
    assert labeled_df.result_df.shape == (total_rows, 19)

    labeled_df.add_view_score('SP_thresh0.2', agg_type='sum', colored=False)
    assert labeled_df.result_df.shape == (total_rows, 20)

    labeled_df.rank_occurences_by_view('sum_view_SP_thresh0.2', 'SP_thresh0.2')
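
# A minimal sketch of the two-step detection described above for a dict-style
# filter (our own illustration of the logic, not the wiggum implementation):
# step 1 labels the rows, step 2 filters by that column to return.
def get_sp_rows_sketch(result_df, filter_thresh):
    name = filter_thresh['name']
    # step 1: label each row True only if it passes every threshold
    # (the intersection over the dict's conditions)
    mask = pd.Series(True, index=result_df.index)
    for col, thresh in filter_thresh.items():
        if col == 'name':
            continue
        if col == 'trend_type':
            allowed = thresh if isinstance(thresh, list) else [thresh]
            mask &= result_df[col].isin(allowed)
        else:
            mask &= result_df[col] > thresh
    result_df[name] = mask
    # step 2: filter by the new boolean column
    return result_df[result_df[name]]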