def test_annotation_reporting():
    
    # read some data into a labeled dataframe so we have something to work with
    labeled_df = wg.LabeledDataFrame('data/iristest')
    
    # get some trend objects so we can populate our results table
    rankobj = wg.Mean_Rank_Trend()
    linreg_obj = wg.All_Linear_Trend()
    
    # compute the trends, then view a sample of the result table
    labeled_df.get_subgroup_trends_1lev([rankobj,linreg_obj])
    labeled_df.result_df.sample(10)
    
    # annotate a cell of the results dataframe
    labeled_df.annotate(13,'Comment','Reverse')
    # first test that the new column was created
    assert('Comment' in labeled_df.result_df)
    # now test to see if our comment was added at the correct spot and matches what we expect
    assert((labeled_df.result_df.iloc[13]["Comment"]) == "Reverse")
    
    # Add a comment to a different row, in a new column
    labeled_df.annotate(9,'testCol','Test')
    # check to make sure the new column exists
    assert('testCol' in labeled_df.result_df)
    # test to make sure the new comment was added and is correct
    assert((labeled_df.result_df.iloc[9]["testCol"]) == "Test")
    
    # Change an existing comment
    labeled_df.annotate(13,'Comment','Positive')
    assert((labeled_df.result_df.iloc[13]["Comment"]) == "Positive")
    
    # Test our filtered annotations

    labeled_df.filter_annotate(feat1='petal length',
                               subgroup=['Iris-setosa'],
                               annotate_col='Test',
                               comment="1")
    assert(labeled_df.result_df.iloc[3]["Test"] == "1" and labeled_df.result_df.iloc[6]["Test"] == "1")
    
    # delete a comment       
    labeled_df.delete_annotate(13,"Comment")
    
    # test that the comment was cleared; delete_annotate empties the cell
    # rather than dropping the column
    assert(labeled_df.result_df.iloc[13]["Comment"] == '')
           
    # Start testing the reports
    
    # Slice out the rows and columns we want for the report
    report_df = labeled_df.result_df.iloc[[0,1,2,3,4,5,6],[0,1,2,3,4,5,6]]
           
    # Create the same report table using the get_report_table function
    report_df2 = labeled_df.get_report_table([0,1,2,3,4,5,6],[0,1,2,3,4,5,6])
           
    assert(report_df.equals(report_df2))
    

    # add some simulated distance values to the table
    distance =[0.6438666996,0.6256534913,0.4857791439,0.2121011069,0.0105583417,0.0948039601,0.148629899,0.0660374135,0.1931507183,0.2514102163,0.2121011069,0.2935684781,0.3011883759,0.1814805029,0.6256534913]
    labeled_df.result_df['distance'] = distance
    # test the threshold function
    assert(len(labeled_df.result_df[labeled_df.result_df["distance"] > .5]) == labeled_df.count_values_above_thres("distance",.5))
def test_model():
    labeled_df_setup = wg.LabeledDataFrame('data/iris.csv')
    meta = ""
    meta += "[{\"name\":\"sepal length\",\"var_type\":\"continuous\",\"role\":[\"independent\",\"dependent\"],\"isCount\":\"N\",\"weighting_var\":\"N/A\"},"
    meta += "{\"name\":\"sepal width\",\"var_type\":\"continuous\",\"role\":[\"dependent\"],\"isCount\":\"N\",\"weighting_var\":\"N/A\"},"
    meta += "{\"name\":\"petal length\",\"var_type\":\"continuous\",\"role\":[\"independent\"],\"isCount\":\"N\",\"weighting_var\":\"N/A\"},"
    meta += "{\"name\":\"petal width\",\"var_type\":\"continuous\",\"role\":[\"independent\"],\"isCount\":\"N\",\"weighting_var\":\"N/A\"},"
    meta += "{\"name\":\"class\",\"var_type\":\"categorical\",\"role\":[\"splitby\"],\"isCount\":\"N\",\"weighting_var\":\"N/A\"}]"

    # test updateMetaData
    labeled_df_setup = models.updateMetaData(labeled_df_setup, meta)
    assert len(labeled_df_setup.meta_df) == 5

    # test checkSameMetadata
    checkResult = models.checkSameMetadata(labeled_df_setup, meta)
    assert checkResult == True

    # test getDistanceHeatmapDict
    corr_obj = wg.All_Pearson()
    assert corr_obj.is_computable(labeled_df_setup)
    labeled_df_setup.get_subgroup_trends_1lev([corr_obj])
    labeled_df_setup.add_distance()
    distance_heatmap_dict = models.getDistanceHeatmapDict(
        labeled_df_setup, labeled_df_setup.result_df)
    assert len(distance_heatmap_dict) == 3

    # test getRankTrendDetail
    labeled_df_wage2 = wg.LabeledDataFrame(
        'data/wages_gender_rank_time_regression2')
    rankobj = wg.Mean_Rank_Trend()
    labeled_df_wage2.get_subgroup_trends_1lev([rankobj])
    dependent = 'pay'
    independent = 'gender'
    splitby = 'department'
    rank_trend_detail, rank_trend_count = models.getRankTrendDetail(
        labeled_df_wage2, dependent, independent, splitby)
    assert not rank_trend_detail.empty
    assert not rank_trend_count.empty

    # test getMetaDict
    result_dict = {}
    result_dict = models.getMetaDict(labeled_df_wage2)
    assert len(result_dict) == 6
def test_custom_trends():
    labeled_df = wg.LabeledDataFrame(
        'data/ldf_state_hit_rate_min_cols_COCTFLILMDMAMOMTNENCOHRISCTXVTWAWI')

    rankobj = wg.Mean_Rank_Trend()
    linregobj = wg.Linear_Trend()
    linregobj.get_trend_vars(labeled_df)

    # The trend objects above will compute all pairs of the given types,
    # but what if we want to define custom trends? We can do that by
    # overloading existing trend classes. Here we overload only the
    # get_trend_vars() function, but the other functions can also be
    # overloaded, or a totally new trend can be added as long as it is
    # compatible.
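    # A minimal sketch of what such an overload could look like (min_lin_reg
    # itself is defined elsewhere in this suite; the attribute names below are
    # assumed from the All_Linear_Trend assertions later in these tests, and
    # the variable pair is purely illustrative):
    class Example_Min_Lin_Reg(wg.Linear_Trend):
        def get_trend_vars(self, labeled_df=None):
            # pin the trend to one hand-chosen regression pair instead of
            # letting the base class enumerate all continuous pairs
            self.regression_vars = [('year', 'pay')]
            self.set_vars = True
            return self.set_vars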

    min_lin_reg_obj = min_lin_reg()
    min_lin_reg_obj.get_trend_vars()

    # # Component-wise
    #
    # We can also use the components of trends to construct custom trends
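    # A hedged sketch of the component-wise idea (custom_Median_Rank_Trend
    # itself is defined elsewhere in this suite): reuse Mean_Rank_Trend's
    # machinery while overriding only the variable-selection component. A
    # true median variant would also swap the aggregation statistic, whose
    # hook is not shown here; target/trendgroup/set_vars are assumed from
    # the Mean_Rank_Trend assertions later in these tests, and the column
    # names are purely illustrative:
    class Example_Component_Rank_Trend(wg.Mean_Rank_Trend):
        def get_trend_vars(self, labeled_df=None):
            self.target = ['hit_rate']
            self.trendgroup = ['state']
            self.set_vars = True
            return self.set_vars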

    medianrankobj = custom_Median_Rank_Trend()
    labeled_df.get_subgroup_trends_1lev([medianrankobj, rankobj])
def test_classification_trends():

    dataset = 'data/multi_decision_admisions/'
    labeled_df = wg.LabeledDataFrame(dataset)

    acc_trend = wg.Binary_Accuracy_Trend()
    tpr_trend = wg.Binary_TPR_Trend()
    ppv_trend = wg.Binary_PPV_Trend()
    tnr_trend = wg.Binary_TNR_Trend()
    fdr_trend = wg.Binary_FDR_Trend()
    fnr_trend = wg.Binary_FNR_Trend()
    err_trend = wg.Binary_Error_Trend()
    f1_trend = wg.Binary_F1_Trend()
    trend_list = [
        acc_trend, tpr_trend, ppv_trend, tnr_trend, fdr_trend, f1_trend,
        fnr_trend, err_trend
    ]
    # every trend in the list should be computable on this dataset
    assert all(trend.is_computable(labeled_df) for trend in trend_list)
    labeled_df.get_subgroup_trends_1lev(trend_list)

    # In[36]:

    labeled_df.get_SP_rows(thresh=.2)
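    # get_SP_rows(thresh=.2) labels rows whose distance exceeds the threshold
    # in a boolean 'SP_thresh0.2' column (see the detection tests below)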
def main():
    if request.method == 'POST':

        action = request.form['action']

        global labeled_df_setup

        # store filter parameters
        global filter_flag
        global filter_object

        # store project name
        global project_name

        if action == 'folder_open':

            # initial filter flag and filter object
            filter_flag = False
            filter_object = {}

            folder = request.form['folder']

            # set folder name to project name
            project_name = folder

            folder = 'data/' + folder
            labeled_df_setup = wg.LabeledDataFrame(folder)

            result_dict = {}
            result_dict = models.getMetaDict(labeled_df_setup)

            result_dict['possible_roles'] = wg.possible_roles
            result_dict['trend_types'] = list(wg.all_trend_types.keys())

            trend_type_list = pd.unique(
                labeled_df_setup.result_df['trend_type'])
            result_dict['trend_type_list'] = list(trend_type_list)

            # get trend display names
            result_dict['trend_display_names'] = [
                v().display_name for k, v in wg.all_trend_types.items()
            ]

            return jsonify(result_dict)

        # index.html 'Open' button clicked for data file
        if action == 'open':

            # initial filter flag and filter object
            filter_flag = False
            filter_object = {}

            # initial project name
            project_name = ""

            file = request.files.get('file')
            df = pd.read_csv(file)

            # Construct the csv data to fit the d3.csv format
            global csv_data
            csv_data = df.to_dict(orient='records')
            csv_data = json.dumps(csv_data, indent=2)
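            # to_dict(orient='records') yields one dict per row, e.g.
            # [{"col_a": 1.0, "col_b": "x"}, ...] (column names illustrative);
            # json.dumps then gives the array-of-objects JSON that matches
            # what d3.csv produces after parsing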

            labeled_df_setup = wg.LabeledDataFrame(df)

            labeled_df_setup.infer_var_types()

            # get var_types for the dropdown
            var_types = []
            var_types = labeled_df_setup.meta_df['var_type'].tolist()

            # get sample for data
            sample_list = []
            sample_list = labeled_df_setup.get_data_sample()

            # get trend display names
            trend_display_names = [
                v().display_name for k, v in wg.all_trend_types.items()
            ]

            return jsonify({
                'var_types': var_types,
                'samples': sample_list,
                'possible_roles': wg.possible_roles,
                'trend_types': list(wg.all_trend_types.keys()),
                'trend_display_names': trend_display_names
            })

        # index.html 'Save' button
        if action == 'save':
            meta = request.form['metaList']

            labeled_df_setup = models.updateMetaData(labeled_df_setup, meta)

            # store meta data into csv
            project_name = request.form['projectName']
            directory = 'data/' + project_name
            labeled_df_setup.to_csvs(directory)
            return 'Saved'

        # index.html 'Compute Quantiles' button clicked
        if action == 'quantiles':

            meta = request.form['metaList']
            labeled_df_setup = models.updateMetaData(labeled_df_setup, meta)

            # drop empty entries so the guard below still works when nothing
            # was checked (''.split(',') yields [''], which is truthy)
            checked_vars = [
                v for v in request.form['checked_vars'].split(",") if v
            ]

            if checked_vars:
                user_cutoffs = request.form['user_cutoffs']
                if user_cutoffs != '':
                    # extract quantiles from user input
                    cutoffs = [float(s) for s in user_cutoffs.split(',')]
                    cutoffs.extend([1])
                    cutoffs.insert(0, 0)

                    labels = [
                        str(np.round(a * 100, 2)) + 'to' +
                        str(np.round(b * 100, 2)) + '%'
                        for a, b in zip(cutoffs[:-1], cutoffs[1:])
                    ]

                    quantiles_dict = dict(zip(labels, cutoffs[1:]))
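                    # worked example: user_cutoffs = "0.25,0.75" gives
                    # cutoffs = [0, 0.25, 0.75, 1], labels =
                    # ['0.0to25.0%', '25.0to75.0%', '75.0to100.0%'], and
                    # quantiles_dict = {'0.0to25.0%': 0.25,
                    #                   '25.0to75.0%': 0.75,
                    #                   '75.0to100.0%': 1}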

                    labeled_df_setup.add_quantile(checked_vars, quantiles_dict)
                else:
                    labeled_df_setup.add_quantile(checked_vars)

            result_dict = {}
            result_dict = models.getMetaDict(labeled_df_setup)

            result_dict['possible_roles'] = wg.possible_roles

            return jsonify(result_dict)

        # index.html 'Clustering' button clicked
        if action == 'clustering':

            meta = request.form['metaList']
            labeled_df_setup = models.updateMetaData(labeled_df_setup, meta)

            qual_thresh = float(request.form['qual_thresh'])

            labeled_df_setup.add_all_dpgmm(qual_thresh=qual_thresh)

            result_dict = {}
            result_dict = models.getMetaDict(labeled_df_setup)

            result_dict['possible_roles'] = wg.possible_roles

            return jsonify(result_dict)

        # index.html 'Add Intersection' button clicked
        if action == 'intersection':

            meta = request.form['metaList']
            labeled_df_setup = models.updateMetaData(labeled_df_setup, meta)

            # drop empty entries so the guard below still works when nothing
            # was checked
            checked_vars = [
                v for v in request.form['intersection_vars'].split(",") if v
            ]

            if checked_vars:
                tuple_lens = request.form['tuple_lens'].strip()
                if tuple_lens != '':
                    tuple_lens = [int(t) for t in tuple_lens.split(',')]
                    labeled_df_setup.add_intersectional(
                        checked_vars, tuple_lens)
                else:
                    labeled_df_setup.add_intersectional(checked_vars)

            result_dict = {}
            result_dict = models.getMetaDict(labeled_df_setup)

            result_dict['possible_roles'] = wg.possible_roles

            return jsonify(result_dict)

        # visualize.html 'Save' button clicked
        if action == 'save_trends':
            # store meta data into csv
            project_name = request.form['projectName']
            directory = 'data/' + project_name
            labeled_df_setup.save_all(directory)
            return 'Saved'

        # index.html 'Visualize' button clicked
        if action == 'visualize':

            meta = request.form['metaList']
            checkResult = models.checkSameMetadata(labeled_df_setup, meta)

            # check if user input metadata is same as saved metadata
            if not (labeled_df_setup.result_df.empty) and checkResult == False:
                # delete result_df
                labeled_df_setup.result_df = pd.DataFrame()

            labeled_df_setup = models.updateMetaData(labeled_df_setup, meta)

            global trend_list
            # initial trend list
            trend_list = []
            miss_trends_flg = False
            global filter_trend_list
            filter_trend_list = []

            # redirect flag for checking whether the page load is a redirect
            # or a reload: on redirect the flag is set to True; on reload it
            # is set to False
            global redirect_flag
            redirect_flag = True

            user_trends = request.form['trend_types']
            user_trends = user_trends.split(",")

            # check if the selected trend types are different from result_df
            if not (labeled_df_setup.result_df.empty):
                # result table is not empty, extract trend types from result table
                trend_list_result_df = [
                    trend.name for trend in labeled_df_setup.trend_list
                ]

                # delete the trend types existing in result table
                new_user_trends = list(
                    set(user_trends) - set(trend_list_result_df))

                # check if trend types missing from result table
                miss_trends = list(
                    set(trend_list_result_df) - set(user_trends))

                if len(miss_trends) > 0:
                    filter_trend_list = list(
                        set(trend_list_result_df) - set(miss_trends))
                    miss_trends_flg = True
            else:
                new_user_trends = user_trends
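            # worked example: if result_df already holds rank_trend and
            # lin_reg while the user selects rank_trend and pearson_corr,
            # then new_user_trends == ['pearson_corr'] (still to compute) and
            # miss_trends == ['lin_reg'] (previously computed but deselected,
            # so it must be filtered back out)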

            # check user trend list
            if len(new_user_trends) > 0:
                trend_list = [
                    wg.all_trend_types[trend]() for trend in new_user_trends
                ]

                # check trends computable
                trend_computability = [
                    t.is_computable(labeled_df_setup) for t in trend_list
                ]

                # no trends can be computed
                if sum(trend_computability) == 0:
                    return 'no_computable_trend'

                # drop any specific trends that cannot be computed; zip the
                # trend objects themselves (not their names) with their
                # computability so trend_list keeps holding objects
                if sum(trend_computability) < len(new_user_trends):
                    trend_list = [
                        t for t, c in zip(trend_list, trend_computability)
                        if c
                    ]

            if miss_trends_flg:
                return 'miss_old_trend_type'

            return redirect(url_for("visualize"))

        # initial for visualize.html page
        if action == 'page_load':
            # check if page is reload or not
            # if page is reload, skip trend computation
            if redirect_flag:
                # if redirect, set redirect_flag to False
                redirect_flag = False

                # if filter trends exist, do filtering
                if len(filter_trend_list) > 0:
                    labeled_df_setup.get_trend_rows(
                        trend_type=filter_trend_list, inplace=True)

                # check trend list
                if len(trend_list) > 0:
                    labeled_df_setup.get_subgroup_trends_1lev(trend_list)

                    if labeled_df_setup.result_df.empty:
                        return 'no_result'

                    # add distances
                    labeled_df_setup.add_distance()

            # Generate distance heatmaps
            distance_heatmap_dict = models.getDistanceHeatmapDict(
                labeled_df_setup, labeled_df_setup.result_df)

            # Extract overview legend types
            overview_legend_types = models.getOverviewLegendType(
                distance_heatmap_dict)

            df = labeled_df_setup.df.to_dict(orient='records')
            df = json.dumps(df, indent=2)

            default_threshold = wg.trend_quality_sp

            # Add trend display name
            result_df_temp = labeled_df_setup.result_df.copy()
            result_df = models.replaceTrendDisplayName(result_df_temp)

            return jsonify(distance_heatmap_dict=distance_heatmap_dict,
                           result_df=result_df.to_json(orient='records'),
                           df=df,
                           default_threshold=default_threshold,
                           project_name=project_name,
                           overview_legend_types=overview_legend_types)

        # visualize.html rank trend's cells clicked
        if action == 'detail_ranktrend':
            independent = request.form['independent']
            dependent = request.form['dependent']
            group_feat = request.form['group_feat']

            rank_trend_detail, rank_trend_count = models.getRankTrendDetail(
                labeled_df_setup, dependent, independent, group_feat)

            # convert row labels to strings to avoid jsonify errors, e.g., department: 1
            rank_trend_count = rank_trend_count.rename(
                columns=lambda x: str(x))

            return jsonify(
                rank_trend_detail=rank_trend_detail.reset_index().to_dict(
                    orient='records'),
                rank_trend_count=rank_trend_count.reset_index().to_dict(
                    orient='records'))

        # visualize.html 'Filter' button clicked
        if action == 'filter':
            filter_object = request.form['filter_object']
            filter_object = json.loads(filter_object, cls=Decoder)

            filter_result = labeled_df_setup.get_trend_rows(
                independent=filter_object['independent'],
                dependent=filter_object['dependent'],
                group_feat=filter_object['group_feat'],
                subgroup=filter_object['subgroup'],
                trend_type=filter_object['trend_type'])

            # Generate distance heatmaps
            distance_heatmap_dict = models.getDistanceHeatmapDict(
                labeled_df_setup, filter_result)

            # Extract overview legend types
            overview_legend_types = models.getOverviewLegendType(
                distance_heatmap_dict)

            df = labeled_df_setup.df.to_dict(orient='records')
            df = json.dumps(df, indent=2)

            # set filter flag
            filter_flag = True

            # Add trend display name
            filter_result = models.replaceTrendDisplayName(filter_result)

            return jsonify(distance_heatmap_dict=distance_heatmap_dict,
                           result_df=filter_result.to_json(orient='records'),
                           df=df,
                           overview_legend_types=overview_legend_types)

        # visualize.html 'Reset' button clicked
        if action == 'reset':
            # Generate distance heatmaps
            distance_heatmap_dict = models.getDistanceHeatmapDict(
                labeled_df_setup, labeled_df_setup.result_df)

            # Extract overview legend types
            overview_legend_types = models.getOverviewLegendType(
                distance_heatmap_dict)

            df = labeled_df_setup.df.to_dict(orient='records')
            df = json.dumps(df, indent=2)

            # set filter flag to False
            filter_flag = False

            # clean filter object
            filter_object.clear()

            # Add trend display name
            result_df_temp = labeled_df_setup.result_df.copy()
            result_df = models.replaceTrendDisplayName(result_df_temp)

            return jsonify(distance_heatmap_dict=distance_heatmap_dict,
                           result_df=result_df.to_json(orient='records'),
                           df=df,
                           overview_legend_types=overview_legend_types)

        # visualize.html 'Detect' button clicked
        if action == 'detect':
            distance_threshold = float(request.form['distance_threshold'])
            sg_strength_threshold = float(
                request.form['sg_strength_threshold'])
            agg_strength_threshold = float(
                request.form['agg_strength_threshold'])

            filter_object = request.form['filter_object']
            filter_object = json.loads(filter_object, cls=Decoder)
            trend_filter = filter_object['trend_type']

            if not trend_filter:
                # Default to detect all trend types from result_df
                trend_filter = list(
                    pd.unique(labeled_df_setup.result_df['trend_type']))

            sp_filter = {
                'name': 'SP',
                'distance': distance_threshold,
                'agg_trend_strength': agg_strength_threshold,
                'subgroup_trend_strength': sg_strength_threshold,
                'trend_type': trend_filter
            }

            # check if filter flag is True
            if filter_flag:
                # filtered, pass filter parameter
                detect_result = labeled_df_setup.get_SP_rows(
                    sp_filter,
                    independent=filter_object['independent'],
                    dependent=filter_object['dependent'],
                    group_feat=filter_object['group_feat'],
                    subgroup=filter_object['subgroup'],
                    trend_type=filter_object['trend_type'],
                    replace=True)
            else:
                # not filter
                detect_result = labeled_df_setup.get_SP_rows(sp_filter,
                                                             replace=True)

            # Generate distance heatmaps
            distance_heatmap_dict = models.getDistanceHeatmapDict(
                labeled_df_setup, detect_result)

            # Extract overview legend types
            overview_legend_types = models.getOverviewLegendType(
                distance_heatmap_dict)

            df = labeled_df_setup.df.to_dict(orient='records')
            df = json.dumps(df, indent=2)

            # Add trend display name
            detect_result = models.replaceTrendDisplayName(detect_result)

            return jsonify(distance_heatmap_dict=distance_heatmap_dict,
                           result_df=detect_result.to_json(orient='records'),
                           df=df,
                           overview_legend_types=overview_legend_types)

        # visualize.html 'Rank' button clicked
        if action == 'rank':

            agg_type = request.form['agg_type']
            score_col = request.form['score_col']

            view_score = agg_type + '_view_' + score_col
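            # e.g. agg_type 'sum' with score_col 'SP_thresh0.2' yields
            # 'sum_view_SP_thresh0.2', the column name add_view_score creates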

            # check if view score exists
            if not (view_score in labeled_df_setup.result_df.columns):
                # not exist, then add view score
                labeled_df_setup.add_view_score(score_col,
                                                agg_type=agg_type,
                                                colored=False)

            rank_result = labeled_df_setup.rank_occurences_by_view(
                view_score, score_col)

            # if filter_flag is True, filter the rank result
            if filter_flag:

                rank_result = labeled_df_setup.get_trend_rows(
                    independent=filter_object['independent'],
                    dependent=filter_object['dependent'],
                    group_feat=filter_object['group_feat'],
                    subgroup=filter_object['subgroup'],
                    trend_type=filter_object['trend_type'])

            # Generate distance heatmaps
            distance_heatmap_dict = models.getDistanceHeatmapDict(
                labeled_df_setup, rank_result)

            # Extract overview legend types
            overview_legend_types = models.getOverviewLegendType(
                distance_heatmap_dict)

            df = labeled_df_setup.df.to_dict(orient='records')
            df = json.dumps(df, indent=2)

            # Add trend display name
            rank_result = models.replaceTrendDisplayName(rank_result)

            return jsonify(distance_heatmap_dict=distance_heatmap_dict,
                           result_df=rank_result.to_json(orient='records'),
                           df=df,
                           overview_legend_types=overview_legend_types)
def test_basic_load_df_wages():
    # We'll first load in some data; this has both regression and rate type
    # trends. We will load it two ways and check that the structure is the same

    # In[2]:

    labeled_df_file = wg.LabeledDataFrame(
        'data/wages_gender_rank_time_regression2/df.csv')

    # In[3]:

    labeled_df_dir = wg.LabeledDataFrame(
        'data/wages_gender_rank_time_regression2')

    # In[4]:

    assert np.product(labeled_df_file.df.columns == labeled_df_dir.df.columns)

    # In[5]:

    assert labeled_df_file.df.shape == labeled_df_dir.df.shape

    # In[6]:

    compare_df = labeled_df_file.df == labeled_df_dir.df
    assert np.product(compare_df.sum() == len(labeled_df_file.df))

    # Next, we can infer the variable types and assign the roles, then check
    # that those match what was read from the saved copy

    # In[7]:

    labeled_df_file.infer_var_types()

    roles = {
        'department': ['independent', 'splitby'],
        'year': ['independent'],
        'pay': ['dependent'],
        'gender': ['independent', 'splitby']
    }

    var_types = {'gender': 'categorical'}
    labeled_df_file.set_counts(
        {var: False
         for var in labeled_df_file.df.columns})
    labeled_df_file.set_roles(roles)
    labeled_df_file.set_var_types(var_types)

    assert np.product(
        labeled_df_file.meta_df.columns == labeled_df_dir.meta_df.columns)

    assert labeled_df_file.meta_df.shape == labeled_df_dir.meta_df.shape

    compare_meta_df = labeled_df_file.meta_df.dropna(
        axis=1) == labeled_df_dir.meta_df.dropna(axis=1)
    assert np.product(compare_meta_df.sum() == len(labeled_df_dir.meta_df))
    # compare_meta_df
    # labeled_df_dir.meta_df.dropna(axis=1)

    # Now that we've set this up, we can also save these configurations to
    # load them in directly in the future

    assert labeled_df_file.to_csvs('data/wages_test')

    # Now confirm that all the files were written correctly.

    assert sorted(os.listdir('data/wages_test/')) == [
        'df.csv', 'meta.csv', 'result_df.csv'
    ]

    # to_csvs writes the three DataFrames each out to their own .csv file in
    # that directory. If that directory exists it will overwrite without
    # warning; if not, it also creates the directory.
    #
    # Now, we can also load the data back

    labeled_df = wg.LabeledDataFrame('data/wages_test')
    labeled_df.meta_df

    # And confirm that this is the same as what was written. First confirm
    # that the column headings are the same

    assert np.product(
        labeled_df.meta_df.columns == labeled_df_dir.meta_df.columns)

    # Then confirm the shape is the same

    assert labeled_df.meta_df.shape == labeled_df_dir.meta_df.shape

    # Then check that the non-NaN values are all the same; combined with the
    # above, the NaNs must be in the same locations, since np.nan == np.nan
    # evaluates to False

    # In[18]:

    compare_meta_df = labeled_df.meta_df.dropna(
        axis=1) == labeled_df_dir.meta_df.dropna(axis=1)
    assert np.product(compare_meta_df.sum() == len(labeled_df_dir.meta_df))
    # compare_meta_df
    # labeled_df_dir.meta_df.dropna(axis=1)

    # In[19]:

    assert np.product(labeled_df.df.columns == labeled_df_dir.df.columns)

    # In[20]:

    assert labeled_df.df.shape == labeled_df_dir.df.shape

    # In[21]:

    compare_df = labeled_df.df.dropna(axis=1) == labeled_df_dir.df.dropna(
        axis=1)
    assert np.product(compare_df.sum() == len(labeled_df_dir.df))
    # compare_meta_df
    # labeled_df_dir.meta_df.dropna(axis=1)

    # In[22]:

    intersect_cols = ['gender', 'department']
    labeled_df.add_intersectional(intersect_cols)

    # Now check that the intersectional column was built correctly
    # In[23]:

    intersectional_col_name = '_'.join(intersect_cols)

    def intersectional_correct(row):
        # the intersectional column should be the '_'-join of its parts
        return row[intersectional_col_name] == '_'.join(
            [row[icol] for icol in intersect_cols])

    icol_correct = labeled_df.df.apply(intersectional_correct, axis=1)
    assert np.product(icol_correct)

    # In[24]:

    labeled_df.add_quantile(['pay'])

    q_limits = np.quantile(
        labeled_df.df['pay'],
        [.25, .75, 1],
    )
    limits = {n: q for n, q in zip(['low', 'mid', 'high'], q_limits)}
    for q, df in labeled_df.df.groupby('payquantiles'):
        a = df['pay'] <= limits[q]
        assert np.product(a)

    # In[26]:

    assert labeled_df.get_vars_per_type('categorical') == [
        'department', 'gender', 'gender_department', 'payquantiles'
    ]

    assert labeled_df.meta_df.loc['gender_department', 'dtype'] == 'object'
    assert labeled_df.meta_df.loc['gender_department',
                                  'var_type'] == 'categorical'
    assert labeled_df.meta_df.loc['gender_department', 'role'] == 'splitby'
    assert labeled_df.meta_df.loc['gender_department', 'isCount'] == False

    # Check the utility functions

    # In[29]:

    assert labeled_df.get_vars_per_role('splitby') == [
        'department', 'gender', 'gender_department', 'payquantiles'
    ]
    assert labeled_df.get_vars_per_role('independent') == [
        'year', 'department', 'gender'
    ]
    assert labeled_df.get_vars_per_role('dependent') == ['pay']

    # In[30]:

    assert labeled_df.get_data_sample() == [
        'Max: 51.04 Min: 13.52', 'Max: 50.0 Min: 0.0',
        'Support, Sales, Management, R&D', 'F, M',
        'F_Support, M_Support, M_Sales, F_Sales, M_Management',
        'mid, low, high'
    ]

    # In[31]:

    assert labeled_df.get_vars_per_type('categorical') == [
        'department', 'gender', 'gender_department', 'payquantiles'
    ]
    assert labeled_df.get_vars_per_type('continuous') == ['pay', 'year']

    # In[32]:

    assert labeled_df.get_vars_per_roletype('independent',
                                            'continuous') == ['year']
    assert labeled_df.get_vars_per_roletype(
        'independent', 'categorical') == ['department', 'gender']

    # # Using Trends
    #
    # Trend objects define their name, how to compute the trend, and how to
    # choose which variables to use.
    #
    # A planned extension will allow variable lists to be passed in to reduce
    # which trends are computed

    # In[33]:

    corrobj = wg.All_Pearson()
    corrobj.get_trend_vars(labeled_df)
    assert corrobj.regression_vars == [('year', 'pay')]
    assert len(corrobj.var_weight_list) == len(corrobj.regression_vars)
    assert corrobj.set_vars == True

    # In[34]:

    rankobj = wg.Mean_Rank_Trend()
    assert rankobj.get_trend_vars(labeled_df)
    assert rankobj.target == ['pay']
    assert rankobj.trendgroup == ['department', 'gender']
    assert rankobj.set_vars == True
    assert len(rankobj.var_weight_list) == len(rankobj.target)

    # In[35]:

    linreg_obj = wg.All_Linear_Trend()
    linreg_obj.get_trend_vars(labeled_df)
    assert linreg_obj.regression_vars == [('year', 'pay')]
    assert len(linreg_obj.var_weight_list) == len(linreg_obj.regression_vars)
    assert linreg_obj.set_vars == True

    # # Computing Trends on a LabeledDataFrame

    # There are two ways: we can use the default settings and pass the names
    # of the trend types, or we can pass trend objects

    # In[36]:

    labeled_df.get_subgroup_trends_1lev(['pearson_corr'])

    assert np.product(labeled_df.result_df.columns == [
        'independent', 'dependent', 'group_feat', 'subgroup', 'agg_trend',
        'agg_trend_strength', 'subgroup_trend', 'subgroup_trend_strength',
        'trend_type', 'comparison_type'
    ])

    # In[38]:

    # there are 10 fixed columns and the number of rows for this trend is below
    num_reg_pairs = 1
    num_depts = 4
    num_genders = 2
    num_quantiles = 3
    num_dept_genders = num_genders * num_depts
    num_pearson = num_reg_pairs * (num_depts + num_genders + num_dept_genders +
                                   num_quantiles)
    assert labeled_df.result_df.shape == (num_pearson, 10)

    # Now we can use a list of objects and apply multiple trends

    # In[39]:

    labeled_df.get_subgroup_trends_1lev([rankobj, linreg_obj])

    num_lin = num_pearson
    num_gender_idep = num_depts + num_dept_genders + num_quantiles
    num_dept_indep = num_genders + num_dept_genders + num_quantiles
    num_rank = num_gender_idep + num_dept_indep
    total_rows_agg_sg = num_pearson + num_lin + num_rank
    assert labeled_df.result_df.shape == (total_rows_agg_sg, 10)

    # We can see what types of trends were computed from `result_df`

    # In[41]:

    assert np.product(
        pd.unique(labeled_df.result_df['trend_type']) ==
        ['pearson_corr', 'rank_trend', 'lin_reg'])

    # In[42]:

    assert pd.unique(
        labeled_df.result_df['comparison_type']) == ['aggregate-subgroup']

    # We can also add trends that are structured for pairwise comparisons

    # In[43]:

    labeled_df.get_pairwise_trends_1lev([rankobj, linreg_obj])

    # Again, check the infrastructure of this by verifying that the number of rows is correct

    # In[44]:

    num_dept_pairs = np.sum(list(range(num_depts)))
    num_gender_pairs = np.sum(list(range(num_genders)))
    num_dept_genders_pairs = np.sum(list(range(num_dept_genders)))
    num_quantile_pairs = np.sum(list(range(num_quantiles)))
    gender_indep_pairwise_rows = num_dept_pairs + num_dept_genders_pairs + num_quantile_pairs
    dept_indep_pairwise_rows = num_gender_pairs + num_dept_genders_pairs + num_quantile_pairs
    lin_reg_pairwise_rows = num_dept_pairs + num_gender_pairs + num_dept_genders_pairs + num_quantile_pairs
    rank_pairwise_rows = gender_indep_pairwise_rows + dept_indep_pairwise_rows
    total_rows = total_rows_agg_sg + lin_reg_pairwise_rows + rank_pairwise_rows
    assert labeled_df.result_df.shape == (total_rows, 13)

    # In[45]:

    assert list(pd.unique(labeled_df.result_df['comparison_type'])) == [
        'aggregate-subgroup', 'pairwise'
    ]

    # The object also stores the trend objects that have been applied; they
    # can be used for mapping to get the distance functions that are
    # appropriate for each trend

    # In[46]:

    labeled_df.trend_list

    # In[47]:

    # labeled_df.result_df['distance'] = labeled_df.result_df.apply(dist_helper,axis=1)
    labeled_df.add_distance(
        row_wise=True)  #('subgroup_trend','subgroup_trend2')
    assert labeled_df.result_df.shape == (total_rows, 14)

    # Each trend object has a trend_precompute dictionary as a property that
    # stores the intermediate values (tables of the weighted rates for ranks
    # and correlation matrices for Pearson correlation; TODO: what do we need
    # for linreg). These can be used in visualization.
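    # A hedged illustration of peeking at those intermediate values: iterate
    # the first applied trend's precompute dict (the keys are trend-specific,
    # so nothing is asserted here)
    for key, table in labeled_df.trend_list[0].trend_precompute.items():
        print(key, type(table))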

    # # Saving with trends

    # In[48]:

    assert labeled_df.save_all('data/wages_test_all')

    # In[49]:

    assert sorted(os.listdir('data/wages_test_all/')) == [
        'df.csv', 'meta.csv', 'result_df.csv', 'trends.json'
    ]

    # In[50]:

    labeled_df_tl = wg.LabeledDataFrame('data/wages_test_all')

    # That save function calls the save function tested above; we only need
    # to test that the trend list loaded correctly

    # In[51]:

    labeled_df.trend_list[0].trend_precompute

    # In[52]:

    labeled_df_tl.trend_list[0].trend_precompute

    # # Filtering

    # Test for each filter variable, one at a time and several pairs

    # In[53]:

    year_df = labeled_df.get_trend_rows(independent='year')
    pay_df = labeled_df.get_trend_rows(dependent='pay')
    dept_df = labeled_df.get_trend_rows(group_feat='department')
    mgmt_df = labeled_df.get_trend_rows(subgroup='Management')
    sales_df = labeled_df.get_trend_rows(subgroup2='Sales')
    linreg_df = labeled_df.get_trend_rows(trend_type='lin_reg')
    pair_df = labeled_df.get_trend_rows(comparison_type='pairwise')

    # TODO: manually verify these counts

    # In[54]:

    assert len(year_df) == 72
    assert len(pay_df) == 169
    assert len(dept_df) == 24
    assert len(mgmt_df) == 12
    assert len(sales_df) == 4
    assert len(linreg_df) == 55

    assert len(pair_df) == lin_reg_pairwise_rows + rank_pairwise_rows

    # Now test two conditions and passing a list to a condition

    # In[55]:

    y_sm_df = labeled_df.get_trend_rows(independent='year',
                                        subgroup=['Management', 'Sales'])
    pay_rank = labeled_df.get_trend_rows(dependent='pay',
                                         trend_type='rank_trend')

    # We can also filter based on SP detections with `get_SP_rows`

    # In[56]:

    labeled_df.get_SP_rows(thresh=.2)

    # In[57]:

    assert labeled_df.result_df.shape == (total_rows, 15)

    # ## Detection
    #
    # Detection via `get_SP_rows` happens in two steps:
    # 1. label the rows
    # 2. filter by that column to return
    #
    # Labeling the rows can happen in a number of ways: the detection accepts
    # several forms of input, and custom detections can be built in many ways

    # When filter_thresh is a dictionary, the filtering happens by taking the
    # intersection of the rows passing each threshold provided (the
    # rank_only_qual dictionary below shows this form). Some defaults are
    # also built in, accessible by string.

    # In[58]:

    labeled_df.get_SP_rows('default_qual_sp')
    assert labeled_df.result_df.shape == (total_rows, 16)

    # Basic type checks on detections, TODO: accuracy on detections

    # In[59]:

    assert labeled_df.result_df['SP_thresh0.2'].dtype == bool
    assert labeled_df.result_df['default_qual_sp'].dtype == bool

    # In[60]:

    labeled_df.get_SP_rows('SP')
    assert labeled_df.result_df.shape == (total_rows, 17)
    assert labeled_df.result_df['SP'].dtype == bool

    # We can also define our own detection filters, using any available column

    # In[61]:

    rank_only_qual = {
        'name': 'rank_only_qual_sp',
        'distance': .2,
        'agg_trend_strength': .05,
        'subgroup_trend_strength': .05,
        'trend_type': 'rank_trend'
    }
    labeled_df.get_SP_rows(rank_only_qual, replace=True)
    assert labeled_df.result_df.shape == (total_rows, 18)

    # # Ranking

    # In[62]:

    labeled_df.rank_occurences_by_view(ascending=False)
    assert labeled_df.result_df.shape == (total_rows, 19)

    labeled_df.add_view_score('SP_thresh0.2', agg_type='sum', colored=False)
    assert labeled_df.result_df.shape == (total_rows, 20)

    labeled_df.rank_occurences_by_view('sum_view_SP_thresh0.2', 'SP_thresh0.2')