예제 #1
0
    def test_copy_not_deep(self):
        """Check that a shallow copy is a new container that shares its frames."""
        rows = [
            ['John', 1, 5.0],
            ['Mary', 2, 4.0],
            ['Sally', 6, np.nan],
            ['Jeff', 3, 9.0],
            ['Edwin', 9, 1.0],
        ]
        expected = pd.DataFrame(rows,
                                columns=['string', 'numeric', 'numeric_missing'])

        dataset = {'frame': expected, 'name': 'test', 'path': 'foo'}
        container = DataContainer([dataset])
        new_container = container.copy(deep=False)

        # the copy must be a different container object ...
        assert_not_equal(id(new_container), id(container))

        # ... whose paths match and whose frames are the very same objects
        for name in new_container.keys():
            copied_frame = new_container.get_frame(name)
            copied_path = new_container.get_path(name)

            source_frame = container.get_frame(name)
            source_path = container.get_path(name)

            eq_(copied_path, source_path)
            assert_frame_equal(copied_frame, source_frame)
            assert_equal(id(copied_frame), id(source_frame))
    def test_copy_not_deep(self):
        """Verify ``copy(deep=False)`` returns a distinct container with shared frames."""
        column_names = ['string', 'numeric', 'numeric_missing']
        expected = pd.DataFrame(
            [['John', 1, 5.0], ['Mary', 2, 4.0], ['Sally', 6, np.nan],
             ['Jeff', 3, 9.0], ['Edwin', 9, 1.0]],
            columns=column_names)

        container = DataContainer([{
            'frame': expected,
            'name': 'test',
            'path': 'foo',
        }])
        new_container = container.copy(deep=False)

        # container identity differs between original and copy
        assert_not_equal(id(new_container), id(container))

        for name in new_container.keys():
            new_frame = new_container.get_frame(name)
            new_path = new_container.get_path(name)

            old_frame = container.get_frame(name)
            old_path = container.get_path(name)

            # same path, equal contents, and identical frame object
            eq_(new_path, old_path)
            assert_frame_equal(new_frame, old_frame)
            assert_equal(id(new_frame), id(old_frame))
    def test_drop_warning(self):
        """Drop a name the container does not hold, with warnings raised as errors."""
        only_dataset = {'frame': pd.DataFrame(), 'name': 'test'}
        container = DataContainer([only_dataset])

        # Inside this context every warning is turned into an exception, so
        # any warning issued by ``drop`` propagates out of the test.
        # NOTE(review): presumably an expected-exception decorator sits
        # outside this block -- confirm against the full test module.
        with warnings.catch_warnings():
            warnings.simplefilter('error')
            container.drop('flab')
예제 #4
0
    def test_drop_warning(self):
        """Call ``drop`` with an unknown name while warnings are escalated to errors."""
        empty_frame = pd.DataFrame()
        container = DataContainer([{'frame': empty_frame, 'name': 'test'}])
        unknown_name = 'flab'

        # Warnings become exceptions only for the duration of the ``with``
        # block; the previous filter state is restored afterwards.
        # NOTE(review): no expectation is asserted here -- presumably a
        # decorator outside this block declares the expected error; confirm.
        with warnings.catch_warnings():
            warnings.simplefilter('error')
            container.drop(unknown_name)
    def test_get_frames_no_match(self):
        """A suffix that matches no dataset name yields an empty dictionary."""
        dataset_names = ['include_this_one',
                         'include_this_one_not',
                         'we_want_this_one']
        container = DataContainer([{'frame': pd.DataFrame(), 'name': dataset_name}
                                   for dataset_name in dataset_names])

        # none of the names above ends in 'foo'
        matched = container.get_frames(suffix='foo')
        eq_(matched, {})
    def test_get_frames_by_suffix(self):
        """Only datasets whose names end with the given suffix are returned."""
        dataset_names = ['include_this_one',
                         'include_this_one_not',
                         'we_want_this_one']
        container = DataContainer([{'frame': pd.DataFrame(), 'name': dataset_name}
                                   for dataset_name in dataset_names])

        matched = container.get_frames(suffix='one')

        # 'include_this_one_not' contains 'one' but does not end with it
        expected_names = ['include_this_one', 'we_want_this_one']
        eq_(sorted(matched.keys()), sorted(expected_names))
    def test_get_frames_by_prefix(self):
        """Only datasets whose names start with the given prefix are returned."""
        dataset_names = ['test_two', 'test_three', 'exclude']
        container = DataContainer([{'frame': pd.DataFrame(), 'name': dataset_name}
                                   for dataset_name in dataset_names])

        matched = container.get_frames(prefix='test')

        # compare order-independently by sorting both sides
        expected_names = ['test_two', 'test_three']
        eq_(sorted(matched.keys()), sorted(expected_names))
예제 #8
0
    def test_get_frames_by_suffix(self):
        """Check that ``get_frames(suffix=...)`` keeps only names ending in the suffix."""
        datasets = []
        for dataset_name in ('include_this_one',
                             'include_this_one_not',
                             'we_want_this_one'):
            datasets.append({'frame': pd.DataFrame(), 'name': dataset_name})
        container = DataContainer(datasets)

        selected = container.get_frames(suffix='one')

        # sort both sides so that the comparison ignores ordering
        wanted = sorted(['include_this_one', 'we_want_this_one'])
        eq_(sorted(selected.keys()), wanted)
예제 #9
0
    def test_get_frames_no_match(self):
        """Check that ``get_frames`` returns ``{}`` when no name has the suffix."""
        datasets = []
        for dataset_name in ('include_this_one',
                             'include_this_one_not',
                             'we_want_this_one'):
            datasets.append({'frame': pd.DataFrame(), 'name': dataset_name})
        container = DataContainer(datasets)

        # 'foo' is not a suffix of any of the three names
        selected = container.get_frames(suffix='foo')
        eq_(selected, {})
예제 #10
0
    def test_get_frames_by_prefix(self):
        """Check that ``get_frames(prefix=...)`` keeps only names starting with the prefix."""
        datasets = []
        for dataset_name in ('test_two', 'test_three', 'exclude'):
            datasets.append({'frame': pd.DataFrame(), 'name': dataset_name})
        container = DataContainer(datasets)

        selected = container.get_frames(prefix='test')

        # sort both sides so that the comparison ignores ordering
        wanted = sorted(['test_two', 'test_three'])
        eq_(sorted(selected.keys()), wanted)
    def test_get_frames_both_suffix_and_prefix(self):
        """A frame is returned only when its name matches both prefix and suffix."""
        dataset_names = ['include_this_frame',
                         'include_it',
                         'exclude_frame',
                         'include_this_other_frame']
        container = DataContainer([{'frame': pd.DataFrame(), 'name': dataset_name}
                                   for dataset_name in dataset_names])

        matched = container.get_frames(prefix='include', suffix='frame')

        # 'include_it' fails the suffix and 'exclude_frame' fails the prefix
        expected_names = ['include_this_frame', 'include_this_other_frame']
        eq_(sorted(matched.keys()), sorted(expected_names))
예제 #12
0
    def test_get_frames_both_suffix_and_prefix(self):
        """Check ``get_frames`` when a prefix and a suffix are given together."""
        datasets = []
        for dataset_name in ('include_this_frame',
                             'include_it',
                             'exclude_frame',
                             'include_this_other_frame'):
            datasets.append({'frame': pd.DataFrame(), 'name': dataset_name})
        container = DataContainer(datasets)

        selected = container.get_frames(prefix='include', suffix='frame')

        # only the two names satisfying both conditions are expected back
        wanted = sorted(['include_this_frame', 'include_this_other_frame'])
        eq_(sorted(selected.keys()), wanted)
예제 #13
0
    def test_rename(self):
        """After a rename, the frame is reachable as an attribute under the new name."""
        rows = [
            ['John', 1, 5.0],
            ['Mary', 2, 4.0],
            ['Sally', 6, np.nan],
            ['Jeff', 3, 9.0],
            ['Edwin', 9, 1.0],
        ]
        expected = pd.DataFrame(rows,
                                columns=['string', 'numeric', 'numeric_missing'])

        dataset = {'frame': expected, 'name': 'test'}
        container = DataContainer([dataset])
        container.rename('test', 'flerf')

        # the renamed dataset must still hold the original data
        renamed_frame = container.flerf
        assert_frame_equal(renamed_frame, expected)
예제 #14
0
    def test_rename_with_path(self):
        """The path stored for a dataset is still returned under its new name."""
        rows = [
            ['John', 1, 5.0],
            ['Mary', 2, 4.0],
            ['Sally', 6, np.nan],
            ['Jeff', 3, 9.0],
            ['Edwin', 9, 1.0],
        ]
        expected = pd.DataFrame(rows,
                                columns=['string', 'numeric', 'numeric_missing'])

        dataset = {'frame': expected, 'name': 'test', 'path': 'foo'}
        container = DataContainer([dataset])
        container.rename('test', 'flerf')

        # look the path up via the new name
        renamed_path = container.get_path('flerf')
        eq_(renamed_path, 'foo')
    def test_rename(self):
        """Check that ``rename`` exposes the same frame under the new name."""
        column_names = ['string', 'numeric', 'numeric_missing']
        expected = pd.DataFrame(
            [['John', 1, 5.0], ['Mary', 2, 4.0], ['Sally', 6, np.nan],
             ['Jeff', 3, 9.0], ['Edwin', 9, 1.0]],
            columns=column_names)

        container = DataContainer([{
            'frame': expected,
            'name': 'test',
        }])
        container.rename('test', 'flerf')

        # attribute access with the new name returns the original data
        assert_frame_equal(container.flerf, expected)
    def test_rename_with_path(self):
        """Check that ``rename`` keeps the dataset's path under the new name."""
        column_names = ['string', 'numeric', 'numeric_missing']
        expected = pd.DataFrame(
            [['John', 1, 5.0], ['Mary', 2, 4.0], ['Sally', 6, np.nan],
             ['Jeff', 3, 9.0], ['Edwin', 9, 1.0]],
            columns=column_names)

        container = DataContainer([{
            'frame': expected,
            'name': 'test',
            'path': 'foo',
        }])
        container.rename('test', 'flerf')

        # the path registered for 'test' must now be returned for 'flerf'
        eq_(container.get_path('flerf'), 'foo')
예제 #17
0
    def test_data_container_save_files_with_id(self):
        """
        Check the files written by ``DataWriter.write_experiment_output``.

        A writer constructed with ``'test'`` saves the ``dataset1`` frame as
        JSON, CSV and XLSX. For the JSON output the frame is renamed to
        ``aaa`` through ``new_names_dict``. The test then checks that exactly
        the files ``test_aaa.json``, ``test_dataset1.csv`` and
        ``test_dataset1.xlsx`` were written, and that each one reads back
        equal to the original frame.

        The temporary directory is removed in a ``finally`` clause so that it
        does not linger when writing or reading fails.
        """
        data_sets = [{
            'name': 'dataset1',
            'frame': pd.DataFrame(np.random.normal(size=(100, 2)),
                                  columns=['A', 'B'])
        }, {
            'name': 'dataset2',
            'frame': pd.DataFrame(np.random.normal(size=(120, 3)),
                                  columns=['A', 'B', 'C'])
        }]

        container = DataContainer(data_sets)

        directory = 'temp_directory_save_files_with_id_xyz'
        os.makedirs(directory, exist_ok=True)

        try:
            writer = DataWriter('test')
            for file_type in ['json', 'csv', 'xlsx']:

                # only the JSON output is written under a new name
                if file_type == 'json':
                    extra_kwargs = {'new_names_dict': {'dataset1': 'aaa'}}
                else:
                    extra_kwargs = {}

                writer.write_experiment_output(directory,
                                               container,
                                               dataframe_names=['dataset1'],
                                               file_format=file_type,
                                               **extra_kwargs)

            aaa_json = pd.read_json(os.path.join(directory, 'test_aaa.json'))
            ds_1_csv = pd.read_csv(os.path.join(directory, 'test_dataset1.csv'))
            ds_1_xls = pd.read_excel(os.path.join(directory, 'test_dataset1.xlsx'))

            # capture the listing before the directory is deleted
            output_dir = os.listdir(directory)
        finally:
            # always clean up, even if a write or read above raised
            rmtree(directory)

        assert sorted(output_dir) == sorted(
            ['test_aaa.json', 'test_dataset1.csv', 'test_dataset1.xlsx'])

        assert_frame_equal(container.dataset1, aaa_json)
        assert_frame_equal(container.dataset1, ds_1_csv)
        assert_frame_equal(container.dataset1, ds_1_xls)
예제 #18
0
    def test_data_container_save_wrong_format(self):
        """
        Ask the writer to save a frame using the ``'html'`` file format.

        NOTE(review): nothing is asserted inside this block; presumably the
        call is expected to fail and an expected-exception decorator lives
        outside the visible code -- confirm against the full test module.
        """
        first_frame = pd.DataFrame(np.random.normal(size=(100, 2)),
                                   columns=['A', 'B'])
        second_frame = pd.DataFrame(np.random.normal(size=(120, 3)),
                                    columns=['A', 'B', 'C'])

        data_sets = [
            {'name': 'dataset1', 'frame': first_frame},
            {'name': 'dataset2', 'frame': second_frame},
        ]
        container = DataContainer(data_sets)

        # the directory is only named here; this test never creates it
        directory = 'temp_directory_container_save_wrong_format_xyz'

        writer = DataWriter()
        writer.write_experiment_output(
            directory,
            container,
            dataframe_names=['dataset1'],
            file_format='html')
예제 #19
0
    def read(self, kwargs_dict=None):
        """
        Read every file in ``self.dataset_paths`` into a DataContainer.

        Each path is paired by position with a name from
        ``self.dataset_names``. The file is loaded with
        ``self.read_from_file`` using the converter registered for that name
        in ``self.file_converters`` (``None`` if there is none).

        Parameters
        ----------
        kwargs_dict : dict of dicts, optional
            Maps a dataset name to the additional keyword arguments to use
            when reading that dataset. They are forwarded to
            ``self.read_from_file``; names without an entry get none.
            Defaults to None.

        Returns
        -------
        datacontainer : DataContainer
            A DataContainer built from ``self.datasets``.

        Raises
        ------
        FileNotFoundError
            If any of the paths does not exist.

        Notes
        -----
        Every dataset read is appended to ``self.datasets``, and this method
        does not clear that list first.
        """
        # treat a missing mapping as an empty one so the lookup is uniform
        per_dataset_kwargs = {} if kwargs_dict is None else kwargs_dict

        for idx, set_path in enumerate(self.dataset_paths):

            name = self.dataset_names[idx]
            converter = self.file_converters.get(name)

            # fail early with a clear message before trying to read the file
            if not exists(set_path):
                raise FileNotFoundError(
                    'The file {} does not exist'.format(set_path))

            kwargs = per_dataset_kwargs.get(name, {})
            dataframe = self.read_from_file(set_path, converter, **kwargs)

            # store under the whitespace-stripped name
            entry = {
                'name': name.strip(),
                'path': set_path,
                'frame': dataframe
            }
            self.datasets.append(entry)

        return DataContainer(self.datasets)
예제 #20
0
    def test_drop(self):
        """A dataset that has been dropped is no longer in the container."""
        only_dataset = {'frame': pd.DataFrame(), 'name': 'test'}
        container = DataContainer([only_dataset])

        container.drop('test')

        # membership is checked through the container's ``in`` support
        still_present = 'test' in container
        assert_false(still_present)
예제 #21
0
def get_fairness_analyses(df,
                          group,
                          system_score_column,
                          human_score_column='sc1',
                          base_group=None):
    """
    Compute fairness analyses described in `Loukina et al. 2019 <https://www.aclweb.org/anthology/W19-4401/>`_.

    The function computes how much variance group membership explains in
    overall score accuracy (osa), overall score difference (osd),
    and conditional score difference (csd). See the paper for more
    details.

    Parameters
    ----------
    df: pandas DataFrame
        A dataframe containing columns with numeric human scores,
        columns with numeric system scores and a column with
        group membership.
    group: str
        Name of the column containing group membership.
    system_score_column: str
        Name of the column containing system scores.
    human_score_column: str
        Name of the column containing human scores.
        Defaults to ``'sc1'``.
    base_group: str, optional
        Name of the group to use as the reference category.
        Defaults to ``None`` in which case the group with the
        largest number of cases will be used as the reference
        category. Ties are broken alphabetically.

    Returns
    -------
    model_dict: dictionary
        A dictionary with different proposed metrics as keys
        and fitted models as values.
    fairness_container: DataContainer
        A datacontainer with the following datasets:

         - "estimates_<METRIC>_by_<GROUP>" where "<GROUP>" corresponds to
           the given group and "<METRIC>" can be "osa", "osd" and "csd" estimates
           for each group computed by the respective models.
         - "fairness_metrics_by_<GROUP>" - a summary of model fits (R2 and
           p values).

    Notes
    -----
    The input dataframe is modified in place: the columns ``error``,
    ``SE``, ``group`` and ``sc1_cat`` are added to (or overwritten in)
    ``df``.
    """
    # imported locally so that the block does not depend on a module-level
    # import of ``warnings``
    import warnings

    # compute error and squared error
    df['error'] = df[system_score_column] - df[human_score_column]
    df['SE'] = df['error']**2

    # convert group values to category and reorder them using
    # the largest category as reference

    df['group'] = convert_to_ordered_category(df[group], base_group=base_group)
    # the reference category is the first one after reordering
    base_group = df['group'].cat.categories[0]

    df['sc1_cat'] = convert_to_ordered_category(df[human_score_column])

    # Overall score accuracy (OSA)
    # Variance in squared error explained by group membership

    # fit the model
    osa_model = smf.ols(formula='SE ~ group', data=df)
    osa_fit = osa_model.fit()

    # collect the results
    osa_dict = {'R2': osa_fit.rsquared_adj, 'sig': osa_fit.f_pvalue}

    osa_results = pd.Series(osa_dict, name='Overall score accuracy')

    df_coefficients_osa = get_coefficients(osa_fit, base_group)

    # Overall score difference (OSD)
    # variance in signed residuals (raw error) explained by group membership

    # fit the model
    osd_model = smf.ols(formula='error ~ group', data=df)
    osd_fit = osd_model.fit()

    # collect the results
    osd_dict = {'R2': osd_fit.rsquared_adj, 'sig': osd_fit.f_pvalue}

    osd_results = pd.Series(osd_dict, name='Overall score difference')

    df_coefficients_osd = get_coefficients(osd_fit, base_group)

    # conditional score difference CSD
    # Variance in score difference explained by group membership
    # after conditioning on the human score

    # fit "null" model with human score only
    csd_null_mod = smf.ols(formula='error ~ sc1_cat', data=df)
    csd_null_fit = csd_null_mod.fit()

    # fit model with both human score and group
    csd_mod = smf.ols(formula='error ~ group + sc1_cat', data=df)
    csd_fit = csd_mod.fit()

    # compare the two models using anova_lm
    # we filter warnings for this function because we get
    # runtime warning due to NaNs in the data.
    # these seem to be by design: https://groups.google.com/forum/#!topic/pystatsmodels/-flY0cNnb3k
    # The filter is scoped with ``catch_warnings`` so that the caller's
    # warning filters are restored afterwards instead of being wiped, and
    # so that we do not rely on ``np.warnings``, which newer NumPy
    # releases no longer provide.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        anova_results = anova_lm(csd_null_fit, csd_fit)

    # collect the results. Note that R2 in this case is a difference
    # in R2 between the two models and significance is obtained from anova
    # (last column of the row for the second, fuller model)
    csd_dict = {
        'R2': csd_fit.rsquared_adj - csd_null_fit.rsquared_adj,
        'sig': anova_results.values[1][-1]
    }
    csd_results = pd.Series(csd_dict, name='Conditional score difference')

    df_coefficients_csd = get_coefficients(csd_fit, base_group)

    # create a summary table

    df_r2_all = pd.concat([osa_results, osd_results, csd_results],
                          axis=1,
                          sort=True)
    df_r2_all['base_category'] = base_group

    # assemble all datasets into a DataContainer

    datasets = [{
        'name': 'estimates_osa_by_{}'.format(group),
        'frame': df_coefficients_osa
    }, {
        'name': 'estimates_osd_by_{}'.format(group),
        'frame': df_coefficients_osd
    }, {
        'name': 'estimates_csd_by_{}'.format(group),
        'frame': df_coefficients_csd
    }, {
        'name': 'fairness_metrics_by_{}'.format(group),
        'frame': df_r2_all
    }]

    # assemble all models into a dictionary
    model_dict = {'osa': osa_fit, 'osd': osd_fit, 'csd': csd_fit}

    return model_dict, DataContainer(datasets=datasets)
    def test_drop(self):
        """Check that ``drop`` removes the named dataset from the container."""
        empty_frame = pd.DataFrame()
        container = DataContainer([{'frame': empty_frame, 'name': 'test'}])

        container.drop('test')

        # the dropped name must no longer be found by the ``in`` operator
        is_member = 'test' in container
        assert_false(is_member)