Example No. 1
    def create_fixed_point_tex_table(self, top=5, ci=None, **kwargs):
        """
        Creates a tex file with a table that summarizes the variants.

        Parameters
        ----------
        top: int or None
            Number of variants to show. If None, all variants are shown.
        ci: float or None
            If a float, use this as the confidence level. If None, skip the confidence interval.

        Other Parameters
        ----------------
        kwargs: dict
            Parameters that are passed to cjo.functions.TexFunctions.df_to_table

        Raises
        ------
        AssertionError
            If the confidence level is not in [0,1]
        """
        if top is None:
            top = len(self.unique_fp_ids)
        df = self.summary.head(top).copy()
        if ci is not None:
            assert 0 <= ci <= 1, 'ci must be in [0,1]'
            z = stats.norm.interval(ci, 0, 1)[1]  # two-sided z-score for the given confidence level
            # CI half-width: z * std / sqrt(n)
            ci_value = df[IEM + '_std'] / df[SUPPORT].apply(lambda n: n ** 0.5) * z
            df[IEM] = df[IEM].apply(lambda m: f'{m:.2f}') + '$\\pm$' + ci_value.apply(
                lambda c: f'{c:.2f}')

        df.drop(columns=IEM + '_std', inplace=True)

        df.rename(columns={
            IEM: r'$\Phi$',
        }, inplace=True)
        df = df.reset_index(drop=False).set_index([fp_id, SUPPORT, r'$\Phi$'])

        # Format every weight to two decimals and bold the highest weight per variant
        for i, r in df.iterrows():
            max_weight = r.idxmax()
            for sc, w in r.items():
                df.loc[i, sc] = f'{w:.2f}'
                if sc == max_weight:
                    df.loc[i, sc] = f'\\textbf{{{df.loc[i, sc]}}}'
        kwargs.setdefault('label', 'tab:res:variant')
        kwargs.setdefault(
            'caption',
            'Weights of most frequent variants, showing the number of repetitions and IEM. '
            'The highest weight in each variant is highlighted.')
        kwargs.setdefault('escape', False)
        tex_functions.df_to_table(df.T,
                                  fn_out=self.fd_out / 'variants_table.tex',
                                  **kwargs)
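
The notable pattern in this example is the confidence-interval formatting and the per-row bolding of the maximum weight. Below is a minimal, self-contained sketch of the same pattern; the column names ('iem', 'iem_std', 'support') are stand-ins for the repo constants IEM and SUPPORT.

import pandas as pd
from scipy import stats

# Toy stand-in for self.summary: a mean, its standard deviation, and the support (n)
df = pd.DataFrame({'iem': [0.82, 0.75], 'iem_std': [0.05, 0.08], 'support': [40, 25]},
                  index=['variant_a', 'variant_b'])

ci = 0.95
z = stats.norm.interval(ci, 0, 1)[1]                    # two-sided z-score
half_width = df['iem_std'] / df['support'] ** 0.5 * z   # z * std / sqrt(n)
df['iem'] = df['iem'].map('{:.2f}'.format) + '$\\pm$' + half_width.map('{:.2f}'.format)
print(df)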
Example No. 2
def run():
    # Load the IEM results per sample
    for sample in ['H']:
        df = dataframe_operations \
            .import_df(PKDD_PARAMETERS.RESULTS_3 / f'iem_{sample}.csv').set_index(DATASET_NAME)

        # Build one row per competitor: its IEM as a LaTeX confidence-interval string
        res = pd.DataFrame(columns=[IEM])
        for c in df.columns:
            res.loc[c, IEM] = confidence_interval.latex_string(df[c], 0.95, '{:.3f}', False)
        res = res.rename(columns={IEM: IEM_tex})
        tex_functions.df_to_table(res.reset_index(), escape=False, index=False,
                                  add_phantom=True, phantom_column_position=1, phantom_length=2,
                                  fn_out=PKDD_PARAMETERS.RESULTS_3 / f'competitor_analysis_{sample}.tex')
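
confidence_interval.latex_string is repo-internal and not shown in this listing. A rough stand-in, under the assumption that it formats the sample mean with a normal-approximation interval (the real helper may differ):

import pandas as pd
from scipy import stats

def latex_ci_string(values, confidence, fmt='{:.3f}'):
    # Assumed behaviour: 'mean \pm z * std / sqrt(n)' as a LaTeX math string
    values = pd.Series(values)
    z = stats.norm.interval(confidence, 0, 1)[1]
    half_width = z * values.std() / len(values) ** 0.5
    return f'${fmt.format(values.mean())} \\pm {fmt.format(half_width)}$'

print(latex_ci_string([0.71, 0.74, 0.69, 0.73], 0.95))  # mean with its 95% half-width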
Example No. 3
    def to_tex_table(self, fn_out=None, standalone=False, **kwargs):

        import string  # stdlib module, used for capwords below

        df = pd.DataFrame(columns=['Categories'])
        for k, v in self.hierarchy.items():
            df.loc[k, 'Categories'] = ", ".join([string.capwords(vi).replace('Np', 'NP') for vi in v])
        df.index.name = 'Super-Category'
        df = df.reset_index(drop=False).sort_values('Super-Category')

        kwargs.setdefault('column_format', 'lp{3.5in}')
        kwargs.setdefault('caption', 'Hierarchy used in the experiments. (N)P stands for (Non-)Perishable')
        kwargs.setdefault('index', False)
        # Use a name that does not shadow the `string` module imported above
        if standalone:
            tex_str = tex_functions.df_2_standalone_latex_str(df, **kwargs)
        else:
            tex_str = tex_functions.df_to_table(df, **kwargs)

        if fn_out is not None:
            with open(fn_out, 'w+') as wf:
                wf.write(tex_str)
        else:
            return tex_str
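
The helper classes are not shown in this listing, so here is a hypothetical, self-contained version of the same transformation, with pandas' built-in to_latex standing in for tex_functions.df_to_table and an invented two-entry hierarchy:

import string
import pandas as pd

hierarchy = {'food': {'dairy np', 'produce p'}, 'household': {'cleaning'}}  # hypothetical

df = pd.DataFrame(columns=['Categories'])
for k, v in hierarchy.items():
    # capwords turns 'dairy np' into 'Dairy Np'; the replace restores the NP casing
    df.loc[k, 'Categories'] = ", ".join(sorted(string.capwords(vi).replace('Np', 'NP') for vi in v))
df.index.name = 'Super-Category'
df = df.reset_index().sort_values('Super-Category')

print(df.to_latex(index=False, column_format='lp{3.5in}'))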
Example No. 4
def cluster_statistics_2_tex(cluster_statistics,
                             hierarchy,
                             fn_out,
                             df_ci=None,
                             inclusion_missing=True,
                             num_c=3,
                             **kwargs):
    """
    Generates a LaTeX table from cluster activation statistics

    Parameters
    ----------
    cluster_statistics: pd.DataFrame
        hierarchy x clusters DataFrame
    hierarchy: dict of (str) -> (set of str)
        The hierarchy
    fn_out: str or Path
        Output location
    df_ci: pd.DataFrame
        Of the same shape as cluster_statistics, but containing a standard deviation value for each entry
    inclusion_missing: bool
        If True, each value is preceded by True or False and is at least 50% (i.e. it gives the
        inclusion or the missing percentage, whichever is higher). If False, the inclusion percentages are given.
    num_c: int
        Number of top categories to show per cluster
    """
    # One decimal when confidence intervals are shown, none otherwise
    if df_ci is None:
        f = '.0f'
    else:
        f = '.1f'
    cat_list = hierarchy.c  # all categories (hierarchy.sc below holds the super-categories)
    df = pd.DataFrame()
    abbreviation_dict = dict()
    for cluster_name, stat in cluster_statistics.items():
        s = f'${stat[cluster_size]:{f}}'
        if df_ci is not None:
            s += f'\\pm{df_ci.loc[cluster_size, cluster_name]:{f}}'
        s += '$'
        df.loc[cluster_name, cluster_size] = s
        for sc in hierarchy.sc:

            if inclusion_missing:
                # (adapted) average
                if stat[sc] < 0.5:
                    s = f'\\texttt{{False}} : ${100 - stat[sc] * 100:{f}}'
                else:
                    s = f'\\texttt{{True}} : ${stat[sc] * 100:{f}}'
            else:
                s = f'${stat[sc] * 100:{f}}'

            # std
            if df_ci is not None:
                s += f'\\pm{100 * df_ci.loc[sc, cluster_name]:{f}}'

            # close
            s += '$\\%'
            df.loc[cluster_name, sc] = s

        df.loc[cluster_name, ''] = ''  # empty spacer column between super-categories and top categories

        for i, (k, v) in enumerate(stat[cat_list].sort_values(
                ascending=False).head(num_c).items()):

            # Make improvised multirow cells
            if df_ci is not None:
                df.loc[cluster_name, f'{cat} {i + 1}'] = f'\\texttt{{{category_abbreviation_dict[k]}}}'
                df.loc[cluster_name, f'{cat}|{i + 1}'] = \
                    f'${v * 100:{f}}\\pm{100 * df_ci.loc[k, cluster_name]:{f}}$\\%'
            else:
                df.loc[cluster_name, f'{cat} {i + 1}'] = \
                    f'\\texttt{{{category_abbreviation_dict[k]}}} : ${v * 100:{f}}$\\%'

            # save abbreviation for caption
            abbreviation_dict[k] = category_abbreviation_dict[k]

    df.index.name = 'Cluster'
    df.rename(index=lambda z: f'$K_{{{z}}}$', inplace=True)
    df.reset_index(drop=False, inplace=True)
    df.set_index(['Cluster', cluster_size], inplace=True)

    # Blank the headers of the value columns that form the improvised multirow cells
    if df_ci is not None:
        df.rename(columns={f'{cat}|{i + 1}': '' for i in range(num_c)}, inplace=True)

    df = df.T
    abbreviations = sorted(abbreviation_dict.keys(),
                           key=lambda z: abbreviation_dict[z])

    def fix_abb(abb):
        # Restore the (N)P suffix casing that capitalize() lowercased
        abb = abb.capitalize()
        if abb.endswith(' np'):
            abb = abb[:-2] + 'NP'
        elif abb.endswith(' p'):
            abb = abb[:-1] + 'P'
        return abb

    # Note the swapped names: `abbreviations` holds the cleaned full category names (sorted by
    # their abbreviation) and `categories` holds the abbreviations, so the zip pairs them correctly.
    abbreviations = [fix_abb(c) for c in abbreviations]
    categories = sorted(abbreviation_dict.values())

    caption = 'Descriptive Statistics of each cluster.' \
              ' The abbreviations are ' + \
              ', '.join([f'\\texttt{{{v}}}: {k}' for k, v in zip(abbreviations, categories)]) + '.'

    return tex_functions.df_to_table(df,
                                     caption=caption,
                                     fn_out=fn_out,
                                     escape=False,
                                     add_phantom=True,
                                     **kwargs), caption
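
The 'improvised multirow' trick above renames the value column of each (name, value) pair to an empty string, so that after transposing, the header cell is blank and the pair reads as one two-line cell. A tiny illustration with invented values, again using pandas' to_latex as a stand-in for tex_functions.df_to_table:

import pandas as pd

df = pd.DataFrame({'Cat 1': ['\\texttt{AB}'], 'Cat|1': ['$12.3\\pm1.0$\\%']}, index=['$K_{0}$'])
df = df.rename(columns={'Cat|1': ''})  # blank the header of the value column
print(df.T.to_latex(escape=False))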


def run():
    # Make results directory
    if not PKDD_PARAMETERS.RESULTS_1.exists():
        PKDD_PARAMETERS.RESULTS_1.mkdir(parents=True)

    # Get the names of the bootstrap results (each folder == one repetition == iterations until stability)
    repetition_folders_names = file_functions.list_dirs(
        PKDD_PARAMETERS.RESULTS_BOOTSTRAP('H'), False)

    # Progress updates
    ps = ProgressShower(total_steps=len(repetition_folders_names),
                        pre='Analyzing results for fixed points ')

    # Get all results: collect the per-dataset frames, then concatenate them
    # (DataFrame.append was removed in pandas 2.0)
    frames = []
    for folder_name in repetition_folders_names:
        ps.update_post(folder_name)
        frames.append(get_results_for_one_dataset(folder_name))
        ps.update()
    df = pd.concat(frames)
    # Fill missing values (some datasets might not have some weight vectors)
    all_sources = df[DATASET_NAME].unique()
    all_fpw_strings = df[fpw_str].unique()
    mi_all = pd.MultiIndex.from_product([all_sources, all_fpw_strings],
                                        names=[DATASET_NAME, fpw_str])
    df.set_index(keys=[DATASET_NAME, fpw_str], inplace=True)
    df = df.reindex(mi_all)
    df[SUPPORT] = df[SUPPORT].fillna(
        0) / PKDD_PARAMETERS.REPETITION_COUNT * 100

    # Final results (these get converted to the final table)
    res = pd.DataFrame()
    n_datasets = len(repetition_folders_names)

    # Compute the mean and 95%CI of the support
    res['mean'] = df.groupby(fpw_str)[SUPPORT].mean()
    res['std'] = df.groupby(fpw_str)[SUPPORT].std()
    res['CI95'] = confidence_interval.std_n_to_ci(res['std'], n_datasets, 0.95)
    res[SUPPORT] = res.apply(
        lambda r: f'${r["mean"]:.1f} \\pm {r["CI95"]:.1f}\\%$', axis=1)

    # Retrieve the weights
    res.index.name = fpw_str

    def create_df_row(fpws):
        # Parse 'name,weight;name,weight;...' into a one-row DataFrame of weights
        return pd.DataFrame(
            data={s.split(',')[0]: [float(s.split(',')[1])] for s in fpws.split(';')},
            index=[fpws])

    df_weights = pd.concat([create_df_row(x) for x in res.index], axis=0)
    df_weights = df_weights[sorted(df_weights.columns)]
    res = pd.merge(left=res,
                   right=df_weights,
                   left_index=True,
                   right_index=True)

    # Sort on support
    res.sort_values('mean', inplace=True, ascending=False)
    res.drop(columns=['mean', 'std', 'CI95'], inplace=True)
    res.rename(columns={ISC_WEIGHT: ISC_WEIGHT_LATEX}, inplace=True)

    # Generate a tex-file
    res.rename(columns={SUPPORT: SUPPORT.capitalize()}, inplace=True)
    res.set_index(SUPPORT.capitalize(), inplace=True)
    tex_functions.df_to_table(
        res.head(5).T,
        caption='The most frequently found weights, showing the number of repetitions that found '
                'this combination of weights, averaged over the 100 datasets, with their '
                '95\\%-confidence intervals.',
        label='tab:res:weights',
        add_phantom=True,
        column_format='l' * (2 + 5),
        fn_out=PKDD_PARAMETERS.RESULTS_1 / 'weights.tex',
        escape=False,
        floating='h!')
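
confidence_interval.std_n_to_ci is also repo-internal. Given that Example No. 1 obtains its z-score from stats.norm.interval, it plausibly returns the half-width z * std / sqrt(n); a sketch under that assumption:

from scipy import stats

def std_n_to_ci(std, n, confidence):
    # Assumed behaviour: half-width of a normal-approximation confidence interval.
    # The actual cjo helper may differ.
    z = stats.norm.interval(confidence, 0, 1)[1]
    return z * std / n ** 0.5

print(f'{std_n_to_ci(4.2, 100, 0.95):.2f}')  # 1.96 * 4.2 / 10, i.e. about 0.82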