Пример #1
0
    def execute(self):
        """Execute DfSummary

        Creates a report page for each variable in data frame.

        * create statistics object for column
        * create overview table of column variable
        * plot histogram of column variable
        * store plot

        Columns listed in ``self.columns`` that are not present in the input
        data are reported with a warning and removed from ``self.columns``.

        :returns: execution status code
        :rtype: StatusCode
        :raises RuntimeError: if no input data is found under ``read_key``
        :raises TypeError: if ``pages_key`` does not refer to a list
        """

        ds = ProcessManager().service(DataStore)

        # fetch and check input data frame
        data = ds.get(self.read_key, None)
        if data is None:
            self.log().critical(
                'No input data "%s" found in data store for %s', self.read_key,
                str(self))
            raise RuntimeError('no input data found for {}'.format(str(self)))
        self.assert_data_type(data)

        # pick up the list of report pages that this link appends to
        if self.pages_key:
            self.pages = ds.get(self.pages_key, [])
            if not isinstance(self.pages, list):
                raise TypeError(
                    'pages key "{}" does not refer to a list'.format(
                        self.pages_key))

        # determine all possible columns, used for comparison below
        all_columns = self.get_all_columns(data)
        if not self.columns:
            self.columns = all_columns

        # iterate over a copy, because missing columns are removed from
        # self.columns while looping
        for name in self.columns[:]:
            # check if column is in data frame
            if name not in all_columns:
                self.log().warning('Key "%s" not in input data; skipping',
                                   name)
                # list.remove() takes the value to drop, not its position
                self.columns.remove(name)
                continue
            self.log().debug('Processing "%s"', name)
            sample = self.get_sample(data, name)
            self.process_sample(name, sample)

        # add nan histogram to summary if present
        if self.nan_counts:
            nan_hist = self.nan_counts, self.columns
            self.process_nan_histogram(nan_hist, self.get_length(data))

        # storage
        if self.pages_key:
            ds[self.pages_key] = self.pages

        return StatusCode.Success
Пример #2
0
    def test_esk106(self):
        """Run the esk106 command-line-options tutorial with chain 0 disabled.

        Checks that the run succeeds, that only ``Chain1`` was set up, and that
        its first link carries the expected ``hello`` value.
        """
        config = ProcessManager().service(ConfigObject)
        config['logLevel'] = definitions.LOG_LEVELS['DEBUG']
        macro_path = config['esRoot'] + '/tutorials/esk106_cmdline_options.py'
        config['macro'] = macro_path

        # emulate "-c do_chain0=False" on the command line; the macro reads it
        config['do_chain0'] = False

        run_status = execution.run_eskapade(config)

        # re-fetch manager and services after the run
        proc_mgr = ProcessManager()
        config = ProcessManager().service(ConfigObject)
        ds = ProcessManager().service(DataStore)

        self.assertTrue(run_status.isSuccess())
        self.assertEqual(1, len(proc_mgr.chains))

        only_chain = proc_mgr.chains[0]
        self.assertEqual('Chain1', only_chain.name)
        self.assertEqual(False, config.get('do_chain0', True))
        self.assertEqual(True, config.get('do_chain1', True))
        self.assertEqual('Universe', only_chain.links[0].hello)
Пример #3
0
    def execute(self):
        """Execute DsObjectDeleter

        Removes objects from the data store, in this order:

        * keys listed in ``deletionKeys`` (if present in the store)
        * every object that is an instance of a class in ``deletionClasses``
        * everything except the keys in ``keepOnly`` (if ``keepOnly`` is non-empty)
        * everything (if ``clearAll`` is set)

        Nothing is deleted when the ``TESTING`` setting is truthy.

        :returns: execution status code
        :rtype: StatusCode
        """

        settings = ProcessManager().service(ConfigObject)
        ds = ProcessManager().service(DataStore)

        # used in code testing only
        if settings.get('TESTING'):
            self.log().warning(
                'Running in TESTING mode. NOT clearing datastore for testing purposes.'
            )
            return StatusCode.Success

        # delete specific items
        for key in self.deletionKeys:
            if key in ds:
                self.log().debug('Now deleting datastore object with key "%s"',
                                 key)
                del ds[key]

        # delete specific class types
        for cls in self.deletionClasses:
            # iterate over a snapshot of the keys: deleting from the store
            # while iterating over it directly raises RuntimeError
            for key in list(ds.keys()):
                if isinstance(ds[key], cls):
                    self.log().debug(
                        'Now deleting datastore object with key "%s"', key)
                    del ds[key]

        # delete all but specific items
        if len(self.keepOnly):
            for key in list(ds.keys()):
                if key not in self.keepOnly:
                    self.log().debug(
                        'Now deleting datastore object with key "%s"', key)
                    del ds[key]

        # delete all items in datastore
        if self.clearAll:
            for key in list(ds.keys()):
                self.log().debug('Now deleting datastore object with key "%s"',
                                 key)
                del ds[key]

        return StatusCode.Success
Пример #4
0
# Number of lines to pick up per iteration when chunking through an input file.
# NOTE(review): not used in the visible part of this macro; presumably consumed
# by a later example -- confirm.
chunksize = 5

#########################################################################################
# --- Set path of data
# Resolved through the persistence helper from the I/O configuration in settings.
data_path = persistence.io_path('data', settings.io_conf(), 'dummy.csv')

#########################################################################################
# --- now set up the chains and links, based on configuration flags

proc_mgr = ProcessManager()

# --- example 1: readdata loops over the input files, but no file chunking.
#     Enabled by default; switch off with the 'do_example1' setting.

if settings.get('do_example1', True):
    ch = proc_mgr.add_chain('MyChain1')

    # --- a loop is set up in the chain MyChain1.
    #     we iterate over (chunks of) the next file in the list until the iterator is done.
    #     then move on to the next chain (Overview)
    #     NOTE(review): the "Overview" chain is not defined in the visible part -- confirm.

    # --- readdata keeps on opening the next file in the file list.
    #     all kwargs are passed on to pandas file reader.
    readdata = analysis.ReadToDf(name='dflooper1',
                                 key='test1',
                                 sep='|',
                                 reader='csv',
                                 usecols=['x', 'y'])
    # the same file is listed three times, so the reader has three files to loop over
    readdata.path = [data_path] * 3
    readdata.itr_over_files = True
Пример #5
0
    def execute(self):
        """Execute CorrelationSummary

        Computes a correlation matrix for the input data frame with the
        configured ``method`` and saves it as an annotated heatmap.

        * fetch the input data frame and drop its all-NaN columns
        * compute the correlation matrix: ``'mutual_information'`` and
          ``'correlation_ratio'`` are computed here on the numerical columns,
          any other method is passed on to ``DataFrame.corr``
        * draw the matrix as a heatmap, annotated with the values, and save
          it as a PDF file in ``results_path``
        * store the matrix in the data store if ``write_key`` is set

        :returns: execution status code
        :rtype: StatusCode
        :raises RuntimeError: if no pandas data frame is found under ``read_key``
        """

        ds = ProcessManager().service(DataStore)

        import matplotlib.pyplot as plt
        from matplotlib import colors

        # fetch and check input data frame; the check comes before any use
        # of the object, so missing input gives the intended error
        df = ds.get(self.read_key, None)
        if not isinstance(df, pd.DataFrame):
            self.log().critical(
                'no Pandas data frame "%s" found in data store for %s',
                self.read_key, str(self))
            raise RuntimeError('no input data found for %s' % str(self))

        # drop all-nan columns right away (returns a new frame; the object in
        # the data store is not modified)
        df = df.dropna(how='all', axis=1)

        # compute correlations between all numerical variables
        self.log().debug('Computing "%s" correlations of dataframe "%s"',
                         self.method, self.read_key)

        # mutual info, from sklearn
        if self.method == 'mutual_information':
            # numerical columns only
            cols = df.select_dtypes(include=[np.number]).columns

            # initialize correlation matrix
            n = len(cols)
            cors = np.zeros((n, n))
            for i, c in enumerate(cols):
                # compare each column to all of the columns
                cors[i, :] = mutual_info_regression(df[cols], df[c])

            cors = pd.DataFrame(cors, columns=cols, index=cols)

        elif self.method == 'correlation_ratio':
            # numerical columns only
            cols = df.select_dtypes(include=[np.number]).columns

            # choose bins for each column
            bins = {c: len(np.histogram(df[c])[1]) for c in cols}

            # sort rows into bins
            for c in cols:
                df[str(c) + '_bin'] = pd.cut(df[c], bins[c])

            # initialize correlation matrix
            n = len(cols)
            cors = np.zeros((n, n))

            # overall statistics do not depend on the binning column; restrict
            # them to the numerical columns so the added '_bin' columns (and
            # any other non-selected columns) cannot leak into the result
            col_means = df[cols].mean()
            weighted_var_y = df[cols].count() * df[cols].var()

            for i, x in enumerate(cols):
                xbin = str(x) + '_bin'

                # definition from Wikipedia "correlation ratio"
                y_given_x = (df.groupby(xbin))[cols]
                weighted_var_y_bar = (y_given_x.count() *
                                      (y_given_x.mean() - col_means)**2).sum()

                cors[i, :] = weighted_var_y_bar / weighted_var_y

            cors = pd.DataFrame(cors, columns=cols, index=cols)

        else:
            cors = df.corr(method=self.method)
            cols = list(cors.columns)

        # set up heatmap of convenient size
        n_cols = len(cols)
        plot_size = max(n_cols / 1.8, 2)
        fig, ax = plt.subplots(figsize=(1.5 * plot_size, plot_size))

        # linear correlations range over [-1, 1], the others over [0, 1]
        vmin = -1 if self.method in LINEAR_CORRS else 0
        vmax = 1
        cmap = 'RdYlGn' if self.method in LINEAR_CORRS else 'YlGn'

        norm = colors.Normalize(vmin=vmin, vmax=vmax)
        img = ax.pcolormesh(cors,
                            cmap=cmap,
                            edgecolor='w',
                            linewidth=1,
                            norm=norm)

        # make plot look pretty
        ax.set_title('{0:s} correlations'.format(self.method.capitalize()))
        ax.set_yticks(np.arange(n_cols) + 0.5)
        ax.set_xticks(np.arange(n_cols) + 0.5)
        ax.set_yticklabels(cols, rotation='horizontal')
        ax.set_xticklabels(cols, rotation='vertical')
        fig.colorbar(img)

        # annotate with correlation values
        for i in range(n_cols):
            for j in range(n_cols):
                # purely positional lookup (row j, column i), independent of
                # the kind of column labels
                point = float(cors.iloc[j, i])
                label = 'NaN' if np.isnan(point) else '{0:.2f}'.format(point)
                # white text on the dark ends of the colour map
                white_cond = (point < 0.7 * vmin) or (
                    point >= 0.7 * vmax) or np.isnan(point)
                color = 'w' if white_cond else 'k'
                ax.annotate(label,
                            xy=(i + 0.5, j + 0.5),
                            color=color,
                            horizontalalignment='center',
                            verticalalignment='center')

        # save plots in file
        fname = '_'.join(
            ['correlations',
             self.read_key.replace(' ', ''), self.method]) + '.pdf'
        fpath = os.path.join(self.results_path, fname)
        self.log().debug('Saving correlation heatmap as %s', fpath)
        fig.savefig(fpath, bbox_inches='tight')
        # release the figure; pyplot keeps figures alive until closed
        plt.close(fig)

        # save correlations to datastore if requested
        if self.write_key:
            ds[self.write_key] = cors

        return StatusCode.Success
Пример #6
0
    def execute(self):
        """Execute CorrelationSummary

        Computes one correlation matrix per entry of ``self.methods`` for the
        input data frame and adds a report page for each of them.

        * fetch the input data frame and drop its all-NaN columns
        * for each method compute the correlation matrix:
          ``'mutual_information'`` and ``'correlation_ratio'`` are computed
          here on the numerical columns, any other method is passed on to
          ``DataFrame.corr``
        * plot each matrix as a heatmap saved in ``results_path``
        * append a page with the plot and a statistics table to ``self.pages``
        * store the list of matrices under ``store_key`` and the pages under
          ``pages_key`` if these keys are set

        :returns: execution status code
        :rtype: StatusCode
        :raises RuntimeError: if no pandas data frame is found under ``read_key``
        :raises AssertionError: if the data frame is empty or ``pages_key``
            does not refer to a list
        """

        ds = ProcessManager().service(DataStore)

        import matplotlib.pyplot as plt
        from matplotlib import colors

        # fetch and check input data frame; the check comes before any use
        # of the object, so missing input gives the intended error
        df = ds.get(self.read_key, None)
        if not isinstance(df, pd.DataFrame):
            self.log().critical(
                'no Pandas data frame "%s" found in data store for %s',
                self.read_key, str(self))
            raise RuntimeError('no input data found for %s' % str(self))

        # drop all-nan columns right away (returns a new frame; the object in
        # the data store is not modified)
        df = df.dropna(how='all', axis=1)

        # explicit raises instead of assert statements, so that the checks
        # are not stripped when Python runs with -O
        n_df = len(df.index)
        if not n_df:
            raise AssertionError(
                'Pandas data frame "%s" frame has zero length' % self.read_key)

        # create report pages
        if self.pages_key:
            self.pages = ds.get(self.pages_key, [])
            if not isinstance(self.pages, list):
                raise AssertionError(
                    'Pages key %s does not refer to a list' % self.pages_key)

        # below, create report pages
        # for each correlation create resulting heatmap
        cors_list = []

        for method in self.methods:
            # compute correlations between all numerical variables
            self.log().debug('Computing "%s" correlations of dataframe "%s"',
                             method, self.read_key)

            # mutual info, from sklearn
            if method == 'mutual_information':
                # numerical columns only
                cols = df.select_dtypes(include=[np.number]).columns

                # initialize correlation matrix
                n = len(cols)
                cors = np.zeros((n, n))
                for i, c in enumerate(cols):
                    # compare each column to all of the columns
                    cors[i, :] = mutual_info_regression(df[cols], df[c])

                cors = pd.DataFrame(cors, columns=cols, index=cols)

            elif method == 'correlation_ratio':
                # numerical columns only
                cols = df.select_dtypes(include=[np.number]).columns

                # choose bins for each column
                bins = {c: len(np.histogram(df[c])[1]) for c in cols}

                # sort rows into bins
                for c in cols:
                    df[str(c) + '_bin'] = pd.cut(df[c], bins[c])

                # initialize correlation matrix
                n = len(cols)
                cors = np.zeros((n, n))

                # overall statistics do not depend on the binning column;
                # restrict them to the numerical columns so the added '_bin'
                # columns (and any other non-selected columns) cannot leak
                # into the result
                col_means = df[cols].mean()
                weighted_var_y = df[cols].count() * df[cols].var()

                for i, x in enumerate(cols):
                    # definition from Wikipedia "correlation ratio"
                    xbin = str(x) + '_bin'
                    y_given_x = (df.groupby(xbin))[cols]
                    weighted_var_y_bar = (
                        y_given_x.count() *
                        (y_given_x.mean() - col_means)**2).sum()
                    cors[i, :] = weighted_var_y_bar / weighted_var_y

                cors = pd.DataFrame(cors, columns=cols, index=cols)

            else:
                cors = df.corr(method=method)
                cols = list(cors.columns)

            # replace column names with indices, as with numpy matrix, for plotting function below
            n = len(cols)
            cors.columns = range(n)

            # keep for potential later usage
            cors_list.append(cors)

            # plot settings; linear correlations range over [-1, 1], the
            # others over [0, 1]
            title = '{0:s} correlation matrix'.format(method.capitalize())
            vmin = -1 if method in LINEAR_CORRS else 0
            vmax = 1
            color_map = 'RdYlGn' if method in LINEAR_CORRS else 'YlGn'
            fname = '_'.join(
                ['correlations',
                 self.read_key.replace(' ', ''), method]) + '.pdf'
            fpath = os.path.join(self.results_path, fname)

            # create nice looking plot
            self.log().debug('Saving correlation heatmap as %s', fpath)
            visualization.vis_utils.plot_correlation_matrix(
                cors, cols, cols, fpath, title, vmin, vmax, color_map)

            # statistics table for report page
            # compare strings by value; identity of string objects is an
            # implementation detail
            if method != 'correlation_ratio':
                n_unique = (n * n - n) / 2
            else:
                n_unique = n * n
            if n > 0:
                flat = cors.values.ravel()
                stats = [('entries', n_df), ('bins', n * n),
                         ('unique', n_unique),
                         ('> 0', (flat > 0).sum()),
                         ('< 0', (flat < 0).sum()),
                         ('avg', np.average(flat)),
                         ('max', max(flat)),
                         ('min', min(flat))]
            else:
                stats = []
            stats_table = tabulate.tabulate(stats, tablefmt='latex')

            # add plot and table as page to report
            self.pages.append(
                self.page_template.replace('VAR_LABEL', title).replace(
                    'VAR_STATS_TABLE',
                    stats_table).replace('VAR_HISTOGRAM_PATH', fpath))

        # save correlations to datastore if requested
        if self.store_key:
            ds[self.store_key] = cors_list
        if self.pages_key:
            ds[self.pages_key] = self.pages

        return StatusCode.Success
Пример #7
0
The two flags below control whether chains are turned on or off. (default=on)
from the cmd line, control these with: 

-c do_chain0=False -c do_chain1=False

Try it; No Hello Worlds will be printed.
"""
log.info(msg)

#########################################################################################
# --- now set up the chains and links based on configuration flags

proc_mgr = ProcessManager()

# One entry per optional chain:
# (settings flag, chain name, link name, value of the link's "hello" attribute).
# Every flag defaults to True, so a chain is built unless it is switched off.
chain_specs = (
    ('do_chain0', 'Chain0', 'hello0', 'Town'),
    ('do_chain1', 'Chain1', 'hello1', 'Universe'),
)

for flag, chain_name, link_name, greeting in chain_specs:
    if not settings.get(flag, True):
        continue
    ch = proc_mgr.add_chain(chain_name)
    link = core_ops.HelloWorld(name=link_name)
    link.hello = greeting
    ch.add_link(link)

#########################################################################################

log.debug('Done parsing configuration file esk106_cmdline_options')