Пример #1
0
def _make_data(original_data,
               target_data=None,
               features=None,
               target_from_data=False,
               weights_ratio=0,
               weights_original=None,
               weights_target=None):
    """Return the concatenated data, weights and labels for classifier training.

    Thin wrapper around ``original_data.make_dataset`` that additionally allows
    other weights to be used for the duration of the call. The weights stored
    in the data objects are put back afterwards, also if the dataset creation
    raises.

    Parameters
    ----------
    original_data : |hepds_type|
        The data whose ``make_dataset`` method is called.
    target_data : |hepds_type| or None
        Passed as first argument to ``make_dataset``. Has to be given if
        *weights_target* is used.
    features : list(str) or None
        Forwarded to ``make_dataset`` as *columns*.
    target_from_data : boolean
        Forwarded to ``make_dataset`` as *targets_from_data*.
    weights_ratio : numerical
        Forwarded to ``make_dataset`` as *weights_ratio*.
    weights_original : array-like or None
        If not None, temporarily replaces the weights of *original_data*.
    weights_target : array-like or None
        If not None, temporarily replaces the weights of *target_data*.

    Return
    ------
    out
        Whatever ``original_data.make_dataset`` returns.
    """
    # the stored weights are remembered here if they get replaced temporarily
    temp_ori_weights = None
    temp_tar_weights = None

    try:
        # make temporary weights if specific weights are given as parameters
        if not dev_tool.is_in_primitive(weights_original, None):
            temp_ori_weights = original_data.weights
            original_data.set_weights(weights_original)
        if not dev_tool.is_in_primitive(weights_target, None):
            temp_tar_weights = target_data.weights
            target_data.set_weights(weights_target)

        # create the data, target and weights
        data_out = original_data.make_dataset(target_data,
                                              columns=features,
                                              targets_from_data=target_from_data,
                                              weights_ratio=weights_ratio)
    finally:
        # reassign the stored weights if specific weights have been used;
        # each storage gets its *own* weights back
        if not dev_tool.is_in_primitive(temp_ori_weights, None):
            original_data.set_weights(temp_ori_weights)
        if not dev_tool.is_in_primitive(temp_tar_weights, None):
            target_data.set_weights(temp_tar_weights)

    return data_out
Пример #2
0
    def _make_df(self, columns=None, index=None, copy=False):
        """Return a DataFrame from the internal data. Does some dirty, internal work.

        Parameters
        ----------
        columns : str or list(str) or None
            The columns to return. If None, ``self.columns`` is used.
        index : list or None
            The row labels to return. If None, ``self._index`` is used.
        copy : boolean
            Only used for data of type 'array': forwarded to the
            :py:class:`~pandas.DataFrame` constructor.

        Return
        ------
        out : :py:class:`~pandas.DataFrame`
            The selected data with the columns renamed according to
            ``self.column_alias`` (if that is a non-empty dict).

        Raises
        ------
        NotImplementedError
            If ``self._data_type`` is none of 'root', 'array', 'df'.
        """
        # initialize data
        data = self._data
        columns = self.columns if columns is None else data_tools.to_list(columns)
        index = self._index if index is None else data_tools.to_list(index)

        if self._data_type == 'root':
            # update root dictionary: request *columns* as branches and fill
            # every None entry with the value from self.data
            temp_root_dict = dict(data, **{'branches': columns})
            for key, val in temp_root_dict.items():
                if dev_tool.is_in_primitive(val, None):
                    temp_root_dict[key] = self.data.get(key)
            data = data_tools.to_pandas(temp_root_dict, columns=columns)

        elif self._data_type == 'array':
            data = pd.DataFrame(data, index=index, columns=columns, copy=copy)
        elif self._data_type == 'df':
            if columns is not None:
                data = data[columns]
        else:
            raise NotImplementedError("Unknown/not yet implemented data type")

        assert isinstance(data, pd.DataFrame), "data did not convert correctly"
        # label based row selection; DataFrame.ix does not exist anymore in
        # pandas >= 1.0
        data = data if index is None else data.loc[index]

        if isinstance(self.column_alias, dict) and len(self.column_alias) > 0:
            # do not rename in place: *data* may still be the internally
            # stored frame, which must keep its original column names
            data = data.rename(columns=self.column_alias, copy=False)

        return data
Пример #3
0
    def _set_target(self, target, index=None):
        """Store *target* as the target of the data.

        Array-like targets (list, numpy array, pandas Series) are copied into
        a pandas Series using *index* (``self._index`` if no index is given)
        and sorted by that index. Any other value is stored unchanged.

        Attention with Series: their index must be the same as the data-index.
        """
        if dev_tool.is_in_primitive(index):
            index = self._index

        array_like = (list, np.ndarray, pd.Series)
        if isinstance(target, array_like):
            target = pd.Series(target, index=index, copy=True).sort_index()

        self._target = target
Пример #4
0
    def _get_targets(self, index=None):
        """Return the targets, either as pandas Series or as primitive type.

        If *index* is None, ``self._index`` is used for the selection. A
        primitive target (-1, 0, 1 or None) is returned as it is; the same
        holds for any target if there is no index at all. Otherwise the
        entries of the stored target at *index* are returned.
        """
        # assign defaults
        if index is None:
            index = self._index
        else:
            index = list(index)

        nothing_to_select = index is None
        if nothing_to_select or dev_tool.is_in_primitive(self._target, (-1, 0, 1, None)):
            return self._target

        return self._target.loc[index]
Пример #5
0
    def _set_weights(self, sample_weights, index=None):
        """Set the weights, for all events or only for the ones at *index*.

        Primitive weights (None or 1) covering the whole data are stored as
        the primitive 1. Everything else is converted to a pandas Series and
        either replaces the stored weights or updates the entries at *index*.
        """
        if index is None:
            index = self.index
        length = len(index) if index is not None else len(self)

        if dev_tool.is_in_primitive(sample_weights, (None, 1)):
            covers_all_data = index is None or len(self) == len(index)
            if covers_all_data:
                # nothing but ones: the primitive is enough
                self._weights = 1
                return
            sample_weights = pd.Series(np.ones(len(index)), index=index)
        elif isinstance(sample_weights, pd.Series):
            sample_weights = sample_weights[index]
        else:
            sample_weights = pd.Series(sample_weights, index=index, dtype='f8')

        if index is None and len(self) == length:
            # the new weights replace the old ones completely
            self._weights = sample_weights
            return

        # only a part is set: make sure there is a Series to update
        if dev_tool.is_in_primitive(self._weights, (None, 1)):
            self._weights = pd.Series(np.ones(len(self)), index=self._index)
        self._weights.update(sample_weights)
Пример #6
0
def similar_dist(predictions, weights=None, true_y=1, threshold=0.5):
    """Metric to evaluate the predictions on one label only for similarity test.

    This metric is used inside the mayou_score

    Parameters
    ----------
    predictions : :py:class:`~np.array`
        The predictions. They are not modified, the computation is done on
        a copy.
    weights : array-like or None
        The weights for the predictions. If None, every prediction has the
        weight 1.
    true_y : {0 , 1}
        The "true" label of the data
    threshold : float
        The threshold for the predictions to decide whether a point belongs
        to 0 or 1.

    Return
    ------
    out : float
        The score, normalized by the sum of the weights.

    Raises
    ------
    ValueError
        If the predictions do not lie on both sides of the threshold.
    """
    # HACK
    scale = 2  # otherwise, the predictions will be [-0.5, 0.5]
    # HACK END

    # float copy: protects the array of the caller from the arithmetic below
    # and makes integer or list input work as well
    predictions = np.array(predictions, dtype=float)

    data_valid = predictions.min() < threshold < predictions.max()
    if not data_valid:
        raise ValueError("Predictions are all above or below the threshold")

    if true_y == 0:
        predictions = 1 - predictions

    predictions = (predictions - threshold) * scale
    is_true = predictions > 0
    is_false = predictions <= 0
    true_pred = predictions[is_true]
    false_pred = predictions[is_false] * -1

    if weights is None:
        # unit weights: their sum is simply the number of predictions
        true_weights = false_weights = 1
        weights_sum = predictions.size
    else:
        weights = np.asarray(weights, dtype=float)
        true_weights = weights[is_true]
        false_weights = weights[is_false]
        weights_sum = weights.sum()

    score = np.sum(((np.exp(1.3 * np.square(true_pred + 0.6)) - 1.5969) * 0.5) * true_weights)
    score -= np.sum(((np.sqrt(false_pred) - np.power(false_pred, 0.8)) * 2) * false_weights)
    score /= weights_sum

    return score
Пример #7
0
    def _get_weights(self, index=None,  normalize=True):
        """Return pandas Series of weights or None, 1.

        Parameters
        ----------
        index : list or None
            The labels of the weights to return. If None, ``self._index`` is
            used.
        normalize : boolean or float or None
            If True, the mean of the returned weights is 1; if a number, the
            mean is that number. If False, 0 or None, the weights are
            not normalized.

        Return
        ------
        out : :py:class:`~pandas.Series` or None or 1
            A copy of the weights. The primitive (None or 1) is only returned
            if the stored weights are primitive and *normalize* is True.
        """
        # initialize values
        index = self._index if index is None else list(index)
        length = len(self) if index is None else len(index)

        # TODO: allow other primitive weights
        if dev_tool.is_in_primitive(self._weights, (None, 1)):
            # NOTE(review): the former condition
            # `normalize != 1 or normalize is not True` is equivalent to
            # `normalize is not True`; this behavior is kept. If the number 1
            # was meant to return the primitive as well, this has to be
            # changed together with the callers.
            if normalize is True:
                return copy.deepcopy(self._weights)
            weights_out = pd.Series(np.ones(length), index=index)
        elif index is None:
            weights_out = copy.deepcopy(self._weights)
        else:
            weights_out = copy.deepcopy(self._weights.loc[index])

        # None, False and 0 all mean "no normalization"; comparing None with
        # a number would raise a TypeError in Python 3
        if normalize:
            wanted_mean = 1 if normalize is True else normalize
            weights_out *= wanted_mean / weights_out.mean()

        return weights_out
Пример #8
0
    def plot(self, figure=None, columns=None, index=None, title=None, sub_title=None,
             data_name=None, bins=None, log_y_axes=False, plot_range=None, x_label=None,
             y_label="probability density", sample_weights=None, importance=3,
             see_all=False, hist_settings=None, figure_kwargs=None):
        """Draw histograms of the data.

        .. warning:: Only 99.98% of the newest plotted data will be shown to focus
           on the essential parts (the axis limits will be set accordingly).
           This implies a risk of cutting the previously (in the same figure)
           plotted data (mostly, if they do not overlap a lot). To ensure that
           all data is plotted, set *see_all* to *True*.

        Parameters
        ----------
        figure : str or int
            The name of the figure. If the figure already exists, the plots
            will be plotted in the same window (can be intentional, for
            example to compare data). If None, the next free figure number
            is used.
        columns : str or list(str, str, str, ...)
            The columns of the data to be plotted. If None, all are plotted.
        index : |index_type|
            |index_docstring|
        title : str
            | The title of the whole plot (NOT of the subplots). If several
              titles for the same figures are given, they will be *concatenated*.
            | So for a "simple" title, specify the title only once.
        sub_title : str
            The title of every subplot. If None, the name of the plotted
            column is used.
        data_name:
            | Additional, (to the *data_name* and *data_name_addition*), human-
              readable name for the legend.
            | Examples: "before cut", "after cut" etc
        bins : int
            Number of bins to plot.
        log_y_axes : boolean
            If True, the y-axes will be scaled logarithmically.
        plot_range : tuple (float, float) or None
            The lower and upper range of the bins, used for every plotted
            column. If None, 99.98% of the data will be plotted automatically.
        x_label : str
            The label of the x-axes. If None, no label is set.
        y_label : str
            The label of the y-axes. If None, no label is set.
        sample_weights : pandas Series
            The weights for the data, how "high" a bin is. Actually, how much
            it should account for the whole distribution or how "often" it
            occurs. If None is specified, the weights are taken from the data.
        importance : |importance_type|
            |importance_docstring|
        see_all : boolean
            If True, all data (not just 99.98%) will be plotted.
        hist_settings : dict
            A dictionary containing the settings as keywords for the
            :py:func:`~matplotlib.pyplot.hist()` function.
        figure_kwargs : dict
            Forwarded as *figure_kwargs* to ``out.save_fig``.

        Return
        ------
        out :
            The return value of ``out.save_fig`` for the figure.
        """
        # ==============================================================================
        #        initialize values
        # ==============================================================================
        if sample_weights is None:
            sample_weights = self._get_weights(index=index)
            if dev_tool.is_in_primitive(sample_weights, 1):
                sample_weights = None
        figure_kwargs = {} if figure_kwargs is None else figure_kwargs
        x_label = "" if x_label is None else x_label

        # update hist_settings
        if dev_tool.is_in_primitive(hist_settings, None):
            hist_settings = {}
        if isinstance(hist_settings, dict):
            hist_settings = dict(meta_config.DEFAULT_HIST_SETTINGS, **hist_settings)
        if bins is not None:
            hist_settings['bins'] = bins
        if plot_range is not None:
            hist_settings['range'] = plot_range

        # An explicitly requested range is taken out of the settings *once*
        # (it is handed to plt.hist separately) and then used for every
        # column, not only for the first one.
        has_fixed_range = 'range' in hist_settings
        fixed_range = hist_settings.pop('range') if has_fixed_range else None

        # create data
        data_plot = self.pandasDF(columns=columns, index=index)
        columns = data_plot.columns.values
        self.logger.debug("plot columns from pandasDataFrame: " + str(columns))

        # set the right number of rows and columns for the subplot
        subplot_col = int(math.ceil(math.sqrt(len(columns))))
        subplot_row = int(math.ceil(float(len(columns)) / subplot_col))

        # assign a free figure if argument is None
        if dev_tool.is_in_primitive(figure, None):
            safety = 0  # counts the tried figure numbers
            while True:
                assert safety < meta_config.MAX_FIGURES, "stuck in an endless while loop"
                safety += 1
                self.__figure_number += 1
                figure = self.__figure_number
                if figure not in self.__figure_dic:
                    x_limits_col = {}
                    # TODO: improve figure dict with title....
                    self.__figure_dic.update({figure: x_limits_col, str(figure) + '_title': ""})
                    break
        elif figure not in self.__figure_dic:
            x_limits_col = {}
            self.__figure_dic.update({figure: x_limits_col, str(figure) + '_title': ""})
        out_figure = out.save_fig(figure, importance=importance,
                                  figure_kwargs=figure_kwargs, **cfg.save_fig_cfg)

        # create a label
        label_name = data_tools.obj_to_string([self._name[0], self._name[1],
                                               data_name], separator=" - ")
        self.__figure_dic[str(figure) + '_title'] += "" if title is None else title
        plt.suptitle(self.__figure_dic.get(str(figure) + '_title'), fontsize=self.supertitle_fontsize)

        # ==============================================================================
        #       Start plotting
        # ==============================================================================
        # plot the distribution column by column
        for col_id, column in enumerate(columns, 1):
            # create sub title
            sub_title_tmp = column if sub_title is None else sub_title

            # only plot in range x_limits, otherwise the plot is too big
            x_limits = self.__figure_dic.get(figure).get(column)
            lower, upper = np.percentile(np.hstack(data_plot[column]),
                                         [0.01, 99.99])
            if dev_tool.is_in_primitive(x_limits, None):
                x_limits = (lower, upper)
            elif see_all:  # choose the maximum range. Bins not nicely overlapping.
                x_limits = (min(x_limits[0], lower), max(x_limits[1], upper))
            if has_fixed_range:
                x_limits = fixed_range
            # remember the limits for later plots into the same figure
            self.__figure_dic[figure].update({column: x_limits})

            plt.subplot(subplot_row, subplot_col, col_id)
            plt.hist(data_plot[column], weights=sample_weights, log=log_y_axes,
                     range=x_limits, label=label_name, **hist_settings)

            # set labels, titles...
            plt.title(sub_title_tmp)
            ha = 'center'
            plt.xlabel(x_label, ha=ha, position=(0.5, 0))
            if y_label is not None:
                plt.ylabel(y_label, ha=ha, position=(0, 0.5))

        plt.legend()
        return out_figure
Пример #9
0
    def set_targets(self, targets, index=None):
        """Set the targets of the data.

        *targets* is either one of the primitives -1, 0, 1, None or an
        array-like object, which has to have the same length as the data.
        The actual work is done by ``_set_target``.
        """
        targets_are_primitive = dev_tool.is_in_primitive(targets, (-1, 0, 1, None))
        if not targets_are_primitive:
            # array-like targets: one entry per event is required
            assert len(self) == len(targets), "Invalid targets"

        self._set_target(target=targets, index=index)
Пример #10
0
    def get_weights(self, index=None, normalize=True, **kwargs):
        """Return the weights of the specified indices or, if None, return all.

        Parameters
        ----------
        normalize : boolean or float > 0
            If True, the weights will be normalized to 1 (the mean is 1).
            If a float is provided, the mean of the weights will be equal
            to *normalize*. So *True* and *1* will yield the same results.
            If False, 0 or None, the weights are not normalized.
        index : |index_type|
            |index_docstring|
        second_storage : storage object or None
            Optional keyword argument. If given, the weights of that storage
            are appended to the weights of this one. *normalize* is then used
            as the ratio between the summed weights of this storage and the
            summed weights of *second_storage*.

        Return
        ------
        out: 1-D pandas Series
            Return the weights as pandas Series. If *second_storage* is
            given, the concatenated weights are returned as numpy array.
        """
        index = self._index if index is None else list(index)
        length = len(self) if index is None else len(index)
        if normalize is True:
            normalize = 1
        elif normalize is None:
            # None cannot be compared with numbers; it means "do not normalize"
            normalize = False
        second_storage = kwargs.get('second_storage')

        # HACK
        weights_ratio = normalize

        # TODO: implement if targets are different

        # False means "no normalization". None must not be used here: it is
        # compared and multiplied with numbers further down.
        normalize_1 = normalize_2 = False

        if weights_ratio > 0 and second_storage is not None:
            weights_1 = self.get_weights(index=index)
            weights_2 = second_storage.get_weights()

            sum_weight_1 = float(np.sum(weights_1))
            sum_weight_2 = float(np.sum(weights_2))

            # scale only one of the two up, the other one keeps the mean 1
            ratio_1 = weights_ratio * sum_weight_2 / sum_weight_1
            self.logger.info("ratio_1 = " + str(ratio_1))
            if ratio_1 >= 1:
                ratio_2 = 1.0
            else:
                ratio_2 = 1.0 / ratio_1
                ratio_1 = 1.0

            normalize_1 = ratio_1
            normalize_2 = ratio_2
        elif weights_ratio > 0:
            normalize_1 = weights_ratio

        weights_out = self._get_weights(index=index, normalize=normalize_1)

        if dev_tool.is_in_primitive(weights_out, (None, 1)):
            weights_out = pd.Series(data=np.ones(length), index=index)
            if normalize_1:
                # without normalization, the weights stay one
                weights_out = weights_out * normalize_1

        if second_storage is not None:
            weights_2 = second_storage.get_weights(normalize=normalize_2)
            weights_out = np.concatenate((weights_out, weights_2))

        return weights_out