class LogarithmicScale(Preprocess):
    """Apply an element-wise ``log(x + 1)`` transform to ``data.X``.

    The logarithm base is selected with a member of ``Base``; the members
    are also exposed directly on the class as ``BinaryLog``, ``NaturalLog``
    and ``CommonLog``.
    """
    Base = Enum("LogarithmicScale", ("BinaryLog", "NaturalLog", "CommonLog"),
                qualname="LogarithmicScale.Base")
    BinaryLog, NaturalLog, CommonLog = Base

    def __init__(self, base=BinaryLog):
        self.base = base

    def __call__(self, data: Table) -> Table:
        """Return a copy of ``data`` whose ``X`` holds ``log(X + 1)``.

        For a sparse ``X`` only the explicitly stored entries are
        transformed; implicit zeros map to ``log(0 + 1) == 0`` and therefore
        need no update.

        Raises:
            ValueError: if ``self.base`` is not one of the ``Base`` members.
        """
        # Pick the transform before copying so that an invalid base fails
        # fast with a clear message instead of an UnboundLocalError later.
        if self.base == LogarithmicScale.BinaryLog:

            def func(x, *args, **kwargs):
                return np.log2(x + 1, *args, **kwargs)
        elif self.base == LogarithmicScale.CommonLog:

            def func(x, *args, **kwargs):
                return np.log10(x + 1, *args, **kwargs)
        elif self.base == LogarithmicScale.NaturalLog:
            func = np.log1p
        else:
            raise ValueError(
                "invalid logarithm base: {!r}".format(self.base))

        new_data = data.copy()
        # `out=` writes the result back into the copied table's own buffer.
        if sp.issparse(new_data.X):
            func(new_data.X.data, out=new_data.X.data)
        else:
            func(new_data.X, out=new_data.X)

        return new_data
class Normalize(Preprocess):
    """Normalization preprocessor skeleton.

    Only stores the chosen ``Method`` (``CPM`` or ``Median``); both
    ``__call__`` and ``normalize`` raise ``NotImplementedError`` here.
    """
    Method = Enum("Normalize", ("CPM", "Median"), qualname="Normalize.Method")
    CPM, Median = tuple(Method)

    def __init__(self, method=CPM):
        self.method = method

    def __call__(self, *args):
        """Not implemented in this class."""
        raise NotImplementedError()

    def normalize(self, *args):
        """Not implemented in this class."""
        raise NotImplementedError()
class SelectMostVariableGenes(Preprocess):
    """Keep the columns whose per-group z-scored statistic is the highest.

    Columns are grouped into ``n_groups`` bins by the percentile rank of
    their mean. Within each bin the chosen statistic is z-scored, and the
    ``n_genes`` columns with the largest z-scores are kept.

    The statistic is selected with ``method``:

    * ``Dispersion`` -- variance divided by mean (0 where the mean is 0),
    * ``Variance`` -- column variance,
    * ``Mean`` -- column mean.
    """
    Method = Enum("SelectMostVariableGenes",
                  ("Dispersion", "Variance", "Mean"),
                  qualname="SelectMostVariableGenes.Method")
    Dispersion, Variance, Mean = Method

    def __init__(self, method=Dispersion, n_genes=1000, n_groups=20):
        self.method = method
        self.n_genes = n_genes
        # None, 0, 1 and negative values all collapse to a single group.
        self.n_groups = n_groups if n_groups and n_groups > 1 else 1

    def __call__(self, data: Table) -> Table:
        """Return ``data`` transformed to a domain with the selected columns.

        A non-positive ``n_genes`` selects no columns.

        Raises:
            ValueError: if ``self.method`` is not one of the ``Method``
                members.
        """
        n_groups = min(self.n_groups, len(data.domain.attributes))
        mean = ut.nanmean(data.X, axis=0)
        variance = ut.nanvar(data.X, axis=0)

        # The statistic is element-wise, so compute it once for all columns
        # instead of recomputing it for every group.
        if self.method == SelectMostVariableGenes.Dispersion:
            scores = np.divide(variance, mean,
                               out=np.zeros_like(mean),
                               where=mean != 0)
        elif self.method == SelectMostVariableGenes.Variance:
            scores = variance
        elif self.method == SelectMostVariableGenes.Mean:
            scores = mean
        else:
            raise ValueError("invalid method: {!r}".format(self.method))

        percentiles = [percentileofscore(mean, m) for m in mean]
        _, bins = np.histogram(percentiles, n_groups)
        bin_indices = np.digitize(percentiles, bins, True)
        # Right limit is treated differently in histogram and digitize
        # See https://github.com/numpy/numpy/issues/4217
        bin_indices[bin_indices == 0] = 1

        zscores = np.zeros_like(mean)
        for group in range(n_groups):
            group_indices, = np.where(bin_indices == group + 1)
            # A constant or single-element group yields NaN z-scores; the
            # resulting "invalid value" warning is silenced on purpose and
            # the NaNs are turned into 0 by nan_to_num below.
            with np.errstate(invalid="ignore"):
                zscores[group_indices] = zscore(scores[group_indices])

        order = np.argsort(np.nan_to_num(zscores))
        # `order[-n:]` would return *everything* for n == 0, so compute the
        # start of the tail explicitly.
        n_selected = max(0, min(self.n_genes, len(order)))
        indices = order[len(order) - n_selected:]
        return self._filter_columns(data, indices)

    @staticmethod
    def _filter_columns(data, indices):
        """Transform ``data`` to a domain keeping only attributes ``indices``.

        The kept attributes retain their original relative order; class
        variables and metas are passed through unchanged.
        """
        domain = data.domain
        attrs, cls, metas = domain.attributes, domain.class_vars, domain.metas
        kept = tuple(attrs[index] for index in sorted(indices))
        return data.transform(Domain(kept, cls, metas))
class Binarize(Preprocess):
    """Replace ``X`` with the boolean result of comparing it to a threshold.

    ``condition`` selects the comparison: ``GreaterOrEqual`` (``>=``) or
    ``Greater`` (``>``).
    """
    Condition = Enum("Binarize", ("GreaterOrEqual", "Greater"),
                     qualname="Binarize.Condition")
    GreaterOrEqual, Greater = Condition

    def __init__(self, condition=GreaterOrEqual, threshold=1):
        self.threshold = threshold
        self.condition = condition

    def __call__(self, data: Table) -> Table:
        """Return a copy of ``data`` with ``X`` compared against the threshold.

        If ``condition`` matches neither member, the copy is returned with
        ``X`` left as it was.
        """
        binarized = data.copy()
        condition, threshold = self.condition, self.threshold
        if condition == Binarize.GreaterOrEqual:
            binarized.X = binarized.X >= threshold
        elif condition == Binarize.Greater:
            binarized.X = binarized.X > threshold
        return binarized
# Example #5
# 0
class LogarithmicScale(Preprocess):
    """Replace ``X`` with ``log(1 + X)`` in a selectable base.

    The base is one of the ``Base`` members, also available on the class as
    ``BinaryLog``, ``NaturalLog`` and ``CommonLog``.
    """
    Base = Enum("LogarithmicScale", ("BinaryLog", "NaturalLog", "CommonLog"),
                qualname="LogarithmicScale.Base")
    BinaryLog, NaturalLog, CommonLog = Base

    def __init__(self, base=BinaryLog):
        self.base = base

    def __call__(self, data):
        """Return a copy of ``data`` with the logarithm applied to ``X``.

        If ``base`` matches none of the members, the copy is returned with
        ``X`` left as it was.
        """
        result = data.copy()
        log_by_base = (
            (LogarithmicScale.BinaryLog, np.log2),
            (LogarithmicScale.NaturalLog, np.log),
            (LogarithmicScale.CommonLog, np.log10),
        )
        for candidate, log_function in log_by_base:
            if self.base == candidate:
                result.X = log_function(1 + data.X)
                break
        return result
# Example #6
# 0
class Continuize(Preprocess):
    """Transform data to the domain produced by ``DomainContinuizer``.

    The class attributes below are the available multinomial treatments;
    they are passed through to the continuizer unchanged.
    """
    (Indicators, FirstAsBase, FrequentAsBase, Remove, RemoveMultinomial,
     ReportError, AsOrdinal, AsNormalizedOrdinal, Leave) = Enum(
         "Continuize", "Indicators, FirstAsBase, FrequentAsBase,"
         "Remove, RemoveMultinomial, ReportError, AsOrdinal,"
         "AsNormalizedOrdinal, Leave")

    def __init__(self, zero_based=True, multinomial_treatment=Indicators):
        self.multinomial_treatment = multinomial_treatment
        self.zero_based = zero_based

    def __call__(self, data):
        """Return ``data`` transformed to the continuized domain."""
        # Imported at call time, as in the original implementation.
        from . import continuize

        build_domain = continuize.DomainContinuizer(
            zero_based=self.zero_based,
            multinomial_treatment=self.multinomial_treatment)
        return data.transform(build_domain(data))
# Example #7
# 0
class FilterString(ValueFilter):
    """
    Subfilter for string variables.

    .. attribute:: column

        The column to which the filter applies (int, str or
        :obj:`Orange.data.Variable`).

    .. attribute:: ref

        The reference value; also aliased to `min` for operators
        `Between` and `Outside`.

    .. attribute:: max

        The upper threshold for operators `Between` and `Outside`.

    .. attribute:: oper

        The operator; should be `FilterString.Equal`, `NotEqual`, `Less`,
        `LessEqual`, `Greater`, `GreaterEqual`, `Between`, `Outside`,
        `Contains`, `StartsWith`, `EndsWith` or `IsDefined`.

    .. attribute:: case_sensitive

        Tells whether the comparisons are case sensitive
    """
    Type = Enum(
        'FilterString', 'Equal, NotEqual, Less, LessEqual, Greater,'
        'GreaterEqual, Between, Outside, Contains,'
        'StartsWith, EndsWith, IsDefined')
    (Equal, NotEqual, Less, LessEqual, Greater, GreaterEqual, Between, Outside,
     Contains, StartsWith, EndsWith, IsDefined) = Type

    def __init__(self,
                 position,
                 oper,
                 ref=None,
                 max=None,
                 case_sensitive=True,
                 **a):
        """
        Store the filter parameters.

        The only extra keyword accepted through ``**a`` is ``min``, which
        overrides ``ref``; anything else raises :obj:`TypeError`.
        """
        super().__init__(position)
        if a:
            if len(a) != 1 or "min" not in a:
                raise TypeError(
                    "FilterString got unexpected keyword arguments")
            ref = a["min"]
        self.ref = ref
        self.max = max
        self.oper = oper
        self.case_sensitive = case_sensitive
        self.position = position

    @property
    def min(self):
        """Alias of `ref`."""
        return self.ref

    @min.setter
    def min(self, value):
        self.ref = value

    def __call__(self, inst):
        """
        Return whether the instance's value in `column` passes the filter.

        Except for `IsDefined`, the value, `ref` and `max` are compared as
        strings (lower-cased when `case_sensitive` is false).

        Raises :obj:`ValueError` for an unknown operator.
        """
        # the function is a large 'switch'; pylint: disable=too-many-branches
        value = inst[inst.domain.index(self.column)]
        if self.oper == self.IsDefined:
            return not np.isnan(value)
        if self.case_sensitive:
            value = str(value)
            refval = str(self.ref)
        else:
            value = str(value).lower()
            refval = str(self.ref).lower()
        if self.oper == self.Equal:
            return value == refval
        if self.oper == self.NotEqual:
            return value != refval
        if self.oper == self.Less:
            return value < refval
        if self.oper == self.LessEqual:
            return value <= refval
        if self.oper == self.Greater:
            return value > refval
        if self.oper == self.GreaterEqual:
            return value >= refval
        if self.oper == self.Contains:
            return refval in value
        if self.oper == self.StartsWith:
            return value.startswith(refval)
        if self.oper == self.EndsWith:
            return value.endswith(refval)
        if self.oper in (self.Between, self.Outside):
            # Stringify `max` the same way as `ref`, and only once the
            # operator is known to need it.
            high = str(self.max)
            if not self.case_sensitive:
                high = high.lower()
            inside = refval <= value <= high
            return inside if self.oper == self.Between else not inside
        raise ValueError("invalid operator")
# Example #8
# 0
class FilterContinuous(ValueFilter):
    """
    Subfilter for continuous variables.

    .. attribute:: column

        The column to which the filter applies (int, str or
        :obj:`Orange.data.Variable`).

    .. attribute:: ref

        The reference value; also aliased to `min` for operators
        `Between` and `Outside`.

    .. attribute:: max

        The upper threshold for operators `Between` and `Outside`.

    .. attribute:: oper

        The operator; should be `FilterContinuous.Equal`, `NotEqual`, `Less`,
        `LessEqual`, `Greater`, `GreaterEqual`, `Between`, `Outside` or
        `IsDefined`.
    """
    Type = Enum(
        'FilterContinuous', 'Equal, NotEqual, Less, LessEqual, Greater,'
        'GreaterEqual, Between, Outside, IsDefined')
    (Equal, NotEqual, Less, LessEqual, Greater, GreaterEqual, Between, Outside,
     IsDefined) = Type

    def __init__(self, position, oper, ref=None, max=None, min=None):
        """Store the filter parameters; a non-None `min` overrides `ref`."""
        super().__init__(position)
        self.ref = min if min is not None else ref
        self.max = max
        self.oper = oper
        self.position = position

    @property
    def min(self):
        """Alias of `ref`."""
        return self.ref

    @min.setter
    def min(self, value):
        self.ref = value

    def __call__(self, inst):
        """
        Return whether the instance's value in `column` passes the filter.

        A NaN value passes only the `Equal` operator with a NaN reference.
        Raises :obj:`ValueError` for an unknown operator.
        """
        oper, ref = self.oper, self.ref
        value = inst[inst.domain.index(self.column)]
        if isnan(value):
            return oper == self.Equal and isnan(ref)

        # Checked in order; each test is evaluated lazily so that `max` is
        # only touched by the operators that need it.
        checks = (
            (self.Equal, lambda: value == ref),
            (self.NotEqual, lambda: value != ref),
            (self.Less, lambda: value < ref),
            (self.LessEqual, lambda: value <= ref),
            (self.Greater, lambda: value > ref),
            (self.GreaterEqual, lambda: value >= ref),
            (self.Between, lambda: ref <= value <= self.max),
            (self.Outside, lambda: not ref <= value <= self.max),
            (self.IsDefined, lambda: True),
        )
        for candidate, evaluate in checks:
            if oper == candidate:
                return evaluate()
        raise ValueError("invalid operator")

    def __eq__(self, other):
        """Compare by type, column, operator, reference and maximum."""
        if not isinstance(other, FilterContinuous):
            return False
        return self.column == other.column and self.oper == other.oper and \
               self.ref == other.ref and self.max == other.max

    def __str__(self):
        """Return a human-readable description of the condition."""
        target = self.column
        if isinstance(target, str):
            column = target
        elif isinstance(target, Variable):
            column = target.name
        else:
            column = "feature({})".format(target)

        symbols = {
            self.Equal: "=",
            self.NotEqual: "≠",
            self.Less: "<",
            self.LessEqual: "≤",
            self.Greater: ">",
            self.GreaterEqual: "≥"
        }
        oper = self.oper
        if oper in symbols:
            return "{} {} {}".format(column, symbols[oper], self.ref)
        if oper == self.Between:
            return "{} ≤ {} ≤ {}".format(self.min, column, self.max)
        if oper == self.Outside:
            return "not {} ≤ {} ≤ {}".format(self.min, column, self.max)
        if oper == self.IsDefined:
            return "{} is defined".format(column)
        return "invalid operator"
# Example #9
# 0
class OWLinearProjection(widget.OWWidget):
    """Widget showing a linear projection of data onto a 2D plane.

    The projection axes come from one of the ``Placement`` options:
    circular placement, LDA, PCA, or a projection supplied on the
    "Projection" input.
    """
    # Widget metadata.
    name = "Linear Projection"
    description = "A multi-axis projection of data onto " \
                  "a two-dimensional plane."
    icon = "icons/LinearProjection.svg"
    priority = 240
    keywords = []

    # Saved selection; __init__ moves it aside and set_data restores it.
    selection_indices = settings.Setting(None, schema_only=True)

    # Input signals.
    class Inputs:
        data = Input("Data", Table, default=True)
        data_subset = Input("Data Subset", Table)
        projection = Input("Projection", Table)

    # Output signals.
    class Outputs:
        selected_data = Output("Selected Data", Table, default=True)
        annotated_data = Output(ANNOTATED_DATA_SIGNAL_NAME, Table)
        components = Output("Components", Table)

    # Integer-valued enum; the values are also used to index the placement
    # radio buttons (see _check_possible_opt).
    Placement = Enum("Placement",
                     dict(Circular=0, LDA=1, PCA=2, Projection=3),
                     type=int,
                     qualname="OWLinearProjection.Placement")

    # Short per-placement labels; note there is no entry for
    # Placement.Projection.
    Component_name = {
        Placement.Circular: "C",
        Placement.LDA: "LD",
        Placement.PCA: "PC"
    }
    # Prefixes of the generated "<prefix>-x" / "<prefix>-y" variable names
    # (see _setup_plot).
    Variable_name = {
        Placement.Circular: "circular",
        Placement.LDA: "lda",
        Placement.PCA: "pca",
        Placement.Projection: "projection"
    }

    jitter_sizes = [0, 0.1, 0.5, 1.0, 2.0]

    settings_version = 3
    settingsHandler = settings.DomainContextHandler()

    # Persisted settings.
    variable_state = settings.ContextSetting({})
    placement = settings.Setting(Placement.Circular)
    radius = settings.Setting(0)
    auto_commit = settings.Setting(True)

    resolution = 256

    graph = settings.SettingProvider(OWLinProjGraph)
    # Custom Qt event type posted by invalidate_plot and handled in
    # customEvent.
    ReplotRequest = QEvent.registerEventType()
    vizrank = settings.SettingProvider(LinearProjectionVizRank)
    graph_name = "graph.plot_widget.plotItem"

    # Warning messages shown by the widget.
    class Warning(widget.OWWidget.Warning):
        no_cont_features = widget.Msg("Plotting requires numeric features")
        not_enough_components = widget.Msg(
            "Input projection has less than 2 components")
        trivial_components = widget.Msg(
            "All components of the PCA are trivial (explain 0 variance). "
            "Input data is constant (or near constant).")

    # Error messages shown by the widget.
    class Error(widget.OWWidget.Error):
        proj_and_domain_match = widget.Msg(
            "Projection and Data domains do not match")
        no_valid_data = widget.Msg("No projection due to invalid data")

    def __init__(self):
        """Initialise the widget state and build the plot and control GUI."""
        super().__init__()

        # Inputs and derived state.
        self.data = None
        self.projection = None
        self.subset_data = None
        self._subset_mask = None
        self._selection = None
        self.__replot_requested = False
        self.n_cont_var = 0
        #: Remember the saved state to restore
        self.__pending_selection_restore = self.selection_indices
        self.selection_indices = None

        # Variables holding the two projected coordinates; created in
        # _setup_plot.
        self.variable_x = None
        self.variable_y = None

        # Main area: the projection plot.
        box = gui.vBox(self.mainArea, True, margin=0)
        self.graph = OWLinProjGraph(self,
                                    box,
                                    "Plot",
                                    view_box=LinProjInteractiveViewBox)
        box.layout().addWidget(self.graph.plot_widget)
        plot = self.graph.plot_widget

        SIZE_POLICY = (QSizePolicy.Minimum, QSizePolicy.Maximum)

        # Lists of selected / other variables (drag and drop enabled).
        self.variables_selection = VariablesSelection()
        self.model_selected = VariableListModel(enable_dnd=True)
        self.model_other = VariableListModel(enable_dnd=True)
        self.variables_selection(self, self.model_selected, self.model_other)

        # "Suggest Features" (VizRank) button, placed next to the
        # add/remove controls of the variable selection.
        self.vizrank, self.btn_vizrank = LinearProjectionVizRank.add_vizrank(
            self.controlArea, self, "Suggest Features", self._vizrank)
        self.variables_selection.add_remove.layout().addWidget(
            self.btn_vizrank)

        # Placement radio buttons; the label order follows the integer
        # values of the Placement enum.
        box = gui.widgetBox(self.controlArea,
                            "Placement",
                            sizePolicy=SIZE_POLICY)
        self.radio_placement = gui.radioButtonsInBox(
            box,
            self,
            "placement",
            btnLabels=[
                "Circular Placement", "Linear Discriminant Analysis",
                "Principal Component Analysis", "Use input projection"
            ],
            callback=self._change_placement)

        self.viewbox = plot.getViewBox()
        self.replot = None

        # Point properties and jitter controls supplied by the graph's GUI
        # helper.
        g = self.graph.gui
        box = g.point_properties_box(self.controlArea)
        self.models = g.points_models
        g.add_widget(g.JitterSizeSlider, box)
        box.setSizePolicy(*SIZE_POLICY)

        # Radius slider bound to the `radius` setting; update_radius hides
        # the anchors that are not longer than the resulting radius.
        box = gui.widgetBox(self.controlArea,
                            "Hide axes",
                            sizePolicy=SIZE_POLICY)
        self.rslider = gui.hSlider(box,
                                   self,
                                   "radius",
                                   minValue=0,
                                   maxValue=100,
                                   step=5,
                                   label="Radius",
                                   createLabel=False,
                                   ticks=True,
                                   callback=self.update_radius)
        self.rslider.setTickInterval(0)
        self.rslider.setPageStep(10)

        box = gui.vBox(self.controlArea, "Plot Properties")
        box.setSizePolicy(*SIZE_POLICY)

        g.add_widgets([
            g.ShowLegend, g.ToolTipShowsAll, g.ClassDensity,
            g.LabelOnlySelected
        ], box)

        box = self.graph.box_zoom_select(self.controlArea)
        box.setSizePolicy(*SIZE_POLICY)

        self.icons = gui.attributeIconDict

        p = self.graph.plot_widget.palette()
        self.graph.set_palette(p)
        gui.auto_commit(self.controlArea,
                        self,
                        "auto_commit",
                        "Send Selection",
                        auto_label="Send Automatically")
        self.graph.zoom_actions(self)

        # Start from an empty plot state and apply the stored placement.
        self._new_plotdata()
        self._change_placement()
        self.graph.jitter_continuous = True

    def reset_graph_data(self):
        """Rescale the graph data and redraw with a reset view, if data is set."""
        if self.data is None:
            return
        self.graph.rescale_data()
        self._update_graph(reset_view=True)

    def keyPressEvent(self, event):
        """Let the base class handle the key press, then refresh the tooltip."""
        super().keyPressEvent(event)
        modifiers = event.modifiers()
        self.graph.update_tooltip(modifiers)

    def keyReleaseEvent(self, event):
        """Let the base class handle the key release, then refresh the tooltip."""
        super().keyReleaseEvent(event)
        modifiers = event.modifiers()
        self.graph.update_tooltip(modifiers)

    def _vizrank(self, attrs):
        """Make ``attrs`` the selected variables (VizRank callback).

        The suggested variables replace the current selection and are
        removed from the list of other variables.
        """
        self.variables_selection.display_none()
        self.model_selected[:] = attrs[:]
        remaining = [var for var in self.model_other if var not in attrs]
        self.model_other[:] = remaining

    def _change_placement(self):
        """Update the controls for the current placement, replot and commit."""
        current = self.placement
        circular = self.Placement.Circular
        # Variables are picked manually only for circular and LDA placement.
        manual_selection = current in [circular, self.Placement.LDA]
        self.variables_selection.set_enabled(manual_selection)
        self._vizrank_color_change()
        self.rslider.setEnabled(current != circular)
        self._setup_plot()
        self.commit()

    def _get_min_radius(self):
        """Return the anchor-length threshold for the ``radius`` setting.

        ``radius`` is treated as a percentage of the longest anchor; a small
        epsilon (1e-5) is added to the result.
        """
        anchor_lengths = np.linalg.norm(self.plotdata.axes, axis=1)
        longest = np.max(anchor_lengths)
        return self.radius * longest / 100 + 1e-5

    def update_radius(self):
        """Re-apply the radius threshold to anchor visibility and the circle.

        Does nothing when no hide-circle has been created.
        """
        plotdata = self.plotdata
        assert plotdata is not None
        if plotdata.hidecircle is None:
            return
        threshold = self._get_min_radius()
        # Only anchors longer than the threshold stay visible.
        for anchor, item in zip(plotdata.axes, plotdata.axisitems):
            item.setVisible(np.linalg.norm(anchor) > threshold)
        diameter = 2 * threshold
        plotdata.hidecircle.setRect(
            QRectF(-threshold, -threshold, diameter, diameter))

    def _new_plotdata(self):
        """Reset ``self.plotdata`` to an empty plot state."""
        empty_state = dict(
            valid_mask=None,
            embedding_coords=None,
            axisitems=[],
            axes=[],
            variables=[],
            data=None,
            hidecircle=None,
        )
        self.plotdata = namespace(**empty_state)

    def _anchor_circle(self, variables):
        """Add an anchor item per variable and, unless circular, the circle.

        Anchors that are not longer than the current radius threshold are
        created hidden. For every placement other than Circular a circle
        with that radius is added to the view box as well.
        """
        threshold = self._get_min_radius()
        anchor_items = []
        for anchor, var in zip(self.plotdata.axes, variables[:]):
            item = AnchorItem(
                line=QLineF(0, 0, *anchor),
                text=var.name,
            )
            item.setVisible(np.linalg.norm(anchor) > threshold)
            item.setPen(pg.mkPen((100, 100, 100)))
            item.setArrowVisible(True)
            self.viewbox.addItem(item)
            anchor_items.append(item)
        self.plotdata.axisitems = anchor_items

        if self.placement == self.Placement.Circular:
            return

        diameter = 2 * threshold
        circle = QGraphicsEllipseItem()
        circle.setRect(QRectF(-threshold, -threshold, diameter, diameter))

        outline = QPen(Qt.lightGray, 1)
        outline.setCosmetic(True)
        circle.setPen(outline)

        self.viewbox.addItem(circle)
        self.plotdata.hidecircle = circle

    def update_colors(self):
        """Refresh the state of the "Suggest Features" button."""
        self._vizrank_color_change()

    def clear(self):
        """Drop the data, empty both variable lists and clear the plot."""
        # Clear/reset the widget state
        self.data = None
        self.model_selected.clear()
        self.model_other.clear()
        self._clear_plot()
        self.selection_indices = None

    def _clear_plot(self):
        """Remove anchors and the circle from the view and reset plot state."""
        self.Warning.trivial_components.clear()
        plotdata = self.plotdata
        for item in plotdata.axisitems:
            self.viewbox.removeItem(item)
        if plotdata.hidecircle:
            self.viewbox.removeItem(plotdata.hidecircle)
        self._new_plotdata()
        self.graph.hide_axes()

    def invalidate_plot(self):
        """
        Schedule a delayed replot.

        A ReplotRequest event is posted to the widget unless one is already
        pending.
        """
        if self.__replot_requested:
            return
        self.__replot_requested = True
        request = QEvent(self.ReplotRequest)
        QApplication.postEvent(self, request, Qt.LowEventPriority - 10)

    def init_attr_values(self):
        """Pass the current data to the graph via ``set_domain``."""
        self.graph.set_domain(self.data)

    def _vizrank_color_change(self):
        """Enable or disable the "Suggest Features" button and set its tooltip.

        Without data only the tooltip is updated. Otherwise the button is
        enabled when the placement is Circular or LDA, a color variable is
        selected (and is not continuous under LDA) and at least three
        candidate variables are available.
        """
        if self.data is None:
            self.btn_vizrank.setToolTip("There is no data.")
            return

        domain = self.data.domain
        color = self.graph.attr_color
        # NOTE(review): `is_primitive` is referenced without being called;
        # if it is a method this test is always true -- confirm.
        candidates = [
            v for v in chain(domain.variables, domain.metas)
            if v.is_primitive and v is not color
        ]
        self.n_cont_var = len(candidates)

        lda = self.Placement.LDA
        enabled = False
        if self.placement not in [self.Placement.Circular, lda]:
            message = "Suggest Features works only for Circular and " \
                      "Linear Discriminant Analysis Projection"
        elif color is None:
            message = "Color variable has to be selected"
        elif color.is_continuous and self.placement == lda:
            message = "Suggest Features does not work for Linear Discriminant Analysis Projection " \
                      "when continuous color variable is selected."
        elif len(candidates) < 3:
            message = "Not enough available continuous variables"
        else:
            message = ""
            enabled = True
        self.btn_vizrank.setToolTip(message)
        self.btn_vizrank.setEnabled(enabled)
        self.vizrank.stop_and_reset(enabled)

    @Inputs.projection
    def set_projection(self, projection):
        """Store the input projection.

        A non-empty projection with fewer than two rows is rejected with a
        warning and stored as None. Any other non-None projection switches
        the placement to ``Placement.Projection``.
        """
        self.Warning.not_enough_components.clear()
        if projection and len(projection) < 2:
            self.Warning.not_enough_components()
            projection = None
        elif projection is not None:
            self.placement = self.Placement.Projection
        self.projection = projection

    @Inputs.data
    def set_data(self, data):
        """
        Set the input dataset.

        The previous context is closed and the widget cleared first. Data
        without any continuous variable is rejected with a warning. For
        non-empty data the variable lists are initialised from the domain
        and then overridden by the state stored in the context settings.

        Args:
            data (Orange.data.table): data instances
        """
        def materialize_sql(data):
            # Small SQL tables are downloaded whole; larger ones are sampled.
            if isinstance(data, SqlTable):
                if data.approx_len() < 4000:
                    data = Table(data)
                else:
                    self.information("Data has been sampled")
                    data_sample = data.sample_time(1, no_cache=True)
                    data_sample.download_data(2000, partial=True)
                    data = Table(data_sample)
            return data

        def restore_variable_lists(data):
            # get the default encoded state, replacing the position with Inf
            state = VariablesSelection.encode_var_state(
                [list(self.model_selected),
                 list(self.model_other)])
            state = {
                key: (source_ind, np.inf)
                for key, (source_ind, _) in state.items()
            }

            self.openContext(data.domain)

            if self.__pending_selection_restore is not None:
                self._selection = np.array(self.__pending_selection_restore,
                                           dtype=int)
                self.__pending_selection_restore = None

            # update the defaults state (the encoded state must contain
            # all variables in the input domain)
            state.update(self.variable_state)
            # ... and restore it with saved positions taking precedence over
            # the defaults
            selected, other = VariablesSelection.decode_var_state(
                state, [list(self.model_selected),
                        list(self.model_other)])
            return selected, other

        self.closeContext()
        self.clear()
        self.Warning.no_cont_features.clear()
        self.information()
        data = materialize_sql(data)
        if data is not None:
            domain = data.domain
            has_continuous = any(
                var.is_continuous
                for var in chain(domain.variables, domain.metas))
            if not has_continuous:
                self.Warning.no_cont_features()
                data = None
        self.data = data
        self.init_attr_values()
        if data is not None and len(data):
            self._initialize(data)
            self.model_selected[:], self.model_other[:] = \
                restore_variable_lists(data)
            self.vizrank.stop_and_reset()
            self.vizrank.attrs = data.domain.attributes

    def _check_possible_opt(self):
        """Enable only the placements usable with the current inputs.

        LDA is disabled unless the data has a discrete class with at least
        two values; the input-projection option is disabled without a
        projection. A disabled placement that is currently selected falls
        back to Circular. Without data all controls are disabled. The
        result is committed in every case.
        """
        def toggle_controls(enabled):
            for button in self.radio_placement.buttons:
                button.setEnabled(enabled)
            self.variables_selection.set_enabled(enabled)

        circular = self.Placement.Circular
        lda = self.Placement.LDA
        from_input = self.Placement.Projection
        if not self.data:
            self.graph.new_data(None)
            self.rslider.setEnabled(False)
            toggle_controls(False)
        else:
            toggle_controls(True)
            domain = self.data.domain
            lda_possible = domain.has_discrete_class and len(
                domain.class_var.values) >= 2
            if not lda_possible:
                self.radio_placement.buttons[lda].setEnabled(False)
                if self.placement == lda:
                    self.placement = circular
            if not self.projection:
                self.radio_placement.buttons[from_input].setEnabled(False)
                if self.placement == from_input:
                    self.placement = circular
            self._setup_plot()
        self.commit()

    @Inputs.data_subset
    def set_subset_data(self, subset):
        """
        Set the supplementary input subset dataset.

        The subset mask is reset here and recomputed in handleNewSignals.
        The graph's alpha-value control is enabled only when no subset is
        given.

        Args:
            subset (Orange.data.table): subset of data instances
        """
        self.subset_data = subset
        self._subset_mask = None
        self.controls.graph.alpha_value.setEnabled(subset is None)

    def handleNewSignals(self):
        """Recompute the subset mask, then refresh the options, plot and output.

        When both data and a data subset are present, ``_subset_mask`` marks
        the data rows whose ids also occur in the subset.
        """
        if self.data is not None and self.subset_data is not None:
            # Update the plot's highlight items
            dataids = self.data.ids.ravel()
            subsetids = np.unique(self.subset_data.ids)
            # np.isin supersedes the deprecated np.in1d; both arrays are
            # 1-D here, so the resulting mask is the same.
            self._subset_mask = np.isin(dataids, subsetids,
                                        assume_unique=True)
        self._check_possible_opt()
        self._change_placement()
        self.commit()

    def customEvent(self, event):
        """Handle a pending ReplotRequest; defer other events to the base."""
        if event.type() != OWLinearProjection.ReplotRequest:
            super().customEvent(event)
            return
        self.__replot_requested = False
        self._setup_plot()
        self.commit()

    def closeContext(self):
        """Encode the two variable lists into ``variable_state``, then close."""
        current_lists = [list(self.model_selected), list(self.model_other)]
        self.variable_state = VariablesSelection.encode_var_state(
            current_lists)
        super().closeContext()

    def _initialize(self, data):
        """Fill the variable lists from the continuous variables of ``data``.

        Metas come before attributes; the first three continuous variables
        become the selected ones and the rest go to the other list.
        """
        domain = data.domain
        continuous = [
            var for var in chain(domain.metas, domain.attributes)
            if var.is_continuous
        ]
        self.model_other[:] = continuous[3:]
        self.model_selected[:] = continuous[:3]

    def prepare_plot_data(self, variables):
        """Project the data columns of ``variables`` onto two axes.

        The axes depend on the current placement (circular defaults, LDA, or
        the first two rows of the input projection). Rows with a NaN in any
        of the variables are left out of the projection.

        Returns:
            tuple: ``(valid_mask, coordinates, axes)``. ``valid_mask`` marks
            the rows without missing values, ``coordinates`` holds their two
            projected coordinates per row, and ``axes`` has one row per
            variable. All three are None when no axes can be obtained.
        """
        def projection(variables):
            # Match the projection's attributes by identity first, then
            # fall back to matching by name.
            if set(self.projection.domain.attributes).issuperset(variables):
                axes = self.projection[:2, variables].X
            elif set(f.name
                     for f in self.projection.domain.attributes).issuperset(
                         f.name for f in variables):
                axes = self.projection[:2, [f.name for f in variables]].X
            else:
                self.Error.proj_and_domain_match()
                axes = None
            return axes

        def get_axes(variables):
            self.Error.proj_and_domain_match.clear()
            axes = None
            if self.placement == self.Placement.Circular:
                axes = LinProj.defaultaxes(len(variables))
            elif self.placement == self.Placement.LDA:
                axes = self._get_lda(self.data, variables)
            elif self.placement == self.Placement.Projection and self.projection:
                axes = projection(variables)
            return axes

        coords = [
            column_data(self.data, var, dtype=float) for var in variables
        ]
        coords = np.vstack(coords)
        p, N = coords.shape
        # Two separate asserts: written as `assert a, b`, the second
        # comparison would only be the failure message and never checked.
        assert N == len(self.data)
        assert p == len(variables)

        axes = get_axes(variables)
        if axes is None:
            return None, None, None
        assert axes.shape == (2, p)

        valid_mask = ~np.isnan(coords).any(axis=0)
        coords = coords[:, valid_mask]

        X, Y = np.dot(axes, coords)
        if X.size and Y.size:
            X = normalized(X)
            Y = normalized(Y)

        return valid_mask, np.stack((X, Y), axis=1), axes.T

    def _setup_plot(self):
        """Rebuild the plot for the current data, variables and placement.

        Clears the old plot, creates the two coordinate variables with names
        that do not clash with the data domain, computes the projection and
        draws it. Stops early when there is no data, no variables, no
        projection, or no row without missing values.
        """
        self._clear_plot()
        if self.data is None:
            return
        self.__replot_requested = False

        data_domain = self.data.domain
        taken_names = [
            v.name for v in chain(data_domain.variables, data_domain.metas)
        ]
        prefix = self.Variable_name[self.placement]
        x_name, y_name = "{}-x".format(prefix), "{}-y".format(prefix)
        names = get_unique_names(taken_names, [x_name, y_name])
        self.variable_x = ContinuousVariable(names[0])
        self.variable_y = ContinuousVariable(names[1])

        if self.placement in [self.Placement.Circular, self.Placement.LDA]:
            variables = list(self.model_selected)
        elif self.placement == self.Placement.Projection:
            variables = self.model_selected[:] + self.model_other[:]
        elif self.placement == self.Placement.PCA:
            variables = [
                var for var in self.data.domain.attributes if var.is_continuous
            ]
        if not variables:
            self.graph.new_data(None)
            return

        if self.placement == self.Placement.PCA:
            valid_mask, embedding, axes = self._get_pca()
            variables = self._pca.orig_domain.attributes
        else:
            valid_mask, embedding, axes = self.prepare_plot_data(variables)

        plotdata = self.plotdata
        plotdata.variables = variables
        plotdata.valid_mask = valid_mask
        plotdata.embedding_coords = embedding
        plotdata.axes = axes
        if any(part is None for part in (valid_mask, embedding, axes)):
            return

        if not sum(valid_mask):
            self.Error.no_valid_data()
            self.graph.new_data(None, None)
            return
        self.Error.no_valid_data.clear()

        self._anchor_circle(variables=variables)
        self._plot()

    def _plot(self):
        """Attach the embedding coordinates to the data and show it.

        A copy of the input domain is built with ``self.variable_x`` and
        ``self.variable_y`` appended to the metas.  The data is transformed
        into that domain and the two new columns are filled with
        ``self.plotdata.embedding_coords`` for the rows selected by
        ``self.plotdata.valid_mask`` (all other rows keep zeros).  The
        resulting table is stored in ``self.plotdata.data``; only its valid
        rows (and, if a subset mask is set, the valid subset rows) are
        passed to the graph.
        """
        domain = self.data.domain
        new_metas = domain.metas + (self.variable_x, self.variable_y)
        domain = Domain(attributes=domain.attributes,
                        class_vars=domain.class_vars,
                        metas=new_metas)
        valid_mask = self.plotdata.valid_mask
        # The builtin ``float`` replaces the ``np.float`` alias, which newer
        # NumPy releases no longer provide.
        array = np.zeros((len(self.data), 2), dtype=float)
        array[valid_mask] = self.plotdata.embedding_coords
        self.plotdata.data = data = self.data.transform(domain)
        data[:, self.variable_x] = array[:, 0].reshape(-1, 1)
        data[:, self.variable_y] = array[:, 1].reshape(-1, 1)

        subset_data = None
        if self._subset_mask is not None and len(self._subset_mask):
            subset_data = data[self._subset_mask & valid_mask]
        self.graph.new_data(data[valid_mask], subset_data)
        if self._selection is not None:
            # The stored selection covers all rows; the graph only sees the
            # valid ones.
            self.graph.selection = self._selection[valid_mask]
        self.graph.update_data(self.variable_x, self.variable_y, False)

    def _get_lda(self, data, variables):
        """Fit LDA on ``variables`` and return its scalings as axes.

        The data is first transformed to a domain that holds only
        ``variables`` as attributes and keeps the original class variables.
        The first two columns of the fitted ``scalings_`` are returned
        transposed.  If that result is a single 1x1 value, the fixed column
        ``[[1.], [0.]]`` is returned in its place.
        """
        lda_domain = Domain(attributes=variables,
                            class_vars=data.domain.class_vars)
        table = data.transform(lda_domain)
        model = LinearDiscriminantAnalysis(solver='eigen', n_components=2)
        model.fit(table.X, table.Y)
        axes = model.scalings_[:, :2].T
        if axes.shape != (1, 1):
            return axes
        return np.array([[1.], [0.]])

    def _get_pca(self):
        """Project ``self.data`` on two principal components.

        The PCA projector is given ``Normalize()`` as an extra preprocessor
        after its default ones.  The fitted model is stored in
        ``self._pca``; a warning is raised when the cumulative explained
        variance is not finite.

        Returns
        -------
        tuple
            ``(valid_mask, coords, axes)``: the mask of rows without NaN
            coordinates, the projected coordinates of all rows, and a copy
            of the transposed components scaled so that the longest one has
            length ``max_radius``.
        """
        data = self.data
        n_components = 2
        projector = PCA(n_components=n_components)
        projector.component = n_components
        projector.preprocessors = PCA.preprocessors + [Normalize()]

        model = projector(data)
        explained = np.cumsum(model.explained_variance_ratio_)

        self._pca = model
        if not np.isfinite(explained[-1]):
            self.Warning.trivial_components()

        coords = model(data).X
        valid_mask = ~np.isnan(coords).any(axis=1)

        # Scale the axes: the radius is the smallest of the per-column
        # |min| and max of the projected coordinates.
        lower = np.abs(np.min(coords, axis=0))
        upper = np.max(coords, axis=0)
        max_radius = np.min([lower, upper])
        axes = model.components_.T.copy()
        longest = np.max(np.linalg.norm(axes, axis=1))
        axes *= max_radius / longest
        return valid_mask, coords, axes

    def _update_graph(self, reset_view=False):
        self.graph.zoomStack = []
        if self.graph.data is None:
            return
        self.graph.update_data(self.variable_x, self.variable_y, reset_view)

    def update_density(self):
        """Refresh the graph without resetting the current view."""
        self._update_graph(reset_view=False)

    def selection_changed(self):
        """Store the graph's selection for all data rows and commit.

        The graph's selection is written into the positions of
        ``self.plotdata.valid_mask`` inside a zero-filled ``uint8`` array
        with one entry per data row; the remaining positions stay ``0``.  A
        plain-list copy is kept in ``self.selection_indices``.  With no
        graph selection both attributes are reset to ``None``.
        """
        picked = self.graph.selection
        if picked is None:
            self._selection = self.selection_indices = None
        else:
            self._selection = expanded = np.zeros(len(self.data),
                                                  dtype=np.uint8)
            expanded[self.plotdata.valid_mask] = picked
            self.selection_indices = expanded.tolist()
        self.commit()

    def prepare_data(self):
        """Do nothing; this implementation is intentionally empty."""

    def commit(self):
        """Send the selected data, the annotated data and the components.

        When there is no input data or nothing has been plotted, ``None`` is
        sent on all three outputs.  Otherwise the graph's selection (which
        covers only the valid rows) is expanded to all rows of
        ``self.plotdata.data``; the selected rows form the "selected"
        output and the annotated table marks them, using groups when any
        selection value is greater than 1.
        """
        def prepare_components():
            """Return the projection axes as a table, one row per component."""
            if self.placement == self.Placement.Projection:
                # The input projection is passed on unchanged.
                return self.projection
            if self.placement in [self.Placement.Circular, self.Placement.LDA]:
                attrs = list(self.model_selected[:])
                axes = self.plotdata.axes
            elif self.placement == self.Placement.PCA:
                axes = self._pca.components_.T
                attrs = list(self._pca.orig_domain.attributes)
            domain = Domain(
                [ContinuousVariable(a.name, compute_value=lambda _: None)
                 for a in attrs],
                metas=[StringVariable(name='component')])
            # One label per component, as a column of the meta array.
            metas = np.array(
                [["{}{}".format(self.Component_name[self.placement], i + 1)
                  for i in range(axes.shape[1])]],
                dtype=object).T
            components = Table(domain, axes.T, metas=metas)
            components.name = 'components'
            return components

        selected = annotated = components = None
        if self.data is not None and self.plotdata.data is not None:
            components = prepare_components()

            graph = self.graph
            # Expand the per-valid-row selection to one entry per data row.
            # Without a selection every entry becomes 0; the former
            # ``[False * len(mask)]`` evaluated to ``[0]`` and only worked
            # through broadcasting.
            mask = self.plotdata.valid_mask.astype(int)
            mask[mask == 1] = \
                graph.selection if graph.selection is not None else 0
            selection = np.flatnonzero(mask)

            name = self.data.name
            data = self.plotdata.data
            if len(selection):
                selected = data[selection]
                selected.name = name + ": selected"
                selected.attributes = self.data.attributes

            # ``np.any`` (unlike ``np.max``) also accepts an empty selection.
            has_groups = graph.selection is not None \
                and np.any(np.asarray(graph.selection) > 1)
            if has_groups:
                annotated = create_groups_table(data, mask)
            else:
                annotated = create_annotated_table(data, selection)
            annotated.attributes = self.data.attributes
            annotated.name = name + ": annotated"

        self.Outputs.selected_data.send(selected)
        self.Outputs.annotated_data.send(annotated)
        self.Outputs.components.send(components)

    def send_report(self):
        """Add the plot and a caption describing its settings to the report.

        Nothing is reported without data.  The caption lists the projection
        title and the names of the graph's colour, label, shape and size
        variables, plus the jittering percentage when it is non-zero.
        """
        if self.data is None:
            return

        # Titles indexed by the integer value of the placement.
        placement_titles = ("Circular Placement",
                            "Linear Discriminant Analysis",
                            "Principal Component Analysis",
                            "Input projection")
        graph = self.graph

        def var_name(var):
            # A falsy ``var`` (e.g. None) is passed through unchanged.
            return var and var.name

        items = (
            ("Projection", placement_titles[self.placement]),
            ("Color", var_name(graph.attr_color)),
            ("Label", var_name(graph.attr_label)),
            ("Shape", var_name(graph.attr_shape)),
            ("Size", var_name(graph.attr_size)),
            ("Jittering",
             graph.jitter_size != 0 and "{} %".format(graph.jitter_size)),
        )
        caption = report.render_items_vert(items)
        self.report_plot()
        if caption:
            self.report_caption(caption)

    @classmethod
    def migrate_settings(cls, settings_, version):
        """Upgrade a stored settings dictionary in place.

        Parameters
        ----------
        settings_ : dict
            Settings as stored by an older version; modified in place.
        version : int
            Version the settings were stored with.

        Raises
        ------
        KeyError
            If a key expected for the given version is missing.
        """
        if version < 2:
            # "point_size" was renamed to "point_width".
            settings_["point_width"] = settings_["point_size"]
        if version < 3:
            # These settings moved into the nested "graph" dictionary:
            # (key inside "graph", old top-level key).
            moved = (
                ("jitter_size", "jitter_value"),
                ("point_width", "point_width"),
                ("alpha_value", "alpha_value"),
                ("class_density", "class_density"),
            )
            settings_["graph"] = {
                new_key: settings_[old_key] for new_key, old_key in moved
            }

    @classmethod
    def migrate_context(cls, context, version):
        """Upgrade a stored context in place.

        Before version 2 the colour, shape and size variables were stored
        as 1-based positions (``*_index`` values); they are converted to
        ``(name, type + 100)`` pairs, or ``None`` when the position is out
        of range.  Before version 3 the three ``attr_*`` values are also
        copied into a nested ``"graph"`` dictionary.

        Parameters
        ----------
        context
            Object with ``ordered_domain`` (a sequence of ``(name, type)``
            entries) and a ``values`` dictionary; modified in place.
        version : int
            Version the context was stored with.
        """
        values = context.values
        if version < 2:
            ordered = context.ordered_domain
            # presumably type code 2 = continuous, 1 = discrete -- confirm
            # against the context handler
            code_2 = [entry for entry in context.ordered_domain
                      if entry[1] == 2]
            code_1 = [entry for entry in context.ordered_domain
                      if entry[1] == 1]
            conversions = (
                (ordered, "color_index", "attr_color"),
                (code_1, "shape_index", "attr_shape"),
                (code_2, "size_index", "attr_size"),
            )
            for candidates, index_key, attr_key in conversions:
                position = context.values[index_key][0] - 1
                if 0 <= position < len(candidates):
                    entry = candidates[position]
                    context.values[attr_key] = (entry[0], entry[1] + 100)
                else:
                    context.values[attr_key] = None
        if version < 3:
            context.values["graph"] = {
                key: context.values[key]
                for key in ("attr_color", "attr_shape", "attr_size")
            }
Пример #10
0
        if self.always_show_axes:
            self.plot_widget.removeItem(self.circle_item)
            self.circle_item = None

        if self.circle_item is not None:
            points, _ = self.master.get_anchors()
            if points is None:
                return

            r = self.scaled_radius * np.max(np.linalg.norm(points, axis=1))
            self.circle_item.setRect(QRectF(-r, -r, 2 * r, 2 * r))
            pen = pg.mkPen(QColor(Qt.lightGray), width=1, cosmetic=True)
            self.circle_item.setPen(pen)


# Integer-valued enumeration of the available anchor placements.
Placement = Enum(
    "Placement",
    {"Circular": 0, "LDA": 1, "PCA": 2},
    type=int,
    qualname="Placement",
)


class OWLinearProjection(OWAnchorProjectionWidget):
    """Widget showing a multi-axis linear projection of the data."""

    # Widget metadata.
    name = "Linear Projection"
    description = ("A multi-axis projection of data onto "
                   "a two-dimensional plane.")
    icon = "icons/LinearProjection.svg"
    priority = 240
    keywords = []

    # Human-readable title of every placement.
    Projection_name = {
        Placement.Circular: "Circular Placement",
        Placement.LDA: "Linear Discriminant Analysis",
        Placement.PCA: "Principal Component Analysis",
    }

    settings_version = 6
Пример #11
0
class OWLinearProjection(OWAnchorProjectionWidget):
    """Widget showing a multi-axis linear projection of the data.

    The anchors can be placed on a circle or taken from an LDA or a PCA
    fit (see ``Placement``); the variables that take part in the projection
    are chosen by the user or suggested by the ranking dialog.
    """
    name = "Linear Projection"
    description = "A multi-axis projection of data onto " \
                  "a two-dimensional plane."
    icon = "icons/LinearProjection.svg"
    priority = 240
    keywords = []

    # Integer-valued enumeration of the available anchor placements.
    Placement = Enum("Placement",
                     dict(Circular=0, LDA=1, PCA=2),
                     type=int,
                     qualname="OWLinearProjection.Placement")

    # Human-readable title of every placement; used for the radio button
    # labels and for the report caption.
    Projection_name = {
        Placement.Circular: "Circular Placement",
        Placement.LDA: "Linear Discriminant Analysis",
        Placement.PCA: "Principal Component Analysis"
    }

    settings_version = 5

    placement = Setting(Placement.Circular)
    selected_vars = ContextSetting([])
    vizrank = SettingProvider(LinearProjectionVizRank)
    GRAPH_CLASS = OWLinProjGraph
    graph = SettingProvider(OWLinProjGraph)

    left_side_scrolling = True

    class Error(OWAnchorProjectionWidget.Error):
        no_cont_features = Msg("Plotting requires numeric features")

    def __init__(self):
        """Create the variable models and the ranking dialog, then the base.

        NOTE(review): the models and the vizrank button are created before
        ``super().__init__()``, presumably because the base initializer
        builds the controls that use them -- confirm against the base class.
        """
        self.model_selected = VariableListModel(enable_dnd=True)
        self.model_selected.removed.connect(self.__model_selected_changed)
        self.model_other = VariableListModel(enable_dnd=True)

        self.vizrank, self.btn_vizrank = LinearProjectionVizRank.add_vizrank(
            None, self, "Suggest Features", self.__vizrank_set_attrs)

        super().__init__()

    def _add_controls(self):
        """Build the control area.

        Adds the variable selection and the placement box, then the base
        class controls and a "Hide radius" slider bound to
        ``graph.hide_radius``; finally removes ``control_area_stretch``
        from the control area.
        """
        self._add_controls_variables()
        self._add_controls_placement()
        super()._add_controls()
        self.gui.add_control(self._effects_box,
                             gui.hSlider,
                             "Hide radius:",
                             master=self.graph,
                             value="hide_radius",
                             minValue=0,
                             maxValue=100,
                             step=10,
                             createLabel=False,
                             callback=self.__radius_slider_changed)
        self.controlArea.layout().removeWidget(self.control_area_stretch)
        self.control_area_stretch.setParent(None)

    def _add_controls_variables(self):
        """Add the variable selection box and put the vizrank button in it."""
        self.variables_selection = VariablesSelection(self,
                                                      self.model_selected,
                                                      self.model_other,
                                                      self.controlArea)
        # Adding and removing variables both trigger a full update.
        self.variables_selection.added.connect(self.__model_selected_changed)
        self.variables_selection.removed.connect(self.__model_selected_changed)
        self.variables_selection.add_remove.layout().addWidget(
            self.btn_vizrank)

    def _add_controls_placement(self):
        """Add the radio buttons for choosing the placement."""
        box = gui.widgetBox(self.controlArea,
                            True,
                            sizePolicy=(QSizePolicy.Minimum,
                                        QSizePolicy.Maximum))
        # One button per member of ``Placement``, in enumeration order.
        self.radio_placement = gui.radioButtonsInBox(
            box,
            self,
            "placement",
            btnLabels=[self.Projection_name[x] for x in self.Placement],
            callback=self.__placement_radio_changed)

    @property
    def continuous_variables(self):
        """Continuous variables among the domain's variables and metas.

        An empty list is returned when there is no data or no domain.
        """
        if self.data is None or self.data.domain is None:
            return []
        dom = self.data.domain
        return [v for v in chain(dom.variables, dom.metas) if v.is_continuous]

    @property
    def effective_variables(self):
        """The currently selected variables (a slice of the model)."""
        return self.model_selected[:]

    def __vizrank_set_attrs(self, attrs):
        """Use ``attrs`` as the selected variables; ignore an empty value.

        All remaining continuous variables go to the list of other
        variables, and the projection is updated.
        """
        if not attrs:
            return
        self.model_selected[:] = attrs[:]
        self.model_other[:] = [
            var for var in self.continuous_variables if var not in attrs
        ]
        self.__model_selected_changed()

    def __model_selected_changed(self):
        """React to a change of the selected variables.

        Stores the selection as ``(name, vartype)`` pairs in the context
        setting, drops the current projection, re-checks the placement
        options and rebuilds the projection, the plot and the outputs.
        """
        self.selected_vars = [(var.name, vartype(var))
                              for var in self.model_selected]
        self.projection = None
        self._check_options()
        self.init_projection()
        self.setup_plot()
        self.commit()

    def __placement_radio_changed(self):
        """React to a change of the placement.

        The "Hide radius" slider is enabled for every placement except the
        circular one; the projection and projector are dropped and rebuilt
        together with the ranking dialog, the plot and the outputs.
        """
        self.controls.graph.hide_radius.setEnabled(
            self.placement != self.Placement.Circular)
        self.projection = self.projector = None
        self._init_vizrank()
        self.init_projection()
        self.setup_plot()
        self.commit()

    def __radius_slider_changed(self):
        """Forward a change of the "Hide radius" slider to the graph."""
        self.graph.update_radius()

    def colors_changed(self):
        """Re-initialize the ranking dialog after the base class handling."""
        super().colors_changed()
        self._init_vizrank()

    def set_data(self, data):
        """Set the data, then refresh the options, ranking and projector."""
        super().set_data(data)
        self._check_options()
        self._init_vizrank()
        self.init_projection()

    def use_context(self):
        """Fill the two variable models from the context settings.

        When selected variables are stored in ``selected_vars`` they are
        looked up in the domain by name and the other continuous variables
        go to ``model_other``.  Otherwise the first three continuous
        variables are selected.  Both models stay empty without data.
        """
        self.model_selected.clear()
        self.model_other.clear()
        if self.data is not None and len(self.selected_vars):
            d, selected = self.data.domain, [v[0] for v in self.selected_vars]
            self.model_selected[:] = [d[attr] for attr in selected]
            self.model_other[:] = [
                d[attr.name] for attr in self.continuous_variables
                if attr.name not in selected
            ]
        elif self.data is not None:
            self.model_selected[:] = self.continuous_variables[:3]
            self.model_other[:] = self.continuous_variables[3:]

    def _check_options(self):
        """Enable or disable placement options according to the data.

        LDA is disabled when the data has no discrete class or when ``Y``
        holds fewer than three unique values; if LDA was the current
        placement, the circular one is used instead.
        """
        buttons = self.radio_placement.buttons
        for btn in buttons:
            btn.setEnabled(True)

        if self.data is not None:
            has_discrete_class = self.data.domain.has_discrete_class
            if not has_discrete_class or len(np.unique(self.data.Y)) < 3:
                # Placement members are ints, so they index the button list.
                buttons[self.Placement.LDA].setEnabled(False)
                if self.placement == self.Placement.LDA:
                    self.placement = self.Placement.Circular

        self.controls.graph.hide_radius.setEnabled(
            self.placement != self.Placement.Circular)

    def _init_vizrank(self):
        """Enable the "Suggest Features" button when ranking is possible.

        The button is disabled, with the reason shown as its tooltip, when
        there is no data, no colour variable, a continuous colour variable
        combined with LDA, fewer than three continuous variables other than
        the colour one, or fewer than two rows with all-finite ``X``.
        Otherwise it is enabled unless every value of the colour column is
        NaN.
        """
        is_enabled, msg = False, ""
        if self.data is None:
            msg = "There is no data."
        elif self.attr_color is None:
            msg = "Color variable has to be selected"
        elif self.attr_color.is_continuous and \
                self.placement == self.Placement.LDA:
            msg = "Suggest Features does not work for Linear " \
                  "Discriminant Analysis Projection when " \
                  "continuous color variable is selected."
        elif len(
            [v for v in self.continuous_variables if v is not self.attr_color
             ]) < 3:
            msg = "Not enough available continuous variables"
        elif np.sum(np.all(np.isfinite(self.data.X), axis=1)) < 2:
            msg = "Not enough valid data instances"
        else:
            is_enabled = not np.isnan(
                self.data.get_column_view(
                    self.attr_color)[0].astype(float)).all()
        self.btn_vizrank.setToolTip(msg)
        self.btn_vizrank.setEnabled(is_enabled)
        if is_enabled:
            self.vizrank.initialize()

    def check_data(self):
        """Reject data that has no continuous variables.

        After the base class checks, the corresponding error is shown and
        ``self.data`` is reset to ``None`` in that case.
        """
        def error(err):
            # Show the given message and drop the data.
            err()
            self.data = None

        super().check_data()
        if self.data is not None:
            if not len(self.continuous_variables):
                error(self.Error.no_cont_features)

    def init_attr_values(self):
        """Reset the stored variable selection after the base class reset."""
        super().init_attr_values()
        self.selected_vars = []

    def init_projection(self):
        """Create the projector that matches the current placement.

        For PCA, ``Normalize()`` is appended to the default preprocessors.
        The base class implementation is called afterwards.
        """
        if self.placement == self.Placement.Circular:
            self.projector = CircularPlacement()
        elif self.placement == self.Placement.LDA:
            self.projector = LDA(solver="eigen", n_components=2)
        elif self.placement == self.Placement.PCA:
            self.projector = PCA(n_components=2)
            self.projector.component = 2
            self.projector.preprocessors = PCA.preprocessors + [Normalize()]

        super().init_projection()

    def get_coordinates_data(self):
        """Return the normalized x and y coordinates of the valid rows.

        Each embedding column is centred on its mean and divided by its
        span (max - min; a zero span is replaced by 1).  A single-column
        embedding is returned as ``(x, zeros)``.  ``(None, None)`` is
        returned when there is no embedding.
        """
        def normalized(a):
            span = np.max(a, axis=0) - np.min(a, axis=0)
            # Avoid division by zero for constant columns.
            span[span == 0] = 1
            return (a - np.mean(a, axis=0)) / span

        embedding = self.get_embedding()
        if embedding is None:
            return None, None
        norm_emb = normalized(embedding[self.valid_data])
        return (norm_emb.ravel(), np.zeros(len(norm_emb), dtype=float)) \
            if embedding.shape[1] == 1 else norm_emb.T

    def _get_send_report_caption(self):
        """Return the report caption describing the current plot settings."""
        def projection_name():
            return self.Projection_name[self.placement]

        return report.render_items_vert(
            (("Projection", projection_name()),
             ("Color", self._get_caption_var_name(self.attr_color)),
             ("Label", self._get_caption_var_name(self.attr_label)),
             ("Shape", self._get_caption_var_name(self.attr_shape)),
             ("Size", self._get_caption_var_name(self.attr_size)),
             # Jittering is reported only when it is non-zero.
             ("Jittering", self.graph.jitter_size != 0
              and "{} %".format(self.graph.jitter_size))))

    @classmethod
    def migrate_settings(cls, settings_, version):
        """Upgrade a stored settings dictionary in place.

        ``settings_`` is the dictionary stored by an older version of the
        widget and ``version`` is the version it was stored with.
        """
        if version < 2:
            # "point_size" was renamed to "point_width".
            settings_["point_width"] = settings_["point_size"]
        if version < 3:
            # Graph-related settings moved into the nested "graph" dict.
            settings_graph = {}
            settings_graph["jitter_size"] = settings_["jitter_value"]
            settings_graph["point_width"] = settings_["point_width"]
            settings_graph["alpha_value"] = settings_["alpha_value"]
            settings_graph["class_density"] = settings_["class_density"]
            settings_["graph"] = settings_graph
        if version < 4:
            if "radius" in settings_:
                settings_["graph"]["hide_radius"] = settings_["radius"]
            if "selection_indices" in settings_ and \
                    settings_["selection_indices"] is not None:
                # Convert the per-row flags to (row index, 1) pairs for the
                # flagged rows.
                selection = settings_["selection_indices"]
                settings_["selection"] = [
                    (i, 1) for i, selected in enumerate(selection) if selected
                ]
        if version < 5:
            # Fall back to the circular placement when the stored placement
            # is not a member of the current enumeration.
            if "placement" in settings_ and \
                    settings_["placement"] not in cls.Placement:
                settings_["placement"] = cls.Placement.Circular

    @classmethod
    def migrate_context(cls, context, version):
        """Upgrade a stored context in place.

        ``context`` provides ``ordered_domain`` and a ``values`` dictionary;
        ``version`` is the version the context was stored with.
        """
        if version < 2:
            # Old contexts stored 1-based positions; convert them to
            # (name, type + 100) pairs, or None when out of range.
            domain = context.ordered_domain
            # presumably type code 2 = continuous, 1 = discrete -- confirm
            c_domain = [t for t in context.ordered_domain if t[1] == 2]
            d_domain = [t for t in context.ordered_domain if t[1] == 1]
            for d, old_val, new_val in ((domain, "color_index", "attr_color"),
                                        (d_domain, "shape_index",
                                         "attr_shape"),
                                        (c_domain, "size_index", "attr_size")):
                index = context.values[old_val][0] - 1
                context.values[new_val] = (d[index][0], d[index][1] + 100) \
                    if 0 <= index < len(d) else None
        if version < 3:
            context.values["graph"] = {
                "attr_color": context.values["attr_color"],
                "attr_shape": context.values["attr_shape"],
                "attr_size": context.values["attr_size"]
            }
        if version == 3:
            # Version 3 kept these values inside "graph"; copy them back to
            # the top level of the context values.
            values = context.values
            values["attr_color"] = values["graph"]["attr_color"]
            values["attr_size"] = values["graph"]["attr_size"]
            values["attr_shape"] = values["graph"]["attr_shape"]
            values["attr_label"] = values["graph"]["attr_label"]
Пример #12
0
class Randomize(Preprocess):
    """
    Preprocessor that shuffles classes, attributes and/or metas.

    Calling the preprocessor on a data table returns a copy of the table
    in which the chosen parts are shuffled.

    Parameters
    ----------
    rand_type : RandTypes (default: Randomize.RandomizeClasses)
        What to shuffle: Randomize.RandomizeClasses shuffles the classes,
        Randomize.RandomizeAttributes the attributes and
        Randomize.RandomizeMetas the metas.  The values are bit flags
        (1, 2 and 4) and are tested with a bitwise and.

    rand_seed : int (optional)
        Random seed

    Examples
    --------
    >>> from Orange.data import Table
    >>> from Orange.preprocess import Randomize
    >>> data = Table("iris")
    >>> randomizer = Randomize(Randomize.RandomizeClasses)
    >>> randomized_data = randomizer(data)
    """
    Type = Enum(
        "Randomize",
        {"RandomizeClasses": 1, "RandomizeAttributes": 2, "RandomizeMetas": 4},
        type=int,
        qualname="Randomize.Type",
    )
    RandomizeClasses, RandomizeAttributes, RandomizeMetas = Type

    def __init__(self, rand_type=RandomizeClasses, rand_seed=None):
        self.rand_type = rand_type
        self.rand_seed = rand_seed

    def __call__(self, data):
        """
        Return a shuffled copy of the given data.

        Parameters
        ----------
        data : Orange.data.Table
            A data table to be randomized.

        Returns
        -------
        data : Orange.data.Table
            Randomized data table.
        """
        shuffled = data.copy()
        seeder = np.random.RandomState(self.rand_seed)
        # Draw an independent seed for each part so that X and Y are never
        # shuffled with the same seed.
        class_seed, attr_seed, meta_seed = seeder.randint(
            0, 2**32 - 1, size=3, dtype=np.int64)
        if self.rand_type & Randomize.RandomizeClasses:
            shuffled.Y = self.randomize(shuffled.Y, class_seed)
        if self.rand_type & Randomize.RandomizeAttributes:
            shuffled.X = self.randomize(shuffled.X, attr_seed)
        if self.rand_type & Randomize.RandomizeMetas:
            shuffled.metas = self.randomize(shuffled.metas, meta_seed)
        return shuffled

    def randomize(self, table, rand_state=None):
        """
        Shuffle every column of ``table`` independently and return it.

        Dense arrays are shuffled in place.  A sparse matrix is converted
        to CSC and the row indices of the stored entries are remapped
        through a fresh row permutation for every column.
        """
        rstate = np.random.RandomState(rand_state)
        if sp.issparse(table):
            table = table.tocsc()  # type: sp.spmatrix
            n_rows, n_cols = table.shape[0], table.shape[1]
            for col in range(n_cols):
                permutation = rstate.permutation(n_rows)
                start, stop = table.indptr[col], table.indptr[col + 1]
                # A view into ``indices``: assigning through it rewrites the
                # row index of every entry stored for this column.
                rows = table.indices[start:stop]
                rows[:] = permutation[rows]
            return table
        if len(table.shape) > 1:
            for col in range(table.shape[1]):
                rstate.shuffle(table[:, col])
            return table
        rstate.shuffle(table)
        return table
Пример #13
0
class Normalize(Preprocess):
    """
    Preprocessor that normalizes features.

    Calling the preprocessor on a data table returns a new table in which
    the continuous attributes are normalized.

    Parameters
    ----------
    zero_based : bool (default=True)
        Selects the "low" value of the variable, i.e. the interval of the
        normalized continuous variables (either [-1, 1] or [0, 1]).

    norm_type : NormTypes (default: Normalize.NormalizeBySD)
        Normalization type. With Normalize.NormalizeBySD the values are
        standardized by subtracting the average value and dividing by the
        standard deviation; zero_based has no effect on this
        standardization.

        With Normalize.NormalizeBySpan the values are normalized by
        subtracting the min value of the data and dividing by the span
        (max - min).

    transform_class : bool (default=False)
        If True the class is normalized as well.

    Examples
    --------
    >>> from Orange.data import Table
    >>> from Orange.preprocess import Normalize
    >>> data = Table("iris")
    >>> normalizer = Normalize(norm_type=Normalize.NormalizeBySpan)
    >>> normalized_data = normalizer(data)
    """
    Type = Enum(
        "Normalize",
        ("NormalizeBySpan", "NormalizeBySD"),
        qualname="Normalize.Type",
    )
    NormalizeBySpan, NormalizeBySD = Type

    def __init__(self, zero_based=True, norm_type=NormalizeBySD,
                 transform_class=False):
        self.zero_based = zero_based
        self.norm_type = norm_type
        self.transform_class = transform_class

    def __call__(self, data):
        """
        Normalize the given data and return the result as a new table.

        The input table itself is returned, unchanged, when every
        continuous attribute is flagged with ``skip-normalization``.

        Parameters
        ----------
        data : Orange.data.Table
            A data table to be normalized.

        Returns
        -------
        data : Orange.data.Table
            Normalized data table.
        """
        from . import normalize

        # Data whose features are all marked as already normalized is passed
        # through untouched.  SVMs use this preprocessor by default, and
        # normalizing sparse data (e.g. Bag of Words models, where it is not
        # really needed) would produce a dense matrix that needs too much
        # memory.
        continuous = (attr for attr in data.domain.attributes
                      if attr.is_continuous)
        if all(attr.attributes.get('skip-normalization', False)
               for attr in continuous):
            return data

        return normalize.Normalizer(
            zero_based=self.zero_based,
            norm_type=self.norm_type,
            transform_class=self.transform_class,
        )(data)
Пример #14
0
class Randomize(Preprocess):
    """
    Construct a preprocessor for randomization of classes,
    attributes and/or metas.
    Given a data table, preprocessor returns a new table in
    which the data is shuffled.

    Parameters
    ----------

    rand_type : RandTypes (default: Randomize.RandomizeClasses)
        Randomization type. If Randomize.RandomizeClasses, classes
        are shuffled.
        If Randomize.RandomizeAttributes, attributes are shuffled.
        If Randomize.RandomizeMetas, metas are shuffled.
        The values are bit flags (1, 2 and 4) and are tested with a
        bitwise and.

    rand_seed : int (optional)
        Random seed

    Examples
    --------
    >>> from Orange.data import Table
    >>> from Orange.preprocess import Randomize
    >>> data = Table("iris")
    >>> randomizer = Randomize(Randomize.RandomizeClasses)
    >>> randomized_data = randomizer(data)
    """
    # ``qualname`` gives the nested enumeration its real dotted path, which
    # pickle needs in order to find it.
    Type = Enum("Randomize",
                dict(RandomizeClasses=1,
                     RandomizeAttributes=2,
                     RandomizeMetas=4),
                type=int,
                qualname="Randomize.Type")
    RandomizeClasses, RandomizeAttributes, RandomizeMetas = Type

    def __init__(self, rand_type=RandomizeClasses, rand_seed=None):
        self.rand_type = rand_type
        self.rand_seed = rand_seed

    def __call__(self, data):
        """
        Apply randomization of the given data. Returns a new
        data table.

        Parameters
        ----------
        data : Orange.data.Table
            A data table to be randomized.

        Returns
        -------
        data : Orange.data.Table
            Randomized data table.
        """
        # Imported locally so that this block does not depend on the
        # module's import list.
        import numpy as np

        new_data = data.copy()
        rstate = np.random.RandomState(self.rand_seed)
        # Ensure the same seed is not used to shuffle X and Y at the same
        # time: with one shared seed all parts would get the very same
        # permutation and the rows would merely be reordered.
        r1, r2, r3 = rstate.randint(0, 2**32 - 1, size=3, dtype=np.int64)
        if self.rand_type & Randomize.RandomizeClasses:
            new_data.Y = self.randomize(new_data.Y, r1)
        if self.rand_type & Randomize.RandomizeAttributes:
            new_data.X = self.randomize(new_data.X, r2)
        if self.rand_type & Randomize.RandomizeMetas:
            new_data.metas = self.randomize(new_data.metas, r3)
        return new_data

    def randomize(self, table, rand_state=None):
        """
        Return ``table`` shuffled with ``rand_state`` as the random state.

        When ``rand_state`` is None, ``self.rand_seed`` is used, which is
        what this method did before the parameter was added.
        """
        if rand_state is None:
            rand_state = self.rand_seed
        return skl_shuffle(table, random_state=rand_state)
Пример #15
0
class OWLinearProjection(OWAnchorProjectionWidget):
    """
    Widget showing a linear projection of data onto a plane.

    The projection axes are chosen by ``placement`` (see ``Placement``):
    a circular layout of the selected variables, LDA, PCA, or the table
    received on the "Projection" input.
    """
    name = "Linear Projection"
    description = "A multi-axis projection of data onto " \
                  "a two-dimensional plane."
    icon = "icons/LinearProjection.svg"
    priority = 240
    keywords = []

    # Adds the optional "Projection" input to the inherited inputs.
    class Inputs(OWAnchorProjectionWidget.Inputs):
        projection_input = Input("Projection", Table)

    # The integer values double as indices of the placement radio buttons
    # (see ``_check_options``).
    Placement = Enum("Placement", dict(Circular=0, LDA=1, PCA=2, Projection=3),
                     type=int, qualname="OWLinearProjection.Placement")

    # Prefix of the component labels written by ``send_components``.
    Component_name = {Placement.Circular: "C", Placement.LDA: "LD",
                      Placement.PCA: "PC"}
    # Prefix of the names of the output embedding variables.
    Variable_name = {Placement.Circular: "circular",
                     Placement.LDA: "lda",
                     Placement.PCA: "pca",
                     Placement.Projection: "projection"}
    # Radio button labels; also used in the report caption.
    Projection_name = {Placement.Circular: "Circular Placement",
                       Placement.LDA: "Linear Discriminant Analysis",
                       Placement.PCA: "Principal Component Analysis",
                       Placement.Projection: "Use input projection"}

    settings_version = 4

    placement = Setting(Placement.Circular)
    selected_vars = ContextSetting([])
    vizrank = SettingProvider(LinearProjectionVizRank)
    GRAPH_CLASS = OWLinProjGraph
    graph = SettingProvider(OWLinProjGraph)

    class Warning(OWAnchorProjectionWidget.Warning):
        not_enough_comp = Msg("Input projection has less than two components")
        trivial_components = Msg(
            "All components of the PCA are trivial (explain zero variance). "
            "Input data is constant (or near constant).")

    class Error(OWAnchorProjectionWidget.Error):
        no_cont_features = Msg("Plotting requires numeric features")
        proj_and_domain_match = Msg("Projection and Data domains do not match")

    def __init__(self):
        """Create the variable models and the VizRank button, then the GUI."""
        # These are created before ``super().__init__()``; presumably the
        # base initializer builds the controls that use them -- confirm.
        self.model_selected = VariableListModel(enable_dnd=True)
        self.model_selected.rowsInserted.connect(self.__model_selected_changed)
        self.model_selected.rowsRemoved.connect(self.__model_selected_changed)
        self.model_other = VariableListModel(enable_dnd=True)

        self.vizrank, self.btn_vizrank = LinearProjectionVizRank.add_vizrank(
            None, self, "Suggest Features", self.__vizrank_set_attrs)

        super().__init__()
        self.projection_input = None
        self.variables = None

    def _add_controls(self):
        """Build the control area: variables, placement, base controls and
        the "Hide radius" slider."""
        self._add_controls_variables()
        self._add_controls_placement()
        super()._add_controls()
        self.graph.gui.add_control(
            self._effects_box, gui.hSlider, "Hide radius:", master=self.graph,
            value="hide_radius", minValue=0, maxValue=100, step=10,
            createLabel=False, callback=self.__radius_slider_changed
        )
        # Drop the stretch item from the control area layout.
        self.controlArea.layout().removeWidget(self.control_area_stretch)
        self.control_area_stretch.setParent(None)

    def _add_controls_variables(self):
        """Add the variable selection lists together with the VizRank
        button."""
        self.variables_selection = VariablesSelection(
            self, self.model_selected, self.model_other, self.controlArea
        )
        self.variables_selection.add_remove.layout().addWidget(
            self.btn_vizrank
        )

    def _add_controls_placement(self):
        """Add one radio button per ``Placement`` member."""
        box = gui.widgetBox(
            self.controlArea, True,
            sizePolicy=(QSizePolicy.Minimum, QSizePolicy.Maximum)
        )
        self.radio_placement = gui.radioButtonsInBox(
            box, self, "placement",
            btnLabels=[self.Projection_name[x] for x in self.Placement],
            callback=self.__placement_radio_changed
        )

    @property
    def continuous_variables(self):
        """List of continuous variables among the domain's variables and
        metas; empty when there is no data or no domain."""
        if self.data is None or self.data.domain is None:
            return []
        dom = self.data.domain
        return [v for v in chain(dom.variables, dom.metas) if v.is_continuous]

    def __vizrank_set_attrs(self, attrs):
        """Use the variables suggested by VizRank as the selected ones.

        An empty ``attrs`` is ignored. The remaining continuous variables
        are put into the list of other variables.
        """
        if not attrs:
            return
        self.model_selected[:] = attrs[:]
        self.model_other[:] = [var for var in self.continuous_variables
                               if var not in attrs]

    def __model_selected_changed(self):
        """React to rows inserted into / removed from the selected model:
        store the selection, invalidate the projection and replot."""
        self.selected_vars = [(var.name, vartype(var)) for var
                              in self.model_selected]
        self.projection = None
        self.variables = None
        self._check_options()
        self.setup_plot()
        self.commit()

    def __placement_radio_changed(self):
        """React to a change of the placement radio buttons."""
        # Variables can be chosen only for Circular and LDA placements; the
        # hide radius control is disabled for the Circular placement.
        self.variables_selection.set_enabled(
            self.placement in [self.Placement.Circular, self.Placement.LDA])
        self.controls.graph.hide_radius.setEnabled(
            self.placement != self.Placement.Circular)
        self.projection = None
        self.variables = None
        self._init_vizrank()
        self.setup_plot()
        self.commit()

    def __radius_slider_changed(self):
        """Forward a change of the "Hide radius" slider to the graph."""
        self.graph.update_radius()

    def colors_changed(self):
        """Re-evaluate VizRank availability after the base class handled
        the color change."""
        super().colors_changed()
        self._init_vizrank()

    def set_data(self, data):
        """Set the input data and fill the selected/other variable models.

        With stored ``selected_vars`` the selection is restored by name;
        otherwise the first three continuous variables are selected.
        """
        super().set_data(data)
        if self.data is not None and len(self.selected_vars):
            d, selected = self.data.domain, [v[0] for v in self.selected_vars]
            self.model_selected[:] = [d[attr] for attr in selected]
            self.model_other[:] = [d[attr.name] for attr in
                                   self.continuous_variables
                                   if attr.name not in selected]
        elif self.data is not None:
            self.model_selected[:] = self.continuous_variables[:3]
            self.model_other[:] = self.continuous_variables[3:]

        self._check_options()
        self._init_vizrank()

    def _check_options(self):
        """Enable only the placements that are applicable and fall back to
        the Circular placement when the current one is not."""
        buttons = self.radio_placement.buttons
        for btn in buttons:
            btn.setEnabled(True)
        if self.data is not None:
            # LDA is offered only with a discrete class that has at least
            # two distinct values in Y.
            has_discrete_class = self.data.domain.has_discrete_class
            if not has_discrete_class or len(np.unique(self.data.Y)) < 2:
                buttons[self.Placement.LDA].setEnabled(False)
                if self.placement == self.Placement.LDA:
                    self.placement = self.Placement.Circular
            # "Use input projection" requires a projection on the input.
            if not self.projection_input:
                buttons[self.Placement.Projection].setEnabled(False)
                if self.placement == self.Placement.Projection:
                    self.placement = self.Placement.Circular

        self.variables_selection.set_enabled(
            self.placement in [self.Placement.Circular, self.Placement.LDA])
        self.controls.graph.hide_radius.setEnabled(
            self.placement != self.Placement.Circular)

    def _init_vizrank(self):
        """Enable or disable the VizRank button.

        When disabled, the reason is shown as the button's tooltip; when
        enabled, VizRank is (re)initialized.
        """
        is_enabled, msg = False, ""
        if self.data is None:
            msg = "There is no data."
        elif self.placement not in [self.Placement.Circular,
                                    self.Placement.LDA]:
            msg = "Suggest Features works only for Circular and " \
                  "Linear Discriminant Analysis Projection"
        elif self.attr_color is None:
            msg = "Color variable has to be selected"
        elif self.attr_color.is_continuous and \
                self.placement == self.Placement.LDA:
            msg = "Suggest Features does not work for Linear " \
                  "Discriminant Analysis Projection when " \
                  "continuous color variable is selected."
        elif len([v for v in self.continuous_variables
                  if v is not self.attr_color]) < 3:
            msg = "Not enough available continuous variables"
        elif len(self.data[self.valid_data]) < 2:
            msg = "Not enough valid data instances"
        else:
            # Enabled unless every value of the color variable is NaN.
            is_enabled = not np.isnan(self.data.get_column_view(
                self.attr_color)[0].astype(float)).all()
        self.btn_vizrank.setToolTip(msg)
        self.btn_vizrank.setEnabled(is_enabled)
        if is_enabled:
            self.vizrank.initialize()

    def check_data(self):
        """Run the base checks and reject data without continuous
        variables (the data is then set to ``None``)."""
        def error(err):
            # Show the message and discard the data.
            err()
            self.data = None

        super().check_data()
        if self.data is not None:
            if not len(self.continuous_variables):
                error(self.Error.no_cont_features)

    def init_attr_values(self):
        """Reset the stored variable selection after the base
        initialization."""
        super().init_attr_values()
        self.selected_vars = []

    @Inputs.projection_input
    def set_projection(self, projection):
        """Handle the "Projection" input.

        A projection of length below two is rejected with a warning. A
        usable projection switches the placement to ``Projection``.
        """
        self.Warning.not_enough_comp.clear()
        if projection and len(projection) < 2:
            self.Warning.not_enough_comp()
            projection = None
        if projection is not None:
            self.placement = self.Placement.Projection
        self.projection_input = projection
        self._check_options()

    def get_embedding(self):
        """Compute the 2D coordinates of the data instances.

        Also sets ``self.valid_data`` and ``self.projection``.

        Returns
        -------
        numpy.ndarray or None
            Array of shape ``(len(self.data), 2)``; rows outside the valid
            mask are left at zero. ``None`` when there is no data, no
            variables, or no valid projection (``Error.no_valid_data`` is
            shown in the latter case).
        """
        self.valid_data = None
        if self.data is None or not self.variables:
            return None

        if self.placement == self.Placement.PCA:
            self.valid_data, ec, self.projection = self._get_pca()
            self.variables = self._pca.orig_domain.attributes
        else:
            self.valid_data, ec, self.projection = \
                self.prepare_projection_data(self.variables)

        self.Error.no_valid_data.clear()
        if self.valid_data is None or not np.any(self.valid_data) or \
                self.projection is None or ec is None:
            self.Error.no_valid_data()
            return None

        # The builtin ``float`` replaces the ``np.float`` alias, which newer
        # NumPy releases no longer provide.
        embedding = np.zeros((len(self.data), 2), dtype=float)
        embedding[self.valid_data] = ec
        return embedding

    def prepare_projection_data(self, variables):
        """Project the data onto the axes of the current placement.

        Parameters
        ----------
        variables : list of variables
            Variables whose columns are projected.

        Returns
        -------
        tuple
            ``(valid_mask, coordinates, axes.T)``, where ``valid_mask``
            marks the instances without NaN in any of the columns and
            ``coordinates`` holds one (x, y) row per valid instance; or
            ``(None, None, None)`` when the axes cannot be obtained.
        """
        def projection(_vars):
            # Take the first two rows of the input projection, matching the
            # variables by identity or, failing that, by name.
            attrs = self.projection_input.domain.attributes
            if set(attrs).issuperset(_vars):
                return self.projection_input[:2, _vars].X
            elif set(f.name for f in attrs).issuperset(f.name for f in _vars):
                return self.projection_input[:2, [f.name for f in _vars]].X
            else:
                self.Error.proj_and_domain_match()
                return None

        def get_axes(_vars):
            # Axes for the current placement; None when not available.
            self.Error.proj_and_domain_match.clear()
            if self.placement == self.Placement.Circular:
                return LinProj.defaultaxes(len(_vars))
            elif self.placement == self.Placement.LDA:
                return self._get_lda(self.data, _vars)
            elif self.placement == self.Placement.Projection and \
                    self.projection_input is not None:
                return projection(_vars)
            else:
                return None

        # np.vstack needs a sequence; passing a generator is deprecated.
        coords = np.vstack(
            [column_data(self.data, v, float) for v in variables])
        axes = get_axes(variables)
        if axes is None:
            return None, None, None

        valid_mask = ~np.isnan(coords).any(axis=0)
        X, Y = np.dot(axes, coords[:, valid_mask])
        if X.size and Y.size:
            X = normalized(X)
            Y = normalized(Y)
        return valid_mask, np.stack((X, Y), axis=1), axes.T

    def get_anchors(self):
        """Return the projection and the names of its variables, or
        ``(None, None)`` when there is no projection."""
        if self.projection is None:
            return None, None
        return self.projection, [v.name for v in self.variables]

    def setup_plot(self):
        """Choose the projection variables, then let the base class set up
        the plot."""
        self.init_projection_variables()
        super().setup_plot()

    def init_projection_variables(self):
        """Set ``self.variables`` according to the current placement.

        Circular/LDA use the selected variables, Projection uses selected
        and other variables, PCA uses all continuous attributes. Without
        data, ``self.variables`` is ``None``.
        """
        self.variables = None
        if self.data is None:
            return

        if self.placement in [self.Placement.Circular, self.Placement.LDA]:
            self.variables = self.model_selected[:]
        elif self.placement == self.Placement.Projection:
            self.variables = self.model_selected[:] + self.model_other[:]
        elif self.placement == self.Placement.PCA:
            self.variables = [var for var in self.data.domain.attributes
                              if var.is_continuous]

    def _get_lda(self, data, variables):
        """Fit LDA on the given variables and return its first two
        scalings, one per row."""
        data = data.transform(Domain(variables, data.domain.class_vars))
        lda = LinearDiscriminantAnalysis(solver='eigen', n_components=2)
        lda.fit(data.X, data.Y)
        scalings = lda.scalings_[:, :2].T
        # A single scaling for a single variable is replaced by a fixed
        # pair of axes so that two rows are always returned.
        if scalings.shape == (1, 1):
            scalings = np.array([[1.], [0.]])
        return scalings

    def _get_pca(self):
        """Fit a two-component PCA (with normalization) on the data.

        Stores the fitted model in ``self._pca`` and warns when the
        cumulative explained variance ratio is not finite.

        Returns
        -------
        tuple
            ``(valid_mask, coords, axes)``: mask of instances without NaN
            coordinates, the transformed data, and the scaled components.
        """
        pca_projector = PCA(n_components=2)
        pca_projector.component = 2
        pca_projector.preprocessors = PCA.preprocessors + [Normalize()]

        pca = pca_projector(self.data)
        variance_ratio = pca.explained_variance_ratio_
        cumulative = np.cumsum(variance_ratio)

        self._pca = pca
        if not np.isfinite(cumulative[-1]):
            self.Warning.trivial_components()

        coords = pca(self.data).X
        valid_mask = ~np.isnan(coords).any(axis=1)
        # scale axes
        max_radius = np.min([np.abs(np.min(coords, axis=0)),
                             np.max(coords, axis=0)])
        axes = pca.components_.T.copy()
        axes *= max_radius / np.max(np.linalg.norm(axes, axis=1))
        return valid_mask, coords, axes

    def send_components(self):
        """Send the projection components to the output.

        For Circular, LDA and PCA a table is built with one row per
        component and a ``component`` meta holding its label; for the
        Projection placement the input projection is passed through.
        ``None`` is sent when there is no data or projection.
        """
        components = None
        if self.data is not None and self.valid_data is not None and \
                self.projection is not None:
            if self.placement in [self.Placement.Circular, self.Placement.LDA]:
                axes = self.projection
                attrs = self.model_selected
            elif self.placement == self.Placement.PCA:
                axes = self._pca.components_.T
                attrs = self._pca.orig_domain.attributes
            if self.placement != self.Placement.Projection:
                meta_attrs = [StringVariable(name='component')]
                # Labels such as "PC1", "PC2", as a single meta column.
                metas = np.array(
                    [["{}{}".format(self.Component_name[self.placement], i + 1)
                      for i in range(axes.shape[1])]], dtype=object).T
                components = Table(Domain(attrs, metas=meta_attrs),
                                   axes.T, metas=metas)
                components.name = self.data.name
            else:
                components = self.projection_input
        self.Outputs.components.send(components)

    def _get_projection_variables(self):
        """Name the embedding variables after the current placement
        (e.g. ``pca-x``, ``pca-y``) and defer to the base class."""
        pn = self.Variable_name[self.placement]
        self.embedding_variables_names = ("{}-x".format(pn), "{}-y".format(pn))
        return super()._get_projection_variables()

    def _get_send_report_caption(self):
        """Return the rendered report caption listing the projection and
        the visual settings."""
        def projection_name():
            return self.Projection_name[self.placement]

        return report.render_items_vert((
            ("Projection", projection_name()),
            ("Color", self._get_caption_var_name(self.attr_color)),
            ("Label", self._get_caption_var_name(self.attr_label)),
            ("Shape", self._get_caption_var_name(self.attr_shape)),
            ("Size", self._get_caption_var_name(self.attr_size)),
            ("Jittering", self.graph.jitter_size != 0 and
             "{} %".format(self.graph.jitter_size))))

    def clear(self):
        """Forget the projection variables and empty both variable
        models before the base class clears the rest."""
        self.variables = None
        if self.model_selected:
            self.model_selected.clear()
        if self.model_other:
            self.model_other.clear()
        super().clear()

    @classmethod
    def migrate_settings(cls, settings_, version):
        """Upgrade stored settings, in place, to the current layout.

        Parameters
        ----------
        settings_ : dict
            Stored settings; modified in place.
        version : int
            Version of the stored settings.
        """
        if version < 2:
            settings_["point_width"] = settings_["point_size"]
        if version < 3:
            # Graph related settings move into the "graph" sub-dictionary.
            settings_graph = {}
            settings_graph["jitter_size"] = settings_["jitter_value"]
            settings_graph["point_width"] = settings_["point_width"]
            settings_graph["alpha_value"] = settings_["alpha_value"]
            settings_graph["class_density"] = settings_["class_density"]
            settings_["graph"] = settings_graph
        if version < 4:
            if "radius" in settings_:
                settings_["graph"]["hide_radius"] = settings_["radius"]
            if "selection_indices" in settings_ and \
                    settings_["selection_indices"] is not None:
                # The per-instance selection flags become a list of
                # (index, 1) pairs for the flagged instances.
                selection = settings_["selection_indices"]
                settings_["selection"] = [(i, 1) for i, selected in
                                          enumerate(selection) if selected]

    @classmethod
    def migrate_context(cls, context, version):
        """Upgrade a stored context, in place, to the current layout.

        Parameters
        ----------
        context
            Stored context; its ``values`` are modified in place.
        version : int
            Version of the stored context.
        """
        if version < 2:
            # Index based settings are replaced by (name, type + 100)
            # pairs looked up in the matching part of the ordered domain;
            # an index outside that list yields None.
            domain = context.ordered_domain
            c_domain = [t for t in context.ordered_domain if t[1] == 2]
            d_domain = [t for t in context.ordered_domain if t[1] == 1]
            for d, old_val, new_val in ((domain, "color_index", "attr_color"),
                                        (d_domain, "shape_index", "attr_shape"),
                                        (c_domain, "size_index", "attr_size")):
                index = context.values[old_val][0] - 1
                context.values[new_val] = (d[index][0], d[index][1] + 100) \
                    if 0 <= index < len(d) else None
        if version < 3:
            context.values["graph"] = {
                "attr_color": context.values["attr_color"],
                "attr_shape": context.values["attr_shape"],
                "attr_size": context.values["attr_size"]
            }
        if version == 3:
            # Version 3 kept these values under "graph"; copy them back to
            # the top level.
            values = context.values
            values["attr_color"] = values["graph"]["attr_color"]
            values["attr_size"] = values["graph"]["attr_size"]
            values["attr_shape"] = values["graph"]["attr_shape"]
            values["attr_label"] = values["graph"]["attr_label"]
Пример #16
0
class Randomize(Preprocess):
    """
    Construct a preprocessor for randomization of classes,
    attributes or metas.
    Given a data table, preprocessor returns a new table in
    which the data is shuffled.

    Parameters
    ----------

    rand_type : Randomize.Type (default: Randomize.RandomizeClasses)
        Randomization type. If Randomize.RandomizeClasses, classes
        are shuffled.
        If Randomize.RandomizeAttributes, attributes are shuffled.
        If Randomize.RandomizeMetas, metas are shuffled.

    rand_seed : int (optional)
        Random seed

    Examples
    --------
    >>> from Orange.data import Table
    >>> from Orange.preprocess import Randomize
    >>> data = Table("iris")
    >>> randomizer = Randomize(Randomize.RandomizeClasses)
    >>> randomized_data = randomizer(data)
    """
    Type = Enum("Randomize",
                "RandomizeClasses, RandomizeAttributes, RandomizeMetas")
    # Re-export the members as class attributes, in definition order.
    RandomizeClasses, RandomizeAttributes, RandomizeMetas = Type

    def __init__(self, rand_type=RandomizeClasses, rand_seed=None):
        self.rand_type = rand_type
        self.rand_seed = rand_seed

    def __call__(self, data):
        """
        Apply randomization of the given data. Returns a new
        data table.

        Parameters
        ----------
        data : Orange.data.Table
            A data table to be randomized.

        Returns
        -------
        data : Orange.data.Table
            Randomized data table.

        Raises
        ------
        TypeError
            If ``rand_type`` is not one of the ``Randomize.Type`` members.
        """
        new_data = Table(data)
        new_data.ensure_copy()

        if self.rand_type == Randomize.RandomizeClasses:
            self.randomize(new_data.Y)
        elif self.rand_type == Randomize.RandomizeAttributes:
            self.randomize(new_data.X)
        elif self.rand_type == Randomize.RandomizeMetas:
            self.randomize(new_data.metas)
        else:
            raise TypeError('Unsupported type')

        return new_data

    def randomize(self, table):
        """
        Shuffle the given array in place.

        A one-dimensional array is shuffled as a whole; for an array with
        more dimensions each column is shuffled independently of the
        others.

        Parameters
        ----------
        table : numpy.ndarray
            Array to be shuffled; modified in place.
        """
        # A local generator seeded with ``rand_seed`` gives the same
        # shuffles as reseeding the global one, without changing the
        # process-wide NumPy random state as a side effect.
        rng = np.random.RandomState(self.rand_seed)
        if len(table.shape) > 1:
            for i in range(table.shape[1]):
                rng.shuffle(table[:, i])
        else:
            rng.shuffle(table)