Example #1
 def compute_box_data(self):
     attr = self.attribute
     if not attr:
         return
     dataset = self.dataset
     if dataset is None:
         self.stats = self.dist = self.conts = []
         return
     self.is_continuous = attr.is_continuous
     if self.group_var:
         self.dist = []
         self.conts = datacaching.getCached(
             dataset, contingency.get_contingency,
             (dataset, attr, self.group_var))
         if self.is_continuous:
             self.stats = [BoxData(cont) for cont in self.conts]
         self.label_txts_all = self.group_var.values
     else:
         self.dist = datacaching.getCached(
             dataset, distribution.get_distribution, (dataset, attr))
         self.conts = []
         if self.is_continuous:
             self.stats = [BoxData(self.dist)]
         self.label_txts_all = [""]
     self.label_txts = [txts for stat, txts in zip(self.stats,
                                                   self.label_txts_all)
                        if stat.n > 0]
     self.stats = [stat for stat in self.stats if stat.n > 0]
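
All of these snippets lean on the same memoization pattern: datacaching.getCached(owner, fn, args) computes fn(*args) at most once per owner object and returns the stored result on every later call, so widgets looking at the same dataset share one distribution or contingency table. A minimal sketch of that idea, assuming a weak-reference cache keyed on the owner; Orange's real datacaching module may differ in detail:

    import weakref

    # Simplified sketch of the datacaching idea, not Orange's implementation.
    # owner object -> {cache key: cached value}; entries disappear when the
    # owner (e.g. a data table) is garbage collected.
    _cache = weakref.WeakKeyDictionary()

    def getCached(owner, what, args=()):
        store = _cache.setdefault(owner, {})
        if not callable(what):        # e.g. getCached(data, "visualizationData")
            return store.get(what)
        key = (what, args)
        if key not in store:          # compute once, reuse afterwards
            store[key] = what(*args)
        return store[key]

    def setCached(owner, key, value):
        _cache.setdefault(owner, {})[key] = value

With this in place, a call like datacaching.getCached(dataset, distribution.get_distribution, (dataset, attr)) computes the distribution at most once per dataset.
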
Example #2
 def compute_box_data(self):
     dataset = self.dataset
     if dataset is None:
         self.stats = self.dist = self.conts = []
         return
     attr = self.attributes[self.attributes_select[0]][0]
     attr = dataset.domain[attr]
     self.is_continuous = attr.is_continuous
     group_by = self.grouping_select[0]
     if group_by:
         group = self.grouping[group_by][0]
         self.dist = []
         self.conts = datacaching.getCached(dataset,
                                            contingency.get_contingency,
                                            (dataset, attr, group))
         if self.is_continuous:
             self.stats = [BoxData(cont) for cont in self.conts]
         self.label_txts_all = dataset.domain[group].values
     else:
         self.dist = datacaching.getCached(dataset,
                                           distribution.get_distribution,
                                           (dataset, attr))
         self.conts = []
         if self.is_continuous:
             self.stats = [BoxData(self.dist)]
         self.label_txts_all = [""]
     self.label_txts = [
         txts for stat, txts in zip(self.stats, self.label_txts_all)
         if stat.N > 0
     ]
     self.stats = [stat for stat in self.stats if stat.N > 0]
Example #3
 def compute_box_data(self):
     dataset = self.dataset
     if dataset is None:
         self.stats = self.dist = self.conts = []
         return
     attr = self.attributes[self.attributes_select[0]][0]
     attr = dataset.domain[attr]
     self.is_continuous = attr.is_continuous
     group_by = self.grouping_select[0]
     if group_by:
         group = self.grouping[group_by][0]
         self.dist = []
         self.conts = datacaching.getCached(
             dataset, contingency.get_contingency,
             (dataset, attr, group))
         if self.is_continuous:
             self.stats = [BoxData(cont) for cont in self.conts]
         self.label_txts_all = dataset.domain[group].values
     else:
         self.dist = datacaching.getCached(
             dataset, distribution.get_distribution, (dataset, attr))
         self.conts = []
         if self.is_continuous:
             self.stats = [BoxData(self.dist)]
         self.label_txts_all = [""]
     self.label_txts = [txts for stat, txts in zip(self.stats,
                                                   self.label_txts_all)
                        if stat.N > 0]
     self.stats = [stat for stat in self.stats if stat.N > 0]
Example #4
 def compute_box_data(self):
     attr = self.attribute
     if not attr:
         return
     dataset = self.dataset
     if dataset is None:
         self.stats = self.dist = self.conts = []
         return
     self.is_continuous = attr.is_continuous
     if self.group_var:
         self.dist = []
         self.conts = datacaching.getCached(dataset,
                                            contingency.get_contingency,
                                            (dataset, attr, self.group_var))
         if self.is_continuous:
             self.stats = [BoxData(cont) for cont in self.conts]
         self.label_txts_all = self.group_var.values
     else:
         self.dist = datacaching.getCached(dataset,
                                           distribution.get_distribution,
                                           (dataset, attr))
         self.conts = []
         if self.is_continuous:
             self.stats = [BoxData(self.dist)]
         self.label_txts_all = [""]
     self.label_txts = [
         txts for stat, txts in zip(self.stats, self.label_txts_all)
         if stat.n > 0
     ]
     self.stats = [stat for stat in self.stats if stat.n > 0]
Example #5
 def compute_box_data(self):
     dataset = self.ddataset
     if dataset is None:
         self.stats = self.dist = self.conts = []
         return
     attr_ind = self.attributes_select[0]
     attr = dataset.domain[attr_ind]
     self.is_continuous = isinstance(attr, ContinuousVariable)
     group_by = self.grouping_select[0]
     if group_by:
         group_attr = self.grouping[group_by][0]
         group_ind = dataset.domain.index(group_attr)
         self.dist = []
         self.conts = datacaching.getCached(
             dataset, contingency.get_contingency,
             (dataset, attr_ind, group_ind))
         if self.is_continuous:
             self.stats = [BoxData(cont) for cont in self.conts]
         self.label_txts = dataset.domain[group_ind].values
     else:
         self.dist = datacaching.getCached(
             dataset, distribution.get_distribution, (dataset, attr_ind))
         self.conts = []
         if self.is_continuous:
             self.stats = [BoxData(self.dist)]
         self.label_txts = [""]
     self.stats = [stat for stat in self.stats if stat.N > 0]
Example #6
    def _compute_scaled_data(self):
        data = self.data
        # We cache scaled_data and validArray to share them between widgets
        cached = getCached(data, "visualizationData")
        if cached:
            self.original_data, self.scaled_data, self.valid_data_array = cached
            return

        Y = data.Y if data.Y.ndim == 2 else np.atleast_2d(data.Y).T
        self.original_data = np.hstack((data.X, Y)).T
        self.scaled_data = no_jit = self.original_data.copy()
        self.valid_data_array = ~np.isnan(no_jit)
        for index in range(len(data.domain)):
            attr = data.domain[index]
            if attr.is_discrete:
                no_jit[index] *= 2
                no_jit[index] += 1
                no_jit[index] /= 2 * len(attr.values)
            else:
                dstat = self.domain_data_stat[index]
                no_jit[index] -= dstat.min
                if dstat.max != dstat.min:
                    no_jit[index] /= dstat.max - dstat.min
        setCached(
            data, "visualizationData",
            (self.original_data, self.scaled_data, self.valid_data_array))
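
Example #6 maps every column into [0, 1]: a discrete value v out of n possible values becomes (2v + 1) / (2n), the midpoint of its bin, while continuous columns are min-max normalised against the cached stats. The same arithmetic in isolation (the helper names are illustrative, not Orange's API):

    import numpy as np

    def scale_discrete(column, n_values):
        # v -> (2v + 1) / (2n): value indices land on evenly spaced bin midpoints
        return (2 * column + 1) / (2 * n_values)

    def scale_continuous(column):
        # min-max normalisation; constant columns map to zeros instead of
        # dividing by zero
        lo, hi = np.nanmin(column), np.nanmax(column)
        shifted = column - lo
        return shifted / (hi - lo) if hi != lo else shifted

    print(scale_discrete(np.array([0., 1., 2.]), 3))    # ~[0.167 0.5 0.833]
    print(scale_continuous(np.array([10., 15., 20.])))  # [0.  0.5 1. ]
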
Example #7
    def _compute_scaled_data(self):
        data = self.data
        # We cache scaled_data and validArray to share them between widgets
        cached = getCached(data, "visualizationData")
        if cached:
            self.data, self.scaled_data, self.valid_data_array = cached
            return

        Y = data.Y if data.Y.ndim == 2 else np.atleast_2d(data.Y).T
        all_data = np.hstack((data.X, Y, data.metas)).T
        self.scaled_data = self.data.copy()
        self.valid_data_array = np.isfinite(all_data)
        domain = self.domain
        for attr in chain(domain.attributes, domain.class_vars, domain.metas):
            c = self.scaled_data.get_column_view(attr)[0]
            if attr.is_discrete:
                c += 0.5
                c /= len(attr.values)
            else:
                dstat = self.domain_data_stat[attr]
                c -= dstat.min
                if dstat.max != dstat.min:
                    c /= dstat.max - dstat.min
        setCached(data, "visualizationData",
                  (self.data, self.scaled_data, self.valid_data_array))
Example #8
    def __compute_density(self, data):
        def desc(part, frm, to):
            nans = sum(dist[i].nans for i in range(frm, to))
            non_nans = sum(dist[i].non_nans for i in range(frm, to))
            tot = nans + non_nans
            if tot == 0:
                return ""
            density = getattr(data, part + "_density")()
            if density == Storage.DENSE:
                dp = "%.1f%%" % (100 * nans / tot) if nans > 0 else "no"
                return " (%s missing values)" % dp
            s = " (sparse" if density == Storage.SPARSE else " (tags"
            return s + ", density %.2f %%)" % (100 * non_nans / tot)

        dist = datacaching.getCached(data, basic_stats.DomainBasicStats,
                                     (data, True))
        domain = data.domain
        descriptions = [
            desc(part, frm, to)
            for part, frm, to in [
                ("X", 0, len(domain.attributes)),
                ("Y", len(domain.attributes), len(domain)),
                ("metas", len(domain), len(domain) + len(domain.metas))]
        ]
        if all(not d or d == " (no missing values)" for d in descriptions):
            descriptions = self.__no_missing
        return descriptions
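
The nested desc helper turns per-part nan counts into the short suffix shown in the widget's status area. For dense storage it is just a percentage of missing values; roughly (the counts here are made up for illustration):

    nans, non_nans = 3, 21
    tot = nans + non_nans
    suffix = " (%s missing values)" % (
        "%.1f%%" % (100 * nans / tot) if nans > 0 else "no")
    print(suffix)   # " (12.5% missing values)"
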
Example #9
    def _compute_scaled_data(self):
        data = self.data
        # We cache scaled_data and validArray to share them between widgets
        cached = getCached(data, "visualizationData")
        if cached:
            self.original_data, self.scaled_data, self.valid_data_array = cached
            return

        Y = data.Y if data.Y.ndim == 2 else np.atleast_2d(data.Y).T
        self.original_data = np.hstack((data.X, Y)).T
        self.scaled_data = no_jit = self.original_data.copy()
        self.valid_data_array = ~np.isnan(no_jit)
        for index in range(len(data.domain)):
            attr = data.domain[index]
            if attr.is_discrete:
                no_jit[index] *= 2
                no_jit[index] += 1
                no_jit[index] /= 2 * len(attr.values)
            else:
                dstat = self.domain_data_stat[index]
                no_jit[index] -= dstat.min
                if dstat.max != dstat.min:
                    no_jit[index] /= dstat.max - dstat.min
        setCached(data, "visualizationData",
                  (self.original_data, self.scaled_data, self.valid_data_array))
Example #10
    def data(self, index, role):
        row, col = self.sorted_map[index.row()], index.column()
        example = self.examples[row]

        if role == gui.TableClassValueRole:
            return example.get_class()

        # check whether we have a sparse columns,
        # handle background color role while you are at it
        sp_data = attributes = None
        if col < self.n_attr_cols:
            if role == QtCore.Qt.BackgroundRole:
                return
            density = self.X_density
            if density != Storage.DENSE:
                sp_data, attributes = example.sparse_x, self.domain.attributes
        elif col < self.n_attr_class_cols:
            if role == QtCore.Qt.BackgroundRole:
                return self.cls_color
            density = self.Y_density
            if density != Storage.DENSE:
                sp_data, attributes = example.sparse_y, self.domain.class_vars
        else:
            if role == QtCore.Qt.BackgroundRole:
                return self.meta_color
            density = self.metas_density
            if density != Storage.DENSE:
                sp_data, attributes = \
                    example.sparse_metas, self.domain.metas

        if sp_data is not None:
            if role == QtCore.Qt.DisplayRole:
                if density == Storage.SPARSE:
                    return ", ".join(
                        "{}={}".format(attributes[i].name,
                                       attributes[i].repr_val(v))
                        for i, v in zip(sp_data.indices, sp_data.data))
                else:
                    return ", ".join(attributes[i].name
                                     for i in sp_data.indices)

        else:  # not sparse
            attr = self.all_attrs[col]
            val = example[attr]
            if role == QtCore.Qt.DisplayRole:
                return str(val)
            elif (role == gui.TableBarItem.BarRole
                  and isinstance(attr, ContinuousVariable) and not isnan(val)):
                if self.dist is None:
                    self.dist = datacaching.getCached(
                        self.examples, basic_stats.DomainBasicStats,
                        (self.examples, True))
                dist = self.dist[col]
                return (val - dist.min) / (dist.max - dist.min or 1)
            elif role == gui.TableValueRole:
                return val
            elif role == gui.TableVariable:
                return val.variable

        return self._other_data.get((index.row(), index.column(), role), None)
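
In the BarRole branch above, the cell value is normalised against the column's cached min and max; the expression dist.max - dist.min or 1 substitutes 1 for a zero denominator, so a constant column yields 0 instead of raising ZeroDivisionError. The guard in isolation (bar_ratio is an illustrative helper):

    def bar_ratio(val, col_min, col_max):
        # `or 1` kicks in only when max == min, i.e. a constant column
        return (val - col_min) / (col_max - col_min or 1)

    print(bar_ratio(7.5, 5.0, 10.0))   # 0.5
    print(bar_ratio(5.0, 5.0, 5.0))    # 0.0, no division by zero
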
Example #11
    def __compute_density(self, data):
        def desc(part, frm, to):
            nans = sum(dist[i].nans for i in range(frm, to))
            non_nans = sum(dist[i].non_nans for i in range(frm, to))
            tot = nans + non_nans
            if tot == 0:
                return ""
            density = getattr(data, part + "_density")()
            if density == Storage.DENSE:
                dp = "%.1f%%" % (100 * nans / tot) if nans > 0 else "no"
                return " (%s missing values)" % dp
            s = " (sparse" if density == Storage.SPARSE else " (tags"
            return s + ", density %.2f %%)" % (100 * non_nans / tot)

        dist = datacaching.getCached(data,
                                     basic_stats.DomainBasicStats, (data, True))
        domain = data.domain
        descriptions = [desc(part, frm, to)
                        for part, frm, to in [
                            ("X", 0, len(domain.attributes)),
                            ("Y", len(domain.attributes), len(domain)),
                            ("metas", len(domain),
                             len(domain) + len(domain.metas))]]
        if all(not d or d == " (no missing values)" for d in descriptions):
            descriptions = self.__no_missing
        return descriptions
Example #12
    def _compute_scaled_data(self):
        data = self.data
        # We cache scaled_data and validArray to share them between widgets
        cached = getCached(data, "visualizationData")
        if cached:
            self.data, self.scaled_data, self.valid_data_array = cached
            return

        Y = data.Y if data.Y.ndim == 2 else np.atleast_2d(data.Y).T
        all_data = np.hstack((data.X, Y, data.metas)).T
        self.scaled_data = self.data.copy()
        self.valid_data_array = np.isfinite(all_data)
        domain = self.domain
        for attr in chain(domain.attributes, domain.class_vars, domain.metas):
            c = self.scaled_data.get_column_view(attr)[0]
            if attr.is_discrete:
                c += 0.5
                c /= len(attr.values)
            else:
                dstat = self.domain_data_stat[attr]
                c -= dstat.min
                if dstat.max != dstat.min:
                    c /= dstat.max - dstat.min
        setCached(data, "visualizationData",
                  (self.data, self.scaled_data, self.valid_data_array))
Example #13
 def _compute_domain_data_stat(self):
     stt = self.domain_data_stat = \
         getCached(self.data, DomainBasicStats, (self.data,))
     for index in range(len(self.domain)):
         attr = self.domain[index]
         if attr.is_discrete:
             self.attr_values[attr] = [0, len(attr.values)]
         elif attr.is_continuous:
             self.attr_values[attr] = [stt[index].min, stt[index].max]
Example #14
 def _compute_domain_data_stat(self):
     stt = self.domain_data_stat = \
         getCached(self.data, DomainBasicStats, (self.data, True))
     domain = self.domain
     for attr in chain(domain.variables, domain.metas):
         if attr.is_discrete:
             self.attr_values[attr] = [0, len(attr.values)]
         elif attr.is_continuous:
             self.attr_values[attr] = [stt[attr].min, stt[attr].max]
Example #15
 def _compute_domain_data_stat(self):
     stt = self.domain_data_stat = \
         getCached(self.data, DomainBasicStats, (self.data, True))
     domain = self.domain
     for attr in chain(domain.variables, domain.metas):
         if attr.is_discrete:
             self.attr_values[attr] = [0, len(attr.values)]
         elif attr.is_continuous:
             self.attr_values[attr] = [stt[attr].min, stt[attr].max]
Example #16
 def _compute_domain_data_stat(self):
     stt = self.domain_data_stat = \
         getCached(self.data, DomainBasicStats, (self.data,))
     for index in range(len(self.domain)):
         attr = self.domain[index]
         if attr.is_discrete:
             self.attr_values[attr] = [0, len(attr.values)]
         elif attr.is_continuous:
             self.attr_values[attr] = [stt[index].min, stt[index].max]
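
All four variants of _compute_domain_data_stat record, per variable, the range later used for scaling: [0, n] for a discrete variable with n values and [min, max] for a continuous one. A self-contained run of the same loop, with hypothetical stand-ins for the variables and their stats:

    from types import SimpleNamespace

    # hypothetical stand-ins for Orange variables and their basic stats
    variables = [
        SimpleNamespace(name="color", is_discrete=True, is_continuous=False,
                        values=("red", "green", "blue")),
        SimpleNamespace(name="size", is_discrete=False, is_continuous=True),
    ]
    stats = {"size": SimpleNamespace(min=1.5, max=9.0)}

    attr_values = {}
    for attr in variables:
        if attr.is_discrete:
            attr_values[attr.name] = [0, len(attr.values)]
        elif attr.is_continuous:
            attr_values[attr.name] = [stats[attr.name].min, stats[attr.name].max]

    print(attr_values)   # {'color': [0, 3], 'size': [1.5, 9.0]}
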
Example #17
    def set_info(self, data):
        """Updates data info."""
        def sp(n):
            if n == 0:
                return "No", "s"
            elif n == 1:
                return str(n), ''
            else:
                return str(n), 's'

        if data is None:
            self.info_ex.setText('No data on input.')
            self.info_attr.setText('')
            self.info_meta.setText('')
            self.info_class.setText('')
        else:
            if isinstance(data, SqlTable):
                descriptions = ['', '', '']
            else:
                descriptions = datacaching.getCached(data,
                                                     self.__compute_density,
                                                     (data, ))
            out_i = "~%s instance%s" % sp(data.approx_len())
            if descriptions is self.__no_missing:
                out_i += " (no missing values)"
            self.info_ex.setText(out_i)

            def update_num_inst():
                out_i = "%s instance%s" % sp(len(data))
                if descriptions is self.__no_missing:
                    out_i += " (no missing values)"
                self.info_ex.setText(out_i)

            threading.Thread(target=update_num_inst).start()

            self.info_attr.setText("%s feature%s" %
                                   sp(len(data.domain.attributes)) +
                                   descriptions[0])

            self.info_meta.setText("%s meta attribute%s" %
                                   sp(len(data.domain.metas)) +
                                   descriptions[2])

            if not data.domain.class_vars:
                out_c = 'No target variable.'
            else:
                if len(data.domain.class_vars) > 1:
                    out_c = "%s outcome%s" % sp(len(data.domain.class_vars))
                elif isinstance(data.domain.class_var, ContinuousVariable):
                    out_c = 'Continuous target variable'
                else:
                    out_c = 'Discrete class with %s value%s' % sp(
                        len(data.domain.class_var.values))
                out_c += descriptions[1]
            self.info_class.setText(out_c)
Example #18
    def set_info(self, data):
        """Updates data info."""
        def sp(n):
            if n == 0:
                return "No", "s"
            elif n == 1:
                return str(n), ''
            else:
                return str(n), 's'

        if data is None:
            self.info_ex.setText('No data on input.')
            self.info_attr.setText('')
            self.info_meta.setText('')
            self.info_class.setText('')
        else:
            if isinstance(data, SqlTable):
                descriptions = ['', '', '']
            else:
                descriptions = datacaching.getCached(
                    data, self.__compute_density, (data, ))
            out_i = "~%s instance%s" % sp(data.approx_len())
            if descriptions is self.__no_missing:
                out_i += " (no missing values)"
            self.info_ex.setText(out_i)

            def update_num_inst():
                out_i = "%s instance%s" % sp(len(data))
                if descriptions is self.__no_missing:
                    out_i += " (no missing values)"
                self.info_ex.setText(out_i)

            threading.Thread(target=update_num_inst).start()

            self.info_attr.setText("%s feature%s" %
                                   sp(len(data.domain.attributes)) +
                                   descriptions[0])

            self.info_meta.setText("%s meta attribute%s" %
                                   sp(len(data.domain.metas)) + descriptions[2])

            if not data.domain.class_vars:
                out_c = 'No target variable.'
            else:
                if len(data.domain.class_vars) > 1:
                    out_c = "%s outcome%s" % sp(len(data.domain.class_vars))
                elif isinstance(data.domain.class_var, ContinuousVariable):
                    out_c = 'Continuous target variable'
                else:
                    out_c = 'Discrete class with %s value%s' % sp(
                        len(data.domain.class_var.values))
                out_c += descriptions[1]
            self.info_class.setText(out_c)
Example #19
def table_summary(table):
    if isinstance(table, SqlTable):
        approx_len = table.approx_len()
        len_future = concurrent.futures.Future()

        def _len():
            len_future.set_result(len(table))

        threading.Thread(target=_len).start()  # KILL ME !!!

        return ApproxSummary(
            approx_len,
            len_future,
            table.domain,
            NotAvailable(),
            NotAvailable(),
            NotAvailable(),
        )
    else:
        domain = table.domain
        n_instances = len(table)
        # dist = basic_stats.DomainBasicStats(table, include_metas=True)
        bstats = datacaching.getCached(table, basic_stats.DomainBasicStats,
                                       (table, True))

        dist = bstats.stats
        X_dist, Y_dist, M_dist = numpy.split(
            dist,
            numpy.cumsum([len(domain.attributes),
                          len(domain.class_vars)]))

        def parts(array, density, col_dist):
            array = numpy.atleast_2d(array)
            nans = sum([dist.nans for dist in col_dist])
            non_nans = sum([dist.non_nans for dist in col_dist])
            if density == Storage.DENSE:
                return DenseArray(nans, non_nans, col_dist)
            elif density == Storage.SPARSE:
                return SparseArray(nans, non_nans, col_dist)
            elif density == Storage.SPARSE_BOOL:
                return SparseBoolArray(nans, non_nans, col_dist)
            elif density == Storage.MISSING:
                return NotAvailable()
            else:
                assert False

        X_part = parts(table.X, table.X_density(), X_dist)
        Y_part = parts(table.Y, table.Y_density(), Y_dist)
        M_part = parts(table.metas, table.metas_density(), M_dist)
        return Summary(n_instances, domain, X_part, Y_part, M_part)
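
For SqlTable inputs, len(table) may require a full COUNT query, so table_summary returns the approximate length immediately and resolves the exact one through a Future filled in by a background thread. The bare pattern, with a plain sequence standing in for the table:

    import concurrent.futures
    import threading

    def deferred_len(items):
        future = concurrent.futures.Future()

        def _len():
            future.set_result(len(items))   # stand-in for a slow COUNT query

        threading.Thread(target=_len).start()
        return future

    future = deferred_len(range(10))
    print(future.result())                  # blocks until the thread finishes: 10
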
Example #20
    def _stats_for_column(self, column):
        """
        Return BasicStats for `column` index.
        """
        coldesc = self.columns[column]
        if isinstance(coldesc, TableModel.Basket):
            return None

        if self.__stats is None:
            self.__stats = datacaching.getCached(self.source,
                                                 basic_stats.DomainBasicStats,
                                                 (self.source, True))

        return self.__stats[coldesc.var]
Example #21
    def _stats_for_column(self, column):
        """
        Return BasicStats for `column` index.
        """
        coldesc = self.columns[column]
        if isinstance(coldesc, TableModel.Basket):
            return None

        if self.__stats is None:
            self.__stats = datacaching.getCached(
                self.source, basic_stats.DomainBasicStats,
                (self.source, True)
            )

        return self.__stats[coldesc.var]
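
Examples #20 and #21 stack two caches: the model lazily fills its private __stats on first access, and getCached shares the underlying DomainBasicStats with any other widget holding the same table. The lazy-initialisation half on its own (expensive_stats is a placeholder for the real computation):

    class Model:
        def __init__(self, source):
            self.source = source
            self.__stats = None         # filled on first use, then reused

        def stats(self):
            if self.__stats is None:
                self.__stats = expensive_stats(self.source)
            return self.__stats

    def expensive_stats(source):        # placeholder for DomainBasicStats
        print("computing once for", source)
        return {"n": len(source)}

    m = Model([1, 2, 3])
    m.stats()
    m.stats()                           # "computing once" is printed only once
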
Example #22
def table_summary(table):
    if isinstance(table, SqlTable):
        approx_len = table.approx_len()
        len_future = concurrent.futures.Future()

        def _len():
            len_future.set_result(len(table))
        threading.Thread(target=_len).start()  # KILL ME !!!

        return ApproxSummary(approx_len, len_future, table.domain,
                             NotAvailable(), NotAvailable(), NotAvailable())
    else:
        domain = table.domain
        n_instances = len(table)
        # dist = basic_stats.DomainBasicStats(table, include_metas=True)
        bstats = datacaching.getCached(
            table, basic_stats.DomainBasicStats, (table, True)
        )

        dist = bstats.stats
        # pylint: disable=unbalanced-tuple-unpacking
        X_dist, Y_dist, M_dist = numpy.split(
            dist, numpy.cumsum([len(domain.attributes),
                                len(domain.class_vars)]))

        def parts(array, density, col_dist):
            array = numpy.atleast_2d(array)
            nans = sum([dist.nans for dist in col_dist])
            non_nans = sum([dist.non_nans for dist in col_dist])
            if density == Storage.DENSE:
                return DenseArray(nans, non_nans, col_dist)
            elif density == Storage.SPARSE:
                return SparseArray(nans, non_nans, col_dist)
            elif density == Storage.SPARSE_BOOL:
                return SparseBoolArray(nans, non_nans, col_dist)
            elif density == Storage.MISSING:
                return NotAvailable()
            else:
                assert False
                return None

        X_part = parts(table.X, table.X_density(), X_dist)
        Y_part = parts(table.Y, table.Y_density(), Y_dist)
        M_part = parts(table.metas, table.metas_density(), M_dist)
        return Summary(n_instances, domain, X_part, Y_part, M_part)
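
table_summary rebuilds the X, Y and metas blocks from the flat per-column stats list with numpy.split over cumulative section lengths. The same slicing trick on plain numbers:

    import numpy

    stats = numpy.arange(7)    # e.g. 4 attributes, 1 class variable, 2 metas
    X, Y, M = numpy.split(stats, numpy.cumsum([4, 1]))
    print(X, Y, M)             # [0 1 2 3] [4] [5 6]
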
Example #23
    def set_data(self, data, subset_data=None, **args):
        if args.get("skipIfSame", 1):
            if checksum(data) == checksum(self.raw_data) and \
               checksum(subset_data) == checksum(self.raw_subset_data):
                return

        self.domain_data_stat = []
        self.attr_values = {}
        self.original_data = self.original_subset_data = None
        self.scaled_data = self.scaled_subset_data = None
        self.no_jittering_scaled_data = self.no_jittering_scaled_subset_data = None
        self.valid_data_array = self.valid_subset_data_array = None

        self.raw_data = None
        self.raw_subset_data = None
        self.have_data = False
        self.have_subset_data = False
        self.data_has_class = False
        self.data_has_continuous_class = False
        self.data_has_discrete_class = False
        self.data_class_name = None
        self.data_domain = None
        self.data_class_index = None

        if data is None:
            return
        full_data = self.merge_data_sets(data, subset_data)

        self.raw_data = data
        self.raw_subset_data = subset_data

        len_data = data and len(data) or 0

        self.attribute_names = [attr.name for attr in full_data.domain]
        self.attribute_name_index = dict([(full_data.domain[i].name, i)
                                          for i in range(len(full_data.domain))])
        self.attribute_flip_info = {}

        self.data_domain = full_data.domain
        self.data_has_class = bool(full_data.domain.class_var)
        self.data_has_continuous_class = \
            isinstance(full_data.domain.class_var, ContinuousVariable)
        self.data_has_discrete_class = \
            isinstance(full_data.domain.class_var, DiscreteVariable)

        self.data_class_name = self.data_has_class and full_data.domain.class_var.name
        if self.data_has_class:
            self.data_class_index = self.attribute_name_index[self.data_class_name]
        self.have_data = bool(self.raw_data and len(self.raw_data) > 0)
        self.have_subset_data = bool(self.raw_subset_data and
                                     len(self.raw_subset_data) > 0)

        self.domain_data_stat = getCached(full_data,
                                          DomainBasicStats,
                                          (full_data,))

        sort_values_for_discrete_attrs = args.get("sort_values_for_discrete_attrs",
                                                  1)

        for index in range(len(full_data.domain)):
            attr = full_data.domain[index]
            if isinstance(attr, DiscreteVariable):
                self.attr_values[attr.name] = [0, len(attr.values)]
            elif isinstance(attr, ContinuousVariable):
                self.attr_values[attr.name] = [self.domain_data_stat[index].min,
                                               self.domain_data_stat[index].max]

        # the original_data, no_jittering_scaled_data and validArray are arrays
        # that we can cache so that other visualization widgets don't need to
        # compute it. The scaled_data on the other hand has to be computed for
        # each widget separately because of different
        # jitter_continuous and jitter_size values
        if getCached(data, "visualizationData") and subset_data is None:
            self.original_data, self.no_jittering_scaled_data, \
                self.valid_data_array = getCached(data, "visualizationData")
            self.original_subset_data = self.no_jittering_scaled_subset_data = \
                self.valid_subset_data_array = \
                np.array([]).reshape([len(self.original_data), 0])
        else:
            no_jittering_data = np.hstack((full_data.X, full_data.Y)).T
            valid_data_array = ~np.isnan(no_jittering_data)  # True where a value is present
            original_data = no_jittering_data.copy()

            for index in range(len(data.domain)):
                attr = data.domain[index]
                if isinstance(attr, DiscreteVariable):
                    # see if the values for discrete attributes have to be resorted
                    variable_value_indices = get_variable_value_indices(data.domain[index],
                                                                        sort_values_for_discrete_attrs)
                    if 0 in [i == variable_value_indices[attr.values[i]]
                             for i in range(len(attr.values))]:
                        # make the array contiguous, otherwise the putmask
                        # function does not work
                        line = no_jittering_data[index].copy()
                        indices = [np.where(line == val, 1, 0)
                                   for val in range(len(attr.values))]
                        for i in range(len(attr.values)):
                            np.putmask(line, indices[i],
                                       variable_value_indices[attr.values[i]])
                        no_jittering_data[index] = line   # save the changed array
                        original_data[index] = line     # reorder also the values in the original data
                    no_jittering_data[index] = ((no_jittering_data[index] * 2.0 + 1.0)
                                                / float(2 * len(attr.values)))

                elif isinstance(attr, ContinuousVariable):
                    # if all values are the same then prevent division by zero
                    diff = (self.domain_data_stat[index].max -
                            self.domain_data_stat[index].min) or 1
                    no_jittering_data[index] = (no_jittering_data[index] -
                                                self.domain_data_stat[index].min) / diff

            self.original_data = original_data[:, :len_data]
            self.original_subset_data = original_data[:, len_data:]
            self.no_jittering_scaled_data = no_jittering_data[:, :len_data]
            self.no_jittering_scaled_subset_data = no_jittering_data[:, len_data:]
            self.valid_data_array = valid_data_array[:, :len_data]
            self.valid_subset_data_array = valid_data_array[:, len_data:]

        if data:
            setCached(data, "visualizationData",
                      (self.original_data, self.no_jittering_scaled_data,
                       self.valid_data_array))
        if subset_data:
            setCached(subset_data, "visualizationData",
                      (self.original_subset_data,
                       self.no_jittering_scaled_subset_data,
                       self.valid_subset_data_array))

        # compute the scaled_data arrays
        scaled_data = np.concatenate([self.no_jittering_scaled_data,
                                      self.no_jittering_scaled_subset_data],
                                     axis=1)

        # Random generators for jittering
        random = np.random.RandomState(seed=self.jitter_seed)
        rand_seeds = random.random_integers(0, sys.maxsize - 1, size=len(data.domain))
        for index, rseed in zip(list(range(len(data.domain))), rand_seeds):
            # Need to use a different seed for each feature
            random = np.random.RandomState(seed=rseed)
            attr = data.domain[index]
            if isinstance(attr, DiscreteVariable):
                scaled_data[index] += (self.jitter_size / (50.0 * max(1, len(attr.values)))) * \
                                      (random.rand(len(full_data)) - 0.5)

            elif isinstance(attr, ContinuousVariable) and self.jitter_continuous:
                scaled_data[index] += self.jitter_size / 50.0 * (0.5 - random.rand(len(full_data)))
                scaled_data[index] = np.absolute(scaled_data[index])       # fix values below zero
                ind = np.where(scaled_data[index] > 1.0, 1, 0)     # fix values above 1
                np.putmask(scaled_data[index], ind, 2.0 - np.compress(ind, scaled_data[index]))

        if self.have_subset_data:
            # Fix all subset instances which are also in the main data
            # to have the same jittered values
            ids_to_indices = dict((inst.id, i)
                                  for i, inst in enumerate(self.raw_data))

            subset_ids_map = [[i, ids_to_indices[s.id]]
                              for i, s in enumerate(self.raw_subset_data)
                              if s.id in ids_to_indices]
            if len(subset_ids_map):
                subset_ids_map = np.array(subset_ids_map)
                subset_ids_map[:, 0] += len_data
                scaled_data[:, subset_ids_map[:, 0]] = \
                    scaled_data[:, subset_ids_map[:, 1]]

        self.scaled_data = scaled_data[:, :len_data]
        self.scaled_subset_data = scaled_data[:, len_data:]
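
When a discrete variable's display order differs from its storage order, Examples #23 and #25 remap the whole column in place with np.putmask, one value at a time. A compact equivalent of that remapping via fancy indexing (the arrays are illustrative):

    import numpy as np

    line = np.array([0., 2., 1., 0.])   # stored value indices
    new_order = np.array([1, 2, 0])     # storage index -> display index
    print(new_order[line.astype(int)])  # [1 0 2 1]
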
Example #24
    def data(self, index, role):
        row, col = self.sorted_map[index.row()], index.column()
        example = self.examples[row]

        if role == gui.TableClassValueRole:
            return example.get_class()

        # check whether we have a sparse columns,
        # handle background color role while you are at it
        sp_data = attributes = None
        if col < self.n_attr_cols:
            if role == QtCore.Qt.BackgroundRole:
                return
            density = self.X_density
            if density != Storage.DENSE:
                sp_data, attributes = example.sparse_x, self.domain.attributes
        elif col < self.n_attr_class_cols:
            if role == QtCore.Qt.BackgroundRole:
                return self.cls_color
            density = self.Y_density
            if density != Storage.DENSE:
                sp_data, attributes = example.sparse_y, self.domain.class_vars
        else:
            if role == QtCore.Qt.BackgroundRole:
                return self.meta_color
            density = self.metas_density
            if density != Storage.DENSE:
                sp_data, attributes = \
                    example.sparse_metas, self.domain.metas

        if sp_data is not None:
            if role == QtCore.Qt.DisplayRole:
                if density == Storage.SPARSE:
                    return ", ".join(
                        "{}={}".format(attributes[i].name,
                                       attributes[i].repr_val(v))
                        for i, v in zip(sp_data.indices, sp_data.data))
                else:
                    return ", ".join(
                        attributes[i].name for i in sp_data.indices)

        else:   # not sparse
            attr = self.all_attrs[col]
            val = example[attr]
            if role == QtCore.Qt.DisplayRole:
                return str(val)
            elif (role == gui.TableBarItem.BarRole and
                    isinstance(attr, ContinuousVariable) and
                    not isnan(val)):
                if self.dist is None:
                    self.dist = datacaching.getCached(
                        self.examples, basic_stats.DomainBasicStats,
                        (self.examples, True))
                dist = self.dist[col]
                return (val - dist.min) / (dist.max - dist.min or 1)
            elif role == gui.TableValueRole:
                return val
            elif role == gui.TableVariable:
                return val.variable

        return self._other_data.get((index.row(), index.column(), role), None)
Example #25
    def set_data(self, data, subset_data=None, **args):
        if args.get("skipIfSame", 1):
            if checksum(data) == checksum(self.raw_data) and \
               checksum(subset_data) == checksum(self.raw_subset_data):
                return

        self.domain_data_stat = []
        self.attr_values = {}
        self.original_data = self.original_subset_data = None
        self.scaled_data = self.scaled_subset_data = None
        self.no_jittering_scaled_data = self.no_jittering_scaled_subset_data = None
        self.valid_data_array = self.valid_subset_data_array = None

        self.raw_data = None
        self.raw_subset_data = None
        self.have_data = False
        self.have_subset_data = False
        self.data_has_class = False
        self.data_has_continuous_class = False
        self.data_has_discrete_class = False
        self.data_class_name = None
        self.data_domain = None
        self.data_class_index = None

        if data is None:
            return
        full_data = self.merge_data_sets(data, subset_data)

        self.raw_data = data
        self.raw_subset_data = subset_data

        len_data = data and len(data) or 0

        self.attribute_names = [attr.name for attr in full_data.domain]
        self.attribute_name_index = dict([(full_data.domain[i].name, i)
                                          for i in range(len(full_data.domain))])
        self.attribute_flip_info = {}

        self.data_domain = full_data.domain
        self.data_has_class = bool(full_data.domain.class_var)
        self.data_has_continuous_class = bool(self.data_has_class and
                                              full_data.domain.class_var.var_type == VarTypes.Continuous)
        self.data_has_discrete_class = bool(self.data_has_class and
                                            full_data.domain.class_var.var_type == VarTypes.Discrete)
        self.data_class_name = self.data_has_class and full_data.domain.class_var.name
        if self.data_has_class:
            self.data_class_index = self.attribute_name_index[self.data_class_name]
        self.have_data = bool(self.raw_data and len(self.raw_data) > 0)
        self.have_subset_data = bool(self.raw_subset_data and
                                     len(self.raw_subset_data) > 0)

        self.domain_data_stat = getCached(full_data,
                                          DomainBasicStats,
                                          (full_data,))

        sort_values_for_discrete_attrs = args.get("sort_values_for_discrete_attrs",
                                                  1)

        for index in range(len(full_data.domain)):
            attr = full_data.domain[index]
            if attr.var_type == VarTypes.Discrete:
                self.attr_values[attr.name] = [0, len(attr.values)]
            elif attr.var_type == VarTypes.Continuous:
                self.attr_values[attr.name] = [self.domain_data_stat[index].min,
                                               self.domain_data_stat[index].max]

        # the original_data, no_jittering_scaled_data and validArray are arrays
        # that we can cache so that other visualization widgets don't need to
        # compute it. The scaled_data on the other hand has to be computed for
        # each widget separately because of different
        # jitter_continuous and jitter_size values
        if getCached(data, "visualizationData") and subset_data is None:
            self.original_data, self.no_jittering_scaled_data, \
                self.valid_data_array = getCached(data, "visualizationData")
            self.original_subset_data = self.no_jittering_scaled_subset_data = \
                self.valid_subset_data_array = \
                np.array([]).reshape([len(self.original_data), 0])
        else:
            no_jittering_data = np.hstack((full_data.X, full_data.Y)).T
            valid_data_array = ~np.isnan(no_jittering_data)  # True where a value is present
            original_data = no_jittering_data.copy()

            for index in range(len(data.domain)):
                attr = data.domain[index]
                if attr.var_type == VarTypes.Discrete:
                    # see if the values for discrete attributes have to be resorted
                    variable_value_indices = get_variable_value_indices(data.domain[index],
                                                                        sort_values_for_discrete_attrs)
                    if 0 in [i == variable_value_indices[attr.values[i]]
                             for i in range(len(attr.values))]:
                        # make the array contiguous, otherwise the putmask
                        # function does not work
                        line = no_jittering_data[index].copy()
                        indices = [np.where(line == val, 1, 0)
                                   for val in range(len(attr.values))]
                        for i in range(len(attr.values)):
                            np.putmask(line, indices[i],
                                       variable_value_indices[attr.values[i]])
                        no_jittering_data[index] = line   # save the changed array
                        original_data[index] = line     # reorder also the values in the original data
                    no_jittering_data[index] = ((no_jittering_data[index] * 2.0 + 1.0)
                                                / float(2 * len(attr.values)))

                elif attr.var_type == VarTypes.Continuous:
                    # if all values are the same then prevent division by zero
                    diff = (self.domain_data_stat[index].max -
                            self.domain_data_stat[index].min) or 1
                    no_jittering_data[index] = (no_jittering_data[index] -
                                                self.domain_data_stat[index].min) / diff

            self.original_data = original_data[:, :len_data]
            self.original_subset_data = original_data[:, len_data:]
            self.no_jittering_scaled_data = no_jittering_data[:, :len_data]
            self.no_jittering_scaled_subset_data = no_jittering_data[:, len_data:]
            self.valid_data_array = valid_data_array[:, :len_data]
            self.valid_subset_data_array = valid_data_array[:, len_data:]

        if data:
            setCached(data, "visualizationData",
                      (self.original_data, self.no_jittering_scaled_data,
                       self.valid_data_array))
        if subset_data:
            setCached(subset_data, "visualizationData",
                      (self.original_subset_data,
                       self.no_jittering_scaled_subset_data,
                       self.valid_subset_data_array))

        # compute the scaled_data arrays
        scaled_data = np.concatenate([self.no_jittering_scaled_data,
                                      self.no_jittering_scaled_subset_data],
                                     axis=1)

        # Random generators for jittering
        random = np.random.RandomState(seed=self.jitter_seed)
        rand_seeds = random.random_integers(0, sys.maxsize - 1, size=len(data.domain))
        for index, rseed in zip(list(range(len(data.domain))), rand_seeds):
            # Need to use a different seed for each feature
            random = np.random.RandomState(seed=rseed)
            attr = data.domain[index]
            if attr.var_type == VarTypes.Discrete:
                scaled_data[index] += (self.jitter_size / (50.0 * max(1, len(attr.values)))) * \
                                      (random.rand(len(full_data)) - 0.5)

            elif attr.var_type == VarTypes.Continuous and self.jitter_continuous:
                scaled_data[index] += self.jitter_size / 50.0 * (0.5 - random.rand(len(full_data)))
                scaled_data[index] = np.absolute(scaled_data[index])       # fix values below zero
                ind = np.where(scaled_data[index] > 1.0, 1, 0)     # fix values above 1
                np.putmask(scaled_data[index], ind, 2.0 - np.compress(ind, scaled_data[index]))

        if self.have_subset_data:
            # Fix all subset instances which are also in the main data
            # to have the same jittered values
            ids_to_indices = dict((inst.id, i)
                                  for i, inst in enumerate(self.raw_data))

            subset_ids_map = [[i, ids_to_indices[s.id]]
                              for i, s in enumerate(self.raw_subset_data)
                              if s.id in ids_to_indices]
            if len(subset_ids_map):
                subset_ids_map = np.array(subset_ids_map)
                subset_ids_map[:, 0] += len_data
                scaled_data[:, subset_ids_map[:, 0]] = \
                    scaled_data[:, subset_ids_map[:, 1]]

        self.scaled_data = scaled_data[:, :len_data]
        self.scaled_subset_data = scaled_data[:, len_data:]
Example #26
    def set_data(self, data, **args):
        if args.get("skipIfSame", 1):
            if checksum(data) == checksum(self.raw_data):
                return

        self.domain_data_stat = []
        self.attr_values = {}
        self.original_data = None
        self.scaled_data = None
        self.no_jittering_scaled_data = None
        self.valid_data_array = None

        self.raw_data = None
        self.have_data = False
        self.data_has_class = False
        self.data_has_continuous_class = False
        self.data_has_discrete_class = False
        self.data_class_name = None
        self.data_domain = None
        self.data_class_index = None

        if data is None:
            return
        full_data = data
        self.raw_data = data

        len_data = data and len(data) or 0

        self.attribute_names = [attr.name for attr in full_data.domain]
        self.attribute_name_index = dict([
            (full_data.domain[i].name, i) for i in range(len(full_data.domain))
        ])
        self.attribute_flip_info = {}

        self.data_domain = full_data.domain
        self.data_has_class = bool(full_data.domain.class_var)
        self.data_has_continuous_class = full_data.domain.has_continuous_class
        self.data_has_discrete_class = full_data.domain.has_discrete_class

        self.data_class_name = self.data_has_class and full_data.domain.class_var.name
        if self.data_has_class:
            self.data_class_index = self.attribute_name_index[
                self.data_class_name]
        self.have_data = bool(self.raw_data and len(self.raw_data) > 0)

        self.domain_data_stat = getCached(full_data, DomainBasicStats,
                                          (full_data, ))

        sort_values_for_discrete_attrs = args.get(
            "sort_values_for_discrete_attrs", 1)

        for index in range(len(full_data.domain)):
            attr = full_data.domain[index]
            if attr.is_discrete:
                self.attr_values[attr.name] = [0, len(attr.values)]
            elif attr.is_continuous:
                self.attr_values[attr.name] = [
                    self.domain_data_stat[index].min,
                    self.domain_data_stat[index].max
                ]

        if 'no_data' in args:
            return

        # the original_data, no_jittering_scaled_data and validArray are arrays
        # that we can cache so that other visualization widgets don't need to
        # compute it. The scaled_data on the other hand has to be computed for
        # each widget separately because of different
        # jitter_continuous and jitter_size values
        if getCached(data, "visualizationData"):
            self.original_data, self.no_jittering_scaled_data, \
                self.valid_data_array = getCached(data, "visualizationData")
        else:
            no_jittering_data = np.c_[full_data.X, full_data.Y].T
            valid_data_array = ~np.isnan(no_jittering_data)
            original_data = no_jittering_data.copy()

            for index in range(len(data.domain)):
                attr = data.domain[index]
                if attr.is_discrete:
                    # see if the values for discrete attributes have to be resorted
                    variable_value_indices = get_variable_value_indices(
                        data.domain[index], sort_values_for_discrete_attrs)
                    if 0 in [
                            i == variable_value_indices[attr.values[i]]
                            for i in range(len(attr.values))
                    ]:
                        # make the array contiguous, otherwise the putmask
                        # function does not work
                        line = no_jittering_data[index].copy()
                        indices = [
                            np.where(line == val, 1, 0)
                            for val in range(len(attr.values))
                        ]
                        for i in range(len(attr.values)):
                            np.putmask(line, indices[i],
                                       variable_value_indices[attr.values[i]])
                        # save the changed array
                        no_jittering_data[index] = line
                        # reorder also the values in the original data
                        original_data[index] = line
                    no_jittering_data[index] = (
                        (no_jittering_data[index] * 2.0 + 1.0) /
                        float(2 * len(attr.values)))

                elif attr.is_continuous:
                    # if all values are the same then prevent division by zero
                    diff = (self.domain_data_stat[index].max -
                            self.domain_data_stat[index].min) or 1
                    no_jittering_data[index] = (
                        no_jittering_data[index] -
                        self.domain_data_stat[index].min) / diff

            self.original_data = original_data
            self.no_jittering_scaled_data = no_jittering_data
            self.valid_data_array = valid_data_array

        if data:
            setCached(data, "visualizationData",
                      (self.original_data, self.no_jittering_scaled_data,
                       self.valid_data_array))

        # compute the scaled_data arrays
        scaled_data = self.no_jittering_scaled_data

        # Random generators for jittering
        random = np.random.RandomState(seed=self.jitter_seed)
        rand_seeds = random.random_integers(0,
                                            2**30 - 1,
                                            size=len(data.domain))
        for index, rseed in zip(list(range(len(data.domain))), rand_seeds):
            # Need to use a different seed for each feature
            random = np.random.RandomState(seed=rseed)
            attr = data.domain[index]
            if attr.is_discrete:
                scaled_data[index] += (self.jitter_size / (50.0 * max(1, len(attr.values)))) * \
                                      (random.rand(len(full_data)) - 0.5)

            elif attr.is_continuous and self.jitter_continuous:
                scaled_data[index] += self.jitter_size / 50.0 * (
                    0.5 - random.rand(len(full_data)))
                scaled_data[index] = np.absolute(
                    scaled_data[index])  # fix values below zero
                ind = np.where(scaled_data[index] > 1.0, 1,
                               0)  # fix values above 1
                np.putmask(scaled_data[index], ind,
                           2.0 - np.compress(ind, scaled_data[index]))

        self.scaled_data = scaled_data[:, :len_data]
Example #27
    def set_data(self, data, **args):
        if args.get("skipIfSame", 1):
            if checksum(data) == checksum(self.raw_data):
                return

        self.domain_data_stat = []
        self.attr_values = {}
        self.original_data = None
        self.scaled_data = None
        self.no_jittering_scaled_data = None
        self.valid_data_array = None

        self.raw_data = None
        self.have_data = False
        self.data_has_class = False
        self.data_has_continuous_class = False
        self.data_has_discrete_class = False
        self.data_class_name = None
        self.data_domain = None
        self.data_class_index = None

        if data is None:
            return
        full_data = data
        self.raw_data = data

        len_data = data and len(data) or 0

        self.attribute_names = [attr.name for attr in full_data.domain]
        self.attribute_name_index = dict([(full_data.domain[i].name, i)
                                          for i in range(len(full_data.domain))])
        self.attribute_flip_info = {}

        self.data_domain = full_data.domain
        self.data_has_class = bool(full_data.domain.class_var)
        self.data_has_continuous_class = full_data.domain.has_continuous_class
        self.data_has_discrete_class = full_data.domain.has_discrete_class

        self.data_class_name = self.data_has_class and full_data.domain.class_var.name
        if self.data_has_class:
            self.data_class_index = self.attribute_name_index[self.data_class_name]
        self.have_data = bool(self.raw_data and len(self.raw_data) > 0)

        self.domain_data_stat = getCached(full_data,
                                          DomainBasicStats,
                                          (full_data,))

        sort_values_for_discrete_attrs = args.get("sort_values_for_discrete_attrs",
                                                  1)

        for index in range(len(full_data.domain)):
            attr = full_data.domain[index]
            if attr.is_discrete:
                self.attr_values[attr.name] = [0, len(attr.values)]
            elif attr.is_continuous:
                self.attr_values[attr.name] = [self.domain_data_stat[index].min,
                                               self.domain_data_stat[index].max]

        if 'no_data' in args:
            return

        # the original_data, no_jittering_scaled_data and validArray are arrays
        # that we can cache so that other visualization widgets don't need to
        # compute it. The scaled_data on the other hand has to be computed for
        # each widget separately because of different
        # jitter_continuous and jitter_size values
        if getCached(data, "visualizationData"):
            self.original_data, self.no_jittering_scaled_data, \
                self.valid_data_array = getCached(data, "visualizationData")
        else:
            no_jittering_data = np.c_[full_data.X, full_data.Y].T
            valid_data_array = ~np.isnan(no_jittering_data)
            original_data = no_jittering_data.copy()

            for index in range(len(data.domain)):
                attr = data.domain[index]
                if attr.is_discrete:
                    # see if the values for discrete attributes have to be resorted
                    variable_value_indices = get_variable_value_indices(data.domain[index],
                                                                        sort_values_for_discrete_attrs)
                    if 0 in [i == variable_value_indices[attr.values[i]]
                             for i in range(len(attr.values))]:
                        # make the array contiguous, otherwise the putmask
                        # function does not work
                        line = no_jittering_data[index].copy()
                        indices = [np.where(line == val, 1, 0)
                                   for val in range(len(attr.values))]
                        for i in range(len(attr.values)):
                            np.putmask(line, indices[i],
                                       variable_value_indices[attr.values[i]])
                        no_jittering_data[index] = line   # save the changed array
                        original_data[index] = line     # reorder also the values in the original data
                    no_jittering_data[index] = ((no_jittering_data[index] * 2.0 + 1.0)
                                                / float(2 * len(attr.values)))

                elif attr.is_continuous:
                    # if all values are the same then prevent division by zero
                    diff = (self.domain_data_stat[index].max -
                            self.domain_data_stat[index].min) or 1
                    no_jittering_data[index] = (no_jittering_data[index] -
                                                self.domain_data_stat[index].min) / diff

            self.original_data = original_data
            self.no_jittering_scaled_data = no_jittering_data
            self.valid_data_array = valid_data_array

        if data:
            setCached(data, "visualizationData",
                      (self.original_data, self.no_jittering_scaled_data,
                       self.valid_data_array))

        # compute the scaled_data arrays
        scaled_data = self.no_jittering_scaled_data

        # Random generators for jittering
        random = np.random.RandomState(seed=self.jitter_seed)
        rand_seeds = random.random_integers(0, 2 ** 30 - 1,
                                            size=len(data.domain))
        for index, rseed in zip(list(range(len(data.domain))), rand_seeds):
            # Need to use a different seed for each feature
            random = np.random.RandomState(seed=rseed)
            attr = data.domain[index]
            if attr.is_discrete:
                scaled_data[index] += (self.jitter_size / (50.0 * max(1, len(attr.values)))) * \
                                      (random.rand(len(full_data)) - 0.5)

            elif attr.is_continuous and self.jitter_continuous:
                scaled_data[index] += self.jitter_size / 50.0 * (0.5 - random.rand(len(full_data)))
                scaled_data[index] = np.absolute(scaled_data[index])       # fix values below zero
                ind = np.where(scaled_data[index] > 1.0, 1, 0)     # fix values above 1
                np.putmask(scaled_data[index], ind, 2.0 - np.compress(ind, scaled_data[index]))

        self.scaled_data = scaled_data[:, :len_data]
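
The jittering step at the end of Examples #23 and #25-#27 keeps perturbed continuous values inside [0, 1] by reflecting at both edges: values pushed below zero are mirrored back with the absolute value, and values pushed above one are folded back as 2 - x. The same folding without the Orange scaffolding, using boolean indexing instead of putmask/compress (the column is illustrative):

    import numpy as np

    rng = np.random.RandomState(42)
    column = np.array([0.0, 0.02, 0.5, 0.98, 1.0])
    jittered = column + 0.05 * (0.5 - rng.rand(len(column)))

    jittered = np.absolute(jittered)          # reflect values below 0 upwards
    above = jittered > 1.0
    jittered[above] = 2.0 - jittered[above]   # reflect values above 1 downwards
    assert ((jittered >= 0) & (jittered <= 1)).all()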