Пример #1
0
    def add_new_class_label(self, undoable=True):
        """Append a fresh, unused class label to ``self.class_model``.

        The label is the first name yielded by ``namegen("C", 1)`` that is not
        already contained in ``self.class_model``.  With `undoable` true the
        command is pushed onto ``self.undo_stack``; otherwise its ``redo`` is
        invoked directly.
        """
        candidates = namegen("C", 1)
        newlabel = next(name for name in candidates if name not in self.class_model)

        def append_label():
            return self.class_model.append(newlabel)

        def drop_last_label():
            # Undo removes the trailing item, i.e. the label appended by redo.
            return self.class_model.__delitem__(-1)

        command = SimpleUndoCommand(append_label, drop_last_label)
        if not undoable:
            command.redo()
            return
        self.undo_stack.push(command)
Пример #2
0
    def add_new_class_label(self, undoable=True):
        """Add a new class label that is not yet used in ``self.class_model``.

        Candidate names come from ``namegen('C', 1)``; the first one absent
        from the model is taken.  The change is wrapped in a
        ``SimpleUndoCommand`` which is either pushed onto ``self.undo_stack``
        (`undoable` true) or has its ``redo`` called directly.
        """
        unused = filter(lambda label: label not in self.class_model,
                        namegen('C', 1))
        newlabel = next(unused)

        def redo():
            return self.class_model.append(newlabel)

        def undo():
            # Removes the last item of the model.
            return self.class_model.__delitem__(-1)

        command = SimpleUndoCommand(redo, undo)
        apply_command = self.undo_stack.push if undoable else (
            lambda cmd: cmd.redo())
        apply_command(command)
Пример #3
0
    def __init__(self, data: np.ndarray, ncols: int, header: _TableHeader,
                 offset: int):
        """Store the given inputs and set up empty per-group accumulators.

        `data`, `ncols`, `header` and `offset` are kept on the instance
        unchanged.  A name generator with the prefix ``'Feature '`` starting
        at 1 is created, together with empty lists for variables and columns.
        """
        # Inputs, stored exactly as passed in.
        self.data, self.ncols = data, ncols
        self.header, self.offset = header, offset

        # Source of fallback names ('Feature ' prefix, counting from 1).
        self.namegen: Generator[str] = namegen('Feature ', 1)

        # Variable lists, filled in later.
        # NOTE(review): presumably attributes / class vars / metas -- confirm.
        self.attrs: List[Variable] = []
        self.clses: List[Variable] = []
        self.metas: List[Variable] = []

        # Column buffers, one independent list per group.
        # NOTE(review): X/Y/M/W presumably mean attributes / classes /
        # metas / weights -- confirm against the code that fills them.
        self.cols_X: List[np.ndarray] = []
        self.cols_Y: List[np.ndarray] = []
        self.cols_M: List[np.ndarray] = []
        self.cols_W: List[np.ndarray] = []
Пример #4
0
    def data_table(cls, data, headers=None):
        """
        Return Orange.data.Table given rows of `headers` (iterable of iterable)
        and rows of `data` (iterable of iterable).

        Basically, the idea of subclasses is to produce those two iterables,
        however they might.

        If `headers` is not provided, the header rows are extracted from `data`,
        assuming they precede it.

        Raises ValueError if a column declared as continuous contains a value
        that cannot be converted to float.  Emits a warning when data rows
        were wider than the headers and had to be truncated.
        """
        if not headers:
            headers, data = cls.parse_headers(data)

        # Consider various header types (single-row, two-row, three-row, none)
        if len(headers) == 3:
            names, types, flags = map(list, headers)
        else:
            if len(headers) == 1:
                HEADER1_FLAG_SEP = '#'
                # First row format either:
                #   1) delimited column names
                #   2) -||- with type and flags prepended, separated by #,
                #      e.g. d#sex,c#age,cC#IQ
                _flags, names = zip(*[
                    i.split(HEADER1_FLAG_SEP, 1) if HEADER1_FLAG_SEP in i else
                    ('', i) for i in headers[0]
                ])
                names = list(names)
            elif len(headers) == 2:
                names, _flags = map(list, headers)
            else:
                # Use heuristics for everything
                names, _flags = [], []
            # In a combined flag string the upper-case letters give the type
            # and the lower-case letters give the remaining flags
            types = [
                ''.join(filter(str.isupper, flag)).lower() for flag in _flags
            ]
            flags = [Flags.join(filter(str.islower, flag)) for flag in _flags]

        # Determine maximum row length
        rowlen = max(map(len, (names, types, flags)))

        strip = False

        def _equal_length(lst):
            # Truncate (returning a new list) or pad in place with '' so that
            # the list has `rowlen` items; remember whether anything was cut.
            nonlocal strip
            if len(lst) > rowlen > 0:
                lst = lst[:rowlen]
                strip = True
            elif len(lst) < rowlen:
                lst.extend([''] * (rowlen - len(lst)))
            return lst

        # Ensure all data is of equal width in a column-contiguous array
        data = [
            _equal_length([s.strip() for s in row]) for row in data if any(row)
        ]
        data = np.array(data, dtype=object, order='F')

        if strip:
            warnings.warn("Columns with no headers were removed.")

        # Data may actually be longer than headers were
        try:
            rowlen = data.shape[1]
        except IndexError:
            pass
        else:
            for lst in (names, types, flags):
                _equal_length(lst)

        NAMEGEN = namegen('Feature ', 1)
        Xcols, attrs = [], []
        Mcols, metas = [], []
        Ycols, clses = [], []
        Wcols = []

        # Rename variables if necessary
        # Reusing across files still works if both files have same duplicates
        name_counts = Counter(names)
        del name_counts[""]
        if len(name_counts) != len(names) and name_counts:
            uses = {
                name: 0
                for name, count in name_counts.items() if count > 1
            }
            for i, name in enumerate(names):
                if name in uses:
                    uses[name] += 1
                    names[i] = "{}_{}".format(name, uses[name])

        # Mask buffer reused for every column
        namask = np.empty(data.shape[0], dtype=bool)
        # Iterate through the columns
        for col in range(rowlen):
            flag = Flags(Flags.split(flags[col]))
            if flag.i:
                continue

            type_flag = types and types[col].strip()
            try:
                orig_values = data[:, col]
            except IndexError:
                orig_values = np.array([], dtype=object)

            namask = isnastr(orig_values, out=namask)

            coltype_kwargs = {}
            valuemap = None
            values = orig_values

            if type_flag in StringVariable.TYPE_HEADERS:
                coltype = StringVariable
                values = orig_values
            elif type_flag in ContinuousVariable.TYPE_HEADERS:
                coltype = ContinuousVariable
                values = np.empty(data.shape[0], dtype=float)
                try:
                    np.copyto(values,
                              orig_values,
                              casting="unsafe",
                              where=~namask)
                    values[namask] = np.nan
                except ValueError:
                    # Find the offending row so the error can point at it
                    for row, num in enumerate(orig_values):
                        if not isnastr(num):
                            try:
                                float(num)
                            except ValueError:
                                break
                    raise ValueError('Non-continuous value in (1-based) '
                                     'line {}, column {}'.format(
                                         row + len(headers) + 1, col + 1))

            elif type_flag in TimeVariable.TYPE_HEADERS:
                coltype = TimeVariable
                values = np.where(namask, "", orig_values)
            elif (type_flag in DiscreteVariable.TYPE_HEADERS
                  or _RE_DISCRETE_LIST.match(type_flag)):
                coltype = DiscreteVariable
                orig_values = values = np.where(namask, "", orig_values)
                if _RE_DISCRETE_LIST.match(type_flag):
                    valuemap = Flags.split(type_flag)
                    coltype_kwargs.update(ordered=True)
                else:
                    valuemap = sorted(set(orig_values) - {""})
            else:
                # No known type specified, use heuristics
                valuemap, values, coltype = guess_data_type(
                    orig_values, namask)

            if flag.m or coltype is StringVariable:
                append_to = (Mcols, metas)
            elif flag.w:
                # Weight columns get no domain variable
                append_to = (Wcols, None)
            elif flag.c:
                append_to = (Ycols, clses)
            else:
                append_to = (Xcols, attrs)

            cols, domain_vars = append_to

            if domain_vars is not None:
                var_name = names and names[col]
                if not var_name:
                    var_name = next(NAMEGEN)

                values, var = sanitize_variable(valuemap,
                                                values,
                                                orig_values,
                                                coltype,
                                                coltype_kwargs,
                                                name=var_name)
                var.attributes.update(flag.attributes)
                domain_vars.append(var)

            if isinstance(values, np.ndarray) and not values.flags.owndata:
                values = values.copy()  # might view `data` (string columns)
            cols.append(values)

            try:
                # allow gc to reclaim memory used by string values
                data[:, col] = None
            except IndexError:
                pass

        domain = Domain(attrs, clses, metas)

        if not data.size:
            return Table.from_domain(domain, 0)

        # np.float64 is used instead of the np.float_ alias, which was
        # removed in NumPy 2.0.
        X = Y = M = W = None
        if Xcols:
            X = np.c_[tuple(Xcols)]
            assert X.dtype == np.float64
        else:
            X = np.empty((data.shape[0], 0), dtype=np.float64)
        if Ycols:
            Y = np.c_[tuple(Ycols)]
            assert Y.dtype == np.float64
        if Mcols:
            M = np.c_[tuple(Mcols)].astype(object)
        if Wcols:
            W = np.c_[tuple(Wcols)].astype(float)

        table = Table.from_numpy(domain, X, Y, M, W)
        return table
Пример #5
0
    def data_table(cls, data, headers=None):
        """
        Return Orange.data.Table given rows of `headers` (iterable of iterable)
        and rows of `data` (iterable of iterable; if ``numpy.ndarray``, might
        as well **have it sorted column-major**, e.g. ``order='F'``).
        Basically, the idea of subclasses is to produce those two iterables,
        however they might.
        If `headers` is not provided, the header rows are extracted from `data`,
        assuming they precede it.

        Raises ValueError if a column declared as continuous contains a value
        that cannot be converted to float.
        """
        if not headers:
            headers, data = cls.parse_headers(data)

        # Consider various header types (single-row, two-row, three-row, none)
        if len(headers) == 3:
            names, types, flags = map(list, headers)
        else:
            if len(headers) == 1:
                HEADER1_FLAG_SEP = '#'
                # First row format either:
                #   1) delimited column names
                #   2) -||- with type and flags prepended, separated by #,
                #      e.g. d#sex,c#age,cC#IQ
                _flags, names = zip(*[
                    i.split(HEADER1_FLAG_SEP, 1) if HEADER1_FLAG_SEP in i else
                    ('', i) for i in headers[0]
                ])
                names = list(names)
            elif len(headers) == 2:
                names, _flags = map(list, headers)
            else:
                # Use heuristics for everything
                names, _flags = [], []
            # In a combined flag string the upper-case letters give the type
            # and the lower-case letters give the remaining flags
            types = [
                ''.join(filter(str.isupper, flag)).lower() for flag in _flags
            ]
            flags = [Flags.join(filter(str.islower, flag)) for flag in _flags]

        # Determine maximum row length
        rowlen = max(map(len, (names, types, flags)))

        def _equal_length(lst):
            # Pad in place with '' up to `rowlen` items (no-op if long enough)
            lst.extend([''] * (rowlen - len(lst)))
            return lst

        # Ensure all data is of equal width in a column-contiguous array.
        # The array is built from a fresh list, so a copy is unavoidable;
        # passing copy=False here raises ValueError on NumPy >= 2.0.
        data = np.array([_equal_length(list(row)) for row in data if any(row)],
                        dtype=object,
                        order='F')

        # Data may actually be longer than headers were
        try:
            rowlen = data.shape[1]
        except IndexError:
            pass
        else:
            for lst in (names, types, flags):
                _equal_length(lst)

        NAMEGEN = namegen('Feature ', 1)
        Xcols, attrs = [], []
        Mcols, metas = [], []
        Ycols, clses = [], []
        Wcols = []

        # Rename variables if necessary
        # Reusing across files still works if both files have same duplicates
        name_counts = Counter(names)
        del name_counts[""]
        if len(name_counts) != len(names) and name_counts:
            uses = {
                name: 0
                for name, count in name_counts.items() if count > 1
            }
            for i, name in enumerate(names):
                if name in uses:
                    uses[name] += 1
                    names[i] = "{}_{}".format(name, uses[name])

        # Iterate through the columns
        for col in range(rowlen):
            flag = Flags(Flags.split(flags[col]))
            if flag.i:
                continue

            type_flag = types and types[col].strip()
            try:
                orig_values = [
                    np.nan if i in MISSING_VALUES else i
                    for i in (i.strip() for i in data[:, col])
                ]
            except IndexError:
                # No data instances leads here
                orig_values = []
                # In this case, coltype could be anything. It's set as-is
                # only to satisfy test_table.TableTestCase.test_append
                coltype = DiscreteVariable

            coltype_kwargs = {}
            valuemap = []
            values = orig_values

            if type_flag in StringVariable.TYPE_HEADERS:
                coltype = StringVariable
            elif type_flag in ContinuousVariable.TYPE_HEADERS:
                coltype = ContinuousVariable
                try:
                    values = [float(i) for i in orig_values]
                except ValueError:
                    # Find the offending row so the error can point at it
                    for row, num in enumerate(orig_values):
                        try:
                            float(num)
                        except ValueError:
                            break
                    raise ValueError('Non-continuous value in (1-based) '
                                     'line {}, column {}'.format(
                                         row + len(headers) + 1, col + 1))

            elif type_flag in TimeVariable.TYPE_HEADERS:
                coltype = TimeVariable

            elif (type_flag in DiscreteVariable.TYPE_HEADERS
                  or _RE_DISCRETE_LIST.match(type_flag)):
                coltype = DiscreteVariable
                if _RE_DISCRETE_LIST.match(type_flag):
                    valuemap = Flags.split(type_flag)
                    coltype_kwargs.update(ordered=True)
                else:
                    valuemap = sorted(set(orig_values) - {np.nan})

            else:
                # No known type specified, use heuristics
                valuemap, values, coltype = guess_data_type(orig_values)

            if flag.m or coltype is StringVariable:
                append_to = (Mcols, metas)
            elif flag.w:
                # Weight columns get no domain variable
                append_to = (Wcols, None)
            elif flag.c:
                append_to = (Ycols, clses)
            else:
                append_to = (Xcols, attrs)

            # Here the *cols lists collect column indices into `data`
            cols, domain_vars = append_to
            cols.append(col)

            existing_var, new_var_name = None, None
            if domain_vars is not None:
                existing_var = names and names[col]
                if not existing_var:
                    new_var_name = next(NAMEGEN)

            values, var = sanitize_variable(valuemap, values, orig_values,
                                            coltype, coltype_kwargs,
                                            domain_vars, existing_var,
                                            new_var_name, data)
            if domain_vars is not None:
                var.attributes.update(flag.attributes)
                domain_vars.append(var)

            # Write back the changed data. This is needed to pass the
            # correct, converted values into Table.from_numpy below
            try:
                data[:, col] = values
            except IndexError:
                pass

        domain = Domain(attrs, clses, metas)

        if not data.size:
            return Table.from_domain(domain, 0)

        table = Table.from_numpy(domain, data[:, Xcols].astype(float,
                                                               order='C'),
                                 data[:, Ycols].astype(float, order='C'),
                                 data[:, Mcols].astype(object, order='C'),
                                 data[:, Wcols].astype(float, order='C'))
        return table
Пример #6
0
    def data_table(cls, data, headers=None):
        """
        Return Orange.data.Table given rows of `headers` (iterable of iterable)
        and rows of `data` (iterable of iterable).

        Basically, the idea of subclasses is to produce those two iterables,
        however they might.

        If `headers` is not provided, the header rows are extracted from `data`,
        assuming they precede it.

        Raises ValueError if a column declared as continuous contains a value
        that cannot be converted to float.  Emits a warning when data rows
        were wider than the headers and had to be truncated.
        """
        if not headers:
            headers, data = cls.parse_headers(data)

        # Consider various header types (single-row, two-row, three-row, none)
        if len(headers) == 3:
            names, types, flags = map(list, headers)
        else:
            if len(headers) == 1:
                HEADER1_FLAG_SEP = '#'
                # First row format either:
                #   1) delimited column names
                #   2) -||- with type and flags prepended, separated by #,
                #      e.g. d#sex,c#age,cC#IQ
                _flags, names = zip(*[i.split(HEADER1_FLAG_SEP, 1)
                                      if HEADER1_FLAG_SEP in i else ('', i)
                                      for i in headers[0]]
                                   )
                names = list(names)
            elif len(headers) == 2:
                names, _flags = map(list, headers)
            else:
                # Use heuristics for everything
                names, _flags = [], []
            # In a combined flag string the upper-case letters give the type
            # and the lower-case letters give the remaining flags
            types = [''.join(filter(str.isupper, flag)).lower() for flag in _flags]
            flags = [Flags.join(filter(str.islower, flag)) for flag in _flags]

        # Determine maximum row length
        rowlen = max(map(len, (names, types, flags)))

        strip = False

        def _equal_length(lst):
            # Truncate (returning a new list) or pad in place with '' so that
            # the list has `rowlen` items; remember whether anything was cut.
            nonlocal strip
            if len(lst) > rowlen > 0:
                lst = lst[:rowlen]
                strip = True
            elif len(lst) < rowlen:
                lst.extend(['']*(rowlen - len(lst)))
            return lst

        # Ensure all data is of equal width in a column-contiguous array
        data = [_equal_length([s.strip() for s in row])
                for row in data if any(row)]
        data = np.array(data, dtype=object, order='F')

        if strip:
            warnings.warn("Columns with no headers were removed.")

        # Data may actually be longer than headers were
        try:
            rowlen = data.shape[1]
        except IndexError:
            pass
        else:
            for lst in (names, types, flags):
                _equal_length(lst)

        NAMEGEN = namegen('Feature ', 1)
        Xcols, attrs = [], []
        Mcols, metas = [], []
        Ycols, clses = [], []
        Wcols = []

        # Rename variables if necessary
        # Reusing across files still works if both files have same duplicates
        name_counts = Counter(names)
        del name_counts[""]
        if len(name_counts) != len(names) and name_counts:
            uses = {name: 0 for name, count in name_counts.items() if count > 1}
            for i, name in enumerate(names):
                if name in uses:
                    uses[name] += 1
                    names[i] = "{}_{}".format(name, uses[name])

        # Mask buffer reused for every column
        namask = np.empty(data.shape[0], dtype=bool)
        # Iterate through the columns
        for col in range(rowlen):
            flag = Flags(Flags.split(flags[col]))
            if flag.i:
                continue

            type_flag = types and types[col].strip()
            try:
                orig_values = data[:, col]
            except IndexError:
                orig_values = np.array([], dtype=object)

            namask = isnastr(orig_values, out=namask)

            coltype_kwargs = {}
            valuemap = None
            values = orig_values

            if type_flag in StringVariable.TYPE_HEADERS:
                coltype = StringVariable
                values = orig_values
            elif type_flag in ContinuousVariable.TYPE_HEADERS:
                coltype = ContinuousVariable
                values = np.empty(data.shape[0], dtype=float)
                try:
                    np.copyto(values, orig_values, casting="unsafe",
                              where=~namask)
                    values[namask] = np.nan
                except ValueError:
                    # Find the offending row so the error can point at it
                    for row, num in enumerate(orig_values):
                        if not isnastr(num):
                            try:
                                float(num)
                            except ValueError:
                                break
                    raise ValueError('Non-continuous value in (1-based) '
                                     'line {}, column {}'.format(row + len(headers) + 1,
                                                                 col + 1))

            elif type_flag in TimeVariable.TYPE_HEADERS:
                coltype = TimeVariable
                values = np.where(namask, "", orig_values)
            elif (type_flag in DiscreteVariable.TYPE_HEADERS or
                  _RE_DISCRETE_LIST.match(type_flag)):
                coltype = DiscreteVariable
                orig_values = values = np.where(namask, "", orig_values)
                if _RE_DISCRETE_LIST.match(type_flag):
                    valuemap = Flags.split(type_flag)
                    coltype_kwargs.update(ordered=True)
                else:
                    valuemap = sorted(set(orig_values) - {""})
            else:
                # No known type specified, use heuristics
                valuemap, values, coltype = guess_data_type(orig_values, namask)

            if flag.m or coltype is StringVariable:
                append_to = (Mcols, metas)
            elif flag.w:
                # Weight columns get no domain variable
                append_to = (Wcols, None)
            elif flag.c:
                append_to = (Ycols, clses)
            else:
                append_to = (Xcols, attrs)

            cols, domain_vars = append_to

            if domain_vars is not None:
                var_name = names and names[col]
                if not var_name:
                    var_name = next(NAMEGEN)

                values, var = sanitize_variable(
                    valuemap, values, orig_values, coltype, coltype_kwargs,
                    name=var_name)
                var.attributes.update(flag.attributes)
                domain_vars.append(var)

            if isinstance(values, np.ndarray) and not values.flags.owndata:
                values = values.copy()  # might view `data` (string columns)
            cols.append(values)

            try:
                # allow gc to reclaim memory used by string values
                data[:, col] = None
            except IndexError:
                pass

        domain = Domain(attrs, clses, metas)

        if not data.size:
            return Table.from_domain(domain, 0)

        # np.float64 is used instead of the np.float_ alias, which was
        # removed in NumPy 2.0.
        X = Y = M = W = None
        if Xcols:
            X = np.c_[tuple(Xcols)]
            assert X.dtype == np.float64
        else:
            X = np.empty((data.shape[0], 0), dtype=np.float64)
        if Ycols:
            Y = np.c_[tuple(Ycols)]
            assert Y.dtype == np.float64
        if Mcols:
            M = np.c_[tuple(Mcols)].astype(object)
        if Wcols:
            W = np.c_[tuple(Wcols)].astype(float)

        table = Table.from_numpy(domain, X, Y, M, W)
        return table
Пример #7
0
    def data_table(self, data, headers=None):
        """
        Return Orange.data.Table given rows of `headers` (iterable of iterable)
        and rows of `data` (iterable of iterable; if ``numpy.ndarray``, might
        as well **have it sorted column-major**, e.g. ``order='F'``).

        Basically, the idea of subclasses is to produce those two iterables,
        however they might.

        If `headers` is not provided, the header rows are extracted from `data`,
        assuming they precede it.

        Raises ValueError if a column declared as continuous contains a value
        that cannot be converted to float.
        """
        if not headers:
            headers, data = self.parse_headers(data)

        # Consider various header types (single-row, two-row, three-row, none)
        if 3 == len(headers):
            names, types, flags = map(list, headers)
        else:
            if 1 == len(headers):
                HEADER1_FLAG_SEP = '#'
                # First row format either:
                #   1) delimited column names
                #   2) -||- with type and flags prepended, separated by #,
                #      e.g. d#sex,c#age,cC#IQ
                _flags, names = zip(*[i.split(HEADER1_FLAG_SEP, 1) if HEADER1_FLAG_SEP in i else ('', i)
                                      for i in headers[0]])
                names = list(names)
            elif 2 == len(headers):
                names, _flags = map(list, headers)
            else:
                # Use heuristics for everything
                names, _flags = [], []
            # In a combined flag string the upper-case letters give the type
            # and the lower-case letters give the remaining flags
            types = [''.join(filter(str.isupper, flag)).lower() for flag in _flags]
            flags = [Flags.join(filter(str.islower, flag)) for flag in _flags]

        # Determine maximum row length
        rowlen = max(map(len, (names, types, flags)))

        def _equal_length(lst):
            # Pad in place with '' up to `rowlen` items (no-op if long enough)
            lst.extend(['']*(rowlen - len(lst)))
            return lst

        # Ensure all data is of equal width in a column-contiguous array.
        # The array is built from a fresh list, so a copy is unavoidable;
        # passing copy=False here raises ValueError on NumPy >= 2.0.
        data = np.array([_equal_length(list(row)) for row in data if any(row)],
                        dtype=object, order='F')

        # Data may actually be longer than headers were
        try:
            rowlen = data.shape[1]
        except IndexError:
            pass
        else:
            for lst in (names, types, flags):
                _equal_length(lst)

        NAMEGEN = namegen('Feature ', 1)
        Xcols, attrs = [], []
        Mcols, metas = [], []
        Ycols, clses = [], []
        Wcols = []

        # Rename variables if necessary
        # Reusing across files still works if both files have same duplicates
        name_counts = Counter(names)
        del name_counts[""]
        if len(name_counts) != len(names) and name_counts:
            uses = {name: 0 for name, count in name_counts.items() if count > 1}
            for i, name in enumerate(names):
                if name in uses:
                    uses[name] += 1
                    names[i] = "{}_{}".format(name, uses[name])

        # Iterate through the columns
        for col in range(rowlen):
            flag = Flags(Flags.split(flags[col]))
            if flag.i:
                continue

            type_flag = types and types[col].strip()
            try:
                orig_values = [np.nan if i in MISSING_VALUES else i
                               for i in (i.strip() for i in data[:, col])]
            except IndexError:
                # No data instances leads here
                orig_values = []
                # In this case, coltype could be anything. It's set as-is
                # only to satisfy test_table.TableTestCase.test_append
                coltype = DiscreteVariable

            coltype_kwargs = {}
            valuemap = []
            values = orig_values

            if type_flag in StringVariable.TYPE_HEADERS:
                coltype = StringVariable
            elif type_flag in ContinuousVariable.TYPE_HEADERS:
                coltype = ContinuousVariable
                try:
                    values = [float(i) for i in orig_values]
                except ValueError:
                    # Find the offending row so the error can point at it
                    for row, num in enumerate(orig_values):
                        try:
                            float(num)
                        except ValueError:
                            break
                    raise ValueError('Non-continuous value in (1-based) '
                                     'line {}, column {}'.format(row + len(headers) + 1,
                                                                 col + 1))

            elif type_flag in TimeVariable.TYPE_HEADERS:
                coltype = TimeVariable

            elif (type_flag in DiscreteVariable.TYPE_HEADERS or
                  _RE_DISCRETE_LIST.match(type_flag)):
                coltype = DiscreteVariable
                if _RE_DISCRETE_LIST.match(type_flag):
                    valuemap = Flags.split(type_flag)
                    coltype_kwargs.update(ordered=True)
                else:
                    valuemap = sorted(set(orig_values) - {np.nan})

            else:
                # No known type specified, use heuristics
                valuemap, values, coltype = guess_data_type(orig_values)

            if flag.m or coltype is StringVariable:
                append_to = (Mcols, metas)
            elif flag.w:
                # Weight columns get no domain variable
                append_to = (Wcols, None)
            elif flag.c:
                append_to = (Ycols, clses)
            else:
                append_to = (Xcols, attrs)

            # Here the *cols lists collect column indices into `data`
            cols, domain_vars = append_to
            cols.append(col)

            existing_var, new_var_name = None, None
            if domain_vars is not None:
                existing_var = names and names[col]
                if not existing_var:
                    new_var_name = next(NAMEGEN)

            values, var = sanitize_variable(
                valuemap, values, orig_values, coltype, coltype_kwargs,
                domain_vars, existing_var, new_var_name, data)
            if domain_vars is not None:
                var.attributes.update(flag.attributes)
                domain_vars.append(var)

            # Write back the changed data. This is needed to pass the
            # correct, converted values into Table.from_numpy below
            try:
                data[:, col] = values
            except IndexError:
                pass

        domain = Domain(attrs, clses, metas)

        if not data.size:
            return Table.from_domain(domain, 0)

        table = Table.from_numpy(domain,
                                 data[:, Xcols].astype(float, order='C'),
                                 data[:, Ycols].astype(float, order='C'),
                                 data[:, Mcols].astype(object, order='C'),
                                 data[:, Wcols].astype(float, order='C'))
        return table
Пример #8
0
    def data_table(self, data, headers=None):
        """
        Return Orange.data.Table given rows of `headers` (iterable of iterable)
        and rows of `data` (iterable of iterable; if ``numpy.ndarray``, might
        as well **have it sorted column-major**, e.g. ``order='F'``).

        Basically, the idea of subclasses is to produce those two iterables,
        however they might.

        If `headers` is not provided, the header rows are extracted from `data`,
        assuming they precede it (via ``self.parse_headers``).

        The number of header rows selects how they are read:

        * three rows: names, types and flags, in that order;
        * two rows: names, then a combined type/flags row;
        * one row: names, each optionally prefixed by a combined type/flags
          string and ``#`` (e.g. ``cC#IQ``);
        * any other count: no header information; names are generated and
          types are guessed from the values.

        In a combined type/flags string the upper-case characters (lowered)
        give the type and the lower-case characters give the flags.

        Rows of `data` for which ``any(row)`` is false are dropped. Columns
        whose flags set ``i`` are skipped. A column goes to the metas when it
        is flagged ``m`` or turns out to be a string column, to the fifth
        ``Table.from_numpy`` argument when flagged ``w``, to the class
        variables when flagged ``c``, and to the attributes otherwise.

        Raises
        ------
        ValueError
            If a column whose type header declares it continuous contains a
            value that ``float()`` cannot convert.
        """
        if not headers:
            headers, data = self.parse_headers(data)

        # Consider various header types (single-row, two-row, three-row, none)
        if 3 == len(headers):
            names, types, flags = map(list, headers)
        else:
            if 1 == len(headers):
                HEADER1_FLAG_SEP = '#'
                # First row format either:
                #   1) delimited column names
                #   2) -||- with type and flags prepended, separated by #,
                #      e.g. d#sex,c#age,cC#IQ
                _flags, names = zip(*[i.split(HEADER1_FLAG_SEP, 1) if HEADER1_FLAG_SEP in i else ('', i)
                                      for i in headers[0]])
                names = list(names)
            elif 2 == len(headers):
                names, _flags = map(list, headers)
            else:
                # Use heuristics for everything
                names, _flags = [], []
            # Upper-case characters encode the type, lower-case ones the flags
            types = [''.join(filter(str.isupper, flag)).lower() for flag in _flags]
            flags = [Flags.join(filter(str.islower, flag)) for flag in _flags]

        # Determine maximum row length
        rowlen = max(map(len, (names, types, flags)))

        def _equal_length(lst):
            # Pads in place with empty strings. ``rowlen`` is looked up at call
            # time, so calls made after it is reassigned below pad to the new
            # width.
            lst.extend(['']*(rowlen - len(lst)))
            return lst

        # Ensure all data is of equal width in a column-contiguous array.
        # No ``copy=False`` here: the input is a freshly built list, so a new
        # array has to be allocated anyway, and NumPy >= 2.0 raises ValueError
        # for ``copy=False`` whenever a copy cannot be avoided.
        data = np.array([_equal_length(list(row)) for row in data if any(row)],
                        dtype=object, order='F')

        # Data may actually be longer than headers were
        try: rowlen = data.shape[1]
        except IndexError: pass  # no rows: array is 1-D, keep header width
        else:
            for lst in (names, types, flags):
                _equal_length(lst)

        NAMEGEN = namegen('Feature ', 1)
        # Column indices and the matching domain variables, per role
        Xcols, attrs = [], []
        Mcols, metas = [], []
        Ycols, clses = [], []
        Wcols = []

        # Iterate through the columns
        for col in range(rowlen):
            flag = Flags(Flags.split(flags[col]))
            if flag.i: continue

            type_flag = types and types[col].strip()
            try:
                # Stripped cell values with missing-value markers turned to NaN
                orig_values = [np.nan if i in MISSING_VALUES else i
                               for i in (i.strip() for i in data[:, col])]
            except IndexError:
                # No data instances leads here
                orig_values = []
                # In this case, coltype could be anything. It's set as-is
                # only to satisfy test_table.TableTestCase.test_append
                coltype = DiscreteVariable

            coltype_kwargs = {}
            valuemap = []
            values = orig_values

            if type_flag in StringVariable.TYPE_HEADERS:
                coltype = StringVariable
            elif type_flag in ContinuousVariable.TYPE_HEADERS:
                coltype = ContinuousVariable
                try:
                    values = [float(i) for i in orig_values]
                except ValueError:
                    # Locate the first offending row for the error message
                    for row, num in enumerate(orig_values):
                        try: float(num)
                        except ValueError: break
                    raise ValueError('Non-continuous value in (1-based) '
                                     'line {}, column {}'.format(row + len(headers) + 1,
                                                                 col + 1))

            elif type_flag in TimeVariable.TYPE_HEADERS:
                coltype = TimeVariable

            elif (type_flag in DiscreteVariable.TYPE_HEADERS or
                  _RE_DISCRETE_LIST.match(type_flag)):
                # NOTE(review): coltype is only assigned further below when
                # valuemap is non-empty; with no non-missing values it keeps
                # whatever an earlier column (or the no-data path) set.
                if _RE_DISCRETE_LIST.match(type_flag):
                    # Explicit list of values given in the type header
                    valuemap = Flags.split(type_flag)
                    coltype_kwargs.update(ordered=True)
                else:
                    valuemap = sorted(set(orig_values) - {np.nan})

            else:
                # No known type specified, use heuristics
                is_discrete = is_discrete_values(orig_values)
                if is_discrete:
                    valuemap = sorted(is_discrete)
                else:
                    # Try continuous first, then time, and fall back to string
                    try: values = [float(i) for i in orig_values]
                    except ValueError:
                        tvar = TimeVariable('_')
                        try: values = [tvar.parse(i) for i in orig_values]
                        except ValueError:
                            coltype = StringVariable
                        else:
                            coltype = TimeVariable
                    else:
                        coltype = ContinuousVariable

            if valuemap:
                # Map discrete data to ints
                def valuemap_index(val):
                    # Values absent from valuemap (including NaN) become NaN
                    try: return valuemap.index(val)
                    except ValueError: return np.nan

                values = np.vectorize(valuemap_index, otypes=[float])(orig_values)
                coltype = DiscreteVariable
                coltype_kwargs.update(values=valuemap)

            if coltype is StringVariable:
                # Missing strings are stored as empty strings
                values = ['' if i is np.nan else i
                          for i in orig_values]

            if flag.m or coltype is StringVariable:
                append_to = (Mcols, metas)
            elif flag.w:
                # No domain variable is created for this role
                append_to = (Wcols, None)
            elif flag.c:
                append_to = (Ycols, clses)
            else:
                append_to = (Xcols, attrs)

            cols, domain_vars = append_to
            cols.append(col)
            if domain_vars is not None:
                if names and names[col]:
                    # Use existing variable if available
                    var = coltype.make(names[col].strip(), **coltype_kwargs)
                else:
                    # Never use existing for un-named variables
                    var = coltype(next(NAMEGEN), **coltype_kwargs)
                var.attributes.update(flag.attributes)
                domain_vars.append(var)

                # Reorder discrete values to match existing variable
                if var.is_discrete and not var.ordered:
                    new_order, old_order = var.values, coltype_kwargs.get('values', var.values)
                    if new_order != old_order:
                        # Shift every code by ``offset`` first so that codes
                        # already rewritten to their new index are not
                        # mistaken for codes still waiting to be rewritten.
                        offset = len(new_order)
                        column = values if data.ndim > 1 else data
                        column += offset
                        for val in var.values:
                            try: oldval = old_order.index(val)
                            except ValueError: continue
                            bn.replace(column, offset + oldval, new_order.index(val))

            if coltype is TimeVariable:
                # Re-parse the values because only now after coltype.make call
                # above, variable var is the correct one
                # NOTE(review): when domain_vars is None no var is created for
                # this column, so var here would be stale or unbound - confirm.
                values = [var.parse(i) for i in orig_values]

            # Write back the changed data. This is needed to pass the
            # correct, converted values into Table.from_numpy below
            try: data[:, col] = values
            except IndexError: pass  # no data rows: nothing to write back

        domain = Domain(attrs, clses, metas)

        if not data.size:
            return Table.from_domain(domain, 0)

        table = Table.from_numpy(domain,
                                 data[:, Xcols].astype(float, order='C'),
                                 data[:, Ycols].astype(float, order='C'),
                                 data[:, Mcols].astype(object, order='C'),
                                 data[:, Wcols].astype(float, order='C'))
        return table
Пример #9
0
    def data_table(self, data, headers=None):
        """
        Return Orange.data.Table given rows of `headers` (iterable of iterable)
        and rows of `data` (iterable of iterable; if ``numpy.ndarray``, might
        as well **have it sorted column-major**, e.g. ``order='F'``).

        Basically, the idea of subclasses is to produce those two iterables,
        however they might.

        If `headers` is not provided, the header rows are extracted from `data`,
        assuming they precede it (via ``self.parse_headers``).

        The number of header rows selects how they are read:

        * three rows: names, types and flags, in that order;
        * two rows: names, then a combined type/flags row;
        * one row: names, each optionally prefixed by a combined type/flags
          string and ``#`` (e.g. ``cC#IQ``);
        * any other count: no header information; names are generated and
          types are guessed from the values.

        In a combined type/flags string the upper-case characters (lowered)
        give the type and the lower-case characters give the flags.

        Rows of `data` for which ``any(row)`` is false are dropped. Columns
        whose flags set ``i`` are skipped. A column goes to the metas when it
        is flagged ``m`` or turns out to be a string column, to the fifth
        ``Table.from_numpy`` argument when flagged ``w``, to the class
        variables when flagged ``c``, and to the attributes otherwise.

        Raises
        ------
        ValueError
            If a column whose type header declares it continuous contains a
            value that ``float()`` cannot convert.
        """
        if not headers:
            headers, data = self.parse_headers(data)

        # Consider various header types (single-row, two-row, three-row, none)
        if 3 == len(headers):
            names, types, flags = map(list, headers)
        else:
            if 1 == len(headers):
                HEADER1_FLAG_SEP = '#'
                # First row format either:
                #   1) delimited column names
                #   2) -||- with type and flags prepended, separated by #,
                #      e.g. d#sex,c#age,cC#IQ
                _flags, names = zip(*[
                    i.split(HEADER1_FLAG_SEP, 1) if HEADER1_FLAG_SEP in i else
                    ('', i) for i in headers[0]
                ])
                names = list(names)
            elif 2 == len(headers):
                names, _flags = map(list, headers)
            else:
                # Use heuristics for everything
                names, _flags = [], []
            # Upper-case characters encode the type, lower-case ones the flags
            types = [
                ''.join(filter(str.isupper, flag)).lower() for flag in _flags
            ]
            flags = [Flags.join(filter(str.islower, flag)) for flag in _flags]

        # Determine maximum row length
        rowlen = max(map(len, (names, types, flags)))

        def _equal_length(lst):
            # Pads in place with empty strings. ``rowlen`` is looked up at call
            # time, so calls made after it is reassigned below pad to the new
            # width.
            lst.extend([''] * (rowlen - len(lst)))
            return lst

        # Ensure all data is of equal width in a column-contiguous array.
        # No ``copy=False`` here: the input is a freshly built list, so a new
        # array has to be allocated anyway, and NumPy >= 2.0 raises ValueError
        # for ``copy=False`` whenever a copy cannot be avoided.
        data = np.array([_equal_length(list(row)) for row in data if any(row)],
                        dtype=object,
                        order='F')

        # Data may actually be longer than headers were
        try:
            rowlen = data.shape[1]
        except IndexError:
            # No rows: the array is 1-D, keep the header-derived width
            pass
        else:
            for lst in (names, types, flags):
                _equal_length(lst)

        NAMEGEN = namegen('Feature ', 1)
        # Column indices and the matching domain variables, per role
        Xcols, attrs = [], []
        Mcols, metas = [], []
        Ycols, clses = [], []
        Wcols = []

        # Iterate through the columns
        for col in range(rowlen):
            flag = Flags(Flags.split(flags[col]))
            if flag.i: continue

            type_flag = types and types[col].strip()
            try:
                # Stripped cell values with missing-value markers turned to NaN
                orig_values = [
                    np.nan if i in MISSING_VALUES else i
                    for i in (i.strip() for i in data[:, col])
                ]
            except IndexError:
                # No data instances leads here
                orig_values = []
                # In this case, coltype could be anything. It's set as-is
                # only to satisfy test_table.TableTestCase.test_append
                coltype = DiscreteVariable

            coltype_kwargs = {}
            valuemap = []
            values = orig_values

            if type_flag in StringVariable.TYPE_HEADERS:
                coltype = StringVariable
            elif type_flag in ContinuousVariable.TYPE_HEADERS:
                coltype = ContinuousVariable
                try:
                    values = [float(i) for i in orig_values]
                except ValueError:
                    # Locate the first offending row for the error message
                    for row, num in enumerate(orig_values):
                        try:
                            float(num)
                        except ValueError:
                            break
                    raise ValueError('Non-continuous value in (1-based) '
                                     'line {}, column {}'.format(
                                         row + len(headers) + 1, col + 1))

            elif type_flag in TimeVariable.TYPE_HEADERS:
                coltype = TimeVariable

            elif (type_flag in DiscreteVariable.TYPE_HEADERS
                  or _RE_DISCRETE_LIST.match(type_flag)):
                # NOTE(review): coltype is only assigned further below when
                # valuemap is non-empty; with no non-missing values it keeps
                # whatever an earlier column (or the no-data path) set.
                if _RE_DISCRETE_LIST.match(type_flag):
                    # Explicit list of values given in the type header
                    valuemap = Flags.split(type_flag)
                    coltype_kwargs.update(ordered=True)
                else:
                    valuemap = sorted(set(orig_values) - {np.nan})

            else:
                # No known type specified, use heuristics
                is_discrete = is_discrete_values(orig_values)
                if is_discrete:
                    valuemap = sorted(is_discrete)
                else:
                    # Try continuous first, then time, and fall back to string
                    try:
                        values = [float(i) for i in orig_values]
                    except ValueError:
                        tvar = TimeVariable('_')
                        try:
                            values = [tvar.parse(i) for i in orig_values]
                        except ValueError:
                            coltype = StringVariable
                        else:
                            coltype = TimeVariable
                    else:
                        coltype = ContinuousVariable

            if valuemap:
                # Map discrete data to ints
                def valuemap_index(val):
                    # Values absent from valuemap (including NaN) become NaN
                    try:
                        return valuemap.index(val)
                    except ValueError:
                        return np.nan

                values = np.vectorize(valuemap_index,
                                      otypes=[float])(orig_values)
                coltype = DiscreteVariable
                coltype_kwargs.update(values=valuemap)

            if coltype is StringVariable:
                # Missing strings are stored as empty strings
                values = ['' if i is np.nan else i for i in orig_values]

            if flag.m or coltype is StringVariable:
                append_to = (Mcols, metas)
            elif flag.w:
                # No domain variable is created for this role
                append_to = (Wcols, None)
            elif flag.c:
                append_to = (Ycols, clses)
            else:
                append_to = (Xcols, attrs)

            cols, domain_vars = append_to
            cols.append(col)
            if domain_vars is not None:
                if names and names[col]:
                    # Use existing variable if available
                    var = coltype.make(names[col].strip(), **coltype_kwargs)
                else:
                    # Never use existing for un-named variables
                    var = coltype(next(NAMEGEN), **coltype_kwargs)
                var.attributes.update(flag.attributes)
                domain_vars.append(var)

                # Reorder discrete values to match existing variable
                if var.is_discrete and not var.ordered:
                    new_order, old_order = var.values, coltype_kwargs.get(
                        'values', var.values)
                    if new_order != old_order:
                        # Shift every code by ``offset`` first so that codes
                        # already rewritten to their new index are not
                        # mistaken for codes still waiting to be rewritten.
                        offset = len(new_order)
                        column = values if data.ndim > 1 else data
                        column += offset
                        for val in var.values:
                            try:
                                oldval = old_order.index(val)
                            except ValueError:
                                continue
                            bn.replace(column, offset + oldval,
                                       new_order.index(val))

            if coltype is TimeVariable:
                # Re-parse the values because only now after coltype.make call
                # above, variable var is the correct one
                # NOTE(review): when domain_vars is None no var is created for
                # this column, so var here would be stale or unbound - confirm.
                values = [var.parse(i) for i in orig_values]

            # Write back the changed data. This is needed to pass the
            # correct, converted values into Table.from_numpy below
            try:
                data[:, col] = values
            except IndexError:
                # No data rows: nothing to write back
                pass

        domain = Domain(attrs, clses, metas)

        if not data.size:
            return Table.from_domain(domain, 0)

        table = Table.from_numpy(domain, data[:, Xcols].astype(float,
                                                               order='C'),
                                 data[:, Ycols].astype(float, order='C'),
                                 data[:, Mcols].astype(object, order='C'),
                                 data[:, Wcols].astype(float, order='C'))
        return table