def __init__(self, tables, keys, key_name='group', key_type=None, _is_fork=False):
    """
    Set up the set from a collection of tables and a collection of keys.

    The first table (or, when that is itself a :class:`TableSet`, its first
    non-set descendant) is kept as the sample whose column names and types
    describe the whole set.

    :param tables:
        Iterable of tables; copied into a tuple.
    :param keys:
        Iterable of keys; copied into a tuple and handed to
        :class:`MappedSequence` together with the tables.
    :param key_name:
        Stored unchanged on the instance.
    :param key_type:
        Stored on the instance; a new :class:`Text` is used when falsy.
    :param _is_fork:
        When true, the per-table column checks are skipped.
    :raises ValueError:
        If a table's column types are not instances of the sample's column
        type classes, or if its column names differ from the sample's.
    """
    members = tuple(tables)
    member_keys = tuple(keys)

    self._key_name = key_name
    self._key_type = key_type if key_type else Text()

    # Descend through nested sets until an actual table is reached.
    sample = members[0]
    while isinstance(sample, TableSet):
        sample = sample[0]
    self._sample_table = sample

    self._column_types = sample.column_types
    self._column_names = sample.column_names

    if not _is_fork:
        for member in members:
            # zip_longest pads the shorter side with None, so a differing
            # number of columns also fails the isinstance test.
            types_agree = all(
                isinstance(own, type(expected))
                for own, expected in zip_longest(
                    member.column_types, self._column_types))

            if not types_agree:
                raise ValueError('Not all tables have the same column types!')

            if member.column_names != self._column_names:
                raise ValueError('Not all tables have the same column names!')

    MappedSequence.__init__(self, members, member_keys)
def test_ne(self):
    # ``!=`` must be false for equal tuples, lists and MappedSequences and
    # true for a longer tuple or an unrelated object.
    twin = MappedSequence(self.data, self.column_names)

    same_content = ((u'a', u'b', u'c'), [u'a', u'b', u'c'], twin)
    different_content = ((u'a', u'b', u'c', u'd'), 1)

    for other in same_content:
        self.assertFalse(self.row != other)

    for other in different_content:
        self.assertTrue(self.row != other)
def test_stringify_long(self):
    # A six-item sequence is rendered with its first five values followed
    # by an ellipsis; Python 2 shows the ``u`` prefix on each value.
    names = ('one', 'two', 'three', 'four', 'five', 'six')
    values = (u'a', u'b', u'c', u'd', u'e', u'f')
    long_row = MappedSequence(values, names)

    expected = (
        "<agate.MappedSequence: (u'a', u'b', u'c', u'd', u'e', ...)>"
        if six.PY2 else
        "<agate.MappedSequence: ('a', 'b', 'c', 'd', 'e', ...)>"
    )

    self.assertEqual(str(long_row), expected)
def __init__(self, tables, keys, key_name='group', key_type=None):
    """
    Set up the set from a collection of tables and a collection of keys.

    The first table (or, when that is itself a :class:`TableSet`, its first
    non-set descendant) is kept as the sample whose column names and types
    describe the whole set. Every table is then checked against it.

    :param tables:
        Iterable of tables; copied into a tuple.
    :param keys:
        Iterable of keys; copied into a tuple and handed to
        :class:`MappedSequence` together with the tables.
    :param key_name:
        Stored unchanged on the instance.
    :param key_type:
        Stored on the instance; a new :class:`Text` is used when falsy.
    :raises ValueError:
        If a table has a different number of columns, a column type that is
        not an instance of the sample's column type class, or different
        column names.
    """
    tables = tuple(tables)
    keys = tuple(keys)

    self._key_name = key_name
    self._key_type = key_type or Text()
    self._sample_table = tables[0]

    # Descend through nested sets until an actual table is reached.
    while isinstance(self._sample_table, TableSet):
        self._sample_table = self._sample_table[0]

    self._column_types = self._sample_table.column_types
    self._column_names = self._sample_table.column_names

    for table in tables:
        # Compare type *classes*, not instances: separately constructed
        # data type instances are not guaranteed to compare equal, so a
        # plain ``!=`` on the tuples can reject tables with the same schema.
        # NOTE(review): assumes DataType does not define __eq__ - confirm.
        if len(table.column_types) != len(self.column_types) or any(
                not isinstance(own, type(expected))
                for own, expected in zip(table.column_types, self.column_types)):
            raise ValueError('Not all tables have the same column types!')

        if table.column_names != self.column_names:
            raise ValueError('Not all tables have the same column names!')

    MappedSequence.__init__(self, tables, keys)
def __init__(self, tables, keys, key_name='group', key_type=None, _is_fork=False):
    """
    Set up the set from a collection of tables and a collection of keys.

    :param tables:
        Iterable of tables; copied into a tuple. The first one (after
        descending through any nested :class:`TableSet`) becomes the sample
        table that supplies the set's column names and types.
    :param keys:
        Iterable of keys; copied into a tuple and handed to
        :class:`MappedSequence` together with the tables.
    :param key_name:
        Stored unchanged on the instance.
    :param key_type:
        Stored on the instance; a new :class:`Text` is used when falsy.
    :param _is_fork:
        When true, the per-table column checks are skipped.
    :raises ValueError:
        If a table's column types or column names do not match the set's.
    """
    table_tuple = tuple(tables)
    key_tuple = tuple(keys)

    self._key_name = key_name
    self._key_type = key_type if key_type else Text()

    first_leaf = table_tuple[0]
    while isinstance(first_leaf, TableSet):
        first_leaf = first_leaf[0]
    self._sample_table = first_leaf

    self._column_types = first_leaf.column_types
    self._column_names = first_leaf.column_names

    if _is_fork:
        # Forked sets are trusted to be consistent already.
        MappedSequence.__init__(self, table_tuple, key_tuple)
        return

    for candidate in table_tuple:
        # zip_longest pads the shorter side with None, so a differing
        # number of columns also fails the isinstance test.
        type_pairs = zip_longest(candidate.column_types, self.column_types)

        if not all(isinstance(own, type(expected)) for own, expected in type_pairs):
            raise ValueError('Not all tables have the same column types!')

        if candidate.column_names != self.column_names:
            raise ValueError('Not all tables have the same column names!')

    MappedSequence.__init__(self, table_tuple, key_tuple)
def __init__(self, rows, column_info, row_names=None, _is_fork=False):
    """
    Build a table from row data and a description of its columns.

    :param rows:
        Iterable of row data. Unless ``_is_fork`` is true, each row is
        padded with ``None`` up to the number of columns and every value is
        passed through its column type's ``cast``.
    :param column_info:
        Either :class:`Column` instances (names and types are read from
        them) or ``(name, data_type)`` pairs. A falsy name is replaced by
        ``letter_name(index)``. May be empty, giving a table with no
        columns.
    :param row_names:
        Optional. A column name, a callable applied to each row, or a
        sequence that is used as the row names directly.
    :param _is_fork:
        When true, ``rows`` is stored without padding or casting.
    :raises ValueError:
        For non-string or duplicate column names, column types that are not
        :class:`DataType` instances, rows longer than the column count, or
        an unsupported ``row_names`` value.
    """
    column_info = list(column_info)

    if not column_info:
        # Nothing to inspect or unzip: ``column_info[0]`` would raise
        # IndexError and ``zip(*[])`` cannot be unpacked into two names.
        self._column_names = []
        self._column_types = ()
    elif isinstance(column_info[0], Column):
        self._column_names = tuple(c.name for c in column_info)
        self._column_types = tuple(c.data_type for c in column_info)
    else:
        column_names, self._column_types = zip(*column_info)
        self._column_names = []

        # Validation
        for i, column_name in enumerate(column_names):
            if not column_name:
                self._column_names.append(letter_name(i))
            else:
                if not isinstance(column_name, six.string_types):
                    raise ValueError('Column names must be strings.')

                self._column_names.append(column_name)

    # Bound on every path because the row loop below relies on it.
    len_column_names = len(self._column_names)

    if len(set(self._column_names)) != len_column_names:
        raise ValueError('Duplicate column names are not allowed.')

    self._column_names = tuple(self._column_names)

    for column_type in self._column_types:
        if not isinstance(column_type, DataType):
            raise ValueError('Column types must be instances of DataType.')

    if not _is_fork:
        new_rows = []
        cast_funcs = [c.cast for c in self._column_types]

        for i, row in enumerate(rows):
            len_row = len(row)

            if len_row > len_column_names:
                raise ValueError(
                    'Row %i has %i values, but Table only has %i columns.'
                    % (i, len_row, len_column_names))
            elif len_row < len_column_names:
                # Pad short rows with nulls.
                row = chain(row, [None] * (len_column_names - len_row))

            # ``j`` is the column index; kept distinct from the row index.
            new_rows.append(Row(
                tuple(cast_funcs[j](d) for j, d in enumerate(row)),
                self._column_names))
    else:
        new_rows = rows

    if row_names:
        computed_row_names = []

        if isinstance(row_names, six.string_types):
            for row in new_rows:
                name = row[row_names]
                computed_row_names.append(name)
        elif hasattr(row_names, '__call__'):
            for row in new_rows:
                name = row_names(row)
                computed_row_names.append(name)
        elif isinstance(row_names, Sequence):
            computed_row_names = row_names
        else:
            raise ValueError('row_names must be a column name, function or sequence')

        self._row_names = tuple(computed_row_names)
    else:
        self._row_names = None

    self._rows = MappedSequence(new_rows, self._row_names)

    # Build columns
    new_columns = []

    for i, (name, data_type) in enumerate(zip(self._column_names, self._column_types)):
        column = Column(i, name, data_type, self._rows, row_names=self._row_names)
        new_columns.append(column)

    self._columns = MappedSequence(new_columns, self._column_names)
class Table(object):
    """
    A dataset consisting of rows and columns.

    Columns are "vertical" slices of data that share a single type. Rows are
    "horizontal" slices that may (and usually do) mix types.

    :class:`.Column` instances are available through :attr:`.Table.columns`
    and can be looked up by numeric index or by column name. :class:`.Row`
    instances are available through :attr:`.Table.rows` and can be looked up
    by numeric index or, when row names were given, by row name.

    :param rows:
        The data as a sequence of sequences (tuples, lists, etc.). A row
        with fewer values than there are columns is filled out with nulls.
        A row with more values than there are columns is an error.
    :param column_names:
        A sequence of names, one per column. When omitted, names are
        generated with :func:`.letter_name` and a warning is issued.
    :param column_types:
        A sequence of :class:`.DataType` instances, a :class:`.TypeTester`,
        a dictionary mapping column names to :class:`.DataType` instances
        (used to force some types), or `None` to use a default TypeTester.
    :param row_names:
        Optional names for the rows: 1) the name of a column holding an
        identifier for each row, 2) a function that takes a :class:`.Row`
        and returns an identifier, or 3) a sequence of identifiers.
        Uniqueness is not validated, so be certain the values really are
        unique. Identifiers of type :code:`int` are rejected.
    :param _is_fork:
        Used internally when data is propagated from an existing table.
        When :code:`True`, rows are stored as given (they are assumed to be
        :class:`.Row` instances) instead of being padded and cast.
    """
    def __init__(self, rows, column_names=None, column_types=None, row_names=None, _is_fork=False):
        if isinstance(rows, six.string_types):
            raise ValueError(
                'When created directly, the first argument to Table must be a sequence of rows. '
                'Did you want agate.Table.from_csv?')

        # --- Column names -------------------------------------------------
        if column_names:
            self._column_names = utils.deduplicate(column_names, column_names=True)
        elif rows:
            # The first row decides how many generated names are needed.
            self._column_names = tuple(utils.letter_name(i) for i in range(len(rows[0])))
            # Issued from this frame so that stacklevel=2 points at the caller.
            warnings.warn(
                'Column names not specified. "%s" will be used as names.' % str(self._column_names),
                RuntimeWarning,
                stacklevel=2)
        else:
            self._column_names = tuple()

        len_column_names = len(self._column_names)

        # --- Column types -------------------------------------------------
        if column_types is None:
            column_types = TypeTester()
        elif isinstance(column_types, dict):
            if not all(isinstance(forced, DataType) for forced in column_types.values()):
                raise ValueError('Column types must be instances of DataType.')

            column_types = TypeTester(force=column_types)
        elif not isinstance(column_types, TypeTester):
            if not all(isinstance(given, DataType) for given in column_types):
                raise ValueError('Column types must be instances of DataType.')

        if isinstance(column_types, TypeTester):
            self._column_types = column_types.run(rows, self._column_names)
        else:
            self._column_types = tuple(column_types)

        if len_column_names != len(self._column_types):
            raise ValueError('column_names and column_types must be the same length.')

        # --- Rows ---------------------------------------------------------
        if _is_fork:
            new_rows = rows
        else:
            new_rows = []
            cast_funcs = [data_type.cast for data_type in self._column_types]

            for row_index, raw_row in enumerate(rows):
                width = len(raw_row)

                if width > len_column_names:
                    raise ValueError(
                        'Row %i has %i values, but Table only has %i columns.'
                        % (row_index, width, len_column_names))

                if width < len_column_names:
                    # Fill short rows out with nulls.
                    raw_row = chain(raw_row, [None] * (len_column_names - width))

                cells = []

                for col_index, value in enumerate(raw_row):
                    try:
                        cells.append(cast_funcs[col_index](value))
                    except CastError as e:
                        # Re-raise with the location of the offending cell.
                        location = ' Error at row %s column %s.' % (
                            row_index, self._column_names[col_index])
                        raise CastError(str(e) + location)

                new_rows.append(Row(cells, self._column_names))

        # --- Row names ----------------------------------------------------
        if row_names:
            if isinstance(row_names, six.string_types):
                computed_row_names = [row[row_names] for row in new_rows]
            elif hasattr(row_names, '__call__'):
                computed_row_names = [row_names(row) for row in new_rows]
            elif utils.issequence(row_names):
                computed_row_names = row_names
            else:
                raise ValueError('row_names must be a column name, function or sequence')

            if any(type(row_name) is int for row_name in computed_row_names):
                raise ValueError('Row names cannot be of type int. Use Decimal for numbered row names.')

            self._row_names = tuple(computed_row_names)
        else:
            self._row_names = None

        self._rows = MappedSequence(new_rows, self._row_names)

        # --- Columns ------------------------------------------------------
        new_columns = [
            Column(
                position,
                self._column_names[position],
                self._column_types[position],
                self._rows,
                row_names=self._row_names)
            for position in range(len_column_names)
        ]

        self._columns = MappedSequence(new_columns, self._column_names)

    def __str__(self):
        """
        Return the text written by :meth:`.Table.print_structure`.
        """
        buffer = six.StringIO()
        self.print_structure(output=buffer)

        return buffer.getvalue()

    def __len__(self):
        """
        Shorthand for :code:`len(table.rows)`.
        """
        return len(self._rows)

    def __iter__(self):
        """
        Shorthand for :code:`iter(table.rows)`.
        """
        return iter(self._rows)

    def __getitem__(self, key):
        """
        Shorthand for :code:`table.rows[foo]`.
        """
        return self._rows[key]

    @property
    def column_types(self):
        """
        A tuple of :class:`.DataType` instances.
        """
        return self._column_types

    @property
    def column_names(self):
        """
        A tuple of strings.
        """
        return self._column_names

    @property
    def row_names(self):
        """
        A tuple of row names if this table has them, otherwise :code:`None`.
        """
        return self._row_names

    @property
    def columns(self):
        """
        A :class:`.MappedSequence` with column names for keys and
        :class:`.Column` instances for values.
        """
        return self._columns

    @property
    def rows(self):
        """
        A :class:`.MappedSequence` with row names for keys (if specified)
        and :class:`.Row` instances for values.
        """
        return self._rows

    def _fork(self, rows, column_names=None, column_types=None, row_names=None):
        """
        Create a new table using the metadata from this one.

        This method is used internally by functions like
        :meth:`.Table.order_by`.

        :param rows:
            Row data for the forked table.
        :param column_names:
            Column names for the forked table. If not specified, fork will
            use this table's column names.
        :param column_types:
            Column types for the forked table. If not specified, fork will
            use this table's column types.
        :param row_names:
            Row names for the forked table. If not specified, fork will use
            this table's row names.
        """
        names = self._column_names if column_names is None else column_names
        types = self._column_types if column_types is None else column_types
        labels = self._row_names if row_names is None else row_names

        return Table(rows, names, types, row_names=labels, _is_fork=True)

    def print_csv(self, **kwargs):
        """
        Print this table as a CSV.

        Equivalent to passing :code:`sys.stdout` to :meth:`.Table.to_csv`;
        :code:`kwargs` are passed on to :meth:`.Table.to_csv`.
        """
        self.to_csv(sys.stdout, **kwargs)

    def print_json(self, **kwargs):
        """
        Print this table as JSON.

        Equivalent to passing :code:`sys.stdout` to :meth:`.Table.to_json`;
        :code:`kwargs` are passed on to :meth:`.Table.to_json`.
        """
        self.to_json(sys.stdout, **kwargs)
class TestMappedSequence(unittest.TestCase):
    """
    Unit tests for :class:`MappedSequence` using a three-item sequence keyed
    by ``'one'``, ``'two'`` and ``'three'``.
    """
    def setUp(self):
        self.column_names = ('one', 'two', 'three')
        self.data = (u'a', u'b', u'c')
        self.row = MappedSequence(self.data, self.column_names)

    def test_is_immutable(self):
        with self.assertRaises(TypeError):
            self.row[0] = 'foo'

        with self.assertRaises(TypeError):
            self.row['one'] = 100

    def test_stringify(self):
        if six.PY2:
            self.assertEqual(str(self.row), "<agate.MappedSequence: (u'a', u'b', u'c')>")
        else:
            self.assertEqual(str(self.row), "<agate.MappedSequence: ('a', 'b', 'c')>")

    def test_stringify_long(self):
        column_names = ('one', 'two', 'three', 'four', 'five', 'six')
        data = (u'a', u'b', u'c', u'd', u'e', u'f')
        row = MappedSequence(data, column_names)

        if six.PY2:
            self.assertEqual(str(row), "<agate.MappedSequence: (u'a', u'b', u'c', u'd', u'e', ...)>")
        else:
            self.assertEqual(str(row), "<agate.MappedSequence: ('a', 'b', 'c', 'd', 'e', ...)>")

    def test_length(self):
        self.assertEqual(len(self.row), 3)

    def test_eq(self):
        row2 = MappedSequence(self.data, self.column_names)

        self.assertTrue(self.row == (u'a', u'b', u'c'))
        self.assertTrue(self.row == [u'a', u'b', u'c'])
        self.assertTrue(self.row == row2)
        self.assertFalse(self.row == (u'a', u'b', u'c', u'd'))
        self.assertFalse(self.row == 1)

    def test_ne(self):
        row2 = MappedSequence(self.data, self.column_names)

        self.assertFalse(self.row != (u'a', u'b', u'c'))
        self.assertFalse(self.row != [u'a', u'b', u'c'])
        self.assertFalse(self.row != row2)
        self.assertTrue(self.row != (u'a', u'b', u'c', u'd'))
        self.assertTrue(self.row != 1)

    def test_contains(self):
        self.assertTrue('a' in self.row)
        self.assertFalse('d' in self.row)

    def test_set_item(self):
        # Assignment fails for existing and unknown keys alike.
        with self.assertRaises(TypeError):
            self.row['one'] = u't'

        with self.assertRaises(TypeError):
            self.row['five'] = u'g'

    def test_get_item(self):
        self.assertEqual(self.row['one'], 'a')
        self.assertEqual(self.row['two'], 'b')
        self.assertEqual(self.row['three'], 'c')

    def test_get_by_key(self):
        self.assertEqual(self.row['one'], 'a')
        self.assertEqual(self.row[0], 'a')

    def test_get_by_slice(self):
        self.assertSequenceEqual(self.row[1:], ('b', 'c'))

    def test_get_invalid(self):
        with self.assertRaises(IndexError):
            self.row[3]

        with self.assertRaises(KeyError):
            self.row['foo']

    def test_keys(self):
        # Identity, not just equality: the same object must be returned.
        self.assertIs(self.row.keys(), self.column_names)

    def test_values(self):
        # Identity, not just equality: the same object must be returned.
        self.assertIs(self.row.values(), self.data)

    def test_items(self):
        self.assertSequenceEqual(self.row.items(), [
            ('one', 'a'),
            ('two', 'b'),
            ('three', 'c')
        ])

    def test_get(self):
        self.assertEqual(self.row.get('one'), 'a')

    def test_get_default(self):
        self.assertEqual(self.row.get('four'), None)
        self.assertEqual(self.row.get('four', 'foo'), 'foo')

    def test_dict(self):
        self.assertDictEqual(self.row.dict(), {
            'one': 'a',
            'two': 'b',
            'three': 'c'
        })

    def test_dict_no_keys(self):
        row = MappedSequence(self.data)

        with self.assertRaises(KeyError):
            row.dict()

    def test_iterate(self):
        it = iter(self.row)

        # assertEqual rather than assertSequenceEqual: each item is a scalar
        # string, and assertSequenceEqual would also accept a wrapper such
        # as ['a'] or ('a',).
        self.assertEqual(next(it), 'a')
        self.assertEqual(next(it), 'b')
        self.assertEqual(next(it), 'c')

        with self.assertRaises(StopIteration):
            next(it)
def test_dict_no_keys(self):
    # Converting a sequence built without keys to a dict must raise KeyError.
    keyless = MappedSequence(self.data)

    self.assertRaises(KeyError, keyless.dict)
def __init__(self, rows, column_names=None, column_types=None, row_names=None, _is_fork=False):
    """
    Build a table from row data plus optional column names, column types
    and row names.

    :param rows:
        A sequence of row sequences. Unless ``_is_fork`` is true, each row
        is padded with ``None`` up to the number of columns and every value
        is passed through its column type's ``cast``.
    :param column_names:
        A sequence of strings or ``None`` entries. ``None`` entries are
        replaced by ``utils.letter_name(index)`` with a warning; repeated
        names get a ``_2``, ``_3``, ... suffix. When omitted, names are
        generated from the width of the first row.
    :param column_types:
        A sequence of :class:`DataType` instances, a :class:`TypeTester`, a
        dict of :class:`DataType` values (passed to ``TypeTester(force=)``),
        or ``None`` for a default :class:`TypeTester`.
    :param row_names:
        Optional. A column name, a callable applied to each row, or a
        sequence that is used as the row names directly.
    :param _is_fork:
        When true, ``rows`` is stored without padding or casting.
    :raises ValueError:
        If ``rows`` is a string, a column name is neither a string nor
        ``None``, a column type is not a :class:`DataType`, names and types
        differ in length, a row is too long, or ``row_names`` is of an
        unsupported kind.
    """
    if isinstance(rows, six.string_types):
        raise ValueError(
            'When created directly, the first argument to Table must be a sequence of rows. '
            'Did you want agate.Table.from_csv?')

    # Validate column names
    if column_names:
        final_column_names = []

        for i, column_name in enumerate(column_names):
            if column_name is None:
                new_column_name = utils.letter_name(i)
                warnings.warn(
                    'Column name not specified. "%s" will be used as name.' % new_column_name,
                    RuntimeWarning)
            elif isinstance(column_name, six.string_types):
                new_column_name = column_name
            else:
                raise ValueError('Column names must be strings or None.')

            # Append _2, _3, ... until the name is unused.
            final_column_name = new_column_name
            duplicates = 0

            while final_column_name in final_column_names:
                final_column_name = new_column_name + '_' + str(duplicates + 2)
                duplicates += 1

            if duplicates > 0:
                warn_duplicate_column(new_column_name, final_column_name)

            final_column_names.append(final_column_name)

        self._column_names = tuple(final_column_names)
    elif rows:
        self._column_names = tuple(utils.letter_name(i) for i in range(len(rows[0])))
        warnings.warn(
            'Column names not specified. "%s" will be used as names.' % str(self._column_names),
            RuntimeWarning,
            stacklevel=2)
    else:
        # A tuple, like the other two branches, so the stored names always
        # have the same (immutable) type.
        self._column_names = tuple()

    len_column_names = len(self._column_names)

    # Validate column_types
    if column_types is None:
        column_types = TypeTester()
    elif isinstance(column_types, dict):
        for v in six.itervalues(column_types):
            if not isinstance(v, DataType):
                raise ValueError('Column types must be instances of DataType.')

        column_types = TypeTester(force=column_types)
    elif not isinstance(column_types, TypeTester):
        for column_type in column_types:
            if not isinstance(column_type, DataType):
                raise ValueError('Column types must be instances of DataType.')

    if isinstance(column_types, TypeTester):
        self._column_types = column_types.run(rows, self._column_names)
    else:
        self._column_types = tuple(column_types)

    if len_column_names != len(self._column_types):
        raise ValueError('column_names and column_types must be the same length.')

    if not _is_fork:
        new_rows = []
        cast_funcs = [c.cast for c in self._column_types]

        for i, row in enumerate(rows):
            len_row = len(row)

            if len_row > len_column_names:
                raise ValueError(
                    'Row %i has %i values, but Table only has %i columns.'
                    % (i, len_row, len_column_names))
            elif len_row < len_column_names:
                # Pad short rows with nulls.
                row = chain(row, [None] * (len_column_names - len_row))

            # ``j`` is the column index; kept distinct from the row index.
            new_rows.append(Row(
                tuple(cast_funcs[j](d) for j, d in enumerate(row)),
                self._column_names))
    else:
        new_rows = rows

    if row_names:
        computed_row_names = []

        if isinstance(row_names, six.string_types):
            for row in new_rows:
                name = row[row_names]
                computed_row_names.append(name)
        elif hasattr(row_names, '__call__'):
            for row in new_rows:
                name = row_names(row)
                computed_row_names.append(name)
        elif utils.issequence(row_names):
            computed_row_names = row_names
        else:
            raise ValueError('row_names must be a column name, function or sequence')

        self._row_names = tuple(computed_row_names)
    else:
        self._row_names = None

    self._rows = MappedSequence(new_rows, self._row_names)

    # Build columns
    new_columns = []

    for i, (name, data_type) in enumerate(zip(self._column_names, self._column_types)):
        column = Column(i, name, data_type, self._rows, row_names=self._row_names)
        new_columns.append(column)

    self._columns = MappedSequence(new_columns, self._column_names)
class Table(object):
    """
    A dataset consisting of rows and columns.

    Columns are "vertical" slices of data that share a single type. Rows are
    "horizontal" slices that may (and usually do) mix types.

    :class:`.Column` instances are available through :attr:`.Table.columns`
    and can be looked up by numeric index or by column name. :class:`.Row`
    instances are available through :attr:`.Table.rows` and can be looked up
    by numeric index or, when row names were given, by row name.

    :param rows:
        The data as a sequence of sequences (tuples, lists, etc.). A row
        with fewer values than there are columns is filled out with nulls.
        A row with more values than there are columns is an error.
    :param column_names:
        A sequence of names, one per column. Falsy entries are replaced by
        a name from :func:`.letter_name` and repeated names receive a
        numeric suffix. When omitted entirely, all names are generated with
        :func:`.letter_name` and a warning is issued.
    :param column_types:
        A sequence of :class:`.DataType` instances, a :class:`.TypeTester`,
        a dictionary mapping column names to :class:`.DataType` instances
        (used to force some types), or `None` to use a default TypeTester.
    :param row_names:
        Optional names for the rows: 1) the name of a column holding an
        identifier for each row, 2) a function that takes a :class:`.Row`
        and returns an identifier, or 3) a sequence of identifiers.
        Uniqueness is not validated, so be certain the values really are
        unique. Identifiers of type :code:`int` are rejected.
    :param _is_fork:
        Used internally when data is propagated from an existing table.
        When :code:`True`, rows are stored as given (they are assumed to be
        :class:`.Row` instances) instead of being padded and cast.
    """
    def __init__(self, rows, column_names=None, column_types=None, row_names=None, _is_fork=False):
        if isinstance(rows, six.string_types):
            raise ValueError(
                'When created directly, the first argument to Table must be a sequence of rows. '
                'Did you want agate.Table.from_csv?')

        # --- Column names -------------------------------------------------
        if column_names:
            accepted = []

            for position, given in enumerate(column_names):
                if not given:
                    base = utils.letter_name(position)
                    warn_unnamed_column(position, base)
                elif isinstance(given, six.string_types):
                    base = given
                else:
                    raise ValueError('Column names must be strings or None.')

                # Try base, base_2, base_3, ... until the name is unused.
                candidate = base
                suffix = 2

                while candidate in accepted:
                    candidate = base + '_' + str(suffix)
                    suffix += 1

                # suffix only moves past 2 when a rename happened.
                if suffix > 2:
                    warn_duplicate_column(base, candidate)

                accepted.append(candidate)

            self._column_names = tuple(accepted)
        elif rows:
            # The first row decides how many generated names are needed.
            self._column_names = tuple(utils.letter_name(i) for i in range(len(rows[0])))
            # Issued from this frame so that stacklevel=2 points at the caller.
            warnings.warn(
                'Column names not specified. "%s" will be used as names.' % str(self._column_names),
                RuntimeWarning,
                stacklevel=2)
        else:
            self._column_names = tuple()

        len_column_names = len(self._column_names)

        # --- Column types -------------------------------------------------
        if column_types is None:
            column_types = TypeTester()
        elif isinstance(column_types, dict):
            if not all(isinstance(forced, DataType) for forced in column_types.values()):
                raise ValueError('Column types must be instances of DataType.')

            column_types = TypeTester(force=column_types)
        elif not isinstance(column_types, TypeTester):
            if not all(isinstance(given, DataType) for given in column_types):
                raise ValueError('Column types must be instances of DataType.')

        if isinstance(column_types, TypeTester):
            self._column_types = column_types.run(rows, self._column_names)
        else:
            self._column_types = tuple(column_types)

        if len_column_names != len(self._column_types):
            raise ValueError('column_names and column_types must be the same length.')

        # --- Rows ---------------------------------------------------------
        if _is_fork:
            new_rows = rows
        else:
            new_rows = []
            cast_funcs = [data_type.cast for data_type in self._column_types]

            for row_index, raw_row in enumerate(rows):
                width = len(raw_row)

                if width > len_column_names:
                    raise ValueError(
                        'Row %i has %i values, but Table only has %i columns.'
                        % (row_index, width, len_column_names))

                if width < len_column_names:
                    # Fill short rows out with nulls.
                    raw_row = chain(raw_row, [None] * (len_column_names - width))

                cells = []

                for col_index, value in enumerate(raw_row):
                    try:
                        cells.append(cast_funcs[col_index](value))
                    except CastError as e:
                        # Re-raise with the location of the offending cell.
                        location = ' Error at row %s column %s.' % (
                            row_index, self._column_names[col_index])
                        raise CastError(str(e) + location)

                new_rows.append(Row(cells, self._column_names))

        # --- Row names ----------------------------------------------------
        if row_names:
            if isinstance(row_names, six.string_types):
                computed_row_names = [row[row_names] for row in new_rows]
            elif hasattr(row_names, '__call__'):
                computed_row_names = [row_names(row) for row in new_rows]
            elif utils.issequence(row_names):
                computed_row_names = row_names
            else:
                raise ValueError('row_names must be a column name, function or sequence')

            if any(type(row_name) is int for row_name in computed_row_names):
                raise ValueError('Row names cannot be of type int. Use Decimal for numbered row names.')

            self._row_names = tuple(computed_row_names)
        else:
            self._row_names = None

        self._rows = MappedSequence(new_rows, self._row_names)

        # --- Columns ------------------------------------------------------
        new_columns = [
            Column(
                position,
                self._column_names[position],
                self._column_types[position],
                self._rows,
                row_names=self._row_names)
            for position in range(len_column_names)
        ]

        self._columns = MappedSequence(new_columns, self._column_names)

    def __str__(self):
        """
        Return the text written by :meth:`.Table.print_structure`.
        """
        buffer = six.StringIO()
        self.print_structure(output=buffer)

        return buffer.getvalue()

    def __len__(self):
        """
        Shorthand for :code:`len(table.rows)`.
        """
        return len(self._rows)

    def __iter__(self):
        """
        Shorthand for :code:`iter(table.rows)`.
        """
        return iter(self._rows)

    def __getitem__(self, key):
        """
        Shorthand for :code:`table.rows[foo]`.
        """
        return self._rows[key]

    @property
    def column_types(self):
        """
        A tuple of :class:`.DataType` instances.
        """
        return self._column_types

    @property
    def column_names(self):
        """
        A tuple of strings.
        """
        return self._column_names

    @property
    def row_names(self):
        """
        A tuple of row names if this table has them, otherwise :code:`None`.
        """
        return self._row_names

    @property
    def columns(self):
        """
        A :class:`.MappedSequence` with column names for keys and
        :class:`.Column` instances for values.
        """
        return self._columns

    @property
    def rows(self):
        """
        A :class:`.MappedSequence` with row names for keys (if specified)
        and :class:`.Row` instances for values.
        """
        return self._rows

    def _fork(self, rows, column_names=None, column_types=None, row_names=None):
        """
        Create a new table using the metadata from this one.

        This method is used internally by functions like
        :meth:`.Table.order_by`.

        :param rows:
            Row data for the forked table.
        :param column_names:
            Column names for the forked table. If not specified, fork will
            use this table's column names.
        :param column_types:
            Column types for the forked table. If not specified, fork will
            use this table's column types.
        :param row_names:
            Row names for the forked table. If not specified, fork will use
            this table's row names.
        """
        names = self._column_names if column_names is None else column_names
        types = self._column_types if column_types is None else column_types
        labels = self._row_names if row_names is None else row_names

        return Table(rows, names, types, row_names=labels, _is_fork=True)

    def print_csv(self, **kwargs):
        """
        Print this table as a CSV.

        Equivalent to passing :code:`sys.stdout` to :meth:`.Table.to_csv`;
        :code:`kwargs` are passed on to :meth:`.Table.to_csv`.
        """
        self.to_csv(sys.stdout, **kwargs)

    def print_json(self, **kwargs):
        """
        Print this table as JSON.

        Equivalent to passing :code:`sys.stdout` to :meth:`.Table.to_json`;
        :code:`kwargs` are passed on to :meth:`.Table.to_json`.
        """
        self.to_json(sys.stdout, **kwargs)
def __init__(self, rows, column_names=None, column_types=None, row_names=None, _is_fork=False):
    """
    Build a table from row data plus optional column names, column types
    and row names.

    :param rows:
        A sequence of row sequences. Unless ``_is_fork`` is true, each row
        is padded with ``None`` up to the number of columns and every value
        is passed through its column type's ``cast``.
    :param column_names:
        Column names, passed through ``utils.deduplicate``. When omitted,
        names are generated with ``utils.letter_name`` from the width of
        the first row and a warning is issued.
    :param column_types:
        A sequence of :class:`DataType` instances, a :class:`TypeTester`, a
        dict of :class:`DataType` values (passed to ``TypeTester(force=)``),
        or ``None`` for a default :class:`TypeTester`.
    :param row_names:
        Optional. A column name, a callable applied to each row, or a
        sequence that is used as the row names directly.
    :param _is_fork:
        When true, ``rows`` is stored without padding or casting.
    :raises ValueError:
        If ``rows`` is a string, a column type is not a :class:`DataType`,
        names and types differ in length, a row is too long, ``row_names``
        is of an unsupported kind, or a row name is an ``int``.
    :raises CastError:
        If a value cannot be cast; the message names the row and column.
    """
    if isinstance(rows, six.string_types):
        raise ValueError(
            'When created directly, the first argument to Table must be a sequence of rows. Did you want agate.Table.from_csv?')

    # Column names: given, generated from the first row, or empty.
    if column_names:
        self._column_names = utils.deduplicate(column_names, column_names=True)
    elif rows:
        first_row_width = len(rows[0])
        self._column_names = tuple(utils.letter_name(i) for i in range(first_row_width))
        # Issued from this frame so that stacklevel=2 points at the caller.
        warnings.warn(
            'Column names not specified. "%s" will be used as names.' % str(self._column_names),
            RuntimeWarning,
            stacklevel=2)
    else:
        self._column_names = tuple()

    len_column_names = len(self._column_names)

    # Column types: normalise None and dict to a TypeTester, validate lists.
    if column_types is None:
        column_types = TypeTester()
    elif isinstance(column_types, dict):
        if not all(isinstance(forced, DataType) for forced in column_types.values()):
            raise ValueError('Column types must be instances of DataType.')

        column_types = TypeTester(force=column_types)
    elif not isinstance(column_types, TypeTester):
        if not all(isinstance(given, DataType) for given in column_types):
            raise ValueError('Column types must be instances of DataType.')

    if isinstance(column_types, TypeTester):
        self._column_types = column_types.run(rows, self._column_names)
    else:
        self._column_types = tuple(column_types)

    if len_column_names != len(self._column_types):
        raise ValueError('column_names and column_types must be the same length.')

    # Rows: forks reuse the rows as given, everything else is cast.
    if _is_fork:
        new_rows = rows
    else:
        new_rows = []
        cast_funcs = [data_type.cast for data_type in self._column_types]

        for row_index, raw_row in enumerate(rows):
            width = len(raw_row)

            if width > len_column_names:
                raise ValueError(
                    'Row %i has %i values, but Table only has %i columns.'
                    % (row_index, width, len_column_names))

            if width < len_column_names:
                # Fill short rows out with nulls.
                raw_row = chain(raw_row, [None] * (len_column_names - width))

            cells = []

            for col_index, value in enumerate(raw_row):
                try:
                    cells.append(cast_funcs[col_index](value))
                except CastError as e:
                    # Re-raise with the location of the offending cell.
                    location = ' Error at row %s column %s.' % (
                        row_index, self._column_names[col_index])
                    raise CastError(str(e) + location)

            new_rows.append(Row(cells, self._column_names))

    # Row names: column lookup, key function, or a ready-made sequence.
    if row_names:
        if isinstance(row_names, six.string_types):
            computed_row_names = [row[row_names] for row in new_rows]
        elif hasattr(row_names, '__call__'):
            computed_row_names = [row_names(row) for row in new_rows]
        elif utils.issequence(row_names):
            computed_row_names = row_names
        else:
            raise ValueError('row_names must be a column name, function or sequence')

        if any(type(row_name) is int for row_name in computed_row_names):
            raise ValueError('Row names cannot be of type int. Use Decimal for numbered row names.')

        self._row_names = tuple(computed_row_names)
    else:
        self._row_names = None

    self._rows = MappedSequence(new_rows, self._row_names)

    # Columns share the row sequence built above.
    new_columns = [
        Column(
            position,
            self._column_names[position],
            self._column_types[position],
            self._rows,
            row_names=self._row_names)
        for position in range(len_column_names)
    ]

    self._columns = MappedSequence(new_columns, self._column_names)
def __init__(self, rows, column_names=None, column_types=None, row_names=None, _is_fork=False):
    """
    Construct the table: settle column names and types, cast the rows,
    attach optional row names, and build the column views.

    :param rows:
        Sequence of row sequences; passing a string raises
        :class:`ValueError`.
    :param column_names:
        Names for the columns (run through ``utils.deduplicate``). If not
        given, letter names are derived from the first row's length and a
        :class:`RuntimeWarning` is emitted; with no rows either, the name
        tuple is empty.
    :param column_types:
        ``None``, a dict of :class:`DataType` instances, a
        :class:`TypeTester`, or a sequence of :class:`DataType` instances.
        ``None`` and dicts are resolved by running a :class:`TypeTester`.
    :param row_names:
        Column name, callable applied to each row, or sequence of names.
    :param _is_fork:
        If true, ``rows`` is stored without validation or casting.
    :raises ValueError:
        On invalid arguments or a row with more values than columns.
    :raises CastError:
        On a value that fails to cast, annotated with its row and column.
    """
    if isinstance(rows, six.string_types):
        raise ValueError('When created directly, the first argument to Table must be a sequence of rows. Did you want agate.Table.from_csv?')

    # Validate column names
    if column_names:
        self._column_names = utils.deduplicate(column_names, column_names=True)
    elif rows:
        generated = (utils.letter_name(n) for n in range(len(rows[0])))
        self._column_names = tuple(generated)
        message = 'Column names not specified. "%s" will be used as names.' % str(self._column_names)
        warnings.warn(message, RuntimeWarning, stacklevel=2)
    else:
        self._column_names = tuple()

    width = len(self._column_names)

    # Validate column_types
    if column_types is None:
        column_types = TypeTester()
    elif isinstance(column_types, dict):
        if any(not isinstance(entry, DataType) for entry in column_types.values()):
            raise ValueError('Column types must be instances of DataType.')

        column_types = TypeTester(force=column_types)
    elif not isinstance(column_types, TypeTester):
        if any(not isinstance(entry, DataType) for entry in column_types):
            raise ValueError('Column types must be instances of DataType.')

    # A tester infers the types from the data; anything else is explicit
    if isinstance(column_types, TypeTester):
        resolved_types = column_types.run(rows, self._column_names)
    else:
        resolved_types = tuple(column_types)

    self._column_types = resolved_types

    if width != len(self._column_types):
        raise ValueError('column_names and column_types must be the same length.')

    if _is_fork:
        new_rows = rows
    else:
        new_rows = []
        cast_funcs = [column_type.cast for column_type in self._column_types]

        for row_number, record in enumerate(rows):
            record_length = len(record)

            if record_length > width:
                raise ValueError('Row %i has %i values, but Table only has %i columns.' % (row_number, record_length, width))

            if record_length < width:
                # Fill the missing trailing cells with nulls
                missing = width - record_length
                record = chain(record, [None] * missing)

            converted = []

            for column_number, cell in enumerate(record):
                try:
                    converted.append(cast_funcs[column_number](cell))
                except CastError as e:
                    location = ' Error at row %s column %s.' % (row_number, self._column_names[column_number])
                    raise CastError(str(e) + location)

            new_rows.append(Row(converted, self._column_names))

    if not row_names:
        self._row_names = None
    else:
        if isinstance(row_names, six.string_types):
            labels = [record[row_names] for record in new_rows]
        elif hasattr(row_names, '__call__'):
            labels = [row_names(record) for record in new_rows]
        elif utils.issequence(row_names):
            labels = row_names
        else:
            raise ValueError('row_names must be a column name, function or sequence')

        for label in labels:
            if type(label) is int:
                raise ValueError('Row names cannot be of type int. Use Decimal for numbered row names.')

        self._row_names = tuple(labels)

    self._rows = MappedSequence(new_rows, self._row_names)

    # Build columns
    built_columns = []

    for index, (column_name, column_type) in enumerate(zip(self._column_names, self._column_types)):
        built_columns.append(
            Column(index, column_name, column_type, self._rows, row_names=self._row_names))

    self._columns = MappedSequence(built_columns, self._column_names)
def __init__(self, rows, column_names=None, column_types=None, row_names=None, _is_fork=False):
    """
    Build a table from a sequence of rows.

    :param rows:
        A sequence of row sequences.
    :param column_names:
        Optional sequence of column names. Each entry must be a string or
        ``None``; a ``None`` entry is replaced with a letter name derived
        from its position. Duplicates are rejected. When omitted, letter
        names are generated from the width of the first row, or the table
        has no columns if ``rows`` is empty.
    :param column_types:
        ``None`` (a default :class:`TypeTester` is run over the rows), a
        :class:`TypeTester`, or a sequence of :class:`DataType` instances.
    :param row_names:
        A column name, a callable receiving each row, or a sequence of
        names.
    :param _is_fork:
        When true, ``rows`` is used as given: no length checks, casting
        or wrapping in :class:`Row` is performed.
    :raises ValueError:
        For invalid or duplicate column names, invalid column types,
        mismatched name/type counts, over-long rows, or invalid
        ``row_names``.
    """
    # Validate column names
    if column_names:
        final_column_names = []

        for i, column_name in enumerate(column_names):
            if column_name is None:
                final_column_names.append(utils.letter_name(i))
            elif isinstance(column_name, six.string_types):
                final_column_names.append(column_name)
            else:
                raise ValueError('Column names must be strings or None.')

        if len(set(final_column_names)) != len(final_column_names):
            raise ValueError('Duplicate column names are not allowed.')

        self._column_names = tuple(final_column_names)
    elif rows:
        self._column_names = tuple(
            utils.letter_name(i) for i in range(len(rows[0])))
    else:
        # No names and no rows: there is no first row to take a width from,
        # so the table has no columns instead of failing on rows[0].
        self._column_names = tuple()

    len_column_names = len(self._column_names)

    # Validate column_types
    if column_types is None:
        column_types = TypeTester()
    elif not isinstance(column_types, TypeTester):
        for column_type in column_types:
            if not isinstance(column_type, DataType):
                raise ValueError('Column types must be instances of DataType.')

    if isinstance(column_types, TypeTester):
        self._column_types = column_types.run(rows, self._column_names)
    else:
        self._column_types = tuple(column_types)

    if len_column_names != len(self._column_types):
        raise ValueError('column_names and column_types must be the same length.')

    if not _is_fork:
        new_rows = []
        cast_funcs = [c.cast for c in self._column_types]

        for i, row in enumerate(rows):
            len_row = len(row)

            if len_row > len_column_names:
                raise ValueError(
                    'Row %i has %i values, but Table only has %i columns.'
                    % (i, len_row, len_column_names))
            elif len_row < len_column_names:
                # Short rows are padded out with nulls
                row = chain(row, [None] * (len_column_names - len_row))

            # "j" is the column position; kept distinct from the row index "i"
            new_rows.append(Row(
                tuple(cast_funcs[j](d) for j, d in enumerate(row)),
                self._column_names))
    else:
        new_rows = rows

    if row_names:
        computed_row_names = []

        if isinstance(row_names, six.string_types):
            for row in new_rows:
                name = row[row_names]
                computed_row_names.append(name)
        elif hasattr(row_names, '__call__'):
            for row in new_rows:
                name = row_names(row)
                computed_row_names.append(name)
        elif isinstance(row_names, Sequence):
            computed_row_names = row_names
        else:
            raise ValueError('row_names must be a column name, function or sequence')

        self._row_names = tuple(computed_row_names)
    else:
        self._row_names = None

    self._rows = MappedSequence(new_rows, self._row_names)

    # Build columns
    new_columns = []

    for i, (name, data_type) in enumerate(zip(self._column_names, self._column_types)):
        column = Column(i, name, data_type, self._rows, row_names=self._row_names)
        new_columns.append(column)

    self._columns = MappedSequence(new_columns, self._column_names)
class TestMappedSequence(unittest.TestCase):
    """
    Exercises :class:`MappedSequence` through a three-value sequence keyed
    by the names ``one``, ``two`` and ``three``.
    """
    def setUp(self):
        """
        Create the shared fixture. The same tuple objects are stored on the
        test case and handed to the sequence, which the identity checks in
        :meth:`test_keys` and :meth:`test_values` rely on.
        """
        names = ('one', 'two', 'three')
        values = (u'a', u'b', u'c')

        self.column_names = names
        self.data = values
        self.row = MappedSequence(values, names)

    def test_is_immutable(self):
        """Assigning by index or by key raises TypeError."""
        for key, replacement in ((0, 'foo'), ('one', 100)):
            with self.assertRaises(TypeError):
                self.row[key] = replacement

    def test_stringify(self):
        """str() shows the class name and the values."""
        expected = (
            "<agate.MappedSequence: (u'a', u'b', u'c')>"
            if six.PY2 else
            "<agate.MappedSequence: ('a', 'b', 'c')>"
        )

        self.assertEqual(str(self.row), expected)

    def test_stringify_long(self):
        """str() of a six-value sequence shows five values and an ellipsis."""
        long_row = MappedSequence(
            (u'a', u'b', u'c', u'd', u'e', u'f'),
            ('one', 'two', 'three', 'four', 'five', 'six'))

        expected = (
            "<agate.MappedSequence: (u'a', u'b', u'c', u'd', u'e', ...)>"
            if six.PY2 else
            "<agate.MappedSequence: ('a', 'b', 'c', 'd', 'e', ...)>"
        )

        self.assertEqual(str(long_row), expected)

    def test_length(self):
        """len() reports the number of values."""
        self.assertEqual(len(self.row), 3)

    def test_eq(self):
        """== is true for matching tuples, lists and sequences only."""
        twin = MappedSequence(self.data, self.column_names)

        for same in ((u'a', u'b', u'c'), [u'a', u'b', u'c'], twin):
            self.assertTrue(self.row == same)

        for different in ((u'a', u'b', u'c', u'd'), 1):
            self.assertFalse(self.row == different)

    def test_ne(self):
        """!= is the inverse of == for the same operands."""
        twin = MappedSequence(self.data, self.column_names)

        for same in ((u'a', u'b', u'c'), [u'a', u'b', u'c'], twin):
            self.assertFalse(self.row != same)

        for different in ((u'a', u'b', u'c', u'd'), 1):
            self.assertTrue(self.row != different)

    def test_contains(self):
        """Membership is tested against the values."""
        self.assertIn('a', self.row)
        self.assertNotIn('d', self.row)

    def test_get_item(self):
        """Each key looks up its value."""
        for key, expected in (('one', 'a'), ('two', 'b'), ('three', 'c')):
            self.assertEqual(self.row[key], expected)

    def test_get_by_key(self):
        """A value is reachable by its key and by its index."""
        for key in ('one', 0):
            self.assertEqual(self.row[key], 'a')

    def test_get_by_slice(self):
        """Slicing returns the matching values."""
        tail = self.row[1:]

        self.assertSequenceEqual(tail, ('b', 'c'))

    def test_get_invalid(self):
        """A bad index raises IndexError; a bad key raises KeyError."""
        with self.assertRaises(IndexError):
            self.row[3]

        with self.assertRaises(KeyError):
            self.row['foo']

    def test_keys(self):
        """keys() returns the very object the sequence was created with."""
        self.assertIs(self.row.keys(), self.column_names)

    def test_values(self):
        """values() returns the very object the sequence was created with."""
        self.assertIs(self.row.values(), self.data)

    def test_items(self):
        """items() pairs every key with its value, in order."""
        expected_pairs = [('one', 'a'), ('two', 'b'), ('three', 'c')]

        self.assertSequenceEqual(self.row.items(), expected_pairs)

    def test_get(self):
        """get() without a default raises KeyError for an unknown key."""
        self.assertEqual(self.row.get('one'), 'a')

        with self.assertRaises(KeyError):
            self.row.get('four')

    def test_get_default(self):
        """get() returns the supplied default for an unknown key."""
        fallback = self.row.get('four', 'foo')

        self.assertEqual(fallback, 'foo')

    def test_dict(self):
        """dict() maps each key to its value."""
        expected = {'one': 'a', 'two': 'b', 'three': 'c'}

        self.assertDictEqual(self.row.dict(), expected)

    def test_dict_no_keys(self):
        """dict() raises KeyError when the sequence was built without keys."""
        keyless = MappedSequence(self.data)

        with self.assertRaises(KeyError):
            keyless.dict()

    def test_iterate(self):
        """Iteration yields the values in order, then stops."""
        iterator = iter(self.row)

        for expected in ('a', 'b', 'c'):
            self.assertSequenceEqual(next(iterator), expected)

        with self.assertRaises(StopIteration):
            next(iterator)
def setUp(self):
    """
    Create the fixture: three names, three values, and a
    :class:`MappedSequence` built from those same two tuple objects.
    """
    names = ('one', 'two', 'three')
    values = (u'a', u'b', u'c')

    self.column_names = names
    self.data = values
    self.row = MappedSequence(values, names)