def aggregate(self, aggregations):
    """
    Apply one or more :class:`.Aggregation` instances to this table.

    :param aggregations:
        A single :class:`.Aggregation` instance or sequence of them.
    :returns:
        If the input was a single :class:`Aggregation` then a single result
        will be returned. If it was a sequence then a tuple of results will
        be returned.
    """
    if not utils.issequence(aggregations):
        aggregations.validate(self)

        return aggregations.run(self)

    # Validate every aggregation up front so nothing runs if any is invalid.
    for aggregation in aggregations:
        aggregation.validate(self)

    return tuple(aggregation.run(self) for aggregation in aggregations)
def aggregate(self, aggregations):
    """
    Aggregate data from the columns in this table by applying a sequence of
    :class:`.Aggregation` instances.

    :param aggregations:
        A single :class:`.Aggregation` instance or sequence of them.
    :returns:
        If the input was a single :class:`Aggregation` then a single result
        will be returned. If it was a sequence then a tuple of results will
        be returned.
    """
    if utils.issequence(aggregations):
        # All aggregations are validated before any of them execute.
        for one_aggregation in aggregations:
            one_aggregation.validate(self)

        computed = [one_aggregation.run(self) for one_aggregation in aggregations]

        return tuple(computed)

    aggregations.validate(self)

    return aggregations.run(self)
def aggregate(self, aggregations):
    """
    Apply one or more :class:`.Aggregation` instances to this table.

    :param aggregations:
        A single :class:`.Aggregation` instance or a sequence of tuples in
        the format :code:`(name, aggregation)`, where each
        :code:`aggregation` is an instance of :class:`.Aggregation`.
    :returns:
        If the input was a single :class:`Aggregation` then a single result
        will be returned. If it was a sequence then an
        :class:`.OrderedDict` of results will be returned.
    """
    if not utils.issequence(aggregations):
        aggregations.validate(self)

        return aggregations.run(self)

    # Validate everything first so a bad aggregation can't leave a
    # partially-computed result.
    for _, aggregation in aggregations:
        aggregation.validate(self)

    return OrderedDict(
        (name, aggregation.run(self)) for name, aggregation in aggregations
    )
def validate(self, table):
    """
    Verify that every source column for the slug is Text and contains no
    null values.

    :raises DataTypeError: If any source column is not Text.
    :raises ValueError: If any source column contains ``None``.
    """
    names = self._column_name if issequence(self._column_name) else [self._column_name]

    for name in names:
        target = table.columns[name]

        if not isinstance(target.data_type, Text):
            raise DataTypeError('Slug column must contain Text data.')

        if HasNulls(name).run(table):
            raise ValueError('Slug column cannot contain `None`.')
def exclude(self, key):
    """
    Create a new table without the specified columns.

    :param key:
        Either the name of a single column to exclude or a sequence of such
        names.
    :returns:
        A new :class:`.Table`.
    """
    excluded = key if utils.issequence(key) else [key]

    kept = [name for name in self._column_names if name not in excluded]

    return self.select(kept)
def exclude(self, key):
    """
    Create a new table with the same rows as this one, but only columns not
    in the ``key``.

    :param key:
        Either the name of a column to exclude or a sequence of such names.
    :returns:
        A new :class:`Table`.
    """
    if not utils.issequence(key):
        key = [key]

    remaining = []

    for column_name in self._column_names:
        if column_name not in key:
            remaining.append(column_name)

    return self.select(remaining)
def order_by(self, key, reverse=False):
    """
    Create a new table that is sorted.

    :param key:
        Either the name of a single column to sort by, a sequence of such
        names, or a :class:`function` that takes a row and returns a value
        to sort by.
    :param reverse:
        If `True` then sort in reverse (typically, descending) order.
    :returns:
        A new :class:`.Table`.
    """
    if not len(self._rows):
        return self._fork(self._rows)

    # Resolve how a row is turned into a comparison value.
    if hasattr(key, '__call__'):
        value_of = key
    elif utils.issequence(key):
        def value_of(row):
            return tuple(row[name] for name in key)
    else:
        def value_of(row):
            return row[key]

    def sort_key(pair):
        # ``pair`` is (original_index, row); sort only by the row's value.
        value = value_of(pair[1])

        # Nulls sort via a sentinel so they group consistently.
        return utils.NullOrder() if value is None else value

    ordered = sorted(enumerate(self._rows), key=sort_key, reverse=reverse)
    indices, rows = zip(*ordered)

    row_names = None

    if self._row_names is not None:
        row_names = [self._row_names[i] for i in indices]

    return self._fork(rows, row_names=row_names)
def order_by(self, key, reverse=False):
    """
    Sort this table by the :code:`key`. This can be either a column_name or
    callable that returns a value to sort by.

    :param key:
        Either the name of a column to sort by, a sequence of such names,
        or a :class:`function` that takes a row and returns a value to
        sort by.
    :param reverse:
        If `True` then sort in reverse (typically, descending) order.
    :returns:
        A new :class:`Table`.
    """
    if len(self._rows) == 0:
        return self._fork(self._rows)

    is_callable = callable(key)
    is_sequence = utils.issequence(key)

    def keyfunc(indexed):
        _, current = indexed

        if is_callable:
            result = key(current)
        elif is_sequence:
            result = tuple(current[name] for name in key)
        else:
            result = current[key]

        # Substitute a sentinel for nulls so comparison never fails.
        return utils.NullOrder() if result is None else result

    pairs = sorted(enumerate(self._rows), key=keyfunc, reverse=reverse)
    positions, sorted_rows = zip(*pairs)

    if self._row_names is None:
        new_names = None
    else:
        new_names = [self._row_names[p] for p in positions]

    return self._fork(sorted_rows, row_names=new_names)
def run(self, table):
    """
    :returns:
        :class:`string`
    """
    if issequence(self._column_name):
        # NOTE: mirrors the original concatenation exactly — every joined
        # value (including the first) is prefixed with a single space.
        values = [
            ''.join(' ' + row[name] for name in self._column_name)
            for row in table.rows
        ]
    else:
        values = [row[self._column_name] for row in table.rows]

    return slugify(values, ensure_unique=self._ensure_unique, **self._slug_args)
def select(self, key):
    """
    Create a new table with the same rows as this one, but only those
    columns in the ``key``.

    :param key:
        Either the name of a column to include or a sequence of such names.
    :returns:
        A new :class:`Table`.
    """
    selected = key if utils.issequence(key) else [key]

    types = [self.columns[name].data_type for name in selected]

    trimmed = [
        Row(tuple(row[name] for name in selected), selected)
        for row in self._rows
    ]

    return self._fork(trimmed, selected, types)
def select(self, key):
    """
    Create a new table with only the specified columns.

    :param key:
        Either the name of a single column to include or a sequence of such
        names.
    :returns:
        A new :class:`.Table`.
    """
    if not utils.issequence(key):
        key = [key]

    kept_types = []

    for name in key:
        kept_types.append(self.columns[name].data_type)

    projected = []

    for original_row in self._rows:
        values = tuple(original_row[name] for name in key)
        projected.append(Row(values, key))

    return self._fork(projected, key, kept_types)
def distinct(self, key=None):
    """
    Create a new table with only unique rows.

    :param key:
        Either the name of a single column to use to identify unique rows, a
        sequence of such column names, a :class:`function` that takes a row
        and returns a value to identify unique rows, or `None`, in which
        case the entire row will be checked for uniqueness.
    :returns:
        A new :class:`.Table`.
    """
    key_is_row_function = hasattr(key, '__call__')
    key_is_sequence = utils.issequence(key)

    uniques = []
    rows = []

    if self._row_names is not None:
        row_names = []
    else:
        row_names = None

    for i, row in enumerate(self._rows):
        if key_is_row_function:
            k = key(row)
        elif key_is_sequence:
            # BUG FIX: this was previously a generator expression. A
            # generator object never compares equal to another, so
            # ``k not in uniques`` was always True and sequence keys
            # deduplicated nothing. Materialize a tuple instead.
            k = tuple(row[j] for j in key)
        elif key is None:
            k = tuple(row)
        else:
            k = row[key]

        # Linear membership test (not a set) so unhashable key values
        # continue to work as before.
        if k not in uniques:
            uniques.append(k)
            rows.append(row)

            if self._row_names is not None:
                row_names.append(self._row_names[i])

    return self._fork(rows, row_names=row_names)
def select(self, key):
    """
    Create a new table with only the specified columns.

    :param key:
        Either the name of a single column to include or a sequence of such
        names.
    :returns:
        A new :class:`.Table`.
    """
    chosen = key if utils.issequence(key) else [key]

    positions = tuple(self._column_names.index(name) for name in chosen)
    chosen_types = tuple(self._column_types[p] for p in positions)

    # Row values are passed lazily, exactly as the original did.
    projected = [
        Row((row[p] for p in positions), chosen)
        for row in self._rows
    ]

    return self._fork(projected, chosen, chosen_types)
def __init__(self, rows, column_names=None, column_types=None, row_names=None, _is_fork=False):
    """
    Construct a table from a sequence of rows.

    :param rows:
        A sequence of row sequences. Must not be a string (a common mistake
        when the caller meant :meth:`.Table.from_csv`).
    :param column_names:
        Optional sequence of column name strings (``None`` entries are
        auto-named). If omitted, letter names are generated from the first
        row's width.
    :param column_types:
        ``None`` (infer with :class:`.TypeTester`), a dict of forced types,
        a :class:`.TypeTester`, or a sequence of :class:`.DataType`.
    :param row_names:
        A column name, a function of a row, or a sequence of names.
    :param _is_fork:
        Internal flag: when ``True``, ``rows`` are already cast ``Row``
        instances and casting is skipped.
    """
    if isinstance(rows, six.string_types):
        raise ValueError('When created directly, the first argument to Table must be a sequence of rows. Did you want agate.Table.from_csv?')

    # Validate column names
    if column_names:
        final_column_names = []

        for i, column_name in enumerate(column_names):
            if column_name is None:
                # Missing names get spreadsheet-style letter names.
                new_column_name = utils.letter_name(i)
                warnings.warn('Column name not specified. "%s" will be used as name.' % new_column_name, RuntimeWarning)
            elif isinstance(column_name, six.string_types):
                new_column_name = column_name
            else:
                raise ValueError('Column names must be strings or None.')

            final_column_name = new_column_name
            duplicates = 0

            # Disambiguate duplicate names by appending "_2", "_3", ...
            while final_column_name in final_column_names:
                final_column_name = new_column_name + '_' + str(duplicates + 2)
                duplicates += 1

            if duplicates > 0:
                warn_duplicate_column(new_column_name, final_column_name)

            final_column_names.append(final_column_name)

        self._column_names = tuple(final_column_names)
    elif rows:
        # No names supplied: derive letter names from the first row's width.
        self._column_names = tuple(utils.letter_name(i) for i in range(len(rows[0])))
        warnings.warn('Column names not specified. "%s" will be used as names.' % str(self._column_names), RuntimeWarning, stacklevel=2)
    else:
        self._column_names = []

    len_column_names = len(self._column_names)

    # Validate column_types
    if column_types is None:
        column_types = TypeTester()
    elif isinstance(column_types, dict):
        for v in column_types.values():
            if not isinstance(v, DataType):
                raise ValueError('Column types must be instances of DataType.')

        # A dict of types becomes a TypeTester with those types forced.
        column_types = TypeTester(force=column_types)
    elif not isinstance(column_types, TypeTester):
        for column_type in column_types:
            if not isinstance(column_type, DataType):
                raise ValueError('Column types must be instances of DataType.')

    if isinstance(column_types, TypeTester):
        self._column_types = column_types.run(rows, self._column_names)
    else:
        self._column_types = tuple(column_types)

    if len_column_names != len(self._column_types):
        raise ValueError('column_names and column_types must be the same length.')

    if not _is_fork:
        new_rows = []
        # Hoist the cast callables out of the row loop.
        cast_funcs = [c.cast for c in self._column_types]

        for i, row in enumerate(rows):
            len_row = len(row)

            if len_row > len_column_names:
                raise ValueError('Row %i has %i values, but Table only has %i columns.' % (i, len_row, len_column_names))
            elif len(row) < len_column_names:
                # Short rows are padded with None to full width.
                row = chain(row, [None] * (len_column_names - len_row))

            row_values = []

            for j, d in enumerate(row):
                try:
                    row_values.append(cast_funcs[j](d))
                except CastError as e:
                    # Re-raise with row/column location appended.
                    raise CastError(str(e) + ' Error at row %s column %s.' % (i, self._column_names[j]))

            new_rows.append(Row(row_values, self._column_names))
    else:
        # Forked tables already contain cast Row instances.
        new_rows = rows

    if row_names:
        computed_row_names = []

        if isinstance(row_names, six.string_types):
            # A column name: use that column's values as row names.
            for row in new_rows:
                name = row[row_names]
                computed_row_names.append(name)
        elif hasattr(row_names, '__call__'):
            # A function of the row.
            for row in new_rows:
                name = row_names(row)
                computed_row_names.append(name)
        elif utils.issequence(row_names):
            computed_row_names = row_names
        else:
            raise ValueError('row_names must be a column name, function or sequence')

        for row_name in computed_row_names:
            if type(row_name) is int:
                raise ValueError('Row names cannot be of type int. Use Decimal for numbered row names.')

        self._row_names = tuple(computed_row_names)
    else:
        self._row_names = None

    self._rows = MappedSequence(new_rows, self._row_names)

    # Build columns
    new_columns = []

    for i in range(len_column_names):
        name = self._column_names[i]
        data_type = self._column_types[i]

        column = Column(i, name, data_type, self._rows, row_names=self._row_names)
        new_columns.append(column)

    self._columns = MappedSequence(new_columns, self._column_names)
def denormalize(self, key=None, property_column='property', value_column='value', default_value=utils.default, column_types=None):
    """
    Create a new table with row values converted into columns. For example:

    +---------+-----------+---------+
    |  name   | property  | value   |
    +=========+===========+=========+
    |  Jane   | gender    | female  |
    +---------+-----------+---------+
    |  Jane   | race      | black   |
    +---------+-----------+---------+
    |  Jane   | age       | 24      |
    +---------+-----------+---------+
    |  ...    |  ...      |  ...    |
    +---------+-----------+---------+

    Can be denormalized so that each unique value in `property_column`
    becomes a column with `value_column` used for its values.

    +---------+----------+--------+-------+
    |  name   | gender   | race   | age   |
    +=========+==========+========+=======+
    |  Jane   | female   | black  | 24    |
    +---------+----------+--------+-------+
    |  Jack   | male     | white  | 35    |
    +---------+----------+--------+-------+
    |  Joe    | male     | black  | 28    |
    +---------+----------+--------+-------+

    If one or more keys are specified then the resulting table will
    automatically have :code:`row_names` set to those keys.

    This is the opposite of :meth:`.Table.normalize`.

    :param key:
        A column name or a sequence of column names that should be
        maintained as they are in the normalized table. Typically these are
        the tables unique identifiers and any metadata about them. Or,
        :code:`None` if there are no key columns.
    :param property_column:
        The column whose values should become column names in the new table.
    :param value_column:
        The column whose values should become the values of the property
        columns in the new table.
    :param default_value:
        Value to be used for missing values in the pivot table. If not
        specified :code:`Decimal(0)` will be used for aggregations that
        return :class:`.Number` data and :code:`None` will be used for
        all others.
    :param column_types:
        A sequence of column types with length equal to number of unique
        values in property_column or an instance of :class:`.TypeTester`.
        Defaults to a generic :class:`.TypeTester`.
    :returns:
        A new :class:`.Table`.
    """
    from agate.table import Table

    if key is None:
        key = []
    elif not utils.issequence(key):
        key = [key]

    field_names = []
    row_data = OrderedDict()

    # Group each row's (property, value) pair under its key tuple,
    # remembering property names in first-seen order.
    for row in self.rows:
        row_key = tuple(row[k] for k in key)

        if row_key not in row_data:
            row_data[row_key] = OrderedDict()

        f = six.text_type(row[property_column])
        v = row[value_column]

        if f not in field_names:
            field_names.append(f)

        row_data[row_key][f] = v

    if default_value == utils.default:
        # Sentinel default: numeric value columns get Decimal(0),
        # everything else gets None.
        if isinstance(self.columns[value_column].data_type, Number):
            default_value = Decimal(0)
        else:
            default_value = None

    new_column_names = key + field_names

    new_rows = []
    row_names = []

    for k, v in row_data.items():
        row = list(k)

        # Single-column keys become scalar row names; multi-column keys
        # stay as tuples.
        if len(k) == 1:
            row_names.append(k[0])
        else:
            row_names.append(k)

        for f in field_names:
            if f in v:
                row.append(v[f])
            else:
                row.append(default_value)

        new_rows.append(Row(row, new_column_names))

    key_column_types = [self.column_types[self.column_names.index(name)] for name in key]

    if column_types is None or isinstance(column_types, TypeTester):
        tester = TypeTester() if column_types is None else column_types
        # Force the key columns to keep their original types while the
        # tester infers the new property columns.
        force_update = dict(zip(key, key_column_types))
        force_update.update(tester._force)
        tester._force = force_update

        new_column_types = tester.run(new_rows, new_column_names)
    else:
        new_column_types = key_column_types + list(column_types)

    return Table(new_rows, new_column_names, new_column_types, row_names=row_names)
def join(self, right_table, left_key=None, right_key=None, inner=False, full_outer=False, require_match=False, columns=None):
    """
    Create a new table by joining two tables on common values.

    This method implements most varieties of SQL join, in addition to some
    unique features.

    If :code:`left_key` and :code:`right_key` are both :code:`None` then
    this method will perform a "sequential join", which is to say it will
    join on row number. The :code:`inner` and :code:`full_outer` arguments
    will determine whether dangling left-hand and right-hand rows are
    included, respectively.

    If :code:`left_key` is specified, then a "left outer join" will be
    performed. This will combine columns from the :code:`right_table`
    anywhere that :code:`left_key` and :code:`right_key` are equal.
    Unmatched rows from the left table will be included with the right-hand
    columns set to :code:`None`.

    If :code:`inner` is :code:`True` then an "inner join" will be
    performed. Unmatched rows from either table will be left out.

    If :code:`full_outer` is :code:`True` then a "full outer join" will be
    performed. Unmatched rows from both tables will be included, with the
    columns in the other table set to :code:`None`.

    In all cases, if :code:`right_key` is :code:`None` then
    :code:`left_key` will be used for both tables.

    If :code:`left_key` and :code:`right_key` are column names, the
    right-hand identifier column will not be included in the output table.

    If :code:`require_match` is :code:`True` unmatched rows will raise an
    exception. This is like an "inner join" except any row that doesn't
    have a match will raise an exception instead of being dropped. This is
    useful for enforcing expectations about datasets that should match.

    Column names from the right table which also exist in this table will
    be suffixed "2" in the new table.

    A subset of columns from the right-hand table can be included in the
    joined table using the :code:`columns` argument.

    :param right_table:
        The "right" table to join to.
    :param left_key:
        Either the name of a column from the this table to join on, the
        index of a column, a sequence of such column identifiers, a
        :class:`function` that takes a row and returns a value to join on,
        or :code:`None` in which case the tables will be joined on row
        number.
    :param right_key:
        Either the name of a column from :code:`right_table` to join on,
        the index of a column, a sequence of such column identifiers, or a
        :class:`function` that takes a row and returns a value to join on.
        If :code:`None` then :code:`left_key` will be used for both. If
        :code:`left_key` is :code:`None` then this value is ignored.
    :param inner:
        Perform a SQL-style "inner join" instead of a left outer join. Rows
        which have no match for :code:`left_key` will not be included in
        the output table.
    :param full_outer:
        Perform a SQL-style "full outer" join rather than a left or a
        right. May not be used in combination with :code:`inner`.
    :param require_match:
        If true, an exception will be raised if there is a left_key with no
        matching right_key.
    :param columns:
        A sequence of column names from :code:`right_table` to include in
        the final output table. Defaults to all columns not in
        :code:`right_key`. Ignored when :code:`full_outer` is :code:`True`.
    :returns:
        A new :class:`.Table`.
    """
    if inner and full_outer:
        raise ValueError('A join can not be both "inner" and "full_outer".')

    if right_key is None:
        right_key = left_key

    # Get join columns
    right_key_indices = []

    left_key_is_func = hasattr(left_key, '__call__')
    left_key_is_sequence = utils.issequence(left_key)

    # Left key is None
    if left_key is None:
        # Sequential join: join values are row numbers.
        left_data = tuple(range(len(self._rows)))
    # Left key is a function
    elif left_key_is_func:
        left_data = [left_key(row) for row in self._rows]
    # Left key is a sequence
    elif left_key_is_sequence:
        left_columns = [self._columns[key] for key in left_key]
        left_data = zip(*[column.values() for column in left_columns])
    # Left key is a column name/index
    else:
        left_data = self._columns[left_key].values()

    right_key_is_func = hasattr(right_key, '__call__')
    right_key_is_sequence = utils.issequence(right_key)

    # Sequential join
    if left_key is None:
        right_data = tuple(range(len(right_table._rows)))
    # Right key is a function
    elif right_key_is_func:
        right_data = [right_key(row) for row in right_table._rows]
    # Right key is a sequence
    elif right_key_is_sequence:
        right_columns = [right_table._columns[key] for key in right_key]
        right_data = zip(*[column.values() for column in right_columns])
        right_key_indices = [right_table._columns._keys.index(key) for key in right_key]
    # Right key is a column name/index
    else:
        right_column = right_table._columns[right_key]
        right_data = right_column.values()
        right_key_indices = [right_table._columns.index(right_column)]

    # Build names and type lists
    column_names = list(self._column_names)
    column_types = list(self._column_types)

    for i, column in enumerate(right_table._columns):
        name = column.name

        if not full_outer:
            # Drop right-hand key columns and (when a subset is requested)
            # any columns not in that subset.
            if columns is None and i in right_key_indices:
                continue

            if columns is not None and name not in columns:
                continue

        # Right-hand names that clash with left-hand names get a "2" suffix.
        if name in self.column_names:
            column_names.append('%s2' % name)
        else:
            column_names.append(name)

        column_types.append(column.data_type)

    if columns is not None and not full_outer:
        right_table = right_table.select([n for n in right_table._column_names if n in columns])

    # Hash right-hand rows by join value for O(1) lookup per left row.
    right_hash = {}

    for i, value in enumerate(right_data):
        if value not in right_hash:
            right_hash[value] = []

        right_hash[value].append(right_table._rows[i])

    # Collect new rows
    rows = []

    if self._row_names is not None and not full_outer:
        row_names = []
    else:
        row_names = None

    # Iterate over left column
    for left_index, left_value in enumerate(left_data):
        matching_rows = right_hash.get(left_value, None)

        if require_match and matching_rows is None:
            raise ValueError('Left key "%s" does not have a matching right key.' % left_value)

        # Rows with matches
        if matching_rows:
            for right_row in matching_rows:
                new_row = list(self._rows[left_index])

                for k, v in enumerate(right_row):
                    if columns is None and k in right_key_indices and not full_outer:
                        continue

                    new_row.append(v)

                rows.append(Row(new_row, column_names))

                if self._row_names is not None and not full_outer:
                    row_names.append(self._row_names[left_index])
        # Rows without matches
        elif not inner:
            new_row = list(self._rows[left_index])

            # Pad with None, one per retained right-hand column.
            for k, v in enumerate(right_table._column_names):
                if columns is None and k in right_key_indices and not full_outer:
                    continue

                new_row.append(None)

            rows.append(Row(new_row, column_names))

            if self._row_names is not None and not full_outer:
                row_names.append(self._row_names[left_index])

    # Full outer join
    if full_outer:
        left_set = set(left_data)

        # Append right-hand rows whose key never appeared on the left.
        for right_index, right_value in enumerate(right_data):
            if right_value in left_set:
                continue

            new_row = ([None] * len(self._columns)) + list(right_table.rows[right_index])

            rows.append(Row(new_row, column_names))

    return self._fork(rows, column_names, column_types, row_names=row_names)
def join(self, right_table, left_key, right_key=None, inner=False, require_match=False, columns=None):
    """
    Create a new table by joining two tables on common values.

    This method performs the equivalent of SQL's "left outer join",
    combining columns from this table and from :code:`right_table`
    anywhere that the :code:`left_key` and :code:`right_key` are
    equivalent.

    Where there is no match for :code:`left_key` the left columns will be
    included with the right columns set to :code:`None` unless the
    :code:`inner` argument is specified.

    If :code:`left_key` and :code:`right_key` are column names, only the
    left columns will be included in the output table.

    Column names from the right table which also exist in this table will
    be suffixed "2" in the new table.

    :param right_table:
        The "right" table to join to.
    :param left_key:
        Either the name of a column from the this table to join on, a
        sequence of such column names, or a :class:`function` that takes a
        row and returns a value to join on.
    :param right_key:
        Either the name of a column from :code:`right_table` to join on, a
        sequence of such column names, or a :class:`function` that takes a
        row and returns a value to join on. If :code:`None` then
        :code:`left_key` will be used for both.
    :param inner:
        Perform a SQL-style "inner join" instead of a left outer join. Rows
        which have no match for :code:`left_key` will not be included in
        the output table.
    :param require_match:
        If true, an exception will be raised if there is a left_key with no
        matching right_key.
    :param columns:
        A sequence of column names from :code:`right_table` to include in
        the final output table. Defaults to all columns not in
        :code:`right_key`.
    :returns:
        A new :class:`.Table`.
    """
    if right_key is None:
        right_key = left_key

    # Get join columns
    right_key_indices = []

    left_key_is_func = hasattr(left_key, '__call__')
    left_key_is_sequence = utils.issequence(left_key)

    # Left key is a function
    if left_key_is_func:
        left_data = [left_key(row) for row in self.rows]
    # Left key is a sequence
    elif left_key_is_sequence:
        left_columns = [self.columns[key] for key in left_key]
        left_data = zip(*[column.values() for column in left_columns])
    # Left key is a column name/index
    else:
        left_data = self.columns[left_key].values()

    right_key_is_func = hasattr(right_key, '__call__')
    right_key_is_sequence = utils.issequence(right_key)

    # Right key is a function
    if right_key_is_func:
        right_data = [right_key(row) for row in right_table.rows]
    # Right key is a sequence
    elif right_key_is_sequence:
        right_columns = [right_table.columns[key] for key in right_key]
        right_data = zip(*[column.values() for column in right_columns])
        right_key_indices = [right_table.columns._keys.index(key) for key in right_key]
    # Right key is a column name/index
    else:
        right_column = right_table.columns[right_key]
        right_data = right_column.values()
        right_key_indices = [right_table.columns._keys.index(right_key)]

    # Build names and type lists
    column_names = list(self.column_names)
    column_types = list(self.column_types)

    for i, column in enumerate(right_table.columns):
        name = column.name

        # Skip right-hand key columns, and columns outside the requested
        # subset when one was given.
        if columns is None and i in right_key_indices:
            continue

        if columns is not None and name not in columns:
            continue

        # Clashing names from the right table get a "2" suffix.
        if name in self.column_names:
            column_names.append('%s2' % name)
        else:
            column_names.append(name)

        column_types.append(column.data_type)

    if columns is not None:
        right_table = right_table.select([n for n in right_table.column_names if n in columns])

    # Hash right-hand rows by join value for O(1) lookup per left row.
    right_hash = {}

    for i, value in enumerate(right_data):
        if value not in right_hash:
            right_hash[value] = []

        right_hash[value].append(right_table.rows[i])

    # Collect new rows
    rows = []

    if self.row_names is not None:
        row_names = []
    else:
        row_names = None

    # Iterate over left column
    for left_index, left_value in enumerate(left_data):
        matching_rows = right_hash.get(left_value, None)

        if require_match and matching_rows is None:
            raise ValueError('Left key "%s" does not have a matching right key.' % left_value)

        # Rows with matches
        if matching_rows:
            for right_row in matching_rows:
                new_row = list(self.rows[left_index])

                for k, v in enumerate(right_row):
                    if columns is None and k in right_key_indices:
                        continue

                    new_row.append(v)

                rows.append(Row(new_row, column_names))

                if self.row_names is not None:
                    row_names.append(self.row_names[left_index])
        # Rows without matches
        elif not inner:
            new_row = list(self.rows[left_index])

            # Pad with None, one per retained right-hand column.
            for k, v in enumerate(right_table.column_names):
                if columns is None and k in right_key_indices:
                    continue

                new_row.append(None)

            rows.append(Row(new_row, column_names))

            if self.row_names is not None:
                row_names.append(self.row_names[left_index])

    return self._fork(rows, column_names, column_types, row_names=row_names)
def homogenize(self, key, compare_values, default_row=None):
    """
    Fills missing rows in a dataset with default values.

    Determines what rows are missing by comparing the values in the given
    column_names with the expected compare_values.

    Values not found in the table will be used to generate new rows with
    the given default_row.

    Default_row should be an array of values or an array-generating
    function. If not specified, the new rows will have `None` in columns
    not given in column_names.

    If it is an array of values, the length should be row length minus
    column_names count and the gap will be filled with the missing values.

    If it is an array-generating function, the function should take an
    array of missing values for each new row and output a full row
    including those values.

    :param key:
        Either a column name or a sequence of such names.
    :param compare_values:
        Either an array of column values if key is a single column name or
        a sequence of arrays of values if key is a sequence of names. It
        can also be a generator that yields one of the two. A row is
        created for each value or list of values not found in the rows of
        the table.
    :param default_row:
        An array of values or a function to generate new rows. The length
        of the input array should be equal to row length minus column_names
        count. The length of array generated by the function should be the
        row length.
    :returns:
        A new :class:`Table`.
    """
    rows = list(self.rows)

    if not utils.issequence(key):
        key = [key]

    if len(key) == 1:
        # Normalize scalar compare values to one-element sequences so they
        # can be zipped against the single key column.
        if any(not utils.issequence(compare_value) for compare_value in compare_values):
            compare_values = [[compare_value] for compare_value in compare_values]

    column_values = [self.columns.get(name) for name in key]
    column_indexes = [self.column_names.index(name) for name in key]

    column_values = zip(*column_values)
    differences = list(set(map(tuple, compare_values)) - set(column_values))

    for difference in differences:
        if callable(default_row):
            rows.append(Row(default_row(difference), self.column_names))
        else:
            if default_row is not None:
                # BUG FIX: previously ``new_row = default_row`` aliased the
                # caller's list, so ``insert`` below mutated the argument
                # and accumulated key values across iterations, corrupting
                # every generated row after the first. Copy it instead.
                new_row = list(default_row)
            else:
                new_row = [None] * (len(self.column_names) - len(key))

            # Splice the missing key values back into their column slots.
            for i, d in zip(column_indexes, difference):
                new_row.insert(i, d)

            rows.append(Row(new_row, self.column_names))

    return self._fork(rows, self.column_names, self.column_types)
def __init__(self, rows, column_names=None, column_types=None, row_names=None, _is_fork=False):
    """
    Construct a table from a sequence of rows.

    :param rows:
        A sequence of row sequences. Must not be a string (a common mistake
        when the caller meant :meth:`.Table.from_csv`).
    :param column_names:
        Optional sequence of column name strings (``None`` entries are
        auto-named). If omitted, letter names are generated from the first
        row's width.
    :param column_types:
        ``None`` (infer with :class:`.TypeTester`), a :class:`.TypeTester`,
        or a sequence of :class:`.DataType`.
    :param row_names:
        A column name, a function of a row, or a sequence of names.
    :param _is_fork:
        Internal flag: when ``True``, ``rows`` are already cast ``Row``
        instances and casting is skipped.
    """
    if isinstance(rows, six.string_types):
        raise ValueError(
            "When created directly, the first argument to Table must be a sequence of rows. Did you want agate.Table.from_csv?"
        )

    # Validate column names
    if column_names:
        final_column_names = []

        for i, column_name in enumerate(column_names):
            if column_name is None:
                # Missing names get spreadsheet-style letter names.
                new_column_name = utils.letter_name(i)
                warnings.warn(
                    'Column name not specified. "%s" will be used as name.' % new_column_name, RuntimeWarning
                )
            elif isinstance(column_name, six.string_types):
                new_column_name = column_name
            else:
                raise ValueError("Column names must be strings or None.")

            final_column_name = new_column_name
            duplicates = 0

            # Disambiguate duplicate names by appending "_2", "_3", ...
            while final_column_name in final_column_names:
                final_column_name = new_column_name + "_" + str(duplicates + 2)
                duplicates += 1

            if duplicates > 0:
                warn_duplicate_column(new_column_name, final_column_name)

            final_column_names.append(final_column_name)

        self._column_names = tuple(final_column_names)
    elif rows:
        # No names supplied: derive letter names from the first row's width.
        self._column_names = tuple(utils.letter_name(i) for i in range(len(rows[0])))
        warnings.warn(
            'Column names not specified. "%s" will be used as names.' % str(self._column_names),
            RuntimeWarning,
            stacklevel=2,
        )
    else:
        self._column_names = []

    len_column_names = len(self._column_names)

    # Validate column_types
    if column_types is None:
        column_types = TypeTester()
    elif not isinstance(column_types, TypeTester):
        for column_type in column_types:
            if not isinstance(column_type, DataType):
                raise ValueError("Column types must be instances of DataType.")

    if isinstance(column_types, TypeTester):
        self._column_types = column_types.run(rows, self._column_names)
    else:
        self._column_types = tuple(column_types)

    if len_column_names != len(self._column_types):
        raise ValueError("column_names and column_types must be the same length.")

    if not _is_fork:
        new_rows = []
        # Hoist the cast callables out of the row loop.
        cast_funcs = [c.cast for c in self._column_types]

        for i, row in enumerate(rows):
            len_row = len(row)

            if len_row > len_column_names:
                raise ValueError(
                    "Row %i has %i values, but Table only has %i columns." % (i, len_row, len_column_names)
                )
            elif len(row) < len_column_names:
                # Short rows are padded with None to full width.
                row = chain(row, [None] * (len(self.column_names) - len_row))

            # NOTE: the generator expression's ``i`` is scoped to the
            # genexp and does not affect the outer row index ``i``.
            new_rows.append(Row(tuple(cast_funcs[i](d) for i, d in enumerate(row)), self._column_names))
    else:
        # Forked tables already contain cast Row instances.
        new_rows = rows

    if row_names:
        computed_row_names = []

        if isinstance(row_names, six.string_types):
            # A column name: use that column's values as row names.
            for row in new_rows:
                name = row[row_names]
                computed_row_names.append(name)
        elif hasattr(row_names, "__call__"):
            # A function of the row.
            for row in new_rows:
                name = row_names(row)
                computed_row_names.append(name)
        elif utils.issequence(row_names):
            computed_row_names = row_names
        else:
            raise ValueError("row_names must be a column name, function or sequence")

        self._row_names = tuple(computed_row_names)
    else:
        self._row_names = None

    self._rows = MappedSequence(new_rows, self._row_names)

    # Build columns
    new_columns = []

    for i, (name, data_type) in enumerate(zip(self._column_names, self._column_types)):
        column = Column(i, name, data_type, self._rows, row_names=self._row_names)
        new_columns.append(column)

    self._columns = MappedSequence(new_columns, self._column_names)
def from_csv(cls, path, column_names=None, column_types=None, row_names=None, skip_lines=0, header=True, sniff_limit=0, encoding='utf-8', **kwargs):
    """
    Create a new table from a CSV.

    This method uses agate's builtin CSV reader, which supplies encoding
    support for both Python 2 and Python 3. :code:`kwargs` will be passed
    through to the CSV reader.

    :param path:
        Filepath or file-like object from which to read CSV data.
    :param column_names:
        See :meth:`.Table.__init__`.
    :param column_types:
        See :meth:`.Table.__init__`.
    :param row_names:
        See :meth:`.Table.__init__`.
    :param skip_lines:
        Either a single number indicating the number of lines to skip from
        the top of the file or a sequence of line indexes to skip where the
        first line is index 0.
    :param header:
        If `True`, the first row of the CSV is assumed to contain headers
        and will be skipped. If `header` and `column_names` are both
        specified then a row will be skipped, but `column_names` will be
        used.
    :param sniff_limit:
        Limit CSV dialect sniffing to the specified number of bytes. Set to
        None to sniff the entire file. Defaults to 0 or no sniffing.
    :param encoding:
        Character encoding of the CSV file. Note: if passing in a file
        handle it is assumed you have already opened it with the correct
        encoding specified.
    :returns:
        A new :class:`.Table`.
    """
    from agate import csv
    from agate.table import Table

    # Accept either an open file-like object or a filesystem path.
    if hasattr(path, 'read'):
        lines = path.readlines()
    else:
        with io.open(path, encoding=encoding) as f:
            lines = f.readlines()

    if utils.issequence(skip_lines):
        # A sequence drops specific (0-based) line indexes.
        lines = [line for i, line in enumerate(lines) if i not in skip_lines]
        contents = ''.join(lines)
    elif isinstance(skip_lines, int):
        # An integer drops that many lines from the top of the file.
        contents = ''.join(lines[skip_lines:])
    else:
        raise ValueError('skip_lines argument must be an int or sequence')

    # Optionally sniff the CSV dialect from (a prefix of) the contents.
    if sniff_limit is None:
        kwargs['dialect'] = csv.Sniffer().sniff(contents)
    elif sniff_limit > 0:
        kwargs['dialect'] = csv.Sniffer().sniff(contents[:sniff_limit])

    if six.PY2:
        contents = contents.encode('utf-8')

    # NOTE(review): `header` is forwarded to agate's csv reader; the header
    # row itself is popped below. Confirm against agate.csv.Reader that the
    # kwarg is accepted and only affects reader bookkeeping.
    rows = list(csv.reader(six.StringIO(contents), header=header, **kwargs))

    if header:
        if column_names is None:
            column_names = rows.pop(0)
        else:
            # Explicit names win; discard the header row.
            rows.pop(0)

    return Table(rows, column_names, column_types, row_names=row_names)
def homogenize(self, key, compare_values, default_row=None):
    """
    Fill in missing rows in a series.

    This can be used, for instance, to add rows for missing years in a time
    series. Missing rows are found by comparing the values in the
    :code:`key` columns with those provided as :code:`compare_values`.
    Values not found in the table will be used to generate new rows with
    the given :code:`default_row`.

    :code:`default_row` should be an array of values or an array-generating
    function. If not specified, the new rows will have :code:`None` in all
    columns not specified in :code:`key`.

    If :code:`default_row` is an array of values, its length should be row
    length minus the number of column names provided in the :code:`key`. If
    it is an array-generating function, the function should take an array
    of missing values for each new row and output a full row including
    those values.

    :param key:
        Either a column name or a sequence of such names.
    :param compare_values:
        Either an array of column values if key is a single column name or
        a sequence of arrays of values if key is a sequence of names. It
        can also be a generator that yields either of the two. A row is
        created for each value or list of values not found in the rows of
        the table.
    :param default_row:
        An array of values or a function to generate new rows. The length
        of the input array should be equal to row length minus column_names
        count. The length of array generated by the function should be the
        row length.
    :returns:
        A new :class:`.Table`.
    """
    rows = list(self._rows)

    if not utils.issequence(key):
        key = [key]

    # When keying on a single column, allow compare_values to be a flat
    # sequence of scalars by wrapping each value in a list.
    if len(key) == 1:
        if any(not utils.issequence(compare_value) for compare_value in compare_values):
            compare_values = [[compare_value] for compare_value in compare_values]

    column_values = [self._columns.get(name) for name in key]
    column_indexes = [self._column_names.index(name) for name in key]

    column_values = zip(*column_values)
    differences = list(set(map(tuple, compare_values)) - set(column_values))

    for difference in differences:
        if callable(default_row):
            rows.append(Row(default_row(difference), self._column_names))
        else:
            if default_row is not None:
                # BUG FIX: copy the template. Previously the caller's
                # default_row list was mutated in place by the insert()
                # calls below and reused (still holding the previous
                # iteration's key values) for every subsequent missing row.
                new_row = list(default_row)
            else:
                new_row = [None] * (len(self._column_names) - len(key))

            # Splice the missing key values into their column positions.
            for i, d in zip(column_indexes, difference):
                new_row.insert(i, d)

            rows.append(Row(new_row, self._column_names))

    return self._fork(rows)
def join(self, right_table, left_key, right_key=None, inner=False, require_match=False, columns=None):
    """
    Create a new table by joining two tables on common values.

    This method performs the equivalent of SQL's "left outer join",
    combining columns from this table and from :code:`right_table` anywhere
    that the :code:`left_key` and :code:`right_key` are equivalent. Where
    there is no match for :code:`left_key` the left columns will be
    included with the right columns set to :code:`None` unless the
    :code:`inner` argument is specified.

    If :code:`left_key` and :code:`right_key` are column names, only the
    left columns will be included in the output table.

    Column names from the right table which also exist in this table will
    be suffixed "2" in the new table.

    :param right_table:
        The "right" table to join to.
    :param left_key:
        Either the name of a column from the this table to join on, a
        sequence of such column names, or a :class:`function` that takes a
        row and returns a value to join on.
    :param right_key:
        Either the name of a column from :code:`right_table` to join on, a
        sequence of such column names, or a :class:`function` that takes a
        row and returns a value to join on. If :code:`None` then
        :code:`left_key` will be used for both.
    :param inner:
        Perform a SQL-style "inner join" instead of a left outer join. Rows
        which have no match for :code:`left_key` will not be included in
        the output table.
    :param require_match:
        If true, an exception will be raised if there is a left_key with no
        matching right_key.
    :param columns:
        A sequence of column names from :code:`right_table` to include in
        the final output table. Defaults to all columns not in
        :code:`right_key`.
    :returns:
        A new :class:`.Table`.
    """
    if right_key is None:
        right_key = left_key

    # Get join columns
    right_key_indices = []

    left_key_is_func = hasattr(left_key, '__call__')
    left_key_is_sequence = utils.issequence(left_key)

    # Left key is a function
    if left_key_is_func:
        left_data = [left_key(row) for row in self.rows]
    # Left key is a sequence
    elif left_key_is_sequence:
        left_columns = [self.columns[key] for key in left_key]
        left_data = zip(*[column.values() for column in left_columns])
    # Left key is a column name/index
    else:
        left_data = self.columns[left_key].values()

    right_key_is_func = hasattr(right_key, '__call__')
    right_key_is_sequence = utils.issequence(right_key)

    # Right key is a function
    if right_key_is_func:
        right_data = [right_key(row) for row in right_table.rows]
    # Right key is a sequence
    elif right_key_is_sequence:
        right_columns = [right_table.columns[key] for key in right_key]
        right_data = zip(*[column.values() for column in right_columns])
        right_key_indices = [right_table.columns._keys.index(key) for key in right_key]
    # Right key is a column name/index
    else:
        right_column = right_table.columns[right_key]
        right_data = right_column.values()
        right_key_indices = [right_table.columns._keys.index(right_key)]

    # Build names and type lists for the joined table.
    column_names = list(self.column_names)
    column_types = list(self.column_types)

    for i, column in enumerate(right_table.columns):
        name = column.name

        # By default the right key column(s) are dropped from the output.
        if columns is None and i in right_key_indices:
            continue

        # An explicit column list acts as a whitelist.
        if columns is not None and name not in columns:
            continue

        # Names colliding with a left column are suffixed "2".
        if name in self.column_names:
            column_names.append('%s2' % name)
        else:
            column_names.append(name)

        column_types.append(column.data_type)

    if columns is not None:
        right_table = right_table.select([n for n in right_table.column_names if n in columns])

    # Hash table mapping right key value -> list of matching right rows.
    right_hash = {}

    for i, value in enumerate(right_data):
        if value not in right_hash:
            right_hash[value] = []

        right_hash[value].append(right_table.rows[i])

    # Collect new rows
    rows = []

    if self.row_names is not None:
        row_names = []
    else:
        row_names = None

    # Iterate over left column
    for left_index, left_value in enumerate(left_data):
        matching_rows = right_hash.get(left_value, None)

        if require_match and matching_rows is None:
            raise ValueError('Left key "%s" does not have a matching right key.' % left_value)

        # Rows with matches: emit one output row per matching right row.
        if matching_rows:
            for right_row in matching_rows:
                new_row = list(self.rows[left_index])

                for k, v in enumerate(right_row):
                    if columns is None and k in right_key_indices:
                        continue

                    new_row.append(v)

                rows.append(Row(new_row, column_names))

                if self.row_names is not None:
                    row_names.append(self.row_names[left_index])
        # Rows without matches: pad right columns with None (left outer join).
        elif not inner:
            new_row = list(self.rows[left_index])

            for k, v in enumerate(right_table.column_names):
                if columns is None and k in right_key_indices:
                    continue

                new_row.append(None)

            rows.append(Row(new_row, column_names))

            if self.row_names is not None:
                row_names.append(self.row_names[left_index])

    return self._fork(rows, column_names, column_types, row_names=row_names)
def normalize(self, key, properties, property_column='property', value_column='value', column_types=None):
    """
    Create a new table with columns converted into rows of values.

    Each row of the original table becomes one output row per property,
    holding the key values, the property name and the property value. This
    is the opposite of :meth:`.Table.denormalize`.

    :param key:
        A column name or a sequence of column names that should be
        maintained as they are in the normalized table. Typically these are
        the table's unique identifiers and any metadata about them.
    :param properties:
        A column name or a sequence of column names that should be
        converted to properties in the new table.
    :param property_column:
        The name to use for the column containing the property names.
    :param value_column:
        The name to use for the column containing the property values.
    :param column_types:
        A sequence of two column types for the property and value column in
        that order or an instance of :class:`.TypeTester`. Defaults to a
        generic :class:`.TypeTester`.
    :returns:
        A new :class:`.Table`.
    """
    from agate.table import Table

    key_names = key if utils.issequence(key) else [key]
    property_names = properties if utils.issequence(properties) else [properties]

    new_column_names = key_names + [property_column, value_column]

    new_rows = []
    row_names = []

    for row in self._rows:
        key_values = tuple(row[name] for name in key_names)

        # A single-column key is unwrapped for row naming.
        row_names.append(key_values[0] if len(key_values) == 1 else key_values)

        prefix = list(key_values)

        # One output row per (key, property) pair.
        for prop in property_names:
            new_rows.append(Row(prefix + [prop, row[prop]], new_column_names))

    key_column_types = [
        self._column_types[self._column_names.index(name)]
        for name in key_names
    ]

    if column_types is not None and not isinstance(column_types, TypeTester):
        # Explicit types for the property and value columns.
        new_column_types = key_column_types + list(column_types)
    else:
        # Infer types, forcing the key columns to keep their original types.
        tester = column_types if isinstance(column_types, TypeTester) else TypeTester()
        forced = dict(zip(key_names, key_column_types))
        forced.update(tester._force)
        tester._force = forced

        new_column_types = tester.run(new_rows, new_column_names)

    return Table(new_rows, new_column_names, new_column_types, row_names=row_names)
def join(self, right_table, left_key=None, right_key=None, inner=False, full_outer=False, require_match=False, columns=None):
    """
    Create a new table by joining two tables on common values.

    This method implements most varieties of SQL join, in addition to some
    unique features.

    If :code:`left_key` and :code:`right_key` are both :code:`None` then
    this method will perform a "sequential join", which is to say it will
    join on row number. The :code:`inner` and :code:`full_outer` arguments
    will determine whether dangling left-hand and right-hand rows are
    included, respectively.

    If :code:`left_key` is specified, then a "left outer join" will be
    performed. This will combine columns from the :code:`right_table`
    anywhere that :code:`left_key` and :code:`right_key` are equal.
    Unmatched rows from the left table will be included with the right-hand
    columns set to :code:`None`.

    If :code:`inner` is :code:`True` then an "inner join" will be
    performed. Unmatched rows from either table will be left out.

    If :code:`full_outer` is :code:`True` then a "full outer join" will be
    performed. Unmatched rows from both tables will be included, with the
    columns in the other table set to :code:`None`.

    In all cases, if :code:`right_key` is :code:`None` then
    :code:`left_key` will be used for both tables.

    If :code:`left_key` and :code:`right_key` are column names, the
    right-hand identifier column will not be included in the output table.

    If :code:`require_match` is :code:`True` unmatched rows will raise an
    exception. This is like an "inner join" except any row that doesn't
    have a match will raise an exception instead of being dropped. This is
    useful for enforcing expectations about datasets that should match.

    Column names from the right table which also exist in this table will
    be suffixed "2" in the new table.

    A subset of columns from the right-hand table can be included in the
    joined table using the :code:`columns` argument.

    :param right_table:
        The "right" table to join to.
    :param left_key:
        Either the name of a column from the this table to join on, the
        index of a column, a sequence of such column identifiers, a
        :class:`function` that takes a row and returns a value to join on,
        or :code:`None` in which case the tables will be joined on row
        number.
    :param right_key:
        Either the name of a column from :code:`right_table` to join on,
        the index of a column, a sequence of such column identifiers, or a
        :class:`function` that takes a row and returns a value to join on.
        If :code:`None` then :code:`left_key` will be used for both. If
        :code:`left_key` is :code:`None` then this value is ignored.
    :param inner:
        Perform a SQL-style "inner join" instead of a left outer join. Rows
        which have no match for :code:`left_key` will not be included in
        the output table.
    :param full_outer:
        Perform a SQL-style "full outer" join rather than a left or a
        right. May not be used in combination with :code:`inner`.
    :param require_match:
        If true, an exception will be raised if there is a left_key with no
        matching right_key.
    :param columns:
        A sequence of column names from :code:`right_table` to include in
        the final output table. Defaults to all columns not in
        :code:`right_key`. Ignored when :code:`full_outer` is :code:`True`.
    :returns:
        A new :class:`.Table`.
    """
    if inner and full_outer:
        raise ValueError('A join can not be both "inner" and "full_outer".')

    if right_key is None:
        right_key = left_key

    # Get join columns
    right_key_indices = []

    left_key_is_func = hasattr(left_key, '__call__')
    left_key_is_sequence = utils.issequence(left_key)

    # Left key is None: join sequentially on row number.
    if left_key is None:
        left_data = tuple(range(len(self._rows)))
    # Left key is a function
    elif left_key_is_func:
        left_data = [left_key(row) for row in self._rows]
    # Left key is a sequence
    elif left_key_is_sequence:
        left_columns = [self._columns[key] for key in left_key]
        left_data = zip(*[column.values() for column in left_columns])
    # Left key is a column name/index
    else:
        left_data = self._columns[left_key].values()

    right_key_is_func = hasattr(right_key, '__call__')
    right_key_is_sequence = utils.issequence(right_key)

    # Sequential join
    if left_key is None:
        right_data = tuple(range(len(right_table._rows)))
    # Right key is a function
    elif right_key_is_func:
        right_data = [right_key(row) for row in right_table._rows]
    # Right key is a sequence
    elif right_key_is_sequence:
        right_columns = [right_table._columns[key] for key in right_key]
        right_data = zip(*[column.values() for column in right_columns])
        right_key_indices = [right_table._columns._keys.index(key) for key in right_key]
    # Right key is a column name/index
    else:
        right_column = right_table._columns[right_key]
        right_data = right_column.values()
        right_key_indices = [right_table._columns.index(right_column)]

    # Build names and type lists for the joined table.
    column_names = list(self._column_names)
    column_types = list(self._column_types)

    for i, column in enumerate(right_table._columns):
        name = column.name

        # Full outer joins keep every right column, including the key(s).
        if not full_outer:
            if columns is None and i in right_key_indices:
                continue

            if columns is not None and name not in columns:
                continue

        # Names colliding with a left column are suffixed "2".
        if name in self.column_names:
            column_names.append('%s2' % name)
        else:
            column_names.append(name)

        column_types.append(column.data_type)

    if columns is not None and not full_outer:
        right_table = right_table.select([n for n in right_table._column_names if n in columns])

    # Hash table mapping right key value -> list of matching right rows.
    right_hash = {}

    for i, value in enumerate(right_data):
        if value not in right_hash:
            right_hash[value] = []

        right_hash[value].append(right_table._rows[i])

    # Collect new rows
    rows = []

    # Row names cannot be preserved for full outer joins, since dangling
    # right rows have no left row name.
    if self._row_names is not None and not full_outer:
        row_names = []
    else:
        row_names = None

    # Iterate over left column
    for left_index, left_value in enumerate(left_data):
        matching_rows = right_hash.get(left_value, None)

        if require_match and matching_rows is None:
            raise ValueError('Left key "%s" does not have a matching right key.' % left_value)

        # Rows with matches: emit one output row per matching right row.
        if matching_rows:
            for right_row in matching_rows:
                new_row = list(self._rows[left_index])

                for k, v in enumerate(right_row):
                    if columns is None and k in right_key_indices and not full_outer:
                        continue

                    new_row.append(v)

                rows.append(Row(new_row, column_names))

                if self._row_names is not None and not full_outer:
                    row_names.append(self._row_names[left_index])
        # Rows without matches: pad right columns with None.
        elif not inner:
            new_row = list(self._rows[left_index])

            for k, v in enumerate(right_table._column_names):
                if columns is None and k in right_key_indices and not full_outer:
                    continue

                new_row.append(None)

            rows.append(Row(new_row, column_names))

            if self._row_names is not None and not full_outer:
                row_names.append(self._row_names[left_index])

    # Full outer join: append dangling right rows with left columns None.
    if full_outer:
        left_set = set(left_data)

        for right_index, right_value in enumerate(right_data):
            if right_value in left_set:
                continue

            new_row = ([None] * len(self._columns)) + list(right_table.rows[right_index])

            rows.append(Row(new_row, column_names))

    return self._fork(rows, column_names, column_types, row_names=row_names)
def denormalize(self, key=None, property_column='property', value_column='value', default_value=utils.default, column_types=None):
    """
    Denormalize a dataset so that unique values in a column become their
    own columns.

    For example:

    +---------+-----------+---------+
    |  name   | property  | value   |
    +=========+===========+=========+
    |  Jane   | gender    | female  |
    +---------+-----------+---------+
    |  Jane   | race      | black   |
    +---------+-----------+---------+
    |  Jane   | age       | 24      |
    +---------+-----------+---------+
    |  ...    |  ...      |  ...    |
    +---------+-----------+---------+

    Can be denormalized so that each unique value in `property_column`
    becomes a column with `value_column` used for its values.

    +---------+----------+--------+-------+
    |  name   | gender   | race   | age   |
    +=========+==========+========+=======+
    |  Jane   | female   | black  | 24    |
    +---------+----------+--------+-------+
    |  Jack   | male     | white  | 35    |
    +---------+----------+--------+-------+
    |  Joe    | male     | black  | 28    |
    +---------+----------+--------+-------+

    If one or more keys are specified then the resulting table will
    automatically have `row_names` set to those keys.

    This is the opposite of :meth:`Table.normalize`.

    :param key:
        A column name or a sequence of column names that should be
        maintained as they are in the normalized table. Typically these are
        the tables unique identifiers and any metadata about them. Or,
        :code:`None` if there are no key columns.
    :param property_column:
        The column whose values should become column names in the new
        table.
    :param value_column:
        The column whose values should become the values of the property
        columns in the new table.
    :param default_value:
        Value to be used for missing values in the pivot table. If not
        specified :code:`Decimal(0)` will be used when `value_column`
        contains :class:`.Number` data and :code:`None` will be used for
        all others.
    :param column_types:
        A sequence of column types with length equal to number of unique
        values in property_column or an instance of :class:`.TypeTester`.
        Defaults to a generic :class:`.TypeTester`.
    :returns:
        A new :class:`Table`.
    """
    from agate.table import Table

    if key is None:
        key = []
    elif not utils.issequence(key):
        key = [key]

    field_names = []
    row_data = OrderedDict()

    # Group values by key tuple, recording field (property) names in the
    # order they are first seen.
    for row in self.rows:
        row_key = tuple(row[k] for k in key)

        if row_key not in row_data:
            row_data[row_key] = OrderedDict()

        f = six.text_type(row[property_column])
        v = row[value_column]

        if f not in field_names:
            field_names.append(f)

        row_data[row_key][f] = v

    # Pick a default for cells with no corresponding property row.
    if default_value == utils.default:
        if isinstance(self.columns[value_column].data_type, Number):
            default_value = Decimal(0)
        else:
            default_value = None

    new_column_names = key + field_names

    new_rows = []
    row_names = []

    for k, v in row_data.items():
        row = list(k)

        # A single-column key is unwrapped for row naming.
        if len(k) == 1:
            row_names.append(k[0])
        else:
            row_names.append(k)

        for f in field_names:
            if f in v:
                row.append(v[f])
            else:
                row.append(default_value)

        new_rows.append(Row(row, new_column_names))

    key_column_types = [self.column_types[self.column_names.index(name)] for name in key]

    if column_types is None or isinstance(column_types, TypeTester):
        # Infer types, forcing the key columns to keep their original types.
        tester = TypeTester() if column_types is None else column_types
        force_update = dict(zip(key, key_column_types))
        force_update.update(tester._force)
        tester._force = force_update

        new_column_types = tester.run(new_rows, new_column_names)
    else:
        new_column_types = key_column_types + list(column_types)

    return Table(new_rows, new_column_names, new_column_types, row_names=row_names)
def __init__(self, rows, column_names=None, column_types=None, row_names=None, _is_fork=False):
    """
    Build a table from raw rows, validating names and types and casting data.

    :param rows:
        Sequence of row sequences. Must not be a string (a common mistake
        when the caller meant to pass a CSV path).
    :param column_names:
        Sequence of column names, or :code:`None` to auto-generate
        spreadsheet-style letter names.
    :param column_types:
        A sequence of :class:`DataType` instances, a dict mapping column
        names to :class:`DataType` (used as forced types for inference), a
        :class:`TypeTester`, or :code:`None` (a default :class:`TypeTester`
        will infer types).
    :param row_names:
        A column name, a function of a row, or a sequence of explicit names
        used to index rows. May not contain ints.
    :param _is_fork:
        Internal flag; when :code:`True` the rows are reused as-is without
        validation or casting.
    """
    if isinstance(rows, six.string_types):
        raise ValueError(
            'When created directly, the first argument to Table must be a sequence of rows. '
            'Did you want agate.Table.from_csv?')

    # Validate column names
    if column_names:
        self._column_names = utils.deduplicate(column_names, column_names=True)
    elif rows:
        # No names given: derive one letter name per value in the first row.
        self._column_names = tuple(
            utils.letter_name(i) for i in range(len(rows[0])))
        warnings.warn(
            'Column names not specified. "%s" will be used as names.'
            % str(self._column_names), RuntimeWarning, stacklevel=2)
    else:
        self._column_names = tuple()

    len_column_names = len(self._column_names)

    # Validate column_types
    if column_types is None:
        column_types = TypeTester()
    elif isinstance(column_types, dict):
        # A dict becomes the forced types for an inferring TypeTester.
        for v in column_types.values():
            if not isinstance(v, DataType):
                raise ValueError(
                    'Column types must be instances of DataType.')

        column_types = TypeTester(force=column_types)
    elif not isinstance(column_types, TypeTester):
        for column_type in column_types:
            if not isinstance(column_type, DataType):
                raise ValueError(
                    'Column types must be instances of DataType.')

    if isinstance(column_types, TypeTester):
        # Infer the column types from the data.
        self._column_types = column_types.run(rows, self._column_names)
    else:
        self._column_types = tuple(column_types)

    if len_column_names != len(self._column_types):
        raise ValueError(
            'column_names and column_types must be the same length.')

    if not _is_fork:
        new_rows = []
        # Hoist the bound cast methods out of the row loop.
        cast_funcs = [c.cast for c in self._column_types]

        for i, row in enumerate(rows):
            len_row = len(row)

            if len_row > len_column_names:
                raise ValueError(
                    'Row %i has %i values, but Table only has %i columns.'
                    % (i, len_row, len_column_names))
            elif len(row) < len_column_names:
                # Pad short rows with None.
                row = chain(row, [None] * (len_column_names - len_row))

            row_values = []

            for j, d in enumerate(row):
                try:
                    row_values.append(cast_funcs[j](d))
                except CastError as e:
                    # Re-raise with the row/column location appended.
                    raise CastError(
                        str(e) + ' Error at row %s column %s.'
                        % (i, self._column_names[j]))

            new_rows.append(Row(row_values, self._column_names))
    else:
        # Forked tables reuse already-cast Row instances.
        new_rows = rows

    if row_names:
        computed_row_names = []

        if isinstance(row_names, six.string_types):
            # Row names taken from a column's values.
            for row in new_rows:
                name = row[row_names]
                computed_row_names.append(name)
        elif hasattr(row_names, '__call__'):
            # Row names computed by a function of each row.
            for row in new_rows:
                name = row_names(row)
                computed_row_names.append(name)
        elif utils.issequence(row_names):
            # Explicit sequence of names.
            computed_row_names = row_names
        else:
            raise ValueError(
                'row_names must be a column name, function or sequence')

        # Int row names would be ambiguous with positional indexing.
        for row_name in computed_row_names:
            if type(row_name) is int:
                raise ValueError(
                    'Row names cannot be of type int. Use Decimal for numbered row names.'
                )

        self._row_names = tuple(computed_row_names)
    else:
        self._row_names = None

    self._rows = MappedSequence(new_rows, self._row_names)

    # Build columns
    new_columns = []

    for i in range(len_column_names):
        name = self._column_names[i]
        data_type = self._column_types[i]
        column = Column(i, name, data_type, self._rows, row_names=self._row_names)
        new_columns.append(column)

    self._columns = MappedSequence(new_columns, self._column_names)
def homogenize(self, key, compare_values, default_row=None):
    """
    Fill in missing rows in a series.

    This can be used, for instance, to add rows for missing years in a time
    series. Missing rows are found by comparing the values in the
    :code:`key` columns with those provided as :code:`compare_values`.
    Values not found in the table will be used to generate new rows with
    the given :code:`default_row`.

    :code:`default_row` should be an array of values or an array-generating
    function. If not specified, the new rows will have :code:`None` in all
    columns not specified in :code:`key`.

    If :code:`default_row` is an array of values, its length should be row
    length minus the number of column names provided in the :code:`key`. If
    it is an array-generating function, the function should take an array
    of missing values for each new row and output a full row including
    those values.

    :param key:
        Either a column name or a sequence of such names.
    :param compare_values:
        Either an array of column values if key is a single column name or
        a sequence of arrays of values if key is a sequence of names. It
        can also be a generator that yields either of the two. A row is
        created for each value or list of values not found in the rows of
        the table.
    :param default_row:
        An array of values or a function to generate new rows. The length
        of the input array should be equal to row length minus column_names
        count. The length of array generated by the function should be the
        row length.
    :returns:
        A new :class:`.Table`.
    """
    rows = list(self.rows)

    if not utils.issequence(key):
        key = [key]

    # When keying on a single column, allow compare_values to be a flat
    # sequence of scalars by wrapping each value in a list.
    if len(key) == 1:
        if any(not utils.issequence(compare_value) for compare_value in compare_values):
            compare_values = [[compare_value] for compare_value in compare_values]

    column_values = [self.columns.get(name) for name in key]
    column_indexes = [self.column_names.index(name) for name in key]

    column_values = zip(*column_values)
    differences = list(set(map(tuple, compare_values)) - set(column_values))

    for difference in differences:
        if callable(default_row):
            rows.append(Row(default_row(difference), self.column_names))
        else:
            if default_row is not None:
                # BUG FIX: copy the template. Previously the caller's
                # default_row list was mutated in place by the insert()
                # calls below and reused (still holding the previous
                # iteration's key values) for every subsequent missing row.
                new_row = list(default_row)
            else:
                new_row = [None] * (len(self.column_names) - len(key))

            # Splice the missing key values into their column positions.
            for i, d in zip(column_indexes, difference):
                new_row.insert(i, d)

            rows.append(Row(new_row, self.column_names))

    return self._fork(rows, self.column_names, self.column_types)
def normalize(self, key, properties, property_column='property', value_column='value', column_types=None):
    """
    Create a new table with columns converted into rows of values.

    Each row of the original table becomes one output row per property,
    holding the key values, the property name and the property value. This
    is the opposite of :meth:`.Table.denormalize`.

    :param key:
        A column name or a sequence of column names that should be
        maintained as they are in the normalized table. Typically these are
        the table's unique identifiers and any metadata about them.
    :param properties:
        A column name or a sequence of column names that should be
        converted to properties in the new table.
    :param property_column:
        The name to use for the column containing the property names.
    :param value_column:
        The name to use for the column containing the property values.
    :param column_types:
        A sequence of two column types for the property and value column in
        that order or an instance of :class:`.TypeTester`. Defaults to a
        generic :class:`.TypeTester`.
    :returns:
        A new :class:`.Table`.
    """
    from agate.table import Table

    if not utils.issequence(key):
        key = [key]

    if not utils.issequence(properties):
        properties = [properties]

    new_column_names = key + [property_column, value_column]

    new_rows = []
    row_names = []

    for row in self.rows:
        key_tuple = tuple(row[name] for name in key)

        # A single-column key is unwrapped for row naming.
        row_names.append(key_tuple[0] if len(key_tuple) == 1 else key_tuple)

        base = list(key_tuple)

        # One output row per (key, property) pair.
        new_rows.extend(
            Row(tuple(base + [prop, row[prop]]), new_column_names)
            for prop in properties
        )

    key_column_types = [
        self.column_types[self.column_names.index(name)]
        for name in key
    ]

    if column_types is None or isinstance(column_types, TypeTester):
        # Infer types, forcing the key columns to keep their original types.
        tester = TypeTester() if column_types is None else column_types
        forced = dict(zip(key, key_column_types))
        forced.update(tester._force)
        tester._force = forced

        new_column_types = tester.run(new_rows, new_column_names)
    else:
        # Explicit types for the property and value columns.
        new_column_types = key_column_types + list(column_types)

    return Table(new_rows, new_column_names, new_column_types, row_names=row_names)
def pivot(self, key=None, pivot=None, aggregation=None, computation=None, default_value=utils.default, key_name=None):
    """
    Pivot reorganizes the data in a table by grouping the data,
    aggregating those groups, optionally applying a computation, and
    then organizing the groups into new rows and columns.

    For example:

    +---------+---------+--------+
    | name    | race    | gender |
    +=========+=========+========+
    | Joe     | white   | male   |
    +---------+---------+--------+
    | Jane    | black   | female |
    +---------+---------+--------+
    | Josh    | black   | male   |
    +---------+---------+--------+
    | Jim     | asian   | female |
    +---------+---------+--------+

    This table can be pivoted with :code:`key` equal to "race" and
    :code:`pivot` equal to "gender". The default aggregation is
    :class:`.Count`. This would result in the following table.

    +---------+---------+--------+
    | race    | male    | female |
    +=========+=========+========+
    | white   | 1       | 0      |
    +---------+---------+--------+
    | black   | 1       | 1      |
    +---------+---------+--------+
    | asian   | 0       | 1      |
    +---------+---------+--------+

    If one or more keys are specified then the resulting table will
    automatically have `row_names` set to those keys.

    See also the related method :meth:`Table.denormalize`.

    :param key:
        Either the name of a column from the this table to group by, a
        sequence of such column names, a :class:`function` that takes a
        row and returns a value to group by, or :code:`None`, in which
        case there will be only a single row in the output table.
    :param pivot:
        A column name whose unique values will become columns in the new
        table, or :code:`None` in which case there will be a single value
        column in the output table.
    :param aggregation:
        An instance of an :class:`.Aggregation` to perform on each group
        of data in the pivot table. (Each cell is the result of an
        aggregation of the grouped data.)

        If not specified this defaults to :class:`.Count` with no
        arguments.
    :param computation:
        An optional :class:`.Computation` instance to be applied to the
        aggregated sequence of values before they are transposed into the
        pivot table.

        Use the class name of the aggregation as your column name argument
        when constructing your computation. (This is "Count" if using the
        default value for :code:`aggregation`.)
    :param default_value:
        Value to be used for missing values in the pivot table. Defaults
        to :code:`Decimal(0)`. If performing non-mathematical aggregations
        you may wish to set this to :code:`None`.
    :param key_name:
        A name for the key column in the output table. This is most
        useful when the provided key is a function. This argument is not
        valid when :code:`key` is a sequence.
    :returns:
        A new :class:`Table`.
    """
    # Normalize `key` to a list of grouping column names. `key_name` only
    # makes sense for a single (possibly function) key.
    if key is None:
        key = []
    elif not utils.issequence(key):
        key = [key]
    elif key_name:
        raise ValueError('key_name is not a valid argument when key is a sequence.')

    if aggregation is None:
        aggregation = Count()

    # Group successively by each key, producing nested TableSets.
    groups = self

    for k in key:
        groups = groups.group_by(k, key_name=key_name)

    # Aggregation/computation column names are derived from the instances'
    # string representations.
    aggregation_name = six.text_type(aggregation)
    computation_name = six.text_type(computation) if computation else None

    def apply_computation(table):
        # Add the computed column, then drop the raw aggregate it was
        # derived from.
        computed = table.compute([
            (computation_name, computation)
        ])

        excluded = computed.exclude([aggregation_name])

        return excluded

    if pivot is not None:
        groups = groups.group_by(pivot)

        # NOTE(review): the other pivot implementation in this file passes
        # `self` here instead of `groups` — confirm which revision is
        # intended.
        column_type = aggregation.get_aggregate_data_type(groups)

        table = groups.aggregate([
            (aggregation_name, aggregation)
        ])

        # One output column per distinct pivot value.
        pivot_count = len(set(table.columns[pivot].values()))

        if computation is not None:
            column_types = computation.get_computed_data_type(table)
            table = apply_computation(table)

        # NOTE(review): this unconditionally overwrites the value assigned
        # in the branch above, leaving that assignment dead — verify
        # whether the computed data type should be used when `computation`
        # is given.
        column_types = [column_type] * pivot_count

        table = table.denormalize(key, pivot, computation_name or aggregation_name, default_value=default_value, column_types=column_types)
    else:
        table = groups.aggregate([
            (aggregation_name, aggregation)
        ])

        if computation:
            table = apply_computation(table)

    return table
def from_csv(cls, path, column_names=None, column_types=None, row_names=None, skip_lines=0, header=True, sniff_limit=0, encoding='utf-8', **kwargs):
    """
    Create a new table from a CSV.

    This method uses agate's builtin CSV reader, which supplies encoding
    support for both Python 2 and Python 3.

    :code:`kwargs` will be passed through to the CSV reader.

    :param path:
        Filepath or file-like object from which to read CSV data.
    :param column_names:
        See :meth:`.Table.__init__`.
    :param column_types:
        See :meth:`.Table.__init__`.
    :param row_names:
        See :meth:`.Table.__init__`.
    :param skip_lines:
        Either a single number indicating the number of lines to skip from
        the top of the file or a sequence of line indexes to skip where the
        first line is index 0.
    :param header:
        If `True`, the first row of the CSV is assumed to contain headers
        and will be skipped. If `header` and `column_names` are both
        specified then a row will be skipped, but `column_names` will be
        used.
    :param sniff_limit:
        Limit CSV dialect sniffing to the specified number of bytes. Set
        to None to sniff the entire file. Defaults to 0, or no sniffing.
    :param encoding:
        Character encoding of the CSV file. Note: if passing in a file
        handle it is assumed you have already opened it with the correct
        encoding specified.
    :returns:
        A new :class:`.Table`.
    """
    if hasattr(path, 'read'):
        # Already an open file-like object; the caller controls encoding.
        lines = path.readlines()
    else:
        with io.open(path, encoding=encoding) as f:
            lines = f.readlines()

    if utils.issequence(skip_lines):
        # Use a set for O(1) membership tests while filtering lines.
        skipped = set(skip_lines)
        lines = [line for i, line in enumerate(lines) if i not in skipped]

        contents = ''.join(lines)
    elif isinstance(skip_lines, int):
        contents = ''.join(lines[skip_lines:])
    else:
        raise ValueError('skip_lines argument must be an int or sequence')

    # Sniff the dialect from the whole contents (None) or a prefix (> 0);
    # a limit of 0 skips sniffing entirely.
    if sniff_limit is None:
        kwargs['dialect'] = csv.Sniffer().sniff(contents)
    elif sniff_limit > 0:
        kwargs['dialect'] = csv.Sniffer().sniff(contents[:sniff_limit])

    if six.PY2:
        contents = contents.encode('utf-8')

    # NOTE(review): `header` is forwarded to the csv reader — agate's
    # wrapper accepts it, the stdlib reader would not; confirm against the
    # csv module used here.
    rows = list(csv.reader(six.StringIO(contents), header=header, **kwargs))

    if header:
        if column_names is None:
            column_names = rows.pop(0)
        else:
            rows.pop(0)

    return Table(rows, column_names, column_types, row_names=row_names)
def pivot(self, key=None, pivot=None, aggregation=None, computation=None, default_value=utils.default, key_name=None):
    """
    Create a new table by grouping the data, aggregating those groups,
    applying a computation, and then organizing the groups into new rows
    and columns.

    This is sometimes called a "crosstab".

    +---------+---------+--------+
    | name    | race    | gender |
    +=========+=========+========+
    | Joe     | white   | male   |
    +---------+---------+--------+
    | Jane    | black   | female |
    +---------+---------+--------+
    | Josh    | black   | male   |
    +---------+---------+--------+
    | Jim     | asian   | female |
    +---------+---------+--------+

    This table can be pivoted with :code:`key` equal to "race" and
    :code:`columns` equal to "gender". The default aggregation is
    :class:`.Count`. This would result in the following table.

    +---------+---------+--------+
    | race    | male    | female |
    +=========+=========+========+
    | white   | 1       | 0      |
    +---------+---------+--------+
    | black   | 1       | 1      |
    +---------+---------+--------+
    | asian   | 0       | 1      |
    +---------+---------+--------+

    If one or more keys are specified then the resulting table will
    automatically have :code:`row_names` set to those keys.

    See also the related method :meth:`.Table.denormalize`.

    :param key:
        Either the name of a column from this table to group by, a
        sequence of such column names, a :class:`function` that takes a
        row and returns a value to group by, or :code:`None`, in which
        case there will be only a single row in the output table.
    :param pivot:
        A column name whose unique values will become columns in the new
        table, or :code:`None` in which case there will be a single value
        column in the output table.
    :param aggregation:
        An instance of an :class:`.Aggregation` to perform on each group
        of data in the pivot table. (Each cell is the result of an
        aggregation of the grouped data.)

        If not specified this defaults to :class:`.Count` with no
        arguments.
    :param computation:
        An optional :class:`.Computation` instance to be applied to the
        aggregated sequence of values before they are transposed into the
        pivot table.

        Use the class name of the aggregation as your column name argument
        when constructing your computation. (This is "Count" if using the
        default value for :code:`aggregation`.)
    :param default_value:
        Value to be used for missing values in the pivot table. Defaults
        to :code:`Decimal(0)`. If performing non-mathematical aggregations
        you may wish to set this to :code:`None`.
    :param key_name:
        A name for the key column in the output table. This is most
        useful when the provided key is a function. This argument is not
        valid when :code:`key` is a sequence.
    :returns:
        A new :class:`.Table`.
    """
    # Normalize `key` to a list of grouping column names. `key_name` only
    # makes sense for a single (possibly function) key.
    if key is None:
        key = []
    elif not utils.issequence(key):
        key = [key]
    elif key_name:
        raise ValueError(
            'key_name is not a valid argument when key is a sequence.')

    if aggregation is None:
        aggregation = Count()

    # Group successively by each key, producing nested TableSets.
    groups = self

    for k in key:
        groups = groups.group_by(k, key_name=key_name)

    # Aggregation/computation column names are derived from the instances'
    # string representations.
    aggregation_name = six.text_type(aggregation)
    computation_name = six.text_type(computation) if computation else None

    def apply_computation(table):
        # Add the computed column, then drop the raw aggregate it was
        # derived from.
        computed = table.compute([(computation_name, computation)])

        excluded = computed.exclude([aggregation_name])

        return excluded

    if pivot is not None:
        groups = groups.group_by(pivot)

        column_type = aggregation.get_aggregate_data_type(self)

        table = groups.aggregate([(aggregation_name, aggregation)])

        # One output column per distinct pivot value.
        pivot_count = len(set(table.columns[pivot].values()))

        if computation is not None:
            column_types = computation.get_computed_data_type(table)
            table = apply_computation(table)

        # NOTE(review): this unconditionally overwrites the value assigned
        # in the branch above, leaving that assignment dead — verify
        # whether the computed data type should be used when `computation`
        # is given.
        column_types = [column_type] * pivot_count

        table = table.denormalize(key, pivot, computation_name or aggregation_name, default_value=default_value, column_types=column_types)
    else:
        table = groups.aggregate([(aggregation_name, aggregation)])

        if computation:
            table = apply_computation(table)

    return table
def __init__(self, rows, column_names=None, column_types=None, row_names=None, _is_fork=False):
    """
    Construct a table from a sequence of rows.

    :param rows:
        A sequence of row sequences. Must not be a string (a common
        mistake when a CSV path is intended).
    :param column_names:
        A sequence of column names (strings, or :code:`None` to have a
        letter name generated), or :code:`None` to generate letter names
        for every column.
    :param column_types:
        :code:`None` (infer with a :class:`TypeTester`), a dict of
        column name to :class:`DataType` (forced inference), a
        :class:`TypeTester` instance, or a sequence of :class:`DataType`
        instances, one per column.
    :param row_names:
        A column name, a function of a row, or a sequence of explicit
        row names, or :code:`None` for no row names.
    :param _is_fork:
        Internal flag; when :code:`True`, ``rows`` are assumed to already
        be cast :class:`Row` instances and are reused without re-casting.
    """
    if isinstance(rows, six.string_types):
        raise ValueError(
            'When created directly, the first argument to Table must be a sequence of rows. Did you want agate.Table.from_csv?'
        )

    # Validate column names
    if column_names:
        final_column_names = []

        for i, column_name in enumerate(column_names):
            if column_name is None:
                # Missing names get generated letter names ("a", "b", ...).
                new_column_name = utils.letter_name(i)
                warnings.warn(
                    'Column name not specified. "%s" will be used as name.' % new_column_name,
                    RuntimeWarning)
            elif isinstance(column_name, six.string_types):
                new_column_name = column_name
            else:
                raise ValueError('Column names must be strings or None.')

            final_column_name = new_column_name
            duplicates = 0

            # De-duplicate repeated names by appending "_2", "_3", ...
            # until the name is unique.
            while final_column_name in final_column_names:
                final_column_name = new_column_name + '_' + str(
                    duplicates + 2)
                duplicates += 1

            if duplicates > 0:
                warn_duplicate_column(new_column_name, final_column_name)

            final_column_names.append(final_column_name)

        self._column_names = tuple(final_column_names)
    elif rows:
        # No names supplied: generate one letter name per column of the
        # first row.
        self._column_names = tuple(
            utils.letter_name(i) for i in range(len(rows[0])))
        warnings.warn(
            'Column names not specified. "%s" will be used as names.'
            % str(self._column_names), RuntimeWarning, stacklevel=2)
    else:
        self._column_names = []

    len_column_names = len(self._column_names)

    # Validate column_types
    if column_types is None:
        column_types = TypeTester()
    elif isinstance(column_types, dict):
        for v in six.itervalues(column_types):
            if not isinstance(v, DataType):
                raise ValueError(
                    'Column types must be instances of DataType.')

        # A dict becomes a TypeTester with those types forced.
        column_types = TypeTester(force=column_types)
    elif not isinstance(column_types, TypeTester):
        for column_type in column_types:
            if not isinstance(column_type, DataType):
                raise ValueError(
                    'Column types must be instances of DataType.')

    if isinstance(column_types, TypeTester):
        # Infer the concrete data types from the data.
        self._column_types = column_types.run(rows, self._column_names)
    else:
        self._column_types = tuple(column_types)

    if len_column_names != len(self._column_types):
        raise ValueError(
            'column_names and column_types must be the same length.')

    if not _is_fork:
        new_rows = []
        # Hoist the cast callables out of the row loop.
        cast_funcs = [c.cast for c in self._column_types]

        for i, row in enumerate(rows):
            len_row = len(row)

            if len_row > len_column_names:
                raise ValueError(
                    'Row %i has %i values, but Table only has %i columns.'
                    % (i, len_row, len_column_names))
            elif len(row) < len_column_names:
                # Pad short rows with None before casting.
                row = chain(row, [None] * (len(self.column_names) - len_row))

            new_rows.append(
                Row(tuple(cast_funcs[i](d) for i, d in enumerate(row)),
                    self._column_names))
    else:
        # Forked tables reuse already-cast Row instances.
        new_rows = rows

    if row_names:
        computed_row_names = []

        if isinstance(row_names, six.string_types):
            # A column name: use that column's value from each row.
            for row in new_rows:
                name = row[row_names]
                computed_row_names.append(name)
        elif hasattr(row_names, '__call__'):
            # A function of the row.
            for row in new_rows:
                name = row_names(row)
                computed_row_names.append(name)
        elif utils.issequence(row_names):
            # Explicit per-row names.
            computed_row_names = row_names
        else:
            raise ValueError(
                'row_names must be a column name, function or sequence')

        self._row_names = tuple(computed_row_names)
    else:
        self._row_names = None

    self._rows = MappedSequence(new_rows, self._row_names)

    # Build columns
    new_columns = []

    for i, (name, data_type) in enumerate(
            zip(self._column_names, self._column_types)):
        column = Column(i, name, data_type, self._rows, row_names=self._row_names)
        new_columns.append(column)

    self._columns = MappedSequence(new_columns, self._column_names)