Exemplo n.º 1
0
    def test_max_length(self):
        """MaxLength over a Text column yields the length of the longest value."""
        words = ('a', 'gobble', 'w')
        rows = [Row([word], ['test']) for word in words]

        column = Column(0, 'test', Text(), rows)
        longest = column.aggregate(MaxLength())

        # 'gobble' is the longest value, at six characters.
        self.assertEqual(longest, 6)
Exemplo n.º 2
0
    def test_max(self):
        """Max over a DateTime column keeps the DateTime type and returns the latest value."""
        latest = datetime.datetime(1994, 3, 3, 6, 31)
        moments = (
            latest,
            datetime.datetime(1994, 3, 3, 6, 30, 30),
            datetime.datetime(1994, 3, 3, 6, 30),
        )
        rows = [Row([moment], ['test']) for moment in moments]

        column = Column(0, 'test', DateTime(), rows)

        aggregate_type = Max().get_aggregate_data_type(column)
        self.assertIsInstance(aggregate_type, DateTime)
        self.assertEqual(column.aggregate(Max()), latest)
Exemplo n.º 3
0
    def compute(self, computations):
        """
        Build a new table that contains every existing column plus one new
        column per supplied :class:`.Computation`.

        :param computations:
            A sequence of ``(new_column_name, computation)`` pairs. It is
            iterated more than once, so it must be re-iterable.
        :returns:
            A new :class:`Table`.
        """
        names = list(copy(self._column_names))
        data_types = list(copy(self._column_types))

        # Register each new column and let its computation validate itself
        # against this table before anything is run.
        for name, computation in computations:
            names.append(name)
            data_types.append(computation.get_computed_data_type(self))

            computation.validate(self)

        # One sequence of computed values per computation, indexable by row
        # position.
        computed = tuple(computation.run(self) for _, computation in computations)

        new_rows = [
            Row(tuple(row) + tuple(values[i] for values in computed), names)
            for i, row in enumerate(self._rows)
        ]

        return self._fork(new_rows, names, data_types)
Exemplo n.º 4
0
    def compute(self, computations):
        """
        Build a new table with extra columns, each produced by running a
        :class:`.Computation` against every row.

        :param computations: An iterable of ``(computation, new_column_name)``
            pairs, with the :class:`.Computation` instance first. It is
            iterated again for every row, so it must be re-iterable.
        :returns: A new :class:`Table`.
        :raises ValueError: If the first element of a pair is not a
            :class:`.Computation` instance.
        """
        names = list(copy(self._column_names))
        data_types = list(copy(self._column_types))

        for computation, name in computations:
            if not isinstance(computation, Computation):
                raise ValueError('The first element in pair must be a Computation instance.')

            names.append(name)
            data_types.append(computation.get_computed_data_type(self))

            # Give the computation a chance to set itself up against this
            # table before it is run row by row.
            computation.prepare(self)

        def extend(row):
            # Original values followed by one computed value per computation.
            computed = tuple(computation.run(row) for computation, _ in computations)
            return Row(tuple(row) + computed, names)

        new_rows = [extend(row) for row in self._rows]

        return self._fork(new_rows, zip(names, data_types))
Exemplo n.º 5
0
    def merge(cls, tables):
        """
        Combine several tables that share the same column types into one
        table.

        Column names may differ between tables; the names of the first table
        are used for the result. The row names of the first table are passed
        on to the new table.

        :param tables:
            A sequence of :class:`Table` instances.
        :returns:
            A new :class:`Table`.
        :raises ValueError:
            If any table's column types differ from those of the first table.
        """
        first = tables[0]
        column_names = first.column_names
        column_types = first.column_types

        for other in tables[1:]:
            if other.column_types != column_types:
                raise ValueError(
                    'Only tables with identical column types may be merged.')

        rows = []

        for table in tables:
            if table.column_names == column_names:
                # Same names: the existing rows can be reused untouched.
                rows.extend(table.rows)
            else:
                # Different names: re-key each row with the first table's names.
                rows.extend(
                    Row(row.values(), column_names) for row in table.rows)

        return Table(rows,
                     column_names,
                     column_types,
                     row_names=first.row_names,
                     _is_fork=True)
Exemplo n.º 6
0
def compute(self, computations, replace=False):
    """
    Create a new table by applying one or more :class:`.Computation` instances
    to each row.

    :param computations:
        A sequence of pairs of new column names and :class:`.Computation`
        instances. It is iterated twice, so it must be re-iterable.
    :param replace:
        If :code:`True` then new column names can match existing names, and
        those columns will be replaced with the computed data.
    :returns:
        A new :class:`.Table`.
    :raises ValueError:
        If a new column name matches an existing column name and
        :code:`replace` is not set.
    """
    column_names = list(copy(self._column_names))
    column_types = list(copy(self._column_types))

    for new_column_name, computation in computations:
        new_column_type = computation.get_computed_data_type(self)

        if new_column_name in column_names:
            if not replace:
                # Interpolate the offending name; a one-element tuple keeps
                # the formatting safe whatever the name's type is.
                raise ValueError(
                    'New column name "%s" already exists. Specify replace=True to replace with computed data.'
                    % (new_column_name,)
                )

            # Replacing: keep the column's position, only its type changes.
            i = column_names.index(new_column_name)
            column_types[i] = new_column_type
        else:
            column_names.append(new_column_name)
            column_types.append(new_column_type)

        computation.validate(self)

    # Computed values keyed by column name, in the order they were requested.
    new_columns = OrderedDict()

    for new_column_name, computation in computations:
        new_columns[new_column_name] = computation.run(self)

    new_rows = []

    for i, row in enumerate(self._rows):
        # Slow version if using replace: every output column is looked up to
        # decide whether it comes from the computed data or the original row.
        if replace:
            values = []

            for j, column_name in enumerate(column_names):
                if column_name in new_columns:
                    values.append(new_columns[column_name][i])
                else:
                    values.append(row[j])
        # Faster version if not using replace: computed values are simply
        # appended after the original ones.
        else:
            values = row.values() + tuple(c[i] for c in new_columns.values())

        new_rows.append(Row(values, column_names))

    return self._fork(new_rows, column_names, column_types)
Exemplo n.º 7
0
    def merge(self, groups=None, group_name=None, group_type=None):
        """
        Flatten this TableSet into one table whose first column identifies
        the table each row came from. This is the inverse of
        :meth:`.Table.group_by`.

        Any `row_names` set on the merged tables will be lost in this
        process.

        :param groups:
            A list of grouping factors, one per :class:`Table` in the
            :class:`TableSet`, to store in the new first column. When omitted
            or None, each table's key is used as the grouping factor.
        :param group_name:
            Column name for the grouping factors. Falls back to
            :attr:`TableSet.key_name` when not given.
        :param group_type:
            Column type for the grouping factors. Falls back to
            :attr:`TableSet.key_type` when not given.
        :returns:
            A new :class:`Table`.
        :raises ValueError:
            If ``groups`` is neither None nor a list, or if its length
            differs from the number of tables.
        """
        if groups is not None:
            # Exact type check: only a plain list is accepted.
            if type(groups) is not list:
                raise ValueError('Groups must be None or a list.')

            if len(groups) != len(self):
                raise ValueError('Groups length must be equal to TableSet length.')

        # The grouping column is placed ahead of the existing columns.
        column_names = [group_name or self.key_name] + list(self.column_names)
        column_types = [group_type or self.key_type] + list(self.column_types)

        rows = []

        for index, (key, table) in enumerate(self.items()):
            label = key if groups is None else groups[index]

            rows.extend(
                Row((label, ) + tuple(row), column_names)
                for row in table.rows)

        return Table(rows, column_names, column_types)
Exemplo n.º 8
0
def merge(cls, tables, row_names=None, column_names=None):
    """
    Create a new table from a sequence of similar tables.

    This method will not carry over row names from the merged tables, but new
    row names can be specified with the :code:`row_names` argument.

    It is possible to limit the columns included in the new :class:`.Table`
    with :code:`column_names` argument. For example, to only include columns
    from a specific table, set :code:`column_names` equal to
    :code:`table.column_names`.

    :param tables:
        A sequence of :class:`.Table` instances.
    :param row_names:
        See :class:`.Table` for the usage of this parameter.
    :param column_names:
        A sequence of column names to include in the new :class:`.Table`. If
        not specified, all distinct column names from `tables` are included.
    :returns:
        A new :class:`.Table`.
    :raises DataTypeError:
        If two tables have a column with the same name but a different type.
    """
    from agate.table import Table

    # Output columns in first-seen order, mapped to the type first seen.
    new_columns = OrderedDict()

    for table in tables:
        for column_name, column_type in zip(table.column_names, table.column_types):
            if column_names is not None and column_name not in column_names:
                continue

            if column_name in new_columns:
                if not isinstance(column_type, type(new_columns[column_name])):
                    raise DataTypeError('Tables contain columns with the same names, but different types.')
            else:
                new_columns[column_name] = column_type

    # Materialize as tuples. Dict views (or lists on Python 2) never compare
    # equal to a tuple, which would leave the fast path below unreachable.
    column_keys = tuple(new_columns.keys())
    column_types = tuple(new_columns.values())

    rows = []

    for table in tables:
        # Performance optimization for identical table structures
        if table.column_names == column_keys and table.column_types == column_types:
            rows.extend(table.rows)
        else:
            # Re-shape every row to the merged layout; columns this table
            # lacks are filled with None.
            for row in table.rows:
                data = [row.get(column_key, None) for column_key in column_keys]

                rows.append(Row(data, column_keys))

    return Table(rows, column_keys, column_types, row_names=row_names, _is_fork=True)
Exemplo n.º 9
0
    def _get_row(self, i):
        """
        Return the :class:`.Row` at position ``i``, caching it on first
        access so later requests reuse the same object.
        """
        cache = self._cached_rows

        if i in cache:
            return cache[i]

        stored = self._data[i]

        # Data that is already a Row (rows coming from a fork) is cached
        # directly; anything else is wrapped in a Row built from (self, i).
        cache[i] = stored if isinstance(stored, Row) else Row(self, i)

        return cache[i]
Exemplo n.º 10
0
    def select(self, selected_names):
        """
        Produce a table that keeps only the named columns, in the order given.

        :param selected_names: A sequence of names of the columns to keep. It
            is iterated once per row, so it must be re-iterable.
        :returns: A new :class:`Table`.
        """
        chosen_columns = [self.columns[name] for name in selected_names]

        reduced_rows = [
            Row(tuple(row[name] for name in selected_names), selected_names)
            for row in self._rows
        ]

        return self._fork(reduced_rows, chosen_columns)
Exemplo n.º 11
0
    def select(self, key):
        """
        Produce a table that keeps only the requested columns.

        :param key:
            A single column name, or a sequence of column names, to keep.
        :returns:
            A new :class:`.Table`.
        """
        # Normalise a lone column name into a one-element list.
        names = key if utils.issequence(key) else [key]

        selected_types = [self.columns[name].data_type for name in names]

        reduced_rows = [
            Row(tuple(row[name] for name in names), names)
            for row in self._rows
        ]

        return self._fork(reduced_rows, names, selected_types)
Exemplo n.º 12
0
def select(self, key):
    """
    Produce a table that keeps only the requested columns.

    :param key:
        A single column name, or a sequence of column names, to keep.
    :returns:
        A new :class:`.Table`.
    """
    # Normalise a lone column name into a one-element list.
    names = key if utils.issequence(key) else [key]

    # Resolve names to positions once so rows can be read by index.
    positions = tuple(self._column_names.index(name) for name in names)
    selected_types = tuple(self._column_types[p] for p in positions)

    # Each Row is handed a generator over the selected positions.
    reduced_rows = [Row((row[p] for p in positions), names) for row in self._rows]

    return self._fork(reduced_rows, names, selected_types)
Exemplo n.º 13
0
    def select(self, selected_column_names):
        """
        Produce a table with the same rows as this one, restricted to the
        columns named in ``selected_column_names``.

        :param selected_column_names:
            A sequence of names of the columns to keep. It is iterated once
            per row, so it must be re-iterable.
        :returns:
            A new :class:`Table`.
        """
        selected_types = []

        for name in selected_column_names:
            selected_types.append(self.columns[name].data_type)

        reduced_rows = [
            Row(tuple(row[name] for name in selected_column_names),
                selected_column_names)
            for row in self._rows
        ]

        return self._fork(reduced_rows, selected_column_names, selected_types)
Exemplo n.º 14
0
    def merge(cls, tables, row_names=None):
        """
        Combine several tables that share the same column types into one
        table.

        Column names may differ between tables; the names of the first table
        are used for the result.

        Row names of the merged tables are not carried over, but new ones can
        be supplied through ``row_names``.

        :param tables:
            A sequence of :class:`Table` instances.
        :param row_names:
            See :class:`Table` for the usage of this parameter.
        :returns:
            A new :class:`Table`.
        :raises ValueError:
            If any table's column types do not match those of the first
            table, position by position.
        """
        reference = tables[0]
        column_names = reference.column_names
        column_types = reference.column_types

        for other in tables[1:]:
            # zip_longest pads the shorter side with None, so a differing
            # number of columns also fails the isinstance check.
            pairs = zip_longest(other.column_types, column_types)

            if not all(isinstance(candidate, type(expected))
                       for candidate, expected in pairs):
                raise ValueError(
                    'Only tables with identical column types may be merged.')

        rows = []

        for table in tables:
            if table.column_names == column_names:
                # Same names: the existing rows can be reused untouched.
                rows.extend(table.rows)
            else:
                # Different names: re-key each row with the reference names.
                rows.extend(
                    Row(row.values(), column_names) for row in table.rows)

        return Table(rows,
                     column_names,
                     column_types,
                     row_names=row_names,
                     _is_fork=True)
Exemplo n.º 15
0
    def test_all(self):
        """All is False when a value is None and True when every value is True."""
        cases = (
            ((True, True, None), False),
            ((True, True, True), True),
        )

        for values, expected in cases:
            rows = [Row([value], ['test']) for value in values]
            column = Column(0, 'test', Boolean(), rows)

            self.assertEqual(column.aggregate(All()), expected)
Exemplo n.º 16
0
    def merge(self):
        """
        Flatten this TableSet into one table whose first column holds the key
        of the table each row came from. This is the inverse of
        :meth:`.Table.group_by`.

        Any :code:`row_names` set on the merged tables will be lost in this
        process.

        :returns: A new :class:`Table`.
        """
        # The key column is placed ahead of the existing columns.
        column_names = [self.key_name] + list(self.column_names)
        column_types = [self.key_type] + list(self.column_types)

        rows = [
            Row((key,) + tuple(row), column_names)
            for key, table in self.items()
            for row in table.rows
        ]

        return Table(rows, column_names, column_types)
Exemplo n.º 17
0
    def join(self, right_table, left_key, right_key=None, inner=False):
        """
        Join this table to :code:`right_table`, in the manner of SQL's "left
        outer join": columns from both tables are combined wherever the value
        produced by :code:`left_key` equals the value produced by
        :code:`right_key`.

        Left rows without a match are kept, with the right-hand columns set
        to :code:`None`, unless :code:`inner` is specified.

        When :code:`right_key` is a column name, that right-hand column is
        left out of the output table.

        Column names from the right table which also exist in this table are
        suffixed "2" in the new table.

        :param right_table:
            The "right" table to join to.
        :param left_key:
            Either the name of a column from this table to join on, or a
            :class:`function` that takes a row and returns a value to join on.
        :param right_key:
            Either the name of a column from :code:`right_table` to join on,
            or a :class:`function` that takes a row and returns a value to
            join on. If :code:`None` then :code:`left_key` is used for both.
        :param inner:
            Perform a SQL-style "inner join" instead of a left outer join.
            Rows which have no match for :code:`left_key` are not included in
            the output table.
        :returns:
            A new :class:`Table`.
        """
        if right_key is None:
            right_key = left_key

        # A key is either a row function (anything callable) or a column name.
        left_is_function = hasattr(left_key, '__call__')
        right_is_function = hasattr(right_key, '__call__')

        # Values to join on, one per row of each table.
        if left_is_function:
            left_data = [left_key(row) for row in self.rows]
        else:
            left_data = self._columns[left_key].values()

        # Position of the right-hand key column; stays None for row functions.
        right_key_index = None

        if right_is_function:
            right_data = [right_key(row) for row in right_table.rows]
        else:
            right_data = right_table.columns[right_key].values()
            right_key_index = right_table.columns._keys.index(right_key)

        # Output schema: all left columns, then the right columns minus the
        # right-hand key column.
        column_names = list(self._column_names)
        column_types = list(self._column_types)

        for column in right_table.columns:
            name = column.name

            if name == right_key:
                continue

            clashes = name in self._column_names
            column_names.append(('%s2' % name) if clashes else name)
            column_types.append(column.data_type)

        # Index right-hand rows by join value for constant-time lookup.
        right_hash = {}

        for position, value in enumerate(right_data):
            right_hash.setdefault(value, []).append(right_table._rows[position])

        # None placeholders for the right-hand columns of unmatched left rows.
        blanks = [
            None for k, _ in enumerate(right_table.column_names)
            if k != right_key_index
        ]

        keep_row_names = self._row_names is not None
        row_names = [] if keep_row_names else None
        rows = []

        for left_index, left_value in enumerate(left_data):
            matching_rows = right_hash.get(left_value, None)

            if matching_rows:
                # One output row per matching right-hand row.
                combined = [
                    list(self._rows[left_index]) +
                    [v for k, v in enumerate(right_row) if k != right_key_index]
                    for right_row in matching_rows
                ]
            elif not inner:
                # Unmatched left row: pad the right-hand side with None.
                combined = [list(self._rows[left_index]) + blanks]
            else:
                continue

            for new_row in combined:
                rows.append(Row(new_row, column_names))

                if keep_row_names:
                    row_names.append(self._row_names[left_index])

        return self._fork(rows,
                          column_names,
                          column_types,
                          row_names=row_names)
Exemplo n.º 18
0
def join(self, right_table, left_key=None, right_key=None, inner=False, full_outer=False, require_match=False,
         columns=None):
    """
    Create a new table by joining two tables on common values. This method
    implements most varieties of SQL join, in addition to some unique features.

    If :code:`left_key` and :code:`right_key` are both :code:`None` then this
    method will perform a "sequential join", which is to say it will join on row
    number. The :code:`inner` and :code:`full_outer` arguments will determine
    whether dangling left-hand and right-hand rows are included, respectively.

    If :code:`left_key` is specified, then a "left outer join" will be
    performed. This will combine columns from the :code:`right_table` anywhere
    that :code:`left_key` and :code:`right_key` are equal. Unmatched rows from
    the left table will be included with the right-hand columns set to
    :code:`None`.

    If :code:`inner` is :code:`True` then an "inner join" will be performed.
    Unmatched rows from either table will be left out.

    If :code:`full_outer` is :code:`True` then a "full outer join" will be
    performed. Unmatched rows from both tables will be included, with the
    columns in the other table set to :code:`None`.

    In all cases, if :code:`right_key` is :code:`None` then :code:`left_key`
    will be used for both tables.

    If :code:`left_key` and :code:`right_key` are column names, the right-hand
    identifier column will not be included in the output table.

    If :code:`require_match` is :code:`True` unmatched rows will raise an
    exception. This is like an "inner join" except any row that doesn't have a
    match will raise an exception instead of being dropped. This is useful for
    enforcing expectations about datasets that should match.

    Column names from the right table which also exist in this table will
    be suffixed "2" in the new table.

    A subset of columns from the right-hand table can be included in the joined
    table using the :code:`columns` argument.

    :param right_table:
        The "right" table to join to.
    :param left_key:
        Either the name of a column from this table to join on, the index
        of a column, a sequence of such column identifiers, a
        :class:`function` that takes a row and returns a value to join on, or
        :code:`None` in which case the tables will be joined on row number.
    :param right_key:
        Either the name of a column from :code:`right_table` to join on, the
        index of a column, a sequence of such column identifiers, or a
        :class:`function` that takes a row and returns a value to join on. If
        :code:`None` then :code:`left_key` will be used for both. If
        :code:`left_key` is :code:`None` then this value is ignored.
    :param inner:
        Perform a SQL-style "inner join" instead of a left outer join. Rows
        which have no match for :code:`left_key` will not be included in
        the output table.
    :param full_outer:
        Perform a SQL-style "full outer" join rather than a left or a right.
        May not be used in combination with :code:`inner`.
    :param require_match:
        If true, an exception will be raised if there is a left_key with no
        matching right_key.
    :param columns:
        A sequence of column names from :code:`right_table` to include in
        the final output table. Defaults to all columns not in
        :code:`right_key`. Ignored when :code:`full_outer` is :code:`True`.
    :returns:
        A new :class:`.Table`.
    :raises ValueError:
        If both :code:`inner` and :code:`full_outer` are set, or if
        :code:`require_match` is set and a left key has no matching right
        key.
    """
    if inner and full_outer:
        raise ValueError('A join can not be both "inner" and "full_outer".')

    if right_key is None:
        right_key = left_key

    # Get join columns
    right_key_indices = []

    left_key_is_func = hasattr(left_key, '__call__')
    left_key_is_sequence = utils.issequence(left_key)

    # Left key is None
    if left_key is None:
        left_data = tuple(range(len(self._rows)))
    # Left key is a function
    elif left_key_is_func:
        left_data = [left_key(row) for row in self._rows]
    # Left key is a sequence
    elif left_key_is_sequence:
        left_columns = [self._columns[key] for key in left_key]
        # Materialize the zip: left_data is walked once for the main join and
        # again to build the full outer join's lookup set, and a Python 3
        # zip iterator is empty after the first pass.
        left_data = list(zip(*[column.values() for column in left_columns]))
    # Left key is a column name/index
    else:
        left_data = self._columns[left_key].values()

    right_key_is_func = hasattr(right_key, '__call__')
    right_key_is_sequence = utils.issequence(right_key)

    # Sequential join
    if left_key is None:
        right_data = tuple(range(len(right_table._rows)))
    # Right key is a function
    elif right_key_is_func:
        right_data = [right_key(row) for row in right_table._rows]
    # Right key is a sequence
    elif right_key_is_sequence:
        right_columns = [right_table._columns[key] for key in right_key]
        # Materialized for the same reason as left_data: it is walked once to
        # build the hash and again during a full outer join.
        right_data = list(zip(*[column.values() for column in right_columns]))
        right_key_indices = [right_table._columns._keys.index(key) for key in right_key]
    # Right key is a column name/index
    else:
        right_column = right_table._columns[right_key]
        right_data = right_column.values()
        right_key_indices = [right_table._columns.index(right_column)]

    # Build names and type lists
    column_names = list(self._column_names)
    column_types = list(self._column_types)

    for i, column in enumerate(right_table._columns):
        name = column.name

        if not full_outer:
            # Without an explicit column list the right-hand key columns are
            # dropped; with one, only the listed columns are kept.
            if columns is None and i in right_key_indices:
                continue

            if columns is not None and name not in columns:
                continue

        if name in self.column_names:
            column_names.append('%s2' % name)
        else:
            column_names.append(name)

        column_types.append(column.data_type)

    # Narrow the right table to the requested columns. right_data was taken
    # from the full table above, so the join keys are unaffected.
    if columns is not None and not full_outer:
        right_table = right_table.select([n for n in right_table._column_names if n in columns])

    # Index right-hand rows by join value.
    right_hash = {}

    for i, value in enumerate(right_data):
        if value not in right_hash:
            right_hash[value] = []

        right_hash[value].append(right_table._rows[i])

    # Collect new rows
    rows = []

    # Row names are only carried over when every output row stems from a
    # left-hand row, i.e. not for a full outer join.
    if self._row_names is not None and not full_outer:
        row_names = []
    else:
        row_names = None

    # Iterate over left column
    for left_index, left_value in enumerate(left_data):
        matching_rows = right_hash.get(left_value, None)

        if require_match and matching_rows is None:
            # Wrap in a one-element tuple: left_value is itself a tuple for
            # sequence keys, which would otherwise be unpacked by "%".
            raise ValueError('Left key "%s" does not have a matching right key.' % (left_value,))

        # Rows with matches
        if matching_rows:
            for right_row in matching_rows:
                new_row = list(self._rows[left_index])

                for k, v in enumerate(right_row):
                    if columns is None and k in right_key_indices and not full_outer:
                        continue

                    new_row.append(v)

                rows.append(Row(new_row, column_names))

                if self._row_names is not None and not full_outer:
                    row_names.append(self._row_names[left_index])
        # Rows without matches
        elif not inner:
            new_row = list(self._rows[left_index])

            for k, v in enumerate(right_table._column_names):
                if columns is None and k in right_key_indices and not full_outer:
                    continue

                new_row.append(None)

            rows.append(Row(new_row, column_names))

            if self._row_names is not None and not full_outer:
                row_names.append(self._row_names[left_index])

    # Full outer join: append right-hand rows whose key never appeared on the
    # left, padded with None for every left-hand column.
    if full_outer:
        left_set = set(left_data)

        for right_index, right_value in enumerate(right_data):
            if right_value in left_set:
                continue

            new_row = ([None] * len(self._columns)) + list(right_table.rows[right_index])

            rows.append(Row(new_row, column_names))

    return self._fork(rows, column_names, column_types, row_names=row_names)
Exemplo n.º 19
0
    def __init__(self,
                 rows,
                 column_names=None,
                 column_types=None,
                 row_names=None,
                 _is_fork=False):
        """
        Build the table's column names, column types, rows and columns.

        :param rows:
            A sequence of rows. A string is rejected with a hint to use
            ``Table.from_csv``.
        :param column_names:
            Names for the columns. When falsy and ``rows`` is non-empty,
            names are generated with ``utils.letter_name`` from the length of
            the first row and a :class:`RuntimeWarning` is issued.
        :param column_types:
            ``None``, a dict whose values are ``DataType`` instances, a
            ``TypeTester``, or an iterable of ``DataType`` instances.
        :param row_names:
            A column name, a function taking a row, or a sequence of names.
            Names of type ``int`` are rejected.
        :param _is_fork:
            When true, ``rows`` is used as given, without length checks or
            casting.
        :raises ValueError:
            On a string ``rows`` argument, invalid column types, mismatched
            name/type counts, rows longer than the column count, or invalid
            ``row_names``.
        :raises CastError:
            Re-raised with row and column information when a value cannot be
            cast.
        """
        if isinstance(rows, six.string_types):
            raise ValueError(
                'When created directly, the first argument to Table must be a sequence of rows. '
                'Did you want agate.Table.from_csv?')

        # Validate column names
        if column_names:
            # NOTE(review): presumably makes the supplied names unique --
            # confirm against utils.deduplicate.
            self._column_names = utils.deduplicate(column_names,
                                                   column_names=True)
        elif rows:
            # No names given: generate one per value in the first row.
            self._column_names = tuple(
                utils.letter_name(i) for i in range(len(rows[0])))
            warnings.warn(
                'Column names not specified. "%s" will be used as names.' %
                str(self._column_names),
                RuntimeWarning,
                stacklevel=2)
        else:
            # No names and no rows: the table has no columns.
            self._column_names = tuple()

        len_column_names = len(self._column_names)

        # Validate column_types
        if column_types is None:
            column_types = TypeTester()
        elif isinstance(column_types, dict):
            for v in column_types.values():
                if not isinstance(v, DataType):
                    raise ValueError(
                        'Column types must be instances of DataType.')

            # The dict is handed to the TypeTester as its ``force`` argument.
            column_types = TypeTester(force=column_types)
        elif not isinstance(column_types, TypeTester):
            # Any other non-TypeTester value is treated as an iterable of types.
            for column_type in column_types:
                if not isinstance(column_type, DataType):
                    raise ValueError(
                        'Column types must be instances of DataType.')

        if isinstance(column_types, TypeTester):
            # The tester's result for these rows and names becomes the
            # column types.
            self._column_types = column_types.run(rows, self._column_names)
        else:
            self._column_types = tuple(column_types)

        if len_column_names != len(self._column_types):
            raise ValueError(
                'column_names and column_types must be the same length.')

        if not _is_fork:
            new_rows = []
            # One cast function per column, looked up once outside the loop.
            cast_funcs = [c.cast for c in self._column_types]

            for i, row in enumerate(rows):
                len_row = len(row)

                if len_row > len_column_names:
                    raise ValueError(
                        'Row %i has %i values, but Table only has %i columns.'
                        % (i, len_row, len_column_names))
                elif len(row) < len_column_names:
                    # Short rows are padded with None up to the column count.
                    row = chain(row, [None] * (len_column_names - len_row))

                row_values = []
                for j, d in enumerate(row):
                    try:
                        row_values.append(cast_funcs[j](d))
                    except CastError as e:
                        # Re-raise with the row index and column name appended.
                        raise CastError(
                            str(e) + ' Error at row %s column %s.' %
                            (i, self._column_names[j]))

                new_rows.append(Row(row_values, self._column_names))
        else:
            # Forked tables use the supplied rows as-is.
            new_rows = rows

        if row_names:
            computed_row_names = []

            if isinstance(row_names, six.string_types):
                # A string is a column name: each row's value in that column
                # is its name.
                for row in new_rows:
                    name = row[row_names]
                    computed_row_names.append(name)
            elif hasattr(row_names, '__call__'):
                # A callable is applied to each row to produce its name.
                for row in new_rows:
                    name = row_names(row)
                    computed_row_names.append(name)
            elif utils.issequence(row_names):
                # A sequence is used directly as the row names.
                computed_row_names = row_names
            else:
                raise ValueError(
                    'row_names must be a column name, function or sequence')

            for row_name in computed_row_names:
                # Exact type check: only plain int is rejected here.
                if type(row_name) is int:
                    raise ValueError(
                        'Row names cannot be of type int. Use Decimal for numbered row names.'
                    )

            self._row_names = tuple(computed_row_names)
        else:
            self._row_names = None

        # Rows are wrapped together with the row names (None when unset).
        self._rows = MappedSequence(new_rows, self._row_names)

        # Build columns
        new_columns = []

        for i in range(len_column_names):
            name = self._column_names[i]
            data_type = self._column_types[i]

            # Each Column is given its position, name, type, the row
            # container and the row names.
            column = Column(i,
                            name,
                            data_type,
                            self._rows,
                            row_names=self._row_names)

            new_columns.append(column)

        # Columns are wrapped together with the column names.
        self._columns = MappedSequence(new_columns, self._column_names)
Exemplo n.º 20
0
    def __init__(self,
                 rows,
                 column_names=None,
                 column_types=None,
                 row_names=None,
                 _is_fork=False):
        """
        Build a table from row data, column names and column types.

        :param rows:
            An indexable sequence of row sequences. Unless :code:`_is_fork`
            is set, each value is passed through the :code:`cast` method of
            its column's data type and rows shorter than the column count
            are padded with :code:`None`.
        :param column_names:
            A sequence of column names. Each entry must be a string or
            :code:`None`; a :code:`None` entry is replaced by
            :code:`utils.letter_name` of its position. If omitted or empty,
            every name is generated that way from the width of the first
            row, and a table without rows gets no columns.
        :param column_types:
            A sequence of :class:`DataType` instances, one per column, or a
            :class:`TypeTester` whose :code:`run` method is called with the
            rows and the column names. Defaults to a new
            :class:`TypeTester`.
        :param row_names:
            The name of a column whose values become the row names, a
            function that takes a row and returns its name, or a sequence
            of names. If omitted or empty the table has no row names.
        :param _is_fork:
            When true, :code:`rows` is used as given, without casting its
            values or wrapping them in :class:`Row`.
        :raises ValueError:
            If a column name is neither a string nor :code:`None`, if
            column names are duplicated, if a column type is not a
            :class:`DataType`, if the number of names and types differ, if
            a row has more values than there are columns, or if
            :code:`row_names` is of an unsupported kind.
        """
        # Validate column names
        if column_names:
            final_column_names = []

            for i, column_name in enumerate(column_names):
                if column_name is None:
                    final_column_names.append(utils.letter_name(i))
                elif isinstance(column_name, six.string_types):
                    final_column_names.append(column_name)
                else:
                    raise ValueError('Column names must be strings or None.')

            if len(set(final_column_names)) != len(final_column_names):
                raise ValueError('Duplicate column names are not allowed.')

            self._column_names = tuple(final_column_names)
        else:
            try:
                first_row = rows[0]
            except IndexError:
                # No rows and no names: build an empty table instead of
                # failing while looking at a first row that does not exist.
                self._column_names = tuple()
            else:
                self._column_names = tuple(
                    utils.letter_name(i) for i in range(len(first_row)))

        len_column_names = len(self._column_names)

        # Validate column_types
        if column_types is None:
            column_types = TypeTester()
        elif not isinstance(column_types, TypeTester):
            for column_type in column_types:
                if not isinstance(column_type, DataType):
                    raise ValueError(
                        'Column types must be instances of DataType.')

        if isinstance(column_types, TypeTester):
            self._column_types = column_types.run(rows, self._column_names)
        else:
            self._column_types = tuple(column_types)

        if len_column_names != len(self._column_types):
            raise ValueError(
                'column_names and column_types must be the same length.')

        if not _is_fork:
            new_rows = []
            cast_funcs = [c.cast for c in self._column_types]

            for i, row in enumerate(rows):
                len_row = len(row)

                if len_row > len_column_names:
                    raise ValueError(
                        'Row %i has %i values, but Table only has %i columns.'
                        % (i, len_row, len_column_names))
                elif len_row < len_column_names:
                    # Pad short rows so every column has a value to cast.
                    row = chain(row, [None] * (len_column_names - len_row))

                new_rows.append(
                    Row(tuple(cast_funcs[j](d) for j, d in enumerate(row)),
                        self._column_names))
        else:
            new_rows = rows

        if row_names:
            computed_row_names = []

            if isinstance(row_names, six.string_types):
                # A column name: each row is named by its value there.
                for row in new_rows:
                    computed_row_names.append(row[row_names])
            elif hasattr(row_names, '__call__'):
                for row in new_rows:
                    computed_row_names.append(row_names(row))
            elif isinstance(row_names, Sequence):
                computed_row_names = row_names
            else:
                raise ValueError(
                    'row_names must be a column name, function or sequence')

            self._row_names = tuple(computed_row_names)
        else:
            self._row_names = None

        self._rows = MappedSequence(new_rows, self._row_names)

        # Build columns
        new_columns = []

        for i, (name, data_type) in enumerate(
                zip(self._column_names, self._column_types)):
            column = Column(i,
                            name,
                            data_type,
                            self._rows,
                            row_names=self._row_names)
            new_columns.append(column)

        self._columns = MappedSequence(new_columns, self._column_names)
Exemplo n.º 21
0
def join(self,
         right_table,
         left_key,
         right_key=None,
         inner=False,
         require_match=False,
         columns=None):
    """
    Create a new table by joining two tables on common values.

    This method performs the equivalent of SQL's "left outer join", combining
    columns from this table and from :code:`right_table` anywhere that the
    :code:`left_key` and :code:`right_key` are equivalent.

    Where there is no match for :code:`left_key` the left columns will
    be included with the right columns set to :code:`None` unless
    the :code:`inner` argument is specified.

    If :code:`right_key` names one or more columns and :code:`columns` is
    not given, those right key columns are left out of the output table, so
    only the left table's copy of the key is kept.

    Column names from the right table which also exist in this table will
    be suffixed "2" in the new table.

    :param right_table:
        The "right" table to join to.
    :param left_key:
        Either the name of a column from this table to join on, a
        sequence of such column names, or a :class:`function` that takes a
        row and returns a value to join on.
    :param right_key:
        Either the name of a column from :code:`right_table` to join on, a
        sequence of such column names, or a :class:`function` that takes a
        row and returns a value to join on. If :code:`None` then
        :code:`left_key` will be used for both.
    :param inner:
        Perform a SQL-style "inner join" instead of a left outer join. Rows
        which have no match for :code:`left_key` will not be included in
        the output table.
    :param require_match:
        If true, an exception will be raised if there is a left_key with no
        matching right_key.
    :param columns:
        A sequence of column names from :code:`right_table` to include in
        the final output table. Defaults to all columns not in
        :code:`right_key`.
    :returns:
        A new :class:`.Table`.
    :raises ValueError:
        If :code:`require_match` is true and a left key value has no
        matching right key value.
    """
    if right_key is None:
        right_key = left_key

    # Left join values: one entry per row of this table.
    if hasattr(left_key, '__call__'):
        left_data = [left_key(row) for row in self.rows]
    elif utils.issequence(left_key):
        left_columns = [self.columns[key] for key in left_key]
        left_data = zip(*[column.values() for column in left_columns])
    else:
        # Left key is a column name/index
        left_data = self.columns[left_key].values()

    # Positions of the right key column(s) within the right table. They are
    # resolved through the fetched column's name, so a key given as a
    # column index is located just like one given as a column name.
    right_names = list(right_table.column_names)
    right_key_indices = []

    if hasattr(right_key, '__call__'):
        right_data = [right_key(row) for row in right_table.rows]
    elif utils.issequence(right_key):
        right_columns = [right_table.columns[key] for key in right_key]
        right_data = zip(*[column.values() for column in right_columns])
        right_key_indices = [
            right_names.index(column.name) for column in right_columns
        ]
    else:
        # Right key is a column name/index
        right_column = right_table.columns[right_key]
        right_data = right_column.values()
        right_key_indices = [right_names.index(right_column.name)]

    # Build names and type lists
    column_names = list(self.column_names)
    column_types = list(self.column_types)

    for i, column in enumerate(right_table.columns):
        name = column.name

        if columns is None and i in right_key_indices:
            continue

        if columns is not None and name not in columns:
            continue

        if name in self.column_names:
            column_names.append('%s2' % name)
        else:
            column_names.append(name)

        column_types.append(column.data_type)

    if columns is not None:
        # From here on the right table only holds the requested columns,
        # kept in the right table's own column order.
        right_table = right_table.select(
            [n for n in right_table.column_names if n in columns])

    # Right-row positions omitted from the output: the key columns, but only
    # when no explicit column selection was made.
    if columns is None:
        skipped = set(right_key_indices)
    else:
        skipped = set()

    # Number of None values appended to a left row that has no match.
    padding_width = sum(
        1 for k, _ in enumerate(right_table.column_names)
        if k not in skipped)

    # Index the right rows by their join value.
    right_hash = {}

    for i, value in enumerate(right_data):
        right_hash.setdefault(value, []).append(right_table.rows[i])

    # Collect new rows
    rows = []

    if self.row_names is not None:
        row_names = []
    else:
        row_names = None

    # Iterate over left column
    for left_index, left_value in enumerate(left_data):
        matching_rows = right_hash.get(left_value, None)

        if require_match and matching_rows is None:
            raise ValueError(
                'Left key "%s" does not have a matching right key.' %
                left_value)

        # Rows with matches: one output row per matching right row
        if matching_rows:
            for right_row in matching_rows:
                new_row = list(self.rows[left_index])
                new_row.extend(
                    v for k, v in enumerate(right_row) if k not in skipped)

                rows.append(Row(new_row, column_names))

                if self.row_names is not None:
                    row_names.append(self.row_names[left_index])
        # Rows without matches
        elif not inner:
            new_row = list(self.rows[left_index]) + [None] * padding_width

            rows.append(Row(new_row, column_names))

            if self.row_names is not None:
                row_names.append(self.row_names[left_index])

    return self._fork(rows, column_names, column_types, row_names=row_names)
Exemplo n.º 22
0
    def __init__(self,
                 rows,
                 column_names=None,
                 column_types=None,
                 row_names=None,
                 _is_fork=False):
        """
        Build a table from row data, column names and column types.

        :param rows:
            A sequence of row sequences. Unless :code:`_is_fork` is set,
            each value is passed through the :code:`cast` method of its
            column's data type and rows shorter than the column count are
            padded with :code:`None`.
        :param column_names:
            A sequence of column names. Each entry must be a string or
            :code:`None`; a :code:`None` entry is replaced by
            :code:`utils.letter_name` of its position and a
            :class:`RuntimeWarning` is issued. A name already used by an
            earlier column gets a numeric suffix (:code:`_2`, :code:`_3`,
            ...) and is reported through :code:`warn_duplicate_column`. If
            omitted or empty, names are generated from the width of the
            first row with a warning; a table without rows gets no columns.
        :param column_types:
            A sequence of :class:`DataType` instances, one per column; a
            :class:`TypeTester`; or a dict whose values are
            :class:`DataType` instances, which is passed as :code:`force`
            to a new :class:`TypeTester`. Defaults to a new
            :class:`TypeTester`.
        :param row_names:
            The name of a column whose values become the row names, a
            function that takes a row and returns its name, or a sequence
            of names. If omitted or empty the table has no row names.
        :param _is_fork:
            When true, :code:`rows` is used as given, without casting its
            values or wrapping them in :class:`Row`.
        :raises ValueError:
            If :code:`rows` is a string, if a column name is neither a
            string nor :code:`None`, if a column type is not a
            :class:`DataType`, if the number of names and types differ, if
            a row has more values than there are columns, or if
            :code:`row_names` is of an unsupported kind.
        """
        if isinstance(rows, six.string_types):
            raise ValueError(
                'When created directly, the first argument to Table must be a sequence of rows. Did you want agate.Table.from_csv?'
            )

        # Validate column names
        if column_names:
            final_column_names = []

            for i, column_name in enumerate(column_names):
                if column_name is None:
                    new_column_name = utils.letter_name(i)
                    warnings.warn(
                        'Column name not specified. "%s" will be used as name.'
                        % new_column_name, RuntimeWarning)
                elif isinstance(column_name, six.string_types):
                    new_column_name = column_name
                else:
                    raise ValueError('Column names must be strings or None.')

                # Append "_2", "_3", ... until the name is unused.
                final_column_name = new_column_name
                duplicates = 0
                while final_column_name in final_column_names:
                    final_column_name = new_column_name + '_' + str(
                        duplicates + 2)
                    duplicates += 1

                if duplicates > 0:
                    warn_duplicate_column(new_column_name, final_column_name)

                final_column_names.append(final_column_name)

            self._column_names = tuple(final_column_names)
        elif rows:
            self._column_names = tuple(
                utils.letter_name(i) for i in range(len(rows[0])))
            warnings.warn(
                'Column names not specified. "%s" will be used as names.' %
                str(self._column_names),
                RuntimeWarning,
                stacklevel=2)
        else:
            # Keep the attribute a tuple, as in every other branch.
            self._column_names = tuple()

        len_column_names = len(self._column_names)

        # Validate column_types
        if column_types is None:
            column_types = TypeTester()

        elif isinstance(column_types, dict):
            for v in six.itervalues(column_types):
                if not isinstance(v, DataType):
                    raise ValueError(
                        'Column types must be instances of DataType.')
            column_types = TypeTester(force=column_types)

        elif not isinstance(column_types, TypeTester):
            for column_type in column_types:
                if not isinstance(column_type, DataType):
                    raise ValueError(
                        'Column types must be instances of DataType.')

        if isinstance(column_types, TypeTester):
            self._column_types = column_types.run(rows, self._column_names)
        else:
            self._column_types = tuple(column_types)

        if len_column_names != len(self._column_types):
            raise ValueError(
                'column_names and column_types must be the same length.')

        if not _is_fork:
            new_rows = []
            cast_funcs = [c.cast for c in self._column_types]

            for i, row in enumerate(rows):
                len_row = len(row)

                if len_row > len_column_names:
                    raise ValueError(
                        'Row %i has %i values, but Table only has %i columns.'
                        % (i, len_row, len_column_names))
                elif len_row < len_column_names:
                    # Pad short rows so every column has a value to cast.
                    row = chain(row, [None] * (len_column_names - len_row))

                new_rows.append(
                    Row(tuple(cast_funcs[j](d) for j, d in enumerate(row)),
                        self._column_names))
        else:
            new_rows = rows

        if row_names:
            computed_row_names = []

            if isinstance(row_names, six.string_types):
                # A column name: each row is named by its value there.
                for row in new_rows:
                    computed_row_names.append(row[row_names])
            elif hasattr(row_names, '__call__'):
                for row in new_rows:
                    computed_row_names.append(row_names(row))
            elif utils.issequence(row_names):
                computed_row_names = row_names
            else:
                raise ValueError(
                    'row_names must be a column name, function or sequence')

            self._row_names = tuple(computed_row_names)
        else:
            self._row_names = None

        self._rows = MappedSequence(new_rows, self._row_names)

        # Build columns
        new_columns = []

        for i, (name, data_type) in enumerate(
                zip(self._column_names, self._column_types)):
            column = Column(i,
                            name,
                            data_type,
                            self._rows,
                            row_names=self._row_names)
            new_columns.append(column)

        self._columns = MappedSequence(new_columns, self._column_names)
Exemplo n.º 23
0
def normalize(self,
              key,
              properties,
              property_column='property',
              value_column='value',
              column_types=None):
    """
    Create a new table with columns converted into rows values.

    For example:

    +---------+----------+--------+-------+
    |  name   | gender   | race   | age   |
    +=========+==========+========+=======+
    |  Jane   | female   | black  | 24    |
    +---------+----------+--------+-------+
    |  Jack   | male     | white  | 35    |
    +---------+----------+--------+-------+
    |  Joe    | male     | black  | 28    |
    +---------+----------+--------+-------+

    can be normalized on columns 'gender', 'race' and 'age':

    +---------+-----------+---------+
    |  name   | property  | value   |
    +=========+===========+=========+
    |  Jane   | gender    | female  |
    +---------+-----------+---------+
    |  Jane   | race      | black   |
    +---------+-----------+---------+
    |  Jane   | age       | 24      |
    +---------+-----------+---------+
    |  ...    |  ...      |  ...    |
    +---------+-----------+---------+

    This is the opposite of :meth:`.Table.denormalize`.

    :param key:
        A column name or a sequence of column names that should be
        maintained as they are in the normalized table. Typically these
        are the table's unique identifiers and any metadata about them.
    :param properties:
        A column name or a sequence of column names that should be
        converted to properties in the new table.
    :param property_column:
        The name to use for the column containing the property names.
    :param value_column:
        The name to use for the column containing the property values.
    :param column_types:
        A sequence of two column types for the property and value column in
        that order or an instance of :class:`.TypeTester`. Defaults to a
        generic :class:`.TypeTester`.
    :returns:
        A new :class:`.Table`.
    """
    from agate.table import Table

    new_rows = []

    # Work on a list so that a key passed as a tuple can be concatenated
    # with the two generated column names below.
    if utils.issequence(key):
        key = list(key)
    else:
        key = [key]

    if not utils.issequence(properties):
        properties = [properties]

    new_column_names = key + [property_column, value_column]

    # NOTE(review): one row name is recorded per source row while one output
    # row is produced per (source row, property) pair, so with more than one
    # property the two sequences differ in length -- confirm how the Table
    # constructor is expected to pair them.
    row_names = []

    for row in self.rows:
        k = tuple(row[n] for n in key)
        left_row = list(k)

        # A single key column names the row by its bare value; several key
        # columns name it by the tuple of their values.
        if len(k) == 1:
            row_names.append(k[0])
        else:
            row_names.append(k)

        for f in properties:
            new_rows.append(
                Row(tuple(left_row + [f, row[f]]), new_column_names))

    key_column_types = [
        self.column_types[self.column_names.index(name)] for name in key
    ]

    if column_types is None or isinstance(column_types, TypeTester):
        tester = TypeTester() if column_types is None else column_types

        # Key columns keep their current types unless the tester already
        # forces a type for the same name. This replaces the `_force`
        # mapping on the tester object that was passed in.
        force_update = dict(zip(key, key_column_types))
        force_update.update(tester._force)
        tester._force = force_update

        new_column_types = tester.run(new_rows, new_column_names)
    else:
        new_column_types = key_column_types + list(column_types)

    return Table(new_rows,
                 new_column_names,
                 new_column_types,
                 row_names=row_names)
Exemplo n.º 24
0
def homogenize(self, key, compare_values, default_row=None):
    """
    Fill in missing rows in a series.

    This can be used, for instance, to add rows for missing years in a time
    series.

    Missing rows are found by comparing the values in the :code:`key` columns
    with those provided as :code:`compare_values`.

    Values not found in the table will be used to generate new rows with
    the given :code:`default_row`. New rows are appended after the existing
    ones, in the order their values first appear in :code:`compare_values`.

    :code:`default_row` should be an array of values or an array-generating
    function. If not specified, the new rows will have :code:`None` in all
    columns not specified in :code:`key`.

    If :code:`default_row` is an array of values, its length should be row
    length minus the number of column names provided in the :code:`key`.
    The array itself is never modified.

    If it is an array-generating function, the function should take an array
    of missing values for each new row and output a full row including those
    values.

    :param key:
        Either a column name or a sequence of such names.
    :param compare_values:
        Either an array of column values if key is a single column name or a
        sequence of arrays of values if key is a sequence of names. It can
        also be a generator that yields either of the two. A row is created for
        each value or list of values not found in the rows of the table.
    :param default_row:
        An array of values or a function to generate new rows. The length of
        the input array should be equal to row length minus column_names
        count. The length of array generated by the function should be the
        row length.
    :returns:
        A new :class:`.Table`.
    """
    rows = list(self._rows)

    if not utils.issequence(key):
        key = [key]

    # Materialise once: compare_values may be a generator, and it is
    # traversed more than once below.
    compare_values = list(compare_values)

    if len(key) == 1:
        if any(not utils.issequence(compare_value) for compare_value in compare_values):
            compare_values = [[compare_value] for compare_value in compare_values]

    column_values = [self._columns.get(name) for name in key]
    column_indexes = [self._column_names.index(name) for name in key]

    # Key tuples already present in the table; candidates are added as they
    # are accepted so that each missing value yields exactly one new row.
    seen = set(zip(*column_values))
    differences = []

    for candidate in map(tuple, compare_values):
        if candidate not in seen:
            seen.add(candidate)
            differences.append(candidate)

    default_is_callable = callable(default_row)
    non_key_width = len(self._column_names) - len(key)

    for difference in differences:
        if default_is_callable:
            rows.append(Row(default_row(difference), self._column_names))
            continue

        # Start from a fresh list every time so that neither the caller's
        # default_row nor a previously built row is modified.
        if default_row is not None:
            new_row = list(default_row)
        else:
            new_row = [None] * non_key_width

        # Insert key values in ascending column order; only then does each
        # insertion position equal the value's final column position.
        for i, d in sorted(zip(column_indexes, difference),
                           key=lambda pair: pair[0]):
            new_row.insert(i, d)

        rows.append(Row(new_row, self._column_names))

    return self._fork(rows)
Exemplo n.º 25
0
def denormalize(self,
                key=None,
                property_column='property',
                value_column='value',
                default_value=utils.default,
                column_types=None):
    """
    Create a new table with row values converted into columns.

    For example:

    +---------+-----------+---------+
    |  name   | property  | value   |
    +=========+===========+=========+
    |  Jane   | gender    | female  |
    +---------+-----------+---------+
    |  Jane   | race      | black   |
    +---------+-----------+---------+
    |  Jane   | age       | 24      |
    +---------+-----------+---------+
    |  ...    |  ...      |  ...    |
    +---------+-----------+---------+

    Can be denormalized so that each unique value in `property` becomes a
    column with `value` used for its values.

    +---------+----------+--------+-------+
    |  name   | gender   | race   | age   |
    +=========+==========+========+=======+
    |  Jane   | female   | black  | 24    |
    +---------+----------+--------+-------+
    |  Jack   | male     | white  | 35    |
    +---------+----------+--------+-------+
    |  Joe    | male     | black  | 28    |
    +---------+----------+--------+-------+

    If one or more keys are specified then the resulting table will
    automatically have :code:`row_names` set to those keys.

    This is the opposite of :meth:`.Table.normalize`.

    :param key:
        A column name or a sequence of column names that should be
        maintained as they are in the normalized table. Typically these
        are the table's unique identifiers and any metadata about them. Or,
        :code:`None` if there are no key columns.
    :param property_column:
        The column whose values should become column names in the new table.
    :param value_column:
        The column whose values should become the values of the property
        columns in the new table.
    :param default_value:
        Value to be used where a row has no value for a property. If not
        specified :code:`Decimal(0)` will be used when the value column
        holds :class:`.Number` data and :code:`None` will be used for
        all others.
    :param column_types:
        A sequence of column types with length equal to number of unique
        values in property_column or an instance of :class:`.TypeTester`.
        Defaults to a generic :class:`.TypeTester`.
    :returns:
        A new :class:`.Table`.
    """
    from agate.table import Table

    # Work on a list so that a key passed as a tuple can be concatenated
    # with the list of generated column names below.
    if key is None:
        key = []
    elif utils.issequence(key):
        key = list(key)
    else:
        key = [key]

    # Property names in order of first appearance; the set only speeds up
    # the membership test.
    field_names = []
    seen_fields = set()
    row_data = OrderedDict()

    for row in self.rows:
        row_key = tuple(row[k] for k in key)

        if row_key not in row_data:
            row_data[row_key] = OrderedDict()

        f = six.text_type(row[property_column])
        v = row[value_column]

        if f not in seen_fields:
            seen_fields.add(f)
            field_names.append(f)

        # A later row with the same key and property overwrites the value.
        row_data[row_key][f] = v

    if default_value == utils.default:
        if isinstance(self.columns[value_column].data_type, Number):
            default_value = Decimal(0)
        else:
            default_value = None

    new_column_names = key + field_names

    new_rows = []
    row_names = []

    for k, v in row_data.items():
        row = list(k)

        # A single key column names the row by its bare value; otherwise
        # the row is named by the tuple of key values.
        if len(k) == 1:
            row_names.append(k[0])
        else:
            row_names.append(k)

        for f in field_names:
            if f in v:
                row.append(v[f])
            else:
                row.append(default_value)

        new_rows.append(Row(row, new_column_names))

    key_column_types = [
        self.column_types[self.column_names.index(name)] for name in key
    ]

    if column_types is None or isinstance(column_types, TypeTester):
        tester = TypeTester() if column_types is None else column_types

        # Key columns keep their current types unless the tester already
        # forces a type for the same name. This replaces the `_force`
        # mapping on the tester object that was passed in.
        force_update = dict(zip(key, key_column_types))
        force_update.update(tester._force)
        tester._force = force_update

        new_column_types = tester.run(new_rows, new_column_names)
    else:
        new_column_types = key_column_types + list(column_types)

    return Table(new_rows,
                 new_column_names,
                 new_column_types,
                 row_names=row_names)
Exemplo n.º 26
0
    def __init__(self, rows, column_info, row_names=None, _is_fork=False):
        """
        Build a table from row data and a description of its columns.

        :param rows:
            A sequence of row sequences. Unless :code:`_is_fork` is set,
            each value is passed through the :code:`cast` method of its
            column's data type and rows shorter than the column count are
            padded with :code:`None`.
        :param column_info:
            Either an iterable of :class:`Column` instances, whose names
            and data types are reused as they are, or an iterable of
            :code:`(name, data_type)` pairs. Which form is in use is
            decided by looking at the first element. In the pair form a
            falsy name is replaced by :code:`letter_name` of its position.
        :param row_names:
            The name of a column whose values become the row names, a
            function that takes a row and returns its name, or a sequence
            of names. If omitted or empty the table has no row names.
        :param _is_fork:
            When true, :code:`rows` is used as given, without casting its
            values or wrapping them in :class:`Row`.
        :raises ValueError:
            In the pair form, if a name is not a string, if names are
            duplicated, or if a type is not a :class:`DataType`. In either
            form, if a row has more values than there are columns or if
            :code:`row_names` is of an unsupported kind.
        """
        column_info = list(column_info)

        if isinstance(column_info[0], Column):
            self._column_names = tuple(c.name for c in column_info)
            self._column_types = tuple(c.data_type for c in column_info)
        else:
            column_names, self._column_types = zip(*column_info)

            final_column_names = []

            # Validation
            for i, column_name in enumerate(column_names):
                if not column_name:
                    final_column_names.append(letter_name(i))
                elif isinstance(column_name, six.string_types):
                    final_column_names.append(column_name)
                else:
                    raise ValueError('Column names must be strings.')

            if len(set(final_column_names)) != len(final_column_names):
                raise ValueError('Duplicate column names are not allowed.')

            self._column_names = tuple(final_column_names)

            for column_type in self._column_types:
                if not isinstance(column_type, DataType):
                    raise ValueError('Column types must be instances of DataType.')

        # Computed after both branches: the row checks below need it no
        # matter which form column_info took.
        len_column_names = len(self._column_names)

        if not _is_fork:
            new_rows = []
            cast_funcs = [c.cast for c in self._column_types]

            for i, row in enumerate(rows):
                len_row = len(row)

                if len_row > len_column_names:
                    raise ValueError('Row %i has %i values, but Table only has %i columns.' % (i, len_row, len_column_names))
                elif len_row < len_column_names:
                    # Pad short rows so every column has a value to cast.
                    row = chain(row, [None] * (len_column_names - len_row))

                new_rows.append(Row(tuple(cast_funcs[j](d) for j, d in enumerate(row)), self._column_names))
        else:
            new_rows = rows

        if row_names:
            computed_row_names = []

            if isinstance(row_names, six.string_types):
                # A column name: each row is named by its value there.
                for row in new_rows:
                    computed_row_names.append(row[row_names])
            elif hasattr(row_names, '__call__'):
                for row in new_rows:
                    computed_row_names.append(row_names(row))
            elif isinstance(row_names, Sequence):
                computed_row_names = row_names
            else:
                raise ValueError('row_names must be a column name, function or sequence')

            self._row_names = tuple(computed_row_names)
        else:
            self._row_names = None

        self._rows = MappedSequence(new_rows, self._row_names)

        # Build columns
        new_columns = []

        for i, (name, data_type) in enumerate(zip(self._column_names, self._column_types)):
            column = Column(i, name, data_type, self._rows, row_names=self._row_names)
            new_columns.append(column)

        self._columns = MappedSequence(new_columns, self._column_names)