示例#1
0
def aggregate(self, aggregations):
    """
    Run one :class:`.Aggregation`, or several of them, against this table.

    When a sequence is given, every aggregation is validated before any of
    them is run.

    :param aggregations:
        Either one :class:`.Aggregation` instance or a sequence of them.
    :returns:
        The lone result when a single :class:`.Aggregation` was given,
        otherwise a tuple holding one result per aggregation, in input order.
    """
    if not utils.issequence(aggregations):
        aggregations.validate(self)

        return aggregations.run(self)

    # First pass: validate the whole batch before running anything.
    for aggregation in aggregations:
        aggregation.validate(self)

    # Second pass: collect the results in the order they were requested.
    return tuple(aggregation.run(self) for aggregation in aggregations)
示例#2
0
文件: aggregate.py 项目: livlab/agate
def aggregate(self, aggregations):
    """
    Compute the result of one or more :class:`.Aggregation` instances for
    this table.

    :param aggregations:
        One :class:`.Aggregation` instance, or a sequence of such instances.
    :returns:
        For a single :class:`Aggregation`, its result. For a sequence, a
        tuple of results in the same order as the input.
    """
    if utils.issequence(aggregations):
        # Validation of every item happens before the first one is run.
        for item in aggregations:
            item.validate(self)

        outputs = [item.run(self) for item in aggregations]

        return tuple(outputs)

    aggregations.validate(self)
    outcome = aggregations.run(self)

    return outcome
def aggregate(self, aggregations):
    """
    Run one :class:`.Aggregation`, or a named batch of them, on this table.

    :param aggregations:
        Either a single :class:`.Aggregation` instance, or a sequence of
        :code:`(name, aggregation)` pairs in which every :code:`aggregation`
        is an :class:`.Aggregation` instance.
    :returns:
        The bare result for a single :class:`Aggregation`. For a sequence, an
        :class:`.OrderedDict` mapping each name to its result.
    """
    if not utils.issequence(aggregations):
        aggregations.validate(self)

        return aggregations.run(self)

    # Validate every aggregation before any of them is run.
    for _, aggregation in aggregations:
        aggregation.validate(self)

    return OrderedDict(
        (label, aggregation.run(self)) for label, aggregation in aggregations
    )
示例#4
0
    def validate(self, table):
        """
        Check that every configured slug column holds text and has no nulls.

        :param table:
            The table whose columns are checked.
        :raises DataTypeError:
            If a column's data type is not an instance of :class:`Text`.
        :raises ValueError:
            If :class:`HasNulls` reports a null value in a column.
        """
        names = self._column_name

        # A single column name is treated as a one-element list.
        if not issequence(names):
            names = [names]

        for name in names:
            if not isinstance(table.columns[name].data_type, Text):
                raise DataTypeError('Slug column must contain Text data.')

            if HasNulls(name).run(table):
                raise ValueError('Slug column cannot contain `None`.')
示例#5
0
文件: slug.py 项目: nbedi/agate
    def validate(self, table):
        """
        Ensure the slug source column(s) are text columns without nulls.

        :param table:
            The table to validate against.
        :raises DataTypeError:
            If any source column's data type is not :class:`Text`.
        :raises ValueError:
            If :class:`HasNulls` is true for any source column.
        """
        source = self._column_name
        targets = source if issequence(source) else [source]

        for target in targets:
            data_type = table.columns[target].data_type

            if not isinstance(data_type, Text):
                raise DataTypeError('Slug column must contain Text data.')

            contains_null = HasNulls(target).run(table)

            if contains_null:
                raise ValueError('Slug column cannot contain `None`.')
示例#6
0
文件: __init__.py 项目: ejmurra/agate
    def exclude(self, key):
        """
        Build a new table that omits the given column(s).

        :param key:
            The name of one column to leave out, or a sequence of such names.
        :returns:
            A new :class:`.Table`.
        """
        names_to_drop = key if utils.issequence(key) else [key]

        # Everything not explicitly dropped is selected, in original order.
        remaining = [
            name for name in self._column_names if name not in names_to_drop
        ]

        return self.select(remaining)
示例#7
0
    def exclude(self, key):
        """
        Return a copy of this table with the named column(s) removed.

        :param key:
            A single column name, or a sequence of column names, to remove.
        :returns:
            A new :class:`.Table`.
        """
        if utils.issequence(key):
            excluded = key
        else:
            excluded = [key]

        kept = []

        for column_name in self._column_names:
            if column_name in excluded:
                continue

            kept.append(column_name)

        return self.select(kept)
示例#8
0
    def exclude(self, key):
        """
        Create a new table holding the same rows, minus the columns named in
        ``key``.

        :param key:
            A column name to leave out, or a sequence of such names.
        :returns:
            A new :class:`Table`.
        """
        dropped = key if utils.issequence(key) else [key]

        return self.select([c for c in self._column_names if c not in dropped])
示例#9
0
文件: __init__.py 项目: ejmurra/agate
    def order_by(self, key, reverse=False):
        """
        Create a new table that is sorted.

        ``None`` sort values are replaced by :code:`utils.NullOrder()` before
        comparison. This applies to a single-column key, to the value returned
        by a key function, and to each individual value of a multi-column key.

        :param key:
            Either the name of a single column to sort by, a sequence of such
            names, or a :class:`function` that takes a row and returns a value
            to sort by.
        :param reverse:
            If `True` then sort in reverse (typically, descending) order.
        :returns:
            A new :class:`.Table`.
        """
        if len(self._rows) == 0:
            return self._fork(self._rows)

        key_is_row_function = hasattr(key, '__call__')
        key_is_sequence = utils.issequence(key)

        def null_handler(value):
            # Swap ``None`` for the placeholder used for null sort values.
            if value is None:
                return utils.NullOrder()

            return value

        def sort_key(data):
            # ``data`` is an ``(index, row)`` pair produced by ``enumerate``.
            row = data[1]

            if key_is_row_function:
                return null_handler(key(row))

            if key_is_sequence:
                # Handle nulls per element: the tuple itself is never ``None``,
                # so checking only the tuple would let ``None`` values through.
                return tuple(null_handler(row[n]) for n in key)

            return null_handler(row[key])

        # Sort (index, row) pairs so row names can be reordered to match.
        results = sorted(enumerate(self._rows),
                         key=sort_key,
                         reverse=reverse)

        indices, rows = zip(*results)

        if self._row_names is not None:
            row_names = [self._row_names[i] for i in indices]
        else:
            row_names = None

        return self._fork(rows, row_names=row_names)
示例#10
0
    def order_by(self, key, reverse=False):
        """
        Sort this table by the :code:`key`. This can be either a
        column_name or callable that returns a value to sort by.

        ``None`` sort values are replaced by :code:`utils.NullOrder()` before
        comparison, including each individual value of a multi-column key.

        :param key:
            Either the name of a column to sort by, a sequence of such names,
            or a :class:`function` that takes a row and returns a value to sort
            by.
        :param reverse:
            If `True` then sort in reverse (typically, descending) order.
        :returns:
            A new :class:`Table`.
        """
        if len(self._rows) == 0:
            return self._fork(self._rows)

        key_is_row_function = hasattr(key, "__call__")
        key_is_sequence = utils.issequence(key)

        def null_handler(value):
            # Swap ``None`` for the placeholder used for null sort values.
            if value is None:
                return utils.NullOrder()

            return value

        def sort_key(data):
            # ``data`` is an ``(index, row)`` pair produced by ``enumerate``.
            row = data[1]

            if key_is_row_function:
                return null_handler(key(row))

            if key_is_sequence:
                # The tuple is never ``None`` itself, so nulls have to be
                # substituted element by element.
                return tuple(null_handler(row[n]) for n in key)

            return null_handler(row[key])

        # Keep each row's original index so row names can follow the sort.
        results = sorted(enumerate(self._rows), key=sort_key, reverse=reverse)

        indices, rows = zip(*results)

        if self._row_names is not None:
            row_names = [self._row_names[i] for i in indices]
        else:
            row_names = None

        return self._fork(rows, row_names=row_names)
示例#11
0
文件: slug.py 项目: nbedi/agate
    def run(self, table):
        """
        Collect one source value per row and pass the list to ``slugify``.

        When ``self._column_name`` is a sequence, the values of those columns
        are concatenated, each one preceded by a single space. Otherwise the
        value of the single named column is used unchanged.

        :returns:
            Whatever ``slugify`` returns for the list of per-row values.
        """
        combine = issequence(self._column_name)
        values = []

        for row in table.rows:
            if combine:
                values.append(''.join(' ' + row[name] for name in self._column_name))
            else:
                values.append(row[self._column_name])

        return slugify(values, ensure_unique=self._ensure_unique, **self._slug_args)
示例#12
0
    def run(self, table):
        """
        Gather the text to slugify from each row and hand it to ``slugify``.

        For a sequence of column names the row's values are joined, each
        prefixed with one space; for a single column name the row's value is
        taken as it is.

        :returns:
            The return value of ``slugify`` for the gathered values.
        """
        source = self._column_name

        if issequence(source):
            raw_values = [
                ''.join(' ' + row[part] for part in source)
                for row in table.rows
            ]
        else:
            raw_values = [row[source] for row in table.rows]

        return slugify(
            raw_values,
            ensure_unique=self._ensure_unique,
            **self._slug_args
        )
示例#13
0
    def select(self, key):
        """
        Create a new table holding the same rows, restricted to the columns
        named in ``key``.

        :param key:
            A column name to keep, or a sequence of such names.
        :returns:
            A new :class:`Table`.
        """
        names = key if utils.issequence(key) else [key]

        column_types = [self.columns[name].data_type for name in names]

        # Rebuild each row with only the requested values, in ``names`` order.
        new_rows = [
            Row(tuple(row[name] for name in names), names)
            for row in self._rows
        ]

        return self._fork(new_rows, names, column_types)
示例#14
0
文件: __init__.py 项目: ejmurra/agate
    def select(self, key):
        """
        Build a new table containing only the requested columns.

        :param key:
            The name of one column to keep, or a sequence of such names.
        :returns:
            A new :class:`.Table`.
        """
        if utils.issequence(key):
            wanted = key
        else:
            wanted = [key]

        column_types = []

        for column_name in wanted:
            column_types.append(self.columns[column_name].data_type)

        def narrow(row):
            # Keep just the wanted values, in the order they were requested.
            return Row(tuple(row[column_name] for column_name in wanted), wanted)

        new_rows = [narrow(row) for row in self._rows]

        return self._fork(new_rows, wanted, column_types)
示例#15
0
def distinct(self, key=None):
    """
    Create a new table with only unique rows.

    The first row seen for each distinct key value is kept; later rows with
    an equal key are dropped.

    :param key:
        Either the name of a single column to use to identify unique rows, a
        sequence of such column names, a :class:`function` that takes a
        row and returns a value to identify unique rows, or `None`, in
        which case the entire row will be checked for uniqueness.
    :returns:
        A new :class:`.Table`.
    """
    key_is_row_function = hasattr(key, '__call__')
    key_is_sequence = utils.issequence(key)

    # A list (not a set) so that key values need not be hashable.
    uniques = []
    rows = []

    row_names = [] if self._row_names is not None else None

    for i, row in enumerate(self._rows):
        if key_is_row_function:
            k = key(row)
        elif key_is_sequence:
            # Must be a concrete tuple. A generator object only ever equals
            # itself, so with a generator every row would look unique.
            k = tuple(row[j] for j in key)
        elif key is None:
            k = tuple(row)
        else:
            k = row[key]

        if k in uniques:
            continue

        uniques.append(k)
        rows.append(row)

        if row_names is not None:
            row_names.append(self._row_names[i])

    return self._fork(rows, row_names=row_names)
示例#16
0
文件: __init__.py 项目: ejmurra/agate
    def distinct(self, key=None):
        """
        Create a new table with only unique rows.

        The first row seen for each distinct key value is kept; later rows
        with an equal key are dropped.

        :param key:
            Either the name of a single column to use to identify unique rows, a
            sequence of such column names, a :class:`function` that takes a
            row and returns a value to identify unique rows, or `None`, in
            which case the entire row will be checked for uniqueness.
        :returns:
            A new :class:`.Table`.
        """
        key_is_row_function = hasattr(key, '__call__')
        key_is_sequence = utils.issequence(key)

        # A list (not a set) so that key values need not be hashable.
        uniques = []
        rows = []

        row_names = [] if self._row_names is not None else None

        for i, row in enumerate(self._rows):
            if key_is_row_function:
                k = key(row)
            elif key_is_sequence:
                # Must be a concrete tuple. A generator object only ever
                # equals itself, so with a generator every row would look
                # unique.
                k = tuple(row[j] for j in key)
            elif key is None:
                k = tuple(row)
            else:
                k = row[key]

            if k in uniques:
                continue

            uniques.append(k)
            rows.append(row)

            if row_names is not None:
                row_names.append(self._row_names[i])

        return self._fork(rows, row_names=row_names)
示例#17
0
def select(self, key):
    """
    Build a new table containing only the requested columns.

    :param key:
        The name of one column to keep, or a sequence of such names.
    :returns:
        A new :class:`.Table`.
    """
    names = key if utils.issequence(key) else [key]

    # Resolve each name to its position once, then reuse the positions for
    # both the column types and every row.
    positions = tuple(self._column_names.index(name) for name in names)
    column_types = tuple(self._column_types[p] for p in positions)

    new_rows = [Row((row[p] for p in positions), names) for row in self._rows]

    return self._fork(new_rows, names, column_types)
示例#18
0
文件: select.py 项目: jean/agate
def select(self, key):
    """
    Return a new table restricted to the named column(s).

    :param key:
        A single column name to include, or a sequence of column names.
    :returns:
        A new :class:`.Table`.
    """
    if utils.issequence(key):
        chosen = key
    else:
        chosen = [key]

    # Look up where each chosen column lives in this table.
    offsets = tuple(self._column_names.index(name) for name in chosen)
    column_types = tuple(self._column_types[offset] for offset in offsets)

    new_rows = []

    for source_row in self._rows:
        picked = (source_row[offset] for offset in offsets)
        new_rows.append(Row(picked, chosen))

    return self._fork(new_rows, chosen, column_types)
示例#19
0
文件: __init__.py 项目: nbedi/agate
    def __init__(self, rows, column_names=None, column_types=None, row_names=None, _is_fork=False):
        """
        Build a table from a sequence of rows.

        :param rows:
            The row data. Passing a string raises :class:`ValueError`.
        :param column_names:
            Optional sequence of column names. Each entry must be a string or
            :code:`None`; a :code:`None` entry gets a name generated by
            :code:`utils.letter_name`, and a name that repeats an earlier one
            gets a numeric suffix (:code:`_2`, :code:`_3`, ...). If omitted
            and ``rows`` is non-empty, names are generated from the length of
            the first row.
        :param column_types:
            :code:`None` (a default :class:`TypeTester` is used), a dict
            (passed as :code:`TypeTester(force=...)`), a :class:`TypeTester`,
            or a sequence of :class:`DataType` instances.
        :param row_names:
            A column name, a function that takes a row and returns a name, or
            a sequence of names.
        :param _is_fork:
            When true, ``rows`` is used as-is: no length checking, padding or
            casting is applied.
        :raises ValueError:
            For a string ``rows``, invalid column names or types, mismatched
            name/type counts, a row longer than the column count, an
            unsupported ``row_names`` value, or an :class:`int` row name.
        :raises CastError:
            If a value cannot be cast; the message is extended with the row
            index and column name.
        """
        if isinstance(rows, six.string_types):
            raise ValueError('When created directly, the first argument to Table must be a sequence of rows. Did you want agate.Table.from_csv?')

        # Validate column names
        if column_names:
            final_column_names = []

            for i, column_name in enumerate(column_names):
                if column_name is None:
                    # Missing name: generate one from the column's position.
                    new_column_name = utils.letter_name(i)
                    warnings.warn('Column name not specified. "%s" will be used as name.' % new_column_name, RuntimeWarning)
                elif isinstance(column_name, six.string_types):
                    new_column_name = column_name
                else:
                    raise ValueError('Column names must be strings or None.')

                final_column_name = new_column_name
                duplicates = 0

                # Try "_2", "_3", ... until the name no longer collides with
                # one that has already been accepted.
                while final_column_name in final_column_names:
                    final_column_name = new_column_name + '_' + str(duplicates + 2)
                    duplicates += 1

                if duplicates > 0:
                    warn_duplicate_column(new_column_name, final_column_name)

                final_column_names.append(final_column_name)

            self._column_names = tuple(final_column_names)
        elif rows:
            # No names given: generate one per value in the first row.
            self._column_names = tuple(utils.letter_name(i) for i in range(len(rows[0])))
            warnings.warn('Column names not specified. "%s" will be used as names.' % str(self._column_names), RuntimeWarning, stacklevel=2)
        else:
            # NOTE(review): this branch stores a list, the others a tuple.
            self._column_names = []

        len_column_names = len(self._column_names)

        # Validate column_types
        if column_types is None:
            column_types = TypeTester()
        elif isinstance(column_types, dict):
            for v in column_types.values():
                if not isinstance(v, DataType):
                    raise ValueError('Column types must be instances of DataType.')

            column_types = TypeTester(force=column_types)
        elif not isinstance(column_types, TypeTester):
            for column_type in column_types:
                if not isinstance(column_type, DataType):
                    raise ValueError('Column types must be instances of DataType.')

        # A TypeTester is run against the rows to produce the column types;
        # an explicit sequence is stored as a tuple.
        if isinstance(column_types, TypeTester):
            self._column_types = column_types.run(rows, self._column_names)
        else:
            self._column_types = tuple(column_types)

        if len_column_names != len(self._column_types):
            raise ValueError('column_names and column_types must be the same length.')

        if not _is_fork:
            new_rows = []
            cast_funcs = [c.cast for c in self._column_types]

            for i, row in enumerate(rows):
                len_row = len(row)

                if len_row > len_column_names:
                    raise ValueError('Row %i has %i values, but Table only has %i columns.' % (i, len_row, len_column_names))
                elif len(row) < len_column_names:
                    # Pad short rows with None up to the column count.
                    row = chain(row, [None] * (len_column_names - len_row))

                row_values = []
                for j, d in enumerate(row):
                    try:
                        row_values.append(cast_funcs[j](d))
                    except CastError as e:
                        # Re-raise with the row index and column name added.
                        raise CastError(str(e) + ' Error at row %s column %s.' % (i, self._column_names[j]))

                new_rows.append(Row(row_values, self._column_names))
        else:
            # Forked tables reuse the given rows without casting them again.
            new_rows = rows

        if row_names:
            computed_row_names = []

            if isinstance(row_names, six.string_types):
                # A string is treated as a column name: each row's value in
                # that column becomes the row's name.
                for row in new_rows:
                    name = row[row_names]
                    computed_row_names.append(name)
            elif hasattr(row_names, '__call__'):
                # A callable is invoked with each row to compute its name.
                for row in new_rows:
                    name = row_names(row)
                    computed_row_names.append(name)
            elif utils.issequence(row_names):
                computed_row_names = row_names
            else:
                raise ValueError('row_names must be a column name, function or sequence')

            for row_name in computed_row_names:
                # Exact type check: only plain ``int`` names are rejected.
                if type(row_name) is int:
                    raise ValueError('Row names cannot be of type int. Use Decimal for numbered row names.')

            self._row_names = tuple(computed_row_names)
        else:
            self._row_names = None

        self._rows = MappedSequence(new_rows, self._row_names)

        # Build columns
        new_columns = []

        for i in range(len_column_names):
            name = self._column_names[i]
            data_type = self._column_types[i]

            column = Column(i, name, data_type, self._rows, row_names=self._row_names)

            new_columns.append(column)

        self._columns = MappedSequence(new_columns, self._column_names)
示例#20
0
def denormalize(self,
                key=None,
                property_column='property',
                value_column='value',
                default_value=utils.default,
                column_types=None):
    """
    Create a new table with row values converted into columns.

    For example:

    +---------+-----------+---------+
    |  name   | property  | value   |
    +=========+===========+=========+
    |  Jane   | gender    | female  |
    +---------+-----------+---------+
    |  Jane   | race      | black   |
    +---------+-----------+---------+
    |  Jane   | age       | 24      |
    +---------+-----------+---------+
    |  ...    |  ...      |  ...    |
    +---------+-----------+---------+

    Can be denormalized so that each unique value in `property` becomes a
    column with `value` used for its values.

    +---------+----------+--------+-------+
    |  name   | gender   | race   | age   |
    +=========+==========+========+=======+
    |  Jane   | female   | black  | 24    |
    +---------+----------+--------+-------+
    |  Jack   | male     | white  | 35    |
    +---------+----------+--------+-------+
    |  Joe    | male     | black  | 28    |
    +---------+----------+--------+-------+

    The resulting table has :code:`row_names` set from the key values: the
    bare value when there is exactly one key column, otherwise the tuple of
    key values.

    This is the opposite of :meth:`.Table.normalize`.

    :param key:
        A column name or a sequence of column names that should be
        maintained as they are in the normalized table. Typically these
        are the table's unique identifiers and any metadata about them. Or,
        :code:`None` if there are no key columns.
    :param property_column:
        The column whose values should become column names in the new table.
    :param value_column:
        The column whose values should become the values of the property
        columns in the new table.
    :param default_value:
        Value to be used for property columns that a given key has no row
        for. If not specified :code:`Decimal(0)` will be used when the value
        column contains :class:`.Number` data and :code:`None` will be used
        otherwise.
    :param column_types:
        A sequence of column types with length equal to number of unique
        values in property_column or an instance of :class:`.TypeTester`.
        Defaults to a generic :class:`.TypeTester`.
    :returns:
        A new :class:`.Table`.
    """
    from agate.table import Table

    if key is None:
        key = []
    elif not utils.issequence(key):
        key = [key]
    else:
        # Normalise to a list so that ``key + field_names`` below also works
        # when the caller passed a tuple.
        key = list(key)

    field_names = []
    row_data = OrderedDict()

    # Group the property/value pairs by their key values, remembering the
    # order in which property names are first seen.
    for row in self.rows:
        row_key = tuple(row[k] for k in key)

        if row_key not in row_data:
            row_data[row_key] = OrderedDict()

        f = six.text_type(row[property_column])
        v = row[value_column]

        if f not in field_names:
            field_names.append(f)

        row_data[row_key][f] = v

    if default_value == utils.default:
        if isinstance(self.columns[value_column].data_type, Number):
            default_value = Decimal(0)
        else:
            default_value = None

    new_column_names = key + field_names

    new_rows = []
    row_names = []

    for k, v in row_data.items():
        row = list(k)

        # A single key column yields a scalar row name; otherwise the whole
        # key tuple is used.
        if len(k) == 1:
            row_names.append(k[0])
        else:
            row_names.append(k)

        # ``dict.get`` fills in the default for properties this key lacks.
        row.extend(v.get(f, default_value) for f in field_names)

        new_rows.append(Row(row, new_column_names))

    key_column_types = [
        self.column_types[self.column_names.index(name)] for name in key
    ]

    if column_types is None or isinstance(column_types, TypeTester):
        tester = TypeTester() if column_types is None else column_types
        # Key columns keep their existing types unless the tester already
        # forces something else for them.
        # NOTE(review): this mutates a caller-supplied TypeTester in place.
        force_update = dict(zip(key, key_column_types))
        force_update.update(tester._force)
        tester._force = force_update

        new_column_types = tester.run(new_rows, new_column_names)
    else:
        new_column_types = key_column_types + list(column_types)

    return Table(new_rows,
                 new_column_names,
                 new_column_types,
                 row_names=row_names)
示例#21
0
文件: join.py 项目: wireservice/agate
def join(self, right_table, left_key=None, right_key=None, inner=False, full_outer=False, require_match=False,
         columns=None):
    """
    Create a new table by joining two tables on common values. This method
    implements most varieties of SQL join, in addition to some unique features.

    If :code:`left_key` and :code:`right_key` are both :code:`None` then this
    method will perform a "sequential join", which is to say it will join on row
    number. The :code:`inner` and :code:`full_outer` arguments will determine
    whether dangling left-hand and right-hand rows are included, respectively.

    If :code:`left_key` is specified, then a "left outer join" will be
    performed. This will combine columns from the :code:`right_table` anywhere
    that :code:`left_key` and :code:`right_key` are equal. Unmatched rows from
    the left table will be included with the right-hand columns set to
    :code:`None`.

    If :code:`inner` is :code:`True` then an "inner join" will be performed.
    Unmatched rows from either table will be left out.

    If :code:`full_outer` is :code:`True` then a "full outer join" will be
    performed. Unmatched rows from both tables will be included, with the
    columns in the other table set to :code:`None`.

    In all cases, if :code:`right_key` is :code:`None` then :code:`left_key`
    will be used for both tables.

    If :code:`left_key` and :code:`right_key` are column names, the right-hand
    identifier column will not be included in the output table.

    If :code:`require_match` is :code:`True` unmatched rows will raise an
    exception. This is like an "inner join" except any row that doesn't have a
    match will raise an exception instead of being dropped. This is useful for
    enforcing expectations about datasets that should match.

    Column names from the right table which also exist in this table will
    be suffixed "2" in the new table.

    A subset of columns from the right-hand table can be included in the joined
    table using the :code:`columns` argument.

    :param right_table:
        The "right" table to join to.
    :param left_key:
        Either the name of a column from this table to join on, the index
        of a column, a sequence of such column identifiers, a
        :class:`function` that takes a row and returns a value to join on, or
        :code:`None` in which case the tables will be joined on row number.
    :param right_key:
        Either the name of a column from :code:`right_table` to join on, the
        index of a column, a sequence of such column identifiers, or a
        :class:`function` that takes a row and returns a value to join on. If
        :code:`None` then :code:`left_key` will be used for both. If
        :code:`left_key` is :code:`None` then this value is ignored.
    :param inner:
        Perform a SQL-style "inner join" instead of a left outer join. Rows
        which have no match for :code:`left_key` will not be included in
        the output table.
    :param full_outer:
        Perform a SQL-style "full outer" join rather than a left or a right.
        May not be used in combination with :code:`inner`.
    :param require_match:
        If true, an exception will be raised if there is a left_key with no
        matching right_key.
    :param columns:
        A sequence of column names from :code:`right_table` to include in
        the final output table. Defaults to all columns not in
        :code:`right_key`. Ignored when :code:`full_outer` is :code:`True`.
    :raises ValueError:
        If both :code:`inner` and :code:`full_outer` are set, or if
        :code:`require_match` is set and a left key has no matching right key.
    :returns:
        A new :class:`.Table`.
    """
    if inner and full_outer:
        raise ValueError('A join can not be both "inner" and "full_outer".')

    if right_key is None:
        right_key = left_key

    # Get join columns
    right_key_indices = []

    left_key_is_func = hasattr(left_key, '__call__')
    left_key_is_sequence = utils.issequence(left_key)

    # Left key is None
    if left_key is None:
        left_data = tuple(range(len(self._rows)))
    # Left key is a function
    elif left_key_is_func:
        left_data = [left_key(row) for row in self._rows]
    # Left key is a sequence
    elif left_key_is_sequence:
        left_columns = [self._columns[key] for key in left_key]
        # Materialise the zip: ``left_data`` is iterated once for the main
        # join and again for the full outer pass, and a Python 3 ``zip``
        # object is exhausted after its first iteration.
        left_data = list(zip(*[column.values() for column in left_columns]))
    # Left key is a column name/index
    else:
        left_data = self._columns[left_key].values()

    right_key_is_func = hasattr(right_key, '__call__')
    right_key_is_sequence = utils.issequence(right_key)

    # Sequential join
    if left_key is None:
        right_data = tuple(range(len(right_table._rows)))
    # Right key is a function
    elif right_key_is_func:
        right_data = [right_key(row) for row in right_table._rows]
    # Right key is a sequence
    elif right_key_is_sequence:
        right_columns = [right_table._columns[key] for key in right_key]
        # Materialised for the same reason as ``left_data``: it is iterated
        # to build the hash and again for the full outer pass.
        right_data = list(zip(*[column.values() for column in right_columns]))
        right_key_indices = [right_table._columns._keys.index(key) for key in right_key]
    # Right key is a column name/index
    else:
        right_column = right_table._columns[right_key]
        right_data = right_column.values()
        right_key_indices = [right_table._columns.index(right_column)]

    # Build names and type lists
    column_names = list(self._column_names)
    column_types = list(self._column_types)

    for i, column in enumerate(right_table._columns):
        name = column.name

        if not full_outer:
            # By default the right-hand key columns are dropped.
            if columns is None and i in right_key_indices:
                continue

            if columns is not None and name not in columns:
                continue

        if name in self.column_names:
            column_names.append('%s2' % name)
        else:
            column_names.append(name)

        column_types.append(column.data_type)

    if columns is not None and not full_outer:
        right_table = right_table.select([n for n in right_table._column_names if n in columns])

    # Index the right-hand rows by their join value.
    right_hash = {}

    for i, value in enumerate(right_data):
        right_hash.setdefault(value, []).append(right_table._rows[i])

    # Collect new rows
    rows = []

    if self._row_names is not None and not full_outer:
        row_names = []
    else:
        row_names = None

    # Iterate over left column
    for left_index, left_value in enumerate(left_data):
        matching_rows = right_hash.get(left_value, None)

        if require_match and matching_rows is None:
            raise ValueError('Left key "%s" does not have a matching right key.' % left_value)

        # Rows with matches
        if matching_rows:
            for right_row in matching_rows:
                new_row = list(self._rows[left_index])

                for k, v in enumerate(right_row):
                    if columns is None and k in right_key_indices and not full_outer:
                        continue

                    new_row.append(v)

                rows.append(Row(new_row, column_names))

                if self._row_names is not None and not full_outer:
                    row_names.append(self._row_names[left_index])
        # Rows without matches
        elif not inner:
            new_row = list(self._rows[left_index])

            # Pad with one None per right-hand column that would have been
            # copied had there been a match.
            for k in range(len(right_table._column_names)):
                if columns is None and k in right_key_indices and not full_outer:
                    continue

                new_row.append(None)

            rows.append(Row(new_row, column_names))

            if self._row_names is not None and not full_outer:
                row_names.append(self._row_names[left_index])

    # Full outer join
    if full_outer:
        left_set = set(left_data)

        # Append right-hand rows whose key never appeared on the left, with
        # the left-hand columns filled with None.
        for right_index, right_value in enumerate(right_data):
            if right_value in left_set:
                continue

            new_row = ([None] * len(self._columns)) + list(right_table.rows[right_index])

            rows.append(Row(new_row, column_names))

    return self._fork(rows, column_names, column_types, row_names=row_names)
示例#22
0
文件: join.py 项目: ejmurra/agate
def join(self,
         right_table,
         left_key,
         right_key=None,
         inner=False,
         require_match=False,
         columns=None):
    """
    Create a new table by joining two tables on common values.

    This method performs the equivalent of SQL's "left outer join", combining
    columns from this table and from :code:`right_table` anywhere that the
    :code:`left_key` and :code:`right_key` are equivalent.

    Where there is no match for :code:`left_key` the left columns will
    be included with the right columns set to :code:`None` unless
    the :code:`inner` argument is specified.

    If :code:`right_key` identifies one or more columns and :code:`columns`
    is not given, the right-hand key columns are left out of the output
    table, so only the left-hand copy of the key appears.

    Column names from the right table which also exist in this table will
    be suffixed "2" in the new table.

    If this table has row names they are carried over to the new table, one
    per output row.

    :param right_table:
        The "right" table to join to.
    :param left_key:
        Either the name or index of a column from this table to join on, a
        sequence of such column identifiers, or a :class:`function` that
        takes a row and returns a value to join on.
    :param right_key:
        Either the name or index of a column from :code:`right_table` to
        join on, a sequence of such column identifiers, or a
        :class:`function` that takes a row and returns a value to join on.
        If :code:`None` then :code:`left_key` will be used for both.
    :param inner:
        Perform a SQL-style "inner join" instead of a left outer join. Rows
        which have no match for :code:`left_key` will not be included in
        the output table.
    :param require_match:
        If true, an exception will be raised if there is a left_key with no
        matching right_key.
    :param columns:
        A sequence of column names from :code:`right_table` to include in
        the final output table. Defaults to all columns not in
        :code:`right_key`.
    :raises ValueError:
        If :code:`require_match` is true and a left key has no matching
        right key.
    :returns:
        A new :class:`.Table`.
    """
    if right_key is None:
        right_key = left_key

    # Positions, within the original right table, of the right-hand key
    # columns. Stays empty when the right key is a function.
    right_key_indices = []

    # Left key is a function
    if callable(left_key):
        left_data = [left_key(row) for row in self.rows]
    # Left key is a sequence
    elif utils.issequence(left_key):
        left_columns = [self.columns[key] for key in left_key]
        left_data = zip(*[column.values() for column in left_columns])
    # Left key is a column name/index
    else:
        left_data = self.columns[left_key].values()

    # Right key is a function
    if callable(right_key):
        right_data = [right_key(row) for row in right_table.rows]
    # Right key is a sequence
    elif utils.issequence(right_key):
        right_columns = [right_table.columns[key] for key in right_key]
        right_data = zip(*[column.values() for column in right_columns])
        # Resolve positions through the column's name so that keys given as
        # integer indices work as well as keys given as names.
        right_key_indices = [
            right_table.column_names.index(column.name)
            for column in right_columns
        ]
    # Right key is a column name/index
    else:
        right_column = right_table.columns[right_key]
        right_data = right_column.values()
        right_key_indices = [
            right_table.column_names.index(right_column.name)
        ]

    # Build names and type lists
    column_names = list(self.column_names)
    column_types = list(self.column_types)

    for i, column in enumerate(right_table.columns):
        name = column.name

        if columns is None:
            if i in right_key_indices:
                continue
        elif name not in columns:
            continue

        if name in self.column_names:
            column_names.append('%s2' % name)
        else:
            column_names.append(name)

        column_types.append(column.data_type)

    if columns is not None:
        right_table = right_table.select(
            [n for n in right_table.column_names if n in columns])

    # Index the right-hand rows by join value.
    right_hash = {}

    for i, value in enumerate(right_data):
        right_hash.setdefault(value, []).append(right_table.rows[i])

    # Right-hand positions dropped from every output row. Key columns are
    # only dropped when the caller did not pick columns explicitly.
    skipped_positions = set(right_key_indices) if columns is None else set()
    unmatched_padding = (
        [None] * (len(right_table.column_names) - len(skipped_positions)))

    # Collect new rows
    rows = []
    left_row_names = self.row_names
    row_names = [] if left_row_names is not None else None

    # Iterate over left column
    for left_index, left_value in enumerate(left_data):
        matching_rows = right_hash.get(left_value)

        if require_match and matching_rows is None:
            # Wrap in a 1-tuple: a tuple key (sequence join) would otherwise
            # be unpacked as multiple format arguments and raise TypeError.
            raise ValueError(
                'Left key "%s" does not have a matching right key.' %
                (left_value,))

        # Rows with matches
        if matching_rows:
            for right_row in matching_rows:
                new_row = list(self.rows[left_index])
                new_row.extend(
                    v for k, v in enumerate(right_row)
                    if k not in skipped_positions)

                rows.append(Row(new_row, column_names))

                if row_names is not None:
                    row_names.append(left_row_names[left_index])
        # Rows without matches
        elif not inner:
            new_row = list(self.rows[left_index]) + unmatched_padding

            rows.append(Row(new_row, column_names))

            if row_names is not None:
                row_names.append(left_row_names[left_index])

    return self._fork(rows, column_names, column_types, row_names=row_names)
示例#23
0
def homogenize(self, key, compare_values, default_row=None):
    """
    Fill missing rows in a dataset with default values.

    Missing rows are found by comparing the values in the :code:`key`
    columns with the expected :code:`compare_values`.

    Values not found in the table will be used to generate new rows with
    the given :code:`default_row`.

    :code:`default_row` should be an array of values or an array-generating
    function. If not specified, the new rows will have :code:`None` in all
    columns not named in :code:`key`.

    If it is an array of values, its length should be the row length minus
    the number of key columns; the missing key values are inserted at the
    key columns' positions. The array itself is not modified.

    If it is an array-generating function, the function should take an array
    of missing values for each new row and output a full row including those
    values.

    :param key:
        Either a column name or a sequence of such names.
    :param compare_values:
        Either an array of column values if key is a single column name or a
        sequence of arrays of values if key is a sequence of names. It can
        also be a generator that yields one of the two. A row is created for
        each value or list of values not found in the rows of the table.
    :param default_row:
        An array of values or a function to generate new rows. The length of
        the input array should be equal to row length minus the number of
        key columns. The length of array generated by the function should be
        the row length.
    :returns:
        A new :class:`Table`.
    """
    rows = list(self.rows)

    if not utils.issequence(key):
        key = [key]

    # Materialize once: a generator would otherwise be partially consumed by
    # the any() scan below before it could be wrapped.
    compare_values = list(compare_values)

    if len(key) == 1:
        if any(not utils.issequence(compare_value) for compare_value in compare_values):
            compare_values = [[compare_value] for compare_value in compare_values]

    column_values = [self.columns.get(name) for name in key]
    column_indexes = [self.column_names.index(name) for name in key]

    column_values = zip(*column_values)
    differences = list(set(map(tuple, compare_values)) - set(column_values))

    for difference in differences:
        if callable(default_row):
            rows.append(Row(default_row(difference), self.column_names))
        else:
            if default_row is not None:
                # Copy so the inserts below never touch the caller's list;
                # otherwise every later row would inherit earlier inserts.
                new_row = list(default_row)
            else:
                new_row = [None] * (len(self.column_names) - len(key))

            # Insert in ascending column order so each value lands on its
            # final position even when key names are not in table order.
            placements = sorted(zip(column_indexes, difference), key=lambda pair: pair[0])

            for i, d in placements:
                new_row.insert(i, d)

            rows.append(Row(new_row, self.column_names))

    return self._fork(rows, self.column_names, self.column_types)
示例#24
0
    def __init__(self, rows, column_names=None, column_types=None, row_names=None, _is_fork=False):
        """
        Build a table from a sequence of rows.

        :param rows:
            A sequence of row sequences. Rows shorter than the number of
            columns are padded with :code:`None`; longer rows are rejected.
        :param column_names:
            A sequence of column names. A :code:`None` entry is replaced by
            a generated letter name and duplicate names receive a numeric
            suffix; both cases emit a warning. If omitted and
            :code:`rows` is non-empty, letter names are generated from the
            length of the first row.
        :param column_types:
            A sequence of :class:`DataType` instances, or a
            :class:`TypeTester` that is run against :code:`rows`. Defaults
            to a new :class:`TypeTester`.
        :param row_names:
            A column name, a function taking a row and returning its name,
            or a sequence of names. If falsy the table has no row names.
        :param _is_fork:
            When true, :code:`rows` is used as given, without length checks
            or casting.
        :raises ValueError:
            If :code:`rows` is a string, a column name is neither a string
            nor :code:`None`, a column type is not a :class:`DataType`, the
            names and types differ in length, a row has too many values, or
            :code:`row_names` is of an unsupported kind.
        """
        if isinstance(rows, six.string_types):
            raise ValueError(
                "When created directly, the first argument to Table must be a sequence of rows. Did you want agate.Table.from_csv?"
            )

        # Validate column names
        if column_names:
            final_column_names = []

            for i, column_name in enumerate(column_names):
                if column_name is None:
                    new_column_name = utils.letter_name(i)
                    warnings.warn(
                        'Column name not specified. "%s" will be used as name.' % new_column_name,
                        RuntimeWarning,
                        stacklevel=2,
                    )
                elif isinstance(column_name, six.string_types):
                    new_column_name = column_name
                else:
                    raise ValueError("Column names must be strings or None.")

                # Append "_2", "_3", ... until the name is unique.
                final_column_name = new_column_name
                duplicates = 0
                while final_column_name in final_column_names:
                    final_column_name = new_column_name + "_" + str(duplicates + 2)
                    duplicates += 1

                if duplicates > 0:
                    warn_duplicate_column(new_column_name, final_column_name)

                final_column_names.append(final_column_name)

            self._column_names = tuple(final_column_names)
        elif rows:
            self._column_names = tuple(utils.letter_name(i) for i in range(len(rows[0])))
            warnings.warn(
                'Column names not specified. "%s" will be used as names.' % str(self._column_names),
                RuntimeWarning,
                stacklevel=2,
            )
        else:
            # Keep the attribute a tuple in every branch.
            self._column_names = tuple()

        len_column_names = len(self._column_names)

        # Validate column_types
        if column_types is None:
            column_types = TypeTester()
        elif not isinstance(column_types, TypeTester):
            for column_type in column_types:
                if not isinstance(column_type, DataType):
                    raise ValueError("Column types must be instances of DataType.")

        if isinstance(column_types, TypeTester):
            self._column_types = column_types.run(rows, self._column_names)
        else:
            self._column_types = tuple(column_types)

        if len_column_names != len(self._column_types):
            raise ValueError("column_names and column_types must be the same length.")

        if not _is_fork:
            new_rows = []
            cast_funcs = [c.cast for c in self._column_types]

            for i, row in enumerate(rows):
                len_row = len(row)

                if len_row > len_column_names:
                    raise ValueError(
                        "Row %i has %i values, but Table only has %i columns." % (i, len_row, len_column_names)
                    )
                elif len_row < len_column_names:
                    # Pad short rows so every column receives a value.
                    row = chain(row, [None] * (len_column_names - len_row))

                # The row now has exactly one value per column, so pairing
                # values with cast functions positionally is exact.
                new_rows.append(Row(tuple(cast(d) for cast, d in zip(cast_funcs, row)), self._column_names))
        else:
            new_rows = rows

        if row_names:
            computed_row_names = []

            if isinstance(row_names, six.string_types):
                for row in new_rows:
                    name = row[row_names]
                    computed_row_names.append(name)
            elif hasattr(row_names, "__call__"):
                for row in new_rows:
                    name = row_names(row)
                    computed_row_names.append(name)
            elif utils.issequence(row_names):
                computed_row_names = row_names
            else:
                raise ValueError("row_names must be a column name, function or sequence")

            self._row_names = tuple(computed_row_names)
        else:
            self._row_names = None

        self._rows = MappedSequence(new_rows, self._row_names)

        # Build columns
        new_columns = []

        for i, (name, data_type) in enumerate(zip(self._column_names, self._column_types)):
            column = Column(i, name, data_type, self._rows, row_names=self._row_names)
            new_columns.append(column)

        self._columns = MappedSequence(new_columns, self._column_names)
示例#25
0
def from_csv(cls, path, column_names=None, column_types=None, row_names=None, skip_lines=0, header=True, sniff_limit=0, encoding='utf-8', **kwargs):
    """
    Create a new table from a CSV.

    This method uses agate's builtin CSV reader, which supplies encoding
    support for both Python 2 and Python 3.

    :code:`kwargs` will be passed through to the CSV reader.

    :param path:
        Filepath or file-like object from which to read CSV data.
    :param column_names:
        See :meth:`.Table.__init__`.
    :param column_types:
        See :meth:`.Table.__init__`.
    :param row_names:
        See :meth:`.Table.__init__`.
    :param skip_lines:
        Either a single number indicating the number of lines to skip from
        the top of the file or a sequence of line indexes to skip where the
        first line is index 0.
    :param header:
        If `True`, the first row of the CSV is assumed to contain headers
        and will be skipped. If `header` and `column_names` are both
        specified then a row will be skipped, but `column_names` will be
        used.
    :param sniff_limit:
        Limit CSV dialect sniffing to the specified number of bytes. Set to
        None to sniff the entire file. Defaults to 0 or no sniffing.
    :param encoding:
        Character encoding of the CSV file. Note: if passing in a file
        handle it is assumed you have already opened it with the correct
        encoding specified.
    :raises ValueError:
        If :code:`skip_lines` is neither an int nor a sequence.
    :returns:
        A new :class:`.Table`.
    """
    # Imported here rather than at module level, as in the original code.
    from agate import csv
    from agate.table import Table

    if hasattr(path, 'read'):
        lines = path.readlines()
    else:
        with io.open(path, encoding=encoding) as f:
            lines = f.readlines()

    if utils.issequence(skip_lines):
        # Build the lookup once so each line costs a hash probe instead of
        # a scan over the whole skip list.
        skipped_indexes = frozenset(skip_lines)
        contents = ''.join(line for i, line in enumerate(lines) if i not in skipped_indexes)
    elif isinstance(skip_lines, int):
        contents = ''.join(lines[skip_lines:])
    else:
        raise ValueError('skip_lines argument must be an int or sequence')

    # sniff_limit of None sniffs everything; 0 disables sniffing.
    if sniff_limit is None:
        kwargs['dialect'] = csv.Sniffer().sniff(contents)
    elif sniff_limit > 0:
        kwargs['dialect'] = csv.Sniffer().sniff(contents[:sniff_limit])

    if six.PY2:
        contents = contents.encode('utf-8')

    rows = list(csv.reader(six.StringIO(contents), header=header, **kwargs))

    if header:
        # The header row is always removed; it only supplies the names when
        # the caller did not pass column_names.
        if column_names is None:
            column_names = rows.pop(0)
        else:
            rows.pop(0)

    return Table(rows, column_names, column_types, row_names=row_names)
示例#26
0
def homogenize(self, key, compare_values, default_row=None):
    """
    Fill in missing rows in a series.

    This can be used, for instance, to add rows for missing years in a time
    series.

    Missing rows are found by comparing the values in the :code:`key` columns
    with those provided as :code:`compare_values`.

    Values not found in the table will be used to generate new rows with
    the given :code:`default_row`.

    :code:`default_row` should be an array of values or an array-generating
    function. If not specified, the new rows will have :code:`None` in all
    columns not specified in :code:`key`.

    If :code:`default_row` is an array of values, its length should be row
    length minus the number of column names provided in the :code:`key`.
    The array itself is not modified.

    If it is an array-generating function, the function should take an array
    of missing values for each new row and output a full row including those
    values.

    :param key:
        Either a column name or a sequence of such names.
    :param compare_values:
        Either an array of column values if key is a single column name or a
        sequence of arrays of values if key is a sequence of names. It can
        also be a generator that yields either of the two. A row is created for
        each value or list of values not found in the rows of the table.
    :param default_row:
        An array of values or a function to generate new rows. The length of
        the input array should be equal to row length minus the number of
        key columns. The length of array generated by the function should be
        the row length.
    :returns:
        A new :class:`.Table`.
    """
    rows = list(self._rows)

    if not utils.issequence(key):
        key = [key]

    # Materialize once: a generator would otherwise be partially consumed by
    # the any() scan below before it could be wrapped.
    compare_values = list(compare_values)

    if len(key) == 1:
        if any(not utils.issequence(compare_value) for compare_value in compare_values):
            compare_values = [[compare_value] for compare_value in compare_values]

    column_values = [self._columns.get(name) for name in key]
    column_indexes = [self._column_names.index(name) for name in key]

    column_values = zip(*column_values)
    differences = list(set(map(tuple, compare_values)) - set(column_values))

    for difference in differences:
        if callable(default_row):
            rows.append(Row(default_row(difference), self._column_names))
        else:
            if default_row is not None:
                # Copy so the inserts below never touch the caller's list;
                # otherwise every later row would inherit earlier inserts.
                new_row = list(default_row)
            else:
                new_row = [None] * (len(self._column_names) - len(key))

            # Insert in ascending column order so each value lands on its
            # final position even when key names are not in table order.
            placements = sorted(zip(column_indexes, difference), key=lambda pair: pair[0])

            for i, d in placements:
                new_row.insert(i, d)

            rows.append(Row(new_row, self._column_names))

    return self._fork(rows)
示例#27
0
def join(self, right_table, left_key, right_key=None, inner=False, require_match=False, columns=None):
    """
    Create a new table by joining two tables on common values.

    This method performs the equivalent of SQL's "left outer join", combining
    columns from this table and from :code:`right_table` anywhere that the
    :code:`left_key` and :code:`right_key` are equivalent.

    Where there is no match for :code:`left_key` the left columns will
    be included with the right columns set to :code:`None` unless
    the :code:`inner` argument is specified.

    If :code:`right_key` identifies one or more columns and :code:`columns`
    is not given, the right-hand key columns are left out of the output
    table, so only the left-hand copy of the key appears.

    Column names from the right table which also exist in this table will
    be suffixed "2" in the new table.

    If this table has row names they are carried over to the new table, one
    per output row.

    :param right_table:
        The "right" table to join to.
    :param left_key:
        Either the name or index of a column from this table to join on, a
        sequence of such column identifiers, or a :class:`function` that
        takes a row and returns a value to join on.
    :param right_key:
        Either the name or index of a column from :code:`right_table` to
        join on, a sequence of such column identifiers, or a
        :class:`function` that takes a row and returns a value to join on.
        If :code:`None` then :code:`left_key` will be used for both.
    :param inner:
        Perform a SQL-style "inner join" instead of a left outer join. Rows
        which have no match for :code:`left_key` will not be included in
        the output table.
    :param require_match:
        If true, an exception will be raised if there is a left_key with no
        matching right_key.
    :param columns:
        A sequence of column names from :code:`right_table` to include in
        the final output table. Defaults to all columns not in
        :code:`right_key`.
    :raises ValueError:
        If :code:`require_match` is true and a left key has no matching
        right key.
    :returns:
        A new :class:`.Table`.
    """
    if right_key is None:
        right_key = left_key

    # Positions, within the original right table, of the right-hand key
    # columns. Stays empty when the right key is a function.
    right_key_indices = []

    # Left key is a function
    if callable(left_key):
        left_data = [left_key(row) for row in self.rows]
    # Left key is a sequence
    elif utils.issequence(left_key):
        left_columns = [self.columns[key] for key in left_key]
        left_data = zip(*[column.values() for column in left_columns])
    # Left key is a column name/index
    else:
        left_data = self.columns[left_key].values()

    # Right key is a function
    if callable(right_key):
        right_data = [right_key(row) for row in right_table.rows]
    # Right key is a sequence
    elif utils.issequence(right_key):
        right_columns = [right_table.columns[key] for key in right_key]
        right_data = zip(*[column.values() for column in right_columns])
        # Resolve positions through the column's name so that keys given as
        # integer indices work as well as keys given as names.
        right_key_indices = [right_table.column_names.index(column.name) for column in right_columns]
    # Right key is a column name/index
    else:
        right_column = right_table.columns[right_key]
        right_data = right_column.values()
        right_key_indices = [right_table.column_names.index(right_column.name)]

    # Build names and type lists
    column_names = list(self.column_names)
    column_types = list(self.column_types)

    for i, column in enumerate(right_table.columns):
        name = column.name

        if columns is None:
            if i in right_key_indices:
                continue
        elif name not in columns:
            continue

        if name in self.column_names:
            column_names.append('%s2' % name)
        else:
            column_names.append(name)

        column_types.append(column.data_type)

    if columns is not None:
        right_table = right_table.select([n for n in right_table.column_names if n in columns])

    # Index the right-hand rows by join value.
    right_hash = {}

    for i, value in enumerate(right_data):
        right_hash.setdefault(value, []).append(right_table.rows[i])

    # Right-hand positions dropped from every output row. Key columns are
    # only dropped when the caller did not pick columns explicitly.
    skipped_positions = set(right_key_indices) if columns is None else set()
    unmatched_padding = [None] * (len(right_table.column_names) - len(skipped_positions))

    # Collect new rows
    rows = []
    left_row_names = self.row_names
    row_names = [] if left_row_names is not None else None

    # Iterate over left column
    for left_index, left_value in enumerate(left_data):
        matching_rows = right_hash.get(left_value)

        if require_match and matching_rows is None:
            # Wrap in a 1-tuple: a tuple key (sequence join) would otherwise
            # be unpacked as multiple format arguments and raise TypeError.
            raise ValueError('Left key "%s" does not have a matching right key.' % (left_value,))

        # Rows with matches
        if matching_rows:
            for right_row in matching_rows:
                new_row = list(self.rows[left_index])
                new_row.extend(v for k, v in enumerate(right_row) if k not in skipped_positions)

                rows.append(Row(new_row, column_names))

                if row_names is not None:
                    row_names.append(left_row_names[left_index])
        # Rows without matches
        elif not inner:
            new_row = list(self.rows[left_index]) + unmatched_padding

            rows.append(Row(new_row, column_names))

            if row_names is not None:
                row_names.append(left_row_names[left_index])

    return self._fork(rows, column_names, column_types, row_names=row_names)
示例#28
0
文件: normalize.py 项目: jean/agate
def normalize(self, key, properties, property_column='property', value_column='value', column_types=None):
    """
    Create a new table with columns converted into rows values.

    For example:

    +---------+----------+--------+-------+
    |  name   | gender   | race   | age   |
    +=========+==========+========+=======+
    |  Jane   | female   | black  | 24    |
    +---------+----------+--------+-------+
    |  Jack   | male     | white  | 35    |
    +---------+----------+--------+-------+
    |  Joe    | male     | black  | 28    |
    +---------+----------+--------+-------+

    can be normalized on columns 'gender', 'race' and 'age':

    +---------+-----------+---------+
    |  name   | property  | value   |
    +=========+===========+=========+
    |  Jane   | gender    | female  |
    +---------+-----------+---------+
    |  Jane   | race      | black   |
    +---------+-----------+---------+
    |  Jane   | age       | 24      |
    +---------+-----------+---------+
    |  ...    |  ...      |  ...    |
    +---------+-----------+---------+

    This is the opposite of :meth:`.Table.denormalize`.

    :param key:
        A column name or a sequence of column names that should be
        maintained as they are in the normalized table. Typically these
        are the table's unique identifiers and any metadata about them.
    :param properties:
        A column name or a sequence of column names that should be
        converted to properties in the new table.
    :param property_column:
        The name to use for the column containing the property names.
    :param value_column:
        The name to use for the column containing the property values.
    :param column_types:
        A sequence of two column types for the property and value column in
        that order or an instance of :class:`.TypeTester`. Defaults to a
        generic :class:`.TypeTester`.
    :returns:
        A new :class:`.Table`.
    """
    from agate.table import Table

    new_rows = []

    if not utils.issequence(key):
        key = [key]

    if not utils.issequence(properties):
        properties = [properties]

    # Copy into a list first: concatenating a tuple key with a list would
    # raise TypeError.
    new_column_names = list(key) + [property_column, value_column]

    row_names = []

    for row in self._rows:
        k = tuple(row[n] for n in key)
        left_row = list(k)

        # NOTE(review): one row name is recorded per source row, while one
        # output row is produced per property below, so row_names is shorter
        # than new_rows whenever there is more than one property. Confirm
        # how Table treats a row_names sequence of a different length.
        if len(k) == 1:
            row_names.append(k[0])
        else:
            row_names.append(k)

        for f in properties:
            new_rows.append(Row((left_row + [f, row[f]]), new_column_names))

    key_column_types = [self._column_types[self._column_names.index(name)] for name in key]

    if column_types is None or isinstance(column_types, TypeTester):
        tester = TypeTester() if column_types is None else column_types
        # Key columns keep their existing types unless the tester already
        # forces a type for the same name (the tester's entries win).
        force_update = dict(zip(key, key_column_types))
        force_update.update(tester._force)
        # NOTE(review): when the caller supplied the TypeTester, this
        # replaces its _force mapping on the caller's object.
        tester._force = force_update

        new_column_types = tester.run(new_rows, new_column_names)
    else:
        new_column_types = key_column_types + list(column_types)

    return Table(new_rows, new_column_names, new_column_types, row_names=row_names)
示例#29
0
文件: join.py 项目: skorasaurus/agate
def join(self, right_table, left_key=None, right_key=None, inner=False, full_outer=False, require_match=False, columns=None):
    """
    Create a new table by joining two tables on common values. This method
    implements most varieties of SQL join, in addition to some unique features.

    If :code:`left_key` and :code:`right_key` are both :code:`None` then this
    method will perform a "sequential join", which is to say it will join on row
    number. The :code:`inner` and :code:`full_outer` arguments will determine
    whether dangling left-hand and right-hand rows are included, respectively.

    If :code:`left_key` is specified, then a "left outer join" will be
    performed. This will combine columns from the :code:`right_table` anywhere
    that :code:`left_key` and :code:`right_key` are equal. Unmatched rows from
    the left table will be included with the right-hand columns set to
    :code:`None`.

    If :code:`inner` is :code:`True` then an "inner join" will be performed.
    Unmatched rows from either table will be left out.

    If :code:`full_outer` is :code:`True` then a "full outer join" will be
    performed. Unmatched rows from both tables will be included, with the
    columns in the other table set to :code:`None`.

    In all cases, if :code:`right_key` is :code:`None` then :code:`left_key`
    will be used for both tables.

    If :code:`left_key` and :code:`right_key` are column names, the right-hand
    identifier column will not be included in the output table.

    If :code:`require_match` is :code:`True` unmatched rows will raise an
    exception. This is like an "inner join" except any row that doesn't have a
    match will raise an exception instead of being dropped. This is useful for
    enforcing expectations about datasets that should match.

    Column names from the right table which also exist in this table will
    be suffixed "2" in the new table.

    A subset of columns from the right-hand table can be included in the joined
    table using the :code:`columns` argument.

    Row names of this table are carried over to the new table, except for
    full outer joins, which produce a table without row names.

    :param right_table:
        The "right" table to join to.
    :param left_key:
        Either the name of a column from this table to join on, the index
        of a column, a sequence of such column identifiers, a
        :class:`function` that takes a row and returns a value to join on, or
        :code:`None` in which case the tables will be joined on row number.
    :param right_key:
        Either the name of a column from :code:`right_table` to join on, the
        index of a column, a sequence of such column identifiers, or a
        :class:`function` that takes a row and returns a value to join on.
        If :code:`None` then :code:`left_key` will be used for both. If
        :code:`left_key` is :code:`None` then this value is ignored.
    :param inner:
        Perform a SQL-style "inner join" instead of a left outer join. Rows
        which have no match for :code:`left_key` will not be included in
        the output table.
    :param full_outer:
        Perform a SQL-style "full outer" join rather than a left or a right.
        May not be used in combination with :code:`inner`.
    :param require_match:
        If true, an exception will be raised if there is a left_key with no
        matching right_key.
    :param columns:
        A sequence of column names from :code:`right_table` to include in
        the final output table. Defaults to all columns not in
        :code:`right_key`. Ignored when :code:`full_outer` is :code:`True`.
    :raises ValueError:
        If both :code:`inner` and :code:`full_outer` are set, or if
        :code:`require_match` is true and a left key has no matching right
        key.
    :returns:
        A new :class:`.Table`.
    """
    if inner and full_outer:
        raise ValueError('A join can not be both "inner" and "full_outer".')

    if right_key is None:
        right_key = left_key

    # Positions, within the original right table, of the right-hand key
    # columns. Stays empty for sequential joins and function keys.
    right_key_indices = []

    # Left key is None
    if left_key is None:
        left_data = tuple(range(len(self._rows)))
    # Left key is a function
    elif callable(left_key):
        left_data = [left_key(row) for row in self._rows]
    # Left key is a sequence
    elif utils.issequence(left_key):
        left_columns = [self._columns[key] for key in left_key]
        # Materialize: the full outer pass below reads the keys a second
        # time, and a Python 3 zip iterator would already be exhausted.
        left_data = list(zip(*[column.values() for column in left_columns]))
    # Left key is a column name/index
    else:
        left_data = self._columns[left_key].values()

    # Sequential join
    if left_key is None:
        right_data = tuple(range(len(right_table._rows)))
    # Right key is a function
    elif callable(right_key):
        right_data = [right_key(row) for row in right_table._rows]
    # Right key is a sequence
    elif utils.issequence(right_key):
        right_columns = [right_table._columns[key] for key in right_key]
        # Materialized for the same reason as left_data above.
        right_data = list(zip(*[column.values() for column in right_columns]))
        # Look positions up by column object, as the single-key branch
        # does, so keys given as integer indices resolve too.
        right_key_indices = [right_table._columns.index(column) for column in right_columns]
    # Right key is a column name/index
    else:
        right_column = right_table._columns[right_key]
        right_data = right_column.values()
        right_key_indices = [right_table._columns.index(right_column)]

    # Build names and type lists
    column_names = list(self._column_names)
    column_types = list(self._column_types)

    for i, column in enumerate(right_table._columns):
        name = column.name

        # A full outer join keeps every right-hand column.
        if not full_outer:
            if columns is None and i in right_key_indices:
                continue

            if columns is not None and name not in columns:
                continue

        if name in self._column_names:
            column_names.append('%s2' % name)
        else:
            column_names.append(name)

        column_types.append(column.data_type)

    if columns is not None and not full_outer:
        right_table = right_table.select([n for n in right_table._column_names if n in columns])

    # Index the right-hand rows by join value.
    right_hash = {}

    for i, value in enumerate(right_data):
        right_hash.setdefault(value, []).append(right_table._rows[i])

    # Right-hand positions dropped from every output row. Key columns are
    # only dropped when no explicit columns were requested and this is not
    # a full outer join.
    if columns is None and not full_outer:
        skipped_positions = set(right_key_indices)
    else:
        skipped_positions = set()

    unmatched_padding = [None] * (len(right_table._column_names) - len(skipped_positions))

    # Collect new rows
    rows = []

    if self._row_names is not None and not full_outer:
        row_names = []
    else:
        row_names = None

    # Iterate over left column
    for left_index, left_value in enumerate(left_data):
        matching_rows = right_hash.get(left_value)

        if require_match and matching_rows is None:
            # Wrap in a 1-tuple: a tuple key (sequence join) would otherwise
            # be unpacked as multiple format arguments and raise TypeError.
            raise ValueError('Left key "%s" does not have a matching right key.' % (left_value,))

        # Rows with matches
        if matching_rows:
            for right_row in matching_rows:
                new_row = list(self._rows[left_index])
                new_row.extend(v for k, v in enumerate(right_row) if k not in skipped_positions)

                rows.append(Row(new_row, column_names))

                if row_names is not None:
                    row_names.append(self._row_names[left_index])
        # Rows without matches
        elif not inner:
            new_row = list(self._rows[left_index]) + unmatched_padding

            rows.append(Row(new_row, column_names))

            if row_names is not None:
                row_names.append(self._row_names[left_index])

    # Full outer join: append right-hand rows whose key never appeared on
    # the left, with the left-hand columns blanked out.
    if full_outer:
        left_set = set(left_data)
        left_blanks = [None] * len(self._columns)

        for right_index, right_value in enumerate(right_data):
            if right_value in left_set:
                continue

            new_row = left_blanks + list(right_table._rows[right_index])

            rows.append(Row(new_row, column_names))

    return self._fork(rows, column_names, column_types, row_names=row_names)
示例#30
0
def denormalize(self, key=None, property_column='property', value_column='value', default_value=utils.default, column_types=None):
    """
    Denormalize a dataset so that unique values in a column become their
    own columns.

    For example:

    +---------+-----------+---------+
    |  name   | property  | value   |
    +=========+===========+=========+
    |  Jane   | gender    | female  |
    +---------+-----------+---------+
    |  Jane   | race      | black   |
    +---------+-----------+---------+
    |  Jane   | age       | 24      |
    +---------+-----------+---------+
    |  ...    |  ...      |  ...    |
    +---------+-----------+---------+

    Can be denormalized so that each unique value in `property` becomes a
    column with `value` used for its values.

    +---------+----------+--------+-------+
    |  name   | gender   | race   | age   |
    +=========+==========+========+=======+
    |  Jane   | female   | black  | 24    |
    +---------+----------+--------+-------+
    |  Jack   | male     | white  | 35    |
    +---------+----------+--------+-------+
    |  Joe    | male     | black  | 28    |
    +---------+----------+--------+-------+

    If one or more keys are specified then the resulting table will
    automatically have `row_names` set to those keys.

    This is the opposite of :meth:`Table.normalize`.

    :param key:
        A column name or a sequence of column names that should be
        maintained as they are in the normalized table. Typically these
        are the tables unique identifiers and any metadata about them. Or,
        :code:`None` if there are no key columns.
    :param property_column:
        The column whose values should become column names in the new table.
        Values are converted to text before being used as names.
    :param value_column:
        The column whose values should become the values of the property
        columns in the new table.
    :param default_value:
        Value to be used for cells that have no matching row in this table.
        If not specified :code:`Decimal(0)` will be used when
        :code:`value_column` holds :class:`.Number` data and :code:`None`
        will be used otherwise.
    :param column_types:
        A sequence of column types with length equal to number of unique
        values in :code:`property_column` or an instance of
        :class:`.TypeTester`. Defaults to a generic :class:`.TypeTester`.
        When a :class:`.TypeTester` is given, the types of the key columns
        are added to its forced types in place.
    :returns:
        A new :class:`Table`.
    """
    from agate.table import Table

    if key is None:
        key = []
    elif not utils.issequence(key):
        key = [key]
    else:
        # A tuple of names is a valid sequence, but it cannot be concatenated
        # with the list of property names below, so always work on a list.
        key = list(key)

    field_names = []
    # Mirrors field_names for constant-time membership tests.
    seen_fields = set()
    row_data = OrderedDict()

    for row in self.rows:
        row_key = tuple(row[k] for k in key)

        if row_key not in row_data:
            row_data[row_key] = OrderedDict()

        f = six.text_type(row[property_column])
        v = row[value_column]

        if f not in seen_fields:
            seen_fields.add(f)
            field_names.append(f)

        row_data[row_key][f] = v

    if default_value == utils.default:
        if isinstance(self.columns[value_column].data_type, Number):
            default_value = Decimal(0)
        else:
            default_value = None

    new_column_names = key + field_names

    new_rows = []
    row_names = []

    for k, v in row_data.items():
        row = list(k)

        # A single key column names the row by its value; several key
        # columns (or none) name it by the whole tuple.
        if len(k) == 1:
            row_names.append(k[0])
        else:
            row_names.append(k)

        for f in field_names:
            row.append(v.get(f, default_value))

        new_rows.append(Row(row, new_column_names))

    key_column_types = [self.column_types[self.column_names.index(name)] for name in key]

    if column_types is None or isinstance(column_types, TypeTester):
        tester = TypeTester() if column_types is None else column_types
        # Key columns keep their existing types unless the tester already
        # forces a type for the same name.
        force_update = dict(zip(key, key_column_types))
        force_update.update(tester._force)
        tester._force = force_update

        new_column_types = tester.run(new_rows, new_column_names)
    else:
        new_column_types = key_column_types + list(column_types)

    return Table(new_rows, new_column_names, new_column_types, row_names=row_names)
示例#31
0
    def __init__(self,
                 rows,
                 column_names=None,
                 column_types=None,
                 row_names=None,
                 _is_fork=False):
        """
        Build the table's column names, column types, rows and columns.

        :param rows:
            A sequence of rows. A string is rejected with a
            :class:`ValueError`.
        :param column_names:
            Names for the columns. They are passed through
            :func:`utils.deduplicate`. If omitted and :code:`rows` is not
            empty, names are generated with :func:`utils.letter_name` for
            each position of the first row and a :class:`RuntimeWarning` is
            issued.
        :param column_types:
            :code:`None` (a default :class:`.TypeTester` is run), a dict of
            :class:`.DataType` instances (used as forced types of a
            :class:`.TypeTester`), a :class:`.TypeTester`, or a sequence of
            :class:`.DataType` instances.
        :param row_names:
            A column name, a function that takes a row and returns its name,
            or a sequence of names. Names of type :code:`int` are rejected.
        :param _is_fork:
            When true, :code:`rows` are stored as given, without length
            checks or casting.
        :raises ValueError:
            For invalid arguments or a row longer than the column names.
        :raises CastError:
            When a value cannot be cast; the message includes the row index
            and column name.
        """
        if isinstance(rows, six.string_types):
            raise ValueError(
                'When created directly, the first argument to Table must be a sequence of rows. '
                'Did you want agate.Table.from_csv?')

        # Resolve column names
        if column_names:
            self._column_names = utils.deduplicate(column_names,
                                                   column_names=True)
        elif rows:
            first_row_width = len(rows[0])
            self._column_names = tuple(
                utils.letter_name(n) for n in range(first_row_width))
            warnings.warn(
                'Column names not specified. "%s" will be used as names.' %
                str(self._column_names),
                RuntimeWarning,
                stacklevel=2)
        else:
            self._column_names = tuple()

        names = self._column_names
        width = len(names)

        # Resolve column types
        if column_types is None:
            column_types = TypeTester()
        elif isinstance(column_types, dict):
            if not all(isinstance(t, DataType) for t in column_types.values()):
                raise ValueError('Column types must be instances of DataType.')

            column_types = TypeTester(force=column_types)
        elif not isinstance(column_types, TypeTester):
            if not all(isinstance(t, DataType) for t in column_types):
                raise ValueError('Column types must be instances of DataType.')

        if isinstance(column_types, TypeTester):
            self._column_types = column_types.run(rows, names)
        else:
            self._column_types = tuple(column_types)

        if width != len(self._column_types):
            raise ValueError(
                'column_names and column_types must be the same length.')

        # Cast rows, unless they come from an existing table
        if _is_fork:
            new_rows = rows
        else:
            casters = [data_type.cast for data_type in self._column_types]
            new_rows = []

            for row_index, raw_row in enumerate(rows):
                row_width = len(raw_row)

                if row_width > width:
                    raise ValueError(
                        'Row %i has %i values, but Table only has %i columns.'
                        % (row_index, row_width, width))

                if row_width < width:
                    # Short rows are padded with None up to the table width.
                    raw_row = chain(raw_row, [None] * (width - row_width))

                cast_values = []

                for column_index, value in enumerate(raw_row):
                    try:
                        cast_value = casters[column_index](value)
                    except CastError as e:
                        raise CastError(
                            str(e) + ' Error at row %s column %s.' %
                            (row_index, names[column_index]))

                    cast_values.append(cast_value)

                new_rows.append(Row(cast_values, names))

        # Resolve row names
        if not row_names:
            self._row_names = None
        else:
            if isinstance(row_names, six.string_types):
                computed_row_names = [row[row_names] for row in new_rows]
            elif hasattr(row_names, '__call__'):
                computed_row_names = [row_names(row) for row in new_rows]
            elif utils.issequence(row_names):
                computed_row_names = row_names
            else:
                raise ValueError(
                    'row_names must be a column name, function or sequence')

            if any(type(row_name) is int for row_name in computed_row_names):
                raise ValueError(
                    'Row names cannot be of type int. Use Decimal for numbered row names.'
                )

            self._row_names = tuple(computed_row_names)

        self._rows = MappedSequence(new_rows, self._row_names)

        # Build columns
        new_columns = [
            Column(position,
                   name,
                   data_type,
                   self._rows,
                   row_names=self._row_names)
            for position, (name, data_type) in enumerate(
                zip(names, self._column_types))
        ]

        self._columns = MappedSequence(new_columns, names)
示例#32
0
def homogenize(self, key, compare_values, default_row=None):
    """
    Fill in missing rows in a series.

    This can be used, for instance, to add rows for missing years in a time
    series.

    Missing rows are found by comparing the values in the :code:`key` columns
    with those provided as :code:`compare_values`.

    Values not found in the table will be used to generate new rows with
    the given :code:`default_row`. New rows are appended after the existing
    rows, in the order the missing values first appear in
    :code:`compare_values`.

    :code:`default_row` should be an array of values or an array-generating
    function. If not specified, the new rows will have :code:`None` in all
    columns not specified in :code:`key`.

    If :code:`default_row` is an array of values, its length should be row
    length minus the number of column names provided in the :code:`key`. The
    array is copied for every new row and is never modified.

    If it is an array-generating function, the function is called with the
    tuple of missing key values for each new row and should output a full row
    including those values.

    :param key:
        Either a column name or a sequence of such names.
    :param compare_values:
        Either an array of column values if key is a single column name or a
        sequence of arrays of values if key is a sequence of names. It can
        also be a generator that yields either of the two. A row is created for
        each value or list of values not found in the rows of the table.
    :param default_row:
        An array of values or a function to generate new rows. The length of
        the input array should be equal to row length minus column_names
        count. The length of array generated by the function should be the
        row length.
    :returns:
        A new :class:`.Table`.
    """
    rows = list(self.rows)

    if not utils.issequence(key):
        key = [key]

    # compare_values may be a generator and is walked more than once below,
    # so materialize it before inspecting it.
    compare_values = list(compare_values)

    if len(key) == 1:
        if any(not utils.issequence(compare_value) for compare_value in compare_values):
            compare_values = [[compare_value] for compare_value in compare_values]

    column_values = [self.columns.get(name) for name in key]
    column_indexes = [self.column_names.index(name) for name in key]

    existing = set(zip(*column_values))

    # Collect each missing combination once, keeping first-seen order so the
    # order of the appended rows is deterministic.
    differences = []
    seen = set()

    for candidate in map(tuple, compare_values):
        if candidate in existing or candidate in seen:
            continue

        seen.add(candidate)
        differences.append(candidate)

    for difference in differences:
        if callable(default_row):
            rows.append(Row(default_row(difference), self.column_names))
        else:
            if default_row is not None:
                # Copy: inserting into the caller's list would corrupt it and
                # make every later row longer than the previous one.
                new_row = list(default_row)
            else:
                new_row = [None] * (len(self.column_names) - len(key))

            # Insert in ascending column order so each key value lands at its
            # final index even when `key` is not listed in column order.
            for i, d in sorted(zip(column_indexes, difference), key=lambda pair: pair[0]):
                new_row.insert(i, d)

            rows.append(Row(new_row, self.column_names))

    return self._fork(rows, self.column_names, self.column_types)
示例#33
0
def normalize(self,
              key,
              properties,
              property_column='property',
              value_column='value',
              column_types=None):
    """
    Create a new table with columns converted into rows values.

    For example:

    +---------+----------+--------+-------+
    |  name   | gender   | race   | age   |
    +=========+==========+========+=======+
    |  Jane   | female   | black  | 24    |
    +---------+----------+--------+-------+
    |  Jack   | male     | white  | 35    |
    +---------+----------+--------+-------+
    |  Joe    | male     | black  | 28    |
    +---------+----------+--------+-------+

    can be normalized on columns 'gender', 'race' and 'age':

    +---------+-----------+---------+
    |  name   | property  | value   |
    +=========+===========+=========+
    |  Jane   | gender    | female  |
    +---------+-----------+---------+
    |  Jane   | race      | black   |
    +---------+-----------+---------+
    |  Jane   | age       | 24      |
    +---------+-----------+---------+
    |  ...    |  ...      |  ...    |
    +---------+-----------+---------+

    Every row of the new table is named after its key values, so a key
    appears once in the row names for each of its properties.

    This is the opposite of :meth:`.Table.denormalize`.

    :param key:
        A column name or a sequence of column names that should be
        maintained as they are in the normalized table. Typically these
        are the tables unique identifiers and any metadata about them.
    :param properties:
        A column name or a sequence of column names that should be
        converted to properties in the new table.
    :param property_column:
        The name to use for the column containing the property names.
    :param value_column:
        The name to use for the column containing the property values.
    :param column_types:
        A sequence of two column types for the property and value column in
        that order or an instance of :class:`.TypeTester`. Defaults to a
        generic :class:`.TypeTester`. When a :class:`.TypeTester` is given,
        the types of the key columns are added to its forced types in place.
    :returns:
        A new :class:`.Table`.
    """
    from agate.table import Table

    new_rows = []

    if not utils.issequence(key):
        key = [key]
    else:
        # A tuple of names cannot be concatenated with the list below.
        key = list(key)

    if not utils.issequence(properties):
        properties = [properties]

    new_column_names = key + [property_column, value_column]

    row_names = []

    for row in self.rows:
        k = tuple(row[n] for n in key)
        left_row = list(k)

        # A single key column names the row by its value; several key
        # columns name it by the whole tuple.
        row_name = k[0] if len(k) == 1 else k

        for f in properties:
            new_rows.append(
                Row(tuple(left_row + [f, row[f]]), new_column_names))
            # One name per generated row keeps names aligned with rows; one
            # name per source row would label the wrong rows.
            row_names.append(row_name)

    key_column_types = [
        self.column_types[self.column_names.index(name)] for name in key
    ]

    if column_types is None or isinstance(column_types, TypeTester):
        tester = TypeTester() if column_types is None else column_types
        # Key columns keep their existing types unless the tester already
        # forces a type for the same name.
        force_update = dict(zip(key, key_column_types))
        force_update.update(tester._force)
        tester._force = force_update

        new_column_types = tester.run(new_rows, new_column_names)
    else:
        new_column_types = key_column_types + list(column_types)

    return Table(new_rows,
                 new_column_names,
                 new_column_types,
                 row_names=row_names)
示例#34
0
文件: pivot.py 项目: livlab/agate
def pivot(self, key=None, pivot=None, aggregation=None, computation=None, default_value=utils.default, key_name=None):
    """
    Pivot reorganizes the data in a table by grouping the data, aggregating
    those groups, optionally applying a computation, and then organizing
    the groups into new rows and columns.

    For example:

    +---------+---------+--------+
    |  name   |  race   | gender |
    +=========+=========+========+
    |  Joe    |  white  | male   |
    +---------+---------+--------+
    |  Jane   |  black  | female |
    +---------+---------+--------+
    |  Josh   |  black  | male   |
    +---------+---------+--------+
    |  Jim    |  asian  | female |
    +---------+---------+--------+

    This table can be pivoted with :code:`key` equal to "race" and
    :code:`pivot` equal to "gender". The default aggregation is
    :class:`.Count`. This would result in the following table.

    +---------+---------+--------+
    |  race   |  male   | female |
    +=========+=========+========+
    |  white  |  1      | 0      |
    +---------+---------+--------+
    |  black  |  1      | 1      |
    +---------+---------+--------+
    |  asian  |  0      | 1      |
    +---------+---------+--------+

    If one or more keys are specified then the resulting table will
    automatically have `row_names` set to those keys.

    See also the related method :meth:`Table.denormalize`.

    :param key:
        Either the name of a column from the this table to group by, a
        sequence of such column names, a :class:`function` that takes a
        row and returns a value to group by, or :code:`None`, in which case
        there will be only a single row in the output table.
    :param pivot:
        A column name whose unique values will become columns in the new
        table, or :code:`None` in which case there will be a single value
        column in the output table.
    :param aggregation:
        An instance of an :class:`.Aggregation` to perform on each group of
        data in the pivot table. (Each cell is the result of an aggregation
        of the grouped data.)

        If not specified this defaults to :class:`.Count` with no arguments.
    :param computation:
        An optional :class:`.Computation` instance to be applied to the
        aggregated sequence of values before they are transposed into the
        pivot table. When given, the pivoted columns use the data type
        reported by the computation.

        Use the class name of the aggregation as your column name argument
        when constructing your computation. (This is "Count" if using the
        default value for :code:`aggregation`.)
    :param default_value:
        Value to be used for missing values in the pivot table. It is
        forwarded to :meth:`Table.denormalize` and only applies when
        :code:`pivot` is given. If performing non-mathematical aggregations
        you may wish to set this to :code:`None`.
    :param key_name:
        A name for the key column in the output table. This is most
        useful when the provided key is a function. This argument is not
        valid when :code:`key` is a sequence.
    :returns:
        A new :class:`Table`.
    :raises ValueError:
        If :code:`key_name` is given together with a sequence :code:`key`.
    """
    if key is None:
        key = []
    elif not utils.issequence(key):
        key = [key]
    elif key_name:
        raise ValueError('key_name is not a valid argument when key is a sequence.')
    else:
        # Copy into a list so later steps always receive the same container
        # type, whatever kind of sequence the caller passed.
        key = list(key)

    if aggregation is None:
        aggregation = Count()

    groups = self

    for k in key:
        groups = groups.group_by(k, key_name=key_name)

    aggregation_name = six.text_type(aggregation)
    computation_name = six.text_type(computation) if computation else None

    def apply_computation(table):
        # Add the computed column, then drop the raw aggregate it was
        # derived from so only the computed values remain.
        computed = table.compute([
            (computation_name, computation)
        ])

        excluded = computed.exclude([aggregation_name])

        return excluded

    if pivot is not None:
        groups = groups.group_by(pivot)

        column_type = aggregation.get_aggregate_data_type(groups)

        table = groups.aggregate([
            (aggregation_name, aggregation)
        ])

        pivot_count = len(set(table.columns[pivot].values()))

        if computation is not None:
            # The pivoted cells will hold the computed values, so they take
            # the computation's data type instead of the aggregation's. This
            # must be read before the aggregate column is dropped.
            column_type = computation.get_computed_data_type(table)
            table = apply_computation(table)

        column_types = [column_type] * pivot_count

        table = table.denormalize(key, pivot, computation_name or aggregation_name, default_value=default_value, column_types=column_types)
    else:
        table = groups.aggregate([
            (aggregation_name, aggregation)
        ])

        if computation:
            table = apply_computation(table)

    return table
示例#35
0
文件: __init__.py 项目: ejmurra/agate
    def from_csv(cls,
                 path,
                 column_names=None,
                 column_types=None,
                 row_names=None,
                 skip_lines=0,
                 header=True,
                 sniff_limit=0,
                 encoding='utf-8',
                 **kwargs):
        """
        Create a new table from a CSV.

        This method uses agate's builtin CSV reader, which supplies encoding
        support for both Python 2 and Python 3.

        :code:`kwargs` will be passed through to the CSV reader.

        :param path:
            Filepath or file-like object from which to read CSV data.
        :param column_names:
            See :meth:`.Table.__init__`.
        :param column_types:
            See :meth:`.Table.__init__`.
        :param row_names:
            See :meth:`.Table.__init__`.
        :param skip_lines:
            Either a single number indicating the number of lines to skip from
            the top of the file or a sequence of line indexes to skip where the
            first line is index 0.
        :param header:
            If `True`, the first row of the CSV is assumed to contain headers
            and will be skipped. If `header` and `column_names` are both
            specified then a row will be skipped, but `column_names` will be
            used. If no rows remain after skipping lines, there is no header
            row to remove and the table is built from zero rows.
        :param sniff_limit:
            Limit CSV dialect sniffing to the specified number of bytes. Set to
            None to sniff the entire file. Defaults to 0 or no sniffing.
        :param encoding:
            Character encoding of the CSV file. Note: if passing in a file
            handle it is assumed you have already opened it with the correct
            encoding specified.
        :returns:
            A new :class:`Table`.
        :raises ValueError:
            If :code:`skip_lines` is neither an int nor a sequence.
        """
        if hasattr(path, 'read'):
            lines = path.readlines()
        else:
            with io.open(path, encoding=encoding) as f:
                lines = f.readlines()

        if utils.issequence(skip_lines):
            # Build the lookup once instead of scanning the sequence for
            # every line of the file.
            skipped = set(skip_lines)
            contents = ''.join(
                line for i, line in enumerate(lines) if i not in skipped)
        elif isinstance(skip_lines, int):
            contents = ''.join(lines[skip_lines:])
        else:
            raise ValueError('skip_lines argument must be an int or sequence')

        if sniff_limit is None:
            kwargs['dialect'] = csv.Sniffer().sniff(contents)
        elif sniff_limit > 0:
            kwargs['dialect'] = csv.Sniffer().sniff(contents[:sniff_limit])

        if six.PY2:
            contents = contents.encode('utf-8')

        rows = list(csv.reader(six.StringIO(contents), header=header,
                               **kwargs))

        # Guard on `rows`: popping from an empty list would raise a bare
        # IndexError for an empty file.
        if header and rows:
            header_row = rows.pop(0)

            if column_names is None:
                column_names = header_row

        return Table(rows, column_names, column_types, row_names=row_names)
示例#36
0
def pivot(self,
          key=None,
          pivot=None,
          aggregation=None,
          computation=None,
          default_value=utils.default,
          key_name=None):
    """
    Create a new table by grouping the data, aggregating those groups,
    applying a computation, and then organizing the groups into new rows and
    columns.

    This is sometimes called a "crosstab".

    +---------+---------+--------+
    |  name   |  race   | gender |
    +=========+=========+========+
    |  Joe    |  white  | male   |
    +---------+---------+--------+
    |  Jane   |  black  | female |
    +---------+---------+--------+
    |  Josh   |  black  | male   |
    +---------+---------+--------+
    |  Jim    |  asian  | female |
    +---------+---------+--------+

    This table can be pivoted with :code:`key` equal to "race" and
    :code:`pivot` equal to "gender". The default aggregation is
    :class:`.Count`. This would result in the following table.

    +---------+---------+--------+
    |  race   |  male   | female |
    +=========+=========+========+
    |  white  |  1      | 0      |
    +---------+---------+--------+
    |  black  |  1      | 1      |
    +---------+---------+--------+
    |  asian  |  0      | 1      |
    +---------+---------+--------+

    If one or more keys are specified then the resulting table will
    automatically have :code:`row_names` set to those keys.

    See also the related method :meth:`.Table.denormalize`.

    :param key:
        Either the name of a column from the this table to group by, a
        sequence of such column names, a :class:`function` that takes a
        row and returns a value to group by, or :code:`None`, in which case
        there will be only a single row in the output table.
    :param pivot:
        A column name whose unique values will become columns in the new
        table, or :code:`None` in which case there will be a single value
        column in the output table.
    :param aggregation:
        An instance of an :class:`.Aggregation` to perform on each group of
        data in the pivot table. (Each cell is the result of an aggregation
        of the grouped data.)

        If not specified this defaults to :class:`.Count` with no arguments.
    :param computation:
        An optional :class:`.Computation` instance to be applied to the
        aggregated sequence of values before they are transposed into the
        pivot table. When given, the pivoted columns use the data type
        reported by the computation.

        Use the class name of the aggregation as your column name argument
        when constructing your computation. (This is "Count" if using the
        default value for :code:`aggregation`.)
    :param default_value:
        Value to be used for missing values in the pivot table. It is
        forwarded to :meth:`.Table.denormalize` and only applies when
        :code:`pivot` is given. If performing non-mathematical aggregations
        you may wish to set this to :code:`None`.
    :param key_name:
        A name for the key column in the output table. This is most
        useful when the provided key is a function. This argument is not
        valid when :code:`key` is a sequence.
    :returns:
        A new :class:`.Table`.
    :raises ValueError:
        If :code:`key_name` is given together with a sequence :code:`key`.
    """
    if key is None:
        key = []
    elif not utils.issequence(key):
        key = [key]
    elif key_name:
        raise ValueError(
            'key_name is not a valid argument when key is a sequence.')
    else:
        # Copy into a list so later steps always receive the same container
        # type, whatever kind of sequence the caller passed.
        key = list(key)

    if aggregation is None:
        aggregation = Count()

    groups = self

    for k in key:
        groups = groups.group_by(k, key_name=key_name)

    aggregation_name = six.text_type(aggregation)
    computation_name = six.text_type(computation) if computation else None

    def apply_computation(table):
        # Add the computed column, then drop the raw aggregate it was
        # derived from so only the computed values remain.
        computed = table.compute([(computation_name, computation)])

        excluded = computed.exclude([aggregation_name])

        return excluded

    if pivot is not None:
        groups = groups.group_by(pivot)

        column_type = aggregation.get_aggregate_data_type(self)

        table = groups.aggregate([(aggregation_name, aggregation)])

        pivot_count = len(set(table.columns[pivot].values()))

        if computation is not None:
            # The pivoted cells will hold the computed values, so they take
            # the computation's data type instead of the aggregation's. This
            # must be read before the aggregate column is dropped.
            column_type = computation.get_computed_data_type(table)
            table = apply_computation(table)

        column_types = [column_type] * pivot_count

        table = table.denormalize(key,
                                  pivot,
                                  computation_name or aggregation_name,
                                  default_value=default_value,
                                  column_types=column_types)
    else:
        table = groups.aggregate([(aggregation_name, aggregation)])

        if computation:
            table = apply_computation(table)

    return table
示例#37
0
文件: __init__.py 项目: ejmurra/agate
    def __init__(self,
                 rows,
                 column_names=None,
                 column_types=None,
                 row_names=None,
                 _is_fork=False):
        """
        Build the table's column names, column types, rows and columns.

        :param rows:
            A sequence of rows. A string is rejected with a
            :class:`ValueError`.
        :param column_names:
            A sequence of strings or :code:`None` values. A :code:`None`
            entry is replaced by a generated letter name and a duplicate name
            gets a numeric suffix; both cases issue a warning. If omitted and
            :code:`rows` is not empty, names are generated for each position
            of the first row.
        :param column_types:
            :code:`None` (a default :class:`.TypeTester` is run), a dict of
            :class:`.DataType` instances (used as forced types of a
            :class:`.TypeTester`), a :class:`.TypeTester`, or a sequence of
            :class:`.DataType` instances.
        :param row_names:
            A column name, a function that takes a row and returns its name,
            or a sequence of names.
        :param _is_fork:
            When true, :code:`rows` are stored as given, without length
            checks or casting.
        :raises ValueError:
            For invalid arguments or a row longer than the column names.
        """
        if isinstance(rows, six.string_types):
            raise ValueError(
                'When created directly, the first argument to Table must be a sequence of rows. Did you want agate.Table.from_csv?'
            )

        # Validate column names
        if column_names:
            final_column_names = []
            # Mirrors final_column_names for constant-time duplicate checks.
            used_column_names = set()

            for i, column_name in enumerate(column_names):
                if column_name is None:
                    new_column_name = utils.letter_name(i)
                    warnings.warn(
                        'Column name not specified. "%s" will be used as name.'
                        % new_column_name,
                        RuntimeWarning,
                        stacklevel=2)
                elif isinstance(column_name, six.string_types):
                    new_column_name = column_name
                else:
                    raise ValueError('Column names must be strings or None.')

                # Append _2, _3, ... until the name is unique.
                final_column_name = new_column_name
                duplicates = 0
                while final_column_name in used_column_names:
                    final_column_name = new_column_name + '_' + str(
                        duplicates + 2)
                    duplicates += 1

                if duplicates > 0:
                    warn_duplicate_column(new_column_name, final_column_name)

                final_column_names.append(final_column_name)
                used_column_names.add(final_column_name)

            self._column_names = tuple(final_column_names)
        elif rows:
            self._column_names = tuple(
                utils.letter_name(i) for i in range(len(rows[0])))
            warnings.warn(
                'Column names not specified. "%s" will be used as names.' %
                str(self._column_names),
                RuntimeWarning,
                stacklevel=2)
        else:
            # Keep the same type as the other branches.
            self._column_names = tuple()

        len_column_names = len(self._column_names)

        # Validate column_types
        if column_types is None:
            column_types = TypeTester()

        elif isinstance(column_types, dict):
            for v in six.itervalues(column_types):
                if not isinstance(v, DataType):
                    raise ValueError(
                        'Column types must be instances of DataType.')
            column_types = TypeTester(force=column_types)

        elif not isinstance(column_types, TypeTester):
            for column_type in column_types:
                if not isinstance(column_type, DataType):
                    raise ValueError(
                        'Column types must be instances of DataType.')

        if isinstance(column_types, TypeTester):
            self._column_types = column_types.run(rows, self._column_names)
        else:
            self._column_types = tuple(column_types)

        if len_column_names != len(self._column_types):
            raise ValueError(
                'column_names and column_types must be the same length.')

        if not _is_fork:
            new_rows = []
            cast_funcs = [c.cast for c in self._column_types]

            for i, row in enumerate(rows):
                len_row = len(row)

                if len_row > len_column_names:
                    raise ValueError(
                        'Row %i has %i values, but Table only has %i columns.'
                        % (i, len_row, len_column_names))
                elif len_row < len_column_names:
                    # Short rows are padded with None up to the table width.
                    row = chain(row, [None] * (len_column_names - len_row))

                new_rows.append(
                    Row(tuple(cast_funcs[j](d) for j, d in enumerate(row)),
                        self._column_names))
        else:
            new_rows = rows

        if row_names:
            computed_row_names = []

            if isinstance(row_names, six.string_types):
                for row in new_rows:
                    name = row[row_names]
                    computed_row_names.append(name)
            elif hasattr(row_names, '__call__'):
                for row in new_rows:
                    name = row_names(row)
                    computed_row_names.append(name)
            elif utils.issequence(row_names):
                computed_row_names = row_names
            else:
                raise ValueError(
                    'row_names must be a column name, function or sequence')

            self._row_names = tuple(computed_row_names)
        else:
            self._row_names = None

        self._rows = MappedSequence(new_rows, self._row_names)

        # Build columns
        new_columns = []

        for i, (name, data_type) in enumerate(
                zip(self._column_names, self._column_types)):
            column = Column(i,
                            name,
                            data_type,
                            self._rows,
                            row_names=self._row_names)
            new_columns.append(column)

        self._columns = MappedSequence(new_columns, self._column_names)