Exemplo n.º 1
0
 def element_strategies(self):
     """Return the flattened, de-duplicated list of usable branches.

     On the first call every strategy in ``self.original_strategies`` is
     validated with ``check_strategy`` and the non-empty branches of the
     non-empty strategies are collected. This strategy itself and repeated
     branches are dropped, keeping first-seen order. The resulting list and
     a tuple of per-branch labels are stored on the instance, and later
     calls return the stored list.
     """
     from hypothesis.strategies import check_strategy
     if self.__element_strategies is not None:
         return self.__element_strategies

     # Phase 1: validate each original strategy and gather its live branches.
     candidates = []
     for source in self.original_strategies:
         check_strategy(source)
         if source.is_empty:
             continue
         candidates.extend(
             [branch for branch in source.branches if not branch.is_empty])

     # Phase 2: remove self-references and duplicates, preserving order.
     distinct = []
     already_seen = set()
     for candidate in candidates:
         if candidate is self or candidate in already_seen:
             continue
         already_seen.add(candidate)
         distinct.append(candidate)

     # Each branch label mixes this strategy's label with the branch's own
     # label and the branch position, masked with LABEL_MASK.
     shift = bit_length(len(distinct))
     labels = tuple(
         (((self.label ^ branch.label) << shift) + position) & LABEL_MASK
         for position, branch in enumerate(distinct))

     self.__element_strategies = distinct
     self.__branch_labels = labels
     return self.__element_strategies
Exemplo n.º 2
0
def fill_for(elements, unique, fill, name=''):
    """Work out which fill strategy should be used.

    * If ``fill`` is given it is validated with ``st.check_strategy``
      (reported as ``'<name>.fill'`` when ``name`` is non-empty, otherwise
      ``'fill'``) and returned unchanged.
    * If ``fill`` is None and ``unique`` is set, or ``elements`` does not
      have reusable values, ``st.nothing()`` is returned.
    * Otherwise ``elements`` itself is returned as the fill.
    """
    if fill is not None:
        if name:
            label = '%s.fill' % (name, )
        else:
            label = 'fill'
        st.check_strategy(fill, label)
        return fill

    if unique or not elements.has_reusable_values:
        return st.nothing()
    return elements
Exemplo n.º 3
0
def fill_for(elements, unique, fill, name=''):
    """Resolve the fill strategy for an array-like strategy.

    An explicit ``fill`` is checked with ``st.check_strategy`` (labelled
    ``'<name>.fill'`` if ``name`` is non-empty, else ``'fill'``) and handed
    back as is. With ``fill=None`` the result is ``elements`` when values
    may be reused (``unique`` is false and ``elements.has_reusable_values``
    is true), and ``st.nothing()`` otherwise.
    """
    if fill is not None:
        st.check_strategy(fill, 'fill' if not name else '%s.fill' % (name,))
        return fill

    # De Morgan form of "unique or not elements.has_reusable_values".
    can_reuse = not unique and elements.has_reusable_values
    return elements if can_reuse else st.nothing()
Exemplo n.º 4
0
def elements_and_dtype(elements, dtype, source=None):
    """Validate and normalise an ``(elements, dtype)`` pair.

    Arguments:

    * elements: a strategy for individual values, or None to derive one
      from ``dtype`` via ``npst.from_dtype``.
    * dtype: anything ``st.try_convert(np.dtype, ...)`` accepts, or None.
    * source: optional label; when given, ``'<source>.'`` is prepended to
      the argument names used in error messages.

    Returns a tuple ``(elements, dtype)`` with the converted dtype. When
    both an elements strategy and a dtype are supplied, the returned
    strategy maps each drawn value through a one-element numpy array of
    that dtype.

    Raises InvalidArgument if both ``elements`` and ``dtype`` are None, or
    if ``dtype`` is categorical. A drawn value that cannot be converted to
    ``dtype`` raises InvalidArgument at draw time.
    """
    prefix = '' if source is None else '%s.' % (source,)

    if elements is None:
        with check('dtype is not None'):
            if dtype is None:
                raise InvalidArgument((
                    'At least one of %(prefix)selements or %(prefix)sdtype '
                    'must be provided.') % {'prefix': prefix})
    else:
        st.check_strategy(elements, '%selements' % (prefix,))

    with check('is_categorical_dtype'):
        if is_categorical_dtype(dtype):
            raise InvalidArgument(
                '%sdtype is categorical, which is currently unsupported' % (
                    prefix,
                ))

    dtype = st.try_convert(np.dtype, dtype, 'dtype')

    if elements is None:
        elements = npst.from_dtype(dtype)
    elif dtype is not None:
        # The label only depends on the prefix, so build it once up front.
        drawn_label = 'draw(%selements)' % (prefix,)

        def convert_element(raw):
            # Round-trip through a length-1 array so numpy does the coercion.
            try:
                return np.array([raw], dtype=dtype)[0]
            except TypeError:
                raise InvalidArgument(
                    'Cannot convert %s=%r of type %s to dtype %s' % (
                        drawn_label, raw, type(raw).__name__, dtype.str
                    )
                )
            except ValueError:
                raise InvalidArgument(
                    'Cannot convert %s=%r to type %s' % (
                        drawn_label, raw, dtype.str,
                    )
                )

        elements = elements.map(convert_element)

    assert elements is not None
    return elements, dtype
Exemplo n.º 5
0
def elements_and_dtype(elements, dtype, source=None):
    """Check an elements strategy and a dtype and make them agree.

    Arguments:

    * elements: strategy producing single values, or None, in which case
      ``npst.from_dtype(dtype)`` is used.
    * dtype: a value convertible with ``st.try_convert(np.dtype, ...)``, or
      None.
    * source: optional name of the owner of these arguments; it is used as
      a ``'<source>.'`` prefix in error messages.

    Returns ``(elements, dtype)``. If both were provided, values drawn from
    the returned strategy are coerced to ``dtype`` through a one-element
    numpy array.

    Raises InvalidArgument when neither argument is provided or when the
    dtype is categorical. Conversion failures of drawn values are reported
    as InvalidArgument when the value is drawn.
    """
    if source is not None:
        prefix = '%s.' % (source,)
    else:
        prefix = ''

    if elements is None:
        with check('dtype is not None'):
            if dtype is None:
                raise InvalidArgument((
                    'At least one of %(prefix)selements or %(prefix)sdtype '
                    'must be provided.') % {'prefix': prefix})
    else:
        st.check_strategy(elements, '%selements' % (prefix,))

    with check('is_categorical_dtype'):
        if is_categorical_dtype(dtype):
            raise InvalidArgument(
                '%sdtype is categorical, which is currently unsupported' % (
                    prefix,
                ))

    dtype = st.try_convert(np.dtype, dtype, 'dtype')

    if elements is None:
        elements = npst.from_dtype(dtype)
    elif dtype is not None:
        # Name of the drawn value as it appears in error messages.
        value_name = 'draw(%selements)' % (prefix,)

        def convert_element(item):
            # Let numpy coerce the value by placing it in a length-1 array.
            try:
                return np.array([item], dtype=dtype)[0]
            except TypeError:
                raise InvalidArgument(
                    'Cannot convert %s=%r of type %s to dtype %s' % (
                        value_name, item, type(item).__name__, dtype.str
                    )
                )
            except ValueError:
                raise InvalidArgument(
                    'Cannot convert %s=%r to type %s' % (
                        value_name, item, dtype.str,
                    )
                )

        elements = elements.map(convert_element)

    assert elements is not None
    return elements, dtype
Exemplo n.º 6
0
 def element_strategies(self):
     """Return the flattened, de-duplicated list of usable branches.

     The list is computed on first use and stored on the instance. Each
     strategy in ``self.original_strategies`` is validated with
     ``check_strategy``; the non-empty branches of the non-empty strategies
     are collected, and this strategy itself plus any repeated branches are
     dropped while keeping first-seen order.
     """
     from hypothesis.strategies import check_strategy
     if self.__element_strategies is not None:
         return self.__element_strategies

     # Phase 1: validate each original strategy and gather its live branches.
     candidates = []
     for source in self.original_strategies:
         check_strategy(source)
         if source.is_empty:
             continue
         candidates.extend(
             [branch for branch in source.branches if not branch.is_empty])

     # Phase 2: remove self-references and duplicates, preserving order.
     distinct = []
     already_seen = set()
     for candidate in candidates:
         if candidate is self or candidate in already_seen:
             continue
         already_seen.add(candidate)
         distinct.append(candidate)

     self.__element_strategies = distinct
     return self.__element_strategies
Exemplo n.º 7
0
 def element_strategies(self):
     """Lazily build and return the list of distinct, non-empty branches.

     Every entry of ``self.original_strategies`` is passed to
     ``check_strategy``. Branches of non-empty entries that are themselves
     non-empty are kept, except for this strategy itself and branches that
     were already kept earlier. The result is stored on the instance and
     returned directly by subsequent calls.
     """
     from hypothesis.strategies import check_strategy
     if self.__element_strategies is not None:
         return self.__element_strategies

     # Collect the live branches of every validated original strategy.
     gathered = []
     for original in self.original_strategies:
         check_strategy(original)
         if original.is_empty:
             continue
         gathered.extend(
             [option for option in original.branches if not option.is_empty])

     # Keep the first occurrence of each branch and skip self-references.
     kept = []
     known = set()
     for option in gathered:
         if option is self or option in known:
             continue
         known.add(option)
         kept.append(option)

     self.__element_strategies = kept
     return self.__element_strategies
Exemplo n.º 8
0
def data_frames(columns=None, rows=None, index=None):
    """Provides a strategy for producing a :class:`pandas.DataFrame`.

    Arguments:

    * columns: An iterable of :class:`column` objects describing the shape
      of the generated DataFrame.

    * rows: A strategy for generating a row object. Should generate
      either dicts mapping column names to values or a sequence mapping
      column position to the value in that position (note that unlike the
      :class:`pandas.DataFrame` constructor, single values are not allowed
      here. Passing e.g. an integer is an error, even if there is only one
      column).

      At least one of rows and columns must be provided. If both are
      provided then the generated rows will be validated against the
      columns and an error will be raised if they don't match.

      Caveats on using rows:

      * In general you should prefer using columns to rows, and only use
        rows if the columns interface is insufficiently flexible to
        describe what you need - you will get better performance and
        example quality that way.
      * If you provide rows and not columns, then the shape and dtype of
        the resulting DataFrame may vary. e.g. if you have a mix of int
        and float in the values for one column in your row entries, the
        column will sometimes have an integral dtype and sometimes a float.

    * index: If not None, a strategy for generating indexes for the
      resulting DataFrame. This can generate either :class:`pandas.Index`
      objects or any sequence of values (which will be passed to the
      Index constructor).

      You will probably find it most convenient to use the
      :func:`~hypothesis.extra.pandas.indexes` or
      :func:`~hypothesis.extra.pandas.range_indexes` function to produce
      values for this argument.

    Usage:

    The expected usage pattern is that you use :class:`column` and
    :func:`columns` to specify a fixed shape of the DataFrame you want as
    follows. For example the following gives a two column data frame:

    .. code-block:: pycon

        >>> from hypothesis.extra.pandas import column, data_frames
        >>> data_frames([
        ... column('A', dtype=int), column('B', dtype=float)]).example()
                    A              B
        0  2021915903  1.793898e+232
        1  1146643993            inf
        2 -2096165693   1.000000e+07

    If you want the values in different columns to interact in some way you
    can use the rows argument. For example the following gives a two column
    DataFrame where the value in the first column is always at most the value
    in the second:

    .. code-block:: pycon

        >>> from hypothesis.extra.pandas import column, data_frames
        >>> import hypothesis.strategies as st
        >>> data_frames(
        ...     rows=st.tuples(st.floats(allow_nan=False),
        ...                    st.floats(allow_nan=False)).map(sorted)
        ... ).example()
                       0             1
        0  -3.402823e+38  9.007199e+15
        1 -1.562796e-298  5.000000e-01

    You can also combine the two:

    .. code-block:: pycon

        >>> from hypothesis.extra.pandas import columns, data_frames
        >>> import hypothesis.strategies as st
        >>> data_frames(
        ...     columns=columns(["lo", "hi"], dtype=float),
        ...     rows=st.tuples(st.floats(allow_nan=False),
        ...                    st.floats(allow_nan=False)).map(sorted)
        ... ).example()
                 lo            hi
        0   9.314723e-49  4.353037e+45
        1  -9.999900e-01  1.000000e+07
        2 -2.152861e+134 -1.069317e-73

    (Note that the column dtype must still be specified and will not be
    inferred from the rows. This restriction may be lifted in future).

    Combining rows and columns has the following behaviour:

    * The column names and dtypes will be used.
    * If the column is required to be unique, this will be enforced.
    * Any values missing from the generated rows will be provided using the
      column's fill.
    * Any values in the row not present in the column specification (if
      dicts are passed, if there are keys with no corresponding column name,
      if sequences are passed if there are too many items) will result in
      InvalidArgument being raised.

    """

    if index is None:
        index = range_indexes()
    else:
        st.check_strategy(index)

    # Keep the strategy under its own name: the composite functions below
    # rebind ``index`` to the value drawn from it.
    index_strategy = index

    if columns is None:
        if rows is None:
            raise InvalidArgument(
                'At least one of rows and columns must be provided')
        else:

            @st.composite
            def rows_only(draw):
                index = draw(index_strategy)

                @check_function
                def row():
                    result = draw(rows)
                    st.check_type(Iterable, result, 'draw(row)')
                    return result

                if len(index) > 0:
                    return pandas.DataFrame([row() for _ in index],
                                            index=index)
                else:
                    # If we haven't drawn any rows we need to draw one row and
                    # then discard it so that we get a consistent shape for the
                    # DataFrame.
                    base = pandas.DataFrame([row()])
                    return base.drop(0)

            return rows_only()

    assert columns is not None
    columns = st.try_convert(tuple, columns, 'columns')

    # Validated copies of the caller's columns, with name, elements, dtype
    # and fill all resolved.
    rewritten_columns = []
    column_names = set()

    for i, c in enumerate(columns):
        st.check_type(column, c, 'columns[%d]' % (i, ))

        # Work on a copy so the caller's column object is never mutated.
        c = copy(c)
        if c.name is None:
            label = 'columns[%d]' % (i, )
            c.name = i
        else:
            label = c.name
            try:
                hash(c.name)
            except TypeError:
                raise InvalidArgument(
                    'Column names must be hashable, but columns[%d].name was '
                    '%r of type %s, which cannot be hashed.' % (
                        i,
                        c.name,
                        type(c.name).__name__,
                    ))

        if c.name in column_names:
            raise InvalidArgument('duplicate definition of column name %r' %
                                  (c.name, ))

        column_names.add(c.name)

        c.elements, c.dtype = elements_and_dtype(c.elements, c.dtype, label)

        if c.dtype is None and rows is not None:
            raise InvalidArgument(
                'Must specify a dtype for all columns when combining rows with'
                ' columns.')

        c.fill = npst.fill_for(fill=c.fill,
                               elements=c.elements,
                               unique=c.unique,
                               name=label)

        rewritten_columns.append(c)

    if rows is None:

        @st.composite
        def just_draw_columns(draw):
            index = draw(index_strategy)
            local_index_strategy = st.just(index)

            data = OrderedDict((c.name, None) for c in rewritten_columns)

            # Depending on how the columns are going to be generated we group
            # them differently to get better shrinking. For columns with fill
            # enabled, the elements can be shrunk independently of the size,
            # so we can just shrink by shrinking the index then shrinking the
            # length and are generally much more free to move data around.

            # For columns with no filling the problem is harder, and drawing
            # them like that would result in rows being very far apart from
            # each other in the underlying data stream, which gets in the way
            # of shrinking. So what we do is reorder and draw those columns
            # row wise, so that the values of each row are next to each other.
            # This makes life easier for the shrinker when deleting blocks of
            # data.
            columns_without_fill = [
                c for c in rewritten_columns if c.fill.is_empty
            ]

            if columns_without_fill:
                for c in columns_without_fill:
                    data[c.name] = pandas.Series(
                        np.zeros(shape=len(index), dtype=c.dtype),
                        index=index,
                    )
                seen = {
                    c.name: set()
                    for c in columns_without_fill if c.unique
                }

                for i in hrange(len(index)):
                    for c in columns_without_fill:
                        if c.unique:
                            # Retry a few times to find an unseen value
                            # before giving up on this example.
                            for _ in range(5):
                                value = draw(c.elements)
                                if value not in seen[c.name]:
                                    seen[c.name].add(value)
                                    break
                            else:
                                reject()
                        else:
                            value = draw(c.elements)
                        # ``i`` is a row position, not an index label, so
                        # assign positionally: plain ``series[i] = ...`` is
                        # label-based when the drawn index holds integers
                        # and would write to the wrong row (or add a new
                        # one) unless the labels happen to be 0..n-1.
                        data[c.name].iloc[i] = value

            for c in rewritten_columns:
                if not c.fill.is_empty:
                    data[c.name] = draw(
                        series(index=local_index_strategy,
                               dtype=c.dtype,
                               elements=c.elements,
                               fill=c.fill,
                               unique=c.unique))

            return pandas.DataFrame(data, index=index)

        return just_draw_columns()
    else:

        @st.composite
        def assign_rows(draw):
            index = draw(index_strategy)

            result = pandas.DataFrame(OrderedDict(
                (c.name,
                 pandas.Series(np.zeros(dtype=c.dtype, shape=len(index)),
                               dtype=c.dtype)) for c in rewritten_columns),
                                      index=index)

            # Fill values drawn so far for dict rows, keyed by column
            # position; each is drawn at most once and then reused.
            fills = {}

            any_unique = any(c.unique for c in rewritten_columns)

            if any_unique:
                # One "seen" set per unique column (None for the others),
                # trimmed so the list ends at the last unique column.
                all_seen = [
                    set() if c.unique else None for c in rewritten_columns
                ]
                while all_seen[-1] is None:
                    all_seen.pop()

            for row_index in hrange(len(index)):
                # Up to five attempts per row to satisfy uniqueness.
                for _ in hrange(5):
                    original_row = draw(rows)
                    row = original_row
                    if isinstance(row, dict):
                        as_list = [None] * len(rewritten_columns)
                        for i, c in enumerate(rewritten_columns):
                            try:
                                as_list[i] = row[c.name]
                            except KeyError:
                                try:
                                    as_list[i] = fills[i]
                                except KeyError:
                                    fills[i] = draw(c.fill)
                                    as_list[i] = fills[i]
                        for k in row:
                            if k not in column_names:
                                raise InvalidArgument(
                                    ('Row %r contains column %r not in '
                                     'columns %r)' %
                                     (row, k,
                                      [c.name for c in rewritten_columns])))
                        row = as_list
                    if any_unique:
                        has_duplicate = False
                        for seen, value in zip(all_seen, row):
                            if seen is None:
                                continue
                            if value in seen:
                                has_duplicate = True
                                break
                            seen.add(value)
                        if has_duplicate:
                            continue
                    row = list(st.try_convert(tuple, row, 'draw(rows)'))

                    if len(row) > len(rewritten_columns):
                        raise InvalidArgument(
                            ('Row %r contains too many entries. Has %d but '
                             'expected at most %d') %
                            (original_row, len(row), len(rewritten_columns)))
                    # Pad short sequence rows with fill values for the
                    # trailing columns.
                    while len(row) < len(rewritten_columns):
                        row.append(draw(rewritten_columns[len(row)].fill))
                    result.iloc[row_index] = row
                    break
                else:
                    reject()
            return result

        return assign_rows()
Exemplo n.º 9
0
def series(elements=None, dtype=None, index=None, fill=None, unique=False):
    """Provides a strategy for producing a :class:`pandas.Series`.

    Arguments:

    * elements: the strategy used to draw each individual value. When it
      is None, a default is derived from the dtype.

    * dtype: the dtype of the generated series; anything accepted by
      :class:`numpy.dtype` may be given. When it is None the values are
      generated as Python objects and pandas infers the dtype from them, so
      a strategy producing values of varying type gives series of varying
      dtype.

    * index: a strategy for the index of the generated Series, producing
      either :class:`pandas.Index` objects or any sequence of values (which
      will be passed to the Index constructor). Defaults to
      :func:`~hypothesis.extra.pandas.range_indexes`;
      :func:`~hypothesis.extra.pandas.indexes` is the other convenient way
      to build one.

    * fill, unique: forwarded unchanged to the underlying array strategy
      that generates the values.

    Usage:

    .. code-block:: pycon

        >>> series(dtype=int).example()
        0   -2001747478
        1    1153062837

    """
    if index is None:
        index = range_indexes()
    else:
        st.check_strategy(index)

    elements, dtype = elements_and_dtype(elements, dtype)
    # The composite below reuses the name ``index`` for the drawn value.
    index_strategy = index

    @st.composite
    def result(draw):
        drawn_index = draw(index_strategy)

        if len(drawn_index) == 0:
            # Nothing to generate; with no explicit dtype one is drawn so
            # that the empty series still has a dtype.
            if dtype is not None:
                empty_dtype = dtype
            else:
                empty_dtype = draw(dtype_for_elements_strategy(elements))
            return pandas.Series((), index=drawn_index, dtype=empty_dtype)

        values = draw(
            npst.arrays(
                dtype=dtype if dtype is not None else object,
                elements=elements,
                shape=len(drawn_index),
                fill=fill,
                unique=unique,
            ))
        if dtype is None:
            # Hand pandas a plain list so that it infers the dtype itself.
            values = list(values)

        return pandas.Series(values, index=drawn_index, dtype=dtype)

    return result()
Exemplo n.º 10
0
def arrays(draw, dtype, shape, elements=None, fill=None, unique=False):
    """Returns a strategy for generating :class:`numpy's
    ndarrays<numpy.ndarray>`.

    Arguments:

    * ``dtype``: anything accepted by :class:`numpy.dtype <numpy.dtype>`
      (``dtype`` objects included), or a strategy producing such values.
    * ``shape``: an integer >= 0, a tuple (possibly empty) of such integers,
      or a strategy producing either.
    * ``elements``: the strategy used for individual entries. When None, one
      is derived from the dtype and may produce any legal value for it
      (e.g. ``NaN`` for floats); pass your own strategy if you need
      something more specific.
    * ``fill``: a strategy for a single background value for the array.
      None means it is inferred from the other arguments (see below);
      :func:`st.nothing() <hypothesis.strategies.nothing>` switches filling
      off entirely so every entry is generated independently.
    * ``unique``: whether all entries must be distinct from one another.
      Multiple NaN values may still occur. If a fill is given together with
      ``unique``, it may only produce NaN values, i.e. anything for which
      :func:`numpy.isnan <numpy.isnan>` returns True (so for complex numbers
      ``nan+1j`` is a valid fill too). Values must be hashable when
      ``unique`` is True.

    For an empty shape and a dtype other than object, a single value drawn
    from ``elements`` is returned directly.

    A dtype and a shape are enough to get arrays:

    .. code-block:: pycon

      >>> import numpy as np
      >>> arrays(np.int8, (2, 3)).example()
      array([[-8,  6,  3],
             [-6,  4,  6]], dtype=int8)

    - see :doc:`What you can generate and how <data>`.

    An explicit elements strategy restricts the values:

    .. code-block:: pycon

      >>> import numpy as np
      >>> from hypothesis.strategies import floats
      >>> arrays(np.float, 3, elements=floats(0, 1)).example()
      array([ 0.88974794,  0.77387938,  0.1977879 ])

    Values are produced in two steps:

    1. A subset of the array's coordinates is populated with values drawn
       from the elements strategy (or its inferred form).
    2. If any coordinates are still unassigned, one value is drawn from the
       fill strategy and written to all of them.

    Pass :func:`~hypothesis.strategies.nothing` as the fill to skip the
    second step and draw a value for every element.

    With ``fill=None`` the behaviour is inferred: there is no fill when
    ``unique`` is True. Otherwise, when it looks safe to reuse values of
    elements across several coordinates (true for any inferred strategy and
    for most builtins, but not for mutable values or strategies built with
    flatmap, map, composite, etc) the elements strategy doubles as the fill;
    if not, there is no fill.

    A fill helps Hypothesis produce high quality examples, but it matters
    most for large arrays: Hypothesis is primarily designed around testing
    small examples, and for arrays with hundreds or more elements a fill
    value is essential if you want your tests to run in reasonable time.

    """
    # Resolve the dtype first, drawing it if a strategy was supplied.
    dtype = np.dtype(draw(dtype) if isinstance(dtype, SearchStrategy) else dtype)
    if elements is None:
        elements = from_dtype(dtype)

    # Normalise the shape to a tuple, drawing it first if necessary.
    if isinstance(shape, SearchStrategy):
        shape = draw(shape)
    shape = tuple((shape,) if isinstance(shape, int) else shape)

    if not shape and dtype.kind != u'O':
        return draw(elements)

    if fill is not None:
        st.check_strategy(fill, 'fill')
    elif unique or not elements.has_reusable_values:
        fill = st.nothing()
    else:
        fill = elements

    strategy = ArrayStrategy(elements, shape, dtype, fill, unique)
    return draw(strategy)
Exemplo n.º 11
0
def data_frames(
    columns=None, rows=None, index=None
):
    """Provides a strategy for producing a :class:`pandas.DataFrame`.

    Arguments:

    * columns: An iterable of :class:`column` objects describing the shape
      of the generated DataFrame.

    * rows: A strategy for generating a row object. Should generate
      either dicts mapping column names to values or a sequence mapping
      column position to the value in that position (note that unlike the
      :class:`pandas.DataFrame` constructor, single values are not allowed
      here. Passing e.g. an integer is an error, even if there is only one
      column).

      At least one of rows and columns must be provided. If both are
      provided then the generated rows will be validated against the
      columns and an error will be raised if they don't match.

      Caveats on using rows:

      * In general you should prefer using columns to rows, and only use
        rows if the columns interface is insufficiently flexible to
        describe what you need - you will get better performance and
        example quality that way.
      * If you provide rows and not columns, then the shape and dtype of
        the resulting DataFrame may vary. e.g. if you have a mix of int
        and float in the values for one column in your row entries, the
        column will sometimes have an integral dtype and sometimes a float.

    * index: If not None, a strategy for generating indexes for the
      resulting DataFrame. This can generate either :class:`pandas.Index`
      objects or any sequence of values (which will be passed to the
      Index constructor).

      You will probably find it most convenient to use the
      :func:`~hypothesis.extra.pandas.indexes` or
      :func:`~hypothesis.extra.pandas.range_indexes` function to produce
      values for this argument.

    Usage:

    The expected usage pattern is that you use :class:`column` and
    :func:`columns` to specify a fixed shape of the DataFrame you want as
    follows. For example the following gives a two column data frame:

    .. code-block:: pycon

        >>> from hypothesis.extra.pandas import column, data_frames
        >>> data_frames([
        ... column('A', dtype=int), column('B', dtype=float)]).example()
                    A              B
        0  2021915903  1.793898e+232
        1  1146643993            inf
        2 -2096165693   1.000000e+07

    If you want the values in different columns to interact in some way you
    can use the rows argument. For example the following gives a two column
    DataFrame where the value in the first column is always at most the value
    in the second:

    .. code-block:: pycon

        >>> from hypothesis.extra.pandas import column, data_frames
        >>> import hypothesis.strategies as st
        >>> data_frames(
        ...     rows=st.tuples(st.floats(allow_nan=False),
        ...                    st.floats(allow_nan=False)).map(sorted)
        ... ).example()
                       0             1
        0  -3.402823e+38  9.007199e+15
        1 -1.562796e-298  5.000000e-01

    You can also combine the two:

    .. code-block:: pycon

        >>> from hypothesis.extra.pandas import columns, data_frames
        >>> import hypothesis.strategies as st
        >>> data_frames(
        ...     columns=columns(["lo", "hi"], dtype=float),
        ...     rows=st.tuples(st.floats(allow_nan=False),
        ...                    st.floats(allow_nan=False)).map(sorted)
        ... ).example()
                 lo            hi
        0   9.314723e-49  4.353037e+45
        1  -9.999900e-01  1.000000e+07
        2 -2.152861e+134 -1.069317e-73

    (Note that the column dtype must still be specified and will not be
    inferred from the rows. This restriction may be lifted in future).

    Combining rows and columns has the following behaviour:

    * The column names and dtypes will be used.
    * If the column is required to be unique, this will be enforced.
    * Any values missing from the generated rows will be provided using the
      column's fill.
    * Any values in the row not present in the column specification (if
      dicts are passed, if there are keys with no corresponding column name,
      if sequences are passed if there are too many items) will result in
      InvalidArgument being raised.

    """

    if index is None:
        index = range_indexes()
    else:
        st.check_strategy(index)

    index_strategy = index

    if columns is None:
        if rows is None:
            raise InvalidArgument(
                'At least one of rows and columns must be provided'
            )
        else:
            @st.composite
            def rows_only(draw):
                index = draw(index_strategy)

                @check_function
                def row():
                    result = draw(rows)
                    st.check_type(Iterable, result, 'draw(row)')
                    return result

                if len(index) > 0:
                    return pandas.DataFrame(
                        [row() for _ in index],
                        index=index
                    )
                else:
                    # If we haven't drawn any rows we need to draw one row and
                    # then discard it so that we get a consistent shape for the
                    # DataFrame.
                    base = pandas.DataFrame([row()])
                    return base.drop(0)
            return rows_only()

    assert columns is not None
    columns = st.try_convert(tuple, columns, 'columns')

    rewritten_columns = []
    column_names = set()

    for i, c in enumerate(columns):
        st.check_type(column, c, 'columns[%d]' % (i,))

        c = copy(c)
        if c.name is None:
            label = 'columns[%d]' % (i,)
            c.name = i
        else:
            label = c.name
            try:
                hash(c.name)
            except TypeError:
                raise InvalidArgument(
                    'Column names must be hashable, but columns[%d].name was '
                    '%r of type %s, which cannot be hashed.' % (
                        i, c.name, type(c.name).__name__,))

        if c.name in column_names:
            raise InvalidArgument(
                'duplicate definition of column name %r' % (c.name,))

        column_names.add(c.name)

        c.elements, c.dtype = elements_and_dtype(
            c.elements, c.dtype, label
        )

        if c.dtype is None and rows is not None:
            raise InvalidArgument(
                'Must specify a dtype for all columns when combining rows with'
                ' columns.'
            )

        c.fill = npst.fill_for(
            fill=c.fill, elements=c.elements, unique=c.unique,
            name=label
        )

        rewritten_columns.append(c)

    if rows is None:
        @st.composite
        def just_draw_columns(draw):
            index = draw(index_strategy)
            local_index_strategy = st.just(index)

            data = OrderedDict((c.name, None) for c in rewritten_columns)

            # Depending on how the columns are going to be generated we group
            # them differently to get better shrinking. For columns with fill
            # enabled, the elements can be shrunk independently of the size,
            # so we can just shrink by shrinking the index then shrinking the
            # length and are generally much more free to move data around.

            # For columns with no filling the problem is harder, and drawing
            # them like that would result in rows being very far apart from
            # each other in the underlying data stream, which gets in the way
            # of shrinking. So what we do is reorder and draw those columns
            # row wise, so that the values of each row are next to each other.
            # This makes life easier for the shrinker when deleting blocks of
            # data.
            columns_without_fill = [
                c for c in rewritten_columns if c.fill.is_empty]

            if columns_without_fill:
                for c in columns_without_fill:
                    data[c.name] = pandas.Series(
                        np.zeros(shape=len(index), dtype=c.dtype),
                        index=index,
                    )
                seen = {
                    c.name: set() for c in columns_without_fill if c.unique}

                for i in hrange(len(index)):
                    for c in columns_without_fill:
                        if c.unique:
                            for _ in range(5):
                                value = draw(c.elements)
                                if value not in seen[c.name]:
                                    seen[c.name].add(value)
                                    break
                            else:
                                reject()
                        else:
                            value = draw(c.elements)
                        data[c.name][i] = value

            for c in rewritten_columns:
                if not c.fill.is_empty:
                    data[c.name] = draw(series(
                        index=local_index_strategy, dtype=c.dtype,
                        elements=c.elements, fill=c.fill, unique=c.unique))

            return pandas.DataFrame(data, index=index)
        return just_draw_columns()
    else:
        @st.composite
        def assign_rows(draw):
            index = draw(index_strategy)

            result = pandas.DataFrame(OrderedDict(
                (c.name, pandas.Series(
                    np.zeros(dtype=c.dtype, shape=len(index)), dtype=c.dtype))
                for c in rewritten_columns
            ), index=index)

            fills = {}

            any_unique = any(c.unique for c in rewritten_columns)

            if any_unique:
                all_seen = [
                    set() if c.unique else None for c in rewritten_columns]
                while all_seen[-1] is None:
                    all_seen.pop()

            for row_index in hrange(len(index)):
                for _ in hrange(5):
                    original_row = draw(rows)
                    row = original_row
                    if isinstance(row, dict):
                        as_list = [None] * len(rewritten_columns)
                        for i, c in enumerate(rewritten_columns):
                            try:
                                as_list[i] = row[c.name]
                            except KeyError:
                                try:
                                    as_list[i] = fills[i]
                                except KeyError:
                                    fills[i] = draw(c.fill)
                                    as_list[i] = fills[i]
                        for k in row:
                            if k not in column_names:
                                raise InvalidArgument((
                                    'Row %r contains column %r not in '
                                    'columns %r)' % (
                                        row, k, [
                                            c.name for c in rewritten_columns
                                        ])))
                        row = as_list
                    if any_unique:
                        has_duplicate = False
                        for seen, value in zip(all_seen, row):
                            if seen is None:
                                continue
                            if value in seen:
                                has_duplicate = True
                                break
                            seen.add(value)
                        if has_duplicate:
                            continue
                    row = list(st.try_convert(tuple, row, 'draw(rows)'))

                    if len(row) > len(rewritten_columns):
                        raise InvalidArgument((
                            'Row %r contains too many entries. Has %d but '
                            'expected at most %d') % (
                                original_row, len(row), len(rewritten_columns)
                        ))
                    while len(row) < len(rewritten_columns):
                        row.append(draw(rewritten_columns[len(row)].fill))
                    result.iloc[row_index] = row
                    break
                else:
                    reject()
            return result
        return assign_rows()
Exemplo n.º 12
0
def series(elements=None, dtype=None, index=None, fill=None, unique=False):
    """Build a strategy that generates :class:`pandas.Series` objects.

    Arguments:

    * elements: a strategy used to generate the individual values of the
      series. If None, a suitable default is inferred from the dtype.

    * dtype: the dtype of the resulting series; may be any value that can
      be passed to :class:`numpy.dtype`. If None, pandas's standard
      behaviour is used to infer it from the generated values, so if the
      type of the values produced by the elements strategy varies, the
      dtype of the resulting series varies with it.

    * index: If not None, a strategy for generating indexes for the
      resulting Series. This can generate either :class:`pandas.Index`
      objects or any sequence of values (which will be passed to the
      Index constructor). If None, ``range_indexes()`` is used.

      You will probably find it most convenient to use the
      :func:`~hypothesis.extra.pandas.indexes` or
      :func:`~hypothesis.extra.pandas.range_indexes` function to produce
      values for this argument.

    * fill: passed through unchanged as the ``fill`` argument of the
      ``npst.arrays`` call that generates the values.

    * unique: passed through unchanged as the ``unique`` argument of the
      ``npst.arrays`` call that generates the values.

    Usage:

    .. code-block:: pycon

        >>> series(dtype=int).example()
        0   -2001747478
        1    1153062837

    """
    if index is not None:
        st.check_strategy(index)
        index_strategy = index
    else:
        index_strategy = range_indexes()

    elements, dtype = elements_and_dtype(elements, dtype)

    @st.composite
    def result(draw):
        drawn_index = draw(index_strategy)

        if len(drawn_index) == 0:
            # Nothing to generate: build an empty Series. Without an
            # explicit dtype, one is drawn based on the elements strategy.
            if dtype is not None:
                empty_dtype = dtype
            else:
                empty_dtype = draw(dtype_for_elements_strategy(elements))
            return pandas.Series((), index=drawn_index, dtype=empty_dtype)

        values = draw(npst.arrays(
            dtype=object if dtype is None else dtype,
            elements=elements, shape=len(drawn_index),
            fill=fill, unique=unique,
        ))
        if dtype is None:
            # Values were generated in an object array; hand pandas a plain
            # list so that it infers the dtype from the values themselves.
            values = list(values)

        return pandas.Series(values, index=drawn_index, dtype=dtype)

    return result()
Exemplo n.º 13
0
def arrays(draw, dtype, shape, elements=None, fill=None):
    """Generate a numpy array of the given `dtype` and `shape`.

    `dtype` may be any valid input to ``np.dtype`` (this includes
    ``np.dtype`` objects), or a strategy that generates such values.  `shape`
    may be an integer >= 0, a tuple of length >= 0 of such integers, or a
    strategy that generates such values.

    Arrays of specified `dtype` and `shape` are generated for example
    like this:

    .. code-block:: pycon

      >>> import numpy as np
      >>> arrays(np.int8, (2, 3)).example()
      array([[-8,  6,  3],
             [-6,  4,  6]], dtype=int8)

    If elements is None, Hypothesis infers a strategy based on the dtype,
    which may give any legal value (including eg ``NaN`` for floats).  If you
    have more specific requirements, you can supply your own elements strategy
    - see :doc:`What you can generate and how <data>`.

    .. code-block:: pycon

      >>> import numpy as np
      >>> from hypothesis.strategies import floats
      >>> arrays(float, 3, elements=floats(0, 1)).example()
      array([ 0.88974794,  0.77387938,  0.1977879 ])

    The fill argument provides a 'background noise' value for the array. Array
    values are generated in two parts:

    1. Some subset of the coordinates of the array are populated with a value
       drawn from the elements strategy (or its inferred form).
    2. If any coordinates were not assigned in the previous step, a single
       value is drawn from the fill strategy and is assigned to all remaining
       places.

    You can set fill to :func:`~hypothesis.strategies.nothing` if you want to
    disable this behaviour and draw a value for every element.

    If fill is set to None then it will attempt to infer the correct behaviour
    automatically: If it looks safe to reuse the values of elements across
    multiple coordinates (this will be the case for any inferred strategy, and
    for most of the builtins, but is not the case for mutable values or
    strategies built with flatmap, map, composite, etc) then it will use the
    elements strategy as the fill, else it will default to having no fill.

    Having a fill helps Hypothesis craft high quality examples, but its
    main importance is when the array generated is large: Hypothesis is
    primarily designed around testing small examples. If you have arrays with
    hundreds or more elements, having a fill value is essential if you want
    your tests to run in reasonable time.

    """
    # The dtype may itself be a strategy; resolve it to a concrete np.dtype.
    np_dtype = np.dtype(
        draw(dtype) if isinstance(dtype, SearchStrategy) else dtype)

    if elements is None:
        elements = from_dtype(np_dtype)

    # Likewise resolve the shape, then normalise it to a tuple.
    if isinstance(shape, SearchStrategy):
        shape = draw(shape)
    dims = (shape, ) if isinstance(shape, int) else tuple(shape)

    # An empty shape with a non-object dtype yields a single drawn element
    # rather than an array.
    if not dims and np_dtype.kind != u'O':
        return draw(elements)

    if fill is not None:
        st.check_strategy(fill, 'fill')
    elif elements.has_reusable_values:
        fill = elements
    else:
        fill = st.nothing()

    return draw(ArrayStrategy(elements, dims, np_dtype, fill))