Exemplo n.º 1
0
def new_dataset(expr, missing_values, domain):
    """
    Creates or returns a dataset from a blaze expression.

    Parameters
    ----------
    expr : Expr
        The blaze expression representing the values.
    missing_values : frozenset((name, value) pairs
        Association pairs column name and missing_value for that column.

        This needs to be a frozenset rather than a dict or tuple of tuples
        because we want a collection that's unordered but still hashable.
    domain : zipline.pipeline.domain.Domain
        Domain of the dataset to be created.

    Returns
    -------
    ds : type
        A new dataset type.

    Notes
    -----
    This function is memoized. repeated calls with the same inputs will return
    the same type.
    """
    missing_values = dict(missing_values)
    class_dict = {'ndim': 2 if SID_FIELD_NAME in expr.fields else 1}
    for name, type_ in expr.dshape.measure.fields:
        # Don't generate a column for sid or timestamp, since they're
        # implicitly the labels if the arrays that will be passed to pipeline
        # Terms.
        if name in (SID_FIELD_NAME, TS_FIELD_NAME):
            continue
        type_ = datashape_type_to_numpy(type_)
        if can_represent_dtype(type_):
            col = Column(
                type_,
                missing_values.get(name, NotSpecified),
            )
        else:
            col = NonPipelineField(name, type_)
        class_dict[name] = col

    if 'domain' in class_dict:
        raise ValueError("Got a column named 'domain' in new_dataset(). "
                         "'domain' is reserved.")
    class_dict['domain'] = domain

    name = expr._name
    if name is None:
        name = next(_new_names)

    # unicode is a name error in py3 but the branch is only hit
    # when we are in python 2.
    if PY2 and isinstance(name, unicode):  # pragma: no cover # noqa
        name = name.encode('utf-8')

    return type(name, (DataSet, ), class_dict)
Exemplo n.º 2
0
def new_dataset(expr, missing_values, domain):
    """
    Creates or returns a dataset from a blaze expression.

    Parameters
    ----------
    expr : Expr
        The blaze expression representing the values.
    missing_values : frozenset((name, value) pairs
        Association pairs column name and missing_value for that column.

        This needs to be a frozenset rather than a dict or tuple of tuples
        because we want a collection that's unordered but still hashable.
    domain : zipline.pipeline.domain.Domain
        Domain of the dataset to be created.

    Returns
    -------
    ds : type
        A new dataset type.

    Notes
    -----
    This function is memoized. repeated calls with the same inputs will return
    the same type.
    """
    missing_values = dict(missing_values)
    class_dict = {'ndim': 2 if SID_FIELD_NAME in expr.fields else 1}
    for name, type_ in expr.dshape.measure.fields:
        # Don't generate a column for sid or timestamp, since they're
        # implicitly the labels if the arrays that will be passed to pipeline
        # Terms.
        if name in (SID_FIELD_NAME, TS_FIELD_NAME):
            continue
        type_ = datashape_type_to_numpy(type_)
        if can_represent_dtype(type_):
            col = Column(
                type_,
                missing_values.get(name, NotSpecified),
            )
        else:
            col = NonPipelineField(name, type_)
        class_dict[name] = col

    if 'domain' in class_dict:
        raise ValueError("Got a column named 'domain' in new_dataset(). "
                         "'domain' is reserved.")
    class_dict['domain'] = domain

    name = expr._name
    if name is None:
        name = next(_new_names)

    # unicode is a name error in py3 but the branch is only hit
    # when we are in python 2.
    if PY2 and isinstance(name, unicode):  # pragma: no cover # noqa
        name = name.encode('utf-8')

    return type(name, (DataSet,), class_dict)
Exemplo n.º 3
0
def validate_dtype(termname, dtype, missing_value):
    """
    Validate a `dtype` and `missing_value` passed to Term.__new__.

    Ensures that we know how to represent ``dtype``, and that missing_value
    is specified for types without default missing values.

    Returns
    -------
    validated_dtype, validated_missing_value : np.dtype, any
        The dtype and missing_value to use for the new term.

    Raises
    ------
    DTypeNotSpecified
        When no dtype was passed to the instance, and the class doesn't
        provide a default.
    NotDType
        When either the class or the instance provides a value not
        coercible to a numpy dtype.
    NoDefaultMissingValue
        When dtype requires an explicit missing_value, but
        ``missing_value`` is NotSpecified.
    """
    if dtype is NotSpecified:
        raise DTypeNotSpecified(termname=termname)

    try:
        dtype = dtype_class(dtype)
    except TypeError:
        raise NotDType(dtype=dtype, termname=termname)

    if not can_represent_dtype(dtype):
        raise UnsupportedDType(dtype=dtype, termname=termname)

    if missing_value is NotSpecified:
        missing_value = default_missing_value_for_dtype(dtype)

    try:
        if dtype == categorical_dtype:
            # This check is necessary because we use object dtype for
            # categoricals, and numpy will allow us to promote numerical
            # values to object even though we don't support them.
            _assert_valid_categorical_missing_value(missing_value)

        # For any other type, we can check if the missing_value is safe by
        # making an array of that value and trying to safely convert it to
        # the desired type.
        # 'same_kind' allows casting between things like float32 and
        # float64, but not str and int.
        array([missing_value]).astype(dtype=dtype, casting="same_kind")
    except TypeError as e:
        raise TypeError(
            "Missing value {value!r} is not a valid choice "
            "for term {termname} with dtype {dtype}.\n\n"
            "Coercion attempt failed with: {error}".format(termname=termname, value=missing_value, dtype=dtype, error=e)
        )

    return dtype, missing_value
Exemplo n.º 4
0
def new_dataset(expr, deltas, missing_values):
    """
    Creates or returns a dataset from a pair of blaze expressions.

    Parameters
    ----------
    expr : Expr
        The blaze expression representing the first known values.
    deltas : Expr
        The blaze expression representing the deltas to the data.
    missing_values : frozenset((name, value) pairs
        Association pairs column name and missing_value for that column.

        This needs to be a frozenset rather than a dict or tuple of tuples
        because we want a collection that's unordered but still hashable.

    Returns
    -------
    ds : type
        A new dataset type.

    Notes
    -----
    This function is memoized. repeated calls with the same inputs will return
    the same type.
    """
    missing_values = dict(missing_values)
    columns = {}
    for name, type_ in expr.dshape.measure.fields:
        # Don't generate a column for sid or timestamp, since they're
        # implicitly the labels if the arrays that will be passed to pipeline
        # Terms.
        if name in (SID_FIELD_NAME, TS_FIELD_NAME):
            continue
        type_ = datashape_type_to_numpy(type_)
        if can_represent_dtype(type_):
            col = Column(
                type_,
                missing_values.get(name, NotSpecified),
            )
        else:
            col = NonPipelineField(name, type_)
        columns[name] = col

    name = expr._name
    if name is None:
        name = next(_new_names)

    # unicode is a name error in py3 but the branch is only hit
    # when we are in python 2.
    if PY2 and isinstance(name, unicode):  # noqa
        name = name.encode('utf-8')

    return type(name, (DataSet,), columns)
Exemplo n.º 5
0
def new_dataset(expr, deltas, missing_values):
    """
    Creates or returns a dataset from a pair of blaze expressions.

    Parameters
    ----------
    expr : Expr
        The blaze expression representing the first known values.
    deltas : Expr
        The blaze expression representing the deltas to the data.
    missing_values : frozenset((name, value) pairs
        Association pairs column name and missing_value for that column.

        This needs to be a frozenset rather than a dict or tuple of tuples
        because we want a collection that's unordered but still hashable.

    Returns
    -------
    ds : type
        A new dataset type.

    Notes
    -----
    This function is memoized. repeated calls with the same inputs will return
    the same type.
    """
    missing_values = dict(missing_values)
    columns = {}
    for name, type_ in expr.dshape.measure.fields:
        # Don't generate a column for sid or timestamp, since they're
        # implicitly the labels if the arrays that will be passed to pipeline
        # Terms.
        if name in (SID_FIELD_NAME, TS_FIELD_NAME):
            continue
        type_ = datashape_type_to_numpy(type_)
        if can_represent_dtype(type_):
            col = Column(
                type_,
                missing_values.get(name, NotSpecified),
            )
        else:
            col = NonPipelineField(name, type_)
        columns[name] = col

    name = expr._name
    if name is None:
        name = next(_new_names)

    # unicode is a name error in py3 but the branch is only hit
    # when we are in python 2.
    if PY2 and isinstance(name, unicode):  # noqa
        name = name.encode('utf-8')

    return type(name, (DataSet, ), columns)
Exemplo n.º 6
0
def validate_dtype(termname, dtype, missing_value):
    """
    Validate a `dtype` and `missing_value` passed to Term.__new__.

    Ensures that we know how to represent ``dtype``, and that missing_value
    is specified for types without default missing values.

    Returns
    -------
    validated_dtype, validated_missing_value : np.dtype, any
        The dtype and missing_value to use for the new term.

    Raises
    ------
    DTypeNotSpecified
        When no dtype was passed to the instance, and the class doesn't
        provide a default.
    NotDType
        When either the class or the instance provides a value not
        coercible to a numpy dtype.
    NoDefaultMissingValue
        When dtype requires an explicit missing_value, but
        ``missing_value`` is NotSpecified.
    """
    if dtype is NotSpecified:
        raise DTypeNotSpecified(termname=termname)

    try:
        dtype = dtype_class(dtype)
    except TypeError:
        raise NotDType(dtype=dtype, termname=termname)

    if not can_represent_dtype(dtype):
        raise UnsupportedDType(dtype=dtype, termname=termname)

    if missing_value is NotSpecified:
        missing_value = default_missing_value_for_dtype(dtype)

    try:
        _coerce_to_dtype(missing_value, dtype)
    except TypeError as e:
        raise TypeError(
            "Missing value {value!r} is not a valid choice "
            "for term {termname} with dtype {dtype}.\n\n"
            "Coercion attempt failed with: {error}".format(
                termname=termname,
                value=missing_value,
                dtype=dtype,
                error=e,
            )
        )

    return dtype, missing_value
Exemplo n.º 7
0
    def validate_dtype(termname, dtype, missing_value):
        """
        Validate a `dtype` and `missing_value` passed to Term.__new__.

        Ensures that we know how to represent ``dtype``, and that missing_value
        is specified for types without default missing values.

        Returns
        -------
        validated_dtype, validated_missing_value : np.dtype, any
            The dtype and missing_value to use for the new term.

        Raises
        ------
        DTypeNotSpecified
            When no dtype was passed to the instance, and the class doesn't
            provide a default.
        NotDType
            When either the class or the instance provides a value not
            coercible to a numpy dtype.
        NoDefaultMissingValue
            When dtype requires an explicit missing_value, but
            ``missing_value`` is NotSpecified.
        """
        if dtype is NotSpecified:
            raise DTypeNotSpecified(termname=termname)
        try:
            dtype = dtype_class(dtype)
        except TypeError:
            raise NotDType(dtype=dtype, termname=termname)

        if not can_represent_dtype(dtype):
            raise UnsupportedDType(dtype=dtype, termname=termname)

        if missing_value is NotSpecified:
            missing_value = default_missing_value_for_dtype(dtype)

        return dtype, missing_value
Exemplo n.º 8
0
    def validate_dtype(termname, dtype, missing_value):
        """
        Validate a `dtype` and `missing_value` passed to Term.__new__.

        Ensures that we know how to represent ``dtype``, and that missing_value
        is specified for types without default missing values.

        Returns
        -------
        validated_dtype, validated_missing_value : np.dtype, any
            The dtype and missing_value to use for the new term.

        Raises
        ------
        DTypeNotSpecified
            When no dtype was passed to the instance, and the class doesn't
            provide a default.
        NotDType
            When either the class or the instance provides a value not
            coercible to a numpy dtype.
        NoDefaultMissingValue
            When dtype requires an explicit missing_value, but
            ``missing_value`` is NotSpecified.
        """
        if dtype is NotSpecified:
            raise DTypeNotSpecified(termname=termname)
        try:
            dtype = dtype_class(dtype)
        except TypeError:
            raise NotDType(dtype=dtype, termname=termname)

        if not can_represent_dtype(dtype):
            raise UnsupportedDType(dtype=dtype, termname=termname)

        if missing_value is NotSpecified:
            missing_value = default_missing_value_for_dtype(dtype)

        return dtype, missing_value
Exemplo n.º 9
0
def validate_dtype(termname, dtype, missing_value):
    """
    Validate a `dtype` and `missing_value` passed to Term.__new__.

    Ensures that we know how to represent ``dtype``, and that missing_value
    is specified for types without default missing values.

    Returns
    -------
    validated_dtype, validated_missing_value : np.dtype, any
        The dtype and missing_value to use for the new term.

    Raises
    ------
    DTypeNotSpecified
        When no dtype was passed to the instance, and the class doesn't
        provide a default.
    NotDType
        When either the class or the instance provides a value not
        coercible to a numpy dtype.
    NoDefaultMissingValue
        When dtype requires an explicit missing_value, but
        ``missing_value`` is NotSpecified.
    """
    if dtype is NotSpecified:
        raise DTypeNotSpecified(termname=termname)

    try:
        dtype = dtype_class(dtype)
    except TypeError:
        raise NotDType(dtype=dtype, termname=termname)

    if not can_represent_dtype(dtype):
        raise UnsupportedDType(dtype=dtype, termname=termname)

    if missing_value is NotSpecified:
        missing_value = default_missing_value_for_dtype(dtype)

    try:
        if (dtype == categorical_dtype):
            # This check is necessary because we use object dtype for
            # categoricals, and numpy will allow us to promote numerical
            # values to object even though we don't support them.
            _assert_valid_categorical_missing_value(missing_value)

        # For any other type, we can check if the missing_value is safe by
        # making an array of that value and trying to safely convert it to
        # the desired type.
        # 'same_kind' allows casting between things like float32 and
        # float64, but not str and int.
        array([missing_value]).astype(dtype=dtype, casting='same_kind')
    except TypeError as e:
        raise TypeError(
            "Missing value {value!r} is not a valid choice "
            "for term {termname} with dtype {dtype}.\n\n"
            "Coercion attempt failed with: {error}".format(
                termname=termname,
                value=missing_value,
                dtype=dtype,
                error=e,
            )
        )

    return dtype, missing_value