Exemplo n.º 1
0
def test_get_valid_dtype(sample_series):
    """Check the dtype ``_get_valid_dtype`` reports for Categorical and Boolean.

    When ``ks`` is truthy and the fixture is a ``ks.Series`` the Categorical
    dtype is expected to be ``'string'``; otherwise ``'category'``. Boolean is
    expected to be ``'bool'`` for every series type.
    """
    series_type = type(sample_series)

    categorical_dtype = _get_valid_dtype(series_type, Categorical)
    # NOTE(review): `ks` is presumably the optional Koalas module, falsy when
    # it is not installed -- confirm against the module's imports.
    is_ks_series = ks and isinstance(sample_series, ks.Series)
    assert categorical_dtype == ('string' if is_ks_series else 'category')

    boolean_dtype = _get_valid_dtype(series_type, Boolean)
    assert boolean_dtype == 'bool'
Exemplo n.º 2
0
            def wrapper(*args, **kwargs):
                """Call the wrapped Series attribute and try to re-attach typing info.

                The result of the Series call is always returned. If it is a
                Series whose dtype matches the dtype expected for the current
                logical type, Woodwork is initialized on it with copies of the
                current schema's attributes; if the dtype differs, a
                ``TypingInfoMismatchWarning`` is issued instead.
                """
                outcome = series_attr(*args, **kwargs)

                # Non-Series results carry no typing information: hand them back as-is
                if not _is_series(outcome):
                    return outcome

                expected_dtype = _get_valid_dtype(type(outcome),
                                                  self._schema.logical_type)
                if str(outcome.dtype) != expected_dtype:
                    # The operation changed the dtype, so the schema no longer applies
                    mismatch_details = (
                        'dtype mismatch between original dtype, '
                        f'{expected_dtype}, and returned dtype, {outcome.dtype}'
                    )
                    message = TypingInfoMismatchWarning().get_warning_message(
                        attr, mismatch_details, 'Series')
                    warnings.warn(message, TypingInfoMismatchWarning)
                    return outcome

                # dtype still valid: carry the typing information over, deep-copying
                # the mutable pieces so the new Series does not share them
                outcome.ww.init(
                    logical_type=self._schema.logical_type,
                    semantic_tags=copy.deepcopy(self._schema.semantic_tags),
                    description=self._schema.description,
                    metadata=copy.deepcopy(self._schema.metadata),
                    use_standard_tags=self._schema.use_standard_tags,
                )
                return outcome
Exemplo n.º 3
0
def _get_invalid_schema_message(dataframe, schema):
    """Describe the first inconsistency between a DataFrame and its schema.

    Checks run in this order and stop at the first failure:

    1. columns present in the DataFrame but not in the schema,
    2. columns present in the schema but not in the DataFrame,
    3. a column whose dtype differs from the dtype expected for its
       logical type,
    4. for pandas DataFrames with a schema index only: the DataFrame index
       differing from the index column, or the index column not being unique.

    Returns:
        str or None: A message for the first problem found, or ``None`` when
        no problem is found.
    """
    df_columns = set(dataframe.columns)
    typed_columns = set(schema.columns.keys())

    untyped = df_columns - typed_columns
    if untyped:
        return ('The following columns in the DataFrame were missing from the typing information: '
                f'{untyped}')

    absent = typed_columns - df_columns
    if absent:
        return ('The following columns in the typing information were missing from the DataFrame: '
                f'{absent}')

    for name in dataframe.columns:
        column = dataframe[name]
        logical_type = schema.logical_types[name]
        expected_dtype = _get_valid_dtype(type(column), logical_type)
        if str(column.dtype) != expected_dtype:
            return (f'dtype mismatch for column {name} between DataFrame dtype, '
                    f'{column.dtype}, and {logical_type} dtype, {expected_dtype}')

    # Index validation not performed for Dask/Koalas
    if schema.index is None or not isinstance(dataframe, pd.DataFrame):
        return None

    index_column = dataframe[schema.index]
    if not all(dataframe.index == index_column):
        return 'Index mismatch between DataFrame and typing information'
    if not index_column.is_unique:
        return 'Index column is not unique'
    return None
Exemplo n.º 4
0
 def physical_types(self):
     """Map every DataFrame column name to the dtype expected for its logical type.

     Calls ``_raise_init_error`` when no schema has been set on this accessor.
     """
     schema = self._schema
     if schema is None:
         _raise_init_error()

     dataframe = self._dataframe
     return {
         column: _get_valid_dtype(type(dataframe[column]),
                                  schema.logical_types[column])
         for column in dataframe.columns
     }
Exemplo n.º 5
0
def test_init_series_valid_conversion_specified_ltype(sample_series):
    """``init_series`` converts the dtype when a logical type is given explicitly.

    The fixture is first cast to ``'str'`` (for ``ks.Series``) or ``'object'``
    (otherwise). For each requested logical type, the returned series must be
    a new object whose dtype, logical type and semantic tags match what is
    expected.
    """
    is_ks_series = ks and isinstance(sample_series, ks.Series)
    sample_series = sample_series.astype('str' if is_ks_series else 'object')

    # (name passed to init_series, expected logical type, expected semantic tags)
    cases = (
        ('categorical', Categorical, {'category'}),
        ('natural_language', NaturalLanguage, set()),
    )
    for ltype_name, expected_ltype, expected_tags in cases:
        converted = init_series(sample_series, logical_type=ltype_name)
        assert converted is not sample_series
        expected_dtype = _get_valid_dtype(type(sample_series), expected_ltype)
        assert converted.dtype == expected_dtype
        assert converted.ww.logical_type == expected_ltype
        assert converted.ww.semantic_tags == expected_tags
Exemplo n.º 6
0
def _validate_schema(schema, series):
    """Ensure ``schema`` is a ColumnSchema whose logical type fits ``series``.

    Raises:
        TypeError: If ``schema`` is not a ``ColumnSchema`` instance.
        ValueError: If the series dtype differs from the dtype expected for
            the schema's logical type.
    """
    if not isinstance(schema, ColumnSchema):
        raise TypeError(
            'Provided schema must be a Woodwork.ColumnSchema object.')

    logical_type = schema.logical_type
    expected_dtype = _get_valid_dtype(type(series), logical_type)
    actual_dtype = series.dtype
    if str(actual_dtype) == expected_dtype:
        return

    raise ValueError(
        f"dtype mismatch between Series dtype {actual_dtype}, and {logical_type} dtype, {expected_dtype}"
    )
Exemplo n.º 7
0
def test_init_series_valid_conversion_inferred_ltype(sample_series):
    """``init_series`` without a logical type yields a Categorical series.

    The fixture is first cast to ``'str'`` (for ``ks.Series``) or ``'object'``
    (otherwise); the returned series must be a new object with the dtype
    expected for Categorical and the ``'category'`` semantic tag.
    """
    is_ks_series = ks and isinstance(sample_series, ks.Series)
    sample_series = sample_series.astype('str' if is_ks_series else 'object')

    inferred = init_series(sample_series)

    assert inferred is not sample_series
    expected_dtype = _get_valid_dtype(type(sample_series), Categorical)
    assert inferred.dtype == expected_dtype
    assert inferred.ww.logical_type == Categorical
    assert inferred.ww.semantic_tags == {'category'}
Exemplo n.º 8
0
    def _validate_logical_type(self, logical_type):
        """Check that ``logical_type`` can be applied to the wrapped series.

        The series dtype must equal the dtype expected for the logical type.
        Ordinal instances additionally validate the series data, and LatLong
        requires the series to pass ``_is_valid_latlong_series``.

        Raises:
            ValueError: If the dtype is incompatible, or if the logical type
                is LatLong and the series is not properly formatted.
        """
        series = self._series
        expected_dtype = _get_valid_dtype(type(series), logical_type)
        if expected_dtype != str(series.dtype):
            raise ValueError(
                f"Cannot initialize Woodwork. Series dtype '{series.dtype}' is "
                f"incompatible with {logical_type} dtype. Try converting series "
                f"dtype to '{expected_dtype}' before initializing or use the "
                "woodwork.init_series function to initialize.")

        # Type-specific validation: Ordinal is matched as an instance,
        # LatLong by equality with the class itself
        if isinstance(logical_type, Ordinal):
            logical_type._validate_data(series)
            return

        if logical_type == LatLong and not _is_valid_latlong_series(series):
            raise ValueError(
                "Cannot initialize Woodwork. Series does not contain properly formatted "
                "LatLong data. Try reformatting before initializing or use the "
                "woodwork.init_series function to initialize.")
Exemplo n.º 9
0
            def wrapper(*args, **kwargs):
                """Call the wrapped Series attribute and try to re-attach the schema.

                The result of the Series call is always returned. If it is a
                Series whose dtype matches the dtype expected for the current
                logical type, Woodwork is initialized on it from ``self.schema``
                with validation turned off; if the dtype differs, a
                ``TypingInfoMismatchWarning`` is issued instead.
                """
                outcome = series_attr(*args, **kwargs)

                # Non-Series results carry no typing information: hand them back as-is
                if not _is_series(outcome):
                    return outcome

                expected_dtype = _get_valid_dtype(type(outcome),
                                                  self._schema.logical_type)
                if str(outcome.dtype) == expected_dtype:
                    # The dtype was just checked above, so init skips validation
                    outcome.ww.init(schema=self.schema, validate=False)
                    return outcome

                # The operation changed the dtype, so the schema no longer applies
                mismatch_details = (
                    'dtype mismatch between original dtype, '
                    f'{expected_dtype}, and returned dtype, {outcome.dtype}'
                )
                message = TypingInfoMismatchWarning().get_warning_message(
                    attr, mismatch_details, 'Series')
                warnings.warn(message, TypingInfoMismatchWarning)
                return outcome
Exemplo n.º 10
0
def test_init_series_all_parameters(sample_series):
    """``init_series`` applies every optional typing parameter it is given.

    The fixture is first cast to ``'str'`` (for ``ks.Series``) or ``'object'``
    (otherwise). The returned series must be a new object carrying the
    requested logical type, custom tag, metadata and description.
    """
    is_ks_series = ks and isinstance(sample_series, ks.Series)
    sample_series = sample_series.astype('str' if is_ks_series else 'object')

    expected_metadata = {'meta_key': 'meta_value'}
    expected_description = 'custom description'
    init_options = {
        'logical_type': 'categorical',
        'semantic_tags': ['custom_tag'],
        'metadata': expected_metadata,
        'description': expected_description,
        'use_standard_tags': False,
    }
    typed = init_series(sample_series, **init_options)

    assert typed is not sample_series
    expected_dtype = _get_valid_dtype(type(sample_series), Categorical)
    assert typed.dtype == expected_dtype
    assert typed.ww.logical_type == Categorical
    # Standard tags are disabled, so only the custom tag is expected
    assert typed.ww.semantic_tags == {'custom_tag'}
    assert typed.ww.metadata == expected_metadata
    assert typed.ww.description == expected_description