Exemplo n.º 1
0
def test_exclude_variables(input_files, filename, as_file, exclude_variables, error):
    """Verify that ``exclude_variables`` removes the named variables from both
    the DataFrame and the Metadata returned by ``read._read_spss()``."""
    input_data = check_input_file(input_files, filename)
    if as_file:
        data = input_data
    else:
        with open(input_data, 'rb') as file_:
            data = file_.read()

    if error:
        # Failure case: the read itself should raise the expected error.
        with pytest.raises(error):
            result = read._read_spss(data)
    else:
        result = read._read_spss(data, exclude_variables = exclude_variables)
        assert result is not None
        assert isinstance(result, tuple)
        assert len(result) == 2
        df, metadata = result
        assert df is not None
        assert metadata is not None
        assert checkers.is_type(df, 'DataFrame')
        assert checkers.is_type(metadata, 'Metadata')

        assert len(df) == metadata.rows
        # Excluded variables must appear in neither the data nor the metadata.
        for variable in exclude_variables:
            assert variable not in metadata.column_metadata
            assert variable not in df
Exemplo n.º 2
0
def test_limit_offset(input_files, filename, as_file, limit, offset, error):
    """Check that ``limit`` caps the number of records returned by
    ``read._read_spss()``.

    NOTE(review): ``offset`` is accepted but never forwarded to
    ``_read_spss()`` — confirm whether the call should also pass it.
    """
    input_data = check_input_file(input_files, filename)
    if as_file:
        data = input_data
    else:
        with open(input_data, 'rb') as file_:
            data = file_.read()

    if error:
        with pytest.raises(error):
            result = read._read_spss(data)
    else:
        result = read._read_spss(data, limit = limit)
        assert result is not None
        assert isinstance(result, tuple)
        assert len(result) == 2
        df, metadata = result
        assert df is not None
        assert metadata is not None
        assert checkers.is_type(df, 'DataFrame')
        assert checkers.is_type(metadata, 'Metadata')

        if limit is None:
            expected_length = metadata.rows
        else:
            # For offset >= 0 this simplifies to ``limit``; the original
            # expression is preserved for behavioral fidelity.
            expected_length = max((limit - offset), limit)
        assert len(df) == expected_length
Exemplo n.º 3
0
def _handle_failure(on_failure = None,
                    error = None):
    """Handle the failure of a function called by :ref:`backoff`.

    :param on_failure: The :class:`Exception <python:Exception>` or function to call
      when all retry attempts have failed. If :class:`None <python:None>`, will raise the last-caught
      :class:`Exception <python:Exception>`. If an :class:`Exception <python:Exception>`,
      will raise the exception with the same message as the last-caught exception.
      If a function, will call the function and pass the last-raised exception, its
      message, and stacktrace to the function. Defaults to :class:`None <python:None>`.
    :type on_failure: :class:`Exception <python:Exception>` / function / :class:`None <python:None>`

    :param error: The :class:`Exception <python:Exception>` that was raised. Defaults
      to :class:`Exception <python:Exception>`.
    :type error: :class:`Exception <python:Exception>`
    """
    if error is None:
        error = Exception

    # Determine whether on_failure is an exception (class or instance) rather
    # than a plain callback — the raising behavior below depends on this.
    is_on_failure_an_exception = False
    if is_py2:
        if isinstance(on_failure, Exception):
            is_on_failure_an_exception = True
        elif checkers.is_type(on_failure, 'type'):
            # Instantiate the class purely to test whether it is an Exception.
            is_on_failure_an_exception = isinstance(on_failure(), Exception)
        else:
            is_on_failure_an_exception = False
    else:
        # NOTE(review): on Python 3 this relies on exception classes exposing
        # ``__cause__`` to distinguish them from ordinary callables — confirm.
        is_on_failure_an_exception = checkers.is_type(on_failure,
                                                      ('type', 'Exception')) and \
                                     hasattr(on_failure, '__cause__')


    if on_failure is None:
        # No handler supplied: re-raise the last-caught error (or bare Exception).
        raise error
    elif is_on_failure_an_exception:
        # Raise the supplied exception type, reusing the caught error's message.
        # NOTE(review): assumes ``error.args`` is non-empty — an argument-less
        # exception would raise IndexError here; confirm upstream guarantees.
        raise on_failure(error.args[0])
    else:
        try:
            # Callback handler: receives the exception, its message, and the
            # current traceback (if any).
            on_failure(error, error.args[0], sys.exc_info()[2])
        except Exception as nested_error:
            raise nested_error
Exemplo n.º 4
0
def apply_metadata(df: DataFrame,
                   metadata: Union[Metadata, dict,
                                   pyreadstat.metadata_container],
                   as_category: bool = True):
    """Updates the :class:`DataFrame <pandas:DataFrame>` ``df`` based on the ``metadata``.

    :param df: The :class:`DataFrame <pandas:pandas.DataFrame>` to update.
    :type df: :class:`pandas.DataFrame <pandas:pandas.DataFrame>`

    :param metadata: The :class:`Metadata` to apply to ``df``.
    :type metadata: :class:`Metadata`, :class:`pyreadstat.metadata_container`, or
      compatible :class:`dict <python:dict>`

    :param as_category: if ``True``, variables with formats will be transformed into
      categories in the :class:`DataFrame <pandas:pandas.DataFrame>`. Defaults to
      ``True``.
    :type as_category: :class:`bool <python:bool>`

    :returns: A copy of ``df`` updated to reflect ``metadata``.
    :rtype: :class:`DataFrame <pandas:pandas.DataFrame>`

    :raises ValueError: if ``df`` is not a
      :class:`pandas.DataFrame <pandas:pandas.DataFrame>`
    :raises ValueError: if ``metadata`` is not a :class:`Metadata`,
      :class:`pyreadstat.metadata_container`, or compatible
      :class:`dict <python:dict>`
    """
    if not checkers.is_type(df, 'DataFrame'):
        raise ValueError(
            f'df must be a pandas.DataFrame. Was: {df.__class__.__name__}')
    # Normalize metadata to a Metadata instance regardless of its input form.
    if not checkers.is_type(metadata,
                            ('Metadata', 'metadata_container', 'dict')):
        raise ValueError(
            f'metadata must be a Metadata instance or compatible object. '
            f'Was: {metadata.__class__.__name__}')
    elif checkers.is_type(metadata, 'metadata_container'):
        metadata = Metadata.from_pyreadstat(metadata)
    elif checkers.is_type(metadata, 'dict'):
        metadata = Metadata.from_dict(metadata)

    # pyreadstat applies value labels (and optional category conversion) from
    # its own metadata representation.
    as_pyreadstat = metadata.to_pyreadstat()

    return pyreadstat.set_value_labels(df,
                                       metadata=as_pyreadstat.value_labels,
                                       formats_as_category=as_category)
Exemplo n.º 5
0
def test_default_params(input_files, filename, as_file, error):
    """Smoke-test ``read._read_spss()`` with its default parameters."""
    input_data = check_input_file(input_files, filename)
    if as_file:
        data = input_data
    else:
        with open(input_data, 'rb') as file_:
            data = file_.read()

    if error:
        with pytest.raises(error):
            result = read._read_spss(data)
    else:
        result = read._read_spss(data)
        assert result is not None
        assert isinstance(result, tuple)
        assert len(result) == 2
        df, metadata = result
        assert df is not None
        assert metadata is not None
        assert checkers.is_type(df, 'DataFrame')
        assert checkers.is_type(metadata, 'Metadata')

        # Without a limit, every record in the file should be returned.
        assert len(df) == metadata.rows
Exemplo n.º 6
0
    def column_metadata(self, value):
        """Setter for the per-variable :class:`ColumnMetadata` collection.

        :param value: A :class:`dict <python:dict>` mapping variable names to
          :class:`ColumnMetadata` instances (or compatible
          :class:`dict <python:dict>` representations). An empty or
          :obj:`None <python:None>` value clears the collection.

        :raises ValueError: if ``value`` is not a :class:`dict <python:dict>`
          or a key is not a valid variable name
        """
        value = validators.dict(value, allow_empty = True)
        if not value:
            self._column_metadata = None
        else:
            result = {}
            for key in value:
                key = validators.variable_name(key, allow_empty = False)
                if checkers.is_type(value[key], 'ColumnMetadata'):
                    result[key] = value[key]
                else:
                    # BUG FIX: previously called ColumnMetadata.from_dict(result[key]),
                    # which raised KeyError because result[key] was not set yet.
                    result[key] = ColumnMetadata.from_dict(value[key])

            self._column_metadata = result
Exemplo n.º 7
0
def test_get_metadata(input_files, filename, as_file, error):
    """Confirm ``read.get_metadata()`` returns a populated Metadata instance."""
    input_data = check_input_file(input_files, filename)
    if as_file:
        data = input_data
    else:
        with open(input_data, 'rb') as file_:
            data = file_.read()

    if error:
        with pytest.raises(error):
            result = read.get_metadata(data)
    else:
        result = read.get_metadata(data)
        assert result is not None
        assert checkers.is_type(result, 'Metadata')
        assert result.column_metadata is not None
def get_attribute_names(obj,
                        include_callable=False,
                        include_nested=True,
                        include_private=False,
                        include_special=False,
                        include_utilities=False):
    """Return a list of attribute names within ``obj``.

    :param obj: The object whose attribute names should be collected.

    :param include_callable: If ``True``, will include callable attributes (methods).
      Defaults to ``False``.
    :type include_callable: :class:`bool <python:bool>`

    :param include_nested: If ``True``, will include attributes that are
      arbitrarily-nestable types (such as a :term:`model class` or
      :class:`dict <python:dict>`). Defaults to ``True``.
    :type include_nested: :class:`bool <python:bool>`

    :param include_private: If ``True``, will include attributes whose names
      begin with ``_`` (but *not* ``__``). Defaults to ``False``.
    :type include_private: :class:`bool <python:bool>`

    :param include_special: If ``True``, will include attributes whose names begin
      with ``__``. Defaults to ``False``.
    :type include_special: :class:`bool <python:bool>`

    :param include_utilities: If ``True``, will include utility properties
      added by SQLAlchemy or **SQLAthanor**. Defaults to ``False``.
    :type include_utilities: :class:`bool <python:bool>`

    :returns: :term:`Model Attribute` names attached to ``obj``.
    :rtype: :class:`list <python:list>` of :class:`str <python:str>`

    """
    # Filter out SQLAlchemy/SQLAthanor utility names unless explicitly requested.
    attribute_names = [x for x in dir(obj)
                       if (include_utilities and x in UTILITY_COLUMNS) or \
                          (x not in UTILITY_COLUMNS)]
    attributes = []
    for attribute in attribute_names:
        # Skip private (single-underscore) names unless requested.
        if (attribute[0] == '_'
                and attribute[0:2] != '__') and not include_private:
            continue

        # Skip dunder (double-underscore) names unless requested.
        if attribute[0:2] == '__' and not include_special:
            continue

        try:
            attribute_value = getattr(obj, attribute)
        except SA_InvalidRequestError:
            # SQLAlchemy can raise on relationship/nested access; treat such
            # attributes as nested: keep the name only if nesting is allowed.
            if not include_nested:
                continue

            attributes.append(attribute)
            continue

        if not include_nested:
            # Exclude nestable model/relationship/proxy/dict attributes.
            if checkers.is_type(attribute_value,
                                ('BaseModel', 'RelationshipProperty',
                                 'AssociationProxy', dict)):
                continue

            try:
                is_iterable = checkers.is_iterable(attribute_value,
                                                   forbid_literals=(str, bytes,
                                                                    dict))
            except SA_InvalidRequestError as error:
                # NOTE(review): within this branch include_nested is always
                # False, so the ``continue`` is always taken and the ``else``
                # below is dead code — confirm intended behavior.
                if not include_nested:
                    continue
                else:
                    is_iterable = False

            if is_iterable:
                loop = False

                # An iterable that contains any nestable item is itself
                # treated as nested and excluded.
                try:
                    for item in attribute_value:
                        if checkers.is_type(
                                item, ('BaseModel', 'RelationshipProperty',
                                       'AssociationProxy', dict)):
                            loop = True
                            break
                except (NotImplementedError, TypeError):
                    # Some iterables cannot be iterated this way; ignore them.
                    pass

                if loop:
                    continue

        if not include_callable and checkers.is_callable(attribute_value):
            continue

        attributes.append(attribute)

    return attributes
Exemplo n.º 9
0
def from_dataframe(df: DataFrame,
                   target: Optional[Union['PathLike[Any]', BytesIO]] = None,
                   metadata: Optional[Metadata] = None,
                   compress: bool = False):
    """Create an SPSS dataset from a `Pandas <https://pandas.pydata.org/>`_
    :class:`DataFrame <pandas:DataFrame>`.

    :param df: The :class:`DataFrame` to serialize to an SPSS dataset.
    :type df: :class:`pandas.DataFrame <pandas:DataFrame>`

    :param target: The target to which the SPSS dataset should be written. Accepts either
      a filename/path, a :class:`BytesIO <python:io.BytesIO>` object, or
      :obj:`None <python:None>`. If :obj:`None <python:None>` will return a
      :class:`BytesIO <python:io.BytesIO>` object containing the SPSS dataset. Defaults to
      :obj:`None <python:None>`.
    :type target: Path-like / :class:`BytesIO <python:io.BytesIO>` /
      :obj:`None <python:None>`

    :param metadata: The :class:`Metadata` associated with the dataset. If
      :obj:`None <python:None>`, will attempt to derive it from ``df``. Defaults to
      :obj:`None <python:None>`.
    :type metadata: :class:`Metadata` / :obj:`None <python:None>`

    :param compress: If ``True``, will return data in the compressed ZSAV format. If
      ``False``, will return data in the standard SAV format. Defaults to ``False``.
    :type compress: :class:`bool <python:bool>`

    :returns: A :class:`BytesIO <python:io.BytesIO>` object containing the SPSS data if
      ``target`` is :obj:`None <python:None>` or not a filename, otherwise
      :obj:`None <python:None>`
    :rtype: :class:`BytesIO <python:io.BytesIO>` or :obj:`None <python:None>`

    :raises ValueError: if ``df`` is not a :class:`pandas.DataFrame <pandas:DataFrame>`
    :raises ValueError: if ``metadata`` is not a :class:`Metadata`

    """
    if not checkers.is_type(df, 'DataFrame'):
        raise ValueError(
            f'df must be a pandas.DataFrame. Was: {df.__class__.__name__}')
    # Normalize metadata to a Metadata instance regardless of its input form.
    if metadata and not checkers.is_type(
            metadata, ('Metadata', 'metadata_container', 'dict')):
        raise ValueError(
            f'metadata must be a Metadata instance or compatible object. '
            f'Was: {metadata.__class__.__name__}')
    elif metadata and checkers.is_type(metadata, 'metadata_container'):
        metadata = Metadata.from_pyreadstat(metadata)
    elif metadata and checkers.is_type(metadata, 'dict'):
        metadata = Metadata.from_dict(metadata)

    # Distinguish between a filesystem path and an in-memory BytesIO target.
    is_file = False
    if target and checkers.is_pathlike(target):
        is_file = True
    elif target:
        target = validators.bytesIO(target, allow_empty=False)

    if metadata:
        as_pyreadstat = metadata.to_pyreadstat()
    else:
        as_pyreadstat = None

    if target and is_file:
        # Write the dataset straight to the destination file.
        # NOTE(review): pyreadstat documents ``dst_path`` as a path string;
        # passing the open file object here should be confirmed against the
        # pyreadstat version in use.
        with open(target, 'wb') as target_file:
            if as_pyreadstat:
                pyreadstat.write_sav(
                    df=df,
                    dst_path=target_file,
                    file_label=as_pyreadstat.file_label,
                    column_labels=as_pyreadstat.column_labels,
                    compress=compress,
                    note=as_pyreadstat.notes,
                    variable_value_labels=as_pyreadstat.variable_value_labels,
                    missing_ranges=as_pyreadstat.missing_ranges,
                    variable_display_width=as_pyreadstat.
                    variable_display_width,
                    variable_measure=as_pyreadstat.variable_measure)
            else:
                pyreadstat.write_sav(df=df,
                                     dst_path=target_file,
                                     compress=compress)

    else:
        # Write to a named temporary file (pyreadstat writes by path), then
        # read its contents back through the still-open handle.
        with tempfile.NamedTemporaryFile() as temp_file:
            if as_pyreadstat:
                pyreadstat.write_sav(
                    df=df,
                    dst_path=temp_file.name,
                    file_label=as_pyreadstat.file_label,
                    column_labels=as_pyreadstat.column_labels,
                    compress=compress,
                    note=as_pyreadstat.notes,
                    variable_value_labels=as_pyreadstat.variable_value_labels,
                    missing_ranges=as_pyreadstat.missing_ranges,
                    variable_display_width=as_pyreadstat.
                    variable_display_width,
                    variable_measure=as_pyreadstat.variable_measure)
            else:
                pyreadstat.write_sav(df=df,
                                     dst_path=temp_file.name,
                                     compress=compress)

            if target:
                # Caller supplied a BytesIO: copy the serialized bytes into it.
                target.write(temp_file.read())
            else:
                target = BytesIO(temp_file.read())

            return target
Exemplo n.º 10
0
def backoff(to_execute,
            args = None,
            kwargs = None,
            strategy = None,
            retry_execute = None,
            retry_args = None,
            retry_kwargs = None,
            max_tries = None,
            max_delay = None,
            catch_exceptions = None,
            on_failure = None,
            on_success = None):
    """Retry a function call multiple times with a delay per the strategy given.

    :param to_execute: The function call that is to be attempted.
    :type to_execute: callable

    :param args: The positional arguments to pass to the function on the first attempt.

      If ``retry_args`` is :class:`None <python:None>`, will re-use these
      arguments on retry attempts as well.
    :type args: iterable / :class:`None <python:None>`.

    :param kwargs: The keyword arguments to pass to the function on the first attempt.

      If ``retry_kwargs`` is :class:`None <python:None>`, will re-use these keyword
      arguments on retry attempts as well.
    :type kwargs: :class:`dict <python:dict>` / :class:`None <python:None>`

    :param strategy: The :class:`BackoffStrategy` to use when determining the
      delay between retry attempts.

      If :class:`None <python:None>`, defaults to :class:`Exponential`.
    :type strategy: :class:`BackoffStrategy`

    :param retry_execute: The function to call on retry attempts.

      If :class:`None <python:None>`, will retry ``to_execute``.

      Defaults to :class:`None <python:None>`.
    :type retry_execute: callable / :class:`None <python:None>`

    :param retry_args: The positional arguments to pass to the function on retry attempts.

      If :class:`None <python:None>`, will re-use ``args``.

      Defaults to :class:`None <python:None>`.
    :type retry_args: iterable / :class:`None <python:None>`

    :param retry_kwargs: The keyword arguments to pass to the function on retry attempts.

      If :class:`None <python:None>`, will re-use ``kwargs``.

      Defaults to :class:`None <python:None>`.
    :type retry_kwargs: :class:`dict <python:dict>` / :class:`None <python:None>`

    :param max_tries: The maximum number of times to attempt the call.

      If :class:`None <python:None>`, will apply an environment variable
      ``BACKOFF_DEFAULT_TRIES``. If that environment variable is not set, will
      apply a default of ``3``.
    :type max_tries: int / :class:`None <python:None>`

    :param max_delay: The maximum number of seconds to wait before giving up
      once and for all. If :class:`None <python:None>`, will apply an environment variable
      ``BACKOFF_DEFAULT_DELAY`` if that environment variable is set. If it is not
      set, will not apply a max delay at all.
    :type max_delay: :class:`None <python:None>` / int

    :param catch_exceptions: The ``type(exception)`` to catch and retry. If
      :class:`None <python:None>`, will catch all exceptions.

      Defaults to :class:`None <python:None>`.

      .. caution::

        The iterable must contain one or more types of exception *instances*, and not
        class objects. For example:

        .. code-block:: python

          # GOOD:
          catch_exceptions = (type(ValueError()), type(TypeError()))

          # BAD:
          catch_exceptions = (type(ValueError), type(ValueError))

          # BAD:
          catch_exceptions = (ValueError, TypeError)

          # BAD:
          catch_exceptions = (ValueError(), TypeError())

    :type catch_exceptions: iterable of form ``[type(exception()), ...]``

    :param on_failure: The :class:`exception <python:Exception>` or function to call
      when all retry attempts have failed.

      If :class:`None <python:None>`, will raise the last-caught
      :class:`exception <python:Exception>`.

      If an :class:`exception <python:Exception>`, will raise the exception with
      the same message as the last-caught exception.

      If a function, will call the function and pass the last-raised exception, its
      message, and stacktrace to the function.

      Defaults to :class:`None <python:None>`.
    :type on_failure: :class:`Exception <python:Exception>` / function /
      :class:`None <python:None>`

    :param on_success: The function to call when the operation was successful.
      The function receives the result of the ``to_execute`` or ``retry_execute``
      function that was successful, and is called before that result is returned
      to whatever code called the backoff function. If :class:`None <python:None>`,
      will just return the result of ``to_execute`` or ``retry_execute`` without
      calling a handler.

      Defaults to :class:`None <python:None>`.
    :type on_success: callable / :class:`None <python:None>`

    :returns: The result of the attempted function.

    Example:

    .. code-block:: python

      from backoff_utils import backoff

      def some_function(arg1, arg2, kwarg1 = None):
          # Function does something
          pass

      result = backoff(some_function,
                       args = ['value1', 'value2'],
                       kwargs = { 'kwarg1': 'value3' },
                       max_tries = 3,
                       max_delay = 30,
                       strategy = strategies.Exponential)

    """
    # pylint: disable=too-many-branches,too-many-statements

    # --- Validate and normalize inputs --------------------------------------
    if to_execute is None:
        raise ValueError('to_execute cannot be None')
    elif not checkers.is_callable(to_execute):
        raise TypeError('to_execute must be callable')

    if strategy is None:
        strategy = strategies.Exponential

    if not hasattr(strategy, 'IS_INSTANTIATED'):
        raise TypeError('strategy must be a BackoffStrategy or descendent')
    # Accept either a BackoffStrategy class or an instance; instantiate a
    # throwaway copy only to type-check it below.
    if not strategy.IS_INSTANTIATED:
        test_strategy = strategy(attempt = 0)
    else:
        test_strategy = strategy

    if not checkers.is_type(test_strategy, 'BackoffStrategy'):
        raise TypeError('strategy must be a BackoffStrategy or descendent')

    if args:
        args = validators.iterable(args)
    if kwargs:
        kwargs = validators.dict(kwargs)

    if retry_execute is None:
        retry_execute = to_execute
    elif not checkers.is_callable(retry_execute):
        raise TypeError('retry_execute must be None or a callable')

    # Retry attempts fall back to the initial args/kwargs when not overridden.
    if not retry_args:
        retry_args = args
    else:
        retry_args = validators.iterable(retry_args)

    if not retry_kwargs:
        retry_kwargs = kwargs
    else:
        retry_kwargs = validators.dict(retry_kwargs)

    if max_tries is None:
        max_tries = DEFAULT_MAX_TRIES

    max_tries = validators.integer(max_tries)

    if max_delay is None:
        max_delay = DEFAULT_MAX_DELAY

    if catch_exceptions is None:
        # type(Exception()) is Exception: catch everything by default.
        catch_exceptions = [type(Exception())]
    else:
        if not checkers.is_iterable(catch_exceptions):
            catch_exceptions = [catch_exceptions]

        catch_exceptions = validators.iterable(catch_exceptions)

    if on_failure is not None and not checkers.is_callable(on_failure):
        raise TypeError('on_failure must be None or a callable')

    if on_success is not None and not checkers.is_callable(on_success):
        raise TypeError('on_success must be None or a callable')

    # --- Attempt loop --------------------------------------------------------
    cached_error = None

    return_value = None
    returned = False
    failover_counter = 0
    start_time = datetime.utcnow()
    # Iteration 0 is the initial attempt; up to ``max_tries`` retries follow.
    while failover_counter <= (max_tries):
        elapsed_time = (datetime.utcnow() - start_time).total_seconds()
        # Give up once total elapsed time exceeds max_delay (when set).
        if max_delay is not None and elapsed_time >= max_delay:
            if cached_error is None:
                raise BackoffTimeoutError('backoff timed out after:'
                                          ' {}s'.format(elapsed_time))
            else:
                _handle_failure(on_failure, cached_error)
        if failover_counter == 0:
            # First attempt: call to_execute with the original arguments.
            try:
                if args is not None and kwargs is not None:
                    return_value = to_execute(*args, **kwargs)
                elif args is not None:
                    return_value = to_execute(*args)
                elif kwargs is not None:
                    return_value = to_execute(**kwargs)
                else:
                    return_value = to_execute()
                returned = True
                break
            except Exception as error:                                          # pylint: disable=broad-except
                # Only exception types listed in catch_exceptions are retried;
                # anything else goes straight to the failure handler.
                if type(error) in catch_exceptions:
                    cached_error = error
                    strategy.delay(failover_counter)
                    failover_counter += 1
                    continue
                else:
                    _handle_failure(on_failure = on_failure,
                                    error = error)
                    return
        else:
            # Retry attempts: call retry_execute with the retry arguments.
            try:
                if retry_args is not None and retry_kwargs is not None:
                    return_value = retry_execute(*retry_args, **retry_kwargs)
                elif retry_args is not None:
                    return_value = retry_execute(*retry_args)
                elif retry_kwargs is not None:
                    return_value = retry_execute(**retry_kwargs)
                else:
                    return_value = retry_execute()
                returned = True
                break
            except Exception as error:                                          # pylint: disable=broad-except
                if type(error) in catch_exceptions:
                    strategy.delay(failover_counter)
                    cached_error = error
                    failover_counter += 1
                    continue
                else:
                    _handle_failure(on_failure = on_failure,
                                    error = error)
                    return

    # All attempts exhausted without a successful return.
    if not returned:
        _handle_failure(on_failure = on_failure,
                        error = cached_error)
        return
    elif returned and on_success is not None:
        on_success(return_value)

    return return_value
Exemplo n.º 11
0
def to_excel(data: Union['os.PathLike[Any]', BytesIO, bytes],
             target: Optional[Union['os.PathLike[Any]', BytesIO,
                                    ExcelWriter]] = None,
             sheet_name: str = 'Sheet1',
             start_row: int = 0,
             start_column: int = 0,
             null_text: str = 'NaN',
             include_header: bool = True,
             limit: Optional[int] = None,
             offset: int = 0,
             exclude_variables: Optional[List[str]] = None,
             include_variables: Optional[List[str]] = None,
             metadata_only: bool = False,
             apply_labels: bool = False,
             labels_as_categories: bool = True,
             missing_as_NaN: bool = False,
             convert_datetimes: bool = True,
             dates_as_datetime64: bool = False,
             **kwargs):
    r"""Convert the SPSS ``data`` into an Excel file where each row represents a record of
    SPSS data.

    :param data: The SPSS data to load. Accepts either a series of bytes or a filename.
    :type data: Path-like filename, :class:`bytes <python:bytes>` or
      :class:`BytesIO <python:io.bytesIO>`

    :param target: The destination where the Excel file should be stored. Accepts
      either a filename, file-pointer or a :class:`BytesIO <python:io.BytesIO>`, or
      an :class:`ExcelWriter <pandas:pandas.ExcelWriter>` instance.
    :type target: Path-like / :class:`BytesIO <python:io.BytesIO>` /
      :class:`ExcelWriter <pandas:pandas.ExcelWriter>`

    :param sheet_name: The worksheet on which the SPSS data should be written. Defaults to
      ``'Sheet1'``.
    :type sheet_name: :class:`str <python:str>`

    :param start_row: The row number (starting at 0) where the SPSS data should begin.
      Defaults to ``0``.
    :type start_row: :class:`int <python:int>`

    :param start_column: The column number (starting at 0) where the SPSS data should
      begin. Defaults to ``0``.
    :type start_column: :class:`int <python:int>`

    :param null_text: The way that missing values should be represented in the Excel
      file. Defaults to ``'NaN'``.
    :type null_text: :class:`str <python:str>`

    :param include_header: If ``True``, will include a header row with column
      labels. If ``False``, will not include a header row. Defaults to ``True``.
    :type include_header: :class:`bool <python:bool>`

    :param limit: The number of records to read from the data. If :obj:`None <python:None>`
      will return all records. Defaults to :obj:`None <python:None>`.
    :type limit: :class:`int <python:int>` or :obj:`None <python:None>`

    :param offset: The record at which to start reading the data. Defaults to 0 (first
      record).
    :type offset: :class:`int <python:int>`

    :param exclude_variables: A list of the variables that should be ignored when reading
      data. Defaults to :obj:`None <python:None>`.
    :type exclude_variables: iterable of :class:`str <python:str>` or
      :obj:`None <python:None>`

    :param include_variables: A list of the variables that should be explicitly included
      when reading data. Defaults to :obj:`None <python:None>`.
    :type include_variables: iterable of :class:`str <python:str>` or
      :obj:`None <python:None>`

    :param metadata_only: If ``True``, will return no data records in the resulting
      :class:`DataFrame <pandas:pandas.DataFrame>` but will return a complete
      :class:`Metadata` instance. Defaults to ``False``.
    :type metadata_only: :class:`bool <python:bool>`

    :param apply_labels: If ``True``, converts the numerically-coded values in the raw
      data to their human-readable labels. Defaults to ``False``.
    :type apply_labels: :class:`bool <python:bool>`

    :param labels_as_categories: If ``True``, will convert labeled or formatted values to
      Pandas :term:`categories <pandas:category>`. Defaults to ``True``.

      .. caution::

        This parameter will only have an effect if the ``apply_labels`` parameter is
        ``True``.

    :type labels_as_categories: :class:`bool <python:bool>`

    :param missing_as_NaN: If ``True``, will return any missing values as
      :class:`NaN <pandas:NaN>`. Otherwise will return missing values as per the
      configuration of missing value representation stored in the underlying SPSS data.
      Defaults to ``False``, which applies the missing value representation configured in
      the SPSS data itself.
    :type missing_as_NaN: :class:`bool <python:bool>`

    :param convert_datetimes: if ``True``, will convert the native integer representation
      of datetime values in the SPSS data to Pythonic
      :class:`datetime <python:datetime.datetime>`, or
      :class:`date <python:datetime.date>`, etc. representations (or Pandas
      :class:`datetime64 <pandas:datetime64>`, depending on the ``dates_as_datetime64``
      parameter). If ``False``, will leave the original integer representation. Defaults to
      ``True``.
    :type convert_datetimes: :class:`bool <python:bool>`

    :param dates_as_datetime64: If ``True``, will return any date values as Pandas
      :class:`datetime64 <pandas.datetime64>` types. Defaults to ``False``.

      .. caution::

        This parameter is only applied if ``convert_datetimes`` is set to ``True``.

    :type dates_as_datetime64: :class:`bool <python:bool>`

    :returns: :obj:`None <python:None>` if ``target`` was a filename or
      :class:`ExcelWriter <pandas:pandas.ExcelWriter>`, otherwise a
      :class:`BytesIO <python:io.BytesIO>` representation of the Excel file.
    :rtype: :obj:`None <python:None>` or :class:`BytesIO <python:io.BytesIO>`

    :raises InvalidDataFormatError: if ``target`` is not a filename,
      :class:`BytesIO <python:io.BytesIO>`,
      :class:`ExcelWriter <pandas:pandas.ExcelWriter>`, or
      :obj:`None <python:None>`

    """
    if target and \
       not checkers.is_pathlike(target) and \
       not checkers.is_bytesIO(target) and \
       not checkers.is_type(target, 'ExcelWriter'):
        # BUG FIX: the error message previously reported data's class rather
        # than the offending target's class.
        raise errors.InvalidDataFormatError(
            'target must be a filename, BytesIO, '
            f'ExcelWriter, or None. '
            f'Was: {target.__class__.__name__}')

    # Load the SPSS data into a DataFrame, forwarding all read options.
    df, metadata = _read_spss(data,
                              limit=limit,
                              offset=offset,
                              exclude_variables=exclude_variables,
                              include_variables=include_variables,
                              metadata_only=metadata_only,
                              apply_labels=apply_labels,
                              labels_as_categories=labels_as_categories,
                              missing_as_NaN=missing_as_NaN,
                              convert_datetimes=convert_datetimes,
                              dates_as_datetime64=dates_as_datetime64,
                              **kwargs)

    return_target = False
    if not target:
        return_target = True
        target = BytesIO()
    elif checkers.is_bytesIO(target):
        # BUG FIX: previously a caller-supplied BytesIO was discarded and
        # replaced with a fresh buffer; now the Excel data is written into the
        # buffer the caller provided (and still returned for compatibility).
        return_target = True

    df.to_excel(target,
                sheet_name=sheet_name,
                na_rep=null_text,
                header=include_header,
                startrow=start_row,
                startcol=start_column)

    if return_target:
        return target
Exemplo n.º 12
0
def _read_spss(data: Union[bytes, BytesIO, 'os.PathLike[Any]'],
               limit: Optional[int] = None,
               offset: int = 0,
               exclude_variables: Optional[List[str]] = None,
               include_variables: Optional[List[str]] = None,
               metadata_only: bool = False,
               apply_labels: bool = False,
               labels_as_categories: bool = True,
               missing_as_NaN: bool = False,
               convert_datetimes: bool = True,
               dates_as_datetime64: bool = False,
               **kwargs):
    """Internal function that reads an SPSS (.sav or .zsav) file and returns a
    :class:`tuple <python:tuple>` with a Pandas
    :class:`DataFrame <pandas:pandas.DataFrame>` object and a metadata
    :class:`dict <python:dict>`.

    :param data: The SPSS data to load. Accepts either a series of bytes or a filename.
    :type data: Path-like filename, :class:`bytes <python:bytes>` or
      :class:`BytesIO <python:io.bytesIO>`

    :param limit: The number of records to read from the data. If :obj:`None <python:None>`
      will return all records. Defaults to :obj:`None <python:None>`.
    :type limit: :class:`int <python:int>` or :obj:`None <python:None>`

    :param offset: The record at which to start reading the data. Defaults to 0 (first
      record).
    :type offset: :class:`int <python:int>`

    :param exclude_variables: A list of the variables that should be ignored when reading
      data. Defaults to :obj:`None <python:None>`.
    :type exclude_variables: iterable of :class:`str <python:str>` or
      :obj:`None <python:None>`

    :param include_variables: A list of the variables that should be explicitly included
      when reading data. Defaults to :obj:`None <python:None>`.
    :type include_variables: iterable of :class:`str <python:str>` or
      :obj:`None <python:None>`

    :param metadata_only: If ``True``, will return no data records in the resulting
      :class:`DataFrame <pandas:pandas.DataFrame>` but will return a complete metadata
      :class:`dict <python:dict>`. Defaults to ``False``.
    :type metadata_only: :class:`bool <python:bool>`

    :param apply_labels: If ``True``, converts the numerically-coded values in the raw
      data to their human-readable labels. Defaults to ``False``.
    :type apply_labels: :class:`bool <python:bool>`

    :param labels_as_categories: If ``True``, will convert labeled or formatted values to
      Pandas :term:`categories <pandas:category>`. Defaults to ``True``.

      .. caution::

        This parameter will only have an effect if the ``apply_labels`` parameter is
        ``True``.

    :type labels_as_categories: :class:`bool <python:bool>`

    :param missing_as_NaN: If ``True``, will return any missing values as
      :class:`NaN <pandas:NaN>`. Otherwise will return missing values as per the
      configuration of missing value representation stored in the underlying SPSS data.
      Defaults to ``False``, which applies the missing value representation configured in
      the SPSS data itself.
    :type missing_as_NaN: :class:`bool <python:bool>`

    :param convert_datetimes: if ``True``, will convert the native integer representation
      of datetime values in the SPSS data to Pythonic
      :class:`datetime <python:datetime.datetime>`, or
      :class:`date <python:datetime.date>`, etc. representations (or Pandas
      :class:`datetime64 <pandas:datetime64>`, depending on the ``dates_as_datetime64``
      parameter). If ``False``, will leave the original integer representation. Defaults
      to ``True``.
    :type convert_datetimes: :class:`bool <python:bool>`

    :param dates_as_datetime64: If ``True``, will return any date values as Pandas
      :class:`datetime64 <pandas.datetime64>` types. Defaults to ``False``.

      .. caution::

        This parameter is only applied if ``convert_datetimes`` is set to ``True``.

    :type dates_as_datetime64: :class:`bool <python:bool>`

    :returns: A :class:`DataFrame <pandas:DataFrame>` representation of the SPSS data (or
      :obj:`None <python:None>`) and a :class:`Metadata` representation of the dataset's
      metadata / data map.
    :rtype: :class:`pandas.DataFrame <pandas:DataFrame>`/:obj:`None <python:None>` and
      :class:`Metadata`

    """
    # Validate that data is one of the three supported input forms.
    if not any([
            checkers.is_file(data),
            checkers.is_bytesIO(data),
            checkers.is_type(data, bytes)
    ]):
        raise errors.InvalidDataFormatError(
            'data must be a filename, BytesIO, or bytes '
            f'object. Was: {data.__class__.__name__}')

    limit = validators.integer(limit, allow_empty=True, minimum=0)
    offset = validators.integer(offset, minimum=0)

    exclude_variables = validators.iterable(exclude_variables,
                                            allow_empty=True)
    if exclude_variables:
        exclude_variables = [validators.string(x) for x in exclude_variables]

    include_variables = validators.iterable(include_variables,
                                            allow_empty=True)
    if include_variables:
        include_variables = [validators.string(x) for x in include_variables]

    # Single source of truth for the read_sav() options so the file-path and
    # in-memory code paths cannot drift apart.
    read_sav_kwargs = dict(metadataonly=metadata_only,
                           dates_as_pandas_datetime=dates_as_datetime64,
                           apply_value_formats=apply_labels,
                           formats_as_category=labels_as_categories,
                           usecols=include_variables,
                           user_missing=not missing_as_NaN,
                           disable_datetime_conversion=not convert_datetimes,
                           row_limit=limit or 0,
                           row_offset=offset,
                           **kwargs)

    if checkers.is_file(data):
        df, meta = pyreadstat.read_sav(data, **read_sav_kwargs)
    else:
        # BUGFIX: a BytesIO passes the validation gate above, but
        # file.write() requires a bytes-like object — extract the raw bytes.
        if checkers.is_bytesIO(data):
            data = data.getvalue()

        # pyreadstat only reads from a filename, so spool the in-memory data
        # to a temporary file. Close it before reading (Windows cannot reopen
        # an open NamedTemporaryFile).
        with tempfile.NamedTemporaryFile(delete=False) as temp_file:
            temp_file.write(data)
            temp_file_name = temp_file.name

        try:
            df, meta = pyreadstat.read_sav(temp_file_name, **read_sav_kwargs)
        finally:
            # BUGFIX: remove the temp file even when read_sav() raises,
            # instead of leaking it.
            os.remove(temp_file_name)

    metadata = Metadata.from_pyreadstat(meta)

    # Drop excluded variables from both the data and the metadata map so the
    # two stay consistent for callers.
    if exclude_variables:
        df = df.drop(exclude_variables, axis=1)
        if metadata.column_metadata:
            for variable in exclude_variables:
                metadata.column_metadata.pop(variable, None)

    return df, metadata