def test_exclude_variables(input_files, filename, as_file, exclude_variables, error):
    input_data = check_input_file(input_files, filename)
    if not as_file:
        with open(input_data, 'rb') as file_:
            data = file_.read()
    else:
        data = input_data

    if not error:
        result = read._read_spss(data, exclude_variables = exclude_variables)
        assert result is not None
        assert isinstance(result, tuple) is True
        assert len(result) == 2
        assert result[0] is not None
        assert result[1] is not None
        assert checkers.is_type(result[0], 'DataFrame') is True
        assert checkers.is_type(result[1], 'Metadata') is True
        assert len(result[0]) == result[1].rows

        metadata = result[1]
        for variable in exclude_variables:
            assert variable not in metadata.column_metadata
            assert variable not in result[0]
    else:
        with pytest.raises(error):
            result = read._read_spss(data)

def test_limit_offset(input_files, filename, as_file, limit, offset, error):
    input_data = check_input_file(input_files, filename)
    if not as_file:
        with open(input_data, 'rb') as file_:
            data = file_.read()
    else:
        data = input_data

    if not error:
        result = read._read_spss(data, limit = limit, offset = offset)
        assert result is not None
        assert isinstance(result, tuple) is True
        assert len(result) == 2
        assert result[0] is not None
        assert result[1] is not None
        assert checkers.is_type(result[0], 'DataFrame') is True
        assert checkers.is_type(result[1], 'Metadata') is True
        if limit is None:
            assert len(result[0]) == result[1].rows - offset
        else:
            assert len(result[0]) == limit
    else:
        with pytest.raises(error):
            result = read._read_spss(data)

def _handle_failure(on_failure = None, error = None):
    """Handle the failure of a function called by :ref:`backoff`.

    :param on_failure: The :class:`Exception <python:Exception>` or function to call
      when all retry attempts have failed. If :class:`None <python:None>`, will raise
      the last-caught :class:`Exception <python:Exception>`. If an
      :class:`Exception <python:Exception>`, will raise the exception with the same
      message as the last-caught exception. If a function, will call the function and
      pass the last-raised exception, its message, and stacktrace to the function.
      Defaults to :class:`None <python:None>`.
    :type on_failure: :class:`Exception <python:Exception>` / function /
      :class:`None <python:None>`

    :param error: The :class:`Exception <python:Exception>` that was raised. Defaults
      to :class:`Exception <python:Exception>`.
    :type error: :class:`Exception <python:Exception>`

    """
    if error is None:
        error = Exception

    is_on_failure_an_exception = False
    if is_py2:
        if isinstance(on_failure, Exception):
            is_on_failure_an_exception = True
        elif checkers.is_type(on_failure, 'type'):
            is_on_failure_an_exception = isinstance(on_failure(), Exception)
        else:
            is_on_failure_an_exception = False
    else:
        is_on_failure_an_exception = checkers.is_type(on_failure,
                                                      ('type', 'Exception')) and \
                                     hasattr(on_failure, '__cause__')

    if on_failure is None:
        raise error
    elif is_on_failure_an_exception:
        raise on_failure(error.args[0])
    else:
        try:
            on_failure(error, error.args[0], sys.exc_info()[2])
        except Exception as nested_error:
            raise nested_error

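# A minimal sketch of the three ``on_failure`` modes handled above; the handler
# below is hypothetical. None re-raises the cached error; an exception class is
# raised with the same message; a callable receives (error, message, traceback).
def _example_handle_failure():
    try:
        raise ValueError('all retries failed')
    except ValueError as caught:
        def alert_operator(error, message, traceback):
            print('giving up:', message)

        # Callable handler: logs instead of raising.
        _handle_failure(on_failure = alert_operator, error = caught)

        # Exception-class handler: would raise RuntimeError('all retries failed').
        # _handle_failure(on_failure = RuntimeError, error = caught)
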
def apply_metadata(df: DataFrame,
                   metadata: Union[Metadata, dict, pyreadstat.metadata_container],
                   as_category: bool = True):
    """Updates the :class:`DataFrame <pandas:DataFrame>` ``df`` based on the
    ``metadata``.

    :param df: The :class:`DataFrame <pandas:pandas.DataFrame>` to update.
    :type df: :class:`pandas.DataFrame <pandas:pandas.DataFrame>`

    :param metadata: The :class:`Metadata` to apply to ``df``.
    :type metadata: :class:`Metadata`, :class:`pyreadstat.metadata_container`, or
      compatible :class:`dict <python:dict>`

    :param as_category: If ``True``, variables with formats will be transformed into
      categories in the :class:`DataFrame <pandas:pandas.DataFrame>`. Defaults to
      ``True``.
    :type as_category: :class:`bool <python:bool>`

    :returns: A copy of ``df`` updated to reflect ``metadata``.
    :rtype: :class:`DataFrame <pandas:pandas.DataFrame>`

    """
    if not checkers.is_type(df, 'DataFrame'):
        raise ValueError(
            f'df must be a pandas.DataFrame. Was: {df.__class__.__name__}')
    if not checkers.is_type(metadata, ('Metadata', 'metadata_container', 'dict')):
        raise ValueError(
            f'metadata must be a Metadata instance or compatible object. '
            f'Was: {metadata.__class__.__name__}')
    elif checkers.is_type(metadata, 'metadata_container'):
        metadata = Metadata.from_pyreadstat(metadata)
    elif checkers.is_type(metadata, 'dict'):
        metadata = Metadata.from_dict(metadata)

    as_pyreadstat = metadata.to_pyreadstat()

    return pyreadstat.set_value_labels(df,
                                       metadata = as_pyreadstat.value_labels,
                                       formats_as_category = as_category)

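# A hedged usage sketch for ``apply_metadata``; the filename is hypothetical.
# ``metadata`` may equally be a pyreadstat metadata_container or a compatible
# dict, both of which are normalized to Metadata internally.
def _example_apply_metadata():
    df, meta = _read_spss('survey-data.sav')    # raw, numerically-coded values
    labeled = apply_metadata(df, meta)          # labels applied; formatted
                                                # columns become categories
    plain = apply_metadata(df, meta, as_category = False)
    return labeled, plain
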
def test_default_params(input_files, filename, as_file, error):
    input_data = check_input_file(input_files, filename)
    if not as_file:
        with open(input_data, 'rb') as file_:
            data = file_.read()
    else:
        data = input_data

    if not error:
        result = read._read_spss(data)
        assert result is not None
        assert isinstance(result, tuple) is True
        assert len(result) == 2
        assert result[0] is not None
        assert result[1] is not None
        assert checkers.is_type(result[0], 'DataFrame') is True
        assert checkers.is_type(result[1], 'Metadata') is True
        assert len(result[0]) == result[1].rows
    else:
        with pytest.raises(error):
            result = read._read_spss(data)

@column_metadata.setter
def column_metadata(self, value):
    value = validators.dict(value, allow_empty = True)
    if not value:
        self._column_metadata = None
    else:
        result = {}
        for key in value:
            key = validators.variable_name(key, allow_empty = False)
            if checkers.is_type(value[key], 'ColumnMetadata'):
                result[key] = value[key]
            else:
                result[key] = ColumnMetadata.from_dict(value[key])

        self._column_metadata = result

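# A brief sketch of what the ``column_metadata`` setter accepts; the variable
# and field names are hypothetical. Plain dict values are normalized through
# ColumnMetadata.from_dict, while ColumnMetadata instances pass through as-is.
def _example_column_metadata_assignment(metadata, column_meta):
    metadata.column_metadata = {
        'age': {'label': 'Respondent age'},   # dict: converted on assignment
        'region': column_meta                 # ColumnMetadata: stored directly
    }
    metadata.column_metadata = None           # falsy input clears the map
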
def test_get_metadata(input_files, filename, as_file, error):
    input_data = check_input_file(input_files, filename)
    if not as_file:
        with open(input_data, 'rb') as file_:
            data = file_.read()
    else:
        data = input_data

    if not error:
        result = read.get_metadata(data)
        assert result is not None
        assert checkers.is_type(result, 'Metadata') is True
        assert result.column_metadata is not None
    else:
        with pytest.raises(error):
            result = read.get_metadata(data)

def get_attribute_names(obj,
                        include_callable = False,
                        include_nested = True,
                        include_private = False,
                        include_special = False,
                        include_utilities = False):
    """Return a list of attribute names within ``obj``.

    :param include_callable: If ``True``, will include callable attributes (methods).
      Defaults to ``False``.
    :type include_callable: :class:`bool <python:bool>`

    :param include_nested: If ``True``, will include attributes that are
      arbitrarily-nestable types (such as a :term:`model class` or
      :class:`dict <python:dict>`). Defaults to ``True``.
    :type include_nested: :class:`bool <python:bool>`

    :param include_private: If ``True``, will include attributes whose names begin
      with ``_`` (but *not* ``__``). Defaults to ``False``.
    :type include_private: :class:`bool <python:bool>`

    :param include_special: If ``True``, will include attributes whose names begin
      with ``__``. Defaults to ``False``.
    :type include_special: :class:`bool <python:bool>`

    :param include_utilities: If ``True``, will include utility properties added by
      SQLAlchemy or **SQLAthanor**. Defaults to ``False``.
    :type include_utilities: :class:`bool <python:bool>`

    :returns: :term:`Model Attribute` names attached to ``obj``.
    :rtype: :class:`list <python:list>` of :class:`str <python:str>`

    """
    attribute_names = [x for x in dir(obj)
                       if (include_utilities and x in UTILITY_COLUMNS) or \
                          (x not in UTILITY_COLUMNS)]

    attributes = []
    for attribute in attribute_names:
        if (attribute[0] == '_' and attribute[0:2] != '__') and not include_private:
            continue

        if attribute[0:2] == '__' and not include_special:
            continue

        try:
            attribute_value = getattr(obj, attribute)
        except SA_InvalidRequestError:
            if not include_nested:
                continue

            attributes.append(attribute)
            continue

        if not include_nested:
            if checkers.is_type(attribute_value, ('BaseModel',
                                                  'RelationshipProperty',
                                                  'AssociationProxy',
                                                  dict)):
                continue

        try:
            is_iterable = checkers.is_iterable(attribute_value,
                                               forbid_literals = (str, bytes, dict))
        except SA_InvalidRequestError:
            if not include_nested:
                continue
            else:
                is_iterable = False

        if is_iterable:
            loop = False
            try:
                for item in attribute_value:
                    if checkers.is_type(item, ('BaseModel',
                                               'RelationshipProperty',
                                               'AssociationProxy',
                                               dict)):
                        loop = True
                        break
            except (NotImplementedError, TypeError):
                pass

            if loop:
                continue

        if not include_callable and checkers.is_callable(attribute_value):
            continue

        attributes.append(attribute)

    return attributes

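# A hedged illustration of the inclusion flags; the class below is hypothetical
# and assumes none of its attribute names appear in UTILITY_COLUMNS.
def _example_get_attribute_names():
    class Dummy(object):
        _internal = 1
        name = 'widget'

        def refresh(self):
            pass

    get_attribute_names(Dummy)                            # ['name']
    get_attribute_names(Dummy, include_callable = True)   # ['name', 'refresh']
    get_attribute_names(Dummy, include_private = True)    # ['_internal', 'name']
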
def from_dataframe(df: DataFrame,
                   target: Optional[Union['PathLike[Any]', BytesIO]] = None,
                   metadata: Optional[Metadata] = None,
                   compress: bool = False):
    """Create an SPSS dataset from a `Pandas <https://pandas.pydata.org/>`_
    :class:`DataFrame <pandas:DataFrame>`.

    :param df: The :class:`DataFrame` to serialize to an SPSS dataset.
    :type df: :class:`pandas.DataFrame <pandas:DataFrame>`

    :param target: The target to which the SPSS dataset should be written. Accepts
      either a filename/path, a :class:`BytesIO <python:io.BytesIO>` object, or
      :obj:`None <python:None>`. If :obj:`None <python:None>`, will return a
      :class:`BytesIO <python:io.BytesIO>` object containing the SPSS dataset.
      Defaults to :obj:`None <python:None>`.
    :type target: Path-like / :class:`BytesIO <python:io.BytesIO>` /
      :obj:`None <python:None>`

    :param metadata: The :class:`Metadata` associated with the dataset. If
      :obj:`None <python:None>`, will attempt to derive it from ``df``. Defaults to
      :obj:`None <python:None>`.
    :type metadata: :class:`Metadata` / :obj:`None <python:None>`

    :param compress: If ``True``, will return data in the compressed ZSAV format. If
      ``False``, will return data in the standard SAV format. Defaults to ``False``.
    :type compress: :class:`bool <python:bool>`

    :returns: A :class:`BytesIO <python:io.BytesIO>` object containing the SPSS data
      if ``target`` is :obj:`None <python:None>` or not a filename, otherwise
      :obj:`None <python:None>`.
    :rtype: :class:`BytesIO <python:io.BytesIO>` or :obj:`None <python:None>`

    :raises ValueError: if ``df`` is not a
      :class:`pandas.DataFrame <pandas:DataFrame>`
    :raises ValueError: if ``metadata`` is not a :class:`Metadata`

    """
    if not checkers.is_type(df, 'DataFrame'):
        raise ValueError(
            f'df must be a pandas.DataFrame. Was: {df.__class__.__name__}')
    if metadata and not checkers.is_type(metadata,
                                         ('Metadata', 'metadata_container', 'dict')):
        raise ValueError(
            f'metadata must be a Metadata instance or compatible object. '
            f'Was: {metadata.__class__.__name__}')
    elif metadata and checkers.is_type(metadata, 'metadata_container'):
        metadata = Metadata.from_pyreadstat(metadata)
    elif metadata and checkers.is_type(metadata, 'dict'):
        metadata = Metadata.from_dict(metadata)

    is_file = False
    if target and checkers.is_pathlike(target):
        is_file = True
    elif target:
        target = validators.bytesIO(target, allow_empty = False)

    if metadata:
        as_pyreadstat = metadata.to_pyreadstat()
    else:
        as_pyreadstat = None

    if target and is_file:
        # pyreadstat.write_sav expects a path for dst_path, so the target path is
        # passed through directly rather than an open file object.
        if as_pyreadstat:
            pyreadstat.write_sav(df = df,
                                 dst_path = target,
                                 file_label = as_pyreadstat.file_label,
                                 column_labels = as_pyreadstat.column_labels,
                                 compress = compress,
                                 note = as_pyreadstat.notes,
                                 variable_value_labels = as_pyreadstat.variable_value_labels,
                                 missing_ranges = as_pyreadstat.missing_ranges,
                                 variable_display_width = as_pyreadstat.variable_display_width,
                                 variable_measure = as_pyreadstat.variable_measure)
        else:
            pyreadstat.write_sav(df = df,
                                 dst_path = target,
                                 compress = compress)
    else:
        with tempfile.NamedTemporaryFile() as temp_file:
            if as_pyreadstat:
                pyreadstat.write_sav(df = df,
                                     dst_path = temp_file.name,
                                     file_label = as_pyreadstat.file_label,
                                     column_labels = as_pyreadstat.column_labels,
                                     compress = compress,
                                     note = as_pyreadstat.notes,
                                     variable_value_labels = as_pyreadstat.variable_value_labels,
                                     missing_ranges = as_pyreadstat.missing_ranges,
                                     variable_display_width = as_pyreadstat.variable_display_width,
                                     variable_measure = as_pyreadstat.variable_measure)
            else:
                pyreadstat.write_sav(df = df,
                                     dst_path = temp_file.name,
                                     compress = compress)

            # Rewind before reading back the serialized payload.
            temp_file.seek(0)
            if target:
                target.write(temp_file.read())
            else:
                target = BytesIO(temp_file.read())

    return target

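# A hedged usage sketch for ``from_dataframe``; the filenames are hypothetical.
def _example_from_dataframe(df, meta):
    # Given a path, the dataset is written straight to disk (compressed ZSAV here):
    from_dataframe(df, target = 'output.zsav', metadata = meta, compress = True)

    # With no target, an in-memory BytesIO containing the SAV payload is returned:
    payload = from_dataframe(df, metadata = meta)
    return payload.getvalue()
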
def backoff(to_execute,
            args = None,
            kwargs = None,
            strategy = None,
            retry_execute = None,
            retry_args = None,
            retry_kwargs = None,
            max_tries = None,
            max_delay = None,
            catch_exceptions = None,
            on_failure = None,
            on_success = None):
    """Retry a function call multiple times with a delay per the strategy given.

    :param to_execute: The function call that is to be attempted.
    :type to_execute: callable

    :param args: The positional arguments to pass to the function on the first
      attempt. If ``retry_args`` is :class:`None <python:None>`, will re-use these
      arguments on retry attempts as well.
    :type args: iterable / :class:`None <python:None>`

    :param kwargs: The keyword arguments to pass to the function on the first
      attempt. If ``retry_kwargs`` is :class:`None <python:None>`, will re-use these
      keyword arguments on retry attempts as well.
    :type kwargs: :class:`dict <python:dict>` / :class:`None <python:None>`

    :param strategy: The :class:`BackoffStrategy` to use when determining the delay
      between retry attempts. If :class:`None <python:None>`, defaults to
      :class:`Exponential`.
    :type strategy: :class:`BackoffStrategy`

    :param retry_execute: The function to call on retry attempts. If
      :class:`None <python:None>`, will retry ``to_execute``. Defaults to
      :class:`None <python:None>`.
    :type retry_execute: callable / :class:`None <python:None>`

    :param retry_args: The positional arguments to pass to the function on retry
      attempts. If :class:`None <python:None>`, will re-use ``args``. Defaults to
      :class:`None <python:None>`.
    :type retry_args: iterable / :class:`None <python:None>`

    :param retry_kwargs: The keyword arguments to pass to the function on retry
      attempts. If :class:`None <python:None>`, will re-use ``kwargs``. Defaults to
      :class:`None <python:None>`.
    :type retry_kwargs: :class:`dict <python:dict>` / :class:`None <python:None>`

    :param max_tries: The maximum number of times to attempt the call. If
      :class:`None <python:None>`, will apply an environment variable
      ``BACKOFF_DEFAULT_TRIES``. If that environment variable is not set, will
      apply a default of ``3``.
    :type max_tries: int / :class:`None <python:None>`

    :param max_delay: The maximum number of seconds to wait before giving up once
      and for all. If :class:`None <python:None>`, will apply an environment
      variable ``BACKOFF_DEFAULT_DELAY`` if that environment variable is set. If it
      is not set, will not apply a max delay at all.
    :type max_delay: :class:`None <python:None>` / int

    :param catch_exceptions: The ``type(exception)`` to catch and retry. If
      :class:`None <python:None>`, will catch all exceptions. Defaults to
      :class:`None <python:None>`.

      .. caution::

        The iterable must contain one or more types of exception *instances*, and
        not class objects. For example:

        .. code-block:: python

          # GOOD:
          catch_exceptions = (type(ValueError()), type(TypeError()))

          # BAD:
          catch_exceptions = (type(ValueError), type(ValueError))

          # BAD:
          catch_exceptions = (ValueError, TypeError)

          # BAD:
          catch_exceptions = (ValueError(), TypeError())

    :type catch_exceptions: iterable of form ``[type(exception()), ...]``

    :param on_failure: The :class:`exception <python:Exception>` or function to call
      when all retry attempts have failed. If :class:`None <python:None>`, will
      raise the last-caught :class:`exception <python:Exception>`. If an
      :class:`exception <python:Exception>`, will raise the exception with the same
      message as the last-caught exception. If a function, will call the function
      and pass the last-raised exception, its message, and stacktrace to the
      function.
      Defaults to :class:`None <python:None>`.
    :type on_failure: :class:`Exception <python:Exception>` / function /
      :class:`None <python:None>`

    :param on_success: The function to call when the operation was successful. The
      function receives the result of the ``to_execute`` or ``retry_execute``
      function that was successful, and is called before that result is returned to
      whatever code called the backoff function. If :class:`None <python:None>`,
      will just return the result of ``to_execute`` or ``retry_execute`` without
      calling a handler. Defaults to :class:`None <python:None>`.
    :type on_success: callable / :class:`None <python:None>`

    :returns: The result of the attempted function.

    Example:

    .. code-block:: python

      from backoff_utils import backoff

      def some_function(arg1, arg2, kwarg1 = None):
          # Function does something
          pass

      result = backoff(some_function,
                       args = ['value1', 'value2'],
                       kwargs = { 'kwarg1': 'value3' },
                       max_tries = 3,
                       max_delay = 30,
                       strategy = strategies.Exponential)

    """
    # pylint: disable=too-many-branches,too-many-statements
    if to_execute is None:
        raise ValueError('to_execute cannot be None')
    elif not checkers.is_callable(to_execute):
        raise TypeError('to_execute must be callable')

    if strategy is None:
        strategy = strategies.Exponential

    if not hasattr(strategy, 'IS_INSTANTIATED'):
        raise TypeError('strategy must be a BackoffStrategy or descendent')
    if not strategy.IS_INSTANTIATED:
        test_strategy = strategy(attempt = 0)
    else:
        test_strategy = strategy

    if not checkers.is_type(test_strategy, 'BackoffStrategy'):
        raise TypeError('strategy must be a BackoffStrategy or descendent')

    if args:
        args = validators.iterable(args)
    if kwargs:
        kwargs = validators.dict(kwargs)

    if retry_execute is None:
        retry_execute = to_execute
    elif not checkers.is_callable(retry_execute):
        raise TypeError('retry_execute must be None or a callable')

    if not retry_args:
        retry_args = args
    else:
        retry_args = validators.iterable(retry_args)

    if not retry_kwargs:
        retry_kwargs = kwargs
    else:
        retry_kwargs = validators.dict(retry_kwargs)

    if max_tries is None:
        max_tries = DEFAULT_MAX_TRIES

    max_tries = validators.integer(max_tries)

    if max_delay is None:
        max_delay = DEFAULT_MAX_DELAY

    if catch_exceptions is None:
        catch_exceptions = [type(Exception())]
    else:
        if not checkers.is_iterable(catch_exceptions):
            catch_exceptions = [catch_exceptions]

        catch_exceptions = validators.iterable(catch_exceptions)

    if on_failure is not None and not checkers.is_callable(on_failure):
        raise TypeError('on_failure must be None or a callable')

    if on_success is not None and not checkers.is_callable(on_success):
        raise TypeError('on_success must be None or a callable')

    cached_error = None
    return_value = None
    returned = False
    failover_counter = 0

    start_time = datetime.utcnow()

    while failover_counter <= max_tries:
        elapsed_time = (datetime.utcnow() - start_time).total_seconds()
        if max_delay is not None and elapsed_time >= max_delay:
            if cached_error is None:
                raise BackoffTimeoutError('backoff timed out after:'
                                          ' {}s'.format(elapsed_time))
            else:
                _handle_failure(on_failure, cached_error)
        if failover_counter == 0:
            try:
                if args is not None and kwargs is not None:
                    return_value = to_execute(*args, **kwargs)
                elif args is not None:
                    return_value = to_execute(*args)
                elif kwargs is not None:
                    return_value = to_execute(**kwargs)
                else:
                    return_value = to_execute()

                returned = True
                break
            except Exception as error:                                          # pylint: disable=broad-except
                if type(error) in catch_exceptions:
                    cached_error = error
                    strategy.delay(failover_counter)
                    failover_counter += 1
                    continue
                else:
                    _handle_failure(on_failure = on_failure,
                                    error = error)
                    return
        else:
            try:
                if retry_args is not None and retry_kwargs is not None:
                    return_value = retry_execute(*retry_args, **retry_kwargs)
                elif retry_args is not None:
                    return_value = retry_execute(*retry_args)
                elif retry_kwargs is not None:
                    return_value = retry_execute(**retry_kwargs)
                else:
                    return_value = retry_execute()

                returned = True
                break
            except Exception as error:                                          # pylint: disable=broad-except
                if type(error) in catch_exceptions:
                    strategy.delay(failover_counter)
                    cached_error = error
                    failover_counter += 1
                    continue
                else:
                    _handle_failure(on_failure = on_failure,
                                    error = error)
                    return

    if not returned:
        _handle_failure(on_failure = on_failure,
                        error = cached_error)
        return
    elif returned and on_success is not None:
        on_success(return_value)

    return return_value

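# A hedged sketch of the failure-handling knobs on ``backoff``; the functions
# below are hypothetical. Note that ``catch_exceptions`` takes exception types
# built from instances, per the caution in the docstring above.
def _example_backoff_with_handlers():
    def fetch():
        raise ConnectionError('primary endpoint down')

    def fetch_fallback():
        return 'ok'

    def report(error, message, traceback):
        print('all attempts failed:', message)

    return backoff(fetch,
                   retry_execute = fetch_fallback,    # used on attempts after the first
                   catch_exceptions = [type(ConnectionError())],
                   max_tries = 2,
                   on_failure = report,
                   on_success = lambda result: print('got:', result))
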
def to_excel(data: Union['os.PathLike[Any]', BytesIO, bytes],
             target: Optional[Union['os.PathLike[Any]', BytesIO, ExcelWriter]] = None,
             sheet_name: str = 'Sheet1',
             start_row: int = 0,
             start_column: int = 0,
             null_text: str = 'NaN',
             include_header: bool = True,
             limit: Optional[int] = None,
             offset: int = 0,
             exclude_variables: Optional[List[str]] = None,
             include_variables: Optional[List[str]] = None,
             metadata_only: bool = False,
             apply_labels: bool = False,
             labels_as_categories: bool = True,
             missing_as_NaN: bool = False,
             convert_datetimes: bool = True,
             dates_as_datetime64: bool = False,
             **kwargs):
    r"""Convert the SPSS ``data`` into an Excel file where each row represents a
    record of SPSS data.

    :param data: The SPSS data to load. Accepts either a series of bytes or a
      filename.
    :type data: Path-like filename, :class:`bytes <python:bytes>` or
      :class:`BytesIO <python:io.bytesIO>`

    :param target: The destination where the Excel file should be stored. Accepts
      either a filename, file-pointer or a :class:`BytesIO <python:io.BytesIO>`, or
      an :class:`ExcelWriter <pandas:pandas.ExcelWriter>` instance.
    :type target: Path-like / :class:`BytesIO <python:io.BytesIO>` /
      :class:`ExcelWriter <pandas:pandas.ExcelWriter>`

    :param sheet_name: The worksheet on which the SPSS data should be written.
      Defaults to ``'Sheet1'``.
    :type sheet_name: :class:`str <python:str>`

    :param start_row: The row number (starting at 0) where the SPSS data should
      begin. Defaults to ``0``.
    :type start_row: :class:`int <python:int>`

    :param start_column: The column number (starting at 0) where the SPSS data
      should begin. Defaults to ``0``.
    :type start_column: :class:`int <python:int>`

    :param null_text: The way that missing values should be represented in the
      Excel file. Defaults to ``'NaN'``.
    :type null_text: :class:`str <python:str>`

    :param include_header: If ``True``, will include a header row with column
      labels. If ``False``, will not include a header row. Defaults to ``True``.
    :type include_header: :class:`bool <python:bool>`

    :param limit: The number of records to read from the data. If
      :obj:`None <python:None>` will return all records. Defaults to
      :obj:`None <python:None>`.
    :type limit: :class:`int <python:int>` or :obj:`None <python:None>`

    :param offset: The record at which to start reading the data. Defaults to ``0``
      (first record).
    :type offset: :class:`int <python:int>`

    :param exclude_variables: A list of the variables that should be ignored when
      reading data. Defaults to :obj:`None <python:None>`.
    :type exclude_variables: iterable of :class:`str <python:str>` or
      :obj:`None <python:None>`

    :param include_variables: A list of the variables that should be explicitly
      included when reading data. Defaults to :obj:`None <python:None>`.
    :type include_variables: iterable of :class:`str <python:str>` or
      :obj:`None <python:None>`

    :param metadata_only: If ``True``, will return no data records in the resulting
      :class:`DataFrame <pandas:pandas.DataFrame>` but will return a complete
      :class:`Metadata` instance. Defaults to ``False``.
    :type metadata_only: :class:`bool <python:bool>`

    :param apply_labels: If ``True``, converts the numerically-coded values in the
      raw data to their human-readable labels. Defaults to ``False``.
    :type apply_labels: :class:`bool <python:bool>`

    :param labels_as_categories: If ``True``, will convert labeled or formatted
      values to Pandas :term:`categories <pandas:category>`. Defaults to ``True``.

      .. caution::

        This parameter will only have an effect if the ``apply_labels`` parameter
        is ``True``.
    :type labels_as_categories: :class:`bool <python:bool>`

    :param missing_as_NaN: If ``True``, will return any missing values as
      :class:`NaN <pandas:NaN>`. Otherwise will return missing values as per the
      configuration of missing value representation stored in the underlying SPSS
      data. Defaults to ``False``, which applies the missing value representation
      configured in the SPSS data itself.
    :type missing_as_NaN: :class:`bool <python:bool>`

    :param convert_datetimes: If ``True``, will convert the native integer
      representation of datetime values in the SPSS data to Pythonic
      :class:`datetime <python:datetime.datetime>`, or
      :class:`date <python:datetime.date>`, etc. representations (or Pandas
      :class:`datetime64 <pandas:datetime64>`, depending on the
      ``dates_as_datetime64`` parameter). If ``False``, will leave the original
      integer representation. Defaults to ``True``.
    :type convert_datetimes: :class:`bool <python:bool>`

    :param dates_as_datetime64: If ``True``, will return any date values as Pandas
      :class:`datetime64 <pandas.datetime64>` types. Defaults to ``False``.

      .. caution::

        This parameter is only applied if ``convert_datetimes`` is set to ``True``.

    :type dates_as_datetime64: :class:`bool <python:bool>`

    :returns: :obj:`None <python:None>` if ``target`` was not
      :obj:`None <python:None>`, otherwise a :class:`BytesIO <python:io.BytesIO>`
      representation of the Excel file.
    :rtype: :obj:`None <python:None>` or :class:`BytesIO <python:io.BytesIO>`

    """
    if target and \
       not checkers.is_pathlike(target) and \
       not checkers.is_bytesIO(target) and \
       not checkers.is_type(target, 'ExcelWriter'):
        raise errors.InvalidDataFormatError(
            'target must be a filename, BytesIO, ExcelWriter, or None. '
            f'Was: {target.__class__.__name__}')

    df, metadata = _read_spss(data,
                              limit = limit,
                              offset = offset,
                              exclude_variables = exclude_variables,
                              include_variables = include_variables,
                              metadata_only = metadata_only,
                              apply_labels = apply_labels,
                              labels_as_categories = labels_as_categories,
                              missing_as_NaN = missing_as_NaN,
                              convert_datetimes = convert_datetimes,
                              dates_as_datetime64 = dates_as_datetime64,
                              **kwargs)

    return_target = False
    if not target:
        return_target = True
        target = BytesIO()
    elif checkers.is_bytesIO(target):
        return_target = True

    df.to_excel(target,
                sheet_name = sheet_name,
                na_rep = null_text,
                header = include_header,
                startrow = start_row,
                startcol = start_column)

    if return_target:
        return target

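# A hedged conversion sketch for ``to_excel``; the filenames and variable names
# are hypothetical.
def _example_to_excel():
    # Write selected, labeled variables from an SPSS file to a worksheet on disk:
    to_excel('survey-data.sav',
             target = 'survey-data.xlsx',
             sheet_name = 'Responses',
             include_variables = ['respondent_id', 'age', 'region'],
             apply_labels = True)

    # With no target, the Excel payload comes back as a BytesIO:
    workbook = to_excel('survey-data.sav', null_text = '')
    return workbook
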
def _read_spss(data: Union[bytes, BytesIO, 'os.PathLike[Any]'],
               limit: Optional[int] = None,
               offset: int = 0,
               exclude_variables: Optional[List[str]] = None,
               include_variables: Optional[List[str]] = None,
               metadata_only: bool = False,
               apply_labels: bool = False,
               labels_as_categories: bool = True,
               missing_as_NaN: bool = False,
               convert_datetimes: bool = True,
               dates_as_datetime64: bool = False,
               **kwargs):
    """Internal function that reads an SPSS (.sav or .zsav) file and returns a
    :class:`tuple <python:tuple>` with a Pandas
    :class:`DataFrame <pandas:pandas.DataFrame>` object and a :class:`Metadata`
    instance.

    :param data: The SPSS data to load. Accepts either a series of bytes or a
      filename.
    :type data: Path-like filename, :class:`bytes <python:bytes>` or
      :class:`BytesIO <python:io.bytesIO>`

    :param limit: The number of records to read from the data. If
      :obj:`None <python:None>` will return all records. Defaults to
      :obj:`None <python:None>`.
    :type limit: :class:`int <python:int>` or :obj:`None <python:None>`

    :param offset: The record at which to start reading the data. Defaults to ``0``
      (first record).
    :type offset: :class:`int <python:int>`

    :param exclude_variables: A list of the variables that should be ignored when
      reading data. Defaults to :obj:`None <python:None>`.
    :type exclude_variables: iterable of :class:`str <python:str>` or
      :obj:`None <python:None>`

    :param include_variables: A list of the variables that should be explicitly
      included when reading data. Defaults to :obj:`None <python:None>`.
    :type include_variables: iterable of :class:`str <python:str>` or
      :obj:`None <python:None>`

    :param metadata_only: If ``True``, will return no data records in the resulting
      :class:`DataFrame <pandas:pandas.DataFrame>` but will return a complete
      :class:`Metadata` instance. Defaults to ``False``.
    :type metadata_only: :class:`bool <python:bool>`

    :param apply_labels: If ``True``, converts the numerically-coded values in the
      raw data to their human-readable labels. Defaults to ``False``.
    :type apply_labels: :class:`bool <python:bool>`

    :param labels_as_categories: If ``True``, will convert labeled or formatted
      values to Pandas :term:`categories <pandas:category>`. Defaults to ``True``.

      .. caution::

        This parameter will only have an effect if the ``apply_labels`` parameter
        is ``True``.

    :type labels_as_categories: :class:`bool <python:bool>`

    :param missing_as_NaN: If ``True``, will return any missing values as
      :class:`NaN <pandas:NaN>`. Otherwise will return missing values as per the
      configuration of missing value representation stored in the underlying SPSS
      data. Defaults to ``False``, which applies the missing value representation
      configured in the SPSS data itself.
    :type missing_as_NaN: :class:`bool <python:bool>`

    :param convert_datetimes: If ``True``, will convert the native integer
      representation of datetime values in the SPSS data to Pythonic
      :class:`datetime <python:datetime.datetime>`, or
      :class:`date <python:datetime.date>`, etc. representations (or Pandas
      :class:`datetime64 <pandas:datetime64>`, depending on the
      ``dates_as_datetime64`` parameter). If ``False``, will leave the original
      integer representation. Defaults to ``True``.
    :type convert_datetimes: :class:`bool <python:bool>`

    :param dates_as_datetime64: If ``True``, will return any date values as Pandas
      :class:`datetime64 <pandas.datetime64>` types. Defaults to ``False``.

      .. caution::

        This parameter is only applied if ``convert_datetimes`` is set to ``True``.
    :type dates_as_datetime64: :class:`bool <python:bool>`

    :returns: A :class:`DataFrame <pandas:DataFrame>` representation of the SPSS
      data (or :obj:`None <python:None>`) and a :class:`Metadata` representation of
      the dataset's metadata / data map.
    :rtype: :class:`pandas.DataFrame <pandas:DataFrame>` /
      :obj:`None <python:None>` and :class:`Metadata`

    """
    if not any([checkers.is_file(data),
                checkers.is_bytesIO(data),
                checkers.is_type(data, bytes)]):
        raise errors.InvalidDataFormatError(
            'data must be a filename, BytesIO, or bytes object. '
            f'Was: {data.__class__.__name__}')

    limit = validators.integer(limit, allow_empty = True, minimum = 0)
    offset = validators.integer(offset, minimum = 0)

    exclude_variables = validators.iterable(exclude_variables, allow_empty = True)
    if exclude_variables:
        exclude_variables = [validators.string(x) for x in exclude_variables]

    include_variables = validators.iterable(include_variables, allow_empty = True)
    if include_variables:
        include_variables = [validators.string(x) for x in include_variables]

    if not checkers.is_file(data):
        # BytesIO payloads must be unwrapped to raw bytes before they can be
        # written to the temporary file.
        if checkers.is_bytesIO(data):
            data = data.getvalue()

        with tempfile.NamedTemporaryFile(delete = False) as temp_file:
            temp_file.write(data)
            temp_file_name = temp_file.name

        df, meta = pyreadstat.read_sav(temp_file_name,
                                       metadataonly = metadata_only,
                                       dates_as_pandas_datetime = dates_as_datetime64,
                                       apply_value_formats = apply_labels,
                                       formats_as_category = labels_as_categories,
                                       usecols = include_variables,
                                       user_missing = not missing_as_NaN,
                                       disable_datetime_conversion = not convert_datetimes,
                                       row_limit = limit or 0,
                                       row_offset = offset,
                                       **kwargs)

        os.remove(temp_file_name)
    else:
        df, meta = pyreadstat.read_sav(data,
                                       metadataonly = metadata_only,
                                       dates_as_pandas_datetime = dates_as_datetime64,
                                       apply_value_formats = apply_labels,
                                       formats_as_category = labels_as_categories,
                                       usecols = include_variables,
                                       user_missing = not missing_as_NaN,
                                       disable_datetime_conversion = not convert_datetimes,
                                       row_limit = limit or 0,
                                       row_offset = offset,
                                       **kwargs)

    metadata = Metadata.from_pyreadstat(meta)

    if exclude_variables:
        df = df.drop(exclude_variables, axis = 1)
        if metadata.column_metadata:
            for variable in exclude_variables:
                metadata.column_metadata.pop(variable, None)

    return df, metadata

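# A hedged usage sketch for ``_read_spss``; the filename and variable names are
# hypothetical.
def _example_read_spss():
    # Read 100 labeled records starting at the 51st, dropping one variable:
    df, meta = _read_spss('survey-data.sav',
                          limit = 100,
                          offset = 50,
                          exclude_variables = ['internal_id'],
                          apply_labels = True)

    # Metadata only: the DataFrame carries no records, but meta describes
    # every column.
    _, meta_only = _read_spss('survey-data.sav', metadata_only = True)
    return df, meta, meta_only
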