Exemplo n.º 1
0
def recode_nan_binary(data):
    """
    Build a binary NaN indicator for the given data.

    :param data:            1-dimensional numpy array
    :return:                1-dimensional numpy array of integers: 1 where the input value is NaN, 0 otherwise

    :raises TypeError:      if the array holds more than one type, or if it holds no NaN at all
    """

    # Validate the input before doing any work
    check_numpy_array_1d(data, 'data')

    homogeneous = is_type_homogeneous(data, verbose=False)
    if not homogeneous:
        found_types = get_contained_types(data, unique=True, as_string=True)
        raise TypeError(
            "Argument for 'data' contains multiple types ({0}). Please use only type homogeneous types."
            .format(found_types))

    has_nan = contains_nan(data)
    if not has_nan:
        raise TypeError(
            "Argument for 'data' does not contain any NaN. All result values would be zero."
        )

    # String data is matched against textual NaN markers, everything else is tested with np.isnan.
    # The array is type homogeneous at this point, so inspecting the first element is sufficient.
    if isinstance(data[0], str):
        nan_flags = match_by_pattern(data, ['nan', 'NaN', 'NAN', 'N/A'])
    else:
        nan_flags = list(map(np.isnan, data))

    # Turn the truth values into the integers 1 and 0
    return np.asarray(list(map(int, nan_flags)))
Exemplo n.º 2
0
def count_nan(data):
    """
    Return how many NaN's the data contains. String elements are supported as well.

    String elements are counted when they match one of the markers 'nan', 'NaN', 'NAN' or 'N/A';
    all other elements are checked with pandas' null detection.

    :param data:                1-dimensional numpy array
    :return:                    Integer. Number of NaN's found.

    :raises TypeError:          if the non-string elements cannot be checked for NaN
    """

    # Validate the input
    check_numpy_array_1d(data, 'data')

    nan_markers = ['nan', 'NaN', 'NAN', 'N/A']
    total = 0

    # Part 1: string elements that spell out a NaN marker
    has_strings = contains_types(data, 'str', exclusively=False, verbose=False)
    if has_strings:
        strings = data[match_by_type(data, 'str')]
        marker_hits = match_by_pattern(strings, nan_markers)
        total += strings[marker_hits].size

    # Part 2: everything that is not a plain str
    only_strings = contains_types(data, 'str', exclusively=True, verbose=False)
    if not only_strings:
        not_a_string = [type(item) is not str for item in data]
        others = data[not_a_string]
        try:
            total += np.sum(pd.isnull(others))
        except TypeError:
            found_types = get_contained_types(
                data, unique=True, as_string=True)
            raise TypeError(
                "Argument for 'data' contains types which cannot be checked for NaN. Found types are: {0}."
                .format(found_types))

    return total
Exemplo n.º 3
0
def find_and_replace(data, pattern, replacement):
    """
    If a value in the data matches at least one pattern exactly, replace it with the specified replacement.

    Note: the replacement is performed in place, i.e. the array passed as 'data' is modified
    and the very same array object is returned.

    :param data:            1-dimensional numpy array. Contains the input values.
    :param pattern:         Single value or list. Must be the same type as the values in 'data'.
    :param replacement:     Single value. Should be the same type as the values in 'data'
                            (a mismatch only prints a warning).

    :return:                1-dimensional numpy array (the modified input array)

    :raises TypeError:      if the type of a pattern does not match the type of the first value in 'data'
    """

    # Transform pattern parameter to list
    if type(pattern) is not list:
        pattern = [pattern]

    # Check if inputs are valid
    check_numpy_array_1d(data, 'data')

    # Only the first element is inspected; the data is assumed to be type homogeneous
    first_value = data[0]

    for element in pattern:
        if not isinstance(first_value, type(element)):
            raise TypeError(
                "Type of pattern {0} ({1}) must match type of values in 'data' ({2})"
                .format(element, type(element), type(first_value)))

    if not isinstance(first_value, type(replacement)):
        print(
            "Warning: Type of replacement ({0}) does not match type of the values in 'data' ({1}). "
            .format(type(replacement), type(first_value)))

    # Replace values. np.isin supersedes the deprecated np.in1d and yields the
    # same boolean mask for 1-dimensional input.
    data[np.isin(data, pattern)] = replacement

    return data
Exemplo n.º 4
0
def count_elements_with_category(data, categories, verbose=False):
    """
    Counts all observations in 'data' which match the given categories. Returns the total count.

    Each category is counted separately and the counts are added up, so a category that is
    listed twice contributes twice to the result.

    :param data:            1-dimensional numpy array
    :param categories:      List or single value. Must match the type of the values in 'data'.
    :param verbose:         True or False. True for verbose output.

    :return:                Integer. Number of found occurrences.

    :raises TypeError:      if 'data' is not type homogeneous or a category does not match the type of the data

    """

    check_numpy_array_1d(data, 'data')

    check_boolean(verbose, 'verbose')

    # Convert category to a list, if it is not already one
    if type(categories) is not list:
        categories = [categories]

    # Check for type homogeneity
    if not is_type_homogeneous(data, verbose=False):
        raise TypeError(
            "Argument for 'data' contains values with different types {0}. Please use only type homogeneous arrays."
            .format(get_contained_types(data, unique=True, as_string=True)))

    # Check if types of data and category-argument match
    for category in categories:
        if not isinstance(category, type(data[0])):
            raise TypeError(
                "Type of 'category' ({0}) does not match type of values in 'data' ({1})."
                .format(type(category), type(data[0]))
            )  # TODO: maybe add automatic conversion in the future

    # Count the matches of each category and add the counts of all categories together.
    # The boolean mask itself is counted; summing the selected values would add up the
    # data values instead of the number of occurrences (and fail for strings).
    sum_found_observations = 0
    for category in categories:
        found_observations = int(np.count_nonzero(data == category))
        if verbose:
            print("Found {0} observations of the category '{1}'.".format(
                found_observations, category))
        sum_found_observations += found_observations

    if verbose:
        print("Found {0} matching observations in total.".format(
            sum_found_observations))

    return sum_found_observations
Exemplo n.º 5
0
def is_within_range(data, lower_bound, upper_bound, verbose=True):
    """
    Tell whether every non-NaN value of the data lies between the two bounds (bounds included).

    :param data:            1-dimensional numpy array
    :param lower_bound:     number in the same type as the values in 'data'
    :param upper_bound:     number in the same type as the values in 'data'
    :param verbose:         True or False. Prints verbose output if set to True.

    :return:                True or False

    :raises TypeError:      if the first value of 'data' is not an instance of the type of a bound

    """

    # Validate the inputs
    check_numpy_array_1d(data, 'data')

    first_value = data[0]

    if not isinstance(first_value, type(lower_bound)):
        raise TypeError(
            "Type of lower bound ({0}) must match type of the values of 'data' ({1})."
            .format(type(lower_bound), type(first_value)))

    if not isinstance(first_value, type(upper_bound)):
        raise TypeError(
            "Type of upper bound ({0}) must match type of the values of 'data' ({1})."
            .format(type(upper_bound), type(first_value)))

    # NaN's are ignored when looking for the extreme values
    valid_values = data[~np.isnan(data)]

    exceeds_upper = bool(valid_values.max() > upper_bound)
    if exceeds_upper and verbose:
        print(
            "Argument for 'data' contains values larger than the upper bound."
        )

    below_lower = bool(valid_values.min() < lower_bound)
    if below_lower and verbose:
        print(
            "Argument for 'data' contains values smaller than the lower bound."
        )

    return not (exceeds_upper or below_lower)
Exemplo n.º 6
0
def fulfills_assumptions(data, verbosity, **assumptions):
    """
    Check the data for a variable number of assumptions. Return true if all assumptions are fulfilled.


    Sample call:

    assumptions = {'contains_types':'int','contains_nan':False, 'type_homogeneous':True, 'variable_type':'metric', 'restrictions':[0, 90]}
    fulfills_assumptions(my_array, verbosity='low', **assumptions)

    or:

    fulfills_assumptions(my_array, verbosity='high', **{'contains_nan':True, 'contains_types':['int', 'str']})


    :param data:            1-dimensional numpy array
    :param verbosity:       'none', 'low' or 'high'. Sets the verbosity level.
                            'low' and 'high' print a summary of all test results;
                            'high' additionally enables the verbose output of the individual checks.
    :param assumptions:     dictionary. Must contain at least one of the following keys:

                            contains_types:     String or list of strings. Checks if these types are contained.
                            type_homogeneous:   True or False. Checks for type homogeneity.
                            contains_nan:       True or False. Check for NaN's.
                            variable_type:      'categorical' or 'metric'. Needed for restrictions processing.
                            restrictions:       List of all categories if categorical. List of lower and upper bound if metric.

    :return:                True or False.

    :raises TypeError:      if an unknown assumption is passed, no assumption is passed, 'restrictions' is
                            used without 'variable_type', or 'variable_type' / 'verbosity' hold illegal values

    """

    # Specify local constants
    allowed_parameters = [
        'contains_types', 'type_homogeneous', 'contains_nan', 'variable_type',
        'restrictions'
    ]
    allowed_variable_types = ['categorical', 'metric']
    allowed_verbosity = ['none', 'low', 'high']

    # Check if data is valid
    check_numpy_array_1d(data, 'data')

    # Check for illegal parameters. The help hint is passed as a second exception
    # argument (like in the checks below) so that it is no longer silently dropped.
    for key in assumptions:
        if key not in allowed_parameters:
            raise TypeError(
                "Parameter '{0}' is not allowed. Please use only the following parameters: {1}"
                .format(key, allowed_parameters),
                'See help for further information.')

    # Check if at least one assumption is specified
    if not any(key in allowed_parameters for key in assumptions):
        raise TypeError(
            'No assumption is specified. Please specify at least one assumption.',
            'See help for further information.')

    # Make sure, 'variable_type' is defined if 'restrictions' are passed
    if 'restrictions' in assumptions and 'variable_type' not in assumptions:
        raise TypeError(
            "Parameter 'variable_type' must be defined, if parameter 'restrictions' is used.",
            'See help for further information')

    # Check if 'variable_type' is valid
    if ('variable_type' in assumptions
            and assumptions['variable_type'] not in allowed_variable_types):
        raise TypeError(
            "Value for 'variable type' ({0}) is not valid.".format(
                assumptions['variable_type']),
            "Please use only one of the following strings for parameter 'variable_type': {0}"
            .format(allowed_variable_types))

    # Check if 'verbosity' is valid
    if type(verbosity) is not str:
        raise TypeError(
            "Parameter 'verbosity' must be a string. Please use one of the following strings: {0}."
            .format(allowed_verbosity))
    elif verbosity not in allowed_verbosity:
        raise TypeError(
            "Illegal value has been used for parameter 'verbosity': {0}.".
            format(verbosity),
            "Please use only one of the following strings: {0}.".format(
                allowed_verbosity))

    # The individual checks only print details on the highest verbosity level
    verbose_checks = (verbosity == 'high')

    # Create result dictionary (insertion order defines the order of the printed summary)
    results = {}

    # Check if array contains specified types
    if 'contains_types' in assumptions:
        results['contains_types'] = contains_types(
            data,
            assumptions['contains_types'],
            exclusively=True,
            verbose=verbose_checks)

    # Check if array is type homogeneous. The result is stored under a key which
    # names the expectation, so the summary reads naturally in both cases.
    if 'type_homogeneous' in assumptions:
        expected = assumptions['type_homogeneous']
        result_key = 'type_homogeneous' if expected else 'not_type_homogeneous'
        results[result_key] = (is_type_homogeneous(
            data, verbose=verbose_checks) == expected)

    # Check if array contains NaN values (same key scheme as above)
    if 'contains_nan' in assumptions:
        expected = assumptions['contains_nan']
        result_key = 'contains_nan' if expected else 'contains_no_nan'
        results[result_key] = (contains_nan(data) == expected)

    # Check if restrictions hold
    if 'restrictions' in assumptions:

        # Check if variable is categorical or boolean
        if assumptions['variable_type'] == 'categorical':
            results['restrictions'] = contains_category(
                data,
                assumptions['restrictions'],
                exclusively=True,
                verbose=verbose_checks)

        # Check if variable is metric
        elif assumptions['variable_type'] == 'metric':
            results['restrictions'] = is_within_range(
                data,
                lower_bound=min(assumptions['restrictions']),
                upper_bound=max(assumptions['restrictions']),
                verbose=verbose_checks)

        # Defensive fallback: 'variable_type' has been validated above, so this
        # branch should not be reachable.
        else:
            raise IOError(
                'Variable type was neither categorical, metric or boolean. This may not be your fault.'
            )

    # Summarize results
    all_true = all(results.values())

    # Print summary unless verbosity is 'none'
    if verbosity in ('low', 'high'):
        if all_true:
            print('\nAll tests have been passed successfully:')
        else:
            print(
                '\nWarning: Some tests have failed. Please see the test results below:'
            )

        for key, value in results.items():
            print('{0}: {1}'.format(key, value))
        print('\n')

    # Return result
    return all_true