Пример #1
0
    def noise(self, sample):
        """ Adds noise to the duplicate rows

            The rows are copied into a fresh `DataSet`, a subset of its
            columns is picked with `DataSet.sample(self.percentage,
            columns=True)` and every picked column is distorted:

            - columns whose dtype name contains 'float' or 'int' get random
              values drawn between the min and max of the same column in
              `self.dataset`
            - columns whose dtype is `object` or `str` are passed through
              `messy_spaces`
            - any other column is left untouched

            Parameters:
                sample (list or obj): `dataset.Dataset.sample`; must
                    provide a `copy()` method

            Returns
                sample (list or obj): distorted rows (the `records` of the
                    copied dataset)

            TODO:
                - implement more noise options than just random

        """
        # Work on a copy so the rows handed in by the caller stay untouched.
        sample_dataset = DataSet(sample.copy())
        columns = sample_dataset.sample(self.percentage, columns=True)
        if sample_dataset.data_type == 'pandas':
            sample_dataset.records = \
                sample_dataset.records.reset_index(drop=True)

        for column in columns:
            col = sample_dataset.column_idx(column)
            col_type = sample_dataset.column_dtype(col)
            type_name = str(col_type)

            # Choose the random generator from the dtype's name.
            if 'float' in type_name:
                func = generate_random_float
            elif 'int' in type_name:
                func = generate_random_int
            else:
                func = None

            if func is not None:
                # The value range is taken from the full dataset, not only
                # from the sampled rows.
                kwargs = {
                    'low': self.dataset.column_agg(col, min),
                    'high': self.dataset.column_agg(col, max)
                }
                if kwargs['low'] == kwargs['high']:
                    # Widen a constant column's range by one.
                    # NOTE(review): presumably the generators need
                    # low < high -- confirm.
                    kwargs['high'] += 1

                # The noise has to land in the copy that is returned below,
                # so the target dataset is passed explicitly, exactly as in
                # the string branch.
                # NOTE(review): assumes apply_func_to_column updates the
                # given dataset in place; its return value is not used.
                self.apply_func_to_column(lambda x: func(x, **kwargs),
                                          col,
                                          dataset=sample_dataset)
            elif col_type in [object, str]:
                self.apply_func_to_column(messy_spaces,
                                          col,
                                          dataset=sample_dataset)
        return sample_dataset.records
Пример #2
0
def test_column_agg(input_obj, column, agg, result, kwargs):
    """Check that `DataSet.column_agg` yields the expected aggregate.

    Parameters:
        input_obj: raw data handed to the `DataSet` constructor
        column: column reference; a string is translated to an index
            through `DataSet.column_idx`, anything else is used as given
        agg: aggregation passed on to `DataSet.column_agg`
        result: value the aggregation is expected to equal
        kwargs (dict): extra keyword arguments for the `DataSet`
            constructor
    """
    dataset = DataSet(input_obj, **kwargs)
    # Column names are resolved to an index first; other values pass through.
    col_ref = dataset.column_idx(column) if isinstance(column, str) \
        else column
    aggregated = dataset.column_agg(col_ref, agg)
    assert aggregated == result
Пример #3
0
def test_column_dtype(input_obj, column, col_type, kwargs):
    """Check that `DataSet.column_dtype` reports the expected type.

    Parameters:
        input_obj: raw data handed to the `DataSet` constructor
        column: column reference; a string is translated to an index
            through `DataSet.column_idx`, anything else is used as given
        col_type: type the column is expected to report
        kwargs (dict): extra keyword arguments for the `DataSet`
            constructor
    """
    dataset = DataSet(input_obj, **kwargs)
    # Column names are resolved to an index first; other values pass through.
    col_ref = dataset.column_idx(column) if isinstance(column, str) \
        else column
    detected = dataset.column_dtype(col_ref)
    assert detected == col_type