示例#1
0
 def __init__(self, meta_filename, meta, tables, missing=None):
     """Store the dataset, build its HyperTransformer and map table relations.

     Args:
         meta_filename: Forwarded to ``HyperTransformer`` as its first argument.
         meta: Kept as ``self.meta``.
         tables: Kept as ``self.tables`` and used to compute the relationships.
         missing: Forwarded to ``HyperTransformer`` as the ``missing`` keyword.
     """
     self.meta, self.tables = meta, tables
     self.ht = HyperTransformer(meta_filename, missing=missing)

     # Anonymization relies on both ``self.tables`` and ``self.ht`` being set.
     self._anonymize_data()
     self.transformed_data = None

     relationships = self._get_relationships(self.tables)
     self.child_map, self.parent_map, self.foreign_keys = relationships
    def test_fit_transform_table_transformer_dict(self):
        """Check fit_transform_table output when an explicit transformer_dict is given."""
        # Setup
        ht = HyperTransformer('tests/data/airbnb/airbnb_meta.json')
        table, table_meta = ht.table_dict['users']
        transformer_dict = {
            ('users', 'age'): 'number',
            ('users', 'date_first_booking'): 'datetime'
        }
        column_order = ['?date_first_booking', 'date_first_booking', '?age', 'age']
        expected_values = {
            '?date_first_booking': [1, 0, 0, 0, 1],
            'date_first_booking': [1.38879e+18, 0.0, 0.0, 0.0, 1.3886172e+18],
            '?age': [1, 0, 0, 0, 0],
            'age': [62, 62, 62, 62, 62]
        }
        expected_result = pd.DataFrame(expected_values, columns=column_order)

        # Run
        result = ht.fit_transform_table(table, table_meta, transformer_dict)

        # Check
        assert result.equals(expected_result)

        # Every requested (table, field) pair must be registered on the
        # HyperTransformer under the type declared in the table metadata.
        for key in transformer_dict:
            with self.subTest(transformer_key=key):
                _, field_name = key
                transformer = ht.transformer_dict.get(key)
                matching_types = [
                    field['type'] for field in table_meta['fields']
                    if field['name'] == field_name
                ]
                assert matching_types[0] == transformer
示例#3
0
 def __init__(self, meta_filename, meta, tables):
     """Store the dataset, build its HyperTransformer and map table relations.

     Args:
         meta_filename: Forwarded to ``HyperTransformer`` as its only argument.
         meta: Kept as ``self.meta``.
         tables: Kept as ``self.tables`` and used to compute the relationships.
     """
     self.meta, self.tables = meta, tables
     self.ht = HyperTransformer(meta_filename)
     self.transformed_data = None

     relationships = self._get_relationships(self.tables)
     self.child_map, self.parent_map, self.foreign_keys = relationships
    def test_get_class(self):
        """get_class returns an object whose __name__ matches the requested name."""
        # Setup
        ht = HyperTransformer('tests/data/airbnb/airbnb_meta.json')

        # Run
        transformer_class = ht.get_class('DTTransformer')

        # Check
        returned_name = transformer_class.__name__
        assert returned_name == 'DTTransformer'
    def test_reverse_transform(self):
        """reverse_transform applied to fit_transform output gives back the original tables."""
        # Setup
        ht = HyperTransformer('tests/data/airbnb/airbnb_meta.json')
        transformed = ht.fit_transform()
        # table_dict values are indexed with [0] to obtain the table contents.
        original_data = {}
        for table_name, entry in ht.table_dict.items():
            original_data[table_name] = entry[0]

        # Run
        reverse_transformed = ht.reverse_transform(transformed)

        # Check
        for table_name in original_data:
            expected = original_data[table_name]
            recovered = reverse_transformed[table_name]
            assert expected.equals(recovered)
示例#6
0
def run_airbnb_demo(data_dir):
    """Fit, transform and reverse-transform the Airbnb demo data with HyperTransformer.

    Args:
        data_dir (str): Directory that contains ``Airbnb_demo_meta.json``.

    Raises:
        AssertionError: If the reversed tables do not share the keys of
            ``ht.table_dict`` or if any reversed table is entirely null.
    """
    # Setup
    ht = HyperTransformer(os.path.join(data_dir, 'Airbnb_demo_meta.json'))
    transformer_list = ['NumberTransformer', 'DTTransformer', 'CatTransformer']

    # Run
    transformed = ht.fit_transform(transformer_list=transformer_list)
    result = ht.reverse_transform(tables=transformed)

    # Check
    assert result.keys() == ht.table_dict.keys()

    for reversed_table in result.values():
        fully_null = reversed_table.isnull().all().all()
        assert not fully_null
    def test_fit_transform_transformer_list(self):
        """Check fit_transform_table output when only NumberTransformer is requested."""
        # Setup
        ht = HyperTransformer('tests/data/airbnb/airbnb_meta.json')
        table, table_meta = ht.table_dict['users']
        expected_values = {
            '?age': [1, 0, 0, 0, 0],
            'age': [62, 62, 62, 62, 62]
        }
        expected_result = pd.DataFrame(expected_values, columns=['?age', 'age'])

        # Run
        result = ht.fit_transform_table(
            table, table_meta, transformer_list=['NumberTransformer'])

        # Check
        assert result.equals(expected_result)
    def test_transform_table(self):
        """transform_table on a fitted HyperTransformer returns numeric and '?' flag columns."""
        # Setup
        ht = HyperTransformer('tests/data/airbnb/airbnb_meta.json')
        table, table_meta = ht.table_dict['users']
        ht.fit_transform_table(
            table,
            table_meta,
            transformer_list=['DTTransformer', 'NumberTransformer', 'CatTransformer']
        )

        # Run
        result = ht.transform_table(table, table_meta)

        # Check
        assert (result.index == table.index).all()
        for column in table.columns:
            with self.subTest(column=column):
                flag_column = '?' + column
                assert column in result.columns
                assert flag_column in result.columns

                # Transformed values must already be numeric.
                transformed_values = result[column]
                assert (transformed_values == pd.to_numeric(transformed_values)).all()

                # A flag of 0 must line up exactly with nulls in the source column.
                source_is_null = table[column].isnull()
                flagged_as_null = result[flag_column] == 0
                assert (source_is_null == flagged_as_null).all()
    def test___init__metadata_dict(self):
        """HyperTransformer can be built from a loaded metadata dict and its directory."""
        # Setup
        expected_transformer_dict = {
            ('users', 'id'): 'categorical',
            ('users', 'date_account_created'): 'datetime',
            ('users', 'timestamp_first_active'): 'datetime',
            ('users', 'date_first_booking'): 'datetime',
            ('users', 'gender'): 'categorical',
            ('users', 'age'): 'number',
            ('users', 'signup_method'): 'categorical',
            ('users', 'signup_flow'): 'categorical',
            ('users', 'language'): 'categorical',
            ('users', 'affiliate_channel'): 'categorical',
            ('users', 'affiliate_provider'): 'categorical',
            ('users', 'first_affiliate_tracked'): 'categorical',
            ('users', 'signup_app'): 'categorical',
            ('users', 'first_device_type'): 'categorical',
            ('users', 'first_browser'): 'categorical',
            ('countries', 'country_destination'): 'categorical',
            ('countries', 'lat_destination'): 'number',
            ('countries', 'lng_destination'): 'number',
            ('countries', 'distance_km'): 'number',
            ('countries', 'destination_km2'): 'categorical',
            ('countries', 'destination_language '): 'categorical',
            ('countries', 'language_levenshtein_distance'): 'number',
            ('sessions', 'user_id'): 'categorical',
            ('sessions', 'action'): 'categorical',
            ('sessions', 'action_type'): 'categorical',
            ('sessions', 'action_detail'): 'categorical',
            ('sessions', 'device_type'): 'categorical',
            ('sessions', 'secs_elapsed'): 'number',
            ('age_gender_bkts', 'age_bucket'): 'categorical',
            ('age_gender_bkts', 'country_destination'): 'categorical',
            ('age_gender_bkts', 'gender'): 'categorical',
            ('age_gender_bkts', 'population_in_thousands'): 'number',
            ('age_gender_bkts', 'year'): 'datetime'
        }
        expected_tables = {'users', 'sessions'}

        # Run
        path = 'tests/data/airbnb/airbnb_meta.json'
        with open(path, 'r') as f:
            metadata = json.load(f)
        # The metadata dict is passed together with the directory of the file
        # it was loaded from.
        ht = HyperTransformer(metadata, os.path.dirname(path))

        # Check
        loaded_tables = set(ht.table_dict.keys())
        assert loaded_tables == expected_tables
        assert ht.transformer_dict == expected_transformer_dict
示例#10
0
    def test_fit_transform(self):
        """fit_transform returns, per table, numeric columns plus their '?' flag columns."""
        # Setup
        ht = HyperTransformer('tests/data/airbnb/airbnb_meta.json')

        # Run
        result = ht.fit_transform()

        # Check
        assert set(result.keys()) == {'users', 'sessions'}
        for name, table in result.items():
            values, meta = ht.table_dict[name]
            for column in values.columns:
                with self.subTest(column=column):
                    flag_column = '?' + column
                    matching_fields = [
                        field for field in meta['fields'] if field['name'] == column
                    ]
                    meta_col = matching_fields[0]

                    assert column in table.columns
                    assert flag_column in table.columns

                    transformed_values = table[column]
                    assert (transformed_values == pd.to_numeric(transformed_values)).all()

                    # Categorical columns are skipped: CatTransformer handles
                    # nulls on its own, without relying on NullTransformer.
                    if meta_col['type'] == 'categorical':
                        continue

                    source_is_null = values[column].isnull()
                    flagged_as_null = table[flag_column] == 0
                    assert (source_is_null == flagged_as_null).all()
示例#11
0
    def test_reverse_transform_table(self):
        """reverse_transform_table restores each column of a transformed table."""
        # Setup
        ht = HyperTransformer('tests/data/airbnb/airbnb_meta.json')
        table, table_meta = ht.table_dict['users']
        ht.fit_transform_table(
            table,
            table_meta,
            transformer_list=['DTTransformer', 'NumberTransformer', 'CatTransformer']
        )

        # Run
        transformed = ht.transform_table(table, table_meta)
        reverse_transformed = ht.reverse_transform_table(transformed, table_meta)

        # Check
        for column in table.columns:
            with self.subTest(column=column):
                recovered = reverse_transformed[column]
                original = table[column]
                assert (recovered == original).all()
示例#12
0
class DataNavigator:
    """Navigate through and transform a dataset.

    This class implements two main functionalities:

    - Navigation through the dataset
        Given a table, it allows moving through its relations and accessing its data and
        metadata.

    - Transform data
        Transform the dataset using `rdt.HyperTransformer` in a format that is supported
        by `sdv.Modeler`.

    Args:
        meta_filename (str): Path to the metadata file.
        meta (dict): Metadata for the dataset.
        tables (dict[str, Table]): Mapping of table names to their values and metadata.
        missing (bool): Whether or not to handle missing values when transforming data.

    """

    DEFAULT_TRANSFORMERS = [
        'NumberTransformer', 'DTTransformer', 'CatTransformer'
    ]

    def update_mapping(self, mapping, key, value):
        """Safely updates a dict of sets.

        Args:
            mapping (dict): Dictionary to be updated.
            key(string): Key to update on `mapping`.
            value: Value to add.

        Returns:
            dict: Updated mapping (the same object that was passed in).

        If mapping[key] exists then value will be added to it.
        If not, it will be created as a single-element set containing `value`.
        """
        item = mapping.get(key)

        # Compare against None rather than relying on truthiness: an existing
        # but empty set is falsy, and must be added to instead of replaced.
        if item is None:
            mapping[key] = {value}

        else:
            item.add(value)

        return mapping

    def _get_relationships(self, tables):
        """Map table name to names of child tables.

        Arguments:
            tables (dict): table_name -> Table.

        Returns:
            tuple: dicts of children, parents and foreign_keys.

        This method is what allows `DataNavigator` to be aware of the different tables and the
        relations between them.
        """
        child_map = {}
        parent_map = {}
        foreign_keys = {}  # {(child, parent) -> (parent pk, fk)}

        for table in tables:
            table_meta = tables[table].meta
            for field_meta in table_meta['fields'].values():
                # Only fields carrying a 'ref' entry describe a relationship.
                ref = field_meta.get('ref')
                if ref:
                    parent = ref['table']
                    parent_pk = ref['field']
                    fk = field_meta['name']

                    # update child map
                    child_map = self.update_mapping(child_map, parent, table)

                    # update parent map
                    parent_map = self.update_mapping(parent_map, table, parent)

                    foreign_keys[(table, parent)] = (parent_pk, fk)

        return (child_map, parent_map, foreign_keys)

    def _anonymize_data(self):
        """Replace data with pii with anonymized data from HyperTransformer.

        For every table for which ``self.ht._get_pii_fields`` returns a truthy value,
        the entry in ``self.tables`` is replaced by a new ``Table`` holding the
        HyperTransformer's copy of the data and the original metadata.
        """
        for table_name in self.tables:
            table = self.tables[table_name]
            ht_table, ht_meta = self.ht.table_dict[table_name]

            pii_fields = self.ht._get_pii_fields(ht_meta)
            if pii_fields:
                # Table is a namedtuple, which is immutable, so instantiate a new
                # one with transformed data
                self.tables[table_name] = Table(ht_table, table.meta)

    def __init__(self, meta_filename, meta, tables, missing=None):
        """Store the dataset, build its HyperTransformer and map table relations.

        Args:
            meta_filename (str): Path to the metadata file.
            meta (dict): Metadata for the dataset.
            tables (dict[str, Table]): Mapping of table names to their values and metadata.
            missing (bool): Forwarded to `HyperTransformer` as `missing`.
        """
        self.meta = meta
        self.tables = tables
        self.ht = HyperTransformer(meta_filename, missing=missing)
        # Must run after `tables` and `ht` are set, since it reads both.
        self._anonymize_data()
        self.transformed_data = None
        self.child_map, self.parent_map, self.foreign_keys = self._get_relationships(
            self.tables)

    def get_children(self, table_name):
        """Return set of children of a table.

        Args:
            table_name (str): Name of table to get children of.

        Returns:
            set: Set of children for the given table (empty if it has none).
        """
        return self.child_map.get(table_name, set())

    def get_parents(self, table_name):
        """Returns parents of a table.

        Args:
            table_name (str): Name of table to get parents of.

        Returns:
            set: Set of parents for the given table (empty if it has none).
        """
        return self.parent_map.get(table_name, set())

    def get_data(self, table_name):
        """Return dataframe for a table.

        Args:
            table_name (str): Name of table to get data for.

        Returns:
            pandas.DataFrame: DataFrame with the contents of table_name

        Raises:
            KeyError: If `table_name` is not a known table.
        """
        return self.tables[table_name].data

    def get_meta_data(self, table_name):
        """Return meta data for a table.

        Args:
            table_name (str): Name of table to get data for.

        Returns:
            dict: metadata for table_name

        Raises:
            KeyError: If `table_name` is not a known table.
        """
        return self.tables[table_name].meta

    def transform_data(self, transformers=None):
        """Applies the specified transformations using an HyperTransformer and returns the new data

        Args:
            transformers (list): List of transformers to use. When empty or None,
                `DEFAULT_TRANSFORMERS` is used.

        Returns:
            dict: dict with the transformed dataframes.
        """
        transformers = transformers or self.DEFAULT_TRANSFORMERS
        self.transformed_data = self.ht.fit_transform(
            transformer_list=transformers)

        return self.transformed_data
示例#13
0
class DataNavigator:
    """Class to navigate through data set.

    Keeps the dataset's tables and metadata, exposes the parent/child relations
    between tables, and transforms the data through a `HyperTransformer`.
    """

    DEFAULT_TRANSFORMERS = ['NumberTransformer', 'DTTransformer', 'CatTransformer']

    def __init__(self, meta_filename, meta, tables):
        """Instantiate data navigator object.

        Args:
            meta_filename: Forwarded to `HyperTransformer` as its only argument.
            meta: Stored as `self.meta`.
            tables (dict): table_name -> Table, stored as `self.tables`.
        """
        self.meta = meta
        self.tables = tables
        self.ht = HyperTransformer(meta_filename)
        self.transformed_data = None
        self.child_map, self.parent_map, self.foreign_keys = self._get_relationships(self.tables)

    def get_children(self, table_name):
        """Return set of children of a table.

        Args:
            table_name (str): Name of table to get children of.

        Returns:
            set: Set of children for the given table (empty if it has none).
        """
        return self.child_map.get(table_name, set())

    def get_parents(self, table_name):
        """Returns parents of a table.

        Args:
            table_name (str): Name of table to get parents of.

        Returns:
            set: Set of parents for the given table (empty if it has none).
        """
        return self.parent_map.get(table_name, set())

    def get_data(self, table_name):
        """Return dataframe for a table.

        Args:
            table_name (str): Name of table to get data for.

        Returns:
            pandas.DataFrame: DataFrame with the contents of table_name

        Raises:
            KeyError: If `table_name` is not a known table.
        """
        return self.tables[table_name].data

    def get_meta_data(self, table_name):
        """Return meta data for a table.

        Args:
            table_name (str): Name of table to get data for.

        Returns:
            dict: metadata for table_name

        Raises:
            KeyError: If `table_name` is not a known table.
        """
        return self.tables[table_name].meta

    def transform_data(self, transformers=None, missing=False):
        """Applies the specified transformations using an HyperTransformer and returns the new data

        Args:
            transformers (list): List of transformers to use. When empty or None,
                            `DEFAULT_TRANSFORMERS` is used.
            missing (bool): Whether or not to keep track of missing variables
                            and create extra columns for them.

        Returns:
            dict: dict with the transformed dataframes.
        """
        transformers = transformers or self.DEFAULT_TRANSFORMERS
        self.transformed_data = self.ht.fit_transform(
            transformer_list=transformers, missing=missing)

        return self.transformed_data

    def update_mapping(self, mapping, key, value):
        """Safely updates a dict of sets.

        Args:
            mapping (dict): Dictionary to be updated.
            key(string): Key to update on `mapping`.
            value: Value to add.

        Returns:
            dict: Updated mapping (the same object that was passed in).

        If mapping[key] exists then value will be added to it.
        If not, it will be created as a single-element set containing `value`.
        """
        item = mapping.get(key)

        # Compare against None rather than relying on truthiness: an existing
        # but empty set is falsy, and must be added to instead of replaced.
        if item is None:
            mapping[key] = {value}

        else:
            item.add(value)

        return mapping

    def _get_relationships(self, tables):
        """Map table name to names of child tables.

        Arguments:
            tables (dict): table_name -> Table.

        Returns:
            tuple: dicts of children, parents and foreign_keys.

        This method is what allows `DataNavigator` to be aware of the different tables and the
        relations between them.
        """
        child_map = {}
        parent_map = {}
        foreign_keys = {}  # {(child, parent) -> (parent pk, fk)}

        for table in tables:
            table_meta = tables[table].meta
            for field_meta in table_meta['fields'].values():
                # Only fields carrying a 'ref' entry describe a relationship.
                ref = field_meta.get('ref')
                if ref:
                    parent = ref['table']
                    parent_pk = ref['field']
                    fk = field_meta['name']

                    # update child map
                    child_map = self.update_mapping(child_map, parent, table)

                    # update parent map
                    parent_map = self.update_mapping(parent_map, table, parent)

                    foreign_keys[(table, parent)] = (parent_pk, fk)

        return (child_map, parent_map, foreign_keys)