Example #1
    def _fit_hyper_transformer(self, data, extra_columns):
        """Create and return a new ``rdt.HyperTransformer`` instance.

        First get the ``dtypes`` and then use them to build a transformer dictionary
        to be used by the ``HyperTransformer``.

        Args:
            data (pandas.DataFrame):
                Data to transform.
            extra_columns (set):
                Names of columns that are not in the metadata but that should also
                be transformed. In most cases, these are the fields that were added
                by previous transformations which the data underwent.
        """
        meta_dtypes = self.get_dtypes(ids=False)
        dtypes = {}
        for column in data.columns:
            if column in meta_dtypes:
                dtypes[column] = meta_dtypes[column]
            elif column in extra_columns:
                dtypes[column] = data[column].dtype.kind

        transformers_dict = self._get_transformers(dtypes)
        self._hyper_transformer = rdt.HyperTransformer(transformers=transformers_dict)
        self._hyper_transformer.fit(data[list(dtypes.keys())])
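
A minimal standalone sketch of the round trip this method prepares, assuming the older (pre-1.0) ``rdt`` API used throughout these examples; the column names here are made up for illustration:

import pandas as pd
import rdt

data = pd.DataFrame({
    'age': [23, 35, 41],                                                   # numerical
    'signup': pd.to_datetime(['2020-01-01', '2020-02-15', '2020-03-30']),  # datetime
})

# Fit once, as _fit_hyper_transformer does, then transform for modeling
# and reverse_transform to get the original dtypes back.
ht = rdt.HyperTransformer()
ht.fit(data)
transformed = ht.transform(data)
recovered = ht.reverse_transform(transformed)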
Example #2
File: base.py Project: sdv-dev/SDMetrics
    def _compute_score(cls, real_data, synthetic_data, entity_columns, target):
        transformer = rdt.HyperTransformer(
            default_data_type_transformers={
                'categorical': rdt.transformers.OneHotEncodingTransformer(
                    error_on_unknown=False),
                'datetime': rdt.transformers.DatetimeTransformer(strip_constant=True),
            })
        transformer.fit(real_data.drop(entity_columns + [target], axis=1))

        real_x, real_y = cls._build_xy(transformer, real_data, entity_columns,
                                       target)
        synt_x, synt_y = cls._build_xy(transformer, synthetic_data,
                                       entity_columns, target)

        train, test = train_test_split(real_x.index, shuffle=True)
        real_x_train, real_x_test = real_x.loc[train], real_x.loc[test]
        real_y_train, real_y_test = real_y.loc[train], real_y.loc[test]

        real_acc = cls._scorer(real_x_train, real_x_test, real_y_train,
                               real_y_test)
        synt_acc = cls._scorer(synt_x, real_x_test, synt_y, real_y_test)

        return synt_acc / real_acc
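
The transformer configuration above can be exercised on its own; a small sketch with toy data, again assuming the pre-1.0 ``rdt`` API that provides ``default_data_type_transformers``:

import pandas as pd
import rdt

df = pd.DataFrame({
    'category': ['a', 'b', 'a'],
    'timestamp': pd.to_datetime(['2021-01-01', '2021-06-01', '2021-12-31']),
})

# One-hot encode categoricals (tolerating unseen categories) and turn
# datetimes into numbers, exactly as the metric configures it.
transformer = rdt.HyperTransformer(
    default_data_type_transformers={
        'categorical': rdt.transformers.OneHotEncodingTransformer(error_on_unknown=False),
        'datetime': rdt.transformers.DatetimeTransformer(strip_constant=True),
    })
numeric_df = transformer.fit_transform(df)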
Example #3
    def _fit_predict(cls, synthetic_data, synthetic_target, real_data,
                     real_target):
        """Fit a model in the synthetic data and make predictions for the real data."""
        del real_target  # delete argument which subclasses use but this method does not.
        unique_labels = np.unique(synthetic_target)
        if len(unique_labels) == 1:
            predictions = np.full(len(real_data), unique_labels[0])
        else:
            transformer = rdt.HyperTransformer(
                default_data_type_transformers={
                    'categorical': rdt.transformers.OneHotEncodingTransformer(
                        error_on_unknown=False),
                })
            real_data = transformer.fit_transform(real_data)
            synthetic_data = transformer.transform(synthetic_data)

            real_data[np.isin(real_data, [np.inf, -np.inf])] = None
            synthetic_data[np.isin(synthetic_data, [np.inf, -np.inf])] = None

            model_kwargs = cls.MODEL_KWARGS.copy() if cls.MODEL_KWARGS else {}
            model = cls.MODEL(**model_kwargs)

            pipeline = Pipeline([('imputer', SimpleImputer()),
                                 ('scaler', RobustScaler()), ('model', model)])

            pipeline.fit(synthetic_data, synthetic_target)

            predictions = pipeline.predict(real_data)

        return predictions
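
The imputer/scaler/model pipeline is plain scikit-learn and can be tried in isolation; here ``LogisticRegression`` stands in for ``cls.MODEL``, which the snippet does not define:

import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler

X_train = np.array([[1.0, 2.0], [np.nan, 3.0], [4.0, 5.0], [6.0, np.nan]])
y_train = np.array([0, 1, 0, 1])

# Same shape as the snippet: fill missing values, robust-scale, then fit.
pipeline = Pipeline([
    ('imputer', SimpleImputer()),
    ('scaler', RobustScaler()),
    ('model', LogisticRegression()),
])
pipeline.fit(X_train, y_train)
predictions = pipeline.predict(np.array([[2.0, 2.5], [5.0, 4.0]]))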
Example #4
    def _fit_sample(self, real_data, table_metadata):
        columns, categoricals = self._get_columns(real_data, table_metadata)
        real_data = real_data[columns]

        ht = rdt.HyperTransformer(dtype_transformers={
            'O': 'label_encoding',
        })
        ht.fit(real_data.iloc[:, categoricals])
        model_data = ht.transform(real_data)

        supported = set(model_data.select_dtypes(('number', 'bool')).columns)
        unsupported = set(model_data.columns) - supported
        if unsupported:
            unsupported_dtypes = model_data[list(unsupported)].dtypes.unique().tolist()
            raise UnsupportedDataset(
                f'Unsupported dtypes {unsupported_dtypes}')

        nulls = model_data.isnull().any()
        if nulls.any():
            unsupported_columns = nulls[nulls].index.tolist()
            raise UnsupportedDataset(
                f'Null values found in columns {unsupported_columns}')

        LOGGER.info("Fitting %s", self.__class__.__name__)
        self.fit(model_data.to_numpy(), categoricals, ())

        LOGGER.info("Sampling %s", self.__class__.__name__)
        sampled_data = self.sample(len(model_data))
        sampled_data = pd.DataFrame(sampled_data, columns=columns)

        synthetic_data = real_data.copy()
        synthetic_data.update(ht.reverse_transform(sampled_data))
        return synthetic_data
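
A minimal sketch of the label-encoding round trip used above, assuming the pre-1.0 ``rdt`` API where ``dtype_transformers`` maps dtype kinds to transformer names:

import pandas as pd
import rdt

df = pd.DataFrame({
    'color': ['red', 'blue', 'red'],   # object ('O') column -> integer codes
    'value': [1.0, 2.5, 3.3],
})

ht = rdt.HyperTransformer(dtype_transformers={'O': 'label_encoding'})
model_df = ht.fit_transform(df)            # fully numerical frame for the model
restored = ht.reverse_transform(model_df)  # codes mapped back to the strings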
Example #5
    def _fit_hyper_transformer(self, data):
        """Create and return a new ``rdt.HyperTransformer`` instance.

        First get the ``dtypes`` and then use them to build a transformer dictionary
        to be used by the ``HyperTransformer``.
        """
        dtypes = self.get_dtypes(ids=False)
        transformers_dict = self._get_transformers(dtypes)
        self._hyper_transformer = rdt.HyperTransformer(
            transformers=transformers_dict)
        self._hyper_transformer.fit(data[list(dtypes.keys())])
Example #6
    def compute(cls,
                real_data,
                synthetic_data,
                metadata=None,
                entity_columns=None):
        """Compute this metric.

        Args:
            real_data (pandas.DataFrame):
                The values from the real dataset, passed as a pandas.DataFrame.
            synthetic_data (pandas.DataFrame):
                The values from the synthetic dataset, passed as a pandas.DataFrame.
            metadata (dict):
                TimeSeries metadata dict. If not passed, it is built based on the
                real_data fields and dtypes.
            entity_columns (list[str]):
                Names of the columns which identify different time series
                sequences.

        Returns:
            Union[float, tuple[float]]:
                Metric output.
        """
        _, entity_columns = cls._validate_inputs(real_data, synthetic_data,
                                                 metadata, entity_columns)

        transformer = rdt.HyperTransformer(
            default_data_type_transformers={
                'categorical': rdt.transformers.OneHotEncodingTransformer(
                    error_on_unknown=False),
                'datetime': rdt.transformers.DatetimeTransformer(strip_constant=True),
            })
        transformer.fit(real_data.drop(entity_columns, axis=1))

        real_x = cls._build_x(real_data, transformer, entity_columns)
        synt_x = cls._build_x(synthetic_data, transformer, entity_columns)

        X = pd.concat([real_x, synt_x])
        y = pd.Series(np.array([0] * len(real_x) + [1] * len(synt_x)))
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, shuffle=True, stratify=y)

        return 1 - cls._compute_score(X_train, X_test, y_train, y_test)
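
The detection setup at the end does not depend on ``rdt``: real and synthetic rows are stacked, labelled 0 and 1, and split with stratification before a classifier scores how separable they are. A minimal sketch of just that labelling step, with toy feature values:

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

real_x = pd.DataFrame({'f1': [0.1, 0.2, 0.3, 0.4]})
synt_x = pd.DataFrame({'f1': [0.9, 0.8, 0.7, 0.6]})

# Real rows get label 0, synthetic rows label 1; stratify keeps the balance.
X = pd.concat([real_x, synt_x])
y = pd.Series(np.array([0] * len(real_x) + [1] * len(synt_x)))
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, stratify=y)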
Example #7
File: base.py Project: sdv-dev/SDGym
    def _transform_fit_sample(self, real_data, metadata):
        ht = rdt.HyperTransformer()
        columns_to_transform = []
        fields_metadata = metadata['fields']
        id_fields = []
        for field in fields_metadata:
            if fields_metadata[field].get('type') != 'id':
                columns_to_transform.append(field)
            else:
                id_fields.append(field)

        ht.fit(real_data[columns_to_transform])
        transformed_data = ht.transform(real_data)
        synthetic_data = self._fit_sample(transformed_data, metadata)
        reverse_transformed_synthetic_data = ht.reverse_transform(synthetic_data)
        reverse_transformed_synthetic_data[id_fields] = real_data[id_fields]
        return reverse_transformed_synthetic_data
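
A hypothetical standalone sketch of the same id-handling pattern: fit the ``HyperTransformer`` only on non-id fields and copy the id columns back after reverse transforming. The metadata and column names below are invented for illustration and follow the ``fields``/``type`` layout the method reads:

import pandas as pd
import rdt

metadata = {'fields': {
    'user_id': {'type': 'id'},
    'country': {'type': 'categorical'},
    'amount': {'type': 'numerical'},
}}
real_data = pd.DataFrame({
    'user_id': [1, 2, 3],
    'country': ['US', 'DE', 'US'],
    'amount': [10.0, 20.5, 7.25],
})

# Partition fields exactly as _transform_fit_sample does.
id_fields = [f for f, meta in metadata['fields'].items() if meta.get('type') == 'id']
columns_to_transform = [f for f in metadata['fields'] if f not in id_fields]

ht = rdt.HyperTransformer()
transformed = ht.fit_transform(real_data[columns_to_transform])
# ... a synthesizer would be fit on `transformed` and sampled here ...
recovered = ht.reverse_transform(transformed)
recovered[id_fields] = real_data[id_fields]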
Example #8
    def _fit_sample(self, real_data, table_metadata):
        columns, categoricals = self._get_columns(real_data, table_metadata)

        ht = rdt.HyperTransformer(dtype_transformers={
            'O': 'label_encoding',
        })
        model_data = ht.fit_transform(real_data[columns])

        LOGGER.info("Fitting %s", self.__class__.__name__)
        self.fit(model_data.to_numpy(), categoricals, ())

        LOGGER.info("Sampling %s", self.__class__.__name__)
        sampled_data = self.sample(len(model_data))
        sampled_data = pd.DataFrame(sampled_data, columns=columns)

        synthetic_data = real_data.copy()
        synthetic_data.update(ht.reverse_transform(sampled_data))
        return synthetic_data
Example #9
    def _fit_hyper_transformer(self, data):
        """Create and return a new ``rdt.HyperTransformer`` instance.

        First build the ``dtypes`` from the data, skipping ``id`` fields, and then use
        them to build a transformer dictionary for the ``HyperTransformer``.
        """
        # dtypes = self.get_dtypes(ids=False)
        dtypes = {}
        fields = self._fields_metadata
        for column in data.columns:
            if column not in fields or fields[column]['type'] != 'id':
                dtypes[column] = data[column].dtype.kind

        transformers_dict = self._get_transformers(dtypes)
        self._hyper_transformer = rdt.HyperTransformer(
            transformers=transformers_dict)
        self._hyper_transformer.fit(data[list(dtypes.keys())])
Example #10
    def _compute_score(cls, real_data, synthetic_data, entity_columns, target):
        transformer = rdt.HyperTransformer(
            dtype_transformers={
                'O': 'one_hot_encoding',
                'M': rdt.transformers.DatetimeTransformer(strip_constant=True),
            })
        transformer.fit(real_data.drop(entity_columns + [target], axis=1))

        real_x, real_y = cls._build_xy(transformer, real_data, entity_columns,
                                       target)
        synt_x, synt_y = cls._build_xy(transformer, synthetic_data,
                                       entity_columns, target)

        train, test = train_test_split(real_x.index, shuffle=True)
        real_x_train, real_x_test = real_x.loc[train], real_x.loc[test]
        real_y_train, real_y_test = real_y.loc[train], real_y.loc[test]

        real_acc = cls._scorer(real_x_train, real_x_test, real_y_train,
                               real_y_test)
        synt_acc = cls._scorer(synt_x, real_x_test, synt_y, real_y_test)

        return synt_acc / real_acc