예제 #1
0
def test_gaussian_copula():
    users = load_demo(metadata=False)['users']

    field_types = {
        'age': {
            'type': 'numerical',
            'subtype': 'integer',
        },
        'country': {
            'type': 'categorical'
        }
    }
    anonymize_fields = {'country': 'country_code'}

    gc = GaussianCopula(
        field_names=['user_id', 'country', 'gender', 'age'],
        field_types=field_types,
        primary_key='user_id',
        anonymize_fields=anonymize_fields,
        categorical_transformer='one_hot_encoding',
    )
    gc.fit(users)

    parameters = gc.get_parameters()
    new_gc = GaussianCopula(
        table_metadata=gc.get_metadata(),
        categorical_transformer='one_hot_encoding',
    )
    new_gc.set_parameters(parameters)

    sampled = new_gc.sample()

    # test shape is right
    assert sampled.shape == users.shape

    # test user_id has been generated as an ID field
    assert list(sampled['user_id']) == list(range(0, len(users)))

    # country codes have been replaced with new ones
    assert set(sampled.country.unique()) != set(users.country.unique())

    metadata = gc.get_metadata().to_dict()
    assert metadata['fields'] == {
        'user_id': {
            'type': 'id',
            'subtype': 'integer'
        },
        'country': {
            'type': 'categorical'
        },
        'gender': {
            'type': 'categorical'
        },
        'age': {
            'type': 'numerical',
            'subtype': 'integer'
        }
    }

    assert 'model_kwargs' in metadata
예제 #2
0
def test___init___copies_metadata():
    """Test the ``__init__`` method.

    This test assures that the metadata provided to the model is copied,
    so that any modifications don't change the input.

    Setup:
        - Initialize two models with the same metadata and data.

    Expected behavior:
        - The metadata for each model and the provided metadata should all be different.
    """
    # Setup
    metadata, data = load_tabular_demo('student_placements', metadata=True)

    # Run
    model = GaussianCopula(table_metadata=metadata,
                           categorical_transformer='label_encoding',
                           default_distribution='gamma')
    model.fit(data)
    model2 = GaussianCopula(table_metadata=metadata,
                            categorical_transformer='label_encoding',
                            default_distribution='beta')
    model2.fit(data)

    # Assert
    assert model._metadata != metadata
    assert model._metadata != model2._metadata
    assert model2._metadata != metadata
    gamma = 'copulas.univariate.gamma.GammaUnivariate'
    beta = 'copulas.univariate.beta.BetaUnivariate'
    assert all(distribution == gamma
               for distribution in model.get_distributions().values())
    assert all(distribution == beta
               for distribution in model2.get_distributions().values())
예제 #3
0
def test_conditional_sampling_constraint_uses_reject_sampling(
        gm_mock, isinstance_mock):
    """Test that the ``sample`` method handles constraints with conditions.

    The ``sample`` method is expected to properly apply constraint
    transformations by dropping columns that cannot be conditonally sampled
    on due to them being part of a constraint.

    Setup:
    - The model is being passed a ``UniqueCombination`` constraint and then
    asked to sample with two conditions, one of which the constraint depends on.
    The constraint is expected to skip its transformations since only some of
    the columns are provided by the conditions and the model will use reject
    sampling to meet the constraint instead.

    Input:
    - Conditions
    Side Effects:
    - Correct columns to condition on are passed to underlying sample method
    """
    # Setup
    isinstance_mock.side_effect = _isinstance_side_effect
    constraint = FixedCombinations(column_names=['city', 'state'])
    data = pd.DataFrame({
        'city': ['LA', 'SF', 'CHI', 'LA', 'LA'],
        'state': ['CA', 'CA', 'IL', 'CA', 'CA'],
        'age': [27, 28, 26, 21, 30]
    })
    model = GaussianCopula(constraints=[constraint],
                           categorical_transformer='label_encoding')
    sampled_numeric_data = [
        pd.DataFrame({
            'city#state.value': [0, 1, 2, 0, 0],
            'age.value': [30, 30, 30, 30, 30]
        }),
        pd.DataFrame({
            'city#state.value': [1],
            'age.value': [30]
        })
    ]
    gm_mock.return_value.sample.side_effect = sampled_numeric_data
    model.fit(data)

    # Run
    conditions = [Condition({'age': 30, 'state': 'CA'}, num_rows=5)]
    sampled_data = model.sample_conditions(conditions=conditions)

    # Assert
    expected_transformed_conditions = {'age.value': 30}
    expected_data = pd.DataFrame({
        'city': ['LA', 'SF', 'LA', 'LA', 'SF'],
        'state': ['CA', 'CA', 'CA', 'CA', 'CA'],
        'age': [30, 30, 30, 30, 30]
    })
    sample_calls = model._model.sample.mock_calls
    assert len(sample_calls) == 2
    model._model.sample.assert_any_call(
        50, conditions=expected_transformed_conditions)
    pd.testing.assert_frame_equal(sampled_data, expected_data)
예제 #4
0
파일: test_base.py 프로젝트: csala/SDV
def test_sample_empty_transformed_conditions():
    """Test that None is passed to ``_sample_batch`` if transformed conditions are empty.

    The ``Sample`` method is expected to:
    - Return sampled data and pass None to ``sample_batch`` as the
    ``transformed_conditions``.

    Input:
    - Number of rows to sample
    - Conditions

    Output:
    - Sampled data
    """
    # Setup
    model = GaussianCopula()
    data = pd.DataFrame({
        'column1': list(range(100)),
        'column2': list(range(100)),
        'column3': list(range(100))
    })

    conditions = {'column1': 25}
    conditions_series = pd.Series([25, 25, 25, 25, 25], name='column1')
    model._sample_batch = Mock()
    sampled = pd.DataFrame({
        'column1': [28, 28],
        'column2': [37, 37],
        'column3': [93, 93],
    })
    model._sample_batch.return_value = sampled
    model.fit(data)
    model._metadata = Mock()
    model._metadata.get_fields.return_value = ['column1', 'column2', 'column3']
    model._metadata.transform.return_value = pd.DataFrame()
    model._metadata.make_ids_unique.side_effect = lambda x: x

    # Run
    output = model.sample(5,
                          conditions=conditions,
                          graceful_reject_sampling=True)

    # Assert
    expected_output = pd.DataFrame({
        'column1': [28, 28],
        'column2': [37, 37],
        'column3': [93, 93],
    })
    _, args, kwargs = model._metadata.transform.mock_calls[0]
    pd.testing.assert_series_equal(args[0]['column1'], conditions_series)
    assert kwargs['on_missing_column'] == 'drop'
    model._metadata.transform.assert_called_once()
    model._sample_batch.assert_called_with(5, 100, 10, conditions, None, 0.01)
    pd.testing.assert_frame_equal(output, expected_output)
예제 #5
0
def test_fit_with_unique_constraint_on_data_which_has_index_column():
    """Test that the ``fit`` method runs without error when metadata specifies unique constraint,
    ``fit`` is called on data containing a column named index and other columns.

    The ``fit`` method is expected to fit the model to data,
    taking into account the metadata and the ``Unique`` constraint.

    Setup:
    - The model is passed the unique constraint and
    the primary key column.
    - The unique constraint is set on the ``test_column``

    Input:
    - Data, Unique constraint

    Github Issue:
    - Tests that https://github.com/sdv-dev/SDV/issues/616 does not occur
    """
    # Setup
    test_df = pd.DataFrame({
        "key": [
            1,
            2,
            3,
            4,
            5,
        ],
        "index": [
            "A",
            "B",
            "C",
            "D",
            "E",
        ],
        "test_column": [
            "A1",
            "B2",
            "C3",
            "D4",
            "E5",
        ]
    })
    unique = Unique(column_names=["test_column"])
    model = GaussianCopula(primary_key="key", constraints=[unique])

    # Run
    model.fit(test_df)
    samples = model.sample(2)

    # Assert
    assert len(samples) == 2
    assert samples["test_column"].is_unique
예제 #6
0
def test_conditional_sampling_two_conditions():
    data = pd.DataFrame({
        "column1": [1.0, 0.5, 2.5] * 10,
        "column2": ["a", "b", "c"] * 10,
        "column3": ["d", "e", "f"] * 10
    })

    model = GaussianCopula()
    model.fit(data)
    conditions = {"column2": "b", "column3": "f"}
    samples = model.sample(5, conditions=conditions)
    assert list(samples.column2) == ['b'] * 5
    assert list(samples.column3) == ['f'] * 5
예제 #7
0
def test_conditional_sampling_dataframe():
    data = pd.DataFrame({
        "column1": [1.0, 0.5, 2.5] * 10,
        "column2": ["a", "b", "c"] * 10
    })

    model = GaussianCopula()
    model.fit(data)
    conditions = pd.DataFrame({"column2": ["b", "b", "b", "c", "c"]})
    sampled = model.sample(conditions=conditions)

    assert sampled.shape[0] == len(conditions["column2"])
    assert (sampled["column2"] == np.array(["b", "b", "b", "c", "c"])).all()
예제 #8
0
def test_conditional_sampling_dict():
    data = pd.DataFrame({
        "column1": [1.0, 0.5, 2.5] * 10,
        "column2": ["a", "b", "c"] * 10
    })

    model = GaussianCopula()
    model.fit(data)
    conditions = {"column2": "b"}
    sampled = model.sample(30, conditions=conditions)

    assert sampled.shape == data.shape
    assert set(sampled["column2"].unique()) == set(["b"])
예제 #9
0
def test_conditional_sampling_dict():
    data = pd.DataFrame({
        'column1': [1.0, 0.5, 2.5] * 10,
        'column2': ['a', 'b', 'c'] * 10
    })

    model = GaussianCopula()
    model.fit(data)
    conditions = [Condition({'column2': 'b'}, num_rows=30)]
    sampled = model.sample_conditions(conditions=conditions)

    assert sampled.shape == data.shape
    assert set(sampled['column2'].unique()) == set(['b'])
예제 #10
0
def test_conditional_sampling_two_conditions():
    data = pd.DataFrame({
        'column1': [1.0, 0.5, 2.5] * 10,
        'column2': ['a', 'b', 'c'] * 10,
        'column3': ['d', 'e', 'f'] * 10
    })

    model = GaussianCopula()
    model.fit(data)
    conditions = [Condition({'column2': 'b', 'column3': 'f'}, num_rows=5)]
    samples = model.sample_conditions(conditions=conditions)
    assert list(samples.column2) == ['b'] * 5
    assert list(samples.column3) == ['f'] * 5
예제 #11
0
def test_conditional_sampling_dataframe():
    data = pd.DataFrame({
        'column1': [1.0, 0.5, 2.5] * 10,
        'column2': ['a', 'b', 'c'] * 10
    })

    model = GaussianCopula()
    model.fit(data)
    conditions = pd.DataFrame({'column2': ['b', 'b', 'b', 'c', 'c']})
    sampled = model.sample_remaining_columns(conditions)

    assert sampled.shape[0] == len(conditions['column2'])
    assert (sampled['column2'] == np.array(['b', 'b', 'b', 'c', 'c'])).all()
예제 #12
0
def test_conditional_sampling_numerical():
    data = pd.DataFrame({
        "column1": [1.0, 0.5, 2.5] * 10,
        "column2": ["a", "b", "c"] * 10,
        "column3": ["d", "e", "f"] * 10
    })

    model = GaussianCopula()
    model.fit(data)
    conditions = {
        "column1": 1.0,
    }
    sampled = model.sample(5, conditions=conditions)

    assert list(sampled.column1) == [1.0] * 5
예제 #13
0
def test_conditional_sampling_numerical():
    data = pd.DataFrame({
        'column1': [1.0, 0.5, 2.5] * 10,
        'column2': ['a', 'b', 'c'] * 10,
        'column3': ['d', 'e', 'f'] * 10
    })

    model = GaussianCopula()
    model.fit(data)
    conditions = [Condition({
        'column1': 1.0,
    }, num_rows=5)]
    sampled = model.sample_conditions(conditions=conditions)

    assert list(sampled.column1) == [1.0] * 5
예제 #14
0
def test_fit_with_unique_constraint_on_data_subset():
    """Test that the ``fit`` method runs without error when metadata specifies unique constraint,
    ``fit`` is called on a subset of the original data.

    The ``fit`` method is expected to fit the model to the subset of data,
    taking into account the metadata and the ``Unique`` constraint.

    Setup:
    - The model is passed a ``Unique`` constraint and is
    matched to a subset of the specified data.
    Subdividing the data results in missing indexes in the subset contained in the original data.

    Input:
    - Subset of data, unique constraint

    Github Issue:
    - Tests that https://github.com/sdv-dev/SDV/issues/610 does not occur
    """
    # Setup
    test_df = pd.DataFrame({
        "key": [
            1,
            2,
            3,
            4,
            5,
        ],
        "test_column": [
            "A",
            "B",
            "C",
            "D",
            "E",
        ]
    })
    unique = Unique(column_names=["test_column"])

    test_df = test_df.iloc[[1, 3, 4]]
    model = GaussianCopula(primary_key="key", constraints=[unique])

    # Run
    model.fit(test_df)
    samples = model.sample(2)

    # Assert
    assert len(samples) == 2
    assert samples["test_column"].is_unique
예제 #15
0
def test_ids_only():
    """Ensure that tables that do not contain anything other than id fields can be modeled."""
    ids_only = pd.DataFrame({
        'id': range(10),
        'other_id': range(10),
    })

    model = GaussianCopula(field_types={
        'id': {
            'type': 'id'
        },
        'other_id': {
            'type': 'id'
        }
    })
    model.fit(ids_only)
    sampled = model.sample()

    assert sampled.shape == ids_only.shape
    assert ids_only.equals(sampled)
예제 #16
0
def test_integer_categoricals():
    """Ensure integer categoricals are still sampled as integers.

    The origin of this tests can be found in the github issue #194:
    https://github.com/sdv-dev/SDV/issues/194
    """
    users = load_demo(metadata=False)['users']

    field_types = {
        'age': {
            'type': 'categorical',
        },
    }
    gc = GaussianCopula(field_types=field_types, categorical_transformer='categorical')
    gc.fit(users)

    sampled = gc.sample()

    assert users['age'].dtype == np.int64
    assert sampled['age'].dtype == np.int64
예제 #17
0
def test_recreate():
    data = load_demo(metadata=False)['users']

    # If distribution is non parametric, get_parameters fails
    model = GaussianCopula()
    model.fit(data)
    sampled = model.sample()

    assert sampled.shape == data.shape
    assert (sampled.dtypes == data.dtypes).all()
    assert (sampled.notnull().sum(axis=1) != 0).all()

    # Metadata
    model_meta = GaussianCopula(table_metadata=model.get_metadata())
    model_meta.fit(data)
    sampled = model_meta.sample()

    assert sampled.shape == data.shape
    assert (sampled.dtypes == data.dtypes).all()
    assert (sampled.notnull().sum(axis=1) != 0).all()

    # Metadata dict
    model_meta_dict = GaussianCopula(
        table_metadata=model.get_metadata().to_dict())
    model_meta_dict.fit(data)
    sampled = model_meta_dict.sample()

    assert sampled.shape == data.shape
    assert (sampled.dtypes == data.dtypes).all()
    assert (sampled.notnull().sum(axis=1) != 0).all()
예제 #18
0
파일: test_base.py 프로젝트: csala/SDV
def test_sample_batches_transform_conditions_correctly():
    """Test that transformed conditions are batched correctly.

    The ``Sample`` method is expected to:
    - Return sampled data and call ``_sample_batch`` for every unique transformed
    condition group.

    Input:
    - Number of rows to sample
    - Conditions

    Output:
    - Sampled data
    """
    # Setup
    model = GaussianCopula()
    data = pd.DataFrame({
        'column1': list(range(100)),
        'column2': list(range(100)),
        'column3': list(range(100))
    })

    conditions = {'column1': [25, 25, 25, 30, 30]}
    conditions_series = pd.Series([25, 25, 25, 30, 30], name='column1')
    model._sample_batch = Mock()
    expected_outputs = [
        pd.DataFrame({
            'column1': [25, 25, 25],
            'column2': [37, 37, 37],
            'column3': [93, 93, 93],
        }),
        pd.DataFrame({
            'column1': [30],
            'column2': [37],
            'column3': [93],
        }),
        pd.DataFrame({
            'column1': [30],
            'column2': [37],
            'column3': [93],
        })
    ]
    model._sample_batch.side_effect = expected_outputs
    model.fit(data)
    model._metadata = Mock()
    model._metadata.get_fields.return_value = ['column1', 'column2', 'column3']
    model._metadata.transform.return_value = pd.DataFrame(
        [[50], [50], [50], [60], [70]], columns=['transformed_column'])

    # Run
    model.sample(5, conditions=conditions, graceful_reject_sampling=True)

    # Assert
    _, args, kwargs = model._metadata.transform.mock_calls[0]
    pd.testing.assert_series_equal(args[0]['column1'], conditions_series)
    assert kwargs['on_missing_column'] == 'drop'
    model._metadata.transform.assert_called_once()
    model._sample_batch.assert_any_call(3, 100, 10, {'column1': 25},
                                        {'transformed_column': 50}, 0.01)
    model._sample_batch.assert_any_call(1, 100, 10, {'column1': 30},
                                        {'transformed_column': 60}, 0.01)
    model._sample_batch.assert_any_call(1, 100, 10, {'column1': 30},
                                        {'transformed_column': 70}, 0.01)
예제 #19
0
파일: test_base.py 프로젝트: csala/SDV
def test_conditional_sampling_constraint_uses_columns_model_reject_sampling(
        column_model_mock):
    """Test that the ``sample`` method handles constraints with conditions.

    The ``sample`` method is expected to properly apply constraint
    transformations by sampling the missing columns for the constraint
    if ``fit_columns_model`` is True. All values sampled by the column
    model should be valid because reject sampling is used on any that aren't.

    Setup:
    - The model is being passed a ``GreaterThan`` constraint and then
    asked to sample with one condition. One of the constraint columns is
    the conditioned column. The ``GaussianMultivariate`` class is mocked
    so that the constraint's ``_column_model`` returns some invalid rows
    in order to test that the reject sampling is used.

    Input:
    - Conditions
    Side Effects:
    - Correct columns to condition on are passed to underlying sample method
    """
    # Setup
    constraint = GreaterThan(low='age_joined',
                             high='age',
                             handling_strategy='transform',
                             fit_columns_model=True,
                             drop='high')
    data = pd.DataFrame({
        'age_joined': [22.0, 21.0, 15.0, 18.0, 29.0],
        'age': [27.0, 28.0, 26.0, 21.0, 30.0],
        'experience_years': [6.0, 7.0, 11.0, 3.0, 7.0],
    })
    model = GaussianCopula(constraints=[constraint])
    sampled_conditions = [
        pd.DataFrame({
            'age_joined': [26.0, 18.0, 31.0, 29.0, 32.0],
            'age': [30.0, 30.0, 30.0, 30.0, 30.0]
        }),
        pd.DataFrame({
            'age_joined': [28.0, 33.0, 31.0],
            'age': [30.0, 30.0, 30.0]
        }),
        pd.DataFrame({
            'age_joined': [27.0],
            'age': [30.0]
        })
    ]

    column_model_mock.return_value.sample.side_effect = sampled_conditions
    model.fit(data)

    # Run
    conditions = {'age': 30.0}
    sampled_data = model.sample(5, conditions=conditions)

    # Assert
    assert len(column_model_mock.return_value.sample.mock_calls) == 3

    expected_result = pd.DataFrame({
        'age_joined': [26.0, 18.0, 29.0, 28.0, 27.0],
        'age': [30.0, 30.0, 30.0, 30.0, 30.0]
    })
    pd.testing.assert_frame_equal(
        sampled_data[['age_joined', 'age']],
        expected_result[['age_joined', 'age']],
    )
예제 #20
0
def test_gaussian_copula():
    users = load_demo(metadata=False)['users']

    field_types = {
        'age': {
            'type': 'numerical',
            'subtype': 'integer',
        },
        'country': {
            'type': 'categorical'
        }
    }
    anonymize_fields = {'country': 'country_code'}

    # If distribution is non parametric, get_parameters fails
    gc = GaussianCopula(
        field_names=['user_id', 'country', 'gender', 'age'],
        field_types=field_types,
        primary_key='user_id',
        anonymize_fields=anonymize_fields,
        field_distributions={'age': 'gamma'},
        default_distribution='gaussian_kde',
    )
    gc.fit(users)
    with pytest.raises(NonParametricError):
        parameters = gc.get_parameters()

    # If distribution is parametric, copula can be recreated
    gc = GaussianCopula(
        field_names=['user_id', 'country', 'gender', 'age'],
        field_types=field_types,
        primary_key='user_id',
        anonymize_fields=anonymize_fields,
        field_distributions={'age': 'gamma'},
        default_distribution='bounded',
    )
    gc.fit(users)

    parameters = gc.get_parameters()
    new_gc = GaussianCopula(table_metadata=gc.get_metadata(), )
    new_gc.set_parameters(parameters)

    # Validate sampled dat
    sampled = new_gc.sample()

    # test shape is right
    assert sampled.shape == users.shape

    # test user_id has been generated as an ID field
    assert list(sampled['user_id']) == list(range(0, len(users)))

    # country codes have been replaced with new ones
    assert set(sampled.country.unique()) != set(users.country.unique())

    # Validate metadata
    metadata = gc.get_metadata().to_dict()
    assert metadata['fields'] == {
        'user_id': {
            'type': 'id',
            'subtype': 'integer',
            'transformer': 'integer',
        },
        'country': {
            'type': 'categorical',
            'pii': True,
            'pii_category': 'country_code',
            'transformer': 'one_hot_encoding',
        },
        'gender': {
            'type': 'categorical',
            'transformer': 'one_hot_encoding',
        },
        'age': {
            'type': 'numerical',
            'subtype': 'integer',
            'transformer': 'integer',
        }
    }

    assert 'model_kwargs' in metadata
    assert 'GaussianCopula' in metadata['model_kwargs']
예제 #21
0
파일: test_base.py 프로젝트: csala/SDV
def test_conditional_sampling_constraint_uses_columns_model(gm_mock):
    """Test that the ``sample`` method handles constraints with conditions.

    The ``sample`` method is expected to properly apply constraint
    transformations by sampling the missing columns for the constraint
    if ``fit_columns_model`` is True.

    Setup:
    - The model is being passed a ``UniqueCombination`` constraint and then
    asked to sample with two conditions, one of which the constraint depends on.
    The constraint will sample the columns it needs that are not present in
    the conditions and will then use constraint transformations to meet the
    requirements.

    Input:
    - Conditions
    Side Effects:
    - Correct columns to condition on are passed to underlying sample method
    """
    # Setup
    constraint = UniqueCombinations(
        columns=['city', 'state'],
        handling_strategy='transform',
        fit_columns_model=True,
    )
    data = pd.DataFrame({
        'city': ['LA', 'SF', 'CHI', 'LA', 'LA'],
        'state': ['CA', 'CA', 'IL', 'CA', 'CA'],
        'age': [27, 28, 26, 21, 30]
    })
    model = GaussianCopula(constraints=[constraint],
                           categorical_transformer='label_encoding')
    sampled_numeric_data = [
        pd.DataFrame({
            'city#state': [2],
            'age': [30]
        }),
        pd.DataFrame({
            'city#state': [1, 1, 0, 0, 0],
            'age': [30, 30, 30, 30, 30]
        }),
        pd.DataFrame({
            'city#state': [0, 0, 1, 1, 1],
            'age': [30, 30, 30, 30, 30]
        })
    ]
    gm_mock.return_value.sample.side_effect = sampled_numeric_data
    model.fit(data)

    # Run
    conditions = {'age': 30, 'state': 'CA'}
    sampled_data = model.sample(5, conditions=conditions)

    # Assert
    expected_states = pd.Series(['CA', 'CA', 'CA', 'CA', 'CA'], name='state')
    expected_ages = pd.Series([30, 30, 30, 30, 30], name='age')
    sample_calls = model._model.sample.mock_calls
    assert len(sample_calls) >= 2 and len(sample_calls) <= 3
    assert all(c[2]['conditions']['age'] == 30 for c in sample_calls)
    assert all('city#state' in c[2]['conditions'] for c in sample_calls)
    pd.testing.assert_series_equal(sampled_data['age'], expected_ages)
    pd.testing.assert_series_equal(sampled_data['state'], expected_states)
    assert all(c in ('SF', 'LA') for c in sampled_data['city'])