Example #1
def test_FixedIncrements():
    """Test the ``FixedIncrements`` constraint end to end."""
    # Setup
    values = np.random.randint(1, 10, size=20) * 5
    data = pd.DataFrame({'column': values})
    constraint = FixedIncrements(column_name='column', increment_value=5)
    gc = GaussianCopula(constraints=[constraint])
    gc.fit(data)

    # Run
    sampled = gc.sample(100)

    # Assert
    assert (sampled['column'] % 5 == 0).all()
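For a quick sanity check before fitting, the constraint can also be applied to the training data directly through its is_valid method (the same method Example #2 uses on sampled output). A minimal standalone sketch, assuming the sdv.constraints import path:

import numpy as np
import pandas as pd
from sdv.constraints import FixedIncrements

data = pd.DataFrame({'column': np.random.randint(1, 10, size=20) * 5})
constraint = FixedIncrements(column_name='column', increment_value=5)
# Every training value is a multiple of 5, so every row should be valid.
assert constraint.is_valid(data).all()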
Example #2
def test_constraints(tmpdir):
    # Setup
    employees = load_tabular_demo()
    fixed_company_department_constraint = FixedCombinations(
        column_names=['company', 'department'])
    age_gt_age_when_joined_constraint = Inequality(
        low_column_name='age_when_joined', high_column_name='age')
    age_range_constraint = ScalarRange('age', 29, 50)
    constraints = [
        fixed_company_department_constraint, age_gt_age_when_joined_constraint,
        age_range_constraint
    ]

    # Run
    gc = GaussianCopula(constraints=constraints,
                        min_value=None,
                        max_value=None)
    gc.fit(employees)
    gc.save(tmpdir / 'test.pkl')
    gc = gc.load(tmpdir / 'test.pkl')
    sampled = gc.sample(10)

    # Assert
    assert all(age_gt_age_when_joined_constraint.is_valid(sampled))
    assert all(age_range_constraint.is_valid(sampled))
    assert all(fixed_company_department_constraint.is_valid(sampled))
Example #3
def test_ScalarInequality():
    """Test the ``ScalarInequality`` constraint end to end."""
    # Setup
    data = pd.DataFrame({
        'low': np.random.randint(1, 10, size=20),
    })
    constraint = ScalarInequality(column_name='low', value=11, relation='<')
    gc = GaussianCopula(constraints=[constraint])
    gc.fit(data)

    # Run
    sampled = gc.sample(10)

    # Assert
    assert all(sampled['low'] < 11)
Example #4
def test_Inequality():
    """Test the ``Inequality`` constraint end to end."""
    # Setup
    data = pd.DataFrame({
        'low': np.random.randint(1, 10, size=20),
        'high': np.random.randint(10, 20, size=20)
    })
    constraint = Inequality('low', 'high')
    gc = GaussianCopula(constraints=[constraint])
    gc.fit(data)

    # Run
    sampled = gc.sample(10)

    # Assert
    assert all(sampled['low'] <= sampled['high'])
Example #5
def test_create_custom_constraint():
    """Test the ``create_custom_constraint`` method end to end."""
    # Setup
    custom_constraint = create_custom_constraint(
        lambda _, x: pd.Series(
            [True if x_i > 0 else False for x_i in x['col']]),
        lambda _, x: pd.DataFrame({'col': x['col']**2}),
        lambda _, x: pd.DataFrame({'col': x['col']**.5}))('col')

    data = pd.DataFrame({'col': np.random.randint(1, 10, size=100)})
    gc = GaussianCopula(constraints=[custom_constraint])
    gc.fit(data)

    # Run
    sampled = gc.sample(100)

    # Assert
    assert (sampled['col'] > 0).all()
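The three callables passed to create_custom_constraint above are, in order, the validity check, the forward transform, and the reverse transform; each receives the column names first and the data second. A sketch of the same constraint written with named functions for readability (the function and argument names here are illustrative):

import pandas as pd
from sdv.constraints import create_custom_constraint  # assumed import path

def is_positive(column_names, data):
    # Valid when every value in 'col' is strictly positive.
    return pd.Series(data['col'] > 0)

def square(column_names, data):
    # Forward transform applied to the data before modeling.
    return pd.DataFrame({'col': data['col'] ** 2})

def square_root(column_names, data):
    # Reverse transform applied to the sampled data.
    return pd.DataFrame({'col': data['col'] ** 0.5})

PositiveSquared = create_custom_constraint(is_positive, square, square_root)
constraint = PositiveSquared('col')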
Example #6
def test_ScalarRange():
    """Test the ``ScalarRange`` constraint end to end."""
    # Setup
    data = pd.DataFrame({
        'column': np.random.randint(6, 10, size=20),
    })
    constraint = ScalarRange(column_name='column',
                             low_value=5,
                             high_value=11,
                             strict_boundaries=True)

    gc = GaussianCopula(constraints=[constraint])
    gc.fit(data)

    # Run
    sampled = gc.sample(100)

    # Assert
    assert sampled.column.min() > 5
    assert sampled.column.max() < 11
Example #7
def test_invalid_create_custom_constraint():
    """Test the an invalid ``create_custom_constraint`` method end to end.

    It should correctly sample the synthetic data through reject sample.
    """
    # Setup
    custom_constraint = create_custom_constraint(
        lambda _, x: pd.Series(
            [True if x_i > 0 else False for x_i in x['col']]),
        lambda _: pd.DataFrame({'col': [10 / 0] * 100}),
        lambda _, x: pd.DataFrame({'col': x['col']**.5}))('col')

    data = pd.DataFrame({'col': np.random.randint(1, 10, size=100)})
    gc = GaussianCopula(constraints=[custom_constraint])
    gc.fit(data)

    # Run
    sampled = gc.sample(100)

    # Assert
    assert (sampled['col'] > 0).all()
Example #8
def test_Range():
    """Test the ``Range`` constraint end to end."""
    # Setup
    data = pd.DataFrame({
        'low_column': np.random.randint(1, 5, size=20),
        'middle_column': np.random.randint(6, 10, size=20),
        'high_column': np.random.randint(11, 20, size=20),
    })
    constraint = Range(low_column_name='low_column',
                       middle_column_name='middle_column',
                       high_column_name='high_column',
                       strict_boundaries=True)

    gc = GaussianCopula(constraints=[constraint])
    gc.fit(data)

    # Run
    sampled = gc.sample(100)

    # Assert
    assert (sampled.low_column < sampled.middle_column).all()
    assert (sampled.middle_column < sampled.high_column).all()
Example #9
def generateSyntheticData(ds, mode='', num_sample=5000):
    """Merge x_train and y_train, fit a generative model on the merged
    data, sample a new synthetic dataset, split it back into features
    and target, and save both splits to disk.

    mode must be one of 'gaussian_copula', 'ctGAN' or 'copulaGAN'.
    """
    print(f'generating artificial data for the {ds} dataset using the {mode} model')

    x_train = pd.read_csv('splittedDatasets/x_train_' + ds + '.csv')
    y_train = pd.read_csv('splittedDatasets/y_train_' + ds + '.csv')

    if ds == 'income':
        target = 'class'
    elif ds == 'titanic':
        target = 'Survived'
    elif ds == 'social':
        target = 'Purchased'
    else:
        raise ValueError(f'unknown dataset: {ds}')

    if mode == 'gaussian_copula':
        model = GaussianCopula()
    elif mode == 'ctGAN':
        model = CTGAN()
    elif mode == 'copulaGAN':
        model = CopulaGAN()
    else:
        raise ValueError(f'unknown mode: {mode}')

    df_all = pd.merge(x_train, y_train, left_index=True, right_index=True)
    model.fit(df_all)

    synthetic_data = model.sample(num_sample)
    os.makedirs('generatedData', exist_ok=True)

    synthetic_data[target].to_csv(
        'generatedData/y_train_' + ds + '_' + mode + '.csv', index=False)
    y_train_s = synthetic_data[[target]]

    del synthetic_data[target]
    synthetic_data.to_csv('generatedData/x_train_' + ds + '_' + mode + '.csv',
                          index=False)
    x_train_s = synthetic_data

    make_histos_2(ds, synthetic_data, what=mode)

    return x_train_s, y_train_s
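A usage sketch, assuming the splittedDatasets/ directory produced by an earlier splitting step:

x_train_s, y_train_s = generateSyntheticData('income', mode='gaussian_copula',
                                             num_sample=5000)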
Example #10
def test_constraints_reject_sampling_zero_valid():
    """Ensure everything works if no rows are valid on the first try.

    See https://github.com/sdv-dev/SDV/issues/285
    """
    employees = load_tabular_demo()

    _IS_VALID_CALLED.clear()
    constraint = CustomConstraint(is_valid=_is_valid)

    gc = GaussianCopula(constraints=[constraint])
    gc.fit(employees)
    gc.sample(10)
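_is_valid and _IS_VALID_CALLED are module-level helpers not shown in this example. A sketch of what they might look like, given that the test needs every row to be rejected on the first validity check:

import pandas as pd

_IS_VALID_CALLED = []

def _is_valid(data):
    # Reject all rows on the first call to force a resampling round,
    # then accept everything afterwards.
    if not _IS_VALID_CALLED:
        _IS_VALID_CALLED.append(True)
        return pd.Series(False, index=data.index)
    return pd.Series(True, index=data.index)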
Example #11
def test_constraints(tmpdir):

    employees = load_tabular_demo()

    unique_company_department_constraint = UniqueCombinations(
        columns=['company', 'department'], handling_strategy='transform')

    age_gt_age_when_joined_constraint = GreaterThan(
        low='age_when_joined', high='age', handling_strategy='reject_sampling')

    years_in_the_company_constraint = ColumnFormula(
        column='years_in_the_company',
        formula=years_in_the_company,
        handling_strategy='transform')

    constraints = [
        unique_company_department_constraint,
        age_gt_age_when_joined_constraint, years_in_the_company_constraint
    ]
    gc = GaussianCopula(constraints=constraints)
    gc.fit(employees)
    gc.save(tmpdir / 'test.pkl')
    gc = gc.load(tmpdir / 'test.pkl')
    gc.sample(10)
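years_in_the_company is the formula function assumed by the ColumnFormula constraint; a plausible sketch, given the age and age_when_joined columns of the demo table:

def years_in_the_company(data):
    # Derived column: tenure is the current age minus the age at joining.
    return data['age'] - data['age_when_joined']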
Example #12
def test_failing_constraints():
    data = pd.DataFrame({
        'a': [0, 0, 0, 0, 0, 0, 0],
        'b': [1, -1, 2, -2, 3, -3, 0],
        'c': [-1, -1, -1, -1, -1, -1, -1],
        'd': [1, -1, 2, -2, 3, -3, 5],
        'e': [1, 2, 3, 4, 5, 6, 'a'],
        'f': [1, 1, 2, 2, 3, 3, -1],
        'g': [1, 0, 1, 0, 0, 1, 0],
        'h': [1, 1, 1, 0, 0, 10, 0],
        'i': [1, 1, 1, 1, 1, 1, 1],
        'j': [2, 3, 4, 5, 6, 7, 5.5],
        'k': [1, -1, 2, -2, 3, -3, 5]
    })

    custom_constraint = create_custom_constraint(
        lambda _, x: pd.Series([True if x_i > 0 else False for x_i in x['k']]))
    constraints = [
        Inequality('a', 'b'),
        Positive('c'),
        Negative('d'),
        OneHotEncoding(['g', 'h']),
        Unique(['i']),
        ScalarInequality('j', '>=', 5.5),
        Range('a', 'b', 'c'),
        ScalarRange('a', 0, 0),
        custom_constraint('k')
    ]
    gc = GaussianCopula(constraints=constraints)

    err_msg = re.escape(
        "Data is not valid for the 'Inequality' constraint:"
        '\n   a  b'
        '\n1  0 -1'
        '\n3  0 -2'
        '\n5  0 -3'
        '\n'
        "\nData is not valid for the 'Positive' constraint:"
        '\n   c'
        '\n0 -1'
        '\n1 -1'
        '\n2 -1'
        '\n3 -1'
        '\n4 -1'
        '\n+2 more'
        '\n'
        "\nData is not valid for the 'Negative' constraint:"
        '\n   d'
        '\n0  1'
        '\n2  2'
        '\n4  3'
        '\n6  5'
        '\n'
        "\nData is not valid for the 'OneHotEncoding' constraint:"
        '\n   g   h'
        '\n0  1   1'
        '\n2  1   1'
        '\n3  0   0'
        '\n4  0   0'
        '\n5  1  10'
        '\n+1 more'
        '\n'
        "\nData is not valid for the 'Unique' constraint:"
        '\n   i'
        '\n1  1'
        '\n2  1'
        '\n3  1'
        '\n4  1'
        '\n5  1'
        '\n+1 more'
        '\n'
        "\nData is not valid for the 'ScalarInequality' constraint:"
        '\n     j'
        '\n0  2.0'
        '\n1  3.0'
        '\n2  4.0'
        '\n3  5.0'
        '\n'
        "\nData is not valid for the 'Range' constraint:"
        '\n   a  b  c'
        '\n0  0  1 -1'
        '\n1  0 -1 -1'
        '\n2  0  2 -1'
        '\n3  0 -2 -1'
        '\n4  0  3 -1'
        '\n+2 more'
        '\n'
        "\nData is not valid for the 'ScalarRange' constraint:"
        '\n   a'
        '\n0  0'
        '\n1  0'
        '\n2  0'
        '\n3  0'
        '\n4  0'
        '\n+2 more'
        '\n'
        "\nData is not valid for the 'CustomConstraint' constraint:"
        '\n   k'
        '\n1 -1'
        '\n3 -2'
        '\n5 -3')

    with pytest.raises(MultipleConstraintsErrors, match=err_msg):
        gc.fit(data)
Example #13
def generateGaussianCopulaModel():
    """For each dataset: merge x_train and y_train, fit a GaussianCopula,
    sample a synthetic dataset, save it, then fit a random forest on the
    synthetic data and evaluate it on the original test split."""
    print('generating artificial data for adult income dataset using gaussian model')
    x_train = pd.read_csv('splittedDatasets/x_trainAdult.csv')
    y_train = pd.read_csv('splittedDatasets/y_trainAdult.csv')
    model = GaussianCopula()
    model.fit(pd.merge(x_train, y_train, left_index=True, right_index=True))
    newAdultIncomeData = model.sample(len(x_train) - 1)
    y_train_s = newAdultIncomeData[['class']]
    y_train_s.to_csv('generatedData/y_trainAdultGaussian.csv', index=False)
    del newAdultIncomeData['class']
    newAdultIncomeData.to_csv('generatedData/x_trainAdultGaussian.csv',
                              index=False)
    x_test = pd.read_csv('splittedDatasets/x_testAdult.csv')
    y_test = pd.read_csv('splittedDatasets/y_testAdult.csv')
    print('fitting and testing RF using the new generated data')
    # Fit the classifier on the synthetic training data.
    cf = Classifier(newAdultIncomeData, y_train_s)
    y_prediction = predict(x_test, cf)
    confusion_m, accuracy, report = evaluation(y_test, y_prediction)
    print(confusion_m)
    print("Accuracy:", accuracy)
    print("Precision (macro avg):", report['macro avg']['precision'])
    print("Recall (macro avg):", report['macro avg']['recall'])
    print("F1-score (macro avg):", report['macro avg']['f1-score'])
    printConfusionMatrix(confusion_m, 'AdultIncomeCopula', 'adult')

    print('generating artificial data for Titanic dataset using gaussian model')
    x_train = pd.read_csv('splittedDatasets/x_trainTitanic.csv')
    y_train = pd.read_csv('splittedDatasets/y_trainTitanic.csv')
    model = GaussianCopula()
    model.fit(pd.merge(x_train, y_train, left_index=True, right_index=True))
    newTitanicData = model.sample(len(x_train) - 1)
    y_train_s = newTitanicData[['Survived']]
    y_train_s.to_csv('generatedData/y_trainTitanicGaussian.csv', index=False)
    del newTitanicData['Survived']
    newTitanicData.to_csv('generatedData/x_trainTitanicGaussian.csv',
                          index=False)
    x_test = pd.read_csv('splittedDatasets/x_testTitanic.csv')
    y_test = pd.read_csv('splittedDatasets/y_testTitanic.csv')
    print('fitting and testing RF using the new generated data')
    cf = Classifier(newTitanicData, y_train_s)
    y_prediction = predict(x_test, cf)
    confusion_m, accuracy, report = evaluation(y_test, y_prediction)
    print(confusion_m)
    print("Accuracy:", accuracy)
    print("Precision (macro avg):", report['macro avg']['precision'])
    print("Recall (macro avg):", report['macro avg']['recall'])
    print("F1-score (macro avg):", report['macro avg']['f1-score'])
    printConfusionMatrix(confusion_m, 'titanicCopula', 'titanic')

    print('generating artificial data for Ads dataset using gaussian model')
    x_train = pd.read_csv('splittedDatasets/x_trainAds.csv')
    y_train = pd.read_csv('splittedDatasets/y_trainAds.csv')
    model = GaussianCopula()
    model.fit(pd.merge(x_train, y_train, left_index=True, right_index=True))
    newAdsData = model.sample(len(x_train) - 1)
    y_train_s = newAdsData[['Purchased']]
    y_train_s.to_csv('generatedData/y_trainAdsGaussian.csv', index=False)
    del newAdsData['Purchased']
    newAdsData.to_csv('generatedData/x_trainAdsGaussian.csv', index=False)
    x_test = pd.read_csv('splittedDatasets/x_testAds.csv')
    y_test = pd.read_csv('splittedDatasets/y_testAds.csv')
    print('fitting and testing RF using the new generated data')
    cf = Classifier(newAdsData, y_train_s)
    y_prediction = predict(x_test, cf)
    confusion_m, accuracy, report = evaluation(y_test, y_prediction)
    print(confusion_m)
    print("Accuracy:", accuracy)
    print("Precision (macro avg):", report['macro avg']['precision'])
    print("Recall (macro avg):", report['macro avg']['recall'])
    print("F1-score (macro avg):", report['macro avg']['f1-score'])
    printConfusionMatrix(confusion_m, 'AdsCopula', 'ads')
Example #14
def write(state):
    if state.trained_model is not None:

        X_before_preprocess = state.X_before_preprocess
        target_name = state.y_before_preprocess
        df_X = X_before_preprocess.drop(target_name, axis=1)
        trained_model = state.trained_model
        min_value = X_before_preprocess[target_name].min()
        max_value = X_before_preprocess[target_name].max()
        mean_value = X_before_preprocess[target_name].mean()
        original_value = optimal_value = mean_value

        st.header("Knowledge Generation and Backward Analysis.")
        with st.beta_expander("Knowledge Generation"):
            st.markdown(
                '<p style="color:#1386fc">Please Select a Method to Generate Data.</p>',
                unsafe_allow_html=True)
            sdv_method = st.selectbox(
                'Method to Generate Data',
                options=['GaussianCopula', 'CTGAN', 'CopulaGAN', 'TVAE'])
            sample = st.number_input('How Many Samples of Data to Generate?',
                                     min_value=1,
                                     value=df_X.shape[0],
                                     key='gen_sample')

            if sdv_method == 'GaussianCopula':
                model = GaussianCopula()
            else:
                is_tune = st.checkbox("Do You Want to Tune Hyperparameters?",
                                      value=False)
                if sdv_method == 'CopulaGAN' or sdv_method == 'CTGAN':
                    epochs = 300
                    batch_size = 500
                    log_frequency = True
                    embedding_dim = 128
                    generator_dim = (256, 256)
                    discriminator_dim = (256, 256)
                    generator_lr = 0.0002
                    generator_decay = 1e-6
                    discriminator_lr = 0.0002
                    discriminator_decay = 1e-6
                    discriminator_steps = 1

                    if is_tune:
                        epochs = st.number_input(
                            "Number of Training Epochs (int)",
                            min_value=1,
                            value=300,
                            key='gan_epochs')
                        batch_size = st.number_input(
                            "Number of Data Samples to Process, should be a multiple of 10 (int)",
                            min_value=1,
                            value=500,
                            key='gan_batch_size')
                        log_frequency = st.checkbox(
                            'Whether to Use Log Frequency', value=True)
                        embedding_dim = st.number_input(
                            "Size of the Random Sample Passed to the Generator (int)",
                            min_value=1,
                            value=128,
                            key='gan_embedding_dim')
                        generator_dim = st.text_input(
                            "Size of the Generator Residual Layer (int)",
                            value="256,256")
                        discriminator_dim = st.text_input(
                            "Size of the Discriminator Residual Layer (int)",
                            value="256,256")
                        generator_lr = st.number_input(
                            "Learning Rate for the Generator",
                            min_value=0.0,
                            value=0.0002,
                            format="%e")
                        generator_decay = st.number_input(
                            "Generator Weight Decay for the Adam Optimizer",
                            min_value=0.0,
                            value=1e-6,
                            format="%e")
                        discriminator_lr = st.number_input(
                            "Learning Rate for the Discriminator",
                            min_value=0.0,
                            value=0.0002,
                            format="%e")
                        discriminator_decay = st.number_input(
                            "Discriminator  Weight Decay for the Adam Optimizer",
                            min_value=0.0,
                            value=1e-6,
                            format="%e")
                        discriminator_steps = st.number_input(
                            "Number of Discriminator Updates to do for Each Generator Update (int)",
                            min_value=1,
                            value=1)

                        generator_dim = convert_str_to_list(generator_dim)
                        discriminator_dim = convert_str_to_list(
                            discriminator_dim)
                    if sdv_method == 'CopulaGAN':
                        model = CopulaGAN(
                            epochs=epochs,
                            batch_size=batch_size,
                            log_frequency=log_frequency,
                            embedding_dim=embedding_dim,
                            generator_dim=generator_dim,
                            discriminator_dim=discriminator_dim,
                            generator_lr=generator_lr,
                            generator_decay=generator_decay,
                            discriminator_lr=discriminator_lr,
                            discriminator_decay=discriminator_decay,
                            discriminator_steps=discriminator_steps)
                    if sdv_method == 'CTGAN':
                        model = CTGAN(epochs=epochs,
                                      batch_size=batch_size,
                                      log_frequency=log_frequency,
                                      embedding_dim=embedding_dim,
                                      generator_dim=generator_dim,
                                      discriminator_dim=discriminator_dim,
                                      generator_lr=generator_lr,
                                      generator_decay=generator_decay,
                                      discriminator_lr=discriminator_lr,
                                      discriminator_decay=discriminator_decay,
                                      discriminator_steps=discriminator_steps)
                else:
                    compress_dims = decompress_dims = (128, 128)
                    epochs = 300
                    batch_size = 500
                    embedding_dim = 128
                    l2_scale = 1e-5
                    if is_tune:
                        epochs = st.number_input(
                            "Number of Training Epochs (int)",
                            min_value=1,
                            value=300,
                            key='tvae_epochs')
                        batch_size = st.number_input(
                            "Number of Data Samples to Process, should be a multiple of 10 (int)",
                            min_value=1,
                            value=500,
                            key='tvae_batch_size')
                        embedding_dim = st.number_input(
                            "Size of the Random Sample Passed to the Generator (int)",
                            min_value=1,
                            value=128,
                            key='tvae_embedding_dim')
                        compress_dims = st.text_input(
                            "Size of Each Hidden Layer in the Encoder (int)",
                            value="128,128")
                        decompress_dims = st.text_input(
                            "Size of Each Hidden Layer in the Decoder (int)",
                            value="128,128")
                        l2_scale = st.number_input("Regularization term",
                                                   min_value=0.0,
                                                   value=1e-5,
                                                   format="%e")

                        compress_dims = convert_str_to_list(compress_dims)
                        decompress_dims = convert_str_to_list(decompress_dims)
                    model = TVAE(embedding_dim=embedding_dim,
                                 compress_dims=compress_dims,
                                 decompress_dims=decompress_dims,
                                 l2scale=l2_scale,
                                 batch_size=batch_size,
                                 epochs=epochs)

            button_generate = st.button("Generate")
            if button_generate:
                with st.spinner("Generating..."):
                    model.fit(df_X)
                    new_data = model.sample(sample)
                    new_data_prediction = predict_model(
                        trained_model, new_data)
                    st.write(new_data_prediction)
                    state.new_data_prediction = new_data_prediction

            button_download = st.button("Download Generated Data")
            if button_download:
                file_extension = st.selectbox(
                    "Choose Csv or Excel File to Download",
                    options=[".csv", ".xlsx"])
                file_name = st.text_input("File Name",
                                          value="prediction",
                                          key='file_name')
                if file_name:
                    href = download_button(state.new_data_prediction,
                                           file_name, "Download",
                                           file_extension)
                    st.markdown(href, unsafe_allow_html=True)
                else:
                    st.error("File Name cannot be empty!")

        st.markdown("---")
        with st.beta_expander("Backward Analysis"):
            col1, col2 = st.beta_columns(2)
            with col1:
                st.subheader("Please Select a Index for Data to Optimize")
                index = st.number_input("Index of Data",
                                        min_value=0,
                                        value=0,
                                        max_value=df_X.shape[0] - 1,
                                        key='data_index')
                st.write(X_before_preprocess.iloc[index])
                original_value = X_before_preprocess.iloc[index].loc[
                    target_name]
                # st.write(original_value)
            with col2:
                st.subheader("Optimize")
                lower_bound = st.number_input(
                    "The Lower Bound Value to Optimize", value=min_value)
                upper_bound = st.number_input(
                    "The Upper Bound Value to Optimize", value=max_value)
                button_optimize = st.button("Optimize")
                if button_optimize:
                    if state.new_data_prediction is not None:
                        new_prediction = state.new_data_prediction['Label']
                        indices = find_top_5_nearest(new_prediction,
                                                     original_value)
                        optimal_value = new_prediction[indices[0]]
                        state.suggest_indices = indices
                        state.optimal_value = optimal_value
                    else:
                        st.error("Please Generate New Data first!")

        with st.beta_container():
            state.optimal_value = state.optimal_value if state.optimal_value is not None else 0
            fig = gauge_plot(original_value, state.optimal_value, lower_bound,
                             upper_bound, min_value, max_value)
            st.plotly_chart(fig)
            button_suggest = st.button("Show the Top 5 Suggestions")
            if button_suggest:
                suggestion = state.new_data_prediction.iloc[
                    state.suggest_indices[:5]]
                st.table(suggestion)
    else:
        st.error("Please Train a Model first!")
Example #15
File: tabular.py Project: sdv-dev/SDV
class TabularPreset():
    """Class for all tabular model presets.

    Args:
        name (str):
            The preset to use.
        metadata (dict or metadata.Table):
            Table metadata instance or dict representation.
        constraints (list[Constraint, dict]):
            List of Constraint objects or dicts.
    """

    _model = None
    _null_percentages = None
    _null_column = False
    _default_model = GaussianCopula

    def __init__(self, name=None, metadata=None, constraints=None):
        if name is None:
            raise ValueError(
                'You must provide the name of a preset using the `name` '
                'parameter. Use `TabularPreset.list_available_presets()` to browse '
                'through the options.')
        if name not in PRESETS:
            raise ValueError(f'`name` must be one of {PRESETS}.')

        self.name = name

        if metadata is None:
            warnings.warn(
                'No metadata provided. Metadata will be automatically '
                'detected from your data. This process may not be accurate. '
                'We recommend writing metadata to ensure correct data handling.'
            )

        if metadata is not None and isinstance(metadata, Table):
            metadata = metadata.to_dict()

        if metadata is not None and constraints is not None:
            metadata['constraints'] = []
            for constraint in constraints:
                metadata['constraints'].append(constraint.to_dict())

            constraints = None

        if name == FAST_ML_PRESET:
            self._model = GaussianCopula(
                table_metadata=metadata,
                constraints=constraints,
                categorical_transformer='categorical_fuzzy',
                default_distribution='gaussian',
                rounding=None,
            )

            # Decide if transformers should model the null column or not.
            self._null_column = constraints is not None
            if metadata is not None:
                self._null_column = len(metadata.get('constraints', [])) > 0

            # If transformers should model the null column, pass None to let each transformer
            # decide if it's necessary or not.
            transformer_null_column = None if self._null_column else False

            dtype_transformers = {
                'i':
                rdt.transformers.NumericalTransformer(
                    dtype=np.int64,
                    nan='mean' if self._null_column else None,
                    null_column=transformer_null_column,
                    min_value='auto',
                    max_value='auto',
                ),
                'f':
                rdt.transformers.NumericalTransformer(
                    dtype=np.float64,
                    nan='mean' if self._null_column else None,
                    null_column=transformer_null_column,
                    min_value='auto',
                    max_value='auto',
                ),
                'O':
                rdt.transformers.CategoricalTransformer(fuzzy=True),
                'b':
                rdt.transformers.BooleanTransformer(
                    nan=-1 if self._null_column else None,
                    null_column=transformer_null_column,
                ),
                'M':
                rdt.transformers.DatetimeTransformer(
                    nan='mean' if self._null_column else None,
                    null_column=transformer_null_column,
                ),
            }
            self._model._metadata._dtype_transformers.update(
                dtype_transformers)

    def fit(self, data):
        """Fit this model to the data.

        Args:
            data (pandas.DataFrame):
                Data to fit the model to.
        """
        if not self._null_column:
            self._null_percentages = {}

            for column, column_data in data.items():
                num_nulls = column_data.isna().sum()
                if num_nulls > 0:
                    # Store null percentage for future reference.
                    self._null_percentages[column] = num_nulls / len(
                        column_data)

        self._model.fit(data)

    def _postprocess_sampled(self, sampled):
        """Postprocess the sampled data.

        Add null values back based on null percentages captured in the fitting process.

        Args:
            sampled (pandas.DataFrame):
                The sampled data to postprocess.

        Returns:
            pandas.DataFrame
        """
        if self._null_percentages:
            for column, percentage in self._null_percentages.items():
                sampled[column] = sampled[column].mask(
                    np.random.random((len(sampled), )) < percentage)

        return sampled

    def sample(self,
               num_rows,
               randomize_samples=True,
               batch_size=None,
               output_file_path=None,
               conditions=None):
        """Sample rows from this table.

        Args:
            num_rows (int):
                Number of rows to sample. This parameter is required.
            randomize_samples (bool):
                Whether or not to use a fixed seed when sampling. Defaults
                to True.
            batch_size (int or None):
                The batch size to sample. Defaults to `num_rows`, if None.
            output_file_path (str or None):
                The file to periodically write sampled rows to. If None, does not
                write rows anywhere.
            conditions:
                Deprecated argument. Use the `sample_conditions` method with
                `sdv.sampling.Condition` objects instead.

        Returns:
            pandas.DataFrame:
                Sampled data.
        """
        sampled = self._model.sample(num_rows, randomize_samples, batch_size,
                                     output_file_path, conditions)

        return self._postprocess_sampled(sampled)

    def sample_conditions(self,
                          conditions,
                          max_tries=100,
                          batch_size_per_try=None,
                          randomize_samples=True,
                          output_file_path=None):
        """Sample rows from this table with the given conditions.

        Args:
            conditions (list[sdv.sampling.Condition]):
                A list of sdv.sampling.Condition objects, which specify the column
                values in a condition, along with the number of rows for that
                condition.
            max_tries (int):
                Number of times to try sampling discarded rows. Defaults to 100.
            batch_size_per_try (int):
                The batch size to use per attempt at sampling. Defaults to 10 times
                the number of rows.
            randomize_samples (bool):
                Whether or not to use a fixed seed when sampling. Defaults
                to True.
            output_file_path (str or None):
                The file to periodically write sampled rows to. Defaults to
                a temporary file, if None.

        Returns:
            pandas.DataFrame:
                Sampled data.
        """
        if isinstance(self._model, GaussianCopula):
            sampled = self._model.sample_conditions(
                conditions,
                batch_size=batch_size_per_try,
                randomize_samples=randomize_samples,
                output_file_path=output_file_path,
            )
        else:
            sampled = self._model.sample_conditions(conditions, max_tries,
                                                    batch_size_per_try,
                                                    randomize_samples,
                                                    output_file_path)

        return self._postprocess_sampled(sampled)

    def sample_remaining_columns(self,
                                 known_columns,
                                 max_tries=100,
                                 batch_size_per_try=None,
                                 randomize_samples=True,
                                 output_file_path=None):
        """Sample rows from this table.

        Args:
            known_columns (pandas.DataFrame):
                A pandas.DataFrame with the columns that are already known. The output
                is a DataFrame such that each row in the output is sampled
                conditionally on the corresponding row in the input.
            max_tries (int):
                Number of times to try sampling discarded rows. Defaults to 100.
            batch_size_per_try (int):
                The batch size to use per attempt at sampling. Defaults to 10 times
                the number of rows.
            randomize_samples (bool):
                Whether or not to use a fixed seed when sampling. Defaults
                to True.
            output_file_path (str or None):
                The file to periodically write sampled rows to. Defaults to
                a temporary file, if None.

        Returns:
            pandas.DataFrame:
                Sampled data.
        """
        if isinstance(self._model, GaussianCopula):
            sampled = self._model.sample_remaining_columns(
                known_columns,
                batch_size=batch_size_per_try,
                randomize_samples=randomize_samples,
                output_file_path=output_file_path,
            )
        else:
            sampled = self._model.sample_remaining_columns(
                known_columns, max_tries, batch_size_per_try,
                randomize_samples, output_file_path)

        return self._postprocess_sampled(sampled)

    def save(self, path):
        """Save this model instance to the given path using pickle.

        Args:
            path (str):
                Path where the SDV instance will be serialized.
        """
        self._package_versions = get_package_versions(
            getattr(self, '_model', None))

        with open(path, 'wb') as output:
            pickle.dump(self, output)

    @classmethod
    def load(cls, path):
        """Load a TabularModel instance from a given path.

        Args:
            path (str):
                Path from which to load the instance.

        Returns:
            TabularPreset:
                The loaded tabular preset.
        """
        with open(path, 'rb') as f:
            model = pickle.load(f)
            throw_version_mismatch_warning(
                getattr(model, '_package_versions', None))

            return model

    @classmethod
    def list_available_presets(cls, out=sys.stdout):
        """List the available presets and their descriptions."""
        out.write(
            f'Available presets:\n{PRESETS}\n\n'
            'Supply the desired preset using the `name` parameter.\n\n'
            'Have any requests for custom presets? Contact the SDV team to '
            'learn more about an SDV Premium license.\n')

    def __repr__(self):
        """Represent tabular preset instance as text.

        Returns:
            str
        """
        return f'TabularPreset(name={self.name})'
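A minimal usage sketch, assuming the FAST_ML preset name and the sdv.lite import path; metadata_dict and real_data are placeholders:

from sdv.lite import TabularPreset  # assumed import path

preset = TabularPreset(name='FAST_ML', metadata=metadata_dict)
preset.fit(real_data)
synthetic = preset.sample(num_rows=100)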
Example #16
#%%
from sdv import Metadata
from sdv.relational import HMA1
from sdv.tabular import GaussianCopula
from sdv.timeseries import PAR
import numpy as np
import pandas as pd
from base_data import *

data = create_transactions(50)
base_model = GaussianCopula()
base_model.fit(data)
transactions = base_model.sample(1000)

transactions.to_csv('Transactions.csv', index=False)

# Products
products = pd.DataFrame(transactions['Product id'].drop_duplicates())
products['Product group'] = products['Product id'].apply(
    lambda x: x.split('-')[0])
products['Product cost'] = [
    round(x, 3)
    for x in np.random.gamma(shape=10, scale=5, size=products.shape[0])
]
products['Product inventory unit'] = 'st'
products['Product available in stock'] = [
    round(max(0, x)) for x in np.random.triangular(
        left=-50, mode=50, right=120, size=products.shape[0])
]

products.to_csv('Products.csv', index=False)
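The Metadata, HMA1 and PAR imports above are unused in this cell and presumably serve later steps. A hedged sketch of how the two tables could be linked and modeled relationally with the old SDV API (the add_table/fit arguments here are assumptions):

metadata = Metadata()
metadata.add_table(name='products', data=products, primary_key='Product id')
metadata.add_table(name='transactions', data=transactions,
                   parent='products', foreign_key='Product id')

relational_model = HMA1(metadata)
relational_model.fit({'products': products, 'transactions': transactions})
synthetic_tables = relational_model.sample()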