def test_FixedIncrements():
    """Test the ``FixedIncrements`` constraint end to end."""
    # Setup
    values = np.random.randint(1, 10, size=20) * 5
    data = pd.DataFrame({'column': values})
    constraint = FixedIncrements(column_name='column', increment_value=5)
    gc = GaussianCopula(constraints=[constraint])
    gc.fit(data)

    # Run
    sampled = gc.sample(100)

    # Assert
    assert all(sampled % 5 == 0)
def test_constraints(tmpdir):
    # Setup
    employees = load_tabular_demo()

    fixed_company_department_constraint = FixedCombinations(
        column_names=['company', 'department'])
    age_gt_age_when_joined_constraint = Inequality(
        low_column_name='age_when_joined',
        high_column_name='age')
    age_range_constraint = ScalarRange('age', 29, 50)

    constraints = [
        fixed_company_department_constraint,
        age_gt_age_when_joined_constraint,
        age_range_constraint
    ]

    # Run
    gc = GaussianCopula(constraints=constraints, min_value=None, max_value=None)
    gc.fit(employees)

    gc.save(tmpdir / 'test.pkl')
    gc = gc.load(tmpdir / 'test.pkl')
    sampled = gc.sample(10)

    # Assert
    assert all(age_gt_age_when_joined_constraint.is_valid(sampled))
    assert all(age_range_constraint.is_valid(sampled))
    assert all(fixed_company_department_constraint.is_valid(sampled))
def test_ScalarInequality():
    """Test the ``ScalarInequality`` constraint end to end."""
    # Setup
    data = pd.DataFrame({
        'low': np.random.randint(1, 10, size=20),
    })
    constraint = ScalarInequality(column_name='low', value=11, relation='<')
    gc = GaussianCopula(constraints=[constraint])
    gc.fit(data)

    # Run
    sampled = gc.sample(10)

    # Assert
    assert all(sampled['low'] < 11)
def test_Inequality():
    """Test the ``Inequality`` constraint end to end."""
    # Setup
    data = pd.DataFrame({
        'low': np.random.randint(1, 10, size=20),
        'high': np.random.randint(10, 20, size=20)
    })
    constraint = Inequality('low', 'high')
    gc = GaussianCopula(constraints=[constraint])
    gc.fit(data)

    # Run
    sampled = gc.sample(10)

    # Assert
    assert all(sampled['low'] <= sampled['high'])
def test_create_custom_constraint():
    """Test the ``create_custom_constraint`` method end to end."""
    # Setup
    custom_constraint = create_custom_constraint(
        lambda _, x: pd.Series([True if x_i > 0 else False for x_i in x['col']]),
        lambda _, x: pd.DataFrame({'col': x['col']**2}),
        lambda _, x: pd.DataFrame({'col': x['col']**.5})
    )('col')
    data = pd.DataFrame({'col': np.random.randint(1, 10, size=100)})
    gc = GaussianCopula(constraints=[custom_constraint])
    gc.fit(data)

    # Run
    sampled = gc.sample(100)

    # Assert
    assert all(sampled > 0)
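For readability, the same custom constraint can be spelled out with named functions. The argument order (validity check, transform, reverse transform) and the ``(column_names, data)`` signature are inferred from the lambdas in the test above, so treat this as an illustrative sketch rather than the canonical API; the names are made up.

def is_positive(column_names, data):
    # Valid rows are those where 'col' is strictly positive.
    return pd.Series(data['col'] > 0)


def square(column_names, data):
    # Forward transform applied before modeling.
    return pd.DataFrame({'col': data['col'] ** 2})


def unsquare(column_names, data):
    # Reverse transform applied to the sampled data.
    return pd.DataFrame({'col': data['col'] ** 0.5})


PositiveColumn = create_custom_constraint(is_positive, square, unsquare)
positive_col_constraint = PositiveColumn('col')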
def test_ScalarRange():
    """Test the ``ScalarRange`` constraint end to end."""
    # Setup
    data = pd.DataFrame({
        'column': np.random.randint(6, 10, size=20),
    })
    constraint = ScalarRange(column_name='column', low_value=5, high_value=11,
                             strict_boundaries=True)
    gc = GaussianCopula(constraints=[constraint])
    gc.fit(data)

    # Run
    sampled = gc.sample(100)

    # Assert
    assert sampled.column.min() >= 5
    assert sampled.column.max() <= 11
def test_invalid_create_custom_constraint():
    """Test an invalid ``create_custom_constraint`` end to end.

    It should still sample valid synthetic data through reject sampling.
    """
    # Setup
    custom_constraint = create_custom_constraint(
        lambda _, x: pd.Series([True if x_i > 0 else False for x_i in x['col']]),
        lambda _: pd.DataFrame({'col': [10 / 0] * 100}),
        lambda _, x: pd.DataFrame({'col': x['col']**.5})
    )('col')
    data = pd.DataFrame({'col': np.random.randint(1, 10, size=100)})
    gc = GaussianCopula(constraints=[custom_constraint])
    gc.fit(data)

    # Run
    sampled = gc.sample(100)

    # Assert
    assert all(sampled > 0)
def test_Range():
    """Test the ``Range`` constraint end to end."""
    # Setup
    data = pd.DataFrame({
        'low_column': np.random.randint(1, 5, size=20),
        'middle_column': np.random.randint(6, 10, size=20),
        'high_column': np.random.randint(11, 20, size=20),
    })
    constraint = Range(low_column_name='low_column',
                       middle_column_name='middle_column',
                       high_column_name='high_column',
                       strict_boundaries=True)
    gc = GaussianCopula(constraints=[constraint])
    gc.fit(data)

    # Run
    sampled = gc.sample(100)

    # Assert
    assert sampled.middle_column.min() >= sampled.low_column.min()
    assert sampled.middle_column.max() <= sampled.high_column.max()
def generateSyntheticData(ds, mode='', num_sample=5000):
    # Merge x_train and y_train, fit the chosen generative model on the merged
    # data, sample a new synthetic dataset and split it back into features and
    # target; the classifier is then fitted and tested on x_test / y_test by the
    # calling code.
    # mode must be one of 'gaussian_copula', 'ctGAN' or 'copulaGAN'.
    print('generating artificial data for ', ds, ' dataset using ', mode, ' model ')
    x_train = pd.read_csv('splittedDatasets/x_train_' + ds + '.csv')
    y_train = pd.read_csv('splittedDatasets/y_train_' + ds + '.csv')

    if ds == 'income':
        target = 'class'
    if ds == 'titanic':
        target = 'Survived'
    if ds == 'social':
        target = 'Purchased'

    if mode == 'gaussian_copula':
        model = GaussianCopula()
    elif mode == 'ctGAN':
        model = CTGAN()
    elif mode == 'copulaGAN':
        model = CopulaGAN()

    df_all = pd.merge(x_train, y_train, left_index=True, right_index=True)
    model.fit(df_all)
    synthetic_data = model.sample(num_sample)

    if not os.path.isdir('generatedData'):
        os.system('mkdir generatedData')

    synthetic_data[target].to_csv('generatedData/y_train_' + ds + '_' + mode + '.csv',
                                  index=False)
    y_train_s = synthetic_data[[target]]
    del synthetic_data[target]
    synthetic_data.to_csv('generatedData/x_train_' + ds + '_' + mode + '.csv',
                          index=False)
    x_train_s = synthetic_data
    make_histos_2(ds, synthetic_data, what=mode)

    return x_train_s, y_train_s
def test_constraints_reject_sampling_zero_valid():
    """Ensure everything works if no rows are valid on the first try.

    See https://github.com/sdv-dev/SDV/issues/285
    """
    employees = load_tabular_demo()

    _IS_VALID_CALLED.clear()
    constraint = CustomConstraint(is_valid=_is_valid)
    gc = GaussianCopula(constraints=[constraint])
    gc.fit(employees)
    gc.sample(10)
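The module-level helpers `_IS_VALID_CALLED` and `_is_valid` referenced above are not shown in this snippet. A plausible minimal definition, assuming the intent is simply to reject every row on the first call so that reject sampling is forced to retry, could look like this (the exact upstream implementation may differ):

_IS_VALID_CALLED = []


def _is_valid(data):
    # Report every row as invalid on the first call only, forcing one full
    # round of reject sampling; afterwards accept everything.
    if not _IS_VALID_CALLED:
        _IS_VALID_CALLED.append(True)
        return [False] * len(data)

    return [True] * len(data)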
def test_constraints(tmpdir):
    employees = load_tabular_demo()

    unique_company_department_constraint = UniqueCombinations(
        columns=['company', 'department'],
        handling_strategy='transform')

    age_gt_age_when_joined_constraint = GreaterThan(
        low='age_when_joined',
        high='age',
        handling_strategy='reject_sampling')

    years_in_the_company_constraint = ColumnFormula(
        column='years_in_the_company',
        formula=years_in_the_company,
        handling_strategy='transform')

    constraints = [
        unique_company_department_constraint,
        age_gt_age_when_joined_constraint,
        years_in_the_company_constraint
    ]

    gc = GaussianCopula(constraints=constraints)
    gc.fit(employees)

    gc.save(tmpdir / 'test.pkl')
    gc = gc.load(tmpdir / 'test.pkl')
    gc.sample(10)
def test_failing_constraints():
    data = pd.DataFrame({
        'a': [0, 0, 0, 0, 0, 0, 0],
        'b': [1, -1, 2, -2, 3, -3, 0],
        'c': [-1, -1, -1, -1, -1, -1, -1],
        'd': [1, -1, 2, -2, 3, -3, 5],
        'e': [1, 2, 3, 4, 5, 6, 'a'],
        'f': [1, 1, 2, 2, 3, 3, -1],
        'g': [1, 0, 1, 0, 0, 1, 0],
        'h': [1, 1, 1, 0, 0, 10, 0],
        'i': [1, 1, 1, 1, 1, 1, 1],
        'j': [2, 3, 4, 5, 6, 7, 5.5],
        'k': [1, -1, 2, -2, 3, -3, 5]
    })
    custom_constraint = create_custom_constraint(
        lambda _, x: pd.Series([True if x_i > 0 else False for x_i in x['k']]))

    constraints = [
        Inequality('a', 'b'),
        Positive('c'),
        Negative('d'),
        OneHotEncoding(['g', 'h']),
        Unique(['i']),
        ScalarInequality('j', '>=', 5.5),
        Range('a', 'b', 'c'),
        ScalarRange('a', 0, 0),
        custom_constraint('k')
    ]
    gc = GaussianCopula(constraints=constraints)

    err_msg = re.escape(
        "Data is not valid for the 'Inequality' constraint:"
        '\n a b'
        '\n1 0 -1'
        '\n3 0 -2'
        '\n5 0 -3'
        '\n'
        "\nData is not valid for the 'Positive' constraint:"
        '\n c'
        '\n0 -1'
        '\n1 -1'
        '\n2 -1'
        '\n3 -1'
        '\n4 -1'
        '\n+2 more'
        '\n'
        "\nData is not valid for the 'Negative' constraint:"
        '\n d'
        '\n0 1'
        '\n2 2'
        '\n4 3'
        '\n6 5'
        '\n'
        "\nData is not valid for the 'OneHotEncoding' constraint:"
        '\n g h'
        '\n0 1 1'
        '\n2 1 1'
        '\n3 0 0'
        '\n4 0 0'
        '\n5 1 10'
        '\n+1 more'
        '\n'
        "\nData is not valid for the 'Unique' constraint:"
        '\n i'
        '\n1 1'
        '\n2 1'
        '\n3 1'
        '\n4 1'
        '\n5 1'
        '\n+1 more'
        '\n'
        "\nData is not valid for the 'ScalarInequality' constraint:"
        '\n j'
        '\n0 2.0'
        '\n1 3.0'
        '\n2 4.0'
        '\n3 5.0'
        '\n'
        "\nData is not valid for the 'Range' constraint:"
        '\n a b c'
        '\n0 0 1 -1'
        '\n1 0 -1 -1'
        '\n2 0 2 -1'
        '\n3 0 -2 -1'
        '\n4 0 3 -1'
        '\n+2 more'
        '\n'
        "\nData is not valid for the 'ScalarRange' constraint:"
        '\n a'
        '\n0 0'
        '\n1 0'
        '\n2 0'
        '\n3 0'
        '\n4 0'
        '\n+2 more'
        '\n'
        "\nData is not valid for the 'CustomConstraint' constraint:"
        '\n k'
        '\n1 -1'
        '\n3 -2'
        '\n5 -3'
    )

    with pytest.raises(MultipleConstraintsErrors, match=err_msg):
        gc.fit(data)
def generateGaussianCopulaModel():
    # Merge x_train and y_train, generate a new dataset with a Gaussian copula,
    # split the new data again, then fit the classifier and test it using
    # x_test and y_test.
    print('generating artificial data for adult income dataset using gaussian model')
    x_train = pd.read_csv('splittedDatasets/x_trainAdult.csv')
    y_train = pd.read_csv('splittedDatasets/y_trainAdult.csv')
    model = GaussianCopula()
    model.fit(pd.merge(x_train, y_train, left_index=True, right_index=True))
    newAdultIncomeData = model.sample(len(x_train) - 1)
    newAdultIncomeData['class'].to_csv('generatedData/y_trainAdultGaussian.csv', index=False)
    del newAdultIncomeData['class']
    newAdultIncomeData.to_csv('generatedData/x_trainAdultGaussian.csv', index=False)
    x_test = pd.read_csv('splittedDatasets/x_testAdult.csv')
    y_test = pd.read_csv('splittedDatasets/y_testAdult.csv')
    print('fitting and testing RF using the new generated data')
    cf = Classifier(x_train, y_train)
    y_prediction = predict(x_test, cf)
    confusion_m, accuracy, report = evaluation(y_test, y_prediction)
    print(confusion_m)
    print("Accuracy:", accuracy)
    print("Precision (macro avg):", report['macro avg']['precision'])
    print("Recall (macro avg):", report['macro avg']['recall'])
    print("F1-score (macro avg):", report['macro avg']['f1-score'])
    printConfusionMatrix(confusion_m, 'AdultIncomeCopula', 'adult')

    print('generating artificial data for Titanic dataset using gaussian model')
    x_train = pd.read_csv('splittedDatasets/x_trainTitanic.csv')
    y_train = pd.read_csv('splittedDatasets/y_trainTitanic.csv')
    model = GaussianCopula()
    model.fit(pd.merge(x_train, y_train, left_index=True, right_index=True))
    newAdultIncomeData = model.sample(len(x_train) - 1)
    newAdultIncomeData['Survived'].to_csv('generatedData/y_trainTitanicGaussian.csv', index=False)
    del newAdultIncomeData['Survived']
    newAdultIncomeData.to_csv('generatedData/x_trainTitanicGaussian.csv', index=False)
    x_test = pd.read_csv('splittedDatasets/x_testTitanic.csv')
    y_test = pd.read_csv('splittedDatasets/y_testTitanic.csv')
    print('fitting and testing RF using the new generated data')
    cf = Classifier(x_train, y_train)
    y_prediction = predict(x_test, cf)
    confusion_m, accuracy, report = evaluation(y_test, y_prediction)
    print(confusion_m)
    print("Accuracy:", accuracy)
    print("Precision (macro avg):", report['macro avg']['precision'])
    print("Recall (macro avg):", report['macro avg']['recall'])
    print("F1-score (macro avg):", report['macro avg']['f1-score'])
    printConfusionMatrix(confusion_m, 'titanicCopula', 'titanic')

    print('generating artificial data for Ads dataset using gaussian model')
    x_train = pd.read_csv('splittedDatasets/x_trainAds.csv')
    y_train = pd.read_csv('splittedDatasets/y_trainAds.csv')
    model = GaussianCopula()
    model.fit(pd.merge(x_train, y_train, left_index=True, right_index=True))
    newAdultIncomeData = model.sample(len(x_train) - 1)
    newAdultIncomeData['Purchased'].to_csv('generatedData/y_trainAdsGaussian.csv', index=False)
    del newAdultIncomeData['Purchased']
    newAdultIncomeData.to_csv('generatedData/x_trainAdsGaussian.csv', index=False)
    x_test = pd.read_csv('splittedDatasets/x_testAds.csv')
    y_test = pd.read_csv('splittedDatasets/y_testAds.csv')
    print('fitting and testing RF using the new generated data')
    cf = Classifier(x_train, y_train)
    y_prediction = predict(x_test, cf)
    confusion_m, accuracy, report = evaluation(y_test, y_prediction)
    print(confusion_m)
    print("Accuracy:", accuracy)
    print("Precision (macro avg):", report['macro avg']['precision'])
    print("Recall (macro avg):", report['macro avg']['recall'])
    print("F1-score (macro avg):", report['macro avg']['f1-score'])
    printConfusionMatrix(confusion_m, 'AdsCopula', 'ads')
def write(state):
    if state.trained_model is not None:
        X_before_preprocess = state.X_before_preprocess
        target_name = state.y_before_preprocess
        df_X = X_before_preprocess.drop(target_name, axis=1)
        trained_model = state.trained_model
        min_value = X_before_preprocess[target_name].min()
        max_value = X_before_preprocess[target_name].max()
        mean_value = X_before_preprocess[target_name].mean()
        original_value = optimal_value = mean_value

        st.header("Knowledge Generation and Backward Analysis.")
        with st.beta_expander("Knowledge Generation"):
            st.markdown('<p style="color:#1386fc">Please Select a Method to Generate Data.</p>',
                        unsafe_allow_html=True)
            sdv_method = st.selectbox('Method to Generate Data',
                                      options=['GaussianCopula', 'CTGAN', 'CopulaGAN', 'TVAE'])
            sample = st.number_input('How Many Samples of Data to Generate?',
                                     min_value=1, value=df_X.shape[0], key=1)
            if sdv_method == 'GaussianCopula':
                model = GaussianCopula()
            else:
                is_tune = st.checkbox("Do You Want to Tune Hyperparameters?", value=False)
                if sdv_method == 'CopulaGAN' or sdv_method == 'CTGAN':
                    epochs = 300
                    batch_size = 500
                    log_frequency = True
                    embedding_dim = 128
                    generator_dim = (256, 256)
                    discriminator_dim = (256, 256)
                    generator_lr = 0.0002
                    generator_decay = 1e-6
                    discriminator_lr = 0.0002
                    discriminator_decay = 1e-6
                    discriminator_steps = 1
                    if is_tune:
                        epochs = st.number_input("Number of Training Epochs (int)",
                                                 min_value=1, value=300, key=1)
                        batch_size = st.number_input(
                            "Number of Data Samples to Process, should be a multiple of 10 (int)",
                            min_value=1, value=500, key=1)
                        log_frequency = st.checkbox('Whether to Use Log Frequency', value=True)
                        embedding_dim = st.number_input(
                            "Size of the Random Sample Passed to the Generator (int)",
                            min_value=1, value=128, key=1)
                        generator_dim = st.text_input(
                            "Size of the Generator Residual Layer (int)", value="256,256")
                        discriminator_dim = st.text_input(
                            "Size of the Discriminator Residual Layer (int)", value="256,256")
                        generator_lr = st.number_input(
                            "Learning Rate for the Generator",
                            min_value=0.0, value=0.0002, format="%e")
                        generator_decay = st.number_input(
                            "Generator Weight Decay for the Adam Optimizer",
                            min_value=0.0, value=1e-6, format="%e")
                        discriminator_lr = st.number_input(
                            "Learning Rate for the Discriminator",
                            min_value=0.0, value=0.0002, format="%e")
                        discriminator_decay = st.number_input(
                            "Discriminator Weight Decay for the Adam Optimizer",
                            min_value=0.0, value=1e-6, format="%e")
                        discriminator_steps = st.number_input(
                            "Number of Discriminator Updates to do for Each Generator Update (int)",
                            min_value=1, value=1)
                        generator_dim = convert_str_to_list(generator_dim)
                        discriminator_dim = convert_str_to_list(discriminator_dim)
                    if sdv_method == 'CopulaGAN':
                        model = CopulaGAN(epochs=epochs,
                                          batch_size=batch_size,
                                          log_frequency=log_frequency,
                                          embedding_dim=embedding_dim,
                                          generator_dim=generator_dim,
                                          discriminator_dim=discriminator_dim,
                                          generator_lr=generator_lr,
                                          generator_decay=generator_decay,
                                          discriminator_lr=discriminator_lr,
                                          discriminator_decay=discriminator_decay,
                                          discriminator_steps=discriminator_steps)
                    if sdv_method == 'CTGAN':
                        model = CTGAN(epochs=epochs,
                                      batch_size=batch_size,
                                      log_frequency=log_frequency,
                                      embedding_dim=embedding_dim,
                                      generator_dim=generator_dim,
                                      discriminator_dim=discriminator_dim,
                                      generator_lr=generator_lr,
                                      generator_decay=generator_decay,
                                      discriminator_lr=discriminator_lr,
                                      discriminator_decay=discriminator_decay,
                                      discriminator_steps=discriminator_steps)
                else:
                    compress_dims = decompress_dims = (128, 128)
                    epochs = 300
                    batch_size = 500
                    embedding_dim = 128
                    l2_scale = 1e-5
                    if is_tune:
                        epochs = st.number_input("Number of Training Epochs (int)",
                                                 min_value=1, value=300, key=2)
                        batch_size = st.number_input(
                            "Number of Data Samples to Process, should be a multiple of 10 (int)",
                            min_value=1, value=500, key=2)
                        embedding_dim = st.number_input(
                            "Size of the Random Sample Passed to the Generator (int)",
                            min_value=1, value=128, key=2)
                        compress_dims = st.text_input(
                            "Size of Each Hidden Layer in the Encoder (int)", value="128,128")
                        decompress_dims = st.text_input(
                            "Size of Each Hidden Layer in the Decoder (int)", value="128,128")
                        l2_scale = st.number_input("Regularization term",
                                                   min_value=0.0, value=1e-5, format="%e")
                        compress_dims = convert_str_to_list(compress_dims)
                        decompress_dims = convert_str_to_list(decompress_dims)
                    model = TVAE(embedding_dim=embedding_dim,
                                 compress_dims=compress_dims,
                                 decompress_dims=decompress_dims,
                                 l2scale=l2_scale,
                                 batch_size=batch_size,
                                 epochs=epochs)

            button_generate = st.button("Generate")
            if button_generate:
                with st.spinner("Generating..."):
                    model.fit(df_X)
                    new_data = model.sample(sample)
                    new_data_prediction = predict_model(trained_model, new_data)
                    st.write(new_data_prediction)
                    state.new_data_prediction = new_data_prediction

            button_download = st.button("Download Generated Data")
            if button_download:
                file_extension = st.selectbox("Choose Csv or Excel File to Download",
                                              options=[".csv", ".xlsx"])
                file_name = st.text_input("File Name", value="prediction", key=1)
                if file_name:
                    href = download_button(state.new_data_prediction, file_name,
                                           "Download", file_extension)
                    st.markdown(href, unsafe_allow_html=True)
                else:
                    st.error("File Name cannot be empty!")

        st.markdown("---")
        with st.beta_expander("Backward Analysis"):
            col1, col2 = st.beta_columns(2)
            with col1:
                st.subheader("Please Select a Index for Data to Optimize")
                index = st.number_input("Index of Data", min_value=0, value=0,
                                        max_value=df_X.shape[0] - 1, key=1)
                st.write(X_before_preprocess.iloc[index])
                original_value = X_before_preprocess.iloc[index].loc[target_name]
                # st.write(original_value)
            with col2:
                st.subheader("Optimize")
                lower_bound = st.number_input("The Lower Bound Value to Optimize", value=min_value)
                upper_bound = st.number_input("The Upper Bound Value to Optimize", value=max_value)
                button_optimize = st.button("Optimizer")
                if button_optimize:
                    if state.new_data_prediction is not None:
                        new_prediction = state.new_data_prediction['Label']
                        indices = find_top_5_nearest(new_prediction, original_value)
                        optimal_value = new_prediction[indices[0]]
                        state.suggest_indices = indices
                        state.optimal_value = optimal_value
                    else:
                        st.error("Please Generate New Data first!")

            with st.beta_container():
                state.optimal_value = state.optimal_value if state.optimal_value is not None else 0
                fig = gauge_plot(original_value, state.optimal_value, lower_bound,
                                 upper_bound, min_value, max_value)
                st.plotly_chart(fig)
                button_suggest = st.button("Show the Top 5 Suggestions")
                if button_suggest:
                    suggestion = state.new_data_prediction.iloc[state.suggest_indices[:5]]
                    st.table(suggestion)
    else:
        st.error("Please Train a Model first!")
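The helper `convert_str_to_list` used above is defined elsewhere in that project. Judging from text inputs such as "256,256" being fed into `generator_dim`, it presumably parses a comma-separated string into a tuple of ints; a minimal sketch under that assumption:

def convert_str_to_list(text):
    # Parse a comma-separated string such as "256,256" into a tuple of ints,
    # e.g. the layer sizes expected by CTGAN's generator_dim/discriminator_dim.
    return tuple(int(part.strip()) for part in text.split(',') if part.strip())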
class TabularPreset():
    """Class for all tabular model presets.

    Args:
        name (str):
            The preset to use.
        metadata (dict or metadata.Table):
            Table metadata instance or dict representation.
        constraints (list[Constraint, dict]):
            List of Constraint objects or dicts.
    """

    _model = None
    _null_percentages = None
    _null_column = False
    _default_model = GaussianCopula

    def __init__(self, name=None, metadata=None, constraints=None):
        if name is None:
            raise ValueError(
                'You must provide the name of a preset using the `name` '
                'parameter. Use `TabularPreset.list_available_presets()` to browse '
                'through the options.')
        if name not in PRESETS:
            raise ValueError(f'`name` must be one of {PRESETS}.')

        self.name = name

        if metadata is None:
            warnings.warn(
                'No metadata provided. Metadata will be automatically '
                'detected from your data. This process may not be accurate. '
                'We recommend writing metadata to ensure correct data handling.')

        if metadata is not None and isinstance(metadata, Table):
            metadata = metadata.to_dict()

        if metadata is not None and constraints is not None:
            metadata['constraints'] = []
            for constraint in constraints:
                metadata['constraints'].append(constraint.to_dict())

            constraints = None

        if name == FAST_ML_PRESET:
            self._model = GaussianCopula(
                table_metadata=metadata,
                constraints=constraints,
                categorical_transformer='categorical_fuzzy',
                default_distribution='gaussian',
                rounding=None,
            )

            # Decide if transformers should model the null column or not.
            self._null_column = constraints is not None
            if metadata is not None:
                self._null_column = len(metadata.get('constraints', [])) > 0

            # If transformers should model the null column, pass None to let each
            # transformer decide if it's necessary or not.
            transformer_null_column = None if self._null_column else False

            dtype_transformers = {
                'i': rdt.transformers.NumericalTransformer(
                    dtype=np.int64,
                    nan='mean' if self._null_column else None,
                    null_column=transformer_null_column,
                    min_value='auto',
                    max_value='auto',
                ),
                'f': rdt.transformers.NumericalTransformer(
                    dtype=np.float64,
                    nan='mean' if self._null_column else None,
                    null_column=transformer_null_column,
                    min_value='auto',
                    max_value='auto',
                ),
                'O': rdt.transformers.CategoricalTransformer(fuzzy=True),
                'b': rdt.transformers.BooleanTransformer(
                    nan=-1 if self._null_column else None,
                    null_column=transformer_null_column,
                ),
                'M': rdt.transformers.DatetimeTransformer(
                    nan='mean' if self._null_column else None,
                    null_column=transformer_null_column,
                ),
            }

            self._model._metadata._dtype_transformers.update(dtype_transformers)

    def fit(self, data):
        """Fit this model to the data.

        Args:
            data (pandas.DataFrame):
                Data to fit the model to.
        """
        if not self._null_column:
            self._null_percentages = {}

            for column, column_data in data.iteritems():
                num_nulls = column_data.isna().sum()
                if num_nulls > 0:
                    # Store null percentage for future reference.
                    self._null_percentages[column] = num_nulls / len(column_data)

        self._model.fit(data)

    def _postprocess_sampled(self, sampled):
        """Postprocess the sampled data.

        Add null values back based on null percentages captured in the fitting process.

        Args:
            sampled (pandas.DataFrame):
                The sampled data to postprocess.

        Returns:
            pandas.DataFrame
        """
        if self._null_percentages:
            for column, percentage in self._null_percentages.items():
                sampled[column] = sampled[column].mask(
                    np.random.random((len(sampled),)) < percentage)

        return sampled

    def sample(self, num_rows, randomize_samples=True, batch_size=None,
               output_file_path=None, conditions=None):
        """Sample rows from this table.

        Args:
            num_rows (int):
                Number of rows to sample. This parameter is required.
            randomize_samples (bool):
                Whether or not to use a fixed seed when sampling. Defaults to True.
            batch_size (int or None):
                The batch size to sample. Defaults to `num_rows`, if None.
            output_file_path (str or None):
                The file to periodically write sampled rows to. If None, does not
                write rows anywhere.
            conditions:
                Deprecated argument. Use the `sample_conditions` method with
                `sdv.sampling.Condition` objects instead.

        Returns:
            pandas.DataFrame:
                Sampled data.
        """
        sampled = self._model.sample(num_rows, randomize_samples, batch_size,
                                     output_file_path, conditions)

        return self._postprocess_sampled(sampled)

    def sample_conditions(self, conditions, max_tries=100, batch_size_per_try=None,
                          randomize_samples=True, output_file_path=None):
        """Sample rows from this table with the given conditions.

        Args:
            conditions (list[sdv.sampling.Condition]):
                A list of sdv.sampling.Condition objects, which specify the column
                values in a condition, along with the number of rows for that condition.
            max_tries (int):
                Number of times to try sampling discarded rows. Defaults to 100.
            batch_size_per_try (int):
                The batch size to use per attempt at sampling. Defaults to 10 times
                the number of rows.
            randomize_samples (bool):
                Whether or not to use a fixed seed when sampling. Defaults to True.
            output_file_path (str or None):
                The file to periodically write sampled rows to. Defaults to a
                temporary file, if None.

        Returns:
            pandas.DataFrame:
                Sampled data.
        """
        if isinstance(self._model, GaussianCopula):
            sampled = self._model.sample_conditions(
                conditions,
                batch_size=batch_size_per_try,
                randomize_samples=randomize_samples,
                output_file_path=output_file_path,
            )
        else:
            sampled = self._model.sample_conditions(
                conditions, max_tries, batch_size_per_try, randomize_samples,
                output_file_path)

        return self._postprocess_sampled(sampled)

    def sample_remaining_columns(self, known_columns, max_tries=100,
                                 batch_size_per_try=None, randomize_samples=True,
                                 output_file_path=None):
        """Sample rows from this table.

        Args:
            known_columns (pandas.DataFrame):
                A pandas.DataFrame with the columns that are already known. The
                output is a DataFrame such that each row in the output is sampled
                conditionally on the corresponding row in the input.
            max_tries (int):
                Number of times to try sampling discarded rows. Defaults to 100.
            batch_size_per_try (int):
                The batch size to use per attempt at sampling. Defaults to 10 times
                the number of rows.
            randomize_samples (bool):
                Whether or not to use a fixed seed when sampling. Defaults to True.
            output_file_path (str or None):
                The file to periodically write sampled rows to. Defaults to a
                temporary file, if None.

        Returns:
            pandas.DataFrame:
                Sampled data.
        """
        if isinstance(self._model, GaussianCopula):
            sampled = self._model.sample_remaining_columns(
                known_columns,
                batch_size=batch_size_per_try,
                randomize_samples=randomize_samples,
                output_file_path=output_file_path,
            )
        else:
            sampled = self._model.sample_remaining_columns(
                known_columns, max_tries, batch_size_per_try, randomize_samples,
                output_file_path)

        return self._postprocess_sampled(sampled)

    def save(self, path):
        """Save this model instance to the given path using pickle.

        Args:
            path (str):
                Path where the SDV instance will be serialized.
        """
        self._package_versions = get_package_versions(getattr(self, '_model', None))

        with open(path, 'wb') as output:
            pickle.dump(self, output)

    @classmethod
    def load(cls, path):
        """Load a TabularModel instance from a given path.

        Args:
            path (str):
                Path from which to load the instance.

        Returns:
            TabularModel:
                The loaded tabular model.
        """
        with open(path, 'rb') as f:
            model = pickle.load(f)
            throw_version_mismatch_warning(getattr(model, '_package_versions', None))

            return model

    @classmethod
    def list_available_presets(cls, out=sys.stdout):
        """List the available presets and their descriptions."""
        out.write(
            f'Available presets:\n{PRESETS}\n\n'
            'Supply the desired preset using the `name` parameter.\n\n'
            'Have any requests for custom presets? Contact the SDV team to learn '
            'more about an SDV Premium license.\n')

    def __repr__(self):
        """Represent tabular preset instance as text.

        Returns:
            str
        """
        return f'TabularPreset(name={self.name})'
#%%
from sdv import Metadata
from sdv.relational import HMA1
from sdv.tabular import GaussianCopula
from sdv.timeseries import PAR

import numpy as np
import pandas as pd

from base_data import *

data = create_transactions(50)

base_model = GaussianCopula()
base_model.fit(data)
transactions = base_model.sample(1000)
transactions.to_csv('Transactions.csv', index=False)

# Products
products = pd.DataFrame(transactions['Product id'].drop_duplicates())
products['Product group'] = products['Product id'].apply(lambda x: x.split('-')[0])
products['Product cost'] = [
    round(x, 3)
    for x in np.random.gamma(shape=10, scale=5, size=products.shape[0])
]
products['Product inventory unit'] = 'st'
products['Product available in stock'] = [
    round(max(0, x))
    for x in np.random.triangular(left=-50, mode=50, right=120, size=products.shape[0])
]
products.to_csv('Products.csv', index=False)
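`create_transactions` comes from the project's own `base_data` module and is not shown here. A hypothetical stand-in with the same call shape, assuming the script above only relies on a DataFrame whose 'Product id' values look like '<group>-<number>', could be sketched as:

def create_transactions(n_rows):
    # Hypothetical stand-in for base_data.create_transactions: builds n_rows
    # transactions with 'Product id' values shaped like '<group>-<number>'.
    rng = np.random.default_rng(0)
    groups = ['A', 'B', 'C']
    product_ids = [
        f'{rng.choice(groups)}-{rng.integers(100, 999)}' for _ in range(n_rows)
    ]
    return pd.DataFrame({
        'Product id': product_ids,
        'Quantity': rng.integers(1, 10, size=n_rows),
    })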