def _preprocessing(self, data):
    """Fit a distribution per column and map the data through each cdf.

    For every column of ``data`` a ``utils.Distribution`` is built with a
    ``'kde'`` summary, its ``cdf`` is collected, and that cdf is applied to
    each value of the column.

    Args:
        data: column-indexable table; iterating it yields column keys and
            ``data[col].values`` yields that column's values
            (presumably a pandas DataFrame -- confirm against callers).

    Returns:
        tuple: ``(cdfs, unis, ppfs)`` in that order, where

        * ``cdfs`` is the list of cdf callables, one per column,
        * ``unis`` is a ``(n_rows, n_columns)`` float array holding each
          column's values after applying its cdf,
        * ``ppfs`` is a list that is currently always empty.
    """
    cdfs = []
    # NOTE(review): ppfs is never populated here; callers receive an empty
    # list. Confirm whether ``dist.ppf`` was meant to be collected.
    ppfs = []
    unis = np.empty((data.shape[0], data.shape[1]))

    for index, col in enumerate(data):
        dist = utils.Distribution(
            column=data[col],
            summary={'name': 'kde', 'values': None},
        )
        cdf = dist.cdf
        cdfs.append(cdf)
        # The cdf is applied value by value; the results fill this column.
        unis[:, index] = [cdf(value) for value in data[col].values]

    return cdfs, unis, ppfs
def test_uniform_distribution(self):
    """When the values are uniformly distributed, the Distribution recognizes it."""
    distribution = utils.Distribution(column=np.linspace(-15, 15))

    assert distribution.name == 'uniform'

    # ppf results are floats: compare with a tolerance rather than exact
    # equality so rounding in the fitted parameters cannot break the test.
    # The sample is symmetric around 0, so the median is 0 and the 1% and
    # 99% quantiles mirror each other.
    assert np.isclose(distribution.ppf(0.5), 0)
    assert np.isclose(distribution.ppf(0.01), -distribution.ppf(0.99))
def test_categorical_distribution(self):
    """With categorical=True the Distribution reports itself as categorical."""
    observed = ['A', 'A', 'A', 'B', 'B', 'C', 'C', 'C', 'C', 'C']
    dist = utils.Distribution(column=observed, categorical=True)

    assert dist.name == 'categorical'

    # Category order pinned by this test.
    assert dist.cats == ['B', 'A', 'C']

    # Presumably the proportion of each category in ``cats`` order
    # (3/5 'B', 1/5 'A', 1/5 'C') -- confirm against estimate_args.
    new_values = ['B', 'B', 'B', 'A', 'C']
    estimated = dist.estimate_args(new_values)
    assert estimated == [0.6, 0.2, 0.2]
def test_simple_case(self):
    """Calling generate_samples with 2 as last argument gives a result of length 2."""
    # Setup: one numerical and two categorical distributions.
    numeric = utils.Distribution(column=np.linspace(-15, 15))
    letters = utils.Distribution(
        column=['A', 'A', 'A', 'B', 'B', 'C', 'C', 'C', 'C', 'C'],
        categorical=True,
    )
    coin = utils.Distribution(
        column=['T', 'T', 'T', 'H', 'H', 'H', 'H', 'H', 'H', 'H'],
        categorical=True,
    )
    covariance = np.array([
        [1, 0.2, 0.3],
        [0.2, 1, 0.5],
        [0.3, 0.5, 1],
    ])
    requested = 2

    # Run
    ppfs = [numeric.ppf, letters.ppf, coin.ppf]
    samples = utils.generate_samples(covariance, ppfs, requested)

    # Check
    assert len(samples) == requested
def test_nan_value_logic(self):
    """Exercise Distribution on numerical and categorical columns with NaNs.

    Asserts that the fitted args of the categorical distributions built
    from NaN-containing columns sum to 1. Everything else is only logged at
    DEBUG level for manual inspection.
    """
    # Quick tests
    d0 = utils.Distribution(column=np.linspace(-15, 15))
    d1 = utils.Distribution(
        column=['A', 'A', 'A', 'B', 'B', 'C', 'C', 'C', 'C', 'C'],
        categorical=True)
    d2 = utils.Distribution(
        column=['T', 'T', 'T', 'H', 'H', 'H', 'H', 'H', 'H', 'H'],
        categorical=True)
    cov = np.array([[1, 0.2, 0.3], [0.2, 1, 0.5], [0.3, 0.5, 1]])

    LOGGER.debug('\nGenerated Samples:')
    LOGGER.debug(utils.generate_samples(cov, [d0.ppf, d1.ppf, d2.ppf], 2))

    # Test the np.nan values stuff
    numerical = np.array([1.0, 2.1, 3.4, np.nan, 3.4, 5.6, np.nan])
    # Separate name instead of rebinding d2, which held a categorical
    # distribution above.
    d_numerical = utils.Distribution(column=np.array(numerical))
    # Logging treats extra positional arguments as %-format arguments, so
    # the message needs explicit placeholders.
    LOGGER.debug('%s %s', d_numerical.name, d_numerical.args)

    categorical_num = np.array([1, 2, 1, 1, 2, 1, 1, np.nan, 2, np.nan, 1])
    categorical_str = np.array(
        ['a', 'b', np.nan, 'a', np.nan, 'b'], dtype='object')

    LOGGER.debug('\nTesting NaN value logic')
    d3 = utils.Distribution(column=categorical_num, categorical=True)
    d4 = utils.Distribution(column=categorical_str, categorical=True)

    # Float sums: compare with a tolerance instead of exact equality.
    assert np.isclose(sum(d3.args), 1.0)
    assert np.isclose(sum(d4.args), 1.0)

    # Materialize the pairs; a bare zip object logs only its repr.
    LOGGER.debug(list(zip(d3.cats, d3.args)))
    LOGGER.debug(list(zip(d4.cats, d4.args)))

    all_nan = np.array([np.nan, np.nan, np.nan])
    LOGGER.debug(d_numerical.estimate_args(all_nan))
    LOGGER.debug(d3.estimate_args(np.array([np.nan, np.nan, np.nan])))
    LOGGER.debug(
        d4.estimate_args(np.array([np.nan, np.nan, np.nan], dtype='object')))

    # Without dtype='object' numpy builds a string array, so np.nan is
    # stored as the string 'nan' here.
    d5 = utils.Distribution(column=np.array(
        ['a', 'b', np.nan, 'a', 'b', np.nan]), categorical=True)
    LOGGER.debug(sum(d5.args))