def generate_dataset_in_random_mode(self, n, description_file, seed=0):
    """Fill ``self.synthetic_dataset`` with ``n`` rows sampled independently
    per attribute, using only the ranges/bins recorded in the description file.

    Args:
        n: Number of synthetic rows to generate.
        description_file: Path to the JSON dataset description produced earlier.
        seed: Random seed for reproducibility.
    """
    set_random_seed(seed)
    description = read_json_file(description_file)
    self.synthetic_dataset = pd.DataFrame()
    for attr, attr_description in description['attribute_description'].items():
        datatype = attr_description['datatype']
        is_categorical = attr_description['is_categorical']
        if is_categorical:
            # Uniform draw over the observed category bins.
            self.synthetic_dataset[attr] = np.random.choice(
                attr_description['distribution_bins'], n)
        elif datatype == 'string':
            # BUG FIX: the original drew a single scalar length and assigned it
            # to the column. Assigning a scalar to a column of a still-empty
            # DataFrame yields an empty column (a string attribute listed first
            # produced zero rows), and otherwise every row shared one length.
            # Draw one length per row instead; upper bound is inclusive so the
            # observed maximum length can actually occur.
            lengths = np.random.randint(attr_description['min'],
                                        attr_description['max'] + 1, n)
            self.synthetic_dataset[attr] = [generate_random_string(length)
                                            for length in lengths]
        else:
            minimum, maximum = attr_description['min'], attr_description['max']
            if datatype == 'int':
                # randint's high bound is exclusive, hence maximum + 1.
                self.synthetic_dataset[attr] = np.random.randint(
                    minimum, maximum + 1, n)
            else:
                self.synthetic_dataset[attr] = np.random.uniform(
                    minimum, maximum, n)
def sample_from_encoded_dataset(self):
    """Decode ``self.encoded_dataset`` into ``self.synthetic_dataset``.

    Each encoded cell holds a bin index; it is replaced by a value sampled
    uniformly from that bin via ``self.sample_uniformly_for_attribute``.
    Integer columns are cast back to int, and non-categorical string columns
    are turned from sampled lengths into random strings of those lengths.
    Finally columns are reordered to match the description's attribute list.
    """
    self.synthetic_dataset = self.encoded_dataset.copy()
    for attribute in self.synthetic_dataset:
        datatype = self.description['attribute_description'][attribute][
            'datatype']
        not_categorical = not self.description['attribute_description'][
            attribute]['is_categorical']
        # Replace every bin index with a value drawn uniformly from that bin.
        self.synthetic_dataset[attribute] = self.synthetic_dataset[
            attribute].apply(lambda x: self.sample_uniformly_for_attribute(
            attribute, int(x)))
        if datatype == 'integer':
            # NOTE: assigning the null-filtered Series back relies on pandas
            # index alignment — rows that were null stay NaN in the result.
            self.synthetic_dataset[attribute] = self.synthetic_dataset[
                ~self.synthetic_dataset[attribute].isnull(
                )][attribute].astype(int)
        elif datatype == 'string' and not_categorical:
            # For non-categorical strings the sampled value is a length;
            # materialize a random string of that length (nulls stay NaN,
            # again via index alignment).
            self.synthetic_dataset[attribute] = self.synthetic_dataset[
                ~self.synthetic_dataset[attribute].isnull(
                )][attribute].map(lambda x: generate_random_string(int(x)))
    # Keep only attributes present in the frame, in the description's order.
    sorted_attributes = [
        attr for attr in self.description['meta']['attribute_list']
        if attr in self.synthetic_dataset
    ]
    self.synthetic_dataset = self.synthetic_dataset.loc[:, sorted_attributes]
def sample_values_from_binning_indices(self, binning_indices):
    """Sample concrete values for the given binning indices.

    Delegates to the parent implementation; for non-categorical attributes
    each sampled (numeric) entry is interpreted as a length and replaced by
    a random string of that length. Null entries are left untouched.
    """
    sampled = super().sample_values_from_binning_indices(binning_indices)
    if self.is_categorical:
        return sampled
    non_null = ~sampled.isnull()
    sampled[non_null] = sampled[non_null].apply(
        lambda length: utils.generate_random_string(int(length)))
    return sampled
def generate_dataset_in_random_mode(self, n, description_file, seed=0, minimum=0, maximum=100):
    """Fill ``self.synthetic_dataset`` with ``n`` independently sampled rows.

    Args:
        n: Number of synthetic rows to generate.
        description_file: Path to the JSON dataset description.
        seed: Random seed for reproducibility.
        minimum: Lower bound used for *numeric* attributes in random mode.
        maximum: Upper bound (inclusive for integers) for numeric attributes.
    """
    set_random_seed(seed)
    description = read_json_file(description_file)
    self.synthetic_dataset = DataFrame()
    for attr, attr_info in description['attribute_description'].items():
        datatype = attr_info['data_type']
        is_categorical = attr_info['is_categorical']
        is_candidate_key = attr_info['is_candidate_key']
        if is_candidate_key:
            # Candidate keys must be unique; delegate to the attribute object.
            self.synthetic_dataset[attr] = parse_json(
                attr_info).generate_values_as_candidate_key(n)
        elif is_categorical:
            self.synthetic_dataset[attr] = random.choice(
                attr_info['distribution_bins'], n)
        elif datatype == 'String':
            # BUG FIX: the original drew a single scalar length and assigned it
            # to the column. Assigning a scalar to a column of a still-empty
            # DataFrame yields an empty column (a string attribute listed first
            # produced zero rows), and otherwise every row shared one length.
            # Draw one length per row instead.
            lengths = random.randint(attr_info['min'], attr_info['max'] + 1, n)
            self.synthetic_dataset[attr] = [generate_random_string(length)
                                            for length in lengths]
        else:
            # Numeric attributes are sampled from the caller-supplied
            # [minimum, maximum] range, not the attribute's observed range.
            if datatype == 'Integer':
                self.synthetic_dataset[attr] = random.randint(
                    minimum, maximum + 1, n)
            else:
                self.synthetic_dataset[attr] = random.uniform(
                    minimum, maximum, n)
def generate_values_as_candidate_key(self, n):
    """Generate ``n`` distinct candidate-key values.

    A random prefix length is drawn once from ``[self.min, self.max)``; each
    key is then a fresh random string of that length with the row index
    (0..n-1) appended, which guarantees uniqueness. Returns a numpy array.
    """
    prefix_length = np.random.randint(self.min, self.max)
    make_key = np.vectorize(
        lambda index: '{}{}'.format(
            utils.generate_random_string(prefix_length), index))
    return make_key(np.arange(n))