def describe_dataset_in_independent_attribute_mode(
        self, dataset_file, epsilon=0.1, attribute_to_datatype=None,
        attribute_to_is_categorical=None, attribute_to_is_candidate_key=None,
        seed=0):
    """Profile *dataset_file* treating every attribute as independent.

    Reads the CSV, applies user-supplied type/category/key overrides,
    infers the remaining data types and domains, injects Laplace noise
    into each attribute's distribution (budget *epsilon*), and records
    the per-attribute summaries under
    ``self.dataset_description['attribute_description']``.

    Parameters
    ----------
    dataset_file : path to the CSV file to describe.
    epsilon : differential-privacy budget for the Laplace noise.
    attribute_to_datatype : optional mapping attribute -> DataType value.
    attribute_to_is_categorical : optional mapping attribute -> bool.
    attribute_to_is_candidate_key : optional mapping attribute -> bool.
    seed : random seed for reproducibility.
    """
    # FIX: the original signature used `{}` defaults — a shared mutable
    # default argument. Use None sentinels instead (backward compatible).
    attribute_to_datatype = attribute_to_datatype or {}
    attribute_to_is_categorical = attribute_to_is_categorical or {}
    attribute_to_is_candidate_key = attribute_to_is_candidate_key or {}

    utils.set_random_seed(seed)
    self.attribute_to_datatype = {
        attr: DataType(data_type)
        for attr, data_type in attribute_to_datatype.items()
    }
    self.attribute_to_is_categorical = dict(attribute_to_is_categorical)
    self.attribute_to_is_candidate_key = dict(attribute_to_is_candidate_key)

    self.read_dataset_from_csv(dataset_file)
    self.infer_attribute_data_types()
    self.get_dataset_meta_info()
    self.convert_input_dataset_into_a_dict_of_columns()
    self.infer_domains()
    self.inject_laplace_noise_into_distribution_per_attribute(epsilon)

    # record attribute information in json format
    self.dataset_description['attribute_description'] = {}
    for attr, column in self.input_dataset_as_column_dict.items():
        assert isinstance(column, AbstractAttribute)
        self.dataset_description['attribute_description'][attr] = \
            column.to_json()
def generate_dataset_in_random_mode(self, n, description_file, seed=0):
    """Generate *n* synthetic rows ignoring all attribute correlations.

    Each attribute is sampled independently from the description:
    categorical attributes draw uniformly from their distribution bins,
    string attributes get random strings of random length, and numeric
    attributes draw uniformly from [min, max].

    Parameters
    ----------
    n : number of rows to generate.
    description_file : path to the JSON dataset description.
    seed : random seed for reproducibility.
    """
    set_random_seed(seed)
    description = read_json_file(description_file)

    self.synthetic_dataset = pd.DataFrame()
    for attr in description['attribute_description'].keys():
        attr_description = description['attribute_description'][attr]
        datatype = attr_description['datatype']
        is_categorical = attr_description['is_categorical']
        if is_categorical:
            self.synthetic_dataset[attr] = np.random.choice(
                attr_description['distribution_bins'], n)
        elif datatype == 'string':
            # FIX: the original drew ONE scalar length and assigned it to
            # the column — on an empty DataFrame a scalar assignment
            # produces a zero-row column (breaking later columns), and
            # every string shared the same length. Draw a length per row.
            # (high is kept exclusive, matching the original randint call)
            lengths = np.random.randint(attr_description['min'],
                                        attr_description['max'], n)
            self.synthetic_dataset[attr] = [
                generate_random_string(length) for length in lengths]
        else:
            minimum = attr_description['min']
            maximum = attr_description['max']
            if datatype == 'int':
                # numpy randint excludes the high bound, hence +1.
                self.synthetic_dataset[attr] = np.random.randint(
                    minimum, maximum + 1, n)
            else:
                self.synthetic_dataset[attr] = np.random.uniform(
                    minimum, maximum, n)
def generate_dataset_in_correlated_attribute_mode(self, n, description_file, seed=0):
    """Generate *n* synthetic rows honoring the learned correlations.

    Attributes covered by the encoded (Bayesian-network) dataset are
    decoded from their binning indices; candidate keys get unique
    values; all remaining attributes fall back to independent sampling.

    Parameters
    ----------
    n : number of rows to generate.
    description_file : path to the JSON dataset description.
    seed : random seed for reproducibility.
    """
    set_random_seed(seed)
    self.n = n
    self.description = read_json_file(description_file)

    meta = self.description['meta']
    all_attributes = meta['all_attributes']
    candidate_keys = set(meta['candidate_keys'])

    self.encoded_dataset = DataGenerator.generate_encoded_dataset(
        self.n, self.description)
    self.synthetic_dataset = DataFrame(columns=all_attributes)

    for attr in all_attributes:
        column = parse_json(self.description['attribute_description'][attr])
        if attr in self.encoded_dataset:
            values = column.sample_values_from_binning_indices(
                self.encoded_dataset[attr])
        elif attr in candidate_keys:
            values = column.generate_values_as_candidate_key(n)
        else:
            # Attributes absent from both the BN and the candidate keys
            # are sampled in independent attribute mode.
            indices = column.sample_binning_indices_in_independent_attribute_mode(n)
            values = column.sample_values_from_binning_indices(indices)
        self.synthetic_dataset[attr] = values
def generate_dataset_in_random_mode(self, n, description_file, seed=0,
                                    minimum=0, maximum=100):
    """Generate *n* synthetic rows ignoring all attribute correlations.

    Candidate keys get unique values, categorical attributes draw
    uniformly from their bins, string attributes get random strings of
    random per-attribute length, and numeric attributes draw uniformly
    from [minimum, maximum].

    Parameters
    ----------
    n : number of rows to generate.
    description_file : path to the JSON dataset description.
    seed : random seed for reproducibility.
    minimum, maximum : inclusive range for numeric attributes.
    """
    set_random_seed(seed)
    description = read_json_file(description_file)

    self.synthetic_dataset = DataFrame()
    for attr in description['attribute_description'].keys():
        attr_info = description['attribute_description'][attr]
        datatype = attr_info['data_type']
        is_categorical = attr_info['is_categorical']
        is_candidate_key = attr_info['is_candidate_key']
        if is_candidate_key:
            self.synthetic_dataset[attr] = parse_json(
                attr_info).generate_values_as_candidate_key(n)
        elif is_categorical:
            self.synthetic_dataset[attr] = random.choice(
                attr_info['distribution_bins'], n)
        elif datatype == 'String':
            # FIX: the original drew ONE scalar length and assigned it to
            # the column — on an empty DataFrame a scalar assignment
            # produces a zero-row column (breaking later columns), and
            # every string shared the same length. Draw a length per row
            # instead; +1 keeps the original inclusive upper bound.
            lengths = random.randint(attr_info['min'],
                                     attr_info['max'] + 1, n)
            self.synthetic_dataset[attr] = [
                generate_random_string(length) for length in lengths]
        else:
            if datatype == 'Integer':
                # numpy-style randint excludes the high bound, hence +1.
                self.synthetic_dataset[attr] = random.randint(
                    minimum, maximum + 1, n)
            else:
                self.synthetic_dataset[attr] = random.uniform(
                    minimum, maximum, n)
def generate_dataset_in_correlated_attribute_mode(self, n, description_file, seed=0):
    """Generate *n* rows by sampling the encoded (Bayesian-network) dataset.

    Parameters
    ----------
    n : number of rows to generate.
    description_file : path to the JSON dataset description.
    seed : random seed for reproducibility.
    """
    set_random_seed(seed)
    self.n = n
    self.description = read_json_file(description_file)
    # Encode first, then decode bin indices back into attribute values.
    self.encoded_dataset = DataGenerator.generate_encoded_dataset(
        self.n, self.description)
    self.sample_from_encoded_dataset()
def describe_dataset_in_independent_attribute_mode(self, dataset_file,
                                                   epsilon=0.1,
                                                   attribute_to_datatype=None,
                                                   attribute_to_is_categorical=None,
                                                   seed=0):
    """Profile *dataset_file* treating every attribute as independent.

    Reads the CSV, applies user-supplied type/category overrides, infers
    the remaining types and domains, and injects Laplace noise (budget
    *epsilon*) into each attribute's distribution.

    Parameters
    ----------
    dataset_file : path to the CSV file to describe.
    epsilon : differential-privacy budget for the Laplace noise.
    attribute_to_datatype : optional mapping attribute -> data type.
    attribute_to_is_categorical : optional mapping attribute -> bool.
    seed : random seed for reproducibility.
    """
    # FIX: the original signature used `{}` defaults — a shared mutable
    # default argument. Use None sentinels instead (backward compatible).
    self.attribute_to_datatype = dict(attribute_to_datatype or {})
    self.attribute_to_is_categorical = dict(attribute_to_is_categorical or {})

    utils.set_random_seed(seed)
    self.read_dataset_from_csv(dataset_file)
    self.get_dataset_meta_info()
    self.infer_attribute_datatypes()
    self.infer_domains()
    self.inject_laplace_noise_into_distribution_per_attribute(epsilon)
def describe_dataset_in_random_mode(
        self, dataset_file: str,
        attribute_to_datatype: Dict[str, DataType] = None,
        attribute_to_is_categorical: Dict[str, bool] = None,
        attribute_to_is_candidate_key: Dict[str, bool] = None,
        categorical_attribute_domain_file: str = None,
        numerical_attribute_ranges: Dict[str, List] = None,
        seed=0):
    """Profile *dataset_file* for random-mode generation.

    Reads the CSV, applies the user-supplied overrides, infers each
    column's domain (optionally constrained by an explicit categorical
    domain file or numerical ranges), and records the per-attribute
    summaries under ``self.data_description['attribute_description']``.

    Parameters
    ----------
    dataset_file : path to the CSV file to describe.
    attribute_to_datatype : optional mapping attribute -> DataType.
    attribute_to_is_categorical : optional mapping attribute -> bool.
    attribute_to_is_candidate_key : optional mapping attribute -> bool.
    categorical_attribute_domain_file : optional JSON file mapping
        categorical attributes to their full domains.
    numerical_attribute_ranges : optional mapping attribute -> [min, max].
    seed : random seed for reproducibility.
    """
    # Normalize the optional mapping arguments (None -> empty dict).
    attribute_to_datatype = attribute_to_datatype or {}
    attribute_to_is_categorical = attribute_to_is_categorical or {}
    attribute_to_is_candidate_key = attribute_to_is_candidate_key or {}
    numerical_attribute_ranges = numerical_attribute_ranges or {}
    categorical_attribute_to_domain = (
        utils.read_json_file(categorical_attribute_domain_file)
        if categorical_attribute_domain_file else {})

    utils.set_random_seed(seed)
    self.attr_to_datatype = {
        attr: DataType(datatype)
        for attr, datatype in attribute_to_datatype.items()
    }
    self.attr_to_is_categorical = attribute_to_is_categorical
    self.attr_to_is_candidate_key = attribute_to_is_candidate_key

    self.read_dataset_from_csv(dataset_file)
    self.infer_attribute_data_types()
    self.analyze_dataset_meta()
    self.represent_input_dataset_by_columns()

    for column in self.attr_to_column.values():
        name = column.name
        if name in categorical_attribute_to_domain:
            column.infer_domain(
                categorical_domain=categorical_attribute_to_domain[name])
        elif name in numerical_attribute_ranges:
            column.infer_domain(
                numerical_range=numerical_attribute_ranges[name])
        else:
            column.infer_domain()

    # record attribute information in json format
    self.data_description['attribute_description'] = {
        attr: column.to_json()
        for attr, column in self.attr_to_column.items()
    }
def generate_dataset_in_independent_mode(self, n, description_file, seed=0):
    """Generate *n* rows sampling every attribute independently.

    Candidate keys get unique values; every other attribute is sampled
    from its own distribution, ignoring inter-attribute correlations.

    Parameters
    ----------
    n : number of rows to generate.
    description_file : path to the JSON dataset description.
    seed : random seed for reproducibility.
    """
    set_random_seed(seed)
    self.description = read_json_file(description_file)

    meta = self.description['meta']
    attributes = meta['all_attributes']
    candidate_keys = set(meta['candidate_keys'])

    self.synthetic_dataset = DataFrame(columns=attributes)
    for attr in attributes:
        column = parse_json(self.description['attribute_description'][attr])
        if attr in candidate_keys:
            values = column.generate_values_as_candidate_key(n)
        else:
            indices = column.sample_binning_indices_in_independent_attribute_mode(n)
            values = column.sample_values_from_binning_indices(indices)
        self.synthetic_dataset[attr] = values
def generate_dataset_in_correlated_attribute_mode(self, n, description_file, seed=0):
    """Generate *n* rows by sampling the encoded (Bayesian-network) dataset.

    Parameters
    ----------
    n : number of rows to generate.
    description_file : path to the JSON dataset description.
    seed : random seed for reproducibility.
    """
    set_random_seed(seed)
    self.n = n
    self.description = read_json_file(description_file)
    self.encoded_dataset = DataGenerator.generate_encoded_dataset(
        self.n, self.description)
    # NOTE(review): a dead commented-out block that filled BN-ignored
    # attributes in independent mode was removed here; decoding of every
    # attribute is handled by sample_from_encoded_dataset.
    self.sample_from_encoded_dataset()
def generate_dataset_in_independent_mode(self, n, description_file, seed=0):
    """Generate *n* rows by sampling each attribute's bins independently.

    Builds an encoded dataset of per-attribute bin indices drawn from
    each attribute's marginal distribution, then decodes it into values.

    Parameters
    ----------
    n : number of rows to generate.
    description_file : path to the JSON dataset description.
    seed : random seed for reproducibility.
    """
    set_random_seed(seed)
    self.description = read_json_file(description_file)
    attributes = self.description['meta']['attribute_list']

    self.encoded_dataset = pd.DataFrame(columns=attributes,
                                        index=list(range(n)))
    for attr in attributes:
        attr_info = self.description['attribute_description'][attr]
        num_bins = len(attr_info['distribution_bins'])
        probabilities = attr_info['distribution_probabilities']
        # Draw a bin index per row according to the stored marginal.
        self.encoded_dataset[attr] = np.random.choice(
            list(range(num_bins)), size=n, p=probabilities)

    self.sample_from_encoded_dataset()