示例#1
0
    def __init__(self, domain, jtreepy, _lambda=0.2):
        """Prepare the linear-programming search for a lower noise variance.

        param:
            domain: the domain of the given data
                (note) the column order must match the original data
            jtreepy: the junction-tree structure (list of cliques)
            _lambda: the balance number.

        TODO:
            1. Move jt_rep to Junction Tree Module
            2. Move Different Operator to a linear algebra package
        """
        self.LOG = Base.get_logger("CliqueMerge")
        self.domain = domain
        self.node_card = [len(card) for card in domain.values()]
        self._lambda = float(_lambda)
        self.max_iter = 20
        self.jtree = jtreepy
        self.nodes_num = len(self.node_card)
        clique_count = len(jtreepy)
        self.cliques_num = clique_count
        if clique_count >= 2:
            self.cnum = range(2, clique_count + 1)
        else:
            self.cnum = [1]
        # map every clique onto the indexes of the nodes it contains
        self.jtree_in_node_index = [
            self.find_subset_index(member) for member in jtreepy
        ]
示例#2
0
	def __init__(
		self,
		data=None,
		edges=None,
		noise_flag=True,
		white_list=None,
		eps1_val=c.EPSILON_1,
		cramer=0.2):
		"""Build the attribute dependency graph.

		Input:
			1. data: DataUtils.Data — when given, the dependency edges are
			   computed via self._run(); otherwise the precomputed `edges`
			   argument is stored as-is.
			2. edges: precomputed dependency edges (used only when data is None)
			3. noise_flag / eps1_val / cramer: stored unchanged for later use
			4. white_list: list of entries to keep; defaults to a fresh empty
			   list per instance (presumably edges that must be retained —
			   TODO confirm against callers)
		Procedure
			1. Convert the given data frame to dataframe in R
			2. Convert the given Domain(in python dict) to ListVector
			3. Instantiate the attributes dependency.
		"""
		self.LOG = Base.get_logger("DepGraph")
		self.noise_flag = noise_flag
		self.eps1_val = eps1_val
		self.cramer = cramer
		self.data = data
		if data is None:
			self.edges = edges
		else:
			self.edges = self._run()

		# BUG FIX: the original signature used a mutable default (white_list=[]),
		# which Python evaluates once — every instance constructed without an
		# explicit white_list would share (and mutate) the same list. Use None
		# as the sentinel and allocate a fresh list per instance.
		self.white_list = [] if white_list is None else white_list
    def __init__(self, sensitive_data):
        """ Import the original data and initialize the utility measurement object

		Parameters
		----------
		sensitive_data: string
			The path to the original data. 
		"""
        self.LOG = Base.get_logger("UserQuery")
        sensitive = DataUtils(file_path=sensitive_data)
        self.sensitive_df = sensitive.get_pandas_df()
	def __init__(self, sensitive_data):
		"""Initialize the utility-measurement object from the original data.

		Parameters
		----------
		sensitive_data: string
			The path to the original data.
		"""
		self.LOG = Base.get_logger("UserQuery")
		# load the file and keep only its pandas representation
		self.sensitive_df = DataUtils(file_path=sensitive_data).get_pandas_df()
	def __init__(
		self,
		file_path=None,
		selected_attrs=None,
		pandas_df=None,
		valbin_maps=None,
		names=None,
		specified_c_domain=None,
		chunk_size=-1,
		date_format=None
		):
		""" Loading data to wrapped python object

		Parameters
		----------
			file_path: string
				The path of original data
			selected_attrs: dict
				{
					"A":"C",
					"B":"D",
					...
				}
			pandas_df: Pandas dataframe
				Initialize with a pandas dataframe(TODO: deprecated)
			valbin_maps: dict
				A mapping of original values with coarse value
			names: list, experiment
				A list to specify the attributes' names when the input file has no header
			specified_c_domain: dict, experiment
				A mapping of continuous type attributes with the specified edges in coarse
			chunk_size: int
				When > 0 the data is loaded chunk by chunk; otherwise in one pass
			date_format: string
				Stored unchanged for later date handling (not used here)
		"""
		self.LOG = Base.get_logger("DataUtils")
		self.valbin_maps = dict() if valbin_maps is None else valbin_maps
		self.chunk_size = chunk_size
		if chunk_size > 0:
			self.dataframe = self._loading_chunk(file_path, pandas_df, names)
		else:
			self.dataframe = self._loading(file_path, pandas_df, names)

		if selected_attrs is not None:
			self.selected_attrs = selected_attrs
			# BUG FIX: indexing a DataFrame with a dict_keys view is deprecated
			# and rejected by modern pandas — pass an explicit list of labels.
			# The 'selected_attrs' dict is ordered, so column order is kept.
			self.dataframe = self.dataframe[list(selected_attrs)]

		self.preview_count = 5
		self.specified_c_domain = specified_c_domain
		self.date_format = date_format
    def __init__(self,
                 file_path=None,
                 selected_attrs=None,
                 pandas_df=None,
                 valbin_maps=None,
                 names=None,
                 specified_c_domain=None,
                 chunk_size=-1,
                 date_format=None):
        """ Loading data to warpped python object

		Parameters
		----------
			file_path: string
				The path of original data
			selected_attrs: dict
				{
					"A":"C",
					"B":"D",
					...
				}
			pandas_df: Pandas dataframe
				Initialize with a pandas dataframe(TODO: deprecated)
			valbin_maps: dict
				A mapping of original values with coarse value
			names: list, experiment
				A list to specifiy the attributes' names when the input file has no header
			specified_c_domain: dict, experiment
				A mapping of continuous type attributes with the specified edges in coarse
		"""
        self.LOG = Base.get_logger("DataUtils")
        self.valbin_maps = dict() if valbin_maps is None else valbin_maps
        self.chunk_size = chunk_size
        if chunk_size > 0:
            self.dataframe = self._loading_chunk(file_path, pandas_df, names)
        else:
            self.dataframe = self._loading(file_path, pandas_df, names)

        if selected_attrs is not None:
            self.selected_attrs = selected_attrs
            # the 'selected_attrs' is ordered
            self.dataframe = self.dataframe[selected_attrs.keys()]

        self.preview_count = 5
        self.specified_c_domain = specified_c_domain
        self.date_format = date_format
示例#7
0
	def __init__(self,
		data,
		jtree_path,
		domain,
		cluster,
		histogramdds,
		epsilon = 0.0
	):
		"""Set up the inference step.

		TODO: 1. refactor — data_path, edges, nodes and domain are only
				temporarily held here.
		param
			data: the pandas dataframe wrapper
			TODO: DPTable constructs many attributes while reading data; to use
				a memory cache its inference step should be refactored.

		param
			domain: data information with format in dictionary

			{
				"A":[1,2,3,4,5],
				"B":[2,3,4,5,6]
			}
		param
			cluster: the merged cluster structure
		param
			epsilon: the privacy budget
		"""
		self.LOG = Base.get_logger("Inference")
		self.data = data
		self.data_size = data.get_count()
		self.epsilon = epsilon
		self.rdomain = self.convert2rdomain(domain)
		# sort the members of every cluster while preserving the cluster order
		self.cluster = [sorted(members) for members in cluster]
		self.jtree_path = jtree_path
		self.histogramdds = histogramdds
	def __init__(self, domain, jtreepy, _lambda=0.2):
		"""Search setup: linear programming to reduce the noise variance.

		param:
			domain: the domain of the given data
				(note) the column order must match the original data
			jtreepy: the junction-tree structure
			_lambda: the balance number.

		TODO:
			1. Move jt_rep to Junction Tree Module
			2. Move Different Operator to a linear algebra package
		"""
		self.LOG = Base.get_logger("CliqueMerge")
		self.domain = domain
		self.node_card = list(map(len, domain.values()))
		self._lambda = float(_lambda)
		self.max_iter = 20
		self.jtree = jtreepy
		self.nodes_num = len(self.node_card)
		n_cliques = len(jtreepy)
		self.cliques_num = n_cliques
		self.cnum = [1] if n_cliques < 2 else range(2, n_cliques + 1)
		# node-index representation of every clique
		self.jtree_in_node_index = [
			self.find_subset_index(clq) for clq in jtreepy
		]
示例#9
0
    def __init__(self, edges, nodes, jtree_path=None):
        """Build the junction tree for the given graph.

        param
            edges: edge list; converted to an R list of vectors first
            nodes: the nodes of the graph
            jtree_path: forwarded to _build_jtree (presumably a file
                path — TODO confirm)
        """
        r_edges = self.convert2rlistofvector(edges)

        self.LOG = Base.get_logger("JunctionTree")
        self.jtree = self._build_jtree(r_edges, nodes, jtree_path)
示例#10
0
	def __init__(self, edges, nodes, jtree_path=None):
		"""Construct the junction tree.

		The edge list is converted into an R list of vectors before the
		tree itself is built by _build_jtree.
		"""
		converted = self.convert2rlistofvector(edges)

		self.LOG = Base.get_logger("JunctionTree")
		self.jtree = self._build_jtree(converted, nodes, jtree_path)