def _extract_fn(data_extractors: pipeline_dp.DataExtractors,
                row: DataType) -> DataType:
    """Extracts the columns (pid, pkey, pvalue) from a row.

    Args:
        data_extractors: Extractors that return the privacy_id, partition_key
            and value of an input row.
        row: The row to extract from, usually a raw input record of the
            pipeline.

    Returns:
        The data in the format (pid, pkey, pvalue) defined by the extractors.
    """
    return (data_extractors.privacy_id_extractor(row),
            data_extractors.partition_extractor(row),
            data_extractors.value_extractor(row))
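# A minimal usage sketch for _extract_fn. The dict-shaped rows and the
# extractors below are hypothetical and only illustrate the contract; any
# row type works as long as the three extractors accept it.
import functools

import pipeline_dp

_extractors = pipeline_dp.DataExtractors(
    privacy_id_extractor=lambda row: row["user"],
    partition_extractor=lambda row: row["day"],
    value_extractor=lambda row: row["spend"])

_rows = [{"user": "u1", "day": "2021-01-01", "spend": 5.0},
         {"user": "u2", "day": "2021-01-02", "spend": 3.0}]

_extract = functools.partial(_extract_fn, _extractors)
print([_extract(row) for row in _rows])
# [('u1', '2021-01-01', 5.0), ('u2', '2021-01-02', 3.0)]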
@classmethod
def setUpClass(cls):
    conf = pyspark.SparkConf()
    cls.sc = pyspark.SparkContext(conf=conf)
    cls.data_extractors = DataExtractors(
        partition_extractor=lambda x: x[1],
        privacy_id_extractor=lambda x: x[0],
        value_extractor=lambda x: x[2])
@classmethod
def setUpClass(cls):
    import pyspark
    conf = pyspark.SparkConf()
    cls.sc = pyspark.SparkContext.getOrCreate(conf=conf)
    cls.data_extractors = DataExtractors(
        partition_extractor=lambda x: x[1],
        privacy_id_extractor=lambda x: x[0],
        value_extractor=lambda x: x[2])
    cls.backend = SparkRDDBackend(cls.sc)
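# A hedged sketch of a test that could use the fixture above. It assumes the
# SparkRDDBackend.map(col, fn, stage_name) shape used elsewhere in this code
# and that the result is an RDD; the sample rows and the test name are made up.
def test_map_extracts_privacy_ids(self):
    rows = [("u1", "pk1", 1), ("u2", "pk2", 2)]
    rdd = self.sc.parallelize(rows)
    result = self.backend.map(rdd,
                              self.data_extractors.privacy_id_extractor,
                              "Extract privacy id")
    self.assertEqual(sorted(result.collect()), ["u1", "u2"])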
def aggregate_true(self, col, params: SampleParams,
                   data_extractors: pipeline_dp.DataExtractors) -> DataType:
    """Computes raw aggregation results of the input data without adding noise.

    Aggregation here means aggregating values grouped by partition_key. Both
    values and partition keys are extracted by the data extractors.

    Args:
        col: The data to aggregate. It can be local data, a Beam PCollection
            or a Spark RDD depending on the backend used.
        params: Parameters specifying which metrics to compute.
        data_extractors: Functions that extract privacy_id, partition_key and
            value from the input data.

    Returns:
        True (non-private) aggregation results.
    """
    combiner = non_private_combiners.create_compound_combiner(
        metrics=params.metrics)

    col = self._be.map(
        col, lambda row: (data_extractors.privacy_id_extractor(row),
                          data_extractors.partition_extractor(row),
                          data_extractors.value_extractor(row)),
        "Extract (privacy_id, partition_key, value)")
    # col : (privacy_id, partition_key, value)
    col = self._be.map_tuple(col, lambda pid, pk, v: ((pid, pk), v),
                             "Rekey to ((privacy_id, partition_key), value)")
    col = self._be.group_by_key(col, "Group by (privacy_id, partition_key)")
    # col : ((privacy_id, partition_key), [value])
    col = self._be.map_values(col, combiner.create_accumulator,
                              "Aggregate by (pk, pid)")
    # col : ((privacy_id, partition_key), accumulator)
    col = self._be.map_tuple(col, lambda pid_pk, v: (pid_pk[1], v),
                             "Drop privacy id")
    # col : (partition_key, accumulator)
    col = self._be.combine_accumulators_per_key(
        col, combiner, "Reduce accumulators per partition key")
    # col : (partition_key, accumulator)
    # Compute metrics.
    col = self._be.map_values(col, combiner.compute_metrics,
                              "Compute metrics")
    # col : (partition_key, aggregated_value)
    return col
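# A plain-Python sketch of the transformation chain in aggregate_true above,
# using COUNT as the only metric. The input rows and helper code are
# hypothetical and only illustrate the intermediate keyed shapes noted in the
# per-stage comments.
import collections

rows = [("u1", "pk1", 1), ("u1", "pk1", 2), ("u2", "pk1", 5), ("u2", "pk2", 7)]

# Extract (privacy_id, partition_key, value) and rekey to ((pid, pk), value).
keyed = [((pid, pk), v) for pid, pk, v in rows]

# Group by (privacy_id, partition_key) and create one accumulator per group
# (for COUNT the accumulator is simply the number of values).
per_user_partition = collections.defaultdict(list)
for key, v in keyed:
    per_user_partition[key].append(v)
accumulators = {key: len(values) for key, values in per_user_partition.items()}

# Drop the privacy id and reduce accumulators per partition key.
per_partition = collections.defaultdict(int)
for (pid, pk), acc in accumulators.items():
    per_partition[pk] += acc

print(dict(per_partition))  # {'pk1': 3, 'pk2': 1}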
@classmethod
def setUpClass(cls):
    cls.ops = MultiProcLocalPipelineOperations(n_jobs=1)
    cls.data_extractors = DataExtractors(
        partition_extractor=cls.partition_extract,
        privacy_id_extractor=cls.privacy_id_extract,
        value_extractor=cls.value_extract)
@classmethod
def setUpClass(cls):
    cls.ops = BeamOperations()
    cls.data_extractors = DataExtractors(
        partition_extractor=lambda x: x[1],
        privacy_id_extractor=lambda x: x[0],
        value_extractor=lambda x: x[2])
@classmethod
def setUpClass(cls):
    cls.backend = MultiProcLocalBackend(n_jobs=1)
    cls.data_extractors = DataExtractors(
        partition_extractor=cls.partition_extract,
        privacy_id_extractor=cls.privacy_id_extract,
        value_extractor=cls.value_extract)
@classmethod
def setUpClass(cls):
    cls.backend = LocalBackend()
    cls.data_extractors = DataExtractors(
        partition_extractor=lambda x: x[1],
        privacy_id_extractor=lambda x: x[0],
        value_extractor=lambda x: x[2])
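# A small hedged example of a test using the LocalBackend fixture above.
# LocalBackend operates on plain Python iterables and (as assumed here) may
# return results lazily, so list() materializes them; the sample rows and the
# test name are made up.
def test_extractors_on_local_backend(self):
    rows = [("u1", "pk1", 1), ("u2", "pk2", 2)]
    result = self.backend.map(rows,
                              self.data_extractors.partition_extractor,
                              "Extract partition key")
    self.assertEqual(list(result), ["pk1", "pk2"])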