def emit(self, key, value):
    """Forward a key/value pair to the record writer or the up link.

    Progress is reported on every call.  When a record writer is
    configured the pair goes straight to it; otherwise it is sent over
    the up link, privately encoded on the mapper side when private
    encoding is enabled, and routed through the partitioner when one
    is available.
    """
    self.progress()
    if self.writer:
        self.writer.emit(key, value)
        return
    if self._is_mapper and self._private_encoding:
        # key is encoded before value, as in-order evaluation guarantees
        key, value = private_encode(key), private_encode(value)
    if self.partitioner:
        reducer = self.partitioner.partition(key, self.n_reduces)
        self.up_link.send(
            self.up_link.PARTITIONED_OUTPUT, reducer, key, value
        )
    else:
        self.up_link.send(self.up_link.OUTPUT, key, value)
def initialize_break_points(cls, n_reducers, sampled_records, input_dir,
                            n_threads=2):
    """Compute reducer break points from a sample of the input and
    cache them to a local file.

    Scans ``input_dir`` on HDFS for ``part*`` data files, samples
    ``sampled_records`` records across ``n_reducers`` evenly spaced
    files, builds a ``Selector`` over the resulting break points and
    serializes it to ``cls.TMP_DIR/cls.BREAK_POINTS_CACHE_FILE``.

    :param n_reducers: number of reduce tasks
    :param sampled_records: total number of records to sample
    :param input_dir: HDFS directory holding the ``part*`` input files
    :param n_threads: parallelism for the sampling step
    :return: path of the break-points cache file
    :raises ValueError: if the input holds too few records to sample
    """
    file_infos = [
        info for info in hdfs.lsl(input_dir)
        if (info['kind'] == 'file'
            and os.path.basename(info['name']).startswith('part'))
    ]
    n_files = len(file_infos)
    # generator expression: no intermediate list/map object
    total_size = sum(int(info['size']) for info in file_infos)
    n_records = total_size // RECORD_LENGTH
    # explicit check instead of assert: asserts are stripped under -O
    if n_records <= sampled_records:
        raise ValueError(
            "not enough input records to sample: %d <= %d"
            % (n_records, sampled_records)
        )
    # pick n_reducers files evenly spread across the input
    step = max(n_files // n_reducers, 1)
    paths = [
        info['name']
        for info in it.islice(file_infos, 0, step * n_reducers, step)
    ]
    break_points = cls.get_break_points(
        sampled_records // n_reducers, n_reducers, paths, n_threads
    )
    # pair each break point with the index (1..n_reducers-1) of the
    # reducer that handles the keys above it
    selector = Selector(list(zip(break_points, range(1, n_reducers))))
    bp_path = os.path.join(cls.TMP_DIR, cls.BREAK_POINTS_CACHE_FILE)
    with io.open(bp_path, "wb") as f:
        f.write(srl.private_encode(selector))
    return bp_path
def emit(self, key, value):
    """Emit a key/value pair, reporting progress on every call.

    With a record writer configured the pair is handed to it directly.
    Otherwise key and value are either privately encoded or coerced to
    text (``str``/``unicode``) and sent over the up link, routed
    through the partitioner when one is available.

    NOTE(review): the ``unicode`` builtin means this variant targets
    Python 2.
    """
    self.progress()
    if self.writer:
        self.writer.emit(key, value)
    else:
        if self._private_encoding:
            key = private_encode(key)
            value = private_encode(value)
        else:
            # isinstance with a tuple instead of ``type(x) in [...]``:
            # idiomatic, and also accepts str/unicode subclasses
            if not isinstance(key, (str, unicode)):
                key = unicode(key)
            if not isinstance(value, (str, unicode)):
                value = unicode(value)
        if self.partitioner:
            part = self.partitioner.partition(key, self.n_reduces)
            self.up_link.send('partitionedOutput', part, key, value)
        else:
            self.up_link.send('output', key, value)
def initialize_break_points(cls, n_reducers, sampled_records, input_dir,
                            n_threads=2):
    """Sample the input and cache reducer break points to a local file.

    Lists the ``part*`` files under ``input_dir`` on HDFS, samples
    ``sampled_records`` records from ``n_reducers`` evenly spaced
    files, wraps the resulting break points in a ``Selector`` and
    writes its private encoding to
    ``cls.TMP_DIR/cls.BREAK_POINTS_CACHE_FILE``.

    :param n_reducers: number of reduce tasks
    :param sampled_records: total number of records to sample
    :param input_dir: HDFS directory holding the ``part*`` input files
    :param n_threads: parallelism for the sampling step
    :return: path of the break-points cache file
    :raises ValueError: if the input holds too few records to sample
    """
    file_infos = [
        info for info in hdfs.lsl(input_dir)
        if (info['kind'] == 'file'
            and os.path.basename(info['name']).startswith('part'))
    ]
    n_files = len(file_infos)
    # sum over a generator expression rather than map(lambda ...)
    total_size = sum(int(info['size']) for info in file_infos)
    n_records = total_size // RECORD_LENGTH
    # raise instead of assert: assert is a no-op under python -O
    if n_records <= sampled_records:
        raise ValueError(
            "not enough input records to sample: %d <= %d"
            % (n_records, sampled_records)
        )
    # choose n_reducers files evenly distributed across the input
    step = max(n_files // n_reducers, 1)
    paths = [
        info['name']
        for info in it.islice(file_infos, 0, step * n_reducers, step)
    ]
    break_points = cls.get_break_points(
        sampled_records // n_reducers, n_reducers, paths, n_threads
    )
    # each break point is paired with a 1-based reducer index
    selector = Selector(list(zip(break_points, range(1, n_reducers))))
    bp_path = os.path.join(cls.TMP_DIR, cls.BREAK_POINTS_CACHE_FILE)
    with io.open(bp_path, "wb") as f:
        f.write(srl.private_encode(selector))
    return bp_path
def test_private_serialize(self):
    """Round-tripping through private_encode/private_decode must give
    back the original object for common builtin types."""
    samples = (1, 0.4, "Hello", [1, 2, 3], {"key": "value"})
    for sample in samples:
        round_tripped = srl.private_decode(srl.private_encode(sample))
        self.assertEqual(sample, round_tripped)