def _mapper_output_protocol(self, step_num, step_map):
    """Return the protocol used for the output of step *step_num*'s mapper.

    *step_map* maps step keys (as built by ``self._step_key()``) to
    positions. If the mapper's position is the last one in the map
    (or beyond), the job's output protocol is returned; otherwise the
    internal protocol is returned.
    """
    key = self._step_key(step_num, 'mapper')

    if key not in step_map:
        # Not a script substep: protocols don't apply to it at all.
        return RawValueProtocol()

    is_final = step_map[key] >= len(step_map) - 1
    return self.output_protocol() if is_final else self.internal_protocol()
def _pick_protocol_instances(self, step_num, step_type):
    """Return a ``(read, write)`` pair of protocols for one substep.

    *step_num* and *step_type* identify the substep; the pair is chosen
    from the substep's position in the script-step mapping built from
    ``self._steps_desc()``.

    Raises ValueError when the substep is not in the script-step mapping
    and ``self.options.strict_protocols`` is set.
    """
    script_steps = self._script_step_mapping(self._steps_desc())

    if step_type == 'combiner':
        # A combiner may run any number of times (including zero), so it
        # must not change the data format: it both reads and writes
        # whatever the mapper of the same step writes. For non-script
        # substeps that helper simply yields RawValueProtocol().
        mapper_out = self._mapper_output_protocol(step_num, script_steps)
        return mapper_out, mapper_out

    key = self._step_key(step_num, step_type)

    if key not in script_steps:
        # Rare in practice. We can't know what a jar or command does
        # with the data, so the only honest answer is the raw protocol,
        # unless strict protocols are requested, in which case refuse.
        if self.options.strict_protocols:
            raise ValueError(
                "Can't pick a protocol for a non-script step")
        raw = RawValueProtocol()
        return raw, raw

    position = script_steps[key]
    last_position = len(script_steps) - 1

    # The last script substep writes the job's output; every other one
    # writes the internal protocol.
    writer = (self.output_protocol() if position == last_position
              else self.internal_protocol())
    # The first script substep reads the job's input; every other one
    # reads the internal protocol.
    reader = (self.input_protocol() if position == 0
              else self.internal_protocol())

    return reader, writer
def test_no_strip(self):
    """read() returns the line unchanged, trailing whitespace included."""
    line = 'foo\t \n\n'
    self.assertEqual(RawValueProtocol.read(line), (None, line))
def test_reads_raw_line(self):
    """read() yields a None key and the whole line as the value."""
    actual = RawValueProtocol.read('foobar')
    expected = (None, 'foobar')
    self.assertEqual(actual, expected)
def test_dumps_keys(self):
    """write() drops the key and emits only the value."""
    written = RawValueProtocol.write('foo', 'bar')
    self.assertEqual(written, 'bar')
def output_protocol(self):
    """Return a new ``RawValueProtocol()`` to use as the output protocol."""
    protocol = RawValueProtocol()
    return protocol
def input_protocol(self):
    """Pick the input protocol from ``self.options.job_to_run``.

    The 'stats' job gets ``JSONProtocol()``; any other job gets
    ``RawValueProtocol()``. The choice is logged at debug level before
    the protocol is constructed.
    """
    if self.options.job_to_run != 'stats':
        message = 'Reading text input from cdx files'
        make_protocol = RawValueProtocol
    else:
        message = 'Reading JSON input from count job'
        make_protocol = JSONProtocol

    LOG.debug(message)
    return make_protocol()
def test_bytestrings(self):
    """Round-trip a value holding a ``\\xe9`` character and a backslash.

    The key is None and the value is passed to ``assertRoundTripOK``
    together with a fresh ``RawValueProtocol()``.
    """
    # The backslash before 'c' is a literal backslash. It is written as
    # '\\' because '\c' is not a recognized escape sequence and newer
    # Python versions warn about it; the runtime value is unchanged.
    self.assertRoundTripOK(RawValueProtocol(), None, '\xe90\\c1a')
def test_dumps_keys(self):
    """write() on an instance drops the key and emits only the bytes value."""
    protocol = RawValueProtocol()
    self.assertEqual(protocol.write(b'foo', b'bar'), b'bar')