class TestSane(Topology):
  config = {
      "topology.wide.config.1": "value",
      "spout.overriden.config": True
  }
  spout = HeronComponentSpec(None, "sp_class", True, 3, inputs=None,
                             outputs=["word", "count",
                                      Stream(fields=['error_msg'], name='error_stream')],
                             config={
                                 "spout.specific.config.1": "value",
                                 "spout.specific.config.2": True,
                                 "spout.specific.config.3": -12.4,
                                 "spout.specific.config.4": [1, 2, 3],
                                 "spout.overriden.config": False
                             })
  bolt = HeronComponentSpec(None, "bl_class", False, 4,
                            inputs={spout: Grouping.SHUFFLE,
                                    spout['error_stream']: Grouping.ALL})
class MapBolt(Bolt, StatefulComponent):
  """MapBolt"""
  # output declarer
  outputs = [Stream(fields=['_output_'], name='output')]
  FUNCTION = 'function'

  def initState(self, stateful_state):
    # mapBolt does not have any state
    pass

  def preSave(self, checkpoint_id):
    # mapBolt does not have any state
    pass

  def initialize(self, config, context):
    self.logger.debug("MapBolt's Component-specific config: \n%s" % str(config))
    self.processed = 0
    self.emitted = 0
    if MapBolt.FUNCTION in config:
      self.map_function = config[MapBolt.FUNCTION]
      if not callable(self.map_function):
        raise RuntimeError("Map function has to be callable")
    else:
      raise RuntimeError("MapBolt needs to be passed map function")

  def process(self, tup):
    retval = self.map_function(tup.values[0])
    self.emit([retval], stream='output')
    self.processed += 1
    self.emitted += 1
    self.ack(tup)
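# A minimal sketch (not from the source) of wiring MapBolt into a topology using the
# HeronComponentSpec pattern shown in TestSane above: the map callable is supplied under
# the MapBolt.FUNCTION config key and is applied to the first value of every incoming
# tuple. The class paths "spouts.WordSpout" and "bolts.MapBolt" are hypothetical
# placeholders; the spout is assumed to emit one word per tuple on its 'output' stream.
class WordLengthTopology(Topology):
  words = HeronComponentSpec(None, "spouts.WordSpout", True, 1,
                             outputs=[Stream(fields=['_output_'], name='output')])
  lengths = HeronComponentSpec(None, "bolts.MapBolt", False, 2,
                               inputs={words['output']: Grouping.SHUFFLE},
                               config={MapBolt.FUNCTION: len})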
class JoinBolt(SlidingWindowBolt):
  """JoinBolt"""
  # output declarer
  outputs = [Stream(fields=['_output_'], name='output')]
  WINDOWDURATION = SlidingWindowBolt.WINDOW_DURATION_SECS
  SLIDEINTERVAL = SlidingWindowBolt.WINDOW_SLIDEINTERVAL_SECS

  @staticmethod
  def _add(key, value, mymap):
    if key in mymap:
      mymap[key].append(value)
    else:
      mymap[key] = [value]

  def processWindow(self, window_config, tuples):
    # our temporary map
    mymap = {}
    for tup in tuples:
      userdata = tup.values[0]
      if not isinstance(userdata, collections.Iterable) or len(userdata) != 2:
        raise RuntimeError("Join tuples must be iterable of length 2")
      self._add(userdata[0], userdata[1], mymap)
    for (key, values) in mymap.items():
      self.emit([(key, values)], stream='output')
class SampleBolt(Bolt, StatefulComponent):
  """SampleBolt"""
  # output declarer
  outputs = [Stream(fields=['_output_'], name='output')]
  FRACTION = 'fraction'

  def initState(self, stateful_state):
    # sample does not have any state
    pass

  def preSave(self, checkpoint_id):
    # sample does not have any state
    pass

  def initialize(self, config, context):
    self.logger.debug("SampleBolt's Component-specific config: \n%s" % str(config))
    self.processed = 0
    self.emitted = 0
    if SampleBolt.FRACTION in config:
      self.sample_fraction = config[SampleBolt.FRACTION]
      if not isinstance(self.sample_fraction, float):
        raise RuntimeError("Sample fraction has to be a float")
      if self.sample_fraction > 1.0:
        raise RuntimeError("Sample fraction has to be <= 1.0")
    else:
      raise RuntimeError("SampleBolt needs to be passed a sample fraction")

  def process(self, tup):
    self.processed += 1
    self.ack(tup)
    raise RuntimeError("SampleBolt not fully functional")
def test_constructor(self):
  # sane
  stream = Stream(fields=['word', 'count'])
  self.assertEqual(stream.fields, ['word', 'count'])
  self.assertEqual(stream.stream_id, "default")

  stream = Stream(fields=['error', 'message'], name='error_stream')
  self.assertEqual(stream.fields, ['error', 'message'])
  self.assertEqual(stream.stream_id, "error_stream")

  stream = Stream()
  self.assertEqual(stream.fields, [])
  self.assertEqual(stream.stream_id, "default")

  # fields not list, tuple nor None
  with self.assertRaises(TypeError):
    Stream(fields={"key": "value"})

  # fields contains non-string
  with self.assertRaises(TypeError):
    Stream(fields=["hello", 123, "world"])

  # stream name not string
  with self.assertRaises(TypeError):
    Stream(fields=["hello", "world"], name=True)
  with self.assertRaises(TypeError):
    Stream(fields=["hello", "world"], name=None)
def test_get_out_streamids(self):
  # outputs is None
  spec = HeronComponentSpec("spout", "class", True, 1)
  ret = spec.get_out_streamids()
  self.assertEqual(ret, set())

  # outputs neither list nor tuple
  spec = HeronComponentSpec("spout", "class", True, 1)
  spec.outputs = "string"
  with self.assertRaises(TypeError):
    spec.get_out_streamids()

  # outputs sane
  spec = HeronComponentSpec("spout", "class", True, 1)
  spec.outputs = ["string", "hello",
                  Stream(fields=["abc", "def"], name="another_stream"),
                  Stream(fields=["another", "default"], name="default")]
  ret = spec.get_out_streamids()
  self.assertEqual(ret, {"default", "another_stream"})
def test_get_item(self):
  # HeronComponentSpec name set
  spec = HeronComponentSpec("spout", "class", True, 1)
  spec.outputs = ["string", "hello",
                  Stream(fields=["abc", "def"], name="another_stream"),
                  Stream(fields=["another", "default"], name="default")]
  ret = spec['another_stream']
  self.assertEqual(ret, GlobalStreamId("spout", "another_stream"))

  # HeronComponentSpec name not set
  spec = HeronComponentSpec(None, "class", True, 1)
  spec.outputs = ["string", "hello",
                  Stream(fields=["abc", "def"], name="another_stream"),
                  Stream(fields=["another", "default"], name="default")]
  ret = spec['default']
  self.assertEqual(ret, GlobalStreamId(spec, "default"))

  # stream id not registered
  spec = HeronComponentSpec(None, "class", True, 1)
  spec.outputs = ["string", "hello",
                  Stream(fields=["abc", "def"], name="another_stream"),
                  Stream(fields=["another", "default"], name="default")]
  with self.assertRaises(ValueError):
    spec['non_existent_stream']
def test_sanitize_outputs(self):
  # outputs is None (no argument to outputs)
  spec = HeronComponentSpec("spout", "class", True, 1)
  ret = spec._sanitize_outputs()
  self.assertIsNone(ret)

  # outputs neither list nor tuple
  spec = HeronComponentSpec("spout", "class", True, 1)
  spec.outputs = "string"
  with self.assertRaises(TypeError):
    spec._sanitize_outputs()

  # output list contains a non-string and non-Stream object
  spec = HeronComponentSpec("spout", "class", True, 1)
  spec.outputs = ["string", False, 123]
  with self.assertRaises(TypeError):
    spec._sanitize_outputs()

  # output list is all string
  spec = HeronComponentSpec("spout", "class", True, 1)
  spec.outputs = ["string", "hello", "heron"]
  ret = spec._sanitize_outputs()
  self.assertEqual(ret, {"default": ["string", "hello", "heron"]})

  # output list has mixed streams
  spec = HeronComponentSpec("spout", "class", True, 1)
  spec.outputs = ["string", "hello",
                  Stream(fields=["abc", "def"], name="another_stream"),
                  Stream(fields=["another", "default"], name="default")]
  ret = spec._sanitize_outputs()
  self.assertEqual(ret, {"default": ["string", "hello", "another", "default"],
                         "another_stream": ["abc", "def"]})
class MultiStreamSpout(Spout):
  """MultiStreamSpout: emits a set of words repeatedly on the default stream,
  and periodically emits messages on a separate 'error' stream
  """
  # output field declarer
  outputs = ['word', Stream(fields=['error_msg'], name='error')]

  def initialize(self, config, context):
    self.logger.info("In initialize() of MultiStreamSpout")
    self.words = cycle(["hello", "bye", "good", "bad", "heron", "storm"])
    self.emit_count = 0

    self.logger.info("Component-specific config: \n%s" % str(config))
    self.logger.info("Context: \n%s" % str(context))

  def next_tuple(self):
    word = next(self.words)
    self.emit([word])
    self.emit_count += 1

    if self.emit_count % 100000 == 0:
      self.logger.info("Emitted %s" % str(self.emit_count))
      self.logger.info("Emitting to error stream")
      self.emit(["test error message"], stream='error')
class FixedLinesSpout(Spout):
  """FixedLinesSpout: repeatedly generates lines from a fixed set of static lines"""
  outputs = [Stream(fields=['_output_'], name='output')]

  # pylint: disable=unused-argument
  def initialize(self, config, context):
    """Implements FixedLinesSpout's initialize method"""
    self.logger.info("Initializing FixedLinesSpout with the following")
    self.logger.info("Component-specific config: \n%s" % str(config))
    self.words = ["Mary had a little lamb",
                  "Humpy Dumpy sat on a wall",
                  "Here we round the Moulberry bush"]
    self.index = 0
    self.emit_count = 0
    self.ack_count = 0
    self.fail_count = 0

  def _get_next_line(self):
    retval = self.words[self.index]
    self.index += 1
    if self.index >= len(self.words):
      self.index = 0
    return retval

  def next_tuple(self):
    self.emit([self._get_next_line()], stream='output')
    self.emit_count += 1

  def ack(self, tup_id):
    self.ack_count += 1
    self.logger.debug("Acked tuple %s" % str(tup_id))

  def fail(self, tup_id):
    self.fail_count += 1
    self.logger.debug("Failed tuple %s" % str(tup_id))
class RepartitionBolt(Bolt, StatefulComponent):
  """RepartitionBolt"""
  # output declarer
  outputs = [Stream(fields=['_output_'], name='output')]

  def initState(self, stateful_state):
    # repartition does not have any state
    pass

  def preSave(self, checkpoint_id):
    # repartition does not have any state
    pass

  def initialize(self, config, context):
    self.logger.debug("RepartitionBolt's Component-specific config: \n%s" % str(config))
    self.processed = 0
    self.emitted = 0

  def process(self, tup):
    self.emit(tup.values, stream='output')
    self.processed += 1
    self.emitted += 1
    self.ack(tup)
class ReduceByKeyAndWindowBolt(SlidingWindowBolt):
  """ReduceByKeyAndWindowBolt"""
  # output declarer
  outputs = [Stream(fields=['_output_'], name='output')]
  FUNCTION = 'function'
  WINDOWDURATION = SlidingWindowBolt.WINDOW_DURATION_SECS
  SLIDEINTERVAL = SlidingWindowBolt.WINDOW_SLIDEINTERVAL_SECS

  def initialize(self, config, context):
    super(ReduceByKeyAndWindowBolt, self).initialize(config, context)
    if ReduceByKeyAndWindowBolt.FUNCTION not in config:
      raise RuntimeError("FUNCTION not specified in reducebywindow operator")
    self.reduce_function = config[ReduceByKeyAndWindowBolt.FUNCTION]
    if not callable(self.reduce_function):
      raise RuntimeError("Reduce Function has to be callable")

  @staticmethod
  def _add(key, value, mymap):
    if key in mymap:
      mymap[key].append(value)
    else:
      mymap[key] = [value]

  def processWindow(self, window_config, tuples):
    # our temporary map
    mymap = {}
    for tup in tuples:
      userdata = tup.values[0]
      if not isinstance(userdata, collections.Iterable) or len(userdata) != 2:
        raise RuntimeError("ReduceByWindow tuples must be iterable of length 2")
      self._add(userdata[0], userdata[1], mymap)
    for (key, values) in mymap.items():
      result = values[0]
      for value in values[1:]:
        # keep the running result of the reduce; discarding the return value would
        # make the loop a no-op
        result = self.reduce_function(result, value)
      self.emit([(key, result)], stream='output')
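# A minimal sketch (not from the source) of configuring ReduceByKeyAndWindowBolt: the
# reduce callable and the window parameters (in seconds, assumed) are supplied via the
# component config. The class path "spouts.PairSpout" is a hypothetical placeholder for
# a component that emits (key, value) pairs on its 'output' stream.
class WindowedSumTopology(Topology):
  pairs = HeronComponentSpec(None, "spouts.PairSpout", True, 1,
                             outputs=[Stream(fields=['_output_'], name='output')])
  sums = HeronComponentSpec(None, "bolts.ReduceByKeyAndWindowBolt", False, 1,
                            inputs={pairs['output']: Grouping.SHUFFLE},
                            config={
                                ReduceByKeyAndWindowBolt.FUNCTION: lambda a, b: a + b,
                                ReduceByKeyAndWindowBolt.WINDOWDURATION: 10,
                                ReduceByKeyAndWindowBolt.SLIDEINTERVAL: 10,
                            })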
class IntegrationTestBolt(Bolt):
  """Base bolt for integration tests

  Every bolt of an integration test topology is an instance of this class,
  each delegating to the user's bolt.
  """
  outputs = [Stream(fields=[integ_const.INTEGRATION_TEST_TERMINAL],
                    name=integ_const.INTEGRATION_TEST_CONTROL_STREAM_ID)]

  @classmethod
  def spec(cls, name, par, inputs, config, user_bolt_classpath, user_output_fields=None):
    python_class_path = "%s.%s" % (cls.__module__, cls.__name__)
    config[integ_const.USER_BOLT_CLASSPATH] = user_bolt_classpath
    # avoid modification to cls.outputs
    _outputs = copy.copy(cls.outputs)
    if user_output_fields is not None:
      _outputs.extend(user_output_fields)
    return HeronComponentSpec(name, python_class_path, is_spout=False, par=par,
                              inputs=inputs, outputs=_outputs, config=config)

  def initialize(self, config, context):
    user_bolt_classpath = config.get(integ_const.USER_BOLT_CLASSPATH, None)
    if user_bolt_classpath is None:
      raise RuntimeError("User defined integration bolt was not found")
    user_bolt_cls = self._load_user_bolt(context.get_topology_pex_path(), user_bolt_classpath)
    self.user_bolt = user_bolt_cls(delegate=self)

    upstream_components = set()
    self.terminal_to_receive = 0
    for streamId in context.get_this_sources().keys():
      # streamId is a topology_pb2.StreamId protobuf message
      upstream_components.add(streamId.component_name)
    for comp_name in upstream_components:
      self.terminal_to_receive += len(context.get_component_tasks(comp_name))
    self.tuple_received = 0
    self.tuples_processed = 0
    self.current_tuple_processing = None

    Log.info("Terminals to receive: %d" % self.terminal_to_receive)
    self.user_bolt.initialize(config, context)

  @staticmethod
  def _load_user_bolt(pex_file, classpath):
    pex_loader.load_pex(pex_file)
    cls = pex_loader.import_and_get_class(pex_file, classpath)
    return cls

  @property
  def is_done(self):
    return self.terminal_to_receive == 0

  def process(self, tup):
    self.tuple_received += 1
    stream_id = tup.stream
    Log.info("Received a tuple: %s from %s" % (tup, stream_id))
    if stream_id == integ_const.INTEGRATION_TEST_CONTROL_STREAM_ID:
      self.terminal_to_receive -= 1
      if self.is_done:
        if isinstance(self.user_bolt, BatchBolt):
          Log.info("Invoke bolt to do finish batch")
          self.user_bolt.finish_batch()
        Log.info("Populating the terminals to downstream")
        super(IntegrationTestBolt, self).emit(
            [integ_const.INTEGRATION_TEST_TERMINAL],
            stream=integ_const.INTEGRATION_TEST_CONTROL_STREAM_ID)
    else:
      self.current_tuple_processing = tup
      self.user_bolt.process(tup)
      self.ack(tup)

  def emit(self, tup, stream=Stream.DEFAULT_STREAM_ID, anchors=None,
           direct_task=None, need_task_ids=False):
    if tup is None:
      super(IntegrationTestBolt, self).emit(list(self.current_tuple_processing),
                                            stream=stream, anchors=anchors,
                                            direct_task=direct_task,
                                            need_task_ids=need_task_ids)
    else:
      super(IntegrationTestBolt, self).emit(tup, stream, anchors, direct_task, need_task_ids)

  def ack(self, tup):
    Log.info("Trying to do an ack. tuples processed: %d, received: %d"
             % (self.tuples_processed, self.tuple_received))
    if self.tuples_processed < self.tuple_received:
      super(IntegrationTestBolt, self).ack(tup)
      self.tuples_processed += 1

  def fail(self, tup):
    Log.info("Trying to do a fail. tuples processed: %d, received: %d"
             % (self.tuples_processed, self.tuple_received))
    if self.tuples_processed < self.tuple_received:
      super(IntegrationTestBolt, self).fail(tup)
      self.tuples_processed += 1
class PulsarSpout(Spout):
  """PulsarSpout: reads from a pulsar topic"""
  # pylint: disable=too-many-instance-attributes
  # pylint: disable=no-self-use

  outputs = [Stream(fields=['_output_'], name='output')]

  def default_deserializer(self, msg):
    return [str(msg)]

  # TopologyBuilder uses these constants to set cluster/topic name
  serviceUrl = "PULSAR_SERVICE_URL"
  topicName = "PULSAR_TOPIC"
  receiveTimeoutMs = "PULSAR_RECEIVE_TIMEOUT_MS"
  deserializer = "PULSAR_MESSAGE_DESERIALIZER"

  def initialize(self, config, context):
    """Implements Pulsar Spout's initialize method"""
    self.logger.info("Initializing PulsarSpout with the following")
    self.logger.info("Component-specific config: \n%s" % str(config))
    self.logger.info("Context: \n%s" % str(context))
    self.emit_count = 0
    self.ack_count = 0
    self.fail_count = 0
    if not PulsarSpout.serviceUrl in config or not PulsarSpout.topicName in config:
      self.logger.fatal("Need to specify both serviceUrl and topicName")
    self.pulsar_cluster = str(config[PulsarSpout.serviceUrl])
    self.topic = str(config[PulsarSpout.topicName])
    mode = config[api_constants.TOPOLOGY_RELIABILITY_MODE]
    if mode == api_constants.TopologyReliabilityMode.ATLEAST_ONCE:
      self.acking_timeout = 1000 * int(config[api_constants.TOPOLOGY_MESSAGE_TIMEOUT_SECS])
    else:
      self.acking_timeout = 30000
    if PulsarSpout.receiveTimeoutMs in config:
      self.receive_timeout_ms = config[PulsarSpout.receiveTimeoutMs]
    else:
      self.receive_timeout_ms = 10
    if PulsarSpout.deserializer in config:
      self.deserializer = config[PulsarSpout.deserializer]
      if not callable(self.deserializer):
        self.logger.fatal("Pulsar Message Deserializer needs to be callable")
    else:
      self.deserializer = self.default_deserializer

    # First generate the log config
    self.logConfFileName = GenerateLogConfig(context)
    self.logger.info("Generated LogConf at %s" % self.logConfFileName)

    # We currently use the high level consumer api.
    # For supporting exactly once, we will need to switch to the lower level
    # Reader api, when it becomes available in python.
    self.client = pulsar.Client(self.pulsar_cluster,
                                log_conf_file_path=self.logConfFileName)
    self.logger.info("Setup Client with cluster %s" % self.pulsar_cluster)
    try:
      self.consumer = self.client.subscribe(self.topic, context.get_topology_name(),
                                            consumer_type=pulsar.ConsumerType.Failover,
                                            unacked_messages_timeout_ms=self.acking_timeout)
    except Exception as e:
      self.logger.fatal("Pulsar client subscription failed: %s" % str(e))
    self.logger.info("Subscribed to topic %s" % self.topic)

  def next_tuple(self):
    try:
      msg = self.consumer.receive(timeout_millis=self.receive_timeout_ms)
    except Exception as e:
      self.logger.debug("Exception during receive: %s" % str(e))
      return

    try:
      self.emit(self.deserializer(msg.data()), tup_id=msg.message_id())
      self.emit_count += 1
    except Exception as e:
      self.logger.info("Exception during emit: %s" % str(e))

  def ack(self, tup_id):
    self.ack_count += 1
    self.consumer.acknowledge(tup_id)

  def fail(self, tup_id):
    self.fail_count += 1
    self.logger.debug("Failed tuple %s" % str(tup_id))
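# A minimal sketch (not from the source) of wiring PulsarSpout into a topology. The
# cluster URL and topic name are supplied through the PulsarSpout.serviceUrl and
# PulsarSpout.topicName config keys defined above; the values below and the class path
# "spouts.PulsarSpout" are hypothetical placeholders.
class PulsarReaderTopology(Topology):
  pulsar_source = HeronComponentSpec(None, "spouts.PulsarSpout", True, 1,
                                     config={
                                         PulsarSpout.serviceUrl: "pulsar://localhost:6650",
                                         PulsarSpout.topicName: "persistent://sample/standalone/ns1/my-topic",
                                     })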
class DslBoltBase(object):
  """DslBoltBase"""
  # output declarer
  outputs = [Stream(fields=['_output_'], name='output')]
class IntegrationTestSpout(Spout):
  """Base spout for integration tests

  Every spout of an integration test topology is an instance of this class,
  each delegating to the user's spout.
  """
  outputs = [Stream(fields=[integ_const.INTEGRATION_TEST_TERMINAL],
                    name=integ_const.INTEGRATION_TEST_CONTROL_STREAM_ID)]

  @classmethod
  def spec(cls, name, par, config, user_spout_classpath, user_output_fields=None):
    python_class_path = "%s.%s" % (cls.__module__, cls.__name__)
    config[integ_const.USER_SPOUT_CLASSPATH] = user_spout_classpath
    # avoid modification to cls.outputs
    _outputs = copy.copy(cls.outputs)
    if user_output_fields is not None:
      _outputs.extend(user_output_fields)
    return HeronComponentSpec(name, python_class_path, is_spout=True, par=par,
                              inputs=None, outputs=_outputs, config=config)

  def initialize(self, config, context):
    user_spout_classpath = config.get(integ_const.USER_SPOUT_CLASSPATH, None)
    if user_spout_classpath is None:
      raise RuntimeError("User defined integration test spout was not found")
    user_spout_cls = self._load_user_spout(context.get_topology_pex_path(),
                                           user_spout_classpath)
    self.user_spout = user_spout_cls(delegate=self)
    self.max_executions = config.get(integ_const.USER_MAX_EXECUTIONS,
                                     integ_const.MAX_EXECUTIONS)
    assert isinstance(self.max_executions, int) and self.max_executions > 0
    Log.info("Max executions: %d" % self.max_executions)
    self.tuples_to_complete = 0

    self.user_spout.initialize(config, context)

  @staticmethod
  def _load_user_spout(pex_file, classpath):
    pex_loader.load_pex(pex_file)
    cls = pex_loader.import_and_get_class(pex_file, classpath)
    return cls

  @property
  def is_done(self):
    return self.max_executions == 0

  def next_tuple(self):
    if self.is_done:
      return

    self.max_executions -= 1
    Log.info("max executions: %d" % self.max_executions)

    self.user_spout.next_tuple()

    if self.is_done:
      self._emit_terminal_if_needed()
      Log.info("This topology is finished.")

  def ack(self, tup_id):
    Log.info("Received an ack with tuple id: %s" % str(tup_id))
    self.tuples_to_complete -= 1
    if tup_id != integ_const.INTEGRATION_TEST_MOCK_MESSAGE_ID:
      self.user_spout.ack(tup_id)
    self._emit_terminal_if_needed()

  def fail(self, tup_id):
    Log.info("Received a fail message with tuple id: %s" % str(tup_id))
    self.tuples_to_complete -= 1
    if tup_id != integ_const.INTEGRATION_TEST_MOCK_MESSAGE_ID:
      self.user_spout.fail(tup_id)
    self._emit_terminal_if_needed()

  def emit(self, tup, tup_id=None, stream=Stream.DEFAULT_STREAM_ID,
           direct_task=None, need_task_ids=None):
    """Emits from this integration test spout

    Overridden method which will be called when the user's spout calls emit()
    """
    # if is_control True -> control stream should not count
    self.tuples_to_complete += 1
    if tup_id is None:
      Log.info("Add tup_id for tuple: %s" % str(tup))
      _tup_id = integ_const.INTEGRATION_TEST_MOCK_MESSAGE_ID
    else:
      _tup_id = tup_id

    super(IntegrationTestSpout, self).emit(tup, _tup_id, stream, direct_task, need_task_ids)

  def _emit_terminal_if_needed(self):
    Log.info("is_done: %s, tuples_to_complete: %s" % (self.is_done, self.tuples_to_complete))
    if self.is_done and self.tuples_to_complete == 0:
      Log.info("Emitting terminals to downstream")
      super(IntegrationTestSpout, self).emit(
          [integ_const.INTEGRATION_TEST_TERMINAL],
          stream=integ_const.INTEGRATION_TEST_CONTROL_STREAM_ID)
def test_sanitize_inputs(self):
  # Note that _sanitize_inputs() should only be called after HeronComponentSpec's
  # name attribute is set

  # invalid inputs given as argument (valid ones are either dict, list, tuple or None)
  invalid_spec = HeronComponentSpec("name", "classpath", True, 1, inputs="string")
  with self.assertRaises(TypeError):
    invalid_spec._sanitize_inputs()
  invalid_spec = HeronComponentSpec("name", "classpath", True, 1, inputs=100)
  with self.assertRaises(TypeError):
    invalid_spec._sanitize_inputs()

  # dict <HeronComponentSpec -> Grouping>
  from_spec = HeronComponentSpec("spout", "sp_clspath", True, 1)
  to_spec = HeronComponentSpec("bolt", "bl_clspath", False, 1,
                               inputs={from_spec: Grouping.SHUFFLE})
  ret = to_spec._sanitize_inputs()
  self.assertEqual(ret, {GlobalStreamId("spout", "default"): Grouping.SHUFFLE})

  from_spec = HeronComponentSpec("spout", "sp_clspath", True, 1)
  from_spec.outputs = [Stream(name='another_stream')]
  to_spec = HeronComponentSpec("bolt", "bl_clspath", False, 1,
                               inputs={from_spec['another_stream']: Grouping.ALL})
  ret = to_spec._sanitize_inputs()
  self.assertEqual(ret, {GlobalStreamId("spout", "another_stream"): Grouping.ALL})

  # HeronComponentSpec's name attribute not set
  from_spec = HeronComponentSpec(None, "sp_clspath", True, 1)
  to_spec = HeronComponentSpec("bolt", "bl_clspath", False, 1,
                               inputs={from_spec: Grouping.ALL})
  with self.assertRaises(RuntimeError):
    to_spec._sanitize_inputs()

  # dict <GlobalStreamId -> Grouping>
  inputs_dict = {GlobalStreamId("some_spout", "some_stream"): Grouping.NONE,
                 GlobalStreamId("another_spout", "default"): Grouping.fields(['word', 'count'])}
  spec = HeronComponentSpec("bolt", "classpath", False, 1, inputs=inputs_dict)
  ret = spec._sanitize_inputs()
  self.assertEqual(ret, inputs_dict)

  # list of HeronComponentSpec
  from_spec1 = HeronComponentSpec("spout1", "sp1_cls", True, 1)
  from_spec2 = HeronComponentSpec("spout2", "sp2_cls", True, 1)
  to_spec = HeronComponentSpec("bolt", "bl_cls", False, 1,
                               inputs=[from_spec1, from_spec2])
  ret = to_spec._sanitize_inputs()
  self.assertEqual(ret, {GlobalStreamId("spout1", "default"): Grouping.SHUFFLE,
                         GlobalStreamId("spout2", "default"): Grouping.SHUFFLE})

  # HeronComponentSpec's name attribute not set
  from_spec = HeronComponentSpec(None, "sp_clspath", True, 1)
  to_spec = HeronComponentSpec("bolt", "bl_clspath", False, 1, inputs=[from_spec])
  with self.assertRaises(RuntimeError):
    to_spec._sanitize_inputs()

  # list of GlobalStreamId
  inputs_list = [GlobalStreamId("spout1", "default"), GlobalStreamId("spout2", "some_stream")]
  spec = HeronComponentSpec("bolt", "bl_cls", False, 1, inputs=inputs_list)
  ret = spec._sanitize_inputs()
  self.assertEqual(ret, dict(zip(inputs_list, [Grouping.SHUFFLE] * 2)))

  # list of neither GlobalStreamId nor HeronComponentSpec
  inputs_list = [None, 123, "string", [GlobalStreamId("sp", "default")]]
  spec = HeronComponentSpec("bolt", "bl_cls", False, 1, inputs=inputs_list)
  with self.assertRaises(ValueError):
    spec._sanitize_inputs()