def test_parallelism():
    """Check the instance count of every operator in a small dataflow.

    The environment-wide parallelism is set to 2 and a
    source -> map -> filter -> flat_map chain is built; the source is
    expected to have 1 instance and every other operator 2.  The chain is
    then extended with "Map1" (parallelism set to 3) and "Map2"
    (parallelism set to 4) and all operators are checked again.
    """
    env = Environment()
    # Common parallelism for all operators that do not set their own
    env.set_parallelism(2)
    stream = env.source(None).map(None).filter(None).flat_map(None)
    env._collect_garbage()
    for op in env.operators.values():
        # TODO (john): Currently each source has only one instance
        expected = 1 if op.type == OpType.Source else 2
        assert op.num_instances == expected, (op.num_instances, expected)

    # Check again after adding operators with a different parallelism
    stream.map(None, "Map1").shuffle().set_parallelism(3).map(
        None, "Map2").set_parallelism(4)
    env._collect_garbage()
    for op in env.operators.values():
        if op.type == OpType.Source:
            expected = 1
        elif op.name == "Map1":
            expected = 3
        elif op.name == "Map2":
            expected = 4
        else:
            # Operators from the first chain keep the common parallelism
            expected = 2
        assert op.num_instances == expected, (op.num_instances, expected)
def test_partitioning():
    """Check which partitioning strategy survives repeated redefinition.

    Several strategies are declared back to back on the same stream, both
    before and after a map.  Every scheme recorded on the source is
    expected to be Broadcast and every scheme recorded on any other
    operator is expected to be Shuffle, i.e. the last one declared.
    """
    env = Environment()
    # Try defining multiple partitioning strategies for the same stream
    _ = env.source(None).shuffle().rescale().broadcast().map(
        None).broadcast().shuffle()
    env._collect_garbage()
    for op in env.operators.values():
        for scheme in op.partitioning_strategies.values():
            # Only the last defined strategy should be kept
            expected = (PStrategy.Broadcast
                        if op.type == OpType.Source else PStrategy.Shuffle)
            assert scheme.strategy == expected, (scheme.strategy, expected)
def test_forking():
    """Check the logical edges and partitioning of a forked stream.

    A source -> map stream is forked into two branches:
      * shuffle -> key_by(0) -> sum(1)
      * key_by(1) -> sum(2), with no explicit partitioning
    The test first records the id of each operator, then walks every edge
    of ``env.logical_topo`` and asserts its destination and the
    partitioning strategy registered on the edge's source operator.
    """
    env = Environment()
    # Try forking a stream
    stream = env.source(None).map(None).set_parallelism(2)
    # First branch with a shuffle partitioning strategy
    _ = stream.shuffle().key_by(0).sum(1)
    # Second branch with the default partitioning strategy
    _ = stream.key_by(1).sum(2)
    env._collect_garbage()

    # Operator ids, filled in by the scan below
    source_id = map_id = None
    keyby1_id = keyby2_id = None
    sum1_id = sum2_id = None

    # Collect ids; key_by/sum operators are told apart by their argument
    for op_id, op in env.operators.items():
        op_type = op.type
        if op_type == OpType.Source:
            source_id = op_id
        elif op_type == OpType.Map:
            map_id = op_id
        elif op_type == OpType.KeyBy:
            if op.other_args == 0:
                keyby1_id = op_id
            else:
                assert op.other_args == 1, (op.other_args, 1)
                keyby2_id = op_id
        elif op_type == OpType.Sum:
            if op.other_args == 1:
                sum1_id = op_id
            else:
                assert op.other_args == 2, (op.other_args, 2)
                sum2_id = op_id

    # Check generated streams and their partitioning
    for src, dst in env.logical_topo.edges:
        upstream = env.operators[src]

        if src == source_id:
            assert dst == map_id, (dst, map_id)
            continue

        if src == map_id:
            strategy = upstream.partitioning_strategies[dst].strategy
            key_index = env.operators[dst].other_args
            if key_index == 0:
                # This must be the first branch
                assert strategy == PStrategy.Shuffle, (strategy,
                                                       PStrategy.Shuffle)
                assert dst == keyby1_id, (dst, keyby1_id)
            else:
                # This must be the second branch
                assert key_index == 1, (key_index, 1)
                assert strategy == PStrategy.Forward, (strategy,
                                                       PStrategy.Forward)
                assert dst == keyby2_id, (dst, keyby2_id)
            continue

        if src == keyby1_id or src == keyby2_id:
            strategy = upstream.partitioning_strategies[dst].strategy
            key_index = env.operators[dst].other_args
            if key_index == 1:
                # This must be the first branch
                assert strategy == PStrategy.ShuffleByKey, (
                    strategy, PStrategy.ShuffleByKey)
                assert dst == sum1_id, (dst, sum1_id)
            else:
                # This must be the second branch
                assert key_index == 2, (key_index, 2)
                assert strategy == PStrategy.ShuffleByKey, (
                    strategy, PStrategy.ShuffleByKey)
                assert dst == sum2_id, (dst, sum2_id)
            continue

        # Any remaining edge source must be a sum operator
        assert upstream.type == OpType.Sum, (upstream.type, OpType.Sum)