Example No. 1
 def test_basics(self):
     v = ('a' * 10, 'b' * 90)
     pickler = coders.PickleCoder()
     self.assertEqual(v, pickler.decode(pickler.encode(v)))
     pickler = coders.Base64PickleCoder()
     self.assertEqual(v, pickler.decode(pickler.encode(v)))
     self.assertEqual(coders.Base64PickleCoder().encode(v),
                      base64.b64encode(coders.PickleCoder().encode(v)))
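As the last assertion pins down, Base64PickleCoder behaves as PickleCoder composed with base64. A minimal round-trip sketch outside the test harness (assuming only that apache_beam is installed; the value is an arbitrary picklable object):

    from apache_beam import coders

    coder = coders.PickleCoder()
    value = {'k': [1, 2, 3]}
    assert coder.decode(coder.encode(value)) == value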
Example No. 2
    def test_should_sample(self):
        # An order of magnitude more buckets than the highest constant in the code under test.
        buckets = [0] * 300
        # The seed is arbitrary and exists just to ensure this test is robust.
        # If you don't like this seed, try your own; the test should still pass.
        random.seed(1720)
        # Do enough runs that the expected number of hits even in the last
        # buckets is big enough to allow some statistical smoothing.
        total_runs = 10 * len(buckets)

        # Fill the buckets.
        for _ in range(total_runs):
            opcounts = OperationCounters(CounterFactory(), 'some-name',
                                         coders.PickleCoder(), 0)
            for i in range(len(buckets)):
                if opcounts.should_sample():
                    buckets[i] += 1

        # Look at the buckets to see if they are likely.
        for i in range(10):
            self.assertEqual(total_runs, buckets[i])
        for i in range(10, len(buckets)):
            self.assertTrue(
                buckets[i] > 7 * total_runs / i,
                'i=%d, buckets[i]=%d, expected=%d, ratio=%f' %
                (i, buckets[i], 10 * total_runs / i, buckets[i] /
                 (10.0 * total_runs / i)))
            self.assertTrue(
                buckets[i] < 14 * total_runs / i,
                'i=%d, buckets[i]=%d, expected=%d, ratio=%f' %
                (i, buckets[i], 10 * total_runs / i, buckets[i] /
                 (10.0 * total_runs / i)))
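The bounds spell out the sampling model this test assumes: the first 10 elements are always sampled, and afterwards the i-th element is sampled with probability roughly 10/i, so bucket i should collect about 10 * total_runs / i hits; the 7x and 14x factors leave slack for randomness. A minimal sketch of that assumed rule (a hypothetical stand-in, not Beam's actual should_sample implementation):

    import random

    def make_should_sample(always=10):
        # Keep the first `always` elements, then keep the i-th element
        # with probability always / i.
        count = 0
        def should_sample():
            nonlocal count
            count += 1
            return count <= always or random.random() < always / count
        return should_sample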
Example No. 3
 def test_update_int(self):
     opcounts = OperationCounters(CounterFactory(), 'some-name',
                                  coders.PickleCoder(), 0)
     self.verify_counters(opcounts, 0)
     opcounts.update_from(GlobalWindows.windowed_value(1))
     opcounts.update_collect()
     self.verify_counters(opcounts, 1)
Example No. 4
class SyncFn(beam.DoFn):

    STATE = userstate.BagStateSpec('state', coders.PickleCoder())

    def __init__(self, size):
        assert size > 0, 'Must provide a positive size'
        self.size = size

    def process(self, element, state=beam.DoFn.StateParam(STATE)):
        key, value = element

        # By convention the bag state holds at most one element: a dict
        # mapping key -> buffered values.
        cache = list(state.read())
        if cache:
            cache = cache[0]
        else:
            cache = {}

        values = cache.get(key, [])
        values.append(value)

        if len(values) == self.size:
            if key in cache:
                del cache[key]
            yield tuple(values)
        else:
            cache[key] = values

        # Bag state cannot be overwritten in place, so clear it and re-add
        # the updated cache.
        state.clear()
        if cache:
            state.add(cache)
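A hypothetical usage sketch (keys and values invented for illustration): stateful DoFns require keyed input, and SyncFn emits the buffered values as one tuple once `size` of them have arrived for a key.

    import apache_beam as beam

    with beam.Pipeline() as p:
        (p
         | beam.Create([('a', 1), ('a', 2), ('a', 3)])  # keyed input
         | beam.ParDo(SyncFn(size=3))                   # emits (1, 2, 3)
         | beam.Map(print))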
Example No. 5
 def test_update_str(self):
     coder = coders.PickleCoder()
     opcounts = OperationCounters(CounterFactory(), 'some-name', coder, 0)
     self.verify_counters(opcounts, 0, float('nan'))
     value = GlobalWindows.windowed_value('abcde')
     opcounts.update_from(value)
     estimated_size = coder.estimate_size(value)
     self.verify_counters(opcounts, 1, estimated_size)
Example No. 6
    class IndexAssigningStatefulDoFn(beam.DoFn):
        INDEX_STATE = CombiningValueStateSpec(
            name="index", coder=coders.PickleCoder(), combine_fn=sum)

        def process(self, element, index=beam.DoFn.StateParam(INDEX_STATE)):
            _, value = element
            current_index = index.read()
            index.add(1)
            yield (current_index, value)
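Because combine_fn is sum, each index.add(1) folds into a per-key running count, so index.read() yields 0, 1, 2, ... in processing order. A hypothetical usage sketch (input invented for illustration):

    import apache_beam as beam

    with beam.Pipeline() as p:
        (p
         | beam.Create([('k', 'x'), ('k', 'y'), ('k', 'z')])
         | beam.ParDo(IndexAssigningStatefulDoFn())
         | beam.Map(print))  # (0, 'x'), (1, 'y'), (2, 'z') for key 'k'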
Example No. 7
 def test_update_old_object(self):
     coder = coders.PickleCoder()
     opcounts = OperationCounters(CounterFactory(), 'some-name', coder, 0)
     self.verify_counters(opcounts, 0, float('nan'))
     obj = OldClassThatDoesNotImplementLen()
     value = GlobalWindows.windowed_value(obj)
     opcounts.update_from(value)
     estimated_size = coder.estimate_size(value)
     self.verify_counters(opcounts, 1, estimated_size)
Example No. 8
class _StatefulJobOutputsFn(beam.DoFn):

    STATE = userstate.BagStateSpec('state', coders.PickleCoder())

    def process(self, element, level, state=beam.DoFn.StateParam(STATE)):
        assert level in JobAggregateLevel.STATEFUL

        # example payload structure...
        # {
        #     'source': Any
        #     'graphid': 0,
        #     'jobtasks': {0: 3, 1: 3},
        #     'jobid': 0,
        #     'taskid': 2,
        #     'output': [
        #         '/tmp/job-0_output-0.task-2.ext',
        #         '/tmp/job-0_output-1.task-2.ext',
        #     ],
        # }
        _, payload = element

        # There are two values we will track that differ depending on the
        # aggregation type/level desired.
        #
        # - key : aggregation per-unique value
        # - size : total number of times expected to see `key`

        key = payload[level]
        if level == JobAggregateLevel.JOB:
            # str(key) because JSON serialization turns all dict keys into strings
            size = payload['jobtasks'][str(key)]
        elif level == JobAggregateLevel.GRAPH:
            size = sum(payload['jobtasks'].values())
        else:
            raise NotImplementedError

        cache = dict(state.read())
        seen, data = cache.get(key, (0, []))
        seen += 1
        data.extend(payload['output'])
        cache[key] = (seen, data)
        state.clear()

        # Iterate over a snapshot: cache is mutated (via pop) inside the loop,
        # and mutating a dict during direct iteration raises RuntimeError.
        for k, v in list(cache.items()):
            # size == seen: every expected output for this key has arrived.
            if size == v[0]:
                yield cache.pop(k)[1]
            else:
                state.add((k, v))
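A convention worth naming, since it recurs in SyncFn above and SolveDoFn below: single-value state is emulated with a BagStateSpec by reading the bag into a list, treating its only element as the current value, then clearing and re-adding. A generic sketch of the pattern (helper names are made up):

    def read_single(state, default):
        items = list(state.read())    # the bag holds 0 or 1 items by convention
        return items[0] if items else default

    def write_single(state, value):
        state.clear()                 # a bag cannot be overwritten in place...
        state.add(value)              # ...so clear it, then add the new value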
Example No. 9
 def test_update_multiple(self):
     coder = coders.PickleCoder()
     total_size = 0
     opcounts = OperationCounters(CounterFactory(), 'some-name', coder, 0)
     self.verify_counters(opcounts, 0, float('nan'))
     value = GlobalWindows.windowed_value('abcde')
     opcounts.update_from(value)
     total_size += coder.estimate_size(value)
     value = GlobalWindows.windowed_value('defghij')
     opcounts.update_from(value)
     total_size += coder.estimate_size(value)
     self.verify_counters(opcounts, 2, float(total_size) / 2)
     value = GlobalWindows.windowed_value('klmnop')
     opcounts.update_from(value)
     total_size += coder.estimate_size(value)
     self.verify_counters(opcounts, 3, float(total_size) / 3)
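Read together with the tests above, the third argument to verify_counters is evidently the mean estimated element size: NaN before any element has been observed, then total_size / n after n updates.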
Example No. 10
 def run_Create(self, transform_node):
   transform = transform_node.transform
   step = self._add_step(TransformNames.CREATE_PCOLLECTION,
                         transform_node.full_label, transform_node)
   # TODO(silviuc): Eventually use a coder based on typecoders.
   # Note that we base64-encode values here so that the service will accept
   # the values.
   element_coder = coders.PickleCoder()
   step.add_property(
       PropertyNames.ELEMENT,
       [base64.b64encode(element_coder.encode(v))
        for v in transform.value])
   # The service expects a WindowedValueCoder here, so we wrap the actual
   # encoding in a WindowedValueCoder.
   step.encoding = self._get_cloud_encoding(
       coders.WindowedValueCoder(element_coder))
   step.add_property(
       PropertyNames.OUTPUT_INFO,
       [{PropertyNames.USER_NAME: (
           '%s.%s' % (transform_node.full_label, PropertyNames.OUT)),
         PropertyNames.ENCODING: step.encoding,
         PropertyNames.OUTPUT_NAME: PropertyNames.OUT}])
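As the comment notes, the elements are base64-encoded so the service will accept them: pickled bytes are not text-safe. A plain-stdlib illustration of the encoding applied to each element:

    import base64
    import pickle

    payload = pickle.dumps({'any': 'picklable value'})
    wire = base64.b64encode(payload)  # ASCII-safe bytes for the job payload
    assert pickle.loads(base64.b64decode(wire)) == {'any': 'picklable value'}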
Example No. 11
    class SolveDoFn(beam.DoFn):
        PREV_TIMESTAMP = BagStateSpec(name="timestamp_state", coder=coders.PickleCoder())
        PREV_ELEMENTS = BagStateSpec(name="elements_state", coder=coders.PickleCoder())
        PREV_MODEL = BagStateSpec(name="model_state", coder=coders.PickleCoder())
        PREV_SAMPLESET = BagStateSpec(name="sampleset_state", coder=coders.PickleCoder())

        def process(
            self,
            value,
            timestamp=beam.DoFn.TimestampParam,
            timestamp_state=beam.DoFn.StateParam(PREV_TIMESTAMP),
            elements_state=beam.DoFn.StateParam(PREV_ELEMENTS),
            model_state=beam.DoFn.StateParam(PREV_MODEL),
            sampleset_state=beam.DoFn.StateParam(PREV_SAMPLESET),
            algorithm=None,
            algorithm_options=None,
            map_fn=None,
            solve_fn=None,
            unmap_fn=None,
            solver=LocalSolver(exact=False),  # default solver
            initial_mtype=sawatabi.constants.MODEL_ISING,
        ):
            _, elements = value

            # Sort by event time.
            # When sorting a list of tuples, the first element of each tuple is
            # used as the sort key by default, so plain `sorted` is enough.
            sorted_elements = sorted(elements)

            # Materialize the state generators into lists.
            timestamp_state_as_list = list(timestamp_state.read())
            elements_state_as_list = list(elements_state.read())
            model_state_as_list = list(model_state.read())
            sampleset_state_as_list = list(sampleset_state.read())

            # Extract the previous timestamp, elements, model, and sampleset from state.
            if len(timestamp_state_as_list) == 0:
                prev_timestamp = -1.0
            else:
                prev_timestamp = timestamp_state_as_list[-1]
            if len(elements_state_as_list) == 0:
                prev_elements = []
            else:
                prev_elements = elements_state_as_list[-1]
            if len(model_state_as_list) == 0:
                prev_model = sawatabi.model.LogicalModel(mtype=initial_mtype)
            else:
                prev_model = model_state_as_list[-1]
            if len(sampleset_state_as_list) == 0:
                prev_sampleset = None
            else:
                prev_sampleset = sampleset_state_as_list[-1]

            # Sometimes, when we use the sliding window algorithm on bounded
            # data (such as a local file), we may receive an outdated event
            # whose timestamp is older than that of a previously processed event.
            if float(timestamp) < float(prev_timestamp):
                yield (
                    f"The received event is outdated: Timestamp is {timestamp.to_utc_datetime()}, "
                    + f"while an event with timestamp of {prev_timestamp.to_utc_datetime()} has already been processed."
                )
                return

            # Algorithm specific operations
            # Incremental: Append current window into the all previous data.
            if algorithm == sawatabi.constants.ALGORITHM_INCREMENTAL:
                sorted_elements.extend(prev_elements)
                sorted_elements = sorted(sorted_elements)
            # Partial: Merge current window with the specified data.
            elif algorithm == sawatabi.constants.ALGORITHM_PARTIAL:
                filter_fn = algorithm_options["filter_fn"]
                filtered = filter(filter_fn, prev_elements)
                sorted_elements = list(filtered) + sorted_elements
                sorted_elements = sorted(sorted_elements)

            # Resolve outgoing elements in this iteration
            def resolve_outgoing(prev_elements, sorted_elements):
                outgoing = []
                for p in prev_elements:
                    if p[0] >= sorted_elements[0][0]:
                        break
                    outgoing.append(p)
                return outgoing

            outgoing = resolve_outgoing(prev_elements, sorted_elements)

            # Resolve incoming elements in this iteration
            def resolve_incoming(prev_elements, sorted_elements):
                incoming = []
                if len(prev_elements) == 0:
                    incoming = sorted_elements
                else:
                    for v in reversed(sorted_elements):
                        if v[0] <= prev_elements[-1][0]:
                            break
                        incoming.insert(0, v)
                return incoming

            incoming = resolve_incoming(prev_elements, sorted_elements)

            # Clear the BagStates so they hold only the latest values, then
            # register the new timestamp and elements.
            timestamp_state.clear()
            timestamp_state.add(timestamp)
            elements_state.clear()
            elements_state.add(sorted_elements)

            # Map problem input to the model
            try:
                model = map_fn(prev_model, prev_sampleset, sorted_elements, incoming, outgoing)
            except Exception as e:
                yield f"Failed to map: {e}\n{traceback.format_exc()}"
                return

            # Clear the BagState so it holds only the latest value, then
            # register the new model.
            model_state.clear()
            model_state.add(model)

            # Algorithm specific operations
            # Attenuation: Update scale based on data timestamp.
            if algorithm == sawatabi.constants.ALGORITHM_ATTENUATION:
                model.to_physical()  # Resolve removed interactions. TODO: Deal with placeholders.
                ref_timestamp = model._interactions_array[algorithm_options["attenuation.key"]]
                min_ts = min(ref_timestamp)
                max_ts = max(ref_timestamp)
                min_scale = algorithm_options["attenuation.min_scale"]
                if min_ts < max_ts:
                    for i, t in enumerate(ref_timestamp):
                        new_scale = (1.0 - min_scale) / (max_ts - min_ts) * (t - min_ts) + min_scale
                        model._interactions_array["scale"][i] = new_scale

            # Solve and unmap to the solution
            try:
                sampleset = solve_fn(solver, model, prev_sampleset, sorted_elements, incoming, outgoing)
            except Exception as e:
                yield f"Failed to solve: {e}\n{traceback.format_exc()}"
                return

            # Clear the BagState so it holds only the latest value, then
            # register the new sampleset.
            sampleset_state.clear()
            sampleset_state.add(sampleset)

            try:
                yield unmap_fn(sampleset, sorted_elements, incoming, outgoing)
            except Exception as e:
                yield f"Failed to unmap: {e}\n{traceback.format_exc()}"
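A worked example of the window-delta helpers above, assuming elements are (timestamp, data) tuples sorted by timestamp:

    prev = [(1, 'a'), (2, 'b'), (3, 'c')]   # previous window
    curr = [(2, 'b'), (3, 'c'), (4, 'd')]   # current window
    # resolve_outgoing(prev, curr) -> [(1, 'a')]  (fell out of the window)
    # resolve_incoming(prev, curr) -> [(4, 'd')]  (newly arrived)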
Example No. 12
 def test_equality(self):
     self.assertEqual(coders.PickleCoder(), coders.PickleCoder())
     self.assertEqual(coders.Base64PickleCoder(),
                      coders.Base64PickleCoder())
     self.assertNotEqual(coders.Base64PickleCoder(), coders.PickleCoder())
     self.assertNotEqual(coders.Base64PickleCoder(), object())