Example #1
    def deploy(self):
        queue = [self]

        # Once the DAG is set, mark the final operator as final. This helps
        # with any type conversions from tuples to Table (right now, only used
        # for the MultiOperator).
        num_ends = 0
        seen = set()
        while len(queue) > 0:
            op = queue.pop(0)

            if len(op.downstreams) == 0 and op.fn_name not in seen:
                num_ends += 1
                op.final = True

                seen.add(op.fn_name)

            queue.extend(op.downstreams)

        if num_ends > 1:
            raise FlowError('You must converge all of your operators to a' +
                            ' single output.')

        functions, connections, gpus, batching = [], [], [], []

        for downstream in self.downstreams:
            fns, conns, args, _, registered, ds_gpus, ds_batching = \
                downstream.deploy(self.cloudburst)
            functions += fns
            connections += conns
            gpus += ds_gpus
            batching += ds_batching
            self.registered = registered

        if len(functions) > 0:
            uid = str(len(registered))
            name = self.flowname + '-' + uid
            registered[name] = args

            success, error = self.cloudburst.register_dag(
                name,
                functions,
                connections,
                colocated=self.colocates,
                gpu_functions=gpus,
                batching_functions=batching)

            if not success:
                raise FlowError(str(error))

        self.deployed = True
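
Before registering anything, deploy() walks the operator DAG breadth-first and insists on a single sink. Below is a minimal standalone sketch of that check; the Node class and its fields are hypothetical stand-ins for Flow operators, not the library's own types.

class Node:
    def __init__(self, name):
        self.name = name
        self.downstreams = []

def count_sinks(root):
    # Breadth-first walk; count distinct operators with no downstreams.
    queue, seen, sinks = [root], set(), 0
    while queue:
        op = queue.pop(0)
        if not op.downstreams and op.name not in seen:
            sinks += 1
            seen.add(op.name)
        queue.extend(op.downstreams)
    return sinks

a, b, c = Node('a'), Node('b'), Node('c')
a.downstreams = [b, c]       # two unmerged branches
assert count_sinks(a) == 2   # deploy() would raise FlowError here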
Example #2
    def __setitem__(self, key, value):
        if key == Row.qid_key:
            raise FlowError('Cannot modify query ID of a row.')

        for name, tp in self.schema:
            if key == name:
                if not isinstance(value, tp.typ):
                    raise FlowError(
                        f'Invalid update to {key}: Does not match' +
                        f' type {tp}.')
                else:
                    break

        self.data[key] = value
Example #3
    def _check(self, vals: List[Any]):
        if len(vals) != len(self.schema):
            raise FlowError(f'Expected {len(self.schema)} vals but found' +
                            f' {len(vals)}')

        for idx, val in enumerate(vals):
            name, typ = self.schema[idx]

            # None values are okay because they are NULLs.
            if val is not None and not isinstance(val, typ.typ):
                raise FlowError(f'Expected type {typ.typ} but instead found' +
                                f' {type(val)} for column {name}')

        return True
Example #4
            def average(self, table, column):
                coltp = type(next(table.get())[column])
                if coltp != int and coltp != float:
                    raise FlowError('Cannot apply aggregate to non-numerical' +
                                    ' field.')

                sm = 0.0
                for row in table.get():
                    sm += row[column]

                return sm / table.size()
Example #5
    def insert(self, vals: List[Any], qid: int = None) -> bool:
        if isinstance(vals, Row):
            if vals.schema == self.schema:
                self.data.append(vals)
                return True

            raise FlowError(f'Invalid row insertion: {vals}')

        if not isinstance(vals, list):
            raise FlowError('Unrecognized type: ' + str(vals) +
                            '\n\nCan only insert a Row or a list.')

        if not self._check(vals):
            return False

        if qid is None:
            self.data.append(Row(self.schema, vals, self.qid_count))
            self.qid_count += 1
        else:
            self.data.append(Row(self.schema, vals, qid))

        return True
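
Examples 2, 3 and 5 all enforce the same rule: every value must match the Python type declared in the (name, type) schema, with None accepted as a NULL. Below is a minimal standalone sketch of that check, using bare Python types instead of the library's wrapped type objects; the schema and values are hypothetical.

schema = [('user', str), ('score', float)]

def check(schema, vals):
    # Mirrors the _check pattern above: arity first, then per-column type
    # (None is treated as a NULL and always passes).
    if len(vals) != len(schema):
        raise ValueError(f'Expected {len(schema)} vals but found {len(vals)}')
    for (name, typ), val in zip(schema, vals):
        if val is not None and not isinstance(val, typ):
            raise ValueError(f'Expected {typ} but found {type(val)} for column {name}')
    return True

assert check(schema, ['alice', 3.5])
assert check(schema, ['bob', None])   # NULLs pass the check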
Example #6
            def max(self, table, column):
                coltp = type(next(table.get())[column])
                if coltp != int and coltp != float:
                    raise FlowError('Cannot apply aggregate to non-numerical' +
                                    ' field.')

                mx = None
                for row in table.get():
                    if mx is None:
                        mx = row[column]

                    if row[column] > mx:
                        mx = row[column]

                return mx
Example #7
def merge_tables(tables: List[Table]) -> (Table, Dict[int, int]):
    schema = tables[0].schema
    for other in tables[1:]:
        if other.schema != schema:
            raise FlowError(f'Schema mismatch:\n\t{schema}\n\t{other.schema}')

    mappings = {}
    mappings['NUM_TABLES'] = len(tables)
    result = Table(schema)

    qid = 0
    for idx, table in enumerate(tables):
        for row in table.get():
            # Convert to a list, so it gets assigned a new qid.
            vals = []
            for val, _ in schema:
                vals.append(row[val])

            result.insert(vals, qid)
            mappings[qid] = idx
            qid += 1

    return result, mappings
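
The mappings returned here are what later let a batched operator route results back to the request they came from: every merged row gets a fresh sequential qid, and the mapping remembers which input table that qid belongs to. Below is a minimal standalone sketch of the round trip, with plain lists standing in for Table objects and the demux step approximating what demux_tables would do.

tables = [['a', 'b'], ['c'], ['d', 'e']]   # hypothetical per-request inputs

merged, mappings = [], {'NUM_TABLES': len(tables)}
qid = 0
for idx, table in enumerate(tables):
    for row in table:
        merged.append((qid, row))   # fresh qid per merged row
        mappings[qid] = idx         # remember the source table
        qid += 1

# Demux: rebuild one output per original input.
demuxed = [[] for _ in range(mappings['NUM_TABLES'])]
for qid, row in merged:
    demuxed[mappings[qid]].append(row)

assert demuxed == tables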
Example #8
    def __init__(self, flowname: str, aggregate: str, column: str, sink: list):
        if aggregate not in AGGREGATES:
            raise FlowError(f'Unknown aggregate: {aggregate}')
        if aggregate != 'count' and column is None:
            raise FlowError(f'For non-count aggregates, column must be' +
                            ' specified.')

        self.__name__ = 'AggregateOperator'

        def x(x: int):
            x + 1  # XXX: This is a hack for registration.

        self.fn = x
        self._setup(flowname)

        self.aggregate = aggregate
        self.column = column

        self.sink = sink

        class AggregateLogic:
            def __init__(self, cloudburst):
                self.preprocess(cloudburst)

            def preprocess(self, _):
                pass

            def run(self, cloudburst, aggregate, column, inp):
                serialized = False
                if type(inp) == bytes:
                    serialized = True
                    inp = deserialize(inp)

                if aggregate == 'count':
                    aggfn = self.count
                if aggregate == 'min':
                    aggfn = self.min
                if aggregate == 'max':
                    aggfn = self.max
                if aggregate == 'sum':
                    aggfn = self.sum
                if aggregate == 'average':
                    aggfn = self.average

                if isinstance(inp, GroupbyTable):
                    gb_col = inp.col
                    val, _ = next(inp.get())
                    gb_typ = get_type(type(val))

                    result = Table([(gb_col, gb_typ), (aggregate, FloatType)])

                    for val, tbl in inp.get():
                        agg = aggfn(tbl, column)
                        result.insert([val, float(agg)])
                else:
                    result = Table([(aggregate, FloatType)])
                    result.insert([float(aggfn(inp, column))])

                if serialized:
                    result = serialize(result)

                return result

            def count(self, table, column):
                return table.size()

            def min(self, table, column):
                coltp = type(next(table.get())[column])
                if coltp != int and coltp != float:
                    raise FlowError('Cannot apply aggregate to non-numerical' +
                                    ' field.')

                mn = None
                for row in table.get():
                    if mn is None:
                        mn = row[column]

                    if row[column] < mn:
                        mn = row[column]

                return mn

            def max(self, table, column):
                coltp = type(next(table.get())[column])
                if coltp != int and coltp != float:
                    raise FlowError('Cannot apply aggregate to non-numerical' +
                                    ' field.')

                mx = None
                for row in table.get():
                    if mx is None:
                        mx = row[column]

                    if row[column] > mx:
                        mx = row[column]

                return mx

            def sum(self, table, column):
                coltp = type(next(table.get())[column])
                if coltp != int and coltp != float:
                    raise FlowError('Cannot apply aggregate to non-numerical' +
                                    ' field.')

                sm = 0.0
                for row in table.get():
                    sm += row[column]

                return sm

            def average(self, table, column):
                coltp = type(next(table.get())[column])
                if coltp != int and coltp != float:
                    raise FlowError('Cannot apply aggregate to non-numerical' +
                                    ' field.')

                sm = 0.0
                for row in table.get():
                    sm += row[column]

                return sm / table.size()

        self.logic = AggregateLogic
        self.exec_args = (self.aggregate, self.column)
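
For grouped input, run() applies the chosen aggregate once per group and emits a two-column result of (group value, float aggregate). Below is a minimal standalone sketch of that path, with a plain dict standing in for GroupbyTable and a list of tuples for the result Table; the group names and values are hypothetical.

groups = {'x': [1, 2, 3], 'y': [10, 20]}

def average(vals):
    return sum(vals) / len(vals)

# Dispatch by aggregate name, as run() does with its if-chain.
aggfn = {'count': len, 'min': min, 'max': max, 'sum': sum, 'average': average}['average']

result = [(group, float(aggfn(vals))) for group, vals in groups.items()]
assert result == [('x', 2.0), ('y', 15.0)]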
Example #9
def optimize(flow, rules: dict = DEFAULT_RULES):
    for key in DEFAULT_RULES:
        if key not in rules:
            rules[key] = False

    if rules['colocate'] and rules['breakpoint']:
        raise FlowError('Cannot enable the colocate and breakpoint rules' +
                        ' together.')

    optimized = Flow(flow.flowname, flow.typ, flow.cloudburst, flow.source)

    if rules['whole']:
        cloned = optimize(
            flow, {
                'fusion': False,
                'compete': False,
                'compete_replicas': 1,
                'colocate': False,
                'breakpoint': False,
                'whole': False
            })

        cloned.cloudburst = None  # Remove sockets to serialize and send flow.
        queue = [cloned]
        gpu = False
        batching = []
        while len(queue) > 0:
            op = queue.pop(0)
            op.cb_fn = None

            if type(op) != Flow:
                batching.append(op.batching)
            gpu = op.gpu if not gpu else gpu
            queue.extend(op.downstreams)

        if all(batching):
            cloned.batching = True

        optimized.multi([cloned], whole=True)
        multi_op = optimized.downstreams[0]
        multi_op.batching = all(batching)
        multi_op.gpu = gpu

        if gpu:
            multi_op.fn_name += '-gpu'

        return optimized

    ### OPERATOR FUSION ###
    queue = []
    join_tracker = {}
    processed = set()

    for ds in flow.downstreams:
        queue.append((ds, optimized))

    # NOTE: We clone the whole flow regardless. If fusion is turned on,
    # then we will fuse operators, and otherwise, we simply find chains,
    # throw them away, and add operators to the optimized flow.
    while len(queue) > 0:
        op, upstream = queue.pop(0)

        if op.fn_name in processed:
            continue

        chain = find_chain(op)

        if len(chain) == 0 or not rules['fusion']:
            downstreams = op.downstreams
            processed.add(op.fn_name)

            if type(op) == MapOperator:
                marker = upstream.map(op.fn, op.col, op.names,
                                      op.logic.preprocess, op.high_variance,
                                      op.gpu, op.batching, op.multi)
            if type(op) == FilterOperator:
                marker = upstream.filter(op.fn, op.group, op.logic.preprocess)
            if type(op) == GroupbyOperator:
                marker = upstream.groupby(op.groupby_key, op.logic.preprocess)
            if type(op) == CombineOperator:
                marker = upstream.combine()
            if type(op) == LookupOperator:
                # Merge lookup operators with their successors.
                downstreams = []
                for ds in op.downstreams:
                    if isinstance(ds, MultiOperator):
                        ops = [op] + ds.ops
                    else:
                        ops = [op, ds]
                    marker = upstream.multi(ops)

                    for next_ds in ds.downstreams:
                        queue.append((next_ds, marker))
            if type(op) == AggOperator:
                marker = upstream.agg(op.aggregate, op.column)
            if type(op) == MultiOperator:
                # This will only happen in the case where the previous operator
                # was a LookupHelperOperator combined with something else.
                marker = upstream.multi(op.ops)
            if type(op) == JoinOperator:
                if op.fn_name not in join_tracker:
                    join_tracker[op.fn_name] = upstream
                    downstreams = []
                    processed.discard(op.fn_name)
                else:
                    other = join_tracker[op.fn_name]
                    marker = other.join(upstream, op.on, op.how,
                                        op.logic.preprocess)
        else:
            marker = upstream.multi(chain)
            downstreams = chain[-1].downstreams

            for op in chain:
                # Set the multi operator to have various properties.
                if op.high_variance:
                    optimized.operators[marker.position].high_variance = True
                if op.gpu:
                    optimized.operators[marker.position].gpu = True

                    # Hack for autoscaling...
                    optimized.operators[marker.position].fn_name += '-gpu'
                if op.batching:
                    optimized.operators[marker.position].batching = True

            if optimized.operators[marker.position].batching:
                for old in chain:
                    if not old.batching:
                        print('Cannot create a fused operator with' +
                              ' batching enabled if all operators do' +
                              ' not batch.')
                        optimized.operators[marker.position].batching = False

        for ds in downstreams:
            queue.append((ds, marker))

    ### LOCALITY BREAKPOINTS ###
    if rules['breakpoint']:
        queue = [optimized]
        processed = set()

        while len(queue) > 0:
            op = queue.pop(0)

            if op.fn_name in processed:
                continue

            # We only set breakpoints if we are in a linear chain portion of the
            # flow. This will only be true if there is only one operator in the
            # queue at a time. After pop, the length should be 0 until we add this
            # op's downstreams.
            if len(queue) == 0:
                if isinstance(op, LookupOperator):
                    op.breakpoint = True
                if isinstance(op, MultiOperator):
                    for sub in op.ops:
                        if isinstance(sub, LookupOperator):
                            op.breakpoint = True

            processed.add(op.fn_name)
            queue.extend(op.downstreams)

    ### COMPETITIVE EXECUTION ###
    if rules['compete']:
        new_ops = []
        for operator in optimized.operators.values():
            if operator.high_variance:
                for downstream in operator.downstreams:
                    if len(downstream.upstreams) > 1:
                        raise RuntimeError("Cannot have a competitive" +
                                           " execution map feed into an " +
                                           "operator with multiple upstreams.")
                    downstream.multi_exec = True

                for _ in range(rules['compete_replicas']):
                    # Create a new operator that is an exact replica.
                    if isinstance(operator, MapOperator):
                        new_op = MapOperator(operator.fn, operator.fntype,
                                             operator.flowname, operator.col,
                                             operator.names,
                                             operator.logic.preprocess,
                                             operator.high_variance,
                                             operator.gpu, operator.batching,
                                             operator.multi, optimized.sink)

                    if isinstance(operator, MultiOperator):
                        new_op = MultiOperator(operator.ops, operator.flowname,
                                               optimized.sink)

                    # Hook it into the DAG by updating all up/downstreams.
                    new_op.downstreams = list(operator.downstreams)
                    new_op.upstreams = list(operator.upstreams)

                    for op in new_op.downstreams:
                        op.upstreams.append(new_op)

                    for op in new_op.upstreams:
                        op.downstreams.append(new_op)

                    new_ops.append(new_op)
        for new_op in new_ops:
            optimized.operators[str(uuid.uuid4())] = new_op

    if rules['colocate']:
        curr_op = optimized

        while len(curr_op.downstreams) > 0:
            if len(curr_op.downstreams) == 1:
                curr_op = curr_op.downstreams[0]
            else:  # We only support one colocation for now.
                if not curr_op.supports_broadcast:
                    raise RuntimeError('Unsupported broadcast attempt.')

                colocates = list(
                    map(lambda op: op.fn_name, curr_op.downstreams))
                optimized.colocates = colocates

                for op in curr_op.downstreams:
                    if not op.supports_broadcast:
                        raise RuntimeError('Unsupported broadcast attempt.')
                    args = list(op.init_args)
                    args[1] = True  # Receive broadcast.
                    op.init_args = tuple(args)

                args = list(curr_op.init_args)
                args[0] = True  # Send broadcast.
                curr_op.init_args = tuple(args)
                break

    return optimized
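
A hypothetical invocation sketch: the loop at the top of optimize() fills in False for any rule that is omitted, so a caller only needs to name the rules it wants, keeping in mind that 'colocate' and 'breakpoint' cannot be enabled together. The keys below are the ones this function reads; the flow object itself is assumed to exist already.

rules = {
    'fusion': True,       # fuse linear chains into a single MultiOperator
    'breakpoint': True,   # add locality breakpoints at lookup operators
    'compete': False,     # no competitive replicas
    'compete_replicas': 1,
    'colocate': False,    # mutually exclusive with 'breakpoint'
    'whole': False,       # do not collapse the whole flow into one operator
}
# optimized = optimize(flow, rules)
# optimized.deploy()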
Example #10
    def extend(self, other, marker: FlowMarker = None):
        assert isinstance(other, Flow)

        # Check the current flow to make sure it has only one final operator.
        queue = [self]
        seen = set()
        num_ends = 0
        sinks = []
        join_tracker = {}

        while len(queue) > 0:
            op = queue.pop(0)
            if op.fn_name not in seen and len(op.downstreams) == 0:
                num_ends += 1

            seen.add(op.fn_name)
            queue.extend(op.downstreams)

        # If there are multiple operators with 0 downstreams, raise an error.
        if num_ends > 1:
            raise FlowError(
                'Cannot extend a flow when there are multiple'
                ' unmerged operators. Flow can only have one sink.')

        # Iterate over everything in the other flow and clone its operators
        # onto the end of this one.
        queue = list(other.downstreams)
        upstream_map = {}
        seen = set()

        for op in other.downstreams:
            upstream_map[op.fn_name] = marker if marker else self

        while len(queue) > 0:
            op = queue.pop(0)

            if op.fn_name in seen:
                continue

            seen.add(op.fn_name)
            next_marker = upstream_map[op.fn_name]

            if type(op) == MapOperator:
                marker = next_marker.map(op.fn, op.col, op.names,
                                         op.logic.preprocess, op.high_variance,
                                         op.gpu, op.batching, op.multi)
            if type(op) == FilterOperator:
                marker = next_marker.filter(op.fn, op.group,
                                            op.logic.preprocess)
            if type(op) == GroupbyOperator:
                marker = next_marker.groupby(op.groupby_key,
                                             op.logic.preprocess)
            if type(op) == CombineOperator:
                marker = next_marker.combine()
            if type(op) == LookupOperator:
                marker = next_marker.lookup(op.lookup_key, op.dynamic, op.dummy)
            if type(op) == AggOperator:
                marker = next_marker.agg(op.aggregate, op.column)
            if type(op) == MultiOperator:
                # This will only happen in the case where the previous operator
                # was a LookupHelperOperator combined with something else.
                marker = next_marker.multi(op.ops)
            if type(op) == JoinOperator:
                if op.fn_name not in join_tracker:
                    # Record the first upstream and wait for the second one.
                    join_tracker[op.fn_name] = next_marker
                    seen.discard(op.fn_name)
                else:
                    other_upstream = join_tracker[op.fn_name]
                    marker = other_upstream.join(next_marker, op.on, op.how,
                                                 op.logic.preprocess)
            for ds in op.downstreams:
                upstream_map[ds.fn_name] = marker

            queue.extend(op.downstreams)
Example #11
    def deploy(self, cloudburst):
        if self.deployed:
            return [], [], {}, [self.fn_name], {}, [], []

        if not self.registered:
            self.register(cloudburst)

        functions, connections, gpus, batching = [], [], [], []
        # NOTE: What if we need something other than None here? Will need an
        # arg at some point, maybe. This is for multi-execution pick-first
        # result back functions.
        if self.multi_exec:
            functions.append((self.fn_name, [None]))
        else:
            functions.append(self.fn_name)

        if self.gpu:
            gpus.append(self.fn_name)
        if self.batching:
            batching.append(self.fn_name)

        arg_map = {}
        arg_map[self.fn_name] = list(self.get_exec_args())

        registered = {}
        for ds in self.downstreams:
            fn_names, ds_conns, fn_args, starts, ds_registered, ds_gpus, \
                ds_batching = \
                ds.deploy(cloudburst)

            for fn_name in starts:
                connections.append((self.fn_name, fn_name))

            functions += fn_names
            connections += ds_conns
            gpus += ds_gpus
            batching += ds_batching
            arg_map.update(fn_args)
            registered.update(ds_registered)

        self.deployed = True

        # We can number these linearly because the optimization algorithm
        # guarantees that breakpoints will only be at linear points in the
        # flow.
        if self.breakpoint:
            uid = str(len(registered))
            name = self.flowname + '-' + uid
            success, error = cloudburst.register_dag(
                name,
                functions,
                connections,
                gpu_functions=gpus,
                batching_functions=batching)

            if not success:
                raise FlowError(str(error))

            registered[name] = arg_map
            return [], [], {}, [], registered, [], []
        else:
            return functions, connections, arg_map, [self.fn_name], \
                registered, gpus, batching
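
Each operator contributes one (self, start) edge per start function its downstreams report, then folds in whatever functions and edges those downstreams collected. Below is a minimal standalone sketch of that accumulation over a linear chain; it ignores breakpoints, GPU/batching lists, and argument maps, and the operator names are hypothetical.

def collect(name, downstreams):
    # Accumulate function names and (upstream, downstream) edges recursively,
    # the way deploy() folds in each downstream's functions and connections.
    functions, connections = [name], []
    for ds_name, ds_children in downstreams:
        ds_fns, ds_conns = collect(ds_name, ds_children)
        connections.append((name, ds_name))
        functions += ds_fns
        connections += ds_conns
    return functions, connections

fns, conns = collect('source', [('map', [('agg', [])])])
assert fns == ['source', 'map', 'agg']
assert conns == [('source', 'map'), ('map', 'agg')]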
Example #12
    def __init__(self,
                 fn: Callable,
                 fntype: FunctionType,
                 flowname: str,
                 col: str,
                 names: List[str],
                 init: Callable,
                 high_variance: bool,
                 gpu: bool,
                 batching: bool,
                 multi: bool,
                 sink: list):
        self.__name__ = 'MapOperator'
        self.fn = fn
        self._setup(flowname)

        self.fntype = fntype
        self.sink = sink
        self.col = col
        self.names = names
        self.high_variance = high_variance
        self.gpu = gpu
        self.batching = batching
        self.multi = multi

        self.supports_broadcast = True

        if col is not None: # Map over column not over whole row.
            if len(names) >= 2: # We can only rename one column here
                raise FlowError('Map over a column cannot rename multiple'
                                + ' columns.')
            if gpu:
                raise FlowError('You cannot execute a column map on a GPU.')
        else:
            if len(names) != 0 and len(names) != len(fntype.ret):
                raise FlowError('Map over row must have same number of columns'
                                + ' as function outputs.')

        class MapLogic:
            def __init__(self, cloudburst, send_broadcast, recv_broadcast,
                         batching, multi):
                # We pass in None in local mode because we expect the
                # Cloudburst user library in the other case.
                self.send_broadcast = send_broadcast
                self.recv_broadcast = recv_broadcast
                self.batching = batching
                self.multi = multi

                self.preprocess(cloudburst)

            def preprocess(self, _):
                pass

            def run(self, cloudburst, fn, fntype, col, names, inp):
                # Merge all of the tables.
                serialized = False
                batching = self.batching and isinstance(inp, list)
                if batching:
                    if type(inp[0]) == bytes:
                        inp = [deserialize(tbl) for tbl in inp]
                        serialized = True

                    # inp will be a list of Tables. If it is not, this is part of
                    # a MultiOperator, and everything is taken care of for us.
                    merged, mappings = merge_tables(inp)
                    inp = merged

                    # This will all be repeated because of the way Cloudburst's
                    # batching works, so we just pick the first one. But we
                    # check because even with batching enabled, in a multi
                    # operator, we will not have to deal with this.
                    if type(fn) == list:
                        fn = fn[0]
                    if type(fntype) == list:
                        fntype = fntype[0]
                    if type(col) == list:
                        col = col[0]
                    if type(names) == list and type(names[0]) == list:
                        names = names[0]
                else:
                    if type(inp) == bytes:
                        inp = deserialize(inp)
                        serialized = True

                schema = []
                if col is None:
                    if len(names) != 0:
                        schema = list(zip(names, fntype.ret))
                    else:
                        for i in range(len(fntype.ret)):
                            schema.append((str(i), fntype.ret[i]))
                else:
                    for name, tp in inp.schema:
                        if name != col:
                            schema.append((name, tp))
                        else:
                            if len(names) != 0:
                                schema.append((names[0], fntype.ret[0]))
                            else:
                                schema.append((name, fntype.ret[0]))

                if isinstance(inp, GroupbyTable):
                    result = GroupbyTable(schema, inp.col)
                    for group, gtable in inp.get():
                        result.add_group(group, self.run(cloudburst, fn, fntype,
                                                         col, names, gtable))
                else:
                    result = Table(schema)

                    if self.batching or self.multi:
                        res = fn(self, inp)
                        for val in res:
                            if type(val) == tuple:
                                val = list(val)
                            elif type(val) != list:
                                val = [val]

                            result.insert(val)
                    else:
                        for row in inp.get():
                            if col is None:
                                vals = fn(self, row)
                                if type(vals) == tuple:
                                    vals = list(vals)
                                elif type(vals) != list:
                                    vals = [vals]

                                result.insert(vals, row[Row.qid_key])
                            else:
                                val = fn(self, row[col])
                                new_vals = []
                                for name, _ in inp.schema:
                                    if name == col:
                                        new_vals.append(val)
                                    else:
                                        new_vals.append(row[name])

                                result.insert(new_vals, row[Row.qid_key])

                if batching: # Unmerge all the tables.
                    tables = demux_tables(result, mappings)
                    result = tables

                    if serialized:
                        result = [serialize(tbl) for tbl in result]
                else:
                    if serialized:
                        result = serialize(result)

                if self.send_broadcast:
                    import uuid
                    uid = str(uuid.uuid4())
                    cloudburst.put(uid, result)
                    result = uid

                return result

        self.logic = MapLogic
        if init is not None:
            self.logic.preprocess = init
        self.exec_args = (self.fn, self.fntype, self.col, self.names)
        self.init_args = (False, False, self.batching, self.multi)
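
The schema handling in run() above comes down to two cases: a row map replaces the whole schema with the function's declared outputs, while a column map keeps every column and only renames or retypes the mapped one. Below is a minimal standalone sketch of that logic, with plain (name, type) tuples standing in for the typed schema objects; the function name and example columns are hypothetical.

def map_schema(in_schema, ret_types, col=None, names=()):
    if col is None:                      # map over the whole row
        out_names = names or [str(i) for i in range(len(ret_types))]
        return list(zip(out_names, ret_types))
    out = []                             # map over a single column
    for name, tp in in_schema:
        if name == col:
            out.append((names[0] if names else name, ret_types[0]))
        else:
            out.append((name, tp))
    return out

schema = [('user', str), ('score', int)]
assert map_schema(schema, [float], col='score') == [('user', str), ('score', float)]
assert map_schema(schema, [str, float], names=['id', 'rank']) == [('id', str), ('rank', float)]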