class Meta(object):
    """
    Stores metadata about labeled and intermediary datasets, including:

    - Dataset label
    - Operations list to apply
    - Buffer count

    It is also used for dataset removal.
    """
    def __init__(self, cache):
        self.label = "okmeta"
        self.cache = cache
        self.logger = Logger(self.label)
        self.profiler = Profiler()

    def register(self, dsLabel, obj):
        self.logger.debug("Registering '%s'" % dsLabel)
        self.cache.hset(self.label, dsLabel, pickle_dumps(obj))

    def get(self, dsLabel):
        self.logger.debug("Getting '%s'" % dsLabel)
        return pickle.loads(self.cache.get(self.label, dsLabel))

    def createIntermediary(self, ds):
        """
        Create an intermediary dataset label for ds by incrementing
        a per-dataset counter in the cache.
        """
        prefix = "%s_intermediary_" % ds.label
        timer = Timer()
        self.currentDsLabel = prefix + str(self.cache.incr(prefix))
        self.logger.debug("Creating intermediary '%s'" % self.currentDsLabel)
        self.profiler.add("masterCache", timer.since())
        return self.currentDsLabel

    def remove(self, dsLabel):
        self.logger.debug("Removing '%s'" % dsLabel)
        self.cache.hdel(self.label, dsLabel)

        for k in self.cache.keys(dsLabel + "*"):
            self.cache.delete(k)

    def rename(self, label, newLabel):
        self.cache.hset(self.label, newLabel, self.cache.get(self.label, label))
        # Drop the old entry so the rename does not leave a stale label behind
        self.cache.hdel(self.label, label)
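# Hedged usage sketch of Meta: the payload shape mirrors what Master.compute
# registers for an intermediary; the cache client itself is assumed to already
# exist and the label is illustrative.
meta = Meta(cache)
meta.register("big list", { "opsList": [], "buffers": 4, "isIntermediary": False })
info = meta.get("big list")  # round-trips through pickle
meta.remove("big list")      # drops the hash entry plus any "big list*" data keys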
#!/usr/bin/env python

from okdataset.context import Context
from okdataset.logger import Logger

logger = Logger("map from existing example")

context = Context()

logger.info("Building dataset from existing")
ds = context.dataSet(label="big list", fromExisting=True)

logger.info("Calling map")
ds.map(lambda x: x * 2)
ds.map(lambda x: x + 3)
ds.compute()

logger.info("All done!")
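# This example assumes a dataset was persisted under the label "big list" by an
# earlier run; a minimal sketch of seeding it (values are illustrative):
#
#     from okdataset.clist import ChainableList
#     seed = Context().dataSet(ChainableList(range(1000)), label="big list")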
class DataSet(ChainableList):
    def __init__(self, cache, config, clist=None, label=None, fromExisting=False, bufferSize=None):
        self.cache = cache
        self.config = config
        self.meta = Meta(self.cache)
        self.label = label if label else "okds_%s" % uuid.uuid1()
        self.opsList = []
        self.profiler = Profiler()

        if clist is None and not fromExisting:
            raise ValueError("Must provide either clist or fromExisting")

        if clist is not None and fromExisting:
            raise ValueError("Cannot provide both clist and fromExisting")

        if fromExisting and bufferSize is not None:
            raise ValueError("Cannot specify bufferSize for existing dataset")

        if fromExisting and label is None:
            raise ValueError("Must specify label for existing dataset")

        if clist is not None:
            self.dsLen = len(clist)

        if bufferSize is not None:
            self.bufferSize = bufferSize
        else:
            self.bufferSize = self.config["cache"]["io"]["bufferSize"]

        self.logger = Logger("dataset '" + self.label + "'")

        """
        Store the current working dataset label.  This will change as new
        intermediary datasets are created.
        """
        self.currentDsLabel = self.label
        self.currentIsIntermediary = False

        if fromExisting:
            self.dsLen = self.cache.len(self.label)
        else:
            # Remove any existing data stored under this label
            self.logger.debug("Removing existing")
            self.meta.remove(self.label)

            # Total number of buffers, rounding up for a partial final buffer
            # (e.g. dsLen=10, bufferSize=3 gives 4 buffers)
            self.buffers = self.dsLen / self.bufferSize
            self.buffers = self.buffers + 1 if self.dsLen % self.bufferSize > 0 else self.buffers

            for i in xrange(0, self.buffers):
                start = self.bufferSize * i
                end = self.bufferSize * (i + 1)

                pickleTimer = Timer()
                buf = pickle_dumps(ChainableList(clist[start:end]))
                self.profiler.add("masterPickle", pickleTimer.since())

                cacheTimer = Timer()
                self.cache.pushBuffer(self.label, i, buf)
                self.profiler.add("masterCache", cacheTimer.since())

            self.logger.debug("Initialized with %d buffers" % self.buffers)

        self.logger.debug(json.dumps(self.profiler.toDict(), indent=2))

    def createIntermediary(self):
        prefix = self.label + "_intermediary_"
        timer = Timer()
        self.currentDsLabel = prefix + str(self.cache.incr(prefix))
        self.logger.debug("Creating intermediary '%s'" % self.currentDsLabel)
        self.profiler.add("masterCache", timer.since())
        self.currentIsIntermediary = True
        return self.currentDsLabel

    def flatMap(self, fn):
        self.opsList.append({ "method": "flatMap", "fn": fn })
        return self

    def map(self, fn):
        self.opsList.append({ "method": "map", "fn": fn })
        return self

    def filter(self, fn):
        self.opsList.append({ "method": "filter", "fn": fn })
        return self

    # XXX No distributed implementation yet.
    #def reduce(self, fn):
    #    self.opsList.append({ "method": "reduce", "fn": fn })
    #    return self.collect()

    # List items must be (key, value) tuples, as in Spark's pair RDDs
    def reduceByKey(self, fn):
        res = ChainableList([])

        groups = ChainableList([
            (key, ChainableList(group)) for key, group in groupby(sorted(self), lambda x: x[0])
        ]).map(lambda (key, items): (key, items.map(lambda x: x[1])))

        for key, values in groups:
            self.logger.trace(values)
            res.append((key, reduce(fn, values)))

        return res

    def collect(self):
        self.profiler = Profiler()
        localTimer = Timer()
        res = ChainableList([])

        for k in sorted(self.cache.getKeys(self.currentDsLabel), key=lambda x: int(x)):
            cacheTimer = Timer()
            buf = self.cache.get(self.currentDsLabel, k)
            self.profiler.add("collectCache", cacheTimer.since())
            res.extend(pickle.loads(buf))

        self.profiler.add("collectMaster", localTimer.since())

        if self.currentIsIntermediary:
            self.logger.debug("Removing intermediary '%s'" % self.currentDsLabel)
            self.meta.remove(self.currentDsLabel)
            self.currentDsLabel = self.label
            self.currentIsIntermediary = False

        return res

    def getProfile(self, f=None):
        if f:
            f(self.profiler.toDict())
        else:
            return self.profiler.toDict()

    def setLabel(self, label):
        # Named setLabel rather than label: the instance attribute self.label
        # set in __init__ would otherwise shadow this method
        self.meta.rename(self.currentDsLabel, label)
        self.currentDsLabel = label
        self.currentIsIntermediary = False

    def __del__(self):
        # Only clean up intermediaries; named datasets persist in the cache
        if self.currentIsIntermediary:
            self.meta.remove(self.currentDsLabel)
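# For reference, the grouping reduceByKey performs can be sketched with plain
# itertools on a local list of (key, value) pairs (a standalone sketch, not the
# distributed path; values are illustrative):
from itertools import groupby

pairs = [("a", 1), ("b", 2), ("a", 3)]
grouped = [(k, [v for _, v in g]) for k, g in groupby(sorted(pairs), lambda x: x[0])]
result = [(k, reduce(lambda x, y: x + y, vs)) for k, vs in grouped]
# result == [("a", 4), ("b", 2)]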
class Master(object):
    """
    Master is both the client-facing server process and the grid controller.
    """
    def __init__(self, config, cache, bufferSize):
        self.cache = cache
        self.config = config
        self.logger = Logger("master")
        self.meta = Meta(self.cache)
        self.dataSets = {}
        self.profiler = Profiler()

        """
        zmq init
        """
        zmqTimer = Timer()
        context = zmq.Context()

        # Sender
        self.sender = context.socket(zmq.PUSH)
        self.sender.bind("tcp://*:" + str(self.config["cluster"]["master"]["port"]))
        self.logger.debug("Initialized sender socket")

        # Sink
        self.sink = context.socket(zmq.PULL)
        self.sink.bind("tcp://*:" + str(self.config["cluster"]["return"]["port"]))
        self.logger.debug("Initialized sink socket")

        # Server
        self.server = context.socket(zmq.REP)
        self.server.bind("tcp://*:" + str(self.config["server"]["port"]))
        self.logger.debug("Initialized server socket")

        self.profiler.add("localZmq", zmqTimer.since())

    def compute(self, label, intermediaryLabel, opsList):
        self.logger.debug("Starting compute on %s" % label)
        self.profiler = Profiler()
        localTimer = Timer()

        cacheTimer = Timer()
        keys = self.cache.getKeys(label)
        self.profiler.add("computeCache", cacheTimer.since())
        self.logger.debug("Got %d keys" % len(keys))

        source = label
        dest = intermediaryLabel

        self.meta.register(dest, {
            "opsList": opsList,
            "buffers": len(keys),
            "isIntermediary": True
        })

        for key in keys:
            self.logger.trace("Sending key %s" % key)

            pickleTimer = Timer()
            msg = pickle_dumps({
                "offset": key,
                "sourceLabel": source,
                "destLabel": dest
            })
            self.profiler.add("computePickle", pickleTimer.since())

            zmqTimer = Timer()
            self.sender.send(msg)
            self.profiler.add("computeZmq", zmqTimer.since())

        # Wait for one result per key sent
        results = 0

        while results != len(keys):
            self.logger.trace("Received %d out of %d results" % (results, len(keys)))

            zmqTimer = Timer()
            res = self.sink.recv_pyobj()
            self.profiler.add("computeZmq", zmqTimer.since())

            self.profiler.append(res["profiler"])
            results = results + 1

        self.profiler.add("computeOverall", localTimer.since())
        self.logger.info("compute complete")
        self.logger.debug(json.dumps(self.profiler.toDict(), indent=2))

        return self

    def getProfile(self, fn=None):
        if fn:
            fn(self.profiler.toDict())
        else:
            return self.profiler.toDict()

    def mainLoop(self):
        while True:
            self.logger.debug("Receiving")
            req = pickle.loads(self.server.recv())
            data = req.get("data")

            if req["method"] == "create":
                self.logger.debug("create called for dataset %s, id %s" % (req["data"]["label"], req["id"]))
                ds = DataSet(self.cache, self.config, data["clist"], label=data["label"],
                             fromExisting=data["fromExisting"], bufferSize=data["bufferSize"])
                self.logger.debug(ds.currentDsLabel)
                self.dataSets[req["id"]] = ds
                self.server.send(pickle_dumps({ "status": "ok" }))

            elif req["method"] in ["map", "flatMap", "reduceByKey", "filter"]:
                ds = self.dataSets[req["id"]]
                self.logger.debug("%s called for dataset %s, id %s" % (req["method"], ds.currentDsLabel, req["id"]))
                getattr(ds, req["method"])(data)
                self.server.send(pickle_dumps({ "status": "ok" }))

            elif req["method"] in ["collect", "reduce"]:
                ds = self.dataSets[req["id"]]
                self.logger.debug("%s called for dataset %s, id %s" % (req["method"], ds.currentDsLabel, req["id"]))
                self.compute(ds.currentDsLabel, ds.createIntermediary(), ds.opsList)
                res = ds.collect()

                if req["method"] == "reduce":
                    res = res.reduce(data)

                self.server.send(pickle_dumps({ "status": "ok", "data": res }))

            elif req["method"] == "compute":
                ds = self.dataSets[req["id"]]
                self.logger.debug("compute called for dataset %s, id %s" % (ds.currentDsLabel, req["id"]))
                self.compute(ds.currentDsLabel, ds.createIntermediary(), ds.opsList)
                self.server.send(pickle_dumps({ "status": "ok" }))

            else:
                self.logger.debug("unknown method %s" % req["method"])
                # A REP socket must reply to every request or it deadlocks
                self.server.send(pickle_dumps({ "status": "error", "error": "unknown method" }))
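# A hedged sketch of a client driving the server socket above.  The request
# shape ({"id", "method", "data"}) mirrors what mainLoop unpickles; the port
# and id values are illustrative, and "compute" is used here because it needs
# no function payload.
import pickle

import zmq

context = zmq.Context()
client = context.socket(zmq.REQ)
client.connect("tcp://localhost:5555")  # config["server"]["port"] in a real deployment

client.send(pickle.dumps({ "id": "example-id", "method": "compute", "data": None }))
reply = pickle.loads(client.recv())  # {"status": "ok"} on success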
#!/usr/bin/env python

from okdataset.clist import ChainableList
from okdataset.context import Context
from okdataset.logger import Logger

logger = Logger("flatmaparray example")

context = Context()

logger.info("Building big list")
l = ChainableList([ x for x in xrange(0, 100) ])

logger.info("Building dataset")
ds = context.dataSet(l, label="flatMap list", bufferSize=1)

logger.info("Calling flatMap")

def fm(x):
    for i in [ "a", "b", "c" ]:
        yield [ x, i ]

res = ds.flatMap(fm).collect()
print(res)

logger.info("All done!")
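# Expected shape of res, assuming flatMap flattens each generator in element
# order: [[0, "a"], [0, "b"], [0, "c"], [1, "a"], ...] -- 300 pairs in total.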
#!/usr/bin/env python

from okdataset.clist import ChainableList
from okdataset.context import Context
from okdataset.logger import Logger

logger = Logger("maparray example")

logger.info("Building big list")
l = ChainableList([1, 1, 1, 2, 2, 3, 6, 9, 9, 9, 12])

logger.info("Building dataset")
context = Context()
ds = context.dataSet(l, bufferSize=1)

logger.info("Calling chain")
res = ds.map(lambda x: (x, 1)).reduceByKey(lambda x, y: x + y)
print res
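# With the counts above, the reduced pairs should come out as
# [(1, 3), (2, 2), (3, 1), (6, 1), (9, 3), (12, 1)].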
    def __init__(self, config, cache):
        self.logger = Logger("worker")
        self.offsets = {}
        meta = Meta(cache)

        self.currentDestLabel = ""
        self.opsList = None

        cluster = config["cluster"]

        context = zmq.Context()

        receiver = context.socket(zmq.PULL)
        receiver.connect("tcp://" + cluster["master"]["host"] + ":" + str(cluster["master"]["port"]))
        self.logger.debug("Initialized receiver socket")

        returner = context.socket(zmq.PUSH)
        returner.connect("tcp://" + cluster["return"]["host"] + ":" + str(cluster["return"]["port"]))
        self.logger.debug("Initialized returner socket")

        self.logger.info("Worker initialized")

        while True:
            profiler = Profiler()
            local = Timer()

            zmqTimer = Timer()
            msg = receiver.recv()
            profiler.add("workerZmq", zmqTimer.since())

            pickleTimer = Timer()
            msg = pickle.loads(msg)
            profiler.add("workerPickle", pickleTimer.since())

            self.logger.trace("Received message: " + str(msg))

            # Refresh the ops list when work arrives for a new destination
            if self.currentDestLabel != msg["destLabel"]:
                self.opsList = meta.get(msg["destLabel"])["opsList"]
                self.currentDestLabel = msg["destLabel"]

            cacheTimer = Timer()
            buf = cache.get(msg["sourceLabel"], msg["offset"])
            profiler.add("workerCache", cacheTimer.since())

            pickleTimer = Timer()
            buf = pickle.loads(buf)
            profiler.add("workerPickle", pickleTimer.since())

            self.logger.trace("Received buffer")

            res = buf

            for op in self.opsList:
                res = getattr(res, op["method"])(op["fn"])

            self.logger.trace("Processed buffer")

            reply = {
                "destLabel": msg["destLabel"],
                "offset": msg["offset"],
                "status": "ok",
                "profiler": profiler
            }

            # flatMap (and filter) can change the buffer length
            if len(res) != len(buf):
                reply["size"] = len(res)

            pickleTimer = Timer()
            res = pickle_dumps(res)
            profiler.add("workerPickle", pickleTimer.since())

            cacheTimer = Timer()
            cache.pushBuffer(msg["destLabel"], msg["offset"], res)
            profiler.add("workerCache", cacheTimer.since())

            self.logger.trace("Pushed buffer")

            profiler.add("workerOverall", local.since())
            returner.send_pyobj(reply)
            self.logger.trace("Reply sent")
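# The per-buffer work above reduces to applying each queued op to a
# ChainableList.  A standalone sketch of that inner loop (ops and values are
# illustrative):
from okdataset.clist import ChainableList

buf = ChainableList([1, 2, 3])
opsList = [
    { "method": "map", "fn": lambda x: x * 2 },
    { "method": "filter", "fn": lambda x: x > 2 },
]

res = buf
for op in opsList:
    res = getattr(res, op["method"])(op["fn"])
# res == [4, 6]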