def start_remote_worker(worker, st, ed):
    """
    Launch the worker processes with indices ``st`` up to ``ed`` on ``worker``.

    When ``FLAGS.use_threads`` is set and ``worker`` is ``"localhost"``, each
    worker is started as a daemon thread inside the current process and
    nothing is returned.  Otherwise a single ``python -m spartan.worker``
    command is launched (over ``ssh`` for a remote host, through the local
    shell for ``"localhost"``) and told to run ``ed - st`` workers.

    :param worker: hostname to connect to.
    :param st: First process index to start.
    :param ed: End of the index range; exclusive, as in ``range(st, ed)``.
    :return: The ``subprocess.Popen`` handle, or ``None`` in thread mode.
    """
    is_local = worker == "localhost"

    if FLAGS.use_threads and is_local:
        util.log_info("Using threads.")
        for index in range(st, ed):
            thread = threading.Thread(
                target=spartan.worker._start_worker,
                args=((socket.gethostname(), FLAGS.port_base), index),
            )
            # Daemon threads do not keep the interpreter alive at exit.
            thread.daemon = True
            thread.start()
        time.sleep(0.1)
        return

    util.log_info("Starting worker %d:%d on host %s", st, ed, worker)
    if FLAGS.oprofile:
        os.system("mkdir operf.%s" % worker)

    # The command always starts by changing into the current directory.
    command = ["cd %s && " % os.path.abspath(os.path.curdir)]
    if FLAGS.xterm:
        command.extend(["xterm", "-e"])
    if FLAGS.oprofile:
        command.extend(["operf -e CPU_CLK_UNHALTED:100000000", "-g", "-d", "operf.%s" % worker])

    # Debugging hint: 'gdb', '-ex', 'run', '--args' can be inserted here.
    command.append("python")
    command.append("-m spartan.worker")
    command.append("--master=%s:%d" % (socket.gethostname(), FLAGS.port_base))
    command.append("--count=%d" % (ed - st))
    command.append("--heartbeat_interval=%d" % FLAGS.heartbeat_interval)

    # add flags from config/user
    # NOTE(review): presumably repr() of a flag renders it as a command-line
    # option -- confirm against the FLAGS implementation.
    skipped = ("worker_list", "print_options")
    command.extend(repr(value) for name, value in FLAGS if name not in skipped)

    shell_line = " ".join(command)
    util.log_debug("Running worker %s", shell_line)
    time.sleep(0.1)
    # TODO: improve this to make log break at newline

    if is_local:
        return subprocess.Popen(shell_line, shell=True, stdin=subprocess.PIPE)
    return subprocess.Popen(["ssh", "-oForwardX11=no", worker] + command, executable="ssh")
def start_remote_worker(worker, st, ed):
    '''
    Start processes on a worker machine.

    The machine is asked to launch ``ed - st`` worker processes, covering
    the indices ``st`` (inclusive) to ``ed`` (exclusive).  In thread mode
    (``FLAGS.use_threads`` with ``worker == 'localhost'``) the workers are
    daemon threads of the current process instead and ``None`` is returned.

    :param worker: hostname to connect to.
    :param st: First process index to start.
    :param ed: One past the last process index to start.
    :return: ``subprocess.Popen`` object for the launched command, or
        ``None`` when threads were used.
    '''
    if FLAGS.use_threads and worker == 'localhost':
        util.log_info('Using threads.')
        for worker_index in range(st, ed):
            runner = threading.Thread(
                target=spartan.worker._start_worker,
                args=((socket.gethostname(), FLAGS.port_base), worker_index))
            runner.daemon = True
            runner.start()
        time.sleep(0.1)
        return

    util.log_info('Starting worker %d:%d on host %s', st, ed, worker)
    if FLAGS.oprofile:
        os.system('mkdir operf.%s' % worker)

    # Optional wrappers placed in front of the python invocation.
    terminal = ['xterm', '-e'] if FLAGS.xterm else []
    profiler = []
    if FLAGS.oprofile:
        profiler = ['operf -e CPU_CLK_UNHALTED:100000000', '-g', '-d',
                    'operf.%s' % worker]

    invocation = [
        # debugging hint: 'gdb', '-ex', 'run', '--args',
        'python',
        '-m spartan.worker',
        '--master=%s:%d' % (socket.gethostname(), FLAGS.port_base),
        '--count=%d' % (ed - st),
        '--heartbeat_interval=%d' % FLAGS.heartbeat_interval,
    ]

    # add flags from config/user (two flags are deliberately not forwarded)
    forwarded = [repr(value) for (name, value) in FLAGS
                 if name not in ['worker_list', 'print_options']]

    change_dir = ['cd %s && ' % os.path.abspath(os.path.curdir)]
    args = change_dir + terminal + profiler + invocation + forwarded

    util.log_debug('Running worker %s', ' '.join(args))
    time.sleep(0.1)

    if worker == 'localhost':
        # Local launch: one shell command line, with a pipe attached to stdin.
        p = subprocess.Popen(' '.join(args), shell=True, stdin=subprocess.PIPE)
    else:
        # Remote launch: hand the pieces to ssh without X11 forwarding.
        ssh_args = ['ssh', '-oForwardX11=no', worker]
        p = subprocess.Popen(ssh_args + args, executable='ssh')
    return p
def bind(self):
    '''
    Bind ``self._zmq`` to the TCP address stored in ``self.addr``.

    The host part of ``self.addr`` is resolved with ``socket.gethostbyname``.
    A port of ``-1`` requests a random port; in that case ``self.addr`` is
    replaced by ``(resolved_host, chosen_port)``.  With an explicit port,
    ``self.addr`` is left untouched.

    :raises zmq.ZMQError: re-raised after logging when binding to an
        explicit port fails.
    '''
    hostname, port = self.addr
    host = socket.gethostbyname(hostname)
    util.log_debug('Binding... %s', (host, port))

    if port == -1:
        chosen_port = self._zmq.bind_to_random_port('tcp://%s' % host)
        self.addr = (host, chosen_port)
        return

    endpoint = 'tcp://%s:%d' % (host, port)
    try:
        self._zmq.bind(endpoint)
    except zmq.ZMQError:
        util.log_info('Failed to bind (%s, %d)' % (host, port))
        raise
def compile_parakeet_source(src):
    '''
    Compile source code defining a parakeet function.

    ``src`` is written to a temporary ``spartan-local-*.py`` file, which is
    then loaded as the module ``parakeet_temp`` with ``imp.load_source``.
    The temporary file is closed (and therefore deleted, because it is
    created with ``delete=True``) on every exit path.

    :param src: Source text of the module to load.
    :raises CodegenException: if loading the generated module fails; it
        carries the message and ``args`` of the underlying exception.
    '''
    util.log_debug('Compiling parakeet source.')

    # The context manager guarantees the temporary file is released even
    # when loading fails, instead of relying on garbage collection.
    with tempfile.NamedTemporaryFile(delete=True,
                                     prefix='spartan-local-',
                                     suffix='.py') as tmpfile:
        tmpfile.write(src)
        # Flush so the loader sees the full source on disk.
        tmpfile.flush()

        try:
            # NOTE(review): ``module`` is neither used nor returned in the
            # code visible here -- confirm what callers expect.
            module = imp.load_source('parakeet_temp', tmpfile.name)
        except Exception as ex:
            util.log_info('Failed to build parakeet wrapper')
            util.log_debug('Source was: %s', src)
            # ``message`` is a deprecated attribute that not every exception
            # provides; fall back to str(ex) only when it is missing.
            message = getattr(ex, 'message', None)
            if message is None:
                message = str(ex)
            raise CodegenException(message, ex.args)
def maybe_steal_tile(self, req, handle):
    '''
    Called when a worker has finished processing all of its current tiles
    and is looking for more work to do.

    The requesting worker's list of remaining tiles is reset to empty.  If
    the request reports a migrated tile (``req.old_tile_id`` is not None),
    the first array that knows the old tile id is re-pointed at
    ``req.new_tile_id`` and the old tile is destroyed.  Workers are then
    visited in decreasing order of remaining tiles; the first tile that can
    be cancelled on its current worker is sent back through
    ``handle.done``.  If no tile can be stolen the reply carries
    ``tile_id=None``.

    Args:
      req (UpdateAndStealTileReq):
      handle (PendingRequest):
    '''
    self._worker_statuses[req.worker_id].kernel_remain_tiles = []

    # update the migrated tile
    if req.old_tile_id is not None:
        util.log_debug('worker(%s) update old_tile:%s new_tile:%s',
                       req.worker_id, req.old_tile_id, req.new_tile_id)
        for array in self._arrays:
            key = array.blob_to_ex.get(req.old_tile_id)
            if key is None:
                continue
            array.tiles[key] = req.new_tile_id
            array.blob_to_ex[req.new_tile_id] = key
            del array.blob_to_ex[req.old_tile_id]
            self._ctx.destroy(req.old_tile_id)
            # Only the first array that maps the old tile is updated.
            break

    # apply a new tile for execution
    by_backlog = list(self._worker_statuses.iteritems())
    by_backlog.sort(key=lambda entry: len(entry[1].kernel_remain_tiles),
                    reverse=True)

    stolen_id = None
    for victim_id, status in by_backlog:
        pending = status.kernel_remain_tiles
        if len(pending) == 0:
            # Sorted by backlog, so every later worker is empty as well.
            break

        candidate = pending[0]
        if self._ctx.cancel_tile(victim_id, candidate):
            util.log_debug('move tile:%s from worker(%s) to worker(%s)',
                           candidate, victim_id, req.worker_id)
            pending.remove(candidate)
            stolen_id = candidate
            break

    handle.done(core.TileIdMessage(tile_id=stolen_id))
def maybe_steal_tile(self, req, handle):
    """
    Serve a worker that has finished all of its current tiles and asks for
    more work, by trying to steal an outstanding tile from another worker.

    Steps, in order:

    1. The requester's ``kernel_remain_tiles`` is reset to an empty list.
    2. If ``req.old_tile_id`` is not None, the first array whose
       ``blob_to_ex`` contains it is updated to refer to
       ``req.new_tile_id`` and the old tile is destroyed via ``self._ctx``.
    3. Workers are scanned from most to fewest remaining tiles.  The head
       tile of the first worker for which ``self._ctx.cancel_tile``
       succeeds is removed from that worker and returned to the requester.
    4. If nothing was stolen, the reply carries ``tile_id=None``.

    Args:
      req (UpdateAndStealTileReq):
      handle (PendingRequest):
    """
    requester = self._worker_statuses[req.worker_id]
    requester.kernel_remain_tiles = []

    # update the migrated tile
    if req.old_tile_id is not None:
        util.log_debug("worker(%s) update old_tile:%s new_tile:%s",
                       req.worker_id, req.old_tile_id, req.new_tile_id)
        for array in self._arrays:
            ex = array.blob_to_ex.get(req.old_tile_id)
            if ex is not None:
                array.tiles[ex] = req.new_tile_id
                array.blob_to_ex[req.new_tile_id] = ex
                del array.blob_to_ex[req.old_tile_id]
                self._ctx.destroy(req.old_tile_id)
                break

    # apply a new tile for execution
    def backlog(item):
        # item is a (worker_id, status) pair.
        return len(item[1].kernel_remain_tiles)

    ranked = sorted(self._worker_statuses.iteritems(), key=backlog, reverse=True)
    for donor_id, donor in ranked:
        queue = donor.kernel_remain_tiles
        if len(queue) == 0:
            # Remaining donors have nothing left either; give up.
            break

        tile_id = queue[0]
        if not self._ctx.cancel_tile(donor_id, tile_id):
            continue

        util.log_debug("move tile:%s from worker(%s) to worker(%s)",
                       tile_id, donor_id, req.worker_id)
        queue.remove(tile_id)
        handle.done(core.TileIdMessage(tile_id=tile_id))
        return

    handle.done(core.TileIdMessage(tile_id=None))
def compile_parakeet_source(src):
    '''
    Compile source code defining a parakeet function.

    ``src`` is written to a temporary ``spartan-local-*.py`` file, which is
    then loaded as the module ``parakeet_temp`` with ``imp.load_source``.
    The temporary file is closed (and therefore deleted, because it is
    created with ``delete=True``) on every exit path.

    :param src: Source text of the module to load.
    :raises CodegenException: if loading the generated module fails; it
        carries the message and ``args`` of the underlying exception.
    '''
    util.log_debug('Compiling parakeet source.')

    # The context manager guarantees the temporary file is released even
    # when loading fails, instead of relying on garbage collection.
    with tempfile.NamedTemporaryFile(delete=True,
                                     prefix='spartan-local-',
                                     suffix='.py') as tmpfile:
        tmpfile.write(src)
        # Flush so the loader sees the full source on disk.
        tmpfile.flush()

        try:
            # NOTE(review): ``module`` is neither used nor returned in the
            # code visible here -- confirm what callers expect.
            module = imp.load_source('parakeet_temp', tmpfile.name)
        except Exception as ex:
            util.log_info('Failed to build parakeet wrapper')
            util.log_debug('Source was: %s', src)
            # ``message`` is a deprecated attribute that not every exception
            # provides; fall back to str(ex) only when it is missing.
            message = getattr(ex, 'message', None)
            if message is None:
                message = str(ex)
            raise CodegenException(message, ex.args)
def start_remote_worker(worker, st, ed):
    '''
    Start worker processes ``st`` .. ``ed - 1`` for the host ``worker``.

    Two launch modes exist:

    * thread mode (``FLAGS.use_threads`` and ``worker == 'localhost'``):
      one daemon thread per index runs ``spartan.worker._start_worker``;
      the function then returns ``None``.
    * process mode: a ``python -m spartan.worker`` command line is built
      (optionally wrapped in ``xterm`` and/or ``operf``) and run through
      ``ssh`` for remote hosts or through the shell for ``'localhost'``.

    :param worker: hostname to connect to.
    :param st: First process index to start.
    :param ed: Upper bound of the process indices (exclusive); the launched
        command receives ``--count=ed-st``.
    :return: ``subprocess.Popen`` handle in process mode, else ``None``.
    '''
    on_localhost = (worker == 'localhost')

    if FLAGS.use_threads and on_localhost:
        util.log_info('Using threads.')
        for idx in range(st, ed):
            t = threading.Thread(target=spartan.worker._start_worker,
                                 args=((socket.gethostname(), FLAGS.port_base), idx))
            t.daemon = True
            t.start()
        time.sleep(0.1)
        return

    util.log_info('Starting worker %d:%d on host %s', st, ed, worker)

    if FLAGS.oprofile:
        os.system('mkdir operf.%s' % worker)

    # Assemble the command piece by piece, in the order it will run.
    pieces = []
    pieces.append('cd %s && ' % os.path.abspath(os.path.curdir))

    if FLAGS.xterm:
        pieces.append('xterm')
        pieces.append('-e')

    if FLAGS.oprofile:
        pieces.append('operf -e CPU_CLK_UNHALTED:100000000')
        pieces.append('-g')
        pieces.append('-d')
        pieces.append('operf.%s' % worker)

    # debugging hint: 'gdb', '-ex', 'run', '--args' may be inserted here
    pieces.extend([
        'python',
        '-m spartan.worker',
        '--master=%s:%d' % (socket.gethostname(), FLAGS.port_base),
        '--count=%d' % (ed - st),
        '--heartbeat_interval=%d' % FLAGS.heartbeat_interval,
    ])

    # add flags from config/user
    for name, value in FLAGS:
        if name not in ['worker_list', 'print_options']:
            pieces.append(repr(value))

    util.log_debug('Running worker %s', ' '.join(pieces))
    time.sleep(0.1)

    if not on_localhost:
        remote_prefix = ['ssh', '-oForwardX11=no', worker]
        return subprocess.Popen(remote_prefix + pieces, executable='ssh')
    return subprocess.Popen(' '.join(pieces), shell=True, stdin=subprocess.PIPE)