Exemplo n.º 1
0
def start_remote_worker(worker, st, ed):
    """
    Launch worker processes for the index range ``[st, ed)`` on one host.

    If ``FLAGS.use_threads`` is set and ``worker`` is ``"localhost"``, every
    index in the range is started as a daemon thread inside this process and
    the function returns ``None``.  Otherwise a single
    ``python -m spartan.worker`` command line (requesting ``ed - st``
    workers) is assembled and launched through ``ssh`` for remote hosts, or
    through a local shell for ``"localhost"``.

    :param worker: hostname to connect to.
    :param st: First process index to start.
    :param ed: End of the index range; ``range(st, ed)`` is used, so ``ed``
      itself is not started.
    :return: The ``subprocess.Popen`` handle of the launched command, or
      ``None`` when threads were used.
    """
    is_local = worker == "localhost"

    if FLAGS.use_threads and is_local:
        util.log_info("Using threads.")
        for index in range(st, ed):
            master_addr = (socket.gethostname(), FLAGS.port_base)
            thread = threading.Thread(target=spartan.worker._start_worker, args=(master_addr, index))
            # Daemon threads do not keep the interpreter alive at exit.
            thread.daemon = True
            thread.start()
        time.sleep(0.1)
        return None

    util.log_info("Starting worker %d:%d on host %s", st, ed, worker)
    if FLAGS.oprofile:
        # Create the directory that is handed to ``operf -d`` below.
        os.system("mkdir operf.%s" % worker)

    # The pieces are joined with spaces (by the local shell, or by ssh on the
    # remote side), so a single item may hold several words.
    command = ["cd %s && " % os.path.abspath(os.path.curdir)]

    if FLAGS.xterm:
        command.extend(["xterm", "-e"])

    if FLAGS.oprofile:
        command.extend(["operf -e CPU_CLK_UNHALTED:100000000", "-g", "-d", "operf.%s" % worker])

    # For debugging, 'gdb', '-ex', 'run', '--args' can be placed in front of
    # the interpreter.
    command.extend(
        [
            "python",
            "-m spartan.worker",
            "--master=%s:%d" % (socket.gethostname(), FLAGS.port_base),
            "--count=%d" % (ed - st),
            "--heartbeat_interval=%d" % FLAGS.heartbeat_interval,
        ]
    )

    # Forward flags from config/user, except the ones listed here.
    skipped = ("worker_list", "print_options")
    for (name, value) in FLAGS:
        if name not in skipped:
            command.append(repr(value))

    util.log_debug("Running worker %s", " ".join(command))
    time.sleep(0.1)
    # TODO: improve this to make log break at newline
    if is_local:
        return subprocess.Popen(" ".join(command), shell=True, stdin=subprocess.PIPE)

    ssh_prefix = ["ssh", "-oForwardX11=no", worker]
    return subprocess.Popen(ssh_prefix + command, executable="ssh")
Exemplo n.º 2
0
def start_remote_worker(worker, st, ed):
  '''
  Launch worker processes for the index range ``[st, ed)`` on one host.

  If ``FLAGS.use_threads`` is set and ``worker`` is ``'localhost'``, every
  index in the range is started as a daemon thread inside this process and
  the function returns ``None``.  Otherwise a single
  ``python -m spartan.worker`` command line (requesting ``ed - st`` workers)
  is assembled and launched through ``ssh`` for remote hosts, or through a
  local shell for ``'localhost'``.

  :param worker: hostname to connect to.
  :param st: First process index to start.
  :param ed: End of the index range; ``range(st, ed)`` is used, so ``ed``
    itself is not started.
  :return: The ``subprocess.Popen`` handle of the launched command, or
    ``None`` when threads were used.
  '''
  is_local = worker == 'localhost'

  if FLAGS.use_threads and is_local:
    util.log_info('Using threads.')
    for index in range(st, ed):
      master_addr = (socket.gethostname(), FLAGS.port_base)
      thread = threading.Thread(target=spartan.worker._start_worker,
                                args=(master_addr, index))
      # Daemon threads do not keep the interpreter alive at exit.
      thread.daemon = True
      thread.start()
    time.sleep(0.1)
    return None

  util.log_info('Starting worker %d:%d on host %s', st, ed, worker)
  if FLAGS.oprofile:
    # Create the directory that is handed to ``operf -d`` below.
    os.system('mkdir operf.%s' % worker)

  # The pieces are joined with spaces (by the local shell, or by ssh on the
  # remote side), so a single item may hold several words.
  command = ['cd %s && ' % os.path.abspath(os.path.curdir)]

  if FLAGS.xterm:
    command.extend(['xterm', '-e'])

  if FLAGS.oprofile:
    command.extend(['operf -e CPU_CLK_UNHALTED:100000000', '-g', '-d',
                    'operf.%s' % worker])

  # For debugging, 'gdb', '-ex', 'run', '--args' can be placed in front of
  # the interpreter.
  command.extend([
    'python',
    '-m spartan.worker',
    '--master=%s:%d' % (socket.gethostname(), FLAGS.port_base),
    '--count=%d' % (ed - st),
    '--heartbeat_interval=%d' % FLAGS.heartbeat_interval,
  ])

  # Forward flags from config/user, except the ones listed here.
  skipped = ('worker_list', 'print_options')
  for (name, value) in FLAGS:
    if name not in skipped:
      command.append(repr(value))

  util.log_debug('Running worker %s', ' '.join(command))
  time.sleep(0.1)
  if is_local:
    return subprocess.Popen(' '.join(command), shell=True, stdin=subprocess.PIPE)

  ssh_prefix = ['ssh', '-oForwardX11=no', worker]
  return subprocess.Popen(ssh_prefix + command, executable='ssh')
Exemplo n.º 3
0
 def bind(self):
   '''
   Bind the ZeroMQ socket to the address held in ``self.addr``.

   The host part of ``self.addr`` is resolved with ``socket.gethostbyname``.
   A port of ``-1`` requests a random port: the socket is bound with
   ``bind_to_random_port`` and ``self.addr`` is replaced by the resolved
   host together with the port that call returns.  For any other port the
   socket is bound to exactly that port and ``self.addr`` is left as it was.

   Raises:
     zmq.ZMQError: re-raised after logging when binding to a fixed port fails.
   '''
   name, port = self.addr
   resolved = socket.gethostbyname(name)
   util.log_debug('Binding... %s', (resolved, port))

   if port == -1:
     chosen_port = self._zmq.bind_to_random_port('tcp://%s' % resolved)
     self.addr = (resolved, chosen_port)
     return

   endpoint = 'tcp://%s:%d' % (resolved, port)
   try:
     self._zmq.bind(endpoint)
   except zmq.ZMQError:
     # Log which address failed, then let the caller see the original error.
     util.log_info('Failed to bind (%s, %d)' % (resolved, port))
     raise
Exemplo n.º 4
0
def compile_parakeet_source(src):
  '''
  Compile source code defining a parakeet function.

  ``src`` is written to a temporary ``spartan-local-*.py`` file, which is
  then loaded with ``imp.load_source`` under the module name
  ``parakeet_temp``.

  :param src: Source text to write to the temporary file and load.
  :raises CodegenException: if loading the generated file raises any
    ``Exception``; built from the original error's ``message`` and ``args``.
  '''
  util.log_debug('Compiling parakeet source.')

  source_file = tempfile.NamedTemporaryFile(suffix='.py',
                                            prefix='spartan-local-',
                                            delete=True)
  source_file.write(src)
  # Flush so the text is on disk before the file is loaded by name below.
  source_file.flush()

  try:
    # NOTE(review): the loaded module is only bound to this local and is not
    # returned in this function body - confirm that is intended.
    module = imp.load_source('parakeet_temp', source_file.name)
  except Exception as ex:
    util.log_info('Failed to build parakeet wrapper')
    util.log_debug('Source was: %s', src)
    raise CodegenException(ex.message, ex.args)
Exemplo n.º 5
0
    def maybe_steal_tile(self, req, handle):
        '''
        Answer a steal request from a worker that has finished all of its tiles.

        The requesting worker's remaining-tile list is reset to empty.  If the
        request reports a migrated tile (``req.old_tile_id`` is not ``None``),
        the first array that knows the old tile id is re-pointed at
        ``req.new_tile_id`` and the old tile is destroyed through ``self._ctx``.
        Workers are then visited from most to fewest remaining tiles; the first
        one whose leading tile is successfully cancelled gives that tile up.
        On completion ``handle.done`` is called once with a
        ``core.TileIdMessage`` carrying the stolen tile id, or ``None`` when
        nothing could be stolen.

        Args:
          req (UpdateAndStealTileReq):
          handle (PendingRequest):
        '''
        requester = req.worker_id
        self._worker_statuses[requester].kernel_remain_tiles = []

        # Record the tile migration reported by the requester, if any.
        old_id = req.old_tile_id
        if old_id is not None:
            util.log_debug('worker(%s) update old_tile:%s new_tile:%s',
                           requester, old_id, req.new_tile_id)
            for array in self._arrays:
                entry = array.blob_to_ex.get(old_id)
                if entry is None:
                    continue
                array.tiles[entry] = req.new_tile_id
                array.blob_to_ex[req.new_tile_id] = entry

                del array.blob_to_ex[old_id]
                self._ctx.destroy(old_id)
                # Only the first array that holds the old tile is updated.
                break

        def backlog(item):
            # item is a (worker_id, status) pair from the status table.
            return len(item[1].kernel_remain_tiles)

        # Look for a tile to hand over, busiest worker first.
        stolen_id = None
        ranked = sorted(self._worker_statuses.iteritems(), key=backlog,
                        reverse=True)
        for victim_id, victim in ranked:
            # Sorted by backlog, so an empty list means no later worker has
            # tiles either.
            if len(victim.kernel_remain_tiles) == 0:
                break

            candidate = victim.kernel_remain_tiles[0]
            if self._ctx.cancel_tile(victim_id, candidate):
                util.log_debug('move tile:%s from worker(%s) to worker(%s)',
                               candidate, victim_id, requester)
                victim.kernel_remain_tiles.remove(candidate)
                stolen_id = candidate
                break

        handle.done(core.TileIdMessage(tile_id=stolen_id))
Exemplo n.º 6
0
    def maybe_steal_tile(self, req, handle):
        """
        Answer a steal request from a worker that has finished all of its tiles.

        The requesting worker's remaining-tile list is reset to empty.  If the
        request reports a migrated tile (``req.old_tile_id`` is not ``None``),
        the first array that knows the old tile id is re-pointed at
        ``req.new_tile_id`` and the old tile is destroyed through ``self._ctx``.
        Workers are then visited from most to fewest remaining tiles; the first
        one whose leading tile is successfully cancelled gives that tile up.
        On completion ``handle.done`` is called once with a
        ``core.TileIdMessage`` carrying the stolen tile id, or ``None`` when
        nothing could be stolen.

        Args:
          req (UpdateAndStealTileReq):
          handle (PendingRequest):
        """
        requester = req.worker_id
        self._worker_statuses[requester].kernel_remain_tiles = []

        # Record the tile migration reported by the requester, if any.
        old_id = req.old_tile_id
        if old_id is not None:
            util.log_debug("worker(%s) update old_tile:%s new_tile:%s", requester, old_id, req.new_tile_id)
            for array in self._arrays:
                entry = array.blob_to_ex.get(old_id)
                if entry is None:
                    continue
                array.tiles[entry] = req.new_tile_id
                array.blob_to_ex[req.new_tile_id] = entry

                del array.blob_to_ex[old_id]
                self._ctx.destroy(old_id)
                # Only the first array that holds the old tile is updated.
                break

        def backlog(item):
            # item is a (worker_id, status) pair from the status table.
            return len(item[1].kernel_remain_tiles)

        # Look for a tile to hand over, busiest worker first.
        stolen_id = None
        ranked = sorted(self._worker_statuses.iteritems(), key=backlog, reverse=True)
        for victim_id, victim in ranked:
            # Sorted by backlog, so an empty list means no later worker has
            # tiles either.
            if len(victim.kernel_remain_tiles) == 0:
                break

            candidate = victim.kernel_remain_tiles[0]
            if self._ctx.cancel_tile(victim_id, candidate):
                util.log_debug("move tile:%s from worker(%s) to worker(%s)", candidate, victim_id, requester)
                victim.kernel_remain_tiles.remove(candidate)
                stolen_id = candidate
                break

        handle.done(core.TileIdMessage(tile_id=stolen_id))
Exemplo n.º 7
0
def compile_parakeet_source(src):
    '''
    Compile source code defining a parakeet function.

    ``src`` is written to a temporary ``spartan-local-*.py`` file, which is
    then loaded with ``imp.load_source`` under the module name
    ``parakeet_temp``.

    :param src: Source text to write to the temporary file and load.
    :raises CodegenException: if loading the generated file raises any
      ``Exception``; built from the original error's ``message`` and ``args``.
    '''
    util.log_debug('Compiling parakeet source.')

    source_file = tempfile.NamedTemporaryFile(suffix='.py',
                                              prefix='spartan-local-',
                                              delete=True)
    source_file.write(src)
    # Flush so the text is on disk before the file is loaded by name below.
    source_file.flush()

    try:
        # NOTE(review): the loaded module is only bound to this local and is
        # not returned in this function body - confirm that is intended.
        module = imp.load_source('parakeet_temp', source_file.name)
    except Exception as ex:
        util.log_info('Failed to build parakeet wrapper')
        util.log_debug('Source was: %s', src)
        raise CodegenException(ex.message, ex.args)
Exemplo n.º 8
0
def start_remote_worker(worker, st, ed):
    '''
    Launch worker processes for the index range ``[st, ed)`` on one host.

    If ``FLAGS.use_threads`` is set and ``worker`` is ``'localhost'``, every
    index in the range is started as a daemon thread inside this process and
    the function returns ``None``.  Otherwise a single
    ``python -m spartan.worker`` command line (requesting ``ed - st``
    workers) is assembled and launched through ``ssh`` for remote hosts, or
    through a local shell for ``'localhost'``.

    :param worker: hostname to connect to.
    :param st: First process index to start.
    :param ed: End of the index range; ``range(st, ed)`` is used, so ``ed``
      itself is not started.
    :return: The ``subprocess.Popen`` handle of the launched command, or
      ``None`` when threads were used.
    '''
    is_local = worker == 'localhost'

    if FLAGS.use_threads and is_local:
        util.log_info('Using threads.')
        for index in range(st, ed):
            master_addr = (socket.gethostname(), FLAGS.port_base)
            thread = threading.Thread(target=spartan.worker._start_worker,
                                      args=(master_addr, index))
            # Daemon threads do not keep the interpreter alive at exit.
            thread.daemon = True
            thread.start()
        time.sleep(0.1)
        return None

    util.log_info('Starting worker %d:%d on host %s', st, ed, worker)
    if FLAGS.oprofile:
        # Create the directory that is handed to ``operf -d`` below.
        os.system('mkdir operf.%s' % worker)

    # The pieces are joined with spaces (by the local shell, or by ssh on the
    # remote side), so a single item may hold several words.
    command = ['cd %s && ' % os.path.abspath(os.path.curdir)]

    if FLAGS.xterm:
        command.extend(['xterm', '-e'])

    if FLAGS.oprofile:
        command.extend([
            'operf -e CPU_CLK_UNHALTED:100000000', '-g', '-d',
            'operf.%s' % worker
        ])

    # For debugging, 'gdb', '-ex', 'run', '--args' can be placed in front of
    # the interpreter.
    command.extend([
        'python',
        '-m spartan.worker',
        '--master=%s:%d' % (socket.gethostname(), FLAGS.port_base),
        '--count=%d' % (ed - st),
        '--heartbeat_interval=%d' % FLAGS.heartbeat_interval
    ])

    # Forward flags from config/user, except the ones listed here.
    skipped = ('worker_list', 'print_options')
    for (name, value) in FLAGS:
        if name not in skipped:
            command.append(repr(value))

    util.log_debug('Running worker %s', ' '.join(command))
    time.sleep(0.1)
    if is_local:
        return subprocess.Popen(' '.join(command), shell=True,
                                stdin=subprocess.PIPE)

    ssh_prefix = ['ssh', '-oForwardX11=no', worker]
    return subprocess.Popen(ssh_prefix + command, executable='ssh')