Exemplo n.º 1
0
    def _ClassifyFree(self, free, pool_name):
        """
        Group free machines by their class string.

        Args:
          free: free machine host names; duplicates are collapsed
          pool_name: pool name, used only in the log message
        Returns:
          { 'free_class' : ['mach'] } built from the machines that are not
          in self._used_hosts and that have hardware information
        """

        # Collapse duplicates through a dictionary, then work on the
        # sorted host names.
        candidates = setlib.make_dict(free).keys()
        candidates.sort()

        # Machines that are already in use are dropped from the candidates.
        in_use = setlib.intersect(candidates, self._used_hosts)
        if in_use:
            prodlib.log('Ignoring used machines: %s' % ' '.join(in_use))
            candidates = setlib.diff(candidates, in_use)

        # Preload hardware data.
        self._mach_mgr.MachineList(candidates, load_hardware=1)

        # Bucket the candidates by their defining characteristics so that a
        # smaller set of machines has to be examined when looking for a
        # replacement.
        by_class = {}
        for host in candidates:
            mach = self._mach_mgr.Machine(host)
            # Only machines with hardware information are kept.
            if mach and mach.hardware():
                by_class.setdefault(mach.ClassString(), []).append(mach.name())

        # Note: the logged count still includes the machines skipped above.
        prodlib.log('Found %d free machs from %s.\n'
                    % (len(candidates), pool_name))

        return by_class
Exemplo n.º 2
0
  def _FreeDict(self, coloc, pool_name):
    """
    Return the free dictionary for a pool, computing it on first use.

    Results are remembered in self._free_pools under (coloc, pool_name).
    Args:
      coloc: coloc; replaced by None for the 'specified' pool
      pool_name: pool name
    Returns:
      { 'free_class' : ['mach'] }
    """
    user_specified = (pool_name == 'specified')

    # Specified pools are not restricted by coloc.
    if user_specified:
      coloc = None

    cache_key = (coloc, pool_name)
    cached = self._free_pools.get(cache_key)
    if cached is not None:
      return cached

    if user_specified:
      # The user supplied the free machines.
      candidates = self._free
    else:
      prodlib.log('Finding free machs from %s:%s.' % (coloc, pool_name))
      candidates = []

    classified = self._ClassifyFree(candidates, pool_name)
    self._free_pools[cache_key] = classified
    return classified
Exemplo n.º 3
0
def main(argv):
  """Parse the command-line options given in argv.

  Only the option parsing is visible in this block.  If getopt rejects the
  arguments, the error is logged and usage() is called.

  Args:
    argv: argument list handed unchanged to getopt.getopt()
  """
  import getopt
  import sys

  global send_mail, MAILTO  # so mail works regardless where we fail!

  send_mail = 0         # disable mail unless user wants it. It is enabled
                        # by default only so we can catch major syntax errors.
  batch = 0             # assume we're running from command line

  # Long options accepted on the command line; a trailing '=' means the
  # option takes a value.
  long_options = [
      're=', 'delay=', 'ports=', 'loop', 'noexec',
      'nolock', 'force_lock', 'start=', 'kill=',
      'mailto=', 'nomail', 'fromcron', 'batch', 'mach=',
      'nodataversion', 'sets=', 'sandbox=', 'setpgrp',
      'kill_batch_size=', 'nocheckpoint', 'checkpoint_time=',
      'validate', 'useinvalidconfig', 'nolooprestarts',
      'corphack', 'corptest', 'babyalias=',
      'lockdir=', 'maxiters=', 'config_dir=',
      'restarts_file=', 'nobabycheck', 'ssh_user=', 'nortsignals',
      'regtest',
  ]

  try:
    (optlist, args) = getopt.getopt(argv, 'n', long_options)
  except getopt.error:
    # Fetch the exception through sys.exc_info() so that this handler
    # parses on both Python 2 and Python 3.
    e = sys.exc_info()[1]
    prodlib.log("getopt error: %s" % e)
    usage()
Exemplo n.º 4
0
    def _FreeDict(self, coloc, pool_name):
        """
        Get the free dictionary corresponding to the pool name and coloc.

        The dictionary is built once per (coloc, pool_name) and afterwards
        served from self._free_pools.
        Args:
          coloc: coloc (forced to None for the 'specified' pool)
          pool_name: pool name
        Returns:
          { 'free_class' : ['mach'] }
        """
        from_user = (pool_name == 'specified')
        if from_user:
            # Specified pools are not restricted by coloc.
            coloc = None

        key = (coloc, pool_name)
        known = self._free_pools.get(key)
        if known is not None:
            return known

        # Find free machines if not specified by user.
        if from_user:
            machines = self._free
        else:
            prodlib.log('Finding free machs from %s:%s.' % (coloc, pool_name))
            machines = []

        result = self._ClassifyFree(machines, pool_name)
        self._free_pools[key] = result
        return result
Exemplo n.º 5
0
  def _ClassifyFree(self, free, pool_name):
    """
    Classify free machines by their class string.

    Args:
      free: free machine host names; duplicates are collapsed
      pool_name: pool name, used only in the log message
    Returns:
      { 'free_class' : ['mach'] } for the machines that are not listed in
      self._used_hosts and that have hardware information
    """

    # Ensure free machines are unique and sorted.
    hosts = setlib.make_dict(free).keys()
    hosts.sort()

    # Drop the machines that are already used.
    used = setlib.intersect(hosts, self._used_hosts)
    if used:
      prodlib.log('Ignoring used machines: %s' % ' '.join(used))
      hosts = setlib.diff(hosts, used)

    # Preload hardware data.
    self._mach_mgr.MachineList(hosts, load_hardware=1)

    # Group the machines by their defining characteristics; this allows a
    # smaller set of free machines to be examined as replacement candidates.
    classes = {}
    for host in hosts:
      mach = self._mach_mgr.Machine(host)
      if not mach:
        continue
      # Prune out machines without hardware information.
      if not mach.hardware():
        continue
      classes.setdefault(mach.ClassString(), []).append(mach.name())

    # Note: the logged count still includes the machines pruned above.
    prodlib.log('Found %d free machs from %s.\n' % (len(hosts), pool_name))

    return classes
Exemplo n.º 6
0
def NormalizeTypeLevel(typelvl):
    """Return typelvl in 'mtype:lvl' form.

    Args:
      typelvl: either 'mtype:lvl' or a bare 'mtype'
    Returns:
      typelvl unchanged when it contains exactly one ':'; typelvl + ':0'
      when it contains none.
    Raises:
      RuntimeError: typelvl contains more than one ':' (e.g. 'mtype::lvl').
        The problem is also logged.
    """
    # Use the str method: the string.count() module function is deprecated
    # and does not exist on Python 3.
    separators = typelvl.count(':')
    # typelvl is in correct format (i.e. 'mtype:lvl')
    if separators == 1:
        return typelvl
    # force typelvl (i.e. 'mtype')
    if separators == 0:
        return typelvl + ':0'  # assume lvl 0 if unknown
    # invalid format (i.e. 'mtype::lvl')
    message = "Invalid typelvl format: %s" % typelvl
    prodlib.log(message)
    raise RuntimeError(message)
Exemplo n.º 7
0
def NormalizeTypeLevel(typelvl):
  """Return typelvl in 'mtype:lvl' form.

  Args:
    typelvl: either 'mtype:lvl' or a bare 'mtype'
  Returns:
    typelvl unchanged when it contains exactly one ':'; typelvl + ':0'
    when it contains none.
  Raises:
    RuntimeError: typelvl contains more than one ':' (e.g. 'mtype::lvl').
      The problem is also logged.
  """
  # Use the str method: the string.count() module function is deprecated
  # and does not exist on Python 3.
  cnt = typelvl.count(':')
  # typelvl is in correct format (i.e. 'mtype:lvl')
  if cnt == 1:
    return typelvl
  # force typelvl (i.e. 'mtype')
  if cnt == 0:
    return typelvl + ':0'            # assume lvl 0 if unknown
  # invalid format (i.e. 'mtype::lvl')
  message = "Invalid typelvl format: %s" % typelvl
  prodlib.log(message)
  raise RuntimeError(message)
Exemplo n.º 8
0
    def go(self, timeout=None):
        """Serve restart requests read from the monitor subprocess.

        Closes self.monitor_in (registration is finished) and then reads
        lines from self.monitor_out.  Every ordinary line is a host:port
        key: its function in self.restartfns is called and the restart is
        counted against the matching self.originaldata entry.  A 'MAXITERS'
        line followed by EOF ends the loop; EOF without it is treated as a
        dead monitor and exits the process.

        Args:
          timeout: seconds left to wait for monitor output, or None to wait
            without limit.
        Returns:
          self.restarts, which maps each original host:port to its number
          of restarts.
        """
        # done registering the machines
        if self.monitor_in is not None:
            self.monitor_in.close()
            self.monitor_in = None

        # if we have nothing to do just return
        if not self.restartfns:
            return self.restarts

        reached_maxiters = 0
        # Test for None first: ordering None against a number only happens
        # to work on Python 2 and raises TypeError on Python 3.
        while timeout is None or timeout > 0:
            start_time = time.time()
            # Wait at most the remaining timeout for the monitor to speak.
            (ioready, _, _) = select.select(
                [self.monitor_out],
                [],  # don't care about "write"-s
                [],  # ... or errors
                timeout)

            if self.monitor_out not in ioready:
                return self.restarts

            # Adjust the time left for us
            if timeout is not None:
                timeout = timeout - time.time() + start_time

            # each line now means "restart me!"
            line = self.monitor_out.readline()
            if line == 'MAXITERS\n':
                assert self.maxiters > 0
                assert not reached_maxiters
                reached_maxiters = 1
                continue
            if not line:
                # EOF is only expected after the monitor announced MAXITERS.
                if reached_maxiters:
                    break
                prodlib.log(
                    "EOF from monitor subprocess! (Subprocess died?) Exiting.")
                prodlib.log("Make sure you have monitor installed " +
                            "(google/bin/monitor)")
                sys.exit(1)
            assert not reached_maxiters

            # line (minus \n) is key to table
            printable_hostport = line[:-1]
            # Single parenthesised argument: prints the same text whether
            # print is a statement (Python 2) or a function (Python 3).
            print("Restarting %s" % printable_hostport)
            sys.stdout.flush()  # mix this with monitor's messages
            self.restartfns[printable_hostport]()
            original_hostport = self.originaldata[printable_hostport]
            self.restarts[original_hostport] = self.restarts.get(
                original_hostport, 0) + 1

        return self.restarts
Exemplo n.º 9
0
  def RemoveSets(self, srv_mgr, srvsetnums):
    """Trim server sets so each port keeps at most a given number of servers.

    Args:
      srv_mgr: server manager that owns the sets and performs the removals
      srvsetnums: 'setname:num' strings; for every port of the named set
        the servers located beyond the first num are removed.  Sets with
        the 'auto_assigned' property are left alone.
    Returns:
      (removed, []) where removed holds the string form of every removed
      server.
    """

    prodlib.log('Beginning RemoveSet:\n')
    removed = []

    for srvsetnum in srvsetnums:

      (srvset, num) = srvsetnum.split(':')
      num = int(num)
      srv_set = srv_mgr.Set(srvset)
      if srv_set.property('auto_assigned'): continue
      ports = srv_set.Ports()

      # For each port, cut off servers that are located > num.
      for port in ports:
        servers = srv_set.ServersForPort(port)
        cnt = len(servers) - num
        if cnt <= 0: continue

        # Snapshot the surplus servers (last one first) before removing
        # anything, so the loop is correct whether or not RemoveServer
        # mutates the list returned by ServersForPort.
        surplus = list(servers[-cnt:])
        surplus.reverse()
        for server in surplus:
          removed.append('%s' % server)
          prodlib.log(' Removed server %s' % server)
          srv_mgr.RemoveServer(server)

    prodlib.log('\nremoved="%s"' % ' '.join(removed))
    prodlib.log('\nFinished RemoveSet.')
    return (removed, [])
Exemplo n.º 10
0
def main(argv):
    """Parse the command-line options given in argv.

    Only the option parsing is visible in this block.  If getopt rejects
    the arguments, the error is logged and usage() is called.

    Args:
      argv: argument list handed unchanged to getopt.getopt()
    """
    import getopt
    import sys

    global send_mail, MAILTO  # so mail works regardless where we fail!

    send_mail = 0  # disable mail unless user wants it. It is enabled
    # by default only so we can catch major syntax errors.
    batch = 0  # assume we're running from command line

    # Long options accepted on the command line; a trailing '=' means the
    # option takes a value.
    long_options = [
        're=',
        'delay=',
        'ports=',
        'loop',
        'noexec',
        'nolock',
        'force_lock',
        'start=',
        'kill=',
        'mailto=',
        'nomail',
        'fromcron',
        'batch',
        'mach=',
        'nodataversion',
        'sets=',
        'sandbox=',
        'setpgrp',
        'kill_batch_size=',
        'nocheckpoint',
        'checkpoint_time=',
        'validate',
        'useinvalidconfig',
        'nolooprestarts',
        'corphack',
        'corptest',
        'babyalias=',
        'lockdir=',
        'maxiters=',
        'config_dir=',
        'restarts_file=',
        'nobabycheck',
        'ssh_user=',
        'nortsignals',
        'regtest',
    ]

    try:
        (optlist, args) = getopt.getopt(argv, 'n', long_options)
    except getopt.error:
        # Fetch the exception through sys.exc_info() so that this handler
        # parses on both Python 2 and Python 3.
        e = sys.exc_info()[1]
        prodlib.log("getopt error: %s" % e)
        usage()
Exemplo n.º 11
0
    def RemoveSets(self, srv_mgr, srvsetnums):
        """Trim server sets so each port keeps at most a given number of servers.

        Args:
          srv_mgr: server manager that owns the sets and performs the
            removals
          srvsetnums: 'setname:num' strings; for every port of the named
            set the servers located beyond the first num are removed.  Sets
            with the 'auto_assigned' property are left alone.
        Returns:
          (removed, []) where removed holds the string form of every
          removed server.
        """

        prodlib.log('Beginning RemoveSet:\n')
        removed = []

        for srvsetnum in srvsetnums:

            (srvset, num) = srvsetnum.split(':')
            num = int(num)
            srv_set = srv_mgr.Set(srvset)
            if srv_set.property('auto_assigned'): continue
            ports = srv_set.Ports()

            # For each port, cut off servers that are located > num.
            for port in ports:
                servers = srv_set.ServersForPort(port)
                cnt = len(servers) - num
                if cnt <= 0: continue

                # Snapshot the surplus servers (last one first) before
                # removing anything, so the loop is correct whether or not
                # RemoveServer mutates the list returned by ServersForPort.
                surplus = list(servers[-cnt:])
                surplus.reverse()
                for server in surplus:
                    removed.append('%s' % server)
                    prodlib.log(' Removed server %s' % server)
                    srv_mgr.RemoveServer(server)

        prodlib.log('\nremoved="%s"' % ' '.join(removed))
        prodlib.log('\nFinished RemoveSet.')
        return (removed, [])
Exemplo n.º 12
0
  def RemoveNum(self, srv_mgr, srvsetnums):
    """Remove a given number of servers from each named server set.

    Servers are taken from the highest index downwards and, within an
    index, from the last server backwards.

    Args:
      srv_mgr: server manager that owns the sets and performs the removals
      srvsetnums: 'setname:num' strings; num servers are removed from the
        named set.  Sets with the 'auto_assigned' property are left alone.
    Returns:
      (removed, []) where removed holds the string form of every removed
      server.
    Raises:
      Error: a server that would be removed shares its host with other
        servers.
    """

    prodlib.log('Beginning RemoveNum:\n')
    removed = []

    for srvsetnum in srvsetnums:

      (srvset, num) = srvsetnum.split(':')
      num = int(num)
      srv_set = srv_mgr.Set(srvset)
      if srv_set.property('auto_assigned'): continue
      indices = srv_set.Indices()[:]
      indices.reverse()

      for index in indices:
        servers = srv_set.ServersForIndex(index)[:]
        servers.reverse()
        # TODO: We may want to be less strict about not allowing
        # servers to be removed.
        for server in servers:
          # Stop before looking at the next server once the quota is met;
          # otherwise a server that is not going to be removed could still
          # trigger the multiple-roles error below.
          if num <= 0: break
          if len(srv_mgr.ServersForHost(server.host())) > 1:
            raise Error('%s has multiple roles' % server)
          removed.append('%s' % server)
          srv_mgr.RemoveServer(server)
          prodlib.log(' Removed server %s' % server)
          num = num - 1
        if num <= 0: break

    prodlib.log('\nremoved="%s"' % ' '.join(removed))
    prodlib.log('\nFinished RemoveNum.')
    return (removed, [])
Exemplo n.º 13
0
    def RemoveNum(self, srv_mgr, srvsetnums):
        """Remove a given number of servers from each named server set.

        Servers are taken from the highest index downwards and, within an
        index, from the last server backwards.

        Args:
          srv_mgr: server manager that owns the sets and performs the
            removals
          srvsetnums: 'setname:num' strings; num servers are removed from
            the named set.  Sets with the 'auto_assigned' property are left
            alone.
        Returns:
          (removed, []) where removed holds the string form of every
          removed server.
        Raises:
          Error: a server that would be removed shares its host with other
            servers.
        """

        prodlib.log('Beginning RemoveNum:\n')
        removed = []

        for srvsetnum in srvsetnums:

            (srvset, num) = srvsetnum.split(':')
            num = int(num)
            srv_set = srv_mgr.Set(srvset)
            if srv_set.property('auto_assigned'): continue
            indices = srv_set.Indices()[:]
            indices.reverse()

            for index in indices:
                servers = srv_set.ServersForIndex(index)[:]
                servers.reverse()
                # TODO: We may want to be less strict about not allowing
                # servers to be removed.
                for server in servers:
                    # Stop before looking at the next server once the quota
                    # is met; otherwise a server that is not going to be
                    # removed could still trigger the multiple-roles error.
                    if num <= 0: break
                    if len(srv_mgr.ServersForHost(server.host())) > 1:
                        raise Error('%s has multiple roles' % server)
                    removed.append('%s' % server)
                    srv_mgr.RemoveServer(server)
                    prodlib.log(' Removed server %s' % server)
                    num = num - 1
                if num <= 0: break

        prodlib.log('\nremoved="%s"' % ' '.join(removed))
        prodlib.log('\nFinished RemoveNum.')
        return (removed, [])
Exemplo n.º 14
0
def GetLocalDataFileInfo(mtype):
  """Return the local data file entries configured for mtype.

  Reads the 'local_data_files' property of mtype and fills in the default
  values for missing keys.

  Args:
    mtype: machine type whose property is looked up
  Returns:
    List of (srcpath, targetpath, files) tuples; srcpath and targetpath
    default to '' and files to ['*'].  The list is empty, and a message is
    logged, when the property has no entries.
  """
  entries = GetProperty(mtype, 'local_data_files')
  if not entries:
    prodlib.log('No local_data_files for %s.' % mtype)
    return []

  return [(entry.get('srcpath', ''),
           entry.get('targetpath', ''),
           entry.get('files', ['*']))
          for entry in entries]
Exemplo n.º 15
0
def GetLocalDataFileInfo(mtype):
    """Return the local data file entries configured for mtype.

    Reads the 'local_data_files' property of mtype and fills in the
    default values for missing keys.

    Args:
      mtype: machine type whose property is looked up
    Returns:
      List of (srcpath, targetpath, files) tuples; srcpath and targetpath
      default to '' and files to ['*'].  The list is empty, and a message
      is logged, when the property has no entries.
    """
    configured = GetProperty(mtype, 'local_data_files')
    if not configured:
        prodlib.log('No local_data_files for %s.' % mtype)
        return []

    file_info = []
    for entry in configured:
        srcpath = entry.get('srcpath', '')
        targetpath = entry.get('targetpath', '')
        files = entry.get('files', ['*'])
        file_info.append((srcpath, targetpath, files))
    return file_info
Exemplo n.º 16
0
 def handle_error(self, *info):
   """Record the current error in self.err_, log it and close.

   The prototype for handle_error is different for 2.x and 1.5:
     2.x: def handle_error (self)
     1.5: def handle_error (self, *info), with exception info in info

   Args:
     *info: (exc_type, exc_value, exc_traceback) when supplied by the
       caller; empty otherwise.
   """
   if info:
     # python 1.5: the exception details are handed in.
     (exc_type, exc_value, exc_traceback) = info
   else:
     # python 2.x: ask asyncore for the details.
     (_, exc_type, exc_value, exc_traceback) = asyncore.compact_traceback()
   # Drop the local reference to the traceback information; it is not
   # stored.
   del exc_traceback
   self.err_ = (exc_type, exc_value)
   prodlib.log("error encountered: %s-%s" % self.err_) # stderr logging!
   self.close()
Exemplo n.º 17
0
  def go (self, timeout = None):
    """Serve restart requests read from the monitor subprocess.

    Closes self.monitor_in (registration is finished) and then reads lines
    from self.monitor_out.  Every ordinary line is a host:port key: its
    function in self.restartfns is called and the restart is counted
    against the matching self.originaldata entry.  A 'MAXITERS' line
    followed by EOF ends the loop; EOF without it is treated as a dead
    monitor and exits the process.

    Args:
      timeout: seconds left to wait for monitor output, or None to wait
        without limit.
    Returns:
      self.restarts, which maps each original host:port to its number of
      restarts.
    """
    # done registering the machines
    if self.monitor_in is not None:
      self.monitor_in.close()
      self.monitor_in = None

    # if we have nothing to do just return
    if not self.restartfns:
      return self.restarts

    reached_maxiters = 0
    # Test for None first: ordering None against a number only happens to
    # work on Python 2 and raises TypeError on Python 3.
    while timeout is None or timeout > 0:
      start_time = time.time()
      # Wait at most the remaining timeout for the monitor to speak.
      (ioready, _, _) = select.select([self.monitor_out],
                                      [],    # don't care about "write"-s
                                      [],    # ... or errors
                                      timeout)

      if self.monitor_out not in ioready:
        return self.restarts

      # Adjust the time left for us
      if timeout is not None:
        timeout = timeout - time.time() + start_time

      line = self.monitor_out.readline() # each line now means "restart me!"
      if line == 'MAXITERS\n':
        assert self.maxiters > 0
        assert not reached_maxiters
        reached_maxiters = 1
        continue
      if not line:
        # EOF is only expected after the monitor announced MAXITERS.
        if reached_maxiters:
          break
        prodlib.log("EOF from monitor subprocess! (Subprocess died?) Exiting.")
        prodlib.log("Make sure you have monitor installed " +
                    "(google/bin/monitor)")
        sys.exit(1)
      assert not reached_maxiters

      printable_hostport = line[:-1]     # line (minus \n) is key to table
      # Single parenthesised argument: prints the same text whether print
      # is a statement (Python 2) or a function (Python 3).
      print("Restarting %s" % printable_hostport)
      sys.stdout.flush()                 # mix this with monitor's messages
      self.restartfns[printable_hostport]()
      original_hostport = self.originaldata[printable_hostport]
      self.restarts[original_hostport] = (
          self.restarts.get(original_hostport, 0) + 1)

    return self.restarts
Exemplo n.º 18
0
 def handle_error(self, *info):
     """Record the current error in self.err_, log it and close.

     The prototype for handle_error is different for 2.x and 1.5:
       2.x: def handle_error (self)
       1.5: def handle_error (self, *info), with exception info in info

     Args:
       *info: (exc_type, exc_value, exc_traceback) when supplied by the
         caller; empty otherwise.
     """
     if info:
         # python 1.5: the exception details are handed in.
         (exc_type, exc_value, exc_traceback) = info
     else:
         # python 2.x: ask asyncore for the details.
         details = asyncore.compact_traceback()
         (_, exc_type, exc_value, exc_traceback) = details
     # Drop the local reference to the traceback information; it is not
     # stored.
     del exc_traceback
     self.err_ = (exc_type, exc_value)
     prodlib.log("error encountered: %s-%s" % self.err_)  # stderr logging!
     self.close()
Exemplo n.º 19
0
  def __call__(self, port, hint=None):
    """Map a port into its associated server type.

    Answers are cached in self.port_to_type.

    Args:
      port: port number to classify
      hint: optional server type to try first.  If port lies within that
        type's bounds in self.type_to_bounds the search is skipped.
    Returns:
      The server type, or 'unknown' (with a logged warning) when the port
      is below the first lower bound or its lower bound has no type.
    """

    mtype = self.port_to_type.get(port, None)
    if mtype is not None:
      return mtype

    # The hint optional parameter speeds up code in serverlib.py, because it
    # runs through ports in sorted order, and some mtypes have many shards.
    if hint is not None:
      (lower_bound, upper_bound) = self.type_to_bounds[hint]
      if lower_bound <= port < upper_bound:
        self.port_to_type[port] = hint
        return hint

    if self.bounds_array is None:
      self.initialize_search()

    if port < self.bounds_array[0]:  # first, a sanity check
      prodlib.log('WARNING: returning unknown servertype for: %d' % port)
      return 'unknown'

    # Binary search for the largest lower bound that is <= port.
    # bisect_right gives the insertion point after any entries equal to
    # port, so the entry just before it is the one we want.
    import bisect
    index = bisect.bisect_right(self.bounds_array, port) - 1
    lower_bound = self.bounds_array[index]
    mtype = self.lower_bound_to_type.get(lower_bound, 'unknown')
    self.port_to_type[port] = mtype

    if mtype == 'unknown':
      prodlib.log('WARNING: returning unknown servertype for: %d' % port)
    return mtype
Exemplo n.º 20
0
    def __call__(self, port, hint=None):
        """Map a port into its associated server type.

        Answers are cached in self.port_to_type.

        Args:
          port: port number to classify
          hint: optional server type to try first.  If port lies within
            that type's bounds in self.type_to_bounds the search is
            skipped.
        Returns:
          The server type, or 'unknown' (with a logged warning) when the
          port is below the first lower bound or its lower bound has no
          type.
        """

        mtype = self.port_to_type.get(port, None)
        if mtype is not None:
            return mtype

        # The hint optional parameter speeds up code in serverlib.py, because it
        # runs through ports in sorted order, and some mtypes have many shards.
        if hint is not None:
            (lower_bound, upper_bound) = self.type_to_bounds[hint]
            if lower_bound <= port < upper_bound:
                self.port_to_type[port] = hint
                return hint

        if self.bounds_array is None:
            self.initialize_search()

        if port < self.bounds_array[0]:  # first, a sanity check
            prodlib.log('WARNING: returning unknown servertype for: %d' % port)
            return 'unknown'

        # Binary search for the largest lower bound that is <= port.
        # bisect_right gives the insertion point after any entries equal to
        # port, so the entry just before it is the one we want.
        import bisect
        index = bisect.bisect_right(self.bounds_array, port) - 1
        lower_bound = self.bounds_array[index]
        mtype = self.lower_bound_to_type.get(lower_bound, 'unknown')
        self.port_to_type[port] = mtype

        if mtype == 'unknown':
            prodlib.log('WARNING: returning unknown servertype for: %d' % port)
        return mtype
Exemplo n.º 21
0
def AsynRequest(hostportlist, request, timeout, retrycnt=3, half_shutdown=1):
  """Send request to every hostport, retrying the failed ones with backoff.

  Args:
    hostportlist: hostports to contact; each one is handed to AsynClient
      and used as a dictionary key
    request: request handed to every AsynClient
    timeout: timeout handed to loop()
    retrycnt: a failed hostport is tried again while it has been attempted
      fewer than retrycnt times
    half_shutdown: forwarded to AsynClient
  Returns:
    clients.values(): the most recently created AsynClient of each hostport
  """
  attempts = {}
  clients = {}
  backoff = 1

  pending = hostportlist
  while pending:
    # Start every round with a fresh asyncore socket map.  Otherwise the
    # socket of a dead server would stay in the map for all the following
    # retry attempts.
    asyncore.socket_map = {}

    # Creating an AsynClient creates a socket and updates the map, so that
    # the socket can be polled during the next loop() call.
    for hostport in pending:
      client = AsynClient(hostport, request,
                          half_shutdown=half_shutdown)  # connect
      clients[hostport] = client
      attempts[hostport] = attempts.get(hostport, 0) + 1  # update counter

    # enter the select loop
    loop(timeout)

    # Collect the clients that failed and may still be retried.
    pending = []
    for hostport, client in clients.items():
      if not client.failed():
        continue
      if attempts[hostport] >= retrycnt:
        continue
      # Some error occurred. Put it back in the list and try again
      prodlib.log("Error on %s port %s: %s %s" % (hostport + client.err_))
      pending.append(hostport)

    if not pending:
      break

    # Retries are needed: wait for better times, backing off exponentially.
    time.sleep(backoff)
    if backoff < 20:
      backoff = backoff * 2

  return clients.values()
Exemplo n.º 22
0
def AsynRequest(hostportlist, request, timeout, retrycnt=3, half_shutdown=1):
    """Send request to every hostport, retrying the failed ones with backoff.

    Args:
      hostportlist: hostports to contact; each one is handed to AsynClient
        and used as a dictionary key
      request: request handed to every AsynClient
      timeout: timeout handed to loop()
      retrycnt: a failed hostport is tried again while it has been
        attempted fewer than retrycnt times
      half_shutdown: forwarded to AsynClient
    Returns:
      clients.values(): the most recently created AsynClient of each
      hostport
    """
    tries = {}
    clients = {}
    wait = 1

    todo = hostportlist
    while todo:
        # Start every round with a fresh asyncore socket map.  Otherwise
        # the socket of a dead server would stay in the map for all the
        # following retry attempts.
        asyncore.socket_map = {}

        # Creating an AsynClient creates a socket and updates the map, so
        # that the socket can be polled during the next loop() call.
        for hostport in todo:
            clients[hostport] = AsynClient(
                hostport, request, half_shutdown=half_shutdown)  # connect
            tries[hostport] = tries.get(hostport, 0) + 1  # update counter

        # enter the select loop
        loop(timeout)

        # Collect the clients that failed and may still be retried.
        todo = []
        for hostport, client in clients.items():
            if not client.failed():
                continue
            if tries[hostport] >= retrycnt:
                continue
            # Some error occurred. Put it back in the list and try again
            prodlib.log("Error on %s port %s: %s %s" %
                        (hostport + client.err_))
            todo.append(hostport)

        if not todo:
            break

        # Retries are needed: wait for better times, backing off
        # exponentially.
        time.sleep(wait)
        if wait < 20:
            wait = wait * 2

    return clients.values()
Exemplo n.º 23
0
    def Swap(self, srv_mgr, src, dst):
        """Move every server matching src onto dst.

        Servers with the 'auto_assigned' property are skipped.  Each moved
        server is verified against the constraints and the first error, if
        any, is logged as a warning.

        Args:
          srv_mgr: server manager that owns the servers
          src: spec handed to srv_mgr.ServersForSpec() to select servers
          dst: replacement handed to srv_mgr.ReplaceServer()
        Returns:
          (added, removed, []) where removed holds each server's string
          form before the replacement and added its string form afterwards.
        """

        cnstr_mgr = srv_mgr.constraint_mgr()

        added = []
        removed = []

        prodlib.log('Beginning Swap:\n')
        # Iterate over a copy: ReplaceServer modifies the manager, and the
        # list returned by ServersForSpec may change underneath the loop.
        for server in srv_mgr.ServersForSpec(src)[:]:
            if server.property('auto_assigned'): continue
            prodlib.log(' Swapping %s with %s' % (server, dst))
            removed.append('%s' % server)
            srv_mgr.ReplaceServer(server, dst)
            added.append('%s' % server)
            results = cnstr_mgr.VerifyServer(srv_mgr, server, errors_only=1)
            if results: prodlib.log(' WARNING: %s' % results[0])
        prodlib.log('\nFinished Swap.')

        return (added, removed, [])
Exemplo n.º 24
0
  def Swap(self, srv_mgr, src, dst):
    """Move every server matching src onto dst.

    Servers with the 'auto_assigned' property are skipped.  Each moved
    server is verified against the constraints and the first error, if any,
    is logged as a warning.

    Args:
      srv_mgr: server manager that owns the servers
      src: spec handed to srv_mgr.ServersForSpec() to select servers
      dst: replacement handed to srv_mgr.ReplaceServer()
    Returns:
      (added, removed, []) where removed holds each server's string form
      before the replacement and added its string form afterwards.
    """

    cnstr_mgr = srv_mgr.constraint_mgr()

    added = []
    removed = []

    prodlib.log('Beginning Swap:\n')
    # Iterate over a copy: ReplaceServer modifies the manager, and the list
    # returned by ServersForSpec may change underneath the loop.
    for server in srv_mgr.ServersForSpec(src)[:]:
      if server.property('auto_assigned'): continue
      prodlib.log(' Swapping %s with %s' % (server, dst))
      removed.append('%s' % server)
      srv_mgr.ReplaceServer(server, dst)
      added.append('%s' % server)
      results = cnstr_mgr.VerifyServer(srv_mgr, server, errors_only=1)
      if results: prodlib.log(' WARNING: %s' % results[0])
    prodlib.log('\nFinished Swap.')

    return (added, removed, [])
Exemplo n.º 25
0
    def Add(self, srv_mgr, names):
        """Create a server for each name and add it to the server manager.

        Every added server is verified against the constraints; the first
        error, if any, is logged as a warning.

        Args:
          srv_mgr: server manager the servers are added to
          names: server names, each used to initialise a serverlib.Server
        Returns:
          (added, []) where added holds the string form of every added
          server.
        Raises:
          Error: a server has the 'auto_assigned' property.  The check runs
            after the server has been handed to srv_mgr.AddServer().
        """
        constraints = srv_mgr.constraint_mgr()
        added = []

        prodlib.log('Beginning Add:\n')
        for name in names:
            new_server = serverlib.Server()
            new_server.InitFromName(name)
            prodlib.log(' Adding %s' % new_server)
            srv_mgr.AddServer(new_server)
            if new_server.property('auto_assigned'):
                raise Error('Cannot add auto assigned server: %s' % new_server)
            added.append('%s' % new_server)

            problems = constraints.VerifyServer(
                srv_mgr, new_server, errors_only=1)
            if problems:
                prodlib.log(' WARNING: %s' % problems[0])
        prodlib.log('\nFinished Add.')

        return (added, [])
Exemplo n.º 26
0
    def Remove(self, srv_mgr, names):
        """Remove the servers matching each name from the server manager.

        Servers with the 'auto_assigned' property are left in place.  A
        warning is logged for a name that matches no server.

        Args:
          srv_mgr: server manager the servers are removed from
          names: specs handed to srv_mgr.ServersForSpec()
        Returns:
          (removed, []) where removed holds the string form of every
          removed server.
        """
        removed = []

        prodlib.log('Beginning Removal:\n')
        for name in names:
            # Work on a copy since the servers for host will be internally
            # modified.
            matches = srv_mgr.ServersForSpec(name)[:]
            if not matches:
                prodlib.log('WARNING: no servers matched: %s' % name)

            for server in matches:
                if not server.property('auto_assigned'):
                    prodlib.log(' Removing %s' % server)
                    srv_mgr.RemoveServer(server)
                    removed.append('%s' % server)
        prodlib.log('\nFinished Removal.')

        return (removed, [])
Exemplo n.º 27
0
  def Remove(self, srv_mgr, names):
    """Remove the servers matching each name from the server manager.

    Servers with the 'auto_assigned' property are left in place.  A warning
    is logged for a name that matches no server.

    Args:
      srv_mgr: server manager the servers are removed from
      names: specs handed to srv_mgr.ServersForSpec()
    Returns:
      (removed, []) where removed holds the string form of every removed
      server.
    """
    removed = []

    prodlib.log('Beginning Removal:\n')
    for name in names:
      # Work on a copy since the servers for host will be internally
      # modified.
      matched = srv_mgr.ServersForSpec(name)[:]
      if not matched:
        prodlib.log('WARNING: no servers matched: %s' % name)

      for server in matched:
        if not server.property('auto_assigned'):
          prodlib.log(' Removing %s' % server)
          srv_mgr.RemoveServer(server)
          removed.append('%s' % server)
    prodlib.log('\nFinished Removal.')

    return (removed, [])
Exemplo n.º 28
0
  def Add(self, srv_mgr, names):
    """Create a server for each name and add it to the server manager.

    Every added server is verified against the constraints; the first
    error, if any, is logged as a warning.

    Args:
      srv_mgr: server manager the servers are added to
      names: server names, each used to initialise a serverlib.Server
    Returns:
      (added, []) where added holds the string form of every added server.
    Raises:
      Error: a server has the 'auto_assigned' property.  The check runs
        after the server has been handed to srv_mgr.AddServer().
    """
    constraints = srv_mgr.constraint_mgr()
    added = []

    prodlib.log('Beginning Add:\n')
    for name in names:
      new_server = serverlib.Server()
      new_server.InitFromName(name)
      prodlib.log(' Adding %s' % new_server)
      srv_mgr.AddServer(new_server)
      if new_server.property('auto_assigned'):
        raise Error('Cannot add auto assigned server: %s' % new_server)
      added.append('%s' % new_server)

      problems = constraints.VerifyServer(srv_mgr, new_server, errors_only=1)
      if problems:
        prodlib.log(' WARNING: %s' % problems[0])
    prodlib.log('\nFinished Add.')

    return (added, [])
Exemplo n.º 29
0
  def _AllocateHostFromFreePool(self, srv_mgr, server, pool,
                                free_dict, force=0, exclude=None):
    """
    Allocate a machine for the passed in server from specific pool.

    Candidate hosts are the compatible hosts already in use (only when
    self._used is set) plus one representative of every machine class in
    free_dict.  Each candidate is tried on the server, verified against
    the constraints and weighted; the highest-weighted candidate wins.

    Args:
      srv_mgr: server manager; supplies the constraint manager and is used
        to replace the server's host
      server: server to allocate a host for; its host is replaced while
        candidates are ranked
      pool: pool name, used only in the log message
      free_dict: { 'free_class' : ['mach'] }; the allocated host is removed
        from it, and a class left without hosts is deleted
      force: passed through to VerifyServer
      exclude: optional hosts to leave out, in addition to self._exclude
    Returns:
      server with its newly allocated host, or None when no candidate
      passed verification (the original host is restored first).
    """

    cnstr_mgr = srv_mgr.constraint_mgr()

    # Find currently used compatible machines.
    used_hosts = {}
    if self._used:
      used_hosts = cnstr_mgr.Constraint('sharing').CompatibleHosts(srv_mgr,
                                                                   server)

    # Find free machine set from the free_dict - we get one of each class
    # of machines and save the others in its class.  We know that we can
    # rank members of the same class with the same score.
    # free maps the representative host (first of its class) to the full
    # host list of that class.
    free = {}
    for (machclass, hosts) in free_dict.items():
      if not hosts: continue
      free[hosts[0]] = hosts

    prodlib.log(' Allocating server for %s (used=%d, free=%d, pool=%s)' % \
          (server, len(used_hosts), len(free), pool))

    hosts = used_hosts.keys() + free.keys()
    # Exclude optional excludes.
    if self._exclude: hosts = setlib.diff(hosts, self._exclude)
    # Exclude locally specified excludes.
    if exclude: hosts = setlib.diff(hosts, exclude)
    # Randomize the candidate order; the sort below is by weight only.
    random.shuffle(hosts)

    # Save the original host.
    orig_host = server.host()

    results = []
    # NOTE(review): failed is appended to below but never read in this
    # method.
    failed = []

    # Assign weights and prune out ones that don't fit.
    for host in hosts:
      # Replace the server's host with the candidate host and verify.
      if self._verbose: prodlib.log('    ranking candidate: %s' % host)
      srv_mgr.ReplaceServer(server, host)
      servers = [server] + used_hosts.get(host, [])
      ver_results = cnstr_mgr.VerifyServer(srv_mgr, server,
                                           servers=servers,
                                           force=force)

      if self._verbose:
        for res in ver_results:
          if res.error(): status = 'fail'
          else: status = 'ok'
          prodlib.log('      %s: %s' % (status, res))

      # Only the last verification result decides whether the candidate
      # is rejected.
      if ver_results[-1].error():
        failed.append(host)
      else:
        # Compute total weight assigned to machine.
        weight = 0.0
        for res in ver_results:
          weight = weight + res.weight()
        if self._verbose:
          prodlib.log('      weight: %.2f' % weight)
        # Append results for machine.  We augment the hosts with
        # free machines of the same class since these should receive
        # the same score.
        if free.has_key(host):
          for free_host in free[host]:
            results.append((free_host, weight))
        else:
          results.append((host, weight))

    # Sort the results by highest weight to find the best candidate.
    results.sort(lambda x,y: -cmp(x[1], y[1]))

    # This loop returns during its first iteration, so only the
    # top-ranked candidate is ever used.
    for (host, weight) in results:

      prodlib.log(' Trying %s (%.2f)' % (host, weight))

      # Set server to new host.
      srv_mgr.ReplaceServer(server, host)

      if not used_hosts.has_key(host):
        # Remove the machine from the free list if necessary.
        key = self._mach_mgr.Machine(host).ClassString()
        hosts = free_dict[key]
        hosts.remove(host)
        if hosts == []: del(free_dict[key])

      # Return the server with its newly allocated host.
      prodlib.log(' Allocated %s (%.2f)' % (server, weight))
      return server

    # Failed so replace old host.
    srv_mgr.ReplaceServer(server, orig_host)

    prodlib.log(' Unable to allocate server.')
    return None
Exemplo n.º 30
0
    def AddSets(self, srv_mgrs, srvsetnums, do_min=0):
        """Add servers to server sets until each shard has enough of them.

        Args:
          srv_mgrs: a server manager or a list of server managers.  The
            first one supplies the constraints used to work out defaults.
          srvsetnums: ['set', 'set:count', 'set:min,max', ...] - sets to
            grow.  If empty, every set with a 'shardlen' constraint is
            used.  A set name starting with '+' is treated as the balancer
            set of the set named by the rest of the string.
          do_min: if true grow shards to the minimum count rather than the
            maximum.
        Returns:
          (added, failed, tried, free) where added names the servers that
          received a host, failed names the servers needed to reach the
          minimum that could not be placed, tried names the optional
          servers that could not be placed, and free is the result of
          self._RemainingFree().
        Raises:
          Error: a balancer set was requested in a server manager that does
            not contain the set it balances.
        """

        # Allow interface to take a single server manager.
        if not isinstance(srv_mgrs, types.ListType):
            srv_mgrs = [srv_mgrs]

        # The first server manager supplies the constraints used to work
        # out which sets to process and how long their shards should be.
        srv_mgr = srv_mgrs[0]
        cnstr_mgr = srv_mgr.constraint_mgr()

        # Set up the sets to process from constraints if not specified.
        if not srvsetnums:
            wanted = {}

            # First iterate through all existing sets and add those sets
            # that have a shard length constraint.  Note that if the shardlen
            # constraint is specified in defaults this will add all currently
            # created sets.
            for srv_set in srv_mgr.Sets():
                if srv_set.property('auto_assigned'): continue
                if cnstr_mgr.Constraint('shardlen').Constraint(srv_set.name()):
                    wanted[srv_set.name()] = 1

            # There may be some sets specified that are not currently in
            # existence.  Iterate through the list of explicitly specified
            # shardlen constraints and add sets for their types.
            for name in cnstr_mgr.Constraint('shardlen').server_sets():
                wanted[name] = 1

            srvsetnums = wanted.keys()
            srvsetnums.sort()

        # Normalize every entry to 'set:counts', looking the counts up in
        # the shardlen constraint when the caller gave only a set name.
        expanded = []
        for srvsetnum in srvsetnums:
            if string.find(srvsetnum, ':') == -1:
                # TODO: Right now we're storing constraint descs in the constraint
                # manager.  This will move in another checkin to the server sets
                # and will be more readable when accessing.
                vals = cnstr_mgr.Constraint('shardlen').Constraint(srvsetnum)
                # If no shardlen constrs were specified for this type then ignore.
                if not vals:
                    prodlib.log(
                        'Cannot add sets for %s - no shardlen constraint' %
                        srvsetnum)
                    continue
                srvsetnum = '%s:%s,%s' % (srvsetnum, vals[0], vals[1])
            expanded.append(srvsetnum)
        srvsetnums = expanded

        prodlib.log('Beginning AddSet:\n')

        added = []
        failed = []
        tried = []

        for srvsetnum in srvsetnums:

            # 'set:min,max' or 'set:count' (count is both min and max).
            (srvset, counts) = string.split(srvsetnum, ':')
            counts = string.split(counts, ',')
            min_cnt = int(counts[0])
            if len(counts) == 2:
                max_cnt = int(counts[1])
            else:
                max_cnt = min_cnt

            if do_min:
                cnt = min_cnt
            else:
                cnt = max_cnt

            # For balancer sets, we add these with the same port range as the
            # balanced set.
            if srvset[0] == '+':
                for srv_mgr in srv_mgrs:
                    balset = srv_mgr.Set(srvset[1:])
                    if not balset:
                        raise Error(
                            'Cannot add set %s: no balanced set' % srvset)
                    new_set = srv_mgr.AddSet(srvset, balset.level())
                    # Ensure port ranges are matched to balanced set.
                    for port in balset.Ports():
                        new_set.AddPort(port)

            # Build union of all ports for this type.
            ports = {}
            for srv_mgr in srv_mgrs:
                cur_set = srv_mgr.Set(srvset)
                if cur_set is not None:
                    for port in cur_set.Ports():
                        ports[port] = 1
            ports = ports.keys()
            ports.sort()

            # Iterate up the slices so that we replace short shards first.
            for i in range(cnt + 1):

                # To check if all allocations failed on the ports.
                every_port_failed = 1

                # For each port check if this shard is short.
                for port in ports:
                    for srv_mgr in srv_mgrs:

                        cur_set = srv_mgr.Set(srvset)
                        # Not every server manager has to contain this set
                        # (the port union above allows for that too), so
                        # skip managers without it instead of failing.
                        if cur_set is None: continue
                        if cur_set.property('auto_assigned'): continue
                        if port not in cur_set.Ports(): continue

                        # If we have enough servers just skip.
                        have = len(cur_set.ServersForPort(port))
                        if have >= i:
                            every_port_failed = 0
                            continue

                        # Create a placeholder server object.
                        server = serverlib.Server()
                        server.InitFromName('%s%s:%s' % (srvset, i, port))
                        srv_mgr.AddServer(server)
                        # Try and allocate a host for it.  Servers needed
                        # to reach the minimum are allocated with force.
                        force = 0
                        if have < min_cnt: force = 1
                        if self._AllocateHost(srv_mgr, server, force):
                            every_port_failed = 0
                            added.append('%s' % server)
                        else:
                            # Failed so remove it from the map.
                            srv_mgr.RemoveServer(server)
                            if force:
                                failed.append('%s' % server)
                            else:
                                tried.append('%s' % server)

                # Break if we tried every port for this server manager and
                # could not allocate anything.
                if every_port_failed: break

        prodlib.log('\nsuccess="%s"' % string.join(added))
        prodlib.log('failed="%s"' % string.join(failed))
        prodlib.log('tried="%s"' % string.join(tried))
        if self._free:
            prodlib.log('free="%s"' % string.join(self._RemainingFree()))
        return (added, failed, tried, self._RemainingFree())
Exemplo n.º 31
0
def ComputeServers(config, types, machine_re,
                   restrictports, myhostname, excluded, do_ckpt, ckpt_time,
                   sets, restrict_servers=None, ssh_user=None):

  srv_mgr = config.GetServerManager()
  servers = srv_mgr.Servers(wanted_sets=types, wanted_ports=restrictports,
                            wanted_indices=sets)
  computed_servers = []

  for server in servers:

    set = srv_mgr.Set(server.srvset())
    host = server.host()
    port = server.port()

    if restrict_servers and not restrict_servers.get(str(server), 0):
      continue

    mtype = server.servertype()

    if not WantedServer(host, port, mtype, config,
                        machine_re, myhostname, excluded):
      # Server has not been selected for inclusion.
      continue

    # Allow overriding ssh_user.
    if server.property('ssh_user'):
      ssh_user = server.property('ssh_user')

    # For AM transition, if there is no binary user set, then
    # do not use the ssh_user so we can still go on as root.
    # We will run the babysitter as root and it will ssh into
    # machines with binary_user set as prodsetup, and it will
    # ssh into machines without binary_user set as root.
    # TODO: Remove this when we are finished with conversion.
    binary_user = set.property('binary_user')
    if not binary_user or binary_user == 'root':
      ssh_user = None

    # Print out an informational string for the user.
    set_str = ''
    if sets: set_str = ' - set %s' % server.index()
    print "Checking %s:%d (%s%s)" % \
          (host, servertype.GetServingPort(port), mtype, set_str)

    safe_start_time = server.index() * server.property('inter_set_delay')

    try:
      hostip = socket.gethostbyname(host)
    except socket.error, e:
      prodlib.log("DNS error for %s: %s. Skipping." % (host, e))
      continue

    # Form restart closure
    restartfn = lambda f=server.Start, m=print_only(), u=ssh_user: \
      f(m, u)

    # Form kill closure
    killfn = lambda f=server.Stop, u=ssh_user, ck=do_ckpt, ct=ckpt_time: \
      f(2, u, ck, ct)

    server.set_property('hostip', hostip)
    server.set_property('safe_start_time', safe_start_time)
    server.set_property('restartfn', restartfn)
    server.set_property('killfn', killfn)

    computed_servers.append(server)
Exemplo n.º 32
0
    def _AllocateHostFromFreePool(self,
                                  srv_mgr,
                                  server,
                                  pool,
                                  free_dict,
                                  force=0,
                                  exclude=None):
        """Allocate a machine for the passed in server from a specific pool.

        Args:
          srv_mgr: server manager that contains the server.
          server: server to place.  Its host is replaced in srv_mgr.
          pool: pool name, used in the log message only.
          free_dict: { 'machine_class': ['host', ...] } - free machines of
            the pool.  A free host that gets allocated is removed from it.
          force: passed through to the constraint manager's VerifyServer.
          exclude: optional hosts that must not be used as candidates, in
            addition to self._exclude.
        Returns:
          The server, now placed on its new host, or None when no candidate
          passed verification (the server is put back on its old host).
        """

        cnstr_mgr = srv_mgr.constraint_mgr()

        # Find currently used compatible machines.
        used_hosts = {}
        if self._used:
            used_hosts = cnstr_mgr.Constraint('sharing').CompatibleHosts(
                srv_mgr, server)

        # Find free machine set from the free_dict - we get one of each class
        # of machines and save the others in its class.  We know that we can
        # rank members of the same class with the same score.
        #
        # Exclusions are applied to every class before its representative
        # is picked.  Otherwise a class whose first host is excluded would
        # be dropped even though other members are allowed, and excluded
        # members would be offered as candidates alongside an allowed
        # representative.
        free = {}
        for class_hosts in free_dict.values():
            if self._exclude:
                class_hosts = setlib.diff(class_hosts, self._exclude)
            if exclude:
                class_hosts = setlib.diff(class_hosts, exclude)
            if not class_hosts: continue
            free[class_hosts[0]] = class_hosts

        prodlib.log(' Allocating server for %s (used=%d, free=%d, pool=%s)' % \
              (server, len(used_hosts), len(free), pool))

        hosts = used_hosts.keys() + free.keys()
        # Exclude optional excludes.
        if self._exclude: hosts = setlib.diff(hosts, self._exclude)
        # Exclude locally specified excludes.
        if exclude: hosts = setlib.diff(hosts, exclude)
        random.shuffle(hosts)

        # Save the original host.
        orig_host = server.host()

        results = []
        failed = []

        # Assign weights and prune out ones that don't fit.
        for host in hosts:
            # Replace the server's host with the candidate host and verify.
            if self._verbose: prodlib.log('    ranking candidate: %s' % host)
            srv_mgr.ReplaceServer(server, host)
            servers = [server] + used_hosts.get(host, [])
            ver_results = cnstr_mgr.VerifyServer(srv_mgr,
                                                 server,
                                                 servers=servers,
                                                 force=force)

            if self._verbose:
                for res in ver_results:
                    if res.error(): status = 'fail'
                    else: status = 'ok'
                    prodlib.log('      %s: %s' % (status, res))

            # The last verification result decides whether the host fits.
            if ver_results[-1].error():
                failed.append(host)
            else:
                # Compute total weight assigned to machine.
                weight = 0.0
                for res in ver_results:
                    weight = weight + res.weight()
                if self._verbose:
                    prodlib.log('      weight: %.2f' % weight)
                # Append results for machine.  We augment the hosts with
                # free machines of the same class since these should receive
                # the same score.
                if free.has_key(host):
                    for free_host in free[host]:
                        results.append((free_host, weight))
                else:
                    results.append((host, weight))

        # Sort the results by highest weight to find the best candidate.
        results.sort(lambda x, y: -cmp(x[1], y[1]))

        if not results:
            # Failed so replace old host.
            srv_mgr.ReplaceServer(server, orig_host)

            prodlib.log(' Unable to allocate server.')
            return None

        # The best ranked candidate is taken.
        (host, weight) = results[0]

        prodlib.log(' Trying %s (%.2f)' % (host, weight))

        # Set server to new host.
        srv_mgr.ReplaceServer(server, host)

        if not used_hosts.has_key(host):
            # Remove the machine from the free list if necessary.
            key = self._mach_mgr.Machine(host).ClassString()
            class_hosts = free_dict[key]
            class_hosts.remove(host)
            if class_hosts == []: del (free_dict[key])

        # Return the server with its newly allocated host.
        prodlib.log(' Allocated %s (%.2f)' % (server, weight))
        return server
Exemplo n.º 33
0
def DoBabysit(servers, config, maxiters=0, monitor_port_increment=0,
              extra_restarts=None, restart_requests=None, nolooprestarts=0,
              nortsignals=0, succinterval=None, failinterval=None):
  """Monitor the given servers and restart the ones that need it.

  Args:
    servers: servers to register with the monitor event loop.  Servers with
      a port >= 65536 or with the 'skip_babysitting' property are skipped.
    config: passed to BabysitterArgvChecker.
    maxiters: passed to monitor_event_loop.
    monitor_port_increment: added to the 'monitor' port base to form the
      monitor's --status_port.
    extra_restarts: servers to restart explicitly, each once its
      'safe_start_time' has elapsed.  Only acted on when restart_requests
      is also given.
    restart_requests: object whose MarkRestarted() is called with the name
      of each explicitly restarted server.
    nolooprestarts: if true, no servers are registered with the event loop
      and the monitor command is replaced by a sleep covering the largest
      'safe_start_time' of extra_restarts.  The process exits if there are
      no extra_restarts.
    nortsignals: if true, --nortsignals is appended to the monitor command.
    succinterval: if set, appended to the monitor command as --succinterval.
    failinterval: if set, appended to the monitor command as --failinterval.
  Returns:
    List of the servers looked up for the host/ports reported by the argv
    checker and, unless print_only() is set, by the event loop.
  """

  # map from (host, port) to the corresponding server . We need this
  # because el.go returns us results in form of host ports while we
  # want servers.
  hostport_srvinfo_map = {}

  if nolooprestarts:
    if extra_restarts:
      # Sleep long enough to cover the latest safe start time.
      sleep_time = max(map(lambda m: m.property('safe_start_time'),
                           extra_restarts))
    else:
      prodlib.log('Babysitter loop has nothing to do.. exiting')
      sys.exit()
    # endif
    # NOTE(review): the trailing '#' presumably comments out the flags
    # appended below when the command is run by a shell - confirm.
    monitor_command = "sleep %d # " % sleep_time
  else:
    monitor_command = "%s/google/bin/monitor --status_port=%s" % (
                             sitecustomize.GOOGLEBASE,
                             servertype.GetPortBase('monitor') +
                             monitor_port_increment)
  # endif

  # Optional monitor flags.
  if nortsignals:
    monitor_command = monitor_command + " --nortsignals"
  if succinterval:
    monitor_command = monitor_command + " --succinterval=%s" % succinterval
  if failinterval:
    monitor_command = monitor_command + " --failinterval=%s" % failinterval

  el = monitor_event_loop(monitor_command, maxiters)

  for server in servers:
    # Cannot babysit virtual servers (ports >= 65536)
    if server.port() >= 65536:
      continue
    if server.property('skip_babysitting'):
      print "WARNING: babysitting disabled for %s:%s" % (server.host(),
                                                         server.port())
      continue

    if not nolooprestarts:

      datadir = server.datadir()
      if datadir is None: datadir = ''
      query = server.property('request_info')
      # some queries require dataversion. Right now dataversion is
      # the datadir for all servers.
      query = query % {'dataversion': datadir}

      # Register the server with the event loop and remember it by
      # (host, port) so reported host/ports can be mapped back to it.
      el.register((server.host(), servertype.GetServingPort(server.balport()),
                   server.port(), server.property('hostip'),
                   server.property('restartfn')),
                   query, server.property('response_len'),
                   server.property('test_timeouts'))
      host_port = (server.host(), server.port())
      hostport_srvinfo_map[host_port] = server
    # endif

  # Explicit restarts are only performed when there is a restart_requests
  # object to record them in.
  if extra_restarts and restart_requests:
    start_time = time.time()
    for server in extra_restarts:
      # Ensure that enough seconds have passed since we started.
      passed_time = time.time() - start_time
      if server.property('safe_start_time') > passed_time:
        delay = server.property('safe_start_time') - passed_time
        print "Spending  %s seconds monitoring." % delay
        # Keep monitoring while waiting, unless this is a dry run.
        if not print_only():
          el.go(timeout=delay)
        else:
          print "Actually simulating a sleep of %s" % delay

      # Call restart function.
      server.property('restartfn')()
      # And mark this guy restarted
      restart_requests.MarkRestarted(str(server))

  # Start the actual babysitter.
  restarted_srv_list = []
  argv_checker = babysitter_argv_checker.BabysitterArgvChecker(config)
  logging.info("%s %s: Restarting servers whose argv has changed. "
                   % (time.ctime(), time.tzname[0]))
  hostports = argv_checker.RestartIfArgvChanged()
  # NOTE(review): the lookups below raise KeyError for a host/port that was
  # not registered above (nothing is registered when nolooprestarts is
  # set) - confirm only registered servers can be reported.
  for hostport in hostports:
    restarted_srv_list.append(hostport_srvinfo_map[hostport])

  if not print_only():
    restarts = el.go()
    restarts = restarts.keys()
    for hostport in restarts:
      restarted_srv_list.append(hostport_srvinfo_map[hostport])

  return restarted_srv_list
Exemplo n.º 34
0
  def Replace(self, srv_mgr, names):
    """Move every server matched by the given specs onto a new host.

    Args:
      srv_mgr: server manager that contains the servers.
      names: server specs, resolved with srv_mgr.ServersForSpec().
    Returns:
      (replaced, succeeded, failed, free) - names of the moved servers
      after the move, their names before the move, names of the servers
      that could not be moved, and the result of self._RemainingFree().
    """

    prodlib.log('Beginning Replacements:\n')

    # Hosts of the servers being replaced must not be offered as
    # candidates, so collect all of them before moving anything.
    replace_hosts = []
    for spec in names:
      replace_hosts.extend(
          [matched.host() for matched in srv_mgr.ServersForSpec(spec)])

    replaced, succeeded, failed = [], [], []

    for spec in names:

      # Work on a copy of the match list since the internal array is
      # modified when servers get replaced.
      to_move = srv_mgr.ServersForSpec(spec)[:]
      if not to_move:
        prodlib.log('WARNING: no servers matched: %s' % spec)

      for server in to_move:
        if server.property('auto_assigned'):
          continue
        # The server's name changes with its host, so keep the old one.
        before = '%s' % server
        if self._AllocateHost(srv_mgr, server, exclude=replace_hosts):
          succeeded.append(before)
          replaced.append('%s' % server)
        else:
          failed.append(before)

    prodlib.log('\nreplace="%s"' % string.join(replaced))
    prodlib.log('success="%s"' % string.join(succeeded))
    prodlib.log('fail="%s"' % string.join(failed))
    if self._free:
      prodlib.log('free="%s"' % string.join(self._RemainingFree()))
    return (replaced, succeeded, failed, self._RemainingFree())
Exemplo n.º 35
0
  def Repair(self, srv_mgr, tries=10):
    """Try to clear constraint violations by moving the offending servers.

    Each round verifies the servers, takes the first server of every
    reported violation that was not attempted in an earlier round and
    tries to allocate a new host for it.

    Args:
      srv_mgr: server manager to repair.
      tries: maximum number of rounds.
    Returns:
      (replaced, succeeded, failed, free) - names of the moved servers
      after the move, their names before the move, names of the servers
      that could not be moved, and the result of self._RemainingFree().
    """

    cnstr_mgr = srv_mgr.constraint_mgr()

    prodlib.log('Beginning Repair:\n')

    replaced, succeeded = [], []
    failed = {}
    # Names of the servers already picked for repair in some round.
    attempted = {}

    # Only one server per problem is repaired each round.  That fixes
    # most problems, but distribution/sharing violations may need several
    # fixes, hence the fixed number of rounds.
    for _ in range(tries):

      violations = cnstr_mgr.VerifyServers(srv_mgr)
      if not violations:
        prodlib.log('\nNo more problems to repair.')
        break

      # Pick the servers that have not been attempted before.
      candidates = []
      for violation in violations:
        server = violation.servers()[0]
        name = '%s' % server
        if attempted.has_key(name):
          continue
        attempted[name] = 1
        candidates.append(server)

      # Everything reported was already attempted.
      if not candidates:
        prodlib.log('\nUnable to make further progress.')
        break

      # Fix problems by replacing the server.
      for server in candidates:
        if server.property('auto_assigned'):
          continue
        before = '%s' % server
        if self._AllocateHost(srv_mgr, server):
          succeeded.append(before)
          replaced.append('%s' % server)
        else:
          failed[before] = 1

    failed = failed.keys()

    prodlib.log('\nreplace="%s"' % string.join(replaced))
    prodlib.log('success="%s"' % string.join(succeeded))
    prodlib.log('fail="%s"' % string.join(failed))
    if self._free:
      prodlib.log('free="%s"' % string.join(self._RemainingFree()))

    # Report whatever is still wrong after the last round.
    leftover = cnstr_mgr.VerifyServers(srv_mgr)
    if not leftover:
      prodlib.log('Succesfully repaired')
    else:
      prodlib.log('Unable to fully repair: Still more errors in config.\n')
      for violation in leftover:
        prodlib.log(' %s' % violation)
    return (replaced, succeeded, failed, self._RemainingFree())
Exemplo n.º 36
0
def DoBabysit(servers,
              config,
              maxiters=0,
              monitor_port_increment=0,
              extra_restarts=None,
              restart_requests=None,
              nolooprestarts=0,
              nortsignals=0,
              succinterval=None,
              failinterval=None):
    """Monitor the given servers and restart the ones that need it.

    Args:
      servers: servers to register with the monitor event loop.  Servers
        with a port >= 65536 or with the 'skip_babysitting' property are
        skipped.
      config: passed to BabysitterArgvChecker.
      maxiters: passed to monitor_event_loop.
      monitor_port_increment: added to the 'monitor' port base to form the
        monitor's --status_port.
      extra_restarts: servers to restart explicitly, each once its
        'safe_start_time' has elapsed.  Only acted on when
        restart_requests is also given.
      restart_requests: object whose MarkRestarted() is called with the
        name of each explicitly restarted server.
      nolooprestarts: if true, no servers are registered with the event
        loop and the monitor command is replaced by a sleep covering the
        largest 'safe_start_time' of extra_restarts.  The process exits if
        there are no extra_restarts.
      nortsignals: if true, --nortsignals is appended to the monitor
        command.
      succinterval: if set, appended to the monitor command as
        --succinterval.
      failinterval: if set, appended to the monitor command as
        --failinterval.
    Returns:
      List of the servers looked up for the host/ports reported by the
      argv checker and, unless print_only() is set, by the event loop.
    """

    # map from (host, port) to the corresponding server . We need this
    # because el.go returns us results in form of host ports while we
    # want servers.
    hostport_srvinfo_map = {}

    if nolooprestarts:
        if extra_restarts:
            # Sleep long enough to cover the latest safe start time.
            sleep_time = max(
                map(lambda m: m.property('safe_start_time'), extra_restarts))
        else:
            prodlib.log('Babysitter loop has nothing to do.. exiting')
            sys.exit()
        # endif
        # NOTE(review): the trailing '#' presumably comments out the flags
        # appended below when the command is run by a shell - confirm.
        monitor_command = "sleep %d # " % sleep_time
    else:
        monitor_command = "%s/google/bin/monitor --status_port=%s" % (
            sitecustomize.GOOGLEBASE,
            servertype.GetPortBase('monitor') + monitor_port_increment)
    # endif

    # Optional monitor flags.
    if nortsignals:
        monitor_command = monitor_command + " --nortsignals"
    if succinterval:
        monitor_command = monitor_command + " --succinterval=%s" % succinterval
    if failinterval:
        monitor_command = monitor_command + " --failinterval=%s" % failinterval

    el = monitor_event_loop(monitor_command, maxiters)

    for server in servers:
        # Cannot babysit virtual servers (ports >= 65536)
        if server.port() >= 65536:
            continue
        if server.property('skip_babysitting'):
            print "WARNING: babysitting disabled for %s:%s" % (server.host(),
                                                               server.port())
            continue

        if not nolooprestarts:

            datadir = server.datadir()
            if datadir is None: datadir = ''
            query = server.property('request_info')
            # some queries require dataversion. Right now dataversion is
            # the datadir for all servers.
            query = query % {'dataversion': datadir}

            # Register the server with the event loop and remember it by
            # (host, port) so reported host/ports can be mapped back to it.
            el.register(
                (server.host(), servertype.GetServingPort(server.balport()),
                 server.port(), server.property('hostip'),
                 server.property('restartfn')), query,
                server.property('response_len'),
                server.property('test_timeouts'))
            host_port = (server.host(), server.port())
            hostport_srvinfo_map[host_port] = server
        # endif

    # Explicit restarts are only performed when there is a
    # restart_requests object to record them in.
    if extra_restarts and restart_requests:
        start_time = time.time()
        for server in extra_restarts:
            # Ensure that enough seconds have passed since we started.
            passed_time = time.time() - start_time
            if server.property('safe_start_time') > passed_time:
                delay = server.property('safe_start_time') - passed_time
                print "Spending  %s seconds monitoring." % delay
                # Keep monitoring while waiting, unless this is a dry run.
                if not print_only():
                    el.go(timeout=delay)
                else:
                    print "Actually simulating a sleep of %s" % delay

            # Call restart function.
            server.property('restartfn')()
            # And mark this guy restarted
            restart_requests.MarkRestarted(str(server))

    # Start the actual babysitter.
    restarted_srv_list = []
    argv_checker = babysitter_argv_checker.BabysitterArgvChecker(config)
    logging.info("%s %s: Restarting servers whose argv has changed. " %
                 (time.ctime(), time.tzname[0]))
    hostports = argv_checker.RestartIfArgvChanged()
    # NOTE(review): the lookups below raise KeyError for a host/port that
    # was not registered above (nothing is registered when nolooprestarts
    # is set) - confirm only registered servers can be reported.
    for hostport in hostports:
        restarted_srv_list.append(hostport_srvinfo_map[hostport])

    if not print_only():
        restarts = el.go()
        restarts = restarts.keys()
        for hostport in restarts:
            restarted_srv_list.append(hostport_srvinfo_map[hostport])

    return restarted_srv_list
Exemplo n.º 37
0
  def Assign(self, configs, operation, args):
    """Run assigner.
    Args:
      configs: [googleconfig.Config, ...] - config objects to repair.
      operation: 'op' - assignment operation to perform.
      args: [arg1, ...] - arguments for the operation.
    Returns:
      0 when the operation produced changes
      1 when no changes could be made, the configs have different owners,
        or more than one config was given for a crawl change
    """

    # Check if we have multiple configs with crawl, and get common owner.
    # Unowned machines that have servers reserved on them are
    # assigned to owner.
    is_crawl_config = 0
    for c in configs:
      cfg_owner = c.GetServerManager().property('owner')
      if self._owner is None:
        self._owner = cfg_owner
      elif self._owner != cfg_owner:
        prodlib.log("Can't deal with multiple owners in cfgs: %s vs. %s" %
                    (self._owner, cfg_owner))
        return 1
      if not is_crawl_config:
        is_crawl_config = c.var('CRAWLMASTER') is not None
      # endif
    # endfor
    if is_crawl_config and len(configs) > 1:
      prodlib.log('More than one config specified per crawl change.'
                  ' Can\'t deal with this yet')
      return 1
    # endif

    # Perform requested operation.
    config_changes = self.RunOperation(configs, operation, args)

    changes = []

    # Create final change list and save config.  The last element of each
    # entry (the operation's failures) is not used here.
    for (config, add, rem, _) in config_changes:

      srv_mgr = config.GetServerManager()

      if not add and not rem:
        prodlib.log('No changes for %s.' % config.GetConfigFileName())
        continue

      # Pair added and removed specs by position.  map() with several
      # lists pads the shorter one with None, so unpaired entries are kept.
      for (add_srv, rem_srv) in map(lambda x, y: (x, y), add, rem):
        if add_srv: add_srv = _ServersFromSpecs([add_srv], srv_mgr)[0]
        if rem_srv: rem_srv = _ServersFromSpecs([rem_srv], srv_mgr)[0]
        changes.append((add_srv, rem_srv, _GetGFSCluster(config)))

      if self._save:
        out_file = os.path.basename(config.GetConfigFileName()) + '.out'
        config.SaveServers(out_file)
        prodlib.log('Saved to %s.' % out_file)

    if not changes:
      prodlib.log('No changes for any configs.')
      return 1

    # No setup step runs on the changes here, so nothing can fail past
    # this point.
    return 0
Exemplo n.º 38
0
    def Replace(self, srv_mgr, names):
        """Move every server matched by the given specs onto a new host.

        Args:
          srv_mgr: server manager that contains the servers.
          names: server specs, resolved with srv_mgr.ServersForSpec().
        Returns:
          (replaced, succeeded, failed, free) - names of the moved servers
          after the move, their names before the move, names of the
          servers that could not be moved, and the result of
          self._RemainingFree().
        """

        prodlib.log('Beginning Replacements:\n')

        # Hosts of the servers being replaced must not be offered as
        # candidates, so collect all of them before moving anything.
        replace_hosts = []
        for spec in names:
            replace_hosts.extend(
                [matched.host() for matched in srv_mgr.ServersForSpec(spec)])

        replaced, succeeded, failed = [], [], []

        for spec in names:

            # Work on a copy of the match list since the internal array is
            # modified when servers get replaced.
            to_move = srv_mgr.ServersForSpec(spec)[:]
            if not to_move:
                prodlib.log('WARNING: no servers matched: %s' % spec)

            for server in to_move:
                if server.property('auto_assigned'):
                    continue
                # The server's name changes with its host, so keep the old
                # one.
                before = '%s' % server
                if self._AllocateHost(
                        srv_mgr, server, exclude=replace_hosts):
                    succeeded.append(before)
                    replaced.append('%s' % server)
                else:
                    failed.append(before)

        prodlib.log('\nreplace="%s"' % string.join(replaced))
        prodlib.log('success="%s"' % string.join(succeeded))
        prodlib.log('fail="%s"' % string.join(failed))
        if self._free:
            prodlib.log('free="%s"' % string.join(self._RemainingFree()))
        return (replaced, succeeded, failed, self._RemainingFree())
Exemplo n.º 39
0
            if flag == '--mach':
                # list of machines specified. Compute the corresponding regexp
                # that will match them all (and only them): (mach1)|(mach2)|...
                machines = string.split(value, ',')
                if len(machines) == 1:
                    machines = string.split(
                        value)  # maybe it's space-separated?
                regexpstr = '^((%s))$' % string.join(machines, ')|(')
            else:
                regexpstr = value

            # TODO: allow multiple regexps
            if not machine_re:
                machine_re = re.compile(regexpstr)
            else:
                prodlib.log("Only one of --re= or --mach= is allowed")
                usage()
        elif flag == '--delay':
            delay = float(value)
        elif flag == '--ports':
            restrictports = prodlib.CollectTypes(value, {})
        elif flag == '--noexec' or flag == '-n':
            print_only(1)
        elif flag == '--corptest':
            # For testing, don't do any DNS lookups to speed up processing.
            # Also, sprinkled through the code are calls to machdistance.ParseMachine
            # which fail for corp machines - hack local hostname into something
            # that passes it.
            socket.gethostbyname = lambda x: x
            socket.gethostname = lambda: 'exyz1'
            corptest = 1
Exemplo n.º 40
0
    def Repair(self, srv_mgr, tries=10):
        """Try to clear constraint violations by moving offending servers.

        Each round verifies the servers, takes the first server of every
        reported violation that was not attempted in an earlier round and
        tries to allocate a new host for it.

        Args:
          srv_mgr: server manager to repair.
          tries: maximum number of rounds.
        Returns:
          (replaced, succeeded, failed, free) - names of the moved servers
          after the move, their names before the move, names of the
          servers that could not be moved, and the result of
          self._RemainingFree().
        """

        cnstr_mgr = srv_mgr.constraint_mgr()

        prodlib.log('Beginning Repair:\n')

        replaced, succeeded = [], []
        failed = {}
        # Names of the servers already picked for repair in some round.
        attempted = {}

        # Only one server per problem is repaired each round.  That fixes
        # most problems, but distribution/sharing violations may need
        # several fixes, hence the fixed number of rounds.
        for _ in range(tries):

            violations = cnstr_mgr.VerifyServers(srv_mgr)
            if not violations:
                prodlib.log('\nNo more problems to repair.')
                break

            # Pick the servers that have not been attempted before.
            candidates = []
            for violation in violations:
                server = violation.servers()[0]
                name = '%s' % server
                if attempted.has_key(name):
                    continue
                attempted[name] = 1
                candidates.append(server)

            # Everything reported was already attempted.
            if not candidates:
                prodlib.log('\nUnable to make further progress.')
                break

            # Fix problems by replacing the server.
            for server in candidates:
                if server.property('auto_assigned'):
                    continue
                before = '%s' % server
                if self._AllocateHost(srv_mgr, server):
                    succeeded.append(before)
                    replaced.append('%s' % server)
                else:
                    failed[before] = 1

        failed = failed.keys()

        prodlib.log('\nreplace="%s"' % string.join(replaced))
        prodlib.log('success="%s"' % string.join(succeeded))
        prodlib.log('fail="%s"' % string.join(failed))
        if self._free:
            prodlib.log('free="%s"' % string.join(self._RemainingFree()))

        # Report whatever is still wrong after the last round.
        leftover = cnstr_mgr.VerifyServers(srv_mgr)
        if not leftover:
            prodlib.log('Succesfully repaired')
        else:
            prodlib.log(
                'Unable to fully repair: Still more errors in config.\n')
            for violation in leftover:
                prodlib.log(' %s' % violation)
        return (replaced, succeeded, failed, self._RemainingFree())
Exemplo n.º 41
0
def ComputeServers(config,
                   types,
                   machine_re,
                   restrictports,
                   myhostname,
                   excluded,
                   do_ckpt,
                   ckpt_time,
                   sets,
                   restrict_servers=None,
                   ssh_user=None):

    srv_mgr = config.GetServerManager()
    servers = srv_mgr.Servers(wanted_sets=types,
                              wanted_ports=restrictports,
                              wanted_indices=sets)
    computed_servers = []

    for server in servers:

        set = srv_mgr.Set(server.srvset())
        host = server.host()
        port = server.port()

        if restrict_servers and not restrict_servers.get(str(server), 0):
            continue

        mtype = server.servertype()

        if not WantedServer(host, port, mtype, config, machine_re, myhostname,
                            excluded):
            # Server has not been selected for inclusion.
            continue

        # Allow overriding ssh_user.
        if server.property('ssh_user'):
            ssh_user = server.property('ssh_user')

        # For AM transition, if there is no binary user set, then
        # do not use the ssh_user so we can still go on as root.
        # We will run the babysitter as root and it will ssh into
        # machines with binary_user set as prodsetup, and it will
        # ssh into machines without binary_user set as root.
        # TODO: Remove this when we are finished with conversion.
        binary_user = set.property('binary_user')
        if not binary_user or binary_user == 'root':
            ssh_user = None

        # Print out an informational string for the user.
        set_str = ''
        if sets: set_str = ' - set %s' % server.index()
        print "Checking %s:%d (%s%s)" % \
              (host, servertype.GetServingPort(port), mtype, set_str)

        safe_start_time = server.index() * server.property('inter_set_delay')

        try:
            hostip = socket.gethostbyname(host)
        except socket.error, e:
            prodlib.log("DNS error for %s: %s. Skipping." % (host, e))
            continue

        # Form restart closure
        restartfn = lambda f=server.Start, m=print_only(), u=ssh_user: \
          f(m, u)

        # Form kill closure
        killfn = lambda f=server.Stop, u=ssh_user, ck=do_ckpt, ct=ckpt_time: \
          f(2, u, ck, ct)

        server.set_property('hostip', hostip)
        server.set_property('safe_start_time', safe_start_time)
        server.set_property('restartfn', restartfn)
        server.set_property('killfn', killfn)

        computed_servers.append(server)
Exemplo n.º 42
0
  def AddSets(self, srv_mgrs, srvsetnums, do_min=0):
    """Add servers to sets, slice by slice, up to a requested count.

    Args:
      srv_mgrs: a server manager, or a list of server managers.
      srvsetnums: list of specs, each 'set', 'set:num' or 'set:min,max'.
        A bare 'set' takes its min/max from the 'shardlen' constraint.
        If empty, the specs are derived from the 'shardlen' constraints
        of the first server manager.
      do_min: if true fill each port up to min, otherwise up to max.
    Returns:
      (added, failed, tried, remaining_free) where the first three are
      lists of server names: allocated, failed while below min, and
      failed while at or above min.
    Raises:
      Error: a balancer set ('+name') was requested but a server manager
        has no set 'name' to balance.
    """

    # Allow interface to take a single server manager.
    if type(srv_mgrs) != types.ListType:
      srv_mgrs = [srv_mgrs]

    # Find free dictionary for this set.
    srv_mgr = srv_mgrs[0]
    cnstr_mgr = srv_mgr.constraint_mgr()

    # Set up the sets to process from constraints if not specified.
    if not srvsetnums:
      srvsetnums = {}

      # First iterate through all existing sets and add those sets
      # that have a shard length constraint.  Note that if the shardlen
      # constraint is specified in defaults this will add all currently
      # created sets.
      for srv_set in srv_mgr.Sets():
        if srv_set.property('auto_assigned'): continue
        if cnstr_mgr.Constraint('shardlen').Constraint(srv_set.name()):
          srvsetnums[srv_set.name()] = 1

      # There may be some sets specified that are not currently in
      # existence.  Iterate through the list of explicitly specified
      # shardlen constraints and add sets for their types.
      for srvset in cnstr_mgr.Constraint('shardlen').server_sets():
        srvsetnums[srvset] = 1

      srvsetnums = srvsetnums.keys()
      srvsetnums.sort()

    # Set up the number of each type to add.
    expanded = []
    for srvsetnum in srvsetnums:
      if string.find(srvsetnum, ':') == -1:
        # TODO: Right now we're storing constraint descs in the constraint
        # manager.  This will move in another checkin to the server sets
        # and will be more readable when accessing.
        vals = cnstr_mgr.Constraint('shardlen').Constraint(srvsetnum)
        # If no shardlen constrs were specified for this type then ignore.
        if not vals:
          prodlib.log('Cannot add sets for %s - no shardlen constraint' %
                      srvsetnum)
          continue
        srvsetnum = '%s:%s,%s' % (srvsetnum, vals[0], vals[1])
      expanded.append(srvsetnum)
    srvsetnums = expanded

    prodlib.log('Beginning AddSet:\n')

    added = []
    failed = []
    tried = []

    for srvsetnum in srvsetnums:

      # Parse 'set:num' or 'set:min,max'.
      (srvset, bounds) = string.split(srvsetnum, ':')
      bounds = string.split(bounds, ',')
      min_cnt = int(bounds[0])
      if len(bounds) == 2:
        max_cnt = int(bounds[1])
      else:
        max_cnt = min_cnt

      if do_min:
        cnt = min_cnt
      else:
        cnt = max_cnt

      # For balancer sets, we add these with the same port range as the
      # balanced set if they were not present in the server manager object.
      if srvset[0] == '+':
        for srv_mgr in srv_mgrs:
          balset = srv_mgr.Set(srvset[1:])
          if not balset:
            raise Error('Cannot add set %s: no balanced set' % srvset)
          new_set = srv_mgr.AddSet(srvset, balset.level())
          # Ensure port ranges are matched to balanced set.
          for port in balset.Ports(): new_set.AddPort(port)

      # Build union of all ports for this type.
      ports = {}
      for srv_mgr in srv_mgrs:
        srv_set = srv_mgr.Set(srvset)
        if srv_set is not None:
          for port in srv_set.Ports(): ports[port] = 1
      ports = ports.keys()
      ports.sort()

      # Iterate up the slices so that we replace short shards first.
      # Slice 0 never allocates: every port already has >= 0 servers.
      for i in range(cnt+1):

        # To check if all allocations failed on the ports.
        every_port_failed = 1

        # For each port check if this shard is short.
        for port in ports:
          for srv_mgr in srv_mgrs:

            # Not used below; kept in case constraint_mgr() initializes
            # state lazily - TODO confirm and drop.
            cnstr_mgr = srv_mgr.constraint_mgr()
            srv_set = srv_mgr.Set(srvset)
            # A server manager may not define this set at all (the port
            # union above allows for that), so skip it instead of
            # failing with AttributeError on None.
            if srv_set is None: continue
            if srv_set.property('auto_assigned'): continue
            if port not in srv_set.Ports(): continue

            # If we have enough servers just skip.
            have = len(srv_set.ServersForPort(port))
            if have >= i:
              every_port_failed = 0
              continue

            # Create a placeholder server object.
            server = serverlib.Server()
            server.InitFromName('%s%s:%s' % (srvset, i, port))
            srv_mgr.AddServer(server)
            # Try and allocate a host for it.  Below the minimum count
            # the allocation is forced.
            force = 0
            if have < min_cnt: force = 1
            if self._AllocateHost(srv_mgr, server, force):
              every_port_failed = 0
              added.append('%s' % server)
            else:
              # Failed so remove it from the map.
              srv_mgr.RemoveServer(server)
              if force:
                failed.append('%s' % server)
              else:
                tried.append('%s' % server)

        # Break if we tried every port for this server manager and
        # could not allocate anything.
        if every_port_failed: break

    prodlib.log('\nsuccess="%s"' % string.join(added))
    prodlib.log('failed="%s"' % string.join(failed))
    prodlib.log('tried="%s"' % string.join(tried))
    if self._free:
      prodlib.log('free="%s"' % string.join(self._RemainingFree()))
    return (added, failed, tried, self._RemainingFree())
Exemplo n.º 43
0
    # Machine selection: both flags end up as one compiled regexp stored
    # in machine_re.
    if flag in ['--re', '--mach']:
      if flag == '--mach':
        # list of machines specified. Compute the corresponding regexp
        # that will match them all (and only them): (mach1)|(mach2)|...
        # NOTE(review): the names are spliced into the pattern without
        # re.escape, so regexp metacharacters in a name (e.g. '.') keep
        # their special meaning - confirm whether that is intended.
        machines = string.split(value, ',')
        if len(machines) == 1:
          machines = string.split(value)     # maybe it's space-separated?
        regexpstr = '^((%s))$' % string.join(machines, ')|(')
      else:
        # --re: the value is used as the regexp unchanged.
        regexpstr = value

      # TODO: allow multiple regexps
      # A second --re/--mach is rejected: log the problem and call usage().
      if not machine_re:
        machine_re = re.compile(regexpstr)
      else:
        prodlib.log("Only one of --re= or --mach= is allowed")
        usage()
    elif flag == '--delay':
      # The value is parsed as a float; float() raises ValueError on
      # malformed input.
      delay = float(value)
    elif flag == '--ports':
      # Parsing is delegated to prodlib.CollectTypes, called with a fresh
      # empty dict as its second argument.
      restrictports = prodlib.CollectTypes(value, {})
    elif flag == '--noexec' or flag == '-n':
      # presumably switches the tool into print-only (dry-run) mode -
      # verify against print_only's definition.
      print_only(1)
    elif flag == '--corptest':
      # For testing, don't do any DNS lookups to speed up processing.
      # Also, sprinkled through the code are calls to machdistance.ParseMachine
      # which fail for corp machines - hack local hostname into something
      # that passes it.
      # These assignments replace attributes of the socket module itself,
      # so every later caller in the process sees the stubs.
      socket.gethostbyname = lambda x: x
      socket.gethostname = lambda : 'exyz1'
      corptest = 1
Exemplo n.º 44
0
    def Assign(self, configs, operation, args):
        """Run the assigner over a group of configs.

        Args:
          configs: [googleconfig.Config, ...] - config objects to repair.
          operation: 'op' - assignment operation to perform.
          args: [arg1, ...] - arguments for the operation.
        Returns:
          0 on successful replacements
          1 on no changes could be made
        """

        # Every config must share one owner; the first owner seen is
        # remembered on the instance.  Along the way, note whether any of
        # the configs is a crawl config (defines CRAWLMASTER).
        crawl_seen = 0
        for cfg in configs:
            owner = cfg.GetServerManager().property('owner')
            if self._owner is None:
                self._owner = owner
            elif self._owner != owner:
                prodlib.log(
                    "Can't deal with multiple owners in cfgs: %s vs. %s" %
                    (self._owner, owner))
                return 1
            # Short-circuit: CRAWLMASTER is only looked up until the first
            # crawl config has been found.
            crawl_seen = crawl_seen or cfg.var('CRAWLMASTER') != None

        # A crawl change may only touch a single config.
        if crawl_seen and len(configs) > 1:
            prodlib.log("More than one config specified per crawl change."
                        " Can't deal with this yet")
            return 1

        # Perform the requested operation, then flatten its per-config
        # results into one list of (added, removed, gfs_cluster) entries.
        per_config = self.RunOperation(configs, operation, args)
        changes = []
        for (config, add, rem, _op_fail) in per_config:

            srv_mgr = config.GetServerManager()

            if not (add or rem):
                prodlib.log('No changes for %s.' % config.GetConfigFileName())
                continue

            # Pair additions with removals position by position.
            paired = map(lambda added, removed: (added, removed), add, rem)
            for (add_srv, rem_srv) in paired:
                # Resolve each spec to its server object; empty slots are
                # left as they are.
                if add_srv:
                    add_srv = _ServersFromSpecs([add_srv], srv_mgr)[0]
                if rem_srv:
                    rem_srv = _ServersFromSpecs([rem_srv], srv_mgr)[0]
                changes.append((add_srv, rem_srv, _GetGFSCluster(config)))

            # Optionally write the updated servers next to the cwd as
            # '<config basename>.out'.
            if self._save:
                out_file = '%s.out' % os.path.basename(
                    config.GetConfigFileName())
                config.SaveServers(out_file)
                prodlib.log('Saved to %s.' % out_file)

        if not changes:
            prodlib.log('No changes for any configs.')
            return 1

        # Added servers.  Not consumed by the code visible here.
        new_servers = [entry[0] for entry in changes
                       if entry[0] is not None]

        # Nothing visible here ever records a failure, so this list stays
        # empty, the filter below keeps every change and the result is 0.
        fail = []

        # Drop changes whose added server is recorded as failed.
        changes = [entry for entry in changes if entry[0] not in fail]

        # Return failure if any setup failed.
        if fail:
            return 1
        return 0