Example #1
def _StatusFileCmd(cmd, version, out=[], extra_arg='', unittestdir=None):
  """Perform a command on the RESET_STATE status file.

  On a cluster, runs lockserv <cmd> /ls/ent4-x-x/RESET_STATE.
  On a oneway, runs cmd on /export/hda3/4.x.x/RESET_STATE
  cmd should be cat, setcontents, or rm.
  Return: None for oneway, 0 for success, 1 for error
  Command output returned in out.
  """

  if unittestdir != None or 1 == len(core_utils.GetNodes()):
    # unittest or Oneway
    if unittestdir != None:
      file = '/%s/%s/RESET_STATE' % (unittestdir, version)
    else:
      file = '/export/hda3/%s/RESET_STATE' % version
    if cmd == 'cat':
      status = _ExecuteCommand('cat %s' % file, out=out)
    elif cmd == 'setcontents':
      status = _ExecuteCommand('echo "%s" > %s' % (extra_arg, file))
    elif cmd == 'rm':
      status = _ExecuteCommand('rm -f %s' % file)
    else:
      logging.error('StatusFileCmd: bad command %s' % cmd)
      return 1
    return status

  lockserv_cmd_prefix = core_utils.GetLSClientCmd(version,
      install_utilities.is_test(version))
  chubby_file = '/ls/%s/RESET_STATE' % core_utils.GetCellName(version)
  lockserv_cmd = '%s %s %s %s' % (
                   lockserv_cmd_prefix, cmd, chubby_file, extra_arg)
  logging.info('Reset index: executing %s' % lockserv_cmd)
  status = _ExecuteCommand(lockserv_cmd)
  return status
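
A note on the out=[] parameter above: command output is passed back by mutating a caller-supplied list, and the mutable default is shared by every call that omits it. A minimal self-contained sketch of the safer form of this pattern (names here are illustrative, not from the codebase):

def run_and_capture(line, out=None):
  # Return output through a caller-supplied list; default to None so
  # the list is not shared across calls.
  if out is None:
    out = []
  out.append(line)
  return 0

out = []
assert run_and_capture('RESET_STATE contents', out=out) == 0
assert out == ['RESET_STATE contents']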
Example #2
  def remove(self, machine):
    """  This removes a machine from the configuration  """

    if machine not in self.cfg.getGlobalParam('MACHINES'):
      logging.error("%s doesn't exist" % machine)
      return 1

    ver = self.cfg.getGlobalParam('VERSION')
    home = self.cfg.getGlobalParam('ENTERPRISE_HOME')
    testver = install_utilities.is_test(ver)
    # if possible stop the core services, ignore return code
    install_utilities.stop_core(ver, home, [machine])

    if machine == E.getCrtHostName():
      logging.error("Cannot remove self")
      return 1

    # Halt the machine if APC is used.
    error = self.halt(machine)

    self.cfg.globalParams.ReplaceVarInParam("SERVERS", None, machine)
    self.cfg.globalParams.ReplaceVarInParam("MACHINES", None, machine)
    ret = core_utils.AddDeadNode(ver, testver, machine)
    # remove the chunkserver running on the node
    gfs_utils.DeleteGFSChunkservers(ver, testver, [machine])
    if ret:
      logging.error('Cannot add dead node to the lockserver.')
      # we ignore this error for now

    # now we need to remove the data disks that were on this machine
    data_disks = self.cfg.globalParams.var_copy('DATACHUNKDISKS')
    if data_disks.has_key(machine):
      del data_disks[machine]
      if not self.cfg.setGlobalParam('DATACHUNKDISKS', data_disks):
        return 1

    # This also saves the config file
    if not self.cfg.DoMachineAllocation():
      return 1

    # Now we need to restart babysitter because the old one
    # is out of sync after this
    serve_service_cmd = (". %s && "
        "cd %s/local/google3/enterprise/legacy/scripts && "
        "./serve_service.py %s" % (
          self.cfg.getGlobalParam('ENTERPRISE_BASHRC'),
          self.cfg.getGlobalParam('ENTERPRISE_HOME'),
          self.cfg.getGlobalParam('ENTERPRISE_HOME')))
    E.exe("%s %s" % (serve_service_cmd, "babysit"))

    self.restart_crawl_processes(serve_service_cmd)

    if not mail_already_sent(M.MSG_MACHINEREMOVED % machine):
      SendMail.send(self.cfg, None, false,
                 M.MSG_MACHINEREMOVED % machine, "", true)

    return error
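
The data-disk cleanup above drops the machine's entry from a dict-valued parameter before writing the dict back. A standalone sketch of that step (the DATACHUNKDISKS contents are illustrative):

data_disks = {'ent1': ['hda3'], 'ent2': ['hda3', 'hdb3']}  # illustrative
machine = 'ent2'
if machine in data_disks:  # spelled data_disks.has_key(machine) above
  del data_disks[machine]
assert machine not in data_disks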
Example #3
  def babysit(self):
    if self.local_machine_is_master:
      if self.is_time_to_restart():
        self.record_restart()
        self._stop(op="babysit")
        # adjust gsa master node. This only applies to clusters, as
        # desired_gsa_master_node will be None for one-way.
        desired_gsa_master_node = core_utils.DesiredMasterNode()
        if (desired_gsa_master_node != None and
            desired_gsa_master_node != self.local_machine):
          is_testver = install_utilities.is_test(self.version)
          find_master.ForceMaster(desired_gsa_master_node, is_testver)
          return 1
      return self._start(op="babysit")
    else:
      return self._stop(op="babysit")
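
is_time_to_restart() and record_restart() are not shown in this snippet; a plausible minimal analogue, assuming they implement an "at most one restart per interval" throttle keyed on a timestamp:

import time

class RestartThrottle(object):
  # Illustrative analogue of is_time_to_restart()/record_restart().
  def __init__(self, interval_secs):
    self.interval_secs = interval_secs
    self.last_restart = 0.0

  def is_time_to_restart(self):
    return time.time() - self.last_restart >= self.interval_secs

  def record_restart(self):
    self.last_restart = time.time()

throttle = RestartThrottle(24 * 3600)  # e.g. at most one restart a day
assert throttle.is_time_to_restart()
throttle.record_restart()
assert not throttle.is_time_to_restart()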
Example #4
  def SVSErrorsStatus(self, lockserv_cmd_out=None):
    """ Check SVS errors recorded by gsa-master

    Args:
      lockserv_cmd_out: {'ent1': 'machine problem Unknown\n'}
        (for unit test only)
    Return: status, desc (e.g. 0, []).
            status is 1 if there are SVS errors. Otherwise, status is 0.
    """

    # Add any SVS errors (from gsa-master) to the problem list
    all_machs_status = 0
    desc = []
    if self._ent_config == 'CLUSTER':
      if lockserv_cmd_out is None:
        version = self._cfg.getGlobalParam('VERSION')
        lockserv_cmd_prefix = core_utils.GetLSClientCmd(version,
          install_utilities.is_test(version))
      for machine in self._live_machines:
        if lockserv_cmd_out is None:
          chubby_file = '/ls/%s/svs_%s' % (core_utils.GetCellName(version),
                                           machine)
          lockserv_cmd = '%s cat %s' % (lockserv_cmd_prefix, chubby_file)
          out = []
          lockserv_status = E.execute(['localhost'], lockserv_cmd, out, 60)
        else:
          lockserv_status = 0
          if machine in lockserv_cmd_out:
            out = [lockserv_cmd_out[machine]]
          else:
            out = []
        if lockserv_status == 0 and len(out) > 0 and out[0] != '':
          errors = out[0].splitlines()
          status = 0
          for i in range(0, len(errors)):
            if (errors[i].find('unrecoverable error') >= 0 or
                errors[i].find('file system error') >= 0):
              errors[i] = '' # Ignore this error
            else:
              status = 1 # Show an error
          if status:
            # A svs error has been recorded
            all_machs_status = max(all_machs_status, status)
            errors = [e for e in errors if e != '']
            # add machine name
            desc.append('%s: %s' % (machine, ' '.join(errors)))
    return all_machs_status, desc
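
The per-machine loop boils down to: split the gsa-master report into lines, drop two known-benign error classes, and flag anything left. A self-contained condensation (the function name is mine):

def filter_svs_errors(raw):
  # Keep only lines that are real problems; 'unrecoverable error' and
  # 'file system error' are ignored, matching the checks above.
  ignored = ('unrecoverable error', 'file system error')
  errors = [line for line in raw.splitlines()
            if line and not any(tag in line for tag in ignored)]
  return (1 if errors else 0), errors

status, errors = filter_svs_errors('machine problem Unknown\n'
                                   'disk1 unrecoverable error\n')
assert (status, errors) == (1, ['machine problem Unknown'])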
Example #5
  def init_service(self, ent_home):
    """
    Does the actual initialization. Reads the config file in the
    cp (EntConfig) member and initializes some members for easy access
    to usual parameters
    """

    self.cp = entconfig.EntConfig(ent_home)
    if not self.cp.Load():
      sys.exit("Cannot load the config file %s" % self.cp.GetConfigFileName())

    # Get some params for easy access
    self.configfile    = self.cp.GetConfigFileName()
    self.version       = str(self.cp.var("VERSION"))
    self.entid_tag     = "ENT_ID=%s_%s" % (self.version, self.service_name)
    self.ent_user      = self.cp.var("ENTERPRISE_USER")
    self.ent_group     = self.cp.var("ENTERPRISE_GROUP")
    self.ent_bashrc    = self.cp.var("ENTERPRISE_BASHRC")
    self.ent_home      = self.cp.var("ENTERPRISE_HOME")
    self.googlebot_dir = self.cp.var("GOOGLEBOT_DIR")
    self.version_tmpdir= "%s/tmp" % self.cp.var("ENTERPRISE_HOME")
    self.tmpdir        = self.cp.var("TMPDIR")
    self.logdir        = self.cp.var("LOGDIR")
    self.datadir       = self.cp.var("DATADIR")
    self.scripts_dir   = ("%s/local/google3/enterprise/legacy/scripts" %
                          self.ent_home)
    self.util_dir      = ("%s/local/google3/enterprise/legacy/util" %
                          self.ent_home)
    self.machines      = self.cp.var("MACHINES")

    # The master depends on the install state: for ACTIVE / TEST / INSTALL
    # we have the adminrunner on the master; otherwise we get it from the
    # MASTER parameter.
    self.install_state = install_utilities.install_state(
      self.version, rootdir = self.cp.var('ENT_DISK_ROOT'))
    self.local_machine  = E.getCrtHostName()
    testver = install_utilities.is_test(self.version)
    if self.install_state in ["ACTIVE", "TEST", "INSTALL"]:
      try:
        self.master_machine = find_master.FindMasterUsingChubby(self.cp.var('VERSION'))
      except core_utils.EntMasterError, e:
        # Something is seriously wrong.
        logging.error("ERROR: Couldn't determine master")
        # Assume we aren't master, so we can at least do inactivate
        self.master_machine = None
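
The master lookup at the end is wrapped so that any failure degrades to "assume we are not the master" instead of aborting, keeping tasks like inactivate runnable. The shape of that fallback in isolation:

def find_master_or_none(lookup):
  # On lookup failure (core_utils.EntMasterError in the original),
  # fall back to None so the caller can still deactivate.
  try:
    return lookup()
  except Exception:
    return None

def failing_lookup():
  raise RuntimeError('lockserver unreachable')

assert find_master_or_none(lambda: 'ent1') == 'ent1'
assert find_master_or_none(failing_lookup) is None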
Example #6
def _ClearIndex(cfg, version):
  """Clear the index directories in GFS/bigfiles.
  Return: '' for success, error for failure
  """
  logging.info('Reset Index: ClearIndex')
  logging.flush()

  # Delete (local) urltracker data on oneway.
  urltracker_dir = cfg.getGlobalParam('URLTRACKER_DIRECTORY')
  if os.access(urltracker_dir, os.R_OK):
    cmd = ('rm -R -f %s' % (urltracker_dir))
    logging.info('Deleting local urltracker directory: %s' % (urltracker_dir))
    if _ExecuteCommand(cmd, machines=cfg.getGlobalParam(C.MACHINES)):
      return 'File removal failed.'
  else:
    logging.info('No local urltracker data to delete')

  if cfg.getGlobalParam(C.GFS_CELL):
    logging.info('Deleting GFS files')
    gfs_aliases = core_utils.GetGFSAliases(version,
                                           install_utilities.is_test(version))
    dirs_not_removed = _RemoveTopLevelDirs(cfg, '/gfs/ent/', gfs_aliases=gfs_aliases)
    if len(dirs_not_removed) > 0:
      return 'Shared file removal failed.'

  logging.info('Deleting bigfiles')
  datadir = '%s/data/enterprise-data' % cfg.getGlobalParam('ENTERPRISE_HOME')
  dirs_not_removed = _RemoveTopLevelDirs(cfg, '/bigfile/', datadir=datadir)
  if len(dirs_not_removed) > 0:
    return 'File removal failed.'

  # delete spelling data on oneway:
  spell_root = cfg.getGlobalParam('ENT_SPELL_ROOT_DIR')
  if spell_root[-1] == '/':
    spell_root = spell_root[:-1]

  if os.access(spell_root, os.R_OK):
    cmd = ('rm -R -f %s' % spell_root)
    logging.info('Deleting local (non-gfs) spelling data')
    if _ExecuteCommand(cmd, machines=cfg.getGlobalParam(C.MACHINES)):
      return 'File removal failed.'
  else:
    logging.info('No local (non-gfs) spelling data to delete')

  return ''
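
Two small idioms above are worth noting: the manual trailing-slash strip is equivalent to rstrip('/'), and os.access serves as the "is there anything to delete?" probe. A sketch (paths illustrative):

import os

def normalize_root(path):
  # Same effect as the slash check in _ClearIndex; rstrip also
  # handles repeated trailing slashes.
  return path.rstrip('/') or '/'

assert normalize_root('/export/hda3/spell/') == '/export/hda3/spell'
assert not os.access('/no-such-spell-root', os.R_OK)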
Example #7
def AvoidGFSMasterOnNode(config, node):
  """ avoiding running primary gfs master on a node

  Arguments:
    config: instance of entconfig
    node:   'ent1'
  """

  ver = config.VERSION
  testver = install_utilities.is_test(ver)
  # first make sure there is a primary master
  out = gfs_utils.EnsureGFSMasterRunning(ver, testver)
  if out is not None:
    logging.error("GFSMaster_NoMaster alert detected, "
                  "but fix was not successful. Error message: [%s]" % out)
  else:
    gfs_utils.AvoidGFSMasterOnNode(ver, testver, node)
  # ensure gfs chunkservers are added after gfs master is running
  gfs_utils.AddGFSChunkservers(ver, testver, config.MACHINES)
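
EnsureGFSMasterRunning follows a "None means success, anything else is an error message" convention, which the branch above relies on. Reduced to its shape:

def ensure_ok(check_and_fix):
  # None signals success; any other value is an error message to log.
  out = check_and_fix()
  if out is not None:
    print('fix was not successful. Error message: [%s]' % out)
    return False
  return True

assert ensure_ok(lambda: None)
assert not ensure_ok(lambda: 'GFSMaster_NoMaster: no primary elected')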
Example #8
  def add(self, machine, apc_outlet):
    """
    This adds a machine to the configuration
    """
    # We can add a machine only when we are in active state
    if install_utilities.install_state(self.cfg.getGlobalParam('VERSION')) != "ACTIVE":
      logging.error("Can add a machine only when we are in active state")
      return 1

    # First test for accessibility of the machine.
    if E.execute([machine], 'echo 1', None, 1) != E.ERR_OK:
      logging.error("Could not ssh into the machine %s" % machine)
      return 1

    # start the svs on the remote machine
    restart_svs_cmd = "%s/local/google3/enterprise/legacy/util/svs_utilities.py %s %s" % (
                          self.cfg.getGlobalParam('ENTERPRISE_HOME'),
                          self.cfg.getGlobalParam('ENTERPRISE_HOME'),
                          machine)
    if E.execute([E.getCrtHostName()],
                     SECURE_WRAPPER_COMMAND % ( \
                          self.cfg.getGlobalParam('ENTERPRISE_HOME'),
                          "-p2",
                          restart_svs_cmd),
                     None, 0) != E.ERR_OK:
      logging.error("Could not start svs on machine %s" % machine)
      return 1

    # wait for some time for svs to come up
    time.sleep(5)
    # check to see if the svs is up and is the right version
    if not svs_utilities.PingAndCheckSvsVersion(
                          self.cfg.getGlobalParam('ENTERPRISE_BASHRC'),
                          self.cfg.getGlobalParam('ENTERPRISE_HOME'),
                          machine):
      logging.error("Svs not running correctly on machine %s" % machine)
      return 1
    ver = self.cfg.getGlobalParam('VERSION')
    home = self.cfg.getGlobalParam('ENTERPRISE_HOME')
    testver = install_utilities.is_test(ver)

    # update MACHINES
    machines = self.cfg.getGlobalParam('MACHINES')
    if machine not in machines:
      machines.append(machine)
    self.cfg.setGlobalParam('MACHINES', machines)

    ret = core_utils.RemDeadNode(ver, testver, machine)
    if ret:
      logging.error('Cannot remove dead node from lockserver.')
      # we ignore this error for now

    # We just added a new machine into the config
    # this will lead to a change in concentrator config
    # so we need to re-run serve service which will
    # write the new config and restart the concentrator
    serve_cmd = ". %s && cd %s/local/google3/enterprise/legacy/scripts && " \
                      "./serve_service.py %s" % (
      self.cfg.getGlobalParam('ENTERPRISE_BASHRC'),
      self.cfg.getGlobalParam('ENTERPRISE_HOME'),
      self.cfg.getGlobalParam('ENTERPRISE_HOME'))
    E.exe("%s %s" % (serve_cmd, "babysit"))

    num_tries = 5
    cur_try = 0
    while cur_try < num_tries:
      cur_try = cur_try + 1
      all_disks = self.cfg.mach_param_cache.GetFact("mounted-drives", machine) 
      bad_disks = self.cfg.mach_param_cache.GetFact("var_log_badhds", machine) 
      if bad_disks and all_disks:
        break
      time.sleep(60)
    if all_disks == None or bad_disks == None:
      logging.error("Could not get machine information about %s" % machine)
      return 1

    bad_disks = string.split(bad_disks, ' ')
    all_disks = string.split(all_disks, ' ')
    good_disks = filter(lambda x, y=bad_disks: x not in y, all_disks)
    good_disks = map(lambda x: "%s3" % x, good_disks)
    # change sda3 to hda3 etc.
    good_disks = map(lambda x: re.sub(r'^s', 'h', x), good_disks)

    # Preprocess disks before adding to remove duplicates.
    unique_good_disks = []
    [unique_good_disks.append(disk) for disk in good_disks if disk not in unique_good_disks]

    # Add disks
    self.updatedisk(machine, unique_good_disks, true)

    # apc map update
    apc_map = self.cfg.globalParams.var_copy('APC_MAP')
    apc_map[machine] = apc_util.PortMap(apc_outlet)
    if not self.cfg.setGlobalParam('APC_MAP', apc_map):
      logging.error("ERROR setting apc map to %s" % repr(apc_map))
      return 1

    # create appropriate datadirs on that machine
    if not self.cfg.createDataDirs([machine], node_replacement = 1):
      logging.error("ERROR could not create datadirs on machine %s" % machine)
      return 1

    # Replicate the config
    self.cfg.replicateConfigOnMachine(machine)

    # Reconfigure net on the target machine
    if not reconfigurenet_util.doReconfigureNet(self.cfg.globalParams,
                                                [machine], i_am_master=0):
      logging.error('reconfigurenet failed for %s' % machine)
      return 1

    # Start core services on the new node
    if not install_utilities.start_core(ver, home, [machine], ignore=0):
      logging.error("ERROR could not start core services on %s" % machine)
      return 1
    # Add the chunkserver back
    gfs_utils.AddGFSChunkservers(ver, testver, [machine])

    # first we need to do Machine allocation.
    # this will assign things that will satisfy the constraints
    if not self.cfg.DoMachineAllocation(serversets=['workqueue-slave']):
      logging.error("ERROR doing machine allocation")
      return 1

    # now try to reallocate some servers from existing machines to the new machine
    replaced = self.cfg.AllocateServersToNewMachine(machine)
    if not replaced:
      logging.error("ERROR allocating services to the new machine")
      return 1

    # first we need to restart the babysitter
    E.exe("%s %s" % (serve_cmd, "babysit"))
    time.sleep(60)

    # Now we need to stop all the replaced services
    for server_string in replaced:
      server = serverlib.Server()
      server.InitFromName(server_string)
      replaced_type = server.servertype()
      kill_cmd = servertype.GetKillCmd(replaced_type, server.port())
      if E.execute([server.host()], kill_cmd, None, 1) != E.ERR_OK:
        logging.error("ERROR killing %s running on port %d on %s" % \
                             (replaced_type, server.port(), server.host()))


    # we should make it active
    if not install_utilities.set_install_state(machine,
                             self.cfg.getGlobalParam('ENTERPRISE_HOME'),
                             "ACTIVE"):
      logging.error("ERROR changing state on machine %s. "
                    "Please make it active and activate and "
                    "start crawl service on it" % machine)
      return 1

    crawl_cmd = ". %s && cd %s/local/google3/enterprise/legacy/scripts && " \
                      "./crawl_service.py %s" % (
      self.cfg.getGlobalParam('ENTERPRISE_BASHRC'),
      self.cfg.getGlobalParam('ENTERPRISE_HOME'),
      self.cfg.getGlobalParam('ENTERPRISE_HOME'))
    if E.execute([machine], "%s %s" % (crawl_cmd, "start"), None, 1) != E.ERR_OK:
      logging.error("Could not start crawl service on %s" % machine)
      return 1

    # save all the params
    self.cfg.saveParams()

    # for faster crawl recovery, lets restart all crawl processes
    self.restart_crawl_processes(serve_cmd)

    # activate the crawl and logcontrol service on the remote machine
    crawl_activate_cmd = "/etc/rc.d/init.d/crawl_%s activate >&/dev/null" \
                           "</dev/null" % self.cfg.getGlobalParam('VERSION')
    if E.execute([machine], SECURE_WRAPPER_COMMAND % ( \
                            self.cfg.getGlobalParam('ENTERPRISE_HOME'),
                            "-e",
                            crawl_activate_cmd),
                     None, 0) != E.ERR_OK:
      logging.error("Could not activate crawl service on machine %s" % machine)
      logging.error("Please activate by hand")
      return 1
    log_activate_cmd = "/etc/rc.d/init.d/logcontrol_%s activate >&/dev/null" \
                           "</dev/null" % self.cfg.getGlobalParam('VERSION')
    if E.execute([machine], SECURE_WRAPPER_COMMAND % ( \
                            self.cfg.getGlobalParam('ENTERPRISE_HOME'),
                            "-e",
                           log_activate_cmd),
                     None, 0) != E.ERR_OK:
      logging.error("Could not activate logcontrol service on machine %s" % machine)
      logging.error("Please activate by hand")
      return 1

    serve_activate_cmd = "/etc/rc.d/init.d/serve_%s activate >&/dev/null" \
                           "</dev/null" % self.cfg.getGlobalParam('VERSION')
    if E.execute([machine], SECURE_WRAPPER_COMMAND % ( \
                            self.cfg.getGlobalParam('ENTERPRISE_HOME'),
                            "-e",
                           serve_activate_cmd),
                     None, 0) != E.ERR_OK:
      logging.error("Could not activate serve service on machine %s" % machine)
      logging.error("Please activate by hand")
      return 1

    logging.info("Machine %s successfully added into the system" % machine)

    if not mail_already_sent(M.MSG_MACHINEADDED % machine):
      SendMail.send(self.cfg, None, false,
                  M.MSG_MACHINEADDED % machine, "", true)
    return 0
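
The disk-name pipeline in the middle of add() (drop bad disks, append the data-partition number, rename sdX to hdX, dedup preserving order) condenses to a few lines; the function name is mine, and the explicit loop replaces the side-effecting list comprehension used above:

import re

def good_disk_names(all_disks, bad_disks):
  seen, result = set(), []
  for disk in all_disks:
    if disk in bad_disks:
      continue
    name = re.sub(r'^s', 'h', '%s3' % disk)  # 'sda' -> 'sda3' -> 'hda3'
    if name not in seen:
      seen.add(name)
      result.append(name)
  return result

assert good_disk_names(['sda', 'hdb', 'sda'], ['hdb']) == ['hda3']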
Example #9
def FindMasterUsingChubby(ver):
  """
  Find the master using chubby based master election.
  """
  return core_utils.GetGSAMaster(ver, install_utilities.is_test(ver))
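
This is the lookup that init_service in Example #5 wraps in a try/except, falling back to self.master_machine = None when the lockserver cannot be reached.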
Example #10
  def execute(self, argv):
    """
    This executes the service given the command line arguments.
    The first two argument are 'ent_home' and 'task' than is it's the
    children job to parse the extra args if it wants by overriding parse_args
    """

    # Args parsing
    if len(argv) < 3:
      sys.exit(self.usage())

    # Get the first two arguments and initialize
    self.init_service(argv[1])
    self.task = string.strip(argv[2])

    # Get the other arguments and call the parsing function
    flags_argv = [argv[0]]
    flags_argv.extend(argv[3:])
    self.parse_args(flags_argv)

    # Extra checks
    if not self.service_to_be_up():
      sys.exit('%s not active' % self.service_name)

    # check if the node is enabled
    testver = install_utilities.is_test(self.version)
    if core_utils.AmIDisabled(self.version, testver):
      logging.error('I am disabled.')
      sys.exit(-1)

    if (self.performs_only_on_master and
        self.task not in ("activate", "deactivate")):
      if not self.local_machine_is_master:
        logging.error('I am not the master')
        self.nop()
        sys.exit(0)


    if not self.task:
      sys.exit(self.usage())

    # Execute the operations behind a lock
    lockfile = "%s/%s_service_lock_%s" % (self.tmpdir,
                                          self.service_name,
                                          self.version)
    pidfile = "%s/%s_service_pid_%s" % (self.tmpdir,
                                        self.service_name,
                                        self.version)

    # Execute the task: unlocked for activate/deactivate,
    # locked otherwise
    if self.task in ["activate", "deactivate"]:
      do_task(self, self.task)
    else:
      if self.check_previous_cron_job:
        # kill previous cron job if lockfile timestamp is too old
        valid_lock_duration = self.secs_to_kill_previous_job
        # for "stop" and "restart" task, do it immediately
        if self.task in ["stop", "restart"]:
          valid_lock_duration = 0
        E.exec_locked(lockfile, 1, do_task, (self, self.task,), {},
                      valid_lock_duration, pidfile)
      else:
        # Lock will time out after 60 rounds of 10 seconds
        E.exec_locked(lockfile, 60, do_task, (self, self.task,))
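
E.exec_locked is an internal helper; a minimal self-contained analogue of "run a task while holding an exclusive lock file", assuming POSIX and omitting the retry count, timeout, stale-lock kill, and pidfile handling the original supports:

import fcntl

def exec_locked(lockfile, task, *args):
  # Hold an exclusive flock on lockfile for the duration of task.
  with open(lockfile, 'w') as fh:
    fcntl.flock(fh, fcntl.LOCK_EX)
    try:
      return task(*args)
    finally:
      fcntl.flock(fh, fcntl.LOCK_UN)

assert exec_locked('/tmp/demo_service_lock', lambda x: x + 1, 41) == 42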