Exemplo n.º 1
0
def SetInitState(cfg, state):
  """Sets system's initialization state.

  For oneway, it stores it in C.ENT_SYSTEM_INIT_STATE. For Clusters, it
  stores it in chubby file /ls/ent<version>/ENT_SYSTEM_INIT_STATE.

  @param cfg - of type configurator.
  @param state - string
  """
  # oneway?
  if 1 == len(core_utils.GetNodes()):
    cfg.setGlobalParam(C.ENT_SYSTEM_INIT_STATE, state)
    return

  tmpfile = E.mktemp('/export/hda3/tmp')
  try:
    try:
      f = open(tmpfile, 'w')
      try:
        f.write(state)
      finally:
        # Close the handle even if the write itself raises.
        f.close()
    except IOError:
      logging.fatal('Cannot write to temp file %s' % tmpfile)
      return
    version = cfg.getGlobalParam('VERSION')
    lockserv_cmd_prefix = core_utils.GetLSClientCmd(version, is_test(version))
    chubby_root_dir = '/ls/%s' % core_utils.GetCellName(version)
    write_cmd = '%s cp %s %s/%s' % (lockserv_cmd_prefix,
        tmpfile, chubby_root_dir, 'ENT_SYSTEM_INIT_STATE')
    logging.info('setting system init state to: %s', state)
    E.exe_or_fail(write_cmd)
  finally:
    # Always remove the temp file; the original leaked it on the IOError
    # path and whenever exe_or_fail aborted before the rm.
    E.exe('rm -rf %s' % tmpfile)
Exemplo n.º 2
0
    def remove(self, machine):
        """Removes a machine from the configuration.

        Stops the machine's core services, marks it dead in the lockserver,
        drops its data disks, re-runs machine allocation and restarts the
        babysitter so the serving config is rewritten.

        @param machine - string hostname of the node to remove
        @return 1 on error; otherwise the status returned by self.halt()
        """
        if machine not in self.cfg.getGlobalParam('MACHINES'):
            logging.error("%s doesn't exist" % machine)
            return 1

        # Refuse to remove the node we are running on BEFORE touching any
        # services; previously stop_core ran first, so a misdirected call
        # stopped core services on this very machine and then bailed out.
        if machine == E.getCrtHostName():
            logging.error("Cannot remove self")
            return 1

        ver = self.cfg.getGlobalParam('VERSION')
        home = self.cfg.getGlobalParam('ENTERPRISE_HOME')
        testver = install_utilities.is_test(ver)
        # if possible stop the core services, ignore return code
        install_utilities.stop_core(ver, home, [machine])

        # Halt the machine if APC is used.
        error = self.halt(machine)

        self.cfg.globalParams.ReplaceVarInParam("SERVERS", None, machine)
        self.cfg.globalParams.ReplaceVarInParam("MACHINES", None, machine)
        ret = core_utils.AddDeadNode(ver, testver, machine)
        # remove the chunkserver running on the node
        gfs_utils.DeleteGFSChunkservers(ver, testver, [machine])
        if ret:
            logging.error('Cannot add dead node to the lockserver.')
            # we ignore this error for now

        # now we need to remove the data disks that were on this machine
        data_disks = self.cfg.globalParams.var_copy('DATACHUNKDISKS')
        if machine in data_disks:  # 'in' instead of deprecated has_key()
            del data_disks[machine]
            if not self.cfg.setGlobalParam('DATACHUNKDISKS', data_disks):
                return 1

        # This also saves the config file
        if not self.cfg.DoMachineAllocation():
            return 1

        # Now we need to restart babysitter because the old one
        # is out of sync after this
        serve_service_cmd = (
            ". %s && "
            "cd %s/local/google3/enterprise/legacy/scripts && "
            "./serve_service.py %s" %
            (self.cfg.getGlobalParam('ENTERPRISE_BASHRC'),
             self.cfg.getGlobalParam('ENTERPRISE_HOME'),
             self.cfg.getGlobalParam('ENTERPRISE_HOME')))
        E.exe("%s %s" % (serve_service_cmd, "babysit"))

        self.restart_crawl_processes(serve_service_cmd)

        if not mail_already_sent(M.MSG_MACHINEREMOVED % machine):
            SendMail.send(self.cfg, None, false,
                          M.MSG_MACHINEREMOVED % machine, "", true)

        return error
Exemplo n.º 3
0
  def remove(self, machine):
    """Removes a machine from the configuration.

    Stops the machine's core services, marks it dead in the lockserver,
    drops its data disks, re-runs machine allocation and restarts the
    babysitter so the serving config is rewritten.

    @param machine - string hostname of the node to remove
    @return 1 on error; otherwise the status returned by self.halt()
    """
    if machine not in self.cfg.getGlobalParam('MACHINES'):
      logging.error("%s doesn't exist" % machine)
      return 1

    # Refuse to remove the node we are running on BEFORE touching any
    # services; previously stop_core ran first, so a misdirected call
    # stopped core services on this very machine and then bailed out.
    if machine == E.getCrtHostName():
      logging.error("Cannot remove self")
      return 1

    ver = self.cfg.getGlobalParam('VERSION')
    home = self.cfg.getGlobalParam('ENTERPRISE_HOME')
    testver = install_utilities.is_test(ver)
    # if possible stop the core services, ignore return code
    install_utilities.stop_core(ver, home, [machine])

    # Halt the machine if APC is used.
    error = self.halt(machine)

    self.cfg.globalParams.ReplaceVarInParam("SERVERS", None, machine)
    self.cfg.globalParams.ReplaceVarInParam("MACHINES", None, machine)
    ret = core_utils.AddDeadNode(ver, testver, machine)
    # remove the chunkserver running on the node
    gfs_utils.DeleteGFSChunkservers(ver, testver, [machine])
    if ret:
      logging.error('Cannot add dead node to the lockserver.')
      # we ignore this error for now

    # now we need to remove the data disks that were on this machine
    data_disks = self.cfg.globalParams.var_copy('DATACHUNKDISKS')
    if machine in data_disks:  # 'in' instead of deprecated has_key()
      del data_disks[machine]
      if not self.cfg.setGlobalParam('DATACHUNKDISKS', data_disks):
        return 1

    # This also saves the config file
    if not self.cfg.DoMachineAllocation():
      return 1

    # Now we need to restart babysitter because the old one
    # is out of sync after this
    serve_service_cmd = (". %s && "
        "cd %s/local/google3/enterprise/legacy/scripts && "
        "./serve_service.py %s" % (
          self.cfg.getGlobalParam('ENTERPRISE_BASHRC'),
          self.cfg.getGlobalParam('ENTERPRISE_HOME'),
          self.cfg.getGlobalParam('ENTERPRISE_HOME')))
    E.exe("%s %s" % (serve_service_cmd, "babysit"))

    self.restart_crawl_processes(serve_service_cmd)

    if not mail_already_sent(M.MSG_MACHINEREMOVED % machine):
      SendMail.send(self.cfg, None, false,
                 M.MSG_MACHINEREMOVED % machine, "", true)

    return error
Exemplo n.º 4
0
 def start_service(self, service):
     """Activates, then starts, the given service on self.machine_ over ssh.

     @param service - init-script base name (e.g. 'crawl')
     @return result of the last E.exe ssh invocation (0 indicates success)
     """
     logging.info("ACTIVATE: %s service %s on %s" %
                  (service, self.version_, self.machine_))
     activate = E.nonblocking_cmd("/etc/rc.d/init.d/%s_%s activate" %
                                  (service, self.version_))
     status = E.exe("ssh %s %s" % (self.machine_, commands.mkarg(activate)))
     # Guard clause: only attempt the start step if activation succeeded.
     if status != 0:
         return status
     logging.info("START: %s service %s on %s" %
                  (service, self.version_, self.machine_))
     start = E.nonblocking_cmd("/etc/rc.d/init.d/%s_%s start" %
                               (service, self.version_))
     return E.exe("ssh %s %s" % (self.machine_, commands.mkarg(start)))
Exemplo n.º 5
0
 def start_service(self, service):
   """Activates, then starts, the given service on self.machine_ over ssh.

   @param service - init-script base name (e.g. 'crawl')
   @return result of the last E.exe ssh invocation (0 indicates success)
   """
   logging.info("ACTIVATE: %s service %s on %s" % (
     service, self.version_, self.machine_))
   activate_cmd = E.nonblocking_cmd(
     "/etc/rc.d/init.d/%s_%s activate" % (service, self.version_))
   status = E.exe("ssh %s %s" % (self.machine_, commands.mkarg(activate_cmd)))
   # Guard clause: skip the start step when activation failed.
   if status != 0:
     return status
   logging.info("START: %s service %s on %s" % (
       service, self.version_, self.machine_))
   start_cmd = E.nonblocking_cmd(
       "/etc/rc.d/init.d/%s_%s start" % (service, self.version_))
   return E.exe("ssh %s %s" % (self.machine_, commands.mkarg(start_cmd)))
Exemplo n.º 6
0
 def EnsureDirectory(self, fileutil_args, path):
   """Creates directory `path` (locally, or via fileutil for GFS paths).

   @param fileutil_args - extra flags passed to the fileutil command
   @param path - directory to create; a '/gfs/...' prefix selects fileutil
   """
   logging.info("ensuring directory %s" % path)
   # startswith() is the idiomatic form of string.find(...) == 0.
   if path.startswith("/gfs/"):
     # path is a gfs directory, so it must be created through fileutil
     cmnd = "fileutil %s -f mkdir -p %s" % (fileutil_args, path)
   else:
     cmnd = "mkdir -p %s" % path
   res = E.exe(cmnd)
   logging.info("Result of command %s is %d" % (cmnd, res))
Exemplo n.º 7
0
 def EnsureDirectory(self, fileutil_args, path):
     """Creates directory `path` (locally, or via fileutil for GFS paths).

     @param fileutil_args - extra flags passed to the fileutil command
     @param path - directory to create; a '/gfs/...' prefix selects fileutil
     """
     logging.info("ensuring directory %s" % path)
     # startswith() is the idiomatic form of string.find(...) == 0.
     if path.startswith("/gfs/"):
         # path is a gfs directory, so it must be created through fileutil
         cmnd = "fileutil %s -f mkdir -p %s" % (fileutil_args, path)
     else:
         cmnd = "mkdir -p %s" % path
     res = E.exe(cmnd)
     logging.info("Result of command %s is %d" % (cmnd, res))
Exemplo n.º 8
0
 def restart_babysitter(self):
     """Kicks off the babysitter in the background.

     @return bool - true on success and false on failure
     """
     bashrc = self.cfg.getGlobalParam('ENTERPRISE_BASHRC')
     home = self.cfg.getGlobalParam('ENTERPRISE_HOME')
     babysitter_cmd = (". %s && "
                       "cd %s/local/google3/enterprise/legacy/scripts && "
                       "./serve_service.py %s " % (bashrc, home, home))
     # Run babysitter in the background.
     return E.exe("%s %s &" % (babysitter_cmd, "babysit")) == 0
Exemplo n.º 9
0
 def restart_babysitter(self):
   """Kicks off the babysitter in the background.

   @return bool - true on success and false on failure
   """
   bashrc = self.cfg.getGlobalParam('ENTERPRISE_BASHRC')
   home = self.cfg.getGlobalParam('ENTERPRISE_HOME')
   babysitter_cmd = (". %s && "
                     "cd %s/local/google3/enterprise/legacy/scripts && "
                     "./serve_service.py %s " % (bashrc, home, home))
   # Run babysitter in the background.
   return E.exe("%s %s &" % (babysitter_cmd, "babysit")) == 0
Exemplo n.º 10
0
    def delete_pagerank_barriers(self):
        """Deletes barrier files used by pr_main."""
        # Sanity check: this only makes sense while pr_main is the pageranker.
        if self.cp.var('PAGERANKER_PROG') != 'pr_main':
            logging.fatal('Not using pr_main anymore')

        # Required parameters from entconfig.
        prefix = '%s/barriers' % self.cp.var('NAMESPACE_PREFIX')
        remove_cmd = ('%s/fileutil --datadir=%s --gfs_aliases=%s '
                      '--bnsresolver_use_svelte=false '
                      ' rm -f %s.barrier_progpr_*_of_*_op*_iter*' %
                      (self.bin_dir, self.cp.var('DATADIR'),
                       self.cp.var('GFS_ALIASES'), prefix))
        # When only one pr_main shard runs, it deletes these itself during
        # its own startup; here we do it explicitly.
        logging.info('Deleting barriers - %s' % remove_cmd)
        E.exe(remove_cmd)
Exemplo n.º 11
0
  def delete_pagerank_barriers(self):
    """Deletes barrier files used by pr_main."""
    # Sanity check: this only makes sense while pr_main is the pageranker.
    if self.cp.var('PAGERANKER_PROG') != 'pr_main':
      logging.fatal('Not using pr_main anymore')

    # Required parameters from entconfig.
    prefix = '%s/barriers' % self.cp.var('NAMESPACE_PREFIX')
    remove_cmd = ('%s/fileutil --datadir=%s --gfs_aliases=%s '
                  '--bnsresolver_use_svelte=false '
                  ' rm -f %s.barrier_progpr_*_of_*_op*_iter*' %
                  (self.bin_dir, self.cp.var('DATADIR'),
                   self.cp.var('GFS_ALIASES'), prefix))
    # When only one pr_main shard runs, it deletes these itself during its
    # own startup; here we do it explicitly.
    logging.info('Deleting barriers - %s' % remove_cmd)
    E.exe(remove_cmd)
Exemplo n.º 12
0
    def createDataDirs(self, machines, onlyifneeded=false, node_replacement=0):
        """Creates the data directories for an index on the given machines.

        This also runs when a node is added back to the cluster.

        @param machines - list of hostnames to create directories on
        @param onlyifneeded - if true, only proceed when search.config is
            missing (its absence means enterprise-data must be re-created)
        @param node_replacement - not referenced in this body; kept for
            caller compatibility
        @return boolean success status (project true/false constants)
        """
        if onlyifneeded:
            # The presence or absence of search.config indicates if we need to
            # re-create the directories
            config = '%s/search.config' % self.getGlobalParam(C.DATADIR)
            if E.access(machines, config, 'f'):
                logging.info('createDataDirs: search.config already exists')
                return true
            else:
                logging.info(
                    "createDataDirs: search.config doesn't exist; re-creating")
        logging.info("Create enterprise datadir...")
        if not data_directory.create(
                self.getGlobalParam(C.DATADISK),
                self.getGlobalParam("ENT_BIGFILE_DATADISKS"),
                self.getGlobalParam(C.DATACHUNK_PREFIX), "enterprise",
                self.getGlobalParam(C.BIN_DIRS), machines):
            logging.error("Error creating datadir")
            return false

        # The querycache datadir lives alongside the enterprise one
        # (../querycache relative to DATADISK / DATACHUNK_PREFIX).
        logging.info("Create querycache datadir...")
        if not data_directory.create(
                "%s/../querycache" % self.getGlobalParam(C.DATADISK),
                self.getGlobalParam("ENT_BIGFILE_DATADISKS"),
                "%s/../querycache" % self.getGlobalParam(C.DATACHUNK_PREFIX),
                "cache", self.getGlobalParam(C.BIN_DIRS), machines):
            logging.error("Error creating datadir")
            return false

        # Create FEEDS_DIR and FEED_STATUS_DIR for one-way
        # NOTE(review): an empty GFS_CELL appears to identify a oneway
        # (non-cluster) box here — confirm against the config schema.
        if not self.getGlobalParam(C.GFS_CELL):
            cmnd = "mkdir -p %s; mkdir -p %s" % (self.getGlobalParam(
                'FEEDS_DIR'), self.getGlobalParam('FEED_STATUS_DIR'))
            res = E.exe(cmnd)
            logging.info("Result of command %s is %d" % (cmnd, res))

        # Create ONEBOX_LOGS_DIR for one-way
        if not self.getGlobalParam(C.GFS_CELL):
            cmnd = "mkdir -p %s" % self.getGlobalParam('ONEBOX_LOGS_DIR')
            res = E.exe(cmnd)
            logging.info("Result of command %s is %d" % (cmnd, res))

        # Create directory for rt-index cache on every target machine; the
        # trailing 'test -d' makes the remote command fail if mkdir failed.
        if self.getGlobalParam('RTSLAVE_LOCAL_CACHE_DIR'):
            d = self.getGlobalParam('RTSLAVE_LOCAL_CACHE_DIR')
            out = []
            cmd = "mkdir -p %s; test -d %s" % (d, d)
            if E.ERR_OK != E.execute(machines, cmd, out, false):
                logging.error(
                    "Error creating cache directory for rtslave: %s" % out)
                return false

        # Ram cache directory is the mount point itself, so we don't need to create it.
        #self.getGlobalParam('RTSLAVE_RAM_DIR_FOR_INDEX_CACHING')

        return true
Exemplo n.º 13
0
  def createDataDirs(self, machines, onlyifneeded=false, node_replacement=0):
    """Creates the data directories for an index on the given machines.

    This also runs when a node is added back to the cluster.

    @param machines - list of hostnames to create directories on
    @param onlyifneeded - if true, only proceed when search.config is
        missing (its absence means enterprise-data must be re-created)
    @param node_replacement - not referenced in this body; kept for
        caller compatibility
    @return boolean success status (project true/false constants)
    """
    if onlyifneeded:
      # The presence or absence of search.config indicates if we need to
      # re-create the directories
      config = '%s/search.config' % self.getGlobalParam(C.DATADIR)
      if E.access(machines, config, 'f'):
        logging.info('createDataDirs: search.config already exists')
        return true
      else:
        logging.info("createDataDirs: search.config doesn't exist; re-creating")
    logging.info("Create enterprise datadir...")
    if not data_directory.create(
      self.getGlobalParam(C.DATADISK),
      self.getGlobalParam("ENT_BIGFILE_DATADISKS"),
      self.getGlobalParam(C.DATACHUNK_PREFIX),
      "enterprise",
      self.getGlobalParam(C.BIN_DIRS),
      machines):
      logging.error("Error creating datadir")
      return false

    # The querycache datadir lives alongside the enterprise one
    # (../querycache relative to DATADISK / DATACHUNK_PREFIX).
    logging.info("Create querycache datadir...")
    if not data_directory.create(
      "%s/../querycache" % self.getGlobalParam(C.DATADISK),
      self.getGlobalParam("ENT_BIGFILE_DATADISKS"),
      "%s/../querycache" % self.getGlobalParam(C.DATACHUNK_PREFIX),
      "cache",
      self.getGlobalParam(C.BIN_DIRS),
      machines):
      logging.error("Error creating datadir")
      return false


    # Create FEEDS_DIR and FEED_STATUS_DIR for one-way
    # NOTE(review): an empty GFS_CELL appears to identify a oneway
    # (non-cluster) box here — confirm against the config schema.
    if not self.getGlobalParam(C.GFS_CELL):
      cmnd = "mkdir -p %s; mkdir -p %s" % (self.getGlobalParam('FEEDS_DIR'),
                                     self.getGlobalParam('FEED_STATUS_DIR'))
      res = E.exe(cmnd)
      logging.info("Result of command %s is %d" % (cmnd, res))

    # Create ONEBOX_LOGS_DIR for one-way
    if not self.getGlobalParam(C.GFS_CELL):
      cmnd = "mkdir -p %s" % self.getGlobalParam('ONEBOX_LOGS_DIR')
      res = E.exe(cmnd)
      logging.info("Result of command %s is %d" % (cmnd, res))

    # Create directory for rt-index cache on every target machine; the
    # trailing 'test -d' makes the remote command fail if mkdir failed.
    if self.getGlobalParam('RTSLAVE_LOCAL_CACHE_DIR'):
      d = self.getGlobalParam('RTSLAVE_LOCAL_CACHE_DIR')
      out = []
      cmd = "mkdir -p %s; test -d %s" % (d,d)
      if E.ERR_OK != E.execute(machines, cmd, out, false):
        logging.error("Error creating cache directory for rtslave: %s" % out)
        return false

    # Ram cache directory is the mount point itself, so we don't need to create it.
    #self.getGlobalParam('RTSLAVE_RAM_DIR_FOR_INDEX_CACHING')

    return true
Exemplo n.º 14
0
 def restart_crawl_processes(self, serve_service_cmd):
     """Restarts a few crawl-related servers so the bringup is quick.

     @param serve_service_cmd - shell prefix that invokes serve_service.py
     """
     crawl_components = ("--components=pr_main,urlmanager,"
                         "urlserver,bot,contentfilter")
     E.exe("%s %s %s" % (serve_service_cmd, "start", crawl_components))
Exemplo n.º 15
0
    def add(self, machine, apc_outlet):
        """Adds a machine to the cluster configuration.

        Verifies the system is ACTIVE and the machine is reachable, starts
        svs on it, registers it in MACHINES, clears it from the lockserver
        dead-node list, re-runs the serve service, probes its disks, creates
        data directories, replicates config, starts core services, allocates
        servers to it, and finally activates the crawl/logcontrol/serve init
        scripts on it.

        @param machine - string hostname of the node to add
        @param apc_outlet - APC outlet spec used to build the APC_MAP entry
        @return 0 on success, 1 on any error
        """
        # We can add a machine only when we are in active state
        if install_utilities.install_state(
                self.cfg.getGlobalParam('VERSION')) != "ACTIVE":
            logging.error("Can add a machine only when we are in active state")
            return 1

        # First test for accessibility of the machine.
        if E.execute([machine], 'echo 1', None, 1) != E.ERR_OK:
            logging.error("Could not ssh into the machine %s" % machine)
            return 1

        # start the svs on the remote machine
        restart_svs_cmd = "%s/local/google3/enterprise/legacy/util/svs_utilities.py %s %s" % (
            self.cfg.getGlobalParam('ENTERPRISE_HOME'),
            self.cfg.getGlobalParam('ENTERPRISE_HOME'), machine)
        if E.execute([E.getCrtHostName()],
                         SECURE_WRAPPER_COMMAND % ( \
                              self.cfg.getGlobalParam('ENTERPRISE_HOME'),
                              "-p2",
                              restart_svs_cmd),
                         None, 0) != E.ERR_OK:
            logging.error("Could not start svs on machine %s" % machine)
            return 1

        # wait for some time for svs to come up
        time.sleep(5)
        # check to see if the svs is up and is the right version
        if not svs_utilities.PingAndCheckSvsVersion(
                self.cfg.getGlobalParam('ENTERPRISE_BASHRC'),
                self.cfg.getGlobalParam('ENTERPRISE_HOME'), machine):
            logging.error("Svs not running correctly on machine %s" % machine)
            return 1
        ver = self.cfg.getGlobalParam('VERSION')
        home = self.cfg.getGlobalParam('ENTERPRISE_HOME')
        testver = install_utilities.is_test(ver)

        # update MACHINES
        machines = self.cfg.getGlobalParam('MACHINES')
        if machine not in machines:
            machines.append(machine)
        self.cfg.setGlobalParam('MACHINES', machines)

        ret = core_utils.RemDeadNode(ver, testver, machine)
        if ret:
            logging.error('Cannot remove dead node from lockserver.')
            # we ignore this error for now

        # We just added a new machine into the config
        # this will lead to a change in concentrator config
        # so we need to re-run serve service which will
        # write the new config and restart the concentrator
        serve_cmd = ". %s && cd %s/local/google3/enterprise/legacy/scripts && " \
                          "./serve_service.py %s" % (
          self.cfg.getGlobalParam('ENTERPRISE_BASHRC'),
          self.cfg.getGlobalParam('ENTERPRISE_HOME'),
          self.cfg.getGlobalParam('ENTERPRISE_HOME'))
        E.exe("%s %s" % (serve_cmd, "babysit"))

        # Poll the machine fact cache (up to 5 tries, one minute apart)
        # until both disk facts are reported for the new machine.
        num_tries = 5
        cur_try = 0
        while cur_try < num_tries:
            cur_try = cur_try + 1
            all_disks = self.cfg.mach_param_cache.GetFact(
                "mounted-drives", machine)
            bad_disks = self.cfg.mach_param_cache.GetFact(
                "var_log_badhds", machine)
            if bad_disks and all_disks:
                break
            time.sleep(60)
        if all_disks == None or bad_disks == None:
            logging.error("Could not get machine information about %s" %
                          machine)
            return 1

        # Derive the usable disks: drop the bad ones, append partition
        # suffix "3", then map sdX3 -> hdX3.
        bad_disks = string.split(bad_disks, ' ')
        all_disks = string.split(all_disks, ' ')
        good_disks = filter(lambda x, y=bad_disks: x not in y, all_disks)
        good_disks = map(lambda x: "%s3" % x, good_disks)
        # change sda3 to hda3 etc.
        good_disks = map(lambda x: re.sub(r'^s', 'h', x), good_disks)

        # Preprocess disks before adding to remove duplicates.
        unique_good_disks = []
        [
            unique_good_disks.append(disk) for disk in good_disks
            if disk not in unique_good_disks
        ]

        # Add disks
        self.updatedisk(machine, unique_good_disks, true)

        # apc map update
        apc_map = self.cfg.globalParams.var_copy('APC_MAP')
        apc_map[machine] = apc_util.PortMap(apc_outlet)
        if not self.cfg.setGlobalParam('APC_MAP', apc_map):
            logging.error("ERROR setting apc map to %s" % repr(apc_map))
            return 1

        # create appropriate datadirs on that machine
        if not self.cfg.createDataDirs([machine], node_replacement=1):
            logging.error("ERROR could not create datadirs on machine %s" %
                          machine)
            return 1

        # Replicate the config
        self.cfg.replicateConfigOnMachine(machine)

        # Reconfigure net on the target machine
        if not reconfigurenet_util.doReconfigureNet(
                self.cfg.globalParams, [machine], i_am_master=0):
            logging.error('reconfigurenet failed for %s' % machine)
            return 1

        # Start core services on the new node
        if not install_utilities.start_core(ver, home, [machine], ignore=0):
            logging.error("ERROR could not start core services on %s" %
                          machine)
            return 1
        # Add the chunkserver back
        gfs_utils.AddGFSChunkservers(ver, testver, [machine])

        # first we need to do Machine allocation.
        # this will assign things that will satisfy the constraints
        if not self.cfg.DoMachineAllocation(serversets=['workqueue-slave']):
            logging.error("ERROR doing machine allocation")
            return 1

        # now try to relllocate some servers from existing machines to the new machine
        replaced = self.cfg.AllocateServersToNewMachine(machine)
        if not replaced:
            logging.error("ERROR allocating services to the new machine")
            return 1

        # first we need to restart the babysitter
        E.exe("%s %s" % (serve_cmd, "babysit"))
        time.sleep(60)

        # Now we need to stop all the replaced services on their old hosts.
        for server_string in replaced:
            server = serverlib.Server()
            server.InitFromName(server_string)
            replaced_type = server.servertype()
            kill_cmd = servertype.GetKillCmd(replaced_type, server.port())
            if E.execute([server.host()], kill_cmd, None, 1) != E.ERR_OK:
                logging.error("ERROR killing %s running on port %d on %s" % \
                                     (replaced_type, server.port(), server.host()))

        # we should make it active
        if not install_utilities.set_install_state(
                machine, self.cfg.getGlobalParam('ENTERPRISE_HOME'), "ACTIVE"):
            logging.error("ERROR changing state on machine %s. "
                          "Please make it active and activate and "
                          "start crawl service on it" % machine)
            return 1

        crawl_cmd = ". %s && cd %s/local/google3/enterprise/legacy/scripts && " \
                          "./crawl_service.py %s" % (
          self.cfg.getGlobalParam('ENTERPRISE_BASHRC'),
          self.cfg.getGlobalParam('ENTERPRISE_HOME'),
          self.cfg.getGlobalParam('ENTERPRISE_HOME'))
        if E.execute([machine], "%s %s" %
                     (crawl_cmd, "start"), None, 1) != E.ERR_OK:
            logging.error("Could not start crawl service on %s" % machine)
            return 1

        # save all the params
        self.cfg.saveParams()

        # for faster crawl recovery, lets restart all crawl processes
        self.restart_crawl_processes(serve_cmd)

        # activate the crawl and logcontrol service on the remote machine
        crawl_activate_cmd = "/etc/rc.d/init.d/crawl_%s activate >&/dev/null" \
                               "</dev/null" % self.cfg.getGlobalParam('VERSION')
        if E.execute([machine], SECURE_WRAPPER_COMMAND % ( \
                                self.cfg.getGlobalParam('ENTERPRISE_HOME'),
                                "-e",
                                crawl_activate_cmd),
                         None, 0) != E.ERR_OK:
            logging.error("Could not activate crawl service on machine %s" %
                          machine)
            logging.error("Please activate by hand")
            return 1
        log_activate_cmd = "/etc/rc.d/init.d/logcontrol_%s activate >&/dev/null" \
                               "</dev/null" % self.cfg.getGlobalParam('VERSION')
        if E.execute([machine], SECURE_WRAPPER_COMMAND % ( \
                                self.cfg.getGlobalParam('ENTERPRISE_HOME'),
                                "-e",
                               log_activate_cmd),
                         None, 0) != E.ERR_OK:
            logging.error(
                "Could not activate logcontrol service on machine %s" %
                machine)
            logging.error("Please activate by hand")
            return 1

        serve_activate_cmd = "/etc/rc.d/init.d/serve_%s activate >&/dev/null" \
                               "</dev/null" % self.cfg.getGlobalParam('VERSION')
        if E.execute([machine], SECURE_WRAPPER_COMMAND % ( \
                                self.cfg.getGlobalParam('ENTERPRISE_HOME'),
                                "-e",
                               serve_activate_cmd),
                         None, 0) != E.ERR_OK:
            logging.error("Could not activate serve service on machine %s" %
                          machine)
            logging.error("Please activate by hand")
            return 1

        logging.info("Machine %s successfully added into the system" % machine)

        if not mail_already_sent(M.MSG_MACHINEADDED % machine):
            SendMail.send(self.cfg, None, false, M.MSG_MACHINEADDED % machine,
                          "", true)
        return 0
Exemplo n.º 16
0
  def add(self, machine, apc_outlet):
    """
    This adds a machine to the configuration
    """
    # We can add a machine only when we are in active state
    if install_utilities.install_state(self.cfg.getGlobalParam('VERSION')) != "ACTIVE":
      logging.error("Can add a machine only when we are in active state")
      return 1

    # First test for accessibility of the machine.
    if E.execute([machine], 'echo 1', None, 1) != E.ERR_OK:
      logging.error("Could not ssh into the machine %s" % machine)
      return 1

    # start the svs on the remote machine
    restart_svs_cmd = "%s/local/google3/enterprise/legacy/util/svs_utilities.py %s %s" % (
                          self.cfg.getGlobalParam('ENTERPRISE_HOME'),
                          self.cfg.getGlobalParam('ENTERPRISE_HOME'),
                          machine)
    if E.execute([E.getCrtHostName()],
                     SECURE_WRAPPER_COMMAND % ( \
                          self.cfg.getGlobalParam('ENTERPRISE_HOME'),
                          "-p2",
                          restart_svs_cmd),
                     None, 0) != E.ERR_OK:
      logging.error("Could not start svs on machine %s" % machine)
      return 1

    # wait for some time for svs to come up
    time.sleep(5)
    # check to see if the svs is up and is the right version
    if not svs_utilities.PingAndCheckSvsVersion(
                          self.cfg.getGlobalParam('ENTERPRISE_BASHRC'),
                          self.cfg.getGlobalParam('ENTERPRISE_HOME'),
                          machine):
      logging.error("Svs not running correctly on machine %s" % machine)
      return 1
    ver = self.cfg.getGlobalParam('VERSION')
    home = self.cfg.getGlobalParam('ENTERPRISE_HOME')
    testver = install_utilities.is_test(ver)

    # update MACHINES
    machines = self.cfg.getGlobalParam('MACHINES')
    if machine not in machines:
      machines.append(machine)
    self.cfg.setGlobalParam('MACHINES', machines)

    ret = core_utils.RemDeadNode(ver, testver, machine)
    if ret:
      logging.error('Cannot remove dead node from lockserver.')
      # we ignore this error for now

    # We just added a new machine into the config
    # this will lead to a change in concentrator config
    # so we need to re-run serve service which will
    # write the new config and restart the concentrator
    serve_cmd = ". %s && cd %s/local/google3/enterprise/legacy/scripts && " \
                      "./serve_service.py %s" % (
      self.cfg.getGlobalParam('ENTERPRISE_BASHRC'),
      self.cfg.getGlobalParam('ENTERPRISE_HOME'),
      self.cfg.getGlobalParam('ENTERPRISE_HOME'))
    E.exe("%s %s" % (serve_cmd, "babysit"))

    num_tries = 5
    cur_try = 0
    while cur_try < num_tries:
      cur_try = cur_try + 1
      all_disks = self.cfg.mach_param_cache.GetFact("mounted-drives", machine) 
      bad_disks = self.cfg.mach_param_cache.GetFact("var_log_badhds", machine) 
      if bad_disks and all_disks:
        break
      time.sleep(60)
    if all_disks == None or bad_disks == None:
      logging.error("Could not get machine information about %s" % machine)
      return 1

    bad_disks = string.split(bad_disks, ' ')
    all_disks = string.split(all_disks, ' ')
    good_disks = filter(lambda x, y=bad_disks: x not in y, all_disks)
    good_disks = map(lambda x: "%s3" % x, good_disks)
    # change sda3 to hda3 etc.
    good_disks = map(lambda x: re.sub(r'^s', 'h', x), good_disks)

    # Preprocess disks before adding to remove duplicates.
    unique_good_disks = []
    [unique_good_disks.append(disk) for disk in good_disks if disk not in unique_good_disks]

    # Add disks
    self.updatedisk(machine, unique_good_disks, true)

    # apc map update
    apc_map = self.cfg.globalParams.var_copy('APC_MAP')
    apc_map[machine] = apc_util.PortMap(apc_outlet)
    if not self.cfg.setGlobalParam('APC_MAP', apc_map):
      logging.error("ERROR setting apc map to %s" % repr(apc_map))
      return 1

    # create appropriate datadirs on that machine
    if not self.cfg.createDataDirs([machine], node_replacement = 1):
      logging.error("ERROR could not create datadirs on machine %s" % machine)
      return 1

    # Replicate the config
    self.cfg.replicateConfigOnMachine(machine)

    # Reconfigure net on the target machine
    if not reconfigurenet_util.doReconfigureNet(self.cfg.globalParams,
                                                [machine], i_am_master=0):
      logging.error('reconfigurenet failed for %s' % machine)
      return 1

    # Start core services on the new node
    if not install_utilities.start_core(ver, home, [machine], ignore=0):
      logging.error("ERROR could not start core services on %s" % machine)
      return 1
    # Add the chunkserver back
    gfs_utils.AddGFSChunkservers(ver, testver, [machine])

    # first we need to do Machine allocation.
    # this will assign things that will satisfy the constraints
    if not self.cfg.DoMachineAllocation(serversets=['workqueue-slave']):
      logging.error("ERROR doing machine allocation")
      return 1

    # now try to relllocate some servers from existing machines to the new machine
    replaced = self.cfg.AllocateServersToNewMachine(machine)
    if not replaced:
      logging.error("ERROR allocating services to the new machine")
      return 1

    # first we need to restart the babysitter
    E.exe("%s %s" % (serve_cmd, "babysit"))
    time.sleep(60)

    # Now we need to stop all the replaced services
    for server_string in replaced:
      server = serverlib.Server()
      server.InitFromName(server_string)
      replaced_type = server.servertype()
      kill_cmd = servertype.GetKillCmd(replaced_type, server.port())
      if E.execute([server.host()], kill_cmd, None, 1) != E.ERR_OK:
        logging.error("ERROR killing %s running on port %d on %s" % \
                             (replaced_type, server.port(), server.host()))


    # we should make it active
    if not install_utilities.set_install_state(machine,
                             self.cfg.getGlobalParam('ENTERPRISE_HOME'),
                             "ACTIVE"):
      logging.error("ERROR changing state on machine %s. "
                    "Please make it active and activate and "
                    "start crawl service on it" % machine)
      return 1

    crawl_cmd = ". %s && cd %s/local/google3/enterprise/legacy/scripts && " \
                      "./crawl_service.py %s" % (
      self.cfg.getGlobalParam('ENTERPRISE_BASHRC'),
      self.cfg.getGlobalParam('ENTERPRISE_HOME'),
      self.cfg.getGlobalParam('ENTERPRISE_HOME'))
    if E.execute([machine], "%s %s" % (crawl_cmd, "start"), None, 1) != E.ERR_OK:
      logging.error("Could not start crawl service on %s" % machine)
      return 1

    # save all the params
    self.cfg.saveParams()

    # for faster crawl recovery, lets restart all crawl processes
    self.restart_crawl_processes(serve_cmd)

    # activate the crawl and logcontrol service on the remote machine
    crawl_activate_cmd = "/etc/rc.d/init.d/crawl_%s activate >&/dev/null" \
                           "</dev/null" % self.cfg.getGlobalParam('VERSION')
    if E.execute([machine], SECURE_WRAPPER_COMMAND % ( \
                            self.cfg.getGlobalParam('ENTERPRISE_HOME'),
                            "-e",
                            crawl_activate_cmd),
                     None, 0) != E.ERR_OK:
      logging.error("Could not activate crawl service on machine %s" % machine)
      logging.error("Please activate by hand")
      return 1
    log_activate_cmd = "/etc/rc.d/init.d/logcontrol_%s activate >&/dev/null" \
                           "</dev/null" % self.cfg.getGlobalParam('VERSION')
    if E.execute([machine], SECURE_WRAPPER_COMMAND % ( \
                            self.cfg.getGlobalParam('ENTERPRISE_HOME'),
                            "-e",
                           log_activate_cmd),
                     None, 0) != E.ERR_OK:
      logging.error("Could not activate logcontrol service on machine %s" % machine)
      logging.error("Please activate by hand")
      return 1

    serve_activate_cmd = "/etc/rc.d/init.d/serve_%s activate >&/dev/null" \
                           "</dev/null" % self.cfg.getGlobalParam('VERSION')
    if E.execute([machine], SECURE_WRAPPER_COMMAND % ( \
                            self.cfg.getGlobalParam('ENTERPRISE_HOME'),
                            "-e",
                           serve_activate_cmd),
                     None, 0) != E.ERR_OK:
      logging.error("Could not activate serve service on machine %s" % machine)
      logging.error("Please activate by hand")
      return 1

    logging.info("Machine %s successfully added into the system" % machine)

    if not mail_already_sent(M.MSG_MACHINEADDED % machine):
      SendMail.send(self.cfg, None, false,
                  M.MSG_MACHINEADDED % machine, "", true)
    return 0
Exemplo n.º 17
0
  def EnsureSpellingData(self, reset = 0):
    """Ensures that the initial spelling data is present.

    Copies the shipped spelling files into the runtime spelling directory
    and verifies the copy by comparing file counts. If reset is 1,
    ENT_SPELL_SERVING_ID is cleared first so the files revert to their
    initial state (useful after the index has been reset).

    Args:
      reset: int - 1 to clear ENT_SPELL_SERVING_ID before ensuring data.
    """

    logging.info("ensuring presence of initial spelling data")
    serving_id_cfg_name = 'ENT_SPELL_SERVING_ID'

    # if reset is set - blow away runtime dictionary version. (this is
    # useful after index has been reset).
    if self.hasGlobalParam(serving_id_cfg_name) and (reset == 1):
      self.setGlobalParam(serving_id_cfg_name, 0)

    if (self.hasGlobalParam(serving_id_cfg_name)) and \
       (self.getGlobalParam(serving_id_cfg_name) == 0):
      fileutil_args = ""
      if self.hasGlobalParam('GFS_ALIASES'):
        fileutil_args = "--gfs_aliases='%s'" % \
                        self.getGlobalParam('GFS_ALIASES')
        fileutil_args += " --bnsresolver_use_svelte=false"
      if self.hasGlobalParam('DATADIR'):
        fileutil_args = "%s --datadir=%s" % \
                        (fileutil_args, self.getGlobalParam('DATADIR'))
      # note: assumes that the parent of spell_root exists
      spell_root = self.getGlobalParam('ENT_SPELL_ROOT_DIR')
      # Normalize: strip a trailing slash (guard against an empty value,
      # which would otherwise raise an IndexError).
      if spell_root and spell_root[-1] == '/':
        spell_root = spell_root[:-1]
      # Build the target from the normalized spell_root; using the raw
      # ENT_SPELL_ROOT_DIR here could produce a ".../foo//spell-0" path.
      target_path = "%s/spell-0" % spell_root
      self.EnsureDirectory(fileutil_args, spell_root)
      self.EnsureDirectory(fileutil_args, target_path)
      logging.info("ensuring files")
      if not self.hasGlobalParam('ENTERPRISE_HOME'):
        logging.fatal("No ENTERPRISE_HOME config parameter")
        return
      src_path = "%s/../spelling-data/runtime" % \
                 self.getGlobalParam('ENTERPRISE_HOME')
      # Copy every shipped *.spelling.* file into the runtime target dir.
      cmnd = "(cd %s ; " % src_path
      cmnd = cmnd + "for f in *.spelling.* ; "
      cmnd = cmnd + "do fileutil %s -f cp %s/$f %s/$f; done)" % \
                    (fileutil_args, src_path, target_path)
      res = E.exe(cmnd)
      logging.info("Result of command %s is %d" % (cmnd, res) )
      # Verify the copy: source and target must hold the same number of
      # spelling files.
      num_src_files = self.CountSpellingFiles(fileutil_args, src_path)
      logging.info("There are %d spelling files in the source directory" % \
                   num_src_files)
      num_target_files = \
                       self.CountSpellingFiles(fileutil_args, target_path)
      logging.info("There are %d spelling files in the target directory"% \
                   num_target_files)
      if num_src_files == num_target_files:
        logging.info("spelling data present")
      else:
        logging.fatal("failed to ensure presence of spelling data")
        return

    else:
      logging.info("no config param %s, or it's not 0" % serving_id_cfg_name)
      logging.info("skipping spelling data check")
Exemplo n.º 18
0
    def EnsureSpellingData(self, reset=0):
        """Ensures that the initial spelling data is present.

        Copies the shipped spelling files into the runtime spelling
        directory and verifies the copy by comparing file counts. If reset
        is 1, ENT_SPELL_SERVING_ID is cleared first so the files revert to
        their initial state (useful after the index has been reset).

        Args:
          reset: int - 1 to clear ENT_SPELL_SERVING_ID before ensuring data.
        """

        logging.info("ensuring presence of initial spelling data")
        serving_id_cfg_name = 'ENT_SPELL_SERVING_ID'

        # if reset is set - blow away runtime dictionary version. (this is
        # useful after index has been reset).
        if self.hasGlobalParam(serving_id_cfg_name) and (reset == 1):
            self.setGlobalParam(serving_id_cfg_name, 0)

        if (self.hasGlobalParam(serving_id_cfg_name)) and \
           (self.getGlobalParam(serving_id_cfg_name) == 0):
            fileutil_args = ""
            if self.hasGlobalParam('GFS_ALIASES'):
                fileutil_args = "--gfs_aliases='%s'" % \
                                self.getGlobalParam('GFS_ALIASES')
                fileutil_args += " --bnsresolver_use_svelte=false"
            if self.hasGlobalParam('DATADIR'):
                fileutil_args = "%s --datadir=%s" % \
                                (fileutil_args, self.getGlobalParam('DATADIR'))
            # note: assumes that the parent of spell_root exists
            spell_root = self.getGlobalParam('ENT_SPELL_ROOT_DIR')
            # Normalize: strip a trailing slash (guard against an empty
            # value, which would otherwise raise an IndexError).
            if spell_root and spell_root[-1] == '/':
                spell_root = spell_root[:-1]
            # Build the target from the normalized spell_root; using the
            # raw ENT_SPELL_ROOT_DIR here could yield ".../foo//spell-0".
            target_path = "%s/spell-0" % spell_root
            self.EnsureDirectory(fileutil_args, spell_root)
            self.EnsureDirectory(fileutil_args, target_path)
            logging.info("ensuring files")
            if not self.hasGlobalParam('ENTERPRISE_HOME'):
                logging.fatal("No ENTERPRISE_HOME config parameter")
                return
            src_path = "%s/../spelling-data/runtime" % \
                       self.getGlobalParam('ENTERPRISE_HOME')
            # Copy every shipped *.spelling.* file into the runtime target.
            cmnd = "(cd %s ; " % src_path
            cmnd = cmnd + "for f in *.spelling.* ; "
            cmnd = cmnd + "do fileutil %s -f cp %s/$f %s/$f; done)" % \
                          (fileutil_args, src_path, target_path)
            res = E.exe(cmnd)
            logging.info("Result of command %s is %d" % (cmnd, res))
            # Verify the copy: source and target must hold the same number
            # of spelling files.
            num_src_files = self.CountSpellingFiles(fileutil_args, src_path)
            logging.info("There are %d spelling files in the source directory" % \
                         num_src_files)
            num_target_files = \
                             self.CountSpellingFiles(fileutil_args, target_path)
            logging.info("There are %d spelling files in the target directory"% \
                         num_target_files)
            if num_src_files == num_target_files:
                logging.info("spelling data present")
            else:
                logging.fatal("failed to ensure presence of spelling data")
                return

        else:
            logging.info("no config param %s, or it's not 0" %
                         serving_id_cfg_name)
            logging.info("skipping spelling data check")
Exemplo n.º 19
0
 def restart_crawl_processes(self, serve_service_cmd):
   """Restarts the crawl-related servers so that bringup is quick."""
   # Only the crawl pipeline components are restarted; everything else
   # is left running.
   components = ("--components=pr_main,urlmanager,"
                 "urlserver,bot,contentfilter")
   E.exe("%s start %s" % (serve_service_cmd, components))