def SyncOneboxLog(config):
  """Bring the GFS copy of the Onebox log in line with the local one.

  Per the original notes, as of 4.6.4 this is invoked from
  scripts/periodic_script.py and from onebox_handler.py (View Log on a
  cluster).

  Nothing happens unless ENT_CONFIG_TYPE is 'CLUSTER' and the current host
  is the first entry of config.SERVERS for the 'oneboxenterprise' port
  base.  Otherwise fileutil is asked to 'equalize' the local log with the
  GFS log; if that reports an error the whole file is force-copied instead,
  and a failure of that copy is logged.

  Args:
    config: configuration object exposing SERVERS and var().
  """
  port_base = servertype.GetPortBase('oneboxenterprise')
  onebox_hosts = config.SERVERS[port_base]
  this_host = E.getCrtHostName()
  config_type = config.var('ENT_CONFIG_TYPE')

  # Only the cluster node that hosts the onebox server has anything to sync.
  if not (config_type == 'CLUSTER' and this_host == onebox_hosts[0]):
    return

  tmp_dir = config.var('TMPDIR')
  gfs_cell = config.var('GFS_CELL')
  local_log = os.path.join(tmp_dir, config.var('ENTERPRISE_ONEBOX_LOG'))
  gfs_log = os.path.join(os.sep, 'gfs', gfs_cell,
                         config.var('ENTERPRISE_ONEBOX_LOG'))

  # Cheap path first: equalize copies only the difference of the log files.
  failed, _ = E.run_fileutil_command(
      config, 'equalize %s %s' % (local_log, gfs_log))
  if failed:
    # The files did not match at the beginning (possibly a new log file was
    # created), so fall back to copying the log wholesale.
    failed, _ = E.run_fileutil_command(
        config, 'cp -f %s %s' % (local_log, gfs_log))
    if failed:
      logging.error('Error while syncing onebox logs.')
Exemplo n.º 2
0
  def browse(self, clientName, virtualFile, fromLine, toLine,
             grepString, fileArgs):
    """Return the lines fromLine..toLine of a file that contain grepString.

    Args:
      clientName, virtualFile, fileArgs: handed to self.makePhysicalFile()
        to resolve the file to read.
      fromLine, toLine: strings holding the first and last line number;
        they are converted with string.atoi().
      grepString: text to look for; it is stripped and URL-quoted, then
        matched case-insensitively as a fixed string (grep -i -F).

    Returns:
      "0" if the shell command fails, otherwise "<length>\n<text>" where
      <text> is the first entry E.execute() stored in the result list.

    Raises:
      ar_exception.ARException: if makePhysicalFile() returns nothing.
    """
    quotedGrep = urllib.quote_plus(grepString.strip())
    firstLine = string.atoi(fromLine)
    lastLine = string.atoi(toLine)

    physicalFile = self.makePhysicalFile(virtualFile, clientName, quotedGrep,
                                         fileArgs)
    if not physicalFile:
      raise ar_exception.ARException((
        ar_exception.LOADPARAMS, M.ERR_INVALIDFILESPECIFICATION))

    # Clamp the window: an inverted range selects no lines at all.
    lineCount = max(lastLine - firstLine + 1, 0)
    lastLine = firstLine + lineCount - 1

    # head keeps everything up to the last line, tail trims it down to the
    # requested window and grep filters that window.
    output = []
    shellCmd = "head -n %s %s | tail -n %s | grep -i -F -- %s" % (
        lastLine, physicalFile, lineCount, commands.mkarg(quotedGrep))
    status = E.execute([E.getCrtHostName()], shellCmd, output, false)
    if status != E.ERR_OK:
      return "0"

    matched = output[0]
    return "%s\n%s" % (len(matched), matched)
Exemplo n.º 3
0
def SyncOpLogs(all_machines, log_dir):
  """Rsync the AdminRunner.OPERATOR.* logs from the master to the other nodes.

  Only does work when find_master reports exactly one master and that
  master is the current host.  Each transfer is guarded by the lock file
  <log_dir>/syncops_lock; if the lock cannot be acquired the function
  returns immediately, skipping the remaining machines.

  Args:
    all_machines: list of host names in the installation.
    log_dir: directory holding the operator logs (used locally and as the
      destination path on every other machine).
  """
  masters = find_master.FindMaster(2100, all_machines)
  this_host = E.getCrtHostName()

  # We have to run this only on the master.
  if len(masters) != 1 or masters[0] != this_host:
    return

  log_glob = '%s/AdminRunner.OPERATOR.*' % (log_dir)
  lock_path = '%s/syncops_lock' % log_dir

  for peer in all_machines:
    if peer == this_host:
      continue

    target = '%s:/%s' % (peer, log_dir)
    logging.info('Collecting operator logs from %s into %s' % (
      log_glob, target))

    rsync_cmd = ('rsync --timeout=20 --size-only -vau '
                 ' -e ssh %s %s/' % (log_glob, target))

    lock = E.acquire_lock(lock_path, 1, breakLockAfterGracePeriod = 0)
    if lock is None:
      logging.info('Cannot grab the lock. Return!')
      return

    # Release and remove the lock whatever happens to the rsync.
    try:
      (status, output) = liblog.DoCommand(rsync_cmd)
      if status != 0:
        logging.error('Failed to collect logs from %s: %s' % (
          peer, output))
    finally:
      lock.close()
      os.unlink(lock_path)
def SyncLogsWithCanonical(ver, canonical):
  """Replace the GFS master logs on the other master nodes with the canonical's.

  On every GFS master node except the canonical one, a single shell command
  deletes old '<log_dir>.backup*' copies, moves the current log directory
  to a timestamped backup, recreates it owned by nobody:nobody and rsyncs
  the log directory over ssh from the canonical node.

  Arguments:
    ver: version string, e.g. '4.6.5'
    canonical: host name of the canonical node, e.g. 'ent1'; it must be in
      the list returned by core_utils.GFSMasterNodes(), because
      list.remove() raises ValueError otherwise.
  """
  target_nodes, _ = core_utils.GFSMasterNodes()
  target_nodes.remove(canonical)

  master_dir = '/export/hda3/%s/data/gfs_master' % ver
  logs = '%s/ent.gfsmaster' % master_dir
  substitutions = dict(
      gfs_master_dir=master_dir,
      log_dir=logs,
      backup_log_dir='%s.backup.%d' % (logs, int(time.time())),
      canonical=canonical)

  sync_cmd = (
      'rm -rf %(log_dir)s.backup*; '
      'mv %(log_dir)s %(backup_log_dir)s; '
      'mkdir -p %(log_dir)s; chown nobody:nobody %(log_dir)s; '
      'rsync -c -e ssh -aHpogt %(canonical)s:%(log_dir)s %(gfs_master_dir)s'
      % substitutions)

  output = []
  E.execute(target_nodes, sync_cmd, output, 1200, 1, 0,
            '/export/hda3/%s' % ver)
Exemplo n.º 5
0
def SetInitState(cfg, state):
  """Sets system's initialization state.

  For oneway (core_utils.GetNodes() reports a single node) the state is
  stored in the C.ENT_SYSTEM_INIT_STATE global parameter.  For clusters it
  is written to a temporary file which is then copied with the lockserver
  client to the chubby file /ls/<cell name>/ENT_SYSTEM_INIT_STATE.

  If the temporary file cannot be written the failure is logged with
  logging.fatal() and the state is not stored.

  @param cfg - of type configurator.
  @param state - string
  """
  # oneway?
  if 1 == len(core_utils.GetNodes()):
    cfg.setGlobalParam(C.ENT_SYSTEM_INIT_STATE, state)
    return

  tmpfile = E.mktemp('/export/hda3/tmp')
  try:
    f = open(tmpfile, 'w')
    # Close the handle even when the write fails so it is never leaked.
    try:
      f.write(state)
    finally:
      f.close()
  except IOError:
    logging.fatal('Cannot write to temp file %s' % tmpfile)
    return

  version = cfg.getGlobalParam('VERSION')
  lockserv_cmd_prefix = core_utils.GetLSClientCmd(version, is_test(version))
  chubby_root_dir = '/ls/%s' % core_utils.GetCellName(version)
  write_cmd =  '%s cp %s %s/%s' % (lockserv_cmd_prefix,
      tmpfile, chubby_root_dir, 'ENT_SYSTEM_INIT_STATE')
  logging.info('setting system init state to: %s', state)
  E.exe_or_fail(write_cmd)
  # The temp file is only a staging area for the lockserver copy.
  E.exe('rm -rf %s' % tmpfile)
Exemplo n.º 6
0
def kill_service(services, machines):
  """Kill every process belonging to the given services on the given machines.

  For each known service the kill command first sends a plain kill to the
  pids it finds, sleeps 3 seconds and then sends kill -9 to whatever is
  still found.  The original notes state that E.execute() sends the
  commands concurrently when there is more than one node.  Unknown service
  names are logged as errors and skipped.

  Args:
    services: list of services to kill. 'adminconsole' and 'adminrunner' are
              currently supported.
    machines: list of hostnames
  """
  # Service name -> shell snippet that prints the pids to kill: the loop
  # script itself plus whatever python_kill reports for the service's port.
  pid_finders = {
      'adminconsole': ("ps -e -o pid,args --width 100 | "
                       "grep loop_AdminConsole.py | grep -v grep | "
                       "awk '{print $1}' ; "
                       "%s" % python_kill.GetServicesListeningOn(['8000'])),
      'adminrunner': ("ps -e -o pid,args --width 100 | "
                      "grep loop_AdminRunner.py | grep -v grep | "
                      "awk '{print $1}' ; "
                      "%s" % python_kill.GetServicesListeningOn(['2100'])),
  }

  for name in services:
    if name not in pid_finders:
      logging.error('kill_service: Unrecognised service "%s"' % name)
      continue

    find_pids = pid_finders[name]
    logging.info('kill_service: Killing service "%s" on %d nodes...' %
                 (name, len(machines)))
    # The trailing "true" keeps the overall command from failing.
    kill_cmd = ('sh -c "(kill `%s`; sleep 3; kill -9 `%s`; true)" '
                '> /dev/null 2>&1' %
                (find_pids, find_pids))
    E.execute(machines, kill_cmd, [], alarm=1, verbose=0)
    logging.info('kill_service: Done killing service "%s"' % name)
Exemplo n.º 7
0
def CollectLogs(all_machines, gws_log_dir, log_collect_dir):
  """Gather the partnerlog.* files of every machine under log_collect_dir.

  Runs only on a oneway (single machine) or on the node that find_master
  reports as the one and only master.  The work is serialized through the
  lock file <log_collect_dir>/lock, which is released and removed at the
  end.

  For the current host, <log_collect_dir>/<machine> becomes a symlink to
  gws_log_dir; for every other machine the logs are rsynced over ssh into
  <log_collect_dir>/<machine>/.

  Args:
    all_machines: list of host names in the installation.
    gws_log_dir: directory holding the partnerlog.* files on each machine.
    log_collect_dir: directory on this host where logs are collected.
  """
  masters = find_master.FindMaster(2100, all_machines)
  this_host = E.getCrtHostName()

  # We only run this on oneway or master node of cluster.
  if not (len(all_machines) == 1 or
          (len(masters) == 1 and masters[0] == this_host)):
    logging.info('Not a oneway or cluster master node. Return!')
    return

  lock_path = '%s/lock' % log_collect_dir
  # Per the original comment: waiting up to 5 minutes for the lock.
  lock = E.acquire_lock(lock_path, 30, breakLockAfterGracePeriod = 0)
  if lock is None:
    logging.info('Cannot grab the lock. Return!')
    return

  remote_glob = '%s/partnerlog.*' % gws_log_dir
  try:
    for host in all_machines:
      collect_dir = '%s/%s' % (log_collect_dir, host)

      if host != this_host:
        # Non-master nodes of a cluster: rsync their files to this node.
        logging.info('Collecting logs from %s:%s into %s' % (
          host, remote_glob, collect_dir))

        # make log directories if needed
        liblog.MakeDir(collect_dir)

        # All files of one remote machine are fetched in a single command.
        rsync_cmd = ('rsync --timeout=60 --size-only -vau '
                     ' -e ssh %s:%s %s/' % (host, remote_glob, collect_dir))
        (status, output) = liblog.DoCommand(rsync_cmd)
        if status != 0:
          logging.error('Failed to collect logs from %s: %s' % (
            host, output))
        continue

      # Oneway or master node: symlink to gws_log_dir instead of rsyncing.
      # Previous versions created a real directory here and rsynced into it
      # even on masters and one-ways, so such a leftover has to be removed
      # before the symlink can be made.
      if os.path.exists(collect_dir) and not os.path.islink(collect_dir):
        cleaned = (E.rm(masters, '%s/*' % collect_dir) and
                   E.rmdir(masters, collect_dir))
        if not cleaned:
          logging.error('Directory %s exists and cannot be cleaned.',
                        collect_dir)
          continue
        logging.info('Cleaned existing directory %s.', collect_dir)

      if E.ln(masters, gws_log_dir, collect_dir):
        logging.info('Symlink %s to directory %s:%s for logs' %
                     (collect_dir, host, gws_log_dir))
      else:
        logging.error('Cannot make a symlink from %s to %s' %
                      (collect_dir, gws_log_dir))
  finally:
    lock.close()
    os.unlink(lock_path)
Exemplo n.º 8
0
def main(unused_args):
  """Validate the command-line flags and build the AdminRunner server.

  Exits through sys.exit() with an explanatory message when enthome, the
  box keys directory, the license keys directory or the install state flag
  is invalid.  After constructing the server it checks that a valid
  configuration is present unless the install state is 'INSTALL'.

  Args:
    unused_args: ignored.
  """
  if not E.access([E.LOCALHOST], FLAGS.enthome, 'rd'):
    sys.exit("Invalid enthome %s" % FLAGS.enthome)

  if ( not FLAGS.box_keys_dir or
       not E.access([E.LOCALHOST], FLAGS.box_keys_dir, 'rwd') ):
    sys.exit("Invalid box_keys_dir %s" % FLAGS.box_keys_dir)
  if ( not FLAGS.license_keys_dir or
       not E.access([E.LOCALHOST], FLAGS.license_keys_dir, 'rwd') ):
    sys.exit("Invalid license_keys_dir %s" % FLAGS.license_keys_dir)

  if FLAGS.installstate not in ["INSTALL", "ACTIVE", "SERVE", "TEST"]:
    sys.exit("Invalid --installstate %s" % FLAGS.installstate)

  reset_index.SetResetStatusCacheTimeout(FLAGS.reset_status_cache_timeout)

  # NOTE: this local used to be called "as", which is a reserved keyword
  # since Python 2.6 and therefore a syntax error there.
  server = adminrunner_server.AdminRunnerServer(FLAGS.enthome,
                                                FLAGS.installstate,
                                                FLAGS.port,
                                                FLAGS.box_keys_dir,
                                                FLAGS.license_keys_dir)

  # make sure we have been given a config file
  if (not server.cfg.getGlobalParam(C.ENT_SYSTEM_HAS_VALID_CONFIG) and
      server.cfg.getInstallState() != 'INSTALL'):
    logging.fatal("adminrunner doesn't have a config file; you must "\
                  "install a conf rpm or run it with "\
                  "--installstate=INSTALL (for migration)")
    return # just in case fatal doesn't exit
Exemplo n.º 9
0
  def deleteCollection(self, collection):
    """Delete all reports and logs for a particular collection.

    Holding self.logreplock, every raw and summary report of the collection
    is processed: a report that is not COMPLETE has its job stopped, its
    data files are removed, the per-type report count is decreased and the
    report list file is deleted.  Finally the collection's report directory
    is removed with rmdir.  fileutil failures are logged, not raised.
    """
    self.logreplock.acquire()
    try:
      for kind in (liblog.RAW_REPORT, liblog.SUMMARY_REPORT):
        doomed = self.getLogReports(collection, kind)
        for rep in doomed:
          # A report still being (re)generated has a running job; stop it.
          if rep.completeState != COMPLETE:
            self.stopRunningJob(self.jobName(rep))

          # delete data files if any.
          (html_file, valid_file) = liblog.get_report_filenames(
              self.entConfig, kind, rep.reportName, collection)
          self.RemoveReportFiles(html_file, valid_file)

        removed = len(doomed)
        self.reportCount[kind] -= removed
        logging.info('Delete total %d reports of type %s for collection %s.' % (
          removed, kind, collection))

        list_path = liblog.get_report_list_filename(self.entConfig,
                                                    kind, collection)
        (failed, _) = E.run_fileutil_command(self.entConfig,
                                             'rm -f %s' % list_path)
        if failed:
          logging.error('Cannot remove list file %s.' % list_path)

      # With both report types gone the collection directory is unused.
      collection_dir = liblog.get_report_collection_dir(self.entConfig,
                                                        collection)
      (failed, _) = E.run_fileutil_command(self.entConfig,
                                           'rmdir %s' % collection_dir)
      if failed:
        logging.error('Cannot delete unused directory %s' % \
                      collection_dir)
    finally:
      self.logreplock.release()
Exemplo n.º 10
0
    def setcert(self, certBody):
        """Save certBody as the staging certificate and verify it.

        Runs under self.updatelock.  The certificate body is written to the
        staging certificate file and then checked with the
        'verifystagingcert' command of the SSL wrapper; a certificate that
        fails verification is removed again.

        Args:
          certBody: contents of the certificate file.

        Returns:
          "0" on success, "1" on failure (as a string).
        """
        retval = 0
        self.updatelock.acquire()
        try:
            enthome = self.cfg.getGlobalParam("ENTERPRISE_HOME")
            staging_cert = ssl_cert.STAGINGCERT_FILENAME % enthome

            try:
                cert_file = open(staging_cert, "w")
                # Close explicitly so the data is on disk before the
                # external verify command reads the file.
                try:
                    cert_file.write(certBody)
                finally:
                    cert_file.close()
            except IOError:
                retval = 1
                logging.error("Couldn't save certificate to [%s]" % staging_cert)

            if retval == 0:
                verifycmd = "secure_script_wrapper -p2 %s verifystagingcert %s" % (
                    self.sslWrapperPath,
                    enthome,
                )
                outputList = []
                verifycode = E.execute(["localhost"], verifycmd, outputList, 60)

                if verifycode != 0:
                    retval = 1
                    # Do not leave an unverifiable certificate in staging.
                    E.rm(["localhost"], staging_cert)
                    logging.error(
                        "Couldn't verify certificate [%s]; error code: %d"
                        % (str(outputList), verifycode)
                    )

        finally:
            self.updatelock.release()

        return "%d" % retval
Exemplo n.º 11
0
def KillProcessIfNotMaster(config):
  """Kill the babysitter and disable SNMP, for use on a non-master node.

  Args:
    config: object providing ENTERPRISE_HOME, VERSION and
      GetConfigFileName().
  """
  legacy_util_dir = ("%s/local/google3/enterprise/legacy/util" %
                     config.ENTERPRISE_HOME)

  # kill babysitter
  E.killBabysitter(legacy_util_dir, config.GetConfigFileName(),
                   config.VERSION)

  # snmpd is stopped when the node is not a master
  disableSnmp()
Exemplo n.º 12
0
 def replicateConfig(self, machines):
   """Run replicate_config.py on each of the given machines.

   The command sources the enterprise bashrc, changes into the legacy
   scripts directory and invokes replicate_config.py with the current host
   name and the config file name.

   Returns:
     whatever E.execute() returns for the command.
   """
   bashrc = self.getGlobalParam("ENTERPRISE_BASHRC")
   ent_home = self.entHome
   source_host = E.getCrtHostName()
   config_file = self.globalParams.GetConfigFileName()

   replicate_cmd = (
       ". %s && cd %s/local/google3/enterprise/legacy/scripts && "
       "./replicate_config.py %s %s" % (
           bashrc, ent_home, source_host, config_file))
   return E.execute(machines, replicate_cmd, None, true)
Exemplo n.º 13
0
def _RunServeCmd(cfg, version, cmd, allnodes=0):
  """Run a serve_service.py command through the secure wrapper.

  Args:
    cfg: configurator providing getGlobalParam().
    version: version whose serve_service.py script is run.
    cmd: 'stop', 'start', 'activate', 'deactivate'
    allnodes: 1 to run the command on all nodes (C.MACHINES); otherwise it
      runs on the current host only.

  Returns:
    0 if E.execute() reports E.ERR_OK, 1 otherwise.
  """
  serve_service_cmd = (
      '/export/hda3/%s/local/google3/enterprise/legacy/scripts/'
      'serve_service.py %s %s' % (version,
                                  cfg.getGlobalParam('ENTERPRISE_HOME'),
                                  cmd))
  logging.info('Running: %s' % serve_service_cmd)

  if not allnodes:
    targets = [E.getCrtHostName()]
  else:
    targets = cfg.getGlobalParam(C.MACHINES)

  wrapped_cmd = SECURE_WRAPPER_COMMAND % (
      cfg.getGlobalParam('ENTERPRISE_HOME'),
      '-p2',
      serve_service_cmd)
  status = E.execute(targets, wrapped_cmd, None, 0)

  if status == E.ERR_OK:
    logging.info('%s: completed' % serve_service_cmd)
    return 0

  logging.error('%s: failed' % serve_service_cmd)
  return 1
Exemplo n.º 14
0
  def drain_urlmanagers(self):
    """Ask every urlmanager to dump its status table and check the dump.

    Per the original notes this has to be done before advancing the epoch
    and may be done multiple times.

    Returns:
      0 when every urlmanager was contacted and its table file could be
      listed; 1 as soon as one command or one listing fails.
    """
    params = self.cfg.globalParams
    hostports = params.GetServerHostPorts("urlmanager")
    shard_total = params.GetNumShards('urlmanager')
    epoch = self.cfg.getGlobalParam('RT_EPOCH')

    for (host, port) in hostports:
      # port_talker.py is run as a separate command instead of talking to
      # the port directly, because of the timeout (300 seconds here).
      dump_cmd = ". %s; cd %s/local/google3/enterprise/legacy/util && "\
                 "./port_talker.py %s %d 'd DumpingStatusTable' %d" % (
          self.cfg.getGlobalParam('ENTERPRISE_BASHRC'),
          self.cfg.entHome,
          host, port, 300)
      status = E.execute([E.getCrtHostName()], dump_cmd, None, 0)
      if status != E.ERR_OK:
        logging.error("Error draining urlmanagers [%s]" % status)
        return 1

      # The dump is only done once the per-shard table file shows up.
      shard = servertype.GetPortShard(port)
      table_file = "%surlmanager_out_table_%02d_of_%02d_epoch%010d" % (
        self.cfg.getGlobalParam('NAMESPACE_PREFIX'),
        shard, shard_total, epoch)
      status, _ = E.run_fileutil_command(params, "ls %s" % table_file)
      if status != E.ERR_OK:
        logging.error("The status table file [%s] is not there" % table_file)
        return 1

    return 0
Exemplo n.º 15
0
    def installkey(self):
        """Install the staging key as the currently installed private key.

        Runs under self.updatelock.

        Returns (as a string):
          "0" on success,
          "1" when the staging key is missing or empty (not an error),
          "2" when the private key is invalid,
          "3" when the private key could not be distributed or installed.
        """
        self.updatelock.acquire()
        try:
            enthome = self.cfg.getGlobalParam("ENTERPRISE_HOME")
            staging_key = ssl_cert.STAGINGKEY_FILENAME % enthome

            # An absent or empty staging key means there is nothing to install.
            if not os.path.exists(staging_key):
                return "1"
            if len(open(staging_key, "r").read()) == 0:
                return "1"

            # The staging key has to pass verification before it goes anywhere.
            verifycmd = "secure_script_wrapper -p2 %s verifystagingkey %s" % (
                self.sslWrapperPath,
                enthome,
            )
            verify_output = []
            verifycode = E.execute(["localhost"], verifycmd, verify_output, 60)
            if verifycode != 0:
                E.rm(["localhost"], staging_key)
                logging.error(
                    "Verify failed for key [%s]; error code: %d"
                    % (str(verify_output), verifycode)
                )
                return "2"

            # Push the verified staging key out to every machine ...
            retcode = E.distribute(self.cfg.getGlobalParam("MACHINES"), staging_key, 60)
            if retcode != 0:
                logging.error("Couldn't distribute private key, error %d" % retcode)
                return "3"

            # ... and have each machine copy it into place.
            installcmd = "secure_script_wrapper -p2 %s installkey %s" % (
                self.sslWrapperPath,
                enthome,
            )
            install_output = []
            retcode = E.execute(
                self.cfg.getGlobalParam("MACHINES"), installcmd, install_output, 60
            )
            if retcode != 0:
                logging.error("Couldn't install cert: %s" % str(install_output))
                return "3"

            self.writeAdminRunnerOpMsg(M.MSG_LOG_SSL_KEY_INSTALLED)
        finally:
            self.updatelock.release()

        return "0"
Exemplo n.º 16
0
 def activate(self):
   """Activate this service and make its periodic cron script executable.

   Registers <service_name>_<version> with chkconfig and sets mode 755 on
   the matching script under /etc/cron.<service_cron_time>/.
   """
   register_cmd = "/sbin/chkconfig --add %s_%s" % (
     self.service_name, self.version)
   E.exe_or_fail(register_cmd)

   chmod_cmd = "chmod 755 /etc/cron.%s/cron_%s_%s" % (
     self.service_cron_time, self.service_name, self.version)
   E.exe_or_fail(chmod_cmd)
Exemplo n.º 17
0
 def GetDirList(self):
   """Return the three directories that belong to this object's name.

   The list holds, in order, <GOOGLEDATA>/gws/clients/<name>,
   <GOOGLEDATA>/gws/p4clientinfo/<name> and <CONFIGDIR>/frontends/<name>,
   each assembled with E.joinpaths().
   """
   layout = (('GOOGLEDATA', 'gws/clients'),
             ('GOOGLEDATA', 'gws/p4clientinfo'),
             ('CONFIGDIR', 'frontends'))
   return [E.joinpaths([self.config.var(root_var), subdir, self.name])
           for (root_var, subdir) in layout]
Exemplo n.º 18
0
 def deactivate(self):
   """Deactivate this service and make its cron script non-executable.

   Unregisters <service_name>_<version> from chkconfig and sets mode 644 on
   the matching script under /etc/cron.<service_cron_time>/.
   """
   unregister_cmd = "/sbin/chkconfig --del %s_%s" % (
     self.service_name, self.version)
   E.exe_or_fail(unregister_cmd)

   chmod_cmd = "chmod 644 /etc/cron.%s/cron_%s_%s" % (
     self.service_cron_time, self.service_name, self.version)
   E.exe_or_fail(chmod_cmd)
Exemplo n.º 19
0
def StartAdminLoop(config, op='start'):
  """Start the AdminRunner loop and the webserver_config loop.

  Args:
    config: object providing VERSION, ENTERPRISE_HOME, ENTERPRISE_USER,
      ENTERPRISE_BASHRC, LOGDIR, MAIN_GOOGLE3_DIR and GetConfigFileName().
    op: 'start' launches the loops unconditionally; 'babysit' launches a
      loop only when os_utils.GetAttr() returns None for the pid recorded
      in that loop's pid file.  The webserver_config loop is never
      launched while the install state is "INSTALL".
  """

  def _LoopIsDown(loop_name):
    # A loop counts as down when GetAttr returns None for its recorded pid.
    pid = E.ReadPidFile(E.GetPidFileName(loop_name))
    return os_utils.GetAttr('pid', pid=pid, fallback_to_ps=0) is None

  install_state = install_utilities.install_state(config.VERSION)

  logging.info("Starting the AdminRunner")
  ar_args = [
    "--port=2100",
    "--enthome=%s" % config.ENTERPRISE_HOME,
    "--installstate=%s" % install_state,
    "--reset_status_cache_timeout=60",
  ]

  if op == 'start' or (op == 'babysit' and _LoopIsDown('loop_AdminRunner')):
    # Kill any AdminRunner process still around, then relaunch the loop in
    # the background with its output appended to the log directory.
    E.su_exe_or_fail(
      config.ENTERPRISE_USER,
      """ ps axwwwww | fgrep AdminRunner | fgrep -v fgrep | \
      colrm 7 | xargs kill -9 2> /dev/null; \
      . %(eb)s; \
      cd %(eh)s/local/google3/enterprise/legacy/scripts/ &&  \
      ENT_ID=%(v)s_crawl ./loop_AdminRunner.py \
      %(eh)s %(args)s >> \
      /%(ld)s/loop_AdminOut_`whoami` 2>&1 &""" % {
      'eh' : config.ENTERPRISE_HOME,
      'eb' : config.ENTERPRISE_BASHRC,
      'v' : config.VERSION,
      'ld' : config.LOGDIR,
      'args' : string.join(map(commands.mkarg, ar_args))
      })

  # The pid check is done before looking at the install state, as before.
  webserver_config_down = (op == 'babysit' and
                           _LoopIsDown('loop_webserver_config'))

  if (install_state != "INSTALL" and
      (op == 'start' or webserver_config_down)):
    logging.info("Starting webserver_config")
    # Same pattern as above: kill leftovers, then relaunch in the background.
    E.su_exe_or_fail(
      config.ENTERPRISE_USER,
      """ ps axwwwww | fgrep webserver_config.py | fgrep -v fgrep \
      | colrm 7 | xargs kill -9 2> /dev/null; \
      . %s; \
      cd %s/enterprise/legacy/scripts/ && \
      ENT_ID=%s_crawl ./loop_webserver_config.py %s \
      >> /%s/loop_WebserverConfig_`whoami` 2>&1 &""" % (
      config.ENTERPRISE_BASHRC,
      config.MAIN_GOOGLE3_DIR, config.VERSION,
      config.GetConfigFileName(), config.LOGDIR))
Exemplo n.º 20
0
  def remove(self, machine):
    """Remove a machine from the configuration.

    Steps, in order: stop the core services on the machine, halt it, drop
    it from the SERVERS and MACHINES parameters, register it as a dead node
    with the lockserver, delete its GFS chunkserver, forget its data disks,
    redo the machine allocation (which also saves the config file), restart
    the babysitter and the crawl processes, and send a notification mail
    unless one was already sent.

    Returns:
      1 if the machine is unknown, is the current host, or a configuration
      update fails; otherwise the value returned by self.halt(machine).
    """
    if machine not in self.cfg.getGlobalParam('MACHINES'):
      logging.error("%s doesn't exist" % machine)
      return 1

    version = self.cfg.getGlobalParam('VERSION')
    ent_home = self.cfg.getGlobalParam('ENTERPRISE_HOME')
    is_test_version = install_utilities.is_test(version)

    # if possible stop the core services, ignore return code
    # NOTE(review): this runs before the "remove self" check below, so the
    # core services are stopped even when the request is then refused.
    # Confirm whether that ordering is intended.
    install_utilities.stop_core(version, ent_home, [machine])

    if machine == E.getCrtHostName():
      logging.error("Cannot remove self")
      return 1

    # Halt the machine if APC is used.
    halt_status = self.halt(machine)

    params = self.cfg.globalParams
    params.ReplaceVarInParam("SERVERS", None, machine)
    params.ReplaceVarInParam("MACHINES", None, machine)
    dead_node_failed = core_utils.AddDeadNode(version, is_test_version,
                                              machine)
    # remove the chunkserver running on the node
    gfs_utils.DeleteGFSChunkservers(version, is_test_version, [machine])
    if dead_node_failed:
      # we ignore this error for now
      logging.error('Cannot add dead node to the lockserver.')

    # now we need to remove the data disks that were on this machine
    disks = params.var_copy('DATACHUNKDISKS')
    if disks.has_key(machine):
      del disks[machine]
      if not self.cfg.setGlobalParam('DATACHUNKDISKS', disks):
        return 1

    # This also saves the config file
    if not self.cfg.DoMachineAllocation():
      return 1

    # The running babysitter is out of sync with the new configuration,
    # so it has to be restarted.
    serve_service_cmd = (". %s && "
        "cd %s/local/google3/enterprise/legacy/scripts && "
        "./serve_service.py %s" % (
          self.cfg.getGlobalParam('ENTERPRISE_BASHRC'),
          self.cfg.getGlobalParam('ENTERPRISE_HOME'),
          self.cfg.getGlobalParam('ENTERPRISE_HOME')))
    E.exe("%s %s" % (serve_service_cmd, "babysit"))

    self.restart_crawl_processes(serve_service_cmd)

    removal_msg = M.MSG_MACHINEREMOVED % machine
    if not mail_already_sent(removal_msg):
      SendMail.send(self.cfg, None, false,
                    M.MSG_MACHINEREMOVED % machine, "", true)

    return halt_status
Exemplo n.º 21
0
def HaltMachines(enthome, machines):
  """Stops and powers down machines.

  Issues a background '/sbin/shutdown -h now' on all machines, waits 60
  seconds and then sends the APC_OFF command for each machine.

  Args:
    enthome: passed through to SendAPCCommand().
    machines: list of host names to halt.

  Returns:
    0, always.
  """
  logging.info("Halting machines: %s" % ",".join(machines))
  E.execute(machines, '/sbin/shutdown -h now &', None, 1)

  # Wait a minute after the shutdown command before cutting the power.
  time.sleep(60)
  for host in machines:
    SendAPCCommand(enthome, host, APC_OFF)
  return 0
Exemplo n.º 22
0
 def TouchFile(global_params, filename):
   """Create filename through fileutil if it does not exist yet.

   Existence is probed with the fileutil 'ls' command; when that fails the
   file is created with 'truncate <filename> 0'.  A failed creation is
   reported with logging.fatal().

   Args:
     global_params: parameters object handed to E.run_fileutil_command().
     filename: name of the file to check and, if needed, create.
   """
   # The body used to reference self.globalParams, but this function has no
   # "self"; the parameters object is the global_params argument.
   err, _ = E.run_fileutil_command(global_params, "ls %s" % filename)
   if err == E.ERR_OK:
     # The file is already there; nothing to do.
     return

   # create if not exists
   err, _ = E.run_fileutil_command(global_params,
                                   "truncate %s 0" % filename)
   if err != E.ERR_OK:
     logging.fatal("Could not create file: %s" % filename)
Exemplo n.º 23
0
  def deactivate(self):
    """Deactivate the service and drop this install's cron jobs.

    After the base-class deactivation, the crontab of self.ent_user (if it
    has one) is rewritten without the lines that contain self.entid_tag,
    so only cron jobs of this install version are removed.

    Returns:
      1, always.
    """
    ent_service.ent_service.deactivate(self)

    # The filtered crontab is staged in a temp file and then installed.
    filtered_crontab = "%s/tmp/nobody_cron_crawl_" % self.ent_home
    user = self.ent_user
    cron_cmd = """if $(/usr/bin/crontab -lu %s &>/dev/null);
    then crontab -lu %s | grep -v %s > %s; crontab -u %s %s; fi""" % (
        user, user, self.entid_tag, filtered_crontab,
        user, filtered_crontab)
    E.exe_or_fail(cron_cmd)

    return 1
Exemplo n.º 24
0
def ResyncWithMaster(google_config_file,
                     enterprise_home,
                     enterprise_user,
                     master):
  """Replicate the configuration from the master onto this machine.

  Runs replicate_config.py as enterprise_user under a 600 second alarm,
  then calls SetNonMasterNTP(master).

  Args:
    google_config_file: config file name passed to replicate_config.py.
    enterprise_home: installation root; the bashrc and the scripts
      directory are derived from it.
    enterprise_user: user the replication command is run as.
    master: host name of the master.
  """
  bashrc = "%s/local/conf/ent_bashrc" % enterprise_home
  scripts = "%s/local/google3/enterprise/legacy/scripts" % enterprise_home

  replicate_cmd = (
      ". %s && cd %s && alarm 600 ./replicate_config.py %s %s" %
      (bashrc, scripts, master, google_config_file))
  E.su_exe_or_fail(enterprise_user, replicate_cmd)

  SetNonMasterNTP(master)
Exemplo n.º 25
0
 def send_cmd(self, server_name, cmd):
   """Send cmd with port_talker.py to every server of the given set.

   Failures are logged per server; they stop neither the loop nor change
   the result.

   Args:
     server_name: name of the server set to look up in the server manager.
     cmd: text handed to port_talker.py as the command to send.

   Returns:
     true, always.
   """
   server_set = self.cfg.globalParams.GetServerManager().Set(server_name)
   for server in server_set.Servers():
     # The last argument is the timeout: 60, i.e. one minute.
     talk_cmd = ". %s; cd %s/local/google3/enterprise/legacy/util && "\
                "./port_talker.py %s %d '%s' %d" % (
                self.cfg.getGlobalParam('ENTERPRISE_BASHRC'),
                self.cfg.entHome,
                server.host(), server.port(), cmd, 60)
     status = E.execute([E.getCrtHostName()], talk_cmd, None, 0)
     if status != E.ERR_OK:
       logging.error("Error talking to server at %s:%d" % (server.host(),
                                                           server.port()))
   return true
Exemplo n.º 26
0
 def send_urlscheduler_command(self, command):
   """Send 'GET /run?Flags=<command>' to every urlscheduler.

   Stops at the first scheduler that cannot be reached.

   Args:
     command: text placed after 'Flags=' in the request.

   Returns:
     0 if every scheduler was contacted, 1 on the first failure.
   """
   timeout = 60 # 1 minute is good enough
   for (host, port) in self.cfg.globalParams.GetServerHostPorts("urlscheduler"):
     request_cmd = ". %s; cd %s/local/google3/enterprise/legacy/util && "\
           "./port_talker.py %s %d 'GET /run?Flags=%s\r\n\r\n' %d" % (
       self.cfg.getGlobalParam('ENTERPRISE_BASHRC'),
       self.cfg.entHome,
       host, port, command, timeout)
     status = E.execute([E.getCrtHostName()], request_cmd, None, 0)
     if status != E.ERR_OK:
       logging.error("Error talking to urlscheduler %s:%d" % (host, port))
       return 1
   return 0
Exemplo n.º 27
0
def remove_unused_from(dirname, fileutil, grace_seconds):
  '''
  Delete the files in the given directory that are neither open nor
  recently accessed.

  dirname - directory to clean up; files starting with "." are not
            considered.  An empty value is logged as an error and ignored.
  fileutil - full path of fileutil
  grace_seconds - Even if a file isn't currently opened we consider it being
                  in-use if it has been accessed recently (less this many
                  seconds ago)

  Each file is removed with "<fileutil> rm -f"; if it still exists
  afterwards it is removed directly together with its
  ".attr.plain.<name>" attribute file.
  '''
  if not dirname:
    logging.error("Not given a directory to cleanup")
    return

  # lsof doesn't return 0 even on success, so its exit status is ignored.
  (_, lsof_output) = E.getstatusoutput("lsof +D %s -Fn" % dirname)

  # With -Fn every open file is reported on a line of its own, prefixed with
  # 'n'.  lsof returns several lines for each file because multiple threads
  # in a process could have it open, so collect the unique names.  Split on
  # line boundaries only: splitting on arbitrary whitespace would chop up
  # names containing spaces and make such open files look unused.
  open_files = {}
  for line in lsof_output.splitlines():
    if line.startswith('n'):
      open_files[line[1:]] = 1

  # Get a list of all files in the directory - not starting with .
  candidates = glob.glob("%s/*" % dirname)

  # Delete all unused files.
  for path in candidates:
    if path in open_files:
      continue
    try:
      age = int(time.time()) - os.stat(path)[stat.ST_ATIME]
      if age <= grace_seconds:
        logging.info('Ignoring unused file %s of age %s seconds' % (path, age))
        continue

      logging.info('Removing unused file %s' % path)
      E.getstatusoutput("%s rm -f %s" % (fileutil, path))
      # If fileutil can't delete it for any reason, nuke it directly
      # And its attribute file.
      if os.path.exists(path):
        os.remove(path)
        os.remove('%s/.attr.plain.%s' %
            (os.path.dirname(path), os.path.basename(path)))
    except OSError:
      # File got deleted since we ran glob?  Ignore away.
      continue
Exemplo n.º 28
0
    def installcert(self):
        """Install the staging certificate as the currently installed one.

        Runs under self.updatelock.

        Returns (as a string):
          "0" on success,
          "1" when the staging certificate fails verification or the
              install command fails.
        """
        self.updatelock.acquire()
        try:
            enthome = self.cfg.getGlobalParam("ENTERPRISE_HOME")
            staging_cert = ssl_cert.STAGINGCERT_FILENAME % enthome

            # The staging certificate has to pass verification first.
            verifycmd = "secure_script_wrapper -p2 %s verifystagingcert %s" % (
                self.sslWrapperPath,
                enthome,
            )
            verify_output = []
            verifycode = E.execute(["localhost"], verifycmd, verify_output, 60)
            if verifycode != 0:
                E.rm(["localhost"], staging_cert)
                logging.error(
                    "Verify failed for certificate [%s]; error code: %d"
                    % (str(verify_output), verifycode)
                )
                return "1"

            # Push the staging certificate out to every machine.  A failure
            # here is only logged; the install step below is still attempted.
            retcode = E.distribute(self.cfg.getGlobalParam("MACHINES"), staging_cert, 60)
            if retcode != 0:
                logging.error("Couldn't distribute apache cert, error %d" % retcode)

            # next, generate the certificate on all machines
            installcmd = "secure_script_wrapper -p2 %s installcert %s" % (
                self.sslWrapperPath,
                enthome,
            )
            install_output = []
            retcode = E.execute(
                self.cfg.getGlobalParam("MACHINES"), installcmd, install_output, 60
            )
            if retcode != 0:
                logging.error("Couldn't install cert: %s" % str(install_output))
                return "1"

            self.writeAdminRunnerOpMsg(M.MSG_LOG_SSL_CERT_INSTALLED)
        finally:
            self.updatelock.release()

        return "0"
Exemplo n.º 29
0
def RebootMachine(enthome, machine):
  """Reboots a machine.

  The current host is asked to reboot itself with 'shutdown -r'; any other
  machine is first shut down cleanly with a background 'shutdown -h'.
  In both cases the APC reboot command is sent after 60 seconds.

  Args:
    enthome: passed through to SendAPCCommand().
    machine: host name of the machine to reboot.

  Returns:
    the result of SendAPCCommand(enthome, machine, APC_REBOOT).
  """
  if machine != E.getCrtHostName():
    # Try shutting down cleanly first
    logging.info('Shutting down %s' % machine)
    E.execute([machine], '/sbin/shutdown -h now &', None, 1)
  else:
    # Rebooting ourself; if we're still alive after a minute, the APC
    # will kick in.
    logging.info('Rebooting %s' % machine)
    E.execute([E.LOCALHOST], '/sbin/shutdown -r now', None, 1)

  time.sleep(60)
  logging.info('Rebooting %s via APC' % machine)
  return SendAPCCommand(enthome, machine, APC_REBOOT)
Exemplo n.º 30
0
def check_klogd_syslogd_conf(machines, enthome, unittestdir=None):
  """Babysit the klogd.conf and syslogd.conf files.

  For each of the two files a small shell snippet is run that writes the
  default contents and sets mode 644 only if the file does not exist yet;
  existing files are left alone.

  Args:
    machines: hosts to run the check on, e.g.
      ['ent1', 'ent2', 'ent3', 'ent4', 'ent5']
    enthome: e.g. '/export/hda3/4.6.0.G.27/'; passed to E.execute().
    unittestdir: e.g. '/tmp/etc/localbabysitter.d/' -- used for unittest
      only.  When given, the files are created in that directory on the
      local machine with os.system() instead of in
      /etc/localbabysitter.d/ on the given machines.
  """
  KLOGD_CONF_DATA = (
    "klogd = {\n"
    "  restart_command : '/sbin/service syslog restart',\n"
    "  timeout : 30,\n"
    "  interval : 30,\n"
    "  use_service_wrapper : 0,\n"
    "  pidfile : '/var/run/klogd.pid',\n"
    "}\n"
  )

  SYSLOGD_CONF_DATA = (
    "syslogd = {\n"
    "  restart_command : '/sbin/service syslog restart',\n"
    "  timeout : 30,\n"
    "  interval : 30,\n"
    "  use_service_wrapper : 0,\n"
    "  pidfile : '/var/run/syslogd.pid',\n"
    "}\n"
  )

  # Shell snippet: create the file with its default contents if it is absent.
  CHECK_CREATE_CMD = (
    'if [ ! -e "%(file)s" ]\n'
    'then\n'
    '  echo "%(data)s" > %(file)s\n'
    '  chmod 644 %(file)s\n'
    'fi\n'
  )

  in_unittest = unittestdir is not None
  if in_unittest:
    conf_dir = unittestdir
  else:
    conf_dir = '/etc/localbabysitter.d/'

  defaults = {'klogd.conf':KLOGD_CONF_DATA, 'syslogd.conf':SYSLOGD_CONF_DATA }
  for conf_name, conf_data in defaults.items():
    conf_path = os.path.join(conf_dir, conf_name)
    create_cmd = CHECK_CREATE_CMD % {'data': conf_data, 'file': conf_path}
    if in_unittest:
      os.system(create_cmd)
    else:
      E.execute(machines, create_cmd, [], 0, 0, 0, enthome)