예제 #1
0
def main(argv):
  """Collect GWS logs onto this config replica and redistribute OP logs.

  Args:
    argv: argv[0] is the path of the enterprise config file.

  Exits with status 0 (nothing to do) when the install is not
  ACTIVE/SERVE, when this is a sitesearch install, or when the current
  machine is not a config replica; exits with __doc__ when the config
  cannot be loaded.
  """
  pywrapfile.File.Init()
  config = entconfig.EntConfig(argv[0])
  if not config.Load():
    sys.exit(__doc__)

  # Collect logs only if the install is active or serving.
  state = install_utilities.install_state(config.var('VERSION'))
  if state not in ['ACTIVE', 'SERVE']:
    sys.exit(0)

  # No collection for sitesearches.
  if config.var('SITESEARCH_INTERFACE'):
    sys.exit(0)

  # Only config replicas collect the logs.
  replicas = config.var('CONFIG_REPLICAS')
  crt_machine = E.getCrtHostName()
  if crt_machine not in replicas:
    logging.error('Not a replica')
    sys.exit(0)

  gws_log_dir = liblog.get_gws_log_dir(config)
  collect_dir = liblog.get_collect_dir(config)
  partition_dir = liblog.get_partition_dir(config)
  apache_dir = liblog.get_apache_dir(config)
  click_dir = liblog.get_click_dir(config)
  directory_map_file = liblog.get_directory_map_file(config)

  # Create the work directories up front in case this cron job starts
  # before adminrunner has made them.
  liblog.MakeDir(collect_dir)
  liblog.MakeDir(partition_dir)
  liblog.MakeDir(apache_dir)
  liblog.MakeDir(click_dir)

  # Collect logs from all machines.
  all_machines = config.var('MACHINES')
  CollectLogs(all_machines, gws_log_dir, collect_dir)

  # Partition gwslogs by collections and convert to apache logs.
  preprocess_logs.PartitionLogs(config)

  # Sanitize the collection directory map; persist only if it changed.
  coll_directory_map = liblog.CollectionDirectoryMap(directory_map_file)
  if coll_directory_map.sanitizeMap(partition_dir, apache_dir, click_dir):
    coll_directory_map.saveToDisk()

  # Send the OP logs to all machines.
  SyncOpLogs(all_machines, config.var('LOGDIR'))

  logging.info('Done')
예제 #2
0
  def __init__(self, cfg):
    """Set up report bookkeeping and ensure all log directories exist."""
    self.cfg = cfg                     # configurator object
    self.entConfig = cfg.globalParams
    # Lock serializing updates to the report lists.
    self.logreplock = threading.RLock()
    self.logdir = self.cfg.getGlobalParam('LOGDIR')

    # Make sure every log directory this object depends on is present.
    for get_dir in (liblog.get_click_dir, liblog.get_collect_dir,
                    liblog.get_apache_dir, liblog.get_partition_dir):
      liblog.MakeDir(get_dir(self.entConfig))
    liblog.MakeGoogleDir(self.entConfig, liblog.get_report_dir(self.entConfig))

    dir_map_path = liblog.get_directory_map_file(self.entConfig)
    if not os.path.exists(dir_map_path):
      # Touch the file so later readers never hit a missing-file error.
      open(dir_map_path, 'w').close()

    self.reportCount = {liblog.RAW_REPORT: 0,
                        liblog.SUMMARY_REPORT: 0}
    self.sanitizeReportList(liblog.RAW_REPORT)
    self.sanitizeReportList(liblog.SUMMARY_REPORT)

    self.joblock = threading.Lock()
    self.runningJobs = {}
예제 #3
0
    def babysit(self):
        """Periodic (cron-driven) cleanup of log, tmp, and core-dump files.

        Walks a fixed list of directories and applies per-pattern
        retention limits (pattern, max file count, max age in minutes)
        via self.control_files; also trims bigfile lock remnants,
        oversized log files, and leftover vmanager packages.
        """

        log_max_size = LOG_MAX_SIZE

        # delete all converter tmpfiles older than 15 minutes since
        # we timeout a conversion after a few minutes anyways.
        # hardcoding the directory like we have done for data and log
        # long term we should consider running logcontrol_service with
        # additional parameters
        self.control_files([(".*", 0, 15)], "/mnt/rtcache/converter-tmp")
        self.control_files([(".*", 0, 15)], "%s/converter-tmp" % self.tmpdir)

        # Control files in TMPDIR, tmp and TMPDIR/oldlogs
        self.control_files(FILES_IN_ROOTTMPDIR, "/tmp")
        self.control_files(FILES_IN_TMPDIR, self.tmpdir)
        self.control_files(FILES_IN_TMPDIR, "%s/logs/" % self.datadir)
        self.control_files(FILES_IN_TMPDIR, "%s/logs/" % self.ent_home)
        self.control_files(GWS_LOGS_IN_TMPDIR, self.tmpdir)
        self.control_files([(".*", 0, 15)], "%s/oldlogs/" % self.tmpdir)

        # Control the core dump files which can fill up disks and can be dangerous
        self.control_files(CORE_DUMP_FILES, "%s/" % self.datadir)
        self.control_files(CORE_DUMP_FILES, "%s/logs/" % self.datadir)
        self.control_files(CORE_DUMP_FILES, "%s/" % self.tmpdir)
        self.control_files(CORE_DUMP_FILES, "%s/logs/" % self.ent_home)
        self.control_files(CORE_DUMP_FILES, "%s/" % self.logdir)
        self.control_files(CORE_DUMP_FILES, "/tmp/")

        # Control also AdminRunner/ gems # /babysitter/configmgr files
        # NOTE(review): patterns ending in "_*" match zero-or-more
        # underscores if these are regexes (probably "_.*" was meant);
        # confirm against control_files' matching semantics.
        self.control_files(
            [
                ("adminrunner\\.py\\..*", 30, 15),
                ("log_collector_main_", 5, 15),
                ("log_analyzer_alerter_", 5, 15),
                ("babysitter_out_*", 48, 15),  # 12 hrs worth (4/hr)
                ("configmgr_out_*", 20, 15),
                ("fixer_out_*", 20, 15),
                ("py.migration_bot", 20, 15),
                ("batch_crawler_", 5, 15),
                ("ar_profile_*", 100, 60),  # 100 files for 1 hour
                ("snmphelper_", 14, 15),  # 14 files, creates 1/day
            ],
            self.logdir)

        # There are one of these files per version - we don't want anything more
        # than current version and previous version (serve and test)
        self.control_files([
            ("periodic_scriptOut", 2, 60),
            ("cronLogCollectOut", 2, 60),
            ("cronSyslogLogOut", 2, 60),
        ], self.logdir)
        # Keep operator logs for 1 year (assuming 1 per day)
        self.control_files([
            ("AdminRunner\\.OPERATOR\\..*", 365, 15),
            ("AdminServer\\.ERROR\\..*", 5, 15),
            ("AdminServer\\.INFO\\..*", 5, 15),
            ("AdminServer\\.FATAL\\..*", 5, 15),
        ], self.logdir)

        # Keep at most 50 successful config manager requests for 1 hour
        self.control_files([("CONFIG_MANAGER_REQUEST_*", 50, 60)],
                           "%s/local/conf/cmr_working/success/" %
                           self.ent_home)
        # Keep at most 50 successful config manager statusz for 1 hour
        self.control_files([("CONFIG_MANAGER_REQUEST_*", 50, 60)],
                           "%s/local/conf/cmr_working/statusz/" %
                           self.ent_home)
        # Keep at most 50 failed config manager requests for 8 hours
        self.control_files([("CONFIG_MANAGER_REQUEST_*", 50, 60 * 8)],
                           "%s/local/conf/cmr_working/failed/" % self.ent_home)
        # Keep at most 50 .CONF config manager requests for 8 hours
        self.control_files([("\.CONFIG_MANAGER_REQUEST_*", 50, 60 * 8)],
                           "%s/local/conf/cmr/" % self.ent_home)
        # Keep at most 50 .CONF config manager requests for 8 hours
        self.control_files([("\.CONFIG_MANAGER_REQUEST_*", 50, 60 * 8)],
                           "%s/local/conf/fixer_cmr/" % self.ent_home)

        # Keep a maximum of 5 enterprise onebox log files created.
        # Delete them if they are older than 16 days.
        self.control_files(
            [("enterprise_onebox_log\\.from_.*", 5, 60 * 12 * 16)],
            self.tmpdir)

        # Do not allow more than a maximum of 5 web_log.*.browse file (that is
        # created when users browse our logs. Delete them if they are older than 1
        # hour
        self.control_files([("web_log\\..*\\.browse", 5, 60)], self.logdir)

        # Do not allow more than a maximum of 5 weblog dump files created when
        # users browse our logs. Delete them if they are older than 1 day
        self.control_files([("weblog_dump_.*", 5, 1440)], self.logdir)

        # Do not allow more than a maximum of 5 feedlog browse files created when
        # users browse our logs. Delete them if they are older than 1 hour
        self.control_files([("feed_log\\..*\\.browse", 5, 60)], self.logdir)

        # Do not allow more than a maximum of 5 tomcat log files for the connectormgr
        # connector manager. Delete them if they are older than 5 days.
        self.control_files(
            FILES_FOR_CONNECTORMGR,
            "%s/local/google/bin/connectormgr-prod/logs" % self.ent_home)

        self.control_files(
            FILES_FOR_CONNECTORMGR,
            "%s/local/google/bin/connectormgr-test/logs" % self.ent_home)

        # Control some files of unknown origin that end up in /var/tmp
        self.control_files(VAR_TMP_FILES, "/var/tmp/")

        # control the collected, partitioned gws logs, and apache logs
        collect_dir = liblog.get_collect_dir(self.cp)
        partition_dir = liblog.get_partition_dir(self.cp)
        apache_dir = liblog.get_apache_dir(self.cp)
        click_dir = liblog.get_click_dir(self.cp)

        # Delete all bigfile remnants for bigfiles with locks older than one week.
        # (These result from interrupted attempts to delete bigfiles.)
        bigfile_data_dirs = ("/export/hda3/%s/data/enterprise" % self.version,
                             "/export/hdb3/%s/data/enterprise" % self.version)
        self.control_bigfile_locks("%s/data/enterprise-data" % self.ent_home,
                                   bigfile_data_dirs, 60 * 24 * 7)

        machines = self.cp.var("MACHINES")
        for machine in machines:
            self.control_files(GWS_LOGS_IN_SAVEDDIR,
                               "%s/%s" % (collect_dir, machine))
            dirs = glob.glob("%s/*/%s" % (partition_dir, machine))
            for dir in dirs:
                self.control_files(GWS_LOGS_IN_SAVEDDIR, dir)

            dirs = glob.glob("%s/*/%s" % (apache_dir, machine))
            for dir in dirs:
                self.control_files(GWS_LOGS_IN_SAVEDDIR, dir)

            dirs = glob.glob("%s/*/%s" % (click_dir, machine))
            for dir in dirs:
                self.control_files(GWS_LOGS_IN_SAVEDDIR, dir)

        # control the size of apache logs etc
        self.control_log_files(LOGFILES_CONTROL)

        # remove distribution packages left behind by vmanager
        self.control_dirs(VMANAGER_DIRS)

        # control all log files over MAX_ALLOWED_LOG_SIZE
        # we first change the limit if disk space is less
        # Check if the disk is getting full, decrease the limit for file size
        try:
            df_status, df_output = commands.getstatusoutput('df /export/hda3/'
                                                            '| tail -1 ')
            # NOTE(review): df column 5 is Use%, so this is really the
            # disk-USED percentage; the comparison below relies on that.
            disk_free_percent = int(df_output.split()[4][:-1])
            if disk_free_percent > CRITICAL_DISK_FULL_PERCENT:
                # Disk nearly full: halve every per-file size limit.
                # (Equivalent to the old map(lambda (x, y): ...) but
                # also valid Python 3 syntax.)
                log_max_size = [(name, size / 2)
                                for (name, size) in log_max_size]
        except (ValueError, TypeError):
            # BUG FIX: "except ValueError, TypeError:" caught ONLY
            # ValueError and rebound it to the name TypeError; the
            # parenthesized tuple catches both as intended.
            pass  # we failed to get disk availability
  def babysit(self):
    """Periodic (cron-driven) cleanup of log, tmp, and core-dump files.

    Walks a fixed list of directories and applies per-pattern retention
    limits (pattern, max file count, max age in minutes) via
    self.control_files; also trims bigfile lock remnants, oversized log
    files, and leftover vmanager packages.
    """

    log_max_size = LOG_MAX_SIZE

    # delete all converter tmpfiles older than 15 minutes since
    # we timeout a conversion after a few minutes anyways.
    # hardcoding the directory like we have done for data and log
    # long term we should consider running logcontrol_service with
    # additional parameters
    self.control_files([(".*", 0, 15)], "/mnt/rtcache/converter-tmp")
    self.control_files([(".*", 0, 15)], "%s/converter-tmp" % self.tmpdir)

    # Control files in TMPDIR, tmp and TMPDIR/oldlogs
    self.control_files(FILES_IN_ROOTTMPDIR, "/tmp")
    self.control_files(FILES_IN_TMPDIR, self.tmpdir)
    self.control_files(FILES_IN_TMPDIR, "%s/logs/" % self.datadir)
    self.control_files(FILES_IN_TMPDIR, "%s/logs/" % self.ent_home)
    self.control_files(GWS_LOGS_IN_TMPDIR, self.tmpdir)
    self.control_files([(".*", 0, 15)], "%s/oldlogs/" % self.tmpdir)

    # Control the core dump files which can fill up disks and can be dangerous
    self.control_files(CORE_DUMP_FILES, "%s/" % self.datadir)
    self.control_files(CORE_DUMP_FILES, "%s/logs/" % self.datadir)
    self.control_files(CORE_DUMP_FILES, "%s/" % self.tmpdir)
    self.control_files(CORE_DUMP_FILES, "%s/logs/" % self.ent_home)
    self.control_files(CORE_DUMP_FILES, "%s/" % self.logdir)
    self.control_files(CORE_DUMP_FILES, "/tmp/")

    # Control also AdminRunner/ gems # /babysitter/configmgr files
    # NOTE(review): patterns ending in "_*" match zero-or-more
    # underscores if these are regexes (probably "_.*" was meant);
    # confirm against control_files' matching semantics.
    self.control_files([("adminrunner\\.py\\..*", 30, 15),
                        ("log_collector_main_", 5, 15),
                        ("log_analyzer_alerter_", 5, 15),
                        ("babysitter_out_*", 48, 15), # 12 hrs worth (4/hr)
                        ("configmgr_out_*", 20, 15),
                        ("fixer_out_*", 20, 15),
                        ("py.migration_bot", 20, 15),
                        ("batch_crawler_", 5, 15),
                        ("ar_profile_*", 100, 60), # 100 files for 1 hour
                        ("snmphelper_", 14, 15),   # 14 files, creates 1/day
                        ],
                       self.logdir)

    # There are one of these files per version - we don't want anything more
    # than current version and previous version (serve and test)
    self.control_files([("periodic_scriptOut", 2, 60),
                        ("cronLogCollectOut", 2, 60),
                        ("cronSyslogLogOut", 2, 60),
                        ],
                       self.logdir)
    # Keep operator logs for 1 year (assuming 1 per day)
    self.control_files([("AdminRunner\\.OPERATOR\\..*", 365, 15),
                        ("AdminServer\\.ERROR\\..*", 5, 15),
                        ("AdminServer\\.INFO\\..*", 5, 15),
                        ("AdminServer\\.FATAL\\..*", 5, 15),
                       ], self.logdir)

    # Keep at most 50 successful config manager requests for 1 hour
    self.control_files([("CONFIG_MANAGER_REQUEST_*", 50, 60)],
                       "%s/local/conf/cmr_working/success/" % self.ent_home)
    # Keep at most 50 successful config manager statusz for 1 hour
    self.control_files([("CONFIG_MANAGER_REQUEST_*", 50, 60)],
                       "%s/local/conf/cmr_working/statusz/" % self.ent_home)
    # Keep at most 50 failed config manager requests for 8 hours
    self.control_files([("CONFIG_MANAGER_REQUEST_*", 50, 60 * 8)],
                       "%s/local/conf/cmr_working/failed/" % self.ent_home)
    # Keep at most 50 .CONF config manager requests for 8 hours
    self.control_files([("\.CONFIG_MANAGER_REQUEST_*", 50, 60 * 8)],
                       "%s/local/conf/cmr/" % self.ent_home)
    # Keep at most 50 .CONF config manager requests for 8 hours
    self.control_files([("\.CONFIG_MANAGER_REQUEST_*", 50, 60 * 8)],
                       "%s/local/conf/fixer_cmr/" % self.ent_home)

    # Keep a maximum of 5 enterprise onebox log files created.
    # Delete them if they are older than 16 days.
    self.control_files([("enterprise_onebox_log\\.from_.*", 5, 60 * 12 * 16)],
        self.tmpdir)

    # Do not allow more than a maximum of 5 web_log.*.browse file (that is
    # created when users browse our logs. Delete them if they are older than 1
    # hour
    self.control_files([("web_log\\..*\\.browse", 5, 60)], self.logdir)

    # Do not allow more than a maximum of 5 weblog dump files created when
    # users browse our logs. Delete them if they are older than 1 day
    self.control_files([("weblog_dump_.*", 5, 1440)], self.logdir)

    # Do not allow more than a maximum of 5 feedlog browse files created when
    # users browse our logs. Delete them if they are older than 1 hour
    self.control_files([("feed_log\\..*\\.browse", 5, 60)], self.logdir)

    # Do not allow more than a maximum of 5 tomcat log files for the connectormgr
    # connector manager. Delete them if they are older than 5 days.
    self.control_files(FILES_FOR_CONNECTORMGR,
                       "%s/local/google/bin/connectormgr-prod/logs"
                       % self.ent_home)

    self.control_files(FILES_FOR_CONNECTORMGR,
                       "%s/local/google/bin/connectormgr-test/logs"
                       % self.ent_home)

    # Control some files of unknown origin that end up in /var/tmp
    self.control_files(VAR_TMP_FILES, "/var/tmp/")

    # control the collected, partitioned gws logs, and apache logs
    collect_dir = liblog.get_collect_dir(self.cp)
    partition_dir = liblog.get_partition_dir(self.cp)
    apache_dir = liblog.get_apache_dir(self.cp)
    click_dir = liblog.get_click_dir(self.cp)

    # Delete all bigfile remnants for bigfiles with locks older than one week.
    # (These result from interrupted attempts to delete bigfiles.)
    bigfile_data_dirs = ("/export/hda3/%s/data/enterprise" % self.version,
                         "/export/hdb3/%s/data/enterprise" % self.version)
    self.control_bigfile_locks("%s/data/enterprise-data" % self.ent_home,
                               bigfile_data_dirs,
                               60 * 24 * 7)

    machines = self.cp.var("MACHINES")
    for machine in machines:
      self.control_files(GWS_LOGS_IN_SAVEDDIR,
                         "%s/%s" % (collect_dir, machine))
      dirs = glob.glob("%s/*/%s" % (partition_dir, machine))
      for dir in dirs:
        self.control_files(GWS_LOGS_IN_SAVEDDIR, dir)

      dirs = glob.glob("%s/*/%s" % (apache_dir, machine))
      for dir in dirs:
        self.control_files(GWS_LOGS_IN_SAVEDDIR, dir)

      dirs = glob.glob("%s/*/%s" % (click_dir, machine))
      for dir in dirs:
        self.control_files(GWS_LOGS_IN_SAVEDDIR, dir)

    # control the size of apache logs etc
    self.control_log_files(LOGFILES_CONTROL)

    # remove distribution packages left behind by vmanager
    self.control_dirs(VMANAGER_DIRS)

    # control all log files over MAX_ALLOWED_LOG_SIZE
    # we first change the limit if disk space is less
    # Check if the disk is getting full, decrease the limit for file size
    try:
      df_status, df_output = commands.getstatusoutput('df /export/hda3/'
                                                      '| tail -1 ')
      # NOTE(review): df column 5 is Use%, so this is really the
      # disk-USED percentage; the comparison below relies on that.
      disk_free_percent = int(df_output.split()[4][:-1])
      if disk_free_percent > CRITICAL_DISK_FULL_PERCENT:
        # Disk nearly full: halve every per-file size limit.
        # (Equivalent to the old map(lambda (x, y): ...) but also
        # valid Python 3 syntax.)
        log_max_size = [(name, size / 2) for (name, size) in log_max_size]
    except (ValueError, TypeError):
      # BUG FIX: "except ValueError, TypeError:" caught ONLY ValueError
      # and rebound it to the name TypeError; the parenthesized tuple
      # catches both as intended.
      pass # we failed to get disk availability
예제 #5
0
def main(argv):
  """Build (or re-validate) an apache/click log dump report for a client.

  Args:
    argv: [0] config file path, [1] client name, [2] date spec of the
          form "tag_<date parts>", [3] html report file, [4] current
          valid file, [5] new valid file.

  Exits with liblog.STILL_VALID when the existing report is up to date,
  liblog.SUCCESS / liblog.FAILURE after regeneration, or __doc__ on bad
  arguments.
  """
  argc = len(argv)

  if argc < 6:
    sys.exit(__doc__)

  config = entconfig.EntConfig(argv[0])
  if not config.Load():
    sys.exit(__doc__)

  pywrapbase.InitGoogleScript('', ['foo',
          '--gfs_aliases=%s' % config.var("GFS_ALIASES"),
          '--bnsresolver_use_svelte=false',
          '--logtostderr'], 0)
  gfile.Init()

  client = argv[1]
  date_arg = argv[2]
  html_file = argv[3]
  valid_file = argv[4]
  new_valid_file = argv[5]

  # Extract tag and date_range from command line args.
  # (str.split replaces the deprecated string.split module function.)
  date_fields = date_arg.split('_')
  date_range = liblog.ParseDateRange(date_fields[0], date_fields[1:])

  if not date_range:
    sys.exit(__doc__)

  first_date, last_date, printable_date, file_date = date_range

  if last_date.as_int() < first_date.as_int():
    sys.exit(__doc__)

  gws_log_dir = liblog.get_gws_log_dir(config)
  click_dir = liblog.get_click_dir(config)
  collect_dir = liblog.get_collect_dir(config)
  apache_dir = liblog.get_apache_dir(config)
  directory_map_file = liblog.get_directory_map_file(config)

  # we need to collect logs first from all gws nodes and preprocess
  # logs first to make sure logs are up to date.
  all_machines = config.var('MACHINES')
  collect_logs.CollectLogs(all_machines, gws_log_dir, collect_dir)
  preprocess_logs.PartitionLogs(config)

  # make a vector of Log objects for all apache_logs and click_logs matching
  # the given date range and client.
  apache_logs = liblog.FindClientLogFiles(apache_dir, directory_map_file,
                                          client, first_date, last_date)
  click_logs = liblog.FindClientLogFiles(click_dir, directory_map_file,
                                          client, first_date, last_date)

  # If we have valid file and report file, we check to see if the data in
  # apache_dir has been changed and if the report is still valid.
  if (gfile.Exists(html_file) and gfile.Exists(valid_file) and
      liblog.checkValid(html_file, valid_file, apache_logs)):
    logging.info('%s still valid.' % html_file)
    sys.exit(liblog.STILL_VALID)

  # if there is no valid report, we create a new one
  DumpApacheAndClickLogs(apache_logs, click_logs)
  if not liblog.makeValid(new_valid_file, apache_logs):
    logging.error('Error validating %s' % html_file)
    sys.exit(liblog.FAILURE)

  logging.info('done apache_log, new_valid_file: %s' % new_valid_file)
  sys.exit(liblog.SUCCESS)
예제 #6
0
def main(argv):
    """Build (or re-validate) an apache/click log dump report for a client.

    Args:
        argv: [0] config file path, [1] client name, [2] date spec of
              the form "tag_<date parts>", [3] html report file,
              [4] current valid file, [5] new valid file.

    Exits with liblog.STILL_VALID when the existing report is up to
    date, liblog.SUCCESS / liblog.FAILURE after regeneration, or
    __doc__ on bad arguments.
    """
    argc = len(argv)

    if argc < 6:
        sys.exit(__doc__)

    config = entconfig.EntConfig(argv[0])
    if not config.Load():
        sys.exit(__doc__)

    pywrapbase.InitGoogleScript('', [
        'foo',
        '--gfs_aliases=%s' % config.var("GFS_ALIASES"),
        '--bnsresolver_use_svelte=false', '--logtostderr'
    ], 0)
    gfile.Init()

    client = argv[1]
    date_arg = argv[2]
    html_file = argv[3]
    valid_file = argv[4]
    new_valid_file = argv[5]

    # Extract tag and date_range from command line args.
    # (str.split replaces the deprecated string.split module function.)
    date_fields = date_arg.split('_')
    date_range = liblog.ParseDateRange(date_fields[0], date_fields[1:])

    if not date_range:
        sys.exit(__doc__)

    first_date, last_date, printable_date, file_date = date_range

    if last_date.as_int() < first_date.as_int():
        sys.exit(__doc__)

    gws_log_dir = liblog.get_gws_log_dir(config)
    click_dir = liblog.get_click_dir(config)
    collect_dir = liblog.get_collect_dir(config)
    apache_dir = liblog.get_apache_dir(config)
    directory_map_file = liblog.get_directory_map_file(config)

    # we need to collect logs first from all gws nodes and preprocess
    # logs first to make sure logs are up to date.
    all_machines = config.var('MACHINES')
    collect_logs.CollectLogs(all_machines, gws_log_dir, collect_dir)
    preprocess_logs.PartitionLogs(config)

    # make a vector of Log objects for all apache_logs and click_logs matching
    # the given date range and client.
    apache_logs = liblog.FindClientLogFiles(apache_dir, directory_map_file,
                                            client, first_date, last_date)
    click_logs = liblog.FindClientLogFiles(click_dir, directory_map_file,
                                           client, first_date, last_date)

    # If we have valid file and report file, we check to see if the data in
    # apache_dir has been changed and if the report is still valid.
    if (gfile.Exists(html_file) and gfile.Exists(valid_file)
            and liblog.checkValid(html_file, valid_file, apache_logs)):
        logging.info('%s still valid.' % html_file)
        sys.exit(liblog.STILL_VALID)

    # if there is no valid report, we create a new one
    DumpApacheAndClickLogs(apache_logs, click_logs)
    if not liblog.makeValid(new_valid_file, apache_logs):
        logging.error('Error validating %s' % html_file)
        sys.exit(liblog.FAILURE)

    logging.info('done apache_log, new_valid_file: %s' % new_valid_file)
    sys.exit(liblog.SUCCESS)