def SyncOpLogs(all_machines, log_dir):
  """Rsync the AdminRunner.OPERATOR.* logs from this host to all others.

  Nothing is done unless find_master.FindMaster() reports exactly one
  master and that master is the current host.

  For every other machine the lock file <log_dir>/syncops_lock is
  acquired, rsync is run, and the lock is closed and its file removed.
  If the lock cannot be acquired the function returns at once, skipping
  the remaining machines.  A failing rsync is logged and the loop goes on.

  Args:
    all_machines: the machines to sync to (the current host is skipped).
    log_dir: directory holding the operator logs; also used as the
      destination path on each remote machine.
  """
  masters = find_master.FindMaster(2100, all_machines)
  this_host = E.getCrtHostName()

  # We have to run this only on master.
  if len(masters) != 1 or masters[0] != this_host:
    return

  # Neither of these depends on the target machine.
  source_glob = '%s/AdminRunner.OPERATOR.*' % (log_dir)
  lock_path = '%s/syncops_lock' % log_dir

  for peer in all_machines:
    if peer == this_host:
      continue

    target = '%s:/%s' % (peer, log_dir)
    logging.info('Collecting operator logs from %s into %s' % (
        source_glob, target))
    command = ('rsync --timeout=20 --size-only -vau '
               ' -e ssh %s %s/' % (source_glob, target))

    # The lock is taken (and released) once per machine.
    lock = E.acquire_lock(lock_path, 1, breakLockAfterGracePeriod=0)
    if lock is None:
      logging.info('Cannot grab the lock. Return!')
      return

    try:
      status, output = liblog.DoCommand(command)
      if status != 0:
        logging.error('Failed to collect logs from %s: %s' % (
            peer, output))
    finally:
      lock.close()
      os.unlink(lock_path)
def CollectLogs(all_machines, gws_log_dir, log_collect_dir):
  """Gather the partnerlog.* files of every machine under log_collect_dir.

  Runs only when all_machines has a single entry (oneway) or when
  find_master.FindMaster() reports the current host as the only master;
  otherwise it logs a message and returns.

  While holding the lock file <log_collect_dir>/lock, each machine gets a
  <log_collect_dir>/<machine> entry:
    - for the current host, a symlink to gws_log_dir (a pre-existing real
      directory at that path is removed first);
    - for any other machine, a directory filled by rsync-ing
      <gws_log_dir>/partnerlog.* from that machine.
  Failures are logged and the loop moves on to the next machine.  The
  lock is closed and its file removed when the loop ends, even on error.

  Args:
    all_machines: the machines to collect from.
    gws_log_dir: directory that holds the partnerlog.* files.
    log_collect_dir: directory that receives one entry per machine.
  """
  # We only run this on oneway or master node of cluster.
  masters = find_master.FindMaster(2100, all_machines)
  crt_machine = E.getCrtHostName()
  if not (len(all_machines) == 1 or
          (len(masters) == 1 and masters[0] == crt_machine)):
    logging.info('Not a oneway or cluster master node. Return!')
    return

  lockfile = '%s/lock' % log_collect_dir
  # Original note: waiting up to 5 minutes for the lock.
  lock = E.acquire_lock(lockfile, 30, breakLockAfterGracePeriod=0)
  if lock is None:
    logging.info('Cannot grab the lock. Return!')
    return

  # The remote pattern is the same for every machine.
  src_pattern = '%s/partnerlog.*' % gws_log_dir

  try:
    for machine in all_machines:
      dest_dir = '%s/%s' % (log_collect_dir, machine)

      if machine != crt_machine:
        # For non-master nodes on cluster, we need to rsync those files
        # to the master node.
        logging.info('Collecting logs from %s:%s into %s' % (
            machine, src_pattern, dest_dir))

        # make log directories if needed
        liblog.MakeDir(dest_dir)

        # rsync all files from one remote machine in one command.
        rsync_cmd = ('rsync --timeout=60 --size-only -vau '
                     ' -e ssh %s:%s %s/' % (machine, src_pattern, dest_dir))
        status, output = liblog.DoCommand(rsync_cmd)
        if status != 0:
          logging.error('Failed to collect logs from %s: %s' % (
              machine, output))
        continue

      # Oneway or master node: symlink to gws_log_dir instead of copying.
      #
      # To stay backward compatible, an existing real directory is removed
      # first: previous versions created a directory and rsynced files
      # into it even on the master node and on one-ways.
      if os.path.exists(dest_dir) and not os.path.islink(dest_dir):
        cleaned = (E.rm(masters, '%s/*' % dest_dir) and
                   E.rmdir(masters, dest_dir))
        if not cleaned:
          logging.error('Directory %s exists and cannot be cleaned.',
                        dest_dir)
          continue
        logging.info('Cleaned existing directory %s.', dest_dir)

      if E.ln(masters, gws_log_dir, dest_dir):
        logging.info('Symlink %s to directory %s:%s for logs' %
                     (dest_dir, machine, gws_log_dir))
      else:
        logging.error('Cannot make a symlink from %s to %s' %
                      (dest_dir, gws_log_dir))
  finally:
    lock.close()
    os.unlink(lockfile)
def main(argv):
  """Run SyslogLogs over the apache logs of every collection.

  Args:
    argv: must contain exactly one element, which is handed to
      entconfig.EntConfig.

  Exits through sys.exit():
    - with the module docstring when argv has the wrong length or the
      configuration fails to load;
    - with 0 when the install state is neither ACTIVE nor SERVE, or after
      every collection has been processed;
    - with 'Updating failed' as soon as SyslogLogs() returns a false value.
  Returns without exiting when the syslog lock cannot be acquired.

  The lock file <LOGDIR>/syslog_lock is held while collections are
  processed and is always closed and removed afterwards; the syslog
  client is likewise always closed, including on failure.
  """
  if len(argv) != 1:
    sys.exit(__doc__)

  config = entconfig.EntConfig(argv[0])
  if not config.Load():
    sys.exit(__doc__)

  # Collect syslogs only if active or serve
  state = install_utilities.install_state(config.var('VERSION'))
  if state not in ('ACTIVE', 'SERVE'):
    sys.exit(0)

  # Collect syslogs only from master node.
  # NOTE(review): nothing here stops execution after logging.fatal(); this
  # relies on the `logging` module in use terminating the process on
  # fatal, which the standard library's logging does not do -- confirm.
  if not isMaster(config):
    logging.fatal('Not a oneway or cluster master node. Return!')

  pywrapbase.InitGoogleScript('', ['foo',
                                   '--gfs_aliases=%s' %
                                   config.var("GFS_ALIASES"),
                                   '--bnsresolver_use_svelte=false',
                                   '--logtostderr'], 0)
  gfile.Init()

  # Only the first two values of the parsed range are needed here.
  first_date, last_date, _printable_date, _file_date = \
      liblog.ParseDateRange('all', [])

  apache_main_dir = liblog.get_apache_dir(config)
  checkpoint_dir = liblog.get_syslog_checkpoint_dir(config)
  liblog.MakeGoogleDir(config, checkpoint_dir)

  # NOTE(review): same caveat as above about logging.fatal().
  if (config.var('SYSLOG_SERVER') is None or
      config.var('ENT_SYSLOG_GWS_FAC') is None):
    logging.fatal('SYSLOG logging is disabled')

  lockfile = '%s/syslog_lock' % config.var('LOGDIR')
  lock = E.acquire_lock(lockfile, 1, breakLockAfterGracePeriod=0)
  if not lock:
    return

  try:
    logger = syslog_client.SyslogClient(config.var('SYSLOG_SERVER'),
                                        config.var('EXTERNAL_WEB_IP'))
    # Close the client on every path, not only after a fully successful
    # run (sys.exit() below raises SystemExit through this block).
    try:
      logging.info("syslog-server = %s" % config.var('SYSLOG_SERVER'))
      for collection in os.listdir(apache_main_dir):
        apache_dir = '%s/%s' % (apache_main_dir, collection)
        checkpoint_file = "%s/%s" % (checkpoint_dir, collection)
        apache_logs = liblog.FindLogFiles(apache_dir, first_date, last_date)
        logging.info('syslog handles collection %s' % collection)
        if not SyslogLogs(apache_logs, apache_dir, checkpoint_file, logger,
                          config):
          sys.exit('Updating failed')
    finally:
      logger.close()
  finally:
    lock.close()
    os.unlink(lockfile)

  sys.exit(0)
def main(argv):
  """Run SyslogLogs over the apache logs of every collection.

  Args:
    argv: must contain exactly one element, which is handed to
      entconfig.EntConfig.

  Exits through sys.exit():
    - with the module docstring when argv has the wrong length or the
      configuration fails to load;
    - with 0 when the install state is neither ACTIVE nor SERVE, or after
      every collection has been processed;
    - with 'Updating failed' as soon as SyslogLogs() returns a false value.
  Returns without exiting when the syslog lock cannot be acquired.

  The lock file <LOGDIR>/syslog_lock is held while collections are
  processed and is always closed and removed afterwards; the syslog
  client is likewise always closed, including on failure.
  """
  if len(argv) != 1:
    sys.exit(__doc__)

  config = entconfig.EntConfig(argv[0])
  if not config.Load():
    sys.exit(__doc__)

  # Collect syslogs only if active or serve
  state = install_utilities.install_state(config.var('VERSION'))
  if state not in ('ACTIVE', 'SERVE'):
    sys.exit(0)

  # Collect syslogs only from master node.
  # NOTE(review): nothing here stops execution after logging.fatal(); this
  # relies on the `logging` module in use terminating the process on
  # fatal, which the standard library's logging does not do -- confirm.
  if not isMaster(config):
    logging.fatal('Not a oneway or cluster master node. Return!')

  pywrapbase.InitGoogleScript('', ['foo',
                                   '--gfs_aliases=%s' %
                                   config.var("GFS_ALIASES"),
                                   '--bnsresolver_use_svelte=false',
                                   '--logtostderr'], 0)
  gfile.Init()

  # Only the first two values of the parsed range are needed here.
  first_date, last_date, _printable_date, _file_date = \
      liblog.ParseDateRange('all', [])

  apache_main_dir = liblog.get_apache_dir(config)
  checkpoint_dir = liblog.get_syslog_checkpoint_dir(config)
  liblog.MakeGoogleDir(config, checkpoint_dir)

  # NOTE(review): same caveat as above about logging.fatal().
  if (config.var('SYSLOG_SERVER') is None or
      config.var('ENT_SYSLOG_GWS_FAC') is None):
    logging.fatal('SYSLOG logging is disabled')

  lockfile = '%s/syslog_lock' % config.var('LOGDIR')
  lock = E.acquire_lock(lockfile, 1, breakLockAfterGracePeriod=0)
  if not lock:
    return

  try:
    logger = syslog_client.SyslogClient(config.var('SYSLOG_SERVER'),
                                        config.var('EXTERNAL_WEB_IP'))
    # Close the client on every path, not only after a fully successful
    # run (sys.exit() below raises SystemExit through this block).
    try:
      logging.info("syslog-server = %s" % config.var('SYSLOG_SERVER'))
      for collection in os.listdir(apache_main_dir):
        apache_dir = '%s/%s' % (apache_main_dir, collection)
        checkpoint_file = "%s/%s" % (checkpoint_dir, collection)
        apache_logs = liblog.FindLogFiles(apache_dir, first_date, last_date)
        logging.info('syslog handles collection %s' % collection)
        if not SyslogLogs(apache_logs, apache_dir, checkpoint_file, logger,
                          config):
          sys.exit('Updating failed')
    finally:
      logger.close()
  finally:
    lock.close()
    os.unlink(lockfile)

  sys.exit(0)
def saveToDisk(self):
  """Save the collection map back to disk, if it has changed.

  Does nothing when self.hasChanged() is false.  Otherwise, while holding
  the lock file <self.filename>.lock, rewrites self.filename with one
  line per collection in sorted order, formatted as
  "<collection> <dir> <dir> ...", and then clears self.changed.  The lock
  is closed and its file removed afterwards, even on error.
  """
  if not self.hasChanged():
    return

  lockfile = '%s.lock' % self.filename
  # NOTE(review): `true` / `false` are not Python builtins; presumably
  # they are module-level aliases defined elsewhere in this file -- confirm.
  # NOTE(review): the result of acquire_lock is not checked; if it can be
  # None here, the finally clause below raises AttributeError -- confirm.
  lock = E.acquire_lock(lockfile, 5, breakLockAfterGracePeriod=true)
  try:
    colls = list(self.collection_dir_map.keys())
    colls.sort()
    # Format everything before opening the file, so that a bad entry
    # cannot leave a truncated or half-written file behind.
    lines = ['%s %s\n' % (collection,
                          " ".join(self.collection_dir_map[collection]))
             for collection in colls]

    out = open(self.filename, 'w')
    try:
      out.writelines(lines)
    finally:
      # Always close (and flush) before the lock is released.
      out.close()

    # reset the flag to be safe. Normally, this is called at
    # the end of a script execution.
    self.changed = false
  finally:
    lock.close()
    os.unlink(lockfile)
def saveToDisk(self):
  """Save the collection map back to disk, if it has changed.

  Does nothing when self.hasChanged() is false.  Otherwise, while holding
  the lock file <self.filename>.lock, rewrites self.filename with one
  line per collection in sorted order, formatted as
  "<collection> <dir> <dir> ...", and then clears self.changed.  The lock
  is closed and its file removed afterwards, even on error.
  """
  if not self.hasChanged():
    return

  lockfile = '%s.lock' % self.filename
  # NOTE(review): `true` / `false` are not Python builtins; presumably
  # they are module-level aliases defined elsewhere in this file -- confirm.
  # NOTE(review): the result of acquire_lock is not checked; if it can be
  # None here, the finally clause below raises AttributeError -- confirm.
  lock = E.acquire_lock(lockfile, 5, breakLockAfterGracePeriod=true)
  try:
    colls = list(self.collection_dir_map.keys())
    colls.sort()
    # Format everything before opening the file, so that a bad entry
    # cannot leave a truncated or half-written file behind.
    lines = ['%s %s\n' % (collection,
                          " ".join(self.collection_dir_map[collection]))
             for collection in colls]

    out = open(self.filename, 'w')
    try:
      out.writelines(lines)
    finally:
      # Always close (and flush) before the lock is released.
      out.close()

    # reset the flag to be safe. Normally, this is called at
    # the end of a script execution.
    self.changed = false
  finally:
    lock.close()
    os.unlink(lockfile)
def prereq_check(self, send_email, collections):
  """
  This checks the prerequisites for all collections, updates the epochs to
  serve from and (optionally) sends a mail.

  Args:
    send_email: integer (or its decimal string form); a non-zero value
      enables the failure and "not serving current epoch" emails.
    collections: comma-separated collection names, or None / empty to use
      ent_collection.ListCollections().

  Returns:
    {} when there are no collections; otherwise the errors mapping
    gathered from the Runner workers (looked up per collection below).

  Side effects: writes a "<TESTWORDS>_" copy of each collection's
  testwords file, updates LAST_PREREQUISITES_* / EPOCHS_SERVING values,
  sorts the ENTERPRISE_EPOCHS list in place, and saves the parameters.
  """
  if collections is not None:
    collections = collections.strip()
  if not collections:
    collections = ent_collection.ListCollections(self.cfg.globalParams)
  else:
    collections = [
        ent_collection.EntCollection(name.strip(), self.cfg.globalParams)
        for name in collections.split(",")]

  # No collections -- exit quickly
  if not collections:
    return {}

  send_email = int(send_email)
  epochs = self.cfg.getGlobalParam('ENTERPRISE_EPOCHS')
  gwssers = self.cfg.globalParams.GetServerHostPorts("web")
  jobs = []
  for c in collections:
    collection = ent_collection.EntCollection(c, self.cfg.globalParams)
    # Write the testwords in a copy file.  Both handles are closed
    # explicitly so the copy is complete on disk before any worker runs,
    # and the copy is not created/truncated if the source is unreadable.
    filename = collection.get_var('TESTWORDS')
    filename_copy = "%s_" % filename
    src = open(filename, "r")
    try:
      testwords = src.read()
    finally:
      src.close()
    dst = open(filename_copy, "w")
    try:
      dst.write(testwords)
    finally:
      dst.close()
    num = collection.get_var("TESTWORDS_IN_FIRST")
    jobs.append((self.cfg, gwssers, c, filename_copy, epochs, num))

  # Lock a file so we test once at a time
  # NOTE(review): the result of acquire_lock is not checked; if it can be
  # None here, the finally clause below raises AttributeError -- confirm.
  lock_file = "%s/prerequisites_lock" % self.cfg.getGlobalParam("TMPDIR")
  flock = E.acquire_lock(lock_file, 12)

  try:
    # Run the tests -- one per thread ...

    # see how many threads to spawn (jobs is non-empty at this point:
    # one job was appended per collection)
    num_threads = min(len(jobs), NUM_THREADS)

    # create the threads - workers
    threads = [Runner(n, jobs) for n in range(num_threads)]

    # start the threads
    for thread in threads[:-1]:
      thread.start()
    # I run the last one
    threads[-1].run()

    # wait to collect the errors at the end
    errors = threads[-1].errors
    max_epochs = threads[-1].max_epochs
    for thread in threads[:-1]:
      thread.join()
      for k, v in thread.max_epochs.items():
        max_epochs[k] = v
      for k, v in thread.errors.items():
        errors[k] = v

    # prepare and send a nice :) message
    if errors and send_email:
      last_msg_time = self.cfg.getGlobalParam('LAST_PREREQUISITES_EMAIL_TIME')
      email_interval = self.cfg.getGlobalParam('ENTERPRISE_INTER_EMAIL_TIME')
      now = int(time.time())
      if now - last_msg_time > email_interval:
        msg = [M.MSG_PREREQ_FAIL]
        msg.extend(
            ["Collection %s generated a wrong answer for %s" %
             (failed_coll, ",".join(failed_words))
             for failed_coll, failed_words in errors.items()])
        SendMail.send(self.cfg, None, 1, M.MSG_PREREQ_FAIL_SUBJECT,
                      "\n".join(msg), 1)
        self.cfg.globalParams.set_var('LAST_PREREQUISITES_EMAIL_TIME', now)

    self.cfg.globalParams.set_var('LAST_PREREQUISITES_CHECK',
                                  time.strftime("%Y/%m/%d %H:%M:%S"))

    # Note: this sorts the list returned by getGlobalParam in place.
    epochs.sort()
    cur_epoch = epochs[-1]
    for c in collections:
      collection = ent_collection.EntCollection(c, self.cfg.globalParams)
      collection.set_var('LAST_PREREQUISITES_ERRORS', errors.get(c, []))

      # EPOCH_SERVING has two values in the form of "es[0] es[1]"
      #   es[0]: the epoch the prereq_check ask us to serve, or
      #          -1 means no epoch answers OK,
      #          -2 means current index answers OK
      #   es[1]: the epoch the user set from UI, if -2 means use
      #          most recent valid epoch
      # the serving logic is as following:
      #   -- if user set a specific epoch (es[1] >= 0), serve es[1]
      #   -- if user set most recent valid epoch (es[1] == -2), then
      #      serve the current index if no/all epochs answers ok
      #      (es[0] == -1 or es[0] == -2)
      #      otherwise (es[0] >= 0) serve from the es[0]
      #
      serving_file = open(collection.get_var('EPOCHS_SERVING'), "r")
      try:
        es = serving_file.read().strip().split(" ")
      finally:
        serving_file.close()

      # The epoch prereq_check asks us to serve
      # this from -- -2 means current index is OK,
      # -1 means no epoch answers OK (is returned by the checker)
      epoch = max_epochs.get(c, -2)
      if c not in errors:
        epoch = -2

      # initialize EPOCHS_SERVING
      if not es or len(es) == 1:
        es = [epoch, -2]
      else:
        es = [int(field) for field in es]

      # if this change cause automatic rollback, which means
      #   - user choose the most recent valid epoch and
      #   - the new epoch differs from previous epoch and
      #   - the change is not from -1 -> -2 or -2 -> -1.
      # we log it in AdminRunner Operations log
      if es[1] == -2 and epoch != es[0] and (es[0] + epoch != -3):
        epochs_to_time = self.cfg.getGlobalParam('ENTERPRISE_EPOCHS_ENDTIME')
        epoch_time = epochs_to_time.get(epoch, M.MSG_EPOCH_CURRENT_TIME)
        self.writeAdminRunnerOpMsg(M.MSG_UI_LOG_INDEX_ROLLBACK % epoch_time)

      es[0] = epoch
      collection.set_file_var_content('EPOCHS_SERVING',
                                      " ".join(map(str, es)), 0)

      # also check if the current serving epoch for the collection
      # is the most recent one, if not, send a warning email
      if send_email and ((es[1] == -2 and es[0] >= 0) or
                         (es[1] >= 0 and es[1] != cur_epoch)):
        last_msg_time = self.cfg.getGlobalParam(
            'LAST_SERVING_EPOCH_WARNING_EMAIL_TIME')
        email_interval = self.cfg.getGlobalParam(
            'ENTERPRISE_INTER_EMAIL_TIME')
        now = int(time.time())
        if now - last_msg_time > email_interval:
          SendMail.send(self.cfg, None, 0,
                        M.MSG_SERVING_EPOCH_NOT_CURRENT % c, "", 0)
          self.cfg.globalParams.set_var(
              'LAST_SERVING_EPOCH_WARNING_EMAIL_TIME', now)

    self.cfg.saveParams()
  finally:
    flock.close()

  return errors