def refresh_single_instance(self, umcdef, GlobalContext):
    """Refresh the state kept for one umc instance from its process.

    While holding ``umcdef.lock`` the method:

    * polls the process and, when it is no longer running and has a return
      code, records the return code in ``umcdef.returncodes`` (newest first,
      trimmed to ``GlobalContext.params.retc_history`` entries); a non-zero
      code additionally bumps ``num_errors``, sets ``lasterror_time`` and
      postpones the next start by ``GlobalContext.params.run_after_failure``;
    * clears ``umcdef.proc`` when the process is not running or is a zombie.

    Any exception is logged as a warning and not propagated.

    :param umcdef: umc instance definition (provides ``lock``, ``proc``, ...)
    :param GlobalContext: global context providing ``params``
    """
    umcdef.lock.acquire()
    try:
        if umcdef.proc is None:
            return

        # Remember the pid now: by the time the error handler below runs,
        # umcdef.proc may already have been reset to None.
        pid = umcdef.proc.pid
        try:
            # update the process return code if any
            umcdef.proc.poll()

            if not umcdef.proc.is_running() and umcdef.proc.returncode is not None:
                rc = umcdef.proc.returncode
                if rc != 0:
                    Msg.warn_msg("umc instance %s failed/terminated with exit code %d. Will attempt to restart it after %d seconds."
                        % (umcdef.umc_instanceid, rc, GlobalContext.params.run_after_failure))
                    umcdef.start_after = time.time() + GlobalContext.params.run_after_failure
                    umcdef.num_errors = umcdef.num_errors + 1
                    umcdef.lasterror_time = time.time()

                # keep a bounded history of return codes, newest first
                umcdef.returncodes.insert(0, (time.time(), rc))
                excess = len(umcdef.returncodes) - GlobalContext.params.retc_history
                if excess > 0:
                    del umcdef.returncodes[-excess:]

            # clear the process when it is not running or when it is a zombie;
            # this happens when the process ends normally but we still hold
            # a reference to it
            if not umcdef.proc.is_running() or umcdef.proc.status() == psutil.STATUS_ZOMBIE:
                del umcdef.proc
                umcdef.proc = None
                umcdef.last_started_time = 0
                sleep(0.1)
        except Exception as e:
            Msg.warn_msg("There was a problem when quering the process with pid %d: %s" % (pid, str(e)))
            # exact class match is kept on purpose (subclasses are not treated
            # as "process is gone" here)
            if e.__class__ == psutil.NoSuchProcess:
                umcdef.proc = None
                umcdef.last_started_time = 0
    finally:
        umcdef.lock.release()
def __run_httpd(self):
    """Create the HTTP server, bind it and serve requests until ``self.exit`` is set.

    The server is created with ``bind_and_activate=False`` so that
    ``allow_reuse_address`` and ``timeout`` can be set before binding.
    Start-up errors are logged as a warning and the method returns without
    serving. The server is always closed when the serving loop ends.
    """
    # start the server
    Msg.info1_msg("Starting http server on at %s:%s." % (self.address, self.tcp_port))
    httpd = None
    try:
        self.exit = Event()
        httpd = ThreadedHTTPServer((self.address, int(self.tcp_port)), Handler, bind_and_activate=False)
        self.httpd = httpd
        httpd.allow_reuse_address = True
        httpd.timeout = 1
        httpd.server_bind()
        httpd.server_activate()
    except Exception as e:
        Msg.warn_msg("Cannot start HTTP server due to: %s." % (str(e)))
        # The constructor has already created the listening socket; release it
        # when binding/activation fails, otherwise the socket leaks.
        if httpd is not None:
            try:
                httpd.server_close()
            except Exception as close_error:
                Msg.warn_msg("Error occurred while closing the HTTP server: %s" % (str(close_error)))
        return

    # serve the requests; handle_request() returns after the timeout set
    # above, so the exit event is re-checked regularly
    try:
        while not self.exit.is_set():
            self.httpd.handle_request()
    finally:
        Msg.info1_msg("Closing HTTP server.")
        try:
            self.httpd.server_close()
        except Exception as e:
            Msg.warn_msg("Error occurred while closing the HTTP server: %s" % (str(e)))
def run_task(self, GlobalContext, tdef):
    """Check for orphan processes in the process group of this process.

    A pid listed by ``self.get_all_pgids()`` for the current process group
    counts as an orphan when it is alive (``os.kill(pid, 0)`` succeeds) but is
    not among the recursive children of the current process.

    :return: ``False`` when at least one orphan exists, ``True`` otherwise
    """
    # NOTE(review): get_all_pgids() is assumed to map str(pgid) to a list of
    # pids -- confirm against its definition.
    pids = self.get_all_pgids()[str(os.getpgrp())]

    # set of child pids: one lookup per pid instead of rescanning the list
    child_pids = set(p.pid for p in psutil.Process().children(recursive=True))

    orphans = []
    for pid in pids:
        # normalise once; the pid is used both for the signal and the lookup,
        # so both must see the same (integer) value
        pid_num = int(pid)
        try:
            os.kill(pid_num, 0)
        except OSError:
            # we are not so fast, the process ended in the meantime
            continue

        # the process is live; check it exists in the process tree
        if pid_num not in child_pids:
            orphans.append(pid)

    # pause if there are orphans
    if len(orphans) > 0:
        Msg.warn_msg("There are %d orphan processes, will pause umcrunner until orhpans exist!" % (len(orphans)))
        Msg.info2_msg("The orhpans are: %s" % orphans)
        return False
    return True
def terminate_process_children(proc, timeout=10):
    """Terminate all (recursive) children of ``proc``.

    Every child is first asked to terminate; children still alive after
    ``timeout`` seconds are killed. Failures on individual children are
    best-effort: they are logged and do not stop the processing of the rest.

    :param proc: process object providing ``children(recursive=True)`` and ``pid``
    :param timeout: seconds to wait for the children to end before killing them
    """
    # get all children processes
    procs = proc.children(recursive=True)
    Msg.info1_msg("Terminating process tree of pid %d with %d children..." % (proc.pid, len(procs)))
    if not procs:
        return

    # send SIGTERM
    for p in procs:
        try:
            p.terminate()
        except psutil.NoSuchProcess:
            # already gone, nothing to do
            pass
        except Exception as e:
            # "except Exception" (not a bare except) so that KeyboardInterrupt
            # and SystemExit are no longer swallowed
            Msg.warn_msg("Cannot terminate the process with pid %d: %s" % (p.pid, str(e)))

    # wait for processes to die
    _gone, alive = psutil.wait_procs(procs, timeout=timeout, callback=on_terminate)

    # send force kill if there are still live processes
    if alive:
        Msg.warn_msg("There were %d child processes that did not terminare within the timeout of %d seconds. Killing them..."
            % (len(alive), timeout))
        for p in alive:
            try:
                p.kill()
            except psutil.NoSuchProcess:
                pass
            except Exception as e:
                Msg.warn_msg("Cannot kill the process with pid %d: %s" % (p.pid, str(e)))
def run_task(self, GlobalContext, tdef):
    """Check the number of (recursive) child processes against the limit.

    :return: ``False`` when the count exceeds
        ``GlobalContext.params.max_processes``, ``True`` otherwise
    """
    num_children = len(psutil.Process().children(True))
    Msg.info2_msg("There are %d children processes."%(num_children))

    if num_children <= GlobalContext.params.max_processes:
        return True

    Msg.warn_msg("The current number of child processes %d exceeds the maximum of %d; umcrunner will be paused."
        %(num_children, GlobalContext.params.max_processes))
    return False
def run_task(self, GlobalContext, tdef):
    """Collect log-directory statistics for every enabled umc instance.

    For each enabled instance the log directory is scanned and the result is
    stored in ``ud.log_stats`` (while holding ``ud.lock``):

    * ``backlog_total`` and ``backlog_group_<n>`` -- number of log files
      waiting to be consumed, in total and per group (1-9);
    * ``errorlog_size`` / ``errorlog_mtime`` -- size of the error log and its
      modification time (0 when the error log is empty).

    :return: always ``True``
    """
    if GlobalContext.umcdefs is None:
        return True

    for ud in GlobalContext.umcdefs:
        if not ud.enabled:
            continue

        ud.lock.acquire()
        try:
            log_stats = Map(backlog_total=0, errorlog_mtime=0, errorlog_size=0, errorlog_tail=[])
            log_dir = get_umc_instance_log_dir(ud.umc_instanceid, GlobalContext)

            if os.path.isdir(log_dir):
                # Build the patterns once per instance. The instance id is
                # escaped and the dots are matched literally, so that neither
                # can accidentally act as a regular expression wildcard.
                instance_id = re.escape(ud.umc_instanceid)
                # log file waiting to be consumed; there is a maximum of 9 groups (1-9)
                backlog_re = re.compile("^" + instance_id + r"_[0-9\-]+\.log\.([1-9])$")
                # the error log
                errorlog_re = re.compile("^" + instance_id + r"(_[0-9\-]+)?\.error\.out$")

                for entry in scandir(log_dir):
                    fname = os.path.basename(entry.path)

                    m1 = backlog_re.match(fname)
                    if m1:
                        fg_key = "backlog_group_%s" % m1.group(1)
                        log_stats[fg_key] = (log_stats.get(fg_key) or 0) + 1
                        log_stats.backlog_total += 1

                    if errorlog_re.match(fname):
                        try:
                            stat = os.stat(os.path.join(log_dir, fname))
                        except OSError:
                            # the file disappeared between the directory
                            # listing and the stat call; skip it
                            continue
                        log_stats.errorlog_size = stat.st_size
                        log_stats.errorlog_mtime = stat.st_mtime if stat.st_size > 0 else 0
                        # reading the tail of the error log takes too much
                        # time, so errorlog_tail is intentionally left empty
            else:
                Msg.warn_msg("Directory %s does not exist!" % log_dir)

            # update log stats
            ud.log_stats = log_stats
        finally:
            ud.lock.release()

    return True
def __init__(self, globalCtx):
    """Prepare the HTTP server settings from the global context.

    Stores ``globalCtx`` in the module-level ``GlobalContext``. The server is
    marked as enabled only when HTTP is enabled in the params and the server
    list has an entry for this host with an address, a tcp port and the
    ``me`` flag set; in that case ``self.address`` and ``self.tcp_port`` are
    taken from that entry.
    """
    global GlobalContext
    GlobalContext = globalCtx

    self.enabled = False
    self.thread = None

    if not GlobalContext.params.http_enabled:
        Msg.info1_msg("HTTP server is disabled.")
        return

    sl_def = GlobalContext.server_list.get(socket.gethostname())
    bindable = (sl_def is not None
                and sl_def.address is not None
                and sl_def.tcp_port is not None
                and sl_def.me)
    if not bindable:
        Msg.warn_msg(
            "Cannot determine umcrunner's address and/or tcp_port for http server to bind to. The http server will not be started!"
        )
        return

    self.enabled = True
    self.address = sl_def.address
    self.tcp_port = sl_def.tcp_port
def run_task(self, GlobalContext, tdef):
    """Check the number of zombie processes among the (recursive) children.

    :return: ``False`` when there are more zombies than umc instances in
        ``GlobalContext.umcdefs``, ``True`` otherwise
    """
    def is_zombie(process):
        # a process whose status cannot be read is not counted
        try:
            return process.status() == psutil.STATUS_ZOMBIE
        except Exception:
            return False

    nz = sum(1 for child in psutil.Process().children(True) if is_zombie(child))
    Msg.info2_msg("There are %d zombie processes"%(nz))

    if nz <= len(GlobalContext.umcdefs):
        return True

    Msg.warn_msg("There are %d zombie processes which exceeds the number of umc instances %d. Will pause umc runner until the zombie processes will disappear!"%
        (nz,len(GlobalContext.umcdefs)))
    return False
def run_task(self, GlobalContext, tdef):
    """Start every enabled umc instance that is due and report the overall state.

    For each enabled instance (while holding its lock):

    * an instance with a process is reported as running;
    * an instance whose ``start_after`` has not passed yet is reported as waiting;
    * an instance started less than ``GlobalContext.params.min_starting_time``
      seconds ago is not started and is reported as waiting;
    * otherwise the instance is started via ``self.run_umc`` and its start
      bookkeeping is updated; a failed start is logged and not propagated.

    :return: ``None`` -- the method has no explicit result
    """
    running = []
    started = []
    waiting = []

    for umcdef in GlobalContext.umcdefs:
        if not umcdef.enabled:
            continue

        umcdef.lock.acquire()
        try:
            if umcdef.proc is not None:
                running.append("%s, PID=%d" % (umcdef.umc_instanceid, umcdef.proc.pid))
            elif time.time() <= umcdef.start_after:
                waiting.append("%s, WT=%.2fs" % (umcdef.umc_instanceid, umcdef.start_after - time.time()))
            elif (umcdef.last_started_time is not None
                    and time.time() - umcdef.last_started_time < GlobalContext.params.min_starting_time):
                Msg.warn_msg("umc instance id '%s' starting frequency is too high (<%d seconds), will not start it now!"
                    % (umcdef.umc_instanceid, GlobalContext.params.min_starting_time))
                waiting.append("%s, WT=%.2fs" % (umcdef.umc_instanceid, GlobalContext.params.min_starting_time))
            else:
                try:
                    # run umc instance as a child process
                    umcdef.proc = self.run_umc(umcdef, GlobalContext)

                    # start time
                    start_t = time.time()
                    umcdef.start_after = 0
                    umcdef.last_started_time = start_t
                    umcdef.num_runs = umcdef.num_runs + 1
                    if umcdef.first_started_time == 0:
                        umcdef.first_started_time = time.time()

                    started.append("%s, PID=%d" % (umcdef.umc_instanceid, umcdef.proc.pid))
                except Exception as e:
                    Msg.warn_msg("Error occurred while starting umc instance %s. The exception was: %s"
                        % (umcdef.umc_instanceid, str(e)))
        finally:
            umcdef.lock.release()

    Msg.info2_msg("Running: %s" % (running))
    Msg.info2_msg("Started: %s" % (started))
    Msg.info2_msg("Waiting: %s" % (waiting))
def __send_request(self):
    """Send the proxy request and store the result in ``self.response``.

    Only the ``get`` and ``post`` methods are supported. A ``Via`` header
    carrying this host's name is added to the request. Any failure (including
    an unsupported method) is logged as a warning and not propagated.
    """
    try:
        Msg.info2_msg("Sending proxy request %s %s" % (self.method.upper(), self.url))
        via_headers = {"Via": "1.1 %s" % socket.gethostname()}

        senders = {"get": requests.get, "post": requests.post}
        if self.method not in senders:
            raise Exception("Method %s is not supported!" % self.method)

        timeouts = (GlobalContext.params.proxy_timeout_connect,
                    GlobalContext.params.proxy_timeout_read)
        self.response = senders[self.method](self.url, timeout=timeouts, headers=via_headers)
    except Exception as e:
        Msg.warn_msg("Proxy request to %s failed: %s" % (self.url, str(e)))
def __init__(self, config, writer_id):
    """Read the reader parameters, applying writer-specific overrides.

    The common values are read from ``common.umcpush.reader-params``. For the
    writer with the given ``writer_id`` in ``common.umcpush.writers``, every
    entry of its ``reader-params`` overrides the common parameter of the same
    name (with ``-`` replaced by ``_``); an entry that does not correspond to
    a known parameter is reported with a warning and ignored.

    :param config: configuration object providing ``value(key[, default])``
    :param writer_id: id of the writer whose overrides should be applied
    """
    self.config = config

    # read common reader's params
    base_key = "common.umcpush.reader-params"
    self.params = Map(
        max_batchsize_rows=self.config.value(
            base_key + ".max-batchsize-rows", 50),
        max_batchsize_files=self.config.value(
            base_key + ".max-batchsize-files", 300),
        log_file_group=self.config.value(base_key + ".log-file-group", 1),
        common_tags=self.config.value(base_key + ".common-tags").split(','),
        common_fields=self.config.value(base_key + ".common-fields").split(','),
        default_timefield=self.config.value(
            base_key + ".default-timefield", "datetime"),
        default_timeformat=self.config.value(
            base_key + ".default-timeformat", "%Y-%m-%d %H:%M:%S"),
        tzoffset=utils.float_ex(
            self.config.value(base_key + ".tzoffset", 0), 0))

    # update any value that may be overridden in writer's specific parameters
    writers_key = "common.umcpush.writers"
    writers = config.value(writers_key)
    for writer in writers:
        if writer["writer-id"] != writer_id:
            continue

        rparams = writer["reader-params"]
        if rparams is None:
            continue

        for k, v in rparams.items():
            k = k.replace("-", "_")
            # Test for the presence of the key, not the truthiness of its
            # current value: a param with a falsy default (e.g. tzoffset 0)
            # must be overridable too.
            # NOTE(review): relies on Map supporting the "in" operator
            # (dict-like) -- confirm.
            if k in self.params:
                self.params[k] = v
            else:
                Msg.warn_msg(
                    "The reader param %s is invalid in %s" %
                    (k, "%s[writer-id=%s].reader-params" % (writers_key, writer_id)))
def run_all(self):
    """Run all tasks that are due and update the global paused state.

    A task runs when its interval has elapsed, when it is allowed to run in
    the current paused state, when it is not locally paused (``run_after``)
    and when it is not disabled. A task whose result is falsy pauses the
    remaining tasks of this pass. A task that ran longer than its hard limit
    is disabled; one that ran longer than its soft limit is paused for
    ``pause_for`` seconds. Finally the global paused flag is set when any
    task has a result that is not ``None`` and is falsy, and a change of the
    flag is logged.
    """
    ctx = self.GlobalContext
    paused = ctx.paused

    for task in self.tasks:
        interval_elapsed = time.time() - task.last_run_time > task.time_interval
        if not (interval_elapsed and (task.run_on_global_pause or not paused)):
            continue

        # locally paused
        if task.run_after != 0 and time.time() <= task.run_after:
            continue

        if task.disabled:
            continue

        # inform that the task is resumed if it was paused
        if task.run_after > 0:
            task.run_after = 0
            Msg.info1_msg("The task %s is resumed."%(task.name))

        # run the task
        started_at = time.time()
        task.result = task.target.run_task(ctx, task)
        finished_at = time.time()

        if not task.result:
            paused = True

        task.last_run_time = finished_at
        task.last_run_duration = finished_at - started_at
        duration = task.last_run_duration

        if task.time_limit_disable > 0 and duration > task.time_limit_disable:
            # hard limit exceeded
            task.disabled = True
            Msg.warn_msg("The task %s was running for %.2f seconds which is more than the hard maximum of %.2f seconds. The task will be disabled."
                %(task.name, task.last_run_duration, task.time_limit_disable))
        elif task.time_limit_pause > 0 and duration > task.time_limit_pause:
            # soft limit exceeded
            task.run_after = finished_at + task.pause_for
            Msg.warn_msg("The task %s was running for %.2f seconds which is more than the soft maximum of %.2f seconds. The task will be paused for %.2f seconds."
                %(task.name, task.last_run_duration, task.time_limit_pause, task.pause_for))
        else:
            # report on task duration
            Msg.info2_msg("The task %s was running for %.2f seconds."%(task.name,task.last_run_duration))

    was_paused = ctx.paused
    ctx.paused = any(not task.result for task in self.tasks if task.result is not None)
    if ctx.paused != was_paused:
        Msg.warn_msg("umcrunner state has been %s."%("PAUSED" if ctx.paused else "RESUMED"))
def process_cluster_request(self, method, path_def, allow_all, cache_maxage, is_stream, get_content):
    """Serve a request whose path addresses one or more umcrunner hosts.

    Depending on the servers returned by ``self.get_server_list``:

    * several servers and ``allow_all``: the request is proxied to every
      server and the JSON arrays of the responses are merged into one array
      (cached for ``cache_maxage`` seconds when that is positive); a request
      that already carries a ``Via`` header is rejected with 400;
    * exactly one server that is not this one: the client is redirected (308);
    * exactly one server that is this one: the content comes from
      ``get_content(params)``;
    * otherwise: 404.

    :return: ``None`` when the path does not match ``path_def`` or has no
        hostname (nothing has been sent), ``True`` when the request was
        handled, ``False`` when it was rejected (400 or 404 has been sent)
    """
    params = PathDef(path_def).params(self.path)

    # path must be a valid path and hostname param must exist in it
    if params is None or params.params.hostname is None:
        return None

    def forward_url(server_def):
        # URL of the same resource on the given server
        return "http://{address}:{tcp_port}{fw_path}".format(
            address=server_def.address,
            tcp_port=server_def.tcp_port,
            fw_path=params.replace(params, Map(hostname=server_def["hostname"])))

    # get a list of servers this should be proxied to; if there is more than
    # one, proxy them, otherwise run locally or redirect via the client
    server_list = self.get_server_list(params)

    # hostname is "all", will forward to individual umcrunner servers
    if len(server_list) > 1 and allow_all:
        # a request that has been proxied already must not be proxied again
        if self.headers.get("Via") is not None:
            Msg.warn_msg(
                "A request to %s can only come from a client, not a proxy! (%s)"
                % (self.path, self.headers.get("Via")))
            self.send(
                400, None,
                "Request to the resource that comes via a proxy is not allowed!")
            return False

        # acquire lock on this path to prevent other threads from doing the same
        cache.acquire_lock(self.path)
        try:
            content = cache.get(self.path)
            if content is None:
                # not in cache; proxy to all umcrunner hosts including "me" (this one)
                Msg.info2_msg("Sending %d proxy requests." % (len(server_list)))
                start_t = time.time()

                prqs = []
                for server_def in server_list:
                    prq = ProxyRequest(method, forward_url(server_def),
                                       GlobalContext.params.proxy_run_threads)
                    prqs.append(prq)
                    prq.send_request()

                # wait for all responses
                for prq in prqs:
                    prq.wait_for_response()

                # get all "valid" responses
                resp = [r for r in prqs if r.response is not None]
                Msg.info2_msg(
                    "Data from %d proxy requests retrieved in %.2f seconds." %
                    (len(resp), time.time() - start_t))

                # the result from individual servers should always be a json
                # array; strip the brackets and join the non-empty ones
                bodies = [r.response.text.strip() for r in resp]
                content = Map(content="[%s]" % ",".join(
                    [body[1:-1] for body in bodies if body != "[]"]))

                # add result to cache
                if cache_maxage > 0:
                    cache.create_data(self.path, content.content,
                                      time.time(), cache_maxage)
            else:
                Msg.info2_msg("Serving request for %s from cache." % self.path)

            # send back response
            self.send(200, {"Content-Type": "application/json"}, content.content)
        finally:
            cache.release_lock(self.path)
        return True

    if len(server_list) == 1:
        # params.params.hostname should be a valid hostname
        server_def = server_list[0]

        if not server_def.me:
            # host should be a known host, redirect the request onto it
            # rather than being a proxy
            location_url = forward_url(server_def)
            Msg.info2_msg("Redirecting the request to '%s'" % location_url)
            self.send(308, {"Location": location_url}, "")
            # A response has been sent, so report the request as handled;
            # None is reserved for "the path did not match".
            return True

        if is_stream:
            get_content(params)
            return True

        content = get_content(params)
        if content is not None:
            self.send(content.code, {"Content-Type": "application/json"},
                      "[%s]" % ",".join(content.json))
        else:
            # should not happen really
            self.send(500, None, "")
        return True

    self.send(
        404, None, "The host '%s' cannot be found or is not allowed!" %
        params.params.hostname)
    return False
def run_task(self, GlobalContext, tdef):
    """Collect statistics of all umc instances and of umcrunner itself.

    For every umc instance (while holding its lock) the counters in
    ``umc_counts`` are updated and process information of the instance
    (pid, uptime, command line, rss in MB and user cpu time of its children)
    is stored in ``ud.stats["p"]``. The totals together with the statistics
    of the current process are stored in ``GlobalContext.umcrunner_stats``.

    :return: always ``True``
    """
    # float divisor: keeps the MB conversion exact under Python 2 as well
    bytes_per_mb = 1024.0 * 1024.0

    umc_counts = Map(count=0, enabled=0, disabled=0, running=0, waiting=0, num_children=0,
        rss=0, cpu=0, cpu_s=0, runs=0, errors=0, last_errortime=0, backlog_total=0)

    if GlobalContext.umcdefs is not None:
        for ud in GlobalContext.umcdefs:
            ud.lock.acquire()
            try:
                umc_counts.count += 1
                if ud.enabled:
                    umc_counts.enabled += 1
                else:
                    umc_counts.disabled += 1
                umc_counts.errors += ud.num_errors
                umc_counts.runs += ud.num_runs

                # take the last error time from the error log if that is more recent
                if ud.log_stats is not None and ud.log_stats.errorlog_mtime > ud.lasterror_time:
                    ud.lasterror_time = ud.log_stats.errorlog_mtime
                if ud.lasterror_time > umc_counts.last_errortime:
                    umc_counts.last_errortime = ud.lasterror_time

                if time.time() < ud.start_after:
                    umc_counts.waiting += 1

                log_stats = ud.get("log_stats")
                if log_stats and log_stats.get("backlog_total"):
                    umc_counts.backlog_total += log_stats.backlog_total

                # umc instance statistics; "p" holds the process info
                stats = {}
                p = {}
                try:
                    if ud.proc is not None:
                        umc_counts.running += 1
                        p["top_pid"] = ud.proc.pid
                        p["uptime"] = time.time() - ud.last_started_time
                        p["cmdline"] = ud.proc.cmdline()

                        kids = ud.proc.children(True)
                        rss = 0.0
                        cpu = 0
                        for k in kids:
                            try:
                                d = k.as_dict(attrs=['cpu_times', 'memory_info'])
                                kid_cpu = d["cpu_times"].user
                                kid_rss = d["memory_info"].rss
                            except (psutil.Error, AttributeError):
                                # The child ended in the meantime or its data
                                # is not accessible; skip it instead of
                                # discarding the stats of the whole instance.
                                continue
                            cpu = cpu + kid_cpu
                            rss = rss + kid_rss

                        p["rss"] = float(rss / bytes_per_mb)  # in MB
                        p["cpu"] = cpu
                        # guard against a zero uptime right after the start
                        p["cpu_s"] = cpu / p["uptime"] if p["uptime"] > 0 else 0.0
                        p["num_chproc"] = len(kids)

                        umc_counts.rss += p["rss"]
                        umc_counts.cpu += p["cpu"]
                        umc_counts.cpu_s += p["cpu_s"]
                        umc_counts.num_children += p["num_chproc"]
                except Exception as e:
                    Msg.warn_msg("Error occurred when retrieving process info: %s" % str(e))

                stats["p"] = p
                ud.stats = stats
            finally:
                ud.lock.release()

    # umcrunner stats
    proc = psutil.Process()
    d = proc.as_dict(attrs=['cpu_times', 'memory_info'])
    uptime = time.time() - proc.create_time()
    hostname = socket.gethostname()
    GlobalContext.umcrunner_stats = Map(
        pid=proc.pid,
        hostname=hostname,
        uptime=uptime,
        cpu=d["cpu_times"].user,
        cpu_s=d["cpu_times"].user / uptime if uptime > 0 else 0.0,
        rss=float(d["memory_info"].rss / bytes_per_mb),
        threads=proc.num_threads(),
        umc_counts=umc_counts,
        link_umcinstances="/stats/hosts/{hostname}/umc/all".format(hostname=hostname)
    )

    return True
def read_datapoints(self, logfilename, umcdef, create_writeitem_func):
    """Read a CSV log file and convert its rows to data points.

    For every row the timestamp is computed from the time field according to
    ``umcdef.reader.timeformat`` (``_unix_``/``_time_s_`` are seconds,
    ``_time_ms_`` is milliseconds, anything else is a ``strptime`` format
    combined with the time zone offset in hours) and scaled to nanoseconds.
    Rows whose time cannot be converted are logged and skipped. A row is
    passed to ``create_writeitem_func(umcdef, timestamp, fields, tags)`` only
    when at least one field has a value and the optional filter holds; lists
    returned by that function are appended to the result.

    :param logfilename: path of the CSV log file to read
    :param umcdef: umc definition; nothing is read when it is not enabled
    :param create_writeitem_func: callable creating the records for one row
    :return: list of all records created
    """
    datapoints = []
    notags = False
    tzoffset = self.params.tzoffset

    if umcdef.enabled:
        reader_def = umcdef.reader

        # read datapoints
        with open(logfilename, 'r') as csvfile:
            for row in csv.DictReader(csvfile, delimiter=','):
                # remove None keys
                row = {k: v for k, v in row.items() if k is not None}

                # timestamp
                try:
                    if reader_def.timefield not in row:
                        raise ValueError("Cannot find time field '" +
                                         reader_def.timefield +
                                         "' in data row!")

                    # int() is used instead of the Python-2-only long(); it
                    # is not limited in size in either Python version
                    if reader_def.timeformat in ("_unix_", "_time_s_"):
                        timestamp = int(row[reader_def.timefield]) * 1000000000
                    elif reader_def.timeformat == "_time_ms_":
                        timestamp = int(row[reader_def.timefield]) * 1000000
                    else:
                        if reader_def.tzfield is not None and reader_def.tzfield in row:
                            tzoffset = utils.float_ex(
                                row[reader_def.tzfield], self.params.tzoffset)
                        parsed = datetime.datetime.strptime(
                            row[reader_def.timefield], reader_def.timeformat)
                        timestamp = (self.unix_time_millis(parsed) -
                                     int(tzoffset * 60 * 60 * 1000)) * 1000000
                except Exception as e:
                    # output error and skip this row
                    Msg.err_msg(
                        "Cannot read or convert time to timestamp for %s: %s"
                        % (umcdef.umcid, str(e)))
                    continue

                # create tags and fields
                tags = {k: str(v) for k, v in row.items()
                        if k in reader_def.tcols}
                fields = {k: utils.float_ex(v) for k, v in row.items()
                          if k in reader_def.fcols}
                notags = (len(tags) == 0)

                # only add this row if there is at least one field with some value
                if not any(v is not None for v in fields.values()):
                    continue

                # evaluate transformations
                if reader_def.transform is not None:
                    tags, fields = eval_transform(
                        reader_def.transform, timestamp, tags, fields)

                # only add this row if filter holds on this row or there is no filter
                if reader_def.filter is not None and not eval_filter(
                        reader_def.filter, timestamp, tags, fields):
                    continue

                try:
                    records = create_writeitem_func(
                        umcdef, timestamp, fields, tags)
                    if isinstance(records, list):
                        datapoints += records
                except Exception as e:
                    Msg.err_msg(
                        "Error occured while creating data points item: %s"
                        % str(e))

    # check for no tags
    if notags and len(datapoints) > 0:
        Msg.warn_msg(
            "The definition of %s contains no tags presented in the log file %s!"
            % (umcdef.umcid, os.path.basename(logfilename)))

    return datapoints