def timed_broadcast():
    """
    Broadcast this node's status and the known jobs until `running` is falsy.

    Every cycle sends one Status message built from get_ip() and get_load(),
    then serializes each job in storage.cluster_jobs and sends it packet by
    packet, and finally sleeps for args.broadcast_interval seconds.
    Relies on `running`, `args`, `storage` and `hash_key` from the
    surrounding scope.
    """
    while running:
        port = args.udp_communication_port
        status = Status(get_ip(), get_load())
        broadcast(port, UdpSerializer.dump(status, hash_key))

        for known_job in storage.cluster_jobs:
            # the pid is refreshed only for jobs assigned to this node's address
            if known_job.assigned_to == get_ip():
                known_job.pid = check_process(known_job.command)
            for datagram in UdpSerializer.dump(known_job, hash_key):
                client(port, datagram)

        time.sleep(args.broadcast_interval)
def test_manual_run_is_executed_exactly_once():
    """
    Queue a job followed by a Run request for it and process both.

    Expects a single stored job whose log holds exactly one entry containing
    the exit code and the command output, and an empty processor queue.
    """
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        command = "echo 'hello world'"
        cron_job = CronItem(command=command)
        cron_job.assigned_to = get_ip()

        storage = Storage()
        tab = CronTab(tab="""* * * * * command""")
        processor = Processor(12345, storage, cron=tab)

        for packet in UdpSerializer.dump(cron_job):
            processor.queue.put_nowait(packet)

        for packet in UdpSerializer.dump(Run(cron_job)):
            processor.queue.put_nowait(packet)

        loop.run_until_complete(processor.process())

        assert 1 == len(storage.cluster_jobs)
        assert command == storage.cluster_jobs[0].command
        assert 1 == len(storage.cluster_jobs[0].log)
        assert 'exit code: 0' in storage.cluster_jobs[0].log[
            0] and 'hello world' in storage.cluster_jobs[0].log[0]
        assert processor.queue.empty()
    finally:
        # close the loop even when an assertion fails, so a failing test
        # does not leak an open event loop into the rest of the session
        loop.close()
def test_add_same_job_twice_adds_cron_once():
    """
    Queue the same job twice and process both copies.

    Expects the job to be stored once and to match exactly one entry in the
    crontab when looked up by command.
    """
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        command = "echo 'hello world'"
        cron_job = CronItem(command=command)
        cron_job.assigned_to = get_ip()

        storage = Storage()
        tab = CronTab(tab="""* * * * * command""")
        processor = Processor(12345, storage, cron=tab)

        for packet in UdpSerializer.dump(cron_job):
            processor.queue.put_nowait(packet)

        for packet in UdpSerializer.dump(cron_job):
            processor.queue.put_nowait(packet)

        loop.run_until_complete(processor.process())

        assert 1 == len(storage.cluster_jobs)
        assert command == storage.cluster_jobs[0].command
        assert None is not next(tab.find_command(command), None)
        assert 1 == len(list(tab.find_command(command)))
    finally:
        # close the loop even when an assertion fails, so a failing test
        # does not leak an open event loop into the rest of the session
        loop.close()
def kill(self, kill):
    """
    Handle a completely received kill request.

    Without a PID on the request only a warning is logged. Otherwise the
    process tree is killed when the job is assigned to this node's address
    and check_process() confirms the process for the given PID.

    :param kill: kill message carrying `job`, `pid` and `command`
    """
    if not kill.pid:
        self.logger.warning("got kill command for {0} but PID not set".format(kill.job))
        return

    self.logger.debug("got full kill in buffer ({0}".format(kill.job))

    # NOTE(review): reads `kill.command`, not `kill.job.command` - confirm the
    # kill message really exposes a `command` attribute.
    is_owner = kill.job.assigned_to == get_ip()
    if not (is_owner and check_process(kill.command, pid=kill.pid)):
        return

    self.logger.info("I'm owner, going to try and kill the running job {0}".format(kill.job))
    try:
        kill_proc_tree(kill.pid)
    except ValueError:
        self.logger.warning("got signal to kill self, that's not happening")
async def cron_in_sync(self, request):
    """
    Report whether the stored jobs assigned to this node match the crontab.

    For every stored job assigned to this node's address, the crontab entries
    found for the job's command must contain one that equals the job.

    :param request: incoming web request (not inspected)
    :return: web.HTTPConflict naming the first unmatched job, else web.HTTPOk
    """
    for job in self.storage.cluster_jobs:
        if job.assigned_to != get_ip():
            continue
        candidates = self.cron.find_command(job.command)
        found = next((entry for entry in candidates if entry == job), None)
        if not found:
            message = "stored job {0} not matched to actual cron".format(job)
            return web.HTTPConflict(text=message)
    return web.HTTPOk()
def toggle_job(self, toggle):
    """
    Handle a completely received toggle request.

    When the stored copy of the job is assigned to this node's address, its
    enabled state is flipped, missing user/cron references are filled in from
    this processor, the crontab is written and the job is moved to the end
    of the stored job list.

    :param toggle: toggle message carrying the `job` to switch
    """
    self.logger.debug("got full toggle in buffer {0}".format(toggle.job))

    known_jobs = self.storage.cluster_jobs
    job = next((known for known in known_jobs if known == toggle.job), None)
    if not job:
        return
    if job.assigned_to != get_ip():
        return

    self.logger.info("am owner for job {0}, toggling it".format(job))
    job.enable(not job.is_enabled())
    if self.user and not job.user:
        job.user = self.user
    if self.cron and not job.cron:
        job.cron = self.cron
    self.cron.write()

    # drop the job from its current position and append it again at the end
    position = self.storage.cluster_jobs.index(job)
    del self.storage.cluster_jobs[position]
    self.storage.cluster_jobs.append(job)
async def run(self, run, uuid):
    """
    Handle a completely received run request.

    When the stored copy of the job is assigned to this node's address, the
    job's command is executed through the shell, the outcome is appended to
    the job's log and the job is broadcast. The buffer entry for `uuid` is
    cleaned afterwards.

    :param run: run message carrying the `job` to execute
    :param uuid: identifier handed to clean_buffer()
    """
    self.logger.debug("got full run in buffer {0}".format(run.job))

    job = next((known for known in self.storage.cluster_jobs if known == run.job), None)
    if job and job.assigned_to == get_ip():
        self.logger.info("am owner for job {0}".format(job))
        run.timestamp = datetime.now()

        process = subprocess.Popen(
            run.job.command,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
            shell=True,
        )
        self.logger.info("{0} has been defined, going to execute".format(job.command))

        # NOTE(review): communicate() is a blocking call, so this coroutine
        # does not yield to the event loop while the command runs.
        std_out, std_err = process.communicate()
        exit_code = process.wait()

        if std_err:
            self.logger.warning("error during execution of {0}: {1}".format(run.job.command, std_err))
        self.logger.info("output of {0} with code {1}: {2}".format(job.command, exit_code, std_out))

        log_entry = "{0:%b %d %H:%M:%S} localhost CRON[{1}] exit code: {2}, out: {3}, err: {4}".format(
            datetime.now(), process.pid, exit_code, std_out, std_err)
        job.append_log(log_entry)

        broadcast(self.udp_port, UdpSerializer.dump(job, self.hash_key))

    self.clean_buffer(uuid)
def remove_job(self, job):
    """
    Handle a completely received remove request.

    A stored job is removed from storage. When it is assigned to this node's
    address, a running process for it is killed and the first crontab entry
    found for its command is removed and the crontab written.

    :param job: the job to remove
    """
    self.logger.debug("got full remove in buffer {0}".format(job))
    if job not in self.storage.cluster_jobs:
        return

    self.logger.debug("removing existing job {0}".format(job))
    self.storage.cluster_jobs.remove(job)

    if job.assigned_to != get_ip():
        return

    if job.pid:
        self.logger.warning("job {0} is running, going to kill it".format(job))
        if check_process(job.command, job.pid):
            kill_proc_tree(job.pid)

    self.logger.info("removing existing, assigned job {0}".format(job))
    cmd = next(self.cron.find_command(job.command), None)
    if not cmd:
        self.logger.warning("defined job {0} not found in cron, but assigned to me!".format(job))
        return

    self.logger.info("removing {0} from cron".format(job))
    self.cron.remove(cmd)
    self.cron.write()
def add_job(self, new_job):
    """
    Handle a completely received job.

    An already stored equal job is replaced by the received one. An unknown
    job assigned to this node's address is added to the crontab unless the
    first crontab entry found for its command already equals it. In every
    case the received job ends up at the end of the stored job list.

    :param new_job: the job that was received
    """
    self.logger.debug("got full job in buffer {0}".format(new_job))

    known_jobs = self.storage.cluster_jobs
    job = next((known for known in known_jobs if known == new_job), None)

    if job:
        # drop the stored copy, the received one is appended below
        del known_jobs[known_jobs.index(job)]
    elif new_job.assigned_to == get_ip():
        existing_job = next(self.cron.find_command(new_job.command), None)
        if existing_job and existing_job == new_job:
            self.logger.info("job already defined in tab, skipping it")
        else:
            if self.user and not new_job.user:
                new_job.user = self.user
            if self.cron and not new_job.cron:
                new_job.cron = self.cron
            self.logger.info("adding job {0} to cron {1} for user {2}".format(
                new_job, self.cron.filename, new_job.user))
            self.cron.append(new_job)
            self.cron.write()

    known_jobs.append(new_job)
def main():
    """
    entry point

    Parses the command line, configures logging, builds storage and the
    processor, then runs the UDP status server together with the periodic
    broadcast, re-balance and auto-save tasks and the web application until
    the event loop stops. Afterwards the web site is stopped, storage is
    saved (when a storage path is configured) and the remaining tasks are
    awaited.

    Raises SystemExit when the reported NTP offset exceeds 60.
    """
    parser = argparse.ArgumentParser(
        description='Distributed Cronlike Scheduler')

    parser.add_argument('-l', '--log-file', default=None,
                        help='path to store logfile')
    parser.add_argument('-p', '--storage-path', default=None,
                        help='directory where to store cache')
    parser.add_argument('-u', '--udp-communication-port', type=int,
                        default=12345,
                        help='communication port (default: 12345)')
    parser.add_argument('-i', '--broadcast-interval', type=int, default=5,
                        help='interval for broadcasting data over UDP')
    parser.add_argument(
        '-c', '--cron', default=None,
        help=
        'crontab to use (default: /etc/crontab, use `memory` to not save to file'
    )
    parser.add_argument('-d', '--cron-user', default=None,
                        help='user for storing cron entries')
    parser.add_argument('-w', '--web-port', type=int, default=8080,
                        help='web hosting port (default: 8080)')
    parser.add_argument(
        '-n', '--ntp-server', default='pool.ntp.org',
        help='NTP server to detect clock skew (default: pool.ntp.org)')
    parser.add_argument(
        '-s', '--node-staleness', type=int, default=180,
        help=
        'Time in seconds of non-communication for a node to be marked as stale (defailt: 180s)'
    )
    parser.add_argument(
        '-x', '--hash-key', default='abracadabra',
        help="String to use for verifying UDP traffic (to disable use '')")
    parser.add_argument('-v', '--verbose', action='store_true', default=False,
                        help='verbose logging')

    args = parser.parse_args()

    if get_ntp_offset(args.ntp_server) > 60:
        # raise SystemExit directly: the `exit` helper is injected by the
        # `site` module and is not guaranteed to exist (e.g. `python -S`)
        raise SystemExit("your clock is not in sync (check system NTP settings)")

    root_logger = logging.getLogger()
    if args.log_file:
        file_handler = logging.FileHandler(args.log_file)
        file_handler.setFormatter(logging.Formatter(log_format))
        root_logger.addHandler(file_handler)
    if args.verbose:
        root_logger.setLevel(logging.DEBUG)
    else:
        root_logger.setLevel(logging.INFO)
        logging.getLogger('aiohttp').setLevel(logging.WARNING)

    pool = ThreadPoolExecutor(4)

    storage = Storage(args.storage_path)
    if args.cron:
        if args.cron == 'memory':
            # in-memory crontab, nothing is saved to a file
            processor = Processor(args.udp_communication_port, storage,
                                  cron=CronTab(tab="""* * * * * command"""))
        elif args.cron_user:
            processor = Processor(args.udp_communication_port, storage,
                                  cron=CronTab(tabfile=args.cron,
                                               user=args.cron_user),
                                  user=args.cron_user)
        else:
            processor = Processor(args.udp_communication_port, storage,
                                  cron=CronTab(tabfile=args.cron,
                                               user='******'),
                                  user='******')
    else:
        processor = Processor(args.udp_communication_port, storage,
                              user='******')

    # an empty hash key on the command line disables it (None is passed on)
    hash_key = None
    if args.hash_key != '':
        hash_key = args.hash_key

    with StatusProtocolServer(processor, args.udp_communication_port) as loop:

        # read by the background loops below; set to False on shutdown
        running = True

        scheduler = Scheduler(storage, args.node_staleness)

        def timed_broadcast():
            """
            periodically broadcast system status and known jobs
            """
            while running:
                broadcast(
                    args.udp_communication_port,
                    UdpSerializer.dump(Status(get_ip(), get_load()), hash_key))
                for job in storage.cluster_jobs:
                    if job.assigned_to == get_ip():
                        job.pid = check_process(job.command)
                    for packet in UdpSerializer.dump(job, hash_key):
                        client(args.udp_communication_port, packet)
                time.sleep(args.broadcast_interval)

        def timed_schedule():
            """
            periodically check if cluster needs re-balancing
            """
            while running:
                time.sleep(23)
                if not scheduler.check_cluster_state():
                    logger.info("re-balancing cluster")
                    jobs = storage.cluster_jobs.copy()
                    for packet in UdpSerializer.dump(
                            ReBalance(timestamp=datetime.now()), hash_key):
                        client(args.udp_communication_port, packet)
                    time.sleep(5)
                    for job in jobs:
                        for packet in UdpSerializer.dump(job, hash_key):
                            client(args.udp_communication_port, packet)

        async def scheduled_broadcast():
            await loop.run_in_executor(pool, timed_broadcast)

        async def scheduled_rebalance():
            await loop.run_in_executor(pool, timed_schedule)

        async def save_schedule():
            """
            auto save every 100 seconds
            """
            while running:
                await asyncio.sleep(100)
                await storage.save()

        logger.info("setting broadcast interval to {0} seconds".format(
            args.broadcast_interval))
        loop.create_task(scheduled_broadcast())
        loop.create_task(scheduled_rebalance())
        if args.storage_path:
            loop.create_task(save_schedule())

        logger.info(
            "starting web application server on http://{0}:{1}/".format(
                get_ip(), args.web_port))

        if args.cron_user:
            s = Site(scheduler, storage, args.udp_communication_port,
                     cron=processor.cron, user=args.cron_user,
                     hash_key=hash_key)
        else:
            s = Site(scheduler, storage, args.udp_communication_port,
                     cron=processor.cron, hash_key=hash_key)
        runner = AppRunner(s.app)
        loop.run_until_complete(runner.setup())
        site_instance = TCPSite(runner, port=args.web_port)
        loop.run_until_complete(site_instance.start())

        try:
            loop.run_forever()
        except (KeyboardInterrupt, SystemExit):
            logger.info("interrupt received")
        except Exception:
            # still shut down gracefully, but do not hide what went wrong
            logger.exception("event loop stopped unexpectedly")

        logger.info("stopping web application")
        loop.run_until_complete(site_instance.stop())

        running = False

        if args.storage_path:
            loop.create_task(storage.save())

        logger.debug("waiting for background tasks to finish")
        # asyncio.Task.all_tasks() was removed in Python 3.9; prefer
        # asyncio.all_tasks() and fall back for interpreters older than 3.7.
        # The loop is passed explicitly because it is not running here.
        if hasattr(asyncio, "all_tasks"):
            known_tasks = asyncio.all_tasks(loop)
        else:
            known_tasks = asyncio.Task.all_tasks(loop)
        pending_tasks = [task for task in known_tasks if not task.done()]
        loop.run_until_complete(asyncio.gather(*pending_tasks))

    logger.info("elvis has left the building")