def instance_status(self, instance_name, project_id, job_status):
    """Combine the GCE instance state with the stored job status.

    Args:
        instance_name: Name of the instance to look up.
        project_id: Project that owns the instance.
        job_status: Status recorded for the job; only the value
            "running" is treated specially.

    Returns:
        "running"              instance RUNNING and job running.
        "standby-gce-running"  instance RUNNING but job not running
                               (possible malfunction; the instance may
                               need to be stopped manually).
        "failed"               job marked running but the instance is
                               TERMINATED or no status was returned.
        "standby-gce"          instance TERMINATED and job not running.
        None                   any other combination.
    """
    gce_status = instance(project_id).status_of(instance_name, project_id)
    job_is_running = job_status == "running"

    if gce_status == "RUNNING":
        return "running" if job_is_running else "standby-gce-running"

    if gce_status == "TERMINATED":
        # A stopped instance while the job is flagged as running means
        # the job most likely failed.
        return "failed" if job_is_running else "standby-gce"

    # No status at all (instance does not exist) while the job is
    # supposed to be running: treat as a failure as well.
    if gce_status is None and job_is_running:
        return "failed"

    return None
def run(self, job_name):
    """Launch every job named *job_name* that is not already running.

    A job is skipped only when its stored status equals
    JOB_RUNNING_STATUS and its instance reports GCE_RUNNING_STATUS.
    Otherwise the job is started through the instance client and the
    job record is rewritten with the current UTC time and
    JOB_RUNNING_STATUS.
    """
    for job in self.get_job_by_name(job_name):
        project_id = job.project_id
        stored_status = job.job_status
        machine_name = job.machine_name

        gce_client = instance(project_id)
        gce_status = gce_client.status_of(machine_name, project_id)

        print(">>>>>>>> Run status is " + stored_status)

        # Both the datastore flag and GCE agree the job is running:
        # nothing to do for this job.
        already_running = (
            stored_status == JOB_RUNNING_STATUS
            and gce_status == GCE_RUNNING_STATUS
        )
        if already_running:
            continue

        gce_client.run_job(job)
        started_at = datetime.utcnow()

        # The caller-supplied job_name is forwarded in addition to
        # job.job_name, exactly as update_job expects positionally.
        self.update_job(
            job.job_name,
            job.emails,
            job.project_id,
            job.bucket_id,
            job.machine_type,
            job.machine_name,
            job.startup_script,
            job.shutdown_script,
            job.machine_zone,
            job.after_run,
            job.machine_os,
            job.cron_schedule,
            job.max_running_time,
            job_name,
            started_at,
            JOB_RUNNING_STATUS,
        )
def stop(self, job_name):
    """Stop every job named *job_name* whose stored status is running.

    Jobs whose status differs from JOB_RUNNING_STATUS are left alone.
    """
    for job in self.get_job_by_name(job_name):
        project_id = job.project_id
        if job.job_status != JOB_RUNNING_STATUS:
            continue
        # A fresh instance client is created per job to issue the stop.
        instance(project_id).stop_job(job)
def run_job_queue(self):
    """Act on every entry of the job queue according to its after-run mode.

    * STOP_AFTER_RUN_VALUE: the instance is kept between runs, so it is
      started again when it reports GCE_TERMINATED_STATUS. If it is
      already running, this is only logged.
    * DELETE_AFTER_RUN_VALUE: the instance is removed between runs, so
      the job is launched through ``self.run`` only when no instance
      status is returned (instance does not exist).
    """
    for entry in self.get_queue_list():
        machine_name = entry.machine_name
        machine_zone = entry.machine_zone
        project_id = entry.project_id
        after_run = entry.after_run
        job_name = entry.job_name

        # Restart instance if instance has been stopped
        if after_run == STOP_AFTER_RUN_VALUE:
            gce_client = instance(project_id)
            gce_status = gce_client.status_of(machine_name, project_id)

            if gce_status == GCE_TERMINATED_STATUS:
                gce_client.start(machine_name, project_id, machine_zone)

            # TODO: Warning we must act to alert user -- but maybe the
            # user started the job manually
            if gce_status == GCE_RUNNING_STATUS:
                logging.info("Trying to start GCE engine already running")

        # Recreate and run when the instance was deleted after its last run
        if after_run == DELETE_AFTER_RUN_VALUE:
            gce_client = instance(project_id)
            gce_status = gce_client.status_of(machine_name, project_id)

            # Identity check: None is a singleton and must not be
            # compared with ``==``.
            if gce_status is None:
                self.run(job_name)
def delete_job(self, job):
    """Delete a job entity and request deletion of its GCE instance.

    Args:
        job: Indexable collection of job entities; only the first
            element is used.

    Returns:
        True when the instance deletion was requested, False otherwise.
    """
    job_sup = job[0]

    # Delete job
    job_sup.key.delete()

    # Wait before querying again (presumably to let the datastore apply
    # the delete -- confirm against SLEEP_TIME_AFTER_DATASTORE_OP usage).
    time.sleep(SLEEP_TIME_AFTER_DATASTORE_OP)

    # NOTE(review): this branch runs when the job is STILL returned by
    # the lookup after the delete. Confirm whether the condition was
    # meant to be inverted; it is kept as written here.
    if len(self.get_job_by_name(job_sup.job_name)) > 0:
        # Instance details live on the entity, not on the enclosing
        # collection: reading them from `job` raised AttributeError.
        project_id = job_sup.project_id
        machine_name = job_sup.machine_name
        machine_zone = job_sup.machine_zone

        # Delete the instance associated with the job
        new_instance = instance(project_id)
        new_instance.delete(machine_name, project_id, machine_zone)
        return True

    return False
def instance_stop(self, instance_name, project_id, zone):
    """Stop the named instance through a fresh client for *project_id*.

    Args:
        instance_name: Name of the instance to stop.
        project_id: Project that owns the instance.
        zone: Zone passed through to the client's ``stop`` call.
    """
    instance(project_id).stop(instance_name, project_id, zone)
def overwatch(self):
    """Supervise the queue: retire overdue entries, then enqueue due jobs.

    Pass 1 walks the queue. An entry whose elapsed minutes reach
    MAX_GRACE_MIN plus its max running time has its instance stopped or
    deleted (depending on its after-run mode), is removed from the
    queue, and has its job updated through ``queue_update_job``.

    Pass 2 walks the job list (fetched before pass 1). A job whose next
    cron run is at most MIN_TIME_TO_RUN minutes away is added to the
    queue, and the queue is then processed.
    """
    scheduled_jobs = self.get_job_list()

    # ------------------------------------------------------------
    # Pass 1: queue entries that must be stopped
    # ------------------------------------------------------------
    for entry in self.get_queue_list():
        created_at = entry.creation
        project_id = entry.project_id
        machine_name = entry.machine_name
        after_run = entry.after_run
        max_minutes = entry.max_running_time
        machine_zone = entry.machine_zone
        job_name = entry.job_name

        # Minutes elapsed since the entry was created
        minutes_running = self.elapsed_min_after_run(created_at)
        print(minutes_running)

        # Non-negative once the allowed run time plus grace is used up
        overrun = minutes_running - (MAX_GRACE_MIN + int(max_minutes))
        print(overrun)

        if overrun >= 0:
            if after_run == STOP_AFTER_RUN_VALUE:
                instance(project_id).stop(
                    machine_name, project_id, machine_zone)

            if after_run == DELETE_AFTER_RUN_VALUE:
                instance(project_id).delete(
                    machine_name, project_id, machine_zone)

            # Drop the entry and update the job record
            self.delete_queue(entry)
            self.queue_update_job(job_name)

    # ------------------------------------------------------------
    # Pass 2: jobs that must be queued and run
    # ------------------------------------------------------------
    for job in scheduled_jobs:
        project_id = job.project_id
        machine_name = job.machine_name
        cron_schedule = job.cron_schedule
        after_run = job.after_run
        max_minutes = job.max_running_time
        job_name = job.job_name
        machine_zone = job.machine_zone

        # Minutes remaining before the next scheduled run
        minutes_left = self.min_before_next_run(cron_schedule)
        print(">>>>> Min before " + str(minutes_left))

        # MIN_TIME_TO_RUN covers the latency between this processing
        # pass and the effective start of the job.
        if minutes_left <= MIN_TIME_TO_RUN:
            self.create_queue(project_id, machine_name, machine_zone,
                              after_run, max_minutes, job_name)

            # After creating the queue entry, run the queue
            self.run_job_queue()