def get_disks_meta(self):
    """Return the list of PetaSAN disk metadata objects with a display
    status attached to each disk.

    Status resolution per disk:
      - no iSCSI paths                 -> unattached
      - consul has a record for disk   -> starting / stopping (Flags == "1")
                                          / started (path locked)
      - no consul record               -> stopped
      - a running DELETE_DISK job that mentions the disk id overrides the
        status with 'deleting'.
    """
    ceph_api = CephAPI()
    consul_api = ConsulAPI()
    ls = ceph_api.get_disks_meta()
    for disk in ls:
        if disk and hasattr(disk, "paths") and not disk.paths:
            # Disk metadata exists but carries no paths: never attached.
            disk.status = DisplayDiskStatus.unattached
        elif disk and hasattr(disk, "paths") and disk.paths:
            data = consul_api.find_disk(disk.id)
            if data is not None:
                # Default to 'starting'; refined below.
                disk.status = DisplayDiskStatus.starting
                if str(data.Flags) == "1":
                    # Consul flag 1 marks a stop request in progress.
                    disk.status = DisplayDiskStatus.stopping
                elif consul_api.is_path_locked(disk.id):
                    # A locked path means a node is actively serving it.
                    disk.status = DisplayDiskStatus.started
            else:
                disk.status = DisplayDiskStatus.stopped

            job_manager = JobManager()
            job_list = job_manager.get_running_job_list()
            for j in job_list:
                # Check if the status running
                if j.is_running:
                    # Set disk status [deleting]
                    # NOTE(review): substring match on job params — assumes
                    # disk ids are unique enough not to collide; confirm.
                    if j.type == JobType.DELETE_DISK and str(
                            j.params).find(str(disk.id)) > -1:
                        disk.status = DisplayDiskStatus.deleting
    return ls
def get_job(args):
    """CLI helper: print a job's state.

    args.t == 1 -> print completion status as an int (1 done / 0 not)
    args.t == 2 -> print the job's captured output
    """
    manager = JobManager()
    if args.t == 1:
        # Get Status
        print(int(manager.is_done(args.id)))
    elif args.t == 2:
        # Get output
        print(manager.get_job_output(args.id))
    sys.exit(0)
def storage(args):
    """Queue a 'sar' storage-load job, refusing if one is already running."""
    manager = JobManager()
    job_params = '-d {} '.format(args.d)
    # Refuse to start a second concurrent storage-load job.
    for running_job in manager.get_running_job_list():
        if running_job.type == JobType.STORAGELOAD:
            logger.info("Cannot start storage load job for 'sar',")
            print("-1")
            return
    print(manager.add_job(JobType.STORAGELOAD, job_params))
    logger.info("Start storage load job for 'sar'")
    sys.exit(0)
def get_test_report(self, id):
    """Fetch a finished benchmark job's output and parse it into a
    BenchmarkResult.

    The job output is expected to contain ``self.output_split_text``; the
    benchmark JSON payload follows that marker. Returns the parsed
    BenchmarkResult, or None when the marker is absent or anything fails
    (missing output, bad JSON, ...).
    """
    try:
        result = JobManager().get_job_output(id)
        # Simplified from `startswith(...) or find(...) > -1`: startswith
        # implies the substring is present, so a membership test is enough.
        if self.output_split_text not in result:
            return None
        result = result.split(self.output_split_text)[1]
        report = BenchmarkResult()
        report.load_json(result)
    except Exception:
        # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
        # are no longer swallowed; still best-effort -> None on failure.
        return None
    return report
def client(args):
    """Queue a 'rados' client stress job, refusing if one already runs."""
    manager = JobManager()
    job_params = '-d {} -t {} -b {} -m {} -p {}'.format(
        args.d, args.t, args.b, args.m, args.p)
    # Only one client-stress job may run at a time.
    if any(j.type == JobType.CLIENTSTRESS
           for j in manager.get_running_job_list()):
        logger.info("Cannot start client stress job for 'rados',")
        print("-1")
        return
    print(manager.add_job(JobType.CLIENTSTRESS, job_params))
    logger.info("Start client stress job for rados")
    sys.exit(0)
def delete_cache(args):
    """Queue a job that removes the cache from the given disk.

    Refuses when the node is not a storage node or when any conflicting
    disk-maintenance job is already running.
    """
    if not configuration().get_node_info().is_storage:
        print("-1")
        return
    manager = JobManager()
    job_params = '-disk_name {}'.format(args.disk_name)
    # Any of these job types touches disk layout and conflicts with us.
    conflicting_types = (JobType.DELETEOSD, JobType.ADDDISK,
                         JobType.ADDJOURNAL, JobType.DELETEJOURNAL,
                         JobType.ADDCACHE, JobType.DELETECACHE)
    for running_job in manager.get_running_job_list():
        if running_job.type in conflicting_types:
            logger.info("Cannot start delete job for cache of disk {},There ara running jobs. ".format(args.disk_name))
            print("-1")
            return
    print(manager.add_job(JobType.DELETECACHE, job_params))
    logger.info("Start delete job for cache of disk {}".format(args.disk_name))
    sys.exit()
def start(self, type, duration_sec, threads, clients, pool, cleanup):
    """Start a benchmark-manager job.

    :param type: benchmark type token passed through to the job script
    :param duration_sec: test duration in seconds
    :param threads: thread count per client
    :param clients: iterable of client node names
    :param pool: target Ceph pool name
    :param cleanup: whether the job should clean up after itself
    :return: the new job id, or -1 if a benchmark manager job is already
             running.
    """
    job_manager = JobManager()
    # Join client names into one comma-separated argument.
    # (Original wrapped this in `"" + ... + ""`, a no-op removed here.)
    clients = ",".join(clients)
    for j in job_manager.get_running_job_list():
        if j.type == JobType.BENCHMANAGER:
            logger.info(
                "Cannot start benchmark manager there is a job already running."
            )
            return -1
    # The job script expects an integer flag rather than True/False.
    cleanup_val = 1 if cleanup else 0
    params = '-d {} -t {} -type {} -c {} -p {} --cleanup {}'.format(
        duration_sec, threads, type, clients, pool, cleanup_val)
    return job_manager.add_job(JobType.BENCHMANAGER, params)
def add_journal(args):
    """Queue a job that creates a journal on the given disk.

    Refuses when the node is not a storage node or when a conflicting
    OSD/journal job is already running.
    """
    if not configuration().get_node_info().is_storage:
        print("-1")
        return
    manager = JobManager()
    job_params = '-disk_name {}'.format(args.disk_name)
    conflicting_types = (JobType.DELETEOSD, JobType.ADDDISK,
                         JobType.ADDJOURNAL, JobType.DELETEJOURNAL)
    for running_job in manager.get_running_job_list():
        if running_job.type in conflicting_types:
            logger.info(
                "Cannot start add journal job to create journal for disk.{}.There ara running jobs. "
                .format(args.disk_name))
            print("-1")
            return
    print(manager.add_job(JobType.ADDJOURNAL, job_params))
    logger.info("Start add journal job for disk {}.".format(args.disk_name))
    sys.exit()
def add_osd(args):
    """Queue a job that creates an OSD on the given disk, optionally with
    an external journal and/or a cache device.

    Refuses when the node is not a storage node or when a conflicting
    disk-maintenance job is already running.
    """
    if not configuration().get_node_info().is_storage:
        print("-1")
        return
    manager = JobManager()
    has_journal = str(args.journal) != ""
    has_cache = str(args.cache) != ""
    # Build the job parameters for the journal/cache combination.
    if not has_journal and not has_cache:
        job_params = '-disk_name {}'.format(args.disk_name)
    elif has_journal and not has_cache:
        job_params = '-disk_name {} -journal {}'.format(args.disk_name,
                                                        args.journal)
    elif not has_journal and has_cache and str(args.cache_type) != "":
        job_params = '-disk_name {} -cache {} -cache_type {}'.format(
            args.disk_name, args.cache, args.cache_type)
    else:
        job_params = '-disk_name {} -journal {} -cache {} -cache_type {}'.format(
            args.disk_name, args.journal, args.cache, args.cache_type)
    # Refuse while any conflicting disk-maintenance job is running.
    conflicting_types = (JobType.DELETEOSD, JobType.ADDDISK,
                         JobType.ADDJOURNAL, JobType.DELETEJOURNAL,
                         JobType.ADDCACHE, JobType.DELETECACHE)
    for running_job in manager.get_running_job_list():
        if running_job.type in conflicting_types:
            logger.info(
                "Cannot start add job to create osd for disk : {}. There ara running jobs."
                .format(args.disk_name))
            print("-1")
            return
    print(manager.add_job(JobType.ADDDISK, job_params))
    logger.info("Start add osd job for disk {}.".format(args.disk_name))
    sys.exit()
def clear_disk(args):
    """Wipe the data of a PetaSAN iSCSI disk while preserving its metadata.

    Strategy: stop the disk, create a fresh tmp image of the same size,
    copy the disk metadata onto it, rename the old image to a
    "deleted-..." name and the tmp image to the original name, then queue
    a background job to actually remove the old image.

    Exits the process: 0 on success, -1 on any failure.
    """
    disk_id = args.disk_id
    image_name = "image-" + disk_id
    try:
        # Get which ceph user is using this function & get his keyring file path #
        # ---------------------------------------------------------------------- #
        ceph_auth = CephAuthenticator()
        config = configuration()
        cluster_name = config.get_cluster_name()
        # Get disk metadata :
        # -------------------
        ceph_api = CephAPI()
        disk_metadata = ceph_api.get_diskmeta(disk_id)
        # Get pool name :
        # ---------------
        pool_name = disk_metadata.pool
        data_pool = ""
        # Check if disk has been created on replicated pool or erasure pool :
        # -------------------------------------------------------------------
        if len(disk_metadata.data_pool) > 0:
            data_pool = disk_metadata.data_pool
        tmp_image_name = "tmp_disk_" + disk_metadata.id
        # (1.) Check if a previous tmp image for this disk is still existed :
        # ===================================================================
        images_list = ceph_api.get_all_images(pool_name)
        for image in images_list:
            if tmp_image_name in image:
                # Delete image (leftover from a previously aborted run)
                cmd = "rbd rm {}/{} {} --cluster {}".format(
                    pool_name, image, ceph_auth.get_authentication_string(),
                    cluster_name)
                if not call_cmd(cmd):
                    print(
                        "Error : clear_disk.py script : cannot remove tmp image ,\ncmd : "
                        + cmd)
                    sys.exit(-1)
        print(
            "Stage 1 :\n\tCheck if a previous tmp image for this disk is still existed > (Completed)"
        )
        logger.info(
            "Stage 1 :\n\tCheck if a previous tmp image for this disk is still existed > (Completed)"
        )
        # (2.) Stop old disk :
        # ====================
        consul_api = ConsulAPI()
        kv = consul_api.find_disk(disk_id)
        if kv is not None:
            manage_disk = ManageDisk()
            status = manage_disk.stop(disk_id)
            if status != Status.done:
                print('Error : Cannot stop disk , id = ' + disk_id)
                sys.exit(-1)
            print("Stage 2 :\n\tStop old disk > (Completed)")
            logger.info("Stage 2 :\n\tStop old disk > (Completed)")
            time.sleep(3)
            # (3.) Check if old disk is stopped or not :
            # ==========================================
            if len(data_pool) > 0:
                pool_type = "erasure"
                _confirm_disk_stopped(data_pool, disk_id, pool_type)
            else:
                pool_type = "replicated"
                _confirm_disk_stopped(pool_name, disk_id, pool_type)
            print(
                "Stage 3 :\n\tConfirm that disk is completely stopped > (Completed)"
            )
            logger.info(
                "Stage 3 :\n\tConfirm that disk is completely stopped > (Completed)"
            )
        else:
            # No consul record: disk is already stopped, stages 2-3 are no-ops.
            print("Stage 2 :\n\tStop old disk > (Completed)")
            logger.info("Stage 2 :\n\tStop old disk > (Completed)")
            print(
                "Stage 3 :\n\tConfirm that disk is completely stopped > (Completed)"
            )
            logger.info(
                "Stage 3 :\n\tConfirm that disk is completely stopped > (Completed)"
            )
            print('\tclear_disk.py script : disk {} is already stopped'.format(
                disk_id))
        # (4.) Create a tmp image (not PetaSAN image) :
        # =============================================
        # Generate a random value between 1 and 99999 #
        random_no = str(random.randint(1, 100000))
        tmp_image_name = tmp_image_name + "_" + str(random_no)
        # NOTE(review): size appears to be stored in GB and converted to MB
        # for rbd here — TODO confirm units.
        image_size = disk_metadata.size * 1024
        if len(data_pool) > 0:
            cmd = "rbd create {}/{} --size {} --data-pool {} {} --cluster {}".format(
                pool_name, tmp_image_name, image_size, data_pool,
                ceph_auth.get_authentication_string(), cluster_name)
        else:
            cmd = "rbd create {}/{} --size {} {} --cluster {}".format(
                pool_name, tmp_image_name, image_size,
                ceph_auth.get_authentication_string(), cluster_name)
        if not call_cmd(cmd):
            print(
                "Error : clear_disk.py script : cannot create new tmp image ,\ncmd : "
                + cmd)
            sys.exit(-1)
        print("Stage 4 :\n\tCreate a tmp image called ( " + tmp_image_name +
              " ) > (Completed)")
        logger.info("Stage 4 :\n\tCreate a tmp image called ( " +
                    tmp_image_name + " ) > (Completed)")
        # (5.) Run script to copy "old disk" metadata to new "tmp_disk" :
        # ===============================================================
        metadata_script_file = ConfigAPI().get_disk_meta_script_path()
        # Function : read_disks_metadata :
        parser_key_1 = "read"
        arg_1 = "--image"
        arg_2 = "--pool"
        # Function : set_disk_metadata :
        parser_key_2 = "write"
        arg_3 = "--file"  # NOTE(review): defined but unused below.
        # Pipe the metadata read of the old image into a write on the tmp image.
        cmd = metadata_script_file + " " + parser_key_1 + " " + arg_1 + " " + image_name + " " + arg_2 + " " + pool_name +\
            " | " + metadata_script_file + " " + parser_key_2 + " " + arg_1 + " " + tmp_image_name + " " + arg_2 + " " + pool_name
        if not call_cmd(cmd):
            print(
                "Error : clear_disk.py script : cannot copy metadata from old disk to new tmp image ,\ncmd : "
                + cmd)
            sys.exit(-1)
        print(
            "Stage 5 :\n\tRun script to copy 'old disk' metadata to new 'tmp_disk' > (Completed)"
        )
        logger.info(
            "Stage 5 :\n\tRun script to copy 'old disk' metadata to new 'tmp_disk' > (Completed)"
        )
        time.sleep(3)
        # (6.) Remove metadata of old disk :
        # ===========================================================
        old_image_name = str(ceph_api.conf_api.get_image_name_prefix() +
                             disk_metadata.id)
        confirm = ceph_api.remove_disk_metadata(old_image_name,
                                                disk_metadata.pool)
        if not confirm:
            # Non-fatal: continue even if the old metadata removal failed.
            print(
                "Error : clear_disk.py script : cannot remove metadata of old disk"
            )
            # sys.exit(-1)
        print("Stage 6 :\n\tRemove metadata of old disk > (Completed)")
        logger.info("Stage 6 :\n\tRemove metadata of old disk > (Completed)")
        # (7.) Rename old disk image name with "deleted-" + disk_id + random_no:
        # ======================================================================
        new_image_name = "deleted-" + disk_metadata.id + "-" + random_no
        cmd = "rbd mv {}/{} {} {} --cluster {}".format(
            pool_name, image_name, new_image_name,
            ceph_auth.get_authentication_string(), cluster_name)
        if not call_cmd(cmd):
            print(
                "Error : clear_disk.py script : cannot rename old image from {} to {} ,\ncmd : {}"
                .format(image_name, new_image_name, cmd))
            sys.exit(-1)
        print("Stage 7 :\n\tRename old disk image name with ( " +
              new_image_name + " ) > (Completed)")
        logger.info("Stage 7 :\n\tRename old disk image name with ( " +
                    new_image_name + " ) > (Completed)")
        time.sleep(5)
        # (8.) Rename "tmp_disk" with old disk image name :
        # =================================================
        cmd = "rbd mv {}/{} {} {} --cluster {}".format(
            pool_name, tmp_image_name, image_name,
            ceph_auth.get_authentication_string(), cluster_name)
        if not call_cmd(cmd):
            print(
                "Error : clear_disk.py script : cannot rename \"tmp_disk\" from {} to {} ,\ncmd : {}"
                .format(tmp_image_name, image_name, cmd))
            sys.exit(-1)
        print(
            "Stage 8 :\n\tRename 'tmp_disk' with old disk image name > (Completed)"
        )
        logger.info(
            "Stage 8 :\n\tRename 'tmp_disk' with old disk image name > (Completed)"
        )
        time.sleep(5)
        # (9.) Queue a background job that deletes the renamed old image.
        jm = JobManager()
        id = jm.add_job(JobType.DELETE_DISK, new_image_name + ' ' + pool_name)
        print("Stage 9 :\n\tStart a job to remove old disk image , job id = " +
              str(id))
        logger.info(
            "Stage 9 :\n\tStart a job to remove old disk image , job id = " +
            str(id))
        sys.exit(0)
    except PoolException as e:
        print("Error : PoolException , {}".format(e.message))
        logger.error("Clear Disk Error : PoolException , {}".format(e.message))
        sys.exit(-1)
    except DiskListException as e:
        print("Error : DiskListException , {}".format(e.message))
        logger.error("Clear Disk Error : DiskListException , {}".format(
            e.message))
        sys.exit(-1)
    except CephException as e:
        # Only general Ceph errors are echoed to stdout; all are logged.
        if e.id == CephException.GENERAL_EXCEPTION:
            print("Error : CephException , {}".format(e.message))
        logger.error("Clear Disk Error : CephException , {}".format(e.message))
        sys.exit(-1)
    except MetadataException as e:
        print("Error : MetadataException , {}".format(e.message))
        logger.error("Clear Disk Error : MetadataException , {}".format(
            e.message))
        sys.exit(-1)
    except Exception as e:
        # NOTE(review): e.message is Python-2 style; on Python 3 this would
        # raise AttributeError — confirm target interpreter.
        print("Error : Exception , {}".format(e.message))
        logger.error("Clear Disk Error : Exception , {}".format(e.message))
        sys.exit(-1)
def get_full_disk_list(pid=None):
    """Return the merged list of physical disks with ceph/OSD info attached.

    Combines the physical disk list with the ceph disk list, adds SMART
    health, resolves each OSD disk's up/down status from the ceph osd
    tree, overlays statuses of running add/delete jobs, and appends
    placeholder entries for OSDs that exist in the ceph tree but have no
    local disk.

    :param pid: optional job id; when a running job matches it, its output
                is scanned for an error message to attach to the disk.
    """
    __output_split_text = "##petasan##"
    disk_list = []
    ceph_disk_list = get_disk_list()
    ph_disk_list = disk_util.get_disk_list()
    osd_dict = None
    try:
        osd_dict = ceph_osd.ceph_osd_tree(configuration().get_node_info().name)
    except Exception as e:
        logger.error(e.message)
    missing_disk_list = []
    # Set osd id and usage: merge ceph-known info onto the physical disks.
    if ceph_disk_list and len(ceph_disk_list) > 0:
        for disk in ceph_disk_list:
            for ph_disk in ph_disk_list:
                if ph_disk.name == disk.name:
                    ph_disk.usage = disk.usage
                    ph_disk.osd_id = disk.osd_id
                    ph_disk.osd_uuid = disk.osd_uuid
                    ph_disk.linked_journal = disk.linked_journal
                    ph_disk.linked_osds = disk.linked_osds
                    ph_disk.linked_cache = disk.linked_cache
                    ph_disk.linked_cache_part_num = disk.linked_cache_part_num
                    ph_disk.vg_name = disk.vg_name
                    ph_disk.lv_name = disk.lv_name
                    ph_disk.linked_journal_part_num = disk.linked_journal_part_num
                    ph_disk.no_of_partitions = disk.no_of_partitions
                    ph_disk.no_available_partitions = disk.no_available_partitions
                    disk_list.append(ph_disk)
                    break
    else:
        disk_list.extend(ph_disk_list)
    # Attach SMART overall-health results by device name.
    health_test = Smart().get_overall_health()
    for disk in disk_list:
        if disk.name in health_test:
            disk.smart_test = health_test[disk.name]
    # get all running jobs
    job_manager = JobManager()
    job_list = job_manager.get_running_job_list()
    # Set disk osd status
    for node_disk in disk_list:
        # Set osd status [up, down]
        if node_disk.usage == DiskUsage.osd:
            status = None
            if osd_dict and node_disk.osd_id is not None:
                status = osd_dict.get(int(node_disk.osd_id), None)
            if str(ceph_osd.get_osd_id(node_disk.osd_uuid)) == "-1":
                # The uuid is unknown to ceph: treat as a plain mounted disk.
                node_disk.status = OsdStatus.no_status
                node_disk.usage = DiskUsage.mounted
                node_disk.osd_id = -1
            elif status is not None:
                node_disk.status = status
            else:
                node_disk.status = OsdStatus.no_status
        disk_name_parameter = "-disk_name {}".format(node_disk.name)
        disk_id_parameter = "-id {}".format(node_disk.osd_id)
        # loop on running job list
        for j in job_list:
            # Set osd status [deleting , adding]
            if j.type == JobType.ADDDISK and str(j.params).find(
                    str(disk_name_parameter)) > -1:
                node_disk.status = OsdStatus.adding
            elif j.type == JobType.ADDJOURNAL and str(j.params).find(
                    str(disk_name_parameter)) > -1:
                node_disk.status = OsdStatus.adding_journal
            elif j.type == JobType.ADDCACHE and str(j.params).find(
                    str(disk_name_parameter)) > -1:
                node_disk.status = OsdStatus.adding_cache
            elif j.type == JobType.DELETEOSD and (
                    str(j.params).find(str(disk_name_parameter)) > -1
                    or str(j.params).find(str(disk_id_parameter)) > -1):
                node_disk.status = OsdStatus.deleting
            elif j.type == JobType.DELETEJOURNAL and str(j.params).find(
                    str(disk_name_parameter)) > -1:
                node_disk.status = OsdStatus.deleting
            elif j.type == JobType.DELETECACHE and str(j.params).find(
                    str(disk_name_parameter)) > -1:
                node_disk.status = OsdStatus.deleting
            # Check if the job completed and has error to return it
            elif pid and j.id == int(pid):
                job_output = job_manager.get_job_output(j)
                if job_output is None:
                    continue
                job_output = str(job_output).strip()
                if job_output != "":
                    # We expect our custom messages appear after __output_split_text.
                    out_arr = job_output.split(__output_split_text)
                    # BUGFIX: was `if out_arr > 1:` — comparing a list with
                    # an int (always True on py2, TypeError on py3).
                    if len(out_arr) > 1:
                        node_disk.error_message = out_arr[1]
                        job_manager.remove_job(j.id)
    if not osd_dict or len(osd_dict.items()) == 0:
        return disk_list
    # If there is an osd found in ceph tree and this osd not has disk.
    for osd_id, osd_status in osd_dict.items():
        is_missing = True
        for disk in disk_list:
            if str(disk.osd_id) == str(osd_id):
                is_missing = False
                break
        if is_missing:
            disk = DiskInfo()
            disk.osd_id = osd_id
            disk.status = osd_status
            disk.usage = DiskUsage.osd
            missing_disk_list.append(disk)
    disk_list.extend(missing_disk_list)
    return disk_list
#! /usr/bin/python ''' Copyright (C) 2019 Maged Mokhtar <mmokhtar <at> petasan.org> Copyright (C) 2019 PetaSAN www.petasan.org This program is free software; you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. ''' from PetaSAN.core.cluster.ntp import NTPConf from PetaSAN.core.cluster.job_manager import JobManager ntp = NTPConf() ntp.sync_hw_clock() JobManager().remove_jobs_since(3600)
def delete_pool(self, pool_name):
    """Queue a job that deletes the given Ceph pool; return its job id."""
    job_manager = JobManager()
    # Ceph requires the explicit confirmation flag to destroy a pool.
    return job_manager.add_job(JobType.DELETE_POOL,
                               pool_name + ' --yes-i-really-really-mean-it')
def delete_disk(self, disk_id, pool):
    """Delete an iSCSI disk's backing image if the disk is safely stopped.

    Looks the disk up in the pool's metadata, derives its display status
    from consul, and only queues a DELETE_DISK job when the disk is
    stopped or unattached. Returns the job id on success, otherwise a
    StopDiskStatus value (working / error).
    """
    ceph_api = CephAPI()
    consul_api = ConsulAPI()
    ls = ceph_api.get_disks_meta_for_pool(pool)
    try:
        for disk in ls:
            if disk_id == disk.id:
                if disk and hasattr(disk, "paths") and not disk.paths:
                    disk_status = DisplayDiskStatus.unattached
                elif disk and hasattr(disk, "paths") and disk.paths:
                    data = consul_api.find_disk(disk.id)
                    if data is not None:
                        disk_status = DisplayDiskStatus.started
                        if str(data.Flags) == "1":
                            # Consul flag 1 marks a stop in progress.
                            disk_status = DisplayDiskStatus.stopping
                    else:
                        disk_status = DisplayDiskStatus.stopped
                break
            # Non-matching disk seen so far: no status determined yet.
            disk_status = None
    except:
        # NOTE(review): bare except also hides NameError when `ls` is
        # empty and disk_status was never assigned — confirm intent.
        return StopDiskStatus.error

    if disk_status == DisplayDiskStatus.started or disk_status == DisplayDiskStatus.stopping:
        # Disk is in use (or stopping): refuse to delete.
        return StopDiskStatus.working
    elif disk_status is None:
        return StopDiskStatus.error
    elif disk_status == DisplayDiskStatus.stopped or disk_status == DisplayDiskStatus.unattached:
        # return ceph_api.delete_disk(disk_id,pool)
        # start: delete disk as a job
        __image_name_prefix = ConfigAPI().get_image_name_prefix()
        # set image_name by disk_id :
        image_name = disk_id
        # if PetaSAN disk :
        if disk_id.isdigit() and (len(disk_id) == 5):
            # PetaSAN ids are 5-digit strings; prepend the image prefix.
            image_name = __image_name_prefix + str(disk_id)
        jm = JobManager()
        try:
            id = jm.add_job(JobType.DELETE_DISK, image_name + ' ' + pool)
            print("Start Delete image: ", image_name)
            if id > 0:
                logger.info(
                    "Deleting disk: {} has been started as a job".format(
                        image_name))
                return id
        except Exception as ex:
            logger.error("Error Deleting disk: {}".format(image_name))
        # end: delete disk as a job
    # else:
    return StopDiskStatus.error
def startup_services(building_stage=False, cluster_complete=False):
    """Start the PetaSAN system services appropriate for this node's roles.

    Three modes:
      - not building_stage and cluster_complete: normal boot of a fully
        built cluster node — start everything the node's roles need.
      - building_stage: partial start-up used while the cluster is being
        built (or a node replaced).
      - not building_stage and not cluster_complete: node not yet joined —
        only configure IPs.
    """
    # NOTE(review): `path` is assigned but never used in this function.
    path = ConfigAPI().get_service_files_path()
    if not building_stage and cluster_complete:
        logger.info("Start settings IPs")
        call_cmd('python ' + ConfigAPI().get_node_start_ips_script_path())
        call_cmd('systemctl start ntp')
        call_cmd('systemctl start petasan-mount-sharedfs')
        NTPConf().force_ntp_sync()
        # Clear out all stale job records from before the reboot.
        JobManager().remove_jobs_since(0)
        if cluster_config.get_node_info().is_management:
            # Management nodes run consul server + gluster + leader service.
            call_cmd('python ' + ConfigAPI().get_consul_start_up_script_path())
            call_cmd('systemctl start glusterd')
            call_cmd('systemctl start petasan-cluster-leader')
        else:
            call_cmd('python ' +
                     ConfigAPI().get_consul_client_start_up_script_path())
        logger.info("Starting cluster file sync service")
        call_cmd('systemctl start petasan-file-sync')
        call_cmd('/opt/petasan/scripts/load_iscsi_mods.sh')
        if cluster_config.get_node_info().is_iscsi:
            logger.info("Starting iSCSI Service")
            call_cmd('systemctl start petasan-iscsi')
        if cluster_config.get_node_info().is_management:
            logger.info("Starting Cluster Management application")
            call_cmd('systemctl start petasan-admin')
            # create Ceph manager if not already created
            # exec_command('python /opt/petasan/scripts/create_mgr.py 60 >/dev/null 2>&1 &')
        logger.info("Starting Node Stats Service")
        call_cmd('systemctl start petasan-node-stats')
        # activate PetaSAN custom vgs
        cm = CacheManager()
        cm.activate()
        # remove any unused ceph-volume services
        ceph_disk_lib.delete_unused_ceph_volume_services()
        logger.info("Starting OSDs")
        call_cmd('systemctl restart petasan-start-osds')
        if cluster_config.get_node_info().is_backup:
            logger.info('Starting sync replication node service')
            call_cmd('systemctl restart petasan-sync-replication-node')
        if cluster_config.get_node_info(
        ).is_iscsi or cluster_config.get_node_info().is_storage:
            logger.info("Starting petasan tuning service")
            call_cmd("systemctl restart petasan-tuning &")
    elif building_stage:
        call_cmd('systemctl start petasan-mount-sharedfs')
        if cluster_config.get_node_info().is_management:
            call_cmd('systemctl start petasan-cluster-leader')
        logger.info("Starting cluster file sync service")
        call_cmd('systemctl start petasan-file-sync')
        # replace node
        if cluster_config.get_node_info().is_backup:
            logger.info("Replace cluster node sync service")
            call_cmd('systemctl start petasan-sync-replication-node')
        # end
        call_cmd('/opt/petasan/scripts/load_iscsi_mods.sh')
        if cluster_config.get_node_info().is_iscsi:
            logger.info("Starting PetaSAN service")
            call_cmd('systemctl start petasan-iscsi')
        # Give the iSCSI service a moment before dependent services start.
        sleep(2)
        if cluster_config.get_node_info().is_management:
            logger.info("Starting Cluster Management application")
            call_cmd('systemctl start petasan-admin')
        # activate PetaSAN custom vgs
        cm = CacheManager()
        cm.activate()
        # remove any unused ceph-volume services
        ceph_disk_lib.delete_unused_ceph_volume_services()
        logger.info("Starting Node Stats Service")
        call_cmd('systemctl start petasan-node-stats')
        logger.info("Starting OSDs")
        call_cmd('systemctl restart petasan-start-osds')
        if cluster_config.get_node_info(
        ).is_iscsi or cluster_config.get_node_info().is_storage:
            logger.info("Starting petasan tuning service")
            call_cmd("systemctl restart petasan-tuning &")
    elif not building_stage and not cluster_complete:
        logger.info("Start settings IPs")
        call_cmd('python ' + ConfigAPI().get_node_start_ips_script_path())
def is_disk_deleting(self, id):
    """Return whether the delete-disk job with this id has completed."""
    return JobManager().is_done(id)
def is_test_complete(self, id):
    """Return whether the benchmark job with this id has completed."""
    job_manager = JobManager()
    return job_manager.is_done(id)