This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
'''

import os
import sys

from PetaSAN.core.cluster.configuration import configuration
from PetaSAN.core.common.log import logger
from PetaSAN.core.config.api import ConfigAPI
from PetaSAN.core.common.cmd import call_cmd

cluster_name = configuration().get_cluster_info().name
ceph_mon_keyring = ConfigAPI().get_ceph_mon_keyring(cluster_name)
ceph_client_admin_keyring = ConfigAPI().get_ceph_keyring_path(cluster_name)

try:
    cluster_conf = configuration()
    current_node_info = cluster_conf.get_node_info()
    current_node_name = current_node_info.name
    current_cluster_info = cluster_conf.get_cluster_info()
    config_api = ConfigAPI()

    os.makedirs("/var/lib/ceph/mon/{}-{}".format(cluster_name, current_node_name))
    os.makedirs("/tmp/{}".format(current_node_name))
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
'''

from itertools import starmap
import os

from PetaSAN.core.config.api import ConfigAPI

is_read = False
old_msgs = dict()
new_msgs = dict()
new_msgs_dis = dict()
msgs_list = []

message_path = ConfigAPI().get_messages_file_path()
message_path_description = message_path.replace(".txt", "_description.txt")


def read():
    global is_read
    print("file is reading now.")
    if not os.path.exists(message_path) or not os.path.exists(message_path_description):
        raise Exception("Messages files do not exist.")
    with open(message_path_description, 'r') as f:
        for line in f.read().splitlines():
            if len(line.strip()) == 0:
                continue
            line_description = line.split("#")
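# --- Illustrative sketch, not from the source: read() above splits each
# non-empty description line on '#', so the assumed on-disk format is
# "key#description". A line would be taken apart like this:
sample_line = "1042#OSD osd.3 is down"
msg_key, msg_text = sample_line.split("#", 1)
print(msg_key + " -> " + msg_text)   # 1042 -> OSD osd.3 is down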
def startup_services(building_stage=False, cluster_complete=False):
    path = ConfigAPI().get_service_files_path()
    if not building_stage and cluster_complete:
        logger.info("Start setting IPs")
        call_cmd('python ' + ConfigAPI().get_node_start_ips_script_path())
        call_cmd('systemctl start ntp')
        call_cmd('systemctl start petasan-mount-sharedfs')
        NTPConf().force_ntp_sync()
        JobManager().remove_jobs_since(0)
        if cluster_config.get_node_info().is_management:
            call_cmd('python ' + ConfigAPI().get_consul_start_up_script_path())
            call_cmd('systemctl start glusterfs-server')
            call_cmd('systemctl start petasan-cluster-leader')
        else:
            call_cmd('python ' + ConfigAPI().get_consul_client_start_up_script_path())
        logger.info("Starting cluster file sync service")
        call_cmd('systemctl start petasan-file-sync')
        call_cmd('/opt/petasan/scripts/load_iscsi_mods.sh')
        if cluster_config.get_node_info().is_iscsi:
            logger.info("Starting iSCSI Service")
            call_cmd('systemctl start petasan-iscsi')
        if cluster_config.get_node_info().is_management:
            logger.info("Starting Cluster Management application")
            call_cmd('systemctl start petasan-admin')
            # create Ceph manager if not already created
            # exec_command('python /opt/petasan/scripts/create_mgr.py 60 >/dev/null 2>&1 &')
        logger.info("Starting Node Stats Service")
        call_cmd('systemctl start petasan-node-stats')
        logger.info("Starting OSDs")
        call_cmd('systemctl restart petasan-start-osds')
        if cluster_config.get_node_info().is_backup:
            logger.info('Starting sync replication node service')
            call_cmd('systemctl restart petasan-sync-replication-node')
    elif building_stage:
        call_cmd('systemctl start petasan-mount-sharedfs')
        if cluster_config.get_node_info().is_management:
            call_cmd('systemctl start petasan-cluster-leader')
        logger.info("Starting cluster file sync service")
        call_cmd('systemctl start petasan-file-sync')
        call_cmd('/opt/petasan/scripts/load_iscsi_mods.sh')
        if cluster_config.get_node_info().is_iscsi:
            logger.info("Starting PetaSAN service")
            call_cmd('systemctl start petasan-iscsi')
        sleep(2)
        if cluster_config.get_node_info().is_management:
            logger.info("Starting Cluster Management application")
            call_cmd('systemctl start petasan-admin')
        logger.info("Starting Node Stats Service")
        call_cmd('systemctl start petasan-node-stats')
        logger.info("Starting OSDs")
        call_cmd('systemctl restart petasan-start-osds')
    elif not building_stage and not cluster_complete:
        logger.info("Start setting IPs")
        call_cmd('python ' + ConfigAPI().get_node_start_ips_script_path())
def replace(self, ip, password):
    config = configuration()
    ssh_obj = ssh()
    config_api = ConfigAPI()
    logger.info("Starting replace.")
    if os.path.exists(config_api.get_cluster_info_file_path()):
        os.remove(config_api.get_cluster_info_file_path())

    if ssh_obj.copy_public_key_from_host(ip, password):
        logger.info("Successfully copied public keys.")

    if ssh_obj.copy_private_key_from_host(ip, password):
        ssh_obj.create_authorized_key_file()
        logger.info("Successfully copied private keys.")
    else:
        raise SSHKeyException("Error copying keys")

    out, err = ssh_obj.exec_command(
        ip, "python {}".format(config_api.get_cluster_status_for_join_path()))
    out = int(out)
    if out == -1:
        raise ReplaceException("core_deploy_replace_mon_not_healthy_err")
    elif out == 0:
        raise ReplaceException("core_deploy_replace_cluster_in_progress_err")
    elif out == 1:
        raise ReplaceException("core_deploy_replace_two_management_node_down_err")
    elif out == 3:
        raise ReplaceException("core_deploy_replace_cluster_running_err")

    # ensure the config directory exists (the original checked os.listdir(),
    # which raises if the directory is missing and makedirs() raises if it
    # already exists)
    config_dir = os.path.dirname(config_api.get_cluster_info_file_path())
    if not os.path.exists(config_dir):
        os.makedirs(config_dir)

    logger.info("Starting to copy config file")
    if not ssh_obj.copy_file_from_host(ip, config_api.get_cluster_info_file_path()):
        raise Exception("Error copying config file")
    logger.info("Successfully copied config file.")
    cluster_name = config.get_cluster_name(True)
    logger.info("Successfully joined to cluster {}".format(cluster_name))

    wrong_name = True
    wrong_ip = True
    for node_info in config.get_management_nodes_config():
        if node_info.name == config.get_node_name() or \
                node_info.management_ip == Network().get_node_management_ip():
            if node_info.name == config.get_node_name():
                wrong_name = False
            if node_info.management_ip == Network().get_node_management_ip():
                wrong_ip = False
            if not wrong_name and not wrong_ip:
                config.set_node_info(node_info, True)
                open(config_api.get_replace_file_path(), 'w+').close()
                break

    if wrong_name and wrong_ip:
        os.remove(config_api.get_cluster_info_file_path())
        raise ReplaceException("core_deploy_replace_node_do_not_match_err")
    elif wrong_name:
        os.remove(config_api.get_cluster_info_file_path())
        raise ReplaceException("core_deploy_replace_node_do_not_match_name_err")
    elif wrong_ip:
        os.remove(config_api.get_cluster_info_file_path())
        raise ReplaceException("core_deploy_replace_node_do_not_match_ip_err")

    config.set_password(password)
    logger.info("Password set successfully.")
    self.__copy_current_tunings(ip)
    return cluster_name
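# --- Hedged usage sketch (illustrative, not from the source): assumes
# replace() lives on the deployment class, here called Deploy, and raises
# ReplaceException when the node does not match the cluster config.
try:
    cluster_name = Deploy().replace("192.168.10.11", "admin_password")
    print("Replacing node in cluster: " + cluster_name)
except ReplaceException as e:
    print("Replace refused: " + str(e.message))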
def __start_leader_locally():
    PetaSAN.core.common.cmd.call_cmd(
        'python ' + ConfigAPI().get_consul_start_up_script_path())
def build_monitors():
    cluster_name = configuration().get_cluster_name()
    ceph_mon_keyring = ConfigAPI().get_ceph_mon_keyring(cluster_name)
    ceph_client_admin_keyring = ConfigAPI().get_ceph_keyring_path(cluster_name)
    status = StatusReport()
    try:
        _fsid = uuid.uuid4()
        content = "[global]\n\
fsid = {fsid}\n\
mon_host = {mon_host}\n\
\n\
public_network = {public_network}\n\
cluster_network = {cluster_network}\n\
\n"
        cluster_config = configuration()
        current_node_info = cluster_config.get_node_info()
        current_node_name = current_node_info.name
        current_cluster_info = cluster_config.get_cluster_info()
        config_api = ConfigAPI()

        mon_hosts_backend_ip = []
        remote_mons_management_ips = []
        for i in current_cluster_info.management_nodes:
            node_info = NodeInfo()
            node_info.load_json(json.dumps(i))
            mon_hosts_backend_ip.append(node_info.backend_1_ip)
            if current_node_name != node_info.name:
                remote_mons_management_ips.append(node_info.management_ip)

        if not os.path.exists(config_api.get_cluster_ceph_dir_path()):
            os.makedirs(os.path.dirname(config_api.get_cluster_ceph_dir_path()))

        with open(config_api.get_cluster_ceph_dir_path() + "{}.conf".format(cluster_name), 'w') as f:
            f.write(content.format(
                fsid=_fsid,
                public_network=str(current_cluster_info.backend_1_base_ip) + "/" +
                    __get_net_size(str(current_cluster_info.backend_1_mask)),
                cluster_network=str(current_cluster_info.backend_2_base_ip) + "/" +
                    __get_net_size(str(current_cluster_info.backend_2_mask)),
                mon_initial=cluster_config.get_node_name(),
                mon_host=cluster_config.get_node_info().backend_1_ip + ',' +
                    ','.join(mon_hosts_backend_ip))
                + cluster_config.get_ceph_tunings() + "\n")

        if not call_cmd("ceph-authtool --create-keyring /tmp/{} --gen-key -n mon. --cap mon 'allow *'".format(
                ceph_mon_keyring)):
            logger.error("ceph-authtool --create-keyring for mon returned error")
            status.success = False
        # Nautilus removed --set-uid=0:
        # elif not call_cmd("".join(["ceph-authtool --create-keyring {}".format(ceph_client_admin_keyring),
        #         " --gen-key -n client.admin --set-uid=0 --cap mon 'allow *' --cap osd 'allow *' --cap mds 'allow'"])):
        elif not call_cmd("".join([
                "ceph-authtool --create-keyring {}".format(ceph_client_admin_keyring),
                " --gen-key -n client.admin --cap mon 'allow *' --cap osd 'allow *' --cap mds 'allow'"])):
            logger.error("ceph-authtool --create-keyring for admin returned error")
            status.success = False
        elif not call_cmd("ceph-authtool /tmp/{} --import-keyring {}".format(
                ceph_mon_keyring, ceph_client_admin_keyring)):
            logger.error("ceph-authtool --import-keyring returned error")
            status.success = False
        elif not call_cmd("monmaptool --create --add {} {} --fsid {} /tmp/monmap".format(
                cluster_config.get_node_name(),
                cluster_config.get_node_info().backend_1_ip, _fsid)):
            logger.error("monmaptool --create --add returned error")
            status.success = False

        if not os.path.exists("/var/lib/ceph/mon/{}-{}".format(cluster_name, current_node_name)):
            os.makedirs("/var/lib/ceph/mon/{}-{}".format(cluster_name, current_node_name))

        if not status.success or not call_cmd(
                "ceph-mon --cluster {} --mkfs -i {} --monmap /tmp/monmap --keyring /tmp/{}".format(
                    cluster_name, current_node_name, ceph_mon_keyring)):
            logger.error("ceph-mon --mkfs --add returned error")
            status.success = False

        open("/var/lib/ceph/mon/{}-{}/done".format(cluster_name, current_node_name), 'w+').close()
        open("/var/lib/ceph/mon/{}-{}/systemd".format(cluster_name, current_node_name), 'w+').close()
        call_cmd("chown -R ceph:ceph /var/lib/ceph/mon")
        call_cmd("systemctl enable ceph.target")
        call_cmd("systemctl enable ceph-mon.target")
        call_cmd("systemctl enable ceph-mon@{}".format(current_node_name))

        if not status.success or not call_cmd("systemctl start ceph-mon@{}".format(current_node_name)):
            status.success = False

        if not status.success:
            status.failed_tasks.append("Create ceph mon on {} returned error.".format(current_node_name))
            return status

        logger.info("First monitor started successfully")

        # create local manager:
        call_cmd('/opt/petasan/scripts/create_mgr.py')

        logger.info("Starting to deploy remote monitors")
        # call_cmd("ceph-create-keys --cluster {} -i {} ".format(cluster_name, current_node_name))

        # Nautilus: copy bootstrap-osd keyring ourselves
        if not os.path.exists("/var/lib/ceph/bootstrap-osd/"):
            os.makedirs("/var/lib/ceph/bootstrap-osd/")
        call_cmd('ceph auth get client.bootstrap-osd > /var/lib/ceph/bootstrap-osd/ceph.keyring')

        for remote_mon in remote_mons_management_ips:
            ssh_obj = ssh()
            if not ssh_obj.copy_file_to_host(remote_mon, "{}".format(ceph_client_admin_keyring)):
                logger.error("Cannot copy {} to {}".format(ceph_client_admin_keyring, remote_mon))
                status.success = False
            elif not ssh_obj.copy_file_to_host(remote_mon, "/etc/ceph/{}.conf".format(cluster_name)):
                logger.error("Cannot copy ceph.conf to {}".format(remote_mon))
                status.success = False
            elif not ssh_obj.call_command(remote_mon, " python {} ".format(
                    config_api.get_node_create_mon_script_path())):
                logger.error("Cannot create monitor on remote node {}".format(remote_mon))
                status.success = False
            # Nautilus: copy bootstrap-osd ourselves:
            elif not ssh_obj.call_command(remote_mon, 'mkdir -p /var/lib/ceph/bootstrap-osd'):
                logger.error("Cannot create bootstrap-osd dir on remote node {}".format(remote_mon))
                status.success = False
            elif not ssh_obj.copy_file_to_host(remote_mon, '/var/lib/ceph/bootstrap-osd/ceph.keyring'):
                logger.error("Cannot copy bootstrap-osd keyring to {}".format(remote_mon))
                status.success = False
            if not status.success:
                status.failed_tasks.append("core_cluster_deploy_monitor_create_err" + "%" + remote_mon)
                return status

        if not __test_mons():
            status.success = False
            status.failed_tasks.append("core_cluster_deploy_monitors_down_err")
            return status

        # Nautilus: enable msgr2
        call_cmd('ceph mon enable-msgr2')

    except Exception as ex:
        status.success = False
        logger.exception(ex.message)
        status.failed_tasks.append("core_cluster_deploy_monitor_exception_occurred" + "%" + current_node_name)
        return status

    status.success = True
    return status
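# --- Illustrative rendering only (addresses and fsid are invented, not from
# the source): for a three-node cluster the template above produces a minimal
# conf along these lines:
#
#   [global]
#   fsid = d5a9e9f4-6f3b-4e6a-9b2e-0d6a3c1f8a7e
#   mon_host = 10.0.2.11,10.0.2.11,10.0.2.12,10.0.2.13
#
#   public_network = 10.0.2.0/24
#   cluster_network = 10.0.3.0/24
#
# Note mon_host is the local backend_1_ip followed by all management backend
# IPs, so the local address appears twice as the code is written; any lines
# returned by get_ceph_tunings() are appended after the template.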
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
'''

from PetaSAN.core.cluster.configuration import configuration
import os
from PetaSAN.core.common.log import logger
from PetaSAN.core.config.api import ConfigAPI

cluster_name = configuration().get_cluster_name()
node_info = configuration().get_node_info()
node_name = node_info.name
nodes = configuration().get_management_nodes_config()
collected_path = ConfigAPI().get_collect_state_dir() + node_name

if not os.path.exists("{}".format(ConfigAPI().get_collect_state_dir())):
    os.system("mkdir {}".format(ConfigAPI().get_collect_state_dir()))
if os.path.exists(collected_path):
    os.system("rm -rf {}".format(collected_path))
if os.path.exists("{}.tar".format(collected_path)):
    os.system("rm -rf {}.tar".format(collected_path))
os.mkdir("{}".format(collected_path))

try:
    for node in nodes:
        if node.name == node_info.name:
            continue
def get_disk_list(self):
    file_path = ConfigAPI().get_manage_node_disk_script()
    cmd = "python {} disk-list -pid 1".format(file_path)
    out, err = exec_command(cmd)
    data = json.loads(out)
    return data
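# --- Hedged usage sketch (illustrative, not from the source): assumes
# get_disk_list() is exposed on a node-management class, here called NodeAPI,
# and that the disk-list script prints a JSON array of disk objects.
disks = NodeAPI().get_disk_list()
for disk in disks:
    print(disk)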
def new():
    conf = configuration()
    current_node_name = conf.get_node_info().name
    clu = conf.get_cluster_info()
    logger.info('Creating new cluster named %s', clu.name)
    cfg = CephConf()
    cfg.add_section('global')
    fsid = uuid.uuid4()
    cfg.set('global', 'fsid', str(fsid))

    # if networks were passed in, lets set them in the global section
    cfg.set('global', 'public network',
            str(clu.backend_1_base_ip) + "/" + get_net_size(str(clu.backend_1_mask)))
    cfg.set('global', 'cluster network',
            str(clu.backend_2_base_ip) + "/" + get_net_size(str(clu.backend_2_mask)))

    mon_initial_members = []
    mon_host = []
    config_api = ConfigAPI()
    for i in clu.management_nodes:
        node_info = NodeInfo()
        node_info.load_json(json.dumps(i))
        mon_initial_members.append(node_info.name)
        mon_host.append(node_info.backend_1_ip)

    cfg.set('global', 'mon initial members', ', '.join(mon_initial_members))
    # no spaces here, see http://tracker.newdream.net/issues/3145
    cfg.set('global', 'mon host', ','.join(mon_host))

    # override undesirable defaults, needed until bobtail
    # http://tracker.ceph.com/issues/6788
    cfg.set('global', 'auth cluster required', 'cephx')
    cfg.set('global', 'auth service required', 'cephx')
    cfg.set('global', 'auth client required', 'cephx')
    cfg.set('global', 'mon clock drift allowed', '.300')
    cfg.set('global', 'osd pool default size', '2')
    cfg.set('global', 'max open files', '131072')
    # http://tracker.newdream.net/issues/3138
    cfg.set('global', 'filestore xattr use omap', 'true')

    path = '{name}.conf'.format(name=clu.name)
    new_mon_keyring(clu.name)
    logger.info('Writing initial config to %s...', path)
    tmp = '%s.tmp' % path
    # write to a temp file then rename, so the conf is replaced atomically
    with open(tmp, 'w') as f:
        cfg.write(f)
    try:
        os.rename(tmp, path)
    except OSError:
        raise
def update_neighbors_arp(self, ip, eth):
    eth_name = self.get_eth_name(ip)
    # if the ip sits on a vlan sub-interface (e.g. "eth1.100"), arping on it
    if eth_name is not None and "." in eth_name:
        eth = eth_name
    call_cmd("python " + ConfigAPI().get_arping_script_path() +
             " -ip {} -eth {} &".format(ip, eth))
def add_osd(self, node_name, disk_name, journal=None, cache=None, cache_type="disabled"):
    """
    :param node_name:
    :param disk_name:
    :param journal:
    :param cache:
    :param cache_type:
    :return: the pid of the remote add-osd job; the pid is used to track any
             error message that occurs. A return of -1 means
             core_manage_node_add_osd_err.
    """
    # Journal value will be:
    # - None      : if no journal exists,
    # - disk_name : if the user selected a journal, or
    # - auto      : if the user did not select a journal
    ssh_obj = ssh()
    cmd = ""

    # ---------------------------------------------------------------------- #
    if journal:
        if journal != "auto":
            if not self.is_journal_space_avail(node_name, str(journal).lower()):
                raise DiskException(
                    DiskException.JOURNAL_NO_SPACE,
                    'There is no disk space for a new OSD with journal.')
        if not self.has_valid_journal(node_name):
            raise DiskException(
                DiskException.JOURNALS_NO_SPACE,
                'There is no disk space for a new OSD with all existing journals.')

    # ---------------------------------------------------------------------- #
    if cache:
        if cache != 'auto':
            if not self.is_cache_partition_avail(node_name, str(cache).lower()):
                raise DiskException(
                    DiskException.CACHE_NO_SPACE,
                    'There is no disk space for a new OSD with cache.')
        if not self.has_valid_cache(node_name):
            raise DiskException(
                DiskException.CACHE_NO_SPACE,
                'There is no disk space for a new OSD with all existing caches.')

    # ---------------------------------------------------------------------- #
    # Adding OSD with Journal & Cache:
    # ================================
    if journal and cache and cache_type != "disabled":
        cmd = "python {} -disk_name {} -journal {} -cache {} -cache_type {}".format(
            ConfigAPI().get_admin_add_osd_job_script(), disk_name,
            str(journal).lower(), str(cache).lower(), str(cache_type))
    # Adding OSD with Journal:
    # ========================
    elif journal:
        cmd = "python {} -disk_name {} -journal {}".format(
            ConfigAPI().get_admin_add_osd_job_script(), disk_name, str(journal).lower())
    # Adding OSD with Cache:
    # ======================
    elif cache and cache_type != "disabled":
        cmd = "python {} -disk_name {} -cache {} -cache_type {}".format(
            ConfigAPI().get_admin_add_osd_job_script(), disk_name,
            str(cache).lower(), str(cache_type))
    # Adding OSD:
    # ===========
    elif journal is None and cache is None:
        cmd = "python {} -disk_name {}".format(
            ConfigAPI().get_admin_add_osd_job_script(), disk_name)

    # stdout, stderr = exec_command(cmd)
    stdout, stderr = ssh_obj.exec_command(node_name, cmd)
    logger.info("Start add osd job {} ".format(stdout))
    return stdout
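# --- Hedged usage sketch (illustrative, not from the source): assumes
# add_osd() is exposed on a node-management class, here called ManageNode;
# node and disk names are invented.
manage = ManageNode()
pid = manage.add_osd("ps-node-01", "sdc", journal="auto")
if str(pid).strip() == "-1":
    print("core_manage_node_add_osd_err")   # per the docstring contract
else:
    print("add-osd job started, pid: " + str(pid).strip())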
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
'''

from PetaSAN.core.consul.ps_consul import RetryConsulException
from datetime import date, datetime
from requests import ConnectionError
from time import sleep
from consul import ConsulException
from PetaSAN.core.common.log import logger
from uuid import uuid1, uuid4
from flask.sessions import SessionInterface, SessionMixin
from PetaSAN.core.consul.base import BaseAPI
from PetaSAN.core.config.api import ConfigAPI

consul_session_key = ConfigAPI().get_consul_session_path()


class ConsulSession(SessionMixin):
    """Server-side session implementation."""

    def __init__(self, sid, *args, **kwargs):
        self.sid = sid
        self.get_all_sessions()
        self.permanent = True

    def __getitem__(self, key):
        self.get_all_sessions()
        return self.__dict__[key]
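# --- Hedged sketch (not from the source): how a ConsulSession would typically
# be wired into Flask. open_session/save_session are the real
# flask.sessions.SessionInterface hooks; the cookie handling and the class
# name ConsulSessionInterface are assumptions for illustration.
class ConsulSessionInterface(SessionInterface):
    def open_session(self, app, request):
        sid = request.cookies.get(app.session_cookie_name)
        if not sid:
            sid = str(uuid4())          # new browser: mint a session id
        return ConsulSession(sid)

    def save_session(self, app, session, response):
        # persist the session dict to Consul KV here, then refresh the cookie
        response.set_cookie(app.session_cookie_name, session.sid)

# app.session_interface = ConsulSessionInterface()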
class JobType:
    ADDDISK = "adddisk"
    DELETEOSD = "deleteosd"
    CLIENTSTRESS = "client_stress"
    STORAGELOAD = "storage_load"
    BENCHMANAGER = "bench_manager"
    ADDJOURNAL = "addjournal"
    DELETEJOURNAL = "deletejournal"
    DELETE_POOL = 'delete_pool'
    DELETE_DISK = 'delete_disk'
    ADDCACHE = "addcache"
    DELETECACHE = "deletecache"
    TEST = "test"


job_scripts = {
    JobType.ADDDISK: "{} {}".format(ConfigAPI().get_admin_manage_node_script(), "add-osd"),
    JobType.DELETEOSD: "{} {}".format(ConfigAPI().get_admin_manage_node_script(), "delete-osd"),
    JobType.CLIENTSTRESS: " {} {}".format(ConfigAPI().get_benchmark_script_path(), "client"),
    JobType.STORAGELOAD: " {} {}".format(ConfigAPI().get_benchmark_script_path(), "storage"),
    JobType.BENCHMANAGER: " {} {}".format(ConfigAPI().get_benchmark_script_path(), "manager"),
    JobType.ADDJOURNAL: "{} {}".format(ConfigAPI().get_admin_manage_node_script(), "add-journal"),
    JobType.DELETEJOURNAL: "{} {}".format(ConfigAPI().get_admin_manage_node_script(), "delete-journal"),
    JobType.DELETE_POOL: ConfigAPI().get_delete_pool_scipt(),
    JobType.DELETE_DISK: ConfigAPI().get_delete_disk_scipt(),
    JobType.ADDCACHE: "{} {}".format(ConfigAPI().get_admin_manage_node_script(), "add-cache"),
    JobType.DELETECACHE: "{} {}".format(ConfigAPI().get_admin_manage_node_script(), "delete-cache"),
    JobType.TEST: '/opt/petasan/scripts/test.sh -arg1 -arg2 ',
}


class Job(object):
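# --- Hedged usage sketch (illustrative, not from the source): a job runner
# would look up the script for a JobType and append job-specific arguments;
# the "-id" argument name here is an assumption.
script = job_scripts[JobType.DELETEOSD]
cmd = "python {} -id {}".format(script, 12)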
def clean_ceph_local():
    config_api = ConfigAPI()
    call_cmd(" python {} ".format(config_api.get_node_clean_script_path()))
class CrushMap:
    CRUSH_SAVE_PATH = ConfigAPI().get_crush_save_path()

    def __init__(self):
        self.types = [-1 for i in range(20)]
        self.buckets = []
        self.rules = {}
        self.device_class = {}
        self.device_weight = {}
        self.lines_tunables = []
        self.lines_devices = []
        self.lines_types = []
        self.lines_buckets = []
        self.lines_rules = []

    def _decode_types(self):
        max_index = 0
        for line in self.lines_types:
            if line.endswith('type'):
                continue
            tokens = line.split()
            self.types[int(tokens[1])] = tokens[2]
            if int(tokens[1]) > max_index:
                max_index = int(tokens[1])
        self.types = self.types[:max_index + 1]

    def _decode_buckets(self):
        bucket = None
        for line in self.lines_buckets:
            if line.endswith('{'):
                bucket = {}
                bucket['items'] = []
                bucket['class_ids'] = {}
                tokens = line.split()
                type = tokens[0]
                type_id = self.types.index(type)
                bucket['type_id'] = type_id
                bucket['name'] = tokens[1]
            elif line.startswith('alg'):
                tokens = line.split()
                bucket['alg'] = tokens[1]
            elif line.startswith('hash'):
                tokens = line.split()
                bucket['hash'] = int(tokens[1])
            elif line.startswith('item'):
                tokens = line.split()
                if len(tokens) == 4 and tokens[1].startswith('osd.'):
                    # osd, append class and weight info
                    self.device_weight[tokens[1]] = tokens[3]
                    if self.device_class.has_key(tokens[1]):
                        item = tokens[1] + '#' + self.device_class[tokens[1]] + \
                               '#' + self.device_weight[tokens[1]]
                    else:
                        item = tokens[1] + '#' + 'class not defined' + \
                               '#' + self.device_weight[tokens[1]]
                    bucket['items'].append(item)
                else:
                    bucket['items'].append(tokens[1])
            elif line.startswith('id'):
                tokens = line.split()
                id = int(tokens[1])
                # decompiled 'id' lines normally carry a class tag or trailing
                # comment; guard against a bare 'id N' line
                if len(tokens) > 2 and tokens[2] == 'class':
                    bucket['class_ids'][tokens[3]] = id
                else:
                    bucket['id'] = id
            elif line.startswith('}'):
                self.buckets.append(bucket)
        # print(self.buckets)

    def _decode_rules(self):
        name = None
        body = None
        for line in self.lines_rules:
            if line.startswith('rule'):
                tokens = line.split()
                name = tokens[1]
                body = '{\n'
                continue
            if line.startswith('}'):
                body += '}'
                self.rules[name] = body
                continue
            body += line + '\n'
        # print self.rules

    def _decode_device_class(self):
        for line in self.lines_devices:
            if not line.startswith('device'):
                continue
            tokens = line.split()
            if len(tokens) != 5:
                continue
            if not tokens[2].startswith('osd') or not tokens[3].startswith('class'):
                continue
            self.device_class[tokens[2]] = tokens[4]

    def _get_rule_ids(self):
        ids = []
        for rule in self.rules:
            body = self.rules[rule]
            id = self._get_rule_id(body)
            if id:
                ids.append(id)
        return ids

    def _get_rule_id(self, body):
        lines = body.splitlines()
        for line in lines:
            if line.startswith('id'):
                tokens = line.split()
                if len(tokens) < 2:
                    continue
                return tokens[1]
        return None

    def _get_rule_class(self, body):
        lines = body.splitlines()
        for line in lines:
            line = line.strip()
            if line.startswith('#'):
                continue
            if 'step' in line and 'take' in line and 'class' in line:
                tokens = line.split()
                index = tokens.index('class')
                if index + 1 < len(tokens):
                    return tokens[index + 1]
        return None

    def _encode_buckets(self):
        self.lines_buckets = []
        bucket_names = set()
        for bucket in self.buckets:
            # duplicate name check
            if bucket['name'] in bucket_names:
                logger.error('Crush duplicate bucket name:' + bucket['name'])
                raise CrushException(CrushException.DUPLICATE_BUCKET_NAME,
                                     'Duplicate bucket name')
            bucket_names.add(bucket['name'])
            type = self.types[bucket['type_id']]
            self.lines_buckets.append(type + ' ' + bucket['name'] + ' {')
            self.lines_buckets.append('id ' + str(bucket['id']))
            if bucket.has_key('class_ids'):
                for c in bucket['class_ids']:
                    self.lines_buckets.append('id ' + str(bucket['class_ids'][c]) + ' class ' + c)
            self.lines_buckets.append('alg ' + bucket['alg'])
            self.lines_buckets.append('hash ' + str(bucket['hash']))
            if 'items' in bucket:
                for item in bucket['items']:
                    if item.startswith('osd'):
                        # osd
                        tokens = item.split('#')
                        osd_item = 'item ' + tokens[0]
                        if tokens[0] in self.device_weight:
                            osd_item += ' weight ' + self.device_weight[tokens[0]]
                        self.lines_buckets.append(osd_item)
                    else:
                        self.lines_buckets.append('item ' + item)
            self.lines_buckets.append('}')

    def _encode_rules(self):
        self.lines_rules = []
        for name in self.rules:
            self.lines_rules.append('rule ' + name + ' ')
            body = self.rules[name]
            self.lines_rules += body.splitlines()

    def _get_rand_string(self, n):
        return ''.join(random.choice(string.ascii_uppercase + string.digits)
                       for _ in range(n))

    def _read_file_lines(self, backup=False):
        # Get which ceph user is using this function & get his keyring file path
        ceph_auth = CephAuthenticator()
        call_cmd('mkdir -p ' + self.CRUSH_SAVE_PATH)
        cluster_name = configuration().get_cluster_name()
        rand = self._get_rand_string(6)
        bin_file = self.CRUSH_SAVE_PATH + 'crushmap-tmp-' + rand + '.bin'
        txt_file = self.CRUSH_SAVE_PATH + 'crushmap-tmp-' + rand + '.txt'
        cmd = 'ceph osd getcrushmap -o ' + bin_file + ' ' + \
              ceph_auth.get_authentication_string() + ' --cluster ' + cluster_name
        ret, stdout, stderr = exec_command_ex(cmd)
        if ret != 0:
            if stderr and ('Connection timed out' in stderr or 'error connecting' in stderr):
                logger.error('Error in Ceph Connection cmd:' + cmd)
                raise CephException(CephException.CONNECTION_TIMEOUT, 'Connection Timeout Error')
            logger.error('General error in Ceph cmd:' + cmd + ' error:' + stderr)
            raise CephException(CephException.GENERAL_EXCEPTION, 'General Ceph Error')
        cmd = 'crushtool -d ' + bin_file + ' -o ' + txt_file
        if not call_cmd(cmd):
            raise CrushException(CrushException.DECOMPILE, 'Crush Decompile Error')
        with open(txt_file, 'r') as f:
            lines = f.readlines()
            lines = [line.strip() for line in lines]
            section = 'start'
            # for section tags see src/crush/CrushCompiler.cc decompile
            for line in lines:
                if len(line) == 0:
                    continue
                if line.startswith('# begin crush map'):
                    section = 'tunables'
                    continue
                elif line.startswith('# devices'):
                    section = 'devices'
                    continue
                elif line.startswith('# types'):
                    section = 'types'
                    continue
                elif line.startswith('# buckets'):
                    section = 'buckets'
                    continue
                elif line.startswith('# rules'):
                    section = 'rules'
                    continue
                elif line.startswith('# choose_args'):
                    section = 'end'
                    break
                elif line.startswith('# end crush map'):
                    section = 'end'
                    break
                if section == 'tunables':
                    self.lines_tunables.append(line)
                elif section == 'devices':
                    self.lines_devices.append(line)
                elif section == 'types':
                    self.lines_types.append(line)
                elif section == 'buckets':
                    self.lines_buckets.append(line)
                elif section == 'rules':
                    self.lines_rules.append(line)
        if backup:
            self._backup(txt_file)
        call_cmd('rm ' + txt_file)
        call_cmd('rm ' + bin_file)

    def _backup(self, crush_file):
        stamp = datetime.now().strftime('%Y%m%d-%H:%M:%S')
        backup_name = 'crushmap-' + stamp + '.txt'
        backup_path = self.CRUSH_SAVE_PATH + backup_name
        # backup on filesystem
        cmd = 'cp ' + crush_file + ' ' + backup_path
        call_cmd(cmd)
        # backup to consul
        cmd = 'consul kv put PetaSAN/crush/' + backup_name + ' @' + backup_path
        call_cmd(cmd)

    def _get_backup_file_name(self):
        t = datetime.now().strftime('%Y%m%d-%H:%M:%S')
        return self.CRUSH_SAVE_PATH + 'crushmap-' + t + '.txt'

    def _write_file_lines(self):
        rand = self._get_rand_string(6)
        bin_file = self.CRUSH_SAVE_PATH + 'crushmap-tmp-' + rand + '.bin'
        txt_file = self.CRUSH_SAVE_PATH + 'crushmap-tmp-' + rand + '.txt'
        with open(txt_file, 'w') as f:
            for line in self.lines_tunables:
                f.writelines(line + '\n')
            f.writelines('\n')
            for line in self.lines_devices:
                f.writelines(line + '\n')
            f.writelines('\n')
            for line in self.lines_types:
                f.writelines(line + '\n')
            f.writelines('\n')
            for line in self.lines_buckets:
                f.writelines(line + '\n')
            f.writelines('\n')
            for line in self.lines_rules:
                f.writelines(line + '\n')
            f.writelines('\n')
        cmd = 'crushtool -c ' + txt_file + ' -o ' + bin_file
        if not call_cmd(cmd):
            raise CrushException(CrushException.COMPILE, 'Crush Compile Error')
        cluster_name = configuration().get_cluster_name()
        cmd = 'ceph osd setcrushmap -i ' + bin_file + ' --cluster ' + cluster_name
        ret, stdout, stderr = exec_command_ex(cmd)
        if ret != 0:
            if stderr and ('Connection timed out' in stderr or 'error connecting' in stderr):
                logger.error('Error in Ceph Connection cmd:' + cmd)
                raise CephException(CephException.CONNECTION_TIMEOUT, 'Connection Timeout Error')
            logger.error('General error in Ceph cmd:' + cmd + ' error:' + stderr)
            raise CephException(CephException.GENERAL_EXCEPTION, 'General Ceph Error')
        call_cmd('rm ' + txt_file)
        call_cmd('rm ' + bin_file)

    def read(self, backup=False):
        self._read_file_lines(backup)
        self._decode_device_class()
        self._decode_types()
        self._decode_buckets()
        self._decode_rules()

    def write(self):
        self._encode_rules()
        self._encode_buckets()
        self._write_file_lines()

    def get_bucket_types(self):
        return self.types

    def get_buckets(self):
        return self.buckets

    def set_buckets(self, buckets):
        self.buckets = buckets

    def get_rules(self):
        return self.rules

    def add_rule(self, name, body):
        if self.rules.has_key(name):
            logger.error('add rule error, rule name ' + name + ' already exists')
            raise CrushException(CrushException.DUPLICATE_RULE_NAME, 'Duplicate rule name')
        ids = self._get_rule_ids()
        id = self._get_rule_id(body)
        if id in ids:
            logger.error('add rule error, rule id ' + id + ' already exists')
            raise CrushException(CrushException.DUPLICATE_RULE_ID, 'Duplicate rule id')
        dev_class = self._get_rule_class(body)
        if dev_class:
            if dev_class not in self.device_class.values():
                logger.error('add rule error, device class ' + dev_class + ' does not exist')
                raise CrushException(CrushException.DEVICE_TYPE_NOT_EXISTS,
                                     'Device type does not exist')
        self.rules[name] = body

    def update_rule(self, name, body):
        if not self.rules.has_key(name):
            logger.error('edit rule error, rule name ' + name + ' not found')
            raise CrushException(CrushException.RULE_NOT_FOUND, 'Rule not found')
        id_old = self._get_rule_id(self.rules[name])
        id_new = self._get_rule_id(body)
        if id_new != id_old:
            ids = self._get_rule_ids()
            if id_new in ids:
                logger.error('update rule error, rule id ' + id_new + ' already exists')
                raise CrushException(CrushException.DUPLICATE_RULE_ID, 'Duplicate rule id')
        self.rules[name] = body

    # def get_next_rule_id(self):
    #     next = 0
    #     ids = self._get_rule_ids()
    #     for id in ids:
    #         if next < int(id):
    #             next = int(id)
    #     return str(next + 1)

    def get_next_rule_id(self):
        # return the smallest rule id not already in use
        ids = self._get_rule_ids()
        next_id = 0
        ids_set = set(ids)
        while True:
            if str(next_id) not in ids_set:
                return str(next_id)
            next_id += 1

    def delete_rule(self, name):
        if not self.rules.has_key(name):
            logger.error('delete rule error, rule name ' + name + ' not found')
            raise CrushException(CrushException.RULE_NOT_FOUND, 'Rule not found')
        del self.rules[name]
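# --- Hedged usage sketch (the rule body below is an illustrative CRUSH rule
# text, not taken from the source; it assumes an 'hdd' device class exists):
crush = CrushMap()
crush.read(backup=True)                 # fetch + decompile + backup current map
rule_body = ('{\n'
             'id ' + crush.get_next_rule_id() + '\n'
             'type replicated\n'
             'min_size 1\n'
             'max_size 10\n'
             'step take default class hdd\n'
             'step chooseleaf firstn 0 type host\n'
             'step emit\n'
             '}')
crush.add_rule('by-host-hdd', rule_body)
crush.write()                           # recompile + ceph osd setcrushmap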
    parser.add_argument('-id', help='Disk id such as 00006.', required=True, type=str)
    parser.add_argument('-ip', help='IP address of path.', required=True, type=str)
    args = parser.parse_args()
    return args


__app_conf = ConfigAPI()
__node_info = configuration().get_node_info()
__ceph_api = CephAPI()
__disk_id = ''
__ip = ''
__network = NetworkAPI()
__consul_api = ConsulAPI()
__session = None


def main_catch(func, args):
    try:
        func(args)
    except Exception as e:
        logger.error(e.message)
#!/usr/bin/python
'''
Copyright (C) 2019 Maged Mokhtar <mmokhtar <at> petasan.org>
Copyright (C) 2019 PetaSAN www.petasan.org

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation

This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
'''

from PetaSAN.core.common.cmd import kill_by_file_name
from PetaSAN.core.config.api import ConfigAPI

kill_by_file_name(ConfigAPI().get_sync_file_service())
kill_by_file_name(ConfigAPI().get_petasan_service())
kill_by_file_name(ConfigAPI().get_management_service())
class MangePathAssignment(object):
    def __init__(self):
        self.__app_conf = ConfigAPI()
        self.__context = AssignmentContext()
        self.__session_dict = ConsulAPI().get_sessions_dict(
            ConfigAPI().get_iscsi_service_session_name())
        self.__node_session_dict = dict()

    def get_assignments_stats(self):
        return self._filter_assignments_stats()

    def search_by_disk_name(self, disk_name):
        return self._filter_assignments_stats(filter_type=1, filter_text=disk_name)

    def search_by_ip(self, ip):
        return self._filter_assignments_stats(filter_type=2, filter_text=ip)

    def _filter_assignments_stats(self, filter_type=0, filter_text=None, set_session=False):
        __disk_consul_stopped = set()
        running_paths = dict()
        ceph_api = CephAPI()
        consul_api = ConsulAPI()
        disk_kvs = consul_api.get_disk_kvs()

        # Step 1: get all running paths.
        for consul_kv_obj in disk_kvs:
            path_key = str(consul_kv_obj.Key).replace(
                self.__app_conf.get_consul_disks_path(), "")
            disk_id = str(path_key).split('/')[0]
            if disk_id in __disk_consul_stopped:
                continue
            if consul_kv_obj.Value == "disk":
                disk_id = str(path_key).split('/')[0]
                # Step 2: avoid stopping disks
                if str(consul_kv_obj.Flags) == "1":
                    __disk_consul_stopped.add(disk_id)
                continue
            running_paths[path_key] = consul_kv_obj

        if len(running_paths) == 0:
            return AssignmentStats()

        # Step 3: get all images metadata
        images = ceph_api.get_disks_meta()
        assignment_stats = AssignmentStats()

        # Step 4: get current reassignments
        current_running_assignments = self.get_current_reassignment()
        if current_running_assignments is not None:
            assignment_stats.is_reassign_busy = True
            # stop any filter and return all data if a reassignment is running
            filter_type = 0

        # Step 5: fill paths assignment info
        for path_key, consul_kv_obj in running_paths.iteritems():
            disk_id = str(path_key).split('/')[0]
            disk = next((img for img in images if img.id == disk_id), None)
            if disk is None:
                continue
            disk_path = Path()
            path_index = int(str(path_key).split(disk_id + "/")[1])
            path_str = disk.paths[path_index - 1]
            disk_path.load_json(json.dumps(path_str))
            path_assignment_info = PathAssignmentInfo()
            path_assignment_info.interface = disk_path.eth
            if disk_path.vlan_id:
                path_assignment_info.interface = disk_path.eth + "." + disk_path.vlan_id
            path_assignment_info.ip = disk_path.ip
            path_assignment_info.disk_name = disk.disk_name
            path_assignment_info.disk_id = disk_id
            path_assignment_info.index = path_index
            current_path = None
            if current_running_assignments is not None:
                current_path = current_running_assignments.get(disk_path.ip)
            if hasattr(consul_kv_obj, "Session") and \
                    self.__session_dict.has_key(consul_kv_obj.Session):
                # Fill status and node name for started paths
                path_assignment_info.node = self.__session_dict.get(consul_kv_obj.Session).Node
                if current_running_assignments is not None:
                    if current_path is not None and current_path.status != -1:
                        path_assignment_info.status = current_path.status
                        path_assignment_info.target_node = current_path.target_node
                        if set_session:
                            # session refers to the node that locks this path
                            # assignment; it tells us the path status and which
                            # node will handle the path
                            path_assignment_info.session = current_path.session
            elif current_path:
                path_assignment_info.node = current_path.node
                path_assignment_info.target_node = current_path.target_node
                path_assignment_info.status = current_path.status
                if set_session:
                    path_assignment_info.session = current_path.session

            # Step 6: search or get all
            if filter_type == 1 and filter_text is not None and \
                    len(str(filter_text).strip()) > 0:
                # by disk name
                if filter_text.strip().lower() in path_assignment_info.disk_name.lower():
                    assignment_stats.paths.append(path_assignment_info)
            elif filter_type == 2 and filter_text is not None and \
                    len(str(filter_text).strip()) > 0:
                # by ip
                if filter_text.strip() == path_assignment_info.ip.strip():
                    assignment_stats.paths.append(path_assignment_info)
                    break
            else:
                assignment_stats.paths.append(path_assignment_info)

        # Step 7: set all online nodes
        assignment_stats.nodes = self._get_nodes()
        return assignment_stats

    def get_current_reassignment(self):
        paths = ConsulAPI().get_assignments()
        if paths is not None:
            for ip, path_assignment_info in paths.iteritems():
                if not hasattr(path_assignment_info, "session"):
                    logger.info("Path {} not locked by node.".format(path_assignment_info.ip))
                if not hasattr(path_assignment_info, "session") and \
                        path_assignment_info.status not in [ReassignPathStatus.succeeded,
                                                            ReassignPathStatus.failed]:
                    path_assignment_info.status = ReassignPathStatus.failed
        return paths

    def set_new_assignments(self, paths_assignment_info):
        logger.info("Set new assignment.")
        if self.get_current_reassignment() is not None:
            raise Exception("There is already a running assignment.")
        config_api = ConfigAPI()
        consul_api = ConsulAPI()
        logger.info("Delete old assignments.")
        consul_api.delete_assignments()
        session = consul_api.get_new_session_ID(
            config_api.get_assignment_session_name(),
            configuration().get_node_name(), True)
        if consul_api.lock_key(config_api.get_consul_assignment_path(), session, "root"):
            logger.info("Lock assignment root.")
            for path_assignment_info in paths_assignment_info:
                path_assignment_info.status = ReassignPathStatus.pending
                consul_api.set_path_assignment(
                    path_assignment_info,
                    self._get_node_session(path_assignment_info.target_node))
                logger.info(
                    "New assignment for {}, disk {}, from node {} to node {} with status {}".format(
                        path_assignment_info.ip, path_assignment_info.disk_id,
                        path_assignment_info.node, path_assignment_info.target_node,
                        path_assignment_info.status))
        else:
            logger.error("Can't lock paths assignment key.")
            raise Exception("Can't lock paths assignment key.")

    def run(self):
        cmd = "python {} server &".format(ConfigAPI().get_assignment_script_path())
        call_cmd(cmd)

    def _get_nodes(self):
        consul_api = ConsulAPI()
        # Get all PetaSAN nodes [management or storage].
        node_list = consul_api.get_node_list()
        # Get online nodes from consul.
        consul_members = consul_api.get_consul_members()
        petasan_node_list = []
        for i in node_list:
            if not i.is_iscsi:
                continue
            if i.name in consul_members:
                petasan_node_list.append(i.name)
        return petasan_node_list

    def remove_assignment(self):
        consul_api = ConsulAPI()
        if consul_api.get_assignments() is not None:
            consul_api.delete_assignments()

    def auto(self, type=1):
        logger.info("User started auto reassignment of paths.")
        assignments_stats = self.get_assignments_stats()
        if assignments_stats.is_reassign_busy:
            logger.error("There is already a reassignment running.")
            raise Exception("There is already a reassignment running.")
        ConsulAPI().drop_all_node_sessions(
            self.__app_conf.get_consul_assignment_path(),
            configuration().get_node_name())
        sleep(3)
        assignments_stats.paths = [
            path for path in assignments_stats.paths
            if len(path.node.strip()) > 0 and path.status == -1
        ]
        self.__context.paths = assignments_stats.paths
        self.__context.nodes = assignments_stats.nodes
        for plugin in self._get_new_plugins_instances(auto_plugins):
            if plugin.is_enable() and plugin.get_plugin_id() == type:
                paths_assignments = plugin.get_new_assignments()
                if len(paths_assignments) == 0:
                    logger.info("There is no node under average.")
                    return
                self.set_new_assignments(paths_assignments)
                break
        self.run()

    def manual(self, paths_assignment_info, assign_to="auto"):
        assignments_stats = self.get_assignments_stats()
        if assignments_stats.is_reassign_busy:
            logger.error("There is already a reassignment running.")
            raise Exception("There is already a reassignment running.")
        ConsulAPI().drop_all_node_sessions(
            self.__app_conf.get_consul_assignment_path(),
            configuration().get_node_name())
        sleep(3)  # Wait to be sure the session dropped
        if assign_to == "auto":
            logger.info("User started auto reassignment for selected paths.")
            assignments_stats.paths = [
                path for path in assignments_stats.paths
                if len(path.node.strip()) > 0 and path.status == -1
            ]
            self.__context.paths = assignments_stats.paths
            self.__context.nodes = assignments_stats.nodes
            self.__context.user_input_paths = paths_assignment_info
            for plugin in self._get_new_plugins_instances(auto_plugins):
                if plugin.is_enable() and plugin.get_plugin_id() == 1:
                    paths_assignments = plugin.get_new_assignments()
                    self.set_new_assignments(paths_assignments)
                    logger.info("User started auto reassignment for selected paths.")
                    self.run()
                    break
        else:
            for path_assignment_info in paths_assignment_info:
                path_assignment_info.target_node = assign_to
                path_assignment_info.status = ReassignPathStatus.pending
            logger.info("User started manual reassignment for selected paths.")
            self.set_new_assignments(paths_assignment_info)
            self.run()

    def process(self):
        logger.info("Start processing path reassignments.")
        max_retry = 100
        current_reassignments = self.get_current_reassignment()
        config = configuration()
        assignment_script_path = ConfigAPI().get_assignment_script_path()
        if current_reassignments is None:
            return
        for ip, path_assignment_info in current_reassignments.iteritems():
            logger.info("Process path {} with status {}.".format(
                ip, path_assignment_info.status))
            if path_assignment_info.status == ReassignPathStatus.pending:
                logger.info("Move action, try to clean disk {} path {} remotely on node {}.".format(
                    path_assignment_info.disk_name, path_assignment_info.disk_id,
                    path_assignment_info.node))
                status = False
                try:
                    cmd = "python {} path_host -ip {} -disk_id {}".format(
                        assignment_script_path, path_assignment_info.ip,
                        path_assignment_info.disk_id)
                    out, err = ssh().exec_command(path_assignment_info.node, cmd)
                    logger.info(cmd)
                    # self.clean_source_node(path_assignment_info.ip, path_assignment_info.disk_id)
                except Exception as ex:
                    logger.exception(ex.message)
                    out = ""
                if str(out).strip() == "0":
                    logger.info("Move action passed.")
                    status = True
                current_path_assignment_info = None
                if status:
                    for i in xrange(0, max_retry):
                        logger.debug("Wait to update status of path {}.".format(
                            path_assignment_info.ip))
                        sleep(0.25)
                        reassignments = self.get_current_reassignment()
                        if reassignments:
                            current_path_assignment_info = reassignments.get(
                                path_assignment_info.ip)
                            if current_path_assignment_info and \
                                    current_path_assignment_info.status == ReassignPathStatus.moving:
                                continue
                            else:
                                logger.info("Process completed for path {} with status {}.".format(
                                    current_path_assignment_info.ip,
                                    current_path_assignment_info.status))
                                break
                    if current_path_assignment_info and \
                            current_path_assignment_info.status == ReassignPathStatus.moving:
                        # update_path() expects the path ip, not the info object
                        self.update_path(current_path_assignment_info.ip,
                                         ReassignPathStatus.failed)
                        logger.info("Move action failed, disk {} path {} on node {}.".format(
                            path_assignment_info.disk_name, path_assignment_info.disk_id,
                            path_assignment_info.node))
                else:
                    self.update_path(path_assignment_info.ip, ReassignPathStatus.failed)
                    logger.info("Move action failed to clean disk {} path {} remotely on node {}.".format(
                        path_assignment_info.disk_name, path_assignment_info.disk_id,
                        path_assignment_info.node))
        sleep(10)  # wait for display status to user if needed
        logger.info("Process completed.")
        self.remove_assignment()
        ConsulAPI().drop_all_node_sessions(
            self.__app_conf.get_consul_assignment_path(), config.get_node_name())

    def _clean_iscsi_config(self, disk_id, path_index, iqn):
        logger.debug("Move action, start clean disk {} path {}.".format(disk_id, path_index))
        lio_api = LioAPI()
        try:
            # Get tpgs for iqn.
            tpgs = lio_api.get_iqns_with_enabled_tpgs().get(iqn, None)
            if not iqn or not tpgs or len(tpgs) == 0:
                logger.info("Move action, could not find ips for %s" % disk_id)
            # Remove the assigned ips from our interfaces
            elif tpgs and len(tpgs) > 0:
                # Get assigned ips for each path.
                for tpg, ips in tpgs.iteritems():
                    if tpg == str(path_index + 1):
                        lio_api.disable_path(iqn, tpg)
                        logger.info("Move action, cleaned disk {} path {}.".format(
                            disk_id, path_index))
                        break
        except Exception as e:
            logger.error("Move action, could not clean disk path for %s" % disk_id)
            return False
        logger.debug("Move action, end clean disk {} path {}.".format(disk_id, path_index))
        return True

    def clean_source_node(self, ip, disk_id):
        if not self.update_path(ip, ReassignPathStatus.moving):
            return False
        # pool = CephAPI().get_pool_bydisk(disk_id)
        pool = self._get_pool_by_disk(disk_id)
        if not pool:
            logger.error('Could not find pool for disk ' + disk_id)
            return False
        disk = CephAPI().get_disk_meta(disk_id, pool)
        paths_list = disk.paths
        disk_path = None
        path_index = -1
        for i in xrange(0, len(paths_list)):
            path_str = paths_list[i]
            path = Path()
            path.load_json(json.dumps(path_str))
            if path.ip == ip:
                disk_path = path
                path_index = i
                break
        if disk_path:
            self._clean_iscsi_config(disk_id, path_index, disk.iqn)
            network = Network()
            NetworkAPI().delete_ip(disk_path.ip, disk_path.eth, disk_path.subnet_mask)
            if network.is_ip_configured(ip):
                logger.error("Move action, cannot clean network config for disk {} path {}.".format(
                    disk_id, path_index))
                self.update_path(ip, ReassignPathStatus.failed)
                return False
            logger.info("Move action, cleaned network config for disk {} path {}.".format(
                disk_id, path_index))
            key = self.__app_conf.get_consul_disks_path() + disk_id + "/" + str(path_index + 1)
            consul_api = ConsulAPI()
            session = self._get_node_session(configuration().get_node_name())
            if ConsulAPI().is_path_locked_by_session(key, session):
                consul_api.release_disk_path(key, session, None)
                logger.info("Move action, released disk {} path {}.".format(
                    disk_id, path_index + 1))
        else:
            self.update_path(ip, ReassignPathStatus.failed)
            return False
        return True

    def update_path(self, ip, status):
        logger.info("Updating path {} status to {}".format(ip, status))
        current_reassignments = self.get_current_reassignment()
        if current_reassignments:
            path_assignment_info = current_reassignments.get(ip)
            if path_assignment_info:
                path_assignment_info.status = status
                if ConsulAPI().update_path_assignment(path_assignment_info):
                    logger.info("Path {} status updated to {}".format(ip, status))
                    return True
        logger.info("Path {} failed to update status to {}".format(ip, status))
        return False

    def _get_new_plugins_instances(self, modules):
        plugins = []
        for cls in modules:
            try:
                # import plugins module
                mod_obj = __import__(cls)
                for i in str(cls).split(".")[1:]:
                    mod_obj = getattr(mod_obj, i)
                # Find all plugins in module and create instances
                for mod_prop in dir(mod_obj):
                    # Ignore private
                    if not str(mod_prop).startswith("__"):
                        attr = getattr(mod_obj, mod_prop)
                        attr_str = str(attr)
                        attr_type_str = str(type(attr))
                        # Find plugin from type ABCMeta; plugin class name
                        # contains 'Plugin' and does not contain 'Base'
                        if attr_type_str.find("ABCMeta") > -1 and \
                                attr_str.find("Base") == -1 and \
                                attr_str.find("Plugin") > -1:
                            instance = attr(self.__context)
                            plugins.append(instance)
            except Exception as e:
                logger.error("Error loading plugin {}.".format(cls))
        return plugins

    def get_forced_paths(self):
        paths = None
        assignments = self._filter_assignments_stats(set_session=True)
        if not assignments.is_reassign_busy:
            return paths
        for path_assignment_info in assignments.paths:
            if path_assignment_info.status == ReassignPathStatus.moving and \
                    hasattr(path_assignment_info, "session"):
                if paths is None:
                    paths = dict()
                paths[path_assignment_info.disk_id + "/" +
                      str(path_assignment_info.index)] = path_assignment_info
        return paths

    def _get_node_session(self, node_name):
        logger.info(self.__node_session_dict)
        if self.__session_dict:
            session = self.__node_session_dict.get(node_name)
            if session is not None:
                return session
            else:
                for sess, node in self.__session_dict.iteritems():
                    if node.Node == node_name:
                        # cache by node name so the lookup above can hit
                        self.__node_session_dict[node_name] = sess
                        return sess

    def _get_pool_by_disk(self, disk_id):
        consul_api = ConsulAPI()
        ceph_api = CephAPI()
        pool = consul_api.get_disk_pool(disk_id)
        if pool:
            logger.info('Found pool:{} for disk:{} via consul'.format(pool, disk_id))
            return pool
        pool = ceph_api.get_pool_bydisk(disk_id)
        if pool:
            logger.info('Found pool:{} for disk:{} via ceph'.format(pool, disk_id))
            return pool
        logger.error('Could not find pool for disk ' + disk_id)
        return None
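# --- Hedged usage sketch (illustrative, not from the source): moves the
# first reported path to a named node; the node name is invented.
mpa = MangePathAssignment()
stats = mpa.get_assignments_stats()
if stats.paths and not stats.is_reassign_busy:
    mpa.manual([stats.paths[0]], assign_to="ps-node-02")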
def build(self):
    try:
        self.__status_report = StatusReport()
        conf = configuration()
        if len(conf.get_cluster_info().management_nodes) == 0:
            node_num = len(conf.get_cluster_info().management_nodes) + 1
            self.__status_report.nod_num = node_num
            NTPConf().setup_ntp_local()
            if conf.add_management_node() != Status().done:
                self.__status_report.success = False
                self.__status_report.failed_tasks.append("core_cluster_deploy_cant_add_node")
            logger.info("Node 1 added, cluster requires 2 other nodes to build.")
            self.run_post_deploy_script()
            return BuildStatus().OneManagementNode

        elif len(conf.get_cluster_info().management_nodes) == 1:
            node_num = len(conf.get_cluster_info().management_nodes) + 1
            self.__status_report.nod_num = node_num
            connection_status = self.check_connections()
            if not connection_status.success:
                self.__status_report.failed_tasks.extend(connection_status.failed_tasks)
                logger.error("Connection ping error.")
                logger.error(self.__status_report.failed_tasks)
                return BuildStatus().connection_error
            NTPConf().setup_ntp_local()
            if conf.add_management_node() != Status().done:
                self.__status_report.success = False
                self.__status_report.failed_tasks.append("core_cluster_deploy_cant_add_node")
                return BuildStatus().error
            if not self.__sync_cluster_config_file():
                return BuildStatus().error
            logger.info("Node 2 is added, cluster requires 1 other node to build.")
            self.run_post_deploy_script()
            return BuildStatus().TwoManagementNodes

        elif len(conf.get_cluster_info().management_nodes) == 2:
            node_num = len(conf.get_cluster_info().management_nodes) + 1
            self.__status_report.nod_num = node_num
            connection_status = self.check_connections()
            if not connection_status.success:
                self.__status_report.failed_tasks.extend(connection_status.failed_tasks)
                logger.error("Connection ping error.")
                logger.error(self.__status_report.failed_tasks)
                return BuildStatus().connection_error
            status = self.check_remote_connection()
            if not status.success:
                self.__status_report = status
                return BuildStatus().error
            NTPConf().setup_ntp_local()
            logger.info("Stopping petasan services on all nodes.")
            self.stop_petasan_services()
            logger.info("Starting local clean_ceph.")
            clean_ceph()
            logger.info("Starting local clean_consul.")
            clean_consul()
            status = build_consul()
            if not status.success:
                self.__status_report.failed_tasks.extend(status.failed_tasks)
                logger.error("Could not build consul.")
                logger.error(self.__status_report.failed_tasks)
                return BuildStatus().build_consul_error
            status = build_monitors()
            if not status.success:
                self.__status_report = status
                logger.error("Could not build ceph monitors.")
                logger.error(self.__status_report.failed_tasks)
                return BuildStatus().build_monitors_error
            status = build_osds()
            if not status.success:
                self.__status_report = status
                logger.error("Could not build ceph OSDs.")
                logger.error(self.__status_report.failed_tasks)
                return BuildStatus().build_osd_error
            else:
                self.__status_report.failed_tasks.extend(status.failed_tasks)
            logger.info("Main core components deployed.")
            if not self.__commit_management_nodes():
                self.__status_report.success = False
                logger.error("Could not commit node.")
                self.__status_report.failed_tasks.append("core_cluster_deploy_couldnt_commit_node")
                logger.error(self.__status_report.failed_tasks)
                return BuildStatus().error
            logger.info("Starting all services.")
            self.start_petasan_services()
            if not self.add__node_to_hosts_file():
                self.__status_report.success = False
                logger.error("Could not add node to hosts file.")
                self.__status_report.failed_tasks.append("core_cluster_deploy_couldnt_add_node_hosts")
                logger.error(self.__status_report.failed_tasks)
                return BuildStatus().error
            SharedFS().setup_management_nodes()
            if conf.add_management_node() != Status().done:
                self.__status_report.success = False
                self.__status_report.failed_tasks.append("core_cluster_deploy_couldnt_add_node_config")
                logger.error(self.__status_report.failed_tasks)
                return BuildStatus().error
            logger.info("Updating rbd pool.")
            if not create_rbd_pool():
                self.__status_report.success = False
                self.__status_report.failed_tasks.append("core_cluster_deploy_couldnt_update_rbd")
                logger.error(self.__status_report.failed_tasks)
                return BuildStatus().error
            logger.info("Creating EC Profiles.")
            if not create_ec_profiles():
                self.__status_report.success = False
                self.__status_report.failed_tasks.append("core_cluster_deploy_couldnt_create_ec_profiles")
                logger.error(self.__status_report.failed_tasks)
                return BuildStatus().error
            logger.info("Waiting for ceph to reach active and clean status.")
            test_active_clean()
            if not self.__sync_cluster_config_file():
                return BuildStatus().error
            self.run_post_deploy_script()
            self.kill_petasan_console(True)
            logger.info("Node 3 added and cluster is now ready.")

        elif len(conf.get_cluster_info().management_nodes) == 3 and \
                not os.path.exists(ConfigAPI().get_replace_file_path()):
            # ------------------------------ Join ------------------------------ #
            # ------------------------------------------------------------------ #
            node_num = len(conf.get_cluster_info().management_nodes) + 1
            self.__status_report.nod_num = node_num
            logger.info("Joining node to running cluster.")
            connection_status = self.check_connections()
            if not connection_status.success:
                self.__status_report.failed_tasks.extend(connection_status.failed_tasks)
                logger.error("Connection ping error.")
                logger.error(self.__status_report.failed_tasks)
                return BuildStatus().connection_error
            status = self.check_remote_connection()
            NTPConf().setup_ntp_local()
            if not status.success:
                self.__status_report = status
                return BuildStatus().error
            logger.info("Stopping petasan services on local node.")
            self.stop_petasan_services(remote=False)
            logger.info("Starting local clean_ceph.")
            clean_ceph_local()
            logger.info("Starting local clean_consul.")
            clean_consul_local()
            status = build_consul_client()
            if not status.success:
                self.__status_report.failed_tasks.extend(status.failed_tasks)
                logger.error("Could not build consul client.")
                logger.error(self.__status_report.failed_tasks)
                return BuildStatus().build_consul_error
            status = copy_ceph_config_from_mon()
            if not status.success:
                self.__status_report.failed_tasks.extend(status.failed_tasks)
                logger.error("Could not copy ceph config.")
                logger.error(self.__status_report.failed_tasks)
                return BuildStatus().build_consul_error
            status = create_osds_local()
            if not status.success:
                self.__status_report = status
                logger.error("Could not build ceph OSDs.")
                logger.error(self.__status_report.failed_tasks)
                return BuildStatus().build_osd_error
            else:
                self.__status_report.failed_tasks.extend(status.failed_tasks)
            logger.info("Main core components deployed.")
            logger.info("Starting all services.")
            self.start_petasan_services(remote=False)
            test_active_clean()
            if not self.__commit_local_node():
                test_active_clean()
                if not self.__commit_local_node():
                    self.__status_report.success = False
                    logger.error("Could not commit node.")
                    self.__status_report.failed_tasks.append("core_cluster_deploy_couldnt_commit_node_join")
                    logger.error(self.__status_report.failed_tasks)
                    os.remove(ConfigAPI().get_cluster_info_file_path())
                    return BuildStatus().error
            if not self.add__node_to_hosts_file(remote=False):
                test_active_clean()
                if not self.add__node_to_hosts_file(remote=False):
                    self.__status_report.success = False
                    logger.error("Could not add node to hosts file.")
                    self.__status_report.failed_tasks.append("core_cluster_deploy_couldnt_add_node_hosts")
                    logger.error(self.__status_report.failed_tasks)
                    os.remove(ConfigAPI().get_cluster_info_file_path())
                    return BuildStatus().error
            logger.info("Node successfully joined to cluster.")
            self.kill_petasan_console(False)
            if os.path.exists(ConfigAPI().get_replace_file_path()):
                os.remove(ConfigAPI().get_replace_file_path())
            self.run_post_deploy_script()
            return BuildStatus().done_joined

        elif len(conf.get_cluster_info().management_nodes) == 3 and \
                os.path.exists(ConfigAPI().get_replace_file_path()):
            # ----------------------------- Replace ---------------------------- #
            # ------------------------------------------------------------------ #
            node_num = len(conf.get_cluster_info().management_nodes) + 1
            self.__status_report.nod_num = node_num
            logger.info("Replace node is starting.")
            connection_status = self.check_connections()
            if not connection_status.success:
                self.__status_report.failed_tasks.extend(connection_status.failed_tasks)
                logger.error("Connection ping error.")
                logger.error(self.__status_report.failed_tasks)
                return BuildStatus().connection_error
            status = self.check_remote_connection()
            NTPConf().setup_ntp_local()
            if not status.success:
                self.__status_report = status
                return BuildStatus().error
            logger.info("Stopping petasan services on local node.")
            self.stop_petasan_services(remote=False)
            logger.info("Starting clean_ceph.")
            clean_ceph_local()
            logger.info("Starting local clean_consul.")
            clean_consul_local()
            status = replace_consul_leader()
            if not status.success:
                self.__status_report.failed_tasks.extend(status.failed_tasks)
                logger.error("Could not replace consul leader.")
                logger.error(self.__status_report.failed_tasks)
                return BuildStatus().build_consul_error
            status = replace_local_monitor()
            if not status.success:
                self.__status_report.failed_tasks.extend(status.failed_tasks)
                logger.error(self.__status_report.failed_tasks)
                return BuildStatus().build_monitors_error
            status = create_osds_local()
            if not status.success:
                self.__status_report = status
                logger.error("Could not build ceph OSDs.")
                logger.error(self.__status_report.failed_tasks)
                return BuildStatus().build_osd_error
            else:
                self.__status_report.failed_tasks.extend(status.failed_tasks)
            logger.info("Main core components deployed.")
            logger.info("Starting all services.")
            self.start_petasan_services(remote=False)
            test_active_clean()
            SharedFS().rebuild_management_node()
            logger.info("Node successfully added to cluster.")
            self.run_post_deploy_script()
            self.kill_petasan_console(False)
            os.remove(ConfigAPI().get_replace_file_path())
            return BuildStatus().done_replace

    except Exception as ex:
        config_api = ConfigAPI()
        if os.path.exists(config_api.get_cluster_info_file_path()):
            os.remove(config_api.get_cluster_info_file_path())
        logger.exception(ex.message)
        return BuildStatus().error
    return BuildStatus().done
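# --- Hedged usage sketch (illustrative, not from the source): build() keys
# its behavior off the management-node count, so the same entry point is
# invoked on each node in turn; the Deploy class name is an assumption.
deploy = Deploy()
result = deploy.build()   # OneManagementNode / TwoManagementNodes / done_joined / ...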
def clean_consul_local():
    logger.info("Trying to clean Consul on local node")
    PetaSAN.core.common.cmd.call_cmd('python ' + ConfigAPI().get_consul_stop_script_path())
    PetaSAN.core.common.cmd.call_cmd('python ' + ConfigAPI().get_consul_clean_script_path())
def __read(self):
    # Run rados benchmark on selected nodes
    for node in self.clients:
        cmd = "python " + ConfigAPI().get_node_stress_job_script_path() + \
              " -d {} -t {} -m r -p {}".format(self.stress_duration, self.threads, self.pool)
        logger.info("Run rados read cmd on node {} : ".format(node) + cmd)
        out, err = ssh().exec_command(node, cmd)
        # get job id from output and assign it to its node
        if not err:
            self.read_jobs[int(out)] = node

    logger.info("Wait time before collecting node state.")
    sleep(self.wait_for_collect_state)

    # Get state of storage nodes
    for node in self.storage_nodes:
        cmd = "python " + ConfigAPI().get_storage_load_job_script_path() + \
              " -d {} ".format(self.state_duration)
        out, err = ssh().exec_command(node, cmd)
        logger.info("Run sar state cmd on node {} : ".format(node) + cmd)
        if not err:
            self.read_jobs[int(out)] = node

    # Wait for all jobs to complete
    sleep(self.stress_duration - self.wait_for_collect_state)

    # Check the completed jobs and get the output
    while len(self.read_jobs) > 0:
        remove_job_ids = []
        for job_id, node_name in self.read_jobs.iteritems():
            cmd = "python " + ConfigAPI().get_job_info_script_path() + \
                  " -id {} -t {}".format(job_id, 1)
            out, err = ssh().exec_command(node_name, cmd)
            # Job completed
            if int(out) == 1:
                remove_job_ids.append(job_id)
                cmd = "python " + ConfigAPI().get_job_info_script_path() + \
                      " -id {} -t {}".format(job_id, 2)
                out, err = ssh().exec_command(node_name, cmd)
                logger.debug("Get job output by cmd {} from node {} ".format(cmd, node_name))
                logger.debug("Output is {} ".format(out))
                # job passed and we got our output
                if out.startswith(self.output_split_text) or \
                        out.find(self.output_split_text) > -1:
                    out = out.split(self.output_split_text)[1]
                else:
                    continue
                # Get rados IOPs output
                if node_name in self.clients:
                    rados_rs = RadosResult()
                    if out:
                        rados_rs.load_json(out)
                        self.report.read_iops += rados_rs.iops
                        self.report.read_throughput += rados_rs.throughput
                elif node_name in self.storage_nodes:
                    # Get sar output
                    sar_rs = SarResult()
                    if out:
                        sar_rs.load_json(out)
                        self.report.read_nodes.append(sar_rs)
        # Remove completed jobs
        for i in remove_job_ids:
            self.read_jobs.pop(i)
        if len(self.read_jobs) > 0:
            sleep(5)
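# --- Hedged sketch of the polling pattern used above, in plain Python with a
# stand-in "is job done" check so it runs standalone (the real code queries
# each node over ssh via the job info script):
from time import sleep

jobs = {101: "node-1", 102: "node-2"}   # job_id -> node


def job_done(job_id):
    # stand-in for the remote job-status query
    return True


while len(jobs) > 0:
    finished = [j for j in jobs if job_done(j)]
    for j in finished:
        jobs.pop(j)
    if len(jobs) > 0:
        sleep(5)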