def start_ignite_grid(self, name, activate=False, already_nodes=0, replaced_name=None):
        app = self.get_app(self.ignite_app_names[name])
        app.set_node_option('*', 'config',
                            Ignite.config_builder.get_config('server', config_set_name=self.config_name))
        # app.activate_default_modules()

        artifact_cfg = self.tiden.config['artifacts'][app.name]

        app.reset()
        log_print("Ignite ver. %s, revision %s" % (
            artifact_cfg['ignite_version'],
            artifact_cfg['ignite_revision'],
        ))

        if replaced_name:
            app.grid_name = replaced_name

        cmd = [
            'cp %s %s/libs/' % (app.config['artifacts']['piclient']['remote_path'],
                                app.config['artifacts'][self.ignite_app_names[name]]['remote_path'])
        ]
        self.util_exec_on_all_hosts(app, cmd)

        app.start_nodes(already_nodes=already_nodes, other_nodes=already_nodes)

        if activate:
            app.cu.activate()

        return app
    def teardown_testcase(self):
        self.logger.info('TestTeardown is called')

        log_print(repr(self.ignite), color='debug')
        log_print(self.ignite.get_all_additional_nodes(), color='blue')
        log_print(self.ignite.get_alive_additional_nodes(), color='blue')
        for additional_node in self.ignite.get_alive_additional_nodes():
            self.ignite.kill_node(additional_node)

        # kill the JMX utility if it is running
        if self.ignite.jmx.is_started():
            self.ignite.jmx.kill_utility()

        self.stop_grid_hard()
        self.su.copy_utility_log()

        if self.get_context_variable('zookeeper_enabled'):
            self.zoo.stop_zookeeper()
            # self.zoo.collect_logs_to_folder(self.ignite.config['rt']['remote']['test_dir'])

        # c'mon, I'm too lazy to do it right
        log_print(
            f'Value for self.need_delete_lfs_on_teardown is {self.need_delete_lfs_on_teardown}',
            color='debug')
        if self.need_delete_lfs_on_teardown:
            self.cleanup_lfs()
        self.set_current_context()
        self.reset_cluster()
        log_print(repr(self.ignite), color='debug')
    def download_run_logs(self):
        log_print('download logs')

        logs_dir = self.tiden.config['rt']['remote']['test_dir']
        files = []
        commands = {host: [f"ls {logs_dir} | grep '\\.log'"] for host in self.tiden.ssh.hosts}
        items_list = self.cluster.ssh.exec(commands)
        for host in self.tiden.ssh.hosts:
            files.append([host, [f'{logs_dir}/{file}' for file in items_list[host][0].split('\n') if file]])

        log_print('log file list collected')
        files_receiver_url = self.tiden.config['environment'].get('report_files_url')
        if files_receiver_url:
            for host, paths in files:
                for file_path in paths:
                    base_filename = basename(file_path)
                    file_name = f'{uuid4()}_{base_filename}'
                    self.tiden.ssh.exec_on_host(
                        host,
                        [f'cd {dirname(file_path)}; '
                         f'curl -H "filename: {file_name}" '
                         f'-F "file=@{base_filename};filename={base_filename}" '
                         f'{files_receiver_url}/files/add']
                    )
                    self.context['attachments'].append({
                        'name': base_filename,
                        'source': file_name,
                        'type': 'file'
                    })
    def execute_action(self, state):
        for s in state:
            log_print(f'[state] {s} starting', color='yellow')
            split_state = s.split('#')
            self.context['temp_action_options'] = split_state[1:] if len(split_state) > 1 else []
            self.action_methods[split_state[0]]()
            log_print(f'[state] {s} ended', color='yellow')
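For reference, a minimal standalone sketch of the '#'-separated state syntax parsed above; the action names and options here are hypothetical:

# Everything before the first '#' selects the action method,
# the remaining parts become temp_action_options.
states = ['restart_node#2#force', 'deactivate']
for s in states:
    action, *options = s.split('#')
    print(action, options)  # -> restart_node ['2', 'force'], then deactivate []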
    def execute_cache_check(self, verify_check_dir):
        """
        Run the control.sh idle_verify command on the existing cluster
        """
        name = 'verify.log'
        host = self.tiden.ssh.hosts[0]
        remote_log_path = join(self.tiden.config['rt']['remote']['test_dir'], name)
        local_log_path = join(verify_check_dir, name)
        skip_download = False
        try:
            self.cluster.cu.idle_verify_dump(key_dump=False, log=remote_log_path, output_limit=300)
        except Exception:
            stacktrace = format_exc()
            log_print(stacktrace, color='red')
            self.context['step_failed'] = f'Caches verification failed\n{stacktrace}'
            if 'Not found running server nodes' in stacktrace:
                skip_download = True
        finally:
            # attach the verification log in any case
            if skip_download:
                return
            try:
                self.cluster.ssh.download_from_host(host, remote_log_path, local_log_path)
                with open(local_log_path) as file:
                    fail_message = file.read()
                    if 'no conflicts have been found' not in fail_message:
                        self.context['step_failed'] = self.context.get('step_failed', '') + f'\nFound conflicts!\n{fail_message}'
                self.context['attachments'].append({
                    'name': 'log',
                    'source': local_log_path,
                    'type': 'file'
                })
            except Exception:
                self.context['step_failed'] = self.context.get('step_failed', '') + f'\nDownload failed!\n{format_exc()}'
Example #6
    def test_sim(self):
        version, ignite = self.start_ignite_grid(True)

        ignite.jmx.start_utility()

        client_config = Ignite.config_builder.get_config(
            'client', config_set_name='base')
        group_names = PiClientIgniteUtils.collect_cache_group_names(
            ignite, client_config)

        PiClientIgniteUtils.load_data_with_streamer(ignite,
                                                    client_config,
                                                    end_key=50)

        server_nodes_num = ignite.get_nodes_num('server')
        sim_engine = PigeonSimulation(server_nodes_num)

        for running_iteration in range(1, DEFAULT_ITERATIONS + 1):
            log_print("Running iteration %s" % running_iteration)

            ev, node = sim_engine.next_event()
            log_print("Evaluating event %s on node %s" % (ev, node))

            pigeon = self.get_pigeon(ignite, node)

            pigeon[ev]()

            ignite.jmx.wait_for_finish_rebalance(120, group_names)

            self.verify_cluster(ignite)

        ignite.jmx.kill_utility()
Example #7
    def util_find_snapshot_folders_on_fs(ignite, snapshot_id, remote_dir=None):
        snapshot_dirs = {}
        remote_snapshot_dir = None

        search_in_dir = remote_dir if remote_dir else './work/snapshot/'
        output = SnapshotScenario.run_on_all_nodes(
            ignite, 'ls -1 %s | grep %s' % (search_in_dir, snapshot_id))

        if len(output) > 0:
            for node_idx in output.keys():
                snapshot_dir = output[node_idx].rstrip()
                if snapshot_dir:
                    # Add only if directory exists
                    snapshot_dirs[node_idx] = snapshot_dir
                    remote_snapshot_dir = snapshot_dirs[node_idx]
                    log_print(
                        'Snapshot directory %s for snapshot ID=%s on node %s' %
                        (snapshot_dir, snapshot_id, node_idx),
                        color='yellow')

        # the remote dir is the same for all servers, so there is no need to iterate over them
        if remote_dir:
            return '%s/%s' % (remote_dir, remote_snapshot_dir)

        return snapshot_dirs
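A hedged usage sketch of the helper above, assuming it is in scope together with an `ignite` app instance; the snapshot ID and mount path are illustrative:

# Without remote_dir: a {node_idx: dir_name} mapping of per-node snapshot dirs.
dirs = util_find_snapshot_folders_on_fs(ignite, 1630000000000)
# With remote_dir: a single shared '<remote_dir>/<dir_name>' path.
shared = util_find_snapshot_folders_on_fs(ignite, 1630000000000,
                                          remote_dir='/mnt/share/snapshot')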
Example #8
    def parallel_restart(self, ignite):
        # clean up the LFS of dead nodes to avoid baseline topology (BLT) errors
        alive_node_id = ignite.get_alive_default_nodes()[0]

        ignite.cu.get_current_topology_version()
        all_nodes = set(
            findall('ConsistentID=([^,;\n]+)',
                    ignite.cu.latest_utility_output))
        blt_nodes = set(
            findall('ConsistentID=(.*?), STATE=',
                    ignite.cu.latest_utility_output))
        nblt_nodes = all_nodes - blt_nodes
        log_print(nblt_nodes)

        for node_id in ignite.get_all_default_nodes():
            if node_id not in ignite.get_alive_default_nodes() \
                    or ignite.get_node_consistent_id(node_id) in nblt_nodes:
                ignite.cleanup_work_dir(node_id)

        ignite.stop_nodes()
        # start the first previously alive node so the BLT is set correctly
        ignite.start_node(alive_node_id)

        for node_id in ignite.get_all_default_nodes():
            if node_id == alive_node_id:
                continue

            ignite.start_node(node_id)

        ignite.cu.set_current_topology_as_baseline()
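The baseline bookkeeping above depends on the shape of the control.sh topology output; here is a standalone check of those two regexes against a fabricated sample (the sample text is an assumption, not real utility output):

from re import findall

sample = ('ConsistentID=node1, STATE=ONLINE\n'
          'ConsistentID=node2, STATE=ONLINE\n'
          'ConsistentID=node3; order=5\n')
# All nodes mentioned in the output vs. nodes listed with a baseline STATE.
all_nodes = set(findall('ConsistentID=([^,;\n]+)', sample))
blt_nodes = set(findall('ConsistentID=(.*?), STATE=', sample))
print(all_nodes - blt_nodes)  # -> {'node3'}: nodes outside the baseline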
Example #9
    def start_ignite_grid(self,
                          activate=False,
                          already_nodes=0,
                          config_set='base',
                          jvm_options=None):
        app = Ignite(self.get_app_by_type('ignite')[0])
        app.set_node_option(
            '*', 'config',
            Ignite.config_builder.get_config('server',
                                             config_set_name=config_set))

        if jvm_options:
            app.set_node_option('*', 'jvm_options', jvm_options)

        artifact_cfg = self.tiden.config['artifacts'][app.name]

        app.reset()
        version = artifact_cfg['ignite_version']
        log_print("Ignite ver. %s, revision %s" % (
            version,
            artifact_cfg['ignite_revision'],
        ))

        app.start_nodes(already_nodes=already_nodes)

        if activate:
            app.cu.activate(activate_on_particular_node=1)

        return version, app
    def start_ignite_grid(self, name, activate=False, already_nodes=0, config_set='base', jvm_options=None):
        app = self.get_app(name)
        app.set_node_option('*', 'config',
                            Ignite.config_builder.get_config('server', config_set_name=config_set))

        if jvm_options:
            app.set_node_option('*', 'jvm_options', jvm_options)

        artifact_cfg = self.tiden.config['artifacts'][app.name]

        app.reset()
        log_print("Ignite ver. %s, revision %s" % (
            artifact_cfg['ignite_version'],
            artifact_cfg['ignite_revision'],
        ))

        app.set_activation_timeout(240)
        app.set_snapshot_timeout(240)

        app.start_nodes(already_nodes=already_nodes)

        if activate:
            app.cu.activate(activate_on_particular_node=1)

        return app
Example #11
    def register_config_set(self, config_set_name):
        """
        Register new config set

        Context name:
        * common variables - variables for all configs in this context
        * exclusive variables - variables for particular node
        * common configs - result of config + common variables
        * exclusive configs - result of exclusive config + (common + exclusive variables)

        :param config_set_name: configuration set name
        :return:
        """
        if config_set_name in self.config_sets:
            log_print(
                "Config set with name '%s' is already registered. Overwriting it with a new config set."
                % config_set_name,
                color='yellow')

        self.config_sets[config_set_name] = {}

        if self.current_config_set is None:
            self.current_config_set = config_set_name

        self.config_sets[config_set_name]['common_variables'] = {}
        self.config_sets[config_set_name]['exclusive_variables'] = {}
        self.config_sets[config_set_name]['common_configs'] = {}
        self.config_sets[config_set_name]['exclusive_configs'] = {}
        self.config_sets[config_set_name]['additional_config_types'] = {}
        return self.config_sets[config_set_name]
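A minimal sketch of what a freshly registered set looks like, assuming `builder` is an instance of the class defining register_config_set; the shape follows directly from the code above:

cfg = builder.register_config_set('base')
assert cfg == {
    'common_variables': {},
    'exclusive_variables': {},
    'common_configs': {},
    'exclusive_configs': {},
    'additional_config_types': {},
}
# Registering the same name again resets the set back to this empty shape.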
    def operation_deactivation(self):
        """
        Deactivate the cluster once.
        Start the operation only after the other operations have started.
        """
        property_name = 'deactivation'
        self.operations[property_name]['started'] = False
        self.operations[property_name]['kill'] = False
        test_dir = self.tiden.config['rt']['remote']['test_dir']
        set_log_path = f"{test_dir}/deactivation_log.log"
        while True:
            keys = list(self.operations.keys())
            for k in keys:
                if not self.operations.get(property_name):
                    log_print(
                        f'[operation] {property_name.upper()} kill: already killed',
                        color='yellow')
                    return
                if self.operations[property_name]['kill']:
                    return
                if len(keys) == 1 or (k != property_name
                                      and self.operations[k].get('started')):
                    self.operations[property_name]['started'] = True
                    self.cluster.cu.deactivate(log=set_log_path)
                    self.context['activate'] = False
                    self.operations[property_name]['killed'] = True
                    log_print(f'[operation] {property_name.upper()} kill',
                              color='yellow')
                    return
            sleep(1)
    def setup_webconsole(self):
        console = self.get_app_by_type('webconsole')[0]

        console.start(users=True)
        ignite_urls = ['{}:8080'.format(host) for host in console.ssh.hosts]
        console.start_web_agent(ignite_urls=ignite_urls)

        log_print('WebConsole Started', color='debug')
Example #14
    def _print_wait_for(self, message, node_idxs, time, timeout, done):
        nodes_str = ', '.join([str(node_id) for node_id in node_idxs])
        log_put(
            f"Waiting for '{message}' at nodes [{nodes_str}], {time}/{timeout} sec"
        )
        if done:
            stdout.flush()
            log_print('')
    def test_client_start_restart_grid_stop(self):
        with PiClient(self.ignite, self.get_client_config()) as piclient:
            log_print(piclient)
            self.stop_grid()
            self.start_grid()
            cache_names = piclient.get_ignite().cacheNames().toArray()
            for cache_name in cache_names:
                log_print(cache_name)
    def cluster_state_empty(self):
        """
        Start a new clean cluster
        """
        log_print('[cluster state] starting EMPTY', color='yellow')
        self.clean_cluster()
        self.start_grid()
        self.context['clean_cluster_was_here'] = False
        log_print('[cluster state] started EMPTY', color='yellow')
    def operation_baseline(self):
        """
        Update baseline in cycle
        """
        property_name = 'baseline'
        self.operations[property_name]['started'] = False
        self.operations[property_name]['kill'] = False
        test_dir = self.tiden.config['rt']['remote']['test_dir']
        set_log_path = f"{test_dir}/set_baseline_join.log"
        get_log_path = f"{test_dir}/get_baseline_version.log"
        fail_counter = 0
        end_time = time() + self.max_operation_timeout
        while True:
            try:
                curr_top_version = self.cluster.cu.get_current_topology_version(
                    log=get_log_path,
                    show_output=False,
                    ssh_options={'timeout': 60 * 3})

                if not curr_top_version:
                    raise TidenException(
                        'Failed to get current topology version')

                if not self.operations[property_name]['started']:
                    self.operations[property_name]['started'] = True
                    log_print(f'[operation] {property_name.upper()} started',
                              color='yellow')

                self.cluster.cu.set_baseline(curr_top_version,
                                             log=set_log_path,
                                             show_output=False,
                                             output_limit=100,
                                             ssh_options={'timeout': 60 * 3})
            except:
                fail_counter += 1
                log_print(format_exc(), color='red')
                if fail_counter > 10:
                    self.operations[property_name]['kill'] = True
            finally:
                if time() > end_time:
                    log_print(
                        f'[operation] {property_name.upper()} timeout kill',
                        color='yellow')
                    if self.operations.get(property_name):
                        self.operations[property_name]['killed'] = True
                    return
                if not self.operations.get(property_name):
                    log_print(
                        f'[operation] {property_name.upper()} kill: already killed',
                        color='yellow')
                    return
                if self.operations[property_name]['kill']:
                    log_print(f'[operation] {property_name.upper()} kill',
                              color='yellow')
                    self.operations[property_name]['killed'] = True
                    return
                sleep(1)
    def setup_operations(self, operations: list):
        """
        Start all operations and wait until they are running
        """
        steps = []
        try:
            for operation in operations:
                steps.append({
                    'name': operation,
                    'time': {'start': self.current_time,
                             'start_pretty': self.current_time_pretty}
                })
                log_print(f'[operation] {operation.upper()} starting', color='yellow')
                # start
                t = Thread(target=self.operations_methods[operation])
                self.operations[operation] = {'thread': t}
                t.start()

            end_time = time() + 60
            while True:
                started_operations = [(name, op) for name, op in self.operations.items() if op.get('started', False)]
                for name, started_operation in started_operations:
                    idx = [idx for idx, _step in enumerate(steps) if _step['name'] == name][0]
                    # skip operations already recorded as running
                    if steps[idx].get('status'):
                        continue
                    # step stuff
                    steps[idx]['status'] = 'passed'
                    steps[idx]['stacktrace'] = ''
                    steps[idx]['time']['end'] = self.current_time
                    steps[idx]['time']['end_pretty'] = self.current_time_pretty
                    diff = steps[idx]["time"]["end"] - steps[idx]["time"]["start"]
                    steps[idx]['time']['diff'] = f'{diff // 1000}s'

                # all operations started
                if len(started_operations) == len(operations):
                    return

                if [op for op in self.operations.values() if op.get('killed', False)] or end_time < time():
                    # operations were killed before starting, or took too long to start
                    for idx, step in enumerate(deepcopy(steps)):
                        if not step.get('status'):
                            steps[idx]['status'] = 'failed'
                            steps[idx]['time']['end'] = self.current_time
                            steps[idx]['time']['end_pretty'] = self.current_time_pretty
                            diff = steps[idx]["time"]["end"] - steps[idx]["time"]["start"]
                            steps[idx]['time']['diff'] = f'{diff // 1000}s'
                            steps[idx]['stacktrace'] = f'Failed to wait for operations start: \n{format_exc()}\n' \
                                                       f'Operations: {operations}\n' \
                                                       f'Started: {started_operations}'
                    raise TidenException(f'Failed to wait for operations start\n'
                                         f'Operations: {operations}\n'
                                         f'Started: {started_operations}')
                sleep(0.5)
        finally:
            self.context['report']['children'] = steps
Example #19
    def wait_transactions_finish(self):
        log_print(
            "Waiting for current transactions to finish for up to {} seconds".
            format(self.transactions_timeout))
        for tries in range(
                0,
                int(self.transactions_timeout / self.transaction_check_sleep)):
            self.ignite.cu.control_utility('--tx')
            if 'Nothing found' in self.ignite.cu.latest_utility_output:
                break
            util_sleep(self.transaction_check_sleep)
Example #20
    def teardown_testcase(self):
        for cluster in self.clusters:
            log_print('Teardown for cluster {}'.format(cluster))
            if cluster.grid:
                cluster.grid.kill_nodes()
                cluster.grid.remove_additional_nodes()

                if not SAVE_LFS:
                    cluster.grid.delete_lfs()

                cluster.grid = None
Example #21
def dr_is_stopped(piclient, cache_name):
    # avoid `return` inside `finally`: it would swallow unexpected exceptions
    stopped = False
    try:
        piclient.get_ignite().cache(cache_name)
        stopped = piclient.get_ignite().plugin(
            piclient.get_gateway().jvm.org.gridgain.grid.GridGain.PLUGIN_NAME
        ).dr().senderCacheStatus(cache_name).stopped()
    except Exception as e:
        log_print(e)
        stopped = False
    return stopped
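A hedged usage sketch built on dr_is_stopped: poll until the DR sender for a cache reports stopped, assuming an open piclient session (the helper name and timeout are illustrative):

from time import sleep, time

def wait_dr_sender_stopped(piclient, cache_name, timeout=60):
    # Returns True once the sender reports stopped, False on timeout.
    end = time() + timeout
    while time() < end:
        if dr_is_stopped(piclient, cache_name):
            return True
        sleep(1)
    return False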
    def operation_none(self):
        property_name = 'none'
        self.operations[property_name]['started'] = True
        self.operations[property_name]['kill'] = False
        log_print(f'[operation] {property_name.upper()} started',
                  color='yellow')
        while True:
            if self.operations[property_name]['kill']:
                self.operations[property_name]['killed'] = True
                log_print(f'[operation] {property_name.upper()} kill',
                          color='yellow')
                return
            sleep(0.5)
Example #23
def get_nodes_directory_size(ignite,
                             ssh,
                             directory,
                             nodes=None,
                             commands=None):
    """
    Total directory size from all nodes

    :return: collected result
    """
    node_ids_to_ignite_home = {}

    if not nodes:
        nodes = ignite.get_all_default_nodes() + ignite.get_all_additional_nodes()

    for node_id in nodes:
        node_ids_to_ignite_home[node_id] = ignite.nodes[node_id]['ignite_home']

    # commands may be overridden if specific commands should be sent to node
    if not commands:
        commands = {}

        for node_idx in nodes:
            host = ignite.nodes[node_idx]['host']
            if commands.get(host) is None:
                commands[host] = [
                    'du -s {}/{}'.format(ignite.nodes[node_idx]['ignite_home'],
                                         directory)
                ]
            else:
                commands[host].append('du -s {}/{}'.format(
                    ignite.nodes[node_idx]['ignite_home'], directory))

    results = ssh.exec(commands)
    results_parsed = 0

    for host in results.keys():
        # print(results[host][0])
        search = re.search(r'(\d+)\t', results[host][0])
        if not search:
            log_print(
                'Unable to get directory size for host %s. Set directory size as 0.'
                % host)
            return 0

        results_parsed += int(search.group(1))

    return results_parsed
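A usage sketch, assuming `ignite` (with its `ssh` connection) comes from a Tiden test and that 'work/db' is a directory of interest under each node's ignite home (both illustrative):

db_size = get_nodes_directory_size(ignite, ignite.ssh, 'work/db')
log_print('Total work/db size across nodes: %s blocks' % db_size)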
    def remove_index_bin_files(self, node_id):
        """
        Remove all index.bin files for a particular node, except those under cache-ignite-sys-caches.
        :param node_id:
        :return:
        """
        if node_id in self.ignite.nodes.keys():
            host = self.ignite.nodes[node_id]['host']
            ignite_home = self.ignite.nodes[node_id]['ignite_home']

            commands = dict()
            dir_to_search = '{}/work/db/'.format(ignite_home)
            commands[host] = [
                'find {} -name \'index.bin\''.format(dir_to_search)
            ]

            output = self.ignite.ssh.exec(commands)
            files = [
                file for file in output[host][0].split('\n')
                if file and 'cache-ignite-sys-caches' not in file
            ]
            log_print(files, color='debug')
            log_print(commands, color='debug')
            commands[host] = [
                ';'.join(['rm {}'.format(file) for file in files])
            ]
            output = self.ignite.ssh.exec(commands)
            log_print(output, color='debug')
        else:
            log_print("Node id {} does not found in server nodes {}".format(
                node_id, self.ignite.nodes.keys()),
                      color='red')
Example #25
    def test_master_master_master_blinking_blt(self):
        self.prepare_clusters()

        client_config = self.preconfigure_cluster_0()

        iterations = 10
        last_loaded_key = START_DATA_SIZE
        nodes_before = 6

        with PiClient(self.clusters[0].grid,
                      client_config,
                      jvm_options=['-ea']) as piclient:
            PiClientIgniteUtils.load_data_with_streamer(
                self.clusters[0].grid,
                client_config,
                end_key=last_loaded_key,
                jvm_options=['-ea'],
                check_clients=False)

            sleep(60)

            with TransactionalLoading(self,
                                      ignite=self.clusters[0].grid,
                                      config_file=client_config,
                                      skip_consistency_check=True):
                for i in range(0, iterations):
                    log_print(f'Current iteration {i + 1} from {iterations}',
                              color='debug')

                    self.clusters[0].grid.kill_node(2)

                    utility_baseline_log = 'control-utility-baseline.log'

                    self.clusters[0].grid.cu.set_current_topology_as_baseline(
                        background=True, log=utility_baseline_log)

                    self.clusters[0].grid.start_node(2,
                                                     skip_topology_check=True)

                    self.clusters[0].grid.wait_for_topology_snapshot(
                        server_num=6)

                    self.clusters[0].grid.update_started_node_status(2)

                    self.clusters[0].grid.cu.set_current_topology_as_baseline(
                        background=True, log=utility_baseline_log)

                    self.verify_cluster(0, nodes_before, last_loaded_key)
Example #26
    def initialize_config(self):
        self.historical_rebalance = self.config.get('historical_rebalance', False)
        self.metrics_idle = self.config.get('metrics_idle', 30)
        self.with_loading = self.config.get('with_loading', False)
        self.idle_verify = is_enabled(self.config.get('idle_verify'))
        self.load_type = self.config.get('load_type', DEFAULT_LOAD_TYPE)
        self.single_cache = 'single_cache' in self.config
        self.parts_distribution = self.config.get('partition_distribution', None)
        self.with_no_rebalance_cache = self.config.get('with_no_rebalance_cache', False)
        self.jfr_settings = self.config.get('jfr_settings', None)

        if self.with_loading and self.historical_rebalance:
            log_print('Historical rebalance is not supported with loading. Skipping loading.')
            self.with_loading = False
        if self.idle_verify and self.with_loading:
            log_print('Skipping the idle_verify parameter because with_loading is used', color='yellow')
Example #27
def calc_checksums_on_client(piclient,
                             start_key=0,
                             end_key=1000,
                             dict_mode=False):
    """
    Calculate checksum based on piclient
    :param start_key: start key
    :param end_key: end key
    :param dict_mode:
    :return:
        """
    log_print("Calculating checksums using cache.get() from client")
    cache_operation = {}
    cache_checksum = {}

    sorted_cache_names = []
    for cache_name in piclient.get_ignite().cacheNames().toArray():
        sorted_cache_names.append(cache_name)

    sorted_cache_names.sort()

    async_operations = []
    for cache_name in sorted_cache_names:
        async_operation = create_async_operation(
            create_checksum_operation,
            cache_name,
            start_key,
            end_key,
            gateway=piclient.get_gateway())
        async_operations.append(async_operation)
        cache_operation[async_operation] = cache_name
        async_operation.evaluate()

    checksums = ''

    for async_operation in async_operations:
        result = str(async_operation.getResult())
        cache_checksum[cache_operation.get(async_operation)] = result
        checksums += result

    log_print('Calculating checksums done')

    if dict_mode:
        return cache_checksum
    else:
        return checksums
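A usage sketch: comparing checksums taken before and after a grid restart, assuming an open PiClient session in scope (the restart step is illustrative):

before = calc_checksums_on_client(piclient, end_key=1000)
# ... restart the grid here ...
after = calc_checksums_on_client(piclient, end_key=1000)
assert before == after, 'Cache checksums changed across restart!'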
Example #28
    def increment_atomic(ignite):
        atomic_name = None
        atomic = None
        try:
            log_print("Incrementing atomics")
            for c in range(0, 10):
                atomic_name = "nodeId_%s" % c
                atomic = ignite.atomicLong(atomic_name, 0, True)
                for j in range(0, 50):
                    atomic.incrementAndGet()
        except Exception:
            log_print(
                "Failed to increment atomics: assert that atomic.removed() and recreating it",
                color='red')

            if atomic:
                ignite.atomicLong(atomic_name, 100, True)
Example #29
    def start_nodes(self):
        start_command = {}
        pids = {}
        nodes_to_start = []
        for node_idx, node in self.nodes.items():
            self.rotate_node_log(node_idx)
            nodes_to_start.append(node_idx)
            host = node['host']
            if host not in start_command:
                start_command[host] = []
            start_command[host].extend(self.get_node_start_commands(node_idx))
            pids[node_idx] = len(start_command[host]) - 1
            node['status'] = NodeStatus.STARTING
        log_print(f"Start {self.name.title()} node(s): {nodes_to_start}")
        result = self.ssh.exec(start_command)
        for node_idx, node in self.nodes.items():
            host = node['host']
            try:
                node['PID'] = int(result[host][pids[node_idx]].strip())
                if not node['PID']:
                    raise ValueError(f'no PID for node {node_idx}')
            # `except A or B` only catches A; a tuple is required here
            except (ValueError, IndexError, KeyError):
                raise TidenException(
                    f"Can't start {self.name.title()} node {node_idx} at host {host}"
                )
        check_command = {}
        status = {}
        for node_idx, node in self.nodes.items():
            host = node['host']
            if host not in check_command:
                check_command[host] = []
            check_command[host].extend(self.get_node_check_commands(node_idx))
            status[node_idx] = len(check_command[host]) - 1
        result = self.ssh.exec(check_command)
        for node_idx, node in self.nodes.items():
            host = node['host']
            try:
                if result[host][status[node_idx]]:
                    node['status'] = NodeStatus.STARTED
            except (IndexError, ValueError, KeyError):
                raise TidenException(
                    f"Can't check that {self.name.title()} node {node_idx} started at host {host}"
                )
            log_print(
                f"{self.name.title()} node {node_idx} started on {host} with PID {node['PID']}"
            )
Example #30
    def verify_no_assertion_errors(self, ignite):
        assertion_errors = ignite.find_exception_in_logs(
            ".*java.lang.AssertionError.*")

        # remove assertions from ignite.nodes to prevent massive output
        for node_id in ignite.nodes.keys():
            if 'exception' in ignite.nodes[node_id] \
                    and ignite.nodes[node_id]['exception'] != '':
                log_print("AssertionError found on node %s, text: %s" %
                          (node_id, ignite.nodes[node_id]['exception'][:100]),
                          color='red')
                ignite.nodes[node_id]['exception'] = ''

        if assertion_errors != 0:
            for node_id in ignite.get_alive_default_nodes():
                self.util_get_threads_from_jstack(ignite, node_id)

            assert False, "AssertionErrors found in server logs! Count %d" % assertion_errors