示例#1
0
    def wait_until_stopped(self, wait_seconds=127, marks=None, dump_core=True):
        """
        Wait until node is stopped after do_stop was called.
          - wait_seconds: how long to wait for the node to stop before
            escalating.
          - marks: optional list of (node, mark) to call watch_log_for_death on.
          - dump_core: if True and the node is still running after
            wait_seconds, send it SIGQUIT (intended to produce a core dump)
            and wait up to 300 more seconds.

        Raises NodeError if the node is still running after all of the above.
        """

        if self.is_running():
            if not self._wait_until_stopped(wait_seconds):
                if self.jmx_pid:
                    # Best effort: the JMX process may already be gone.
                    try:
                        os.kill(self.jmx_pid, signal.SIGKILL)
                    except OSError:
                        pass
                if dump_core and self.pid:
                    # Aborting is intended to generate a core dump
                    # so the reason the node didn't stop normally can be studied.
                    print(
                        "{} is still running. Trying to generate coredump using kill({}, SIGQUIT)..."
                        .format(self.name, self.pid))
                    try:
                        os.kill(self.pid, signal.SIGQUIT)
                    except OSError:
                        pass
                    self._wait_until_stopped(300)
                if self.is_running():
                    raise NodeError("Problem stopping node %s" % self.name)

        # `marks` defaults to None rather than a shared mutable list.
        for node, mark in (marks or ()):
            if node != self:
                node.watch_log_for_death(self, from_mark=mark)
示例#2
0
    def _update_pid(self):
        if not os.path.isfile(self._get_pid_file()):
            return

        start = time.time()
        pidfile = self._get_pid_file()
        while not (os.path.isfile(pidfile) and os.stat(pidfile).st_size > 0):
            if time.time() - start > 30.0:
                print_(
                    "Timed out waiting for pidfile {} to be filled (current time is %s): File {} size={}"
                    .format(
                        pidfile, datetime.now(), 'exists'
                        if os.path.isfile(pidfile) else 'does not exist'
                        if not os.path.exists(pidfile) else 'is not a file',
                        os.stat(pidfile).st_size
                        if os.path.exists(pidfile) else -1))
                break
            else:
                time.sleep(0.1)

        try:
            with open(self._get_pid_file(), 'r') as f:
                self._pid = int(f.readline().strip())
        except IOError as e:
            raise NodeError('Problem starting scylla-manager due to %s' % (e))
示例#3
0
    def replace_nonexistent_node_test(self):
        """
        Start a 3-node cluster, load data, then check that a fourth node
        asked to replace an address no node owns (127.0.0.5) fails to start.
        """
        debug("Starting cluster with 3 nodes.")
        cluster = self.cluster
        cluster.populate(3).start()
        node1, node2, node3 = cluster.nodelist()

        debug("Inserting Data...")
        # The stress tool's command line differs before and after 2.1.
        if cluster.version() < "2.1":
            stress_args = [
                '-o', 'insert', '--num-keys=10000', '--replication-factor=3'
            ]
        else:
            stress_args = [
                'write', 'n=10000', '-schema', 'replication(factor=3)'
            ]
        node1.stress(stress_args)

        cursor = self.patient_cql_connection(node1)
        if self.cluster.version() >= '2.1':
            stress_table = 'keyspace1.standard1'
        else:
            stress_table = '"Keyspace1"."Standard1"'
        query = SimpleStatement('select * from %s LIMIT 1' % stress_table,
                                consistency_level=ConsistencyLevel.THREE)
        # The result is not inspected; the query is only issued.
        cursor.execute(query)

        debug('Start node 4 and replace an address with no node')
        thrift_itf = ('127.0.0.4', 9160)
        storage_itf = ('127.0.0.4', 7000)
        binary_itf = ('127.0.0.4', 9042)
        node4 = Node('node4', cluster, True, thrift_itf, storage_itf, '7400',
                     '0', None, binary_itf)
        cluster.add(node4, False)

        # Replacing an unassigned ip address must fail; a startup timeout is
        # folded into NodeError so assertRaises sees a single exception type.
        with self.assertRaises(NodeError):
            try:
                node4.start(replace_address='127.0.0.5',
                            wait_for_binary_proto=True)
            except (NodeError, TimeoutError):
                raise NodeError("Node could not start.")
示例#4
0
    def _update_jmx_pid(self, wait=True):
        pidfile = os.path.join(self.get_path(), 'scylla-jmx.pid')

        start = time.time()
        while not (os.path.isfile(pidfile) and os.stat(pidfile).st_size > 0):
            elapsed = time.time() - start
            if elapsed > 30.0 or not wait:
                if wait:
                    print_(
                        "Timed out waiting for pidfile {} to be filled (after {} seconds): File {} size={}"
                        .format(
                            pidfile, elapsed, 'exists'
                            if os.path.isfile(pidfile) else 'does not exist' if
                            not os.path.exists(pidfile) else 'is not a file',
                            os.stat(pidfile).st_size
                            if os.path.exists(pidfile) else -1))
                break
            else:
                time.sleep(0.1)

        if not wait:
            self.jmx_pid = None
            return

        try:
            with open(pidfile, 'r') as f:
                self.jmx_pid = int(f.readline().strip())
        except IOError as e:
            raise NodeError('Problem starting node %s scylla-jmx due to %s' %
                            (self.name, e))
示例#5
0
    def start_nodes(self,
                    nodes=None,
                    no_wait=False,
                    verbose=False,
                    wait_for_binary_proto=False,
                    wait_other_notice=False,
                    jvm_args=None,
                    profile_options=None,
                    quiet_start=False):
        """
        Start the given nodes (all cluster nodes by default) that are not
        already running.

          - nodes: None for every node of the cluster, a single ScyllaNode,
            or an iterable of nodes.
          - wait_for_binary_proto: wait until each started node logs that it
            is listening for CQL clients.
          - wait_other_notice: wait until every node that was already running
            has seen each started node come alive.
          - jvm_args, profile_options: forwarded to each node's start().
          - no_wait, quiet_start: accepted but not used by this method.

        On the first call, if self.force_wait_for_cluster_start is set, both
        waits are forced on.

        Returns the list of (node, process, log mark) tuples for the nodes
        that were started. Raises NodeError if a started node is not running.
        """
        if not self.started and self.force_wait_for_cluster_start:
            wait_other_notice = wait_for_binary_proto = True
        self.started = True

        jvm_args = [] if jvm_args is None else jvm_args

        # Log positions of the nodes that are up before anything is started.
        if wait_other_notice:
            marks = [(running, running.mark_log())
                     for running in self.nodes.values()
                     if running.is_running()]
        else:
            marks = []

        if nodes is None:
            nodes = self.nodes.values()
        elif isinstance(nodes, ScyllaNode):
            nodes = [nodes]

        started = []
        for candidate in nodes:
            if candidate.is_running():
                continue
            if os.path.exists(candidate.logfilename()):
                log_mark = candidate.mark_log()
            else:
                log_mark = 0
            proc = candidate.start(update_pid=False,
                                   jvm_args=jvm_args,
                                   profile_options=profile_options)
            started.append((candidate, proc, log_mark))

        self.__update_pids(started)

        for candidate, proc, _ in started:
            if not candidate.is_running():
                raise NodeError("Error starting {0}.".format(candidate.name),
                                proc)

        if wait_for_binary_proto:
            for candidate, _, log_mark in started:
                candidate.watch_log_for("Starting listening for CQL clients",
                                        verbose=verbose,
                                        from_mark=log_mark)

        if wait_other_notice:
            for old_node, old_mark in marks:
                for fresh, _, _ in started:
                    if old_node is fresh:
                        continue
                    old_node.watch_log_for_alive(fresh, from_mark=old_mark)

        return started
示例#6
0
    def _start_scylla(self, args, marks, update_pid, wait_other_notice,
                      wait_for_binary_proto):
        log_file = os.path.join(self.get_path(), 'logs', 'system.log')
        # In case we are restarting a node
        # we risk reading the old cassandra.pid file
        self._delete_old_pid()

        scylla_log = open(log_file, 'a')
        env_copy = os.environ
        env_copy['SCYLLA_HOME'] = self.get_path()
        dbuild_so_dir = os.environ.get('SCYLLA_DBUILD_SO_DIR')
        if dbuild_so_dir:
            # FIXME: this should be removed once we'll support running scylla relocatable package
            executable = os.path.join(dbuild_so_dir, 'scylla.sh')
            if not os.path.isfile(executable):
                with open(executable, 'w+') as f:
                    f.write('#!/bin/bash\nexec -a scylla ' + ' '.join([
                        os.path.join(dbuild_so_dir, 'ld-linux-x86-64.so.2'),
                        '--library-path', dbuild_so_dir
                    ]) + ' "$@" ')
                os.chmod(executable, 0o0777)
            args = [executable] + args
        self._process_scylla = subprocess.Popen(args,
                                                stdout=scylla_log,
                                                stderr=scylla_log,
                                                close_fds=True,
                                                env=env_copy)
        self._process_scylla.poll()
        # When running on ccm standalone, the waiter thread would block
        # the create commands. Besides in that mode, waiting is unnecessary,
        # since the original popen reference is garbage collected.
        standalone = os.environ.get('SCYLLA_CCM_STANDALONE', None)
        if standalone is None:
            self._process_scylla_waiter = threading.Thread(
                target=self._wait_for_scylla)
            self._process_scylla_waiter.start()
        pid_filename = os.path.join(self.get_path(), 'cassandra.pid')
        with open(pid_filename, 'w') as pid_file:
            pid_file.write(str(self._process_scylla.pid))

        if update_pid:
            self._update_pid(self._process_scylla)
            if not self.is_running():
                raise NodeError("Error starting node %s" % self.name,
                                self._process_scylla)

        if wait_other_notice:
            for node, mark in marks:
                node.watch_log_for_alive(self, from_mark=mark)

        if wait_for_binary_proto:
            self.wait_for_binary_interface(from_mark=self.mark,
                                           process=self._process_scylla)
        else:
            time.sleep(2)

        return self._process_scylla
示例#7
0
    def _start_scylla(self, args, marks, update_pid, wait_other_notice,
                      wait_for_binary_proto, ext_env):
        log_file = os.path.join(self.get_path(), 'logs', 'system.log')
        # In case we are restarting a node
        # we risk reading the old cassandra.pid file
        self._delete_old_pid()

        scylla_log = open(log_file, 'a')
        try:
            env_copy = self._launch_env
        except AttributeError:
            env_copy = os.environ
        env_copy['SCYLLA_HOME'] = self.get_path()
        env_copy.update(ext_env)
        self._process_scylla = subprocess.Popen(args,
                                                stdout=scylla_log,
                                                stderr=scylla_log,
                                                close_fds=True,
                                                env=env_copy)
        self._process_scylla.poll()
        # When running on ccm standalone, the waiter thread would block
        # the create commands. Besides in that mode, waiting is unnecessary,
        # since the original popen reference is garbage collected.
        standalone = os.environ.get('SCYLLA_CCM_STANDALONE', None)
        if standalone is None:
            self._process_scylla_waiter = threading.Thread(
                target=self._wait_for_scylla)
            # Don't block the main thread on abnormal shutdown
            self._process_scylla_waiter.daemon = True
            self._process_scylla_waiter.start()
        pid_filename = os.path.join(self.get_path(), 'cassandra.pid')
        with open(pid_filename, 'w') as pid_file:
            pid_file.write(str(self._process_scylla.pid))

        if update_pid:
            self._update_pid(self._process_scylla)
            if not self.is_running():
                raise NodeError("Error starting node %s" % self.name,
                                self._process_scylla)

        if wait_other_notice:
            for node, mark in marks:
                node.watch_log_for_alive(self, from_mark=mark)

        if wait_for_binary_proto:
            try:
                self.wait_for_binary_interface(from_mark=self.mark,
                                               process=self._process_scylla,
                                               timeout=420)
            except TimeoutError as e:
                if not self.wait_for_bootstrap_repair(from_mark=self.mark):
                    raise e
                pass

        return self._process_scylla
示例#8
0
    def start(self,
              no_wait=False,
              verbose=False,
              wait_for_binary_proto=False,
              jvm_args=None,
              profile_options=None):
        """
        Start every node of the cluster that is not already running.

          - no_wait: if True (and verbose is False), only sleep 2 seconds
            instead of waiting for each node's startup log message, and skip
            waiting for the started nodes to see each other.
          - verbose: forwarded to the log watchers.
          - wait_for_binary_proto: if True, additionally wait for each
            started node to log that it is listening for CQL clients.
          - jvm_args, profile_options: forwarded to each node's start().

        Returns the list of (node, process, log mark) tuples for the started
        nodes, or None if watching a node's log raised RuntimeError.
        Raises NodeError if a started node is not running afterwards.
        """
        if jvm_args is None:
            jvm_args = []

        started = []
        for node in list(self.nodes.values()):
            if not node.is_running():
                mark = 0
                if os.path.exists(node.logfilename()):
                    mark = node.mark_log()

                p = node.start(update_pid=False,
                               jvm_args=jvm_args,
                               profile_options=profile_options)
                started.append((node, p, mark))

        if no_wait and not verbose:
            time.sleep(
                2
            )  # waiting 2 seconds to check for early errors and for the pid to be set
        else:
            for node, p, mark in started:
                try:
                    node.watch_log_for("Listening for thrift clients...",
                                       process=p,
                                       verbose=verbose,
                                       from_mark=mark)
                except RuntimeError:
                    return None

        self.__update_pids(started)

        for node, p, _ in started:
            if not node.is_running():
                raise NodeError("Error starting {0}.".format(node.name), p)

        # NOTE(review): version strings are compared lexicographically here.
        if not no_wait and self.version() >= "0.8":
            # 0.7 gossip messages seems less predictible that from 0.8 onwards and
            # I don't care enough
            for node, _, mark in started:
                for other_node, _, _ in started:
                    if other_node is not node:
                        node.watch_log_for_alive(other_node, from_mark=mark)

        if wait_for_binary_proto:
            # Each node is watched together with its OWN process; unpacking
            # `p` here avoids reusing the last process from an earlier loop.
            for node, p, mark in started:
                node.watch_log_for("Starting listening for CQL clients",
                                   process=p,
                                   verbose=verbose,
                                   from_mark=mark)
            time.sleep(0.2)

        return started
示例#9
0
    def start(self, no_wait=False, verbose=False, wait_for_binary_proto=False, wait_other_notice=False, jvm_args=None, profile_options=None, quiet_start=False):
        """
        Start every node of the cluster that is not already running.

          - no_wait: if True (and verbose is False), only sleep 2 seconds
            instead of waiting for each node's startup log message, and skip
            waiting for the started nodes to see each other.
          - verbose, quiet_start, jvm_args, profile_options: forwarded to
            each node's start(); verbose is also passed to the log watchers.
          - wait_other_notice: if True, wait until every node of the cluster
            has seen each started node come alive.
          - wait_for_binary_proto: if True, wait for the binary interface of
            each started node.

        Returns the list of (node, process, log mark) tuples for the started
        nodes, or None if watching a node's log raised RuntimeError.
        Raises NodeError if a started node is not running afterwards.
        """
        jvm_args = [] if jvm_args is None else jvm_args

        common.assert_jdk_valid_for_cassandra_version(self.cassandra_version())

        if wait_other_notice:
            # Log positions of all nodes taken before anything is started.
            marks = [(member, member.mark_log()) for member in list(self.nodes.values())]

        started = []
        for member in list(self.nodes.values()):
            if member.is_running():
                continue
            log_mark = member.mark_log() if os.path.exists(member.logfilename()) else 0
            proc = member.start(update_pid=False, jvm_args=jvm_args, profile_options=profile_options, verbose=verbose, quiet_start=quiet_start)
            started.append((member, proc, log_mark))

        if no_wait and not verbose:
            time.sleep(2)  # waiting 2 seconds to check for early errors and for the pid to be set
        else:
            for member, proc, log_mark in started:
                try:
                    # The startup message to look for changed with 2.2.
                    if parse_version(self.version()) < parse_version("2.2"):
                        start_message = "Listening for thrift clients..."
                    else:
                        start_message = "Starting listening for CQL clients"
                    member.watch_log_for(start_message, timeout=60, process=proc, verbose=verbose, from_mark=log_mark)
                except RuntimeError:
                    return None

        self.__update_pids(started)

        for member, proc, _ in started:
            if not member.is_running():
                raise NodeError("Error starting {0}.".format(member.name), proc)

        if not no_wait and parse_version(self.version()) >= parse_version("0.8"):
            # 0.7 gossip messages seems less predictible that from 0.8 onwards and
            # I don't care enough
            for member, _, log_mark in started:
                for peer, _, _ in started:
                    if peer is member:
                        continue
                    member.watch_log_for_alive(peer, from_mark=log_mark)

        if wait_other_notice:
            for old_node, old_mark in marks:
                for member, _, _ in started:
                    if old_node is member:
                        continue
                    old_node.watch_log_for_alive(member, from_mark=old_mark)

        if wait_for_binary_proto:
            for member, proc, log_mark in started:
                member.wait_for_binary_interface(process=proc, verbose=verbose, from_mark=log_mark)

        return started
示例#10
0
 def _wait_no_pending_flushes(self, wait_timeout=60):
     """
     Wait until `nodetool cfstats` reports no pending flushes.

     The check is handed to wait_for with timeout=wait_timeout and step=1.0;
     NodeError is raised when wait_for returns None.
     """
     def _flush_count(entry):
         # Entries look like "Pending flushes: <count>".
         _, raw_count = entry.split(':')
         return int(raw_count.strip())

     def no_pending_flushes():
         stdout, _ = self.nodetool('cfstats')
         stripped = [raw.strip() for raw in stdout.splitlines()]
         counts = [_flush_count(entry) for entry in stripped
                   if entry.startswith('Pending flushes')]
         return not any(count > 0 for count in counts)

     if wait_for(no_pending_flushes, timeout=wait_timeout, step=1.0) is None:
         raise NodeError("Node %s still has pending flushes after "
                         "%s seconds" % (self.name, wait_timeout))
示例#11
0
    def _update_jmx_pid(self):
        pidfile = os.path.join(self.get_path(), 'scylla-jmx.pid')

        start = time.time()
        while not (os.path.isfile(pidfile) and os.stat(pidfile).st_size > 0):
            if time.time() - start > 30.0:
                print_("Timed out waiting for pidfile to be filled "
                       "(current time is %s)" % (datetime.datetime.now()))
                break
            else:
                time.sleep(0.1)

        try:
            with open(pidfile, 'r') as f:
                self.jmx_pid = int(f.readline().strip())
        except IOError as e:
            raise NodeError('Problem starting node %s scylla-jmx due to %s' %
                            (self.name, e))
示例#12
0
    def _update_pid(self):
        if not os.path.isfile(self._get_pid_file()):
            return

        start = time.time()
        while not (os.path.isfile(self._get_pid_file())
                   and os.stat(self._get_pid_file()).st_size > 0):
            if time.time() - start > 30.0:
                print_("Timed out waiting for pidfile to be filled "
                       "(current time is %s)" % (datetime.datetime.now()))
                break
            else:
                time.sleep(0.1)

        try:
            with open(self._get_pid_file(), 'r') as f:
                self._pid = int(f.readline().strip())
        except IOError as e:
            raise NodeError('Problem starting scylla-manager due to %s' % (e))
示例#13
0
    def replace_active_node_test(self):
        """
        Start a 3-node cluster, load data, then check that a fourth node
        asked to replace the address of the still-running node 3 fails to
        start and logs the "Cannot replace a live node" error exactly once.
        """
        debug("Starting cluster with 3 nodes.")
        cluster = self.cluster
        cluster.populate(3).start()
        node1, node2, node3 = cluster.nodelist()

        debug("Inserting Data...")
        # The stress tool's command line differs before and after 2.1.
        if cluster.version() < "2.1":
            stress_args = [
                '-o', 'insert', '--num-keys=10000', '--replication-factor=3'
            ]
        else:
            stress_args = [
                'write', 'n=10000', '-schema', 'replication(factor=3)'
            ]
        node1.stress(stress_args)

        cursor = self.patient_cql_connection(node1)
        if self.cluster.version() >= '2.1':
            stress_table = 'keyspace1.standard1'
        else:
            stress_table = '"Keyspace1"."Standard1"'
        query = SimpleStatement('select * from %s LIMIT 1' % stress_table,
                                consistency_level=ConsistencyLevel.THREE)
        # The result is not inspected; the query is only issued.
        cursor.execute(query)

        # replace active node 3 with node 4
        debug("Starting node 4 to replace active node 3")
        thrift_itf = ('127.0.0.4', 9160)
        storage_itf = ('127.0.0.4', 7000)
        binary_itf = ('127.0.0.4', 9042)
        node4 = Node('node4', cluster, True, thrift_itf, storage_itf, '7400',
                     '0', None, binary_itf)
        cluster.add(node4, False)

        # A startup timeout is folded into NodeError so assertRaises sees a
        # single exception type.
        with self.assertRaises(NodeError):
            try:
                node4.start(replace_address='127.0.0.3',
                            wait_for_binary_proto=True)
            except (NodeError, TimeoutError):
                raise NodeError("Node could not start.")

        live_node_errors = node4.grep_log(
            "java.lang.UnsupportedOperationException: Cannot replace a live node..."
        )
        self.assertEqual(len(live_node_errors), 1)
示例#14
0
    def start(self, no_wait=False, verbose=False, wait_for_binary_proto=False,
              wait_other_notice=True, jvm_args=None, profile_options=None,
              quiet_start=False, allow_root=False):
        """
        Start every node of the cluster that is not already running.

          - no_wait: if True, sleep 2 seconds instead of waiting for each
            node's startup log message, and skip the two waits below.
          - verbose, quiet_start, allow_root, jvm_args, profile_options:
            forwarded to each node's start(); verbose is also passed to the
            log watchers.
          - wait_other_notice: if True, wait until every started node has
            seen every other started node come alive.
          - wait_for_binary_proto: if True, wait for the binary interface of
            each started node.

        Exits the process with status 1 when a network interface of a
        stopped node is not available. Returns the list of
        (node, process, log mark) tuples for the started nodes, or None if
        watching a node's log raised RuntimeError. Raises NodeError if a
        started node is not running afterwards.
        """
        jvm_args = [] if jvm_args is None else jvm_args

        common.assert_jdk_valid_for_cassandra_version(self.cassandra_version())

        # check whether all loopback aliases are available before starting any nodes
        for member in list(self.nodes.values()):
            if member.is_running():
                continue
            for itf in member.network_interfaces.values():
                if itf is None or common.check_socket_available(itf, return_on_error=True):
                    continue
                addr, port = itf
                common.error("Inet address {}:{} is not available; a cluster may already be running or you may need to add the loopback alias".format(addr, port))
                sys.exit(1)

        started = []
        for member in list(self.nodes.values()):
            if member.is_running():
                continue
            log_mark = member.mark_log() if os.path.exists(member.logfilename()) else 0

            proc = member.start(update_pid=False, jvm_args=jvm_args, profile_options=profile_options, verbose=verbose, quiet_start=quiet_start, allow_root=allow_root)

            # Prior to JDK8, starting every node at once could lead to a
            # nanotime collision where the RNG that generates a node's tokens
            # gives identical tokens to several nodes. Thus, we stagger
            # the node starts
            if common.get_jdk_version() < '1.8':
                time.sleep(1)

            started.append((member, proc, log_mark))

        if no_wait:
            time.sleep(2)  # waiting 2 seconds to check for early errors and for the pid to be set
        else:
            for member, proc, log_mark in started:
                try:
                    # The startup message to look for changed with 2.2.
                    if self.cassandra_version() < "2.2":
                        start_message = "Listening for thrift clients..."
                    else:
                        start_message = "Starting listening for CQL clients"
                    member.watch_log_for(start_message, timeout=60, process=proc, verbose=verbose, from_mark=log_mark)
                except RuntimeError:
                    return None

        self.__update_pids(started)

        for member, proc, _ in started:
            if not member.is_running():
                raise NodeError("Error starting {0}.".format(member.name), proc)

        if no_wait:
            return started

        if wait_other_notice:
            for (watcher, _, log_mark), (peer, _, _) in itertools.permutations(started, 2):
                watcher.watch_log_for_alive(peer, from_mark=log_mark)

        if wait_for_binary_proto:
            for member, proc, log_mark in started:
                member.wait_for_binary_interface(process=proc, verbose=verbose, from_mark=log_mark)

        return started
示例#15
0
    def start(self,
              no_wait=False,
              verbose=False,
              wait_for_binary_proto=True,
              wait_other_notice=True,
              jvm_args=None,
              profile_options=None,
              quiet_start=False,
              allow_root=False,
              jvm_version=None,
              **kwargs):
        """
        Start every node of the cluster that is not already running.

          - no_wait: if True, sleep 2 seconds instead of waiting for each
            node's startup log message, and skip the two waits below.
          - verbose, quiet_start, allow_root, jvm_args, jvm_version,
            profile_options: forwarded to each node's start(); verbose is
            also passed to the log watchers.
          - wait_other_notice: if True, wait until every started node has
            seen every other started node come alive.
          - wait_for_binary_proto: if True, wait for the binary interface of
            each started node.
          - kwargs: 'timeout' sets how long to wait for each node's startup
            message. Its default is CCM_CLUSTER_START_DEFAULT_TIMEOUT
            (60 if unset); CCM_CLUSTER_START_TIMEOUT_OVERRIDE, when set,
            takes precedence over both.

        extension.pre_cluster_start / post_cluster_start are called before
        and after. Returns the list of (node, process, log mark) tuples for
        the started nodes, or None (without the post hook) if watching a
        node's log raised RuntimeError. Raises NodeError if a started node
        is not running afterwards.
        """
        jvm_args = [] if jvm_args is None else jvm_args

        extension.pre_cluster_start(self)

        # check whether all loopback aliases are available before starting any nodes
        for member in list(self.nodes.values()):
            if member.is_running():
                continue
            for itf in member.network_interfaces.values():
                if itf is None:
                    continue
                common.assert_socket_available(itf)

        started = []
        for member in list(self.nodes.values()):
            if member.is_running():
                continue
            log_mark = 0
            if os.path.exists(member.logfilename()):
                log_mark = member.mark_log()

            proc = member.start(update_pid=False,
                                jvm_args=jvm_args,
                                jvm_version=jvm_version,
                                profile_options=profile_options,
                                verbose=verbose,
                                quiet_start=quiet_start,
                                allow_root=allow_root)

            # Prior to JDK8, starting every node at once could lead to a
            # nanotime collision where the RNG that generates a node's tokens
            # gives identical tokens to several nodes. Thus, we stagger
            # the node starts
            if common.get_jdk_version() < '1.8':
                time.sleep(1)

            started.append((member, proc, log_mark))

        if no_wait:
            # waiting 2 seconds to check for early errors and for the pid to be set
            time.sleep(2)
        else:
            for member, proc, log_mark in started:
                try:
                    default_timeout = int(
                        os.environ.get('CCM_CLUSTER_START_DEFAULT_TIMEOUT',
                                       60))
                    requested = kwargs.get('timeout', default_timeout)
                    timeout = int(
                        os.environ.get('CCM_CLUSTER_START_TIMEOUT_OVERRIDE',
                                       requested))
                    # The startup message to look for changed with 2.2.
                    if self.cassandra_version() < "2.2":
                        start_message = "Listening for thrift clients..."
                    else:
                        start_message = "Starting listening for CQL clients"
                    member.watch_log_for(start_message,
                                         timeout=timeout,
                                         process=proc,
                                         verbose=verbose,
                                         from_mark=log_mark)
                except RuntimeError:
                    return None

        self.__update_pids(started)

        for member, proc, _ in started:
            if not member.is_running():
                raise NodeError("Error starting {0}.".format(member.name),
                                proc)

        if not no_wait:
            if wait_other_notice:
                for pair in itertools.permutations(started, 2):
                    (watcher, _, log_mark), (peer, _, _) = pair
                    watcher.watch_log_for_alive(peer, from_mark=log_mark)

            if wait_for_binary_proto:
                for member, proc, log_mark in started:
                    member.wait_for_binary_interface(process=proc,
                                                     verbose=verbose,
                                                     from_mark=log_mark)

        extension.post_cluster_start(self)

        return started
示例#16
0
    def start(self,
              join_ring=True,
              no_wait=False,
              verbose=False,
              update_pid=True,
              wait_other_notice=False,
              replace_token=None,
              replace_address=None,
              jvm_args=None,
              wait_for_binary_proto=False,
              profile_options=None,
              use_jna=False,
              quiet_start=False):
        """
        Start the node. Options includes:
          - join_ring: if false, start the node with -Dcassandra.join_ring=False
          - no_wait: by default, this method returns when the node is started and listening to clients.
            If no_wait=True, the method returns sooner.
          - verbose: if True, print each line the launcher writes to stdout
            (only on the non-Windows path, when update_pid is set and
            no_wait is not).
          - update_pid: on non-Windows, if True, call _update_pid once the
            launcher is done and raise NodeError when the node is not
            running. On Windows the pid is always updated.
          - wait_other_notice: if True, this method returns only when all other live node of the cluster
            have marked this node UP.
          - replace_token: start the node with the -Dcassandra.replace_token option.
          - replace_address: start the node with the -Dcassandra.replace_address option.
            When given, the socket-availability check is skipped.
          - jvm_args: extra arguments appended to the launch command line.
          - wait_for_binary_proto: if True, call wait_for_binary_interface
            before returning.
          - profile_options: if not None, add the configured 'yourkit_agent'
            as -agentpath to the launch script; an 'options' key, when
            present, is appended to it.
          - use_jna: if False, start the node with
            -Dcassandra.boot_without_jna=true.
          - quiet_start: not used by this method.

        Returns the subprocess.Popen object of the launcher.
        Raises NodeError if the node is already running, if profiling is
        requested without a configured 'yourkit_agent', or if the node is
        not running after the pid update.
        """
        if jvm_args is None:
            jvm_args = []

        if self.is_running():
            raise NodeError("%s is already running" % self.name)

        # Only check the node's interfaces when not replacing an address.
        for itf in list(self.network_interfaces.values()):
            if itf is not None and replace_address is None:
                common.check_socket_available(itf)

        # Log positions of the currently running nodes, taken before this
        # node is started; used at the end for wait_other_notice.
        if wait_other_notice:
            marks = [(node, node.mark_log())
                     for node in list(self.cluster.nodes.values())
                     if node.is_running()]

        cdir = self.get_install_dir()
        launch_bin = common.join_bin(cdir, 'bin', 'dse')
        # Copy back the dse scripts since profiling may have modified it the previous time
        shutil.copy(launch_bin, self.get_bin_dir())
        launch_bin = common.join_bin(self.get_path(), 'bin', 'dse')

        # If Windows, change entries in .bat file to split conf from binaries
        if common.is_win():
            self.__clean_bat()

        if profile_options is not None:
            config = common.get_config()
            if 'yourkit_agent' not in config:
                raise NodeError(
                    "Cannot enable profile. You need to set 'yourkit_agent' to the path of your agent in a {0}/config"
                    .format(common.get_default_path_display_name()))
            cmd = '-agentpath:%s' % config['yourkit_agent']
            if 'options' in profile_options:
                cmd = cmd + '=' + profile_options['options']
            print_(cmd)
            # Fragile: presumably depends on the launch script containing
            # exactly this text, which is replaced by the same text plus
            # the agent option.
            pattern = r'cassandra_parms="-Dlog4j.configuration=log4j-server.properties -Dlog4j.defaultInitOverride=true'
            common.replace_in_file(launch_bin, pattern,
                                   '    ' + pattern + ' ' + cmd + '"')

        # Make sure the (copied, possibly rewritten) launch script is executable.
        os.chmod(launch_bin, os.stat(launch_bin).st_mode | stat.S_IEXEC)

        env = common.make_dse_env(self.get_install_dir(), self.get_path())

        if common.is_win():
            self._clean_win_jmx()

        pidfile = os.path.join(self.get_path(), 'cassandra.pid')
        args = [launch_bin, 'cassandra']

        # One launcher flag per workload type named in self.workload.
        if self.workload is not None:
            if 'hadoop' in self.workload:
                args.append('-t')
            if 'solr' in self.workload:
                args.append('-s')
            if 'spark' in self.workload:
                args.append('-k')
            if 'cfs' in self.workload:
                args.append('-c')
        args += ['-p', pidfile, '-Dcassandra.join_ring=%s' % str(join_ring)]
        if replace_token is not None:
            args.append('-Dcassandra.replace_token=%s' % str(replace_token))
        if replace_address is not None:
            args.append('-Dcassandra.replace_address=%s' %
                        str(replace_address))
        if use_jna is False:
            args.append('-Dcassandra.boot_without_jna=true')
        args = args + jvm_args

        process = None
        if common.is_win():
            # clean up any old dirty_pid files from prior runs
            if (os.path.isfile(self.get_path() + "/dirty_pid.tmp")):
                os.remove(self.get_path() + "/dirty_pid.tmp")
            process = subprocess.Popen(args,
                                       cwd=self.get_bin_dir(),
                                       env=env,
                                       stdout=subprocess.PIPE,
                                       stderr=subprocess.PIPE)
        else:
            process = subprocess.Popen(args,
                                       env=env,
                                       stdout=subprocess.PIPE,
                                       stderr=subprocess.PIPE)

        # Our modified batch file writes a dirty output with more than just the pid - clean it to get in parity
        # with *nix operation here.
        if common.is_win():
            self.__clean_win_pid()
            self._update_pid(process)
        elif update_pid:
            if no_wait:
                time.sleep(
                    2
                )  # waiting 2 seconds nevertheless to check for early errors and for the pid to be set
            else:
                # Read the launcher's stdout until it is closed, echoing it
                # when verbose.
                for line in process.stdout:
                    if verbose:
                        print_(line.rstrip('\n'))

            self._update_pid(process)

            if not self.is_running():
                raise NodeError("Error starting node %s" % self.name, process)

        if wait_other_notice:
            for node, mark in marks:
                node.watch_log_for_alive(self, from_mark=mark)

        if wait_for_binary_proto:
            self.wait_for_binary_interface()

        if self.cluster.hasOpscenter():
            self._start_agent()

        return process
示例#17
0
    def stop(self,
             wait=True,
             wait_other_notice=False,
             other_nodes=None,
             gently=True,
             wait_seconds=127):
        """
        Stop the node.
          - wait: if True (the default), wait for the Scylla process to be
            really dead. Otherwise return after having sent the kill signal.
            stop() will wait up to wait_seconds, by default 127 seconds, for
            the process to die. After this wait, it will throw an
            exception stating it couldn't stop the node.
          - wait_other_notice: return only when the other live nodes of the
            cluster have marked this node as dead.
          - other_nodes: nodes whose logs are watched when wait_other_notice
            is set. When None or empty, all nodes of self.cluster are used.
          - gently: Let Scylla and Scylla JMX clean up and shut down properly.
            Otherwise do a 'kill -9' which shuts down faster.
          - wait_seconds: total time budget, in seconds, of the exponential
            back-off used while waiting for the process to die.

        Returns False if the node was not running when called, True
        otherwise. Raises NodeError if `wait` is set and the node is still
        running once the back-off is exhausted.
        """
        if not self.is_running():
            return False

        # Log marks must be taken before any signal is sent, so that the
        # death notice is searched for from the right position.
        marks = []
        if wait_other_notice:
            if not other_nodes:
                other_nodes = list(self.cluster.nodes.values())
            marks = [(node, node.mark_log()) for node in other_nodes
                     if node.is_live() and node is not self]
        self._update_jmx_pid()

        if self._process_jmx and self._process_scylla:
            # We own both process handles: signal JMX first, then Scylla.
            for process in (self._process_jmx, self._process_scylla):
                try:
                    if gently:
                        process.terminate()
                    else:
                        process.kill()
                except OSError:
                    # Best effort: the process may already be gone.
                    pass
        else:
            sig = signal.SIGTERM if gently else signal.SIGKILL
            for pid in (self.jmx_pid, self.pid):
                # An unset pid (None) would make os.kill raise TypeError,
                # and pid 0 would signal our own process group: skip both.
                if not pid:
                    continue
                try:
                    os.kill(pid, sig)
                except OSError:
                    # Best effort: the process may already be gone.
                    pass

        if wait_other_notice:
            for node, mark in marks:
                node.watch_log_for_death(self, from_mark=mark)
        else:
            time.sleep(.1)

        still_running = self.is_running()
        if not (still_running and wait):
            return True

        # The sum of 7 sleeps starting at 1 and doubling each time
        # is 2**7-1 (=127). So to sleep an arbitrary wait_seconds
        # we need the first sleep to be wait_seconds/(2**7-1).
        wait_time_sec = wait_seconds / (2**7 - 1.0)
        for _ in range(7):
            time.sleep(wait_time_sec)
            if not self.is_running():
                return True
            wait_time_sec *= 2
        raise NodeError("Problem stopping node %s" % self.name)
示例#18
0
    def start(self,
              join_ring=True,
              no_wait=False,
              verbose=False,
              update_pid=True,
              wait_other_notice=False,
              replace_token=None,
              replace_address=None,
              jvm_args=None,
              wait_for_binary_proto=False,
              profile_options=None,
              use_jna=False,
              quiet_start=False):
        """
        Start the node: build the scylla command line, launch scylla and
        then scylla-jmx.

        Options used by this method:
          - update_pid, wait_other_notice, wait_for_binary_proto: forwarded
            to self._start_scylla(). When wait_other_notice is True, log
            marks of all running nodes of the cluster are taken first and
            forwarded as well.
          - replace_address: start the node with --replace-address. When
            set, the availability check of the node's network interfaces
            is skipped.
          - jvm_args: extra command line arguments. Entries of the form
            '-Dcassandra.foo=bar' that have a known Scylla equivalent are
            translated; everything else is passed through unchanged.

        Options accepted but not referenced in this method body:
        join_ring, no_wait, verbose, replace_token, profile_options,
        use_jna, quiet_start.

        Additional command line options are read from the SCYLLA_EXT_OPTS
        environment variable; options already present on the command line
        are not overridden by it.

        Returns whatever self._start_scylla() returned. Raises NodeError if
        the node is already running or if scylla-jmx could not be reached.
        """
        if jvm_args is None:
            jvm_args = []

        scylla_cassandra_mapping = {
            '-Dcassandra.replace_address_first_boot':
            '--replace-address-first-boot'
        }
        # Replace args in the form
        # ['-Dcassandra.foo=bar'] to ['-Dcassandra.foo', 'bar']
        translated_args = []
        new_jvm_args = []
        for jvm_arg in jvm_args:
            if '=' in jvm_arg:
                split_option = jvm_arg.split("=")
                e_msg = ("Option %s not in the form '-Dcassandra.foo=bar'. "
                         "Please check your test" % jvm_arg)
                assert len(split_option) == 2, e_msg
                option, value = split_option
                # If we have information on how to translate the jvm option,
                # translate it
                if option in scylla_cassandra_mapping:
                    translated_args += [
                        scylla_cassandra_mapping[option], value
                    ]
                # Otherwise, just pass it as is
                else:
                    new_jvm_args.append(jvm_arg)
            else:
                new_jvm_args.append(jvm_arg)
        jvm_args = new_jvm_args

        if self.is_running():
            raise NodeError("%s is already running" % self.name)

        for itf in list(self.network_interfaces.values()):
            if itf is not None and replace_address is None:
                try:
                    common.check_socket_available(itf)
                except Exception as msg:
                    print("{}. Looking for offending processes...".format(msg))
                    for proc in psutil.process_iter():
                        # This scan is purely diagnostic: a process that
                        # vanished or cannot be inspected must not mask the
                        # original socket error.
                        try:
                            cmdline = proc.cmdline()
                            if any(self.cluster.ipprefix in cmd
                                   for cmd in cmdline):
                                print("name={} pid={} cmdline={}".format(
                                    proc.name(), proc.pid, cmdline))
                        except psutil.Error:
                            continue
                    raise msg

        marks = []
        if wait_other_notice:
            marks = [(node, node.mark_log())
                     for node in list(self.cluster.nodes.values())
                     if node.is_running()]

        self.mark = self.mark_log()

        launch_bin = common.join_bin(self.get_path(), 'bin', 'scylla')
        options_file = os.path.join(self.get_path(), 'conf', 'scylla.yaml')

        # Make sure the launcher is executable.
        os.chmod(launch_bin, os.stat(launch_bin).st_mode | stat.S_IEXEC)

        # TODO: we do not support forcing specific settings
        # TODO: workaround for api-address as we do not load it
        # from config file scylla#59
        conf_file = os.path.join(self.get_conf_dir(), common.SCYLLA_CONF)
        with open(conf_file, 'r') as f:
            data = yaml.safe_load(f)
        jvm_args = jvm_args + ['--api-address', data['api_address']]
        jvm_args = jvm_args + [
            '--collectd-hostname',
            '%s.%s' % (socket.gethostname(), self.name)
        ]

        # Let's add jvm_args and the translated args

        args = [
            launch_bin, '--options-file', options_file, '--log-to-stdout', '1'
        ] + jvm_args + translated_args

        # Lets search for default overrides in SCYLLA_EXT_OPTS
        scylla_ext_opts = os.getenv('SCYLLA_EXT_OPTS', "").split()
        opts_i = 0
        # Membership is checked against a snapshot so that options appended
        # from SCYLLA_EXT_OPTS do not shadow later ones.
        orig_args = list(args)
        while opts_i < len(scylla_ext_opts):
            if scylla_ext_opts[opts_i].startswith("--scylla-manager="):
                # Not a scylla option: skip it.
                opts_i += 1
            elif scylla_ext_opts[opts_i].startswith('-'):
                add = False
                if scylla_ext_opts[opts_i] not in orig_args:
                    add = True
                    args.append(scylla_ext_opts[opts_i])
                opts_i += 1
                # Consume the option's values; keep them only if the option
                # itself was added.
                while opts_i < len(scylla_ext_opts) and not scylla_ext_opts[
                        opts_i].startswith('-'):
                    if add:
                        args.append(scylla_ext_opts[opts_i])
                    opts_i += 1
            else:
                # Stray value that does not belong to any option (e.g. the
                # first token, or one following --scylla-manager=...).
                # Skip it; without this the loop would never terminate.
                opts_i += 1

        if '--developer-mode' not in args:
            args += ['--developer-mode', 'true']
        if '--smp' not in args:
            # If --smp is not passed from cmdline, use default (--smp 1)
            args += ['--smp', str(self._smp)]
        elif self._smp_set_during_test:
            # If node.set_smp() is called during the test, ignore the --smp
            # passed from the cmdline.
            args[args.index('--smp') + 1] = str(self._smp)
        else:
            # Update self._smp based on command line parameter.
            # It may be used below, along with self._mem_mb_per_cpu, for calculating --memory
            self._smp = int(args[args.index('--smp') + 1])
        if '--memory' not in args:
            # If --memory is not passed from cmdline, use default (512M per cpu)
            args += [
                '--memory', '{}M'.format(self._mem_mb_per_cpu * self._smp)
            ]
        elif self._mem_set_during_test:
            # If node.set_mem_mb_per_cpu() is called during the test, ignore the --memory
            # passed from the cmdline.
            args[args.index('--memory') + 1] = '{}M'.format(
                self._mem_mb_per_cpu * self._smp)
        if '--default-log-level' not in args:
            args += ['--default-log-level', self.__global_log_level]
        # TODO add support for classes_log_level
        if '--collectd' not in args:
            args += ['--collectd', '0']
        if '--cpuset' not in args:
            args += ['--overprovisioned']
        if '--prometheus-address' not in args:
            args += ['--prometheus-address', data['api_address']]
        if replace_address:
            args += ['--replace-address', replace_address]
        args += ['--unsafe-bypass-fsync', '1']

        scylla_process = self._start_scylla(args, marks, update_pid,
                                            wait_other_notice,
                                            wait_for_binary_proto)
        self._start_jmx(data)

        if not self._wait_java_up(data):
            e_msg = (
                "Error starting node %s: unable to connect to scylla-jmx" %
                self.name)
            raise NodeError(e_msg, scylla_process)

        # Return value deliberately discarded.
        # NOTE(review): presumably called to refresh the node's cached
        # status - confirm against is_running().
        self.is_running()

        return scylla_process
示例#19
0
    def stop(self, wait=True, wait_other_notice=False, gently=True):
        """
        Stop the node.
          - wait: if True (the default), wait for the Scylla process to be
            really dead. Otherwise return after having sent the kill signal.
          - wait_other_notice: return only when the other live nodes of the
            cluster have marked this node has dead.
          - gently: Let Scylla and Scylla JMX clean up and shut down properly.
            Otherwise do a 'kill -9' which shuts down faster.
        """
        marks = []
        if self.is_running():
            if wait_other_notice:
                marks = [(node, node.mark_log())
                         for node in list(self.cluster.nodes.values())
                         if node.is_live() and node is not self]
            self._update_jmx_pid()

            if self._process_jmx and self._process_scylla:
                if gently:
                    try:
                        self._process_jmx.terminate()
                    except OSError as e:
                        pass
                    try:
                        self._process_scylla.terminate()
                    except OSError as e:
                        pass
                else:
                    try:
                        self._process_jmx.kill()
                    except OSError as e:
                        pass
                    try:
                        self._process_scylla.kill()
                    except OSError as e:
                        pass
            else:
                signal_mapping = {True: signal.SIGTERM, False: signal.SIGKILL}
                for pid in [self.jmx_pid, self.pid]:
                    try:
                        os.kill(pid, signal_mapping[gently])
                    except OSError:
                        pass

            if wait_other_notice:
                for node, mark in marks:
                    node.watch_log_for_death(self, from_mark=mark)
            else:
                time.sleep(.1)

            still_running = self.is_running()
            if still_running and wait:
                wait_time_sec = 1
                for i in xrange(0, 7):
                    time.sleep(wait_time_sec)
                    if not self.is_running():
                        return True
                    wait_time_sec *= 2
                raise NodeError("Problem stopping node %s" % self.name)
            else:
                return True
        else:
            return False
示例#20
0
    def start(self,
              no_wait=False,
              verbose=False,
              wait_for_binary_proto=False,
              wait_other_notice=False,
              jvm_args=None,
              profile_options=None,
              quiet_start=False):
        """
        Start every node of the cluster that is not running yet.

          - no_wait: when True (and verbose is False) only sleep 2 seconds
            after launching instead of waiting for each node's start
            message; the mutual "alive" checks between the started nodes
            are skipped as well.
          - verbose: forwarded to the log watchers.
          - wait_for_binary_proto: additionally wait for the CQL start
            message of every started node.
          - wait_other_notice: wait until all nodes of the cluster have
            logged each started node as alive.
          - jvm_args, profile_options: forwarded to each node's start().
          - quiet_start: not referenced in this method body.

        On the first call, when self.force_wait_for_cluster_start is set,
        wait_other_notice and wait_for_binary_proto are forced to True.

        Returns the list of (node, process, log mark) tuples of the nodes
        that were launched. Raises NodeError if a launched node is not
        running afterwards, and Exception if its start message never
        shows up in the log.
        """
        if not self.started and self.force_wait_for_cluster_start:
            wait_other_notice = True
            wait_for_binary_proto = True
        self.started = True

        jvm_args = [] if jvm_args is None else jvm_args

        # Marks of the pre-existing logs, taken before anything is launched.
        marks = ([(node, node.mark_log()) for node in self.nodes.values()]
                 if wait_other_notice else [])

        started = []
        for candidate in self.nodes.values():
            if candidate.is_running():
                continue
            # A node that never ran has no log file yet: watch from 0.
            log_mark = (candidate.mark_log()
                        if os.path.exists(candidate.logfilename()) else 0)
            process = candidate.start(update_pid=False,
                                      jvm_args=jvm_args,
                                      profile_options=profile_options)
            # Let's ensure the nodes start at different times to avoid
            # race conditions while creating system tables
            time.sleep(1)
            started.append((candidate, process, log_mark))

        if verbose or not no_wait:
            start_message = "Starting listening for CQL clients"
            for node, _, log_mark in started:
                try:
                    # updated code, scylla starts CQL only by default
                    # process should not be checked for scylla as the
                    # process is a boot script (that ends after boot)
                    node.watch_log_for(start_message,
                                       timeout=600,
                                       verbose=verbose,
                                       from_mark=log_mark)
                except RuntimeError:
                    raise Exception(
                        "Not able to find start message '%s' in Node '%s'" %
                        (start_message, node.name))
        else:
            # waiting 2 seconds to check for early errors and for the
            # pid to be set
            time.sleep(2)

        self.__update_pids(started)

        for node, process, _ in started:
            if not node.is_running():
                raise NodeError("Error starting {0}.".format(node.name),
                                process)

        # NOTE(review): versions are compared as plain strings here and
        # below - confirm what cassandra_version()/version() return.
        if not no_wait and self.cassandra_version() >= "0.8":
            # 0.7 gossip messages seems less predictable that from 0.8
            # onwards and I don't care enough
            for watcher, _, log_mark in started:
                for peer, _, _ in started:
                    if peer is not watcher:
                        watcher.watch_log_for_alive(peer, from_mark=log_mark)

        if wait_other_notice:
            for old_node, old_mark in marks:
                for new_node, _, _ in started:
                    if old_node is not new_node:
                        old_node.watch_log_for_alive(new_node,
                                                     from_mark=old_mark)

        if wait_for_binary_proto and self.version() >= '1.2':
            for node, _, log_mark in started:
                node.watch_log_for("Starting listening for CQL clients",
                                   verbose=verbose,
                                   from_mark=log_mark)
            time.sleep(0.2)

        if self._scylla_manager:
            self._scylla_manager.start()

        return started